{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0052194084670405, "eval_steps": 500, "global_step": 6500, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.00015464913976416007, "grad_norm": 6.911925792694092, "learning_rate": 2.577319587628866e-09, "logits/chosen": 7.725544452667236, "logits/rejected": 7.458861827850342, "logps/chosen": -328.6910400390625, "logps/rejected": -265.0786437988281, "loss": 0.7306, "rewards/accuracies": 0.625, "rewards/chosen": 0.0007827766239643097, "rewards/margins": -0.06521320343017578, "rewards/rejected": 0.06599598377943039, "step": 1 }, { "epoch": 0.00030929827952832015, "grad_norm": 6.559286594390869, "learning_rate": 5.154639175257732e-09, "logits/chosen": 10.706411361694336, "logits/rejected": 9.259780883789062, "logps/chosen": -513.0947265625, "logps/rejected": -438.3065185546875, "loss": 0.6917, "rewards/accuracies": 0.625, "rewards/chosen": 0.006189629435539246, "rewards/margins": 0.016045667231082916, "rewards/rejected": -0.009856035001575947, "step": 2 }, { "epoch": 0.00046394741929248017, "grad_norm": 4.6717209815979, "learning_rate": 7.731958762886597e-09, "logits/chosen": 5.7964887619018555, "logits/rejected": 7.366323471069336, "logps/chosen": -346.95306396484375, "logps/rejected": -268.921630859375, "loss": 0.696, "rewards/accuracies": 0.25, "rewards/chosen": 0.007051087915897369, "rewards/margins": 0.0025309547781944275, "rewards/rejected": 0.004520131275057793, "step": 3 }, { "epoch": 0.0006185965590566403, "grad_norm": 3.4333865642547607, "learning_rate": 1.0309278350515464e-08, "logits/chosen": 14.677587509155273, "logits/rejected": 10.515243530273438, "logps/chosen": -245.28768920898438, "logps/rejected": -242.6073455810547, "loss": 0.6476, "rewards/accuracies": 0.75, "rewards/chosen": 0.03533508628606796, "rewards/margins": 0.09676864743232727, "rewards/rejected": -0.06143355742096901, "step": 4 }, { "epoch": 0.0007732456988208003, "grad_norm": 5.887388706207275, "learning_rate": 1.2886597938144331e-08, "logits/chosen": 8.125778198242188, "logits/rejected": 7.515656471252441, "logps/chosen": -264.28338623046875, "logps/rejected": -239.83038330078125, "loss": 0.7056, "rewards/accuracies": 0.625, "rewards/chosen": 0.01714172586798668, "rewards/margins": -0.02102193795144558, "rewards/rejected": 0.03816366195678711, "step": 5 }, { "epoch": 0.0009278948385849603, "grad_norm": 6.934746742248535, "learning_rate": 1.5463917525773195e-08, "logits/chosen": 8.793354988098145, "logits/rejected": 12.097320556640625, "logps/chosen": -251.8073272705078, "logps/rejected": -353.7068176269531, "loss": 0.699, "rewards/accuracies": 0.375, "rewards/chosen": -0.04178829491138458, "rewards/margins": -0.008631706237792969, "rewards/rejected": -0.03315658122301102, "step": 6 }, { "epoch": 0.0010825439783491205, "grad_norm": 5.932006359100342, "learning_rate": 1.8041237113402063e-08, "logits/chosen": 13.62399673461914, "logits/rejected": 4.940817832946777, "logps/chosen": -301.8253479003906, "logps/rejected": -327.65887451171875, "loss": 0.7116, "rewards/accuracies": 0.375, "rewards/chosen": -0.031215764582157135, "rewards/margins": -0.029828118160367012, "rewards/rejected": -0.0013876445591449738, "step": 7 }, { "epoch": 0.0012371931181132806, "grad_norm": 5.266278266906738, "learning_rate": 2.061855670103093e-08, "logits/chosen": 5.989433765411377, "logits/rejected": 6.766448020935059, "logps/chosen": -217.256591796875, "logps/rejected": -272.15509033203125, "loss": 0.7083, "rewards/accuracies": 0.375, "rewards/chosen": 0.0018736126367002726, "rewards/margins": -0.028320670127868652, "rewards/rejected": 0.03019428253173828, "step": 8 }, { "epoch": 0.0013918422578774407, "grad_norm": 4.636612892150879, "learning_rate": 2.3195876288659797e-08, "logits/chosen": 7.120231628417969, "logits/rejected": 7.554953575134277, "logps/chosen": -209.84974670410156, "logps/rejected": -197.9786834716797, "loss": 0.7263, "rewards/accuracies": 0.25, "rewards/chosen": -0.016598273068666458, "rewards/margins": -0.06356807053089142, "rewards/rejected": 0.04696979373693466, "step": 9 }, { "epoch": 0.0015464913976416005, "grad_norm": 4.485163688659668, "learning_rate": 2.5773195876288662e-08, "logits/chosen": 6.194683074951172, "logits/rejected": 4.8854875564575195, "logps/chosen": -246.56932067871094, "logps/rejected": -217.247314453125, "loss": 0.706, "rewards/accuracies": 0.5, "rewards/chosen": -0.023969482630491257, "rewards/margins": -0.020935650914907455, "rewards/rejected": -0.003033827990293503, "step": 10 }, { "epoch": 0.0017011405374057606, "grad_norm": 7.249931812286377, "learning_rate": 2.8350515463917528e-08, "logits/chosen": 15.518509864807129, "logits/rejected": 11.09177017211914, "logps/chosen": -274.49652099609375, "logps/rejected": -226.093994140625, "loss": 0.718, "rewards/accuracies": 0.25, "rewards/chosen": -0.005039501935243607, "rewards/margins": -0.04767618328332901, "rewards/rejected": 0.0426366813480854, "step": 11 }, { "epoch": 0.0018557896771699207, "grad_norm": 4.163851737976074, "learning_rate": 3.092783505154639e-08, "logits/chosen": 12.197102546691895, "logits/rejected": 8.043967247009277, "logps/chosen": -261.67230224609375, "logps/rejected": -195.1876220703125, "loss": 0.683, "rewards/accuracies": 0.5, "rewards/chosen": 0.008725881576538086, "rewards/margins": 0.022560596466064453, "rewards/rejected": -0.013834714889526367, "step": 12 }, { "epoch": 0.002010438816934081, "grad_norm": 5.067342758178711, "learning_rate": 3.350515463917526e-08, "logits/chosen": 17.780834197998047, "logits/rejected": 17.275726318359375, "logps/chosen": -296.10980224609375, "logps/rejected": -271.4646301269531, "loss": 0.6786, "rewards/accuracies": 0.5, "rewards/chosen": 0.03504469618201256, "rewards/margins": 0.03791084885597229, "rewards/rejected": -0.002866152673959732, "step": 13 }, { "epoch": 0.002165087956698241, "grad_norm": 5.199130535125732, "learning_rate": 3.608247422680413e-08, "logits/chosen": 6.776096343994141, "logits/rejected": 14.16385555267334, "logps/chosen": -232.74832153320312, "logps/rejected": -261.77581787109375, "loss": 0.7054, "rewards/accuracies": 0.5, "rewards/chosen": -0.06902992725372314, "rewards/margins": -0.022028755396604538, "rewards/rejected": -0.047001175582408905, "step": 14 }, { "epoch": 0.002319737096462401, "grad_norm": 4.540201663970947, "learning_rate": 3.865979381443299e-08, "logits/chosen": 10.701871871948242, "logits/rejected": 13.761884689331055, "logps/chosen": -287.3067626953125, "logps/rejected": -248.40289306640625, "loss": 0.7116, "rewards/accuracies": 0.5, "rewards/chosen": -0.0004139924421906471, "rewards/margins": -0.03300056606531143, "rewards/rejected": 0.03258657455444336, "step": 15 }, { "epoch": 0.002474386236226561, "grad_norm": 4.610867977142334, "learning_rate": 4.123711340206186e-08, "logits/chosen": 12.358034133911133, "logits/rejected": -0.12671267986297607, "logps/chosen": -275.07647705078125, "logps/rejected": -172.946533203125, "loss": 0.7131, "rewards/accuracies": 0.375, "rewards/chosen": -0.008443592116236687, "rewards/margins": -0.03740517795085907, "rewards/rejected": 0.028961585834622383, "step": 16 }, { "epoch": 0.0026290353759907212, "grad_norm": 3.652007579803467, "learning_rate": 4.381443298969072e-08, "logits/chosen": 16.298797607421875, "logits/rejected": 7.837356090545654, "logps/chosen": -196.04367065429688, "logps/rejected": -154.35011291503906, "loss": 0.6704, "rewards/accuracies": 0.5, "rewards/chosen": 0.012971021234989166, "rewards/margins": 0.049596380442380905, "rewards/rejected": -0.03662535920739174, "step": 17 }, { "epoch": 0.0027836845157548813, "grad_norm": 5.670866012573242, "learning_rate": 4.6391752577319594e-08, "logits/chosen": 16.421899795532227, "logits/rejected": 9.962812423706055, "logps/chosen": -472.7196044921875, "logps/rejected": -284.9296875, "loss": 0.6217, "rewards/accuracies": 0.875, "rewards/chosen": 0.12304907292127609, "rewards/margins": 0.15355949103832245, "rewards/rejected": -0.030510425567626953, "step": 18 }, { "epoch": 0.002938333655519041, "grad_norm": 6.218260288238525, "learning_rate": 4.896907216494846e-08, "logits/chosen": 10.792370796203613, "logits/rejected": 3.2917964458465576, "logps/chosen": -410.13140869140625, "logps/rejected": -323.98284912109375, "loss": 0.6851, "rewards/accuracies": 0.375, "rewards/chosen": 0.0450771301984787, "rewards/margins": 0.019390054047107697, "rewards/rejected": 0.025687076151371002, "step": 19 }, { "epoch": 0.003092982795283201, "grad_norm": 5.937564373016357, "learning_rate": 5.1546391752577325e-08, "logits/chosen": 2.615875244140625, "logits/rejected": 6.465677261352539, "logps/chosen": -194.7277374267578, "logps/rejected": -249.28128051757812, "loss": 0.6757, "rewards/accuracies": 0.625, "rewards/chosen": 0.02864077314734459, "rewards/margins": 0.03823430836200714, "rewards/rejected": -0.009593534283339977, "step": 20 }, { "epoch": 0.003247631935047361, "grad_norm": 4.729965686798096, "learning_rate": 5.412371134020619e-08, "logits/chosen": 9.202268600463867, "logits/rejected": 11.467805862426758, "logps/chosen": -208.8218994140625, "logps/rejected": -304.7249755859375, "loss": 0.7092, "rewards/accuracies": 0.375, "rewards/chosen": 0.007246019318699837, "rewards/margins": -0.0298062302172184, "rewards/rejected": 0.037052251398563385, "step": 21 }, { "epoch": 0.003402281074811521, "grad_norm": 4.760029315948486, "learning_rate": 5.6701030927835055e-08, "logits/chosen": 9.577291488647461, "logits/rejected": 12.896297454833984, "logps/chosen": -231.807373046875, "logps/rejected": -322.9843444824219, "loss": 0.6655, "rewards/accuracies": 0.625, "rewards/chosen": 0.048436544835567474, "rewards/margins": 0.058693885803222656, "rewards/rejected": -0.010257341898977757, "step": 22 }, { "epoch": 0.0035569302145756813, "grad_norm": 5.684564113616943, "learning_rate": 5.927835051546392e-08, "logits/chosen": 11.468250274658203, "logits/rejected": 2.6983911991119385, "logps/chosen": -363.1484375, "logps/rejected": -283.53765869140625, "loss": 0.7331, "rewards/accuracies": 0.375, "rewards/chosen": -0.030769065022468567, "rewards/margins": -0.06990260630846024, "rewards/rejected": 0.039133548736572266, "step": 23 }, { "epoch": 0.0037115793543398413, "grad_norm": 7.322030067443848, "learning_rate": 6.185567010309278e-08, "logits/chosen": 8.942168235778809, "logits/rejected": 10.792451858520508, "logps/chosen": -243.0218963623047, "logps/rejected": -256.6020812988281, "loss": 0.7369, "rewards/accuracies": 0.375, "rewards/chosen": -0.10762663185596466, "rewards/margins": -0.07802677154541016, "rewards/rejected": -0.029599856585264206, "step": 24 }, { "epoch": 0.0038662284941040014, "grad_norm": 6.209859371185303, "learning_rate": 6.443298969072165e-08, "logits/chosen": 12.64309024810791, "logits/rejected": 3.72812557220459, "logps/chosen": -305.7120361328125, "logps/rejected": -228.66659545898438, "loss": 0.7297, "rewards/accuracies": 0.375, "rewards/chosen": -0.02528085745871067, "rewards/margins": -0.06709671020507812, "rewards/rejected": 0.041815854609012604, "step": 25 }, { "epoch": 0.004020877633868162, "grad_norm": 4.138147830963135, "learning_rate": 6.701030927835052e-08, "logits/chosen": 8.450196266174316, "logits/rejected": 8.557428359985352, "logps/chosen": -277.9859313964844, "logps/rejected": -268.0521545410156, "loss": 0.7216, "rewards/accuracies": 0.375, "rewards/chosen": -0.004189919680356979, "rewards/margins": -0.05424795299768448, "rewards/rejected": 0.0500580370426178, "step": 26 }, { "epoch": 0.004175526773632322, "grad_norm": 5.362881183624268, "learning_rate": 6.95876288659794e-08, "logits/chosen": 8.368182182312012, "logits/rejected": 7.4664459228515625, "logps/chosen": -285.5622253417969, "logps/rejected": -252.48875427246094, "loss": 0.7122, "rewards/accuracies": 0.25, "rewards/chosen": -0.0396575927734375, "rewards/margins": -0.03567342832684517, "rewards/rejected": -0.0039841653779149055, "step": 27 }, { "epoch": 0.004330175913396482, "grad_norm": 5.1329827308654785, "learning_rate": 7.216494845360825e-08, "logits/chosen": 5.287141799926758, "logits/rejected": 9.286478042602539, "logps/chosen": -184.01339721679688, "logps/rejected": -349.4659118652344, "loss": 0.7461, "rewards/accuracies": 0.375, "rewards/chosen": -0.045447640120983124, "rewards/margins": -0.09351148456335068, "rewards/rejected": 0.04806385189294815, "step": 28 }, { "epoch": 0.004484825053160642, "grad_norm": 5.443169116973877, "learning_rate": 7.474226804123713e-08, "logits/chosen": 2.246333599090576, "logits/rejected": 9.324928283691406, "logps/chosen": -206.28512573242188, "logps/rejected": -225.4776153564453, "loss": 0.7056, "rewards/accuracies": 0.5, "rewards/chosen": -0.034403469413518906, "rewards/margins": -0.022684622555971146, "rewards/rejected": -0.01171884499490261, "step": 29 }, { "epoch": 0.004639474192924802, "grad_norm": 5.341912269592285, "learning_rate": 7.731958762886598e-08, "logits/chosen": 10.89834213256836, "logits/rejected": 2.3132970333099365, "logps/chosen": -408.22601318359375, "logps/rejected": -238.31373596191406, "loss": 0.7202, "rewards/accuracies": 0.375, "rewards/chosen": -0.023410415276885033, "rewards/margins": -0.048658084124326706, "rewards/rejected": 0.025247670710086823, "step": 30 }, { "epoch": 0.004794123332688962, "grad_norm": 4.035882949829102, "learning_rate": 7.989690721649484e-08, "logits/chosen": 7.057154655456543, "logits/rejected": 0.49073028564453125, "logps/chosen": -306.31097412109375, "logps/rejected": -182.73757934570312, "loss": 0.6574, "rewards/accuracies": 0.5, "rewards/chosen": 0.073747918009758, "rewards/margins": 0.07770189642906189, "rewards/rejected": -0.003953981213271618, "step": 31 }, { "epoch": 0.004948772472453122, "grad_norm": 5.315826416015625, "learning_rate": 8.247422680412371e-08, "logits/chosen": 9.184189796447754, "logits/rejected": 7.225438117980957, "logps/chosen": -260.186279296875, "logps/rejected": -234.22914123535156, "loss": 0.7139, "rewards/accuracies": 0.125, "rewards/chosen": -0.07834196835756302, "rewards/margins": -0.0400027260184288, "rewards/rejected": -0.03833923488855362, "step": 32 }, { "epoch": 0.005103421612217282, "grad_norm": 4.139220714569092, "learning_rate": 8.505154639175257e-08, "logits/chosen": 9.457880020141602, "logits/rejected": 9.91469955444336, "logps/chosen": -265.49456787109375, "logps/rejected": -215.19534301757812, "loss": 0.6897, "rewards/accuracies": 0.5, "rewards/chosen": 0.0047846343368291855, "rewards/margins": 0.010850241407752037, "rewards/rejected": -0.006065608002245426, "step": 33 }, { "epoch": 0.0052580707519814425, "grad_norm": 8.514988899230957, "learning_rate": 8.762886597938144e-08, "logits/chosen": 10.960465431213379, "logits/rejected": 11.944863319396973, "logps/chosen": -364.357421875, "logps/rejected": -378.77349853515625, "loss": 0.7028, "rewards/accuracies": 0.625, "rewards/chosen": -0.0046242717653512955, "rewards/margins": -0.014529705047607422, "rewards/rejected": 0.009905435144901276, "step": 34 }, { "epoch": 0.005412719891745603, "grad_norm": 5.189547061920166, "learning_rate": 9.02061855670103e-08, "logits/chosen": 10.958404541015625, "logits/rejected": 10.693275451660156, "logps/chosen": -273.6878967285156, "logps/rejected": -254.7626190185547, "loss": 0.6742, "rewards/accuracies": 0.5, "rewards/chosen": 0.04495172202587128, "rewards/margins": 0.04239773750305176, "rewards/rejected": 0.0025539863854646683, "step": 35 }, { "epoch": 0.005567369031509763, "grad_norm": 6.511617660522461, "learning_rate": 9.278350515463919e-08, "logits/chosen": 9.624488830566406, "logits/rejected": 8.125849723815918, "logps/chosen": -309.81689453125, "logps/rejected": -246.7415771484375, "loss": 0.6828, "rewards/accuracies": 0.625, "rewards/chosen": 0.025867179036140442, "rewards/margins": 0.021593429148197174, "rewards/rejected": 0.004273748956620693, "step": 36 }, { "epoch": 0.005722018171273923, "grad_norm": 4.336499214172363, "learning_rate": 9.536082474226806e-08, "logits/chosen": 9.214275360107422, "logits/rejected": 2.736907482147217, "logps/chosen": -182.815673828125, "logps/rejected": -139.5016632080078, "loss": 0.7167, "rewards/accuracies": 0.125, "rewards/chosen": -0.00035815173760056496, "rewards/margins": -0.045297957956790924, "rewards/rejected": 0.044939808547496796, "step": 37 }, { "epoch": 0.005876667311038082, "grad_norm": 5.187273025512695, "learning_rate": 9.793814432989692e-08, "logits/chosen": 10.795100212097168, "logits/rejected": 11.301115036010742, "logps/chosen": -276.6474304199219, "logps/rejected": -295.5259704589844, "loss": 0.6834, "rewards/accuracies": 0.625, "rewards/chosen": 0.00455322302877903, "rewards/margins": 0.02165413089096546, "rewards/rejected": -0.017100907862186432, "step": 38 }, { "epoch": 0.006031316450802242, "grad_norm": 20.21113395690918, "learning_rate": 1.0051546391752579e-07, "logits/chosen": 10.748891830444336, "logits/rejected": 8.51008415222168, "logps/chosen": -377.041015625, "logps/rejected": -276.65789794921875, "loss": 0.6994, "rewards/accuracies": 0.25, "rewards/chosen": -0.013005254790186882, "rewards/margins": -0.004641342908143997, "rewards/rejected": -0.008363913744688034, "step": 39 }, { "epoch": 0.006185965590566402, "grad_norm": 4.1608147621154785, "learning_rate": 1.0309278350515465e-07, "logits/chosen": 15.16433048248291, "logits/rejected": 12.891679763793945, "logps/chosen": -272.31048583984375, "logps/rejected": -207.171875, "loss": 0.7103, "rewards/accuracies": 0.375, "rewards/chosen": -0.010014916770160198, "rewards/margins": -0.028248880058526993, "rewards/rejected": 0.01823396608233452, "step": 40 }, { "epoch": 0.006340614730330562, "grad_norm": 4.690361499786377, "learning_rate": 1.0567010309278352e-07, "logits/chosen": 11.853530883789062, "logits/rejected": 13.110629081726074, "logps/chosen": -251.1317138671875, "logps/rejected": -312.3910827636719, "loss": 0.74, "rewards/accuracies": 0.125, "rewards/chosen": -0.03138899803161621, "rewards/margins": -0.08699889481067657, "rewards/rejected": 0.055609896779060364, "step": 41 }, { "epoch": 0.006495263870094722, "grad_norm": 4.178880214691162, "learning_rate": 1.0824742268041238e-07, "logits/chosen": 6.848409175872803, "logits/rejected": 8.219043731689453, "logps/chosen": -166.265380859375, "logps/rejected": -169.47552490234375, "loss": 0.6972, "rewards/accuracies": 0.5, "rewards/chosen": 0.003542447928339243, "rewards/margins": -0.005707143805921078, "rewards/rejected": 0.009249591268599033, "step": 42 }, { "epoch": 0.006649913009858882, "grad_norm": 4.671844005584717, "learning_rate": 1.1082474226804125e-07, "logits/chosen": 9.44178581237793, "logits/rejected": 2.4399619102478027, "logps/chosen": -252.87725830078125, "logps/rejected": -205.69924926757812, "loss": 0.7013, "rewards/accuracies": 0.375, "rewards/chosen": -0.0025249021127820015, "rewards/margins": -0.012448456138372421, "rewards/rejected": 0.009923554956912994, "step": 43 }, { "epoch": 0.006804562149623042, "grad_norm": 7.647763252258301, "learning_rate": 1.1340206185567011e-07, "logits/chosen": 9.81114673614502, "logits/rejected": 2.1354012489318848, "logps/chosen": -230.75701904296875, "logps/rejected": -146.80470275878906, "loss": 0.7639, "rewards/accuracies": 0.375, "rewards/chosen": -0.05395350605249405, "rewards/margins": -0.12842561304569244, "rewards/rejected": 0.0744720995426178, "step": 44 }, { "epoch": 0.0069592112893872024, "grad_norm": 3.828352451324463, "learning_rate": 1.1597938144329898e-07, "logits/chosen": 12.649640083312988, "logits/rejected": 10.158154487609863, "logps/chosen": -221.09107971191406, "logps/rejected": -145.47027587890625, "loss": 0.6753, "rewards/accuracies": 0.625, "rewards/chosen": 0.012367580085992813, "rewards/margins": 0.037506103515625, "rewards/rejected": -0.02513851970434189, "step": 45 }, { "epoch": 0.0071138604291513625, "grad_norm": 4.946978569030762, "learning_rate": 1.1855670103092784e-07, "logits/chosen": 10.738189697265625, "logits/rejected": 11.961038589477539, "logps/chosen": -219.86959838867188, "logps/rejected": -224.05934143066406, "loss": 0.7233, "rewards/accuracies": 0.125, "rewards/chosen": 0.019327593967318535, "rewards/margins": -0.05806604027748108, "rewards/rejected": 0.07739362865686417, "step": 46 }, { "epoch": 0.007268509568915523, "grad_norm": 4.380319595336914, "learning_rate": 1.211340206185567e-07, "logits/chosen": 17.339160919189453, "logits/rejected": 6.111804962158203, "logps/chosen": -201.5426025390625, "logps/rejected": -145.981201171875, "loss": 0.6785, "rewards/accuracies": 0.5, "rewards/chosen": 0.055193666368722916, "rewards/margins": 0.03219471126794815, "rewards/rejected": 0.022998955100774765, "step": 47 }, { "epoch": 0.007423158708679683, "grad_norm": 3.666557550430298, "learning_rate": 1.2371134020618556e-07, "logits/chosen": 13.23106861114502, "logits/rejected": 8.200860977172852, "logps/chosen": -166.75164794921875, "logps/rejected": -111.83175659179688, "loss": 0.6843, "rewards/accuracies": 0.5, "rewards/chosen": 0.0284881591796875, "rewards/margins": 0.019208334386348724, "rewards/rejected": 0.009279822930693626, "step": 48 }, { "epoch": 0.007577807848443843, "grad_norm": 5.851673126220703, "learning_rate": 1.2628865979381446e-07, "logits/chosen": 4.434458255767822, "logits/rejected": 5.869389057159424, "logps/chosen": -293.1557922363281, "logps/rejected": -324.98394775390625, "loss": 0.6812, "rewards/accuracies": 0.625, "rewards/chosen": 0.02338256873190403, "rewards/margins": 0.02982187271118164, "rewards/rejected": -0.006439303979277611, "step": 49 }, { "epoch": 0.007732456988208003, "grad_norm": 7.123608112335205, "learning_rate": 1.288659793814433e-07, "logits/chosen": 12.065396308898926, "logits/rejected": 6.792296886444092, "logps/chosen": -243.808349609375, "logps/rejected": -256.5181579589844, "loss": 0.661, "rewards/accuracies": 0.625, "rewards/chosen": 0.03321099281311035, "rewards/margins": 0.06735959649085999, "rewards/rejected": -0.03414859622716904, "step": 50 }, { "epoch": 0.007887106127972164, "grad_norm": 4.990226745605469, "learning_rate": 1.3144329896907217e-07, "logits/chosen": 9.566271781921387, "logits/rejected": 13.786304473876953, "logps/chosen": -238.0357666015625, "logps/rejected": -297.1591796875, "loss": 0.752, "rewards/accuracies": 0.125, "rewards/chosen": -0.042253829538822174, "rewards/margins": -0.10889139026403427, "rewards/rejected": 0.0666375607252121, "step": 51 }, { "epoch": 0.008041755267736324, "grad_norm": 3.86415696144104, "learning_rate": 1.3402061855670105e-07, "logits/chosen": 2.9946484565734863, "logits/rejected": 2.850429058074951, "logps/chosen": -228.489501953125, "logps/rejected": -198.38775634765625, "loss": 0.6746, "rewards/accuracies": 0.5, "rewards/chosen": 0.07479849457740784, "rewards/margins": 0.04300132393836975, "rewards/rejected": 0.031797170639038086, "step": 52 }, { "epoch": 0.008196404407500484, "grad_norm": 9.43787956237793, "learning_rate": 1.3659793814432992e-07, "logits/chosen": 8.113728523254395, "logits/rejected": 4.7881245613098145, "logps/chosen": -421.5853576660156, "logps/rejected": -274.0136413574219, "loss": 0.6726, "rewards/accuracies": 0.375, "rewards/chosen": 0.0017354898154735565, "rewards/margins": 0.04958948865532875, "rewards/rejected": -0.047853998839855194, "step": 53 }, { "epoch": 0.008351053547264644, "grad_norm": 5.2647881507873535, "learning_rate": 1.391752577319588e-07, "logits/chosen": 13.694591522216797, "logits/rejected": 8.33656120300293, "logps/chosen": -310.445068359375, "logps/rejected": -200.9225311279297, "loss": 0.6577, "rewards/accuracies": 0.75, "rewards/chosen": 0.05243654549121857, "rewards/margins": 0.07473216205835342, "rewards/rejected": -0.022295618429780006, "step": 54 }, { "epoch": 0.008505702687028804, "grad_norm": 5.5423665046691895, "learning_rate": 1.4175257731958764e-07, "logits/chosen": 14.974117279052734, "logits/rejected": 8.096324920654297, "logps/chosen": -318.1484680175781, "logps/rejected": -251.4805908203125, "loss": 0.7033, "rewards/accuracies": 0.375, "rewards/chosen": 0.0038333898410201073, "rewards/margins": -0.017112065106630325, "rewards/rejected": 0.020945454016327858, "step": 55 }, { "epoch": 0.008660351826792964, "grad_norm": 5.278740406036377, "learning_rate": 1.443298969072165e-07, "logits/chosen": 6.886699676513672, "logits/rejected": 5.67666482925415, "logps/chosen": -430.3440246582031, "logps/rejected": -272.5577392578125, "loss": 0.7056, "rewards/accuracies": 0.375, "rewards/chosen": -0.008675767108798027, "rewards/margins": -0.023654652759432793, "rewards/rejected": 0.014978885650634766, "step": 56 }, { "epoch": 0.008815000966557124, "grad_norm": 7.0525336265563965, "learning_rate": 1.4690721649484538e-07, "logits/chosen": 11.573299407958984, "logits/rejected": 6.818861484527588, "logps/chosen": -327.9615173339844, "logps/rejected": -329.19232177734375, "loss": 0.6403, "rewards/accuracies": 0.875, "rewards/chosen": 0.06640644371509552, "rewards/margins": 0.11603251099586487, "rewards/rejected": -0.04962606728076935, "step": 57 }, { "epoch": 0.008969650106321284, "grad_norm": 5.868617534637451, "learning_rate": 1.4948453608247425e-07, "logits/chosen": 13.533453941345215, "logits/rejected": 12.22661018371582, "logps/chosen": -325.85284423828125, "logps/rejected": -333.2220764160156, "loss": 0.695, "rewards/accuracies": 0.25, "rewards/chosen": 0.026420211419463158, "rewards/margins": -0.0008131072390824556, "rewards/rejected": 0.027233313769102097, "step": 58 }, { "epoch": 0.009124299246085444, "grad_norm": 3.9673924446105957, "learning_rate": 1.520618556701031e-07, "logits/chosen": 11.48481559753418, "logits/rejected": -0.8207135200500488, "logps/chosen": -279.1275939941406, "logps/rejected": -135.3128662109375, "loss": 0.7097, "rewards/accuracies": 0.25, "rewards/chosen": 0.0018380163237452507, "rewards/margins": -0.03214993700385094, "rewards/rejected": 0.033987950533628464, "step": 59 }, { "epoch": 0.009278948385849604, "grad_norm": 4.902527809143066, "learning_rate": 1.5463917525773197e-07, "logits/chosen": 12.110644340515137, "logits/rejected": 10.14527702331543, "logps/chosen": -291.07928466796875, "logps/rejected": -293.1893310546875, "loss": 0.6902, "rewards/accuracies": 0.5, "rewards/chosen": -0.0004943609237670898, "rewards/margins": 0.01345379650592804, "rewards/rejected": -0.01394815556704998, "step": 60 }, { "epoch": 0.009433597525613764, "grad_norm": 3.8037991523742676, "learning_rate": 1.5721649484536084e-07, "logits/chosen": 12.840753555297852, "logits/rejected": 5.81508207321167, "logps/chosen": -178.47412109375, "logps/rejected": -151.3430633544922, "loss": 0.7173, "rewards/accuracies": 0.125, "rewards/chosen": -0.001392459962517023, "rewards/margins": -0.04688852280378342, "rewards/rejected": 0.04549605771899223, "step": 61 }, { "epoch": 0.009588246665377925, "grad_norm": 3.996386766433716, "learning_rate": 1.5979381443298969e-07, "logits/chosen": 6.25969123840332, "logits/rejected": 4.946198463439941, "logps/chosen": -213.48226928710938, "logps/rejected": -213.9498291015625, "loss": 0.6963, "rewards/accuracies": 0.375, "rewards/chosen": -0.007098199799656868, "rewards/margins": -0.0002037547528743744, "rewards/rejected": -0.006894445046782494, "step": 62 }, { "epoch": 0.009742895805142085, "grad_norm": 5.785022258758545, "learning_rate": 1.6237113402061858e-07, "logits/chosen": 10.98247241973877, "logits/rejected": 8.06905460357666, "logps/chosen": -291.7880859375, "logps/rejected": -269.6845397949219, "loss": 0.7234, "rewards/accuracies": 0.375, "rewards/chosen": 0.0038303378969430923, "rewards/margins": -0.05777034908533096, "rewards/rejected": 0.0616006925702095, "step": 63 }, { "epoch": 0.009897544944906245, "grad_norm": 4.6899871826171875, "learning_rate": 1.6494845360824743e-07, "logits/chosen": 6.98136043548584, "logits/rejected": 8.341474533081055, "logps/chosen": -186.41683959960938, "logps/rejected": -244.73849487304688, "loss": 0.6532, "rewards/accuracies": 0.625, "rewards/chosen": 0.011904047802090645, "rewards/margins": 0.08552099019289017, "rewards/rejected": -0.07361693680286407, "step": 64 }, { "epoch": 0.010052194084670405, "grad_norm": 8.800151824951172, "learning_rate": 1.675257731958763e-07, "logits/chosen": 0.23321041464805603, "logits/rejected": 6.668447971343994, "logps/chosen": -258.418701171875, "logps/rejected": -374.27886962890625, "loss": 0.6568, "rewards/accuracies": 0.625, "rewards/chosen": 0.05098380893468857, "rewards/margins": 0.08276348561048508, "rewards/rejected": -0.03177966922521591, "step": 65 }, { "epoch": 0.010206843224434565, "grad_norm": 5.204796314239502, "learning_rate": 1.7010309278350515e-07, "logits/chosen": 2.947244167327881, "logits/rejected": 3.6227288246154785, "logps/chosen": -145.93878173828125, "logps/rejected": -151.84219360351562, "loss": 0.709, "rewards/accuracies": 0.5, "rewards/chosen": -0.0032607070170342922, "rewards/margins": -0.02523498237133026, "rewards/rejected": 0.02197427675127983, "step": 66 }, { "epoch": 0.010361492364198725, "grad_norm": 5.2081732749938965, "learning_rate": 1.7268041237113404e-07, "logits/chosen": 8.921749114990234, "logits/rejected": 8.524456024169922, "logps/chosen": -222.26181030273438, "logps/rejected": -163.75942993164062, "loss": 0.7049, "rewards/accuracies": 0.625, "rewards/chosen": -0.01697554439306259, "rewards/margins": -0.022300314158201218, "rewards/rejected": 0.005324769299477339, "step": 67 }, { "epoch": 0.010516141503962885, "grad_norm": 4.866977214813232, "learning_rate": 1.752577319587629e-07, "logits/chosen": 12.838434219360352, "logits/rejected": 11.409696578979492, "logps/chosen": -328.6891784667969, "logps/rejected": -283.2961120605469, "loss": 0.6979, "rewards/accuracies": 0.5, "rewards/chosen": 0.004480741918087006, "rewards/margins": -0.0014948388561606407, "rewards/rejected": 0.0059755826368927956, "step": 68 }, { "epoch": 0.010670790643727045, "grad_norm": 7.5063700675964355, "learning_rate": 1.7783505154639176e-07, "logits/chosen": 8.556709289550781, "logits/rejected": 8.246336936950684, "logps/chosen": -422.247802734375, "logps/rejected": -360.22540283203125, "loss": 0.7694, "rewards/accuracies": 0.5, "rewards/chosen": -0.06881704181432724, "rewards/margins": -0.13730239868164062, "rewards/rejected": 0.06848535686731339, "step": 69 }, { "epoch": 0.010825439783491205, "grad_norm": 5.279910087585449, "learning_rate": 1.804123711340206e-07, "logits/chosen": 12.430498123168945, "logits/rejected": 7.914384365081787, "logps/chosen": -238.27554321289062, "logps/rejected": -174.36917114257812, "loss": 0.677, "rewards/accuracies": 0.5, "rewards/chosen": 0.019179202616214752, "rewards/margins": 0.034891657531261444, "rewards/rejected": -0.015712453052401543, "step": 70 }, { "epoch": 0.010980088923255365, "grad_norm": 5.323324680328369, "learning_rate": 1.829896907216495e-07, "logits/chosen": 7.749332427978516, "logits/rejected": 4.102944374084473, "logps/chosen": -226.2488555908203, "logps/rejected": -150.99114990234375, "loss": 0.7017, "rewards/accuracies": 0.5, "rewards/chosen": -0.055611733347177505, "rewards/margins": -0.013157534413039684, "rewards/rejected": -0.04245419427752495, "step": 71 }, { "epoch": 0.011134738063019525, "grad_norm": 7.350645065307617, "learning_rate": 1.8556701030927838e-07, "logits/chosen": 8.904353141784668, "logits/rejected": 7.493212699890137, "logps/chosen": -319.02978515625, "logps/rejected": -309.9676513671875, "loss": 0.6731, "rewards/accuracies": 0.625, "rewards/chosen": -0.0036481861025094986, "rewards/margins": 0.0438537634909153, "rewards/rejected": -0.047501951456069946, "step": 72 }, { "epoch": 0.011289387202783685, "grad_norm": 5.224555969238281, "learning_rate": 1.8814432989690722e-07, "logits/chosen": 12.686028480529785, "logits/rejected": 8.858221054077148, "logps/chosen": -298.44476318359375, "logps/rejected": -260.1572265625, "loss": 0.7105, "rewards/accuracies": 0.375, "rewards/chosen": -0.03465289995074272, "rewards/margins": -0.03115687146782875, "rewards/rejected": -0.003496028482913971, "step": 73 }, { "epoch": 0.011444036342547845, "grad_norm": 4.9264044761657715, "learning_rate": 1.9072164948453612e-07, "logits/chosen": 7.611832618713379, "logits/rejected": 11.591278076171875, "logps/chosen": -361.92724609375, "logps/rejected": -452.9621887207031, "loss": 0.6952, "rewards/accuracies": 0.5, "rewards/chosen": 0.03055468015372753, "rewards/margins": 0.006420614197850227, "rewards/rejected": 0.024134064093232155, "step": 74 }, { "epoch": 0.011598685482312004, "grad_norm": 6.524920463562012, "learning_rate": 1.9329896907216497e-07, "logits/chosen": 10.921269416809082, "logits/rejected": 7.254299163818359, "logps/chosen": -237.08285522460938, "logps/rejected": -226.239501953125, "loss": 0.7351, "rewards/accuracies": 0.125, "rewards/chosen": -0.060632992535829544, "rewards/margins": -0.08086157590150833, "rewards/rejected": 0.02022857591509819, "step": 75 }, { "epoch": 0.011753334622076164, "grad_norm": 5.861778736114502, "learning_rate": 1.9587628865979384e-07, "logits/chosen": 5.802974224090576, "logits/rejected": 4.184164524078369, "logps/chosen": -245.282958984375, "logps/rejected": -231.754150390625, "loss": 0.6817, "rewards/accuracies": 0.5, "rewards/chosen": 0.0020096758380532265, "rewards/margins": 0.026843644678592682, "rewards/rejected": -0.02483396604657173, "step": 76 }, { "epoch": 0.011907983761840324, "grad_norm": 5.2542724609375, "learning_rate": 1.9845360824742268e-07, "logits/chosen": 12.744388580322266, "logits/rejected": 8.454718589782715, "logps/chosen": -358.8239440917969, "logps/rejected": -302.8267822265625, "loss": 0.6689, "rewards/accuracies": 0.625, "rewards/chosen": 0.03047761879861355, "rewards/margins": 0.05145206302404404, "rewards/rejected": -0.020974446088075638, "step": 77 }, { "epoch": 0.012062632901604484, "grad_norm": 4.325829029083252, "learning_rate": 2.0103092783505158e-07, "logits/chosen": 9.856746673583984, "logits/rejected": 12.297477722167969, "logps/chosen": -209.46218872070312, "logps/rejected": -284.36248779296875, "loss": 0.6958, "rewards/accuracies": 0.375, "rewards/chosen": 0.0018074996769428253, "rewards/margins": -0.0029071811586618423, "rewards/rejected": 0.004714678041636944, "step": 78 }, { "epoch": 0.012217282041368644, "grad_norm": 4.3449273109436035, "learning_rate": 2.0360824742268043e-07, "logits/chosen": 10.401052474975586, "logits/rejected": 0.46581029891967773, "logps/chosen": -286.626953125, "logps/rejected": -175.97720336914062, "loss": 0.6794, "rewards/accuracies": 0.75, "rewards/chosen": 0.012897204607725143, "rewards/margins": 0.030520722270011902, "rewards/rejected": -0.017623521387577057, "step": 79 }, { "epoch": 0.012371931181132804, "grad_norm": 5.984966278076172, "learning_rate": 2.061855670103093e-07, "logits/chosen": 15.044057846069336, "logits/rejected": 9.719198226928711, "logps/chosen": -321.7178955078125, "logps/rejected": -354.29620361328125, "loss": 0.7061, "rewards/accuracies": 0.375, "rewards/chosen": -0.03402872383594513, "rewards/margins": -0.020597314462065697, "rewards/rejected": -0.013431406579911709, "step": 80 }, { "epoch": 0.012526580320896964, "grad_norm": 3.8173110485076904, "learning_rate": 2.0876288659793814e-07, "logits/chosen": 8.523874282836914, "logits/rejected": 5.635761260986328, "logps/chosen": -167.03334045410156, "logps/rejected": -187.72418212890625, "loss": 0.664, "rewards/accuracies": 0.625, "rewards/chosen": 0.0035264017060399055, "rewards/margins": 0.06371049582958221, "rewards/rejected": -0.06018409878015518, "step": 81 }, { "epoch": 0.012681229460661124, "grad_norm": 4.351160049438477, "learning_rate": 2.1134020618556704e-07, "logits/chosen": 6.980379104614258, "logits/rejected": 10.344731330871582, "logps/chosen": -148.7363739013672, "logps/rejected": -154.00201416015625, "loss": 0.7276, "rewards/accuracies": 0.25, "rewards/chosen": -0.020682476460933685, "rewards/margins": -0.06525816023349762, "rewards/rejected": 0.04457569122314453, "step": 82 }, { "epoch": 0.012835878600425284, "grad_norm": 5.728026390075684, "learning_rate": 2.139175257731959e-07, "logits/chosen": 9.606285095214844, "logits/rejected": 8.50086784362793, "logps/chosen": -352.49932861328125, "logps/rejected": -303.3203125, "loss": 0.6661, "rewards/accuracies": 0.75, "rewards/chosen": -0.013780402019619942, "rewards/margins": 0.057223130017519, "rewards/rejected": -0.07100353389978409, "step": 83 }, { "epoch": 0.012990527740189444, "grad_norm": 9.317477226257324, "learning_rate": 2.1649484536082476e-07, "logits/chosen": 10.684642791748047, "logits/rejected": 7.32394552230835, "logps/chosen": -330.871826171875, "logps/rejected": -251.26858520507812, "loss": 0.751, "rewards/accuracies": 0.375, "rewards/chosen": -0.0837281197309494, "rewards/margins": -0.10600291192531586, "rewards/rejected": 0.02227477915585041, "step": 84 }, { "epoch": 0.013145176879953605, "grad_norm": 6.320870399475098, "learning_rate": 2.190721649484536e-07, "logits/chosen": 6.818489074707031, "logits/rejected": 4.175490379333496, "logps/chosen": -322.9170837402344, "logps/rejected": -286.32928466796875, "loss": 0.7179, "rewards/accuracies": 0.5, "rewards/chosen": -0.03987712785601616, "rewards/margins": -0.04615812748670578, "rewards/rejected": 0.006280993577092886, "step": 85 }, { "epoch": 0.013299826019717765, "grad_norm": 9.204850196838379, "learning_rate": 2.216494845360825e-07, "logits/chosen": 8.331786155700684, "logits/rejected": 7.814069747924805, "logps/chosen": -486.9302978515625, "logps/rejected": -450.00140380859375, "loss": 0.7185, "rewards/accuracies": 0.25, "rewards/chosen": -0.035970304161310196, "rewards/margins": -0.03715171292424202, "rewards/rejected": 0.0011814087629318237, "step": 86 }, { "epoch": 0.013454475159481925, "grad_norm": 5.094347953796387, "learning_rate": 2.2422680412371135e-07, "logits/chosen": 9.036510467529297, "logits/rejected": 5.101694107055664, "logps/chosen": -305.684326171875, "logps/rejected": -170.80503845214844, "loss": 0.6963, "rewards/accuracies": 0.5, "rewards/chosen": 0.014848662540316582, "rewards/margins": 0.0014084815047681332, "rewards/rejected": 0.013440179638564587, "step": 87 }, { "epoch": 0.013609124299246085, "grad_norm": 3.4267141819000244, "learning_rate": 2.2680412371134022e-07, "logits/chosen": 17.177186965942383, "logits/rejected": 10.287654876708984, "logps/chosen": -189.2765655517578, "logps/rejected": -184.96090698242188, "loss": 0.6992, "rewards/accuracies": 0.625, "rewards/chosen": -0.013287735171616077, "rewards/margins": -0.011234856210649014, "rewards/rejected": -0.002052880357950926, "step": 88 }, { "epoch": 0.013763773439010245, "grad_norm": 3.644435167312622, "learning_rate": 2.2938144329896907e-07, "logits/chosen": 8.002180099487305, "logits/rejected": 12.51270866394043, "logps/chosen": -109.40216064453125, "logps/rejected": -209.42178344726562, "loss": 0.7122, "rewards/accuracies": 0.375, "rewards/chosen": -0.029180288314819336, "rewards/margins": -0.0355406329035759, "rewards/rejected": 0.006360341794788837, "step": 89 }, { "epoch": 0.013918422578774405, "grad_norm": 4.993024826049805, "learning_rate": 2.3195876288659797e-07, "logits/chosen": 13.116409301757812, "logits/rejected": 8.573915481567383, "logps/chosen": -285.0827331542969, "logps/rejected": -235.10255432128906, "loss": 0.6964, "rewards/accuracies": 0.5, "rewards/chosen": 0.019628314301371574, "rewards/margins": -0.0007615312933921814, "rewards/rejected": 0.020389841869473457, "step": 90 }, { "epoch": 0.014073071718538565, "grad_norm": 3.503422498703003, "learning_rate": 2.345360824742268e-07, "logits/chosen": 11.814714431762695, "logits/rejected": 7.204892635345459, "logps/chosen": -193.9886474609375, "logps/rejected": -129.6933135986328, "loss": 0.687, "rewards/accuracies": 0.5, "rewards/chosen": -0.024440957233309746, "rewards/margins": 0.014166689477860928, "rewards/rejected": -0.0386076457798481, "step": 91 }, { "epoch": 0.014227720858302725, "grad_norm": 4.424459934234619, "learning_rate": 2.3711340206185568e-07, "logits/chosen": 8.881943702697754, "logits/rejected": 9.744006156921387, "logps/chosen": -303.65576171875, "logps/rejected": -298.8911437988281, "loss": 0.7193, "rewards/accuracies": 0.5, "rewards/chosen": -0.0425170436501503, "rewards/margins": -0.048736244440078735, "rewards/rejected": 0.006219197064638138, "step": 92 }, { "epoch": 0.014382369998066885, "grad_norm": 5.523021697998047, "learning_rate": 2.3969072164948455e-07, "logits/chosen": 10.548042297363281, "logits/rejected": 7.480461120605469, "logps/chosen": -267.7251892089844, "logps/rejected": -251.8106689453125, "loss": 0.712, "rewards/accuracies": 0.5, "rewards/chosen": -0.004119347780942917, "rewards/margins": -0.032270170748233795, "rewards/rejected": 0.02815082296729088, "step": 93 }, { "epoch": 0.014537019137831045, "grad_norm": 6.018385887145996, "learning_rate": 2.422680412371134e-07, "logits/chosen": 9.545321464538574, "logits/rejected": 5.171817779541016, "logps/chosen": -192.32418823242188, "logps/rejected": -162.64511108398438, "loss": 0.7192, "rewards/accuracies": 0.375, "rewards/chosen": 0.01340007595717907, "rewards/margins": -0.04911398887634277, "rewards/rejected": 0.06251406669616699, "step": 94 }, { "epoch": 0.014691668277595205, "grad_norm": 3.7559831142425537, "learning_rate": 2.448453608247423e-07, "logits/chosen": 12.315309524536133, "logits/rejected": 13.280659675598145, "logps/chosen": -205.85284423828125, "logps/rejected": -168.16867065429688, "loss": 0.6783, "rewards/accuracies": 0.625, "rewards/chosen": 0.014152051880955696, "rewards/margins": 0.03300756961107254, "rewards/rejected": -0.01885552518069744, "step": 95 }, { "epoch": 0.014846317417359365, "grad_norm": 5.631770133972168, "learning_rate": 2.474226804123711e-07, "logits/chosen": 8.322446823120117, "logits/rejected": 5.429612636566162, "logps/chosen": -250.402587890625, "logps/rejected": -206.419189453125, "loss": 0.6821, "rewards/accuracies": 0.5, "rewards/chosen": 0.05188605934381485, "rewards/margins": 0.024941157549619675, "rewards/rejected": 0.026944901794195175, "step": 96 }, { "epoch": 0.015000966557123525, "grad_norm": 4.542779922485352, "learning_rate": 2.5000000000000004e-07, "logits/chosen": 3.9670815467834473, "logits/rejected": 7.176328182220459, "logps/chosen": -354.7858581542969, "logps/rejected": -319.0560607910156, "loss": 0.6375, "rewards/accuracies": 0.75, "rewards/chosen": 0.08435390144586563, "rewards/margins": 0.11990010738372803, "rewards/rejected": -0.035546205937862396, "step": 97 }, { "epoch": 0.015155615696887685, "grad_norm": 8.824295997619629, "learning_rate": 2.525773195876289e-07, "logits/chosen": 6.885655403137207, "logits/rejected": 7.994781494140625, "logps/chosen": -394.6037292480469, "logps/rejected": -472.6541748046875, "loss": 0.6897, "rewards/accuracies": 0.5, "rewards/chosen": 0.004859543405473232, "rewards/margins": 0.012601710855960846, "rewards/rejected": -0.007742166519165039, "step": 98 }, { "epoch": 0.015310264836651846, "grad_norm": 4.0067219734191895, "learning_rate": 2.5515463917525773e-07, "logits/chosen": 7.246218681335449, "logits/rejected": 0.9063689708709717, "logps/chosen": -260.90106201171875, "logps/rejected": -166.2843475341797, "loss": 0.6978, "rewards/accuracies": 0.375, "rewards/chosen": -0.02266242541372776, "rewards/margins": -0.007983684539794922, "rewards/rejected": -0.014678740873932838, "step": 99 }, { "epoch": 0.015464913976416006, "grad_norm": 5.003303527832031, "learning_rate": 2.577319587628866e-07, "logits/chosen": 6.180871486663818, "logits/rejected": 2.7873902320861816, "logps/chosen": -300.70928955078125, "logps/rejected": -213.31500244140625, "loss": 0.6601, "rewards/accuracies": 0.75, "rewards/chosen": 0.06647412478923798, "rewards/margins": 0.06903731822967529, "rewards/rejected": -0.0025631911121308804, "step": 100 }, { "epoch": 0.015619563116180166, "grad_norm": 6.092252731323242, "learning_rate": 2.603092783505155e-07, "logits/chosen": 16.338642120361328, "logits/rejected": 9.950223922729492, "logps/chosen": -262.9970703125, "logps/rejected": -246.46461486816406, "loss": 0.6718, "rewards/accuracies": 0.75, "rewards/chosen": 0.037627674639225006, "rewards/margins": 0.05207538604736328, "rewards/rejected": -0.01444771233946085, "step": 101 }, { "epoch": 0.015774212255944327, "grad_norm": 4.586551189422607, "learning_rate": 2.6288659793814435e-07, "logits/chosen": 9.419777870178223, "logits/rejected": 1.2569756507873535, "logps/chosen": -283.7944030761719, "logps/rejected": -207.13519287109375, "loss": 0.705, "rewards/accuracies": 0.5, "rewards/chosen": -0.0035183895379304886, "rewards/margins": -0.018809601664543152, "rewards/rejected": 0.015291215851902962, "step": 102 }, { "epoch": 0.015928861395708486, "grad_norm": 5.104458332061768, "learning_rate": 2.654639175257732e-07, "logits/chosen": 9.748336791992188, "logits/rejected": 10.516401290893555, "logps/chosen": -267.2437438964844, "logps/rejected": -233.54745483398438, "loss": 0.7415, "rewards/accuracies": 0.25, "rewards/chosen": -0.04259653016924858, "rewards/margins": -0.09110765904188156, "rewards/rejected": 0.04851112514734268, "step": 103 }, { "epoch": 0.016083510535472648, "grad_norm": 4.945676803588867, "learning_rate": 2.680412371134021e-07, "logits/chosen": 6.208408355712891, "logits/rejected": 11.386116027832031, "logps/chosen": -256.3028259277344, "logps/rejected": -299.6868591308594, "loss": 0.6756, "rewards/accuracies": 0.625, "rewards/chosen": 0.011820506304502487, "rewards/margins": 0.037409354001283646, "rewards/rejected": -0.02558884769678116, "step": 104 }, { "epoch": 0.016238159675236806, "grad_norm": 5.4342875480651855, "learning_rate": 2.7061855670103096e-07, "logits/chosen": 12.268007278442383, "logits/rejected": 7.494583606719971, "logps/chosen": -398.30792236328125, "logps/rejected": -256.7729797363281, "loss": 0.6882, "rewards/accuracies": 0.5, "rewards/chosen": -0.0017288196831941605, "rewards/margins": 0.018518351018428802, "rewards/rejected": -0.020247172564268112, "step": 105 }, { "epoch": 0.016392808815000968, "grad_norm": 5.318408966064453, "learning_rate": 2.7319587628865984e-07, "logits/chosen": 7.161886215209961, "logits/rejected": 5.1986494064331055, "logps/chosen": -309.04693603515625, "logps/rejected": -280.927490234375, "loss": 0.6737, "rewards/accuracies": 0.75, "rewards/chosen": 0.0156279094517231, "rewards/margins": 0.04376005753874779, "rewards/rejected": -0.028132153674960136, "step": 106 }, { "epoch": 0.016547457954765126, "grad_norm": 5.483700275421143, "learning_rate": 2.7577319587628865e-07, "logits/chosen": 10.534073829650879, "logits/rejected": 11.566827774047852, "logps/chosen": -250.28680419921875, "logps/rejected": -362.1705322265625, "loss": 0.6835, "rewards/accuracies": 0.5, "rewards/chosen": 0.008404970169067383, "rewards/margins": 0.022122427821159363, "rewards/rejected": -0.01371745951473713, "step": 107 }, { "epoch": 0.016702107094529288, "grad_norm": 4.226380825042725, "learning_rate": 2.783505154639176e-07, "logits/chosen": 8.030059814453125, "logits/rejected": 6.6773810386657715, "logps/chosen": -275.9827880859375, "logps/rejected": -259.358642578125, "loss": 0.6674, "rewards/accuracies": 0.625, "rewards/chosen": 0.030481815338134766, "rewards/margins": 0.056189581751823425, "rewards/rejected": -0.02570776641368866, "step": 108 }, { "epoch": 0.016856756234293446, "grad_norm": 4.288832664489746, "learning_rate": 2.809278350515464e-07, "logits/chosen": 5.9005126953125, "logits/rejected": 3.222754716873169, "logps/chosen": -239.56863403320312, "logps/rejected": -230.7248992919922, "loss": 0.7088, "rewards/accuracies": 0.25, "rewards/chosen": 0.019434787333011627, "rewards/margins": -0.029011959210038185, "rewards/rejected": 0.04844675213098526, "step": 109 }, { "epoch": 0.017011405374057608, "grad_norm": 5.753316879272461, "learning_rate": 2.8350515463917527e-07, "logits/chosen": 9.93628215789795, "logits/rejected": 5.556370735168457, "logps/chosen": -377.8464050292969, "logps/rejected": -261.0926513671875, "loss": 0.695, "rewards/accuracies": 0.375, "rewards/chosen": 0.008871268481016159, "rewards/margins": -0.001321074552834034, "rewards/rejected": 0.010192345827817917, "step": 110 }, { "epoch": 0.017166054513821766, "grad_norm": 5.302252292633057, "learning_rate": 2.8608247422680414e-07, "logits/chosen": 9.056117057800293, "logits/rejected": 7.192343711853027, "logps/chosen": -256.74346923828125, "logps/rejected": -260.11798095703125, "loss": 0.6637, "rewards/accuracies": 0.625, "rewards/chosen": 0.028095673769712448, "rewards/margins": 0.06538401544094086, "rewards/rejected": -0.03728833422064781, "step": 111 }, { "epoch": 0.017320703653585928, "grad_norm": 4.046882152557373, "learning_rate": 2.88659793814433e-07, "logits/chosen": 12.280829429626465, "logits/rejected": 7.815947532653809, "logps/chosen": -211.55368041992188, "logps/rejected": -204.84384155273438, "loss": 0.6957, "rewards/accuracies": 0.625, "rewards/chosen": -0.01647648960351944, "rewards/margins": -0.004299450665712357, "rewards/rejected": -0.012177038937807083, "step": 112 }, { "epoch": 0.017475352793350087, "grad_norm": 6.47496223449707, "learning_rate": 2.912371134020619e-07, "logits/chosen": 8.501127243041992, "logits/rejected": 0.23458325862884521, "logps/chosen": -377.19732666015625, "logps/rejected": -308.78424072265625, "loss": 0.7093, "rewards/accuracies": 0.375, "rewards/chosen": -0.07137012481689453, "rewards/margins": -0.030649472028017044, "rewards/rejected": -0.04072065278887749, "step": 113 }, { "epoch": 0.01763000193311425, "grad_norm": 5.730913162231445, "learning_rate": 2.9381443298969076e-07, "logits/chosen": 13.995811462402344, "logits/rejected": -0.11919510364532471, "logps/chosen": -325.1755676269531, "logps/rejected": -133.4680633544922, "loss": 0.6689, "rewards/accuracies": 0.75, "rewards/chosen": 0.017102956771850586, "rewards/margins": 0.05096147209405899, "rewards/rejected": -0.033858511596918106, "step": 114 }, { "epoch": 0.017784651072878407, "grad_norm": 4.757335186004639, "learning_rate": 2.963917525773196e-07, "logits/chosen": 10.673439025878906, "logits/rejected": 2.0020837783813477, "logps/chosen": -271.75323486328125, "logps/rejected": -191.2560272216797, "loss": 0.6589, "rewards/accuracies": 0.375, "rewards/chosen": 0.06459856033325195, "rewards/margins": 0.08755870163440704, "rewards/rejected": -0.02296013943850994, "step": 115 }, { "epoch": 0.01793930021264257, "grad_norm": 6.322711944580078, "learning_rate": 2.989690721649485e-07, "logits/chosen": 19.059375762939453, "logits/rejected": 16.80080223083496, "logps/chosen": -327.9287109375, "logps/rejected": -344.2548522949219, "loss": 0.6679, "rewards/accuracies": 0.625, "rewards/chosen": 0.055001161992549896, "rewards/margins": 0.057523250579833984, "rewards/rejected": -0.002522086724638939, "step": 116 }, { "epoch": 0.018093949352406727, "grad_norm": 5.038933277130127, "learning_rate": 3.0154639175257737e-07, "logits/chosen": 7.062486171722412, "logits/rejected": 3.4042725563049316, "logps/chosen": -319.0403137207031, "logps/rejected": -235.81442260742188, "loss": 0.7182, "rewards/accuracies": 0.25, "rewards/chosen": -0.028659485280513763, "rewards/margins": -0.047144271433353424, "rewards/rejected": 0.018484782427549362, "step": 117 }, { "epoch": 0.01824859849217089, "grad_norm": 3.1864757537841797, "learning_rate": 3.041237113402062e-07, "logits/chosen": 8.75621223449707, "logits/rejected": 5.865131378173828, "logps/chosen": -162.49127197265625, "logps/rejected": -152.47247314453125, "loss": 0.6563, "rewards/accuracies": 0.75, "rewards/chosen": 0.052471160888671875, "rewards/margins": 0.07654944062232971, "rewards/rejected": -0.02407827600836754, "step": 118 }, { "epoch": 0.018403247631935047, "grad_norm": 6.459438323974609, "learning_rate": 3.0670103092783506e-07, "logits/chosen": 4.881914138793945, "logits/rejected": 4.98189115524292, "logps/chosen": -414.2001037597656, "logps/rejected": -396.5462341308594, "loss": 0.7469, "rewards/accuracies": 0.375, "rewards/chosen": -0.08090643584728241, "rewards/margins": -0.09828218817710876, "rewards/rejected": 0.017375757917761803, "step": 119 }, { "epoch": 0.01855789677169921, "grad_norm": 6.3285980224609375, "learning_rate": 3.0927835051546394e-07, "logits/chosen": 12.205191612243652, "logits/rejected": 9.061829566955566, "logps/chosen": -264.37127685546875, "logps/rejected": -220.64028930664062, "loss": 0.6812, "rewards/accuracies": 0.75, "rewards/chosen": 0.02196674421429634, "rewards/margins": 0.028208348900079727, "rewards/rejected": -0.00624160747975111, "step": 120 }, { "epoch": 0.018712545911463367, "grad_norm": 5.029258728027344, "learning_rate": 3.118556701030928e-07, "logits/chosen": 8.74778938293457, "logits/rejected": 6.037238121032715, "logps/chosen": -285.9646301269531, "logps/rejected": -211.68907165527344, "loss": 0.7156, "rewards/accuracies": 0.375, "rewards/chosen": -0.010250378400087357, "rewards/margins": -0.03964881971478462, "rewards/rejected": 0.029398441314697266, "step": 121 }, { "epoch": 0.01886719505122753, "grad_norm": 5.2786173820495605, "learning_rate": 3.144329896907217e-07, "logits/chosen": 9.570748329162598, "logits/rejected": 13.717671394348145, "logps/chosen": -218.76718139648438, "logps/rejected": -259.5052795410156, "loss": 0.69, "rewards/accuracies": 0.5, "rewards/chosen": -0.02082233503460884, "rewards/margins": 0.009538032114505768, "rewards/rejected": -0.03036036714911461, "step": 122 }, { "epoch": 0.019021844190991687, "grad_norm": 4.859240531921387, "learning_rate": 3.1701030927835055e-07, "logits/chosen": 9.398500442504883, "logits/rejected": 8.594369888305664, "logps/chosen": -285.2724609375, "logps/rejected": -272.74517822265625, "loss": 0.7316, "rewards/accuracies": 0.5, "rewards/chosen": -0.037125684320926666, "rewards/margins": -0.07307553291320801, "rewards/rejected": 0.03594984859228134, "step": 123 }, { "epoch": 0.01917649333075585, "grad_norm": 4.039507865905762, "learning_rate": 3.1958762886597937e-07, "logits/chosen": 12.342098236083984, "logits/rejected": 13.515018463134766, "logps/chosen": -147.92825317382812, "logps/rejected": -214.56097412109375, "loss": 0.6894, "rewards/accuracies": 0.5, "rewards/chosen": 0.0400967113673687, "rewards/margins": 0.008935356512665749, "rewards/rejected": 0.0311613529920578, "step": 124 }, { "epoch": 0.019331142470520007, "grad_norm": 3.4822394847869873, "learning_rate": 3.2216494845360824e-07, "logits/chosen": 4.24094295501709, "logits/rejected": 9.235977172851562, "logps/chosen": -123.66722106933594, "logps/rejected": -160.98106384277344, "loss": 0.6765, "rewards/accuracies": 0.5, "rewards/chosen": 0.013030624017119408, "rewards/margins": 0.035941220819950104, "rewards/rejected": -0.022910594940185547, "step": 125 }, { "epoch": 0.01948579161028417, "grad_norm": 5.78380823135376, "learning_rate": 3.2474226804123717e-07, "logits/chosen": 7.895442962646484, "logits/rejected": 7.685380935668945, "logps/chosen": -298.37115478515625, "logps/rejected": -292.2838439941406, "loss": 0.6899, "rewards/accuracies": 0.5, "rewards/chosen": -0.006781863979995251, "rewards/margins": 0.007892562076449394, "rewards/rejected": -0.01467442698776722, "step": 126 }, { "epoch": 0.019640440750048328, "grad_norm": 4.466803550720215, "learning_rate": 3.2731958762886604e-07, "logits/chosen": 10.08015251159668, "logits/rejected": 11.464984893798828, "logps/chosen": -205.8199462890625, "logps/rejected": -266.9953308105469, "loss": 0.6638, "rewards/accuracies": 0.75, "rewards/chosen": 0.02930777333676815, "rewards/margins": 0.06376784294843674, "rewards/rejected": -0.03446006774902344, "step": 127 }, { "epoch": 0.01979508988981249, "grad_norm": 7.772876262664795, "learning_rate": 3.2989690721649486e-07, "logits/chosen": 11.536581039428711, "logits/rejected": 13.485919952392578, "logps/chosen": -402.9762878417969, "logps/rejected": -324.923828125, "loss": 0.7065, "rewards/accuracies": 0.375, "rewards/chosen": 0.07936153560876846, "rewards/margins": -0.02284088358283043, "rewards/rejected": 0.1022024154663086, "step": 128 }, { "epoch": 0.019949739029576648, "grad_norm": 6.40769624710083, "learning_rate": 3.3247422680412373e-07, "logits/chosen": 14.489065170288086, "logits/rejected": 13.18747329711914, "logps/chosen": -463.24761962890625, "logps/rejected": -356.75726318359375, "loss": 0.6877, "rewards/accuracies": 0.625, "rewards/chosen": 0.012290572747588158, "rewards/margins": 0.017167475074529648, "rewards/rejected": -0.004876900464296341, "step": 129 }, { "epoch": 0.02010438816934081, "grad_norm": 4.053723335266113, "learning_rate": 3.350515463917526e-07, "logits/chosen": 2.3003878593444824, "logits/rejected": 11.692012786865234, "logps/chosen": -221.3137969970703, "logps/rejected": -297.11968994140625, "loss": 0.6608, "rewards/accuracies": 0.75, "rewards/chosen": 0.008316326886415482, "rewards/margins": 0.07795529812574387, "rewards/rejected": -0.06963896751403809, "step": 130 }, { "epoch": 0.020259037309104968, "grad_norm": 6.093809604644775, "learning_rate": 3.3762886597938147e-07, "logits/chosen": 12.096522331237793, "logits/rejected": 2.8824214935302734, "logps/chosen": -351.3785705566406, "logps/rejected": -216.28469848632812, "loss": 0.6721, "rewards/accuracies": 0.75, "rewards/chosen": 0.029314802959561348, "rewards/margins": 0.052289389073848724, "rewards/rejected": -0.022974587976932526, "step": 131 }, { "epoch": 0.02041368644886913, "grad_norm": 3.7649548053741455, "learning_rate": 3.402061855670103e-07, "logits/chosen": 16.990177154541016, "logits/rejected": 15.147708892822266, "logps/chosen": -196.39218139648438, "logps/rejected": -199.55445861816406, "loss": 0.6732, "rewards/accuracies": 0.5, "rewards/chosen": 0.0535799041390419, "rewards/margins": 0.04155528545379639, "rewards/rejected": 0.01202461775392294, "step": 132 }, { "epoch": 0.020568335588633288, "grad_norm": 4.7234416007995605, "learning_rate": 3.427835051546392e-07, "logits/chosen": 10.940417289733887, "logits/rejected": 6.862434387207031, "logps/chosen": -322.29254150390625, "logps/rejected": -309.9874267578125, "loss": 0.6892, "rewards/accuracies": 0.625, "rewards/chosen": 0.008367825299501419, "rewards/margins": 0.009527873247861862, "rewards/rejected": -0.0011600481811910868, "step": 133 }, { "epoch": 0.02072298472839745, "grad_norm": 4.684025287628174, "learning_rate": 3.453608247422681e-07, "logits/chosen": 13.74543571472168, "logits/rejected": 10.144643783569336, "logps/chosen": -331.08648681640625, "logps/rejected": -277.6265869140625, "loss": 0.7038, "rewards/accuracies": 0.625, "rewards/chosen": -0.0042053223587572575, "rewards/margins": -0.019487954676151276, "rewards/rejected": 0.01528263185173273, "step": 134 }, { "epoch": 0.020877633868161608, "grad_norm": 5.05457067489624, "learning_rate": 3.4793814432989696e-07, "logits/chosen": 10.365510940551758, "logits/rejected": 5.414824962615967, "logps/chosen": -296.9322509765625, "logps/rejected": -221.81578063964844, "loss": 0.6748, "rewards/accuracies": 0.625, "rewards/chosen": 0.046361736953258514, "rewards/margins": 0.04304616525769234, "rewards/rejected": 0.003315567970275879, "step": 135 }, { "epoch": 0.02103228300792577, "grad_norm": 4.940082550048828, "learning_rate": 3.505154639175258e-07, "logits/chosen": 10.648636817932129, "logits/rejected": 8.979429244995117, "logps/chosen": -273.0692138671875, "logps/rejected": -231.90045166015625, "loss": 0.6676, "rewards/accuracies": 0.625, "rewards/chosen": 0.006854342296719551, "rewards/margins": 0.057478807866573334, "rewards/rejected": -0.05062446370720863, "step": 136 }, { "epoch": 0.02118693214768993, "grad_norm": 7.012190818786621, "learning_rate": 3.5309278350515465e-07, "logits/chosen": 8.252876281738281, "logits/rejected": 5.904921531677246, "logps/chosen": -306.2396240234375, "logps/rejected": -305.6393737792969, "loss": 0.689, "rewards/accuracies": 0.75, "rewards/chosen": 0.02063150331377983, "rewards/margins": 0.0295425858348608, "rewards/rejected": -0.00891108624637127, "step": 137 }, { "epoch": 0.02134158128745409, "grad_norm": 5.608420372009277, "learning_rate": 3.556701030927835e-07, "logits/chosen": 13.231416702270508, "logits/rejected": 9.319743156433105, "logps/chosen": -292.5795593261719, "logps/rejected": -225.84927368164062, "loss": 0.6754, "rewards/accuracies": 0.625, "rewards/chosen": -0.01856699027121067, "rewards/margins": 0.03866300731897354, "rewards/rejected": -0.05722999572753906, "step": 138 }, { "epoch": 0.02149623042721825, "grad_norm": 5.710394382476807, "learning_rate": 3.582474226804124e-07, "logits/chosen": 19.21364974975586, "logits/rejected": 11.254575729370117, "logps/chosen": -354.6864013671875, "logps/rejected": -268.7855224609375, "loss": 0.7212, "rewards/accuracies": 0.25, "rewards/chosen": -0.03736724704504013, "rewards/margins": -0.051706213504076004, "rewards/rejected": 0.014338970184326172, "step": 139 }, { "epoch": 0.02165087956698241, "grad_norm": 4.571855545043945, "learning_rate": 3.608247422680412e-07, "logits/chosen": 14.058659553527832, "logits/rejected": 10.861377716064453, "logps/chosen": -280.15386962890625, "logps/rejected": -282.8993225097656, "loss": 0.7012, "rewards/accuracies": 0.25, "rewards/chosen": 0.04230537265539169, "rewards/margins": -0.011055950075387955, "rewards/rejected": 0.05336131900548935, "step": 140 }, { "epoch": 0.02180552870674657, "grad_norm": 5.263911247253418, "learning_rate": 3.6340206185567014e-07, "logits/chosen": 7.461472034454346, "logits/rejected": 1.0462312698364258, "logps/chosen": -330.1807861328125, "logps/rejected": -182.9571533203125, "loss": 0.6717, "rewards/accuracies": 0.625, "rewards/chosen": 0.008874009363353252, "rewards/margins": 0.04460351541638374, "rewards/rejected": -0.03572950139641762, "step": 141 }, { "epoch": 0.02196017784651073, "grad_norm": 4.290580749511719, "learning_rate": 3.65979381443299e-07, "logits/chosen": 7.0978007316589355, "logits/rejected": 7.093575954437256, "logps/chosen": -177.14627075195312, "logps/rejected": -217.9191436767578, "loss": 0.6974, "rewards/accuracies": 0.625, "rewards/chosen": -0.030620671808719635, "rewards/margins": -0.0039480216801166534, "rewards/rejected": -0.026672648265957832, "step": 142 }, { "epoch": 0.02211482698627489, "grad_norm": 3.931922197341919, "learning_rate": 3.685567010309279e-07, "logits/chosen": 8.608698844909668, "logits/rejected": 5.749289035797119, "logps/chosen": -220.7203369140625, "logps/rejected": -195.58392333984375, "loss": 0.6808, "rewards/accuracies": 0.5, "rewards/chosen": 0.016330527141690254, "rewards/margins": 0.028108548372983932, "rewards/rejected": -0.011778019368648529, "step": 143 }, { "epoch": 0.02226947612603905, "grad_norm": 4.380866527557373, "learning_rate": 3.7113402061855675e-07, "logits/chosen": 12.03700065612793, "logits/rejected": 9.799747467041016, "logps/chosen": -183.94830322265625, "logps/rejected": -159.5745086669922, "loss": 0.6842, "rewards/accuracies": 0.625, "rewards/chosen": 0.041863348335027695, "rewards/margins": 0.02260288968682289, "rewards/rejected": 0.019260454922914505, "step": 144 }, { "epoch": 0.02242412526580321, "grad_norm": 13.73681354522705, "learning_rate": 3.737113402061856e-07, "logits/chosen": 13.341185569763184, "logits/rejected": 4.3040289878845215, "logps/chosen": -417.81817626953125, "logps/rejected": -280.46783447265625, "loss": 0.6873, "rewards/accuracies": 0.625, "rewards/chosen": 0.014989567920565605, "rewards/margins": 0.014561032876372337, "rewards/rejected": 0.0004285350441932678, "step": 145 }, { "epoch": 0.02257877440556737, "grad_norm": 5.867197513580322, "learning_rate": 3.7628865979381445e-07, "logits/chosen": 7.845378398895264, "logits/rejected": 7.68946647644043, "logps/chosen": -204.5855255126953, "logps/rejected": -263.0268859863281, "loss": 0.6922, "rewards/accuracies": 0.5, "rewards/chosen": -0.04747460037469864, "rewards/margins": 0.003543257713317871, "rewards/rejected": -0.05101785808801651, "step": 146 }, { "epoch": 0.02273342354533153, "grad_norm": 7.118080139160156, "learning_rate": 3.788659793814433e-07, "logits/chosen": 15.928805351257324, "logits/rejected": 6.229506015777588, "logps/chosen": -341.1634826660156, "logps/rejected": -129.547119140625, "loss": 0.7244, "rewards/accuracies": 0.25, "rewards/chosen": -0.04797687754034996, "rewards/margins": -0.05064563825726509, "rewards/rejected": 0.00266876257956028, "step": 147 }, { "epoch": 0.02288807268509569, "grad_norm": 7.006904602050781, "learning_rate": 3.8144329896907224e-07, "logits/chosen": 13.481695175170898, "logits/rejected": 8.497061729431152, "logps/chosen": -340.57769775390625, "logps/rejected": -283.00592041015625, "loss": 0.6955, "rewards/accuracies": 0.5, "rewards/chosen": -0.00449838861823082, "rewards/margins": 0.0006693825125694275, "rewards/rejected": -0.005167771130800247, "step": 148 }, { "epoch": 0.02304272182485985, "grad_norm": 9.270798683166504, "learning_rate": 3.8402061855670106e-07, "logits/chosen": 6.4740190505981445, "logits/rejected": 0.8017644286155701, "logps/chosen": -213.53318786621094, "logps/rejected": -273.9697265625, "loss": 0.6893, "rewards/accuracies": 0.5, "rewards/chosen": 0.03169379383325577, "rewards/margins": 0.014923380687832832, "rewards/rejected": 0.016770416870713234, "step": 149 }, { "epoch": 0.023197370964624008, "grad_norm": 3.4471242427825928, "learning_rate": 3.8659793814432993e-07, "logits/chosen": 9.30126953125, "logits/rejected": 6.892515182495117, "logps/chosen": -193.80108642578125, "logps/rejected": -131.40599060058594, "loss": 0.7022, "rewards/accuracies": 0.5, "rewards/chosen": -0.009104728698730469, "rewards/margins": -0.01559281162917614, "rewards/rejected": 0.00648808479309082, "step": 150 }, { "epoch": 0.02335202010438817, "grad_norm": 5.857764720916748, "learning_rate": 3.891752577319588e-07, "logits/chosen": 9.284612655639648, "logits/rejected": 6.896724224090576, "logps/chosen": -316.2874450683594, "logps/rejected": -264.9364013671875, "loss": 0.6794, "rewards/accuracies": 0.5, "rewards/chosen": 0.0007912623696029186, "rewards/margins": 0.03216905891895294, "rewards/rejected": -0.03137779235839844, "step": 151 }, { "epoch": 0.023506669244152328, "grad_norm": 4.955041885375977, "learning_rate": 3.917525773195877e-07, "logits/chosen": 7.255969047546387, "logits/rejected": 12.712529182434082, "logps/chosen": -261.017333984375, "logps/rejected": -297.79144287109375, "loss": 0.6967, "rewards/accuracies": 0.75, "rewards/chosen": -0.02873850055038929, "rewards/margins": -0.004970931448042393, "rewards/rejected": -0.023767568171024323, "step": 152 }, { "epoch": 0.02366131838391649, "grad_norm": 6.317713260650635, "learning_rate": 3.943298969072165e-07, "logits/chosen": 11.194629669189453, "logits/rejected": 10.033746719360352, "logps/chosen": -334.9357604980469, "logps/rejected": -294.2626953125, "loss": 0.7025, "rewards/accuracies": 0.5, "rewards/chosen": -0.029807284474372864, "rewards/margins": -0.008442975580692291, "rewards/rejected": -0.021364308893680573, "step": 153 }, { "epoch": 0.023815967523680648, "grad_norm": 4.404287338256836, "learning_rate": 3.9690721649484537e-07, "logits/chosen": 14.251007080078125, "logits/rejected": 7.5094709396362305, "logps/chosen": -293.74658203125, "logps/rejected": -288.35394287109375, "loss": 0.658, "rewards/accuracies": 0.625, "rewards/chosen": 0.06871271133422852, "rewards/margins": 0.07606048882007599, "rewards/rejected": -0.007347774691879749, "step": 154 }, { "epoch": 0.02397061666344481, "grad_norm": 4.846568584442139, "learning_rate": 3.9948453608247424e-07, "logits/chosen": 6.9730119705200195, "logits/rejected": 6.351569175720215, "logps/chosen": -270.74835205078125, "logps/rejected": -237.1637420654297, "loss": 0.7098, "rewards/accuracies": 0.375, "rewards/chosen": -0.035985566675662994, "rewards/margins": -0.03157832846045494, "rewards/rejected": -0.004407238215208054, "step": 155 }, { "epoch": 0.024125265803208968, "grad_norm": 6.149508953094482, "learning_rate": 4.0206185567010316e-07, "logits/chosen": 7.503884792327881, "logits/rejected": 5.384027004241943, "logps/chosen": -232.51004028320312, "logps/rejected": -208.80050659179688, "loss": 0.699, "rewards/accuracies": 0.375, "rewards/chosen": -0.012560082599520683, "rewards/margins": -0.008591175079345703, "rewards/rejected": -0.0039689065888524055, "step": 156 }, { "epoch": 0.02427991494297313, "grad_norm": 7.02464485168457, "learning_rate": 4.0463917525773204e-07, "logits/chosen": 12.675241470336914, "logits/rejected": 13.16816234588623, "logps/chosen": -362.6820068359375, "logps/rejected": -485.51751708984375, "loss": 0.7168, "rewards/accuracies": 0.25, "rewards/chosen": 0.007289508357644081, "rewards/margins": -0.034657854586839676, "rewards/rejected": 0.041947364807128906, "step": 157 }, { "epoch": 0.024434564082737288, "grad_norm": 3.8207221031188965, "learning_rate": 4.0721649484536085e-07, "logits/chosen": 7.228775978088379, "logits/rejected": 6.786842346191406, "logps/chosen": -178.40481567382812, "logps/rejected": -187.9625244140625, "loss": 0.6889, "rewards/accuracies": 0.25, "rewards/chosen": 0.030375385656952858, "rewards/margins": 0.010330486111342907, "rewards/rejected": 0.020044900476932526, "step": 158 }, { "epoch": 0.02458921322250145, "grad_norm": 4.20653772354126, "learning_rate": 4.0979381443298973e-07, "logits/chosen": 3.8242712020874023, "logits/rejected": 5.5489091873168945, "logps/chosen": -272.1354675292969, "logps/rejected": -266.16693115234375, "loss": 0.6443, "rewards/accuracies": 0.75, "rewards/chosen": 0.05270209163427353, "rewards/margins": 0.10416822880506516, "rewards/rejected": -0.05146613344550133, "step": 159 }, { "epoch": 0.02474386236226561, "grad_norm": 4.795764446258545, "learning_rate": 4.123711340206186e-07, "logits/chosen": 4.660576820373535, "logits/rejected": 7.1966328620910645, "logps/chosen": -228.36221313476562, "logps/rejected": -224.59349060058594, "loss": 0.6512, "rewards/accuracies": 1.0, "rewards/chosen": 0.03494858741760254, "rewards/margins": 0.08641910552978516, "rewards/rejected": -0.05147051811218262, "step": 160 }, { "epoch": 0.02489851150202977, "grad_norm": 5.600543975830078, "learning_rate": 4.149484536082474e-07, "logits/chosen": 9.707596778869629, "logits/rejected": 6.820638656616211, "logps/chosen": -264.181396484375, "logps/rejected": -214.16131591796875, "loss": 0.7034, "rewards/accuracies": 0.375, "rewards/chosen": -0.0313660129904747, "rewards/margins": -0.01497330516576767, "rewards/rejected": -0.016392705962061882, "step": 161 }, { "epoch": 0.02505316064179393, "grad_norm": 6.325645446777344, "learning_rate": 4.175257731958763e-07, "logits/chosen": 11.53718376159668, "logits/rejected": 14.02033805847168, "logps/chosen": -348.6204833984375, "logps/rejected": -331.4289245605469, "loss": 0.6892, "rewards/accuracies": 0.625, "rewards/chosen": -0.007296848110854626, "rewards/margins": 0.010853290557861328, "rewards/rejected": -0.01815013960003853, "step": 162 }, { "epoch": 0.02520780978155809, "grad_norm": 4.813772678375244, "learning_rate": 4.201030927835052e-07, "logits/chosen": 15.513379096984863, "logits/rejected": 9.078393936157227, "logps/chosen": -295.4510498046875, "logps/rejected": -229.6817626953125, "loss": 0.7497, "rewards/accuracies": 0.375, "rewards/chosen": -0.03236312419176102, "rewards/margins": -0.09920233488082886, "rewards/rejected": 0.06683921813964844, "step": 163 }, { "epoch": 0.02536245892132225, "grad_norm": 5.510994911193848, "learning_rate": 4.226804123711341e-07, "logits/chosen": 10.846382141113281, "logits/rejected": 11.770069122314453, "logps/chosen": -189.88009643554688, "logps/rejected": -156.14712524414062, "loss": 0.655, "rewards/accuracies": 0.75, "rewards/chosen": 0.05986461788415909, "rewards/margins": 0.08025836944580078, "rewards/rejected": -0.020393753424286842, "step": 164 }, { "epoch": 0.02551710806108641, "grad_norm": 4.027902126312256, "learning_rate": 4.2525773195876296e-07, "logits/chosen": 10.94518756866455, "logits/rejected": 9.858892440795898, "logps/chosen": -273.0108337402344, "logps/rejected": -244.79905700683594, "loss": 0.6941, "rewards/accuracies": 0.25, "rewards/chosen": -0.012119006365537643, "rewards/margins": 0.005785701796412468, "rewards/rejected": -0.01790471374988556, "step": 165 }, { "epoch": 0.02567175720085057, "grad_norm": 4.807586669921875, "learning_rate": 4.278350515463918e-07, "logits/chosen": 8.141124725341797, "logits/rejected": 6.653397560119629, "logps/chosen": -297.3433532714844, "logps/rejected": -242.2059326171875, "loss": 0.6815, "rewards/accuracies": 0.625, "rewards/chosen": 0.03848304599523544, "rewards/margins": 0.025315478444099426, "rewards/rejected": 0.013167573139071465, "step": 166 }, { "epoch": 0.02582640634061473, "grad_norm": 4.420519828796387, "learning_rate": 4.3041237113402065e-07, "logits/chosen": 11.321237564086914, "logits/rejected": 11.26107120513916, "logps/chosen": -194.85035705566406, "logps/rejected": -196.01620483398438, "loss": 0.7045, "rewards/accuracies": 0.375, "rewards/chosen": 0.0019239448010921478, "rewards/margins": -0.017923161387443542, "rewards/rejected": 0.01984710618853569, "step": 167 }, { "epoch": 0.02598105548037889, "grad_norm": 6.189317226409912, "learning_rate": 4.329896907216495e-07, "logits/chosen": 12.190542221069336, "logits/rejected": 11.473383903503418, "logps/chosen": -288.4436950683594, "logps/rejected": -383.51922607421875, "loss": 0.7033, "rewards/accuracies": 0.5, "rewards/chosen": 0.0007077232003211975, "rewards/margins": -0.017600730061531067, "rewards/rejected": 0.018308449536561966, "step": 168 }, { "epoch": 0.02613570462014305, "grad_norm": 4.858217239379883, "learning_rate": 4.3556701030927834e-07, "logits/chosen": 13.371498107910156, "logits/rejected": 8.25893783569336, "logps/chosen": -208.8777618408203, "logps/rejected": -223.69036865234375, "loss": 0.7022, "rewards/accuracies": 0.375, "rewards/chosen": -0.03538203611969948, "rewards/margins": -0.01615438610315323, "rewards/rejected": -0.01922765001654625, "step": 169 }, { "epoch": 0.02629035375990721, "grad_norm": 6.036569595336914, "learning_rate": 4.381443298969072e-07, "logits/chosen": 8.494549751281738, "logits/rejected": 9.896512031555176, "logps/chosen": -156.01968383789062, "logps/rejected": -187.5347442626953, "loss": 0.7163, "rewards/accuracies": 0.5, "rewards/chosen": -0.0037443162873387337, "rewards/margins": -0.04112754017114639, "rewards/rejected": 0.037383221089839935, "step": 170 }, { "epoch": 0.02644500289967137, "grad_norm": 5.20986795425415, "learning_rate": 4.4072164948453614e-07, "logits/chosen": 13.111475944519043, "logits/rejected": 6.484719753265381, "logps/chosen": -409.5218505859375, "logps/rejected": -234.3643035888672, "loss": 0.683, "rewards/accuracies": 0.25, "rewards/chosen": -0.01825409010052681, "rewards/margins": 0.024442581459879875, "rewards/rejected": -0.04269666597247124, "step": 171 }, { "epoch": 0.02659965203943553, "grad_norm": 5.2855658531188965, "learning_rate": 4.43298969072165e-07, "logits/chosen": 4.768103122711182, "logits/rejected": 6.6989593505859375, "logps/chosen": -260.90045166015625, "logps/rejected": -291.176513671875, "loss": 0.6919, "rewards/accuracies": 0.5, "rewards/chosen": 0.02348909340798855, "rewards/margins": 0.013461204245686531, "rewards/rejected": 0.010027887299656868, "step": 172 }, { "epoch": 0.02675430117919969, "grad_norm": 6.277451992034912, "learning_rate": 4.458762886597939e-07, "logits/chosen": 10.987936973571777, "logits/rejected": 13.236594200134277, "logps/chosen": -335.50177001953125, "logps/rejected": -352.0394287109375, "loss": 0.6882, "rewards/accuracies": 0.75, "rewards/chosen": -0.005882358178496361, "rewards/margins": 0.01384363230317831, "rewards/rejected": -0.019725989550352097, "step": 173 }, { "epoch": 0.02690895031896385, "grad_norm": 4.070093154907227, "learning_rate": 4.484536082474227e-07, "logits/chosen": 6.1086859703063965, "logits/rejected": 6.355241775512695, "logps/chosen": -255.8524932861328, "logps/rejected": -230.45578002929688, "loss": 0.7045, "rewards/accuracies": 0.5, "rewards/chosen": -0.0077735427767038345, "rewards/margins": -0.017763851210474968, "rewards/rejected": 0.009990310296416283, "step": 174 }, { "epoch": 0.02706359945872801, "grad_norm": 7.5643181800842285, "learning_rate": 4.5103092783505157e-07, "logits/chosen": 8.44739055633545, "logits/rejected": 8.935912132263184, "logps/chosen": -179.26397705078125, "logps/rejected": -202.01805114746094, "loss": 0.6886, "rewards/accuracies": 0.625, "rewards/chosen": 0.017644383013248444, "rewards/margins": 0.01310274749994278, "rewards/rejected": 0.004541635047644377, "step": 175 }, { "epoch": 0.02721824859849217, "grad_norm": 4.193178653717041, "learning_rate": 4.5360824742268044e-07, "logits/chosen": 8.576276779174805, "logits/rejected": 11.404003143310547, "logps/chosen": -219.38568115234375, "logps/rejected": -182.7236328125, "loss": 0.7097, "rewards/accuracies": 0.25, "rewards/chosen": -0.002615641802549362, "rewards/margins": -0.031035995110869408, "rewards/rejected": 0.028420355170965195, "step": 176 }, { "epoch": 0.02737289773825633, "grad_norm": 4.314047813415527, "learning_rate": 4.561855670103093e-07, "logits/chosen": 6.486111164093018, "logits/rejected": 14.272531509399414, "logps/chosen": -110.76365661621094, "logps/rejected": -200.30520629882812, "loss": 0.7035, "rewards/accuracies": 0.375, "rewards/chosen": 0.014495277777314186, "rewards/margins": -0.0194854736328125, "rewards/rejected": 0.03398074954748154, "step": 177 }, { "epoch": 0.02752754687802049, "grad_norm": 9.834793090820312, "learning_rate": 4.5876288659793813e-07, "logits/chosen": 11.759220123291016, "logits/rejected": 13.046407699584961, "logps/chosen": -258.09619140625, "logps/rejected": -273.06842041015625, "loss": 0.7337, "rewards/accuracies": 0.25, "rewards/chosen": -0.057515814900398254, "rewards/margins": -0.07578323036432266, "rewards/rejected": 0.018267419189214706, "step": 178 }, { "epoch": 0.02768219601778465, "grad_norm": 4.0035786628723145, "learning_rate": 4.6134020618556706e-07, "logits/chosen": 4.870166301727295, "logits/rejected": 7.615548133850098, "logps/chosen": -199.38555908203125, "logps/rejected": -275.7410888671875, "loss": 0.701, "rewards/accuracies": 0.375, "rewards/chosen": -0.01069631427526474, "rewards/margins": -0.014157723635435104, "rewards/rejected": 0.0034614093601703644, "step": 179 }, { "epoch": 0.02783684515754881, "grad_norm": 7.824204444885254, "learning_rate": 4.6391752577319593e-07, "logits/chosen": 8.274697303771973, "logits/rejected": 12.119913101196289, "logps/chosen": -322.6107482910156, "logps/rejected": -353.1990966796875, "loss": 0.6565, "rewards/accuracies": 0.625, "rewards/chosen": 0.03774423524737358, "rewards/margins": 0.07958593219518661, "rewards/rejected": -0.04184170067310333, "step": 180 }, { "epoch": 0.02799149429731297, "grad_norm": 7.327850341796875, "learning_rate": 4.664948453608248e-07, "logits/chosen": 13.005859375, "logits/rejected": 8.920368194580078, "logps/chosen": -304.26446533203125, "logps/rejected": -248.5641632080078, "loss": 0.6879, "rewards/accuracies": 0.375, "rewards/chosen": 0.0012749661691486835, "rewards/margins": 0.018329523503780365, "rewards/rejected": -0.017054561525583267, "step": 181 }, { "epoch": 0.02814614343707713, "grad_norm": 4.1393256187438965, "learning_rate": 4.690721649484536e-07, "logits/chosen": 9.603414535522461, "logits/rejected": 6.550224304199219, "logps/chosen": -229.20713806152344, "logps/rejected": -226.37852478027344, "loss": 0.7114, "rewards/accuracies": 0.5, "rewards/chosen": -0.019595623016357422, "rewards/margins": -0.029221773147583008, "rewards/rejected": 0.009626151993870735, "step": 182 }, { "epoch": 0.02830079257684129, "grad_norm": 4.45011568069458, "learning_rate": 4.716494845360825e-07, "logits/chosen": 7.681704044342041, "logits/rejected": 5.8652238845825195, "logps/chosen": -272.65020751953125, "logps/rejected": -244.9304962158203, "loss": 0.6791, "rewards/accuracies": 0.375, "rewards/chosen": 0.0519106425344944, "rewards/margins": 0.03381602466106415, "rewards/rejected": 0.018094610422849655, "step": 183 }, { "epoch": 0.02845544171660545, "grad_norm": 7.654642105102539, "learning_rate": 4.7422680412371136e-07, "logits/chosen": 9.277904510498047, "logits/rejected": 9.229616165161133, "logps/chosen": -276.8092346191406, "logps/rejected": -291.9592590332031, "loss": 0.6564, "rewards/accuracies": 0.875, "rewards/chosen": 0.031248953193426132, "rewards/margins": 0.07713842391967773, "rewards/rejected": -0.0458894744515419, "step": 184 }, { "epoch": 0.028610090856369612, "grad_norm": 6.350154399871826, "learning_rate": 4.7680412371134024e-07, "logits/chosen": 16.259220123291016, "logits/rejected": 13.335022926330566, "logps/chosen": -268.2291259765625, "logps/rejected": -270.746337890625, "loss": 0.7032, "rewards/accuracies": 0.625, "rewards/chosen": -0.030719853937625885, "rewards/margins": -0.015138531103730202, "rewards/rejected": -0.015581320971250534, "step": 185 }, { "epoch": 0.02876473999613377, "grad_norm": 5.605805397033691, "learning_rate": 4.793814432989691e-07, "logits/chosen": 7.675743579864502, "logits/rejected": 4.859321117401123, "logps/chosen": -338.41265869140625, "logps/rejected": -347.5958557128906, "loss": 0.5978, "rewards/accuracies": 0.75, "rewards/chosen": 0.07572059333324432, "rewards/margins": 0.21087026596069336, "rewards/rejected": -0.13514965772628784, "step": 186 }, { "epoch": 0.028919389135897932, "grad_norm": 5.362581729888916, "learning_rate": 4.81958762886598e-07, "logits/chosen": 15.630083084106445, "logits/rejected": 13.24819564819336, "logps/chosen": -297.172119140625, "logps/rejected": -249.78953552246094, "loss": 0.7117, "rewards/accuracies": 0.375, "rewards/chosen": -0.012043287977576256, "rewards/margins": -0.03339576721191406, "rewards/rejected": 0.021352481096982956, "step": 187 }, { "epoch": 0.02907403827566209, "grad_norm": 4.199508190155029, "learning_rate": 4.845360824742269e-07, "logits/chosen": 9.092329025268555, "logits/rejected": 10.078535079956055, "logps/chosen": -225.06993103027344, "logps/rejected": -267.7950439453125, "loss": 0.7125, "rewards/accuracies": 0.25, "rewards/chosen": -0.010802840813994408, "rewards/margins": -0.03644533455371857, "rewards/rejected": 0.02564249187707901, "step": 188 }, { "epoch": 0.029228687415426252, "grad_norm": 3.7857048511505127, "learning_rate": 4.871134020618557e-07, "logits/chosen": 0.11447806656360626, "logits/rejected": 2.68275785446167, "logps/chosen": -143.5679168701172, "logps/rejected": -192.89923095703125, "loss": 0.719, "rewards/accuracies": 0.25, "rewards/chosen": -0.035102128982543945, "rewards/margins": -0.04992819204926491, "rewards/rejected": 0.014826059341430664, "step": 189 }, { "epoch": 0.02938333655519041, "grad_norm": 6.220125675201416, "learning_rate": 4.896907216494846e-07, "logits/chosen": 12.432947158813477, "logits/rejected": 12.735036849975586, "logps/chosen": -249.94854736328125, "logps/rejected": -289.95025634765625, "loss": 0.6844, "rewards/accuracies": 0.625, "rewards/chosen": 0.005805587396025658, "rewards/margins": 0.02017221227288246, "rewards/rejected": -0.014366628602147102, "step": 190 }, { "epoch": 0.029537985694954572, "grad_norm": 4.018984317779541, "learning_rate": 4.922680412371135e-07, "logits/chosen": 11.0584716796875, "logits/rejected": 8.789422988891602, "logps/chosen": -234.2735595703125, "logps/rejected": -233.0352783203125, "loss": 0.7069, "rewards/accuracies": 0.25, "rewards/chosen": 0.01151285320520401, "rewards/margins": -0.023624751716852188, "rewards/rejected": 0.0351376049220562, "step": 191 }, { "epoch": 0.02969263483471873, "grad_norm": 5.126651763916016, "learning_rate": 4.948453608247422e-07, "logits/chosen": 10.567253112792969, "logits/rejected": 4.80112361907959, "logps/chosen": -329.8632507324219, "logps/rejected": -254.94168090820312, "loss": 0.6513, "rewards/accuracies": 0.5, "rewards/chosen": 0.06606252491474152, "rewards/margins": 0.09130598604679108, "rewards/rejected": -0.025243476033210754, "step": 192 }, { "epoch": 0.029847283974482892, "grad_norm": 4.853884220123291, "learning_rate": 4.974226804123711e-07, "logits/chosen": 15.442766189575195, "logits/rejected": 9.313232421875, "logps/chosen": -335.0982971191406, "logps/rejected": -212.04495239257812, "loss": 0.7052, "rewards/accuracies": 0.5, "rewards/chosen": -0.051625922322273254, "rewards/margins": -0.019226882606744766, "rewards/rejected": -0.03239903599023819, "step": 193 }, { "epoch": 0.03000193311424705, "grad_norm": 3.774740695953369, "learning_rate": 5.000000000000001e-07, "logits/chosen": 4.590182304382324, "logits/rejected": 8.030871391296387, "logps/chosen": -155.51739501953125, "logps/rejected": -200.157958984375, "loss": 0.6698, "rewards/accuracies": 0.625, "rewards/chosen": 0.03606202453374863, "rewards/margins": 0.05163295194506645, "rewards/rejected": -0.015570924617350101, "step": 194 }, { "epoch": 0.030156582254011213, "grad_norm": 5.68532133102417, "learning_rate": 5.02577319587629e-07, "logits/chosen": 10.800435066223145, "logits/rejected": 5.137495040893555, "logps/chosen": -477.038818359375, "logps/rejected": -351.06561279296875, "loss": 0.6281, "rewards/accuracies": 0.875, "rewards/chosen": 0.04695453867316246, "rewards/margins": 0.1407017707824707, "rewards/rejected": -0.09374723583459854, "step": 195 }, { "epoch": 0.03031123139377537, "grad_norm": 5.503288269042969, "learning_rate": 5.051546391752578e-07, "logits/chosen": 8.266733169555664, "logits/rejected": 8.95901870727539, "logps/chosen": -233.74664306640625, "logps/rejected": -250.41323852539062, "loss": 0.7025, "rewards/accuracies": 0.5, "rewards/chosen": -0.005411338992416859, "rewards/margins": -0.01662764512002468, "rewards/rejected": 0.011216306127607822, "step": 196 }, { "epoch": 0.030465880533539533, "grad_norm": 5.579014778137207, "learning_rate": 5.077319587628866e-07, "logits/chosen": 9.309625625610352, "logits/rejected": 3.965503692626953, "logps/chosen": -299.0317077636719, "logps/rejected": -270.5412902832031, "loss": 0.6949, "rewards/accuracies": 0.25, "rewards/chosen": 0.04466180503368378, "rewards/margins": 0.0016595353372395039, "rewards/rejected": 0.04300227016210556, "step": 197 }, { "epoch": 0.03062052967330369, "grad_norm": 5.246983528137207, "learning_rate": 5.103092783505155e-07, "logits/chosen": 12.451875686645508, "logits/rejected": 13.854093551635742, "logps/chosen": -242.00267028808594, "logps/rejected": -255.1998291015625, "loss": 0.7272, "rewards/accuracies": 0.25, "rewards/chosen": -0.02913236990571022, "rewards/margins": -0.06588325649499893, "rewards/rejected": 0.03675089031457901, "step": 198 }, { "epoch": 0.030775178813067853, "grad_norm": 4.322015762329102, "learning_rate": 5.128865979381443e-07, "logits/chosen": 10.467855453491211, "logits/rejected": 5.929073333740234, "logps/chosen": -309.23040771484375, "logps/rejected": -255.4852294921875, "loss": 0.6511, "rewards/accuracies": 0.75, "rewards/chosen": 0.05621838942170143, "rewards/margins": 0.08871493488550186, "rewards/rejected": -0.03249654546380043, "step": 199 }, { "epoch": 0.03092982795283201, "grad_norm": 5.768648147583008, "learning_rate": 5.154639175257732e-07, "logits/chosen": 8.117884635925293, "logits/rejected": 2.875415802001953, "logps/chosen": -289.96588134765625, "logps/rejected": -215.07467651367188, "loss": 0.7327, "rewards/accuracies": 0.375, "rewards/chosen": -0.02657940238714218, "rewards/margins": -0.07221557199954987, "rewards/rejected": 0.04563617706298828, "step": 200 }, { "epoch": 0.031084477092596173, "grad_norm": 5.3595075607299805, "learning_rate": 5.180412371134022e-07, "logits/chosen": 12.535530090332031, "logits/rejected": 8.698311805725098, "logps/chosen": -297.65924072265625, "logps/rejected": -240.9003448486328, "loss": 0.6846, "rewards/accuracies": 0.625, "rewards/chosen": 0.03645692020654678, "rewards/margins": 0.02216940000653267, "rewards/rejected": 0.01428751926869154, "step": 201 }, { "epoch": 0.03123912623236033, "grad_norm": 5.111512184143066, "learning_rate": 5.20618556701031e-07, "logits/chosen": 9.871399879455566, "logits/rejected": 14.595333099365234, "logps/chosen": -206.845703125, "logps/rejected": -191.87661743164062, "loss": 0.7088, "rewards/accuracies": 0.375, "rewards/chosen": 0.005719520151615143, "rewards/margins": -0.029749490320682526, "rewards/rejected": 0.03546901047229767, "step": 202 }, { "epoch": 0.03139377537212449, "grad_norm": 5.261501312255859, "learning_rate": 5.231958762886598e-07, "logits/chosen": 7.539558410644531, "logits/rejected": 6.129846572875977, "logps/chosen": -231.51771545410156, "logps/rejected": -193.36862182617188, "loss": 0.7117, "rewards/accuracies": 0.25, "rewards/chosen": 0.005268574692308903, "rewards/margins": -0.03355374187231064, "rewards/rejected": 0.03882231563329697, "step": 203 }, { "epoch": 0.031548424511888655, "grad_norm": 5.315875053405762, "learning_rate": 5.257731958762887e-07, "logits/chosen": 10.683394432067871, "logits/rejected": 5.820173740386963, "logps/chosen": -249.38955688476562, "logps/rejected": -200.80557250976562, "loss": 0.7192, "rewards/accuracies": 0.5, "rewards/chosen": 0.012097455561161041, "rewards/margins": -0.04487933963537216, "rewards/rejected": 0.0569767951965332, "step": 204 }, { "epoch": 0.03170307365165281, "grad_norm": 4.840034484863281, "learning_rate": 5.283505154639176e-07, "logits/chosen": 13.294004440307617, "logits/rejected": 4.9387664794921875, "logps/chosen": -343.0023193359375, "logps/rejected": -164.44154357910156, "loss": 0.6756, "rewards/accuracies": 0.625, "rewards/chosen": 0.02777595818042755, "rewards/margins": 0.038086939603090286, "rewards/rejected": -0.010310984216630459, "step": 205 }, { "epoch": 0.03185772279141697, "grad_norm": 8.74624252319336, "learning_rate": 5.309278350515464e-07, "logits/chosen": 7.686714172363281, "logits/rejected": 9.148231506347656, "logps/chosen": -210.98745727539062, "logps/rejected": -289.07427978515625, "loss": 0.7007, "rewards/accuracies": 0.75, "rewards/chosen": -0.01983022876083851, "rewards/margins": -0.009432818740606308, "rewards/rejected": -0.0103974100202322, "step": 206 }, { "epoch": 0.03201237193118113, "grad_norm": 5.937737464904785, "learning_rate": 5.335051546391753e-07, "logits/chosen": 7.729095458984375, "logits/rejected": 5.516740798950195, "logps/chosen": -329.46875, "logps/rejected": -308.0854187011719, "loss": 0.6882, "rewards/accuracies": 0.625, "rewards/chosen": 0.0182005874812603, "rewards/margins": 0.010803531855344772, "rewards/rejected": 0.007397056557238102, "step": 207 }, { "epoch": 0.032167021070945295, "grad_norm": 3.5201690196990967, "learning_rate": 5.360824742268042e-07, "logits/chosen": 12.320011138916016, "logits/rejected": 7.052798271179199, "logps/chosen": -216.28662109375, "logps/rejected": -148.6501922607422, "loss": 0.6765, "rewards/accuracies": 0.625, "rewards/chosen": 0.0409000888466835, "rewards/margins": 0.041211411356925964, "rewards/rejected": -0.0003113262355327606, "step": 208 }, { "epoch": 0.032321670210709454, "grad_norm": 3.8653488159179688, "learning_rate": 5.386597938144331e-07, "logits/chosen": 11.556921005249023, "logits/rejected": 12.459211349487305, "logps/chosen": -202.27687072753906, "logps/rejected": -191.4716796875, "loss": 0.6586, "rewards/accuracies": 0.625, "rewards/chosen": 0.004669379908591509, "rewards/margins": 0.07291088253259659, "rewards/rejected": -0.06824149936437607, "step": 209 }, { "epoch": 0.03247631935047361, "grad_norm": 7.129583835601807, "learning_rate": 5.412371134020619e-07, "logits/chosen": 7.569952964782715, "logits/rejected": 2.0636281967163086, "logps/chosen": -320.1714172363281, "logps/rejected": -294.3085021972656, "loss": 0.7065, "rewards/accuracies": 0.75, "rewards/chosen": -0.02710290253162384, "rewards/margins": -0.020142219960689545, "rewards/rejected": -0.006960679776966572, "step": 210 }, { "epoch": 0.03263096849023777, "grad_norm": 4.896057605743408, "learning_rate": 5.438144329896908e-07, "logits/chosen": 14.750947952270508, "logits/rejected": 12.198291778564453, "logps/chosen": -309.416259765625, "logps/rejected": -275.0567626953125, "loss": 0.6486, "rewards/accuracies": 0.75, "rewards/chosen": 0.0052623748779296875, "rewards/margins": 0.09564967453479767, "rewards/rejected": -0.09038729965686798, "step": 211 }, { "epoch": 0.032785617630001936, "grad_norm": 5.7905168533325195, "learning_rate": 5.463917525773197e-07, "logits/chosen": 5.347293853759766, "logits/rejected": 5.936334133148193, "logps/chosen": -276.7212829589844, "logps/rejected": -216.69508361816406, "loss": 0.7259, "rewards/accuracies": 0.375, "rewards/chosen": -0.05456046760082245, "rewards/margins": -0.05719555914402008, "rewards/rejected": 0.002635098062455654, "step": 212 }, { "epoch": 0.032940266769766094, "grad_norm": 6.859431743621826, "learning_rate": 5.489690721649485e-07, "logits/chosen": 9.798736572265625, "logits/rejected": 7.827033996582031, "logps/chosen": -310.09796142578125, "logps/rejected": -202.17762756347656, "loss": 0.6818, "rewards/accuracies": 0.75, "rewards/chosen": 0.030898097902536392, "rewards/margins": 0.033220648765563965, "rewards/rejected": -0.0023225508630275726, "step": 213 }, { "epoch": 0.03309491590953025, "grad_norm": 4.4510273933410645, "learning_rate": 5.515463917525773e-07, "logits/chosen": 8.931305885314941, "logits/rejected": 7.163075923919678, "logps/chosen": -142.5396728515625, "logps/rejected": -132.72410583496094, "loss": 0.6885, "rewards/accuracies": 0.625, "rewards/chosen": 0.01383285690099001, "rewards/margins": 0.01216497365385294, "rewards/rejected": 0.0016678813844919205, "step": 214 }, { "epoch": 0.03324956504929441, "grad_norm": 4.577381134033203, "learning_rate": 5.541237113402062e-07, "logits/chosen": 6.717674255371094, "logits/rejected": 8.618230819702148, "logps/chosen": -213.08270263671875, "logps/rejected": -289.16326904296875, "loss": 0.7375, "rewards/accuracies": 0.375, "rewards/chosen": -0.04158239811658859, "rewards/margins": -0.07986173778772354, "rewards/rejected": 0.03827934339642525, "step": 215 }, { "epoch": 0.033404214189058576, "grad_norm": 6.901246070861816, "learning_rate": 5.567010309278352e-07, "logits/chosen": 10.95913028717041, "logits/rejected": 10.665098190307617, "logps/chosen": -184.7791748046875, "logps/rejected": -208.35800170898438, "loss": 0.6516, "rewards/accuracies": 0.75, "rewards/chosen": 0.03146687150001526, "rewards/margins": 0.08955011516809464, "rewards/rejected": -0.05808325111865997, "step": 216 }, { "epoch": 0.033558863328822734, "grad_norm": 5.082674026489258, "learning_rate": 5.59278350515464e-07, "logits/chosen": 14.751993179321289, "logits/rejected": 6.697968482971191, "logps/chosen": -324.7636413574219, "logps/rejected": -241.14578247070312, "loss": 0.6883, "rewards/accuracies": 0.5, "rewards/chosen": -0.048441458493471146, "rewards/margins": 0.015271376818418503, "rewards/rejected": -0.06371283531188965, "step": 217 }, { "epoch": 0.03371351246858689, "grad_norm": 3.8480961322784424, "learning_rate": 5.618556701030928e-07, "logits/chosen": 10.180124282836914, "logits/rejected": 12.340204238891602, "logps/chosen": -134.39517211914062, "logps/rejected": -149.95010375976562, "loss": 0.7153, "rewards/accuracies": 0.375, "rewards/chosen": -0.004220293834805489, "rewards/margins": -0.04071822762489319, "rewards/rejected": 0.03649792820215225, "step": 218 }, { "epoch": 0.03386816160835105, "grad_norm": 4.407406806945801, "learning_rate": 5.644329896907217e-07, "logits/chosen": 16.62118148803711, "logits/rejected": 7.807868957519531, "logps/chosen": -354.60162353515625, "logps/rejected": -201.27975463867188, "loss": 0.714, "rewards/accuracies": 0.5, "rewards/chosen": 0.028145790100097656, "rewards/margins": -0.03343295678496361, "rewards/rejected": 0.06157875061035156, "step": 219 }, { "epoch": 0.034022810748115216, "grad_norm": 4.289771556854248, "learning_rate": 5.670103092783505e-07, "logits/chosen": 7.658382415771484, "logits/rejected": 5.6317572593688965, "logps/chosen": -205.30947875976562, "logps/rejected": -259.0213623046875, "loss": 0.6738, "rewards/accuracies": 0.5, "rewards/chosen": 0.03346576914191246, "rewards/margins": 0.042540889233350754, "rewards/rejected": -0.009075116366147995, "step": 220 }, { "epoch": 0.034177459887879375, "grad_norm": 3.378429651260376, "learning_rate": 5.695876288659794e-07, "logits/chosen": 5.626931190490723, "logits/rejected": 5.373292922973633, "logps/chosen": -171.7481689453125, "logps/rejected": -139.57618713378906, "loss": 0.7024, "rewards/accuracies": 0.375, "rewards/chosen": -0.008703136816620827, "rewards/margins": -0.017125798389315605, "rewards/rejected": 0.008422663435339928, "step": 221 }, { "epoch": 0.03433210902764353, "grad_norm": 4.4845075607299805, "learning_rate": 5.721649484536083e-07, "logits/chosen": 3.8155980110168457, "logits/rejected": 8.531209945678711, "logps/chosen": -170.51113891601562, "logps/rejected": -235.72474670410156, "loss": 0.7049, "rewards/accuracies": 0.625, "rewards/chosen": 0.005229093134403229, "rewards/margins": -0.018719002604484558, "rewards/rejected": 0.023948095738887787, "step": 222 }, { "epoch": 0.03448675816740769, "grad_norm": 5.294673919677734, "learning_rate": 5.747422680412372e-07, "logits/chosen": 6.5246686935424805, "logits/rejected": 9.144453048706055, "logps/chosen": -277.46026611328125, "logps/rejected": -325.8945007324219, "loss": 0.6959, "rewards/accuracies": 0.375, "rewards/chosen": -0.011956976726651192, "rewards/margins": 0.002624470740556717, "rewards/rejected": -0.01458144560456276, "step": 223 }, { "epoch": 0.034641407307171856, "grad_norm": 4.342596054077148, "learning_rate": 5.77319587628866e-07, "logits/chosen": 12.956892013549805, "logits/rejected": 6.823075294494629, "logps/chosen": -302.9952087402344, "logps/rejected": -165.18283081054688, "loss": 0.6869, "rewards/accuracies": 0.375, "rewards/chosen": -0.030018998309969902, "rewards/margins": 0.017669297754764557, "rewards/rejected": -0.04768829792737961, "step": 224 }, { "epoch": 0.034796056446936015, "grad_norm": 4.966875076293945, "learning_rate": 5.798969072164949e-07, "logits/chosen": 10.153735160827637, "logits/rejected": 7.376662254333496, "logps/chosen": -285.5647888183594, "logps/rejected": -221.4811553955078, "loss": 0.7308, "rewards/accuracies": 0.25, "rewards/chosen": -0.006377171725034714, "rewards/margins": -0.07181944698095322, "rewards/rejected": 0.0654422789812088, "step": 225 }, { "epoch": 0.03495070558670017, "grad_norm": 4.671102523803711, "learning_rate": 5.824742268041238e-07, "logits/chosen": 13.67463207244873, "logits/rejected": 5.030215263366699, "logps/chosen": -282.1994934082031, "logps/rejected": -198.82147216796875, "loss": 0.6852, "rewards/accuracies": 0.5, "rewards/chosen": 0.0008883010596036911, "rewards/margins": 0.022807549685239792, "rewards/rejected": -0.02191925048828125, "step": 226 }, { "epoch": 0.03510535472646433, "grad_norm": 6.9201436042785645, "learning_rate": 5.850515463917526e-07, "logits/chosen": 3.512515068054199, "logits/rejected": 7.149104595184326, "logps/chosen": -278.8149719238281, "logps/rejected": -391.1086730957031, "loss": 0.7129, "rewards/accuracies": 0.5, "rewards/chosen": -0.027152635157108307, "rewards/margins": -0.02888527885079384, "rewards/rejected": 0.0017326362431049347, "step": 227 }, { "epoch": 0.0352600038662285, "grad_norm": 3.4967703819274902, "learning_rate": 5.876288659793815e-07, "logits/chosen": 15.019588470458984, "logits/rejected": 11.224769592285156, "logps/chosen": -215.16358947753906, "logps/rejected": -185.71273803710938, "loss": 0.6679, "rewards/accuracies": 0.75, "rewards/chosen": 0.02270641177892685, "rewards/margins": 0.05463428422808647, "rewards/rejected": -0.03192787244915962, "step": 228 }, { "epoch": 0.035414653005992655, "grad_norm": 5.382562637329102, "learning_rate": 5.902061855670104e-07, "logits/chosen": 6.3345417976379395, "logits/rejected": 6.99592399597168, "logps/chosen": -278.01287841796875, "logps/rejected": -231.80795288085938, "loss": 0.7005, "rewards/accuracies": 0.375, "rewards/chosen": -0.03036472760140896, "rewards/margins": -0.013205980882048607, "rewards/rejected": -0.01715874671936035, "step": 229 }, { "epoch": 0.03556930214575681, "grad_norm": 4.758025646209717, "learning_rate": 5.927835051546392e-07, "logits/chosen": 8.463251113891602, "logits/rejected": 12.116561889648438, "logps/chosen": -191.79452514648438, "logps/rejected": -224.87753295898438, "loss": 0.7104, "rewards/accuracies": 0.25, "rewards/chosen": -0.002331519266590476, "rewards/margins": -0.033104680478572845, "rewards/rejected": 0.030773162841796875, "step": 230 }, { "epoch": 0.03572395128552097, "grad_norm": 3.2386176586151123, "learning_rate": 5.95360824742268e-07, "logits/chosen": 9.068927764892578, "logits/rejected": 9.351752281188965, "logps/chosen": -136.34466552734375, "logps/rejected": -155.36605834960938, "loss": 0.6806, "rewards/accuracies": 0.375, "rewards/chosen": 0.004687094129621983, "rewards/margins": 0.026478338986635208, "rewards/rejected": -0.02179124392569065, "step": 231 }, { "epoch": 0.03587860042528514, "grad_norm": 5.229002475738525, "learning_rate": 5.97938144329897e-07, "logits/chosen": 14.541337013244629, "logits/rejected": 5.303619384765625, "logps/chosen": -345.19927978515625, "logps/rejected": -286.30828857421875, "loss": 0.6369, "rewards/accuracies": 0.875, "rewards/chosen": 0.08268857002258301, "rewards/margins": 0.11939717084169388, "rewards/rejected": -0.03670859336853027, "step": 232 }, { "epoch": 0.036033249565049295, "grad_norm": 6.893274784088135, "learning_rate": 6.005154639175259e-07, "logits/chosen": 14.66867733001709, "logits/rejected": 9.734107971191406, "logps/chosen": -410.5635986328125, "logps/rejected": -300.903076171875, "loss": 0.7371, "rewards/accuracies": 0.25, "rewards/chosen": -0.07901516556739807, "rewards/margins": -0.07119274139404297, "rewards/rejected": -0.007822412997484207, "step": 233 }, { "epoch": 0.036187898704813454, "grad_norm": 3.7327115535736084, "learning_rate": 6.030927835051547e-07, "logits/chosen": 7.433411598205566, "logits/rejected": 9.584771156311035, "logps/chosen": -154.3632049560547, "logps/rejected": -192.30227661132812, "loss": 0.6981, "rewards/accuracies": 0.375, "rewards/chosen": -0.01377263106405735, "rewards/margins": -0.008358431980013847, "rewards/rejected": -0.005414200946688652, "step": 234 }, { "epoch": 0.03634254784457761, "grad_norm": 4.205161094665527, "learning_rate": 6.056701030927835e-07, "logits/chosen": 8.071340560913086, "logits/rejected": 8.306831359863281, "logps/chosen": -181.2935791015625, "logps/rejected": -179.22610473632812, "loss": 0.7108, "rewards/accuracies": 0.25, "rewards/chosen": -0.003616809844970703, "rewards/margins": -0.0334671288728714, "rewards/rejected": 0.029850320890545845, "step": 235 }, { "epoch": 0.03649719698434178, "grad_norm": 4.360382080078125, "learning_rate": 6.082474226804124e-07, "logits/chosen": 11.336736679077148, "logits/rejected": 9.09284496307373, "logps/chosen": -337.3224182128906, "logps/rejected": -279.7801208496094, "loss": 0.7039, "rewards/accuracies": 0.375, "rewards/chosen": 0.018046284094452858, "rewards/margins": -0.020105741918087006, "rewards/rejected": 0.03815202787518501, "step": 236 }, { "epoch": 0.036651846124105936, "grad_norm": 6.259074687957764, "learning_rate": 6.108247422680413e-07, "logits/chosen": 8.761602401733398, "logits/rejected": 10.941136360168457, "logps/chosen": -396.19451904296875, "logps/rejected": -382.68408203125, "loss": 0.7346, "rewards/accuracies": 0.125, "rewards/chosen": -0.018232012167572975, "rewards/margins": -0.07538881152868271, "rewards/rejected": 0.05715680494904518, "step": 237 }, { "epoch": 0.036806495263870094, "grad_norm": 6.0970988273620605, "learning_rate": 6.134020618556701e-07, "logits/chosen": 6.664066791534424, "logits/rejected": 4.940464973449707, "logps/chosen": -354.9864501953125, "logps/rejected": -308.2416687011719, "loss": 0.6844, "rewards/accuracies": 0.625, "rewards/chosen": -0.002811622340232134, "rewards/margins": 0.020966840907931328, "rewards/rejected": -0.023778462782502174, "step": 238 }, { "epoch": 0.03696114440363425, "grad_norm": 4.786613941192627, "learning_rate": 6.15979381443299e-07, "logits/chosen": 10.859687805175781, "logits/rejected": 12.618412017822266, "logps/chosen": -223.54574584960938, "logps/rejected": -236.51113891601562, "loss": 0.705, "rewards/accuracies": 0.5, "rewards/chosen": -0.04124817997217178, "rewards/margins": -0.017923451960086823, "rewards/rejected": -0.02332472801208496, "step": 239 }, { "epoch": 0.03711579354339842, "grad_norm": 6.639766693115234, "learning_rate": 6.185567010309279e-07, "logits/chosen": 4.527590751647949, "logits/rejected": 4.626595497131348, "logps/chosen": -315.76593017578125, "logps/rejected": -259.0128173828125, "loss": 0.7226, "rewards/accuracies": 0.375, "rewards/chosen": -0.024262474849820137, "rewards/margins": -0.055610038340091705, "rewards/rejected": 0.03134756162762642, "step": 240 }, { "epoch": 0.037270442683162576, "grad_norm": 5.480623722076416, "learning_rate": 6.211340206185567e-07, "logits/chosen": 12.011322021484375, "logits/rejected": 15.609920501708984, "logps/chosen": -293.7601318359375, "logps/rejected": -340.34759521484375, "loss": 0.7107, "rewards/accuracies": 0.375, "rewards/chosen": 0.01647777482867241, "rewards/margins": -0.02978820912539959, "rewards/rejected": 0.04626598209142685, "step": 241 }, { "epoch": 0.037425091822926734, "grad_norm": 5.76165771484375, "learning_rate": 6.237113402061856e-07, "logits/chosen": 12.008529663085938, "logits/rejected": 5.540915489196777, "logps/chosen": -328.7164001464844, "logps/rejected": -246.66229248046875, "loss": 0.7215, "rewards/accuracies": 0.375, "rewards/chosen": -0.03191695362329483, "rewards/margins": -0.05326266586780548, "rewards/rejected": 0.02134571224451065, "step": 242 }, { "epoch": 0.03757974096269089, "grad_norm": 5.17066764831543, "learning_rate": 6.262886597938145e-07, "logits/chosen": 9.700451850891113, "logits/rejected": 8.016971588134766, "logps/chosen": -217.6426239013672, "logps/rejected": -221.5758056640625, "loss": 0.6952, "rewards/accuracies": 0.5, "rewards/chosen": -0.013753032311797142, "rewards/margins": -0.002923298627138138, "rewards/rejected": -0.010829731822013855, "step": 243 }, { "epoch": 0.03773439010245506, "grad_norm": 11.010458946228027, "learning_rate": 6.288659793814434e-07, "logits/chosen": 7.685364723205566, "logits/rejected": 1.084984302520752, "logps/chosen": -314.3756103515625, "logps/rejected": -188.3623504638672, "loss": 0.7075, "rewards/accuracies": 0.625, "rewards/chosen": 0.007253644987940788, "rewards/margins": -0.021234802901744843, "rewards/rejected": 0.028488444164395332, "step": 244 }, { "epoch": 0.037889039242219216, "grad_norm": 5.415032386779785, "learning_rate": 6.314432989690722e-07, "logits/chosen": 9.269636154174805, "logits/rejected": 10.793008804321289, "logps/chosen": -294.50616455078125, "logps/rejected": -287.6456298828125, "loss": 0.6644, "rewards/accuracies": 0.5, "rewards/chosen": 0.03605537861585617, "rewards/margins": 0.06008339300751686, "rewards/rejected": -0.02402801625430584, "step": 245 }, { "epoch": 0.038043688381983375, "grad_norm": 5.652324676513672, "learning_rate": 6.340206185567011e-07, "logits/chosen": 12.311849594116211, "logits/rejected": 7.074306964874268, "logps/chosen": -409.453125, "logps/rejected": -253.30751037597656, "loss": 0.6632, "rewards/accuracies": 0.75, "rewards/chosen": 0.0446925163269043, "rewards/margins": 0.06195654720067978, "rewards/rejected": -0.01726403459906578, "step": 246 }, { "epoch": 0.03819833752174753, "grad_norm": 4.350164413452148, "learning_rate": 6.365979381443299e-07, "logits/chosen": 6.472908020019531, "logits/rejected": 3.3387234210968018, "logps/chosen": -300.912353515625, "logps/rejected": -208.2216796875, "loss": 0.6648, "rewards/accuracies": 0.625, "rewards/chosen": -0.014641453512012959, "rewards/margins": 0.060414668172597885, "rewards/rejected": -0.07505612075328827, "step": 247 }, { "epoch": 0.0383529866615117, "grad_norm": 4.907051086425781, "learning_rate": 6.391752577319587e-07, "logits/chosen": 6.359959602355957, "logits/rejected": 8.720989227294922, "logps/chosen": -236.83102416992188, "logps/rejected": -220.8046417236328, "loss": 0.6988, "rewards/accuracies": 0.625, "rewards/chosen": -0.011784838512539864, "rewards/margins": -0.0011364268139004707, "rewards/rejected": -0.010648416355252266, "step": 248 }, { "epoch": 0.03850763580127586, "grad_norm": 5.66964864730835, "learning_rate": 6.417525773195876e-07, "logits/chosen": 17.11200714111328, "logits/rejected": 17.57602882385254, "logps/chosen": -354.4012451171875, "logps/rejected": -323.81201171875, "loss": 0.6798, "rewards/accuracies": 0.625, "rewards/chosen": 0.05915484577417374, "rewards/margins": 0.03593145310878754, "rewards/rejected": 0.023223400115966797, "step": 249 }, { "epoch": 0.038662284941040015, "grad_norm": 4.988640785217285, "learning_rate": 6.443298969072165e-07, "logits/chosen": 11.416837692260742, "logits/rejected": 8.00868034362793, "logps/chosen": -242.99166870117188, "logps/rejected": -235.29714965820312, "loss": 0.6918, "rewards/accuracies": 0.5, "rewards/chosen": -0.028706837445497513, "rewards/margins": 0.004238747991621494, "rewards/rejected": -0.03294558823108673, "step": 250 }, { "epoch": 0.03881693408080417, "grad_norm": 4.878240585327148, "learning_rate": 6.469072164948455e-07, "logits/chosen": 5.145282745361328, "logits/rejected": 4.16807746887207, "logps/chosen": -214.65066528320312, "logps/rejected": -169.9105224609375, "loss": 0.6899, "rewards/accuracies": 0.625, "rewards/chosen": -0.01110463123768568, "rewards/margins": 0.013231130316853523, "rewards/rejected": -0.024335766211152077, "step": 251 }, { "epoch": 0.03897158322056834, "grad_norm": 4.538815498352051, "learning_rate": 6.494845360824743e-07, "logits/chosen": 11.637895584106445, "logits/rejected": 13.668797492980957, "logps/chosen": -278.2616882324219, "logps/rejected": -258.5806884765625, "loss": 0.7241, "rewards/accuracies": 0.375, "rewards/chosen": -0.03656420856714249, "rewards/margins": -0.0572175532579422, "rewards/rejected": 0.020653344690799713, "step": 252 }, { "epoch": 0.0391262323603325, "grad_norm": 4.906022548675537, "learning_rate": 6.520618556701032e-07, "logits/chosen": 9.867277145385742, "logits/rejected": 7.840575218200684, "logps/chosen": -349.86749267578125, "logps/rejected": -285.179443359375, "loss": 0.6982, "rewards/accuracies": 0.25, "rewards/chosen": -0.03592377156019211, "rewards/margins": -0.004194688051939011, "rewards/rejected": -0.0317290797829628, "step": 253 }, { "epoch": 0.039280881500096655, "grad_norm": 6.599225044250488, "learning_rate": 6.546391752577321e-07, "logits/chosen": 15.538134574890137, "logits/rejected": 11.52098274230957, "logps/chosen": -367.9677734375, "logps/rejected": -349.4126281738281, "loss": 0.6764, "rewards/accuracies": 0.75, "rewards/chosen": 0.03877105936408043, "rewards/margins": 0.03812885284423828, "rewards/rejected": 0.0006422027945518494, "step": 254 }, { "epoch": 0.039435530639860814, "grad_norm": 5.181053638458252, "learning_rate": 6.57216494845361e-07, "logits/chosen": 12.436885833740234, "logits/rejected": 9.671371459960938, "logps/chosen": -382.39788818359375, "logps/rejected": -313.9125061035156, "loss": 0.7045, "rewards/accuracies": 0.5, "rewards/chosen": 0.004657313227653503, "rewards/margins": -0.014827582985162735, "rewards/rejected": 0.019484899938106537, "step": 255 }, { "epoch": 0.03959017977962498, "grad_norm": 5.534814357757568, "learning_rate": 6.597938144329897e-07, "logits/chosen": 11.357664108276367, "logits/rejected": 1.0162296295166016, "logps/chosen": -395.32830810546875, "logps/rejected": -249.87985229492188, "loss": 0.6698, "rewards/accuracies": 0.625, "rewards/chosen": -0.020236967131495476, "rewards/margins": 0.05510273575782776, "rewards/rejected": -0.07533970475196838, "step": 256 }, { "epoch": 0.03974482891938914, "grad_norm": 5.347562313079834, "learning_rate": 6.623711340206186e-07, "logits/chosen": 15.50752067565918, "logits/rejected": 7.486250877380371, "logps/chosen": -333.0701904296875, "logps/rejected": -243.05026245117188, "loss": 0.7043, "rewards/accuracies": 0.5, "rewards/chosen": -0.019951246678829193, "rewards/margins": -0.01807079091668129, "rewards/rejected": -0.0018804557621479034, "step": 257 }, { "epoch": 0.039899478059153295, "grad_norm": 4.464232444763184, "learning_rate": 6.649484536082475e-07, "logits/chosen": 5.619850158691406, "logits/rejected": 1.327756643295288, "logps/chosen": -208.21044921875, "logps/rejected": -188.10911560058594, "loss": 0.6919, "rewards/accuracies": 0.625, "rewards/chosen": -0.0006053922697901726, "rewards/margins": 0.004115653224289417, "rewards/rejected": -0.0047210450284183025, "step": 258 }, { "epoch": 0.040054127198917454, "grad_norm": 6.004766464233398, "learning_rate": 6.675257731958763e-07, "logits/chosen": 11.703359603881836, "logits/rejected": 9.75662612915039, "logps/chosen": -456.85687255859375, "logps/rejected": -357.17144775390625, "loss": 0.715, "rewards/accuracies": 0.5, "rewards/chosen": 0.047693539410829544, "rewards/margins": -0.03385457396507263, "rewards/rejected": 0.08154811710119247, "step": 259 }, { "epoch": 0.04020877633868162, "grad_norm": 5.586493015289307, "learning_rate": 6.701030927835052e-07, "logits/chosen": 13.049908638000488, "logits/rejected": 13.037406921386719, "logps/chosen": -289.798828125, "logps/rejected": -279.8524169921875, "loss": 0.7376, "rewards/accuracies": 0.125, "rewards/chosen": 0.0055467598140239716, "rewards/margins": -0.0849161148071289, "rewards/rejected": 0.09046287834644318, "step": 260 }, { "epoch": 0.04036342547844578, "grad_norm": 5.428988456726074, "learning_rate": 6.726804123711341e-07, "logits/chosen": 13.159706115722656, "logits/rejected": 16.09222412109375, "logps/chosen": -339.9249267578125, "logps/rejected": -432.504638671875, "loss": 0.7467, "rewards/accuracies": 0.375, "rewards/chosen": -0.030592726543545723, "rewards/margins": -0.09264698624610901, "rewards/rejected": 0.06205425783991814, "step": 261 }, { "epoch": 0.040518074618209936, "grad_norm": 3.8464701175689697, "learning_rate": 6.752577319587629e-07, "logits/chosen": 11.381383895874023, "logits/rejected": 3.2776682376861572, "logps/chosen": -203.27291870117188, "logps/rejected": -171.57546997070312, "loss": 0.7256, "rewards/accuracies": 0.25, "rewards/chosen": -0.0026408201083540916, "rewards/margins": -0.06274910271167755, "rewards/rejected": 0.060108281672000885, "step": 262 }, { "epoch": 0.040672723757974094, "grad_norm": 4.425137519836426, "learning_rate": 6.778350515463917e-07, "logits/chosen": 14.23583698272705, "logits/rejected": 5.952187538146973, "logps/chosen": -278.9794921875, "logps/rejected": -276.9984436035156, "loss": 0.7253, "rewards/accuracies": 0.125, "rewards/chosen": -0.03361053392291069, "rewards/margins": -0.061350345611572266, "rewards/rejected": 0.027739809826016426, "step": 263 }, { "epoch": 0.04082737289773826, "grad_norm": 6.176183700561523, "learning_rate": 6.804123711340206e-07, "logits/chosen": 7.840469837188721, "logits/rejected": 8.058832168579102, "logps/chosen": -375.3706359863281, "logps/rejected": -321.70721435546875, "loss": 0.7301, "rewards/accuracies": 0.375, "rewards/chosen": -0.025685694068670273, "rewards/margins": -0.06539702415466309, "rewards/rejected": 0.03971133381128311, "step": 264 }, { "epoch": 0.04098202203750242, "grad_norm": 6.129113674163818, "learning_rate": 6.829896907216495e-07, "logits/chosen": 16.604860305786133, "logits/rejected": 7.964756965637207, "logps/chosen": -387.2036437988281, "logps/rejected": -248.52195739746094, "loss": 0.731, "rewards/accuracies": 0.5, "rewards/chosen": -0.06861067563295364, "rewards/margins": -0.06551866978406906, "rewards/rejected": -0.003092002123594284, "step": 265 }, { "epoch": 0.041136671177266576, "grad_norm": 5.237852096557617, "learning_rate": 6.855670103092784e-07, "logits/chosen": 8.166175842285156, "logits/rejected": 7.472873210906982, "logps/chosen": -352.65679931640625, "logps/rejected": -317.54736328125, "loss": 0.6935, "rewards/accuracies": 0.375, "rewards/chosen": -0.008534002117812634, "rewards/margins": 0.010436061769723892, "rewards/rejected": -0.018970057368278503, "step": 266 }, { "epoch": 0.041291320317030734, "grad_norm": 4.047346115112305, "learning_rate": 6.881443298969073e-07, "logits/chosen": 10.062591552734375, "logits/rejected": 7.389678478240967, "logps/chosen": -203.36083984375, "logps/rejected": -213.27920532226562, "loss": 0.6698, "rewards/accuracies": 0.625, "rewards/chosen": 0.047116756439208984, "rewards/margins": 0.051412105560302734, "rewards/rejected": -0.00429534912109375, "step": 267 }, { "epoch": 0.0414459694567949, "grad_norm": 4.589973449707031, "learning_rate": 6.907216494845362e-07, "logits/chosen": 9.534613609313965, "logits/rejected": 5.6274333000183105, "logps/chosen": -252.291015625, "logps/rejected": -247.4940948486328, "loss": 0.6792, "rewards/accuracies": 1.0, "rewards/chosen": -0.019533302634954453, "rewards/margins": 0.02815093845129013, "rewards/rejected": -0.047684237360954285, "step": 268 }, { "epoch": 0.04160061859655906, "grad_norm": 6.799015045166016, "learning_rate": 6.93298969072165e-07, "logits/chosen": 7.078848361968994, "logits/rejected": 11.0949068069458, "logps/chosen": -197.40658569335938, "logps/rejected": -237.89590454101562, "loss": 0.6889, "rewards/accuracies": 0.5, "rewards/chosen": 0.031607821583747864, "rewards/margins": 0.010406684130430222, "rewards/rejected": 0.021201133728027344, "step": 269 }, { "epoch": 0.041755267736323216, "grad_norm": 4.157468318939209, "learning_rate": 6.958762886597939e-07, "logits/chosen": 7.125399589538574, "logits/rejected": 11.697006225585938, "logps/chosen": -170.443359375, "logps/rejected": -197.20567321777344, "loss": 0.7335, "rewards/accuracies": 0.25, "rewards/chosen": -0.0439486987888813, "rewards/margins": -0.07237127423286438, "rewards/rejected": 0.02842256985604763, "step": 270 }, { "epoch": 0.041909916876087375, "grad_norm": 5.464375019073486, "learning_rate": 6.984536082474228e-07, "logits/chosen": 10.274873733520508, "logits/rejected": 7.8898773193359375, "logps/chosen": -222.9231414794922, "logps/rejected": -151.10484313964844, "loss": 0.7088, "rewards/accuracies": 0.375, "rewards/chosen": -0.0074091204442083836, "rewards/margins": -0.02668764814734459, "rewards/rejected": 0.019278524443507195, "step": 271 }, { "epoch": 0.04206456601585154, "grad_norm": 6.094438076019287, "learning_rate": 7.010309278350516e-07, "logits/chosen": 4.080531120300293, "logits/rejected": 4.95831298828125, "logps/chosen": -315.23846435546875, "logps/rejected": -244.72308349609375, "loss": 0.6854, "rewards/accuracies": 0.625, "rewards/chosen": 0.06399526447057724, "rewards/margins": 0.017319299280643463, "rewards/rejected": 0.04667596518993378, "step": 272 }, { "epoch": 0.0422192151556157, "grad_norm": 6.01910400390625, "learning_rate": 7.036082474226804e-07, "logits/chosen": 9.406524658203125, "logits/rejected": 11.486480712890625, "logps/chosen": -404.70562744140625, "logps/rejected": -401.83380126953125, "loss": 0.6725, "rewards/accuracies": 0.5, "rewards/chosen": 0.004156113136559725, "rewards/margins": 0.044912248849868774, "rewards/rejected": -0.04075613245368004, "step": 273 }, { "epoch": 0.04237386429537986, "grad_norm": 5.294518947601318, "learning_rate": 7.061855670103093e-07, "logits/chosen": 15.296045303344727, "logits/rejected": 8.236815452575684, "logps/chosen": -243.39500427246094, "logps/rejected": -190.75949096679688, "loss": 0.7023, "rewards/accuracies": 0.375, "rewards/chosen": -0.01391210500150919, "rewards/margins": -0.013545418158173561, "rewards/rejected": -0.0003666868433356285, "step": 274 }, { "epoch": 0.042528513435144015, "grad_norm": 4.91370964050293, "learning_rate": 7.087628865979382e-07, "logits/chosen": 11.261940956115723, "logits/rejected": 5.535235404968262, "logps/chosen": -381.68475341796875, "logps/rejected": -241.4693603515625, "loss": 0.6714, "rewards/accuracies": 0.75, "rewards/chosen": 0.06995878368616104, "rewards/margins": 0.04795096069574356, "rewards/rejected": 0.02200782299041748, "step": 275 }, { "epoch": 0.04268316257490818, "grad_norm": 8.425891876220703, "learning_rate": 7.11340206185567e-07, "logits/chosen": 8.027288436889648, "logits/rejected": 7.0562896728515625, "logps/chosen": -257.78424072265625, "logps/rejected": -268.9976806640625, "loss": 0.6929, "rewards/accuracies": 0.5, "rewards/chosen": -0.016646670177578926, "rewards/margins": 0.004961486905813217, "rewards/rejected": -0.02160816267132759, "step": 276 }, { "epoch": 0.04283781171467234, "grad_norm": 6.820143222808838, "learning_rate": 7.139175257731959e-07, "logits/chosen": 11.263406753540039, "logits/rejected": 3.7569408416748047, "logps/chosen": -443.0341491699219, "logps/rejected": -337.4405822753906, "loss": 0.7387, "rewards/accuracies": 0.25, "rewards/chosen": 0.01112661324441433, "rewards/margins": -0.08719973266124725, "rewards/rejected": 0.09832635521888733, "step": 277 }, { "epoch": 0.0429924608544365, "grad_norm": 4.918961524963379, "learning_rate": 7.164948453608248e-07, "logits/chosen": 12.5675630569458, "logits/rejected": 6.431389808654785, "logps/chosen": -301.914306640625, "logps/rejected": -271.76953125, "loss": 0.6813, "rewards/accuracies": 0.5, "rewards/chosen": -0.0032967086881399155, "rewards/margins": 0.027314042672514915, "rewards/rejected": -0.03061075322329998, "step": 278 }, { "epoch": 0.043147109994200655, "grad_norm": 6.123851299285889, "learning_rate": 7.190721649484537e-07, "logits/chosen": 2.2918896675109863, "logits/rejected": 6.765120983123779, "logps/chosen": -304.2936706542969, "logps/rejected": -402.9510192871094, "loss": 0.6498, "rewards/accuracies": 0.75, "rewards/chosen": 0.07635727524757385, "rewards/margins": 0.09249601513147354, "rewards/rejected": -0.016138743609189987, "step": 279 }, { "epoch": 0.04330175913396482, "grad_norm": 4.967680931091309, "learning_rate": 7.216494845360824e-07, "logits/chosen": 14.142841339111328, "logits/rejected": 6.342775821685791, "logps/chosen": -384.2119140625, "logps/rejected": -277.9646301269531, "loss": 0.6662, "rewards/accuracies": 0.5, "rewards/chosen": 0.04666900634765625, "rewards/margins": 0.06462021172046661, "rewards/rejected": -0.017951201647520065, "step": 280 }, { "epoch": 0.04345640827372898, "grad_norm": 6.099856853485107, "learning_rate": 7.242268041237115e-07, "logits/chosen": 12.001784324645996, "logits/rejected": 3.5784199237823486, "logps/chosen": -309.82696533203125, "logps/rejected": -272.38836669921875, "loss": 0.7136, "rewards/accuracies": 0.25, "rewards/chosen": -0.02949538454413414, "rewards/margins": -0.031954433768987656, "rewards/rejected": 0.0024590478278696537, "step": 281 }, { "epoch": 0.04361105741349314, "grad_norm": 6.103199005126953, "learning_rate": 7.268041237113403e-07, "logits/chosen": 13.201889038085938, "logits/rejected": 6.912469863891602, "logps/chosen": -382.53936767578125, "logps/rejected": -298.7481689453125, "loss": 0.6957, "rewards/accuracies": 0.375, "rewards/chosen": 0.008629227057099342, "rewards/margins": -0.00011920928955078125, "rewards/rejected": 0.008748434484004974, "step": 282 }, { "epoch": 0.043765706553257296, "grad_norm": 3.697726249694824, "learning_rate": 7.293814432989691e-07, "logits/chosen": 5.166966915130615, "logits/rejected": 4.032721519470215, "logps/chosen": -201.18789672851562, "logps/rejected": -185.63388061523438, "loss": 0.7003, "rewards/accuracies": 0.375, "rewards/chosen": 0.02250685729086399, "rewards/margins": -0.01213073544204235, "rewards/rejected": 0.03463759273290634, "step": 283 }, { "epoch": 0.04392035569302146, "grad_norm": 5.440157890319824, "learning_rate": 7.31958762886598e-07, "logits/chosen": 5.483170509338379, "logits/rejected": 0.9315651655197144, "logps/chosen": -324.3583679199219, "logps/rejected": -193.5889892578125, "loss": 0.7359, "rewards/accuracies": 0.25, "rewards/chosen": -0.03743145614862442, "rewards/margins": -0.07862356305122375, "rewards/rejected": 0.041192103177309036, "step": 284 }, { "epoch": 0.04407500483278562, "grad_norm": 4.959395408630371, "learning_rate": 7.345360824742269e-07, "logits/chosen": 7.315350532531738, "logits/rejected": 5.967403888702393, "logps/chosen": -235.76220703125, "logps/rejected": -225.611328125, "loss": 0.7132, "rewards/accuracies": 0.375, "rewards/chosen": 0.041570376604795456, "rewards/margins": -0.029239557683467865, "rewards/rejected": 0.07080993801355362, "step": 285 }, { "epoch": 0.04422965397254978, "grad_norm": 6.410409927368164, "learning_rate": 7.371134020618558e-07, "logits/chosen": 14.67337417602539, "logits/rejected": 14.789937019348145, "logps/chosen": -299.0250244140625, "logps/rejected": -256.5709228515625, "loss": 0.731, "rewards/accuracies": 0.25, "rewards/chosen": -0.009676171466708183, "rewards/margins": -0.07034854590892792, "rewards/rejected": 0.06067238003015518, "step": 286 }, { "epoch": 0.044384303112313936, "grad_norm": 3.8615458011627197, "learning_rate": 7.396907216494846e-07, "logits/chosen": 12.82430648803711, "logits/rejected": 6.008855819702148, "logps/chosen": -278.73486328125, "logps/rejected": -257.6081848144531, "loss": 0.6434, "rewards/accuracies": 0.875, "rewards/chosen": 0.0827404037117958, "rewards/margins": 0.11119689792394638, "rewards/rejected": -0.028456497937440872, "step": 287 }, { "epoch": 0.0445389522520781, "grad_norm": 6.949217319488525, "learning_rate": 7.422680412371135e-07, "logits/chosen": 11.780693054199219, "logits/rejected": 9.067593574523926, "logps/chosen": -508.15875244140625, "logps/rejected": -371.2984619140625, "loss": 0.7018, "rewards/accuracies": 0.5, "rewards/chosen": 0.006142043508589268, "rewards/margins": -0.010315701365470886, "rewards/rejected": 0.016457747668027878, "step": 288 }, { "epoch": 0.04469360139184226, "grad_norm": 5.891060829162598, "learning_rate": 7.448453608247423e-07, "logits/chosen": 6.376235008239746, "logits/rejected": 9.458555221557617, "logps/chosen": -388.1113586425781, "logps/rejected": -407.88543701171875, "loss": 0.6802, "rewards/accuracies": 0.5, "rewards/chosen": 0.011972811073064804, "rewards/margins": 0.03756694495677948, "rewards/rejected": -0.025594139471650124, "step": 289 }, { "epoch": 0.04484825053160642, "grad_norm": 7.497729778289795, "learning_rate": 7.474226804123711e-07, "logits/chosen": 8.1383638381958, "logits/rejected": 6.186180114746094, "logps/chosen": -200.9911651611328, "logps/rejected": -196.11199951171875, "loss": 0.7483, "rewards/accuracies": 0.25, "rewards/chosen": -0.03496842086315155, "rewards/margins": -0.09396882355213165, "rewards/rejected": 0.059000395238399506, "step": 290 }, { "epoch": 0.045002899671370576, "grad_norm": 4.562068462371826, "learning_rate": 7.5e-07, "logits/chosen": 10.212431907653809, "logits/rejected": 4.131237030029297, "logps/chosen": -192.6217041015625, "logps/rejected": -162.54302978515625, "loss": 0.6536, "rewards/accuracies": 0.875, "rewards/chosen": 0.04490475356578827, "rewards/margins": 0.08690089732408524, "rewards/rejected": -0.04199614375829697, "step": 291 }, { "epoch": 0.04515754881113474, "grad_norm": 4.859351634979248, "learning_rate": 7.525773195876289e-07, "logits/chosen": 11.998026847839355, "logits/rejected": 6.427470684051514, "logps/chosen": -307.7152099609375, "logps/rejected": -237.17135620117188, "loss": 0.6581, "rewards/accuracies": 0.5, "rewards/chosen": 0.09181271493434906, "rewards/margins": 0.08123352378606796, "rewards/rejected": 0.0105791836977005, "step": 292 }, { "epoch": 0.0453121979508989, "grad_norm": 5.752589225769043, "learning_rate": 7.551546391752578e-07, "logits/chosen": 8.069319725036621, "logits/rejected": 9.863545417785645, "logps/chosen": -327.6481018066406, "logps/rejected": -369.81683349609375, "loss": 0.7216, "rewards/accuracies": 0.25, "rewards/chosen": -0.033083729445934296, "rewards/margins": -0.0516609251499176, "rewards/rejected": 0.018577193841338158, "step": 293 }, { "epoch": 0.04546684709066306, "grad_norm": 5.523726463317871, "learning_rate": 7.577319587628866e-07, "logits/chosen": 15.03489875793457, "logits/rejected": 10.836637496948242, "logps/chosen": -348.3163146972656, "logps/rejected": -263.0343017578125, "loss": 0.6694, "rewards/accuracies": 0.75, "rewards/chosen": 0.03772168233990669, "rewards/margins": 0.05891909450292587, "rewards/rejected": -0.02119741216301918, "step": 294 }, { "epoch": 0.045621496230427216, "grad_norm": 4.027552604675293, "learning_rate": 7.603092783505155e-07, "logits/chosen": 9.441871643066406, "logits/rejected": 9.530956268310547, "logps/chosen": -234.0558319091797, "logps/rejected": -234.98226928710938, "loss": 0.6858, "rewards/accuracies": 0.375, "rewards/chosen": -0.02280271053314209, "rewards/margins": 0.02082827314734459, "rewards/rejected": -0.04363097995519638, "step": 295 }, { "epoch": 0.04577614537019138, "grad_norm": 6.074437141418457, "learning_rate": 7.628865979381445e-07, "logits/chosen": 12.259597778320312, "logits/rejected": 5.049566268920898, "logps/chosen": -371.72308349609375, "logps/rejected": -179.959716796875, "loss": 0.7025, "rewards/accuracies": 0.625, "rewards/chosen": 0.012221671640872955, "rewards/margins": -0.013094663619995117, "rewards/rejected": 0.025316333398222923, "step": 296 }, { "epoch": 0.04593079450995554, "grad_norm": 5.406341552734375, "learning_rate": 7.654639175257734e-07, "logits/chosen": 9.372140884399414, "logits/rejected": 10.361873626708984, "logps/chosen": -328.73101806640625, "logps/rejected": -280.20892333984375, "loss": 0.6424, "rewards/accuracies": 0.875, "rewards/chosen": 0.045784760266542435, "rewards/margins": 0.10740404576063156, "rewards/rejected": -0.06161927804350853, "step": 297 }, { "epoch": 0.0460854436497197, "grad_norm": 4.657041549682617, "learning_rate": 7.680412371134021e-07, "logits/chosen": 6.708735466003418, "logits/rejected": 8.383312225341797, "logps/chosen": -229.92178344726562, "logps/rejected": -261.3010559082031, "loss": 0.6886, "rewards/accuracies": 0.5, "rewards/chosen": 0.012757347896695137, "rewards/margins": 0.011767909862101078, "rewards/rejected": 0.0009894361719489098, "step": 298 }, { "epoch": 0.04624009278948386, "grad_norm": 5.440919399261475, "learning_rate": 7.70618556701031e-07, "logits/chosen": 10.24075984954834, "logits/rejected": 11.400381088256836, "logps/chosen": -268.0219421386719, "logps/rejected": -261.69720458984375, "loss": 0.6787, "rewards/accuracies": 0.625, "rewards/chosen": 0.04366712644696236, "rewards/margins": 0.03244920074939728, "rewards/rejected": 0.011217927560210228, "step": 299 }, { "epoch": 0.046394741929248015, "grad_norm": 10.694855690002441, "learning_rate": 7.731958762886599e-07, "logits/chosen": 9.533609390258789, "logits/rejected": 4.375674724578857, "logps/chosen": -258.9429931640625, "logps/rejected": -183.02110290527344, "loss": 0.6859, "rewards/accuracies": 0.625, "rewards/chosen": 0.025099849328398705, "rewards/margins": 0.01829042285680771, "rewards/rejected": 0.006809425540268421, "step": 300 }, { "epoch": 0.04654939106901218, "grad_norm": 4.438869476318359, "learning_rate": 7.757731958762887e-07, "logits/chosen": 9.963016510009766, "logits/rejected": 11.539241790771484, "logps/chosen": -311.2004089355469, "logps/rejected": -357.8814697265625, "loss": 0.6657, "rewards/accuracies": 0.625, "rewards/chosen": 0.025063039734959602, "rewards/margins": 0.06273975223302841, "rewards/rejected": -0.03767671808600426, "step": 301 }, { "epoch": 0.04670404020877634, "grad_norm": 6.975356101989746, "learning_rate": 7.783505154639176e-07, "logits/chosen": 9.927818298339844, "logits/rejected": 3.5872130393981934, "logps/chosen": -310.9517517089844, "logps/rejected": -217.1642303466797, "loss": 0.6634, "rewards/accuracies": 0.625, "rewards/chosen": 0.05728526785969734, "rewards/margins": 0.06350240856409073, "rewards/rejected": -0.006217147223651409, "step": 302 }, { "epoch": 0.0468586893485405, "grad_norm": 5.248288154602051, "learning_rate": 7.809278350515465e-07, "logits/chosen": 12.73100471496582, "logits/rejected": 12.335542678833008, "logps/chosen": -340.8487548828125, "logps/rejected": -341.8193359375, "loss": 0.6971, "rewards/accuracies": 0.375, "rewards/chosen": -0.03075290098786354, "rewards/margins": -0.006798101589083672, "rewards/rejected": -0.02395479753613472, "step": 303 }, { "epoch": 0.047013338488304655, "grad_norm": 3.1768858432769775, "learning_rate": 7.835051546391754e-07, "logits/chosen": 10.36426067352295, "logits/rejected": 8.524162292480469, "logps/chosen": -169.6994171142578, "logps/rejected": -157.38575744628906, "loss": 0.6774, "rewards/accuracies": 0.5, "rewards/chosen": 0.04603080824017525, "rewards/margins": 0.03396458923816681, "rewards/rejected": 0.012066220864653587, "step": 304 }, { "epoch": 0.04716798762806882, "grad_norm": 5.595256328582764, "learning_rate": 7.860824742268041e-07, "logits/chosen": 9.441547393798828, "logits/rejected": 5.497175216674805, "logps/chosen": -383.0271911621094, "logps/rejected": -286.5113525390625, "loss": 0.6937, "rewards/accuracies": 0.5, "rewards/chosen": 0.04546399414539337, "rewards/margins": 0.000604487955570221, "rewards/rejected": 0.04485950618982315, "step": 305 }, { "epoch": 0.04732263676783298, "grad_norm": 5.533136367797852, "learning_rate": 7.88659793814433e-07, "logits/chosen": 10.888647079467773, "logits/rejected": 6.140551567077637, "logps/chosen": -298.96881103515625, "logps/rejected": -317.203125, "loss": 0.686, "rewards/accuracies": 0.625, "rewards/chosen": 0.02645902708172798, "rewards/margins": 0.015329839661717415, "rewards/rejected": 0.011129188351333141, "step": 306 }, { "epoch": 0.04747728590759714, "grad_norm": 4.928723335266113, "learning_rate": 7.912371134020619e-07, "logits/chosen": 9.593779563903809, "logits/rejected": 11.967325210571289, "logps/chosen": -312.11602783203125, "logps/rejected": -284.61907958984375, "loss": 0.7073, "rewards/accuracies": 0.375, "rewards/chosen": -0.04487323760986328, "rewards/margins": -0.02668905258178711, "rewards/rejected": -0.018184185028076172, "step": 307 }, { "epoch": 0.047631935047361296, "grad_norm": 6.147202491760254, "learning_rate": 7.938144329896907e-07, "logits/chosen": 9.028889656066895, "logits/rejected": 11.248432159423828, "logps/chosen": -355.0299987792969, "logps/rejected": -317.34173583984375, "loss": 0.6661, "rewards/accuracies": 0.625, "rewards/chosen": 0.024903394281864166, "rewards/margins": 0.05873899534344673, "rewards/rejected": -0.033835604786872864, "step": 308 }, { "epoch": 0.04778658418712546, "grad_norm": 5.863432884216309, "learning_rate": 7.963917525773196e-07, "logits/chosen": 16.674617767333984, "logits/rejected": 12.86128044128418, "logps/chosen": -284.236572265625, "logps/rejected": -285.61669921875, "loss": 0.7351, "rewards/accuracies": 0.25, "rewards/chosen": -0.06699486076831818, "rewards/margins": -0.07974996417760849, "rewards/rejected": 0.012755108065903187, "step": 309 }, { "epoch": 0.04794123332688962, "grad_norm": 4.738588333129883, "learning_rate": 7.989690721649485e-07, "logits/chosen": 14.011253356933594, "logits/rejected": 7.4572930335998535, "logps/chosen": -300.88555908203125, "logps/rejected": -234.4568328857422, "loss": 0.6884, "rewards/accuracies": 0.625, "rewards/chosen": 0.015485147945582867, "rewards/margins": 0.014288333244621754, "rewards/rejected": 0.001196814700961113, "step": 310 }, { "epoch": 0.04809588246665378, "grad_norm": 4.169150352478027, "learning_rate": 8.015463917525775e-07, "logits/chosen": 5.935704231262207, "logits/rejected": 4.86208963394165, "logps/chosen": -197.8755340576172, "logps/rejected": -205.51852416992188, "loss": 0.6872, "rewards/accuracies": 0.5, "rewards/chosen": 0.031170319765806198, "rewards/margins": 0.012457729317247868, "rewards/rejected": 0.018712591379880905, "step": 311 }, { "epoch": 0.048250531606417936, "grad_norm": 6.766357421875, "learning_rate": 8.041237113402063e-07, "logits/chosen": 12.25485610961914, "logits/rejected": 5.660464286804199, "logps/chosen": -340.59417724609375, "logps/rejected": -208.60427856445312, "loss": 0.7113, "rewards/accuracies": 0.5, "rewards/chosen": -0.017274286597967148, "rewards/margins": -0.03250467777252197, "rewards/rejected": 0.015230393968522549, "step": 312 }, { "epoch": 0.0484051807461821, "grad_norm": 4.594578266143799, "learning_rate": 8.067010309278352e-07, "logits/chosen": 16.12421417236328, "logits/rejected": 8.62697696685791, "logps/chosen": -265.9131774902344, "logps/rejected": -187.4433135986328, "loss": 0.6859, "rewards/accuracies": 0.625, "rewards/chosen": 0.014047149568796158, "rewards/margins": 0.015713075175881386, "rewards/rejected": -0.001665925607085228, "step": 313 }, { "epoch": 0.04855982988594626, "grad_norm": 3.4818480014801025, "learning_rate": 8.092783505154641e-07, "logits/chosen": 13.8653564453125, "logits/rejected": 6.9018049240112305, "logps/chosen": -192.21493530273438, "logps/rejected": -151.43930053710938, "loss": 0.6701, "rewards/accuracies": 0.5, "rewards/chosen": 0.02290806919336319, "rewards/margins": 0.050026800483465195, "rewards/rejected": -0.027118727564811707, "step": 314 }, { "epoch": 0.04871447902571042, "grad_norm": 7.035672187805176, "learning_rate": 8.118556701030928e-07, "logits/chosen": 3.897305488586426, "logits/rejected": 3.538041353225708, "logps/chosen": -207.46163940429688, "logps/rejected": -209.98765563964844, "loss": 0.691, "rewards/accuracies": 0.625, "rewards/chosen": -0.024081282317638397, "rewards/margins": 0.009569570422172546, "rewards/rejected": -0.033650852739810944, "step": 315 }, { "epoch": 0.048869128165474576, "grad_norm": 7.0732340812683105, "learning_rate": 8.144329896907217e-07, "logits/chosen": 9.977954864501953, "logits/rejected": 10.643462181091309, "logps/chosen": -367.1545715332031, "logps/rejected": -323.3158264160156, "loss": 0.7364, "rewards/accuracies": 0.375, "rewards/chosen": -0.059644319117069244, "rewards/margins": -0.07982683181762695, "rewards/rejected": 0.02018251270055771, "step": 316 }, { "epoch": 0.04902377730523874, "grad_norm": 8.79511833190918, "learning_rate": 8.170103092783506e-07, "logits/chosen": 3.738468647003174, "logits/rejected": 11.525028228759766, "logps/chosen": -165.37423706054688, "logps/rejected": -243.92523193359375, "loss": 0.7152, "rewards/accuracies": 0.5, "rewards/chosen": 0.0034049982205033302, "rewards/margins": -0.03902578353881836, "rewards/rejected": 0.042430780827999115, "step": 317 }, { "epoch": 0.0491784264450029, "grad_norm": 4.1245951652526855, "learning_rate": 8.195876288659795e-07, "logits/chosen": 3.153686285018921, "logits/rejected": 5.225451469421387, "logps/chosen": -225.98306274414062, "logps/rejected": -253.50357055664062, "loss": 0.7139, "rewards/accuracies": 0.25, "rewards/chosen": -0.02681265026330948, "rewards/margins": -0.039389848709106445, "rewards/rejected": 0.012577196583151817, "step": 318 }, { "epoch": 0.04933307558476706, "grad_norm": 5.788033962249756, "learning_rate": 8.221649484536083e-07, "logits/chosen": 14.813587188720703, "logits/rejected": 14.116519927978516, "logps/chosen": -235.96603393554688, "logps/rejected": -232.52171325683594, "loss": 0.6833, "rewards/accuracies": 0.5, "rewards/chosen": -0.00020151096396148205, "rewards/margins": 0.022872546687722206, "rewards/rejected": -0.023074055090546608, "step": 319 }, { "epoch": 0.04948772472453122, "grad_norm": 5.623997211456299, "learning_rate": 8.247422680412372e-07, "logits/chosen": 6.207259178161621, "logits/rejected": -4.1912994384765625, "logps/chosen": -315.69964599609375, "logps/rejected": -167.5287628173828, "loss": 0.689, "rewards/accuracies": 0.75, "rewards/chosen": 0.04370555654168129, "rewards/margins": 0.009932899847626686, "rewards/rejected": 0.03377266228199005, "step": 320 }, { "epoch": 0.04964237386429538, "grad_norm": 3.6895346641540527, "learning_rate": 8.273195876288661e-07, "logits/chosen": 11.903623580932617, "logits/rejected": 8.772470474243164, "logps/chosen": -230.1386260986328, "logps/rejected": -203.36868286132812, "loss": 0.7055, "rewards/accuracies": 0.5, "rewards/chosen": -0.007718563079833984, "rewards/margins": -0.02218818850815296, "rewards/rejected": 0.014469623565673828, "step": 321 }, { "epoch": 0.04979702300405954, "grad_norm": 4.731302261352539, "learning_rate": 8.298969072164948e-07, "logits/chosen": 13.479562759399414, "logits/rejected": 6.947115898132324, "logps/chosen": -373.03350830078125, "logps/rejected": -224.48995971679688, "loss": 0.6551, "rewards/accuracies": 0.75, "rewards/chosen": 0.05569906532764435, "rewards/margins": 0.08135590702295303, "rewards/rejected": -0.025656841695308685, "step": 322 }, { "epoch": 0.0499516721438237, "grad_norm": 4.305988788604736, "learning_rate": 8.324742268041237e-07, "logits/chosen": 9.580376625061035, "logits/rejected": 0.8384977579116821, "logps/chosen": -253.3452606201172, "logps/rejected": -160.5526123046875, "loss": 0.691, "rewards/accuracies": 0.25, "rewards/chosen": -0.0063598137348890305, "rewards/margins": 0.01699228212237358, "rewards/rejected": -0.02335209771990776, "step": 323 }, { "epoch": 0.05010632128358786, "grad_norm": 4.122289657592773, "learning_rate": 8.350515463917526e-07, "logits/chosen": 14.331572532653809, "logits/rejected": 11.76861572265625, "logps/chosen": -265.4837341308594, "logps/rejected": -267.43017578125, "loss": 0.6954, "rewards/accuracies": 0.625, "rewards/chosen": 0.0019929646514356136, "rewards/margins": -0.002494524698704481, "rewards/rejected": 0.004487491212785244, "step": 324 }, { "epoch": 0.05026097042335202, "grad_norm": 6.475592613220215, "learning_rate": 8.376288659793815e-07, "logits/chosen": 10.418147087097168, "logits/rejected": 2.835232734680176, "logps/chosen": -562.50048828125, "logps/rejected": -400.02081298828125, "loss": 0.6729, "rewards/accuracies": 0.875, "rewards/chosen": 0.04315614700317383, "rewards/margins": 0.04643592983484268, "rewards/rejected": -0.0032797809690237045, "step": 325 }, { "epoch": 0.05041561956311618, "grad_norm": 4.158835411071777, "learning_rate": 8.402061855670104e-07, "logits/chosen": 17.390661239624023, "logits/rejected": 7.49455451965332, "logps/chosen": -279.53399658203125, "logps/rejected": -149.30152893066406, "loss": 0.6685, "rewards/accuracies": 0.375, "rewards/chosen": -0.025096513330936432, "rewards/margins": 0.057434868067502975, "rewards/rejected": -0.08253137767314911, "step": 326 }, { "epoch": 0.05057026870288034, "grad_norm": 9.585373878479004, "learning_rate": 8.427835051546393e-07, "logits/chosen": 8.694064140319824, "logits/rejected": 3.928234100341797, "logps/chosen": -238.50270080566406, "logps/rejected": -144.52503967285156, "loss": 0.6718, "rewards/accuracies": 0.5, "rewards/chosen": 0.012478066608309746, "rewards/margins": 0.04846363142132759, "rewards/rejected": -0.035985566675662994, "step": 327 }, { "epoch": 0.0507249178426445, "grad_norm": 4.9702301025390625, "learning_rate": 8.453608247422682e-07, "logits/chosen": 11.128905296325684, "logits/rejected": 13.74989128112793, "logps/chosen": -309.1045227050781, "logps/rejected": -372.2152404785156, "loss": 0.7297, "rewards/accuracies": 0.25, "rewards/chosen": -0.0483829528093338, "rewards/margins": -0.06745920330286026, "rewards/rejected": 0.019076254218816757, "step": 328 }, { "epoch": 0.05087956698240866, "grad_norm": 9.810717582702637, "learning_rate": 8.47938144329897e-07, "logits/chosen": 6.248475551605225, "logits/rejected": 4.073984146118164, "logps/chosen": -238.91769409179688, "logps/rejected": -212.77735900878906, "loss": 0.6707, "rewards/accuracies": 0.625, "rewards/chosen": -0.009897114709019661, "rewards/margins": 0.053171947598457336, "rewards/rejected": -0.06306906044483185, "step": 329 }, { "epoch": 0.05103421612217282, "grad_norm": 5.923515319824219, "learning_rate": 8.505154639175259e-07, "logits/chosen": 8.707401275634766, "logits/rejected": 12.377830505371094, "logps/chosen": -207.44644165039062, "logps/rejected": -218.87484741210938, "loss": 0.6938, "rewards/accuracies": 0.5, "rewards/chosen": -0.006574677303433418, "rewards/margins": 0.0008665574714541435, "rewards/rejected": -0.007441233843564987, "step": 330 }, { "epoch": 0.05118886526193698, "grad_norm": 5.5287981033325195, "learning_rate": 8.530927835051547e-07, "logits/chosen": 11.679425239562988, "logits/rejected": 7.240042209625244, "logps/chosen": -285.194580078125, "logps/rejected": -235.5878143310547, "loss": 0.6694, "rewards/accuracies": 0.625, "rewards/chosen": 0.0436800979077816, "rewards/margins": 0.053113702684640884, "rewards/rejected": -0.009433603845536709, "step": 331 }, { "epoch": 0.05134351440170114, "grad_norm": 6.355861186981201, "learning_rate": 8.556701030927836e-07, "logits/chosen": 11.296228408813477, "logits/rejected": 9.15643310546875, "logps/chosen": -325.03143310546875, "logps/rejected": -298.59246826171875, "loss": 0.6962, "rewards/accuracies": 0.75, "rewards/chosen": -0.004544067662209272, "rewards/margins": -0.002562951296567917, "rewards/rejected": -0.001981116831302643, "step": 332 }, { "epoch": 0.0514981635414653, "grad_norm": 5.970607757568359, "learning_rate": 8.582474226804124e-07, "logits/chosen": 10.666913986206055, "logits/rejected": 12.998638153076172, "logps/chosen": -465.88665771484375, "logps/rejected": -456.6786804199219, "loss": 0.7013, "rewards/accuracies": 0.5, "rewards/chosen": 0.0484885647892952, "rewards/margins": -0.012132979929447174, "rewards/rejected": 0.06062154844403267, "step": 333 }, { "epoch": 0.05165281268122946, "grad_norm": 5.521275043487549, "learning_rate": 8.608247422680413e-07, "logits/chosen": 14.129186630249023, "logits/rejected": 2.064903974533081, "logps/chosen": -357.00714111328125, "logps/rejected": -193.00653076171875, "loss": 0.6755, "rewards/accuracies": 0.75, "rewards/chosen": 0.027088262140750885, "rewards/margins": 0.03876437991857529, "rewards/rejected": -0.0116761215031147, "step": 334 }, { "epoch": 0.05180746182099362, "grad_norm": 4.094925880432129, "learning_rate": 8.634020618556702e-07, "logits/chosen": 11.846107482910156, "logits/rejected": 7.880058288574219, "logps/chosen": -278.2975158691406, "logps/rejected": -205.12425231933594, "loss": 0.6777, "rewards/accuracies": 0.75, "rewards/chosen": 0.01932988129556179, "rewards/margins": 0.03342390060424805, "rewards/rejected": -0.014094019308686256, "step": 335 }, { "epoch": 0.05196211096075778, "grad_norm": 6.341585636138916, "learning_rate": 8.65979381443299e-07, "logits/chosen": 17.249914169311523, "logits/rejected": 7.681830406188965, "logps/chosen": -381.96405029296875, "logps/rejected": -283.91192626953125, "loss": 0.7154, "rewards/accuracies": 0.375, "rewards/chosen": -0.11012067645788193, "rewards/margins": -0.034940049052238464, "rewards/rejected": -0.07518062740564346, "step": 336 }, { "epoch": 0.05211676010052194, "grad_norm": 4.551817893981934, "learning_rate": 8.685567010309279e-07, "logits/chosen": 6.682127952575684, "logits/rejected": 7.055165767669678, "logps/chosen": -267.300048828125, "logps/rejected": -249.1988067626953, "loss": 0.7096, "rewards/accuracies": 0.5, "rewards/chosen": 0.008320286870002747, "rewards/margins": -0.023122448474168777, "rewards/rejected": 0.03144273906946182, "step": 337 }, { "epoch": 0.0522714092402861, "grad_norm": 6.360701560974121, "learning_rate": 8.711340206185567e-07, "logits/chosen": 10.616209030151367, "logits/rejected": 11.711845397949219, "logps/chosen": -366.0268859863281, "logps/rejected": -393.9095764160156, "loss": 0.7314, "rewards/accuracies": 0.375, "rewards/chosen": -0.015015266835689545, "rewards/margins": -0.06660046428442001, "rewards/rejected": 0.05158519744873047, "step": 338 }, { "epoch": 0.05242605838005026, "grad_norm": 5.620662212371826, "learning_rate": 8.737113402061856e-07, "logits/chosen": 15.40963363647461, "logits/rejected": 14.104247093200684, "logps/chosen": -314.3692626953125, "logps/rejected": -269.7205810546875, "loss": 0.6943, "rewards/accuracies": 0.375, "rewards/chosen": 0.09499354660511017, "rewards/margins": -0.0008111950010061264, "rewards/rejected": 0.09580473601818085, "step": 339 }, { "epoch": 0.05258070751981442, "grad_norm": 5.203341007232666, "learning_rate": 8.762886597938144e-07, "logits/chosen": 4.358460426330566, "logits/rejected": 1.727196455001831, "logps/chosen": -201.946044921875, "logps/rejected": -136.56553649902344, "loss": 0.7446, "rewards/accuracies": 0.25, "rewards/chosen": -0.05658016353845596, "rewards/margins": -0.0963442325592041, "rewards/rejected": 0.03976407274603844, "step": 340 }, { "epoch": 0.05273535665957858, "grad_norm": 5.324661731719971, "learning_rate": 8.788659793814433e-07, "logits/chosen": 8.280628204345703, "logits/rejected": 8.178197860717773, "logps/chosen": -370.210205078125, "logps/rejected": -334.6656799316406, "loss": 0.6759, "rewards/accuracies": 0.625, "rewards/chosen": 0.05555377155542374, "rewards/margins": 0.03767814487218857, "rewards/rejected": 0.01787562295794487, "step": 341 }, { "epoch": 0.05289000579934274, "grad_norm": 7.864620685577393, "learning_rate": 8.814432989690723e-07, "logits/chosen": 6.158729076385498, "logits/rejected": 4.1754961013793945, "logps/chosen": -478.3458557128906, "logps/rejected": -256.2119140625, "loss": 0.7192, "rewards/accuracies": 0.25, "rewards/chosen": 0.014479974284768105, "rewards/margins": -0.044060613960027695, "rewards/rejected": 0.05854058265686035, "step": 342 }, { "epoch": 0.0530446549391069, "grad_norm": 4.414979457855225, "learning_rate": 8.840206185567011e-07, "logits/chosen": 10.996040344238281, "logits/rejected": 1.509592890739441, "logps/chosen": -261.48602294921875, "logps/rejected": -142.6768341064453, "loss": 0.6914, "rewards/accuracies": 0.375, "rewards/chosen": 0.019737720489501953, "rewards/margins": 0.0056035518646240234, "rewards/rejected": 0.01413416862487793, "step": 343 }, { "epoch": 0.05319930407887106, "grad_norm": 4.6226091384887695, "learning_rate": 8.8659793814433e-07, "logits/chosen": 10.649694442749023, "logits/rejected": 9.918621063232422, "logps/chosen": -309.9231262207031, "logps/rejected": -225.6995086669922, "loss": 0.6927, "rewards/accuracies": 0.5, "rewards/chosen": 0.006951688788831234, "rewards/margins": 0.0020817983895540237, "rewards/rejected": 0.004869889467954636, "step": 344 }, { "epoch": 0.053353953218635224, "grad_norm": 4.1718878746032715, "learning_rate": 8.891752577319589e-07, "logits/chosen": 7.7937397956848145, "logits/rejected": 8.358108520507812, "logps/chosen": -270.802734375, "logps/rejected": -261.93414306640625, "loss": 0.6993, "rewards/accuracies": 0.375, "rewards/chosen": 0.005881977267563343, "rewards/margins": -0.009798050858080387, "rewards/rejected": 0.01568002626299858, "step": 345 }, { "epoch": 0.05350860235839938, "grad_norm": 5.7086501121521, "learning_rate": 8.917525773195878e-07, "logits/chosen": 12.025075912475586, "logits/rejected": 0.27682721614837646, "logps/chosen": -311.3431396484375, "logps/rejected": -198.30581665039062, "loss": 0.7112, "rewards/accuracies": 0.5, "rewards/chosen": -0.05789661407470703, "rewards/margins": -0.03056936338543892, "rewards/rejected": -0.027327250689268112, "step": 346 }, { "epoch": 0.05366325149816354, "grad_norm": 3.6815907955169678, "learning_rate": 8.943298969072166e-07, "logits/chosen": 6.982710838317871, "logits/rejected": 6.452592849731445, "logps/chosen": -211.49989318847656, "logps/rejected": -182.47181701660156, "loss": 0.6772, "rewards/accuracies": 0.375, "rewards/chosen": 0.060828495770692825, "rewards/margins": 0.035726070404052734, "rewards/rejected": 0.02510242722928524, "step": 347 }, { "epoch": 0.0538179006379277, "grad_norm": 4.95511531829834, "learning_rate": 8.969072164948454e-07, "logits/chosen": 14.31574821472168, "logits/rejected": 16.112995147705078, "logps/chosen": -292.53240966796875, "logps/rejected": -384.8315124511719, "loss": 0.7026, "rewards/accuracies": 0.625, "rewards/chosen": -0.043743617832660675, "rewards/margins": -0.008153248578310013, "rewards/rejected": -0.035590361803770065, "step": 348 }, { "epoch": 0.053972549777691864, "grad_norm": 6.050301551818848, "learning_rate": 8.994845360824743e-07, "logits/chosen": 10.782508850097656, "logits/rejected": 10.924026489257812, "logps/chosen": -328.68798828125, "logps/rejected": -301.2613525390625, "loss": 0.6922, "rewards/accuracies": 0.5, "rewards/chosen": 0.008078956976532936, "rewards/margins": 0.005491733551025391, "rewards/rejected": 0.0025872234255075455, "step": 349 }, { "epoch": 0.05412719891745602, "grad_norm": 4.550487041473389, "learning_rate": 9.020618556701031e-07, "logits/chosen": 7.750008583068848, "logits/rejected": 6.9923810958862305, "logps/chosen": -279.5172424316406, "logps/rejected": -223.89041137695312, "loss": 0.6841, "rewards/accuracies": 0.625, "rewards/chosen": 0.0246523879468441, "rewards/margins": 0.018850183114409447, "rewards/rejected": 0.00580220390111208, "step": 350 }, { "epoch": 0.05428184805722018, "grad_norm": 11.858559608459473, "learning_rate": 9.04639175257732e-07, "logits/chosen": 8.763167381286621, "logits/rejected": 11.137678146362305, "logps/chosen": -227.9757080078125, "logps/rejected": -290.70648193359375, "loss": 0.678, "rewards/accuracies": 0.5, "rewards/chosen": 0.05212879180908203, "rewards/margins": 0.039518166333436966, "rewards/rejected": 0.01261062454432249, "step": 351 }, { "epoch": 0.05443649719698434, "grad_norm": 5.670365810394287, "learning_rate": 9.072164948453609e-07, "logits/chosen": 13.587382316589355, "logits/rejected": 10.373586654663086, "logps/chosen": -338.270263671875, "logps/rejected": -274.0187072753906, "loss": 0.6978, "rewards/accuracies": 0.625, "rewards/chosen": 0.028385069221258163, "rewards/margins": -0.007985593751072884, "rewards/rejected": 0.0363706573843956, "step": 352 }, { "epoch": 0.054591146336748504, "grad_norm": 5.200290203094482, "learning_rate": 9.097938144329898e-07, "logits/chosen": 5.452160835266113, "logits/rejected": 4.716856956481934, "logps/chosen": -306.956787109375, "logps/rejected": -302.99609375, "loss": 0.6808, "rewards/accuracies": 0.5, "rewards/chosen": 0.02313823625445366, "rewards/margins": 0.031882576644420624, "rewards/rejected": -0.008744340389966965, "step": 353 }, { "epoch": 0.05474579547651266, "grad_norm": 4.956686496734619, "learning_rate": 9.123711340206186e-07, "logits/chosen": 15.669855117797852, "logits/rejected": 5.030588626861572, "logps/chosen": -255.5647735595703, "logps/rejected": -134.20985412597656, "loss": 0.6934, "rewards/accuracies": 0.5, "rewards/chosen": -0.0011940496042370796, "rewards/margins": 0.000958440825343132, "rewards/rejected": -0.0021524904295802116, "step": 354 }, { "epoch": 0.05490044461627682, "grad_norm": 5.248405933380127, "learning_rate": 9.149484536082474e-07, "logits/chosen": 12.432308197021484, "logits/rejected": 10.941661834716797, "logps/chosen": -366.5779113769531, "logps/rejected": -343.7878112792969, "loss": 0.7115, "rewards/accuracies": 0.25, "rewards/chosen": 0.0014962200075387955, "rewards/margins": -0.022401336580514908, "rewards/rejected": 0.023897551000118256, "step": 355 }, { "epoch": 0.05505509375604098, "grad_norm": 4.469074726104736, "learning_rate": 9.175257731958763e-07, "logits/chosen": 6.67618989944458, "logits/rejected": 10.412915229797363, "logps/chosen": -209.1573028564453, "logps/rejected": -242.23846435546875, "loss": 0.6896, "rewards/accuracies": 0.5, "rewards/chosen": -0.002662944607436657, "rewards/margins": 0.011094575747847557, "rewards/rejected": -0.013757515698671341, "step": 356 }, { "epoch": 0.055209742895805144, "grad_norm": 4.443343162536621, "learning_rate": 9.201030927835052e-07, "logits/chosen": 12.131492614746094, "logits/rejected": 9.86172866821289, "logps/chosen": -294.4602355957031, "logps/rejected": -263.77783203125, "loss": 0.6849, "rewards/accuracies": 0.5, "rewards/chosen": -0.010724161751568317, "rewards/margins": 0.018154192715883255, "rewards/rejected": -0.028878355398774147, "step": 357 }, { "epoch": 0.0553643920355693, "grad_norm": 8.816765785217285, "learning_rate": 9.226804123711341e-07, "logits/chosen": 12.643805503845215, "logits/rejected": 2.08780574798584, "logps/chosen": -519.277587890625, "logps/rejected": -241.80743408203125, "loss": 0.7157, "rewards/accuracies": 0.5, "rewards/chosen": -0.04626712203025818, "rewards/margins": -0.03952989727258682, "rewards/rejected": -0.0067372312769293785, "step": 358 }, { "epoch": 0.05551904117533346, "grad_norm": 5.776669979095459, "learning_rate": 9.25257731958763e-07, "logits/chosen": 12.58725357055664, "logits/rejected": 8.617329597473145, "logps/chosen": -403.50390625, "logps/rejected": -354.3662109375, "loss": 0.6816, "rewards/accuracies": 0.625, "rewards/chosen": -0.003914071246981621, "rewards/margins": 0.027027608826756477, "rewards/rejected": -0.03094167448580265, "step": 359 }, { "epoch": 0.05567369031509762, "grad_norm": 6.46565055847168, "learning_rate": 9.278350515463919e-07, "logits/chosen": 9.653300285339355, "logits/rejected": 9.97469711303711, "logps/chosen": -298.61175537109375, "logps/rejected": -283.46844482421875, "loss": 0.6743, "rewards/accuracies": 0.625, "rewards/chosen": -0.027576066553592682, "rewards/margins": 0.0417148619890213, "rewards/rejected": -0.06929092109203339, "step": 360 }, { "epoch": 0.055828339454861785, "grad_norm": 6.373989105224609, "learning_rate": 9.304123711340207e-07, "logits/chosen": 8.019002914428711, "logits/rejected": 11.971508979797363, "logps/chosen": -237.29595947265625, "logps/rejected": -309.50732421875, "loss": 0.685, "rewards/accuracies": 0.5, "rewards/chosen": 0.004565784707665443, "rewards/margins": 0.017758727073669434, "rewards/rejected": -0.013192940503358841, "step": 361 }, { "epoch": 0.05598298859462594, "grad_norm": 5.160031795501709, "learning_rate": 9.329896907216496e-07, "logits/chosen": 8.270573616027832, "logits/rejected": 12.278438568115234, "logps/chosen": -298.4097900390625, "logps/rejected": -364.43438720703125, "loss": 0.6691, "rewards/accuracies": 0.625, "rewards/chosen": -0.03375225141644478, "rewards/margins": 0.0528591126203537, "rewards/rejected": -0.08661137521266937, "step": 362 }, { "epoch": 0.0561376377343901, "grad_norm": 3.9054408073425293, "learning_rate": 9.355670103092785e-07, "logits/chosen": 11.082379341125488, "logits/rejected": 4.30116081237793, "logps/chosen": -269.73187255859375, "logps/rejected": -168.511474609375, "loss": 0.6715, "rewards/accuracies": 0.625, "rewards/chosen": 0.024539470672607422, "rewards/margins": 0.04987955838441849, "rewards/rejected": -0.025340082123875618, "step": 363 }, { "epoch": 0.05629228687415426, "grad_norm": 5.839847564697266, "learning_rate": 9.381443298969072e-07, "logits/chosen": 6.032514572143555, "logits/rejected": 7.4395222663879395, "logps/chosen": -265.63519287109375, "logps/rejected": -252.85031127929688, "loss": 0.6561, "rewards/accuracies": 0.875, "rewards/chosen": 0.07532262802124023, "rewards/margins": 0.07692494988441467, "rewards/rejected": -0.0016023144125938416, "step": 364 }, { "epoch": 0.056446936013918425, "grad_norm": 3.9480528831481934, "learning_rate": 9.407216494845361e-07, "logits/chosen": 9.79598617553711, "logits/rejected": 9.774332046508789, "logps/chosen": -236.4007568359375, "logps/rejected": -228.12460327148438, "loss": 0.6995, "rewards/accuracies": 0.5, "rewards/chosen": 0.008272027596831322, "rewards/margins": -0.009929515421390533, "rewards/rejected": 0.018201543018221855, "step": 365 }, { "epoch": 0.05660158515368258, "grad_norm": 4.725208759307861, "learning_rate": 9.43298969072165e-07, "logits/chosen": 10.195878982543945, "logits/rejected": 9.985554695129395, "logps/chosen": -287.7193298339844, "logps/rejected": -245.8714141845703, "loss": 0.6939, "rewards/accuracies": 0.375, "rewards/chosen": -0.00581636605784297, "rewards/margins": 0.002316952683031559, "rewards/rejected": -0.008133318275213242, "step": 366 }, { "epoch": 0.05675623429344674, "grad_norm": 4.951709270477295, "learning_rate": 9.458762886597939e-07, "logits/chosen": 14.735419273376465, "logits/rejected": 9.899681091308594, "logps/chosen": -285.44085693359375, "logps/rejected": -273.3875732421875, "loss": 0.7172, "rewards/accuracies": 0.625, "rewards/chosen": -0.04300088807940483, "rewards/margins": -0.04202437400817871, "rewards/rejected": -0.0009765159338712692, "step": 367 }, { "epoch": 0.0569108834332109, "grad_norm": 4.781497001647949, "learning_rate": 9.484536082474227e-07, "logits/chosen": 15.991767883300781, "logits/rejected": 8.424766540527344, "logps/chosen": -238.76235961914062, "logps/rejected": -197.5927734375, "loss": 0.7247, "rewards/accuracies": 0.125, "rewards/chosen": -0.021326255053281784, "rewards/margins": -0.060691025108098984, "rewards/rejected": 0.0393647700548172, "step": 368 }, { "epoch": 0.057065532572975065, "grad_norm": 5.740688800811768, "learning_rate": 9.510309278350516e-07, "logits/chosen": 10.063053131103516, "logits/rejected": 2.9771573543548584, "logps/chosen": -334.138427734375, "logps/rejected": -214.73902893066406, "loss": 0.6442, "rewards/accuracies": 0.75, "rewards/chosen": 0.06613150238990784, "rewards/margins": 0.10492897033691406, "rewards/rejected": -0.03879747539758682, "step": 369 }, { "epoch": 0.057220181712739224, "grad_norm": 4.534903049468994, "learning_rate": 9.536082474226805e-07, "logits/chosen": 11.396720886230469, "logits/rejected": 6.763251781463623, "logps/chosen": -216.68621826171875, "logps/rejected": -183.06671142578125, "loss": 0.6859, "rewards/accuracies": 0.625, "rewards/chosen": -0.03284458816051483, "rewards/margins": 0.017432022839784622, "rewards/rejected": -0.050276611000299454, "step": 370 }, { "epoch": 0.05737483085250338, "grad_norm": 7.5135016441345215, "learning_rate": 9.561855670103093e-07, "logits/chosen": 9.614764213562012, "logits/rejected": 8.616850852966309, "logps/chosen": -319.6884765625, "logps/rejected": -315.4322509765625, "loss": 0.7099, "rewards/accuracies": 0.5, "rewards/chosen": 0.027391627430915833, "rewards/margins": -0.02809848263859749, "rewards/rejected": 0.05549011379480362, "step": 371 }, { "epoch": 0.05752947999226754, "grad_norm": 6.570878505706787, "learning_rate": 9.587628865979382e-07, "logits/chosen": 6.676506042480469, "logits/rejected": 3.8560619354248047, "logps/chosen": -217.4125213623047, "logps/rejected": -242.31954956054688, "loss": 0.6844, "rewards/accuracies": 0.5, "rewards/chosen": 0.0360744446516037, "rewards/margins": 0.02077498659491539, "rewards/rejected": 0.015299463644623756, "step": 372 }, { "epoch": 0.057684129132031706, "grad_norm": 3.395559310913086, "learning_rate": 9.61340206185567e-07, "logits/chosen": 7.980839729309082, "logits/rejected": 13.185905456542969, "logps/chosen": -166.61965942382812, "logps/rejected": -147.8430633544922, "loss": 0.7083, "rewards/accuracies": 0.5, "rewards/chosen": -0.025401689112186432, "rewards/margins": -0.02807903103530407, "rewards/rejected": 0.0026773447170853615, "step": 373 }, { "epoch": 0.057838778271795864, "grad_norm": 4.8726911544799805, "learning_rate": 9.63917525773196e-07, "logits/chosen": 12.443926811218262, "logits/rejected": 15.207575798034668, "logps/chosen": -322.5223083496094, "logps/rejected": -278.626953125, "loss": 0.6744, "rewards/accuracies": 0.625, "rewards/chosen": 0.0005522725405171514, "rewards/margins": 0.03925461694598198, "rewards/rejected": -0.0387023463845253, "step": 374 }, { "epoch": 0.05799342741156002, "grad_norm": 5.701084136962891, "learning_rate": 9.664948453608248e-07, "logits/chosen": 13.062406539916992, "logits/rejected": 4.93897819519043, "logps/chosen": -365.8446960449219, "logps/rejected": -220.0662078857422, "loss": 0.6676, "rewards/accuracies": 0.625, "rewards/chosen": 0.08141865581274033, "rewards/margins": 0.054802462458610535, "rewards/rejected": 0.026616191491484642, "step": 375 }, { "epoch": 0.05814807655132418, "grad_norm": 4.8684844970703125, "learning_rate": 9.690721649484537e-07, "logits/chosen": 14.162793159484863, "logits/rejected": 10.294622421264648, "logps/chosen": -398.8664855957031, "logps/rejected": -331.074951171875, "loss": 0.7148, "rewards/accuracies": 0.5, "rewards/chosen": -0.047443680465221405, "rewards/margins": -0.03860035538673401, "rewards/rejected": -0.00884332600980997, "step": 376 }, { "epoch": 0.058302725691088346, "grad_norm": 5.442417621612549, "learning_rate": 9.716494845360826e-07, "logits/chosen": 9.5631103515625, "logits/rejected": 9.802350044250488, "logps/chosen": -315.45654296875, "logps/rejected": -310.20556640625, "loss": 0.6863, "rewards/accuracies": 0.625, "rewards/chosen": 0.013831520453095436, "rewards/margins": 0.017550181597471237, "rewards/rejected": -0.0037186630070209503, "step": 377 }, { "epoch": 0.058457374830852504, "grad_norm": 6.687643051147461, "learning_rate": 9.742268041237114e-07, "logits/chosen": 14.423138618469238, "logits/rejected": 13.543318748474121, "logps/chosen": -313.0621643066406, "logps/rejected": -310.69012451171875, "loss": 0.6815, "rewards/accuracies": 0.625, "rewards/chosen": 0.03683624416589737, "rewards/margins": 0.0302339568734169, "rewards/rejected": 0.006602289155125618, "step": 378 }, { "epoch": 0.05861202397061666, "grad_norm": 5.6178812980651855, "learning_rate": 9.768041237113403e-07, "logits/chosen": 9.242607116699219, "logits/rejected": 8.472833633422852, "logps/chosen": -381.8070983886719, "logps/rejected": -400.93890380859375, "loss": 0.6787, "rewards/accuracies": 0.375, "rewards/chosen": 0.040642257779836655, "rewards/margins": 0.036738112568855286, "rewards/rejected": 0.003904152661561966, "step": 379 }, { "epoch": 0.05876667311038082, "grad_norm": 3.849717378616333, "learning_rate": 9.793814432989692e-07, "logits/chosen": 6.134887218475342, "logits/rejected": 7.432956218719482, "logps/chosen": -202.69830322265625, "logps/rejected": -218.01083374023438, "loss": 0.6905, "rewards/accuracies": 0.5, "rewards/chosen": -0.003968000411987305, "rewards/margins": 0.01186499372124672, "rewards/rejected": -0.015832997858524323, "step": 380 }, { "epoch": 0.058921322250144986, "grad_norm": 4.675786972045898, "learning_rate": 9.81958762886598e-07, "logits/chosen": 13.579373359680176, "logits/rejected": 11.059019088745117, "logps/chosen": -277.2948913574219, "logps/rejected": -249.16050720214844, "loss": 0.6668, "rewards/accuracies": 0.625, "rewards/chosen": 0.03067338652908802, "rewards/margins": 0.05679219216108322, "rewards/rejected": -0.0261188056319952, "step": 381 }, { "epoch": 0.059075971389909145, "grad_norm": 6.958254814147949, "learning_rate": 9.84536082474227e-07, "logits/chosen": 13.963619232177734, "logits/rejected": 11.869502067565918, "logps/chosen": -339.5461120605469, "logps/rejected": -330.08624267578125, "loss": 0.7129, "rewards/accuracies": 0.5, "rewards/chosen": -0.03049173578619957, "rewards/margins": -0.03578352928161621, "rewards/rejected": 0.005291796289384365, "step": 382 }, { "epoch": 0.0592306205296733, "grad_norm": 5.348432540893555, "learning_rate": 9.871134020618558e-07, "logits/chosen": 11.107304573059082, "logits/rejected": 8.949459075927734, "logps/chosen": -354.39605712890625, "logps/rejected": -313.8327331542969, "loss": 0.6789, "rewards/accuracies": 0.625, "rewards/chosen": 0.012913036160171032, "rewards/margins": 0.030461497604846954, "rewards/rejected": -0.017548464238643646, "step": 383 }, { "epoch": 0.05938526966943746, "grad_norm": 6.003533363342285, "learning_rate": 9.896907216494845e-07, "logits/chosen": 5.539496421813965, "logits/rejected": 10.436893463134766, "logps/chosen": -300.509765625, "logps/rejected": -333.1765441894531, "loss": 0.6981, "rewards/accuracies": 0.5, "rewards/chosen": 0.00812673382461071, "rewards/margins": -0.008057691156864166, "rewards/rejected": 0.016184426844120026, "step": 384 }, { "epoch": 0.05953991880920163, "grad_norm": 4.731614112854004, "learning_rate": 9.922680412371133e-07, "logits/chosen": 9.498493194580078, "logits/rejected": 5.924538612365723, "logps/chosen": -375.8652648925781, "logps/rejected": -273.45050048828125, "loss": 0.7049, "rewards/accuracies": 0.375, "rewards/chosen": 0.008946752175688744, "rewards/margins": -0.018858052790164948, "rewards/rejected": 0.027804803103208542, "step": 385 }, { "epoch": 0.059694567948965785, "grad_norm": 4.286332607269287, "learning_rate": 9.948453608247422e-07, "logits/chosen": 14.273489952087402, "logits/rejected": 7.135898113250732, "logps/chosen": -252.98452758789062, "logps/rejected": -209.14920043945312, "loss": 0.7084, "rewards/accuracies": 0.625, "rewards/chosen": 0.0075575849041342735, "rewards/margins": -0.018668843433260918, "rewards/rejected": 0.026226425543427467, "step": 386 }, { "epoch": 0.05984921708872994, "grad_norm": 3.7653305530548096, "learning_rate": 9.974226804123713e-07, "logits/chosen": 9.252832412719727, "logits/rejected": 8.167980194091797, "logps/chosen": -131.02125549316406, "logps/rejected": -157.9849853515625, "loss": 0.6939, "rewards/accuracies": 0.625, "rewards/chosen": -0.006353235337883234, "rewards/margins": -4.544481635093689e-05, "rewards/rejected": -0.006307792849838734, "step": 387 }, { "epoch": 0.0600038662284941, "grad_norm": 5.94622278213501, "learning_rate": 1.0000000000000002e-06, "logits/chosen": 15.253323554992676, "logits/rejected": 13.738725662231445, "logps/chosen": -405.9993896484375, "logps/rejected": -355.36004638671875, "loss": 0.6922, "rewards/accuracies": 0.625, "rewards/chosen": -0.010611534118652344, "rewards/margins": 0.009074783883988857, "rewards/rejected": -0.019686317071318626, "step": 388 }, { "epoch": 0.06015851536825827, "grad_norm": 5.627812385559082, "learning_rate": 1.002577319587629e-06, "logits/chosen": 7.367169380187988, "logits/rejected": 11.624138832092285, "logps/chosen": -234.81594848632812, "logps/rejected": -327.74371337890625, "loss": 0.7235, "rewards/accuracies": 0.5, "rewards/chosen": -0.04003448784351349, "rewards/margins": -0.055303286761045456, "rewards/rejected": 0.015268802642822266, "step": 389 }, { "epoch": 0.060313164508022425, "grad_norm": 4.381931781768799, "learning_rate": 1.005154639175258e-06, "logits/chosen": 10.94035530090332, "logits/rejected": 4.489120006561279, "logps/chosen": -360.4523620605469, "logps/rejected": -250.97390747070312, "loss": 0.6892, "rewards/accuracies": 0.5, "rewards/chosen": -0.0281643345952034, "rewards/margins": 0.011478759348392487, "rewards/rejected": -0.039643097668886185, "step": 390 }, { "epoch": 0.060467813647786584, "grad_norm": 5.554924964904785, "learning_rate": 1.0077319587628868e-06, "logits/chosen": 10.382328033447266, "logits/rejected": 10.860540390014648, "logps/chosen": -305.2872009277344, "logps/rejected": -389.4974670410156, "loss": 0.6972, "rewards/accuracies": 0.375, "rewards/chosen": 0.019063852727413177, "rewards/margins": -0.0033590756356716156, "rewards/rejected": 0.02242293208837509, "step": 391 }, { "epoch": 0.06062246278755074, "grad_norm": 6.395705223083496, "learning_rate": 1.0103092783505157e-06, "logits/chosen": 8.464802742004395, "logits/rejected": 6.494001388549805, "logps/chosen": -419.99530029296875, "logps/rejected": -296.424560546875, "loss": 0.692, "rewards/accuracies": 0.375, "rewards/chosen": -0.020837876945734024, "rewards/margins": 0.009765293449163437, "rewards/rejected": -0.03060317412018776, "step": 392 }, { "epoch": 0.06077711192731491, "grad_norm": 4.983519077301025, "learning_rate": 1.0128865979381445e-06, "logits/chosen": 14.822391510009766, "logits/rejected": 12.447269439697266, "logps/chosen": -376.1868896484375, "logps/rejected": -295.7694396972656, "loss": 0.6795, "rewards/accuracies": 0.75, "rewards/chosen": 0.06785649806261063, "rewards/margins": 0.0322817787528038, "rewards/rejected": 0.03557472676038742, "step": 393 }, { "epoch": 0.060931761067079065, "grad_norm": 46.94654083251953, "learning_rate": 1.0154639175257732e-06, "logits/chosen": 10.589373588562012, "logits/rejected": 3.4049954414367676, "logps/chosen": -168.27459716796875, "logps/rejected": -220.32583618164062, "loss": 0.6713, "rewards/accuracies": 0.625, "rewards/chosen": 0.04598045349121094, "rewards/margins": 0.045572809875011444, "rewards/rejected": 0.0004076478071510792, "step": 394 }, { "epoch": 0.061086410206843224, "grad_norm": 5.823394775390625, "learning_rate": 1.018041237113402e-06, "logits/chosen": 7.790483474731445, "logits/rejected": 12.002765655517578, "logps/chosen": -220.02870178222656, "logps/rejected": -320.35577392578125, "loss": 0.7181, "rewards/accuracies": 0.25, "rewards/chosen": 0.003933548927307129, "rewards/margins": -0.045825716108083725, "rewards/rejected": 0.04975926876068115, "step": 395 }, { "epoch": 0.06124105934660738, "grad_norm": 4.505932807922363, "learning_rate": 1.020618556701031e-06, "logits/chosen": 9.639538764953613, "logits/rejected": 7.1267900466918945, "logps/chosen": -224.4041748046875, "logps/rejected": -176.70773315429688, "loss": 0.6818, "rewards/accuracies": 0.625, "rewards/chosen": 0.02993164025247097, "rewards/margins": 0.02464001253247261, "rewards/rejected": 0.00529162771999836, "step": 396 }, { "epoch": 0.06139570848637155, "grad_norm": 5.793320178985596, "learning_rate": 1.0231958762886598e-06, "logits/chosen": 12.236940383911133, "logits/rejected": 11.046968460083008, "logps/chosen": -365.28973388671875, "logps/rejected": -373.0707702636719, "loss": 0.6593, "rewards/accuracies": 0.5, "rewards/chosen": 0.052540771663188934, "rewards/margins": 0.08348321914672852, "rewards/rejected": -0.030942440032958984, "step": 397 }, { "epoch": 0.061550357626135706, "grad_norm": 4.560193061828613, "learning_rate": 1.0257731958762887e-06, "logits/chosen": 10.098865509033203, "logits/rejected": 8.987324714660645, "logps/chosen": -281.2982177734375, "logps/rejected": -271.20001220703125, "loss": 0.6421, "rewards/accuracies": 0.625, "rewards/chosen": 0.031172750517725945, "rewards/margins": 0.10895471274852753, "rewards/rejected": -0.07778196781873703, "step": 398 }, { "epoch": 0.061705006765899864, "grad_norm": 3.6590681076049805, "learning_rate": 1.0283505154639175e-06, "logits/chosen": 3.6563708782196045, "logits/rejected": 9.398565292358398, "logps/chosen": -106.84200286865234, "logps/rejected": -124.43630981445312, "loss": 0.7112, "rewards/accuracies": 0.25, "rewards/chosen": -0.018233537673950195, "rewards/margins": -0.03455691412091255, "rewards/rejected": 0.016323376446962357, "step": 399 }, { "epoch": 0.06185965590566402, "grad_norm": 5.60069465637207, "learning_rate": 1.0309278350515464e-06, "logits/chosen": 11.04920768737793, "logits/rejected": 12.640439987182617, "logps/chosen": -289.4459228515625, "logps/rejected": -264.5858154296875, "loss": 0.6786, "rewards/accuracies": 0.5, "rewards/chosen": 0.01958027109503746, "rewards/margins": 0.035243794322013855, "rewards/rejected": -0.015663526952266693, "step": 400 }, { "epoch": 0.06201430504542819, "grad_norm": 5.548246383666992, "learning_rate": 1.0335051546391753e-06, "logits/chosen": 8.104073524475098, "logits/rejected": 9.847138404846191, "logps/chosen": -281.1682434082031, "logps/rejected": -272.9338684082031, "loss": 0.6835, "rewards/accuracies": 0.5, "rewards/chosen": 0.009098435752093792, "rewards/margins": 0.023684311658143997, "rewards/rejected": -0.01458587683737278, "step": 401 }, { "epoch": 0.062168954185192346, "grad_norm": 4.97169828414917, "learning_rate": 1.0360824742268044e-06, "logits/chosen": 15.493053436279297, "logits/rejected": 2.851205348968506, "logps/chosen": -234.340576171875, "logps/rejected": -148.24948120117188, "loss": 0.7244, "rewards/accuracies": 0.25, "rewards/chosen": -0.010373781435191631, "rewards/margins": -0.057037945836782455, "rewards/rejected": 0.046664170920848846, "step": 402 }, { "epoch": 0.062323603324956504, "grad_norm": 5.792660713195801, "learning_rate": 1.038659793814433e-06, "logits/chosen": 13.377229690551758, "logits/rejected": 8.254720687866211, "logps/chosen": -370.7276611328125, "logps/rejected": -287.15008544921875, "loss": 0.6679, "rewards/accuracies": 0.625, "rewards/chosen": 0.06946516036987305, "rewards/margins": 0.05564098805189133, "rewards/rejected": 0.013824177905917168, "step": 403 }, { "epoch": 0.06247825246472066, "grad_norm": 3.400301933288574, "learning_rate": 1.041237113402062e-06, "logits/chosen": 7.354280471801758, "logits/rejected": 4.088191032409668, "logps/chosen": -177.28558349609375, "logps/rejected": -170.65638732910156, "loss": 0.6603, "rewards/accuracies": 0.625, "rewards/chosen": 0.031157229095697403, "rewards/margins": 0.07195504009723663, "rewards/rejected": -0.04079780727624893, "step": 404 }, { "epoch": 0.06263290160448483, "grad_norm": 4.521085262298584, "learning_rate": 1.0438144329896908e-06, "logits/chosen": 15.999117851257324, "logits/rejected": 7.129674434661865, "logps/chosen": -287.62994384765625, "logps/rejected": -186.813720703125, "loss": 0.6759, "rewards/accuracies": 0.75, "rewards/chosen": -0.010795880109071732, "rewards/margins": 0.04042024165391922, "rewards/rejected": -0.05121612548828125, "step": 405 }, { "epoch": 0.06278755074424898, "grad_norm": 3.939335823059082, "learning_rate": 1.0463917525773196e-06, "logits/chosen": 14.739656448364258, "logits/rejected": 12.417524337768555, "logps/chosen": -256.73895263671875, "logps/rejected": -189.15966796875, "loss": 0.6896, "rewards/accuracies": 0.5, "rewards/chosen": 0.030969714745879173, "rewards/margins": 0.013964174315333366, "rewards/rejected": 0.017005540430545807, "step": 406 }, { "epoch": 0.06294219988401314, "grad_norm": 4.425426483154297, "learning_rate": 1.0489690721649485e-06, "logits/chosen": 11.847260475158691, "logits/rejected": 7.343934535980225, "logps/chosen": -186.3024444580078, "logps/rejected": -154.26666259765625, "loss": 0.7195, "rewards/accuracies": 0.375, "rewards/chosen": -0.04618697240948677, "rewards/margins": -0.04906139522790909, "rewards/rejected": 0.0028744228184223175, "step": 407 }, { "epoch": 0.06309684902377731, "grad_norm": 6.185189247131348, "learning_rate": 1.0515463917525774e-06, "logits/chosen": 5.740227699279785, "logits/rejected": 9.117938041687012, "logps/chosen": -327.19561767578125, "logps/rejected": -353.5487060546875, "loss": 0.6835, "rewards/accuracies": 0.5, "rewards/chosen": 0.005328178405761719, "rewards/margins": 0.024118423461914062, "rewards/rejected": -0.018790245056152344, "step": 408 }, { "epoch": 0.06325149816354146, "grad_norm": 4.2316508293151855, "learning_rate": 1.0541237113402063e-06, "logits/chosen": 7.834961891174316, "logits/rejected": 7.438101291656494, "logps/chosen": -191.24432373046875, "logps/rejected": -211.92042541503906, "loss": 0.6478, "rewards/accuracies": 0.5, "rewards/chosen": 0.06866011768579483, "rewards/margins": 0.10433387756347656, "rewards/rejected": -0.03567375987768173, "step": 409 }, { "epoch": 0.06340614730330563, "grad_norm": 5.276022911071777, "learning_rate": 1.0567010309278351e-06, "logits/chosen": 7.849164009094238, "logits/rejected": 8.43691635131836, "logps/chosen": -310.4337158203125, "logps/rejected": -318.9949951171875, "loss": 0.7336, "rewards/accuracies": 0.25, "rewards/chosen": -0.044051505625247955, "rewards/margins": -0.0714423730969429, "rewards/rejected": 0.027390865609049797, "step": 410 }, { "epoch": 0.06356079644306979, "grad_norm": 5.893574237823486, "learning_rate": 1.059278350515464e-06, "logits/chosen": 6.151208877563477, "logits/rejected": 6.023972511291504, "logps/chosen": -233.06085205078125, "logps/rejected": -253.8050537109375, "loss": 0.6646, "rewards/accuracies": 0.625, "rewards/chosen": 0.05469723045825958, "rewards/margins": 0.05942115932703018, "rewards/rejected": -0.004723930731415749, "step": 411 }, { "epoch": 0.06371544558283394, "grad_norm": 4.401363372802734, "learning_rate": 1.0618556701030929e-06, "logits/chosen": 12.957690238952637, "logits/rejected": 9.535648345947266, "logps/chosen": -304.7477722167969, "logps/rejected": -230.508056640625, "loss": 0.7193, "rewards/accuracies": 0.5, "rewards/chosen": -0.0379658006131649, "rewards/margins": -0.04407656192779541, "rewards/rejected": 0.006110764108598232, "step": 412 }, { "epoch": 0.06387009472259811, "grad_norm": 3.616117477416992, "learning_rate": 1.0644329896907218e-06, "logits/chosen": 11.035181999206543, "logits/rejected": 11.399690628051758, "logps/chosen": -139.2234344482422, "logps/rejected": -147.9811248779297, "loss": 0.7337, "rewards/accuracies": 0.125, "rewards/chosen": -0.03679187595844269, "rewards/margins": -0.07756879925727844, "rewards/rejected": 0.040776923298835754, "step": 413 }, { "epoch": 0.06402474386236226, "grad_norm": 4.700901985168457, "learning_rate": 1.0670103092783506e-06, "logits/chosen": 7.755059719085693, "logits/rejected": 18.040864944458008, "logps/chosen": -175.4807891845703, "logps/rejected": -251.79000854492188, "loss": 0.7011, "rewards/accuracies": 0.625, "rewards/chosen": -0.009173011407256126, "rewards/margins": -0.012544157914817333, "rewards/rejected": 0.003371143713593483, "step": 414 }, { "epoch": 0.06417939300212643, "grad_norm": 3.2760438919067383, "learning_rate": 1.0695876288659795e-06, "logits/chosen": 6.526534080505371, "logits/rejected": 7.675030708312988, "logps/chosen": -130.85191345214844, "logps/rejected": -144.63307189941406, "loss": 0.6737, "rewards/accuracies": 0.5, "rewards/chosen": 0.06054973974823952, "rewards/margins": 0.04247203469276428, "rewards/rejected": 0.018077708780765533, "step": 415 }, { "epoch": 0.06433404214189059, "grad_norm": 6.944642543792725, "learning_rate": 1.0721649484536084e-06, "logits/chosen": 9.642186164855957, "logits/rejected": 6.517422676086426, "logps/chosen": -372.90985107421875, "logps/rejected": -269.1368103027344, "loss": 0.7411, "rewards/accuracies": 0.5, "rewards/chosen": -0.04121983051300049, "rewards/margins": -0.08632919937372208, "rewards/rejected": 0.04510936886072159, "step": 416 }, { "epoch": 0.06448869128165474, "grad_norm": 5.0813140869140625, "learning_rate": 1.0747422680412372e-06, "logits/chosen": 12.539143562316895, "logits/rejected": 10.774240493774414, "logps/chosen": -312.68560791015625, "logps/rejected": -258.262939453125, "loss": 0.7357, "rewards/accuracies": 0.25, "rewards/chosen": -0.08162746578454971, "rewards/margins": -0.07941237092018127, "rewards/rejected": -0.0022150976583361626, "step": 417 }, { "epoch": 0.06464334042141891, "grad_norm": 4.718321800231934, "learning_rate": 1.0773195876288661e-06, "logits/chosen": 11.76107120513916, "logits/rejected": 1.2383140325546265, "logps/chosen": -248.8386688232422, "logps/rejected": -154.86441040039062, "loss": 0.7073, "rewards/accuracies": 0.375, "rewards/chosen": 0.017749402672052383, "rewards/margins": -0.019788123667240143, "rewards/rejected": 0.037537530064582825, "step": 418 }, { "epoch": 0.06479798956118307, "grad_norm": 3.8745291233062744, "learning_rate": 1.079896907216495e-06, "logits/chosen": 8.4127836227417, "logits/rejected": 10.399481773376465, "logps/chosen": -229.52426147460938, "logps/rejected": -256.41644287109375, "loss": 0.6635, "rewards/accuracies": 0.625, "rewards/chosen": 0.00891094096004963, "rewards/margins": 0.07323947548866272, "rewards/rejected": -0.06432852894067764, "step": 419 }, { "epoch": 0.06495263870094722, "grad_norm": 7.327706336975098, "learning_rate": 1.0824742268041239e-06, "logits/chosen": 11.710941314697266, "logits/rejected": 4.853361129760742, "logps/chosen": -187.626220703125, "logps/rejected": -176.29119873046875, "loss": 0.6926, "rewards/accuracies": 0.375, "rewards/chosen": 0.020308634266257286, "rewards/margins": 0.006362845189869404, "rewards/rejected": 0.013945795595645905, "step": 420 }, { "epoch": 0.06510728784071139, "grad_norm": 5.054439544677734, "learning_rate": 1.0850515463917527e-06, "logits/chosen": 7.633783340454102, "logits/rejected": 9.53019905090332, "logps/chosen": -332.91033935546875, "logps/rejected": -347.2127990722656, "loss": 0.6818, "rewards/accuracies": 0.625, "rewards/chosen": 0.0553162582218647, "rewards/margins": 0.02460308186709881, "rewards/rejected": 0.030713174492120743, "step": 421 }, { "epoch": 0.06526193698047554, "grad_norm": 4.469846725463867, "learning_rate": 1.0876288659793816e-06, "logits/chosen": 12.601400375366211, "logits/rejected": 6.941188335418701, "logps/chosen": -297.45391845703125, "logps/rejected": -243.72630310058594, "loss": 0.722, "rewards/accuracies": 0.25, "rewards/chosen": -0.02540598064661026, "rewards/margins": -0.05200149863958359, "rewards/rejected": 0.026595521718263626, "step": 422 }, { "epoch": 0.0654165861202397, "grad_norm": 3.918545961380005, "learning_rate": 1.0902061855670105e-06, "logits/chosen": 10.874977111816406, "logits/rejected": 5.981527328491211, "logps/chosen": -258.41192626953125, "logps/rejected": -224.7599639892578, "loss": 0.6514, "rewards/accuracies": 0.625, "rewards/chosen": 0.07436370849609375, "rewards/margins": 0.08920204639434814, "rewards/rejected": -0.01483833882957697, "step": 423 }, { "epoch": 0.06557123526000387, "grad_norm": 5.158308029174805, "learning_rate": 1.0927835051546393e-06, "logits/chosen": 13.492680549621582, "logits/rejected": 7.164361953735352, "logps/chosen": -385.2947998046875, "logps/rejected": -204.1776885986328, "loss": 0.685, "rewards/accuracies": 0.625, "rewards/chosen": 0.006291484460234642, "rewards/margins": 0.01792421191930771, "rewards/rejected": -0.011632728390395641, "step": 424 }, { "epoch": 0.06572588439976802, "grad_norm": 6.421451568603516, "learning_rate": 1.0953608247422682e-06, "logits/chosen": 14.007320404052734, "logits/rejected": 14.686290740966797, "logps/chosen": -375.481689453125, "logps/rejected": -330.3765869140625, "loss": 0.6328, "rewards/accuracies": 0.875, "rewards/chosen": 0.02172689512372017, "rewards/margins": 0.12835693359375, "rewards/rejected": -0.10663004219532013, "step": 425 }, { "epoch": 0.06588053353953219, "grad_norm": 5.324506759643555, "learning_rate": 1.097938144329897e-06, "logits/chosen": 9.088860511779785, "logits/rejected": 9.94772720336914, "logps/chosen": -412.4358825683594, "logps/rejected": -346.6433410644531, "loss": 0.6596, "rewards/accuracies": 0.625, "rewards/chosen": 0.02656860277056694, "rewards/margins": 0.08164462447166443, "rewards/rejected": -0.05507602542638779, "step": 426 }, { "epoch": 0.06603518267929635, "grad_norm": 10.140251159667969, "learning_rate": 1.1005154639175257e-06, "logits/chosen": 12.245105743408203, "logits/rejected": 8.658808708190918, "logps/chosen": -461.3382568359375, "logps/rejected": -376.05560302734375, "loss": 0.7176, "rewards/accuracies": 0.25, "rewards/chosen": -0.06021628528833389, "rewards/margins": -0.04364652931690216, "rewards/rejected": -0.01656975969672203, "step": 427 }, { "epoch": 0.0661898318190605, "grad_norm": 3.773886203765869, "learning_rate": 1.1030927835051546e-06, "logits/chosen": 10.883367538452148, "logits/rejected": 6.539021015167236, "logps/chosen": -315.97607421875, "logps/rejected": -244.35726928710938, "loss": 0.6606, "rewards/accuracies": 0.625, "rewards/chosen": 0.045493461191654205, "rewards/margins": 0.07271971553564072, "rewards/rejected": -0.02722625620663166, "step": 428 }, { "epoch": 0.06634448095882467, "grad_norm": 5.636545658111572, "learning_rate": 1.1056701030927835e-06, "logits/chosen": 5.362805366516113, "logits/rejected": 1.5850732326507568, "logps/chosen": -365.0799255371094, "logps/rejected": -250.87652587890625, "loss": 0.7222, "rewards/accuracies": 0.375, "rewards/chosen": -0.08530960232019424, "rewards/margins": -0.055381014943122864, "rewards/rejected": -0.02992858737707138, "step": 429 }, { "epoch": 0.06649913009858882, "grad_norm": 5.54054594039917, "learning_rate": 1.1082474226804124e-06, "logits/chosen": 6.456350803375244, "logits/rejected": 5.648205757141113, "logps/chosen": -317.45147705078125, "logps/rejected": -265.6895751953125, "loss": 0.6542, "rewards/accuracies": 0.625, "rewards/chosen": 0.04060354083776474, "rewards/margins": 0.08768625557422638, "rewards/rejected": -0.04708271473646164, "step": 430 }, { "epoch": 0.06665377923835299, "grad_norm": 4.973996162414551, "learning_rate": 1.1108247422680412e-06, "logits/chosen": 14.110208511352539, "logits/rejected": 10.118903160095215, "logps/chosen": -272.4558410644531, "logps/rejected": -246.0497283935547, "loss": 0.7095, "rewards/accuracies": 0.5, "rewards/chosen": -0.015845393761992455, "rewards/margins": -0.029163219034671783, "rewards/rejected": 0.01331782341003418, "step": 431 }, { "epoch": 0.06680842837811715, "grad_norm": 7.229224681854248, "learning_rate": 1.1134020618556703e-06, "logits/chosen": 14.000032424926758, "logits/rejected": 11.125240325927734, "logps/chosen": -401.0099792480469, "logps/rejected": -432.0771484375, "loss": 0.7027, "rewards/accuracies": 0.5, "rewards/chosen": -0.0053405724465847015, "rewards/margins": -0.01319103129208088, "rewards/rejected": 0.007850456051528454, "step": 432 }, { "epoch": 0.0669630775178813, "grad_norm": 5.13885498046875, "learning_rate": 1.1159793814432992e-06, "logits/chosen": 15.442996978759766, "logits/rejected": 11.216538429260254, "logps/chosen": -385.5250549316406, "logps/rejected": -268.22015380859375, "loss": 0.719, "rewards/accuracies": 0.375, "rewards/chosen": 0.004207659512758255, "rewards/margins": -0.04783777892589569, "rewards/rejected": 0.052045442163944244, "step": 433 }, { "epoch": 0.06711772665764547, "grad_norm": 4.798912048339844, "learning_rate": 1.118556701030928e-06, "logits/chosen": 11.21569538116455, "logits/rejected": 10.157526016235352, "logps/chosen": -246.6448974609375, "logps/rejected": -230.35739135742188, "loss": 0.7533, "rewards/accuracies": 0.0, "rewards/chosen": -0.031946610659360886, "rewards/margins": -0.11454010009765625, "rewards/rejected": 0.08259349316358566, "step": 434 }, { "epoch": 0.06727237579740963, "grad_norm": 4.816092491149902, "learning_rate": 1.121134020618557e-06, "logits/chosen": 6.873818397521973, "logits/rejected": 2.828075408935547, "logps/chosen": -271.92919921875, "logps/rejected": -181.554443359375, "loss": 0.72, "rewards/accuracies": 0.125, "rewards/chosen": -0.03334550932049751, "rewards/margins": -0.05021844431757927, "rewards/rejected": 0.01687292940914631, "step": 435 }, { "epoch": 0.06742702493717379, "grad_norm": 4.457712173461914, "learning_rate": 1.1237113402061856e-06, "logits/chosen": 13.320279121398926, "logits/rejected": 9.802282333374023, "logps/chosen": -330.33319091796875, "logps/rejected": -269.45849609375, "loss": 0.6793, "rewards/accuracies": 0.625, "rewards/chosen": 0.017458343878388405, "rewards/margins": 0.03067474253475666, "rewards/rejected": -0.013216400519013405, "step": 436 }, { "epoch": 0.06758167407693795, "grad_norm": 4.063880920410156, "learning_rate": 1.1262886597938145e-06, "logits/chosen": 16.130605697631836, "logits/rejected": 12.348139762878418, "logps/chosen": -198.15565490722656, "logps/rejected": -178.13626098632812, "loss": 0.6857, "rewards/accuracies": 0.625, "rewards/chosen": 0.02518201246857643, "rewards/margins": 0.015363645739853382, "rewards/rejected": 0.0098183648660779, "step": 437 }, { "epoch": 0.0677363232167021, "grad_norm": 4.596961498260498, "learning_rate": 1.1288659793814433e-06, "logits/chosen": 9.14218807220459, "logits/rejected": 10.569528579711914, "logps/chosen": -193.8854217529297, "logps/rejected": -218.80386352539062, "loss": 0.7241, "rewards/accuracies": 0.375, "rewards/chosen": -0.02099437825381756, "rewards/margins": -0.05657162517309189, "rewards/rejected": 0.03557724878191948, "step": 438 }, { "epoch": 0.06789097235646627, "grad_norm": 5.912600517272949, "learning_rate": 1.1314432989690722e-06, "logits/chosen": 9.41981315612793, "logits/rejected": 6.537056922912598, "logps/chosen": -351.82135009765625, "logps/rejected": -227.25894165039062, "loss": 0.7363, "rewards/accuracies": 0.5, "rewards/chosen": -0.13925524055957794, "rewards/margins": -0.07983632385730743, "rewards/rejected": -0.05941891670227051, "step": 439 }, { "epoch": 0.06804562149623043, "grad_norm": 6.625514984130859, "learning_rate": 1.134020618556701e-06, "logits/chosen": 11.954994201660156, "logits/rejected": 8.536954879760742, "logps/chosen": -305.7756652832031, "logps/rejected": -244.58151245117188, "loss": 0.7504, "rewards/accuracies": 0.25, "rewards/chosen": -0.08066359162330627, "rewards/margins": -0.10602931678295135, "rewards/rejected": 0.025365734472870827, "step": 440 }, { "epoch": 0.06820027063599458, "grad_norm": 7.536952018737793, "learning_rate": 1.13659793814433e-06, "logits/chosen": 13.029542922973633, "logits/rejected": 7.037688732147217, "logps/chosen": -576.3472290039062, "logps/rejected": -304.11407470703125, "loss": 0.756, "rewards/accuracies": 0.375, "rewards/chosen": -0.03906955569982529, "rewards/margins": -0.1104925125837326, "rewards/rejected": 0.07142295688390732, "step": 441 }, { "epoch": 0.06835491977575875, "grad_norm": 4.61942720413208, "learning_rate": 1.1391752577319588e-06, "logits/chosen": 10.842082977294922, "logits/rejected": 12.782186508178711, "logps/chosen": -230.60238647460938, "logps/rejected": -241.21810913085938, "loss": 0.7052, "rewards/accuracies": 0.5, "rewards/chosen": -0.007932376116514206, "rewards/margins": -0.01750480942428112, "rewards/rejected": 0.009572433307766914, "step": 442 }, { "epoch": 0.06850956891552291, "grad_norm": 6.207898139953613, "learning_rate": 1.1417525773195877e-06, "logits/chosen": 5.723091125488281, "logits/rejected": 9.703177452087402, "logps/chosen": -364.4405822753906, "logps/rejected": -386.5208740234375, "loss": 0.7192, "rewards/accuracies": 0.375, "rewards/chosen": -0.0430697463452816, "rewards/margins": -0.04799194633960724, "rewards/rejected": 0.004922197666019201, "step": 443 }, { "epoch": 0.06866421805528707, "grad_norm": 5.925559043884277, "learning_rate": 1.1443298969072166e-06, "logits/chosen": 10.710570335388184, "logits/rejected": 5.233012676239014, "logps/chosen": -327.94757080078125, "logps/rejected": -242.01473999023438, "loss": 0.6889, "rewards/accuracies": 0.625, "rewards/chosen": -0.022325709462165833, "rewards/margins": 0.011135056614875793, "rewards/rejected": -0.03346076235175133, "step": 444 }, { "epoch": 0.06881886719505123, "grad_norm": 4.352199554443359, "learning_rate": 1.1469072164948454e-06, "logits/chosen": 19.635128021240234, "logits/rejected": 11.797924041748047, "logps/chosen": -264.49005126953125, "logps/rejected": -145.75155639648438, "loss": 0.6844, "rewards/accuracies": 0.625, "rewards/chosen": 0.002335714176297188, "rewards/margins": 0.018510818481445312, "rewards/rejected": -0.016175102442502975, "step": 445 }, { "epoch": 0.06897351633481538, "grad_norm": 12.978840827941895, "learning_rate": 1.1494845360824743e-06, "logits/chosen": 13.32224178314209, "logits/rejected": 10.29178237915039, "logps/chosen": -213.24143981933594, "logps/rejected": -228.46661376953125, "loss": 0.6915, "rewards/accuracies": 0.625, "rewards/chosen": 0.007376480847597122, "rewards/margins": 0.005146408919245005, "rewards/rejected": 0.002230070997029543, "step": 446 }, { "epoch": 0.06912816547457955, "grad_norm": 4.917621612548828, "learning_rate": 1.1520618556701032e-06, "logits/chosen": 8.363395690917969, "logits/rejected": 8.210826873779297, "logps/chosen": -360.5667419433594, "logps/rejected": -286.4749755859375, "loss": 0.665, "rewards/accuracies": 0.75, "rewards/chosen": 0.01968412473797798, "rewards/margins": 0.06538467854261398, "rewards/rejected": -0.0457005500793457, "step": 447 }, { "epoch": 0.06928281461434371, "grad_norm": 5.113491535186768, "learning_rate": 1.154639175257732e-06, "logits/chosen": 2.8186323642730713, "logits/rejected": 4.038246154785156, "logps/chosen": -273.90380859375, "logps/rejected": -269.4390563964844, "loss": 0.6874, "rewards/accuracies": 0.625, "rewards/chosen": 0.04487667232751846, "rewards/margins": 0.02400503121316433, "rewards/rejected": 0.020871637389063835, "step": 448 }, { "epoch": 0.06943746375410786, "grad_norm": 5.9082159996032715, "learning_rate": 1.157216494845361e-06, "logits/chosen": 6.520223617553711, "logits/rejected": 7.196210861206055, "logps/chosen": -242.16896057128906, "logps/rejected": -244.6864776611328, "loss": 0.7472, "rewards/accuracies": 0.25, "rewards/chosen": -0.07740268856287003, "rewards/margins": -0.10075444728136063, "rewards/rejected": 0.0233517624437809, "step": 449 }, { "epoch": 0.06959211289387203, "grad_norm": 4.350265979766846, "learning_rate": 1.1597938144329898e-06, "logits/chosen": 6.269900321960449, "logits/rejected": 5.303152084350586, "logps/chosen": -223.11007690429688, "logps/rejected": -233.52423095703125, "loss": 0.6804, "rewards/accuracies": 0.5, "rewards/chosen": 0.0032989010214805603, "rewards/margins": 0.03347807005047798, "rewards/rejected": -0.030179165303707123, "step": 450 }, { "epoch": 0.06974676203363618, "grad_norm": 5.940832614898682, "learning_rate": 1.1623711340206187e-06, "logits/chosen": 9.687047004699707, "logits/rejected": 12.396611213684082, "logps/chosen": -251.50350952148438, "logps/rejected": -299.99945068359375, "loss": 0.6704, "rewards/accuracies": 0.625, "rewards/chosen": 0.06874170899391174, "rewards/margins": 0.050896354019641876, "rewards/rejected": 0.01784534379839897, "step": 451 }, { "epoch": 0.06990141117340035, "grad_norm": 5.027914047241211, "learning_rate": 1.1649484536082475e-06, "logits/chosen": 5.539091110229492, "logits/rejected": 5.175711631774902, "logps/chosen": -217.0523681640625, "logps/rejected": -277.43768310546875, "loss": 0.7147, "rewards/accuracies": 0.5, "rewards/chosen": -0.008119489066302776, "rewards/margins": -0.03714494779706001, "rewards/rejected": 0.029025457799434662, "step": 452 }, { "epoch": 0.07005606031316451, "grad_norm": 4.091969966888428, "learning_rate": 1.1675257731958764e-06, "logits/chosen": 9.525016784667969, "logits/rejected": 6.513009071350098, "logps/chosen": -183.4822998046875, "logps/rejected": -162.91171264648438, "loss": 0.6526, "rewards/accuracies": 0.875, "rewards/chosen": 0.011124372482299805, "rewards/margins": 0.08528690040111542, "rewards/rejected": -0.07416252791881561, "step": 453 }, { "epoch": 0.07021070945292866, "grad_norm": 4.9361419677734375, "learning_rate": 1.1701030927835053e-06, "logits/chosen": 15.196216583251953, "logits/rejected": 8.254142761230469, "logps/chosen": -341.0078125, "logps/rejected": -272.251220703125, "loss": 0.7192, "rewards/accuracies": 0.25, "rewards/chosen": -0.027997970581054688, "rewards/margins": -0.04884038120508194, "rewards/rejected": 0.020842408761382103, "step": 454 }, { "epoch": 0.07036535859269283, "grad_norm": 5.737484931945801, "learning_rate": 1.1726804123711342e-06, "logits/chosen": 8.865446090698242, "logits/rejected": 12.857247352600098, "logps/chosen": -264.6304931640625, "logps/rejected": -265.1905517578125, "loss": 0.6574, "rewards/accuracies": 0.75, "rewards/chosen": 0.0091384407132864, "rewards/margins": 0.07763008773326874, "rewards/rejected": -0.06849164515733719, "step": 455 }, { "epoch": 0.070520007732457, "grad_norm": 5.029343128204346, "learning_rate": 1.175257731958763e-06, "logits/chosen": 6.768464088439941, "logits/rejected": 4.105677604675293, "logps/chosen": -300.0268249511719, "logps/rejected": -210.82620239257812, "loss": 0.6701, "rewards/accuracies": 0.625, "rewards/chosen": 0.006981231272220612, "rewards/margins": 0.048430733382701874, "rewards/rejected": -0.04144950211048126, "step": 456 }, { "epoch": 0.07067465687222114, "grad_norm": 6.606870651245117, "learning_rate": 1.177835051546392e-06, "logits/chosen": 9.166215896606445, "logits/rejected": 2.806367874145508, "logps/chosen": -352.10003662109375, "logps/rejected": -343.111328125, "loss": 0.7365, "rewards/accuracies": 0.375, "rewards/chosen": -0.06574497371912003, "rewards/margins": -0.08144745975732803, "rewards/rejected": 0.01570248417556286, "step": 457 }, { "epoch": 0.07082930601198531, "grad_norm": 7.202205657958984, "learning_rate": 1.1804123711340208e-06, "logits/chosen": 3.586365222930908, "logits/rejected": 0.8253377676010132, "logps/chosen": -360.13494873046875, "logps/rejected": -324.2682189941406, "loss": 0.7161, "rewards/accuracies": 0.375, "rewards/chosen": -0.060147859156131744, "rewards/margins": -0.04070886969566345, "rewards/rejected": -0.019438982009887695, "step": 458 }, { "epoch": 0.07098395515174946, "grad_norm": 5.205109596252441, "learning_rate": 1.1829896907216496e-06, "logits/chosen": 11.185762405395508, "logits/rejected": 14.334480285644531, "logps/chosen": -299.57733154296875, "logps/rejected": -290.84576416015625, "loss": 0.6919, "rewards/accuracies": 0.375, "rewards/chosen": 0.0498957633972168, "rewards/margins": 0.009350206702947617, "rewards/rejected": 0.04054555669426918, "step": 459 }, { "epoch": 0.07113860429151363, "grad_norm": 5.1602020263671875, "learning_rate": 1.1855670103092783e-06, "logits/chosen": 12.370702743530273, "logits/rejected": 8.091865539550781, "logps/chosen": -318.81549072265625, "logps/rejected": -297.52362060546875, "loss": 0.6797, "rewards/accuracies": 0.75, "rewards/chosen": -0.04072318226099014, "rewards/margins": 0.0316314697265625, "rewards/rejected": -0.07235465198755264, "step": 460 }, { "epoch": 0.07129325343127779, "grad_norm": 5.133179664611816, "learning_rate": 1.1881443298969072e-06, "logits/chosen": 3.630934000015259, "logits/rejected": 11.368070602416992, "logps/chosen": -270.8773498535156, "logps/rejected": -243.56610107421875, "loss": 0.7303, "rewards/accuracies": 0.125, "rewards/chosen": -0.03770842403173447, "rewards/margins": -0.07174105942249298, "rewards/rejected": 0.034032631665468216, "step": 461 }, { "epoch": 0.07144790257104194, "grad_norm": 5.185516834259033, "learning_rate": 1.190721649484536e-06, "logits/chosen": 7.575233459472656, "logits/rejected": 4.580668926239014, "logps/chosen": -204.6844482421875, "logps/rejected": -197.384765625, "loss": 0.6914, "rewards/accuracies": 0.625, "rewards/chosen": -0.0013845921494066715, "rewards/margins": 0.006259298417717218, "rewards/rejected": -0.007643889635801315, "step": 462 }, { "epoch": 0.07160255171080611, "grad_norm": 6.377435207366943, "learning_rate": 1.1932989690721651e-06, "logits/chosen": 4.983016014099121, "logits/rejected": 0.05048179626464844, "logps/chosen": -319.78411865234375, "logps/rejected": -216.42721557617188, "loss": 0.6942, "rewards/accuracies": 0.5, "rewards/chosen": -0.03390403091907501, "rewards/margins": -0.0013873083516955376, "rewards/rejected": -0.0325167179107666, "step": 463 }, { "epoch": 0.07175720085057027, "grad_norm": 5.488708972930908, "learning_rate": 1.195876288659794e-06, "logits/chosen": 9.706180572509766, "logits/rejected": 8.32201099395752, "logps/chosen": -288.8235778808594, "logps/rejected": -275.0206298828125, "loss": 0.7092, "rewards/accuracies": 0.25, "rewards/chosen": 0.02388305775821209, "rewards/margins": -0.030299853533506393, "rewards/rejected": 0.054182909429073334, "step": 464 }, { "epoch": 0.07191184999033443, "grad_norm": 8.434926986694336, "learning_rate": 1.1984536082474229e-06, "logits/chosen": 7.797680377960205, "logits/rejected": 9.956478118896484, "logps/chosen": -395.3771667480469, "logps/rejected": -477.75323486328125, "loss": 0.6461, "rewards/accuracies": 0.875, "rewards/chosen": 0.04650573432445526, "rewards/margins": 0.10006103664636612, "rewards/rejected": -0.05355529859662056, "step": 465 }, { "epoch": 0.07206649913009859, "grad_norm": 4.710972309112549, "learning_rate": 1.2010309278350517e-06, "logits/chosen": 12.573779106140137, "logits/rejected": 8.880084991455078, "logps/chosen": -196.49710083007812, "logps/rejected": -228.15316772460938, "loss": 0.7172, "rewards/accuracies": 0.375, "rewards/chosen": -0.05125265195965767, "rewards/margins": -0.04457240551710129, "rewards/rejected": -0.006680251099169254, "step": 466 }, { "epoch": 0.07222114826986274, "grad_norm": 5.636977195739746, "learning_rate": 1.2036082474226806e-06, "logits/chosen": 12.430391311645508, "logits/rejected": 10.582672119140625, "logps/chosen": -465.9703063964844, "logps/rejected": -376.74609375, "loss": 0.6615, "rewards/accuracies": 0.5, "rewards/chosen": -0.03753151744604111, "rewards/margins": 0.07629828155040741, "rewards/rejected": -0.11382980644702911, "step": 467 }, { "epoch": 0.07237579740962691, "grad_norm": 5.817788600921631, "learning_rate": 1.2061855670103095e-06, "logits/chosen": 10.093749046325684, "logits/rejected": 9.10906982421875, "logps/chosen": -375.3330078125, "logps/rejected": -334.5609436035156, "loss": 0.7106, "rewards/accuracies": 0.5, "rewards/chosen": -0.04627704620361328, "rewards/margins": -0.03184280917048454, "rewards/rejected": -0.014434242621064186, "step": 468 }, { "epoch": 0.07253044654939107, "grad_norm": 3.9110538959503174, "learning_rate": 1.2087628865979382e-06, "logits/chosen": 6.988757610321045, "logits/rejected": 3.1502225399017334, "logps/chosen": -252.61614990234375, "logps/rejected": -187.54251098632812, "loss": 0.6927, "rewards/accuracies": 0.375, "rewards/chosen": -0.016286754980683327, "rewards/margins": 0.0049581993371248245, "rewards/rejected": -0.02124495431780815, "step": 469 }, { "epoch": 0.07268509568915522, "grad_norm": 5.507916450500488, "learning_rate": 1.211340206185567e-06, "logits/chosen": 9.607152938842773, "logits/rejected": 11.154939651489258, "logps/chosen": -304.7886047363281, "logps/rejected": -338.859130859375, "loss": 0.6901, "rewards/accuracies": 0.5, "rewards/chosen": -0.029600191861391068, "rewards/margins": 0.012299539521336555, "rewards/rejected": -0.041899729520082474, "step": 470 }, { "epoch": 0.07283974482891939, "grad_norm": 18.582128524780273, "learning_rate": 1.213917525773196e-06, "logits/chosen": 9.084453582763672, "logits/rejected": 1.4555106163024902, "logps/chosen": -508.26171875, "logps/rejected": -334.971923828125, "loss": 0.7142, "rewards/accuracies": 0.5, "rewards/chosen": -0.05624312907457352, "rewards/margins": -0.0235869362950325, "rewards/rejected": -0.032656192779541016, "step": 471 }, { "epoch": 0.07299439396868355, "grad_norm": 5.251086711883545, "learning_rate": 1.2164948453608248e-06, "logits/chosen": 4.806502342224121, "logits/rejected": 5.997603893280029, "logps/chosen": -318.2728271484375, "logps/rejected": -330.8165588378906, "loss": 0.6601, "rewards/accuracies": 0.625, "rewards/chosen": 0.024497367441654205, "rewards/margins": 0.07905077934265137, "rewards/rejected": -0.05455341190099716, "step": 472 }, { "epoch": 0.0731490431084477, "grad_norm": 7.7333149909973145, "learning_rate": 1.2190721649484536e-06, "logits/chosen": 8.06280517578125, "logits/rejected": 6.172645568847656, "logps/chosen": -264.54034423828125, "logps/rejected": -267.8489990234375, "loss": 0.7209, "rewards/accuracies": 0.375, "rewards/chosen": -0.05796775966882706, "rewards/margins": -0.051055099815130234, "rewards/rejected": -0.00691266031935811, "step": 473 }, { "epoch": 0.07330369224821187, "grad_norm": 5.668310642242432, "learning_rate": 1.2216494845360825e-06, "logits/chosen": 14.605587005615234, "logits/rejected": 11.51723861694336, "logps/chosen": -415.44781494140625, "logps/rejected": -361.1017761230469, "loss": 0.7256, "rewards/accuracies": 0.375, "rewards/chosen": -0.11909104138612747, "rewards/margins": -0.05120258405804634, "rewards/rejected": -0.06788845360279083, "step": 474 }, { "epoch": 0.07345834138797602, "grad_norm": 4.868416786193848, "learning_rate": 1.2242268041237114e-06, "logits/chosen": 13.368406295776367, "logits/rejected": 9.429543495178223, "logps/chosen": -296.8177795410156, "logps/rejected": -251.13247680664062, "loss": 0.6958, "rewards/accuracies": 0.375, "rewards/chosen": -0.06994041800498962, "rewards/margins": -0.003677511587738991, "rewards/rejected": -0.06626291573047638, "step": 475 }, { "epoch": 0.07361299052774019, "grad_norm": 5.10068416595459, "learning_rate": 1.2268041237113403e-06, "logits/chosen": 10.763989448547363, "logits/rejected": 11.519791603088379, "logps/chosen": -302.250732421875, "logps/rejected": -260.015380859375, "loss": 0.6773, "rewards/accuracies": 0.75, "rewards/chosen": -0.01711585558950901, "rewards/margins": 0.034076668322086334, "rewards/rejected": -0.051192522048950195, "step": 476 }, { "epoch": 0.07376763966750435, "grad_norm": 4.7022705078125, "learning_rate": 1.2293814432989691e-06, "logits/chosen": 14.871297836303711, "logits/rejected": 8.740377426147461, "logps/chosen": -415.8657531738281, "logps/rejected": -297.3988342285156, "loss": 0.6549, "rewards/accuracies": 0.75, "rewards/chosen": 0.046979717910289764, "rewards/margins": 0.0824311226606369, "rewards/rejected": -0.035451412200927734, "step": 477 }, { "epoch": 0.0739222888072685, "grad_norm": 4.601221561431885, "learning_rate": 1.231958762886598e-06, "logits/chosen": 14.473128318786621, "logits/rejected": 15.302721977233887, "logps/chosen": -240.88668823242188, "logps/rejected": -258.47900390625, "loss": 0.748, "rewards/accuracies": 0.25, "rewards/chosen": -0.0831640213727951, "rewards/margins": -0.10379353165626526, "rewards/rejected": 0.020629502832889557, "step": 478 }, { "epoch": 0.07407693794703267, "grad_norm": 5.6209716796875, "learning_rate": 1.2345360824742269e-06, "logits/chosen": 8.16950798034668, "logits/rejected": 4.3024492263793945, "logps/chosen": -281.9227294921875, "logps/rejected": -232.9622802734375, "loss": 0.6899, "rewards/accuracies": 0.5, "rewards/chosen": -0.011417914181947708, "rewards/margins": 0.010905311442911625, "rewards/rejected": -0.022323228418827057, "step": 479 }, { "epoch": 0.07423158708679684, "grad_norm": 4.501147747039795, "learning_rate": 1.2371134020618557e-06, "logits/chosen": 9.206696510314941, "logits/rejected": 10.740234375, "logps/chosen": -264.6712951660156, "logps/rejected": -250.37405395507812, "loss": 0.6617, "rewards/accuracies": 0.625, "rewards/chosen": -0.003394031897187233, "rewards/margins": 0.07045107334852219, "rewards/rejected": -0.07384509593248367, "step": 480 }, { "epoch": 0.07438623622656099, "grad_norm": 13.334127426147461, "learning_rate": 1.2396907216494846e-06, "logits/chosen": 12.364567756652832, "logits/rejected": 9.056158065795898, "logps/chosen": -301.6614685058594, "logps/rejected": -292.27978515625, "loss": 0.6771, "rewards/accuracies": 0.75, "rewards/chosen": 0.009432125836610794, "rewards/margins": 0.03644128143787384, "rewards/rejected": -0.027009155601263046, "step": 481 }, { "epoch": 0.07454088536632515, "grad_norm": 4.799774169921875, "learning_rate": 1.2422680412371135e-06, "logits/chosen": 12.694483757019043, "logits/rejected": 8.58286190032959, "logps/chosen": -259.81097412109375, "logps/rejected": -252.54266357421875, "loss": 0.7058, "rewards/accuracies": 0.375, "rewards/chosen": -0.04981064796447754, "rewards/margins": -0.022566698491573334, "rewards/rejected": -0.027243951335549355, "step": 482 }, { "epoch": 0.0746955345060893, "grad_norm": 3.73966646194458, "learning_rate": 1.2448453608247424e-06, "logits/chosen": 9.738327980041504, "logits/rejected": 10.59136962890625, "logps/chosen": -193.85232543945312, "logps/rejected": -163.93902587890625, "loss": 0.707, "rewards/accuracies": 0.25, "rewards/chosen": -0.033843282610177994, "rewards/margins": -0.026288272812962532, "rewards/rejected": -0.007555009797215462, "step": 483 }, { "epoch": 0.07485018364585347, "grad_norm": 4.871867656707764, "learning_rate": 1.2474226804123712e-06, "logits/chosen": 15.306303977966309, "logits/rejected": 13.005790710449219, "logps/chosen": -267.81549072265625, "logps/rejected": -257.08367919921875, "loss": 0.7025, "rewards/accuracies": 0.5, "rewards/chosen": -0.021944332867860794, "rewards/margins": -0.01678919792175293, "rewards/rejected": -0.00515513401478529, "step": 484 }, { "epoch": 0.07500483278561763, "grad_norm": 6.419384956359863, "learning_rate": 1.25e-06, "logits/chosen": 5.472255229949951, "logits/rejected": 0.6009783148765564, "logps/chosen": -323.40191650390625, "logps/rejected": -193.5068817138672, "loss": 0.723, "rewards/accuracies": 0.125, "rewards/chosen": -0.06473977863788605, "rewards/margins": -0.05760948732495308, "rewards/rejected": -0.007130288984626532, "step": 485 }, { "epoch": 0.07515948192538179, "grad_norm": 6.550868034362793, "learning_rate": 1.252577319587629e-06, "logits/chosen": 8.418712615966797, "logits/rejected": 3.8301689624786377, "logps/chosen": -363.01776123046875, "logps/rejected": -280.04931640625, "loss": 0.6518, "rewards/accuracies": 0.625, "rewards/chosen": 0.07135991752147675, "rewards/margins": 0.08949494361877441, "rewards/rejected": -0.01813502237200737, "step": 486 }, { "epoch": 0.07531413106514595, "grad_norm": 3.9579107761383057, "learning_rate": 1.2551546391752578e-06, "logits/chosen": 9.789752960205078, "logits/rejected": 8.411087989807129, "logps/chosen": -275.6781005859375, "logps/rejected": -144.12982177734375, "loss": 0.6806, "rewards/accuracies": 0.5, "rewards/chosen": 0.0124449972063303, "rewards/margins": 0.026842165738344193, "rewards/rejected": -0.014397167600691319, "step": 487 }, { "epoch": 0.07546878020491012, "grad_norm": 5.32539176940918, "learning_rate": 1.2577319587628867e-06, "logits/chosen": 11.784067153930664, "logits/rejected": 10.029605865478516, "logps/chosen": -335.7101135253906, "logps/rejected": -288.75372314453125, "loss": 0.7238, "rewards/accuracies": 0.5, "rewards/chosen": -0.08159056305885315, "rewards/margins": -0.055178213864564896, "rewards/rejected": -0.026412345468997955, "step": 488 }, { "epoch": 0.07562342934467427, "grad_norm": 5.820296287536621, "learning_rate": 1.2603092783505156e-06, "logits/chosen": 11.580249786376953, "logits/rejected": 6.299615859985352, "logps/chosen": -277.8828125, "logps/rejected": -212.2325439453125, "loss": 0.717, "rewards/accuracies": 0.5, "rewards/chosen": -0.057082418352365494, "rewards/margins": -0.041176747530698776, "rewards/rejected": -0.01590566709637642, "step": 489 }, { "epoch": 0.07577807848443843, "grad_norm": 5.272955894470215, "learning_rate": 1.2628865979381445e-06, "logits/chosen": 10.399672508239746, "logits/rejected": 5.922280788421631, "logps/chosen": -187.53143310546875, "logps/rejected": -154.47177124023438, "loss": 0.6927, "rewards/accuracies": 0.625, "rewards/chosen": -0.04404468834400177, "rewards/margins": 0.005482863634824753, "rewards/rejected": -0.04952755197882652, "step": 490 }, { "epoch": 0.07593272762420258, "grad_norm": 4.080560207366943, "learning_rate": 1.2654639175257733e-06, "logits/chosen": 9.95780086517334, "logits/rejected": 11.035721778869629, "logps/chosen": -161.07461547851562, "logps/rejected": -181.50311279296875, "loss": 0.7322, "rewards/accuracies": 0.375, "rewards/chosen": -0.023262454196810722, "rewards/margins": -0.07226769626140594, "rewards/rejected": 0.04900524765253067, "step": 491 }, { "epoch": 0.07608737676396675, "grad_norm": 4.837400436401367, "learning_rate": 1.2680412371134022e-06, "logits/chosen": 13.23857307434082, "logits/rejected": 14.329360961914062, "logps/chosen": -399.9242248535156, "logps/rejected": -387.9140625, "loss": 0.658, "rewards/accuracies": 0.625, "rewards/chosen": -0.044596292078495026, "rewards/margins": 0.07503624260425568, "rewards/rejected": -0.1196325272321701, "step": 492 }, { "epoch": 0.07624202590373091, "grad_norm": 7.8216118812561035, "learning_rate": 1.2706185567010309e-06, "logits/chosen": 8.370171546936035, "logits/rejected": 8.059374809265137, "logps/chosen": -409.1068115234375, "logps/rejected": -309.05181884765625, "loss": 0.6999, "rewards/accuracies": 0.375, "rewards/chosen": -0.0683976262807846, "rewards/margins": -0.007261945866048336, "rewards/rejected": -0.061135679483413696, "step": 493 }, { "epoch": 0.07639667504349507, "grad_norm": 5.2427496910095215, "learning_rate": 1.2731958762886597e-06, "logits/chosen": 6.7537031173706055, "logits/rejected": 10.230915069580078, "logps/chosen": -225.65072631835938, "logps/rejected": -272.75469970703125, "loss": 0.6956, "rewards/accuracies": 0.5, "rewards/chosen": 0.03593377768993378, "rewards/margins": 0.005887121893465519, "rewards/rejected": 0.030046656727790833, "step": 494 }, { "epoch": 0.07655132418325923, "grad_norm": 4.877942085266113, "learning_rate": 1.2757731958762886e-06, "logits/chosen": 18.830055236816406, "logits/rejected": 10.248369216918945, "logps/chosen": -288.5787658691406, "logps/rejected": -205.6837158203125, "loss": 0.7313, "rewards/accuracies": 0.375, "rewards/chosen": -0.047826193273067474, "rewards/margins": -0.07010393589735031, "rewards/rejected": 0.02227773703634739, "step": 495 }, { "epoch": 0.0767059733230234, "grad_norm": 4.733504772186279, "learning_rate": 1.2783505154639175e-06, "logits/chosen": 5.889375686645508, "logits/rejected": 2.9468088150024414, "logps/chosen": -275.24468994140625, "logps/rejected": -284.8700866699219, "loss": 0.6523, "rewards/accuracies": 0.75, "rewards/chosen": 0.07246246188879013, "rewards/margins": 0.08885151147842407, "rewards/rejected": -0.016389036551117897, "step": 496 }, { "epoch": 0.07686062246278755, "grad_norm": 4.67637825012207, "learning_rate": 1.2809278350515464e-06, "logits/chosen": 9.688321113586426, "logits/rejected": 3.4134669303894043, "logps/chosen": -266.0443115234375, "logps/rejected": -193.4385986328125, "loss": 0.7056, "rewards/accuracies": 0.5, "rewards/chosen": -0.08322501182556152, "rewards/margins": -0.021423693746328354, "rewards/rejected": -0.06180131435394287, "step": 497 }, { "epoch": 0.07701527160255171, "grad_norm": 5.164875030517578, "learning_rate": 1.2835051546391752e-06, "logits/chosen": 14.148039817810059, "logits/rejected": 13.427299499511719, "logps/chosen": -314.2421875, "logps/rejected": -299.50311279296875, "loss": 0.6856, "rewards/accuracies": 0.5, "rewards/chosen": -0.029363252222537994, "rewards/margins": 0.01818694919347763, "rewards/rejected": -0.047550201416015625, "step": 498 }, { "epoch": 0.07716992074231586, "grad_norm": 7.062736511230469, "learning_rate": 1.286082474226804e-06, "logits/chosen": 13.982134819030762, "logits/rejected": 7.812150478363037, "logps/chosen": -365.7272033691406, "logps/rejected": -315.32684326171875, "loss": 0.732, "rewards/accuracies": 0.375, "rewards/chosen": -0.06968193501234055, "rewards/margins": -0.07258491963148117, "rewards/rejected": 0.0029029827564954758, "step": 499 }, { "epoch": 0.07732456988208003, "grad_norm": 4.314591407775879, "learning_rate": 1.288659793814433e-06, "logits/chosen": 5.615242958068848, "logits/rejected": 11.519858360290527, "logps/chosen": -196.22201538085938, "logps/rejected": -251.9803009033203, "loss": 0.7353, "rewards/accuracies": 0.375, "rewards/chosen": -0.05603757128119469, "rewards/margins": -0.07838129997253418, "rewards/rejected": 0.022343730553984642, "step": 500 }, { "epoch": 0.0774792190218442, "grad_norm": 4.882893085479736, "learning_rate": 1.291237113402062e-06, "logits/chosen": 7.686586380004883, "logits/rejected": 6.984974384307861, "logps/chosen": -271.8753967285156, "logps/rejected": -204.36770629882812, "loss": 0.6561, "rewards/accuracies": 0.625, "rewards/chosen": 0.04684881865978241, "rewards/margins": 0.08257794380187988, "rewards/rejected": -0.035729121416807175, "step": 501 }, { "epoch": 0.07763386816160835, "grad_norm": 6.275918960571289, "learning_rate": 1.293814432989691e-06, "logits/chosen": 7.644626140594482, "logits/rejected": 3.437337875366211, "logps/chosen": -290.195068359375, "logps/rejected": -223.37295532226562, "loss": 0.7042, "rewards/accuracies": 0.375, "rewards/chosen": -0.059528157114982605, "rewards/margins": -0.018102407455444336, "rewards/rejected": -0.04142574965953827, "step": 502 }, { "epoch": 0.07778851730137251, "grad_norm": 3.9693758487701416, "learning_rate": 1.2963917525773198e-06, "logits/chosen": 8.159713745117188, "logits/rejected": 10.401387214660645, "logps/chosen": -174.96224975585938, "logps/rejected": -226.48019409179688, "loss": 0.6738, "rewards/accuracies": 0.5, "rewards/chosen": -0.062317825853824615, "rewards/margins": 0.041219066828489304, "rewards/rejected": -0.10353689640760422, "step": 503 }, { "epoch": 0.07794316644113668, "grad_norm": 4.535689353942871, "learning_rate": 1.2989690721649487e-06, "logits/chosen": 6.409902572631836, "logits/rejected": 12.520186424255371, "logps/chosen": -159.933349609375, "logps/rejected": -203.27352905273438, "loss": 0.7004, "rewards/accuracies": 0.5, "rewards/chosen": -0.05015776678919792, "rewards/margins": -0.012445949949324131, "rewards/rejected": -0.03771181032061577, "step": 504 }, { "epoch": 0.07809781558090083, "grad_norm": 5.357067108154297, "learning_rate": 1.3015463917525775e-06, "logits/chosen": 7.758027076721191, "logits/rejected": 12.18793773651123, "logps/chosen": -215.1466522216797, "logps/rejected": -241.67506408691406, "loss": 0.7558, "rewards/accuracies": 0.25, "rewards/chosen": -0.0855511724948883, "rewards/margins": -0.11551766842603683, "rewards/rejected": 0.02996649779379368, "step": 505 }, { "epoch": 0.078252464720665, "grad_norm": 5.963812828063965, "learning_rate": 1.3041237113402064e-06, "logits/chosen": 9.35734748840332, "logits/rejected": 8.196986198425293, "logps/chosen": -345.1124572753906, "logps/rejected": -295.1198425292969, "loss": 0.6454, "rewards/accuracies": 0.625, "rewards/chosen": 0.02703724056482315, "rewards/margins": 0.10452951490879059, "rewards/rejected": -0.07749228924512863, "step": 506 }, { "epoch": 0.07840711386042915, "grad_norm": 5.133436679840088, "learning_rate": 1.3067010309278353e-06, "logits/chosen": 11.193446159362793, "logits/rejected": 11.346586227416992, "logps/chosen": -319.44683837890625, "logps/rejected": -258.2880859375, "loss": 0.7167, "rewards/accuracies": 0.5, "rewards/chosen": 0.006875228136777878, "rewards/margins": -0.03874626010656357, "rewards/rejected": 0.045621491968631744, "step": 507 }, { "epoch": 0.07856176300019331, "grad_norm": 5.560775279998779, "learning_rate": 1.3092783505154642e-06, "logits/chosen": 8.42133903503418, "logits/rejected": 13.05972671508789, "logps/chosen": -256.1662292480469, "logps/rejected": -285.46466064453125, "loss": 0.6702, "rewards/accuracies": 0.625, "rewards/chosen": 0.009373044595122337, "rewards/margins": 0.05296625941991806, "rewards/rejected": -0.04359322041273117, "step": 508 }, { "epoch": 0.07871641213995748, "grad_norm": 5.174693584442139, "learning_rate": 1.311855670103093e-06, "logits/chosen": 14.411124229431152, "logits/rejected": 7.032424449920654, "logps/chosen": -344.90020751953125, "logps/rejected": -208.19241333007812, "loss": 0.6845, "rewards/accuracies": 0.625, "rewards/chosen": -0.027427388355135918, "rewards/margins": 0.022199105471372604, "rewards/rejected": -0.04962649196386337, "step": 509 }, { "epoch": 0.07887106127972163, "grad_norm": 6.354555606842041, "learning_rate": 1.314432989690722e-06, "logits/chosen": 10.666659355163574, "logits/rejected": 7.222296714782715, "logps/chosen": -426.1317138671875, "logps/rejected": -366.894775390625, "loss": 0.7071, "rewards/accuracies": 0.5, "rewards/chosen": -0.03403759002685547, "rewards/margins": -0.022094538435339928, "rewards/rejected": -0.01194305531680584, "step": 510 }, { "epoch": 0.07902571041948579, "grad_norm": 4.84645938873291, "learning_rate": 1.3170103092783506e-06, "logits/chosen": 11.212747573852539, "logits/rejected": 8.291570663452148, "logps/chosen": -244.2994384765625, "logps/rejected": -194.34815979003906, "loss": 0.6755, "rewards/accuracies": 0.375, "rewards/chosen": -0.004062940366566181, "rewards/margins": 0.04598658159375191, "rewards/rejected": -0.050049517303705215, "step": 511 }, { "epoch": 0.07918035955924996, "grad_norm": 5.589974403381348, "learning_rate": 1.3195876288659794e-06, "logits/chosen": 11.877053260803223, "logits/rejected": 6.465781211853027, "logps/chosen": -297.2283020019531, "logps/rejected": -201.77923583984375, "loss": 0.7144, "rewards/accuracies": 0.375, "rewards/chosen": -0.05955009162425995, "rewards/margins": -0.037588972598314285, "rewards/rejected": -0.021961115300655365, "step": 512 }, { "epoch": 0.07933500869901411, "grad_norm": 8.377959251403809, "learning_rate": 1.3221649484536083e-06, "logits/chosen": 9.283735275268555, "logits/rejected": 9.296442031860352, "logps/chosen": -477.893310546875, "logps/rejected": -347.45343017578125, "loss": 0.7207, "rewards/accuracies": 0.5, "rewards/chosen": -0.05752735584974289, "rewards/margins": -0.03290129452943802, "rewards/rejected": -0.02462606132030487, "step": 513 }, { "epoch": 0.07948965783877827, "grad_norm": 3.3914053440093994, "learning_rate": 1.3247422680412372e-06, "logits/chosen": 5.411382675170898, "logits/rejected": 6.597458839416504, "logps/chosen": -170.07208251953125, "logps/rejected": -188.1590576171875, "loss": 0.6755, "rewards/accuracies": 0.5, "rewards/chosen": -0.011275816708803177, "rewards/margins": 0.04031095653772354, "rewards/rejected": -0.05158677324652672, "step": 514 }, { "epoch": 0.07964430697854243, "grad_norm": 7.341994762420654, "learning_rate": 1.327319587628866e-06, "logits/chosen": 9.048376083374023, "logits/rejected": 7.71989631652832, "logps/chosen": -410.7899169921875, "logps/rejected": -339.4091796875, "loss": 0.734, "rewards/accuracies": 0.25, "rewards/chosen": -0.0873725414276123, "rewards/margins": -0.07666054368019104, "rewards/rejected": -0.010712003335356712, "step": 515 }, { "epoch": 0.07979895611830659, "grad_norm": 5.503435134887695, "learning_rate": 1.329896907216495e-06, "logits/chosen": 11.117693901062012, "logits/rejected": 10.440019607543945, "logps/chosen": -401.076416015625, "logps/rejected": -438.62042236328125, "loss": 0.732, "rewards/accuracies": 0.375, "rewards/chosen": -0.11154460906982422, "rewards/margins": -0.07412925362586975, "rewards/rejected": -0.037415362894535065, "step": 516 }, { "epoch": 0.07995360525807076, "grad_norm": 4.144252777099609, "learning_rate": 1.3324742268041238e-06, "logits/chosen": 12.842230796813965, "logits/rejected": 4.817409038543701, "logps/chosen": -260.1328430175781, "logps/rejected": -125.98538208007812, "loss": 0.7327, "rewards/accuracies": 0.375, "rewards/chosen": -0.06426072120666504, "rewards/margins": -0.0740605890750885, "rewards/rejected": 0.00979986134916544, "step": 517 }, { "epoch": 0.08010825439783491, "grad_norm": 4.73619270324707, "learning_rate": 1.3350515463917527e-06, "logits/chosen": 11.06511116027832, "logits/rejected": 6.746665954589844, "logps/chosen": -269.87432861328125, "logps/rejected": -219.50643920898438, "loss": 0.6768, "rewards/accuracies": 0.375, "rewards/chosen": -0.02751307561993599, "rewards/margins": 0.037844181060791016, "rewards/rejected": -0.0653572604060173, "step": 518 }, { "epoch": 0.08026290353759907, "grad_norm": 4.447077751159668, "learning_rate": 1.3376288659793815e-06, "logits/chosen": 6.431484222412109, "logits/rejected": 9.76042366027832, "logps/chosen": -227.97085571289062, "logps/rejected": -296.36651611328125, "loss": 0.6371, "rewards/accuracies": 1.0, "rewards/chosen": 0.026898100972175598, "rewards/margins": 0.11998605728149414, "rewards/rejected": -0.09308796375989914, "step": 519 }, { "epoch": 0.08041755267736324, "grad_norm": 5.414590835571289, "learning_rate": 1.3402061855670104e-06, "logits/chosen": 10.79727554321289, "logits/rejected": 3.2325119972229004, "logps/chosen": -248.14511108398438, "logps/rejected": -175.93927001953125, "loss": 0.6965, "rewards/accuracies": 0.25, "rewards/chosen": -0.04287712648510933, "rewards/margins": -0.0063204774633049965, "rewards/rejected": -0.03655664995312691, "step": 520 }, { "epoch": 0.08057220181712739, "grad_norm": 5.638577938079834, "learning_rate": 1.3427835051546393e-06, "logits/chosen": 10.725425720214844, "logits/rejected": 3.684582233428955, "logps/chosen": -338.5487365722656, "logps/rejected": -221.99310302734375, "loss": 0.6994, "rewards/accuracies": 0.625, "rewards/chosen": 0.019631575793027878, "rewards/margins": -0.007610607892274857, "rewards/rejected": 0.027242185547947884, "step": 521 }, { "epoch": 0.08072685095689155, "grad_norm": 5.674072265625, "learning_rate": 1.3453608247422681e-06, "logits/chosen": 9.33547592163086, "logits/rejected": 15.28099250793457, "logps/chosen": -229.82672119140625, "logps/rejected": -376.06597900390625, "loss": 0.653, "rewards/accuracies": 0.75, "rewards/chosen": -0.02353219874203205, "rewards/margins": 0.09013090282678604, "rewards/rejected": -0.11366310715675354, "step": 522 }, { "epoch": 0.0808815000966557, "grad_norm": 5.811645984649658, "learning_rate": 1.347938144329897e-06, "logits/chosen": 12.050795555114746, "logits/rejected": 5.4996337890625, "logps/chosen": -403.49053955078125, "logps/rejected": -293.321533203125, "loss": 0.7094, "rewards/accuracies": 0.5, "rewards/chosen": -0.061141159385442734, "rewards/margins": -0.0239457655698061, "rewards/rejected": -0.03719539940357208, "step": 523 }, { "epoch": 0.08103614923641987, "grad_norm": 6.312408447265625, "learning_rate": 1.3505154639175259e-06, "logits/chosen": 10.700739860534668, "logits/rejected": 1.81795334815979, "logps/chosen": -483.1976623535156, "logps/rejected": -274.3953857421875, "loss": 0.6914, "rewards/accuracies": 0.375, "rewards/chosen": -0.024495694786310196, "rewards/margins": 0.007960964925587177, "rewards/rejected": -0.0324566587805748, "step": 524 }, { "epoch": 0.08119079837618404, "grad_norm": 4.696097373962402, "learning_rate": 1.3530927835051548e-06, "logits/chosen": 8.211674690246582, "logits/rejected": 1.1401091814041138, "logps/chosen": -264.7847595214844, "logps/rejected": -209.84481811523438, "loss": 0.7006, "rewards/accuracies": 0.375, "rewards/chosen": -0.044145919382572174, "rewards/margins": -0.00855002086609602, "rewards/rejected": -0.03559589385986328, "step": 525 }, { "epoch": 0.08134544751594819, "grad_norm": 5.374699592590332, "learning_rate": 1.3556701030927834e-06, "logits/chosen": 8.196305274963379, "logits/rejected": 11.697808265686035, "logps/chosen": -214.82339477539062, "logps/rejected": -317.4474182128906, "loss": 0.7165, "rewards/accuracies": 0.125, "rewards/chosen": -0.02326676994562149, "rewards/margins": -0.040444038808345795, "rewards/rejected": 0.017177274450659752, "step": 526 }, { "epoch": 0.08150009665571235, "grad_norm": 4.186614990234375, "learning_rate": 1.3582474226804123e-06, "logits/chosen": 7.532651424407959, "logits/rejected": 8.822759628295898, "logps/chosen": -210.77732849121094, "logps/rejected": -226.58636474609375, "loss": 0.6827, "rewards/accuracies": 0.75, "rewards/chosen": 0.026653384789824486, "rewards/margins": 0.024219894781708717, "rewards/rejected": 0.002433490939438343, "step": 527 }, { "epoch": 0.08165474579547652, "grad_norm": 3.884063482284546, "learning_rate": 1.3608247422680412e-06, "logits/chosen": 12.936111450195312, "logits/rejected": 8.74923324584961, "logps/chosen": -274.812744140625, "logps/rejected": -207.52850341796875, "loss": 0.6762, "rewards/accuracies": 0.5, "rewards/chosen": -0.018146181479096413, "rewards/margins": 0.04085822403430939, "rewards/rejected": -0.05900440737605095, "step": 528 }, { "epoch": 0.08180939493524067, "grad_norm": 4.786996841430664, "learning_rate": 1.36340206185567e-06, "logits/chosen": 12.112577438354492, "logits/rejected": 10.60428237915039, "logps/chosen": -309.19598388671875, "logps/rejected": -299.437744140625, "loss": 0.7045, "rewards/accuracies": 0.375, "rewards/chosen": -0.0023768888786435127, "rewards/margins": -0.009706975892186165, "rewards/rejected": 0.007330084219574928, "step": 529 }, { "epoch": 0.08196404407500484, "grad_norm": 6.0366973876953125, "learning_rate": 1.365979381443299e-06, "logits/chosen": 11.771197319030762, "logits/rejected": 8.55543041229248, "logps/chosen": -305.8131103515625, "logps/rejected": -220.6937713623047, "loss": 0.7013, "rewards/accuracies": 0.625, "rewards/chosen": -0.018656635656952858, "rewards/margins": -0.014781379140913486, "rewards/rejected": -0.003875256050378084, "step": 530 }, { "epoch": 0.08211869321476899, "grad_norm": 3.151768207550049, "learning_rate": 1.368556701030928e-06, "logits/chosen": 6.802198886871338, "logits/rejected": 9.858036041259766, "logps/chosen": -151.63719177246094, "logps/rejected": -151.3238067626953, "loss": 0.6807, "rewards/accuracies": 0.5, "rewards/chosen": -0.006170701235532761, "rewards/margins": 0.028571534901857376, "rewards/rejected": -0.03474223613739014, "step": 531 }, { "epoch": 0.08227334235453315, "grad_norm": 5.344779968261719, "learning_rate": 1.3711340206185569e-06, "logits/chosen": 11.66389274597168, "logits/rejected": 9.489412307739258, "logps/chosen": -285.146484375, "logps/rejected": -234.99050903320312, "loss": 0.715, "rewards/accuracies": 0.375, "rewards/chosen": -0.07612819969654083, "rewards/margins": -0.039783380925655365, "rewards/rejected": -0.03634481504559517, "step": 532 }, { "epoch": 0.08242799149429732, "grad_norm": 5.25248908996582, "learning_rate": 1.3737113402061857e-06, "logits/chosen": 10.95274829864502, "logits/rejected": 8.533432960510254, "logps/chosen": -344.74951171875, "logps/rejected": -303.6307067871094, "loss": 0.703, "rewards/accuracies": 0.5, "rewards/chosen": -0.036418817937374115, "rewards/margins": -0.016526460647583008, "rewards/rejected": -0.019892359152436256, "step": 533 }, { "epoch": 0.08258264063406147, "grad_norm": 6.047691822052002, "learning_rate": 1.3762886597938146e-06, "logits/chosen": 13.251497268676758, "logits/rejected": 6.974300861358643, "logps/chosen": -297.87652587890625, "logps/rejected": -203.72567749023438, "loss": 0.713, "rewards/accuracies": 0.375, "rewards/chosen": -0.060970306396484375, "rewards/margins": -0.03670930862426758, "rewards/rejected": -0.024260997772216797, "step": 534 }, { "epoch": 0.08273728977382563, "grad_norm": 6.274545192718506, "learning_rate": 1.3788659793814435e-06, "logits/chosen": 9.905800819396973, "logits/rejected": 11.10840129852295, "logps/chosen": -370.43463134765625, "logps/rejected": -324.4618225097656, "loss": 0.7433, "rewards/accuracies": 0.0, "rewards/chosen": -0.08701552450656891, "rewards/margins": -0.09692764282226562, "rewards/rejected": 0.00991210900247097, "step": 535 }, { "epoch": 0.0828919389135898, "grad_norm": 6.441309452056885, "learning_rate": 1.3814432989690724e-06, "logits/chosen": 13.563328742980957, "logits/rejected": 9.363935470581055, "logps/chosen": -342.4273986816406, "logps/rejected": -271.4912414550781, "loss": 0.7163, "rewards/accuracies": 0.375, "rewards/chosen": -0.01731271669268608, "rewards/margins": -0.03972053527832031, "rewards/rejected": 0.022407814860343933, "step": 536 }, { "epoch": 0.08304658805335395, "grad_norm": 5.751824378967285, "learning_rate": 1.3840206185567012e-06, "logits/chosen": 4.694397926330566, "logits/rejected": 9.730031967163086, "logps/chosen": -216.31333923339844, "logps/rejected": -267.26898193359375, "loss": 0.7978, "rewards/accuracies": 0.125, "rewards/chosen": -0.18462657928466797, "rewards/margins": -0.18465977907180786, "rewards/rejected": 3.318674862384796e-05, "step": 537 }, { "epoch": 0.08320123719311812, "grad_norm": 3.7181038856506348, "learning_rate": 1.38659793814433e-06, "logits/chosen": 10.303196907043457, "logits/rejected": 7.523611068725586, "logps/chosen": -217.4943389892578, "logps/rejected": -219.6516571044922, "loss": 0.6568, "rewards/accuracies": 0.625, "rewards/chosen": -0.02828398160636425, "rewards/margins": 0.07878727465867996, "rewards/rejected": -0.10707125812768936, "step": 538 }, { "epoch": 0.08335588633288227, "grad_norm": 5.212622165679932, "learning_rate": 1.389175257731959e-06, "logits/chosen": 13.040685653686523, "logits/rejected": 11.43275260925293, "logps/chosen": -367.2395324707031, "logps/rejected": -331.2080078125, "loss": 0.6972, "rewards/accuracies": 0.5, "rewards/chosen": -0.027867890894412994, "rewards/margins": -0.0011987686157226562, "rewards/rejected": -0.02666911855340004, "step": 539 }, { "epoch": 0.08351053547264643, "grad_norm": 5.434782981872559, "learning_rate": 1.3917525773195878e-06, "logits/chosen": 15.580764770507812, "logits/rejected": 8.096982955932617, "logps/chosen": -358.6407165527344, "logps/rejected": -276.34918212890625, "loss": 0.7091, "rewards/accuracies": 0.375, "rewards/chosen": -0.055728793144226074, "rewards/margins": -0.021243112161755562, "rewards/rejected": -0.034485675394535065, "step": 540 }, { "epoch": 0.0836651846124106, "grad_norm": 4.162454605102539, "learning_rate": 1.3943298969072167e-06, "logits/chosen": 11.824553489685059, "logits/rejected": 8.227606773376465, "logps/chosen": -147.75181579589844, "logps/rejected": -146.95799255371094, "loss": 0.6799, "rewards/accuracies": 0.5, "rewards/chosen": -0.008574152365326881, "rewards/margins": 0.030722906813025475, "rewards/rejected": -0.039297059178352356, "step": 541 }, { "epoch": 0.08381983375217475, "grad_norm": 4.5789384841918945, "learning_rate": 1.3969072164948456e-06, "logits/chosen": 11.601738929748535, "logits/rejected": 5.445502758026123, "logps/chosen": -345.2118225097656, "logps/rejected": -195.30538940429688, "loss": 0.7202, "rewards/accuracies": 0.625, "rewards/chosen": -0.04216070473194122, "rewards/margins": -0.04864849895238876, "rewards/rejected": 0.006487798877060413, "step": 542 }, { "epoch": 0.08397448289193891, "grad_norm": 5.624594688415527, "learning_rate": 1.3994845360824745e-06, "logits/chosen": 5.455178737640381, "logits/rejected": 7.166283130645752, "logps/chosen": -264.54278564453125, "logps/rejected": -264.7269592285156, "loss": 0.7131, "rewards/accuracies": 0.25, "rewards/chosen": -0.02523994818329811, "rewards/margins": -0.03223336115479469, "rewards/rejected": 0.006993414834141731, "step": 543 }, { "epoch": 0.08412913203170308, "grad_norm": 16.302080154418945, "learning_rate": 1.4020618556701031e-06, "logits/chosen": 10.338399887084961, "logits/rejected": 8.731912612915039, "logps/chosen": -262.99542236328125, "logps/rejected": -217.37112426757812, "loss": 0.6435, "rewards/accuracies": 0.875, "rewards/chosen": 0.017029715701937675, "rewards/margins": 0.10734157264232635, "rewards/rejected": -0.09031186252832413, "step": 544 }, { "epoch": 0.08428378117146723, "grad_norm": 5.504406929016113, "learning_rate": 1.404639175257732e-06, "logits/chosen": 4.236848831176758, "logits/rejected": 1.2474476099014282, "logps/chosen": -275.2344055175781, "logps/rejected": -255.11891174316406, "loss": 0.7035, "rewards/accuracies": 0.5, "rewards/chosen": -0.011736463755369186, "rewards/margins": -0.013480950146913528, "rewards/rejected": 0.0017444845288991928, "step": 545 }, { "epoch": 0.0844384303112314, "grad_norm": 3.7987728118896484, "learning_rate": 1.4072164948453609e-06, "logits/chosen": 10.348388671875, "logits/rejected": 1.9263609647750854, "logps/chosen": -193.91937255859375, "logps/rejected": -115.6342544555664, "loss": 0.6972, "rewards/accuracies": 0.25, "rewards/chosen": -0.03842787817120552, "rewards/margins": -0.007029199041426182, "rewards/rejected": -0.03139868006110191, "step": 546 }, { "epoch": 0.08459307945099555, "grad_norm": 4.090860843658447, "learning_rate": 1.4097938144329897e-06, "logits/chosen": 10.532543182373047, "logits/rejected": 5.598051071166992, "logps/chosen": -320.3072204589844, "logps/rejected": -237.58522033691406, "loss": 0.7144, "rewards/accuracies": 0.125, "rewards/chosen": -0.05214262008666992, "rewards/margins": -0.03299293294548988, "rewards/rejected": -0.01914968341588974, "step": 547 }, { "epoch": 0.08474772859075971, "grad_norm": 6.043724536895752, "learning_rate": 1.4123711340206186e-06, "logits/chosen": 13.783498764038086, "logits/rejected": 11.574289321899414, "logps/chosen": -316.70062255859375, "logps/rejected": -304.0550537109375, "loss": 0.7216, "rewards/accuracies": 0.375, "rewards/chosen": -0.07665050029754639, "rewards/margins": -0.05147576332092285, "rewards/rejected": -0.025174735113978386, "step": 548 }, { "epoch": 0.08490237773052388, "grad_norm": 4.6276044845581055, "learning_rate": 1.4149484536082475e-06, "logits/chosen": 9.441754341125488, "logits/rejected": 9.189139366149902, "logps/chosen": -269.8464050292969, "logps/rejected": -245.50872802734375, "loss": 0.6757, "rewards/accuracies": 0.625, "rewards/chosen": -0.058960575610399246, "rewards/margins": 0.04430823773145676, "rewards/rejected": -0.1032688170671463, "step": 549 }, { "epoch": 0.08505702687028803, "grad_norm": 5.975076675415039, "learning_rate": 1.4175257731958764e-06, "logits/chosen": 9.283742904663086, "logits/rejected": 2.3773369789123535, "logps/chosen": -307.9149169921875, "logps/rejected": -230.32321166992188, "loss": 0.6745, "rewards/accuracies": 0.75, "rewards/chosen": -0.041390422731637955, "rewards/margins": 0.043630022555589676, "rewards/rejected": -0.08502044528722763, "step": 550 }, { "epoch": 0.0852116760100522, "grad_norm": 4.116117477416992, "learning_rate": 1.4201030927835052e-06, "logits/chosen": 8.857202529907227, "logits/rejected": 4.9324140548706055, "logps/chosen": -160.62957763671875, "logps/rejected": -131.80567932128906, "loss": 0.6457, "rewards/accuracies": 0.75, "rewards/chosen": 0.04518408700823784, "rewards/margins": 0.09874320030212402, "rewards/rejected": -0.053559109568595886, "step": 551 }, { "epoch": 0.08536632514981636, "grad_norm": 5.2435302734375, "learning_rate": 1.422680412371134e-06, "logits/chosen": 13.257979393005371, "logits/rejected": 12.709647178649902, "logps/chosen": -226.0782928466797, "logps/rejected": -280.5997009277344, "loss": 0.6833, "rewards/accuracies": 0.625, "rewards/chosen": -0.03819599002599716, "rewards/margins": 0.020241308957338333, "rewards/rejected": -0.05843730270862579, "step": 552 }, { "epoch": 0.08552097428958051, "grad_norm": 9.359949111938477, "learning_rate": 1.425257731958763e-06, "logits/chosen": 14.066431045532227, "logits/rejected": 9.620611190795898, "logps/chosen": -301.92462158203125, "logps/rejected": -208.7350311279297, "loss": 0.6852, "rewards/accuracies": 0.75, "rewards/chosen": -0.02114868350327015, "rewards/margins": 0.01965189166367054, "rewards/rejected": -0.04080057144165039, "step": 553 }, { "epoch": 0.08567562342934468, "grad_norm": 5.54028844833374, "learning_rate": 1.4278350515463918e-06, "logits/chosen": 13.469223976135254, "logits/rejected": 13.597623825073242, "logps/chosen": -279.43377685546875, "logps/rejected": -301.3321533203125, "loss": 0.7009, "rewards/accuracies": 0.75, "rewards/chosen": -0.07416782528162003, "rewards/margins": -0.010303354822099209, "rewards/rejected": -0.06386446952819824, "step": 554 }, { "epoch": 0.08583027256910883, "grad_norm": 5.318565368652344, "learning_rate": 1.4304123711340207e-06, "logits/chosen": 3.1903958320617676, "logits/rejected": 12.17082405090332, "logps/chosen": -221.4241943359375, "logps/rejected": -349.0831298828125, "loss": 0.682, "rewards/accuracies": 0.25, "rewards/chosen": -0.01637106016278267, "rewards/margins": 0.034010179340839386, "rewards/rejected": -0.05038123577833176, "step": 555 }, { "epoch": 0.085984921708873, "grad_norm": 4.948302745819092, "learning_rate": 1.4329896907216496e-06, "logits/chosen": 6.0074872970581055, "logits/rejected": 5.867912292480469, "logps/chosen": -257.4555969238281, "logps/rejected": -264.7852783203125, "loss": 0.7198, "rewards/accuracies": 0.25, "rewards/chosen": -0.010210896842181683, "rewards/margins": -0.04925103485584259, "rewards/rejected": 0.03904014080762863, "step": 556 }, { "epoch": 0.08613957084863716, "grad_norm": 5.331164360046387, "learning_rate": 1.4355670103092785e-06, "logits/chosen": 7.234327793121338, "logits/rejected": 4.675284385681152, "logps/chosen": -262.0631408691406, "logps/rejected": -226.23211669921875, "loss": 0.7169, "rewards/accuracies": 0.625, "rewards/chosen": -0.03545413166284561, "rewards/margins": -0.041504621505737305, "rewards/rejected": 0.006050491239875555, "step": 557 }, { "epoch": 0.08629421998840131, "grad_norm": 5.373672962188721, "learning_rate": 1.4381443298969073e-06, "logits/chosen": 3.5728745460510254, "logits/rejected": 3.163386106491089, "logps/chosen": -229.23838806152344, "logps/rejected": -206.3196563720703, "loss": 0.6944, "rewards/accuracies": 0.5, "rewards/chosen": 0.007729053497314453, "rewards/margins": 0.0011762618087232113, "rewards/rejected": 0.006552794016897678, "step": 558 }, { "epoch": 0.08644886912816548, "grad_norm": 4.6300506591796875, "learning_rate": 1.440721649484536e-06, "logits/chosen": 13.43500804901123, "logits/rejected": 8.804279327392578, "logps/chosen": -281.1897277832031, "logps/rejected": -220.85140991210938, "loss": 0.6911, "rewards/accuracies": 0.5, "rewards/chosen": -0.05231967195868492, "rewards/margins": 0.00836491584777832, "rewards/rejected": -0.06068458408117294, "step": 559 }, { "epoch": 0.08660351826792964, "grad_norm": 3.967717409133911, "learning_rate": 1.4432989690721649e-06, "logits/chosen": 13.546951293945312, "logits/rejected": 12.877527236938477, "logps/chosen": -261.8711242675781, "logps/rejected": -196.81423950195312, "loss": 0.6834, "rewards/accuracies": 0.75, "rewards/chosen": 0.020794298499822617, "rewards/margins": 0.023573974147439003, "rewards/rejected": -0.002779675181955099, "step": 560 }, { "epoch": 0.08675816740769379, "grad_norm": 4.789272785186768, "learning_rate": 1.4458762886597942e-06, "logits/chosen": 12.408659934997559, "logits/rejected": 7.128690719604492, "logps/chosen": -334.94708251953125, "logps/rejected": -242.84063720703125, "loss": 0.7051, "rewards/accuracies": 0.5, "rewards/chosen": -0.06501893699169159, "rewards/margins": -0.019919585436582565, "rewards/rejected": -0.04509935528039932, "step": 561 }, { "epoch": 0.08691281654745796, "grad_norm": 4.86594295501709, "learning_rate": 1.448453608247423e-06, "logits/chosen": 15.593953132629395, "logits/rejected": 10.727025985717773, "logps/chosen": -242.47654724121094, "logps/rejected": -193.03759765625, "loss": 0.6682, "rewards/accuracies": 0.75, "rewards/chosen": 0.04373032972216606, "rewards/margins": 0.054003361612558365, "rewards/rejected": -0.01027302723377943, "step": 562 }, { "epoch": 0.08706746568722211, "grad_norm": 3.5607645511627197, "learning_rate": 1.4510309278350517e-06, "logits/chosen": 9.887504577636719, "logits/rejected": 6.421091079711914, "logps/chosen": -208.6868438720703, "logps/rejected": -188.00584411621094, "loss": 0.6621, "rewards/accuracies": 0.625, "rewards/chosen": -0.011667155660688877, "rewards/margins": 0.06540470570325851, "rewards/rejected": -0.07707186043262482, "step": 563 }, { "epoch": 0.08722211482698627, "grad_norm": 6.123905181884766, "learning_rate": 1.4536082474226806e-06, "logits/chosen": 4.700023651123047, "logits/rejected": 10.194036483764648, "logps/chosen": -284.0270690917969, "logps/rejected": -314.90087890625, "loss": 0.791, "rewards/accuracies": 0.375, "rewards/chosen": -0.12916384637355804, "rewards/margins": -0.1683189868927002, "rewards/rejected": 0.039155151695013046, "step": 564 }, { "epoch": 0.08737676396675044, "grad_norm": 7.623630046844482, "learning_rate": 1.4561855670103094e-06, "logits/chosen": 10.971306800842285, "logits/rejected": 11.86168098449707, "logps/chosen": -307.81231689453125, "logps/rejected": -314.9770812988281, "loss": 0.7005, "rewards/accuracies": 0.375, "rewards/chosen": -0.018035219982266426, "rewards/margins": -0.013626815751194954, "rewards/rejected": -0.004408406559377909, "step": 565 }, { "epoch": 0.08753141310651459, "grad_norm": 4.394433975219727, "learning_rate": 1.4587628865979383e-06, "logits/chosen": 13.23172378540039, "logits/rejected": 12.297246932983398, "logps/chosen": -322.37738037109375, "logps/rejected": -309.84039306640625, "loss": 0.6609, "rewards/accuracies": 0.625, "rewards/chosen": 0.01140308566391468, "rewards/margins": 0.07004957646131516, "rewards/rejected": -0.058646488934755325, "step": 566 }, { "epoch": 0.08768606224627876, "grad_norm": 4.123403072357178, "learning_rate": 1.4613402061855672e-06, "logits/chosen": 8.478995323181152, "logits/rejected": 5.38689661026001, "logps/chosen": -178.5983123779297, "logps/rejected": -173.1959228515625, "loss": 0.6682, "rewards/accuracies": 0.5, "rewards/chosen": -0.020140552893280983, "rewards/margins": 0.05587553605437279, "rewards/rejected": -0.07601609081029892, "step": 567 }, { "epoch": 0.08784071138604292, "grad_norm": 4.850125312805176, "learning_rate": 1.463917525773196e-06, "logits/chosen": 5.15814733505249, "logits/rejected": 6.131396293640137, "logps/chosen": -235.00698852539062, "logps/rejected": -256.5286865234375, "loss": 0.6906, "rewards/accuracies": 0.5, "rewards/chosen": -0.039864830672740936, "rewards/margins": 0.00997433066368103, "rewards/rejected": -0.04983916133642197, "step": 568 }, { "epoch": 0.08799536052580707, "grad_norm": 5.217621803283691, "learning_rate": 1.466494845360825e-06, "logits/chosen": 12.082124710083008, "logits/rejected": 9.792362213134766, "logps/chosen": -338.691162109375, "logps/rejected": -304.8896179199219, "loss": 0.7155, "rewards/accuracies": 0.375, "rewards/chosen": -0.10010968148708344, "rewards/margins": -0.03405303880572319, "rewards/rejected": -0.06605663895606995, "step": 569 }, { "epoch": 0.08815000966557124, "grad_norm": 9.45602798461914, "learning_rate": 1.4690721649484538e-06, "logits/chosen": 6.290523052215576, "logits/rejected": 7.201108455657959, "logps/chosen": -331.61712646484375, "logps/rejected": -379.7377624511719, "loss": 0.6739, "rewards/accuracies": 0.75, "rewards/chosen": -0.003652568906545639, "rewards/margins": 0.049550626426935196, "rewards/rejected": -0.053203195333480835, "step": 570 }, { "epoch": 0.08830465880533539, "grad_norm": 6.101128101348877, "learning_rate": 1.4716494845360827e-06, "logits/chosen": 6.905837535858154, "logits/rejected": 3.919721841812134, "logps/chosen": -336.28363037109375, "logps/rejected": -318.51483154296875, "loss": 0.7239, "rewards/accuracies": 0.5, "rewards/chosen": -0.0894021987915039, "rewards/margins": -0.05167561024427414, "rewards/rejected": -0.037726595997810364, "step": 571 }, { "epoch": 0.08845930794509956, "grad_norm": 3.39512038230896, "learning_rate": 1.4742268041237115e-06, "logits/chosen": 9.016313552856445, "logits/rejected": 7.709306716918945, "logps/chosen": -157.89161682128906, "logps/rejected": -154.44366455078125, "loss": 0.6827, "rewards/accuracies": 0.5, "rewards/chosen": 0.014220332726836205, "rewards/margins": 0.02199702337384224, "rewards/rejected": -0.007776690647006035, "step": 572 }, { "epoch": 0.08861395708486372, "grad_norm": 4.189332962036133, "learning_rate": 1.4768041237113404e-06, "logits/chosen": 11.938968658447266, "logits/rejected": 10.110847473144531, "logps/chosen": -209.30987548828125, "logps/rejected": -176.3541259765625, "loss": 0.6913, "rewards/accuracies": 0.75, "rewards/chosen": -0.04447899013757706, "rewards/margins": 0.005109596997499466, "rewards/rejected": -0.049588583409786224, "step": 573 }, { "epoch": 0.08876860622462787, "grad_norm": 5.407228469848633, "learning_rate": 1.4793814432989693e-06, "logits/chosen": 10.162530899047852, "logits/rejected": 9.525054931640625, "logps/chosen": -213.1966552734375, "logps/rejected": -269.3774719238281, "loss": 0.667, "rewards/accuracies": 0.75, "rewards/chosen": -0.03099069744348526, "rewards/margins": 0.05701952055096626, "rewards/rejected": -0.08801022171974182, "step": 574 }, { "epoch": 0.08892325536439204, "grad_norm": 4.868646144866943, "learning_rate": 1.4819587628865981e-06, "logits/chosen": 6.9227375984191895, "logits/rejected": 5.648509979248047, "logps/chosen": -292.35418701171875, "logps/rejected": -233.95372009277344, "loss": 0.726, "rewards/accuracies": 0.375, "rewards/chosen": -0.12666283547878265, "rewards/margins": -0.049906060099601746, "rewards/rejected": -0.07675676792860031, "step": 575 }, { "epoch": 0.0890779045041562, "grad_norm": 3.63031268119812, "learning_rate": 1.484536082474227e-06, "logits/chosen": 13.012777328491211, "logits/rejected": 8.049663543701172, "logps/chosen": -146.19189453125, "logps/rejected": -133.70364379882812, "loss": 0.698, "rewards/accuracies": 0.625, "rewards/chosen": -0.0466887503862381, "rewards/margins": -0.0050859469920396805, "rewards/rejected": -0.04160280153155327, "step": 576 }, { "epoch": 0.08923255364392035, "grad_norm": 7.05629825592041, "learning_rate": 1.4871134020618557e-06, "logits/chosen": 9.393556594848633, "logits/rejected": 9.363550186157227, "logps/chosen": -175.64712524414062, "logps/rejected": -230.2144317626953, "loss": 0.7652, "rewards/accuracies": 0.5, "rewards/chosen": -0.06522531062364578, "rewards/margins": -0.11029025912284851, "rewards/rejected": 0.04506495222449303, "step": 577 }, { "epoch": 0.08938720278368452, "grad_norm": 6.8438591957092285, "learning_rate": 1.4896907216494846e-06, "logits/chosen": 7.139414310455322, "logits/rejected": 5.4244914054870605, "logps/chosen": -305.4336853027344, "logps/rejected": -273.1817932128906, "loss": 0.7222, "rewards/accuracies": 0.5, "rewards/chosen": -0.032219935208559036, "rewards/margins": -0.05352919548749924, "rewards/rejected": 0.021309256553649902, "step": 578 }, { "epoch": 0.08954185192344867, "grad_norm": 5.016536712646484, "learning_rate": 1.4922680412371134e-06, "logits/chosen": 12.598133087158203, "logits/rejected": 8.138943672180176, "logps/chosen": -274.45623779296875, "logps/rejected": -223.34591674804688, "loss": 0.6702, "rewards/accuracies": 0.75, "rewards/chosen": -0.008155249990522861, "rewards/margins": 0.05107593908905983, "rewards/rejected": -0.05923118814826012, "step": 579 }, { "epoch": 0.08969650106321284, "grad_norm": 5.520704746246338, "learning_rate": 1.4948453608247423e-06, "logits/chosen": 9.072236061096191, "logits/rejected": 11.72704029083252, "logps/chosen": -310.29144287109375, "logps/rejected": -319.53228759765625, "loss": 0.7474, "rewards/accuracies": 0.5, "rewards/chosen": -0.11069688946008682, "rewards/margins": -0.09896030277013779, "rewards/rejected": -0.011736582964658737, "step": 580 }, { "epoch": 0.089851150202977, "grad_norm": 4.271003246307373, "learning_rate": 1.4974226804123712e-06, "logits/chosen": 12.873431205749512, "logits/rejected": 11.894736289978027, "logps/chosen": -254.03485107421875, "logps/rejected": -251.77838134765625, "loss": 0.697, "rewards/accuracies": 0.375, "rewards/chosen": -0.030272291973233223, "rewards/margins": -0.002185058780014515, "rewards/rejected": -0.028087232261896133, "step": 581 }, { "epoch": 0.09000579934274115, "grad_norm": 4.902781963348389, "learning_rate": 1.5e-06, "logits/chosen": 9.885713577270508, "logits/rejected": 8.76339054107666, "logps/chosen": -377.46044921875, "logps/rejected": -339.8441162109375, "loss": 0.6819, "rewards/accuracies": 0.5, "rewards/chosen": 0.033589839935302734, "rewards/margins": 0.02693825401365757, "rewards/rejected": 0.006651591509580612, "step": 582 }, { "epoch": 0.09016044848250532, "grad_norm": 7.76738977432251, "learning_rate": 1.502577319587629e-06, "logits/chosen": 8.139245986938477, "logits/rejected": -1.6834871768951416, "logps/chosen": -399.81158447265625, "logps/rejected": -163.51123046875, "loss": 0.6969, "rewards/accuracies": 0.5, "rewards/chosen": 0.0031184665858745575, "rewards/margins": 1.3081356883049011e-05, "rewards/rejected": 0.003105376847088337, "step": 583 }, { "epoch": 0.09031509762226948, "grad_norm": 5.397767066955566, "learning_rate": 1.5051546391752578e-06, "logits/chosen": 7.014987945556641, "logits/rejected": 9.233689308166504, "logps/chosen": -257.14874267578125, "logps/rejected": -223.935546875, "loss": 0.735, "rewards/accuracies": 0.375, "rewards/chosen": 0.012377787381410599, "rewards/margins": -0.0755234807729721, "rewards/rejected": 0.08790126442909241, "step": 584 }, { "epoch": 0.09046974676203363, "grad_norm": 6.752077579498291, "learning_rate": 1.5077319587628867e-06, "logits/chosen": 8.48318862915039, "logits/rejected": 8.355714797973633, "logps/chosen": -221.28858947753906, "logps/rejected": -293.10003662109375, "loss": 0.6709, "rewards/accuracies": 0.625, "rewards/chosen": 0.016498947516083717, "rewards/margins": 0.05205293744802475, "rewards/rejected": -0.035553980618715286, "step": 585 }, { "epoch": 0.0906243959017978, "grad_norm": 5.544448375701904, "learning_rate": 1.5103092783505155e-06, "logits/chosen": 11.244234085083008, "logits/rejected": 8.31943130493164, "logps/chosen": -368.56195068359375, "logps/rejected": -313.887451171875, "loss": 0.6863, "rewards/accuracies": 0.5, "rewards/chosen": -0.03733672946691513, "rewards/margins": 0.0167783722281456, "rewards/rejected": -0.05411510542035103, "step": 586 }, { "epoch": 0.09077904504156195, "grad_norm": 6.980535507202148, "learning_rate": 1.5128865979381444e-06, "logits/chosen": 12.466981887817383, "logits/rejected": 10.62402057647705, "logps/chosen": -485.16552734375, "logps/rejected": -321.82177734375, "loss": 0.6935, "rewards/accuracies": 0.75, "rewards/chosen": 0.02911529503762722, "rewards/margins": 0.006746768951416016, "rewards/rejected": 0.022368527948856354, "step": 587 }, { "epoch": 0.09093369418132612, "grad_norm": 5.6725945472717285, "learning_rate": 1.5154639175257733e-06, "logits/chosen": 10.236791610717773, "logits/rejected": 11.348725318908691, "logps/chosen": -290.7213439941406, "logps/rejected": -306.6461181640625, "loss": 0.6822, "rewards/accuracies": 0.625, "rewards/chosen": 0.04713492467999458, "rewards/margins": 0.02363448217511177, "rewards/rejected": 0.023500442504882812, "step": 588 }, { "epoch": 0.09108834332109028, "grad_norm": 3.99106764793396, "learning_rate": 1.5180412371134021e-06, "logits/chosen": 12.880867004394531, "logits/rejected": 1.6681647300720215, "logps/chosen": -228.91464233398438, "logps/rejected": -94.42927551269531, "loss": 0.7188, "rewards/accuracies": 0.375, "rewards/chosen": -0.042986344546079636, "rewards/margins": -0.048634838312864304, "rewards/rejected": 0.005648494698107243, "step": 589 }, { "epoch": 0.09124299246085443, "grad_norm": 4.398662090301514, "learning_rate": 1.520618556701031e-06, "logits/chosen": 10.31039810180664, "logits/rejected": 11.152352333068848, "logps/chosen": -245.16677856445312, "logps/rejected": -235.45811462402344, "loss": 0.6564, "rewards/accuracies": 0.75, "rewards/chosen": -0.02379927784204483, "rewards/margins": 0.08172422647476196, "rewards/rejected": -0.1055234968662262, "step": 590 }, { "epoch": 0.0913976416006186, "grad_norm": 7.149019241333008, "learning_rate": 1.5231958762886599e-06, "logits/chosen": 2.468039035797119, "logits/rejected": 6.3760905265808105, "logps/chosen": -195.27056884765625, "logps/rejected": -318.6320495605469, "loss": 0.6857, "rewards/accuracies": 0.5, "rewards/chosen": 0.02985386922955513, "rewards/margins": 0.02219838835299015, "rewards/rejected": 0.007655477151274681, "step": 591 }, { "epoch": 0.09155229074038276, "grad_norm": 5.477923393249512, "learning_rate": 1.525773195876289e-06, "logits/chosen": 9.466489791870117, "logits/rejected": 11.058809280395508, "logps/chosen": -297.7689208984375, "logps/rejected": -338.0509948730469, "loss": 0.6585, "rewards/accuracies": 0.75, "rewards/chosen": 0.018276499584317207, "rewards/margins": 0.07651957869529724, "rewards/rejected": -0.05824308097362518, "step": 592 }, { "epoch": 0.09170693988014691, "grad_norm": 5.714723110198975, "learning_rate": 1.5283505154639178e-06, "logits/chosen": 14.133981704711914, "logits/rejected": 7.50548791885376, "logps/chosen": -311.7084045410156, "logps/rejected": -237.20822143554688, "loss": 0.6725, "rewards/accuracies": 0.5, "rewards/chosen": -0.031813692301511765, "rewards/margins": 0.049206286668777466, "rewards/rejected": -0.08101997524499893, "step": 593 }, { "epoch": 0.09186158901991108, "grad_norm": 3.7252426147460938, "learning_rate": 1.5309278350515467e-06, "logits/chosen": 5.907700538635254, "logits/rejected": 6.212896347045898, "logps/chosen": -138.7192840576172, "logps/rejected": -149.2027130126953, "loss": 0.6663, "rewards/accuracies": 0.75, "rewards/chosen": 0.038338325917720795, "rewards/margins": 0.05797116458415985, "rewards/rejected": -0.019632840529084206, "step": 594 }, { "epoch": 0.09201623815967523, "grad_norm": 6.494401454925537, "learning_rate": 1.5335051546391756e-06, "logits/chosen": 6.552602767944336, "logits/rejected": 6.334468364715576, "logps/chosen": -301.9626159667969, "logps/rejected": -302.86895751953125, "loss": 0.6654, "rewards/accuracies": 0.75, "rewards/chosen": 0.024748800322413445, "rewards/margins": 0.060784436762332916, "rewards/rejected": -0.03603563457727432, "step": 595 }, { "epoch": 0.0921708872994394, "grad_norm": 7.3843464851379395, "learning_rate": 1.5360824742268042e-06, "logits/chosen": 11.448627471923828, "logits/rejected": 2.547853469848633, "logps/chosen": -354.51171875, "logps/rejected": -222.20797729492188, "loss": 0.6837, "rewards/accuracies": 0.5, "rewards/chosen": 0.0057009682059288025, "rewards/margins": 0.040397725999355316, "rewards/rejected": -0.03469677269458771, "step": 596 }, { "epoch": 0.09232553643920356, "grad_norm": 4.499715805053711, "learning_rate": 1.5386597938144331e-06, "logits/chosen": 11.953042030334473, "logits/rejected": 9.38812255859375, "logps/chosen": -294.1684265136719, "logps/rejected": -175.8557891845703, "loss": 0.6909, "rewards/accuracies": 0.5, "rewards/chosen": -0.054490186274051666, "rewards/margins": 0.009673496708273888, "rewards/rejected": -0.0641636773943901, "step": 597 }, { "epoch": 0.09248018557896771, "grad_norm": 6.344499588012695, "learning_rate": 1.541237113402062e-06, "logits/chosen": 12.79347038269043, "logits/rejected": 8.125385284423828, "logps/chosen": -376.8821105957031, "logps/rejected": -276.41351318359375, "loss": 0.7106, "rewards/accuracies": 0.5, "rewards/chosen": -0.08567304909229279, "rewards/margins": -0.03087618388235569, "rewards/rejected": -0.054796863347291946, "step": 598 }, { "epoch": 0.09263483471873188, "grad_norm": 5.350180149078369, "learning_rate": 1.5438144329896909e-06, "logits/chosen": 9.485483169555664, "logits/rejected": 0.19389140605926514, "logps/chosen": -300.34283447265625, "logps/rejected": -278.94049072265625, "loss": 0.6956, "rewards/accuracies": 0.25, "rewards/chosen": 0.0005204216577112675, "rewards/margins": -0.00031356606632471085, "rewards/rejected": 0.0008339891210198402, "step": 599 }, { "epoch": 0.09278948385849603, "grad_norm": 4.526947498321533, "learning_rate": 1.5463917525773197e-06, "logits/chosen": 10.212084770202637, "logits/rejected": 5.363928318023682, "logps/chosen": -274.0177917480469, "logps/rejected": -252.82843017578125, "loss": 0.6389, "rewards/accuracies": 0.75, "rewards/chosen": 0.048618413507938385, "rewards/margins": 0.11915703117847443, "rewards/rejected": -0.07053861767053604, "step": 600 }, { "epoch": 0.0929441329982602, "grad_norm": 4.0233378410339355, "learning_rate": 1.5489690721649486e-06, "logits/chosen": 8.673198699951172, "logits/rejected": 2.4412617683410645, "logps/chosen": -289.31805419921875, "logps/rejected": -250.50515747070312, "loss": 0.6486, "rewards/accuracies": 0.625, "rewards/chosen": 0.025545261800289154, "rewards/margins": 0.10764918476343155, "rewards/rejected": -0.0821039155125618, "step": 601 }, { "epoch": 0.09309878213802436, "grad_norm": 8.741628646850586, "learning_rate": 1.5515463917525775e-06, "logits/chosen": 11.945576667785645, "logits/rejected": 8.788779258728027, "logps/chosen": -715.8478393554688, "logps/rejected": -467.29986572265625, "loss": 0.6618, "rewards/accuracies": 0.75, "rewards/chosen": 0.03313750401139259, "rewards/margins": 0.10495051741600037, "rewards/rejected": -0.07181301712989807, "step": 602 }, { "epoch": 0.09325343127778851, "grad_norm": 4.780418395996094, "learning_rate": 1.5541237113402063e-06, "logits/chosen": 7.057133674621582, "logits/rejected": 9.991583824157715, "logps/chosen": -225.34634399414062, "logps/rejected": -225.10385131835938, "loss": 0.7096, "rewards/accuracies": 0.25, "rewards/chosen": -0.04378471523523331, "rewards/margins": -0.019158653914928436, "rewards/rejected": -0.02462606318295002, "step": 603 }, { "epoch": 0.09340808041755268, "grad_norm": 4.472261905670166, "learning_rate": 1.5567010309278352e-06, "logits/chosen": 9.863222122192383, "logits/rejected": 7.078555107116699, "logps/chosen": -223.77841186523438, "logps/rejected": -211.4258270263672, "loss": 0.6931, "rewards/accuracies": 0.5, "rewards/chosen": 0.014171219430863857, "rewards/margins": 0.0021557817235589027, "rewards/rejected": 0.012015435844659805, "step": 604 }, { "epoch": 0.09356272955731684, "grad_norm": 5.152721405029297, "learning_rate": 1.559278350515464e-06, "logits/chosen": 11.461219787597656, "logits/rejected": 9.197368621826172, "logps/chosen": -310.2313232421875, "logps/rejected": -313.9874572753906, "loss": 0.6654, "rewards/accuracies": 0.625, "rewards/chosen": 0.05733604356646538, "rewards/margins": 0.060371968895196915, "rewards/rejected": -0.003035927191376686, "step": 605 }, { "epoch": 0.093717378697081, "grad_norm": 3.8857390880584717, "learning_rate": 1.561855670103093e-06, "logits/chosen": 14.510835647583008, "logits/rejected": 8.955690383911133, "logps/chosen": -185.84414672851562, "logps/rejected": -135.18988037109375, "loss": 0.6965, "rewards/accuracies": 0.5, "rewards/chosen": -0.042194414883852005, "rewards/margins": -0.002108335494995117, "rewards/rejected": -0.04008607938885689, "step": 606 }, { "epoch": 0.09387202783684516, "grad_norm": 6.44877815246582, "learning_rate": 1.5644329896907218e-06, "logits/chosen": 13.071895599365234, "logits/rejected": 6.534764766693115, "logps/chosen": -347.78436279296875, "logps/rejected": -371.85028076171875, "loss": 0.6985, "rewards/accuracies": 0.375, "rewards/chosen": -0.03660164028406143, "rewards/margins": -0.0078088222071528435, "rewards/rejected": -0.028792815282940865, "step": 607 }, { "epoch": 0.09402667697660931, "grad_norm": 5.619678497314453, "learning_rate": 1.5670103092783507e-06, "logits/chosen": 14.796981811523438, "logits/rejected": 11.97298812866211, "logps/chosen": -281.37017822265625, "logps/rejected": -267.58111572265625, "loss": 0.7588, "rewards/accuracies": 0.25, "rewards/chosen": -0.07332611083984375, "rewards/margins": -0.11931552737951279, "rewards/rejected": 0.045989420264959335, "step": 608 }, { "epoch": 0.09418132611637348, "grad_norm": 5.50604248046875, "learning_rate": 1.5695876288659796e-06, "logits/chosen": 7.825628757476807, "logits/rejected": 4.102483749389648, "logps/chosen": -341.34710693359375, "logps/rejected": -278.0054931640625, "loss": 0.671, "rewards/accuracies": 0.625, "rewards/chosen": 0.016111426055431366, "rewards/margins": 0.05413079261779785, "rewards/rejected": -0.038019370287656784, "step": 609 }, { "epoch": 0.09433597525613764, "grad_norm": 5.484696388244629, "learning_rate": 1.5721649484536082e-06, "logits/chosen": 6.812306880950928, "logits/rejected": 6.816822052001953, "logps/chosen": -336.73529052734375, "logps/rejected": -343.827392578125, "loss": 0.6566, "rewards/accuracies": 0.75, "rewards/chosen": 0.008094217628240585, "rewards/margins": 0.0756252333521843, "rewards/rejected": -0.06753101944923401, "step": 610 }, { "epoch": 0.09449062439590179, "grad_norm": 5.3768110275268555, "learning_rate": 1.5747422680412371e-06, "logits/chosen": 11.924095153808594, "logits/rejected": 10.34201431274414, "logps/chosen": -258.12548828125, "logps/rejected": -195.8258056640625, "loss": 0.7354, "rewards/accuracies": 0.5, "rewards/chosen": -0.03276486694812775, "rewards/margins": -0.0749281495809555, "rewards/rejected": 0.04216327518224716, "step": 611 }, { "epoch": 0.09464527353566596, "grad_norm": 4.8351826667785645, "learning_rate": 1.577319587628866e-06, "logits/chosen": 7.209897994995117, "logits/rejected": 8.280296325683594, "logps/chosen": -252.28445434570312, "logps/rejected": -274.72479248046875, "loss": 0.6394, "rewards/accuracies": 0.75, "rewards/chosen": -0.01845426671206951, "rewards/margins": 0.116290383040905, "rewards/rejected": -0.13474464416503906, "step": 612 }, { "epoch": 0.09479992267543012, "grad_norm": 5.207186698913574, "learning_rate": 1.5798969072164949e-06, "logits/chosen": 9.675490379333496, "logits/rejected": 6.493882179260254, "logps/chosen": -496.61614990234375, "logps/rejected": -297.22283935546875, "loss": 0.6582, "rewards/accuracies": 0.5, "rewards/chosen": 0.028522584587335587, "rewards/margins": 0.0827580988407135, "rewards/rejected": -0.054235510528087616, "step": 613 }, { "epoch": 0.09495457181519427, "grad_norm": 4.982122898101807, "learning_rate": 1.5824742268041237e-06, "logits/chosen": 11.553534507751465, "logits/rejected": 10.82922649383545, "logps/chosen": -243.1285400390625, "logps/rejected": -209.518310546875, "loss": 0.714, "rewards/accuracies": 0.5, "rewards/chosen": -0.007285021711140871, "rewards/margins": -0.03921153396368027, "rewards/rejected": 0.031926512718200684, "step": 614 }, { "epoch": 0.09510922095495844, "grad_norm": 5.356812477111816, "learning_rate": 1.5850515463917526e-06, "logits/chosen": 12.095643997192383, "logits/rejected": 4.967586517333984, "logps/chosen": -262.9018249511719, "logps/rejected": -209.69000244140625, "loss": 0.6472, "rewards/accuracies": 0.625, "rewards/chosen": 0.005501079838722944, "rewards/margins": 0.10470442473888397, "rewards/rejected": -0.09920335561037064, "step": 615 }, { "epoch": 0.09526387009472259, "grad_norm": 4.781611919403076, "learning_rate": 1.5876288659793815e-06, "logits/chosen": 11.37546157836914, "logits/rejected": 8.753242492675781, "logps/chosen": -236.43675231933594, "logps/rejected": -231.66348266601562, "loss": 0.7134, "rewards/accuracies": 0.25, "rewards/chosen": -0.05269956216216087, "rewards/margins": -0.037966012954711914, "rewards/rejected": -0.014733552932739258, "step": 616 }, { "epoch": 0.09541851923448676, "grad_norm": 5.962507247924805, "learning_rate": 1.5902061855670103e-06, "logits/chosen": 11.31015682220459, "logits/rejected": 6.8168182373046875, "logps/chosen": -370.357421875, "logps/rejected": -332.8713684082031, "loss": 0.7109, "rewards/accuracies": 0.5, "rewards/chosen": -0.06651440262794495, "rewards/margins": -0.029712533578276634, "rewards/rejected": -0.036801863461732864, "step": 617 }, { "epoch": 0.09557316837425092, "grad_norm": 4.861644268035889, "learning_rate": 1.5927835051546392e-06, "logits/chosen": 8.432106018066406, "logits/rejected": -0.2939087152481079, "logps/chosen": -329.53631591796875, "logps/rejected": -212.93565368652344, "loss": 0.7326, "rewards/accuracies": 0.375, "rewards/chosen": 0.009197043254971504, "rewards/margins": -0.06665685027837753, "rewards/rejected": 0.07585389912128448, "step": 618 }, { "epoch": 0.09572781751401507, "grad_norm": 5.125204086303711, "learning_rate": 1.595360824742268e-06, "logits/chosen": 5.196965217590332, "logits/rejected": 4.761816024780273, "logps/chosen": -197.16281127929688, "logps/rejected": -129.31732177734375, "loss": 0.6997, "rewards/accuracies": 0.375, "rewards/chosen": -0.0391143299639225, "rewards/margins": -0.01229917909950018, "rewards/rejected": -0.026815149933099747, "step": 619 }, { "epoch": 0.09588246665377924, "grad_norm": 8.327315330505371, "learning_rate": 1.597938144329897e-06, "logits/chosen": 10.22437858581543, "logits/rejected": 7.134114742279053, "logps/chosen": -340.1286315917969, "logps/rejected": -298.02813720703125, "loss": 0.6509, "rewards/accuracies": 0.625, "rewards/chosen": -0.029244422912597656, "rewards/margins": 0.0943283662199974, "rewards/rejected": -0.12357278168201447, "step": 620 }, { "epoch": 0.0960371157935434, "grad_norm": 5.642324447631836, "learning_rate": 1.6005154639175258e-06, "logits/chosen": 13.739320755004883, "logits/rejected": 12.696159362792969, "logps/chosen": -314.5406799316406, "logps/rejected": -323.3088073730469, "loss": 0.6784, "rewards/accuracies": 0.375, "rewards/chosen": -0.011061572469770908, "rewards/margins": 0.040704868733882904, "rewards/rejected": -0.051766443997621536, "step": 621 }, { "epoch": 0.09619176493330756, "grad_norm": 5.283963203430176, "learning_rate": 1.603092783505155e-06, "logits/chosen": 13.151761054992676, "logits/rejected": 17.402347564697266, "logps/chosen": -220.0209197998047, "logps/rejected": -191.966552734375, "loss": 0.6695, "rewards/accuracies": 0.625, "rewards/chosen": 0.033338069915771484, "rewards/margins": 0.052843473851680756, "rewards/rejected": -0.01950540393590927, "step": 622 }, { "epoch": 0.09634641407307172, "grad_norm": 4.335860729217529, "learning_rate": 1.6056701030927838e-06, "logits/chosen": 6.806107521057129, "logits/rejected": 15.09583854675293, "logps/chosen": -181.46981811523438, "logps/rejected": -268.5600891113281, "loss": 0.7306, "rewards/accuracies": 0.25, "rewards/chosen": -0.03473300859332085, "rewards/margins": -0.06548719108104706, "rewards/rejected": 0.03075418621301651, "step": 623 }, { "epoch": 0.09650106321283587, "grad_norm": 5.092329978942871, "learning_rate": 1.6082474226804127e-06, "logits/chosen": 10.802000045776367, "logits/rejected": 10.009002685546875, "logps/chosen": -210.10116577148438, "logps/rejected": -251.5255889892578, "loss": 0.6683, "rewards/accuracies": 0.75, "rewards/chosen": -0.005318784154951572, "rewards/margins": 0.05525808781385422, "rewards/rejected": -0.06057686731219292, "step": 624 }, { "epoch": 0.09665571235260004, "grad_norm": 4.9923248291015625, "learning_rate": 1.6108247422680415e-06, "logits/chosen": 5.184527397155762, "logits/rejected": 8.153695106506348, "logps/chosen": -206.96498107910156, "logps/rejected": -223.87667846679688, "loss": 0.6807, "rewards/accuracies": 0.5, "rewards/chosen": -0.036101486533880234, "rewards/margins": 0.03194420412182808, "rewards/rejected": -0.06804568320512772, "step": 625 }, { "epoch": 0.0968103614923642, "grad_norm": 4.65030574798584, "learning_rate": 1.6134020618556704e-06, "logits/chosen": 14.926023483276367, "logits/rejected": 13.717376708984375, "logps/chosen": -177.5963592529297, "logps/rejected": -249.663330078125, "loss": 0.6647, "rewards/accuracies": 0.875, "rewards/chosen": -0.006681108847260475, "rewards/margins": 0.05900819972157478, "rewards/rejected": -0.06568930298089981, "step": 626 }, { "epoch": 0.09696501063212835, "grad_norm": 4.603990077972412, "learning_rate": 1.6159793814432993e-06, "logits/chosen": 14.098691940307617, "logits/rejected": 6.207292556762695, "logps/chosen": -364.30755615234375, "logps/rejected": -272.8124084472656, "loss": 0.6847, "rewards/accuracies": 0.5, "rewards/chosen": -0.021910574287176132, "rewards/margins": 0.02428731881082058, "rewards/rejected": -0.04619789496064186, "step": 627 }, { "epoch": 0.09711965977189252, "grad_norm": 5.58837890625, "learning_rate": 1.6185567010309281e-06, "logits/chosen": 9.047807693481445, "logits/rejected": 4.971400260925293, "logps/chosen": -262.41424560546875, "logps/rejected": -293.35064697265625, "loss": 0.71, "rewards/accuracies": 0.5, "rewards/chosen": -0.03303651884198189, "rewards/margins": -0.030684994533658028, "rewards/rejected": -0.0023515233770012856, "step": 628 }, { "epoch": 0.09727430891165668, "grad_norm": 5.33598518371582, "learning_rate": 1.6211340206185568e-06, "logits/chosen": 5.499825477600098, "logits/rejected": 4.790665626525879, "logps/chosen": -268.12353515625, "logps/rejected": -268.49896240234375, "loss": 0.7246, "rewards/accuracies": 0.375, "rewards/chosen": -0.07068347930908203, "rewards/margins": -0.06015462800860405, "rewards/rejected": -0.010528851300477982, "step": 629 }, { "epoch": 0.09742895805142084, "grad_norm": 4.960086822509766, "learning_rate": 1.6237113402061857e-06, "logits/chosen": 15.52161693572998, "logits/rejected": 11.660200119018555, "logps/chosen": -326.958251953125, "logps/rejected": -333.1241455078125, "loss": 0.6395, "rewards/accuracies": 0.625, "rewards/chosen": 0.03655433654785156, "rewards/margins": 0.1183067262172699, "rewards/rejected": -0.08175239711999893, "step": 630 }, { "epoch": 0.097583607191185, "grad_norm": 4.868167400360107, "learning_rate": 1.6262886597938145e-06, "logits/chosen": 14.511956214904785, "logits/rejected": 6.359712600708008, "logps/chosen": -269.334228515625, "logps/rejected": -196.94873046875, "loss": 0.6468, "rewards/accuracies": 0.75, "rewards/chosen": 0.08865445107221603, "rewards/margins": 0.10222003608942032, "rewards/rejected": -0.013565592467784882, "step": 631 }, { "epoch": 0.09773825633094915, "grad_norm": 6.033456325531006, "learning_rate": 1.6288659793814434e-06, "logits/chosen": 13.104146957397461, "logits/rejected": 8.012776374816895, "logps/chosen": -276.33807373046875, "logps/rejected": -290.249267578125, "loss": 0.6742, "rewards/accuracies": 0.625, "rewards/chosen": -0.03361959382891655, "rewards/margins": 0.041159387677907944, "rewards/rejected": -0.07477898895740509, "step": 632 }, { "epoch": 0.09789290547071332, "grad_norm": 7.156194686889648, "learning_rate": 1.6314432989690723e-06, "logits/chosen": 14.901607513427734, "logits/rejected": 10.479061126708984, "logps/chosen": -433.1152648925781, "logps/rejected": -299.2436828613281, "loss": 0.7059, "rewards/accuracies": 0.5, "rewards/chosen": -0.003666020929813385, "rewards/margins": -0.020360376685857773, "rewards/rejected": 0.016694355756044388, "step": 633 }, { "epoch": 0.09804755461047748, "grad_norm": 3.93912672996521, "learning_rate": 1.6340206185567012e-06, "logits/chosen": 9.796585083007812, "logits/rejected": 4.748625755310059, "logps/chosen": -230.2073974609375, "logps/rejected": -189.3337860107422, "loss": 0.6884, "rewards/accuracies": 0.375, "rewards/chosen": 0.04565896838903427, "rewards/margins": 0.024111414328217506, "rewards/rejected": 0.021547559648752213, "step": 634 }, { "epoch": 0.09820220375024163, "grad_norm": 7.107681751251221, "learning_rate": 1.63659793814433e-06, "logits/chosen": 8.632162094116211, "logits/rejected": 9.427631378173828, "logps/chosen": -387.70245361328125, "logps/rejected": -285.6554870605469, "loss": 0.7229, "rewards/accuracies": 0.5, "rewards/chosen": -0.026623060926795006, "rewards/margins": -0.05328959971666336, "rewards/rejected": 0.02666654624044895, "step": 635 }, { "epoch": 0.0983568528900058, "grad_norm": 4.889127254486084, "learning_rate": 1.639175257731959e-06, "logits/chosen": 8.18308162689209, "logits/rejected": 13.358589172363281, "logps/chosen": -200.55355834960938, "logps/rejected": -285.7563781738281, "loss": 0.7322, "rewards/accuracies": 0.25, "rewards/chosen": -0.004796029534190893, "rewards/margins": -0.06852416694164276, "rewards/rejected": 0.06372814625501633, "step": 636 }, { "epoch": 0.09851150202976997, "grad_norm": 4.188296794891357, "learning_rate": 1.6417525773195878e-06, "logits/chosen": 9.515541076660156, "logits/rejected": 0.9121682643890381, "logps/chosen": -191.35414123535156, "logps/rejected": -132.52049255371094, "loss": 0.692, "rewards/accuracies": 0.5, "rewards/chosen": -0.05044122040271759, "rewards/margins": 0.003530503250658512, "rewards/rejected": -0.05397172272205353, "step": 637 }, { "epoch": 0.09866615116953412, "grad_norm": 3.6274330615997314, "learning_rate": 1.6443298969072167e-06, "logits/chosen": 9.701925277709961, "logits/rejected": 3.2490687370300293, "logps/chosen": -225.98117065429688, "logps/rejected": -140.39572143554688, "loss": 0.7037, "rewards/accuracies": 0.125, "rewards/chosen": -0.046563006937503815, "rewards/margins": -0.019072817638516426, "rewards/rejected": -0.02749018743634224, "step": 638 }, { "epoch": 0.09882080030929828, "grad_norm": 5.348845958709717, "learning_rate": 1.6469072164948455e-06, "logits/chosen": 11.14402961730957, "logits/rejected": 5.489739418029785, "logps/chosen": -414.48193359375, "logps/rejected": -297.72247314453125, "loss": 0.6965, "rewards/accuracies": 0.5, "rewards/chosen": -0.02482414059340954, "rewards/margins": -0.0009044390171766281, "rewards/rejected": -0.02391970530152321, "step": 639 }, { "epoch": 0.09897544944906243, "grad_norm": 5.5072174072265625, "learning_rate": 1.6494845360824744e-06, "logits/chosen": 6.948462963104248, "logits/rejected": 10.435422897338867, "logps/chosen": -357.62353515625, "logps/rejected": -391.70892333984375, "loss": 0.6993, "rewards/accuracies": 0.625, "rewards/chosen": -0.003362540155649185, "rewards/margins": 0.0031153932213783264, "rewards/rejected": -0.006477933377027512, "step": 640 }, { "epoch": 0.0991300985888266, "grad_norm": 4.387056350708008, "learning_rate": 1.6520618556701033e-06, "logits/chosen": 10.224788665771484, "logits/rejected": 6.8633503913879395, "logps/chosen": -137.84393310546875, "logps/rejected": -127.134765625, "loss": 0.7039, "rewards/accuracies": 0.375, "rewards/chosen": -0.011342001147568226, "rewards/margins": -0.017363524064421654, "rewards/rejected": 0.006021523382514715, "step": 641 }, { "epoch": 0.09928474772859076, "grad_norm": 2.762166976928711, "learning_rate": 1.6546391752577321e-06, "logits/chosen": 6.713349342346191, "logits/rejected": 10.584786415100098, "logps/chosen": -73.66946411132812, "logps/rejected": -101.34318542480469, "loss": 0.6699, "rewards/accuracies": 0.75, "rewards/chosen": -0.001994467107579112, "rewards/margins": 0.04827606678009033, "rewards/rejected": -0.05027053505182266, "step": 642 }, { "epoch": 0.09943939686835492, "grad_norm": 4.7673020362854, "learning_rate": 1.6572164948453608e-06, "logits/chosen": 7.310449600219727, "logits/rejected": 10.991437911987305, "logps/chosen": -191.6696319580078, "logps/rejected": -239.61160278320312, "loss": 0.6844, "rewards/accuracies": 0.5, "rewards/chosen": 0.03263092041015625, "rewards/margins": 0.024889947846531868, "rewards/rejected": 0.007740974426269531, "step": 643 }, { "epoch": 0.09959404600811908, "grad_norm": 5.100652694702148, "learning_rate": 1.6597938144329897e-06, "logits/chosen": 10.107490539550781, "logits/rejected": 10.81308650970459, "logps/chosen": -223.90249633789062, "logps/rejected": -318.9140930175781, "loss": 0.681, "rewards/accuracies": 0.625, "rewards/chosen": -0.007338476367294788, "rewards/margins": 0.030842041596770287, "rewards/rejected": -0.03818051889538765, "step": 644 }, { "epoch": 0.09974869514788325, "grad_norm": 5.188139915466309, "learning_rate": 1.6623711340206185e-06, "logits/chosen": 11.427698135375977, "logits/rejected": 8.363601684570312, "logps/chosen": -278.658447265625, "logps/rejected": -254.81475830078125, "loss": 0.674, "rewards/accuracies": 0.625, "rewards/chosen": 0.024146556854248047, "rewards/margins": 0.04017200693488121, "rewards/rejected": -0.016025448217988014, "step": 645 }, { "epoch": 0.0999033442876474, "grad_norm": 4.890308856964111, "learning_rate": 1.6649484536082474e-06, "logits/chosen": 12.572355270385742, "logits/rejected": 6.571372032165527, "logps/chosen": -324.322509765625, "logps/rejected": -228.4810791015625, "loss": 0.7045, "rewards/accuracies": 0.375, "rewards/chosen": -0.03328218683600426, "rewards/margins": -0.017046835273504257, "rewards/rejected": -0.0162353515625, "step": 646 }, { "epoch": 0.10005799342741156, "grad_norm": 4.256762981414795, "learning_rate": 1.6675257731958763e-06, "logits/chosen": 7.919941425323486, "logits/rejected": 4.48151969909668, "logps/chosen": -269.82080078125, "logps/rejected": -290.96575927734375, "loss": 0.6229, "rewards/accuracies": 0.75, "rewards/chosen": 0.06998326629400253, "rewards/margins": 0.15377354621887207, "rewards/rejected": -0.08379028737545013, "step": 647 }, { "epoch": 0.10021264256717571, "grad_norm": 4.747322082519531, "learning_rate": 1.6701030927835052e-06, "logits/chosen": 14.111015319824219, "logits/rejected": 10.853255271911621, "logps/chosen": -317.4160461425781, "logps/rejected": -211.2255859375, "loss": 0.6949, "rewards/accuracies": 0.25, "rewards/chosen": -0.029404643923044205, "rewards/margins": 0.002740904688835144, "rewards/rejected": -0.03214554861187935, "step": 648 }, { "epoch": 0.10036729170693988, "grad_norm": 3.3639962673187256, "learning_rate": 1.672680412371134e-06, "logits/chosen": 9.712835311889648, "logits/rejected": 9.351411819458008, "logps/chosen": -151.31210327148438, "logps/rejected": -135.27659606933594, "loss": 0.6971, "rewards/accuracies": 0.25, "rewards/chosen": 0.003421616042032838, "rewards/margins": -0.005889464169740677, "rewards/rejected": 0.009311079978942871, "step": 649 }, { "epoch": 0.10052194084670404, "grad_norm": 4.250289440155029, "learning_rate": 1.675257731958763e-06, "logits/chosen": 8.930456161499023, "logits/rejected": 12.005864143371582, "logps/chosen": -177.62753295898438, "logps/rejected": -221.66717529296875, "loss": 0.6454, "rewards/accuracies": 0.875, "rewards/chosen": 0.008941221982240677, "rewards/margins": 0.10039810836315155, "rewards/rejected": -0.09145689010620117, "step": 650 }, { "epoch": 0.1006765899864682, "grad_norm": 6.616263389587402, "learning_rate": 1.6778350515463918e-06, "logits/chosen": 9.84302043914795, "logits/rejected": 5.0720624923706055, "logps/chosen": -214.24847412109375, "logps/rejected": -151.21328735351562, "loss": 0.7356, "rewards/accuracies": 0.375, "rewards/chosen": -0.10174660384654999, "rewards/margins": -0.07674827426671982, "rewards/rejected": -0.024998335167765617, "step": 651 }, { "epoch": 0.10083123912623236, "grad_norm": 4.11628532409668, "learning_rate": 1.6804123711340209e-06, "logits/chosen": 1.6993416547775269, "logits/rejected": 0.3504621982574463, "logps/chosen": -307.0272216796875, "logps/rejected": -255.17030334472656, "loss": 0.6447, "rewards/accuracies": 0.75, "rewards/chosen": 0.05690937116742134, "rewards/margins": 0.1046469658613205, "rewards/rejected": -0.04773760214447975, "step": 652 }, { "epoch": 0.10098588826599653, "grad_norm": 4.548579692840576, "learning_rate": 1.6829896907216497e-06, "logits/chosen": 10.458688735961914, "logits/rejected": 6.558831691741943, "logps/chosen": -195.62979125976562, "logps/rejected": -202.48150634765625, "loss": 0.6612, "rewards/accuracies": 0.5, "rewards/chosen": 0.007487392984330654, "rewards/margins": 0.06877279281616211, "rewards/rejected": -0.06128539890050888, "step": 653 }, { "epoch": 0.10114053740576068, "grad_norm": 7.213531970977783, "learning_rate": 1.6855670103092786e-06, "logits/chosen": 6.283636093139648, "logits/rejected": 7.394184589385986, "logps/chosen": -334.258544921875, "logps/rejected": -553.8726196289062, "loss": 0.7252, "rewards/accuracies": 0.5, "rewards/chosen": -0.04854417219758034, "rewards/margins": -0.05938858911395073, "rewards/rejected": 0.010844423435628414, "step": 654 }, { "epoch": 0.10129518654552484, "grad_norm": 5.0834879875183105, "learning_rate": 1.6881443298969075e-06, "logits/chosen": 11.197104454040527, "logits/rejected": -6.347384452819824, "logps/chosen": -488.41802978515625, "logps/rejected": -232.65145874023438, "loss": 0.648, "rewards/accuracies": 0.75, "rewards/chosen": 0.07724142074584961, "rewards/margins": 0.10025287419557571, "rewards/rejected": -0.023011445999145508, "step": 655 }, { "epoch": 0.101449835685289, "grad_norm": 4.407314777374268, "learning_rate": 1.6907216494845363e-06, "logits/chosen": 6.098791122436523, "logits/rejected": 6.52764368057251, "logps/chosen": -220.281494140625, "logps/rejected": -245.36526489257812, "loss": 0.6444, "rewards/accuracies": 0.75, "rewards/chosen": 0.020389366894960403, "rewards/margins": 0.10595996677875519, "rewards/rejected": -0.08557059615850449, "step": 656 }, { "epoch": 0.10160448482505316, "grad_norm": 5.5342583656311035, "learning_rate": 1.6932989690721652e-06, "logits/chosen": 9.752843856811523, "logits/rejected": 5.9225263595581055, "logps/chosen": -354.33734130859375, "logps/rejected": -258.33380126953125, "loss": 0.7011, "rewards/accuracies": 0.625, "rewards/chosen": -0.06359577178955078, "rewards/margins": -0.005949879065155983, "rewards/rejected": -0.05764589458703995, "step": 657 }, { "epoch": 0.10175913396481732, "grad_norm": 4.831279754638672, "learning_rate": 1.695876288659794e-06, "logits/chosen": 15.619799613952637, "logits/rejected": 7.549403190612793, "logps/chosen": -356.50537109375, "logps/rejected": -279.78076171875, "loss": 0.6546, "rewards/accuracies": 0.875, "rewards/chosen": 0.03341083601117134, "rewards/margins": 0.08175259083509445, "rewards/rejected": -0.04834175109863281, "step": 658 }, { "epoch": 0.10191378310458148, "grad_norm": 5.875309944152832, "learning_rate": 1.698453608247423e-06, "logits/chosen": 11.178624153137207, "logits/rejected": 8.83117389678955, "logps/chosen": -359.5606689453125, "logps/rejected": -378.2495422363281, "loss": 0.7065, "rewards/accuracies": 0.75, "rewards/chosen": 0.008427286520600319, "rewards/margins": -0.01331019401550293, "rewards/rejected": 0.0217374786734581, "step": 659 }, { "epoch": 0.10206843224434564, "grad_norm": 4.892007350921631, "learning_rate": 1.7010309278350518e-06, "logits/chosen": 16.60784339904785, "logits/rejected": 10.378694534301758, "logps/chosen": -181.0744171142578, "logps/rejected": -120.24176025390625, "loss": 0.7297, "rewards/accuracies": 0.25, "rewards/chosen": -0.04470715671777725, "rewards/margins": -0.06933944672346115, "rewards/rejected": 0.0246322862803936, "step": 660 }, { "epoch": 0.1022230813841098, "grad_norm": 5.259274005889893, "learning_rate": 1.7036082474226807e-06, "logits/chosen": 7.3079938888549805, "logits/rejected": 10.6466646194458, "logps/chosen": -266.22613525390625, "logps/rejected": -331.6214904785156, "loss": 0.7575, "rewards/accuracies": 0.375, "rewards/chosen": -0.013918399810791016, "rewards/margins": -0.11277685314416885, "rewards/rejected": 0.09885846078395844, "step": 661 }, { "epoch": 0.10237773052387396, "grad_norm": 5.762076377868652, "learning_rate": 1.7061855670103094e-06, "logits/chosen": 12.200499534606934, "logits/rejected": 8.767509460449219, "logps/chosen": -254.0989990234375, "logps/rejected": -182.6186065673828, "loss": 0.7156, "rewards/accuracies": 0.5, "rewards/chosen": -0.045737892389297485, "rewards/margins": -0.03786543384194374, "rewards/rejected": -0.007872462272644043, "step": 662 }, { "epoch": 0.10253237966363812, "grad_norm": 4.698825359344482, "learning_rate": 1.7087628865979382e-06, "logits/chosen": 11.65015983581543, "logits/rejected": 7.005955696105957, "logps/chosen": -339.91546630859375, "logps/rejected": -220.9632568359375, "loss": 0.6895, "rewards/accuracies": 0.5, "rewards/chosen": -0.017162036150693893, "rewards/margins": 0.011417672038078308, "rewards/rejected": -0.02857971377670765, "step": 663 }, { "epoch": 0.10268702880340227, "grad_norm": 5.734563827514648, "learning_rate": 1.7113402061855671e-06, "logits/chosen": 5.212826728820801, "logits/rejected": 7.359377861022949, "logps/chosen": -257.7365417480469, "logps/rejected": -309.5773010253906, "loss": 0.6746, "rewards/accuracies": 0.5, "rewards/chosen": 0.04576730728149414, "rewards/margins": 0.04579916596412659, "rewards/rejected": -3.1853094696998596e-05, "step": 664 }, { "epoch": 0.10284167794316644, "grad_norm": 5.3968586921691895, "learning_rate": 1.713917525773196e-06, "logits/chosen": 9.994443893432617, "logits/rejected": 10.042808532714844, "logps/chosen": -311.7440185546875, "logps/rejected": -281.3293151855469, "loss": 0.6683, "rewards/accuracies": 0.5, "rewards/chosen": 0.0023292554542422295, "rewards/margins": 0.06412233412265778, "rewards/rejected": -0.06179308891296387, "step": 665 }, { "epoch": 0.1029963270829306, "grad_norm": 4.270318031311035, "learning_rate": 1.7164948453608249e-06, "logits/chosen": 7.720704555511475, "logits/rejected": 11.80801010131836, "logps/chosen": -180.4175567626953, "logps/rejected": -247.20968627929688, "loss": 0.71, "rewards/accuracies": 0.5, "rewards/chosen": 0.027014685794711113, "rewards/margins": -0.02740330807864666, "rewards/rejected": 0.054417990148067474, "step": 666 }, { "epoch": 0.10315097622269476, "grad_norm": 4.354268550872803, "learning_rate": 1.7190721649484537e-06, "logits/chosen": 14.810428619384766, "logits/rejected": 10.384760856628418, "logps/chosen": -247.15892028808594, "logps/rejected": -205.6116180419922, "loss": 0.6377, "rewards/accuracies": 0.625, "rewards/chosen": 0.0993863046169281, "rewards/margins": 0.12247291207313538, "rewards/rejected": -0.02308659814298153, "step": 667 }, { "epoch": 0.10330562536245892, "grad_norm": 6.416478633880615, "learning_rate": 1.7216494845360826e-06, "logits/chosen": 12.943609237670898, "logits/rejected": 10.954690933227539, "logps/chosen": -314.3682861328125, "logps/rejected": -257.19830322265625, "loss": 0.6339, "rewards/accuracies": 0.75, "rewards/chosen": 0.09125775843858719, "rewards/margins": 0.1301651895046234, "rewards/rejected": -0.038907431066036224, "step": 668 }, { "epoch": 0.10346027450222309, "grad_norm": 5.524900436401367, "learning_rate": 1.7242268041237115e-06, "logits/chosen": 9.195277214050293, "logits/rejected": 9.423467636108398, "logps/chosen": -292.70257568359375, "logps/rejected": -316.78643798828125, "loss": 0.7015, "rewards/accuracies": 0.625, "rewards/chosen": -0.011797618120908737, "rewards/margins": -0.01085028238594532, "rewards/rejected": -0.0009473334066569805, "step": 669 }, { "epoch": 0.10361492364198724, "grad_norm": 9.757253646850586, "learning_rate": 1.7268041237113403e-06, "logits/chosen": 14.634187698364258, "logits/rejected": 5.348187446594238, "logps/chosen": -465.4376220703125, "logps/rejected": -342.2071228027344, "loss": 0.7478, "rewards/accuracies": 0.375, "rewards/chosen": -0.03725433349609375, "rewards/margins": -0.094207763671875, "rewards/rejected": 0.05695343390107155, "step": 670 }, { "epoch": 0.1037695727817514, "grad_norm": 5.383145332336426, "learning_rate": 1.7293814432989692e-06, "logits/chosen": 13.826841354370117, "logits/rejected": 9.040946960449219, "logps/chosen": -269.61590576171875, "logps/rejected": -259.8312072753906, "loss": 0.6992, "rewards/accuracies": 0.5, "rewards/chosen": 0.01482701301574707, "rewards/margins": -0.011121200397610664, "rewards/rejected": 0.025948215276002884, "step": 671 }, { "epoch": 0.10392422192151556, "grad_norm": 5.767911911010742, "learning_rate": 1.731958762886598e-06, "logits/chosen": 12.13133430480957, "logits/rejected": 7.115202903747559, "logps/chosen": -259.7037658691406, "logps/rejected": -172.8024139404297, "loss": 0.7274, "rewards/accuracies": 0.5, "rewards/chosen": -0.07644939422607422, "rewards/margins": -0.05740680545568466, "rewards/rejected": -0.019042588770389557, "step": 672 }, { "epoch": 0.10407887106127972, "grad_norm": 6.355388164520264, "learning_rate": 1.734536082474227e-06, "logits/chosen": 10.470122337341309, "logits/rejected": 10.729490280151367, "logps/chosen": -353.16351318359375, "logps/rejected": -359.7750244140625, "loss": 0.6569, "rewards/accuracies": 0.625, "rewards/chosen": 0.10434875637292862, "rewards/margins": 0.0798313170671463, "rewards/rejected": 0.024517439305782318, "step": 673 }, { "epoch": 0.10423352020104389, "grad_norm": 5.094385623931885, "learning_rate": 1.7371134020618558e-06, "logits/chosen": 11.618303298950195, "logits/rejected": 9.52016830444336, "logps/chosen": -290.1989440917969, "logps/rejected": -239.11581420898438, "loss": 0.6721, "rewards/accuracies": 0.625, "rewards/chosen": 0.03366727754473686, "rewards/margins": 0.04694719612598419, "rewards/rejected": -0.013279914855957031, "step": 674 }, { "epoch": 0.10438816934080804, "grad_norm": 7.462923526763916, "learning_rate": 1.7396907216494847e-06, "logits/chosen": 14.113037109375, "logits/rejected": 17.088916778564453, "logps/chosen": -433.5478515625, "logps/rejected": -552.8109741210938, "loss": 0.7075, "rewards/accuracies": 0.625, "rewards/chosen": -0.010138891637325287, "rewards/margins": 0.004543498158454895, "rewards/rejected": -0.014682380482554436, "step": 675 }, { "epoch": 0.1045428184805722, "grad_norm": 5.817153453826904, "learning_rate": 1.7422680412371134e-06, "logits/chosen": 12.25714111328125, "logits/rejected": 10.902167320251465, "logps/chosen": -438.2728271484375, "logps/rejected": -402.656982421875, "loss": 0.6757, "rewards/accuracies": 0.75, "rewards/chosen": 0.005818936973810196, "rewards/margins": 0.03894777595996857, "rewards/rejected": -0.03312883526086807, "step": 676 }, { "epoch": 0.10469746762033637, "grad_norm": 4.1766204833984375, "learning_rate": 1.7448453608247422e-06, "logits/chosen": 11.384872436523438, "logits/rejected": 9.269880294799805, "logps/chosen": -235.53416442871094, "logps/rejected": -199.64015197753906, "loss": 0.6744, "rewards/accuracies": 0.5, "rewards/chosen": -0.01969142258167267, "rewards/margins": 0.040616437792778015, "rewards/rejected": -0.060307860374450684, "step": 677 }, { "epoch": 0.10485211676010052, "grad_norm": 7.616860866546631, "learning_rate": 1.747422680412371e-06, "logits/chosen": 12.98716926574707, "logits/rejected": 6.189037799835205, "logps/chosen": -346.7720031738281, "logps/rejected": -234.07763671875, "loss": 0.7155, "rewards/accuracies": 0.5, "rewards/chosen": 0.031049348413944244, "rewards/margins": -0.0368557944893837, "rewards/rejected": 0.06790514290332794, "step": 678 }, { "epoch": 0.10500676589986468, "grad_norm": 8.19212818145752, "learning_rate": 1.75e-06, "logits/chosen": 12.169953346252441, "logits/rejected": 7.735172271728516, "logps/chosen": -531.0531616210938, "logps/rejected": -340.606689453125, "loss": 0.6756, "rewards/accuracies": 0.5, "rewards/chosen": 0.06909886002540588, "rewards/margins": 0.045296572148799896, "rewards/rejected": 0.02380228042602539, "step": 679 }, { "epoch": 0.10516141503962884, "grad_norm": 4.897167205810547, "learning_rate": 1.7525773195876288e-06, "logits/chosen": 8.591468811035156, "logits/rejected": 11.312469482421875, "logps/chosen": -280.17218017578125, "logps/rejected": -281.02801513671875, "loss": 0.6602, "rewards/accuracies": 0.5, "rewards/chosen": 0.04004840925335884, "rewards/margins": 0.07665366679430008, "rewards/rejected": -0.03660526126623154, "step": 680 }, { "epoch": 0.105316064179393, "grad_norm": 5.469785213470459, "learning_rate": 1.7551546391752577e-06, "logits/chosen": 12.214111328125, "logits/rejected": 6.799766540527344, "logps/chosen": -305.38201904296875, "logps/rejected": -276.7591857910156, "loss": 0.6804, "rewards/accuracies": 0.625, "rewards/chosen": -0.015477752313017845, "rewards/margins": 0.033103086054325104, "rewards/rejected": -0.0485808402299881, "step": 681 }, { "epoch": 0.10547071331915717, "grad_norm": 5.973385334014893, "learning_rate": 1.7577319587628866e-06, "logits/chosen": 12.681440353393555, "logits/rejected": 6.3310699462890625, "logps/chosen": -371.85302734375, "logps/rejected": -261.76910400390625, "loss": 0.702, "rewards/accuracies": 0.625, "rewards/chosen": 0.002875950187444687, "rewards/margins": -0.013464685529470444, "rewards/rejected": 0.01634063757956028, "step": 682 }, { "epoch": 0.10562536245892132, "grad_norm": 4.383820056915283, "learning_rate": 1.7603092783505157e-06, "logits/chosen": 10.694823265075684, "logits/rejected": 7.794074058532715, "logps/chosen": -264.9433898925781, "logps/rejected": -207.11183166503906, "loss": 0.6724, "rewards/accuracies": 0.625, "rewards/chosen": 0.006166078150272369, "rewards/margins": 0.04402055963873863, "rewards/rejected": -0.03785448148846626, "step": 683 }, { "epoch": 0.10578001159868548, "grad_norm": 5.311039447784424, "learning_rate": 1.7628865979381445e-06, "logits/chosen": 7.801418304443359, "logits/rejected": -0.11302995681762695, "logps/chosen": -291.1820983886719, "logps/rejected": -183.00271606445312, "loss": 0.6769, "rewards/accuracies": 0.625, "rewards/chosen": -0.009747695177793503, "rewards/margins": 0.03845924139022827, "rewards/rejected": -0.04820694774389267, "step": 684 }, { "epoch": 0.10593466073844965, "grad_norm": 5.3003830909729, "learning_rate": 1.7654639175257734e-06, "logits/chosen": 10.9329252243042, "logits/rejected": 4.558310508728027, "logps/chosen": -395.85552978515625, "logps/rejected": -277.6243591308594, "loss": 0.7033, "rewards/accuracies": 0.625, "rewards/chosen": 0.0415351428091526, "rewards/margins": -0.015120504423975945, "rewards/rejected": 0.0566556453704834, "step": 685 }, { "epoch": 0.1060893098782138, "grad_norm": 8.903987884521484, "learning_rate": 1.7680412371134023e-06, "logits/chosen": 8.132436752319336, "logits/rejected": 6.30673360824585, "logps/chosen": -285.2313232421875, "logps/rejected": -332.7186279296875, "loss": 0.7038, "rewards/accuracies": 0.5, "rewards/chosen": -0.013564299792051315, "rewards/margins": -0.01806449331343174, "rewards/rejected": 0.004500195384025574, "step": 686 }, { "epoch": 0.10624395901797797, "grad_norm": 2.749605178833008, "learning_rate": 1.7706185567010312e-06, "logits/chosen": 10.753188133239746, "logits/rejected": 11.1934814453125, "logps/chosen": -112.49217987060547, "logps/rejected": -118.42501831054688, "loss": 0.6618, "rewards/accuracies": 0.75, "rewards/chosen": 0.0041128164157271385, "rewards/margins": 0.06557545065879822, "rewards/rejected": -0.0614626407623291, "step": 687 }, { "epoch": 0.10639860815774212, "grad_norm": 6.521875858306885, "learning_rate": 1.77319587628866e-06, "logits/chosen": 11.76104736328125, "logits/rejected": 14.46867561340332, "logps/chosen": -386.2987976074219, "logps/rejected": -358.4405212402344, "loss": 0.6871, "rewards/accuracies": 0.375, "rewards/chosen": 0.0009296387434005737, "rewards/margins": 0.020166778936982155, "rewards/rejected": -0.019237138330936432, "step": 688 }, { "epoch": 0.10655325729750628, "grad_norm": 4.162387371063232, "learning_rate": 1.775773195876289e-06, "logits/chosen": 13.494083404541016, "logits/rejected": 4.4459075927734375, "logps/chosen": -311.7069396972656, "logps/rejected": -192.32937622070312, "loss": 0.7064, "rewards/accuracies": 0.375, "rewards/chosen": 0.0037535633891820908, "rewards/margins": -0.01547413133084774, "rewards/rejected": 0.01922769472002983, "step": 689 }, { "epoch": 0.10670790643727045, "grad_norm": 3.5574254989624023, "learning_rate": 1.7783505154639178e-06, "logits/chosen": 9.295692443847656, "logits/rejected": 13.15600299835205, "logps/chosen": -152.50228881835938, "logps/rejected": -183.5394287109375, "loss": 0.6742, "rewards/accuracies": 0.75, "rewards/chosen": -0.02263064496219158, "rewards/margins": 0.03988990932703018, "rewards/rejected": -0.06252054870128632, "step": 690 }, { "epoch": 0.1068625555770346, "grad_norm": 3.871260643005371, "learning_rate": 1.7809278350515466e-06, "logits/chosen": 13.814468383789062, "logits/rejected": 4.13873815536499, "logps/chosen": -223.12741088867188, "logps/rejected": -127.20864868164062, "loss": 0.6302, "rewards/accuracies": 0.875, "rewards/chosen": 0.08524029701948166, "rewards/margins": 0.14162299036979675, "rewards/rejected": -0.05638270452618599, "step": 691 }, { "epoch": 0.10701720471679876, "grad_norm": 4.686367988586426, "learning_rate": 1.7835051546391755e-06, "logits/chosen": 9.703445434570312, "logits/rejected": 6.450917720794678, "logps/chosen": -242.64956665039062, "logps/rejected": -234.71563720703125, "loss": 0.6345, "rewards/accuracies": 0.875, "rewards/chosen": 0.09268367290496826, "rewards/margins": 0.1274859756231308, "rewards/rejected": -0.03480229526758194, "step": 692 }, { "epoch": 0.10717185385656293, "grad_norm": 7.197606563568115, "learning_rate": 1.7860824742268044e-06, "logits/chosen": 6.909630298614502, "logits/rejected": 4.325741767883301, "logps/chosen": -202.50653076171875, "logps/rejected": -201.484130859375, "loss": 0.7627, "rewards/accuracies": 0.25, "rewards/chosen": -0.02727665938436985, "rewards/margins": -0.12527775764465332, "rewards/rejected": 0.09800110757350922, "step": 693 }, { "epoch": 0.10732650299632708, "grad_norm": 7.277336120605469, "learning_rate": 1.7886597938144333e-06, "logits/chosen": 9.90390396118164, "logits/rejected": 6.36905574798584, "logps/chosen": -280.7347412109375, "logps/rejected": -233.89019775390625, "loss": 0.6857, "rewards/accuracies": 0.625, "rewards/chosen": 0.04039278253912926, "rewards/margins": 0.016063379123806953, "rewards/rejected": 0.024329401552677155, "step": 694 }, { "epoch": 0.10748115213609125, "grad_norm": 5.291078567504883, "learning_rate": 1.791237113402062e-06, "logits/chosen": 9.95087718963623, "logits/rejected": 11.674186706542969, "logps/chosen": -219.01148986816406, "logps/rejected": -252.99777221679688, "loss": 0.6548, "rewards/accuracies": 0.75, "rewards/chosen": 0.0033414606004953384, "rewards/margins": 0.08014824986457825, "rewards/rejected": -0.07680678367614746, "step": 695 }, { "epoch": 0.1076358012758554, "grad_norm": 7.976032733917236, "learning_rate": 1.7938144329896908e-06, "logits/chosen": 3.961123466491699, "logits/rejected": 9.905696868896484, "logps/chosen": -193.1763458251953, "logps/rejected": -311.06683349609375, "loss": 0.7636, "rewards/accuracies": 0.25, "rewards/chosen": -0.07428845763206482, "rewards/margins": -0.13129006326198578, "rewards/rejected": 0.057001590728759766, "step": 696 }, { "epoch": 0.10779045041561956, "grad_norm": 5.6274094581604, "learning_rate": 1.7963917525773197e-06, "logits/chosen": 13.91313362121582, "logits/rejected": 13.364702224731445, "logps/chosen": -237.36709594726562, "logps/rejected": -210.49732971191406, "loss": 0.7022, "rewards/accuracies": 0.375, "rewards/chosen": 0.02926046960055828, "rewards/margins": -0.01489575020968914, "rewards/rejected": 0.04415621981024742, "step": 697 }, { "epoch": 0.10794509955538373, "grad_norm": 3.6515707969665527, "learning_rate": 1.7989690721649485e-06, "logits/chosen": 6.404356956481934, "logits/rejected": 9.36177921295166, "logps/chosen": -97.04611206054688, "logps/rejected": -156.2027587890625, "loss": 0.6748, "rewards/accuracies": 0.625, "rewards/chosen": -0.003941057715564966, "rewards/margins": 0.040685079991817474, "rewards/rejected": -0.0446261391043663, "step": 698 }, { "epoch": 0.10809974869514788, "grad_norm": 4.04915714263916, "learning_rate": 1.8015463917525774e-06, "logits/chosen": 12.119953155517578, "logits/rejected": 11.199554443359375, "logps/chosen": -272.0041198730469, "logps/rejected": -259.0806579589844, "loss": 0.67, "rewards/accuracies": 0.625, "rewards/chosen": 0.06366066634654999, "rewards/margins": 0.0506453737616539, "rewards/rejected": 0.013015293516218662, "step": 699 }, { "epoch": 0.10825439783491204, "grad_norm": 4.469096660614014, "learning_rate": 1.8041237113402063e-06, "logits/chosen": 7.191540241241455, "logits/rejected": 12.124711990356445, "logps/chosen": -120.91340637207031, "logps/rejected": -182.94583129882812, "loss": 0.7016, "rewards/accuracies": 0.625, "rewards/chosen": 0.01614956744015217, "rewards/margins": -0.01321706548333168, "rewards/rejected": 0.029366634786128998, "step": 700 }, { "epoch": 0.10840904697467621, "grad_norm": 5.043667316436768, "learning_rate": 1.8067010309278352e-06, "logits/chosen": 5.647159576416016, "logits/rejected": 5.3715715408325195, "logps/chosen": -273.57574462890625, "logps/rejected": -243.84884643554688, "loss": 0.7001, "rewards/accuracies": 0.625, "rewards/chosen": 0.00802641175687313, "rewards/margins": -0.008949998766183853, "rewards/rejected": 0.016976404935121536, "step": 701 }, { "epoch": 0.10856369611444036, "grad_norm": 7.200451850891113, "learning_rate": 1.809278350515464e-06, "logits/chosen": 7.708015441894531, "logits/rejected": 10.075786590576172, "logps/chosen": -223.64666748046875, "logps/rejected": -216.2239532470703, "loss": 0.7707, "rewards/accuracies": 0.25, "rewards/chosen": -0.038020942360162735, "rewards/margins": -0.13916154205799103, "rewards/rejected": 0.101140595972538, "step": 702 }, { "epoch": 0.10871834525420453, "grad_norm": 8.820755004882812, "learning_rate": 1.811855670103093e-06, "logits/chosen": 12.685633659362793, "logits/rejected": 6.820279121398926, "logps/chosen": -452.3470153808594, "logps/rejected": -323.22589111328125, "loss": 0.7096, "rewards/accuracies": 0.5, "rewards/chosen": -0.03092489391565323, "rewards/margins": -0.021825987845659256, "rewards/rejected": -0.009098910726606846, "step": 703 }, { "epoch": 0.10887299439396868, "grad_norm": 3.7269973754882812, "learning_rate": 1.8144329896907218e-06, "logits/chosen": 12.842632293701172, "logits/rejected": 8.11235237121582, "logps/chosen": -265.1856994628906, "logps/rejected": -196.07656860351562, "loss": 0.6798, "rewards/accuracies": 0.625, "rewards/chosen": 0.010847426950931549, "rewards/margins": 0.03411087766289711, "rewards/rejected": -0.02326345443725586, "step": 704 }, { "epoch": 0.10902764353373284, "grad_norm": 7.225241184234619, "learning_rate": 1.8170103092783506e-06, "logits/chosen": 11.678240776062012, "logits/rejected": 2.29213547706604, "logps/chosen": -378.49603271484375, "logps/rejected": -187.14480590820312, "loss": 0.6924, "rewards/accuracies": 0.375, "rewards/chosen": 0.037880994379520416, "rewards/margins": 0.0048796930350363255, "rewards/rejected": 0.03300130367279053, "step": 705 }, { "epoch": 0.10918229267349701, "grad_norm": 4.838374137878418, "learning_rate": 1.8195876288659795e-06, "logits/chosen": 12.484935760498047, "logits/rejected": 12.068343162536621, "logps/chosen": -351.9620056152344, "logps/rejected": -334.5666809082031, "loss": 0.6531, "rewards/accuracies": 0.75, "rewards/chosen": 0.06834468990564346, "rewards/margins": 0.09771782159805298, "rewards/rejected": -0.02937312424182892, "step": 706 }, { "epoch": 0.10933694181326116, "grad_norm": 5.253668308258057, "learning_rate": 1.8221649484536084e-06, "logits/chosen": 12.10118579864502, "logits/rejected": 3.0983409881591797, "logps/chosen": -213.44265747070312, "logps/rejected": -169.13650512695312, "loss": 0.6807, "rewards/accuracies": 0.5, "rewards/chosen": -0.053801730275154114, "rewards/margins": 0.028064537793397903, "rewards/rejected": -0.08186626434326172, "step": 707 }, { "epoch": 0.10949159095302533, "grad_norm": 6.283231258392334, "learning_rate": 1.8247422680412373e-06, "logits/chosen": 7.627786159515381, "logits/rejected": 6.472894668579102, "logps/chosen": -293.1684265136719, "logps/rejected": -301.99786376953125, "loss": 0.7124, "rewards/accuracies": 0.625, "rewards/chosen": 0.004636000841856003, "rewards/margins": -0.023963116109371185, "rewards/rejected": 0.02859911322593689, "step": 708 }, { "epoch": 0.10964624009278949, "grad_norm": 5.071584224700928, "learning_rate": 1.827319587628866e-06, "logits/chosen": 7.017948150634766, "logits/rejected": 5.0013837814331055, "logps/chosen": -201.0968475341797, "logps/rejected": -217.12728881835938, "loss": 0.7346, "rewards/accuracies": 0.375, "rewards/chosen": 0.007269810885190964, "rewards/margins": -0.07340269535779953, "rewards/rejected": 0.0806725025177002, "step": 709 }, { "epoch": 0.10980088923255364, "grad_norm": 6.132742881774902, "learning_rate": 1.8298969072164948e-06, "logits/chosen": 10.921630859375, "logits/rejected": 5.869736671447754, "logps/chosen": -330.6422119140625, "logps/rejected": -225.01193237304688, "loss": 0.7306, "rewards/accuracies": 0.25, "rewards/chosen": -0.03784565627574921, "rewards/margins": -0.06417790055274963, "rewards/rejected": 0.02633224055171013, "step": 710 }, { "epoch": 0.10995553837231781, "grad_norm": 4.467646598815918, "learning_rate": 1.8324742268041237e-06, "logits/chosen": 14.898259162902832, "logits/rejected": 10.122255325317383, "logps/chosen": -230.3477325439453, "logps/rejected": -226.500244140625, "loss": 0.6848, "rewards/accuracies": 0.375, "rewards/chosen": -0.04691576585173607, "rewards/margins": 0.023256495594978333, "rewards/rejected": -0.0701722651720047, "step": 711 }, { "epoch": 0.11011018751208196, "grad_norm": 5.877440452575684, "learning_rate": 1.8350515463917525e-06, "logits/chosen": 7.848925590515137, "logits/rejected": 9.357367515563965, "logps/chosen": -344.7926330566406, "logps/rejected": -372.2003173828125, "loss": 0.6958, "rewards/accuracies": 0.625, "rewards/chosen": 0.057058047503232956, "rewards/margins": 0.013203656300902367, "rewards/rejected": 0.04385438188910484, "step": 712 }, { "epoch": 0.11026483665184612, "grad_norm": 3.61445951461792, "learning_rate": 1.8376288659793818e-06, "logits/chosen": 10.201498985290527, "logits/rejected": 10.242499351501465, "logps/chosen": -230.43814086914062, "logps/rejected": -193.33441162109375, "loss": 0.6518, "rewards/accuracies": 0.625, "rewards/chosen": 0.0902554988861084, "rewards/margins": 0.09338449686765671, "rewards/rejected": -0.0031290054321289062, "step": 713 }, { "epoch": 0.11041948579161029, "grad_norm": 7.836570739746094, "learning_rate": 1.8402061855670105e-06, "logits/chosen": 10.343636512756348, "logits/rejected": 12.921567916870117, "logps/chosen": -268.2269287109375, "logps/rejected": -278.09881591796875, "loss": 0.7247, "rewards/accuracies": 0.625, "rewards/chosen": -0.03335551917552948, "rewards/margins": -0.04644375666975975, "rewards/rejected": 0.013088226318359375, "step": 714 }, { "epoch": 0.11057413493137444, "grad_norm": 5.876894950866699, "learning_rate": 1.8427835051546394e-06, "logits/chosen": 7.584066867828369, "logits/rejected": 12.958361625671387, "logps/chosen": -184.07427978515625, "logps/rejected": -342.74554443359375, "loss": 0.6851, "rewards/accuracies": 0.5, "rewards/chosen": -0.01125945895910263, "rewards/margins": 0.022399526089429855, "rewards/rejected": -0.03365898132324219, "step": 715 }, { "epoch": 0.1107287840711386, "grad_norm": 4.255242347717285, "learning_rate": 1.8453608247422682e-06, "logits/chosen": 10.220253944396973, "logits/rejected": 10.728243827819824, "logps/chosen": -196.3232421875, "logps/rejected": -233.96810913085938, "loss": 0.6754, "rewards/accuracies": 0.625, "rewards/chosen": -0.028787709772586823, "rewards/margins": 0.03922424837946892, "rewards/rejected": -0.06801195442676544, "step": 716 }, { "epoch": 0.11088343321090277, "grad_norm": 5.621160984039307, "learning_rate": 1.847938144329897e-06, "logits/chosen": 11.877320289611816, "logits/rejected": 10.266227722167969, "logps/chosen": -220.4231719970703, "logps/rejected": -188.4232177734375, "loss": 0.7308, "rewards/accuracies": 0.375, "rewards/chosen": -0.07582726329565048, "rewards/margins": -0.06901021301746368, "rewards/rejected": -0.00681705679744482, "step": 717 }, { "epoch": 0.11103808235066692, "grad_norm": 4.846379280090332, "learning_rate": 1.850515463917526e-06, "logits/chosen": 11.470077514648438, "logits/rejected": 4.535397529602051, "logps/chosen": -249.92218017578125, "logps/rejected": -232.28977966308594, "loss": 0.6641, "rewards/accuracies": 0.625, "rewards/chosen": 0.012532854452729225, "rewards/margins": 0.06642108410596848, "rewards/rejected": -0.0538882240653038, "step": 718 }, { "epoch": 0.11119273149043109, "grad_norm": 6.110698699951172, "learning_rate": 1.8530927835051548e-06, "logits/chosen": 15.015692710876465, "logits/rejected": 9.169832229614258, "logps/chosen": -241.60145568847656, "logps/rejected": -238.036376953125, "loss": 0.7212, "rewards/accuracies": 0.375, "rewards/chosen": -0.04998607933521271, "rewards/margins": -0.03686261177062988, "rewards/rejected": -0.0131234647706151, "step": 719 }, { "epoch": 0.11134738063019524, "grad_norm": 4.128805637359619, "learning_rate": 1.8556701030927837e-06, "logits/chosen": 5.779862880706787, "logits/rejected": 3.782573938369751, "logps/chosen": -206.86532592773438, "logps/rejected": -224.34893798828125, "loss": 0.6883, "rewards/accuracies": 0.625, "rewards/chosen": 0.08036962151527405, "rewards/margins": 0.012198328971862793, "rewards/rejected": 0.06817128509283066, "step": 720 }, { "epoch": 0.1115020297699594, "grad_norm": 5.853085041046143, "learning_rate": 1.8582474226804126e-06, "logits/chosen": 7.352654457092285, "logits/rejected": 7.837530136108398, "logps/chosen": -309.820556640625, "logps/rejected": -359.3823547363281, "loss": 0.69, "rewards/accuracies": 0.375, "rewards/chosen": 0.028368379920721054, "rewards/margins": 0.011280491948127747, "rewards/rejected": 0.017087887972593307, "step": 721 }, { "epoch": 0.11165667890972357, "grad_norm": 5.479710578918457, "learning_rate": 1.8608247422680415e-06, "logits/chosen": 4.995455741882324, "logits/rejected": 5.5761942863464355, "logps/chosen": -201.00523376464844, "logps/rejected": -191.72509765625, "loss": 0.7481, "rewards/accuracies": 0.25, "rewards/chosen": -0.040320709347724915, "rewards/margins": -0.09750836342573166, "rewards/rejected": 0.057187654078006744, "step": 722 }, { "epoch": 0.11181132804948772, "grad_norm": 5.180700302124023, "learning_rate": 1.8634020618556703e-06, "logits/chosen": 12.719510078430176, "logits/rejected": 9.608434677124023, "logps/chosen": -365.6546630859375, "logps/rejected": -337.07769775390625, "loss": 0.6768, "rewards/accuracies": 0.75, "rewards/chosen": 0.014326428063213825, "rewards/margins": 0.044106535613536835, "rewards/rejected": -0.029780101031064987, "step": 723 }, { "epoch": 0.11196597718925189, "grad_norm": 4.572819232940674, "learning_rate": 1.8659793814432992e-06, "logits/chosen": 11.909309387207031, "logits/rejected": 5.242926120758057, "logps/chosen": -318.7596435546875, "logps/rejected": -204.9488983154297, "loss": 0.6155, "rewards/accuracies": 0.875, "rewards/chosen": 0.10008469223976135, "rewards/margins": 0.16630098223686218, "rewards/rejected": -0.06621628254652023, "step": 724 }, { "epoch": 0.11212062632901605, "grad_norm": 5.882225513458252, "learning_rate": 1.868556701030928e-06, "logits/chosen": 10.94326114654541, "logits/rejected": 7.2069549560546875, "logps/chosen": -338.3840637207031, "logps/rejected": -263.2120361328125, "loss": 0.6733, "rewards/accuracies": 0.5, "rewards/chosen": -0.02367725595831871, "rewards/margins": 0.04506239667534828, "rewards/rejected": -0.06873965263366699, "step": 725 }, { "epoch": 0.1122752754687802, "grad_norm": 6.208929061889648, "learning_rate": 1.871134020618557e-06, "logits/chosen": 7.1215925216674805, "logits/rejected": 2.3828399181365967, "logps/chosen": -271.33428955078125, "logps/rejected": -259.35662841796875, "loss": 0.7052, "rewards/accuracies": 0.375, "rewards/chosen": 0.005681419745087624, "rewards/margins": -0.008107852190732956, "rewards/rejected": 0.013789273798465729, "step": 726 }, { "epoch": 0.11242992460854437, "grad_norm": 6.331758975982666, "learning_rate": 1.8737113402061858e-06, "logits/chosen": 10.625401496887207, "logits/rejected": 7.8632588386535645, "logps/chosen": -236.84521484375, "logps/rejected": -169.7083282470703, "loss": 0.7339, "rewards/accuracies": 0.5, "rewards/chosen": -0.0017783879302442074, "rewards/margins": -0.07529734075069427, "rewards/rejected": 0.07351894676685333, "step": 727 }, { "epoch": 0.11258457374830852, "grad_norm": 5.382650375366211, "learning_rate": 1.8762886597938145e-06, "logits/chosen": 5.8831658363342285, "logits/rejected": 5.561832427978516, "logps/chosen": -288.421875, "logps/rejected": -250.149658203125, "loss": 0.6735, "rewards/accuracies": 0.5, "rewards/chosen": 0.09198927879333496, "rewards/margins": 0.051101118326187134, "rewards/rejected": 0.040888167917728424, "step": 728 }, { "epoch": 0.11273922288807268, "grad_norm": 5.3047614097595215, "learning_rate": 1.8788659793814434e-06, "logits/chosen": 9.696928024291992, "logits/rejected": 13.497712135314941, "logps/chosen": -191.14898681640625, "logps/rejected": -281.43035888671875, "loss": 0.713, "rewards/accuracies": 0.375, "rewards/chosen": 0.04756645858287811, "rewards/margins": -0.031801559031009674, "rewards/rejected": 0.07936801016330719, "step": 729 }, { "epoch": 0.11289387202783685, "grad_norm": 4.395238876342773, "learning_rate": 1.8814432989690722e-06, "logits/chosen": 6.414369583129883, "logits/rejected": 1.8793587684631348, "logps/chosen": -217.67242431640625, "logps/rejected": -179.20553588867188, "loss": 0.7062, "rewards/accuracies": 0.375, "rewards/chosen": -0.0620153471827507, "rewards/margins": -0.024518823251128197, "rewards/rejected": -0.03749651834368706, "step": 730 }, { "epoch": 0.113048521167601, "grad_norm": 5.196818828582764, "learning_rate": 1.884020618556701e-06, "logits/chosen": 3.1169824600219727, "logits/rejected": 4.476332187652588, "logps/chosen": -310.5924377441406, "logps/rejected": -323.5171203613281, "loss": 0.606, "rewards/accuracies": 0.875, "rewards/chosen": 0.10536368191242218, "rewards/margins": 0.19147217273712158, "rewards/rejected": -0.0861084908246994, "step": 731 }, { "epoch": 0.11320317030736517, "grad_norm": 5.124047756195068, "learning_rate": 1.88659793814433e-06, "logits/chosen": 13.44982624053955, "logits/rejected": 6.7886738777160645, "logps/chosen": -374.034423828125, "logps/rejected": -250.93060302734375, "loss": 0.6665, "rewards/accuracies": 0.5, "rewards/chosen": 0.04483547434210777, "rewards/margins": 0.06021089479327202, "rewards/rejected": -0.015375422313809395, "step": 732 }, { "epoch": 0.11335781944712933, "grad_norm": 4.819911479949951, "learning_rate": 1.8891752577319588e-06, "logits/chosen": 12.030155181884766, "logits/rejected": 2.9154651165008545, "logps/chosen": -333.56658935546875, "logps/rejected": -184.98818969726562, "loss": 0.6634, "rewards/accuracies": 0.625, "rewards/chosen": 0.04062338173389435, "rewards/margins": 0.06568670272827148, "rewards/rejected": -0.025063324719667435, "step": 733 }, { "epoch": 0.11351246858689348, "grad_norm": 4.5744147300720215, "learning_rate": 1.8917525773195877e-06, "logits/chosen": 8.956939697265625, "logits/rejected": 14.21845817565918, "logps/chosen": -185.28526306152344, "logps/rejected": -251.44049072265625, "loss": 0.6308, "rewards/accuracies": 0.875, "rewards/chosen": 0.09377098083496094, "rewards/margins": 0.13898378610610962, "rewards/rejected": -0.04521279036998749, "step": 734 }, { "epoch": 0.11366711772665765, "grad_norm": 11.274410247802734, "learning_rate": 1.8943298969072166e-06, "logits/chosen": 8.833852767944336, "logits/rejected": 11.25501537322998, "logps/chosen": -244.5928192138672, "logps/rejected": -269.6267395019531, "loss": 0.7713, "rewards/accuracies": 0.125, "rewards/chosen": -0.03720083460211754, "rewards/margins": -0.14621639251708984, "rewards/rejected": 0.1090155616402626, "step": 735 }, { "epoch": 0.1138217668664218, "grad_norm": 5.071261882781982, "learning_rate": 1.8969072164948455e-06, "logits/chosen": 11.275022506713867, "logits/rejected": 11.953080177307129, "logps/chosen": -285.617919921875, "logps/rejected": -308.9759216308594, "loss": 0.6671, "rewards/accuracies": 0.5, "rewards/chosen": 0.03621811047196388, "rewards/margins": 0.06337056308984756, "rewards/rejected": -0.027152445167303085, "step": 736 }, { "epoch": 0.11397641600618597, "grad_norm": 7.943196773529053, "learning_rate": 1.8994845360824743e-06, "logits/chosen": 10.303644180297852, "logits/rejected": 11.87435531616211, "logps/chosen": -214.60906982421875, "logps/rejected": -205.105712890625, "loss": 0.702, "rewards/accuracies": 0.5, "rewards/chosen": 0.043195489794015884, "rewards/margins": -0.006416483782231808, "rewards/rejected": 0.049611978232860565, "step": 737 }, { "epoch": 0.11413106514595013, "grad_norm": 8.889657020568848, "learning_rate": 1.9020618556701032e-06, "logits/chosen": 13.355319023132324, "logits/rejected": 11.208335876464844, "logps/chosen": -397.4225769042969, "logps/rejected": -385.44598388671875, "loss": 0.6435, "rewards/accuracies": 0.75, "rewards/chosen": 0.07396354526281357, "rewards/margins": 0.11955565214157104, "rewards/rejected": -0.045592114329338074, "step": 738 }, { "epoch": 0.11428571428571428, "grad_norm": 4.7317633628845215, "learning_rate": 1.904639175257732e-06, "logits/chosen": 6.910393714904785, "logits/rejected": 8.529176712036133, "logps/chosen": -223.09652709960938, "logps/rejected": -248.20338439941406, "loss": 0.714, "rewards/accuracies": 0.5, "rewards/chosen": 0.030397707596421242, "rewards/margins": -0.03460369259119034, "rewards/rejected": 0.06500139832496643, "step": 739 }, { "epoch": 0.11444036342547845, "grad_norm": 4.626914024353027, "learning_rate": 1.907216494845361e-06, "logits/chosen": 3.751765012741089, "logits/rejected": 5.63156270980835, "logps/chosen": -172.82577514648438, "logps/rejected": -230.5620574951172, "loss": 0.7216, "rewards/accuracies": 0.25, "rewards/chosen": -0.04254341125488281, "rewards/margins": -0.0542665459215641, "rewards/rejected": 0.011723138391971588, "step": 740 }, { "epoch": 0.11459501256524261, "grad_norm": 5.269464492797852, "learning_rate": 1.90979381443299e-06, "logits/chosen": 6.896068572998047, "logits/rejected": 4.070405960083008, "logps/chosen": -253.23797607421875, "logps/rejected": -240.21958923339844, "loss": 0.6954, "rewards/accuracies": 0.5, "rewards/chosen": 0.04485926777124405, "rewards/margins": 9.636580944061279e-05, "rewards/rejected": 0.04476289823651314, "step": 741 }, { "epoch": 0.11474966170500676, "grad_norm": 5.3275370597839355, "learning_rate": 1.9123711340206187e-06, "logits/chosen": 13.170534133911133, "logits/rejected": 9.266887664794922, "logps/chosen": -320.3880615234375, "logps/rejected": -287.1246643066406, "loss": 0.6922, "rewards/accuracies": 0.5, "rewards/chosen": -0.022713851183652878, "rewards/margins": 0.01951933279633522, "rewards/rejected": -0.0422331802546978, "step": 742 }, { "epoch": 0.11490431084477093, "grad_norm": 3.633965492248535, "learning_rate": 1.9149484536082476e-06, "logits/chosen": 12.454980850219727, "logits/rejected": 0.3428466320037842, "logps/chosen": -177.30487060546875, "logps/rejected": -80.21995544433594, "loss": 0.6473, "rewards/accuracies": 0.75, "rewards/chosen": 0.07775764167308807, "rewards/margins": 0.09602270275354385, "rewards/rejected": -0.01826505735516548, "step": 743 }, { "epoch": 0.11505895998453508, "grad_norm": 7.145014762878418, "learning_rate": 1.9175257731958764e-06, "logits/chosen": 11.250179290771484, "logits/rejected": 14.61050796508789, "logps/chosen": -230.84727478027344, "logps/rejected": -364.293701171875, "loss": 0.6433, "rewards/accuracies": 1.0, "rewards/chosen": -0.010029507800936699, "rewards/margins": 0.10356760025024414, "rewards/rejected": -0.11359710991382599, "step": 744 }, { "epoch": 0.11521360912429925, "grad_norm": 5.105704307556152, "learning_rate": 1.9201030927835053e-06, "logits/chosen": 10.922404289245605, "logits/rejected": 13.398233413696289, "logps/chosen": -254.3232421875, "logps/rejected": -341.56536865234375, "loss": 0.7304, "rewards/accuracies": 0.25, "rewards/chosen": -0.08438950031995773, "rewards/margins": -0.06996269524097443, "rewards/rejected": -0.014426801353693008, "step": 745 }, { "epoch": 0.11536825826406341, "grad_norm": 4.958789348602295, "learning_rate": 1.922680412371134e-06, "logits/chosen": 14.878204345703125, "logits/rejected": 6.612778186798096, "logps/chosen": -401.16156005859375, "logps/rejected": -341.08294677734375, "loss": 0.6621, "rewards/accuracies": 0.375, "rewards/chosen": 0.047301530838012695, "rewards/margins": 0.08037014305591583, "rewards/rejected": -0.03306861221790314, "step": 746 }, { "epoch": 0.11552290740382756, "grad_norm": 5.68366003036499, "learning_rate": 1.925257731958763e-06, "logits/chosen": 8.169615745544434, "logits/rejected": 12.711262702941895, "logps/chosen": -352.0115051269531, "logps/rejected": -413.6510925292969, "loss": 0.6664, "rewards/accuracies": 0.625, "rewards/chosen": 0.0029344093054533005, "rewards/margins": 0.07123227417469025, "rewards/rejected": -0.0682978630065918, "step": 747 }, { "epoch": 0.11567755654359173, "grad_norm": 5.375355243682861, "learning_rate": 1.927835051546392e-06, "logits/chosen": 13.744950294494629, "logits/rejected": 14.151660919189453, "logps/chosen": -311.1435546875, "logps/rejected": -313.9859619140625, "loss": 0.7034, "rewards/accuracies": 0.25, "rewards/chosen": 0.03731326758861542, "rewards/margins": -0.014317415654659271, "rewards/rejected": 0.05163068696856499, "step": 748 }, { "epoch": 0.11583220568335588, "grad_norm": 5.679323673248291, "learning_rate": 1.930412371134021e-06, "logits/chosen": 10.623978614807129, "logits/rejected": 10.223285675048828, "logps/chosen": -310.0151062011719, "logps/rejected": -344.6908264160156, "loss": 0.6705, "rewards/accuracies": 0.5, "rewards/chosen": 0.06432309746742249, "rewards/margins": 0.055307816714048386, "rewards/rejected": 0.009015275165438652, "step": 749 }, { "epoch": 0.11598685482312004, "grad_norm": 5.993717193603516, "learning_rate": 1.9329896907216497e-06, "logits/chosen": 12.796170234680176, "logits/rejected": 8.972481727600098, "logps/chosen": -377.73797607421875, "logps/rejected": -269.9862365722656, "loss": 0.6818, "rewards/accuracies": 0.375, "rewards/chosen": 0.04607239365577698, "rewards/margins": 0.02355222962796688, "rewards/rejected": 0.022520162165164948, "step": 750 }, { "epoch": 0.11614150396288421, "grad_norm": 6.605037212371826, "learning_rate": 1.9355670103092785e-06, "logits/chosen": 5.111069679260254, "logits/rejected": 3.73654842376709, "logps/chosen": -196.4358367919922, "logps/rejected": -231.71827697753906, "loss": 0.647, "rewards/accuracies": 0.75, "rewards/chosen": 0.05673956498503685, "rewards/margins": 0.09972162544727325, "rewards/rejected": -0.042982056736946106, "step": 751 }, { "epoch": 0.11629615310264836, "grad_norm": 6.3636860847473145, "learning_rate": 1.9381443298969074e-06, "logits/chosen": 9.221412658691406, "logits/rejected": 5.37880277633667, "logps/chosen": -341.12127685546875, "logps/rejected": -292.807373046875, "loss": 0.7052, "rewards/accuracies": 0.375, "rewards/chosen": -0.03568115085363388, "rewards/margins": -0.017798233777284622, "rewards/rejected": -0.017882922664284706, "step": 752 }, { "epoch": 0.11645080224241253, "grad_norm": 6.042775630950928, "learning_rate": 1.9407216494845363e-06, "logits/chosen": 10.474544525146484, "logits/rejected": 12.03170394897461, "logps/chosen": -216.24810791015625, "logps/rejected": -325.63226318359375, "loss": 0.6611, "rewards/accuracies": 0.75, "rewards/chosen": -0.03577017784118652, "rewards/margins": 0.0670805424451828, "rewards/rejected": -0.10285072773694992, "step": 753 }, { "epoch": 0.11660545138217669, "grad_norm": 3.8389718532562256, "learning_rate": 1.943298969072165e-06, "logits/chosen": 8.815988540649414, "logits/rejected": 2.8289542198181152, "logps/chosen": -184.33404541015625, "logps/rejected": -172.734375, "loss": 0.6911, "rewards/accuracies": 0.5, "rewards/chosen": -0.003516819328069687, "rewards/margins": 0.005642890930175781, "rewards/rejected": -0.009159708395600319, "step": 754 }, { "epoch": 0.11676010052194084, "grad_norm": 4.905189514160156, "learning_rate": 1.945876288659794e-06, "logits/chosen": 7.152502059936523, "logits/rejected": 7.8270649909973145, "logps/chosen": -264.0049133300781, "logps/rejected": -260.3672790527344, "loss": 0.6685, "rewards/accuracies": 0.625, "rewards/chosen": 0.056339167058467865, "rewards/margins": 0.05267782509326935, "rewards/rejected": 0.0036613456904888153, "step": 755 }, { "epoch": 0.11691474966170501, "grad_norm": 3.4967939853668213, "learning_rate": 1.948453608247423e-06, "logits/chosen": 7.135779857635498, "logits/rejected": 5.624508857727051, "logps/chosen": -190.70980834960938, "logps/rejected": -167.38829040527344, "loss": 0.6475, "rewards/accuracies": 0.375, "rewards/chosen": 0.05304960906505585, "rewards/margins": 0.10574927181005478, "rewards/rejected": -0.05269966274499893, "step": 756 }, { "epoch": 0.11706939880146916, "grad_norm": 5.922613620758057, "learning_rate": 1.9510309278350518e-06, "logits/chosen": 9.0873384475708, "logits/rejected": 7.914632320404053, "logps/chosen": -404.3426208496094, "logps/rejected": -358.2450866699219, "loss": 0.6516, "rewards/accuracies": 0.5, "rewards/chosen": 0.12389259040355682, "rewards/margins": 0.09641342610120773, "rewards/rejected": 0.027479171752929688, "step": 757 }, { "epoch": 0.11722404794123333, "grad_norm": 50.92366409301758, "learning_rate": 1.9536082474226806e-06, "logits/chosen": 7.9488325119018555, "logits/rejected": 13.052627563476562, "logps/chosen": -163.4320068359375, "logps/rejected": -236.76510620117188, "loss": 0.6034, "rewards/accuracies": 0.875, "rewards/chosen": 0.05622458457946777, "rewards/margins": 0.20247511565685272, "rewards/rejected": -0.14625054597854614, "step": 758 }, { "epoch": 0.11737869708099749, "grad_norm": 4.3590240478515625, "learning_rate": 1.9561855670103095e-06, "logits/chosen": 8.652853012084961, "logits/rejected": 3.7794785499572754, "logps/chosen": -310.9944763183594, "logps/rejected": -201.96273803710938, "loss": 0.6033, "rewards/accuracies": 1.0, "rewards/chosen": 0.1254536658525467, "rewards/margins": 0.19502457976341248, "rewards/rejected": -0.06957092136144638, "step": 759 }, { "epoch": 0.11753334622076164, "grad_norm": 4.805168628692627, "learning_rate": 1.9587628865979384e-06, "logits/chosen": 11.73290729522705, "logits/rejected": 10.892315864562988, "logps/chosen": -280.9211730957031, "logps/rejected": -273.315673828125, "loss": 0.6826, "rewards/accuracies": 0.375, "rewards/chosen": 0.07339353859424591, "rewards/margins": 0.022656060755252838, "rewards/rejected": 0.050737474113702774, "step": 760 }, { "epoch": 0.11768799536052581, "grad_norm": 4.360302925109863, "learning_rate": 1.9613402061855673e-06, "logits/chosen": 8.350854873657227, "logits/rejected": 10.155590057373047, "logps/chosen": -282.90374755859375, "logps/rejected": -316.4969482421875, "loss": 0.6799, "rewards/accuracies": 0.625, "rewards/chosen": 0.01883707195520401, "rewards/margins": 0.0349605567753315, "rewards/rejected": -0.016123484820127487, "step": 761 }, { "epoch": 0.11784264450028997, "grad_norm": 5.7184624671936035, "learning_rate": 1.963917525773196e-06, "logits/chosen": 10.24630069732666, "logits/rejected": 14.16028881072998, "logps/chosen": -329.63446044921875, "logps/rejected": -369.6271057128906, "loss": 0.6652, "rewards/accuracies": 0.5, "rewards/chosen": 0.04944153130054474, "rewards/margins": 0.09121362119913101, "rewards/rejected": -0.04177207872271538, "step": 762 }, { "epoch": 0.11799729364005412, "grad_norm": 6.68346643447876, "learning_rate": 1.966494845360825e-06, "logits/chosen": 11.568010330200195, "logits/rejected": 4.642280578613281, "logps/chosen": -424.32977294921875, "logps/rejected": -360.83892822265625, "loss": 0.6274, "rewards/accuracies": 0.5, "rewards/chosen": 0.17120762169361115, "rewards/margins": 0.1653505265712738, "rewards/rejected": 0.00585708674043417, "step": 763 }, { "epoch": 0.11815194277981829, "grad_norm": 5.750217914581299, "learning_rate": 1.969072164948454e-06, "logits/chosen": 16.134275436401367, "logits/rejected": 11.000141143798828, "logps/chosen": -325.0711669921875, "logps/rejected": -225.18186950683594, "loss": 0.6873, "rewards/accuracies": 0.5, "rewards/chosen": 0.06829166412353516, "rewards/margins": 0.018773367628455162, "rewards/rejected": 0.04951830208301544, "step": 764 }, { "epoch": 0.11830659191958244, "grad_norm": 25.471750259399414, "learning_rate": 1.9716494845360827e-06, "logits/chosen": 5.202812194824219, "logits/rejected": 9.177970886230469, "logps/chosen": -228.17567443847656, "logps/rejected": -225.7200164794922, "loss": 0.7583, "rewards/accuracies": 0.25, "rewards/chosen": 0.009062983095645905, "rewards/margins": -0.11821872740983963, "rewards/rejected": 0.12728172540664673, "step": 765 }, { "epoch": 0.1184612410593466, "grad_norm": 6.5312676429748535, "learning_rate": 1.9742268041237116e-06, "logits/chosen": 9.137779235839844, "logits/rejected": 9.747842788696289, "logps/chosen": -275.69580078125, "logps/rejected": -245.17681884765625, "loss": 0.7522, "rewards/accuracies": 0.25, "rewards/chosen": 0.023091748356819153, "rewards/margins": -0.08754248917102814, "rewards/rejected": 0.1106342300772667, "step": 766 }, { "epoch": 0.11861589019911077, "grad_norm": 4.121193885803223, "learning_rate": 1.9768041237113405e-06, "logits/chosen": 11.731197357177734, "logits/rejected": 5.214690208435059, "logps/chosen": -307.0433654785156, "logps/rejected": -128.8443145751953, "loss": 0.689, "rewards/accuracies": 0.5, "rewards/chosen": -0.007788658142089844, "rewards/margins": 0.011392402462661266, "rewards/rejected": -0.019181059673428535, "step": 767 }, { "epoch": 0.11877053933887492, "grad_norm": 5.948873996734619, "learning_rate": 1.979381443298969e-06, "logits/chosen": -1.1010417938232422, "logits/rejected": 3.06386137008667, "logps/chosen": -277.4606628417969, "logps/rejected": -318.19659423828125, "loss": 0.6732, "rewards/accuracies": 0.625, "rewards/chosen": 0.09069681167602539, "rewards/margins": 0.05126165971159935, "rewards/rejected": 0.03943514823913574, "step": 768 }, { "epoch": 0.11892518847863909, "grad_norm": 4.787690162658691, "learning_rate": 1.981958762886598e-06, "logits/chosen": 11.404735565185547, "logits/rejected": 8.035576820373535, "logps/chosen": -182.26968383789062, "logps/rejected": -146.99777221679688, "loss": 0.7037, "rewards/accuracies": 0.5, "rewards/chosen": -0.037906453013420105, "rewards/margins": -0.004474181216210127, "rewards/rejected": -0.03343227133154869, "step": 769 }, { "epoch": 0.11907983761840325, "grad_norm": 5.287943363189697, "learning_rate": 1.9845360824742267e-06, "logits/chosen": 14.088287353515625, "logits/rejected": 11.371744155883789, "logps/chosen": -342.7298889160156, "logps/rejected": -270.8135070800781, "loss": 0.6924, "rewards/accuracies": 0.5, "rewards/chosen": 0.09487209469079971, "rewards/margins": 0.00558633916079998, "rewards/rejected": 0.08928576111793518, "step": 770 }, { "epoch": 0.1192344867581674, "grad_norm": 4.892053127288818, "learning_rate": 1.9871134020618556e-06, "logits/chosen": 5.921697616577148, "logits/rejected": 2.2698230743408203, "logps/chosen": -271.9732666015625, "logps/rejected": -201.50393676757812, "loss": 0.7266, "rewards/accuracies": 0.375, "rewards/chosen": 0.0035150516778230667, "rewards/margins": -0.049635402858257294, "rewards/rejected": 0.05315046012401581, "step": 771 }, { "epoch": 0.11938913589793157, "grad_norm": 4.96153450012207, "learning_rate": 1.9896907216494844e-06, "logits/chosen": 6.176779747009277, "logits/rejected": 2.5378684997558594, "logps/chosen": -200.56024169921875, "logps/rejected": -211.8494110107422, "loss": 0.659, "rewards/accuracies": 0.625, "rewards/chosen": 0.04171309247612953, "rewards/margins": 0.0731162503361702, "rewards/rejected": -0.03140316158533096, "step": 772 }, { "epoch": 0.11954378503769572, "grad_norm": 6.2721381187438965, "learning_rate": 1.9922680412371137e-06, "logits/chosen": 7.779531478881836, "logits/rejected": 5.811192989349365, "logps/chosen": -355.29254150390625, "logps/rejected": -382.0240173339844, "loss": 0.7882, "rewards/accuracies": 0.0, "rewards/chosen": 0.011780645698308945, "rewards/margins": -0.17687024176120758, "rewards/rejected": 0.18865087628364563, "step": 773 }, { "epoch": 0.11969843417745989, "grad_norm": 4.7964653968811035, "learning_rate": 1.9948453608247426e-06, "logits/chosen": 12.759563446044922, "logits/rejected": 8.546693801879883, "logps/chosen": -227.78878784179688, "logps/rejected": -184.16769409179688, "loss": 0.636, "rewards/accuracies": 0.875, "rewards/chosen": 0.1331547200679779, "rewards/margins": 0.1289692223072052, "rewards/rejected": 0.004185512661933899, "step": 774 }, { "epoch": 0.11985308331722405, "grad_norm": 5.230913162231445, "learning_rate": 1.9974226804123715e-06, "logits/chosen": 8.235557556152344, "logits/rejected": 11.511534690856934, "logps/chosen": -229.26487731933594, "logps/rejected": -298.178466796875, "loss": 0.676, "rewards/accuracies": 0.5, "rewards/chosen": 0.07019595801830292, "rewards/margins": 0.04300350695848465, "rewards/rejected": 0.02719244733452797, "step": 775 }, { "epoch": 0.1200077324569882, "grad_norm": 4.418065547943115, "learning_rate": 2.0000000000000003e-06, "logits/chosen": 10.97363567352295, "logits/rejected": 11.506315231323242, "logps/chosen": -261.0842590332031, "logps/rejected": -276.09649658203125, "loss": 0.6842, "rewards/accuracies": 0.625, "rewards/chosen": -0.012931514531373978, "rewards/margins": 0.030180953443050385, "rewards/rejected": -0.043112464249134064, "step": 776 }, { "epoch": 0.12016238159675237, "grad_norm": 4.583871364593506, "learning_rate": 2.002577319587629e-06, "logits/chosen": 5.414307594299316, "logits/rejected": 3.444096565246582, "logps/chosen": -315.27557373046875, "logps/rejected": -214.92742919921875, "loss": 0.6907, "rewards/accuracies": 0.625, "rewards/chosen": 0.07584243267774582, "rewards/margins": 0.010672946460545063, "rewards/rejected": 0.06516948342323303, "step": 777 }, { "epoch": 0.12031703073651653, "grad_norm": 4.775980472564697, "learning_rate": 2.005154639175258e-06, "logits/chosen": 6.203409671783447, "logits/rejected": 5.393270015716553, "logps/chosen": -152.17747497558594, "logps/rejected": -174.61561584472656, "loss": 0.7065, "rewards/accuracies": 0.375, "rewards/chosen": -0.10393404960632324, "rewards/margins": -0.022334814071655273, "rewards/rejected": -0.08159923553466797, "step": 778 }, { "epoch": 0.12047167987628069, "grad_norm": 4.170057773590088, "learning_rate": 2.007731958762887e-06, "logits/chosen": 8.063127517700195, "logits/rejected": 7.265625953674316, "logps/chosen": -179.56381225585938, "logps/rejected": -182.98263549804688, "loss": 0.6838, "rewards/accuracies": 0.375, "rewards/chosen": 0.10676445066928864, "rewards/margins": 0.028914690017700195, "rewards/rejected": 0.07784977555274963, "step": 779 }, { "epoch": 0.12062632901604485, "grad_norm": 4.094460964202881, "learning_rate": 2.010309278350516e-06, "logits/chosen": 13.347660064697266, "logits/rejected": 9.314810752868652, "logps/chosen": -252.34799194335938, "logps/rejected": -241.8000946044922, "loss": 0.6382, "rewards/accuracies": 0.625, "rewards/chosen": 0.08473721146583557, "rewards/margins": 0.1212126761674881, "rewards/rejected": -0.036475468426942825, "step": 780 }, { "epoch": 0.120780978155809, "grad_norm": 3.916102647781372, "learning_rate": 2.0128865979381447e-06, "logits/chosen": 2.2494542598724365, "logits/rejected": 9.365983963012695, "logps/chosen": -129.24871826171875, "logps/rejected": -147.7412109375, "loss": 0.6704, "rewards/accuracies": 0.625, "rewards/chosen": -8.373335003852844e-05, "rewards/margins": 0.04763312637805939, "rewards/rejected": -0.04771685600280762, "step": 781 }, { "epoch": 0.12093562729557317, "grad_norm": 4.596820831298828, "learning_rate": 2.0154639175257736e-06, "logits/chosen": 14.319719314575195, "logits/rejected": 13.616143226623535, "logps/chosen": -244.30213928222656, "logps/rejected": -255.2755584716797, "loss": 0.6644, "rewards/accuracies": 0.75, "rewards/chosen": 0.05731811374425888, "rewards/margins": 0.060666944831609726, "rewards/rejected": -0.003348827362060547, "step": 782 }, { "epoch": 0.12109027643533733, "grad_norm": 7.918374061584473, "learning_rate": 2.0180412371134024e-06, "logits/chosen": 14.829571723937988, "logits/rejected": 12.51628589630127, "logps/chosen": -223.22036743164062, "logps/rejected": -236.76748657226562, "loss": 0.7175, "rewards/accuracies": 0.375, "rewards/chosen": 0.007377816364169121, "rewards/margins": -0.03495025634765625, "rewards/rejected": 0.04232807084918022, "step": 783 }, { "epoch": 0.12124492557510148, "grad_norm": 4.7898030281066895, "learning_rate": 2.0206185567010313e-06, "logits/chosen": 9.497858047485352, "logits/rejected": 12.780324935913086, "logps/chosen": -190.3990936279297, "logps/rejected": -213.04229736328125, "loss": 0.6751, "rewards/accuracies": 0.5, "rewards/chosen": 0.00687398761510849, "rewards/margins": 0.04763360694050789, "rewards/rejected": -0.0407596156001091, "step": 784 }, { "epoch": 0.12139957471486565, "grad_norm": 5.661484718322754, "learning_rate": 2.02319587628866e-06, "logits/chosen": 8.925309181213379, "logits/rejected": 6.319267272949219, "logps/chosen": -194.45753479003906, "logps/rejected": -164.46499633789062, "loss": 0.6892, "rewards/accuracies": 0.625, "rewards/chosen": -0.0011563310399651527, "rewards/margins": 0.009312868118286133, "rewards/rejected": -0.010469197295606136, "step": 785 }, { "epoch": 0.12155422385462981, "grad_norm": 6.201214790344238, "learning_rate": 2.025773195876289e-06, "logits/chosen": 15.013701438903809, "logits/rejected": 11.145907402038574, "logps/chosen": -433.2425231933594, "logps/rejected": -408.227294921875, "loss": 0.669, "rewards/accuracies": 0.625, "rewards/chosen": 0.03738289326429367, "rewards/margins": 0.051407914608716965, "rewards/rejected": -0.014025024138391018, "step": 786 }, { "epoch": 0.12170887299439397, "grad_norm": 4.733052730560303, "learning_rate": 2.0283505154639175e-06, "logits/chosen": 8.976184844970703, "logits/rejected": 0.9876854419708252, "logps/chosen": -195.0241241455078, "logps/rejected": -126.92036437988281, "loss": 0.6738, "rewards/accuracies": 0.625, "rewards/chosen": 0.032509900629520416, "rewards/margins": 0.04624588415026665, "rewards/rejected": -0.01373598538339138, "step": 787 }, { "epoch": 0.12186352213415813, "grad_norm": 5.2377777099609375, "learning_rate": 2.0309278350515464e-06, "logits/chosen": 9.633549690246582, "logits/rejected": 8.667489051818848, "logps/chosen": -242.53811645507812, "logps/rejected": -229.4419708251953, "loss": 0.7222, "rewards/accuracies": 0.125, "rewards/chosen": 0.029599763453006744, "rewards/margins": -0.0503387451171875, "rewards/rejected": 0.07993850857019424, "step": 788 }, { "epoch": 0.12201817127392228, "grad_norm": 6.034938335418701, "learning_rate": 2.0335051546391752e-06, "logits/chosen": 7.839482307434082, "logits/rejected": 6.206631660461426, "logps/chosen": -354.34478759765625, "logps/rejected": -278.4581298828125, "loss": 0.6565, "rewards/accuracies": 0.5, "rewards/chosen": 0.08779926598072052, "rewards/margins": 0.08366823196411133, "rewards/rejected": 0.004131029359996319, "step": 789 }, { "epoch": 0.12217282041368645, "grad_norm": 4.621279239654541, "learning_rate": 2.036082474226804e-06, "logits/chosen": 13.00024700164795, "logits/rejected": 13.73876953125, "logps/chosen": -241.05104064941406, "logps/rejected": -224.72862243652344, "loss": 0.7827, "rewards/accuracies": 0.0, "rewards/chosen": -0.03783273696899414, "rewards/margins": -0.16726845502853394, "rewards/rejected": 0.129435732960701, "step": 790 }, { "epoch": 0.12232746955345061, "grad_norm": 5.090161323547363, "learning_rate": 2.038659793814433e-06, "logits/chosen": 6.883066177368164, "logits/rejected": 3.850621223449707, "logps/chosen": -175.68406677246094, "logps/rejected": -196.53976440429688, "loss": 0.6936, "rewards/accuracies": 0.375, "rewards/chosen": 0.0826881155371666, "rewards/margins": 0.017871970310807228, "rewards/rejected": 0.06481613963842392, "step": 791 }, { "epoch": 0.12248211869321476, "grad_norm": 5.466822147369385, "learning_rate": 2.041237113402062e-06, "logits/chosen": 7.592716217041016, "logits/rejected": 11.036369323730469, "logps/chosen": -282.68658447265625, "logps/rejected": -372.5218200683594, "loss": 0.6922, "rewards/accuracies": 0.375, "rewards/chosen": 0.02853524498641491, "rewards/margins": 0.007233594078570604, "rewards/rejected": 0.02130165323615074, "step": 792 }, { "epoch": 0.12263676783297893, "grad_norm": 5.506161212921143, "learning_rate": 2.0438144329896907e-06, "logits/chosen": 4.128499507904053, "logits/rejected": 5.9841389656066895, "logps/chosen": -328.60919189453125, "logps/rejected": -304.7389221191406, "loss": 0.7031, "rewards/accuracies": 0.625, "rewards/chosen": 0.04923431575298309, "rewards/margins": -0.006314259022474289, "rewards/rejected": 0.055548571050167084, "step": 793 }, { "epoch": 0.1227914169727431, "grad_norm": 5.09592866897583, "learning_rate": 2.0463917525773196e-06, "logits/chosen": 11.008781433105469, "logits/rejected": 13.028718948364258, "logps/chosen": -283.5499267578125, "logps/rejected": -314.2843322753906, "loss": 0.693, "rewards/accuracies": 0.375, "rewards/chosen": 0.06488609313964844, "rewards/margins": 0.0037350673228502274, "rewards/rejected": 0.06115102767944336, "step": 794 }, { "epoch": 0.12294606611250725, "grad_norm": 4.265661239624023, "learning_rate": 2.0489690721649485e-06, "logits/chosen": 13.848640441894531, "logits/rejected": 11.360034942626953, "logps/chosen": -296.02288818359375, "logps/rejected": -225.19744873046875, "loss": 0.6794, "rewards/accuracies": 0.5, "rewards/chosen": 0.017357969656586647, "rewards/margins": 0.03893432393670082, "rewards/rejected": -0.021576358005404472, "step": 795 }, { "epoch": 0.12310071525227141, "grad_norm": 4.471963882446289, "learning_rate": 2.0515463917525773e-06, "logits/chosen": 4.800446510314941, "logits/rejected": 12.012195587158203, "logps/chosen": -187.22003173828125, "logps/rejected": -246.9071044921875, "loss": 0.6755, "rewards/accuracies": 0.625, "rewards/chosen": 0.020716626197099686, "rewards/margins": 0.04033312946557999, "rewards/rejected": -0.019616510719060898, "step": 796 }, { "epoch": 0.12325536439203556, "grad_norm": 5.098708629608154, "learning_rate": 2.0541237113402062e-06, "logits/chosen": 8.195775985717773, "logits/rejected": 15.901062965393066, "logps/chosen": -197.41647338867188, "logps/rejected": -285.45166015625, "loss": 0.6833, "rewards/accuracies": 0.5, "rewards/chosen": 0.004872034303843975, "rewards/margins": 0.040291640907526016, "rewards/rejected": -0.035419613122940063, "step": 797 }, { "epoch": 0.12341001353179973, "grad_norm": 5.737367153167725, "learning_rate": 2.056701030927835e-06, "logits/chosen": 13.195758819580078, "logits/rejected": 3.952853202819824, "logps/chosen": -405.4150085449219, "logps/rejected": -279.95404052734375, "loss": 0.6469, "rewards/accuracies": 0.625, "rewards/chosen": 0.10786724090576172, "rewards/margins": 0.1071086898446083, "rewards/rejected": 0.0007585529237985611, "step": 798 }, { "epoch": 0.1235646626715639, "grad_norm": 4.648750305175781, "learning_rate": 2.059278350515464e-06, "logits/chosen": 6.546786785125732, "logits/rejected": 10.23299503326416, "logps/chosen": -195.3010711669922, "logps/rejected": -216.12734985351562, "loss": 0.6892, "rewards/accuracies": 0.625, "rewards/chosen": 0.04346632957458496, "rewards/margins": 0.010684346780180931, "rewards/rejected": 0.03278198093175888, "step": 799 }, { "epoch": 0.12371931181132804, "grad_norm": 5.58543062210083, "learning_rate": 2.061855670103093e-06, "logits/chosen": 9.791015625, "logits/rejected": 9.70248031616211, "logps/chosen": -317.1544189453125, "logps/rejected": -342.6253356933594, "loss": 0.6238, "rewards/accuracies": 0.625, "rewards/chosen": 0.13601836562156677, "rewards/margins": 0.15982751548290253, "rewards/rejected": -0.023809146136045456, "step": 800 }, { "epoch": 0.12387396095109221, "grad_norm": 5.999906063079834, "learning_rate": 2.0644329896907217e-06, "logits/chosen": 15.210868835449219, "logits/rejected": 10.143805503845215, "logps/chosen": -367.5404052734375, "logps/rejected": -262.6044006347656, "loss": 0.6791, "rewards/accuracies": 0.75, "rewards/chosen": 0.11988945305347443, "rewards/margins": 0.05030715465545654, "rewards/rejected": 0.06958229839801788, "step": 801 }, { "epoch": 0.12402861009085638, "grad_norm": 12.851231575012207, "learning_rate": 2.0670103092783506e-06, "logits/chosen": 18.283771514892578, "logits/rejected": 9.252113342285156, "logps/chosen": -371.8199768066406, "logps/rejected": -262.4722900390625, "loss": 0.6784, "rewards/accuracies": 0.625, "rewards/chosen": 0.03439944237470627, "rewards/margins": 0.03801487386226654, "rewards/rejected": -0.003615431487560272, "step": 802 }, { "epoch": 0.12418325923062053, "grad_norm": 3.7239251136779785, "learning_rate": 2.0695876288659794e-06, "logits/chosen": 14.889500617980957, "logits/rejected": 16.419139862060547, "logps/chosen": -185.39463806152344, "logps/rejected": -192.43190002441406, "loss": 0.7052, "rewards/accuracies": 0.375, "rewards/chosen": -0.02721242792904377, "rewards/margins": -0.021084880456328392, "rewards/rejected": -0.00612754886969924, "step": 803 }, { "epoch": 0.12433790837038469, "grad_norm": 7.316143035888672, "learning_rate": 2.0721649484536087e-06, "logits/chosen": 8.062252044677734, "logits/rejected": 7.828424453735352, "logps/chosen": -358.885986328125, "logps/rejected": -353.7328186035156, "loss": 0.74, "rewards/accuracies": 0.375, "rewards/chosen": -0.08713152259588242, "rewards/margins": -0.061764907091856, "rewards/rejected": -0.02536662295460701, "step": 804 }, { "epoch": 0.12449255751014884, "grad_norm": 5.290778636932373, "learning_rate": 2.0747422680412376e-06, "logits/chosen": 8.164313316345215, "logits/rejected": 7.313644886016846, "logps/chosen": -380.3600158691406, "logps/rejected": -335.2882080078125, "loss": 0.6627, "rewards/accuracies": 0.625, "rewards/chosen": -0.0015501962043344975, "rewards/margins": 0.06636910140514374, "rewards/rejected": -0.0679192990064621, "step": 805 }, { "epoch": 0.12464720664991301, "grad_norm": 4.303194046020508, "learning_rate": 2.077319587628866e-06, "logits/chosen": 6.468660831451416, "logits/rejected": 9.87783432006836, "logps/chosen": -166.953857421875, "logps/rejected": -272.2720642089844, "loss": 0.6919, "rewards/accuracies": 0.375, "rewards/chosen": -0.01073012501001358, "rewards/margins": 0.0056481375358998775, "rewards/rejected": -0.016378259286284447, "step": 806 }, { "epoch": 0.12480185578967717, "grad_norm": 5.501675128936768, "learning_rate": 2.079896907216495e-06, "logits/chosen": 15.556056022644043, "logits/rejected": 11.417365074157715, "logps/chosen": -347.63348388671875, "logps/rejected": -239.30458068847656, "loss": 0.6314, "rewards/accuracies": 0.875, "rewards/chosen": 0.12454967200756073, "rewards/margins": 0.13267840445041656, "rewards/rejected": -0.008128738962113857, "step": 807 }, { "epoch": 0.12495650492944133, "grad_norm": 4.462109088897705, "learning_rate": 2.082474226804124e-06, "logits/chosen": 11.569940567016602, "logits/rejected": 4.980714797973633, "logps/chosen": -225.48834228515625, "logps/rejected": -188.32470703125, "loss": 0.6275, "rewards/accuracies": 0.5, "rewards/chosen": 0.1012401133775711, "rewards/margins": 0.14932766556739807, "rewards/rejected": -0.04808754846453667, "step": 808 }, { "epoch": 0.12511115406920548, "grad_norm": 5.018195629119873, "learning_rate": 2.0850515463917527e-06, "logits/chosen": 6.377014636993408, "logits/rejected": 9.590864181518555, "logps/chosen": -304.1554870605469, "logps/rejected": -264.3926696777344, "loss": 0.7024, "rewards/accuracies": 0.5, "rewards/chosen": 0.04383482784032822, "rewards/margins": -0.0163697712123394, "rewards/rejected": 0.060204602777957916, "step": 809 }, { "epoch": 0.12526580320896966, "grad_norm": 5.525274753570557, "learning_rate": 2.0876288659793816e-06, "logits/chosen": 11.017049789428711, "logits/rejected": 6.923498630523682, "logps/chosen": -299.4324035644531, "logps/rejected": -224.84326171875, "loss": 0.7309, "rewards/accuracies": 0.25, "rewards/chosen": 0.019978739321231842, "rewards/margins": -0.06723172962665558, "rewards/rejected": 0.08721046894788742, "step": 810 }, { "epoch": 0.1254204523487338, "grad_norm": 5.988023281097412, "learning_rate": 2.0902061855670104e-06, "logits/chosen": 12.080341339111328, "logits/rejected": 8.940905570983887, "logps/chosen": -288.75872802734375, "logps/rejected": -245.6265411376953, "loss": 0.6778, "rewards/accuracies": 0.375, "rewards/chosen": 0.010688398033380508, "rewards/margins": 0.042087554931640625, "rewards/rejected": -0.03139915689826012, "step": 811 }, { "epoch": 0.12557510148849796, "grad_norm": 5.466331481933594, "learning_rate": 2.0927835051546393e-06, "logits/chosen": 3.9428858757019043, "logits/rejected": 6.832188129425049, "logps/chosen": -237.5424041748047, "logps/rejected": -221.52450561523438, "loss": 0.7522, "rewards/accuracies": 0.125, "rewards/chosen": 0.008899642154574394, "rewards/margins": -0.10910601913928986, "rewards/rejected": 0.1180056557059288, "step": 812 }, { "epoch": 0.12572975062826214, "grad_norm": 5.38535213470459, "learning_rate": 2.095360824742268e-06, "logits/chosen": 11.530954360961914, "logits/rejected": 10.034501075744629, "logps/chosen": -230.23403930664062, "logps/rejected": -247.35653686523438, "loss": 0.7041, "rewards/accuracies": 0.5, "rewards/chosen": -0.010055923834443092, "rewards/margins": -0.01503563392907381, "rewards/rejected": 0.004979707300662994, "step": 813 }, { "epoch": 0.1258843997680263, "grad_norm": 4.561618328094482, "learning_rate": 2.097938144329897e-06, "logits/chosen": 17.721664428710938, "logits/rejected": 13.507036209106445, "logps/chosen": -297.8882751464844, "logps/rejected": -251.82601928710938, "loss": 0.6516, "rewards/accuracies": 0.625, "rewards/chosen": 0.11350574344396591, "rewards/margins": 0.09214649349451065, "rewards/rejected": 0.02135925367474556, "step": 814 }, { "epoch": 0.12603904890779044, "grad_norm": 5.475361347198486, "learning_rate": 2.100515463917526e-06, "logits/chosen": 12.706809997558594, "logits/rejected": 4.335636138916016, "logps/chosen": -334.39837646484375, "logps/rejected": -292.5032958984375, "loss": 0.6386, "rewards/accuracies": 0.875, "rewards/chosen": 0.03400973975658417, "rewards/margins": 0.11597838997840881, "rewards/rejected": -0.08196864277124405, "step": 815 }, { "epoch": 0.12619369804755462, "grad_norm": 7.0570878982543945, "learning_rate": 2.1030927835051548e-06, "logits/chosen": 9.889067649841309, "logits/rejected": 8.568881034851074, "logps/chosen": -315.8548583984375, "logps/rejected": -297.20184326171875, "loss": 0.7286, "rewards/accuracies": 0.375, "rewards/chosen": -0.037375811487436295, "rewards/margins": -0.06345844268798828, "rewards/rejected": 0.026082634925842285, "step": 816 }, { "epoch": 0.12634834718731877, "grad_norm": 8.126748085021973, "learning_rate": 2.1056701030927837e-06, "logits/chosen": 4.589727401733398, "logits/rejected": 9.561027526855469, "logps/chosen": -325.199462890625, "logps/rejected": -371.70489501953125, "loss": 0.6273, "rewards/accuracies": 0.75, "rewards/chosen": 0.06847696006298065, "rewards/margins": 0.1443004608154297, "rewards/rejected": -0.07582350075244904, "step": 817 }, { "epoch": 0.12650299632708292, "grad_norm": 4.674380779266357, "learning_rate": 2.1082474226804125e-06, "logits/chosen": 7.388611793518066, "logits/rejected": 4.8958024978637695, "logps/chosen": -284.6629333496094, "logps/rejected": -235.03884887695312, "loss": 0.6684, "rewards/accuracies": 0.75, "rewards/chosen": 0.008503109216690063, "rewards/margins": 0.05912017822265625, "rewards/rejected": -0.05061707645654678, "step": 818 }, { "epoch": 0.1266576454668471, "grad_norm": 4.499520301818848, "learning_rate": 2.1108247422680414e-06, "logits/chosen": 13.807245254516602, "logits/rejected": 9.02981185913086, "logps/chosen": -238.164794921875, "logps/rejected": -246.54641723632812, "loss": 0.7396, "rewards/accuracies": 0.125, "rewards/chosen": -0.010256385430693626, "rewards/margins": -0.08858489990234375, "rewards/rejected": 0.07832851260900497, "step": 819 }, { "epoch": 0.12681229460661125, "grad_norm": 5.532468795776367, "learning_rate": 2.1134020618556703e-06, "logits/chosen": 13.084516525268555, "logits/rejected": 10.398052215576172, "logps/chosen": -327.9105224609375, "logps/rejected": -317.53192138671875, "loss": 0.6004, "rewards/accuracies": 0.75, "rewards/chosen": 0.21336251497268677, "rewards/margins": 0.21609102189540863, "rewards/rejected": -0.002728504128754139, "step": 820 }, { "epoch": 0.1269669437463754, "grad_norm": 9.940056800842285, "learning_rate": 2.115979381443299e-06, "logits/chosen": 3.2517383098602295, "logits/rejected": 8.900216102600098, "logps/chosen": -285.9754943847656, "logps/rejected": -209.3849334716797, "loss": 0.6975, "rewards/accuracies": 0.625, "rewards/chosen": 0.033056069165468216, "rewards/margins": 0.0024034082889556885, "rewards/rejected": 0.030652664601802826, "step": 821 }, { "epoch": 0.12712159288613958, "grad_norm": 5.591113090515137, "learning_rate": 2.118556701030928e-06, "logits/chosen": 2.3430514335632324, "logits/rejected": 3.8197216987609863, "logps/chosen": -508.2909240722656, "logps/rejected": -183.2859649658203, "loss": 0.654, "rewards/accuracies": 0.75, "rewards/chosen": -0.006513596512377262, "rewards/margins": 0.08492724597454071, "rewards/rejected": -0.0914408415555954, "step": 822 }, { "epoch": 0.12727624202590374, "grad_norm": 6.254790306091309, "learning_rate": 2.121134020618557e-06, "logits/chosen": 8.330195426940918, "logits/rejected": 2.063925266265869, "logps/chosen": -289.0109558105469, "logps/rejected": -219.625732421875, "loss": 0.7446, "rewards/accuracies": 0.5, "rewards/chosen": -0.036389730870723724, "rewards/margins": -0.06864052265882492, "rewards/rejected": 0.0322507843375206, "step": 823 }, { "epoch": 0.1274308911656679, "grad_norm": 5.575355052947998, "learning_rate": 2.1237113402061858e-06, "logits/chosen": 7.975765705108643, "logits/rejected": 10.704412460327148, "logps/chosen": -236.17141723632812, "logps/rejected": -391.30511474609375, "loss": 0.6372, "rewards/accuracies": 0.875, "rewards/chosen": 0.0528411902487278, "rewards/margins": 0.11911468207836151, "rewards/rejected": -0.06627349555492401, "step": 824 }, { "epoch": 0.12758554030543204, "grad_norm": 5.477494716644287, "learning_rate": 2.1262886597938146e-06, "logits/chosen": 9.324621200561523, "logits/rejected": 1.4057573080062866, "logps/chosen": -315.09405517578125, "logps/rejected": -252.90664672851562, "loss": 0.6818, "rewards/accuracies": 0.5, "rewards/chosen": 0.030775122344493866, "rewards/margins": 0.026944708079099655, "rewards/rejected": 0.003830414265394211, "step": 825 }, { "epoch": 0.12774018944519622, "grad_norm": 5.634899616241455, "learning_rate": 2.1288659793814435e-06, "logits/chosen": 10.121933937072754, "logits/rejected": 9.240137100219727, "logps/chosen": -308.15411376953125, "logps/rejected": -332.68438720703125, "loss": 0.7094, "rewards/accuracies": 0.5, "rewards/chosen": -0.052181147038936615, "rewards/margins": -0.028965899720788002, "rewards/rejected": -0.023215247318148613, "step": 826 }, { "epoch": 0.12789483858496037, "grad_norm": 4.997093200683594, "learning_rate": 2.1314432989690724e-06, "logits/chosen": 12.90583610534668, "logits/rejected": 10.992410659790039, "logps/chosen": -287.5904541015625, "logps/rejected": -243.84765625, "loss": 0.6812, "rewards/accuracies": 0.5, "rewards/chosen": 0.06390509754419327, "rewards/margins": 0.043041277676820755, "rewards/rejected": 0.020863819867372513, "step": 827 }, { "epoch": 0.12804948772472452, "grad_norm": 11.93228530883789, "learning_rate": 2.1340206185567012e-06, "logits/chosen": 9.490852355957031, "logits/rejected": 4.092257022857666, "logps/chosen": -424.6783447265625, "logps/rejected": -347.0323791503906, "loss": 0.7611, "rewards/accuracies": 0.375, "rewards/chosen": -0.06445684283971786, "rewards/margins": -0.12477999180555344, "rewards/rejected": 0.06032313406467438, "step": 828 }, { "epoch": 0.1282041368644887, "grad_norm": 4.208880424499512, "learning_rate": 2.13659793814433e-06, "logits/chosen": 11.985099792480469, "logits/rejected": 11.756420135498047, "logps/chosen": -215.1483917236328, "logps/rejected": -232.8167266845703, "loss": 0.6597, "rewards/accuracies": 0.75, "rewards/chosen": 0.06749783456325531, "rewards/margins": 0.06984911113977432, "rewards/rejected": -0.002351284958422184, "step": 829 }, { "epoch": 0.12835878600425285, "grad_norm": 5.282952308654785, "learning_rate": 2.139175257731959e-06, "logits/chosen": 11.35549545288086, "logits/rejected": 11.96148681640625, "logps/chosen": -281.9228820800781, "logps/rejected": -311.50067138671875, "loss": 0.6736, "rewards/accuracies": 0.5, "rewards/chosen": -0.014527034014463425, "rewards/margins": 0.04262495040893555, "rewards/rejected": -0.05715198814868927, "step": 830 }, { "epoch": 0.128513435144017, "grad_norm": 5.628005504608154, "learning_rate": 2.141752577319588e-06, "logits/chosen": 14.49488639831543, "logits/rejected": 7.180295944213867, "logps/chosen": -225.1959228515625, "logps/rejected": -151.926025390625, "loss": 0.6973, "rewards/accuracies": 0.375, "rewards/chosen": -0.06507306545972824, "rewards/margins": -0.0008908025920391083, "rewards/rejected": -0.06418225169181824, "step": 831 }, { "epoch": 0.12866808428378118, "grad_norm": 4.31758975982666, "learning_rate": 2.1443298969072167e-06, "logits/chosen": 6.613127708435059, "logits/rejected": 10.01853084564209, "logps/chosen": -148.9969940185547, "logps/rejected": -172.67262268066406, "loss": 0.6707, "rewards/accuracies": 0.625, "rewards/chosen": 0.05273721367120743, "rewards/margins": 0.04980647563934326, "rewards/rejected": 0.0029307371005415916, "step": 832 }, { "epoch": 0.12882273342354533, "grad_norm": 5.901344299316406, "learning_rate": 2.1469072164948456e-06, "logits/chosen": 11.104122161865234, "logits/rejected": 10.231244087219238, "logps/chosen": -287.00421142578125, "logps/rejected": -311.8900146484375, "loss": 0.7569, "rewards/accuracies": 0.25, "rewards/chosen": -0.04850844666361809, "rewards/margins": -0.10779397189617157, "rewards/rejected": 0.059285521507263184, "step": 833 }, { "epoch": 0.12897738256330948, "grad_norm": 5.326846122741699, "learning_rate": 2.1494845360824745e-06, "logits/chosen": 11.311027526855469, "logits/rejected": 6.194657325744629, "logps/chosen": -275.8129577636719, "logps/rejected": -249.55160522460938, "loss": 0.6156, "rewards/accuracies": 0.625, "rewards/chosen": 0.02035531774163246, "rewards/margins": 0.20545989274978638, "rewards/rejected": -0.1851045787334442, "step": 834 }, { "epoch": 0.12913203170307366, "grad_norm": 5.7647318840026855, "learning_rate": 2.1520618556701033e-06, "logits/chosen": 13.287602424621582, "logits/rejected": 6.345489501953125, "logps/chosen": -336.2106628417969, "logps/rejected": -238.15841674804688, "loss": 0.6602, "rewards/accuracies": 0.625, "rewards/chosen": -0.011835005134344101, "rewards/margins": 0.07760138809680939, "rewards/rejected": -0.08943638950586319, "step": 835 }, { "epoch": 0.12928668084283781, "grad_norm": 5.190736770629883, "learning_rate": 2.1546391752577322e-06, "logits/chosen": 14.597845077514648, "logits/rejected": 14.97640609741211, "logps/chosen": -305.5723876953125, "logps/rejected": -259.9925537109375, "loss": 0.6951, "rewards/accuracies": 0.375, "rewards/chosen": -0.04437980800867081, "rewards/margins": 0.008184665814042091, "rewards/rejected": -0.052564479410648346, "step": 836 }, { "epoch": 0.12944132998260197, "grad_norm": 5.012102127075195, "learning_rate": 2.157216494845361e-06, "logits/chosen": 10.570706367492676, "logits/rejected": 4.804713726043701, "logps/chosen": -239.45379638671875, "logps/rejected": -216.71066284179688, "loss": 0.6994, "rewards/accuracies": 0.375, "rewards/chosen": 0.006371835246682167, "rewards/margins": -0.0052504995837807655, "rewards/rejected": 0.011622333899140358, "step": 837 }, { "epoch": 0.12959597912236615, "grad_norm": 5.319317817687988, "learning_rate": 2.15979381443299e-06, "logits/chosen": 6.073154449462891, "logits/rejected": -1.7007324695587158, "logps/chosen": -325.26202392578125, "logps/rejected": -249.67059326171875, "loss": 0.668, "rewards/accuracies": 0.625, "rewards/chosen": 0.08003725856542587, "rewards/margins": 0.0574459582567215, "rewards/rejected": 0.022591307759284973, "step": 838 }, { "epoch": 0.1297506282621303, "grad_norm": 5.549462795257568, "learning_rate": 2.162371134020619e-06, "logits/chosen": 12.748003005981445, "logits/rejected": 8.749460220336914, "logps/chosen": -365.42523193359375, "logps/rejected": -306.39288330078125, "loss": 0.6868, "rewards/accuracies": 0.375, "rewards/chosen": 0.08983974903821945, "rewards/margins": 0.017597097903490067, "rewards/rejected": 0.07224264740943909, "step": 839 }, { "epoch": 0.12990527740189445, "grad_norm": 5.197721481323242, "learning_rate": 2.1649484536082477e-06, "logits/chosen": 15.839910507202148, "logits/rejected": 9.044678688049316, "logps/chosen": -357.5267639160156, "logps/rejected": -239.7066650390625, "loss": 0.6307, "rewards/accuracies": 0.875, "rewards/chosen": 0.13030031323432922, "rewards/margins": 0.13541758060455322, "rewards/rejected": -0.005117248743772507, "step": 840 }, { "epoch": 0.1300599265416586, "grad_norm": 7.483394622802734, "learning_rate": 2.1675257731958766e-06, "logits/chosen": 9.625455856323242, "logits/rejected": 5.541503429412842, "logps/chosen": -275.71551513671875, "logps/rejected": -236.4689178466797, "loss": 0.6559, "rewards/accuracies": 0.5, "rewards/chosen": -0.01684422791004181, "rewards/margins": 0.08393378555774689, "rewards/rejected": -0.1007780134677887, "step": 841 }, { "epoch": 0.13021457568142278, "grad_norm": 4.375672817230225, "learning_rate": 2.1701030927835055e-06, "logits/chosen": 12.223703384399414, "logits/rejected": 4.873989105224609, "logps/chosen": -272.21771240234375, "logps/rejected": -177.20758056640625, "loss": 0.6927, "rewards/accuracies": 0.5, "rewards/chosen": -0.00025720614939928055, "rewards/margins": 0.004235647618770599, "rewards/rejected": -0.0044928546994924545, "step": 842 }, { "epoch": 0.13036922482118693, "grad_norm": 4.192330360412598, "learning_rate": 2.1726804123711343e-06, "logits/chosen": 9.940579414367676, "logits/rejected": 11.158378601074219, "logps/chosen": -214.2007293701172, "logps/rejected": -230.5918731689453, "loss": 0.6812, "rewards/accuracies": 0.75, "rewards/chosen": 0.005637500435113907, "rewards/margins": 0.028319569304585457, "rewards/rejected": -0.0226820707321167, "step": 843 }, { "epoch": 0.13052387396095108, "grad_norm": 7.0680952072143555, "learning_rate": 2.175257731958763e-06, "logits/chosen": 9.581306457519531, "logits/rejected": 7.343588829040527, "logps/chosen": -320.553955078125, "logps/rejected": -280.4522705078125, "loss": 0.6899, "rewards/accuracies": 0.5, "rewards/chosen": 0.07376085221767426, "rewards/margins": 0.012072324752807617, "rewards/rejected": 0.06168852001428604, "step": 844 }, { "epoch": 0.13067852310071526, "grad_norm": 6.187963008880615, "learning_rate": 2.177835051546392e-06, "logits/chosen": 8.123754501342773, "logits/rejected": 8.557390213012695, "logps/chosen": -197.9710235595703, "logps/rejected": -185.9560089111328, "loss": 0.7158, "rewards/accuracies": 0.5, "rewards/chosen": -0.016050808131694794, "rewards/margins": -0.030372004956007004, "rewards/rejected": 0.014321183785796165, "step": 845 }, { "epoch": 0.1308331722404794, "grad_norm": 4.1606268882751465, "learning_rate": 2.180412371134021e-06, "logits/chosen": 9.032398223876953, "logits/rejected": 2.874891757965088, "logps/chosen": -290.2815856933594, "logps/rejected": -169.91775512695312, "loss": 0.5965, "rewards/accuracies": 0.75, "rewards/chosen": 0.19966769218444824, "rewards/margins": 0.22657713294029236, "rewards/rejected": -0.026909448206424713, "step": 846 }, { "epoch": 0.13098782138024356, "grad_norm": 5.486983299255371, "learning_rate": 2.18298969072165e-06, "logits/chosen": 8.518402099609375, "logits/rejected": 7.906534671783447, "logps/chosen": -315.7825927734375, "logps/rejected": -397.38824462890625, "loss": 0.6529, "rewards/accuracies": 0.625, "rewards/chosen": 0.1356370449066162, "rewards/margins": 0.08912573009729385, "rewards/rejected": 0.046511318534612656, "step": 847 }, { "epoch": 0.13114247052000774, "grad_norm": 5.048002243041992, "learning_rate": 2.1855670103092787e-06, "logits/chosen": 5.9496002197265625, "logits/rejected": 2.853168487548828, "logps/chosen": -420.3154296875, "logps/rejected": -264.4692687988281, "loss": 0.6216, "rewards/accuracies": 0.75, "rewards/chosen": 0.16529923677444458, "rewards/margins": 0.15697579085826874, "rewards/rejected": 0.008323431946337223, "step": 848 }, { "epoch": 0.1312971196597719, "grad_norm": 6.496761322021484, "learning_rate": 2.1881443298969076e-06, "logits/chosen": 9.39857292175293, "logits/rejected": 7.443991661071777, "logps/chosen": -356.5985107421875, "logps/rejected": -357.9806213378906, "loss": 0.6649, "rewards/accuracies": 0.5, "rewards/chosen": 0.03705472871661186, "rewards/margins": 0.06678623706102371, "rewards/rejected": -0.02973151206970215, "step": 849 }, { "epoch": 0.13145176879953605, "grad_norm": 5.437524318695068, "learning_rate": 2.1907216494845364e-06, "logits/chosen": 12.375151634216309, "logits/rejected": 9.113358497619629, "logps/chosen": -290.90386962890625, "logps/rejected": -338.252685546875, "loss": 0.6832, "rewards/accuracies": 0.625, "rewards/chosen": 0.010050198063254356, "rewards/margins": 0.039315223693847656, "rewards/rejected": -0.02926502376794815, "step": 850 }, { "epoch": 0.13160641793930022, "grad_norm": 5.165545463562012, "learning_rate": 2.1932989690721653e-06, "logits/chosen": 6.2288618087768555, "logits/rejected": 10.506753921508789, "logps/chosen": -294.6086120605469, "logps/rejected": -350.2838134765625, "loss": 0.6546, "rewards/accuracies": 0.5, "rewards/chosen": 0.14319303631782532, "rewards/margins": 0.08622626960277557, "rewards/rejected": 0.05696675926446915, "step": 851 }, { "epoch": 0.13176106707906438, "grad_norm": 8.821476936340332, "learning_rate": 2.195876288659794e-06, "logits/chosen": 15.098304748535156, "logits/rejected": 10.107544898986816, "logps/chosen": -405.1445617675781, "logps/rejected": -450.9846496582031, "loss": 0.7068, "rewards/accuracies": 0.375, "rewards/chosen": 0.2196439653635025, "rewards/margins": -0.013928033411502838, "rewards/rejected": 0.23357200622558594, "step": 852 }, { "epoch": 0.13191571621882853, "grad_norm": 6.702816963195801, "learning_rate": 2.1984536082474226e-06, "logits/chosen": 8.889533996582031, "logits/rejected": 6.199413299560547, "logps/chosen": -368.3157958984375, "logps/rejected": -259.1598205566406, "loss": 0.7288, "rewards/accuracies": 0.375, "rewards/chosen": 0.017586207017302513, "rewards/margins": -0.057279422879219055, "rewards/rejected": 0.07486562430858612, "step": 853 }, { "epoch": 0.1320703653585927, "grad_norm": 5.448475360870361, "learning_rate": 2.2010309278350515e-06, "logits/chosen": 5.404626369476318, "logits/rejected": 7.960400581359863, "logps/chosen": -207.77020263671875, "logps/rejected": -236.05502319335938, "loss": 0.6922, "rewards/accuracies": 0.625, "rewards/chosen": 0.0670294314622879, "rewards/margins": 0.009743781760334969, "rewards/rejected": 0.057285647839307785, "step": 854 }, { "epoch": 0.13222501449835686, "grad_norm": 5.092534065246582, "learning_rate": 2.2036082474226804e-06, "logits/chosen": 14.00489616394043, "logits/rejected": 4.81197452545166, "logps/chosen": -330.98785400390625, "logps/rejected": -217.79623413085938, "loss": 0.6497, "rewards/accuracies": 0.75, "rewards/chosen": 0.08687266707420349, "rewards/margins": 0.09883155673742294, "rewards/rejected": -0.011958885937929153, "step": 855 }, { "epoch": 0.132379663638121, "grad_norm": 5.820626258850098, "learning_rate": 2.2061855670103092e-06, "logits/chosen": 13.154562950134277, "logits/rejected": 3.9785842895507812, "logps/chosen": -245.25205993652344, "logps/rejected": -226.92996215820312, "loss": 0.7128, "rewards/accuracies": 0.25, "rewards/chosen": -0.08124828338623047, "rewards/margins": -0.03117704577744007, "rewards/rejected": -0.05007123947143555, "step": 856 }, { "epoch": 0.13253431277788516, "grad_norm": 5.1035542488098145, "learning_rate": 2.208762886597938e-06, "logits/chosen": 15.868261337280273, "logits/rejected": 9.982535362243652, "logps/chosen": -284.6107177734375, "logps/rejected": -245.8839874267578, "loss": 0.6598, "rewards/accuracies": 0.5, "rewards/chosen": 0.09444428235292435, "rewards/margins": 0.08277445286512375, "rewards/rejected": 0.0116698257625103, "step": 857 }, { "epoch": 0.13268896191764934, "grad_norm": 5.5282745361328125, "learning_rate": 2.211340206185567e-06, "logits/chosen": 9.167112350463867, "logits/rejected": 13.077407836914062, "logps/chosen": -206.6728973388672, "logps/rejected": -246.09349060058594, "loss": 0.7257, "rewards/accuracies": 0.5, "rewards/chosen": -0.003094123676419258, "rewards/margins": -0.053266741335392, "rewards/rejected": 0.05017261952161789, "step": 858 }, { "epoch": 0.1328436110574135, "grad_norm": 6.413069725036621, "learning_rate": 2.213917525773196e-06, "logits/chosen": 9.0770845413208, "logits/rejected": 5.942964553833008, "logps/chosen": -518.3092651367188, "logps/rejected": -350.59393310546875, "loss": 0.7058, "rewards/accuracies": 0.75, "rewards/chosen": 0.08394508063793182, "rewards/margins": -0.005816606339067221, "rewards/rejected": 0.08976168930530548, "step": 859 }, { "epoch": 0.13299826019717764, "grad_norm": 4.293361186981201, "learning_rate": 2.2164948453608247e-06, "logits/chosen": 12.792933464050293, "logits/rejected": 9.101966857910156, "logps/chosen": -232.47250366210938, "logps/rejected": -184.42225646972656, "loss": 0.6937, "rewards/accuracies": 0.25, "rewards/chosen": 0.07570510357618332, "rewards/margins": 0.006333686411380768, "rewards/rejected": 0.06937141716480255, "step": 860 }, { "epoch": 0.13315290933694182, "grad_norm": 4.916195869445801, "learning_rate": 2.2190721649484536e-06, "logits/chosen": 7.360235214233398, "logits/rejected": 7.22442626953125, "logps/chosen": -252.9958953857422, "logps/rejected": -253.37078857421875, "loss": 0.608, "rewards/accuracies": 0.75, "rewards/chosen": 0.04272756725549698, "rewards/margins": 0.20476779341697693, "rewards/rejected": -0.16204023361206055, "step": 861 }, { "epoch": 0.13330755847670597, "grad_norm": 5.882905006408691, "learning_rate": 2.2216494845360825e-06, "logits/chosen": 9.378471374511719, "logits/rejected": 11.289730072021484, "logps/chosen": -259.3278503417969, "logps/rejected": -263.862548828125, "loss": 0.7256, "rewards/accuracies": 0.5, "rewards/chosen": -0.029156304895877838, "rewards/margins": -0.05369654297828674, "rewards/rejected": 0.024540234357118607, "step": 862 }, { "epoch": 0.13346220761647012, "grad_norm": 6.041230201721191, "learning_rate": 2.2242268041237113e-06, "logits/chosen": 3.6784310340881348, "logits/rejected": 12.983327865600586, "logps/chosen": -144.37530517578125, "logps/rejected": -295.48150634765625, "loss": 0.6475, "rewards/accuracies": 0.75, "rewards/chosen": 0.05102301016449928, "rewards/margins": 0.09967866539955139, "rewards/rejected": -0.04865565150976181, "step": 863 }, { "epoch": 0.1336168567562343, "grad_norm": 5.762960433959961, "learning_rate": 2.2268041237113406e-06, "logits/chosen": 6.7540740966796875, "logits/rejected": 3.442861557006836, "logps/chosen": -254.0617218017578, "logps/rejected": -221.40017700195312, "loss": 0.701, "rewards/accuracies": 0.625, "rewards/chosen": 0.09227581322193146, "rewards/margins": -0.004321906715631485, "rewards/rejected": 0.09659771621227264, "step": 864 }, { "epoch": 0.13377150589599845, "grad_norm": 6.7018723487854, "learning_rate": 2.2293814432989695e-06, "logits/chosen": 11.998213768005371, "logits/rejected": 8.129022598266602, "logps/chosen": -427.28228759765625, "logps/rejected": -294.0709228515625, "loss": 0.7175, "rewards/accuracies": 0.5, "rewards/chosen": -0.0322965644299984, "rewards/margins": 0.006228595972061157, "rewards/rejected": -0.03852515667676926, "step": 865 }, { "epoch": 0.1339261550357626, "grad_norm": 6.30493688583374, "learning_rate": 2.2319587628865984e-06, "logits/chosen": 12.574810028076172, "logits/rejected": 11.15985107421875, "logps/chosen": -337.1407470703125, "logps/rejected": -246.75933837890625, "loss": 0.7369, "rewards/accuracies": 0.375, "rewards/chosen": 0.006773374974727631, "rewards/margins": -0.07670469582080841, "rewards/rejected": 0.08347807824611664, "step": 866 }, { "epoch": 0.13408080417552679, "grad_norm": 4.71605920791626, "learning_rate": 2.2345360824742272e-06, "logits/chosen": 7.984208106994629, "logits/rejected": 5.614459991455078, "logps/chosen": -264.9400634765625, "logps/rejected": -245.4064483642578, "loss": 0.6501, "rewards/accuracies": 0.75, "rewards/chosen": 0.07264199107885361, "rewards/margins": 0.09673988819122314, "rewards/rejected": -0.02409788966178894, "step": 867 }, { "epoch": 0.13423545331529094, "grad_norm": 4.266623020172119, "learning_rate": 2.237113402061856e-06, "logits/chosen": 10.277667999267578, "logits/rejected": 12.970866203308105, "logps/chosen": -192.5612030029297, "logps/rejected": -283.6071472167969, "loss": 0.6569, "rewards/accuracies": 0.375, "rewards/chosen": -0.0014364225789904594, "rewards/margins": 0.09040756523609161, "rewards/rejected": -0.09184398502111435, "step": 868 }, { "epoch": 0.1343901024550551, "grad_norm": 5.543199062347412, "learning_rate": 2.239690721649485e-06, "logits/chosen": 13.170656204223633, "logits/rejected": 9.070703506469727, "logps/chosen": -437.1995849609375, "logps/rejected": -411.743896484375, "loss": 0.6599, "rewards/accuracies": 0.875, "rewards/chosen": 0.07504886388778687, "rewards/margins": 0.07696985453367233, "rewards/rejected": -0.0019209831953048706, "step": 869 }, { "epoch": 0.13454475159481927, "grad_norm": 4.192004203796387, "learning_rate": 2.242268041237114e-06, "logits/chosen": 8.867194175720215, "logits/rejected": 7.47512674331665, "logps/chosen": -178.85935974121094, "logps/rejected": -190.47662353515625, "loss": 0.6896, "rewards/accuracies": 0.5, "rewards/chosen": -0.09408316761255264, "rewards/margins": 0.009184072725474834, "rewards/rejected": -0.10326724499464035, "step": 870 }, { "epoch": 0.13469940073458342, "grad_norm": 9.250446319580078, "learning_rate": 2.2448453608247427e-06, "logits/chosen": 13.235067367553711, "logits/rejected": 5.132692337036133, "logps/chosen": -263.7756042480469, "logps/rejected": -132.05821228027344, "loss": 0.7728, "rewards/accuracies": 0.25, "rewards/chosen": -0.08982138335704803, "rewards/margins": -0.1378098577260971, "rewards/rejected": 0.04798846319317818, "step": 871 }, { "epoch": 0.13485404987434757, "grad_norm": 5.271758556365967, "learning_rate": 2.247422680412371e-06, "logits/chosen": 5.6582207679748535, "logits/rejected": 2.5591440200805664, "logps/chosen": -274.581298828125, "logps/rejected": -215.13096618652344, "loss": 0.6764, "rewards/accuracies": 0.375, "rewards/chosen": 0.10240302234888077, "rewards/margins": 0.04517187923192978, "rewards/rejected": 0.05723114311695099, "step": 872 }, { "epoch": 0.13500869901411172, "grad_norm": 5.767563819885254, "learning_rate": 2.25e-06, "logits/chosen": 14.18519401550293, "logits/rejected": 13.963915824890137, "logps/chosen": -360.17596435546875, "logps/rejected": -314.26702880859375, "loss": 0.7058, "rewards/accuracies": 0.625, "rewards/chosen": -0.013453193940222263, "rewards/margins": -0.005100056529045105, "rewards/rejected": -0.00835314393043518, "step": 873 }, { "epoch": 0.1351633481538759, "grad_norm": 4.836509704589844, "learning_rate": 2.252577319587629e-06, "logits/chosen": 12.89808177947998, "logits/rejected": 9.826332092285156, "logps/chosen": -304.12188720703125, "logps/rejected": -229.79782104492188, "loss": 0.6395, "rewards/accuracies": 0.5, "rewards/chosen": 0.08812665939331055, "rewards/margins": 0.13255575299263, "rewards/rejected": -0.04442910850048065, "step": 874 }, { "epoch": 0.13531799729364005, "grad_norm": 5.204434394836426, "learning_rate": 2.255154639175258e-06, "logits/chosen": 5.15670919418335, "logits/rejected": 4.148692607879639, "logps/chosen": -327.5721740722656, "logps/rejected": -258.166748046875, "loss": 0.6624, "rewards/accuracies": 0.75, "rewards/chosen": 0.13059142231941223, "rewards/margins": 0.07351868599653244, "rewards/rejected": 0.05707273632287979, "step": 875 }, { "epoch": 0.1354726464334042, "grad_norm": 4.169265270233154, "learning_rate": 2.2577319587628867e-06, "logits/chosen": 16.191415786743164, "logits/rejected": 13.364799499511719, "logps/chosen": -314.1923828125, "logps/rejected": -269.66839599609375, "loss": 0.6483, "rewards/accuracies": 0.625, "rewards/chosen": 0.09650377929210663, "rewards/margins": 0.1148630827665329, "rewards/rejected": -0.01835930347442627, "step": 876 }, { "epoch": 0.13562729557316838, "grad_norm": 6.274011135101318, "learning_rate": 2.2603092783505155e-06, "logits/chosen": 8.142695426940918, "logits/rejected": 5.887526988983154, "logps/chosen": -328.4047546386719, "logps/rejected": -203.423095703125, "loss": 0.6657, "rewards/accuracies": 0.625, "rewards/chosen": 0.15985208749771118, "rewards/margins": 0.06586318463087082, "rewards/rejected": 0.09398889541625977, "step": 877 }, { "epoch": 0.13578194471293253, "grad_norm": 7.283665180206299, "learning_rate": 2.2628865979381444e-06, "logits/chosen": 11.808151245117188, "logits/rejected": 7.962191581726074, "logps/chosen": -460.52899169921875, "logps/rejected": -322.98748779296875, "loss": 0.5729, "rewards/accuracies": 0.625, "rewards/chosen": 0.39074021577835083, "rewards/margins": 0.3238605856895447, "rewards/rejected": 0.06687964498996735, "step": 878 }, { "epoch": 0.13593659385269669, "grad_norm": 5.229115962982178, "learning_rate": 2.2654639175257733e-06, "logits/chosen": 14.614744186401367, "logits/rejected": 7.676170825958252, "logps/chosen": -330.6083679199219, "logps/rejected": -216.42282104492188, "loss": 0.6678, "rewards/accuracies": 0.5, "rewards/chosen": 0.13434715569019318, "rewards/margins": 0.0567832887172699, "rewards/rejected": 0.07756385952234268, "step": 879 }, { "epoch": 0.13609124299246086, "grad_norm": 4.286057472229004, "learning_rate": 2.268041237113402e-06, "logits/chosen": 12.034272193908691, "logits/rejected": 9.07854175567627, "logps/chosen": -202.08837890625, "logps/rejected": -171.132080078125, "loss": 0.6994, "rewards/accuracies": 0.375, "rewards/chosen": 0.039801403880119324, "rewards/margins": 0.0036340728402137756, "rewards/rejected": 0.03616733476519585, "step": 880 }, { "epoch": 0.13624589213222502, "grad_norm": 4.793503761291504, "learning_rate": 2.270618556701031e-06, "logits/chosen": 10.853227615356445, "logits/rejected": 7.406322956085205, "logps/chosen": -288.4029846191406, "logps/rejected": -249.38790893554688, "loss": 0.7017, "rewards/accuracies": 0.625, "rewards/chosen": 0.11276023834943771, "rewards/margins": -0.008917592465877533, "rewards/rejected": 0.12167783081531525, "step": 881 }, { "epoch": 0.13640054127198917, "grad_norm": 5.04573917388916, "learning_rate": 2.27319587628866e-06, "logits/chosen": 8.643777847290039, "logits/rejected": 11.178739547729492, "logps/chosen": -215.95718383789062, "logps/rejected": -273.3276062011719, "loss": 0.6896, "rewards/accuracies": 0.5, "rewards/chosen": 0.11197976768016815, "rewards/margins": 0.020266003906726837, "rewards/rejected": 0.09171376377344131, "step": 882 }, { "epoch": 0.13655519041175335, "grad_norm": 4.487614631652832, "learning_rate": 2.2757731958762888e-06, "logits/chosen": 9.808544158935547, "logits/rejected": 2.70812726020813, "logps/chosen": -235.29429626464844, "logps/rejected": -155.73947143554688, "loss": 0.6418, "rewards/accuracies": 0.75, "rewards/chosen": 0.05298071354627609, "rewards/margins": 0.10992717742919922, "rewards/rejected": -0.05694647133350372, "step": 883 }, { "epoch": 0.1367098395515175, "grad_norm": 4.402811527252197, "learning_rate": 2.2783505154639176e-06, "logits/chosen": 8.570183753967285, "logits/rejected": 5.887582302093506, "logps/chosen": -272.9149169921875, "logps/rejected": -246.6571807861328, "loss": 0.6751, "rewards/accuracies": 0.375, "rewards/chosen": -0.010329247452318668, "rewards/margins": 0.04991140961647034, "rewards/rejected": -0.06024065613746643, "step": 884 }, { "epoch": 0.13686448869128165, "grad_norm": 5.491387844085693, "learning_rate": 2.2809278350515465e-06, "logits/chosen": 11.135345458984375, "logits/rejected": -0.7600812911987305, "logps/chosen": -212.52496337890625, "logps/rejected": -116.70137786865234, "loss": 0.6569, "rewards/accuracies": 0.875, "rewards/chosen": 0.07405834645032883, "rewards/margins": 0.08941201865673065, "rewards/rejected": -0.015353678725659847, "step": 885 }, { "epoch": 0.13701913783104583, "grad_norm": 9.304813385009766, "learning_rate": 2.2835051546391754e-06, "logits/chosen": 10.337177276611328, "logits/rejected": 4.320044994354248, "logps/chosen": -332.7056579589844, "logps/rejected": -221.1385498046875, "loss": 0.7446, "rewards/accuracies": 0.375, "rewards/chosen": 0.06646871566772461, "rewards/margins": -0.09340272098779678, "rewards/rejected": 0.159871444106102, "step": 886 }, { "epoch": 0.13717378697080998, "grad_norm": 12.956573486328125, "learning_rate": 2.2860824742268043e-06, "logits/chosen": 7.518157958984375, "logits/rejected": 11.64919662475586, "logps/chosen": -184.48733520507812, "logps/rejected": -200.37107849121094, "loss": 0.6808, "rewards/accuracies": 0.5, "rewards/chosen": 0.1380397379398346, "rewards/margins": 0.0472596138715744, "rewards/rejected": 0.0907801166176796, "step": 887 }, { "epoch": 0.13732843611057413, "grad_norm": 4.373281002044678, "learning_rate": 2.288659793814433e-06, "logits/chosen": 9.1815185546875, "logits/rejected": 7.211836814880371, "logps/chosen": -246.74102783203125, "logps/rejected": -241.49554443359375, "loss": 0.6733, "rewards/accuracies": 0.75, "rewards/chosen": 0.14692078530788422, "rewards/margins": 0.04744339734315872, "rewards/rejected": 0.0994773805141449, "step": 888 }, { "epoch": 0.13748308525033828, "grad_norm": 4.062875747680664, "learning_rate": 2.291237113402062e-06, "logits/chosen": 10.741153717041016, "logits/rejected": 10.303879737854004, "logps/chosen": -160.2999725341797, "logps/rejected": -162.9789276123047, "loss": 0.6677, "rewards/accuracies": 0.75, "rewards/chosen": 0.05793919786810875, "rewards/margins": 0.057959653437137604, "rewards/rejected": -2.045556902885437e-05, "step": 889 }, { "epoch": 0.13763773439010246, "grad_norm": 5.453105926513672, "learning_rate": 2.293814432989691e-06, "logits/chosen": 6.773975372314453, "logits/rejected": 7.228199481964111, "logps/chosen": -283.3519287109375, "logps/rejected": -223.96343994140625, "loss": 0.6572, "rewards/accuracies": 0.625, "rewards/chosen": 0.03751359134912491, "rewards/margins": 0.08041834831237793, "rewards/rejected": -0.04290475323796272, "step": 890 }, { "epoch": 0.1377923835298666, "grad_norm": 4.047957897186279, "learning_rate": 2.2963917525773198e-06, "logits/chosen": 9.822409629821777, "logits/rejected": 4.881087303161621, "logps/chosen": -235.6240997314453, "logps/rejected": -185.0804901123047, "loss": 0.5943, "rewards/accuracies": 0.875, "rewards/chosen": 0.13318605720996857, "rewards/margins": 0.21671128273010254, "rewards/rejected": -0.08352524042129517, "step": 891 }, { "epoch": 0.13794703266963076, "grad_norm": 7.919179916381836, "learning_rate": 2.2989690721649486e-06, "logits/chosen": 2.247959852218628, "logits/rejected": 7.165493965148926, "logps/chosen": -437.8634033203125, "logps/rejected": -280.78741455078125, "loss": 0.6935, "rewards/accuracies": 0.5, "rewards/chosen": 0.018958479166030884, "rewards/margins": 0.030875300988554955, "rewards/rejected": -0.011916822753846645, "step": 892 }, { "epoch": 0.13810168180939494, "grad_norm": 4.436543941497803, "learning_rate": 2.3015463917525775e-06, "logits/chosen": 9.745589256286621, "logits/rejected": 9.891400337219238, "logps/chosen": -170.15150451660156, "logps/rejected": -185.68832397460938, "loss": 0.6224, "rewards/accuracies": 0.375, "rewards/chosen": 0.13712917268276215, "rewards/margins": 0.1751626431941986, "rewards/rejected": -0.03803347796201706, "step": 893 }, { "epoch": 0.1382563309491591, "grad_norm": 6.057544231414795, "learning_rate": 2.3041237113402064e-06, "logits/chosen": 12.952285766601562, "logits/rejected": 10.196444511413574, "logps/chosen": -366.32781982421875, "logps/rejected": -299.0440673828125, "loss": 0.6722, "rewards/accuracies": 0.625, "rewards/chosen": 0.2091427445411682, "rewards/margins": 0.08215785026550293, "rewards/rejected": 0.1269848644733429, "step": 894 }, { "epoch": 0.13841098008892325, "grad_norm": 4.52533483505249, "learning_rate": 2.3067010309278352e-06, "logits/chosen": 8.695606231689453, "logits/rejected": 5.417052745819092, "logps/chosen": -237.46693420410156, "logps/rejected": -207.29637145996094, "loss": 0.6279, "rewards/accuracies": 0.75, "rewards/chosen": 0.16573992371559143, "rewards/margins": 0.14171624183654785, "rewards/rejected": 0.024023674428462982, "step": 895 }, { "epoch": 0.13856562922868743, "grad_norm": 5.627161026000977, "learning_rate": 2.309278350515464e-06, "logits/chosen": 12.578010559082031, "logits/rejected": 8.984378814697266, "logps/chosen": -293.34515380859375, "logps/rejected": -285.53143310546875, "loss": 0.7107, "rewards/accuracies": 0.5, "rewards/chosen": 0.07159089297056198, "rewards/margins": -0.012142401188611984, "rewards/rejected": 0.08373329788446426, "step": 896 }, { "epoch": 0.13872027836845158, "grad_norm": 7.131906032562256, "learning_rate": 2.311855670103093e-06, "logits/chosen": 10.172256469726562, "logits/rejected": 11.816521644592285, "logps/chosen": -396.30206298828125, "logps/rejected": -454.7700500488281, "loss": 0.7719, "rewards/accuracies": 0.625, "rewards/chosen": 0.006964873522520065, "rewards/margins": -0.11543674767017365, "rewards/rejected": 0.12240162491798401, "step": 897 }, { "epoch": 0.13887492750821573, "grad_norm": 4.766530513763428, "learning_rate": 2.314432989690722e-06, "logits/chosen": 14.655416488647461, "logits/rejected": 4.846123695373535, "logps/chosen": -326.9876403808594, "logps/rejected": -204.0702667236328, "loss": 0.6333, "rewards/accuracies": 0.625, "rewards/chosen": 0.1927243173122406, "rewards/margins": 0.1624782681465149, "rewards/rejected": 0.0302460677921772, "step": 898 }, { "epoch": 0.1390295766479799, "grad_norm": 14.805150985717773, "learning_rate": 2.3170103092783507e-06, "logits/chosen": 13.82388973236084, "logits/rejected": -2.2094528675079346, "logps/chosen": -423.536376953125, "logps/rejected": -366.45751953125, "loss": 0.6195, "rewards/accuracies": 0.75, "rewards/chosen": 0.2512151598930359, "rewards/margins": 0.1839398741722107, "rewards/rejected": 0.06727529317140579, "step": 899 }, { "epoch": 0.13918422578774406, "grad_norm": 8.064802169799805, "learning_rate": 2.3195876288659796e-06, "logits/chosen": 2.946336030960083, "logits/rejected": 5.952096939086914, "logps/chosen": -200.036865234375, "logps/rejected": -207.7844696044922, "loss": 0.6429, "rewards/accuracies": 0.75, "rewards/chosen": 0.11251163482666016, "rewards/margins": 0.11533677577972412, "rewards/rejected": -0.0028251418843865395, "step": 900 }, { "epoch": 0.1393388749275082, "grad_norm": 3.7710654735565186, "learning_rate": 2.3221649484536085e-06, "logits/chosen": 7.682502746582031, "logits/rejected": 6.702328205108643, "logps/chosen": -202.08265686035156, "logps/rejected": -177.52894592285156, "loss": 0.6666, "rewards/accuracies": 0.5, "rewards/chosen": 0.08657179772853851, "rewards/margins": 0.058722883462905884, "rewards/rejected": 0.02784891426563263, "step": 901 }, { "epoch": 0.13949352406727236, "grad_norm": 5.264594554901123, "learning_rate": 2.3247422680412373e-06, "logits/chosen": 12.423885345458984, "logits/rejected": 9.60908317565918, "logps/chosen": -234.58888244628906, "logps/rejected": -198.98355102539062, "loss": 0.7939, "rewards/accuracies": 0.375, "rewards/chosen": -0.0695679634809494, "rewards/margins": -0.1825190633535385, "rewards/rejected": 0.11295108497142792, "step": 902 }, { "epoch": 0.13964817320703654, "grad_norm": 4.88804817199707, "learning_rate": 2.3273195876288662e-06, "logits/chosen": 6.858978271484375, "logits/rejected": 7.906588554382324, "logps/chosen": -266.4230041503906, "logps/rejected": -273.89398193359375, "loss": 0.671, "rewards/accuracies": 0.5, "rewards/chosen": 0.08675932884216309, "rewards/margins": 0.05689845606684685, "rewards/rejected": 0.02986087277531624, "step": 903 }, { "epoch": 0.1398028223468007, "grad_norm": 5.948923587799072, "learning_rate": 2.329896907216495e-06, "logits/chosen": 10.992117881774902, "logits/rejected": 6.869849681854248, "logps/chosen": -309.8206481933594, "logps/rejected": -274.9728698730469, "loss": 0.6462, "rewards/accuracies": 0.625, "rewards/chosen": 0.08765468746423721, "rewards/margins": 0.12816838920116425, "rewards/rejected": -0.04051370918750763, "step": 904 }, { "epoch": 0.13995747148656484, "grad_norm": 4.1175031661987305, "learning_rate": 2.332474226804124e-06, "logits/chosen": 11.287524223327637, "logits/rejected": 7.089748382568359, "logps/chosen": -247.5906524658203, "logps/rejected": -220.42759704589844, "loss": 0.6206, "rewards/accuracies": 0.625, "rewards/chosen": 0.09364671260118484, "rewards/margins": 0.16723881661891937, "rewards/rejected": -0.07359209656715393, "step": 905 }, { "epoch": 0.14011212062632902, "grad_norm": 6.615764141082764, "learning_rate": 2.335051546391753e-06, "logits/chosen": 12.893607139587402, "logits/rejected": 8.206303596496582, "logps/chosen": -479.9052429199219, "logps/rejected": -372.7142333984375, "loss": 0.6399, "rewards/accuracies": 0.5, "rewards/chosen": 0.2191700041294098, "rewards/margins": 0.14063653349876404, "rewards/rejected": 0.07853345572948456, "step": 906 }, { "epoch": 0.14026676976609317, "grad_norm": 4.0273823738098145, "learning_rate": 2.3376288659793817e-06, "logits/chosen": 5.282630443572998, "logits/rejected": 6.603236198425293, "logps/chosen": -156.4139862060547, "logps/rejected": -171.40670776367188, "loss": 0.6445, "rewards/accuracies": 0.75, "rewards/chosen": 0.06814494729042053, "rewards/margins": 0.11642909049987793, "rewards/rejected": -0.048284150660037994, "step": 907 }, { "epoch": 0.14042141890585733, "grad_norm": 7.370420932769775, "learning_rate": 2.3402061855670106e-06, "logits/chosen": 8.733667373657227, "logits/rejected": 4.371000289916992, "logps/chosen": -297.81390380859375, "logps/rejected": -174.73419189453125, "loss": 0.8273, "rewards/accuracies": 0.0, "rewards/chosen": -0.11691976338624954, "rewards/margins": -0.24147753417491913, "rewards/rejected": 0.12455777823925018, "step": 908 }, { "epoch": 0.1405760680456215, "grad_norm": 5.129427433013916, "learning_rate": 2.3427835051546394e-06, "logits/chosen": 9.334989547729492, "logits/rejected": 10.864970207214355, "logps/chosen": -289.06121826171875, "logps/rejected": -258.4640197753906, "loss": 0.677, "rewards/accuracies": 0.625, "rewards/chosen": 0.03410263732075691, "rewards/margins": 0.038875989615917206, "rewards/rejected": -0.004773356020450592, "step": 909 }, { "epoch": 0.14073071718538566, "grad_norm": 6.479569911956787, "learning_rate": 2.3453608247422683e-06, "logits/chosen": 15.983110427856445, "logits/rejected": 11.189767837524414, "logps/chosen": -448.77374267578125, "logps/rejected": -292.93084716796875, "loss": 0.7404, "rewards/accuracies": 0.625, "rewards/chosen": 0.11327371001243591, "rewards/margins": -0.028903141617774963, "rewards/rejected": 0.14217686653137207, "step": 910 }, { "epoch": 0.1408853663251498, "grad_norm": 10.384605407714844, "learning_rate": 2.347938144329897e-06, "logits/chosen": 13.814935684204102, "logits/rejected": 12.97150707244873, "logps/chosen": -205.5994110107422, "logps/rejected": -240.3565216064453, "loss": 0.7341, "rewards/accuracies": 0.375, "rewards/chosen": 0.015475651249289513, "rewards/margins": -0.06577196717262268, "rewards/rejected": 0.08124761283397675, "step": 911 }, { "epoch": 0.141040015464914, "grad_norm": 4.507956027984619, "learning_rate": 2.350515463917526e-06, "logits/chosen": 13.520294189453125, "logits/rejected": 10.504230499267578, "logps/chosen": -213.5272979736328, "logps/rejected": -166.77674865722656, "loss": 0.6763, "rewards/accuracies": 0.625, "rewards/chosen": -0.038066767156124115, "rewards/margins": 0.054728999733924866, "rewards/rejected": -0.09279575943946838, "step": 912 }, { "epoch": 0.14119466460467814, "grad_norm": 5.875748634338379, "learning_rate": 2.353092783505155e-06, "logits/chosen": 14.746333122253418, "logits/rejected": 9.695646286010742, "logps/chosen": -336.8843078613281, "logps/rejected": -286.27386474609375, "loss": 0.6768, "rewards/accuracies": 0.625, "rewards/chosen": 0.11925964802503586, "rewards/margins": 0.04703053832054138, "rewards/rejected": 0.07222909480333328, "step": 913 }, { "epoch": 0.1413493137444423, "grad_norm": 7.717573165893555, "learning_rate": 2.355670103092784e-06, "logits/chosen": 10.052270889282227, "logits/rejected": 11.42772388458252, "logps/chosen": -309.8962097167969, "logps/rejected": -310.22540283203125, "loss": 0.8004, "rewards/accuracies": 0.5, "rewards/chosen": 0.023360159248113632, "rewards/margins": -0.15259763598442078, "rewards/rejected": 0.1759577989578247, "step": 914 }, { "epoch": 0.14150396288420647, "grad_norm": 6.7534613609313965, "learning_rate": 2.3582474226804127e-06, "logits/chosen": 5.907132148742676, "logits/rejected": 10.063715934753418, "logps/chosen": -264.49053955078125, "logps/rejected": -345.8446044921875, "loss": 0.6768, "rewards/accuracies": 0.375, "rewards/chosen": 0.043520018458366394, "rewards/margins": 0.04153170436620712, "rewards/rejected": 0.0019883131608366966, "step": 915 }, { "epoch": 0.14165861202397062, "grad_norm": 6.351677894592285, "learning_rate": 2.3608247422680415e-06, "logits/chosen": 8.423628807067871, "logits/rejected": 8.350414276123047, "logps/chosen": -267.1183776855469, "logps/rejected": -242.9943084716797, "loss": 0.7203, "rewards/accuracies": 0.5, "rewards/chosen": 0.02573833242058754, "rewards/margins": -0.041520215570926666, "rewards/rejected": 0.0672585517168045, "step": 916 }, { "epoch": 0.14181326116373477, "grad_norm": 5.402352809906006, "learning_rate": 2.3634020618556704e-06, "logits/chosen": 8.543071746826172, "logits/rejected": 6.9717206954956055, "logps/chosen": -259.31573486328125, "logps/rejected": -240.54495239257812, "loss": 0.7229, "rewards/accuracies": 0.5, "rewards/chosen": -0.004346180707216263, "rewards/margins": -0.04667573422193527, "rewards/rejected": 0.04232954978942871, "step": 917 }, { "epoch": 0.14196791030349892, "grad_norm": 6.482724189758301, "learning_rate": 2.3659793814432993e-06, "logits/chosen": 10.48664379119873, "logits/rejected": 2.9964404106140137, "logps/chosen": -276.0944519042969, "logps/rejected": -153.4932403564453, "loss": 0.6688, "rewards/accuracies": 0.5, "rewards/chosen": 0.031136134639382362, "rewards/margins": 0.057294465601444244, "rewards/rejected": -0.02615833468735218, "step": 918 }, { "epoch": 0.1421225594432631, "grad_norm": 5.261061668395996, "learning_rate": 2.3685567010309277e-06, "logits/chosen": 8.841046333312988, "logits/rejected": 12.002863883972168, "logps/chosen": -242.5989990234375, "logps/rejected": -229.36228942871094, "loss": 0.7317, "rewards/accuracies": 0.375, "rewards/chosen": 0.08840465545654297, "rewards/margins": -0.05870799720287323, "rewards/rejected": 0.1471126675605774, "step": 919 }, { "epoch": 0.14227720858302725, "grad_norm": 7.081851482391357, "learning_rate": 2.3711340206185566e-06, "logits/chosen": 10.484249114990234, "logits/rejected": 10.433899879455566, "logps/chosen": -339.4588623046875, "logps/rejected": -397.8988037109375, "loss": 0.6804, "rewards/accuracies": 0.625, "rewards/chosen": 0.20025300979614258, "rewards/margins": 0.05834160000085831, "rewards/rejected": 0.14191141724586487, "step": 920 }, { "epoch": 0.1424318577227914, "grad_norm": 4.927915096282959, "learning_rate": 2.3737113402061855e-06, "logits/chosen": 8.86042308807373, "logits/rejected": 8.4859619140625, "logps/chosen": -257.8925476074219, "logps/rejected": -276.67095947265625, "loss": 0.696, "rewards/accuracies": 0.5, "rewards/chosen": 0.11479340493679047, "rewards/margins": 0.002673577517271042, "rewards/rejected": 0.11211982369422913, "step": 921 }, { "epoch": 0.14258650686255558, "grad_norm": 5.104118824005127, "learning_rate": 2.3762886597938144e-06, "logits/chosen": 7.407227516174316, "logits/rejected": 10.410012245178223, "logps/chosen": -181.71759033203125, "logps/rejected": -241.81973266601562, "loss": 0.7042, "rewards/accuracies": 0.375, "rewards/chosen": 0.045998621731996536, "rewards/margins": -0.004401572048664093, "rewards/rejected": 0.050400182604789734, "step": 922 }, { "epoch": 0.14274115600231974, "grad_norm": 3.99607253074646, "learning_rate": 2.3788659793814432e-06, "logits/chosen": 13.399528503417969, "logits/rejected": 10.451894760131836, "logps/chosen": -239.62503051757812, "logps/rejected": -196.5133056640625, "loss": 0.6565, "rewards/accuracies": 0.625, "rewards/chosen": 0.13701358437538147, "rewards/margins": 0.08184747397899628, "rewards/rejected": 0.055166102945804596, "step": 923 }, { "epoch": 0.1428958051420839, "grad_norm": 4.764994144439697, "learning_rate": 2.381443298969072e-06, "logits/chosen": 7.255903720855713, "logits/rejected": 4.167789936065674, "logps/chosen": -216.55809020996094, "logps/rejected": -162.16822814941406, "loss": 0.6769, "rewards/accuracies": 0.5, "rewards/chosen": 0.12852072715759277, "rewards/margins": 0.0515337809920311, "rewards/rejected": 0.07698693871498108, "step": 924 }, { "epoch": 0.14305045428184807, "grad_norm": 6.34387731552124, "learning_rate": 2.3840206185567014e-06, "logits/chosen": 11.356218338012695, "logits/rejected": 6.107921600341797, "logps/chosen": -232.00921630859375, "logps/rejected": -195.72372436523438, "loss": 0.6688, "rewards/accuracies": 0.5, "rewards/chosen": 0.04922781139612198, "rewards/margins": 0.07268504798412323, "rewards/rejected": -0.0234572384506464, "step": 925 }, { "epoch": 0.14320510342161222, "grad_norm": 8.591683387756348, "learning_rate": 2.3865979381443303e-06, "logits/chosen": 4.65860652923584, "logits/rejected": 3.262638568878174, "logps/chosen": -295.3010559082031, "logps/rejected": -261.58953857421875, "loss": 0.6899, "rewards/accuracies": 0.5, "rewards/chosen": 0.05151667818427086, "rewards/margins": 0.06415456533432007, "rewards/rejected": -0.012637898325920105, "step": 926 }, { "epoch": 0.14335975256137637, "grad_norm": 4.3053879737854, "learning_rate": 2.389175257731959e-06, "logits/chosen": 11.637528419494629, "logits/rejected": 5.480941295623779, "logps/chosen": -199.94320678710938, "logps/rejected": -131.50418090820312, "loss": 0.656, "rewards/accuracies": 0.625, "rewards/chosen": 0.01199965551495552, "rewards/margins": 0.08124446868896484, "rewards/rejected": -0.06924481689929962, "step": 927 }, { "epoch": 0.14351440170114055, "grad_norm": 4.252354621887207, "learning_rate": 2.391752577319588e-06, "logits/chosen": 8.545031547546387, "logits/rejected": 5.23971700668335, "logps/chosen": -209.64871215820312, "logps/rejected": -168.820556640625, "loss": 0.6862, "rewards/accuracies": 0.25, "rewards/chosen": 0.12477627396583557, "rewards/margins": 0.04728969931602478, "rewards/rejected": 0.0774865597486496, "step": 928 }, { "epoch": 0.1436690508409047, "grad_norm": 4.484482765197754, "learning_rate": 2.394329896907217e-06, "logits/chosen": 11.50522518157959, "logits/rejected": 6.6114325523376465, "logps/chosen": -207.19639587402344, "logps/rejected": -205.10179138183594, "loss": 0.6751, "rewards/accuracies": 0.5, "rewards/chosen": 0.1282355785369873, "rewards/margins": 0.05071539431810379, "rewards/rejected": 0.07752018421888351, "step": 929 }, { "epoch": 0.14382369998066885, "grad_norm": 6.674232482910156, "learning_rate": 2.3969072164948458e-06, "logits/chosen": 12.247712135314941, "logits/rejected": 9.308394432067871, "logps/chosen": -376.05029296875, "logps/rejected": -356.26300048828125, "loss": 0.6671, "rewards/accuracies": 0.5, "rewards/chosen": 0.1441488265991211, "rewards/margins": 0.06759348511695862, "rewards/rejected": 0.07655534893274307, "step": 930 }, { "epoch": 0.14397834912043303, "grad_norm": 6.294856071472168, "learning_rate": 2.3994845360824746e-06, "logits/chosen": 9.637535095214844, "logits/rejected": 6.786552429199219, "logps/chosen": -359.70330810546875, "logps/rejected": -263.89666748046875, "loss": 0.6523, "rewards/accuracies": 0.75, "rewards/chosen": 0.12031211704015732, "rewards/margins": 0.11500054597854614, "rewards/rejected": 0.005311585962772369, "step": 931 }, { "epoch": 0.14413299826019718, "grad_norm": 7.539927005767822, "learning_rate": 2.4020618556701035e-06, "logits/chosen": 12.931175231933594, "logits/rejected": 6.994305610656738, "logps/chosen": -425.36505126953125, "logps/rejected": -248.13870239257812, "loss": 0.681, "rewards/accuracies": 0.25, "rewards/chosen": -0.020102309063076973, "rewards/margins": 0.04040088132023811, "rewards/rejected": -0.060503195971250534, "step": 932 }, { "epoch": 0.14428764739996133, "grad_norm": 4.166161060333252, "learning_rate": 2.4046391752577324e-06, "logits/chosen": 13.583078384399414, "logits/rejected": 8.128865242004395, "logps/chosen": -266.9848937988281, "logps/rejected": -185.58139038085938, "loss": 0.6447, "rewards/accuracies": 0.75, "rewards/chosen": 0.11701817810535431, "rewards/margins": 0.11121020466089249, "rewards/rejected": 0.005807972513139248, "step": 933 }, { "epoch": 0.14444229653972548, "grad_norm": 4.818428993225098, "learning_rate": 2.4072164948453612e-06, "logits/chosen": 9.980958938598633, "logits/rejected": 6.4126715660095215, "logps/chosen": -236.86404418945312, "logps/rejected": -190.4046630859375, "loss": 0.7017, "rewards/accuracies": 0.5, "rewards/chosen": 0.17218610644340515, "rewards/margins": -0.009835533797740936, "rewards/rejected": 0.1820216178894043, "step": 934 }, { "epoch": 0.14459694567948966, "grad_norm": 4.900993347167969, "learning_rate": 2.40979381443299e-06, "logits/chosen": 16.548980712890625, "logits/rejected": 13.31427001953125, "logps/chosen": -278.5684814453125, "logps/rejected": -290.00640869140625, "loss": 0.7218, "rewards/accuracies": 0.375, "rewards/chosen": 0.08795805275440216, "rewards/margins": -0.0549248643219471, "rewards/rejected": 0.14288291335105896, "step": 935 }, { "epoch": 0.14475159481925381, "grad_norm": 5.169083595275879, "learning_rate": 2.412371134020619e-06, "logits/chosen": 13.235276222229004, "logits/rejected": 8.044557571411133, "logps/chosen": -321.1685485839844, "logps/rejected": -253.15933227539062, "loss": 0.6085, "rewards/accuracies": 0.75, "rewards/chosen": 0.3566027581691742, "rewards/margins": 0.19021005928516388, "rewards/rejected": 0.1663927137851715, "step": 936 }, { "epoch": 0.14490624395901797, "grad_norm": 5.428519248962402, "learning_rate": 2.414948453608248e-06, "logits/chosen": 13.332061767578125, "logits/rejected": 6.041755676269531, "logps/chosen": -327.3083801269531, "logps/rejected": -182.99327087402344, "loss": 0.6194, "rewards/accuracies": 0.75, "rewards/chosen": 0.03844413906335831, "rewards/margins": 0.16897039115428925, "rewards/rejected": -0.13052625954151154, "step": 937 }, { "epoch": 0.14506089309878215, "grad_norm": 4.59999418258667, "learning_rate": 2.4175257731958763e-06, "logits/chosen": 7.4586687088012695, "logits/rejected": 11.942691802978516, "logps/chosen": -253.45135498046875, "logps/rejected": -333.246826171875, "loss": 0.6754, "rewards/accuracies": 0.625, "rewards/chosen": 0.07155981659889221, "rewards/margins": 0.04269304499030113, "rewards/rejected": 0.028866764158010483, "step": 938 }, { "epoch": 0.1452155422385463, "grad_norm": 4.321788787841797, "learning_rate": 2.420103092783505e-06, "logits/chosen": 10.987452507019043, "logits/rejected": 5.7488250732421875, "logps/chosen": -190.94424438476562, "logps/rejected": -192.43084716796875, "loss": 0.6624, "rewards/accuracies": 0.625, "rewards/chosen": 0.06689973175525665, "rewards/margins": 0.07895570248365402, "rewards/rejected": -0.01205596886575222, "step": 939 }, { "epoch": 0.14537019137831045, "grad_norm": 7.607157230377197, "learning_rate": 2.422680412371134e-06, "logits/chosen": 9.883402824401855, "logits/rejected": 6.4782843589782715, "logps/chosen": -209.8402099609375, "logps/rejected": -215.99461364746094, "loss": 0.722, "rewards/accuracies": 0.375, "rewards/chosen": 0.12372054904699326, "rewards/margins": -0.02321319654583931, "rewards/rejected": 0.14693374931812286, "step": 940 }, { "epoch": 0.14552484051807463, "grad_norm": 4.300503730773926, "learning_rate": 2.425257731958763e-06, "logits/chosen": 11.408533096313477, "logits/rejected": 12.018718719482422, "logps/chosen": -218.31005859375, "logps/rejected": -237.34414672851562, "loss": 0.6743, "rewards/accuracies": 0.5, "rewards/chosen": 0.10544481873512268, "rewards/margins": 0.042931847274303436, "rewards/rejected": 0.06251297146081924, "step": 941 }, { "epoch": 0.14567948965783878, "grad_norm": 3.9465460777282715, "learning_rate": 2.427835051546392e-06, "logits/chosen": 11.535292625427246, "logits/rejected": 7.290932655334473, "logps/chosen": -203.159912109375, "logps/rejected": -154.8922882080078, "loss": 0.6419, "rewards/accuracies": 0.625, "rewards/chosen": 0.12058965861797333, "rewards/margins": 0.11256952583789825, "rewards/rejected": 0.008020136505365372, "step": 942 }, { "epoch": 0.14583413879760293, "grad_norm": 7.470331192016602, "learning_rate": 2.4304123711340207e-06, "logits/chosen": 6.8543548583984375, "logits/rejected": 3.3807930946350098, "logps/chosen": -217.67507934570312, "logps/rejected": -208.99429321289062, "loss": 0.6889, "rewards/accuracies": 0.625, "rewards/chosen": 0.16164270043373108, "rewards/margins": 0.032285548746585846, "rewards/rejected": 0.12935715913772583, "step": 943 }, { "epoch": 0.1459887879373671, "grad_norm": 10.592182159423828, "learning_rate": 2.4329896907216495e-06, "logits/chosen": 12.615392684936523, "logits/rejected": 5.006398677825928, "logps/chosen": -511.2911376953125, "logps/rejected": -286.5256042480469, "loss": 0.6697, "rewards/accuracies": 0.75, "rewards/chosen": 0.17020931839942932, "rewards/margins": 0.05973578244447708, "rewards/rejected": 0.11047354340553284, "step": 944 }, { "epoch": 0.14614343707713126, "grad_norm": 5.410158157348633, "learning_rate": 2.4355670103092784e-06, "logits/chosen": 10.480123519897461, "logits/rejected": 7.627450942993164, "logps/chosen": -333.2989196777344, "logps/rejected": -249.7787628173828, "loss": 0.7145, "rewards/accuracies": 0.375, "rewards/chosen": 0.14121408760547638, "rewards/margins": -0.02265309914946556, "rewards/rejected": 0.16386719048023224, "step": 945 }, { "epoch": 0.1462980862168954, "grad_norm": 6.3184404373168945, "learning_rate": 2.4381443298969073e-06, "logits/chosen": 7.971908092498779, "logits/rejected": 11.381896018981934, "logps/chosen": -269.70904541015625, "logps/rejected": -297.00341796875, "loss": 0.6728, "rewards/accuracies": 0.75, "rewards/chosen": 0.3306790292263031, "rewards/margins": 0.054555416107177734, "rewards/rejected": 0.27612361311912537, "step": 946 }, { "epoch": 0.1464527353566596, "grad_norm": 4.406201362609863, "learning_rate": 2.440721649484536e-06, "logits/chosen": 15.634912490844727, "logits/rejected": 8.238141059875488, "logps/chosen": -226.30343627929688, "logps/rejected": -154.6337890625, "loss": 0.6622, "rewards/accuracies": 0.375, "rewards/chosen": 0.25033968687057495, "rewards/margins": 0.07584743201732635, "rewards/rejected": 0.1744922399520874, "step": 947 }, { "epoch": 0.14660738449642374, "grad_norm": 5.298978328704834, "learning_rate": 2.443298969072165e-06, "logits/chosen": 10.781770706176758, "logits/rejected": 7.479361057281494, "logps/chosen": -321.77734375, "logps/rejected": -233.26644897460938, "loss": 0.6438, "rewards/accuracies": 0.75, "rewards/chosen": 0.12140927463769913, "rewards/margins": 0.12022820115089417, "rewards/rejected": 0.0011810734868049622, "step": 948 }, { "epoch": 0.1467620336361879, "grad_norm": 4.054439067840576, "learning_rate": 2.445876288659794e-06, "logits/chosen": 9.1417818069458, "logits/rejected": 5.940650939941406, "logps/chosen": -210.92037963867188, "logps/rejected": -197.6656951904297, "loss": 0.6272, "rewards/accuracies": 0.875, "rewards/chosen": 0.11081576347351074, "rewards/margins": 0.1395270824432373, "rewards/rejected": -0.028711318969726562, "step": 949 }, { "epoch": 0.14691668277595205, "grad_norm": 4.954822540283203, "learning_rate": 2.4484536082474228e-06, "logits/chosen": 8.866207122802734, "logits/rejected": 7.824855804443359, "logps/chosen": -202.92568969726562, "logps/rejected": -220.5973663330078, "loss": 0.7237, "rewards/accuracies": 0.5, "rewards/chosen": 0.1889454871416092, "rewards/margins": -0.035764746367931366, "rewards/rejected": 0.22471022605895996, "step": 950 }, { "epoch": 0.14707133191571622, "grad_norm": 4.961806297302246, "learning_rate": 2.4510309278350516e-06, "logits/chosen": 6.981586456298828, "logits/rejected": 3.450075626373291, "logps/chosen": -274.5780944824219, "logps/rejected": -225.8917694091797, "loss": 0.6571, "rewards/accuracies": 0.625, "rewards/chosen": 0.1524912416934967, "rewards/margins": 0.07941422611474991, "rewards/rejected": 0.07307702302932739, "step": 951 }, { "epoch": 0.14722598105548038, "grad_norm": 4.451239585876465, "learning_rate": 2.4536082474226805e-06, "logits/chosen": 11.398658752441406, "logits/rejected": 0.2986936569213867, "logps/chosen": -301.5375061035156, "logps/rejected": -183.1888885498047, "loss": 0.6436, "rewards/accuracies": 0.75, "rewards/chosen": 0.22814522683620453, "rewards/margins": 0.12175770103931427, "rewards/rejected": 0.10638751834630966, "step": 952 }, { "epoch": 0.14738063019524453, "grad_norm": 9.133742332458496, "learning_rate": 2.4561855670103094e-06, "logits/chosen": 11.362016677856445, "logits/rejected": 10.932202339172363, "logps/chosen": -322.4869079589844, "logps/rejected": -290.24273681640625, "loss": 0.6921, "rewards/accuracies": 0.625, "rewards/chosen": 0.12620776891708374, "rewards/margins": 0.011840078979730606, "rewards/rejected": 0.11436767131090164, "step": 953 }, { "epoch": 0.1475352793350087, "grad_norm": 4.968396186828613, "learning_rate": 2.4587628865979383e-06, "logits/chosen": 11.792253494262695, "logits/rejected": 11.920696258544922, "logps/chosen": -252.83316040039062, "logps/rejected": -294.8987731933594, "loss": 0.7015, "rewards/accuracies": 0.25, "rewards/chosen": 0.14753074944019318, "rewards/margins": -0.0014636069536209106, "rewards/rejected": 0.1489943563938141, "step": 954 }, { "epoch": 0.14768992847477286, "grad_norm": 4.555266857147217, "learning_rate": 2.4613402061855676e-06, "logits/chosen": 9.884590148925781, "logits/rejected": 7.298913478851318, "logps/chosen": -258.48516845703125, "logps/rejected": -205.30691528320312, "loss": 0.6705, "rewards/accuracies": 0.75, "rewards/chosen": 0.19206787645816803, "rewards/margins": 0.048691462725400925, "rewards/rejected": 0.1433764100074768, "step": 955 }, { "epoch": 0.147844577614537, "grad_norm": 7.090183734893799, "learning_rate": 2.463917525773196e-06, "logits/chosen": 10.31575870513916, "logits/rejected": 11.121665000915527, "logps/chosen": -268.80474853515625, "logps/rejected": -319.9278564453125, "loss": 0.738, "rewards/accuracies": 0.5, "rewards/chosen": 0.20418377220630646, "rewards/margins": -0.06507459282875061, "rewards/rejected": 0.2692583501338959, "step": 956 }, { "epoch": 0.1479992267543012, "grad_norm": 18.6771297454834, "learning_rate": 2.466494845360825e-06, "logits/chosen": 13.731008529663086, "logits/rejected": 8.2936429977417, "logps/chosen": -417.11334228515625, "logps/rejected": -276.3769836425781, "loss": 0.7469, "rewards/accuracies": 0.5, "rewards/chosen": 0.30361098051071167, "rewards/margins": -0.09678135812282562, "rewards/rejected": 0.4003923535346985, "step": 957 }, { "epoch": 0.14815387589406534, "grad_norm": 5.036862850189209, "learning_rate": 2.4690721649484537e-06, "logits/chosen": 5.275806427001953, "logits/rejected": 12.655960083007812, "logps/chosen": -152.2441864013672, "logps/rejected": -290.1501159667969, "loss": 0.68, "rewards/accuracies": 0.375, "rewards/chosen": 0.20953655242919922, "rewards/margins": 0.030674651265144348, "rewards/rejected": 0.17886190116405487, "step": 958 }, { "epoch": 0.1483085250338295, "grad_norm": 7.2273101806640625, "learning_rate": 2.4716494845360826e-06, "logits/chosen": 11.772583961486816, "logits/rejected": 7.524477005004883, "logps/chosen": -416.39324951171875, "logps/rejected": -290.85516357421875, "loss": 0.7542, "rewards/accuracies": 0.5, "rewards/chosen": 0.06137719005346298, "rewards/margins": -0.1101866289973259, "rewards/rejected": 0.17156381905078888, "step": 959 }, { "epoch": 0.14846317417359367, "grad_norm": 5.530838966369629, "learning_rate": 2.4742268041237115e-06, "logits/chosen": 12.333992004394531, "logits/rejected": 6.999913215637207, "logps/chosen": -341.40716552734375, "logps/rejected": -283.63714599609375, "loss": 0.6264, "rewards/accuracies": 0.5, "rewards/chosen": 0.29755330085754395, "rewards/margins": 0.1672951579093933, "rewards/rejected": 0.13025812804698944, "step": 960 }, { "epoch": 0.14861782331335782, "grad_norm": 5.298342227935791, "learning_rate": 2.4768041237113404e-06, "logits/chosen": 12.743539810180664, "logits/rejected": 6.548881530761719, "logps/chosen": -312.2908630371094, "logps/rejected": -227.13064575195312, "loss": 0.6071, "rewards/accuracies": 0.75, "rewards/chosen": 0.3252825438976288, "rewards/margins": 0.20396390557289124, "rewards/rejected": 0.12131863087415695, "step": 961 }, { "epoch": 0.14877247245312197, "grad_norm": 5.345992088317871, "learning_rate": 2.4793814432989692e-06, "logits/chosen": 6.8151116371154785, "logits/rejected": 9.889668464660645, "logps/chosen": -192.15992736816406, "logps/rejected": -223.66824340820312, "loss": 0.7355, "rewards/accuracies": 0.125, "rewards/chosen": 0.10050278156995773, "rewards/margins": -0.07099009305238724, "rewards/rejected": 0.17149285972118378, "step": 962 }, { "epoch": 0.14892712159288615, "grad_norm": 5.785558700561523, "learning_rate": 2.481958762886598e-06, "logits/chosen": 5.8352203369140625, "logits/rejected": 8.236224174499512, "logps/chosen": -288.91510009765625, "logps/rejected": -382.5042724609375, "loss": 0.651, "rewards/accuracies": 0.625, "rewards/chosen": 0.3705422580242157, "rewards/margins": 0.12195216119289398, "rewards/rejected": 0.24859008193016052, "step": 963 }, { "epoch": 0.1490817707326503, "grad_norm": 5.374144554138184, "learning_rate": 2.484536082474227e-06, "logits/chosen": 16.47252655029297, "logits/rejected": 11.239618301391602, "logps/chosen": -274.526611328125, "logps/rejected": -278.0457763671875, "loss": 0.7024, "rewards/accuracies": 0.5, "rewards/chosen": 0.21998809278011322, "rewards/margins": -0.016491007059812546, "rewards/rejected": 0.23647911846637726, "step": 964 }, { "epoch": 0.14923641987241446, "grad_norm": 4.982558727264404, "learning_rate": 2.487113402061856e-06, "logits/chosen": 6.582665920257568, "logits/rejected": 5.46577262878418, "logps/chosen": -309.485107421875, "logps/rejected": -185.4723663330078, "loss": 0.6505, "rewards/accuracies": 0.5, "rewards/chosen": 0.28469952940940857, "rewards/margins": 0.10418701171875, "rewards/rejected": 0.18051251769065857, "step": 965 }, { "epoch": 0.1493910690121786, "grad_norm": 5.450310707092285, "learning_rate": 2.4896907216494847e-06, "logits/chosen": 15.865681648254395, "logits/rejected": 8.391288757324219, "logps/chosen": -343.5689392089844, "logps/rejected": -308.62872314453125, "loss": 0.6467, "rewards/accuracies": 0.75, "rewards/chosen": 0.3063068389892578, "rewards/margins": 0.09905394911766052, "rewards/rejected": 0.2072528898715973, "step": 966 }, { "epoch": 0.14954571815194279, "grad_norm": 16.31694221496582, "learning_rate": 2.4922680412371136e-06, "logits/chosen": 10.372198104858398, "logits/rejected": 5.018614292144775, "logps/chosen": -255.9956512451172, "logps/rejected": -165.75328063964844, "loss": 0.757, "rewards/accuracies": 0.25, "rewards/chosen": 0.04015684127807617, "rewards/margins": -0.10746342688798904, "rewards/rejected": 0.14762026071548462, "step": 967 }, { "epoch": 0.14970036729170694, "grad_norm": 5.886409282684326, "learning_rate": 2.4948453608247425e-06, "logits/chosen": 9.90416145324707, "logits/rejected": 7.3132476806640625, "logps/chosen": -318.612548828125, "logps/rejected": -303.4840087890625, "loss": 0.5518, "rewards/accuracies": 1.0, "rewards/chosen": 0.5386236310005188, "rewards/margins": 0.31132185459136963, "rewards/rejected": 0.22730179131031036, "step": 968 }, { "epoch": 0.1498550164314711, "grad_norm": 9.646836280822754, "learning_rate": 2.4974226804123713e-06, "logits/chosen": 14.051658630371094, "logits/rejected": 7.878078460693359, "logps/chosen": -494.6185607910156, "logps/rejected": -331.8997802734375, "loss": 0.7778, "rewards/accuracies": 0.5, "rewards/chosen": 0.27219200134277344, "rewards/margins": -0.10797785967588425, "rewards/rejected": 0.3801698684692383, "step": 969 }, { "epoch": 0.15000966557123527, "grad_norm": 4.787323474884033, "learning_rate": 2.5e-06, "logits/chosen": 13.235980987548828, "logits/rejected": 9.642841339111328, "logps/chosen": -281.2373046875, "logps/rejected": -190.06008911132812, "loss": 0.6396, "rewards/accuracies": 0.75, "rewards/chosen": 0.2701765298843384, "rewards/margins": 0.14296922087669373, "rewards/rejected": 0.12720732390880585, "step": 970 }, { "epoch": 0.15016431471099942, "grad_norm": 4.692470550537109, "learning_rate": 2.502577319587629e-06, "logits/chosen": 9.996617317199707, "logits/rejected": 9.307639122009277, "logps/chosen": -247.04067993164062, "logps/rejected": -222.12002563476562, "loss": 0.6645, "rewards/accuracies": 0.5, "rewards/chosen": 0.27485352754592896, "rewards/margins": 0.07333764433860779, "rewards/rejected": 0.20151585340499878, "step": 971 }, { "epoch": 0.15031896385076357, "grad_norm": 4.71809720993042, "learning_rate": 2.505154639175258e-06, "logits/chosen": 5.420339107513428, "logits/rejected": 3.3367981910705566, "logps/chosen": -228.22998046875, "logps/rejected": -163.241943359375, "loss": 0.6923, "rewards/accuracies": 0.5, "rewards/chosen": 0.14215907454490662, "rewards/margins": 0.008592421188950539, "rewards/rejected": 0.13356666266918182, "step": 972 }, { "epoch": 0.15047361299052775, "grad_norm": 6.231609344482422, "learning_rate": 2.507731958762887e-06, "logits/chosen": 8.344324111938477, "logits/rejected": 5.516124725341797, "logps/chosen": -328.7281494140625, "logps/rejected": -370.429443359375, "loss": 0.6591, "rewards/accuracies": 0.625, "rewards/chosen": 0.29244059324264526, "rewards/margins": 0.09757347404956818, "rewards/rejected": 0.19486714899539948, "step": 973 }, { "epoch": 0.1506282621302919, "grad_norm": 7.182100296020508, "learning_rate": 2.5103092783505157e-06, "logits/chosen": 7.675025939941406, "logits/rejected": 1.1511075496673584, "logps/chosen": -493.1493225097656, "logps/rejected": -235.66189575195312, "loss": 0.7178, "rewards/accuracies": 0.5, "rewards/chosen": 0.11076326668262482, "rewards/margins": -0.0384642668068409, "rewards/rejected": 0.14922752976417542, "step": 974 }, { "epoch": 0.15078291127005605, "grad_norm": 6.490096092224121, "learning_rate": 2.5128865979381446e-06, "logits/chosen": 11.676152229309082, "logits/rejected": 5.522094249725342, "logps/chosen": -347.0611877441406, "logps/rejected": -304.6292419433594, "loss": 0.6271, "rewards/accuracies": 0.625, "rewards/chosen": 0.4794609546661377, "rewards/margins": 0.15278568863868713, "rewards/rejected": 0.3266752362251282, "step": 975 }, { "epoch": 0.15093756040982023, "grad_norm": 3.932523488998413, "learning_rate": 2.5154639175257734e-06, "logits/chosen": 8.104815483093262, "logits/rejected": 7.596432685852051, "logps/chosen": -187.56895446777344, "logps/rejected": -168.3927459716797, "loss": 0.7214, "rewards/accuracies": 0.25, "rewards/chosen": 0.22360765933990479, "rewards/margins": -0.04936189949512482, "rewards/rejected": 0.2729695737361908, "step": 976 }, { "epoch": 0.15109220954958438, "grad_norm": 4.268491268157959, "learning_rate": 2.5180412371134023e-06, "logits/chosen": 5.396533489227295, "logits/rejected": 8.581335067749023, "logps/chosen": -397.6007080078125, "logps/rejected": -239.90518188476562, "loss": 0.6384, "rewards/accuracies": 0.75, "rewards/chosen": 0.2249927520751953, "rewards/margins": 0.1317373365163803, "rewards/rejected": 0.0932554304599762, "step": 977 }, { "epoch": 0.15124685868934853, "grad_norm": 4.023215293884277, "learning_rate": 2.520618556701031e-06, "logits/chosen": 10.017741203308105, "logits/rejected": 9.831050872802734, "logps/chosen": -144.67971801757812, "logps/rejected": -177.0681610107422, "loss": 0.6578, "rewards/accuracies": 0.5, "rewards/chosen": 0.22600960731506348, "rewards/margins": 0.08895401656627655, "rewards/rejected": 0.13705560564994812, "step": 978 }, { "epoch": 0.1514015078291127, "grad_norm": 4.044947624206543, "learning_rate": 2.52319587628866e-06, "logits/chosen": 13.742888450622559, "logits/rejected": 6.837340354919434, "logps/chosen": -289.7095642089844, "logps/rejected": -149.99945068359375, "loss": 0.5796, "rewards/accuracies": 0.875, "rewards/chosen": 0.3312702775001526, "rewards/margins": 0.2694663405418396, "rewards/rejected": 0.0618039146065712, "step": 979 }, { "epoch": 0.15155615696887687, "grad_norm": 4.179415702819824, "learning_rate": 2.525773195876289e-06, "logits/chosen": 9.955364227294922, "logits/rejected": 7.088047027587891, "logps/chosen": -235.14581298828125, "logps/rejected": -220.76541137695312, "loss": 0.6716, "rewards/accuracies": 0.625, "rewards/chosen": 0.22456562519073486, "rewards/margins": 0.0635136216878891, "rewards/rejected": 0.16105200350284576, "step": 980 }, { "epoch": 0.15171080610864102, "grad_norm": 5.6151275634765625, "learning_rate": 2.528350515463918e-06, "logits/chosen": 16.08432960510254, "logits/rejected": 9.709497451782227, "logps/chosen": -319.97003173828125, "logps/rejected": -290.68487548828125, "loss": 0.7048, "rewards/accuracies": 0.5, "rewards/chosen": 0.2854556143283844, "rewards/margins": 0.00053425133228302, "rewards/rejected": 0.2849213778972626, "step": 981 }, { "epoch": 0.15186545524840517, "grad_norm": 5.1181769371032715, "learning_rate": 2.5309278350515467e-06, "logits/chosen": 11.758771896362305, "logits/rejected": 11.075485229492188, "logps/chosen": -256.3808898925781, "logps/rejected": -235.63259887695312, "loss": 0.6035, "rewards/accuracies": 0.75, "rewards/chosen": 0.3456147313117981, "rewards/margins": 0.23047533631324768, "rewards/rejected": 0.11513939499855042, "step": 982 }, { "epoch": 0.15202010438816935, "grad_norm": 5.064818859100342, "learning_rate": 2.5335051546391755e-06, "logits/chosen": 4.538435935974121, "logits/rejected": 0.7975058555603027, "logps/chosen": -129.87490844726562, "logps/rejected": -97.656982421875, "loss": 0.7387, "rewards/accuracies": 0.375, "rewards/chosen": 0.11231029778718948, "rewards/margins": -0.08480727672576904, "rewards/rejected": 0.19711756706237793, "step": 983 }, { "epoch": 0.1521747535279335, "grad_norm": 5.55540132522583, "learning_rate": 2.5360824742268044e-06, "logits/chosen": 7.44645881652832, "logits/rejected": 3.70943284034729, "logps/chosen": -354.41949462890625, "logps/rejected": -403.79052734375, "loss": 0.6173, "rewards/accuracies": 0.5, "rewards/chosen": 0.3005375862121582, "rewards/margins": 0.18250522017478943, "rewards/rejected": 0.11803236603736877, "step": 984 }, { "epoch": 0.15232940266769765, "grad_norm": 5.49982213973999, "learning_rate": 2.538659793814433e-06, "logits/chosen": 7.697822093963623, "logits/rejected": 6.096981048583984, "logps/chosen": -307.3810119628906, "logps/rejected": -237.4229736328125, "loss": 0.7645, "rewards/accuracies": 0.5, "rewards/chosen": 0.1885399967432022, "rewards/margins": -0.1180962473154068, "rewards/rejected": 0.306636244058609, "step": 985 }, { "epoch": 0.15248405180746183, "grad_norm": 4.034977912902832, "learning_rate": 2.5412371134020617e-06, "logits/chosen": 7.3290815353393555, "logits/rejected": 5.644658088684082, "logps/chosen": -197.15191650390625, "logps/rejected": -149.66915893554688, "loss": 0.6645, "rewards/accuracies": 0.5, "rewards/chosen": 0.2998509407043457, "rewards/margins": 0.06729154288768768, "rewards/rejected": 0.23255939781665802, "step": 986 }, { "epoch": 0.15263870094722598, "grad_norm": 4.73818302154541, "learning_rate": 2.5438144329896906e-06, "logits/chosen": 12.465482711791992, "logits/rejected": 7.708515644073486, "logps/chosen": -249.0347900390625, "logps/rejected": -167.53024291992188, "loss": 0.6615, "rewards/accuracies": 0.375, "rewards/chosen": 0.4345059096813202, "rewards/margins": 0.10229988396167755, "rewards/rejected": 0.3322060704231262, "step": 987 }, { "epoch": 0.15279335008699013, "grad_norm": 14.134047508239746, "learning_rate": 2.5463917525773195e-06, "logits/chosen": 11.476678848266602, "logits/rejected": 2.8608508110046387, "logps/chosen": -265.569580078125, "logps/rejected": -235.99075317382812, "loss": 0.6952, "rewards/accuracies": 0.25, "rewards/chosen": 0.25356826186180115, "rewards/margins": 0.0375649631023407, "rewards/rejected": 0.21600332856178284, "step": 988 }, { "epoch": 0.1529479992267543, "grad_norm": 6.283043384552002, "learning_rate": 2.5489690721649483e-06, "logits/chosen": 9.685466766357422, "logits/rejected": 4.425581455230713, "logps/chosen": -383.323486328125, "logps/rejected": -348.57958984375, "loss": 0.6695, "rewards/accuracies": 0.5, "rewards/chosen": 0.46346864104270935, "rewards/margins": 0.06451568007469177, "rewards/rejected": 0.3989529609680176, "step": 989 }, { "epoch": 0.15310264836651846, "grad_norm": 5.045665740966797, "learning_rate": 2.5515463917525772e-06, "logits/chosen": 16.158935546875, "logits/rejected": 7.748905658721924, "logps/chosen": -225.56414794921875, "logps/rejected": -144.9650421142578, "loss": 0.6233, "rewards/accuracies": 0.625, "rewards/chosen": 0.22278550267219543, "rewards/margins": 0.1727612167596817, "rewards/rejected": 0.05002429336309433, "step": 990 }, { "epoch": 0.1532572975062826, "grad_norm": 4.56468391418457, "learning_rate": 2.554123711340206e-06, "logits/chosen": 6.196763038635254, "logits/rejected": 9.28365421295166, "logps/chosen": -205.75367736816406, "logps/rejected": -211.6551055908203, "loss": 0.7291, "rewards/accuracies": 0.25, "rewards/chosen": 0.2946475148200989, "rewards/margins": -0.06527872383594513, "rewards/rejected": 0.3599262535572052, "step": 991 }, { "epoch": 0.1534119466460468, "grad_norm": 4.6295061111450195, "learning_rate": 2.556701030927835e-06, "logits/chosen": 12.202375411987305, "logits/rejected": 11.065433502197266, "logps/chosen": -208.89736938476562, "logps/rejected": -234.885009765625, "loss": 0.6338, "rewards/accuracies": 0.75, "rewards/chosen": 0.3158901631832123, "rewards/margins": 0.13359761238098145, "rewards/rejected": 0.18229256570339203, "step": 992 }, { "epoch": 0.15356659578581094, "grad_norm": 9.558801651000977, "learning_rate": 2.559278350515464e-06, "logits/chosen": 11.489753723144531, "logits/rejected": 4.942627429962158, "logps/chosen": -340.67266845703125, "logps/rejected": -239.7880859375, "loss": 0.6952, "rewards/accuracies": 0.5, "rewards/chosen": 0.25585898756980896, "rewards/margins": -0.0014899037778377533, "rewards/rejected": 0.257348895072937, "step": 993 }, { "epoch": 0.1537212449255751, "grad_norm": 5.386810779571533, "learning_rate": 2.5618556701030927e-06, "logits/chosen": 10.71761703491211, "logits/rejected": 5.424169540405273, "logps/chosen": -273.7106018066406, "logps/rejected": -142.55484008789062, "loss": 0.7423, "rewards/accuracies": 0.375, "rewards/chosen": 0.3274467885494232, "rewards/margins": -0.07112008333206177, "rewards/rejected": 0.3985668420791626, "step": 994 }, { "epoch": 0.15387589406533927, "grad_norm": 5.826381206512451, "learning_rate": 2.5644329896907216e-06, "logits/chosen": 14.185192108154297, "logits/rejected": 17.662160873413086, "logps/chosen": -178.9151611328125, "logps/rejected": -290.31744384765625, "loss": 0.6422, "rewards/accuracies": 0.625, "rewards/chosen": 0.33144572377204895, "rewards/margins": 0.11291008442640305, "rewards/rejected": 0.2185356169939041, "step": 995 }, { "epoch": 0.15403054320510343, "grad_norm": 9.259116172790527, "learning_rate": 2.5670103092783504e-06, "logits/chosen": 12.790938377380371, "logits/rejected": 12.904714584350586, "logps/chosen": -454.6855163574219, "logps/rejected": -398.2311096191406, "loss": 0.7704, "rewards/accuracies": 0.25, "rewards/chosen": 0.28311118483543396, "rewards/margins": -0.14085274934768677, "rewards/rejected": 0.4239639639854431, "step": 996 }, { "epoch": 0.15418519234486758, "grad_norm": 6.129758834838867, "learning_rate": 2.5695876288659793e-06, "logits/chosen": 6.6786346435546875, "logits/rejected": 10.564947128295898, "logps/chosen": -281.6134338378906, "logps/rejected": -294.8741149902344, "loss": 0.7838, "rewards/accuracies": 0.5, "rewards/chosen": 0.4145020544528961, "rewards/margins": -0.14903785288333893, "rewards/rejected": 0.5635399222373962, "step": 997 }, { "epoch": 0.15433984148463173, "grad_norm": 6.357661724090576, "learning_rate": 2.572164948453608e-06, "logits/chosen": 12.67042350769043, "logits/rejected": 7.238968372344971, "logps/chosen": -203.879150390625, "logps/rejected": -163.60195922851562, "loss": 0.6808, "rewards/accuracies": 0.625, "rewards/chosen": 0.23579970002174377, "rewards/margins": 0.03664977848529816, "rewards/rejected": 0.19914992153644562, "step": 998 }, { "epoch": 0.1544944906243959, "grad_norm": 4.323291301727295, "learning_rate": 2.574742268041237e-06, "logits/chosen": 18.202255249023438, "logits/rejected": 6.99099063873291, "logps/chosen": -288.840576171875, "logps/rejected": -187.57894897460938, "loss": 0.5939, "rewards/accuracies": 0.875, "rewards/chosen": 0.5485618710517883, "rewards/margins": 0.22565820813179016, "rewards/rejected": 0.32290369272232056, "step": 999 }, { "epoch": 0.15464913976416006, "grad_norm": 4.010663032531738, "learning_rate": 2.577319587628866e-06, "logits/chosen": 4.965423583984375, "logits/rejected": 4.6642866134643555, "logps/chosen": -201.75933837890625, "logps/rejected": -229.17315673828125, "loss": 0.648, "rewards/accuracies": 0.5, "rewards/chosen": 0.3071884512901306, "rewards/margins": 0.10149803757667542, "rewards/rejected": 0.2056904137134552, "step": 1000 }, { "epoch": 0.1548037889039242, "grad_norm": 3.9429001808166504, "learning_rate": 2.5798969072164952e-06, "logits/chosen": 8.92806625366211, "logits/rejected": 7.9105072021484375, "logps/chosen": -161.7950439453125, "logps/rejected": -157.60714721679688, "loss": 0.7161, "rewards/accuracies": 0.5, "rewards/chosen": 0.3454486131668091, "rewards/margins": -0.041734881699085236, "rewards/rejected": 0.3871834874153137, "step": 1001 }, { "epoch": 0.1549584380436884, "grad_norm": 5.263321399688721, "learning_rate": 2.582474226804124e-06, "logits/chosen": 13.339742660522461, "logits/rejected": 11.67384147644043, "logps/chosen": -297.85296630859375, "logps/rejected": -260.2172546386719, "loss": 0.6301, "rewards/accuracies": 0.875, "rewards/chosen": 0.4796229600906372, "rewards/margins": 0.14002437889575958, "rewards/rejected": 0.33959856629371643, "step": 1002 }, { "epoch": 0.15511308718345254, "grad_norm": 4.655385971069336, "learning_rate": 2.585051546391753e-06, "logits/chosen": 11.733928680419922, "logits/rejected": 4.142250061035156, "logps/chosen": -368.67431640625, "logps/rejected": -214.8388671875, "loss": 0.58, "rewards/accuracies": 0.875, "rewards/chosen": 0.6160106658935547, "rewards/margins": 0.27698814868927, "rewards/rejected": 0.33902251720428467, "step": 1003 }, { "epoch": 0.1552677363232167, "grad_norm": 8.346925735473633, "learning_rate": 2.587628865979382e-06, "logits/chosen": 9.91627311706543, "logits/rejected": 6.078705787658691, "logps/chosen": -324.4015197753906, "logps/rejected": -352.47760009765625, "loss": 0.6009, "rewards/accuracies": 0.5, "rewards/chosen": 0.43736210465431213, "rewards/margins": 0.2276042103767395, "rewards/rejected": 0.20975790917873383, "step": 1004 }, { "epoch": 0.15542238546298087, "grad_norm": 5.191592693328857, "learning_rate": 2.5902061855670107e-06, "logits/chosen": 11.86085319519043, "logits/rejected": 6.025583267211914, "logps/chosen": -260.1004638671875, "logps/rejected": -238.47109985351562, "loss": 0.6702, "rewards/accuracies": 0.5, "rewards/chosen": 0.32853877544403076, "rewards/margins": 0.07583131641149521, "rewards/rejected": 0.25270742177963257, "step": 1005 }, { "epoch": 0.15557703460274502, "grad_norm": 5.581071376800537, "learning_rate": 2.5927835051546396e-06, "logits/chosen": 17.071813583374023, "logits/rejected": 12.390661239624023, "logps/chosen": -301.09259033203125, "logps/rejected": -250.86766052246094, "loss": 0.6759, "rewards/accuracies": 0.625, "rewards/chosen": 0.4589708149433136, "rewards/margins": 0.05198876932263374, "rewards/rejected": 0.40698206424713135, "step": 1006 }, { "epoch": 0.15573168374250917, "grad_norm": 4.009683609008789, "learning_rate": 2.5953608247422685e-06, "logits/chosen": 5.798210144042969, "logits/rejected": 5.148221015930176, "logps/chosen": -243.46795654296875, "logps/rejected": -227.66705322265625, "loss": 0.6779, "rewards/accuracies": 0.5, "rewards/chosen": 0.33121249079704285, "rewards/margins": 0.05117817595601082, "rewards/rejected": 0.28003430366516113, "step": 1007 }, { "epoch": 0.15588633288227335, "grad_norm": 5.627560615539551, "learning_rate": 2.5979381443298973e-06, "logits/chosen": 13.577735900878906, "logits/rejected": 5.309107303619385, "logps/chosen": -187.17648315429688, "logps/rejected": -99.2226333618164, "loss": 0.6701, "rewards/accuracies": 0.5, "rewards/chosen": 0.32158276438713074, "rewards/margins": 0.05657311528921127, "rewards/rejected": 0.26500964164733887, "step": 1008 }, { "epoch": 0.1560409820220375, "grad_norm": 5.927374839782715, "learning_rate": 2.600515463917526e-06, "logits/chosen": 12.391993522644043, "logits/rejected": 7.203285217285156, "logps/chosen": -290.9713134765625, "logps/rejected": -267.4643249511719, "loss": 0.7314, "rewards/accuracies": 0.375, "rewards/chosen": 0.5320847034454346, "rewards/margins": -0.02508671209216118, "rewards/rejected": 0.5571714043617249, "step": 1009 }, { "epoch": 0.15619563116180166, "grad_norm": 12.79286003112793, "learning_rate": 2.603092783505155e-06, "logits/chosen": 8.318655014038086, "logits/rejected": 10.794252395629883, "logps/chosen": -168.07498168945312, "logps/rejected": -164.96377563476562, "loss": 0.73, "rewards/accuracies": 0.5, "rewards/chosen": 0.2749417722225189, "rewards/margins": -0.061866626143455505, "rewards/rejected": 0.3368084132671356, "step": 1010 }, { "epoch": 0.15635028030156584, "grad_norm": 5.886778354644775, "learning_rate": 2.605670103092784e-06, "logits/chosen": 6.2451090812683105, "logits/rejected": 3.7573976516723633, "logps/chosen": -396.6445617675781, "logps/rejected": -256.5961608886719, "loss": 0.65, "rewards/accuracies": 0.375, "rewards/chosen": 0.46838143467903137, "rewards/margins": 0.10984040051698685, "rewards/rejected": 0.35854101181030273, "step": 1011 }, { "epoch": 0.15650492944133, "grad_norm": 9.064360618591309, "learning_rate": 2.608247422680413e-06, "logits/chosen": 10.707468032836914, "logits/rejected": 11.799666404724121, "logps/chosen": -229.15879821777344, "logps/rejected": -270.58349609375, "loss": 0.6652, "rewards/accuracies": 0.625, "rewards/chosen": 0.2975532114505768, "rewards/margins": 0.06605391204357147, "rewards/rejected": 0.2314992994070053, "step": 1012 }, { "epoch": 0.15665957858109414, "grad_norm": 6.990705966949463, "learning_rate": 2.6108247422680417e-06, "logits/chosen": 8.056679725646973, "logits/rejected": 10.398065567016602, "logps/chosen": -277.4003601074219, "logps/rejected": -353.6678771972656, "loss": 0.7056, "rewards/accuracies": 0.5, "rewards/chosen": 0.48444175720214844, "rewards/margins": 0.002009958028793335, "rewards/rejected": 0.4824318289756775, "step": 1013 }, { "epoch": 0.1568142277208583, "grad_norm": 4.032031059265137, "learning_rate": 2.6134020618556706e-06, "logits/chosen": 12.851255416870117, "logits/rejected": 1.686974287033081, "logps/chosen": -285.05657958984375, "logps/rejected": -106.9432373046875, "loss": 0.6104, "rewards/accuracies": 0.75, "rewards/chosen": 0.5892062187194824, "rewards/margins": 0.20518063008785248, "rewards/rejected": 0.38402560353279114, "step": 1014 }, { "epoch": 0.15696887686062247, "grad_norm": 5.427288055419922, "learning_rate": 2.6159793814432994e-06, "logits/chosen": 6.395529747009277, "logits/rejected": 7.157956123352051, "logps/chosen": -206.54147338867188, "logps/rejected": -208.5509796142578, "loss": 0.7323, "rewards/accuracies": 0.25, "rewards/chosen": 0.4238506555557251, "rewards/margins": -0.048391249030828476, "rewards/rejected": 0.4722418785095215, "step": 1015 }, { "epoch": 0.15712352600038662, "grad_norm": 4.94955587387085, "learning_rate": 2.6185567010309283e-06, "logits/chosen": 13.247678756713867, "logits/rejected": 8.358420372009277, "logps/chosen": -250.1439666748047, "logps/rejected": -183.1035919189453, "loss": 0.6645, "rewards/accuracies": 0.625, "rewards/chosen": 0.3969514071941376, "rewards/margins": 0.07653766125440598, "rewards/rejected": 0.3204137682914734, "step": 1016 }, { "epoch": 0.15727817514015077, "grad_norm": 4.708651065826416, "learning_rate": 2.621134020618557e-06, "logits/chosen": 14.124601364135742, "logits/rejected": 12.939817428588867, "logps/chosen": -257.5981140136719, "logps/rejected": -253.38003540039062, "loss": 0.616, "rewards/accuracies": 0.875, "rewards/chosen": 0.6638963222503662, "rewards/margins": 0.18651682138442993, "rewards/rejected": 0.4773794710636139, "step": 1017 }, { "epoch": 0.15743282427991495, "grad_norm": 6.371457576751709, "learning_rate": 2.623711340206186e-06, "logits/chosen": 10.791617393493652, "logits/rejected": 10.859783172607422, "logps/chosen": -287.60955810546875, "logps/rejected": -307.47918701171875, "loss": 0.6261, "rewards/accuracies": 0.625, "rewards/chosen": 0.4228264093399048, "rewards/margins": 0.1539658159017563, "rewards/rejected": 0.2688605785369873, "step": 1018 }, { "epoch": 0.1575874734196791, "grad_norm": 5.202495098114014, "learning_rate": 2.626288659793815e-06, "logits/chosen": 5.763272762298584, "logits/rejected": 9.891640663146973, "logps/chosen": -236.70794677734375, "logps/rejected": -297.4866943359375, "loss": 0.5873, "rewards/accuracies": 0.75, "rewards/chosen": 0.48614585399627686, "rewards/margins": 0.2402709424495697, "rewards/rejected": 0.24587489664554596, "step": 1019 }, { "epoch": 0.15774212255944325, "grad_norm": 5.173934459686279, "learning_rate": 2.628865979381444e-06, "logits/chosen": 10.963274002075195, "logits/rejected": 7.169569969177246, "logps/chosen": -262.1689758300781, "logps/rejected": -244.37644958496094, "loss": 0.7165, "rewards/accuracies": 0.375, "rewards/chosen": 0.47549164295196533, "rewards/margins": -0.037476323544979095, "rewards/rejected": 0.5129680037498474, "step": 1020 }, { "epoch": 0.15789677169920743, "grad_norm": 6.314521789550781, "learning_rate": 2.6314432989690727e-06, "logits/chosen": 10.755579948425293, "logits/rejected": 2.741549491882324, "logps/chosen": -288.4797668457031, "logps/rejected": -283.5696716308594, "loss": 0.6343, "rewards/accuracies": 0.75, "rewards/chosen": 0.5280244946479797, "rewards/margins": 0.13582980632781982, "rewards/rejected": 0.3921946585178375, "step": 1021 }, { "epoch": 0.15805142083897158, "grad_norm": 5.325987815856934, "learning_rate": 2.634020618556701e-06, "logits/chosen": 4.969079971313477, "logits/rejected": 12.771631240844727, "logps/chosen": -227.09994506835938, "logps/rejected": -374.99560546875, "loss": 0.6831, "rewards/accuracies": 0.625, "rewards/chosen": 0.509530246257782, "rewards/margins": 0.029446369037032127, "rewards/rejected": 0.4800838828086853, "step": 1022 }, { "epoch": 0.15820606997873574, "grad_norm": 4.745158672332764, "learning_rate": 2.63659793814433e-06, "logits/chosen": 7.745759963989258, "logits/rejected": 14.217255592346191, "logps/chosen": -131.89381408691406, "logps/rejected": -202.36746215820312, "loss": 0.6719, "rewards/accuracies": 0.5, "rewards/chosen": 0.4468061625957489, "rewards/margins": 0.047152094542980194, "rewards/rejected": 0.3996540606021881, "step": 1023 }, { "epoch": 0.15836071911849992, "grad_norm": 8.940340042114258, "learning_rate": 2.639175257731959e-06, "logits/chosen": 3.577395439147949, "logits/rejected": 11.726722717285156, "logps/chosen": -166.974609375, "logps/rejected": -214.47677612304688, "loss": 0.7807, "rewards/accuracies": 0.25, "rewards/chosen": 0.17657072842121124, "rewards/margins": -0.15617960691452026, "rewards/rejected": 0.3327503204345703, "step": 1024 }, { "epoch": 0.15851536825826407, "grad_norm": 5.5178608894348145, "learning_rate": 2.6417525773195877e-06, "logits/chosen": 11.245694160461426, "logits/rejected": 6.978783130645752, "logps/chosen": -297.09613037109375, "logps/rejected": -197.52920532226562, "loss": 0.7745, "rewards/accuracies": 0.5, "rewards/chosen": 0.30007678270339966, "rewards/margins": -0.11804361641407013, "rewards/rejected": 0.4181203842163086, "step": 1025 }, { "epoch": 0.15867001739802822, "grad_norm": 5.789656639099121, "learning_rate": 2.6443298969072166e-06, "logits/chosen": 10.028810501098633, "logits/rejected": 10.683966636657715, "logps/chosen": -307.38726806640625, "logps/rejected": -258.4380798339844, "loss": 0.7284, "rewards/accuracies": 0.5, "rewards/chosen": 0.4536043107509613, "rewards/margins": -0.051944829523563385, "rewards/rejected": 0.5055491328239441, "step": 1026 }, { "epoch": 0.1588246665377924, "grad_norm": 5.341628551483154, "learning_rate": 2.6469072164948455e-06, "logits/chosen": 3.1536104679107666, "logits/rejected": 8.352611541748047, "logps/chosen": -207.67640686035156, "logps/rejected": -323.90362548828125, "loss": 0.7076, "rewards/accuracies": 0.375, "rewards/chosen": 0.26018840074539185, "rewards/margins": -0.027193743735551834, "rewards/rejected": 0.2873821258544922, "step": 1027 }, { "epoch": 0.15897931567755655, "grad_norm": 6.4074835777282715, "learning_rate": 2.6494845360824743e-06, "logits/chosen": 6.083681583404541, "logits/rejected": 10.88270378112793, "logps/chosen": -280.3446044921875, "logps/rejected": -292.8846740722656, "loss": 0.8587, "rewards/accuracies": 0.25, "rewards/chosen": 0.11750331521034241, "rewards/margins": -0.26196616888046265, "rewards/rejected": 0.37946951389312744, "step": 1028 }, { "epoch": 0.1591339648173207, "grad_norm": 7.285872936248779, "learning_rate": 2.6520618556701032e-06, "logits/chosen": 11.835395812988281, "logits/rejected": 2.4885847568511963, "logps/chosen": -610.2842407226562, "logps/rejected": -328.08502197265625, "loss": 0.7197, "rewards/accuracies": 0.5, "rewards/chosen": 0.4710925221443176, "rewards/margins": 0.00777001678943634, "rewards/rejected": 0.4633224904537201, "step": 1029 }, { "epoch": 0.15928861395708485, "grad_norm": 6.519668102264404, "learning_rate": 2.654639175257732e-06, "logits/chosen": 7.799099922180176, "logits/rejected": 11.39924430847168, "logps/chosen": -314.4749755859375, "logps/rejected": -299.37939453125, "loss": 0.7686, "rewards/accuracies": 0.375, "rewards/chosen": 0.21852943301200867, "rewards/margins": -0.13165950775146484, "rewards/rejected": 0.3501889407634735, "step": 1030 }, { "epoch": 0.15944326309684903, "grad_norm": 6.97037935256958, "learning_rate": 2.657216494845361e-06, "logits/chosen": 5.605464935302734, "logits/rejected": 4.981527328491211, "logps/chosen": -276.9613037109375, "logps/rejected": -366.38836669921875, "loss": 0.7046, "rewards/accuracies": 0.375, "rewards/chosen": 0.49983900785446167, "rewards/margins": 0.019785024225711823, "rewards/rejected": 0.48005399107933044, "step": 1031 }, { "epoch": 0.15959791223661318, "grad_norm": 6.064166069030762, "learning_rate": 2.65979381443299e-06, "logits/chosen": 7.023273468017578, "logits/rejected": 4.334907054901123, "logps/chosen": -277.53472900390625, "logps/rejected": -236.01010131835938, "loss": 0.6762, "rewards/accuracies": 0.625, "rewards/chosen": 0.6226499676704407, "rewards/margins": 0.05863867700099945, "rewards/rejected": 0.56401127576828, "step": 1032 }, { "epoch": 0.15975256137637733, "grad_norm": 11.71610164642334, "learning_rate": 2.6623711340206187e-06, "logits/chosen": 13.209150314331055, "logits/rejected": 6.806638240814209, "logps/chosen": -648.9050903320312, "logps/rejected": -339.5271911621094, "loss": 0.8801, "rewards/accuracies": 0.25, "rewards/chosen": 0.2154412418603897, "rewards/margins": -0.2950975298881531, "rewards/rejected": 0.5105387568473816, "step": 1033 }, { "epoch": 0.1599072105161415, "grad_norm": 5.398703098297119, "learning_rate": 2.6649484536082476e-06, "logits/chosen": 10.184659957885742, "logits/rejected": 4.911975860595703, "logps/chosen": -350.12945556640625, "logps/rejected": -298.5102233886719, "loss": 0.6045, "rewards/accuracies": 0.625, "rewards/chosen": 0.566582202911377, "rewards/margins": 0.21806208789348602, "rewards/rejected": 0.34852010011672974, "step": 1034 }, { "epoch": 0.16006185965590566, "grad_norm": 4.126475811004639, "learning_rate": 2.6675257731958765e-06, "logits/chosen": 8.251382827758789, "logits/rejected": 5.719577789306641, "logps/chosen": -183.52490234375, "logps/rejected": -196.65928649902344, "loss": 0.696, "rewards/accuracies": 0.5, "rewards/chosen": 0.32802990078926086, "rewards/margins": 0.0051604341715574265, "rewards/rejected": 0.3228694200515747, "step": 1035 }, { "epoch": 0.16021650879566982, "grad_norm": 6.685421466827393, "learning_rate": 2.6701030927835053e-06, "logits/chosen": 10.221075057983398, "logits/rejected": 6.6625213623046875, "logps/chosen": -383.50048828125, "logps/rejected": -333.1712341308594, "loss": 0.7448, "rewards/accuracies": 0.5, "rewards/chosen": 0.5246036648750305, "rewards/margins": -0.0873337984085083, "rewards/rejected": 0.6119374632835388, "step": 1036 }, { "epoch": 0.160371157935434, "grad_norm": 5.351931571960449, "learning_rate": 2.672680412371134e-06, "logits/chosen": 7.194896697998047, "logits/rejected": 5.363762855529785, "logps/chosen": -177.01670837402344, "logps/rejected": -250.59585571289062, "loss": 0.6751, "rewards/accuracies": 0.625, "rewards/chosen": 0.15219125151634216, "rewards/margins": 0.056029822677373886, "rewards/rejected": 0.09616144001483917, "step": 1037 }, { "epoch": 0.16052580707519815, "grad_norm": 7.467764854431152, "learning_rate": 2.675257731958763e-06, "logits/chosen": 12.162158012390137, "logits/rejected": 13.805460929870605, "logps/chosen": -376.8035583496094, "logps/rejected": -349.16436767578125, "loss": 0.8115, "rewards/accuracies": 0.375, "rewards/chosen": 0.26126694679260254, "rewards/margins": -0.19738717377185822, "rewards/rejected": 0.45865413546562195, "step": 1038 }, { "epoch": 0.1606804562149623, "grad_norm": 6.053816795349121, "learning_rate": 2.677835051546392e-06, "logits/chosen": 10.336067199707031, "logits/rejected": 11.565001487731934, "logps/chosen": -330.21539306640625, "logps/rejected": -445.71380615234375, "loss": 0.7374, "rewards/accuracies": 0.375, "rewards/chosen": 0.44596052169799805, "rewards/margins": -0.07219447940587997, "rewards/rejected": 0.5181549787521362, "step": 1039 }, { "epoch": 0.16083510535472648, "grad_norm": 5.0752034187316895, "learning_rate": 2.680412371134021e-06, "logits/chosen": 8.184542655944824, "logits/rejected": 7.182358741760254, "logps/chosen": -216.462158203125, "logps/rejected": -159.28610229492188, "loss": 0.7403, "rewards/accuracies": 0.375, "rewards/chosen": 0.2973100244998932, "rewards/margins": -0.08360461890697479, "rewards/rejected": 0.3809146285057068, "step": 1040 }, { "epoch": 0.16098975449449063, "grad_norm": 5.078243732452393, "learning_rate": 2.6829896907216497e-06, "logits/chosen": 8.63563060760498, "logits/rejected": 9.155065536499023, "logps/chosen": -328.1712341308594, "logps/rejected": -258.8982849121094, "loss": 0.6608, "rewards/accuracies": 0.75, "rewards/chosen": 0.4773038923740387, "rewards/margins": 0.08635596930980682, "rewards/rejected": 0.3909479081630707, "step": 1041 }, { "epoch": 0.16114440363425478, "grad_norm": 5.658509731292725, "learning_rate": 2.6855670103092786e-06, "logits/chosen": 9.052953720092773, "logits/rejected": 8.630266189575195, "logps/chosen": -299.23150634765625, "logps/rejected": -259.9249572753906, "loss": 0.6643, "rewards/accuracies": 0.5, "rewards/chosen": 0.37799960374832153, "rewards/margins": 0.08504701405763626, "rewards/rejected": 0.29295259714126587, "step": 1042 }, { "epoch": 0.16129905277401896, "grad_norm": 6.068130016326904, "learning_rate": 2.6881443298969074e-06, "logits/chosen": 9.236040115356445, "logits/rejected": 5.622822284698486, "logps/chosen": -266.4791564941406, "logps/rejected": -219.8063507080078, "loss": 0.8015, "rewards/accuracies": 0.375, "rewards/chosen": 0.3484756052494049, "rewards/margins": -0.1862947940826416, "rewards/rejected": 0.5347704291343689, "step": 1043 }, { "epoch": 0.1614537019137831, "grad_norm": 4.1639180183410645, "learning_rate": 2.6907216494845363e-06, "logits/chosen": 12.256208419799805, "logits/rejected": 10.92822265625, "logps/chosen": -206.211669921875, "logps/rejected": -272.11505126953125, "loss": 0.6241, "rewards/accuracies": 0.5, "rewards/chosen": 0.4437619149684906, "rewards/margins": 0.1814982295036316, "rewards/rejected": 0.262263685464859, "step": 1044 }, { "epoch": 0.16160835105354726, "grad_norm": 3.9855000972747803, "learning_rate": 2.693298969072165e-06, "logits/chosen": 13.220599174499512, "logits/rejected": 8.862778663635254, "logps/chosen": -220.43321228027344, "logps/rejected": -182.53016662597656, "loss": 0.6332, "rewards/accuracies": 0.5, "rewards/chosen": 0.36090487241744995, "rewards/margins": 0.14688324928283691, "rewards/rejected": 0.21402163803577423, "step": 1045 }, { "epoch": 0.1617630001933114, "grad_norm": 6.5949387550354, "learning_rate": 2.695876288659794e-06, "logits/chosen": 7.444316864013672, "logits/rejected": 2.5016591548919678, "logps/chosen": -429.6894226074219, "logps/rejected": -334.570556640625, "loss": 0.6245, "rewards/accuracies": 0.75, "rewards/chosen": 0.5750318765640259, "rewards/margins": 0.1503545194864273, "rewards/rejected": 0.42467740178108215, "step": 1046 }, { "epoch": 0.1619176493330756, "grad_norm": 4.156692981719971, "learning_rate": 2.698453608247423e-06, "logits/chosen": -1.1847783327102661, "logits/rejected": 10.830698013305664, "logps/chosen": -122.63531494140625, "logps/rejected": -227.61268615722656, "loss": 0.6908, "rewards/accuracies": 0.625, "rewards/chosen": 0.4892508387565613, "rewards/margins": 0.028514418751001358, "rewards/rejected": 0.4607364237308502, "step": 1047 }, { "epoch": 0.16207229847283974, "grad_norm": 7.735217094421387, "learning_rate": 2.7010309278350518e-06, "logits/chosen": 9.772907257080078, "logits/rejected": 10.852302551269531, "logps/chosen": -292.5924987792969, "logps/rejected": -311.0618896484375, "loss": 0.7872, "rewards/accuracies": 0.125, "rewards/chosen": 0.23518210649490356, "rewards/margins": -0.16996806859970093, "rewards/rejected": 0.4051501750946045, "step": 1048 }, { "epoch": 0.1622269476126039, "grad_norm": 8.69966983795166, "learning_rate": 2.7036082474226807e-06, "logits/chosen": 14.306394577026367, "logits/rejected": 7.621552467346191, "logps/chosen": -365.2208251953125, "logps/rejected": -231.18020629882812, "loss": 0.6779, "rewards/accuracies": 0.5, "rewards/chosen": 0.5280090570449829, "rewards/margins": 0.05756300687789917, "rewards/rejected": 0.47044605016708374, "step": 1049 }, { "epoch": 0.16238159675236807, "grad_norm": 7.324671268463135, "learning_rate": 2.7061855670103095e-06, "logits/chosen": 11.567021369934082, "logits/rejected": 11.947700500488281, "logps/chosen": -284.952392578125, "logps/rejected": -280.4742431640625, "loss": 0.6987, "rewards/accuracies": 0.625, "rewards/chosen": 0.2622503340244293, "rewards/margins": 0.004250619560480118, "rewards/rejected": 0.2579997181892395, "step": 1050 }, { "epoch": 0.16253624589213223, "grad_norm": 5.369792938232422, "learning_rate": 2.708762886597938e-06, "logits/chosen": 12.093174934387207, "logits/rejected": 6.71939754486084, "logps/chosen": -366.75848388671875, "logps/rejected": -303.83197021484375, "loss": 0.6249, "rewards/accuracies": 0.5, "rewards/chosen": 0.49151867628097534, "rewards/margins": 0.18071764707565308, "rewards/rejected": 0.31080102920532227, "step": 1051 }, { "epoch": 0.16269089503189638, "grad_norm": 5.221163749694824, "learning_rate": 2.711340206185567e-06, "logits/chosen": 6.979132175445557, "logits/rejected": 10.874004364013672, "logps/chosen": -154.37570190429688, "logps/rejected": -187.8915557861328, "loss": 0.6768, "rewards/accuracies": 0.625, "rewards/chosen": 0.4858044385910034, "rewards/margins": 0.03911856934428215, "rewards/rejected": 0.4466858506202698, "step": 1052 }, { "epoch": 0.16284554417166056, "grad_norm": 3.984731912612915, "learning_rate": 2.7139175257731957e-06, "logits/chosen": 12.964475631713867, "logits/rejected": 5.655695915222168, "logps/chosen": -177.99411010742188, "logps/rejected": -157.4450225830078, "loss": 0.6622, "rewards/accuracies": 0.5, "rewards/chosen": 0.319799542427063, "rewards/margins": 0.08091521263122559, "rewards/rejected": 0.23888429999351501, "step": 1053 }, { "epoch": 0.1630001933114247, "grad_norm": 4.853552341461182, "learning_rate": 2.7164948453608246e-06, "logits/chosen": 10.205548286437988, "logits/rejected": 4.836327075958252, "logps/chosen": -281.8667907714844, "logps/rejected": -237.60916137695312, "loss": 0.642, "rewards/accuracies": 0.75, "rewards/chosen": 0.3726174831390381, "rewards/margins": 0.11989913880825043, "rewards/rejected": 0.25271835923194885, "step": 1054 }, { "epoch": 0.16315484245118886, "grad_norm": 4.472945690155029, "learning_rate": 2.7190721649484535e-06, "logits/chosen": 12.537957191467285, "logits/rejected": 7.656261920928955, "logps/chosen": -182.21368408203125, "logps/rejected": -169.21478271484375, "loss": 0.7051, "rewards/accuracies": 0.625, "rewards/chosen": 0.3439086675643921, "rewards/margins": -0.007755044847726822, "rewards/rejected": 0.351663738489151, "step": 1055 }, { "epoch": 0.16330949159095304, "grad_norm": 4.707562446594238, "learning_rate": 2.7216494845360823e-06, "logits/chosen": 12.646970748901367, "logits/rejected": 7.461970329284668, "logps/chosen": -301.0125732421875, "logps/rejected": -219.021240234375, "loss": 0.6498, "rewards/accuracies": 0.625, "rewards/chosen": 0.6307682991027832, "rewards/margins": 0.12720398604869843, "rewards/rejected": 0.5035642981529236, "step": 1056 }, { "epoch": 0.1634641407307172, "grad_norm": 43.98398971557617, "learning_rate": 2.724226804123711e-06, "logits/chosen": 11.250030517578125, "logits/rejected": 6.399295806884766, "logps/chosen": -223.62542724609375, "logps/rejected": -166.40493774414062, "loss": 0.6384, "rewards/accuracies": 0.875, "rewards/chosen": 0.45335260033607483, "rewards/margins": 0.12117159366607666, "rewards/rejected": 0.33218103647232056, "step": 1057 }, { "epoch": 0.16361878987048134, "grad_norm": 5.9794487953186035, "learning_rate": 2.72680412371134e-06, "logits/chosen": 6.903171062469482, "logits/rejected": 11.396451950073242, "logps/chosen": -210.54454040527344, "logps/rejected": -237.95388793945312, "loss": 0.7177, "rewards/accuracies": 0.5, "rewards/chosen": 0.24321851134300232, "rewards/margins": -0.03795791044831276, "rewards/rejected": 0.2811764180660248, "step": 1058 }, { "epoch": 0.1637734390102455, "grad_norm": 10.869904518127441, "learning_rate": 2.729381443298969e-06, "logits/chosen": 7.053282737731934, "logits/rejected": 5.508359432220459, "logps/chosen": -281.413818359375, "logps/rejected": -377.7446594238281, "loss": 0.695, "rewards/accuracies": 0.5, "rewards/chosen": 0.3586414158344269, "rewards/margins": 0.05114555358886719, "rewards/rejected": 0.3074958324432373, "step": 1059 }, { "epoch": 0.16392808815000967, "grad_norm": 4.94764518737793, "learning_rate": 2.731958762886598e-06, "logits/chosen": 8.505027770996094, "logits/rejected": 5.51999568939209, "logps/chosen": -311.41778564453125, "logps/rejected": -285.7543029785156, "loss": 0.6836, "rewards/accuracies": 0.75, "rewards/chosen": 0.4464007616043091, "rewards/margins": 0.06450801342725754, "rewards/rejected": 0.3818928003311157, "step": 1060 }, { "epoch": 0.16408273728977382, "grad_norm": 4.890074253082275, "learning_rate": 2.734536082474227e-06, "logits/chosen": 9.940583229064941, "logits/rejected": 11.309616088867188, "logps/chosen": -253.55035400390625, "logps/rejected": -276.87957763671875, "loss": 0.6167, "rewards/accuracies": 0.875, "rewards/chosen": 0.44649219512939453, "rewards/margins": 0.16762083768844604, "rewards/rejected": 0.2788713574409485, "step": 1061 }, { "epoch": 0.16423738642953797, "grad_norm": 4.957050323486328, "learning_rate": 2.737113402061856e-06, "logits/chosen": 12.855979919433594, "logits/rejected": 12.664342880249023, "logps/chosen": -226.63661193847656, "logps/rejected": -215.68629455566406, "loss": 0.7752, "rewards/accuracies": 0.375, "rewards/chosen": 0.3890753984451294, "rewards/margins": -0.13341045379638672, "rewards/rejected": 0.5224858522415161, "step": 1062 }, { "epoch": 0.16439203556930215, "grad_norm": 5.184538841247559, "learning_rate": 2.739690721649485e-06, "logits/chosen": 3.3324735164642334, "logits/rejected": 4.615572452545166, "logps/chosen": -212.3704071044922, "logps/rejected": -233.98492431640625, "loss": 0.7259, "rewards/accuracies": 0.375, "rewards/chosen": 0.18181224167346954, "rewards/margins": -0.05426469445228577, "rewards/rejected": 0.2360769510269165, "step": 1063 }, { "epoch": 0.1645466847090663, "grad_norm": 4.704477787017822, "learning_rate": 2.7422680412371137e-06, "logits/chosen": 9.060919761657715, "logits/rejected": 5.374079704284668, "logps/chosen": -182.55682373046875, "logps/rejected": -149.00543212890625, "loss": 0.7116, "rewards/accuracies": 0.625, "rewards/chosen": 0.2934325933456421, "rewards/margins": -0.0037285350263118744, "rewards/rejected": 0.29716113209724426, "step": 1064 }, { "epoch": 0.16470133384883046, "grad_norm": 6.550030708312988, "learning_rate": 2.7448453608247426e-06, "logits/chosen": 11.847772598266602, "logits/rejected": 13.012176513671875, "logps/chosen": -229.9514923095703, "logps/rejected": -250.713134765625, "loss": 0.7289, "rewards/accuracies": 0.5, "rewards/chosen": 0.26736605167388916, "rewards/margins": -0.054277244955301285, "rewards/rejected": 0.32164329290390015, "step": 1065 }, { "epoch": 0.16485598298859463, "grad_norm": 5.203851699829102, "learning_rate": 2.7474226804123715e-06, "logits/chosen": 4.889805316925049, "logits/rejected": 4.330077648162842, "logps/chosen": -178.52366638183594, "logps/rejected": -208.51792907714844, "loss": 0.6948, "rewards/accuracies": 0.625, "rewards/chosen": 0.5053683519363403, "rewards/margins": 0.017319679260253906, "rewards/rejected": 0.4880486726760864, "step": 1066 }, { "epoch": 0.1650106321283588, "grad_norm": 7.30642032623291, "learning_rate": 2.7500000000000004e-06, "logits/chosen": 6.800315856933594, "logits/rejected": 7.008328437805176, "logps/chosen": -274.5916442871094, "logps/rejected": -264.5065612792969, "loss": 0.7605, "rewards/accuracies": 0.25, "rewards/chosen": 0.21765394508838654, "rewards/margins": -0.11245432496070862, "rewards/rejected": 0.33010828495025635, "step": 1067 }, { "epoch": 0.16516528126812294, "grad_norm": 15.721375465393066, "learning_rate": 2.7525773195876292e-06, "logits/chosen": 7.161839485168457, "logits/rejected": 5.509325981140137, "logps/chosen": -136.89422607421875, "logps/rejected": -159.31271362304688, "loss": 0.6906, "rewards/accuracies": 0.375, "rewards/chosen": 0.21712228655815125, "rewards/margins": 0.025398138910531998, "rewards/rejected": 0.19172416627407074, "step": 1068 }, { "epoch": 0.16531993040788712, "grad_norm": 4.264392852783203, "learning_rate": 2.755154639175258e-06, "logits/chosen": 10.415778160095215, "logits/rejected": 4.979681491851807, "logps/chosen": -245.2407684326172, "logps/rejected": -208.16725158691406, "loss": 0.57, "rewards/accuracies": 0.875, "rewards/chosen": 0.48708388209342957, "rewards/margins": 0.2765290141105652, "rewards/rejected": 0.21055488288402557, "step": 1069 }, { "epoch": 0.16547457954765127, "grad_norm": 5.592239856719971, "learning_rate": 2.757731958762887e-06, "logits/chosen": 11.432573318481445, "logits/rejected": 4.233443737030029, "logps/chosen": -293.978515625, "logps/rejected": -223.2623291015625, "loss": 0.6272, "rewards/accuracies": 0.75, "rewards/chosen": 0.3736454248428345, "rewards/margins": 0.17266154289245605, "rewards/rejected": 0.2009839117527008, "step": 1070 }, { "epoch": 0.16562922868741542, "grad_norm": 5.213015079498291, "learning_rate": 2.760309278350516e-06, "logits/chosen": 7.342867374420166, "logits/rejected": 6.233464241027832, "logps/chosen": -204.68927001953125, "logps/rejected": -212.27098083496094, "loss": 0.7297, "rewards/accuracies": 0.375, "rewards/chosen": 0.37975209951400757, "rewards/margins": -0.0384133905172348, "rewards/rejected": 0.41816550493240356, "step": 1071 }, { "epoch": 0.1657838778271796, "grad_norm": 4.677437782287598, "learning_rate": 2.7628865979381447e-06, "logits/chosen": 11.655706405639648, "logits/rejected": 9.31197738647461, "logps/chosen": -340.36053466796875, "logps/rejected": -271.4561767578125, "loss": 0.6443, "rewards/accuracies": 0.75, "rewards/chosen": 0.37624871730804443, "rewards/margins": 0.14655783772468567, "rewards/rejected": 0.22969086468219757, "step": 1072 }, { "epoch": 0.16593852696694375, "grad_norm": 4.539829254150391, "learning_rate": 2.7654639175257736e-06, "logits/chosen": 5.378650665283203, "logits/rejected": 2.23707914352417, "logps/chosen": -308.83624267578125, "logps/rejected": -245.52114868164062, "loss": 0.6274, "rewards/accuracies": 0.625, "rewards/chosen": 0.3348231315612793, "rewards/margins": 0.15908174216747284, "rewards/rejected": 0.17574140429496765, "step": 1073 }, { "epoch": 0.1660931761067079, "grad_norm": 4.869034290313721, "learning_rate": 2.7680412371134025e-06, "logits/chosen": 10.294317245483398, "logits/rejected": 5.702367305755615, "logps/chosen": -208.36639404296875, "logps/rejected": -130.01109313964844, "loss": 0.7049, "rewards/accuracies": 0.375, "rewards/chosen": 0.2122596800327301, "rewards/margins": -0.021868279203772545, "rewards/rejected": 0.2341279685497284, "step": 1074 }, { "epoch": 0.16624782524647205, "grad_norm": 5.645140647888184, "learning_rate": 2.7706185567010313e-06, "logits/chosen": 10.160078048706055, "logits/rejected": 5.1125054359436035, "logps/chosen": -215.4303436279297, "logps/rejected": -137.60739135742188, "loss": 0.7799, "rewards/accuracies": 0.375, "rewards/chosen": 0.13486647605895996, "rewards/margins": -0.15635111927986145, "rewards/rejected": 0.2912175953388214, "step": 1075 }, { "epoch": 0.16640247438623623, "grad_norm": 7.067988395690918, "learning_rate": 2.77319587628866e-06, "logits/chosen": 9.530256271362305, "logits/rejected": 10.991371154785156, "logps/chosen": -354.59027099609375, "logps/rejected": -343.3446960449219, "loss": 0.6565, "rewards/accuracies": 0.625, "rewards/chosen": 0.43891656398773193, "rewards/margins": 0.08997649699449539, "rewards/rejected": 0.3489401042461395, "step": 1076 }, { "epoch": 0.16655712352600038, "grad_norm": 4.581558704376221, "learning_rate": 2.775773195876289e-06, "logits/chosen": 10.148000717163086, "logits/rejected": 10.160100936889648, "logps/chosen": -256.7032165527344, "logps/rejected": -243.78848266601562, "loss": 0.6565, "rewards/accuracies": 0.625, "rewards/chosen": 0.35880598425865173, "rewards/margins": 0.09301841259002686, "rewards/rejected": 0.2657875418663025, "step": 1077 }, { "epoch": 0.16671177266576453, "grad_norm": 3.985835313796997, "learning_rate": 2.778350515463918e-06, "logits/chosen": 4.755812644958496, "logits/rejected": 10.252729415893555, "logps/chosen": -196.23898315429688, "logps/rejected": -246.3616943359375, "loss": 0.5999, "rewards/accuracies": 1.0, "rewards/chosen": 0.4461737871170044, "rewards/margins": 0.20455661416053772, "rewards/rejected": 0.24161720275878906, "step": 1078 }, { "epoch": 0.16686642180552871, "grad_norm": 7.236598491668701, "learning_rate": 2.780927835051547e-06, "logits/chosen": 7.2391839027404785, "logits/rejected": 7.341989040374756, "logps/chosen": -326.91363525390625, "logps/rejected": -255.23779296875, "loss": 0.7417, "rewards/accuracies": 0.25, "rewards/chosen": 0.24083857238292694, "rewards/margins": -0.07181355357170105, "rewards/rejected": 0.3126521408557892, "step": 1079 }, { "epoch": 0.16702107094529287, "grad_norm": 3.996279001235962, "learning_rate": 2.7835051546391757e-06, "logits/chosen": 4.7879438400268555, "logits/rejected": 3.020608425140381, "logps/chosen": -211.53652954101562, "logps/rejected": -131.73794555664062, "loss": 0.656, "rewards/accuracies": 0.625, "rewards/chosen": 0.17334146797657013, "rewards/margins": 0.07940573990345001, "rewards/rejected": 0.09393573552370071, "step": 1080 }, { "epoch": 0.16717572008505702, "grad_norm": 5.130178451538086, "learning_rate": 2.7860824742268046e-06, "logits/chosen": 8.511739730834961, "logits/rejected": 12.763547897338867, "logps/chosen": -210.34835815429688, "logps/rejected": -323.07696533203125, "loss": 0.6607, "rewards/accuracies": 0.625, "rewards/chosen": 0.29380524158477783, "rewards/margins": 0.07359995692968369, "rewards/rejected": 0.22020526230335236, "step": 1081 }, { "epoch": 0.1673303692248212, "grad_norm": 5.002078533172607, "learning_rate": 2.7886597938144334e-06, "logits/chosen": 15.420188903808594, "logits/rejected": 8.079397201538086, "logps/chosen": -307.494384765625, "logps/rejected": -237.58912658691406, "loss": 0.644, "rewards/accuracies": 0.75, "rewards/chosen": 0.4491264224052429, "rewards/margins": 0.10898799449205399, "rewards/rejected": 0.34013843536376953, "step": 1082 }, { "epoch": 0.16748501836458535, "grad_norm": 6.499565124511719, "learning_rate": 2.7912371134020623e-06, "logits/chosen": 5.123217582702637, "logits/rejected": 9.870834350585938, "logps/chosen": -262.5680847167969, "logps/rejected": -320.9296569824219, "loss": 0.7234, "rewards/accuracies": 0.375, "rewards/chosen": 0.23025751113891602, "rewards/margins": -0.0433901809155941, "rewards/rejected": 0.2736476957798004, "step": 1083 }, { "epoch": 0.1676396675043495, "grad_norm": 5.860519886016846, "learning_rate": 2.793814432989691e-06, "logits/chosen": 12.18691635131836, "logits/rejected": 13.408405303955078, "logps/chosen": -318.7335205078125, "logps/rejected": -346.5426025390625, "loss": 0.5911, "rewards/accuracies": 0.875, "rewards/chosen": 0.3901365399360657, "rewards/margins": 0.24204924702644348, "rewards/rejected": 0.1480872929096222, "step": 1084 }, { "epoch": 0.16779431664411368, "grad_norm": 5.866950988769531, "learning_rate": 2.79639175257732e-06, "logits/chosen": 11.779134750366211, "logits/rejected": 8.920845031738281, "logps/chosen": -418.793212890625, "logps/rejected": -367.96392822265625, "loss": 0.6016, "rewards/accuracies": 0.75, "rewards/chosen": 0.44371214509010315, "rewards/margins": 0.2046000063419342, "rewards/rejected": 0.23911209404468536, "step": 1085 }, { "epoch": 0.16794896578387783, "grad_norm": 9.693355560302734, "learning_rate": 2.798969072164949e-06, "logits/chosen": 7.844634056091309, "logits/rejected": 2.2951765060424805, "logps/chosen": -282.4667663574219, "logps/rejected": -279.37127685546875, "loss": 0.6956, "rewards/accuracies": 0.5, "rewards/chosen": 0.3938562273979187, "rewards/margins": 0.016431525349617004, "rewards/rejected": 0.3774247467517853, "step": 1086 }, { "epoch": 0.16810361492364198, "grad_norm": 4.724522590637207, "learning_rate": 2.801546391752578e-06, "logits/chosen": 10.812071800231934, "logits/rejected": 5.6684064865112305, "logps/chosen": -259.1080627441406, "logps/rejected": -203.34054565429688, "loss": 0.6614, "rewards/accuracies": 0.5, "rewards/chosen": 0.27513280510902405, "rewards/margins": 0.09737339615821838, "rewards/rejected": 0.17775940895080566, "step": 1087 }, { "epoch": 0.16825826406340616, "grad_norm": 7.932330131530762, "learning_rate": 2.8041237113402062e-06, "logits/chosen": 14.963586807250977, "logits/rejected": 7.2815446853637695, "logps/chosen": -418.70245361328125, "logps/rejected": -304.7147521972656, "loss": 0.7348, "rewards/accuracies": 0.5, "rewards/chosen": 0.21449965238571167, "rewards/margins": -0.05291939526796341, "rewards/rejected": 0.2674190402030945, "step": 1088 }, { "epoch": 0.1684129132031703, "grad_norm": 4.958596229553223, "learning_rate": 2.806701030927835e-06, "logits/chosen": 14.71570873260498, "logits/rejected": 12.743423461914062, "logps/chosen": -280.58978271484375, "logps/rejected": -268.813232421875, "loss": 0.7072, "rewards/accuracies": 0.5, "rewards/chosen": 0.40674203634262085, "rewards/margins": -0.01915750280022621, "rewards/rejected": 0.4258995056152344, "step": 1089 }, { "epoch": 0.16856756234293446, "grad_norm": 4.17123556137085, "learning_rate": 2.809278350515464e-06, "logits/chosen": 9.626422882080078, "logits/rejected": 11.575504302978516, "logps/chosen": -164.32858276367188, "logps/rejected": -125.93989562988281, "loss": 0.7044, "rewards/accuracies": 0.5, "rewards/chosen": 0.19911348819732666, "rewards/margins": -0.016457343474030495, "rewards/rejected": 0.2155708372592926, "step": 1090 }, { "epoch": 0.16872221148269861, "grad_norm": 5.6398396492004395, "learning_rate": 2.811855670103093e-06, "logits/chosen": 9.416651725769043, "logits/rejected": 11.358421325683594, "logps/chosen": -247.67654418945312, "logps/rejected": -291.9112548828125, "loss": 0.7478, "rewards/accuracies": 0.5, "rewards/chosen": 0.2780582308769226, "rewards/margins": -0.07273931801319122, "rewards/rejected": 0.350797563791275, "step": 1091 }, { "epoch": 0.1688768606224628, "grad_norm": 5.406024932861328, "learning_rate": 2.8144329896907217e-06, "logits/chosen": 5.060151100158691, "logits/rejected": 10.967604637145996, "logps/chosen": -271.1733093261719, "logps/rejected": -326.7130432128906, "loss": 0.6533, "rewards/accuracies": 0.375, "rewards/chosen": 0.5075684189796448, "rewards/margins": 0.10499919950962067, "rewards/rejected": 0.4025692045688629, "step": 1092 }, { "epoch": 0.16903150976222694, "grad_norm": 3.9346923828125, "learning_rate": 2.8170103092783506e-06, "logits/chosen": 12.876140594482422, "logits/rejected": 8.337103843688965, "logps/chosen": -301.7906494140625, "logps/rejected": -180.44461059570312, "loss": 0.5736, "rewards/accuracies": 1.0, "rewards/chosen": 0.36086779832839966, "rewards/margins": 0.26002708077430725, "rewards/rejected": 0.10084071010351181, "step": 1093 }, { "epoch": 0.1691861589019911, "grad_norm": 6.346408367156982, "learning_rate": 2.8195876288659795e-06, "logits/chosen": 6.796268939971924, "logits/rejected": 2.912069320678711, "logps/chosen": -311.06317138671875, "logps/rejected": -282.170654296875, "loss": 0.696, "rewards/accuracies": 0.5, "rewards/chosen": 0.3297578692436218, "rewards/margins": 0.01216554269194603, "rewards/rejected": 0.3175923228263855, "step": 1094 }, { "epoch": 0.16934080804175528, "grad_norm": 5.333662033081055, "learning_rate": 2.8221649484536083e-06, "logits/chosen": 10.003667831420898, "logits/rejected": 5.769536972045898, "logps/chosen": -159.35244750976562, "logps/rejected": -166.38031005859375, "loss": 0.7419, "rewards/accuracies": 0.125, "rewards/chosen": 0.22861595451831818, "rewards/margins": -0.09019675105810165, "rewards/rejected": 0.31881269812583923, "step": 1095 }, { "epoch": 0.16949545718151943, "grad_norm": 4.490705490112305, "learning_rate": 2.8247422680412372e-06, "logits/chosen": 13.215551376342773, "logits/rejected": 6.94949197769165, "logps/chosen": -136.20623779296875, "logps/rejected": -99.8686294555664, "loss": 0.6786, "rewards/accuracies": 0.625, "rewards/chosen": 0.18240973353385925, "rewards/margins": 0.03176534175872803, "rewards/rejected": 0.15064439177513123, "step": 1096 }, { "epoch": 0.16965010632128358, "grad_norm": 4.48181676864624, "learning_rate": 2.827319587628866e-06, "logits/chosen": 15.186075210571289, "logits/rejected": 5.635807991027832, "logps/chosen": -359.2030029296875, "logps/rejected": -208.1219024658203, "loss": 0.6312, "rewards/accuracies": 0.625, "rewards/chosen": 0.3525662422180176, "rewards/margins": 0.13948187232017517, "rewards/rejected": 0.2130843997001648, "step": 1097 }, { "epoch": 0.16980475546104776, "grad_norm": 5.749701976776123, "learning_rate": 2.829896907216495e-06, "logits/chosen": 9.422935485839844, "logits/rejected": 9.543334007263184, "logps/chosen": -287.0223388671875, "logps/rejected": -192.75335693359375, "loss": 0.7258, "rewards/accuracies": 0.5, "rewards/chosen": 0.2940881848335266, "rewards/margins": -0.041253600269556046, "rewards/rejected": 0.33534178137779236, "step": 1098 }, { "epoch": 0.1699594046008119, "grad_norm": 5.74893856048584, "learning_rate": 2.832474226804124e-06, "logits/chosen": 6.8382568359375, "logits/rejected": 8.938178062438965, "logps/chosen": -209.59371948242188, "logps/rejected": -229.0655517578125, "loss": 0.6306, "rewards/accuracies": 0.5, "rewards/chosen": 0.24713760614395142, "rewards/margins": 0.15542274713516235, "rewards/rejected": 0.09171485900878906, "step": 1099 }, { "epoch": 0.17011405374057606, "grad_norm": 6.221514701843262, "learning_rate": 2.8350515463917527e-06, "logits/chosen": 8.15424633026123, "logits/rejected": 4.358251571655273, "logps/chosen": -324.70751953125, "logps/rejected": -235.7432861328125, "loss": 0.7359, "rewards/accuracies": 0.375, "rewards/chosen": 0.27727362513542175, "rewards/margins": -0.07042375206947327, "rewards/rejected": 0.3476974070072174, "step": 1100 }, { "epoch": 0.17026870288034024, "grad_norm": 8.049890518188477, "learning_rate": 2.8376288659793816e-06, "logits/chosen": 14.120906829833984, "logits/rejected": 1.464968204498291, "logps/chosen": -474.0608825683594, "logps/rejected": -195.5423583984375, "loss": 0.641, "rewards/accuracies": 0.5, "rewards/chosen": 0.2594659924507141, "rewards/margins": 0.14085495471954346, "rewards/rejected": 0.11861105263233185, "step": 1101 }, { "epoch": 0.1704233520201044, "grad_norm": 6.466252326965332, "learning_rate": 2.8402061855670104e-06, "logits/chosen": 9.82530689239502, "logits/rejected": 6.830409049987793, "logps/chosen": -265.00286865234375, "logps/rejected": -204.07083129882812, "loss": 0.7323, "rewards/accuracies": 0.25, "rewards/chosen": 0.14010316133499146, "rewards/margins": -0.0686800479888916, "rewards/rejected": 0.20878319442272186, "step": 1102 }, { "epoch": 0.17057800115986854, "grad_norm": 6.551792144775391, "learning_rate": 2.8427835051546393e-06, "logits/chosen": 4.779054641723633, "logits/rejected": 9.211370468139648, "logps/chosen": -218.4180145263672, "logps/rejected": -287.0144958496094, "loss": 0.7409, "rewards/accuracies": 0.5, "rewards/chosen": 0.28126710653305054, "rewards/margins": -0.07243074476718903, "rewards/rejected": 0.35369786620140076, "step": 1103 }, { "epoch": 0.17073265029963272, "grad_norm": 7.692407131195068, "learning_rate": 2.845360824742268e-06, "logits/chosen": 3.636638641357422, "logits/rejected": 4.813644886016846, "logps/chosen": -321.5621643066406, "logps/rejected": -285.57684326171875, "loss": 0.5944, "rewards/accuracies": 0.875, "rewards/chosen": 0.41143059730529785, "rewards/margins": 0.21557457745075226, "rewards/rejected": 0.1958560347557068, "step": 1104 }, { "epoch": 0.17088729943939687, "grad_norm": 4.704918384552002, "learning_rate": 2.847938144329897e-06, "logits/chosen": 12.658109664916992, "logits/rejected": 3.816486120223999, "logps/chosen": -253.438232421875, "logps/rejected": -136.7063751220703, "loss": 0.6994, "rewards/accuracies": 0.375, "rewards/chosen": 0.23000502586364746, "rewards/margins": -0.010163930244743824, "rewards/rejected": 0.240168958902359, "step": 1105 }, { "epoch": 0.17104194857916102, "grad_norm": 5.638489246368408, "learning_rate": 2.850515463917526e-06, "logits/chosen": 8.123842239379883, "logits/rejected": 9.553081512451172, "logps/chosen": -294.9290466308594, "logps/rejected": -389.6015930175781, "loss": 0.6294, "rewards/accuracies": 0.625, "rewards/chosen": 0.38644325733184814, "rewards/margins": 0.192067950963974, "rewards/rejected": 0.19437527656555176, "step": 1106 }, { "epoch": 0.17119659771892518, "grad_norm": 4.879226207733154, "learning_rate": 2.853092783505155e-06, "logits/chosen": 8.909822463989258, "logits/rejected": 2.842954158782959, "logps/chosen": -259.98614501953125, "logps/rejected": -186.7456817626953, "loss": 0.6814, "rewards/accuracies": 0.5, "rewards/chosen": 0.3965057134628296, "rewards/margins": 0.040973421186208725, "rewards/rejected": 0.35553228855133057, "step": 1107 }, { "epoch": 0.17135124685868935, "grad_norm": 4.824392795562744, "learning_rate": 2.8556701030927837e-06, "logits/chosen": 13.868636131286621, "logits/rejected": 5.040534019470215, "logps/chosen": -368.81231689453125, "logps/rejected": -231.47988891601562, "loss": 0.6243, "rewards/accuracies": 0.875, "rewards/chosen": 0.3407599925994873, "rewards/margins": 0.14472103118896484, "rewards/rejected": 0.19603893160820007, "step": 1108 }, { "epoch": 0.1715058959984535, "grad_norm": 3.7489826679229736, "learning_rate": 2.8582474226804125e-06, "logits/chosen": 7.300200462341309, "logits/rejected": 10.822530746459961, "logps/chosen": -109.24542236328125, "logps/rejected": -153.12359619140625, "loss": 0.7127, "rewards/accuracies": 0.5, "rewards/chosen": 0.27764689922332764, "rewards/margins": -0.02867947146296501, "rewards/rejected": 0.30632635951042175, "step": 1109 }, { "epoch": 0.17166054513821766, "grad_norm": 4.981750011444092, "learning_rate": 2.8608247422680414e-06, "logits/chosen": 6.557028293609619, "logits/rejected": 8.688417434692383, "logps/chosen": -260.2157287597656, "logps/rejected": -247.9229736328125, "loss": 0.6951, "rewards/accuracies": 0.375, "rewards/chosen": 0.36442968249320984, "rewards/margins": 0.024903878569602966, "rewards/rejected": 0.3395257890224457, "step": 1110 }, { "epoch": 0.17181519427798184, "grad_norm": 4.8829026222229, "learning_rate": 2.8634020618556703e-06, "logits/chosen": 11.387208938598633, "logits/rejected": 15.943961143493652, "logps/chosen": -182.64720153808594, "logps/rejected": -205.38931274414062, "loss": 0.7763, "rewards/accuracies": 0.375, "rewards/chosen": 0.16148997843265533, "rewards/margins": -0.13857947289943695, "rewards/rejected": 0.3000694513320923, "step": 1111 }, { "epoch": 0.171969843417746, "grad_norm": 6.7271647453308105, "learning_rate": 2.865979381443299e-06, "logits/chosen": 10.159669876098633, "logits/rejected": 5.57767391204834, "logps/chosen": -355.318603515625, "logps/rejected": -253.44464111328125, "loss": 0.6631, "rewards/accuracies": 0.75, "rewards/chosen": 0.4220062792301178, "rewards/margins": 0.06916923820972443, "rewards/rejected": 0.3528370261192322, "step": 1112 }, { "epoch": 0.17212449255751014, "grad_norm": 8.745165824890137, "learning_rate": 2.868556701030928e-06, "logits/chosen": 5.87391471862793, "logits/rejected": 12.86990737915039, "logps/chosen": -257.3597106933594, "logps/rejected": -343.64068603515625, "loss": 0.6151, "rewards/accuracies": 0.625, "rewards/chosen": 0.3618261218070984, "rewards/margins": 0.18963623046875, "rewards/rejected": 0.17218990623950958, "step": 1113 }, { "epoch": 0.17227914169727432, "grad_norm": 6.6859636306762695, "learning_rate": 2.871134020618557e-06, "logits/chosen": 10.206993103027344, "logits/rejected": 9.294522285461426, "logps/chosen": -225.59593200683594, "logps/rejected": -213.65155029296875, "loss": 0.7244, "rewards/accuracies": 0.5, "rewards/chosen": 0.2583962678909302, "rewards/margins": -0.0537845604121685, "rewards/rejected": 0.3121808171272278, "step": 1114 }, { "epoch": 0.17243379083703847, "grad_norm": 5.0290846824646, "learning_rate": 2.8737113402061858e-06, "logits/chosen": 13.735154151916504, "logits/rejected": 10.959716796875, "logps/chosen": -353.849853515625, "logps/rejected": -318.1784362792969, "loss": 0.6468, "rewards/accuracies": 0.75, "rewards/chosen": 0.5507327914237976, "rewards/margins": 0.10231511294841766, "rewards/rejected": 0.44841769337654114, "step": 1115 }, { "epoch": 0.17258843997680262, "grad_norm": 6.560153961181641, "learning_rate": 2.8762886597938146e-06, "logits/chosen": 14.087587356567383, "logits/rejected": 8.3621826171875, "logps/chosen": -329.96136474609375, "logps/rejected": -228.5741424560547, "loss": 0.78, "rewards/accuracies": 0.25, "rewards/chosen": 0.263899564743042, "rewards/margins": -0.1511520892381668, "rewards/rejected": 0.4150516390800476, "step": 1116 }, { "epoch": 0.1727430891165668, "grad_norm": 6.531588554382324, "learning_rate": 2.878865979381443e-06, "logits/chosen": 7.251614093780518, "logits/rejected": 8.09341812133789, "logps/chosen": -284.9375305175781, "logps/rejected": -341.9988098144531, "loss": 0.6242, "rewards/accuracies": 0.875, "rewards/chosen": 0.3623020350933075, "rewards/margins": 0.14683209359645844, "rewards/rejected": 0.21546995639801025, "step": 1117 }, { "epoch": 0.17289773825633095, "grad_norm": 5.873194694519043, "learning_rate": 2.881443298969072e-06, "logits/chosen": 10.672133445739746, "logits/rejected": 6.375923156738281, "logps/chosen": -315.1903076171875, "logps/rejected": -234.17271423339844, "loss": 0.713, "rewards/accuracies": 0.25, "rewards/chosen": 0.359017938375473, "rewards/margins": -0.000476837158203125, "rewards/rejected": 0.35949477553367615, "step": 1118 }, { "epoch": 0.1730523873960951, "grad_norm": 5.246539115905762, "learning_rate": 2.884020618556701e-06, "logits/chosen": 6.875603675842285, "logits/rejected": 9.460760116577148, "logps/chosen": -220.30201721191406, "logps/rejected": -241.61851501464844, "loss": 0.7387, "rewards/accuracies": 0.5, "rewards/chosen": 0.3561538755893707, "rewards/margins": -0.05357912927865982, "rewards/rejected": 0.40973299741744995, "step": 1119 }, { "epoch": 0.17320703653585928, "grad_norm": 3.860882043838501, "learning_rate": 2.8865979381443297e-06, "logits/chosen": 8.976292610168457, "logits/rejected": 10.67917537689209, "logps/chosen": -201.78549194335938, "logps/rejected": -191.02020263671875, "loss": 0.7099, "rewards/accuracies": 0.375, "rewards/chosen": 0.34864211082458496, "rewards/margins": -0.011206664144992828, "rewards/rejected": 0.3598487675189972, "step": 1120 }, { "epoch": 0.17336168567562343, "grad_norm": 4.3847880363464355, "learning_rate": 2.8891752577319586e-06, "logits/chosen": 9.891502380371094, "logits/rejected": 1.0228139162063599, "logps/chosen": -268.3402099609375, "logps/rejected": -168.21463012695312, "loss": 0.6232, "rewards/accuracies": 0.625, "rewards/chosen": 0.38970279693603516, "rewards/margins": 0.18353596329689026, "rewards/rejected": 0.2061668336391449, "step": 1121 }, { "epoch": 0.17351633481538759, "grad_norm": 5.330456733703613, "learning_rate": 2.8917525773195883e-06, "logits/chosen": 10.398155212402344, "logits/rejected": 12.598572731018066, "logps/chosen": -226.34432983398438, "logps/rejected": -263.0143127441406, "loss": 0.7005, "rewards/accuracies": 0.375, "rewards/chosen": 0.2910397946834564, "rewards/margins": 0.01652168482542038, "rewards/rejected": 0.27451813220977783, "step": 1122 }, { "epoch": 0.17367098395515174, "grad_norm": 4.6225810050964355, "learning_rate": 2.894329896907217e-06, "logits/chosen": 15.732912063598633, "logits/rejected": 6.839818477630615, "logps/chosen": -323.67706298828125, "logps/rejected": -197.86285400390625, "loss": 0.5815, "rewards/accuracies": 0.875, "rewards/chosen": 0.5045099258422852, "rewards/margins": 0.2553408741950989, "rewards/rejected": 0.24916905164718628, "step": 1123 }, { "epoch": 0.17382563309491592, "grad_norm": 5.172116756439209, "learning_rate": 2.896907216494846e-06, "logits/chosen": 6.7666850090026855, "logits/rejected": 10.524429321289062, "logps/chosen": -238.834228515625, "logps/rejected": -263.2930908203125, "loss": 0.6824, "rewards/accuracies": 0.625, "rewards/chosen": 0.2878759205341339, "rewards/margins": 0.0442294105887413, "rewards/rejected": 0.24364647269248962, "step": 1124 }, { "epoch": 0.17398028223468007, "grad_norm": 6.477171897888184, "learning_rate": 2.899484536082475e-06, "logits/chosen": 8.801326751708984, "logits/rejected": 10.45910930633545, "logps/chosen": -305.6610412597656, "logps/rejected": -239.3955535888672, "loss": 0.7493, "rewards/accuracies": 0.375, "rewards/chosen": 0.3966098129749298, "rewards/margins": -0.09666098654270172, "rewards/rejected": 0.4932708144187927, "step": 1125 }, { "epoch": 0.17413493137444422, "grad_norm": 11.301116943359375, "learning_rate": 2.9020618556701034e-06, "logits/chosen": 2.8457140922546387, "logits/rejected": 3.8694403171539307, "logps/chosen": -293.6243591308594, "logps/rejected": -235.6193084716797, "loss": 0.6714, "rewards/accuracies": 0.5, "rewards/chosen": 0.34122511744499207, "rewards/margins": 0.07552716135978699, "rewards/rejected": 0.2656979560852051, "step": 1126 }, { "epoch": 0.1742895805142084, "grad_norm": 12.629762649536133, "learning_rate": 2.9046391752577322e-06, "logits/chosen": 5.864099502563477, "logits/rejected": 4.938446521759033, "logps/chosen": -306.47955322265625, "logps/rejected": -223.91578674316406, "loss": 0.6425, "rewards/accuracies": 0.625, "rewards/chosen": 0.36849290132522583, "rewards/margins": 0.11496633291244507, "rewards/rejected": 0.2535265386104584, "step": 1127 }, { "epoch": 0.17444422965397255, "grad_norm": 5.387455463409424, "learning_rate": 2.907216494845361e-06, "logits/chosen": 11.163328170776367, "logits/rejected": 9.876559257507324, "logps/chosen": -310.73931884765625, "logps/rejected": -248.8851318359375, "loss": 0.675, "rewards/accuracies": 0.625, "rewards/chosen": 0.45418691635131836, "rewards/margins": 0.06985370814800262, "rewards/rejected": 0.38433319330215454, "step": 1128 }, { "epoch": 0.1745988787937367, "grad_norm": 5.634506702423096, "learning_rate": 2.90979381443299e-06, "logits/chosen": 6.626094818115234, "logits/rejected": 4.87941837310791, "logps/chosen": -353.708984375, "logps/rejected": -228.18756103515625, "loss": 0.613, "rewards/accuracies": 0.75, "rewards/chosen": 0.47076427936553955, "rewards/margins": 0.22619284689426422, "rewards/rejected": 0.24457146227359772, "step": 1129 }, { "epoch": 0.17475352793350088, "grad_norm": 6.415378093719482, "learning_rate": 2.912371134020619e-06, "logits/chosen": 11.546716690063477, "logits/rejected": 11.263907432556152, "logps/chosen": -309.4408874511719, "logps/rejected": -282.27777099609375, "loss": 0.702, "rewards/accuracies": 0.5, "rewards/chosen": 0.3375720977783203, "rewards/margins": 0.0003494769334793091, "rewards/rejected": 0.3372226357460022, "step": 1130 }, { "epoch": 0.17490817707326503, "grad_norm": 5.494746685028076, "learning_rate": 2.9149484536082477e-06, "logits/chosen": 10.173686981201172, "logits/rejected": 8.87675666809082, "logps/chosen": -261.99468994140625, "logps/rejected": -225.0587158203125, "loss": 0.7565, "rewards/accuracies": 0.5, "rewards/chosen": 0.2946622967720032, "rewards/margins": -0.10347646474838257, "rewards/rejected": 0.39813879132270813, "step": 1131 }, { "epoch": 0.17506282621302918, "grad_norm": 5.844377517700195, "learning_rate": 2.9175257731958766e-06, "logits/chosen": 3.5838165283203125, "logits/rejected": 6.299263954162598, "logps/chosen": -200.82269287109375, "logps/rejected": -343.9166259765625, "loss": 0.7216, "rewards/accuracies": 0.625, "rewards/chosen": 0.1865125596523285, "rewards/margins": -0.0347355417907238, "rewards/rejected": 0.22124812006950378, "step": 1132 }, { "epoch": 0.17521747535279336, "grad_norm": 5.7626872062683105, "learning_rate": 2.9201030927835055e-06, "logits/chosen": 10.046320915222168, "logits/rejected": 8.99346923828125, "logps/chosen": -319.71990966796875, "logps/rejected": -292.3989562988281, "loss": 0.7177, "rewards/accuracies": 0.375, "rewards/chosen": 0.343557745218277, "rewards/margins": -0.04012775793671608, "rewards/rejected": 0.38368549942970276, "step": 1133 }, { "epoch": 0.1753721244925575, "grad_norm": 4.6312079429626465, "learning_rate": 2.9226804123711343e-06, "logits/chosen": 6.430576801300049, "logits/rejected": 7.720532417297363, "logps/chosen": -159.00131225585938, "logps/rejected": -147.68310546875, "loss": 0.7829, "rewards/accuracies": 0.25, "rewards/chosen": 0.22012931108474731, "rewards/margins": -0.1513059437274933, "rewards/rejected": 0.3714352250099182, "step": 1134 }, { "epoch": 0.17552677363232166, "grad_norm": 7.163835048675537, "learning_rate": 2.9252577319587632e-06, "logits/chosen": 3.8095626831054688, "logits/rejected": 6.109020709991455, "logps/chosen": -269.7215881347656, "logps/rejected": -268.4971008300781, "loss": 0.7648, "rewards/accuracies": 0.375, "rewards/chosen": 0.30219149589538574, "rewards/margins": -0.11694401502609253, "rewards/rejected": 0.41913551092147827, "step": 1135 }, { "epoch": 0.17568142277208584, "grad_norm": 6.825829982757568, "learning_rate": 2.927835051546392e-06, "logits/chosen": 10.532018661499023, "logits/rejected": 9.871192932128906, "logps/chosen": -245.007080078125, "logps/rejected": -341.94451904296875, "loss": 0.8056, "rewards/accuracies": 0.25, "rewards/chosen": 0.23278719186782837, "rewards/margins": -0.19790929555892944, "rewards/rejected": 0.4306964874267578, "step": 1136 }, { "epoch": 0.17583607191185, "grad_norm": 4.912806987762451, "learning_rate": 2.930412371134021e-06, "logits/chosen": 12.713714599609375, "logits/rejected": 3.8755369186401367, "logps/chosen": -305.22222900390625, "logps/rejected": -143.66517639160156, "loss": 0.6406, "rewards/accuracies": 0.625, "rewards/chosen": 0.3633865714073181, "rewards/margins": 0.12963023781776428, "rewards/rejected": 0.23375634849071503, "step": 1137 }, { "epoch": 0.17599072105161415, "grad_norm": 4.775628089904785, "learning_rate": 2.93298969072165e-06, "logits/chosen": 7.143723487854004, "logits/rejected": 2.2544302940368652, "logps/chosen": -290.8276672363281, "logps/rejected": -177.09397888183594, "loss": 0.6548, "rewards/accuracies": 0.75, "rewards/chosen": 0.3753347098827362, "rewards/margins": 0.08654476702213287, "rewards/rejected": 0.28878992795944214, "step": 1138 }, { "epoch": 0.1761453701913783, "grad_norm": 5.060400485992432, "learning_rate": 2.9355670103092787e-06, "logits/chosen": 5.091097354888916, "logits/rejected": 11.51400375366211, "logps/chosen": -189.26113891601562, "logps/rejected": -309.8868408203125, "loss": 0.708, "rewards/accuracies": 0.625, "rewards/chosen": 0.37924912571907043, "rewards/margins": -0.02081029862165451, "rewards/rejected": 0.40005940198898315, "step": 1139 }, { "epoch": 0.17630001933114248, "grad_norm": 7.438137531280518, "learning_rate": 2.9381443298969076e-06, "logits/chosen": 9.425956726074219, "logits/rejected": 6.930305480957031, "logps/chosen": -290.3516540527344, "logps/rejected": -205.43338012695312, "loss": 0.666, "rewards/accuracies": 0.625, "rewards/chosen": 0.3329712152481079, "rewards/margins": 0.06372203677892685, "rewards/rejected": 0.2692491412162781, "step": 1140 }, { "epoch": 0.17645466847090663, "grad_norm": 8.290541648864746, "learning_rate": 2.9407216494845364e-06, "logits/chosen": 13.327945709228516, "logits/rejected": 10.471671104431152, "logps/chosen": -341.0982971191406, "logps/rejected": -255.54840087890625, "loss": 0.6337, "rewards/accuracies": 0.75, "rewards/chosen": 0.4476722776889801, "rewards/margins": 0.1451585739850998, "rewards/rejected": 0.3025137186050415, "step": 1141 }, { "epoch": 0.17660931761067078, "grad_norm": 7.709554672241211, "learning_rate": 2.9432989690721653e-06, "logits/chosen": 11.955734252929688, "logits/rejected": 5.034045219421387, "logps/chosen": -382.2080993652344, "logps/rejected": -312.8702392578125, "loss": 0.6873, "rewards/accuracies": 0.625, "rewards/chosen": 0.3577193319797516, "rewards/margins": 0.031980231404304504, "rewards/rejected": 0.3257391154766083, "step": 1142 }, { "epoch": 0.17676396675043496, "grad_norm": 4.033751010894775, "learning_rate": 2.945876288659794e-06, "logits/chosen": 12.809724807739258, "logits/rejected": 10.267937660217285, "logps/chosen": -276.56463623046875, "logps/rejected": -139.99610900878906, "loss": 0.6773, "rewards/accuracies": 0.625, "rewards/chosen": 0.3085514307022095, "rewards/margins": 0.037268564105033875, "rewards/rejected": 0.2712828516960144, "step": 1143 }, { "epoch": 0.1769186158901991, "grad_norm": 6.057156562805176, "learning_rate": 2.948453608247423e-06, "logits/chosen": 8.920923233032227, "logits/rejected": 6.629421234130859, "logps/chosen": -249.32882690429688, "logps/rejected": -200.7079620361328, "loss": 0.7066, "rewards/accuracies": 0.5, "rewards/chosen": 0.29187577962875366, "rewards/margins": -0.02302558720111847, "rewards/rejected": 0.31490135192871094, "step": 1144 }, { "epoch": 0.17707326502996326, "grad_norm": 5.577688694000244, "learning_rate": 2.951030927835052e-06, "logits/chosen": 7.342004299163818, "logits/rejected": 5.811574459075928, "logps/chosen": -296.17120361328125, "logps/rejected": -296.16351318359375, "loss": 0.7219, "rewards/accuracies": 0.5, "rewards/chosen": 0.4578924775123596, "rewards/margins": -0.040445707738399506, "rewards/rejected": 0.4983382225036621, "step": 1145 }, { "epoch": 0.17722791416972744, "grad_norm": 4.61027193069458, "learning_rate": 2.953608247422681e-06, "logits/chosen": 9.528794288635254, "logits/rejected": 8.37557315826416, "logps/chosen": -165.78585815429688, "logps/rejected": -181.5104522705078, "loss": 0.6326, "rewards/accuracies": 0.75, "rewards/chosen": 0.379351407289505, "rewards/margins": 0.15150676667690277, "rewards/rejected": 0.22784467041492462, "step": 1146 }, { "epoch": 0.1773825633094916, "grad_norm": 6.47112512588501, "learning_rate": 2.9561855670103097e-06, "logits/chosen": 11.794828414916992, "logits/rejected": 10.370773315429688, "logps/chosen": -407.6488342285156, "logps/rejected": -350.3975830078125, "loss": 0.6214, "rewards/accuracies": 0.75, "rewards/chosen": 0.5501800775527954, "rewards/margins": 0.1756875067949295, "rewards/rejected": 0.3744925558567047, "step": 1147 }, { "epoch": 0.17753721244925574, "grad_norm": 7.51181697845459, "learning_rate": 2.9587628865979385e-06, "logits/chosen": 14.049497604370117, "logits/rejected": 11.448801040649414, "logps/chosen": -395.26617431640625, "logps/rejected": -311.9085388183594, "loss": 0.6729, "rewards/accuracies": 0.625, "rewards/chosen": 0.5568580627441406, "rewards/margins": 0.08066901564598083, "rewards/rejected": 0.4761890470981598, "step": 1148 }, { "epoch": 0.17769186158901992, "grad_norm": 4.4073967933654785, "learning_rate": 2.9613402061855674e-06, "logits/chosen": 8.318864822387695, "logits/rejected": 3.232300043106079, "logps/chosen": -328.7777099609375, "logps/rejected": -202.08578491210938, "loss": 0.6601, "rewards/accuracies": 0.5, "rewards/chosen": 0.42732587456703186, "rewards/margins": 0.08265604823827744, "rewards/rejected": 0.34466981887817383, "step": 1149 }, { "epoch": 0.17784651072878407, "grad_norm": 6.041601181030273, "learning_rate": 2.9639175257731963e-06, "logits/chosen": 8.695036888122559, "logits/rejected": 11.523868560791016, "logps/chosen": -244.824951171875, "logps/rejected": -202.55184936523438, "loss": 0.7253, "rewards/accuracies": 0.5, "rewards/chosen": 0.3459719717502594, "rewards/margins": -0.03434457629919052, "rewards/rejected": 0.38031652569770813, "step": 1150 }, { "epoch": 0.17800115986854823, "grad_norm": 5.251492977142334, "learning_rate": 2.966494845360825e-06, "logits/chosen": 9.244641304016113, "logits/rejected": 16.107168197631836, "logps/chosen": -190.07156372070312, "logps/rejected": -262.0315856933594, "loss": 0.6743, "rewards/accuracies": 0.75, "rewards/chosen": 0.31787556409835815, "rewards/margins": 0.0451195053756237, "rewards/rejected": 0.27275604009628296, "step": 1151 }, { "epoch": 0.1781558090083124, "grad_norm": 5.284173488616943, "learning_rate": 2.969072164948454e-06, "logits/chosen": 5.584968566894531, "logits/rejected": 10.666986465454102, "logps/chosen": -219.20596313476562, "logps/rejected": -246.28956604003906, "loss": 0.6795, "rewards/accuracies": 0.375, "rewards/chosen": 0.3630213737487793, "rewards/margins": 0.031061813235282898, "rewards/rejected": 0.3319595754146576, "step": 1152 }, { "epoch": 0.17831045814807656, "grad_norm": 5.383566379547119, "learning_rate": 2.971649484536083e-06, "logits/chosen": 10.384988784790039, "logits/rejected": 3.793872833251953, "logps/chosen": -376.3614501953125, "logps/rejected": -208.56626892089844, "loss": 0.6429, "rewards/accuracies": 0.625, "rewards/chosen": 0.36733898520469666, "rewards/margins": 0.11935469508171082, "rewards/rejected": 0.24798427522182465, "step": 1153 }, { "epoch": 0.1784651072878407, "grad_norm": 3.970797538757324, "learning_rate": 2.9742268041237114e-06, "logits/chosen": 11.370634078979492, "logits/rejected": 8.494200706481934, "logps/chosen": -254.89219665527344, "logps/rejected": -215.02896118164062, "loss": 0.6171, "rewards/accuracies": 0.625, "rewards/chosen": 0.5302475094795227, "rewards/margins": 0.1801958680152893, "rewards/rejected": 0.3500516414642334, "step": 1154 }, { "epoch": 0.17861975642760486, "grad_norm": 5.383657455444336, "learning_rate": 2.9768041237113402e-06, "logits/chosen": 9.273427963256836, "logits/rejected": 8.824771881103516, "logps/chosen": -274.83673095703125, "logps/rejected": -364.8233947753906, "loss": 0.6692, "rewards/accuracies": 0.625, "rewards/chosen": 0.38373440504074097, "rewards/margins": 0.07081535458564758, "rewards/rejected": 0.31291908025741577, "step": 1155 }, { "epoch": 0.17877440556736904, "grad_norm": 6.889599323272705, "learning_rate": 2.979381443298969e-06, "logits/chosen": 14.462203979492188, "logits/rejected": 12.723499298095703, "logps/chosen": -342.1315612792969, "logps/rejected": -350.3135681152344, "loss": 0.7387, "rewards/accuracies": 0.5, "rewards/chosen": 0.33119669556617737, "rewards/margins": -0.08083860576152802, "rewards/rejected": 0.4120352864265442, "step": 1156 }, { "epoch": 0.1789290547071332, "grad_norm": 6.282871723175049, "learning_rate": 2.981958762886598e-06, "logits/chosen": 6.331623077392578, "logits/rejected": 8.313983917236328, "logps/chosen": -333.8280029296875, "logps/rejected": -313.6961364746094, "loss": 0.6366, "rewards/accuracies": 0.625, "rewards/chosen": 0.6078802943229675, "rewards/margins": 0.142788365483284, "rewards/rejected": 0.4650919437408447, "step": 1157 }, { "epoch": 0.17908370384689734, "grad_norm": 5.11278772354126, "learning_rate": 2.984536082474227e-06, "logits/chosen": 9.76057243347168, "logits/rejected": 7.916048049926758, "logps/chosen": -324.56512451171875, "logps/rejected": -254.02684020996094, "loss": 0.6605, "rewards/accuracies": 0.625, "rewards/chosen": 0.39771729707717896, "rewards/margins": 0.11197453737258911, "rewards/rejected": 0.28574275970458984, "step": 1158 }, { "epoch": 0.17923835298666152, "grad_norm": 4.654809474945068, "learning_rate": 2.9871134020618557e-06, "logits/chosen": 18.386581420898438, "logits/rejected": 5.525882244110107, "logps/chosen": -347.45355224609375, "logps/rejected": -181.9476318359375, "loss": 0.6651, "rewards/accuracies": 0.625, "rewards/chosen": 0.3984912633895874, "rewards/margins": 0.0668419897556305, "rewards/rejected": 0.3316493332386017, "step": 1159 }, { "epoch": 0.17939300212642567, "grad_norm": 4.14867639541626, "learning_rate": 2.9896907216494846e-06, "logits/chosen": 6.37682580947876, "logits/rejected": 5.399381637573242, "logps/chosen": -189.04971313476562, "logps/rejected": -172.76202392578125, "loss": 0.6191, "rewards/accuracies": 0.75, "rewards/chosen": 0.4838985800743103, "rewards/margins": 0.16113080084323883, "rewards/rejected": 0.3227677643299103, "step": 1160 }, { "epoch": 0.17954765126618982, "grad_norm": 3.794466495513916, "learning_rate": 2.9922680412371135e-06, "logits/chosen": 15.56403923034668, "logits/rejected": 8.906413078308105, "logps/chosen": -182.099853515625, "logps/rejected": -131.7032928466797, "loss": 0.6389, "rewards/accuracies": 0.625, "rewards/chosen": 0.32996755838394165, "rewards/margins": 0.13160750269889832, "rewards/rejected": 0.19836005568504333, "step": 1161 }, { "epoch": 0.179702300405954, "grad_norm": 4.580986022949219, "learning_rate": 2.9948453608247423e-06, "logits/chosen": 11.547233581542969, "logits/rejected": 13.655698776245117, "logps/chosen": -212.57530212402344, "logps/rejected": -214.5176239013672, "loss": 0.7137, "rewards/accuracies": 0.5, "rewards/chosen": 0.3403546214103699, "rewards/margins": -0.02418498322367668, "rewards/rejected": 0.36453962326049805, "step": 1162 }, { "epoch": 0.17985694954571815, "grad_norm": 5.870129108428955, "learning_rate": 2.997422680412371e-06, "logits/chosen": 15.221019744873047, "logits/rejected": 8.538546562194824, "logps/chosen": -321.4717102050781, "logps/rejected": -206.94219970703125, "loss": 0.7296, "rewards/accuracies": 0.375, "rewards/chosen": 0.18510447442531586, "rewards/margins": -0.06214247643947601, "rewards/rejected": 0.24724695086479187, "step": 1163 }, { "epoch": 0.1800115986854823, "grad_norm": 4.725268363952637, "learning_rate": 3e-06, "logits/chosen": 11.91286849975586, "logits/rejected": 7.065916061401367, "logps/chosen": -343.61468505859375, "logps/rejected": -212.65185546875, "loss": 0.6219, "rewards/accuracies": 0.5, "rewards/chosen": 0.47155943512916565, "rewards/margins": 0.24598152935504913, "rewards/rejected": 0.22557789087295532, "step": 1164 }, { "epoch": 0.18016624782524648, "grad_norm": 4.854676723480225, "learning_rate": 3.002577319587629e-06, "logits/chosen": 13.181015968322754, "logits/rejected": 15.521026611328125, "logps/chosen": -220.15182495117188, "logps/rejected": -260.1375427246094, "loss": 0.6744, "rewards/accuracies": 0.625, "rewards/chosen": 0.27434659004211426, "rewards/margins": 0.04805588722229004, "rewards/rejected": 0.22629070281982422, "step": 1165 }, { "epoch": 0.18032089696501064, "grad_norm": 4.67647647857666, "learning_rate": 3.005154639175258e-06, "logits/chosen": 13.731133460998535, "logits/rejected": 9.85840892791748, "logps/chosen": -312.1427001953125, "logps/rejected": -219.77969360351562, "loss": 0.6634, "rewards/accuracies": 0.625, "rewards/chosen": 0.5383355021476746, "rewards/margins": 0.07219067215919495, "rewards/rejected": 0.466144859790802, "step": 1166 }, { "epoch": 0.1804755461047748, "grad_norm": 4.745318412780762, "learning_rate": 3.0077319587628867e-06, "logits/chosen": 9.040209770202637, "logits/rejected": 10.990416526794434, "logps/chosen": -316.904541015625, "logps/rejected": -230.6239471435547, "loss": 0.6518, "rewards/accuracies": 0.625, "rewards/chosen": 0.4587368369102478, "rewards/margins": 0.09942308813333511, "rewards/rejected": 0.3593137264251709, "step": 1167 }, { "epoch": 0.18063019524453897, "grad_norm": 4.76668643951416, "learning_rate": 3.0103092783505156e-06, "logits/chosen": 15.52818489074707, "logits/rejected": 10.53999137878418, "logps/chosen": -279.0848388671875, "logps/rejected": -184.01812744140625, "loss": 0.6641, "rewards/accuracies": 0.625, "rewards/chosen": 0.3073820173740387, "rewards/margins": 0.07401542365550995, "rewards/rejected": 0.23336659371852875, "step": 1168 }, { "epoch": 0.18078484438430312, "grad_norm": 4.650648593902588, "learning_rate": 3.0128865979381444e-06, "logits/chosen": 12.051068305969238, "logits/rejected": 10.280696868896484, "logps/chosen": -339.3119812011719, "logps/rejected": -288.9632873535156, "loss": 0.5826, "rewards/accuracies": 0.625, "rewards/chosen": 0.6108314990997314, "rewards/margins": 0.28709107637405396, "rewards/rejected": 0.3237403929233551, "step": 1169 }, { "epoch": 0.18093949352406727, "grad_norm": 8.303998947143555, "learning_rate": 3.0154639175257733e-06, "logits/chosen": 7.365882873535156, "logits/rejected": 3.311863899230957, "logps/chosen": -291.14337158203125, "logps/rejected": -276.2372741699219, "loss": 0.7404, "rewards/accuracies": 0.375, "rewards/chosen": 0.4151206910610199, "rewards/margins": -0.07277338951826096, "rewards/rejected": 0.48789405822753906, "step": 1170 }, { "epoch": 0.18109414266383142, "grad_norm": 11.77899169921875, "learning_rate": 3.018041237113402e-06, "logits/chosen": 14.795269966125488, "logits/rejected": 14.246530532836914, "logps/chosen": -393.1174621582031, "logps/rejected": -318.1690979003906, "loss": 0.7647, "rewards/accuracies": 0.375, "rewards/chosen": 0.34867727756500244, "rewards/margins": -0.12560062110424042, "rewards/rejected": 0.47427788376808167, "step": 1171 }, { "epoch": 0.1812487918035956, "grad_norm": 4.764474391937256, "learning_rate": 3.020618556701031e-06, "logits/chosen": 15.29426383972168, "logits/rejected": 10.25067138671875, "logps/chosen": -290.40289306640625, "logps/rejected": -196.67935180664062, "loss": 0.656, "rewards/accuracies": 0.5, "rewards/chosen": 0.3114931881427765, "rewards/margins": 0.12225961685180664, "rewards/rejected": 0.18923360109329224, "step": 1172 }, { "epoch": 0.18140344094335975, "grad_norm": 4.871850967407227, "learning_rate": 3.02319587628866e-06, "logits/chosen": 8.374554634094238, "logits/rejected": 4.219167709350586, "logps/chosen": -257.68853759765625, "logps/rejected": -267.280517578125, "loss": 0.67, "rewards/accuracies": 0.5, "rewards/chosen": 0.4222041666507721, "rewards/margins": 0.06348080933094025, "rewards/rejected": 0.35872337222099304, "step": 1173 }, { "epoch": 0.1815580900831239, "grad_norm": 4.874150276184082, "learning_rate": 3.025773195876289e-06, "logits/chosen": 14.099828720092773, "logits/rejected": 4.365835189819336, "logps/chosen": -254.84820556640625, "logps/rejected": -155.9647216796875, "loss": 0.5837, "rewards/accuracies": 0.875, "rewards/chosen": 0.381515234708786, "rewards/margins": 0.23910629749298096, "rewards/rejected": 0.14240892231464386, "step": 1174 }, { "epoch": 0.18171273922288808, "grad_norm": 6.128303527832031, "learning_rate": 3.0283505154639177e-06, "logits/chosen": 5.875522136688232, "logits/rejected": 7.177199840545654, "logps/chosen": -165.80751037597656, "logps/rejected": -208.37188720703125, "loss": 0.772, "rewards/accuracies": 0.375, "rewards/chosen": 0.29461461305618286, "rewards/margins": -0.13151991367340088, "rewards/rejected": 0.42613452672958374, "step": 1175 }, { "epoch": 0.18186738836265223, "grad_norm": 5.21152400970459, "learning_rate": 3.0309278350515465e-06, "logits/chosen": 10.468564987182617, "logits/rejected": 6.813868522644043, "logps/chosen": -280.7398681640625, "logps/rejected": -233.3089599609375, "loss": 0.655, "rewards/accuracies": 0.5, "rewards/chosen": 0.5198568105697632, "rewards/margins": 0.12020877748727798, "rewards/rejected": 0.3996480107307434, "step": 1176 }, { "epoch": 0.18202203750241638, "grad_norm": 8.711030960083008, "learning_rate": 3.0335051546391754e-06, "logits/chosen": 9.909565925598145, "logits/rejected": 7.333517551422119, "logps/chosen": -401.7020568847656, "logps/rejected": -566.3073120117188, "loss": 0.754, "rewards/accuracies": 0.25, "rewards/chosen": 0.3427141606807709, "rewards/margins": -0.08943916857242584, "rewards/rejected": 0.4321533143520355, "step": 1177 }, { "epoch": 0.18217668664218056, "grad_norm": 5.159784317016602, "learning_rate": 3.0360824742268043e-06, "logits/chosen": 5.915364742279053, "logits/rejected": 5.309688568115234, "logps/chosen": -226.29086303710938, "logps/rejected": -190.01730346679688, "loss": 0.6341, "rewards/accuracies": 0.625, "rewards/chosen": 0.46011900901794434, "rewards/margins": 0.15468966960906982, "rewards/rejected": 0.3054293394088745, "step": 1178 }, { "epoch": 0.18233133578194471, "grad_norm": 4.520188808441162, "learning_rate": 3.038659793814433e-06, "logits/chosen": 5.355957984924316, "logits/rejected": 6.04193639755249, "logps/chosen": -238.64024353027344, "logps/rejected": -228.92774963378906, "loss": 0.7198, "rewards/accuracies": 0.625, "rewards/chosen": 0.4141625761985779, "rewards/margins": -0.03612041845917702, "rewards/rejected": 0.4502830505371094, "step": 1179 }, { "epoch": 0.18248598492170887, "grad_norm": 5.92887544631958, "learning_rate": 3.041237113402062e-06, "logits/chosen": 7.3638529777526855, "logits/rejected": 4.4720258712768555, "logps/chosen": -288.26507568359375, "logps/rejected": -205.216064453125, "loss": 0.6825, "rewards/accuracies": 0.5, "rewards/chosen": 0.238226979970932, "rewards/margins": 0.04762476682662964, "rewards/rejected": 0.19060221314430237, "step": 1180 }, { "epoch": 0.18264063406147305, "grad_norm": 7.217073917388916, "learning_rate": 3.043814432989691e-06, "logits/chosen": 10.551058769226074, "logits/rejected": 4.731176376342773, "logps/chosen": -338.35711669921875, "logps/rejected": -216.30242919921875, "loss": 0.6928, "rewards/accuracies": 0.75, "rewards/chosen": 0.4426460266113281, "rewards/margins": 0.03709197789430618, "rewards/rejected": 0.40555405616760254, "step": 1181 }, { "epoch": 0.1827952832012372, "grad_norm": 6.286205768585205, "learning_rate": 3.0463917525773198e-06, "logits/chosen": 8.504984855651855, "logits/rejected": 4.316731929779053, "logps/chosen": -219.84622192382812, "logps/rejected": -222.9134063720703, "loss": 0.6741, "rewards/accuracies": 0.5, "rewards/chosen": 0.38894233107566833, "rewards/margins": 0.06511226296424866, "rewards/rejected": 0.3238300681114197, "step": 1182 }, { "epoch": 0.18294993234100135, "grad_norm": 4.606963634490967, "learning_rate": 3.048969072164949e-06, "logits/chosen": 8.149803161621094, "logits/rejected": 4.0450053215026855, "logps/chosen": -265.49542236328125, "logps/rejected": -237.5177764892578, "loss": 0.6337, "rewards/accuracies": 0.75, "rewards/chosen": 0.6777053475379944, "rewards/margins": 0.13223743438720703, "rewards/rejected": 0.5454679131507874, "step": 1183 }, { "epoch": 0.18310458148076553, "grad_norm": 5.380277156829834, "learning_rate": 3.051546391752578e-06, "logits/chosen": 13.869263648986816, "logits/rejected": 9.955480575561523, "logps/chosen": -388.0926818847656, "logps/rejected": -282.67889404296875, "loss": 0.604, "rewards/accuracies": 0.875, "rewards/chosen": 0.5288920402526855, "rewards/margins": 0.19544801115989685, "rewards/rejected": 0.3334440290927887, "step": 1184 }, { "epoch": 0.18325923062052968, "grad_norm": 6.7408952713012695, "learning_rate": 3.054123711340207e-06, "logits/chosen": 11.916411399841309, "logits/rejected": 10.430109024047852, "logps/chosen": -248.553955078125, "logps/rejected": -242.81214904785156, "loss": 0.7607, "rewards/accuracies": 0.375, "rewards/chosen": 0.27385321259498596, "rewards/margins": -0.11242671310901642, "rewards/rejected": 0.3862799108028412, "step": 1185 }, { "epoch": 0.18341387976029383, "grad_norm": 9.486915588378906, "learning_rate": 3.0567010309278357e-06, "logits/chosen": 11.307969093322754, "logits/rejected": 9.538265228271484, "logps/chosen": -348.06982421875, "logps/rejected": -356.6627197265625, "loss": 0.7011, "rewards/accuracies": 0.5, "rewards/chosen": 0.5180324912071228, "rewards/margins": 0.012578964233398438, "rewards/rejected": 0.5054535865783691, "step": 1186 }, { "epoch": 0.18356852890005798, "grad_norm": 5.4604716300964355, "learning_rate": 3.0592783505154646e-06, "logits/chosen": 7.491862773895264, "logits/rejected": 10.485586166381836, "logps/chosen": -236.80398559570312, "logps/rejected": -288.0054626464844, "loss": 0.6647, "rewards/accuracies": 0.625, "rewards/chosen": 0.4369448721408844, "rewards/margins": 0.11426103860139847, "rewards/rejected": 0.32268381118774414, "step": 1187 }, { "epoch": 0.18372317803982216, "grad_norm": 24.633495330810547, "learning_rate": 3.0618556701030934e-06, "logits/chosen": 18.332578659057617, "logits/rejected": 9.687328338623047, "logps/chosen": -360.6594543457031, "logps/rejected": -236.90896606445312, "loss": 0.6789, "rewards/accuracies": 0.375, "rewards/chosen": 0.4057636260986328, "rewards/margins": 0.03886289522051811, "rewards/rejected": 0.3669007122516632, "step": 1188 }, { "epoch": 0.1838778271795863, "grad_norm": 5.3053154945373535, "learning_rate": 3.0644329896907223e-06, "logits/chosen": 17.353923797607422, "logits/rejected": 16.056013107299805, "logps/chosen": -226.23492431640625, "logps/rejected": -236.54217529296875, "loss": 0.6326, "rewards/accuracies": 0.875, "rewards/chosen": 0.500227689743042, "rewards/margins": 0.1318841576576233, "rewards/rejected": 0.3683435618877411, "step": 1189 }, { "epoch": 0.18403247631935046, "grad_norm": 8.136807441711426, "learning_rate": 3.067010309278351e-06, "logits/chosen": 4.9448747634887695, "logits/rejected": 9.623682975769043, "logps/chosen": -205.90737915039062, "logps/rejected": -155.84613037109375, "loss": 0.8436, "rewards/accuracies": 0.25, "rewards/chosen": 0.2626339793205261, "rewards/margins": -0.2467816025018692, "rewards/rejected": 0.5094155669212341, "step": 1190 }, { "epoch": 0.18418712545911464, "grad_norm": 4.0371527671813965, "learning_rate": 3.06958762886598e-06, "logits/chosen": 3.384756088256836, "logits/rejected": 5.1296796798706055, "logps/chosen": -156.49325561523438, "logps/rejected": -174.53515625, "loss": 0.6863, "rewards/accuracies": 0.5, "rewards/chosen": 0.412746399641037, "rewards/margins": 0.04412619024515152, "rewards/rejected": 0.3686201870441437, "step": 1191 }, { "epoch": 0.1843417745988788, "grad_norm": 4.892765998840332, "learning_rate": 3.0721649484536085e-06, "logits/chosen": 10.358184814453125, "logits/rejected": 7.822065353393555, "logps/chosen": -171.96621704101562, "logps/rejected": -118.33464813232422, "loss": 0.685, "rewards/accuracies": 0.5, "rewards/chosen": 0.39216411113739014, "rewards/margins": 0.027994196861982346, "rewards/rejected": 0.3641698956489563, "step": 1192 }, { "epoch": 0.18449642373864295, "grad_norm": 21.119739532470703, "learning_rate": 3.0747422680412374e-06, "logits/chosen": 12.397688865661621, "logits/rejected": 11.894867897033691, "logps/chosen": -306.3863220214844, "logps/rejected": -322.08416748046875, "loss": 0.782, "rewards/accuracies": 0.375, "rewards/chosen": 0.3565625548362732, "rewards/margins": -0.12484966218471527, "rewards/rejected": 0.48141220211982727, "step": 1193 }, { "epoch": 0.18465107287840712, "grad_norm": 5.164624214172363, "learning_rate": 3.0773195876288662e-06, "logits/chosen": 5.650229454040527, "logits/rejected": 4.6545562744140625, "logps/chosen": -207.19813537597656, "logps/rejected": -236.67950439453125, "loss": 0.6101, "rewards/accuracies": 0.75, "rewards/chosen": 0.6313309073448181, "rewards/margins": 0.20105575025081635, "rewards/rejected": 0.43027520179748535, "step": 1194 }, { "epoch": 0.18480572201817128, "grad_norm": 10.106759071350098, "learning_rate": 3.079896907216495e-06, "logits/chosen": 6.211137294769287, "logits/rejected": 8.031717300415039, "logps/chosen": -208.40060424804688, "logps/rejected": -265.97882080078125, "loss": 0.6533, "rewards/accuracies": 0.5, "rewards/chosen": 0.4110247790813446, "rewards/margins": 0.1256294846534729, "rewards/rejected": 0.2853952944278717, "step": 1195 }, { "epoch": 0.18496037115793543, "grad_norm": 4.772614479064941, "learning_rate": 3.082474226804124e-06, "logits/chosen": 7.721717357635498, "logits/rejected": 5.436555862426758, "logps/chosen": -225.6160888671875, "logps/rejected": -152.51763916015625, "loss": 0.73, "rewards/accuracies": 0.125, "rewards/chosen": 0.38208839297294617, "rewards/margins": -0.06981611251831055, "rewards/rejected": 0.4519044756889343, "step": 1196 }, { "epoch": 0.1851150202976996, "grad_norm": 5.007623195648193, "learning_rate": 3.085051546391753e-06, "logits/chosen": 9.873635292053223, "logits/rejected": 5.753932476043701, "logps/chosen": -270.7194519042969, "logps/rejected": -234.20751953125, "loss": 0.6565, "rewards/accuracies": 0.625, "rewards/chosen": 0.34522420167922974, "rewards/margins": 0.0871821939945221, "rewards/rejected": 0.25804200768470764, "step": 1197 }, { "epoch": 0.18526966943746376, "grad_norm": 6.744195938110352, "learning_rate": 3.0876288659793817e-06, "logits/chosen": 15.802885055541992, "logits/rejected": 15.260936737060547, "logps/chosen": -300.5866394042969, "logps/rejected": -264.56549072265625, "loss": 0.6644, "rewards/accuracies": 0.375, "rewards/chosen": 0.5173041224479675, "rewards/margins": 0.0647246390581131, "rewards/rejected": 0.4525794982910156, "step": 1198 }, { "epoch": 0.1854243185772279, "grad_norm": 6.954531192779541, "learning_rate": 3.0902061855670106e-06, "logits/chosen": 12.912381172180176, "logits/rejected": 9.995681762695312, "logps/chosen": -394.2403564453125, "logps/rejected": -345.6117248535156, "loss": 0.688, "rewards/accuracies": 0.625, "rewards/chosen": 0.49471521377563477, "rewards/margins": 0.028119433671236038, "rewards/rejected": 0.46659573912620544, "step": 1199 }, { "epoch": 0.18557896771699206, "grad_norm": 6.823832035064697, "learning_rate": 3.0927835051546395e-06, "logits/chosen": 5.388941287994385, "logits/rejected": 7.926120281219482, "logps/chosen": -224.07943725585938, "logps/rejected": -279.90966796875, "loss": 0.7515, "rewards/accuracies": 0.375, "rewards/chosen": 0.4284442663192749, "rewards/margins": -0.0993957370519638, "rewards/rejected": 0.5278400182723999, "step": 1200 }, { "epoch": 0.18573361685675624, "grad_norm": 5.436979293823242, "learning_rate": 3.0953608247422683e-06, "logits/chosen": 12.300765991210938, "logits/rejected": 9.664896011352539, "logps/chosen": -286.91607666015625, "logps/rejected": -291.53656005859375, "loss": 0.6764, "rewards/accuracies": 0.5, "rewards/chosen": 0.6235439777374268, "rewards/margins": 0.048819400370121, "rewards/rejected": 0.574724555015564, "step": 1201 }, { "epoch": 0.1858882659965204, "grad_norm": 6.066813945770264, "learning_rate": 3.097938144329897e-06, "logits/chosen": 9.628548622131348, "logits/rejected": 16.925989151000977, "logps/chosen": -183.95472717285156, "logps/rejected": -290.9954833984375, "loss": 0.7769, "rewards/accuracies": 0.25, "rewards/chosen": 0.42231979966163635, "rewards/margins": -0.1516256332397461, "rewards/rejected": 0.5739454030990601, "step": 1202 }, { "epoch": 0.18604291513628454, "grad_norm": 5.672567844390869, "learning_rate": 3.100515463917526e-06, "logits/chosen": 13.347369194030762, "logits/rejected": 12.573772430419922, "logps/chosen": -325.94793701171875, "logps/rejected": -342.54583740234375, "loss": 0.6911, "rewards/accuracies": 0.625, "rewards/chosen": 0.561053991317749, "rewards/margins": 0.01670600287616253, "rewards/rejected": 0.5443479418754578, "step": 1203 }, { "epoch": 0.18619756427604872, "grad_norm": 5.7790398597717285, "learning_rate": 3.103092783505155e-06, "logits/chosen": 7.422928810119629, "logits/rejected": 6.094414710998535, "logps/chosen": -242.73480224609375, "logps/rejected": -189.88925170898438, "loss": 0.6946, "rewards/accuracies": 0.5, "rewards/chosen": 0.36713293194770813, "rewards/margins": 0.010114848613739014, "rewards/rejected": 0.3570181131362915, "step": 1204 }, { "epoch": 0.18635221341581287, "grad_norm": 7.573155403137207, "learning_rate": 3.105670103092784e-06, "logits/chosen": 5.264227867126465, "logits/rejected": 7.8597540855407715, "logps/chosen": -314.0546569824219, "logps/rejected": -322.2362365722656, "loss": 0.7352, "rewards/accuracies": 0.375, "rewards/chosen": 0.415477454662323, "rewards/margins": -0.06558741629123688, "rewards/rejected": 0.4810648560523987, "step": 1205 }, { "epoch": 0.18650686255557702, "grad_norm": 5.678481578826904, "learning_rate": 3.1082474226804127e-06, "logits/chosen": 10.003520965576172, "logits/rejected": 10.643257141113281, "logps/chosen": -250.01773071289062, "logps/rejected": -241.93426513671875, "loss": 0.6846, "rewards/accuracies": 0.5, "rewards/chosen": 0.32810544967651367, "rewards/margins": 0.031289104372262955, "rewards/rejected": 0.296816349029541, "step": 1206 }, { "epoch": 0.1866615116953412, "grad_norm": 5.1704864501953125, "learning_rate": 3.1108247422680416e-06, "logits/chosen": 10.481098175048828, "logits/rejected": 7.659857273101807, "logps/chosen": -228.67388916015625, "logps/rejected": -189.83267211914062, "loss": 0.71, "rewards/accuracies": 0.25, "rewards/chosen": 0.3601820170879364, "rewards/margins": -0.01919415593147278, "rewards/rejected": 0.3793761432170868, "step": 1207 }, { "epoch": 0.18681616083510535, "grad_norm": 5.7654571533203125, "learning_rate": 3.1134020618556704e-06, "logits/chosen": 6.949052810668945, "logits/rejected": 0.6345968246459961, "logps/chosen": -270.25494384765625, "logps/rejected": -184.03750610351562, "loss": 0.7517, "rewards/accuracies": 0.375, "rewards/chosen": 0.3889699876308441, "rewards/margins": -0.09388561546802521, "rewards/rejected": 0.48285558819770813, "step": 1208 }, { "epoch": 0.1869708099748695, "grad_norm": 4.760369777679443, "learning_rate": 3.1159793814432993e-06, "logits/chosen": 14.434054374694824, "logits/rejected": 9.79536247253418, "logps/chosen": -289.9521789550781, "logps/rejected": -228.3650665283203, "loss": 0.6686, "rewards/accuracies": 0.625, "rewards/chosen": 0.4275573790073395, "rewards/margins": 0.0646824836730957, "rewards/rejected": 0.3628748953342438, "step": 1209 }, { "epoch": 0.18712545911463369, "grad_norm": 14.572488784790039, "learning_rate": 3.118556701030928e-06, "logits/chosen": 11.860732078552246, "logits/rejected": 9.801314353942871, "logps/chosen": -328.2403564453125, "logps/rejected": -261.2964782714844, "loss": 0.6285, "rewards/accuracies": 0.75, "rewards/chosen": 0.5980567932128906, "rewards/margins": 0.14208680391311646, "rewards/rejected": 0.45596998929977417, "step": 1210 }, { "epoch": 0.18728010825439784, "grad_norm": 6.36720609664917, "learning_rate": 3.121134020618557e-06, "logits/chosen": 9.14716911315918, "logits/rejected": 6.533000946044922, "logps/chosen": -297.1690673828125, "logps/rejected": -278.9299011230469, "loss": 0.7287, "rewards/accuracies": 0.25, "rewards/chosen": 0.3750815987586975, "rewards/margins": -0.047883618623018265, "rewards/rejected": 0.4229651987552643, "step": 1211 }, { "epoch": 0.187434757394162, "grad_norm": 7.197250843048096, "learning_rate": 3.123711340206186e-06, "logits/chosen": 8.806530952453613, "logits/rejected": 8.954459190368652, "logps/chosen": -300.0014953613281, "logps/rejected": -336.94805908203125, "loss": 0.7026, "rewards/accuracies": 0.375, "rewards/chosen": 0.37373867630958557, "rewards/margins": 0.03315524384379387, "rewards/rejected": 0.3405834436416626, "step": 1212 }, { "epoch": 0.18758940653392617, "grad_norm": 5.8357696533203125, "learning_rate": 3.126288659793815e-06, "logits/chosen": 9.556649208068848, "logits/rejected": 11.319966316223145, "logps/chosen": -358.9806823730469, "logps/rejected": -330.43853759765625, "loss": 0.7371, "rewards/accuracies": 0.375, "rewards/chosen": 0.4974590539932251, "rewards/margins": -0.07199737429618835, "rewards/rejected": 0.5694563984870911, "step": 1213 }, { "epoch": 0.18774405567369032, "grad_norm": 6.848079204559326, "learning_rate": 3.1288659793814437e-06, "logits/chosen": 6.142574310302734, "logits/rejected": 5.4072957038879395, "logps/chosen": -269.4252014160156, "logps/rejected": -296.4049987792969, "loss": 0.6496, "rewards/accuracies": 0.5, "rewards/chosen": 0.6798981428146362, "rewards/margins": 0.1462177187204361, "rewards/rejected": 0.5336804389953613, "step": 1214 }, { "epoch": 0.18789870481345447, "grad_norm": 6.2556071281433105, "learning_rate": 3.1314432989690725e-06, "logits/chosen": 11.529394149780273, "logits/rejected": 5.767757892608643, "logps/chosen": -280.46600341796875, "logps/rejected": -160.28274536132812, "loss": 0.7714, "rewards/accuracies": 0.0, "rewards/chosen": 0.24274583160877228, "rewards/margins": -0.14023223519325256, "rewards/rejected": 0.38297808170318604, "step": 1215 }, { "epoch": 0.18805335395321862, "grad_norm": 2.86610746383667, "learning_rate": 3.1340206185567014e-06, "logits/chosen": 6.791140079498291, "logits/rejected": 7.40846061706543, "logps/chosen": -108.73541259765625, "logps/rejected": -108.48594665527344, "loss": 0.6359, "rewards/accuracies": 0.625, "rewards/chosen": 0.4238719642162323, "rewards/margins": 0.12224177271127701, "rewards/rejected": 0.3016301989555359, "step": 1216 }, { "epoch": 0.1882080030929828, "grad_norm": 6.95395040512085, "learning_rate": 3.1365979381443303e-06, "logits/chosen": 10.178174018859863, "logits/rejected": 8.106420516967773, "logps/chosen": -393.7174987792969, "logps/rejected": -381.8161926269531, "loss": 0.679, "rewards/accuracies": 0.75, "rewards/chosen": 0.5753136873245239, "rewards/margins": 0.10919182747602463, "rewards/rejected": 0.4661218523979187, "step": 1217 }, { "epoch": 0.18836265223274695, "grad_norm": 4.500452041625977, "learning_rate": 3.139175257731959e-06, "logits/chosen": 8.35544490814209, "logits/rejected": 4.280854225158691, "logps/chosen": -264.40826416015625, "logps/rejected": -249.41998291015625, "loss": 0.6094, "rewards/accuracies": 0.875, "rewards/chosen": 0.4063566327095032, "rewards/margins": 0.19208449125289917, "rewards/rejected": 0.2142721563577652, "step": 1218 }, { "epoch": 0.1885173013725111, "grad_norm": 4.626194953918457, "learning_rate": 3.141752577319588e-06, "logits/chosen": 8.580806732177734, "logits/rejected": 9.48959732055664, "logps/chosen": -319.87115478515625, "logps/rejected": -339.5631103515625, "loss": 0.6076, "rewards/accuracies": 0.75, "rewards/chosen": 0.5394479036331177, "rewards/margins": 0.20765885710716248, "rewards/rejected": 0.3317890167236328, "step": 1219 }, { "epoch": 0.18867195051227528, "grad_norm": 5.603726863861084, "learning_rate": 3.1443298969072165e-06, "logits/chosen": 9.180947303771973, "logits/rejected": 0.005242586135864258, "logps/chosen": -313.40899658203125, "logps/rejected": -191.90228271484375, "loss": 0.6322, "rewards/accuracies": 0.625, "rewards/chosen": 0.5424249172210693, "rewards/margins": 0.13756096363067627, "rewards/rejected": 0.40486395359039307, "step": 1220 }, { "epoch": 0.18882659965203943, "grad_norm": 9.603288650512695, "learning_rate": 3.1469072164948453e-06, "logits/chosen": 7.6078715324401855, "logits/rejected": 10.072007179260254, "logps/chosen": -247.25083923339844, "logps/rejected": -269.4310302734375, "loss": 0.632, "rewards/accuracies": 0.625, "rewards/chosen": 0.4233908951282501, "rewards/margins": 0.14118510484695435, "rewards/rejected": 0.2822057604789734, "step": 1221 }, { "epoch": 0.18898124879180359, "grad_norm": 3.6513845920562744, "learning_rate": 3.1494845360824742e-06, "logits/chosen": 12.510441780090332, "logits/rejected": 8.736970901489258, "logps/chosen": -235.2794647216797, "logps/rejected": -152.3460693359375, "loss": 0.6386, "rewards/accuracies": 0.75, "rewards/chosen": 0.48582226037979126, "rewards/margins": 0.12270636111497879, "rewards/rejected": 0.36311590671539307, "step": 1222 }, { "epoch": 0.18913589793156776, "grad_norm": 4.3698506355285645, "learning_rate": 3.152061855670103e-06, "logits/chosen": 5.830020904541016, "logits/rejected": 0.8380942940711975, "logps/chosen": -228.67303466796875, "logps/rejected": -148.95106506347656, "loss": 0.6044, "rewards/accuracies": 0.75, "rewards/chosen": 0.48227614164352417, "rewards/margins": 0.20268318057060242, "rewards/rejected": 0.27959299087524414, "step": 1223 }, { "epoch": 0.18929054707133192, "grad_norm": 5.827017307281494, "learning_rate": 3.154639175257732e-06, "logits/chosen": 9.446216583251953, "logits/rejected": 10.732011795043945, "logps/chosen": -257.458251953125, "logps/rejected": -302.6133728027344, "loss": 0.7338, "rewards/accuracies": 0.375, "rewards/chosen": 0.34902212023735046, "rewards/margins": -0.057440612465143204, "rewards/rejected": 0.40646272897720337, "step": 1224 }, { "epoch": 0.18944519621109607, "grad_norm": 8.706448554992676, "learning_rate": 3.157216494845361e-06, "logits/chosen": 11.037115097045898, "logits/rejected": 7.214148044586182, "logps/chosen": -283.21307373046875, "logps/rejected": -283.8851623535156, "loss": 0.6702, "rewards/accuracies": 0.625, "rewards/chosen": 0.5103629231452942, "rewards/margins": 0.06468954682350159, "rewards/rejected": 0.445673406124115, "step": 1225 }, { "epoch": 0.18959984535086025, "grad_norm": 4.980746746063232, "learning_rate": 3.1597938144329897e-06, "logits/chosen": 6.920391082763672, "logits/rejected": 7.818106174468994, "logps/chosen": -322.8192138671875, "logps/rejected": -290.6893310546875, "loss": 0.63, "rewards/accuracies": 0.75, "rewards/chosen": 0.5367587804794312, "rewards/margins": 0.14732728898525238, "rewards/rejected": 0.3894314765930176, "step": 1226 }, { "epoch": 0.1897544944906244, "grad_norm": 5.300254821777344, "learning_rate": 3.1623711340206186e-06, "logits/chosen": 8.445317268371582, "logits/rejected": 9.44327163696289, "logps/chosen": -203.99066162109375, "logps/rejected": -152.672119140625, "loss": 0.7431, "rewards/accuracies": 0.375, "rewards/chosen": 0.46075350046157837, "rewards/margins": -0.08466558158397675, "rewards/rejected": 0.5454190969467163, "step": 1227 }, { "epoch": 0.18990914363038855, "grad_norm": 31.06129264831543, "learning_rate": 3.1649484536082475e-06, "logits/chosen": 6.146291732788086, "logits/rejected": 9.227516174316406, "logps/chosen": -198.13046264648438, "logps/rejected": -260.4383544921875, "loss": 0.7542, "rewards/accuracies": 0.25, "rewards/chosen": 0.503790557384491, "rewards/margins": -0.1043115183711052, "rewards/rejected": 0.6081020832061768, "step": 1228 }, { "epoch": 0.19006379277015273, "grad_norm": 9.248373031616211, "learning_rate": 3.1675257731958763e-06, "logits/chosen": 9.757558822631836, "logits/rejected": 7.950119495391846, "logps/chosen": -297.04473876953125, "logps/rejected": -293.9345703125, "loss": 0.6587, "rewards/accuracies": 0.375, "rewards/chosen": 0.6458636522293091, "rewards/margins": 0.0953555479645729, "rewards/rejected": 0.5505080223083496, "step": 1229 }, { "epoch": 0.19021844190991688, "grad_norm": 19.740877151489258, "learning_rate": 3.170103092783505e-06, "logits/chosen": 9.515580177307129, "logits/rejected": -1.551758885383606, "logps/chosen": -217.4691619873047, "logps/rejected": -115.7433853149414, "loss": 0.7155, "rewards/accuracies": 0.375, "rewards/chosen": 0.29984092712402344, "rewards/margins": -0.03631095588207245, "rewards/rejected": 0.3361518979072571, "step": 1230 }, { "epoch": 0.19037309104968103, "grad_norm": 5.421351909637451, "learning_rate": 3.172680412371134e-06, "logits/chosen": 8.8172025680542, "logits/rejected": 7.618391513824463, "logps/chosen": -246.85678100585938, "logps/rejected": -225.1162567138672, "loss": 0.7273, "rewards/accuracies": 0.375, "rewards/chosen": 0.38675397634506226, "rewards/margins": -0.05506296083331108, "rewards/rejected": 0.44181695580482483, "step": 1231 }, { "epoch": 0.19052774018944518, "grad_norm": 7.741054534912109, "learning_rate": 3.175257731958763e-06, "logits/chosen": 11.476730346679688, "logits/rejected": 12.512069702148438, "logps/chosen": -417.05908203125, "logps/rejected": -348.0390319824219, "loss": 0.731, "rewards/accuracies": 0.25, "rewards/chosen": 0.4446180462837219, "rewards/margins": -0.05932272970676422, "rewards/rejected": 0.5039408206939697, "step": 1232 }, { "epoch": 0.19068238932920936, "grad_norm": 4.535299777984619, "learning_rate": 3.177835051546392e-06, "logits/chosen": 10.029155731201172, "logits/rejected": 5.178633689880371, "logps/chosen": -294.2568054199219, "logps/rejected": -243.51840209960938, "loss": 0.6316, "rewards/accuracies": 0.75, "rewards/chosen": 0.5292582511901855, "rewards/margins": 0.14780116081237793, "rewards/rejected": 0.3814570903778076, "step": 1233 }, { "epoch": 0.1908370384689735, "grad_norm": 4.163630962371826, "learning_rate": 3.1804123711340207e-06, "logits/chosen": 8.796285629272461, "logits/rejected": 1.4558358192443848, "logps/chosen": -301.8804931640625, "logps/rejected": -201.09512329101562, "loss": 0.5625, "rewards/accuracies": 0.875, "rewards/chosen": 0.655165433883667, "rewards/margins": 0.3351425230503082, "rewards/rejected": 0.32002294063568115, "step": 1234 }, { "epoch": 0.19099168760873766, "grad_norm": 6.68710994720459, "learning_rate": 3.1829896907216496e-06, "logits/chosen": 8.119000434875488, "logits/rejected": 7.294620513916016, "logps/chosen": -353.65087890625, "logps/rejected": -350.862060546875, "loss": 0.7298, "rewards/accuracies": 0.5, "rewards/chosen": 0.5050303936004639, "rewards/margins": -0.05927343666553497, "rewards/rejected": 0.5643037557601929, "step": 1235 }, { "epoch": 0.19114633674850184, "grad_norm": 5.4455342292785645, "learning_rate": 3.1855670103092784e-06, "logits/chosen": 14.876818656921387, "logits/rejected": 13.735360145568848, "logps/chosen": -282.4361572265625, "logps/rejected": -353.4479675292969, "loss": 0.5893, "rewards/accuracies": 0.875, "rewards/chosen": 0.6284136772155762, "rewards/margins": 0.24710312485694885, "rewards/rejected": 0.3813105523586273, "step": 1236 }, { "epoch": 0.191300985888266, "grad_norm": 10.938210487365723, "learning_rate": 3.1881443298969073e-06, "logits/chosen": 11.34907341003418, "logits/rejected": 5.414515495300293, "logps/chosen": -398.83441162109375, "logps/rejected": -237.18492126464844, "loss": 0.6393, "rewards/accuracies": 0.625, "rewards/chosen": 0.6420078873634338, "rewards/margins": 0.11836714297533035, "rewards/rejected": 0.5236407518386841, "step": 1237 }, { "epoch": 0.19145563502803015, "grad_norm": 8.826349258422852, "learning_rate": 3.190721649484536e-06, "logits/chosen": 13.579442024230957, "logits/rejected": 9.15294361114502, "logps/chosen": -294.3518371582031, "logps/rejected": -255.98394775390625, "loss": 0.7309, "rewards/accuracies": 0.625, "rewards/chosen": 0.4393633306026459, "rewards/margins": -0.05169167369604111, "rewards/rejected": 0.4910550117492676, "step": 1238 }, { "epoch": 0.19161028416779433, "grad_norm": 5.464394569396973, "learning_rate": 3.193298969072165e-06, "logits/chosen": 3.160524368286133, "logits/rejected": 0.9966862201690674, "logps/chosen": -233.94293212890625, "logps/rejected": -200.45401000976562, "loss": 0.7004, "rewards/accuracies": 0.625, "rewards/chosen": 0.5308198928833008, "rewards/margins": 0.007534883916378021, "rewards/rejected": 0.5232850313186646, "step": 1239 }, { "epoch": 0.19176493330755848, "grad_norm": 4.881063938140869, "learning_rate": 3.195876288659794e-06, "logits/chosen": 11.210578918457031, "logits/rejected": 7.351213455200195, "logps/chosen": -193.02114868164062, "logps/rejected": -181.2538299560547, "loss": 0.6749, "rewards/accuracies": 0.5, "rewards/chosen": 0.4692145586013794, "rewards/margins": 0.05012582987546921, "rewards/rejected": 0.4190887212753296, "step": 1240 }, { "epoch": 0.19191958244732263, "grad_norm": 63.044219970703125, "learning_rate": 3.1984536082474228e-06, "logits/chosen": 12.134191513061523, "logits/rejected": 14.954732894897461, "logps/chosen": -229.34231567382812, "logps/rejected": -278.4432373046875, "loss": 0.7254, "rewards/accuracies": 0.5, "rewards/chosen": 0.3644210398197174, "rewards/margins": -0.05073036998510361, "rewards/rejected": 0.4151514172554016, "step": 1241 }, { "epoch": 0.1920742315870868, "grad_norm": 4.016191482543945, "learning_rate": 3.2010309278350517e-06, "logits/chosen": 15.895033836364746, "logits/rejected": 8.987711906433105, "logps/chosen": -294.3758850097656, "logps/rejected": -147.92990112304688, "loss": 0.5742, "rewards/accuracies": 0.75, "rewards/chosen": 0.6089287400245667, "rewards/margins": 0.2810121178627014, "rewards/rejected": 0.32791662216186523, "step": 1242 }, { "epoch": 0.19222888072685096, "grad_norm": 5.765580177307129, "learning_rate": 3.203608247422681e-06, "logits/chosen": 8.613408088684082, "logits/rejected": 8.338961601257324, "logps/chosen": -296.4032897949219, "logps/rejected": -270.2394104003906, "loss": 0.6631, "rewards/accuracies": 0.625, "rewards/chosen": 0.4387373924255371, "rewards/margins": 0.07200220972299576, "rewards/rejected": 0.36673516035079956, "step": 1243 }, { "epoch": 0.1923835298666151, "grad_norm": 5.679923057556152, "learning_rate": 3.20618556701031e-06, "logits/chosen": 5.362502574920654, "logits/rejected": 12.569731712341309, "logps/chosen": -167.60635375976562, "logps/rejected": -258.2720642089844, "loss": 0.7329, "rewards/accuracies": 0.5, "rewards/chosen": 0.4995642304420471, "rewards/margins": -0.06724506616592407, "rewards/rejected": 0.5668092966079712, "step": 1244 }, { "epoch": 0.1925381790063793, "grad_norm": 4.202012062072754, "learning_rate": 3.2087628865979387e-06, "logits/chosen": 8.0896635055542, "logits/rejected": 10.821783065795898, "logps/chosen": -194.33839416503906, "logps/rejected": -237.69244384765625, "loss": 0.7228, "rewards/accuracies": 0.5, "rewards/chosen": 0.45260074734687805, "rewards/margins": -0.04802299290895462, "rewards/rejected": 0.5006237626075745, "step": 1245 }, { "epoch": 0.19269282814614344, "grad_norm": 4.239159107208252, "learning_rate": 3.2113402061855676e-06, "logits/chosen": 7.96912145614624, "logits/rejected": 9.134454727172852, "logps/chosen": -176.5997314453125, "logps/rejected": -213.2306671142578, "loss": 0.7185, "rewards/accuracies": 0.5, "rewards/chosen": 0.5998779535293579, "rewards/margins": -0.03788008168339729, "rewards/rejected": 0.6377581357955933, "step": 1246 }, { "epoch": 0.1928474772859076, "grad_norm": 4.657351493835449, "learning_rate": 3.2139175257731964e-06, "logits/chosen": 7.918962478637695, "logits/rejected": 3.499849796295166, "logps/chosen": -213.512939453125, "logps/rejected": -168.80560302734375, "loss": 0.6266, "rewards/accuracies": 0.75, "rewards/chosen": 0.5484223365783691, "rewards/margins": 0.158765509724617, "rewards/rejected": 0.38965684175491333, "step": 1247 }, { "epoch": 0.19300212642567174, "grad_norm": 4.860681056976318, "learning_rate": 3.2164948453608253e-06, "logits/chosen": 11.14093017578125, "logits/rejected": 6.807981967926025, "logps/chosen": -201.78338623046875, "logps/rejected": -175.76773071289062, "loss": 0.6059, "rewards/accuracies": 0.875, "rewards/chosen": 0.49234187602996826, "rewards/margins": 0.195110023021698, "rewards/rejected": 0.29723188281059265, "step": 1248 }, { "epoch": 0.19315677556543592, "grad_norm": 26.690044403076172, "learning_rate": 3.219072164948454e-06, "logits/chosen": 11.988876342773438, "logits/rejected": 7.563211441040039, "logps/chosen": -397.692626953125, "logps/rejected": -368.33111572265625, "loss": 0.6718, "rewards/accuracies": 0.625, "rewards/chosen": 0.5251701474189758, "rewards/margins": 0.06899290531873703, "rewards/rejected": 0.4561772346496582, "step": 1249 }, { "epoch": 0.19331142470520007, "grad_norm": 4.7454071044921875, "learning_rate": 3.221649484536083e-06, "logits/chosen": 9.304478645324707, "logits/rejected": 9.801323890686035, "logps/chosen": -174.55299377441406, "logps/rejected": -251.90016174316406, "loss": 0.6694, "rewards/accuracies": 0.625, "rewards/chosen": 0.40585270524024963, "rewards/margins": 0.05353386700153351, "rewards/rejected": 0.3523188531398773, "step": 1250 }, { "epoch": 0.19346607384496423, "grad_norm": 6.183058738708496, "learning_rate": 3.224226804123712e-06, "logits/chosen": 14.203052520751953, "logits/rejected": 9.77611255645752, "logps/chosen": -393.35284423828125, "logps/rejected": -310.80517578125, "loss": 0.608, "rewards/accuracies": 0.625, "rewards/chosen": 0.6375166177749634, "rewards/margins": 0.20667657256126404, "rewards/rejected": 0.43084004521369934, "step": 1251 }, { "epoch": 0.1936207229847284, "grad_norm": 10.06793212890625, "learning_rate": 3.226804123711341e-06, "logits/chosen": 7.5367817878723145, "logits/rejected": 3.743190050125122, "logps/chosen": -313.5653076171875, "logps/rejected": -307.9297180175781, "loss": 0.6956, "rewards/accuracies": 0.5, "rewards/chosen": 0.43930351734161377, "rewards/margins": 0.10602889209985733, "rewards/rejected": 0.33327460289001465, "step": 1252 }, { "epoch": 0.19377537212449256, "grad_norm": 5.206816673278809, "learning_rate": 3.2293814432989697e-06, "logits/chosen": 3.010655403137207, "logits/rejected": 11.41939926147461, "logps/chosen": -150.63723754882812, "logps/rejected": -224.94993591308594, "loss": 0.7624, "rewards/accuracies": 0.125, "rewards/chosen": 0.47446364164352417, "rewards/margins": -0.12096692621707916, "rewards/rejected": 0.5954306125640869, "step": 1253 }, { "epoch": 0.1939300212642567, "grad_norm": 5.481898784637451, "learning_rate": 3.2319587628865985e-06, "logits/chosen": 12.624159812927246, "logits/rejected": 6.349588394165039, "logps/chosen": -281.23651123046875, "logps/rejected": -243.25228881835938, "loss": 0.6732, "rewards/accuracies": 0.625, "rewards/chosen": 0.4826972484588623, "rewards/margins": 0.05940217897295952, "rewards/rejected": 0.4232950806617737, "step": 1254 }, { "epoch": 0.1940846704040209, "grad_norm": 4.178683280944824, "learning_rate": 3.2345360824742274e-06, "logits/chosen": 12.260648727416992, "logits/rejected": 3.160780668258667, "logps/chosen": -265.9971618652344, "logps/rejected": -153.32199096679688, "loss": 0.6417, "rewards/accuracies": 0.625, "rewards/chosen": 0.47502070665359497, "rewards/margins": 0.11583016812801361, "rewards/rejected": 0.35919055342674255, "step": 1255 }, { "epoch": 0.19423931954378504, "grad_norm": 6.976434707641602, "learning_rate": 3.2371134020618563e-06, "logits/chosen": 11.857710838317871, "logits/rejected": 7.212756633758545, "logps/chosen": -316.90765380859375, "logps/rejected": -269.6816101074219, "loss": 0.6918, "rewards/accuracies": 0.625, "rewards/chosen": 0.6193729639053345, "rewards/margins": 0.024546809494495392, "rewards/rejected": 0.5948261022567749, "step": 1256 }, { "epoch": 0.1943939686835492, "grad_norm": 5.100480556488037, "learning_rate": 3.239690721649485e-06, "logits/chosen": 7.107606410980225, "logits/rejected": 9.467611312866211, "logps/chosen": -309.34173583984375, "logps/rejected": -270.22894287109375, "loss": 0.6226, "rewards/accuracies": 0.625, "rewards/chosen": 0.5951852798461914, "rewards/margins": 0.1706523299217224, "rewards/rejected": 0.4245328903198242, "step": 1257 }, { "epoch": 0.19454861782331337, "grad_norm": 5.747494220733643, "learning_rate": 3.2422680412371136e-06, "logits/chosen": 13.994686126708984, "logits/rejected": 13.255468368530273, "logps/chosen": -256.3341064453125, "logps/rejected": -238.83697509765625, "loss": 0.7348, "rewards/accuracies": 0.375, "rewards/chosen": 0.6035789251327515, "rewards/margins": -0.07494983077049255, "rewards/rejected": 0.6785287857055664, "step": 1258 }, { "epoch": 0.19470326696307752, "grad_norm": 5.3106889724731445, "learning_rate": 3.2448453608247425e-06, "logits/chosen": 10.338050842285156, "logits/rejected": 8.256830215454102, "logps/chosen": -187.38653564453125, "logps/rejected": -230.9452362060547, "loss": 0.7212, "rewards/accuracies": 0.375, "rewards/chosen": 0.4010266661643982, "rewards/margins": -0.031024547293782234, "rewards/rejected": 0.43205124139785767, "step": 1259 }, { "epoch": 0.19485791610284167, "grad_norm": 6.178385257720947, "learning_rate": 3.2474226804123714e-06, "logits/chosen": 9.199402809143066, "logits/rejected": 15.232884407043457, "logps/chosen": -333.17889404296875, "logps/rejected": -418.5166320800781, "loss": 0.7099, "rewards/accuracies": 0.375, "rewards/chosen": 0.8721463680267334, "rewards/margins": -0.0024234652519226074, "rewards/rejected": 0.8745697736740112, "step": 1260 }, { "epoch": 0.19501256524260585, "grad_norm": 6.837693214416504, "learning_rate": 3.2500000000000002e-06, "logits/chosen": 12.279129028320312, "logits/rejected": 6.870036602020264, "logps/chosen": -329.546142578125, "logps/rejected": -227.9161834716797, "loss": 0.7091, "rewards/accuracies": 0.5, "rewards/chosen": 0.40584298968315125, "rewards/margins": -0.016572382301092148, "rewards/rejected": 0.4224153757095337, "step": 1261 }, { "epoch": 0.19516721438237, "grad_norm": 35.677696228027344, "learning_rate": 3.252577319587629e-06, "logits/chosen": 5.335585594177246, "logits/rejected": 6.641610622406006, "logps/chosen": -217.2725372314453, "logps/rejected": -204.86941528320312, "loss": 0.7201, "rewards/accuracies": 0.25, "rewards/chosen": 0.5300777554512024, "rewards/margins": -0.0398736447095871, "rewards/rejected": 0.5699514150619507, "step": 1262 }, { "epoch": 0.19532186352213415, "grad_norm": 9.768061637878418, "learning_rate": 3.255154639175258e-06, "logits/chosen": 8.994683265686035, "logits/rejected": 6.920213222503662, "logps/chosen": -593.1331176757812, "logps/rejected": -482.54583740234375, "loss": 0.6406, "rewards/accuracies": 0.75, "rewards/chosen": 0.7754771113395691, "rewards/margins": 0.14648504555225372, "rewards/rejected": 0.6289920806884766, "step": 1263 }, { "epoch": 0.1954765126618983, "grad_norm": 5.485254764556885, "learning_rate": 3.257731958762887e-06, "logits/chosen": 15.874935150146484, "logits/rejected": 5.449057579040527, "logps/chosen": -473.91400146484375, "logps/rejected": -279.2198181152344, "loss": 0.6103, "rewards/accuracies": 0.5, "rewards/chosen": 0.6073750853538513, "rewards/margins": 0.2410089075565338, "rewards/rejected": 0.3663662075996399, "step": 1264 }, { "epoch": 0.19563116180166248, "grad_norm": 5.843264102935791, "learning_rate": 3.2603092783505157e-06, "logits/chosen": 16.228412628173828, "logits/rejected": 13.174749374389648, "logps/chosen": -384.05303955078125, "logps/rejected": -392.5466613769531, "loss": 0.597, "rewards/accuracies": 0.75, "rewards/chosen": 0.7092897891998291, "rewards/margins": 0.22083836793899536, "rewards/rejected": 0.48845142126083374, "step": 1265 }, { "epoch": 0.19578581094142664, "grad_norm": 4.950313091278076, "learning_rate": 3.2628865979381446e-06, "logits/chosen": 8.358231544494629, "logits/rejected": 5.719239711761475, "logps/chosen": -209.3634490966797, "logps/rejected": -101.864990234375, "loss": 0.6857, "rewards/accuracies": 0.375, "rewards/chosen": 0.30012694001197815, "rewards/margins": 0.0218371395021677, "rewards/rejected": 0.278289794921875, "step": 1266 }, { "epoch": 0.1959404600811908, "grad_norm": 5.50861120223999, "learning_rate": 3.2654639175257735e-06, "logits/chosen": 12.006473541259766, "logits/rejected": 6.123589515686035, "logps/chosen": -273.00140380859375, "logps/rejected": -228.69090270996094, "loss": 0.6964, "rewards/accuracies": 0.625, "rewards/chosen": 0.6794090270996094, "rewards/margins": 0.04387977719306946, "rewards/rejected": 0.6355292797088623, "step": 1267 }, { "epoch": 0.19609510922095497, "grad_norm": 6.035185813903809, "learning_rate": 3.2680412371134023e-06, "logits/chosen": 10.129387855529785, "logits/rejected": 8.775681495666504, "logps/chosen": -365.5245666503906, "logps/rejected": -323.67236328125, "loss": 0.6466, "rewards/accuracies": 0.625, "rewards/chosen": 0.568237841129303, "rewards/margins": 0.1666661500930786, "rewards/rejected": 0.4015716314315796, "step": 1268 }, { "epoch": 0.19624975836071912, "grad_norm": 5.689505100250244, "learning_rate": 3.270618556701031e-06, "logits/chosen": 10.030207633972168, "logits/rejected": 7.495403289794922, "logps/chosen": -305.68170166015625, "logps/rejected": -283.243408203125, "loss": 0.6295, "rewards/accuracies": 0.75, "rewards/chosen": 0.7296551465988159, "rewards/margins": 0.14317269623279572, "rewards/rejected": 0.5864824652671814, "step": 1269 }, { "epoch": 0.19640440750048327, "grad_norm": 5.353102684020996, "learning_rate": 3.27319587628866e-06, "logits/chosen": 11.297200202941895, "logits/rejected": 10.937126159667969, "logps/chosen": -315.881591796875, "logps/rejected": -327.8677062988281, "loss": 0.6324, "rewards/accuracies": 0.75, "rewards/chosen": 0.6738921403884888, "rewards/margins": 0.13873621821403503, "rewards/rejected": 0.5351558923721313, "step": 1270 }, { "epoch": 0.19655905664024745, "grad_norm": 5.362412929534912, "learning_rate": 3.275773195876289e-06, "logits/chosen": 8.960491180419922, "logits/rejected": 8.659473419189453, "logps/chosen": -223.84324645996094, "logps/rejected": -226.14895629882812, "loss": 0.6768, "rewards/accuracies": 0.5, "rewards/chosen": 0.5809481143951416, "rewards/margins": 0.04989110305905342, "rewards/rejected": 0.5310570001602173, "step": 1271 }, { "epoch": 0.1967137057800116, "grad_norm": 4.925468921661377, "learning_rate": 3.278350515463918e-06, "logits/chosen": 13.636338233947754, "logits/rejected": 12.647415161132812, "logps/chosen": -178.77841186523438, "logps/rejected": -187.86346435546875, "loss": 0.6352, "rewards/accuracies": 0.625, "rewards/chosen": 0.5157396793365479, "rewards/margins": 0.148208349943161, "rewards/rejected": 0.367531418800354, "step": 1272 }, { "epoch": 0.19686835491977575, "grad_norm": 5.796316146850586, "learning_rate": 3.2809278350515467e-06, "logits/chosen": 14.344132423400879, "logits/rejected": 6.3060760498046875, "logps/chosen": -433.5544738769531, "logps/rejected": -325.4803161621094, "loss": 0.6271, "rewards/accuracies": 0.5, "rewards/chosen": 0.8045179843902588, "rewards/margins": 0.1832927167415619, "rewards/rejected": 0.6212251782417297, "step": 1273 }, { "epoch": 0.19702300405953993, "grad_norm": 5.151941299438477, "learning_rate": 3.2835051546391756e-06, "logits/chosen": 9.456802368164062, "logits/rejected": 11.307422637939453, "logps/chosen": -346.2414245605469, "logps/rejected": -380.61273193359375, "loss": 0.6291, "rewards/accuracies": 0.625, "rewards/chosen": 0.8784075379371643, "rewards/margins": 0.19248169660568237, "rewards/rejected": 0.6859257817268372, "step": 1274 }, { "epoch": 0.19717765319930408, "grad_norm": 6.6808013916015625, "learning_rate": 3.2860824742268044e-06, "logits/chosen": 3.3954012393951416, "logits/rejected": 4.781131744384766, "logps/chosen": -305.18524169921875, "logps/rejected": -295.1188049316406, "loss": 0.7924, "rewards/accuracies": 0.625, "rewards/chosen": 0.5000433325767517, "rewards/margins": -0.11500897258520126, "rewards/rejected": 0.6150522828102112, "step": 1275 }, { "epoch": 0.19733230233906823, "grad_norm": 7.2388386726379395, "learning_rate": 3.2886597938144333e-06, "logits/chosen": 6.953413009643555, "logits/rejected": 7.740036964416504, "logps/chosen": -232.2669677734375, "logps/rejected": -260.6564025878906, "loss": 0.7105, "rewards/accuracies": 0.375, "rewards/chosen": 0.43705442547798157, "rewards/margins": -0.006439249962568283, "rewards/rejected": 0.44349366426467896, "step": 1276 }, { "epoch": 0.1974869514788324, "grad_norm": 4.207391262054443, "learning_rate": 3.291237113402062e-06, "logits/chosen": 14.358092308044434, "logits/rejected": 14.29216194152832, "logps/chosen": -242.71578979492188, "logps/rejected": -167.500732421875, "loss": 0.6449, "rewards/accuracies": 0.5, "rewards/chosen": 0.49289676547050476, "rewards/margins": 0.11610956490039825, "rewards/rejected": 0.3767872452735901, "step": 1277 }, { "epoch": 0.19764160061859656, "grad_norm": 4.676634788513184, "learning_rate": 3.293814432989691e-06, "logits/chosen": 13.802257537841797, "logits/rejected": 11.306392669677734, "logps/chosen": -276.5202941894531, "logps/rejected": -213.8582763671875, "loss": 0.6764, "rewards/accuracies": 0.75, "rewards/chosen": 0.3753712773323059, "rewards/margins": 0.04441824555397034, "rewards/rejected": 0.33095303177833557, "step": 1278 }, { "epoch": 0.19779624975836071, "grad_norm": 5.706592082977295, "learning_rate": 3.29639175257732e-06, "logits/chosen": 8.070343017578125, "logits/rejected": 16.978242874145508, "logps/chosen": -158.07843017578125, "logps/rejected": -311.79840087890625, "loss": 0.671, "rewards/accuracies": 0.5, "rewards/chosen": 0.4136684536933899, "rewards/margins": 0.05436870455741882, "rewards/rejected": 0.35929977893829346, "step": 1279 }, { "epoch": 0.19795089889812487, "grad_norm": 4.6041107177734375, "learning_rate": 3.298969072164949e-06, "logits/chosen": 10.371328353881836, "logits/rejected": 10.11214542388916, "logps/chosen": -292.2493896484375, "logps/rejected": -311.4103698730469, "loss": 0.6225, "rewards/accuracies": 0.625, "rewards/chosen": 0.5738158226013184, "rewards/margins": 0.22375929355621338, "rewards/rejected": 0.35005655884742737, "step": 1280 }, { "epoch": 0.19810554803788905, "grad_norm": 5.371640205383301, "learning_rate": 3.3015463917525777e-06, "logits/chosen": 5.121698379516602, "logits/rejected": 7.031170845031738, "logps/chosen": -263.9494934082031, "logps/rejected": -246.33132934570312, "loss": 0.6014, "rewards/accuracies": 0.75, "rewards/chosen": 0.6553155183792114, "rewards/margins": 0.20567500591278076, "rewards/rejected": 0.44964051246643066, "step": 1281 }, { "epoch": 0.1982601971776532, "grad_norm": 3.934953212738037, "learning_rate": 3.3041237113402065e-06, "logits/chosen": 9.958637237548828, "logits/rejected": 2.769880771636963, "logps/chosen": -342.2831115722656, "logps/rejected": -206.97698974609375, "loss": 0.6192, "rewards/accuracies": 0.625, "rewards/chosen": 0.546492338180542, "rewards/margins": 0.18888181447982788, "rewards/rejected": 0.35761046409606934, "step": 1282 }, { "epoch": 0.19841484631741735, "grad_norm": 6.036837577819824, "learning_rate": 3.3067010309278354e-06, "logits/chosen": 12.089359283447266, "logits/rejected": 7.27589225769043, "logps/chosen": -411.790771484375, "logps/rejected": -327.0368957519531, "loss": 0.5481, "rewards/accuracies": 1.0, "rewards/chosen": 0.7499051094055176, "rewards/margins": 0.33648520708084106, "rewards/rejected": 0.4134199619293213, "step": 1283 }, { "epoch": 0.19856949545718153, "grad_norm": 5.0055766105651855, "learning_rate": 3.3092783505154643e-06, "logits/chosen": 3.3996565341949463, "logits/rejected": 12.273038864135742, "logps/chosen": -231.321533203125, "logps/rejected": -298.3727111816406, "loss": 0.5749, "rewards/accuracies": 0.75, "rewards/chosen": 0.6002375483512878, "rewards/margins": 0.28655147552490234, "rewards/rejected": 0.3136860728263855, "step": 1284 }, { "epoch": 0.19872414459694568, "grad_norm": 5.848355293273926, "learning_rate": 3.311855670103093e-06, "logits/chosen": 9.540555953979492, "logits/rejected": 1.333407998085022, "logps/chosen": -249.20144653320312, "logps/rejected": -209.50518798828125, "loss": 0.6305, "rewards/accuracies": 0.75, "rewards/chosen": 0.6329747438430786, "rewards/margins": 0.14627324044704437, "rewards/rejected": 0.4867015480995178, "step": 1285 }, { "epoch": 0.19887879373670983, "grad_norm": 7.092436790466309, "learning_rate": 3.3144329896907216e-06, "logits/chosen": 7.742453098297119, "logits/rejected": 13.437719345092773, "logps/chosen": -212.93783569335938, "logps/rejected": -277.6034851074219, "loss": 0.6316, "rewards/accuracies": 0.75, "rewards/chosen": 0.6184993982315063, "rewards/margins": 0.15867950022220612, "rewards/rejected": 0.45981988310813904, "step": 1286 }, { "epoch": 0.199033442876474, "grad_norm": 5.362861156463623, "learning_rate": 3.3170103092783505e-06, "logits/chosen": 6.933655738830566, "logits/rejected": 9.977997779846191, "logps/chosen": -257.5059814453125, "logps/rejected": -220.40133666992188, "loss": 0.6184, "rewards/accuracies": 0.5, "rewards/chosen": 0.723708987236023, "rewards/margins": 0.20602166652679443, "rewards/rejected": 0.5176873207092285, "step": 1287 }, { "epoch": 0.19918809201623816, "grad_norm": 4.728067874908447, "learning_rate": 3.3195876288659793e-06, "logits/chosen": 12.968427658081055, "logits/rejected": 5.33281946182251, "logps/chosen": -248.13919067382812, "logps/rejected": -142.6277618408203, "loss": 0.617, "rewards/accuracies": 0.5, "rewards/chosen": 0.6389952898025513, "rewards/margins": 0.19254299998283386, "rewards/rejected": 0.4464523196220398, "step": 1288 }, { "epoch": 0.1993427411560023, "grad_norm": 5.08624792098999, "learning_rate": 3.3221649484536082e-06, "logits/chosen": 6.731633186340332, "logits/rejected": 10.717260360717773, "logps/chosen": -183.9955291748047, "logps/rejected": -258.29541015625, "loss": 0.7536, "rewards/accuracies": 0.25, "rewards/chosen": 0.5029999613761902, "rewards/margins": -0.05758479982614517, "rewards/rejected": 0.5605847835540771, "step": 1289 }, { "epoch": 0.1994973902957665, "grad_norm": 4.8729567527771, "learning_rate": 3.324742268041237e-06, "logits/chosen": 9.901467323303223, "logits/rejected": 4.756739616394043, "logps/chosen": -338.7579650878906, "logps/rejected": -251.5039520263672, "loss": 0.553, "rewards/accuracies": 0.875, "rewards/chosen": 0.7929681539535522, "rewards/margins": 0.35089725255966187, "rewards/rejected": 0.44207093119621277, "step": 1290 }, { "epoch": 0.19965203943553064, "grad_norm": 4.968284606933594, "learning_rate": 3.327319587628866e-06, "logits/chosen": 15.702203750610352, "logits/rejected": 8.607516288757324, "logps/chosen": -228.546875, "logps/rejected": -177.3660125732422, "loss": 0.6942, "rewards/accuracies": 0.5, "rewards/chosen": 0.5267329812049866, "rewards/margins": 0.03915111720561981, "rewards/rejected": 0.48758184909820557, "step": 1291 }, { "epoch": 0.1998066885752948, "grad_norm": 3.717909097671509, "learning_rate": 3.329896907216495e-06, "logits/chosen": 12.51144027709961, "logits/rejected": 8.418695449829102, "logps/chosen": -137.5762176513672, "logps/rejected": -131.330322265625, "loss": 0.6422, "rewards/accuracies": 0.75, "rewards/chosen": 0.4770965576171875, "rewards/margins": 0.11830544471740723, "rewards/rejected": 0.3587911128997803, "step": 1292 }, { "epoch": 0.19996133771505897, "grad_norm": 4.210697174072266, "learning_rate": 3.3324742268041237e-06, "logits/chosen": 10.194799423217773, "logits/rejected": 2.809494972229004, "logps/chosen": -253.68234252929688, "logps/rejected": -188.58297729492188, "loss": 0.5157, "rewards/accuracies": 0.875, "rewards/chosen": 0.681464433670044, "rewards/margins": 0.41363441944122314, "rewards/rejected": 0.26783010363578796, "step": 1293 }, { "epoch": 0.20011598685482312, "grad_norm": 5.845391750335693, "learning_rate": 3.3350515463917526e-06, "logits/chosen": 12.989534378051758, "logits/rejected": 12.628778457641602, "logps/chosen": -300.50445556640625, "logps/rejected": -337.2967529296875, "loss": 0.7478, "rewards/accuracies": 0.25, "rewards/chosen": 0.6544358134269714, "rewards/margins": -0.1000395342707634, "rewards/rejected": 0.7544753551483154, "step": 1294 }, { "epoch": 0.20027063599458728, "grad_norm": 7.5394368171691895, "learning_rate": 3.3376288659793814e-06, "logits/chosen": 5.752139568328857, "logits/rejected": 7.404584884643555, "logps/chosen": -339.4193115234375, "logps/rejected": -278.47821044921875, "loss": 0.6552, "rewards/accuracies": 0.625, "rewards/chosen": 0.8480408787727356, "rewards/margins": 0.10558734089136124, "rewards/rejected": 0.7424535155296326, "step": 1295 }, { "epoch": 0.20042528513435143, "grad_norm": 5.130138874053955, "learning_rate": 3.3402061855670103e-06, "logits/chosen": 7.999868392944336, "logits/rejected": 4.941250801086426, "logps/chosen": -254.15121459960938, "logps/rejected": -243.2579345703125, "loss": 0.6662, "rewards/accuracies": 0.75, "rewards/chosen": 0.7615885734558105, "rewards/margins": 0.09718703478574753, "rewards/rejected": 0.6644015312194824, "step": 1296 }, { "epoch": 0.2005799342741156, "grad_norm": 5.25628662109375, "learning_rate": 3.342783505154639e-06, "logits/chosen": 4.310413360595703, "logits/rejected": 14.12138843536377, "logps/chosen": -165.95352172851562, "logps/rejected": -278.21661376953125, "loss": 0.8053, "rewards/accuracies": 0.5, "rewards/chosen": 0.3964526057243347, "rewards/margins": -0.184920534491539, "rewards/rejected": 0.5813732147216797, "step": 1297 }, { "epoch": 0.20073458341387976, "grad_norm": 6.754405975341797, "learning_rate": 3.345360824742268e-06, "logits/chosen": 3.988142490386963, "logits/rejected": 4.422004699707031, "logps/chosen": -258.9167175292969, "logps/rejected": -285.71490478515625, "loss": 0.6882, "rewards/accuracies": 0.625, "rewards/chosen": 0.4739401936531067, "rewards/margins": 0.06007649004459381, "rewards/rejected": 0.4138637185096741, "step": 1298 }, { "epoch": 0.2008892325536439, "grad_norm": 7.065251350402832, "learning_rate": 3.347938144329897e-06, "logits/chosen": 11.343421936035156, "logits/rejected": 3.5668082237243652, "logps/chosen": -391.7399597167969, "logps/rejected": -301.20721435546875, "loss": 0.5808, "rewards/accuracies": 0.875, "rewards/chosen": 0.9373821020126343, "rewards/margins": 0.2596184015274048, "rewards/rejected": 0.6777637004852295, "step": 1299 }, { "epoch": 0.2010438816934081, "grad_norm": 6.205452919006348, "learning_rate": 3.350515463917526e-06, "logits/chosen": 9.35049057006836, "logits/rejected": 7.737802505493164, "logps/chosen": -208.43252563476562, "logps/rejected": -178.4341583251953, "loss": 0.6672, "rewards/accuracies": 0.375, "rewards/chosen": 0.6666086316108704, "rewards/margins": 0.07263593375682831, "rewards/rejected": 0.5939726829528809, "step": 1300 }, { "epoch": 0.20119853083317224, "grad_norm": 6.771202087402344, "learning_rate": 3.3530927835051547e-06, "logits/chosen": 4.866030693054199, "logits/rejected": 7.370279312133789, "logps/chosen": -217.40887451171875, "logps/rejected": -311.69830322265625, "loss": 0.722, "rewards/accuracies": 0.625, "rewards/chosen": 0.5795423984527588, "rewards/margins": 0.01841604709625244, "rewards/rejected": 0.5611263513565063, "step": 1301 }, { "epoch": 0.2013531799729364, "grad_norm": 5.767834186553955, "learning_rate": 3.3556701030927835e-06, "logits/chosen": 12.459856986999512, "logits/rejected": 6.914124965667725, "logps/chosen": -316.07208251953125, "logps/rejected": -298.9940185546875, "loss": 0.6578, "rewards/accuracies": 0.625, "rewards/chosen": 0.7994516491889954, "rewards/margins": 0.17962822318077087, "rewards/rejected": 0.6198234558105469, "step": 1302 }, { "epoch": 0.20150782911270057, "grad_norm": 5.931360721588135, "learning_rate": 3.3582474226804124e-06, "logits/chosen": 11.712615966796875, "logits/rejected": 11.847522735595703, "logps/chosen": -362.90533447265625, "logps/rejected": -348.126708984375, "loss": 0.6551, "rewards/accuracies": 0.625, "rewards/chosen": 0.4957813322544098, "rewards/margins": 0.10481216013431549, "rewards/rejected": 0.3909691870212555, "step": 1303 }, { "epoch": 0.20166247825246472, "grad_norm": 5.806168079376221, "learning_rate": 3.3608247422680417e-06, "logits/chosen": 13.309556007385254, "logits/rejected": 8.737560272216797, "logps/chosen": -259.7633972167969, "logps/rejected": -284.0533447265625, "loss": 0.6523, "rewards/accuracies": 0.375, "rewards/chosen": 0.9872535467147827, "rewards/margins": 0.12479966133832932, "rewards/rejected": 0.8624539375305176, "step": 1304 }, { "epoch": 0.20181712739222887, "grad_norm": 4.399667263031006, "learning_rate": 3.3634020618556706e-06, "logits/chosen": 9.206022262573242, "logits/rejected": 9.837970733642578, "logps/chosen": -335.8686218261719, "logps/rejected": -273.716796875, "loss": 0.6568, "rewards/accuracies": 0.5, "rewards/chosen": 0.7067378759384155, "rewards/margins": 0.1392958164215088, "rewards/rejected": 0.5674420595169067, "step": 1305 }, { "epoch": 0.20197177653199305, "grad_norm": 5.653664588928223, "learning_rate": 3.3659793814432995e-06, "logits/chosen": 7.799934387207031, "logits/rejected": 8.471553802490234, "logps/chosen": -242.89889526367188, "logps/rejected": -230.40338134765625, "loss": 0.6197, "rewards/accuracies": 0.75, "rewards/chosen": 0.8361656069755554, "rewards/margins": 0.17100675404071808, "rewards/rejected": 0.6651588678359985, "step": 1306 }, { "epoch": 0.2021264256717572, "grad_norm": 7.008354663848877, "learning_rate": 3.3685567010309283e-06, "logits/chosen": 4.932295799255371, "logits/rejected": 2.5523951053619385, "logps/chosen": -341.0740661621094, "logps/rejected": -248.2135467529297, "loss": 0.7656, "rewards/accuracies": 0.375, "rewards/chosen": 0.6207624673843384, "rewards/margins": -0.10794610530138016, "rewards/rejected": 0.7287085652351379, "step": 1307 }, { "epoch": 0.20228107481152136, "grad_norm": 5.2920732498168945, "learning_rate": 3.371134020618557e-06, "logits/chosen": 14.190934181213379, "logits/rejected": 7.582371711730957, "logps/chosen": -409.98773193359375, "logps/rejected": -309.5728759765625, "loss": 0.6637, "rewards/accuracies": 0.5, "rewards/chosen": 0.8439762592315674, "rewards/margins": 0.21576416492462158, "rewards/rejected": 0.6282120943069458, "step": 1308 }, { "epoch": 0.20243572395128553, "grad_norm": 7.5023298263549805, "learning_rate": 3.373711340206186e-06, "logits/chosen": 5.844460487365723, "logits/rejected": 3.437920570373535, "logps/chosen": -261.376953125, "logps/rejected": -236.91403198242188, "loss": 0.7436, "rewards/accuracies": 0.625, "rewards/chosen": 0.4867055416107178, "rewards/margins": -0.08074112236499786, "rewards/rejected": 0.5674466490745544, "step": 1309 }, { "epoch": 0.20259037309104969, "grad_norm": 4.355799674987793, "learning_rate": 3.376288659793815e-06, "logits/chosen": 11.189939498901367, "logits/rejected": 5.724081039428711, "logps/chosen": -259.7492370605469, "logps/rejected": -170.89703369140625, "loss": 0.6478, "rewards/accuracies": 0.75, "rewards/chosen": 0.5804348587989807, "rewards/margins": 0.10685832798480988, "rewards/rejected": 0.47357654571533203, "step": 1310 }, { "epoch": 0.20274502223081384, "grad_norm": 4.544366836547852, "learning_rate": 3.378865979381444e-06, "logits/chosen": 8.01708698272705, "logits/rejected": 8.608769416809082, "logps/chosen": -258.0055847167969, "logps/rejected": -274.4293212890625, "loss": 0.6794, "rewards/accuracies": 0.25, "rewards/chosen": 0.49057745933532715, "rewards/margins": 0.07700784504413605, "rewards/rejected": 0.41356968879699707, "step": 1311 }, { "epoch": 0.202899671370578, "grad_norm": 6.106559753417969, "learning_rate": 3.3814432989690727e-06, "logits/chosen": 6.4856672286987305, "logits/rejected": 11.039149284362793, "logps/chosen": -169.0098876953125, "logps/rejected": -348.0252990722656, "loss": 0.7136, "rewards/accuracies": 0.625, "rewards/chosen": 0.5099309086799622, "rewards/margins": 0.0011292845010757446, "rewards/rejected": 0.5088015794754028, "step": 1312 }, { "epoch": 0.20305432051034217, "grad_norm": 5.431767463684082, "learning_rate": 3.3840206185567016e-06, "logits/chosen": 5.8516340255737305, "logits/rejected": 5.124634265899658, "logps/chosen": -259.8133850097656, "logps/rejected": -274.2174072265625, "loss": 0.6135, "rewards/accuracies": 0.75, "rewards/chosen": 0.782206118106842, "rewards/margins": 0.2607690095901489, "rewards/rejected": 0.5214371681213379, "step": 1313 }, { "epoch": 0.20320896965010632, "grad_norm": 6.771440029144287, "learning_rate": 3.3865979381443304e-06, "logits/chosen": 12.835247039794922, "logits/rejected": 6.702828884124756, "logps/chosen": -286.2394104003906, "logps/rejected": -162.53269958496094, "loss": 0.7807, "rewards/accuracies": 0.625, "rewards/chosen": 0.4118680953979492, "rewards/margins": -0.1132233515381813, "rewards/rejected": 0.5250914096832275, "step": 1314 }, { "epoch": 0.20336361878987047, "grad_norm": 5.3595805168151855, "learning_rate": 3.3891752577319593e-06, "logits/chosen": 14.908011436462402, "logits/rejected": 13.651371955871582, "logps/chosen": -303.52569580078125, "logps/rejected": -280.60687255859375, "loss": 0.6852, "rewards/accuracies": 0.5, "rewards/chosen": 0.5502185821533203, "rewards/margins": 0.04772016033530235, "rewards/rejected": 0.5024983882904053, "step": 1315 }, { "epoch": 0.20351826792963465, "grad_norm": 10.267784118652344, "learning_rate": 3.391752577319588e-06, "logits/chosen": 6.735694408416748, "logits/rejected": 8.959792137145996, "logps/chosen": -251.3719482421875, "logps/rejected": -246.74952697753906, "loss": 0.901, "rewards/accuracies": 0.125, "rewards/chosen": 0.31730565428733826, "rewards/margins": -0.3489902913570404, "rewards/rejected": 0.6662959456443787, "step": 1316 }, { "epoch": 0.2036729170693988, "grad_norm": 7.04838228225708, "learning_rate": 3.394329896907217e-06, "logits/chosen": 2.9848341941833496, "logits/rejected": 4.147202014923096, "logps/chosen": -267.9589538574219, "logps/rejected": -267.35107421875, "loss": 0.6618, "rewards/accuracies": 0.5, "rewards/chosen": 0.4270978569984436, "rewards/margins": 0.11878165602684021, "rewards/rejected": 0.308316171169281, "step": 1317 }, { "epoch": 0.20382756620916295, "grad_norm": 6.5067925453186035, "learning_rate": 3.396907216494846e-06, "logits/chosen": 9.304525375366211, "logits/rejected": 8.927728652954102, "logps/chosen": -520.3414916992188, "logps/rejected": -377.6937561035156, "loss": 0.61, "rewards/accuracies": 0.875, "rewards/chosen": 0.6300702691078186, "rewards/margins": 0.22060082852840424, "rewards/rejected": 0.4094694256782532, "step": 1318 }, { "epoch": 0.20398221534892713, "grad_norm": 5.2436442375183105, "learning_rate": 3.399484536082475e-06, "logits/chosen": 7.295438766479492, "logits/rejected": 4.511636257171631, "logps/chosen": -197.48475646972656, "logps/rejected": -201.05982971191406, "loss": 0.7343, "rewards/accuracies": 0.5, "rewards/chosen": 0.5716656446456909, "rewards/margins": -0.042453303933143616, "rewards/rejected": 0.6141189932823181, "step": 1319 }, { "epoch": 0.20413686448869128, "grad_norm": 7.030094146728516, "learning_rate": 3.4020618556701037e-06, "logits/chosen": 9.978487014770508, "logits/rejected": 10.432518005371094, "logps/chosen": -385.94537353515625, "logps/rejected": -363.43292236328125, "loss": 0.653, "rewards/accuracies": 0.625, "rewards/chosen": 0.5859411954879761, "rewards/margins": 0.10076151043176651, "rewards/rejected": 0.4851796627044678, "step": 1320 }, { "epoch": 0.20429151362845543, "grad_norm": 5.305937767028809, "learning_rate": 3.4046391752577325e-06, "logits/chosen": 14.626575469970703, "logits/rejected": 7.095052242279053, "logps/chosen": -364.23223876953125, "logps/rejected": -269.1084289550781, "loss": 0.5275, "rewards/accuracies": 0.875, "rewards/chosen": 0.684237539768219, "rewards/margins": 0.39492228627204895, "rewards/rejected": 0.28931522369384766, "step": 1321 }, { "epoch": 0.2044461627682196, "grad_norm": 4.810122013092041, "learning_rate": 3.4072164948453614e-06, "logits/chosen": 13.464654922485352, "logits/rejected": 11.766478538513184, "logps/chosen": -308.0333251953125, "logps/rejected": -298.7428283691406, "loss": 0.6219, "rewards/accuracies": 0.5, "rewards/chosen": 0.5712913274765015, "rewards/margins": 0.189250648021698, "rewards/rejected": 0.38204070925712585, "step": 1322 }, { "epoch": 0.20460081190798377, "grad_norm": 5.0442891120910645, "learning_rate": 3.4097938144329903e-06, "logits/chosen": 15.703763008117676, "logits/rejected": 8.28512954711914, "logps/chosen": -356.3396301269531, "logps/rejected": -245.67477416992188, "loss": 0.6153, "rewards/accuracies": 0.75, "rewards/chosen": 0.585950493812561, "rewards/margins": 0.1931637078523636, "rewards/rejected": 0.39278680086135864, "step": 1323 }, { "epoch": 0.20475546104774792, "grad_norm": 6.674186706542969, "learning_rate": 3.4123711340206187e-06, "logits/chosen": 10.403979301452637, "logits/rejected": 10.42829704284668, "logps/chosen": -263.7735595703125, "logps/rejected": -225.29261779785156, "loss": 0.5342, "rewards/accuracies": 0.875, "rewards/chosen": 0.6374138593673706, "rewards/margins": 0.41126564145088196, "rewards/rejected": 0.22614821791648865, "step": 1324 }, { "epoch": 0.2049101101875121, "grad_norm": 8.718478202819824, "learning_rate": 3.4149484536082476e-06, "logits/chosen": 5.965991973876953, "logits/rejected": 7.266295433044434, "logps/chosen": -371.6241149902344, "logps/rejected": -343.738525390625, "loss": 0.737, "rewards/accuracies": 0.375, "rewards/chosen": 0.5868619084358215, "rewards/margins": -0.06999720633029938, "rewards/rejected": 0.6568591594696045, "step": 1325 }, { "epoch": 0.20506475932727625, "grad_norm": 5.564294338226318, "learning_rate": 3.4175257731958765e-06, "logits/chosen": 7.479012489318848, "logits/rejected": 9.721044540405273, "logps/chosen": -169.46725463867188, "logps/rejected": -233.14649963378906, "loss": 0.7074, "rewards/accuracies": 0.5, "rewards/chosen": 0.2817467153072357, "rewards/margins": 0.0021076202392578125, "rewards/rejected": 0.2796390652656555, "step": 1326 }, { "epoch": 0.2052194084670404, "grad_norm": 5.872929096221924, "learning_rate": 3.4201030927835053e-06, "logits/chosen": 11.168549537658691, "logits/rejected": 11.150811195373535, "logps/chosen": -280.85418701171875, "logps/rejected": -228.03378295898438, "loss": 0.7117, "rewards/accuracies": 0.625, "rewards/chosen": 0.42225030064582825, "rewards/margins": -0.010264307260513306, "rewards/rejected": 0.43251463770866394, "step": 1327 }, { "epoch": 0.20537405760680455, "grad_norm": 5.55307149887085, "learning_rate": 3.4226804123711342e-06, "logits/chosen": 12.75794792175293, "logits/rejected": 9.967142105102539, "logps/chosen": -332.015380859375, "logps/rejected": -330.7392578125, "loss": 0.6744, "rewards/accuracies": 0.625, "rewards/chosen": 0.6682014465332031, "rewards/margins": 0.06450548768043518, "rewards/rejected": 0.6036959886550903, "step": 1328 }, { "epoch": 0.20552870674656873, "grad_norm": 6.932251930236816, "learning_rate": 3.425257731958763e-06, "logits/chosen": 12.506941795349121, "logits/rejected": 5.496970176696777, "logps/chosen": -540.5363159179688, "logps/rejected": -341.3646240234375, "loss": 0.703, "rewards/accuracies": 0.5, "rewards/chosen": 0.7784515619277954, "rewards/margins": -0.0071449726819992065, "rewards/rejected": 0.785596489906311, "step": 1329 }, { "epoch": 0.20568335588633288, "grad_norm": 6.123511791229248, "learning_rate": 3.427835051546392e-06, "logits/chosen": 14.895683288574219, "logits/rejected": 11.09814167022705, "logps/chosen": -384.80328369140625, "logps/rejected": -344.2386474609375, "loss": 0.6329, "rewards/accuracies": 0.5, "rewards/chosen": 0.4796812832355499, "rewards/margins": 0.17634254693984985, "rewards/rejected": 0.3033387362957001, "step": 1330 }, { "epoch": 0.20583800502609703, "grad_norm": 5.52587890625, "learning_rate": 3.430412371134021e-06, "logits/chosen": 7.867013931274414, "logits/rejected": 8.348973274230957, "logps/chosen": -235.122802734375, "logps/rejected": -251.46774291992188, "loss": 0.7049, "rewards/accuracies": 0.625, "rewards/chosen": 0.6932826042175293, "rewards/margins": 0.015125781297683716, "rewards/rejected": 0.6781567335128784, "step": 1331 }, { "epoch": 0.2059926541658612, "grad_norm": 4.428856372833252, "learning_rate": 3.4329896907216497e-06, "logits/chosen": 12.31286907196045, "logits/rejected": 10.521028518676758, "logps/chosen": -394.11236572265625, "logps/rejected": -353.8339538574219, "loss": 0.5127, "rewards/accuracies": 0.875, "rewards/chosen": 0.7203955054283142, "rewards/margins": 0.4415269196033478, "rewards/rejected": 0.27886858582496643, "step": 1332 }, { "epoch": 0.20614730330562536, "grad_norm": 5.749904632568359, "learning_rate": 3.4355670103092786e-06, "logits/chosen": 8.381030082702637, "logits/rejected": 3.0804190635681152, "logps/chosen": -355.7543640136719, "logps/rejected": -166.9155731201172, "loss": 0.6921, "rewards/accuracies": 0.5, "rewards/chosen": 0.378226101398468, "rewards/margins": 0.024712078273296356, "rewards/rejected": 0.35351401567459106, "step": 1333 }, { "epoch": 0.2063019524453895, "grad_norm": 6.69580602645874, "learning_rate": 3.4381443298969074e-06, "logits/chosen": 6.467792987823486, "logits/rejected": 6.528688907623291, "logps/chosen": -282.5873718261719, "logps/rejected": -274.0964050292969, "loss": 0.6595, "rewards/accuracies": 0.75, "rewards/chosen": 0.42681685090065, "rewards/margins": 0.10628613829612732, "rewards/rejected": 0.3205307126045227, "step": 1334 }, { "epoch": 0.2064566015851537, "grad_norm": 8.357756614685059, "learning_rate": 3.4407216494845363e-06, "logits/chosen": 7.871211051940918, "logits/rejected": 7.975522518157959, "logps/chosen": -134.2395782470703, "logps/rejected": -170.60951232910156, "loss": 0.8547, "rewards/accuracies": 0.75, "rewards/chosen": 0.2632666230201721, "rewards/margins": -0.14537882804870605, "rewards/rejected": 0.4086454212665558, "step": 1335 }, { "epoch": 0.20661125072491784, "grad_norm": 7.131282329559326, "learning_rate": 3.443298969072165e-06, "logits/chosen": 7.78883695602417, "logits/rejected": 8.601940155029297, "logps/chosen": -363.1313781738281, "logps/rejected": -415.93212890625, "loss": 0.7514, "rewards/accuracies": 0.5, "rewards/chosen": 0.4316656291484833, "rewards/margins": -0.039987191557884216, "rewards/rejected": 0.4716528356075287, "step": 1336 }, { "epoch": 0.206765899864682, "grad_norm": 17.950986862182617, "learning_rate": 3.445876288659794e-06, "logits/chosen": 10.061782836914062, "logits/rejected": 2.5823841094970703, "logps/chosen": -387.6070251464844, "logps/rejected": -216.02276611328125, "loss": 0.8459, "rewards/accuracies": 0.375, "rewards/chosen": 0.3303816020488739, "rewards/margins": -0.14370346069335938, "rewards/rejected": 0.4740850627422333, "step": 1337 }, { "epoch": 0.20692054900444617, "grad_norm": 9.825039863586426, "learning_rate": 3.448453608247423e-06, "logits/chosen": 11.339818000793457, "logits/rejected": 3.227383613586426, "logps/chosen": -448.6005554199219, "logps/rejected": -308.6451416015625, "loss": 0.7428, "rewards/accuracies": 0.5, "rewards/chosen": 0.49702179431915283, "rewards/margins": -0.05179423838853836, "rewards/rejected": 0.5488160252571106, "step": 1338 }, { "epoch": 0.20707519814421033, "grad_norm": 5.399560451507568, "learning_rate": 3.451030927835052e-06, "logits/chosen": 7.201487064361572, "logits/rejected": 8.292201042175293, "logps/chosen": -222.55514526367188, "logps/rejected": -231.8472137451172, "loss": 0.6329, "rewards/accuracies": 0.75, "rewards/chosen": 0.42050811648368835, "rewards/margins": 0.13930931687355042, "rewards/rejected": 0.28119879961013794, "step": 1339 }, { "epoch": 0.20722984728397448, "grad_norm": 5.372221946716309, "learning_rate": 3.4536082474226807e-06, "logits/chosen": 9.994074821472168, "logits/rejected": 2.6565442085266113, "logps/chosen": -508.5455627441406, "logps/rejected": -259.623291015625, "loss": 0.646, "rewards/accuracies": 0.625, "rewards/chosen": 0.46773701906204224, "rewards/margins": 0.11834974586963654, "rewards/rejected": 0.3493872880935669, "step": 1340 }, { "epoch": 0.20738449642373866, "grad_norm": 5.956606864929199, "learning_rate": 3.4561855670103095e-06, "logits/chosen": 14.832313537597656, "logits/rejected": 10.273258209228516, "logps/chosen": -332.6368103027344, "logps/rejected": -294.1409606933594, "loss": 0.7089, "rewards/accuracies": 0.5, "rewards/chosen": 0.40060776472091675, "rewards/margins": 0.005407616496086121, "rewards/rejected": 0.3952001929283142, "step": 1341 }, { "epoch": 0.2075391455635028, "grad_norm": 4.652444362640381, "learning_rate": 3.4587628865979384e-06, "logits/chosen": 9.282845497131348, "logits/rejected": 5.308042526245117, "logps/chosen": -177.93292236328125, "logps/rejected": -149.10462951660156, "loss": 0.7116, "rewards/accuracies": 0.375, "rewards/chosen": 0.6127756834030151, "rewards/margins": 0.0020549073815345764, "rewards/rejected": 0.6107207536697388, "step": 1342 }, { "epoch": 0.20769379470326696, "grad_norm": 7.147805213928223, "learning_rate": 3.4613402061855673e-06, "logits/chosen": 5.852838516235352, "logits/rejected": 8.764505386352539, "logps/chosen": -235.6710205078125, "logps/rejected": -307.25634765625, "loss": 0.6973, "rewards/accuracies": 0.625, "rewards/chosen": 0.4334206283092499, "rewards/margins": 0.016242746263742447, "rewards/rejected": 0.41717788577079773, "step": 1343 }, { "epoch": 0.2078484438430311, "grad_norm": 6.007249355316162, "learning_rate": 3.463917525773196e-06, "logits/chosen": 6.329930305480957, "logits/rejected": 6.019890308380127, "logps/chosen": -336.0419921875, "logps/rejected": -291.0050354003906, "loss": 0.673, "rewards/accuracies": 0.5, "rewards/chosen": 0.41354864835739136, "rewards/margins": 0.1292746663093567, "rewards/rejected": 0.28427398204803467, "step": 1344 }, { "epoch": 0.2080030929827953, "grad_norm": 3.8515822887420654, "learning_rate": 3.466494845360825e-06, "logits/chosen": 11.170055389404297, "logits/rejected": 6.0909929275512695, "logps/chosen": -184.79251098632812, "logps/rejected": -128.2896270751953, "loss": 0.6112, "rewards/accuracies": 0.75, "rewards/chosen": 0.5677140951156616, "rewards/margins": 0.19397355616092682, "rewards/rejected": 0.3737405240535736, "step": 1345 }, { "epoch": 0.20815774212255944, "grad_norm": 6.491765022277832, "learning_rate": 3.469072164948454e-06, "logits/chosen": 10.483287811279297, "logits/rejected": 3.665989398956299, "logps/chosen": -382.7060241699219, "logps/rejected": -187.4184112548828, "loss": 0.6964, "rewards/accuracies": 0.5, "rewards/chosen": 0.3527681529521942, "rewards/margins": 0.021226219832897186, "rewards/rejected": 0.33154192566871643, "step": 1346 }, { "epoch": 0.2083123912623236, "grad_norm": 4.384526252746582, "learning_rate": 3.4716494845360828e-06, "logits/chosen": 6.553007125854492, "logits/rejected": 3.0628886222839355, "logps/chosen": -240.4276123046875, "logps/rejected": -196.03707885742188, "loss": 0.5373, "rewards/accuracies": 1.0, "rewards/chosen": 0.6536540389060974, "rewards/margins": 0.36939775943756104, "rewards/rejected": 0.28425630927085876, "step": 1347 }, { "epoch": 0.20846704040208777, "grad_norm": 4.414042949676514, "learning_rate": 3.4742268041237117e-06, "logits/chosen": 12.046533584594727, "logits/rejected": 6.874114036560059, "logps/chosen": -192.94686889648438, "logps/rejected": -105.77291107177734, "loss": 0.6606, "rewards/accuracies": 0.625, "rewards/chosen": 0.4994664490222931, "rewards/margins": 0.07198914885520935, "rewards/rejected": 0.42747727036476135, "step": 1348 }, { "epoch": 0.20862168954185192, "grad_norm": 4.539097309112549, "learning_rate": 3.4768041237113405e-06, "logits/chosen": 5.563868999481201, "logits/rejected": 4.425388813018799, "logps/chosen": -207.09054565429688, "logps/rejected": -190.4141082763672, "loss": 0.6657, "rewards/accuracies": 0.5, "rewards/chosen": 0.5783560276031494, "rewards/margins": 0.07256454229354858, "rewards/rejected": 0.5057914853096008, "step": 1349 }, { "epoch": 0.20877633868161607, "grad_norm": 6.892353534698486, "learning_rate": 3.4793814432989694e-06, "logits/chosen": 8.358343124389648, "logits/rejected": 6.98340368270874, "logps/chosen": -257.2492980957031, "logps/rejected": -248.3079833984375, "loss": 0.6452, "rewards/accuracies": 0.5, "rewards/chosen": 0.43171054124832153, "rewards/margins": 0.14787976443767548, "rewards/rejected": 0.28383079171180725, "step": 1350 }, { "epoch": 0.20893098782138025, "grad_norm": 8.68060302734375, "learning_rate": 3.4819587628865983e-06, "logits/chosen": 11.185699462890625, "logits/rejected": 6.291648864746094, "logps/chosen": -488.0755615234375, "logps/rejected": -431.4562072753906, "loss": 0.6475, "rewards/accuracies": 0.625, "rewards/chosen": 0.71002197265625, "rewards/margins": 0.15664920210838318, "rewards/rejected": 0.5533727407455444, "step": 1351 }, { "epoch": 0.2090856369611444, "grad_norm": 4.891376495361328, "learning_rate": 3.4845360824742267e-06, "logits/chosen": 9.884033203125, "logits/rejected": 9.315380096435547, "logps/chosen": -254.7776641845703, "logps/rejected": -262.2553405761719, "loss": 0.5941, "rewards/accuracies": 0.75, "rewards/chosen": 0.5134794116020203, "rewards/margins": 0.23740099370479584, "rewards/rejected": 0.2760784327983856, "step": 1352 }, { "epoch": 0.20924028610090856, "grad_norm": 6.2578301429748535, "learning_rate": 3.4871134020618556e-06, "logits/chosen": 6.928276062011719, "logits/rejected": 3.1283023357391357, "logps/chosen": -270.92510986328125, "logps/rejected": -207.70321655273438, "loss": 0.7315, "rewards/accuracies": 0.375, "rewards/chosen": 0.45973482728004456, "rewards/margins": -0.043284885585308075, "rewards/rejected": 0.5030196905136108, "step": 1353 }, { "epoch": 0.20939493524067274, "grad_norm": 4.849658489227295, "learning_rate": 3.4896907216494845e-06, "logits/chosen": 11.711618423461914, "logits/rejected": 7.391726493835449, "logps/chosen": -290.62371826171875, "logps/rejected": -274.7575378417969, "loss": 0.6334, "rewards/accuracies": 0.75, "rewards/chosen": 0.29147475957870483, "rewards/margins": 0.17306284606456757, "rewards/rejected": 0.11841192096471786, "step": 1354 }, { "epoch": 0.2095495843804369, "grad_norm": 4.921021461486816, "learning_rate": 3.4922680412371133e-06, "logits/chosen": 11.806081771850586, "logits/rejected": 9.388275146484375, "logps/chosen": -246.70620727539062, "logps/rejected": -272.8097839355469, "loss": 0.5829, "rewards/accuracies": 1.0, "rewards/chosen": 0.629475474357605, "rewards/margins": 0.24401850998401642, "rewards/rejected": 0.38545700907707214, "step": 1355 }, { "epoch": 0.20970423352020104, "grad_norm": 8.295026779174805, "learning_rate": 3.494845360824742e-06, "logits/chosen": 4.895730495452881, "logits/rejected": 6.708144664764404, "logps/chosen": -280.4863586425781, "logps/rejected": -356.3783264160156, "loss": 0.6362, "rewards/accuracies": 0.625, "rewards/chosen": 0.7940717935562134, "rewards/margins": 0.15280523896217346, "rewards/rejected": 0.6412665843963623, "step": 1356 }, { "epoch": 0.2098588826599652, "grad_norm": 4.719005107879639, "learning_rate": 3.497422680412371e-06, "logits/chosen": 4.938675880432129, "logits/rejected": 6.426571369171143, "logps/chosen": -206.31689453125, "logps/rejected": -209.0506591796875, "loss": 0.6483, "rewards/accuracies": 0.5, "rewards/chosen": 0.43130356073379517, "rewards/margins": 0.10776080936193466, "rewards/rejected": 0.3235427737236023, "step": 1357 }, { "epoch": 0.21001353179972937, "grad_norm": 5.728810787200928, "learning_rate": 3.5e-06, "logits/chosen": 7.770626068115234, "logits/rejected": 9.35832405090332, "logps/chosen": -329.96270751953125, "logps/rejected": -399.1640625, "loss": 0.6299, "rewards/accuracies": 0.5, "rewards/chosen": 0.4822203516960144, "rewards/margins": 0.176669642329216, "rewards/rejected": 0.3055507242679596, "step": 1358 }, { "epoch": 0.21016818093949352, "grad_norm": 7.7981109619140625, "learning_rate": 3.502577319587629e-06, "logits/chosen": 6.6221466064453125, "logits/rejected": 9.354161262512207, "logps/chosen": -378.2397155761719, "logps/rejected": -354.189453125, "loss": 0.9013, "rewards/accuracies": 0.125, "rewards/chosen": 0.4871975779533386, "rewards/margins": -0.3183588981628418, "rewards/rejected": 0.8055565357208252, "step": 1359 }, { "epoch": 0.21032283007925767, "grad_norm": 4.4969024658203125, "learning_rate": 3.5051546391752577e-06, "logits/chosen": 10.313762664794922, "logits/rejected": 5.391558647155762, "logps/chosen": -311.0745544433594, "logps/rejected": -229.59690856933594, "loss": 0.5052, "rewards/accuracies": 1.0, "rewards/chosen": 0.6098133325576782, "rewards/margins": 0.4761411249637604, "rewards/rejected": 0.13367222249507904, "step": 1360 }, { "epoch": 0.21047747921902185, "grad_norm": 4.826947212219238, "learning_rate": 3.5077319587628866e-06, "logits/chosen": 11.115373611450195, "logits/rejected": 6.972024440765381, "logps/chosen": -221.2061004638672, "logps/rejected": -173.62017822265625, "loss": 0.6988, "rewards/accuracies": 0.25, "rewards/chosen": 0.3351612091064453, "rewards/margins": 0.009484857320785522, "rewards/rejected": 0.3256763517856598, "step": 1361 }, { "epoch": 0.210632128358786, "grad_norm": 5.5508809089660645, "learning_rate": 3.5103092783505154e-06, "logits/chosen": 10.264398574829102, "logits/rejected": 8.313765525817871, "logps/chosen": -345.8274841308594, "logps/rejected": -303.15081787109375, "loss": 0.6368, "rewards/accuracies": 0.625, "rewards/chosen": 0.4567398130893707, "rewards/margins": 0.15106360614299774, "rewards/rejected": 0.3056762218475342, "step": 1362 }, { "epoch": 0.21078677749855015, "grad_norm": 3.8625824451446533, "learning_rate": 3.5128865979381443e-06, "logits/chosen": 6.132232189178467, "logits/rejected": 8.672809600830078, "logps/chosen": -114.29231262207031, "logps/rejected": -125.7341537475586, "loss": 0.6784, "rewards/accuracies": 0.5, "rewards/chosen": 0.3751299977302551, "rewards/margins": 0.03930889442563057, "rewards/rejected": 0.33582109212875366, "step": 1363 }, { "epoch": 0.21094142663831433, "grad_norm": 42.03532791137695, "learning_rate": 3.515463917525773e-06, "logits/chosen": 9.723971366882324, "logits/rejected": 9.946759223937988, "logps/chosen": -252.67922973632812, "logps/rejected": -264.53125, "loss": 0.6371, "rewards/accuracies": 0.75, "rewards/chosen": 0.601596474647522, "rewards/margins": 0.12183618545532227, "rewards/rejected": 0.4797602593898773, "step": 1364 }, { "epoch": 0.21109607577807848, "grad_norm": 4.07283878326416, "learning_rate": 3.5180412371134025e-06, "logits/chosen": 13.018229484558105, "logits/rejected": 3.55332612991333, "logps/chosen": -213.94863891601562, "logps/rejected": -103.7365493774414, "loss": 0.5938, "rewards/accuracies": 0.875, "rewards/chosen": 0.43834927678108215, "rewards/margins": 0.2561228573322296, "rewards/rejected": 0.18222640454769135, "step": 1365 }, { "epoch": 0.21125072491784264, "grad_norm": 5.292239665985107, "learning_rate": 3.5206185567010313e-06, "logits/chosen": 10.229877471923828, "logits/rejected": 0.362331748008728, "logps/chosen": -386.6089172363281, "logps/rejected": -212.48472595214844, "loss": 0.6455, "rewards/accuracies": 0.375, "rewards/chosen": 0.539689302444458, "rewards/margins": 0.14695516228675842, "rewards/rejected": 0.3927341401576996, "step": 1366 }, { "epoch": 0.21140537405760682, "grad_norm": 5.216536998748779, "learning_rate": 3.5231958762886602e-06, "logits/chosen": 6.677067756652832, "logits/rejected": 4.888310432434082, "logps/chosen": -201.47525024414062, "logps/rejected": -202.88449096679688, "loss": 0.746, "rewards/accuracies": 0.5, "rewards/chosen": 0.35402822494506836, "rewards/margins": -0.07672037929296494, "rewards/rejected": 0.4307486116886139, "step": 1367 }, { "epoch": 0.21156002319737097, "grad_norm": 5.612256050109863, "learning_rate": 3.525773195876289e-06, "logits/chosen": 10.84018325805664, "logits/rejected": 5.113863945007324, "logps/chosen": -355.4080810546875, "logps/rejected": -215.42857360839844, "loss": 0.7804, "rewards/accuracies": 0.375, "rewards/chosen": 0.5365468859672546, "rewards/margins": -0.1097065880894661, "rewards/rejected": 0.6462534666061401, "step": 1368 }, { "epoch": 0.21171467233713512, "grad_norm": 26.769519805908203, "learning_rate": 3.528350515463918e-06, "logits/chosen": 8.440407752990723, "logits/rejected": 10.98375129699707, "logps/chosen": -406.82733154296875, "logps/rejected": -433.5931396484375, "loss": 0.6864, "rewards/accuracies": 0.75, "rewards/chosen": 0.5005760192871094, "rewards/margins": 0.028843581676483154, "rewards/rejected": 0.4717324376106262, "step": 1369 }, { "epoch": 0.2118693214768993, "grad_norm": 6.7686004638671875, "learning_rate": 3.530927835051547e-06, "logits/chosen": 4.020563125610352, "logits/rejected": 6.296971797943115, "logps/chosen": -223.6477508544922, "logps/rejected": -333.93701171875, "loss": 0.6866, "rewards/accuracies": 0.375, "rewards/chosen": 0.5287067890167236, "rewards/margins": 0.0202481746673584, "rewards/rejected": 0.5084586143493652, "step": 1370 }, { "epoch": 0.21202397061666345, "grad_norm": 17.801685333251953, "learning_rate": 3.5335051546391757e-06, "logits/chosen": 9.04471206665039, "logits/rejected": 4.763840198516846, "logps/chosen": -397.53448486328125, "logps/rejected": -329.04559326171875, "loss": 0.7461, "rewards/accuracies": 0.25, "rewards/chosen": 0.37218376994132996, "rewards/margins": -0.07964642345905304, "rewards/rejected": 0.4518301784992218, "step": 1371 }, { "epoch": 0.2121786197564276, "grad_norm": 4.7403669357299805, "learning_rate": 3.5360824742268046e-06, "logits/chosen": 10.435708045959473, "logits/rejected": 7.7940168380737305, "logps/chosen": -220.83447265625, "logps/rejected": -167.44602966308594, "loss": 0.628, "rewards/accuracies": 0.625, "rewards/chosen": 0.5339148044586182, "rewards/margins": 0.15386907756328583, "rewards/rejected": 0.38004571199417114, "step": 1372 }, { "epoch": 0.21233326889619175, "grad_norm": 6.455552577972412, "learning_rate": 3.5386597938144334e-06, "logits/chosen": 17.841386795043945, "logits/rejected": 10.11112117767334, "logps/chosen": -340.7460632324219, "logps/rejected": -238.43994140625, "loss": 0.6542, "rewards/accuracies": 0.625, "rewards/chosen": 0.42541542649269104, "rewards/margins": 0.08890493959188461, "rewards/rejected": 0.33651047945022583, "step": 1373 }, { "epoch": 0.21248791803595593, "grad_norm": 5.383336067199707, "learning_rate": 3.5412371134020623e-06, "logits/chosen": 6.971219062805176, "logits/rejected": 8.764307975769043, "logps/chosen": -170.48658752441406, "logps/rejected": -164.9232177734375, "loss": 0.6703, "rewards/accuracies": 0.625, "rewards/chosen": 0.3233797550201416, "rewards/margins": 0.053184788674116135, "rewards/rejected": 0.27019497752189636, "step": 1374 }, { "epoch": 0.21264256717572008, "grad_norm": 5.377937316894531, "learning_rate": 3.543814432989691e-06, "logits/chosen": 10.733101844787598, "logits/rejected": 3.9719767570495605, "logps/chosen": -411.1726989746094, "logps/rejected": -304.9864501953125, "loss": 0.5804, "rewards/accuracies": 0.625, "rewards/chosen": 0.5369677543640137, "rewards/margins": 0.31531763076782227, "rewards/rejected": 0.2216501235961914, "step": 1375 }, { "epoch": 0.21279721631548423, "grad_norm": 4.059070110321045, "learning_rate": 3.54639175257732e-06, "logits/chosen": 8.050950050354004, "logits/rejected": 9.078701972961426, "logps/chosen": -210.8147430419922, "logps/rejected": -284.099609375, "loss": 0.5366, "rewards/accuracies": 0.875, "rewards/chosen": 0.6488400101661682, "rewards/margins": 0.40098732709884644, "rewards/rejected": 0.24785271286964417, "step": 1376 }, { "epoch": 0.2129518654552484, "grad_norm": 4.732486248016357, "learning_rate": 3.548969072164949e-06, "logits/chosen": 9.83571720123291, "logits/rejected": 8.933853149414062, "logps/chosen": -200.92828369140625, "logps/rejected": -192.86416625976562, "loss": 0.7852, "rewards/accuracies": 0.125, "rewards/chosen": 0.41808220744132996, "rewards/margins": -0.1393115222454071, "rewards/rejected": 0.5573937296867371, "step": 1377 }, { "epoch": 0.21310651459501256, "grad_norm": 6.51684045791626, "learning_rate": 3.551546391752578e-06, "logits/chosen": 8.409055709838867, "logits/rejected": 5.54688024520874, "logps/chosen": -302.28753662109375, "logps/rejected": -228.40945434570312, "loss": 0.7138, "rewards/accuracies": 0.5, "rewards/chosen": 0.5649549961090088, "rewards/margins": -0.01456449180841446, "rewards/rejected": 0.579519510269165, "step": 1378 }, { "epoch": 0.21326116373477672, "grad_norm": 6.601457118988037, "learning_rate": 3.5541237113402067e-06, "logits/chosen": 3.236544609069824, "logits/rejected": 10.034658432006836, "logps/chosen": -251.71966552734375, "logps/rejected": -375.01556396484375, "loss": 0.782, "rewards/accuracies": 0.25, "rewards/chosen": 0.29124915599823, "rewards/margins": -0.10657037049531937, "rewards/rejected": 0.39781951904296875, "step": 1379 }, { "epoch": 0.2134158128745409, "grad_norm": 5.969194412231445, "learning_rate": 3.5567010309278356e-06, "logits/chosen": 14.562919616699219, "logits/rejected": 12.604349136352539, "logps/chosen": -420.05902099609375, "logps/rejected": -321.63702392578125, "loss": 0.67, "rewards/accuracies": 0.625, "rewards/chosen": 0.6438345909118652, "rewards/margins": 0.0644078180193901, "rewards/rejected": 0.5794267654418945, "step": 1380 }, { "epoch": 0.21357046201430505, "grad_norm": 7.169862270355225, "learning_rate": 3.5592783505154644e-06, "logits/chosen": 5.652187824249268, "logits/rejected": 10.172074317932129, "logps/chosen": -304.4931945800781, "logps/rejected": -318.00958251953125, "loss": 0.7506, "rewards/accuracies": 0.25, "rewards/chosen": 0.28241586685180664, "rewards/margins": -0.07487916946411133, "rewards/rejected": 0.35729503631591797, "step": 1381 }, { "epoch": 0.2137251111540692, "grad_norm": 4.332362174987793, "learning_rate": 3.5618556701030933e-06, "logits/chosen": 12.813555717468262, "logits/rejected": 9.093693733215332, "logps/chosen": -227.2550811767578, "logps/rejected": -211.45059204101562, "loss": 0.5874, "rewards/accuracies": 0.75, "rewards/chosen": 0.4965195655822754, "rewards/margins": 0.289533793926239, "rewards/rejected": 0.20698577165603638, "step": 1382 }, { "epoch": 0.21387976029383338, "grad_norm": 5.209338188171387, "learning_rate": 3.564432989690722e-06, "logits/chosen": 7.237485885620117, "logits/rejected": 15.047758102416992, "logps/chosen": -287.3205871582031, "logps/rejected": -337.389892578125, "loss": 0.6525, "rewards/accuracies": 0.625, "rewards/chosen": 0.463872492313385, "rewards/margins": 0.12451709806919098, "rewards/rejected": 0.3393554091453552, "step": 1383 }, { "epoch": 0.21403440943359753, "grad_norm": 5.4234490394592285, "learning_rate": 3.567010309278351e-06, "logits/chosen": 11.620508193969727, "logits/rejected": 4.772186279296875, "logps/chosen": -295.1038513183594, "logps/rejected": -208.3180694580078, "loss": 0.6859, "rewards/accuracies": 0.5, "rewards/chosen": 0.46672195196151733, "rewards/margins": 0.04274645820260048, "rewards/rejected": 0.42397546768188477, "step": 1384 }, { "epoch": 0.21418905857336168, "grad_norm": 5.287847995758057, "learning_rate": 3.56958762886598e-06, "logits/chosen": 8.704248428344727, "logits/rejected": 6.20478630065918, "logps/chosen": -248.81182861328125, "logps/rejected": -213.886474609375, "loss": 0.6694, "rewards/accuracies": 0.625, "rewards/chosen": 0.19882261753082275, "rewards/margins": 0.06858278065919876, "rewards/rejected": 0.1302398294210434, "step": 1385 }, { "epoch": 0.21434370771312586, "grad_norm": 5.387875080108643, "learning_rate": 3.5721649484536088e-06, "logits/chosen": 7.495245456695557, "logits/rejected": 5.227661609649658, "logps/chosen": -212.9635009765625, "logps/rejected": -222.36404418945312, "loss": 0.6765, "rewards/accuracies": 0.625, "rewards/chosen": 0.38130658864974976, "rewards/margins": 0.08010635524988174, "rewards/rejected": 0.301200270652771, "step": 1386 }, { "epoch": 0.21449835685289, "grad_norm": 5.341694355010986, "learning_rate": 3.5747422680412377e-06, "logits/chosen": 14.002464294433594, "logits/rejected": 3.797640800476074, "logps/chosen": -352.9326171875, "logps/rejected": -213.20823669433594, "loss": 0.5679, "rewards/accuracies": 0.875, "rewards/chosen": 0.548779308795929, "rewards/margins": 0.2892724871635437, "rewards/rejected": 0.25950682163238525, "step": 1387 }, { "epoch": 0.21465300599265416, "grad_norm": 4.783133029937744, "learning_rate": 3.5773195876288665e-06, "logits/chosen": 14.291772842407227, "logits/rejected": 6.613865375518799, "logps/chosen": -233.8211669921875, "logps/rejected": -121.76593017578125, "loss": 0.6629, "rewards/accuracies": 0.5, "rewards/chosen": 0.44137075543403625, "rewards/margins": 0.0829995647072792, "rewards/rejected": 0.35837119817733765, "step": 1388 }, { "epoch": 0.2148076551324183, "grad_norm": 4.695958137512207, "learning_rate": 3.5798969072164954e-06, "logits/chosen": 9.069969177246094, "logits/rejected": 12.631689071655273, "logps/chosen": -154.82574462890625, "logps/rejected": -233.6972198486328, "loss": 0.7544, "rewards/accuracies": 0.5, "rewards/chosen": 0.31659260392189026, "rewards/margins": -0.0945776104927063, "rewards/rejected": 0.41117021441459656, "step": 1389 }, { "epoch": 0.2149623042721825, "grad_norm": 17.922128677368164, "learning_rate": 3.582474226804124e-06, "logits/chosen": 11.884603500366211, "logits/rejected": 7.742044448852539, "logps/chosen": -313.76458740234375, "logps/rejected": -242.19577026367188, "loss": 0.5482, "rewards/accuracies": 0.625, "rewards/chosen": 0.7540775537490845, "rewards/margins": 0.4058167040348053, "rewards/rejected": 0.3482608199119568, "step": 1390 }, { "epoch": 0.21511695341194664, "grad_norm": 7.593837738037109, "learning_rate": 3.5850515463917527e-06, "logits/chosen": 11.985014915466309, "logits/rejected": 12.171064376831055, "logps/chosen": -424.50408935546875, "logps/rejected": -381.389404296875, "loss": 0.6374, "rewards/accuracies": 0.75, "rewards/chosen": 0.4164169430732727, "rewards/margins": 0.12984275817871094, "rewards/rejected": 0.28657418489456177, "step": 1391 }, { "epoch": 0.2152716025517108, "grad_norm": 4.750051021575928, "learning_rate": 3.5876288659793816e-06, "logits/chosen": 11.197408676147461, "logits/rejected": 8.01699447631836, "logps/chosen": -249.84201049804688, "logps/rejected": -241.58238220214844, "loss": 0.6087, "rewards/accuracies": 0.75, "rewards/chosen": 0.5981346368789673, "rewards/margins": 0.2080833911895752, "rewards/rejected": 0.3900512754917145, "step": 1392 }, { "epoch": 0.21542625169147497, "grad_norm": 6.1215314865112305, "learning_rate": 3.5902061855670105e-06, "logits/chosen": 7.208744049072266, "logits/rejected": 0.5497065782546997, "logps/chosen": -353.7894287109375, "logps/rejected": -192.8113555908203, "loss": 0.6358, "rewards/accuracies": 0.75, "rewards/chosen": 0.42695724964141846, "rewards/margins": 0.12696388363838196, "rewards/rejected": 0.2999933362007141, "step": 1393 }, { "epoch": 0.21558090083123913, "grad_norm": 5.062748908996582, "learning_rate": 3.5927835051546393e-06, "logits/chosen": 15.541715621948242, "logits/rejected": 12.296457290649414, "logps/chosen": -359.38653564453125, "logps/rejected": -321.9974670410156, "loss": 0.5524, "rewards/accuracies": 0.625, "rewards/chosen": 0.6806190609931946, "rewards/margins": 0.37431812286376953, "rewards/rejected": 0.30630093812942505, "step": 1394 }, { "epoch": 0.21573554997100328, "grad_norm": 5.828248023986816, "learning_rate": 3.595360824742268e-06, "logits/chosen": 14.44190788269043, "logits/rejected": 4.765242576599121, "logps/chosen": -308.0531311035156, "logps/rejected": -229.64662170410156, "loss": 0.6615, "rewards/accuracies": 0.75, "rewards/chosen": 0.32577210664749146, "rewards/margins": 0.09610196202993393, "rewards/rejected": 0.22967013716697693, "step": 1395 }, { "epoch": 0.21589019911076746, "grad_norm": 3.8994383811950684, "learning_rate": 3.597938144329897e-06, "logits/chosen": 9.809871673583984, "logits/rejected": 4.176308631896973, "logps/chosen": -268.6292724609375, "logps/rejected": -149.78109741210938, "loss": 0.5801, "rewards/accuracies": 0.75, "rewards/chosen": 0.5215524435043335, "rewards/margins": 0.2858830690383911, "rewards/rejected": 0.23566940426826477, "step": 1396 }, { "epoch": 0.2160448482505316, "grad_norm": 4.680583477020264, "learning_rate": 3.600515463917526e-06, "logits/chosen": 9.197677612304688, "logits/rejected": 8.19057559967041, "logps/chosen": -301.29156494140625, "logps/rejected": -311.92864990234375, "loss": 0.5896, "rewards/accuracies": 0.625, "rewards/chosen": 0.6327707767486572, "rewards/margins": 0.2566205859184265, "rewards/rejected": 0.37615013122558594, "step": 1397 }, { "epoch": 0.21619949739029576, "grad_norm": 5.29899263381958, "learning_rate": 3.603092783505155e-06, "logits/chosen": 9.07162094116211, "logits/rejected": 9.636396408081055, "logps/chosen": -226.06842041015625, "logps/rejected": -219.00634765625, "loss": 0.7619, "rewards/accuracies": 0.375, "rewards/chosen": 0.41803330183029175, "rewards/margins": -0.1127861887216568, "rewards/rejected": 0.5308195352554321, "step": 1398 }, { "epoch": 0.21635414653005994, "grad_norm": 4.626092910766602, "learning_rate": 3.6056701030927837e-06, "logits/chosen": 2.620670795440674, "logits/rejected": 2.706982135772705, "logps/chosen": -153.52186584472656, "logps/rejected": -179.35394287109375, "loss": 0.8015, "rewards/accuracies": 0.25, "rewards/chosen": 0.2736055254936218, "rewards/margins": -0.18276971578598022, "rewards/rejected": 0.45637527108192444, "step": 1399 }, { "epoch": 0.2165087956698241, "grad_norm": 6.05400276184082, "learning_rate": 3.6082474226804126e-06, "logits/chosen": 8.269233703613281, "logits/rejected": 4.95513916015625, "logps/chosen": -277.6695251464844, "logps/rejected": -196.7139892578125, "loss": 0.6571, "rewards/accuracies": 0.625, "rewards/chosen": 0.39224058389663696, "rewards/margins": 0.08993808180093765, "rewards/rejected": 0.3023025095462799, "step": 1400 }, { "epoch": 0.21666344480958824, "grad_norm": 4.07888126373291, "learning_rate": 3.6108247422680414e-06, "logits/chosen": 6.532243251800537, "logits/rejected": 6.481266021728516, "logps/chosen": -173.1339111328125, "logps/rejected": -186.14576721191406, "loss": 0.6752, "rewards/accuracies": 0.625, "rewards/chosen": 0.24564766883850098, "rewards/margins": 0.05224600434303284, "rewards/rejected": 0.19340167939662933, "step": 1401 }, { "epoch": 0.21681809394935242, "grad_norm": 4.770934104919434, "learning_rate": 3.6134020618556703e-06, "logits/chosen": 3.8904683589935303, "logits/rejected": 0.7848445177078247, "logps/chosen": -332.042724609375, "logps/rejected": -232.2508544921875, "loss": 0.678, "rewards/accuracies": 0.625, "rewards/chosen": 0.3198614716529846, "rewards/margins": 0.047458574175834656, "rewards/rejected": 0.27240291237831116, "step": 1402 }, { "epoch": 0.21697274308911657, "grad_norm": 5.7592034339904785, "learning_rate": 3.615979381443299e-06, "logits/chosen": 10.140690803527832, "logits/rejected": 14.099028587341309, "logps/chosen": -210.9591064453125, "logps/rejected": -241.9478302001953, "loss": 0.762, "rewards/accuracies": 0.25, "rewards/chosen": 0.3259265422821045, "rewards/margins": -0.05784143507480621, "rewards/rejected": 0.3837679922580719, "step": 1403 }, { "epoch": 0.21712739222888072, "grad_norm": 4.688066482543945, "learning_rate": 3.618556701030928e-06, "logits/chosen": 12.091397285461426, "logits/rejected": 11.99979305267334, "logps/chosen": -185.437744140625, "logps/rejected": -193.048095703125, "loss": 0.7284, "rewards/accuracies": 0.25, "rewards/chosen": 0.30165940523147583, "rewards/margins": -0.04808884486556053, "rewards/rejected": 0.3497482240200043, "step": 1404 }, { "epoch": 0.21728204136864487, "grad_norm": 4.5660858154296875, "learning_rate": 3.621134020618557e-06, "logits/chosen": 7.162897109985352, "logits/rejected": 5.639487266540527, "logps/chosen": -178.85079956054688, "logps/rejected": -166.9153289794922, "loss": 0.6449, "rewards/accuracies": 0.75, "rewards/chosen": 0.31257164478302, "rewards/margins": 0.1051870733499527, "rewards/rejected": 0.20738458633422852, "step": 1405 }, { "epoch": 0.21743669050840905, "grad_norm": 3.9991378784179688, "learning_rate": 3.623711340206186e-06, "logits/chosen": 8.645508766174316, "logits/rejected": 4.361780643463135, "logps/chosen": -186.33148193359375, "logps/rejected": -149.33509826660156, "loss": 0.58, "rewards/accuracies": 0.75, "rewards/chosen": 0.4362901449203491, "rewards/margins": 0.26957952976226807, "rewards/rejected": 0.16671063005924225, "step": 1406 }, { "epoch": 0.2175913396481732, "grad_norm": 4.87369966506958, "learning_rate": 3.6262886597938147e-06, "logits/chosen": 13.010942459106445, "logits/rejected": 9.216177940368652, "logps/chosen": -339.8858642578125, "logps/rejected": -251.97030639648438, "loss": 0.5785, "rewards/accuracies": 0.625, "rewards/chosen": 0.4198285937309265, "rewards/margins": 0.2869997024536133, "rewards/rejected": 0.13282892107963562, "step": 1407 }, { "epoch": 0.21774598878793736, "grad_norm": 5.114838123321533, "learning_rate": 3.6288659793814435e-06, "logits/chosen": 15.066329002380371, "logits/rejected": 13.697265625, "logps/chosen": -224.92408752441406, "logps/rejected": -230.80490112304688, "loss": 0.6566, "rewards/accuracies": 0.625, "rewards/chosen": 0.362186074256897, "rewards/margins": 0.07888708263635635, "rewards/rejected": 0.28329896926879883, "step": 1408 }, { "epoch": 0.21790063792770153, "grad_norm": 7.391239643096924, "learning_rate": 3.6314432989690724e-06, "logits/chosen": 8.528319358825684, "logits/rejected": 8.180903434753418, "logps/chosen": -283.19189453125, "logps/rejected": -282.02960205078125, "loss": 0.5061, "rewards/accuracies": 0.875, "rewards/chosen": 0.4431118369102478, "rewards/margins": 0.46919894218444824, "rewards/rejected": -0.026087090373039246, "step": 1409 }, { "epoch": 0.2180552870674657, "grad_norm": 5.6612043380737305, "learning_rate": 3.6340206185567013e-06, "logits/chosen": 13.293410301208496, "logits/rejected": 9.366764068603516, "logps/chosen": -275.1622314453125, "logps/rejected": -240.09393310546875, "loss": 0.6125, "rewards/accuracies": 0.625, "rewards/chosen": 0.3947445750236511, "rewards/margins": 0.2122238278388977, "rewards/rejected": 0.1825207769870758, "step": 1410 }, { "epoch": 0.21820993620722984, "grad_norm": 8.534502029418945, "learning_rate": 3.63659793814433e-06, "logits/chosen": 15.37127685546875, "logits/rejected": 17.36404037475586, "logps/chosen": -227.03945922851562, "logps/rejected": -260.7655334472656, "loss": 0.8373, "rewards/accuracies": 0.375, "rewards/chosen": 0.35013309121131897, "rewards/margins": -0.20924700796604156, "rewards/rejected": 0.5593801140785217, "step": 1411 }, { "epoch": 0.21836458534699402, "grad_norm": 5.936330795288086, "learning_rate": 3.639175257731959e-06, "logits/chosen": 6.021060943603516, "logits/rejected": 7.963349342346191, "logps/chosen": -233.08522033691406, "logps/rejected": -316.3594970703125, "loss": 0.5922, "rewards/accuracies": 0.75, "rewards/chosen": 0.41357719898223877, "rewards/margins": 0.2534785270690918, "rewards/rejected": 0.16009865701198578, "step": 1412 }, { "epoch": 0.21851923448675817, "grad_norm": 7.7762017250061035, "learning_rate": 3.641752577319588e-06, "logits/chosen": 6.270167350769043, "logits/rejected": 11.790924072265625, "logps/chosen": -188.77398681640625, "logps/rejected": -274.53271484375, "loss": 0.7211, "rewards/accuracies": 0.5, "rewards/chosen": 0.4920801520347595, "rewards/margins": 0.054075688123703, "rewards/rejected": 0.4380044639110565, "step": 1413 }, { "epoch": 0.21867388362652232, "grad_norm": 4.668298721313477, "learning_rate": 3.6443298969072168e-06, "logits/chosen": 11.339103698730469, "logits/rejected": 3.700221300125122, "logps/chosen": -292.1308288574219, "logps/rejected": -210.3017578125, "loss": 0.6201, "rewards/accuracies": 0.625, "rewards/chosen": 0.49662187695503235, "rewards/margins": 0.21353943645954132, "rewards/rejected": 0.28308239579200745, "step": 1414 }, { "epoch": 0.2188285327662865, "grad_norm": 5.152096271514893, "learning_rate": 3.6469072164948456e-06, "logits/chosen": 8.295866966247559, "logits/rejected": 9.64769172668457, "logps/chosen": -204.8546600341797, "logps/rejected": -184.64170837402344, "loss": 0.7069, "rewards/accuracies": 0.125, "rewards/chosen": 0.1396666020154953, "rewards/margins": -0.005854126065969467, "rewards/rejected": 0.14552073180675507, "step": 1415 }, { "epoch": 0.21898318190605065, "grad_norm": 6.844688415527344, "learning_rate": 3.6494845360824745e-06, "logits/chosen": 15.606342315673828, "logits/rejected": 12.615938186645508, "logps/chosen": -369.7417297363281, "logps/rejected": -294.941650390625, "loss": 0.7012, "rewards/accuracies": 0.625, "rewards/chosen": 0.2298278510570526, "rewards/margins": 0.002103324979543686, "rewards/rejected": 0.22772449254989624, "step": 1416 }, { "epoch": 0.2191378310458148, "grad_norm": 8.374110221862793, "learning_rate": 3.6520618556701034e-06, "logits/chosen": 3.5735723972320557, "logits/rejected": 5.957098007202148, "logps/chosen": -299.9869079589844, "logps/rejected": -386.76800537109375, "loss": 0.7991, "rewards/accuracies": 0.5, "rewards/chosen": 0.35375627875328064, "rewards/margins": -0.1731717586517334, "rewards/rejected": 0.5269280672073364, "step": 1417 }, { "epoch": 0.21929248018557898, "grad_norm": 6.793594837188721, "learning_rate": 3.654639175257732e-06, "logits/chosen": 3.817567825317383, "logits/rejected": 11.726768493652344, "logps/chosen": -182.70130920410156, "logps/rejected": -328.7852478027344, "loss": 0.7583, "rewards/accuracies": 0.5, "rewards/chosen": 0.4035118818283081, "rewards/margins": -0.04737842082977295, "rewards/rejected": 0.45089030265808105, "step": 1418 }, { "epoch": 0.21944712932534313, "grad_norm": 5.400929927825928, "learning_rate": 3.6572164948453607e-06, "logits/chosen": 10.78718376159668, "logits/rejected": 11.716590881347656, "logps/chosen": -312.64202880859375, "logps/rejected": -300.51251220703125, "loss": 0.5112, "rewards/accuracies": 0.875, "rewards/chosen": 0.638419508934021, "rewards/margins": 0.43650394678115845, "rewards/rejected": 0.20191554725170135, "step": 1419 }, { "epoch": 0.21960177846510728, "grad_norm": 4.759858131408691, "learning_rate": 3.6597938144329896e-06, "logits/chosen": 16.28638458251953, "logits/rejected": 9.044527053833008, "logps/chosen": -236.13827514648438, "logps/rejected": -170.5867462158203, "loss": 0.6857, "rewards/accuracies": 0.625, "rewards/chosen": 0.336395263671875, "rewards/margins": 0.03040466457605362, "rewards/rejected": 0.305990606546402, "step": 1420 }, { "epoch": 0.21975642760487143, "grad_norm": 5.24135684967041, "learning_rate": 3.6623711340206185e-06, "logits/chosen": 13.941750526428223, "logits/rejected": 5.963501930236816, "logps/chosen": -323.9901123046875, "logps/rejected": -213.09153747558594, "loss": 0.645, "rewards/accuracies": 0.625, "rewards/chosen": 0.29366081953048706, "rewards/margins": 0.11821715533733368, "rewards/rejected": 0.17544367909431458, "step": 1421 }, { "epoch": 0.21991107674463561, "grad_norm": 5.48724365234375, "learning_rate": 3.6649484536082473e-06, "logits/chosen": 11.891497611999512, "logits/rejected": 7.095841884613037, "logps/chosen": -316.9349060058594, "logps/rejected": -240.37310791015625, "loss": 0.6301, "rewards/accuracies": 0.5, "rewards/chosen": 0.4686710834503174, "rewards/margins": 0.16829286515712738, "rewards/rejected": 0.3003782331943512, "step": 1422 }, { "epoch": 0.22006572588439977, "grad_norm": 4.1777849197387695, "learning_rate": 3.667525773195876e-06, "logits/chosen": 10.771716117858887, "logits/rejected": 11.025160789489746, "logps/chosen": -175.10061645507812, "logps/rejected": -187.09262084960938, "loss": 0.6045, "rewards/accuracies": 0.75, "rewards/chosen": 0.12658023834228516, "rewards/margins": 0.2283097505569458, "rewards/rejected": -0.10172948986291885, "step": 1423 }, { "epoch": 0.22022037502416392, "grad_norm": 5.127035140991211, "learning_rate": 3.670103092783505e-06, "logits/chosen": 11.211700439453125, "logits/rejected": 15.031888961791992, "logps/chosen": -225.87051391601562, "logps/rejected": -338.299560546875, "loss": 0.6782, "rewards/accuracies": 0.625, "rewards/chosen": 0.04646292328834534, "rewards/margins": 0.07809567451477051, "rewards/rejected": -0.03163275122642517, "step": 1424 }, { "epoch": 0.2203750241639281, "grad_norm": 6.30739164352417, "learning_rate": 3.6726804123711348e-06, "logits/chosen": 14.393754005432129, "logits/rejected": 10.955628395080566, "logps/chosen": -375.6960754394531, "logps/rejected": -276.3924865722656, "loss": 0.6804, "rewards/accuracies": 0.5, "rewards/chosen": 0.20841285586357117, "rewards/margins": 0.06851330399513245, "rewards/rejected": 0.13989953696727753, "step": 1425 }, { "epoch": 0.22052967330369225, "grad_norm": 5.809107303619385, "learning_rate": 3.6752577319587637e-06, "logits/chosen": 12.49923038482666, "logits/rejected": 11.387870788574219, "logps/chosen": -314.86712646484375, "logps/rejected": -211.35494995117188, "loss": 0.7159, "rewards/accuracies": 0.375, "rewards/chosen": 0.2514320909976959, "rewards/margins": -0.03221062570810318, "rewards/rejected": 0.2836427390575409, "step": 1426 }, { "epoch": 0.2206843224434564, "grad_norm": 7.781691551208496, "learning_rate": 3.677835051546392e-06, "logits/chosen": 4.049658298492432, "logits/rejected": 2.064927816390991, "logps/chosen": -278.684326171875, "logps/rejected": -251.45425415039062, "loss": 0.6375, "rewards/accuracies": 0.75, "rewards/chosen": 0.15351057052612305, "rewards/margins": 0.21297302842140198, "rewards/rejected": -0.059462450444698334, "step": 1427 }, { "epoch": 0.22083897158322058, "grad_norm": 5.709812641143799, "learning_rate": 3.680412371134021e-06, "logits/chosen": 10.702022552490234, "logits/rejected": 12.592925071716309, "logps/chosen": -189.70355224609375, "logps/rejected": -218.53134155273438, "loss": 0.7302, "rewards/accuracies": 0.5, "rewards/chosen": 0.23434460163116455, "rewards/margins": -0.034587450325489044, "rewards/rejected": 0.268932044506073, "step": 1428 }, { "epoch": 0.22099362072298473, "grad_norm": 5.003627300262451, "learning_rate": 3.68298969072165e-06, "logits/chosen": 3.042603015899658, "logits/rejected": 4.243104457855225, "logps/chosen": -237.20458984375, "logps/rejected": -270.1956787109375, "loss": 0.6838, "rewards/accuracies": 0.625, "rewards/chosen": 0.23203736543655396, "rewards/margins": 0.0267547108232975, "rewards/rejected": 0.20528264343738556, "step": 1429 }, { "epoch": 0.22114826986274888, "grad_norm": 5.373926639556885, "learning_rate": 3.6855670103092787e-06, "logits/chosen": 12.566391944885254, "logits/rejected": 8.30225944519043, "logps/chosen": -256.839599609375, "logps/rejected": -220.32296752929688, "loss": 0.6062, "rewards/accuracies": 0.875, "rewards/chosen": 0.2936462163925171, "rewards/margins": 0.1967245191335678, "rewards/rejected": 0.09692173451185226, "step": 1430 }, { "epoch": 0.22130291900251306, "grad_norm": 5.836172580718994, "learning_rate": 3.6881443298969076e-06, "logits/chosen": 12.115289688110352, "logits/rejected": 5.989727020263672, "logps/chosen": -417.7890625, "logps/rejected": -242.09805297851562, "loss": 0.6706, "rewards/accuracies": 0.5, "rewards/chosen": 0.30823010206222534, "rewards/margins": 0.05545293912291527, "rewards/rejected": 0.2527771592140198, "step": 1431 }, { "epoch": 0.2214575681422772, "grad_norm": 6.998680591583252, "learning_rate": 3.6907216494845365e-06, "logits/chosen": 10.992440223693848, "logits/rejected": 7.924320697784424, "logps/chosen": -262.57159423828125, "logps/rejected": -178.3026885986328, "loss": 0.6565, "rewards/accuracies": 0.625, "rewards/chosen": 0.2293817102909088, "rewards/margins": 0.08067440986633301, "rewards/rejected": 0.1487073004245758, "step": 1432 }, { "epoch": 0.22161221728204136, "grad_norm": 4.474908351898193, "learning_rate": 3.6932989690721653e-06, "logits/chosen": 8.56814956665039, "logits/rejected": 6.0948615074157715, "logps/chosen": -236.75711059570312, "logps/rejected": -175.08424377441406, "loss": 0.7114, "rewards/accuracies": 0.125, "rewards/chosen": 0.3214804530143738, "rewards/margins": 0.00786089338362217, "rewards/rejected": 0.31361955404281616, "step": 1433 }, { "epoch": 0.22176686642180554, "grad_norm": 4.403102874755859, "learning_rate": 3.695876288659794e-06, "logits/chosen": 3.6690077781677246, "logits/rejected": 3.438993453979492, "logps/chosen": -182.67446899414062, "logps/rejected": -156.81800842285156, "loss": 0.7076, "rewards/accuracies": 0.625, "rewards/chosen": 0.13388250768184662, "rewards/margins": 0.002867031842470169, "rewards/rejected": 0.13101546466350555, "step": 1434 }, { "epoch": 0.2219215155615697, "grad_norm": 4.940154552459717, "learning_rate": 3.698453608247423e-06, "logits/chosen": 10.204668998718262, "logits/rejected": 8.290806770324707, "logps/chosen": -271.18048095703125, "logps/rejected": -234.708740234375, "loss": 0.5917, "rewards/accuracies": 0.625, "rewards/chosen": 0.47779157757759094, "rewards/margins": 0.29514938592910767, "rewards/rejected": 0.18264217674732208, "step": 1435 }, { "epoch": 0.22207616470133384, "grad_norm": 8.411388397216797, "learning_rate": 3.701030927835052e-06, "logits/chosen": 6.597256660461426, "logits/rejected": 8.360712051391602, "logps/chosen": -239.87237548828125, "logps/rejected": -249.18927001953125, "loss": 0.6604, "rewards/accuracies": 0.5, "rewards/chosen": 0.31735408306121826, "rewards/margins": 0.07976202666759491, "rewards/rejected": 0.23759204149246216, "step": 1436 }, { "epoch": 0.222230813841098, "grad_norm": 5.877204895019531, "learning_rate": 3.703608247422681e-06, "logits/chosen": 7.842079162597656, "logits/rejected": 7.477012634277344, "logps/chosen": -381.36895751953125, "logps/rejected": -303.085205078125, "loss": 0.6412, "rewards/accuracies": 0.75, "rewards/chosen": 0.5196260809898376, "rewards/margins": 0.12545932829380035, "rewards/rejected": 0.3941667675971985, "step": 1437 }, { "epoch": 0.22238546298086218, "grad_norm": 5.436801910400391, "learning_rate": 3.7061855670103097e-06, "logits/chosen": 9.974928855895996, "logits/rejected": 7.617944717407227, "logps/chosen": -273.0373229980469, "logps/rejected": -241.4461212158203, "loss": 0.5863, "rewards/accuracies": 0.625, "rewards/chosen": 0.3655317425727844, "rewards/margins": 0.3113168478012085, "rewards/rejected": 0.05421486496925354, "step": 1438 }, { "epoch": 0.22254011212062633, "grad_norm": 4.737017631530762, "learning_rate": 3.7087628865979386e-06, "logits/chosen": 7.763040065765381, "logits/rejected": 14.306317329406738, "logps/chosen": -206.13400268554688, "logps/rejected": -321.5020446777344, "loss": 0.6455, "rewards/accuracies": 0.625, "rewards/chosen": 0.11589355766773224, "rewards/margins": 0.14583788812160492, "rewards/rejected": -0.029944326728582382, "step": 1439 }, { "epoch": 0.22269476126039048, "grad_norm": 8.504049301147461, "learning_rate": 3.7113402061855674e-06, "logits/chosen": 11.967203140258789, "logits/rejected": 9.252222061157227, "logps/chosen": -363.0977783203125, "logps/rejected": -408.66375732421875, "loss": 0.732, "rewards/accuracies": 0.375, "rewards/chosen": 0.22312985360622406, "rewards/margins": -0.062021404504776, "rewards/rejected": 0.28515127301216125, "step": 1440 }, { "epoch": 0.22284941040015466, "grad_norm": 3.8094871044158936, "learning_rate": 3.7139175257731963e-06, "logits/chosen": 4.198802947998047, "logits/rejected": 8.757087707519531, "logps/chosen": -148.41961669921875, "logps/rejected": -214.957275390625, "loss": 0.7111, "rewards/accuracies": 0.375, "rewards/chosen": 0.24298730492591858, "rewards/margins": 0.02833227440714836, "rewards/rejected": 0.21465502679347992, "step": 1441 }, { "epoch": 0.2230040595399188, "grad_norm": 6.074606418609619, "learning_rate": 3.716494845360825e-06, "logits/chosen": 13.379493713378906, "logits/rejected": 13.085268020629883, "logps/chosen": -331.3122863769531, "logps/rejected": -305.8445739746094, "loss": 0.7226, "rewards/accuracies": 0.5, "rewards/chosen": 0.20761454105377197, "rewards/margins": -0.04384039342403412, "rewards/rejected": 0.2514549195766449, "step": 1442 }, { "epoch": 0.22315870867968296, "grad_norm": 5.571346282958984, "learning_rate": 3.719072164948454e-06, "logits/chosen": 12.065376281738281, "logits/rejected": 8.790617942810059, "logps/chosen": -226.50143432617188, "logps/rejected": -194.60081481933594, "loss": 0.7055, "rewards/accuracies": 0.625, "rewards/chosen": 0.06936287879943848, "rewards/margins": 0.01522497832775116, "rewards/rejected": 0.054137907922267914, "step": 1443 }, { "epoch": 0.22331335781944714, "grad_norm": 6.028081893920898, "learning_rate": 3.721649484536083e-06, "logits/chosen": 6.155329704284668, "logits/rejected": 10.264471054077148, "logps/chosen": -200.39776611328125, "logps/rejected": -286.0333251953125, "loss": 0.7078, "rewards/accuracies": 0.625, "rewards/chosen": 0.3994177579879761, "rewards/margins": -0.009107328951358795, "rewards/rejected": 0.4085250496864319, "step": 1444 }, { "epoch": 0.2234680069592113, "grad_norm": 4.44863224029541, "learning_rate": 3.724226804123712e-06, "logits/chosen": 10.437271118164062, "logits/rejected": 6.324764728546143, "logps/chosen": -210.79452514648438, "logps/rejected": -231.02670288085938, "loss": 0.6873, "rewards/accuracies": 0.5, "rewards/chosen": 0.20896272361278534, "rewards/margins": 0.026787996292114258, "rewards/rejected": 0.18217472732067108, "step": 1445 }, { "epoch": 0.22362265609897544, "grad_norm": 5.622677326202393, "learning_rate": 3.7268041237113407e-06, "logits/chosen": 9.252970695495605, "logits/rejected": 13.275067329406738, "logps/chosen": -173.14688110351562, "logps/rejected": -266.2110595703125, "loss": 0.6823, "rewards/accuracies": 0.375, "rewards/chosen": 0.22605973482131958, "rewards/margins": 0.06749209016561508, "rewards/rejected": 0.1585676372051239, "step": 1446 }, { "epoch": 0.22377730523873962, "grad_norm": 5.240253925323486, "learning_rate": 3.7293814432989695e-06, "logits/chosen": 11.51059627532959, "logits/rejected": 3.2109203338623047, "logps/chosen": -386.291259765625, "logps/rejected": -248.99964904785156, "loss": 0.6739, "rewards/accuracies": 0.5, "rewards/chosen": 0.34667378664016724, "rewards/margins": 0.09923753142356873, "rewards/rejected": 0.24743622541427612, "step": 1447 }, { "epoch": 0.22393195437850377, "grad_norm": 5.03073263168335, "learning_rate": 3.7319587628865984e-06, "logits/chosen": 13.226451873779297, "logits/rejected": 7.708680152893066, "logps/chosen": -265.1357421875, "logps/rejected": -203.337646484375, "loss": 0.6255, "rewards/accuracies": 0.625, "rewards/chosen": 0.2357672154903412, "rewards/margins": 0.16455897688865662, "rewards/rejected": 0.07120823860168457, "step": 1448 }, { "epoch": 0.22408660351826792, "grad_norm": 5.565544128417969, "learning_rate": 3.7345360824742273e-06, "logits/chosen": 9.389707565307617, "logits/rejected": 6.84724760055542, "logps/chosen": -293.00640869140625, "logps/rejected": -324.0738220214844, "loss": 0.5909, "rewards/accuracies": 0.875, "rewards/chosen": 0.5901398658752441, "rewards/margins": 0.2525695264339447, "rewards/rejected": 0.33757033944129944, "step": 1449 }, { "epoch": 0.2242412526580321, "grad_norm": 4.719089031219482, "learning_rate": 3.737113402061856e-06, "logits/chosen": 9.208586692810059, "logits/rejected": 5.907774925231934, "logps/chosen": -249.88706970214844, "logps/rejected": -190.32154846191406, "loss": 0.7511, "rewards/accuracies": 0.625, "rewards/chosen": 0.17082425951957703, "rewards/margins": -0.06860889494419098, "rewards/rejected": 0.239433154463768, "step": 1450 }, { "epoch": 0.22439590179779625, "grad_norm": 6.521057605743408, "learning_rate": 3.739690721649485e-06, "logits/chosen": 6.830074310302734, "logits/rejected": 11.48775863647461, "logps/chosen": -336.54241943359375, "logps/rejected": -379.4432678222656, "loss": 0.739, "rewards/accuracies": 0.5, "rewards/chosen": 0.07159577310085297, "rewards/margins": -0.02558770775794983, "rewards/rejected": 0.0971834659576416, "step": 1451 }, { "epoch": 0.2245505509375604, "grad_norm": 4.088323593139648, "learning_rate": 3.742268041237114e-06, "logits/chosen": 4.787644386291504, "logits/rejected": 7.796564102172852, "logps/chosen": -143.172119140625, "logps/rejected": -124.1253662109375, "loss": 0.7437, "rewards/accuracies": 0.25, "rewards/chosen": 0.3385204076766968, "rewards/margins": -0.07669384777545929, "rewards/rejected": 0.41521427035331726, "step": 1452 }, { "epoch": 0.22470520007732456, "grad_norm": 10.13058853149414, "learning_rate": 3.7448453608247428e-06, "logits/chosen": 7.847586154937744, "logits/rejected": 9.587234497070312, "logps/chosen": -302.4804382324219, "logps/rejected": -334.10821533203125, "loss": 0.6763, "rewards/accuracies": 0.5, "rewards/chosen": 0.20742465555667877, "rewards/margins": 0.05256490036845207, "rewards/rejected": 0.1548597514629364, "step": 1453 }, { "epoch": 0.22485984921708874, "grad_norm": 5.530125141143799, "learning_rate": 3.7474226804123716e-06, "logits/chosen": 10.282870292663574, "logits/rejected": 9.538536071777344, "logps/chosen": -282.2089538574219, "logps/rejected": -224.3106231689453, "loss": 0.6578, "rewards/accuracies": 0.5, "rewards/chosen": 0.19963115453720093, "rewards/margins": 0.1490197479724884, "rewards/rejected": 0.050611402839422226, "step": 1454 }, { "epoch": 0.2250144983568529, "grad_norm": 7.162295341491699, "learning_rate": 3.7500000000000005e-06, "logits/chosen": 8.318441390991211, "logits/rejected": 4.61562967300415, "logps/chosen": -332.745361328125, "logps/rejected": -363.1005859375, "loss": 0.6525, "rewards/accuracies": 0.625, "rewards/chosen": 0.3675723969936371, "rewards/margins": 0.15564587712287903, "rewards/rejected": 0.21192653477191925, "step": 1455 }, { "epoch": 0.22516914749661704, "grad_norm": 6.704213619232178, "learning_rate": 3.752577319587629e-06, "logits/chosen": 8.838983535766602, "logits/rejected": 10.01030158996582, "logps/chosen": -271.49639892578125, "logps/rejected": -283.4617614746094, "loss": 0.7733, "rewards/accuracies": 0.5, "rewards/chosen": 0.17520073056221008, "rewards/margins": -0.12454129755496979, "rewards/rejected": 0.29974204301834106, "step": 1456 }, { "epoch": 0.22532379663638122, "grad_norm": 5.502676010131836, "learning_rate": 3.755154639175258e-06, "logits/chosen": 10.801664352416992, "logits/rejected": 7.115739822387695, "logps/chosen": -193.4436798095703, "logps/rejected": -200.88377380371094, "loss": 0.6639, "rewards/accuracies": 0.5, "rewards/chosen": 0.15627413988113403, "rewards/margins": 0.08754958212375641, "rewards/rejected": 0.06872454285621643, "step": 1457 }, { "epoch": 0.22547844577614537, "grad_norm": 4.756232261657715, "learning_rate": 3.7577319587628867e-06, "logits/chosen": 5.954425811767578, "logits/rejected": 8.172614097595215, "logps/chosen": -216.77645874023438, "logps/rejected": -235.64361572265625, "loss": 0.6287, "rewards/accuracies": 0.625, "rewards/chosen": 0.28982970118522644, "rewards/margins": 0.17168134450912476, "rewards/rejected": 0.11814837902784348, "step": 1458 }, { "epoch": 0.22563309491590952, "grad_norm": 6.785560607910156, "learning_rate": 3.7603092783505156e-06, "logits/chosen": 7.969601631164551, "logits/rejected": 2.9462082386016846, "logps/chosen": -277.36767578125, "logps/rejected": -198.71920776367188, "loss": 0.6588, "rewards/accuracies": 0.5, "rewards/chosen": 0.33390480279922485, "rewards/margins": 0.08317037671804428, "rewards/rejected": 0.2507344186306, "step": 1459 }, { "epoch": 0.2257877440556737, "grad_norm": 6.045064926147461, "learning_rate": 3.7628865979381445e-06, "logits/chosen": 7.3320207595825195, "logits/rejected": 14.427106857299805, "logps/chosen": -232.15289306640625, "logps/rejected": -355.29205322265625, "loss": 0.6108, "rewards/accuracies": 0.625, "rewards/chosen": 0.25549107789993286, "rewards/margins": 0.2516169548034668, "rewards/rejected": 0.0038741156458854675, "step": 1460 }, { "epoch": 0.22594239319543785, "grad_norm": 6.161303520202637, "learning_rate": 3.7654639175257733e-06, "logits/chosen": 10.21287727355957, "logits/rejected": 9.032976150512695, "logps/chosen": -242.93142700195312, "logps/rejected": -217.16339111328125, "loss": 0.75, "rewards/accuracies": 0.375, "rewards/chosen": 0.0685734823346138, "rewards/margins": -0.027738407254219055, "rewards/rejected": 0.09631189703941345, "step": 1461 }, { "epoch": 0.226097042335202, "grad_norm": 4.703522205352783, "learning_rate": 3.768041237113402e-06, "logits/chosen": 9.26286792755127, "logits/rejected": 7.608859062194824, "logps/chosen": -172.74000549316406, "logps/rejected": -154.00189208984375, "loss": 0.7552, "rewards/accuracies": 0.25, "rewards/chosen": 0.17624440789222717, "rewards/margins": -0.10906385630369186, "rewards/rejected": 0.28530827164649963, "step": 1462 }, { "epoch": 0.22625169147496618, "grad_norm": 6.46061372756958, "learning_rate": 3.770618556701031e-06, "logits/chosen": 7.008559703826904, "logits/rejected": 6.716447830200195, "logps/chosen": -260.0907897949219, "logps/rejected": -231.19032287597656, "loss": 0.5768, "rewards/accuracies": 0.75, "rewards/chosen": 0.3687230944633484, "rewards/margins": 0.28190720081329346, "rewards/rejected": 0.08681588619947433, "step": 1463 }, { "epoch": 0.22640634061473033, "grad_norm": 6.741631507873535, "learning_rate": 3.77319587628866e-06, "logits/chosen": 12.439546585083008, "logits/rejected": 11.839517593383789, "logps/chosen": -370.86474609375, "logps/rejected": -357.56500244140625, "loss": 0.6303, "rewards/accuracies": 0.75, "rewards/chosen": 0.23264950513839722, "rewards/margins": 0.14716365933418274, "rewards/rejected": 0.08548584580421448, "step": 1464 }, { "epoch": 0.22656098975449449, "grad_norm": 5.3718695640563965, "learning_rate": 3.775773195876289e-06, "logits/chosen": 8.539482116699219, "logits/rejected": 7.709983825683594, "logps/chosen": -218.16976928710938, "logps/rejected": -207.93258666992188, "loss": 0.6249, "rewards/accuracies": 0.625, "rewards/chosen": 0.4782260060310364, "rewards/margins": 0.1768878996372223, "rewards/rejected": 0.3013381063938141, "step": 1465 }, { "epoch": 0.22671563889425866, "grad_norm": 3.3755064010620117, "learning_rate": 3.7783505154639177e-06, "logits/chosen": 11.33003044128418, "logits/rejected": 8.022276878356934, "logps/chosen": -228.516845703125, "logps/rejected": -172.59207153320312, "loss": 0.5431, "rewards/accuracies": 0.75, "rewards/chosen": 0.1908881813287735, "rewards/margins": 0.37673819065093994, "rewards/rejected": -0.18584999442100525, "step": 1466 }, { "epoch": 0.22687028803402282, "grad_norm": 8.281699180603027, "learning_rate": 3.7809278350515466e-06, "logits/chosen": 12.606637954711914, "logits/rejected": 7.0932230949401855, "logps/chosen": -289.2562255859375, "logps/rejected": -218.06861877441406, "loss": 0.8437, "rewards/accuracies": 0.5, "rewards/chosen": -0.2584618628025055, "rewards/margins": -0.2069428563117981, "rewards/rejected": -0.051519013941287994, "step": 1467 }, { "epoch": 0.22702493717378697, "grad_norm": 4.887508392333984, "learning_rate": 3.7835051546391754e-06, "logits/chosen": 13.424625396728516, "logits/rejected": 12.916078567504883, "logps/chosen": -259.698486328125, "logps/rejected": -217.0360107421875, "loss": 0.6681, "rewards/accuracies": 0.625, "rewards/chosen": 0.026994463056325912, "rewards/margins": 0.06224951893091202, "rewards/rejected": -0.035255055874586105, "step": 1468 }, { "epoch": 0.22717958631355112, "grad_norm": 5.405309677124023, "learning_rate": 3.7860824742268043e-06, "logits/chosen": 8.76015853881836, "logits/rejected": 6.77261209487915, "logps/chosen": -233.000732421875, "logps/rejected": -218.54171752929688, "loss": 0.6836, "rewards/accuracies": 0.375, "rewards/chosen": 0.23566150665283203, "rewards/margins": 0.040636539459228516, "rewards/rejected": 0.19502496719360352, "step": 1469 }, { "epoch": 0.2273342354533153, "grad_norm": 4.913394927978516, "learning_rate": 3.788659793814433e-06, "logits/chosen": 6.526886463165283, "logits/rejected": -3.7138564586639404, "logps/chosen": -435.60552978515625, "logps/rejected": -183.29588317871094, "loss": 0.4968, "rewards/accuracies": 0.875, "rewards/chosen": 0.3711024522781372, "rewards/margins": 0.5004416704177856, "rewards/rejected": -0.12933921813964844, "step": 1470 }, { "epoch": 0.22748888459307945, "grad_norm": 6.3503570556640625, "learning_rate": 3.791237113402062e-06, "logits/chosen": 10.012077331542969, "logits/rejected": 5.791125774383545, "logps/chosen": -344.0557556152344, "logps/rejected": -249.77151489257812, "loss": 0.5983, "rewards/accuracies": 0.75, "rewards/chosen": 0.285371869802475, "rewards/margins": 0.2382340282201767, "rewards/rejected": 0.04713783040642738, "step": 1471 }, { "epoch": 0.2276435337328436, "grad_norm": 5.755252361297607, "learning_rate": 3.793814432989691e-06, "logits/chosen": 9.312189102172852, "logits/rejected": 3.8381175994873047, "logps/chosen": -344.6442565917969, "logps/rejected": -243.95826721191406, "loss": 0.6072, "rewards/accuracies": 0.75, "rewards/chosen": 0.4146071672439575, "rewards/margins": 0.20414811372756958, "rewards/rejected": 0.21045905351638794, "step": 1472 }, { "epoch": 0.22779818287260778, "grad_norm": 6.811728000640869, "learning_rate": 3.79639175257732e-06, "logits/chosen": 6.243160724639893, "logits/rejected": 6.7085442543029785, "logps/chosen": -238.72018432617188, "logps/rejected": -319.30352783203125, "loss": 0.8162, "rewards/accuracies": 0.125, "rewards/chosen": 0.13786697387695312, "rewards/margins": -0.17182514071464539, "rewards/rejected": 0.3096920847892761, "step": 1473 }, { "epoch": 0.22795283201237193, "grad_norm": 5.297938346862793, "learning_rate": 3.7989690721649487e-06, "logits/chosen": 5.921926498413086, "logits/rejected": 9.10495662689209, "logps/chosen": -323.8555603027344, "logps/rejected": -271.42218017578125, "loss": 0.6342, "rewards/accuracies": 0.75, "rewards/chosen": 0.11444978415966034, "rewards/margins": 0.1501908302307129, "rewards/rejected": -0.03574104607105255, "step": 1474 }, { "epoch": 0.22810748115213608, "grad_norm": 5.245988845825195, "learning_rate": 3.8015463917525775e-06, "logits/chosen": 10.41042709350586, "logits/rejected": 2.9716429710388184, "logps/chosen": -257.4392395019531, "logps/rejected": -169.25694274902344, "loss": 0.7139, "rewards/accuracies": 0.5, "rewards/chosen": 0.11719532310962677, "rewards/margins": -0.026326656341552734, "rewards/rejected": 0.1435219645500183, "step": 1475 }, { "epoch": 0.22826213029190026, "grad_norm": 6.935062885284424, "learning_rate": 3.8041237113402064e-06, "logits/chosen": 6.816610336303711, "logits/rejected": 6.755310535430908, "logps/chosen": -374.6769714355469, "logps/rejected": -278.3578796386719, "loss": 0.6354, "rewards/accuracies": 0.375, "rewards/chosen": 0.2937660217285156, "rewards/margins": 0.15558524429798126, "rewards/rejected": 0.13818077743053436, "step": 1476 }, { "epoch": 0.2284167794316644, "grad_norm": 6.507308006286621, "learning_rate": 3.8067010309278353e-06, "logits/chosen": 8.174610137939453, "logits/rejected": 6.285118103027344, "logps/chosen": -449.8434753417969, "logps/rejected": -493.5425109863281, "loss": 0.6402, "rewards/accuracies": 0.875, "rewards/chosen": 0.34505289793014526, "rewards/margins": 0.13829079270362854, "rewards/rejected": 0.20676209032535553, "step": 1477 }, { "epoch": 0.22857142857142856, "grad_norm": 8.636043548583984, "learning_rate": 3.809278350515464e-06, "logits/chosen": 7.702323913574219, "logits/rejected": 10.521150588989258, "logps/chosen": -389.50396728515625, "logps/rejected": -573.4085693359375, "loss": 0.6624, "rewards/accuracies": 0.625, "rewards/chosen": 0.1923900544643402, "rewards/margins": 0.1506527066230774, "rewards/rejected": 0.04173736646771431, "step": 1478 }, { "epoch": 0.22872607771119274, "grad_norm": 7.029668807983398, "learning_rate": 3.811855670103093e-06, "logits/chosen": 9.470576286315918, "logits/rejected": 8.039572715759277, "logps/chosen": -410.92645263671875, "logps/rejected": -317.221435546875, "loss": 0.7133, "rewards/accuracies": 0.375, "rewards/chosen": 0.4764593243598938, "rewards/margins": -0.013340139761567116, "rewards/rejected": 0.48979946970939636, "step": 1479 }, { "epoch": 0.2288807268509569, "grad_norm": 6.377582550048828, "learning_rate": 3.814432989690722e-06, "logits/chosen": 11.972147941589355, "logits/rejected": 8.803773880004883, "logps/chosen": -282.05181884765625, "logps/rejected": -225.65359497070312, "loss": 0.719, "rewards/accuracies": 0.25, "rewards/chosen": 0.12003964930772781, "rewards/margins": -0.035159334540367126, "rewards/rejected": 0.15519899129867554, "step": 1480 }, { "epoch": 0.22903537599072105, "grad_norm": 5.947975158691406, "learning_rate": 3.81701030927835e-06, "logits/chosen": 9.577737808227539, "logits/rejected": 12.643302917480469, "logps/chosen": -175.74168395996094, "logps/rejected": -218.39828491210938, "loss": 0.786, "rewards/accuracies": 0.25, "rewards/chosen": 0.07232853770256042, "rewards/margins": -0.1332847774028778, "rewards/rejected": 0.20561333000659943, "step": 1481 }, { "epoch": 0.22919002513048523, "grad_norm": 7.358123302459717, "learning_rate": 3.81958762886598e-06, "logits/chosen": 5.825680255889893, "logits/rejected": 9.34192180633545, "logps/chosen": -247.2740020751953, "logps/rejected": -237.767333984375, "loss": 0.7652, "rewards/accuracies": 0.625, "rewards/chosen": 0.18771938979625702, "rewards/margins": -0.1045319065451622, "rewards/rejected": 0.2922512888908386, "step": 1482 }, { "epoch": 0.22934467427024938, "grad_norm": 4.354575157165527, "learning_rate": 3.822164948453608e-06, "logits/chosen": 9.432476043701172, "logits/rejected": 7.647014617919922, "logps/chosen": -209.530029296875, "logps/rejected": -231.0635528564453, "loss": 0.6061, "rewards/accuracies": 0.625, "rewards/chosen": 0.1467059701681137, "rewards/margins": 0.19666936993598938, "rewards/rejected": -0.04996339604258537, "step": 1483 }, { "epoch": 0.22949932341001353, "grad_norm": 6.312913417816162, "learning_rate": 3.824742268041237e-06, "logits/chosen": 13.776065826416016, "logits/rejected": 6.200751781463623, "logps/chosen": -339.8843078613281, "logps/rejected": -319.6251220703125, "loss": 0.6722, "rewards/accuracies": 0.625, "rewards/chosen": 0.008113861083984375, "rewards/margins": 0.09153690189123154, "rewards/rejected": -0.08342304080724716, "step": 1484 }, { "epoch": 0.22965397254977768, "grad_norm": 4.326000690460205, "learning_rate": 3.827319587628866e-06, "logits/chosen": 9.664262771606445, "logits/rejected": 10.277108192443848, "logps/chosen": -214.204833984375, "logps/rejected": -218.7750244140625, "loss": 0.6324, "rewards/accuracies": 0.625, "rewards/chosen": 0.2436470091342926, "rewards/margins": 0.14362019300460815, "rewards/rejected": 0.10002681612968445, "step": 1485 }, { "epoch": 0.22980862168954186, "grad_norm": 7.29660177230835, "learning_rate": 3.829896907216495e-06, "logits/chosen": 9.311269760131836, "logits/rejected": 5.835186958312988, "logps/chosen": -378.5521240234375, "logps/rejected": -303.630126953125, "loss": 0.6023, "rewards/accuracies": 0.75, "rewards/chosen": 0.44335824251174927, "rewards/margins": 0.24892520904541016, "rewards/rejected": 0.19443301856517792, "step": 1486 }, { "epoch": 0.229963270829306, "grad_norm": 5.4405341148376465, "learning_rate": 3.832474226804124e-06, "logits/chosen": 4.626574516296387, "logits/rejected": 11.722896575927734, "logps/chosen": -240.33250427246094, "logps/rejected": -279.8541259765625, "loss": 0.6903, "rewards/accuracies": 0.5, "rewards/chosen": 0.21309161186218262, "rewards/margins": 0.041017621755599976, "rewards/rejected": 0.17207399010658264, "step": 1487 }, { "epoch": 0.23011791996907016, "grad_norm": 4.598001480102539, "learning_rate": 3.835051546391753e-06, "logits/chosen": 6.937432765960693, "logits/rejected": 10.293776512145996, "logps/chosen": -172.78076171875, "logps/rejected": -203.2093048095703, "loss": 0.6964, "rewards/accuracies": 0.5, "rewards/chosen": 0.09580355137586594, "rewards/margins": 0.031737860292196274, "rewards/rejected": 0.06406569480895996, "step": 1488 }, { "epoch": 0.23027256910883434, "grad_norm": 5.912476062774658, "learning_rate": 3.837628865979382e-06, "logits/chosen": 15.034684181213379, "logits/rejected": 14.747718811035156, "logps/chosen": -296.0722961425781, "logps/rejected": -346.833251953125, "loss": 0.7026, "rewards/accuracies": 0.375, "rewards/chosen": 0.2537423372268677, "rewards/margins": 0.13809485733509064, "rewards/rejected": 0.11564750969409943, "step": 1489 }, { "epoch": 0.2304272182485985, "grad_norm": 5.391479969024658, "learning_rate": 3.840206185567011e-06, "logits/chosen": 12.513710021972656, "logits/rejected": 8.448549270629883, "logps/chosen": -347.7033996582031, "logps/rejected": -295.8345947265625, "loss": 0.59, "rewards/accuracies": 0.75, "rewards/chosen": 0.19974061846733093, "rewards/margins": 0.23782959580421448, "rewards/rejected": -0.03808898478746414, "step": 1490 }, { "epoch": 0.23058186738836264, "grad_norm": 6.188045501708984, "learning_rate": 3.84278350515464e-06, "logits/chosen": 13.082235336303711, "logits/rejected": 6.8130106925964355, "logps/chosen": -305.37542724609375, "logps/rejected": -244.87841796875, "loss": 0.7601, "rewards/accuracies": 0.375, "rewards/chosen": 0.03320226073265076, "rewards/margins": -0.10829858481884003, "rewards/rejected": 0.14150084555149078, "step": 1491 }, { "epoch": 0.23073651652812682, "grad_norm": 4.750273704528809, "learning_rate": 3.845360824742268e-06, "logits/chosen": 11.077557563781738, "logits/rejected": 11.949806213378906, "logps/chosen": -176.37054443359375, "logps/rejected": -263.3591613769531, "loss": 0.7083, "rewards/accuracies": 0.625, "rewards/chosen": 0.2062620222568512, "rewards/margins": 0.019134104251861572, "rewards/rejected": 0.18712788820266724, "step": 1492 }, { "epoch": 0.23089116566789097, "grad_norm": 5.911018371582031, "learning_rate": 3.847938144329898e-06, "logits/chosen": 9.231491088867188, "logits/rejected": 6.493816375732422, "logps/chosen": -251.95294189453125, "logps/rejected": -196.33901977539062, "loss": 0.6706, "rewards/accuracies": 0.625, "rewards/chosen": -0.05813436210155487, "rewards/margins": 0.06341734528541565, "rewards/rejected": -0.12155171483755112, "step": 1493 }, { "epoch": 0.23104581480765513, "grad_norm": 7.735345363616943, "learning_rate": 3.850515463917526e-06, "logits/chosen": 10.80581283569336, "logits/rejected": 6.760698318481445, "logps/chosen": -273.9007263183594, "logps/rejected": -231.0111083984375, "loss": 0.6673, "rewards/accuracies": 0.625, "rewards/chosen": 0.3082673251628876, "rewards/margins": 0.09262268245220184, "rewards/rejected": 0.21564464271068573, "step": 1494 }, { "epoch": 0.2312004639474193, "grad_norm": 7.347454071044922, "learning_rate": 3.853092783505155e-06, "logits/chosen": 11.645853042602539, "logits/rejected": 4.602846145629883, "logps/chosen": -281.2930603027344, "logps/rejected": -199.41683959960938, "loss": 0.6123, "rewards/accuracies": 0.75, "rewards/chosen": 0.3027874231338501, "rewards/margins": 0.22355104982852936, "rewards/rejected": 0.07923637330532074, "step": 1495 }, { "epoch": 0.23135511308718346, "grad_norm": 6.737334251403809, "learning_rate": 3.855670103092784e-06, "logits/chosen": 7.279929161071777, "logits/rejected": 9.469220161437988, "logps/chosen": -248.23397827148438, "logps/rejected": -327.2369079589844, "loss": 0.7361, "rewards/accuracies": 0.375, "rewards/chosen": -0.16058771312236786, "rewards/margins": -0.058486759662628174, "rewards/rejected": -0.10210093855857849, "step": 1496 }, { "epoch": 0.2315097622269476, "grad_norm": 5.270401954650879, "learning_rate": 3.858247422680413e-06, "logits/chosen": 10.151684761047363, "logits/rejected": 9.549076080322266, "logps/chosen": -310.0138244628906, "logps/rejected": -265.59857177734375, "loss": 0.6461, "rewards/accuracies": 0.5, "rewards/chosen": 0.3578866422176361, "rewards/margins": 0.1330842673778534, "rewards/rejected": 0.2248024046421051, "step": 1497 }, { "epoch": 0.23166441136671176, "grad_norm": 8.023092269897461, "learning_rate": 3.860824742268042e-06, "logits/chosen": 12.492981910705566, "logits/rejected": 5.868310928344727, "logps/chosen": -334.49578857421875, "logps/rejected": -284.79058837890625, "loss": 0.7028, "rewards/accuracies": 0.625, "rewards/chosen": 0.5174224376678467, "rewards/margins": -0.006380315870046616, "rewards/rejected": 0.5238027572631836, "step": 1498 }, { "epoch": 0.23181906050647594, "grad_norm": 6.564235687255859, "learning_rate": 3.863402061855671e-06, "logits/chosen": 9.212175369262695, "logits/rejected": 8.734397888183594, "logps/chosen": -211.65870666503906, "logps/rejected": -230.56903076171875, "loss": 0.802, "rewards/accuracies": 0.25, "rewards/chosen": 0.11530418694019318, "rewards/margins": -0.17269396781921387, "rewards/rejected": 0.28799813985824585, "step": 1499 }, { "epoch": 0.2319737096462401, "grad_norm": 6.599989891052246, "learning_rate": 3.865979381443299e-06, "logits/chosen": 9.26891803741455, "logits/rejected": 9.027416229248047, "logps/chosen": -320.5238037109375, "logps/rejected": -285.834716796875, "loss": 0.7892, "rewards/accuracies": 0.5, "rewards/chosen": 0.17476335167884827, "rewards/margins": -0.09738259762525558, "rewards/rejected": 0.27214595675468445, "step": 1500 }, { "epoch": 0.23212835878600424, "grad_norm": 6.970137596130371, "learning_rate": 3.868556701030929e-06, "logits/chosen": 3.8541297912597656, "logits/rejected": 11.664453506469727, "logps/chosen": -336.1994323730469, "logps/rejected": -305.06134033203125, "loss": 0.7175, "rewards/accuracies": 0.5, "rewards/chosen": 0.2546382248401642, "rewards/margins": 0.001676023006439209, "rewards/rejected": 0.252962201833725, "step": 1501 }, { "epoch": 0.23228300792576842, "grad_norm": 7.253159523010254, "learning_rate": 3.871134020618557e-06, "logits/chosen": 9.755655288696289, "logits/rejected": 10.065564155578613, "logps/chosen": -256.7037048339844, "logps/rejected": -261.17730712890625, "loss": 0.8747, "rewards/accuracies": 0.25, "rewards/chosen": 0.03782206028699875, "rewards/margins": -0.2745472192764282, "rewards/rejected": 0.31236928701400757, "step": 1502 }, { "epoch": 0.23243765706553257, "grad_norm": 5.231308460235596, "learning_rate": 3.873711340206186e-06, "logits/chosen": 12.124849319458008, "logits/rejected": 8.130109786987305, "logps/chosen": -254.45901489257812, "logps/rejected": -222.69906616210938, "loss": 0.5712, "rewards/accuracies": 0.625, "rewards/chosen": 0.4752688705921173, "rewards/margins": 0.31232118606567383, "rewards/rejected": 0.1629476547241211, "step": 1503 }, { "epoch": 0.23259230620529672, "grad_norm": 9.625741958618164, "learning_rate": 3.876288659793815e-06, "logits/chosen": 9.822915077209473, "logits/rejected": 10.296433448791504, "logps/chosen": -315.6681823730469, "logps/rejected": -281.40643310546875, "loss": 0.8795, "rewards/accuracies": 0.375, "rewards/chosen": -0.02320127747952938, "rewards/margins": -0.3095248341560364, "rewards/rejected": 0.28632354736328125, "step": 1504 }, { "epoch": 0.2327469553450609, "grad_norm": 5.206031322479248, "learning_rate": 3.878865979381444e-06, "logits/chosen": 11.777912139892578, "logits/rejected": 0.42134547233581543, "logps/chosen": -331.1743469238281, "logps/rejected": -156.06776428222656, "loss": 0.5898, "rewards/accuracies": 0.625, "rewards/chosen": 0.2575674057006836, "rewards/margins": 0.2603178024291992, "rewards/rejected": -0.002750396728515625, "step": 1505 }, { "epoch": 0.23290160448482505, "grad_norm": 6.149837017059326, "learning_rate": 3.8814432989690726e-06, "logits/chosen": 4.658173561096191, "logits/rejected": 8.412554740905762, "logps/chosen": -245.06982421875, "logps/rejected": -283.7773132324219, "loss": 0.7421, "rewards/accuracies": 0.5, "rewards/chosen": 0.140394926071167, "rewards/margins": -0.061921752989292145, "rewards/rejected": 0.20231667160987854, "step": 1506 }, { "epoch": 0.2330562536245892, "grad_norm": 4.144974708557129, "learning_rate": 3.884020618556701e-06, "logits/chosen": 12.493021011352539, "logits/rejected": 9.275261878967285, "logps/chosen": -278.2568664550781, "logps/rejected": -251.72659301757812, "loss": 0.569, "rewards/accuracies": 0.75, "rewards/chosen": 0.47479915618896484, "rewards/margins": 0.30153805017471313, "rewards/rejected": 0.1732611209154129, "step": 1507 }, { "epoch": 0.23321090276435338, "grad_norm": 4.9788594245910645, "learning_rate": 3.88659793814433e-06, "logits/chosen": 12.702082633972168, "logits/rejected": 10.610586166381836, "logps/chosen": -335.4980163574219, "logps/rejected": -266.1140441894531, "loss": 0.5811, "rewards/accuracies": 0.75, "rewards/chosen": 0.38709479570388794, "rewards/margins": 0.2924399673938751, "rewards/rejected": 0.0946548581123352, "step": 1508 }, { "epoch": 0.23336555190411754, "grad_norm": 7.7753987312316895, "learning_rate": 3.889175257731959e-06, "logits/chosen": 9.48106861114502, "logits/rejected": 5.862563133239746, "logps/chosen": -365.45733642578125, "logps/rejected": -320.953125, "loss": 0.8501, "rewards/accuracies": 0.5, "rewards/chosen": 0.253966361284256, "rewards/margins": -0.19125162065029144, "rewards/rejected": 0.4452179968357086, "step": 1509 }, { "epoch": 0.2335202010438817, "grad_norm": 4.381181240081787, "learning_rate": 3.891752577319588e-06, "logits/chosen": 8.655179023742676, "logits/rejected": 0.4067535400390625, "logps/chosen": -216.61383056640625, "logps/rejected": -192.57862854003906, "loss": 0.5729, "rewards/accuracies": 0.75, "rewards/chosen": 0.5711411833763123, "rewards/margins": 0.3399847447872162, "rewards/rejected": 0.23115646839141846, "step": 1510 }, { "epoch": 0.23367485018364587, "grad_norm": 12.981870651245117, "learning_rate": 3.8943298969072165e-06, "logits/chosen": 8.160552978515625, "logits/rejected": 8.916869163513184, "logps/chosen": -225.50807189941406, "logps/rejected": -312.6629943847656, "loss": 0.6959, "rewards/accuracies": 0.625, "rewards/chosen": 0.30918991565704346, "rewards/margins": 0.038649603724479675, "rewards/rejected": 0.2705402970314026, "step": 1511 }, { "epoch": 0.23382949932341002, "grad_norm": 6.409939765930176, "learning_rate": 3.896907216494846e-06, "logits/chosen": 8.442054748535156, "logits/rejected": 6.3834228515625, "logps/chosen": -229.24179077148438, "logps/rejected": -286.79840087890625, "loss": 0.7896, "rewards/accuracies": 0.5, "rewards/chosen": -0.1021028533577919, "rewards/margins": -0.11848890781402588, "rewards/rejected": 0.01638607680797577, "step": 1512 }, { "epoch": 0.23398414846317417, "grad_norm": 4.915167808532715, "learning_rate": 3.899484536082474e-06, "logits/chosen": 7.4133172035217285, "logits/rejected": 3.7280473709106445, "logps/chosen": -211.74615478515625, "logps/rejected": -198.40516662597656, "loss": 0.6095, "rewards/accuracies": 0.625, "rewards/chosen": 0.18608593940734863, "rewards/margins": 0.2144806981086731, "rewards/rejected": -0.02839474007487297, "step": 1513 }, { "epoch": 0.23413879760293832, "grad_norm": 5.191720485687256, "learning_rate": 3.9020618556701035e-06, "logits/chosen": 15.940607070922852, "logits/rejected": 6.936572074890137, "logps/chosen": -336.0983581542969, "logps/rejected": -260.76580810546875, "loss": 0.6357, "rewards/accuracies": 0.5, "rewards/chosen": 0.40377309918403625, "rewards/margins": 0.27017515897750854, "rewards/rejected": 0.1335979551076889, "step": 1514 }, { "epoch": 0.2342934467427025, "grad_norm": 4.383785247802734, "learning_rate": 3.904639175257732e-06, "logits/chosen": 1.3904391527175903, "logits/rejected": -3.068742275238037, "logps/chosen": -199.25372314453125, "logps/rejected": -123.5685806274414, "loss": 0.5633, "rewards/accuracies": 0.75, "rewards/chosen": 0.5240488648414612, "rewards/margins": 0.3650621771812439, "rewards/rejected": 0.15898671746253967, "step": 1515 }, { "epoch": 0.23444809588246665, "grad_norm": 7.265319347381592, "learning_rate": 3.907216494845361e-06, "logits/chosen": 13.460977554321289, "logits/rejected": 6.150660514831543, "logps/chosen": -406.9249267578125, "logps/rejected": -235.89553833007812, "loss": 0.6503, "rewards/accuracies": 0.625, "rewards/chosen": 0.5234739780426025, "rewards/margins": 0.1150905191898346, "rewards/rejected": 0.40838345885276794, "step": 1516 }, { "epoch": 0.2346027450222308, "grad_norm": 7.195724964141846, "learning_rate": 3.90979381443299e-06, "logits/chosen": 8.465538024902344, "logits/rejected": 6.386586666107178, "logps/chosen": -360.35247802734375, "logps/rejected": -264.21124267578125, "loss": 0.7, "rewards/accuracies": 0.375, "rewards/chosen": 0.5524378418922424, "rewards/margins": 0.0303567573428154, "rewards/rejected": 0.5220810770988464, "step": 1517 }, { "epoch": 0.23475739416199498, "grad_norm": 7.234546184539795, "learning_rate": 3.912371134020619e-06, "logits/chosen": 15.706295013427734, "logits/rejected": 10.237701416015625, "logps/chosen": -426.7939758300781, "logps/rejected": -433.022216796875, "loss": 0.6667, "rewards/accuracies": 0.375, "rewards/chosen": 0.3625059127807617, "rewards/margins": 0.06743927299976349, "rewards/rejected": 0.2950666546821594, "step": 1518 }, { "epoch": 0.23491204330175913, "grad_norm": 5.0474534034729, "learning_rate": 3.9149484536082475e-06, "logits/chosen": 10.240234375, "logits/rejected": 4.4165825843811035, "logps/chosen": -183.04933166503906, "logps/rejected": -149.90525817871094, "loss": 0.7293, "rewards/accuracies": 0.5, "rewards/chosen": 0.20261268317699432, "rewards/margins": -0.04941230267286301, "rewards/rejected": 0.25202497839927673, "step": 1519 }, { "epoch": 0.23506669244152328, "grad_norm": 5.936202526092529, "learning_rate": 3.917525773195877e-06, "logits/chosen": 10.639086723327637, "logits/rejected": 3.121960401535034, "logps/chosen": -246.2490692138672, "logps/rejected": -164.32725524902344, "loss": 0.7013, "rewards/accuracies": 0.625, "rewards/chosen": 0.36068159341812134, "rewards/margins": -0.00103764608502388, "rewards/rejected": 0.3617192506790161, "step": 1520 }, { "epoch": 0.23522134158128746, "grad_norm": 6.167348861694336, "learning_rate": 3.920103092783505e-06, "logits/chosen": 11.816542625427246, "logits/rejected": 4.546465873718262, "logps/chosen": -280.189453125, "logps/rejected": -210.26174926757812, "loss": 0.6743, "rewards/accuracies": 0.5, "rewards/chosen": 0.12677177786827087, "rewards/margins": 0.15038365125656128, "rewards/rejected": -0.02361188270151615, "step": 1521 }, { "epoch": 0.23537599072105161, "grad_norm": 5.172669887542725, "learning_rate": 3.9226804123711345e-06, "logits/chosen": 10.670907974243164, "logits/rejected": 7.24764347076416, "logps/chosen": -278.16339111328125, "logps/rejected": -181.60498046875, "loss": 0.6806, "rewards/accuracies": 0.375, "rewards/chosen": 0.4874260127544403, "rewards/margins": 0.043611686676740646, "rewards/rejected": 0.44381433725357056, "step": 1522 }, { "epoch": 0.23553063986081577, "grad_norm": 5.252559185028076, "learning_rate": 3.925257731958763e-06, "logits/chosen": 8.412662506103516, "logits/rejected": 3.091952323913574, "logps/chosen": -214.8580322265625, "logps/rejected": -172.47805786132812, "loss": 0.6562, "rewards/accuracies": 0.625, "rewards/chosen": 0.4008265733718872, "rewards/margins": 0.09591773897409439, "rewards/rejected": 0.3049088716506958, "step": 1523 }, { "epoch": 0.23568528900057995, "grad_norm": 5.445528984069824, "learning_rate": 3.927835051546392e-06, "logits/chosen": 11.759615898132324, "logits/rejected": 6.603482246398926, "logps/chosen": -347.9300537109375, "logps/rejected": -254.6511993408203, "loss": 0.6081, "rewards/accuracies": 0.625, "rewards/chosen": 0.6747207045555115, "rewards/margins": 0.21302491426467896, "rewards/rejected": 0.4616957902908325, "step": 1524 }, { "epoch": 0.2358399381403441, "grad_norm": 4.5485734939575195, "learning_rate": 3.930412371134021e-06, "logits/chosen": 10.335458755493164, "logits/rejected": 2.6694533824920654, "logps/chosen": -356.0323486328125, "logps/rejected": -181.03753662109375, "loss": 0.5395, "rewards/accuracies": 0.875, "rewards/chosen": 0.6421796083450317, "rewards/margins": 0.38767755031585693, "rewards/rejected": 0.2545021176338196, "step": 1525 }, { "epoch": 0.23599458728010825, "grad_norm": 5.563239097595215, "learning_rate": 3.93298969072165e-06, "logits/chosen": 9.825559616088867, "logits/rejected": 8.412909507751465, "logps/chosen": -270.6493225097656, "logps/rejected": -206.90167236328125, "loss": 0.7916, "rewards/accuracies": 0.25, "rewards/chosen": 0.30442261695861816, "rewards/margins": -0.13934195041656494, "rewards/rejected": 0.4437645673751831, "step": 1526 }, { "epoch": 0.23614923641987243, "grad_norm": 5.295621395111084, "learning_rate": 3.9355670103092784e-06, "logits/chosen": 10.532032012939453, "logits/rejected": 6.786726951599121, "logps/chosen": -329.4215087890625, "logps/rejected": -267.92071533203125, "loss": 0.6152, "rewards/accuracies": 0.625, "rewards/chosen": 0.26189011335372925, "rewards/margins": 0.19583845138549805, "rewards/rejected": 0.06605168431997299, "step": 1527 }, { "epoch": 0.23630388555963658, "grad_norm": 8.2406587600708, "learning_rate": 3.938144329896908e-06, "logits/chosen": 6.929096221923828, "logits/rejected": 5.003818035125732, "logps/chosen": -304.64306640625, "logps/rejected": -254.5154571533203, "loss": 0.6335, "rewards/accuracies": 0.5, "rewards/chosen": 0.5827102065086365, "rewards/margins": 0.13970284163951874, "rewards/rejected": 0.44300737977027893, "step": 1528 }, { "epoch": 0.23645853469940073, "grad_norm": 4.663034915924072, "learning_rate": 3.940721649484536e-06, "logits/chosen": 9.873632431030273, "logits/rejected": 3.9288434982299805, "logps/chosen": -293.5548095703125, "logps/rejected": -181.2091522216797, "loss": 0.5975, "rewards/accuracies": 0.625, "rewards/chosen": 0.6812810897827148, "rewards/margins": 0.29897189140319824, "rewards/rejected": 0.382309228181839, "step": 1529 }, { "epoch": 0.23661318383916488, "grad_norm": 4.7728118896484375, "learning_rate": 3.9432989690721655e-06, "logits/chosen": 12.11138916015625, "logits/rejected": 7.631097793579102, "logps/chosen": -212.34243774414062, "logps/rejected": -150.57159423828125, "loss": 0.7554, "rewards/accuracies": 0.5, "rewards/chosen": 0.4381111264228821, "rewards/margins": -0.07770747691392899, "rewards/rejected": 0.5158185958862305, "step": 1530 }, { "epoch": 0.23676783297892906, "grad_norm": 6.447408199310303, "learning_rate": 3.945876288659794e-06, "logits/chosen": 7.343404293060303, "logits/rejected": 6.306171894073486, "logps/chosen": -202.13076782226562, "logps/rejected": -227.48036193847656, "loss": 0.6646, "rewards/accuracies": 0.5, "rewards/chosen": 0.4075896441936493, "rewards/margins": 0.08385476469993591, "rewards/rejected": 0.3237348794937134, "step": 1531 }, { "epoch": 0.2369224821186932, "grad_norm": 5.694151401519775, "learning_rate": 3.948453608247423e-06, "logits/chosen": 2.9482839107513428, "logits/rejected": 9.376598358154297, "logps/chosen": -183.54022216796875, "logps/rejected": -232.29525756835938, "loss": 0.7505, "rewards/accuracies": 0.375, "rewards/chosen": 0.519823431968689, "rewards/margins": -0.08198332041501999, "rewards/rejected": 0.6018067598342896, "step": 1532 }, { "epoch": 0.23707713125845736, "grad_norm": 4.7428717613220215, "learning_rate": 3.951030927835052e-06, "logits/chosen": 9.520221710205078, "logits/rejected": 9.238412857055664, "logps/chosen": -289.4178161621094, "logps/rejected": -280.3835754394531, "loss": 0.5764, "rewards/accuracies": 0.75, "rewards/chosen": 0.5405789017677307, "rewards/margins": 0.27859997749328613, "rewards/rejected": 0.2619789242744446, "step": 1533 }, { "epoch": 0.23723178039822154, "grad_norm": 4.616872310638428, "learning_rate": 3.953608247422681e-06, "logits/chosen": 12.716232299804688, "logits/rejected": 8.899825096130371, "logps/chosen": -259.2904052734375, "logps/rejected": -192.69430541992188, "loss": 0.6037, "rewards/accuracies": 0.875, "rewards/chosen": 0.46938180923461914, "rewards/margins": 0.20717325806617737, "rewards/rejected": 0.26220858097076416, "step": 1534 }, { "epoch": 0.2373864295379857, "grad_norm": 5.762049198150635, "learning_rate": 3.956185567010309e-06, "logits/chosen": 9.231230735778809, "logits/rejected": 7.194056510925293, "logps/chosen": -315.4071044921875, "logps/rejected": -275.0879821777344, "loss": 0.6795, "rewards/accuracies": 0.5, "rewards/chosen": 0.4359944462776184, "rewards/margins": 0.048519887030124664, "rewards/rejected": 0.38747456669807434, "step": 1535 }, { "epoch": 0.23754107867774985, "grad_norm": 5.51515007019043, "learning_rate": 3.958762886597938e-06, "logits/chosen": 6.62996244430542, "logits/rejected": 8.82288932800293, "logps/chosen": -298.00604248046875, "logps/rejected": -337.2183837890625, "loss": 0.648, "rewards/accuracies": 0.625, "rewards/chosen": 0.6336236000061035, "rewards/margins": 0.13676565885543823, "rewards/rejected": 0.4968579411506653, "step": 1536 }, { "epoch": 0.23769572781751402, "grad_norm": 7.043104648590088, "learning_rate": 3.961340206185567e-06, "logits/chosen": 10.491825103759766, "logits/rejected": 9.131352424621582, "logps/chosen": -365.07122802734375, "logps/rejected": -265.52874755859375, "loss": 0.8316, "rewards/accuracies": 0.25, "rewards/chosen": 0.23324403166770935, "rewards/margins": -0.23364277184009552, "rewards/rejected": 0.46688681840896606, "step": 1537 }, { "epoch": 0.23785037695727818, "grad_norm": 4.385149955749512, "learning_rate": 3.963917525773196e-06, "logits/chosen": 7.493856430053711, "logits/rejected": 1.1993488073349, "logps/chosen": -378.59686279296875, "logps/rejected": -259.1129150390625, "loss": 0.5343, "rewards/accuracies": 0.875, "rewards/chosen": 0.6846151947975159, "rewards/margins": 0.36997056007385254, "rewards/rejected": 0.31464463472366333, "step": 1538 }, { "epoch": 0.23800502609704233, "grad_norm": 6.514186859130859, "learning_rate": 3.966494845360825e-06, "logits/chosen": 6.022164821624756, "logits/rejected": 10.044828414916992, "logps/chosen": -297.1440124511719, "logps/rejected": -300.332763671875, "loss": 0.6612, "rewards/accuracies": 0.625, "rewards/chosen": 0.358091801404953, "rewards/margins": 0.09648928046226501, "rewards/rejected": 0.261602520942688, "step": 1539 }, { "epoch": 0.2381596752368065, "grad_norm": 4.723688125610352, "learning_rate": 3.969072164948453e-06, "logits/chosen": 6.895518779754639, "logits/rejected": 2.1096956729888916, "logps/chosen": -210.67108154296875, "logps/rejected": -185.7576141357422, "loss": 0.5632, "rewards/accuracies": 1.0, "rewards/chosen": 0.6370294094085693, "rewards/margins": 0.2994311451911926, "rewards/rejected": 0.3375983238220215, "step": 1540 }, { "epoch": 0.23831432437657066, "grad_norm": 4.970569610595703, "learning_rate": 3.971649484536083e-06, "logits/chosen": 11.017690658569336, "logits/rejected": 7.6477484703063965, "logps/chosen": -181.72262573242188, "logps/rejected": -161.9139404296875, "loss": 0.7472, "rewards/accuracies": 0.5, "rewards/chosen": 0.5567940473556519, "rewards/margins": -0.009918492287397385, "rewards/rejected": 0.5667125582695007, "step": 1541 }, { "epoch": 0.2384689735163348, "grad_norm": 6.497038841247559, "learning_rate": 3.974226804123711e-06, "logits/chosen": 12.549454689025879, "logits/rejected": 8.95539665222168, "logps/chosen": -235.93719482421875, "logps/rejected": -244.67599487304688, "loss": 0.7134, "rewards/accuracies": 0.5, "rewards/chosen": 0.23067083954811096, "rewards/margins": -0.0008360408246517181, "rewards/rejected": 0.23150688409805298, "step": 1542 }, { "epoch": 0.238623622656099, "grad_norm": 5.013777732849121, "learning_rate": 3.97680412371134e-06, "logits/chosen": 7.2159318923950195, "logits/rejected": 6.313971042633057, "logps/chosen": -193.31979370117188, "logps/rejected": -206.64132690429688, "loss": 0.6661, "rewards/accuracies": 0.5, "rewards/chosen": 0.40325412154197693, "rewards/margins": 0.09369264543056488, "rewards/rejected": 0.30956146121025085, "step": 1543 }, { "epoch": 0.23877827179586314, "grad_norm": 6.626040935516357, "learning_rate": 3.979381443298969e-06, "logits/chosen": 11.671626091003418, "logits/rejected": 7.379290580749512, "logps/chosen": -303.1752014160156, "logps/rejected": -229.61151123046875, "loss": 0.5585, "rewards/accuracies": 0.75, "rewards/chosen": 0.5849947929382324, "rewards/margins": 0.34085673093795776, "rewards/rejected": 0.24413806200027466, "step": 1544 }, { "epoch": 0.2389329209356273, "grad_norm": 5.476341724395752, "learning_rate": 3.981958762886598e-06, "logits/chosen": 10.761817932128906, "logits/rejected": 11.846903800964355, "logps/chosen": -233.01882934570312, "logps/rejected": -271.3255615234375, "loss": 0.6231, "rewards/accuracies": 0.875, "rewards/chosen": 0.4730564057826996, "rewards/margins": 0.20200592279434204, "rewards/rejected": 0.27105045318603516, "step": 1545 }, { "epoch": 0.23908757007539144, "grad_norm": 7.053686618804932, "learning_rate": 3.9845360824742274e-06, "logits/chosen": 6.4414215087890625, "logits/rejected": 9.58053970336914, "logps/chosen": -257.3913879394531, "logps/rejected": -323.2955322265625, "loss": 0.68, "rewards/accuracies": 0.625, "rewards/chosen": 0.4482116401195526, "rewards/margins": 0.047430846840143204, "rewards/rejected": 0.4007807970046997, "step": 1546 }, { "epoch": 0.23924221921515562, "grad_norm": 5.480428218841553, "learning_rate": 3.987113402061856e-06, "logits/chosen": 9.976581573486328, "logits/rejected": 9.315201759338379, "logps/chosen": -245.08792114257812, "logps/rejected": -224.33184814453125, "loss": 0.6179, "rewards/accuracies": 0.625, "rewards/chosen": 0.5003190636634827, "rewards/margins": 0.197228342294693, "rewards/rejected": 0.3030906617641449, "step": 1547 }, { "epoch": 0.23939686835491977, "grad_norm": 4.120488166809082, "learning_rate": 3.989690721649485e-06, "logits/chosen": 7.97170877456665, "logits/rejected": 6.064027786254883, "logps/chosen": -177.2056884765625, "logps/rejected": -172.6649169921875, "loss": 0.5638, "rewards/accuracies": 0.875, "rewards/chosen": 0.7762455940246582, "rewards/margins": 0.30764156579971313, "rewards/rejected": 0.46860408782958984, "step": 1548 }, { "epoch": 0.23955151749468392, "grad_norm": 5.3417067527771, "learning_rate": 3.992268041237114e-06, "logits/chosen": 7.547265529632568, "logits/rejected": 3.9034485816955566, "logps/chosen": -249.04042053222656, "logps/rejected": -248.63888549804688, "loss": 0.6835, "rewards/accuracies": 0.5, "rewards/chosen": 0.40392619371414185, "rewards/margins": 0.10767553746700287, "rewards/rejected": 0.2962506413459778, "step": 1549 }, { "epoch": 0.2397061666344481, "grad_norm": 6.094179630279541, "learning_rate": 3.994845360824743e-06, "logits/chosen": 11.436868667602539, "logits/rejected": 7.787228107452393, "logps/chosen": -346.1475830078125, "logps/rejected": -321.69775390625, "loss": 0.634, "rewards/accuracies": 0.5, "rewards/chosen": 0.44328147172927856, "rewards/margins": 0.18709611892700195, "rewards/rejected": 0.2561853528022766, "step": 1550 }, { "epoch": 0.23986081577421225, "grad_norm": 5.648887634277344, "learning_rate": 3.997422680412371e-06, "logits/chosen": 2.649181842803955, "logits/rejected": 6.367504119873047, "logps/chosen": -287.20831298828125, "logps/rejected": -307.246337890625, "loss": 0.684, "rewards/accuracies": 0.375, "rewards/chosen": 0.4901849925518036, "rewards/margins": 0.05955544486641884, "rewards/rejected": 0.43062952160835266, "step": 1551 }, { "epoch": 0.2400154649139764, "grad_norm": 5.494936466217041, "learning_rate": 4.000000000000001e-06, "logits/chosen": 12.190316200256348, "logits/rejected": 10.302474021911621, "logps/chosen": -343.9850158691406, "logps/rejected": -264.27911376953125, "loss": 0.5388, "rewards/accuracies": 0.75, "rewards/chosen": 0.6995195746421814, "rewards/margins": 0.39650246500968933, "rewards/rejected": 0.30301710963249207, "step": 1552 }, { "epoch": 0.24017011405374059, "grad_norm": 4.425969123840332, "learning_rate": 4.002577319587629e-06, "logits/chosen": 9.866296768188477, "logits/rejected": 4.985841274261475, "logps/chosen": -296.0503234863281, "logps/rejected": -240.78343200683594, "loss": 0.6032, "rewards/accuracies": 1.0, "rewards/chosen": 0.5350544452667236, "rewards/margins": 0.196751207113266, "rewards/rejected": 0.33830320835113525, "step": 1553 }, { "epoch": 0.24032476319350474, "grad_norm": 8.744105339050293, "learning_rate": 4.005154639175258e-06, "logits/chosen": 7.268885612487793, "logits/rejected": 6.854941368103027, "logps/chosen": -415.1798095703125, "logps/rejected": -364.4555969238281, "loss": 0.7227, "rewards/accuracies": 0.75, "rewards/chosen": 0.9483609199523926, "rewards/margins": 0.011875815689563751, "rewards/rejected": 0.9364850521087646, "step": 1554 }, { "epoch": 0.2404794123332689, "grad_norm": 7.5815653800964355, "learning_rate": 4.007731958762887e-06, "logits/chosen": 12.504404067993164, "logits/rejected": 11.693572998046875, "logps/chosen": -306.8857727050781, "logps/rejected": -252.85137939453125, "loss": 0.6413, "rewards/accuracies": 0.75, "rewards/chosen": 0.4932171106338501, "rewards/margins": 0.21040642261505127, "rewards/rejected": 0.28281062841415405, "step": 1555 }, { "epoch": 0.24063406147303307, "grad_norm": 4.477160453796387, "learning_rate": 4.010309278350516e-06, "logits/chosen": 7.074172019958496, "logits/rejected": 9.082114219665527, "logps/chosen": -247.4729766845703, "logps/rejected": -216.42205810546875, "loss": 0.5979, "rewards/accuracies": 0.75, "rewards/chosen": 0.4961368441581726, "rewards/margins": 0.21538829803466797, "rewards/rejected": 0.280748575925827, "step": 1556 }, { "epoch": 0.24078871061279722, "grad_norm": 3.7664945125579834, "learning_rate": 4.012886597938145e-06, "logits/chosen": 3.4420535564422607, "logits/rejected": 6.456167221069336, "logps/chosen": -156.06997680664062, "logps/rejected": -170.4356689453125, "loss": 0.6902, "rewards/accuracies": 0.5, "rewards/chosen": 0.2626434862613678, "rewards/margins": 0.10600829124450684, "rewards/rejected": 0.15663519501686096, "step": 1557 }, { "epoch": 0.24094335975256137, "grad_norm": 7.399625778198242, "learning_rate": 4.015463917525774e-06, "logits/chosen": 12.656184196472168, "logits/rejected": 12.132808685302734, "logps/chosen": -348.25836181640625, "logps/rejected": -312.4849853515625, "loss": 0.7158, "rewards/accuracies": 0.5, "rewards/chosen": 0.5322256088256836, "rewards/margins": 0.015666592866182327, "rewards/rejected": 0.5165590643882751, "step": 1558 }, { "epoch": 0.24109800889232555, "grad_norm": 3.438904285430908, "learning_rate": 4.018041237113402e-06, "logits/chosen": 9.07430648803711, "logits/rejected": 5.470651626586914, "logps/chosen": -165.8836669921875, "logps/rejected": -133.9071502685547, "loss": 0.5751, "rewards/accuracies": 0.875, "rewards/chosen": 0.45373183488845825, "rewards/margins": 0.283252090215683, "rewards/rejected": 0.17047977447509766, "step": 1559 }, { "epoch": 0.2412526580320897, "grad_norm": 6.311559200286865, "learning_rate": 4.020618556701032e-06, "logits/chosen": 10.098783493041992, "logits/rejected": 3.9479897022247314, "logps/chosen": -333.0740966796875, "logps/rejected": -276.79388427734375, "loss": 0.624, "rewards/accuracies": 0.625, "rewards/chosen": 0.7639670372009277, "rewards/margins": 0.24557381868362427, "rewards/rejected": 0.5183932781219482, "step": 1560 }, { "epoch": 0.24140730717185385, "grad_norm": 5.065113544464111, "learning_rate": 4.02319587628866e-06, "logits/chosen": 8.530557632446289, "logits/rejected": 0.9309936165809631, "logps/chosen": -230.94143676757812, "logps/rejected": -145.53173828125, "loss": 0.6795, "rewards/accuracies": 0.5, "rewards/chosen": 0.27160483598709106, "rewards/margins": 0.06093420833349228, "rewards/rejected": 0.2106706202030182, "step": 1561 }, { "epoch": 0.241561956311618, "grad_norm": 5.3548197746276855, "learning_rate": 4.025773195876289e-06, "logits/chosen": 7.3155717849731445, "logits/rejected": 11.198726654052734, "logps/chosen": -193.5297088623047, "logps/rejected": -182.76806640625, "loss": 0.7579, "rewards/accuracies": 0.5, "rewards/chosen": 0.2433968484401703, "rewards/margins": -0.09672746807336807, "rewards/rejected": 0.34012433886528015, "step": 1562 }, { "epoch": 0.24171660545138218, "grad_norm": 8.268684387207031, "learning_rate": 4.028350515463918e-06, "logits/chosen": 11.107728004455566, "logits/rejected": 8.472933769226074, "logps/chosen": -338.319580078125, "logps/rejected": -301.05352783203125, "loss": 0.7797, "rewards/accuracies": 0.25, "rewards/chosen": 0.5623660683631897, "rewards/margins": -0.13360103964805603, "rewards/rejected": 0.6959670782089233, "step": 1563 }, { "epoch": 0.24187125459114633, "grad_norm": 3.832423210144043, "learning_rate": 4.030927835051547e-06, "logits/chosen": 10.927206039428711, "logits/rejected": 2.791125774383545, "logps/chosen": -234.20565795898438, "logps/rejected": -149.89859008789062, "loss": 0.5153, "rewards/accuracies": 0.875, "rewards/chosen": 0.534905195236206, "rewards/margins": 0.48860862851142883, "rewards/rejected": 0.04629654437303543, "step": 1564 }, { "epoch": 0.24202590373091049, "grad_norm": 5.430241584777832, "learning_rate": 4.033505154639176e-06, "logits/chosen": 6.212075710296631, "logits/rejected": 6.049513816833496, "logps/chosen": -185.7310791015625, "logps/rejected": -189.97540283203125, "loss": 0.7401, "rewards/accuracies": 0.375, "rewards/chosen": 0.26261845231056213, "rewards/margins": -0.06871247291564941, "rewards/rejected": 0.33133092522621155, "step": 1565 }, { "epoch": 0.24218055287067466, "grad_norm": 6.031858921051025, "learning_rate": 4.036082474226805e-06, "logits/chosen": 9.077997207641602, "logits/rejected": 3.846890449523926, "logps/chosen": -250.1879119873047, "logps/rejected": -238.61819458007812, "loss": 0.6995, "rewards/accuracies": 0.375, "rewards/chosen": 0.3761819303035736, "rewards/margins": 0.13276588916778564, "rewards/rejected": 0.24341602623462677, "step": 1566 }, { "epoch": 0.24233520201043882, "grad_norm": 5.185763359069824, "learning_rate": 4.038659793814433e-06, "logits/chosen": 11.459232330322266, "logits/rejected": 5.572287559509277, "logps/chosen": -200.27467346191406, "logps/rejected": -147.2150115966797, "loss": 0.6437, "rewards/accuracies": 0.5, "rewards/chosen": 0.43723392486572266, "rewards/margins": 0.1866198629140854, "rewards/rejected": 0.25061407685279846, "step": 1567 }, { "epoch": 0.24248985115020297, "grad_norm": 5.5745110511779785, "learning_rate": 4.041237113402063e-06, "logits/chosen": 9.668834686279297, "logits/rejected": 9.785215377807617, "logps/chosen": -385.4316101074219, "logps/rejected": -336.98590087890625, "loss": 0.5286, "rewards/accuracies": 0.875, "rewards/chosen": 0.6820379495620728, "rewards/margins": 0.40019628405570984, "rewards/rejected": 0.2818416655063629, "step": 1568 }, { "epoch": 0.24264450028996715, "grad_norm": 19.378620147705078, "learning_rate": 4.043814432989691e-06, "logits/chosen": 9.61506175994873, "logits/rejected": 5.413683891296387, "logps/chosen": -289.4730224609375, "logps/rejected": -198.4378204345703, "loss": 0.5973, "rewards/accuracies": 0.625, "rewards/chosen": 0.535370409488678, "rewards/margins": 0.2744349241256714, "rewards/rejected": 0.2609354853630066, "step": 1569 }, { "epoch": 0.2427991494297313, "grad_norm": 4.232359409332275, "learning_rate": 4.04639175257732e-06, "logits/chosen": 9.682815551757812, "logits/rejected": 7.381643295288086, "logps/chosen": -207.211181640625, "logps/rejected": -205.03805541992188, "loss": 0.6202, "rewards/accuracies": 0.5, "rewards/chosen": 0.5808166265487671, "rewards/margins": 0.1779598742723465, "rewards/rejected": 0.4028567671775818, "step": 1570 }, { "epoch": 0.24295379856949545, "grad_norm": 4.199387073516846, "learning_rate": 4.048969072164949e-06, "logits/chosen": 8.136574745178223, "logits/rejected": 9.329063415527344, "logps/chosen": -140.48373413085938, "logps/rejected": -144.8646697998047, "loss": 0.6798, "rewards/accuracies": 0.5, "rewards/chosen": 0.40834593772888184, "rewards/margins": 0.05896200239658356, "rewards/rejected": 0.3493839204311371, "step": 1571 }, { "epoch": 0.24310844770925963, "grad_norm": 6.335525035858154, "learning_rate": 4.051546391752578e-06, "logits/chosen": 7.455455303192139, "logits/rejected": 3.4150402545928955, "logps/chosen": -470.0790100097656, "logps/rejected": -354.06585693359375, "loss": 0.6715, "rewards/accuracies": 0.625, "rewards/chosen": 0.620119571685791, "rewards/margins": 0.07851359993219376, "rewards/rejected": 0.5416059494018555, "step": 1572 }, { "epoch": 0.24326309684902378, "grad_norm": 6.335812091827393, "learning_rate": 4.0541237113402066e-06, "logits/chosen": 11.296810150146484, "logits/rejected": 12.278915405273438, "logps/chosen": -304.5184631347656, "logps/rejected": -305.2149353027344, "loss": 0.6959, "rewards/accuracies": 0.625, "rewards/chosen": 0.5601184368133545, "rewards/margins": 0.08246137946844101, "rewards/rejected": 0.4776570200920105, "step": 1573 }, { "epoch": 0.24341774598878793, "grad_norm": 5.249716758728027, "learning_rate": 4.056701030927835e-06, "logits/chosen": 5.735694885253906, "logits/rejected": 3.0682520866394043, "logps/chosen": -306.17108154296875, "logps/rejected": -201.2200927734375, "loss": 0.5644, "rewards/accuracies": 0.625, "rewards/chosen": 0.7355351448059082, "rewards/margins": 0.3275569975376129, "rewards/rejected": 0.40797820687294006, "step": 1574 }, { "epoch": 0.2435723951285521, "grad_norm": 5.277350902557373, "learning_rate": 4.059278350515464e-06, "logits/chosen": 15.300821304321289, "logits/rejected": 13.396979331970215, "logps/chosen": -234.18836975097656, "logps/rejected": -215.06817626953125, "loss": 0.5788, "rewards/accuracies": 0.75, "rewards/chosen": 0.42278116941452026, "rewards/margins": 0.30266231298446655, "rewards/rejected": 0.1201188713312149, "step": 1575 }, { "epoch": 0.24372704426831626, "grad_norm": 7.821404933929443, "learning_rate": 4.061855670103093e-06, "logits/chosen": 9.823543548583984, "logits/rejected": 9.900336265563965, "logps/chosen": -257.73492431640625, "logps/rejected": -306.6255187988281, "loss": 0.6965, "rewards/accuracies": 0.5, "rewards/chosen": 0.35069897770881653, "rewards/margins": 0.030589543282985687, "rewards/rejected": 0.320109486579895, "step": 1576 }, { "epoch": 0.2438816934080804, "grad_norm": 7.5015997886657715, "learning_rate": 4.064432989690722e-06, "logits/chosen": 10.402287483215332, "logits/rejected": 6.180578708648682, "logps/chosen": -367.4396057128906, "logps/rejected": -246.1971435546875, "loss": 0.6785, "rewards/accuracies": 0.5, "rewards/chosen": 0.33299723267555237, "rewards/margins": 0.07326700538396835, "rewards/rejected": 0.2597302496433258, "step": 1577 }, { "epoch": 0.24403634254784456, "grad_norm": 18.336650848388672, "learning_rate": 4.0670103092783505e-06, "logits/chosen": 11.310956954956055, "logits/rejected": 10.829543113708496, "logps/chosen": -177.29788208007812, "logps/rejected": -208.85240173339844, "loss": 0.6101, "rewards/accuracies": 0.625, "rewards/chosen": 0.424050897359848, "rewards/margins": 0.24967622756958008, "rewards/rejected": 0.17437466979026794, "step": 1578 }, { "epoch": 0.24419099168760874, "grad_norm": 5.507360458374023, "learning_rate": 4.06958762886598e-06, "logits/chosen": 3.5101687908172607, "logits/rejected": 0.9511356949806213, "logps/chosen": -216.30337524414062, "logps/rejected": -138.9408416748047, "loss": 0.6354, "rewards/accuracies": 0.625, "rewards/chosen": 0.4257768988609314, "rewards/margins": 0.20511355996131897, "rewards/rejected": 0.220663383603096, "step": 1579 }, { "epoch": 0.2443456408273729, "grad_norm": 5.62885046005249, "learning_rate": 4.072164948453608e-06, "logits/chosen": 0.9026474952697754, "logits/rejected": 5.950027942657471, "logps/chosen": -160.66421508789062, "logps/rejected": -215.285888671875, "loss": 0.6083, "rewards/accuracies": 0.75, "rewards/chosen": 0.3418775796890259, "rewards/margins": 0.24033251404762268, "rewards/rejected": 0.101545050740242, "step": 1580 }, { "epoch": 0.24450028996713705, "grad_norm": 4.013876914978027, "learning_rate": 4.0747422680412375e-06, "logits/chosen": 13.218616485595703, "logits/rejected": 9.569124221801758, "logps/chosen": -304.29290771484375, "logps/rejected": -217.02273559570312, "loss": 0.5242, "rewards/accuracies": 0.875, "rewards/chosen": 0.7105241417884827, "rewards/margins": 0.42840376496315, "rewards/rejected": 0.2821202874183655, "step": 1581 }, { "epoch": 0.24465493910690123, "grad_norm": 6.176245212554932, "learning_rate": 4.077319587628866e-06, "logits/chosen": 6.742888450622559, "logits/rejected": 9.035784721374512, "logps/chosen": -348.742919921875, "logps/rejected": -335.8526611328125, "loss": 0.7004, "rewards/accuracies": 0.375, "rewards/chosen": 0.25292858481407166, "rewards/margins": 0.051133204251527786, "rewards/rejected": 0.20179539918899536, "step": 1582 }, { "epoch": 0.24480958824666538, "grad_norm": 7.001703262329102, "learning_rate": 4.079896907216495e-06, "logits/chosen": 13.059893608093262, "logits/rejected": 6.234818935394287, "logps/chosen": -288.611572265625, "logps/rejected": -171.21954345703125, "loss": 0.7763, "rewards/accuracies": 0.375, "rewards/chosen": 0.16351842880249023, "rewards/margins": -0.08241356909275055, "rewards/rejected": 0.24593199789524078, "step": 1583 }, { "epoch": 0.24496423738642953, "grad_norm": 6.992018222808838, "learning_rate": 4.082474226804124e-06, "logits/chosen": 11.671810150146484, "logits/rejected": 12.481635093688965, "logps/chosen": -272.3096618652344, "logps/rejected": -287.1587829589844, "loss": 0.8005, "rewards/accuracies": 0.375, "rewards/chosen": 0.5078313946723938, "rewards/margins": -0.13844740390777588, "rewards/rejected": 0.6462787985801697, "step": 1584 }, { "epoch": 0.2451188865261937, "grad_norm": 6.328573703765869, "learning_rate": 4.085051546391753e-06, "logits/chosen": 11.106008529663086, "logits/rejected": 9.432999610900879, "logps/chosen": -264.5981140136719, "logps/rejected": -257.2050476074219, "loss": 0.8491, "rewards/accuracies": 0.25, "rewards/chosen": 0.18957120180130005, "rewards/margins": -0.2706175148487091, "rewards/rejected": 0.46018871665000916, "step": 1585 }, { "epoch": 0.24527353566595786, "grad_norm": 7.428191661834717, "learning_rate": 4.0876288659793815e-06, "logits/chosen": 9.431912422180176, "logits/rejected": 9.949995994567871, "logps/chosen": -282.7473449707031, "logps/rejected": -262.3726806640625, "loss": 0.7759, "rewards/accuracies": 0.625, "rewards/chosen": 0.34010347723960876, "rewards/margins": -0.1167859435081482, "rewards/rejected": 0.45688945055007935, "step": 1586 }, { "epoch": 0.245428184805722, "grad_norm": 4.919670581817627, "learning_rate": 4.090206185567011e-06, "logits/chosen": 10.43871021270752, "logits/rejected": 7.3854875564575195, "logps/chosen": -300.2728576660156, "logps/rejected": -241.9805145263672, "loss": 0.6257, "rewards/accuracies": 0.5, "rewards/chosen": 0.4826265275478363, "rewards/margins": 0.1631772518157959, "rewards/rejected": 0.3194493055343628, "step": 1587 }, { "epoch": 0.2455828339454862, "grad_norm": 6.834775924682617, "learning_rate": 4.092783505154639e-06, "logits/chosen": 6.900688648223877, "logits/rejected": 2.7163596153259277, "logps/chosen": -490.18011474609375, "logps/rejected": -290.9938049316406, "loss": 0.5916, "rewards/accuracies": 0.75, "rewards/chosen": 0.5109007358551025, "rewards/margins": 0.28936702013015747, "rewards/rejected": 0.2215336710214615, "step": 1588 }, { "epoch": 0.24573748308525034, "grad_norm": 17.95108413696289, "learning_rate": 4.0953608247422685e-06, "logits/chosen": 10.227059364318848, "logits/rejected": 3.6857471466064453, "logps/chosen": -287.2636413574219, "logps/rejected": -164.34503173828125, "loss": 0.7411, "rewards/accuracies": 0.375, "rewards/chosen": 0.24053683876991272, "rewards/margins": -0.06568513810634613, "rewards/rejected": 0.30622199177742004, "step": 1589 }, { "epoch": 0.2458921322250145, "grad_norm": 6.457596778869629, "learning_rate": 4.097938144329897e-06, "logits/chosen": 8.670398712158203, "logits/rejected": 12.373117446899414, "logps/chosen": -273.8775329589844, "logps/rejected": -326.6858215332031, "loss": 0.6257, "rewards/accuracies": 0.625, "rewards/chosen": 0.629082977771759, "rewards/margins": 0.16207952797412872, "rewards/rejected": 0.4670034348964691, "step": 1590 }, { "epoch": 0.24604678136477867, "grad_norm": 5.513363361358643, "learning_rate": 4.100515463917526e-06, "logits/chosen": 16.222354888916016, "logits/rejected": 9.223556518554688, "logps/chosen": -337.9263916015625, "logps/rejected": -254.47622680664062, "loss": 0.5971, "rewards/accuracies": 0.75, "rewards/chosen": 0.42576923966407776, "rewards/margins": 0.25148630142211914, "rewards/rejected": 0.17428293824195862, "step": 1591 }, { "epoch": 0.24620143050454282, "grad_norm": 5.461479187011719, "learning_rate": 4.103092783505155e-06, "logits/chosen": 13.338172912597656, "logits/rejected": 7.191806793212891, "logps/chosen": -331.81536865234375, "logps/rejected": -265.1946105957031, "loss": 0.4908, "rewards/accuracies": 0.875, "rewards/chosen": 0.7693902850151062, "rewards/margins": 0.5378403663635254, "rewards/rejected": 0.23154990375041962, "step": 1592 }, { "epoch": 0.24635607964430697, "grad_norm": 4.133162975311279, "learning_rate": 4.105670103092784e-06, "logits/chosen": 18.246402740478516, "logits/rejected": 11.348210334777832, "logps/chosen": -275.86944580078125, "logps/rejected": -222.93460083007812, "loss": 0.5855, "rewards/accuracies": 0.875, "rewards/chosen": 0.6855422258377075, "rewards/margins": 0.24522744119167328, "rewards/rejected": 0.44031476974487305, "step": 1593 }, { "epoch": 0.24651072878407113, "grad_norm": 5.335321426391602, "learning_rate": 4.1082474226804124e-06, "logits/chosen": 10.178993225097656, "logits/rejected": 10.26882553100586, "logps/chosen": -178.4753875732422, "logps/rejected": -188.35284423828125, "loss": 0.7575, "rewards/accuracies": 0.5, "rewards/chosen": 0.25981560349464417, "rewards/margins": -0.07432928681373596, "rewards/rejected": 0.3341448903083801, "step": 1594 }, { "epoch": 0.2466653779238353, "grad_norm": 7.256267070770264, "learning_rate": 4.110824742268042e-06, "logits/chosen": 7.20297908782959, "logits/rejected": 5.310847759246826, "logps/chosen": -175.30859375, "logps/rejected": -192.25233459472656, "loss": 0.7024, "rewards/accuracies": 0.375, "rewards/chosen": 0.4013350307941437, "rewards/margins": 0.052470821887254715, "rewards/rejected": 0.34886422753334045, "step": 1595 }, { "epoch": 0.24682002706359946, "grad_norm": 5.77985143661499, "learning_rate": 4.11340206185567e-06, "logits/chosen": 4.736292839050293, "logits/rejected": 12.441851615905762, "logps/chosen": -263.7264709472656, "logps/rejected": -282.23565673828125, "loss": 0.6815, "rewards/accuracies": 0.625, "rewards/chosen": 0.24189285933971405, "rewards/margins": 0.05961134284734726, "rewards/rejected": 0.18228153884410858, "step": 1596 }, { "epoch": 0.2469746762033636, "grad_norm": 7.368546485900879, "learning_rate": 4.1159793814432995e-06, "logits/chosen": 5.932742118835449, "logits/rejected": 4.723309516906738, "logps/chosen": -504.2133483886719, "logps/rejected": -400.3802795410156, "loss": 0.696, "rewards/accuracies": 0.5, "rewards/chosen": 0.46605217456817627, "rewards/margins": 0.01820240169763565, "rewards/rejected": 0.4478497803211212, "step": 1597 }, { "epoch": 0.2471293253431278, "grad_norm": 6.774259567260742, "learning_rate": 4.118556701030928e-06, "logits/chosen": 16.846004486083984, "logits/rejected": 13.96391773223877, "logps/chosen": -404.5115051269531, "logps/rejected": -248.13941955566406, "loss": 0.6575, "rewards/accuracies": 0.625, "rewards/chosen": 0.3905647397041321, "rewards/margins": 0.2489706128835678, "rewards/rejected": 0.14159414172172546, "step": 1598 }, { "epoch": 0.24728397448289194, "grad_norm": 6.341897487640381, "learning_rate": 4.121134020618557e-06, "logits/chosen": 8.28592300415039, "logits/rejected": 4.471281051635742, "logps/chosen": -482.963134765625, "logps/rejected": -298.1980895996094, "loss": 0.579, "rewards/accuracies": 0.625, "rewards/chosen": 0.8686650991439819, "rewards/margins": 0.3004007637500763, "rewards/rejected": 0.5682643055915833, "step": 1599 }, { "epoch": 0.2474386236226561, "grad_norm": 5.28187894821167, "learning_rate": 4.123711340206186e-06, "logits/chosen": 16.991960525512695, "logits/rejected": 8.26224422454834, "logps/chosen": -344.2832336425781, "logps/rejected": -244.84840393066406, "loss": 0.7102, "rewards/accuracies": 0.5, "rewards/chosen": 0.17857977747917175, "rewards/margins": 0.028863973915576935, "rewards/rejected": 0.14971581101417542, "step": 1600 }, { "epoch": 0.24759327276242027, "grad_norm": 11.196612358093262, "learning_rate": 4.126288659793815e-06, "logits/chosen": 6.732871055603027, "logits/rejected": 7.686327934265137, "logps/chosen": -247.2836151123047, "logps/rejected": -235.54293823242188, "loss": 0.5831, "rewards/accuracies": 0.75, "rewards/chosen": 0.4772689640522003, "rewards/margins": 0.3054541349411011, "rewards/rejected": 0.17181482911109924, "step": 1601 }, { "epoch": 0.24774792190218442, "grad_norm": 8.007533073425293, "learning_rate": 4.128865979381443e-06, "logits/chosen": 7.7536725997924805, "logits/rejected": 7.5432515144348145, "logps/chosen": -322.0111999511719, "logps/rejected": -283.08660888671875, "loss": 0.7439, "rewards/accuracies": 0.375, "rewards/chosen": 0.212103933095932, "rewards/margins": -0.03596443682909012, "rewards/rejected": 0.24806839227676392, "step": 1602 }, { "epoch": 0.24790257104194857, "grad_norm": 5.018898010253906, "learning_rate": 4.131443298969072e-06, "logits/chosen": 8.799638748168945, "logits/rejected": 11.236151695251465, "logps/chosen": -209.7759246826172, "logps/rejected": -249.03305053710938, "loss": 0.6216, "rewards/accuracies": 0.625, "rewards/chosen": 0.3298637270927429, "rewards/margins": 0.18953871726989746, "rewards/rejected": 0.14032503962516785, "step": 1603 }, { "epoch": 0.24805722018171275, "grad_norm": 6.016971111297607, "learning_rate": 4.134020618556701e-06, "logits/chosen": 7.859749794006348, "logits/rejected": 11.439133644104004, "logps/chosen": -217.74844360351562, "logps/rejected": -226.37728881835938, "loss": 0.6472, "rewards/accuracies": 0.75, "rewards/chosen": 0.35840949416160583, "rewards/margins": 0.13885095715522766, "rewards/rejected": 0.21955853700637817, "step": 1604 }, { "epoch": 0.2482118693214769, "grad_norm": 6.286149024963379, "learning_rate": 4.13659793814433e-06, "logits/chosen": 10.11738109588623, "logits/rejected": 7.317766189575195, "logps/chosen": -195.7477569580078, "logps/rejected": -173.75634765625, "loss": 0.7227, "rewards/accuracies": 0.5, "rewards/chosen": 0.17697674036026, "rewards/margins": -0.02410898357629776, "rewards/rejected": 0.20108571648597717, "step": 1605 }, { "epoch": 0.24836651846124105, "grad_norm": 6.834414958953857, "learning_rate": 4.139175257731959e-06, "logits/chosen": 8.190263748168945, "logits/rejected": 7.97369384765625, "logps/chosen": -357.20166015625, "logps/rejected": -356.8304443359375, "loss": 0.6895, "rewards/accuracies": 0.5, "rewards/chosen": 0.43308526277542114, "rewards/margins": 0.13444869220256805, "rewards/rejected": 0.2986365556716919, "step": 1606 }, { "epoch": 0.24852116760100523, "grad_norm": 5.023197650909424, "learning_rate": 4.141752577319588e-06, "logits/chosen": 7.65443754196167, "logits/rejected": 8.107499122619629, "logps/chosen": -194.73854064941406, "logps/rejected": -196.713623046875, "loss": 0.6928, "rewards/accuracies": 0.375, "rewards/chosen": 0.29810890555381775, "rewards/margins": 0.02460518479347229, "rewards/rejected": 0.27350372076034546, "step": 1607 }, { "epoch": 0.24867581674076938, "grad_norm": 5.9183831214904785, "learning_rate": 4.1443298969072175e-06, "logits/chosen": 11.174837112426758, "logits/rejected": 8.177064895629883, "logps/chosen": -351.370849609375, "logps/rejected": -272.376220703125, "loss": 0.6632, "rewards/accuracies": 0.5, "rewards/chosen": 0.32531893253326416, "rewards/margins": 0.1618170589208603, "rewards/rejected": 0.1635018289089203, "step": 1608 }, { "epoch": 0.24883046588053354, "grad_norm": 8.947794914245605, "learning_rate": 4.146907216494846e-06, "logits/chosen": 10.086214065551758, "logits/rejected": 5.1015400886535645, "logps/chosen": -352.83734130859375, "logps/rejected": -262.85809326171875, "loss": 0.8886, "rewards/accuracies": 0.125, "rewards/chosen": 0.2161579132080078, "rewards/margins": -0.3356893062591553, "rewards/rejected": 0.5518472194671631, "step": 1609 }, { "epoch": 0.2489851150202977, "grad_norm": 6.343766212463379, "learning_rate": 4.149484536082475e-06, "logits/chosen": 12.456357955932617, "logits/rejected": 9.871862411499023, "logps/chosen": -263.312744140625, "logps/rejected": -190.8375244140625, "loss": 0.7309, "rewards/accuracies": 0.375, "rewards/chosen": 0.43510711193084717, "rewards/margins": -0.004523627460002899, "rewards/rejected": 0.43963077664375305, "step": 1610 }, { "epoch": 0.24913976416006187, "grad_norm": 4.517666816711426, "learning_rate": 4.152061855670104e-06, "logits/chosen": 6.7022552490234375, "logits/rejected": 5.9243669509887695, "logps/chosen": -301.82916259765625, "logps/rejected": -282.73187255859375, "loss": 0.617, "rewards/accuracies": 0.75, "rewards/chosen": 0.5187374949455261, "rewards/margins": 0.2564624845981598, "rewards/rejected": 0.2622750401496887, "step": 1611 }, { "epoch": 0.24929441329982602, "grad_norm": 4.882069110870361, "learning_rate": 4.154639175257732e-06, "logits/chosen": 9.98108196258545, "logits/rejected": 7.616365432739258, "logps/chosen": -230.05496215820312, "logps/rejected": -234.039306640625, "loss": 0.642, "rewards/accuracies": 0.5, "rewards/chosen": 0.49911755323410034, "rewards/margins": 0.19729860126972198, "rewards/rejected": 0.30181896686553955, "step": 1612 }, { "epoch": 0.24944906243959017, "grad_norm": 83.67471313476562, "learning_rate": 4.1572164948453614e-06, "logits/chosen": 10.120527267456055, "logits/rejected": 8.16556167602539, "logps/chosen": -299.514892578125, "logps/rejected": -271.455810546875, "loss": 0.6731, "rewards/accuracies": 0.625, "rewards/chosen": 0.421031653881073, "rewards/margins": 0.13261628150939941, "rewards/rejected": 0.2884153723716736, "step": 1613 }, { "epoch": 0.24960371157935435, "grad_norm": 5.759856224060059, "learning_rate": 4.15979381443299e-06, "logits/chosen": 11.615106582641602, "logits/rejected": 10.498109817504883, "logps/chosen": -306.879638671875, "logps/rejected": -309.7626647949219, "loss": 0.7423, "rewards/accuracies": 0.5, "rewards/chosen": 0.590875506401062, "rewards/margins": -0.003452412784099579, "rewards/rejected": 0.5943279266357422, "step": 1614 }, { "epoch": 0.2497583607191185, "grad_norm": 5.235974311828613, "learning_rate": 4.162371134020619e-06, "logits/chosen": 7.874249458312988, "logits/rejected": 4.454874038696289, "logps/chosen": -294.3907775878906, "logps/rejected": -220.292724609375, "loss": 0.5762, "rewards/accuracies": 0.75, "rewards/chosen": 0.6139842867851257, "rewards/margins": 0.27210158109664917, "rewards/rejected": 0.34188273549079895, "step": 1615 }, { "epoch": 0.24991300985888265, "grad_norm": 5.683008670806885, "learning_rate": 4.164948453608248e-06, "logits/chosen": 12.201066970825195, "logits/rejected": 8.841693878173828, "logps/chosen": -486.6582946777344, "logps/rejected": -356.3330078125, "loss": 0.5924, "rewards/accuracies": 0.5, "rewards/chosen": 0.7200773358345032, "rewards/margins": 0.32216161489486694, "rewards/rejected": 0.3979157507419586, "step": 1616 }, { "epoch": 0.2500676589986468, "grad_norm": 5.7631731033325195, "learning_rate": 4.167525773195877e-06, "logits/chosen": 4.660618782043457, "logits/rejected": 0.7919836044311523, "logps/chosen": -215.94320678710938, "logps/rejected": -204.84751892089844, "loss": 0.7062, "rewards/accuracies": 0.5, "rewards/chosen": 0.22886334359645844, "rewards/margins": 0.019197996705770493, "rewards/rejected": 0.20966535806655884, "step": 1617 }, { "epoch": 0.25022230813841095, "grad_norm": 8.097609519958496, "learning_rate": 4.170103092783505e-06, "logits/chosen": 8.240747451782227, "logits/rejected": 10.663824081420898, "logps/chosen": -277.2115478515625, "logps/rejected": -318.309326171875, "loss": 0.7297, "rewards/accuracies": 0.25, "rewards/chosen": 0.40421128273010254, "rewards/margins": -0.0374966636300087, "rewards/rejected": 0.44170790910720825, "step": 1618 }, { "epoch": 0.25037695727817516, "grad_norm": 4.644769191741943, "learning_rate": 4.172680412371135e-06, "logits/chosen": 13.678262710571289, "logits/rejected": 2.361966133117676, "logps/chosen": -355.73724365234375, "logps/rejected": -234.37550354003906, "loss": 0.5003, "rewards/accuracies": 0.875, "rewards/chosen": 0.40933820605278015, "rewards/margins": 0.539950430393219, "rewards/rejected": -0.13061223924160004, "step": 1619 }, { "epoch": 0.2505316064179393, "grad_norm": 7.742588996887207, "learning_rate": 4.175257731958763e-06, "logits/chosen": 9.044269561767578, "logits/rejected": 2.665898084640503, "logps/chosen": -373.65533447265625, "logps/rejected": -328.0936279296875, "loss": 0.7179, "rewards/accuracies": 0.375, "rewards/chosen": 0.44949454069137573, "rewards/margins": 0.12218938022851944, "rewards/rejected": 0.3273051977157593, "step": 1620 }, { "epoch": 0.25068625555770346, "grad_norm": 4.420767784118652, "learning_rate": 4.177835051546392e-06, "logits/chosen": 9.100753784179688, "logits/rejected": 9.952964782714844, "logps/chosen": -195.91946411132812, "logps/rejected": -197.62229919433594, "loss": 0.6165, "rewards/accuracies": 0.625, "rewards/chosen": 0.0967133492231369, "rewards/margins": 0.1918657422065735, "rewards/rejected": -0.09515240788459778, "step": 1621 }, { "epoch": 0.2508409046974676, "grad_norm": 6.149387359619141, "learning_rate": 4.180412371134021e-06, "logits/chosen": 5.239181995391846, "logits/rejected": 5.390114784240723, "logps/chosen": -296.5237121582031, "logps/rejected": -221.3077850341797, "loss": 0.6636, "rewards/accuracies": 0.5, "rewards/chosen": 0.31191134452819824, "rewards/margins": 0.16515719890594482, "rewards/rejected": 0.14675411581993103, "step": 1622 }, { "epoch": 0.25099555383723177, "grad_norm": 4.210749626159668, "learning_rate": 4.18298969072165e-06, "logits/chosen": 10.543679237365723, "logits/rejected": 10.2120361328125, "logps/chosen": -275.5555419921875, "logps/rejected": -276.81646728515625, "loss": 0.6211, "rewards/accuracies": 0.625, "rewards/chosen": 0.528279721736908, "rewards/margins": 0.18649163842201233, "rewards/rejected": 0.34178805351257324, "step": 1623 }, { "epoch": 0.2511502029769959, "grad_norm": 17.685640335083008, "learning_rate": 4.185567010309279e-06, "logits/chosen": 7.824277877807617, "logits/rejected": 13.62466812133789, "logps/chosen": -158.25189208984375, "logps/rejected": -357.7012939453125, "loss": 0.7004, "rewards/accuracies": 0.5, "rewards/chosen": 0.4482974112033844, "rewards/margins": 0.056787874549627304, "rewards/rejected": 0.3915095329284668, "step": 1624 }, { "epoch": 0.2513048521167601, "grad_norm": 5.255959987640381, "learning_rate": 4.188144329896908e-06, "logits/chosen": 2.517392635345459, "logits/rejected": 4.677642822265625, "logps/chosen": -170.2711181640625, "logps/rejected": -164.18605041503906, "loss": 0.8421, "rewards/accuracies": 0.25, "rewards/chosen": 0.19714275002479553, "rewards/margins": -0.2503814697265625, "rewards/rejected": 0.44752421975135803, "step": 1625 }, { "epoch": 0.2514595012565243, "grad_norm": 5.370728969573975, "learning_rate": 4.190721649484536e-06, "logits/chosen": 9.427894592285156, "logits/rejected": 3.8264122009277344, "logps/chosen": -317.9552001953125, "logps/rejected": -252.83407592773438, "loss": 0.6689, "rewards/accuracies": 0.625, "rewards/chosen": 0.5245187878608704, "rewards/margins": 0.14788249135017395, "rewards/rejected": 0.3766363263130188, "step": 1626 }, { "epoch": 0.2516141503962884, "grad_norm": 7.949696063995361, "learning_rate": 4.193298969072166e-06, "logits/chosen": 12.46045970916748, "logits/rejected": 10.630002975463867, "logps/chosen": -449.3725280761719, "logps/rejected": -302.653564453125, "loss": 0.6417, "rewards/accuracies": 0.625, "rewards/chosen": 0.06579238176345825, "rewards/margins": 0.2672802209854126, "rewards/rejected": -0.20148782432079315, "step": 1627 }, { "epoch": 0.2517687995360526, "grad_norm": 5.2793965339660645, "learning_rate": 4.195876288659794e-06, "logits/chosen": 9.032638549804688, "logits/rejected": 3.4847865104675293, "logps/chosen": -363.011962890625, "logps/rejected": -237.36341857910156, "loss": 0.6662, "rewards/accuracies": 0.625, "rewards/chosen": 0.3056390881538391, "rewards/margins": 0.1222473755478859, "rewards/rejected": 0.1833917200565338, "step": 1628 }, { "epoch": 0.25192344867581673, "grad_norm": 4.780789852142334, "learning_rate": 4.198453608247423e-06, "logits/chosen": 2.772305488586426, "logits/rejected": 10.86107349395752, "logps/chosen": -136.4967498779297, "logps/rejected": -205.36329650878906, "loss": 0.685, "rewards/accuracies": 0.625, "rewards/chosen": 0.30957695841789246, "rewards/margins": 0.06535394489765167, "rewards/rejected": 0.2442229986190796, "step": 1629 }, { "epoch": 0.2520780978155809, "grad_norm": 5.505533695220947, "learning_rate": 4.201030927835052e-06, "logits/chosen": 5.628686904907227, "logits/rejected": 8.052618026733398, "logps/chosen": -294.0024108886719, "logps/rejected": -278.51141357421875, "loss": 0.7394, "rewards/accuracies": 0.375, "rewards/chosen": 0.30035290122032166, "rewards/margins": -0.0683484673500061, "rewards/rejected": 0.36870136857032776, "step": 1630 }, { "epoch": 0.2522327469553451, "grad_norm": 6.067412376403809, "learning_rate": 4.203608247422681e-06, "logits/chosen": 3.856903553009033, "logits/rejected": 9.425829887390137, "logps/chosen": -223.6259765625, "logps/rejected": -271.5736083984375, "loss": 0.5983, "rewards/accuracies": 0.75, "rewards/chosen": 0.28897133469581604, "rewards/margins": 0.2328396588563919, "rewards/rejected": 0.056131646037101746, "step": 1631 }, { "epoch": 0.25238739609510924, "grad_norm": 5.93890380859375, "learning_rate": 4.2061855670103096e-06, "logits/chosen": 0.9251695871353149, "logits/rejected": 7.725450038909912, "logps/chosen": -178.5623779296875, "logps/rejected": -275.084716796875, "loss": 0.7078, "rewards/accuracies": 0.625, "rewards/chosen": 0.32795462012290955, "rewards/margins": -0.009829465299844742, "rewards/rejected": 0.3377840518951416, "step": 1632 }, { "epoch": 0.2525420452348734, "grad_norm": 4.634039402008057, "learning_rate": 4.208762886597939e-06, "logits/chosen": 10.779047012329102, "logits/rejected": 3.873975992202759, "logps/chosen": -457.6269226074219, "logps/rejected": -299.7712097167969, "loss": 0.4737, "rewards/accuracies": 0.875, "rewards/chosen": 0.7059726715087891, "rewards/margins": 0.5996737480163574, "rewards/rejected": 0.10629893094301224, "step": 1633 }, { "epoch": 0.25269669437463754, "grad_norm": 4.973537921905518, "learning_rate": 4.211340206185567e-06, "logits/chosen": 11.1174955368042, "logits/rejected": 9.119796752929688, "logps/chosen": -235.68455505371094, "logps/rejected": -231.4317626953125, "loss": 0.6717, "rewards/accuracies": 0.625, "rewards/chosen": 0.34784913063049316, "rewards/margins": 0.09272685647010803, "rewards/rejected": 0.2551223039627075, "step": 1634 }, { "epoch": 0.2528513435144017, "grad_norm": 4.96306037902832, "learning_rate": 4.213917525773197e-06, "logits/chosen": 11.122123718261719, "logits/rejected": 3.534989356994629, "logps/chosen": -298.7386474609375, "logps/rejected": -162.73345947265625, "loss": 0.781, "rewards/accuracies": 0.5, "rewards/chosen": 0.1640453338623047, "rewards/margins": -0.07453343272209167, "rewards/rejected": 0.23857876658439636, "step": 1635 }, { "epoch": 0.25300599265416585, "grad_norm": 6.668501853942871, "learning_rate": 4.216494845360825e-06, "logits/chosen": 9.434202194213867, "logits/rejected": 11.486614227294922, "logps/chosen": -228.00320434570312, "logps/rejected": -258.9673767089844, "loss": 0.7452, "rewards/accuracies": 0.375, "rewards/chosen": 0.11207906901836395, "rewards/margins": -0.08272166550159454, "rewards/rejected": 0.1948007494211197, "step": 1636 }, { "epoch": 0.25316064179393, "grad_norm": 10.877275466918945, "learning_rate": 4.219072164948454e-06, "logits/chosen": 10.078407287597656, "logits/rejected": 5.6256422996521, "logps/chosen": -330.3979187011719, "logps/rejected": -288.82073974609375, "loss": 0.6321, "rewards/accuracies": 0.75, "rewards/chosen": 0.2939951419830322, "rewards/margins": 0.22671625018119812, "rewards/rejected": 0.0672789141535759, "step": 1637 }, { "epoch": 0.2533152909336942, "grad_norm": 6.319427013397217, "learning_rate": 4.221649484536083e-06, "logits/chosen": 7.259459495544434, "logits/rejected": 7.920317649841309, "logps/chosen": -268.65576171875, "logps/rejected": -293.05499267578125, "loss": 0.669, "rewards/accuracies": 0.625, "rewards/chosen": 0.3166123628616333, "rewards/margins": 0.06800252199172974, "rewards/rejected": 0.24860984086990356, "step": 1638 }, { "epoch": 0.25346994007345836, "grad_norm": 8.284077644348145, "learning_rate": 4.224226804123711e-06, "logits/chosen": 11.431381225585938, "logits/rejected": 13.871949195861816, "logps/chosen": -375.19586181640625, "logps/rejected": -450.5081481933594, "loss": 0.8782, "rewards/accuracies": 0.375, "rewards/chosen": 0.4328592121601105, "rewards/margins": -0.18595139682292938, "rewards/rejected": 0.6188106536865234, "step": 1639 }, { "epoch": 0.2536245892132225, "grad_norm": 6.5860185623168945, "learning_rate": 4.2268041237113405e-06, "logits/chosen": 14.755675315856934, "logits/rejected": 9.009217262268066, "logps/chosen": -259.04412841796875, "logps/rejected": -217.031005859375, "loss": 0.5198, "rewards/accuracies": 0.75, "rewards/chosen": 0.486317902803421, "rewards/margins": 0.48018601536750793, "rewards/rejected": 0.0061319246888160706, "step": 1640 }, { "epoch": 0.25377923835298666, "grad_norm": 5.292786598205566, "learning_rate": 4.229381443298969e-06, "logits/chosen": 12.739208221435547, "logits/rejected": 10.681746482849121, "logps/chosen": -283.26129150390625, "logps/rejected": -278.7433776855469, "loss": 0.7217, "rewards/accuracies": 0.25, "rewards/chosen": 0.2046753317117691, "rewards/margins": 0.019237905740737915, "rewards/rejected": 0.18543744087219238, "step": 1641 }, { "epoch": 0.2539338874927508, "grad_norm": 7.914367198944092, "learning_rate": 4.231958762886598e-06, "logits/chosen": 8.906744956970215, "logits/rejected": 2.5963854789733887, "logps/chosen": -577.107666015625, "logps/rejected": -331.62530517578125, "loss": 0.7303, "rewards/accuracies": 0.375, "rewards/chosen": 0.3582886755466461, "rewards/margins": -0.004259783774614334, "rewards/rejected": 0.36254844069480896, "step": 1642 }, { "epoch": 0.25408853663251496, "grad_norm": 7.514982223510742, "learning_rate": 4.234536082474227e-06, "logits/chosen": 11.91797924041748, "logits/rejected": 6.141276836395264, "logps/chosen": -412.68017578125, "logps/rejected": -341.97161865234375, "loss": 0.6345, "rewards/accuracies": 0.75, "rewards/chosen": 0.5013439655303955, "rewards/margins": 0.20196658372879028, "rewards/rejected": 0.29937744140625, "step": 1643 }, { "epoch": 0.25424318577227917, "grad_norm": 6.684088706970215, "learning_rate": 4.237113402061856e-06, "logits/chosen": 11.187084197998047, "logits/rejected": 0.9069557189941406, "logps/chosen": -429.87554931640625, "logps/rejected": -237.06298828125, "loss": 0.7121, "rewards/accuracies": 0.625, "rewards/chosen": 0.22557444870471954, "rewards/margins": -0.005273483693599701, "rewards/rejected": 0.23084792494773865, "step": 1644 }, { "epoch": 0.2543978349120433, "grad_norm": 7.095033645629883, "learning_rate": 4.2396907216494845e-06, "logits/chosen": 11.935456275939941, "logits/rejected": 10.36489200592041, "logps/chosen": -277.1987609863281, "logps/rejected": -232.415771484375, "loss": 0.834, "rewards/accuracies": 0.25, "rewards/chosen": 0.11732234060764313, "rewards/margins": -0.24008171260356903, "rewards/rejected": 0.35740405321121216, "step": 1645 }, { "epoch": 0.25455248405180747, "grad_norm": 3.8879542350769043, "learning_rate": 4.242268041237114e-06, "logits/chosen": 12.882658958435059, "logits/rejected": 8.600262641906738, "logps/chosen": -179.5771942138672, "logps/rejected": -165.47805786132812, "loss": 0.6328, "rewards/accuracies": 0.75, "rewards/chosen": 0.32318249344825745, "rewards/margins": 0.19653114676475525, "rewards/rejected": 0.126651331782341, "step": 1646 }, { "epoch": 0.2547071331915716, "grad_norm": 6.627065181732178, "learning_rate": 4.244845360824742e-06, "logits/chosen": 6.602793216705322, "logits/rejected": 5.097395896911621, "logps/chosen": -158.929443359375, "logps/rejected": -182.35226440429688, "loss": 0.6806, "rewards/accuracies": 0.625, "rewards/chosen": 0.37746188044548035, "rewards/margins": 0.06501216441392899, "rewards/rejected": 0.31244972348213196, "step": 1647 }, { "epoch": 0.2548617823313358, "grad_norm": 6.520936489105225, "learning_rate": 4.2474226804123715e-06, "logits/chosen": 8.824220657348633, "logits/rejected": 14.6446533203125, "logps/chosen": -217.00958251953125, "logps/rejected": -292.0254821777344, "loss": 0.6742, "rewards/accuracies": 0.5, "rewards/chosen": 0.5182183980941772, "rewards/margins": 0.17121267318725586, "rewards/rejected": 0.347005695104599, "step": 1648 }, { "epoch": 0.2550164314710999, "grad_norm": 5.7352728843688965, "learning_rate": 4.25e-06, "logits/chosen": 10.118045806884766, "logits/rejected": 10.122156143188477, "logps/chosen": -234.01589965820312, "logps/rejected": -286.669677734375, "loss": 0.7225, "rewards/accuracies": 0.5, "rewards/chosen": 0.3616633415222168, "rewards/margins": 0.03921976685523987, "rewards/rejected": 0.32244354486465454, "step": 1649 }, { "epoch": 0.2551710806108641, "grad_norm": 5.220893383026123, "learning_rate": 4.252577319587629e-06, "logits/chosen": 11.87497615814209, "logits/rejected": 8.589349746704102, "logps/chosen": -211.21163940429688, "logps/rejected": -181.89675903320312, "loss": 0.6012, "rewards/accuracies": 0.875, "rewards/chosen": 0.4932010769844055, "rewards/margins": 0.21084357798099518, "rewards/rejected": 0.2823575437068939, "step": 1650 }, { "epoch": 0.2553257297506283, "grad_norm": 7.583624362945557, "learning_rate": 4.255154639175258e-06, "logits/chosen": 7.972025394439697, "logits/rejected": 6.4408159255981445, "logps/chosen": -251.74795532226562, "logps/rejected": -201.86666870117188, "loss": 0.8614, "rewards/accuracies": 0.375, "rewards/chosen": 0.28842028975486755, "rewards/margins": -0.2272483855485916, "rewards/rejected": 0.5156686902046204, "step": 1651 }, { "epoch": 0.25548037889039243, "grad_norm": 5.173532962799072, "learning_rate": 4.257731958762887e-06, "logits/chosen": 6.8554182052612305, "logits/rejected": 2.7411251068115234, "logps/chosen": -272.21905517578125, "logps/rejected": -204.11715698242188, "loss": 0.7193, "rewards/accuracies": 0.375, "rewards/chosen": 0.3607516288757324, "rewards/margins": 0.019206956028938293, "rewards/rejected": 0.3415446877479553, "step": 1652 }, { "epoch": 0.2556350280301566, "grad_norm": 5.8570122718811035, "learning_rate": 4.2603092783505155e-06, "logits/chosen": 8.766138076782227, "logits/rejected": 9.980649948120117, "logps/chosen": -356.2545166015625, "logps/rejected": -280.7750244140625, "loss": 0.6481, "rewards/accuracies": 0.5, "rewards/chosen": 0.3571777939796448, "rewards/margins": 0.11741085350513458, "rewards/rejected": 0.23976698517799377, "step": 1653 }, { "epoch": 0.25578967716992074, "grad_norm": 5.639509201049805, "learning_rate": 4.262886597938145e-06, "logits/chosen": 6.747480869293213, "logits/rejected": 2.5935261249542236, "logps/chosen": -275.094482421875, "logps/rejected": -267.2625427246094, "loss": 0.6513, "rewards/accuracies": 0.625, "rewards/chosen": 0.37122949957847595, "rewards/margins": 0.1012352854013443, "rewards/rejected": 0.26999419927597046, "step": 1654 }, { "epoch": 0.2559443263096849, "grad_norm": 4.119788646697998, "learning_rate": 4.265463917525773e-06, "logits/chosen": 13.585538864135742, "logits/rejected": 9.719527244567871, "logps/chosen": -247.7025146484375, "logps/rejected": -235.82984924316406, "loss": 0.5361, "rewards/accuracies": 0.75, "rewards/chosen": 0.6978378295898438, "rewards/margins": 0.3924921154975891, "rewards/rejected": 0.30534565448760986, "step": 1655 }, { "epoch": 0.25609897544944904, "grad_norm": 4.179389476776123, "learning_rate": 4.2680412371134025e-06, "logits/chosen": 13.3646240234375, "logits/rejected": 6.416746139526367, "logps/chosen": -302.365478515625, "logps/rejected": -181.3973388671875, "loss": 0.5955, "rewards/accuracies": 0.75, "rewards/chosen": 0.5110149383544922, "rewards/margins": 0.23698779940605164, "rewards/rejected": 0.27402716875076294, "step": 1656 }, { "epoch": 0.25625362458921325, "grad_norm": 11.048225402832031, "learning_rate": 4.270618556701031e-06, "logits/chosen": 5.747959136962891, "logits/rejected": 6.425881385803223, "logps/chosen": -410.8418273925781, "logps/rejected": -422.99749755859375, "loss": 0.6099, "rewards/accuracies": 0.625, "rewards/chosen": 0.3581824004650116, "rewards/margins": 0.2130957990884781, "rewards/rejected": 0.1450866162776947, "step": 1657 }, { "epoch": 0.2564082737289774, "grad_norm": 4.898638725280762, "learning_rate": 4.27319587628866e-06, "logits/chosen": 11.426324844360352, "logits/rejected": 4.630359172821045, "logps/chosen": -333.5100402832031, "logps/rejected": -225.53717041015625, "loss": 0.6479, "rewards/accuracies": 0.75, "rewards/chosen": 0.4307011663913727, "rewards/margins": 0.12580576539039612, "rewards/rejected": 0.30489540100097656, "step": 1658 }, { "epoch": 0.25656292286874155, "grad_norm": 6.974332809448242, "learning_rate": 4.275773195876289e-06, "logits/chosen": 11.037900924682617, "logits/rejected": 9.508896827697754, "logps/chosen": -236.75973510742188, "logps/rejected": -258.734375, "loss": 0.7501, "rewards/accuracies": 0.375, "rewards/chosen": 0.5625379085540771, "rewards/margins": 0.05542746186256409, "rewards/rejected": 0.5071104764938354, "step": 1659 }, { "epoch": 0.2567175720085057, "grad_norm": 4.777845859527588, "learning_rate": 4.278350515463918e-06, "logits/chosen": 12.794593811035156, "logits/rejected": 14.51907730102539, "logps/chosen": -234.52081298828125, "logps/rejected": -265.2019958496094, "loss": 0.5942, "rewards/accuracies": 0.625, "rewards/chosen": 0.5771160125732422, "rewards/margins": 0.2502594590187073, "rewards/rejected": 0.3268565535545349, "step": 1660 }, { "epoch": 0.25687222114826985, "grad_norm": 9.301328659057617, "learning_rate": 4.2809278350515464e-06, "logits/chosen": 6.797787189483643, "logits/rejected": 0.6090431213378906, "logps/chosen": -408.44281005859375, "logps/rejected": -304.8644714355469, "loss": 0.6822, "rewards/accuracies": 0.375, "rewards/chosen": 0.2010902464389801, "rewards/margins": 0.07193639874458313, "rewards/rejected": 0.12915386259555817, "step": 1661 }, { "epoch": 0.257026870288034, "grad_norm": 5.022156715393066, "learning_rate": 4.283505154639176e-06, "logits/chosen": 9.151747703552246, "logits/rejected": 8.361489295959473, "logps/chosen": -152.9624481201172, "logps/rejected": -217.64102172851562, "loss": 0.7814, "rewards/accuracies": 0.375, "rewards/chosen": 0.3426362872123718, "rewards/margins": -0.09035389125347137, "rewards/rejected": 0.4329901933670044, "step": 1662 }, { "epoch": 0.25718151942779816, "grad_norm": 6.543351650238037, "learning_rate": 4.286082474226804e-06, "logits/chosen": 4.54701566696167, "logits/rejected": 3.7109899520874023, "logps/chosen": -293.29534912109375, "logps/rejected": -317.0552062988281, "loss": 0.7445, "rewards/accuracies": 0.5, "rewards/chosen": 0.5780367851257324, "rewards/margins": -0.05578899383544922, "rewards/rejected": 0.6338257789611816, "step": 1663 }, { "epoch": 0.25733616856756236, "grad_norm": 5.040768146514893, "learning_rate": 4.2886597938144335e-06, "logits/chosen": 8.072590827941895, "logits/rejected": 7.120708465576172, "logps/chosen": -211.13418579101562, "logps/rejected": -194.79547119140625, "loss": 0.6281, "rewards/accuracies": 0.625, "rewards/chosen": 0.49554014205932617, "rewards/margins": 0.20108658075332642, "rewards/rejected": 0.29445356130599976, "step": 1664 }, { "epoch": 0.2574908177073265, "grad_norm": 7.257230281829834, "learning_rate": 4.291237113402062e-06, "logits/chosen": 10.890982627868652, "logits/rejected": 5.684013843536377, "logps/chosen": -232.88424682617188, "logps/rejected": -133.4580078125, "loss": 0.8374, "rewards/accuracies": 0.25, "rewards/chosen": 0.12326832860708237, "rewards/margins": -0.23687195777893066, "rewards/rejected": 0.36014029383659363, "step": 1665 }, { "epoch": 0.25764546684709067, "grad_norm": 6.2694268226623535, "learning_rate": 4.293814432989691e-06, "logits/chosen": 10.840110778808594, "logits/rejected": 8.990663528442383, "logps/chosen": -203.58351135253906, "logps/rejected": -160.192138671875, "loss": 0.7588, "rewards/accuracies": 0.5, "rewards/chosen": 0.26135969161987305, "rewards/margins": -0.06229672580957413, "rewards/rejected": 0.3236564099788666, "step": 1666 }, { "epoch": 0.2578001159868548, "grad_norm": 4.647719860076904, "learning_rate": 4.29639175257732e-06, "logits/chosen": 3.5154929161071777, "logits/rejected": 9.406538009643555, "logps/chosen": -167.55267333984375, "logps/rejected": -214.77041625976562, "loss": 0.674, "rewards/accuracies": 0.5, "rewards/chosen": 0.1570068895816803, "rewards/margins": 0.09085395187139511, "rewards/rejected": 0.06615294516086578, "step": 1667 }, { "epoch": 0.25795476512661897, "grad_norm": 10.691215515136719, "learning_rate": 4.298969072164949e-06, "logits/chosen": 8.109671592712402, "logits/rejected": 11.143040657043457, "logps/chosen": -233.75888061523438, "logps/rejected": -306.23980712890625, "loss": 0.6186, "rewards/accuracies": 0.625, "rewards/chosen": 0.3036283254623413, "rewards/margins": 0.21426591277122498, "rewards/rejected": 0.08936242759227753, "step": 1668 }, { "epoch": 0.2581094142663831, "grad_norm": 6.247630596160889, "learning_rate": 4.301546391752578e-06, "logits/chosen": 6.889999866485596, "logits/rejected": 3.4866364002227783, "logps/chosen": -317.400634765625, "logps/rejected": -247.3701171875, "loss": 0.7172, "rewards/accuracies": 0.375, "rewards/chosen": 0.2345121055841446, "rewards/margins": -0.020523402839899063, "rewards/rejected": 0.25503548979759216, "step": 1669 }, { "epoch": 0.2582640634061473, "grad_norm": 7.309544563293457, "learning_rate": 4.304123711340207e-06, "logits/chosen": 12.075065612792969, "logits/rejected": 9.863668441772461, "logps/chosen": -461.16357421875, "logps/rejected": -365.98004150390625, "loss": 0.6289, "rewards/accuracies": 0.625, "rewards/chosen": 0.5054642558097839, "rewards/margins": 0.2024945318698883, "rewards/rejected": 0.30296972393989563, "step": 1670 }, { "epoch": 0.2584187125459115, "grad_norm": 5.27731990814209, "learning_rate": 4.306701030927836e-06, "logits/chosen": 10.16232681274414, "logits/rejected": 12.613861083984375, "logps/chosen": -293.225341796875, "logps/rejected": -207.11866760253906, "loss": 0.7588, "rewards/accuracies": 0.25, "rewards/chosen": 0.4922961890697479, "rewards/margins": -0.09473396092653275, "rewards/rejected": 0.5870301723480225, "step": 1671 }, { "epoch": 0.25857336168567563, "grad_norm": 3.647665023803711, "learning_rate": 4.3092783505154644e-06, "logits/chosen": 9.743517875671387, "logits/rejected": 5.776437759399414, "logps/chosen": -228.10964965820312, "logps/rejected": -208.38943481445312, "loss": 0.5856, "rewards/accuracies": 0.875, "rewards/chosen": 0.37795019149780273, "rewards/margins": 0.2556622624397278, "rewards/rejected": 0.12228794395923615, "step": 1672 }, { "epoch": 0.2587280108254398, "grad_norm": 7.569202899932861, "learning_rate": 4.311855670103094e-06, "logits/chosen": 10.79423713684082, "logits/rejected": 8.504213333129883, "logps/chosen": -243.462646484375, "logps/rejected": -225.04257202148438, "loss": 0.7559, "rewards/accuracies": 0.5, "rewards/chosen": 0.018120139837265015, "rewards/margins": -0.08692573010921478, "rewards/rejected": 0.1050458699464798, "step": 1673 }, { "epoch": 0.25888265996520393, "grad_norm": 44.99276351928711, "learning_rate": 4.314432989690722e-06, "logits/chosen": 14.782356262207031, "logits/rejected": 7.280447483062744, "logps/chosen": -210.545166015625, "logps/rejected": -175.98812866210938, "loss": 0.6504, "rewards/accuracies": 0.5, "rewards/chosen": -0.01798591949045658, "rewards/margins": 0.16718244552612305, "rewards/rejected": -0.18516835570335388, "step": 1674 }, { "epoch": 0.2590373091049681, "grad_norm": 5.283841133117676, "learning_rate": 4.3170103092783515e-06, "logits/chosen": 12.421058654785156, "logits/rejected": 8.152573585510254, "logps/chosen": -263.1407775878906, "logps/rejected": -192.9669647216797, "loss": 0.6822, "rewards/accuracies": 0.625, "rewards/chosen": 0.19494442641735077, "rewards/margins": 0.02758745476603508, "rewards/rejected": 0.1673569679260254, "step": 1675 }, { "epoch": 0.2591919582447323, "grad_norm": 5.319368839263916, "learning_rate": 4.31958762886598e-06, "logits/chosen": 8.562289237976074, "logits/rejected": 10.118972778320312, "logps/chosen": -312.8533935546875, "logps/rejected": -229.1541748046875, "loss": 0.661, "rewards/accuracies": 0.5, "rewards/chosen": 0.4450657367706299, "rewards/margins": 0.13138996064662933, "rewards/rejected": 0.31367579102516174, "step": 1676 }, { "epoch": 0.25934660738449644, "grad_norm": 5.762267112731934, "learning_rate": 4.322164948453608e-06, "logits/chosen": 12.108770370483398, "logits/rejected": 6.452144622802734, "logps/chosen": -373.93341064453125, "logps/rejected": -272.74200439453125, "loss": 0.5323, "rewards/accuracies": 1.0, "rewards/chosen": 0.47330552339553833, "rewards/margins": 0.3820696473121643, "rewards/rejected": 0.09123587608337402, "step": 1677 }, { "epoch": 0.2595012565242606, "grad_norm": 5.103640556335449, "learning_rate": 4.324742268041238e-06, "logits/chosen": 15.442018508911133, "logits/rejected": 5.498007297515869, "logps/chosen": -192.68496704101562, "logps/rejected": -119.8779067993164, "loss": 0.7747, "rewards/accuracies": 0.25, "rewards/chosen": 0.3068602979183197, "rewards/margins": -0.13264621794223785, "rewards/rejected": 0.43950650095939636, "step": 1678 }, { "epoch": 0.25965590566402474, "grad_norm": 6.467741012573242, "learning_rate": 4.327319587628866e-06, "logits/chosen": 4.662561416625977, "logits/rejected": 7.070420265197754, "logps/chosen": -190.84994506835938, "logps/rejected": -263.6214599609375, "loss": 0.7372, "rewards/accuracies": 0.5, "rewards/chosen": 0.4969891309738159, "rewards/margins": -0.020562991499900818, "rewards/rejected": 0.5175521373748779, "step": 1679 }, { "epoch": 0.2598105548037889, "grad_norm": 5.728135108947754, "learning_rate": 4.329896907216495e-06, "logits/chosen": 9.04445743560791, "logits/rejected": 9.94682788848877, "logps/chosen": -303.6062316894531, "logps/rejected": -279.8541564941406, "loss": 0.7454, "rewards/accuracies": 0.5, "rewards/chosen": 0.444322407245636, "rewards/margins": -0.039310432970523834, "rewards/rejected": 0.4836328625679016, "step": 1680 }, { "epoch": 0.25996520394355305, "grad_norm": 7.5171799659729, "learning_rate": 4.332474226804124e-06, "logits/chosen": 7.127584934234619, "logits/rejected": 6.6687822341918945, "logps/chosen": -326.9767150878906, "logps/rejected": -347.5093688964844, "loss": 0.6549, "rewards/accuracies": 0.5, "rewards/chosen": 0.5778396725654602, "rewards/margins": 0.191556915640831, "rewards/rejected": 0.386282742023468, "step": 1681 }, { "epoch": 0.2601198530833172, "grad_norm": 7.1069016456604, "learning_rate": 4.335051546391753e-06, "logits/chosen": 11.272130966186523, "logits/rejected": 9.587542533874512, "logps/chosen": -218.4287872314453, "logps/rejected": -246.72174072265625, "loss": 0.7376, "rewards/accuracies": 0.375, "rewards/chosen": 0.16508060693740845, "rewards/margins": -0.0521341934800148, "rewards/rejected": 0.21721479296684265, "step": 1682 }, { "epoch": 0.2602745022230814, "grad_norm": 9.286980628967285, "learning_rate": 4.337628865979382e-06, "logits/chosen": 9.217291831970215, "logits/rejected": 12.489313125610352, "logps/chosen": -252.23968505859375, "logps/rejected": -276.44757080078125, "loss": 0.8488, "rewards/accuracies": 0.375, "rewards/chosen": -0.1445694863796234, "rewards/margins": -0.14328508079051971, "rewards/rejected": -0.0012844055891036987, "step": 1683 }, { "epoch": 0.26042915136284556, "grad_norm": 4.6686553955078125, "learning_rate": 4.340206185567011e-06, "logits/chosen": 11.10117244720459, "logits/rejected": 6.254631519317627, "logps/chosen": -384.20159912109375, "logps/rejected": -287.5000305175781, "loss": 0.5521, "rewards/accuracies": 0.625, "rewards/chosen": 0.558184027671814, "rewards/margins": 0.44858017563819885, "rewards/rejected": 0.1096038818359375, "step": 1684 }, { "epoch": 0.2605838005026097, "grad_norm": 4.128678798675537, "learning_rate": 4.342783505154639e-06, "logits/chosen": 7.401800155639648, "logits/rejected": 4.638880252838135, "logps/chosen": -181.85731506347656, "logps/rejected": -173.14053344726562, "loss": 0.6075, "rewards/accuracies": 0.875, "rewards/chosen": 0.3325316309928894, "rewards/margins": 0.18382640182971954, "rewards/rejected": 0.14870524406433105, "step": 1685 }, { "epoch": 0.26073844964237386, "grad_norm": 5.629097938537598, "learning_rate": 4.345360824742269e-06, "logits/chosen": 12.161772727966309, "logits/rejected": 3.5483956336975098, "logps/chosen": -326.270263671875, "logps/rejected": -203.72274780273438, "loss": 0.5637, "rewards/accuracies": 0.875, "rewards/chosen": 0.6203939318656921, "rewards/margins": 0.31491580605506897, "rewards/rejected": 0.30547815561294556, "step": 1686 }, { "epoch": 0.260893098782138, "grad_norm": 4.986377239227295, "learning_rate": 4.347938144329897e-06, "logits/chosen": 7.248554706573486, "logits/rejected": 3.0901105403900146, "logps/chosen": -230.0486297607422, "logps/rejected": -199.36036682128906, "loss": 0.6536, "rewards/accuracies": 0.375, "rewards/chosen": 0.2983247637748718, "rewards/margins": 0.15213394165039062, "rewards/rejected": 0.1461908370256424, "step": 1687 }, { "epoch": 0.26104774792190216, "grad_norm": 5.384634017944336, "learning_rate": 4.350515463917526e-06, "logits/chosen": 10.910951614379883, "logits/rejected": 7.930371284484863, "logps/chosen": -265.8597106933594, "logps/rejected": -133.46841430664062, "loss": 0.7254, "rewards/accuracies": 0.25, "rewards/chosen": 0.2534804940223694, "rewards/margins": 0.0074874237179756165, "rewards/rejected": 0.24599304795265198, "step": 1688 }, { "epoch": 0.26120239706166637, "grad_norm": 6.221289157867432, "learning_rate": 4.353092783505155e-06, "logits/chosen": 2.6747543811798096, "logits/rejected": 8.717193603515625, "logps/chosen": -171.8345947265625, "logps/rejected": -222.59078979492188, "loss": 0.7153, "rewards/accuracies": 0.5, "rewards/chosen": 0.19862869381904602, "rewards/margins": 0.056714288890361786, "rewards/rejected": 0.14191439747810364, "step": 1689 }, { "epoch": 0.2613570462014305, "grad_norm": 6.161037921905518, "learning_rate": 4.355670103092784e-06, "logits/chosen": 15.940458297729492, "logits/rejected": 7.757214546203613, "logps/chosen": -455.1711730957031, "logps/rejected": -309.407470703125, "loss": 0.5328, "rewards/accuracies": 0.875, "rewards/chosen": 0.7765721678733826, "rewards/margins": 0.3917813301086426, "rewards/rejected": 0.3847908079624176, "step": 1690 }, { "epoch": 0.26151169534119467, "grad_norm": 5.468064785003662, "learning_rate": 4.358247422680413e-06, "logits/chosen": 11.471540451049805, "logits/rejected": 12.728561401367188, "logps/chosen": -262.777587890625, "logps/rejected": -268.7331237792969, "loss": 0.6184, "rewards/accuracies": 0.875, "rewards/chosen": 0.304311603307724, "rewards/margins": 0.16495707631111145, "rewards/rejected": 0.13935452699661255, "step": 1691 }, { "epoch": 0.2616663444809588, "grad_norm": 5.665408134460449, "learning_rate": 4.360824742268042e-06, "logits/chosen": 8.737391471862793, "logits/rejected": 7.977653503417969, "logps/chosen": -258.18414306640625, "logps/rejected": -266.08135986328125, "loss": 0.6747, "rewards/accuracies": 0.625, "rewards/chosen": 0.505414605140686, "rewards/margins": 0.05552331358194351, "rewards/rejected": 0.44989126920700073, "step": 1692 }, { "epoch": 0.261820993620723, "grad_norm": 4.014686107635498, "learning_rate": 4.36340206185567e-06, "logits/chosen": 16.23103141784668, "logits/rejected": 4.625252723693848, "logps/chosen": -248.75076293945312, "logps/rejected": -174.1435089111328, "loss": 0.5379, "rewards/accuracies": 0.75, "rewards/chosen": 0.5051984786987305, "rewards/margins": 0.39069533348083496, "rewards/rejected": 0.11450319737195969, "step": 1693 }, { "epoch": 0.2619756427604871, "grad_norm": 5.6513352394104, "learning_rate": 4.3659793814433e-06, "logits/chosen": 11.060733795166016, "logits/rejected": 10.457305908203125, "logps/chosen": -246.74212646484375, "logps/rejected": -221.2449951171875, "loss": 0.7504, "rewards/accuracies": 0.375, "rewards/chosen": 0.3163064122200012, "rewards/margins": -0.08438654243946075, "rewards/rejected": 0.4006929397583008, "step": 1694 }, { "epoch": 0.2621302919002513, "grad_norm": 5.130451679229736, "learning_rate": 4.368556701030928e-06, "logits/chosen": 12.811918258666992, "logits/rejected": 9.328104972839355, "logps/chosen": -163.4132843017578, "logps/rejected": -154.5377197265625, "loss": 0.7893, "rewards/accuracies": 0.5, "rewards/chosen": 0.14657628536224365, "rewards/margins": -0.1565559357404709, "rewards/rejected": 0.30313220620155334, "step": 1695 }, { "epoch": 0.2622849410400155, "grad_norm": 5.4119648933410645, "learning_rate": 4.371134020618557e-06, "logits/chosen": 11.776997566223145, "logits/rejected": 3.36003041267395, "logps/chosen": -283.3559875488281, "logps/rejected": -194.5707244873047, "loss": 0.5654, "rewards/accuracies": 0.75, "rewards/chosen": 0.34145188331604004, "rewards/margins": 0.34150564670562744, "rewards/rejected": -5.374103784561157e-05, "step": 1696 }, { "epoch": 0.26243959017977964, "grad_norm": 5.5374603271484375, "learning_rate": 4.373711340206186e-06, "logits/chosen": 10.599066734313965, "logits/rejected": 10.56446361541748, "logps/chosen": -321.669189453125, "logps/rejected": -271.11395263671875, "loss": 0.7352, "rewards/accuracies": 0.375, "rewards/chosen": 0.5816910266876221, "rewards/margins": 0.0010752901434898376, "rewards/rejected": 0.580615758895874, "step": 1697 }, { "epoch": 0.2625942393195438, "grad_norm": 4.938562393188477, "learning_rate": 4.376288659793815e-06, "logits/chosen": 11.090388298034668, "logits/rejected": 13.657381057739258, "logps/chosen": -226.67254638671875, "logps/rejected": -220.35751342773438, "loss": 0.6667, "rewards/accuracies": 0.625, "rewards/chosen": 0.38853949308395386, "rewards/margins": 0.07145550847053528, "rewards/rejected": 0.3170839846134186, "step": 1698 }, { "epoch": 0.26274888845930794, "grad_norm": 6.904003620147705, "learning_rate": 4.3788659793814436e-06, "logits/chosen": 5.284897327423096, "logits/rejected": 8.073166847229004, "logps/chosen": -289.9504089355469, "logps/rejected": -265.20428466796875, "loss": 0.6191, "rewards/accuracies": 0.625, "rewards/chosen": 0.41621050238609314, "rewards/margins": 0.22259020805358887, "rewards/rejected": 0.19362029433250427, "step": 1699 }, { "epoch": 0.2629035375990721, "grad_norm": 6.463097095489502, "learning_rate": 4.381443298969073e-06, "logits/chosen": 10.876604080200195, "logits/rejected": 13.045934677124023, "logps/chosen": -255.5340118408203, "logps/rejected": -292.7287902832031, "loss": 0.8986, "rewards/accuracies": 0.25, "rewards/chosen": 0.08803338557481766, "rewards/margins": -0.3250718116760254, "rewards/rejected": 0.41310518980026245, "step": 1700 }, { "epoch": 0.26305818673883624, "grad_norm": 7.426027774810791, "learning_rate": 4.384020618556701e-06, "logits/chosen": 11.007621765136719, "logits/rejected": 6.974452972412109, "logps/chosen": -433.1227111816406, "logps/rejected": -328.71759033203125, "loss": 0.7557, "rewards/accuracies": 0.5, "rewards/chosen": 0.39031946659088135, "rewards/margins": -0.06136823073029518, "rewards/rejected": 0.45168769359588623, "step": 1701 }, { "epoch": 0.26321283587860045, "grad_norm": 8.258471488952637, "learning_rate": 4.386597938144331e-06, "logits/chosen": 11.523098945617676, "logits/rejected": 6.325174331665039, "logps/chosen": -483.7724304199219, "logps/rejected": -299.81927490234375, "loss": 0.7251, "rewards/accuracies": 0.375, "rewards/chosen": 0.20430830121040344, "rewards/margins": -0.03184375911951065, "rewards/rejected": 0.23615208268165588, "step": 1702 }, { "epoch": 0.2633674850183646, "grad_norm": 7.3353986740112305, "learning_rate": 4.389175257731959e-06, "logits/chosen": 7.863901138305664, "logits/rejected": 13.702529907226562, "logps/chosen": -255.89523315429688, "logps/rejected": -355.883056640625, "loss": 0.8622, "rewards/accuracies": 0.25, "rewards/chosen": 0.09760475158691406, "rewards/margins": -0.2865573763847351, "rewards/rejected": 0.38416212797164917, "step": 1703 }, { "epoch": 0.26352213415812875, "grad_norm": 5.180741310119629, "learning_rate": 4.391752577319588e-06, "logits/chosen": 16.025365829467773, "logits/rejected": 7.518298149108887, "logps/chosen": -442.07183837890625, "logps/rejected": -320.32086181640625, "loss": 0.5579, "rewards/accuracies": 0.75, "rewards/chosen": 0.6019032001495361, "rewards/margins": 0.3831135630607605, "rewards/rejected": 0.21878957748413086, "step": 1704 }, { "epoch": 0.2636767832978929, "grad_norm": 5.708284378051758, "learning_rate": 4.394329896907217e-06, "logits/chosen": 6.307971000671387, "logits/rejected": 4.043267726898193, "logps/chosen": -271.7491149902344, "logps/rejected": -216.861083984375, "loss": 0.6418, "rewards/accuracies": 0.625, "rewards/chosen": 0.1972552090883255, "rewards/margins": 0.2408042699098587, "rewards/rejected": -0.0435490608215332, "step": 1705 }, { "epoch": 0.26383143243765705, "grad_norm": 4.850820064544678, "learning_rate": 4.396907216494845e-06, "logits/chosen": 10.822961807250977, "logits/rejected": 7.7307538986206055, "logps/chosen": -199.04415893554688, "logps/rejected": -173.9315948486328, "loss": 0.6792, "rewards/accuracies": 0.5, "rewards/chosen": 0.45624539256095886, "rewards/margins": 0.10952162742614746, "rewards/rejected": 0.3467237651348114, "step": 1706 }, { "epoch": 0.2639860815774212, "grad_norm": 7.11625862121582, "learning_rate": 4.3994845360824745e-06, "logits/chosen": 7.0849151611328125, "logits/rejected": 7.574535369873047, "logps/chosen": -406.97882080078125, "logps/rejected": -318.5047607421875, "loss": 0.7184, "rewards/accuracies": 0.375, "rewards/chosen": 0.2544782757759094, "rewards/margins": 0.004982903599739075, "rewards/rejected": 0.24949535727500916, "step": 1707 }, { "epoch": 0.2641407307171854, "grad_norm": 4.960830211639404, "learning_rate": 4.402061855670103e-06, "logits/chosen": 9.605134963989258, "logits/rejected": 13.243675231933594, "logps/chosen": -202.01596069335938, "logps/rejected": -214.9547119140625, "loss": 0.5788, "rewards/accuracies": 0.875, "rewards/chosen": 0.5835309624671936, "rewards/margins": 0.27158015966415405, "rewards/rejected": 0.31195080280303955, "step": 1708 }, { "epoch": 0.26429537985694956, "grad_norm": 5.407628059387207, "learning_rate": 4.404639175257732e-06, "logits/chosen": 10.823848724365234, "logits/rejected": 3.217395782470703, "logps/chosen": -341.1961669921875, "logps/rejected": -243.29244995117188, "loss": 0.6608, "rewards/accuracies": 0.625, "rewards/chosen": 0.4170490503311157, "rewards/margins": 0.11604584753513336, "rewards/rejected": 0.3010031580924988, "step": 1709 }, { "epoch": 0.2644500289967137, "grad_norm": 5.524591445922852, "learning_rate": 4.407216494845361e-06, "logits/chosen": 5.351621150970459, "logits/rejected": 6.015663146972656, "logps/chosen": -225.0330047607422, "logps/rejected": -236.33151245117188, "loss": 0.6225, "rewards/accuracies": 0.625, "rewards/chosen": 0.3774985373020172, "rewards/margins": 0.1686972975730896, "rewards/rejected": 0.20880120992660522, "step": 1710 }, { "epoch": 0.26460467813647787, "grad_norm": 6.06163215637207, "learning_rate": 4.40979381443299e-06, "logits/chosen": 9.44446849822998, "logits/rejected": 11.471393585205078, "logps/chosen": -210.42742919921875, "logps/rejected": -231.7503662109375, "loss": 0.8547, "rewards/accuracies": 0.25, "rewards/chosen": 0.4026919901371002, "rewards/margins": -0.2805594801902771, "rewards/rejected": 0.6832515001296997, "step": 1711 }, { "epoch": 0.264759327276242, "grad_norm": 7.141834259033203, "learning_rate": 4.4123711340206185e-06, "logits/chosen": 3.8508729934692383, "logits/rejected": -0.2764413356781006, "logps/chosen": -281.7173767089844, "logps/rejected": -257.2532958984375, "loss": 0.678, "rewards/accuracies": 0.625, "rewards/chosen": 0.3090146780014038, "rewards/margins": 0.06591783463954926, "rewards/rejected": 0.24309682846069336, "step": 1712 }, { "epoch": 0.26491397641600617, "grad_norm": 5.941365718841553, "learning_rate": 4.414948453608248e-06, "logits/chosen": 9.955275535583496, "logits/rejected": 13.661100387573242, "logps/chosen": -323.3267517089844, "logps/rejected": -266.95220947265625, "loss": 0.6119, "rewards/accuracies": 0.875, "rewards/chosen": 0.3876960277557373, "rewards/margins": 0.17844253778457642, "rewards/rejected": 0.20925350487232208, "step": 1713 }, { "epoch": 0.2650686255557703, "grad_norm": 3.762624740600586, "learning_rate": 4.417525773195876e-06, "logits/chosen": 15.55923843383789, "logits/rejected": 10.97128677368164, "logps/chosen": -271.4499206542969, "logps/rejected": -244.6823272705078, "loss": 0.5464, "rewards/accuracies": 0.875, "rewards/chosen": 0.7321747541427612, "rewards/margins": 0.3452012240886688, "rewards/rejected": 0.38697350025177, "step": 1714 }, { "epoch": 0.2652232746955345, "grad_norm": 4.991570472717285, "learning_rate": 4.4201030927835055e-06, "logits/chosen": 9.879316329956055, "logits/rejected": 9.301066398620605, "logps/chosen": -282.86724853515625, "logps/rejected": -261.4393310546875, "loss": 0.6509, "rewards/accuracies": 0.75, "rewards/chosen": 0.4787258803844452, "rewards/margins": 0.12435813248157501, "rewards/rejected": 0.354367733001709, "step": 1715 }, { "epoch": 0.2653779238352987, "grad_norm": 5.48406982421875, "learning_rate": 4.422680412371134e-06, "logits/chosen": 13.096549034118652, "logits/rejected": 9.426155090332031, "logps/chosen": -362.8670959472656, "logps/rejected": -337.4403381347656, "loss": 0.5968, "rewards/accuracies": 0.75, "rewards/chosen": 0.5546704530715942, "rewards/margins": 0.23997437953948975, "rewards/rejected": 0.3146960735321045, "step": 1716 }, { "epoch": 0.26553257297506283, "grad_norm": 4.979221343994141, "learning_rate": 4.425257731958763e-06, "logits/chosen": 10.707191467285156, "logits/rejected": 7.892974376678467, "logps/chosen": -299.4150085449219, "logps/rejected": -316.5184631347656, "loss": 0.5376, "rewards/accuracies": 0.875, "rewards/chosen": 0.626193642616272, "rewards/margins": 0.36955520510673523, "rewards/rejected": 0.25663843750953674, "step": 1717 }, { "epoch": 0.265687222114827, "grad_norm": 6.365272045135498, "learning_rate": 4.427835051546392e-06, "logits/chosen": 16.767621994018555, "logits/rejected": 8.144224166870117, "logps/chosen": -425.71563720703125, "logps/rejected": -292.3013916015625, "loss": 0.614, "rewards/accuracies": 0.75, "rewards/chosen": 0.7393825054168701, "rewards/margins": 0.20860975980758667, "rewards/rejected": 0.5307728052139282, "step": 1718 }, { "epoch": 0.26584187125459113, "grad_norm": 16.35604476928711, "learning_rate": 4.430412371134021e-06, "logits/chosen": 13.156147003173828, "logits/rejected": 13.855541229248047, "logps/chosen": -273.77984619140625, "logps/rejected": -252.74386596679688, "loss": 0.7051, "rewards/accuracies": 0.25, "rewards/chosen": 0.5535398721694946, "rewards/margins": -0.001130755990743637, "rewards/rejected": 0.5546705722808838, "step": 1719 }, { "epoch": 0.2659965203943553, "grad_norm": 5.132080554962158, "learning_rate": 4.4329896907216494e-06, "logits/chosen": 10.233236312866211, "logits/rejected": 11.238988876342773, "logps/chosen": -325.6272888183594, "logps/rejected": -342.97979736328125, "loss": 0.679, "rewards/accuracies": 0.5, "rewards/chosen": 0.3435250520706177, "rewards/margins": 0.08480892330408096, "rewards/rejected": 0.2587161064147949, "step": 1720 }, { "epoch": 0.2661511695341195, "grad_norm": 5.789462566375732, "learning_rate": 4.435567010309279e-06, "logits/chosen": 11.562353134155273, "logits/rejected": 9.545722007751465, "logps/chosen": -341.5428466796875, "logps/rejected": -295.9103088378906, "loss": 0.4605, "rewards/accuracies": 0.875, "rewards/chosen": 0.6406839489936829, "rewards/margins": 0.5698068737983704, "rewards/rejected": 0.07087710499763489, "step": 1721 }, { "epoch": 0.26630581867388364, "grad_norm": 4.944628715515137, "learning_rate": 4.438144329896907e-06, "logits/chosen": 7.905723571777344, "logits/rejected": 1.8851712942123413, "logps/chosen": -295.4744567871094, "logps/rejected": -176.78048706054688, "loss": 0.5922, "rewards/accuracies": 0.75, "rewards/chosen": 0.5240675210952759, "rewards/margins": 0.23503799736499786, "rewards/rejected": 0.2890295088291168, "step": 1722 }, { "epoch": 0.2664604678136478, "grad_norm": 5.386043071746826, "learning_rate": 4.4407216494845365e-06, "logits/chosen": 7.1710920333862305, "logits/rejected": 7.8755693435668945, "logps/chosen": -266.4452209472656, "logps/rejected": -274.92462158203125, "loss": 0.6017, "rewards/accuracies": 0.75, "rewards/chosen": 0.8069710731506348, "rewards/margins": 0.21230602264404297, "rewards/rejected": 0.5946650505065918, "step": 1723 }, { "epoch": 0.26661511695341195, "grad_norm": 5.013392448425293, "learning_rate": 4.443298969072165e-06, "logits/chosen": 9.766303062438965, "logits/rejected": 6.226452827453613, "logps/chosen": -284.0867614746094, "logps/rejected": -198.92022705078125, "loss": 0.6465, "rewards/accuracies": 0.625, "rewards/chosen": 0.37502872943878174, "rewards/margins": 0.19307051599025726, "rewards/rejected": 0.18195819854736328, "step": 1724 }, { "epoch": 0.2667697660931761, "grad_norm": 5.038034915924072, "learning_rate": 4.445876288659794e-06, "logits/chosen": 11.282554626464844, "logits/rejected": 6.617173194885254, "logps/chosen": -264.6533203125, "logps/rejected": -198.9580535888672, "loss": 0.5856, "rewards/accuracies": 0.625, "rewards/chosen": 0.4753740429878235, "rewards/margins": 0.35207927227020264, "rewards/rejected": 0.12329478561878204, "step": 1725 }, { "epoch": 0.26692441523294025, "grad_norm": 5.749571323394775, "learning_rate": 4.448453608247423e-06, "logits/chosen": 10.442389488220215, "logits/rejected": 14.080352783203125, "logps/chosen": -257.47186279296875, "logps/rejected": -313.631591796875, "loss": 0.7405, "rewards/accuracies": 0.5, "rewards/chosen": 0.4114357829093933, "rewards/margins": -0.018024668097496033, "rewards/rejected": 0.42946046590805054, "step": 1726 }, { "epoch": 0.2670790643727044, "grad_norm": 5.406755447387695, "learning_rate": 4.451030927835052e-06, "logits/chosen": 13.89488410949707, "logits/rejected": 7.5984344482421875, "logps/chosen": -538.5991821289062, "logps/rejected": -347.7793884277344, "loss": 0.5461, "rewards/accuracies": 0.625, "rewards/chosen": 0.8351243138313293, "rewards/margins": 0.4403015375137329, "rewards/rejected": 0.39482277631759644, "step": 1727 }, { "epoch": 0.2672337135124686, "grad_norm": 5.1755452156066895, "learning_rate": 4.453608247422681e-06, "logits/chosen": 14.283370971679688, "logits/rejected": 9.318902015686035, "logps/chosen": -230.26361083984375, "logps/rejected": -198.26577758789062, "loss": 0.6505, "rewards/accuracies": 0.625, "rewards/chosen": 0.22499942779541016, "rewards/margins": 0.12262983620166779, "rewards/rejected": 0.10236959159374237, "step": 1728 }, { "epoch": 0.26738836265223276, "grad_norm": 7.17335844039917, "learning_rate": 4.45618556701031e-06, "logits/chosen": -0.17161251604557037, "logits/rejected": 4.816195964813232, "logps/chosen": -227.96697998046875, "logps/rejected": -295.21270751953125, "loss": 0.8224, "rewards/accuracies": 0.5, "rewards/chosen": 0.39172661304473877, "rewards/margins": -0.1557048112154007, "rewards/rejected": 0.5474314093589783, "step": 1729 }, { "epoch": 0.2675430117919969, "grad_norm": 5.123386859893799, "learning_rate": 4.458762886597939e-06, "logits/chosen": 10.307762145996094, "logits/rejected": 5.519076824188232, "logps/chosen": -287.9240417480469, "logps/rejected": -260.66473388671875, "loss": 0.6445, "rewards/accuracies": 0.625, "rewards/chosen": 0.2875879406929016, "rewards/margins": 0.20955267548561096, "rewards/rejected": 0.07803526520729065, "step": 1730 }, { "epoch": 0.26769766093176106, "grad_norm": 6.955427646636963, "learning_rate": 4.4613402061855675e-06, "logits/chosen": 10.058084487915039, "logits/rejected": 7.790159225463867, "logps/chosen": -392.1209716796875, "logps/rejected": -346.4644470214844, "loss": 0.8549, "rewards/accuracies": 0.5, "rewards/chosen": 0.5513603687286377, "rewards/margins": -0.15822352468967438, "rewards/rejected": 0.7095838189125061, "step": 1731 }, { "epoch": 0.2678523100715252, "grad_norm": 6.139858245849609, "learning_rate": 4.463917525773197e-06, "logits/chosen": 10.285011291503906, "logits/rejected": 11.999436378479004, "logps/chosen": -199.24026489257812, "logps/rejected": -302.02435302734375, "loss": 0.6686, "rewards/accuracies": 0.375, "rewards/chosen": 0.06514232605695724, "rewards/margins": 0.11715100705623627, "rewards/rejected": -0.052008673548698425, "step": 1732 }, { "epoch": 0.26800695921128936, "grad_norm": 23.301212310791016, "learning_rate": 4.466494845360825e-06, "logits/chosen": 13.485984802246094, "logits/rejected": 12.20804500579834, "logps/chosen": -209.71197509765625, "logps/rejected": -210.00933837890625, "loss": 0.7298, "rewards/accuracies": 0.5, "rewards/chosen": 0.11429931223392487, "rewards/margins": -0.04964572936296463, "rewards/rejected": 0.1639450639486313, "step": 1733 }, { "epoch": 0.26816160835105357, "grad_norm": 3.736797332763672, "learning_rate": 4.4690721649484545e-06, "logits/chosen": 8.650613784790039, "logits/rejected": 6.061467170715332, "logps/chosen": -176.56268310546875, "logps/rejected": -146.519287109375, "loss": 0.5244, "rewards/accuracies": 0.625, "rewards/chosen": 0.4185592532157898, "rewards/margins": 0.5009682178497314, "rewards/rejected": -0.08240897953510284, "step": 1734 }, { "epoch": 0.2683162574908177, "grad_norm": 6.25091552734375, "learning_rate": 4.471649484536083e-06, "logits/chosen": 12.991402626037598, "logits/rejected": 9.105416297912598, "logps/chosen": -230.97344970703125, "logps/rejected": -192.5790252685547, "loss": 0.7408, "rewards/accuracies": 0.625, "rewards/chosen": 0.10231724381446838, "rewards/margins": -0.06344317644834518, "rewards/rejected": 0.16576042771339417, "step": 1735 }, { "epoch": 0.2684709066305819, "grad_norm": 5.054879188537598, "learning_rate": 4.474226804123712e-06, "logits/chosen": 7.346775054931641, "logits/rejected": 5.71868896484375, "logps/chosen": -269.82110595703125, "logps/rejected": -210.7763671875, "loss": 0.6934, "rewards/accuracies": 0.625, "rewards/chosen": 0.510530948638916, "rewards/margins": 0.0674944818019867, "rewards/rejected": 0.44303640723228455, "step": 1736 }, { "epoch": 0.268625555770346, "grad_norm": 8.783943176269531, "learning_rate": 4.476804123711341e-06, "logits/chosen": 8.429193496704102, "logits/rejected": 7.957465171813965, "logps/chosen": -388.2986145019531, "logps/rejected": -327.13800048828125, "loss": 0.8703, "rewards/accuracies": 0.5, "rewards/chosen": 0.16317768394947052, "rewards/margins": -0.11072373390197754, "rewards/rejected": 0.27390143275260925, "step": 1737 }, { "epoch": 0.2687802049101102, "grad_norm": 5.733576774597168, "learning_rate": 4.47938144329897e-06, "logits/chosen": 4.806110858917236, "logits/rejected": 3.667363405227661, "logps/chosen": -382.98919677734375, "logps/rejected": -364.3193359375, "loss": 0.6953, "rewards/accuracies": 0.625, "rewards/chosen": 0.5008050203323364, "rewards/margins": 0.13074997067451477, "rewards/rejected": 0.37005510926246643, "step": 1738 }, { "epoch": 0.26893485404987433, "grad_norm": 10.084681510925293, "learning_rate": 4.4819587628865984e-06, "logits/chosen": 12.417266845703125, "logits/rejected": 7.827620506286621, "logps/chosen": -422.2778625488281, "logps/rejected": -310.5722961425781, "loss": 0.4593, "rewards/accuracies": 0.875, "rewards/chosen": 0.6576128602027893, "rewards/margins": 0.6134498715400696, "rewards/rejected": 0.044163040816783905, "step": 1739 }, { "epoch": 0.26908950318963853, "grad_norm": 5.354651927947998, "learning_rate": 4.484536082474228e-06, "logits/chosen": 11.033258438110352, "logits/rejected": 10.85450267791748, "logps/chosen": -260.737060546875, "logps/rejected": -284.7623596191406, "loss": 0.6446, "rewards/accuracies": 0.5, "rewards/chosen": 0.5706996917724609, "rewards/margins": 0.18445071578025818, "rewards/rejected": 0.38624900579452515, "step": 1740 }, { "epoch": 0.2692441523294027, "grad_norm": 6.8571553230285645, "learning_rate": 4.487113402061856e-06, "logits/chosen": 9.52800464630127, "logits/rejected": 8.58477783203125, "logps/chosen": -331.73193359375, "logps/rejected": -230.53656005859375, "loss": 0.6536, "rewards/accuracies": 0.625, "rewards/chosen": 0.28750234842300415, "rewards/margins": 0.09903968870639801, "rewards/rejected": 0.18846264481544495, "step": 1741 }, { "epoch": 0.26939880146916684, "grad_norm": 5.678842544555664, "learning_rate": 4.4896907216494855e-06, "logits/chosen": 9.500005722045898, "logits/rejected": 4.818561553955078, "logps/chosen": -299.9071044921875, "logps/rejected": -218.9936981201172, "loss": 0.721, "rewards/accuracies": 0.375, "rewards/chosen": 0.051835354417562485, "rewards/margins": 0.0384562611579895, "rewards/rejected": 0.013379089534282684, "step": 1742 }, { "epoch": 0.269553450608931, "grad_norm": 8.079463005065918, "learning_rate": 4.492268041237114e-06, "logits/chosen": 10.020328521728516, "logits/rejected": 10.742426872253418, "logps/chosen": -280.5080261230469, "logps/rejected": -320.8731689453125, "loss": 0.8585, "rewards/accuracies": 0.5, "rewards/chosen": 0.3448030650615692, "rewards/margins": -0.21688544750213623, "rewards/rejected": 0.5616885423660278, "step": 1743 }, { "epoch": 0.26970809974869514, "grad_norm": 5.063076019287109, "learning_rate": 4.494845360824742e-06, "logits/chosen": 12.055166244506836, "logits/rejected": 10.694090843200684, "logps/chosen": -306.79461669921875, "logps/rejected": -290.21490478515625, "loss": 0.6368, "rewards/accuracies": 0.625, "rewards/chosen": 0.375826895236969, "rewards/margins": 0.17593315243721008, "rewards/rejected": 0.1998937726020813, "step": 1744 }, { "epoch": 0.2698627488884593, "grad_norm": 5.686788558959961, "learning_rate": 4.497422680412372e-06, "logits/chosen": 13.565349578857422, "logits/rejected": 12.07940673828125, "logps/chosen": -241.08929443359375, "logps/rejected": -207.55294799804688, "loss": 0.749, "rewards/accuracies": 0.125, "rewards/chosen": 0.19779139757156372, "rewards/margins": -0.10124626755714417, "rewards/rejected": 0.2990376651287079, "step": 1745 }, { "epoch": 0.27001739802822344, "grad_norm": 3.8289053440093994, "learning_rate": 4.5e-06, "logits/chosen": 9.720939636230469, "logits/rejected": 6.20883846282959, "logps/chosen": -198.0955810546875, "logps/rejected": -175.57017517089844, "loss": 0.6088, "rewards/accuracies": 0.625, "rewards/chosen": 0.38892191648483276, "rewards/margins": 0.22377412021160126, "rewards/rejected": 0.1651478111743927, "step": 1746 }, { "epoch": 0.27017204716798765, "grad_norm": 5.094119548797607, "learning_rate": 4.502577319587629e-06, "logits/chosen": 7.088400840759277, "logits/rejected": 8.52591323852539, "logps/chosen": -238.23483276367188, "logps/rejected": -290.15411376953125, "loss": 0.6651, "rewards/accuracies": 0.5, "rewards/chosen": 0.2936222553253174, "rewards/margins": 0.09353528171777725, "rewards/rejected": 0.20008698105812073, "step": 1747 }, { "epoch": 0.2703266963077518, "grad_norm": 7.345438480377197, "learning_rate": 4.505154639175258e-06, "logits/chosen": 3.275681495666504, "logits/rejected": 3.0518741607666016, "logps/chosen": -223.5284423828125, "logps/rejected": -251.5111083984375, "loss": 0.6543, "rewards/accuracies": 0.375, "rewards/chosen": 0.3825116753578186, "rewards/margins": 0.174207866191864, "rewards/rejected": 0.2083037942647934, "step": 1748 }, { "epoch": 0.27048134544751595, "grad_norm": 5.7420973777771, "learning_rate": 4.507731958762887e-06, "logits/chosen": 13.98678207397461, "logits/rejected": 13.518287658691406, "logps/chosen": -310.6715393066406, "logps/rejected": -307.84368896484375, "loss": 0.6516, "rewards/accuracies": 0.75, "rewards/chosen": 0.29958975315093994, "rewards/margins": 0.1392795592546463, "rewards/rejected": 0.16031017899513245, "step": 1749 }, { "epoch": 0.2706359945872801, "grad_norm": 13.066579818725586, "learning_rate": 4.510309278350516e-06, "logits/chosen": 8.892087936401367, "logits/rejected": 14.950610160827637, "logps/chosen": -230.78277587890625, "logps/rejected": -292.1122131347656, "loss": 0.7221, "rewards/accuracies": 0.625, "rewards/chosen": 0.22716808319091797, "rewards/margins": -0.04069451242685318, "rewards/rejected": 0.26786261796951294, "step": 1750 }, { "epoch": 0.27079064372704426, "grad_norm": 6.974279880523682, "learning_rate": 4.512886597938145e-06, "logits/chosen": 10.362371444702148, "logits/rejected": 7.3704938888549805, "logps/chosen": -261.9510192871094, "logps/rejected": -275.69244384765625, "loss": 0.6753, "rewards/accuracies": 0.5, "rewards/chosen": 0.27854663133621216, "rewards/margins": 0.07175316661596298, "rewards/rejected": 0.20679347217082977, "step": 1751 }, { "epoch": 0.2709452928668084, "grad_norm": 6.656082630157471, "learning_rate": 4.515463917525773e-06, "logits/chosen": 8.17276668548584, "logits/rejected": 9.863580703735352, "logps/chosen": -323.19586181640625, "logps/rejected": -431.58294677734375, "loss": 0.732, "rewards/accuracies": 0.375, "rewards/chosen": 0.8858366012573242, "rewards/margins": 0.03014180064201355, "rewards/rejected": 0.8556947708129883, "step": 1752 }, { "epoch": 0.2710999420065726, "grad_norm": 5.26326322555542, "learning_rate": 4.518041237113403e-06, "logits/chosen": 14.49161434173584, "logits/rejected": 6.8541083335876465, "logps/chosen": -285.4656982421875, "logps/rejected": -214.22303771972656, "loss": 0.5935, "rewards/accuracies": 0.625, "rewards/chosen": 0.5470476746559143, "rewards/margins": 0.2501283884048462, "rewards/rejected": 0.2969193458557129, "step": 1753 }, { "epoch": 0.27125459114633677, "grad_norm": 5.698396682739258, "learning_rate": 4.520618556701031e-06, "logits/chosen": 4.595928192138672, "logits/rejected": 4.694220066070557, "logps/chosen": -200.26171875, "logps/rejected": -192.99400329589844, "loss": 0.7333, "rewards/accuracies": 0.625, "rewards/chosen": 0.34854525327682495, "rewards/margins": -0.011685170233249664, "rewards/rejected": 0.3602304458618164, "step": 1754 }, { "epoch": 0.2714092402861009, "grad_norm": 4.87807035446167, "learning_rate": 4.52319587628866e-06, "logits/chosen": 10.981599807739258, "logits/rejected": 4.471719264984131, "logps/chosen": -234.43411254882812, "logps/rejected": -153.1018524169922, "loss": 0.6299, "rewards/accuracies": 0.625, "rewards/chosen": 0.5472139120101929, "rewards/margins": 0.15522579848766327, "rewards/rejected": 0.3919881582260132, "step": 1755 }, { "epoch": 0.27156388942586507, "grad_norm": 54.97401428222656, "learning_rate": 4.525773195876289e-06, "logits/chosen": 10.627900123596191, "logits/rejected": 11.327165603637695, "logps/chosen": -336.5604248046875, "logps/rejected": -403.004150390625, "loss": 0.627, "rewards/accuracies": 0.75, "rewards/chosen": 0.6421036124229431, "rewards/margins": 0.16009390354156494, "rewards/rejected": 0.4820097088813782, "step": 1756 }, { "epoch": 0.2717185385656292, "grad_norm": 4.172118186950684, "learning_rate": 4.528350515463918e-06, "logits/chosen": 14.645185470581055, "logits/rejected": 10.396769523620605, "logps/chosen": -321.38385009765625, "logps/rejected": -217.132568359375, "loss": 0.5333, "rewards/accuracies": 0.75, "rewards/chosen": 0.4707261025905609, "rewards/margins": 0.38813915848731995, "rewards/rejected": 0.08258695900440216, "step": 1757 }, { "epoch": 0.27187318770539337, "grad_norm": 10.270281791687012, "learning_rate": 4.5309278350515466e-06, "logits/chosen": 8.66783618927002, "logits/rejected": 7.582070350646973, "logps/chosen": -408.40521240234375, "logps/rejected": -355.104248046875, "loss": 0.8924, "rewards/accuracies": 0.25, "rewards/chosen": 0.47501498460769653, "rewards/margins": -0.2812090516090393, "rewards/rejected": 0.7562240362167358, "step": 1758 }, { "epoch": 0.2720278368451575, "grad_norm": 6.416598796844482, "learning_rate": 4.533505154639176e-06, "logits/chosen": 11.456398010253906, "logits/rejected": 6.370075702667236, "logps/chosen": -250.26211547851562, "logps/rejected": -201.88763427734375, "loss": 0.5553, "rewards/accuracies": 0.75, "rewards/chosen": 0.2968330383300781, "rewards/margins": 0.4610915184020996, "rewards/rejected": -0.16425848007202148, "step": 1759 }, { "epoch": 0.27218248598492173, "grad_norm": 4.307449817657471, "learning_rate": 4.536082474226804e-06, "logits/chosen": 11.395735740661621, "logits/rejected": 8.634366989135742, "logps/chosen": -230.93817138671875, "logps/rejected": -216.629638671875, "loss": 0.5468, "rewards/accuracies": 0.875, "rewards/chosen": 0.47546061873435974, "rewards/margins": 0.4234132766723633, "rewards/rejected": 0.05204734578728676, "step": 1760 }, { "epoch": 0.2723371351246859, "grad_norm": 6.60194730758667, "learning_rate": 4.538659793814434e-06, "logits/chosen": 14.477359771728516, "logits/rejected": 10.578109741210938, "logps/chosen": -456.6590576171875, "logps/rejected": -266.31964111328125, "loss": 0.6187, "rewards/accuracies": 0.5, "rewards/chosen": 0.22925835847854614, "rewards/margins": 0.21333390474319458, "rewards/rejected": 0.01592445559799671, "step": 1761 }, { "epoch": 0.27249178426445003, "grad_norm": 4.966465473175049, "learning_rate": 4.541237113402062e-06, "logits/chosen": 11.965703010559082, "logits/rejected": 8.348986625671387, "logps/chosen": -150.67515563964844, "logps/rejected": -164.79669189453125, "loss": 0.5886, "rewards/accuracies": 0.625, "rewards/chosen": 0.5295576453208923, "rewards/margins": 0.2687874436378479, "rewards/rejected": 0.2607702314853668, "step": 1762 }, { "epoch": 0.2726464334042142, "grad_norm": 11.325162887573242, "learning_rate": 4.543814432989691e-06, "logits/chosen": 6.723527908325195, "logits/rejected": 10.936582565307617, "logps/chosen": -386.983154296875, "logps/rejected": -415.4788818359375, "loss": 0.7443, "rewards/accuracies": 0.5, "rewards/chosen": 0.3540104329586029, "rewards/margins": -0.019354645162820816, "rewards/rejected": 0.37336504459381104, "step": 1763 }, { "epoch": 0.27280108254397833, "grad_norm": 7.192172527313232, "learning_rate": 4.54639175257732e-06, "logits/chosen": 11.676755905151367, "logits/rejected": 9.913164138793945, "logps/chosen": -252.0538330078125, "logps/rejected": -227.24188232421875, "loss": 0.6357, "rewards/accuracies": 0.625, "rewards/chosen": 0.58184814453125, "rewards/margins": 0.23793716728687286, "rewards/rejected": 0.34391099214553833, "step": 1764 }, { "epoch": 0.2729557316837425, "grad_norm": 5.468150615692139, "learning_rate": 4.548969072164949e-06, "logits/chosen": 9.721074104309082, "logits/rejected": 13.080572128295898, "logps/chosen": -173.80316162109375, "logps/rejected": -202.72247314453125, "loss": 0.683, "rewards/accuracies": 0.375, "rewards/chosen": 0.380825936794281, "rewards/margins": 0.049785513430833817, "rewards/rejected": 0.3310404419898987, "step": 1765 }, { "epoch": 0.2731103808235067, "grad_norm": 4.978980541229248, "learning_rate": 4.5515463917525776e-06, "logits/chosen": 14.374263763427734, "logits/rejected": 5.3594255447387695, "logps/chosen": -211.71661376953125, "logps/rejected": -182.83963012695312, "loss": 0.6036, "rewards/accuracies": 0.625, "rewards/chosen": 0.3469385504722595, "rewards/margins": 0.28878986835479736, "rewards/rejected": 0.05814870446920395, "step": 1766 }, { "epoch": 0.27326502996327084, "grad_norm": 6.417335033416748, "learning_rate": 4.554123711340207e-06, "logits/chosen": 8.773098945617676, "logits/rejected": 8.000666618347168, "logps/chosen": -441.27325439453125, "logps/rejected": -332.311279296875, "loss": 0.6698, "rewards/accuracies": 0.375, "rewards/chosen": 0.45418065786361694, "rewards/margins": 0.11314989626407623, "rewards/rejected": 0.3410307466983795, "step": 1767 }, { "epoch": 0.273419679103035, "grad_norm": 6.2654595375061035, "learning_rate": 4.556701030927835e-06, "logits/chosen": 11.711050033569336, "logits/rejected": 7.446597099304199, "logps/chosen": -515.9819946289062, "logps/rejected": -352.4246520996094, "loss": 0.6114, "rewards/accuracies": 0.625, "rewards/chosen": 0.5711075067520142, "rewards/margins": 0.2978874444961548, "rewards/rejected": 0.2732200622558594, "step": 1768 }, { "epoch": 0.27357432824279915, "grad_norm": 10.886341094970703, "learning_rate": 4.559278350515465e-06, "logits/chosen": 7.859816551208496, "logits/rejected": 2.989755153656006, "logps/chosen": -160.63906860351562, "logps/rejected": -102.91426086425781, "loss": 0.703, "rewards/accuracies": 0.5, "rewards/chosen": 0.3161394000053406, "rewards/margins": 0.0051191262900829315, "rewards/rejected": 0.3110203146934509, "step": 1769 }, { "epoch": 0.2737289773825633, "grad_norm": 7.261675834655762, "learning_rate": 4.561855670103093e-06, "logits/chosen": 12.162296295166016, "logits/rejected": 9.816460609436035, "logps/chosen": -343.8886413574219, "logps/rejected": -308.63885498046875, "loss": 0.7363, "rewards/accuracies": 0.625, "rewards/chosen": 0.3564913272857666, "rewards/margins": 0.03462439030408859, "rewards/rejected": 0.3218669295310974, "step": 1770 }, { "epoch": 0.27388362652232745, "grad_norm": 6.246756553649902, "learning_rate": 4.5644329896907215e-06, "logits/chosen": 9.481365203857422, "logits/rejected": 10.303899765014648, "logps/chosen": -266.0, "logps/rejected": -248.3848419189453, "loss": 0.5617, "rewards/accuracies": 0.75, "rewards/chosen": 0.37504637241363525, "rewards/margins": 0.3056323826313019, "rewards/rejected": 0.06941400468349457, "step": 1771 }, { "epoch": 0.27403827566209166, "grad_norm": 3.724700689315796, "learning_rate": 4.567010309278351e-06, "logits/chosen": 8.440160751342773, "logits/rejected": 7.323162078857422, "logps/chosen": -147.2929229736328, "logps/rejected": -155.32650756835938, "loss": 0.5384, "rewards/accuracies": 0.875, "rewards/chosen": 0.4414490759372711, "rewards/margins": 0.3574179410934448, "rewards/rejected": 0.0840311050415039, "step": 1772 }, { "epoch": 0.2741929248018558, "grad_norm": 5.508972644805908, "learning_rate": 4.569587628865979e-06, "logits/chosen": 4.529531478881836, "logits/rejected": 5.403200626373291, "logps/chosen": -319.9095458984375, "logps/rejected": -235.11407470703125, "loss": 0.5968, "rewards/accuracies": 0.75, "rewards/chosen": 0.37142059206962585, "rewards/margins": 0.21721532940864563, "rewards/rejected": 0.15420524775981903, "step": 1773 }, { "epoch": 0.27434757394161996, "grad_norm": 9.819826126098633, "learning_rate": 4.5721649484536085e-06, "logits/chosen": 11.389838218688965, "logits/rejected": 11.219137191772461, "logps/chosen": -307.8130798339844, "logps/rejected": -353.7999267578125, "loss": 0.6348, "rewards/accuracies": 0.625, "rewards/chosen": 0.41786760091781616, "rewards/margins": 0.16911393404006958, "rewards/rejected": 0.24875369668006897, "step": 1774 }, { "epoch": 0.2745022230813841, "grad_norm": 5.779160499572754, "learning_rate": 4.574742268041237e-06, "logits/chosen": 8.296561241149902, "logits/rejected": 4.77658748626709, "logps/chosen": -416.005859375, "logps/rejected": -371.0732116699219, "loss": 0.67, "rewards/accuracies": 0.625, "rewards/chosen": 0.5942328572273254, "rewards/margins": 0.1252460479736328, "rewards/rejected": 0.468986839056015, "step": 1775 }, { "epoch": 0.27465687222114826, "grad_norm": 4.862935543060303, "learning_rate": 4.577319587628866e-06, "logits/chosen": 7.9415602684021, "logits/rejected": 6.139190673828125, "logps/chosen": -208.67913818359375, "logps/rejected": -175.70506286621094, "loss": 0.6236, "rewards/accuracies": 0.625, "rewards/chosen": 0.5619739890098572, "rewards/margins": 0.18216942250728607, "rewards/rejected": 0.3798045516014099, "step": 1776 }, { "epoch": 0.2748115213609124, "grad_norm": 10.83504867553711, "learning_rate": 4.579896907216495e-06, "logits/chosen": 7.566333770751953, "logits/rejected": 4.853797912597656, "logps/chosen": -219.80227661132812, "logps/rejected": -215.96945190429688, "loss": 0.51, "rewards/accuracies": 0.625, "rewards/chosen": 0.5173207521438599, "rewards/margins": 0.589341402053833, "rewards/rejected": -0.07202073931694031, "step": 1777 }, { "epoch": 0.27496617050067657, "grad_norm": 6.655467510223389, "learning_rate": 4.582474226804124e-06, "logits/chosen": 9.469337463378906, "logits/rejected": 4.3183512687683105, "logps/chosen": -294.57684326171875, "logps/rejected": -245.7779998779297, "loss": 0.7966, "rewards/accuracies": 0.5, "rewards/chosen": 0.17119817435741425, "rewards/margins": -0.1174437552690506, "rewards/rejected": 0.28864192962646484, "step": 1778 }, { "epoch": 0.2751208196404408, "grad_norm": 6.520418643951416, "learning_rate": 4.5850515463917525e-06, "logits/chosen": 10.906291961669922, "logits/rejected": 5.719893932342529, "logps/chosen": -338.7460632324219, "logps/rejected": -327.1804504394531, "loss": 0.617, "rewards/accuracies": 0.75, "rewards/chosen": 0.38511353731155396, "rewards/margins": 0.22053347527980804, "rewards/rejected": 0.1645800620317459, "step": 1779 }, { "epoch": 0.2752754687802049, "grad_norm": 4.582620620727539, "learning_rate": 4.587628865979382e-06, "logits/chosen": 4.078237056732178, "logits/rejected": 6.392640113830566, "logps/chosen": -219.04946899414062, "logps/rejected": -252.43405151367188, "loss": 0.6023, "rewards/accuracies": 0.625, "rewards/chosen": 0.5643264651298523, "rewards/margins": 0.25029510259628296, "rewards/rejected": 0.3140313923358917, "step": 1780 }, { "epoch": 0.2754301179199691, "grad_norm": 5.459748268127441, "learning_rate": 4.59020618556701e-06, "logits/chosen": 8.431791305541992, "logits/rejected": 7.698966026306152, "logps/chosen": -231.88125610351562, "logps/rejected": -203.00279235839844, "loss": 0.6682, "rewards/accuracies": 0.625, "rewards/chosen": 0.3660655915737152, "rewards/margins": 0.09622454643249512, "rewards/rejected": 0.2698410451412201, "step": 1781 }, { "epoch": 0.2755847670597332, "grad_norm": 6.183064937591553, "learning_rate": 4.5927835051546395e-06, "logits/chosen": 13.087644577026367, "logits/rejected": 11.896936416625977, "logps/chosen": -232.48736572265625, "logps/rejected": -258.69830322265625, "loss": 0.6748, "rewards/accuracies": 0.625, "rewards/chosen": 0.29107093811035156, "rewards/margins": 0.14245158433914185, "rewards/rejected": 0.14861935377120972, "step": 1782 }, { "epoch": 0.2757394161994974, "grad_norm": 4.173166751861572, "learning_rate": 4.595360824742268e-06, "logits/chosen": 8.520410537719727, "logits/rejected": 7.409262657165527, "logps/chosen": -190.48806762695312, "logps/rejected": -168.40975952148438, "loss": 0.6975, "rewards/accuracies": 0.5, "rewards/chosen": 0.25488972663879395, "rewards/margins": 0.07498779892921448, "rewards/rejected": 0.17990194261074066, "step": 1783 }, { "epoch": 0.27589406533926153, "grad_norm": 6.094630718231201, "learning_rate": 4.597938144329897e-06, "logits/chosen": 12.645772933959961, "logits/rejected": 6.3182172775268555, "logps/chosen": -419.4778137207031, "logps/rejected": -291.7328186035156, "loss": 0.6718, "rewards/accuracies": 0.625, "rewards/chosen": 0.40914487838745117, "rewards/margins": 0.07191429287195206, "rewards/rejected": 0.3372305929660797, "step": 1784 }, { "epoch": 0.27604871447902574, "grad_norm": 4.593143463134766, "learning_rate": 4.600515463917526e-06, "logits/chosen": 10.098987579345703, "logits/rejected": 7.184027671813965, "logps/chosen": -288.43408203125, "logps/rejected": -263.3219909667969, "loss": 0.6541, "rewards/accuracies": 0.5, "rewards/chosen": 0.7223830223083496, "rewards/margins": 0.2738035023212433, "rewards/rejected": 0.4485795199871063, "step": 1785 }, { "epoch": 0.2762033636187899, "grad_norm": 6.917372703552246, "learning_rate": 4.603092783505155e-06, "logits/chosen": 10.330402374267578, "logits/rejected": 5.444697380065918, "logps/chosen": -301.9897155761719, "logps/rejected": -222.5584716796875, "loss": 0.7968, "rewards/accuracies": 0.375, "rewards/chosen": 0.07506342977285385, "rewards/margins": -0.13927708566188812, "rewards/rejected": 0.21434050798416138, "step": 1786 }, { "epoch": 0.27635801275855404, "grad_norm": 3.787792205810547, "learning_rate": 4.6056701030927834e-06, "logits/chosen": 14.770713806152344, "logits/rejected": 7.072144508361816, "logps/chosen": -293.251953125, "logps/rejected": -208.4970703125, "loss": 0.4716, "rewards/accuracies": 0.875, "rewards/chosen": 0.8451826572418213, "rewards/margins": 0.5348148345947266, "rewards/rejected": 0.31036776304244995, "step": 1787 }, { "epoch": 0.2765126618983182, "grad_norm": 13.960423469543457, "learning_rate": 4.608247422680413e-06, "logits/chosen": 13.204172134399414, "logits/rejected": 7.987703323364258, "logps/chosen": -413.2759094238281, "logps/rejected": -338.0579833984375, "loss": 0.6093, "rewards/accuracies": 0.75, "rewards/chosen": 0.38115859031677246, "rewards/margins": 0.26606765389442444, "rewards/rejected": 0.11509095132350922, "step": 1788 }, { "epoch": 0.27666731103808234, "grad_norm": 4.321527481079102, "learning_rate": 4.610824742268042e-06, "logits/chosen": 7.346288204193115, "logits/rejected": 5.108887672424316, "logps/chosen": -173.19715881347656, "logps/rejected": -155.81553649902344, "loss": 0.7, "rewards/accuracies": 0.5, "rewards/chosen": 0.552959680557251, "rewards/margins": 0.058785922825336456, "rewards/rejected": 0.4941737949848175, "step": 1789 }, { "epoch": 0.2768219601778465, "grad_norm": 5.214527130126953, "learning_rate": 4.6134020618556705e-06, "logits/chosen": 4.459333896636963, "logits/rejected": 0.02829456329345703, "logps/chosen": -238.23802185058594, "logps/rejected": -156.39300537109375, "loss": 0.7568, "rewards/accuracies": 0.5, "rewards/chosen": 0.5082700848579407, "rewards/margins": 0.04841124638915062, "rewards/rejected": 0.45985886454582214, "step": 1790 }, { "epoch": 0.27697660931761064, "grad_norm": 5.841790676116943, "learning_rate": 4.6159793814433e-06, "logits/chosen": 13.923933029174805, "logits/rejected": 8.109094619750977, "logps/chosen": -308.5247802734375, "logps/rejected": -241.20712280273438, "loss": 0.6594, "rewards/accuracies": 0.5, "rewards/chosen": 0.26008549332618713, "rewards/margins": 0.18404704332351685, "rewards/rejected": 0.07603845000267029, "step": 1791 }, { "epoch": 0.27713125845737485, "grad_norm": 10.888556480407715, "learning_rate": 4.618556701030928e-06, "logits/chosen": 7.914096832275391, "logits/rejected": 9.503911972045898, "logps/chosen": -255.510986328125, "logps/rejected": -272.71563720703125, "loss": 0.8884, "rewards/accuracies": 0.375, "rewards/chosen": -0.12503138184547424, "rewards/margins": -0.29305964708328247, "rewards/rejected": 0.1680283099412918, "step": 1792 }, { "epoch": 0.277285907597139, "grad_norm": 4.766130447387695, "learning_rate": 4.6211340206185575e-06, "logits/chosen": 13.7325439453125, "logits/rejected": 11.023902893066406, "logps/chosen": -231.98545837402344, "logps/rejected": -183.91726684570312, "loss": 0.6273, "rewards/accuracies": 0.75, "rewards/chosen": 0.5380759835243225, "rewards/margins": 0.15609519183635712, "rewards/rejected": 0.3819808065891266, "step": 1793 }, { "epoch": 0.27744055673690315, "grad_norm": 5.318512916564941, "learning_rate": 4.623711340206186e-06, "logits/chosen": 7.283623218536377, "logits/rejected": 9.433982849121094, "logps/chosen": -169.26589965820312, "logps/rejected": -164.61131286621094, "loss": 0.7234, "rewards/accuracies": 0.375, "rewards/chosen": 0.09434748440980911, "rewards/margins": -0.03460858017206192, "rewards/rejected": 0.12895606458187103, "step": 1794 }, { "epoch": 0.2775952058766673, "grad_norm": 4.173285007476807, "learning_rate": 4.626288659793815e-06, "logits/chosen": 5.179205894470215, "logits/rejected": 6.072615623474121, "logps/chosen": -168.74044799804688, "logps/rejected": -209.5545654296875, "loss": 0.5635, "rewards/accuracies": 0.75, "rewards/chosen": 0.6600791215896606, "rewards/margins": 0.40953323245048523, "rewards/rejected": 0.2505458891391754, "step": 1795 }, { "epoch": 0.27774985501643146, "grad_norm": 7.360849857330322, "learning_rate": 4.628865979381444e-06, "logits/chosen": 15.077102661132812, "logits/rejected": 5.993697643280029, "logps/chosen": -399.30975341796875, "logps/rejected": -319.9261779785156, "loss": 0.4653, "rewards/accuracies": 0.875, "rewards/chosen": 0.6223754286766052, "rewards/margins": 0.6062986254692078, "rewards/rejected": 0.01607675850391388, "step": 1796 }, { "epoch": 0.2779045041561956, "grad_norm": 5.941712856292725, "learning_rate": 4.631443298969073e-06, "logits/chosen": 9.63193416595459, "logits/rejected": 6.623553276062012, "logps/chosen": -276.166748046875, "logps/rejected": -252.10830688476562, "loss": 0.6072, "rewards/accuracies": 0.5, "rewards/chosen": 0.3636169135570526, "rewards/margins": 0.33589398860931396, "rewards/rejected": 0.02772293984889984, "step": 1797 }, { "epoch": 0.2780591532959598, "grad_norm": 5.1528167724609375, "learning_rate": 4.6340206185567015e-06, "logits/chosen": 12.019081115722656, "logits/rejected": 7.0318098068237305, "logps/chosen": -384.81573486328125, "logps/rejected": -245.92637634277344, "loss": 0.5801, "rewards/accuracies": 0.5, "rewards/chosen": 0.7144142389297485, "rewards/margins": 0.3055454194545746, "rewards/rejected": 0.40886878967285156, "step": 1798 }, { "epoch": 0.27821380243572397, "grad_norm": 6.089789390563965, "learning_rate": 4.636597938144331e-06, "logits/chosen": 11.860133171081543, "logits/rejected": 4.922367095947266, "logps/chosen": -321.1929016113281, "logps/rejected": -220.14512634277344, "loss": 0.6868, "rewards/accuracies": 0.5, "rewards/chosen": 0.6045522093772888, "rewards/margins": 0.04126105457544327, "rewards/rejected": 0.5632911920547485, "step": 1799 }, { "epoch": 0.2783684515754881, "grad_norm": 6.841023921966553, "learning_rate": 4.639175257731959e-06, "logits/chosen": 12.983312606811523, "logits/rejected": 6.042972564697266, "logps/chosen": -296.8444519042969, "logps/rejected": -144.22396850585938, "loss": 0.7715, "rewards/accuracies": 0.375, "rewards/chosen": 0.19157515466213226, "rewards/margins": -0.0874970555305481, "rewards/rejected": 0.27907222509384155, "step": 1800 }, { "epoch": 0.27852310071525227, "grad_norm": 5.250925540924072, "learning_rate": 4.6417525773195885e-06, "logits/chosen": 11.337623596191406, "logits/rejected": 4.912841796875, "logps/chosen": -263.2588195800781, "logps/rejected": -176.6988525390625, "loss": 0.6328, "rewards/accuracies": 0.75, "rewards/chosen": 0.24549323320388794, "rewards/margins": 0.21916508674621582, "rewards/rejected": 0.02632814645767212, "step": 1801 }, { "epoch": 0.2786777498550164, "grad_norm": 6.687630653381348, "learning_rate": 4.644329896907217e-06, "logits/chosen": 9.23815631866455, "logits/rejected": 14.233956336975098, "logps/chosen": -333.1272277832031, "logps/rejected": -381.56103515625, "loss": 0.8792, "rewards/accuracies": 0.375, "rewards/chosen": 0.43065834045410156, "rewards/margins": -0.2751563787460327, "rewards/rejected": 0.7058147192001343, "step": 1802 }, { "epoch": 0.2788323989947806, "grad_norm": 4.982458591461182, "learning_rate": 4.646907216494846e-06, "logits/chosen": 13.7312650680542, "logits/rejected": 8.046867370605469, "logps/chosen": -303.96014404296875, "logps/rejected": -254.2519073486328, "loss": 0.4666, "rewards/accuracies": 0.75, "rewards/chosen": 0.3461725115776062, "rewards/margins": 0.7006341218948364, "rewards/rejected": -0.3544616103172302, "step": 1803 }, { "epoch": 0.2789870481345447, "grad_norm": 6.613254070281982, "learning_rate": 4.649484536082475e-06, "logits/chosen": 7.195912837982178, "logits/rejected": 12.799966812133789, "logps/chosen": -305.4556579589844, "logps/rejected": -274.7067565917969, "loss": 0.6977, "rewards/accuracies": 0.625, "rewards/chosen": 0.3151063919067383, "rewards/margins": 0.08902379870414734, "rewards/rejected": 0.22608262300491333, "step": 1804 }, { "epoch": 0.27914169727430893, "grad_norm": 4.40196418762207, "learning_rate": 4.652061855670104e-06, "logits/chosen": 7.241674423217773, "logits/rejected": 5.626523017883301, "logps/chosen": -208.08758544921875, "logps/rejected": -208.23648071289062, "loss": 0.581, "rewards/accuracies": 0.75, "rewards/chosen": 0.3735140860080719, "rewards/margins": 0.26652076840400696, "rewards/rejected": 0.10699333250522614, "step": 1805 }, { "epoch": 0.2792963464140731, "grad_norm": 6.872039794921875, "learning_rate": 4.6546391752577324e-06, "logits/chosen": 6.387746810913086, "logits/rejected": 2.7451484203338623, "logps/chosen": -401.534423828125, "logps/rejected": -312.84368896484375, "loss": 0.6895, "rewards/accuracies": 0.5, "rewards/chosen": 0.5557008981704712, "rewards/margins": 0.01845632866024971, "rewards/rejected": 0.5372445583343506, "step": 1806 }, { "epoch": 0.27945099555383723, "grad_norm": 5.204464912414551, "learning_rate": 4.657216494845362e-06, "logits/chosen": 15.71424388885498, "logits/rejected": 2.1320204734802246, "logps/chosen": -498.5172424316406, "logps/rejected": -208.55564880371094, "loss": 0.517, "rewards/accuracies": 0.875, "rewards/chosen": 0.5891244411468506, "rewards/margins": 0.5190377235412598, "rewards/rejected": 0.07008671760559082, "step": 1807 }, { "epoch": 0.2796056446936014, "grad_norm": 37.6791877746582, "learning_rate": 4.65979381443299e-06, "logits/chosen": 14.20694637298584, "logits/rejected": 7.893401145935059, "logps/chosen": -529.0643920898438, "logps/rejected": -397.0866394042969, "loss": 0.6454, "rewards/accuracies": 0.625, "rewards/chosen": 0.9732526540756226, "rewards/margins": 0.1775982677936554, "rewards/rejected": 0.7956543564796448, "step": 1808 }, { "epoch": 0.27976029383336554, "grad_norm": 6.8347649574279785, "learning_rate": 4.662371134020619e-06, "logits/chosen": 8.797388076782227, "logits/rejected": 4.059911727905273, "logps/chosen": -364.923828125, "logps/rejected": -330.72515869140625, "loss": 0.5717, "rewards/accuracies": 0.625, "rewards/chosen": 0.4942196011543274, "rewards/margins": 0.3306804597377777, "rewards/rejected": 0.1635391265153885, "step": 1809 }, { "epoch": 0.2799149429731297, "grad_norm": 5.941415786743164, "learning_rate": 4.664948453608248e-06, "logits/chosen": -0.28004634380340576, "logits/rejected": 6.157761573791504, "logps/chosen": -142.97694396972656, "logps/rejected": -224.66497802734375, "loss": 0.7229, "rewards/accuracies": 0.5, "rewards/chosen": 0.23308923840522766, "rewards/margins": 0.00542113184928894, "rewards/rejected": 0.22766809165477753, "step": 1810 }, { "epoch": 0.2800695921128939, "grad_norm": 5.795331001281738, "learning_rate": 4.667525773195876e-06, "logits/chosen": 9.838809967041016, "logits/rejected": 11.281718254089355, "logps/chosen": -306.9436340332031, "logps/rejected": -324.8358459472656, "loss": 0.766, "rewards/accuracies": 0.5, "rewards/chosen": 0.5886789560317993, "rewards/margins": -0.11536659300327301, "rewards/rejected": 0.7040455341339111, "step": 1811 }, { "epoch": 0.28022424125265805, "grad_norm": 4.510373592376709, "learning_rate": 4.670103092783506e-06, "logits/chosen": 13.319091796875, "logits/rejected": 6.25773286819458, "logps/chosen": -308.00250244140625, "logps/rejected": -214.00946044921875, "loss": 0.5636, "rewards/accuracies": 0.875, "rewards/chosen": 0.4302256405353546, "rewards/margins": 0.29232674837112427, "rewards/rejected": 0.13789892196655273, "step": 1812 }, { "epoch": 0.2803788903924222, "grad_norm": 5.403369903564453, "learning_rate": 4.672680412371134e-06, "logits/chosen": 10.388248443603516, "logits/rejected": 4.79567813873291, "logps/chosen": -357.135009765625, "logps/rejected": -273.6259460449219, "loss": 0.5983, "rewards/accuracies": 0.625, "rewards/chosen": 0.47838470339775085, "rewards/margins": 0.30037403106689453, "rewards/rejected": 0.17801065742969513, "step": 1813 }, { "epoch": 0.28053353953218635, "grad_norm": 4.630216121673584, "learning_rate": 4.675257731958763e-06, "logits/chosen": 7.771177291870117, "logits/rejected": 5.1385111808776855, "logps/chosen": -280.0901794433594, "logps/rejected": -209.920166015625, "loss": 0.6896, "rewards/accuracies": 0.625, "rewards/chosen": 0.3488076627254486, "rewards/margins": 0.022928372025489807, "rewards/rejected": 0.3258792757987976, "step": 1814 }, { "epoch": 0.2806881886719505, "grad_norm": 5.719762325286865, "learning_rate": 4.677835051546392e-06, "logits/chosen": 10.391979217529297, "logits/rejected": 4.174276351928711, "logps/chosen": -435.1410217285156, "logps/rejected": -276.4684753417969, "loss": 0.6236, "rewards/accuracies": 0.5, "rewards/chosen": 0.5524491667747498, "rewards/margins": 0.23280295729637146, "rewards/rejected": 0.3196461796760559, "step": 1815 }, { "epoch": 0.28084283781171465, "grad_norm": 5.505342960357666, "learning_rate": 4.680412371134021e-06, "logits/chosen": 1.3984167575836182, "logits/rejected": 3.2833070755004883, "logps/chosen": -188.52151489257812, "logps/rejected": -235.49627685546875, "loss": 0.5879, "rewards/accuracies": 0.875, "rewards/chosen": 0.4416334331035614, "rewards/margins": 0.3539799749851227, "rewards/rejected": 0.08765344321727753, "step": 1816 }, { "epoch": 0.28099748695147886, "grad_norm": 6.206616401672363, "learning_rate": 4.68298969072165e-06, "logits/chosen": 11.167319297790527, "logits/rejected": 8.613869667053223, "logps/chosen": -426.288330078125, "logps/rejected": -329.3836975097656, "loss": 0.4729, "rewards/accuracies": 0.875, "rewards/chosen": 0.613318681716919, "rewards/margins": 0.54759681224823, "rewards/rejected": 0.06572189927101135, "step": 1817 }, { "epoch": 0.281152136091243, "grad_norm": 9.026396751403809, "learning_rate": 4.685567010309279e-06, "logits/chosen": 8.833276748657227, "logits/rejected": 8.582449913024902, "logps/chosen": -230.39968872070312, "logps/rejected": -245.29591369628906, "loss": 0.7819, "rewards/accuracies": 0.625, "rewards/chosen": 0.46692636609077454, "rewards/margins": -0.04104386270046234, "rewards/rejected": 0.5079702138900757, "step": 1818 }, { "epoch": 0.28130678523100716, "grad_norm": 6.190130233764648, "learning_rate": 4.688144329896907e-06, "logits/chosen": 6.207818508148193, "logits/rejected": 4.249599933624268, "logps/chosen": -252.45883178710938, "logps/rejected": -162.86692810058594, "loss": 0.752, "rewards/accuracies": 0.375, "rewards/chosen": 0.35861286520957947, "rewards/margins": -0.08175475895404816, "rewards/rejected": 0.4403676390647888, "step": 1819 }, { "epoch": 0.2814614343707713, "grad_norm": 6.999055862426758, "learning_rate": 4.690721649484537e-06, "logits/chosen": 12.285684585571289, "logits/rejected": 6.994052886962891, "logps/chosen": -317.63763427734375, "logps/rejected": -270.59002685546875, "loss": 0.5903, "rewards/accuracies": 0.75, "rewards/chosen": 0.48645782470703125, "rewards/margins": 0.2407977283000946, "rewards/rejected": 0.24566012620925903, "step": 1820 }, { "epoch": 0.28161608351053546, "grad_norm": 6.097827434539795, "learning_rate": 4.693298969072165e-06, "logits/chosen": 10.929293632507324, "logits/rejected": 3.2800440788269043, "logps/chosen": -340.63427734375, "logps/rejected": -256.7178649902344, "loss": 0.6055, "rewards/accuracies": 0.875, "rewards/chosen": 0.5508156418800354, "rewards/margins": 0.24711717665195465, "rewards/rejected": 0.30369845032691956, "step": 1821 }, { "epoch": 0.2817707326502996, "grad_norm": 4.320054054260254, "learning_rate": 4.695876288659794e-06, "logits/chosen": 13.914053916931152, "logits/rejected": 9.445515632629395, "logps/chosen": -247.6464385986328, "logps/rejected": -210.24453735351562, "loss": 0.5064, "rewards/accuracies": 0.875, "rewards/chosen": 0.5620380640029907, "rewards/margins": 0.49877631664276123, "rewards/rejected": 0.06326168775558472, "step": 1822 }, { "epoch": 0.28192538179006377, "grad_norm": 4.688449382781982, "learning_rate": 4.698453608247423e-06, "logits/chosen": 11.72197151184082, "logits/rejected": 7.280068874359131, "logps/chosen": -272.1938171386719, "logps/rejected": -223.8502960205078, "loss": 0.6593, "rewards/accuracies": 0.5, "rewards/chosen": 0.7122691869735718, "rewards/margins": 0.14574995636940002, "rewards/rejected": 0.5665192604064941, "step": 1823 }, { "epoch": 0.282080030929828, "grad_norm": 5.492824554443359, "learning_rate": 4.701030927835052e-06, "logits/chosen": 14.48817253112793, "logits/rejected": 10.518872261047363, "logps/chosen": -266.54486083984375, "logps/rejected": -227.7613525390625, "loss": 0.6416, "rewards/accuracies": 0.625, "rewards/chosen": 0.4933539628982544, "rewards/margins": 0.15884561836719513, "rewards/rejected": 0.33450832962989807, "step": 1824 }, { "epoch": 0.2822346800695921, "grad_norm": 4.968796253204346, "learning_rate": 4.7036082474226806e-06, "logits/chosen": 10.975472450256348, "logits/rejected": 7.610395431518555, "logps/chosen": -382.6282043457031, "logps/rejected": -240.187744140625, "loss": 0.5279, "rewards/accuracies": 0.75, "rewards/chosen": 0.985701858997345, "rewards/margins": 0.4036686420440674, "rewards/rejected": 0.5820331573486328, "step": 1825 }, { "epoch": 0.2823893292093563, "grad_norm": 5.8526835441589355, "learning_rate": 4.70618556701031e-06, "logits/chosen": 9.63501262664795, "logits/rejected": 5.556789398193359, "logps/chosen": -248.28903198242188, "logps/rejected": -235.29010009765625, "loss": 0.7531, "rewards/accuracies": 0.5, "rewards/chosen": 0.3147670030593872, "rewards/margins": -0.08388163149356842, "rewards/rejected": 0.39864861965179443, "step": 1826 }, { "epoch": 0.28254397834912043, "grad_norm": 4.779936790466309, "learning_rate": 4.708762886597938e-06, "logits/chosen": 6.816708564758301, "logits/rejected": 9.040908813476562, "logps/chosen": -182.03797912597656, "logps/rejected": -177.436279296875, "loss": 0.7321, "rewards/accuracies": 0.25, "rewards/chosen": 0.4913061261177063, "rewards/margins": -0.052229247987270355, "rewards/rejected": 0.5435353517532349, "step": 1827 }, { "epoch": 0.2826986274888846, "grad_norm": 5.755284786224365, "learning_rate": 4.711340206185568e-06, "logits/chosen": 8.66589641571045, "logits/rejected": 8.729596138000488, "logps/chosen": -295.18707275390625, "logps/rejected": -263.3929748535156, "loss": 0.6528, "rewards/accuracies": 0.5, "rewards/chosen": 0.40980178117752075, "rewards/margins": 0.1322551816701889, "rewards/rejected": 0.27754661440849304, "step": 1828 }, { "epoch": 0.28285327662864873, "grad_norm": 8.393353462219238, "learning_rate": 4.713917525773196e-06, "logits/chosen": 2.3168864250183105, "logits/rejected": 2.603562593460083, "logps/chosen": -203.56890869140625, "logps/rejected": -175.8870086669922, "loss": 0.7487, "rewards/accuracies": 0.5, "rewards/chosen": 0.37378817796707153, "rewards/margins": -0.08337822556495667, "rewards/rejected": 0.4571663737297058, "step": 1829 }, { "epoch": 0.28300792576841294, "grad_norm": 7.22050142288208, "learning_rate": 4.716494845360825e-06, "logits/chosen": 10.363870620727539, "logits/rejected": 8.915002822875977, "logps/chosen": -404.7255554199219, "logps/rejected": -308.6307373046875, "loss": 0.7054, "rewards/accuracies": 0.375, "rewards/chosen": 0.6660505533218384, "rewards/margins": 0.04919108748435974, "rewards/rejected": 0.6168594360351562, "step": 1830 }, { "epoch": 0.2831625749081771, "grad_norm": 7.997409820556641, "learning_rate": 4.719072164948454e-06, "logits/chosen": 7.335951805114746, "logits/rejected": 6.4761810302734375, "logps/chosen": -368.7034912109375, "logps/rejected": -321.3094482421875, "loss": 0.7032, "rewards/accuracies": 0.375, "rewards/chosen": 0.33495789766311646, "rewards/margins": 0.0901574194431305, "rewards/rejected": 0.24480049312114716, "step": 1831 }, { "epoch": 0.28331722404794124, "grad_norm": 3.9359004497528076, "learning_rate": 4.721649484536083e-06, "logits/chosen": 8.565652847290039, "logits/rejected": 7.981936931610107, "logps/chosen": -174.45608520507812, "logps/rejected": -187.18368530273438, "loss": 0.5904, "rewards/accuracies": 0.625, "rewards/chosen": 0.6424105167388916, "rewards/margins": 0.2401329129934311, "rewards/rejected": 0.4022775888442993, "step": 1832 }, { "epoch": 0.2834718731877054, "grad_norm": 3.5301573276519775, "learning_rate": 4.7242268041237115e-06, "logits/chosen": 14.39261531829834, "logits/rejected": 6.3522047996521, "logps/chosen": -205.04884338378906, "logps/rejected": -107.8465576171875, "loss": 0.5299, "rewards/accuracies": 0.75, "rewards/chosen": 0.5573626756668091, "rewards/margins": 0.4881877303123474, "rewards/rejected": 0.06917493045330048, "step": 1833 }, { "epoch": 0.28362652232746954, "grad_norm": 5.176958084106445, "learning_rate": 4.726804123711341e-06, "logits/chosen": 10.465843200683594, "logits/rejected": 10.067968368530273, "logps/chosen": -261.00506591796875, "logps/rejected": -348.6839599609375, "loss": 0.6793, "rewards/accuracies": 0.625, "rewards/chosen": 0.563470721244812, "rewards/margins": 0.05134439468383789, "rewards/rejected": 0.5121262669563293, "step": 1834 }, { "epoch": 0.2837811714672337, "grad_norm": 6.1779327392578125, "learning_rate": 4.729381443298969e-06, "logits/chosen": 11.66494369506836, "logits/rejected": 8.29582691192627, "logps/chosen": -332.11004638671875, "logps/rejected": -339.4618835449219, "loss": 0.623, "rewards/accuracies": 0.625, "rewards/chosen": 0.7681221961975098, "rewards/margins": 0.1700955480337143, "rewards/rejected": 0.5980266332626343, "step": 1835 }, { "epoch": 0.28393582060699785, "grad_norm": 4.645990371704102, "learning_rate": 4.731958762886599e-06, "logits/chosen": 10.935059547424316, "logits/rejected": 8.363554000854492, "logps/chosen": -213.8449249267578, "logps/rejected": -161.9983673095703, "loss": 0.6058, "rewards/accuracies": 0.625, "rewards/chosen": 0.28287580609321594, "rewards/margins": 0.2302890419960022, "rewards/rejected": 0.05258675664663315, "step": 1836 }, { "epoch": 0.28409046974676205, "grad_norm": 6.875711917877197, "learning_rate": 4.734536082474227e-06, "logits/chosen": 15.596538543701172, "logits/rejected": 12.098448753356934, "logps/chosen": -333.8212890625, "logps/rejected": -198.9110565185547, "loss": 0.8139, "rewards/accuracies": 0.375, "rewards/chosen": 0.32294559478759766, "rewards/margins": -0.16767026484012604, "rewards/rejected": 0.4906158745288849, "step": 1837 }, { "epoch": 0.2842451188865262, "grad_norm": 6.2596282958984375, "learning_rate": 4.7371134020618555e-06, "logits/chosen": 9.050671577453613, "logits/rejected": 7.256229877471924, "logps/chosen": -256.604248046875, "logps/rejected": -235.84396362304688, "loss": 0.7985, "rewards/accuracies": 0.375, "rewards/chosen": 0.36071914434432983, "rewards/margins": -0.14179196953773499, "rewards/rejected": 0.5025111436843872, "step": 1838 }, { "epoch": 0.28439976802629036, "grad_norm": 4.4672980308532715, "learning_rate": 4.739690721649485e-06, "logits/chosen": 11.085832595825195, "logits/rejected": 6.049576759338379, "logps/chosen": -252.97964477539062, "logps/rejected": -170.05857849121094, "loss": 0.6416, "rewards/accuracies": 0.875, "rewards/chosen": 0.6068791747093201, "rewards/margins": 0.1697457730770111, "rewards/rejected": 0.43713343143463135, "step": 1839 }, { "epoch": 0.2845544171660545, "grad_norm": 5.493722915649414, "learning_rate": 4.742268041237113e-06, "logits/chosen": 10.334281921386719, "logits/rejected": 11.191232681274414, "logps/chosen": -214.6047821044922, "logps/rejected": -284.8372802734375, "loss": 0.6452, "rewards/accuracies": 0.5, "rewards/chosen": 0.2926071286201477, "rewards/margins": 0.15418806672096252, "rewards/rejected": 0.13841906189918518, "step": 1840 }, { "epoch": 0.28470906630581866, "grad_norm": 6.68534517288208, "learning_rate": 4.7448453608247425e-06, "logits/chosen": 6.496179580688477, "logits/rejected": 11.83876895904541, "logps/chosen": -270.593017578125, "logps/rejected": -339.4285888671875, "loss": 0.5293, "rewards/accuracies": 1.0, "rewards/chosen": 0.6822841763496399, "rewards/margins": 0.3938106596469879, "rewards/rejected": 0.288473516702652, "step": 1841 }, { "epoch": 0.2848637154455828, "grad_norm": 5.296319961547852, "learning_rate": 4.747422680412371e-06, "logits/chosen": 9.883523941040039, "logits/rejected": 5.976771354675293, "logps/chosen": -202.47079467773438, "logps/rejected": -153.80572509765625, "loss": 0.7006, "rewards/accuracies": 0.375, "rewards/chosen": 0.5124492645263672, "rewards/margins": 0.010732997208833694, "rewards/rejected": 0.5017163157463074, "step": 1842 }, { "epoch": 0.285018364585347, "grad_norm": 4.896380424499512, "learning_rate": 4.75e-06, "logits/chosen": 11.31805419921875, "logits/rejected": 7.822057247161865, "logps/chosen": -185.64405822753906, "logps/rejected": -175.93832397460938, "loss": 0.6689, "rewards/accuracies": 0.625, "rewards/chosen": 0.2248258739709854, "rewards/margins": 0.09107781946659088, "rewards/rejected": 0.13374805450439453, "step": 1843 }, { "epoch": 0.28517301372511117, "grad_norm": 4.91972541809082, "learning_rate": 4.752577319587629e-06, "logits/chosen": 8.028700828552246, "logits/rejected": 2.651580810546875, "logps/chosen": -279.5573425292969, "logps/rejected": -196.71580505371094, "loss": 0.63, "rewards/accuracies": 0.625, "rewards/chosen": 0.5947310924530029, "rewards/margins": 0.2853817939758301, "rewards/rejected": 0.30934929847717285, "step": 1844 }, { "epoch": 0.2853276628648753, "grad_norm": 5.742367267608643, "learning_rate": 4.755154639175258e-06, "logits/chosen": 9.682886123657227, "logits/rejected": 13.92170238494873, "logps/chosen": -242.05584716796875, "logps/rejected": -283.5407409667969, "loss": 0.7506, "rewards/accuracies": 0.5, "rewards/chosen": 0.5419736504554749, "rewards/margins": -0.02542208880186081, "rewards/rejected": 0.5673957467079163, "step": 1845 }, { "epoch": 0.28548231200463947, "grad_norm": 4.9652323722839355, "learning_rate": 4.7577319587628865e-06, "logits/chosen": 6.738117694854736, "logits/rejected": 4.107222557067871, "logps/chosen": -251.4000701904297, "logps/rejected": -215.6869354248047, "loss": 0.5736, "rewards/accuracies": 0.75, "rewards/chosen": 0.4638819098472595, "rewards/margins": 0.2936471104621887, "rewards/rejected": 0.1702347695827484, "step": 1846 }, { "epoch": 0.2856369611444036, "grad_norm": 3.9702770709991455, "learning_rate": 4.760309278350516e-06, "logits/chosen": 10.768385887145996, "logits/rejected": 8.962892532348633, "logps/chosen": -194.072998046875, "logps/rejected": -190.41769409179688, "loss": 0.6338, "rewards/accuracies": 0.375, "rewards/chosen": 0.39902105927467346, "rewards/margins": 0.17166250944137573, "rewards/rejected": 0.22735854983329773, "step": 1847 }, { "epoch": 0.2857916102841678, "grad_norm": 5.993771076202393, "learning_rate": 4.762886597938144e-06, "logits/chosen": 7.632426738739014, "logits/rejected": 12.546123504638672, "logps/chosen": -137.77102661132812, "logps/rejected": -260.28582763671875, "loss": 0.6888, "rewards/accuracies": 0.5, "rewards/chosen": 0.49250122904777527, "rewards/margins": 0.11576317250728607, "rewards/rejected": 0.3767380714416504, "step": 1848 }, { "epoch": 0.285946259423932, "grad_norm": 5.581829071044922, "learning_rate": 4.7654639175257735e-06, "logits/chosen": 7.275015830993652, "logits/rejected": 7.232879161834717, "logps/chosen": -192.13311767578125, "logps/rejected": -218.48243713378906, "loss": 0.7322, "rewards/accuracies": 0.5, "rewards/chosen": 0.37595364451408386, "rewards/margins": -0.05832003802061081, "rewards/rejected": 0.4342736601829529, "step": 1849 }, { "epoch": 0.28610090856369613, "grad_norm": 6.8089704513549805, "learning_rate": 4.768041237113403e-06, "logits/chosen": 7.087652683258057, "logits/rejected": 6.997608661651611, "logps/chosen": -263.650634765625, "logps/rejected": -193.8986053466797, "loss": 0.7268, "rewards/accuracies": 0.625, "rewards/chosen": 0.5429936647415161, "rewards/margins": 0.028792712837457657, "rewards/rejected": 0.5142009258270264, "step": 1850 }, { "epoch": 0.2862555577034603, "grad_norm": 7.340090751647949, "learning_rate": 4.770618556701031e-06, "logits/chosen": 4.927595615386963, "logits/rejected": 3.9277572631835938, "logps/chosen": -224.3555908203125, "logps/rejected": -273.0819091796875, "loss": 0.7241, "rewards/accuracies": 0.5, "rewards/chosen": 0.6165887713432312, "rewards/margins": 0.16151177883148193, "rewards/rejected": 0.4550769627094269, "step": 1851 }, { "epoch": 0.28641020684322444, "grad_norm": 5.625838279724121, "learning_rate": 4.7731958762886605e-06, "logits/chosen": 8.615274429321289, "logits/rejected": 9.036361694335938, "logps/chosen": -275.8540954589844, "logps/rejected": -293.46234130859375, "loss": 0.6642, "rewards/accuracies": 0.625, "rewards/chosen": 0.34709426760673523, "rewards/margins": 0.1716088503599167, "rewards/rejected": 0.17548543214797974, "step": 1852 }, { "epoch": 0.2865648559829886, "grad_norm": 6.167919158935547, "learning_rate": 4.775773195876289e-06, "logits/chosen": 12.999015808105469, "logits/rejected": 6.3859992027282715, "logps/chosen": -378.6021728515625, "logps/rejected": -347.91046142578125, "loss": 0.5097, "rewards/accuracies": 1.0, "rewards/chosen": 0.8138599395751953, "rewards/margins": 0.42903411388397217, "rewards/rejected": 0.38482582569122314, "step": 1853 }, { "epoch": 0.28671950512275274, "grad_norm": 5.481585502624512, "learning_rate": 4.778350515463918e-06, "logits/chosen": 7.518723964691162, "logits/rejected": 3.4797585010528564, "logps/chosen": -199.45236206054688, "logps/rejected": -189.0595703125, "loss": 0.757, "rewards/accuracies": 0.375, "rewards/chosen": 0.13073891401290894, "rewards/margins": 0.033858686685562134, "rewards/rejected": 0.0968802273273468, "step": 1854 }, { "epoch": 0.2868741542625169, "grad_norm": 4.221721649169922, "learning_rate": 4.780927835051547e-06, "logits/chosen": 11.214179039001465, "logits/rejected": 5.374087333679199, "logps/chosen": -234.01914978027344, "logps/rejected": -249.1956329345703, "loss": 0.5659, "rewards/accuracies": 0.625, "rewards/chosen": 0.8257064819335938, "rewards/margins": 0.4029068946838379, "rewards/rejected": 0.42279961705207825, "step": 1855 }, { "epoch": 0.2870288034022811, "grad_norm": 3.967857599258423, "learning_rate": 4.783505154639176e-06, "logits/chosen": 13.628289222717285, "logits/rejected": 5.328762054443359, "logps/chosen": -294.2099609375, "logps/rejected": -188.9038543701172, "loss": 0.588, "rewards/accuracies": 0.5, "rewards/chosen": 0.807749330997467, "rewards/margins": 0.36114585399627686, "rewards/rejected": 0.4466035068035126, "step": 1856 }, { "epoch": 0.28718345254204525, "grad_norm": 5.563789367675781, "learning_rate": 4.7860824742268045e-06, "logits/chosen": 9.868415832519531, "logits/rejected": 11.074239730834961, "logps/chosen": -306.042236328125, "logps/rejected": -333.7354736328125, "loss": 0.6442, "rewards/accuracies": 0.625, "rewards/chosen": 0.5254322290420532, "rewards/margins": 0.25586453080177307, "rewards/rejected": 0.26956766843795776, "step": 1857 }, { "epoch": 0.2873381016818094, "grad_norm": 6.030600547790527, "learning_rate": 4.788659793814434e-06, "logits/chosen": 7.580175399780273, "logits/rejected": 0.6681503057479858, "logps/chosen": -274.1636962890625, "logps/rejected": -180.44439697265625, "loss": 0.6774, "rewards/accuracies": 0.5, "rewards/chosen": 0.3951334059238434, "rewards/margins": 0.0551123172044754, "rewards/rejected": 0.3400210738182068, "step": 1858 }, { "epoch": 0.28749275082157355, "grad_norm": 4.292391300201416, "learning_rate": 4.791237113402062e-06, "logits/chosen": 6.318345069885254, "logits/rejected": 3.2153501510620117, "logps/chosen": -246.58587646484375, "logps/rejected": -190.0252685546875, "loss": 0.6109, "rewards/accuracies": 0.625, "rewards/chosen": 0.49242615699768066, "rewards/margins": 0.2143019437789917, "rewards/rejected": 0.27812421321868896, "step": 1859 }, { "epoch": 0.2876473999613377, "grad_norm": 4.7412872314453125, "learning_rate": 4.7938144329896915e-06, "logits/chosen": 10.898558616638184, "logits/rejected": 8.342809677124023, "logps/chosen": -209.5616455078125, "logps/rejected": -228.09487915039062, "loss": 0.6185, "rewards/accuracies": 0.75, "rewards/chosen": 0.7684593200683594, "rewards/margins": 0.1832033097743988, "rewards/rejected": 0.5852559804916382, "step": 1860 }, { "epoch": 0.28780204910110185, "grad_norm": 5.757569313049316, "learning_rate": 4.79639175257732e-06, "logits/chosen": 8.592177391052246, "logits/rejected": 9.75408935546875, "logps/chosen": -274.6553955078125, "logps/rejected": -263.5838317871094, "loss": 0.5883, "rewards/accuracies": 0.75, "rewards/chosen": 0.5710976123809814, "rewards/margins": 0.2770535349845886, "rewards/rejected": 0.2940441071987152, "step": 1861 }, { "epoch": 0.28795669824086606, "grad_norm": 4.772401809692383, "learning_rate": 4.798969072164949e-06, "logits/chosen": 8.272906303405762, "logits/rejected": 9.007293701171875, "logps/chosen": -242.4165496826172, "logps/rejected": -223.509521484375, "loss": 0.6601, "rewards/accuracies": 0.5, "rewards/chosen": 0.6363809108734131, "rewards/margins": 0.11577824503183365, "rewards/rejected": 0.5206026434898376, "step": 1862 }, { "epoch": 0.2881113473806302, "grad_norm": 10.107269287109375, "learning_rate": 4.801546391752578e-06, "logits/chosen": 8.460515975952148, "logits/rejected": 12.47944450378418, "logps/chosen": -233.12344360351562, "logps/rejected": -396.3954772949219, "loss": 0.6955, "rewards/accuracies": 0.625, "rewards/chosen": 0.6309150457382202, "rewards/margins": 0.03176078945398331, "rewards/rejected": 0.5991542339324951, "step": 1863 }, { "epoch": 0.28826599652039436, "grad_norm": 5.095467567443848, "learning_rate": 4.804123711340207e-06, "logits/chosen": 12.573497772216797, "logits/rejected": 9.259232521057129, "logps/chosen": -263.8731689453125, "logps/rejected": -244.2327880859375, "loss": 0.6588, "rewards/accuracies": 0.5, "rewards/chosen": 0.7351876497268677, "rewards/margins": 0.18546277284622192, "rewards/rejected": 0.5497248768806458, "step": 1864 }, { "epoch": 0.2884206456601585, "grad_norm": 6.954251289367676, "learning_rate": 4.8067010309278354e-06, "logits/chosen": 3.989964008331299, "logits/rejected": 2.697913885116577, "logps/chosen": -398.9283752441406, "logps/rejected": -371.2256774902344, "loss": 0.8019, "rewards/accuracies": 0.375, "rewards/chosen": 0.6110036969184875, "rewards/margins": -0.13032497465610504, "rewards/rejected": 0.7413287162780762, "step": 1865 }, { "epoch": 0.28857529479992267, "grad_norm": 12.53165054321289, "learning_rate": 4.809278350515465e-06, "logits/chosen": 10.261015892028809, "logits/rejected": 5.023041725158691, "logps/chosen": -328.63525390625, "logps/rejected": -256.99017333984375, "loss": 0.6777, "rewards/accuracies": 0.375, "rewards/chosen": 0.76494300365448, "rewards/margins": 0.14583933353424072, "rewards/rejected": 0.6191036701202393, "step": 1866 }, { "epoch": 0.2887299439396868, "grad_norm": 8.203725814819336, "learning_rate": 4.811855670103093e-06, "logits/chosen": 11.475428581237793, "logits/rejected": 14.639174461364746, "logps/chosen": -224.88278198242188, "logps/rejected": -345.712646484375, "loss": 0.7935, "rewards/accuracies": 0.375, "rewards/chosen": 0.7569916844367981, "rewards/margins": -0.15340301394462585, "rewards/rejected": 0.9103946685791016, "step": 1867 }, { "epoch": 0.28888459307945097, "grad_norm": 5.280242443084717, "learning_rate": 4.8144329896907225e-06, "logits/chosen": 9.474067687988281, "logits/rejected": 5.182483196258545, "logps/chosen": -268.6368408203125, "logps/rejected": -176.23895263671875, "loss": 0.6656, "rewards/accuracies": 0.625, "rewards/chosen": 0.7429871559143066, "rewards/margins": 0.15090298652648926, "rewards/rejected": 0.5920841097831726, "step": 1868 }, { "epoch": 0.2890392422192152, "grad_norm": 6.1685357093811035, "learning_rate": 4.817010309278351e-06, "logits/chosen": 7.478416442871094, "logits/rejected": 5.823670864105225, "logps/chosen": -300.9277038574219, "logps/rejected": -227.3800506591797, "loss": 0.672, "rewards/accuracies": 0.75, "rewards/chosen": 0.6526117324829102, "rewards/margins": 0.08100330829620361, "rewards/rejected": 0.5716084241867065, "step": 1869 }, { "epoch": 0.2891938913589793, "grad_norm": 4.832686901092529, "learning_rate": 4.81958762886598e-06, "logits/chosen": 9.126526832580566, "logits/rejected": 7.351092338562012, "logps/chosen": -213.6137237548828, "logps/rejected": -182.9091339111328, "loss": 0.6143, "rewards/accuracies": 0.875, "rewards/chosen": 0.32643720507621765, "rewards/margins": 0.24625521898269653, "rewards/rejected": 0.08018197864294052, "step": 1870 }, { "epoch": 0.2893485404987435, "grad_norm": 4.023343086242676, "learning_rate": 4.822164948453609e-06, "logits/chosen": 10.134033203125, "logits/rejected": 10.816076278686523, "logps/chosen": -245.91146850585938, "logps/rejected": -273.0650939941406, "loss": 0.4692, "rewards/accuracies": 0.75, "rewards/chosen": 0.7666264772415161, "rewards/margins": 0.5872600078582764, "rewards/rejected": 0.17936649918556213, "step": 1871 }, { "epoch": 0.28950318963850763, "grad_norm": 5.908870220184326, "learning_rate": 4.824742268041238e-06, "logits/chosen": 4.060463905334473, "logits/rejected": 5.546267986297607, "logps/chosen": -270.0245361328125, "logps/rejected": -274.52679443359375, "loss": 0.684, "rewards/accuracies": 0.75, "rewards/chosen": 0.46871280670166016, "rewards/margins": 0.10605449229478836, "rewards/rejected": 0.3626583218574524, "step": 1872 }, { "epoch": 0.2896578387782718, "grad_norm": 8.977509498596191, "learning_rate": 4.827319587628866e-06, "logits/chosen": 7.3169145584106445, "logits/rejected": 7.2450175285339355, "logps/chosen": -226.78680419921875, "logps/rejected": -215.60064697265625, "loss": 0.682, "rewards/accuracies": 0.375, "rewards/chosen": 0.4504333436489105, "rewards/margins": 0.06812896579504013, "rewards/rejected": 0.3823044002056122, "step": 1873 }, { "epoch": 0.28981248791803593, "grad_norm": 6.160987854003906, "learning_rate": 4.829896907216496e-06, "logits/chosen": 10.08566665649414, "logits/rejected": 15.144140243530273, "logps/chosen": -248.6840362548828, "logps/rejected": -341.2016296386719, "loss": 0.7478, "rewards/accuracies": 0.625, "rewards/chosen": 0.6291478872299194, "rewards/margins": -0.07631361484527588, "rewards/rejected": 0.7054615020751953, "step": 1874 }, { "epoch": 0.28996713705780014, "grad_norm": 14.102324485778809, "learning_rate": 4.832474226804124e-06, "logits/chosen": 7.883795738220215, "logits/rejected": 9.165726661682129, "logps/chosen": -233.27447509765625, "logps/rejected": -258.0277404785156, "loss": 0.9088, "rewards/accuracies": 0.25, "rewards/chosen": 0.5611255764961243, "rewards/margins": -0.3357929289340973, "rewards/rejected": 0.8969184756278992, "step": 1875 }, { "epoch": 0.2901217861975643, "grad_norm": 4.813839912414551, "learning_rate": 4.835051546391753e-06, "logits/chosen": 13.958651542663574, "logits/rejected": 1.5933709144592285, "logps/chosen": -415.9864807128906, "logps/rejected": -271.37457275390625, "loss": 0.4467, "rewards/accuracies": 0.875, "rewards/chosen": 0.8756765127182007, "rewards/margins": 0.6316781044006348, "rewards/rejected": 0.2439984381198883, "step": 1876 }, { "epoch": 0.29027643533732844, "grad_norm": 4.900857448577881, "learning_rate": 4.837628865979382e-06, "logits/chosen": 12.156620025634766, "logits/rejected": 8.674437522888184, "logps/chosen": -307.1737060546875, "logps/rejected": -233.92822265625, "loss": 0.5752, "rewards/accuracies": 0.625, "rewards/chosen": 0.7308698296546936, "rewards/margins": 0.3228812515735626, "rewards/rejected": 0.407988578081131, "step": 1877 }, { "epoch": 0.2904310844770926, "grad_norm": 9.24219799041748, "learning_rate": 4.84020618556701e-06, "logits/chosen": 3.6516213417053223, "logits/rejected": 3.7382593154907227, "logps/chosen": -181.85975646972656, "logps/rejected": -220.06683349609375, "loss": 0.7359, "rewards/accuracies": 0.25, "rewards/chosen": 0.41353029012680054, "rewards/margins": -0.05325116962194443, "rewards/rejected": 0.4667814373970032, "step": 1878 }, { "epoch": 0.29058573361685675, "grad_norm": 5.075562477111816, "learning_rate": 4.84278350515464e-06, "logits/chosen": 7.875941276550293, "logits/rejected": 4.466923713684082, "logps/chosen": -266.3796691894531, "logps/rejected": -259.0067138671875, "loss": 0.5886, "rewards/accuracies": 0.625, "rewards/chosen": 0.390921026468277, "rewards/margins": 0.2779941260814667, "rewards/rejected": 0.11292685568332672, "step": 1879 }, { "epoch": 0.2907403827566209, "grad_norm": 5.127997398376465, "learning_rate": 4.845360824742268e-06, "logits/chosen": 7.927471160888672, "logits/rejected": 7.130305290222168, "logps/chosen": -258.9188232421875, "logps/rejected": -238.37164306640625, "loss": 0.6294, "rewards/accuracies": 0.75, "rewards/chosen": 0.6842255592346191, "rewards/margins": 0.17241010069847107, "rewards/rejected": 0.5118154287338257, "step": 1880 }, { "epoch": 0.2908950318963851, "grad_norm": 10.512694358825684, "learning_rate": 4.847938144329897e-06, "logits/chosen": 9.732919692993164, "logits/rejected": 5.461235523223877, "logps/chosen": -250.4842987060547, "logps/rejected": -300.2943420410156, "loss": 0.9948, "rewards/accuracies": 0.125, "rewards/chosen": 0.21167299151420593, "rewards/margins": -0.4778497517108917, "rewards/rejected": 0.6895227432250977, "step": 1881 }, { "epoch": 0.29104968103614925, "grad_norm": 7.691608905792236, "learning_rate": 4.850515463917526e-06, "logits/chosen": 12.284557342529297, "logits/rejected": 9.69810962677002, "logps/chosen": -337.4669494628906, "logps/rejected": -307.0381774902344, "loss": 0.7999, "rewards/accuracies": 0.5, "rewards/chosen": 0.49800485372543335, "rewards/margins": -0.14114035665988922, "rewards/rejected": 0.6391451954841614, "step": 1882 }, { "epoch": 0.2912043301759134, "grad_norm": 4.9191813468933105, "learning_rate": 4.853092783505155e-06, "logits/chosen": 6.480351448059082, "logits/rejected": 7.531797409057617, "logps/chosen": -363.82794189453125, "logps/rejected": -362.02984619140625, "loss": 0.4703, "rewards/accuracies": 0.875, "rewards/chosen": 0.7637132406234741, "rewards/margins": 0.538051962852478, "rewards/rejected": 0.2256612479686737, "step": 1883 }, { "epoch": 0.29135897931567756, "grad_norm": 6.706927299499512, "learning_rate": 4.855670103092784e-06, "logits/chosen": 7.001772880554199, "logits/rejected": 8.997282028198242, "logps/chosen": -242.09133911132812, "logps/rejected": -302.67266845703125, "loss": 0.6685, "rewards/accuracies": 0.5, "rewards/chosen": 0.3338572680950165, "rewards/margins": 0.17152796685695648, "rewards/rejected": 0.16232930123806, "step": 1884 }, { "epoch": 0.2915136284554417, "grad_norm": 4.6081109046936035, "learning_rate": 4.858247422680413e-06, "logits/chosen": 10.130657196044922, "logits/rejected": 7.838085651397705, "logps/chosen": -268.6007080078125, "logps/rejected": -249.39320373535156, "loss": 0.6622, "rewards/accuracies": 0.5, "rewards/chosen": 0.5536366701126099, "rewards/margins": 0.0809537023305893, "rewards/rejected": 0.4726829528808594, "step": 1885 }, { "epoch": 0.29166827759520586, "grad_norm": 5.161870002746582, "learning_rate": 4.860824742268041e-06, "logits/chosen": 8.898221969604492, "logits/rejected": 8.204389572143555, "logps/chosen": -246.04754638671875, "logps/rejected": -285.5431823730469, "loss": 0.6054, "rewards/accuracies": 0.625, "rewards/chosen": 0.5714199542999268, "rewards/margins": 0.24117514491081238, "rewards/rejected": 0.33024483919143677, "step": 1886 }, { "epoch": 0.29182292673497, "grad_norm": 5.844018936157227, "learning_rate": 4.863402061855671e-06, "logits/chosen": 2.492987632751465, "logits/rejected": 2.259042501449585, "logps/chosen": -177.7725830078125, "logps/rejected": -200.97930908203125, "loss": 0.7019, "rewards/accuracies": 0.625, "rewards/chosen": 0.3963301479816437, "rewards/margins": 0.000283166766166687, "rewards/rejected": 0.3960469961166382, "step": 1887 }, { "epoch": 0.2919775758747342, "grad_norm": 6.1057000160217285, "learning_rate": 4.865979381443299e-06, "logits/chosen": 11.475907325744629, "logits/rejected": 4.0576653480529785, "logps/chosen": -369.1065673828125, "logps/rejected": -285.66680908203125, "loss": 0.4942, "rewards/accuracies": 0.75, "rewards/chosen": 1.0629470348358154, "rewards/margins": 0.6337527632713318, "rewards/rejected": 0.4291943311691284, "step": 1888 }, { "epoch": 0.29213222501449837, "grad_norm": 7.132784843444824, "learning_rate": 4.868556701030928e-06, "logits/chosen": 8.809747695922852, "logits/rejected": 8.460738182067871, "logps/chosen": -268.1192626953125, "logps/rejected": -250.61138916015625, "loss": 0.6827, "rewards/accuracies": 0.75, "rewards/chosen": 0.5380210876464844, "rewards/margins": 0.03575601428747177, "rewards/rejected": 0.5022650957107544, "step": 1889 }, { "epoch": 0.2922868741542625, "grad_norm": 6.852219104766846, "learning_rate": 4.871134020618557e-06, "logits/chosen": 4.585185527801514, "logits/rejected": 6.711069583892822, "logps/chosen": -231.81878662109375, "logps/rejected": -285.2042236328125, "loss": 0.8898, "rewards/accuracies": 0.25, "rewards/chosen": 0.20117750763893127, "rewards/margins": -0.3282470703125, "rewards/rejected": 0.5294245481491089, "step": 1890 }, { "epoch": 0.2924415232940267, "grad_norm": 5.409084796905518, "learning_rate": 4.873711340206186e-06, "logits/chosen": 9.516378402709961, "logits/rejected": 6.283413410186768, "logps/chosen": -245.01898193359375, "logps/rejected": -178.39122009277344, "loss": 0.6912, "rewards/accuracies": 0.625, "rewards/chosen": 0.4946885406970978, "rewards/margins": 0.02823571488261223, "rewards/rejected": 0.46645283699035645, "step": 1891 }, { "epoch": 0.2925961724337908, "grad_norm": 7.7536540031433105, "learning_rate": 4.8762886597938146e-06, "logits/chosen": 10.342660903930664, "logits/rejected": 12.251018524169922, "logps/chosen": -301.5098876953125, "logps/rejected": -310.0304870605469, "loss": 0.6272, "rewards/accuracies": 0.625, "rewards/chosen": 0.42624300718307495, "rewards/margins": 0.20878592133522034, "rewards/rejected": 0.2174571007490158, "step": 1892 }, { "epoch": 0.292750821573555, "grad_norm": 6.0120768547058105, "learning_rate": 4.878865979381444e-06, "logits/chosen": 2.740140199661255, "logits/rejected": 7.8863372802734375, "logps/chosen": -238.9498748779297, "logps/rejected": -266.8082580566406, "loss": 0.8034, "rewards/accuracies": 0.5, "rewards/chosen": 0.48856353759765625, "rewards/margins": -0.156171977519989, "rewards/rejected": 0.6447355151176453, "step": 1893 }, { "epoch": 0.2929054707133192, "grad_norm": 5.141645908355713, "learning_rate": 4.881443298969072e-06, "logits/chosen": 11.683637619018555, "logits/rejected": 5.295614242553711, "logps/chosen": -350.488525390625, "logps/rejected": -293.3786926269531, "loss": 0.6445, "rewards/accuracies": 0.625, "rewards/chosen": 0.3900063633918762, "rewards/margins": 0.25055134296417236, "rewards/rejected": 0.13945499062538147, "step": 1894 }, { "epoch": 0.29306011985308333, "grad_norm": 7.291069984436035, "learning_rate": 4.884020618556702e-06, "logits/chosen": 5.5757598876953125, "logits/rejected": 7.077906608581543, "logps/chosen": -228.49021911621094, "logps/rejected": -351.3414306640625, "loss": 0.8233, "rewards/accuracies": 0.5, "rewards/chosen": 0.13726310431957245, "rewards/margins": -0.2067205309867859, "rewards/rejected": 0.34398362040519714, "step": 1895 }, { "epoch": 0.2932147689928475, "grad_norm": 4.559737205505371, "learning_rate": 4.88659793814433e-06, "logits/chosen": 9.19395923614502, "logits/rejected": 6.853704452514648, "logps/chosen": -184.25628662109375, "logps/rejected": -172.0687255859375, "loss": 0.6758, "rewards/accuracies": 0.5, "rewards/chosen": 0.3070237338542938, "rewards/margins": 0.07533854991197586, "rewards/rejected": 0.23168519139289856, "step": 1896 }, { "epoch": 0.29336941813261164, "grad_norm": 5.506324768066406, "learning_rate": 4.889175257731959e-06, "logits/chosen": 11.05807113647461, "logits/rejected": 11.027753829956055, "logps/chosen": -261.2674865722656, "logps/rejected": -253.5176239013672, "loss": 0.6262, "rewards/accuracies": 0.5, "rewards/chosen": 0.4731980860233307, "rewards/margins": 0.2658166289329529, "rewards/rejected": 0.2073814421892166, "step": 1897 }, { "epoch": 0.2935240672723758, "grad_norm": 4.869597434997559, "learning_rate": 4.891752577319588e-06, "logits/chosen": 6.269387245178223, "logits/rejected": -1.3710306882858276, "logps/chosen": -396.888916015625, "logps/rejected": -213.6097412109375, "loss": 0.5333, "rewards/accuracies": 0.875, "rewards/chosen": 0.6167778372764587, "rewards/margins": 0.44955456256866455, "rewards/rejected": 0.1672232747077942, "step": 1898 }, { "epoch": 0.29367871641213994, "grad_norm": 5.392108917236328, "learning_rate": 4.894329896907217e-06, "logits/chosen": 13.807971954345703, "logits/rejected": 9.324326515197754, "logps/chosen": -290.9385681152344, "logps/rejected": -183.27549743652344, "loss": 0.7525, "rewards/accuracies": 0.375, "rewards/chosen": 0.24965815246105194, "rewards/margins": -0.06177006661891937, "rewards/rejected": 0.3114282190799713, "step": 1899 }, { "epoch": 0.2938333655519041, "grad_norm": 6.8372907638549805, "learning_rate": 4.8969072164948455e-06, "logits/chosen": 15.559110641479492, "logits/rejected": 12.51849365234375, "logps/chosen": -301.92877197265625, "logps/rejected": -255.7155303955078, "loss": 0.7602, "rewards/accuracies": 0.375, "rewards/chosen": 0.32814091444015503, "rewards/margins": -0.06982055306434631, "rewards/rejected": 0.39796149730682373, "step": 1900 }, { "epoch": 0.2939880146916683, "grad_norm": 25.758333206176758, "learning_rate": 4.899484536082475e-06, "logits/chosen": 5.710659027099609, "logits/rejected": 8.67061710357666, "logps/chosen": -101.15560913085938, "logps/rejected": -179.40164184570312, "loss": 0.7277, "rewards/accuracies": 0.5, "rewards/chosen": -0.008172348141670227, "rewards/margins": -0.021855982020497322, "rewards/rejected": 0.013683632016181946, "step": 1901 }, { "epoch": 0.29414266383143245, "grad_norm": 7.125190734863281, "learning_rate": 4.902061855670103e-06, "logits/chosen": 10.322349548339844, "logits/rejected": 5.1161346435546875, "logps/chosen": -475.27099609375, "logps/rejected": -407.8321533203125, "loss": 0.6615, "rewards/accuracies": 0.75, "rewards/chosen": 0.4830693006515503, "rewards/margins": 0.10783567279577255, "rewards/rejected": 0.3752336800098419, "step": 1902 }, { "epoch": 0.2942973129711966, "grad_norm": 5.718871593475342, "learning_rate": 4.904639175257732e-06, "logits/chosen": 11.220110893249512, "logits/rejected": 4.4469146728515625, "logps/chosen": -397.81866455078125, "logps/rejected": -423.42266845703125, "loss": 0.571, "rewards/accuracies": 0.75, "rewards/chosen": 0.5649611949920654, "rewards/margins": 0.33048421144485474, "rewards/rejected": 0.23447701334953308, "step": 1903 }, { "epoch": 0.29445196211096075, "grad_norm": 5.313636302947998, "learning_rate": 4.907216494845361e-06, "logits/chosen": 13.76280403137207, "logits/rejected": 7.907626152038574, "logps/chosen": -296.14886474609375, "logps/rejected": -302.18231201171875, "loss": 0.5519, "rewards/accuracies": 0.625, "rewards/chosen": 0.5160866975784302, "rewards/margins": 0.4192940592765808, "rewards/rejected": 0.09679260104894638, "step": 1904 }, { "epoch": 0.2946066112507249, "grad_norm": 5.146751880645752, "learning_rate": 4.9097938144329895e-06, "logits/chosen": 4.972388744354248, "logits/rejected": 8.123661994934082, "logps/chosen": -188.42562866210938, "logps/rejected": -231.59475708007812, "loss": 0.6479, "rewards/accuracies": 0.625, "rewards/chosen": 0.2599448263645172, "rewards/margins": 0.15737327933311462, "rewards/rejected": 0.10257153958082199, "step": 1905 }, { "epoch": 0.29476126039048905, "grad_norm": 4.919521331787109, "learning_rate": 4.912371134020619e-06, "logits/chosen": 9.509730339050293, "logits/rejected": 6.672048091888428, "logps/chosen": -227.66221618652344, "logps/rejected": -260.90985107421875, "loss": 0.7182, "rewards/accuracies": 0.5, "rewards/chosen": 0.5440791845321655, "rewards/margins": 0.03063793107867241, "rewards/rejected": 0.513441264629364, "step": 1906 }, { "epoch": 0.29491590953025326, "grad_norm": 6.301992416381836, "learning_rate": 4.914948453608247e-06, "logits/chosen": 10.11520004272461, "logits/rejected": 7.37161922454834, "logps/chosen": -329.0301513671875, "logps/rejected": -180.2372589111328, "loss": 0.7617, "rewards/accuracies": 0.5, "rewards/chosen": 0.19168326258659363, "rewards/margins": -0.044951409101486206, "rewards/rejected": 0.23663464188575745, "step": 1907 }, { "epoch": 0.2950705586700174, "grad_norm": 7.471937656402588, "learning_rate": 4.9175257731958765e-06, "logits/chosen": 15.103532791137695, "logits/rejected": 11.24356460571289, "logps/chosen": -305.7756652832031, "logps/rejected": -300.25408935546875, "loss": 0.4992, "rewards/accuracies": 1.0, "rewards/chosen": 0.7134256362915039, "rewards/margins": 0.472978800535202, "rewards/rejected": 0.24044686555862427, "step": 1908 }, { "epoch": 0.29522520780978156, "grad_norm": 5.336564540863037, "learning_rate": 4.920103092783505e-06, "logits/chosen": 8.325382232666016, "logits/rejected": 9.150381088256836, "logps/chosen": -185.45504760742188, "logps/rejected": -240.892333984375, "loss": 0.7211, "rewards/accuracies": 0.375, "rewards/chosen": 0.3793754577636719, "rewards/margins": 0.0637800320982933, "rewards/rejected": 0.31559544801712036, "step": 1909 }, { "epoch": 0.2953798569495457, "grad_norm": 5.9475998878479, "learning_rate": 4.922680412371135e-06, "logits/chosen": 9.271333694458008, "logits/rejected": 7.484991073608398, "logps/chosen": -305.6849670410156, "logps/rejected": -296.37451171875, "loss": 0.7072, "rewards/accuracies": 0.5, "rewards/chosen": 0.3814612925052643, "rewards/margins": 0.027897104620933533, "rewards/rejected": 0.35356417298316956, "step": 1910 }, { "epoch": 0.29553450608930987, "grad_norm": 5.959292888641357, "learning_rate": 4.9252577319587635e-06, "logits/chosen": 6.174468040466309, "logits/rejected": 1.7129467725753784, "logps/chosen": -331.75115966796875, "logps/rejected": -309.6365661621094, "loss": 0.6681, "rewards/accuracies": 0.625, "rewards/chosen": 0.6077805757522583, "rewards/margins": 0.12058689445257187, "rewards/rejected": 0.48719361424446106, "step": 1911 }, { "epoch": 0.295689155229074, "grad_norm": 5.185881614685059, "learning_rate": 4.927835051546392e-06, "logits/chosen": 14.473819732666016, "logits/rejected": 6.585643768310547, "logps/chosen": -234.30099487304688, "logps/rejected": -189.82171630859375, "loss": 0.6641, "rewards/accuracies": 0.625, "rewards/chosen": 0.3584960699081421, "rewards/margins": 0.11982384324073792, "rewards/rejected": 0.23867221176624298, "step": 1912 }, { "epoch": 0.2958438043688382, "grad_norm": 5.288536548614502, "learning_rate": 4.930412371134021e-06, "logits/chosen": 17.776643753051758, "logits/rejected": 13.918618202209473, "logps/chosen": -286.01763916015625, "logps/rejected": -267.4415283203125, "loss": 0.5566, "rewards/accuracies": 0.75, "rewards/chosen": 0.7876964807510376, "rewards/margins": 0.341988205909729, "rewards/rejected": 0.4457082748413086, "step": 1913 }, { "epoch": 0.2959984535086024, "grad_norm": 8.25245475769043, "learning_rate": 4.93298969072165e-06, "logits/chosen": 5.127202987670898, "logits/rejected": 8.262370109558105, "logps/chosen": -217.8651123046875, "logps/rejected": -223.11862182617188, "loss": 0.7562, "rewards/accuracies": 0.25, "rewards/chosen": 0.240508571267128, "rewards/margins": -0.1099180281162262, "rewards/rejected": 0.3504266142845154, "step": 1914 }, { "epoch": 0.29615310264836653, "grad_norm": 4.382015705108643, "learning_rate": 4.935567010309279e-06, "logits/chosen": 2.3040614128112793, "logits/rejected": 10.225667953491211, "logps/chosen": -132.89752197265625, "logps/rejected": -221.07054138183594, "loss": 0.6706, "rewards/accuracies": 0.375, "rewards/chosen": 0.23351004719734192, "rewards/margins": 0.11221817135810852, "rewards/rejected": 0.121291883289814, "step": 1915 }, { "epoch": 0.2963077517881307, "grad_norm": 5.382826328277588, "learning_rate": 4.9381443298969075e-06, "logits/chosen": 9.483293533325195, "logits/rejected": 6.829405307769775, "logps/chosen": -416.3201904296875, "logps/rejected": -375.0426025390625, "loss": 0.5482, "rewards/accuracies": 0.625, "rewards/chosen": 0.6910586357116699, "rewards/margins": 0.46107226610183716, "rewards/rejected": 0.22998639941215515, "step": 1916 }, { "epoch": 0.29646240092789483, "grad_norm": 5.007412910461426, "learning_rate": 4.940721649484537e-06, "logits/chosen": 7.947233200073242, "logits/rejected": 3.218405246734619, "logps/chosen": -222.14443969726562, "logps/rejected": -171.01058959960938, "loss": 0.715, "rewards/accuracies": 0.5, "rewards/chosen": 0.18438078463077545, "rewards/margins": 0.0009992271661758423, "rewards/rejected": 0.18338152766227722, "step": 1917 }, { "epoch": 0.296617050067659, "grad_norm": 4.753635883331299, "learning_rate": 4.943298969072165e-06, "logits/chosen": 6.59550666809082, "logits/rejected": 8.799602508544922, "logps/chosen": -198.42910766601562, "logps/rejected": -260.1075744628906, "loss": 0.6509, "rewards/accuracies": 0.5, "rewards/chosen": 0.2855777144432068, "rewards/margins": 0.1866045594215393, "rewards/rejected": 0.09897316992282867, "step": 1918 }, { "epoch": 0.29677169920742313, "grad_norm": 5.054181098937988, "learning_rate": 4.9458762886597945e-06, "logits/chosen": 8.46769905090332, "logits/rejected": 6.968910217285156, "logps/chosen": -210.31954956054688, "logps/rejected": -207.2449951171875, "loss": 0.7131, "rewards/accuracies": 0.125, "rewards/chosen": 0.26277339458465576, "rewards/margins": -0.032272592186927795, "rewards/rejected": 0.29504600167274475, "step": 1919 }, { "epoch": 0.29692634834718734, "grad_norm": 4.9403581619262695, "learning_rate": 4.948453608247423e-06, "logits/chosen": 5.502076625823975, "logits/rejected": 5.065120220184326, "logps/chosen": -233.93626403808594, "logps/rejected": -219.66500854492188, "loss": 0.6379, "rewards/accuracies": 0.625, "rewards/chosen": 0.15127001702785492, "rewards/margins": 0.28466981649398804, "rewards/rejected": -0.1333998143672943, "step": 1920 }, { "epoch": 0.2970809974869515, "grad_norm": 5.581308841705322, "learning_rate": 4.951030927835052e-06, "logits/chosen": 8.548807144165039, "logits/rejected": 6.752242565155029, "logps/chosen": -239.69493103027344, "logps/rejected": -214.94403076171875, "loss": 0.7011, "rewards/accuracies": 0.625, "rewards/chosen": 0.33907243609428406, "rewards/margins": 0.01254080981016159, "rewards/rejected": 0.32653161883354187, "step": 1921 }, { "epoch": 0.29723564662671564, "grad_norm": 7.465334892272949, "learning_rate": 4.953608247422681e-06, "logits/chosen": 8.181343078613281, "logits/rejected": 7.318568229675293, "logps/chosen": -257.0267333984375, "logps/rejected": -235.25161743164062, "loss": 0.7066, "rewards/accuracies": 0.375, "rewards/chosen": 0.18755990266799927, "rewards/margins": 0.05862199515104294, "rewards/rejected": 0.12893790006637573, "step": 1922 }, { "epoch": 0.2973902957664798, "grad_norm": 6.766858100891113, "learning_rate": 4.95618556701031e-06, "logits/chosen": 10.215593338012695, "logits/rejected": 9.005621910095215, "logps/chosen": -339.88751220703125, "logps/rejected": -298.79998779296875, "loss": 0.8354, "rewards/accuracies": 0.5, "rewards/chosen": 0.3374425768852234, "rewards/margins": -0.14313651621341705, "rewards/rejected": 0.48057910799980164, "step": 1923 }, { "epoch": 0.29754494490624395, "grad_norm": 7.537698745727539, "learning_rate": 4.9587628865979385e-06, "logits/chosen": 3.9117486476898193, "logits/rejected": 8.305133819580078, "logps/chosen": -304.59515380859375, "logps/rejected": -311.7738037109375, "loss": 0.6832, "rewards/accuracies": 0.625, "rewards/chosen": 0.1922599971294403, "rewards/margins": 0.0949392318725586, "rewards/rejected": 0.09732075035572052, "step": 1924 }, { "epoch": 0.2976995940460081, "grad_norm": 6.216429710388184, "learning_rate": 4.961340206185568e-06, "logits/chosen": 11.453048706054688, "logits/rejected": 8.348071098327637, "logps/chosen": -244.04605102539062, "logps/rejected": -194.7714385986328, "loss": 0.6168, "rewards/accuracies": 0.5, "rewards/chosen": 0.3159163296222687, "rewards/margins": 0.2807968258857727, "rewards/rejected": 0.03511953726410866, "step": 1925 }, { "epoch": 0.2978542431857723, "grad_norm": 5.361112117767334, "learning_rate": 4.963917525773196e-06, "logits/chosen": 8.843154907226562, "logits/rejected": 6.045343399047852, "logps/chosen": -215.91107177734375, "logps/rejected": -151.71206665039062, "loss": 0.7189, "rewards/accuracies": 0.5, "rewards/chosen": 0.15566149353981018, "rewards/margins": -0.0055234357714653015, "rewards/rejected": 0.16118493676185608, "step": 1926 }, { "epoch": 0.29800889232553646, "grad_norm": 4.3612589836120605, "learning_rate": 4.9664948453608255e-06, "logits/chosen": 9.401978492736816, "logits/rejected": 4.440295219421387, "logps/chosen": -277.6397399902344, "logps/rejected": -153.89511108398438, "loss": 0.5945, "rewards/accuracies": 0.75, "rewards/chosen": 0.2461000382900238, "rewards/margins": 0.24713626503944397, "rewards/rejected": -0.001036219298839569, "step": 1927 }, { "epoch": 0.2981635414653006, "grad_norm": 6.060153484344482, "learning_rate": 4.969072164948454e-06, "logits/chosen": 3.191788673400879, "logits/rejected": 9.357587814331055, "logps/chosen": -161.32582092285156, "logps/rejected": -197.40357971191406, "loss": 0.7306, "rewards/accuracies": 0.375, "rewards/chosen": 0.21285487711429596, "rewards/margins": -0.030037857592105865, "rewards/rejected": 0.24289274215698242, "step": 1928 }, { "epoch": 0.29831819060506476, "grad_norm": 4.475562572479248, "learning_rate": 4.971649484536083e-06, "logits/chosen": 9.300079345703125, "logits/rejected": 4.895201206207275, "logps/chosen": -266.2282409667969, "logps/rejected": -223.19154357910156, "loss": 0.6439, "rewards/accuracies": 0.75, "rewards/chosen": 0.2918722629547119, "rewards/margins": 0.12380611151456833, "rewards/rejected": 0.16806615889072418, "step": 1929 }, { "epoch": 0.2984728397448289, "grad_norm": 5.8332014083862305, "learning_rate": 4.974226804123712e-06, "logits/chosen": 4.643984317779541, "logits/rejected": 5.1854248046875, "logps/chosen": -207.89447021484375, "logps/rejected": -229.30625915527344, "loss": 0.671, "rewards/accuracies": 0.625, "rewards/chosen": 0.19573122262954712, "rewards/margins": 0.0773552656173706, "rewards/rejected": 0.11837596446275711, "step": 1930 }, { "epoch": 0.29862748888459306, "grad_norm": 5.776227951049805, "learning_rate": 4.976804123711341e-06, "logits/chosen": 18.28095817565918, "logits/rejected": 6.923507213592529, "logps/chosen": -321.23638916015625, "logps/rejected": -317.8025817871094, "loss": 0.6399, "rewards/accuracies": 0.625, "rewards/chosen": 0.5662283897399902, "rewards/margins": 0.18982170522212982, "rewards/rejected": 0.3764066994190216, "step": 1931 }, { "epoch": 0.2987821380243572, "grad_norm": 6.289240837097168, "learning_rate": 4.9793814432989694e-06, "logits/chosen": 8.18239974975586, "logits/rejected": 8.755838394165039, "logps/chosen": -270.99853515625, "logps/rejected": -321.1993103027344, "loss": 0.6784, "rewards/accuracies": 0.5, "rewards/chosen": 0.31380495429039, "rewards/margins": 0.25641822814941406, "rewards/rejected": 0.05738672614097595, "step": 1932 }, { "epoch": 0.2989367871641214, "grad_norm": 4.342260837554932, "learning_rate": 4.981958762886599e-06, "logits/chosen": 11.738896369934082, "logits/rejected": -0.8477178812026978, "logps/chosen": -318.3328552246094, "logps/rejected": -168.71299743652344, "loss": 0.5549, "rewards/accuracies": 0.75, "rewards/chosen": 0.6941225528717041, "rewards/margins": 0.32181793451309204, "rewards/rejected": 0.3723045587539673, "step": 1933 }, { "epoch": 0.29909143630388557, "grad_norm": 3.701807737350464, "learning_rate": 4.984536082474227e-06, "logits/chosen": 9.444266319274902, "logits/rejected": 5.12848424911499, "logps/chosen": -149.723876953125, "logps/rejected": -140.1483154296875, "loss": 0.6138, "rewards/accuracies": 0.625, "rewards/chosen": 0.0888395830988884, "rewards/margins": 0.236131489276886, "rewards/rejected": -0.147291898727417, "step": 1934 }, { "epoch": 0.2992460854436497, "grad_norm": 6.159956455230713, "learning_rate": 4.9871134020618565e-06, "logits/chosen": 10.32263469696045, "logits/rejected": 12.635894775390625, "logps/chosen": -237.2451934814453, "logps/rejected": -269.63775634765625, "loss": 0.7376, "rewards/accuracies": 0.625, "rewards/chosen": 0.1999063491821289, "rewards/margins": -0.04331938549876213, "rewards/rejected": 0.24322575330734253, "step": 1935 }, { "epoch": 0.2994007345834139, "grad_norm": 5.250171184539795, "learning_rate": 4.989690721649485e-06, "logits/chosen": 9.438034057617188, "logits/rejected": 7.704252243041992, "logps/chosen": -328.2623291015625, "logps/rejected": -293.2724609375, "loss": 0.5856, "rewards/accuracies": 0.625, "rewards/chosen": 0.10132008790969849, "rewards/margins": 0.2781168818473816, "rewards/rejected": -0.1767968088388443, "step": 1936 }, { "epoch": 0.299555383723178, "grad_norm": 5.326220989227295, "learning_rate": 4.992268041237114e-06, "logits/chosen": 9.928227424621582, "logits/rejected": 4.256413459777832, "logps/chosen": -305.5445556640625, "logps/rejected": -260.82257080078125, "loss": 0.6563, "rewards/accuracies": 0.625, "rewards/chosen": 0.3811005651950836, "rewards/margins": 0.1830521821975708, "rewards/rejected": 0.19804838299751282, "step": 1937 }, { "epoch": 0.2997100328629422, "grad_norm": 5.169743537902832, "learning_rate": 4.994845360824743e-06, "logits/chosen": 9.099005699157715, "logits/rejected": 7.959979057312012, "logps/chosen": -203.81283569335938, "logps/rejected": -144.13304138183594, "loss": 0.6044, "rewards/accuracies": 0.875, "rewards/chosen": 0.26745331287384033, "rewards/margins": 0.2917126417160034, "rewards/rejected": -0.024259347468614578, "step": 1938 }, { "epoch": 0.2998646820027064, "grad_norm": 6.053447246551514, "learning_rate": 4.997422680412372e-06, "logits/chosen": 9.17519760131836, "logits/rejected": 8.03541374206543, "logps/chosen": -342.9123840332031, "logps/rejected": -308.0648193359375, "loss": 0.6035, "rewards/accuracies": 0.625, "rewards/chosen": 0.4306587278842926, "rewards/margins": 0.2345738410949707, "rewards/rejected": 0.1960848867893219, "step": 1939 }, { "epoch": 0.30001933114247054, "grad_norm": 10.721685409545898, "learning_rate": 5e-06, "logits/chosen": 11.93785572052002, "logits/rejected": 9.35814380645752, "logps/chosen": -360.3119201660156, "logps/rejected": -281.695068359375, "loss": 0.7666, "rewards/accuracies": 0.375, "rewards/chosen": 0.33686235547065735, "rewards/margins": -0.09953603893518448, "rewards/rejected": 0.436398446559906, "step": 1940 }, { "epoch": 0.3001739802822347, "grad_norm": 7.545107841491699, "learning_rate": 4.999713598350327e-06, "logits/chosen": 13.678266525268555, "logits/rejected": 5.557208061218262, "logps/chosen": -290.08184814453125, "logps/rejected": -289.8662414550781, "loss": 0.7128, "rewards/accuracies": 0.5, "rewards/chosen": 0.10415621101856232, "rewards/margins": 0.02573418617248535, "rewards/rejected": 0.07842201739549637, "step": 1941 }, { "epoch": 0.30032862942199884, "grad_norm": 6.301037788391113, "learning_rate": 4.999427196700654e-06, "logits/chosen": 18.108572006225586, "logits/rejected": 5.521419048309326, "logps/chosen": -401.7284851074219, "logps/rejected": -217.22152709960938, "loss": 0.6597, "rewards/accuracies": 0.625, "rewards/chosen": 0.3661457896232605, "rewards/margins": 0.10110605508089066, "rewards/rejected": 0.26503968238830566, "step": 1942 }, { "epoch": 0.300483278561763, "grad_norm": 6.519114017486572, "learning_rate": 4.99914079505098e-06, "logits/chosen": 8.60999870300293, "logits/rejected": 8.846593856811523, "logps/chosen": -323.40234375, "logps/rejected": -315.4803161621094, "loss": 0.8658, "rewards/accuracies": 0.25, "rewards/chosen": 0.3486313819885254, "rewards/margins": -0.27207040786743164, "rewards/rejected": 0.6207018494606018, "step": 1943 }, { "epoch": 0.30063792770152714, "grad_norm": 8.513544082641602, "learning_rate": 4.998854393401306e-06, "logits/chosen": 6.264193534851074, "logits/rejected": 6.085964202880859, "logps/chosen": -300.51007080078125, "logps/rejected": -305.0381774902344, "loss": 0.8527, "rewards/accuracies": 0.25, "rewards/chosen": 0.02136526256799698, "rewards/margins": -0.23535224795341492, "rewards/rejected": 0.2567175030708313, "step": 1944 }, { "epoch": 0.3007925768412913, "grad_norm": 8.875266075134277, "learning_rate": 4.998567991751633e-06, "logits/chosen": 12.945000648498535, "logits/rejected": 6.4350266456604, "logps/chosen": -488.75390625, "logps/rejected": -327.65692138671875, "loss": 0.4758, "rewards/accuracies": 1.0, "rewards/chosen": 0.7236288189888, "rewards/margins": 0.513248085975647, "rewards/rejected": 0.21038076281547546, "step": 1945 }, { "epoch": 0.3009472259810555, "grad_norm": 6.929609775543213, "learning_rate": 4.9982815901019595e-06, "logits/chosen": 9.850441932678223, "logits/rejected": 6.299242973327637, "logps/chosen": -321.49749755859375, "logps/rejected": -262.7158508300781, "loss": 0.7369, "rewards/accuracies": 0.375, "rewards/chosen": 0.2698798179626465, "rewards/margins": -0.0516773946583271, "rewards/rejected": 0.3215572237968445, "step": 1946 }, { "epoch": 0.30110187512081965, "grad_norm": 6.095489501953125, "learning_rate": 4.997995188452286e-06, "logits/chosen": 11.506305694580078, "logits/rejected": 14.566590309143066, "logps/chosen": -274.4747619628906, "logps/rejected": -304.9311218261719, "loss": 0.6408, "rewards/accuracies": 0.5, "rewards/chosen": 0.35204535722732544, "rewards/margins": 0.1680188626050949, "rewards/rejected": 0.18402647972106934, "step": 1947 }, { "epoch": 0.3012565242605838, "grad_norm": 8.11747932434082, "learning_rate": 4.997708786802613e-06, "logits/chosen": 9.627846717834473, "logits/rejected": 10.565401077270508, "logps/chosen": -280.70074462890625, "logps/rejected": -286.8978271484375, "loss": 0.9348, "rewards/accuracies": 0.25, "rewards/chosen": 0.27578890323638916, "rewards/margins": -0.39433878660202026, "rewards/rejected": 0.6701276898384094, "step": 1948 }, { "epoch": 0.30141117340034795, "grad_norm": 5.044299125671387, "learning_rate": 4.997422385152939e-06, "logits/chosen": 7.704577445983887, "logits/rejected": 11.479877471923828, "logps/chosen": -190.8633575439453, "logps/rejected": -219.56350708007812, "loss": 0.6736, "rewards/accuracies": 0.5, "rewards/chosen": 0.0736057311296463, "rewards/margins": 0.11645365506410599, "rewards/rejected": -0.042847927659749985, "step": 1949 }, { "epoch": 0.3015658225401121, "grad_norm": 5.353774070739746, "learning_rate": 4.997135983503265e-06, "logits/chosen": -0.5301303863525391, "logits/rejected": 2.583714723587036, "logps/chosen": -161.01283264160156, "logps/rejected": -195.58291625976562, "loss": 0.6867, "rewards/accuracies": 0.625, "rewards/chosen": 0.001950286328792572, "rewards/margins": 0.060441188514232635, "rewards/rejected": -0.05849090963602066, "step": 1950 }, { "epoch": 0.30172047167987626, "grad_norm": 11.192580223083496, "learning_rate": 4.996849581853592e-06, "logits/chosen": 14.546045303344727, "logits/rejected": 7.657373428344727, "logps/chosen": -268.5328674316406, "logps/rejected": -163.61904907226562, "loss": 0.6853, "rewards/accuracies": 0.375, "rewards/chosen": 0.22746187448501587, "rewards/margins": 0.06721755117177963, "rewards/rejected": 0.16024431586265564, "step": 1951 }, { "epoch": 0.30187512081964046, "grad_norm": 5.781955242156982, "learning_rate": 4.9965631802039185e-06, "logits/chosen": 5.6796345710754395, "logits/rejected": 5.839045524597168, "logps/chosen": -244.78334045410156, "logps/rejected": -272.55352783203125, "loss": 0.6228, "rewards/accuracies": 0.5, "rewards/chosen": 0.37983834743499756, "rewards/margins": 0.24405992031097412, "rewards/rejected": 0.13577842712402344, "step": 1952 }, { "epoch": 0.3020297699594046, "grad_norm": 3.289320230484009, "learning_rate": 4.996276778554245e-06, "logits/chosen": 7.486164569854736, "logits/rejected": 6.053140640258789, "logps/chosen": -139.509765625, "logps/rejected": -154.30303955078125, "loss": 0.5658, "rewards/accuracies": 0.625, "rewards/chosen": 0.3910551369190216, "rewards/margins": 0.35984504222869873, "rewards/rejected": 0.03121008723974228, "step": 1953 }, { "epoch": 0.30218441909916877, "grad_norm": 6.5685882568359375, "learning_rate": 4.995990376904572e-06, "logits/chosen": 6.637234687805176, "logits/rejected": 4.8490309715271, "logps/chosen": -253.0272979736328, "logps/rejected": -243.67648315429688, "loss": 0.8978, "rewards/accuracies": 0.5, "rewards/chosen": 0.18919944763183594, "rewards/margins": -0.27830982208251953, "rewards/rejected": 0.46750926971435547, "step": 1954 }, { "epoch": 0.3023390682389329, "grad_norm": 10.512470245361328, "learning_rate": 4.995703975254898e-06, "logits/chosen": 10.211843490600586, "logits/rejected": 9.98330020904541, "logps/chosen": -292.5981140136719, "logps/rejected": -256.3668212890625, "loss": 0.7808, "rewards/accuracies": 0.25, "rewards/chosen": 0.46049395203590393, "rewards/margins": -0.12379749119281769, "rewards/rejected": 0.5842914581298828, "step": 1955 }, { "epoch": 0.30249371737869707, "grad_norm": 7.871715068817139, "learning_rate": 4.995417573605224e-06, "logits/chosen": 10.148628234863281, "logits/rejected": 10.65623664855957, "logps/chosen": -237.61911010742188, "logps/rejected": -261.0722351074219, "loss": 0.8023, "rewards/accuracies": 0.625, "rewards/chosen": 0.018633782863616943, "rewards/margins": -0.12444295734167099, "rewards/rejected": 0.14307674765586853, "step": 1956 }, { "epoch": 0.3026483665184612, "grad_norm": 4.232603549957275, "learning_rate": 4.995131171955551e-06, "logits/chosen": 8.947652816772461, "logits/rejected": 7.929649829864502, "logps/chosen": -158.567138671875, "logps/rejected": -170.98976135253906, "loss": 0.5916, "rewards/accuracies": 0.75, "rewards/chosen": 0.2423844337463379, "rewards/margins": 0.2443198263645172, "rewards/rejected": -0.0019353926181793213, "step": 1957 }, { "epoch": 0.3028030156582254, "grad_norm": 6.2412614822387695, "learning_rate": 4.9948447703058776e-06, "logits/chosen": 9.776228904724121, "logits/rejected": 5.629969120025635, "logps/chosen": -281.0980529785156, "logps/rejected": -257.5005798339844, "loss": 0.5469, "rewards/accuracies": 0.75, "rewards/chosen": 0.4187763035297394, "rewards/margins": 0.3742130398750305, "rewards/rejected": 0.04456329345703125, "step": 1958 }, { "epoch": 0.3029576647979896, "grad_norm": 3.4976234436035156, "learning_rate": 4.994558368656204e-06, "logits/chosen": 8.489951133728027, "logits/rejected": 10.002816200256348, "logps/chosen": -223.14892578125, "logps/rejected": -228.04150390625, "loss": 0.5302, "rewards/accuracies": 0.875, "rewards/chosen": 0.6216352581977844, "rewards/margins": 0.41556477546691895, "rewards/rejected": 0.20607048273086548, "step": 1959 }, { "epoch": 0.30311231393775373, "grad_norm": 103.04702758789062, "learning_rate": 4.99427196700653e-06, "logits/chosen": 6.267934799194336, "logits/rejected": 10.490842819213867, "logps/chosen": -259.6253662109375, "logps/rejected": -306.8897399902344, "loss": 0.5228, "rewards/accuracies": 0.75, "rewards/chosen": 0.5385517477989197, "rewards/margins": 0.44119182229042053, "rewards/rejected": 0.09735993295907974, "step": 1960 }, { "epoch": 0.3032669630775179, "grad_norm": 6.962917804718018, "learning_rate": 4.993985565356857e-06, "logits/chosen": 10.575376510620117, "logits/rejected": 11.073034286499023, "logps/chosen": -247.47543334960938, "logps/rejected": -222.78102111816406, "loss": 0.8459, "rewards/accuracies": 0.375, "rewards/chosen": 0.11868830025196075, "rewards/margins": -0.2354481816291809, "rewards/rejected": 0.35413646697998047, "step": 1961 }, { "epoch": 0.30342161221728203, "grad_norm": 4.687536239624023, "learning_rate": 4.993699163707183e-06, "logits/chosen": 7.472748279571533, "logits/rejected": 5.770908355712891, "logps/chosen": -238.845947265625, "logps/rejected": -196.5159912109375, "loss": 0.6528, "rewards/accuracies": 0.625, "rewards/chosen": 0.4907844066619873, "rewards/margins": 0.1519974172115326, "rewards/rejected": 0.3387869596481323, "step": 1962 }, { "epoch": 0.3035762613570462, "grad_norm": 8.46551513671875, "learning_rate": 4.99341276205751e-06, "logits/chosen": 4.822601318359375, "logits/rejected": 3.2210443019866943, "logps/chosen": -292.7070007324219, "logps/rejected": -211.46051025390625, "loss": 0.8919, "rewards/accuracies": 0.25, "rewards/chosen": -0.25995302200317383, "rewards/margins": -0.2796034812927246, "rewards/rejected": 0.01965045928955078, "step": 1963 }, { "epoch": 0.30373091049681034, "grad_norm": 4.55012321472168, "learning_rate": 4.993126360407836e-06, "logits/chosen": 12.848557472229004, "logits/rejected": 2.4359664916992188, "logps/chosen": -373.224609375, "logps/rejected": -177.35702514648438, "loss": 0.4911, "rewards/accuracies": 0.75, "rewards/chosen": 0.6293210387229919, "rewards/margins": 0.5611113905906677, "rewards/rejected": 0.06820964813232422, "step": 1964 }, { "epoch": 0.30388555963657454, "grad_norm": 4.771052837371826, "learning_rate": 4.9928399587581624e-06, "logits/chosen": 7.136575222015381, "logits/rejected": 7.365444183349609, "logps/chosen": -253.6295928955078, "logps/rejected": -266.80560302734375, "loss": 0.617, "rewards/accuracies": 0.5, "rewards/chosen": 0.2940073013305664, "rewards/margins": 0.3670746386051178, "rewards/rejected": -0.07306733727455139, "step": 1965 }, { "epoch": 0.3040402087763387, "grad_norm": 5.675067901611328, "learning_rate": 4.992553557108489e-06, "logits/chosen": 10.713343620300293, "logits/rejected": 11.298481941223145, "logps/chosen": -244.64675903320312, "logps/rejected": -202.63841247558594, "loss": 0.8748, "rewards/accuracies": 0.375, "rewards/chosen": -0.009420007467269897, "rewards/margins": -0.24955837428569794, "rewards/rejected": 0.24013838171958923, "step": 1966 }, { "epoch": 0.30419485791610285, "grad_norm": 4.353074073791504, "learning_rate": 4.992267155458816e-06, "logits/chosen": 11.110458374023438, "logits/rejected": 7.546187400817871, "logps/chosen": -278.967041015625, "logps/rejected": -242.59310913085938, "loss": 0.5162, "rewards/accuracies": 0.875, "rewards/chosen": 0.456194669008255, "rewards/margins": 0.42083263397216797, "rewards/rejected": 0.03536204993724823, "step": 1967 }, { "epoch": 0.304349507055867, "grad_norm": 4.183549404144287, "learning_rate": 4.991980753809142e-06, "logits/chosen": 9.97763442993164, "logits/rejected": 8.955684661865234, "logps/chosen": -226.4222869873047, "logps/rejected": -195.60052490234375, "loss": 0.5923, "rewards/accuracies": 0.75, "rewards/chosen": 0.4704108238220215, "rewards/margins": 0.25863757729530334, "rewards/rejected": 0.21177320182323456, "step": 1968 }, { "epoch": 0.30450415619563115, "grad_norm": 5.6945977210998535, "learning_rate": 4.991694352159469e-06, "logits/chosen": 13.426898956298828, "logits/rejected": 14.656112670898438, "logps/chosen": -337.1523742675781, "logps/rejected": -321.0163879394531, "loss": 0.6999, "rewards/accuracies": 0.5, "rewards/chosen": 0.38470345735549927, "rewards/margins": 0.03220170736312866, "rewards/rejected": 0.3525017499923706, "step": 1969 }, { "epoch": 0.3046588053353953, "grad_norm": 7.493653297424316, "learning_rate": 4.991407950509795e-06, "logits/chosen": 11.03613567352295, "logits/rejected": 9.68317985534668, "logps/chosen": -349.5970458984375, "logps/rejected": -320.911865234375, "loss": 0.6386, "rewards/accuracies": 0.625, "rewards/chosen": 0.30209922790527344, "rewards/margins": 0.154245063662529, "rewards/rejected": 0.14785417914390564, "step": 1970 }, { "epoch": 0.3048134544751595, "grad_norm": 5.85058069229126, "learning_rate": 4.9911215488601215e-06, "logits/chosen": 11.295326232910156, "logits/rejected": 10.579447746276855, "logps/chosen": -189.58340454101562, "logps/rejected": -276.0993957519531, "loss": 0.6909, "rewards/accuracies": 0.5, "rewards/chosen": 0.18556396663188934, "rewards/margins": 0.020555786788463593, "rewards/rejected": 0.16500815749168396, "step": 1971 }, { "epoch": 0.30496810361492366, "grad_norm": 5.871170520782471, "learning_rate": 4.990835147210448e-06, "logits/chosen": 11.124595642089844, "logits/rejected": 5.645484447479248, "logps/chosen": -366.6029357910156, "logps/rejected": -198.0446014404297, "loss": 0.5875, "rewards/accuracies": 0.625, "rewards/chosen": 0.446881502866745, "rewards/margins": 0.35985517501831055, "rewards/rejected": 0.08702629804611206, "step": 1972 }, { "epoch": 0.3051227527546878, "grad_norm": 4.633442401885986, "learning_rate": 4.990548745560775e-06, "logits/chosen": 8.628091812133789, "logits/rejected": 3.560645580291748, "logps/chosen": -284.3542175292969, "logps/rejected": -215.97756958007812, "loss": 0.5286, "rewards/accuracies": 0.75, "rewards/chosen": 0.5714645385742188, "rewards/margins": 0.4734707474708557, "rewards/rejected": 0.09799380600452423, "step": 1973 }, { "epoch": 0.30527740189445196, "grad_norm": 3.6263229846954346, "learning_rate": 4.9902623439111014e-06, "logits/chosen": 2.573881149291992, "logits/rejected": 7.097978591918945, "logps/chosen": -172.2783660888672, "logps/rejected": -212.84161376953125, "loss": 0.5485, "rewards/accuracies": 0.625, "rewards/chosen": 0.3548279106616974, "rewards/margins": 0.3845118284225464, "rewards/rejected": -0.029683917760849, "step": 1974 }, { "epoch": 0.3054320510342161, "grad_norm": 3.9928176403045654, "learning_rate": 4.989975942261428e-06, "logits/chosen": 13.164188385009766, "logits/rejected": 10.088249206542969, "logps/chosen": -272.97381591796875, "logps/rejected": -258.22918701171875, "loss": 0.5234, "rewards/accuracies": 0.875, "rewards/chosen": 0.41110047698020935, "rewards/margins": 0.4962027370929718, "rewards/rejected": -0.08510227501392365, "step": 1975 }, { "epoch": 0.30558670017398026, "grad_norm": 8.181061744689941, "learning_rate": 4.989689540611755e-06, "logits/chosen": 8.641823768615723, "logits/rejected": 9.229660987854004, "logps/chosen": -395.608642578125, "logps/rejected": -354.9344787597656, "loss": 0.6836, "rewards/accuracies": 0.375, "rewards/chosen": 0.2017725110054016, "rewards/margins": 0.15754592418670654, "rewards/rejected": 0.04422654211521149, "step": 1976 }, { "epoch": 0.3057413493137444, "grad_norm": 6.364742279052734, "learning_rate": 4.9894031389620805e-06, "logits/chosen": 13.63949966430664, "logits/rejected": 2.9821300506591797, "logps/chosen": -414.233642578125, "logps/rejected": -171.18295288085938, "loss": 0.6809, "rewards/accuracies": 0.625, "rewards/chosen": 0.26278382539749146, "rewards/margins": 0.10288484394550323, "rewards/rejected": 0.15989898145198822, "step": 1977 }, { "epoch": 0.3058959984535086, "grad_norm": 5.2550578117370605, "learning_rate": 4.989116737312407e-06, "logits/chosen": 8.25478744506836, "logits/rejected": 8.853614807128906, "logps/chosen": -172.90084838867188, "logps/rejected": -190.1629638671875, "loss": 0.7209, "rewards/accuracies": 0.5, "rewards/chosen": -0.057640835642814636, "rewards/margins": -0.021668102592229843, "rewards/rejected": -0.03597274422645569, "step": 1978 }, { "epoch": 0.3060506475932728, "grad_norm": 5.316424369812012, "learning_rate": 4.988830335662734e-06, "logits/chosen": 9.975484848022461, "logits/rejected": 0.7701832056045532, "logps/chosen": -228.9788818359375, "logps/rejected": -135.40347290039062, "loss": 0.63, "rewards/accuracies": 0.5, "rewards/chosen": 0.1471642255783081, "rewards/margins": 0.2743827998638153, "rewards/rejected": -0.1272185742855072, "step": 1979 }, { "epoch": 0.3062052967330369, "grad_norm": 5.436166763305664, "learning_rate": 4.9885439340130605e-06, "logits/chosen": 12.710076332092285, "logits/rejected": 10.410833358764648, "logps/chosen": -184.4937286376953, "logps/rejected": -161.0513916015625, "loss": 0.6438, "rewards/accuracies": 0.375, "rewards/chosen": 0.1457688808441162, "rewards/margins": 0.2016940414905548, "rewards/rejected": -0.055925153195858, "step": 1980 }, { "epoch": 0.3063599458728011, "grad_norm": 4.560668468475342, "learning_rate": 4.988257532363387e-06, "logits/chosen": 8.936497688293457, "logits/rejected": 4.412057399749756, "logps/chosen": -239.14283752441406, "logps/rejected": -176.81141662597656, "loss": 0.5607, "rewards/accuracies": 0.875, "rewards/chosen": 0.18006639182567596, "rewards/margins": 0.3544565439224243, "rewards/rejected": -0.17439012229442596, "step": 1981 }, { "epoch": 0.3065145950125652, "grad_norm": 4.5908918380737305, "learning_rate": 4.987971130713714e-06, "logits/chosen": 10.937694549560547, "logits/rejected": 5.790031909942627, "logps/chosen": -342.538818359375, "logps/rejected": -272.4270324707031, "loss": 0.4938, "rewards/accuracies": 0.875, "rewards/chosen": 0.3979566693305969, "rewards/margins": 0.512667179107666, "rewards/rejected": -0.11471052467823029, "step": 1982 }, { "epoch": 0.3066692441523294, "grad_norm": 8.673511505126953, "learning_rate": 4.98768472906404e-06, "logits/chosen": 9.652889251708984, "logits/rejected": 6.896554470062256, "logps/chosen": -346.6098327636719, "logps/rejected": -264.585205078125, "loss": 0.893, "rewards/accuracies": 0.25, "rewards/chosen": 0.30448389053344727, "rewards/margins": -0.21031302213668823, "rewards/rejected": 0.5147969126701355, "step": 1983 }, { "epoch": 0.3068238932920936, "grad_norm": 4.490819454193115, "learning_rate": 4.987398327414366e-06, "logits/chosen": 8.901927947998047, "logits/rejected": 6.203165054321289, "logps/chosen": -181.65650939941406, "logps/rejected": -208.1868133544922, "loss": 0.5695, "rewards/accuracies": 0.75, "rewards/chosen": 0.1907789260149002, "rewards/margins": 0.34390294551849365, "rewards/rejected": -0.15312398970127106, "step": 1984 }, { "epoch": 0.30697854243185774, "grad_norm": 5.110992908477783, "learning_rate": 4.987111925764693e-06, "logits/chosen": 12.324292182922363, "logits/rejected": 7.515582084655762, "logps/chosen": -207.66912841796875, "logps/rejected": -211.3025360107422, "loss": 0.7212, "rewards/accuracies": 0.25, "rewards/chosen": 0.12857075035572052, "rewards/margins": 0.013825636357069016, "rewards/rejected": 0.1147451102733612, "step": 1985 }, { "epoch": 0.3071331915716219, "grad_norm": 4.301996231079102, "learning_rate": 4.9868255241150196e-06, "logits/chosen": 4.550024509429932, "logits/rejected": 7.1157121658325195, "logps/chosen": -144.9849090576172, "logps/rejected": -201.24571228027344, "loss": 0.565, "rewards/accuracies": 0.75, "rewards/chosen": 0.18839310109615326, "rewards/margins": 0.3174161911010742, "rewards/rejected": -0.12902307510375977, "step": 1986 }, { "epoch": 0.30728784071138604, "grad_norm": 9.139196395874023, "learning_rate": 4.986539122465346e-06, "logits/chosen": 10.21019458770752, "logits/rejected": 9.454524993896484, "logps/chosen": -217.67738342285156, "logps/rejected": -216.4231414794922, "loss": 0.5819, "rewards/accuracies": 0.625, "rewards/chosen": 0.34340447187423706, "rewards/margins": 0.2638482451438904, "rewards/rejected": 0.07955625653266907, "step": 1987 }, { "epoch": 0.3074424898511502, "grad_norm": 7.220220565795898, "learning_rate": 4.986252720815673e-06, "logits/chosen": 11.023681640625, "logits/rejected": 6.770074367523193, "logps/chosen": -287.69525146484375, "logps/rejected": -212.20657348632812, "loss": 0.7852, "rewards/accuracies": 0.25, "rewards/chosen": -0.09922752529382706, "rewards/margins": -0.13665685057640076, "rewards/rejected": 0.0374293252825737, "step": 1988 }, { "epoch": 0.30759713899091434, "grad_norm": 5.522611141204834, "learning_rate": 4.985966319165999e-06, "logits/chosen": 7.377992630004883, "logits/rejected": 8.722118377685547, "logps/chosen": -282.89727783203125, "logps/rejected": -243.05999755859375, "loss": 0.7548, "rewards/accuracies": 0.5, "rewards/chosen": 0.10561446845531464, "rewards/margins": -0.08198529481887817, "rewards/rejected": 0.187599778175354, "step": 1989 }, { "epoch": 0.30775178813067855, "grad_norm": 4.899139404296875, "learning_rate": 4.985679917516325e-06, "logits/chosen": 8.418660163879395, "logits/rejected": 2.672091007232666, "logps/chosen": -389.9258117675781, "logps/rejected": -245.32838439941406, "loss": 0.5632, "rewards/accuracies": 0.75, "rewards/chosen": 0.495248019695282, "rewards/margins": 0.34115707874298096, "rewards/rejected": 0.15409094095230103, "step": 1990 }, { "epoch": 0.3079064372704427, "grad_norm": 7.869766712188721, "learning_rate": 4.985393515866652e-06, "logits/chosen": 11.306015014648438, "logits/rejected": 10.009170532226562, "logps/chosen": -280.33416748046875, "logps/rejected": -291.0931091308594, "loss": 0.9103, "rewards/accuracies": 0.375, "rewards/chosen": 0.2642689049243927, "rewards/margins": -0.325820654630661, "rewards/rejected": 0.5900895595550537, "step": 1991 }, { "epoch": 0.30806108641020685, "grad_norm": 5.085631370544434, "learning_rate": 4.985107114216979e-06, "logits/chosen": 6.397157669067383, "logits/rejected": 5.009548187255859, "logps/chosen": -215.83450317382812, "logps/rejected": -214.5556640625, "loss": 0.7128, "rewards/accuracies": 0.75, "rewards/chosen": 0.13566569983959198, "rewards/margins": -0.004653111100196838, "rewards/rejected": 0.14031882584095, "step": 1992 }, { "epoch": 0.308215735549971, "grad_norm": 4.604753017425537, "learning_rate": 4.984820712567304e-06, "logits/chosen": 11.945621490478516, "logits/rejected": 6.725895881652832, "logps/chosen": -266.83837890625, "logps/rejected": -229.85540771484375, "loss": 0.548, "rewards/accuracies": 0.875, "rewards/chosen": 0.4130420684814453, "rewards/margins": 0.3493255376815796, "rewards/rejected": 0.06371650099754333, "step": 1993 }, { "epoch": 0.30837038468973516, "grad_norm": 5.085018157958984, "learning_rate": 4.984534310917631e-06, "logits/chosen": 12.733266830444336, "logits/rejected": 6.141596794128418, "logps/chosen": -377.250244140625, "logps/rejected": -258.7804260253906, "loss": 0.5826, "rewards/accuracies": 0.625, "rewards/chosen": 0.48643437027931213, "rewards/margins": 0.3208006024360657, "rewards/rejected": 0.16563378274440765, "step": 1994 }, { "epoch": 0.3085250338294993, "grad_norm": 7.331588268280029, "learning_rate": 4.984247909267958e-06, "logits/chosen": 8.357823371887207, "logits/rejected": 6.840785026550293, "logps/chosen": -364.28765869140625, "logps/rejected": -366.29766845703125, "loss": 0.622, "rewards/accuracies": 0.75, "rewards/chosen": 0.099069744348526, "rewards/margins": 0.1920163631439209, "rewards/rejected": -0.0929466187953949, "step": 1995 }, { "epoch": 0.30867968296926346, "grad_norm": 5.524991035461426, "learning_rate": 4.983961507618284e-06, "logits/chosen": 10.343605995178223, "logits/rejected": 4.031404495239258, "logps/chosen": -281.15313720703125, "logps/rejected": -186.6189727783203, "loss": 0.708, "rewards/accuracies": 0.5, "rewards/chosen": 0.0052339062094688416, "rewards/margins": 0.013379998505115509, "rewards/rejected": -0.008146099746227264, "step": 1996 }, { "epoch": 0.30883433210902766, "grad_norm": 13.873757362365723, "learning_rate": 4.983675105968611e-06, "logits/chosen": 4.512668609619141, "logits/rejected": 6.311601638793945, "logps/chosen": -237.05955505371094, "logps/rejected": -390.25628662109375, "loss": 0.5836, "rewards/accuracies": 0.75, "rewards/chosen": 0.26084285974502563, "rewards/margins": 0.4081624746322632, "rewards/rejected": -0.14731964468955994, "step": 1997 }, { "epoch": 0.3089889812487918, "grad_norm": 7.652311325073242, "learning_rate": 4.983388704318937e-06, "logits/chosen": 9.728889465332031, "logits/rejected": 6.985532760620117, "logps/chosen": -296.9800109863281, "logps/rejected": -201.91685485839844, "loss": 0.7697, "rewards/accuracies": 0.375, "rewards/chosen": 0.09284285455942154, "rewards/margins": 0.024631395936012268, "rewards/rejected": 0.06821145862340927, "step": 1998 }, { "epoch": 0.30914363038855597, "grad_norm": 4.929397106170654, "learning_rate": 4.9831023026692635e-06, "logits/chosen": 7.380917549133301, "logits/rejected": 5.23086404800415, "logps/chosen": -254.52708435058594, "logps/rejected": -243.0704345703125, "loss": 0.5669, "rewards/accuracies": 0.875, "rewards/chosen": 0.31868162751197815, "rewards/margins": 0.33252009749412537, "rewards/rejected": -0.013838473707437515, "step": 1999 }, { "epoch": 0.3092982795283201, "grad_norm": 5.268299579620361, "learning_rate": 4.98281590101959e-06, "logits/chosen": 11.959253311157227, "logits/rejected": 8.007569313049316, "logps/chosen": -248.97425842285156, "logps/rejected": -166.68801879882812, "loss": 0.6806, "rewards/accuracies": 0.5, "rewards/chosen": 0.1765538901090622, "rewards/margins": 0.061085328459739685, "rewards/rejected": 0.11546854674816132, "step": 2000 }, { "epoch": 0.30945292866808427, "grad_norm": 6.657409191131592, "learning_rate": 4.982529499369917e-06, "logits/chosen": 6.553297996520996, "logits/rejected": 7.741517543792725, "logps/chosen": -261.2109069824219, "logps/rejected": -257.0432434082031, "loss": 0.7359, "rewards/accuracies": 0.625, "rewards/chosen": 0.39706581830978394, "rewards/margins": -0.0038317739963531494, "rewards/rejected": 0.4008976221084595, "step": 2001 }, { "epoch": 0.3096075778078484, "grad_norm": 7.364019870758057, "learning_rate": 4.982243097720243e-06, "logits/chosen": 3.949641227722168, "logits/rejected": 3.9044790267944336, "logps/chosen": -209.78517150878906, "logps/rejected": -234.69386291503906, "loss": 0.5677, "rewards/accuracies": 0.625, "rewards/chosen": 0.19461870193481445, "rewards/margins": 0.3288024067878723, "rewards/rejected": -0.13418368995189667, "step": 2002 }, { "epoch": 0.30976222694761263, "grad_norm": 6.282543659210205, "learning_rate": 4.981956696070569e-06, "logits/chosen": 7.29435920715332, "logits/rejected": 6.744174957275391, "logps/chosen": -283.6720886230469, "logps/rejected": -359.3691711425781, "loss": 0.7836, "rewards/accuracies": 0.5, "rewards/chosen": 0.2742149233818054, "rewards/margins": -0.022047381848096848, "rewards/rejected": 0.29626235365867615, "step": 2003 }, { "epoch": 0.3099168760873768, "grad_norm": 6.137124538421631, "learning_rate": 4.981670294420896e-06, "logits/chosen": 5.864310264587402, "logits/rejected": 8.316765785217285, "logps/chosen": -232.30392456054688, "logps/rejected": -281.315673828125, "loss": 0.7373, "rewards/accuracies": 0.375, "rewards/chosen": 0.3055242896080017, "rewards/margins": 0.02452573925256729, "rewards/rejected": 0.2809985280036926, "step": 2004 }, { "epoch": 0.31007152522714093, "grad_norm": 3.532426118850708, "learning_rate": 4.9813838927712225e-06, "logits/chosen": 7.614628791809082, "logits/rejected": 3.5947980880737305, "logps/chosen": -214.77496337890625, "logps/rejected": -170.39398193359375, "loss": 0.5666, "rewards/accuracies": 0.75, "rewards/chosen": 0.402915358543396, "rewards/margins": 0.3127111792564392, "rewards/rejected": 0.0902041494846344, "step": 2005 }, { "epoch": 0.3102261743669051, "grad_norm": 6.598997592926025, "learning_rate": 4.981097491121549e-06, "logits/chosen": 10.780252456665039, "logits/rejected": 6.045149803161621, "logps/chosen": -336.3409729003906, "logps/rejected": -240.98468017578125, "loss": 0.6383, "rewards/accuracies": 0.75, "rewards/chosen": 0.18612727522850037, "rewards/margins": 0.15972900390625, "rewards/rejected": 0.026398273184895515, "step": 2006 }, { "epoch": 0.31038082350666923, "grad_norm": 8.307952880859375, "learning_rate": 4.980811089471876e-06, "logits/chosen": 9.119329452514648, "logits/rejected": 8.917032241821289, "logps/chosen": -273.92578125, "logps/rejected": -295.33203125, "loss": 0.7864, "rewards/accuracies": 0.5, "rewards/chosen": 0.23545576632022858, "rewards/margins": 0.06440198421478271, "rewards/rejected": 0.17105376720428467, "step": 2007 }, { "epoch": 0.3105354726464334, "grad_norm": 3.9343833923339844, "learning_rate": 4.9805246878222025e-06, "logits/chosen": 11.872191429138184, "logits/rejected": 3.4664392471313477, "logps/chosen": -250.74996948242188, "logps/rejected": -228.10499572753906, "loss": 0.475, "rewards/accuracies": 0.875, "rewards/chosen": 0.3100832998752594, "rewards/margins": 0.6007246971130371, "rewards/rejected": -0.2906413972377777, "step": 2008 }, { "epoch": 0.31069012178619754, "grad_norm": 8.843900680541992, "learning_rate": 4.980238286172529e-06, "logits/chosen": 7.892078399658203, "logits/rejected": 12.621454238891602, "logps/chosen": -366.3011169433594, "logps/rejected": -580.13427734375, "loss": 0.8792, "rewards/accuracies": 0.375, "rewards/chosen": 0.20299360156059265, "rewards/margins": -0.2374470829963684, "rewards/rejected": 0.44044065475463867, "step": 2009 }, { "epoch": 0.31084477092596174, "grad_norm": 5.974053382873535, "learning_rate": 4.979951884522855e-06, "logits/chosen": 11.918599128723145, "logits/rejected": 5.2390217781066895, "logps/chosen": -217.66104125976562, "logps/rejected": -163.15391540527344, "loss": 0.8386, "rewards/accuracies": 0.25, "rewards/chosen": 0.13871696591377258, "rewards/margins": -0.2529628276824951, "rewards/rejected": 0.3916797637939453, "step": 2010 }, { "epoch": 0.3109994200657259, "grad_norm": 4.927247047424316, "learning_rate": 4.979665482873182e-06, "logits/chosen": 14.197017669677734, "logits/rejected": 11.9215726852417, "logps/chosen": -214.18751525878906, "logps/rejected": -193.50003051757812, "loss": 0.5485, "rewards/accuracies": 0.875, "rewards/chosen": 0.3066695034503937, "rewards/margins": 0.3455147445201874, "rewards/rejected": -0.03884520381689072, "step": 2011 }, { "epoch": 0.31115406920549005, "grad_norm": 11.098302841186523, "learning_rate": 4.979379081223508e-06, "logits/chosen": 7.39481782913208, "logits/rejected": 5.808822154998779, "logps/chosen": -275.3327331542969, "logps/rejected": -235.2631378173828, "loss": 0.7748, "rewards/accuracies": 0.5, "rewards/chosen": 0.6115728616714478, "rewards/margins": -0.04276243597269058, "rewards/rejected": 0.6543353199958801, "step": 2012 }, { "epoch": 0.3113087183452542, "grad_norm": 4.694543838500977, "learning_rate": 4.979092679573835e-06, "logits/chosen": 10.235353469848633, "logits/rejected": 4.523075103759766, "logps/chosen": -281.56561279296875, "logps/rejected": -220.22119140625, "loss": 0.4915, "rewards/accuracies": 1.0, "rewards/chosen": 0.6348758935928345, "rewards/margins": 0.4904972314834595, "rewards/rejected": 0.14437872171401978, "step": 2013 }, { "epoch": 0.31146336748501835, "grad_norm": 5.491322040557861, "learning_rate": 4.9788062779241615e-06, "logits/chosen": 10.438916206359863, "logits/rejected": 12.557527542114258, "logps/chosen": -183.73220825195312, "logps/rejected": -216.85330200195312, "loss": 0.7605, "rewards/accuracies": 0.5, "rewards/chosen": 0.18518799543380737, "rewards/margins": -0.10930729657411575, "rewards/rejected": 0.29449528455734253, "step": 2014 }, { "epoch": 0.3116180166247825, "grad_norm": 3.6118359565734863, "learning_rate": 4.978519876274488e-06, "logits/chosen": 10.015729904174805, "logits/rejected": 7.5527215003967285, "logps/chosen": -132.3544464111328, "logps/rejected": -97.63077545166016, "loss": 0.6793, "rewards/accuracies": 0.375, "rewards/chosen": -0.030257228761911392, "rewards/margins": 0.08355279266834259, "rewards/rejected": -0.11381002515554428, "step": 2015 }, { "epoch": 0.3117726657645467, "grad_norm": 5.444869518280029, "learning_rate": 4.978233474624814e-06, "logits/chosen": 12.174051284790039, "logits/rejected": 3.7872884273529053, "logps/chosen": -430.60711669921875, "logps/rejected": -239.57635498046875, "loss": 0.4853, "rewards/accuracies": 0.875, "rewards/chosen": 0.5860814452171326, "rewards/margins": 0.5089577436447144, "rewards/rejected": 0.07712364941835403, "step": 2016 }, { "epoch": 0.31192731490431086, "grad_norm": 7.583166122436523, "learning_rate": 4.977947072975141e-06, "logits/chosen": 10.321952819824219, "logits/rejected": 7.761547565460205, "logps/chosen": -301.8120422363281, "logps/rejected": -195.99807739257812, "loss": 0.6006, "rewards/accuracies": 0.625, "rewards/chosen": 0.23609209060668945, "rewards/margins": 0.32064980268478394, "rewards/rejected": -0.08455768972635269, "step": 2017 }, { "epoch": 0.312081964044075, "grad_norm": 4.882416725158691, "learning_rate": 4.977660671325467e-06, "logits/chosen": 7.85862398147583, "logits/rejected": 6.824814796447754, "logps/chosen": -201.79331970214844, "logps/rejected": -167.5405731201172, "loss": 0.6951, "rewards/accuracies": 0.5, "rewards/chosen": -0.020072750747203827, "rewards/margins": 0.015002727508544922, "rewards/rejected": -0.03507547825574875, "step": 2018 }, { "epoch": 0.31223661318383916, "grad_norm": 5.949890613555908, "learning_rate": 4.977374269675794e-06, "logits/chosen": 10.985013008117676, "logits/rejected": 8.955205917358398, "logps/chosen": -351.3401794433594, "logps/rejected": -263.6220703125, "loss": 0.546, "rewards/accuracies": 1.0, "rewards/chosen": 0.44436824321746826, "rewards/margins": 0.33157527446746826, "rewards/rejected": 0.11279293894767761, "step": 2019 }, { "epoch": 0.3123912623236033, "grad_norm": 7.030670166015625, "learning_rate": 4.977087868026121e-06, "logits/chosen": 11.158683776855469, "logits/rejected": 4.009839057922363, "logps/chosen": -267.39544677734375, "logps/rejected": -220.5675048828125, "loss": 0.7079, "rewards/accuracies": 0.5, "rewards/chosen": 0.14739905297756195, "rewards/margins": 0.21238264441490173, "rewards/rejected": -0.06498361378908157, "step": 2020 }, { "epoch": 0.31254591146336747, "grad_norm": 111.6714859008789, "learning_rate": 4.976801466376447e-06, "logits/chosen": 9.698179244995117, "logits/rejected": 14.525428771972656, "logps/chosen": -230.2836151123047, "logps/rejected": -270.64727783203125, "loss": 0.6364, "rewards/accuracies": 0.625, "rewards/chosen": 0.09449329972267151, "rewards/margins": 0.21789181232452393, "rewards/rejected": -0.12339849770069122, "step": 2021 }, { "epoch": 0.31270056060313167, "grad_norm": 7.74885368347168, "learning_rate": 4.976515064726774e-06, "logits/chosen": 9.741405487060547, "logits/rejected": 11.121164321899414, "logps/chosen": -280.96734619140625, "logps/rejected": -271.37615966796875, "loss": 0.8575, "rewards/accuracies": 0.5, "rewards/chosen": 0.183790922164917, "rewards/margins": -0.17563287913799286, "rewards/rejected": 0.35942378640174866, "step": 2022 }, { "epoch": 0.3128552097428958, "grad_norm": 4.498934268951416, "learning_rate": 4.9762286630771e-06, "logits/chosen": 7.058158874511719, "logits/rejected": 5.465290069580078, "logps/chosen": -163.27890014648438, "logps/rejected": -159.01113891601562, "loss": 0.6766, "rewards/accuracies": 0.375, "rewards/chosen": 0.01532740518450737, "rewards/margins": 0.14587508141994476, "rewards/rejected": -0.1305476576089859, "step": 2023 }, { "epoch": 0.31300985888266, "grad_norm": 4.784653663635254, "learning_rate": 4.975942261427426e-06, "logits/chosen": 10.165705680847168, "logits/rejected": 7.393817901611328, "logps/chosen": -228.40866088867188, "logps/rejected": -236.97006225585938, "loss": 0.6559, "rewards/accuracies": 0.5, "rewards/chosen": 0.3274020254611969, "rewards/margins": 0.11454229801893234, "rewards/rejected": 0.21285971999168396, "step": 2024 }, { "epoch": 0.3131645080224241, "grad_norm": 6.6084184646606445, "learning_rate": 4.975655859777753e-06, "logits/chosen": 10.696527481079102, "logits/rejected": 10.428831100463867, "logps/chosen": -250.44305419921875, "logps/rejected": -298.8981018066406, "loss": 0.6702, "rewards/accuracies": 0.75, "rewards/chosen": 0.22198249399662018, "rewards/margins": 0.17556850612163544, "rewards/rejected": 0.046414002776145935, "step": 2025 }, { "epoch": 0.3133191571621883, "grad_norm": 3.688711166381836, "learning_rate": 4.97536945812808e-06, "logits/chosen": 7.6138458251953125, "logits/rejected": -0.48515424132347107, "logps/chosen": -278.2098693847656, "logps/rejected": -166.37527465820312, "loss": 0.5956, "rewards/accuracies": 0.625, "rewards/chosen": 0.05551151931285858, "rewards/margins": 0.31901147961616516, "rewards/rejected": -0.2634999752044678, "step": 2026 }, { "epoch": 0.31347380630195243, "grad_norm": 9.181396484375, "learning_rate": 4.9750830564784054e-06, "logits/chosen": 9.64320182800293, "logits/rejected": 7.418163299560547, "logps/chosen": -329.3189697265625, "logps/rejected": -323.0888671875, "loss": 0.657, "rewards/accuracies": 0.5, "rewards/chosen": 0.395430326461792, "rewards/margins": 0.10457059741020203, "rewards/rejected": 0.2908596992492676, "step": 2027 }, { "epoch": 0.3136284554417166, "grad_norm": 8.308619499206543, "learning_rate": 4.974796654828732e-06, "logits/chosen": 8.056121826171875, "logits/rejected": 8.806220054626465, "logps/chosen": -351.40740966796875, "logps/rejected": -317.88018798828125, "loss": 0.7981, "rewards/accuracies": 0.625, "rewards/chosen": 0.19664278626441956, "rewards/margins": -0.08082132041454315, "rewards/rejected": 0.2774640917778015, "step": 2028 }, { "epoch": 0.3137831045814808, "grad_norm": 5.15437126159668, "learning_rate": 4.974510253179059e-06, "logits/chosen": 8.79755687713623, "logits/rejected": 2.5902252197265625, "logps/chosen": -226.75881958007812, "logps/rejected": -212.37757873535156, "loss": 0.5451, "rewards/accuracies": 0.75, "rewards/chosen": 0.10303307324647903, "rewards/margins": 0.3763677477836609, "rewards/rejected": -0.27333468198776245, "step": 2029 }, { "epoch": 0.31393775372124494, "grad_norm": 7.080493927001953, "learning_rate": 4.974223851529385e-06, "logits/chosen": 10.762468338012695, "logits/rejected": 8.365921974182129, "logps/chosen": -407.4847106933594, "logps/rejected": -316.06561279296875, "loss": 0.8173, "rewards/accuracies": 0.375, "rewards/chosen": 0.12943677604198456, "rewards/margins": -0.1618298590183258, "rewards/rejected": 0.29126664996147156, "step": 2030 }, { "epoch": 0.3140924028610091, "grad_norm": 6.306060314178467, "learning_rate": 4.973937449879711e-06, "logits/chosen": 4.155032157897949, "logits/rejected": 1.6732174158096313, "logps/chosen": -344.86700439453125, "logps/rejected": -301.115234375, "loss": 0.6702, "rewards/accuracies": 0.625, "rewards/chosen": 0.3862399458885193, "rewards/margins": 0.11926203221082687, "rewards/rejected": 0.26697787642478943, "step": 2031 }, { "epoch": 0.31424705200077324, "grad_norm": 5.579253673553467, "learning_rate": 4.973651048230038e-06, "logits/chosen": 3.562178373336792, "logits/rejected": 10.578177452087402, "logps/chosen": -175.40646362304688, "logps/rejected": -236.90765380859375, "loss": 0.7272, "rewards/accuracies": 0.75, "rewards/chosen": 0.08180347084999084, "rewards/margins": -0.008615009486675262, "rewards/rejected": 0.09041848033666611, "step": 2032 }, { "epoch": 0.3144017011405374, "grad_norm": 5.391582489013672, "learning_rate": 4.9733646465803645e-06, "logits/chosen": 14.033145904541016, "logits/rejected": 6.150842666625977, "logps/chosen": -436.641357421875, "logps/rejected": -314.458984375, "loss": 0.5474, "rewards/accuracies": 0.75, "rewards/chosen": 0.36052456498146057, "rewards/margins": 0.4293084144592285, "rewards/rejected": -0.06878386437892914, "step": 2033 }, { "epoch": 0.31455635028030154, "grad_norm": 6.181769371032715, "learning_rate": 4.973078244930691e-06, "logits/chosen": 8.313186645507812, "logits/rejected": 15.240447998046875, "logps/chosen": -142.17416381835938, "logps/rejected": -193.90313720703125, "loss": 0.7031, "rewards/accuracies": 0.75, "rewards/chosen": 0.09663400053977966, "rewards/margins": 0.08778846263885498, "rewards/rejected": 0.008845508098602295, "step": 2034 }, { "epoch": 0.31471099942006575, "grad_norm": 6.115557670593262, "learning_rate": 4.972791843281018e-06, "logits/chosen": 6.296451568603516, "logits/rejected": 12.146219253540039, "logps/chosen": -215.63710021972656, "logps/rejected": -330.9129638671875, "loss": 0.5741, "rewards/accuracies": 0.875, "rewards/chosen": 0.3868020176887512, "rewards/margins": 0.3071805238723755, "rewards/rejected": 0.07962150871753693, "step": 2035 }, { "epoch": 0.3148656485598299, "grad_norm": 8.364924430847168, "learning_rate": 4.972505441631344e-06, "logits/chosen": 1.0487046241760254, "logits/rejected": 2.093881845474243, "logps/chosen": -232.45321655273438, "logps/rejected": -247.24136352539062, "loss": 0.7716, "rewards/accuracies": 0.5, "rewards/chosen": 0.3331920802593231, "rewards/margins": -0.089888796210289, "rewards/rejected": 0.4230808615684509, "step": 2036 }, { "epoch": 0.31502029769959405, "grad_norm": 4.711363315582275, "learning_rate": 4.97221903998167e-06, "logits/chosen": 9.087247848510742, "logits/rejected": 4.578160762786865, "logps/chosen": -352.3943786621094, "logps/rejected": -262.8236083984375, "loss": 0.5877, "rewards/accuracies": 0.875, "rewards/chosen": 0.3747193217277527, "rewards/margins": 0.28360098600387573, "rewards/rejected": 0.09111829102039337, "step": 2037 }, { "epoch": 0.3151749468393582, "grad_norm": 4.907288551330566, "learning_rate": 4.971932638331997e-06, "logits/chosen": 15.42646598815918, "logits/rejected": 13.73568344116211, "logps/chosen": -289.27386474609375, "logps/rejected": -324.1131591796875, "loss": 0.5009, "rewards/accuracies": 0.75, "rewards/chosen": 0.18994426727294922, "rewards/margins": 0.5177173614501953, "rewards/rejected": -0.3277730941772461, "step": 2038 }, { "epoch": 0.31532959597912236, "grad_norm": 6.172505855560303, "learning_rate": 4.9716462366823236e-06, "logits/chosen": 11.385139465332031, "logits/rejected": 7.40675687789917, "logps/chosen": -332.9402160644531, "logps/rejected": -225.0248260498047, "loss": 0.652, "rewards/accuracies": 0.5, "rewards/chosen": 0.3862195909023285, "rewards/margins": 0.15352395176887512, "rewards/rejected": 0.23269563913345337, "step": 2039 }, { "epoch": 0.3154842451188865, "grad_norm": 6.155210971832275, "learning_rate": 4.97135983503265e-06, "logits/chosen": 13.192663192749023, "logits/rejected": 14.762581825256348, "logps/chosen": -284.51519775390625, "logps/rejected": -294.5789794921875, "loss": 0.732, "rewards/accuracies": 0.5, "rewards/chosen": -0.006152346730232239, "rewards/margins": -0.018543139100074768, "rewards/rejected": 0.012390803545713425, "step": 2040 }, { "epoch": 0.31563889425865066, "grad_norm": 4.210750579833984, "learning_rate": 4.971073433382977e-06, "logits/chosen": 7.417428016662598, "logits/rejected": 4.093751907348633, "logps/chosen": -226.21636962890625, "logps/rejected": -159.87815856933594, "loss": 0.7023, "rewards/accuracies": 0.375, "rewards/chosen": -0.03874000534415245, "rewards/margins": 0.03634684532880783, "rewards/rejected": -0.07508685439825058, "step": 2041 }, { "epoch": 0.31579354339841487, "grad_norm": 6.30993127822876, "learning_rate": 4.9707870317333035e-06, "logits/chosen": 11.614374160766602, "logits/rejected": 6.277976036071777, "logps/chosen": -370.63507080078125, "logps/rejected": -303.1798095703125, "loss": 0.6582, "rewards/accuracies": 0.375, "rewards/chosen": 0.3636024594306946, "rewards/margins": 0.3033856153488159, "rewards/rejected": 0.060216791927814484, "step": 2042 }, { "epoch": 0.315948192538179, "grad_norm": 4.3987717628479, "learning_rate": 4.970500630083629e-06, "logits/chosen": 6.204204082489014, "logits/rejected": 3.2409186363220215, "logps/chosen": -231.05934143066406, "logps/rejected": -186.857666015625, "loss": 0.5877, "rewards/accuracies": 0.75, "rewards/chosen": 0.1683204621076584, "rewards/margins": 0.318449467420578, "rewards/rejected": -0.15012900531291962, "step": 2043 }, { "epoch": 0.31610284167794317, "grad_norm": 8.114767074584961, "learning_rate": 4.970214228433956e-06, "logits/chosen": 10.443578720092773, "logits/rejected": 8.904316902160645, "logps/chosen": -264.67864990234375, "logps/rejected": -261.36260986328125, "loss": 0.834, "rewards/accuracies": 0.375, "rewards/chosen": 0.031170370057225227, "rewards/margins": -0.15537044405937195, "rewards/rejected": 0.18654079735279083, "step": 2044 }, { "epoch": 0.3162574908177073, "grad_norm": 6.54292106628418, "learning_rate": 4.969927826784283e-06, "logits/chosen": 6.789311408996582, "logits/rejected": 10.426478385925293, "logps/chosen": -267.3919372558594, "logps/rejected": -362.5985107421875, "loss": 0.6467, "rewards/accuracies": 0.625, "rewards/chosen": 0.17277130484580994, "rewards/margins": 0.3037240505218506, "rewards/rejected": -0.13095274567604065, "step": 2045 }, { "epoch": 0.31641213995747147, "grad_norm": 5.540056228637695, "learning_rate": 4.969641425134609e-06, "logits/chosen": 4.076648712158203, "logits/rejected": 7.316946983337402, "logps/chosen": -192.0807647705078, "logps/rejected": -181.27587890625, "loss": 0.7152, "rewards/accuracies": 0.375, "rewards/chosen": -0.08584483712911606, "rewards/margins": 0.024266544729471207, "rewards/rejected": -0.11011138558387756, "step": 2046 }, { "epoch": 0.3165667890972356, "grad_norm": 4.6728739738464355, "learning_rate": 4.969355023484936e-06, "logits/chosen": 7.171144485473633, "logits/rejected": 4.804661750793457, "logps/chosen": -156.41595458984375, "logps/rejected": -121.79785919189453, "loss": 0.6667, "rewards/accuracies": 0.5, "rewards/chosen": 0.02266831323504448, "rewards/margins": 0.1850113570690155, "rewards/rejected": -0.16234304010868073, "step": 2047 }, { "epoch": 0.31672143823699983, "grad_norm": 4.945480823516846, "learning_rate": 4.9690686218352626e-06, "logits/chosen": 13.925334930419922, "logits/rejected": 11.48580265045166, "logps/chosen": -336.47216796875, "logps/rejected": -339.148193359375, "loss": 0.5536, "rewards/accuracies": 0.625, "rewards/chosen": 0.35403376817703247, "rewards/margins": 0.39997369050979614, "rewards/rejected": -0.04593987762928009, "step": 2048 }, { "epoch": 0.316876087376764, "grad_norm": 7.455511569976807, "learning_rate": 4.968782220185588e-06, "logits/chosen": 14.020986557006836, "logits/rejected": 6.704688549041748, "logps/chosen": -441.7218933105469, "logps/rejected": -265.552734375, "loss": 0.6791, "rewards/accuracies": 0.5, "rewards/chosen": 0.02594432234764099, "rewards/margins": 0.11928005516529083, "rewards/rejected": -0.09333573281764984, "step": 2049 }, { "epoch": 0.31703073651652813, "grad_norm": 7.04271125793457, "learning_rate": 4.968495818535915e-06, "logits/chosen": 4.9354448318481445, "logits/rejected": 4.99284553527832, "logps/chosen": -371.6170654296875, "logps/rejected": -294.0312194824219, "loss": 0.6185, "rewards/accuracies": 0.625, "rewards/chosen": 0.4960813522338867, "rewards/margins": 0.2775948643684387, "rewards/rejected": 0.21848651766777039, "step": 2050 }, { "epoch": 0.3171853856562923, "grad_norm": 5.3045854568481445, "learning_rate": 4.968209416886242e-06, "logits/chosen": 10.03846263885498, "logits/rejected": 8.394601821899414, "logps/chosen": -254.42642211914062, "logps/rejected": -289.1803894042969, "loss": 0.4709, "rewards/accuracies": 0.875, "rewards/chosen": 0.330724835395813, "rewards/margins": 0.6609184741973877, "rewards/rejected": -0.3301936388015747, "step": 2051 }, { "epoch": 0.31734003479605644, "grad_norm": 7.794218063354492, "learning_rate": 4.967923015236568e-06, "logits/chosen": 6.5661940574646, "logits/rejected": 4.515810489654541, "logps/chosen": -221.69247436523438, "logps/rejected": -225.12625122070312, "loss": 0.9342, "rewards/accuracies": 0.375, "rewards/chosen": -0.217045396566391, "rewards/margins": -0.34060293436050415, "rewards/rejected": 0.12355758249759674, "step": 2052 }, { "epoch": 0.3174946839358206, "grad_norm": 6.268409729003906, "learning_rate": 4.967636613586895e-06, "logits/chosen": 9.71400260925293, "logits/rejected": 4.3478102684021, "logps/chosen": -243.1073760986328, "logps/rejected": -232.1676483154297, "loss": 0.6565, "rewards/accuracies": 0.625, "rewards/chosen": 0.15780669450759888, "rewards/margins": 0.15356865525245667, "rewards/rejected": 0.004238061606884003, "step": 2053 }, { "epoch": 0.3176493330755848, "grad_norm": 4.044487953186035, "learning_rate": 4.967350211937222e-06, "logits/chosen": 7.557414531707764, "logits/rejected": 12.595425605773926, "logps/chosen": -142.3138427734375, "logps/rejected": -183.90841674804688, "loss": 0.5899, "rewards/accuracies": 0.625, "rewards/chosen": -0.06036997586488724, "rewards/margins": 0.31504297256469727, "rewards/rejected": -0.3754129409790039, "step": 2054 }, { "epoch": 0.31780398221534895, "grad_norm": 5.1918721199035645, "learning_rate": 4.967063810287548e-06, "logits/chosen": 7.354862213134766, "logits/rejected": 10.796515464782715, "logps/chosen": -263.85601806640625, "logps/rejected": -288.7096862792969, "loss": 0.6433, "rewards/accuracies": 0.75, "rewards/chosen": 0.17918884754180908, "rewards/margins": 0.18749159574508667, "rewards/rejected": -0.008302763104438782, "step": 2055 }, { "epoch": 0.3179586313551131, "grad_norm": 7.184632301330566, "learning_rate": 4.966777408637874e-06, "logits/chosen": 7.668334007263184, "logits/rejected": 7.771810054779053, "logps/chosen": -276.070556640625, "logps/rejected": -262.524658203125, "loss": 0.7717, "rewards/accuracies": 0.625, "rewards/chosen": -0.099912628531456, "rewards/margins": -0.04781359061598778, "rewards/rejected": -0.05209903419017792, "step": 2056 }, { "epoch": 0.31811328049487725, "grad_norm": 5.591556072235107, "learning_rate": 4.966491006988201e-06, "logits/chosen": 6.896025657653809, "logits/rejected": 4.624721050262451, "logps/chosen": -220.68096923828125, "logps/rejected": -269.56719970703125, "loss": 0.6795, "rewards/accuracies": 0.625, "rewards/chosen": -0.028016850352287292, "rewards/margins": 0.1233828067779541, "rewards/rejected": -0.1513996571302414, "step": 2057 }, { "epoch": 0.3182679296346414, "grad_norm": 9.149316787719727, "learning_rate": 4.966204605338527e-06, "logits/chosen": 13.482200622558594, "logits/rejected": 8.05477237701416, "logps/chosen": -411.701416015625, "logps/rejected": -340.6821594238281, "loss": 0.9006, "rewards/accuracies": 0.25, "rewards/chosen": 0.006598569452762604, "rewards/margins": -0.28060245513916016, "rewards/rejected": 0.2872009873390198, "step": 2058 }, { "epoch": 0.31842257877440555, "grad_norm": 6.2234392166137695, "learning_rate": 4.965918203688854e-06, "logits/chosen": 10.994446754455566, "logits/rejected": 6.4054975509643555, "logps/chosen": -376.44976806640625, "logps/rejected": -220.66795349121094, "loss": 0.5575, "rewards/accuracies": 0.625, "rewards/chosen": 0.2972099184989929, "rewards/margins": 0.5810242891311646, "rewards/rejected": -0.28381434082984924, "step": 2059 }, { "epoch": 0.3185772279141697, "grad_norm": 7.8359456062316895, "learning_rate": 4.965631802039181e-06, "logits/chosen": 10.574647903442383, "logits/rejected": 4.935401916503906, "logps/chosen": -500.7237548828125, "logps/rejected": -347.1833190917969, "loss": 0.5659, "rewards/accuracies": 0.75, "rewards/chosen": 0.1308046281337738, "rewards/margins": 0.42363840341567993, "rewards/rejected": -0.29283377528190613, "step": 2060 }, { "epoch": 0.3187318770539339, "grad_norm": 5.31462287902832, "learning_rate": 4.9653454003895065e-06, "logits/chosen": 8.97025203704834, "logits/rejected": 6.5046186447143555, "logps/chosen": -378.29754638671875, "logps/rejected": -308.29364013671875, "loss": 0.5806, "rewards/accuracies": 0.625, "rewards/chosen": 0.22091758251190186, "rewards/margins": 0.31939026713371277, "rewards/rejected": -0.09847268462181091, "step": 2061 }, { "epoch": 0.31888652619369806, "grad_norm": 4.882965564727783, "learning_rate": 4.965058998739833e-06, "logits/chosen": 9.242798805236816, "logits/rejected": -0.6929713487625122, "logps/chosen": -308.7655944824219, "logps/rejected": -159.95748901367188, "loss": 0.5731, "rewards/accuracies": 0.875, "rewards/chosen": 0.10764069855213165, "rewards/margins": 0.3808231055736542, "rewards/rejected": -0.27318239212036133, "step": 2062 }, { "epoch": 0.3190411753334622, "grad_norm": 3.991692304611206, "learning_rate": 4.96477259709016e-06, "logits/chosen": 10.273489952087402, "logits/rejected": 7.59541130065918, "logps/chosen": -256.82427978515625, "logps/rejected": -248.18356323242188, "loss": 0.6164, "rewards/accuracies": 0.5, "rewards/chosen": 0.13332833349704742, "rewards/margins": 0.3333299160003662, "rewards/rejected": -0.20000162720680237, "step": 2063 }, { "epoch": 0.31919582447322636, "grad_norm": 5.288827419281006, "learning_rate": 4.9644861954404864e-06, "logits/chosen": 12.139446258544922, "logits/rejected": 7.976694107055664, "logps/chosen": -392.28106689453125, "logps/rejected": -291.5730895996094, "loss": 0.6064, "rewards/accuracies": 0.75, "rewards/chosen": 0.12893258035182953, "rewards/margins": 0.22405657172203064, "rewards/rejected": -0.0951240062713623, "step": 2064 }, { "epoch": 0.3193504736129905, "grad_norm": 4.785806179046631, "learning_rate": 4.964199793790812e-06, "logits/chosen": 11.824102401733398, "logits/rejected": 11.754941940307617, "logps/chosen": -252.7989959716797, "logps/rejected": -201.30653381347656, "loss": 0.7488, "rewards/accuracies": 0.375, "rewards/chosen": 0.009758569300174713, "rewards/margins": -0.08507233113050461, "rewards/rejected": 0.09483090043067932, "step": 2065 }, { "epoch": 0.31950512275275467, "grad_norm": 7.117434501647949, "learning_rate": 4.963913392141139e-06, "logits/chosen": 7.4334235191345215, "logits/rejected": 11.768050193786621, "logps/chosen": -244.57630920410156, "logps/rejected": -266.88580322265625, "loss": 0.6925, "rewards/accuracies": 0.75, "rewards/chosen": -0.05017123371362686, "rewards/margins": 0.10714679956436157, "rewards/rejected": -0.15731802582740784, "step": 2066 }, { "epoch": 0.3196597718925189, "grad_norm": 5.985042572021484, "learning_rate": 4.9636269904914655e-06, "logits/chosen": 8.437536239624023, "logits/rejected": 4.00594425201416, "logps/chosen": -195.0152587890625, "logps/rejected": -184.0806884765625, "loss": 0.6619, "rewards/accuracies": 0.5, "rewards/chosen": 0.11138291656970978, "rewards/margins": 0.23827090859413147, "rewards/rejected": -0.1268879920244217, "step": 2067 }, { "epoch": 0.319814421032283, "grad_norm": 6.469642162322998, "learning_rate": 4.963340588841792e-06, "logits/chosen": 2.145012855529785, "logits/rejected": 6.518708229064941, "logps/chosen": -277.6102600097656, "logps/rejected": -319.8463439941406, "loss": 0.7123, "rewards/accuracies": 0.375, "rewards/chosen": -0.30444109439849854, "rewards/margins": 0.010153524577617645, "rewards/rejected": -0.3145946264266968, "step": 2068 }, { "epoch": 0.3199690701720472, "grad_norm": 4.855186462402344, "learning_rate": 4.963054187192118e-06, "logits/chosen": 12.217367172241211, "logits/rejected": 11.836400985717773, "logps/chosen": -322.51153564453125, "logps/rejected": -326.32000732421875, "loss": 0.4969, "rewards/accuracies": 0.75, "rewards/chosen": 0.4644565284252167, "rewards/margins": 0.5568170547485352, "rewards/rejected": -0.0923604965209961, "step": 2069 }, { "epoch": 0.32012371931181133, "grad_norm": 4.596843719482422, "learning_rate": 4.962767785542445e-06, "logits/chosen": 9.516210556030273, "logits/rejected": 8.359548568725586, "logps/chosen": -255.34906005859375, "logps/rejected": -230.70223999023438, "loss": 0.5925, "rewards/accuracies": 0.75, "rewards/chosen": 0.11020180583000183, "rewards/margins": 0.25397002696990967, "rewards/rejected": -0.14376823604106903, "step": 2070 }, { "epoch": 0.3202783684515755, "grad_norm": 4.529107570648193, "learning_rate": 4.962481383892771e-06, "logits/chosen": 0.8398746252059937, "logits/rejected": 4.05929708480835, "logps/chosen": -163.35752868652344, "logps/rejected": -171.0614013671875, "loss": 0.6696, "rewards/accuracies": 0.375, "rewards/chosen": -0.03319668769836426, "rewards/margins": 0.05870075523853302, "rewards/rejected": -0.09189744293689728, "step": 2071 }, { "epoch": 0.32043301759133963, "grad_norm": 6.0078840255737305, "learning_rate": 4.962194982243098e-06, "logits/chosen": 2.13967227935791, "logits/rejected": 8.819110870361328, "logps/chosen": -144.24813842773438, "logps/rejected": -197.061279296875, "loss": 0.7275, "rewards/accuracies": 0.75, "rewards/chosen": -0.16000080108642578, "rewards/margins": 0.0023308396339416504, "rewards/rejected": -0.16233164072036743, "step": 2072 }, { "epoch": 0.3205876667311038, "grad_norm": 6.134139060974121, "learning_rate": 4.961908580593425e-06, "logits/chosen": 7.407665729522705, "logits/rejected": 7.107675075531006, "logps/chosen": -384.6820373535156, "logps/rejected": -256.6728820800781, "loss": 0.7827, "rewards/accuracies": 0.625, "rewards/chosen": -0.07551416754722595, "rewards/margins": -0.08895669132471085, "rewards/rejected": 0.013442512601613998, "step": 2073 }, { "epoch": 0.320742315870868, "grad_norm": 8.088295936584473, "learning_rate": 4.961622178943751e-06, "logits/chosen": 2.8475310802459717, "logits/rejected": 3.932326316833496, "logps/chosen": -308.1170959472656, "logps/rejected": -241.9561309814453, "loss": 0.5189, "rewards/accuracies": 0.875, "rewards/chosen": 0.2706906497478485, "rewards/margins": 0.5015276074409485, "rewards/rejected": -0.23083695769309998, "step": 2074 }, { "epoch": 0.32089696501063214, "grad_norm": 45.37759780883789, "learning_rate": 4.961335777294078e-06, "logits/chosen": 9.011927604675293, "logits/rejected": 8.506170272827148, "logps/chosen": -374.6376953125, "logps/rejected": -316.80126953125, "loss": 0.6749, "rewards/accuracies": 0.375, "rewards/chosen": -0.10515378415584564, "rewards/margins": 0.11395111680030823, "rewards/rejected": -0.21910487115383148, "step": 2075 }, { "epoch": 0.3210516141503963, "grad_norm": 5.602989196777344, "learning_rate": 4.961049375644404e-06, "logits/chosen": 12.401848793029785, "logits/rejected": 12.414278030395508, "logps/chosen": -281.40625, "logps/rejected": -253.7079315185547, "loss": 0.7303, "rewards/accuracies": 0.375, "rewards/chosen": 0.0323820635676384, "rewards/margins": 0.0015699826180934906, "rewards/rejected": 0.030812077224254608, "step": 2076 }, { "epoch": 0.32120626329016044, "grad_norm": 6.0530290603637695, "learning_rate": 4.96076297399473e-06, "logits/chosen": 9.598310470581055, "logits/rejected": 4.026642799377441, "logps/chosen": -376.65087890625, "logps/rejected": -279.203369140625, "loss": 0.6874, "rewards/accuracies": 0.625, "rewards/chosen": -0.1759847104549408, "rewards/margins": 0.05274325981736183, "rewards/rejected": -0.22872796654701233, "step": 2077 }, { "epoch": 0.3213609124299246, "grad_norm": 10.415705680847168, "learning_rate": 4.960476572345057e-06, "logits/chosen": 7.924570083618164, "logits/rejected": 5.085732936859131, "logps/chosen": -295.445068359375, "logps/rejected": -214.3765411376953, "loss": 0.7776, "rewards/accuracies": 0.375, "rewards/chosen": -0.24705390632152557, "rewards/margins": -0.08882950991392136, "rewards/rejected": -0.15822440385818481, "step": 2078 }, { "epoch": 0.32151556156968875, "grad_norm": 5.727771282196045, "learning_rate": 4.960190170695384e-06, "logits/chosen": 5.952622413635254, "logits/rejected": 4.558812618255615, "logps/chosen": -222.62860107421875, "logps/rejected": -184.36595153808594, "loss": 0.8128, "rewards/accuracies": 0.25, "rewards/chosen": -0.4535398781299591, "rewards/margins": -0.16294801235198975, "rewards/rejected": -0.290591835975647, "step": 2079 }, { "epoch": 0.32167021070945295, "grad_norm": 6.961986541748047, "learning_rate": 4.95990376904571e-06, "logits/chosen": 11.362753868103027, "logits/rejected": 8.551691055297852, "logps/chosen": -270.36566162109375, "logps/rejected": -272.29693603515625, "loss": 0.7045, "rewards/accuracies": 0.625, "rewards/chosen": -0.04510479047894478, "rewards/margins": 0.051329128444194794, "rewards/rejected": -0.09643393009901047, "step": 2080 }, { "epoch": 0.3218248598492171, "grad_norm": 8.04045295715332, "learning_rate": 4.959617367396037e-06, "logits/chosen": 5.838197231292725, "logits/rejected": 2.3217861652374268, "logps/chosen": -303.8896484375, "logps/rejected": -261.91864013671875, "loss": 0.5366, "rewards/accuracies": 0.875, "rewards/chosen": 0.18876473605632782, "rewards/margins": 0.40919047594070435, "rewards/rejected": -0.22042575478553772, "step": 2081 }, { "epoch": 0.32197950898898126, "grad_norm": 6.574918270111084, "learning_rate": 4.959330965746363e-06, "logits/chosen": 9.111629486083984, "logits/rejected": 4.173726558685303, "logps/chosen": -306.8055419921875, "logps/rejected": -274.29632568359375, "loss": 0.701, "rewards/accuracies": 0.625, "rewards/chosen": -0.16950541734695435, "rewards/margins": 0.027989469468593597, "rewards/rejected": -0.19749489426612854, "step": 2082 }, { "epoch": 0.3221341581287454, "grad_norm": 7.2142791748046875, "learning_rate": 4.959044564096689e-06, "logits/chosen": 16.955080032348633, "logits/rejected": 16.498762130737305, "logps/chosen": -366.94305419921875, "logps/rejected": -268.07958984375, "loss": 0.6393, "rewards/accuracies": 0.625, "rewards/chosen": 0.3865606188774109, "rewards/margins": 0.18941573798656464, "rewards/rejected": 0.19714492559432983, "step": 2083 }, { "epoch": 0.32228880726850956, "grad_norm": 6.034248352050781, "learning_rate": 4.958758162447016e-06, "logits/chosen": 11.447286605834961, "logits/rejected": 10.902953147888184, "logps/chosen": -280.1450500488281, "logps/rejected": -301.2906799316406, "loss": 0.6028, "rewards/accuracies": 0.625, "rewards/chosen": 0.019470401108264923, "rewards/margins": 0.23493099212646484, "rewards/rejected": -0.21546059846878052, "step": 2084 }, { "epoch": 0.3224434564082737, "grad_norm": 6.518868446350098, "learning_rate": 4.958471760797343e-06, "logits/chosen": 6.861526966094971, "logits/rejected": 7.466789722442627, "logps/chosen": -229.49166870117188, "logps/rejected": -219.41001892089844, "loss": 0.7894, "rewards/accuracies": 0.25, "rewards/chosen": -0.057122763246297836, "rewards/margins": -0.11546182632446289, "rewards/rejected": 0.05833907425403595, "step": 2085 }, { "epoch": 0.3225981055480379, "grad_norm": 90.26467895507812, "learning_rate": 4.958185359147669e-06, "logits/chosen": 7.893308639526367, "logits/rejected": 3.7543699741363525, "logps/chosen": -231.8357391357422, "logps/rejected": -215.47564697265625, "loss": 0.5828, "rewards/accuracies": 0.5, "rewards/chosen": -0.07351937890052795, "rewards/margins": 0.3292279839515686, "rewards/rejected": -0.40274736285209656, "step": 2086 }, { "epoch": 0.32275275468780207, "grad_norm": 6.308873176574707, "learning_rate": 4.957898957497996e-06, "logits/chosen": 13.796135902404785, "logits/rejected": 9.136918067932129, "logps/chosen": -247.26329040527344, "logps/rejected": -213.85240173339844, "loss": 0.6556, "rewards/accuracies": 0.5, "rewards/chosen": -0.03864327073097229, "rewards/margins": 0.11595988273620605, "rewards/rejected": -0.15460315346717834, "step": 2087 }, { "epoch": 0.3229074038275662, "grad_norm": 9.705131530761719, "learning_rate": 4.957612555848323e-06, "logits/chosen": 8.198763847351074, "logits/rejected": 10.425496101379395, "logps/chosen": -239.85275268554688, "logps/rejected": -360.1874084472656, "loss": 0.7244, "rewards/accuracies": 0.5, "rewards/chosen": 0.087799072265625, "rewards/margins": 0.17272567749023438, "rewards/rejected": -0.08492660522460938, "step": 2088 }, { "epoch": 0.32306205296733037, "grad_norm": 6.444095611572266, "learning_rate": 4.9573261541986485e-06, "logits/chosen": 12.79790210723877, "logits/rejected": 4.606011867523193, "logps/chosen": -301.3984375, "logps/rejected": -238.3819580078125, "loss": 0.7282, "rewards/accuracies": 0.5, "rewards/chosen": -0.3333694338798523, "rewards/margins": -0.024694059044122696, "rewards/rejected": -0.3086753785610199, "step": 2089 }, { "epoch": 0.3232167021070945, "grad_norm": 14.251069068908691, "learning_rate": 4.957039752548975e-06, "logits/chosen": 14.770837783813477, "logits/rejected": 0.0905866026878357, "logps/chosen": -520.4288940429688, "logps/rejected": -313.33428955078125, "loss": 0.6629, "rewards/accuracies": 0.5, "rewards/chosen": 0.1710100919008255, "rewards/margins": 0.1557396948337555, "rewards/rejected": 0.015270419418811798, "step": 2090 }, { "epoch": 0.3233713512468587, "grad_norm": 5.565276622772217, "learning_rate": 4.956753350899302e-06, "logits/chosen": 4.931995868682861, "logits/rejected": 9.044991493225098, "logps/chosen": -241.9562225341797, "logps/rejected": -231.12319946289062, "loss": 0.7162, "rewards/accuracies": 0.625, "rewards/chosen": -0.1074863001704216, "rewards/margins": 0.3090801239013672, "rewards/rejected": -0.4165664315223694, "step": 2091 }, { "epoch": 0.3235260003866228, "grad_norm": 4.336131572723389, "learning_rate": 4.956466949249628e-06, "logits/chosen": 11.24570083618164, "logits/rejected": 10.222318649291992, "logps/chosen": -217.121337890625, "logps/rejected": -206.05752563476562, "loss": 0.5962, "rewards/accuracies": 0.625, "rewards/chosen": -0.13777370750904083, "rewards/margins": 0.28516972064971924, "rewards/rejected": -0.42294347286224365, "step": 2092 }, { "epoch": 0.32368064952638703, "grad_norm": 5.697318077087402, "learning_rate": 4.956180547599955e-06, "logits/chosen": 6.302378177642822, "logits/rejected": 10.067934036254883, "logps/chosen": -289.6753845214844, "logps/rejected": -320.90655517578125, "loss": 0.6154, "rewards/accuracies": 0.75, "rewards/chosen": 0.2821327745914459, "rewards/margins": 0.1861279457807541, "rewards/rejected": 0.09600482136011124, "step": 2093 }, { "epoch": 0.3238352986661512, "grad_norm": 7.901885032653809, "learning_rate": 4.955894145950282e-06, "logits/chosen": 16.094385147094727, "logits/rejected": 2.6949076652526855, "logps/chosen": -439.53515625, "logps/rejected": -155.47381591796875, "loss": 0.5518, "rewards/accuracies": 0.75, "rewards/chosen": 0.10470141470432281, "rewards/margins": 0.508083701133728, "rewards/rejected": -0.40338224172592163, "step": 2094 }, { "epoch": 0.32398994780591533, "grad_norm": 4.967139720916748, "learning_rate": 4.9556077443006075e-06, "logits/chosen": 10.589612007141113, "logits/rejected": 10.220674514770508, "logps/chosen": -315.0977478027344, "logps/rejected": -268.65277099609375, "loss": 0.5815, "rewards/accuracies": 0.75, "rewards/chosen": 0.048830799758434296, "rewards/margins": 0.24980440735816956, "rewards/rejected": -0.20097360014915466, "step": 2095 }, { "epoch": 0.3241445969456795, "grad_norm": 5.359178066253662, "learning_rate": 4.955321342650934e-06, "logits/chosen": 7.773028373718262, "logits/rejected": 4.351597785949707, "logps/chosen": -226.37010192871094, "logps/rejected": -161.7034912109375, "loss": 0.6527, "rewards/accuracies": 0.375, "rewards/chosen": 0.08516424894332886, "rewards/margins": 0.09899123013019562, "rewards/rejected": -0.013826975598931313, "step": 2096 }, { "epoch": 0.32429924608544364, "grad_norm": 3.8178317546844482, "learning_rate": 4.955034941001261e-06, "logits/chosen": 15.25799560546875, "logits/rejected": 8.078344345092773, "logps/chosen": -173.55859375, "logps/rejected": -186.33509826660156, "loss": 0.6671, "rewards/accuracies": 0.375, "rewards/chosen": -0.07011398673057556, "rewards/margins": 0.10311809182167053, "rewards/rejected": -0.1732320785522461, "step": 2097 }, { "epoch": 0.3244538952252078, "grad_norm": 4.950554847717285, "learning_rate": 4.9547485393515875e-06, "logits/chosen": 14.872785568237305, "logits/rejected": 8.911056518554688, "logps/chosen": -358.3226623535156, "logps/rejected": -257.03985595703125, "loss": 0.5664, "rewards/accuracies": 0.75, "rewards/chosen": 0.23612824082374573, "rewards/margins": 0.49434345960617065, "rewards/rejected": -0.2582152485847473, "step": 2098 }, { "epoch": 0.324608544364972, "grad_norm": 5.108302116394043, "learning_rate": 4.954462137701913e-06, "logits/chosen": 11.643625259399414, "logits/rejected": 6.648677825927734, "logps/chosen": -287.53985595703125, "logps/rejected": -146.73568725585938, "loss": 0.7027, "rewards/accuracies": 0.625, "rewards/chosen": -0.23673954606056213, "rewards/margins": -0.0164263267070055, "rewards/rejected": -0.22031322121620178, "step": 2099 }, { "epoch": 0.32476319350473615, "grad_norm": 4.536948204040527, "learning_rate": 4.95417573605224e-06, "logits/chosen": 10.21849250793457, "logits/rejected": 3.383326292037964, "logps/chosen": -306.2857360839844, "logps/rejected": -190.88919067382812, "loss": 0.4834, "rewards/accuracies": 0.75, "rewards/chosen": 0.44766801595687866, "rewards/margins": 0.6022131443023682, "rewards/rejected": -0.1545451581478119, "step": 2100 }, { "epoch": 0.3249178426445003, "grad_norm": 5.082120418548584, "learning_rate": 4.9538893344025666e-06, "logits/chosen": 7.061097621917725, "logits/rejected": 3.894728660583496, "logps/chosen": -364.2991943359375, "logps/rejected": -309.91064453125, "loss": 0.4424, "rewards/accuracies": 0.875, "rewards/chosen": 0.3529106378555298, "rewards/margins": 0.6648345589637756, "rewards/rejected": -0.31192389130592346, "step": 2101 }, { "epoch": 0.32507249178426445, "grad_norm": 4.058323860168457, "learning_rate": 4.953602932752893e-06, "logits/chosen": 9.08685302734375, "logits/rejected": 9.114982604980469, "logps/chosen": -183.080322265625, "logps/rejected": -221.96896362304688, "loss": 0.5011, "rewards/accuracies": 0.75, "rewards/chosen": 0.20131412148475647, "rewards/margins": 0.6041401028633118, "rewards/rejected": -0.4028259813785553, "step": 2102 }, { "epoch": 0.3252271409240286, "grad_norm": 4.774957656860352, "learning_rate": 4.953316531103219e-06, "logits/chosen": 13.167054176330566, "logits/rejected": 5.450692176818848, "logps/chosen": -331.02880859375, "logps/rejected": -213.2152862548828, "loss": 0.5538, "rewards/accuracies": 0.75, "rewards/chosen": 0.455463707447052, "rewards/margins": 0.3619253933429718, "rewards/rejected": 0.09353828430175781, "step": 2103 }, { "epoch": 0.32538179006379275, "grad_norm": 4.387048244476318, "learning_rate": 4.953030129453546e-06, "logits/chosen": 8.454203605651855, "logits/rejected": 10.961136817932129, "logps/chosen": -245.9151611328125, "logps/rejected": -292.20928955078125, "loss": 0.5568, "rewards/accuracies": 0.875, "rewards/chosen": 0.021159403026103973, "rewards/margins": 0.33990398049354553, "rewards/rejected": -0.31874457001686096, "step": 2104 }, { "epoch": 0.3255364392035569, "grad_norm": 5.282306671142578, "learning_rate": 4.952743727803872e-06, "logits/chosen": 11.518450736999512, "logits/rejected": 7.982766151428223, "logps/chosen": -334.3492431640625, "logps/rejected": -217.17800903320312, "loss": 0.6692, "rewards/accuracies": 0.75, "rewards/chosen": -0.03269042819738388, "rewards/margins": 0.11648471653461456, "rewards/rejected": -0.14917513728141785, "step": 2105 }, { "epoch": 0.3256910883433211, "grad_norm": 5.2052202224731445, "learning_rate": 4.952457326154199e-06, "logits/chosen": 12.729175567626953, "logits/rejected": 6.6468095779418945, "logps/chosen": -306.32366943359375, "logps/rejected": -265.7822570800781, "loss": 0.5525, "rewards/accuracies": 0.875, "rewards/chosen": 0.09600372612476349, "rewards/margins": 0.3471567928791046, "rewards/rejected": -0.2511530816555023, "step": 2106 }, { "epoch": 0.32584573748308526, "grad_norm": 4.601687431335449, "learning_rate": 4.952170924504526e-06, "logits/chosen": 13.283109664916992, "logits/rejected": 3.378521203994751, "logps/chosen": -374.67254638671875, "logps/rejected": -253.61602783203125, "loss": 0.5105, "rewards/accuracies": 0.75, "rewards/chosen": 0.42353230714797974, "rewards/margins": 0.49716857075691223, "rewards/rejected": -0.0736362412571907, "step": 2107 }, { "epoch": 0.3260003866228494, "grad_norm": 4.700244426727295, "learning_rate": 4.9518845228548514e-06, "logits/chosen": 6.466777801513672, "logits/rejected": 3.8909730911254883, "logps/chosen": -231.74525451660156, "logps/rejected": -174.89566040039062, "loss": 0.6818, "rewards/accuracies": 0.375, "rewards/chosen": 0.32801496982574463, "rewards/margins": 0.11791111528873444, "rewards/rejected": 0.2101038247346878, "step": 2108 }, { "epoch": 0.32615503576261357, "grad_norm": 4.501307010650635, "learning_rate": 4.951598121205178e-06, "logits/chosen": 13.879536628723145, "logits/rejected": 7.336886882781982, "logps/chosen": -202.54266357421875, "logps/rejected": -116.31472778320312, "loss": 0.6563, "rewards/accuracies": 0.5, "rewards/chosen": -0.013379007577896118, "rewards/margins": 0.16892746090888977, "rewards/rejected": -0.1823064386844635, "step": 2109 }, { "epoch": 0.3263096849023777, "grad_norm": 7.5384602546691895, "learning_rate": 4.951311719555505e-06, "logits/chosen": 13.817075729370117, "logits/rejected": 5.907853603363037, "logps/chosen": -234.12368774414062, "logps/rejected": -131.2285919189453, "loss": 0.7606, "rewards/accuracies": 0.375, "rewards/chosen": -0.1672285497188568, "rewards/margins": -0.025172285735607147, "rewards/rejected": -0.14205628633499146, "step": 2110 }, { "epoch": 0.32646433404214187, "grad_norm": 6.5641093254089355, "learning_rate": 4.951025317905831e-06, "logits/chosen": 8.768404006958008, "logits/rejected": 13.349371910095215, "logps/chosen": -275.34429931640625, "logps/rejected": -348.5492248535156, "loss": 0.7386, "rewards/accuracies": 0.625, "rewards/chosen": -0.18378829956054688, "rewards/margins": -0.06050229072570801, "rewards/rejected": -0.12328600883483887, "step": 2111 }, { "epoch": 0.3266189831819061, "grad_norm": 7.226702690124512, "learning_rate": 4.950738916256158e-06, "logits/chosen": 8.845691680908203, "logits/rejected": 12.88252067565918, "logps/chosen": -301.1474609375, "logps/rejected": -445.46343994140625, "loss": 0.7479, "rewards/accuracies": 0.375, "rewards/chosen": 0.08916091918945312, "rewards/margins": -0.03999757766723633, "rewards/rejected": 0.12915846705436707, "step": 2112 }, { "epoch": 0.3267736323216702, "grad_norm": 7.249813079833984, "learning_rate": 4.950452514606485e-06, "logits/chosen": 5.492649078369141, "logits/rejected": 0.48049187660217285, "logps/chosen": -425.01708984375, "logps/rejected": -296.83221435546875, "loss": 0.7061, "rewards/accuracies": 0.625, "rewards/chosen": 0.30592823028564453, "rewards/margins": 0.052317190915346146, "rewards/rejected": 0.2536110579967499, "step": 2113 }, { "epoch": 0.3269282814614344, "grad_norm": 6.455074310302734, "learning_rate": 4.950166112956811e-06, "logits/chosen": 7.3708882331848145, "logits/rejected": 11.505752563476562, "logps/chosen": -282.5945739746094, "logps/rejected": -288.6684265136719, "loss": 0.6424, "rewards/accuracies": 0.5, "rewards/chosen": 0.12609276175498962, "rewards/margins": 0.15550580620765686, "rewards/rejected": -0.029413044452667236, "step": 2114 }, { "epoch": 0.32708293060119853, "grad_norm": 6.537588119506836, "learning_rate": 4.949879711307137e-06, "logits/chosen": 13.17055606842041, "logits/rejected": 5.763317108154297, "logps/chosen": -323.1358947753906, "logps/rejected": -250.46908569335938, "loss": 0.5604, "rewards/accuracies": 0.5, "rewards/chosen": 0.3411043584346771, "rewards/margins": 0.4357706606388092, "rewards/rejected": -0.09466633945703506, "step": 2115 }, { "epoch": 0.3272375797409627, "grad_norm": 5.728628158569336, "learning_rate": 4.949593309657464e-06, "logits/chosen": 11.254746437072754, "logits/rejected": 9.19675064086914, "logps/chosen": -234.3589630126953, "logps/rejected": -301.9439697265625, "loss": 0.6734, "rewards/accuracies": 0.375, "rewards/chosen": 0.1750057190656662, "rewards/margins": 0.3434600830078125, "rewards/rejected": -0.1684543341398239, "step": 2116 }, { "epoch": 0.32739222888072683, "grad_norm": 4.271417140960693, "learning_rate": 4.9493069080077904e-06, "logits/chosen": 9.724249839782715, "logits/rejected": 6.8040266036987305, "logps/chosen": -313.16461181640625, "logps/rejected": -248.35000610351562, "loss": 0.5831, "rewards/accuracies": 0.75, "rewards/chosen": 0.024228574708104134, "rewards/margins": 0.28434285521507263, "rewards/rejected": -0.26011431217193604, "step": 2117 }, { "epoch": 0.327546878020491, "grad_norm": 7.3210554122924805, "learning_rate": 4.949020506358117e-06, "logits/chosen": 3.5579299926757812, "logits/rejected": 4.765020847320557, "logps/chosen": -287.75970458984375, "logps/rejected": -297.7496032714844, "loss": 0.7942, "rewards/accuracies": 0.375, "rewards/chosen": -0.10287874937057495, "rewards/margins": -0.12194245308637619, "rewards/rejected": 0.019063711166381836, "step": 2118 }, { "epoch": 0.3277015271602552, "grad_norm": 6.927490234375, "learning_rate": 4.948734104708444e-06, "logits/chosen": 12.66522216796875, "logits/rejected": 7.755368232727051, "logps/chosen": -276.4278259277344, "logps/rejected": -245.3193359375, "loss": 0.6959, "rewards/accuracies": 0.5, "rewards/chosen": 0.06859144568443298, "rewards/margins": 0.12725737690925598, "rewards/rejected": -0.05866594240069389, "step": 2119 }, { "epoch": 0.32785617630001934, "grad_norm": 15.658063888549805, "learning_rate": 4.94844770305877e-06, "logits/chosen": 12.226007461547852, "logits/rejected": 9.363908767700195, "logps/chosen": -143.8397216796875, "logps/rejected": -190.3950653076172, "loss": 0.8341, "rewards/accuracies": 0.375, "rewards/chosen": -0.6046377420425415, "rewards/margins": -0.2315879464149475, "rewards/rejected": -0.3730497658252716, "step": 2120 }, { "epoch": 0.3280108254397835, "grad_norm": 4.808434963226318, "learning_rate": 4.948161301409097e-06, "logits/chosen": 7.353221416473389, "logits/rejected": 10.257070541381836, "logps/chosen": -173.77374267578125, "logps/rejected": -243.19540405273438, "loss": 0.6458, "rewards/accuracies": 0.625, "rewards/chosen": 0.04191150516271591, "rewards/margins": 0.2779221534729004, "rewards/rejected": -0.2360106259584427, "step": 2121 }, { "epoch": 0.32816547457954764, "grad_norm": 6.093598365783691, "learning_rate": 4.947874899759423e-06, "logits/chosen": 12.379583358764648, "logits/rejected": 7.348960876464844, "logps/chosen": -349.2428894042969, "logps/rejected": -271.2073059082031, "loss": 0.6332, "rewards/accuracies": 0.625, "rewards/chosen": 0.2535230815410614, "rewards/margins": 0.37280115485191345, "rewards/rejected": -0.11927807331085205, "step": 2122 }, { "epoch": 0.3283201237193118, "grad_norm": 6.790486812591553, "learning_rate": 4.9475884981097495e-06, "logits/chosen": 11.065315246582031, "logits/rejected": 5.394992351531982, "logps/chosen": -319.71978759765625, "logps/rejected": -227.23475646972656, "loss": 0.7013, "rewards/accuracies": 0.5, "rewards/chosen": 0.2551566958427429, "rewards/margins": 0.07567586749792099, "rewards/rejected": 0.17948083579540253, "step": 2123 }, { "epoch": 0.32847477285907595, "grad_norm": 6.010940074920654, "learning_rate": 4.947302096460076e-06, "logits/chosen": 10.837716102600098, "logits/rejected": 6.9498820304870605, "logps/chosen": -429.13482666015625, "logps/rejected": -370.83343505859375, "loss": 0.7371, "rewards/accuracies": 0.375, "rewards/chosen": 0.30295678973197937, "rewards/margins": 0.16916626691818237, "rewards/rejected": 0.1337904930114746, "step": 2124 }, { "epoch": 0.32862942199884015, "grad_norm": 6.506544589996338, "learning_rate": 4.947015694810403e-06, "logits/chosen": 2.0541067123413086, "logits/rejected": 1.5029680728912354, "logps/chosen": -227.41168212890625, "logps/rejected": -219.27052307128906, "loss": 0.6139, "rewards/accuracies": 0.5, "rewards/chosen": 0.010286381468176842, "rewards/margins": 0.1993921399116516, "rewards/rejected": -0.18910574913024902, "step": 2125 }, { "epoch": 0.3287840711386043, "grad_norm": 4.318284511566162, "learning_rate": 4.9467292931607294e-06, "logits/chosen": 8.311731338500977, "logits/rejected": 1.882987380027771, "logps/chosen": -206.57373046875, "logps/rejected": -185.872314453125, "loss": 0.5713, "rewards/accuracies": 0.5, "rewards/chosen": -0.03734751045703888, "rewards/margins": 0.38800737261772156, "rewards/rejected": -0.42535486817359924, "step": 2126 }, { "epoch": 0.32893872027836846, "grad_norm": 6.992857456207275, "learning_rate": 4.946442891511056e-06, "logits/chosen": 10.2127046585083, "logits/rejected": 8.646617889404297, "logps/chosen": -376.332275390625, "logps/rejected": -331.8555908203125, "loss": 0.7944, "rewards/accuracies": 0.625, "rewards/chosen": -0.23878727853298187, "rewards/margins": -0.11294031888246536, "rewards/rejected": -0.1258469521999359, "step": 2127 }, { "epoch": 0.3290933694181326, "grad_norm": 4.463717937469482, "learning_rate": 4.946156489861382e-06, "logits/chosen": 10.69537353515625, "logits/rejected": 4.715465068817139, "logps/chosen": -296.04986572265625, "logps/rejected": -245.56149291992188, "loss": 0.5613, "rewards/accuracies": 0.875, "rewards/chosen": 0.549194872379303, "rewards/margins": 0.33023345470428467, "rewards/rejected": 0.2189614474773407, "step": 2128 }, { "epoch": 0.32924801855789676, "grad_norm": 4.673308849334717, "learning_rate": 4.9458700882117085e-06, "logits/chosen": 11.814477920532227, "logits/rejected": 4.963963508605957, "logps/chosen": -307.353759765625, "logps/rejected": -207.64404296875, "loss": 0.5448, "rewards/accuracies": 0.875, "rewards/chosen": 0.2856033444404602, "rewards/margins": 0.5145837068557739, "rewards/rejected": -0.22898036241531372, "step": 2129 }, { "epoch": 0.3294026676976609, "grad_norm": 8.315261840820312, "learning_rate": 4.945583686562035e-06, "logits/chosen": 11.927656173706055, "logits/rejected": 9.667914390563965, "logps/chosen": -384.94854736328125, "logps/rejected": -338.69158935546875, "loss": 0.6677, "rewards/accuracies": 0.625, "rewards/chosen": 0.2725120484828949, "rewards/margins": 0.13412266969680786, "rewards/rejected": 0.13838940858840942, "step": 2130 }, { "epoch": 0.3295573168374251, "grad_norm": 4.120357036590576, "learning_rate": 4.945297284912362e-06, "logits/chosen": 10.142644882202148, "logits/rejected": 6.500965118408203, "logps/chosen": -223.88516235351562, "logps/rejected": -137.29342651367188, "loss": 0.6292, "rewards/accuracies": 0.625, "rewards/chosen": -0.005709746852517128, "rewards/margins": 0.16655507683753967, "rewards/rejected": -0.17226482927799225, "step": 2131 }, { "epoch": 0.32971196597718927, "grad_norm": 5.709237575531006, "learning_rate": 4.9450108832626885e-06, "logits/chosen": 8.734935760498047, "logits/rejected": 3.5999698638916016, "logps/chosen": -222.15179443359375, "logps/rejected": -206.25152587890625, "loss": 0.712, "rewards/accuracies": 0.5, "rewards/chosen": -0.12729153037071228, "rewards/margins": 0.18759658932685852, "rewards/rejected": -0.3148880898952484, "step": 2132 }, { "epoch": 0.3298666151169534, "grad_norm": 7.291140556335449, "learning_rate": 4.944724481613014e-06, "logits/chosen": 4.563351631164551, "logits/rejected": 5.958873271942139, "logps/chosen": -266.71563720703125, "logps/rejected": -329.9641418457031, "loss": 0.6278, "rewards/accuracies": 0.75, "rewards/chosen": -0.0905306339263916, "rewards/margins": 0.22541368007659912, "rewards/rejected": -0.3159443140029907, "step": 2133 }, { "epoch": 0.3300212642567176, "grad_norm": 6.44389533996582, "learning_rate": 4.944438079963341e-06, "logits/chosen": 4.952027320861816, "logits/rejected": 5.0103888511657715, "logps/chosen": -237.36703491210938, "logps/rejected": -254.44178771972656, "loss": 0.7994, "rewards/accuracies": 0.25, "rewards/chosen": 0.16160213947296143, "rewards/margins": -0.07677437365055084, "rewards/rejected": 0.23837654292583466, "step": 2134 }, { "epoch": 0.3301759133964817, "grad_norm": 16.3490047454834, "learning_rate": 4.944151678313668e-06, "logits/chosen": 8.593355178833008, "logits/rejected": 7.395482063293457, "logps/chosen": -335.38397216796875, "logps/rejected": -294.83551025390625, "loss": 0.8929, "rewards/accuracies": 0.375, "rewards/chosen": 0.19003048539161682, "rewards/margins": -0.3215496838092804, "rewards/rejected": 0.511580228805542, "step": 2135 }, { "epoch": 0.3303305625362459, "grad_norm": 5.849039077758789, "learning_rate": 4.943865276663994e-06, "logits/chosen": 18.099807739257812, "logits/rejected": 12.95901870727539, "logps/chosen": -231.89871215820312, "logps/rejected": -217.69552612304688, "loss": 0.6823, "rewards/accuracies": 0.625, "rewards/chosen": 0.13253240287303925, "rewards/margins": 0.10790696740150452, "rewards/rejected": 0.024625442922115326, "step": 2136 }, { "epoch": 0.33048521167601, "grad_norm": 19.142820358276367, "learning_rate": 4.94357887501432e-06, "logits/chosen": 12.757999420166016, "logits/rejected": 0.3978739082813263, "logps/chosen": -303.5205993652344, "logps/rejected": -153.62709045410156, "loss": 0.5001, "rewards/accuracies": 0.875, "rewards/chosen": 0.392508327960968, "rewards/margins": 0.4961417317390442, "rewards/rejected": -0.10363340377807617, "step": 2137 }, { "epoch": 0.33063986081577423, "grad_norm": 5.831638813018799, "learning_rate": 4.943292473364647e-06, "logits/chosen": 1.585843801498413, "logits/rejected": 2.117445707321167, "logps/chosen": -205.6033172607422, "logps/rejected": -255.28086853027344, "loss": 0.7501, "rewards/accuracies": 0.625, "rewards/chosen": -0.10074068605899811, "rewards/margins": -0.06774191558361053, "rewards/rejected": -0.03299874812364578, "step": 2138 }, { "epoch": 0.3307945099555384, "grad_norm": 7.719616889953613, "learning_rate": 4.943006071714973e-06, "logits/chosen": 12.479236602783203, "logits/rejected": 10.195213317871094, "logps/chosen": -280.171630859375, "logps/rejected": -305.09393310546875, "loss": 0.777, "rewards/accuracies": 0.625, "rewards/chosen": 0.22542619705200195, "rewards/margins": -0.02217789739370346, "rewards/rejected": 0.247604101896286, "step": 2139 }, { "epoch": 0.33094915909530254, "grad_norm": 4.695708751678467, "learning_rate": 4.9427196700653e-06, "logits/chosen": 10.320137023925781, "logits/rejected": 8.84394359588623, "logps/chosen": -252.85000610351562, "logps/rejected": -201.3157958984375, "loss": 0.6776, "rewards/accuracies": 0.625, "rewards/chosen": 0.3142176568508148, "rewards/margins": 0.13354313373565674, "rewards/rejected": 0.18067452311515808, "step": 2140 }, { "epoch": 0.3311038082350667, "grad_norm": 6.1608171463012695, "learning_rate": 4.942433268415626e-06, "logits/chosen": 6.467574119567871, "logits/rejected": 6.092167377471924, "logps/chosen": -216.0699005126953, "logps/rejected": -213.14077758789062, "loss": 0.8243, "rewards/accuracies": 0.25, "rewards/chosen": -0.17924192547798157, "rewards/margins": -0.18616530299186707, "rewards/rejected": 0.006923386827111244, "step": 2141 }, { "epoch": 0.33125845737483084, "grad_norm": 4.709331512451172, "learning_rate": 4.9421468667659525e-06, "logits/chosen": 12.148876190185547, "logits/rejected": 10.136076927185059, "logps/chosen": -248.241455078125, "logps/rejected": -265.0379333496094, "loss": 0.6733, "rewards/accuracies": 0.375, "rewards/chosen": 0.036957353353500366, "rewards/margins": 0.10949775576591492, "rewards/rejected": -0.07254038006067276, "step": 2142 }, { "epoch": 0.331413106514595, "grad_norm": 13.91828727722168, "learning_rate": 4.941860465116279e-06, "logits/chosen": 10.493155479431152, "logits/rejected": 8.240067481994629, "logps/chosen": -205.35140991210938, "logps/rejected": -286.3068542480469, "loss": 0.8099, "rewards/accuracies": 0.375, "rewards/chosen": 0.07062984257936478, "rewards/margins": -0.1602824628353119, "rewards/rejected": 0.23091231286525726, "step": 2143 }, { "epoch": 0.3315677556543592, "grad_norm": 5.486730575561523, "learning_rate": 4.941574063466606e-06, "logits/chosen": 10.753125190734863, "logits/rejected": 5.998003005981445, "logps/chosen": -145.19281005859375, "logps/rejected": -147.0620574951172, "loss": 0.6293, "rewards/accuracies": 0.75, "rewards/chosen": -0.012790344655513763, "rewards/margins": 0.17342935502529144, "rewards/rejected": -0.1862196922302246, "step": 2144 }, { "epoch": 0.33172240479412335, "grad_norm": 5.468768119812012, "learning_rate": 4.941287661816932e-06, "logits/chosen": 10.622962951660156, "logits/rejected": 6.332726955413818, "logps/chosen": -341.4171447753906, "logps/rejected": -252.20851135253906, "loss": 0.6374, "rewards/accuracies": 0.75, "rewards/chosen": 0.3033198416233063, "rewards/margins": 0.21081838011741638, "rewards/rejected": 0.0925014540553093, "step": 2145 }, { "epoch": 0.3318770539338875, "grad_norm": 6.681360244750977, "learning_rate": 4.941001260167259e-06, "logits/chosen": 6.566615581512451, "logits/rejected": 8.766490936279297, "logps/chosen": -307.0692138671875, "logps/rejected": -330.61456298828125, "loss": 0.6946, "rewards/accuracies": 0.625, "rewards/chosen": 0.11982184648513794, "rewards/margins": 0.0829787403345108, "rewards/rejected": 0.036843106150627136, "step": 2146 }, { "epoch": 0.33203170307365165, "grad_norm": 5.155014514923096, "learning_rate": 4.940714858517586e-06, "logits/chosen": 8.016521453857422, "logits/rejected": 3.768669843673706, "logps/chosen": -331.3190612792969, "logps/rejected": -281.0326232910156, "loss": 0.5944, "rewards/accuracies": 0.625, "rewards/chosen": 0.2365173101425171, "rewards/margins": 0.2611900269985199, "rewards/rejected": -0.024672742933034897, "step": 2147 }, { "epoch": 0.3321863522134158, "grad_norm": 4.372710704803467, "learning_rate": 4.9404284568679115e-06, "logits/chosen": 9.412343978881836, "logits/rejected": 7.2935357093811035, "logps/chosen": -228.67184448242188, "logps/rejected": -178.893310546875, "loss": 0.6754, "rewards/accuracies": 0.75, "rewards/chosen": 0.01220519095659256, "rewards/margins": 0.10863519459962845, "rewards/rejected": -0.09643000364303589, "step": 2148 }, { "epoch": 0.33234100135317995, "grad_norm": 6.063559532165527, "learning_rate": 4.940142055218238e-06, "logits/chosen": 12.230186462402344, "logits/rejected": 8.85799789428711, "logps/chosen": -325.6956787109375, "logps/rejected": -299.7931213378906, "loss": 0.6154, "rewards/accuracies": 0.75, "rewards/chosen": 0.35750043392181396, "rewards/margins": 0.3025447428226471, "rewards/rejected": 0.05495566874742508, "step": 2149 }, { "epoch": 0.3324956504929441, "grad_norm": 5.640564441680908, "learning_rate": 4.939855653568565e-06, "logits/chosen": 12.984869003295898, "logits/rejected": 4.921185493469238, "logps/chosen": -243.9294891357422, "logps/rejected": -144.32144165039062, "loss": 0.6189, "rewards/accuracies": 0.875, "rewards/chosen": 0.23765462636947632, "rewards/margins": 0.19950953125953674, "rewards/rejected": 0.03814505785703659, "step": 2150 }, { "epoch": 0.3326502996327083, "grad_norm": 4.923940658569336, "learning_rate": 4.9395692519188915e-06, "logits/chosen": 15.85429859161377, "logits/rejected": 12.531364440917969, "logps/chosen": -255.62393188476562, "logps/rejected": -256.057861328125, "loss": 0.6162, "rewards/accuracies": 0.625, "rewards/chosen": 0.11795315891504288, "rewards/margins": 0.23732790350914001, "rewards/rejected": -0.11937475204467773, "step": 2151 }, { "epoch": 0.33280494877247246, "grad_norm": 4.349578857421875, "learning_rate": 4.939282850269218e-06, "logits/chosen": 4.954311370849609, "logits/rejected": 7.276858806610107, "logps/chosen": -260.74139404296875, "logps/rejected": -193.87864685058594, "loss": 0.5965, "rewards/accuracies": 0.625, "rewards/chosen": 0.3750847578048706, "rewards/margins": 0.27639806270599365, "rewards/rejected": 0.09868665784597397, "step": 2152 }, { "epoch": 0.3329595979122366, "grad_norm": 4.945383071899414, "learning_rate": 4.938996448619545e-06, "logits/chosen": 10.00120735168457, "logits/rejected": 6.675858974456787, "logps/chosen": -278.8822021484375, "logps/rejected": -257.4023742675781, "loss": 0.4772, "rewards/accuracies": 0.75, "rewards/chosen": 0.27263402938842773, "rewards/margins": 0.6318854093551636, "rewards/rejected": -0.35925135016441345, "step": 2153 }, { "epoch": 0.33311424705200077, "grad_norm": 7.046670436859131, "learning_rate": 4.938710046969871e-06, "logits/chosen": 8.616357803344727, "logits/rejected": 8.351316452026367, "logps/chosen": -418.61322021484375, "logps/rejected": -293.4689025878906, "loss": 0.8005, "rewards/accuracies": 0.375, "rewards/chosen": 0.19877421855926514, "rewards/margins": -0.12812680006027222, "rewards/rejected": 0.32690104842185974, "step": 2154 }, { "epoch": 0.3332688961917649, "grad_norm": 6.001054286956787, "learning_rate": 4.938423645320197e-06, "logits/chosen": 8.804516792297363, "logits/rejected": 8.200180053710938, "logps/chosen": -309.0278015136719, "logps/rejected": -242.1177520751953, "loss": 0.7059, "rewards/accuracies": 0.625, "rewards/chosen": 0.1057676374912262, "rewards/margins": 0.0562465637922287, "rewards/rejected": 0.0495210662484169, "step": 2155 }, { "epoch": 0.33342354533152907, "grad_norm": 8.740618705749512, "learning_rate": 4.938137243670524e-06, "logits/chosen": 9.079069137573242, "logits/rejected": 5.190154552459717, "logps/chosen": -274.79742431640625, "logps/rejected": -216.66575622558594, "loss": 0.7554, "rewards/accuracies": 0.5, "rewards/chosen": 0.0934847816824913, "rewards/margins": -0.02816416695713997, "rewards/rejected": 0.12164896726608276, "step": 2156 }, { "epoch": 0.3335781944712933, "grad_norm": 5.348598480224609, "learning_rate": 4.9378508420208505e-06, "logits/chosen": 13.66172981262207, "logits/rejected": 9.360877990722656, "logps/chosen": -383.59332275390625, "logps/rejected": -299.9488525390625, "loss": 0.5302, "rewards/accuracies": 0.75, "rewards/chosen": 0.578677773475647, "rewards/margins": 0.43235549330711365, "rewards/rejected": 0.14632225036621094, "step": 2157 }, { "epoch": 0.33373284361105743, "grad_norm": 7.374210834503174, "learning_rate": 4.937564440371177e-06, "logits/chosen": 13.268245697021484, "logits/rejected": 6.78769588470459, "logps/chosen": -332.8704528808594, "logps/rejected": -289.2275085449219, "loss": 0.6614, "rewards/accuracies": 0.75, "rewards/chosen": 0.23434966802597046, "rewards/margins": 0.10861466079950333, "rewards/rejected": 0.12573499977588654, "step": 2158 }, { "epoch": 0.3338874927508216, "grad_norm": 7.228075981140137, "learning_rate": 4.937278038721504e-06, "logits/chosen": 11.237176895141602, "logits/rejected": 7.574673175811768, "logps/chosen": -277.8772277832031, "logps/rejected": -312.7956848144531, "loss": 0.8184, "rewards/accuracies": 0.375, "rewards/chosen": 0.10549846291542053, "rewards/margins": -0.21149574220180511, "rewards/rejected": 0.31699422001838684, "step": 2159 }, { "epoch": 0.33404214189058573, "grad_norm": 6.167690277099609, "learning_rate": 4.9369916370718305e-06, "logits/chosen": 10.053128242492676, "logits/rejected": 4.656221866607666, "logps/chosen": -305.4530944824219, "logps/rejected": -212.2640380859375, "loss": 0.7537, "rewards/accuracies": 0.5, "rewards/chosen": -0.026428520679473877, "rewards/margins": -0.06817140430212021, "rewards/rejected": 0.04174289107322693, "step": 2160 }, { "epoch": 0.3341967910303499, "grad_norm": 5.511512279510498, "learning_rate": 4.936705235422156e-06, "logits/chosen": 9.553276062011719, "logits/rejected": 11.722877502441406, "logps/chosen": -261.0096130371094, "logps/rejected": -209.82296752929688, "loss": 0.5279, "rewards/accuracies": 0.625, "rewards/chosen": 0.3160475492477417, "rewards/margins": 0.5216274261474609, "rewards/rejected": -0.20557984709739685, "step": 2161 }, { "epoch": 0.33435144017011403, "grad_norm": 8.41563606262207, "learning_rate": 4.936418833772483e-06, "logits/chosen": 8.621390342712402, "logits/rejected": 7.7258687019348145, "logps/chosen": -335.4895935058594, "logps/rejected": -345.52081298828125, "loss": 0.709, "rewards/accuracies": 0.375, "rewards/chosen": 0.4417629539966583, "rewards/margins": -0.0197154451161623, "rewards/rejected": 0.46147841215133667, "step": 2162 }, { "epoch": 0.33450608930987824, "grad_norm": 4.94342041015625, "learning_rate": 4.93613243212281e-06, "logits/chosen": 6.482465744018555, "logits/rejected": 4.99796199798584, "logps/chosen": -288.6210021972656, "logps/rejected": -260.36444091796875, "loss": 0.6365, "rewards/accuracies": 0.5, "rewards/chosen": 0.4152541160583496, "rewards/margins": 0.14538335800170898, "rewards/rejected": 0.2698707580566406, "step": 2163 }, { "epoch": 0.3346607384496424, "grad_norm": 7.682459831237793, "learning_rate": 4.935846030473136e-06, "logits/chosen": 14.980879783630371, "logits/rejected": 9.9533052444458, "logps/chosen": -410.21746826171875, "logps/rejected": -413.191650390625, "loss": 0.7724, "rewards/accuracies": 0.625, "rewards/chosen": 0.16669350862503052, "rewards/margins": -0.06279012560844421, "rewards/rejected": 0.22948360443115234, "step": 2164 }, { "epoch": 0.33481538758940654, "grad_norm": 5.459781646728516, "learning_rate": 4.935559628823463e-06, "logits/chosen": 13.500865936279297, "logits/rejected": 13.78359603881836, "logps/chosen": -333.18414306640625, "logps/rejected": -322.5073547363281, "loss": 0.643, "rewards/accuracies": 0.75, "rewards/chosen": 0.17819365859031677, "rewards/margins": 0.20343102514743805, "rewards/rejected": -0.02523735910654068, "step": 2165 }, { "epoch": 0.3349700367291707, "grad_norm": 9.41622543334961, "learning_rate": 4.935273227173789e-06, "logits/chosen": 9.599180221557617, "logits/rejected": 2.0726683139801025, "logps/chosen": -274.6078186035156, "logps/rejected": -159.72003173828125, "loss": 0.7232, "rewards/accuracies": 0.25, "rewards/chosen": 0.06719403713941574, "rewards/margins": 0.02843063324689865, "rewards/rejected": 0.038763418793678284, "step": 2166 }, { "epoch": 0.33512468586893485, "grad_norm": 4.893899440765381, "learning_rate": 4.934986825524115e-06, "logits/chosen": 7.2651872634887695, "logits/rejected": 4.972268581390381, "logps/chosen": -427.45635986328125, "logps/rejected": -338.7460632324219, "loss": 0.5612, "rewards/accuracies": 0.75, "rewards/chosen": 0.6526381969451904, "rewards/margins": 0.38484787940979004, "rewards/rejected": 0.2677903175354004, "step": 2167 }, { "epoch": 0.335279335008699, "grad_norm": 4.368253231048584, "learning_rate": 4.934700423874442e-06, "logits/chosen": 5.152036190032959, "logits/rejected": 5.321083068847656, "logps/chosen": -211.33250427246094, "logps/rejected": -188.63491821289062, "loss": 0.7066, "rewards/accuracies": 0.625, "rewards/chosen": 0.22109776735305786, "rewards/margins": 0.04243742674589157, "rewards/rejected": 0.1786603331565857, "step": 2168 }, { "epoch": 0.33543398414846315, "grad_norm": 5.0637407302856445, "learning_rate": 4.934414022224769e-06, "logits/chosen": 10.351551055908203, "logits/rejected": 6.950599670410156, "logps/chosen": -304.55194091796875, "logps/rejected": -267.5262451171875, "loss": 0.6939, "rewards/accuracies": 0.75, "rewards/chosen": 0.1781468391418457, "rewards/margins": 0.03295135870575905, "rewards/rejected": 0.14519548416137695, "step": 2169 }, { "epoch": 0.33558863328822736, "grad_norm": 6.258752822875977, "learning_rate": 4.934127620575095e-06, "logits/chosen": 6.840770244598389, "logits/rejected": 14.694913864135742, "logps/chosen": -328.0792541503906, "logps/rejected": -494.0980529785156, "loss": 0.7807, "rewards/accuracies": 0.625, "rewards/chosen": 0.35887566208839417, "rewards/margins": -0.006628043949604034, "rewards/rejected": 0.3655036985874176, "step": 2170 }, { "epoch": 0.3357432824279915, "grad_norm": 6.289960861206055, "learning_rate": 4.933841218925421e-06, "logits/chosen": 2.8483967781066895, "logits/rejected": 4.20119571685791, "logps/chosen": -266.65350341796875, "logps/rejected": -330.10186767578125, "loss": 0.5466, "rewards/accuracies": 0.625, "rewards/chosen": -0.10865803062915802, "rewards/margins": 0.442324161529541, "rewards/rejected": -0.5509821772575378, "step": 2171 }, { "epoch": 0.33589793156775566, "grad_norm": 7.433115005493164, "learning_rate": 4.933554817275748e-06, "logits/chosen": 4.327009677886963, "logits/rejected": 9.659065246582031, "logps/chosen": -175.13323974609375, "logps/rejected": -283.44146728515625, "loss": 0.9958, "rewards/accuracies": 0.375, "rewards/chosen": -0.10162576287984848, "rewards/margins": -0.4215138852596283, "rewards/rejected": 0.3198881149291992, "step": 2172 }, { "epoch": 0.3360525807075198, "grad_norm": 5.919958114624023, "learning_rate": 4.933268415626074e-06, "logits/chosen": 9.624743461608887, "logits/rejected": 6.929082870483398, "logps/chosen": -343.0086669921875, "logps/rejected": -272.3357238769531, "loss": 0.676, "rewards/accuracies": 0.625, "rewards/chosen": 0.35381561517715454, "rewards/margins": 0.09450992196798325, "rewards/rejected": 0.2593056857585907, "step": 2173 }, { "epoch": 0.33620722984728396, "grad_norm": 5.625269889831543, "learning_rate": 4.932982013976401e-06, "logits/chosen": 9.900564193725586, "logits/rejected": 9.90178108215332, "logps/chosen": -278.70416259765625, "logps/rejected": -254.29090881347656, "loss": 0.6639, "rewards/accuracies": 0.625, "rewards/chosen": 0.2910246253013611, "rewards/margins": 0.1581825464963913, "rewards/rejected": 0.1328420639038086, "step": 2174 }, { "epoch": 0.3363618789870481, "grad_norm": 7.171331882476807, "learning_rate": 4.932695612326727e-06, "logits/chosen": 12.84779167175293, "logits/rejected": 10.251667022705078, "logps/chosen": -303.6091613769531, "logps/rejected": -233.2792510986328, "loss": 0.8089, "rewards/accuracies": 0.375, "rewards/chosen": 0.06807422637939453, "rewards/margins": -0.10673590749502182, "rewards/rejected": 0.17481011152267456, "step": 2175 }, { "epoch": 0.3365165281268123, "grad_norm": 7.200506210327148, "learning_rate": 4.9324092106770535e-06, "logits/chosen": 10.479219436645508, "logits/rejected": 13.919760704040527, "logps/chosen": -317.1406555175781, "logps/rejected": -386.9718933105469, "loss": 0.8333, "rewards/accuracies": 0.375, "rewards/chosen": 0.18851739168167114, "rewards/margins": -0.15954551100730896, "rewards/rejected": 0.3480629026889801, "step": 2176 }, { "epoch": 0.33667117726657647, "grad_norm": 4.878973960876465, "learning_rate": 4.93212280902738e-06, "logits/chosen": 7.161655426025391, "logits/rejected": 11.909017562866211, "logps/chosen": -175.854736328125, "logps/rejected": -215.295166015625, "loss": 0.6436, "rewards/accuracies": 0.75, "rewards/chosen": -0.0897265076637268, "rewards/margins": 0.13689425587654114, "rewards/rejected": -0.22662077844142914, "step": 2177 }, { "epoch": 0.3368258264063406, "grad_norm": 3.977186918258667, "learning_rate": 4.931836407377707e-06, "logits/chosen": 12.02243423461914, "logits/rejected": 7.770849227905273, "logps/chosen": -293.2452392578125, "logps/rejected": -252.00106811523438, "loss": 0.5136, "rewards/accuracies": 0.875, "rewards/chosen": 0.3526585102081299, "rewards/margins": 0.46740198135375977, "rewards/rejected": -0.11474344879388809, "step": 2178 }, { "epoch": 0.3369804755461048, "grad_norm": 6.393489360809326, "learning_rate": 4.9315500057280334e-06, "logits/chosen": 10.56814193725586, "logits/rejected": 9.51702880859375, "logps/chosen": -194.17556762695312, "logps/rejected": -158.67721557617188, "loss": 0.673, "rewards/accuracies": 0.625, "rewards/chosen": -0.08892497420310974, "rewards/margins": 0.10081363469362259, "rewards/rejected": -0.18973861634731293, "step": 2179 }, { "epoch": 0.3371351246858689, "grad_norm": 5.057383060455322, "learning_rate": 4.93126360407836e-06, "logits/chosen": 15.431549072265625, "logits/rejected": 8.38217830657959, "logps/chosen": -275.71258544921875, "logps/rejected": -220.55601501464844, "loss": 0.5836, "rewards/accuracies": 0.75, "rewards/chosen": 0.05329176038503647, "rewards/margins": 0.2801361083984375, "rewards/rejected": -0.22684431076049805, "step": 2180 }, { "epoch": 0.3372897738256331, "grad_norm": 4.324843883514404, "learning_rate": 4.930977202428686e-06, "logits/chosen": 10.693016052246094, "logits/rejected": 7.600887775421143, "logps/chosen": -344.5273132324219, "logps/rejected": -274.1141357421875, "loss": 0.5364, "rewards/accuracies": 0.75, "rewards/chosen": 0.16971053183078766, "rewards/margins": 0.38878917694091797, "rewards/rejected": -0.2190786600112915, "step": 2181 }, { "epoch": 0.33744442296539723, "grad_norm": 7.040586471557617, "learning_rate": 4.9306908007790125e-06, "logits/chosen": 12.739887237548828, "logits/rejected": 11.451869010925293, "logps/chosen": -263.77716064453125, "logps/rejected": -203.22349548339844, "loss": 0.6895, "rewards/accuracies": 0.75, "rewards/chosen": -0.3552180230617523, "rewards/margins": 0.11035862565040588, "rewards/rejected": -0.4655766487121582, "step": 2182 }, { "epoch": 0.33759907210516144, "grad_norm": 6.448904037475586, "learning_rate": 4.930404399129339e-06, "logits/chosen": 10.3959379196167, "logits/rejected": 7.854723930358887, "logps/chosen": -364.96221923828125, "logps/rejected": -341.5102844238281, "loss": 0.5505, "rewards/accuracies": 0.625, "rewards/chosen": -0.5349284410476685, "rewards/margins": 0.3714437186717987, "rewards/rejected": -0.9063721895217896, "step": 2183 }, { "epoch": 0.3377537212449256, "grad_norm": 5.008645534515381, "learning_rate": 4.930117997479666e-06, "logits/chosen": 10.696040153503418, "logits/rejected": 10.032888412475586, "logps/chosen": -176.92333984375, "logps/rejected": -195.1748046875, "loss": 0.7449, "rewards/accuracies": 0.625, "rewards/chosen": 0.02067551203072071, "rewards/margins": -0.03131965547800064, "rewards/rejected": 0.051995180547237396, "step": 2184 }, { "epoch": 0.33790837038468974, "grad_norm": 5.502933502197266, "learning_rate": 4.9298315958299925e-06, "logits/chosen": 8.644451141357422, "logits/rejected": 7.277812957763672, "logps/chosen": -224.171630859375, "logps/rejected": -212.43795776367188, "loss": 0.7228, "rewards/accuracies": 0.5, "rewards/chosen": 0.11053973436355591, "rewards/margins": -0.032625388354063034, "rewards/rejected": 0.14316511154174805, "step": 2185 }, { "epoch": 0.3380630195244539, "grad_norm": 5.761327743530273, "learning_rate": 4.929545194180319e-06, "logits/chosen": 11.314397811889648, "logits/rejected": 6.3425140380859375, "logps/chosen": -294.193603515625, "logps/rejected": -187.38900756835938, "loss": 0.7198, "rewards/accuracies": 0.5, "rewards/chosen": 0.20676729083061218, "rewards/margins": 0.06161829084157944, "rewards/rejected": 0.14514902234077454, "step": 2186 }, { "epoch": 0.33821766866421804, "grad_norm": 5.611957550048828, "learning_rate": 4.929258792530646e-06, "logits/chosen": 7.5964579582214355, "logits/rejected": 12.299043655395508, "logps/chosen": -167.5782012939453, "logps/rejected": -275.7806396484375, "loss": 0.7945, "rewards/accuracies": 0.125, "rewards/chosen": 0.03267532214522362, "rewards/margins": -0.1259826272726059, "rewards/rejected": 0.158657968044281, "step": 2187 }, { "epoch": 0.3383723178039822, "grad_norm": 4.5019426345825195, "learning_rate": 4.928972390880972e-06, "logits/chosen": 9.627721786499023, "logits/rejected": 10.180431365966797, "logps/chosen": -216.41009521484375, "logps/rejected": -252.80288696289062, "loss": 0.4925, "rewards/accuracies": 0.625, "rewards/chosen": 0.07169594615697861, "rewards/margins": 0.5863649845123291, "rewards/rejected": -0.5146690607070923, "step": 2188 }, { "epoch": 0.3385269669437464, "grad_norm": 5.478781700134277, "learning_rate": 4.928685989231298e-06, "logits/chosen": 13.346943855285645, "logits/rejected": 8.802850723266602, "logps/chosen": -250.389892578125, "logps/rejected": -213.4042510986328, "loss": 0.6149, "rewards/accuracies": 0.75, "rewards/chosen": -0.08631086349487305, "rewards/margins": 0.3143469989299774, "rewards/rejected": -0.4006578326225281, "step": 2189 }, { "epoch": 0.33868161608351055, "grad_norm": 8.719413757324219, "learning_rate": 4.928399587581625e-06, "logits/chosen": 14.230716705322266, "logits/rejected": 10.871758460998535, "logps/chosen": -336.6778259277344, "logps/rejected": -285.1334228515625, "loss": 0.744, "rewards/accuracies": 0.375, "rewards/chosen": 0.07175198197364807, "rewards/margins": -0.08417779952287674, "rewards/rejected": 0.1559297740459442, "step": 2190 }, { "epoch": 0.3388362652232747, "grad_norm": 5.874048709869385, "learning_rate": 4.9281131859319516e-06, "logits/chosen": 15.300064086914062, "logits/rejected": 15.285356521606445, "logps/chosen": -464.52099609375, "logps/rejected": -439.29705810546875, "loss": 0.5821, "rewards/accuracies": 0.75, "rewards/chosen": 0.15989167988300323, "rewards/margins": 0.3341308832168579, "rewards/rejected": -0.1742391735315323, "step": 2191 }, { "epoch": 0.33899091436303885, "grad_norm": 6.36256742477417, "learning_rate": 4.927826784282278e-06, "logits/chosen": 8.372645378112793, "logits/rejected": 6.759990692138672, "logps/chosen": -180.9885711669922, "logps/rejected": -180.5035400390625, "loss": 0.9234, "rewards/accuracies": 0.25, "rewards/chosen": -0.34161269664764404, "rewards/margins": -0.36969423294067383, "rewards/rejected": 0.028081543743610382, "step": 2192 }, { "epoch": 0.339145563502803, "grad_norm": 4.614841461181641, "learning_rate": 4.927540382632605e-06, "logits/chosen": 10.616250991821289, "logits/rejected": 3.9560906887054443, "logps/chosen": -281.41162109375, "logps/rejected": -225.94224548339844, "loss": 0.5956, "rewards/accuracies": 0.625, "rewards/chosen": 0.007724002003669739, "rewards/margins": 0.2634137272834778, "rewards/rejected": -0.25568974018096924, "step": 2193 }, { "epoch": 0.33930021264256716, "grad_norm": 5.53109073638916, "learning_rate": 4.927253980982931e-06, "logits/chosen": 9.959586143493652, "logits/rejected": 9.219850540161133, "logps/chosen": -254.1416473388672, "logps/rejected": -333.04266357421875, "loss": 0.6569, "rewards/accuracies": 0.75, "rewards/chosen": 0.01833067089319229, "rewards/margins": 0.1865755319595337, "rewards/rejected": -0.1682448387145996, "step": 2194 }, { "epoch": 0.33945486178233136, "grad_norm": 3.877243757247925, "learning_rate": 4.926967579333257e-06, "logits/chosen": 13.282426834106445, "logits/rejected": 3.8752942085266113, "logps/chosen": -415.6174011230469, "logps/rejected": -287.8602600097656, "loss": 0.4, "rewards/accuracies": 0.875, "rewards/chosen": 0.4146263003349304, "rewards/margins": 0.9070665836334229, "rewards/rejected": -0.4924403131008148, "step": 2195 }, { "epoch": 0.3396095109220955, "grad_norm": 8.95366096496582, "learning_rate": 4.926681177683584e-06, "logits/chosen": 8.199341773986816, "logits/rejected": 6.678696632385254, "logps/chosen": -372.427001953125, "logps/rejected": -428.2519836425781, "loss": 0.8344, "rewards/accuracies": 0.375, "rewards/chosen": -0.3477782607078552, "rewards/margins": -0.14664092659950256, "rewards/rejected": -0.20113736391067505, "step": 2196 }, { "epoch": 0.33976416006185967, "grad_norm": 4.847280979156494, "learning_rate": 4.926394776033911e-06, "logits/chosen": 9.063350677490234, "logits/rejected": 8.378971099853516, "logps/chosen": -245.4602508544922, "logps/rejected": -283.3411865234375, "loss": 0.4844, "rewards/accuracies": 1.0, "rewards/chosen": -0.08104472607374191, "rewards/margins": 0.5104107856750488, "rewards/rejected": -0.5914554595947266, "step": 2197 }, { "epoch": 0.3399188092016238, "grad_norm": 4.669248104095459, "learning_rate": 4.926108374384237e-06, "logits/chosen": 9.406679153442383, "logits/rejected": 11.379308700561523, "logps/chosen": -363.9560546875, "logps/rejected": -322.1780090332031, "loss": 0.5854, "rewards/accuracies": 0.625, "rewards/chosen": 0.01684923656284809, "rewards/margins": 0.2974766790866852, "rewards/rejected": -0.28062745928764343, "step": 2198 }, { "epoch": 0.34007345834138797, "grad_norm": 4.05111837387085, "learning_rate": 4.925821972734564e-06, "logits/chosen": 6.986756801605225, "logits/rejected": 2.258618116378784, "logps/chosen": -189.69117736816406, "logps/rejected": -121.53974914550781, "loss": 0.6285, "rewards/accuracies": 0.5, "rewards/chosen": 0.07507272064685822, "rewards/margins": 0.18000581860542297, "rewards/rejected": -0.10493309795856476, "step": 2199 }, { "epoch": 0.3402281074811521, "grad_norm": 5.7093329429626465, "learning_rate": 4.92553557108489e-06, "logits/chosen": 8.678044319152832, "logits/rejected": 6.821519374847412, "logps/chosen": -267.0439453125, "logps/rejected": -271.0375671386719, "loss": 0.5964, "rewards/accuracies": 0.625, "rewards/chosen": -0.015613652765750885, "rewards/margins": 0.2787485718727112, "rewards/rejected": -0.29436224699020386, "step": 2200 }, { "epoch": 0.34038275662091627, "grad_norm": 5.728244304656982, "learning_rate": 4.925249169435216e-06, "logits/chosen": 7.073269844055176, "logits/rejected": 6.776679992675781, "logps/chosen": -295.5339050292969, "logps/rejected": -302.1988525390625, "loss": 0.6306, "rewards/accuracies": 0.75, "rewards/chosen": -0.1701340675354004, "rewards/margins": 0.1837719976902008, "rewards/rejected": -0.3539060652256012, "step": 2201 }, { "epoch": 0.3405374057606805, "grad_norm": 4.501295566558838, "learning_rate": 4.924962767785543e-06, "logits/chosen": 7.147919654846191, "logits/rejected": 8.420303344726562, "logps/chosen": -247.03421020507812, "logps/rejected": -244.56967163085938, "loss": 0.6329, "rewards/accuracies": 0.625, "rewards/chosen": -0.1667885184288025, "rewards/margins": 0.1886894255876541, "rewards/rejected": -0.3554779589176178, "step": 2202 }, { "epoch": 0.34069205490044463, "grad_norm": 9.208796501159668, "learning_rate": 4.92467636613587e-06, "logits/chosen": 10.132715225219727, "logits/rejected": 13.70450210571289, "logps/chosen": -383.57269287109375, "logps/rejected": -455.46966552734375, "loss": 0.8123, "rewards/accuracies": 0.625, "rewards/chosen": -0.004334352910518646, "rewards/margins": -0.1488373577594757, "rewards/rejected": 0.14450302720069885, "step": 2203 }, { "epoch": 0.3408467040402088, "grad_norm": 6.116032600402832, "learning_rate": 4.9243899644861955e-06, "logits/chosen": 8.444271087646484, "logits/rejected": 8.459249496459961, "logps/chosen": -304.50775146484375, "logps/rejected": -287.3347473144531, "loss": 0.6582, "rewards/accuracies": 0.625, "rewards/chosen": 0.13697025179862976, "rewards/margins": 0.2546781599521637, "rewards/rejected": -0.11770791560411453, "step": 2204 }, { "epoch": 0.34100135317997293, "grad_norm": 5.337936878204346, "learning_rate": 4.924103562836522e-06, "logits/chosen": 13.895332336425781, "logits/rejected": 11.867212295532227, "logps/chosen": -291.1551513671875, "logps/rejected": -318.891845703125, "loss": 0.5838, "rewards/accuracies": 0.5, "rewards/chosen": -0.05068197846412659, "rewards/margins": 0.5032179951667786, "rewards/rejected": -0.5539000034332275, "step": 2205 }, { "epoch": 0.3411560023197371, "grad_norm": 6.2754807472229, "learning_rate": 4.923817161186849e-06, "logits/chosen": 9.783641815185547, "logits/rejected": 3.7100839614868164, "logps/chosen": -288.7557373046875, "logps/rejected": -171.37759399414062, "loss": 0.7735, "rewards/accuracies": 0.375, "rewards/chosen": -0.10352823138237, "rewards/margins": 0.23346653580665588, "rewards/rejected": -0.3369947373867035, "step": 2206 }, { "epoch": 0.34131065145950124, "grad_norm": 5.5735554695129395, "learning_rate": 4.923530759537175e-06, "logits/chosen": 4.470004081726074, "logits/rejected": 10.329557418823242, "logps/chosen": -280.0700988769531, "logps/rejected": -381.5083923339844, "loss": 0.666, "rewards/accuracies": 0.625, "rewards/chosen": -0.01585289090871811, "rewards/margins": 0.15275263786315918, "rewards/rejected": -0.1686055213212967, "step": 2207 }, { "epoch": 0.34146530059926544, "grad_norm": 5.168330669403076, "learning_rate": 4.923244357887502e-06, "logits/chosen": 6.794681072235107, "logits/rejected": 2.8319389820098877, "logps/chosen": -300.73040771484375, "logps/rejected": -267.44580078125, "loss": 0.6125, "rewards/accuracies": 0.75, "rewards/chosen": 0.12931528687477112, "rewards/margins": 0.24279488623142242, "rewards/rejected": -0.1134796068072319, "step": 2208 }, { "epoch": 0.3416199497390296, "grad_norm": 19.52543830871582, "learning_rate": 4.922957956237828e-06, "logits/chosen": 11.238948822021484, "logits/rejected": 8.896709442138672, "logps/chosen": -376.65240478515625, "logps/rejected": -315.87274169921875, "loss": 0.7895, "rewards/accuracies": 0.375, "rewards/chosen": -0.013472366146743298, "rewards/margins": -0.12151608616113663, "rewards/rejected": 0.10804371535778046, "step": 2209 }, { "epoch": 0.34177459887879375, "grad_norm": 5.254403591156006, "learning_rate": 4.9226715545881545e-06, "logits/chosen": 8.904789924621582, "logits/rejected": 6.793849468231201, "logps/chosen": -289.3121032714844, "logps/rejected": -208.19049072265625, "loss": 0.75, "rewards/accuracies": 0.375, "rewards/chosen": -0.07712142169475555, "rewards/margins": -0.037978239357471466, "rewards/rejected": -0.03914318233728409, "step": 2210 }, { "epoch": 0.3419292480185579, "grad_norm": 4.339156150817871, "learning_rate": 4.922385152938481e-06, "logits/chosen": 6.7839436531066895, "logits/rejected": 1.5634078979492188, "logps/chosen": -257.8121337890625, "logps/rejected": -244.70849609375, "loss": 0.429, "rewards/accuracies": 0.875, "rewards/chosen": 0.10001610964536667, "rewards/margins": 0.7864616513252258, "rewards/rejected": -0.6864455938339233, "step": 2211 }, { "epoch": 0.34208389715832205, "grad_norm": 8.168787002563477, "learning_rate": 4.922098751288808e-06, "logits/chosen": 6.811740875244141, "logits/rejected": 3.902513027191162, "logps/chosen": -348.7326354980469, "logps/rejected": -273.1086120605469, "loss": 0.8642, "rewards/accuracies": 0.5, "rewards/chosen": -0.2507321536540985, "rewards/margins": -0.21108895540237427, "rewards/rejected": -0.03964319825172424, "step": 2212 }, { "epoch": 0.3422385462980862, "grad_norm": 5.378510475158691, "learning_rate": 4.9218123496391345e-06, "logits/chosen": 8.771527290344238, "logits/rejected": 10.911977767944336, "logps/chosen": -254.5963592529297, "logps/rejected": -284.21697998046875, "loss": 0.5783, "rewards/accuracies": 0.75, "rewards/chosen": -0.008564659394323826, "rewards/margins": 0.3100131154060364, "rewards/rejected": -0.31857776641845703, "step": 2213 }, { "epoch": 0.34239319543785035, "grad_norm": 5.584686756134033, "learning_rate": 4.92152594798946e-06, "logits/chosen": 8.772775650024414, "logits/rejected": 2.0024702548980713, "logps/chosen": -287.03704833984375, "logps/rejected": -234.1224365234375, "loss": 0.6686, "rewards/accuracies": 0.5, "rewards/chosen": -0.25973814725875854, "rewards/margins": 0.09461994469165802, "rewards/rejected": -0.35435810685157776, "step": 2214 }, { "epoch": 0.34254784457761456, "grad_norm": 6.358705997467041, "learning_rate": 4.921239546339787e-06, "logits/chosen": 13.620830535888672, "logits/rejected": 9.011594772338867, "logps/chosen": -362.0309143066406, "logps/rejected": -261.93896484375, "loss": 0.7534, "rewards/accuracies": 0.5, "rewards/chosen": 0.0829969272017479, "rewards/margins": 0.05566546320915222, "rewards/rejected": 0.027331486344337463, "step": 2215 }, { "epoch": 0.3427024937173787, "grad_norm": 9.422063827514648, "learning_rate": 4.920953144690114e-06, "logits/chosen": 12.300759315490723, "logits/rejected": 9.482490539550781, "logps/chosen": -384.3802490234375, "logps/rejected": -325.10321044921875, "loss": 0.5804, "rewards/accuracies": 0.625, "rewards/chosen": -0.015626903623342514, "rewards/margins": 0.28644314408302307, "rewards/rejected": -0.3020700514316559, "step": 2216 }, { "epoch": 0.34285714285714286, "grad_norm": 5.594699859619141, "learning_rate": 4.92066674304044e-06, "logits/chosen": 13.004911422729492, "logits/rejected": 11.286243438720703, "logps/chosen": -273.7823791503906, "logps/rejected": -270.719482421875, "loss": 0.7184, "rewards/accuracies": 0.5, "rewards/chosen": -0.0906803160905838, "rewards/margins": 0.11050395667552948, "rewards/rejected": -0.2011842578649521, "step": 2217 }, { "epoch": 0.343011791996907, "grad_norm": 5.89525032043457, "learning_rate": 4.920380341390767e-06, "logits/chosen": 11.978071212768555, "logits/rejected": 3.3232693672180176, "logps/chosen": -273.1345520019531, "logps/rejected": -219.61477661132812, "loss": 0.697, "rewards/accuracies": 0.75, "rewards/chosen": -0.17827454209327698, "rewards/margins": 0.06826652586460114, "rewards/rejected": -0.2465410828590393, "step": 2218 }, { "epoch": 0.34316644113667116, "grad_norm": 7.048234462738037, "learning_rate": 4.9200939397410935e-06, "logits/chosen": 7.120995998382568, "logits/rejected": 1.8566869497299194, "logps/chosen": -237.83474731445312, "logps/rejected": -157.23245239257812, "loss": 0.686, "rewards/accuracies": 0.5, "rewards/chosen": -0.1351737082004547, "rewards/margins": 0.06153935194015503, "rewards/rejected": -0.19671306014060974, "step": 2219 }, { "epoch": 0.3433210902764353, "grad_norm": 5.469114303588867, "learning_rate": 4.919807538091419e-06, "logits/chosen": 12.084733963012695, "logits/rejected": 5.589107990264893, "logps/chosen": -326.6589050292969, "logps/rejected": -289.4862365722656, "loss": 0.5376, "rewards/accuracies": 0.75, "rewards/chosen": -0.03627218306064606, "rewards/margins": 0.37036412954330444, "rewards/rejected": -0.4066363275051117, "step": 2220 }, { "epoch": 0.3434757394161995, "grad_norm": 11.814653396606445, "learning_rate": 4.919521136441746e-06, "logits/chosen": 14.363693237304688, "logits/rejected": 8.811527252197266, "logps/chosen": -452.9128723144531, "logps/rejected": -419.9919738769531, "loss": 0.5306, "rewards/accuracies": 0.75, "rewards/chosen": 0.24756795167922974, "rewards/margins": 0.41560888290405273, "rewards/rejected": -0.168040931224823, "step": 2221 }, { "epoch": 0.3436303885559637, "grad_norm": 25.004858016967773, "learning_rate": 4.919234734792073e-06, "logits/chosen": 7.665427207946777, "logits/rejected": 7.383940696716309, "logps/chosen": -231.5487518310547, "logps/rejected": -214.76895141601562, "loss": 0.6867, "rewards/accuracies": 0.5, "rewards/chosen": 0.0779341608285904, "rewards/margins": 0.05487308278679848, "rewards/rejected": 0.023061085492372513, "step": 2222 }, { "epoch": 0.3437850376957278, "grad_norm": 9.273181915283203, "learning_rate": 4.918948333142399e-06, "logits/chosen": 9.621376037597656, "logits/rejected": 6.972739219665527, "logps/chosen": -274.7452087402344, "logps/rejected": -231.89039611816406, "loss": 0.7466, "rewards/accuracies": 0.5, "rewards/chosen": -0.07981610298156738, "rewards/margins": -0.04024926945567131, "rewards/rejected": -0.03956683352589607, "step": 2223 }, { "epoch": 0.343939686835492, "grad_norm": 4.530200004577637, "learning_rate": 4.918661931492726e-06, "logits/chosen": 11.825055122375488, "logits/rejected": 7.726041793823242, "logps/chosen": -233.1574249267578, "logps/rejected": -174.32904052734375, "loss": 0.6889, "rewards/accuracies": 0.375, "rewards/chosen": -0.11028782278299332, "rewards/margins": 0.04926891252398491, "rewards/rejected": -0.15955673158168793, "step": 2224 }, { "epoch": 0.3440943359752561, "grad_norm": 5.863092422485352, "learning_rate": 4.918375529843053e-06, "logits/chosen": 11.903350830078125, "logits/rejected": 5.086323261260986, "logps/chosen": -315.8117370605469, "logps/rejected": -285.3423767089844, "loss": 0.6754, "rewards/accuracies": 0.5, "rewards/chosen": -0.3649864196777344, "rewards/margins": 0.11556059122085571, "rewards/rejected": -0.4805470108985901, "step": 2225 }, { "epoch": 0.3442489851150203, "grad_norm": 4.307432651519775, "learning_rate": 4.918089128193379e-06, "logits/chosen": 8.137996673583984, "logits/rejected": 4.582399845123291, "logps/chosen": -192.79525756835938, "logps/rejected": -139.80307006835938, "loss": 0.6747, "rewards/accuracies": 0.5, "rewards/chosen": -0.1812305897474289, "rewards/margins": 0.06533050537109375, "rewards/rejected": -0.24656111001968384, "step": 2226 }, { "epoch": 0.3444036342547845, "grad_norm": 5.5591912269592285, "learning_rate": 4.917802726543705e-06, "logits/chosen": 11.140790939331055, "logits/rejected": 11.527606964111328, "logps/chosen": -274.5819396972656, "logps/rejected": -245.5169219970703, "loss": 0.5775, "rewards/accuracies": 0.625, "rewards/chosen": -0.013919852674007416, "rewards/margins": 0.40499410033226013, "rewards/rejected": -0.41891396045684814, "step": 2227 }, { "epoch": 0.34455828339454864, "grad_norm": 4.2306318283081055, "learning_rate": 4.917516324894032e-06, "logits/chosen": 5.727298259735107, "logits/rejected": 4.192427635192871, "logps/chosen": -219.2635498046875, "logps/rejected": -179.5087127685547, "loss": 0.5566, "rewards/accuracies": 0.875, "rewards/chosen": 0.06968345493078232, "rewards/margins": 0.3410758972167969, "rewards/rejected": -0.27139243483543396, "step": 2228 }, { "epoch": 0.3447129325343128, "grad_norm": 4.568408966064453, "learning_rate": 4.917229923244358e-06, "logits/chosen": 8.758159637451172, "logits/rejected": 4.163976192474365, "logps/chosen": -338.8127136230469, "logps/rejected": -240.6302947998047, "loss": 0.6097, "rewards/accuracies": 0.75, "rewards/chosen": -0.06348161399364471, "rewards/margins": 0.32506924867630005, "rewards/rejected": -0.38855087757110596, "step": 2229 }, { "epoch": 0.34486758167407694, "grad_norm": 5.886579990386963, "learning_rate": 4.916943521594685e-06, "logits/chosen": 6.055108547210693, "logits/rejected": 6.719015121459961, "logps/chosen": -235.1964874267578, "logps/rejected": -243.42535400390625, "loss": 0.5802, "rewards/accuracies": 0.625, "rewards/chosen": -0.09445696324110031, "rewards/margins": 0.28427010774612427, "rewards/rejected": -0.3787270486354828, "step": 2230 }, { "epoch": 0.3450222308138411, "grad_norm": 76.57919311523438, "learning_rate": 4.916657119945012e-06, "logits/chosen": 6.070754528045654, "logits/rejected": 4.538302421569824, "logps/chosen": -206.94229125976562, "logps/rejected": -160.03150939941406, "loss": 0.6597, "rewards/accuracies": 0.625, "rewards/chosen": -0.2322043478488922, "rewards/margins": 0.10275064408779144, "rewards/rejected": -0.33495500683784485, "step": 2231 }, { "epoch": 0.34517687995360524, "grad_norm": 5.143581867218018, "learning_rate": 4.916370718295338e-06, "logits/chosen": 10.92449951171875, "logits/rejected": 6.929837226867676, "logps/chosen": -268.9730224609375, "logps/rejected": -217.0604248046875, "loss": 0.6188, "rewards/accuracies": 0.75, "rewards/chosen": -0.10262413322925568, "rewards/margins": 0.2187485694885254, "rewards/rejected": -0.32137271761894226, "step": 2232 }, { "epoch": 0.3453315290933694, "grad_norm": 4.1480207443237305, "learning_rate": 4.916084316645665e-06, "logits/chosen": 14.199913024902344, "logits/rejected": 6.951969623565674, "logps/chosen": -331.29266357421875, "logps/rejected": -209.20079040527344, "loss": 0.5474, "rewards/accuracies": 0.625, "rewards/chosen": -0.05355348438024521, "rewards/margins": 0.39276790618896484, "rewards/rejected": -0.44632136821746826, "step": 2233 }, { "epoch": 0.3454861782331336, "grad_norm": 4.425983428955078, "learning_rate": 4.915797914995991e-06, "logits/chosen": 7.606254577636719, "logits/rejected": 2.658592939376831, "logps/chosen": -302.90618896484375, "logps/rejected": -203.21453857421875, "loss": 0.5824, "rewards/accuracies": 0.5, "rewards/chosen": -0.08009238541126251, "rewards/margins": 0.3808273673057556, "rewards/rejected": -0.46091973781585693, "step": 2234 }, { "epoch": 0.34564082737289775, "grad_norm": 6.441736221313477, "learning_rate": 4.915511513346317e-06, "logits/chosen": 8.51942253112793, "logits/rejected": 9.007296562194824, "logps/chosen": -459.0113830566406, "logps/rejected": -360.12518310546875, "loss": 0.6071, "rewards/accuracies": 0.5, "rewards/chosen": 0.048676781356334686, "rewards/margins": 0.34012749791145325, "rewards/rejected": -0.2914506793022156, "step": 2235 }, { "epoch": 0.3457954765126619, "grad_norm": 5.1384406089782715, "learning_rate": 4.915225111696644e-06, "logits/chosen": 4.124293327331543, "logits/rejected": 4.67480993270874, "logps/chosen": -317.9867248535156, "logps/rejected": -262.2640380859375, "loss": 0.6127, "rewards/accuracies": 0.75, "rewards/chosen": 0.1634017527103424, "rewards/margins": 0.25820791721343994, "rewards/rejected": -0.09480614960193634, "step": 2236 }, { "epoch": 0.34595012565242605, "grad_norm": 5.26631498336792, "learning_rate": 4.914938710046971e-06, "logits/chosen": 6.151358127593994, "logits/rejected": 6.918093681335449, "logps/chosen": -232.6698760986328, "logps/rejected": -234.5141143798828, "loss": 0.6538, "rewards/accuracies": 0.5, "rewards/chosen": -0.15083113312721252, "rewards/margins": 0.1375989317893982, "rewards/rejected": -0.2884300649166107, "step": 2237 }, { "epoch": 0.3461047747921902, "grad_norm": 5.205026626586914, "learning_rate": 4.9146523083972965e-06, "logits/chosen": 10.508944511413574, "logits/rejected": 9.416999816894531, "logps/chosen": -276.3905334472656, "logps/rejected": -253.36036682128906, "loss": 0.6187, "rewards/accuracies": 0.625, "rewards/chosen": 0.1749594658613205, "rewards/margins": 0.273596853017807, "rewards/rejected": -0.09863739460706711, "step": 2238 }, { "epoch": 0.34625942393195436, "grad_norm": 5.7534003257751465, "learning_rate": 4.914365906747623e-06, "logits/chosen": 11.587326049804688, "logits/rejected": 10.811962127685547, "logps/chosen": -147.7239990234375, "logps/rejected": -181.17127990722656, "loss": 0.7236, "rewards/accuracies": 0.75, "rewards/chosen": -0.3307652473449707, "rewards/margins": 0.07926079630851746, "rewards/rejected": -0.41002607345581055, "step": 2239 }, { "epoch": 0.34641407307171856, "grad_norm": 3.9810519218444824, "learning_rate": 4.91407950509795e-06, "logits/chosen": 12.147308349609375, "logits/rejected": 11.696866035461426, "logps/chosen": -157.50863647460938, "logps/rejected": -172.9642791748047, "loss": 0.6447, "rewards/accuracies": 0.75, "rewards/chosen": -0.23018760979175568, "rewards/margins": 0.27839189767837524, "rewards/rejected": -0.5085794925689697, "step": 2240 }, { "epoch": 0.3465687222114827, "grad_norm": 6.633463382720947, "learning_rate": 4.9137931034482765e-06, "logits/chosen": 9.445849418640137, "logits/rejected": -0.09200930595397949, "logps/chosen": -380.5946350097656, "logps/rejected": -251.05003356933594, "loss": 0.7112, "rewards/accuracies": 0.5, "rewards/chosen": -0.2328595221042633, "rewards/margins": 0.06577625870704651, "rewards/rejected": -0.2986357808113098, "step": 2241 }, { "epoch": 0.34672337135124687, "grad_norm": 5.461822032928467, "learning_rate": 4.913506701798602e-06, "logits/chosen": 9.661162376403809, "logits/rejected": 9.147989273071289, "logps/chosen": -206.82998657226562, "logps/rejected": -261.5982360839844, "loss": 0.7271, "rewards/accuracies": 0.25, "rewards/chosen": 0.009399129077792168, "rewards/margins": -0.04206313192844391, "rewards/rejected": 0.051462262868881226, "step": 2242 }, { "epoch": 0.346878020491011, "grad_norm": 5.272211074829102, "learning_rate": 4.913220300148929e-06, "logits/chosen": 7.325489521026611, "logits/rejected": 8.20553970336914, "logps/chosen": -260.3209228515625, "logps/rejected": -252.24371337890625, "loss": 0.6031, "rewards/accuracies": 0.625, "rewards/chosen": -0.050969406962394714, "rewards/margins": 0.32596737146377563, "rewards/rejected": -0.37693679332733154, "step": 2243 }, { "epoch": 0.34703266963077517, "grad_norm": 4.581593990325928, "learning_rate": 4.9129338984992556e-06, "logits/chosen": 7.456971168518066, "logits/rejected": 7.230869770050049, "logps/chosen": -212.39593505859375, "logps/rejected": -179.48623657226562, "loss": 0.6884, "rewards/accuracies": 0.75, "rewards/chosen": -0.19984811544418335, "rewards/margins": 0.06016957014799118, "rewards/rejected": -0.2600176930427551, "step": 2244 }, { "epoch": 0.3471873187705393, "grad_norm": 5.243470668792725, "learning_rate": 4.912647496849582e-06, "logits/chosen": 9.719179153442383, "logits/rejected": 11.93798828125, "logps/chosen": -278.61029052734375, "logps/rejected": -315.90972900390625, "loss": 0.5682, "rewards/accuracies": 0.625, "rewards/chosen": 0.3115367293357849, "rewards/margins": 0.36510318517684937, "rewards/rejected": -0.05356644466519356, "step": 2245 }, { "epoch": 0.3473419679103035, "grad_norm": 5.636280059814453, "learning_rate": 4.912361095199909e-06, "logits/chosen": 14.947442054748535, "logits/rejected": 11.073668479919434, "logps/chosen": -288.8595275878906, "logps/rejected": -238.2893829345703, "loss": 0.6711, "rewards/accuracies": 0.375, "rewards/chosen": -0.1511373519897461, "rewards/margins": 0.12412319332361221, "rewards/rejected": -0.2752605676651001, "step": 2246 }, { "epoch": 0.3474966170500677, "grad_norm": 4.914024829864502, "learning_rate": 4.912074693550235e-06, "logits/chosen": 9.521003723144531, "logits/rejected": 10.034378051757812, "logps/chosen": -232.0491943359375, "logps/rejected": -207.06263732910156, "loss": 0.7122, "rewards/accuracies": 0.375, "rewards/chosen": -0.010371115058660507, "rewards/margins": 0.057499319314956665, "rewards/rejected": -0.06787042319774628, "step": 2247 }, { "epoch": 0.34765126618983183, "grad_norm": 5.5847320556640625, "learning_rate": 4.911788291900561e-06, "logits/chosen": 9.698742866516113, "logits/rejected": 6.02374792098999, "logps/chosen": -261.2921142578125, "logps/rejected": -183.23049926757812, "loss": 0.6925, "rewards/accuracies": 0.5, "rewards/chosen": -0.021566370502114296, "rewards/margins": 0.14653608202934265, "rewards/rejected": -0.1681024730205536, "step": 2248 }, { "epoch": 0.347805915329596, "grad_norm": 5.414330005645752, "learning_rate": 4.911501890250888e-06, "logits/chosen": 11.729146003723145, "logits/rejected": 8.237777709960938, "logps/chosen": -256.3666687011719, "logps/rejected": -278.5428466796875, "loss": 0.8102, "rewards/accuracies": 0.5, "rewards/chosen": -0.174337700009346, "rewards/margins": -0.10303632915019989, "rewards/rejected": -0.07130137830972672, "step": 2249 }, { "epoch": 0.34796056446936013, "grad_norm": 5.685588359832764, "learning_rate": 4.911215488601215e-06, "logits/chosen": 7.431952953338623, "logits/rejected": 8.43886947631836, "logps/chosen": -270.51116943359375, "logps/rejected": -275.28558349609375, "loss": 0.6361, "rewards/accuracies": 0.625, "rewards/chosen": 0.13253358006477356, "rewards/margins": 0.16085144877433777, "rewards/rejected": -0.02831786870956421, "step": 2250 }, { "epoch": 0.3481152136091243, "grad_norm": 29.71629524230957, "learning_rate": 4.910929086951541e-06, "logits/chosen": 10.248223304748535, "logits/rejected": 11.20692253112793, "logps/chosen": -231.45877075195312, "logps/rejected": -249.94818115234375, "loss": 0.6803, "rewards/accuracies": 0.5, "rewards/chosen": -0.1668645441532135, "rewards/margins": 0.14421576261520386, "rewards/rejected": -0.31108030676841736, "step": 2251 }, { "epoch": 0.34826986274888844, "grad_norm": 7.318175315856934, "learning_rate": 4.910642685301868e-06, "logits/chosen": 9.091716766357422, "logits/rejected": 7.413163661956787, "logps/chosen": -349.4971923828125, "logps/rejected": -264.2409973144531, "loss": 0.7543, "rewards/accuracies": 0.5, "rewards/chosen": -0.16747255623340607, "rewards/margins": 0.02091556042432785, "rewards/rejected": -0.18838812410831451, "step": 2252 }, { "epoch": 0.34842451188865264, "grad_norm": 7.042592525482178, "learning_rate": 4.910356283652194e-06, "logits/chosen": 8.61669921875, "logits/rejected": 13.160215377807617, "logps/chosen": -283.1356506347656, "logps/rejected": -348.42083740234375, "loss": 0.8397, "rewards/accuracies": 0.5, "rewards/chosen": -0.24656841158866882, "rewards/margins": -0.16773390769958496, "rewards/rejected": -0.07883447408676147, "step": 2253 }, { "epoch": 0.3485791610284168, "grad_norm": 4.277012348175049, "learning_rate": 4.91006988200252e-06, "logits/chosen": 9.668946266174316, "logits/rejected": 5.8493757247924805, "logps/chosen": -251.37216186523438, "logps/rejected": -202.52203369140625, "loss": 0.606, "rewards/accuracies": 0.75, "rewards/chosen": 0.021043196320533752, "rewards/margins": 0.2766110599040985, "rewards/rejected": -0.25556787848472595, "step": 2254 }, { "epoch": 0.34873381016818095, "grad_norm": 5.632655620574951, "learning_rate": 4.909783480352847e-06, "logits/chosen": 3.3474035263061523, "logits/rejected": 0.006913661956787109, "logps/chosen": -265.8349914550781, "logps/rejected": -197.7389678955078, "loss": 0.6076, "rewards/accuracies": 0.875, "rewards/chosen": -0.09871774911880493, "rewards/margins": 0.3149941563606262, "rewards/rejected": -0.41371190547943115, "step": 2255 }, { "epoch": 0.3488884593079451, "grad_norm": 4.305490493774414, "learning_rate": 4.909497078703174e-06, "logits/chosen": 9.300392150878906, "logits/rejected": 4.30137825012207, "logps/chosen": -421.07952880859375, "logps/rejected": -352.4152526855469, "loss": 0.5095, "rewards/accuracies": 0.75, "rewards/chosen": 0.34856438636779785, "rewards/margins": 0.4709594249725342, "rewards/rejected": -0.12239501625299454, "step": 2256 }, { "epoch": 0.34904310844770925, "grad_norm": 3.666188955307007, "learning_rate": 4.9092106770535e-06, "logits/chosen": 13.80720043182373, "logits/rejected": 9.943268775939941, "logps/chosen": -293.3656005859375, "logps/rejected": -204.82345581054688, "loss": 0.528, "rewards/accuracies": 0.75, "rewards/chosen": -0.10769538581371307, "rewards/margins": 0.4935707449913025, "rewards/rejected": -0.6012661457061768, "step": 2257 }, { "epoch": 0.3491977575874734, "grad_norm": 6.167476654052734, "learning_rate": 4.908924275403827e-06, "logits/chosen": 11.450634002685547, "logits/rejected": 14.47146224975586, "logps/chosen": -269.498779296875, "logps/rejected": -457.2481689453125, "loss": 0.6449, "rewards/accuracies": 0.5, "rewards/chosen": -0.1913461685180664, "rewards/margins": 0.16590233147144318, "rewards/rejected": -0.3572485148906708, "step": 2258 }, { "epoch": 0.34935240672723755, "grad_norm": 5.208954334259033, "learning_rate": 4.908637873754154e-06, "logits/chosen": 4.9774370193481445, "logits/rejected": 5.742297649383545, "logps/chosen": -249.7729949951172, "logps/rejected": -218.14566040039062, "loss": 0.7735, "rewards/accuracies": 0.375, "rewards/chosen": -0.28843870759010315, "rewards/margins": -0.06605186313390732, "rewards/rejected": -0.22238683700561523, "step": 2259 }, { "epoch": 0.34950705586700176, "grad_norm": 7.198918342590332, "learning_rate": 4.9083514721044794e-06, "logits/chosen": 5.99508810043335, "logits/rejected": 7.031045913696289, "logps/chosen": -267.08453369140625, "logps/rejected": -258.5322570800781, "loss": 0.754, "rewards/accuracies": 0.5, "rewards/chosen": -0.14965543150901794, "rewards/margins": -0.06985634565353394, "rewards/rejected": -0.07979907840490341, "step": 2260 }, { "epoch": 0.3496617050067659, "grad_norm": 6.3062310218811035, "learning_rate": 4.908065070454806e-06, "logits/chosen": 9.616310119628906, "logits/rejected": 4.98814058303833, "logps/chosen": -346.2275085449219, "logps/rejected": -287.83349609375, "loss": 0.7748, "rewards/accuracies": 0.375, "rewards/chosen": -0.34294837713241577, "rewards/margins": -0.04744395613670349, "rewards/rejected": -0.2955043911933899, "step": 2261 }, { "epoch": 0.34981635414653006, "grad_norm": 6.942808151245117, "learning_rate": 4.907778668805133e-06, "logits/chosen": 14.936433792114258, "logits/rejected": 18.18695640563965, "logps/chosen": -272.3729248046875, "logps/rejected": -328.1497802734375, "loss": 0.7993, "rewards/accuracies": 0.25, "rewards/chosen": -0.32496678829193115, "rewards/margins": -0.14285282790660858, "rewards/rejected": -0.18211393058300018, "step": 2262 }, { "epoch": 0.3499710032862942, "grad_norm": 5.081689357757568, "learning_rate": 4.907492267155459e-06, "logits/chosen": 8.887516975402832, "logits/rejected": 1.470055103302002, "logps/chosen": -240.7576904296875, "logps/rejected": -160.95785522460938, "loss": 0.5616, "rewards/accuracies": 0.75, "rewards/chosen": 0.012402251362800598, "rewards/margins": 0.3340449929237366, "rewards/rejected": -0.3216427266597748, "step": 2263 }, { "epoch": 0.35012565242605836, "grad_norm": 6.787292957305908, "learning_rate": 4.907205865505786e-06, "logits/chosen": 12.554620742797852, "logits/rejected": 5.174549102783203, "logps/chosen": -236.28616333007812, "logps/rejected": -209.17721557617188, "loss": 0.8091, "rewards/accuracies": 0.25, "rewards/chosen": -0.33284130692481995, "rewards/margins": -0.141166090965271, "rewards/rejected": -0.19167517125606537, "step": 2264 }, { "epoch": 0.3502803015658225, "grad_norm": 5.783636093139648, "learning_rate": 4.906919463856113e-06, "logits/chosen": 7.47484016418457, "logits/rejected": 12.06732177734375, "logps/chosen": -332.9858703613281, "logps/rejected": -283.30963134765625, "loss": 0.7008, "rewards/accuracies": 0.5, "rewards/chosen": -0.2375369518995285, "rewards/margins": 0.04651055112481117, "rewards/rejected": -0.2840474843978882, "step": 2265 }, { "epoch": 0.3504349507055867, "grad_norm": 5.621984004974365, "learning_rate": 4.906633062206439e-06, "logits/chosen": 11.89096450805664, "logits/rejected": 6.294903755187988, "logps/chosen": -354.2367858886719, "logps/rejected": -260.3260498046875, "loss": 0.6839, "rewards/accuracies": 0.5, "rewards/chosen": 0.02276735007762909, "rewards/margins": 0.07391981780529022, "rewards/rejected": -0.051152464002370834, "step": 2266 }, { "epoch": 0.3505895998453509, "grad_norm": 4.112491607666016, "learning_rate": 4.906346660556765e-06, "logits/chosen": 8.1788911819458, "logits/rejected": 1.4742443561553955, "logps/chosen": -210.604248046875, "logps/rejected": -129.72760009765625, "loss": 0.6717, "rewards/accuracies": 0.75, "rewards/chosen": -0.28726643323898315, "rewards/margins": 0.07727478444576263, "rewards/rejected": -0.364541232585907, "step": 2267 }, { "epoch": 0.350744248985115, "grad_norm": 5.981924057006836, "learning_rate": 4.906060258907092e-06, "logits/chosen": 8.20511531829834, "logits/rejected": 7.404324054718018, "logps/chosen": -259.75482177734375, "logps/rejected": -266.87823486328125, "loss": 0.5223, "rewards/accuracies": 0.875, "rewards/chosen": 0.12701082229614258, "rewards/margins": 0.4615393877029419, "rewards/rejected": -0.33452853560447693, "step": 2268 }, { "epoch": 0.3508988981248792, "grad_norm": 5.859411239624023, "learning_rate": 4.9057738572574184e-06, "logits/chosen": 8.135759353637695, "logits/rejected": 2.4925124645233154, "logps/chosen": -301.67962646484375, "logps/rejected": -244.54283142089844, "loss": 0.651, "rewards/accuracies": 0.5, "rewards/chosen": -0.06861220300197601, "rewards/margins": 0.16191913187503815, "rewards/rejected": -0.23053131997585297, "step": 2269 }, { "epoch": 0.35105354726464333, "grad_norm": 4.8766279220581055, "learning_rate": 4.905487455607745e-06, "logits/chosen": 8.630785942077637, "logits/rejected": 6.301517963409424, "logps/chosen": -250.0829620361328, "logps/rejected": -205.16307067871094, "loss": 0.6342, "rewards/accuracies": 0.5, "rewards/chosen": -0.13005010783672333, "rewards/margins": 0.23674780130386353, "rewards/rejected": -0.36679789423942566, "step": 2270 }, { "epoch": 0.3512081964044075, "grad_norm": 6.100093841552734, "learning_rate": 4.905201053958072e-06, "logits/chosen": 10.4290771484375, "logits/rejected": 5.6397786140441895, "logps/chosen": -329.4725341796875, "logps/rejected": -310.0322265625, "loss": 0.705, "rewards/accuracies": 0.5, "rewards/chosen": -0.009967800229787827, "rewards/margins": 0.027770325541496277, "rewards/rejected": -0.0377381294965744, "step": 2271 }, { "epoch": 0.3513628455441717, "grad_norm": 5.898587226867676, "learning_rate": 4.9049146523083975e-06, "logits/chosen": 8.528572082519531, "logits/rejected": 5.490638732910156, "logps/chosen": -269.6099853515625, "logps/rejected": -209.4634246826172, "loss": 0.602, "rewards/accuracies": 0.75, "rewards/chosen": -0.1952863335609436, "rewards/margins": 0.3484611213207245, "rewards/rejected": -0.5437474250793457, "step": 2272 }, { "epoch": 0.35151749468393584, "grad_norm": 7.538588523864746, "learning_rate": 4.904628250658724e-06, "logits/chosen": 13.407453536987305, "logits/rejected": 11.88882064819336, "logps/chosen": -409.357421875, "logps/rejected": -364.2845458984375, "loss": 0.6862, "rewards/accuracies": 0.375, "rewards/chosen": -0.15313872694969177, "rewards/margins": 0.17631609737873077, "rewards/rejected": -0.32945480942726135, "step": 2273 }, { "epoch": 0.3516721438237, "grad_norm": 5.733290672302246, "learning_rate": 4.904341849009051e-06, "logits/chosen": 8.919346809387207, "logits/rejected": 11.112564086914062, "logps/chosen": -239.14163208007812, "logps/rejected": -251.54750061035156, "loss": 0.7497, "rewards/accuracies": 0.375, "rewards/chosen": -0.6215527057647705, "rewards/margins": -0.05928035080432892, "rewards/rejected": -0.5622723698616028, "step": 2274 }, { "epoch": 0.35182679296346414, "grad_norm": 4.8864216804504395, "learning_rate": 4.9040554473593775e-06, "logits/chosen": 13.415364265441895, "logits/rejected": 12.386824607849121, "logps/chosen": -370.70404052734375, "logps/rejected": -351.29815673828125, "loss": 0.4881, "rewards/accuracies": 0.75, "rewards/chosen": 0.12877292931079865, "rewards/margins": 0.6097846031188965, "rewards/rejected": -0.48101168870925903, "step": 2275 }, { "epoch": 0.3519814421032283, "grad_norm": 5.105952739715576, "learning_rate": 4.903769045709703e-06, "logits/chosen": 10.409931182861328, "logits/rejected": 6.689871788024902, "logps/chosen": -291.099609375, "logps/rejected": -251.84365844726562, "loss": 0.6403, "rewards/accuracies": 0.5, "rewards/chosen": -0.15651798248291016, "rewards/margins": 0.18619213998317719, "rewards/rejected": -0.34271010756492615, "step": 2276 }, { "epoch": 0.35213609124299244, "grad_norm": 8.522160530090332, "learning_rate": 4.90348264406003e-06, "logits/chosen": 2.8824234008789062, "logits/rejected": 4.933001518249512, "logps/chosen": -381.6719970703125, "logps/rejected": -445.5632629394531, "loss": 0.7482, "rewards/accuracies": 0.5, "rewards/chosen": -0.1836051046848297, "rewards/margins": 0.06200967729091644, "rewards/rejected": -0.24561476707458496, "step": 2277 }, { "epoch": 0.3522907403827566, "grad_norm": 5.22220516204834, "learning_rate": 4.903196242410357e-06, "logits/chosen": 14.69093132019043, "logits/rejected": 8.37982177734375, "logps/chosen": -263.33050537109375, "logps/rejected": -180.16409301757812, "loss": 0.7331, "rewards/accuracies": 0.375, "rewards/chosen": -0.22532400488853455, "rewards/margins": -0.01858241856098175, "rewards/rejected": -0.2067415714263916, "step": 2278 }, { "epoch": 0.3524453895225208, "grad_norm": 4.423905849456787, "learning_rate": 4.902909840760683e-06, "logits/chosen": 12.967979431152344, "logits/rejected": 10.532442092895508, "logps/chosen": -367.1943664550781, "logps/rejected": -423.98193359375, "loss": 0.4146, "rewards/accuracies": 0.875, "rewards/chosen": 0.0062543898820877075, "rewards/margins": 0.765472412109375, "rewards/rejected": -0.7592180371284485, "step": 2279 }, { "epoch": 0.35260003866228495, "grad_norm": 5.410378932952881, "learning_rate": 4.902623439111009e-06, "logits/chosen": 10.105243682861328, "logits/rejected": 14.415291786193848, "logps/chosen": -278.61737060546875, "logps/rejected": -319.6083984375, "loss": 0.6428, "rewards/accuracies": 0.5, "rewards/chosen": -0.31036606431007385, "rewards/margins": 0.13710956275463104, "rewards/rejected": -0.4474756121635437, "step": 2280 }, { "epoch": 0.3527546878020491, "grad_norm": 6.157070159912109, "learning_rate": 4.902337037461336e-06, "logits/chosen": 10.1044282913208, "logits/rejected": 10.5338716506958, "logps/chosen": -369.8498840332031, "logps/rejected": -346.7928466796875, "loss": 0.711, "rewards/accuracies": 0.5, "rewards/chosen": -0.28167325258255005, "rewards/margins": 0.026620671153068542, "rewards/rejected": -0.308293879032135, "step": 2281 }, { "epoch": 0.35290933694181326, "grad_norm": 5.407443523406982, "learning_rate": 4.902050635811662e-06, "logits/chosen": 13.642023086547852, "logits/rejected": 6.602418422698975, "logps/chosen": -405.72283935546875, "logps/rejected": -314.2531433105469, "loss": 0.5513, "rewards/accuracies": 0.875, "rewards/chosen": -0.10360391438007355, "rewards/margins": 0.4180288314819336, "rewards/rejected": -0.521632730960846, "step": 2282 }, { "epoch": 0.3530639860815774, "grad_norm": 5.393918037414551, "learning_rate": 4.901764234161989e-06, "logits/chosen": 8.057491302490234, "logits/rejected": 9.188366889953613, "logps/chosen": -284.2396240234375, "logps/rejected": -279.5758056640625, "loss": 0.716, "rewards/accuracies": 0.5, "rewards/chosen": -0.39083796739578247, "rewards/margins": 0.004945278167724609, "rewards/rejected": -0.3957832455635071, "step": 2283 }, { "epoch": 0.35321863522134156, "grad_norm": 6.1068196296691895, "learning_rate": 4.901477832512316e-06, "logits/chosen": 7.461799144744873, "logits/rejected": 8.400287628173828, "logps/chosen": -166.0146484375, "logps/rejected": -312.38836669921875, "loss": 0.8087, "rewards/accuracies": 0.5, "rewards/chosen": -0.32630789279937744, "rewards/margins": -0.14760400354862213, "rewards/rejected": -0.1787038892507553, "step": 2284 }, { "epoch": 0.35337328436110577, "grad_norm": 6.6303391456604, "learning_rate": 4.901191430862642e-06, "logits/chosen": 8.566883087158203, "logits/rejected": 6.3126020431518555, "logps/chosen": -232.5137939453125, "logps/rejected": -220.58419799804688, "loss": 0.8915, "rewards/accuracies": 0.625, "rewards/chosen": -0.18723392486572266, "rewards/margins": -0.20679838955402374, "rewards/rejected": 0.019564446061849594, "step": 2285 }, { "epoch": 0.3535279335008699, "grad_norm": 5.50675106048584, "learning_rate": 4.900905029212968e-06, "logits/chosen": 10.489021301269531, "logits/rejected": 9.962228775024414, "logps/chosen": -235.81460571289062, "logps/rejected": -209.7769012451172, "loss": 0.6718, "rewards/accuracies": 0.625, "rewards/chosen": -0.3101990222930908, "rewards/margins": 0.11363208293914795, "rewards/rejected": -0.42383110523223877, "step": 2286 }, { "epoch": 0.35368258264063407, "grad_norm": 8.460928916931152, "learning_rate": 4.900618627563295e-06, "logits/chosen": 5.3383283615112305, "logits/rejected": 4.417959213256836, "logps/chosen": -326.1287536621094, "logps/rejected": -305.16351318359375, "loss": 0.7241, "rewards/accuracies": 0.375, "rewards/chosen": -0.3920285403728485, "rewards/margins": -0.0179438553750515, "rewards/rejected": -0.3740846514701843, "step": 2287 }, { "epoch": 0.3538372317803982, "grad_norm": 4.480382442474365, "learning_rate": 4.900332225913621e-06, "logits/chosen": 6.651538848876953, "logits/rejected": 8.158883094787598, "logps/chosen": -377.5992431640625, "logps/rejected": -313.6252136230469, "loss": 0.5973, "rewards/accuracies": 0.75, "rewards/chosen": -0.03289628028869629, "rewards/margins": 0.39587706327438354, "rewards/rejected": -0.42877334356307983, "step": 2288 }, { "epoch": 0.35399188092016237, "grad_norm": 3.833733081817627, "learning_rate": 4.900045824263948e-06, "logits/chosen": 13.12617015838623, "logits/rejected": 9.737539291381836, "logps/chosen": -224.25856018066406, "logps/rejected": -223.23565673828125, "loss": 0.5379, "rewards/accuracies": 0.875, "rewards/chosen": -0.067721888422966, "rewards/margins": 0.3821626901626587, "rewards/rejected": -0.4498845934867859, "step": 2289 }, { "epoch": 0.3541465300599265, "grad_norm": 8.063943862915039, "learning_rate": 4.899759422614275e-06, "logits/chosen": 8.838645935058594, "logits/rejected": 8.395843505859375, "logps/chosen": -322.738037109375, "logps/rejected": -291.0431213378906, "loss": 0.8257, "rewards/accuracies": 0.25, "rewards/chosen": -0.36284393072128296, "rewards/margins": -0.09391230344772339, "rewards/rejected": -0.26893162727355957, "step": 2290 }, { "epoch": 0.3543011791996907, "grad_norm": 5.950629234313965, "learning_rate": 4.899473020964601e-06, "logits/chosen": 8.83854866027832, "logits/rejected": 7.512988090515137, "logps/chosen": -295.1648254394531, "logps/rejected": -230.810791015625, "loss": 0.7297, "rewards/accuracies": 0.375, "rewards/chosen": -0.4573357105255127, "rewards/margins": -0.018091723322868347, "rewards/rejected": -0.43924397230148315, "step": 2291 }, { "epoch": 0.3544558283394549, "grad_norm": 5.183650493621826, "learning_rate": 4.899186619314928e-06, "logits/chosen": 8.526028633117676, "logits/rejected": 7.30854606628418, "logps/chosen": -186.0430450439453, "logps/rejected": -195.26051330566406, "loss": 0.6957, "rewards/accuracies": 0.625, "rewards/chosen": -0.05327737331390381, "rewards/margins": 0.03432953357696533, "rewards/rejected": -0.08760692179203033, "step": 2292 }, { "epoch": 0.35461047747921903, "grad_norm": 5.21556282043457, "learning_rate": 4.898900217665254e-06, "logits/chosen": 10.062601089477539, "logits/rejected": 5.812929153442383, "logps/chosen": -251.52444458007812, "logps/rejected": -237.75575256347656, "loss": 0.6474, "rewards/accuracies": 0.5, "rewards/chosen": -0.09459376335144043, "rewards/margins": 0.15834593772888184, "rewards/rejected": -0.25293970108032227, "step": 2293 }, { "epoch": 0.3547651266189832, "grad_norm": 7.025015354156494, "learning_rate": 4.8986138160155805e-06, "logits/chosen": 14.059396743774414, "logits/rejected": 5.465035915374756, "logps/chosen": -467.369384765625, "logps/rejected": -299.49078369140625, "loss": 0.7008, "rewards/accuracies": 0.625, "rewards/chosen": 0.033702850341796875, "rewards/margins": 0.15374523401260376, "rewards/rejected": -0.12004238367080688, "step": 2294 }, { "epoch": 0.35491977575874734, "grad_norm": 6.39870023727417, "learning_rate": 4.898327414365907e-06, "logits/chosen": 8.50955581665039, "logits/rejected": 2.735879898071289, "logps/chosen": -303.5803527832031, "logps/rejected": -185.3979949951172, "loss": 0.5488, "rewards/accuracies": 0.875, "rewards/chosen": -0.16922327876091003, "rewards/margins": 0.3481117784976959, "rewards/rejected": -0.517335057258606, "step": 2295 }, { "epoch": 0.3550744248985115, "grad_norm": 5.158504009246826, "learning_rate": 4.898041012716234e-06, "logits/chosen": 11.480414390563965, "logits/rejected": 9.00924301147461, "logps/chosen": -316.7840576171875, "logps/rejected": -314.14239501953125, "loss": 0.5916, "rewards/accuracies": 0.5, "rewards/chosen": 0.03316483274102211, "rewards/margins": 0.45047634840011597, "rewards/rejected": -0.41731148958206177, "step": 2296 }, { "epoch": 0.35522907403827564, "grad_norm": 6.351102352142334, "learning_rate": 4.89775461106656e-06, "logits/chosen": 7.808524131774902, "logits/rejected": 10.521486282348633, "logps/chosen": -189.92694091796875, "logps/rejected": -234.21682739257812, "loss": 0.6965, "rewards/accuracies": 0.5, "rewards/chosen": -0.09540493786334991, "rewards/margins": 0.0428440198302269, "rewards/rejected": -0.13824895024299622, "step": 2297 }, { "epoch": 0.35538372317803985, "grad_norm": 9.060208320617676, "learning_rate": 4.897468209416887e-06, "logits/chosen": 5.821726322174072, "logits/rejected": 7.238389015197754, "logps/chosen": -336.39300537109375, "logps/rejected": -371.8484802246094, "loss": 0.8943, "rewards/accuracies": 0.5, "rewards/chosen": -0.27768027782440186, "rewards/margins": -0.24758805334568024, "rewards/rejected": -0.030092239379882812, "step": 2298 }, { "epoch": 0.355538372317804, "grad_norm": 4.79667854309082, "learning_rate": 4.897181807767214e-06, "logits/chosen": 5.603011608123779, "logits/rejected": 0.2982349395751953, "logps/chosen": -256.0111083984375, "logps/rejected": -255.9495849609375, "loss": 0.5096, "rewards/accuracies": 0.75, "rewards/chosen": -0.21172352135181427, "rewards/margins": 0.49048200249671936, "rewards/rejected": -0.7022055387496948, "step": 2299 }, { "epoch": 0.35569302145756815, "grad_norm": 7.194302558898926, "learning_rate": 4.8968954061175395e-06, "logits/chosen": 9.853011131286621, "logits/rejected": 10.196281433105469, "logps/chosen": -308.0816955566406, "logps/rejected": -375.0216064453125, "loss": 0.7722, "rewards/accuracies": 0.5, "rewards/chosen": -0.28454896807670593, "rewards/margins": -0.07456383109092712, "rewards/rejected": -0.2099851667881012, "step": 2300 }, { "epoch": 0.3558476705973323, "grad_norm": 8.1284761428833, "learning_rate": 4.896609004467866e-06, "logits/chosen": 7.840907096862793, "logits/rejected": 1.9497251510620117, "logps/chosen": -343.1737976074219, "logps/rejected": -227.4717559814453, "loss": 0.6274, "rewards/accuracies": 0.75, "rewards/chosen": -0.11587969213724136, "rewards/margins": 0.396634578704834, "rewards/rejected": -0.5125142335891724, "step": 2301 }, { "epoch": 0.35600231973709645, "grad_norm": 3.761204481124878, "learning_rate": 4.896322602818193e-06, "logits/chosen": 10.910774230957031, "logits/rejected": 1.5914866924285889, "logps/chosen": -166.80218505859375, "logps/rejected": -92.43940734863281, "loss": 0.6409, "rewards/accuracies": 0.5, "rewards/chosen": -0.1651858389377594, "rewards/margins": 0.17134442925453186, "rewards/rejected": -0.33653026819229126, "step": 2302 }, { "epoch": 0.3561569688768606, "grad_norm": 5.362090587615967, "learning_rate": 4.8960362011685195e-06, "logits/chosen": 12.594182014465332, "logits/rejected": 8.193999290466309, "logps/chosen": -305.1719970703125, "logps/rejected": -297.2008056640625, "loss": 0.5383, "rewards/accuracies": 0.75, "rewards/chosen": 0.061117835342884064, "rewards/margins": 0.36943426728248596, "rewards/rejected": -0.3083164095878601, "step": 2303 }, { "epoch": 0.3563116180166248, "grad_norm": 7.27452278137207, "learning_rate": 4.895749799518846e-06, "logits/chosen": 5.877128601074219, "logits/rejected": 3.7415316104888916, "logps/chosen": -336.67230224609375, "logps/rejected": -371.3394775390625, "loss": 0.7349, "rewards/accuracies": 0.5, "rewards/chosen": 0.2061605006456375, "rewards/margins": 0.18396373093128204, "rewards/rejected": 0.02219676598906517, "step": 2304 }, { "epoch": 0.35646626715638896, "grad_norm": 6.953784942626953, "learning_rate": 4.895463397869173e-06, "logits/chosen": 10.48383903503418, "logits/rejected": 10.955982208251953, "logps/chosen": -341.8731689453125, "logps/rejected": -298.1326599121094, "loss": 0.7131, "rewards/accuracies": 0.5, "rewards/chosen": -0.011556625366210938, "rewards/margins": 0.10991425812244415, "rewards/rejected": -0.12147089093923569, "step": 2305 }, { "epoch": 0.3566209162961531, "grad_norm": 7.077308177947998, "learning_rate": 4.8951769962194986e-06, "logits/chosen": 8.61199951171875, "logits/rejected": 6.04293966293335, "logps/chosen": -280.35284423828125, "logps/rejected": -333.212646484375, "loss": 0.6358, "rewards/accuracies": 0.5, "rewards/chosen": -0.14419767260551453, "rewards/margins": 0.2684920132160187, "rewards/rejected": -0.4126897156238556, "step": 2306 }, { "epoch": 0.35677556543591726, "grad_norm": 6.757407188415527, "learning_rate": 4.894890594569825e-06, "logits/chosen": 8.389930725097656, "logits/rejected": 3.219229221343994, "logps/chosen": -285.2623596191406, "logps/rejected": -250.55419921875, "loss": 0.7, "rewards/accuracies": 0.625, "rewards/chosen": -0.33544251322746277, "rewards/margins": 0.04589072987437248, "rewards/rejected": -0.38133326172828674, "step": 2307 }, { "epoch": 0.3569302145756814, "grad_norm": 21.499324798583984, "learning_rate": 4.894604192920152e-06, "logits/chosen": 10.170759201049805, "logits/rejected": 8.207755088806152, "logps/chosen": -246.21438598632812, "logps/rejected": -275.232421875, "loss": 0.7691, "rewards/accuracies": 0.375, "rewards/chosen": -0.08954611420631409, "rewards/margins": -0.11354275792837143, "rewards/rejected": 0.023996641859412193, "step": 2308 }, { "epoch": 0.35708486371544557, "grad_norm": 3.6801435947418213, "learning_rate": 4.8943177912704785e-06, "logits/chosen": 8.054658889770508, "logits/rejected": 7.391181468963623, "logps/chosen": -185.26820373535156, "logps/rejected": -186.87498474121094, "loss": 0.6266, "rewards/accuracies": 0.5, "rewards/chosen": -0.06425218284130096, "rewards/margins": 0.19823215901851654, "rewards/rejected": -0.2624843716621399, "step": 2309 }, { "epoch": 0.3572395128552097, "grad_norm": 6.398554801940918, "learning_rate": 4.894031389620804e-06, "logits/chosen": 0.9814324378967285, "logits/rejected": 6.214434623718262, "logps/chosen": -226.17984008789062, "logps/rejected": -302.9360656738281, "loss": 0.544, "rewards/accuracies": 0.875, "rewards/chosen": -0.29893386363983154, "rewards/margins": 0.42912763357162476, "rewards/rejected": -0.7280614376068115, "step": 2310 }, { "epoch": 0.3573941619949739, "grad_norm": 7.246306419372559, "learning_rate": 4.893744987971131e-06, "logits/chosen": 12.344511985778809, "logits/rejected": 8.692779541015625, "logps/chosen": -453.1650695800781, "logps/rejected": -329.99603271484375, "loss": 0.6425, "rewards/accuracies": 0.5, "rewards/chosen": -0.17128944396972656, "rewards/margins": 0.2065943032503128, "rewards/rejected": -0.37788376212120056, "step": 2311 }, { "epoch": 0.3575488111347381, "grad_norm": 6.005757808685303, "learning_rate": 4.893458586321458e-06, "logits/chosen": 7.452842712402344, "logits/rejected": 4.272350788116455, "logps/chosen": -278.6836853027344, "logps/rejected": -273.89813232421875, "loss": 0.6309, "rewards/accuracies": 0.625, "rewards/chosen": -0.3194414973258972, "rewards/margins": 0.35720765590667725, "rewards/rejected": -0.6766491532325745, "step": 2312 }, { "epoch": 0.3577034602745022, "grad_norm": 5.901392459869385, "learning_rate": 4.893172184671784e-06, "logits/chosen": 11.812231063842773, "logits/rejected": 13.108213424682617, "logps/chosen": -379.9096374511719, "logps/rejected": -374.9116516113281, "loss": 0.7209, "rewards/accuracies": 0.625, "rewards/chosen": -0.18073472380638123, "rewards/margins": 0.039542943239212036, "rewards/rejected": -0.22027769684791565, "step": 2313 }, { "epoch": 0.3578581094142664, "grad_norm": 5.417755126953125, "learning_rate": 4.89288578302211e-06, "logits/chosen": 13.69114875793457, "logits/rejected": 13.75348949432373, "logps/chosen": -401.909423828125, "logps/rejected": -433.63507080078125, "loss": 0.6856, "rewards/accuracies": 0.5, "rewards/chosen": 0.020201489329338074, "rewards/margins": 0.22761499881744385, "rewards/rejected": -0.2074134796857834, "step": 2314 }, { "epoch": 0.35801275855403053, "grad_norm": 6.451322078704834, "learning_rate": 4.892599381372437e-06, "logits/chosen": 9.982239723205566, "logits/rejected": 5.295004844665527, "logps/chosen": -264.6368408203125, "logps/rejected": -252.98638916015625, "loss": 0.6977, "rewards/accuracies": 0.625, "rewards/chosen": -0.5814623832702637, "rewards/margins": 0.03783878684043884, "rewards/rejected": -0.6193010807037354, "step": 2315 }, { "epoch": 0.3581674076937947, "grad_norm": 7.075787544250488, "learning_rate": 4.892312979722763e-06, "logits/chosen": 11.914676666259766, "logits/rejected": 13.889586448669434, "logps/chosen": -335.5152587890625, "logps/rejected": -384.6313171386719, "loss": 0.7642, "rewards/accuracies": 0.375, "rewards/chosen": -0.018859509378671646, "rewards/margins": 0.03121916949748993, "rewards/rejected": -0.05007869005203247, "step": 2316 }, { "epoch": 0.3583220568335589, "grad_norm": 4.819097995758057, "learning_rate": 4.89202657807309e-06, "logits/chosen": 13.529319763183594, "logits/rejected": 6.044322967529297, "logps/chosen": -290.4903869628906, "logps/rejected": -226.93785095214844, "loss": 0.6716, "rewards/accuracies": 0.375, "rewards/chosen": -0.5643563866615295, "rewards/margins": 0.13221922516822815, "rewards/rejected": -0.6965756416320801, "step": 2317 }, { "epoch": 0.35847670597332304, "grad_norm": 9.079181671142578, "learning_rate": 4.891740176423417e-06, "logits/chosen": 11.359947204589844, "logits/rejected": 4.493671894073486, "logps/chosen": -481.7691345214844, "logps/rejected": -331.90264892578125, "loss": 0.736, "rewards/accuracies": 0.5, "rewards/chosen": -0.3059215843677521, "rewards/margins": -0.00804688036441803, "rewards/rejected": -0.29787468910217285, "step": 2318 }, { "epoch": 0.3586313551130872, "grad_norm": 3.0993034839630127, "learning_rate": 4.8914537747737425e-06, "logits/chosen": 12.550003051757812, "logits/rejected": 4.218654632568359, "logps/chosen": -308.19268798828125, "logps/rejected": -203.6925048828125, "loss": 0.4321, "rewards/accuracies": 1.0, "rewards/chosen": 0.3049798011779785, "rewards/margins": 0.6847801804542542, "rewards/rejected": -0.37980034947395325, "step": 2319 }, { "epoch": 0.35878600425285134, "grad_norm": 5.668067455291748, "learning_rate": 4.891167373124069e-06, "logits/chosen": 13.877278327941895, "logits/rejected": 11.128886222839355, "logps/chosen": -254.1451416015625, "logps/rejected": -327.8868713378906, "loss": 0.603, "rewards/accuracies": 0.625, "rewards/chosen": -0.2491917908191681, "rewards/margins": 0.33042365312576294, "rewards/rejected": -0.5796154141426086, "step": 2320 }, { "epoch": 0.3589406533926155, "grad_norm": 4.336638927459717, "learning_rate": 4.890880971474396e-06, "logits/chosen": 8.338726997375488, "logits/rejected": -1.4895474910736084, "logps/chosen": -273.78668212890625, "logps/rejected": -113.2089614868164, "loss": 0.4632, "rewards/accuracies": 1.0, "rewards/chosen": 0.01187887042760849, "rewards/margins": 0.5928508639335632, "rewards/rejected": -0.5809720158576965, "step": 2321 }, { "epoch": 0.35909530253237965, "grad_norm": 5.571256160736084, "learning_rate": 4.8905945698247224e-06, "logits/chosen": 9.629920959472656, "logits/rejected": 5.384699821472168, "logps/chosen": -329.11309814453125, "logps/rejected": -267.02130126953125, "loss": 0.616, "rewards/accuracies": 0.75, "rewards/chosen": -0.0476585328578949, "rewards/margins": 0.1954512596130371, "rewards/rejected": -0.243109792470932, "step": 2322 }, { "epoch": 0.3592499516721438, "grad_norm": 6.020268440246582, "learning_rate": 4.890308168175049e-06, "logits/chosen": 10.552306175231934, "logits/rejected": 9.562264442443848, "logps/chosen": -261.8734436035156, "logps/rejected": -296.9286193847656, "loss": 0.7023, "rewards/accuracies": 0.5, "rewards/chosen": -0.1796172857284546, "rewards/margins": 0.09834901988506317, "rewards/rejected": -0.27796632051467896, "step": 2323 }, { "epoch": 0.359404600811908, "grad_norm": 6.4720377922058105, "learning_rate": 4.890021766525376e-06, "logits/chosen": 12.219856262207031, "logits/rejected": 9.231483459472656, "logps/chosen": -243.53668212890625, "logps/rejected": -217.1230926513672, "loss": 0.763, "rewards/accuracies": 0.25, "rewards/chosen": -0.12192372977733612, "rewards/margins": -0.11754226684570312, "rewards/rejected": -0.004381466656923294, "step": 2324 }, { "epoch": 0.35955924995167216, "grad_norm": 3.857593297958374, "learning_rate": 4.889735364875702e-06, "logits/chosen": 9.698766708374023, "logits/rejected": 12.826196670532227, "logps/chosen": -186.67410278320312, "logps/rejected": -218.66256713867188, "loss": 0.564, "rewards/accuracies": 0.875, "rewards/chosen": -0.1999448835849762, "rewards/margins": 0.2949702739715576, "rewards/rejected": -0.4949151873588562, "step": 2325 }, { "epoch": 0.3597138990914363, "grad_norm": 6.291881561279297, "learning_rate": 4.889448963226028e-06, "logits/chosen": 16.033544540405273, "logits/rejected": 10.01887035369873, "logps/chosen": -243.99578857421875, "logps/rejected": -255.91879272460938, "loss": 0.6742, "rewards/accuracies": 0.625, "rewards/chosen": -0.3948141932487488, "rewards/margins": 0.2785394489765167, "rewards/rejected": -0.6733536124229431, "step": 2326 }, { "epoch": 0.35986854823120046, "grad_norm": 5.036777019500732, "learning_rate": 4.889162561576355e-06, "logits/chosen": 10.51666259765625, "logits/rejected": 6.418890476226807, "logps/chosen": -241.8315887451172, "logps/rejected": -179.93482971191406, "loss": 0.672, "rewards/accuracies": 0.5, "rewards/chosen": -0.19013568758964539, "rewards/margins": 0.14670856297016144, "rewards/rejected": -0.33684423565864563, "step": 2327 }, { "epoch": 0.3600231973709646, "grad_norm": 5.67700719833374, "learning_rate": 4.8888761599266815e-06, "logits/chosen": 10.7471923828125, "logits/rejected": 7.230803966522217, "logps/chosen": -392.7158508300781, "logps/rejected": -280.47772216796875, "loss": 0.564, "rewards/accuracies": 0.75, "rewards/chosen": 0.021594032645225525, "rewards/margins": 0.4154955744743347, "rewards/rejected": -0.3939015567302704, "step": 2328 }, { "epoch": 0.36017784651072876, "grad_norm": 5.598464488983154, "learning_rate": 4.888589758277008e-06, "logits/chosen": 10.959207534790039, "logits/rejected": 12.219639778137207, "logps/chosen": -306.4432373046875, "logps/rejected": -337.88043212890625, "loss": 0.5788, "rewards/accuracies": 0.75, "rewards/chosen": -0.25032198429107666, "rewards/margins": 0.29896223545074463, "rewards/rejected": -0.5492841601371765, "step": 2329 }, { "epoch": 0.36033249565049297, "grad_norm": 6.4484028816223145, "learning_rate": 4.888303356627335e-06, "logits/chosen": 16.292165756225586, "logits/rejected": 6.625369071960449, "logps/chosen": -425.8658752441406, "logps/rejected": -284.1050109863281, "loss": 0.6799, "rewards/accuracies": 0.75, "rewards/chosen": -0.3909912109375, "rewards/margins": 0.10917416214942932, "rewards/rejected": -0.5001653432846069, "step": 2330 }, { "epoch": 0.3604871447902571, "grad_norm": 6.582698822021484, "learning_rate": 4.8880169549776614e-06, "logits/chosen": 2.9579524993896484, "logits/rejected": 7.169062614440918, "logps/chosen": -157.89012145996094, "logps/rejected": -239.32408142089844, "loss": 1.0264, "rewards/accuracies": 0.5, "rewards/chosen": -0.6352663040161133, "rewards/margins": -0.39971378445625305, "rewards/rejected": -0.23555254936218262, "step": 2331 }, { "epoch": 0.36064179393002127, "grad_norm": 5.106259346008301, "learning_rate": 4.887730553327987e-06, "logits/chosen": 15.27623176574707, "logits/rejected": 6.068172931671143, "logps/chosen": -382.0967712402344, "logps/rejected": -254.91404724121094, "loss": 0.5111, "rewards/accuracies": 0.875, "rewards/chosen": -0.12590059638023376, "rewards/margins": 0.4322192072868347, "rewards/rejected": -0.5581198334693909, "step": 2332 }, { "epoch": 0.3607964430697854, "grad_norm": 4.261303424835205, "learning_rate": 4.887444151678314e-06, "logits/chosen": 9.271157264709473, "logits/rejected": 9.21375846862793, "logps/chosen": -189.50863647460938, "logps/rejected": -156.10137939453125, "loss": 0.5979, "rewards/accuracies": 0.75, "rewards/chosen": -0.2089763581752777, "rewards/margins": 0.24481363594532013, "rewards/rejected": -0.45379000902175903, "step": 2333 }, { "epoch": 0.3609510922095496, "grad_norm": 6.183831214904785, "learning_rate": 4.8871577500286405e-06, "logits/chosen": 5.848118782043457, "logits/rejected": 12.195114135742188, "logps/chosen": -231.59710693359375, "logps/rejected": -307.4557189941406, "loss": 0.8476, "rewards/accuracies": 0.5, "rewards/chosen": -0.3755185008049011, "rewards/margins": -0.1300891786813736, "rewards/rejected": -0.24542932212352753, "step": 2334 }, { "epoch": 0.3611057413493137, "grad_norm": 5.124051570892334, "learning_rate": 4.886871348378967e-06, "logits/chosen": 10.781159400939941, "logits/rejected": 12.605043411254883, "logps/chosen": -223.93026733398438, "logps/rejected": -209.89682006835938, "loss": 0.7497, "rewards/accuracies": 0.5, "rewards/chosen": -0.32741034030914307, "rewards/margins": -0.05141666904091835, "rewards/rejected": -0.2759937047958374, "step": 2335 }, { "epoch": 0.36126039048907793, "grad_norm": 5.250643253326416, "learning_rate": 4.886584946729294e-06, "logits/chosen": 8.486542701721191, "logits/rejected": 4.996033668518066, "logps/chosen": -232.08023071289062, "logps/rejected": -136.31983947753906, "loss": 0.4867, "rewards/accuracies": 0.875, "rewards/chosen": -0.005316736176609993, "rewards/margins": 0.5012938380241394, "rewards/rejected": -0.506610631942749, "step": 2336 }, { "epoch": 0.3614150396288421, "grad_norm": 6.31569766998291, "learning_rate": 4.8862985450796205e-06, "logits/chosen": 8.633461952209473, "logits/rejected": 3.6754908561706543, "logps/chosen": -357.68524169921875, "logps/rejected": -220.08511352539062, "loss": 0.6646, "rewards/accuracies": 0.75, "rewards/chosen": -0.07489103823900223, "rewards/margins": 0.12376195192337036, "rewards/rejected": -0.198652982711792, "step": 2337 }, { "epoch": 0.36156968876860623, "grad_norm": 5.512584686279297, "learning_rate": 4.886012143429947e-06, "logits/chosen": 10.981819152832031, "logits/rejected": -1.7955965995788574, "logps/chosen": -284.3398742675781, "logps/rejected": -160.30117797851562, "loss": 0.6178, "rewards/accuracies": 0.625, "rewards/chosen": -0.3555864989757538, "rewards/margins": 0.2697451710700989, "rewards/rejected": -0.6253317594528198, "step": 2338 }, { "epoch": 0.3617243379083704, "grad_norm": 6.299797534942627, "learning_rate": 4.885725741780273e-06, "logits/chosen": 12.408297538757324, "logits/rejected": 12.135342597961426, "logps/chosen": -288.1808776855469, "logps/rejected": -345.37286376953125, "loss": 0.667, "rewards/accuracies": 0.625, "rewards/chosen": -0.1627824902534485, "rewards/margins": 0.1001102402806282, "rewards/rejected": -0.2628927230834961, "step": 2339 }, { "epoch": 0.36187898704813454, "grad_norm": 5.398398399353027, "learning_rate": 4.8854393401306e-06, "logits/chosen": 8.593003273010254, "logits/rejected": 8.416049003601074, "logps/chosen": -227.72084045410156, "logps/rejected": -214.4921417236328, "loss": 0.6238, "rewards/accuracies": 0.625, "rewards/chosen": -0.19532723724842072, "rewards/margins": 0.3638247847557068, "rewards/rejected": -0.5591520071029663, "step": 2340 }, { "epoch": 0.3620336361878987, "grad_norm": 4.384174823760986, "learning_rate": 4.885152938480926e-06, "logits/chosen": 9.611751556396484, "logits/rejected": 8.927783012390137, "logps/chosen": -322.15179443359375, "logps/rejected": -262.297119140625, "loss": 0.6181, "rewards/accuracies": 0.625, "rewards/chosen": -0.258592426776886, "rewards/margins": 0.2849966287612915, "rewards/rejected": -0.5435889959335327, "step": 2341 }, { "epoch": 0.36218828532766284, "grad_norm": 7.699501991271973, "learning_rate": 4.884866536831253e-06, "logits/chosen": 18.6249942779541, "logits/rejected": 11.507184982299805, "logps/chosen": -368.5857849121094, "logps/rejected": -314.67877197265625, "loss": 0.8746, "rewards/accuracies": 0.375, "rewards/chosen": -0.5849729776382446, "rewards/margins": -0.209983229637146, "rewards/rejected": -0.37498971819877625, "step": 2342 }, { "epoch": 0.36234293446742705, "grad_norm": 7.476219654083252, "learning_rate": 4.8845801351815796e-06, "logits/chosen": 6.921806335449219, "logits/rejected": 9.927090644836426, "logps/chosen": -314.04937744140625, "logps/rejected": -340.1742248535156, "loss": 0.805, "rewards/accuracies": 0.5, "rewards/chosen": -0.34801045060157776, "rewards/margins": -0.1438456028699875, "rewards/rejected": -0.20416483283042908, "step": 2343 }, { "epoch": 0.3624975836071912, "grad_norm": 4.399616241455078, "learning_rate": 4.884293733531905e-06, "logits/chosen": 14.571311950683594, "logits/rejected": 6.413580417633057, "logps/chosen": -318.14996337890625, "logps/rejected": -269.2610778808594, "loss": 0.5276, "rewards/accuracies": 0.75, "rewards/chosen": -0.30601176619529724, "rewards/margins": 0.5159397125244141, "rewards/rejected": -0.8219515085220337, "step": 2344 }, { "epoch": 0.36265223274695535, "grad_norm": 7.860400199890137, "learning_rate": 4.884007331882232e-06, "logits/chosen": 12.45594310760498, "logits/rejected": 8.262201309204102, "logps/chosen": -275.4473571777344, "logps/rejected": -226.03073120117188, "loss": 0.6237, "rewards/accuracies": 0.875, "rewards/chosen": -0.15561650693416595, "rewards/margins": 0.3029440641403198, "rewards/rejected": -0.458560585975647, "step": 2345 }, { "epoch": 0.3628068818867195, "grad_norm": 8.57366943359375, "learning_rate": 4.883720930232559e-06, "logits/chosen": 5.303665637969971, "logits/rejected": 6.4736247062683105, "logps/chosen": -275.08843994140625, "logps/rejected": -369.48492431640625, "loss": 0.78, "rewards/accuracies": 0.5, "rewards/chosen": -0.28122615814208984, "rewards/margins": -0.032385632395744324, "rewards/rejected": -0.24884052574634552, "step": 2346 }, { "epoch": 0.36296153102648365, "grad_norm": 3.795870065689087, "learning_rate": 4.883434528582885e-06, "logits/chosen": 6.8698906898498535, "logits/rejected": 3.725409984588623, "logps/chosen": -170.1791534423828, "logps/rejected": -148.29632568359375, "loss": 0.564, "rewards/accuracies": 0.625, "rewards/chosen": -0.3769446313381195, "rewards/margins": 0.33366379141807556, "rewards/rejected": -0.7106083631515503, "step": 2347 }, { "epoch": 0.3631161801662478, "grad_norm": 7.328372001647949, "learning_rate": 4.883148126933211e-06, "logits/chosen": 4.369106292724609, "logits/rejected": 12.003111839294434, "logps/chosen": -333.47235107421875, "logps/rejected": -472.6842956542969, "loss": 0.7566, "rewards/accuracies": 0.375, "rewards/chosen": -0.17366792261600494, "rewards/margins": 0.07455785572528839, "rewards/rejected": -0.24822577834129333, "step": 2348 }, { "epoch": 0.363270829306012, "grad_norm": 6.609291076660156, "learning_rate": 4.882861725283538e-06, "logits/chosen": 10.910910606384277, "logits/rejected": 8.999237060546875, "logps/chosen": -274.8633728027344, "logps/rejected": -322.33367919921875, "loss": 0.7171, "rewards/accuracies": 0.25, "rewards/chosen": -0.06630589812994003, "rewards/margins": 0.023826494812965393, "rewards/rejected": -0.09013240039348602, "step": 2349 }, { "epoch": 0.36342547844577616, "grad_norm": 4.94293737411499, "learning_rate": 4.882575323633864e-06, "logits/chosen": 12.231851577758789, "logits/rejected": 1.6587518453598022, "logps/chosen": -244.8386993408203, "logps/rejected": -133.571533203125, "loss": 0.6544, "rewards/accuracies": 0.625, "rewards/chosen": -0.3842792212963104, "rewards/margins": 0.11548030376434326, "rewards/rejected": -0.4997595250606537, "step": 2350 }, { "epoch": 0.3635801275855403, "grad_norm": 4.537017822265625, "learning_rate": 4.882288921984191e-06, "logits/chosen": 5.542438507080078, "logits/rejected": 7.889125823974609, "logps/chosen": -269.3973693847656, "logps/rejected": -283.2834167480469, "loss": 0.6178, "rewards/accuracies": 0.625, "rewards/chosen": 0.12994347512722015, "rewards/margins": 0.25747787952423096, "rewards/rejected": -0.1275343894958496, "step": 2351 }, { "epoch": 0.36373477672530447, "grad_norm": 13.991962432861328, "learning_rate": 4.882002520334517e-06, "logits/chosen": 10.358453750610352, "logits/rejected": 5.808764934539795, "logps/chosen": -215.3419647216797, "logps/rejected": -172.7190704345703, "loss": 0.6002, "rewards/accuracies": 0.625, "rewards/chosen": -0.11588548123836517, "rewards/margins": 0.2355099618434906, "rewards/rejected": -0.3513954281806946, "step": 2352 }, { "epoch": 0.3638894258650686, "grad_norm": 5.103969097137451, "learning_rate": 4.8817161186848435e-06, "logits/chosen": 7.68189811706543, "logits/rejected": 6.0498127937316895, "logps/chosen": -184.1732635498047, "logps/rejected": -217.8790740966797, "loss": 0.7151, "rewards/accuracies": 0.375, "rewards/chosen": -0.35028061270713806, "rewards/margins": 0.060576677322387695, "rewards/rejected": -0.41085729002952576, "step": 2353 }, { "epoch": 0.36404407500483277, "grad_norm": 5.266611576080322, "learning_rate": 4.88142971703517e-06, "logits/chosen": 6.056519508361816, "logits/rejected": 6.682257652282715, "logps/chosen": -220.18661499023438, "logps/rejected": -272.0639343261719, "loss": 0.6582, "rewards/accuracies": 0.5, "rewards/chosen": 0.012373637408018112, "rewards/margins": 0.12627770006656647, "rewards/rejected": -0.11390408128499985, "step": 2354 }, { "epoch": 0.3641987241445969, "grad_norm": 2.9377174377441406, "learning_rate": 4.881143315385497e-06, "logits/chosen": 7.589189529418945, "logits/rejected": 3.4054646492004395, "logps/chosen": -171.55177307128906, "logps/rejected": -164.3984375, "loss": 0.5127, "rewards/accuracies": 0.75, "rewards/chosen": -0.14273259043693542, "rewards/margins": 0.5173119306564331, "rewards/rejected": -0.6600445508956909, "step": 2355 }, { "epoch": 0.3643533732843611, "grad_norm": 6.725290775299072, "learning_rate": 4.8808569137358235e-06, "logits/chosen": 14.152974128723145, "logits/rejected": 13.475658416748047, "logps/chosen": -367.3597106933594, "logps/rejected": -272.80126953125, "loss": 0.8065, "rewards/accuracies": 0.5, "rewards/chosen": -0.0828234925866127, "rewards/margins": -0.1337573379278183, "rewards/rejected": 0.0509338453412056, "step": 2356 }, { "epoch": 0.3645080224241253, "grad_norm": 3.9029901027679443, "learning_rate": 4.88057051208615e-06, "logits/chosen": 11.248859405517578, "logits/rejected": 8.602968215942383, "logps/chosen": -293.4554443359375, "logps/rejected": -330.5849304199219, "loss": 0.5125, "rewards/accuracies": 0.875, "rewards/chosen": 0.08047932386398315, "rewards/margins": 0.4764372408390045, "rewards/rejected": -0.39595791697502136, "step": 2357 }, { "epoch": 0.36466267156388943, "grad_norm": 6.206760406494141, "learning_rate": 4.880284110436477e-06, "logits/chosen": 9.542941093444824, "logits/rejected": 12.10243034362793, "logps/chosen": -347.6027526855469, "logps/rejected": -366.81890869140625, "loss": 0.7131, "rewards/accuracies": 0.75, "rewards/chosen": -0.0746513307094574, "rewards/margins": 0.07401637732982635, "rewards/rejected": -0.14866770803928375, "step": 2358 }, { "epoch": 0.3648173207036536, "grad_norm": 5.568342208862305, "learning_rate": 4.8799977087868026e-06, "logits/chosen": 5.272181987762451, "logits/rejected": 7.458010196685791, "logps/chosen": -190.6249542236328, "logps/rejected": -248.82757568359375, "loss": 0.6727, "rewards/accuracies": 0.5, "rewards/chosen": -0.31546923518180847, "rewards/margins": 0.07132832705974579, "rewards/rejected": -0.38679754734039307, "step": 2359 }, { "epoch": 0.36497196984341773, "grad_norm": 5.3074774742126465, "learning_rate": 4.879711307137129e-06, "logits/chosen": 9.64173412322998, "logits/rejected": 8.144506454467773, "logps/chosen": -255.3279266357422, "logps/rejected": -238.05764770507812, "loss": 0.6663, "rewards/accuracies": 0.5, "rewards/chosen": -0.3361782431602478, "rewards/margins": 0.13661952316761017, "rewards/rejected": -0.4727977514266968, "step": 2360 }, { "epoch": 0.3651266189831819, "grad_norm": 6.0828351974487305, "learning_rate": 4.879424905487456e-06, "logits/chosen": 10.668529510498047, "logits/rejected": 9.489906311035156, "logps/chosen": -321.1646423339844, "logps/rejected": -282.66729736328125, "loss": 0.7068, "rewards/accuracies": 0.625, "rewards/chosen": -0.24036678671836853, "rewards/margins": 0.06111770123243332, "rewards/rejected": -0.30148449540138245, "step": 2361 }, { "epoch": 0.3652812681229461, "grad_norm": 6.2184529304504395, "learning_rate": 4.8791385038377825e-06, "logits/chosen": 10.269166946411133, "logits/rejected": 10.164972305297852, "logps/chosen": -302.8077697753906, "logps/rejected": -338.8974609375, "loss": 0.6137, "rewards/accuracies": 0.5, "rewards/chosen": -0.09644690155982971, "rewards/margins": 0.20035693049430847, "rewards/rejected": -0.2968038022518158, "step": 2362 }, { "epoch": 0.36543591726271024, "grad_norm": 5.861464023590088, "learning_rate": 4.878852102188109e-06, "logits/chosen": 6.682910919189453, "logits/rejected": 1.3267126083374023, "logps/chosen": -203.043212890625, "logps/rejected": -184.43333435058594, "loss": 0.7404, "rewards/accuracies": 0.375, "rewards/chosen": -0.5007709264755249, "rewards/margins": -0.03474564850330353, "rewards/rejected": -0.4660252630710602, "step": 2363 }, { "epoch": 0.3655905664024744, "grad_norm": 5.164547443389893, "learning_rate": 4.878565700538436e-06, "logits/chosen": 6.862906455993652, "logits/rejected": 7.993058204650879, "logps/chosen": -294.6205749511719, "logps/rejected": -302.4521789550781, "loss": 0.67, "rewards/accuracies": 0.75, "rewards/chosen": 0.05370015650987625, "rewards/margins": 0.2336564064025879, "rewards/rejected": -0.17995625734329224, "step": 2364 }, { "epoch": 0.36574521554223854, "grad_norm": 7.211198329925537, "learning_rate": 4.878279298888762e-06, "logits/chosen": 10.631901741027832, "logits/rejected": 6.959458827972412, "logps/chosen": -410.842041015625, "logps/rejected": -338.8072814941406, "loss": 0.6009, "rewards/accuracies": 0.625, "rewards/chosen": -0.06388301402330399, "rewards/margins": 0.31291818618774414, "rewards/rejected": -0.3768012225627899, "step": 2365 }, { "epoch": 0.3658998646820027, "grad_norm": 7.287830352783203, "learning_rate": 4.877992897239088e-06, "logits/chosen": 9.525850296020508, "logits/rejected": 4.018980503082275, "logps/chosen": -241.68467712402344, "logps/rejected": -166.04310607910156, "loss": 0.7677, "rewards/accuracies": 0.625, "rewards/chosen": -0.29303646087646484, "rewards/margins": -0.014094941318035126, "rewards/rejected": -0.2789415121078491, "step": 2366 }, { "epoch": 0.36605451382176685, "grad_norm": 4.5175652503967285, "learning_rate": 4.877706495589415e-06, "logits/chosen": 7.682052135467529, "logits/rejected": 7.023406982421875, "logps/chosen": -197.8852996826172, "logps/rejected": -216.0009002685547, "loss": 0.5906, "rewards/accuracies": 0.5, "rewards/chosen": 0.14389914274215698, "rewards/margins": 0.3550753891468048, "rewards/rejected": -0.21117626130580902, "step": 2367 }, { "epoch": 0.36620916296153105, "grad_norm": 4.615246295928955, "learning_rate": 4.877420093939742e-06, "logits/chosen": 13.965363502502441, "logits/rejected": 11.511537551879883, "logps/chosen": -302.0827941894531, "logps/rejected": -216.3037567138672, "loss": 0.657, "rewards/accuracies": 0.5, "rewards/chosen": -0.055520229041576385, "rewards/margins": 0.1335691511631012, "rewards/rejected": -0.18908938765525818, "step": 2368 }, { "epoch": 0.3663638121012952, "grad_norm": 6.347133159637451, "learning_rate": 4.877133692290068e-06, "logits/chosen": 10.524088859558105, "logits/rejected": 13.706088066101074, "logps/chosen": -275.6937255859375, "logps/rejected": -296.3673400878906, "loss": 0.7062, "rewards/accuracies": 0.625, "rewards/chosen": -0.12865515053272247, "rewards/margins": 0.10073898732662201, "rewards/rejected": -0.22939416766166687, "step": 2369 }, { "epoch": 0.36651846124105936, "grad_norm": 4.301425457000732, "learning_rate": 4.876847290640395e-06, "logits/chosen": 10.749343872070312, "logits/rejected": 3.797736167907715, "logps/chosen": -294.8398132324219, "logps/rejected": -196.67572021484375, "loss": 0.5578, "rewards/accuracies": 0.625, "rewards/chosen": 0.1838666945695877, "rewards/margins": 0.3971025049686432, "rewards/rejected": -0.21323581039905548, "step": 2370 }, { "epoch": 0.3666731103808235, "grad_norm": 4.686715602874756, "learning_rate": 4.8765608889907215e-06, "logits/chosen": 10.325092315673828, "logits/rejected": 0.4748508930206299, "logps/chosen": -273.77069091796875, "logps/rejected": -163.8691864013672, "loss": 0.634, "rewards/accuracies": 0.5, "rewards/chosen": -0.09108364582061768, "rewards/margins": 0.28942739963531494, "rewards/rejected": -0.3805110454559326, "step": 2371 }, { "epoch": 0.36682775952058766, "grad_norm": 3.7439653873443604, "learning_rate": 4.876274487341047e-06, "logits/chosen": 4.917524337768555, "logits/rejected": 3.206486701965332, "logps/chosen": -254.50897216796875, "logps/rejected": -143.85720825195312, "loss": 0.576, "rewards/accuracies": 0.75, "rewards/chosen": -0.05421806871891022, "rewards/margins": 0.29991936683654785, "rewards/rejected": -0.3541374206542969, "step": 2372 }, { "epoch": 0.3669824086603518, "grad_norm": 5.920764446258545, "learning_rate": 4.875988085691374e-06, "logits/chosen": 12.630250930786133, "logits/rejected": 12.027786254882812, "logps/chosen": -293.1812744140625, "logps/rejected": -277.10992431640625, "loss": 0.7128, "rewards/accuracies": 0.5, "rewards/chosen": -0.2882464528083801, "rewards/margins": 0.04276639223098755, "rewards/rejected": -0.3310127854347229, "step": 2373 }, { "epoch": 0.36713705780011596, "grad_norm": 5.011660575866699, "learning_rate": 4.875701684041701e-06, "logits/chosen": 5.424834251403809, "logits/rejected": 3.492262363433838, "logps/chosen": -185.93032836914062, "logps/rejected": -192.091552734375, "loss": 0.6556, "rewards/accuracies": 0.625, "rewards/chosen": -0.12888306379318237, "rewards/margins": 0.15367940068244934, "rewards/rejected": -0.2825624942779541, "step": 2374 }, { "epoch": 0.36729170693988017, "grad_norm": 5.110328197479248, "learning_rate": 4.875415282392027e-06, "logits/chosen": 6.217820167541504, "logits/rejected": 5.6821064949035645, "logps/chosen": -278.514892578125, "logps/rejected": -290.5674743652344, "loss": 0.6137, "rewards/accuracies": 0.5, "rewards/chosen": -0.2135058492422104, "rewards/margins": 0.29518839716911316, "rewards/rejected": -0.5086942315101624, "step": 2375 }, { "epoch": 0.3674463560796443, "grad_norm": 5.255284309387207, "learning_rate": 4.875128880742354e-06, "logits/chosen": 10.826190948486328, "logits/rejected": 6.227410793304443, "logps/chosen": -356.1262512207031, "logps/rejected": -232.88088989257812, "loss": 0.7445, "rewards/accuracies": 0.5, "rewards/chosen": -0.030341394245624542, "rewards/margins": 0.10422481596469879, "rewards/rejected": -0.13456621766090393, "step": 2376 }, { "epoch": 0.36760100521940847, "grad_norm": 5.0341997146606445, "learning_rate": 4.87484247909268e-06, "logits/chosen": 12.82275390625, "logits/rejected": 9.950830459594727, "logps/chosen": -204.505615234375, "logps/rejected": -177.79937744140625, "loss": 0.6338, "rewards/accuracies": 0.625, "rewards/chosen": -0.276117205619812, "rewards/margins": 0.14440016448497772, "rewards/rejected": -0.42051735520362854, "step": 2377 }, { "epoch": 0.3677556543591726, "grad_norm": 5.252141952514648, "learning_rate": 4.874556077443006e-06, "logits/chosen": 14.976068496704102, "logits/rejected": 1.7651383876800537, "logps/chosen": -469.15869140625, "logps/rejected": -251.12576293945312, "loss": 0.5003, "rewards/accuracies": 0.875, "rewards/chosen": 0.083179771900177, "rewards/margins": 0.5186883211135864, "rewards/rejected": -0.4355085492134094, "step": 2378 }, { "epoch": 0.3679103034989368, "grad_norm": 4.702194690704346, "learning_rate": 4.874269675793333e-06, "logits/chosen": 8.767407417297363, "logits/rejected": 8.445731163024902, "logps/chosen": -224.59815979003906, "logps/rejected": -246.44595336914062, "loss": 0.6852, "rewards/accuracies": 0.625, "rewards/chosen": -0.09728317707777023, "rewards/margins": 0.060402654111385345, "rewards/rejected": -0.15768583118915558, "step": 2379 }, { "epoch": 0.3680649526387009, "grad_norm": 12.987638473510742, "learning_rate": 4.87398327414366e-06, "logits/chosen": 9.733770370483398, "logits/rejected": 10.07154655456543, "logps/chosen": -270.3354187011719, "logps/rejected": -327.7171936035156, "loss": 0.5639, "rewards/accuracies": 0.625, "rewards/chosen": 0.12341442704200745, "rewards/margins": 0.36424195766448975, "rewards/rejected": -0.2408275604248047, "step": 2380 }, { "epoch": 0.36821960177846513, "grad_norm": 6.640517234802246, "learning_rate": 4.873696872493986e-06, "logits/chosen": 1.9106736183166504, "logits/rejected": 7.916538238525391, "logps/chosen": -173.9552001953125, "logps/rejected": -203.43161010742188, "loss": 0.8338, "rewards/accuracies": 0.5, "rewards/chosen": -0.48268765211105347, "rewards/margins": -0.17412656545639038, "rewards/rejected": -0.3085610866546631, "step": 2381 }, { "epoch": 0.3683742509182293, "grad_norm": 8.06789779663086, "learning_rate": 4.873410470844312e-06, "logits/chosen": 7.257105827331543, "logits/rejected": 11.474893569946289, "logps/chosen": -364.2514953613281, "logps/rejected": -368.940673828125, "loss": 0.6514, "rewards/accuracies": 0.75, "rewards/chosen": -0.47181302309036255, "rewards/margins": 0.10804357379674911, "rewards/rejected": -0.5798565745353699, "step": 2382 }, { "epoch": 0.36852890005799344, "grad_norm": 6.6192307472229, "learning_rate": 4.873124069194639e-06, "logits/chosen": 4.267529010772705, "logits/rejected": -0.2083573341369629, "logps/chosen": -307.0151672363281, "logps/rejected": -195.8062286376953, "loss": 0.6424, "rewards/accuracies": 0.5, "rewards/chosen": -0.02369135618209839, "rewards/margins": 0.4160001575946808, "rewards/rejected": -0.43969154357910156, "step": 2383 }, { "epoch": 0.3686835491977576, "grad_norm": 5.832785606384277, "learning_rate": 4.8728376675449654e-06, "logits/chosen": 5.619063377380371, "logits/rejected": 6.483266353607178, "logps/chosen": -140.34205627441406, "logps/rejected": -136.52633666992188, "loss": 0.8751, "rewards/accuracies": 0.125, "rewards/chosen": -0.4540998339653015, "rewards/margins": -0.3129621744155884, "rewards/rejected": -0.14113768935203552, "step": 2384 }, { "epoch": 0.36883819833752174, "grad_norm": 4.609332084655762, "learning_rate": 4.872551265895292e-06, "logits/chosen": 12.109814643859863, "logits/rejected": 9.321410179138184, "logps/chosen": -209.52783203125, "logps/rejected": -260.81695556640625, "loss": 0.6112, "rewards/accuracies": 0.5, "rewards/chosen": -0.10003555566072464, "rewards/margins": 0.21664945781230927, "rewards/rejected": -0.3166849911212921, "step": 2385 }, { "epoch": 0.3689928474772859, "grad_norm": 4.604510307312012, "learning_rate": 4.872264864245618e-06, "logits/chosen": 7.773797035217285, "logits/rejected": 9.025056838989258, "logps/chosen": -202.4029083251953, "logps/rejected": -193.79739379882812, "loss": 0.6154, "rewards/accuracies": 0.625, "rewards/chosen": -0.1768815517425537, "rewards/margins": 0.20046842098236084, "rewards/rejected": -0.37734994292259216, "step": 2386 }, { "epoch": 0.36914749661705004, "grad_norm": 4.75264835357666, "learning_rate": 4.8719784625959445e-06, "logits/chosen": 5.044244289398193, "logits/rejected": 2.677712917327881, "logps/chosen": -225.36862182617188, "logps/rejected": -164.3058319091797, "loss": 0.6358, "rewards/accuracies": 0.75, "rewards/chosen": -0.3549291491508484, "rewards/margins": 0.17416629195213318, "rewards/rejected": -0.5290954113006592, "step": 2387 }, { "epoch": 0.36930214575681425, "grad_norm": 4.902527332305908, "learning_rate": 4.871692060946271e-06, "logits/chosen": 10.700467109680176, "logits/rejected": 4.135343074798584, "logps/chosen": -281.79925537109375, "logps/rejected": -200.09530639648438, "loss": 0.5722, "rewards/accuracies": 0.75, "rewards/chosen": 0.10641193389892578, "rewards/margins": 0.34592026472091675, "rewards/rejected": -0.23950831592082977, "step": 2388 }, { "epoch": 0.3694567948965784, "grad_norm": 4.677819728851318, "learning_rate": 4.871405659296598e-06, "logits/chosen": 12.49563217163086, "logits/rejected": 9.802335739135742, "logps/chosen": -274.1383361816406, "logps/rejected": -216.48519897460938, "loss": 0.6362, "rewards/accuracies": 0.625, "rewards/chosen": 0.11770892888307571, "rewards/margins": 0.1549261510372162, "rewards/rejected": -0.03721722960472107, "step": 2389 }, { "epoch": 0.36961144403634255, "grad_norm": 4.814740180969238, "learning_rate": 4.8711192576469245e-06, "logits/chosen": 16.692867279052734, "logits/rejected": 10.169358253479004, "logps/chosen": -285.1744689941406, "logps/rejected": -209.13375854492188, "loss": 0.6241, "rewards/accuracies": 0.75, "rewards/chosen": 0.0999113991856575, "rewards/margins": 0.19803360104560852, "rewards/rejected": -0.09812220931053162, "step": 2390 }, { "epoch": 0.3697660931761067, "grad_norm": 6.51661491394043, "learning_rate": 4.870832855997251e-06, "logits/chosen": 9.130752563476562, "logits/rejected": 11.86257553100586, "logps/chosen": -385.63970947265625, "logps/rejected": -411.24090576171875, "loss": 0.7663, "rewards/accuracies": 0.5, "rewards/chosen": 0.060430534183979034, "rewards/margins": -0.021870076656341553, "rewards/rejected": 0.0823005735874176, "step": 2391 }, { "epoch": 0.36992074231587085, "grad_norm": 4.617303371429443, "learning_rate": 4.870546454347577e-06, "logits/chosen": 9.277838706970215, "logits/rejected": 5.246686935424805, "logps/chosen": -242.91604614257812, "logps/rejected": -205.66676330566406, "loss": 0.5409, "rewards/accuracies": 0.75, "rewards/chosen": -0.03974518924951553, "rewards/margins": 0.4512771964073181, "rewards/rejected": -0.49102237820625305, "step": 2392 }, { "epoch": 0.370075391455635, "grad_norm": 4.115781307220459, "learning_rate": 4.870260052697904e-06, "logits/chosen": 11.460028648376465, "logits/rejected": 9.533269882202148, "logps/chosen": -216.5787353515625, "logps/rejected": -219.90652465820312, "loss": 0.55, "rewards/accuracies": 0.625, "rewards/chosen": 0.21053925156593323, "rewards/margins": 0.5041624307632446, "rewards/rejected": -0.2936231791973114, "step": 2393 }, { "epoch": 0.3702300405953992, "grad_norm": 5.619273662567139, "learning_rate": 4.86997365104823e-06, "logits/chosen": 12.299894332885742, "logits/rejected": 10.783432006835938, "logps/chosen": -297.86395263671875, "logps/rejected": -242.69082641601562, "loss": 0.7247, "rewards/accuracies": 0.625, "rewards/chosen": -0.12371206283569336, "rewards/margins": 0.06363438069820404, "rewards/rejected": -0.1873464584350586, "step": 2394 }, { "epoch": 0.37038468973516336, "grad_norm": 4.4429612159729, "learning_rate": 4.869687249398557e-06, "logits/chosen": 15.541686058044434, "logits/rejected": 8.054214477539062, "logps/chosen": -297.373046875, "logps/rejected": -159.5595703125, "loss": 0.608, "rewards/accuracies": 0.5, "rewards/chosen": -0.04231991618871689, "rewards/margins": 0.29553380608558655, "rewards/rejected": -0.33785372972488403, "step": 2395 }, { "epoch": 0.3705393388749275, "grad_norm": 5.448227405548096, "learning_rate": 4.8694008477488836e-06, "logits/chosen": 8.33600902557373, "logits/rejected": 2.7957589626312256, "logps/chosen": -371.14532470703125, "logps/rejected": -300.59466552734375, "loss": 0.5739, "rewards/accuracies": 0.75, "rewards/chosen": 0.01331009715795517, "rewards/margins": 0.35207265615463257, "rewards/rejected": -0.3387625813484192, "step": 2396 }, { "epoch": 0.37069398801469167, "grad_norm": 7.116875171661377, "learning_rate": 4.86911444609921e-06, "logits/chosen": 4.180120468139648, "logits/rejected": 9.486620903015137, "logps/chosen": -265.0079345703125, "logps/rejected": -320.7835693359375, "loss": 0.8892, "rewards/accuracies": 0.125, "rewards/chosen": -0.5311102271080017, "rewards/margins": -0.3332226872444153, "rewards/rejected": -0.19788752496242523, "step": 2397 }, { "epoch": 0.3708486371544558, "grad_norm": 5.047080039978027, "learning_rate": 4.868828044449536e-06, "logits/chosen": 5.533426284790039, "logits/rejected": 5.468631744384766, "logps/chosen": -334.0582275390625, "logps/rejected": -331.41571044921875, "loss": 0.5213, "rewards/accuracies": 0.75, "rewards/chosen": 0.1340743452310562, "rewards/margins": 0.5435271263122559, "rewards/rejected": -0.4094528555870056, "step": 2398 }, { "epoch": 0.37100328629421997, "grad_norm": 5.215960502624512, "learning_rate": 4.868541642799863e-06, "logits/chosen": 10.979272842407227, "logits/rejected": 3.5393314361572266, "logps/chosen": -346.67572021484375, "logps/rejected": -270.45391845703125, "loss": 0.5911, "rewards/accuracies": 0.625, "rewards/chosen": -0.19033685326576233, "rewards/margins": 0.30940285325050354, "rewards/rejected": -0.4997396767139435, "step": 2399 }, { "epoch": 0.3711579354339841, "grad_norm": 7.44083833694458, "learning_rate": 4.868255241150189e-06, "logits/chosen": 9.197869300842285, "logits/rejected": 10.546732902526855, "logps/chosen": -301.154541015625, "logps/rejected": -325.6441345214844, "loss": 0.6718, "rewards/accuracies": 0.5, "rewards/chosen": 0.20265166461467743, "rewards/margins": 0.08321147412061691, "rewards/rejected": 0.11944020539522171, "step": 2400 }, { "epoch": 0.3713125845737483, "grad_norm": 7.2116780281066895, "learning_rate": 4.867968839500516e-06, "logits/chosen": 9.496402740478516, "logits/rejected": 7.272639274597168, "logps/chosen": -392.7807922363281, "logps/rejected": -390.8681640625, "loss": 0.7087, "rewards/accuracies": 0.375, "rewards/chosen": 0.08842869102954865, "rewards/margins": 0.24314594268798828, "rewards/rejected": -0.15471723675727844, "step": 2401 }, { "epoch": 0.3714672337135125, "grad_norm": 4.874665260314941, "learning_rate": 4.867682437850843e-06, "logits/chosen": 13.588971138000488, "logits/rejected": 12.379103660583496, "logps/chosen": -324.69183349609375, "logps/rejected": -274.97918701171875, "loss": 0.5758, "rewards/accuracies": 0.5, "rewards/chosen": 0.14593347907066345, "rewards/margins": 0.3790302872657776, "rewards/rejected": -0.23309679329395294, "step": 2402 }, { "epoch": 0.37162188285327663, "grad_norm": 9.878633499145508, "learning_rate": 4.867396036201169e-06, "logits/chosen": 10.561646461486816, "logits/rejected": 7.264138221740723, "logps/chosen": -426.41949462890625, "logps/rejected": -375.1007080078125, "loss": 0.7279, "rewards/accuracies": 0.5, "rewards/chosen": 0.0992315337061882, "rewards/margins": 0.0367194339632988, "rewards/rejected": 0.06251209229230881, "step": 2403 }, { "epoch": 0.3717765319930408, "grad_norm": 7.184102535247803, "learning_rate": 4.867109634551496e-06, "logits/chosen": 9.5424165725708, "logits/rejected": 7.9329328536987305, "logps/chosen": -302.94561767578125, "logps/rejected": -281.15411376953125, "loss": 0.6258, "rewards/accuracies": 0.75, "rewards/chosen": 0.08207236230373383, "rewards/margins": 0.327249675989151, "rewards/rejected": -0.24517729878425598, "step": 2404 }, { "epoch": 0.37193118113280493, "grad_norm": 5.745555877685547, "learning_rate": 4.866823232901822e-06, "logits/chosen": 8.223381996154785, "logits/rejected": 8.593246459960938, "logps/chosen": -252.44442749023438, "logps/rejected": -260.95721435546875, "loss": 0.8353, "rewards/accuracies": 0.375, "rewards/chosen": -0.38044825196266174, "rewards/margins": -0.16518260538578033, "rewards/rejected": -0.2152656763792038, "step": 2405 }, { "epoch": 0.3720858302725691, "grad_norm": 6.383440971374512, "learning_rate": 4.866536831252148e-06, "logits/chosen": 9.163786888122559, "logits/rejected": 10.457295417785645, "logps/chosen": -276.6761474609375, "logps/rejected": -272.112060546875, "loss": 0.7696, "rewards/accuracies": 0.75, "rewards/chosen": -0.10289005935192108, "rewards/margins": -0.04424424469470978, "rewards/rejected": -0.0586458183825016, "step": 2406 }, { "epoch": 0.3722404794123333, "grad_norm": 7.81800651550293, "learning_rate": 4.866250429602475e-06, "logits/chosen": 6.324779033660889, "logits/rejected": 3.4595487117767334, "logps/chosen": -265.68890380859375, "logps/rejected": -251.76902770996094, "loss": 0.7418, "rewards/accuracies": 0.75, "rewards/chosen": -0.07356545329093933, "rewards/margins": -0.036809131503105164, "rewards/rejected": -0.036756325513124466, "step": 2407 }, { "epoch": 0.37239512855209744, "grad_norm": 6.409412860870361, "learning_rate": 4.865964027952802e-06, "logits/chosen": 10.697577476501465, "logits/rejected": 12.21921157836914, "logps/chosen": -208.1806640625, "logps/rejected": -283.0392761230469, "loss": 0.8412, "rewards/accuracies": 0.375, "rewards/chosen": -0.22442729771137238, "rewards/margins": -0.1961127668619156, "rewards/rejected": -0.028314542025327682, "step": 2408 }, { "epoch": 0.3725497776918616, "grad_norm": 4.9999284744262695, "learning_rate": 4.865677626303128e-06, "logits/chosen": 10.129697799682617, "logits/rejected": 9.321220397949219, "logps/chosen": -251.20555114746094, "logps/rejected": -240.29701232910156, "loss": 0.7, "rewards/accuracies": 0.5, "rewards/chosen": -0.19308196008205414, "rewards/margins": 0.11935662478208542, "rewards/rejected": -0.31243857741355896, "step": 2409 }, { "epoch": 0.37270442683162575, "grad_norm": 5.70963191986084, "learning_rate": 4.865391224653455e-06, "logits/chosen": 12.110647201538086, "logits/rejected": 8.521402359008789, "logps/chosen": -345.49847412109375, "logps/rejected": -242.37579345703125, "loss": 0.616, "rewards/accuracies": 0.75, "rewards/chosen": -0.011140245944261551, "rewards/margins": 0.18487143516540527, "rewards/rejected": -0.19601169228553772, "step": 2410 }, { "epoch": 0.3728590759713899, "grad_norm": 3.2742412090301514, "learning_rate": 4.865104823003781e-06, "logits/chosen": 11.43698501586914, "logits/rejected": 9.223467826843262, "logps/chosen": -274.5985412597656, "logps/rejected": -199.3978271484375, "loss": 0.4854, "rewards/accuracies": 0.75, "rewards/chosen": 0.13046349585056305, "rewards/margins": 0.6204354763031006, "rewards/rejected": -0.48997196555137634, "step": 2411 }, { "epoch": 0.37301372511115405, "grad_norm": 5.112938404083252, "learning_rate": 4.864818421354107e-06, "logits/chosen": 14.252082824707031, "logits/rejected": 10.206888198852539, "logps/chosen": -368.84783935546875, "logps/rejected": -245.28164672851562, "loss": 0.5694, "rewards/accuracies": 0.625, "rewards/chosen": 0.00153336301445961, "rewards/margins": 0.3470481038093567, "rewards/rejected": -0.345514714717865, "step": 2412 }, { "epoch": 0.37316837425091826, "grad_norm": 4.096931457519531, "learning_rate": 4.864532019704434e-06, "logits/chosen": 7.786017417907715, "logits/rejected": 0.8664898872375488, "logps/chosen": -304.21197509765625, "logps/rejected": -188.03717041015625, "loss": 0.475, "rewards/accuracies": 0.75, "rewards/chosen": 0.26504290103912354, "rewards/margins": 0.6168004274368286, "rewards/rejected": -0.3517575263977051, "step": 2413 }, { "epoch": 0.3733230233906824, "grad_norm": 10.065600395202637, "learning_rate": 4.864245618054761e-06, "logits/chosen": 5.228086471557617, "logits/rejected": 7.355319023132324, "logps/chosen": -334.7033386230469, "logps/rejected": -249.94747924804688, "loss": 0.7697, "rewards/accuracies": 0.375, "rewards/chosen": -0.12815171480178833, "rewards/margins": -0.11044847965240479, "rewards/rejected": -0.017703257501125336, "step": 2414 }, { "epoch": 0.37347767253044656, "grad_norm": 5.694726467132568, "learning_rate": 4.8639592164050865e-06, "logits/chosen": 11.951788902282715, "logits/rejected": 5.954407691955566, "logps/chosen": -337.16973876953125, "logps/rejected": -208.03347778320312, "loss": 0.7349, "rewards/accuracies": 0.625, "rewards/chosen": -0.11614008247852325, "rewards/margins": -0.00870351493358612, "rewards/rejected": -0.10743656754493713, "step": 2415 }, { "epoch": 0.3736323216702107, "grad_norm": 4.3964033126831055, "learning_rate": 4.863672814755413e-06, "logits/chosen": 12.928388595581055, "logits/rejected": 13.472946166992188, "logps/chosen": -200.55270385742188, "logps/rejected": -194.20721435546875, "loss": 0.6371, "rewards/accuracies": 0.75, "rewards/chosen": 0.10276656597852707, "rewards/margins": 0.15031331777572632, "rewards/rejected": -0.047546759247779846, "step": 2416 }, { "epoch": 0.37378697080997486, "grad_norm": 5.179884433746338, "learning_rate": 4.86338641310574e-06, "logits/chosen": 13.089771270751953, "logits/rejected": 16.394187927246094, "logps/chosen": -353.4322814941406, "logps/rejected": -350.3993835449219, "loss": 0.7536, "rewards/accuracies": 0.375, "rewards/chosen": -0.02358187735080719, "rewards/margins": -0.027330204844474792, "rewards/rejected": 0.0037483256310224533, "step": 2417 }, { "epoch": 0.373941619949739, "grad_norm": 5.609171390533447, "learning_rate": 4.8631000114560665e-06, "logits/chosen": 10.013446807861328, "logits/rejected": 6.899214744567871, "logps/chosen": -330.164306640625, "logps/rejected": -232.083984375, "loss": 0.7513, "rewards/accuracies": 0.5, "rewards/chosen": -0.13165056705474854, "rewards/margins": -0.04262585937976837, "rewards/rejected": -0.08902469277381897, "step": 2418 }, { "epoch": 0.37409626908950316, "grad_norm": 5.190798282623291, "learning_rate": 4.862813609806393e-06, "logits/chosen": 5.626438140869141, "logits/rejected": 10.89083194732666, "logps/chosen": -138.60348510742188, "logps/rejected": -195.47848510742188, "loss": 0.7349, "rewards/accuracies": 0.25, "rewards/chosen": -0.09010524302721024, "rewards/margins": 0.030107304453849792, "rewards/rejected": -0.12021255493164062, "step": 2419 }, { "epoch": 0.37425091822926737, "grad_norm": 6.036233901977539, "learning_rate": 4.862527208156719e-06, "logits/chosen": 3.1176884174346924, "logits/rejected": 8.94989013671875, "logps/chosen": -161.10765075683594, "logps/rejected": -262.7567138671875, "loss": 0.7549, "rewards/accuracies": 0.5, "rewards/chosen": -0.24192127585411072, "rewards/margins": -0.046264126896858215, "rewards/rejected": -0.1956571489572525, "step": 2420 }, { "epoch": 0.3744055673690315, "grad_norm": 5.315489768981934, "learning_rate": 4.862240806507046e-06, "logits/chosen": 10.71999740600586, "logits/rejected": 4.455151557922363, "logps/chosen": -387.28582763671875, "logps/rejected": -201.41685485839844, "loss": 0.6543, "rewards/accuracies": 0.5, "rewards/chosen": -0.0010783225297927856, "rewards/margins": 0.2618444561958313, "rewards/rejected": -0.2629227638244629, "step": 2421 }, { "epoch": 0.3745602165087957, "grad_norm": 6.784698963165283, "learning_rate": 4.861954404857372e-06, "logits/chosen": 5.973268985748291, "logits/rejected": 2.865466833114624, "logps/chosen": -401.97113037109375, "logps/rejected": -291.6532287597656, "loss": 0.5707, "rewards/accuracies": 0.875, "rewards/chosen": 0.09425545483827591, "rewards/margins": 0.30102798342704773, "rewards/rejected": -0.20677253603935242, "step": 2422 }, { "epoch": 0.3747148656485598, "grad_norm": 4.686197757720947, "learning_rate": 4.861668003207699e-06, "logits/chosen": 11.251947402954102, "logits/rejected": 4.704822540283203, "logps/chosen": -337.48992919921875, "logps/rejected": -234.16897583007812, "loss": 0.5442, "rewards/accuracies": 0.875, "rewards/chosen": 0.038409046828746796, "rewards/margins": 0.35179638862609863, "rewards/rejected": -0.31338736414909363, "step": 2423 }, { "epoch": 0.374869514788324, "grad_norm": 4.958866119384766, "learning_rate": 4.8613816015580255e-06, "logits/chosen": 10.249595642089844, "logits/rejected": 0.1355055570602417, "logps/chosen": -347.5803527832031, "logps/rejected": -247.2126922607422, "loss": 0.5016, "rewards/accuracies": 0.875, "rewards/chosen": 0.09505057334899902, "rewards/margins": 0.4816757142543793, "rewards/rejected": -0.38662517070770264, "step": 2424 }, { "epoch": 0.37502416392808813, "grad_norm": 6.1127424240112305, "learning_rate": 4.861095199908351e-06, "logits/chosen": 11.60125732421875, "logits/rejected": 9.832794189453125, "logps/chosen": -767.0415649414062, "logps/rejected": -606.1099243164062, "loss": 0.5094, "rewards/accuracies": 1.0, "rewards/chosen": 0.5839893221855164, "rewards/margins": 0.4297370910644531, "rewards/rejected": 0.15425223112106323, "step": 2425 }, { "epoch": 0.37517881306785233, "grad_norm": 4.395288467407227, "learning_rate": 4.860808798258678e-06, "logits/chosen": 1.1999735832214355, "logits/rejected": 4.366621971130371, "logps/chosen": -201.7080841064453, "logps/rejected": -238.09596252441406, "loss": 0.6664, "rewards/accuracies": 0.625, "rewards/chosen": -0.14504578709602356, "rewards/margins": 0.07806803286075592, "rewards/rejected": -0.22311383485794067, "step": 2426 }, { "epoch": 0.3753334622076165, "grad_norm": 6.2342753410339355, "learning_rate": 4.860522396609005e-06, "logits/chosen": 5.233345031738281, "logits/rejected": 5.837580680847168, "logps/chosen": -215.23550415039062, "logps/rejected": -286.7138671875, "loss": 0.7454, "rewards/accuracies": 0.375, "rewards/chosen": -0.1587119996547699, "rewards/margins": 0.019587822258472443, "rewards/rejected": -0.17829981446266174, "step": 2427 }, { "epoch": 0.37548811134738064, "grad_norm": 5.123315811157227, "learning_rate": 4.860235994959331e-06, "logits/chosen": 11.131426811218262, "logits/rejected": 8.473309516906738, "logps/chosen": -197.91627502441406, "logps/rejected": -202.65553283691406, "loss": 0.6451, "rewards/accuracies": 0.5, "rewards/chosen": -0.2181018888950348, "rewards/margins": 0.13306578993797302, "rewards/rejected": -0.3511676788330078, "step": 2428 }, { "epoch": 0.3756427604871448, "grad_norm": 6.883605003356934, "learning_rate": 4.859949593309658e-06, "logits/chosen": 8.799736976623535, "logits/rejected": 9.347648620605469, "logps/chosen": -324.57952880859375, "logps/rejected": -336.3131408691406, "loss": 0.77, "rewards/accuracies": 0.5, "rewards/chosen": -0.09100465476512909, "rewards/margins": -0.06209847331047058, "rewards/rejected": -0.028906196355819702, "step": 2429 }, { "epoch": 0.37579740962690894, "grad_norm": 4.618454456329346, "learning_rate": 4.859663191659985e-06, "logits/chosen": 10.595348358154297, "logits/rejected": 8.491671562194824, "logps/chosen": -296.3447265625, "logps/rejected": -270.54449462890625, "loss": 0.5738, "rewards/accuracies": 0.75, "rewards/chosen": -0.03315233811736107, "rewards/margins": 0.33365505933761597, "rewards/rejected": -0.36680740118026733, "step": 2430 }, { "epoch": 0.3759520587666731, "grad_norm": 5.708675384521484, "learning_rate": 4.85937679001031e-06, "logits/chosen": 7.179444313049316, "logits/rejected": 5.314072608947754, "logps/chosen": -188.41534423828125, "logps/rejected": -206.8900909423828, "loss": 0.7006, "rewards/accuracies": 0.625, "rewards/chosen": -0.2678079605102539, "rewards/margins": 0.08242912590503693, "rewards/rejected": -0.350237101316452, "step": 2431 }, { "epoch": 0.37610670790643724, "grad_norm": 4.498577117919922, "learning_rate": 4.859090388360637e-06, "logits/chosen": 7.766833305358887, "logits/rejected": -1.2014284133911133, "logps/chosen": -277.64276123046875, "logps/rejected": -139.09707641601562, "loss": 0.727, "rewards/accuracies": 0.5, "rewards/chosen": -0.07459845393896103, "rewards/margins": 0.03489711880683899, "rewards/rejected": -0.10949559509754181, "step": 2432 }, { "epoch": 0.37626135704620145, "grad_norm": 5.34918737411499, "learning_rate": 4.858803986710964e-06, "logits/chosen": 2.9574499130249023, "logits/rejected": 3.4283151626586914, "logps/chosen": -268.2252197265625, "logps/rejected": -314.6051025390625, "loss": 0.6752, "rewards/accuracies": 0.5, "rewards/chosen": 0.04532814398407936, "rewards/margins": 0.08371267467737198, "rewards/rejected": -0.03838452696800232, "step": 2433 }, { "epoch": 0.3764160061859656, "grad_norm": 5.0474395751953125, "learning_rate": 4.85851758506129e-06, "logits/chosen": 5.905858516693115, "logits/rejected": 4.116021156311035, "logps/chosen": -210.8483123779297, "logps/rejected": -264.8016662597656, "loss": 0.6722, "rewards/accuracies": 0.5, "rewards/chosen": -0.15677852928638458, "rewards/margins": 0.1294841766357422, "rewards/rejected": -0.28626272082328796, "step": 2434 }, { "epoch": 0.37657065532572975, "grad_norm": 8.921321868896484, "learning_rate": 4.858231183411617e-06, "logits/chosen": 9.516305923461914, "logits/rejected": 8.695402145385742, "logps/chosen": -342.0606689453125, "logps/rejected": -241.3375701904297, "loss": 0.8331, "rewards/accuracies": 0.5, "rewards/chosen": -0.14257775247097015, "rewards/margins": -0.19309987127780914, "rewards/rejected": 0.05052214860916138, "step": 2435 }, { "epoch": 0.3767253044654939, "grad_norm": 7.870149612426758, "learning_rate": 4.857944781761944e-06, "logits/chosen": 11.549324035644531, "logits/rejected": 7.833742618560791, "logps/chosen": -425.5721130371094, "logps/rejected": -362.63372802734375, "loss": 0.7933, "rewards/accuracies": 0.5, "rewards/chosen": 0.21955886483192444, "rewards/margins": -0.036484137177467346, "rewards/rejected": 0.256043016910553, "step": 2436 }, { "epoch": 0.37687995360525806, "grad_norm": 4.738528728485107, "learning_rate": 4.85765838011227e-06, "logits/chosen": 14.041961669921875, "logits/rejected": 10.339014053344727, "logps/chosen": -249.52871704101562, "logps/rejected": -181.6715087890625, "loss": 0.6657, "rewards/accuracies": 0.5, "rewards/chosen": 0.020669549703598022, "rewards/margins": 0.11426806449890137, "rewards/rejected": -0.09359850734472275, "step": 2437 }, { "epoch": 0.3770346027450222, "grad_norm": 4.823914527893066, "learning_rate": 4.857371978462596e-06, "logits/chosen": 7.354625701904297, "logits/rejected": 12.840993881225586, "logps/chosen": -101.60289764404297, "logps/rejected": -157.95004272460938, "loss": 0.8454, "rewards/accuracies": 0.375, "rewards/chosen": -0.2161734700202942, "rewards/margins": -0.22304527461528778, "rewards/rejected": 0.006871797144412994, "step": 2438 }, { "epoch": 0.3771892518847864, "grad_norm": 6.762789249420166, "learning_rate": 4.857085576812923e-06, "logits/chosen": 4.13044548034668, "logits/rejected": 2.03116512298584, "logps/chosen": -288.9937744140625, "logps/rejected": -257.204833984375, "loss": 0.7641, "rewards/accuracies": 0.625, "rewards/chosen": -0.09213514626026154, "rewards/margins": -0.05110941827297211, "rewards/rejected": -0.04102573171257973, "step": 2439 }, { "epoch": 0.37734390102455057, "grad_norm": 6.0918803215026855, "learning_rate": 4.856799175163249e-06, "logits/chosen": 14.06298542022705, "logits/rejected": 10.342275619506836, "logps/chosen": -300.19561767578125, "logps/rejected": -284.68487548828125, "loss": 0.6534, "rewards/accuracies": 0.5, "rewards/chosen": 0.1113671213388443, "rewards/margins": 0.11857785284519196, "rewards/rejected": -0.007210716605186462, "step": 2440 }, { "epoch": 0.3774985501643147, "grad_norm": 5.471554756164551, "learning_rate": 4.856512773513576e-06, "logits/chosen": 12.553479194641113, "logits/rejected": 3.8274283409118652, "logps/chosen": -362.3578796386719, "logps/rejected": -190.26821899414062, "loss": 0.6007, "rewards/accuracies": 0.75, "rewards/chosen": 0.05194535851478577, "rewards/margins": 0.22257833182811737, "rewards/rejected": -0.1706329882144928, "step": 2441 }, { "epoch": 0.37765319930407887, "grad_norm": 6.16045618057251, "learning_rate": 4.856226371863903e-06, "logits/chosen": 9.616735458374023, "logits/rejected": 8.711997985839844, "logps/chosen": -199.96336364746094, "logps/rejected": -198.77276611328125, "loss": 0.6736, "rewards/accuracies": 0.625, "rewards/chosen": -0.19526158273220062, "rewards/margins": 0.12442216277122498, "rewards/rejected": -0.3196837306022644, "step": 2442 }, { "epoch": 0.377807848443843, "grad_norm": 6.503495216369629, "learning_rate": 4.855939970214229e-06, "logits/chosen": 5.232485294342041, "logits/rejected": 5.998983860015869, "logps/chosen": -391.4177551269531, "logps/rejected": -344.562255859375, "loss": 0.5262, "rewards/accuracies": 0.75, "rewards/chosen": 0.430552214384079, "rewards/margins": 0.4784107208251953, "rewards/rejected": -0.047858476638793945, "step": 2443 }, { "epoch": 0.37796249758360717, "grad_norm": 4.429693222045898, "learning_rate": 4.855653568564555e-06, "logits/chosen": 14.097070693969727, "logits/rejected": 3.1066911220550537, "logps/chosen": -283.040771484375, "logps/rejected": -140.26065063476562, "loss": 0.6073, "rewards/accuracies": 0.625, "rewards/chosen": -0.18656319379806519, "rewards/margins": 0.22773678600788116, "rewards/rejected": -0.41429999470710754, "step": 2444 }, { "epoch": 0.3781171467233714, "grad_norm": 5.513053894042969, "learning_rate": 4.855367166914882e-06, "logits/chosen": 4.974415302276611, "logits/rejected": 7.12093448638916, "logps/chosen": -225.90109252929688, "logps/rejected": -246.17764282226562, "loss": 0.6731, "rewards/accuracies": 0.375, "rewards/chosen": 0.09877672791481018, "rewards/margins": 0.09649492800235748, "rewards/rejected": 0.0022817999124526978, "step": 2445 }, { "epoch": 0.37827179586313553, "grad_norm": 4.6857829093933105, "learning_rate": 4.8550807652652085e-06, "logits/chosen": 5.275303840637207, "logits/rejected": 10.315199851989746, "logps/chosen": -151.8195343017578, "logps/rejected": -182.37969970703125, "loss": 0.693, "rewards/accuracies": 0.375, "rewards/chosen": -0.21341568231582642, "rewards/margins": 0.06520751118659973, "rewards/rejected": -0.27862319350242615, "step": 2446 }, { "epoch": 0.3784264450028997, "grad_norm": 17.664249420166016, "learning_rate": 4.854794363615535e-06, "logits/chosen": 5.6250715255737305, "logits/rejected": 3.5292861461639404, "logps/chosen": -325.349609375, "logps/rejected": -256.1471862792969, "loss": 0.7529, "rewards/accuracies": 0.375, "rewards/chosen": 0.06405648589134216, "rewards/margins": -0.07598079741001129, "rewards/rejected": 0.14003726840019226, "step": 2447 }, { "epoch": 0.37858109414266383, "grad_norm": 10.483590126037598, "learning_rate": 4.854507961965862e-06, "logits/chosen": 12.203974723815918, "logits/rejected": 10.922534942626953, "logps/chosen": -227.8511505126953, "logps/rejected": -169.13790893554688, "loss": 0.6498, "rewards/accuracies": 0.5, "rewards/chosen": -0.09709373116493225, "rewards/margins": 0.1330793797969818, "rewards/rejected": -0.23017311096191406, "step": 2448 }, { "epoch": 0.378735743282428, "grad_norm": 11.239873886108398, "learning_rate": 4.8542215603161876e-06, "logits/chosen": 7.109556198120117, "logits/rejected": 7.122159957885742, "logps/chosen": -376.6725158691406, "logps/rejected": -348.2467956542969, "loss": 0.6928, "rewards/accuracies": 0.625, "rewards/chosen": 0.19504088163375854, "rewards/margins": 0.0822812169790268, "rewards/rejected": 0.11275969445705414, "step": 2449 }, { "epoch": 0.37889039242219213, "grad_norm": 5.834151268005371, "learning_rate": 4.853935158666514e-06, "logits/chosen": 11.965173721313477, "logits/rejected": 6.965633392333984, "logps/chosen": -371.366943359375, "logps/rejected": -285.85308837890625, "loss": 0.6524, "rewards/accuracies": 0.375, "rewards/chosen": 0.18527603149414062, "rewards/margins": 0.11812802404165268, "rewards/rejected": 0.06714802980422974, "step": 2450 }, { "epoch": 0.3790450415619563, "grad_norm": 9.095908164978027, "learning_rate": 4.853648757016841e-06, "logits/chosen": 7.213448524475098, "logits/rejected": 8.360918998718262, "logps/chosen": -254.16258239746094, "logps/rejected": -268.68841552734375, "loss": 0.7383, "rewards/accuracies": 0.5, "rewards/chosen": 0.054283831268548965, "rewards/margins": -0.04590270295739174, "rewards/rejected": 0.10018652677536011, "step": 2451 }, { "epoch": 0.3791996907017205, "grad_norm": 8.800716400146484, "learning_rate": 4.8533623553671675e-06, "logits/chosen": 5.24588680267334, "logits/rejected": 5.768891334533691, "logps/chosen": -191.69677734375, "logps/rejected": -211.8994598388672, "loss": 0.6129, "rewards/accuracies": 0.875, "rewards/chosen": 0.0009166207164525986, "rewards/margins": 0.23153649270534515, "rewards/rejected": -0.230619877576828, "step": 2452 }, { "epoch": 0.37935433984148464, "grad_norm": 5.712619781494141, "learning_rate": 4.853075953717493e-06, "logits/chosen": 13.179658889770508, "logits/rejected": 13.897315979003906, "logps/chosen": -435.13336181640625, "logps/rejected": -484.74798583984375, "loss": 0.5517, "rewards/accuracies": 0.875, "rewards/chosen": 0.49636709690093994, "rewards/margins": 0.5442224740982056, "rewards/rejected": -0.047855377197265625, "step": 2453 }, { "epoch": 0.3795089889812488, "grad_norm": 7.024582862854004, "learning_rate": 4.85278955206782e-06, "logits/chosen": 9.524391174316406, "logits/rejected": 15.482491493225098, "logps/chosen": -312.1413269042969, "logps/rejected": -390.7240295410156, "loss": 0.7866, "rewards/accuracies": 0.5, "rewards/chosen": 0.007749810814857483, "rewards/margins": -0.103228360414505, "rewards/rejected": 0.11097818613052368, "step": 2454 }, { "epoch": 0.37966363812101295, "grad_norm": 4.870251655578613, "learning_rate": 4.852503150418147e-06, "logits/chosen": 12.390083312988281, "logits/rejected": 12.206705093383789, "logps/chosen": -229.89613342285156, "logps/rejected": -292.3168029785156, "loss": 0.6252, "rewards/accuracies": 0.625, "rewards/chosen": 0.15041351318359375, "rewards/margins": 0.30681112408638, "rewards/rejected": -0.15639764070510864, "step": 2455 }, { "epoch": 0.3798182872607771, "grad_norm": 4.591142177581787, "learning_rate": 4.852216748768473e-06, "logits/chosen": 9.408971786499023, "logits/rejected": 6.130207538604736, "logps/chosen": -273.287841796875, "logps/rejected": -245.18118286132812, "loss": 0.6123, "rewards/accuracies": 0.75, "rewards/chosen": 0.10999423265457153, "rewards/margins": 0.2851707637310028, "rewards/rejected": -0.17517653107643127, "step": 2456 }, { "epoch": 0.37997293640054125, "grad_norm": 5.637016296386719, "learning_rate": 4.8519303471188e-06, "logits/chosen": 13.771567344665527, "logits/rejected": 7.371804237365723, "logps/chosen": -322.08746337890625, "logps/rejected": -251.43751525878906, "loss": 0.6241, "rewards/accuracies": 0.625, "rewards/chosen": -0.0772399827837944, "rewards/margins": 0.1934891790151596, "rewards/rejected": -0.2707291841506958, "step": 2457 }, { "epoch": 0.38012758554030546, "grad_norm": 15.902688026428223, "learning_rate": 4.851643945469126e-06, "logits/chosen": 10.44197940826416, "logits/rejected": 7.801851272583008, "logps/chosen": -333.44769287109375, "logps/rejected": -323.7859802246094, "loss": 0.674, "rewards/accuracies": 0.5, "rewards/chosen": 0.1803637444972992, "rewards/margins": 0.09277015924453735, "rewards/rejected": 0.08759360760450363, "step": 2458 }, { "epoch": 0.3802822346800696, "grad_norm": 6.0199408531188965, "learning_rate": 4.851357543819452e-06, "logits/chosen": 8.649627685546875, "logits/rejected": 7.019885063171387, "logps/chosen": -264.76947021484375, "logps/rejected": -215.9906463623047, "loss": 0.6809, "rewards/accuracies": 0.5, "rewards/chosen": 0.0520319938659668, "rewards/margins": 0.09728287905454636, "rewards/rejected": -0.04525090008974075, "step": 2459 }, { "epoch": 0.38043688381983376, "grad_norm": 4.741609573364258, "learning_rate": 4.851071142169779e-06, "logits/chosen": 13.077438354492188, "logits/rejected": 7.652851581573486, "logps/chosen": -356.4844665527344, "logps/rejected": -272.8990478515625, "loss": 0.6353, "rewards/accuracies": 0.625, "rewards/chosen": 0.07975941151380539, "rewards/margins": 0.18288972973823547, "rewards/rejected": -0.1031302809715271, "step": 2460 }, { "epoch": 0.3805915329595979, "grad_norm": 5.356295585632324, "learning_rate": 4.850784740520106e-06, "logits/chosen": 11.079095840454102, "logits/rejected": 7.96843147277832, "logps/chosen": -177.5699005126953, "logps/rejected": -200.76068115234375, "loss": 0.7554, "rewards/accuracies": 0.625, "rewards/chosen": -0.18139201402664185, "rewards/margins": 0.04228571057319641, "rewards/rejected": -0.22367773950099945, "step": 2461 }, { "epoch": 0.38074618209936206, "grad_norm": 6.1840972900390625, "learning_rate": 4.850498338870432e-06, "logits/chosen": 5.411108493804932, "logits/rejected": 6.190216541290283, "logps/chosen": -240.19711303710938, "logps/rejected": -190.1748504638672, "loss": 0.7966, "rewards/accuracies": 0.75, "rewards/chosen": 0.07822180539369583, "rewards/margins": -0.12411999702453613, "rewards/rejected": 0.20234179496765137, "step": 2462 }, { "epoch": 0.3809008312391262, "grad_norm": 4.115326404571533, "learning_rate": 4.850211937220759e-06, "logits/chosen": 10.64490032196045, "logits/rejected": 5.670012474060059, "logps/chosen": -299.14886474609375, "logps/rejected": -327.2231750488281, "loss": 0.416, "rewards/accuracies": 0.875, "rewards/chosen": 0.2648065686225891, "rewards/margins": 0.9659033417701721, "rewards/rejected": -0.7010967135429382, "step": 2463 }, { "epoch": 0.38105548037889037, "grad_norm": 4.51612663269043, "learning_rate": 4.849925535571085e-06, "logits/chosen": 8.579599380493164, "logits/rejected": 6.506267547607422, "logps/chosen": -248.03672790527344, "logps/rejected": -188.42611694335938, "loss": 0.5647, "rewards/accuracies": 0.875, "rewards/chosen": 0.1253599226474762, "rewards/margins": 0.31172657012939453, "rewards/rejected": -0.18636667728424072, "step": 2464 }, { "epoch": 0.3812101295186546, "grad_norm": 5.703372478485107, "learning_rate": 4.8496391339214114e-06, "logits/chosen": 5.371494293212891, "logits/rejected": 8.029120445251465, "logps/chosen": -282.0240783691406, "logps/rejected": -277.1210632324219, "loss": 0.6906, "rewards/accuracies": 0.5, "rewards/chosen": 0.1797756552696228, "rewards/margins": 0.046551287174224854, "rewards/rejected": 0.13322439789772034, "step": 2465 }, { "epoch": 0.3813647786584187, "grad_norm": 5.621588230133057, "learning_rate": 4.849352732271738e-06, "logits/chosen": 5.82005500793457, "logits/rejected": 8.25497055053711, "logps/chosen": -256.2855224609375, "logps/rejected": -251.10400390625, "loss": 0.6562, "rewards/accuracies": 0.875, "rewards/chosen": 0.20535221695899963, "rewards/margins": 0.19338560104370117, "rewards/rejected": 0.011966601014137268, "step": 2466 }, { "epoch": 0.3815194277981829, "grad_norm": 36.747825622558594, "learning_rate": 4.849066330622065e-06, "logits/chosen": 15.000533103942871, "logits/rejected": 9.875008583068848, "logps/chosen": -308.3725891113281, "logps/rejected": -317.973876953125, "loss": 0.5727, "rewards/accuracies": 0.625, "rewards/chosen": 0.18519611656665802, "rewards/margins": 0.35210517048835754, "rewards/rejected": -0.16690905392169952, "step": 2467 }, { "epoch": 0.381674076937947, "grad_norm": 5.15939474105835, "learning_rate": 4.848779928972391e-06, "logits/chosen": 9.518121719360352, "logits/rejected": 3.5567731857299805, "logps/chosen": -364.44757080078125, "logps/rejected": -281.7632141113281, "loss": 0.6447, "rewards/accuracies": 0.75, "rewards/chosen": 0.27831774950027466, "rewards/margins": 0.18005478382110596, "rewards/rejected": 0.0982629805803299, "step": 2468 }, { "epoch": 0.3818287260777112, "grad_norm": 5.8277506828308105, "learning_rate": 4.848493527322718e-06, "logits/chosen": 8.182151794433594, "logits/rejected": 4.919912338256836, "logps/chosen": -291.82757568359375, "logps/rejected": -301.93280029296875, "loss": 0.6882, "rewards/accuracies": 0.625, "rewards/chosen": 0.16954413056373596, "rewards/margins": 0.06865277141332626, "rewards/rejected": 0.1008913516998291, "step": 2469 }, { "epoch": 0.38198337521747533, "grad_norm": 7.3568115234375, "learning_rate": 4.848207125673045e-06, "logits/chosen": 9.19233226776123, "logits/rejected": 5.596482276916504, "logps/chosen": -397.4821472167969, "logps/rejected": -285.5025939941406, "loss": 0.7785, "rewards/accuracies": 0.375, "rewards/chosen": 0.2325807511806488, "rewards/margins": -0.005638808012008667, "rewards/rejected": 0.23821955919265747, "step": 2470 }, { "epoch": 0.38213802435723954, "grad_norm": 19.54067611694336, "learning_rate": 4.8479207240233705e-06, "logits/chosen": 4.173725128173828, "logits/rejected": 1.8053724765777588, "logps/chosen": -428.13427734375, "logps/rejected": -344.1834716796875, "loss": 0.7293, "rewards/accuracies": 0.625, "rewards/chosen": 0.44601812958717346, "rewards/margins": 0.024462975561618805, "rewards/rejected": 0.42155516147613525, "step": 2471 }, { "epoch": 0.3822926734970037, "grad_norm": 4.856367588043213, "learning_rate": 4.847634322373697e-06, "logits/chosen": 12.701238632202148, "logits/rejected": 15.662541389465332, "logps/chosen": -146.4423828125, "logps/rejected": -178.07342529296875, "loss": 0.8074, "rewards/accuracies": 0.375, "rewards/chosen": -0.1506718248128891, "rewards/margins": -0.19828256964683533, "rewards/rejected": 0.04761075600981712, "step": 2472 }, { "epoch": 0.38244732263676784, "grad_norm": 6.804742813110352, "learning_rate": 4.847347920724024e-06, "logits/chosen": 7.160064697265625, "logits/rejected": 2.0367302894592285, "logps/chosen": -269.0357666015625, "logps/rejected": -201.4021759033203, "loss": 0.6672, "rewards/accuracies": 0.375, "rewards/chosen": 0.27771008014678955, "rewards/margins": 0.09715823829174042, "rewards/rejected": 0.18055182695388794, "step": 2473 }, { "epoch": 0.382601971776532, "grad_norm": 5.815539360046387, "learning_rate": 4.8470615190743504e-06, "logits/chosen": 4.998912811279297, "logits/rejected": 5.028353214263916, "logps/chosen": -246.58224487304688, "logps/rejected": -313.84161376953125, "loss": 0.8263, "rewards/accuracies": 0.375, "rewards/chosen": 0.034151893109083176, "rewards/margins": -0.08819104731082916, "rewards/rejected": 0.12234293669462204, "step": 2474 }, { "epoch": 0.38275662091629614, "grad_norm": 6.337403297424316, "learning_rate": 4.846775117424677e-06, "logits/chosen": 9.842008590698242, "logits/rejected": 5.350711822509766, "logps/chosen": -342.2659912109375, "logps/rejected": -242.08145141601562, "loss": 0.5456, "rewards/accuracies": 0.625, "rewards/chosen": 0.2532733082771301, "rewards/margins": 0.4525088667869568, "rewards/rejected": -0.19923552870750427, "step": 2475 }, { "epoch": 0.3829112700560603, "grad_norm": 5.715733051300049, "learning_rate": 4.846488715775004e-06, "logits/chosen": 8.07754898071289, "logits/rejected": 8.362151145935059, "logps/chosen": -272.25372314453125, "logps/rejected": -207.8125, "loss": 0.6929, "rewards/accuracies": 0.375, "rewards/chosen": 0.21087270975112915, "rewards/margins": 0.14852365851402283, "rewards/rejected": 0.06234902888536453, "step": 2476 }, { "epoch": 0.3830659191958245, "grad_norm": 5.360029697418213, "learning_rate": 4.8462023141253295e-06, "logits/chosen": 8.912494659423828, "logits/rejected": 12.23288345336914, "logps/chosen": -274.897216796875, "logps/rejected": -328.8795166015625, "loss": 0.6353, "rewards/accuracies": 0.5, "rewards/chosen": 0.28889456391334534, "rewards/margins": 0.2081594169139862, "rewards/rejected": 0.08073515444993973, "step": 2477 }, { "epoch": 0.38322056833558865, "grad_norm": 8.825865745544434, "learning_rate": 4.845915912475656e-06, "logits/chosen": 3.672886848449707, "logits/rejected": 5.712207794189453, "logps/chosen": -229.70065307617188, "logps/rejected": -348.7716064453125, "loss": 0.6415, "rewards/accuracies": 0.5, "rewards/chosen": 0.09264783561229706, "rewards/margins": 0.2627146244049072, "rewards/rejected": -0.17006680369377136, "step": 2478 }, { "epoch": 0.3833752174753528, "grad_norm": 3.5883078575134277, "learning_rate": 4.845629510825983e-06, "logits/chosen": 12.030282020568848, "logits/rejected": 4.84902286529541, "logps/chosen": -209.53594970703125, "logps/rejected": -129.4523162841797, "loss": 0.5904, "rewards/accuracies": 0.75, "rewards/chosen": 0.07043170183897018, "rewards/margins": 0.27338847517967224, "rewards/rejected": -0.20295676589012146, "step": 2479 }, { "epoch": 0.38352986661511695, "grad_norm": 4.4601731300354, "learning_rate": 4.8453431091763095e-06, "logits/chosen": 11.873638153076172, "logits/rejected": 9.215685844421387, "logps/chosen": -130.216064453125, "logps/rejected": -109.36691284179688, "loss": 0.7171, "rewards/accuracies": 0.5, "rewards/chosen": -0.19904479384422302, "rewards/margins": 0.024827249348163605, "rewards/rejected": -0.22387205064296722, "step": 2480 }, { "epoch": 0.3836845157548811, "grad_norm": 6.0697221755981445, "learning_rate": 4.845056707526636e-06, "logits/chosen": 7.541437149047852, "logits/rejected": 7.243613243103027, "logps/chosen": -165.21231079101562, "logps/rejected": -193.79469299316406, "loss": 0.845, "rewards/accuracies": 0.5, "rewards/chosen": -0.06775610893964767, "rewards/margins": -0.18812447786331177, "rewards/rejected": 0.12036838382482529, "step": 2481 }, { "epoch": 0.38383916489464526, "grad_norm": 6.477343559265137, "learning_rate": 4.844770305876963e-06, "logits/chosen": 6.0937886238098145, "logits/rejected": 5.872834205627441, "logps/chosen": -290.815673828125, "logps/rejected": -307.48541259765625, "loss": 0.708, "rewards/accuracies": 0.625, "rewards/chosen": -0.037440016865730286, "rewards/margins": 0.047222621738910675, "rewards/rejected": -0.08466263115406036, "step": 2482 }, { "epoch": 0.3839938140344094, "grad_norm": 4.505267143249512, "learning_rate": 4.844483904227289e-06, "logits/chosen": 11.938836097717285, "logits/rejected": 6.161751747131348, "logps/chosen": -235.750244140625, "logps/rejected": -174.49386596679688, "loss": 0.6523, "rewards/accuracies": 0.625, "rewards/chosen": 0.07571306079626083, "rewards/margins": 0.1427125334739685, "rewards/rejected": -0.06699948757886887, "step": 2483 }, { "epoch": 0.3841484631741736, "grad_norm": 5.465560436248779, "learning_rate": 4.844197502577615e-06, "logits/chosen": 8.367072105407715, "logits/rejected": 4.855196952819824, "logps/chosen": -237.4248809814453, "logps/rejected": -219.48329162597656, "loss": 0.7158, "rewards/accuracies": 0.5, "rewards/chosen": 0.11514270305633545, "rewards/margins": 0.10784518718719482, "rewards/rejected": 0.00729752704501152, "step": 2484 }, { "epoch": 0.38430311231393777, "grad_norm": 5.986687660217285, "learning_rate": 4.843911100927942e-06, "logits/chosen": 7.6505913734436035, "logits/rejected": 5.7743659019470215, "logps/chosen": -259.51507568359375, "logps/rejected": -233.96424865722656, "loss": 0.6931, "rewards/accuracies": 0.375, "rewards/chosen": 0.24423623085021973, "rewards/margins": 0.1518401801586151, "rewards/rejected": 0.09239606559276581, "step": 2485 }, { "epoch": 0.3844577614537019, "grad_norm": 5.960478782653809, "learning_rate": 4.8436246992782685e-06, "logits/chosen": 14.987554550170898, "logits/rejected": 12.703981399536133, "logps/chosen": -341.366943359375, "logps/rejected": -366.9012451171875, "loss": 0.6442, "rewards/accuracies": 0.5, "rewards/chosen": 0.15539970993995667, "rewards/margins": 0.16267462074756622, "rewards/rejected": -0.007274903357028961, "step": 2486 }, { "epoch": 0.38461241059346607, "grad_norm": 5.444278717041016, "learning_rate": 4.843338297628594e-06, "logits/chosen": 5.063711166381836, "logits/rejected": 3.09645938873291, "logps/chosen": -187.91961669921875, "logps/rejected": -200.99444580078125, "loss": 0.7127, "rewards/accuracies": 0.5, "rewards/chosen": 0.022503569722175598, "rewards/margins": 0.046679943799972534, "rewards/rejected": -0.02417636290192604, "step": 2487 }, { "epoch": 0.3847670597332302, "grad_norm": 7.042248249053955, "learning_rate": 4.843051895978921e-06, "logits/chosen": 12.392899513244629, "logits/rejected": 10.489843368530273, "logps/chosen": -368.0589599609375, "logps/rejected": -347.74932861328125, "loss": 0.6701, "rewards/accuracies": 0.625, "rewards/chosen": 0.3248254060745239, "rewards/margins": 0.09664908051490784, "rewards/rejected": 0.2281763106584549, "step": 2488 }, { "epoch": 0.3849217088729944, "grad_norm": 5.045039176940918, "learning_rate": 4.842765494329248e-06, "logits/chosen": 10.553121566772461, "logits/rejected": 7.557656764984131, "logps/chosen": -272.0636291503906, "logps/rejected": -231.60507202148438, "loss": 0.6309, "rewards/accuracies": 0.75, "rewards/chosen": 0.3065963685512543, "rewards/margins": 0.24395127594470978, "rewards/rejected": 0.0626450926065445, "step": 2489 }, { "epoch": 0.3850763580127586, "grad_norm": 6.289703369140625, "learning_rate": 4.842479092679574e-06, "logits/chosen": 6.633242607116699, "logits/rejected": -0.1066126823425293, "logps/chosen": -318.16986083984375, "logps/rejected": -186.30484008789062, "loss": 0.5141, "rewards/accuracies": 0.75, "rewards/chosen": 0.061443231999874115, "rewards/margins": 0.5793236494064331, "rewards/rejected": -0.5178803205490112, "step": 2490 }, { "epoch": 0.38523100715252273, "grad_norm": 5.394789695739746, "learning_rate": 4.842192691029901e-06, "logits/chosen": 8.646957397460938, "logits/rejected": 7.70578670501709, "logps/chosen": -262.5252990722656, "logps/rejected": -265.5787353515625, "loss": 0.6112, "rewards/accuracies": 0.875, "rewards/chosen": 0.2964068055152893, "rewards/margins": 0.18525764346122742, "rewards/rejected": 0.1111491322517395, "step": 2491 }, { "epoch": 0.3853856562922869, "grad_norm": 6.342586994171143, "learning_rate": 4.841906289380227e-06, "logits/chosen": 9.51095962524414, "logits/rejected": 9.00922966003418, "logps/chosen": -263.4512634277344, "logps/rejected": -284.76458740234375, "loss": 0.7145, "rewards/accuracies": 0.625, "rewards/chosen": 0.007275670766830444, "rewards/margins": 0.02656612917780876, "rewards/rejected": -0.019290447235107422, "step": 2492 }, { "epoch": 0.38554030543205103, "grad_norm": 5.253087043762207, "learning_rate": 4.841619887730553e-06, "logits/chosen": 7.767867088317871, "logits/rejected": 7.674994468688965, "logps/chosen": -205.35792541503906, "logps/rejected": -202.5142822265625, "loss": 0.6979, "rewards/accuracies": 0.625, "rewards/chosen": 0.08806352317333221, "rewards/margins": 0.10078594088554382, "rewards/rejected": -0.012722387909889221, "step": 2493 }, { "epoch": 0.3856949545718152, "grad_norm": 5.032226085662842, "learning_rate": 4.84133348608088e-06, "logits/chosen": 5.237201690673828, "logits/rejected": 12.96186637878418, "logps/chosen": -205.50302124023438, "logps/rejected": -314.1517333984375, "loss": 0.641, "rewards/accuracies": 0.625, "rewards/chosen": 0.003928437829017639, "rewards/margins": 0.21162468194961548, "rewards/rejected": -0.20769624412059784, "step": 2494 }, { "epoch": 0.38584960371157934, "grad_norm": 10.512274742126465, "learning_rate": 4.841047084431207e-06, "logits/chosen": 13.693273544311523, "logits/rejected": 12.609396934509277, "logps/chosen": -348.7419738769531, "logps/rejected": -311.8406982421875, "loss": 0.6647, "rewards/accuracies": 0.625, "rewards/chosen": 0.2086874097585678, "rewards/margins": 0.2260618507862091, "rewards/rejected": -0.017374418675899506, "step": 2495 }, { "epoch": 0.3860042528513435, "grad_norm": 5.769652366638184, "learning_rate": 4.840760682781533e-06, "logits/chosen": 10.47608470916748, "logits/rejected": 7.6666669845581055, "logps/chosen": -507.9436950683594, "logps/rejected": -390.9814758300781, "loss": 0.5936, "rewards/accuracies": 0.75, "rewards/chosen": 0.5202432870864868, "rewards/margins": 0.29551029205322266, "rewards/rejected": 0.22473296523094177, "step": 2496 }, { "epoch": 0.3861589019911077, "grad_norm": 5.407959938049316, "learning_rate": 4.840474281131859e-06, "logits/chosen": 8.83450698852539, "logits/rejected": 7.334245681762695, "logps/chosen": -352.4127197265625, "logps/rejected": -279.8899230957031, "loss": 0.6927, "rewards/accuracies": 0.625, "rewards/chosen": 0.0615481361746788, "rewards/margins": 0.04724645987153053, "rewards/rejected": 0.01430167630314827, "step": 2497 }, { "epoch": 0.38631355113087185, "grad_norm": 3.314016342163086, "learning_rate": 4.840187879482186e-06, "logits/chosen": 10.245983123779297, "logits/rejected": 0.1744976043701172, "logps/chosen": -343.8799743652344, "logps/rejected": -170.34158325195312, "loss": 0.5391, "rewards/accuracies": 0.625, "rewards/chosen": 0.33750298619270325, "rewards/margins": 0.4623255729675293, "rewards/rejected": -0.12482261657714844, "step": 2498 }, { "epoch": 0.386468200270636, "grad_norm": 6.9106316566467285, "learning_rate": 4.8399014778325125e-06, "logits/chosen": 5.494254112243652, "logits/rejected": 7.414394378662109, "logps/chosen": -225.8417510986328, "logps/rejected": -301.1886291503906, "loss": 0.8414, "rewards/accuracies": 0.25, "rewards/chosen": -0.174442857503891, "rewards/margins": -0.21674901247024536, "rewards/rejected": 0.04230612516403198, "step": 2499 }, { "epoch": 0.38662284941040015, "grad_norm": 4.893411159515381, "learning_rate": 4.839615076182839e-06, "logits/chosen": 8.914921760559082, "logits/rejected": 5.699570178985596, "logps/chosen": -204.90841674804688, "logps/rejected": -216.63169860839844, "loss": 0.6283, "rewards/accuracies": 0.625, "rewards/chosen": 0.20216476917266846, "rewards/margins": 0.14855243265628815, "rewards/rejected": 0.05361231788992882, "step": 2500 }, { "epoch": 0.3867774985501643, "grad_norm": 4.795331001281738, "learning_rate": 4.839328674533166e-06, "logits/chosen": 10.227174758911133, "logits/rejected": 8.926871299743652, "logps/chosen": -218.94346618652344, "logps/rejected": -223.26644897460938, "loss": 0.6811, "rewards/accuracies": 0.625, "rewards/chosen": 0.2945233881473541, "rewards/margins": 0.07627392560243607, "rewards/rejected": 0.21824945509433746, "step": 2501 }, { "epoch": 0.38693214768992845, "grad_norm": 5.449828624725342, "learning_rate": 4.839042272883492e-06, "logits/chosen": 7.624882698059082, "logits/rejected": 2.345890760421753, "logps/chosen": -359.16888427734375, "logps/rejected": -282.7822570800781, "loss": 0.5046, "rewards/accuracies": 0.875, "rewards/chosen": 0.16612538695335388, "rewards/margins": 0.46880584955215454, "rewards/rejected": -0.30268049240112305, "step": 2502 }, { "epoch": 0.38708679682969266, "grad_norm": 3.7475388050079346, "learning_rate": 4.838755871233819e-06, "logits/chosen": 9.804908752441406, "logits/rejected": 2.026059865951538, "logps/chosen": -201.38687133789062, "logps/rejected": -124.04072570800781, "loss": 0.6381, "rewards/accuracies": 0.625, "rewards/chosen": 0.06194836646318436, "rewards/margins": 0.1669876128435135, "rewards/rejected": -0.10503923892974854, "step": 2503 }, { "epoch": 0.3872414459694568, "grad_norm": 5.21207332611084, "learning_rate": 4.838469469584145e-06, "logits/chosen": 9.508917808532715, "logits/rejected": 7.36872673034668, "logps/chosen": -221.48583984375, "logps/rejected": -193.13037109375, "loss": 0.6793, "rewards/accuracies": 0.5, "rewards/chosen": -0.30535149574279785, "rewards/margins": 0.050060346722602844, "rewards/rejected": -0.3554118573665619, "step": 2504 }, { "epoch": 0.38739609510922096, "grad_norm": 5.580845355987549, "learning_rate": 4.8381830679344715e-06, "logits/chosen": 11.653223991394043, "logits/rejected": -0.8787755966186523, "logps/chosen": -359.5242614746094, "logps/rejected": -245.37086486816406, "loss": 0.5623, "rewards/accuracies": 0.625, "rewards/chosen": 0.33282265067100525, "rewards/margins": 0.374828577041626, "rewards/rejected": -0.04200592264533043, "step": 2505 }, { "epoch": 0.3875507442489851, "grad_norm": 5.351850986480713, "learning_rate": 4.837896666284798e-06, "logits/chosen": 12.371397972106934, "logits/rejected": 8.496321678161621, "logps/chosen": -293.55316162109375, "logps/rejected": -218.14512634277344, "loss": 0.6581, "rewards/accuracies": 0.375, "rewards/chosen": 0.09970217198133469, "rewards/margins": 0.16105279326438904, "rewards/rejected": -0.061350636184215546, "step": 2506 }, { "epoch": 0.38770539338874926, "grad_norm": 5.87265682220459, "learning_rate": 4.837610264635125e-06, "logits/chosen": 12.582254409790039, "logits/rejected": 6.969643592834473, "logps/chosen": -349.2928161621094, "logps/rejected": -305.8788146972656, "loss": 0.5681, "rewards/accuracies": 0.875, "rewards/chosen": 0.3168983459472656, "rewards/margins": 0.368699848651886, "rewards/rejected": -0.051801517605781555, "step": 2507 }, { "epoch": 0.3878600425285134, "grad_norm": 6.033074855804443, "learning_rate": 4.8373238629854515e-06, "logits/chosen": 6.3576765060424805, "logits/rejected": 6.106639862060547, "logps/chosen": -257.6221008300781, "logps/rejected": -257.79742431640625, "loss": 0.7602, "rewards/accuracies": 0.625, "rewards/chosen": -0.023577619343996048, "rewards/margins": 0.003074638545513153, "rewards/rejected": -0.02665223926305771, "step": 2508 }, { "epoch": 0.3880146916682776, "grad_norm": 3.952921152114868, "learning_rate": 4.837037461335778e-06, "logits/chosen": 14.972173690795898, "logits/rejected": 8.424567222595215, "logps/chosen": -254.81655883789062, "logps/rejected": -218.99913024902344, "loss": 0.5634, "rewards/accuracies": 0.625, "rewards/chosen": 0.19369012117385864, "rewards/margins": 0.3433961868286133, "rewards/rejected": -0.14970608055591583, "step": 2509 }, { "epoch": 0.3881693408080418, "grad_norm": 5.90886926651001, "learning_rate": 4.836751059686104e-06, "logits/chosen": 11.192469596862793, "logits/rejected": 10.767522811889648, "logps/chosen": -257.12591552734375, "logps/rejected": -272.29693603515625, "loss": 0.7189, "rewards/accuracies": 0.625, "rewards/chosen": 0.32702887058258057, "rewards/margins": 0.006801605224609375, "rewards/rejected": 0.3202272653579712, "step": 2510 }, { "epoch": 0.3883239899478059, "grad_norm": 5.425315856933594, "learning_rate": 4.8364646580364306e-06, "logits/chosen": 13.872915267944336, "logits/rejected": 7.135364532470703, "logps/chosen": -265.2104187011719, "logps/rejected": -195.73187255859375, "loss": 0.6287, "rewards/accuracies": 0.75, "rewards/chosen": 0.16727611422538757, "rewards/margins": 0.20995265245437622, "rewards/rejected": -0.04267653822898865, "step": 2511 }, { "epoch": 0.3884786390875701, "grad_norm": 6.344537734985352, "learning_rate": 4.836178256386757e-06, "logits/chosen": 6.726906776428223, "logits/rejected": 11.165605545043945, "logps/chosen": -174.9613800048828, "logps/rejected": -220.12994384765625, "loss": 0.9263, "rewards/accuracies": 0.25, "rewards/chosen": -0.37487179040908813, "rewards/margins": -0.375477135181427, "rewards/rejected": 0.0006053522229194641, "step": 2512 }, { "epoch": 0.38863328822733423, "grad_norm": 6.683211803436279, "learning_rate": 4.835891854737084e-06, "logits/chosen": 11.192131042480469, "logits/rejected": 10.389376640319824, "logps/chosen": -368.70855712890625, "logps/rejected": -358.7967529296875, "loss": 0.6662, "rewards/accuracies": 0.5, "rewards/chosen": 0.13360156118869781, "rewards/margins": 0.13726729154586792, "rewards/rejected": -0.003665730357170105, "step": 2513 }, { "epoch": 0.3887879373670984, "grad_norm": 4.948020935058594, "learning_rate": 4.8356054530874105e-06, "logits/chosen": 7.507864475250244, "logits/rejected": 4.774501800537109, "logps/chosen": -231.83563232421875, "logps/rejected": -231.54940795898438, "loss": 0.6304, "rewards/accuracies": 0.75, "rewards/chosen": 0.1089787483215332, "rewards/margins": 0.16259777545928955, "rewards/rejected": -0.05361901968717575, "step": 2514 }, { "epoch": 0.38894258650686253, "grad_norm": 5.454336643218994, "learning_rate": 4.835319051437737e-06, "logits/chosen": 7.162698745727539, "logits/rejected": 9.290777206420898, "logps/chosen": -347.8484191894531, "logps/rejected": -345.4284362792969, "loss": 0.5257, "rewards/accuracies": 0.75, "rewards/chosen": 0.2902758717536926, "rewards/margins": 0.4465555250644684, "rewards/rejected": -0.15627965331077576, "step": 2515 }, { "epoch": 0.38909723564662674, "grad_norm": 5.687977313995361, "learning_rate": 4.835032649788064e-06, "logits/chosen": 9.118745803833008, "logits/rejected": 10.99682331085205, "logps/chosen": -278.5380859375, "logps/rejected": -344.48211669921875, "loss": 0.777, "rewards/accuracies": 0.5, "rewards/chosen": 0.16848069429397583, "rewards/margins": -0.13640569150447845, "rewards/rejected": 0.3048864006996155, "step": 2516 }, { "epoch": 0.3892518847863909, "grad_norm": 6.410073757171631, "learning_rate": 4.83474624813839e-06, "logits/chosen": 10.351490020751953, "logits/rejected": 2.3732974529266357, "logps/chosen": -315.24359130859375, "logps/rejected": -167.97608947753906, "loss": 0.5684, "rewards/accuracies": 0.75, "rewards/chosen": 0.2100776880979538, "rewards/margins": 0.39044898748397827, "rewards/rejected": -0.18037129938602448, "step": 2517 }, { "epoch": 0.38940653392615504, "grad_norm": 7.10020637512207, "learning_rate": 4.834459846488716e-06, "logits/chosen": 9.872756958007812, "logits/rejected": 14.278976440429688, "logps/chosen": -280.0938720703125, "logps/rejected": -345.4791259765625, "loss": 0.8875, "rewards/accuracies": 0.25, "rewards/chosen": -0.17021316289901733, "rewards/margins": -0.2938002347946167, "rewards/rejected": 0.12358702719211578, "step": 2518 }, { "epoch": 0.3895611830659192, "grad_norm": 5.2860236167907715, "learning_rate": 4.834173444839043e-06, "logits/chosen": 10.695123672485352, "logits/rejected": 7.5514116287231445, "logps/chosen": -253.3675537109375, "logps/rejected": -164.79969787597656, "loss": 0.6995, "rewards/accuracies": 0.75, "rewards/chosen": 0.15195079147815704, "rewards/margins": 0.03273802995681763, "rewards/rejected": 0.11921275407075882, "step": 2519 }, { "epoch": 0.38971583220568334, "grad_norm": 7.132540225982666, "learning_rate": 4.83388704318937e-06, "logits/chosen": 4.442205905914307, "logits/rejected": 0.9681739211082458, "logps/chosen": -318.96722412109375, "logps/rejected": -290.3237609863281, "loss": 0.5952, "rewards/accuracies": 0.75, "rewards/chosen": 0.23839467763900757, "rewards/margins": 0.4139556884765625, "rewards/rejected": -0.17556098103523254, "step": 2520 }, { "epoch": 0.3898704813454475, "grad_norm": 20.26692771911621, "learning_rate": 4.833600641539695e-06, "logits/chosen": 9.219143867492676, "logits/rejected": 9.805583953857422, "logps/chosen": -395.576171875, "logps/rejected": -369.728759765625, "loss": 0.6226, "rewards/accuracies": 0.75, "rewards/chosen": 0.442513108253479, "rewards/margins": 0.17555636167526245, "rewards/rejected": 0.26695671677589417, "step": 2521 }, { "epoch": 0.3900251304852117, "grad_norm": 5.81402587890625, "learning_rate": 4.833314239890022e-06, "logits/chosen": 10.73791217803955, "logits/rejected": 10.264984130859375, "logps/chosen": -362.8399658203125, "logps/rejected": -313.6953125, "loss": 0.7329, "rewards/accuracies": 0.5, "rewards/chosen": 0.18623968958854675, "rewards/margins": 0.09422679245471954, "rewards/rejected": 0.09201288223266602, "step": 2522 }, { "epoch": 0.39017977962497585, "grad_norm": 6.399228096008301, "learning_rate": 4.833027838240349e-06, "logits/chosen": 5.795349597930908, "logits/rejected": 10.752036094665527, "logps/chosen": -251.02041625976562, "logps/rejected": -336.62774658203125, "loss": 0.7317, "rewards/accuracies": 0.375, "rewards/chosen": 0.2808384895324707, "rewards/margins": 0.027870308607816696, "rewards/rejected": 0.2529681921005249, "step": 2523 }, { "epoch": 0.39033442876474, "grad_norm": 4.551957130432129, "learning_rate": 4.832741436590675e-06, "logits/chosen": 8.038311004638672, "logits/rejected": 4.359333515167236, "logps/chosen": -267.29779052734375, "logps/rejected": -271.3774719238281, "loss": 0.6954, "rewards/accuracies": 0.5, "rewards/chosen": 0.11396326124668121, "rewards/margins": 0.15602554380893707, "rewards/rejected": -0.04206228256225586, "step": 2524 }, { "epoch": 0.39048907790450416, "grad_norm": 6.611552715301514, "learning_rate": 4.832455034941001e-06, "logits/chosen": 7.531148433685303, "logits/rejected": 5.5594482421875, "logps/chosen": -367.160888671875, "logps/rejected": -371.1960144042969, "loss": 0.6031, "rewards/accuracies": 0.625, "rewards/chosen": 0.3096430003643036, "rewards/margins": 0.4614374041557312, "rewards/rejected": -0.15179443359375, "step": 2525 }, { "epoch": 0.3906437270442683, "grad_norm": 4.8326239585876465, "learning_rate": 4.832168633291328e-06, "logits/chosen": 12.293390274047852, "logits/rejected": 13.100914001464844, "logps/chosen": -208.90545654296875, "logps/rejected": -263.8213806152344, "loss": 0.6037, "rewards/accuracies": 0.625, "rewards/chosen": 0.00018724799156188965, "rewards/margins": 0.2105756253004074, "rewards/rejected": -0.21038836240768433, "step": 2526 }, { "epoch": 0.39079837618403246, "grad_norm": 4.485903739929199, "learning_rate": 4.8318822316416544e-06, "logits/chosen": 5.969025611877441, "logits/rejected": 9.473962783813477, "logps/chosen": -228.4339141845703, "logps/rejected": -265.7933044433594, "loss": 0.6509, "rewards/accuracies": 0.625, "rewards/chosen": -0.09652829170227051, "rewards/margins": 0.15099148452281952, "rewards/rejected": -0.24751979112625122, "step": 2527 }, { "epoch": 0.3909530253237966, "grad_norm": 4.098492622375488, "learning_rate": 4.831595829991981e-06, "logits/chosen": 10.857067108154297, "logits/rejected": 6.082857608795166, "logps/chosen": -309.5762023925781, "logps/rejected": -260.5160217285156, "loss": 0.5213, "rewards/accuracies": 0.75, "rewards/chosen": 0.45469456911087036, "rewards/margins": 0.4366970360279083, "rewards/rejected": 0.017997555434703827, "step": 2528 }, { "epoch": 0.3911076744635608, "grad_norm": 4.350097179412842, "learning_rate": 4.831309428342308e-06, "logits/chosen": 12.32647705078125, "logits/rejected": 6.666042327880859, "logps/chosen": -278.85089111328125, "logps/rejected": -184.8273468017578, "loss": 0.5979, "rewards/accuracies": 0.75, "rewards/chosen": 0.15506428480148315, "rewards/margins": 0.36954832077026367, "rewards/rejected": -0.21448402106761932, "step": 2529 }, { "epoch": 0.39126232360332497, "grad_norm": 8.040081977844238, "learning_rate": 4.8310230266926335e-06, "logits/chosen": 9.168654441833496, "logits/rejected": 2.2769346237182617, "logps/chosen": -503.75732421875, "logps/rejected": -430.2760009765625, "loss": 0.6192, "rewards/accuracies": 0.75, "rewards/chosen": 0.3419133126735687, "rewards/margins": 0.25796353816986084, "rewards/rejected": 0.08394977450370789, "step": 2530 }, { "epoch": 0.3914169727430891, "grad_norm": 3.809129238128662, "learning_rate": 4.83073662504296e-06, "logits/chosen": 8.37309741973877, "logits/rejected": 4.278587818145752, "logps/chosen": -232.55032348632812, "logps/rejected": -192.03054809570312, "loss": 0.4953, "rewards/accuracies": 0.875, "rewards/chosen": 0.3572920560836792, "rewards/margins": 0.48990529775619507, "rewards/rejected": -0.13261322677135468, "step": 2531 }, { "epoch": 0.39157162188285327, "grad_norm": 10.449230194091797, "learning_rate": 4.830450223393287e-06, "logits/chosen": 1.190462589263916, "logits/rejected": -2.7900595664978027, "logps/chosen": -254.5145721435547, "logps/rejected": -143.00790405273438, "loss": 0.7438, "rewards/accuracies": 0.625, "rewards/chosen": -0.19025275111198425, "rewards/margins": -0.023232147097587585, "rewards/rejected": -0.16702061891555786, "step": 2532 }, { "epoch": 0.3917262710226174, "grad_norm": 6.154085159301758, "learning_rate": 4.8301638217436135e-06, "logits/chosen": 11.404205322265625, "logits/rejected": 14.83556842803955, "logps/chosen": -359.0123596191406, "logps/rejected": -366.4436950683594, "loss": 0.7402, "rewards/accuracies": 0.375, "rewards/chosen": 0.003390517085790634, "rewards/margins": -0.0783708393573761, "rewards/rejected": 0.08176136016845703, "step": 2533 }, { "epoch": 0.3918809201623816, "grad_norm": 4.602861404418945, "learning_rate": 4.82987742009394e-06, "logits/chosen": 13.853038787841797, "logits/rejected": 10.441329956054688, "logps/chosen": -239.82672119140625, "logps/rejected": -197.84478759765625, "loss": 0.5633, "rewards/accuracies": 0.625, "rewards/chosen": 0.2919125556945801, "rewards/margins": 0.5329559445381165, "rewards/rejected": -0.24104338884353638, "step": 2534 }, { "epoch": 0.3920355693021458, "grad_norm": 5.717794895172119, "learning_rate": 4.829591018444267e-06, "logits/chosen": 6.497745037078857, "logits/rejected": 9.020249366760254, "logps/chosen": -285.145751953125, "logps/rejected": -373.0025634765625, "loss": 0.616, "rewards/accuracies": 0.75, "rewards/chosen": 0.1580706685781479, "rewards/margins": 0.2852175831794739, "rewards/rejected": -0.12714692950248718, "step": 2535 }, { "epoch": 0.39219021844190993, "grad_norm": 4.775069713592529, "learning_rate": 4.8293046167945934e-06, "logits/chosen": 10.217897415161133, "logits/rejected": 7.607025146484375, "logps/chosen": -306.3009948730469, "logps/rejected": -244.65377807617188, "loss": 0.6108, "rewards/accuracies": 0.625, "rewards/chosen": 0.41449278593063354, "rewards/margins": 0.252946674823761, "rewards/rejected": 0.16154614090919495, "step": 2536 }, { "epoch": 0.3923448675816741, "grad_norm": 8.611421585083008, "learning_rate": 4.829018215144919e-06, "logits/chosen": 8.86595630645752, "logits/rejected": 3.967453956604004, "logps/chosen": -289.7342529296875, "logps/rejected": -245.5177459716797, "loss": 0.9922, "rewards/accuracies": 0.125, "rewards/chosen": -0.28698402643203735, "rewards/margins": -0.48094645142555237, "rewards/rejected": 0.19396242499351501, "step": 2537 }, { "epoch": 0.39249951672143824, "grad_norm": 5.9621782302856445, "learning_rate": 4.828731813495246e-06, "logits/chosen": 7.559358596801758, "logits/rejected": 2.3897948265075684, "logps/chosen": -267.21783447265625, "logps/rejected": -213.94320678710938, "loss": 0.6933, "rewards/accuracies": 0.375, "rewards/chosen": 0.03697100281715393, "rewards/margins": 0.06150517985224724, "rewards/rejected": -0.02453417330980301, "step": 2538 }, { "epoch": 0.3926541658612024, "grad_norm": 5.176915645599365, "learning_rate": 4.8284454118455725e-06, "logits/chosen": 9.754781723022461, "logits/rejected": 7.905757427215576, "logps/chosen": -243.39599609375, "logps/rejected": -216.9149169921875, "loss": 0.5959, "rewards/accuracies": 0.75, "rewards/chosen": 0.015784405171871185, "rewards/margins": 0.2242448776960373, "rewards/rejected": -0.20846045017242432, "step": 2539 }, { "epoch": 0.39280881500096654, "grad_norm": 6.611644744873047, "learning_rate": 4.828159010195899e-06, "logits/chosen": 10.470954895019531, "logits/rejected": 5.715402603149414, "logps/chosen": -257.9112854003906, "logps/rejected": -214.12939453125, "loss": 0.7329, "rewards/accuracies": 0.375, "rewards/chosen": -0.05962109938263893, "rewards/margins": 0.0030518919229507446, "rewards/rejected": -0.06267299503087997, "step": 2540 }, { "epoch": 0.39296346414073074, "grad_norm": 4.98043155670166, "learning_rate": 4.827872608546226e-06, "logits/chosen": 6.524811267852783, "logits/rejected": 8.880760192871094, "logps/chosen": -196.36111450195312, "logps/rejected": -215.7021484375, "loss": 0.6784, "rewards/accuracies": 0.75, "rewards/chosen": 0.11949749290943146, "rewards/margins": 0.09556332230567932, "rewards/rejected": 0.02393416315317154, "step": 2541 }, { "epoch": 0.3931181132804949, "grad_norm": 6.939080715179443, "learning_rate": 4.8275862068965525e-06, "logits/chosen": 12.446451187133789, "logits/rejected": 8.930511474609375, "logps/chosen": -355.4482421875, "logps/rejected": -376.34698486328125, "loss": 0.9135, "rewards/accuracies": 0.25, "rewards/chosen": -0.06933679431676865, "rewards/margins": -0.26839303970336914, "rewards/rejected": 0.1990562379360199, "step": 2542 }, { "epoch": 0.39327276242025905, "grad_norm": 5.849193096160889, "learning_rate": 4.827299805246878e-06, "logits/chosen": 9.99445915222168, "logits/rejected": 4.4871110916137695, "logps/chosen": -388.3697509765625, "logps/rejected": -235.374267578125, "loss": 0.5368, "rewards/accuracies": 0.75, "rewards/chosen": 0.16586171090602875, "rewards/margins": 0.4051932990550995, "rewards/rejected": -0.23933160305023193, "step": 2543 }, { "epoch": 0.3934274115600232, "grad_norm": 4.978914737701416, "learning_rate": 4.827013403597205e-06, "logits/chosen": 6.21391487121582, "logits/rejected": 6.505616188049316, "logps/chosen": -233.50897216796875, "logps/rejected": -194.19171142578125, "loss": 0.6895, "rewards/accuracies": 0.5, "rewards/chosen": 0.11609721183776855, "rewards/margins": 0.03347183018922806, "rewards/rejected": 0.0826253816485405, "step": 2544 }, { "epoch": 0.39358206069978735, "grad_norm": 6.409840106964111, "learning_rate": 4.826727001947532e-06, "logits/chosen": 8.295774459838867, "logits/rejected": 2.168236494064331, "logps/chosen": -216.91693115234375, "logps/rejected": -179.8815155029297, "loss": 0.8029, "rewards/accuracies": 0.5, "rewards/chosen": -0.5232242941856384, "rewards/margins": -0.07691755890846252, "rewards/rejected": -0.4463067054748535, "step": 2545 }, { "epoch": 0.3937367098395515, "grad_norm": 5.3077073097229, "learning_rate": 4.826440600297858e-06, "logits/chosen": 7.847306251525879, "logits/rejected": 6.454771518707275, "logps/chosen": -305.9904479980469, "logps/rejected": -312.0799560546875, "loss": 0.59, "rewards/accuracies": 0.625, "rewards/chosen": -0.18082857131958008, "rewards/margins": 0.29177790880203247, "rewards/rejected": -0.47260648012161255, "step": 2546 }, { "epoch": 0.39389135897931565, "grad_norm": 4.155510902404785, "learning_rate": 4.826154198648185e-06, "logits/chosen": 10.758145332336426, "logits/rejected": 11.782370567321777, "logps/chosen": -229.51902770996094, "logps/rejected": -218.76719665527344, "loss": 0.6103, "rewards/accuracies": 0.75, "rewards/chosen": -0.01856493577361107, "rewards/margins": 0.24198251962661743, "rewards/rejected": -0.2605474591255188, "step": 2547 }, { "epoch": 0.39404600811907986, "grad_norm": 4.827736854553223, "learning_rate": 4.8258677969985116e-06, "logits/chosen": 11.421258926391602, "logits/rejected": 10.085678100585938, "logps/chosen": -259.4638671875, "logps/rejected": -254.18930053710938, "loss": 0.628, "rewards/accuracies": 0.625, "rewards/chosen": -0.0431852862238884, "rewards/margins": 0.2988182306289673, "rewards/rejected": -0.3420035243034363, "step": 2548 }, { "epoch": 0.394200657258844, "grad_norm": 3.9456310272216797, "learning_rate": 4.825581395348838e-06, "logits/chosen": 5.618587493896484, "logits/rejected": 3.565340042114258, "logps/chosen": -206.11956787109375, "logps/rejected": -254.37168884277344, "loss": 0.5384, "rewards/accuracies": 0.875, "rewards/chosen": 0.12535491585731506, "rewards/margins": 0.4303221106529236, "rewards/rejected": -0.3049672245979309, "step": 2549 }, { "epoch": 0.39435530639860816, "grad_norm": 6.977304458618164, "learning_rate": 4.825294993699164e-06, "logits/chosen": 6.748723030090332, "logits/rejected": 4.894326686859131, "logps/chosen": -328.8692932128906, "logps/rejected": -311.1910400390625, "loss": 0.6805, "rewards/accuracies": 0.625, "rewards/chosen": 0.23720084130764008, "rewards/margins": 0.09849634766578674, "rewards/rejected": 0.13870447874069214, "step": 2550 }, { "epoch": 0.3945099555383723, "grad_norm": 4.787998199462891, "learning_rate": 4.825008592049491e-06, "logits/chosen": 3.9990077018737793, "logits/rejected": 5.439508438110352, "logps/chosen": -238.10952758789062, "logps/rejected": -234.81243896484375, "loss": 0.6523, "rewards/accuracies": 0.75, "rewards/chosen": 0.09690293669700623, "rewards/margins": 0.14090105891227722, "rewards/rejected": -0.043998122215270996, "step": 2551 }, { "epoch": 0.39466460467813647, "grad_norm": 4.951591968536377, "learning_rate": 4.824722190399817e-06, "logits/chosen": 17.354629516601562, "logits/rejected": 7.6628875732421875, "logps/chosen": -285.8321533203125, "logps/rejected": -166.01473999023438, "loss": 0.7237, "rewards/accuracies": 0.5, "rewards/chosen": 0.11677198112010956, "rewards/margins": 0.06048339605331421, "rewards/rejected": 0.05628858506679535, "step": 2552 }, { "epoch": 0.3948192538179006, "grad_norm": 6.390881061553955, "learning_rate": 4.824435788750144e-06, "logits/chosen": 9.896942138671875, "logits/rejected": 6.755846977233887, "logps/chosen": -333.674072265625, "logps/rejected": -318.6329040527344, "loss": 0.6244, "rewards/accuracies": 0.5, "rewards/chosen": 0.24484139680862427, "rewards/margins": 0.20574340224266052, "rewards/rejected": 0.03909797966480255, "step": 2553 }, { "epoch": 0.3949739029576648, "grad_norm": 6.148293972015381, "learning_rate": 4.824149387100471e-06, "logits/chosen": 10.40079116821289, "logits/rejected": 6.3178019523620605, "logps/chosen": -325.0823974609375, "logps/rejected": -211.243896484375, "loss": 0.7218, "rewards/accuracies": 0.625, "rewards/chosen": 0.08466396480798721, "rewards/margins": 0.008593171834945679, "rewards/rejected": 0.07607080042362213, "step": 2554 }, { "epoch": 0.395128552097429, "grad_norm": 5.462231159210205, "learning_rate": 4.823862985450796e-06, "logits/chosen": 11.206364631652832, "logits/rejected": 5.20212459564209, "logps/chosen": -345.7375793457031, "logps/rejected": -241.34317016601562, "loss": 0.5237, "rewards/accuracies": 0.75, "rewards/chosen": 0.2157306969165802, "rewards/margins": 0.43911927938461304, "rewards/rejected": -0.22338858246803284, "step": 2555 }, { "epoch": 0.3952832012371931, "grad_norm": 6.3288187980651855, "learning_rate": 4.823576583801123e-06, "logits/chosen": 2.3407130241394043, "logits/rejected": 10.840267181396484, "logps/chosen": -244.71420288085938, "logps/rejected": -304.001708984375, "loss": 0.6252, "rewards/accuracies": 0.75, "rewards/chosen": 0.3951888680458069, "rewards/margins": 0.27616190910339355, "rewards/rejected": 0.11902695149183273, "step": 2556 }, { "epoch": 0.3954378503769573, "grad_norm": 5.270389556884766, "learning_rate": 4.82329018215145e-06, "logits/chosen": 6.011882305145264, "logits/rejected": 8.376742362976074, "logps/chosen": -194.97410583496094, "logps/rejected": -157.25685119628906, "loss": 0.8573, "rewards/accuracies": 0.25, "rewards/chosen": -0.2020559161901474, "rewards/margins": -0.22629514336585999, "rewards/rejected": 0.02423921972513199, "step": 2557 }, { "epoch": 0.39559249951672143, "grad_norm": 5.609000205993652, "learning_rate": 4.823003780501776e-06, "logits/chosen": 6.962016582489014, "logits/rejected": 6.735646724700928, "logps/chosen": -230.80062866210938, "logps/rejected": -273.57666015625, "loss": 0.6835, "rewards/accuracies": 0.625, "rewards/chosen": 0.11239200830459595, "rewards/margins": 0.12234941124916077, "rewards/rejected": -0.009957395493984222, "step": 2558 }, { "epoch": 0.3957471486564856, "grad_norm": 5.15081262588501, "learning_rate": 4.822717378852102e-06, "logits/chosen": 10.177614212036133, "logits/rejected": 5.896312236785889, "logps/chosen": -327.22991943359375, "logps/rejected": -244.12928771972656, "loss": 0.6143, "rewards/accuracies": 0.5, "rewards/chosen": 0.10368603467941284, "rewards/margins": 0.1988917589187622, "rewards/rejected": -0.09520575404167175, "step": 2559 }, { "epoch": 0.39590179779624973, "grad_norm": 5.369154930114746, "learning_rate": 4.822430977202429e-06, "logits/chosen": 12.63347053527832, "logits/rejected": 5.854383945465088, "logps/chosen": -336.1552429199219, "logps/rejected": -227.9302215576172, "loss": 0.6189, "rewards/accuracies": 0.5, "rewards/chosen": 0.3282032012939453, "rewards/margins": 0.19863511621952057, "rewards/rejected": 0.12956809997558594, "step": 2560 }, { "epoch": 0.39605644693601394, "grad_norm": 10.721266746520996, "learning_rate": 4.8221445755527555e-06, "logits/chosen": 9.032099723815918, "logits/rejected": 8.536942481994629, "logps/chosen": -345.3525390625, "logps/rejected": -327.91558837890625, "loss": 0.6886, "rewards/accuracies": 0.5, "rewards/chosen": 0.2865002751350403, "rewards/margins": 0.10141459852457047, "rewards/rejected": 0.18508568406105042, "step": 2561 }, { "epoch": 0.3962110960757781, "grad_norm": 6.783220291137695, "learning_rate": 4.821858173903082e-06, "logits/chosen": 7.5616960525512695, "logits/rejected": 5.122775554656982, "logps/chosen": -215.74496459960938, "logps/rejected": -163.34182739257812, "loss": 0.674, "rewards/accuracies": 0.75, "rewards/chosen": 0.037495702505111694, "rewards/margins": 0.08999853581190109, "rewards/rejected": -0.052502818405628204, "step": 2562 }, { "epoch": 0.39636574521554224, "grad_norm": 4.724612236022949, "learning_rate": 4.821571772253408e-06, "logits/chosen": 8.274641990661621, "logits/rejected": 1.1090553998947144, "logps/chosen": -338.54632568359375, "logps/rejected": -219.84732055664062, "loss": 0.5656, "rewards/accuracies": 0.625, "rewards/chosen": 0.48839202523231506, "rewards/margins": 0.36774560809135437, "rewards/rejected": 0.12064642459154129, "step": 2563 }, { "epoch": 0.3965203943553064, "grad_norm": 5.487222194671631, "learning_rate": 4.8212853706037346e-06, "logits/chosen": 13.157936096191406, "logits/rejected": 9.264841079711914, "logps/chosen": -352.5703430175781, "logps/rejected": -301.8311462402344, "loss": 0.5706, "rewards/accuracies": 0.75, "rewards/chosen": 0.3016829490661621, "rewards/margins": 0.3600809574127197, "rewards/rejected": -0.058398015797138214, "step": 2564 }, { "epoch": 0.39667504349507055, "grad_norm": 13.406632423400879, "learning_rate": 4.820998968954061e-06, "logits/chosen": 10.260294914245605, "logits/rejected": 9.320372581481934, "logps/chosen": -391.321044921875, "logps/rejected": -369.4396057128906, "loss": 0.6286, "rewards/accuracies": 0.625, "rewards/chosen": 0.23625889420509338, "rewards/margins": 0.2528381943702698, "rewards/rejected": -0.016579288989305496, "step": 2565 }, { "epoch": 0.3968296926348347, "grad_norm": 5.284693717956543, "learning_rate": 4.820712567304388e-06, "logits/chosen": 12.277772903442383, "logits/rejected": 15.693747520446777, "logps/chosen": -270.925048828125, "logps/rejected": -298.5973815917969, "loss": 0.6, "rewards/accuracies": 0.75, "rewards/chosen": 0.32151374220848083, "rewards/margins": 0.2964085638523102, "rewards/rejected": 0.025105183944106102, "step": 2566 }, { "epoch": 0.3969843417745989, "grad_norm": 3.9091832637786865, "learning_rate": 4.8204261656547145e-06, "logits/chosen": 11.815536499023438, "logits/rejected": 11.500572204589844, "logps/chosen": -223.6182861328125, "logps/rejected": -239.14381408691406, "loss": 0.5381, "rewards/accuracies": 0.75, "rewards/chosen": 0.09688058495521545, "rewards/margins": 0.3870798349380493, "rewards/rejected": -0.29019924998283386, "step": 2567 }, { "epoch": 0.39713899091436305, "grad_norm": 47.65726852416992, "learning_rate": 4.820139764005041e-06, "logits/chosen": 7.426468849182129, "logits/rejected": 7.232683181762695, "logps/chosen": -318.4400634765625, "logps/rejected": -312.878662109375, "loss": 0.6273, "rewards/accuracies": 0.875, "rewards/chosen": 0.25258469581604004, "rewards/margins": 0.16212821006774902, "rewards/rejected": 0.09045649319887161, "step": 2568 }, { "epoch": 0.3972936400541272, "grad_norm": 6.041717052459717, "learning_rate": 4.819853362355368e-06, "logits/chosen": 11.273836135864258, "logits/rejected": 12.104462623596191, "logps/chosen": -286.3793640136719, "logps/rejected": -298.6015319824219, "loss": 0.666, "rewards/accuracies": 0.5, "rewards/chosen": 0.18093900382518768, "rewards/margins": 0.10206803679466248, "rewards/rejected": 0.07887096703052521, "step": 2569 }, { "epoch": 0.39744828919389136, "grad_norm": 7.853445053100586, "learning_rate": 4.819566960705694e-06, "logits/chosen": 10.768254280090332, "logits/rejected": 7.515294551849365, "logps/chosen": -415.8226623535156, "logps/rejected": -295.79388427734375, "loss": 0.7132, "rewards/accuracies": 0.625, "rewards/chosen": 0.1591348648071289, "rewards/margins": 0.0030603818595409393, "rewards/rejected": 0.15607449412345886, "step": 2570 }, { "epoch": 0.3976029383336555, "grad_norm": 2.905759811401367, "learning_rate": 4.81928055905602e-06, "logits/chosen": 7.475478172302246, "logits/rejected": 3.498634099960327, "logps/chosen": -190.98486328125, "logps/rejected": -158.92971801757812, "loss": 0.4804, "rewards/accuracies": 0.875, "rewards/chosen": 0.14990076422691345, "rewards/margins": 0.6052706241607666, "rewards/rejected": -0.45536985993385315, "step": 2571 }, { "epoch": 0.39775758747341966, "grad_norm": 3.9463183879852295, "learning_rate": 4.818994157406347e-06, "logits/chosen": 13.09033489227295, "logits/rejected": 9.920575141906738, "logps/chosen": -195.91567993164062, "logps/rejected": -171.99148559570312, "loss": 0.6382, "rewards/accuracies": 0.5, "rewards/chosen": 0.11033868789672852, "rewards/margins": 0.18787871301174164, "rewards/rejected": -0.07754003256559372, "step": 2572 }, { "epoch": 0.3979122366131838, "grad_norm": 6.394832611083984, "learning_rate": 4.818707755756674e-06, "logits/chosen": 11.797989845275879, "logits/rejected": 3.8984971046447754, "logps/chosen": -306.2153625488281, "logps/rejected": -279.37811279296875, "loss": 0.677, "rewards/accuracies": 0.5, "rewards/chosen": 0.272072434425354, "rewards/margins": 0.1564340889453888, "rewards/rejected": 0.11563833057880402, "step": 2573 }, { "epoch": 0.398066885752948, "grad_norm": 5.186899185180664, "learning_rate": 4.818421354107e-06, "logits/chosen": 8.865129470825195, "logits/rejected": 3.626326560974121, "logps/chosen": -368.5791931152344, "logps/rejected": -319.8849182128906, "loss": 0.5948, "rewards/accuracies": 1.0, "rewards/chosen": 0.3122440278530121, "rewards/margins": 0.21599093079566956, "rewards/rejected": 0.09625311195850372, "step": 2574 }, { "epoch": 0.39822153489271217, "grad_norm": 9.321632385253906, "learning_rate": 4.818134952457327e-06, "logits/chosen": 9.455666542053223, "logits/rejected": 1.7659724950790405, "logps/chosen": -331.2635803222656, "logps/rejected": -242.7117462158203, "loss": 0.6646, "rewards/accuracies": 0.5, "rewards/chosen": 0.048567600548267365, "rewards/margins": 0.11471579968929291, "rewards/rejected": -0.06614819169044495, "step": 2575 }, { "epoch": 0.3983761840324763, "grad_norm": 4.7136664390563965, "learning_rate": 4.817848550807653e-06, "logits/chosen": 10.74659538269043, "logits/rejected": 11.096921920776367, "logps/chosen": -293.65875244140625, "logps/rejected": -247.22581481933594, "loss": 0.6089, "rewards/accuracies": 0.5, "rewards/chosen": 0.2365730255842209, "rewards/margins": 0.2898835837841034, "rewards/rejected": -0.0533105731010437, "step": 2576 }, { "epoch": 0.3985308331722405, "grad_norm": 4.790188312530518, "learning_rate": 4.817562149157979e-06, "logits/chosen": 16.982070922851562, "logits/rejected": 6.605229377746582, "logps/chosen": -309.56585693359375, "logps/rejected": -224.42669677734375, "loss": 0.5229, "rewards/accuracies": 0.875, "rewards/chosen": 0.4406345784664154, "rewards/margins": 0.4002976417541504, "rewards/rejected": 0.04033694788813591, "step": 2577 }, { "epoch": 0.3986854823120046, "grad_norm": 3.9144034385681152, "learning_rate": 4.817275747508306e-06, "logits/chosen": 12.170516014099121, "logits/rejected": 7.328759670257568, "logps/chosen": -259.0719909667969, "logps/rejected": -173.70632934570312, "loss": 0.6003, "rewards/accuracies": 0.625, "rewards/chosen": 0.35708701610565186, "rewards/margins": 0.2333994209766388, "rewards/rejected": 0.12368758767843246, "step": 2578 }, { "epoch": 0.3988401314517688, "grad_norm": 5.627710819244385, "learning_rate": 4.816989345858633e-06, "logits/chosen": 9.870912551879883, "logits/rejected": 10.388496398925781, "logps/chosen": -261.9947204589844, "logps/rejected": -227.18063354492188, "loss": 0.8076, "rewards/accuracies": 0.375, "rewards/chosen": 0.06697288155555725, "rewards/margins": -0.13728280365467072, "rewards/rejected": 0.20425567030906677, "step": 2579 }, { "epoch": 0.398994780591533, "grad_norm": 4.179599285125732, "learning_rate": 4.816702944208959e-06, "logits/chosen": 10.428400993347168, "logits/rejected": 3.407343864440918, "logps/chosen": -225.10702514648438, "logps/rejected": -121.5801773071289, "loss": 0.6205, "rewards/accuracies": 0.75, "rewards/chosen": 0.15225005149841309, "rewards/margins": 0.17652475833892822, "rewards/rejected": -0.024274736642837524, "step": 2580 }, { "epoch": 0.39914942973129713, "grad_norm": 3.1337106227874756, "learning_rate": 4.816416542559286e-06, "logits/chosen": 10.498144149780273, "logits/rejected": 5.005775451660156, "logps/chosen": -181.6263427734375, "logps/rejected": -121.93711853027344, "loss": 0.5385, "rewards/accuracies": 0.625, "rewards/chosen": 0.33230575919151306, "rewards/margins": 0.42928481101989746, "rewards/rejected": -0.09697907418012619, "step": 2581 }, { "epoch": 0.3993040788710613, "grad_norm": 4.620580196380615, "learning_rate": 4.816130140909613e-06, "logits/chosen": 8.123567581176758, "logits/rejected": 1.9313759803771973, "logps/chosen": -300.9485778808594, "logps/rejected": -255.19776916503906, "loss": 0.5571, "rewards/accuracies": 0.625, "rewards/chosen": 0.3754594922065735, "rewards/margins": 0.35716065764427185, "rewards/rejected": 0.018298834562301636, "step": 2582 }, { "epoch": 0.39945872801082544, "grad_norm": 4.154801845550537, "learning_rate": 4.815843739259938e-06, "logits/chosen": 11.134706497192383, "logits/rejected": 5.583846092224121, "logps/chosen": -248.82884216308594, "logps/rejected": -157.994384765625, "loss": 0.5464, "rewards/accuracies": 0.75, "rewards/chosen": 0.35838425159454346, "rewards/margins": 0.3369155526161194, "rewards/rejected": 0.021468685939908028, "step": 2583 }, { "epoch": 0.3996133771505896, "grad_norm": 9.80989933013916, "learning_rate": 4.815557337610265e-06, "logits/chosen": 11.670622825622559, "logits/rejected": 9.94440746307373, "logps/chosen": -289.3100280761719, "logps/rejected": -256.14117431640625, "loss": 0.8284, "rewards/accuracies": 0.375, "rewards/chosen": 0.249995619058609, "rewards/margins": -0.196833997964859, "rewards/rejected": 0.44682958722114563, "step": 2584 }, { "epoch": 0.39976802629035374, "grad_norm": 10.237334251403809, "learning_rate": 4.815270935960592e-06, "logits/chosen": 11.217272758483887, "logits/rejected": 9.922136306762695, "logps/chosen": -227.38626098632812, "logps/rejected": -226.56753540039062, "loss": 0.6419, "rewards/accuracies": 0.625, "rewards/chosen": 0.3075547218322754, "rewards/margins": 0.17571119964122772, "rewards/rejected": 0.13184353709220886, "step": 2585 }, { "epoch": 0.39992267543011795, "grad_norm": 7.261939525604248, "learning_rate": 4.814984534310918e-06, "logits/chosen": 9.860733032226562, "logits/rejected": 7.640769004821777, "logps/chosen": -213.65786743164062, "logps/rejected": -224.3811798095703, "loss": 0.5289, "rewards/accuracies": 0.875, "rewards/chosen": 0.30839210748672485, "rewards/margins": 0.37613165378570557, "rewards/rejected": -0.06773953884840012, "step": 2586 }, { "epoch": 0.4000773245698821, "grad_norm": 6.840734004974365, "learning_rate": 4.814698132661245e-06, "logits/chosen": 13.316822052001953, "logits/rejected": 9.663893699645996, "logps/chosen": -434.3657531738281, "logps/rejected": -384.26287841796875, "loss": 0.6405, "rewards/accuracies": 0.5, "rewards/chosen": 0.38900166749954224, "rewards/margins": 0.28886890411376953, "rewards/rejected": 0.10013273358345032, "step": 2587 }, { "epoch": 0.40023197370964625, "grad_norm": 5.709846019744873, "learning_rate": 4.814411731011572e-06, "logits/chosen": 13.386789321899414, "logits/rejected": 10.11871337890625, "logps/chosen": -342.57373046875, "logps/rejected": -325.471435546875, "loss": 0.6783, "rewards/accuracies": 0.5, "rewards/chosen": 0.5118557214736938, "rewards/margins": 0.26147741079330444, "rewards/rejected": 0.250378280878067, "step": 2588 }, { "epoch": 0.4003866228494104, "grad_norm": 5.633150100708008, "learning_rate": 4.8141253293618974e-06, "logits/chosen": 7.937169551849365, "logits/rejected": 5.875890731811523, "logps/chosen": -365.937255859375, "logps/rejected": -312.9879150390625, "loss": 0.6544, "rewards/accuracies": 0.625, "rewards/chosen": 0.5335246920585632, "rewards/margins": 0.2694160044193268, "rewards/rejected": 0.26410868763923645, "step": 2589 }, { "epoch": 0.40054127198917455, "grad_norm": 4.867786407470703, "learning_rate": 4.813838927712224e-06, "logits/chosen": 7.035674571990967, "logits/rejected": 5.373874187469482, "logps/chosen": -268.3909912109375, "logps/rejected": -227.382568359375, "loss": 0.6692, "rewards/accuracies": 0.5, "rewards/chosen": 0.4062163531780243, "rewards/margins": 0.14622172713279724, "rewards/rejected": 0.25999459624290466, "step": 2590 }, { "epoch": 0.4006959211289387, "grad_norm": 6.201251983642578, "learning_rate": 4.813552526062551e-06, "logits/chosen": 9.580256462097168, "logits/rejected": 8.534006118774414, "logps/chosen": -276.1340026855469, "logps/rejected": -275.29229736328125, "loss": 0.7921, "rewards/accuracies": 0.375, "rewards/chosen": 0.18257032334804535, "rewards/margins": -0.1590750366449356, "rewards/rejected": 0.34164533019065857, "step": 2591 }, { "epoch": 0.40085057026870285, "grad_norm": 6.719576358795166, "learning_rate": 4.813266124412877e-06, "logits/chosen": 9.957694053649902, "logits/rejected": 5.490819454193115, "logps/chosen": -234.45053100585938, "logps/rejected": -248.4857177734375, "loss": 0.776, "rewards/accuracies": 0.25, "rewards/chosen": 0.2196652591228485, "rewards/margins": -0.14617657661437988, "rewards/rejected": 0.365841805934906, "step": 2592 }, { "epoch": 0.40100521940846706, "grad_norm": 5.586428642272949, "learning_rate": 4.812979722763203e-06, "logits/chosen": 11.509471893310547, "logits/rejected": 13.922584533691406, "logps/chosen": -242.5986328125, "logps/rejected": -238.77996826171875, "loss": 0.6286, "rewards/accuracies": 0.625, "rewards/chosen": 0.41750866174697876, "rewards/margins": 0.2210138887166977, "rewards/rejected": 0.19649478793144226, "step": 2593 }, { "epoch": 0.4011598685482312, "grad_norm": 5.766911506652832, "learning_rate": 4.81269332111353e-06, "logits/chosen": 14.337347984313965, "logits/rejected": 11.379358291625977, "logps/chosen": -355.22607421875, "logps/rejected": -276.8265380859375, "loss": 0.5631, "rewards/accuracies": 0.625, "rewards/chosen": 0.7882957458496094, "rewards/margins": 0.39532890915870667, "rewards/rejected": 0.3929668664932251, "step": 2594 }, { "epoch": 0.40131451768799536, "grad_norm": 9.718783378601074, "learning_rate": 4.8124069194638565e-06, "logits/chosen": 0.02197432518005371, "logits/rejected": 6.0790910720825195, "logps/chosen": -186.4334259033203, "logps/rejected": -474.4280700683594, "loss": 0.9066, "rewards/accuracies": 0.625, "rewards/chosen": 0.012886762619018555, "rewards/margins": -0.28824201226234436, "rewards/rejected": 0.3011287748813629, "step": 2595 }, { "epoch": 0.4014691668277595, "grad_norm": 4.246051788330078, "learning_rate": 4.812120517814183e-06, "logits/chosen": 3.56642484664917, "logits/rejected": 3.160365104675293, "logps/chosen": -156.90469360351562, "logps/rejected": -136.982666015625, "loss": 0.6667, "rewards/accuracies": 0.75, "rewards/chosen": -0.01299332082271576, "rewards/margins": 0.1608019918203354, "rewards/rejected": -0.17379531264305115, "step": 2596 }, { "epoch": 0.40162381596752367, "grad_norm": 5.396277904510498, "learning_rate": 4.811834116164509e-06, "logits/chosen": 10.982786178588867, "logits/rejected": 7.774756908416748, "logps/chosen": -270.47857666015625, "logps/rejected": -193.02224731445312, "loss": 0.6256, "rewards/accuracies": 0.875, "rewards/chosen": 0.1582435965538025, "rewards/margins": 0.20662838220596313, "rewards/rejected": -0.04838474094867706, "step": 2597 }, { "epoch": 0.4017784651072878, "grad_norm": 4.19468355178833, "learning_rate": 4.811547714514836e-06, "logits/chosen": 9.862807273864746, "logits/rejected": 8.71035385131836, "logps/chosen": -209.9136199951172, "logps/rejected": -217.19625854492188, "loss": 0.5702, "rewards/accuracies": 0.75, "rewards/chosen": 0.19352124631404877, "rewards/margins": 0.3395649790763855, "rewards/rejected": -0.14604373276233673, "step": 2598 }, { "epoch": 0.401933114247052, "grad_norm": 7.871066570281982, "learning_rate": 4.811261312865162e-06, "logits/chosen": 11.740633010864258, "logits/rejected": 6.135998249053955, "logps/chosen": -309.8606262207031, "logps/rejected": -196.49359130859375, "loss": 0.6767, "rewards/accuracies": 0.75, "rewards/chosen": 0.1325240135192871, "rewards/margins": 0.080214723944664, "rewards/rejected": 0.05230928957462311, "step": 2599 }, { "epoch": 0.4020877633868162, "grad_norm": 4.6933817863464355, "learning_rate": 4.810974911215489e-06, "logits/chosen": 14.111406326293945, "logits/rejected": 6.670742034912109, "logps/chosen": -349.8078308105469, "logps/rejected": -197.30316162109375, "loss": 0.6643, "rewards/accuracies": 0.5, "rewards/chosen": 0.28924280405044556, "rewards/margins": 0.29984050989151, "rewards/rejected": -0.010597705841064453, "step": 2600 }, { "epoch": 0.40224241252658033, "grad_norm": 6.063241004943848, "learning_rate": 4.8106885095658156e-06, "logits/chosen": 7.733075141906738, "logits/rejected": 2.757169246673584, "logps/chosen": -329.32977294921875, "logps/rejected": -236.11361694335938, "loss": 0.613, "rewards/accuracies": 0.75, "rewards/chosen": 0.3940313160419464, "rewards/margins": 0.2367030680179596, "rewards/rejected": 0.157328262925148, "step": 2601 }, { "epoch": 0.4023970616663445, "grad_norm": 8.753619194030762, "learning_rate": 4.810402107916142e-06, "logits/chosen": 6.964667320251465, "logits/rejected": 8.155755043029785, "logps/chosen": -378.2042236328125, "logps/rejected": -301.0556945800781, "loss": 0.7407, "rewards/accuracies": 0.375, "rewards/chosen": 0.3217959403991699, "rewards/margins": -0.07635515928268433, "rewards/rejected": 0.39815109968185425, "step": 2602 }, { "epoch": 0.40255171080610863, "grad_norm": 3.9061129093170166, "learning_rate": 4.810115706266468e-06, "logits/chosen": 13.372148513793945, "logits/rejected": 8.747743606567383, "logps/chosen": -202.840576171875, "logps/rejected": -186.17889404296875, "loss": 0.5977, "rewards/accuracies": 0.625, "rewards/chosen": 0.4059521555900574, "rewards/margins": 0.25259929895401, "rewards/rejected": 0.15335288643836975, "step": 2603 }, { "epoch": 0.4027063599458728, "grad_norm": 4.528071880340576, "learning_rate": 4.809829304616795e-06, "logits/chosen": 10.3316011428833, "logits/rejected": 7.996026992797852, "logps/chosen": -213.27682495117188, "logps/rejected": -189.6713409423828, "loss": 0.6296, "rewards/accuracies": 0.625, "rewards/chosen": 0.15922811627388, "rewards/margins": 0.18942369520664215, "rewards/rejected": -0.030195560306310654, "step": 2604 }, { "epoch": 0.40286100908563693, "grad_norm": 4.940518379211426, "learning_rate": 4.809542902967121e-06, "logits/chosen": 8.851593971252441, "logits/rejected": 10.100085258483887, "logps/chosen": -212.69921875, "logps/rejected": -194.36570739746094, "loss": 0.7309, "rewards/accuracies": 0.375, "rewards/chosen": 0.2609790861606598, "rewards/margins": -0.042949870228767395, "rewards/rejected": 0.3039289712905884, "step": 2605 }, { "epoch": 0.40301565822540114, "grad_norm": 4.7839531898498535, "learning_rate": 4.809256501317448e-06, "logits/chosen": 7.9479193687438965, "logits/rejected": 2.2056541442871094, "logps/chosen": -292.2350769042969, "logps/rejected": -195.654541015625, "loss": 0.5978, "rewards/accuracies": 0.875, "rewards/chosen": 0.3009796440601349, "rewards/margins": 0.26149171590805054, "rewards/rejected": 0.03948793560266495, "step": 2606 }, { "epoch": 0.4031703073651653, "grad_norm": 6.859809398651123, "learning_rate": 4.808970099667775e-06, "logits/chosen": 9.672496795654297, "logits/rejected": 6.029942035675049, "logps/chosen": -316.6988525390625, "logps/rejected": -227.61627197265625, "loss": 0.5111, "rewards/accuracies": 1.0, "rewards/chosen": 0.33837226033210754, "rewards/margins": 0.5145007371902466, "rewards/rejected": -0.17612850666046143, "step": 2607 }, { "epoch": 0.40332495650492944, "grad_norm": 6.114803791046143, "learning_rate": 4.808683698018101e-06, "logits/chosen": 5.3322858810424805, "logits/rejected": 6.1041412353515625, "logps/chosen": -294.4306945800781, "logps/rejected": -294.32177734375, "loss": 0.7115, "rewards/accuracies": 0.625, "rewards/chosen": 0.32256880402565, "rewards/margins": 0.12169989943504333, "rewards/rejected": 0.2008688747882843, "step": 2608 }, { "epoch": 0.4034796056446936, "grad_norm": 8.884920120239258, "learning_rate": 4.808397296368427e-06, "logits/chosen": 9.282898902893066, "logits/rejected": 9.19538402557373, "logps/chosen": -281.532958984375, "logps/rejected": -261.17083740234375, "loss": 0.8956, "rewards/accuracies": 0.25, "rewards/chosen": 0.3274007737636566, "rewards/margins": -0.25450506806373596, "rewards/rejected": 0.5819058418273926, "step": 2609 }, { "epoch": 0.40363425478445775, "grad_norm": 4.668889999389648, "learning_rate": 4.808110894718754e-06, "logits/chosen": 10.31496810913086, "logits/rejected": 2.831465482711792, "logps/chosen": -376.6248779296875, "logps/rejected": -279.6778869628906, "loss": 0.4939, "rewards/accuracies": 0.75, "rewards/chosen": 0.8887655735015869, "rewards/margins": 0.5485795140266418, "rewards/rejected": 0.3401860296726227, "step": 2610 }, { "epoch": 0.4037889039242219, "grad_norm": 4.835183620452881, "learning_rate": 4.80782449306908e-06, "logits/chosen": 7.978914260864258, "logits/rejected": 1.4610515832901, "logps/chosen": -259.80926513671875, "logps/rejected": -255.30917358398438, "loss": 0.4105, "rewards/accuracies": 0.875, "rewards/chosen": 0.6189538240432739, "rewards/margins": 0.8281218409538269, "rewards/rejected": -0.20916807651519775, "step": 2611 }, { "epoch": 0.4039435530639861, "grad_norm": 9.31905746459961, "learning_rate": 4.807538091419407e-06, "logits/chosen": 2.4142608642578125, "logits/rejected": 2.5926458835601807, "logps/chosen": -309.0591735839844, "logps/rejected": -271.2691345214844, "loss": 0.9097, "rewards/accuracies": 0.25, "rewards/chosen": 0.06348896026611328, "rewards/margins": -0.361579954624176, "rewards/rejected": 0.4250689148902893, "step": 2612 }, { "epoch": 0.40409820220375026, "grad_norm": 6.038558483123779, "learning_rate": 4.807251689769734e-06, "logits/chosen": 4.386850357055664, "logits/rejected": 5.926063537597656, "logps/chosen": -333.52166748046875, "logps/rejected": -362.61676025390625, "loss": 0.6401, "rewards/accuracies": 0.5, "rewards/chosen": 0.280229777097702, "rewards/margins": 0.27903711795806885, "rewards/rejected": 0.0011926740407943726, "step": 2613 }, { "epoch": 0.4042528513435144, "grad_norm": 5.022916316986084, "learning_rate": 4.80696528812006e-06, "logits/chosen": 14.886497497558594, "logits/rejected": 12.660100936889648, "logps/chosen": -256.1756896972656, "logps/rejected": -247.79827880859375, "loss": 0.7212, "rewards/accuracies": 0.5, "rewards/chosen": 0.3640306293964386, "rewards/margins": 0.06014080345630646, "rewards/rejected": 0.3038898706436157, "step": 2614 }, { "epoch": 0.40440750048327856, "grad_norm": 4.518743515014648, "learning_rate": 4.806678886470387e-06, "logits/chosen": 8.15795612335205, "logits/rejected": 9.440757751464844, "logps/chosen": -163.04135131835938, "logps/rejected": -181.46841430664062, "loss": 0.6411, "rewards/accuracies": 0.75, "rewards/chosen": 0.2222786545753479, "rewards/margins": 0.19500431418418884, "rewards/rejected": 0.02727438509464264, "step": 2615 }, { "epoch": 0.4045621496230427, "grad_norm": 7.617118835449219, "learning_rate": 4.806392484820713e-06, "logits/chosen": 6.026540756225586, "logits/rejected": 4.109331130981445, "logps/chosen": -213.71939086914062, "logps/rejected": -217.4627227783203, "loss": 0.7957, "rewards/accuracies": 0.5, "rewards/chosen": 0.11376433074474335, "rewards/margins": -0.11360128223896027, "rewards/rejected": 0.2273656129837036, "step": 2616 }, { "epoch": 0.40471679876280686, "grad_norm": 5.676124572753906, "learning_rate": 4.8061060831710394e-06, "logits/chosen": 13.312227249145508, "logits/rejected": 12.515303611755371, "logps/chosen": -350.6089172363281, "logps/rejected": -317.6327209472656, "loss": 0.6383, "rewards/accuracies": 0.75, "rewards/chosen": 0.4327283203601837, "rewards/margins": 0.14337772130966187, "rewards/rejected": 0.28935056924819946, "step": 2617 }, { "epoch": 0.40487144790257107, "grad_norm": 5.661144256591797, "learning_rate": 4.805819681521366e-06, "logits/chosen": 9.972670555114746, "logits/rejected": 3.328108072280884, "logps/chosen": -333.13201904296875, "logps/rejected": -234.4336700439453, "loss": 0.5544, "rewards/accuracies": 0.75, "rewards/chosen": 0.5826784372329712, "rewards/margins": 0.3760630488395691, "rewards/rejected": 0.2066154032945633, "step": 2618 }, { "epoch": 0.4050260970423352, "grad_norm": 6.063949108123779, "learning_rate": 4.805533279871693e-06, "logits/chosen": 4.357425689697266, "logits/rejected": 6.712894439697266, "logps/chosen": -262.0039367675781, "logps/rejected": -239.83370971679688, "loss": 0.6836, "rewards/accuracies": 0.375, "rewards/chosen": 0.009914321824908257, "rewards/margins": 0.14029037952423096, "rewards/rejected": -0.13037605583667755, "step": 2619 }, { "epoch": 0.40518074618209937, "grad_norm": 4.06412935256958, "learning_rate": 4.805246878222019e-06, "logits/chosen": 7.4035258293151855, "logits/rejected": 7.448153018951416, "logps/chosen": -297.70245361328125, "logps/rejected": -214.7616729736328, "loss": 0.5293, "rewards/accuracies": 0.625, "rewards/chosen": 0.1439451277256012, "rewards/margins": 0.4220367968082428, "rewards/rejected": -0.2780916690826416, "step": 2620 }, { "epoch": 0.4053353953218635, "grad_norm": 5.077789306640625, "learning_rate": 4.804960476572346e-06, "logits/chosen": 8.757404327392578, "logits/rejected": 3.2138757705688477, "logps/chosen": -282.9140319824219, "logps/rejected": -219.0904998779297, "loss": 0.6875, "rewards/accuracies": 0.375, "rewards/chosen": 0.3097047805786133, "rewards/margins": 0.06329468637704849, "rewards/rejected": 0.24641010165214539, "step": 2621 }, { "epoch": 0.4054900444616277, "grad_norm": 5.799759387969971, "learning_rate": 4.804674074922672e-06, "logits/chosen": 5.377039909362793, "logits/rejected": 5.550652027130127, "logps/chosen": -234.64694213867188, "logps/rejected": -226.31576538085938, "loss": 0.7461, "rewards/accuracies": 0.375, "rewards/chosen": 0.0468912273645401, "rewards/margins": -0.034545764327049255, "rewards/rejected": 0.08143696188926697, "step": 2622 }, { "epoch": 0.4056446936013918, "grad_norm": 5.164262294769287, "learning_rate": 4.8043876732729985e-06, "logits/chosen": 6.502854824066162, "logits/rejected": 5.388241767883301, "logps/chosen": -364.510986328125, "logps/rejected": -197.87681579589844, "loss": 0.6442, "rewards/accuracies": 0.375, "rewards/chosen": 0.4631475806236267, "rewards/margins": 0.4379481077194214, "rewards/rejected": 0.025199517607688904, "step": 2623 }, { "epoch": 0.405799342741156, "grad_norm": 4.712680816650391, "learning_rate": 4.804101271623325e-06, "logits/chosen": 5.3691301345825195, "logits/rejected": 3.0496113300323486, "logps/chosen": -385.042236328125, "logps/rejected": -355.14483642578125, "loss": 0.5768, "rewards/accuracies": 0.75, "rewards/chosen": 0.21682611107826233, "rewards/margins": 0.3757406771183014, "rewards/rejected": -0.15891456604003906, "step": 2624 }, { "epoch": 0.4059539918809202, "grad_norm": 11.883728981018066, "learning_rate": 4.803814869973652e-06, "logits/chosen": 4.919912338256836, "logits/rejected": -0.008937597274780273, "logps/chosen": -489.31768798828125, "logps/rejected": -355.6535339355469, "loss": 0.7418, "rewards/accuracies": 0.5, "rewards/chosen": -0.05142766237258911, "rewards/margins": -0.03966262564063072, "rewards/rejected": -0.011765042319893837, "step": 2625 }, { "epoch": 0.40610864102068434, "grad_norm": 4.274256229400635, "learning_rate": 4.8035284683239784e-06, "logits/chosen": 9.499042510986328, "logits/rejected": 4.244202136993408, "logps/chosen": -248.77346801757812, "logps/rejected": -177.20687866210938, "loss": 0.5185, "rewards/accuracies": 1.0, "rewards/chosen": 0.09489268809556961, "rewards/margins": 0.44498834013938904, "rewards/rejected": -0.3500956594944, "step": 2626 }, { "epoch": 0.4062632901604485, "grad_norm": 5.311555862426758, "learning_rate": 4.803242066674304e-06, "logits/chosen": 15.270570755004883, "logits/rejected": 11.609674453735352, "logps/chosen": -236.3300018310547, "logps/rejected": -225.8714141845703, "loss": 0.5434, "rewards/accuracies": 0.75, "rewards/chosen": 0.16436053812503815, "rewards/margins": 0.4649355709552765, "rewards/rejected": -0.3005750775337219, "step": 2627 }, { "epoch": 0.40641793930021264, "grad_norm": 4.342464447021484, "learning_rate": 4.802955665024631e-06, "logits/chosen": 9.718295097351074, "logits/rejected": 13.8018798828125, "logps/chosen": -207.02001953125, "logps/rejected": -273.1556396484375, "loss": 0.5812, "rewards/accuracies": 0.5, "rewards/chosen": 0.11753726005554199, "rewards/margins": 0.29417183995246887, "rewards/rejected": -0.17663460969924927, "step": 2628 }, { "epoch": 0.4065725884399768, "grad_norm": 6.279725074768066, "learning_rate": 4.8026692633749575e-06, "logits/chosen": 7.422210216522217, "logits/rejected": 9.758340835571289, "logps/chosen": -162.49378967285156, "logps/rejected": -157.6685791015625, "loss": 0.8373, "rewards/accuracies": 0.375, "rewards/chosen": -0.26207125186920166, "rewards/margins": -0.23835879564285278, "rewards/rejected": -0.023712441325187683, "step": 2629 }, { "epoch": 0.40672723757974094, "grad_norm": 4.622989177703857, "learning_rate": 4.802382861725284e-06, "logits/chosen": 12.836143493652344, "logits/rejected": 1.408252477645874, "logps/chosen": -403.2530517578125, "logps/rejected": -228.9989776611328, "loss": 0.4593, "rewards/accuracies": 0.75, "rewards/chosen": 0.13168393075466156, "rewards/margins": 0.768669843673706, "rewards/rejected": -0.6369858980178833, "step": 2630 }, { "epoch": 0.40688188671950515, "grad_norm": 6.396973609924316, "learning_rate": 4.80209646007561e-06, "logits/chosen": 9.866778373718262, "logits/rejected": 6.122995853424072, "logps/chosen": -472.73065185546875, "logps/rejected": -324.6229248046875, "loss": 0.6309, "rewards/accuracies": 0.5, "rewards/chosen": 0.3928345739841461, "rewards/margins": 0.23249207437038422, "rewards/rejected": 0.1603425145149231, "step": 2631 }, { "epoch": 0.4070365358592693, "grad_norm": 7.1897149085998535, "learning_rate": 4.801810058425937e-06, "logits/chosen": 11.339544296264648, "logits/rejected": 11.42724323272705, "logps/chosen": -363.2203063964844, "logps/rejected": -357.7216796875, "loss": 0.7904, "rewards/accuracies": 0.625, "rewards/chosen": -0.15831291675567627, "rewards/margins": -0.0312308669090271, "rewards/rejected": -0.12708204984664917, "step": 2632 }, { "epoch": 0.40719118499903345, "grad_norm": 4.733573913574219, "learning_rate": 4.801523656776263e-06, "logits/chosen": 6.047045707702637, "logits/rejected": 9.2461576461792, "logps/chosen": -215.75318908691406, "logps/rejected": -263.97296142578125, "loss": 0.6357, "rewards/accuracies": 0.75, "rewards/chosen": 0.2662281095981598, "rewards/margins": 0.14469946920871735, "rewards/rejected": 0.12152862548828125, "step": 2633 }, { "epoch": 0.4073458341387976, "grad_norm": 4.460962772369385, "learning_rate": 4.80123725512659e-06, "logits/chosen": 13.84663200378418, "logits/rejected": 4.173087120056152, "logps/chosen": -275.0989685058594, "logps/rejected": -222.12890625, "loss": 0.4643, "rewards/accuracies": 0.875, "rewards/chosen": 0.21884170174598694, "rewards/margins": 0.5685914158821106, "rewards/rejected": -0.34974968433380127, "step": 2634 }, { "epoch": 0.40750048327856175, "grad_norm": 5.4968414306640625, "learning_rate": 4.800950853476917e-06, "logits/chosen": 6.543176651000977, "logits/rejected": -2.3892300128936768, "logps/chosen": -268.4636535644531, "logps/rejected": -199.86373901367188, "loss": 0.5065, "rewards/accuracies": 1.0, "rewards/chosen": 0.26971369981765747, "rewards/margins": 0.47401466965675354, "rewards/rejected": -0.20430096983909607, "step": 2635 }, { "epoch": 0.4076551324183259, "grad_norm": 5.363725185394287, "learning_rate": 4.800664451827242e-06, "logits/chosen": 5.589339733123779, "logits/rejected": 7.286928176879883, "logps/chosen": -202.7041015625, "logps/rejected": -239.35354614257812, "loss": 0.7613, "rewards/accuracies": 0.375, "rewards/chosen": 0.08437521755695343, "rewards/margins": -0.10510352998971939, "rewards/rejected": 0.18947875499725342, "step": 2636 }, { "epoch": 0.40780978155809006, "grad_norm": 7.565221786499023, "learning_rate": 4.800378050177569e-06, "logits/chosen": 13.128049850463867, "logits/rejected": 11.742080688476562, "logps/chosen": -259.989990234375, "logps/rejected": -255.3491973876953, "loss": 0.751, "rewards/accuracies": 0.75, "rewards/chosen": -0.12680941820144653, "rewards/margins": -0.020378492772579193, "rewards/rejected": -0.10643090307712555, "step": 2637 }, { "epoch": 0.40796443069785426, "grad_norm": 4.714408874511719, "learning_rate": 4.800091648527896e-06, "logits/chosen": 11.26417350769043, "logits/rejected": 6.868785381317139, "logps/chosen": -329.751953125, "logps/rejected": -261.6939697265625, "loss": 0.6478, "rewards/accuracies": 0.625, "rewards/chosen": 0.2920898497104645, "rewards/margins": 0.14915743470191956, "rewards/rejected": 0.14293241500854492, "step": 2638 }, { "epoch": 0.4081190798376184, "grad_norm": 5.228346347808838, "learning_rate": 4.799805246878222e-06, "logits/chosen": 7.224055290222168, "logits/rejected": 5.372251987457275, "logps/chosen": -233.1369171142578, "logps/rejected": -186.95213317871094, "loss": 0.5775, "rewards/accuracies": 0.625, "rewards/chosen": 0.00942116230726242, "rewards/margins": 0.2986801266670227, "rewards/rejected": -0.2892589569091797, "step": 2639 }, { "epoch": 0.40827372897738257, "grad_norm": 4.8835530281066895, "learning_rate": 4.799518845228549e-06, "logits/chosen": 13.223508834838867, "logits/rejected": 7.286744594573975, "logps/chosen": -426.3082275390625, "logps/rejected": -247.29444885253906, "loss": 0.5832, "rewards/accuracies": 0.625, "rewards/chosen": 0.5898993015289307, "rewards/margins": 0.34897100925445557, "rewards/rejected": 0.2409282624721527, "step": 2640 }, { "epoch": 0.4084283781171467, "grad_norm": 7.119494915008545, "learning_rate": 4.799232443578876e-06, "logits/chosen": 10.611406326293945, "logits/rejected": 2.7520837783813477, "logps/chosen": -247.2782440185547, "logps/rejected": -170.2180938720703, "loss": 0.6579, "rewards/accuracies": 0.5, "rewards/chosen": -0.1309681087732315, "rewards/margins": 0.15472464263439178, "rewards/rejected": -0.2856927514076233, "step": 2641 }, { "epoch": 0.40858302725691087, "grad_norm": 6.773077487945557, "learning_rate": 4.7989460419292014e-06, "logits/chosen": 7.102503776550293, "logits/rejected": 7.385689735412598, "logps/chosen": -303.64093017578125, "logps/rejected": -308.61395263671875, "loss": 0.7539, "rewards/accuracies": 0.5, "rewards/chosen": 0.11752213537693024, "rewards/margins": 0.013698004186153412, "rewards/rejected": 0.10382413864135742, "step": 2642 }, { "epoch": 0.408737676396675, "grad_norm": 5.577019214630127, "learning_rate": 4.798659640279528e-06, "logits/chosen": 6.515275955200195, "logits/rejected": 10.408912658691406, "logps/chosen": -130.23463439941406, "logps/rejected": -216.39822387695312, "loss": 0.7539, "rewards/accuracies": 0.375, "rewards/chosen": -0.21503695845603943, "rewards/margins": -0.09368164837360382, "rewards/rejected": -0.12135529518127441, "step": 2643 }, { "epoch": 0.4088923255364392, "grad_norm": 11.766060829162598, "learning_rate": 4.798373238629855e-06, "logits/chosen": 12.002068519592285, "logits/rejected": 13.626076698303223, "logps/chosen": -221.24728393554688, "logps/rejected": -255.05416870117188, "loss": 0.9005, "rewards/accuracies": 0.625, "rewards/chosen": -0.05107155442237854, "rewards/margins": -0.2006019502878189, "rewards/rejected": 0.14953042566776276, "step": 2644 }, { "epoch": 0.4090469746762034, "grad_norm": 6.128371715545654, "learning_rate": 4.798086836980181e-06, "logits/chosen": 8.59428882598877, "logits/rejected": 4.739449501037598, "logps/chosen": -284.60009765625, "logps/rejected": -167.88858032226562, "loss": 0.7036, "rewards/accuracies": 0.375, "rewards/chosen": 0.022728145122528076, "rewards/margins": 0.016990456730127335, "rewards/rejected": 0.00573769211769104, "step": 2645 }, { "epoch": 0.40920162381596753, "grad_norm": 4.945323467254639, "learning_rate": 4.797800435330508e-06, "logits/chosen": 11.016851425170898, "logits/rejected": 8.419404029846191, "logps/chosen": -281.6387023925781, "logps/rejected": -245.2271728515625, "loss": 0.5763, "rewards/accuracies": 0.5, "rewards/chosen": 0.2981964349746704, "rewards/margins": 0.4338471293449402, "rewards/rejected": -0.13565070927143097, "step": 2646 }, { "epoch": 0.4093562729557317, "grad_norm": 11.966965675354004, "learning_rate": 4.797514033680835e-06, "logits/chosen": 5.151798725128174, "logits/rejected": 5.54546594619751, "logps/chosen": -281.3016052246094, "logps/rejected": -193.2689666748047, "loss": 0.7505, "rewards/accuracies": 0.25, "rewards/chosen": -0.013181231915950775, "rewards/margins": -0.08431143313646317, "rewards/rejected": 0.07113020122051239, "step": 2647 }, { "epoch": 0.40951092209549583, "grad_norm": 5.401256561279297, "learning_rate": 4.797227632031161e-06, "logits/chosen": 9.029970169067383, "logits/rejected": 3.045306444168091, "logps/chosen": -268.4500732421875, "logps/rejected": -243.4966278076172, "loss": 0.5756, "rewards/accuracies": 0.625, "rewards/chosen": 0.21103864908218384, "rewards/margins": 0.3457384407520294, "rewards/rejected": -0.13469980657100677, "step": 2648 }, { "epoch": 0.40966557123526, "grad_norm": 4.6939496994018555, "learning_rate": 4.796941230381487e-06, "logits/chosen": 8.520849227905273, "logits/rejected": 5.825819969177246, "logps/chosen": -251.12374877929688, "logps/rejected": -164.05984497070312, "loss": 0.6412, "rewards/accuracies": 0.75, "rewards/chosen": 0.3347279727458954, "rewards/margins": 0.24735581874847412, "rewards/rejected": 0.08737210929393768, "step": 2649 }, { "epoch": 0.4098202203750242, "grad_norm": 4.260602951049805, "learning_rate": 4.796654828731814e-06, "logits/chosen": 12.482410430908203, "logits/rejected": 3.0571346282958984, "logps/chosen": -291.24835205078125, "logps/rejected": -181.38148498535156, "loss": 0.5202, "rewards/accuracies": 0.625, "rewards/chosen": 0.14117297530174255, "rewards/margins": 0.47244641184806824, "rewards/rejected": -0.3312734067440033, "step": 2650 }, { "epoch": 0.40997486951478834, "grad_norm": 4.5313944816589355, "learning_rate": 4.7963684270821405e-06, "logits/chosen": 3.3460144996643066, "logits/rejected": 1.4234497547149658, "logps/chosen": -162.12477111816406, "logps/rejected": -195.40274047851562, "loss": 0.6493, "rewards/accuracies": 0.625, "rewards/chosen": 0.017408132553100586, "rewards/margins": 0.1096906065940857, "rewards/rejected": -0.09228246659040451, "step": 2651 }, { "epoch": 0.4101295186545525, "grad_norm": 5.835301399230957, "learning_rate": 4.796082025432467e-06, "logits/chosen": 6.379386901855469, "logits/rejected": 9.844743728637695, "logps/chosen": -225.2978515625, "logps/rejected": -215.4749755859375, "loss": 0.8894, "rewards/accuracies": 0.375, "rewards/chosen": -0.14622831344604492, "rewards/margins": -0.23944847285747528, "rewards/rejected": 0.09322013705968857, "step": 2652 }, { "epoch": 0.41028416779431665, "grad_norm": 6.756196975708008, "learning_rate": 4.795795623782794e-06, "logits/chosen": 7.943758010864258, "logits/rejected": 10.78510570526123, "logps/chosen": -227.11619567871094, "logps/rejected": -299.67352294921875, "loss": 0.9121, "rewards/accuracies": 0.5, "rewards/chosen": -0.2748430371284485, "rewards/margins": -0.18574418127536774, "rewards/rejected": -0.08909883350133896, "step": 2653 }, { "epoch": 0.4104388169340808, "grad_norm": 4.1023101806640625, "learning_rate": 4.79550922213312e-06, "logits/chosen": 10.251218795776367, "logits/rejected": 7.312133312225342, "logps/chosen": -278.2147216796875, "logps/rejected": -253.19630432128906, "loss": 0.5215, "rewards/accuracies": 0.625, "rewards/chosen": 0.3653113842010498, "rewards/margins": 0.5224370360374451, "rewards/rejected": -0.15712566673755646, "step": 2654 }, { "epoch": 0.41059346607384495, "grad_norm": 6.181456565856934, "learning_rate": 4.795222820483446e-06, "logits/chosen": 7.096429347991943, "logits/rejected": 10.792226791381836, "logps/chosen": -251.61062622070312, "logps/rejected": -300.1155090332031, "loss": 0.7853, "rewards/accuracies": 0.375, "rewards/chosen": 0.10043714940547943, "rewards/margins": -0.014385506510734558, "rewards/rejected": 0.11482267826795578, "step": 2655 }, { "epoch": 0.4107481152136091, "grad_norm": 6.432643413543701, "learning_rate": 4.794936418833773e-06, "logits/chosen": 9.615537643432617, "logits/rejected": 11.772068977355957, "logps/chosen": -269.7298889160156, "logps/rejected": -342.2661437988281, "loss": 0.8283, "rewards/accuracies": 0.5, "rewards/chosen": 0.07405209541320801, "rewards/margins": -0.17221638560295105, "rewards/rejected": 0.24626848101615906, "step": 2656 }, { "epoch": 0.4109027643533733, "grad_norm": 5.000306129455566, "learning_rate": 4.7946500171840995e-06, "logits/chosen": 8.370254516601562, "logits/rejected": 7.60905122756958, "logps/chosen": -266.63787841796875, "logps/rejected": -197.1949462890625, "loss": 0.6641, "rewards/accuracies": 0.625, "rewards/chosen": 0.09593873471021652, "rewards/margins": 0.08923462778329849, "rewards/rejected": 0.0067041050642728806, "step": 2657 }, { "epoch": 0.41105741349313746, "grad_norm": 6.106516361236572, "learning_rate": 4.794363615534426e-06, "logits/chosen": 12.595562934875488, "logits/rejected": 5.683802604675293, "logps/chosen": -344.0003662109375, "logps/rejected": -277.79376220703125, "loss": 0.6124, "rewards/accuracies": 0.625, "rewards/chosen": 0.35268765687942505, "rewards/margins": 0.23292341828346252, "rewards/rejected": 0.11976423859596252, "step": 2658 }, { "epoch": 0.4112120626329016, "grad_norm": 5.12757682800293, "learning_rate": 4.794077213884753e-06, "logits/chosen": 7.615944862365723, "logits/rejected": 13.078064918518066, "logps/chosen": -235.4261932373047, "logps/rejected": -291.4898376464844, "loss": 0.6256, "rewards/accuracies": 0.625, "rewards/chosen": 0.20176124572753906, "rewards/margins": 0.19874128699302673, "rewards/rejected": 0.003019958734512329, "step": 2659 }, { "epoch": 0.41136671177266576, "grad_norm": 3.7776591777801514, "learning_rate": 4.793790812235079e-06, "logits/chosen": 8.574359893798828, "logits/rejected": 2.1253788471221924, "logps/chosen": -256.5044250488281, "logps/rejected": -172.62010192871094, "loss": 0.5411, "rewards/accuracies": 0.875, "rewards/chosen": 0.12476853281259537, "rewards/margins": 0.513027548789978, "rewards/rejected": -0.38825899362564087, "step": 2660 }, { "epoch": 0.4115213609124299, "grad_norm": 8.783904075622559, "learning_rate": 4.793504410585405e-06, "logits/chosen": 12.412147521972656, "logits/rejected": 8.608375549316406, "logps/chosen": -575.7140502929688, "logps/rejected": -449.0717468261719, "loss": 0.7288, "rewards/accuracies": 0.5, "rewards/chosen": 0.13889333605766296, "rewards/margins": -0.002595890313386917, "rewards/rejected": 0.1414892077445984, "step": 2661 }, { "epoch": 0.41167601005219406, "grad_norm": 4.376724720001221, "learning_rate": 4.793218008935732e-06, "logits/chosen": 13.176595687866211, "logits/rejected": 4.990189552307129, "logps/chosen": -277.080322265625, "logps/rejected": -187.01858520507812, "loss": 0.6347, "rewards/accuracies": 0.5, "rewards/chosen": 0.06547775119543076, "rewards/margins": 0.19429269433021545, "rewards/rejected": -0.1288149356842041, "step": 2662 }, { "epoch": 0.41183065919195827, "grad_norm": 5.452521800994873, "learning_rate": 4.7929316072860586e-06, "logits/chosen": 12.210566520690918, "logits/rejected": 5.91943359375, "logps/chosen": -445.83416748046875, "logps/rejected": -314.81378173828125, "loss": 0.5722, "rewards/accuracies": 0.75, "rewards/chosen": 0.34836122393608093, "rewards/margins": 0.37429410219192505, "rewards/rejected": -0.025932878255844116, "step": 2663 }, { "epoch": 0.4119853083317224, "grad_norm": 3.8195464611053467, "learning_rate": 4.792645205636385e-06, "logits/chosen": 10.620893478393555, "logits/rejected": 11.452146530151367, "logps/chosen": -142.13693237304688, "logps/rejected": -149.37869262695312, "loss": 0.6116, "rewards/accuracies": 0.75, "rewards/chosen": -0.12462678551673889, "rewards/margins": 0.21381878852844238, "rewards/rejected": -0.3384455740451813, "step": 2664 }, { "epoch": 0.4121399574714866, "grad_norm": 3.477503776550293, "learning_rate": 4.792358803986711e-06, "logits/chosen": 9.744803428649902, "logits/rejected": 7.133264541625977, "logps/chosen": -216.46688842773438, "logps/rejected": -205.63217163085938, "loss": 0.5829, "rewards/accuracies": 0.75, "rewards/chosen": 0.09257908910512924, "rewards/margins": 0.29249659180641174, "rewards/rejected": -0.1999175101518631, "step": 2665 }, { "epoch": 0.4122946066112507, "grad_norm": 4.316352844238281, "learning_rate": 4.792072402337038e-06, "logits/chosen": 12.296398162841797, "logits/rejected": 5.777896881103516, "logps/chosen": -380.7566833496094, "logps/rejected": -261.357421875, "loss": 0.5551, "rewards/accuracies": 0.75, "rewards/chosen": 0.2547001838684082, "rewards/margins": 0.38686132431030273, "rewards/rejected": -0.13216114044189453, "step": 2666 }, { "epoch": 0.4124492557510149, "grad_norm": 4.193288326263428, "learning_rate": 4.791786000687364e-06, "logits/chosen": 2.1880016326904297, "logits/rejected": 6.825901031494141, "logps/chosen": -130.9357452392578, "logps/rejected": -174.4420166015625, "loss": 0.6474, "rewards/accuracies": 0.625, "rewards/chosen": -0.25551897287368774, "rewards/margins": 0.13312536478042603, "rewards/rejected": -0.38864439725875854, "step": 2667 }, { "epoch": 0.412603904890779, "grad_norm": 7.0151519775390625, "learning_rate": 4.791499599037691e-06, "logits/chosen": 11.798147201538086, "logits/rejected": 7.212460994720459, "logps/chosen": -265.9297790527344, "logps/rejected": -176.83584594726562, "loss": 0.6224, "rewards/accuracies": 0.625, "rewards/chosen": 0.21902886033058167, "rewards/margins": 0.3132617473602295, "rewards/rejected": -0.09423283487558365, "step": 2668 }, { "epoch": 0.4127585540305432, "grad_norm": 5.6192708015441895, "learning_rate": 4.791213197388017e-06, "logits/chosen": 10.524545669555664, "logits/rejected": 12.62818717956543, "logps/chosen": -235.33456420898438, "logps/rejected": -340.3367004394531, "loss": 0.6069, "rewards/accuracies": 0.625, "rewards/chosen": 0.2112850397825241, "rewards/margins": 0.30751287937164307, "rewards/rejected": -0.09622783958911896, "step": 2669 }, { "epoch": 0.4129132031703074, "grad_norm": 6.5952019691467285, "learning_rate": 4.7909267957383434e-06, "logits/chosen": 8.100357055664062, "logits/rejected": 6.451948165893555, "logps/chosen": -295.75579833984375, "logps/rejected": -313.97100830078125, "loss": 0.6492, "rewards/accuracies": 0.625, "rewards/chosen": 0.08507698774337769, "rewards/margins": 0.11653098464012146, "rewards/rejected": -0.03145400807261467, "step": 2670 }, { "epoch": 0.41306785231007154, "grad_norm": 6.971599102020264, "learning_rate": 4.79064039408867e-06, "logits/chosen": 10.957653045654297, "logits/rejected": 15.8810453414917, "logps/chosen": -211.7451629638672, "logps/rejected": -206.51112365722656, "loss": 0.803, "rewards/accuracies": 0.375, "rewards/chosen": -0.16833312809467316, "rewards/margins": -0.09789979457855225, "rewards/rejected": -0.07043331861495972, "step": 2671 }, { "epoch": 0.4132225014498357, "grad_norm": 4.678345680236816, "learning_rate": 4.790353992438997e-06, "logits/chosen": 9.228696823120117, "logits/rejected": 9.208124160766602, "logps/chosen": -190.8668670654297, "logps/rejected": -212.9173126220703, "loss": 0.6254, "rewards/accuracies": 0.625, "rewards/chosen": 0.0887783020734787, "rewards/margins": 0.23429659008979797, "rewards/rejected": -0.1455182582139969, "step": 2672 }, { "epoch": 0.41337715058959984, "grad_norm": 5.304049968719482, "learning_rate": 4.790067590789323e-06, "logits/chosen": 7.364456653594971, "logits/rejected": 7.57342529296875, "logps/chosen": -194.13265991210938, "logps/rejected": -197.5412139892578, "loss": 0.7354, "rewards/accuracies": 0.625, "rewards/chosen": -0.036277107894420624, "rewards/margins": -0.020879503339529037, "rewards/rejected": -0.01539759710431099, "step": 2673 }, { "epoch": 0.413531799729364, "grad_norm": 5.420442581176758, "learning_rate": 4.78978118913965e-06, "logits/chosen": 8.265457153320312, "logits/rejected": 3.7377610206604004, "logps/chosen": -293.60052490234375, "logps/rejected": -181.9404296875, "loss": 0.6926, "rewards/accuracies": 0.5, "rewards/chosen": -0.0356023907661438, "rewards/margins": 0.11932381987571716, "rewards/rejected": -0.15492619574069977, "step": 2674 }, { "epoch": 0.41368644886912814, "grad_norm": 5.860610008239746, "learning_rate": 4.789494787489976e-06, "logits/chosen": 6.292856216430664, "logits/rejected": 7.976338863372803, "logps/chosen": -262.763916015625, "logps/rejected": -285.5487060546875, "loss": 0.6629, "rewards/accuracies": 0.5, "rewards/chosen": 0.04891301691532135, "rewards/margins": 0.24372237920761108, "rewards/rejected": -0.19480934739112854, "step": 2675 }, { "epoch": 0.41384109800889235, "grad_norm": 7.555635452270508, "learning_rate": 4.7892083858403025e-06, "logits/chosen": 14.162042617797852, "logits/rejected": 10.465352058410645, "logps/chosen": -332.97528076171875, "logps/rejected": -391.433349609375, "loss": 0.5916, "rewards/accuracies": 0.625, "rewards/chosen": 0.06017686426639557, "rewards/margins": 0.30433887243270874, "rewards/rejected": -0.24416202306747437, "step": 2676 }, { "epoch": 0.4139957471486565, "grad_norm": 6.041835308074951, "learning_rate": 4.788921984190629e-06, "logits/chosen": 7.283111572265625, "logits/rejected": 2.9537463188171387, "logps/chosen": -291.0113220214844, "logps/rejected": -180.46897888183594, "loss": 0.6104, "rewards/accuracies": 0.875, "rewards/chosen": -0.14422979950904846, "rewards/margins": 0.18445970118045807, "rewards/rejected": -0.3286895155906677, "step": 2677 }, { "epoch": 0.41415039628842065, "grad_norm": 4.959870338439941, "learning_rate": 4.788635582540956e-06, "logits/chosen": 11.053314208984375, "logits/rejected": 10.915766716003418, "logps/chosen": -206.71646118164062, "logps/rejected": -220.74928283691406, "loss": 0.6833, "rewards/accuracies": 0.625, "rewards/chosen": 0.023371180519461632, "rewards/margins": 0.1551080048084259, "rewards/rejected": -0.1317368447780609, "step": 2678 }, { "epoch": 0.4143050454281848, "grad_norm": 5.344726085662842, "learning_rate": 4.7883491808912824e-06, "logits/chosen": 11.77116584777832, "logits/rejected": 5.133759021759033, "logps/chosen": -336.708251953125, "logps/rejected": -335.4376220703125, "loss": 0.6371, "rewards/accuracies": 0.5, "rewards/chosen": 0.18181678652763367, "rewards/margins": 0.2018643319606781, "rewards/rejected": -0.02004757523536682, "step": 2679 }, { "epoch": 0.41445969456794896, "grad_norm": 5.46013879776001, "learning_rate": 4.788062779241609e-06, "logits/chosen": 6.188963890075684, "logits/rejected": 7.754343032836914, "logps/chosen": -220.2676239013672, "logps/rejected": -192.42047119140625, "loss": 0.7615, "rewards/accuracies": 0.5, "rewards/chosen": -0.24339866638183594, "rewards/margins": 0.06646917760372162, "rewards/rejected": -0.30986788868904114, "step": 2680 }, { "epoch": 0.4146143437077131, "grad_norm": 6.969061374664307, "learning_rate": 4.787776377591936e-06, "logits/chosen": 9.019637107849121, "logits/rejected": 6.001626014709473, "logps/chosen": -383.18328857421875, "logps/rejected": -338.8443603515625, "loss": 0.6931, "rewards/accuracies": 0.5, "rewards/chosen": 0.00911402702331543, "rewards/margins": 0.10183516144752502, "rewards/rejected": -0.092721126973629, "step": 2681 }, { "epoch": 0.4147689928474773, "grad_norm": 5.407541751861572, "learning_rate": 4.7874899759422615e-06, "logits/chosen": 6.0520524978637695, "logits/rejected": 5.730321407318115, "logps/chosen": -280.2451171875, "logps/rejected": -221.66537475585938, "loss": 0.7308, "rewards/accuracies": 0.375, "rewards/chosen": -0.21384906768798828, "rewards/margins": 0.059087567031383514, "rewards/rejected": -0.2729366421699524, "step": 2682 }, { "epoch": 0.41492364198724146, "grad_norm": 3.819279909133911, "learning_rate": 4.787203574292588e-06, "logits/chosen": 5.031618118286133, "logits/rejected": 2.9987237453460693, "logps/chosen": -186.62782287597656, "logps/rejected": -162.77984619140625, "loss": 0.49, "rewards/accuracies": 0.875, "rewards/chosen": 0.2552463412284851, "rewards/margins": 0.5567307472229004, "rewards/rejected": -0.3014844059944153, "step": 2683 }, { "epoch": 0.4150782911270056, "grad_norm": 5.949801445007324, "learning_rate": 4.786917172642915e-06, "logits/chosen": 10.622173309326172, "logits/rejected": 9.031545639038086, "logps/chosen": -301.82354736328125, "logps/rejected": -338.3058166503906, "loss": 0.6469, "rewards/accuracies": 0.5, "rewards/chosen": 0.13022488355636597, "rewards/margins": 0.18733233213424683, "rewards/rejected": -0.05710745230317116, "step": 2684 }, { "epoch": 0.41523294026676977, "grad_norm": 5.231034278869629, "learning_rate": 4.7866307709932415e-06, "logits/chosen": 7.656411170959473, "logits/rejected": 7.173588275909424, "logps/chosen": -267.7090148925781, "logps/rejected": -265.0752258300781, "loss": 0.5862, "rewards/accuracies": 0.625, "rewards/chosen": 0.17060573399066925, "rewards/margins": 0.28513848781585693, "rewards/rejected": -0.11453276872634888, "step": 2685 }, { "epoch": 0.4153875894065339, "grad_norm": 6.427034854888916, "learning_rate": 4.786344369343568e-06, "logits/chosen": 7.376415252685547, "logits/rejected": 7.928295135498047, "logps/chosen": -248.8770294189453, "logps/rejected": -314.45965576171875, "loss": 0.6753, "rewards/accuracies": 0.625, "rewards/chosen": 0.08439035713672638, "rewards/margins": 0.08876930177211761, "rewards/rejected": -0.004378937184810638, "step": 2686 }, { "epoch": 0.41554223854629807, "grad_norm": 5.761512279510498, "learning_rate": 4.786057967693895e-06, "logits/chosen": 4.417949676513672, "logits/rejected": 7.835887908935547, "logps/chosen": -222.18365478515625, "logps/rejected": -251.08248901367188, "loss": 0.7497, "rewards/accuracies": 0.625, "rewards/chosen": -0.026735767722129822, "rewards/margins": -0.0775153860449791, "rewards/rejected": 0.050779618322849274, "step": 2687 }, { "epoch": 0.4156968876860622, "grad_norm": 4.53059720993042, "learning_rate": 4.785771566044221e-06, "logits/chosen": 12.640189170837402, "logits/rejected": 5.8292646408081055, "logps/chosen": -327.42340087890625, "logps/rejected": -217.5017547607422, "loss": 0.5616, "rewards/accuracies": 0.75, "rewards/chosen": 0.48154470324516296, "rewards/margins": 0.3492458164691925, "rewards/rejected": 0.13229885697364807, "step": 2688 }, { "epoch": 0.41585153682582643, "grad_norm": 4.212057590484619, "learning_rate": 4.785485164394547e-06, "logits/chosen": 13.842592239379883, "logits/rejected": 7.01350212097168, "logps/chosen": -367.5593566894531, "logps/rejected": -265.8381652832031, "loss": 0.4781, "rewards/accuracies": 0.75, "rewards/chosen": 0.3196007311344147, "rewards/margins": 0.6862646341323853, "rewards/rejected": -0.36666393280029297, "step": 2689 }, { "epoch": 0.4160061859655906, "grad_norm": 5.01682186126709, "learning_rate": 4.785198762744874e-06, "logits/chosen": 15.273069381713867, "logits/rejected": 4.664915084838867, "logps/chosen": -540.3794555664062, "logps/rejected": -164.26556396484375, "loss": 0.549, "rewards/accuracies": 0.75, "rewards/chosen": 0.2815825045108795, "rewards/margins": 0.3410099744796753, "rewards/rejected": -0.059427469968795776, "step": 2690 }, { "epoch": 0.41616083510535473, "grad_norm": 4.6780219078063965, "learning_rate": 4.7849123610952005e-06, "logits/chosen": 11.537569999694824, "logits/rejected": 6.925926685333252, "logps/chosen": -293.36236572265625, "logps/rejected": -263.9833984375, "loss": 0.5832, "rewards/accuracies": 0.625, "rewards/chosen": 0.10317935794591904, "rewards/margins": 0.3733992576599121, "rewards/rejected": -0.2702198922634125, "step": 2691 }, { "epoch": 0.4163154842451189, "grad_norm": 5.195574760437012, "learning_rate": 4.784625959445527e-06, "logits/chosen": 8.589856147766113, "logits/rejected": 3.791367292404175, "logps/chosen": -295.43475341796875, "logps/rejected": -266.682861328125, "loss": 0.4888, "rewards/accuracies": 0.75, "rewards/chosen": -0.13618162274360657, "rewards/margins": 0.6136234402656555, "rewards/rejected": -0.7498050928115845, "step": 2692 }, { "epoch": 0.41647013338488303, "grad_norm": 4.422612190246582, "learning_rate": 4.784339557795854e-06, "logits/chosen": 10.778101921081543, "logits/rejected": 8.08666706085205, "logps/chosen": -283.65826416015625, "logps/rejected": -219.94129943847656, "loss": 0.5003, "rewards/accuracies": 0.875, "rewards/chosen": -0.18425625562667847, "rewards/margins": 0.6662859916687012, "rewards/rejected": -0.8505423069000244, "step": 2693 }, { "epoch": 0.4166247825246472, "grad_norm": 6.373537540435791, "learning_rate": 4.78405315614618e-06, "logits/chosen": 11.748650550842285, "logits/rejected": 5.870093822479248, "logps/chosen": -463.2820129394531, "logps/rejected": -372.34283447265625, "loss": 0.5473, "rewards/accuracies": 0.75, "rewards/chosen": 0.22026681900024414, "rewards/margins": 0.5614244937896729, "rewards/rejected": -0.34115761518478394, "step": 2694 }, { "epoch": 0.4167794316644114, "grad_norm": 6.2711896896362305, "learning_rate": 4.783766754496506e-06, "logits/chosen": 6.283385276794434, "logits/rejected": 4.548773288726807, "logps/chosen": -265.0761413574219, "logps/rejected": -314.5213623046875, "loss": 0.7434, "rewards/accuracies": 0.375, "rewards/chosen": 0.05093931779265404, "rewards/margins": -0.032683007419109344, "rewards/rejected": 0.08362233638763428, "step": 2695 }, { "epoch": 0.41693408080417554, "grad_norm": 5.825169563293457, "learning_rate": 4.783480352846833e-06, "logits/chosen": 3.1713123321533203, "logits/rejected": 4.2689971923828125, "logps/chosen": -262.2401123046875, "logps/rejected": -265.037109375, "loss": 0.5856, "rewards/accuracies": 0.5, "rewards/chosen": -0.14730964601039886, "rewards/margins": 0.2898842692375183, "rewards/rejected": -0.4371938705444336, "step": 2696 }, { "epoch": 0.4170887299439397, "grad_norm": 5.386709213256836, "learning_rate": 4.78319395119716e-06, "logits/chosen": 6.719409465789795, "logits/rejected": 5.37785530090332, "logps/chosen": -203.09637451171875, "logps/rejected": -224.99574279785156, "loss": 0.7086, "rewards/accuracies": 0.625, "rewards/chosen": -0.10430718958377838, "rewards/margins": 0.03172749653458595, "rewards/rejected": -0.13603466749191284, "step": 2697 }, { "epoch": 0.41724337908370385, "grad_norm": 6.9432525634765625, "learning_rate": 4.782907549547485e-06, "logits/chosen": 8.518115043640137, "logits/rejected": 4.223296642303467, "logps/chosen": -364.51220703125, "logps/rejected": -264.8878479003906, "loss": 0.6808, "rewards/accuracies": 0.5, "rewards/chosen": 0.020981982350349426, "rewards/margins": 0.1203949898481369, "rewards/rejected": -0.09941300749778748, "step": 2698 }, { "epoch": 0.417398028223468, "grad_norm": 3.7915279865264893, "learning_rate": 4.782621147897812e-06, "logits/chosen": 4.608911991119385, "logits/rejected": -3.6935369968414307, "logps/chosen": -225.62054443359375, "logps/rejected": -175.47084045410156, "loss": 0.4539, "rewards/accuracies": 0.875, "rewards/chosen": 0.11719133704900742, "rewards/margins": 0.7321680784225464, "rewards/rejected": -0.6149767637252808, "step": 2699 }, { "epoch": 0.41755267736323215, "grad_norm": 5.045612335205078, "learning_rate": 4.782334746248139e-06, "logits/chosen": 6.090024471282959, "logits/rejected": 5.915399551391602, "logps/chosen": -267.00274658203125, "logps/rejected": -267.0867004394531, "loss": 0.5902, "rewards/accuracies": 0.75, "rewards/chosen": -0.056253623217344284, "rewards/margins": 0.2599541246891022, "rewards/rejected": -0.31620773673057556, "step": 2700 }, { "epoch": 0.4177073265029963, "grad_norm": 6.319419860839844, "learning_rate": 4.782048344598465e-06, "logits/chosen": 11.35316276550293, "logits/rejected": -1.763852596282959, "logps/chosen": -402.2293395996094, "logps/rejected": -195.75038146972656, "loss": 0.6435, "rewards/accuracies": 0.75, "rewards/chosen": -0.2728157937526703, "rewards/margins": 0.25665366649627686, "rewards/rejected": -0.5294694900512695, "step": 2701 }, { "epoch": 0.4178619756427605, "grad_norm": 6.286221504211426, "learning_rate": 4.781761942948792e-06, "logits/chosen": 8.281381607055664, "logits/rejected": 7.197473526000977, "logps/chosen": -289.343017578125, "logps/rejected": -248.4585723876953, "loss": 0.6846, "rewards/accuracies": 0.375, "rewards/chosen": -0.14469853043556213, "rewards/margins": 0.046721309423446655, "rewards/rejected": -0.1914198398590088, "step": 2702 }, { "epoch": 0.41801662478252466, "grad_norm": 5.062898635864258, "learning_rate": 4.781475541299118e-06, "logits/chosen": 11.330883026123047, "logits/rejected": -0.18748599290847778, "logps/chosen": -462.5653381347656, "logps/rejected": -245.14747619628906, "loss": 0.5265, "rewards/accuracies": 0.625, "rewards/chosen": 0.49622058868408203, "rewards/margins": 0.517587423324585, "rewards/rejected": -0.02136683464050293, "step": 2703 }, { "epoch": 0.4181712739222888, "grad_norm": 4.331087112426758, "learning_rate": 4.7811891396494445e-06, "logits/chosen": 12.954935073852539, "logits/rejected": 13.768539428710938, "logps/chosen": -214.82540893554688, "logps/rejected": -249.99374389648438, "loss": 0.6415, "rewards/accuracies": 0.625, "rewards/chosen": 0.03949194401502609, "rewards/margins": 0.11332221329212189, "rewards/rejected": -0.0738302618265152, "step": 2704 }, { "epoch": 0.41832592306205296, "grad_norm": 4.418230056762695, "learning_rate": 4.780902737999771e-06, "logits/chosen": 10.763041496276855, "logits/rejected": 8.676929473876953, "logps/chosen": -308.13031005859375, "logps/rejected": -288.34442138671875, "loss": 0.4531, "rewards/accuracies": 1.0, "rewards/chosen": 0.13142453134059906, "rewards/margins": 0.5743554830551147, "rewards/rejected": -0.44293099641799927, "step": 2705 }, { "epoch": 0.4184805722018171, "grad_norm": 5.628113269805908, "learning_rate": 4.780616336350098e-06, "logits/chosen": 8.764001846313477, "logits/rejected": 6.977086544036865, "logps/chosen": -185.44674682617188, "logps/rejected": -193.530517578125, "loss": 0.8034, "rewards/accuracies": 0.375, "rewards/chosen": -0.2779386639595032, "rewards/margins": -0.16259542107582092, "rewards/rejected": -0.11534324288368225, "step": 2706 }, { "epoch": 0.41863522134158127, "grad_norm": 4.679571628570557, "learning_rate": 4.780329934700424e-06, "logits/chosen": 14.376748085021973, "logits/rejected": 12.397417068481445, "logps/chosen": -323.1869812011719, "logps/rejected": -276.1672668457031, "loss": 0.6319, "rewards/accuracies": 0.625, "rewards/chosen": 0.20652762055397034, "rewards/margins": 0.2234170138835907, "rewards/rejected": -0.01688939332962036, "step": 2707 }, { "epoch": 0.41878987048134547, "grad_norm": 5.90140438079834, "learning_rate": 4.78004353305075e-06, "logits/chosen": 14.55916976928711, "logits/rejected": 12.281283378601074, "logps/chosen": -320.0032043457031, "logps/rejected": -313.7568664550781, "loss": 0.729, "rewards/accuracies": 0.625, "rewards/chosen": -0.14047031104564667, "rewards/margins": 0.04747982323169708, "rewards/rejected": -0.18795014917850494, "step": 2708 }, { "epoch": 0.4189445196211096, "grad_norm": 5.6167473793029785, "learning_rate": 4.779757131401077e-06, "logits/chosen": 9.743917465209961, "logits/rejected": 7.455842971801758, "logps/chosen": -329.2607727050781, "logps/rejected": -256.277587890625, "loss": 0.605, "rewards/accuracies": 0.625, "rewards/chosen": -0.0670848935842514, "rewards/margins": 0.254169762134552, "rewards/rejected": -0.3212546408176422, "step": 2709 }, { "epoch": 0.4190991687608738, "grad_norm": 9.54301643371582, "learning_rate": 4.7794707297514035e-06, "logits/chosen": 9.22585678100586, "logits/rejected": 6.81712532043457, "logps/chosen": -280.6036071777344, "logps/rejected": -287.950927734375, "loss": 0.5549, "rewards/accuracies": 0.625, "rewards/chosen": 0.21857662498950958, "rewards/margins": 0.42286786437034607, "rewards/rejected": -0.2042911946773529, "step": 2710 }, { "epoch": 0.4192538179006379, "grad_norm": 7.372808456420898, "learning_rate": 4.77918432810173e-06, "logits/chosen": 10.510807991027832, "logits/rejected": 6.736120223999023, "logps/chosen": -263.6864013671875, "logps/rejected": -315.4885559082031, "loss": 0.8089, "rewards/accuracies": 0.5, "rewards/chosen": -0.11675453186035156, "rewards/margins": -0.10326775908470154, "rewards/rejected": -0.013486750423908234, "step": 2711 }, { "epoch": 0.4194084670404021, "grad_norm": 5.351847171783447, "learning_rate": 4.778897926452057e-06, "logits/chosen": 12.728840827941895, "logits/rejected": 9.63144588470459, "logps/chosen": -285.2611083984375, "logps/rejected": -231.7850341796875, "loss": 0.7402, "rewards/accuracies": 0.375, "rewards/chosen": 0.2631918787956238, "rewards/margins": -0.04113469272851944, "rewards/rejected": 0.304326593875885, "step": 2712 }, { "epoch": 0.41956311618016623, "grad_norm": 6.6852803230285645, "learning_rate": 4.7786115248023835e-06, "logits/chosen": -0.5082281827926636, "logits/rejected": 2.6910643577575684, "logps/chosen": -205.9815216064453, "logps/rejected": -227.4179229736328, "loss": 0.5989, "rewards/accuracies": 0.625, "rewards/chosen": 0.10439737141132355, "rewards/margins": 0.257002055644989, "rewards/rejected": -0.15260466933250427, "step": 2713 }, { "epoch": 0.4197177653199304, "grad_norm": 4.424941062927246, "learning_rate": 4.77832512315271e-06, "logits/chosen": 14.498956680297852, "logits/rejected": 12.126620292663574, "logps/chosen": -296.50799560546875, "logps/rejected": -249.41639709472656, "loss": 0.4657, "rewards/accuracies": 0.75, "rewards/chosen": 0.2681650221347809, "rewards/margins": 0.7185261249542236, "rewards/rejected": -0.45036107301712036, "step": 2714 }, { "epoch": 0.4198724144596946, "grad_norm": 8.645773887634277, "learning_rate": 4.778038721503036e-06, "logits/chosen": 7.685153961181641, "logits/rejected": 8.528714179992676, "logps/chosen": -344.1767578125, "logps/rejected": -278.14825439453125, "loss": 0.8658, "rewards/accuracies": 0.5, "rewards/chosen": -0.5705689191818237, "rewards/margins": -0.10321656614542007, "rewards/rejected": -0.46735236048698425, "step": 2715 }, { "epoch": 0.42002706359945874, "grad_norm": 5.718757629394531, "learning_rate": 4.7777523198533626e-06, "logits/chosen": 11.820518493652344, "logits/rejected": 7.936539173126221, "logps/chosen": -295.724853515625, "logps/rejected": -233.44459533691406, "loss": 0.7412, "rewards/accuracies": 0.625, "rewards/chosen": -0.09451419115066528, "rewards/margins": -0.010176517069339752, "rewards/rejected": -0.08433766663074493, "step": 2716 }, { "epoch": 0.4201817127392229, "grad_norm": 8.840944290161133, "learning_rate": 4.777465918203689e-06, "logits/chosen": 11.691082000732422, "logits/rejected": 4.844560146331787, "logps/chosen": -317.56500244140625, "logps/rejected": -262.15966796875, "loss": 0.5694, "rewards/accuracies": 0.625, "rewards/chosen": 0.23842407763004303, "rewards/margins": 0.3859137296676636, "rewards/rejected": -0.14748963713645935, "step": 2717 }, { "epoch": 0.42033636187898704, "grad_norm": 4.4735493659973145, "learning_rate": 4.777179516554016e-06, "logits/chosen": 14.208394050598145, "logits/rejected": 6.30283784866333, "logps/chosen": -357.7921142578125, "logps/rejected": -321.6224060058594, "loss": 0.4229, "rewards/accuracies": 0.875, "rewards/chosen": 0.18252897262573242, "rewards/margins": 0.895096480846405, "rewards/rejected": -0.7125673890113831, "step": 2718 }, { "epoch": 0.4204910110187512, "grad_norm": 5.160461902618408, "learning_rate": 4.7768931149043425e-06, "logits/chosen": 7.7447052001953125, "logits/rejected": 2.7460551261901855, "logps/chosen": -241.39859008789062, "logps/rejected": -226.61546325683594, "loss": 0.5778, "rewards/accuracies": 0.875, "rewards/chosen": -0.13145749270915985, "rewards/margins": 0.2700778543949127, "rewards/rejected": -0.4015353322029114, "step": 2719 }, { "epoch": 0.42064566015851534, "grad_norm": 5.89119815826416, "learning_rate": 4.776606713254669e-06, "logits/chosen": 9.629146575927734, "logits/rejected": 12.043877601623535, "logps/chosen": -205.89869689941406, "logps/rejected": -200.8662872314453, "loss": 0.8836, "rewards/accuracies": 0.125, "rewards/chosen": -0.3858489990234375, "rewards/margins": -0.2856106758117676, "rewards/rejected": -0.10023832321166992, "step": 2720 }, { "epoch": 0.42080030929827955, "grad_norm": 8.336956024169922, "learning_rate": 4.776320311604995e-06, "logits/chosen": 6.635457992553711, "logits/rejected": 4.65915584564209, "logps/chosen": -427.4053039550781, "logps/rejected": -474.78729248046875, "loss": 0.5444, "rewards/accuracies": 0.75, "rewards/chosen": 0.3576936721801758, "rewards/margins": 0.3945466876029968, "rewards/rejected": -0.03685302287340164, "step": 2721 }, { "epoch": 0.4209549584380437, "grad_norm": 5.169402599334717, "learning_rate": 4.776033909955322e-06, "logits/chosen": 7.739950180053711, "logits/rejected": 5.9468278884887695, "logps/chosen": -302.6673583984375, "logps/rejected": -233.45791625976562, "loss": 0.589, "rewards/accuracies": 0.625, "rewards/chosen": 0.024704553186893463, "rewards/margins": 0.2438000738620758, "rewards/rejected": -0.21909551322460175, "step": 2722 }, { "epoch": 0.42110960757780785, "grad_norm": 4.91262674331665, "learning_rate": 4.775747508305648e-06, "logits/chosen": 12.377195358276367, "logits/rejected": 12.8310546875, "logps/chosen": -260.36761474609375, "logps/rejected": -311.017333984375, "loss": 0.6212, "rewards/accuracies": 0.375, "rewards/chosen": 0.2523934245109558, "rewards/margins": 0.28162336349487305, "rewards/rejected": -0.02922992780804634, "step": 2723 }, { "epoch": 0.421264256717572, "grad_norm": 6.092989921569824, "learning_rate": 4.775461106655975e-06, "logits/chosen": 9.967718124389648, "logits/rejected": 7.425543785095215, "logps/chosen": -416.6290283203125, "logps/rejected": -356.41510009765625, "loss": 0.6552, "rewards/accuracies": 0.625, "rewards/chosen": 0.1927390992641449, "rewards/margins": 0.1469866931438446, "rewards/rejected": 0.04575243219733238, "step": 2724 }, { "epoch": 0.42141890585733616, "grad_norm": 3.4592785835266113, "learning_rate": 4.775174705006302e-06, "logits/chosen": 9.121768951416016, "logits/rejected": 3.0271830558776855, "logps/chosen": -259.8033447265625, "logps/rejected": -153.08914184570312, "loss": 0.5117, "rewards/accuracies": 0.625, "rewards/chosen": 0.06965846568346024, "rewards/margins": 0.5046188235282898, "rewards/rejected": -0.43496036529541016, "step": 2725 }, { "epoch": 0.4215735549971003, "grad_norm": 4.231537818908691, "learning_rate": 4.774888303356628e-06, "logits/chosen": 8.927241325378418, "logits/rejected": 3.7615315914154053, "logps/chosen": -255.95484924316406, "logps/rejected": -205.25143432617188, "loss": 0.5355, "rewards/accuracies": 0.625, "rewards/chosen": 0.3888920843601227, "rewards/margins": 0.5090867280960083, "rewards/rejected": -0.12019462883472443, "step": 2726 }, { "epoch": 0.4217282041368645, "grad_norm": 4.578822612762451, "learning_rate": 4.774601901706955e-06, "logits/chosen": 13.02111530303955, "logits/rejected": 12.118375778198242, "logps/chosen": -377.34454345703125, "logps/rejected": -291.754150390625, "loss": 0.4507, "rewards/accuracies": 0.875, "rewards/chosen": 0.3327385187149048, "rewards/margins": 0.7138979434967041, "rewards/rejected": -0.3811594247817993, "step": 2727 }, { "epoch": 0.42188285327662867, "grad_norm": 4.192107677459717, "learning_rate": 4.774315500057281e-06, "logits/chosen": 11.796497344970703, "logits/rejected": 3.850597381591797, "logps/chosen": -337.1534118652344, "logps/rejected": -192.74920654296875, "loss": 0.492, "rewards/accuracies": 0.625, "rewards/chosen": 0.25665825605392456, "rewards/margins": 0.7990729212760925, "rewards/rejected": -0.542414665222168, "step": 2728 }, { "epoch": 0.4220375024163928, "grad_norm": 6.299352645874023, "learning_rate": 4.774029098407607e-06, "logits/chosen": 6.67086124420166, "logits/rejected": 5.477456569671631, "logps/chosen": -231.43360900878906, "logps/rejected": -253.68856811523438, "loss": 0.693, "rewards/accuracies": 0.75, "rewards/chosen": -0.16059312224388123, "rewards/margins": 0.030339710414409637, "rewards/rejected": -0.19093284010887146, "step": 2729 }, { "epoch": 0.42219215155615697, "grad_norm": 5.316195964813232, "learning_rate": 4.773742696757934e-06, "logits/chosen": 7.674101829528809, "logits/rejected": 6.588985919952393, "logps/chosen": -246.37860107421875, "logps/rejected": -202.45803833007812, "loss": 0.6541, "rewards/accuracies": 0.75, "rewards/chosen": 0.1592136025428772, "rewards/margins": 0.2028958797454834, "rewards/rejected": -0.043682292103767395, "step": 2730 }, { "epoch": 0.4223468006959211, "grad_norm": 7.087975025177002, "learning_rate": 4.773456295108261e-06, "logits/chosen": 8.891820907592773, "logits/rejected": -0.0876539945602417, "logps/chosen": -336.699951171875, "logps/rejected": -224.86866760253906, "loss": 0.828, "rewards/accuracies": 0.5, "rewards/chosen": -0.07742289453744888, "rewards/margins": -0.12086465954780579, "rewards/rejected": 0.0434417761862278, "step": 2731 }, { "epoch": 0.42250144983568527, "grad_norm": 4.648256301879883, "learning_rate": 4.7731698934585864e-06, "logits/chosen": 8.062427520751953, "logits/rejected": 7.9964447021484375, "logps/chosen": -209.71705627441406, "logps/rejected": -194.92779541015625, "loss": 0.546, "rewards/accuracies": 0.875, "rewards/chosen": 0.24536506831645966, "rewards/margins": 0.35593506693840027, "rewards/rejected": -0.11057000607252121, "step": 2732 }, { "epoch": 0.4226560989754494, "grad_norm": 6.438325881958008, "learning_rate": 4.772883491808913e-06, "logits/chosen": 10.194450378417969, "logits/rejected": 6.947749137878418, "logps/chosen": -349.1863098144531, "logps/rejected": -264.14288330078125, "loss": 0.6243, "rewards/accuracies": 0.625, "rewards/chosen": 0.11034897714853287, "rewards/margins": 0.2750949561595917, "rewards/rejected": -0.16474595665931702, "step": 2733 }, { "epoch": 0.42281074811521363, "grad_norm": 7.391777038574219, "learning_rate": 4.77259709015924e-06, "logits/chosen": 9.822986602783203, "logits/rejected": 10.895857810974121, "logps/chosen": -327.1275634765625, "logps/rejected": -291.96185302734375, "loss": 0.8514, "rewards/accuracies": 0.625, "rewards/chosen": -0.07280893623828888, "rewards/margins": -0.06047758460044861, "rewards/rejected": -0.012331336736679077, "step": 2734 }, { "epoch": 0.4229653972549778, "grad_norm": 7.118979454040527, "learning_rate": 4.772310688509566e-06, "logits/chosen": 9.729959487915039, "logits/rejected": 7.374753475189209, "logps/chosen": -159.21572875976562, "logps/rejected": -147.21315002441406, "loss": 0.6117, "rewards/accuracies": 0.625, "rewards/chosen": -0.05866682529449463, "rewards/margins": 0.341379314661026, "rewards/rejected": -0.400046169757843, "step": 2735 }, { "epoch": 0.42312004639474193, "grad_norm": 7.149129390716553, "learning_rate": 4.772024286859892e-06, "logits/chosen": 1.4898169040679932, "logits/rejected": 4.360662460327148, "logps/chosen": -315.31103515625, "logps/rejected": -289.0843505859375, "loss": 0.795, "rewards/accuracies": 0.375, "rewards/chosen": 0.12909287214279175, "rewards/margins": -0.0901762917637825, "rewards/rejected": 0.21926920115947723, "step": 2736 }, { "epoch": 0.4232746955345061, "grad_norm": 4.002818584442139, "learning_rate": 4.771737885210219e-06, "logits/chosen": 13.609821319580078, "logits/rejected": 11.763874053955078, "logps/chosen": -232.28904724121094, "logps/rejected": -215.25314331054688, "loss": 0.5194, "rewards/accuracies": 0.875, "rewards/chosen": 0.07355241477489471, "rewards/margins": 0.41456544399261475, "rewards/rejected": -0.3410130739212036, "step": 2737 }, { "epoch": 0.42342934467427024, "grad_norm": 6.173304557800293, "learning_rate": 4.7714514835605455e-06, "logits/chosen": 7.67469596862793, "logits/rejected": 6.299157619476318, "logps/chosen": -343.06353759765625, "logps/rejected": -363.572998046875, "loss": 0.6818, "rewards/accuracies": 0.75, "rewards/chosen": 0.04554472118616104, "rewards/margins": 0.09363280236721039, "rewards/rejected": -0.04808807373046875, "step": 2738 }, { "epoch": 0.4235839938140344, "grad_norm": 16.468717575073242, "learning_rate": 4.771165081910872e-06, "logits/chosen": 7.102536201477051, "logits/rejected": 5.167606830596924, "logps/chosen": -300.58056640625, "logps/rejected": -406.4028015136719, "loss": 0.9323, "rewards/accuracies": 0.5, "rewards/chosen": -0.28924620151519775, "rewards/margins": -0.31365644931793213, "rewards/rejected": 0.024410255253314972, "step": 2739 }, { "epoch": 0.4237386429537986, "grad_norm": 4.076026439666748, "learning_rate": 4.770878680261199e-06, "logits/chosen": 5.457551956176758, "logits/rejected": 7.410901069641113, "logps/chosen": -210.39723205566406, "logps/rejected": -216.5321807861328, "loss": 0.5563, "rewards/accuracies": 0.75, "rewards/chosen": -0.22935137152671814, "rewards/margins": 0.40673646330833435, "rewards/rejected": -0.6360877752304077, "step": 2740 }, { "epoch": 0.42389329209356275, "grad_norm": 5.155257225036621, "learning_rate": 4.770592278611525e-06, "logits/chosen": 12.875899314880371, "logits/rejected": 6.465787887573242, "logps/chosen": -288.6136779785156, "logps/rejected": -194.6484832763672, "loss": 0.6288, "rewards/accuracies": 0.625, "rewards/chosen": -0.08096771687269211, "rewards/margins": 0.21717016398906708, "rewards/rejected": -0.2981378734111786, "step": 2741 }, { "epoch": 0.4240479412333269, "grad_norm": 4.6979804039001465, "learning_rate": 4.770305876961851e-06, "logits/chosen": 8.04547119140625, "logits/rejected": 8.203258514404297, "logps/chosen": -443.29681396484375, "logps/rejected": -622.5830078125, "loss": 0.415, "rewards/accuracies": 0.875, "rewards/chosen": 0.5322229266166687, "rewards/margins": 0.7668886780738831, "rewards/rejected": -0.23466576635837555, "step": 2742 }, { "epoch": 0.42420259037309105, "grad_norm": 4.5598673820495605, "learning_rate": 4.770019475312178e-06, "logits/chosen": 5.1866135597229, "logits/rejected": 10.363449096679688, "logps/chosen": -173.93594360351562, "logps/rejected": -195.01580810546875, "loss": 0.7764, "rewards/accuracies": 0.25, "rewards/chosen": -0.375881552696228, "rewards/margins": -0.12658080458641052, "rewards/rejected": -0.24930071830749512, "step": 2743 }, { "epoch": 0.4243572395128552, "grad_norm": 4.098367691040039, "learning_rate": 4.7697330736625045e-06, "logits/chosen": 10.002933502197266, "logits/rejected": 8.631038665771484, "logps/chosen": -159.1553192138672, "logps/rejected": -164.67227172851562, "loss": 0.6615, "rewards/accuracies": 0.75, "rewards/chosen": -0.03972359746694565, "rewards/margins": 0.12288478761911392, "rewards/rejected": -0.16260838508605957, "step": 2744 }, { "epoch": 0.42451188865261935, "grad_norm": 5.395009994506836, "learning_rate": 4.769446672012831e-06, "logits/chosen": 5.2226338386535645, "logits/rejected": 2.028899908065796, "logps/chosen": -316.1319885253906, "logps/rejected": -284.5406494140625, "loss": 0.5824, "rewards/accuracies": 0.75, "rewards/chosen": -0.4411565661430359, "rewards/margins": 0.27461639046669006, "rewards/rejected": -0.7157729864120483, "step": 2745 }, { "epoch": 0.4246665377923835, "grad_norm": 4.257770538330078, "learning_rate": 4.769160270363158e-06, "logits/chosen": 8.514404296875, "logits/rejected": 9.623875617980957, "logps/chosen": -272.18939208984375, "logps/rejected": -290.9097595214844, "loss": 0.6206, "rewards/accuracies": 0.5, "rewards/chosen": 0.10785437375307083, "rewards/margins": 0.306676983833313, "rewards/rejected": -0.19882263243198395, "step": 2746 }, { "epoch": 0.4248211869321477, "grad_norm": 9.1245698928833, "learning_rate": 4.7688738687134845e-06, "logits/chosen": 11.109613418579102, "logits/rejected": 5.702695846557617, "logps/chosen": -410.4515075683594, "logps/rejected": -320.5570983886719, "loss": 0.7255, "rewards/accuracies": 0.5, "rewards/chosen": 0.06999436020851135, "rewards/margins": 0.008247099816799164, "rewards/rejected": 0.06174727529287338, "step": 2747 }, { "epoch": 0.42497583607191186, "grad_norm": 3.340402126312256, "learning_rate": 4.76858746706381e-06, "logits/chosen": 9.341480255126953, "logits/rejected": 5.266529083251953, "logps/chosen": -221.44529724121094, "logps/rejected": -157.51185607910156, "loss": 0.4104, "rewards/accuracies": 1.0, "rewards/chosen": -0.2109074592590332, "rewards/margins": 0.7160725593566895, "rewards/rejected": -0.9269800186157227, "step": 2748 }, { "epoch": 0.425130485211676, "grad_norm": 7.31988525390625, "learning_rate": 4.768301065414137e-06, "logits/chosen": 11.002527236938477, "logits/rejected": 8.636887550354004, "logps/chosen": -331.5923156738281, "logps/rejected": -307.7025146484375, "loss": 0.8956, "rewards/accuracies": 0.5, "rewards/chosen": -0.3867785930633545, "rewards/margins": -0.22611822187900543, "rewards/rejected": -0.16066037118434906, "step": 2749 }, { "epoch": 0.42528513435144016, "grad_norm": 5.898653030395508, "learning_rate": 4.768014663764464e-06, "logits/chosen": 13.53886604309082, "logits/rejected": 8.136905670166016, "logps/chosen": -326.3847961425781, "logps/rejected": -262.4565124511719, "loss": 0.644, "rewards/accuracies": 0.75, "rewards/chosen": -0.40300822257995605, "rewards/margins": 0.22316938638687134, "rewards/rejected": -0.6261776089668274, "step": 2750 }, { "epoch": 0.4254397834912043, "grad_norm": 5.246368885040283, "learning_rate": 4.76772826211479e-06, "logits/chosen": 8.735239028930664, "logits/rejected": 4.377988815307617, "logps/chosen": -263.0289306640625, "logps/rejected": -202.62985229492188, "loss": 0.5937, "rewards/accuracies": 0.875, "rewards/chosen": -0.18815650045871735, "rewards/margins": 0.24281644821166992, "rewards/rejected": -0.43097296357154846, "step": 2751 }, { "epoch": 0.42559443263096847, "grad_norm": 6.4957451820373535, "learning_rate": 4.767441860465117e-06, "logits/chosen": 14.04420280456543, "logits/rejected": 8.219314575195312, "logps/chosen": -389.6460876464844, "logps/rejected": -237.43838500976562, "loss": 0.7016, "rewards/accuracies": 0.625, "rewards/chosen": -0.16693763434886932, "rewards/margins": 0.04170088469982147, "rewards/rejected": -0.208638533949852, "step": 2752 }, { "epoch": 0.4257490817707327, "grad_norm": 8.037784576416016, "learning_rate": 4.7671554588154436e-06, "logits/chosen": 14.647222518920898, "logits/rejected": 9.967000007629395, "logps/chosen": -342.6969909667969, "logps/rejected": -270.9526672363281, "loss": 0.8012, "rewards/accuracies": 0.5, "rewards/chosen": -0.3012911379337311, "rewards/margins": 0.06553798168897629, "rewards/rejected": -0.3668290972709656, "step": 2753 }, { "epoch": 0.4259037309104968, "grad_norm": 5.5696797370910645, "learning_rate": 4.766869057165769e-06, "logits/chosen": 3.4403927326202393, "logits/rejected": -1.1014904975891113, "logps/chosen": -204.20413208007812, "logps/rejected": -120.4216537475586, "loss": 0.707, "rewards/accuracies": 0.625, "rewards/chosen": -0.5116927623748779, "rewards/margins": 0.09660043567419052, "rewards/rejected": -0.6082931756973267, "step": 2754 }, { "epoch": 0.426058380050261, "grad_norm": 5.144669055938721, "learning_rate": 4.766582655516096e-06, "logits/chosen": 14.354291915893555, "logits/rejected": 13.039499282836914, "logps/chosen": -303.9670104980469, "logps/rejected": -264.73101806640625, "loss": 0.684, "rewards/accuracies": 0.5, "rewards/chosen": -0.30414876341819763, "rewards/margins": 0.07078682631254196, "rewards/rejected": -0.3749356269836426, "step": 2755 }, { "epoch": 0.42621302919002513, "grad_norm": 6.493957996368408, "learning_rate": 4.766296253866423e-06, "logits/chosen": 6.75232458114624, "logits/rejected": 8.511398315429688, "logps/chosen": -335.8753356933594, "logps/rejected": -300.45843505859375, "loss": 0.7413, "rewards/accuracies": 0.25, "rewards/chosen": -0.0030376510694622993, "rewards/margins": -0.026888374239206314, "rewards/rejected": 0.023850727826356888, "step": 2756 }, { "epoch": 0.4263676783297893, "grad_norm": 7.644586086273193, "learning_rate": 4.766009852216749e-06, "logits/chosen": 10.150111198425293, "logits/rejected": 7.354369640350342, "logps/chosen": -292.9273681640625, "logps/rejected": -298.7638854980469, "loss": 0.7193, "rewards/accuracies": 0.5, "rewards/chosen": -0.14902573823928833, "rewards/margins": 0.040667399764060974, "rewards/rejected": -0.1896931231021881, "step": 2757 }, { "epoch": 0.42652232746955343, "grad_norm": 7.182144641876221, "learning_rate": 4.765723450567076e-06, "logits/chosen": 1.1237726211547852, "logits/rejected": 8.013139724731445, "logps/chosen": -141.20968627929688, "logps/rejected": -239.74183654785156, "loss": 0.8645, "rewards/accuracies": 0.625, "rewards/chosen": -0.21673187613487244, "rewards/margins": -0.17473876476287842, "rewards/rejected": -0.041993118822574615, "step": 2758 }, { "epoch": 0.42667697660931764, "grad_norm": 7.0834197998046875, "learning_rate": 4.765437048917403e-06, "logits/chosen": 10.693586349487305, "logits/rejected": 2.623507022857666, "logps/chosen": -400.997314453125, "logps/rejected": -294.77459716796875, "loss": 0.5706, "rewards/accuracies": 0.75, "rewards/chosen": 0.3296867609024048, "rewards/margins": 0.3362666368484497, "rewards/rejected": -0.006579883396625519, "step": 2759 }, { "epoch": 0.4268316257490818, "grad_norm": 5.904839515686035, "learning_rate": 4.765150647267729e-06, "logits/chosen": 12.633384704589844, "logits/rejected": 8.203807830810547, "logps/chosen": -336.92047119140625, "logps/rejected": -308.80755615234375, "loss": 0.6768, "rewards/accuracies": 0.5, "rewards/chosen": -0.09725838899612427, "rewards/margins": 0.06765101104974747, "rewards/rejected": -0.16490939259529114, "step": 2760 }, { "epoch": 0.42698627488884594, "grad_norm": 5.89691686630249, "learning_rate": 4.764864245618055e-06, "logits/chosen": 7.833645343780518, "logits/rejected": 9.76827335357666, "logps/chosen": -285.94732666015625, "logps/rejected": -266.9113464355469, "loss": 0.5762, "rewards/accuracies": 0.625, "rewards/chosen": 0.004704661667346954, "rewards/margins": 0.28045085072517395, "rewards/rejected": -0.2757461965084076, "step": 2761 }, { "epoch": 0.4271409240286101, "grad_norm": 9.509416580200195, "learning_rate": 4.764577843968382e-06, "logits/chosen": 7.200796604156494, "logits/rejected": 5.4527153968811035, "logps/chosen": -250.1338348388672, "logps/rejected": -241.51318359375, "loss": 0.8927, "rewards/accuracies": 0.25, "rewards/chosen": -0.21443305909633636, "rewards/margins": -0.30618977546691895, "rewards/rejected": 0.09175673872232437, "step": 2762 }, { "epoch": 0.42729557316837424, "grad_norm": 7.374832630157471, "learning_rate": 4.764291442318708e-06, "logits/chosen": 7.869853496551514, "logits/rejected": 13.285655975341797, "logps/chosen": -254.0676727294922, "logps/rejected": -334.44476318359375, "loss": 0.7752, "rewards/accuracies": 0.75, "rewards/chosen": -0.10692453384399414, "rewards/margins": 0.12878704071044922, "rewards/rejected": -0.23571158945560455, "step": 2763 }, { "epoch": 0.4274502223081384, "grad_norm": 3.639901638031006, "learning_rate": 4.764005040669035e-06, "logits/chosen": 6.623315811157227, "logits/rejected": 3.7466959953308105, "logps/chosen": -162.18270874023438, "logps/rejected": -151.36611938476562, "loss": 0.6373, "rewards/accuracies": 0.5, "rewards/chosen": 0.0015748068690299988, "rewards/margins": 0.17304383218288422, "rewards/rejected": -0.17146901786327362, "step": 2764 }, { "epoch": 0.42760487144790255, "grad_norm": 15.84953498840332, "learning_rate": 4.763718639019362e-06, "logits/chosen": 10.884163856506348, "logits/rejected": 7.472978115081787, "logps/chosen": -397.96728515625, "logps/rejected": -329.92913818359375, "loss": 0.5627, "rewards/accuracies": 0.625, "rewards/chosen": 0.07396392524242401, "rewards/margins": 0.39607954025268555, "rewards/rejected": -0.32211560010910034, "step": 2765 }, { "epoch": 0.42775952058766675, "grad_norm": 4.597046375274658, "learning_rate": 4.7634322373696875e-06, "logits/chosen": 8.468798637390137, "logits/rejected": 5.423861503601074, "logps/chosen": -267.7794494628906, "logps/rejected": -226.85537719726562, "loss": 0.4637, "rewards/accuracies": 1.0, "rewards/chosen": 0.2799021005630493, "rewards/margins": 0.6052736639976501, "rewards/rejected": -0.32537156343460083, "step": 2766 }, { "epoch": 0.4279141697274309, "grad_norm": 6.724148273468018, "learning_rate": 4.763145835720014e-06, "logits/chosen": 8.814096450805664, "logits/rejected": 9.682229995727539, "logps/chosen": -223.65939331054688, "logps/rejected": -263.681396484375, "loss": 0.7809, "rewards/accuracies": 0.375, "rewards/chosen": -0.246595099568367, "rewards/margins": -0.036822013556957245, "rewards/rejected": -0.20977307856082916, "step": 2767 }, { "epoch": 0.42806881886719506, "grad_norm": 6.475399494171143, "learning_rate": 4.762859434070341e-06, "logits/chosen": 6.6114068031311035, "logits/rejected": 9.897765159606934, "logps/chosen": -256.59771728515625, "logps/rejected": -311.9908447265625, "loss": 0.7774, "rewards/accuracies": 0.625, "rewards/chosen": -0.36203208565711975, "rewards/margins": -0.07748544216156006, "rewards/rejected": -0.2845466434955597, "step": 2768 }, { "epoch": 0.4282234680069592, "grad_norm": 4.478710651397705, "learning_rate": 4.762573032420667e-06, "logits/chosen": 5.682971954345703, "logits/rejected": 4.280372619628906, "logps/chosen": -219.49554443359375, "logps/rejected": -241.3962860107422, "loss": 0.6291, "rewards/accuracies": 0.5, "rewards/chosen": -0.15944606065750122, "rewards/margins": 0.16063222289085388, "rewards/rejected": -0.3200782835483551, "step": 2769 }, { "epoch": 0.42837811714672336, "grad_norm": 5.020727634429932, "learning_rate": 4.762286630770993e-06, "logits/chosen": 11.01222038269043, "logits/rejected": 9.80000114440918, "logps/chosen": -200.78025817871094, "logps/rejected": -194.8872528076172, "loss": 0.5965, "rewards/accuracies": 0.75, "rewards/chosen": -0.6253675818443298, "rewards/margins": 0.2561553418636322, "rewards/rejected": -0.8815228939056396, "step": 2770 }, { "epoch": 0.4285327662864875, "grad_norm": 6.611806869506836, "learning_rate": 4.76200022912132e-06, "logits/chosen": 7.793523788452148, "logits/rejected": 6.686113357543945, "logps/chosen": -297.10504150390625, "logps/rejected": -245.47140502929688, "loss": 0.7107, "rewards/accuracies": 0.5, "rewards/chosen": -0.05979418754577637, "rewards/margins": 0.03167299926280975, "rewards/rejected": -0.09146718680858612, "step": 2771 }, { "epoch": 0.4286874154262517, "grad_norm": 4.4195637702941895, "learning_rate": 4.7617138274716465e-06, "logits/chosen": 8.466676712036133, "logits/rejected": 8.881500244140625, "logps/chosen": -208.73162841796875, "logps/rejected": -234.2777557373047, "loss": 0.5949, "rewards/accuracies": 0.75, "rewards/chosen": 0.037338923662900925, "rewards/margins": 0.26364660263061523, "rewards/rejected": -0.22630766034126282, "step": 2772 }, { "epoch": 0.42884206456601587, "grad_norm": 4.510226726531982, "learning_rate": 4.761427425821973e-06, "logits/chosen": 4.968331813812256, "logits/rejected": 1.514676809310913, "logps/chosen": -172.02102661132812, "logps/rejected": -167.14883422851562, "loss": 0.5744, "rewards/accuracies": 0.625, "rewards/chosen": -0.45994484424591064, "rewards/margins": 0.28233641386032104, "rewards/rejected": -0.7422811985015869, "step": 2773 }, { "epoch": 0.42899671370578, "grad_norm": 7.163201808929443, "learning_rate": 4.761141024172299e-06, "logits/chosen": 8.524158477783203, "logits/rejected": 10.98208236694336, "logps/chosen": -208.82626342773438, "logps/rejected": -274.8758544921875, "loss": 0.7989, "rewards/accuracies": 0.375, "rewards/chosen": -0.32244065403938293, "rewards/margins": -0.13323114812374115, "rewards/rejected": -0.18920952081680298, "step": 2774 }, { "epoch": 0.42915136284554417, "grad_norm": 7.631000518798828, "learning_rate": 4.760854622522626e-06, "logits/chosen": 5.421298503875732, "logits/rejected": 11.113502502441406, "logps/chosen": -211.71424865722656, "logps/rejected": -270.81817626953125, "loss": 0.9257, "rewards/accuracies": 0.25, "rewards/chosen": -0.49365323781967163, "rewards/margins": -0.3654343783855438, "rewards/rejected": -0.1282188445329666, "step": 2775 }, { "epoch": 0.4293060119853083, "grad_norm": 5.42144775390625, "learning_rate": 4.760568220872952e-06, "logits/chosen": 11.68799114227295, "logits/rejected": 4.073337078094482, "logps/chosen": -258.0840759277344, "logps/rejected": -200.898681640625, "loss": 0.6138, "rewards/accuracies": 0.875, "rewards/chosen": 0.03412303328514099, "rewards/margins": 0.3629974126815796, "rewards/rejected": -0.3288743495941162, "step": 2776 }, { "epoch": 0.4294606611250725, "grad_norm": 8.292977333068848, "learning_rate": 4.760281819223279e-06, "logits/chosen": 17.479278564453125, "logits/rejected": 11.391890525817871, "logps/chosen": -438.58154296875, "logps/rejected": -329.58953857421875, "loss": 0.5731, "rewards/accuracies": 0.5, "rewards/chosen": 0.05540771037340164, "rewards/margins": 0.4602375328540802, "rewards/rejected": -0.40482980012893677, "step": 2777 }, { "epoch": 0.4296153102648366, "grad_norm": 9.318918228149414, "learning_rate": 4.759995417573606e-06, "logits/chosen": 13.366079330444336, "logits/rejected": 13.685798645019531, "logps/chosen": -386.84356689453125, "logps/rejected": -404.6522521972656, "loss": 0.9005, "rewards/accuracies": 0.125, "rewards/chosen": -0.30128782987594604, "rewards/margins": -0.3135948181152344, "rewards/rejected": 0.012306969612836838, "step": 2778 }, { "epoch": 0.42976995940460083, "grad_norm": 6.1490159034729, "learning_rate": 4.759709015923932e-06, "logits/chosen": 14.408186912536621, "logits/rejected": 10.164176940917969, "logps/chosen": -382.7894287109375, "logps/rejected": -365.68572998046875, "loss": 0.6623, "rewards/accuracies": 0.5, "rewards/chosen": -0.1661672592163086, "rewards/margins": 0.12371157854795456, "rewards/rejected": -0.28987884521484375, "step": 2779 }, { "epoch": 0.429924608544365, "grad_norm": 5.8148674964904785, "learning_rate": 4.759422614274258e-06, "logits/chosen": 5.316891193389893, "logits/rejected": 7.019569396972656, "logps/chosen": -225.36041259765625, "logps/rejected": -211.029296875, "loss": 0.6179, "rewards/accuracies": 0.625, "rewards/chosen": -0.021791886538267136, "rewards/margins": 0.19866633415222168, "rewards/rejected": -0.2204582393169403, "step": 2780 }, { "epoch": 0.43007925768412913, "grad_norm": 7.198339462280273, "learning_rate": 4.759136212624585e-06, "logits/chosen": 0.17101365327835083, "logits/rejected": 4.186741828918457, "logps/chosen": -231.4630126953125, "logps/rejected": -214.36569213867188, "loss": 0.889, "rewards/accuracies": 0.625, "rewards/chosen": -0.22137005627155304, "rewards/margins": -0.26349198818206787, "rewards/rejected": 0.04212189465761185, "step": 2781 }, { "epoch": 0.4302339068238933, "grad_norm": 4.7100419998168945, "learning_rate": 4.758849810974911e-06, "logits/chosen": 7.900914192199707, "logits/rejected": 1.3058967590332031, "logps/chosen": -265.0892333984375, "logps/rejected": -199.54608154296875, "loss": 0.5855, "rewards/accuracies": 0.875, "rewards/chosen": -0.014899879693984985, "rewards/margins": 0.41165053844451904, "rewards/rejected": -0.4265504479408264, "step": 2782 }, { "epoch": 0.43038855596365744, "grad_norm": 3.7495269775390625, "learning_rate": 4.758563409325238e-06, "logits/chosen": 10.084142684936523, "logits/rejected": 10.360098838806152, "logps/chosen": -258.8919982910156, "logps/rejected": -308.36767578125, "loss": 0.4645, "rewards/accuracies": 0.75, "rewards/chosen": -0.16352024674415588, "rewards/margins": 0.7146258354187012, "rewards/rejected": -0.8781461715698242, "step": 2783 }, { "epoch": 0.4305432051034216, "grad_norm": 5.727351188659668, "learning_rate": 4.758277007675565e-06, "logits/chosen": 9.807077407836914, "logits/rejected": 12.750134468078613, "logps/chosen": -231.79595947265625, "logps/rejected": -226.55422973632812, "loss": 0.6484, "rewards/accuracies": 0.75, "rewards/chosen": -0.06301812827587128, "rewards/margins": 0.1178392767906189, "rewards/rejected": -0.18085739016532898, "step": 2784 }, { "epoch": 0.4306978542431858, "grad_norm": 5.970767974853516, "learning_rate": 4.757990606025891e-06, "logits/chosen": 7.597332954406738, "logits/rejected": 7.153900146484375, "logps/chosen": -261.1533508300781, "logps/rejected": -346.2449645996094, "loss": 0.6197, "rewards/accuracies": 0.875, "rewards/chosen": -0.13725723326206207, "rewards/margins": 0.39906907081604004, "rewards/rejected": -0.5363263487815857, "step": 2785 }, { "epoch": 0.43085250338294995, "grad_norm": 7.419806003570557, "learning_rate": 4.757704204376218e-06, "logits/chosen": 4.775491237640381, "logits/rejected": 3.671041250228882, "logps/chosen": -330.25640869140625, "logps/rejected": -289.3252868652344, "loss": 0.8059, "rewards/accuracies": 0.25, "rewards/chosen": -0.49421679973602295, "rewards/margins": -0.12825129926204681, "rewards/rejected": -0.36596548557281494, "step": 2786 }, { "epoch": 0.4310071525227141, "grad_norm": 5.847108364105225, "learning_rate": 4.757417802726544e-06, "logits/chosen": 9.564638137817383, "logits/rejected": 5.162547588348389, "logps/chosen": -210.74020385742188, "logps/rejected": -176.9446258544922, "loss": 0.7796, "rewards/accuracies": 0.25, "rewards/chosen": -0.5194675922393799, "rewards/margins": -0.12075714766979218, "rewards/rejected": -0.3987103998661041, "step": 2787 }, { "epoch": 0.43116180166247825, "grad_norm": 8.376914978027344, "learning_rate": 4.75713140107687e-06, "logits/chosen": 14.467245101928711, "logits/rejected": 14.03645133972168, "logps/chosen": -341.5711669921875, "logps/rejected": -375.37548828125, "loss": 0.7979, "rewards/accuracies": 0.375, "rewards/chosen": -0.3634641766548157, "rewards/margins": -0.10142546892166138, "rewards/rejected": -0.2620387077331543, "step": 2788 }, { "epoch": 0.4313164508022424, "grad_norm": 5.915359973907471, "learning_rate": 4.756844999427197e-06, "logits/chosen": 10.123217582702637, "logits/rejected": 3.3616385459899902, "logps/chosen": -269.1231689453125, "logps/rejected": -189.5192108154297, "loss": 0.6006, "rewards/accuracies": 0.75, "rewards/chosen": -0.040657080709934235, "rewards/margins": 0.22448387742042542, "rewards/rejected": -0.26514095067977905, "step": 2789 }, { "epoch": 0.43147109994200655, "grad_norm": 6.117700576782227, "learning_rate": 4.756558597777524e-06, "logits/chosen": 13.525590896606445, "logits/rejected": 4.828794956207275, "logps/chosen": -455.7845458984375, "logps/rejected": -342.3318786621094, "loss": 0.4531, "rewards/accuracies": 1.0, "rewards/chosen": 0.12162887305021286, "rewards/margins": 0.7615545988082886, "rewards/rejected": -0.6399257183074951, "step": 2790 }, { "epoch": 0.43162574908177076, "grad_norm": 6.3880181312561035, "learning_rate": 4.75627219612785e-06, "logits/chosen": 4.344879150390625, "logits/rejected": 8.407329559326172, "logps/chosen": -195.07583618164062, "logps/rejected": -236.76116943359375, "loss": 0.9007, "rewards/accuracies": 0.375, "rewards/chosen": -0.5900249481201172, "rewards/margins": -0.2911919057369232, "rewards/rejected": -0.29883310198783875, "step": 2791 }, { "epoch": 0.4317803982215349, "grad_norm": 4.570748805999756, "learning_rate": 4.755985794478177e-06, "logits/chosen": 8.43942642211914, "logits/rejected": 7.202610015869141, "logps/chosen": -160.795654296875, "logps/rejected": -120.71747589111328, "loss": 0.6239, "rewards/accuracies": 0.625, "rewards/chosen": -0.26789963245391846, "rewards/margins": 0.19951948523521423, "rewards/rejected": -0.4674190878868103, "step": 2792 }, { "epoch": 0.43193504736129906, "grad_norm": 7.541436195373535, "learning_rate": 4.755699392828504e-06, "logits/chosen": 6.8108367919921875, "logits/rejected": 8.16777229309082, "logps/chosen": -251.1971893310547, "logps/rejected": -302.0908203125, "loss": 0.809, "rewards/accuracies": 0.375, "rewards/chosen": -0.5668714046478271, "rewards/margins": -0.15265551209449768, "rewards/rejected": -0.41421589255332947, "step": 2793 }, { "epoch": 0.4320896965010632, "grad_norm": 5.5171098709106445, "learning_rate": 4.7554129911788294e-06, "logits/chosen": 8.900758743286133, "logits/rejected": 10.663663864135742, "logps/chosen": -213.49937438964844, "logps/rejected": -206.3883819580078, "loss": 0.7854, "rewards/accuracies": 0.375, "rewards/chosen": -0.3913537263870239, "rewards/margins": -0.15273170173168182, "rewards/rejected": -0.23862199485301971, "step": 2794 }, { "epoch": 0.43224434564082737, "grad_norm": 4.662868976593018, "learning_rate": 4.755126589529156e-06, "logits/chosen": 10.188685417175293, "logits/rejected": 6.852789878845215, "logps/chosen": -402.2821044921875, "logps/rejected": -321.64715576171875, "loss": 0.5504, "rewards/accuracies": 0.875, "rewards/chosen": -0.12097405642271042, "rewards/margins": 0.6118749380111694, "rewards/rejected": -0.7328490018844604, "step": 2795 }, { "epoch": 0.4323989947805915, "grad_norm": 6.1262078285217285, "learning_rate": 4.754840187879483e-06, "logits/chosen": 4.809596061706543, "logits/rejected": 6.813551425933838, "logps/chosen": -212.06600952148438, "logps/rejected": -278.2464904785156, "loss": 0.6351, "rewards/accuracies": 0.375, "rewards/chosen": -0.49957358837127686, "rewards/margins": 0.17979955673217773, "rewards/rejected": -0.6793731451034546, "step": 2796 }, { "epoch": 0.43255364392035567, "grad_norm": 7.292859077453613, "learning_rate": 4.754553786229809e-06, "logits/chosen": 11.588642120361328, "logits/rejected": 8.493711471557617, "logps/chosen": -362.13037109375, "logps/rejected": -222.6273651123047, "loss": 0.6902, "rewards/accuracies": 0.5, "rewards/chosen": -0.5309098958969116, "rewards/margins": 0.03510220721364021, "rewards/rejected": -0.5660121440887451, "step": 2797 }, { "epoch": 0.4327082930601199, "grad_norm": 4.58829927444458, "learning_rate": 4.754267384580136e-06, "logits/chosen": 10.057673454284668, "logits/rejected": 13.428799629211426, "logps/chosen": -166.45016479492188, "logps/rejected": -250.3113250732422, "loss": 0.5886, "rewards/accuracies": 0.75, "rewards/chosen": -0.44345489144325256, "rewards/margins": 0.23104631900787354, "rewards/rejected": -0.6745012402534485, "step": 2798 }, { "epoch": 0.432862942199884, "grad_norm": 4.8461127281188965, "learning_rate": 4.753980982930463e-06, "logits/chosen": 9.331192016601562, "logits/rejected": -5.255367755889893, "logps/chosen": -269.07080078125, "logps/rejected": -113.04387664794922, "loss": 0.6732, "rewards/accuracies": 0.625, "rewards/chosen": 0.07054150849580765, "rewards/margins": 0.24580320715904236, "rewards/rejected": -0.1752616912126541, "step": 2799 }, { "epoch": 0.4330175913396482, "grad_norm": 5.574551105499268, "learning_rate": 4.7536945812807885e-06, "logits/chosen": 10.010777473449707, "logits/rejected": 6.791634559631348, "logps/chosen": -336.68402099609375, "logps/rejected": -296.0834655761719, "loss": 0.4608, "rewards/accuracies": 0.75, "rewards/chosen": 0.34524527192115784, "rewards/margins": 0.6705272793769836, "rewards/rejected": -0.3252819776535034, "step": 2800 }, { "epoch": 0.43317224047941233, "grad_norm": 6.900188446044922, "learning_rate": 4.753408179631115e-06, "logits/chosen": 5.616233825683594, "logits/rejected": 5.674280643463135, "logps/chosen": -214.99447631835938, "logps/rejected": -224.55767822265625, "loss": 0.685, "rewards/accuracies": 0.5, "rewards/chosen": -0.06657677888870239, "rewards/margins": 0.04779187589883804, "rewards/rejected": -0.11436866223812103, "step": 2801 }, { "epoch": 0.4333268896191765, "grad_norm": 5.857180118560791, "learning_rate": 4.753121777981442e-06, "logits/chosen": 8.992115020751953, "logits/rejected": 8.140558242797852, "logps/chosen": -225.6531982421875, "logps/rejected": -241.54335021972656, "loss": 0.5998, "rewards/accuracies": 0.625, "rewards/chosen": -0.18340301513671875, "rewards/margins": 0.3001747131347656, "rewards/rejected": -0.48357775807380676, "step": 2802 }, { "epoch": 0.43348153875894063, "grad_norm": 13.215141296386719, "learning_rate": 4.7528353763317685e-06, "logits/chosen": 7.431465148925781, "logits/rejected": 2.723891496658325, "logps/chosen": -303.5306701660156, "logps/rejected": -220.5635986328125, "loss": 0.8223, "rewards/accuracies": 0.75, "rewards/chosen": 0.08407649397850037, "rewards/margins": 0.09192532300949097, "rewards/rejected": -0.007848832756280899, "step": 2803 }, { "epoch": 0.43363618789870484, "grad_norm": 4.5164103507995605, "learning_rate": 4.752548974682094e-06, "logits/chosen": 8.535188674926758, "logits/rejected": 7.057811737060547, "logps/chosen": -312.494873046875, "logps/rejected": -323.5174560546875, "loss": 0.4802, "rewards/accuracies": 0.75, "rewards/chosen": 0.00788029283285141, "rewards/margins": 0.5675268173217773, "rewards/rejected": -0.559646487236023, "step": 2804 }, { "epoch": 0.433790837038469, "grad_norm": 5.599987983703613, "learning_rate": 4.752262573032421e-06, "logits/chosen": 12.378469467163086, "logits/rejected": 3.429933547973633, "logps/chosen": -301.00347900390625, "logps/rejected": -180.8714141845703, "loss": 0.73, "rewards/accuracies": 0.5, "rewards/chosen": -0.37512171268463135, "rewards/margins": 0.00924713909626007, "rewards/rejected": -0.3843688368797302, "step": 2805 }, { "epoch": 0.43394548617823314, "grad_norm": 6.036183834075928, "learning_rate": 4.7519761713827476e-06, "logits/chosen": 11.895010948181152, "logits/rejected": 7.091794967651367, "logps/chosen": -292.4738464355469, "logps/rejected": -212.21270751953125, "loss": 0.6228, "rewards/accuracies": 0.875, "rewards/chosen": -0.36054885387420654, "rewards/margins": 0.26990586519241333, "rewards/rejected": -0.6304547786712646, "step": 2806 }, { "epoch": 0.4341001353179973, "grad_norm": 6.840506553649902, "learning_rate": 4.751689769733074e-06, "logits/chosen": 4.852654457092285, "logits/rejected": 2.638561248779297, "logps/chosen": -264.4471740722656, "logps/rejected": -238.7056121826172, "loss": 0.8123, "rewards/accuracies": 0.375, "rewards/chosen": -0.12147527933120728, "rewards/margins": -0.16497759521007538, "rewards/rejected": 0.04350230097770691, "step": 2807 }, { "epoch": 0.43425478445776144, "grad_norm": 7.319533824920654, "learning_rate": 4.7514033680834e-06, "logits/chosen": 7.211735248565674, "logits/rejected": 1.5934264659881592, "logps/chosen": -304.82257080078125, "logps/rejected": -213.3721466064453, "loss": 0.808, "rewards/accuracies": 0.5, "rewards/chosen": -0.3614337742328644, "rewards/margins": -0.13401289284229279, "rewards/rejected": -0.2274208962917328, "step": 2808 }, { "epoch": 0.4344094335975256, "grad_norm": 7.605015754699707, "learning_rate": 4.751116966433727e-06, "logits/chosen": 5.696356296539307, "logits/rejected": 3.17057728767395, "logps/chosen": -355.5419006347656, "logps/rejected": -240.73291015625, "loss": 0.8108, "rewards/accuracies": 0.625, "rewards/chosen": -0.20709803700447083, "rewards/margins": 0.03326638042926788, "rewards/rejected": -0.2403644174337387, "step": 2809 }, { "epoch": 0.43456408273728975, "grad_norm": 4.371910095214844, "learning_rate": 4.750830564784053e-06, "logits/chosen": 14.828933715820312, "logits/rejected": 11.163362503051758, "logps/chosen": -291.53472900390625, "logps/rejected": -258.2392578125, "loss": 0.4902, "rewards/accuracies": 0.625, "rewards/chosen": 0.13775327801704407, "rewards/margins": 0.5750832557678223, "rewards/rejected": -0.4373300075531006, "step": 2810 }, { "epoch": 0.43471873187705395, "grad_norm": 6.886568069458008, "learning_rate": 4.75054416313438e-06, "logits/chosen": 9.987464904785156, "logits/rejected": 6.372430801391602, "logps/chosen": -282.7420349121094, "logps/rejected": -230.234375, "loss": 0.8763, "rewards/accuracies": 0.5, "rewards/chosen": -0.39969274401664734, "rewards/margins": -0.12942704558372498, "rewards/rejected": -0.27026569843292236, "step": 2811 }, { "epoch": 0.4348733810168181, "grad_norm": 6.040317058563232, "learning_rate": 4.750257761484707e-06, "logits/chosen": 14.01717758178711, "logits/rejected": 8.728301048278809, "logps/chosen": -246.2141571044922, "logps/rejected": -220.49880981445312, "loss": 0.707, "rewards/accuracies": 0.625, "rewards/chosen": -0.25272905826568604, "rewards/margins": 0.14834925532341003, "rewards/rejected": -0.40107834339141846, "step": 2812 }, { "epoch": 0.43502803015658226, "grad_norm": 6.753664970397949, "learning_rate": 4.749971359835032e-06, "logits/chosen": 15.51312255859375, "logits/rejected": 10.871882438659668, "logps/chosen": -388.32086181640625, "logps/rejected": -313.42144775390625, "loss": 0.5732, "rewards/accuracies": 0.625, "rewards/chosen": 0.021067030727863312, "rewards/margins": 0.5964447259902954, "rewards/rejected": -0.5753776431083679, "step": 2813 }, { "epoch": 0.4351826792963464, "grad_norm": 6.621103286743164, "learning_rate": 4.749684958185359e-06, "logits/chosen": 16.63807487487793, "logits/rejected": 3.980222463607788, "logps/chosen": -451.0174560546875, "logps/rejected": -295.88629150390625, "loss": 0.6123, "rewards/accuracies": 0.75, "rewards/chosen": -0.025811776518821716, "rewards/margins": 0.34195002913475037, "rewards/rejected": -0.3677617907524109, "step": 2814 }, { "epoch": 0.43533732843611056, "grad_norm": 5.459160327911377, "learning_rate": 4.749398556535686e-06, "logits/chosen": 11.013882637023926, "logits/rejected": 8.598686218261719, "logps/chosen": -280.7895812988281, "logps/rejected": -251.208251953125, "loss": 0.6891, "rewards/accuracies": 0.5, "rewards/chosen": -0.5252373218536377, "rewards/margins": 0.2069089114665985, "rewards/rejected": -0.7321462631225586, "step": 2815 }, { "epoch": 0.4354919775758747, "grad_norm": 22.75695037841797, "learning_rate": 4.749112154886012e-06, "logits/chosen": 8.6644926071167, "logits/rejected": 7.899404525756836, "logps/chosen": -186.85472106933594, "logps/rejected": -175.57276916503906, "loss": 0.6322, "rewards/accuracies": 0.5, "rewards/chosen": -0.21507593989372253, "rewards/margins": 0.36467692255973816, "rewards/rejected": -0.5797528028488159, "step": 2816 }, { "epoch": 0.4356466267156389, "grad_norm": 5.813061237335205, "learning_rate": 4.748825753236339e-06, "logits/chosen": 11.378844261169434, "logits/rejected": 5.882504463195801, "logps/chosen": -236.85659790039062, "logps/rejected": -165.74310302734375, "loss": 0.6468, "rewards/accuracies": 0.75, "rewards/chosen": -0.14460229873657227, "rewards/margins": 0.2838023006916046, "rewards/rejected": -0.4284045696258545, "step": 2817 }, { "epoch": 0.43580127585540307, "grad_norm": 5.486540794372559, "learning_rate": 4.748539351586666e-06, "logits/chosen": 12.64401626586914, "logits/rejected": 8.501300811767578, "logps/chosen": -360.9309387207031, "logps/rejected": -238.2698974609375, "loss": 0.6023, "rewards/accuracies": 0.75, "rewards/chosen": -0.018212974071502686, "rewards/margins": 0.44703519344329834, "rewards/rejected": -0.465248167514801, "step": 2818 }, { "epoch": 0.4359559249951672, "grad_norm": 4.105498790740967, "learning_rate": 4.748252949936992e-06, "logits/chosen": 8.176006317138672, "logits/rejected": 3.262908458709717, "logps/chosen": -326.33197021484375, "logps/rejected": -244.0655059814453, "loss": 0.5462, "rewards/accuracies": 0.625, "rewards/chosen": 0.08322440087795258, "rewards/margins": 0.5353583097457886, "rewards/rejected": -0.4521338939666748, "step": 2819 }, { "epoch": 0.4361105741349314, "grad_norm": 5.889348030090332, "learning_rate": 4.747966548287318e-06, "logits/chosen": 6.981510639190674, "logits/rejected": 2.598665475845337, "logps/chosen": -312.58270263671875, "logps/rejected": -209.3723907470703, "loss": 0.5675, "rewards/accuracies": 0.75, "rewards/chosen": -0.08875159919261932, "rewards/margins": 0.5042783617973328, "rewards/rejected": -0.5930299758911133, "step": 2820 }, { "epoch": 0.4362652232746955, "grad_norm": 7.233922481536865, "learning_rate": 4.747680146637645e-06, "logits/chosen": 13.439886093139648, "logits/rejected": 8.223491668701172, "logps/chosen": -302.23590087890625, "logps/rejected": -223.22503662109375, "loss": 0.6514, "rewards/accuracies": 0.625, "rewards/chosen": -0.22159802913665771, "rewards/margins": 0.38722625374794006, "rewards/rejected": -0.6088243126869202, "step": 2821 }, { "epoch": 0.4364198724144597, "grad_norm": 5.701486110687256, "learning_rate": 4.7473937449879714e-06, "logits/chosen": 7.395998954772949, "logits/rejected": 5.183539390563965, "logps/chosen": -242.25369262695312, "logps/rejected": -202.43270874023438, "loss": 0.729, "rewards/accuracies": 0.5, "rewards/chosen": -0.2294432818889618, "rewards/margins": -0.01485791802406311, "rewards/rejected": -0.21458537876605988, "step": 2822 }, { "epoch": 0.4365745215542239, "grad_norm": 6.663989543914795, "learning_rate": 4.747107343338298e-06, "logits/chosen": 14.649640083312988, "logits/rejected": 13.448821067810059, "logps/chosen": -389.4757080078125, "logps/rejected": -348.6912841796875, "loss": 0.8359, "rewards/accuracies": 0.75, "rewards/chosen": -0.027445029467344284, "rewards/margins": -0.13992060720920563, "rewards/rejected": 0.11247558891773224, "step": 2823 }, { "epoch": 0.43672917069398803, "grad_norm": 4.454593658447266, "learning_rate": 4.746820941688625e-06, "logits/chosen": 12.053659439086914, "logits/rejected": 12.979633331298828, "logps/chosen": -189.28482055664062, "logps/rejected": -211.57125854492188, "loss": 0.6615, "rewards/accuracies": 0.625, "rewards/chosen": 0.048810768872499466, "rewards/margins": 0.28202423453330994, "rewards/rejected": -0.23321343958377838, "step": 2824 }, { "epoch": 0.4368838198337522, "grad_norm": 4.350654602050781, "learning_rate": 4.746534540038951e-06, "logits/chosen": 11.314224243164062, "logits/rejected": 7.191380500793457, "logps/chosen": -184.46389770507812, "logps/rejected": -124.232177734375, "loss": 0.6688, "rewards/accuracies": 0.75, "rewards/chosen": -0.13321954011917114, "rewards/margins": 0.22559450566768646, "rewards/rejected": -0.3588140308856964, "step": 2825 }, { "epoch": 0.43703846897351634, "grad_norm": 4.4734649658203125, "learning_rate": 4.746248138389278e-06, "logits/chosen": 13.605212211608887, "logits/rejected": 10.309515953063965, "logps/chosen": -289.1519775390625, "logps/rejected": -278.0863952636719, "loss": 0.632, "rewards/accuracies": 0.625, "rewards/chosen": 0.18803460896015167, "rewards/margins": 0.2650759816169739, "rewards/rejected": -0.07704134285449982, "step": 2826 }, { "epoch": 0.4371931181132805, "grad_norm": 5.424923896789551, "learning_rate": 4.745961736739604e-06, "logits/chosen": 9.14299201965332, "logits/rejected": 2.465217113494873, "logps/chosen": -312.8207702636719, "logps/rejected": -308.5585632324219, "loss": 0.5008, "rewards/accuracies": 0.625, "rewards/chosen": 0.1549084633588791, "rewards/margins": 0.6946346759796143, "rewards/rejected": -0.5397261381149292, "step": 2827 }, { "epoch": 0.43734776725304464, "grad_norm": 4.931646823883057, "learning_rate": 4.7456753350899305e-06, "logits/chosen": 12.082155227661133, "logits/rejected": 12.245010375976562, "logps/chosen": -328.2345275878906, "logps/rejected": -312.2367858886719, "loss": 0.6869, "rewards/accuracies": 0.5, "rewards/chosen": 0.02204418182373047, "rewards/margins": 0.13654464483261108, "rewards/rejected": -0.11450046300888062, "step": 2828 }, { "epoch": 0.4375024163928088, "grad_norm": 5.402237892150879, "learning_rate": 4.745388933440257e-06, "logits/chosen": 4.226941108703613, "logits/rejected": 7.545684814453125, "logps/chosen": -247.03013610839844, "logps/rejected": -343.6475524902344, "loss": 0.5657, "rewards/accuracies": 0.625, "rewards/chosen": 0.02545851469039917, "rewards/margins": 0.4725891649723053, "rewards/rejected": -0.4471306800842285, "step": 2829 }, { "epoch": 0.437657065532573, "grad_norm": 7.002410888671875, "learning_rate": 4.745102531790584e-06, "logits/chosen": 2.431375741958618, "logits/rejected": 4.3468017578125, "logps/chosen": -320.364990234375, "logps/rejected": -299.33636474609375, "loss": 0.703, "rewards/accuracies": 0.625, "rewards/chosen": -0.1979716718196869, "rewards/margins": 0.05305410921573639, "rewards/rejected": -0.2510257959365845, "step": 2830 }, { "epoch": 0.43781171467233715, "grad_norm": 5.792113304138184, "learning_rate": 4.7448161301409104e-06, "logits/chosen": 3.8318676948547363, "logits/rejected": 5.64227819442749, "logps/chosen": -224.19606018066406, "logps/rejected": -263.5726318359375, "loss": 0.6752, "rewards/accuracies": 0.5, "rewards/chosen": -0.2541836202144623, "rewards/margins": 0.05528821051120758, "rewards/rejected": -0.30947184562683105, "step": 2831 }, { "epoch": 0.4379663638121013, "grad_norm": 6.636241436004639, "learning_rate": 4.744529728491237e-06, "logits/chosen": 10.726471900939941, "logits/rejected": 8.35133171081543, "logps/chosen": -432.3463134765625, "logps/rejected": -384.0025634765625, "loss": 0.6922, "rewards/accuracies": 0.625, "rewards/chosen": 0.3270619511604309, "rewards/margins": 0.1283237338066101, "rewards/rejected": 0.1987381875514984, "step": 2832 }, { "epoch": 0.43812101295186545, "grad_norm": 6.241170406341553, "learning_rate": 4.744243326841563e-06, "logits/chosen": 11.469947814941406, "logits/rejected": 8.822287559509277, "logps/chosen": -292.97149658203125, "logps/rejected": -263.7149658203125, "loss": 0.6589, "rewards/accuracies": 0.5, "rewards/chosen": -0.4599659740924835, "rewards/margins": 0.15655073523521423, "rewards/rejected": -0.6165167093276978, "step": 2833 }, { "epoch": 0.4382756620916296, "grad_norm": 6.51314115524292, "learning_rate": 4.7439569251918895e-06, "logits/chosen": 14.622237205505371, "logits/rejected": 9.365476608276367, "logps/chosen": -258.34130859375, "logps/rejected": -245.51507568359375, "loss": 0.6062, "rewards/accuracies": 0.5, "rewards/chosen": 0.07307276129722595, "rewards/margins": 0.4358707070350647, "rewards/rejected": -0.36279794573783875, "step": 2834 }, { "epoch": 0.43843031123139375, "grad_norm": 5.3864593505859375, "learning_rate": 4.743670523542216e-06, "logits/chosen": 13.829833984375, "logits/rejected": 5.317742347717285, "logps/chosen": -223.54281616210938, "logps/rejected": -174.62063598632812, "loss": 0.7263, "rewards/accuracies": 0.375, "rewards/chosen": -0.31119033694267273, "rewards/margins": 0.16185730695724487, "rewards/rejected": -0.4730476140975952, "step": 2835 }, { "epoch": 0.43858496037115796, "grad_norm": 5.920149803161621, "learning_rate": 4.743384121892543e-06, "logits/chosen": 10.097222328186035, "logits/rejected": 7.227667808532715, "logps/chosen": -217.132080078125, "logps/rejected": -186.79025268554688, "loss": 0.7485, "rewards/accuracies": 0.625, "rewards/chosen": -0.20261025428771973, "rewards/margins": -0.0316656157374382, "rewards/rejected": -0.17094466090202332, "step": 2836 }, { "epoch": 0.4387396095109221, "grad_norm": 3.3589744567871094, "learning_rate": 4.7430977202428695e-06, "logits/chosen": 12.82377815246582, "logits/rejected": 7.432671070098877, "logps/chosen": -341.67144775390625, "logps/rejected": -225.3497772216797, "loss": 0.46, "rewards/accuracies": 0.625, "rewards/chosen": 0.05353812873363495, "rewards/margins": 0.6709260940551758, "rewards/rejected": -0.6173880100250244, "step": 2837 }, { "epoch": 0.43889425865068626, "grad_norm": 4.675251007080078, "learning_rate": 4.742811318593195e-06, "logits/chosen": 16.821693420410156, "logits/rejected": 13.110118865966797, "logps/chosen": -333.9576110839844, "logps/rejected": -315.6451110839844, "loss": 0.4815, "rewards/accuracies": 0.625, "rewards/chosen": 0.10564003884792328, "rewards/margins": 0.7636474967002869, "rewards/rejected": -0.6580074429512024, "step": 2838 }, { "epoch": 0.4390489077904504, "grad_norm": 5.882259368896484, "learning_rate": 4.742524916943522e-06, "logits/chosen": 6.317479133605957, "logits/rejected": 10.260385513305664, "logps/chosen": -229.90170288085938, "logps/rejected": -252.86549377441406, "loss": 0.5991, "rewards/accuracies": 0.75, "rewards/chosen": -0.21342070400714874, "rewards/margins": 0.35805028676986694, "rewards/rejected": -0.5714709758758545, "step": 2839 }, { "epoch": 0.43920355693021457, "grad_norm": 6.055997848510742, "learning_rate": 4.742238515293849e-06, "logits/chosen": 16.44618034362793, "logits/rejected": 6.760526657104492, "logps/chosen": -368.89093017578125, "logps/rejected": -266.67120361328125, "loss": 0.5047, "rewards/accuracies": 0.75, "rewards/chosen": -0.1716228574514389, "rewards/margins": 0.5712116956710815, "rewards/rejected": -0.7428345680236816, "step": 2840 }, { "epoch": 0.4393582060699787, "grad_norm": 6.78181791305542, "learning_rate": 4.741952113644175e-06, "logits/chosen": 4.90526008605957, "logits/rejected": 3.522831916809082, "logps/chosen": -288.4462585449219, "logps/rejected": -307.6053466796875, "loss": 0.7357, "rewards/accuracies": 0.5, "rewards/chosen": -0.16314341127872467, "rewards/margins": -0.04299222677946091, "rewards/rejected": -0.12015116959810257, "step": 2841 }, { "epoch": 0.43951285520974287, "grad_norm": 5.552892208099365, "learning_rate": 4.741665711994501e-06, "logits/chosen": 13.207711219787598, "logits/rejected": 7.869141578674316, "logps/chosen": -348.44677734375, "logps/rejected": -238.80465698242188, "loss": 0.624, "rewards/accuracies": 0.625, "rewards/chosen": 0.20664730668067932, "rewards/margins": 0.18964087963104248, "rewards/rejected": 0.017006421461701393, "step": 2842 }, { "epoch": 0.4396675043495071, "grad_norm": 8.40849781036377, "learning_rate": 4.741379310344828e-06, "logits/chosen": 6.449000358581543, "logits/rejected": 13.801970481872559, "logps/chosen": -252.5522003173828, "logps/rejected": -338.7855224609375, "loss": 0.609, "rewards/accuracies": 0.75, "rewards/chosen": -0.24829742312431335, "rewards/margins": 0.2070353627204895, "rewards/rejected": -0.45533275604248047, "step": 2843 }, { "epoch": 0.43982215348927123, "grad_norm": 5.977924823760986, "learning_rate": 4.741092908695154e-06, "logits/chosen": 10.545989990234375, "logits/rejected": 9.57856273651123, "logps/chosen": -256.90362548828125, "logps/rejected": -298.98876953125, "loss": 0.6774, "rewards/accuracies": 0.75, "rewards/chosen": 0.09113011509180069, "rewards/margins": 0.3714331388473511, "rewards/rejected": -0.2803030014038086, "step": 2844 }, { "epoch": 0.4399768026290354, "grad_norm": 4.474948406219482, "learning_rate": 4.740806507045481e-06, "logits/chosen": 6.020287990570068, "logits/rejected": 6.616371154785156, "logps/chosen": -229.8592529296875, "logps/rejected": -273.079833984375, "loss": 0.5296, "rewards/accuracies": 0.75, "rewards/chosen": -0.09826397150754929, "rewards/margins": 0.45664700865745544, "rewards/rejected": -0.5549110174179077, "step": 2845 }, { "epoch": 0.44013145176879953, "grad_norm": 5.890885829925537, "learning_rate": 4.740520105395807e-06, "logits/chosen": 13.939849853515625, "logits/rejected": 10.829734802246094, "logps/chosen": -317.510009765625, "logps/rejected": -270.29644775390625, "loss": 0.7285, "rewards/accuracies": 0.625, "rewards/chosen": 0.20510119199752808, "rewards/margins": -0.011336132884025574, "rewards/rejected": 0.21643735468387604, "step": 2846 }, { "epoch": 0.4402861009085637, "grad_norm": 5.421611309051514, "learning_rate": 4.7402337037461335e-06, "logits/chosen": 15.615734100341797, "logits/rejected": 10.996831893920898, "logps/chosen": -395.7857971191406, "logps/rejected": -286.41119384765625, "loss": 0.5148, "rewards/accuracies": 0.75, "rewards/chosen": 0.42165595293045044, "rewards/margins": 0.45399296283721924, "rewards/rejected": -0.0323369987308979, "step": 2847 }, { "epoch": 0.44044075004832783, "grad_norm": 4.428069591522217, "learning_rate": 4.73994730209646e-06, "logits/chosen": 11.204302787780762, "logits/rejected": 4.326806545257568, "logps/chosen": -277.58074951171875, "logps/rejected": -194.81951904296875, "loss": 0.5133, "rewards/accuracies": 0.75, "rewards/chosen": 0.38343364000320435, "rewards/margins": 0.5048680901527405, "rewards/rejected": -0.12143450230360031, "step": 2848 }, { "epoch": 0.44059539918809204, "grad_norm": 6.054380416870117, "learning_rate": 4.739660900446787e-06, "logits/chosen": 6.838757514953613, "logits/rejected": 1.1049487590789795, "logps/chosen": -302.639404296875, "logps/rejected": -203.12832641601562, "loss": 0.6411, "rewards/accuracies": 0.625, "rewards/chosen": 0.3010578751564026, "rewards/margins": 0.2759125828742981, "rewards/rejected": 0.025145292282104492, "step": 2849 }, { "epoch": 0.4407500483278562, "grad_norm": 4.552211761474609, "learning_rate": 4.739374498797113e-06, "logits/chosen": 5.224865436553955, "logits/rejected": -1.8634520769119263, "logps/chosen": -224.8545379638672, "logps/rejected": -157.0994110107422, "loss": 0.6149, "rewards/accuracies": 0.75, "rewards/chosen": 0.10895290225744247, "rewards/margins": 0.39938727021217346, "rewards/rejected": -0.2904343605041504, "step": 2850 }, { "epoch": 0.44090469746762034, "grad_norm": 5.609575271606445, "learning_rate": 4.73908809714744e-06, "logits/chosen": 9.490568161010742, "logits/rejected": 7.982059001922607, "logps/chosen": -423.14227294921875, "logps/rejected": -411.0915832519531, "loss": 0.7401, "rewards/accuracies": 0.5, "rewards/chosen": 0.2742403745651245, "rewards/margins": 0.0011831223964691162, "rewards/rejected": 0.2730572521686554, "step": 2851 }, { "epoch": 0.4410593466073845, "grad_norm": 6.922028064727783, "learning_rate": 4.738801695497767e-06, "logits/chosen": 6.390465259552002, "logits/rejected": 12.317313194274902, "logps/chosen": -209.84487915039062, "logps/rejected": -293.3603210449219, "loss": 0.9191, "rewards/accuracies": 0.375, "rewards/chosen": -0.09546690434217453, "rewards/margins": -0.3679962754249573, "rewards/rejected": 0.2725293040275574, "step": 2852 }, { "epoch": 0.44121399574714865, "grad_norm": 5.7474517822265625, "learning_rate": 4.7385152938480925e-06, "logits/chosen": 14.656574249267578, "logits/rejected": 11.81248664855957, "logps/chosen": -393.20941162109375, "logps/rejected": -329.83905029296875, "loss": 0.6291, "rewards/accuracies": 0.625, "rewards/chosen": 0.3799876868724823, "rewards/margins": 0.2613934278488159, "rewards/rejected": 0.11859427392482758, "step": 2853 }, { "epoch": 0.4413686448869128, "grad_norm": 5.243478775024414, "learning_rate": 4.738228892198419e-06, "logits/chosen": 8.810315132141113, "logits/rejected": 5.108021259307861, "logps/chosen": -298.6282958984375, "logps/rejected": -248.97943115234375, "loss": 0.5645, "rewards/accuracies": 0.75, "rewards/chosen": 0.18958550691604614, "rewards/margins": 0.35238659381866455, "rewards/rejected": -0.16280107200145721, "step": 2854 }, { "epoch": 0.44152329402667695, "grad_norm": 4.685748100280762, "learning_rate": 4.737942490548746e-06, "logits/chosen": 6.761685371398926, "logits/rejected": 6.452707767486572, "logps/chosen": -197.73789978027344, "logps/rejected": -193.42196655273438, "loss": 0.713, "rewards/accuracies": 0.5, "rewards/chosen": 0.055004216730594635, "rewards/margins": 0.0031135305762290955, "rewards/rejected": 0.05189068615436554, "step": 2855 }, { "epoch": 0.44167794316644116, "grad_norm": 3.807940721511841, "learning_rate": 4.7376560888990725e-06, "logits/chosen": 13.604524612426758, "logits/rejected": 6.8091630935668945, "logps/chosen": -358.82720947265625, "logps/rejected": -285.5086669921875, "loss": 0.3906, "rewards/accuracies": 1.0, "rewards/chosen": 0.7971546649932861, "rewards/margins": 0.861739993095398, "rewards/rejected": -0.0645853579044342, "step": 2856 }, { "epoch": 0.4418325923062053, "grad_norm": 9.43445110321045, "learning_rate": 4.737369687249399e-06, "logits/chosen": 4.530782222747803, "logits/rejected": 4.439804553985596, "logps/chosen": -345.72705078125, "logps/rejected": -370.6066589355469, "loss": 0.9023, "rewards/accuracies": 0.5, "rewards/chosen": 0.012003429234027863, "rewards/margins": -0.31864115595817566, "rewards/rejected": 0.3306445777416229, "step": 2857 }, { "epoch": 0.44198724144596946, "grad_norm": 5.291871547698975, "learning_rate": 4.737083285599726e-06, "logits/chosen": 6.8617634773254395, "logits/rejected": 6.9650678634643555, "logps/chosen": -291.0069274902344, "logps/rejected": -243.76925659179688, "loss": 0.7181, "rewards/accuracies": 0.625, "rewards/chosen": 0.23095805943012238, "rewards/margins": 0.05102190375328064, "rewards/rejected": 0.17993612587451935, "step": 2858 }, { "epoch": 0.4421418905857336, "grad_norm": 6.2592997550964355, "learning_rate": 4.736796883950052e-06, "logits/chosen": 7.91221809387207, "logits/rejected": 1.1641794443130493, "logps/chosen": -255.61685180664062, "logps/rejected": -223.37796020507812, "loss": 0.6719, "rewards/accuracies": 0.5, "rewards/chosen": -0.12598839402198792, "rewards/margins": 0.2353305071592331, "rewards/rejected": -0.3613188862800598, "step": 2859 }, { "epoch": 0.44229653972549776, "grad_norm": 5.587939262390137, "learning_rate": 4.736510482300378e-06, "logits/chosen": 13.008611679077148, "logits/rejected": 6.4830241203308105, "logps/chosen": -363.98974609375, "logps/rejected": -269.67724609375, "loss": 0.5449, "rewards/accuracies": 0.625, "rewards/chosen": 0.4826095700263977, "rewards/margins": 0.5990228056907654, "rewards/rejected": -0.11641322821378708, "step": 2860 }, { "epoch": 0.4424511888652619, "grad_norm": 6.520329475402832, "learning_rate": 4.736224080650705e-06, "logits/chosen": 6.634753227233887, "logits/rejected": 1.4946156740188599, "logps/chosen": -333.1385192871094, "logps/rejected": -314.5243835449219, "loss": 0.7414, "rewards/accuracies": 0.625, "rewards/chosen": 0.028812743723392487, "rewards/margins": 0.25756770372390747, "rewards/rejected": -0.2287549376487732, "step": 2861 }, { "epoch": 0.4426058380050261, "grad_norm": 4.743101596832275, "learning_rate": 4.7359376790010315e-06, "logits/chosen": 5.956636428833008, "logits/rejected": 4.081417083740234, "logps/chosen": -215.38755798339844, "logps/rejected": -246.11183166503906, "loss": 0.5464, "rewards/accuracies": 0.75, "rewards/chosen": -0.011276345700025558, "rewards/margins": 0.41927570104599, "rewards/rejected": -0.43055206537246704, "step": 2862 }, { "epoch": 0.44276048714479027, "grad_norm": 6.915797710418701, "learning_rate": 4.735651277351358e-06, "logits/chosen": 7.433876037597656, "logits/rejected": 8.607778549194336, "logps/chosen": -335.3842468261719, "logps/rejected": -365.8195495605469, "loss": 0.6955, "rewards/accuracies": 0.375, "rewards/chosen": 0.27969077229499817, "rewards/margins": 0.04275818169116974, "rewards/rejected": 0.23693259060382843, "step": 2863 }, { "epoch": 0.4429151362845544, "grad_norm": 8.339519500732422, "learning_rate": 4.735364875701685e-06, "logits/chosen": 6.564026832580566, "logits/rejected": 4.65277099609375, "logps/chosen": -240.2432861328125, "logps/rejected": -315.1837158203125, "loss": 0.7565, "rewards/accuracies": 0.25, "rewards/chosen": 0.027648955583572388, "rewards/margins": 0.10455577075481415, "rewards/rejected": -0.07690680772066116, "step": 2864 }, { "epoch": 0.4430697854243186, "grad_norm": 7.633686542510986, "learning_rate": 4.7350784740520115e-06, "logits/chosen": 8.107829093933105, "logits/rejected": 5.75361442565918, "logps/chosen": -299.97369384765625, "logps/rejected": -305.9490051269531, "loss": 0.7386, "rewards/accuracies": 0.5, "rewards/chosen": -0.10715429484844208, "rewards/margins": 0.0795750766992569, "rewards/rejected": -0.18672938644886017, "step": 2865 }, { "epoch": 0.4432244345640827, "grad_norm": 4.541005611419678, "learning_rate": 4.734792072402337e-06, "logits/chosen": 6.470778465270996, "logits/rejected": 7.629186630249023, "logps/chosen": -208.55015563964844, "logps/rejected": -222.7590789794922, "loss": 0.6301, "rewards/accuracies": 0.625, "rewards/chosen": 0.01057577133178711, "rewards/margins": 0.20005419850349426, "rewards/rejected": -0.18947842717170715, "step": 2866 }, { "epoch": 0.4433790837038469, "grad_norm": 3.8445582389831543, "learning_rate": 4.734505670752664e-06, "logits/chosen": 13.53183364868164, "logits/rejected": 8.576092720031738, "logps/chosen": -319.5737609863281, "logps/rejected": -236.65890502929688, "loss": 0.4916, "rewards/accuracies": 0.75, "rewards/chosen": 0.44668495655059814, "rewards/margins": 0.614438533782959, "rewards/rejected": -0.16775354743003845, "step": 2867 }, { "epoch": 0.4435337328436111, "grad_norm": 3.8433218002319336, "learning_rate": 4.7342192691029906e-06, "logits/chosen": 11.131497383117676, "logits/rejected": 8.642338752746582, "logps/chosen": -379.417236328125, "logps/rejected": -328.353515625, "loss": 0.5224, "rewards/accuracies": 0.75, "rewards/chosen": 0.39090633392333984, "rewards/margins": 0.48262134194374084, "rewards/rejected": -0.091715008020401, "step": 2868 }, { "epoch": 0.44368838198337524, "grad_norm": 6.492985248565674, "learning_rate": 4.733932867453317e-06, "logits/chosen": 14.70721435546875, "logits/rejected": 10.450970649719238, "logps/chosen": -430.2880554199219, "logps/rejected": -311.1890563964844, "loss": 0.6463, "rewards/accuracies": 0.5, "rewards/chosen": 0.14477311074733734, "rewards/margins": 0.19298803806304932, "rewards/rejected": -0.04821494221687317, "step": 2869 }, { "epoch": 0.4438430311231394, "grad_norm": 6.110063076019287, "learning_rate": 4.733646465803644e-06, "logits/chosen": 7.2248921394348145, "logits/rejected": 12.311100006103516, "logps/chosen": -249.59519958496094, "logps/rejected": -327.64544677734375, "loss": 0.869, "rewards/accuracies": 0.25, "rewards/chosen": -0.14278936386108398, "rewards/margins": -0.2975122928619385, "rewards/rejected": 0.1547229290008545, "step": 2870 }, { "epoch": 0.44399768026290354, "grad_norm": 7.39842414855957, "learning_rate": 4.73336006415397e-06, "logits/chosen": 11.0711669921875, "logits/rejected": 2.909083843231201, "logps/chosen": -305.2994384765625, "logps/rejected": -229.380615234375, "loss": 0.7751, "rewards/accuracies": 0.625, "rewards/chosen": 0.04197189211845398, "rewards/margins": 0.14328841865062714, "rewards/rejected": -0.10131651163101196, "step": 2871 }, { "epoch": 0.4441523294026677, "grad_norm": 4.908721446990967, "learning_rate": 4.733073662504296e-06, "logits/chosen": 8.833524703979492, "logits/rejected": 8.912151336669922, "logps/chosen": -206.72425842285156, "logps/rejected": -327.9241638183594, "loss": 0.5799, "rewards/accuracies": 0.5, "rewards/chosen": 0.1936500072479248, "rewards/margins": 0.2807844877243042, "rewards/rejected": -0.0871344804763794, "step": 2872 }, { "epoch": 0.44430697854243184, "grad_norm": 3.709312915802002, "learning_rate": 4.732787260854623e-06, "logits/chosen": 10.40578556060791, "logits/rejected": 10.66954231262207, "logps/chosen": -296.4912109375, "logps/rejected": -288.18365478515625, "loss": 0.5902, "rewards/accuracies": 0.375, "rewards/chosen": 0.4365907907485962, "rewards/margins": 0.40280190110206604, "rewards/rejected": 0.03378888964653015, "step": 2873 }, { "epoch": 0.444461627682196, "grad_norm": 4.196516036987305, "learning_rate": 4.73250085920495e-06, "logits/chosen": 6.399055004119873, "logits/rejected": 8.222249984741211, "logps/chosen": -161.1379852294922, "logps/rejected": -190.58763122558594, "loss": 0.6653, "rewards/accuracies": 0.5, "rewards/chosen": 0.29221320152282715, "rewards/margins": 0.09238973259925842, "rewards/rejected": 0.19982348382472992, "step": 2874 }, { "epoch": 0.4446162768219602, "grad_norm": 3.84584379196167, "learning_rate": 4.732214457555276e-06, "logits/chosen": 9.932168960571289, "logits/rejected": 8.567069053649902, "logps/chosen": -199.1146697998047, "logps/rejected": -188.4136962890625, "loss": 0.6714, "rewards/accuracies": 0.625, "rewards/chosen": -0.05072886496782303, "rewards/margins": 0.12173128873109818, "rewards/rejected": -0.1724601686000824, "step": 2875 }, { "epoch": 0.44477092596172435, "grad_norm": 7.2381591796875, "learning_rate": 4.731928055905602e-06, "logits/chosen": 4.177708148956299, "logits/rejected": 0.7572007179260254, "logps/chosen": -238.58975219726562, "logps/rejected": -212.23751831054688, "loss": 0.4967, "rewards/accuracies": 0.75, "rewards/chosen": 0.26416513323783875, "rewards/margins": 0.6271116733551025, "rewards/rejected": -0.3629465401172638, "step": 2876 }, { "epoch": 0.4449255751014885, "grad_norm": 4.084290027618408, "learning_rate": 4.731641654255929e-06, "logits/chosen": 11.849916458129883, "logits/rejected": 9.19716739654541, "logps/chosen": -207.31561279296875, "logps/rejected": -205.89488220214844, "loss": 0.5158, "rewards/accuracies": 0.875, "rewards/chosen": 0.21161805093288422, "rewards/margins": 0.5698764324188232, "rewards/rejected": -0.35825836658477783, "step": 2877 }, { "epoch": 0.44508022424125265, "grad_norm": 4.484077453613281, "learning_rate": 4.731355252606255e-06, "logits/chosen": 11.6097993850708, "logits/rejected": 5.1055097579956055, "logps/chosen": -326.378173828125, "logps/rejected": -210.7898712158203, "loss": 0.4662, "rewards/accuracies": 1.0, "rewards/chosen": 0.4242036044597626, "rewards/margins": 0.5861848592758179, "rewards/rejected": -0.16198131442070007, "step": 2878 }, { "epoch": 0.4452348733810168, "grad_norm": 6.345067977905273, "learning_rate": 4.731068850956582e-06, "logits/chosen": 14.856422424316406, "logits/rejected": 8.679730415344238, "logps/chosen": -344.90924072265625, "logps/rejected": -295.05096435546875, "loss": 0.7574, "rewards/accuracies": 0.25, "rewards/chosen": -0.2681482434272766, "rewards/margins": -0.04513958841562271, "rewards/rejected": -0.2230086326599121, "step": 2879 }, { "epoch": 0.44538952252078096, "grad_norm": 5.224836826324463, "learning_rate": 4.730782449306908e-06, "logits/chosen": 12.31800651550293, "logits/rejected": 5.714697360992432, "logps/chosen": -284.20989990234375, "logps/rejected": -223.97613525390625, "loss": 0.5641, "rewards/accuracies": 0.875, "rewards/chosen": 0.1688869595527649, "rewards/margins": 0.37766990065574646, "rewards/rejected": -0.20878297090530396, "step": 2880 }, { "epoch": 0.44554417166054516, "grad_norm": 8.624768257141113, "learning_rate": 4.7304960476572345e-06, "logits/chosen": 11.577617645263672, "logits/rejected": 7.260366439819336, "logps/chosen": -387.6743469238281, "logps/rejected": -306.60614013671875, "loss": 0.8103, "rewards/accuracies": 0.625, "rewards/chosen": -0.02204202115535736, "rewards/margins": -0.09120999276638031, "rewards/rejected": 0.06916798651218414, "step": 2881 }, { "epoch": 0.4456988208003093, "grad_norm": 4.786019802093506, "learning_rate": 4.730209646007561e-06, "logits/chosen": 8.661097526550293, "logits/rejected": 3.394134521484375, "logps/chosen": -293.8501281738281, "logps/rejected": -209.88909912109375, "loss": 0.6564, "rewards/accuracies": 0.625, "rewards/chosen": 0.21976405382156372, "rewards/margins": 0.349984347820282, "rewards/rejected": -0.13022029399871826, "step": 2882 }, { "epoch": 0.44585346994007347, "grad_norm": 5.908203601837158, "learning_rate": 4.729923244357888e-06, "logits/chosen": 11.351553916931152, "logits/rejected": 9.534684181213379, "logps/chosen": -214.1370086669922, "logps/rejected": -250.79733276367188, "loss": 0.7505, "rewards/accuracies": 0.375, "rewards/chosen": -0.20431514084339142, "rewards/margins": -0.060582488775253296, "rewards/rejected": -0.14373263716697693, "step": 2883 }, { "epoch": 0.4460081190798376, "grad_norm": 4.996636390686035, "learning_rate": 4.7296368427082144e-06, "logits/chosen": 12.525541305541992, "logits/rejected": 10.830989837646484, "logps/chosen": -268.62042236328125, "logps/rejected": -227.0744171142578, "loss": 0.6361, "rewards/accuracies": 0.75, "rewards/chosen": 0.4013764560222626, "rewards/margins": 0.19802626967430115, "rewards/rejected": 0.20335017144680023, "step": 2884 }, { "epoch": 0.44616276821960177, "grad_norm": 4.845539569854736, "learning_rate": 4.729350441058541e-06, "logits/chosen": 8.097562789916992, "logits/rejected": 4.926815032958984, "logps/chosen": -145.10507202148438, "logps/rejected": -147.08103942871094, "loss": 0.7837, "rewards/accuracies": 0.375, "rewards/chosen": -0.06862720847129822, "rewards/margins": -0.09219758212566376, "rewards/rejected": 0.023570358753204346, "step": 2885 }, { "epoch": 0.4463174173593659, "grad_norm": 3.833444833755493, "learning_rate": 4.729064039408867e-06, "logits/chosen": 11.570324897766113, "logits/rejected": 3.198279857635498, "logps/chosen": -240.45358276367188, "logps/rejected": -182.78855895996094, "loss": 0.5184, "rewards/accuracies": 0.75, "rewards/chosen": 0.4192468225955963, "rewards/margins": 0.4428374469280243, "rewards/rejected": -0.023590609431266785, "step": 2886 }, { "epoch": 0.44647206649913007, "grad_norm": 6.880417346954346, "learning_rate": 4.7287776377591935e-06, "logits/chosen": 14.952377319335938, "logits/rejected": 6.515292167663574, "logps/chosen": -471.180419921875, "logps/rejected": -306.2154235839844, "loss": 0.6907, "rewards/accuracies": 0.375, "rewards/chosen": -0.017264170572161674, "rewards/margins": 0.1398581862449646, "rewards/rejected": -0.15712234377861023, "step": 2887 }, { "epoch": 0.4466267156388943, "grad_norm": 6.328181743621826, "learning_rate": 4.72849123610952e-06, "logits/chosen": 5.250241756439209, "logits/rejected": 8.796283721923828, "logps/chosen": -282.8057556152344, "logps/rejected": -321.9102478027344, "loss": 0.7552, "rewards/accuracies": 0.375, "rewards/chosen": 0.23810362815856934, "rewards/margins": -0.07757376879453659, "rewards/rejected": 0.3156774044036865, "step": 2888 }, { "epoch": 0.44678136477865843, "grad_norm": 5.26899528503418, "learning_rate": 4.728204834459847e-06, "logits/chosen": 8.859580993652344, "logits/rejected": 12.903532981872559, "logps/chosen": -283.0203857421875, "logps/rejected": -275.4232177734375, "loss": 0.6869, "rewards/accuracies": 0.625, "rewards/chosen": 0.17853814363479614, "rewards/margins": 0.049526508897542953, "rewards/rejected": 0.1290116310119629, "step": 2889 }, { "epoch": 0.4469360139184226, "grad_norm": 7.279280662536621, "learning_rate": 4.7279184328101735e-06, "logits/chosen": 7.685791969299316, "logits/rejected": 10.467572212219238, "logps/chosen": -234.00697326660156, "logps/rejected": -311.71002197265625, "loss": 0.8521, "rewards/accuracies": 0.5, "rewards/chosen": 0.07433100044727325, "rewards/margins": -0.19696417450904846, "rewards/rejected": 0.2712951898574829, "step": 2890 }, { "epoch": 0.44709066305818673, "grad_norm": 6.728243827819824, "learning_rate": 4.7276320311605e-06, "logits/chosen": 7.963534832000732, "logits/rejected": 6.767995357513428, "logps/chosen": -333.11199951171875, "logps/rejected": -330.29608154296875, "loss": 0.8359, "rewards/accuracies": 0.375, "rewards/chosen": 0.24529170989990234, "rewards/margins": -0.007717058062553406, "rewards/rejected": 0.25300878286361694, "step": 2891 }, { "epoch": 0.4472453121979509, "grad_norm": 5.864964962005615, "learning_rate": 4.727345629510826e-06, "logits/chosen": 4.441109657287598, "logits/rejected": 6.527352809906006, "logps/chosen": -211.81524658203125, "logps/rejected": -303.90679931640625, "loss": 0.7232, "rewards/accuracies": 0.5, "rewards/chosen": 0.17581599950790405, "rewards/margins": -0.0022001415491104126, "rewards/rejected": 0.17801614105701447, "step": 2892 }, { "epoch": 0.44739996133771504, "grad_norm": 5.700013637542725, "learning_rate": 4.727059227861153e-06, "logits/chosen": 14.111230850219727, "logits/rejected": 4.846957206726074, "logps/chosen": -390.2244873046875, "logps/rejected": -273.7905578613281, "loss": 0.4364, "rewards/accuracies": 0.75, "rewards/chosen": 0.48367539048194885, "rewards/margins": 0.8490333557128906, "rewards/rejected": -0.36535799503326416, "step": 2893 }, { "epoch": 0.44755461047747924, "grad_norm": 8.39551830291748, "learning_rate": 4.726772826211479e-06, "logits/chosen": 6.948728561401367, "logits/rejected": 5.054862976074219, "logps/chosen": -444.1217346191406, "logps/rejected": -310.09161376953125, "loss": 0.8162, "rewards/accuracies": 0.375, "rewards/chosen": 0.14990711212158203, "rewards/margins": 0.020329445600509644, "rewards/rejected": 0.12957763671875, "step": 2894 }, { "epoch": 0.4477092596172434, "grad_norm": 6.756582736968994, "learning_rate": 4.726486424561806e-06, "logits/chosen": 8.98362922668457, "logits/rejected": 10.83120059967041, "logps/chosen": -225.34942626953125, "logps/rejected": -311.18133544921875, "loss": 0.7712, "rewards/accuracies": 0.5, "rewards/chosen": 0.08209677040576935, "rewards/margins": -0.11063957214355469, "rewards/rejected": 0.19273634254932404, "step": 2895 }, { "epoch": 0.44786390875700754, "grad_norm": 6.001872539520264, "learning_rate": 4.7262000229121325e-06, "logits/chosen": 8.821746826171875, "logits/rejected": 10.922311782836914, "logps/chosen": -282.3558349609375, "logps/rejected": -290.3909912109375, "loss": 0.6489, "rewards/accuracies": 0.5, "rewards/chosen": 0.11806097626686096, "rewards/margins": 0.3032827079296112, "rewards/rejected": -0.18522171676158905, "step": 2896 }, { "epoch": 0.4480185578967717, "grad_norm": 4.782827854156494, "learning_rate": 4.725913621262459e-06, "logits/chosen": 6.726935386657715, "logits/rejected": 5.239489555358887, "logps/chosen": -283.99639892578125, "logps/rejected": -273.02099609375, "loss": 0.5221, "rewards/accuracies": 0.75, "rewards/chosen": 0.19964337348937988, "rewards/margins": 0.4158599078655243, "rewards/rejected": -0.2162165492773056, "step": 2897 }, { "epoch": 0.44817320703653585, "grad_norm": 5.477623462677002, "learning_rate": 4.725627219612786e-06, "logits/chosen": 14.605005264282227, "logits/rejected": 12.554754257202148, "logps/chosen": -345.5086669921875, "logps/rejected": -252.9807586669922, "loss": 0.639, "rewards/accuracies": 0.5, "rewards/chosen": 0.48494040966033936, "rewards/margins": 0.18280187249183655, "rewards/rejected": 0.3021385669708252, "step": 2898 }, { "epoch": 0.4483278561763, "grad_norm": 3.8706207275390625, "learning_rate": 4.725340817963112e-06, "logits/chosen": 8.385713577270508, "logits/rejected": 9.839127540588379, "logps/chosen": -174.33877563476562, "logps/rejected": -173.85263061523438, "loss": 0.4903, "rewards/accuracies": 0.75, "rewards/chosen": 0.2907941937446594, "rewards/margins": 0.5425559282302856, "rewards/rejected": -0.2517617344856262, "step": 2899 }, { "epoch": 0.4484825053160642, "grad_norm": 3.753722667694092, "learning_rate": 4.725054416313438e-06, "logits/chosen": 12.366252899169922, "logits/rejected": 4.724295616149902, "logps/chosen": -434.60870361328125, "logps/rejected": -274.9190368652344, "loss": 0.3718, "rewards/accuracies": 1.0, "rewards/chosen": 0.2013658583164215, "rewards/margins": 1.0279592275619507, "rewards/rejected": -0.8265933394432068, "step": 2900 }, { "epoch": 0.44863715445582836, "grad_norm": 6.165163516998291, "learning_rate": 4.724768014663765e-06, "logits/chosen": 8.658722877502441, "logits/rejected": 7.678178787231445, "logps/chosen": -170.8721160888672, "logps/rejected": -172.74989318847656, "loss": 0.9867, "rewards/accuracies": 0.0, "rewards/chosen": -0.27297621965408325, "rewards/margins": -0.4973093867301941, "rewards/rejected": 0.22433319687843323, "step": 2901 }, { "epoch": 0.4487918035955925, "grad_norm": 6.165961742401123, "learning_rate": 4.724481613014092e-06, "logits/chosen": 8.833492279052734, "logits/rejected": 2.277181625366211, "logps/chosen": -241.00518798828125, "logps/rejected": -152.55735778808594, "loss": 0.7116, "rewards/accuracies": 0.625, "rewards/chosen": -0.021444957703351974, "rewards/margins": -0.00022698938846588135, "rewards/rejected": -0.021217960864305496, "step": 2902 }, { "epoch": 0.44894645273535666, "grad_norm": 9.235562324523926, "learning_rate": 4.724195211364418e-06, "logits/chosen": 12.745402336120605, "logits/rejected": 5.982780456542969, "logps/chosen": -386.70623779296875, "logps/rejected": -264.0520324707031, "loss": 0.654, "rewards/accuracies": 0.75, "rewards/chosen": 0.6716396808624268, "rewards/margins": 0.4295971393585205, "rewards/rejected": 0.24204254150390625, "step": 2903 }, { "epoch": 0.4491011018751208, "grad_norm": 6.493156909942627, "learning_rate": 4.723908809714745e-06, "logits/chosen": 12.71495246887207, "logits/rejected": 10.095673561096191, "logps/chosen": -310.887939453125, "logps/rejected": -335.0145263671875, "loss": 0.6425, "rewards/accuracies": 0.625, "rewards/chosen": 0.12912635505199432, "rewards/margins": 0.1881587952375412, "rewards/rejected": -0.05903243273496628, "step": 2904 }, { "epoch": 0.44925575101488496, "grad_norm": 8.074202537536621, "learning_rate": 4.723622408065071e-06, "logits/chosen": 6.4809064865112305, "logits/rejected": 6.001316547393799, "logps/chosen": -368.92486572265625, "logps/rejected": -392.9451599121094, "loss": 0.8898, "rewards/accuracies": 0.25, "rewards/chosen": 0.3877643942832947, "rewards/margins": -0.2901355028152466, "rewards/rejected": 0.6778998970985413, "step": 2905 }, { "epoch": 0.4494104001546491, "grad_norm": 6.152915000915527, "learning_rate": 4.723336006415397e-06, "logits/chosen": 4.247409820556641, "logits/rejected": 11.452526092529297, "logps/chosen": -176.74588012695312, "logps/rejected": -200.24981689453125, "loss": 0.7763, "rewards/accuracies": 0.5, "rewards/chosen": -0.08430720865726471, "rewards/margins": -0.09208157658576965, "rewards/rejected": 0.007774388417601585, "step": 2906 }, { "epoch": 0.4495650492944133, "grad_norm": 3.5732929706573486, "learning_rate": 4.723049604765724e-06, "logits/chosen": 8.244793891906738, "logits/rejected": 7.020036697387695, "logps/chosen": -210.1520538330078, "logps/rejected": -169.88589477539062, "loss": 0.4803, "rewards/accuracies": 0.75, "rewards/chosen": 0.03771691024303436, "rewards/margins": 0.5712445378303528, "rewards/rejected": -0.5335276126861572, "step": 2907 }, { "epoch": 0.4497196984341775, "grad_norm": 5.908124923706055, "learning_rate": 4.722763203116051e-06, "logits/chosen": 6.088689804077148, "logits/rejected": 4.099931240081787, "logps/chosen": -207.8154296875, "logps/rejected": -217.70750427246094, "loss": 0.9709, "rewards/accuracies": 0.125, "rewards/chosen": -0.11870688199996948, "rewards/margins": -0.4534386396408081, "rewards/rejected": 0.3347318172454834, "step": 2908 }, { "epoch": 0.4498743475739416, "grad_norm": 6.127265930175781, "learning_rate": 4.7224768014663765e-06, "logits/chosen": 11.193788528442383, "logits/rejected": 10.419739723205566, "logps/chosen": -279.6499938964844, "logps/rejected": -285.610107421875, "loss": 0.7198, "rewards/accuracies": 0.5, "rewards/chosen": 0.1776106357574463, "rewards/margins": 0.06285585463047028, "rewards/rejected": 0.11475477367639542, "step": 2909 }, { "epoch": 0.4500289967137058, "grad_norm": 7.298559665679932, "learning_rate": 4.722190399816703e-06, "logits/chosen": 6.826419830322266, "logits/rejected": 5.780760288238525, "logps/chosen": -189.8668212890625, "logps/rejected": -177.15286254882812, "loss": 1.1304, "rewards/accuracies": 0.125, "rewards/chosen": -0.3428311049938202, "rewards/margins": -0.6570719480514526, "rewards/rejected": 0.31424081325531006, "step": 2910 }, { "epoch": 0.4501836458534699, "grad_norm": 3.988098621368408, "learning_rate": 4.72190399816703e-06, "logits/chosen": 4.990813255310059, "logits/rejected": 4.173823356628418, "logps/chosen": -241.4462890625, "logps/rejected": -195.440185546875, "loss": 0.5807, "rewards/accuracies": 0.75, "rewards/chosen": -0.08225131034851074, "rewards/margins": 0.3180958330631256, "rewards/rejected": -0.40034714341163635, "step": 2911 }, { "epoch": 0.4503382949932341, "grad_norm": 6.4938130378723145, "learning_rate": 4.721617596517356e-06, "logits/chosen": 12.63523006439209, "logits/rejected": 5.449173927307129, "logps/chosen": -310.046630859375, "logps/rejected": -224.2534942626953, "loss": 0.7482, "rewards/accuracies": 0.625, "rewards/chosen": -0.06133061647415161, "rewards/margins": -0.0026338621973991394, "rewards/rejected": -0.058696746826171875, "step": 2912 }, { "epoch": 0.4504929441329983, "grad_norm": 12.178394317626953, "learning_rate": 4.721331194867683e-06, "logits/chosen": 9.63875961303711, "logits/rejected": 6.1392316818237305, "logps/chosen": -288.7957763671875, "logps/rejected": -261.6700439453125, "loss": 0.7464, "rewards/accuracies": 0.5, "rewards/chosen": 0.07077684253454208, "rewards/margins": 0.054892003536224365, "rewards/rejected": 0.01588483154773712, "step": 2913 }, { "epoch": 0.45064759327276244, "grad_norm": 6.8520612716674805, "learning_rate": 4.721044793218009e-06, "logits/chosen": 11.555852890014648, "logits/rejected": 9.000267028808594, "logps/chosen": -354.20050048828125, "logps/rejected": -288.676513671875, "loss": 0.6133, "rewards/accuracies": 0.75, "rewards/chosen": 0.16141340136528015, "rewards/margins": 0.2694016993045807, "rewards/rejected": -0.10798830538988113, "step": 2914 }, { "epoch": 0.4508022424125266, "grad_norm": 6.7219438552856445, "learning_rate": 4.7207583915683355e-06, "logits/chosen": 7.465461254119873, "logits/rejected": 13.928438186645508, "logps/chosen": -259.7578125, "logps/rejected": -347.8528137207031, "loss": 0.8802, "rewards/accuracies": 0.25, "rewards/chosen": 0.0214422345161438, "rewards/margins": -0.19802358746528625, "rewards/rejected": 0.21946582198143005, "step": 2915 }, { "epoch": 0.45095689155229074, "grad_norm": 5.846672534942627, "learning_rate": 4.720471989918662e-06, "logits/chosen": 7.874068737030029, "logits/rejected": 11.163753509521484, "logps/chosen": -341.92974853515625, "logps/rejected": -356.7088928222656, "loss": 0.6387, "rewards/accuracies": 0.625, "rewards/chosen": 0.18847458064556122, "rewards/margins": 0.26053744554519653, "rewards/rejected": -0.07206287235021591, "step": 2916 }, { "epoch": 0.4511115406920549, "grad_norm": 3.4652023315429688, "learning_rate": 4.720185588268989e-06, "logits/chosen": 11.5037202835083, "logits/rejected": 4.1251983642578125, "logps/chosen": -203.9507293701172, "logps/rejected": -116.57904052734375, "loss": 0.6846, "rewards/accuracies": 0.375, "rewards/chosen": 0.03298850357532501, "rewards/margins": 0.11044225841760635, "rewards/rejected": -0.07745376229286194, "step": 2917 }, { "epoch": 0.45126618983181904, "grad_norm": 4.189902305603027, "learning_rate": 4.7198991866193155e-06, "logits/chosen": 11.779868125915527, "logits/rejected": -1.0590565204620361, "logps/chosen": -261.41375732421875, "logps/rejected": -124.47135925292969, "loss": 0.5819, "rewards/accuracies": 0.625, "rewards/chosen": 0.049158379435539246, "rewards/margins": 0.5056743025779724, "rewards/rejected": -0.456515908241272, "step": 2918 }, { "epoch": 0.4514208389715832, "grad_norm": 4.399352550506592, "learning_rate": 4.719612784969641e-06, "logits/chosen": 10.58271312713623, "logits/rejected": 10.691996574401855, "logps/chosen": -217.08486938476562, "logps/rejected": -211.27662658691406, "loss": 0.6223, "rewards/accuracies": 0.625, "rewards/chosen": 0.0179380364716053, "rewards/margins": 0.1928594410419464, "rewards/rejected": -0.1749214231967926, "step": 2919 }, { "epoch": 0.4515754881113474, "grad_norm": 5.943427562713623, "learning_rate": 4.719326383319968e-06, "logits/chosen": 6.621442794799805, "logits/rejected": 4.642213344573975, "logps/chosen": -247.37417602539062, "logps/rejected": -218.21728515625, "loss": 0.6181, "rewards/accuracies": 0.5, "rewards/chosen": 0.10349779576063156, "rewards/margins": 0.2949559688568115, "rewards/rejected": -0.19145813584327698, "step": 2920 }, { "epoch": 0.45173013725111155, "grad_norm": 3.9639053344726562, "learning_rate": 4.7190399816702946e-06, "logits/chosen": 7.8957438468933105, "logits/rejected": -1.8594489097595215, "logps/chosen": -338.7899169921875, "logps/rejected": -201.40090942382812, "loss": 0.5404, "rewards/accuracies": 0.875, "rewards/chosen": 0.4964830279350281, "rewards/margins": 0.4400922656059265, "rewards/rejected": 0.05639071762561798, "step": 2921 }, { "epoch": 0.4518847863908757, "grad_norm": 5.448639392852783, "learning_rate": 4.718753580020621e-06, "logits/chosen": 6.615348815917969, "logits/rejected": 2.5457427501678467, "logps/chosen": -345.60064697265625, "logps/rejected": -294.2164611816406, "loss": 0.5164, "rewards/accuracies": 0.75, "rewards/chosen": 0.34713125228881836, "rewards/margins": 0.5121694207191467, "rewards/rejected": -0.16503816843032837, "step": 2922 }, { "epoch": 0.45203943553063985, "grad_norm": 7.111696720123291, "learning_rate": 4.718467178370948e-06, "logits/chosen": 8.019306182861328, "logits/rejected": 7.517104625701904, "logps/chosen": -350.3014221191406, "logps/rejected": -288.8633117675781, "loss": 0.5941, "rewards/accuracies": 0.75, "rewards/chosen": 0.288277804851532, "rewards/margins": 0.28707897663116455, "rewards/rejected": 0.0011988431215286255, "step": 2923 }, { "epoch": 0.452194084670404, "grad_norm": 5.23995304107666, "learning_rate": 4.7181807767212745e-06, "logits/chosen": 8.124153137207031, "logits/rejected": 9.436251640319824, "logps/chosen": -284.27801513671875, "logps/rejected": -277.70611572265625, "loss": 0.7135, "rewards/accuracies": 0.5, "rewards/chosen": 0.08599749207496643, "rewards/margins": -0.021200411021709442, "rewards/rejected": 0.10719789564609528, "step": 2924 }, { "epoch": 0.45234873381016816, "grad_norm": 5.256038188934326, "learning_rate": 4.7178943750716e-06, "logits/chosen": 8.66554069519043, "logits/rejected": 4.414836406707764, "logps/chosen": -181.9287109375, "logps/rejected": -150.93508911132812, "loss": 0.6235, "rewards/accuracies": 0.75, "rewards/chosen": -0.0687284767627716, "rewards/margins": 0.2929876744747162, "rewards/rejected": -0.3617161512374878, "step": 2925 }, { "epoch": 0.45250338294993236, "grad_norm": 7.359097003936768, "learning_rate": 4.717607973421927e-06, "logits/chosen": 1.096543550491333, "logits/rejected": 6.549772262573242, "logps/chosen": -290.3392028808594, "logps/rejected": -384.2798156738281, "loss": 0.8194, "rewards/accuracies": 0.125, "rewards/chosen": 0.26173877716064453, "rewards/margins": -0.1917702704668045, "rewards/rejected": 0.45350903272628784, "step": 2926 }, { "epoch": 0.4526580320896965, "grad_norm": 4.2451629638671875, "learning_rate": 4.717321571772254e-06, "logits/chosen": 9.908976554870605, "logits/rejected": 6.200870037078857, "logps/chosen": -174.06968688964844, "logps/rejected": -154.8437042236328, "loss": 0.6444, "rewards/accuracies": 0.625, "rewards/chosen": -0.03125952184200287, "rewards/margins": 0.20702381432056427, "rewards/rejected": -0.23828335106372833, "step": 2927 }, { "epoch": 0.45281268122946067, "grad_norm": 4.497978210449219, "learning_rate": 4.71703517012258e-06, "logits/chosen": 9.630134582519531, "logits/rejected": 3.6748063564300537, "logps/chosen": -295.34820556640625, "logps/rejected": -263.46453857421875, "loss": 0.6878, "rewards/accuracies": 0.5, "rewards/chosen": 0.1658172607421875, "rewards/margins": 0.2573878765106201, "rewards/rejected": -0.091570645570755, "step": 2928 }, { "epoch": 0.4529673303692248, "grad_norm": 9.614076614379883, "learning_rate": 4.716748768472907e-06, "logits/chosen": 9.055042266845703, "logits/rejected": 5.912835121154785, "logps/chosen": -337.821044921875, "logps/rejected": -322.4443359375, "loss": 0.8793, "rewards/accuracies": 0.375, "rewards/chosen": -0.38718166947364807, "rewards/margins": -0.21364375948905945, "rewards/rejected": -0.17353789508342743, "step": 2929 }, { "epoch": 0.45312197950898897, "grad_norm": 6.579684734344482, "learning_rate": 4.716462366823234e-06, "logits/chosen": 9.525078773498535, "logits/rejected": 4.042651653289795, "logps/chosen": -430.9586486816406, "logps/rejected": -267.54595947265625, "loss": 0.533, "rewards/accuracies": 0.625, "rewards/chosen": 0.36265188455581665, "rewards/margins": 0.6446887254714966, "rewards/rejected": -0.28203684091567993, "step": 2930 }, { "epoch": 0.4532766286487531, "grad_norm": 4.843046188354492, "learning_rate": 4.71617596517356e-06, "logits/chosen": 9.071455001831055, "logits/rejected": 3.2884366512298584, "logps/chosen": -255.28253173828125, "logps/rejected": -168.43502807617188, "loss": 0.7336, "rewards/accuracies": 0.5, "rewards/chosen": 0.09355826675891876, "rewards/margins": 0.007359735667705536, "rewards/rejected": 0.08619852364063263, "step": 2931 }, { "epoch": 0.45343127778851733, "grad_norm": 5.47656774520874, "learning_rate": 4.715889563523886e-06, "logits/chosen": 13.489498138427734, "logits/rejected": 12.307373046875, "logps/chosen": -225.63623046875, "logps/rejected": -243.85015869140625, "loss": 0.6866, "rewards/accuracies": 0.5, "rewards/chosen": -0.09337463974952698, "rewards/margins": 0.06104620546102524, "rewards/rejected": -0.1544208526611328, "step": 2932 }, { "epoch": 0.4535859269282815, "grad_norm": 5.834225177764893, "learning_rate": 4.715603161874213e-06, "logits/chosen": 9.431708335876465, "logits/rejected": 7.638908386230469, "logps/chosen": -302.6466064453125, "logps/rejected": -247.7978057861328, "loss": 0.5391, "rewards/accuracies": 0.625, "rewards/chosen": 0.09487934410572052, "rewards/margins": 0.44622403383255005, "rewards/rejected": -0.3513447046279907, "step": 2933 }, { "epoch": 0.45374057606804563, "grad_norm": 4.184797286987305, "learning_rate": 4.715316760224539e-06, "logits/chosen": 8.781228065490723, "logits/rejected": 6.778829574584961, "logps/chosen": -264.7762451171875, "logps/rejected": -270.5966796875, "loss": 0.5381, "rewards/accuracies": 0.625, "rewards/chosen": 0.38937321305274963, "rewards/margins": 0.5425171256065369, "rewards/rejected": -0.15314389765262604, "step": 2934 }, { "epoch": 0.4538952252078098, "grad_norm": 7.629965305328369, "learning_rate": 4.715030358574866e-06, "logits/chosen": 8.832340240478516, "logits/rejected": 7.788751602172852, "logps/chosen": -261.5520324707031, "logps/rejected": -257.4610900878906, "loss": 0.5413, "rewards/accuracies": 0.75, "rewards/chosen": 0.2996445298194885, "rewards/margins": 0.5069663524627686, "rewards/rejected": -0.20732179284095764, "step": 2935 }, { "epoch": 0.45404987434757393, "grad_norm": 18.59811019897461, "learning_rate": 4.714743956925193e-06, "logits/chosen": 8.237777709960938, "logits/rejected": 6.447390556335449, "logps/chosen": -283.7626647949219, "logps/rejected": -213.61549377441406, "loss": 0.6715, "rewards/accuracies": 0.625, "rewards/chosen": 0.2837170958518982, "rewards/margins": 0.1279708445072174, "rewards/rejected": 0.15574628114700317, "step": 2936 }, { "epoch": 0.4542045234873381, "grad_norm": 5.235077857971191, "learning_rate": 4.714457555275519e-06, "logits/chosen": 13.669167518615723, "logits/rejected": 10.029403686523438, "logps/chosen": -276.7439270019531, "logps/rejected": -259.11639404296875, "loss": 0.5792, "rewards/accuracies": 0.625, "rewards/chosen": 0.1466926783323288, "rewards/margins": 0.3439185321331024, "rewards/rejected": -0.19722583889961243, "step": 2937 }, { "epoch": 0.45435917262710224, "grad_norm": 8.17039680480957, "learning_rate": 4.714171153625846e-06, "logits/chosen": 12.728302955627441, "logits/rejected": 10.468025207519531, "logps/chosen": -398.0960693359375, "logps/rejected": -307.9277648925781, "loss": 0.7742, "rewards/accuracies": 0.375, "rewards/chosen": -0.39349180459976196, "rewards/margins": -0.043975263833999634, "rewards/rejected": -0.3495165705680847, "step": 2938 }, { "epoch": 0.45451382176686644, "grad_norm": 4.526334285736084, "learning_rate": 4.713884751976172e-06, "logits/chosen": 8.52957820892334, "logits/rejected": 5.860341548919678, "logps/chosen": -276.20379638671875, "logps/rejected": -196.57733154296875, "loss": 0.599, "rewards/accuracies": 0.625, "rewards/chosen": -0.05858264118432999, "rewards/margins": 0.25034815073013306, "rewards/rejected": -0.30893078446388245, "step": 2939 }, { "epoch": 0.4546684709066306, "grad_norm": 5.230543613433838, "learning_rate": 4.713598350326498e-06, "logits/chosen": 9.470664024353027, "logits/rejected": 6.667529582977295, "logps/chosen": -258.9997253417969, "logps/rejected": -284.2755126953125, "loss": 0.6478, "rewards/accuracies": 0.625, "rewards/chosen": 0.1312621384859085, "rewards/margins": 0.14021340012550354, "rewards/rejected": -0.008951276540756226, "step": 2940 }, { "epoch": 0.45482312004639475, "grad_norm": 3.7914252281188965, "learning_rate": 4.713311948676825e-06, "logits/chosen": 9.263826370239258, "logits/rejected": -0.4324500560760498, "logps/chosen": -217.45944213867188, "logps/rejected": -125.88097381591797, "loss": 0.6231, "rewards/accuracies": 0.625, "rewards/chosen": -0.0592634491622448, "rewards/margins": 0.2843110263347626, "rewards/rejected": -0.3435744643211365, "step": 2941 }, { "epoch": 0.4549777691861589, "grad_norm": 9.012472152709961, "learning_rate": 4.713025547027152e-06, "logits/chosen": 6.490154266357422, "logits/rejected": 6.397916316986084, "logps/chosen": -307.388671875, "logps/rejected": -282.1990966796875, "loss": 0.9156, "rewards/accuracies": 0.375, "rewards/chosen": -0.1721625179052353, "rewards/margins": -0.19464808702468872, "rewards/rejected": 0.022485554218292236, "step": 2942 }, { "epoch": 0.45513241832592305, "grad_norm": 6.005348205566406, "learning_rate": 4.7127391453774775e-06, "logits/chosen": 13.595619201660156, "logits/rejected": 8.500800132751465, "logps/chosen": -287.17926025390625, "logps/rejected": -235.2257843017578, "loss": 0.6991, "rewards/accuracies": 0.625, "rewards/chosen": 0.12554672360420227, "rewards/margins": 0.049536775797605515, "rewards/rejected": 0.07600995153188705, "step": 2943 }, { "epoch": 0.4552870674656872, "grad_norm": 7.570594787597656, "learning_rate": 4.712452743727804e-06, "logits/chosen": 10.391090393066406, "logits/rejected": 5.073013782501221, "logps/chosen": -282.4736633300781, "logps/rejected": -215.39175415039062, "loss": 0.6653, "rewards/accuracies": 0.625, "rewards/chosen": -0.293901264667511, "rewards/margins": 0.13103628158569336, "rewards/rejected": -0.42493754625320435, "step": 2944 }, { "epoch": 0.4554417166054514, "grad_norm": 5.320469856262207, "learning_rate": 4.712166342078131e-06, "logits/chosen": 10.800678253173828, "logits/rejected": 7.414029121398926, "logps/chosen": -260.4130859375, "logps/rejected": -228.25308227539062, "loss": 0.6105, "rewards/accuracies": 0.5, "rewards/chosen": 0.12191639840602875, "rewards/margins": 0.27259302139282227, "rewards/rejected": -0.1506766378879547, "step": 2945 }, { "epoch": 0.45559636574521556, "grad_norm": 7.1692795753479, "learning_rate": 4.7118799404284574e-06, "logits/chosen": 5.790440082550049, "logits/rejected": 9.175068855285645, "logps/chosen": -303.69293212890625, "logps/rejected": -370.9253845214844, "loss": 0.948, "rewards/accuracies": 0.125, "rewards/chosen": -0.057733919471502304, "rewards/margins": -0.36218681931495667, "rewards/rejected": 0.30445292592048645, "step": 2946 }, { "epoch": 0.4557510148849797, "grad_norm": 5.838233470916748, "learning_rate": 4.711593538778783e-06, "logits/chosen": 13.011774063110352, "logits/rejected": 8.782716751098633, "logps/chosen": -422.3138122558594, "logps/rejected": -327.1571044921875, "loss": 0.6231, "rewards/accuracies": 0.625, "rewards/chosen": 0.583827018737793, "rewards/margins": 0.25265467166900635, "rewards/rejected": 0.3311723470687866, "step": 2947 }, { "epoch": 0.45590566402474386, "grad_norm": 7.294370174407959, "learning_rate": 4.71130713712911e-06, "logits/chosen": 9.52261734008789, "logits/rejected": 10.294153213500977, "logps/chosen": -422.61273193359375, "logps/rejected": -390.1136474609375, "loss": 0.569, "rewards/accuracies": 0.625, "rewards/chosen": 0.5218958258628845, "rewards/margins": 0.4072917699813843, "rewards/rejected": 0.11460405588150024, "step": 2948 }, { "epoch": 0.456060313164508, "grad_norm": 6.742400169372559, "learning_rate": 4.7110207354794366e-06, "logits/chosen": 9.81273365020752, "logits/rejected": 7.5383405685424805, "logps/chosen": -310.8827819824219, "logps/rejected": -266.2939758300781, "loss": 0.6422, "rewards/accuracies": 0.5, "rewards/chosen": 0.4784916043281555, "rewards/margins": 0.23882469534873962, "rewards/rejected": 0.2396669089794159, "step": 2949 }, { "epoch": 0.45621496230427216, "grad_norm": 6.285976409912109, "learning_rate": 4.710734333829763e-06, "logits/chosen": 15.320674896240234, "logits/rejected": 13.500925064086914, "logps/chosen": -305.8255310058594, "logps/rejected": -246.95144653320312, "loss": 0.6347, "rewards/accuracies": 0.625, "rewards/chosen": 0.2369910180568695, "rewards/margins": 0.14823904633522034, "rewards/rejected": 0.08875197917222977, "step": 2950 }, { "epoch": 0.4563696114440363, "grad_norm": 10.00040054321289, "learning_rate": 4.71044793218009e-06, "logits/chosen": 12.439031600952148, "logits/rejected": 8.216947555541992, "logps/chosen": -416.4801025390625, "logps/rejected": -335.2454528808594, "loss": 0.6619, "rewards/accuracies": 0.625, "rewards/chosen": 0.06558550894260406, "rewards/margins": 0.15731868147850037, "rewards/rejected": -0.0917331725358963, "step": 2951 }, { "epoch": 0.4565242605838005, "grad_norm": 4.993187427520752, "learning_rate": 4.710161530530416e-06, "logits/chosen": 5.436563968658447, "logits/rejected": 4.8355488777160645, "logps/chosen": -309.2663879394531, "logps/rejected": -268.904541015625, "loss": 0.639, "rewards/accuracies": 0.625, "rewards/chosen": 0.20571264624595642, "rewards/margins": 0.14895473420619965, "rewards/rejected": 0.05675792321562767, "step": 2952 }, { "epoch": 0.4566789097235647, "grad_norm": 3.816739559173584, "learning_rate": 4.709875128880742e-06, "logits/chosen": 10.772808074951172, "logits/rejected": 12.702529907226562, "logps/chosen": -130.58863830566406, "logps/rejected": -108.5149154663086, "loss": 0.6891, "rewards/accuracies": 0.5, "rewards/chosen": -0.06780728697776794, "rewards/margins": 0.02490680105984211, "rewards/rejected": -0.0927141010761261, "step": 2953 }, { "epoch": 0.4568335588633288, "grad_norm": 8.872064590454102, "learning_rate": 4.709588727231069e-06, "logits/chosen": 10.774144172668457, "logits/rejected": 9.176715850830078, "logps/chosen": -183.29100036621094, "logps/rejected": -157.44570922851562, "loss": 0.5443, "rewards/accuracies": 0.625, "rewards/chosen": 0.378996878862381, "rewards/margins": 0.3657678961753845, "rewards/rejected": 0.013228986412286758, "step": 2954 }, { "epoch": 0.456988208003093, "grad_norm": 11.244800567626953, "learning_rate": 4.709302325581396e-06, "logits/chosen": 12.992457389831543, "logits/rejected": 5.650915145874023, "logps/chosen": -223.0264892578125, "logps/rejected": -215.3613739013672, "loss": 0.7114, "rewards/accuracies": 0.5, "rewards/chosen": -0.029163725674152374, "rewards/margins": 0.1944197714328766, "rewards/rejected": -0.22358348965644836, "step": 2955 }, { "epoch": 0.45714285714285713, "grad_norm": 6.742232322692871, "learning_rate": 4.709015923931722e-06, "logits/chosen": 5.833223819732666, "logits/rejected": -0.9751105308532715, "logps/chosen": -503.0262756347656, "logps/rejected": -311.10504150390625, "loss": 0.6434, "rewards/accuracies": 0.5, "rewards/chosen": 0.26541581749916077, "rewards/margins": 0.1657085418701172, "rewards/rejected": 0.09970728307962418, "step": 2956 }, { "epoch": 0.4572975062826213, "grad_norm": 5.457131385803223, "learning_rate": 4.708729522282049e-06, "logits/chosen": 5.667617321014404, "logits/rejected": 9.613898277282715, "logps/chosen": -177.75857543945312, "logps/rejected": -226.108642578125, "loss": 0.8716, "rewards/accuracies": 0.375, "rewards/chosen": -0.24148930609226227, "rewards/margins": -0.23205962777137756, "rewards/rejected": -0.009429693222045898, "step": 2957 }, { "epoch": 0.4574521554223855, "grad_norm": 6.827175617218018, "learning_rate": 4.708443120632375e-06, "logits/chosen": 8.793025016784668, "logits/rejected": 11.108979225158691, "logps/chosen": -138.37109375, "logps/rejected": -147.51718139648438, "loss": 0.8412, "rewards/accuracies": 0.5, "rewards/chosen": -0.14409655332565308, "rewards/margins": -0.19285935163497925, "rewards/rejected": 0.04876277595758438, "step": 2958 }, { "epoch": 0.45760680456214964, "grad_norm": 5.0028157234191895, "learning_rate": 4.708156718982701e-06, "logits/chosen": 7.638659477233887, "logits/rejected": 9.501580238342285, "logps/chosen": -193.59210205078125, "logps/rejected": -217.2323760986328, "loss": 0.7786, "rewards/accuracies": 0.25, "rewards/chosen": -0.15716448426246643, "rewards/margins": -0.04010576009750366, "rewards/rejected": -0.11705875396728516, "step": 2959 }, { "epoch": 0.4577614537019138, "grad_norm": 5.470632553100586, "learning_rate": 4.707870317333028e-06, "logits/chosen": 8.95091724395752, "logits/rejected": 8.36347484588623, "logps/chosen": -276.8647155761719, "logps/rejected": -222.46798706054688, "loss": 0.7394, "rewards/accuracies": 0.375, "rewards/chosen": 0.07018567621707916, "rewards/margins": -0.04856785014271736, "rewards/rejected": 0.11875350773334503, "step": 2960 }, { "epoch": 0.45791610284167794, "grad_norm": 10.072470664978027, "learning_rate": 4.707583915683355e-06, "logits/chosen": 6.957911491394043, "logits/rejected": 8.475503921508789, "logps/chosen": -246.808837890625, "logps/rejected": -275.784912109375, "loss": 0.7485, "rewards/accuracies": 0.5, "rewards/chosen": 0.049744416028261185, "rewards/margins": -0.09210437536239624, "rewards/rejected": 0.14184880256652832, "step": 2961 }, { "epoch": 0.4580707519814421, "grad_norm": 5.873750686645508, "learning_rate": 4.707297514033681e-06, "logits/chosen": 8.11273193359375, "logits/rejected": 11.537481307983398, "logps/chosen": -180.97439575195312, "logps/rejected": -252.358154296875, "loss": 0.7115, "rewards/accuracies": 0.625, "rewards/chosen": 0.13970740139484406, "rewards/margins": 0.11542198061943054, "rewards/rejected": 0.024285420775413513, "step": 2962 }, { "epoch": 0.45822540112120624, "grad_norm": 5.016502857208252, "learning_rate": 4.707011112384008e-06, "logits/chosen": 9.926589012145996, "logits/rejected": 8.350566864013672, "logps/chosen": -256.6161804199219, "logps/rejected": -244.02792358398438, "loss": 0.5445, "rewards/accuracies": 0.875, "rewards/chosen": 0.28623121976852417, "rewards/margins": 0.3518240451812744, "rewards/rejected": -0.06559281051158905, "step": 2963 }, { "epoch": 0.45838005026097045, "grad_norm": 6.145665168762207, "learning_rate": 4.706724710734335e-06, "logits/chosen": 9.292642593383789, "logits/rejected": 7.054403305053711, "logps/chosen": -302.0745849609375, "logps/rejected": -288.5801086425781, "loss": 0.5826, "rewards/accuracies": 0.75, "rewards/chosen": 0.17600078880786896, "rewards/margins": 0.39004552364349365, "rewards/rejected": -0.2140447497367859, "step": 2964 }, { "epoch": 0.4585346994007346, "grad_norm": 5.751436233520508, "learning_rate": 4.70643830908466e-06, "logits/chosen": 11.112475395202637, "logits/rejected": 10.395916938781738, "logps/chosen": -268.319091796875, "logps/rejected": -253.51795959472656, "loss": 0.7416, "rewards/accuracies": 0.625, "rewards/chosen": 0.17272084951400757, "rewards/margins": 0.050621889531612396, "rewards/rejected": 0.12209896743297577, "step": 2965 }, { "epoch": 0.45868934854049875, "grad_norm": 4.8773040771484375, "learning_rate": 4.706151907434987e-06, "logits/chosen": 9.408148765563965, "logits/rejected": 5.81047248840332, "logps/chosen": -355.807861328125, "logps/rejected": -242.07968139648438, "loss": 0.543, "rewards/accuracies": 0.625, "rewards/chosen": 0.3769363462924957, "rewards/margins": 0.4655241072177887, "rewards/rejected": -0.08858775347471237, "step": 2966 }, { "epoch": 0.4588439976802629, "grad_norm": 6.105955600738525, "learning_rate": 4.705865505785314e-06, "logits/chosen": 7.414273262023926, "logits/rejected": 4.827837944030762, "logps/chosen": -419.0539245605469, "logps/rejected": -320.00335693359375, "loss": 0.519, "rewards/accuracies": 0.75, "rewards/chosen": 0.37135595083236694, "rewards/margins": 0.4644581079483032, "rewards/rejected": -0.09310220181941986, "step": 2967 }, { "epoch": 0.45899864682002706, "grad_norm": 6.328551292419434, "learning_rate": 4.70557910413564e-06, "logits/chosen": 7.7021942138671875, "logits/rejected": 6.684210777282715, "logps/chosen": -291.421142578125, "logps/rejected": -328.996337890625, "loss": 0.678, "rewards/accuracies": 0.625, "rewards/chosen": 0.04359845817089081, "rewards/margins": 0.07960616052150726, "rewards/rejected": -0.03600768744945526, "step": 2968 }, { "epoch": 0.4591532959597912, "grad_norm": 4.896423816680908, "learning_rate": 4.705292702485967e-06, "logits/chosen": 11.680112838745117, "logits/rejected": 6.156183242797852, "logps/chosen": -274.7214050292969, "logps/rejected": -170.7608642578125, "loss": 0.5834, "rewards/accuracies": 0.625, "rewards/chosen": 0.22536210715770721, "rewards/margins": 0.2836277484893799, "rewards/rejected": -0.05826563760638237, "step": 2969 }, { "epoch": 0.45930794509955536, "grad_norm": 6.469099998474121, "learning_rate": 4.705006300836294e-06, "logits/chosen": 10.331438064575195, "logits/rejected": 8.041559219360352, "logps/chosen": -336.5439453125, "logps/rejected": -307.2086486816406, "loss": 0.644, "rewards/accuracies": 0.625, "rewards/chosen": 0.03885392099618912, "rewards/margins": 0.17932654917240143, "rewards/rejected": -0.1404726207256317, "step": 2970 }, { "epoch": 0.45946259423931957, "grad_norm": 5.671962261199951, "learning_rate": 4.70471989918662e-06, "logits/chosen": 10.851548194885254, "logits/rejected": 11.975399017333984, "logps/chosen": -193.5721893310547, "logps/rejected": -231.0876922607422, "loss": 0.7137, "rewards/accuracies": 0.5, "rewards/chosen": 0.0047724246978759766, "rewards/margins": -0.009501226246356964, "rewards/rejected": 0.014273647218942642, "step": 2971 }, { "epoch": 0.4596172433790837, "grad_norm": 5.729818820953369, "learning_rate": 4.704433497536946e-06, "logits/chosen": 8.65424919128418, "logits/rejected": 6.078248977661133, "logps/chosen": -308.5588684082031, "logps/rejected": -304.5982971191406, "loss": 0.5868, "rewards/accuracies": 0.625, "rewards/chosen": 0.1789310723543167, "rewards/margins": 0.34274667501449585, "rewards/rejected": -0.16381561756134033, "step": 2972 }, { "epoch": 0.45977189251884787, "grad_norm": 9.466398239135742, "learning_rate": 4.704147095887273e-06, "logits/chosen": 15.809646606445312, "logits/rejected": 12.176141738891602, "logps/chosen": -345.842529296875, "logps/rejected": -261.44024658203125, "loss": 0.6041, "rewards/accuracies": 0.625, "rewards/chosen": 0.07272882759571075, "rewards/margins": 0.2708126902580261, "rewards/rejected": -0.19808383285999298, "step": 2973 }, { "epoch": 0.459926541658612, "grad_norm": 4.588893413543701, "learning_rate": 4.703860694237599e-06, "logits/chosen": 5.180237293243408, "logits/rejected": 1.8059334754943848, "logps/chosen": -415.674560546875, "logps/rejected": -228.9871063232422, "loss": 0.5759, "rewards/accuracies": 0.5, "rewards/chosen": 0.42974549531936646, "rewards/margins": 0.3696592450141907, "rewards/rejected": 0.060086242854595184, "step": 2974 }, { "epoch": 0.46008119079837617, "grad_norm": 4.644771575927734, "learning_rate": 4.703574292587926e-06, "logits/chosen": 6.418425559997559, "logits/rejected": 4.221429347991943, "logps/chosen": -132.37106323242188, "logps/rejected": -136.19680786132812, "loss": 0.6838, "rewards/accuracies": 0.5, "rewards/chosen": 0.03858347237110138, "rewards/margins": 0.09676718711853027, "rewards/rejected": -0.058183714747428894, "step": 2975 }, { "epoch": 0.4602358399381403, "grad_norm": 6.267160415649414, "learning_rate": 4.703287890938253e-06, "logits/chosen": 5.934901237487793, "logits/rejected": 9.727717399597168, "logps/chosen": -217.17721557617188, "logps/rejected": -239.1723175048828, "loss": 0.7471, "rewards/accuracies": 0.625, "rewards/chosen": -0.0834803581237793, "rewards/margins": 0.008295901119709015, "rewards/rejected": -0.0917762815952301, "step": 2976 }, { "epoch": 0.46039048907790453, "grad_norm": 3.273151159286499, "learning_rate": 4.7030014892885785e-06, "logits/chosen": 13.827066421508789, "logits/rejected": 7.5672807693481445, "logps/chosen": -153.92709350585938, "logps/rejected": -124.07454681396484, "loss": 0.5005, "rewards/accuracies": 0.75, "rewards/chosen": 0.3807571530342102, "rewards/margins": 0.49577221274375916, "rewards/rejected": -0.11501505970954895, "step": 2977 }, { "epoch": 0.4605451382176687, "grad_norm": 5.983728408813477, "learning_rate": 4.702715087638905e-06, "logits/chosen": 13.610491752624512, "logits/rejected": 7.699213981628418, "logps/chosen": -319.1959228515625, "logps/rejected": -244.608642578125, "loss": 0.6476, "rewards/accuracies": 0.625, "rewards/chosen": 0.4514175057411194, "rewards/margins": 0.2528921067714691, "rewards/rejected": 0.19852539896965027, "step": 2978 }, { "epoch": 0.46069978735743283, "grad_norm": 5.342053413391113, "learning_rate": 4.702428685989232e-06, "logits/chosen": 11.79953384399414, "logits/rejected": 7.764245986938477, "logps/chosen": -305.5440979003906, "logps/rejected": -251.59214782714844, "loss": 0.6267, "rewards/accuracies": 0.625, "rewards/chosen": 0.1859557330608368, "rewards/margins": 0.2612136900424957, "rewards/rejected": -0.07525796443223953, "step": 2979 }, { "epoch": 0.460854436497197, "grad_norm": 5.773763656616211, "learning_rate": 4.7021422843395585e-06, "logits/chosen": 12.285804748535156, "logits/rejected": 12.791536331176758, "logps/chosen": -244.81736755371094, "logps/rejected": -237.22161865234375, "loss": 0.7447, "rewards/accuracies": 0.375, "rewards/chosen": 0.004154682159423828, "rewards/margins": 0.12794196605682373, "rewards/rejected": -0.12378731369972229, "step": 2980 }, { "epoch": 0.46100908563696114, "grad_norm": 8.02035140991211, "learning_rate": 4.701855882689884e-06, "logits/chosen": 12.456417083740234, "logits/rejected": 7.1342315673828125, "logps/chosen": -371.58917236328125, "logps/rejected": -272.7286376953125, "loss": 0.7906, "rewards/accuracies": 0.5, "rewards/chosen": -0.19836415350437164, "rewards/margins": -0.05249124765396118, "rewards/rejected": -0.14587293565273285, "step": 2981 }, { "epoch": 0.4611637347767253, "grad_norm": 6.104375839233398, "learning_rate": 4.701569481040211e-06, "logits/chosen": 11.011655807495117, "logits/rejected": 10.13815689086914, "logps/chosen": -356.41497802734375, "logps/rejected": -345.7293701171875, "loss": 0.648, "rewards/accuracies": 0.625, "rewards/chosen": 0.31433993577957153, "rewards/margins": 0.15907107293605804, "rewards/rejected": 0.1552688628435135, "step": 2982 }, { "epoch": 0.46131838391648944, "grad_norm": 8.492865562438965, "learning_rate": 4.701283079390538e-06, "logits/chosen": 14.472335815429688, "logits/rejected": 8.111994743347168, "logps/chosen": -416.2086181640625, "logps/rejected": -369.51458740234375, "loss": 0.7069, "rewards/accuracies": 0.5, "rewards/chosen": 0.1842903196811676, "rewards/margins": 0.04865151643753052, "rewards/rejected": 0.13563883304595947, "step": 2983 }, { "epoch": 0.46147303305625365, "grad_norm": 4.215407371520996, "learning_rate": 4.700996677740864e-06, "logits/chosen": 11.113818168640137, "logits/rejected": 9.38661003112793, "logps/chosen": -231.343505859375, "logps/rejected": -197.98866271972656, "loss": 0.6674, "rewards/accuracies": 0.5, "rewards/chosen": 0.08780211210250854, "rewards/margins": 0.08906412124633789, "rewards/rejected": -0.0012619979679584503, "step": 2984 }, { "epoch": 0.4616276821960178, "grad_norm": 8.360186576843262, "learning_rate": 4.700710276091191e-06, "logits/chosen": 11.55423641204834, "logits/rejected": 7.486073017120361, "logps/chosen": -370.11126708984375, "logps/rejected": -306.4674987792969, "loss": 0.4897, "rewards/accuracies": 0.875, "rewards/chosen": -0.07265090942382812, "rewards/margins": 0.5572525262832642, "rewards/rejected": -0.6299034357070923, "step": 2985 }, { "epoch": 0.46178233133578195, "grad_norm": 5.565553665161133, "learning_rate": 4.700423874441517e-06, "logits/chosen": 18.98154067993164, "logits/rejected": 10.315452575683594, "logps/chosen": -302.1778259277344, "logps/rejected": -257.02337646484375, "loss": 0.6857, "rewards/accuracies": 0.625, "rewards/chosen": -0.035533711314201355, "rewards/margins": 0.07154185324907303, "rewards/rejected": -0.10707554966211319, "step": 2986 }, { "epoch": 0.4619369804755461, "grad_norm": 7.741940498352051, "learning_rate": 4.700137472791843e-06, "logits/chosen": 7.266263961791992, "logits/rejected": 6.052780628204346, "logps/chosen": -197.5580291748047, "logps/rejected": -155.3468017578125, "loss": 0.8848, "rewards/accuracies": 0.375, "rewards/chosen": 0.10885835438966751, "rewards/margins": -0.2579139471054077, "rewards/rejected": 0.36677226424217224, "step": 2987 }, { "epoch": 0.46209162961531025, "grad_norm": 5.416033744812012, "learning_rate": 4.69985107114217e-06, "logits/chosen": 9.60762882232666, "logits/rejected": 8.453782081604004, "logps/chosen": -237.88233947753906, "logps/rejected": -262.14422607421875, "loss": 0.6655, "rewards/accuracies": 0.75, "rewards/chosen": 0.1277083456516266, "rewards/margins": 0.12136536836624146, "rewards/rejected": 0.006342977285385132, "step": 2988 }, { "epoch": 0.4622462787550744, "grad_norm": 4.104170322418213, "learning_rate": 4.699564669492497e-06, "logits/chosen": 10.760554313659668, "logits/rejected": 4.247102737426758, "logps/chosen": -253.0543212890625, "logps/rejected": -199.67893981933594, "loss": 0.5376, "rewards/accuracies": 0.75, "rewards/chosen": 0.12506359815597534, "rewards/margins": 0.3998783528804779, "rewards/rejected": -0.2748147249221802, "step": 2989 }, { "epoch": 0.4624009278948386, "grad_norm": 7.315491199493408, "learning_rate": 4.699278267842823e-06, "logits/chosen": 4.25111198425293, "logits/rejected": 3.1076107025146484, "logps/chosen": -207.8371124267578, "logps/rejected": -184.70236206054688, "loss": 0.5232, "rewards/accuracies": 0.75, "rewards/chosen": 0.5599367618560791, "rewards/margins": 0.49828776717185974, "rewards/rejected": 0.06164901703596115, "step": 2990 }, { "epoch": 0.46255557703460276, "grad_norm": 6.039941310882568, "learning_rate": 4.698991866193149e-06, "logits/chosen": 14.211645126342773, "logits/rejected": 9.0758638381958, "logps/chosen": -402.0782470703125, "logps/rejected": -302.526123046875, "loss": 0.6627, "rewards/accuracies": 0.625, "rewards/chosen": 0.14402027428150177, "rewards/margins": 0.11809063702821732, "rewards/rejected": 0.02592964470386505, "step": 2991 }, { "epoch": 0.4627102261743669, "grad_norm": 9.251330375671387, "learning_rate": 4.698705464543476e-06, "logits/chosen": 8.933084487915039, "logits/rejected": 7.97614049911499, "logps/chosen": -380.0074768066406, "logps/rejected": -392.1715393066406, "loss": 0.8386, "rewards/accuracies": 0.5, "rewards/chosen": -0.03924284875392914, "rewards/margins": -0.2019944190979004, "rewards/rejected": 0.16275157034397125, "step": 2992 }, { "epoch": 0.46286487531413106, "grad_norm": 5.163235664367676, "learning_rate": 4.698419062893802e-06, "logits/chosen": 12.297382354736328, "logits/rejected": 7.0777788162231445, "logps/chosen": -309.34100341796875, "logps/rejected": -221.50167846679688, "loss": 0.5401, "rewards/accuracies": 0.875, "rewards/chosen": 0.46429651975631714, "rewards/margins": 0.3811018168926239, "rewards/rejected": 0.08319471776485443, "step": 2993 }, { "epoch": 0.4630195244538952, "grad_norm": 4.26530122756958, "learning_rate": 4.698132661244129e-06, "logits/chosen": 16.082658767700195, "logits/rejected": 12.471482276916504, "logps/chosen": -221.74954223632812, "logps/rejected": -208.36878967285156, "loss": 0.5515, "rewards/accuracies": 0.75, "rewards/chosen": 0.4126954674720764, "rewards/margins": 0.3598061501979828, "rewards/rejected": 0.05288929119706154, "step": 2994 }, { "epoch": 0.46317417359365937, "grad_norm": 7.143664836883545, "learning_rate": 4.697846259594456e-06, "logits/chosen": 10.269268989562988, "logits/rejected": 4.97338342666626, "logps/chosen": -290.7821044921875, "logps/rejected": -291.57269287109375, "loss": 0.7233, "rewards/accuracies": 0.5, "rewards/chosen": -0.0773792564868927, "rewards/margins": -0.0046173036098480225, "rewards/rejected": -0.07276192307472229, "step": 2995 }, { "epoch": 0.4633288227334235, "grad_norm": 3.841068744659424, "learning_rate": 4.697559857944782e-06, "logits/chosen": 11.763745307922363, "logits/rejected": 10.870201110839844, "logps/chosen": -222.72222900390625, "logps/rejected": -204.68389892578125, "loss": 0.4981, "rewards/accuracies": 0.875, "rewards/chosen": 0.1590469777584076, "rewards/margins": 0.48615387082099915, "rewards/rejected": -0.32710689306259155, "step": 2996 }, { "epoch": 0.4634834718731877, "grad_norm": 5.703514099121094, "learning_rate": 4.697273456295109e-06, "logits/chosen": 11.960027694702148, "logits/rejected": 6.618836402893066, "logps/chosen": -256.2616271972656, "logps/rejected": -279.80126953125, "loss": 0.638, "rewards/accuracies": 0.625, "rewards/chosen": 0.3547555208206177, "rewards/margins": 0.3694687783718109, "rewards/rejected": -0.014713302254676819, "step": 2997 }, { "epoch": 0.4636381210129519, "grad_norm": 4.851461887359619, "learning_rate": 4.696987054645435e-06, "logits/chosen": 8.914794921875, "logits/rejected": 8.558012008666992, "logps/chosen": -239.77220153808594, "logps/rejected": -263.8995361328125, "loss": 0.6449, "rewards/accuracies": 0.375, "rewards/chosen": 0.26013949513435364, "rewards/margins": 0.17149752378463745, "rewards/rejected": 0.08864197880029678, "step": 2998 }, { "epoch": 0.463792770152716, "grad_norm": 3.604166269302368, "learning_rate": 4.6967006529957614e-06, "logits/chosen": 11.806922912597656, "logits/rejected": 11.061351776123047, "logps/chosen": -198.9353485107422, "logps/rejected": -139.0316619873047, "loss": 0.5342, "rewards/accuracies": 0.75, "rewards/chosen": 0.14822879433631897, "rewards/margins": 0.40380263328552246, "rewards/rejected": -0.2555738389492035, "step": 2999 }, { "epoch": 0.4639474192924802, "grad_norm": 5.562335968017578, "learning_rate": 4.696414251346088e-06, "logits/chosen": 5.255735874176025, "logits/rejected": 9.8038330078125, "logps/chosen": -140.96401977539062, "logps/rejected": -195.1973876953125, "loss": 0.9344, "rewards/accuracies": 0.375, "rewards/chosen": -0.49991849064826965, "rewards/margins": -0.3037942349910736, "rewards/rejected": -0.19612424075603485, "step": 3000 }, { "epoch": 0.46410206843224433, "grad_norm": 4.986563205718994, "learning_rate": 4.696127849696415e-06, "logits/chosen": 10.662957191467285, "logits/rejected": 8.625347137451172, "logps/chosen": -311.1173400878906, "logps/rejected": -277.9862060546875, "loss": 0.5527, "rewards/accuracies": 0.75, "rewards/chosen": 0.4443734288215637, "rewards/margins": 0.40489834547042847, "rewards/rejected": 0.03947506099939346, "step": 3001 }, { "epoch": 0.4642567175720085, "grad_norm": 4.071439266204834, "learning_rate": 4.695841448046741e-06, "logits/chosen": 12.221359252929688, "logits/rejected": 1.933159351348877, "logps/chosen": -331.10601806640625, "logps/rejected": -141.78421020507812, "loss": 0.6015, "rewards/accuracies": 0.5, "rewards/chosen": 0.13682307302951813, "rewards/margins": 0.2776743173599243, "rewards/rejected": -0.14085125923156738, "step": 3002 }, { "epoch": 0.4644113667117727, "grad_norm": 6.970469951629639, "learning_rate": 4.695555046397068e-06, "logits/chosen": -0.08822989463806152, "logits/rejected": 4.778539657592773, "logps/chosen": -205.90802001953125, "logps/rejected": -223.74896240234375, "loss": 0.9828, "rewards/accuracies": 0.25, "rewards/chosen": -0.19687604904174805, "rewards/margins": -0.3774961531162262, "rewards/rejected": 0.18062010407447815, "step": 3003 }, { "epoch": 0.46456601585153684, "grad_norm": 3.9793319702148438, "learning_rate": 4.695268644747394e-06, "logits/chosen": 10.422563552856445, "logits/rejected": 3.5683414936065674, "logps/chosen": -317.6117858886719, "logps/rejected": -221.690673828125, "loss": 0.5104, "rewards/accuracies": 0.75, "rewards/chosen": 0.3746354281902313, "rewards/margins": 0.5272427797317505, "rewards/rejected": -0.15260739624500275, "step": 3004 }, { "epoch": 0.464720664991301, "grad_norm": 7.266453266143799, "learning_rate": 4.6949822430977205e-06, "logits/chosen": 8.791247367858887, "logits/rejected": 9.776388168334961, "logps/chosen": -283.5511474609375, "logps/rejected": -285.2673034667969, "loss": 0.7696, "rewards/accuracies": 0.25, "rewards/chosen": 0.14489275217056274, "rewards/margins": -0.13763217628002167, "rewards/rejected": 0.2825249135494232, "step": 3005 }, { "epoch": 0.46487531413106514, "grad_norm": 5.865922927856445, "learning_rate": 4.694695841448047e-06, "logits/chosen": 9.802221298217773, "logits/rejected": 10.44054889678955, "logps/chosen": -250.34982299804688, "logps/rejected": -335.50982666015625, "loss": 0.6637, "rewards/accuracies": 0.375, "rewards/chosen": -0.018009856343269348, "rewards/margins": 0.12503968179225922, "rewards/rejected": -0.14304950833320618, "step": 3006 }, { "epoch": 0.4650299632708293, "grad_norm": 5.864901542663574, "learning_rate": 4.694409439798374e-06, "logits/chosen": 10.353107452392578, "logits/rejected": 10.873077392578125, "logps/chosen": -276.3033752441406, "logps/rejected": -278.56768798828125, "loss": 0.7559, "rewards/accuracies": 0.375, "rewards/chosen": 0.2895589768886566, "rewards/margins": -0.09286236017942429, "rewards/rejected": 0.3824213743209839, "step": 3007 }, { "epoch": 0.46518461241059345, "grad_norm": 6.005579471588135, "learning_rate": 4.6941230381487005e-06, "logits/chosen": 14.687593460083008, "logits/rejected": 9.673669815063477, "logps/chosen": -321.59295654296875, "logps/rejected": -254.62124633789062, "loss": 0.6107, "rewards/accuracies": 0.5, "rewards/chosen": 0.25148218870162964, "rewards/margins": 0.27360886335372925, "rewards/rejected": -0.022126667201519012, "step": 3008 }, { "epoch": 0.46533926155035765, "grad_norm": 5.281496524810791, "learning_rate": 4.693836636499027e-06, "logits/chosen": 18.5379695892334, "logits/rejected": 7.6601996421813965, "logps/chosen": -305.9548034667969, "logps/rejected": -192.962158203125, "loss": 0.6505, "rewards/accuracies": 0.5, "rewards/chosen": 0.1769767850637436, "rewards/margins": 0.16448505222797394, "rewards/rejected": 0.012491695582866669, "step": 3009 }, { "epoch": 0.4654939106901218, "grad_norm": 5.080384731292725, "learning_rate": 4.693550234849354e-06, "logits/chosen": 9.558121681213379, "logits/rejected": 7.163371562957764, "logps/chosen": -228.67665100097656, "logps/rejected": -184.52972412109375, "loss": 0.6783, "rewards/accuracies": 0.625, "rewards/chosen": 0.05075579881668091, "rewards/margins": 0.0759708434343338, "rewards/rejected": -0.02521505393087864, "step": 3010 }, { "epoch": 0.46564855982988596, "grad_norm": 11.415779113769531, "learning_rate": 4.6932638331996796e-06, "logits/chosen": 13.355599403381348, "logits/rejected": 8.992387771606445, "logps/chosen": -350.96917724609375, "logps/rejected": -305.41729736328125, "loss": 0.7469, "rewards/accuracies": 0.5, "rewards/chosen": 0.23132705688476562, "rewards/margins": 0.04031284525990486, "rewards/rejected": 0.19101420044898987, "step": 3011 }, { "epoch": 0.4658032089696501, "grad_norm": 6.177041530609131, "learning_rate": 4.692977431550006e-06, "logits/chosen": 7.324940204620361, "logits/rejected": 6.873721599578857, "logps/chosen": -333.996337890625, "logps/rejected": -265.8457336425781, "loss": 0.6312, "rewards/accuracies": 0.625, "rewards/chosen": 0.06832566857337952, "rewards/margins": 0.1922735720872879, "rewards/rejected": -0.12394791096448898, "step": 3012 }, { "epoch": 0.46595785810941426, "grad_norm": 5.809593200683594, "learning_rate": 4.692691029900333e-06, "logits/chosen": 8.676753997802734, "logits/rejected": 6.138293266296387, "logps/chosen": -261.811279296875, "logps/rejected": -249.48580932617188, "loss": 0.6832, "rewards/accuracies": 0.5, "rewards/chosen": -0.12032841891050339, "rewards/margins": 0.08426617830991745, "rewards/rejected": -0.20459461212158203, "step": 3013 }, { "epoch": 0.4661125072491784, "grad_norm": 3.7026448249816895, "learning_rate": 4.6924046282506595e-06, "logits/chosen": 8.78897762298584, "logits/rejected": 7.335955619812012, "logps/chosen": -135.98748779296875, "logps/rejected": -135.40573120117188, "loss": 0.6172, "rewards/accuracies": 0.625, "rewards/chosen": -0.013244912028312683, "rewards/margins": 0.21129214763641357, "rewards/rejected": -0.22453705966472626, "step": 3014 }, { "epoch": 0.46626715638894256, "grad_norm": 5.66171932220459, "learning_rate": 4.692118226600985e-06, "logits/chosen": 11.891119003295898, "logits/rejected": 8.937435150146484, "logps/chosen": -323.4237060546875, "logps/rejected": -344.78924560546875, "loss": 0.6464, "rewards/accuracies": 0.625, "rewards/chosen": 0.05019301176071167, "rewards/margins": 0.135787695646286, "rewards/rejected": -0.08559465408325195, "step": 3015 }, { "epoch": 0.46642180552870677, "grad_norm": 5.276484489440918, "learning_rate": 4.691831824951312e-06, "logits/chosen": 14.987350463867188, "logits/rejected": -1.9259618520736694, "logps/chosen": -478.56903076171875, "logps/rejected": -213.12319946289062, "loss": 0.413, "rewards/accuracies": 0.75, "rewards/chosen": 0.5866913199424744, "rewards/margins": 0.9898380637168884, "rewards/rejected": -0.40314677357673645, "step": 3016 }, { "epoch": 0.4665764546684709, "grad_norm": 6.515838623046875, "learning_rate": 4.691545423301639e-06, "logits/chosen": 11.047337532043457, "logits/rejected": 5.699032306671143, "logps/chosen": -327.6561279296875, "logps/rejected": -249.29318237304688, "loss": 0.5543, "rewards/accuracies": 0.875, "rewards/chosen": 0.47137489914894104, "rewards/margins": 0.3869286775588989, "rewards/rejected": 0.08444623649120331, "step": 3017 }, { "epoch": 0.46673110380823507, "grad_norm": 7.234194278717041, "learning_rate": 4.691259021651965e-06, "logits/chosen": 13.772784233093262, "logits/rejected": 10.25424575805664, "logps/chosen": -281.24603271484375, "logps/rejected": -284.25775146484375, "loss": 0.7704, "rewards/accuracies": 0.25, "rewards/chosen": 0.11060438305139542, "rewards/margins": -0.09794869273900986, "rewards/rejected": 0.20855306088924408, "step": 3018 }, { "epoch": 0.4668857529479992, "grad_norm": 4.7292656898498535, "learning_rate": 4.690972620002291e-06, "logits/chosen": 9.881389617919922, "logits/rejected": 6.236710548400879, "logps/chosen": -348.5396728515625, "logps/rejected": -274.4770202636719, "loss": 0.5974, "rewards/accuracies": 0.75, "rewards/chosen": 0.4725329577922821, "rewards/margins": 0.28675374388694763, "rewards/rejected": 0.18577921390533447, "step": 3019 }, { "epoch": 0.4670404020877634, "grad_norm": 7.421738147735596, "learning_rate": 4.690686218352618e-06, "logits/chosen": 8.122907638549805, "logits/rejected": 4.668972969055176, "logps/chosen": -331.0155029296875, "logps/rejected": -245.2442626953125, "loss": 0.793, "rewards/accuracies": 0.375, "rewards/chosen": -0.3071400821208954, "rewards/margins": -0.0647820383310318, "rewards/rejected": -0.24235805869102478, "step": 3020 }, { "epoch": 0.4671950512275275, "grad_norm": 7.01359748840332, "learning_rate": 4.690399816702944e-06, "logits/chosen": 12.085275650024414, "logits/rejected": 8.904709815979004, "logps/chosen": -352.9685974121094, "logps/rejected": -317.2801208496094, "loss": 0.6991, "rewards/accuracies": 0.375, "rewards/chosen": 0.38205835223197937, "rewards/margins": 0.054469138383865356, "rewards/rejected": 0.327589213848114, "step": 3021 }, { "epoch": 0.46734970036729173, "grad_norm": 4.579090595245361, "learning_rate": 4.690113415053271e-06, "logits/chosen": 12.5736722946167, "logits/rejected": 5.3734450340271, "logps/chosen": -318.8518371582031, "logps/rejected": -185.54391479492188, "loss": 0.603, "rewards/accuracies": 0.625, "rewards/chosen": 0.2514013350009918, "rewards/margins": 0.23206956684589386, "rewards/rejected": 0.019331790506839752, "step": 3022 }, { "epoch": 0.4675043495070559, "grad_norm": 7.670943260192871, "learning_rate": 4.689827013403598e-06, "logits/chosen": 14.403982162475586, "logits/rejected": 13.06241512298584, "logps/chosen": -388.9197998046875, "logps/rejected": -356.6515808105469, "loss": 0.8136, "rewards/accuracies": 0.625, "rewards/chosen": 0.25854986906051636, "rewards/margins": -0.17356276512145996, "rewards/rejected": 0.4321126341819763, "step": 3023 }, { "epoch": 0.46765899864682003, "grad_norm": 11.595117568969727, "learning_rate": 4.6895406117539235e-06, "logits/chosen": 4.267294883728027, "logits/rejected": 4.490293979644775, "logps/chosen": -425.6158447265625, "logps/rejected": -370.4498291015625, "loss": 0.7636, "rewards/accuracies": 0.5, "rewards/chosen": 0.424464613199234, "rewards/margins": -0.048869915306568146, "rewards/rejected": 0.47333452105522156, "step": 3024 }, { "epoch": 0.4678136477865842, "grad_norm": 4.990597248077393, "learning_rate": 4.68925421010425e-06, "logits/chosen": 10.850885391235352, "logits/rejected": 6.21992826461792, "logps/chosen": -383.0733642578125, "logps/rejected": -230.14834594726562, "loss": 0.639, "rewards/accuracies": 0.5, "rewards/chosen": 0.24378754198551178, "rewards/margins": 0.22680333256721497, "rewards/rejected": 0.01698426529765129, "step": 3025 }, { "epoch": 0.46796829692634834, "grad_norm": 6.668668270111084, "learning_rate": 4.688967808454577e-06, "logits/chosen": 8.512245178222656, "logits/rejected": 6.942404747009277, "logps/chosen": -318.826904296875, "logps/rejected": -267.2339782714844, "loss": 0.8185, "rewards/accuracies": 0.5, "rewards/chosen": 0.19500266015529633, "rewards/margins": -0.12995214760303497, "rewards/rejected": 0.3249548077583313, "step": 3026 }, { "epoch": 0.4681229460661125, "grad_norm": 4.147992134094238, "learning_rate": 4.6886814068049034e-06, "logits/chosen": 11.03657054901123, "logits/rejected": 7.150003910064697, "logps/chosen": -236.04449462890625, "logps/rejected": -226.43711853027344, "loss": 0.5467, "rewards/accuracies": 0.625, "rewards/chosen": 0.07402047514915466, "rewards/margins": 0.44944873452186584, "rewards/rejected": -0.3754281997680664, "step": 3027 }, { "epoch": 0.46827759520587664, "grad_norm": 7.048837184906006, "learning_rate": 4.68839500515523e-06, "logits/chosen": 10.253664016723633, "logits/rejected": 11.200864791870117, "logps/chosen": -248.10501098632812, "logps/rejected": -248.26441955566406, "loss": 0.8741, "rewards/accuracies": 0.5, "rewards/chosen": -0.024630114436149597, "rewards/margins": -0.2512737810611725, "rewards/rejected": 0.2266436666250229, "step": 3028 }, { "epoch": 0.46843224434564085, "grad_norm": 5.828037738800049, "learning_rate": 4.688108603505557e-06, "logits/chosen": 13.932458877563477, "logits/rejected": 16.701068878173828, "logps/chosen": -376.8634338378906, "logps/rejected": -373.6258239746094, "loss": 0.7036, "rewards/accuracies": 0.5, "rewards/chosen": 0.4477007985115051, "rewards/margins": -0.00186949223279953, "rewards/rejected": 0.44957026839256287, "step": 3029 }, { "epoch": 0.468586893485405, "grad_norm": 4.968430519104004, "learning_rate": 4.687822201855883e-06, "logits/chosen": 6.363219738006592, "logits/rejected": 6.011447906494141, "logps/chosen": -275.84844970703125, "logps/rejected": -273.5904541015625, "loss": 0.6512, "rewards/accuracies": 0.5, "rewards/chosen": 0.12790071964263916, "rewards/margins": 0.09668491035699844, "rewards/rejected": 0.031215816736221313, "step": 3030 }, { "epoch": 0.46874154262516915, "grad_norm": 4.819473743438721, "learning_rate": 4.687535800206209e-06, "logits/chosen": 12.618279457092285, "logits/rejected": 6.812953472137451, "logps/chosen": -247.2542724609375, "logps/rejected": -192.78634643554688, "loss": 0.6133, "rewards/accuracies": 0.625, "rewards/chosen": 0.14485806226730347, "rewards/margins": 0.21640536189079285, "rewards/rejected": -0.07154726982116699, "step": 3031 }, { "epoch": 0.4688961917649333, "grad_norm": 5.328410625457764, "learning_rate": 4.687249398556536e-06, "logits/chosen": 9.071479797363281, "logits/rejected": 9.877046585083008, "logps/chosen": -262.59552001953125, "logps/rejected": -262.7901916503906, "loss": 0.745, "rewards/accuracies": 0.5, "rewards/chosen": 0.10335926711559296, "rewards/margins": -0.009878858923912048, "rewards/rejected": 0.1132381483912468, "step": 3032 }, { "epoch": 0.46905084090469745, "grad_norm": 4.913109302520752, "learning_rate": 4.6869629969068625e-06, "logits/chosen": 11.790393829345703, "logits/rejected": 11.277243614196777, "logps/chosen": -250.356201171875, "logps/rejected": -268.9931945800781, "loss": 0.4965, "rewards/accuracies": 0.75, "rewards/chosen": 0.13035041093826294, "rewards/margins": 0.6155429482460022, "rewards/rejected": -0.48519250750541687, "step": 3033 }, { "epoch": 0.4692054900444616, "grad_norm": 5.327846527099609, "learning_rate": 4.686676595257189e-06, "logits/chosen": 7.431042671203613, "logits/rejected": 5.577322959899902, "logps/chosen": -286.6590881347656, "logps/rejected": -267.8074645996094, "loss": 0.6545, "rewards/accuracies": 0.375, "rewards/chosen": 0.29984867572784424, "rewards/margins": 0.13244876265525818, "rewards/rejected": 0.16739995777606964, "step": 3034 }, { "epoch": 0.4693601391842258, "grad_norm": 7.3163018226623535, "learning_rate": 4.686390193607516e-06, "logits/chosen": 10.322932243347168, "logits/rejected": 13.13760757446289, "logps/chosen": -274.2729797363281, "logps/rejected": -312.0372009277344, "loss": 0.7162, "rewards/accuracies": 0.375, "rewards/chosen": 0.34363099932670593, "rewards/margins": 0.14714661240577698, "rewards/rejected": 0.19648438692092896, "step": 3035 }, { "epoch": 0.46951478832398996, "grad_norm": 4.8995361328125, "learning_rate": 4.6861037919578424e-06, "logits/chosen": 10.715742111206055, "logits/rejected": 12.697279930114746, "logps/chosen": -138.29400634765625, "logps/rejected": -190.53488159179688, "loss": 0.7841, "rewards/accuracies": 0.375, "rewards/chosen": -0.18836352229118347, "rewards/margins": -0.018577218055725098, "rewards/rejected": -0.1697862595319748, "step": 3036 }, { "epoch": 0.4696694374637541, "grad_norm": 5.382208824157715, "learning_rate": 4.685817390308168e-06, "logits/chosen": 11.535953521728516, "logits/rejected": 9.934057235717773, "logps/chosen": -234.19854736328125, "logps/rejected": -294.6527099609375, "loss": 0.715, "rewards/accuracies": 0.5, "rewards/chosen": 0.07564811408519745, "rewards/margins": 0.17139311134815216, "rewards/rejected": -0.09574498236179352, "step": 3037 }, { "epoch": 0.46982408660351827, "grad_norm": 4.8675618171691895, "learning_rate": 4.685530988658495e-06, "logits/chosen": 10.951421737670898, "logits/rejected": 8.434406280517578, "logps/chosen": -255.77389526367188, "logps/rejected": -174.0486297607422, "loss": 0.6993, "rewards/accuracies": 0.75, "rewards/chosen": 0.3492065370082855, "rewards/margins": 0.11506514251232147, "rewards/rejected": 0.23414140939712524, "step": 3038 }, { "epoch": 0.4699787357432824, "grad_norm": 5.73569917678833, "learning_rate": 4.6852445870088215e-06, "logits/chosen": 13.290253639221191, "logits/rejected": 7.9289984703063965, "logps/chosen": -452.88470458984375, "logps/rejected": -278.16680908203125, "loss": 0.4814, "rewards/accuracies": 0.875, "rewards/chosen": 0.6856586933135986, "rewards/margins": 0.6601459980010986, "rewards/rejected": 0.025512687861919403, "step": 3039 }, { "epoch": 0.47013338488304657, "grad_norm": 8.164809226989746, "learning_rate": 4.684958185359148e-06, "logits/chosen": 9.7992582321167, "logits/rejected": 5.111950874328613, "logps/chosen": -339.24969482421875, "logps/rejected": -269.45684814453125, "loss": 0.6226, "rewards/accuracies": 0.625, "rewards/chosen": 0.4777964651584625, "rewards/margins": 0.29934245347976685, "rewards/rejected": 0.17845401167869568, "step": 3040 }, { "epoch": 0.4702880340228108, "grad_norm": 9.046942710876465, "learning_rate": 4.684671783709475e-06, "logits/chosen": 4.686356067657471, "logits/rejected": 6.7059760093688965, "logps/chosen": -340.31695556640625, "logps/rejected": -312.0832214355469, "loss": 0.7959, "rewards/accuracies": 0.625, "rewards/chosen": 0.43454858660697937, "rewards/margins": -0.13616794347763062, "rewards/rejected": 0.5707165002822876, "step": 3041 }, { "epoch": 0.4704426831625749, "grad_norm": 6.626064777374268, "learning_rate": 4.6843853820598015e-06, "logits/chosen": 11.066731452941895, "logits/rejected": 6.454031944274902, "logps/chosen": -315.42852783203125, "logps/rejected": -274.3255615234375, "loss": 0.5617, "rewards/accuracies": 0.625, "rewards/chosen": 0.6192066669464111, "rewards/margins": 0.5136699080467224, "rewards/rejected": 0.10553675144910812, "step": 3042 }, { "epoch": 0.4705973323023391, "grad_norm": 4.641066074371338, "learning_rate": 4.684098980410128e-06, "logits/chosen": 11.110712051391602, "logits/rejected": 7.017316818237305, "logps/chosen": -262.4399719238281, "logps/rejected": -189.96127319335938, "loss": 0.6281, "rewards/accuracies": 0.5, "rewards/chosen": 0.22620946168899536, "rewards/margins": 0.19253702461719513, "rewards/rejected": 0.03367242217063904, "step": 3043 }, { "epoch": 0.47075198144210323, "grad_norm": 4.517951965332031, "learning_rate": 4.683812578760454e-06, "logits/chosen": 9.379972457885742, "logits/rejected": 7.071934700012207, "logps/chosen": -318.04022216796875, "logps/rejected": -336.4866943359375, "loss": 0.5734, "rewards/accuracies": 0.625, "rewards/chosen": 0.5838018655776978, "rewards/margins": 0.3498693108558655, "rewards/rejected": 0.23393258452415466, "step": 3044 }, { "epoch": 0.4709066305818674, "grad_norm": 4.883569240570068, "learning_rate": 4.683526177110781e-06, "logits/chosen": 13.97895622253418, "logits/rejected": 7.639683723449707, "logps/chosen": -254.38832092285156, "logps/rejected": -179.5138702392578, "loss": 0.6596, "rewards/accuracies": 0.5, "rewards/chosen": 0.3233962953090668, "rewards/margins": 0.10601967573165894, "rewards/rejected": 0.21737661957740784, "step": 3045 }, { "epoch": 0.47106127972163153, "grad_norm": 4.058988094329834, "learning_rate": 4.683239775461107e-06, "logits/chosen": 6.2645063400268555, "logits/rejected": 6.5252790451049805, "logps/chosen": -204.92681884765625, "logps/rejected": -201.12493896484375, "loss": 0.6349, "rewards/accuracies": 0.5, "rewards/chosen": 0.255244642496109, "rewards/margins": 0.30357885360717773, "rewards/rejected": -0.04833421856164932, "step": 3046 }, { "epoch": 0.4712159288613957, "grad_norm": 6.851743221282959, "learning_rate": 4.682953373811434e-06, "logits/chosen": 6.320741653442383, "logits/rejected": 7.543875694274902, "logps/chosen": -245.53750610351562, "logps/rejected": -233.72482299804688, "loss": 0.716, "rewards/accuracies": 0.375, "rewards/chosen": -0.14921225607395172, "rewards/margins": 0.006156541407108307, "rewards/rejected": -0.15536880493164062, "step": 3047 }, { "epoch": 0.4713705780011599, "grad_norm": 5.082075119018555, "learning_rate": 4.6826669721617605e-06, "logits/chosen": 11.506973266601562, "logits/rejected": 4.627630710601807, "logps/chosen": -390.1814270019531, "logps/rejected": -311.8535461425781, "loss": 0.5567, "rewards/accuracies": 0.625, "rewards/chosen": 0.36454325914382935, "rewards/margins": 0.5128042697906494, "rewards/rejected": -0.14826098084449768, "step": 3048 }, { "epoch": 0.47152522714092404, "grad_norm": 11.612847328186035, "learning_rate": 4.682380570512086e-06, "logits/chosen": 7.6846537590026855, "logits/rejected": 5.296227931976318, "logps/chosen": -224.20394897460938, "logps/rejected": -196.99183654785156, "loss": 0.787, "rewards/accuracies": 0.375, "rewards/chosen": 0.02309267222881317, "rewards/margins": -0.1418527215719223, "rewards/rejected": 0.16494536399841309, "step": 3049 }, { "epoch": 0.4716798762806882, "grad_norm": 6.362085819244385, "learning_rate": 4.682094168862413e-06, "logits/chosen": 4.7474164962768555, "logits/rejected": 9.70876693725586, "logps/chosen": -262.0009765625, "logps/rejected": -298.1358642578125, "loss": 0.8143, "rewards/accuracies": 0.5, "rewards/chosen": 0.12552708387374878, "rewards/margins": 0.05059318244457245, "rewards/rejected": 0.07493391633033752, "step": 3050 }, { "epoch": 0.47183452542045234, "grad_norm": 7.122706890106201, "learning_rate": 4.68180776721274e-06, "logits/chosen": 3.781843900680542, "logits/rejected": 9.156026840209961, "logps/chosen": -198.87661743164062, "logps/rejected": -236.2912139892578, "loss": 0.7254, "rewards/accuracies": 0.375, "rewards/chosen": 0.02089890092611313, "rewards/margins": 0.03717374801635742, "rewards/rejected": -0.01627485454082489, "step": 3051 }, { "epoch": 0.4719891745602165, "grad_norm": 5.172514915466309, "learning_rate": 4.681521365563066e-06, "logits/chosen": 10.050740242004395, "logits/rejected": 4.665450096130371, "logps/chosen": -160.89524841308594, "logps/rejected": -143.93515014648438, "loss": 0.6421, "rewards/accuracies": 0.5, "rewards/chosen": 0.09480108320713043, "rewards/margins": 0.1491040587425232, "rewards/rejected": -0.05430297553539276, "step": 3052 }, { "epoch": 0.47214382369998065, "grad_norm": 4.853285789489746, "learning_rate": 4.681234963913392e-06, "logits/chosen": 10.171934127807617, "logits/rejected": 8.72216510772705, "logps/chosen": -221.22750854492188, "logps/rejected": -248.62179565429688, "loss": 0.6148, "rewards/accuracies": 0.75, "rewards/chosen": 0.248515784740448, "rewards/margins": 0.20848342776298523, "rewards/rejected": 0.04003235325217247, "step": 3053 }, { "epoch": 0.47229847283974485, "grad_norm": 5.922132968902588, "learning_rate": 4.680948562263719e-06, "logits/chosen": 10.94228458404541, "logits/rejected": 7.261427879333496, "logps/chosen": -189.93734741210938, "logps/rejected": -164.9632568359375, "loss": 0.7664, "rewards/accuracies": 0.5, "rewards/chosen": -0.11140406876802444, "rewards/margins": -0.004519656300544739, "rewards/rejected": -0.1068844348192215, "step": 3054 }, { "epoch": 0.472453121979509, "grad_norm": 8.111992835998535, "learning_rate": 4.680662160614045e-06, "logits/chosen": 6.163878440856934, "logits/rejected": 8.51284122467041, "logps/chosen": -324.60675048828125, "logps/rejected": -290.0404052734375, "loss": 0.7231, "rewards/accuracies": 0.375, "rewards/chosen": 0.3316325545310974, "rewards/margins": 0.031124353408813477, "rewards/rejected": 0.3005082309246063, "step": 3055 }, { "epoch": 0.47260777111927316, "grad_norm": 5.61262845993042, "learning_rate": 4.680375758964372e-06, "logits/chosen": 2.4881088733673096, "logits/rejected": 6.924922466278076, "logps/chosen": -178.37986755371094, "logps/rejected": -235.50222778320312, "loss": 0.7066, "rewards/accuracies": 0.5, "rewards/chosen": 0.17450298368930817, "rewards/margins": -0.002118505537509918, "rewards/rejected": 0.17662151157855988, "step": 3056 }, { "epoch": 0.4727624202590373, "grad_norm": 8.181360244750977, "learning_rate": 4.680089357314698e-06, "logits/chosen": 8.793135643005371, "logits/rejected": 5.763635635375977, "logps/chosen": -348.526123046875, "logps/rejected": -321.23480224609375, "loss": 0.8251, "rewards/accuracies": 0.375, "rewards/chosen": 0.03598213568329811, "rewards/margins": -0.15068131685256958, "rewards/rejected": 0.1866634488105774, "step": 3057 }, { "epoch": 0.47291706939880146, "grad_norm": 5.557372570037842, "learning_rate": 4.6798029556650245e-06, "logits/chosen": 9.104660034179688, "logits/rejected": 8.948171615600586, "logps/chosen": -189.0396728515625, "logps/rejected": -201.1279296875, "loss": 0.753, "rewards/accuracies": 0.375, "rewards/chosen": 0.06726804375648499, "rewards/margins": -0.09653091430664062, "rewards/rejected": 0.1637989580631256, "step": 3058 }, { "epoch": 0.4730717185385656, "grad_norm": 15.182119369506836, "learning_rate": 4.679516554015351e-06, "logits/chosen": 17.72298812866211, "logits/rejected": 14.906938552856445, "logps/chosen": -252.43881225585938, "logps/rejected": -295.6755065917969, "loss": 0.7163, "rewards/accuracies": 0.5, "rewards/chosen": 0.43697845935821533, "rewards/margins": 0.05405254662036896, "rewards/rejected": 0.3829258978366852, "step": 3059 }, { "epoch": 0.47322636767832976, "grad_norm": 50.033668518066406, "learning_rate": 4.679230152365678e-06, "logits/chosen": 6.469582557678223, "logits/rejected": 10.854336738586426, "logps/chosen": -340.3044738769531, "logps/rejected": -407.75933837890625, "loss": 0.8822, "rewards/accuracies": 0.375, "rewards/chosen": 0.16917115449905396, "rewards/margins": -0.2311612367630005, "rewards/rejected": 0.40033239126205444, "step": 3060 }, { "epoch": 0.47338101681809397, "grad_norm": 6.531163215637207, "learning_rate": 4.6789437507160045e-06, "logits/chosen": 5.292582035064697, "logits/rejected": 5.018515110015869, "logps/chosen": -207.46969604492188, "logps/rejected": -217.6377716064453, "loss": 0.7886, "rewards/accuracies": 0.375, "rewards/chosen": -0.07034319639205933, "rewards/margins": -0.15117892622947693, "rewards/rejected": 0.0808357447385788, "step": 3061 }, { "epoch": 0.4735356659578581, "grad_norm": 4.9883246421813965, "learning_rate": 4.678657349066331e-06, "logits/chosen": 7.317048072814941, "logits/rejected": 8.744040489196777, "logps/chosen": -271.0904235839844, "logps/rejected": -397.14715576171875, "loss": 0.5064, "rewards/accuracies": 0.875, "rewards/chosen": 0.25454989075660706, "rewards/margins": 0.527854859828949, "rewards/rejected": -0.27330493927001953, "step": 3062 }, { "epoch": 0.47369031509762227, "grad_norm": 6.8340678215026855, "learning_rate": 4.678370947416658e-06, "logits/chosen": 5.69686222076416, "logits/rejected": 10.370161056518555, "logps/chosen": -319.1346130371094, "logps/rejected": -314.53448486328125, "loss": 0.7169, "rewards/accuracies": 0.375, "rewards/chosen": 0.3967639207839966, "rewards/margins": 0.13788823783397675, "rewards/rejected": 0.25887566804885864, "step": 3063 }, { "epoch": 0.4738449642373864, "grad_norm": 5.961577892303467, "learning_rate": 4.6780845457669836e-06, "logits/chosen": 11.684815406799316, "logits/rejected": 9.028783798217773, "logps/chosen": -246.92117309570312, "logps/rejected": -227.92347717285156, "loss": 0.7111, "rewards/accuracies": 0.375, "rewards/chosen": 0.41813087463378906, "rewards/margins": 0.13988414406776428, "rewards/rejected": 0.27824676036834717, "step": 3064 }, { "epoch": 0.4739996133771506, "grad_norm": 5.843420505523682, "learning_rate": 4.67779814411731e-06, "logits/chosen": 6.879061222076416, "logits/rejected": 7.678164005279541, "logps/chosen": -298.0958557128906, "logps/rejected": -362.84759521484375, "loss": 0.7892, "rewards/accuracies": 0.375, "rewards/chosen": 0.1783698946237564, "rewards/margins": 0.08849802613258362, "rewards/rejected": 0.08987187594175339, "step": 3065 }, { "epoch": 0.4741542625169147, "grad_norm": 7.636201858520508, "learning_rate": 4.677511742467637e-06, "logits/chosen": 8.735830307006836, "logits/rejected": 9.869014739990234, "logps/chosen": -253.0509490966797, "logps/rejected": -264.879638671875, "loss": 0.8386, "rewards/accuracies": 0.375, "rewards/chosen": 0.22172188758850098, "rewards/margins": -0.18404985964298248, "rewards/rejected": 0.40577173233032227, "step": 3066 }, { "epoch": 0.47430891165667893, "grad_norm": 9.161300659179688, "learning_rate": 4.6772253408179635e-06, "logits/chosen": 8.81488037109375, "logits/rejected": 6.985283851623535, "logps/chosen": -306.99237060546875, "logps/rejected": -262.3344421386719, "loss": 1.0115, "rewards/accuracies": 0.375, "rewards/chosen": -0.07892848551273346, "rewards/margins": -0.41635429859161377, "rewards/rejected": 0.3374258279800415, "step": 3067 }, { "epoch": 0.4744635607964431, "grad_norm": 4.1316304206848145, "learning_rate": 4.67693893916829e-06, "logits/chosen": 10.428282737731934, "logits/rejected": 7.2699689865112305, "logps/chosen": -244.53489685058594, "logps/rejected": -213.11257934570312, "loss": 0.5632, "rewards/accuracies": 0.75, "rewards/chosen": 0.08608321845531464, "rewards/margins": 0.3916701674461365, "rewards/rejected": -0.30558690428733826, "step": 3068 }, { "epoch": 0.47461820993620724, "grad_norm": 5.726138591766357, "learning_rate": 4.676652537518617e-06, "logits/chosen": 10.898289680480957, "logits/rejected": 10.581116676330566, "logps/chosen": -273.69757080078125, "logps/rejected": -260.623046875, "loss": 0.7086, "rewards/accuracies": 0.5, "rewards/chosen": 0.010938942432403564, "rewards/margins": 0.28959521651268005, "rewards/rejected": -0.2786563038825989, "step": 3069 }, { "epoch": 0.4747728590759714, "grad_norm": 5.0053558349609375, "learning_rate": 4.676366135868943e-06, "logits/chosen": 8.677099227905273, "logits/rejected": 4.523341178894043, "logps/chosen": -354.2060241699219, "logps/rejected": -284.7891845703125, "loss": 0.5478, "rewards/accuracies": 0.75, "rewards/chosen": 0.3860313892364502, "rewards/margins": 0.3774925172328949, "rewards/rejected": 0.008538894355297089, "step": 3070 }, { "epoch": 0.47492750821573554, "grad_norm": 7.444076061248779, "learning_rate": 4.676079734219269e-06, "logits/chosen": 3.152534008026123, "logits/rejected": 8.387313842773438, "logps/chosen": -183.9143524169922, "logps/rejected": -293.35693359375, "loss": 0.6525, "rewards/accuracies": 0.625, "rewards/chosen": 0.054882604628801346, "rewards/margins": 0.16610418260097504, "rewards/rejected": -0.11122157424688339, "step": 3071 }, { "epoch": 0.4750821573554997, "grad_norm": 4.019036769866943, "learning_rate": 4.675793332569596e-06, "logits/chosen": 11.028069496154785, "logits/rejected": 6.256389617919922, "logps/chosen": -176.41415405273438, "logps/rejected": -96.75796508789062, "loss": 0.6107, "rewards/accuracies": 0.75, "rewards/chosen": -0.09741778671741486, "rewards/margins": 0.2165750116109848, "rewards/rejected": -0.31399279832839966, "step": 3072 }, { "epoch": 0.4752368064952639, "grad_norm": 6.574222564697266, "learning_rate": 4.6755069309199226e-06, "logits/chosen": 4.31530237197876, "logits/rejected": 4.590390682220459, "logps/chosen": -215.89593505859375, "logps/rejected": -269.44268798828125, "loss": 0.7181, "rewards/accuracies": 0.625, "rewards/chosen": 0.06494760513305664, "rewards/margins": 0.033523403108119965, "rewards/rejected": 0.031424202024936676, "step": 3073 }, { "epoch": 0.47539145563502805, "grad_norm": 5.839081764221191, "learning_rate": 4.675220529270249e-06, "logits/chosen": 8.054490089416504, "logits/rejected": 9.482805252075195, "logps/chosen": -225.25442504882812, "logps/rejected": -272.8577880859375, "loss": 0.7333, "rewards/accuracies": 0.5, "rewards/chosen": 0.1386970579624176, "rewards/margins": -0.01759222522377968, "rewards/rejected": 0.15628927946090698, "step": 3074 }, { "epoch": 0.4755461047747922, "grad_norm": 4.601315021514893, "learning_rate": 4.674934127620576e-06, "logits/chosen": 13.395176887512207, "logits/rejected": 7.851316452026367, "logps/chosen": -209.703857421875, "logps/rejected": -181.48806762695312, "loss": 0.6291, "rewards/accuracies": 0.875, "rewards/chosen": 0.2169143110513687, "rewards/margins": 0.2594432234764099, "rewards/rejected": -0.042528897523880005, "step": 3075 }, { "epoch": 0.47570075391455635, "grad_norm": 7.562048435211182, "learning_rate": 4.6746477259709025e-06, "logits/chosen": 4.879905700683594, "logits/rejected": 9.656997680664062, "logps/chosen": -223.97235107421875, "logps/rejected": -282.97393798828125, "loss": 0.8576, "rewards/accuracies": 0.25, "rewards/chosen": 0.23566828668117523, "rewards/margins": -0.16209626197814941, "rewards/rejected": 0.39776450395584106, "step": 3076 }, { "epoch": 0.4758554030543205, "grad_norm": 7.674280166625977, "learning_rate": 4.674361324321228e-06, "logits/chosen": 8.150238990783691, "logits/rejected": 2.8680195808410645, "logps/chosen": -320.40460205078125, "logps/rejected": -282.52117919921875, "loss": 0.7678, "rewards/accuracies": 0.5, "rewards/chosen": 0.1763329654932022, "rewards/margins": -0.020856082439422607, "rewards/rejected": 0.197189062833786, "step": 3077 }, { "epoch": 0.47601005219408465, "grad_norm": 6.089390277862549, "learning_rate": 4.674074922671555e-06, "logits/chosen": 11.534485816955566, "logits/rejected": 7.353596210479736, "logps/chosen": -311.7340393066406, "logps/rejected": -248.7273406982422, "loss": 0.5112, "rewards/accuracies": 1.0, "rewards/chosen": 0.47398436069488525, "rewards/margins": 0.4283140003681183, "rewards/rejected": 0.045670315623283386, "step": 3078 }, { "epoch": 0.4761647013338488, "grad_norm": 5.227465629577637, "learning_rate": 4.673788521021882e-06, "logits/chosen": 7.246560096740723, "logits/rejected": 5.192907333374023, "logps/chosen": -256.4664611816406, "logps/rejected": -264.9912109375, "loss": 0.6469, "rewards/accuracies": 0.75, "rewards/chosen": 0.21067595481872559, "rewards/margins": 0.12050056457519531, "rewards/rejected": 0.09017538279294968, "step": 3079 }, { "epoch": 0.476319350473613, "grad_norm": 4.536075592041016, "learning_rate": 4.673502119372208e-06, "logits/chosen": 9.53080940246582, "logits/rejected": 2.5608153343200684, "logps/chosen": -220.08416748046875, "logps/rejected": -171.38954162597656, "loss": 0.5955, "rewards/accuracies": 0.625, "rewards/chosen": 0.4440489113330841, "rewards/margins": 0.24446091055870056, "rewards/rejected": 0.19958800077438354, "step": 3080 }, { "epoch": 0.47647399961337716, "grad_norm": 4.754151821136475, "learning_rate": 4.673215717722535e-06, "logits/chosen": 10.98740005493164, "logits/rejected": 6.7599592208862305, "logps/chosen": -270.01141357421875, "logps/rejected": -188.2509765625, "loss": 0.6372, "rewards/accuracies": 0.625, "rewards/chosen": 0.3542192578315735, "rewards/margins": 0.216253861784935, "rewards/rejected": 0.1379653811454773, "step": 3081 }, { "epoch": 0.4766286487531413, "grad_norm": 6.546957015991211, "learning_rate": 4.672929316072862e-06, "logits/chosen": 12.19477367401123, "logits/rejected": 6.244563102722168, "logps/chosen": -320.218017578125, "logps/rejected": -249.3705291748047, "loss": 0.5361, "rewards/accuracies": 0.625, "rewards/chosen": 0.5429012775421143, "rewards/margins": 0.4676016867160797, "rewards/rejected": 0.07529956102371216, "step": 3082 }, { "epoch": 0.47678329789290547, "grad_norm": 4.286785125732422, "learning_rate": 4.672642914423187e-06, "logits/chosen": 6.530529975891113, "logits/rejected": 7.054716110229492, "logps/chosen": -214.75149536132812, "logps/rejected": -237.53419494628906, "loss": 0.6876, "rewards/accuracies": 0.375, "rewards/chosen": 0.311136394739151, "rewards/margins": 0.0889199748635292, "rewards/rejected": 0.2222164124250412, "step": 3083 }, { "epoch": 0.4769379470326696, "grad_norm": 5.053194046020508, "learning_rate": 4.672356512773514e-06, "logits/chosen": 8.8037109375, "logits/rejected": 9.088529586791992, "logps/chosen": -208.4940948486328, "logps/rejected": -206.71035766601562, "loss": 0.6826, "rewards/accuracies": 0.375, "rewards/chosen": 0.23251323401927948, "rewards/margins": 0.0745190978050232, "rewards/rejected": 0.1579941362142563, "step": 3084 }, { "epoch": 0.47709259617243377, "grad_norm": 7.402444839477539, "learning_rate": 4.672070111123841e-06, "logits/chosen": 10.791864395141602, "logits/rejected": 5.92660665512085, "logps/chosen": -299.4687805175781, "logps/rejected": -192.10618591308594, "loss": 0.6938, "rewards/accuracies": 0.375, "rewards/chosen": 0.13676252961158752, "rewards/margins": 0.23840409517288208, "rewards/rejected": -0.10164155811071396, "step": 3085 }, { "epoch": 0.477247245312198, "grad_norm": 5.109952926635742, "learning_rate": 4.671783709474167e-06, "logits/chosen": 10.46800708770752, "logits/rejected": 6.0344038009643555, "logps/chosen": -211.48513793945312, "logps/rejected": -251.4752655029297, "loss": 0.5387, "rewards/accuracies": 0.75, "rewards/chosen": 0.3065238296985626, "rewards/margins": 0.5590267181396484, "rewards/rejected": -0.2525028884410858, "step": 3086 }, { "epoch": 0.4774018944519621, "grad_norm": 4.210285663604736, "learning_rate": 4.671497307824493e-06, "logits/chosen": 11.207535743713379, "logits/rejected": 10.102213859558105, "logps/chosen": -234.4876708984375, "logps/rejected": -266.76043701171875, "loss": 0.6449, "rewards/accuracies": 0.75, "rewards/chosen": 0.3969913721084595, "rewards/margins": 0.1619933545589447, "rewards/rejected": 0.23499798774719238, "step": 3087 }, { "epoch": 0.4775565435917263, "grad_norm": 5.691965579986572, "learning_rate": 4.67121090617482e-06, "logits/chosen": 5.82404899597168, "logits/rejected": 7.568882465362549, "logps/chosen": -306.07879638671875, "logps/rejected": -267.31787109375, "loss": 0.6646, "rewards/accuracies": 0.625, "rewards/chosen": 0.2845696210861206, "rewards/margins": 0.133896604180336, "rewards/rejected": 0.1506730020046234, "step": 3088 }, { "epoch": 0.47771119273149043, "grad_norm": 37.72473907470703, "learning_rate": 4.6709245045251464e-06, "logits/chosen": 12.321680068969727, "logits/rejected": -0.4262933135032654, "logps/chosen": -320.69903564453125, "logps/rejected": -139.259033203125, "loss": 0.4984, "rewards/accuracies": 0.875, "rewards/chosen": 0.2670469880104065, "rewards/margins": 0.48887258768081665, "rewards/rejected": -0.22182559967041016, "step": 3089 }, { "epoch": 0.4778658418712546, "grad_norm": 6.362565517425537, "learning_rate": 4.670638102875473e-06, "logits/chosen": 10.374752044677734, "logits/rejected": 10.374979019165039, "logps/chosen": -248.73681640625, "logps/rejected": -266.8424987792969, "loss": 0.5548, "rewards/accuracies": 0.875, "rewards/chosen": 0.10653285682201385, "rewards/margins": 0.3668130934238434, "rewards/rejected": -0.26028022170066833, "step": 3090 }, { "epoch": 0.47802049101101873, "grad_norm": 4.842832565307617, "learning_rate": 4.670351701225799e-06, "logits/chosen": 11.556646347045898, "logits/rejected": 1.400526523590088, "logps/chosen": -485.8091735839844, "logps/rejected": -235.7618408203125, "loss": 0.4774, "rewards/accuracies": 0.875, "rewards/chosen": 0.5398233532905579, "rewards/margins": 0.5973602533340454, "rewards/rejected": -0.05753690004348755, "step": 3091 }, { "epoch": 0.4781751401507829, "grad_norm": 6.038628578186035, "learning_rate": 4.6700652995761255e-06, "logits/chosen": 15.006936073303223, "logits/rejected": 14.903401374816895, "logps/chosen": -363.13494873046875, "logps/rejected": -310.9366455078125, "loss": 0.7972, "rewards/accuracies": 0.625, "rewards/chosen": -0.046074531972408295, "rewards/margins": -0.10337035357952118, "rewards/rejected": 0.057295799255371094, "step": 3092 }, { "epoch": 0.4783297892905471, "grad_norm": 7.2743144035339355, "learning_rate": 4.669778897926452e-06, "logits/chosen": 7.723608016967773, "logits/rejected": 5.050766944885254, "logps/chosen": -418.19622802734375, "logps/rejected": -267.413330078125, "loss": 0.7688, "rewards/accuracies": 0.375, "rewards/chosen": 0.26131361722946167, "rewards/margins": -0.11878012865781784, "rewards/rejected": 0.3800937235355377, "step": 3093 }, { "epoch": 0.47848443843031124, "grad_norm": 5.995887756347656, "learning_rate": 4.669492496276779e-06, "logits/chosen": 8.6649751663208, "logits/rejected": 5.4912028312683105, "logps/chosen": -234.44033813476562, "logps/rejected": -187.11074829101562, "loss": 0.6416, "rewards/accuracies": 0.625, "rewards/chosen": 0.5310311317443848, "rewards/margins": 0.19231672585010529, "rewards/rejected": 0.3387144207954407, "step": 3094 }, { "epoch": 0.4786390875700754, "grad_norm": 4.676825046539307, "learning_rate": 4.6692060946271055e-06, "logits/chosen": 11.066549301147461, "logits/rejected": 3.7531991004943848, "logps/chosen": -258.84423828125, "logps/rejected": -168.07275390625, "loss": 0.6382, "rewards/accuracies": 0.5, "rewards/chosen": 0.16205647587776184, "rewards/margins": 0.1746605932712555, "rewards/rejected": -0.012604091316461563, "step": 3095 }, { "epoch": 0.47879373670983955, "grad_norm": 4.744071960449219, "learning_rate": 4.668919692977432e-06, "logits/chosen": 12.52891731262207, "logits/rejected": 7.974610328674316, "logps/chosen": -285.4501647949219, "logps/rejected": -218.45010375976562, "loss": 0.6795, "rewards/accuracies": 0.5, "rewards/chosen": 0.20454664528369904, "rewards/margins": 0.0979498028755188, "rewards/rejected": 0.10659685730934143, "step": 3096 }, { "epoch": 0.4789483858496037, "grad_norm": 41.192081451416016, "learning_rate": 4.668633291327758e-06, "logits/chosen": 11.171636581420898, "logits/rejected": 10.808284759521484, "logps/chosen": -268.31732177734375, "logps/rejected": -313.15496826171875, "loss": 0.6801, "rewards/accuracies": 0.375, "rewards/chosen": 0.3524947166442871, "rewards/margins": 0.05625344067811966, "rewards/rejected": 0.29624128341674805, "step": 3097 }, { "epoch": 0.47910303498936785, "grad_norm": 12.082710266113281, "learning_rate": 4.668346889678085e-06, "logits/chosen": 9.04279899597168, "logits/rejected": 4.530684471130371, "logps/chosen": -303.1335754394531, "logps/rejected": -236.166015625, "loss": 0.5328, "rewards/accuracies": 0.875, "rewards/chosen": 0.25709667801856995, "rewards/margins": 0.4197738766670227, "rewards/rejected": -0.16267719864845276, "step": 3098 }, { "epoch": 0.47925768412913206, "grad_norm": 7.9044189453125, "learning_rate": 4.668060488028411e-06, "logits/chosen": 10.09089469909668, "logits/rejected": 11.316904067993164, "logps/chosen": -302.5939636230469, "logps/rejected": -319.34088134765625, "loss": 0.7032, "rewards/accuracies": 0.375, "rewards/chosen": 0.12135867029428482, "rewards/margins": 0.13457375764846802, "rewards/rejected": -0.013215094804763794, "step": 3099 }, { "epoch": 0.4794123332688962, "grad_norm": 5.916541576385498, "learning_rate": 4.667774086378738e-06, "logits/chosen": 7.774745941162109, "logits/rejected": 4.12262487411499, "logps/chosen": -205.13259887695312, "logps/rejected": -114.50479125976562, "loss": 0.688, "rewards/accuracies": 0.625, "rewards/chosen": 0.07930092513561249, "rewards/margins": 0.05635065212845802, "rewards/rejected": 0.022950269281864166, "step": 3100 }, { "epoch": 0.47956698240866036, "grad_norm": 4.6126322746276855, "learning_rate": 4.6674876847290645e-06, "logits/chosen": 4.103279113769531, "logits/rejected": 3.9995923042297363, "logps/chosen": -266.60174560546875, "logps/rejected": -200.19781494140625, "loss": 0.584, "rewards/accuracies": 0.875, "rewards/chosen": 0.426532506942749, "rewards/margins": 0.2903444766998291, "rewards/rejected": 0.13618803024291992, "step": 3101 }, { "epoch": 0.4797216315484245, "grad_norm": 5.278357982635498, "learning_rate": 4.667201283079391e-06, "logits/chosen": 7.119441509246826, "logits/rejected": 1.8866794109344482, "logps/chosen": -224.30630493164062, "logps/rejected": -163.84725952148438, "loss": 0.6944, "rewards/accuracies": 0.75, "rewards/chosen": 0.0046348609030246735, "rewards/margins": 0.06837233901023865, "rewards/rejected": -0.06373748928308487, "step": 3102 }, { "epoch": 0.47987628068818866, "grad_norm": 6.516919136047363, "learning_rate": 4.666914881429717e-06, "logits/chosen": 10.329404830932617, "logits/rejected": 5.345468997955322, "logps/chosen": -314.9106140136719, "logps/rejected": -253.62203979492188, "loss": 0.6378, "rewards/accuracies": 0.75, "rewards/chosen": 0.25726640224456787, "rewards/margins": 0.16195009648799896, "rewards/rejected": 0.0953163281083107, "step": 3103 }, { "epoch": 0.4800309298279528, "grad_norm": 5.054995059967041, "learning_rate": 4.666628479780044e-06, "logits/chosen": 12.634053230285645, "logits/rejected": 6.672888278961182, "logps/chosen": -312.3268127441406, "logps/rejected": -250.08203125, "loss": 0.5894, "rewards/accuracies": 0.625, "rewards/chosen": 0.4982706308364868, "rewards/margins": 0.334505558013916, "rewards/rejected": 0.1637650430202484, "step": 3104 }, { "epoch": 0.480185578967717, "grad_norm": 6.274186611175537, "learning_rate": 4.66634207813037e-06, "logits/chosen": 7.107688903808594, "logits/rejected": 7.325886249542236, "logps/chosen": -256.94720458984375, "logps/rejected": -267.4947204589844, "loss": 0.6929, "rewards/accuracies": 0.375, "rewards/chosen": 0.31862062215805054, "rewards/margins": 0.05860920250415802, "rewards/rejected": 0.2600114047527313, "step": 3105 }, { "epoch": 0.48034022810748117, "grad_norm": 5.772373199462891, "learning_rate": 4.666055676480697e-06, "logits/chosen": 12.983613014221191, "logits/rejected": 13.001771926879883, "logps/chosen": -300.5957946777344, "logps/rejected": -301.8417663574219, "loss": 0.6318, "rewards/accuracies": 0.5, "rewards/chosen": 0.28683149814605713, "rewards/margins": 0.21356192231178284, "rewards/rejected": 0.07326958328485489, "step": 3106 }, { "epoch": 0.4804948772472453, "grad_norm": 3.7809906005859375, "learning_rate": 4.665769274831024e-06, "logits/chosen": 9.988656997680664, "logits/rejected": 8.445301055908203, "logps/chosen": -290.3752746582031, "logps/rejected": -163.20372009277344, "loss": 0.529, "rewards/accuracies": 0.75, "rewards/chosen": 0.36333411931991577, "rewards/margins": 0.4124641418457031, "rewards/rejected": -0.04913001134991646, "step": 3107 }, { "epoch": 0.4806495263870095, "grad_norm": 6.993603706359863, "learning_rate": 4.66548287318135e-06, "logits/chosen": 8.771278381347656, "logits/rejected": 13.76442813873291, "logps/chosen": -264.7418518066406, "logps/rejected": -280.3789978027344, "loss": 0.6834, "rewards/accuracies": 0.375, "rewards/chosen": 0.27477264404296875, "rewards/margins": 0.11957436800003052, "rewards/rejected": 0.15519830584526062, "step": 3108 }, { "epoch": 0.4808041755267736, "grad_norm": 4.009907245635986, "learning_rate": 4.665196471531677e-06, "logits/chosen": 6.039556980133057, "logits/rejected": 3.0460400581359863, "logps/chosen": -363.201171875, "logps/rejected": -243.82952880859375, "loss": 0.5105, "rewards/accuracies": 0.75, "rewards/chosen": 0.3379444479942322, "rewards/margins": 0.6768665313720703, "rewards/rejected": -0.3389221131801605, "step": 3109 }, { "epoch": 0.4809588246665378, "grad_norm": 6.993696689605713, "learning_rate": 4.664910069882003e-06, "logits/chosen": 13.721200942993164, "logits/rejected": 7.4661865234375, "logps/chosen": -361.32061767578125, "logps/rejected": -225.691162109375, "loss": 0.6726, "rewards/accuracies": 0.625, "rewards/chosen": 0.2556111216545105, "rewards/margins": 0.18139737844467163, "rewards/rejected": 0.07421379536390305, "step": 3110 }, { "epoch": 0.48111347380630193, "grad_norm": 9.799798965454102, "learning_rate": 4.664623668232329e-06, "logits/chosen": 11.079540252685547, "logits/rejected": 11.499974250793457, "logps/chosen": -441.81695556640625, "logps/rejected": -403.2650146484375, "loss": 0.767, "rewards/accuracies": 0.5, "rewards/chosen": 0.2043754607439041, "rewards/margins": -0.07378693670034409, "rewards/rejected": 0.2781623899936676, "step": 3111 }, { "epoch": 0.48126812294606613, "grad_norm": 6.90119743347168, "learning_rate": 4.664337266582656e-06, "logits/chosen": 7.199223518371582, "logits/rejected": 14.866822242736816, "logps/chosen": -310.2446594238281, "logps/rejected": -423.78887939453125, "loss": 0.6777, "rewards/accuracies": 0.375, "rewards/chosen": 0.08916039019823074, "rewards/margins": 0.09727483242750168, "rewards/rejected": -0.008114442229270935, "step": 3112 }, { "epoch": 0.4814227720858303, "grad_norm": 5.121906757354736, "learning_rate": 4.664050864932983e-06, "logits/chosen": 8.25510025024414, "logits/rejected": 5.587943077087402, "logps/chosen": -331.591796875, "logps/rejected": -258.9278564453125, "loss": 0.5827, "rewards/accuracies": 0.625, "rewards/chosen": 0.2942407727241516, "rewards/margins": 0.4519074857234955, "rewards/rejected": -0.15766669809818268, "step": 3113 }, { "epoch": 0.48157742122559444, "grad_norm": 4.946910381317139, "learning_rate": 4.663764463283309e-06, "logits/chosen": 13.50670337677002, "logits/rejected": 9.783855438232422, "logps/chosen": -238.26123046875, "logps/rejected": -217.48915100097656, "loss": 0.5451, "rewards/accuracies": 0.75, "rewards/chosen": 0.49375224113464355, "rewards/margins": 0.44415006041526794, "rewards/rejected": 0.049602165818214417, "step": 3114 }, { "epoch": 0.4817320703653586, "grad_norm": 5.88555383682251, "learning_rate": 4.663478061633636e-06, "logits/chosen": 10.317214965820312, "logits/rejected": 4.8607611656188965, "logps/chosen": -271.45263671875, "logps/rejected": -218.02114868164062, "loss": 0.6784, "rewards/accuracies": 0.625, "rewards/chosen": 0.19734486937522888, "rewards/margins": 0.042775336652994156, "rewards/rejected": 0.15456953644752502, "step": 3115 }, { "epoch": 0.48188671950512274, "grad_norm": 5.0947265625, "learning_rate": 4.663191659983962e-06, "logits/chosen": 8.713740348815918, "logits/rejected": 6.296026706695557, "logps/chosen": -223.08221435546875, "logps/rejected": -154.70620727539062, "loss": 0.7356, "rewards/accuracies": 0.375, "rewards/chosen": 0.1956629753112793, "rewards/margins": -0.03696922957897186, "rewards/rejected": 0.23263218998908997, "step": 3116 }, { "epoch": 0.4820413686448869, "grad_norm": 10.16594123840332, "learning_rate": 4.662905258334288e-06, "logits/chosen": 9.132078170776367, "logits/rejected": 9.844566345214844, "logps/chosen": -546.7945556640625, "logps/rejected": -361.9330749511719, "loss": 0.7085, "rewards/accuracies": 0.5, "rewards/chosen": 0.26457977294921875, "rewards/margins": 0.07653006166219711, "rewards/rejected": 0.18804970383644104, "step": 3117 }, { "epoch": 0.4821960177846511, "grad_norm": 5.411478042602539, "learning_rate": 4.662618856684615e-06, "logits/chosen": 7.599648475646973, "logits/rejected": 9.483412742614746, "logps/chosen": -241.53836059570312, "logps/rejected": -250.0540313720703, "loss": 0.63, "rewards/accuracies": 0.625, "rewards/chosen": 0.41807055473327637, "rewards/margins": 0.1949913650751114, "rewards/rejected": 0.22307920455932617, "step": 3118 }, { "epoch": 0.48235066692441525, "grad_norm": 3.5089199542999268, "learning_rate": 4.662332455034942e-06, "logits/chosen": 8.58030891418457, "logits/rejected": 5.935327529907227, "logps/chosen": -163.31246948242188, "logps/rejected": -102.3130874633789, "loss": 0.6235, "rewards/accuracies": 0.625, "rewards/chosen": 0.045124948024749756, "rewards/margins": 0.2460692673921585, "rewards/rejected": -0.20094431936740875, "step": 3119 }, { "epoch": 0.4825053160641794, "grad_norm": 8.25362491607666, "learning_rate": 4.662046053385268e-06, "logits/chosen": 14.522830963134766, "logits/rejected": 9.616324424743652, "logps/chosen": -417.04278564453125, "logps/rejected": -322.84259033203125, "loss": 0.7294, "rewards/accuracies": 0.5, "rewards/chosen": 0.18802796304225922, "rewards/margins": -0.04016497731208801, "rewards/rejected": 0.22819297015666962, "step": 3120 }, { "epoch": 0.48265996520394355, "grad_norm": 5.709573745727539, "learning_rate": 4.661759651735594e-06, "logits/chosen": 11.128902435302734, "logits/rejected": 8.42249870300293, "logps/chosen": -273.7630615234375, "logps/rejected": -234.2310333251953, "loss": 0.7539, "rewards/accuracies": 0.375, "rewards/chosen": 0.2798891067504883, "rewards/margins": 0.09342312812805176, "rewards/rejected": 0.18646597862243652, "step": 3121 }, { "epoch": 0.4828146143437077, "grad_norm": 6.523487091064453, "learning_rate": 4.661473250085921e-06, "logits/chosen": 7.685569763183594, "logits/rejected": 7.032012939453125, "logps/chosen": -301.07135009765625, "logps/rejected": -433.1878356933594, "loss": 0.565, "rewards/accuracies": 0.75, "rewards/chosen": 0.4107076823711395, "rewards/margins": 0.39451009035110474, "rewards/rejected": 0.016197582706809044, "step": 3122 }, { "epoch": 0.48296926348347186, "grad_norm": 7.286925792694092, "learning_rate": 4.6611868484362475e-06, "logits/chosen": 8.256193161010742, "logits/rejected": 10.909571647644043, "logps/chosen": -384.4923095703125, "logps/rejected": -283.5874938964844, "loss": 0.8165, "rewards/accuracies": 0.125, "rewards/chosen": 0.16667160391807556, "rewards/margins": -0.22564469277858734, "rewards/rejected": 0.3923163115978241, "step": 3123 }, { "epoch": 0.483123912623236, "grad_norm": 4.95969295501709, "learning_rate": 4.660900446786574e-06, "logits/chosen": 8.335537910461426, "logits/rejected": 6.66776704788208, "logps/chosen": -242.01260375976562, "logps/rejected": -229.59786987304688, "loss": 0.5817, "rewards/accuracies": 0.75, "rewards/chosen": 0.5110227465629578, "rewards/margins": 0.28857529163360596, "rewards/rejected": 0.2224474549293518, "step": 3124 }, { "epoch": 0.4832785617630002, "grad_norm": 8.299555778503418, "learning_rate": 4.6606140451369e-06, "logits/chosen": 14.352217674255371, "logits/rejected": 7.983645439147949, "logps/chosen": -324.21917724609375, "logps/rejected": -255.78677368164062, "loss": 0.3975, "rewards/accuracies": 1.0, "rewards/chosen": 0.5353757739067078, "rewards/margins": 0.8145855069160461, "rewards/rejected": -0.27920979261398315, "step": 3125 }, { "epoch": 0.48343321090276437, "grad_norm": 5.962520599365234, "learning_rate": 4.6603276434872266e-06, "logits/chosen": 10.823653221130371, "logits/rejected": 8.550436019897461, "logps/chosen": -284.08294677734375, "logps/rejected": -339.2457275390625, "loss": 0.5209, "rewards/accuracies": 0.75, "rewards/chosen": 0.606833815574646, "rewards/margins": 0.5953049659729004, "rewards/rejected": 0.0115288645029068, "step": 3126 }, { "epoch": 0.4835878600425285, "grad_norm": 11.590056419372559, "learning_rate": 4.660041241837553e-06, "logits/chosen": 5.133481979370117, "logits/rejected": 6.446111679077148, "logps/chosen": -315.7890930175781, "logps/rejected": -257.5774841308594, "loss": 0.7486, "rewards/accuracies": 0.25, "rewards/chosen": -0.008068151772022247, "rewards/margins": -0.045098286122083664, "rewards/rejected": 0.03703012689948082, "step": 3127 }, { "epoch": 0.48374250918229267, "grad_norm": 6.511941432952881, "learning_rate": 4.65975484018788e-06, "logits/chosen": 10.51563549041748, "logits/rejected": 8.67074203491211, "logps/chosen": -248.95477294921875, "logps/rejected": -211.943603515625, "loss": 0.758, "rewards/accuracies": 0.375, "rewards/chosen": 0.1153748631477356, "rewards/margins": -0.07697518169879913, "rewards/rejected": 0.19235001504421234, "step": 3128 }, { "epoch": 0.4838971583220568, "grad_norm": 4.336550235748291, "learning_rate": 4.6594684385382065e-06, "logits/chosen": 6.033234596252441, "logits/rejected": 5.192317962646484, "logps/chosen": -204.49119567871094, "logps/rejected": -197.9831085205078, "loss": 0.6913, "rewards/accuracies": 0.625, "rewards/chosen": 0.030600735917687416, "rewards/margins": 0.06455759704113007, "rewards/rejected": -0.0339568667113781, "step": 3129 }, { "epoch": 0.48405180746182097, "grad_norm": 4.572023868560791, "learning_rate": 4.659182036888532e-06, "logits/chosen": 10.968120574951172, "logits/rejected": 7.156067848205566, "logps/chosen": -209.73492431640625, "logps/rejected": -198.540283203125, "loss": 0.5592, "rewards/accuracies": 0.875, "rewards/chosen": 0.15447193384170532, "rewards/margins": 0.328130841255188, "rewards/rejected": -0.17365893721580505, "step": 3130 }, { "epoch": 0.4842064566015852, "grad_norm": 5.495441436767578, "learning_rate": 4.658895635238859e-06, "logits/chosen": 8.866323471069336, "logits/rejected": 8.06518268585205, "logps/chosen": -191.22218322753906, "logps/rejected": -159.1036834716797, "loss": 0.7118, "rewards/accuracies": 0.5, "rewards/chosen": 0.4054487645626068, "rewards/margins": 0.004098765552043915, "rewards/rejected": 0.4013499617576599, "step": 3131 }, { "epoch": 0.48436110574134933, "grad_norm": 5.050384521484375, "learning_rate": 4.658609233589186e-06, "logits/chosen": 10.044139862060547, "logits/rejected": 8.329888343811035, "logps/chosen": -292.9063415527344, "logps/rejected": -307.84405517578125, "loss": 0.5093, "rewards/accuracies": 0.875, "rewards/chosen": 0.39067453145980835, "rewards/margins": 0.4525166153907776, "rewards/rejected": -0.06184205785393715, "step": 3132 }, { "epoch": 0.4845157548811135, "grad_norm": 4.396191596984863, "learning_rate": 4.658322831939512e-06, "logits/chosen": 5.640777587890625, "logits/rejected": 2.405721426010132, "logps/chosen": -1031.5565185546875, "logps/rejected": -176.4678192138672, "loss": 0.609, "rewards/accuracies": 0.375, "rewards/chosen": 9.063767433166504, "rewards/margins": 8.928215026855469, "rewards/rejected": 0.1355513483285904, "step": 3133 }, { "epoch": 0.48467040402087763, "grad_norm": 5.602945804595947, "learning_rate": 4.658036430289839e-06, "logits/chosen": 7.063930511474609, "logits/rejected": 6.749483585357666, "logps/chosen": -145.97476196289062, "logps/rejected": -172.72418212890625, "loss": 0.8493, "rewards/accuracies": 0.25, "rewards/chosen": -0.026788663119077682, "rewards/margins": -0.2280847728252411, "rewards/rejected": 0.20129609107971191, "step": 3134 }, { "epoch": 0.4848250531606418, "grad_norm": 4.217652797698975, "learning_rate": 4.657750028640166e-06, "logits/chosen": 12.316527366638184, "logits/rejected": 2.074108600616455, "logps/chosen": -162.15072631835938, "logps/rejected": -92.1205062866211, "loss": 0.6599, "rewards/accuracies": 0.625, "rewards/chosen": 0.14895698428153992, "rewards/margins": 0.11452591419219971, "rewards/rejected": 0.034431055188179016, "step": 3135 }, { "epoch": 0.48497970230040593, "grad_norm": 4.476449012756348, "learning_rate": 4.657463626990491e-06, "logits/chosen": 9.3092041015625, "logits/rejected": 1.613875389099121, "logps/chosen": -330.0496520996094, "logps/rejected": -241.52801513671875, "loss": 0.5316, "rewards/accuracies": 0.75, "rewards/chosen": 0.5261832475662231, "rewards/margins": 0.5591482520103455, "rewards/rejected": -0.0329650416970253, "step": 3136 }, { "epoch": 0.48513435144017014, "grad_norm": 6.643457412719727, "learning_rate": 4.657177225340818e-06, "logits/chosen": 14.627910614013672, "logits/rejected": 7.467987060546875, "logps/chosen": -342.03985595703125, "logps/rejected": -284.0477294921875, "loss": 0.5862, "rewards/accuracies": 0.625, "rewards/chosen": 0.12968263030052185, "rewards/margins": 0.5245844125747681, "rewards/rejected": -0.3949018120765686, "step": 3137 }, { "epoch": 0.4852890005799343, "grad_norm": 5.43419075012207, "learning_rate": 4.656890823691145e-06, "logits/chosen": 4.246822357177734, "logits/rejected": 9.969505310058594, "logps/chosen": -177.4728546142578, "logps/rejected": -272.5858154296875, "loss": 0.8379, "rewards/accuracies": 0.25, "rewards/chosen": 0.1840386688709259, "rewards/margins": -0.21330195665359497, "rewards/rejected": 0.3973406255245209, "step": 3138 }, { "epoch": 0.48544364971969844, "grad_norm": 4.850854396820068, "learning_rate": 4.656604422041471e-06, "logits/chosen": 12.65326976776123, "logits/rejected": 6.869013786315918, "logps/chosen": -353.0364990234375, "logps/rejected": -236.920166015625, "loss": 0.5683, "rewards/accuracies": 0.625, "rewards/chosen": 0.5645352602005005, "rewards/margins": 0.3963402807712555, "rewards/rejected": 0.168194979429245, "step": 3139 }, { "epoch": 0.4855982988594626, "grad_norm": 4.66309928894043, "learning_rate": 4.656318020391798e-06, "logits/chosen": 9.907949447631836, "logits/rejected": 7.054296493530273, "logps/chosen": -235.88304138183594, "logps/rejected": -217.68917846679688, "loss": 0.6477, "rewards/accuracies": 0.625, "rewards/chosen": 0.35523533821105957, "rewards/margins": 0.14800453186035156, "rewards/rejected": 0.20723077654838562, "step": 3140 }, { "epoch": 0.48575294799922675, "grad_norm": 5.229705810546875, "learning_rate": 4.656031618742125e-06, "logits/chosen": 12.638096809387207, "logits/rejected": 10.311125755310059, "logps/chosen": -237.84715270996094, "logps/rejected": -209.0557098388672, "loss": 0.749, "rewards/accuracies": 0.375, "rewards/chosen": 0.181694895029068, "rewards/margins": -0.012629128992557526, "rewards/rejected": 0.1943240463733673, "step": 3141 }, { "epoch": 0.4859075971389909, "grad_norm": 11.435564041137695, "learning_rate": 4.655745217092451e-06, "logits/chosen": 13.609848022460938, "logits/rejected": 6.833785533905029, "logps/chosen": -416.77032470703125, "logps/rejected": -280.5407409667969, "loss": 0.7254, "rewards/accuracies": 0.5, "rewards/chosen": 0.23872433602809906, "rewards/margins": 0.01831910014152527, "rewards/rejected": 0.22040525078773499, "step": 3142 }, { "epoch": 0.48606224627875505, "grad_norm": 7.797854423522949, "learning_rate": 4.655458815442777e-06, "logits/chosen": 8.242634773254395, "logits/rejected": 6.678414821624756, "logps/chosen": -360.640380859375, "logps/rejected": -309.0814514160156, "loss": 0.8073, "rewards/accuracies": 0.625, "rewards/chosen": -0.1506882756948471, "rewards/margins": -0.08424608409404755, "rewards/rejected": -0.06644222140312195, "step": 3143 }, { "epoch": 0.48621689541851926, "grad_norm": 6.6474761962890625, "learning_rate": 4.655172413793104e-06, "logits/chosen": 13.172595024108887, "logits/rejected": 12.744207382202148, "logps/chosen": -293.0497741699219, "logps/rejected": -338.942626953125, "loss": 0.6977, "rewards/accuracies": 0.5, "rewards/chosen": 0.2808595597743988, "rewards/margins": 0.04525240510702133, "rewards/rejected": 0.23560716211795807, "step": 3144 }, { "epoch": 0.4863715445582834, "grad_norm": 5.822055816650391, "learning_rate": 4.65488601214343e-06, "logits/chosen": 16.076597213745117, "logits/rejected": 6.6547088623046875, "logps/chosen": -340.1954040527344, "logps/rejected": -194.8856201171875, "loss": 0.5939, "rewards/accuracies": 0.75, "rewards/chosen": 0.3909834623336792, "rewards/margins": 0.38622355461120605, "rewards/rejected": 0.004759877920150757, "step": 3145 }, { "epoch": 0.48652619369804756, "grad_norm": 4.3357648849487305, "learning_rate": 4.654599610493757e-06, "logits/chosen": 14.991737365722656, "logits/rejected": 10.983709335327148, "logps/chosen": -189.89471435546875, "logps/rejected": -186.35240173339844, "loss": 0.5711, "rewards/accuracies": 0.625, "rewards/chosen": 0.36932802200317383, "rewards/margins": 0.2914316952228546, "rewards/rejected": 0.0778963565826416, "step": 3146 }, { "epoch": 0.4866808428378117, "grad_norm": 8.602401733398438, "learning_rate": 4.654313208844084e-06, "logits/chosen": 14.080583572387695, "logits/rejected": 8.30788803100586, "logps/chosen": -537.536376953125, "logps/rejected": -455.89764404296875, "loss": 0.5965, "rewards/accuracies": 0.625, "rewards/chosen": 0.7892827987670898, "rewards/margins": 0.2590695321559906, "rewards/rejected": 0.5302132964134216, "step": 3147 }, { "epoch": 0.48683549197757586, "grad_norm": 7.847988128662109, "learning_rate": 4.65402680719441e-06, "logits/chosen": 8.244237899780273, "logits/rejected": 9.512640953063965, "logps/chosen": -360.02996826171875, "logps/rejected": -377.20526123046875, "loss": 0.7297, "rewards/accuracies": 0.5, "rewards/chosen": 0.4286859631538391, "rewards/margins": 0.07802101969718933, "rewards/rejected": 0.3506649434566498, "step": 3148 }, { "epoch": 0.48699014111734, "grad_norm": 4.725911617279053, "learning_rate": 4.653740405544736e-06, "logits/chosen": 13.503066062927246, "logits/rejected": 9.251468658447266, "logps/chosen": -334.5111083984375, "logps/rejected": -200.09103393554688, "loss": 0.6043, "rewards/accuracies": 0.5, "rewards/chosen": 0.4679725170135498, "rewards/margins": 0.25458309054374695, "rewards/rejected": 0.21338944137096405, "step": 3149 }, { "epoch": 0.4871447902571042, "grad_norm": 6.002705097198486, "learning_rate": 4.653454003895063e-06, "logits/chosen": 12.51889419555664, "logits/rejected": 5.8789801597595215, "logps/chosen": -398.3709411621094, "logps/rejected": -269.5645446777344, "loss": 0.6609, "rewards/accuracies": 0.75, "rewards/chosen": 0.28691694140434265, "rewards/margins": 0.16540218889713287, "rewards/rejected": 0.1215147003531456, "step": 3150 }, { "epoch": 0.4872994393968684, "grad_norm": 7.415189266204834, "learning_rate": 4.6531676022453894e-06, "logits/chosen": 10.190999984741211, "logits/rejected": 7.825841903686523, "logps/chosen": -262.91522216796875, "logps/rejected": -288.96209716796875, "loss": 0.7193, "rewards/accuracies": 0.75, "rewards/chosen": 0.19332566857337952, "rewards/margins": 0.13973182439804077, "rewards/rejected": 0.05359382927417755, "step": 3151 }, { "epoch": 0.4874540885366325, "grad_norm": 5.6488752365112305, "learning_rate": 4.652881200595716e-06, "logits/chosen": 11.01989459991455, "logits/rejected": 8.425956726074219, "logps/chosen": -260.1990661621094, "logps/rejected": -230.9673614501953, "loss": 0.7021, "rewards/accuracies": 0.625, "rewards/chosen": 0.09329281747341156, "rewards/margins": 0.04618797451257706, "rewards/rejected": 0.04710483178496361, "step": 3152 }, { "epoch": 0.4876087376763967, "grad_norm": 6.773861408233643, "learning_rate": 4.652594798946043e-06, "logits/chosen": 5.366913318634033, "logits/rejected": 4.152656555175781, "logps/chosen": -252.30450439453125, "logps/rejected": -186.62954711914062, "loss": 0.6081, "rewards/accuracies": 0.75, "rewards/chosen": 0.1350730061531067, "rewards/margins": 0.20778274536132812, "rewards/rejected": -0.07270975410938263, "step": 3153 }, { "epoch": 0.4877633868161608, "grad_norm": 5.900341033935547, "learning_rate": 4.6523083972963686e-06, "logits/chosen": 9.167007446289062, "logits/rejected": 10.83575439453125, "logps/chosen": -315.940673828125, "logps/rejected": -373.966064453125, "loss": 0.6625, "rewards/accuracies": 0.5, "rewards/chosen": 0.5532617568969727, "rewards/margins": 0.16774307191371918, "rewards/rejected": 0.38551872968673706, "step": 3154 }, { "epoch": 0.487918035955925, "grad_norm": 6.796382904052734, "learning_rate": 4.652021995646695e-06, "logits/chosen": 7.657487869262695, "logits/rejected": 10.32063102722168, "logps/chosen": -299.8795471191406, "logps/rejected": -416.1364440917969, "loss": 0.7348, "rewards/accuracies": 0.375, "rewards/chosen": 0.42327243089675903, "rewards/margins": -0.06519460678100586, "rewards/rejected": 0.4884670376777649, "step": 3155 }, { "epoch": 0.48807268509568913, "grad_norm": 6.500053405761719, "learning_rate": 4.651735593997022e-06, "logits/chosen": 9.578104019165039, "logits/rejected": 7.073984622955322, "logps/chosen": -276.14117431640625, "logps/rejected": -262.5292663574219, "loss": 0.8523, "rewards/accuracies": 0.5, "rewards/chosen": 0.36339902877807617, "rewards/margins": -0.15660923719406128, "rewards/rejected": 0.5200082659721375, "step": 3156 }, { "epoch": 0.48822733423545334, "grad_norm": 7.831830978393555, "learning_rate": 4.6514491923473485e-06, "logits/chosen": 3.694526195526123, "logits/rejected": 11.560588836669922, "logps/chosen": -200.13336181640625, "logps/rejected": -293.12005615234375, "loss": 0.982, "rewards/accuracies": 0.25, "rewards/chosen": -0.06465218216180801, "rewards/margins": -0.4228639006614685, "rewards/rejected": 0.3582116961479187, "step": 3157 }, { "epoch": 0.4883819833752175, "grad_norm": 7.800491809844971, "learning_rate": 4.651162790697675e-06, "logits/chosen": 6.872100830078125, "logits/rejected": 5.318763732910156, "logps/chosen": -365.7567443847656, "logps/rejected": -265.0841979980469, "loss": 0.4845, "rewards/accuracies": 0.875, "rewards/chosen": 0.5878764390945435, "rewards/margins": 0.6039544939994812, "rewards/rejected": -0.01607809215784073, "step": 3158 }, { "epoch": 0.48853663251498164, "grad_norm": 4.411405086517334, "learning_rate": 4.650876389048001e-06, "logits/chosen": 10.610103607177734, "logits/rejected": 8.967996597290039, "logps/chosen": -219.71969604492188, "logps/rejected": -195.4951171875, "loss": 0.6501, "rewards/accuracies": 0.75, "rewards/chosen": 0.2500418722629547, "rewards/margins": 0.16353864967823029, "rewards/rejected": 0.08650322258472443, "step": 3159 }, { "epoch": 0.4886912816547458, "grad_norm": 6.110515117645264, "learning_rate": 4.650589987398328e-06, "logits/chosen": 7.2000579833984375, "logits/rejected": 12.480628967285156, "logps/chosen": -256.54443359375, "logps/rejected": -297.2021179199219, "loss": 0.7546, "rewards/accuracies": 0.625, "rewards/chosen": 0.25119417905807495, "rewards/margins": 0.04521968960762024, "rewards/rejected": 0.2059745192527771, "step": 3160 }, { "epoch": 0.48884593079450994, "grad_norm": 5.841609001159668, "learning_rate": 4.650303585748654e-06, "logits/chosen": 7.3679518699646, "logits/rejected": 9.284711837768555, "logps/chosen": -259.32470703125, "logps/rejected": -282.07989501953125, "loss": 0.658, "rewards/accuracies": 0.625, "rewards/chosen": 0.13073723018169403, "rewards/margins": 0.11859878152608871, "rewards/rejected": 0.01213844120502472, "step": 3161 }, { "epoch": 0.4890005799342741, "grad_norm": 7.9683427810668945, "learning_rate": 4.650017184098981e-06, "logits/chosen": 9.648505210876465, "logits/rejected": 8.252156257629395, "logps/chosen": -234.68856811523438, "logps/rejected": -224.43719482421875, "loss": 0.8463, "rewards/accuracies": 0.625, "rewards/chosen": -0.12476016581058502, "rewards/margins": -0.0468250997364521, "rewards/rejected": -0.07793506234884262, "step": 3162 }, { "epoch": 0.4891552290740383, "grad_norm": 5.005395889282227, "learning_rate": 4.649730782449307e-06, "logits/chosen": 4.40004825592041, "logits/rejected": 2.7033019065856934, "logps/chosen": -219.5928955078125, "logps/rejected": -152.5099639892578, "loss": 0.6907, "rewards/accuracies": 0.375, "rewards/chosen": 0.24326755106449127, "rewards/margins": 0.040911633521318436, "rewards/rejected": 0.20235590636730194, "step": 3163 }, { "epoch": 0.48930987821380245, "grad_norm": 7.610771179199219, "learning_rate": 4.649444380799633e-06, "logits/chosen": 1.3592698574066162, "logits/rejected": 4.451530456542969, "logps/chosen": -313.43927001953125, "logps/rejected": -281.161376953125, "loss": 0.8932, "rewards/accuracies": 0.5, "rewards/chosen": 0.06496445834636688, "rewards/margins": -0.21391120553016663, "rewards/rejected": 0.2788756191730499, "step": 3164 }, { "epoch": 0.4894645273535666, "grad_norm": 10.818938255310059, "learning_rate": 4.64915797914996e-06, "logits/chosen": 11.451922416687012, "logits/rejected": 11.464862823486328, "logps/chosen": -435.517822265625, "logps/rejected": -528.1263427734375, "loss": 0.7433, "rewards/accuracies": 0.375, "rewards/chosen": 0.6088907718658447, "rewards/margins": 0.010804370045661926, "rewards/rejected": 0.598086416721344, "step": 3165 }, { "epoch": 0.48961917649333075, "grad_norm": 6.297966480255127, "learning_rate": 4.648871577500287e-06, "logits/chosen": 10.342071533203125, "logits/rejected": 10.34482192993164, "logps/chosen": -300.5644836425781, "logps/rejected": -302.13092041015625, "loss": 0.7791, "rewards/accuracies": 0.25, "rewards/chosen": 0.011102981865406036, "rewards/margins": -0.022024869918823242, "rewards/rejected": 0.033127881586551666, "step": 3166 }, { "epoch": 0.4897738256330949, "grad_norm": 5.925624370574951, "learning_rate": 4.648585175850613e-06, "logits/chosen": 8.92153549194336, "logits/rejected": 6.804349899291992, "logps/chosen": -229.2291259765625, "logps/rejected": -202.8915252685547, "loss": 0.6693, "rewards/accuracies": 0.625, "rewards/chosen": 0.051148418337106705, "rewards/margins": 0.12040044367313385, "rewards/rejected": -0.06925200670957565, "step": 3167 }, { "epoch": 0.48992847477285906, "grad_norm": 5.773336410522461, "learning_rate": 4.64829877420094e-06, "logits/chosen": 6.840940475463867, "logits/rejected": 4.913928985595703, "logps/chosen": -275.9788818359375, "logps/rejected": -224.2779541015625, "loss": 0.6525, "rewards/accuracies": 0.625, "rewards/chosen": 0.4275149405002594, "rewards/margins": 0.0990627259016037, "rewards/rejected": 0.3284522294998169, "step": 3168 }, { "epoch": 0.4900831239126232, "grad_norm": 5.766694068908691, "learning_rate": 4.648012372551266e-06, "logits/chosen": 12.817164421081543, "logits/rejected": 6.663919925689697, "logps/chosen": -316.7397155761719, "logps/rejected": -290.2966003417969, "loss": 0.54, "rewards/accuracies": 0.625, "rewards/chosen": 0.5047233700752258, "rewards/margins": 0.44056177139282227, "rewards/rejected": 0.06416158378124237, "step": 3169 }, { "epoch": 0.4902377730523874, "grad_norm": 4.884330749511719, "learning_rate": 4.647725970901592e-06, "logits/chosen": 11.26691722869873, "logits/rejected": 10.823739051818848, "logps/chosen": -155.59881591796875, "logps/rejected": -202.57444763183594, "loss": 0.7954, "rewards/accuracies": 0.375, "rewards/chosen": -0.28951627016067505, "rewards/margins": -0.1497904360294342, "rewards/rejected": -0.13972583413124084, "step": 3170 }, { "epoch": 0.49039242219215157, "grad_norm": 8.208195686340332, "learning_rate": 4.647439569251919e-06, "logits/chosen": 9.708375930786133, "logits/rejected": 5.051123142242432, "logps/chosen": -358.4781799316406, "logps/rejected": -281.9560546875, "loss": 0.7367, "rewards/accuracies": 0.5, "rewards/chosen": 0.5789129137992859, "rewards/margins": -0.017590772360563278, "rewards/rejected": 0.596503734588623, "step": 3171 }, { "epoch": 0.4905470713319157, "grad_norm": 6.089240550994873, "learning_rate": 4.647153167602246e-06, "logits/chosen": 17.972990036010742, "logits/rejected": 13.444500923156738, "logps/chosen": -266.7465515136719, "logps/rejected": -316.9468994140625, "loss": 0.7546, "rewards/accuracies": 0.375, "rewards/chosen": 0.42336201667785645, "rewards/margins": -0.08778373897075653, "rewards/rejected": 0.5111457705497742, "step": 3172 }, { "epoch": 0.49070172047167987, "grad_norm": 3.864051580429077, "learning_rate": 4.646866765952572e-06, "logits/chosen": 12.79916763305664, "logits/rejected": 10.045812606811523, "logps/chosen": -215.0010986328125, "logps/rejected": -207.05117797851562, "loss": 0.5086, "rewards/accuracies": 0.75, "rewards/chosen": 0.4178192913532257, "rewards/margins": 0.527061939239502, "rewards/rejected": -0.10924268513917923, "step": 3173 }, { "epoch": 0.490856369611444, "grad_norm": 6.854030132293701, "learning_rate": 4.646580364302899e-06, "logits/chosen": 11.282418251037598, "logits/rejected": 9.218419075012207, "logps/chosen": -289.53265380859375, "logps/rejected": -269.23040771484375, "loss": 0.7765, "rewards/accuracies": 0.5, "rewards/chosen": 0.37288475036621094, "rewards/margins": 0.04157883673906326, "rewards/rejected": 0.3313058912754059, "step": 3174 }, { "epoch": 0.4910110187512082, "grad_norm": 4.960069179534912, "learning_rate": 4.646293962653226e-06, "logits/chosen": 7.7244462966918945, "logits/rejected": 5.769742965698242, "logps/chosen": -307.2171630859375, "logps/rejected": -230.10629272460938, "loss": 0.5998, "rewards/accuracies": 0.625, "rewards/chosen": 0.029868803918361664, "rewards/margins": 0.2932475507259369, "rewards/rejected": -0.263378769159317, "step": 3175 }, { "epoch": 0.4911656678909724, "grad_norm": 6.106284141540527, "learning_rate": 4.6460075610035515e-06, "logits/chosen": 12.172809600830078, "logits/rejected": 9.250649452209473, "logps/chosen": -306.5214538574219, "logps/rejected": -255.6414337158203, "loss": 0.5988, "rewards/accuracies": 0.75, "rewards/chosen": 0.6228266954421997, "rewards/margins": 0.2738649547100067, "rewards/rejected": 0.34896180033683777, "step": 3176 }, { "epoch": 0.49132031703073653, "grad_norm": 4.351642608642578, "learning_rate": 4.645721159353878e-06, "logits/chosen": 8.836203575134277, "logits/rejected": 6.279273986816406, "logps/chosen": -252.66725158691406, "logps/rejected": -173.04774475097656, "loss": 0.6449, "rewards/accuracies": 0.625, "rewards/chosen": 0.22835348546504974, "rewards/margins": 0.219268798828125, "rewards/rejected": 0.009084686636924744, "step": 3177 }, { "epoch": 0.4914749661705007, "grad_norm": 4.168735027313232, "learning_rate": 4.645434757704205e-06, "logits/chosen": 8.805900573730469, "logits/rejected": 3.201615571975708, "logps/chosen": -207.68418884277344, "logps/rejected": -149.4184112548828, "loss": 0.7084, "rewards/accuracies": 0.75, "rewards/chosen": 0.14393866062164307, "rewards/margins": 0.10638261586427689, "rewards/rejected": 0.037556007504463196, "step": 3178 }, { "epoch": 0.49162961531026483, "grad_norm": 5.432239055633545, "learning_rate": 4.6451483560545314e-06, "logits/chosen": 5.614684104919434, "logits/rejected": 8.299880027770996, "logps/chosen": -233.6148681640625, "logps/rejected": -350.77996826171875, "loss": 0.6205, "rewards/accuracies": 0.625, "rewards/chosen": 0.2374475598335266, "rewards/margins": 0.3464166224002838, "rewards/rejected": -0.1089690625667572, "step": 3179 }, { "epoch": 0.491784264450029, "grad_norm": 5.511128902435303, "learning_rate": 4.644861954404858e-06, "logits/chosen": 8.252463340759277, "logits/rejected": 8.249320030212402, "logps/chosen": -331.7830505371094, "logps/rejected": -377.23779296875, "loss": 0.6224, "rewards/accuracies": 0.75, "rewards/chosen": 0.2984260320663452, "rewards/margins": 0.250837117433548, "rewards/rejected": 0.04758892208337784, "step": 3180 }, { "epoch": 0.49193891358979314, "grad_norm": 5.1030659675598145, "learning_rate": 4.644575552755185e-06, "logits/chosen": 5.189993858337402, "logits/rejected": -2.7467448711395264, "logps/chosen": -299.9344177246094, "logps/rejected": -180.72970581054688, "loss": 0.5793, "rewards/accuracies": 0.75, "rewards/chosen": 0.38742536306381226, "rewards/margins": 0.26406800746917725, "rewards/rejected": 0.123357392847538, "step": 3181 }, { "epoch": 0.49209356272955734, "grad_norm": 4.83907413482666, "learning_rate": 4.6442891511055105e-06, "logits/chosen": 11.629040718078613, "logits/rejected": 6.019209384918213, "logps/chosen": -326.4150695800781, "logps/rejected": -218.77005004882812, "loss": 0.4972, "rewards/accuracies": 0.875, "rewards/chosen": 0.4648973047733307, "rewards/margins": 0.5192514657974243, "rewards/rejected": -0.05435418337583542, "step": 3182 }, { "epoch": 0.4922482118693215, "grad_norm": 4.269180774688721, "learning_rate": 4.644002749455837e-06, "logits/chosen": 6.746547222137451, "logits/rejected": 2.949510335922241, "logps/chosen": -256.04364013671875, "logps/rejected": -221.83444213867188, "loss": 0.5363, "rewards/accuracies": 1.0, "rewards/chosen": 0.25615254044532776, "rewards/margins": 0.3992369472980499, "rewards/rejected": -0.14308443665504456, "step": 3183 }, { "epoch": 0.49240286100908565, "grad_norm": 2.823241949081421, "learning_rate": 4.643716347806164e-06, "logits/chosen": 9.455652236938477, "logits/rejected": 4.490926265716553, "logps/chosen": -201.31967163085938, "logps/rejected": -177.3224639892578, "loss": 0.4472, "rewards/accuracies": 0.75, "rewards/chosen": 0.23773860931396484, "rewards/margins": 0.7670780420303345, "rewards/rejected": -0.5293394327163696, "step": 3184 }, { "epoch": 0.4925575101488498, "grad_norm": 4.3402276039123535, "learning_rate": 4.6434299461564905e-06, "logits/chosen": 11.96626091003418, "logits/rejected": 1.8950477838516235, "logps/chosen": -319.28594970703125, "logps/rejected": -180.89744567871094, "loss": 0.4621, "rewards/accuracies": 0.875, "rewards/chosen": 0.2507093846797943, "rewards/margins": 0.6179460287094116, "rewards/rejected": -0.3672366142272949, "step": 3185 }, { "epoch": 0.49271215928861395, "grad_norm": 5.199821472167969, "learning_rate": 4.643143544506817e-06, "logits/chosen": 11.641837120056152, "logits/rejected": 8.420984268188477, "logps/chosen": -279.00006103515625, "logps/rejected": -258.0652160644531, "loss": 0.5754, "rewards/accuracies": 0.75, "rewards/chosen": 0.4567892551422119, "rewards/margins": 0.274496853351593, "rewards/rejected": 0.1822923719882965, "step": 3186 }, { "epoch": 0.4928668084283781, "grad_norm": 3.9842569828033447, "learning_rate": 4.642857142857144e-06, "logits/chosen": 10.269495964050293, "logits/rejected": 1.7518670558929443, "logps/chosen": -236.9564208984375, "logps/rejected": -165.79795837402344, "loss": 0.475, "rewards/accuracies": 1.0, "rewards/chosen": 0.30555787682533264, "rewards/margins": 0.6471152305603027, "rewards/rejected": -0.3415573537349701, "step": 3187 }, { "epoch": 0.49302145756814225, "grad_norm": 6.792036056518555, "learning_rate": 4.64257074120747e-06, "logits/chosen": 13.500274658203125, "logits/rejected": 9.345292091369629, "logps/chosen": -404.97894287109375, "logps/rejected": -392.1279602050781, "loss": 0.621, "rewards/accuracies": 0.75, "rewards/chosen": 0.6801363229751587, "rewards/margins": 0.20369091629981995, "rewards/rejected": 0.47644540667533875, "step": 3188 }, { "epoch": 0.49317610670790646, "grad_norm": 5.124135971069336, "learning_rate": 4.642284339557796e-06, "logits/chosen": 4.360634803771973, "logits/rejected": 4.045827865600586, "logps/chosen": -233.89944458007812, "logps/rejected": -305.7484130859375, "loss": 0.536, "rewards/accuracies": 0.625, "rewards/chosen": 0.38344401121139526, "rewards/margins": 0.4704377353191376, "rewards/rejected": -0.08699370920658112, "step": 3189 }, { "epoch": 0.4933307558476706, "grad_norm": 7.059565544128418, "learning_rate": 4.641997937908123e-06, "logits/chosen": 11.31144905090332, "logits/rejected": 9.760826110839844, "logps/chosen": -311.3580322265625, "logps/rejected": -326.622314453125, "loss": 0.9138, "rewards/accuracies": 0.375, "rewards/chosen": 0.26804038882255554, "rewards/margins": -0.30372685194015503, "rewards/rejected": 0.5717672109603882, "step": 3190 }, { "epoch": 0.49348540498743476, "grad_norm": 5.41403865814209, "learning_rate": 4.6417115362584495e-06, "logits/chosen": 9.400973320007324, "logits/rejected": 7.57589054107666, "logps/chosen": -223.21302795410156, "logps/rejected": -247.32305908203125, "loss": 0.622, "rewards/accuracies": 0.625, "rewards/chosen": 0.2883512079715729, "rewards/margins": 0.27353349328041077, "rewards/rejected": 0.014817751944065094, "step": 3191 }, { "epoch": 0.4936400541271989, "grad_norm": 7.883642196655273, "learning_rate": 4.641425134608775e-06, "logits/chosen": 6.131500720977783, "logits/rejected": 10.128080368041992, "logps/chosen": -331.9299621582031, "logps/rejected": -339.8541259765625, "loss": 0.8455, "rewards/accuracies": 0.25, "rewards/chosen": 0.32346320152282715, "rewards/margins": -0.18751011788845062, "rewards/rejected": 0.510973334312439, "step": 3192 }, { "epoch": 0.49379470326696306, "grad_norm": 4.2896623611450195, "learning_rate": 4.641138732959102e-06, "logits/chosen": 13.842658042907715, "logits/rejected": 10.419378280639648, "logps/chosen": -323.53662109375, "logps/rejected": -273.93701171875, "loss": 0.4599, "rewards/accuracies": 0.875, "rewards/chosen": 0.4829898774623871, "rewards/margins": 0.6405640244483948, "rewards/rejected": -0.15757417678833008, "step": 3193 }, { "epoch": 0.4939493524067272, "grad_norm": 4.135184288024902, "learning_rate": 4.640852331309429e-06, "logits/chosen": 12.738903999328613, "logits/rejected": 4.654675483703613, "logps/chosen": -346.5086364746094, "logps/rejected": -260.3005065917969, "loss": 0.4967, "rewards/accuracies": 0.75, "rewards/chosen": 0.36599692702293396, "rewards/margins": 0.7874415516853333, "rewards/rejected": -0.4214445948600769, "step": 3194 }, { "epoch": 0.4941040015464914, "grad_norm": 5.662519454956055, "learning_rate": 4.640565929659755e-06, "logits/chosen": 9.974206924438477, "logits/rejected": 9.11992359161377, "logps/chosen": -386.9579162597656, "logps/rejected": -398.414794921875, "loss": 0.5141, "rewards/accuracies": 0.875, "rewards/chosen": 0.3070283532142639, "rewards/margins": 0.45626509189605713, "rewards/rejected": -0.14923667907714844, "step": 3195 }, { "epoch": 0.4942586506862556, "grad_norm": 6.255503177642822, "learning_rate": 4.640279528010082e-06, "logits/chosen": 10.668098449707031, "logits/rejected": 7.344058036804199, "logps/chosen": -454.4828186035156, "logps/rejected": -437.55487060546875, "loss": 0.5768, "rewards/accuracies": 0.75, "rewards/chosen": 0.4442010819911957, "rewards/margins": 0.41534993052482605, "rewards/rejected": 0.02885115146636963, "step": 3196 }, { "epoch": 0.4944132998260197, "grad_norm": 5.766711235046387, "learning_rate": 4.639993126360408e-06, "logits/chosen": 11.857966423034668, "logits/rejected": 6.200558185577393, "logps/chosen": -309.28753662109375, "logps/rejected": -250.5145721435547, "loss": 0.6096, "rewards/accuracies": 0.625, "rewards/chosen": -0.05699712410569191, "rewards/margins": 0.23993702232837677, "rewards/rejected": -0.2969341576099396, "step": 3197 }, { "epoch": 0.4945679489657839, "grad_norm": 6.10903787612915, "learning_rate": 4.639706724710734e-06, "logits/chosen": 4.986599922180176, "logits/rejected": 6.530762195587158, "logps/chosen": -194.1409149169922, "logps/rejected": -208.06642150878906, "loss": 0.7198, "rewards/accuracies": 0.5, "rewards/chosen": 0.34600967168807983, "rewards/margins": -0.02386576682329178, "rewards/rejected": 0.369875431060791, "step": 3198 }, { "epoch": 0.49472259810554803, "grad_norm": 4.649064540863037, "learning_rate": 4.639420323061061e-06, "logits/chosen": 6.362103462219238, "logits/rejected": 8.3464937210083, "logps/chosen": -400.4107666015625, "logps/rejected": -313.1363525390625, "loss": 0.5027, "rewards/accuracies": 0.75, "rewards/chosen": 0.5108106136322021, "rewards/margins": 0.5735217332839966, "rewards/rejected": -0.0627111941576004, "step": 3199 }, { "epoch": 0.4948772472453122, "grad_norm": 4.862316608428955, "learning_rate": 4.639133921411388e-06, "logits/chosen": 9.388262748718262, "logits/rejected": 6.110173225402832, "logps/chosen": -237.64242553710938, "logps/rejected": -212.5164794921875, "loss": 0.589, "rewards/accuracies": 0.75, "rewards/chosen": 0.1735171377658844, "rewards/margins": 0.26268139481544495, "rewards/rejected": -0.08916424959897995, "step": 3200 }, { "epoch": 0.49503189638507633, "grad_norm": 5.882922649383545, "learning_rate": 4.638847519761714e-06, "logits/chosen": 9.987051010131836, "logits/rejected": 5.234467506408691, "logps/chosen": -330.4905090332031, "logps/rejected": -279.2564697265625, "loss": 0.7626, "rewards/accuracies": 0.5, "rewards/chosen": 0.011462017893791199, "rewards/margins": -0.028851233422756195, "rewards/rejected": 0.040313251316547394, "step": 3201 }, { "epoch": 0.49518654552484054, "grad_norm": 6.285687446594238, "learning_rate": 4.63856111811204e-06, "logits/chosen": 10.027826309204102, "logits/rejected": 8.446650505065918, "logps/chosen": -290.5234680175781, "logps/rejected": -292.0268249511719, "loss": 0.7152, "rewards/accuracies": 0.625, "rewards/chosen": -0.0013534873723983765, "rewards/margins": 0.042198844254016876, "rewards/rejected": -0.043552324175834656, "step": 3202 }, { "epoch": 0.4953411946646047, "grad_norm": 6.7544989585876465, "learning_rate": 4.638274716462367e-06, "logits/chosen": 12.055245399475098, "logits/rejected": 8.946240425109863, "logps/chosen": -200.11856079101562, "logps/rejected": -188.32176208496094, "loss": 0.8951, "rewards/accuracies": 0.25, "rewards/chosen": -0.2900071144104004, "rewards/margins": -0.3132253885269165, "rewards/rejected": 0.023218244314193726, "step": 3203 }, { "epoch": 0.49549584380436884, "grad_norm": 4.476693630218506, "learning_rate": 4.6379883148126935e-06, "logits/chosen": 12.43474006652832, "logits/rejected": 6.799140930175781, "logps/chosen": -425.25323486328125, "logps/rejected": -358.7868347167969, "loss": 0.5003, "rewards/accuracies": 0.875, "rewards/chosen": 0.46089911460876465, "rewards/margins": 0.7050890922546387, "rewards/rejected": -0.24418994784355164, "step": 3204 }, { "epoch": 0.495650492944133, "grad_norm": 4.653582572937012, "learning_rate": 4.63770191316302e-06, "logits/chosen": 16.08995246887207, "logits/rejected": 12.875402450561523, "logps/chosen": -155.30169677734375, "logps/rejected": -124.24064636230469, "loss": 0.6236, "rewards/accuracies": 0.75, "rewards/chosen": 0.17245537042617798, "rewards/margins": 0.19474969804286957, "rewards/rejected": -0.02229432761669159, "step": 3205 }, { "epoch": 0.49580514208389714, "grad_norm": 5.727625370025635, "learning_rate": 4.637415511513347e-06, "logits/chosen": 16.610809326171875, "logits/rejected": 13.747440338134766, "logps/chosen": -362.3914794921875, "logps/rejected": -314.66455078125, "loss": 0.7018, "rewards/accuracies": 0.375, "rewards/chosen": 0.21794280409812927, "rewards/margins": 0.04338674619793892, "rewards/rejected": 0.17455606162548065, "step": 3206 }, { "epoch": 0.4959597912236613, "grad_norm": 5.008937358856201, "learning_rate": 4.637129109863673e-06, "logits/chosen": 9.213848114013672, "logits/rejected": 12.177340507507324, "logps/chosen": -185.40524291992188, "logps/rejected": -249.67605590820312, "loss": 0.6861, "rewards/accuracies": 0.625, "rewards/chosen": 0.2635473310947418, "rewards/margins": 0.030886556953191757, "rewards/rejected": 0.23266077041625977, "step": 3207 }, { "epoch": 0.4961144403634255, "grad_norm": 9.41383171081543, "learning_rate": 4.636842708214e-06, "logits/chosen": 8.661371231079102, "logits/rejected": 10.524496078491211, "logps/chosen": -266.9195251464844, "logps/rejected": -356.602783203125, "loss": 0.7901, "rewards/accuracies": 0.5, "rewards/chosen": 0.0417538583278656, "rewards/margins": 0.020147234201431274, "rewards/rejected": 0.021606631577014923, "step": 3208 }, { "epoch": 0.49626908950318965, "grad_norm": 8.27237606048584, "learning_rate": 4.636556306564326e-06, "logits/chosen": 5.4649457931518555, "logits/rejected": 9.136170387268066, "logps/chosen": -283.4508056640625, "logps/rejected": -325.275390625, "loss": 0.8573, "rewards/accuracies": 0.375, "rewards/chosen": -0.07023191452026367, "rewards/margins": -0.14170588552951813, "rewards/rejected": 0.07147398591041565, "step": 3209 }, { "epoch": 0.4964237386429538, "grad_norm": 5.10293436050415, "learning_rate": 4.6362699049146525e-06, "logits/chosen": 9.117206573486328, "logits/rejected": 10.583005905151367, "logps/chosen": -253.10757446289062, "logps/rejected": -258.6315612792969, "loss": 0.7278, "rewards/accuracies": 0.625, "rewards/chosen": 0.12601019442081451, "rewards/margins": -0.016238290816545486, "rewards/rejected": 0.1422484815120697, "step": 3210 }, { "epoch": 0.49657838778271796, "grad_norm": 4.5167012214660645, "learning_rate": 4.635983503264979e-06, "logits/chosen": 11.27578353881836, "logits/rejected": 1.932388186454773, "logps/chosen": -255.1419219970703, "logps/rejected": -204.13180541992188, "loss": 0.6155, "rewards/accuracies": 0.625, "rewards/chosen": 0.3284369111061096, "rewards/margins": 0.3237423300743103, "rewards/rejected": 0.0046945735812187195, "step": 3211 }, { "epoch": 0.4967330369224821, "grad_norm": 4.347433090209961, "learning_rate": 4.635697101615306e-06, "logits/chosen": 7.651025772094727, "logits/rejected": 1.1188621520996094, "logps/chosen": -285.5934143066406, "logps/rejected": -179.17422485351562, "loss": 0.5516, "rewards/accuracies": 0.625, "rewards/chosen": 0.1909923404455185, "rewards/margins": 0.3861883878707886, "rewards/rejected": -0.19519607722759247, "step": 3212 }, { "epoch": 0.49688768606224626, "grad_norm": 7.997645378112793, "learning_rate": 4.6354106999656325e-06, "logits/chosen": 8.994619369506836, "logits/rejected": 7.7450337409973145, "logps/chosen": -333.566650390625, "logps/rejected": -257.5031433105469, "loss": 0.7936, "rewards/accuracies": 0.75, "rewards/chosen": 0.25449323654174805, "rewards/margins": -0.037055641412734985, "rewards/rejected": 0.29154887795448303, "step": 3213 }, { "epoch": 0.49704233520201047, "grad_norm": 7.42900276184082, "learning_rate": 4.635124298315959e-06, "logits/chosen": 11.444816589355469, "logits/rejected": 12.119190216064453, "logps/chosen": -322.518798828125, "logps/rejected": -293.8309631347656, "loss": 0.7328, "rewards/accuracies": 0.625, "rewards/chosen": 0.04818095266819, "rewards/margins": 0.07652051746845245, "rewards/rejected": -0.028339581564068794, "step": 3214 }, { "epoch": 0.4971969843417746, "grad_norm": 8.868440628051758, "learning_rate": 4.634837896666285e-06, "logits/chosen": 9.840538024902344, "logits/rejected": 2.1293115615844727, "logps/chosen": -237.59378051757812, "logps/rejected": -132.53939819335938, "loss": 0.6684, "rewards/accuracies": 0.625, "rewards/chosen": 0.053783148527145386, "rewards/margins": 0.21836845576763153, "rewards/rejected": -0.16458530724048615, "step": 3215 }, { "epoch": 0.49735163348153877, "grad_norm": 5.591974258422852, "learning_rate": 4.6345514950166116e-06, "logits/chosen": 11.873985290527344, "logits/rejected": 8.65584945678711, "logps/chosen": -371.9913330078125, "logps/rejected": -289.956787109375, "loss": 0.5866, "rewards/accuracies": 0.625, "rewards/chosen": -0.1580563485622406, "rewards/margins": 0.36342161893844604, "rewards/rejected": -0.5214779376983643, "step": 3216 }, { "epoch": 0.4975062826213029, "grad_norm": 6.619940280914307, "learning_rate": 4.634265093366938e-06, "logits/chosen": 4.399501800537109, "logits/rejected": 3.6414170265197754, "logps/chosen": -326.87677001953125, "logps/rejected": -306.3594665527344, "loss": 0.6275, "rewards/accuracies": 0.75, "rewards/chosen": 0.1975744366645813, "rewards/margins": 0.16901811957359314, "rewards/rejected": 0.028556296601891518, "step": 3217 }, { "epoch": 0.49766093176106707, "grad_norm": 7.987555503845215, "learning_rate": 4.633978691717265e-06, "logits/chosen": 9.403387069702148, "logits/rejected": 4.45906925201416, "logps/chosen": -250.8699188232422, "logps/rejected": -169.99916076660156, "loss": 0.6851, "rewards/accuracies": 0.5, "rewards/chosen": 0.15247373282909393, "rewards/margins": 0.11631850153207779, "rewards/rejected": 0.03615522384643555, "step": 3218 }, { "epoch": 0.4978155809008312, "grad_norm": 5.61239767074585, "learning_rate": 4.6336922900675915e-06, "logits/chosen": 11.234419822692871, "logits/rejected": 3.178260564804077, "logps/chosen": -260.33056640625, "logps/rejected": -203.0194854736328, "loss": 0.7129, "rewards/accuracies": 0.375, "rewards/chosen": 0.218332439661026, "rewards/margins": 0.044287826865911484, "rewards/rejected": 0.17404459416866302, "step": 3219 }, { "epoch": 0.4979702300405954, "grad_norm": 5.32751989364624, "learning_rate": 4.633405888417918e-06, "logits/chosen": 15.610055923461914, "logits/rejected": 6.273090362548828, "logps/chosen": -328.39111328125, "logps/rejected": -183.8865966796875, "loss": 0.6169, "rewards/accuracies": 0.625, "rewards/chosen": 0.35059186816215515, "rewards/margins": 0.274660587310791, "rewards/rejected": 0.07593126595020294, "step": 3220 }, { "epoch": 0.4981248791803596, "grad_norm": 5.928618431091309, "learning_rate": 4.633119486768245e-06, "logits/chosen": 8.828058242797852, "logits/rejected": 8.824657440185547, "logps/chosen": -220.4580078125, "logps/rejected": -187.80079650878906, "loss": 0.8025, "rewards/accuracies": 0.5, "rewards/chosen": -0.0530797615647316, "rewards/margins": -0.08219853788614273, "rewards/rejected": 0.029118768870830536, "step": 3221 }, { "epoch": 0.49827952832012373, "grad_norm": 7.4839768409729, "learning_rate": 4.632833085118571e-06, "logits/chosen": 7.095251560211182, "logits/rejected": 10.842124938964844, "logps/chosen": -244.18775939941406, "logps/rejected": -382.8251647949219, "loss": 0.9543, "rewards/accuracies": 0.25, "rewards/chosen": 0.0409204363822937, "rewards/margins": -0.2923834025859833, "rewards/rejected": 0.3333038091659546, "step": 3222 }, { "epoch": 0.4984341774598879, "grad_norm": 6.719126224517822, "learning_rate": 4.632546683468897e-06, "logits/chosen": 10.39794921875, "logits/rejected": 6.886472225189209, "logps/chosen": -375.6524353027344, "logps/rejected": -253.48709106445312, "loss": 0.6206, "rewards/accuracies": 0.625, "rewards/chosen": 0.22531074285507202, "rewards/margins": 0.44207724928855896, "rewards/rejected": -0.21676646173000336, "step": 3223 }, { "epoch": 0.49858882659965204, "grad_norm": 3.482813835144043, "learning_rate": 4.632260281819224e-06, "logits/chosen": 12.072370529174805, "logits/rejected": 9.614316940307617, "logps/chosen": -145.61524963378906, "logps/rejected": -108.59986877441406, "loss": 0.6405, "rewards/accuracies": 0.625, "rewards/chosen": 0.2467600405216217, "rewards/margins": 0.18540742993354797, "rewards/rejected": 0.06135261058807373, "step": 3224 }, { "epoch": 0.4987434757394162, "grad_norm": 5.7496185302734375, "learning_rate": 4.6319738801695506e-06, "logits/chosen": 11.51933765411377, "logits/rejected": 10.159708976745605, "logps/chosen": -283.84857177734375, "logps/rejected": -276.3330993652344, "loss": 0.7419, "rewards/accuracies": 0.5, "rewards/chosen": 0.2894914746284485, "rewards/margins": -0.02282160520553589, "rewards/rejected": 0.3123130798339844, "step": 3225 }, { "epoch": 0.49889812487918034, "grad_norm": 6.08125638961792, "learning_rate": 4.631687478519876e-06, "logits/chosen": 13.310603141784668, "logits/rejected": 5.548508167266846, "logps/chosen": -322.26513671875, "logps/rejected": -239.59014892578125, "loss": 0.6613, "rewards/accuracies": 0.5, "rewards/chosen": 0.2916352450847626, "rewards/margins": 0.08447933197021484, "rewards/rejected": 0.20715588331222534, "step": 3226 }, { "epoch": 0.49905277401894454, "grad_norm": 7.443875789642334, "learning_rate": 4.631401076870203e-06, "logits/chosen": 12.941253662109375, "logits/rejected": 2.983773708343506, "logps/chosen": -415.9276428222656, "logps/rejected": -337.6793518066406, "loss": 0.6947, "rewards/accuracies": 0.625, "rewards/chosen": 0.5055520534515381, "rewards/margins": 0.03528820723295212, "rewards/rejected": 0.47026389837265015, "step": 3227 }, { "epoch": 0.4992074231587087, "grad_norm": 5.3274617195129395, "learning_rate": 4.63111467522053e-06, "logits/chosen": 12.25723648071289, "logits/rejected": 8.610937118530273, "logps/chosen": -294.1002502441406, "logps/rejected": -237.87545776367188, "loss": 0.5788, "rewards/accuracies": 0.75, "rewards/chosen": 0.44150614738464355, "rewards/margins": 0.3152434825897217, "rewards/rejected": 0.12626267969608307, "step": 3228 }, { "epoch": 0.49936207229847285, "grad_norm": 6.431368350982666, "learning_rate": 4.630828273570856e-06, "logits/chosen": -0.9038457870483398, "logits/rejected": 7.840060234069824, "logps/chosen": -206.0345458984375, "logps/rejected": -259.9377136230469, "loss": 0.8315, "rewards/accuracies": 0.375, "rewards/chosen": -0.2623577117919922, "rewards/margins": -0.23195143043994904, "rewards/rejected": -0.030406277626752853, "step": 3229 }, { "epoch": 0.499516721438237, "grad_norm": 5.165670871734619, "learning_rate": 4.630541871921182e-06, "logits/chosen": 12.120144844055176, "logits/rejected": 6.662358283996582, "logps/chosen": -279.2125244140625, "logps/rejected": -231.15696716308594, "loss": 0.6535, "rewards/accuracies": 0.625, "rewards/chosen": 0.16808763146400452, "rewards/margins": 0.2122688889503479, "rewards/rejected": -0.044181257486343384, "step": 3230 }, { "epoch": 0.49967137057800115, "grad_norm": 6.1574625968933105, "learning_rate": 4.630255470271509e-06, "logits/chosen": 6.7483391761779785, "logits/rejected": 8.390557289123535, "logps/chosen": -319.6745910644531, "logps/rejected": -254.29249572753906, "loss": 0.6291, "rewards/accuracies": 0.75, "rewards/chosen": 0.4285506308078766, "rewards/margins": 0.1769622564315796, "rewards/rejected": 0.2515884041786194, "step": 3231 }, { "epoch": 0.4998260197177653, "grad_norm": 5.711991310119629, "learning_rate": 4.6299690686218354e-06, "logits/chosen": 6.523872375488281, "logits/rejected": 4.62911319732666, "logps/chosen": -248.3011474609375, "logps/rejected": -189.9203338623047, "loss": 0.6659, "rewards/accuracies": 0.5, "rewards/chosen": 0.023024767637252808, "rewards/margins": 0.10130847990512848, "rewards/rejected": -0.07828371226787567, "step": 3232 }, { "epoch": 0.49998066885752945, "grad_norm": 3.901844024658203, "learning_rate": 4.629682666972162e-06, "logits/chosen": 11.681109428405762, "logits/rejected": 6.818157196044922, "logps/chosen": -258.7403564453125, "logps/rejected": -197.45782470703125, "loss": 0.5584, "rewards/accuracies": 0.625, "rewards/chosen": 0.4649995267391205, "rewards/margins": 0.3458278775215149, "rewards/rejected": 0.1191716194152832, "step": 3233 }, { "epoch": 0.5001353179972936, "grad_norm": 4.7333173751831055, "learning_rate": 4.629396265322489e-06, "logits/chosen": 10.792336463928223, "logits/rejected": 7.659724712371826, "logps/chosen": -287.09613037109375, "logps/rejected": -233.54824829101562, "loss": 0.6903, "rewards/accuracies": 0.625, "rewards/chosen": 0.009859800338745117, "rewards/margins": 0.02727828174829483, "rewards/rejected": -0.017418459057807922, "step": 3234 }, { "epoch": 0.5002899671370578, "grad_norm": 5.64075231552124, "learning_rate": 4.6291098636728145e-06, "logits/chosen": 10.718565940856934, "logits/rejected": 8.840188980102539, "logps/chosen": -301.17474365234375, "logps/rejected": -238.9667510986328, "loss": 0.7236, "rewards/accuracies": 0.5, "rewards/chosen": 0.09668870270252228, "rewards/margins": -0.04182238504290581, "rewards/rejected": 0.13851109147071838, "step": 3235 }, { "epoch": 0.5004446162768219, "grad_norm": 4.832586765289307, "learning_rate": 4.628823462023141e-06, "logits/chosen": 7.720620632171631, "logits/rejected": 6.52126407623291, "logps/chosen": -214.824462890625, "logps/rejected": -195.15853881835938, "loss": 0.6475, "rewards/accuracies": 0.625, "rewards/chosen": 0.30634641647338867, "rewards/margins": 0.14172527194023132, "rewards/rejected": 0.16462109982967377, "step": 3236 }, { "epoch": 0.5005992654165862, "grad_norm": 5.773725509643555, "learning_rate": 4.628537060373468e-06, "logits/chosen": 6.365090370178223, "logits/rejected": 5.241100311279297, "logps/chosen": -196.10340881347656, "logps/rejected": -185.79940795898438, "loss": 0.74, "rewards/accuracies": 0.375, "rewards/chosen": 0.029765471816062927, "rewards/margins": 0.015344507992267609, "rewards/rejected": 0.014420978724956512, "step": 3237 }, { "epoch": 0.5007539145563503, "grad_norm": 4.231893062591553, "learning_rate": 4.6282506587237945e-06, "logits/chosen": 6.115300178527832, "logits/rejected": 1.6248770952224731, "logps/chosen": -308.56719970703125, "logps/rejected": -176.61923217773438, "loss": 0.6071, "rewards/accuracies": 0.5, "rewards/chosen": 0.5727317333221436, "rewards/margins": 0.4009518027305603, "rewards/rejected": 0.17177993059158325, "step": 3238 }, { "epoch": 0.5009085636961145, "grad_norm": 4.538451671600342, "learning_rate": 4.627964257074121e-06, "logits/chosen": 1.1103272438049316, "logits/rejected": 2.763864755630493, "logps/chosen": -202.9613037109375, "logps/rejected": -152.6993865966797, "loss": 0.7266, "rewards/accuracies": 0.625, "rewards/chosen": 0.05628015100955963, "rewards/margins": -0.0032441522926092148, "rewards/rejected": 0.05952432006597519, "step": 3239 }, { "epoch": 0.5010632128358786, "grad_norm": 7.519440174102783, "learning_rate": 4.627677855424448e-06, "logits/chosen": 9.228011131286621, "logits/rejected": 3.3821310997009277, "logps/chosen": -324.1173095703125, "logps/rejected": -292.279541015625, "loss": 0.6601, "rewards/accuracies": 0.625, "rewards/chosen": 0.44543153047561646, "rewards/margins": 0.13470172882080078, "rewards/rejected": 0.3107297718524933, "step": 3240 }, { "epoch": 0.5012178619756428, "grad_norm": 4.642898082733154, "learning_rate": 4.6273914537747744e-06, "logits/chosen": 7.2239789962768555, "logits/rejected": 4.543787002563477, "logps/chosen": -243.32937622070312, "logps/rejected": -190.94973754882812, "loss": 0.5622, "rewards/accuracies": 0.625, "rewards/chosen": 0.29530858993530273, "rewards/margins": 0.3539070785045624, "rewards/rejected": -0.05859847366809845, "step": 3241 }, { "epoch": 0.5013725111154069, "grad_norm": 5.249964714050293, "learning_rate": 4.6271050521251e-06, "logits/chosen": 12.115538597106934, "logits/rejected": 5.638140678405762, "logps/chosen": -313.64910888671875, "logps/rejected": -287.5634765625, "loss": 0.5777, "rewards/accuracies": 0.875, "rewards/chosen": 0.08723636716604233, "rewards/margins": 0.554078221321106, "rewards/rejected": -0.46684184670448303, "step": 3242 }, { "epoch": 0.5015271602551711, "grad_norm": 6.670085906982422, "learning_rate": 4.626818650475427e-06, "logits/chosen": 3.2348008155822754, "logits/rejected": 3.5412447452545166, "logps/chosen": -223.87423706054688, "logps/rejected": -235.9828643798828, "loss": 0.6999, "rewards/accuracies": 0.5, "rewards/chosen": 0.3629200756549835, "rewards/margins": 0.028505805879831314, "rewards/rejected": 0.3344142436981201, "step": 3243 }, { "epoch": 0.5016818093949352, "grad_norm": 5.689203262329102, "learning_rate": 4.6265322488257535e-06, "logits/chosen": 8.295392990112305, "logits/rejected": 6.412006855010986, "logps/chosen": -209.7498321533203, "logps/rejected": -158.74578857421875, "loss": 0.64, "rewards/accuracies": 0.625, "rewards/chosen": 0.15107396245002747, "rewards/margins": 0.18129417300224304, "rewards/rejected": -0.03022022545337677, "step": 3244 }, { "epoch": 0.5018364585346994, "grad_norm": 9.3799467086792, "learning_rate": 4.62624584717608e-06, "logits/chosen": 6.568096160888672, "logits/rejected": 5.438168048858643, "logps/chosen": -303.2627868652344, "logps/rejected": -278.31903076171875, "loss": 0.7963, "rewards/accuracies": 0.25, "rewards/chosen": 0.13152244687080383, "rewards/margins": -0.10410839319229126, "rewards/rejected": 0.2356308400630951, "step": 3245 }, { "epoch": 0.5019911076744635, "grad_norm": 4.707427024841309, "learning_rate": 4.625959445526407e-06, "logits/chosen": 4.344818115234375, "logits/rejected": 7.874292850494385, "logps/chosen": -207.7290802001953, "logps/rejected": -212.3821258544922, "loss": 0.7722, "rewards/accuracies": 0.375, "rewards/chosen": -0.10631751269102097, "rewards/margins": -0.10347947478294373, "rewards/rejected": -0.002838030457496643, "step": 3246 }, { "epoch": 0.5021457568142277, "grad_norm": 4.798895835876465, "learning_rate": 4.6256730438767335e-06, "logits/chosen": 12.562665939331055, "logits/rejected": 7.572888374328613, "logps/chosen": -337.30364990234375, "logps/rejected": -263.802978515625, "loss": 0.6221, "rewards/accuracies": 0.5, "rewards/chosen": 0.5721014738082886, "rewards/margins": 0.22942838072776794, "rewards/rejected": 0.342673122882843, "step": 3247 }, { "epoch": 0.5023004059539918, "grad_norm": 4.289575576782227, "learning_rate": 4.625386642227059e-06, "logits/chosen": 5.521953105926514, "logits/rejected": 4.95518684387207, "logps/chosen": -215.48492431640625, "logps/rejected": -236.41635131835938, "loss": 0.6166, "rewards/accuracies": 0.625, "rewards/chosen": 0.2674950361251831, "rewards/margins": 0.18388274312019348, "rewards/rejected": 0.08361230790615082, "step": 3248 }, { "epoch": 0.502455055093756, "grad_norm": 6.433024883270264, "learning_rate": 4.625100240577386e-06, "logits/chosen": 12.959646224975586, "logits/rejected": 8.154869079589844, "logps/chosen": -374.6189880371094, "logps/rejected": -339.1207580566406, "loss": 0.6991, "rewards/accuracies": 0.5, "rewards/chosen": 0.45588111877441406, "rewards/margins": 0.07262136042118073, "rewards/rejected": 0.38325977325439453, "step": 3249 }, { "epoch": 0.5026097042335202, "grad_norm": 7.297660827636719, "learning_rate": 4.624813838927713e-06, "logits/chosen": 11.110713958740234, "logits/rejected": 9.868202209472656, "logps/chosen": -401.1929931640625, "logps/rejected": -391.1473083496094, "loss": 0.9218, "rewards/accuracies": 0.5, "rewards/chosen": 0.4221614897251129, "rewards/margins": -0.2851749360561371, "rewards/rejected": 0.70733642578125, "step": 3250 }, { "epoch": 0.5027643533732844, "grad_norm": 5.466851234436035, "learning_rate": 4.624527437278039e-06, "logits/chosen": 8.885056495666504, "logits/rejected": 9.736762046813965, "logps/chosen": -208.41220092773438, "logps/rejected": -236.73855590820312, "loss": 0.7338, "rewards/accuracies": 0.625, "rewards/chosen": 0.3178447186946869, "rewards/margins": 0.09304095804691315, "rewards/rejected": 0.22480377554893494, "step": 3251 }, { "epoch": 0.5029190025130486, "grad_norm": 3.7154295444488525, "learning_rate": 4.624241035628366e-06, "logits/chosen": 8.286853790283203, "logits/rejected": 3.569833278656006, "logps/chosen": -286.64752197265625, "logps/rejected": -210.27001953125, "loss": 0.5011, "rewards/accuracies": 0.625, "rewards/chosen": 0.5680737495422363, "rewards/margins": 0.6208111047744751, "rewards/rejected": -0.05273742228746414, "step": 3252 }, { "epoch": 0.5030736516528127, "grad_norm": 6.122959136962891, "learning_rate": 4.6239546339786925e-06, "logits/chosen": 9.299665451049805, "logits/rejected": 10.038492202758789, "logps/chosen": -141.4488067626953, "logps/rejected": -174.90399169921875, "loss": 0.7626, "rewards/accuracies": 0.375, "rewards/chosen": -0.1682216227054596, "rewards/margins": -0.08915533125400543, "rewards/rejected": -0.07906626909971237, "step": 3253 }, { "epoch": 0.5032283007925769, "grad_norm": 3.8736491203308105, "learning_rate": 4.623668232329019e-06, "logits/chosen": 8.56025505065918, "logits/rejected": 4.659777641296387, "logps/chosen": -242.66452026367188, "logps/rejected": -202.87985229492188, "loss": 0.5057, "rewards/accuracies": 0.75, "rewards/chosen": -0.04008864611387253, "rewards/margins": 0.5544979572296143, "rewards/rejected": -0.5945866107940674, "step": 3254 }, { "epoch": 0.503382949932341, "grad_norm": 6.088406085968018, "learning_rate": 4.623381830679345e-06, "logits/chosen": 6.944226264953613, "logits/rejected": 3.575235366821289, "logps/chosen": -218.08468627929688, "logps/rejected": -215.81625366210938, "loss": 0.7874, "rewards/accuracies": 0.5, "rewards/chosen": 0.17197920382022858, "rewards/margins": -0.05819801986217499, "rewards/rejected": 0.23017722368240356, "step": 3255 }, { "epoch": 0.5035375990721052, "grad_norm": 4.040688991546631, "learning_rate": 4.623095429029672e-06, "logits/chosen": 12.327072143554688, "logits/rejected": 4.876599311828613, "logps/chosen": -188.86550903320312, "logps/rejected": -155.1163787841797, "loss": 0.662, "rewards/accuracies": 0.5, "rewards/chosen": 0.19586950540542603, "rewards/margins": 0.11446381360292435, "rewards/rejected": 0.08140568435192108, "step": 3256 }, { "epoch": 0.5036922482118693, "grad_norm": 4.168957233428955, "learning_rate": 4.622809027379998e-06, "logits/chosen": 9.042384147644043, "logits/rejected": 6.491412162780762, "logps/chosen": -215.65042114257812, "logps/rejected": -173.77108764648438, "loss": 0.6335, "rewards/accuracies": 0.5, "rewards/chosen": 0.39675989747047424, "rewards/margins": 0.29335811734199524, "rewards/rejected": 0.10340175777673721, "step": 3257 }, { "epoch": 0.5038468973516335, "grad_norm": 5.800944805145264, "learning_rate": 4.622522625730325e-06, "logits/chosen": 11.838663101196289, "logits/rejected": 11.668035507202148, "logps/chosen": -265.31072998046875, "logps/rejected": -239.455810546875, "loss": 0.6991, "rewards/accuracies": 0.5, "rewards/chosen": 0.07556433975696564, "rewards/margins": 0.046987056732177734, "rewards/rejected": 0.028577271848917007, "step": 3258 }, { "epoch": 0.5040015464913976, "grad_norm": 9.113494873046875, "learning_rate": 4.622236224080652e-06, "logits/chosen": 5.679702281951904, "logits/rejected": 8.43759822845459, "logps/chosen": -347.2708740234375, "logps/rejected": -365.0799255371094, "loss": 0.6248, "rewards/accuracies": 0.75, "rewards/chosen": 0.7220704555511475, "rewards/margins": 0.165435791015625, "rewards/rejected": 0.5566346049308777, "step": 3259 }, { "epoch": 0.5041561956311618, "grad_norm": 35.40195083618164, "learning_rate": 4.621949822430977e-06, "logits/chosen": 9.104483604431152, "logits/rejected": 7.996103763580322, "logps/chosen": -295.86627197265625, "logps/rejected": -294.913818359375, "loss": 0.7372, "rewards/accuracies": 0.375, "rewards/chosen": 0.21118071675300598, "rewards/margins": 0.10836414247751236, "rewards/rejected": 0.10281655192375183, "step": 3260 }, { "epoch": 0.5043108447709259, "grad_norm": 4.244663715362549, "learning_rate": 4.621663420781304e-06, "logits/chosen": 6.868988513946533, "logits/rejected": 6.118412494659424, "logps/chosen": -130.43975830078125, "logps/rejected": -125.55975341796875, "loss": 0.639, "rewards/accuracies": 0.75, "rewards/chosen": -0.020024150609970093, "rewards/margins": 0.1453128159046173, "rewards/rejected": -0.1653369963169098, "step": 3261 }, { "epoch": 0.5044654939106902, "grad_norm": 30.666452407836914, "learning_rate": 4.621377019131631e-06, "logits/chosen": 4.70530366897583, "logits/rejected": 6.695226192474365, "logps/chosen": -281.1080627441406, "logps/rejected": -340.7483215332031, "loss": 0.6458, "rewards/accuracies": 0.625, "rewards/chosen": 0.5474357604980469, "rewards/margins": 0.12058459967374802, "rewards/rejected": 0.4268511235713959, "step": 3262 }, { "epoch": 0.5046201430504543, "grad_norm": 4.437601566314697, "learning_rate": 4.621090617481957e-06, "logits/chosen": 9.145987510681152, "logits/rejected": 4.361265182495117, "logps/chosen": -293.39300537109375, "logps/rejected": -210.44580078125, "loss": 0.6133, "rewards/accuracies": 0.75, "rewards/chosen": 0.4394845962524414, "rewards/margins": 0.18916437029838562, "rewards/rejected": 0.2503202259540558, "step": 3263 }, { "epoch": 0.5047747921902185, "grad_norm": 6.351206302642822, "learning_rate": 4.620804215832283e-06, "logits/chosen": 4.6507720947265625, "logits/rejected": 11.356451034545898, "logps/chosen": -210.76272583007812, "logps/rejected": -263.29034423828125, "loss": 0.8746, "rewards/accuracies": 0.375, "rewards/chosen": 0.09049965441226959, "rewards/margins": -0.3004557490348816, "rewards/rejected": 0.3909553289413452, "step": 3264 }, { "epoch": 0.5049294413299826, "grad_norm": 4.6295905113220215, "learning_rate": 4.62051781418261e-06, "logits/chosen": 10.884513854980469, "logits/rejected": 7.245157241821289, "logps/chosen": -254.57241821289062, "logps/rejected": -186.95785522460938, "loss": 0.5839, "rewards/accuracies": 0.625, "rewards/chosen": 0.6100372076034546, "rewards/margins": 0.29260027408599854, "rewards/rejected": 0.31743699312210083, "step": 3265 }, { "epoch": 0.5050840904697468, "grad_norm": 4.323636054992676, "learning_rate": 4.6202314125329365e-06, "logits/chosen": 6.880917549133301, "logits/rejected": -0.4303410053253174, "logps/chosen": -194.7524871826172, "logps/rejected": -135.21209716796875, "loss": 0.6091, "rewards/accuracies": 0.75, "rewards/chosen": 0.27044790983200073, "rewards/margins": 0.22092950344085693, "rewards/rejected": 0.04951842129230499, "step": 3266 }, { "epoch": 0.5052387396095109, "grad_norm": 5.352099418640137, "learning_rate": 4.619945010883263e-06, "logits/chosen": 1.3101264238357544, "logits/rejected": 7.754352569580078, "logps/chosen": -173.9705810546875, "logps/rejected": -257.4837646484375, "loss": 0.606, "rewards/accuracies": 0.625, "rewards/chosen": 0.3631083369255066, "rewards/margins": 0.26010382175445557, "rewards/rejected": 0.1030045598745346, "step": 3267 }, { "epoch": 0.5053933887492751, "grad_norm": 5.960330963134766, "learning_rate": 4.619658609233589e-06, "logits/chosen": 12.323392868041992, "logits/rejected": 9.421552658081055, "logps/chosen": -248.0334014892578, "logps/rejected": -188.06338500976562, "loss": 0.6137, "rewards/accuracies": 0.625, "rewards/chosen": 0.5610530972480774, "rewards/margins": 0.2127675712108612, "rewards/rejected": 0.3482855260372162, "step": 3268 }, { "epoch": 0.5055480378890392, "grad_norm": 4.314411640167236, "learning_rate": 4.6193722075839156e-06, "logits/chosen": 7.391791343688965, "logits/rejected": 8.912692070007324, "logps/chosen": -163.3362579345703, "logps/rejected": -181.1661376953125, "loss": 0.7793, "rewards/accuracies": 0.625, "rewards/chosen": -0.07061205059289932, "rewards/margins": -0.0177382230758667, "rewards/rejected": -0.052873801440000534, "step": 3269 }, { "epoch": 0.5057026870288034, "grad_norm": 4.295561790466309, "learning_rate": 4.619085805934242e-06, "logits/chosen": 11.544595718383789, "logits/rejected": 0.17121952772140503, "logps/chosen": -272.5765380859375, "logps/rejected": -145.60745239257812, "loss": 0.5283, "rewards/accuracies": 0.75, "rewards/chosen": 0.5199115872383118, "rewards/margins": 0.4205535650253296, "rewards/rejected": 0.09935805201530457, "step": 3270 }, { "epoch": 0.5058573361685675, "grad_norm": 5.221998691558838, "learning_rate": 4.618799404284569e-06, "logits/chosen": 9.329689025878906, "logits/rejected": 9.840164184570312, "logps/chosen": -291.57855224609375, "logps/rejected": -225.47178649902344, "loss": 0.5341, "rewards/accuracies": 0.625, "rewards/chosen": 0.6414833664894104, "rewards/margins": 0.5150984525680542, "rewards/rejected": 0.12638498842716217, "step": 3271 }, { "epoch": 0.5060119853083317, "grad_norm": 4.863838195800781, "learning_rate": 4.6185130026348955e-06, "logits/chosen": 9.67735767364502, "logits/rejected": 9.155167579650879, "logps/chosen": -229.3268585205078, "logps/rejected": -150.3831787109375, "loss": 0.7025, "rewards/accuracies": 0.625, "rewards/chosen": 0.2899929881095886, "rewards/margins": 0.037098973989486694, "rewards/rejected": 0.25289401412010193, "step": 3272 }, { "epoch": 0.5061666344480958, "grad_norm": 6.14678955078125, "learning_rate": 4.618226600985222e-06, "logits/chosen": 15.29581069946289, "logits/rejected": 16.354097366333008, "logps/chosen": -238.01573181152344, "logps/rejected": -286.6297607421875, "loss": 0.6868, "rewards/accuracies": 0.625, "rewards/chosen": 0.33115795254707336, "rewards/margins": 0.09647694230079651, "rewards/rejected": 0.23468102514743805, "step": 3273 }, { "epoch": 0.50632128358786, "grad_norm": 4.934665679931641, "learning_rate": 4.617940199335549e-06, "logits/chosen": 6.422244071960449, "logits/rejected": 7.959965705871582, "logps/chosen": -263.961669921875, "logps/rejected": -270.8356018066406, "loss": 0.6776, "rewards/accuracies": 0.625, "rewards/chosen": 0.5538961887359619, "rewards/margins": 0.11184568703174591, "rewards/rejected": 0.4420505166053772, "step": 3274 }, { "epoch": 0.5064759327276243, "grad_norm": 7.779551029205322, "learning_rate": 4.617653797685875e-06, "logits/chosen": 10.395113945007324, "logits/rejected": 5.497433662414551, "logps/chosen": -435.391845703125, "logps/rejected": -340.43768310546875, "loss": 0.6619, "rewards/accuracies": 0.5, "rewards/chosen": 0.5168371200561523, "rewards/margins": 0.2224440574645996, "rewards/rejected": 0.29439306259155273, "step": 3275 }, { "epoch": 0.5066305818673884, "grad_norm": 4.753671646118164, "learning_rate": 4.617367396036201e-06, "logits/chosen": 11.554730415344238, "logits/rejected": 8.039701461791992, "logps/chosen": -244.26229858398438, "logps/rejected": -231.60797119140625, "loss": 0.5446, "rewards/accuracies": 0.875, "rewards/chosen": 0.5520535707473755, "rewards/margins": 0.40712639689445496, "rewards/rejected": 0.1449272185564041, "step": 3276 }, { "epoch": 0.5067852310071526, "grad_norm": 4.9572978019714355, "learning_rate": 4.617080994386528e-06, "logits/chosen": 7.847681522369385, "logits/rejected": 4.94571590423584, "logps/chosen": -216.69163513183594, "logps/rejected": -176.18067932128906, "loss": 0.6321, "rewards/accuracies": 0.625, "rewards/chosen": 0.5498632192611694, "rewards/margins": 0.2688031494617462, "rewards/rejected": 0.2810600996017456, "step": 3277 }, { "epoch": 0.5069398801469167, "grad_norm": 8.32699966430664, "learning_rate": 4.6167945927368546e-06, "logits/chosen": 11.8165283203125, "logits/rejected": 14.487762451171875, "logps/chosen": -217.32606506347656, "logps/rejected": -264.3685302734375, "loss": 0.766, "rewards/accuracies": 0.5, "rewards/chosen": 0.20697329938411713, "rewards/margins": 0.015448138117790222, "rewards/rejected": 0.19152513146400452, "step": 3278 }, { "epoch": 0.5070945292866809, "grad_norm": 10.894147872924805, "learning_rate": 4.616508191087181e-06, "logits/chosen": 16.20415496826172, "logits/rejected": 15.814085006713867, "logps/chosen": -353.5644836425781, "logps/rejected": -269.7220458984375, "loss": 0.7998, "rewards/accuracies": 0.75, "rewards/chosen": 0.3779914975166321, "rewards/margins": 0.05745086073875427, "rewards/rejected": 0.3205406069755554, "step": 3279 }, { "epoch": 0.507249178426445, "grad_norm": 4.648820400238037, "learning_rate": 4.616221789437508e-06, "logits/chosen": 13.635940551757812, "logits/rejected": 5.431562423706055, "logps/chosen": -266.88037109375, "logps/rejected": -215.22857666015625, "loss": 0.4517, "rewards/accuracies": 1.0, "rewards/chosen": 0.6006661653518677, "rewards/margins": 0.6850570440292358, "rewards/rejected": -0.08439093083143234, "step": 3280 }, { "epoch": 0.5074038275662092, "grad_norm": 5.254561424255371, "learning_rate": 4.615935387787834e-06, "logits/chosen": 12.200284957885742, "logits/rejected": 10.553935050964355, "logps/chosen": -267.6666259765625, "logps/rejected": -237.68408203125, "loss": 0.6642, "rewards/accuracies": 0.625, "rewards/chosen": 0.5550568103790283, "rewards/margins": 0.20258420705795288, "rewards/rejected": 0.35247254371643066, "step": 3281 }, { "epoch": 0.5075584767059733, "grad_norm": 5.4262237548828125, "learning_rate": 4.61564898613816e-06, "logits/chosen": 14.365741729736328, "logits/rejected": 8.671972274780273, "logps/chosen": -278.0113220214844, "logps/rejected": -161.2666473388672, "loss": 0.6495, "rewards/accuracies": 0.5, "rewards/chosen": 0.5143827199935913, "rewards/margins": 0.23593389987945557, "rewards/rejected": 0.27844882011413574, "step": 3282 }, { "epoch": 0.5077131258457375, "grad_norm": 5.696606159210205, "learning_rate": 4.615362584488487e-06, "logits/chosen": 1.919853925704956, "logits/rejected": 3.4774177074432373, "logps/chosen": -281.231689453125, "logps/rejected": -219.40054321289062, "loss": 0.759, "rewards/accuracies": 0.375, "rewards/chosen": 0.24876460433006287, "rewards/margins": -0.013046935200691223, "rewards/rejected": 0.2618115544319153, "step": 3283 }, { "epoch": 0.5078677749855016, "grad_norm": 3.7839126586914062, "learning_rate": 4.615076182838814e-06, "logits/chosen": 9.875275611877441, "logits/rejected": 6.973371505737305, "logps/chosen": -203.93936157226562, "logps/rejected": -180.84817504882812, "loss": 0.5449, "rewards/accuracies": 0.75, "rewards/chosen": 0.3947101831436157, "rewards/margins": 0.46642735600471497, "rewards/rejected": -0.07171717286109924, "step": 3284 }, { "epoch": 0.5080224241252658, "grad_norm": 6.045161724090576, "learning_rate": 4.61478978118914e-06, "logits/chosen": 5.759314060211182, "logits/rejected": 11.320684432983398, "logps/chosen": -226.06271362304688, "logps/rejected": -297.14166259765625, "loss": 0.7682, "rewards/accuracies": 0.5, "rewards/chosen": 0.37793922424316406, "rewards/margins": 0.025321558117866516, "rewards/rejected": 0.35261765122413635, "step": 3285 }, { "epoch": 0.5081770732650299, "grad_norm": 7.158720016479492, "learning_rate": 4.614503379539467e-06, "logits/chosen": 9.620000839233398, "logits/rejected": 3.800607681274414, "logps/chosen": -236.81251525878906, "logps/rejected": -180.69149780273438, "loss": 0.6608, "rewards/accuracies": 0.625, "rewards/chosen": 0.020839691162109375, "rewards/margins": 0.1508420705795288, "rewards/rejected": -0.13000234961509705, "step": 3286 }, { "epoch": 0.5083317224047941, "grad_norm": 6.190760612487793, "learning_rate": 4.614216977889794e-06, "logits/chosen": 7.540580749511719, "logits/rejected": 0.6460055112838745, "logps/chosen": -337.2153625488281, "logps/rejected": -218.33355712890625, "loss": 0.6894, "rewards/accuracies": 0.375, "rewards/chosen": 0.19868355989456177, "rewards/margins": 0.17706742882728577, "rewards/rejected": 0.021616123616695404, "step": 3287 }, { "epoch": 0.5084863715445583, "grad_norm": 4.635907173156738, "learning_rate": 4.613930576240119e-06, "logits/chosen": 5.867904186248779, "logits/rejected": 7.914802074432373, "logps/chosen": -332.451416015625, "logps/rejected": -448.4495849609375, "loss": 0.581, "rewards/accuracies": 0.625, "rewards/chosen": 0.44966018199920654, "rewards/margins": 0.3103483021259308, "rewards/rejected": 0.13931189477443695, "step": 3288 }, { "epoch": 0.5086410206843225, "grad_norm": 4.746682167053223, "learning_rate": 4.613644174590446e-06, "logits/chosen": 4.311805725097656, "logits/rejected": 1.1995904445648193, "logps/chosen": -185.853759765625, "logps/rejected": -133.92442321777344, "loss": 0.6182, "rewards/accuracies": 0.75, "rewards/chosen": 0.23295500874519348, "rewards/margins": 0.3026370406150818, "rewards/rejected": -0.06968198716640472, "step": 3289 }, { "epoch": 0.5087956698240866, "grad_norm": 5.019349098205566, "learning_rate": 4.613357772940773e-06, "logits/chosen": 9.831005096435547, "logits/rejected": 0.4108743667602539, "logps/chosen": -401.7609558105469, "logps/rejected": -276.01409912109375, "loss": 0.5898, "rewards/accuracies": 0.5, "rewards/chosen": 0.6003977060317993, "rewards/margins": 0.5014404058456421, "rewards/rejected": 0.09895730018615723, "step": 3290 }, { "epoch": 0.5089503189638508, "grad_norm": 5.8945770263671875, "learning_rate": 4.613071371291099e-06, "logits/chosen": 8.602256774902344, "logits/rejected": 12.297430992126465, "logps/chosen": -326.4326171875, "logps/rejected": -419.716796875, "loss": 0.6549, "rewards/accuracies": 0.5, "rewards/chosen": 0.6179942488670349, "rewards/margins": 0.18271782994270325, "rewards/rejected": 0.4352763891220093, "step": 3291 }, { "epoch": 0.5091049681036149, "grad_norm": 6.074856758117676, "learning_rate": 4.612784969641426e-06, "logits/chosen": 12.51699161529541, "logits/rejected": 8.815010070800781, "logps/chosen": -259.63934326171875, "logps/rejected": -244.95547485351562, "loss": 0.6704, "rewards/accuracies": 0.75, "rewards/chosen": 0.27018746733665466, "rewards/margins": 0.0648518055677414, "rewards/rejected": 0.20533566176891327, "step": 3292 }, { "epoch": 0.5092596172433791, "grad_norm": 11.851131439208984, "learning_rate": 4.612498567991753e-06, "logits/chosen": 9.136540412902832, "logits/rejected": 13.335267066955566, "logps/chosen": -393.1720275878906, "logps/rejected": -356.2487487792969, "loss": 0.7954, "rewards/accuracies": 0.375, "rewards/chosen": 0.6250507831573486, "rewards/margins": -0.14003705978393555, "rewards/rejected": 0.7650877833366394, "step": 3293 }, { "epoch": 0.5094142663831432, "grad_norm": 5.108269214630127, "learning_rate": 4.6122121663420784e-06, "logits/chosen": 8.016669273376465, "logits/rejected": 5.554108142852783, "logps/chosen": -357.6695556640625, "logps/rejected": -236.93116760253906, "loss": 0.6403, "rewards/accuracies": 0.875, "rewards/chosen": 0.4220203161239624, "rewards/margins": 0.3163836896419525, "rewards/rejected": 0.1056365966796875, "step": 3294 }, { "epoch": 0.5095689155229074, "grad_norm": 5.631624221801758, "learning_rate": 4.611925764692405e-06, "logits/chosen": 13.836071968078613, "logits/rejected": 9.977951049804688, "logps/chosen": -519.4583129882812, "logps/rejected": -336.7672119140625, "loss": 0.5277, "rewards/accuracies": 0.875, "rewards/chosen": 1.0055391788482666, "rewards/margins": 0.6009042263031006, "rewards/rejected": 0.404634952545166, "step": 3295 }, { "epoch": 0.5097235646626715, "grad_norm": 6.327399253845215, "learning_rate": 4.611639363042732e-06, "logits/chosen": 4.6474761962890625, "logits/rejected": 7.623525619506836, "logps/chosen": -261.93402099609375, "logps/rejected": -304.2960205078125, "loss": 0.7615, "rewards/accuracies": 0.5, "rewards/chosen": -0.17954427003860474, "rewards/margins": -0.1032150387763977, "rewards/rejected": -0.07632923126220703, "step": 3296 }, { "epoch": 0.5098782138024357, "grad_norm": 5.544502258300781, "learning_rate": 4.611352961393058e-06, "logits/chosen": 11.478700637817383, "logits/rejected": 7.552406311035156, "logps/chosen": -302.0330505371094, "logps/rejected": -292.5581970214844, "loss": 0.5639, "rewards/accuracies": 0.625, "rewards/chosen": 0.2996593415737152, "rewards/margins": 0.44985684752464294, "rewards/rejected": -0.15019750595092773, "step": 3297 }, { "epoch": 0.5100328629421998, "grad_norm": 6.386444568634033, "learning_rate": 4.611066559743384e-06, "logits/chosen": 12.244367599487305, "logits/rejected": 8.603316307067871, "logps/chosen": -356.3914489746094, "logps/rejected": -236.01129150390625, "loss": 0.7436, "rewards/accuracies": 0.25, "rewards/chosen": 0.15625444054603577, "rewards/margins": 0.1432238221168518, "rewards/rejected": 0.013030633330345154, "step": 3298 }, { "epoch": 0.510187512081964, "grad_norm": 5.056331634521484, "learning_rate": 4.610780158093711e-06, "logits/chosen": 10.681868553161621, "logits/rejected": 10.155205726623535, "logps/chosen": -355.4100341796875, "logps/rejected": -333.34466552734375, "loss": 0.5839, "rewards/accuracies": 0.75, "rewards/chosen": 0.219159796833992, "rewards/margins": 0.30675363540649414, "rewards/rejected": -0.08759383857250214, "step": 3299 }, { "epoch": 0.5103421612217282, "grad_norm": 5.038776397705078, "learning_rate": 4.6104937564440375e-06, "logits/chosen": 6.918275833129883, "logits/rejected": 7.548701286315918, "logps/chosen": -251.03842163085938, "logps/rejected": -225.3931121826172, "loss": 0.7674, "rewards/accuracies": 0.5, "rewards/chosen": 0.31759369373321533, "rewards/margins": -0.07449433207511902, "rewards/rejected": 0.39208802580833435, "step": 3300 }, { "epoch": 0.5104968103614924, "grad_norm": 6.149869918823242, "learning_rate": 4.610207354794364e-06, "logits/chosen": 13.585990905761719, "logits/rejected": 14.574674606323242, "logps/chosen": -308.7471618652344, "logps/rejected": -375.1549072265625, "loss": 0.7064, "rewards/accuracies": 0.5, "rewards/chosen": 0.3792675733566284, "rewards/margins": 0.05164957046508789, "rewards/rejected": 0.3276180326938629, "step": 3301 }, { "epoch": 0.5106514595012566, "grad_norm": 8.68946361541748, "learning_rate": 4.60992095314469e-06, "logits/chosen": 4.943545341491699, "logits/rejected": 10.283045768737793, "logps/chosen": -335.4933776855469, "logps/rejected": -314.6751403808594, "loss": 0.9695, "rewards/accuracies": 0.375, "rewards/chosen": 0.09260483086109161, "rewards/margins": -0.36909058690071106, "rewards/rejected": 0.4616954028606415, "step": 3302 }, { "epoch": 0.5108061086410207, "grad_norm": 5.4128899574279785, "learning_rate": 4.609634551495017e-06, "logits/chosen": 11.946561813354492, "logits/rejected": 11.99673080444336, "logps/chosen": -246.31185913085938, "logps/rejected": -264.9508056640625, "loss": 0.6506, "rewards/accuracies": 0.5, "rewards/chosen": 0.27053916454315186, "rewards/margins": 0.17488783597946167, "rewards/rejected": 0.09565134346485138, "step": 3303 }, { "epoch": 0.5109607577807849, "grad_norm": 4.730881214141846, "learning_rate": 4.609348149845343e-06, "logits/chosen": 8.126791000366211, "logits/rejected": 6.974100112915039, "logps/chosen": -326.1795959472656, "logps/rejected": -277.41552734375, "loss": 0.552, "rewards/accuracies": 0.875, "rewards/chosen": 0.5544772148132324, "rewards/margins": 0.3648064136505127, "rewards/rejected": 0.18967080116271973, "step": 3304 }, { "epoch": 0.511115406920549, "grad_norm": 7.931200981140137, "learning_rate": 4.60906174819567e-06, "logits/chosen": 8.23427677154541, "logits/rejected": 3.824476480484009, "logps/chosen": -448.0661926269531, "logps/rejected": -372.74859619140625, "loss": 0.6634, "rewards/accuracies": 0.625, "rewards/chosen": 0.5778120756149292, "rewards/margins": 0.3968660533428192, "rewards/rejected": 0.1809459924697876, "step": 3305 }, { "epoch": 0.5112700560603132, "grad_norm": 6.869718074798584, "learning_rate": 4.6087753465459965e-06, "logits/chosen": 7.93504524230957, "logits/rejected": 3.603963851928711, "logps/chosen": -453.483154296875, "logps/rejected": -320.5999755859375, "loss": 0.5766, "rewards/accuracies": 0.625, "rewards/chosen": 0.38236987590789795, "rewards/margins": 0.3331788182258606, "rewards/rejected": 0.049191076308488846, "step": 3306 }, { "epoch": 0.5114247052000773, "grad_norm": 5.299606800079346, "learning_rate": 4.608488944896322e-06, "logits/chosen": 13.84046745300293, "logits/rejected": 11.413540840148926, "logps/chosen": -307.7435302734375, "logps/rejected": -404.3388366699219, "loss": 0.5461, "rewards/accuracies": 0.75, "rewards/chosen": 0.5440571904182434, "rewards/margins": 0.43519482016563416, "rewards/rejected": 0.10886240005493164, "step": 3307 }, { "epoch": 0.5115793543398415, "grad_norm": 7.436204433441162, "learning_rate": 4.608202543246649e-06, "logits/chosen": 3.9916067123413086, "logits/rejected": 8.170459747314453, "logps/chosen": -259.1344909667969, "logps/rejected": -317.2101135253906, "loss": 0.8142, "rewards/accuracies": 0.375, "rewards/chosen": 0.07210784405469894, "rewards/margins": -0.2072063535451889, "rewards/rejected": 0.27931419014930725, "step": 3308 }, { "epoch": 0.5117340034796056, "grad_norm": 6.892382621765137, "learning_rate": 4.607916141596976e-06, "logits/chosen": 8.205277442932129, "logits/rejected": 12.968907356262207, "logps/chosen": -364.7828063964844, "logps/rejected": -320.7512512207031, "loss": 0.7796, "rewards/accuracies": 0.5, "rewards/chosen": 0.10809154808521271, "rewards/margins": -0.11546183377504349, "rewards/rejected": 0.2235533744096756, "step": 3309 }, { "epoch": 0.5118886526193698, "grad_norm": 4.5968546867370605, "learning_rate": 4.607629739947302e-06, "logits/chosen": 14.16615104675293, "logits/rejected": 12.471616744995117, "logps/chosen": -332.67010498046875, "logps/rejected": -372.7623291015625, "loss": 0.5008, "rewards/accuracies": 0.875, "rewards/chosen": 0.2503507733345032, "rewards/margins": 0.4937131702899933, "rewards/rejected": -0.2433624267578125, "step": 3310 }, { "epoch": 0.5120433017591339, "grad_norm": 4.575873374938965, "learning_rate": 4.607343338297629e-06, "logits/chosen": 10.643853187561035, "logits/rejected": 10.89056396484375, "logps/chosen": -165.5745391845703, "logps/rejected": -197.70614624023438, "loss": 0.6435, "rewards/accuracies": 0.625, "rewards/chosen": 0.1332719326019287, "rewards/margins": 0.2689007520675659, "rewards/rejected": -0.13562878966331482, "step": 3311 }, { "epoch": 0.5121979508988981, "grad_norm": 6.630035877227783, "learning_rate": 4.607056936647956e-06, "logits/chosen": 11.515083312988281, "logits/rejected": 8.150697708129883, "logps/chosen": -243.21372985839844, "logps/rejected": -236.29318237304688, "loss": 0.6737, "rewards/accuracies": 0.625, "rewards/chosen": 0.053349412977695465, "rewards/margins": 0.11289205402135849, "rewards/rejected": -0.05954265594482422, "step": 3312 }, { "epoch": 0.5123526000386622, "grad_norm": 6.082345962524414, "learning_rate": 4.606770534998282e-06, "logits/chosen": 7.6710944175720215, "logits/rejected": 3.204085350036621, "logps/chosen": -255.6961669921875, "logps/rejected": -213.114990234375, "loss": 0.7178, "rewards/accuracies": 0.625, "rewards/chosen": 0.062436312437057495, "rewards/margins": 0.08954112231731415, "rewards/rejected": -0.027104809880256653, "step": 3313 }, { "epoch": 0.5125072491784265, "grad_norm": 7.639415740966797, "learning_rate": 4.606484133348608e-06, "logits/chosen": 8.23381233215332, "logits/rejected": 9.277178764343262, "logps/chosen": -283.31695556640625, "logps/rejected": -261.18572998046875, "loss": 0.6992, "rewards/accuracies": 0.625, "rewards/chosen": 0.2526858448982239, "rewards/margins": 0.08515415340662003, "rewards/rejected": 0.16753168404102325, "step": 3314 }, { "epoch": 0.5126618983181906, "grad_norm": 8.525605201721191, "learning_rate": 4.606197731698935e-06, "logits/chosen": 13.060556411743164, "logits/rejected": 9.44193172454834, "logps/chosen": -409.4605712890625, "logps/rejected": -380.96112060546875, "loss": 0.7424, "rewards/accuracies": 0.5, "rewards/chosen": 0.34490615129470825, "rewards/margins": 0.16014891862869263, "rewards/rejected": 0.18475723266601562, "step": 3315 }, { "epoch": 0.5128165474579548, "grad_norm": 8.903631210327148, "learning_rate": 4.605911330049261e-06, "logits/chosen": 7.798725128173828, "logits/rejected": 4.855804920196533, "logps/chosen": -328.53460693359375, "logps/rejected": -388.38330078125, "loss": 0.8374, "rewards/accuracies": 0.375, "rewards/chosen": -0.0003664987161755562, "rewards/margins": -0.14090563356876373, "rewards/rejected": 0.14053915441036224, "step": 3316 }, { "epoch": 0.512971196597719, "grad_norm": 7.961757183074951, "learning_rate": 4.605624928399588e-06, "logits/chosen": 6.386938095092773, "logits/rejected": 9.763251304626465, "logps/chosen": -301.98199462890625, "logps/rejected": -417.2717590332031, "loss": 0.5636, "rewards/accuracies": 0.75, "rewards/chosen": 0.5083305835723877, "rewards/margins": 0.4255778193473816, "rewards/rejected": 0.08275280892848969, "step": 3317 }, { "epoch": 0.5131258457374831, "grad_norm": 5.357951641082764, "learning_rate": 4.605338526749915e-06, "logits/chosen": 8.939440727233887, "logits/rejected": 0.5143543481826782, "logps/chosen": -330.86712646484375, "logps/rejected": -191.1465301513672, "loss": 0.4968, "rewards/accuracies": 0.625, "rewards/chosen": 0.19912022352218628, "rewards/margins": 0.7738736867904663, "rewards/rejected": -0.5747535228729248, "step": 3318 }, { "epoch": 0.5132804948772473, "grad_norm": 7.023497104644775, "learning_rate": 4.605052125100241e-06, "logits/chosen": 10.614002227783203, "logits/rejected": 4.7218146324157715, "logps/chosen": -517.1453857421875, "logps/rejected": -291.04052734375, "loss": 0.7068, "rewards/accuracies": 0.5, "rewards/chosen": 0.3102092742919922, "rewards/margins": -0.0038158856332302094, "rewards/rejected": 0.3140251636505127, "step": 3319 }, { "epoch": 0.5134351440170114, "grad_norm": 4.474532127380371, "learning_rate": 4.604765723450568e-06, "logits/chosen": 16.762693405151367, "logits/rejected": 10.661742210388184, "logps/chosen": -319.2034606933594, "logps/rejected": -233.9828338623047, "loss": 0.5909, "rewards/accuracies": 0.75, "rewards/chosen": 0.4443679451942444, "rewards/margins": 0.2620030641555786, "rewards/rejected": 0.18236495554447174, "step": 3320 }, { "epoch": 0.5135897931567756, "grad_norm": 5.109394550323486, "learning_rate": 4.604479321800894e-06, "logits/chosen": 12.434944152832031, "logits/rejected": 5.510983943939209, "logps/chosen": -251.61648559570312, "logps/rejected": -178.4016571044922, "loss": 0.6458, "rewards/accuracies": 0.625, "rewards/chosen": 0.4217732548713684, "rewards/margins": 0.22006481885910034, "rewards/rejected": 0.20170843601226807, "step": 3321 }, { "epoch": 0.5137444422965397, "grad_norm": 6.105430603027344, "learning_rate": 4.60419292015122e-06, "logits/chosen": 10.395490646362305, "logits/rejected": 3.4469900131225586, "logps/chosen": -355.70263671875, "logps/rejected": -247.75942993164062, "loss": 0.7775, "rewards/accuracies": 0.375, "rewards/chosen": 0.20129719376564026, "rewards/margins": -0.10168847441673279, "rewards/rejected": 0.30298566818237305, "step": 3322 }, { "epoch": 0.5138990914363039, "grad_norm": 14.232503890991211, "learning_rate": 4.603906518501547e-06, "logits/chosen": 7.293386936187744, "logits/rejected": 8.30022144317627, "logps/chosen": -228.21954345703125, "logps/rejected": -216.17160034179688, "loss": 0.7305, "rewards/accuracies": 0.625, "rewards/chosen": 0.18266472220420837, "rewards/margins": 0.0026621222496032715, "rewards/rejected": 0.1800025999546051, "step": 3323 }, { "epoch": 0.514053740576068, "grad_norm": 5.143128871917725, "learning_rate": 4.603620116851874e-06, "logits/chosen": 15.941125869750977, "logits/rejected": 13.429931640625, "logps/chosen": -310.6986083984375, "logps/rejected": -228.2845458984375, "loss": 0.5077, "rewards/accuracies": 0.875, "rewards/chosen": 0.4222492575645447, "rewards/margins": 0.46035271883010864, "rewards/rejected": -0.038103483617305756, "step": 3324 }, { "epoch": 0.5142083897158322, "grad_norm": 7.872115135192871, "learning_rate": 4.6033337152022e-06, "logits/chosen": 9.733478546142578, "logits/rejected": 6.192249298095703, "logps/chosen": -280.38580322265625, "logps/rejected": -262.6074523925781, "loss": 0.9302, "rewards/accuracies": 0.125, "rewards/chosen": -0.3527243733406067, "rewards/margins": -0.3561922311782837, "rewards/rejected": 0.003467857837677002, "step": 3325 }, { "epoch": 0.5143630388555963, "grad_norm": 5.421661853790283, "learning_rate": 4.603047313552527e-06, "logits/chosen": 12.775594711303711, "logits/rejected": 10.362990379333496, "logps/chosen": -279.92083740234375, "logps/rejected": -250.9683837890625, "loss": 0.7951, "rewards/accuracies": 0.625, "rewards/chosen": 0.36717861890792847, "rewards/margins": 0.12689314782619476, "rewards/rejected": 0.2402855008840561, "step": 3326 }, { "epoch": 0.5145176879953606, "grad_norm": 7.026094913482666, "learning_rate": 4.602760911902853e-06, "logits/chosen": 7.578186988830566, "logits/rejected": 7.290694713592529, "logps/chosen": -297.53546142578125, "logps/rejected": -408.7726135253906, "loss": 0.6782, "rewards/accuracies": 0.5, "rewards/chosen": 0.511529266834259, "rewards/margins": 0.09652864933013916, "rewards/rejected": 0.41500064730644226, "step": 3327 }, { "epoch": 0.5146723371351247, "grad_norm": 7.281806468963623, "learning_rate": 4.6024745102531795e-06, "logits/chosen": 12.974857330322266, "logits/rejected": 11.966880798339844, "logps/chosen": -411.5673828125, "logps/rejected": -498.97314453125, "loss": 0.6552, "rewards/accuracies": 0.375, "rewards/chosen": 0.7406171560287476, "rewards/margins": 0.14281733334064484, "rewards/rejected": 0.5977998375892639, "step": 3328 }, { "epoch": 0.5148269862748889, "grad_norm": 5.58329963684082, "learning_rate": 4.602188108603506e-06, "logits/chosen": 10.643851280212402, "logits/rejected": 3.0388851165771484, "logps/chosen": -371.23309326171875, "logps/rejected": -224.8085479736328, "loss": 0.4527, "rewards/accuracies": 1.0, "rewards/chosen": 0.5755599737167358, "rewards/margins": 0.6589735746383667, "rewards/rejected": -0.08341364562511444, "step": 3329 }, { "epoch": 0.514981635414653, "grad_norm": 4.907145977020264, "learning_rate": 4.601901706953833e-06, "logits/chosen": 6.897857189178467, "logits/rejected": 3.476606845855713, "logps/chosen": -325.2074279785156, "logps/rejected": -177.27366638183594, "loss": 0.6531, "rewards/accuracies": 0.625, "rewards/chosen": 0.19923964142799377, "rewards/margins": 0.28254789113998413, "rewards/rejected": -0.08330824226140976, "step": 3330 }, { "epoch": 0.5151362845544172, "grad_norm": 10.427337646484375, "learning_rate": 4.601615305304159e-06, "logits/chosen": -0.9577646255493164, "logits/rejected": 1.4538899660110474, "logps/chosen": -331.46014404296875, "logps/rejected": -329.8386535644531, "loss": 0.7563, "rewards/accuracies": 0.5, "rewards/chosen": -0.2879761755466461, "rewards/margins": 0.08168478310108185, "rewards/rejected": -0.36966097354888916, "step": 3331 }, { "epoch": 0.5152909336941813, "grad_norm": 4.053294658660889, "learning_rate": 4.601328903654485e-06, "logits/chosen": 12.008349418640137, "logits/rejected": 2.210422992706299, "logps/chosen": -409.9730224609375, "logps/rejected": -272.80633544921875, "loss": 0.4101, "rewards/accuracies": 0.875, "rewards/chosen": 0.5077674984931946, "rewards/margins": 0.8883147239685059, "rewards/rejected": -0.38054725527763367, "step": 3332 }, { "epoch": 0.5154455828339455, "grad_norm": 6.067572593688965, "learning_rate": 4.601042502004812e-06, "logits/chosen": 8.359210014343262, "logits/rejected": 0.7269017100334167, "logps/chosen": -355.2889099121094, "logps/rejected": -178.30001831054688, "loss": 0.6275, "rewards/accuracies": 0.625, "rewards/chosen": 0.5712300539016724, "rewards/margins": 0.21494920551776886, "rewards/rejected": 0.3562808036804199, "step": 3333 }, { "epoch": 0.5156002319737096, "grad_norm": 7.381783962249756, "learning_rate": 4.6007561003551385e-06, "logits/chosen": 11.462834358215332, "logits/rejected": 12.725983619689941, "logps/chosen": -288.2518310546875, "logps/rejected": -265.99114990234375, "loss": 0.8291, "rewards/accuracies": 0.25, "rewards/chosen": 0.13342642784118652, "rewards/margins": -0.15023750066757202, "rewards/rejected": 0.28366392850875854, "step": 3334 }, { "epoch": 0.5157548811134738, "grad_norm": 7.987733364105225, "learning_rate": 4.600469698705465e-06, "logits/chosen": 8.43124008178711, "logits/rejected": 12.709207534790039, "logps/chosen": -236.6010284423828, "logps/rejected": -281.1996154785156, "loss": 0.732, "rewards/accuracies": 0.5, "rewards/chosen": 0.10303372144699097, "rewards/margins": -0.03188496083021164, "rewards/rejected": 0.1349186897277832, "step": 3335 }, { "epoch": 0.5159095302532379, "grad_norm": 5.715920925140381, "learning_rate": 4.600183297055791e-06, "logits/chosen": 5.269281387329102, "logits/rejected": 6.160981178283691, "logps/chosen": -193.3437042236328, "logps/rejected": -258.7533264160156, "loss": 0.7888, "rewards/accuracies": 0.5, "rewards/chosen": 0.19255928695201874, "rewards/margins": -0.03225240111351013, "rewards/rejected": 0.22481170296669006, "step": 3336 }, { "epoch": 0.5160641793930021, "grad_norm": 4.292242050170898, "learning_rate": 4.599896895406118e-06, "logits/chosen": 5.063158988952637, "logits/rejected": 8.99674129486084, "logps/chosen": -203.1636962890625, "logps/rejected": -198.94207763671875, "loss": 0.646, "rewards/accuracies": 0.625, "rewards/chosen": 0.3192750811576843, "rewards/margins": 0.2594013214111328, "rewards/rejected": 0.05987377092242241, "step": 3337 }, { "epoch": 0.5162188285327662, "grad_norm": 7.232443809509277, "learning_rate": 4.599610493756444e-06, "logits/chosen": 9.894434928894043, "logits/rejected": 3.0244178771972656, "logps/chosen": -450.18695068359375, "logps/rejected": -269.5166015625, "loss": 0.4681, "rewards/accuracies": 0.75, "rewards/chosen": 0.7104825973510742, "rewards/margins": 0.8683890700340271, "rewards/rejected": -0.15790653228759766, "step": 3338 }, { "epoch": 0.5163734776725305, "grad_norm": 5.255432605743408, "learning_rate": 4.599324092106771e-06, "logits/chosen": 12.322887420654297, "logits/rejected": 7.271556854248047, "logps/chosen": -212.72640991210938, "logps/rejected": -202.61166381835938, "loss": 0.6583, "rewards/accuracies": 0.5, "rewards/chosen": 0.293515682220459, "rewards/margins": 0.10923933237791061, "rewards/rejected": 0.18427634239196777, "step": 3339 }, { "epoch": 0.5165281268122947, "grad_norm": 3.8906614780426025, "learning_rate": 4.599037690457097e-06, "logits/chosen": 9.34835433959961, "logits/rejected": 13.081023216247559, "logps/chosen": -117.5382080078125, "logps/rejected": -186.06973266601562, "loss": 0.587, "rewards/accuracies": 0.625, "rewards/chosen": 0.037637557834386826, "rewards/margins": 0.29056259989738464, "rewards/rejected": -0.2529250383377075, "step": 3340 }, { "epoch": 0.5166827759520588, "grad_norm": 4.393553256988525, "learning_rate": 4.598751288807423e-06, "logits/chosen": 10.058629989624023, "logits/rejected": 9.304224014282227, "logps/chosen": -272.46649169921875, "logps/rejected": -260.66510009765625, "loss": 0.5814, "rewards/accuracies": 0.875, "rewards/chosen": 0.6412093639373779, "rewards/margins": 0.27334436774253845, "rewards/rejected": 0.3678649663925171, "step": 3341 }, { "epoch": 0.516837425091823, "grad_norm": 5.0212507247924805, "learning_rate": 4.59846488715775e-06, "logits/chosen": 9.178473472595215, "logits/rejected": 5.965265274047852, "logps/chosen": -260.2046203613281, "logps/rejected": -217.31329345703125, "loss": 0.6553, "rewards/accuracies": 0.625, "rewards/chosen": 0.35200709104537964, "rewards/margins": 0.11002466082572937, "rewards/rejected": 0.24198244512081146, "step": 3342 }, { "epoch": 0.5169920742315871, "grad_norm": 3.9991116523742676, "learning_rate": 4.598178485508077e-06, "logits/chosen": 15.431526184082031, "logits/rejected": 7.793521404266357, "logps/chosen": -221.8224334716797, "logps/rejected": -197.96881103515625, "loss": 0.5477, "rewards/accuracies": 0.625, "rewards/chosen": 0.4227851927280426, "rewards/margins": 0.5931785106658936, "rewards/rejected": -0.17039328813552856, "step": 3343 }, { "epoch": 0.5171467233713513, "grad_norm": 6.6603803634643555, "learning_rate": 4.597892083858403e-06, "logits/chosen": 0.7117253541946411, "logits/rejected": 4.9379072189331055, "logps/chosen": -267.120361328125, "logps/rejected": -328.4352111816406, "loss": 0.7424, "rewards/accuracies": 0.5, "rewards/chosen": 0.09934262186288834, "rewards/margins": 0.012568570673465729, "rewards/rejected": 0.0867740660905838, "step": 3344 }, { "epoch": 0.5173013725111154, "grad_norm": 6.0258307456970215, "learning_rate": 4.59760568220873e-06, "logits/chosen": 7.265829086303711, "logits/rejected": 5.597441673278809, "logps/chosen": -251.51492309570312, "logps/rejected": -212.12924194335938, "loss": 0.7808, "rewards/accuracies": 0.125, "rewards/chosen": 0.04207973554730415, "rewards/margins": -0.03954887390136719, "rewards/rejected": 0.08162861317396164, "step": 3345 }, { "epoch": 0.5174560216508796, "grad_norm": 5.801704406738281, "learning_rate": 4.597319280559057e-06, "logits/chosen": 9.716731071472168, "logits/rejected": 11.044365882873535, "logps/chosen": -249.14271545410156, "logps/rejected": -266.17303466796875, "loss": 0.7735, "rewards/accuracies": 0.375, "rewards/chosen": 0.14653760194778442, "rewards/margins": -0.052437543869018555, "rewards/rejected": 0.19897514581680298, "step": 3346 }, { "epoch": 0.5176106707906437, "grad_norm": 5.435059547424316, "learning_rate": 4.5970328789093824e-06, "logits/chosen": 6.191805839538574, "logits/rejected": 8.739500999450684, "logps/chosen": -301.4412841796875, "logps/rejected": -370.99737548828125, "loss": 0.6259, "rewards/accuracies": 0.625, "rewards/chosen": 0.2873663902282715, "rewards/margins": 0.16811160743236542, "rewards/rejected": 0.11925478279590607, "step": 3347 }, { "epoch": 0.5177653199304079, "grad_norm": 3.7823760509490967, "learning_rate": 4.596746477259709e-06, "logits/chosen": 11.955760955810547, "logits/rejected": 10.665847778320312, "logps/chosen": -227.86294555664062, "logps/rejected": -178.990966796875, "loss": 0.6769, "rewards/accuracies": 0.5, "rewards/chosen": 0.0681418627500534, "rewards/margins": 0.12963783740997314, "rewards/rejected": -0.061495959758758545, "step": 3348 }, { "epoch": 0.517919969070172, "grad_norm": 6.7825608253479, "learning_rate": 4.596460075610036e-06, "logits/chosen": 8.116400718688965, "logits/rejected": 4.816107273101807, "logps/chosen": -218.87765502929688, "logps/rejected": -189.20150756835938, "loss": 0.6806, "rewards/accuracies": 0.625, "rewards/chosen": 0.09532195329666138, "rewards/margins": 0.07856465876102448, "rewards/rejected": 0.016757257282733917, "step": 3349 }, { "epoch": 0.5180746182099362, "grad_norm": 6.819690704345703, "learning_rate": 4.596173673960362e-06, "logits/chosen": 7.481340408325195, "logits/rejected": 4.962677001953125, "logps/chosen": -323.698974609375, "logps/rejected": -308.75494384765625, "loss": 0.8075, "rewards/accuracies": 0.375, "rewards/chosen": 0.45470166206359863, "rewards/margins": -0.1253925859928131, "rewards/rejected": 0.5800942182540894, "step": 3350 }, { "epoch": 0.5182292673497003, "grad_norm": 5.916538238525391, "learning_rate": 4.595887272310689e-06, "logits/chosen": 8.10086441040039, "logits/rejected": 1.8858840465545654, "logps/chosen": -286.738037109375, "logps/rejected": -225.82388305664062, "loss": 0.6632, "rewards/accuracies": 0.375, "rewards/chosen": 0.44926026463508606, "rewards/margins": 0.10890054702758789, "rewards/rejected": 0.3403596878051758, "step": 3351 }, { "epoch": 0.5183839164894646, "grad_norm": 5.650180339813232, "learning_rate": 4.595600870661016e-06, "logits/chosen": 9.513080596923828, "logits/rejected": 6.768671035766602, "logps/chosen": -297.4496154785156, "logps/rejected": -318.6204528808594, "loss": 0.6118, "rewards/accuracies": 0.5, "rewards/chosen": 0.10306416451931, "rewards/margins": 0.3363010585308075, "rewards/rejected": -0.2332368791103363, "step": 3352 }, { "epoch": 0.5185385656292287, "grad_norm": 5.260039806365967, "learning_rate": 4.595314469011342e-06, "logits/chosen": 5.763668537139893, "logits/rejected": 4.022637367248535, "logps/chosen": -473.63323974609375, "logps/rejected": -402.4539489746094, "loss": 0.5797, "rewards/accuracies": 0.75, "rewards/chosen": 0.8152771592140198, "rewards/margins": 0.462116539478302, "rewards/rejected": 0.3531606197357178, "step": 3353 }, { "epoch": 0.5186932147689929, "grad_norm": 4.126369953155518, "learning_rate": 4.595028067361668e-06, "logits/chosen": 8.767402648925781, "logits/rejected": -1.541495442390442, "logps/chosen": -365.54766845703125, "logps/rejected": -259.57464599609375, "loss": 0.451, "rewards/accuracies": 0.75, "rewards/chosen": 0.473431795835495, "rewards/margins": 0.6864514350891113, "rewards/rejected": -0.21301960945129395, "step": 3354 }, { "epoch": 0.518847863908757, "grad_norm": 5.820002555847168, "learning_rate": 4.594741665711995e-06, "logits/chosen": 4.751580238342285, "logits/rejected": 8.376527786254883, "logps/chosen": -217.77182006835938, "logps/rejected": -243.0459747314453, "loss": 0.9261, "rewards/accuracies": 0.5, "rewards/chosen": 0.05957531929016113, "rewards/margins": -0.34137678146362305, "rewards/rejected": 0.4009520709514618, "step": 3355 }, { "epoch": 0.5190025130485212, "grad_norm": 3.4872448444366455, "learning_rate": 4.5944552640623214e-06, "logits/chosen": 12.761938095092773, "logits/rejected": 2.555190086364746, "logps/chosen": -360.17889404296875, "logps/rejected": -196.4600830078125, "loss": 0.4265, "rewards/accuracies": 0.75, "rewards/chosen": 0.641636848449707, "rewards/margins": 0.8073151111602783, "rewards/rejected": -0.1656782180070877, "step": 3356 }, { "epoch": 0.5191571621882853, "grad_norm": 3.894822835922241, "learning_rate": 4.594168862412648e-06, "logits/chosen": 1.6901307106018066, "logits/rejected": 1.0637298822402954, "logps/chosen": -216.74249267578125, "logps/rejected": -229.15560913085938, "loss": 0.4955, "rewards/accuracies": 0.75, "rewards/chosen": 0.39243441820144653, "rewards/margins": 0.5352980494499207, "rewards/rejected": -0.14286363124847412, "step": 3357 }, { "epoch": 0.5193118113280495, "grad_norm": 5.256778240203857, "learning_rate": 4.593882460762975e-06, "logits/chosen": 11.411818504333496, "logits/rejected": 2.7224555015563965, "logps/chosen": -304.2349853515625, "logps/rejected": -188.3083953857422, "loss": 0.6893, "rewards/accuracies": 0.375, "rewards/chosen": 0.18156929314136505, "rewards/margins": 0.09899642318487167, "rewards/rejected": 0.08257286995649338, "step": 3358 }, { "epoch": 0.5194664604678136, "grad_norm": 5.881741046905518, "learning_rate": 4.593596059113301e-06, "logits/chosen": 10.966991424560547, "logits/rejected": 6.728796005249023, "logps/chosen": -302.0898742675781, "logps/rejected": -256.14801025390625, "loss": 0.6459, "rewards/accuracies": 0.625, "rewards/chosen": 0.3486071527004242, "rewards/margins": 0.163596972823143, "rewards/rejected": 0.18501019477844238, "step": 3359 }, { "epoch": 0.5196211096075778, "grad_norm": 3.081699848175049, "learning_rate": 4.593309657463627e-06, "logits/chosen": 5.417734622955322, "logits/rejected": 1.3187285661697388, "logps/chosen": -310.59417724609375, "logps/rejected": -180.80331420898438, "loss": 0.5039, "rewards/accuracies": 0.75, "rewards/chosen": 0.30422377586364746, "rewards/margins": 0.5703117847442627, "rewards/rejected": -0.2660880386829376, "step": 3360 }, { "epoch": 0.5197757587473419, "grad_norm": 4.984732151031494, "learning_rate": 4.593023255813954e-06, "logits/chosen": 4.960865497589111, "logits/rejected": 4.9250569343566895, "logps/chosen": -172.64959716796875, "logps/rejected": -216.44898986816406, "loss": 0.7157, "rewards/accuracies": 0.625, "rewards/chosen": 0.31895291805267334, "rewards/margins": 0.05662061274051666, "rewards/rejected": 0.2623322606086731, "step": 3361 }, { "epoch": 0.5199304078871061, "grad_norm": 5.086485385894775, "learning_rate": 4.5927368541642805e-06, "logits/chosen": 11.037814140319824, "logits/rejected": 8.57933235168457, "logps/chosen": -306.3039855957031, "logps/rejected": -306.47149658203125, "loss": 0.6079, "rewards/accuracies": 0.75, "rewards/chosen": 0.5484976768493652, "rewards/margins": 0.22478577494621277, "rewards/rejected": 0.32371193170547485, "step": 3362 }, { "epoch": 0.5200850570268702, "grad_norm": 4.9316816329956055, "learning_rate": 4.592450452514607e-06, "logits/chosen": 10.591773986816406, "logits/rejected": 2.9878017902374268, "logps/chosen": -382.12664794921875, "logps/rejected": -290.87548828125, "loss": 0.4458, "rewards/accuracies": 1.0, "rewards/chosen": 0.8783547878265381, "rewards/margins": 0.6642925143241882, "rewards/rejected": 0.21406233310699463, "step": 3363 }, { "epoch": 0.5202397061666344, "grad_norm": 5.727143287658691, "learning_rate": 4.592164050864934e-06, "logits/chosen": 13.092672348022461, "logits/rejected": 8.68062973022461, "logps/chosen": -336.43170166015625, "logps/rejected": -313.5290832519531, "loss": 0.5498, "rewards/accuracies": 0.875, "rewards/chosen": 0.43714067339897156, "rewards/margins": 0.5199013948440552, "rewards/rejected": -0.08276071399450302, "step": 3364 }, { "epoch": 0.5203943553063987, "grad_norm": 5.615415096282959, "learning_rate": 4.59187764921526e-06, "logits/chosen": 9.310783386230469, "logits/rejected": 6.376582622528076, "logps/chosen": -387.70037841796875, "logps/rejected": -378.4019470214844, "loss": 0.5943, "rewards/accuracies": 0.5, "rewards/chosen": 0.5919109582901001, "rewards/margins": 0.2893992066383362, "rewards/rejected": 0.3025117516517639, "step": 3365 }, { "epoch": 0.5205490044461628, "grad_norm": 7.270103454589844, "learning_rate": 4.591591247565586e-06, "logits/chosen": 6.2316131591796875, "logits/rejected": 5.721863746643066, "logps/chosen": -199.2571258544922, "logps/rejected": -165.15481567382812, "loss": 0.8633, "rewards/accuracies": 0.375, "rewards/chosen": -0.0933922529220581, "rewards/margins": -0.24353134632110596, "rewards/rejected": 0.15013909339904785, "step": 3366 }, { "epoch": 0.520703653585927, "grad_norm": 5.480018615722656, "learning_rate": 4.591304845915913e-06, "logits/chosen": 11.577028274536133, "logits/rejected": 7.851813316345215, "logps/chosen": -281.567138671875, "logps/rejected": -266.55657958984375, "loss": 0.6757, "rewards/accuracies": 0.625, "rewards/chosen": 0.04081106185913086, "rewards/margins": 0.052083492279052734, "rewards/rejected": -0.011272445321083069, "step": 3367 }, { "epoch": 0.5208583027256911, "grad_norm": 6.0521111488342285, "learning_rate": 4.5910184442662396e-06, "logits/chosen": 7.320011138916016, "logits/rejected": 6.893837928771973, "logps/chosen": -211.69017028808594, "logps/rejected": -263.49395751953125, "loss": 0.7222, "rewards/accuracies": 0.5, "rewards/chosen": 0.32181161642074585, "rewards/margins": 0.07572829723358154, "rewards/rejected": 0.2460833489894867, "step": 3368 }, { "epoch": 0.5210129518654553, "grad_norm": 6.400270938873291, "learning_rate": 4.590732042616566e-06, "logits/chosen": 7.621551990509033, "logits/rejected": 4.2501606941223145, "logps/chosen": -395.25115966796875, "logps/rejected": -238.61761474609375, "loss": 0.7021, "rewards/accuracies": 0.5, "rewards/chosen": 0.493667334318161, "rewards/margins": 0.09907799959182739, "rewards/rejected": 0.39458930492401123, "step": 3369 }, { "epoch": 0.5211676010052194, "grad_norm": 4.112985134124756, "learning_rate": 4.590445640966892e-06, "logits/chosen": 6.5976457595825195, "logits/rejected": 5.435474872589111, "logps/chosen": -232.96754455566406, "logps/rejected": -203.96286010742188, "loss": 0.7434, "rewards/accuracies": 0.375, "rewards/chosen": 0.21193014085292816, "rewards/margins": 0.00992593914270401, "rewards/rejected": 0.20200420916080475, "step": 3370 }, { "epoch": 0.5213222501449836, "grad_norm": 6.024825572967529, "learning_rate": 4.590159239317219e-06, "logits/chosen": 10.6073637008667, "logits/rejected": 8.498064041137695, "logps/chosen": -237.94537353515625, "logps/rejected": -226.89024353027344, "loss": 0.5978, "rewards/accuracies": 0.75, "rewards/chosen": 0.39795076847076416, "rewards/margins": 0.23435693979263306, "rewards/rejected": 0.16359379887580872, "step": 3371 }, { "epoch": 0.5214768992847477, "grad_norm": 5.085453510284424, "learning_rate": 4.589872837667545e-06, "logits/chosen": 9.58914852142334, "logits/rejected": 8.148595809936523, "logps/chosen": -299.7940368652344, "logps/rejected": -241.12063598632812, "loss": 0.4255, "rewards/accuracies": 0.875, "rewards/chosen": 0.38754168152809143, "rewards/margins": 0.7746517062187195, "rewards/rejected": -0.38711005449295044, "step": 3372 }, { "epoch": 0.5216315484245119, "grad_norm": 4.598538875579834, "learning_rate": 4.589586436017872e-06, "logits/chosen": 15.125432968139648, "logits/rejected": 10.811992645263672, "logps/chosen": -261.6843566894531, "logps/rejected": -214.27566528320312, "loss": 0.5474, "rewards/accuracies": 0.625, "rewards/chosen": 0.4445863962173462, "rewards/margins": 0.391093909740448, "rewards/rejected": 0.05349244922399521, "step": 3373 }, { "epoch": 0.521786197564276, "grad_norm": 6.244147777557373, "learning_rate": 4.589300034368198e-06, "logits/chosen": 15.704962730407715, "logits/rejected": 7.775631904602051, "logps/chosen": -323.9530944824219, "logps/rejected": -188.38279724121094, "loss": 0.6641, "rewards/accuracies": 0.5, "rewards/chosen": -0.15005464851856232, "rewards/margins": 0.17448121309280396, "rewards/rejected": -0.3245358467102051, "step": 3374 }, { "epoch": 0.5219408467040402, "grad_norm": 6.626112461090088, "learning_rate": 4.589013632718524e-06, "logits/chosen": 11.426918029785156, "logits/rejected": 6.568182945251465, "logps/chosen": -284.2164001464844, "logps/rejected": -270.18804931640625, "loss": 0.7606, "rewards/accuracies": 0.375, "rewards/chosen": 0.2913309931755066, "rewards/margins": -0.08676622062921524, "rewards/rejected": 0.3780972361564636, "step": 3375 }, { "epoch": 0.5220954958438043, "grad_norm": 6.0998640060424805, "learning_rate": 4.588727231068851e-06, "logits/chosen": 7.51963996887207, "logits/rejected": 11.44102668762207, "logps/chosen": -272.22381591796875, "logps/rejected": -297.0274658203125, "loss": 0.7891, "rewards/accuracies": 0.625, "rewards/chosen": 0.07508973777294159, "rewards/margins": -0.09064282476902008, "rewards/rejected": 0.16573259234428406, "step": 3376 }, { "epoch": 0.5222501449835685, "grad_norm": 6.202936172485352, "learning_rate": 4.588440829419178e-06, "logits/chosen": 17.11812400817871, "logits/rejected": 7.117197036743164, "logps/chosen": -509.7786560058594, "logps/rejected": -291.5841979980469, "loss": 0.5102, "rewards/accuracies": 0.875, "rewards/chosen": 0.5514532327651978, "rewards/margins": 0.664260983467102, "rewards/rejected": -0.1128077581524849, "step": 3377 }, { "epoch": 0.5224047941233327, "grad_norm": 5.566222190856934, "learning_rate": 4.588154427769504e-06, "logits/chosen": 12.539167404174805, "logits/rejected": 4.024435043334961, "logps/chosen": -279.06842041015625, "logps/rejected": -186.0841827392578, "loss": 0.7426, "rewards/accuracies": 0.625, "rewards/chosen": 0.1587502360343933, "rewards/margins": -0.03681080415844917, "rewards/rejected": 0.19556105136871338, "step": 3378 }, { "epoch": 0.5225594432630969, "grad_norm": 5.379266738891602, "learning_rate": 4.587868026119831e-06, "logits/chosen": 10.733598709106445, "logits/rejected": 4.975314140319824, "logps/chosen": -332.7852783203125, "logps/rejected": -287.1891784667969, "loss": 0.5737, "rewards/accuracies": 0.75, "rewards/chosen": 0.2933945953845978, "rewards/margins": 0.4398816227912903, "rewards/rejected": -0.1464870274066925, "step": 3379 }, { "epoch": 0.522714092402861, "grad_norm": 6.186374664306641, "learning_rate": 4.587581624470157e-06, "logits/chosen": 11.564859390258789, "logits/rejected": 4.0376667976379395, "logps/chosen": -571.50341796875, "logps/rejected": -389.6288757324219, "loss": 0.5752, "rewards/accuracies": 0.625, "rewards/chosen": 0.8618119955062866, "rewards/margins": 0.44151562452316284, "rewards/rejected": 0.42029643058776855, "step": 3380 }, { "epoch": 0.5228687415426252, "grad_norm": 7.375655651092529, "learning_rate": 4.5872952228204835e-06, "logits/chosen": 10.973668098449707, "logits/rejected": 11.470388412475586, "logps/chosen": -261.96881103515625, "logps/rejected": -259.1724548339844, "loss": 0.7135, "rewards/accuracies": 0.5, "rewards/chosen": 0.10988417267799377, "rewards/margins": 0.1347062885761261, "rewards/rejected": -0.024822130799293518, "step": 3381 }, { "epoch": 0.5230233906823893, "grad_norm": 6.438285827636719, "learning_rate": 4.58700882117081e-06, "logits/chosen": 6.218245506286621, "logits/rejected": 6.388080596923828, "logps/chosen": -267.11468505859375, "logps/rejected": -264.1803894042969, "loss": 0.7052, "rewards/accuracies": 0.625, "rewards/chosen": 0.22538027167320251, "rewards/margins": 0.21441617608070374, "rewards/rejected": 0.01096411794424057, "step": 3382 }, { "epoch": 0.5231780398221535, "grad_norm": 5.4786272048950195, "learning_rate": 4.586722419521137e-06, "logits/chosen": 4.929106712341309, "logits/rejected": 5.664292812347412, "logps/chosen": -212.44558715820312, "logps/rejected": -242.03884887695312, "loss": 0.8249, "rewards/accuracies": 0.625, "rewards/chosen": 0.12968482077121735, "rewards/margins": -0.06922565400600433, "rewards/rejected": 0.19891047477722168, "step": 3383 }, { "epoch": 0.5233326889619176, "grad_norm": 4.984866142272949, "learning_rate": 4.5864360178714634e-06, "logits/chosen": 8.332356452941895, "logits/rejected": 4.080158710479736, "logps/chosen": -284.6562805175781, "logps/rejected": -196.8278350830078, "loss": 0.765, "rewards/accuracies": 0.375, "rewards/chosen": 0.23435664176940918, "rewards/margins": -0.049413904547691345, "rewards/rejected": 0.2837705612182617, "step": 3384 }, { "epoch": 0.5234873381016818, "grad_norm": 5.085991859436035, "learning_rate": 4.58614961622179e-06, "logits/chosen": 10.116312026977539, "logits/rejected": 3.499004364013672, "logps/chosen": -246.60089111328125, "logps/rejected": -172.03099060058594, "loss": 0.7529, "rewards/accuracies": 0.5, "rewards/chosen": 0.2066326141357422, "rewards/margins": 0.06853661686182022, "rewards/rejected": 0.13809600472450256, "step": 3385 }, { "epoch": 0.523641987241446, "grad_norm": 7.175114631652832, "learning_rate": 4.585863214572117e-06, "logits/chosen": 10.580463409423828, "logits/rejected": 8.281279563903809, "logps/chosen": -340.380615234375, "logps/rejected": -265.46368408203125, "loss": 0.6188, "rewards/accuracies": 0.625, "rewards/chosen": 0.3850410580635071, "rewards/margins": 0.3041542172431946, "rewards/rejected": 0.0808868482708931, "step": 3386 }, { "epoch": 0.5237966363812101, "grad_norm": 7.444200038909912, "learning_rate": 4.5855768129224425e-06, "logits/chosen": 12.52717113494873, "logits/rejected": 5.678231239318848, "logps/chosen": -221.00648498535156, "logps/rejected": -146.95504760742188, "loss": 0.6194, "rewards/accuracies": 0.625, "rewards/chosen": 0.11436930298805237, "rewards/margins": 0.2839950621128082, "rewards/rejected": -0.16962575912475586, "step": 3387 }, { "epoch": 0.5239512855209743, "grad_norm": 3.909456729888916, "learning_rate": 4.585290411272769e-06, "logits/chosen": 13.567419052124023, "logits/rejected": 5.5791850090026855, "logps/chosen": -289.1203308105469, "logps/rejected": -195.16036987304688, "loss": 0.4575, "rewards/accuracies": 0.875, "rewards/chosen": 0.37114086747169495, "rewards/margins": 0.6506978273391724, "rewards/rejected": -0.27955693006515503, "step": 3388 }, { "epoch": 0.5241059346607384, "grad_norm": 5.847412109375, "learning_rate": 4.585004009623096e-06, "logits/chosen": 12.366923332214355, "logits/rejected": 11.392325401306152, "logps/chosen": -413.7122802734375, "logps/rejected": -339.08795166015625, "loss": 0.5241, "rewards/accuracies": 0.875, "rewards/chosen": 0.7155740261077881, "rewards/margins": 0.6349714994430542, "rewards/rejected": 0.08060257136821747, "step": 3389 }, { "epoch": 0.5242605838005026, "grad_norm": 9.251568794250488, "learning_rate": 4.5847176079734225e-06, "logits/chosen": 6.035253524780273, "logits/rejected": 6.236891269683838, "logps/chosen": -358.50164794921875, "logps/rejected": -369.5618896484375, "loss": 0.8535, "rewards/accuracies": 0.625, "rewards/chosen": 0.30767059326171875, "rewards/margins": -0.08816948533058167, "rewards/rejected": 0.3958400785923004, "step": 3390 }, { "epoch": 0.5244152329402668, "grad_norm": 8.34519100189209, "learning_rate": 4.584431206323749e-06, "logits/chosen": 7.675492286682129, "logits/rejected": 5.672919273376465, "logps/chosen": -238.48988342285156, "logps/rejected": -189.22886657714844, "loss": 0.5139, "rewards/accuracies": 0.75, "rewards/chosen": 0.4910629093647003, "rewards/margins": 0.5237280130386353, "rewards/rejected": -0.03266506642103195, "step": 3391 }, { "epoch": 0.524569882080031, "grad_norm": 3.7723278999328613, "learning_rate": 4.584144804674076e-06, "logits/chosen": 7.779267311096191, "logits/rejected": 3.61149001121521, "logps/chosen": -201.44027709960938, "logps/rejected": -175.6761474609375, "loss": 0.4699, "rewards/accuracies": 0.75, "rewards/chosen": 0.4658648371696472, "rewards/margins": 0.5744634866714478, "rewards/rejected": -0.10859866440296173, "step": 3392 }, { "epoch": 0.5247245312197951, "grad_norm": 5.506012916564941, "learning_rate": 4.583858403024402e-06, "logits/chosen": 9.021503448486328, "logits/rejected": 4.083602428436279, "logps/chosen": -348.5492248535156, "logps/rejected": -223.51123046875, "loss": 0.763, "rewards/accuracies": 0.625, "rewards/chosen": 0.17847681045532227, "rewards/margins": 0.0058135539293289185, "rewards/rejected": 0.17266327142715454, "step": 3393 }, { "epoch": 0.5248791803595593, "grad_norm": 4.694247245788574, "learning_rate": 4.583572001374728e-06, "logits/chosen": 4.244956970214844, "logits/rejected": 7.447844505310059, "logps/chosen": -195.48538208007812, "logps/rejected": -140.19839477539062, "loss": 0.6707, "rewards/accuracies": 0.5, "rewards/chosen": 0.05313679203391075, "rewards/margins": 0.1356012225151062, "rewards/rejected": -0.08246441185474396, "step": 3394 }, { "epoch": 0.5250338294993234, "grad_norm": 5.010329723358154, "learning_rate": 4.583285599725055e-06, "logits/chosen": 6.858229637145996, "logits/rejected": 7.083197593688965, "logps/chosen": -181.20245361328125, "logps/rejected": -239.50001525878906, "loss": 0.7389, "rewards/accuracies": 0.625, "rewards/chosen": -0.08626832813024521, "rewards/margins": 0.036448098719120026, "rewards/rejected": -0.12271644175052643, "step": 3395 }, { "epoch": 0.5251884786390876, "grad_norm": 4.788662433624268, "learning_rate": 4.5829991980753815e-06, "logits/chosen": 5.641383171081543, "logits/rejected": 1.2096834182739258, "logps/chosen": -355.52496337890625, "logps/rejected": -265.81683349609375, "loss": 0.5497, "rewards/accuracies": 0.75, "rewards/chosen": 0.7591531276702881, "rewards/margins": 0.43359097838401794, "rewards/rejected": 0.32556214928627014, "step": 3396 }, { "epoch": 0.5253431277788517, "grad_norm": 6.415170192718506, "learning_rate": 4.582712796425708e-06, "logits/chosen": 8.305414199829102, "logits/rejected": 7.885847091674805, "logps/chosen": -283.60858154296875, "logps/rejected": -255.82012939453125, "loss": 0.8096, "rewards/accuracies": 0.5, "rewards/chosen": 0.3064641058444977, "rewards/margins": -0.15541419386863708, "rewards/rejected": 0.46187829971313477, "step": 3397 }, { "epoch": 0.5254977769186159, "grad_norm": 8.35158634185791, "learning_rate": 4.582426394776035e-06, "logits/chosen": 12.839118003845215, "logits/rejected": 7.774646759033203, "logps/chosen": -310.63714599609375, "logps/rejected": -212.08917236328125, "loss": 0.6301, "rewards/accuracies": 0.75, "rewards/chosen": 0.19210198521614075, "rewards/margins": 0.4572628140449524, "rewards/rejected": -0.26516082882881165, "step": 3398 }, { "epoch": 0.52565242605838, "grad_norm": 4.139445781707764, "learning_rate": 4.582139993126361e-06, "logits/chosen": 11.815128326416016, "logits/rejected": 7.3945841789245605, "logps/chosen": -334.0032043457031, "logps/rejected": -219.86627197265625, "loss": 0.4836, "rewards/accuracies": 0.625, "rewards/chosen": 0.44902172684669495, "rewards/margins": 0.6782470941543579, "rewards/rejected": -0.22922533750534058, "step": 3399 }, { "epoch": 0.5258070751981442, "grad_norm": 4.127014636993408, "learning_rate": 4.581853591476687e-06, "logits/chosen": 11.822992324829102, "logits/rejected": 4.1561737060546875, "logps/chosen": -292.7979431152344, "logps/rejected": -124.76052856445312, "loss": 0.5117, "rewards/accuracies": 0.75, "rewards/chosen": 0.1704309582710266, "rewards/margins": 0.4945645034313202, "rewards/rejected": -0.32413357496261597, "step": 3400 }, { "epoch": 0.5259617243379083, "grad_norm": 6.6358256340026855, "learning_rate": 4.581567189827014e-06, "logits/chosen": 4.832671642303467, "logits/rejected": 8.779804229736328, "logps/chosen": -182.05307006835938, "logps/rejected": -247.2474365234375, "loss": 0.6533, "rewards/accuracies": 0.5, "rewards/chosen": 0.42966824769973755, "rewards/margins": 0.130997434258461, "rewards/rejected": 0.29867082834243774, "step": 3401 }, { "epoch": 0.5261163734776725, "grad_norm": 4.693929195404053, "learning_rate": 4.581280788177341e-06, "logits/chosen": 10.109063148498535, "logits/rejected": 9.585689544677734, "logps/chosen": -203.67630004882812, "logps/rejected": -221.12509155273438, "loss": 0.6105, "rewards/accuracies": 0.625, "rewards/chosen": 0.36936986446380615, "rewards/margins": 0.34170639514923096, "rewards/rejected": 0.02766351029276848, "step": 3402 }, { "epoch": 0.5262710226174367, "grad_norm": 4.989546298980713, "learning_rate": 4.580994386527666e-06, "logits/chosen": 12.533157348632812, "logits/rejected": 4.344411849975586, "logps/chosen": -309.76190185546875, "logps/rejected": -215.86572265625, "loss": 0.6672, "rewards/accuracies": 0.625, "rewards/chosen": 0.24339275062084198, "rewards/margins": 0.4155833125114441, "rewards/rejected": -0.1721905767917633, "step": 3403 }, { "epoch": 0.5264256717572009, "grad_norm": 6.5985026359558105, "learning_rate": 4.580707984877993e-06, "logits/chosen": 9.559341430664062, "logits/rejected": 7.2042388916015625, "logps/chosen": -429.82208251953125, "logps/rejected": -311.39715576171875, "loss": 0.6336, "rewards/accuracies": 0.625, "rewards/chosen": 0.5456064343452454, "rewards/margins": 0.6279048919677734, "rewards/rejected": -0.08229847252368927, "step": 3404 }, { "epoch": 0.526580320896965, "grad_norm": 5.497079849243164, "learning_rate": 4.58042158322832e-06, "logits/chosen": 7.990666389465332, "logits/rejected": 13.559745788574219, "logps/chosen": -259.53070068359375, "logps/rejected": -371.3773193359375, "loss": 0.7, "rewards/accuracies": 0.375, "rewards/chosen": 0.18173199892044067, "rewards/margins": 0.040539830923080444, "rewards/rejected": 0.14119216799736023, "step": 3405 }, { "epoch": 0.5267349700367292, "grad_norm": 6.048287391662598, "learning_rate": 4.580135181578646e-06, "logits/chosen": 12.805288314819336, "logits/rejected": 5.341282844543457, "logps/chosen": -263.8695068359375, "logps/rejected": -167.55691528320312, "loss": 0.6966, "rewards/accuracies": 0.375, "rewards/chosen": 0.24285611510276794, "rewards/margins": 0.027521885931491852, "rewards/rejected": 0.2153342217206955, "step": 3406 }, { "epoch": 0.5268896191764934, "grad_norm": 4.9026336669921875, "learning_rate": 4.579848779928973e-06, "logits/chosen": 17.73627471923828, "logits/rejected": 9.94194507598877, "logps/chosen": -448.31109619140625, "logps/rejected": -341.8060302734375, "loss": 0.4685, "rewards/accuracies": 0.875, "rewards/chosen": 0.5143506526947021, "rewards/margins": 0.5725738406181335, "rewards/rejected": -0.05822315439581871, "step": 3407 }, { "epoch": 0.5270442683162575, "grad_norm": 6.575826168060303, "learning_rate": 4.579562378279299e-06, "logits/chosen": 8.31064224243164, "logits/rejected": 5.723313331604004, "logps/chosen": -377.745361328125, "logps/rejected": -327.5417175292969, "loss": 0.5335, "rewards/accuracies": 0.875, "rewards/chosen": 0.5305577516555786, "rewards/margins": 0.4008696675300598, "rewards/rejected": 0.12968809902668, "step": 3408 }, { "epoch": 0.5271989174560217, "grad_norm": 8.528643608093262, "learning_rate": 4.5792759766296255e-06, "logits/chosen": 11.447997093200684, "logits/rejected": 15.734691619873047, "logps/chosen": -321.5065002441406, "logps/rejected": -374.7230224609375, "loss": 0.8825, "rewards/accuracies": 0.375, "rewards/chosen": 0.09364481270313263, "rewards/margins": -0.1999102532863617, "rewards/rejected": 0.29355505108833313, "step": 3409 }, { "epoch": 0.5273535665957858, "grad_norm": 6.614365100860596, "learning_rate": 4.578989574979952e-06, "logits/chosen": 0.38496047258377075, "logits/rejected": 2.333106756210327, "logps/chosen": -209.05752563476562, "logps/rejected": -259.7377014160156, "loss": 0.8604, "rewards/accuracies": 0.625, "rewards/chosen": 0.050031304359436035, "rewards/margins": 0.10323281586170197, "rewards/rejected": -0.053201496601104736, "step": 3410 }, { "epoch": 0.52750821573555, "grad_norm": 5.590544700622559, "learning_rate": 4.578703173330279e-06, "logits/chosen": 4.341080188751221, "logits/rejected": 2.4473624229431152, "logps/chosen": -257.56475830078125, "logps/rejected": -172.60374450683594, "loss": 0.7988, "rewards/accuracies": 0.625, "rewards/chosen": 0.1192631721496582, "rewards/margins": -0.10656829178333282, "rewards/rejected": 0.22583146393299103, "step": 3411 }, { "epoch": 0.5276628648753141, "grad_norm": 5.8873186111450195, "learning_rate": 4.578416771680605e-06, "logits/chosen": 9.85287094116211, "logits/rejected": 6.649930953979492, "logps/chosen": -464.3546142578125, "logps/rejected": -346.43798828125, "loss": 0.4718, "rewards/accuracies": 0.75, "rewards/chosen": 0.6190974116325378, "rewards/margins": 0.5632246136665344, "rewards/rejected": 0.055872876197099686, "step": 3412 }, { "epoch": 0.5278175140150783, "grad_norm": 5.588597297668457, "learning_rate": 4.578130370030931e-06, "logits/chosen": 9.938053131103516, "logits/rejected": 0.3761177659034729, "logps/chosen": -347.6468200683594, "logps/rejected": -237.06393432617188, "loss": 0.6549, "rewards/accuracies": 0.625, "rewards/chosen": 0.40525197982788086, "rewards/margins": 0.13223856687545776, "rewards/rejected": 0.2730134129524231, "step": 3413 }, { "epoch": 0.5279721631548424, "grad_norm": 12.130578994750977, "learning_rate": 4.577843968381258e-06, "logits/chosen": 10.577235221862793, "logits/rejected": 11.355010986328125, "logps/chosen": -255.88067626953125, "logps/rejected": -193.0601806640625, "loss": 0.6929, "rewards/accuracies": 0.5, "rewards/chosen": 0.3199271559715271, "rewards/margins": 0.22729697823524475, "rewards/rejected": 0.09263019263744354, "step": 3414 }, { "epoch": 0.5281268122946066, "grad_norm": 6.261444091796875, "learning_rate": 4.5775575667315845e-06, "logits/chosen": 8.687597274780273, "logits/rejected": -0.03992784023284912, "logps/chosen": -406.1783447265625, "logps/rejected": -225.51840209960938, "loss": 0.5584, "rewards/accuracies": 0.625, "rewards/chosen": 0.6052923798561096, "rewards/margins": 0.419197678565979, "rewards/rejected": 0.18609467148780823, "step": 3415 }, { "epoch": 0.5282814614343708, "grad_norm": 5.070169925689697, "learning_rate": 4.577271165081911e-06, "logits/chosen": 8.9213228225708, "logits/rejected": 7.145798683166504, "logps/chosen": -160.59693908691406, "logps/rejected": -112.22850799560547, "loss": 0.6627, "rewards/accuracies": 0.625, "rewards/chosen": 0.095132976770401, "rewards/margins": 0.11526663601398468, "rewards/rejected": -0.02013365551829338, "step": 3416 }, { "epoch": 0.528436110574135, "grad_norm": 5.485719203948975, "learning_rate": 4.576984763432238e-06, "logits/chosen": 11.692569732666016, "logits/rejected": 9.728137016296387, "logps/chosen": -276.0697937011719, "logps/rejected": -222.04473876953125, "loss": 0.6553, "rewards/accuracies": 0.5, "rewards/chosen": 0.400642067193985, "rewards/margins": 0.09185998141765594, "rewards/rejected": 0.30878210067749023, "step": 3417 }, { "epoch": 0.5285907597138991, "grad_norm": 5.471063613891602, "learning_rate": 4.5766983617825645e-06, "logits/chosen": 14.95722770690918, "logits/rejected": 12.853141784667969, "logps/chosen": -306.56756591796875, "logps/rejected": -268.8677673339844, "loss": 0.6376, "rewards/accuracies": 0.625, "rewards/chosen": 0.19057464599609375, "rewards/margins": 0.17712116241455078, "rewards/rejected": 0.013453483581542969, "step": 3418 }, { "epoch": 0.5287454088536633, "grad_norm": 5.295947551727295, "learning_rate": 4.57641196013289e-06, "logits/chosen": 8.691012382507324, "logits/rejected": 2.7754409313201904, "logps/chosen": -253.4430694580078, "logps/rejected": -248.24066162109375, "loss": 0.5742, "rewards/accuracies": 0.625, "rewards/chosen": 0.41307583451271057, "rewards/margins": 0.45983076095581055, "rewards/rejected": -0.04675493389368057, "step": 3419 }, { "epoch": 0.5289000579934274, "grad_norm": 5.374020099639893, "learning_rate": 4.576125558483217e-06, "logits/chosen": 7.413866996765137, "logits/rejected": 5.250053405761719, "logps/chosen": -220.97207641601562, "logps/rejected": -230.5499267578125, "loss": 0.5933, "rewards/accuracies": 0.625, "rewards/chosen": 0.46011465787887573, "rewards/margins": 0.24127589166164398, "rewards/rejected": 0.21883878111839294, "step": 3420 }, { "epoch": 0.5290547071331916, "grad_norm": 4.615992069244385, "learning_rate": 4.5758391568335436e-06, "logits/chosen": 9.28696346282959, "logits/rejected": 7.412108421325684, "logps/chosen": -220.29551696777344, "logps/rejected": -184.72955322265625, "loss": 0.6557, "rewards/accuracies": 0.625, "rewards/chosen": 0.009767621755599976, "rewards/margins": 0.21574127674102783, "rewards/rejected": -0.20597365498542786, "step": 3421 }, { "epoch": 0.5292093562729557, "grad_norm": 10.78402328491211, "learning_rate": 4.57555275518387e-06, "logits/chosen": 7.141679763793945, "logits/rejected": 1.6123631000518799, "logps/chosen": -270.43634033203125, "logps/rejected": -275.4430236816406, "loss": 0.7058, "rewards/accuracies": 0.625, "rewards/chosen": 0.11122465878725052, "rewards/margins": 0.05174332112073898, "rewards/rejected": 0.05948131904006004, "step": 3422 }, { "epoch": 0.5293640054127199, "grad_norm": 6.367552280426025, "learning_rate": 4.575266353534197e-06, "logits/chosen": 8.804319381713867, "logits/rejected": 9.815238952636719, "logps/chosen": -401.929443359375, "logps/rejected": -480.4015197753906, "loss": 0.5995, "rewards/accuracies": 0.625, "rewards/chosen": 0.19442030787467957, "rewards/margins": 0.22161945700645447, "rewards/rejected": -0.027199167758226395, "step": 3423 }, { "epoch": 0.529518654552484, "grad_norm": 5.090580463409424, "learning_rate": 4.5749799518845235e-06, "logits/chosen": 8.891847610473633, "logits/rejected": 9.425918579101562, "logps/chosen": -274.53045654296875, "logps/rejected": -257.40093994140625, "loss": 0.7402, "rewards/accuracies": 0.5, "rewards/chosen": 0.2104538083076477, "rewards/margins": -0.07750041037797928, "rewards/rejected": 0.2879542410373688, "step": 3424 }, { "epoch": 0.5296733036922482, "grad_norm": 4.47688102722168, "learning_rate": 4.57469355023485e-06, "logits/chosen": 4.2271246910095215, "logits/rejected": 0.5890989303588867, "logps/chosen": -290.80743408203125, "logps/rejected": -184.19851684570312, "loss": 0.7448, "rewards/accuracies": 0.375, "rewards/chosen": 0.17444077134132385, "rewards/margins": 0.025738459080457687, "rewards/rejected": 0.14870232343673706, "step": 3425 }, { "epoch": 0.5298279528320123, "grad_norm": 5.596575736999512, "learning_rate": 4.574407148585176e-06, "logits/chosen": 8.294352531433105, "logits/rejected": 3.2890946865081787, "logps/chosen": -211.54519653320312, "logps/rejected": -205.9228515625, "loss": 0.6152, "rewards/accuracies": 0.75, "rewards/chosen": 0.11202490329742432, "rewards/margins": 0.2089504450559616, "rewards/rejected": -0.09692555665969849, "step": 3426 }, { "epoch": 0.5299826019717765, "grad_norm": 6.116458892822266, "learning_rate": 4.574120746935503e-06, "logits/chosen": 7.7513203620910645, "logits/rejected": 9.854928970336914, "logps/chosen": -206.88775634765625, "logps/rejected": -222.33245849609375, "loss": 0.7963, "rewards/accuracies": 0.5, "rewards/chosen": 0.3991096615791321, "rewards/margins": -0.06701162457466125, "rewards/rejected": 0.46612128615379333, "step": 3427 }, { "epoch": 0.5301372511115406, "grad_norm": 4.590334892272949, "learning_rate": 4.573834345285829e-06, "logits/chosen": 12.637147903442383, "logits/rejected": 8.044875144958496, "logps/chosen": -279.09881591796875, "logps/rejected": -281.741455078125, "loss": 0.5024, "rewards/accuracies": 0.75, "rewards/chosen": 0.3421364724636078, "rewards/margins": 0.4927014112472534, "rewards/rejected": -0.15056495368480682, "step": 3428 }, { "epoch": 0.5302919002513049, "grad_norm": 5.477031707763672, "learning_rate": 4.573547943636156e-06, "logits/chosen": 8.205026626586914, "logits/rejected": 5.203507423400879, "logps/chosen": -258.4432678222656, "logps/rejected": -204.49960327148438, "loss": 0.8799, "rewards/accuracies": 0.375, "rewards/chosen": -0.04098685085773468, "rewards/margins": -0.08836564421653748, "rewards/rejected": 0.04737880453467369, "step": 3429 }, { "epoch": 0.530446549391069, "grad_norm": 5.692229270935059, "learning_rate": 4.5732615419864826e-06, "logits/chosen": 10.317362785339355, "logits/rejected": 4.045015811920166, "logps/chosen": -328.2392272949219, "logps/rejected": -254.13131713867188, "loss": 0.7093, "rewards/accuracies": 0.5, "rewards/chosen": 0.4088035821914673, "rewards/margins": 0.18261361122131348, "rewards/rejected": 0.22618995606899261, "step": 3430 }, { "epoch": 0.5306011985308332, "grad_norm": 4.959759712219238, "learning_rate": 4.572975140336809e-06, "logits/chosen": 8.069620132446289, "logits/rejected": 8.604790687561035, "logps/chosen": -246.03280639648438, "logps/rejected": -237.962158203125, "loss": 0.6758, "rewards/accuracies": 0.625, "rewards/chosen": 0.2610960006713867, "rewards/margins": 0.07367493957281113, "rewards/rejected": 0.187421053647995, "step": 3431 }, { "epoch": 0.5307558476705974, "grad_norm": 4.526830673217773, "learning_rate": 4.572688738687136e-06, "logits/chosen": 11.776190757751465, "logits/rejected": 10.155450820922852, "logps/chosen": -236.45068359375, "logps/rejected": -173.79054260253906, "loss": 0.6356, "rewards/accuracies": 0.625, "rewards/chosen": 0.3019082546234131, "rewards/margins": 0.15942107141017914, "rewards/rejected": 0.14248719811439514, "step": 3432 }, { "epoch": 0.5309104968103615, "grad_norm": 7.972452640533447, "learning_rate": 4.572402337037462e-06, "logits/chosen": 10.512863159179688, "logits/rejected": 14.09266185760498, "logps/chosen": -246.50445556640625, "logps/rejected": -273.2292785644531, "loss": 0.7721, "rewards/accuracies": 0.5, "rewards/chosen": 0.1766868233680725, "rewards/margins": -0.11256174743175507, "rewards/rejected": 0.2892485558986664, "step": 3433 }, { "epoch": 0.5310651459501257, "grad_norm": 3.793704032897949, "learning_rate": 4.572115935387788e-06, "logits/chosen": 10.908559799194336, "logits/rejected": 11.356244087219238, "logps/chosen": -166.76580810546875, "logps/rejected": -197.5608673095703, "loss": 0.7117, "rewards/accuracies": 0.5, "rewards/chosen": 0.1815110296010971, "rewards/margins": 0.019025102257728577, "rewards/rejected": 0.16248592734336853, "step": 3434 }, { "epoch": 0.5312197950898898, "grad_norm": 5.472989082336426, "learning_rate": 4.571829533738115e-06, "logits/chosen": 12.45029067993164, "logits/rejected": 11.325197219848633, "logps/chosen": -315.3360595703125, "logps/rejected": -245.63270568847656, "loss": 0.6477, "rewards/accuracies": 0.5, "rewards/chosen": 0.4961656332015991, "rewards/margins": 0.21825763583183289, "rewards/rejected": 0.27790799736976624, "step": 3435 }, { "epoch": 0.531374444229654, "grad_norm": 3.6238322257995605, "learning_rate": 4.571543132088442e-06, "logits/chosen": 10.989975929260254, "logits/rejected": 10.514347076416016, "logps/chosen": -161.6402587890625, "logps/rejected": -210.82290649414062, "loss": 0.5314, "rewards/accuracies": 0.875, "rewards/chosen": 0.15777117013931274, "rewards/margins": 0.3882730305194855, "rewards/rejected": -0.23050186038017273, "step": 3436 }, { "epoch": 0.5315290933694181, "grad_norm": 5.02354097366333, "learning_rate": 4.5712567304387674e-06, "logits/chosen": 13.740961074829102, "logits/rejected": 5.96253776550293, "logps/chosen": -337.1373596191406, "logps/rejected": -210.3853759765625, "loss": 0.5512, "rewards/accuracies": 0.75, "rewards/chosen": 0.3758665919303894, "rewards/margins": 0.5395940542221069, "rewards/rejected": -0.16372743248939514, "step": 3437 }, { "epoch": 0.5316837425091823, "grad_norm": 6.157371520996094, "learning_rate": 4.570970328789094e-06, "logits/chosen": 10.211882591247559, "logits/rejected": 9.94230842590332, "logps/chosen": -374.38336181640625, "logps/rejected": -501.30780029296875, "loss": 0.4766, "rewards/accuracies": 0.75, "rewards/chosen": 0.7491763234138489, "rewards/margins": 0.6523413062095642, "rewards/rejected": 0.09683503210544586, "step": 3438 }, { "epoch": 0.5318383916489464, "grad_norm": 5.146474838256836, "learning_rate": 4.570683927139421e-06, "logits/chosen": 9.268468856811523, "logits/rejected": 2.533730983734131, "logps/chosen": -216.64859008789062, "logps/rejected": -178.5709991455078, "loss": 0.5518, "rewards/accuracies": 0.75, "rewards/chosen": 0.11346770077943802, "rewards/margins": 0.35315102338790894, "rewards/rejected": -0.23968330025672913, "step": 3439 }, { "epoch": 0.5319930407887106, "grad_norm": 4.453693866729736, "learning_rate": 4.570397525489747e-06, "logits/chosen": 11.58981704711914, "logits/rejected": 13.036358833312988, "logps/chosen": -186.1175079345703, "logps/rejected": -180.80081176757812, "loss": 0.6395, "rewards/accuracies": 0.75, "rewards/chosen": -0.01144113577902317, "rewards/margins": 0.17896823585033417, "rewards/rejected": -0.1904093623161316, "step": 3440 }, { "epoch": 0.5321476899284747, "grad_norm": 4.810494422912598, "learning_rate": 4.570111123840073e-06, "logits/chosen": 8.736467361450195, "logits/rejected": 6.427340030670166, "logps/chosen": -297.27703857421875, "logps/rejected": -246.56668090820312, "loss": 0.5648, "rewards/accuracies": 0.75, "rewards/chosen": 0.3800322115421295, "rewards/margins": 0.4904134273529053, "rewards/rejected": -0.11038121581077576, "step": 3441 }, { "epoch": 0.532302339068239, "grad_norm": 5.815372943878174, "learning_rate": 4.5698247221904e-06, "logits/chosen": 8.633378982543945, "logits/rejected": 14.381312370300293, "logps/chosen": -230.50863647460938, "logps/rejected": -365.73797607421875, "loss": 0.5441, "rewards/accuracies": 0.75, "rewards/chosen": 0.4519106149673462, "rewards/margins": 0.5051904320716858, "rewards/rejected": -0.05327984690666199, "step": 3442 }, { "epoch": 0.5324569882080031, "grad_norm": 34.49391555786133, "learning_rate": 4.5695383205407265e-06, "logits/chosen": 8.620685577392578, "logits/rejected": 7.740238666534424, "logps/chosen": -245.320068359375, "logps/rejected": -274.30926513671875, "loss": 0.5606, "rewards/accuracies": 0.75, "rewards/chosen": 0.39388230443000793, "rewards/margins": 0.34456202387809753, "rewards/rejected": 0.04932032525539398, "step": 3443 }, { "epoch": 0.5326116373477673, "grad_norm": 5.060355186462402, "learning_rate": 4.569251918891053e-06, "logits/chosen": 18.39594268798828, "logits/rejected": 10.890106201171875, "logps/chosen": -284.83502197265625, "logps/rejected": -254.40145874023438, "loss": 0.6, "rewards/accuracies": 0.625, "rewards/chosen": 0.8119302988052368, "rewards/margins": 0.24286359548568726, "rewards/rejected": 0.5690666437149048, "step": 3444 }, { "epoch": 0.5327662864875314, "grad_norm": 6.999647617340088, "learning_rate": 4.56896551724138e-06, "logits/chosen": 8.147418022155762, "logits/rejected": 5.280096054077148, "logps/chosen": -318.1426696777344, "logps/rejected": -175.38006591796875, "loss": 0.6786, "rewards/accuracies": 0.375, "rewards/chosen": 0.28220951557159424, "rewards/margins": 0.06300004571676254, "rewards/rejected": 0.2192094773054123, "step": 3445 }, { "epoch": 0.5329209356272956, "grad_norm": 7.354351043701172, "learning_rate": 4.568679115591706e-06, "logits/chosen": 6.794188976287842, "logits/rejected": 8.347644805908203, "logps/chosen": -298.014404296875, "logps/rejected": -269.04791259765625, "loss": 0.8629, "rewards/accuracies": 0.25, "rewards/chosen": 0.33979547023773193, "rewards/margins": -0.2117002308368683, "rewards/rejected": 0.5514957308769226, "step": 3446 }, { "epoch": 0.5330755847670597, "grad_norm": 5.101029872894287, "learning_rate": 4.568392713942032e-06, "logits/chosen": 9.201620101928711, "logits/rejected": 3.855217456817627, "logps/chosen": -300.31732177734375, "logps/rejected": -262.1998291015625, "loss": 0.4957, "rewards/accuracies": 1.0, "rewards/chosen": 0.5049896240234375, "rewards/margins": 0.45708370208740234, "rewards/rejected": 0.047905925661325455, "step": 3447 }, { "epoch": 0.5332302339068239, "grad_norm": 4.7378435134887695, "learning_rate": 4.568106312292359e-06, "logits/chosen": 11.092121124267578, "logits/rejected": 6.4346137046813965, "logps/chosen": -200.4429931640625, "logps/rejected": -162.90960693359375, "loss": 0.7474, "rewards/accuracies": 0.5, "rewards/chosen": 0.19061753153800964, "rewards/margins": 0.013008639216423035, "rewards/rejected": 0.1776089072227478, "step": 3448 }, { "epoch": 0.533384883046588, "grad_norm": 6.047914505004883, "learning_rate": 4.5678199106426855e-06, "logits/chosen": 4.059939861297607, "logits/rejected": 6.663802623748779, "logps/chosen": -211.39356994628906, "logps/rejected": -272.5367431640625, "loss": 0.6125, "rewards/accuracies": 0.875, "rewards/chosen": 0.6574857234954834, "rewards/margins": 0.22315870225429535, "rewards/rejected": 0.43432706594467163, "step": 3449 }, { "epoch": 0.5335395321863522, "grad_norm": 6.689841270446777, "learning_rate": 4.567533508993012e-06, "logits/chosen": 9.043456077575684, "logits/rejected": 6.569450378417969, "logps/chosen": -318.608154296875, "logps/rejected": -347.499267578125, "loss": 0.4927, "rewards/accuracies": 0.875, "rewards/chosen": 0.38673263788223267, "rewards/margins": 0.6726446151733398, "rewards/rejected": -0.2859119474887848, "step": 3450 }, { "epoch": 0.5336941813261163, "grad_norm": 4.430845260620117, "learning_rate": 4.567247107343339e-06, "logits/chosen": 17.865320205688477, "logits/rejected": 3.8432884216308594, "logps/chosen": -328.9352111816406, "logps/rejected": -199.6381378173828, "loss": 0.3972, "rewards/accuracies": 0.875, "rewards/chosen": 0.6083768606185913, "rewards/margins": 0.9470100402832031, "rewards/rejected": -0.3386331796646118, "step": 3451 }, { "epoch": 0.5338488304658805, "grad_norm": 3.575376272201538, "learning_rate": 4.566960705693665e-06, "logits/chosen": 9.262465476989746, "logits/rejected": 6.304553031921387, "logps/chosen": -197.23269653320312, "logps/rejected": -126.91417694091797, "loss": 0.5626, "rewards/accuracies": 0.625, "rewards/chosen": 0.3719860911369324, "rewards/margins": 0.3770997226238251, "rewards/rejected": -0.005113624036312103, "step": 3452 }, { "epoch": 0.5340034796056446, "grad_norm": 4.561135292053223, "learning_rate": 4.566674304043991e-06, "logits/chosen": 13.124151229858398, "logits/rejected": 7.113224029541016, "logps/chosen": -349.787841796875, "logps/rejected": -271.4983825683594, "loss": 0.4782, "rewards/accuracies": 0.75, "rewards/chosen": 0.5332621932029724, "rewards/margins": 0.7221713066101074, "rewards/rejected": -0.1889091432094574, "step": 3453 }, { "epoch": 0.5341581287454088, "grad_norm": 5.367588043212891, "learning_rate": 4.566387902394318e-06, "logits/chosen": 11.916638374328613, "logits/rejected": 5.375942707061768, "logps/chosen": -364.250244140625, "logps/rejected": -239.3827667236328, "loss": 0.7707, "rewards/accuracies": 0.5, "rewards/chosen": 0.05872933566570282, "rewards/margins": -0.05809955298900604, "rewards/rejected": 0.11682890355587006, "step": 3454 }, { "epoch": 0.5343127778851731, "grad_norm": 4.087586402893066, "learning_rate": 4.566101500744645e-06, "logits/chosen": 10.04790210723877, "logits/rejected": 3.5160884857177734, "logps/chosen": -282.19976806640625, "logps/rejected": -177.74288940429688, "loss": 0.5623, "rewards/accuracies": 0.875, "rewards/chosen": 0.5670511722564697, "rewards/margins": 0.32898226380348206, "rewards/rejected": 0.23806887865066528, "step": 3455 }, { "epoch": 0.5344674270249372, "grad_norm": 6.603880405426025, "learning_rate": 4.565815099094971e-06, "logits/chosen": 6.2787628173828125, "logits/rejected": 7.528284072875977, "logps/chosen": -241.634521484375, "logps/rejected": -232.2600860595703, "loss": 0.8805, "rewards/accuracies": 0.625, "rewards/chosen": 0.09804821014404297, "rewards/margins": -0.23695436120033264, "rewards/rejected": 0.3350025713443756, "step": 3456 }, { "epoch": 0.5346220761647014, "grad_norm": 5.394029140472412, "learning_rate": 4.565528697445298e-06, "logits/chosen": 9.487994194030762, "logits/rejected": 3.9948291778564453, "logps/chosen": -425.2696228027344, "logps/rejected": -370.9013671875, "loss": 0.4324, "rewards/accuracies": 0.875, "rewards/chosen": 0.5432972311973572, "rewards/margins": 0.8604394197463989, "rewards/rejected": -0.31714215874671936, "step": 3457 }, { "epoch": 0.5347767253044655, "grad_norm": 5.575886249542236, "learning_rate": 4.5652422957956245e-06, "logits/chosen": 10.354061126708984, "logits/rejected": 9.322901725769043, "logps/chosen": -323.078857421875, "logps/rejected": -292.0771179199219, "loss": 0.6504, "rewards/accuracies": 0.625, "rewards/chosen": 0.558993935585022, "rewards/margins": 0.23440010845661163, "rewards/rejected": 0.32459378242492676, "step": 3458 }, { "epoch": 0.5349313744442297, "grad_norm": 5.143550872802734, "learning_rate": 4.56495589414595e-06, "logits/chosen": 4.204448699951172, "logits/rejected": 4.879000663757324, "logps/chosen": -282.2350158691406, "logps/rejected": -269.42864990234375, "loss": 0.7038, "rewards/accuracies": 0.5, "rewards/chosen": 0.5985470414161682, "rewards/margins": 0.17674662172794342, "rewards/rejected": 0.421800434589386, "step": 3459 }, { "epoch": 0.5350860235839938, "grad_norm": 7.072441577911377, "learning_rate": 4.564669492496277e-06, "logits/chosen": 6.972856521606445, "logits/rejected": 10.14965534210205, "logps/chosen": -232.37599182128906, "logps/rejected": -250.8720703125, "loss": 1.0128, "rewards/accuracies": 0.5, "rewards/chosen": 0.25090086460113525, "rewards/margins": -0.4838963747024536, "rewards/rejected": 0.7347972989082336, "step": 3460 }, { "epoch": 0.535240672723758, "grad_norm": 5.6928486824035645, "learning_rate": 4.564383090846604e-06, "logits/chosen": 5.260196685791016, "logits/rejected": 11.90127944946289, "logps/chosen": -186.59771728515625, "logps/rejected": -212.52285766601562, "loss": 0.7685, "rewards/accuracies": 0.375, "rewards/chosen": 0.06098297983407974, "rewards/margins": -0.07548104226589203, "rewards/rejected": 0.13646404445171356, "step": 3461 }, { "epoch": 0.5353953218635221, "grad_norm": 9.665477752685547, "learning_rate": 4.56409668919693e-06, "logits/chosen": 7.550838947296143, "logits/rejected": 5.062317848205566, "logps/chosen": -312.83795166015625, "logps/rejected": -275.38922119140625, "loss": 0.5556, "rewards/accuracies": 0.75, "rewards/chosen": 0.6526155471801758, "rewards/margins": 0.3766680061817169, "rewards/rejected": 0.27594754099845886, "step": 3462 }, { "epoch": 0.5355499710032863, "grad_norm": 4.193955898284912, "learning_rate": 4.563810287547257e-06, "logits/chosen": 8.306144714355469, "logits/rejected": 9.304515838623047, "logps/chosen": -259.090087890625, "logps/rejected": -236.70970153808594, "loss": 0.488, "rewards/accuracies": 0.75, "rewards/chosen": 0.41990602016448975, "rewards/margins": 0.6112083792686462, "rewards/rejected": -0.19130229949951172, "step": 3463 }, { "epoch": 0.5357046201430504, "grad_norm": 4.4209184646606445, "learning_rate": 4.563523885897584e-06, "logits/chosen": 11.47598648071289, "logits/rejected": 10.894248008728027, "logps/chosen": -199.3096160888672, "logps/rejected": -177.6346893310547, "loss": 0.6981, "rewards/accuracies": 0.375, "rewards/chosen": 0.20075112581253052, "rewards/margins": 0.065799281001091, "rewards/rejected": 0.13495182991027832, "step": 3464 }, { "epoch": 0.5358592692828146, "grad_norm": 12.39144229888916, "learning_rate": 4.56323748424791e-06, "logits/chosen": 9.40106201171875, "logits/rejected": 13.345853805541992, "logps/chosen": -237.79360961914062, "logps/rejected": -363.6505126953125, "loss": 0.8116, "rewards/accuracies": 0.375, "rewards/chosen": 0.06312908977270126, "rewards/margins": -0.12420602142810822, "rewards/rejected": 0.18733511865139008, "step": 3465 }, { "epoch": 0.5360139184225787, "grad_norm": 6.134507179260254, "learning_rate": 4.562951082598236e-06, "logits/chosen": 3.9755682945251465, "logits/rejected": 9.521167755126953, "logps/chosen": -245.59780883789062, "logps/rejected": -218.2001953125, "loss": 0.9435, "rewards/accuracies": 0.375, "rewards/chosen": 0.17030514776706696, "rewards/margins": -0.36190271377563477, "rewards/rejected": 0.5322078466415405, "step": 3466 }, { "epoch": 0.5361685675623429, "grad_norm": 6.467967987060547, "learning_rate": 4.562664680948563e-06, "logits/chosen": 13.483763694763184, "logits/rejected": 9.015125274658203, "logps/chosen": -349.2707214355469, "logps/rejected": -347.36566162109375, "loss": 0.6221, "rewards/accuracies": 0.625, "rewards/chosen": 0.8794007301330566, "rewards/margins": 0.2796288728713989, "rewards/rejected": 0.5997718572616577, "step": 3467 }, { "epoch": 0.5363232167021071, "grad_norm": 4.400718688964844, "learning_rate": 4.562378279298889e-06, "logits/chosen": 11.813966751098633, "logits/rejected": 10.170332908630371, "logps/chosen": -340.02508544921875, "logps/rejected": -283.0907897949219, "loss": 0.5529, "rewards/accuracies": 0.75, "rewards/chosen": 0.580676794052124, "rewards/margins": 0.3901478350162506, "rewards/rejected": 0.1905289590358734, "step": 3468 }, { "epoch": 0.5364778658418713, "grad_norm": 4.829706192016602, "learning_rate": 4.562091877649216e-06, "logits/chosen": 11.952885627746582, "logits/rejected": 5.693309783935547, "logps/chosen": -429.1151123046875, "logps/rejected": -335.4317932128906, "loss": 0.4599, "rewards/accuracies": 1.0, "rewards/chosen": 0.6229270696640015, "rewards/margins": 0.6170374751091003, "rewards/rejected": 0.005889661610126495, "step": 3469 }, { "epoch": 0.5366325149816354, "grad_norm": 11.11780071258545, "learning_rate": 4.561805475999543e-06, "logits/chosen": 1.0482724905014038, "logits/rejected": 4.3756303787231445, "logps/chosen": -210.3115997314453, "logps/rejected": -444.1589050292969, "loss": 0.9004, "rewards/accuracies": 0.625, "rewards/chosen": 0.23940812051296234, "rewards/margins": 0.06473705172538757, "rewards/rejected": 0.17467106878757477, "step": 3470 }, { "epoch": 0.5367871641213996, "grad_norm": 5.7888946533203125, "learning_rate": 4.5615190743498685e-06, "logits/chosen": 6.986234664916992, "logits/rejected": 2.7387969493865967, "logps/chosen": -316.13995361328125, "logps/rejected": -260.50909423828125, "loss": 0.5771, "rewards/accuracies": 0.75, "rewards/chosen": 0.6364717483520508, "rewards/margins": 0.3485320210456848, "rewards/rejected": 0.28793975710868835, "step": 3471 }, { "epoch": 0.5369418132611637, "grad_norm": 4.900925159454346, "learning_rate": 4.561232672700195e-06, "logits/chosen": 7.6199846267700195, "logits/rejected": 4.588237762451172, "logps/chosen": -315.3199462890625, "logps/rejected": -267.2480773925781, "loss": 0.6199, "rewards/accuracies": 0.625, "rewards/chosen": 0.4717210531234741, "rewards/margins": 0.3848683834075928, "rewards/rejected": 0.08685269951820374, "step": 3472 }, { "epoch": 0.5370964624009279, "grad_norm": 5.508366584777832, "learning_rate": 4.560946271050522e-06, "logits/chosen": 9.255306243896484, "logits/rejected": 7.784077167510986, "logps/chosen": -371.81719970703125, "logps/rejected": -295.12811279296875, "loss": 0.685, "rewards/accuracies": 0.625, "rewards/chosen": 0.5096185207366943, "rewards/margins": 0.06269489973783493, "rewards/rejected": 0.4469236433506012, "step": 3473 }, { "epoch": 0.537251111540692, "grad_norm": 6.148151397705078, "learning_rate": 4.560659869400848e-06, "logits/chosen": 11.678831100463867, "logits/rejected": 4.39997673034668, "logps/chosen": -471.0237121582031, "logps/rejected": -311.6003723144531, "loss": 0.7037, "rewards/accuracies": 0.5, "rewards/chosen": 0.4104408025741577, "rewards/margins": 0.121530681848526, "rewards/rejected": 0.2889100909233093, "step": 3474 }, { "epoch": 0.5374057606804562, "grad_norm": 5.68243408203125, "learning_rate": 4.560373467751174e-06, "logits/chosen": 12.455754280090332, "logits/rejected": 6.358894348144531, "logps/chosen": -320.890625, "logps/rejected": -246.291259765625, "loss": 0.6719, "rewards/accuracies": 0.375, "rewards/chosen": 0.40316808223724365, "rewards/margins": 0.18727856874465942, "rewards/rejected": 0.215889573097229, "step": 3475 }, { "epoch": 0.5375604098202204, "grad_norm": 8.281245231628418, "learning_rate": 4.560087066101501e-06, "logits/chosen": 10.458235740661621, "logits/rejected": 8.534299850463867, "logps/chosen": -385.112060546875, "logps/rejected": -315.11456298828125, "loss": 0.8326, "rewards/accuracies": 0.25, "rewards/chosen": 0.313823401927948, "rewards/margins": -0.14138521254062653, "rewards/rejected": 0.4552086591720581, "step": 3476 }, { "epoch": 0.5377150589599845, "grad_norm": 7.64885139465332, "learning_rate": 4.5598006644518275e-06, "logits/chosen": 10.93093490600586, "logits/rejected": 9.699134826660156, "logps/chosen": -151.10316467285156, "logps/rejected": -153.74758911132812, "loss": 0.6304, "rewards/accuracies": 0.5, "rewards/chosen": 0.36934810876846313, "rewards/margins": 0.32461434602737427, "rewards/rejected": 0.04473382234573364, "step": 3477 }, { "epoch": 0.5378697080997487, "grad_norm": 4.424860954284668, "learning_rate": 4.559514262802154e-06, "logits/chosen": 13.512920379638672, "logits/rejected": 9.003018379211426, "logps/chosen": -341.8384704589844, "logps/rejected": -246.58056640625, "loss": 0.5653, "rewards/accuracies": 0.625, "rewards/chosen": 0.4532545208930969, "rewards/margins": 0.3818611204624176, "rewards/rejected": 0.07139340043067932, "step": 3478 }, { "epoch": 0.5380243572395128, "grad_norm": 5.093382358551025, "learning_rate": 4.55922786115248e-06, "logits/chosen": 9.175430297851562, "logits/rejected": 8.181477546691895, "logps/chosen": -277.0560302734375, "logps/rejected": -286.8317565917969, "loss": 0.5759, "rewards/accuracies": 0.625, "rewards/chosen": 0.3723825514316559, "rewards/margins": 0.5575502514839172, "rewards/rejected": -0.18516770005226135, "step": 3479 }, { "epoch": 0.5381790063792771, "grad_norm": 5.199785232543945, "learning_rate": 4.558941459502807e-06, "logits/chosen": 4.412034034729004, "logits/rejected": 2.59041690826416, "logps/chosen": -261.973388671875, "logps/rejected": -209.0919189453125, "loss": 0.5153, "rewards/accuracies": 0.875, "rewards/chosen": 0.1938885599374771, "rewards/margins": 0.5277411341667175, "rewards/rejected": -0.3338525891304016, "step": 3480 }, { "epoch": 0.5383336555190412, "grad_norm": 4.076631546020508, "learning_rate": 4.558655057853133e-06, "logits/chosen": 11.479317665100098, "logits/rejected": 3.094334840774536, "logps/chosen": -283.7841796875, "logps/rejected": -185.67349243164062, "loss": 0.5665, "rewards/accuracies": 0.625, "rewards/chosen": 0.2657521367073059, "rewards/margins": 0.45385316014289856, "rewards/rejected": -0.18810106813907623, "step": 3481 }, { "epoch": 0.5384883046588054, "grad_norm": 5.191612243652344, "learning_rate": 4.55836865620346e-06, "logits/chosen": 8.222091674804688, "logits/rejected": 6.054858684539795, "logps/chosen": -247.23997497558594, "logps/rejected": -219.9218292236328, "loss": 0.7472, "rewards/accuracies": 0.625, "rewards/chosen": 0.2999928891658783, "rewards/margins": 0.07902643084526062, "rewards/rejected": 0.22096644341945648, "step": 3482 }, { "epoch": 0.5386429537985695, "grad_norm": 5.727681636810303, "learning_rate": 4.5580822545537866e-06, "logits/chosen": 9.748743057250977, "logits/rejected": 12.55950927734375, "logps/chosen": -257.0787048339844, "logps/rejected": -303.491455078125, "loss": 0.7742, "rewards/accuracies": 0.375, "rewards/chosen": -0.00948730856180191, "rewards/margins": -0.09779801964759827, "rewards/rejected": 0.08831073343753815, "step": 3483 }, { "epoch": 0.5387976029383337, "grad_norm": 4.163386344909668, "learning_rate": 4.557795852904113e-06, "logits/chosen": 11.34853458404541, "logits/rejected": 6.750984191894531, "logps/chosen": -278.1275939941406, "logps/rejected": -209.5895538330078, "loss": 0.5179, "rewards/accuracies": 0.875, "rewards/chosen": 0.643409252166748, "rewards/margins": 0.4344027042388916, "rewards/rejected": 0.20900659263134003, "step": 3484 }, { "epoch": 0.5389522520780978, "grad_norm": 7.152817726135254, "learning_rate": 4.557509451254439e-06, "logits/chosen": 9.748087882995605, "logits/rejected": 9.264678001403809, "logps/chosen": -263.6056213378906, "logps/rejected": -268.05963134765625, "loss": 0.7036, "rewards/accuracies": 0.625, "rewards/chosen": 0.2854057550430298, "rewards/margins": 0.1319640576839447, "rewards/rejected": 0.1534416675567627, "step": 3485 }, { "epoch": 0.539106901217862, "grad_norm": 5.708414554595947, "learning_rate": 4.557223049604766e-06, "logits/chosen": 7.172023296356201, "logits/rejected": 6.871058940887451, "logps/chosen": -222.507080078125, "logps/rejected": -229.22044372558594, "loss": 0.6955, "rewards/accuracies": 0.75, "rewards/chosen": 0.02778664231300354, "rewards/margins": 0.08936890959739685, "rewards/rejected": -0.061582282185554504, "step": 3486 }, { "epoch": 0.5392615503576261, "grad_norm": 4.342827796936035, "learning_rate": 4.556936647955092e-06, "logits/chosen": 11.272900581359863, "logits/rejected": 6.238199234008789, "logps/chosen": -276.654296875, "logps/rejected": -161.07269287109375, "loss": 0.459, "rewards/accuracies": 0.875, "rewards/chosen": 0.3879266381263733, "rewards/margins": 0.6644210815429688, "rewards/rejected": -0.27649441361427307, "step": 3487 }, { "epoch": 0.5394161994973903, "grad_norm": 8.439592361450195, "learning_rate": 4.556650246305419e-06, "logits/chosen": 6.385427474975586, "logits/rejected": 14.341268539428711, "logps/chosen": -251.87225341796875, "logps/rejected": -321.5287170410156, "loss": 0.9173, "rewards/accuracies": 0.25, "rewards/chosen": 0.21228301525115967, "rewards/margins": -0.37563538551330566, "rewards/rejected": 0.5879184007644653, "step": 3488 }, { "epoch": 0.5395708486371544, "grad_norm": 7.2970075607299805, "learning_rate": 4.556363844655746e-06, "logits/chosen": 13.9177885055542, "logits/rejected": 7.445833683013916, "logps/chosen": -419.8946838378906, "logps/rejected": -305.1208190917969, "loss": 0.6625, "rewards/accuracies": 0.5, "rewards/chosen": 0.3855099081993103, "rewards/margins": 0.1586289405822754, "rewards/rejected": 0.2268809676170349, "step": 3489 }, { "epoch": 0.5397254977769186, "grad_norm": 3.7492423057556152, "learning_rate": 4.556077443006072e-06, "logits/chosen": 12.175384521484375, "logits/rejected": 2.543306350708008, "logps/chosen": -332.6507568359375, "logps/rejected": -249.10455322265625, "loss": 0.3563, "rewards/accuracies": 1.0, "rewards/chosen": 0.5663467049598694, "rewards/margins": 0.9684622287750244, "rewards/rejected": -0.4021156132221222, "step": 3490 }, { "epoch": 0.5398801469166827, "grad_norm": 4.765916347503662, "learning_rate": 4.555791041356399e-06, "logits/chosen": 14.103438377380371, "logits/rejected": 7.959611892700195, "logps/chosen": -399.951171875, "logps/rejected": -341.9591369628906, "loss": 0.4355, "rewards/accuracies": 0.875, "rewards/chosen": 0.5194175243377686, "rewards/margins": 0.7439227104187012, "rewards/rejected": -0.22450514137744904, "step": 3491 }, { "epoch": 0.5400347960564469, "grad_norm": 5.645360469818115, "learning_rate": 4.555504639706725e-06, "logits/chosen": 8.773210525512695, "logits/rejected": 9.583088874816895, "logps/chosen": -359.5025634765625, "logps/rejected": -274.38800048828125, "loss": 0.6962, "rewards/accuracies": 0.375, "rewards/chosen": 0.5723686218261719, "rewards/margins": 0.0845758393406868, "rewards/rejected": 0.48779281973838806, "step": 3492 }, { "epoch": 0.5401894451962111, "grad_norm": 4.530862331390381, "learning_rate": 4.555218238057051e-06, "logits/chosen": 10.9473295211792, "logits/rejected": 9.631717681884766, "logps/chosen": -229.36935424804688, "logps/rejected": -207.13758850097656, "loss": 0.6873, "rewards/accuracies": 0.25, "rewards/chosen": 0.13448746502399445, "rewards/margins": 0.14714720845222473, "rewards/rejected": -0.01265973225235939, "step": 3493 }, { "epoch": 0.5403440943359753, "grad_norm": 4.913314342498779, "learning_rate": 4.554931836407378e-06, "logits/chosen": 11.646885871887207, "logits/rejected": 1.310779333114624, "logps/chosen": -235.92115783691406, "logps/rejected": -109.67567443847656, "loss": 0.6707, "rewards/accuracies": 0.5, "rewards/chosen": 0.20431068539619446, "rewards/margins": 0.17526467144489288, "rewards/rejected": 0.029046017676591873, "step": 3494 }, { "epoch": 0.5404987434757395, "grad_norm": 6.161779403686523, "learning_rate": 4.554645434757705e-06, "logits/chosen": 7.614030838012695, "logits/rejected": 7.1966447830200195, "logps/chosen": -326.32421875, "logps/rejected": -297.5206298828125, "loss": 0.6546, "rewards/accuracies": 0.625, "rewards/chosen": 0.34260284900665283, "rewards/margins": 0.27079612016677856, "rewards/rejected": 0.07180673629045486, "step": 3495 }, { "epoch": 0.5406533926155036, "grad_norm": 6.2343573570251465, "learning_rate": 4.554359033108031e-06, "logits/chosen": 8.52818489074707, "logits/rejected": 9.596920013427734, "logps/chosen": -281.6808166503906, "logps/rejected": -291.0545959472656, "loss": 0.7382, "rewards/accuracies": 0.5, "rewards/chosen": 0.02100316435098648, "rewards/margins": 0.05088706314563751, "rewards/rejected": -0.02988389879465103, "step": 3496 }, { "epoch": 0.5408080417552678, "grad_norm": 14.629050254821777, "learning_rate": 4.554072631458358e-06, "logits/chosen": 5.954355716705322, "logits/rejected": 5.152848720550537, "logps/chosen": -301.63629150390625, "logps/rejected": -280.51593017578125, "loss": 0.5686, "rewards/accuracies": 0.75, "rewards/chosen": 0.40612396597862244, "rewards/margins": 0.3150443434715271, "rewards/rejected": 0.09107962250709534, "step": 3497 }, { "epoch": 0.5409626908950319, "grad_norm": 4.812487602233887, "learning_rate": 4.553786229808685e-06, "logits/chosen": 6.442961692810059, "logits/rejected": 7.085052013397217, "logps/chosen": -296.0628662109375, "logps/rejected": -298.15814208984375, "loss": 0.5238, "rewards/accuracies": 0.625, "rewards/chosen": 0.37529078125953674, "rewards/margins": 0.4374125599861145, "rewards/rejected": -0.06212177127599716, "step": 3498 }, { "epoch": 0.5411173400347961, "grad_norm": 5.505073070526123, "learning_rate": 4.5534998281590104e-06, "logits/chosen": 12.14414119720459, "logits/rejected": 11.873077392578125, "logps/chosen": -372.3367004394531, "logps/rejected": -424.401123046875, "loss": 0.5587, "rewards/accuracies": 0.75, "rewards/chosen": 0.36589258909225464, "rewards/margins": 0.4328452944755554, "rewards/rejected": -0.06695270538330078, "step": 3499 }, { "epoch": 0.5412719891745602, "grad_norm": 4.925987243652344, "learning_rate": 4.553213426509337e-06, "logits/chosen": 11.93224048614502, "logits/rejected": 9.289567947387695, "logps/chosen": -230.70289611816406, "logps/rejected": -221.42782592773438, "loss": 0.6273, "rewards/accuracies": 0.75, "rewards/chosen": 0.38956716656684875, "rewards/margins": 0.2550494968891144, "rewards/rejected": 0.13451766967773438, "step": 3500 }, { "epoch": 0.5414266383143244, "grad_norm": 5.067691802978516, "learning_rate": 4.552927024859664e-06, "logits/chosen": 12.06839656829834, "logits/rejected": 9.073575973510742, "logps/chosen": -392.45635986328125, "logps/rejected": -247.975830078125, "loss": 0.6115, "rewards/accuracies": 0.75, "rewards/chosen": 0.3974451422691345, "rewards/margins": 0.29540586471557617, "rewards/rejected": 0.10203930735588074, "step": 3501 }, { "epoch": 0.5415812874540885, "grad_norm": 6.119301795959473, "learning_rate": 4.55264062320999e-06, "logits/chosen": 10.994044303894043, "logits/rejected": 9.389493942260742, "logps/chosen": -159.7493896484375, "logps/rejected": -155.42787170410156, "loss": 0.9328, "rewards/accuracies": 0.375, "rewards/chosen": -0.2420315444469452, "rewards/margins": -0.3132123351097107, "rewards/rejected": 0.0711807906627655, "step": 3502 }, { "epoch": 0.5417359365938527, "grad_norm": 7.779451370239258, "learning_rate": 4.552354221560317e-06, "logits/chosen": 7.071882724761963, "logits/rejected": 8.056337356567383, "logps/chosen": -311.45361328125, "logps/rejected": -358.69610595703125, "loss": 0.8465, "rewards/accuracies": 0.375, "rewards/chosen": 0.13432732224464417, "rewards/margins": -0.13533399999141693, "rewards/rejected": 0.2696613371372223, "step": 3503 }, { "epoch": 0.5418905857336168, "grad_norm": 5.105988025665283, "learning_rate": 4.552067819910644e-06, "logits/chosen": 3.39768123626709, "logits/rejected": -2.911684513092041, "logps/chosen": -244.96241760253906, "logps/rejected": -200.06918334960938, "loss": 0.6047, "rewards/accuracies": 0.75, "rewards/chosen": 0.23985296487808228, "rewards/margins": 0.32487720251083374, "rewards/rejected": -0.08502425253391266, "step": 3504 }, { "epoch": 0.542045234873381, "grad_norm": 5.381991386413574, "learning_rate": 4.5517814182609695e-06, "logits/chosen": 10.398744583129883, "logits/rejected": 9.1279935836792, "logps/chosen": -359.0007629394531, "logps/rejected": -264.85064697265625, "loss": 0.6982, "rewards/accuracies": 0.5, "rewards/chosen": 0.15377554297447205, "rewards/margins": 0.026801161468029022, "rewards/rejected": 0.12697440385818481, "step": 3505 }, { "epoch": 0.5421998840131452, "grad_norm": 5.803683757781982, "learning_rate": 4.551495016611296e-06, "logits/chosen": 6.138507843017578, "logits/rejected": 9.383905410766602, "logps/chosen": -330.3441162109375, "logps/rejected": -371.12237548828125, "loss": 0.5767, "rewards/accuracies": 0.875, "rewards/chosen": 0.2503851652145386, "rewards/margins": 0.27842631936073303, "rewards/rejected": -0.028041183948516846, "step": 3506 }, { "epoch": 0.5423545331529094, "grad_norm": 6.395082950592041, "learning_rate": 4.551208614961623e-06, "logits/chosen": 12.034193992614746, "logits/rejected": 7.217423439025879, "logps/chosen": -298.9798583984375, "logps/rejected": -269.906494140625, "loss": 0.7352, "rewards/accuracies": 0.25, "rewards/chosen": 0.01863950863480568, "rewards/margins": -0.0606483593583107, "rewards/rejected": 0.07928786426782608, "step": 3507 }, { "epoch": 0.5425091822926735, "grad_norm": 5.099301338195801, "learning_rate": 4.5509222133119494e-06, "logits/chosen": 11.359386444091797, "logits/rejected": 5.786442279815674, "logps/chosen": -423.5165710449219, "logps/rejected": -365.9211730957031, "loss": 0.5684, "rewards/accuracies": 0.625, "rewards/chosen": 0.8092166781425476, "rewards/margins": 0.33447590470314026, "rewards/rejected": 0.47474080324172974, "step": 3508 }, { "epoch": 0.5426638314324377, "grad_norm": 7.289186954498291, "learning_rate": 4.550635811662275e-06, "logits/chosen": 12.071268081665039, "logits/rejected": 17.04667854309082, "logps/chosen": -255.82632446289062, "logps/rejected": -277.09454345703125, "loss": 0.7829, "rewards/accuracies": 0.5, "rewards/chosen": 0.15634021162986755, "rewards/margins": -0.11880205571651459, "rewards/rejected": 0.27514228224754333, "step": 3509 }, { "epoch": 0.5428184805722018, "grad_norm": 7.643459320068359, "learning_rate": 4.550349410012602e-06, "logits/chosen": 11.09959602355957, "logits/rejected": 7.014785289764404, "logps/chosen": -376.5906066894531, "logps/rejected": -375.122314453125, "loss": 0.5591, "rewards/accuracies": 0.75, "rewards/chosen": 0.0003324747085571289, "rewards/margins": 0.6094013452529907, "rewards/rejected": -0.6090688705444336, "step": 3510 }, { "epoch": 0.542973129711966, "grad_norm": 3.691310405731201, "learning_rate": 4.5500630083629286e-06, "logits/chosen": 7.974476337432861, "logits/rejected": 9.409144401550293, "logps/chosen": -145.7262420654297, "logps/rejected": -188.11610412597656, "loss": 0.5545, "rewards/accuracies": 0.625, "rewards/chosen": 0.3247499465942383, "rewards/margins": 0.3733232617378235, "rewards/rejected": -0.048573315143585205, "step": 3511 }, { "epoch": 0.5431277788517301, "grad_norm": 3.073251724243164, "learning_rate": 4.549776606713255e-06, "logits/chosen": 8.224628448486328, "logits/rejected": 1.4071017503738403, "logps/chosen": -141.3560791015625, "logps/rejected": -102.92980194091797, "loss": 0.5295, "rewards/accuracies": 0.875, "rewards/chosen": 0.38744989037513733, "rewards/margins": 0.4002656638622284, "rewards/rejected": -0.012815780937671661, "step": 3512 }, { "epoch": 0.5432824279914943, "grad_norm": 8.070582389831543, "learning_rate": 4.549490205063581e-06, "logits/chosen": 6.5147809982299805, "logits/rejected": 6.872115612030029, "logps/chosen": -246.87306213378906, "logps/rejected": -185.63099670410156, "loss": 0.6892, "rewards/accuracies": 0.5, "rewards/chosen": 0.10099928081035614, "rewards/margins": 0.10853702574968338, "rewards/rejected": -0.007537752389907837, "step": 3513 }, { "epoch": 0.5434370771312584, "grad_norm": 5.410460948944092, "learning_rate": 4.549203803413908e-06, "logits/chosen": 14.582372665405273, "logits/rejected": 7.697572708129883, "logps/chosen": -305.4378356933594, "logps/rejected": -212.38340759277344, "loss": 0.6185, "rewards/accuracies": 0.5, "rewards/chosen": 0.3665239214897156, "rewards/margins": 0.24178943037986755, "rewards/rejected": 0.12473449110984802, "step": 3514 }, { "epoch": 0.5435917262710226, "grad_norm": 5.217390060424805, "learning_rate": 4.548917401764234e-06, "logits/chosen": 15.824353218078613, "logits/rejected": 5.598424911499023, "logps/chosen": -286.70928955078125, "logps/rejected": -209.32269287109375, "loss": 0.6231, "rewards/accuracies": 0.75, "rewards/chosen": 0.3197060823440552, "rewards/margins": 0.21791549026966095, "rewards/rejected": 0.10179056972265244, "step": 3515 }, { "epoch": 0.5437463754107867, "grad_norm": 3.990569829940796, "learning_rate": 4.548631000114561e-06, "logits/chosen": 8.529427528381348, "logits/rejected": 5.0457916259765625, "logps/chosen": -187.49325561523438, "logps/rejected": -179.27442932128906, "loss": 0.4654, "rewards/accuracies": 0.75, "rewards/chosen": 0.33572715520858765, "rewards/margins": 0.691824734210968, "rewards/rejected": -0.35609766840934753, "step": 3516 }, { "epoch": 0.5439010245505509, "grad_norm": 6.907032012939453, "learning_rate": 4.548344598464888e-06, "logits/chosen": 3.0524420738220215, "logits/rejected": 1.3002849817276, "logps/chosen": -214.88133239746094, "logps/rejected": -282.6168518066406, "loss": 0.725, "rewards/accuracies": 0.375, "rewards/chosen": -0.06216317042708397, "rewards/margins": 0.07149878889322281, "rewards/rejected": -0.13366200029850006, "step": 3517 }, { "epoch": 0.544055673690315, "grad_norm": 4.797595977783203, "learning_rate": 4.548058196815213e-06, "logits/chosen": 13.109292030334473, "logits/rejected": 6.928149223327637, "logps/chosen": -227.20571899414062, "logps/rejected": -182.78598022460938, "loss": 0.578, "rewards/accuracies": 0.5, "rewards/chosen": 0.29015055298805237, "rewards/margins": 0.3859752416610718, "rewards/rejected": -0.09582467377185822, "step": 3518 }, { "epoch": 0.5442103228300793, "grad_norm": 4.863753318786621, "learning_rate": 4.54777179516554e-06, "logits/chosen": 8.01257610321045, "logits/rejected": 7.205863952636719, "logps/chosen": -240.56210327148438, "logps/rejected": -277.7075500488281, "loss": 0.5851, "rewards/accuracies": 0.625, "rewards/chosen": 0.4072505831718445, "rewards/margins": 0.41561296582221985, "rewards/rejected": -0.008362431079149246, "step": 3519 }, { "epoch": 0.5443649719698435, "grad_norm": 7.176610469818115, "learning_rate": 4.547485393515867e-06, "logits/chosen": 10.424981117248535, "logits/rejected": 11.230606079101562, "logps/chosen": -317.6285400390625, "logps/rejected": -343.45458984375, "loss": 0.8539, "rewards/accuracies": 0.375, "rewards/chosen": 0.4626142382621765, "rewards/margins": -0.2254081666469574, "rewards/rejected": 0.6880223751068115, "step": 3520 }, { "epoch": 0.5445196211096076, "grad_norm": 5.673208713531494, "learning_rate": 4.547198991866193e-06, "logits/chosen": 1.7676072120666504, "logits/rejected": -0.022732943296432495, "logps/chosen": -362.8194274902344, "logps/rejected": -242.46539306640625, "loss": 0.5426, "rewards/accuracies": 0.625, "rewards/chosen": 0.5794628858566284, "rewards/margins": 0.43827134370803833, "rewards/rejected": 0.14119155704975128, "step": 3521 }, { "epoch": 0.5446742702493718, "grad_norm": 7.997133255004883, "learning_rate": 4.54691259021652e-06, "logits/chosen": 14.438596725463867, "logits/rejected": 10.949785232543945, "logps/chosen": -309.0161437988281, "logps/rejected": -285.4857177734375, "loss": 0.8135, "rewards/accuracies": 0.375, "rewards/chosen": 0.14933447539806366, "rewards/margins": -0.05192427337169647, "rewards/rejected": 0.20125874876976013, "step": 3522 }, { "epoch": 0.5448289193891359, "grad_norm": 6.136359214782715, "learning_rate": 4.546626188566847e-06, "logits/chosen": 8.002218246459961, "logits/rejected": 10.703384399414062, "logps/chosen": -312.8731994628906, "logps/rejected": -309.1609191894531, "loss": 0.7467, "rewards/accuracies": 0.375, "rewards/chosen": 0.04328130558133125, "rewards/margins": -0.029910333454608917, "rewards/rejected": 0.07319164276123047, "step": 3523 }, { "epoch": 0.5449835685289001, "grad_norm": 4.540926933288574, "learning_rate": 4.546339786917173e-06, "logits/chosen": 13.019472122192383, "logits/rejected": 12.201904296875, "logps/chosen": -286.26007080078125, "logps/rejected": -230.09072875976562, "loss": 0.4987, "rewards/accuracies": 0.875, "rewards/chosen": 0.4957641661167145, "rewards/margins": 0.5642727613449097, "rewards/rejected": -0.06850862503051758, "step": 3524 }, { "epoch": 0.5451382176686642, "grad_norm": 6.054940700531006, "learning_rate": 4.546053385267499e-06, "logits/chosen": 8.482773780822754, "logits/rejected": -2.757509708404541, "logps/chosen": -411.403076171875, "logps/rejected": -225.9167022705078, "loss": 0.5334, "rewards/accuracies": 0.625, "rewards/chosen": 0.2993467450141907, "rewards/margins": 0.46557068824768066, "rewards/rejected": -0.1662239134311676, "step": 3525 }, { "epoch": 0.5452928668084284, "grad_norm": 5.451732158660889, "learning_rate": 4.545766983617826e-06, "logits/chosen": 4.632357120513916, "logits/rejected": 4.565715312957764, "logps/chosen": -461.7265625, "logps/rejected": -378.4767150878906, "loss": 0.5938, "rewards/accuracies": 0.625, "rewards/chosen": 0.06513690948486328, "rewards/margins": 0.4721967279911041, "rewards/rejected": -0.40705978870391846, "step": 3526 }, { "epoch": 0.5454475159481925, "grad_norm": 5.41460657119751, "learning_rate": 4.545480581968152e-06, "logits/chosen": 10.451261520385742, "logits/rejected": 5.0063958168029785, "logps/chosen": -195.10745239257812, "logps/rejected": -173.7678985595703, "loss": 0.7093, "rewards/accuracies": 0.5, "rewards/chosen": 0.4223962724208832, "rewards/margins": 0.025388240814208984, "rewards/rejected": 0.3970080316066742, "step": 3527 }, { "epoch": 0.5456021650879567, "grad_norm": 5.726624965667725, "learning_rate": 4.545194180318479e-06, "logits/chosen": 8.894241333007812, "logits/rejected": 9.959104537963867, "logps/chosen": -169.13803100585938, "logps/rejected": -195.6226806640625, "loss": 0.7416, "rewards/accuracies": 0.25, "rewards/chosen": -0.05895858630537987, "rewards/margins": -0.035721540451049805, "rewards/rejected": -0.023237038403749466, "step": 3528 }, { "epoch": 0.5457568142277208, "grad_norm": 5.137823104858398, "learning_rate": 4.544907778668806e-06, "logits/chosen": 14.346549034118652, "logits/rejected": 11.613884925842285, "logps/chosen": -341.1233215332031, "logps/rejected": -213.6710205078125, "loss": 0.6498, "rewards/accuracies": 0.375, "rewards/chosen": 0.48205041885375977, "rewards/margins": 0.1846107542514801, "rewards/rejected": 0.29743966460227966, "step": 3529 }, { "epoch": 0.545911463367485, "grad_norm": 6.021636962890625, "learning_rate": 4.544621377019132e-06, "logits/chosen": 13.294836044311523, "logits/rejected": 10.589038848876953, "logps/chosen": -316.819580078125, "logps/rejected": -299.698486328125, "loss": 0.6054, "rewards/accuracies": 0.5, "rewards/chosen": 0.12105311453342438, "rewards/margins": 0.24802693724632263, "rewards/rejected": -0.12697382271289825, "step": 3530 }, { "epoch": 0.5460661125072491, "grad_norm": 5.4508891105651855, "learning_rate": 4.544334975369458e-06, "logits/chosen": 10.107401847839355, "logits/rejected": 14.2821626663208, "logps/chosen": -188.77883911132812, "logps/rejected": -227.26724243164062, "loss": 0.6922, "rewards/accuracies": 0.625, "rewards/chosen": 0.32140201330184937, "rewards/margins": 0.12523728609085083, "rewards/rejected": 0.19616475701332092, "step": 3531 }, { "epoch": 0.5462207616470134, "grad_norm": 4.356362819671631, "learning_rate": 4.544048573719785e-06, "logits/chosen": 17.036211013793945, "logits/rejected": 9.378527641296387, "logps/chosen": -468.59832763671875, "logps/rejected": -339.0898742675781, "loss": 0.4228, "rewards/accuracies": 1.0, "rewards/chosen": 0.8874123096466064, "rewards/margins": 0.6814284324645996, "rewards/rejected": 0.20598383247852325, "step": 3532 }, { "epoch": 0.5463754107867775, "grad_norm": 4.040818214416504, "learning_rate": 4.5437621720701115e-06, "logits/chosen": 10.119794845581055, "logits/rejected": 2.0200061798095703, "logps/chosen": -395.4491882324219, "logps/rejected": -234.0541229248047, "loss": 0.4514, "rewards/accuracies": 0.875, "rewards/chosen": 0.42939889430999756, "rewards/margins": 0.6351261734962463, "rewards/rejected": -0.20572729408740997, "step": 3533 }, { "epoch": 0.5465300599265417, "grad_norm": 5.61366605758667, "learning_rate": 4.543475770420438e-06, "logits/chosen": 13.310968399047852, "logits/rejected": 16.62929344177246, "logps/chosen": -257.2803649902344, "logps/rejected": -280.9564208984375, "loss": 0.814, "rewards/accuracies": 0.25, "rewards/chosen": 0.169350728392601, "rewards/margins": -0.15942524373531342, "rewards/rejected": 0.32877594232559204, "step": 3534 }, { "epoch": 0.5466847090663058, "grad_norm": 4.956250190734863, "learning_rate": 4.543189368770765e-06, "logits/chosen": 10.099786758422852, "logits/rejected": 7.958563327789307, "logps/chosen": -211.59457397460938, "logps/rejected": -204.10702514648438, "loss": 0.5962, "rewards/accuracies": 0.625, "rewards/chosen": 0.3334071636199951, "rewards/margins": 0.3714045286178589, "rewards/rejected": -0.03799736499786377, "step": 3535 }, { "epoch": 0.54683935820607, "grad_norm": 4.367299556732178, "learning_rate": 4.5429029671210914e-06, "logits/chosen": 9.914778709411621, "logits/rejected": 2.0478150844573975, "logps/chosen": -205.86819458007812, "logps/rejected": -156.32708740234375, "loss": 0.6635, "rewards/accuracies": 0.5, "rewards/chosen": -0.11699000000953674, "rewards/margins": 0.11799439787864685, "rewards/rejected": -0.23498442769050598, "step": 3536 }, { "epoch": 0.5469940073458341, "grad_norm": 6.793894290924072, "learning_rate": 4.542616565471418e-06, "logits/chosen": 12.172344207763672, "logits/rejected": 12.992731094360352, "logps/chosen": -402.451904296875, "logps/rejected": -482.27203369140625, "loss": 0.7364, "rewards/accuracies": 0.375, "rewards/chosen": 0.6419467926025391, "rewards/margins": 0.10428544878959656, "rewards/rejected": 0.5376613736152649, "step": 3537 }, { "epoch": 0.5471486564855983, "grad_norm": 4.70053243637085, "learning_rate": 4.542330163821744e-06, "logits/chosen": 4.366672039031982, "logits/rejected": 4.480984687805176, "logps/chosen": -228.4396209716797, "logps/rejected": -214.60765075683594, "loss": 0.627, "rewards/accuracies": 0.5, "rewards/chosen": 0.3523925244808197, "rewards/margins": 0.24305859208106995, "rewards/rejected": 0.10933391749858856, "step": 3538 }, { "epoch": 0.5473033056253624, "grad_norm": 4.50254487991333, "learning_rate": 4.5420437621720705e-06, "logits/chosen": 10.565152168273926, "logits/rejected": 11.014066696166992, "logps/chosen": -222.46881103515625, "logps/rejected": -147.71978759765625, "loss": 0.6795, "rewards/accuracies": 0.375, "rewards/chosen": 0.2052536904811859, "rewards/margins": 0.08800911158323288, "rewards/rejected": 0.11724459379911423, "step": 3539 }, { "epoch": 0.5474579547651266, "grad_norm": 6.468800067901611, "learning_rate": 4.541757360522397e-06, "logits/chosen": 11.006531715393066, "logits/rejected": 7.352038383483887, "logps/chosen": -317.2406311035156, "logps/rejected": -351.90911865234375, "loss": 0.7672, "rewards/accuracies": 0.5, "rewards/chosen": 0.12836430966854095, "rewards/margins": 0.00013770908117294312, "rewards/rejected": 0.1282266080379486, "step": 3540 }, { "epoch": 0.5476126039048907, "grad_norm": 6.936290264129639, "learning_rate": 4.541470958872724e-06, "logits/chosen": 18.008426666259766, "logits/rejected": 8.986394882202148, "logps/chosen": -462.3390808105469, "logps/rejected": -255.6761932373047, "loss": 0.7355, "rewards/accuracies": 0.625, "rewards/chosen": -0.09092272818088531, "rewards/margins": 0.0928514152765274, "rewards/rejected": -0.1837741732597351, "step": 3541 }, { "epoch": 0.5477672530446549, "grad_norm": 5.683344841003418, "learning_rate": 4.5411845572230505e-06, "logits/chosen": 9.448622703552246, "logits/rejected": 5.785962104797363, "logps/chosen": -349.160400390625, "logps/rejected": -245.28756713867188, "loss": 0.5761, "rewards/accuracies": 0.5, "rewards/chosen": 0.1705780029296875, "rewards/margins": 0.36399269104003906, "rewards/rejected": -0.19341468811035156, "step": 3542 }, { "epoch": 0.547921902184419, "grad_norm": 5.563908100128174, "learning_rate": 4.540898155573376e-06, "logits/chosen": 11.009073257446289, "logits/rejected": 8.724419593811035, "logps/chosen": -205.60867309570312, "logps/rejected": -256.0479736328125, "loss": 0.6143, "rewards/accuracies": 0.5, "rewards/chosen": 0.5023806095123291, "rewards/margins": 0.2036203294992447, "rewards/rejected": 0.298760324716568, "step": 3543 }, { "epoch": 0.5480765513241833, "grad_norm": 4.960903644561768, "learning_rate": 4.540611753923703e-06, "logits/chosen": 5.844745635986328, "logits/rejected": 9.609888076782227, "logps/chosen": -207.8001251220703, "logps/rejected": -215.28321838378906, "loss": 0.6796, "rewards/accuracies": 0.5, "rewards/chosen": 0.2908862233161926, "rewards/margins": 0.07577388733625412, "rewards/rejected": 0.2151123583316803, "step": 3544 }, { "epoch": 0.5482312004639475, "grad_norm": 4.565761089324951, "learning_rate": 4.54032535227403e-06, "logits/chosen": 7.832420349121094, "logits/rejected": 3.3200812339782715, "logps/chosen": -198.38137817382812, "logps/rejected": -114.19021606445312, "loss": 0.6084, "rewards/accuracies": 0.75, "rewards/chosen": 0.2435060739517212, "rewards/margins": 0.22167813777923584, "rewards/rejected": 0.021827932447195053, "step": 3545 }, { "epoch": 0.5483858496037116, "grad_norm": 6.146039962768555, "learning_rate": 4.540038950624356e-06, "logits/chosen": 11.025949478149414, "logits/rejected": 6.19476318359375, "logps/chosen": -303.18719482421875, "logps/rejected": -210.86190795898438, "loss": 0.5568, "rewards/accuracies": 0.5, "rewards/chosen": 0.4069153666496277, "rewards/margins": 0.45468005537986755, "rewards/rejected": -0.04776472598314285, "step": 3546 }, { "epoch": 0.5485404987434758, "grad_norm": 5.0487589836120605, "learning_rate": 4.539752548974682e-06, "logits/chosen": 8.809881210327148, "logits/rejected": 9.234543800354004, "logps/chosen": -149.8020782470703, "logps/rejected": -146.05694580078125, "loss": 0.6717, "rewards/accuracies": 0.75, "rewards/chosen": 0.13258127868175507, "rewards/margins": 0.0861751139163971, "rewards/rejected": 0.04640618711709976, "step": 3547 }, { "epoch": 0.5486951478832399, "grad_norm": 5.862454414367676, "learning_rate": 4.539466147325009e-06, "logits/chosen": 5.978017330169678, "logits/rejected": 10.074748992919922, "logps/chosen": -264.7501220703125, "logps/rejected": -322.0801696777344, "loss": 0.5903, "rewards/accuracies": 0.625, "rewards/chosen": 0.4706363379955292, "rewards/margins": 0.24110819399356842, "rewards/rejected": 0.22952814400196075, "step": 3548 }, { "epoch": 0.5488497970230041, "grad_norm": 5.216935634613037, "learning_rate": 4.539179745675335e-06, "logits/chosen": 8.29134750366211, "logits/rejected": 5.364598274230957, "logps/chosen": -273.58294677734375, "logps/rejected": -217.62774658203125, "loss": 0.6012, "rewards/accuracies": 0.625, "rewards/chosen": 0.22625470161437988, "rewards/margins": 0.24434253573417664, "rewards/rejected": -0.018087834119796753, "step": 3549 }, { "epoch": 0.5490044461627682, "grad_norm": 5.841380596160889, "learning_rate": 4.538893344025662e-06, "logits/chosen": 8.870448112487793, "logits/rejected": 4.638943672180176, "logps/chosen": -340.1754150390625, "logps/rejected": -250.580078125, "loss": 0.7778, "rewards/accuracies": 0.5, "rewards/chosen": 0.25906795263290405, "rewards/margins": 0.3286592960357666, "rewards/rejected": -0.06959133595228195, "step": 3550 }, { "epoch": 0.5491590953025324, "grad_norm": 5.385190486907959, "learning_rate": 4.538606942375988e-06, "logits/chosen": 9.568578720092773, "logits/rejected": 6.89943265914917, "logps/chosen": -472.45697021484375, "logps/rejected": -391.3062438964844, "loss": 0.5963, "rewards/accuracies": 0.625, "rewards/chosen": 0.13642922043800354, "rewards/margins": 0.3578115403652191, "rewards/rejected": -0.22138234972953796, "step": 3551 }, { "epoch": 0.5493137444422965, "grad_norm": 6.736863613128662, "learning_rate": 4.5383205407263144e-06, "logits/chosen": 13.685302734375, "logits/rejected": 8.468063354492188, "logps/chosen": -301.55517578125, "logps/rejected": -283.5740966796875, "loss": 0.6054, "rewards/accuracies": 0.5, "rewards/chosen": 0.44643211364746094, "rewards/margins": 0.33363571763038635, "rewards/rejected": 0.11279641091823578, "step": 3552 }, { "epoch": 0.5494683935820607, "grad_norm": 5.982054710388184, "learning_rate": 4.538034139076641e-06, "logits/chosen": 12.448792457580566, "logits/rejected": 5.300612449645996, "logps/chosen": -274.3370056152344, "logps/rejected": -214.73365783691406, "loss": 0.7172, "rewards/accuracies": 0.75, "rewards/chosen": 0.18710699677467346, "rewards/margins": 0.20708554983139038, "rewards/rejected": -0.019978567957878113, "step": 3553 }, { "epoch": 0.5496230427218248, "grad_norm": 5.329343318939209, "learning_rate": 4.537747737426968e-06, "logits/chosen": 10.182079315185547, "logits/rejected": 9.306894302368164, "logps/chosen": -270.68115234375, "logps/rejected": -267.1793518066406, "loss": 0.7493, "rewards/accuracies": 0.375, "rewards/chosen": 0.05081549286842346, "rewards/margins": -0.07559870183467865, "rewards/rejected": 0.1264142096042633, "step": 3554 }, { "epoch": 0.549777691861589, "grad_norm": 5.044135093688965, "learning_rate": 4.537461335777294e-06, "logits/chosen": 10.9108304977417, "logits/rejected": 5.656590461730957, "logps/chosen": -337.8362731933594, "logps/rejected": -237.78717041015625, "loss": 0.5749, "rewards/accuracies": 0.625, "rewards/chosen": 0.4850406348705292, "rewards/margins": 0.43785780668258667, "rewards/rejected": 0.047182850539684296, "step": 3555 }, { "epoch": 0.5499323410013531, "grad_norm": 6.545263767242432, "learning_rate": 4.537174934127621e-06, "logits/chosen": 8.910379409790039, "logits/rejected": 6.235647678375244, "logps/chosen": -317.0166931152344, "logps/rejected": -209.79644775390625, "loss": 0.5564, "rewards/accuracies": 0.75, "rewards/chosen": 0.3248693645000458, "rewards/margins": 0.5088315606117249, "rewards/rejected": -0.18396218121051788, "step": 3556 }, { "epoch": 0.5500869901411174, "grad_norm": 5.0078511238098145, "learning_rate": 4.536888532477948e-06, "logits/chosen": 9.847518920898438, "logits/rejected": 9.366765022277832, "logps/chosen": -186.71817016601562, "logps/rejected": -182.91854858398438, "loss": 0.7238, "rewards/accuracies": 0.5, "rewards/chosen": 0.2906917631626129, "rewards/margins": 0.03638588637113571, "rewards/rejected": 0.2543058395385742, "step": 3557 }, { "epoch": 0.5502416392808815, "grad_norm": 4.022653579711914, "learning_rate": 4.5366021308282735e-06, "logits/chosen": 9.77560043334961, "logits/rejected": 3.795381546020508, "logps/chosen": -224.27218627929688, "logps/rejected": -170.6032257080078, "loss": 0.5359, "rewards/accuracies": 0.75, "rewards/chosen": 0.23060496151447296, "rewards/margins": 0.4511311948299408, "rewards/rejected": -0.22052627801895142, "step": 3558 }, { "epoch": 0.5503962884206457, "grad_norm": 4.373462677001953, "learning_rate": 4.5363157291786e-06, "logits/chosen": 5.43952751159668, "logits/rejected": 10.963338851928711, "logps/chosen": -177.14576721191406, "logps/rejected": -186.11505126953125, "loss": 0.6374, "rewards/accuracies": 0.375, "rewards/chosen": 0.22932806611061096, "rewards/margins": 0.22654861211776733, "rewards/rejected": 0.002779439091682434, "step": 3559 }, { "epoch": 0.5505509375604098, "grad_norm": 4.65736198425293, "learning_rate": 4.536029327528927e-06, "logits/chosen": 13.079739570617676, "logits/rejected": 11.66657543182373, "logps/chosen": -245.99560546875, "logps/rejected": -264.7964172363281, "loss": 0.7288, "rewards/accuracies": 0.5, "rewards/chosen": 0.510442316532135, "rewards/margins": -0.004677586257457733, "rewards/rejected": 0.5151199102401733, "step": 3560 }, { "epoch": 0.550705586700174, "grad_norm": 6.700300216674805, "learning_rate": 4.5357429258792535e-06, "logits/chosen": 15.049016952514648, "logits/rejected": 5.899397373199463, "logps/chosen": -390.96246337890625, "logps/rejected": -267.9217529296875, "loss": 0.7615, "rewards/accuracies": 0.375, "rewards/chosen": 0.0939340591430664, "rewards/margins": 0.03125503659248352, "rewards/rejected": 0.06267901510000229, "step": 3561 }, { "epoch": 0.5508602358399382, "grad_norm": 7.729654788970947, "learning_rate": 4.53545652422958e-06, "logits/chosen": 10.23967170715332, "logits/rejected": 5.977878093719482, "logps/chosen": -274.3505859375, "logps/rejected": -233.908447265625, "loss": 0.6385, "rewards/accuracies": 0.625, "rewards/chosen": 0.33057501912117004, "rewards/margins": 0.17289915680885315, "rewards/rejected": 0.1576758474111557, "step": 3562 }, { "epoch": 0.5510148849797023, "grad_norm": 4.259594917297363, "learning_rate": 4.535170122579907e-06, "logits/chosen": 10.6661958694458, "logits/rejected": 6.657013893127441, "logps/chosen": -286.9015197753906, "logps/rejected": -219.30804443359375, "loss": 0.5235, "rewards/accuracies": 0.875, "rewards/chosen": 0.5922577977180481, "rewards/margins": 0.5397415161132812, "rewards/rejected": 0.05251626670360565, "step": 3563 }, { "epoch": 0.5511695341194665, "grad_norm": 5.772739887237549, "learning_rate": 4.5348837209302326e-06, "logits/chosen": 7.150918006896973, "logits/rejected": 7.458500862121582, "logps/chosen": -305.84930419921875, "logps/rejected": -265.9232177734375, "loss": 0.82, "rewards/accuracies": 0.375, "rewards/chosen": 0.4643403887748718, "rewards/margins": -0.15803226828575134, "rewards/rejected": 0.6223726272583008, "step": 3564 }, { "epoch": 0.5513241832592306, "grad_norm": 7.2503533363342285, "learning_rate": 4.534597319280559e-06, "logits/chosen": 8.620634078979492, "logits/rejected": 7.966855525970459, "logps/chosen": -300.65728759765625, "logps/rejected": -342.868896484375, "loss": 0.8209, "rewards/accuracies": 0.5, "rewards/chosen": 0.5630488991737366, "rewards/margins": -0.13228988647460938, "rewards/rejected": 0.6953388452529907, "step": 3565 }, { "epoch": 0.5514788323989948, "grad_norm": 6.189802169799805, "learning_rate": 4.534310917630886e-06, "logits/chosen": 11.349376678466797, "logits/rejected": 8.790705680847168, "logps/chosen": -284.88287353515625, "logps/rejected": -212.41690063476562, "loss": 0.7765, "rewards/accuracies": 0.375, "rewards/chosen": -0.09331922978162766, "rewards/margins": -0.11698141694068909, "rewards/rejected": 0.023662179708480835, "step": 3566 }, { "epoch": 0.5516334815387589, "grad_norm": 4.301194667816162, "learning_rate": 4.5340245159812125e-06, "logits/chosen": 7.0621490478515625, "logits/rejected": 3.6898441314697266, "logps/chosen": -211.5169677734375, "logps/rejected": -179.8331298828125, "loss": 0.5851, "rewards/accuracies": 0.5, "rewards/chosen": 0.3927188217639923, "rewards/margins": 0.5453318357467651, "rewards/rejected": -0.15261292457580566, "step": 3567 }, { "epoch": 0.5517881306785231, "grad_norm": 5.55835485458374, "learning_rate": 4.533738114331539e-06, "logits/chosen": 11.471624374389648, "logits/rejected": 6.332268714904785, "logps/chosen": -344.49310302734375, "logps/rejected": -234.843994140625, "loss": 0.6997, "rewards/accuracies": 0.625, "rewards/chosen": 0.3348548114299774, "rewards/margins": 0.1306462287902832, "rewards/rejected": 0.20420856773853302, "step": 3568 }, { "epoch": 0.5519427798182872, "grad_norm": 6.9027934074401855, "learning_rate": 4.533451712681866e-06, "logits/chosen": 7.23335075378418, "logits/rejected": 9.39720630645752, "logps/chosen": -204.245849609375, "logps/rejected": -231.79544067382812, "loss": 0.9175, "rewards/accuracies": 0.5, "rewards/chosen": 0.14404296875, "rewards/margins": -0.33375269174575806, "rewards/rejected": 0.47779566049575806, "step": 3569 }, { "epoch": 0.5520974289580515, "grad_norm": 5.868442058563232, "learning_rate": 4.5331653110321925e-06, "logits/chosen": 9.503335952758789, "logits/rejected": 7.132210731506348, "logps/chosen": -198.42047119140625, "logps/rejected": -197.94317626953125, "loss": 0.743, "rewards/accuracies": 0.375, "rewards/chosen": 0.20590871572494507, "rewards/margins": -0.024768264964222908, "rewards/rejected": 0.23067699372768402, "step": 3570 }, { "epoch": 0.5522520780978156, "grad_norm": 5.358765602111816, "learning_rate": 4.532878909382518e-06, "logits/chosen": 12.876893043518066, "logits/rejected": 3.510669708251953, "logps/chosen": -281.7961730957031, "logps/rejected": -158.5064239501953, "loss": 0.8352, "rewards/accuracies": 0.375, "rewards/chosen": 0.0024895519018173218, "rewards/margins": -0.0842953622341156, "rewards/rejected": 0.08678492903709412, "step": 3571 }, { "epoch": 0.5524067272375798, "grad_norm": 3.619243621826172, "learning_rate": 4.532592507732845e-06, "logits/chosen": 10.332534790039062, "logits/rejected": 3.8928916454315186, "logps/chosen": -287.9915771484375, "logps/rejected": -245.3173370361328, "loss": 0.4249, "rewards/accuracies": 0.875, "rewards/chosen": 0.5293671488761902, "rewards/margins": 0.6992554664611816, "rewards/rejected": -0.16988831758499146, "step": 3572 }, { "epoch": 0.5525613763773439, "grad_norm": 6.228548049926758, "learning_rate": 4.5323061060831716e-06, "logits/chosen": 11.99790096282959, "logits/rejected": 9.32993221282959, "logps/chosen": -303.4930419921875, "logps/rejected": -248.70272827148438, "loss": 0.6333, "rewards/accuracies": 0.5, "rewards/chosen": 0.3519423007965088, "rewards/margins": 0.16439954936504364, "rewards/rejected": 0.18754275143146515, "step": 3573 }, { "epoch": 0.5527160255171081, "grad_norm": 9.161906242370605, "learning_rate": 4.532019704433498e-06, "logits/chosen": 7.7228217124938965, "logits/rejected": 13.913342475891113, "logps/chosen": -316.5376892089844, "logps/rejected": -406.7513122558594, "loss": 0.9234, "rewards/accuracies": 0.25, "rewards/chosen": 0.0004051625728607178, "rewards/margins": -0.39028510451316833, "rewards/rejected": 0.3906902074813843, "step": 3574 }, { "epoch": 0.5528706746568722, "grad_norm": 6.7055206298828125, "learning_rate": 4.531733302783825e-06, "logits/chosen": 5.911735534667969, "logits/rejected": 3.2133917808532715, "logps/chosen": -342.8312683105469, "logps/rejected": -359.0065612792969, "loss": 0.6271, "rewards/accuracies": 0.625, "rewards/chosen": 0.5394797325134277, "rewards/margins": 0.29227784276008606, "rewards/rejected": 0.24720191955566406, "step": 3575 }, { "epoch": 0.5530253237966364, "grad_norm": 5.960477828979492, "learning_rate": 4.531446901134151e-06, "logits/chosen": 7.805113792419434, "logits/rejected": 8.001245498657227, "logps/chosen": -354.50689697265625, "logps/rejected": -356.4140625, "loss": 0.7935, "rewards/accuracies": 0.625, "rewards/chosen": 0.3726765513420105, "rewards/margins": 0.042413897812366486, "rewards/rejected": 0.3302626311779022, "step": 3576 }, { "epoch": 0.5531799729364005, "grad_norm": 4.549402236938477, "learning_rate": 4.531160499484477e-06, "logits/chosen": 10.263496398925781, "logits/rejected": 4.837553024291992, "logps/chosen": -230.8402099609375, "logps/rejected": -279.2123718261719, "loss": 0.5779, "rewards/accuracies": 0.875, "rewards/chosen": 0.24746140837669373, "rewards/margins": 0.4961482882499695, "rewards/rejected": -0.24868685007095337, "step": 3577 }, { "epoch": 0.5533346220761647, "grad_norm": 5.337644100189209, "learning_rate": 4.530874097834804e-06, "logits/chosen": 9.329675674438477, "logits/rejected": 13.068574905395508, "logps/chosen": -269.3693542480469, "logps/rejected": -286.4462890625, "loss": 0.7257, "rewards/accuracies": 0.625, "rewards/chosen": 0.5886191129684448, "rewards/margins": 0.03895539790391922, "rewards/rejected": 0.5496636629104614, "step": 3578 }, { "epoch": 0.5534892712159288, "grad_norm": 7.192526340484619, "learning_rate": 4.530587696185131e-06, "logits/chosen": 10.102540969848633, "logits/rejected": 12.618480682373047, "logps/chosen": -332.9478759765625, "logps/rejected": -242.19119262695312, "loss": 0.8486, "rewards/accuracies": 0.5, "rewards/chosen": 0.028556670993566513, "rewards/margins": -0.20887719094753265, "rewards/rejected": 0.23743388056755066, "step": 3579 }, { "epoch": 0.553643920355693, "grad_norm": 5.215231895446777, "learning_rate": 4.530301294535457e-06, "logits/chosen": 10.68503475189209, "logits/rejected": 8.497568130493164, "logps/chosen": -320.5820617675781, "logps/rejected": -262.18023681640625, "loss": 0.6074, "rewards/accuracies": 0.625, "rewards/chosen": 0.4127378463745117, "rewards/margins": 0.23450195789337158, "rewards/rejected": 0.17823591828346252, "step": 3580 }, { "epoch": 0.5537985694954571, "grad_norm": 3.838759183883667, "learning_rate": 4.530014892885783e-06, "logits/chosen": 10.553598403930664, "logits/rejected": 11.064107894897461, "logps/chosen": -194.6639862060547, "logps/rejected": -200.62429809570312, "loss": 0.547, "rewards/accuracies": 0.75, "rewards/chosen": 0.4793577194213867, "rewards/margins": 0.38866546750068665, "rewards/rejected": 0.09069228172302246, "step": 3581 }, { "epoch": 0.5539532186352213, "grad_norm": 6.029939651489258, "learning_rate": 4.52972849123611e-06, "logits/chosen": 14.453691482543945, "logits/rejected": 5.123515605926514, "logps/chosen": -468.627197265625, "logps/rejected": -308.79254150390625, "loss": 0.632, "rewards/accuracies": 0.625, "rewards/chosen": 0.48506975173950195, "rewards/margins": 0.23682114481925964, "rewards/rejected": 0.2482486069202423, "step": 3582 }, { "epoch": 0.5541078677749856, "grad_norm": 5.329648494720459, "learning_rate": 4.529442089586436e-06, "logits/chosen": 11.393535614013672, "logits/rejected": 11.046777725219727, "logps/chosen": -254.82723999023438, "logps/rejected": -268.787841796875, "loss": 0.7297, "rewards/accuracies": 0.5, "rewards/chosen": 0.22061532735824585, "rewards/margins": 0.025781691074371338, "rewards/rejected": 0.1948336660861969, "step": 3583 }, { "epoch": 0.5542625169147497, "grad_norm": 4.4506916999816895, "learning_rate": 4.529155687936763e-06, "logits/chosen": 6.265275001525879, "logits/rejected": -0.013617873191833496, "logps/chosen": -244.4647216796875, "logps/rejected": -158.00363159179688, "loss": 0.6085, "rewards/accuracies": 0.75, "rewards/chosen": 0.3447190821170807, "rewards/margins": 0.25595828890800476, "rewards/rejected": 0.08876079320907593, "step": 3584 }, { "epoch": 0.5544171660545139, "grad_norm": 10.360529899597168, "learning_rate": 4.528869286287089e-06, "logits/chosen": 8.019856452941895, "logits/rejected": 7.219570159912109, "logps/chosen": -395.52459716796875, "logps/rejected": -199.68572998046875, "loss": 0.8237, "rewards/accuracies": 0.5, "rewards/chosen": -0.061166711151599884, "rewards/margins": -0.0594443753361702, "rewards/rejected": -0.0017223283648490906, "step": 3585 }, { "epoch": 0.554571815194278, "grad_norm": 6.593823432922363, "learning_rate": 4.5285828846374155e-06, "logits/chosen": 7.215122222900391, "logits/rejected": 6.8266401290893555, "logps/chosen": -277.1084899902344, "logps/rejected": -263.850830078125, "loss": 0.7578, "rewards/accuracies": 0.375, "rewards/chosen": 0.3335644602775574, "rewards/margins": 0.0619109645485878, "rewards/rejected": 0.27165350317955017, "step": 3586 }, { "epoch": 0.5547264643340422, "grad_norm": 4.168130874633789, "learning_rate": 4.528296482987742e-06, "logits/chosen": 10.273550033569336, "logits/rejected": 15.078262329101562, "logps/chosen": -153.53765869140625, "logps/rejected": -222.0395965576172, "loss": 0.6152, "rewards/accuracies": 0.5, "rewards/chosen": 0.37599116563796997, "rewards/margins": 0.2333216369152069, "rewards/rejected": 0.14266952872276306, "step": 3587 }, { "epoch": 0.5548811134738063, "grad_norm": 4.672872066497803, "learning_rate": 4.528010081338069e-06, "logits/chosen": 7.191455841064453, "logits/rejected": 8.818643569946289, "logps/chosen": -228.7047576904297, "logps/rejected": -263.59332275390625, "loss": 0.6474, "rewards/accuracies": 0.625, "rewards/chosen": 0.5539558529853821, "rewards/margins": 0.22437722980976105, "rewards/rejected": 0.3295786380767822, "step": 3588 }, { "epoch": 0.5550357626135705, "grad_norm": 5.20742130279541, "learning_rate": 4.5277236796883954e-06, "logits/chosen": 7.266532897949219, "logits/rejected": 9.920047760009766, "logps/chosen": -283.619384765625, "logps/rejected": -306.19854736328125, "loss": 0.6844, "rewards/accuracies": 0.75, "rewards/chosen": 0.6154531836509705, "rewards/margins": 0.044618964195251465, "rewards/rejected": 0.5708341598510742, "step": 3589 }, { "epoch": 0.5551904117533346, "grad_norm": 9.111827850341797, "learning_rate": 4.527437278038722e-06, "logits/chosen": 4.200196743011475, "logits/rejected": 2.7336552143096924, "logps/chosen": -334.99664306640625, "logps/rejected": -377.4158935546875, "loss": 0.7059, "rewards/accuracies": 0.625, "rewards/chosen": 0.2466956079006195, "rewards/margins": -0.009928415529429913, "rewards/rejected": 0.2566240429878235, "step": 3590 }, { "epoch": 0.5553450608930988, "grad_norm": 7.970217227935791, "learning_rate": 4.527150876389048e-06, "logits/chosen": 7.0902862548828125, "logits/rejected": 7.37230920791626, "logps/chosen": -284.6959228515625, "logps/rejected": -310.8411560058594, "loss": 0.7812, "rewards/accuracies": 0.375, "rewards/chosen": 0.5816724300384521, "rewards/margins": -0.1375368982553482, "rewards/rejected": 0.7192093133926392, "step": 3591 }, { "epoch": 0.5554997100328629, "grad_norm": 6.084896087646484, "learning_rate": 4.5268644747393745e-06, "logits/chosen": 6.012429237365723, "logits/rejected": 1.8172115087509155, "logps/chosen": -296.15081787109375, "logps/rejected": -187.25193786621094, "loss": 0.6238, "rewards/accuracies": 0.5, "rewards/chosen": 0.053958505392074585, "rewards/margins": 0.22143949568271637, "rewards/rejected": -0.16748099029064178, "step": 3592 }, { "epoch": 0.5556543591726271, "grad_norm": 6.385085105895996, "learning_rate": 4.526578073089701e-06, "logits/chosen": 15.055275917053223, "logits/rejected": 5.783827304840088, "logps/chosen": -301.31109619140625, "logps/rejected": -221.2132110595703, "loss": 0.781, "rewards/accuracies": 0.375, "rewards/chosen": -0.07564354687929153, "rewards/margins": -0.13373184204101562, "rewards/rejected": 0.058088287711143494, "step": 3593 }, { "epoch": 0.5558090083123912, "grad_norm": 6.9723405838012695, "learning_rate": 4.526291671440028e-06, "logits/chosen": 13.371246337890625, "logits/rejected": 8.344164848327637, "logps/chosen": -263.8916015625, "logps/rejected": -230.72943115234375, "loss": 0.6329, "rewards/accuracies": 0.625, "rewards/chosen": 0.40703660249710083, "rewards/margins": 0.18481919169425964, "rewards/rejected": 0.2222174108028412, "step": 3594 }, { "epoch": 0.5559636574521554, "grad_norm": 11.827320098876953, "learning_rate": 4.5260052697903545e-06, "logits/chosen": 9.371259689331055, "logits/rejected": 10.057371139526367, "logps/chosen": -281.1583557128906, "logps/rejected": -296.6534423828125, "loss": 0.6414, "rewards/accuracies": 0.625, "rewards/chosen": 0.16524186730384827, "rewards/margins": 0.13216614723205566, "rewards/rejected": 0.03307570889592171, "step": 3595 }, { "epoch": 0.5561183065919196, "grad_norm": 7.136361122131348, "learning_rate": 4.525718868140681e-06, "logits/chosen": 6.639684677124023, "logits/rejected": 6.538175582885742, "logps/chosen": -306.7061462402344, "logps/rejected": -168.71986389160156, "loss": 0.7479, "rewards/accuracies": 0.375, "rewards/chosen": 0.1392790824174881, "rewards/margins": -0.004366010427474976, "rewards/rejected": 0.14364507794380188, "step": 3596 }, { "epoch": 0.5562729557316838, "grad_norm": 4.886064529418945, "learning_rate": 4.525432466491007e-06, "logits/chosen": 10.801265716552734, "logits/rejected": 6.915102005004883, "logps/chosen": -213.23863220214844, "logps/rejected": -254.48770141601562, "loss": 0.5227, "rewards/accuracies": 0.75, "rewards/chosen": 0.2671562731266022, "rewards/margins": 0.47924545407295227, "rewards/rejected": -0.2120891511440277, "step": 3597 }, { "epoch": 0.5564276048714479, "grad_norm": 4.799747943878174, "learning_rate": 4.525146064841334e-06, "logits/chosen": 10.222060203552246, "logits/rejected": 9.786624908447266, "logps/chosen": -212.83509826660156, "logps/rejected": -195.61679077148438, "loss": 0.6008, "rewards/accuracies": 0.75, "rewards/chosen": 0.39520156383514404, "rewards/margins": 0.260658860206604, "rewards/rejected": 0.13454273343086243, "step": 3598 }, { "epoch": 0.5565822540112121, "grad_norm": 5.969414710998535, "learning_rate": 4.52485966319166e-06, "logits/chosen": 9.955301284790039, "logits/rejected": 7.979180335998535, "logps/chosen": -231.13043212890625, "logps/rejected": -270.25738525390625, "loss": 0.6987, "rewards/accuracies": 0.5, "rewards/chosen": 0.5927839875221252, "rewards/margins": 0.16440770030021667, "rewards/rejected": 0.42837631702423096, "step": 3599 }, { "epoch": 0.5567369031509762, "grad_norm": 4.681183338165283, "learning_rate": 4.524573261541987e-06, "logits/chosen": 15.351505279541016, "logits/rejected": 11.054553031921387, "logps/chosen": -201.1328125, "logps/rejected": -217.37356567382812, "loss": 0.6229, "rewards/accuracies": 0.5, "rewards/chosen": 0.29159650206565857, "rewards/margins": 0.1804816722869873, "rewards/rejected": 0.11111484467983246, "step": 3600 }, { "epoch": 0.5568915522907404, "grad_norm": 4.315676212310791, "learning_rate": 4.5242868598923135e-06, "logits/chosen": 9.01484203338623, "logits/rejected": -0.9540919065475464, "logps/chosen": -343.8393249511719, "logps/rejected": -234.0284423828125, "loss": 0.4608, "rewards/accuracies": 0.875, "rewards/chosen": 0.4085361659526825, "rewards/margins": 0.7119013667106628, "rewards/rejected": -0.30336523056030273, "step": 3601 }, { "epoch": 0.5570462014305045, "grad_norm": 5.52816104888916, "learning_rate": 4.52400045824264e-06, "logits/chosen": 9.709121704101562, "logits/rejected": 8.241652488708496, "logps/chosen": -223.72882080078125, "logps/rejected": -208.41815185546875, "loss": 0.665, "rewards/accuracies": 0.625, "rewards/chosen": 0.1758045107126236, "rewards/margins": 0.1667385697364807, "rewards/rejected": 0.009065959602594376, "step": 3602 }, { "epoch": 0.5572008505702687, "grad_norm": 5.233506202697754, "learning_rate": 4.523714056592967e-06, "logits/chosen": 11.593863487243652, "logits/rejected": 13.646553039550781, "logps/chosen": -252.87530517578125, "logps/rejected": -259.9027099609375, "loss": 0.78, "rewards/accuracies": 0.375, "rewards/chosen": 0.12750694155693054, "rewards/margins": -0.04338666796684265, "rewards/rejected": 0.1708935797214508, "step": 3603 }, { "epoch": 0.5573554997100328, "grad_norm": 7.003399848937988, "learning_rate": 4.523427654943293e-06, "logits/chosen": 11.303274154663086, "logits/rejected": 5.539395809173584, "logps/chosen": -299.13519287109375, "logps/rejected": -204.26319885253906, "loss": 0.6439, "rewards/accuracies": 0.5, "rewards/chosen": 0.5186868906021118, "rewards/margins": 0.17752408981323242, "rewards/rejected": 0.3411628007888794, "step": 3604 }, { "epoch": 0.557510148849797, "grad_norm": 4.704915523529053, "learning_rate": 4.523141253293619e-06, "logits/chosen": 2.3328161239624023, "logits/rejected": 4.889397621154785, "logps/chosen": -146.39877319335938, "logps/rejected": -522.343505859375, "loss": 0.6458, "rewards/accuracies": 0.75, "rewards/chosen": 0.10141687095165253, "rewards/margins": 0.20595534145832062, "rewards/rejected": -0.1045384630560875, "step": 3605 }, { "epoch": 0.5576647979895611, "grad_norm": 4.940998077392578, "learning_rate": 4.522854851643946e-06, "logits/chosen": 7.311234951019287, "logits/rejected": 4.039869785308838, "logps/chosen": -196.97576904296875, "logps/rejected": -171.3727569580078, "loss": 0.6224, "rewards/accuracies": 0.625, "rewards/chosen": 0.4244900345802307, "rewards/margins": 0.2368585616350174, "rewards/rejected": 0.18763147294521332, "step": 3606 }, { "epoch": 0.5578194471293253, "grad_norm": 4.063277721405029, "learning_rate": 4.522568449994273e-06, "logits/chosen": 6.424857139587402, "logits/rejected": 7.713414192199707, "logps/chosen": -227.69915771484375, "logps/rejected": -219.16546630859375, "loss": 0.6333, "rewards/accuracies": 0.75, "rewards/chosen": 0.4822864532470703, "rewards/margins": 0.16694718599319458, "rewards/rejected": 0.3153392970561981, "step": 3607 }, { "epoch": 0.5579740962690894, "grad_norm": 5.726160049438477, "learning_rate": 4.522282048344599e-06, "logits/chosen": 10.90269947052002, "logits/rejected": 11.126384735107422, "logps/chosen": -262.9569396972656, "logps/rejected": -233.80538940429688, "loss": 0.7248, "rewards/accuracies": 0.5, "rewards/chosen": 0.22630012035369873, "rewards/margins": 0.009243473410606384, "rewards/rejected": 0.21705663204193115, "step": 3608 }, { "epoch": 0.5581287454088537, "grad_norm": 3.4471685886383057, "learning_rate": 4.521995646694926e-06, "logits/chosen": 8.60572624206543, "logits/rejected": 7.920344352722168, "logps/chosen": -218.6759490966797, "logps/rejected": -242.67666625976562, "loss": 0.4743, "rewards/accuracies": 0.875, "rewards/chosen": 0.33369648456573486, "rewards/margins": 0.5977742671966553, "rewards/rejected": -0.26407772302627563, "step": 3609 }, { "epoch": 0.5582833945486179, "grad_norm": 4.6529035568237305, "learning_rate": 4.521709245045252e-06, "logits/chosen": 6.490020751953125, "logits/rejected": 7.3491363525390625, "logps/chosen": -249.46868896484375, "logps/rejected": -216.26907348632812, "loss": 0.6209, "rewards/accuracies": 0.625, "rewards/chosen": 0.38138917088508606, "rewards/margins": 0.22886231541633606, "rewards/rejected": 0.1525268852710724, "step": 3610 }, { "epoch": 0.558438043688382, "grad_norm": 4.666914939880371, "learning_rate": 4.521422843395578e-06, "logits/chosen": 9.001693725585938, "logits/rejected": 8.91424560546875, "logps/chosen": -247.14523315429688, "logps/rejected": -252.96237182617188, "loss": 0.6423, "rewards/accuracies": 0.5, "rewards/chosen": 0.5848187208175659, "rewards/margins": 0.1357874721288681, "rewards/rejected": 0.449031263589859, "step": 3611 }, { "epoch": 0.5585926928281462, "grad_norm": 4.410085678100586, "learning_rate": 4.521136441745905e-06, "logits/chosen": 17.82365608215332, "logits/rejected": 10.565075874328613, "logps/chosen": -268.4102478027344, "logps/rejected": -239.2488555908203, "loss": 0.5114, "rewards/accuracies": 0.75, "rewards/chosen": 0.558750331401825, "rewards/margins": 0.5254983901977539, "rewards/rejected": 0.03325194865465164, "step": 3612 }, { "epoch": 0.5587473419679103, "grad_norm": 6.610159873962402, "learning_rate": 4.520850040096232e-06, "logits/chosen": 6.7402729988098145, "logits/rejected": 9.701754570007324, "logps/chosen": -284.0154724121094, "logps/rejected": -301.650146484375, "loss": 0.7617, "rewards/accuracies": 0.5, "rewards/chosen": 0.6410777568817139, "rewards/margins": -0.09951437264680862, "rewards/rejected": 0.7405920624732971, "step": 3613 }, { "epoch": 0.5589019911076745, "grad_norm": 5.578883647918701, "learning_rate": 4.520563638446558e-06, "logits/chosen": 10.940439224243164, "logits/rejected": 8.70705509185791, "logps/chosen": -386.8883972167969, "logps/rejected": -329.6268005371094, "loss": 0.6354, "rewards/accuracies": 0.625, "rewards/chosen": 0.732818603515625, "rewards/margins": 0.32858431339263916, "rewards/rejected": 0.4042343497276306, "step": 3614 }, { "epoch": 0.5590566402474386, "grad_norm": 6.58540153503418, "learning_rate": 4.520277236796884e-06, "logits/chosen": 6.423508644104004, "logits/rejected": 6.327029228210449, "logps/chosen": -315.8009338378906, "logps/rejected": -228.36264038085938, "loss": 0.6112, "rewards/accuracies": 0.5, "rewards/chosen": 0.4469373822212219, "rewards/margins": 0.24720098078250885, "rewards/rejected": 0.19973641633987427, "step": 3615 }, { "epoch": 0.5592112893872028, "grad_norm": 4.744726181030273, "learning_rate": 4.519990835147211e-06, "logits/chosen": 10.787200927734375, "logits/rejected": 8.305334091186523, "logps/chosen": -277.614990234375, "logps/rejected": -241.25241088867188, "loss": 0.5334, "rewards/accuracies": 0.625, "rewards/chosen": 0.7808390855789185, "rewards/margins": 0.43273472785949707, "rewards/rejected": 0.34810441732406616, "step": 3616 }, { "epoch": 0.5593659385269669, "grad_norm": 6.494253158569336, "learning_rate": 4.519704433497537e-06, "logits/chosen": 7.153563499450684, "logits/rejected": 5.177081108093262, "logps/chosen": -277.0997314453125, "logps/rejected": -269.0425109863281, "loss": 0.8098, "rewards/accuracies": 0.375, "rewards/chosen": 0.148020938038826, "rewards/margins": -0.07553607225418091, "rewards/rejected": 0.2235570251941681, "step": 3617 }, { "epoch": 0.5595205876667311, "grad_norm": 4.230822563171387, "learning_rate": 4.519418031847864e-06, "logits/chosen": 9.969825744628906, "logits/rejected": 7.020700454711914, "logps/chosen": -137.01666259765625, "logps/rejected": -109.45060729980469, "loss": 0.6858, "rewards/accuracies": 0.375, "rewards/chosen": 0.21498818695545197, "rewards/margins": 0.0501728281378746, "rewards/rejected": 0.16481536626815796, "step": 3618 }, { "epoch": 0.5596752368064952, "grad_norm": 5.804405689239502, "learning_rate": 4.51913163019819e-06, "logits/chosen": 8.10032844543457, "logits/rejected": 4.302483558654785, "logps/chosen": -312.70654296875, "logps/rejected": -288.6571960449219, "loss": 0.6034, "rewards/accuracies": 0.625, "rewards/chosen": 0.34845641255378723, "rewards/margins": 0.225398451089859, "rewards/rejected": 0.1230580061674118, "step": 3619 }, { "epoch": 0.5598298859462594, "grad_norm": 5.436731338500977, "learning_rate": 4.5188452285485165e-06, "logits/chosen": 10.971719741821289, "logits/rejected": 13.33697509765625, "logps/chosen": -262.65435791015625, "logps/rejected": -269.7942810058594, "loss": 0.6509, "rewards/accuracies": 0.5, "rewards/chosen": 0.2128930389881134, "rewards/margins": 0.13772663474082947, "rewards/rejected": 0.07516643404960632, "step": 3620 }, { "epoch": 0.5599845350860236, "grad_norm": 5.076542854309082, "learning_rate": 4.518558826898843e-06, "logits/chosen": 8.224015235900879, "logits/rejected": -0.49716126918792725, "logps/chosen": -274.8866271972656, "logps/rejected": -173.148193359375, "loss": 0.532, "rewards/accuracies": 0.875, "rewards/chosen": 0.7412557601928711, "rewards/margins": 0.4346083402633667, "rewards/rejected": 0.306647390127182, "step": 3621 }, { "epoch": 0.5601391842257878, "grad_norm": 4.226314067840576, "learning_rate": 4.51827242524917e-06, "logits/chosen": 10.973672866821289, "logits/rejected": 6.423691749572754, "logps/chosen": -265.5919494628906, "logps/rejected": -201.13607788085938, "loss": 0.6009, "rewards/accuracies": 0.625, "rewards/chosen": 0.5679236650466919, "rewards/margins": 0.2588083744049072, "rewards/rejected": 0.30911532044410706, "step": 3622 }, { "epoch": 0.5602938333655519, "grad_norm": 4.895264625549316, "learning_rate": 4.5179860235994965e-06, "logits/chosen": 7.835058212280273, "logits/rejected": 7.824412822723389, "logps/chosen": -212.9092559814453, "logps/rejected": -162.3740234375, "loss": 0.5371, "rewards/accuracies": 0.75, "rewards/chosen": 0.39376819133758545, "rewards/margins": 0.4903585910797119, "rewards/rejected": -0.09659042954444885, "step": 3623 }, { "epoch": 0.5604484825053161, "grad_norm": 4.777890682220459, "learning_rate": 4.517699621949822e-06, "logits/chosen": 7.037005424499512, "logits/rejected": 7.80773401260376, "logps/chosen": -224.48651123046875, "logps/rejected": -231.4244384765625, "loss": 0.6012, "rewards/accuracies": 0.75, "rewards/chosen": 0.6564013957977295, "rewards/margins": 0.272576242685318, "rewards/rejected": 0.3838251829147339, "step": 3624 }, { "epoch": 0.5606031316450802, "grad_norm": 5.514266014099121, "learning_rate": 4.517413220300149e-06, "logits/chosen": 12.807656288146973, "logits/rejected": 8.926528930664062, "logps/chosen": -251.1391143798828, "logps/rejected": -348.05987548828125, "loss": 0.7072, "rewards/accuracies": 0.625, "rewards/chosen": 0.35980215668678284, "rewards/margins": 0.11906873434782028, "rewards/rejected": 0.24073341488838196, "step": 3625 }, { "epoch": 0.5607577807848444, "grad_norm": 5.114682197570801, "learning_rate": 4.5171268186504756e-06, "logits/chosen": 8.046839714050293, "logits/rejected": 10.96063232421875, "logps/chosen": -160.58099365234375, "logps/rejected": -195.5984344482422, "loss": 0.7372, "rewards/accuracies": 0.625, "rewards/chosen": 0.23748008906841278, "rewards/margins": -0.05442379415035248, "rewards/rejected": 0.29190391302108765, "step": 3626 }, { "epoch": 0.5609124299246085, "grad_norm": 4.064693927764893, "learning_rate": 4.516840417000802e-06, "logits/chosen": 12.972407341003418, "logits/rejected": 6.20378303527832, "logps/chosen": -302.0226745605469, "logps/rejected": -201.37860107421875, "loss": 0.4759, "rewards/accuracies": 0.875, "rewards/chosen": 0.6497853994369507, "rewards/margins": 0.6856660842895508, "rewards/rejected": -0.03588065505027771, "step": 3627 }, { "epoch": 0.5610670790643727, "grad_norm": 7.956607818603516, "learning_rate": 4.516554015351129e-06, "logits/chosen": 10.036197662353516, "logits/rejected": 13.230546951293945, "logps/chosen": -274.1005554199219, "logps/rejected": -272.1798095703125, "loss": 1.0076, "rewards/accuracies": 0.25, "rewards/chosen": 0.15438240766525269, "rewards/margins": -0.43600937724113464, "rewards/rejected": 0.5903917551040649, "step": 3628 }, { "epoch": 0.5612217282041368, "grad_norm": 5.724521160125732, "learning_rate": 4.5162676137014555e-06, "logits/chosen": 1.750023365020752, "logits/rejected": 9.721931457519531, "logps/chosen": -196.28253173828125, "logps/rejected": -282.239990234375, "loss": 0.7141, "rewards/accuracies": 0.5, "rewards/chosen": 0.10155295580625534, "rewards/margins": 0.0940236896276474, "rewards/rejected": 0.007529273629188538, "step": 3629 }, { "epoch": 0.561376377343901, "grad_norm": 3.527271032333374, "learning_rate": 4.515981212051781e-06, "logits/chosen": 8.080997467041016, "logits/rejected": 2.065378427505493, "logps/chosen": -159.24478149414062, "logps/rejected": -119.29331970214844, "loss": 0.4922, "rewards/accuracies": 0.875, "rewards/chosen": 0.28644341230392456, "rewards/margins": 0.5212591886520386, "rewards/rejected": -0.23481574654579163, "step": 3630 }, { "epoch": 0.5615310264836652, "grad_norm": 7.018500804901123, "learning_rate": 4.515694810402108e-06, "logits/chosen": 4.208637237548828, "logits/rejected": 6.619284629821777, "logps/chosen": -167.72523498535156, "logps/rejected": -224.14328002929688, "loss": 0.8243, "rewards/accuracies": 0.625, "rewards/chosen": 0.2563505172729492, "rewards/margins": -0.03866724669933319, "rewards/rejected": 0.2950177490711212, "step": 3631 }, { "epoch": 0.5616856756234293, "grad_norm": 6.5749921798706055, "learning_rate": 4.515408408752435e-06, "logits/chosen": 7.861728668212891, "logits/rejected": 9.753066062927246, "logps/chosen": -260.8916015625, "logps/rejected": -310.81903076171875, "loss": 0.7772, "rewards/accuracies": 0.375, "rewards/chosen": -0.0019121654331684113, "rewards/margins": -0.13817378878593445, "rewards/rejected": 0.13626162707805634, "step": 3632 }, { "epoch": 0.5618403247631935, "grad_norm": 6.300232887268066, "learning_rate": 4.515122007102761e-06, "logits/chosen": 8.915915489196777, "logits/rejected": 6.9757256507873535, "logps/chosen": -273.7812194824219, "logps/rejected": -286.6151123046875, "loss": 0.5806, "rewards/accuracies": 0.625, "rewards/chosen": 0.6126001477241516, "rewards/margins": 0.35391557216644287, "rewards/rejected": 0.25868457555770874, "step": 3633 }, { "epoch": 0.5619949739029577, "grad_norm": 4.627342224121094, "learning_rate": 4.514835605453088e-06, "logits/chosen": 7.098930835723877, "logits/rejected": 4.131325721740723, "logps/chosen": -203.69952392578125, "logps/rejected": -164.17886352539062, "loss": 0.6091, "rewards/accuracies": 0.625, "rewards/chosen": 0.397869735956192, "rewards/margins": 0.2247088998556137, "rewards/rejected": 0.1731608510017395, "step": 3634 }, { "epoch": 0.5621496230427219, "grad_norm": 5.479161262512207, "learning_rate": 4.5145492038034146e-06, "logits/chosen": 4.6706085205078125, "logits/rejected": -0.017857074737548828, "logps/chosen": -271.0173645019531, "logps/rejected": -209.44134521484375, "loss": 0.6623, "rewards/accuracies": 0.625, "rewards/chosen": 0.07213973999023438, "rewards/margins": 0.20365524291992188, "rewards/rejected": -0.1315155327320099, "step": 3635 }, { "epoch": 0.562304272182486, "grad_norm": 7.688310146331787, "learning_rate": 4.514262802153741e-06, "logits/chosen": 6.278905868530273, "logits/rejected": 10.441362380981445, "logps/chosen": -303.8002624511719, "logps/rejected": -330.2528076171875, "loss": 0.7186, "rewards/accuracies": 0.5, "rewards/chosen": 0.2380618155002594, "rewards/margins": 0.03998413681983948, "rewards/rejected": 0.19807769358158112, "step": 3636 }, { "epoch": 0.5624589213222502, "grad_norm": 4.849839687347412, "learning_rate": 4.513976400504067e-06, "logits/chosen": 9.330001831054688, "logits/rejected": 13.519051551818848, "logps/chosen": -224.05209350585938, "logps/rejected": -274.3986511230469, "loss": 0.6609, "rewards/accuracies": 0.625, "rewards/chosen": 0.5663037300109863, "rewards/margins": 0.10518389195203781, "rewards/rejected": 0.46111980080604553, "step": 3637 }, { "epoch": 0.5626135704620143, "grad_norm": 6.879826545715332, "learning_rate": 4.513689998854394e-06, "logits/chosen": 5.863480567932129, "logits/rejected": -0.09810042381286621, "logps/chosen": -297.7219543457031, "logps/rejected": -219.74148559570312, "loss": 0.7047, "rewards/accuracies": 0.375, "rewards/chosen": -0.030804306268692017, "rewards/margins": 0.16858971118927002, "rewards/rejected": -0.19939398765563965, "step": 3638 }, { "epoch": 0.5627682196017785, "grad_norm": 114.09884643554688, "learning_rate": 4.51340359720472e-06, "logits/chosen": 11.261488914489746, "logits/rejected": 3.533715009689331, "logps/chosen": -315.0606689453125, "logps/rejected": -260.05902099609375, "loss": 0.5768, "rewards/accuracies": 0.75, "rewards/chosen": 0.4755340814590454, "rewards/margins": 0.44022253155708313, "rewards/rejected": 0.035311512649059296, "step": 3639 }, { "epoch": 0.5629228687415426, "grad_norm": 5.230422496795654, "learning_rate": 4.513117195555047e-06, "logits/chosen": 6.309638977050781, "logits/rejected": 5.827183723449707, "logps/chosen": -210.89346313476562, "logps/rejected": -244.02325439453125, "loss": 0.7449, "rewards/accuracies": 0.125, "rewards/chosen": 0.16464069485664368, "rewards/margins": -0.017087697982788086, "rewards/rejected": 0.18172839283943176, "step": 3640 }, { "epoch": 0.5630775178813068, "grad_norm": 5.134578704833984, "learning_rate": 4.512830793905374e-06, "logits/chosen": 9.236634254455566, "logits/rejected": 5.6499176025390625, "logps/chosen": -358.2993469238281, "logps/rejected": -399.5333251953125, "loss": 0.3958, "rewards/accuracies": 0.875, "rewards/chosen": 0.8248772621154785, "rewards/margins": 0.8987542390823364, "rewards/rejected": -0.07387696206569672, "step": 3641 }, { "epoch": 0.5632321670210709, "grad_norm": 4.882595062255859, "learning_rate": 4.5125443922557e-06, "logits/chosen": 9.226993560791016, "logits/rejected": 6.979156494140625, "logps/chosen": -232.09095764160156, "logps/rejected": -288.50079345703125, "loss": 0.5659, "rewards/accuracies": 0.75, "rewards/chosen": 0.5040963292121887, "rewards/margins": 0.32436180114746094, "rewards/rejected": 0.17973452806472778, "step": 3642 }, { "epoch": 0.5633868161608351, "grad_norm": 3.5573363304138184, "learning_rate": 4.512257990606026e-06, "logits/chosen": 10.841207504272461, "logits/rejected": 15.784500122070312, "logps/chosen": -153.48143005371094, "logps/rejected": -255.61265563964844, "loss": 0.5567, "rewards/accuracies": 0.875, "rewards/chosen": 0.10322020202875137, "rewards/margins": 0.3135751485824585, "rewards/rejected": -0.21035495400428772, "step": 3643 }, { "epoch": 0.5635414653005992, "grad_norm": 6.520923614501953, "learning_rate": 4.511971588956353e-06, "logits/chosen": 9.688759803771973, "logits/rejected": 13.476829528808594, "logps/chosen": -420.51678466796875, "logps/rejected": -450.56268310546875, "loss": 0.6223, "rewards/accuracies": 0.5, "rewards/chosen": 0.3638840317726135, "rewards/margins": 0.24585817754268646, "rewards/rejected": 0.11802583932876587, "step": 3644 }, { "epoch": 0.5636961144403634, "grad_norm": 5.898277282714844, "learning_rate": 4.511685187306679e-06, "logits/chosen": 9.18428897857666, "logits/rejected": 9.48148250579834, "logps/chosen": -250.00482177734375, "logps/rejected": -328.3102111816406, "loss": 0.5957, "rewards/accuracies": 0.75, "rewards/chosen": 0.44468259811401367, "rewards/margins": 0.25656577944755554, "rewards/rejected": 0.18811683356761932, "step": 3645 }, { "epoch": 0.5638507635801275, "grad_norm": 5.621179580688477, "learning_rate": 4.511398785657006e-06, "logits/chosen": 13.359064102172852, "logits/rejected": 11.708257675170898, "logps/chosen": -248.6935577392578, "logps/rejected": -213.87918090820312, "loss": 0.6613, "rewards/accuracies": 0.5, "rewards/chosen": 9.238719940185547e-07, "rewards/margins": 0.10682410001754761, "rewards/rejected": -0.10682317614555359, "step": 3646 }, { "epoch": 0.5640054127198918, "grad_norm": 3.9496428966522217, "learning_rate": 4.511112384007333e-06, "logits/chosen": 9.742288589477539, "logits/rejected": 5.747826099395752, "logps/chosen": -325.0531005859375, "logps/rejected": -275.94976806640625, "loss": 0.4777, "rewards/accuracies": 0.75, "rewards/chosen": 0.5693107843399048, "rewards/margins": 0.6642358899116516, "rewards/rejected": -0.09492513537406921, "step": 3647 }, { "epoch": 0.564160061859656, "grad_norm": 77.00012969970703, "learning_rate": 4.5108259823576585e-06, "logits/chosen": 13.071802139282227, "logits/rejected": 3.804473638534546, "logps/chosen": -258.6274719238281, "logps/rejected": -201.11111450195312, "loss": 0.7043, "rewards/accuracies": 0.5, "rewards/chosen": 0.25364238023757935, "rewards/margins": 0.08735001087188721, "rewards/rejected": 0.16629236936569214, "step": 3648 }, { "epoch": 0.5643147109994201, "grad_norm": 4.047515869140625, "learning_rate": 4.510539580707985e-06, "logits/chosen": 10.43231201171875, "logits/rejected": -1.2250852584838867, "logps/chosen": -310.70562744140625, "logps/rejected": -169.94357299804688, "loss": 0.6077, "rewards/accuracies": 0.5, "rewards/chosen": 0.36159229278564453, "rewards/margins": 0.3272063136100769, "rewards/rejected": 0.034385960549116135, "step": 3649 }, { "epoch": 0.5644693601391843, "grad_norm": 4.396068572998047, "learning_rate": 4.510253179058312e-06, "logits/chosen": 10.8219633102417, "logits/rejected": 3.9990551471710205, "logps/chosen": -328.985595703125, "logps/rejected": -262.834716796875, "loss": 0.5517, "rewards/accuracies": 0.75, "rewards/chosen": 0.12695759534835815, "rewards/margins": 0.372251957654953, "rewards/rejected": -0.24529439210891724, "step": 3650 }, { "epoch": 0.5646240092789484, "grad_norm": 5.448184013366699, "learning_rate": 4.5099667774086384e-06, "logits/chosen": 13.655328750610352, "logits/rejected": 9.747859954833984, "logps/chosen": -267.9127197265625, "logps/rejected": -266.034423828125, "loss": 0.5706, "rewards/accuracies": 0.75, "rewards/chosen": 0.2778496742248535, "rewards/margins": 0.2799261212348938, "rewards/rejected": -0.0020764395594596863, "step": 3651 }, { "epoch": 0.5647786584187126, "grad_norm": 5.679709434509277, "learning_rate": 4.509680375758965e-06, "logits/chosen": 6.006450653076172, "logits/rejected": 8.318628311157227, "logps/chosen": -225.3775177001953, "logps/rejected": -298.083251953125, "loss": 0.73, "rewards/accuracies": 0.375, "rewards/chosen": 0.18423233926296234, "rewards/margins": 0.009407687932252884, "rewards/rejected": 0.17482465505599976, "step": 3652 }, { "epoch": 0.5649333075584767, "grad_norm": 7.878204345703125, "learning_rate": 4.509393974109291e-06, "logits/chosen": 7.103664875030518, "logits/rejected": 10.92629337310791, "logps/chosen": -316.171875, "logps/rejected": -322.41424560546875, "loss": 1.0135, "rewards/accuracies": 0.25, "rewards/chosen": -0.08354025334119797, "rewards/margins": -0.4855995774269104, "rewards/rejected": 0.40205928683280945, "step": 3653 }, { "epoch": 0.5650879566982409, "grad_norm": 5.877285003662109, "learning_rate": 4.5091075724596175e-06, "logits/chosen": 10.732038497924805, "logits/rejected": 11.95211124420166, "logps/chosen": -181.72096252441406, "logps/rejected": -182.55142211914062, "loss": 0.7745, "rewards/accuracies": 0.375, "rewards/chosen": 0.28420814871788025, "rewards/margins": -0.08466291427612305, "rewards/rejected": 0.3688710629940033, "step": 3654 }, { "epoch": 0.565242605838005, "grad_norm": 7.787291526794434, "learning_rate": 4.508821170809944e-06, "logits/chosen": 6.113892555236816, "logits/rejected": 12.226123809814453, "logps/chosen": -300.123779296875, "logps/rejected": -379.57769775390625, "loss": 0.7193, "rewards/accuracies": 0.5, "rewards/chosen": 0.0034510791301727295, "rewards/margins": 0.10299862921237946, "rewards/rejected": -0.09954756498336792, "step": 3655 }, { "epoch": 0.5653972549777692, "grad_norm": 5.9066338539123535, "learning_rate": 4.508534769160271e-06, "logits/chosen": 9.238149642944336, "logits/rejected": 10.223678588867188, "logps/chosen": -236.20460510253906, "logps/rejected": -213.0561065673828, "loss": 0.7917, "rewards/accuracies": 0.5, "rewards/chosen": -0.044362232089042664, "rewards/margins": -0.10226662456989288, "rewards/rejected": 0.05790438503026962, "step": 3656 }, { "epoch": 0.5655519041175333, "grad_norm": 5.332758903503418, "learning_rate": 4.508248367510597e-06, "logits/chosen": 11.049283981323242, "logits/rejected": 12.803634643554688, "logps/chosen": -213.97698974609375, "logps/rejected": -205.62265014648438, "loss": 0.708, "rewards/accuracies": 0.625, "rewards/chosen": -0.314159095287323, "rewards/margins": 0.15524911880493164, "rewards/rejected": -0.46940815448760986, "step": 3657 }, { "epoch": 0.5657065532572975, "grad_norm": 6.386929988861084, "learning_rate": 4.507961965860923e-06, "logits/chosen": 10.231005668640137, "logits/rejected": 6.274954319000244, "logps/chosen": -313.561279296875, "logps/rejected": -308.9860534667969, "loss": 0.5213, "rewards/accuracies": 0.75, "rewards/chosen": 0.38761091232299805, "rewards/margins": 0.5744739770889282, "rewards/rejected": -0.18686306476593018, "step": 3658 }, { "epoch": 0.5658612023970616, "grad_norm": 3.730976104736328, "learning_rate": 4.50767556421125e-06, "logits/chosen": 10.138751029968262, "logits/rejected": 3.912522315979004, "logps/chosen": -263.2839050292969, "logps/rejected": -158.0906982421875, "loss": 0.511, "rewards/accuracies": 0.625, "rewards/chosen": 0.5830817222595215, "rewards/margins": 0.5625765323638916, "rewards/rejected": 0.02050519734621048, "step": 3659 }, { "epoch": 0.5660158515368259, "grad_norm": 4.402044296264648, "learning_rate": 4.507389162561577e-06, "logits/chosen": 12.287494659423828, "logits/rejected": 5.61386251449585, "logps/chosen": -301.85284423828125, "logps/rejected": -204.0203399658203, "loss": 0.498, "rewards/accuracies": 1.0, "rewards/chosen": 0.529687762260437, "rewards/margins": 0.47713372111320496, "rewards/rejected": 0.05255403369665146, "step": 3660 }, { "epoch": 0.56617050067659, "grad_norm": 4.895749092102051, "learning_rate": 4.507102760911903e-06, "logits/chosen": 11.684027671813965, "logits/rejected": 8.538240432739258, "logps/chosen": -303.5574645996094, "logps/rejected": -269.60223388671875, "loss": 0.6207, "rewards/accuracies": 0.5, "rewards/chosen": 0.312182754278183, "rewards/margins": 0.3468983471393585, "rewards/rejected": -0.03471565619111061, "step": 3661 }, { "epoch": 0.5663251498163542, "grad_norm": 3.913245916366577, "learning_rate": 4.50681635926223e-06, "logits/chosen": 15.79096794128418, "logits/rejected": 11.60367488861084, "logps/chosen": -249.99383544921875, "logps/rejected": -209.7312774658203, "loss": 0.5796, "rewards/accuracies": 0.75, "rewards/chosen": 0.3750917613506317, "rewards/margins": 0.2654780447483063, "rewards/rejected": 0.10961371660232544, "step": 3662 }, { "epoch": 0.5664797989561183, "grad_norm": 7.531073093414307, "learning_rate": 4.506529957612556e-06, "logits/chosen": 10.545195579528809, "logits/rejected": 11.463794708251953, "logps/chosen": -319.2952880859375, "logps/rejected": -325.82080078125, "loss": 0.6509, "rewards/accuracies": 0.625, "rewards/chosen": 0.09842488169670105, "rewards/margins": 0.31463348865509033, "rewards/rejected": -0.21620863676071167, "step": 3663 }, { "epoch": 0.5666344480958825, "grad_norm": 6.598790168762207, "learning_rate": 4.506243555962882e-06, "logits/chosen": 14.826682090759277, "logits/rejected": 5.902122974395752, "logps/chosen": -429.685302734375, "logps/rejected": -260.0694274902344, "loss": 0.7827, "rewards/accuracies": 0.75, "rewards/chosen": 0.5122130513191223, "rewards/margins": 0.07252617180347443, "rewards/rejected": 0.4396868646144867, "step": 3664 }, { "epoch": 0.5667890972356466, "grad_norm": 4.575944423675537, "learning_rate": 4.505957154313209e-06, "logits/chosen": 9.64094066619873, "logits/rejected": 4.752739906311035, "logps/chosen": -235.779052734375, "logps/rejected": -134.34237670898438, "loss": 0.6774, "rewards/accuracies": 0.625, "rewards/chosen": -0.22035714983940125, "rewards/margins": 0.13793662190437317, "rewards/rejected": -0.358293741941452, "step": 3665 }, { "epoch": 0.5669437463754108, "grad_norm": 6.269856929779053, "learning_rate": 4.505670752663536e-06, "logits/chosen": 12.617897033691406, "logits/rejected": 7.922951698303223, "logps/chosen": -353.73980712890625, "logps/rejected": -256.1517639160156, "loss": 0.6759, "rewards/accuracies": 0.375, "rewards/chosen": 0.2883417010307312, "rewards/margins": 0.1448616087436676, "rewards/rejected": 0.1434801071882248, "step": 3666 }, { "epoch": 0.5670983955151749, "grad_norm": 5.02537727355957, "learning_rate": 4.505384351013862e-06, "logits/chosen": 7.874388694763184, "logits/rejected": 5.053269863128662, "logps/chosen": -277.9246826171875, "logps/rejected": -257.0106201171875, "loss": 0.5354, "rewards/accuracies": 0.875, "rewards/chosen": 0.017989888787269592, "rewards/margins": 0.5474467277526855, "rewards/rejected": -0.5294568538665771, "step": 3667 }, { "epoch": 0.5672530446549391, "grad_norm": 5.3897600173950195, "learning_rate": 4.505097949364189e-06, "logits/chosen": 12.264787673950195, "logits/rejected": 11.02160930633545, "logps/chosen": -306.97894287109375, "logps/rejected": -288.6179504394531, "loss": 0.5599, "rewards/accuracies": 0.875, "rewards/chosen": 0.2851245403289795, "rewards/margins": 0.4912259876728058, "rewards/rejected": -0.2061014324426651, "step": 3668 }, { "epoch": 0.5674076937947032, "grad_norm": 4.734246253967285, "learning_rate": 4.504811547714516e-06, "logits/chosen": 9.199647903442383, "logits/rejected": 10.080564498901367, "logps/chosen": -251.60171508789062, "logps/rejected": -185.6214141845703, "loss": 0.5663, "rewards/accuracies": 0.625, "rewards/chosen": 0.41670435667037964, "rewards/margins": 0.30135297775268555, "rewards/rejected": 0.11535138636827469, "step": 3669 }, { "epoch": 0.5675623429344674, "grad_norm": 6.553821086883545, "learning_rate": 4.504525146064841e-06, "logits/chosen": 14.044720649719238, "logits/rejected": 11.622954368591309, "logps/chosen": -286.94451904296875, "logps/rejected": -244.75901794433594, "loss": 0.8628, "rewards/accuracies": 0.25, "rewards/chosen": -0.05636577308177948, "rewards/margins": -0.28418415784835815, "rewards/rejected": 0.22781839966773987, "step": 3670 }, { "epoch": 0.5677169920742315, "grad_norm": 5.8462324142456055, "learning_rate": 4.504238744415168e-06, "logits/chosen": 9.776199340820312, "logits/rejected": 10.036972045898438, "logps/chosen": -334.365234375, "logps/rejected": -349.728271484375, "loss": 0.6829, "rewards/accuracies": 0.5, "rewards/chosen": 0.0413224995136261, "rewards/margins": 0.14678648114204407, "rewards/rejected": -0.10546398162841797, "step": 3671 }, { "epoch": 0.5678716412139957, "grad_norm": 10.066862106323242, "learning_rate": 4.503952342765495e-06, "logits/chosen": 7.677642345428467, "logits/rejected": 11.483621597290039, "logps/chosen": -226.54116821289062, "logps/rejected": -322.24114990234375, "loss": 0.6928, "rewards/accuracies": 0.5, "rewards/chosen": 0.3108657896518707, "rewards/margins": 0.03601866587996483, "rewards/rejected": 0.274847149848938, "step": 3672 }, { "epoch": 0.56802629035376, "grad_norm": 7.763453483581543, "learning_rate": 4.503665941115821e-06, "logits/chosen": 13.709482192993164, "logits/rejected": 8.103269577026367, "logps/chosen": -344.9154052734375, "logps/rejected": -211.20916748046875, "loss": 0.5759, "rewards/accuracies": 0.625, "rewards/chosen": 0.1412796974182129, "rewards/margins": 0.3192577362060547, "rewards/rejected": -0.17797806859016418, "step": 3673 }, { "epoch": 0.5681809394935241, "grad_norm": 3.7702691555023193, "learning_rate": 4.503379539466148e-06, "logits/chosen": 17.089126586914062, "logits/rejected": 5.692880630493164, "logps/chosen": -310.4942932128906, "logps/rejected": -203.1164093017578, "loss": 0.4442, "rewards/accuracies": 1.0, "rewards/chosen": 0.14049378037452698, "rewards/margins": 0.6498605608940125, "rewards/rejected": -0.5093667507171631, "step": 3674 }, { "epoch": 0.5683355886332883, "grad_norm": 5.43742561340332, "learning_rate": 4.503093137816475e-06, "logits/chosen": 8.659968376159668, "logits/rejected": 5.5286173820495605, "logps/chosen": -380.5871276855469, "logps/rejected": -300.734130859375, "loss": 0.5399, "rewards/accuracies": 0.875, "rewards/chosen": 0.26291656494140625, "rewards/margins": 0.40566912293434143, "rewards/rejected": -0.14275255799293518, "step": 3675 }, { "epoch": 0.5684902377730524, "grad_norm": 5.515951633453369, "learning_rate": 4.5028067361668005e-06, "logits/chosen": 7.380363941192627, "logits/rejected": 3.4265217781066895, "logps/chosen": -269.34716796875, "logps/rejected": -227.3361053466797, "loss": 0.4686, "rewards/accuracies": 1.0, "rewards/chosen": 0.6138440370559692, "rewards/margins": 0.5778607726097107, "rewards/rejected": 0.03598332405090332, "step": 3676 }, { "epoch": 0.5686448869128166, "grad_norm": 5.3107194900512695, "learning_rate": 4.502520334517127e-06, "logits/chosen": 9.512046813964844, "logits/rejected": 8.318483352661133, "logps/chosen": -272.41619873046875, "logps/rejected": -260.7023620605469, "loss": 0.5269, "rewards/accuracies": 0.875, "rewards/chosen": 0.3114250898361206, "rewards/margins": 0.5542429685592651, "rewards/rejected": -0.24281789362430573, "step": 3677 }, { "epoch": 0.5687995360525807, "grad_norm": 5.999351978302002, "learning_rate": 4.502233932867454e-06, "logits/chosen": 14.127909660339355, "logits/rejected": 10.931772232055664, "logps/chosen": -305.1776123046875, "logps/rejected": -198.8520965576172, "loss": 0.8066, "rewards/accuracies": 0.25, "rewards/chosen": 0.0014509186148643494, "rewards/margins": -0.17532047629356384, "rewards/rejected": 0.1767714023590088, "step": 3678 }, { "epoch": 0.5689541851923449, "grad_norm": 6.649197578430176, "learning_rate": 4.50194753121778e-06, "logits/chosen": 12.58023738861084, "logits/rejected": 9.021209716796875, "logps/chosen": -320.255859375, "logps/rejected": -254.71054077148438, "loss": 0.7938, "rewards/accuracies": 0.25, "rewards/chosen": -0.031194686889648438, "rewards/margins": -0.037896350026130676, "rewards/rejected": 0.006701678037643433, "step": 3679 }, { "epoch": 0.569108834332109, "grad_norm": 8.822691917419434, "learning_rate": 4.501661129568107e-06, "logits/chosen": 8.005599975585938, "logits/rejected": 6.035144805908203, "logps/chosen": -513.3710327148438, "logps/rejected": -327.2975158691406, "loss": 0.7097, "rewards/accuracies": 0.625, "rewards/chosen": 0.27144578099250793, "rewards/margins": 0.23865589499473572, "rewards/rejected": 0.03278990089893341, "step": 3680 }, { "epoch": 0.5692634834718732, "grad_norm": 10.953230857849121, "learning_rate": 4.501374727918434e-06, "logits/chosen": 7.964801788330078, "logits/rejected": 3.9077465534210205, "logps/chosen": -429.30487060546875, "logps/rejected": -366.0997314453125, "loss": 0.604, "rewards/accuracies": 0.625, "rewards/chosen": 0.04000203311443329, "rewards/margins": 0.43759608268737793, "rewards/rejected": -0.39759403467178345, "step": 3681 }, { "epoch": 0.5694181326116373, "grad_norm": 4.495512008666992, "learning_rate": 4.5010883262687595e-06, "logits/chosen": 3.5331974029541016, "logits/rejected": 5.198853015899658, "logps/chosen": -180.3953857421875, "logps/rejected": -199.6255340576172, "loss": 0.6876, "rewards/accuracies": 0.375, "rewards/chosen": -0.03182120621204376, "rewards/margins": 0.17631599307060242, "rewards/rejected": -0.20813718438148499, "step": 3682 }, { "epoch": 0.5695727817514015, "grad_norm": 4.0957417488098145, "learning_rate": 4.500801924619086e-06, "logits/chosen": 8.53170108795166, "logits/rejected": 3.757833480834961, "logps/chosen": -201.50238037109375, "logps/rejected": -131.14205932617188, "loss": 0.5042, "rewards/accuracies": 0.75, "rewards/chosen": 0.22799883782863617, "rewards/margins": 0.6147207021713257, "rewards/rejected": -0.3867218494415283, "step": 3683 }, { "epoch": 0.5697274308911656, "grad_norm": 4.614190578460693, "learning_rate": 4.500515522969413e-06, "logits/chosen": 7.984928131103516, "logits/rejected": 5.929247856140137, "logps/chosen": -220.29620361328125, "logps/rejected": -210.05551147460938, "loss": 0.6095, "rewards/accuracies": 0.625, "rewards/chosen": 0.29516980051994324, "rewards/margins": 0.19861462712287903, "rewards/rejected": 0.0965551882982254, "step": 3684 }, { "epoch": 0.5698820800309299, "grad_norm": 4.490009784698486, "learning_rate": 4.5002291213197395e-06, "logits/chosen": 8.425851821899414, "logits/rejected": 6.92668342590332, "logps/chosen": -161.73577880859375, "logps/rejected": -160.24754333496094, "loss": 0.6626, "rewards/accuracies": 0.625, "rewards/chosen": 0.21719884872436523, "rewards/margins": 0.11057616025209427, "rewards/rejected": 0.10662268847227097, "step": 3685 }, { "epoch": 0.570036729170694, "grad_norm": 34.64583206176758, "learning_rate": 4.499942719670065e-06, "logits/chosen": 9.462136268615723, "logits/rejected": 7.680774688720703, "logps/chosen": -155.32489013671875, "logps/rejected": -124.76506805419922, "loss": 0.5926, "rewards/accuracies": 0.75, "rewards/chosen": 0.33925849199295044, "rewards/margins": 0.24680644273757935, "rewards/rejected": 0.0924520492553711, "step": 3686 }, { "epoch": 0.5701913783104582, "grad_norm": 11.269735336303711, "learning_rate": 4.499656318020392e-06, "logits/chosen": 8.73222541809082, "logits/rejected": 9.909281730651855, "logps/chosen": -261.3561706542969, "logps/rejected": -340.4133605957031, "loss": 0.6472, "rewards/accuracies": 0.625, "rewards/chosen": -0.06953859329223633, "rewards/margins": 0.1912437528371811, "rewards/rejected": -0.2607823312282562, "step": 3687 }, { "epoch": 0.5703460274502223, "grad_norm": 7.320501327514648, "learning_rate": 4.4993699163707186e-06, "logits/chosen": 7.51926851272583, "logits/rejected": 6.704490661621094, "logps/chosen": -261.43597412109375, "logps/rejected": -275.8111572265625, "loss": 0.8212, "rewards/accuracies": 0.375, "rewards/chosen": -0.05760951340198517, "rewards/margins": -0.13593344390392303, "rewards/rejected": 0.07832393050193787, "step": 3688 }, { "epoch": 0.5705006765899865, "grad_norm": 5.138495445251465, "learning_rate": 4.499083514721045e-06, "logits/chosen": 9.874979019165039, "logits/rejected": 12.54377555847168, "logps/chosen": -217.4697723388672, "logps/rejected": -224.77923583984375, "loss": 0.7372, "rewards/accuracies": 0.5, "rewards/chosen": -0.009426403790712357, "rewards/margins": -0.03714337199926376, "rewards/rejected": 0.027716970071196556, "step": 3689 }, { "epoch": 0.5706553257297506, "grad_norm": 6.244549751281738, "learning_rate": 4.498797113071372e-06, "logits/chosen": 7.455750465393066, "logits/rejected": 4.7432403564453125, "logps/chosen": -353.31573486328125, "logps/rejected": -292.26507568359375, "loss": 0.4749, "rewards/accuracies": 0.75, "rewards/chosen": 0.29922980070114136, "rewards/margins": 0.6084624528884888, "rewards/rejected": -0.309232622385025, "step": 3690 }, { "epoch": 0.5708099748695148, "grad_norm": 4.084224224090576, "learning_rate": 4.498510711421698e-06, "logits/chosen": 12.917793273925781, "logits/rejected": 3.246866226196289, "logps/chosen": -279.272705078125, "logps/rejected": -194.4368438720703, "loss": 0.4774, "rewards/accuracies": 0.875, "rewards/chosen": 0.6332556009292603, "rewards/margins": 0.6148768663406372, "rewards/rejected": 0.01837873086333275, "step": 3691 }, { "epoch": 0.5709646240092789, "grad_norm": 6.143355846405029, "learning_rate": 4.498224309772024e-06, "logits/chosen": 9.517169952392578, "logits/rejected": 6.6279826164245605, "logps/chosen": -286.85748291015625, "logps/rejected": -259.88275146484375, "loss": 0.7693, "rewards/accuracies": 0.625, "rewards/chosen": -0.07559224218130112, "rewards/margins": -0.08706313371658325, "rewards/rejected": 0.011470891535282135, "step": 3692 }, { "epoch": 0.5711192731490431, "grad_norm": 6.04624080657959, "learning_rate": 4.497937908122351e-06, "logits/chosen": 14.013015747070312, "logits/rejected": 3.865271806716919, "logps/chosen": -316.0419921875, "logps/rejected": -200.34170532226562, "loss": 0.6086, "rewards/accuracies": 0.875, "rewards/chosen": -0.07523509860038757, "rewards/margins": 0.21238252520561218, "rewards/rejected": -0.28761762380599976, "step": 3693 }, { "epoch": 0.5712739222888072, "grad_norm": 5.281108379364014, "learning_rate": 4.497651506472678e-06, "logits/chosen": 12.506205558776855, "logits/rejected": 5.24484920501709, "logps/chosen": -384.62353515625, "logps/rejected": -318.9117736816406, "loss": 0.4616, "rewards/accuracies": 0.75, "rewards/chosen": 0.5413500070571899, "rewards/margins": 0.6123498678207397, "rewards/rejected": -0.07099992036819458, "step": 3694 }, { "epoch": 0.5714285714285714, "grad_norm": 3.5025908946990967, "learning_rate": 4.497365104823004e-06, "logits/chosen": 10.446781158447266, "logits/rejected": 3.465008497238159, "logps/chosen": -371.855712890625, "logps/rejected": -193.8430938720703, "loss": 0.4506, "rewards/accuracies": 0.75, "rewards/chosen": 0.39129209518432617, "rewards/margins": 0.7095879316329956, "rewards/rejected": -0.31829580664634705, "step": 3695 }, { "epoch": 0.5715832205683355, "grad_norm": 5.249233245849609, "learning_rate": 4.49707870317333e-06, "logits/chosen": 2.9382739067077637, "logits/rejected": 1.4521952867507935, "logps/chosen": -451.50823974609375, "logps/rejected": -369.6345520019531, "loss": 0.6066, "rewards/accuracies": 0.625, "rewards/chosen": -0.07451462000608444, "rewards/margins": 0.23689767718315125, "rewards/rejected": -0.31141233444213867, "step": 3696 }, { "epoch": 0.5717378697080997, "grad_norm": 4.745990753173828, "learning_rate": 4.496792301523657e-06, "logits/chosen": 14.343040466308594, "logits/rejected": 7.235940933227539, "logps/chosen": -381.2447509765625, "logps/rejected": -349.32275390625, "loss": 0.4675, "rewards/accuracies": 0.875, "rewards/chosen": 0.3196329176425934, "rewards/margins": 0.612358033657074, "rewards/rejected": -0.2927250862121582, "step": 3697 }, { "epoch": 0.571892518847864, "grad_norm": 7.157134056091309, "learning_rate": 4.496505899873983e-06, "logits/chosen": 6.340457916259766, "logits/rejected": 4.8563642501831055, "logps/chosen": -281.021484375, "logps/rejected": -239.84185791015625, "loss": 0.7218, "rewards/accuracies": 0.75, "rewards/chosen": -0.34968167543411255, "rewards/margins": 0.1062493622303009, "rewards/rejected": -0.45593100786209106, "step": 3698 }, { "epoch": 0.5720471679876281, "grad_norm": 5.116054534912109, "learning_rate": 4.49621949822431e-06, "logits/chosen": 9.402952194213867, "logits/rejected": 11.37956428527832, "logps/chosen": -236.03652954101562, "logps/rejected": -253.1999053955078, "loss": 0.5495, "rewards/accuracies": 0.625, "rewards/chosen": 0.22523260116577148, "rewards/margins": 0.5181406736373901, "rewards/rejected": -0.29290807247161865, "step": 3699 }, { "epoch": 0.5722018171273923, "grad_norm": 4.399021625518799, "learning_rate": 4.495933096574637e-06, "logits/chosen": 16.857341766357422, "logits/rejected": 9.565850257873535, "logps/chosen": -334.3639831542969, "logps/rejected": -277.7099609375, "loss": 0.4457, "rewards/accuracies": 0.875, "rewards/chosen": 0.4729700982570648, "rewards/margins": 0.6835820078849792, "rewards/rejected": -0.21061192452907562, "step": 3700 }, { "epoch": 0.5723564662671564, "grad_norm": 5.277714729309082, "learning_rate": 4.495646694924963e-06, "logits/chosen": 13.100316047668457, "logits/rejected": 9.61971664428711, "logps/chosen": -316.03271484375, "logps/rejected": -217.42532348632812, "loss": 0.6467, "rewards/accuracies": 0.625, "rewards/chosen": 0.3280617296695709, "rewards/margins": 0.3129432201385498, "rewards/rejected": 0.015118509531021118, "step": 3701 }, { "epoch": 0.5725111154069206, "grad_norm": 5.60197639465332, "learning_rate": 4.49536029327529e-06, "logits/chosen": 15.08932113647461, "logits/rejected": 11.757494926452637, "logps/chosen": -355.3668212890625, "logps/rejected": -281.8846130371094, "loss": 0.4995, "rewards/accuracies": 0.875, "rewards/chosen": 0.0007755272090435028, "rewards/margins": 0.4936409592628479, "rewards/rejected": -0.4928654730319977, "step": 3702 }, { "epoch": 0.5726657645466847, "grad_norm": 7.01350736618042, "learning_rate": 4.495073891625616e-06, "logits/chosen": 5.314853668212891, "logits/rejected": 1.3873262405395508, "logps/chosen": -392.3323974609375, "logps/rejected": -270.7787780761719, "loss": 0.7477, "rewards/accuracies": 0.625, "rewards/chosen": -0.0016715079545974731, "rewards/margins": 0.015269089490175247, "rewards/rejected": -0.016940593719482422, "step": 3703 }, { "epoch": 0.5728204136864489, "grad_norm": 7.185494422912598, "learning_rate": 4.4947874899759424e-06, "logits/chosen": 5.782631874084473, "logits/rejected": -0.3535642623901367, "logps/chosen": -387.6290283203125, "logps/rejected": -327.96417236328125, "loss": 0.4869, "rewards/accuracies": 0.875, "rewards/chosen": 0.3660643994808197, "rewards/margins": 0.526902973651886, "rewards/rejected": -0.16083863377571106, "step": 3704 }, { "epoch": 0.572975062826213, "grad_norm": 5.6459784507751465, "learning_rate": 4.494501088326269e-06, "logits/chosen": 13.248144149780273, "logits/rejected": 7.682799816131592, "logps/chosen": -317.3186950683594, "logps/rejected": -252.63990783691406, "loss": 0.704, "rewards/accuracies": 0.5, "rewards/chosen": 0.3220228850841522, "rewards/margins": 0.01689395308494568, "rewards/rejected": 0.3051289916038513, "step": 3705 }, { "epoch": 0.5731297119659772, "grad_norm": 6.846540927886963, "learning_rate": 4.494214686676596e-06, "logits/chosen": 8.376680374145508, "logits/rejected": 6.260311126708984, "logps/chosen": -334.760009765625, "logps/rejected": -254.99815368652344, "loss": 0.7095, "rewards/accuracies": 0.375, "rewards/chosen": 0.1150674819946289, "rewards/margins": -0.015519723296165466, "rewards/rejected": 0.13058719038963318, "step": 3706 }, { "epoch": 0.5732843611057413, "grad_norm": 5.605840682983398, "learning_rate": 4.493928285026922e-06, "logits/chosen": 5.8190016746521, "logits/rejected": 4.096757888793945, "logps/chosen": -274.852783203125, "logps/rejected": -305.9874572753906, "loss": 0.6153, "rewards/accuracies": 0.625, "rewards/chosen": 0.42660802602767944, "rewards/margins": 0.2664538025856018, "rewards/rejected": 0.16015425324440002, "step": 3707 }, { "epoch": 0.5734390102455055, "grad_norm": 4.321354389190674, "learning_rate": 4.493641883377249e-06, "logits/chosen": 9.291486740112305, "logits/rejected": 7.191115379333496, "logps/chosen": -206.15481567382812, "logps/rejected": -164.13954162597656, "loss": 0.7039, "rewards/accuracies": 0.375, "rewards/chosen": 0.11247245222330093, "rewards/margins": 0.2309592366218567, "rewards/rejected": -0.11848677694797516, "step": 3708 }, { "epoch": 0.5735936593852696, "grad_norm": 5.628379821777344, "learning_rate": 4.493355481727575e-06, "logits/chosen": 3.79827880859375, "logits/rejected": 2.976776123046875, "logps/chosen": -384.87994384765625, "logps/rejected": -256.0940856933594, "loss": 0.5315, "rewards/accuracies": 0.75, "rewards/chosen": 0.39725008606910706, "rewards/margins": 0.4684295952320099, "rewards/rejected": -0.07117953151464462, "step": 3709 }, { "epoch": 0.5737483085250338, "grad_norm": 7.524038791656494, "learning_rate": 4.4930690800779015e-06, "logits/chosen": 7.303856372833252, "logits/rejected": 6.873887538909912, "logps/chosen": -247.5548095703125, "logps/rejected": -228.5362548828125, "loss": 0.744, "rewards/accuracies": 0.25, "rewards/chosen": -0.14499793946743011, "rewards/margins": 0.041953109204769135, "rewards/rejected": -0.18695104122161865, "step": 3710 }, { "epoch": 0.573902957664798, "grad_norm": 5.400707721710205, "learning_rate": 4.492782678428228e-06, "logits/chosen": 8.069753646850586, "logits/rejected": 2.2130513191223145, "logps/chosen": -360.14459228515625, "logps/rejected": -180.12789916992188, "loss": 0.6999, "rewards/accuracies": 0.375, "rewards/chosen": 0.20198871195316315, "rewards/margins": 0.34531301259994507, "rewards/rejected": -0.14332430064678192, "step": 3711 }, { "epoch": 0.5740576068045622, "grad_norm": 6.845179557800293, "learning_rate": 4.492496276778555e-06, "logits/chosen": 9.691807746887207, "logits/rejected": 2.09788179397583, "logps/chosen": -460.1844482421875, "logps/rejected": -309.6032409667969, "loss": 0.5093, "rewards/accuracies": 0.875, "rewards/chosen": 0.779831051826477, "rewards/margins": 0.4788621962070465, "rewards/rejected": 0.3009689450263977, "step": 3712 }, { "epoch": 0.5742122559443263, "grad_norm": 6.296635150909424, "learning_rate": 4.4922098751288814e-06, "logits/chosen": 12.056804656982422, "logits/rejected": 7.376004219055176, "logps/chosen": -336.9814147949219, "logps/rejected": -240.58607482910156, "loss": 0.6617, "rewards/accuracies": 0.625, "rewards/chosen": 0.003088429570198059, "rewards/margins": 0.14123979210853577, "rewards/rejected": -0.1381513476371765, "step": 3713 }, { "epoch": 0.5743669050840905, "grad_norm": 4.655721664428711, "learning_rate": 4.491923473479208e-06, "logits/chosen": 9.70938491821289, "logits/rejected": -1.8758000135421753, "logps/chosen": -408.7356262207031, "logps/rejected": -248.5621337890625, "loss": 0.5552, "rewards/accuracies": 0.625, "rewards/chosen": 0.21725772321224213, "rewards/margins": 0.7408715486526489, "rewards/rejected": -0.5236138105392456, "step": 3714 }, { "epoch": 0.5745215542238546, "grad_norm": 4.45045804977417, "learning_rate": 4.491637071829535e-06, "logits/chosen": 14.846986770629883, "logits/rejected": 10.147533416748047, "logps/chosen": -260.1597900390625, "logps/rejected": -213.46693420410156, "loss": 0.5569, "rewards/accuracies": 0.75, "rewards/chosen": 0.24194619059562683, "rewards/margins": 0.41860532760620117, "rewards/rejected": -0.17665912210941315, "step": 3715 }, { "epoch": 0.5746762033636188, "grad_norm": 6.709345817565918, "learning_rate": 4.4913506701798606e-06, "logits/chosen": 8.405929565429688, "logits/rejected": 9.518918991088867, "logps/chosen": -368.247802734375, "logps/rejected": -314.67413330078125, "loss": 0.7618, "rewards/accuracies": 0.5, "rewards/chosen": 0.18052148818969727, "rewards/margins": -0.07351011037826538, "rewards/rejected": 0.25403159856796265, "step": 3716 }, { "epoch": 0.574830852503383, "grad_norm": 3.5707855224609375, "learning_rate": 4.491064268530187e-06, "logits/chosen": 7.9638237953186035, "logits/rejected": 7.77506160736084, "logps/chosen": -241.71742248535156, "logps/rejected": -174.21189880371094, "loss": 0.5539, "rewards/accuracies": 0.875, "rewards/chosen": 0.4991726279258728, "rewards/margins": 0.35444897413253784, "rewards/rejected": 0.14472365379333496, "step": 3717 }, { "epoch": 0.5749855016431471, "grad_norm": 6.123157024383545, "learning_rate": 4.490777866880514e-06, "logits/chosen": 9.426716804504395, "logits/rejected": -0.6902797222137451, "logps/chosen": -340.7825012207031, "logps/rejected": -246.65695190429688, "loss": 0.5228, "rewards/accuracies": 0.875, "rewards/chosen": -0.03857307881116867, "rewards/margins": 0.42529231309890747, "rewards/rejected": -0.46386539936065674, "step": 3718 }, { "epoch": 0.5751401507829113, "grad_norm": 5.5533366203308105, "learning_rate": 4.4904914652308405e-06, "logits/chosen": 11.394113540649414, "logits/rejected": 14.207951545715332, "logps/chosen": -247.14279174804688, "logps/rejected": -282.165771484375, "loss": 0.5418, "rewards/accuracies": 0.75, "rewards/chosen": 0.17103368043899536, "rewards/margins": 0.5050920844078064, "rewards/rejected": -0.33405840396881104, "step": 3719 }, { "epoch": 0.5752947999226754, "grad_norm": 3.6601967811584473, "learning_rate": 4.490205063581166e-06, "logits/chosen": 8.682501792907715, "logits/rejected": 7.988053798675537, "logps/chosen": -402.9241638183594, "logps/rejected": -351.97186279296875, "loss": 0.4802, "rewards/accuracies": 0.75, "rewards/chosen": 0.57405686378479, "rewards/margins": 0.6669309139251709, "rewards/rejected": -0.09287404268980026, "step": 3720 }, { "epoch": 0.5754494490624396, "grad_norm": 3.6674857139587402, "learning_rate": 4.489918661931493e-06, "logits/chosen": 15.722311019897461, "logits/rejected": 3.220036029815674, "logps/chosen": -285.2763977050781, "logps/rejected": -180.33151245117188, "loss": 0.4644, "rewards/accuracies": 0.875, "rewards/chosen": 0.16027727723121643, "rewards/margins": 0.7090728282928467, "rewards/rejected": -0.5487955212593079, "step": 3721 }, { "epoch": 0.5756040982022037, "grad_norm": 6.872456073760986, "learning_rate": 4.48963226028182e-06, "logits/chosen": 13.623438835144043, "logits/rejected": 9.795961380004883, "logps/chosen": -243.22866821289062, "logps/rejected": -176.3389434814453, "loss": 0.681, "rewards/accuracies": 0.625, "rewards/chosen": 0.16342535614967346, "rewards/margins": 0.05227527767419815, "rewards/rejected": 0.11115007102489471, "step": 3722 }, { "epoch": 0.5757587473419679, "grad_norm": 6.244222164154053, "learning_rate": 4.489345858632146e-06, "logits/chosen": 8.075648307800293, "logits/rejected": 9.600786209106445, "logps/chosen": -282.38800048828125, "logps/rejected": -272.2184753417969, "loss": 0.8111, "rewards/accuracies": 0.375, "rewards/chosen": 0.10409536957740784, "rewards/margins": -0.1438450962305069, "rewards/rejected": 0.24794045090675354, "step": 3723 }, { "epoch": 0.5759133964817321, "grad_norm": 9.015183448791504, "learning_rate": 4.489059456982472e-06, "logits/chosen": 9.850502967834473, "logits/rejected": 7.741997718811035, "logps/chosen": -321.7296142578125, "logps/rejected": -202.72525024414062, "loss": 0.6638, "rewards/accuracies": 0.75, "rewards/chosen": -0.16827788949012756, "rewards/margins": 0.23673200607299805, "rewards/rejected": -0.4050098657608032, "step": 3724 }, { "epoch": 0.5760680456214963, "grad_norm": 5.416754245758057, "learning_rate": 4.488773055332799e-06, "logits/chosen": 13.911482810974121, "logits/rejected": 8.012048721313477, "logps/chosen": -278.4915466308594, "logps/rejected": -213.6652374267578, "loss": 0.7069, "rewards/accuracies": 0.5, "rewards/chosen": 0.09226924180984497, "rewards/margins": 0.08572528511285782, "rewards/rejected": 0.006543941795825958, "step": 3725 }, { "epoch": 0.5762226947612604, "grad_norm": 4.727205753326416, "learning_rate": 4.488486653683125e-06, "logits/chosen": 11.337738037109375, "logits/rejected": 2.549610137939453, "logps/chosen": -446.99798583984375, "logps/rejected": -215.03564453125, "loss": 0.4234, "rewards/accuracies": 0.75, "rewards/chosen": 0.7282956838607788, "rewards/margins": 0.8834214806556702, "rewards/rejected": -0.15512579679489136, "step": 3726 }, { "epoch": 0.5763773439010246, "grad_norm": 4.350319862365723, "learning_rate": 4.488200252033452e-06, "logits/chosen": 11.430807113647461, "logits/rejected": 5.448462009429932, "logps/chosen": -257.48455810546875, "logps/rejected": -218.32249450683594, "loss": 0.5524, "rewards/accuracies": 0.75, "rewards/chosen": 0.06599608063697815, "rewards/margins": 0.4484759569168091, "rewards/rejected": -0.3824799060821533, "step": 3727 }, { "epoch": 0.5765319930407887, "grad_norm": 4.239972114562988, "learning_rate": 4.487913850383779e-06, "logits/chosen": 12.200223922729492, "logits/rejected": 8.198999404907227, "logps/chosen": -281.66552734375, "logps/rejected": -260.97723388671875, "loss": 0.6026, "rewards/accuracies": 0.625, "rewards/chosen": 0.13597224652767181, "rewards/margins": 0.35886770486831665, "rewards/rejected": -0.22289542853832245, "step": 3728 }, { "epoch": 0.5766866421805529, "grad_norm": 4.722055435180664, "learning_rate": 4.4876274487341045e-06, "logits/chosen": 14.625142097473145, "logits/rejected": 9.34661865234375, "logps/chosen": -410.48968505859375, "logps/rejected": -324.8926086425781, "loss": 0.4963, "rewards/accuracies": 0.875, "rewards/chosen": 0.551278293132782, "rewards/margins": 0.4936273694038391, "rewards/rejected": 0.057650938630104065, "step": 3729 }, { "epoch": 0.576841291320317, "grad_norm": 5.584944725036621, "learning_rate": 4.487341047084431e-06, "logits/chosen": 11.08084487915039, "logits/rejected": 8.703192710876465, "logps/chosen": -342.2227783203125, "logps/rejected": -335.8408203125, "loss": 0.6348, "rewards/accuracies": 0.625, "rewards/chosen": 0.30301690101623535, "rewards/margins": 0.3336612284183502, "rewards/rejected": -0.03064434602856636, "step": 3730 }, { "epoch": 0.5769959404600812, "grad_norm": 4.056881427764893, "learning_rate": 4.487054645434758e-06, "logits/chosen": 11.559033393859863, "logits/rejected": 5.392719268798828, "logps/chosen": -325.2268371582031, "logps/rejected": -213.72418212890625, "loss": 0.5389, "rewards/accuracies": 0.75, "rewards/chosen": 0.04463265836238861, "rewards/margins": 0.41269466280937195, "rewards/rejected": -0.36806201934814453, "step": 3731 }, { "epoch": 0.5771505895998453, "grad_norm": 5.971040725708008, "learning_rate": 4.486768243785084e-06, "logits/chosen": 8.115890502929688, "logits/rejected": 8.35416316986084, "logps/chosen": -302.03509521484375, "logps/rejected": -333.9668884277344, "loss": 0.6932, "rewards/accuracies": 0.25, "rewards/chosen": -0.050124168395996094, "rewards/margins": 0.0197637677192688, "rewards/rejected": -0.06988793611526489, "step": 3732 }, { "epoch": 0.5773052387396095, "grad_norm": 5.840587615966797, "learning_rate": 4.486481842135411e-06, "logits/chosen": 11.901101112365723, "logits/rejected": 14.537363052368164, "logps/chosen": -268.8872985839844, "logps/rejected": -326.4714660644531, "loss": 0.7482, "rewards/accuracies": 0.5, "rewards/chosen": -0.04025077447295189, "rewards/margins": 0.03637939691543579, "rewards/rejected": -0.07663017511367798, "step": 3733 }, { "epoch": 0.5774598878793736, "grad_norm": 5.733351230621338, "learning_rate": 4.486195440485738e-06, "logits/chosen": 9.068757057189941, "logits/rejected": 4.97458028793335, "logps/chosen": -232.80511474609375, "logps/rejected": -195.374755859375, "loss": 0.6016, "rewards/accuracies": 0.75, "rewards/chosen": 0.12704233825206757, "rewards/margins": 0.23217704892158508, "rewards/rejected": -0.10513471812009811, "step": 3734 }, { "epoch": 0.5776145370191378, "grad_norm": 5.579530715942383, "learning_rate": 4.485909038836064e-06, "logits/chosen": 10.892488479614258, "logits/rejected": 6.22386360168457, "logps/chosen": -290.3475341796875, "logps/rejected": -221.51890563964844, "loss": 0.7976, "rewards/accuracies": 0.25, "rewards/chosen": -0.3246510624885559, "rewards/margins": -0.1318206787109375, "rewards/rejected": -0.1928303837776184, "step": 3735 }, { "epoch": 0.5777691861589019, "grad_norm": 5.380240440368652, "learning_rate": 4.48562263718639e-06, "logits/chosen": 10.766910552978516, "logits/rejected": 7.608599662780762, "logps/chosen": -323.02105712890625, "logps/rejected": -266.8172607421875, "loss": 0.6513, "rewards/accuracies": 0.375, "rewards/chosen": 0.2053164541721344, "rewards/margins": 0.22976070642471313, "rewards/rejected": -0.024444222450256348, "step": 3736 }, { "epoch": 0.5779238352986662, "grad_norm": 7.871776580810547, "learning_rate": 4.485336235536717e-06, "logits/chosen": 5.785160541534424, "logits/rejected": 6.462968349456787, "logps/chosen": -209.09873962402344, "logps/rejected": -276.9587707519531, "loss": 0.8479, "rewards/accuracies": 0.5, "rewards/chosen": -0.05167999118566513, "rewards/margins": -0.16479802131652832, "rewards/rejected": 0.11311802268028259, "step": 3737 }, { "epoch": 0.5780784844384304, "grad_norm": 5.114760398864746, "learning_rate": 4.4850498338870435e-06, "logits/chosen": 10.222497940063477, "logits/rejected": 9.965426445007324, "logps/chosen": -232.41343688964844, "logps/rejected": -200.0277099609375, "loss": 0.6488, "rewards/accuracies": 0.75, "rewards/chosen": -0.09685802459716797, "rewards/margins": 0.10919791460037231, "rewards/rejected": -0.20605593919754028, "step": 3738 }, { "epoch": 0.5782331335781945, "grad_norm": 3.8910679817199707, "learning_rate": 4.48476343223737e-06, "logits/chosen": 10.613103866577148, "logits/rejected": 11.750024795532227, "logps/chosen": -206.47462463378906, "logps/rejected": -269.7124328613281, "loss": 0.4296, "rewards/accuracies": 0.875, "rewards/chosen": 0.36297935247421265, "rewards/margins": 1.1942442655563354, "rewards/rejected": -0.8312649130821228, "step": 3739 }, { "epoch": 0.5783877827179587, "grad_norm": 4.667725563049316, "learning_rate": 4.484477030587697e-06, "logits/chosen": 12.33337116241455, "logits/rejected": 5.116974830627441, "logps/chosen": -253.1707763671875, "logps/rejected": -221.9131317138672, "loss": 0.5925, "rewards/accuracies": 0.75, "rewards/chosen": 0.1445111334323883, "rewards/margins": 0.2741219997406006, "rewards/rejected": -0.12961086630821228, "step": 3740 }, { "epoch": 0.5785424318577228, "grad_norm": 4.907107353210449, "learning_rate": 4.4841906289380234e-06, "logits/chosen": 11.9082670211792, "logits/rejected": 11.544818878173828, "logps/chosen": -309.4208679199219, "logps/rejected": -302.5118408203125, "loss": 0.5236, "rewards/accuracies": 0.75, "rewards/chosen": 0.27777189016342163, "rewards/margins": 0.47985750436782837, "rewards/rejected": -0.20208558440208435, "step": 3741 }, { "epoch": 0.578697080997487, "grad_norm": 4.337289333343506, "learning_rate": 4.483904227288349e-06, "logits/chosen": 10.786046981811523, "logits/rejected": 3.8704121112823486, "logps/chosen": -202.09494018554688, "logps/rejected": -95.15509033203125, "loss": 0.62, "rewards/accuracies": 0.75, "rewards/chosen": -0.16829673945903778, "rewards/margins": 0.32523277401924133, "rewards/rejected": -0.4935295581817627, "step": 3742 }, { "epoch": 0.5788517301372511, "grad_norm": 7.320075511932373, "learning_rate": 4.483617825638676e-06, "logits/chosen": 14.791908264160156, "logits/rejected": 9.116199493408203, "logps/chosen": -379.0672912597656, "logps/rejected": -293.0577697753906, "loss": 0.7252, "rewards/accuracies": 0.5, "rewards/chosen": -0.2709035873413086, "rewards/margins": 0.12911444902420044, "rewards/rejected": -0.40001803636550903, "step": 3743 }, { "epoch": 0.5790063792770153, "grad_norm": 4.902713775634766, "learning_rate": 4.4833314239890025e-06, "logits/chosen": 10.773898124694824, "logits/rejected": 6.587568283081055, "logps/chosen": -213.33270263671875, "logps/rejected": -187.54014587402344, "loss": 0.6313, "rewards/accuracies": 0.375, "rewards/chosen": 0.18620261549949646, "rewards/margins": 0.22033604979515076, "rewards/rejected": -0.0341334342956543, "step": 3744 }, { "epoch": 0.5791610284167794, "grad_norm": 9.9924898147583, "learning_rate": 4.483045022339329e-06, "logits/chosen": 8.371187210083008, "logits/rejected": 5.008256435394287, "logps/chosen": -263.6105651855469, "logps/rejected": -189.58517456054688, "loss": 0.8659, "rewards/accuracies": 0.375, "rewards/chosen": -0.054033905267715454, "rewards/margins": -0.22151866555213928, "rewards/rejected": 0.16748476028442383, "step": 3745 }, { "epoch": 0.5793156775565436, "grad_norm": 8.487750053405762, "learning_rate": 4.482758620689656e-06, "logits/chosen": 10.929519653320312, "logits/rejected": 10.445944786071777, "logps/chosen": -366.62847900390625, "logps/rejected": -356.4198303222656, "loss": 1.0023, "rewards/accuracies": 0.375, "rewards/chosen": -0.17304527759552002, "rewards/margins": -0.43341904878616333, "rewards/rejected": 0.2603738009929657, "step": 3746 }, { "epoch": 0.5794703266963077, "grad_norm": 5.879138469696045, "learning_rate": 4.4824722190399825e-06, "logits/chosen": 13.399274826049805, "logits/rejected": 7.804872989654541, "logps/chosen": -192.129638671875, "logps/rejected": -158.424560546875, "loss": 0.7957, "rewards/accuracies": 0.375, "rewards/chosen": -0.07273131608963013, "rewards/margins": -0.1360088288784027, "rewards/rejected": 0.06327749788761139, "step": 3747 }, { "epoch": 0.5796249758360719, "grad_norm": 6.219170093536377, "learning_rate": 4.482185817390309e-06, "logits/chosen": 8.170676231384277, "logits/rejected": 7.780335426330566, "logps/chosen": -233.9054718017578, "logps/rejected": -189.16046142578125, "loss": 0.8364, "rewards/accuracies": 0.375, "rewards/chosen": -0.21995553374290466, "rewards/margins": -0.21628113090991974, "rewards/rejected": -0.0036744065582752228, "step": 3748 }, { "epoch": 0.579779624975836, "grad_norm": 5.667336940765381, "learning_rate": 4.481899415740635e-06, "logits/chosen": 12.209220886230469, "logits/rejected": 8.255184173583984, "logps/chosen": -360.5721435546875, "logps/rejected": -305.6612548828125, "loss": 0.6736, "rewards/accuracies": 0.5, "rewards/chosen": 0.27346983551979065, "rewards/margins": 0.250363826751709, "rewards/rejected": 0.023106008768081665, "step": 3749 }, { "epoch": 0.5799342741156003, "grad_norm": 4.742862224578857, "learning_rate": 4.481613014090962e-06, "logits/chosen": 17.49440574645996, "logits/rejected": 14.298995018005371, "logps/chosen": -370.58538818359375, "logps/rejected": -371.48577880859375, "loss": 0.4384, "rewards/accuracies": 0.875, "rewards/chosen": 0.32053032517433167, "rewards/margins": 0.6644891500473022, "rewards/rejected": -0.3439588248729706, "step": 3750 }, { "epoch": 0.5800889232553644, "grad_norm": 4.997811794281006, "learning_rate": 4.481326612441288e-06, "logits/chosen": 10.554903030395508, "logits/rejected": 7.099453449249268, "logps/chosen": -226.81005859375, "logps/rejected": -230.79368591308594, "loss": 0.5953, "rewards/accuracies": 0.75, "rewards/chosen": -0.10634604096412659, "rewards/margins": 0.24701035022735596, "rewards/rejected": -0.35335636138916016, "step": 3751 }, { "epoch": 0.5802435723951286, "grad_norm": 9.690844535827637, "learning_rate": 4.481040210791615e-06, "logits/chosen": 10.978182792663574, "logits/rejected": 10.606393814086914, "logps/chosen": -432.098876953125, "logps/rejected": -440.06005859375, "loss": 0.8279, "rewards/accuracies": 0.375, "rewards/chosen": 0.3529754877090454, "rewards/margins": 0.04951745271682739, "rewards/rejected": 0.303458034992218, "step": 3752 }, { "epoch": 0.5803982215348927, "grad_norm": 10.848416328430176, "learning_rate": 4.4807538091419415e-06, "logits/chosen": 8.261761665344238, "logits/rejected": 7.789950370788574, "logps/chosen": -261.759765625, "logps/rejected": -262.27972412109375, "loss": 1.0525, "rewards/accuracies": 0.375, "rewards/chosen": -0.07930061221122742, "rewards/margins": -0.5074067711830139, "rewards/rejected": 0.4281061887741089, "step": 3753 }, { "epoch": 0.5805528706746569, "grad_norm": 7.323809623718262, "learning_rate": 4.480467407492267e-06, "logits/chosen": 7.141221523284912, "logits/rejected": 4.714208126068115, "logps/chosen": -362.9966125488281, "logps/rejected": -224.17605590820312, "loss": 0.6952, "rewards/accuracies": 0.75, "rewards/chosen": 0.03145284950733185, "rewards/margins": 0.23262004554271698, "rewards/rejected": -0.20116716623306274, "step": 3754 }, { "epoch": 0.580707519814421, "grad_norm": 4.843491077423096, "learning_rate": 4.480181005842594e-06, "logits/chosen": 9.270769119262695, "logits/rejected": 9.21450424194336, "logps/chosen": -218.399658203125, "logps/rejected": -258.3753662109375, "loss": 0.5349, "rewards/accuracies": 0.75, "rewards/chosen": 0.3864266574382782, "rewards/margins": 0.46174392104148865, "rewards/rejected": -0.07531729340553284, "step": 3755 }, { "epoch": 0.5808621689541852, "grad_norm": 7.27138090133667, "learning_rate": 4.479894604192921e-06, "logits/chosen": 12.108461380004883, "logits/rejected": 8.761161804199219, "logps/chosen": -421.70733642578125, "logps/rejected": -365.26702880859375, "loss": 0.6395, "rewards/accuracies": 0.625, "rewards/chosen": 0.237372487783432, "rewards/margins": 0.16200634837150574, "rewards/rejected": 0.07536615431308746, "step": 3756 }, { "epoch": 0.5810168180939493, "grad_norm": 4.791899681091309, "learning_rate": 4.479608202543247e-06, "logits/chosen": 9.259770393371582, "logits/rejected": 4.559820652008057, "logps/chosen": -258.1905822753906, "logps/rejected": -188.4403076171875, "loss": 0.6288, "rewards/accuracies": 0.5, "rewards/chosen": 0.16299495100975037, "rewards/margins": 0.2914098799228668, "rewards/rejected": -0.12841489911079407, "step": 3757 }, { "epoch": 0.5811714672337135, "grad_norm": 8.089067459106445, "learning_rate": 4.479321800893573e-06, "logits/chosen": 12.341484069824219, "logits/rejected": 6.69716215133667, "logps/chosen": -378.91436767578125, "logps/rejected": -270.20867919921875, "loss": 0.5691, "rewards/accuracies": 0.75, "rewards/chosen": 0.38474541902542114, "rewards/margins": 0.44884008169174194, "rewards/rejected": -0.06409463286399841, "step": 3758 }, { "epoch": 0.5813261163734776, "grad_norm": 6.568228244781494, "learning_rate": 4.4790353992439e-06, "logits/chosen": 7.807243347167969, "logits/rejected": 7.417841911315918, "logps/chosen": -304.3643798828125, "logps/rejected": -383.8334655761719, "loss": 0.7749, "rewards/accuracies": 0.625, "rewards/chosen": 0.2984338402748108, "rewards/margins": 0.15377725660800934, "rewards/rejected": 0.14465655386447906, "step": 3759 }, { "epoch": 0.5814807655132418, "grad_norm": 3.6757256984710693, "learning_rate": 4.478748997594226e-06, "logits/chosen": 12.442808151245117, "logits/rejected": 6.566100120544434, "logps/chosen": -337.8768005371094, "logps/rejected": -298.24859619140625, "loss": 0.4441, "rewards/accuracies": 0.875, "rewards/chosen": 0.3336055874824524, "rewards/margins": 0.7352687120437622, "rewards/rejected": -0.4016631245613098, "step": 3760 }, { "epoch": 0.5816354146530059, "grad_norm": 3.560396194458008, "learning_rate": 4.478462595944553e-06, "logits/chosen": 8.497380256652832, "logits/rejected": 7.978403091430664, "logps/chosen": -281.2266845703125, "logps/rejected": -273.2942199707031, "loss": 0.3832, "rewards/accuracies": 0.875, "rewards/chosen": 0.4074232280254364, "rewards/margins": 0.905182957649231, "rewards/rejected": -0.4977598190307617, "step": 3761 }, { "epoch": 0.5817900637927702, "grad_norm": 3.164163589477539, "learning_rate": 4.478176194294879e-06, "logits/chosen": 11.702451705932617, "logits/rejected": 5.535411834716797, "logps/chosen": -194.426513671875, "logps/rejected": -101.06061553955078, "loss": 0.5323, "rewards/accuracies": 0.75, "rewards/chosen": 0.22941532731056213, "rewards/margins": 0.46564823389053345, "rewards/rejected": -0.2362329214811325, "step": 3762 }, { "epoch": 0.5819447129325344, "grad_norm": 6.195248603820801, "learning_rate": 4.4778897926452055e-06, "logits/chosen": 10.390745162963867, "logits/rejected": 12.758419036865234, "logps/chosen": -242.35296630859375, "logps/rejected": -385.4520568847656, "loss": 0.7732, "rewards/accuracies": 0.375, "rewards/chosen": 0.24415001273155212, "rewards/margins": -0.10610131919384003, "rewards/rejected": 0.35025131702423096, "step": 3763 }, { "epoch": 0.5820993620722985, "grad_norm": 5.916524887084961, "learning_rate": 4.477603390995532e-06, "logits/chosen": 8.734375953674316, "logits/rejected": 6.744244575500488, "logps/chosen": -321.8634338378906, "logps/rejected": -327.76751708984375, "loss": 0.6345, "rewards/accuracies": 0.625, "rewards/chosen": 0.3101542294025421, "rewards/margins": 0.22043971717357635, "rewards/rejected": 0.08971453458070755, "step": 3764 }, { "epoch": 0.5822540112120627, "grad_norm": 6.159871578216553, "learning_rate": 4.477316989345859e-06, "logits/chosen": 9.64154052734375, "logits/rejected": 2.321913719177246, "logps/chosen": -248.2386016845703, "logps/rejected": -207.50784301757812, "loss": 0.6382, "rewards/accuracies": 0.5, "rewards/chosen": 0.06591348350048065, "rewards/margins": 0.24014215171337128, "rewards/rejected": -0.17422866821289062, "step": 3765 }, { "epoch": 0.5824086603518268, "grad_norm": 8.377087593078613, "learning_rate": 4.4770305876961855e-06, "logits/chosen": 5.4512715339660645, "logits/rejected": 6.379087924957275, "logps/chosen": -247.5706024169922, "logps/rejected": -344.85137939453125, "loss": 0.7427, "rewards/accuracies": 0.625, "rewards/chosen": 0.440658837556839, "rewards/margins": -0.02797914296388626, "rewards/rejected": 0.46863800287246704, "step": 3766 }, { "epoch": 0.582563309491591, "grad_norm": 4.889064311981201, "learning_rate": 4.476744186046512e-06, "logits/chosen": 7.883516311645508, "logits/rejected": 7.559130668640137, "logps/chosen": -213.40573120117188, "logps/rejected": -211.27577209472656, "loss": 0.6861, "rewards/accuracies": 0.5, "rewards/chosen": -0.0546211302280426, "rewards/margins": 0.021650459617376328, "rewards/rejected": -0.07627158612012863, "step": 3767 }, { "epoch": 0.5827179586313551, "grad_norm": 7.313686847686768, "learning_rate": 4.476457784396839e-06, "logits/chosen": 5.974917411804199, "logits/rejected": 7.605887413024902, "logps/chosen": -303.02294921875, "logps/rejected": -319.54815673828125, "loss": 0.6538, "rewards/accuracies": 0.625, "rewards/chosen": 0.21846377849578857, "rewards/margins": 0.20421268045902252, "rewards/rejected": 0.014251090586185455, "step": 3768 }, { "epoch": 0.5828726077711193, "grad_norm": 5.9766926765441895, "learning_rate": 4.4761713827471646e-06, "logits/chosen": 10.101509094238281, "logits/rejected": 12.571516036987305, "logps/chosen": -308.71331787109375, "logps/rejected": -300.5577697753906, "loss": 0.7918, "rewards/accuracies": 0.5, "rewards/chosen": 0.27153289318084717, "rewards/margins": -0.06378117203712463, "rewards/rejected": 0.3353140950202942, "step": 3769 }, { "epoch": 0.5830272569108834, "grad_norm": 4.380134105682373, "learning_rate": 4.475884981097491e-06, "logits/chosen": 5.582114219665527, "logits/rejected": 2.132516860961914, "logps/chosen": -322.6517333984375, "logps/rejected": -232.29786682128906, "loss": 0.6985, "rewards/accuracies": 0.375, "rewards/chosen": 0.2889731526374817, "rewards/margins": 0.10297083109617233, "rewards/rejected": 0.18600231409072876, "step": 3770 }, { "epoch": 0.5831819060506476, "grad_norm": 7.89937686920166, "learning_rate": 4.475598579447818e-06, "logits/chosen": 11.376214981079102, "logits/rejected": 10.147675514221191, "logps/chosen": -290.3760070800781, "logps/rejected": -192.64199829101562, "loss": 0.5719, "rewards/accuracies": 0.75, "rewards/chosen": -0.3364810049533844, "rewards/margins": 0.45987826585769653, "rewards/rejected": -0.7963593006134033, "step": 3771 }, { "epoch": 0.5833365551904117, "grad_norm": 4.555075168609619, "learning_rate": 4.4753121777981445e-06, "logits/chosen": 4.479366302490234, "logits/rejected": 3.993962287902832, "logps/chosen": -182.8438720703125, "logps/rejected": -271.4689636230469, "loss": 0.4553, "rewards/accuracies": 0.875, "rewards/chosen": -0.06888549774885178, "rewards/margins": 0.7116819620132446, "rewards/rejected": -0.7805674076080322, "step": 3772 }, { "epoch": 0.5834912043301759, "grad_norm": 5.773561000823975, "learning_rate": 4.475025776148471e-06, "logits/chosen": 13.352177619934082, "logits/rejected": 6.237696647644043, "logps/chosen": -187.14210510253906, "logps/rejected": -167.95602416992188, "loss": 0.4966, "rewards/accuracies": 0.875, "rewards/chosen": 0.27232131361961365, "rewards/margins": 0.48372483253479004, "rewards/rejected": -0.2114035189151764, "step": 3773 }, { "epoch": 0.58364585346994, "grad_norm": 4.6555681228637695, "learning_rate": 4.474739374498798e-06, "logits/chosen": 10.552513122558594, "logits/rejected": 11.679862022399902, "logps/chosen": -240.0597381591797, "logps/rejected": -232.78564453125, "loss": 0.6121, "rewards/accuracies": 0.625, "rewards/chosen": 0.10497722774744034, "rewards/margins": 0.2214737981557846, "rewards/rejected": -0.11649655550718307, "step": 3774 }, { "epoch": 0.5838005026097043, "grad_norm": 5.564403533935547, "learning_rate": 4.474452972849124e-06, "logits/chosen": 12.715398788452148, "logits/rejected": 7.832818031311035, "logps/chosen": -463.7010803222656, "logps/rejected": -375.20025634765625, "loss": 0.4966, "rewards/accuracies": 0.75, "rewards/chosen": 0.4750397205352783, "rewards/margins": 0.6359333395957947, "rewards/rejected": -0.16089363396167755, "step": 3775 }, { "epoch": 0.5839551517494684, "grad_norm": 4.329071044921875, "learning_rate": 4.47416657119945e-06, "logits/chosen": 9.388490676879883, "logits/rejected": 10.300674438476562, "logps/chosen": -183.52597045898438, "logps/rejected": -174.80609130859375, "loss": 0.6342, "rewards/accuracies": 0.75, "rewards/chosen": 0.3162219524383545, "rewards/margins": 0.1883973777294159, "rewards/rejected": 0.1278245598077774, "step": 3776 }, { "epoch": 0.5841098008892326, "grad_norm": 5.97024393081665, "learning_rate": 4.473880169549777e-06, "logits/chosen": 16.1877384185791, "logits/rejected": 9.781013488769531, "logps/chosen": -446.768798828125, "logps/rejected": -406.63165283203125, "loss": 0.5806, "rewards/accuracies": 0.625, "rewards/chosen": 0.7192550897598267, "rewards/margins": 0.36904335021972656, "rewards/rejected": 0.3502117097377777, "step": 3777 }, { "epoch": 0.5842644500289967, "grad_norm": 6.44588041305542, "learning_rate": 4.4735937679001036e-06, "logits/chosen": 13.135551452636719, "logits/rejected": 10.18069839477539, "logps/chosen": -365.21466064453125, "logps/rejected": -322.7725524902344, "loss": 0.6919, "rewards/accuracies": 0.75, "rewards/chosen": 0.38954561948776245, "rewards/margins": 0.11762365698814392, "rewards/rejected": 0.2719219923019409, "step": 3778 }, { "epoch": 0.5844190991687609, "grad_norm": 4.911144256591797, "learning_rate": 4.47330736625043e-06, "logits/chosen": 14.770395278930664, "logits/rejected": 10.70228385925293, "logps/chosen": -264.84332275390625, "logps/rejected": -250.11279296875, "loss": 0.6753, "rewards/accuracies": 0.625, "rewards/chosen": -0.2633327841758728, "rewards/margins": 0.1370125114917755, "rewards/rejected": -0.4003453254699707, "step": 3779 }, { "epoch": 0.584573748308525, "grad_norm": 6.50197696685791, "learning_rate": 4.473020964600757e-06, "logits/chosen": 13.485575675964355, "logits/rejected": 8.607733726501465, "logps/chosen": -273.1490478515625, "logps/rejected": -188.75436401367188, "loss": 0.727, "rewards/accuracies": 0.5, "rewards/chosen": 0.0367947518825531, "rewards/margins": -0.027381855994462967, "rewards/rejected": 0.06417660415172577, "step": 3780 }, { "epoch": 0.5847283974482892, "grad_norm": 10.383441925048828, "learning_rate": 4.4727345629510835e-06, "logits/chosen": 9.297343254089355, "logits/rejected": 5.5316362380981445, "logps/chosen": -531.0068359375, "logps/rejected": -336.4123840332031, "loss": 0.7295, "rewards/accuracies": 0.625, "rewards/chosen": 0.18946370482444763, "rewards/margins": 0.010195180773735046, "rewards/rejected": 0.17926853895187378, "step": 3781 }, { "epoch": 0.5848830465880533, "grad_norm": 5.7821760177612305, "learning_rate": 4.472448161301409e-06, "logits/chosen": 12.578073501586914, "logits/rejected": 14.491165161132812, "logps/chosen": -263.8685607910156, "logps/rejected": -294.4859313964844, "loss": 0.6188, "rewards/accuracies": 0.625, "rewards/chosen": 0.5995829105377197, "rewards/margins": 0.3486010432243347, "rewards/rejected": 0.2509818971157074, "step": 3782 }, { "epoch": 0.5850376957278175, "grad_norm": 4.529803276062012, "learning_rate": 4.472161759651736e-06, "logits/chosen": 12.318668365478516, "logits/rejected": 1.136120319366455, "logps/chosen": -329.66497802734375, "logps/rejected": -239.211669921875, "loss": 0.4283, "rewards/accuracies": 0.75, "rewards/chosen": 0.2693607211112976, "rewards/margins": 0.940729022026062, "rewards/rejected": -0.6713683605194092, "step": 3783 }, { "epoch": 0.5851923448675816, "grad_norm": 6.902698516845703, "learning_rate": 4.471875358002063e-06, "logits/chosen": 6.333864212036133, "logits/rejected": 6.201364994049072, "logps/chosen": -252.17929077148438, "logps/rejected": -338.15081787109375, "loss": 0.7896, "rewards/accuracies": 0.375, "rewards/chosen": 0.407183438539505, "rewards/margins": 0.04521150141954422, "rewards/rejected": 0.361971914768219, "step": 3784 }, { "epoch": 0.5853469940073458, "grad_norm": 6.2179646492004395, "learning_rate": 4.471588956352389e-06, "logits/chosen": 12.140695571899414, "logits/rejected": 13.30762004852295, "logps/chosen": -187.5406494140625, "logps/rejected": -265.6317443847656, "loss": 0.6663, "rewards/accuracies": 0.625, "rewards/chosen": -0.10119519382715225, "rewards/margins": 0.11078186333179474, "rewards/rejected": -0.2119770646095276, "step": 3785 }, { "epoch": 0.58550164314711, "grad_norm": 4.245568752288818, "learning_rate": 4.471302554702716e-06, "logits/chosen": 15.897817611694336, "logits/rejected": 5.275870323181152, "logps/chosen": -370.5501708984375, "logps/rejected": -225.25595092773438, "loss": 0.4457, "rewards/accuracies": 0.875, "rewards/chosen": 0.5728532671928406, "rewards/margins": 0.736522912979126, "rewards/rejected": -0.16366958618164062, "step": 3786 }, { "epoch": 0.5856562922868741, "grad_norm": 5.77073335647583, "learning_rate": 4.4710161530530426e-06, "logits/chosen": 12.041760444641113, "logits/rejected": 10.53640079498291, "logps/chosen": -279.2558288574219, "logps/rejected": -321.2295227050781, "loss": 0.5384, "rewards/accuracies": 0.875, "rewards/chosen": -0.10754089057445526, "rewards/margins": 0.43868744373321533, "rewards/rejected": -0.546228289604187, "step": 3787 }, { "epoch": 0.5858109414266384, "grad_norm": 3.7641026973724365, "learning_rate": 4.470729751403368e-06, "logits/chosen": 4.199711799621582, "logits/rejected": 4.029529571533203, "logps/chosen": -171.558349609375, "logps/rejected": -180.0418701171875, "loss": 0.6667, "rewards/accuracies": 0.5, "rewards/chosen": 0.1341724991798401, "rewards/margins": 0.1828451305627823, "rewards/rejected": -0.048672646284103394, "step": 3788 }, { "epoch": 0.5859655905664025, "grad_norm": 6.540365219116211, "learning_rate": 4.470443349753695e-06, "logits/chosen": 4.306085109710693, "logits/rejected": 8.175307273864746, "logps/chosen": -160.38243103027344, "logps/rejected": -202.55935668945312, "loss": 0.813, "rewards/accuracies": 0.375, "rewards/chosen": -0.04820629209280014, "rewards/margins": -0.1791524887084961, "rewards/rejected": 0.13094620406627655, "step": 3789 }, { "epoch": 0.5861202397061667, "grad_norm": 3.2858846187591553, "learning_rate": 4.470156948104022e-06, "logits/chosen": 11.635862350463867, "logits/rejected": 2.032432794570923, "logps/chosen": -256.81658935546875, "logps/rejected": -178.1287078857422, "loss": 0.4585, "rewards/accuracies": 0.75, "rewards/chosen": 0.881072998046875, "rewards/margins": 0.6806513071060181, "rewards/rejected": 0.2004217654466629, "step": 3790 }, { "epoch": 0.5862748888459308, "grad_norm": 6.301393985748291, "learning_rate": 4.469870546454348e-06, "logits/chosen": 9.518982887268066, "logits/rejected": 5.72247314453125, "logps/chosen": -404.8212585449219, "logps/rejected": -392.3601989746094, "loss": 0.5675, "rewards/accuracies": 0.875, "rewards/chosen": 0.5476022958755493, "rewards/margins": 0.3079988360404968, "rewards/rejected": 0.2396034300327301, "step": 3791 }, { "epoch": 0.586429537985695, "grad_norm": 4.827402591705322, "learning_rate": 4.469584144804674e-06, "logits/chosen": 8.841787338256836, "logits/rejected": 5.73464298248291, "logps/chosen": -217.51168823242188, "logps/rejected": -181.70196533203125, "loss": 0.6022, "rewards/accuracies": 0.5, "rewards/chosen": 0.2334670126438141, "rewards/margins": 0.2845512628555298, "rewards/rejected": -0.051084235310554504, "step": 3792 }, { "epoch": 0.5865841871254591, "grad_norm": 6.105525493621826, "learning_rate": 4.469297743155001e-06, "logits/chosen": 7.163834095001221, "logits/rejected": 9.782182693481445, "logps/chosen": -364.913818359375, "logps/rejected": -337.16357421875, "loss": 0.6958, "rewards/accuracies": 0.625, "rewards/chosen": 0.33818262815475464, "rewards/margins": 0.11036074161529541, "rewards/rejected": 0.227821946144104, "step": 3793 }, { "epoch": 0.5867388362652233, "grad_norm": 5.327899932861328, "learning_rate": 4.4690113415053274e-06, "logits/chosen": 15.233545303344727, "logits/rejected": 9.916312217712402, "logps/chosen": -309.1690368652344, "logps/rejected": -214.64369201660156, "loss": 0.6173, "rewards/accuracies": 0.625, "rewards/chosen": 0.49004632234573364, "rewards/margins": 0.28557318449020386, "rewards/rejected": 0.2044731229543686, "step": 3794 }, { "epoch": 0.5868934854049874, "grad_norm": 3.567430257797241, "learning_rate": 4.468724939855654e-06, "logits/chosen": 16.124679565429688, "logits/rejected": 9.374916076660156, "logps/chosen": -317.8848571777344, "logps/rejected": -225.60467529296875, "loss": 0.4853, "rewards/accuracies": 0.875, "rewards/chosen": 0.45219936966896057, "rewards/margins": 0.5519326329231262, "rewards/rejected": -0.09973322600126266, "step": 3795 }, { "epoch": 0.5870481345447516, "grad_norm": 4.013065338134766, "learning_rate": 4.46843853820598e-06, "logits/chosen": 11.811205863952637, "logits/rejected": 5.234990119934082, "logps/chosen": -328.9539489746094, "logps/rejected": -247.79653930664062, "loss": 0.4497, "rewards/accuracies": 0.875, "rewards/chosen": 0.5740846395492554, "rewards/margins": 0.704576849937439, "rewards/rejected": -0.13049226999282837, "step": 3796 }, { "epoch": 0.5872027836845157, "grad_norm": 4.756516933441162, "learning_rate": 4.4681521365563065e-06, "logits/chosen": 11.040724754333496, "logits/rejected": 6.3991498947143555, "logps/chosen": -275.6792297363281, "logps/rejected": -298.7868347167969, "loss": 0.4897, "rewards/accuracies": 0.75, "rewards/chosen": 0.24100293219089508, "rewards/margins": 0.5949441194534302, "rewards/rejected": -0.3539411425590515, "step": 3797 }, { "epoch": 0.5873574328242799, "grad_norm": 5.730184555053711, "learning_rate": 4.467865734906633e-06, "logits/chosen": 6.317093849182129, "logits/rejected": 10.418078422546387, "logps/chosen": -256.61669921875, "logps/rejected": -244.69720458984375, "loss": 0.738, "rewards/accuracies": 0.5, "rewards/chosen": 0.5190300941467285, "rewards/margins": 0.09673692286014557, "rewards/rejected": 0.42229318618774414, "step": 3798 }, { "epoch": 0.587512081964044, "grad_norm": 7.569241523742676, "learning_rate": 4.46757933325696e-06, "logits/chosen": 9.801020622253418, "logits/rejected": 14.157557487487793, "logps/chosen": -366.411865234375, "logps/rejected": -429.7743835449219, "loss": 0.762, "rewards/accuracies": 0.5, "rewards/chosen": -0.0666530579328537, "rewards/margins": -0.024210747331380844, "rewards/rejected": -0.04244232177734375, "step": 3799 }, { "epoch": 0.5876667311038082, "grad_norm": 4.484637260437012, "learning_rate": 4.4672929316072865e-06, "logits/chosen": 4.176270484924316, "logits/rejected": 2.5174739360809326, "logps/chosen": -150.77369689941406, "logps/rejected": -133.33351135253906, "loss": 0.7546, "rewards/accuracies": 0.625, "rewards/chosen": -0.23026953637599945, "rewards/margins": -0.06017979979515076, "rewards/rejected": -0.1700897216796875, "step": 3800 }, { "epoch": 0.5878213802435724, "grad_norm": 7.265353679656982, "learning_rate": 4.467006529957613e-06, "logits/chosen": 4.399864196777344, "logits/rejected": 1.9547100067138672, "logps/chosen": -269.5384521484375, "logps/rejected": -210.87155151367188, "loss": 0.7618, "rewards/accuracies": 0.625, "rewards/chosen": -0.06810353696346283, "rewards/margins": -0.034628890454769135, "rewards/rejected": -0.0334746390581131, "step": 3801 }, { "epoch": 0.5879760293833366, "grad_norm": 5.543549060821533, "learning_rate": 4.466720128307939e-06, "logits/chosen": 3.6068787574768066, "logits/rejected": 11.251604080200195, "logps/chosen": -118.15328979492188, "logps/rejected": -180.07046508789062, "loss": 0.8877, "rewards/accuracies": 0.25, "rewards/chosen": -0.14792117476463318, "rewards/margins": -0.30304527282714844, "rewards/rejected": 0.15512409806251526, "step": 3802 }, { "epoch": 0.5881306785231007, "grad_norm": 9.596186637878418, "learning_rate": 4.466433726658266e-06, "logits/chosen": 7.785845756530762, "logits/rejected": 6.174363613128662, "logps/chosen": -303.5019836425781, "logps/rejected": -303.1763610839844, "loss": 0.6227, "rewards/accuracies": 0.75, "rewards/chosen": 0.3537774085998535, "rewards/margins": 0.32071545720100403, "rewards/rejected": 0.033061981201171875, "step": 3803 }, { "epoch": 0.5882853276628649, "grad_norm": 5.346992015838623, "learning_rate": 4.466147325008592e-06, "logits/chosen": 10.00451946258545, "logits/rejected": 14.103031158447266, "logps/chosen": -239.40061950683594, "logps/rejected": -245.2680206298828, "loss": 0.6558, "rewards/accuracies": 0.375, "rewards/chosen": 0.3363875448703766, "rewards/margins": 0.17690998315811157, "rewards/rejected": 0.15947756171226501, "step": 3804 }, { "epoch": 0.588439976802629, "grad_norm": 6.007246017456055, "learning_rate": 4.465860923358919e-06, "logits/chosen": 5.067300319671631, "logits/rejected": 6.6585211753845215, "logps/chosen": -214.13839721679688, "logps/rejected": -256.30999755859375, "loss": 0.7586, "rewards/accuracies": 0.375, "rewards/chosen": 0.022285493090748787, "rewards/margins": -0.047274086624383926, "rewards/rejected": 0.06955958902835846, "step": 3805 }, { "epoch": 0.5885946259423932, "grad_norm": 4.4729509353637695, "learning_rate": 4.4655745217092455e-06, "logits/chosen": 11.940178871154785, "logits/rejected": 2.373706340789795, "logps/chosen": -277.29638671875, "logps/rejected": -191.27943420410156, "loss": 0.5275, "rewards/accuracies": 0.75, "rewards/chosen": 0.2847093641757965, "rewards/margins": 0.4315881133079529, "rewards/rejected": -0.14687877893447876, "step": 3806 }, { "epoch": 0.5887492750821574, "grad_norm": 4.49417781829834, "learning_rate": 4.465288120059572e-06, "logits/chosen": 5.422382354736328, "logits/rejected": 1.3937599658966064, "logps/chosen": -218.1869354248047, "logps/rejected": -146.658203125, "loss": 0.6267, "rewards/accuracies": 0.75, "rewards/chosen": 0.3639605641365051, "rewards/margins": 0.15747399628162384, "rewards/rejected": 0.2064865380525589, "step": 3807 }, { "epoch": 0.5889039242219215, "grad_norm": 4.653871059417725, "learning_rate": 4.465001718409898e-06, "logits/chosen": 10.03309440612793, "logits/rejected": 9.686051368713379, "logps/chosen": -230.60037231445312, "logps/rejected": -260.16497802734375, "loss": 0.5235, "rewards/accuracies": 0.75, "rewards/chosen": 0.5185046195983887, "rewards/margins": 0.46372655034065247, "rewards/rejected": 0.05477805435657501, "step": 3808 }, { "epoch": 0.5890585733616857, "grad_norm": 7.4475178718566895, "learning_rate": 4.464715316760225e-06, "logits/chosen": 6.612174034118652, "logits/rejected": 6.437985420227051, "logps/chosen": -207.719482421875, "logps/rejected": -233.73641967773438, "loss": 0.9596, "rewards/accuracies": 0.375, "rewards/chosen": -0.030739925801753998, "rewards/margins": -0.34236496686935425, "rewards/rejected": 0.31162506341934204, "step": 3809 }, { "epoch": 0.5892132225014498, "grad_norm": 5.061812400817871, "learning_rate": 4.464428915110551e-06, "logits/chosen": 9.823419570922852, "logits/rejected": 6.179940223693848, "logps/chosen": -300.6733703613281, "logps/rejected": -208.8863525390625, "loss": 0.6371, "rewards/accuracies": 0.75, "rewards/chosen": 0.26062458753585815, "rewards/margins": 0.26005592942237854, "rewards/rejected": 0.0005686730146408081, "step": 3810 }, { "epoch": 0.589367871641214, "grad_norm": 4.545434951782227, "learning_rate": 4.464142513460878e-06, "logits/chosen": 11.751673698425293, "logits/rejected": 9.986916542053223, "logps/chosen": -236.00482177734375, "logps/rejected": -223.0562744140625, "loss": 0.5292, "rewards/accuracies": 0.875, "rewards/chosen": 0.3402070999145508, "rewards/margins": 0.4128991961479187, "rewards/rejected": -0.07269209623336792, "step": 3811 }, { "epoch": 0.5895225207809781, "grad_norm": 6.3638105392456055, "learning_rate": 4.463856111811205e-06, "logits/chosen": 10.048778533935547, "logits/rejected": 1.469570279121399, "logps/chosen": -224.25860595703125, "logps/rejected": -227.16107177734375, "loss": 0.7472, "rewards/accuracies": 0.625, "rewards/chosen": -0.17586174607276917, "rewards/margins": 0.03858184814453125, "rewards/rejected": -0.21444359421730042, "step": 3812 }, { "epoch": 0.5896771699207423, "grad_norm": 5.346762657165527, "learning_rate": 4.463569710161531e-06, "logits/chosen": 8.229119300842285, "logits/rejected": 9.617132186889648, "logps/chosen": -275.83599853515625, "logps/rejected": -285.33917236328125, "loss": 0.6828, "rewards/accuracies": 0.5, "rewards/chosen": 0.498769074678421, "rewards/margins": 0.14329960942268372, "rewards/rejected": 0.3554695248603821, "step": 3813 }, { "epoch": 0.5898318190605065, "grad_norm": 5.457743167877197, "learning_rate": 4.463283308511858e-06, "logits/chosen": 11.086227416992188, "logits/rejected": 8.038616180419922, "logps/chosen": -286.2315673828125, "logps/rejected": -306.5230712890625, "loss": 0.5223, "rewards/accuracies": 0.75, "rewards/chosen": 0.6428254842758179, "rewards/margins": 0.4168838858604431, "rewards/rejected": 0.22594158351421356, "step": 3814 }, { "epoch": 0.5899864682002707, "grad_norm": 6.677029609680176, "learning_rate": 4.462996906862184e-06, "logits/chosen": 8.740394592285156, "logits/rejected": 9.674402236938477, "logps/chosen": -288.3377685546875, "logps/rejected": -251.75526428222656, "loss": 0.7327, "rewards/accuracies": 0.25, "rewards/chosen": 0.08418559283018112, "rewards/margins": 0.055370181798934937, "rewards/rejected": 0.028815407305955887, "step": 3815 }, { "epoch": 0.5901411173400348, "grad_norm": 6.691431999206543, "learning_rate": 4.46271050521251e-06, "logits/chosen": 7.850305557250977, "logits/rejected": 12.168737411499023, "logps/chosen": -242.0833740234375, "logps/rejected": -325.6463928222656, "loss": 0.977, "rewards/accuracies": 0.25, "rewards/chosen": -0.06883291900157928, "rewards/margins": -0.4395500421524048, "rewards/rejected": 0.3707171082496643, "step": 3816 }, { "epoch": 0.590295766479799, "grad_norm": 5.518157005310059, "learning_rate": 4.462424103562837e-06, "logits/chosen": 12.00661849975586, "logits/rejected": 10.266444206237793, "logps/chosen": -229.302978515625, "logps/rejected": -197.03114318847656, "loss": 0.7136, "rewards/accuracies": 0.375, "rewards/chosen": -0.07545642554759979, "rewards/margins": 0.03733096644282341, "rewards/rejected": -0.1127873957157135, "step": 3817 }, { "epoch": 0.5904504156195631, "grad_norm": 6.423452377319336, "learning_rate": 4.462137701913164e-06, "logits/chosen": 9.997434616088867, "logits/rejected": 7.990696430206299, "logps/chosen": -303.96868896484375, "logps/rejected": -210.50595092773438, "loss": 0.7367, "rewards/accuracies": 0.375, "rewards/chosen": 0.4308145046234131, "rewards/margins": 0.1137131005525589, "rewards/rejected": 0.317101389169693, "step": 3818 }, { "epoch": 0.5906050647593273, "grad_norm": 6.38089656829834, "learning_rate": 4.46185130026349e-06, "logits/chosen": 12.150814056396484, "logits/rejected": 5.259729862213135, "logps/chosen": -415.9914855957031, "logps/rejected": -306.4114074707031, "loss": 0.5969, "rewards/accuracies": 0.75, "rewards/chosen": 0.40292906761169434, "rewards/margins": 0.4225561022758484, "rewards/rejected": -0.01962708681821823, "step": 3819 }, { "epoch": 0.5907597138990914, "grad_norm": 7.6464338302612305, "learning_rate": 4.461564898613817e-06, "logits/chosen": 3.529846429824829, "logits/rejected": 6.337932586669922, "logps/chosen": -252.41787719726562, "logps/rejected": -370.6591491699219, "loss": 0.7054, "rewards/accuracies": 0.75, "rewards/chosen": 0.3005480468273163, "rewards/margins": 0.1689886599779129, "rewards/rejected": 0.13155938684940338, "step": 3820 }, { "epoch": 0.5909143630388556, "grad_norm": 3.996021032333374, "learning_rate": 4.461278496964143e-06, "logits/chosen": 10.435494422912598, "logits/rejected": 2.8606481552124023, "logps/chosen": -241.61514282226562, "logps/rejected": -169.81324768066406, "loss": 0.4563, "rewards/accuracies": 0.875, "rewards/chosen": 0.4743422269821167, "rewards/margins": 0.6487396359443665, "rewards/rejected": -0.17439743876457214, "step": 3821 }, { "epoch": 0.5910690121786197, "grad_norm": 9.007889747619629, "learning_rate": 4.460992095314469e-06, "logits/chosen": 7.150142192840576, "logits/rejected": 7.351118087768555, "logps/chosen": -345.4235534667969, "logps/rejected": -299.5750732421875, "loss": 0.6375, "rewards/accuracies": 0.625, "rewards/chosen": 0.19699811935424805, "rewards/margins": 0.23684947192668915, "rewards/rejected": -0.03985138610005379, "step": 3822 }, { "epoch": 0.5912236613183839, "grad_norm": 5.481825828552246, "learning_rate": 4.460705693664796e-06, "logits/chosen": 5.335975170135498, "logits/rejected": 6.277894973754883, "logps/chosen": -231.02935791015625, "logps/rejected": -288.1390380859375, "loss": 0.5218, "rewards/accuracies": 0.75, "rewards/chosen": 0.36069411039352417, "rewards/margins": 0.4748423099517822, "rewards/rejected": -0.11414818465709686, "step": 3823 }, { "epoch": 0.591378310458148, "grad_norm": 3.6080944538116455, "learning_rate": 4.460419292015123e-06, "logits/chosen": 17.56140899658203, "logits/rejected": 12.553186416625977, "logps/chosen": -288.2450256347656, "logps/rejected": -199.40164184570312, "loss": 0.5734, "rewards/accuracies": 0.75, "rewards/chosen": 0.3803260326385498, "rewards/margins": 0.4496496915817261, "rewards/rejected": -0.06932364404201508, "step": 3824 }, { "epoch": 0.5915329595979122, "grad_norm": 7.105933666229248, "learning_rate": 4.460132890365449e-06, "logits/chosen": 7.943840980529785, "logits/rejected": 7.007705211639404, "logps/chosen": -235.16064453125, "logps/rejected": -277.0152893066406, "loss": 0.7742, "rewards/accuracies": 0.625, "rewards/chosen": 0.34620848298072815, "rewards/margins": 0.01781301200389862, "rewards/rejected": 0.3283954858779907, "step": 3825 }, { "epoch": 0.5916876087376765, "grad_norm": 7.853766918182373, "learning_rate": 4.459846488715775e-06, "logits/chosen": 10.202415466308594, "logits/rejected": 6.962385654449463, "logps/chosen": -456.65521240234375, "logps/rejected": -348.7169189453125, "loss": 0.7503, "rewards/accuracies": 0.625, "rewards/chosen": 0.37237823009490967, "rewards/margins": 0.0034618377685546875, "rewards/rejected": 0.368916392326355, "step": 3826 }, { "epoch": 0.5918422578774406, "grad_norm": 7.678735256195068, "learning_rate": 4.459560087066102e-06, "logits/chosen": 6.420163631439209, "logits/rejected": 8.043878555297852, "logps/chosen": -292.2291564941406, "logps/rejected": -365.5219421386719, "loss": 1.0816, "rewards/accuracies": 0.375, "rewards/chosen": 0.12138272821903229, "rewards/margins": -0.44529005885124207, "rewards/rejected": 0.5666728019714355, "step": 3827 }, { "epoch": 0.5919969070172048, "grad_norm": 4.5391411781311035, "learning_rate": 4.4592736854164285e-06, "logits/chosen": 15.583244323730469, "logits/rejected": 8.068771362304688, "logps/chosen": -335.2794494628906, "logps/rejected": -264.66888427734375, "loss": 0.469, "rewards/accuracies": 1.0, "rewards/chosen": 0.3907284736633301, "rewards/margins": 0.5339459180831909, "rewards/rejected": -0.14321747422218323, "step": 3828 }, { "epoch": 0.5921515561569689, "grad_norm": 6.282165050506592, "learning_rate": 4.458987283766755e-06, "logits/chosen": 11.59078311920166, "logits/rejected": 2.815585136413574, "logps/chosen": -378.8709716796875, "logps/rejected": -231.02513122558594, "loss": 0.6197, "rewards/accuracies": 0.5, "rewards/chosen": 0.4551241993904114, "rewards/margins": 0.3134673535823822, "rewards/rejected": 0.14165687561035156, "step": 3829 }, { "epoch": 0.5923062052967331, "grad_norm": 6.958285331726074, "learning_rate": 4.458700882117081e-06, "logits/chosen": 9.933951377868652, "logits/rejected": 6.501849174499512, "logps/chosen": -294.91937255859375, "logps/rejected": -272.4971923828125, "loss": 0.6409, "rewards/accuracies": 0.5, "rewards/chosen": 0.036932624876499176, "rewards/margins": 0.2048449069261551, "rewards/rejected": -0.1679123044013977, "step": 3830 }, { "epoch": 0.5924608544364972, "grad_norm": 6.097238540649414, "learning_rate": 4.4584144804674076e-06, "logits/chosen": 9.449438095092773, "logits/rejected": 10.267274856567383, "logps/chosen": -268.89508056640625, "logps/rejected": -248.31124877929688, "loss": 0.7914, "rewards/accuracies": 0.625, "rewards/chosen": 0.2020013928413391, "rewards/margins": -0.02382427453994751, "rewards/rejected": 0.22582568228244781, "step": 3831 }, { "epoch": 0.5926155035762614, "grad_norm": 4.1570539474487305, "learning_rate": 4.458128078817734e-06, "logits/chosen": 9.957818984985352, "logits/rejected": -4.526979446411133, "logps/chosen": -316.7931823730469, "logps/rejected": -135.7413330078125, "loss": 0.4693, "rewards/accuracies": 0.75, "rewards/chosen": 0.19125862419605255, "rewards/margins": 0.7389428019523621, "rewards/rejected": -0.5476841926574707, "step": 3832 }, { "epoch": 0.5927701527160255, "grad_norm": 14.386735916137695, "learning_rate": 4.457841677168061e-06, "logits/chosen": 11.602571487426758, "logits/rejected": 6.634931564331055, "logps/chosen": -256.8283386230469, "logps/rejected": -205.56515502929688, "loss": 0.8041, "rewards/accuracies": 0.625, "rewards/chosen": 0.10648277401924133, "rewards/margins": -0.10399642586708069, "rewards/rejected": 0.21047921478748322, "step": 3833 }, { "epoch": 0.5929248018557897, "grad_norm": 4.555590629577637, "learning_rate": 4.4575552755183875e-06, "logits/chosen": 9.226140022277832, "logits/rejected": 5.296758651733398, "logps/chosen": -347.5211486816406, "logps/rejected": -244.83934020996094, "loss": 0.7017, "rewards/accuracies": 0.5, "rewards/chosen": 0.6001803874969482, "rewards/margins": 0.2005918025970459, "rewards/rejected": 0.39958858489990234, "step": 3834 }, { "epoch": 0.5930794509955538, "grad_norm": 7.292612075805664, "learning_rate": 4.457268873868713e-06, "logits/chosen": 5.563287258148193, "logits/rejected": 6.9433770179748535, "logps/chosen": -325.90484619140625, "logps/rejected": -262.2660217285156, "loss": 0.6572, "rewards/accuracies": 0.5, "rewards/chosen": 0.5089548230171204, "rewards/margins": 0.22604858875274658, "rewards/rejected": 0.28290626406669617, "step": 3835 }, { "epoch": 0.593234100135318, "grad_norm": 4.737924098968506, "learning_rate": 4.45698247221904e-06, "logits/chosen": 6.556214809417725, "logits/rejected": 8.457965850830078, "logps/chosen": -291.62994384765625, "logps/rejected": -344.5210266113281, "loss": 0.4397, "rewards/accuracies": 0.875, "rewards/chosen": 0.30360427498817444, "rewards/margins": 0.8144075870513916, "rewards/rejected": -0.5108033418655396, "step": 3836 }, { "epoch": 0.5933887492750821, "grad_norm": 6.47429084777832, "learning_rate": 4.456696070569367e-06, "logits/chosen": 14.299200057983398, "logits/rejected": 10.721549034118652, "logps/chosen": -416.1689758300781, "logps/rejected": -359.9927673339844, "loss": 0.6117, "rewards/accuracies": 0.625, "rewards/chosen": 0.7683303952217102, "rewards/margins": 0.29366546869277954, "rewards/rejected": 0.47466492652893066, "step": 3837 }, { "epoch": 0.5935433984148463, "grad_norm": 4.734052658081055, "learning_rate": 4.456409668919693e-06, "logits/chosen": 13.956354141235352, "logits/rejected": 12.158875465393066, "logps/chosen": -390.4969482421875, "logps/rejected": -265.8102722167969, "loss": 0.782, "rewards/accuracies": 0.375, "rewards/chosen": 0.20502310991287231, "rewards/margins": -0.12449908256530762, "rewards/rejected": 0.32952219247817993, "step": 3838 }, { "epoch": 0.5936980475546105, "grad_norm": 3.339404821395874, "learning_rate": 4.45612326727002e-06, "logits/chosen": 10.890353202819824, "logits/rejected": 3.5730478763580322, "logps/chosen": -276.361572265625, "logps/rejected": -129.41502380371094, "loss": 0.5567, "rewards/accuracies": 0.75, "rewards/chosen": 0.14239682257175446, "rewards/margins": 0.3733072876930237, "rewards/rejected": -0.23091045022010803, "step": 3839 }, { "epoch": 0.5938526966943747, "grad_norm": 3.691633701324463, "learning_rate": 4.4558368656203466e-06, "logits/chosen": 14.728607177734375, "logits/rejected": 5.328463554382324, "logps/chosen": -233.8325958251953, "logps/rejected": -135.4236602783203, "loss": 0.5692, "rewards/accuracies": 0.625, "rewards/chosen": 0.1695854365825653, "rewards/margins": 0.3227081894874573, "rewards/rejected": -0.15312275290489197, "step": 3840 }, { "epoch": 0.5940073458341388, "grad_norm": 7.030613422393799, "learning_rate": 4.455550463970672e-06, "logits/chosen": 7.401883602142334, "logits/rejected": 14.989568710327148, "logps/chosen": -201.39202880859375, "logps/rejected": -274.390625, "loss": 0.7675, "rewards/accuracies": 0.5, "rewards/chosen": -0.02464757114648819, "rewards/margins": -0.042287155985832214, "rewards/rejected": 0.01763959228992462, "step": 3841 }, { "epoch": 0.594161994973903, "grad_norm": 4.3906569480896, "learning_rate": 4.455264062320999e-06, "logits/chosen": 11.358149528503418, "logits/rejected": 9.51582145690918, "logps/chosen": -259.2760314941406, "logps/rejected": -230.26043701171875, "loss": 0.6523, "rewards/accuracies": 0.625, "rewards/chosen": 0.3054589331150055, "rewards/margins": 0.15285935997962952, "rewards/rejected": 0.15259957313537598, "step": 3842 }, { "epoch": 0.5943166441136671, "grad_norm": 5.709754943847656, "learning_rate": 4.454977660671326e-06, "logits/chosen": 5.496818542480469, "logits/rejected": 10.349492073059082, "logps/chosen": -282.9944152832031, "logps/rejected": -270.3442077636719, "loss": 0.6976, "rewards/accuracies": 0.75, "rewards/chosen": 0.13073718547821045, "rewards/margins": 0.007633231580257416, "rewards/rejected": 0.12310396134853363, "step": 3843 }, { "epoch": 0.5944712932534313, "grad_norm": 4.264304161071777, "learning_rate": 4.454691259021652e-06, "logits/chosen": 10.246711730957031, "logits/rejected": 4.339803695678711, "logps/chosen": -272.0284118652344, "logps/rejected": -175.11941528320312, "loss": 0.6163, "rewards/accuracies": 0.625, "rewards/chosen": 0.07712607085704803, "rewards/margins": 0.1974329948425293, "rewards/rejected": -0.12030691653490067, "step": 3844 }, { "epoch": 0.5946259423931954, "grad_norm": 18.1430606842041, "learning_rate": 4.454404857371979e-06, "logits/chosen": 9.121091842651367, "logits/rejected": 2.7126824855804443, "logps/chosen": -368.8348388671875, "logps/rejected": -270.771728515625, "loss": 0.7438, "rewards/accuracies": 0.5, "rewards/chosen": 0.30250370502471924, "rewards/margins": 0.003220662474632263, "rewards/rejected": 0.2992830276489258, "step": 3845 }, { "epoch": 0.5947805915329596, "grad_norm": 3.467616558074951, "learning_rate": 4.454118455722306e-06, "logits/chosen": 6.432444095611572, "logits/rejected": 4.621376037597656, "logps/chosen": -161.54161071777344, "logps/rejected": -147.1109161376953, "loss": 0.5771, "rewards/accuracies": 0.625, "rewards/chosen": 0.2612975835800171, "rewards/margins": 0.45776522159576416, "rewards/rejected": -0.19646763801574707, "step": 3846 }, { "epoch": 0.5949352406727237, "grad_norm": 6.248945236206055, "learning_rate": 4.453832054072632e-06, "logits/chosen": 2.3398420810699463, "logits/rejected": 7.798948287963867, "logps/chosen": -254.56573486328125, "logps/rejected": -282.75567626953125, "loss": 0.704, "rewards/accuracies": 0.375, "rewards/chosen": -0.05542530119419098, "rewards/margins": 0.1525730937719345, "rewards/rejected": -0.2079983949661255, "step": 3847 }, { "epoch": 0.5950898898124879, "grad_norm": 4.023340702056885, "learning_rate": 4.453545652422958e-06, "logits/chosen": 9.701852798461914, "logits/rejected": 2.5256001949310303, "logps/chosen": -195.063232421875, "logps/rejected": -162.69512939453125, "loss": 0.5169, "rewards/accuracies": 0.75, "rewards/chosen": 1.2550214529037476, "rewards/margins": 1.0442503690719604, "rewards/rejected": 0.21077106893062592, "step": 3848 }, { "epoch": 0.595244538952252, "grad_norm": 6.15097188949585, "learning_rate": 4.453259250773285e-06, "logits/chosen": 7.361240386962891, "logits/rejected": 5.024587631225586, "logps/chosen": -315.0986633300781, "logps/rejected": -231.8363494873047, "loss": 0.7076, "rewards/accuracies": 0.625, "rewards/chosen": 0.35231637954711914, "rewards/margins": 0.15271194279193878, "rewards/rejected": 0.19960442185401917, "step": 3849 }, { "epoch": 0.5953991880920162, "grad_norm": 4.188508033752441, "learning_rate": 4.452972849123611e-06, "logits/chosen": 7.146430015563965, "logits/rejected": 8.702651023864746, "logps/chosen": -279.047119140625, "logps/rejected": -298.24932861328125, "loss": 0.523, "rewards/accuracies": 0.75, "rewards/chosen": 0.35923728346824646, "rewards/margins": 0.5597008466720581, "rewards/rejected": -0.20046359300613403, "step": 3850 }, { "epoch": 0.5955538372317803, "grad_norm": 4.777875900268555, "learning_rate": 4.452686447473938e-06, "logits/chosen": 14.22570514678955, "logits/rejected": 6.961708068847656, "logps/chosen": -384.77789306640625, "logps/rejected": -272.19500732421875, "loss": 0.5584, "rewards/accuracies": 0.875, "rewards/chosen": 0.7798794507980347, "rewards/margins": 0.37557628750801086, "rewards/rejected": 0.4043031930923462, "step": 3851 }, { "epoch": 0.5957084863715446, "grad_norm": 21.18962287902832, "learning_rate": 4.452400045824265e-06, "logits/chosen": 8.901566505432129, "logits/rejected": 11.886366844177246, "logps/chosen": -213.25863647460938, "logps/rejected": -315.91119384765625, "loss": 0.7951, "rewards/accuracies": 0.5, "rewards/chosen": 0.2784026265144348, "rewards/margins": -0.04361967742443085, "rewards/rejected": 0.32202231884002686, "step": 3852 }, { "epoch": 0.5958631355113088, "grad_norm": 4.977728843688965, "learning_rate": 4.452113644174591e-06, "logits/chosen": 13.325644493103027, "logits/rejected": 10.87017822265625, "logps/chosen": -232.05148315429688, "logps/rejected": -221.09571838378906, "loss": 0.6102, "rewards/accuracies": 0.75, "rewards/chosen": 0.15060526132583618, "rewards/margins": 0.289823442697525, "rewards/rejected": -0.13921819627285004, "step": 3853 }, { "epoch": 0.5960177846510729, "grad_norm": 3.847041130065918, "learning_rate": 4.451827242524917e-06, "logits/chosen": 13.751547813415527, "logits/rejected": 5.848457336425781, "logps/chosen": -207.70892333984375, "logps/rejected": -119.81642150878906, "loss": 0.6032, "rewards/accuracies": 0.625, "rewards/chosen": 0.16600199043750763, "rewards/margins": 0.3585396409034729, "rewards/rejected": -0.19253763556480408, "step": 3854 }, { "epoch": 0.5961724337908371, "grad_norm": 8.761685371398926, "learning_rate": 4.451540840875244e-06, "logits/chosen": 12.882061004638672, "logits/rejected": 3.717501163482666, "logps/chosen": -441.8450927734375, "logps/rejected": -333.88946533203125, "loss": 0.7972, "rewards/accuracies": 0.5, "rewards/chosen": -0.02096595987677574, "rewards/margins": 0.3197178542613983, "rewards/rejected": -0.34068381786346436, "step": 3855 }, { "epoch": 0.5963270829306012, "grad_norm": 7.407461166381836, "learning_rate": 4.4512544392255704e-06, "logits/chosen": 8.958114624023438, "logits/rejected": 10.856538772583008, "logps/chosen": -282.1307373046875, "logps/rejected": -315.6894226074219, "loss": 0.8233, "rewards/accuracies": 0.375, "rewards/chosen": 0.33628833293914795, "rewards/margins": -0.17250213027000427, "rewards/rejected": 0.5087904334068298, "step": 3856 }, { "epoch": 0.5964817320703654, "grad_norm": 6.256063461303711, "learning_rate": 4.450968037575897e-06, "logits/chosen": 8.043414115905762, "logits/rejected": 2.5432448387145996, "logps/chosen": -280.5829162597656, "logps/rejected": -207.14633178710938, "loss": 0.7455, "rewards/accuracies": 0.75, "rewards/chosen": -0.015946581959724426, "rewards/margins": 0.127707839012146, "rewards/rejected": -0.14365443587303162, "step": 3857 }, { "epoch": 0.5966363812101295, "grad_norm": 5.679924011230469, "learning_rate": 4.450681635926224e-06, "logits/chosen": 9.277485847473145, "logits/rejected": 9.283084869384766, "logps/chosen": -209.117919921875, "logps/rejected": -267.44671630859375, "loss": 0.681, "rewards/accuracies": 0.625, "rewards/chosen": 0.3057743012905121, "rewards/margins": 0.16223332285881042, "rewards/rejected": 0.14354099333286285, "step": 3858 }, { "epoch": 0.5967910303498937, "grad_norm": 5.220340251922607, "learning_rate": 4.4503952342765495e-06, "logits/chosen": 7.739818572998047, "logits/rejected": 4.137601375579834, "logps/chosen": -256.84930419921875, "logps/rejected": -240.44061279296875, "loss": 0.674, "rewards/accuracies": 0.625, "rewards/chosen": 0.3029986619949341, "rewards/margins": 0.1641806960105896, "rewards/rejected": 0.13881796598434448, "step": 3859 }, { "epoch": 0.5969456794896578, "grad_norm": 4.8695268630981445, "learning_rate": 4.450108832626876e-06, "logits/chosen": 10.396373748779297, "logits/rejected": 7.189081192016602, "logps/chosen": -218.73585510253906, "logps/rejected": -150.6077423095703, "loss": 0.7079, "rewards/accuracies": 0.5, "rewards/chosen": 0.3363502323627472, "rewards/margins": 0.03585357964038849, "rewards/rejected": 0.3004966974258423, "step": 3860 }, { "epoch": 0.597100328629422, "grad_norm": 5.034574031829834, "learning_rate": 4.449822430977203e-06, "logits/chosen": 16.268657684326172, "logits/rejected": 10.540192604064941, "logps/chosen": -232.69375610351562, "logps/rejected": -195.15472412109375, "loss": 0.6918, "rewards/accuracies": 0.25, "rewards/chosen": 0.14549486339092255, "rewards/margins": 0.08534926176071167, "rewards/rejected": 0.06014557182788849, "step": 3861 }, { "epoch": 0.5972549777691861, "grad_norm": 4.311427593231201, "learning_rate": 4.4495360293275295e-06, "logits/chosen": 7.273316383361816, "logits/rejected": 3.2542858123779297, "logps/chosen": -180.52978515625, "logps/rejected": -131.00286865234375, "loss": 0.689, "rewards/accuracies": 0.375, "rewards/chosen": -0.07191963493824005, "rewards/margins": 0.08312667906284332, "rewards/rejected": -0.15504629909992218, "step": 3862 }, { "epoch": 0.5974096269089503, "grad_norm": 7.141695976257324, "learning_rate": 4.449249627677856e-06, "logits/chosen": 14.52280330657959, "logits/rejected": 11.800079345703125, "logps/chosen": -304.2814025878906, "logps/rejected": -331.8609619140625, "loss": 0.563, "rewards/accuracies": 0.625, "rewards/chosen": 0.5603678226470947, "rewards/margins": 0.45084840059280396, "rewards/rejected": 0.10951948165893555, "step": 3863 }, { "epoch": 0.5975642760487144, "grad_norm": 5.070034503936768, "learning_rate": 4.448963226028182e-06, "logits/chosen": 8.325569152832031, "logits/rejected": 10.92115306854248, "logps/chosen": -250.042724609375, "logps/rejected": -270.6550598144531, "loss": 0.586, "rewards/accuracies": 0.75, "rewards/chosen": 0.05914616584777832, "rewards/margins": 0.245741605758667, "rewards/rejected": -0.18659542500972748, "step": 3864 }, { "epoch": 0.5977189251884787, "grad_norm": 9.244002342224121, "learning_rate": 4.448676824378509e-06, "logits/chosen": 8.563394546508789, "logits/rejected": 8.198019981384277, "logps/chosen": -293.6990966796875, "logps/rejected": -294.89068603515625, "loss": 0.8749, "rewards/accuracies": 0.25, "rewards/chosen": -0.05756746232509613, "rewards/margins": -0.26582416892051697, "rewards/rejected": 0.20825672149658203, "step": 3865 }, { "epoch": 0.5978735743282428, "grad_norm": 5.550826549530029, "learning_rate": 4.448390422728835e-06, "logits/chosen": 9.64195728302002, "logits/rejected": 5.02839469909668, "logps/chosen": -213.12632751464844, "logps/rejected": -170.35665893554688, "loss": 0.603, "rewards/accuracies": 0.625, "rewards/chosen": 0.28932976722717285, "rewards/margins": 0.2146056592464447, "rewards/rejected": 0.07472410798072815, "step": 3866 }, { "epoch": 0.598028223468007, "grad_norm": 6.245521068572998, "learning_rate": 4.448104021079162e-06, "logits/chosen": 12.155706405639648, "logits/rejected": 11.467378616333008, "logps/chosen": -299.26348876953125, "logps/rejected": -261.70550537109375, "loss": 0.6703, "rewards/accuracies": 0.625, "rewards/chosen": 0.017500489950180054, "rewards/margins": 0.172099307179451, "rewards/rejected": -0.15459880232810974, "step": 3867 }, { "epoch": 0.5981828726077711, "grad_norm": 6.4365949630737305, "learning_rate": 4.447817619429488e-06, "logits/chosen": 14.18918228149414, "logits/rejected": 8.934442520141602, "logps/chosen": -401.605712890625, "logps/rejected": -339.048828125, "loss": 0.7455, "rewards/accuracies": 0.5, "rewards/chosen": 0.2455999255180359, "rewards/margins": 0.042389582842588425, "rewards/rejected": 0.20321033895015717, "step": 3868 }, { "epoch": 0.5983375217475353, "grad_norm": 7.192135334014893, "learning_rate": 4.447531217779814e-06, "logits/chosen": 7.545492649078369, "logits/rejected": 3.3931689262390137, "logps/chosen": -296.37615966796875, "logps/rejected": -181.6265106201172, "loss": 0.7363, "rewards/accuracies": 0.625, "rewards/chosen": 0.19697798788547516, "rewards/margins": 0.019417472183704376, "rewards/rejected": 0.17756050825119019, "step": 3869 }, { "epoch": 0.5984921708872994, "grad_norm": 6.12122917175293, "learning_rate": 4.447244816130141e-06, "logits/chosen": 10.559307098388672, "logits/rejected": 11.585685729980469, "logps/chosen": -291.38818359375, "logps/rejected": -283.39215087890625, "loss": 0.7217, "rewards/accuracies": 0.625, "rewards/chosen": 0.31820911169052124, "rewards/margins": 0.023026492446660995, "rewards/rejected": 0.29518258571624756, "step": 3870 }, { "epoch": 0.5986468200270636, "grad_norm": 6.119382858276367, "learning_rate": 4.446958414480468e-06, "logits/chosen": 14.64987850189209, "logits/rejected": 12.435090065002441, "logps/chosen": -290.3376770019531, "logps/rejected": -269.3126220703125, "loss": 0.7163, "rewards/accuracies": 0.5, "rewards/chosen": 0.2654527723789215, "rewards/margins": 0.09620637446641922, "rewards/rejected": 0.1692463904619217, "step": 3871 }, { "epoch": 0.5988014691668277, "grad_norm": 3.763533115386963, "learning_rate": 4.446672012830794e-06, "logits/chosen": 5.599374294281006, "logits/rejected": 3.2607431411743164, "logps/chosen": -261.7787170410156, "logps/rejected": -208.43399047851562, "loss": 0.604, "rewards/accuracies": 0.375, "rewards/chosen": 0.07701735198497772, "rewards/margins": 0.3463677167892456, "rewards/rejected": -0.2693503201007843, "step": 3872 }, { "epoch": 0.5989561183065919, "grad_norm": 4.655757427215576, "learning_rate": 4.446385611181121e-06, "logits/chosen": 9.932258605957031, "logits/rejected": 8.173981666564941, "logps/chosen": -266.74591064453125, "logps/rejected": -242.66490173339844, "loss": 0.6097, "rewards/accuracies": 0.875, "rewards/chosen": 0.3883005380630493, "rewards/margins": 0.21472401916980743, "rewards/rejected": 0.17357653379440308, "step": 3873 }, { "epoch": 0.599110767446356, "grad_norm": 4.346072196960449, "learning_rate": 4.446099209531447e-06, "logits/chosen": 5.958211898803711, "logits/rejected": 5.569001197814941, "logps/chosen": -175.85031127929688, "logps/rejected": -187.32028198242188, "loss": 0.8729, "rewards/accuracies": 0.375, "rewards/chosen": 0.055561259388923645, "rewards/margins": -0.1847131848335266, "rewards/rejected": 0.24027448892593384, "step": 3874 }, { "epoch": 0.5992654165861202, "grad_norm": 4.736356258392334, "learning_rate": 4.445812807881773e-06, "logits/chosen": 6.78809118270874, "logits/rejected": 13.442954063415527, "logps/chosen": -134.38375854492188, "logps/rejected": -239.40689086914062, "loss": 0.6526, "rewards/accuracies": 0.5, "rewards/chosen": 0.24932880699634552, "rewards/margins": 0.1478361338376999, "rewards/rejected": 0.10149267315864563, "step": 3875 }, { "epoch": 0.5994200657258844, "grad_norm": 6.878830432891846, "learning_rate": 4.4455264062321e-06, "logits/chosen": 8.970477104187012, "logits/rejected": 2.57135009765625, "logps/chosen": -255.8218994140625, "logps/rejected": -268.3470458984375, "loss": 0.6149, "rewards/accuracies": 0.625, "rewards/chosen": 0.13815096020698547, "rewards/margins": 0.4011203646659851, "rewards/rejected": -0.26296940445899963, "step": 3876 }, { "epoch": 0.5995747148656485, "grad_norm": 6.638808727264404, "learning_rate": 4.445240004582427e-06, "logits/chosen": 10.611102104187012, "logits/rejected": 7.543502330780029, "logps/chosen": -284.49700927734375, "logps/rejected": -269.28411865234375, "loss": 0.7401, "rewards/accuracies": 0.5, "rewards/chosen": 0.20888566970825195, "rewards/margins": -0.07874555140733719, "rewards/rejected": 0.28763121366500854, "step": 3877 }, { "epoch": 0.5997293640054128, "grad_norm": 7.689025402069092, "learning_rate": 4.444953602932753e-06, "logits/chosen": 10.126932144165039, "logits/rejected": 10.684929847717285, "logps/chosen": -434.2848815917969, "logps/rejected": -358.90533447265625, "loss": 0.7248, "rewards/accuracies": 0.625, "rewards/chosen": 0.626960039138794, "rewards/margins": 0.07752152532339096, "rewards/rejected": 0.5494384765625, "step": 3878 }, { "epoch": 0.5998840131451769, "grad_norm": 3.148813486099243, "learning_rate": 4.44466720128308e-06, "logits/chosen": 9.07924747467041, "logits/rejected": -1.7599058151245117, "logps/chosen": -269.93597412109375, "logps/rejected": -132.77688598632812, "loss": 0.4736, "rewards/accuracies": 0.875, "rewards/chosen": 0.7984883189201355, "rewards/margins": 0.8481366038322449, "rewards/rejected": -0.04964829981327057, "step": 3879 }, { "epoch": 0.6000386622849411, "grad_norm": 5.296482086181641, "learning_rate": 4.444380799633407e-06, "logits/chosen": 7.485034942626953, "logits/rejected": 6.309716701507568, "logps/chosen": -293.6840515136719, "logps/rejected": -215.73464965820312, "loss": 0.7029, "rewards/accuracies": 0.75, "rewards/chosen": 0.3642003536224365, "rewards/margins": 0.11009103059768677, "rewards/rejected": 0.25410932302474976, "step": 3880 }, { "epoch": 0.6001933114247052, "grad_norm": 4.511099338531494, "learning_rate": 4.4440943979837325e-06, "logits/chosen": 9.776927947998047, "logits/rejected": 6.702047348022461, "logps/chosen": -392.608154296875, "logps/rejected": -285.5046691894531, "loss": 0.5595, "rewards/accuracies": 0.75, "rewards/chosen": 0.37085992097854614, "rewards/margins": 0.3414820432662964, "rewards/rejected": 0.029377862811088562, "step": 3881 }, { "epoch": 0.6003479605644694, "grad_norm": 5.933185577392578, "learning_rate": 4.443807996334059e-06, "logits/chosen": 8.37652587890625, "logits/rejected": 11.007089614868164, "logps/chosen": -172.8363037109375, "logps/rejected": -191.82162475585938, "loss": 0.8593, "rewards/accuracies": 0.375, "rewards/chosen": 0.0849263072013855, "rewards/margins": -0.2628410756587982, "rewards/rejected": 0.34776735305786133, "step": 3882 }, { "epoch": 0.6005026097042335, "grad_norm": 5.383392810821533, "learning_rate": 4.443521594684386e-06, "logits/chosen": 10.5837984085083, "logits/rejected": 9.564327239990234, "logps/chosen": -246.11691284179688, "logps/rejected": -241.84043884277344, "loss": 0.642, "rewards/accuracies": 0.625, "rewards/chosen": 0.33095866441726685, "rewards/margins": 0.20551368594169617, "rewards/rejected": 0.12544497847557068, "step": 3883 }, { "epoch": 0.6006572588439977, "grad_norm": 5.434359073638916, "learning_rate": 4.443235193034712e-06, "logits/chosen": 14.77172565460205, "logits/rejected": 8.474355697631836, "logps/chosen": -376.6193542480469, "logps/rejected": -229.8174285888672, "loss": 0.5727, "rewards/accuracies": 0.625, "rewards/chosen": 0.6839893460273743, "rewards/margins": 0.3474277853965759, "rewards/rejected": 0.3365615904331207, "step": 3884 }, { "epoch": 0.6008119079837618, "grad_norm": 6.1834893226623535, "learning_rate": 4.442948791385039e-06, "logits/chosen": 7.505656719207764, "logits/rejected": 9.484930992126465, "logps/chosen": -347.8443603515625, "logps/rejected": -343.0873718261719, "loss": 0.6072, "rewards/accuracies": 0.625, "rewards/chosen": 0.656563401222229, "rewards/margins": 0.24430981278419495, "rewards/rejected": 0.41225361824035645, "step": 3885 }, { "epoch": 0.600966557123526, "grad_norm": 6.101221561431885, "learning_rate": 4.442662389735366e-06, "logits/chosen": 12.881450653076172, "logits/rejected": 5.2534708976745605, "logps/chosen": -412.58831787109375, "logps/rejected": -260.46978759765625, "loss": 0.5715, "rewards/accuracies": 0.5, "rewards/chosen": 0.4104612171649933, "rewards/margins": 0.38385143876075745, "rewards/rejected": 0.026609785854816437, "step": 3886 }, { "epoch": 0.6011212062632901, "grad_norm": 5.960506439208984, "learning_rate": 4.4423759880856915e-06, "logits/chosen": 10.128340721130371, "logits/rejected": 11.28730297088623, "logps/chosen": -248.02017211914062, "logps/rejected": -274.02532958984375, "loss": 0.8383, "rewards/accuracies": 0.25, "rewards/chosen": 0.5649007558822632, "rewards/margins": -0.21767978370189667, "rewards/rejected": 0.7825806140899658, "step": 3887 }, { "epoch": 0.6012758554030543, "grad_norm": 16.900802612304688, "learning_rate": 4.442089586436018e-06, "logits/chosen": 11.56409740447998, "logits/rejected": 7.57831335067749, "logps/chosen": -393.04583740234375, "logps/rejected": -164.96380615234375, "loss": 0.7871, "rewards/accuracies": 0.75, "rewards/chosen": -0.11604196578264236, "rewards/margins": -0.03710642457008362, "rewards/rejected": -0.07893553376197815, "step": 3888 }, { "epoch": 0.6014305045428184, "grad_norm": 5.817917346954346, "learning_rate": 4.441803184786345e-06, "logits/chosen": 7.873283863067627, "logits/rejected": 10.702044486999512, "logps/chosen": -177.4079132080078, "logps/rejected": -238.53720092773438, "loss": 0.8187, "rewards/accuracies": 0.25, "rewards/chosen": -0.07048788666725159, "rewards/margins": -0.18652144074440002, "rewards/rejected": 0.11603355407714844, "step": 3889 }, { "epoch": 0.6015851536825826, "grad_norm": 7.263823509216309, "learning_rate": 4.4415167831366715e-06, "logits/chosen": 13.044964790344238, "logits/rejected": 10.323070526123047, "logps/chosen": -305.1011962890625, "logps/rejected": -262.8005065917969, "loss": 0.9423, "rewards/accuracies": 0.375, "rewards/chosen": 0.18749113380908966, "rewards/margins": -0.21726539731025696, "rewards/rejected": 0.4047565162181854, "step": 3890 }, { "epoch": 0.6017398028223468, "grad_norm": 3.867288827896118, "learning_rate": 4.441230381486998e-06, "logits/chosen": 9.105090141296387, "logits/rejected": -2.6467838287353516, "logps/chosen": -274.07318115234375, "logps/rejected": -142.89996337890625, "loss": 0.5237, "rewards/accuracies": 0.625, "rewards/chosen": 0.3479274809360504, "rewards/margins": 0.5475428104400635, "rewards/rejected": -0.19961532950401306, "step": 3891 }, { "epoch": 0.601894451962111, "grad_norm": 5.0648322105407715, "learning_rate": 4.440943979837325e-06, "logits/chosen": 14.801642417907715, "logits/rejected": 6.9151105880737305, "logps/chosen": -290.8497009277344, "logps/rejected": -180.7908172607422, "loss": 0.661, "rewards/accuracies": 0.375, "rewards/chosen": 0.2503805160522461, "rewards/margins": 0.15373292565345764, "rewards/rejected": 0.09664759039878845, "step": 3892 }, { "epoch": 0.6020491011018752, "grad_norm": 12.034680366516113, "learning_rate": 4.440657578187651e-06, "logits/chosen": 12.5345458984375, "logits/rejected": 9.806342124938965, "logps/chosen": -524.4227294921875, "logps/rejected": -496.8655700683594, "loss": 0.6278, "rewards/accuracies": 0.75, "rewards/chosen": 0.5607287883758545, "rewards/margins": 0.21646201610565186, "rewards/rejected": 0.34426674246788025, "step": 3893 }, { "epoch": 0.6022037502416393, "grad_norm": 6.952269554138184, "learning_rate": 4.440371176537977e-06, "logits/chosen": 5.1526103019714355, "logits/rejected": 6.769500732421875, "logps/chosen": -247.334228515625, "logps/rejected": -285.9931945800781, "loss": 0.7882, "rewards/accuracies": 0.375, "rewards/chosen": -0.29980432987213135, "rewards/margins": -0.08810777962207794, "rewards/rejected": -0.21169652044773102, "step": 3894 }, { "epoch": 0.6023583993814035, "grad_norm": 5.4862446784973145, "learning_rate": 4.440084774888304e-06, "logits/chosen": 8.05799388885498, "logits/rejected": 7.2748870849609375, "logps/chosen": -240.3828125, "logps/rejected": -192.3104248046875, "loss": 0.7319, "rewards/accuracies": 0.625, "rewards/chosen": 0.3096157908439636, "rewards/margins": 0.17935967445373535, "rewards/rejected": 0.13025611639022827, "step": 3895 }, { "epoch": 0.6025130485211676, "grad_norm": 7.092531204223633, "learning_rate": 4.4397983732386305e-06, "logits/chosen": 11.093854904174805, "logits/rejected": 6.925727367401123, "logps/chosen": -302.0999755859375, "logps/rejected": -260.83953857421875, "loss": 0.8798, "rewards/accuracies": 0.25, "rewards/chosen": 0.5152764916419983, "rewards/margins": -0.2856329679489136, "rewards/rejected": 0.8009095191955566, "step": 3896 }, { "epoch": 0.6026676976609318, "grad_norm": 34.546836853027344, "learning_rate": 4.439511971588956e-06, "logits/chosen": 9.177679061889648, "logits/rejected": 3.0738775730133057, "logps/chosen": -396.36578369140625, "logps/rejected": -280.2530212402344, "loss": 0.7116, "rewards/accuracies": 0.75, "rewards/chosen": 0.5396541357040405, "rewards/margins": 0.027874130755662918, "rewards/rejected": 0.5117799639701843, "step": 3897 }, { "epoch": 0.6028223468006959, "grad_norm": 5.122930526733398, "learning_rate": 4.439225569939283e-06, "logits/chosen": 4.455689907073975, "logits/rejected": 8.010992050170898, "logps/chosen": -286.8979797363281, "logps/rejected": -381.677734375, "loss": 0.643, "rewards/accuracies": 0.375, "rewards/chosen": 0.5623769760131836, "rewards/margins": 0.14786362648010254, "rewards/rejected": 0.41451334953308105, "step": 3898 }, { "epoch": 0.6029769959404601, "grad_norm": 6.4154372215271, "learning_rate": 4.43893916828961e-06, "logits/chosen": 13.473001480102539, "logits/rejected": 8.6923828125, "logps/chosen": -345.00665283203125, "logps/rejected": -279.5361022949219, "loss": 0.7225, "rewards/accuracies": 0.5, "rewards/chosen": 0.4932249188423157, "rewards/margins": 0.09298592060804367, "rewards/rejected": 0.4002390205860138, "step": 3899 }, { "epoch": 0.6031316450802242, "grad_norm": 4.310980796813965, "learning_rate": 4.438652766639936e-06, "logits/chosen": 9.137060165405273, "logits/rejected": 6.654911518096924, "logps/chosen": -194.30029296875, "logps/rejected": -157.14157104492188, "loss": 0.6856, "rewards/accuracies": 0.5, "rewards/chosen": 0.23456516861915588, "rewards/margins": 0.12567058205604553, "rewards/rejected": 0.10889458656311035, "step": 3900 }, { "epoch": 0.6032862942199884, "grad_norm": 5.997705459594727, "learning_rate": 4.438366364990263e-06, "logits/chosen": 8.250368118286133, "logits/rejected": 7.210304260253906, "logps/chosen": -305.0679931640625, "logps/rejected": -318.2009582519531, "loss": 0.6903, "rewards/accuracies": 0.5, "rewards/chosen": 0.5197056531906128, "rewards/margins": 0.07911047339439392, "rewards/rejected": 0.4405951499938965, "step": 3901 }, { "epoch": 0.6034409433597525, "grad_norm": 6.1376118659973145, "learning_rate": 4.438079963340589e-06, "logits/chosen": 16.63678550720215, "logits/rejected": 9.755081176757812, "logps/chosen": -324.0816345214844, "logps/rejected": -280.4528503417969, "loss": 0.6879, "rewards/accuracies": 0.5, "rewards/chosen": 0.5481510162353516, "rewards/margins": 0.07492218911647797, "rewards/rejected": 0.4732288420200348, "step": 3902 }, { "epoch": 0.6035955924995168, "grad_norm": 6.018391132354736, "learning_rate": 4.437793561690915e-06, "logits/chosen": 9.6676025390625, "logits/rejected": 14.256958961486816, "logps/chosen": -245.39459228515625, "logps/rejected": -287.4531555175781, "loss": 0.7739, "rewards/accuracies": 0.5, "rewards/chosen": 0.5975342988967896, "rewards/margins": -0.1116015687584877, "rewards/rejected": 0.709135890007019, "step": 3903 }, { "epoch": 0.6037502416392809, "grad_norm": 4.122574806213379, "learning_rate": 4.437507160041242e-06, "logits/chosen": 5.369261264801025, "logits/rejected": 4.408687114715576, "logps/chosen": -189.4131622314453, "logps/rejected": -159.6771240234375, "loss": 0.683, "rewards/accuracies": 0.625, "rewards/chosen": 0.21424371004104614, "rewards/margins": 0.1198115199804306, "rewards/rejected": 0.09443218261003494, "step": 3904 }, { "epoch": 0.6039048907790451, "grad_norm": 8.841023445129395, "learning_rate": 4.437220758391569e-06, "logits/chosen": 15.080881118774414, "logits/rejected": 8.804738998413086, "logps/chosen": -391.0445556640625, "logps/rejected": -280.6028137207031, "loss": 0.5485, "rewards/accuracies": 0.5, "rewards/chosen": 0.4271688461303711, "rewards/margins": 0.5063155293464661, "rewards/rejected": -0.07914666831493378, "step": 3905 }, { "epoch": 0.6040595399188092, "grad_norm": 19.14792251586914, "learning_rate": 4.436934356741895e-06, "logits/chosen": 14.502641677856445, "logits/rejected": 9.034814834594727, "logps/chosen": -298.1357421875, "logps/rejected": -203.72283935546875, "loss": 0.4943, "rewards/accuracies": 0.875, "rewards/chosen": 0.6897628307342529, "rewards/margins": 0.606182336807251, "rewards/rejected": 0.08358044177293777, "step": 3906 }, { "epoch": 0.6042141890585734, "grad_norm": 6.1135478019714355, "learning_rate": 4.436647955092221e-06, "logits/chosen": 12.373702049255371, "logits/rejected": 8.945653915405273, "logps/chosen": -284.17144775390625, "logps/rejected": -290.6643981933594, "loss": 0.6898, "rewards/accuracies": 0.5, "rewards/chosen": 0.5265203714370728, "rewards/margins": 0.24908261001110077, "rewards/rejected": 0.2774377763271332, "step": 3907 }, { "epoch": 0.6043688381983375, "grad_norm": 5.55449914932251, "learning_rate": 4.436361553442548e-06, "logits/chosen": 12.21667194366455, "logits/rejected": 4.47822904586792, "logps/chosen": -360.3435974121094, "logps/rejected": -233.45675659179688, "loss": 0.603, "rewards/accuracies": 0.625, "rewards/chosen": 0.40338361263275146, "rewards/margins": 0.3075413405895233, "rewards/rejected": 0.09584227204322815, "step": 3908 }, { "epoch": 0.6045234873381017, "grad_norm": 3.4196724891662598, "learning_rate": 4.4360751517928744e-06, "logits/chosen": 7.5966033935546875, "logits/rejected": 6.637414932250977, "logps/chosen": -205.63720703125, "logps/rejected": -149.7642059326172, "loss": 0.6023, "rewards/accuracies": 0.5, "rewards/chosen": 0.5818638205528259, "rewards/margins": 0.28300806879997253, "rewards/rejected": 0.2988557815551758, "step": 3909 }, { "epoch": 0.6046781364778658, "grad_norm": 5.212469100952148, "learning_rate": 4.435788750143201e-06, "logits/chosen": 6.841164588928223, "logits/rejected": 9.152700424194336, "logps/chosen": -285.6550598144531, "logps/rejected": -394.9906005859375, "loss": 0.6463, "rewards/accuracies": 0.625, "rewards/chosen": 0.6218509078025818, "rewards/margins": 0.28151923418045044, "rewards/rejected": 0.34033164381980896, "step": 3910 }, { "epoch": 0.60483278561763, "grad_norm": 5.6171488761901855, "learning_rate": 4.435502348493528e-06, "logits/chosen": 4.333796501159668, "logits/rejected": 9.224414825439453, "logps/chosen": -244.13455200195312, "logps/rejected": -360.42962646484375, "loss": 0.6907, "rewards/accuracies": 0.5, "rewards/chosen": 0.5087676048278809, "rewards/margins": 0.27077844738960266, "rewards/rejected": 0.23798918724060059, "step": 3911 }, { "epoch": 0.6049874347573941, "grad_norm": 7.398587226867676, "learning_rate": 4.435215946843854e-06, "logits/chosen": 8.538468360900879, "logits/rejected": 7.4908246994018555, "logps/chosen": -414.14251708984375, "logps/rejected": -455.76507568359375, "loss": 0.6647, "rewards/accuracies": 0.5, "rewards/chosen": 0.648833155632019, "rewards/margins": 0.1741776019334793, "rewards/rejected": 0.4746555984020233, "step": 3912 }, { "epoch": 0.6051420838971583, "grad_norm": 4.901476860046387, "learning_rate": 4.434929545194181e-06, "logits/chosen": 9.439541816711426, "logits/rejected": 13.413737297058105, "logps/chosen": -259.1940612792969, "logps/rejected": -233.14608764648438, "loss": 0.7191, "rewards/accuracies": 0.375, "rewards/chosen": 0.40240737795829773, "rewards/margins": 0.05190809816122055, "rewards/rejected": 0.3504992723464966, "step": 3913 }, { "epoch": 0.6052967330369224, "grad_norm": 5.9158549308776855, "learning_rate": 4.434643143544507e-06, "logits/chosen": 13.142208099365234, "logits/rejected": 6.812465667724609, "logps/chosen": -324.46209716796875, "logps/rejected": -227.11952209472656, "loss": 0.6264, "rewards/accuracies": 0.625, "rewards/chosen": 0.6139547228813171, "rewards/margins": 0.2808613181114197, "rewards/rejected": 0.33309343457221985, "step": 3914 }, { "epoch": 0.6054513821766866, "grad_norm": 4.910852432250977, "learning_rate": 4.4343567418948335e-06, "logits/chosen": 11.10114574432373, "logits/rejected": 6.515744209289551, "logps/chosen": -253.697021484375, "logps/rejected": -200.24368286132812, "loss": 0.4024, "rewards/accuracies": 0.875, "rewards/chosen": 0.6055654287338257, "rewards/margins": 1.026044249534607, "rewards/rejected": -0.42047882080078125, "step": 3915 }, { "epoch": 0.6056060313164509, "grad_norm": 7.040600299835205, "learning_rate": 4.43407034024516e-06, "logits/chosen": 8.591606140136719, "logits/rejected": 10.577064514160156, "logps/chosen": -318.95697021484375, "logps/rejected": -353.3975830078125, "loss": 0.6405, "rewards/accuracies": 0.75, "rewards/chosen": 0.4841551184654236, "rewards/margins": 0.2094191014766693, "rewards/rejected": 0.2747359573841095, "step": 3916 }, { "epoch": 0.605760680456215, "grad_norm": 4.258431911468506, "learning_rate": 4.433783938595487e-06, "logits/chosen": 5.574944496154785, "logits/rejected": 5.272205352783203, "logps/chosen": -185.17529296875, "logps/rejected": -163.1497039794922, "loss": 0.6985, "rewards/accuracies": 0.375, "rewards/chosen": 0.21793369948863983, "rewards/margins": 0.038532041013240814, "rewards/rejected": 0.17940166592597961, "step": 3917 }, { "epoch": 0.6059153295959792, "grad_norm": 6.661002159118652, "learning_rate": 4.4334975369458135e-06, "logits/chosen": 7.947875499725342, "logits/rejected": -1.1607508659362793, "logps/chosen": -285.3018798828125, "logps/rejected": -155.67453002929688, "loss": 0.672, "rewards/accuracies": 0.5, "rewards/chosen": 0.025693196803331375, "rewards/margins": 0.3243691921234131, "rewards/rejected": -0.2986759841442108, "step": 3918 }, { "epoch": 0.6060699787357433, "grad_norm": 5.284203052520752, "learning_rate": 4.43321113529614e-06, "logits/chosen": 4.179821968078613, "logits/rejected": 3.223979949951172, "logps/chosen": -223.80006408691406, "logps/rejected": -257.2651672363281, "loss": 0.6962, "rewards/accuracies": 0.375, "rewards/chosen": 0.15460927784442902, "rewards/margins": 0.0940190926194191, "rewards/rejected": 0.060590144246816635, "step": 3919 }, { "epoch": 0.6062246278755075, "grad_norm": 7.079946041107178, "learning_rate": 4.432924733646466e-06, "logits/chosen": 5.260807991027832, "logits/rejected": 6.139631271362305, "logps/chosen": -268.0679016113281, "logps/rejected": -341.3180236816406, "loss": 0.755, "rewards/accuracies": 0.375, "rewards/chosen": 0.5222038626670837, "rewards/margins": 0.04032406210899353, "rewards/rejected": 0.4818797707557678, "step": 3920 }, { "epoch": 0.6063792770152716, "grad_norm": 5.207577705383301, "learning_rate": 4.4326383319967926e-06, "logits/chosen": 6.2708964347839355, "logits/rejected": 8.383788108825684, "logps/chosen": -203.42987060546875, "logps/rejected": -229.19252014160156, "loss": 0.655, "rewards/accuracies": 0.625, "rewards/chosen": 0.1914692521095276, "rewards/margins": 0.1311703622341156, "rewards/rejected": 0.06029890477657318, "step": 3921 }, { "epoch": 0.6065339261550358, "grad_norm": 5.684727668762207, "learning_rate": 4.432351930347119e-06, "logits/chosen": 10.120211601257324, "logits/rejected": 8.831645011901855, "logps/chosen": -341.0797119140625, "logps/rejected": -266.207763671875, "loss": 0.8905, "rewards/accuracies": 0.75, "rewards/chosen": 0.211262509226799, "rewards/margins": -0.19925397634506226, "rewards/rejected": 0.41051650047302246, "step": 3922 }, { "epoch": 0.6066885752947999, "grad_norm": 8.976771354675293, "learning_rate": 4.432065528697446e-06, "logits/chosen": 8.938323020935059, "logits/rejected": 9.13524341583252, "logps/chosen": -361.41998291015625, "logps/rejected": -339.8784484863281, "loss": 0.6839, "rewards/accuracies": 0.625, "rewards/chosen": 0.15553255379199982, "rewards/margins": 0.11367003619670868, "rewards/rejected": 0.04186253994703293, "step": 3923 }, { "epoch": 0.6068432244345641, "grad_norm": 7.550969123840332, "learning_rate": 4.4317791270477725e-06, "logits/chosen": 4.504551410675049, "logits/rejected": -1.2373692989349365, "logps/chosen": -243.65744018554688, "logps/rejected": -222.73126220703125, "loss": 0.7045, "rewards/accuracies": 0.375, "rewards/chosen": 0.2893436551094055, "rewards/margins": 0.07997182011604309, "rewards/rejected": 0.20937182009220123, "step": 3924 }, { "epoch": 0.6069978735743282, "grad_norm": 4.785045623779297, "learning_rate": 4.431492725398099e-06, "logits/chosen": 7.851067066192627, "logits/rejected": 7.215798377990723, "logps/chosen": -259.1886291503906, "logps/rejected": -277.53680419921875, "loss": 0.6859, "rewards/accuracies": 0.5, "rewards/chosen": 0.10007940232753754, "rewards/margins": 0.09074074029922485, "rewards/rejected": 0.009338676929473877, "step": 3925 }, { "epoch": 0.6071525227140924, "grad_norm": 18.191181182861328, "learning_rate": 4.431206323748426e-06, "logits/chosen": 3.268805742263794, "logits/rejected": 4.472982883453369, "logps/chosen": -178.28390502929688, "logps/rejected": -186.53646850585938, "loss": 0.702, "rewards/accuracies": 0.5, "rewards/chosen": 0.36703360080718994, "rewards/margins": 0.03177686780691147, "rewards/rejected": 0.33525675535202026, "step": 3926 }, { "epoch": 0.6073071718538565, "grad_norm": 4.964666843414307, "learning_rate": 4.430919922098752e-06, "logits/chosen": 8.175495147705078, "logits/rejected": 4.364938735961914, "logps/chosen": -330.52374267578125, "logps/rejected": -248.77520751953125, "loss": 0.603, "rewards/accuracies": 0.5, "rewards/chosen": 0.7188816666603088, "rewards/margins": 0.4681444764137268, "rewards/rejected": 0.25073719024658203, "step": 3927 }, { "epoch": 0.6074618209936207, "grad_norm": 4.099869728088379, "learning_rate": 4.430633520449078e-06, "logits/chosen": 5.738530158996582, "logits/rejected": 13.697628021240234, "logps/chosen": -165.16888427734375, "logps/rejected": -215.4642791748047, "loss": 0.7018, "rewards/accuracies": 0.375, "rewards/chosen": 0.287031888961792, "rewards/margins": 0.2301531583070755, "rewards/rejected": 0.056878771632909775, "step": 3928 }, { "epoch": 0.6076164701333849, "grad_norm": 6.254826545715332, "learning_rate": 4.430347118799405e-06, "logits/chosen": 9.18960952758789, "logits/rejected": 5.901559829711914, "logps/chosen": -266.15423583984375, "logps/rejected": -244.1021728515625, "loss": 0.7581, "rewards/accuracies": 0.375, "rewards/chosen": 0.4968794584274292, "rewards/margins": -0.09724986553192139, "rewards/rejected": 0.5941293239593506, "step": 3929 }, { "epoch": 0.6077711192731491, "grad_norm": 6.2791643142700195, "learning_rate": 4.4300607171497316e-06, "logits/chosen": 10.656717300415039, "logits/rejected": 12.716848373413086, "logps/chosen": -231.64620971679688, "logps/rejected": -269.5526123046875, "loss": 0.7628, "rewards/accuracies": 0.375, "rewards/chosen": 0.1951417475938797, "rewards/margins": 0.0024026334285736084, "rewards/rejected": 0.1927391141653061, "step": 3930 }, { "epoch": 0.6079257684129132, "grad_norm": 4.102294921875, "learning_rate": 4.429774315500057e-06, "logits/chosen": 6.861971855163574, "logits/rejected": 7.396608829498291, "logps/chosen": -182.29127502441406, "logps/rejected": -215.16659545898438, "loss": 0.6351, "rewards/accuracies": 0.625, "rewards/chosen": 0.11349441111087799, "rewards/margins": 0.26495811343193054, "rewards/rejected": -0.15146368741989136, "step": 3931 }, { "epoch": 0.6080804175526774, "grad_norm": 6.228318214416504, "learning_rate": 4.429487913850384e-06, "logits/chosen": 11.26646614074707, "logits/rejected": 9.691991806030273, "logps/chosen": -391.5594177246094, "logps/rejected": -313.89288330078125, "loss": 0.6799, "rewards/accuracies": 0.5, "rewards/chosen": 0.537706196308136, "rewards/margins": 0.08447578549385071, "rewards/rejected": 0.4532304108142853, "step": 3932 }, { "epoch": 0.6082350666924415, "grad_norm": 5.476931095123291, "learning_rate": 4.429201512200711e-06, "logits/chosen": 3.3181259632110596, "logits/rejected": 4.004659175872803, "logps/chosen": -319.52484130859375, "logps/rejected": -269.1839294433594, "loss": 0.6531, "rewards/accuracies": 0.5, "rewards/chosen": 0.41651710867881775, "rewards/margins": 0.22092759609222412, "rewards/rejected": 0.19558948278427124, "step": 3933 }, { "epoch": 0.6083897158322057, "grad_norm": 6.260740756988525, "learning_rate": 4.428915110551037e-06, "logits/chosen": 8.654623985290527, "logits/rejected": 5.765504360198975, "logps/chosen": -345.4053955078125, "logps/rejected": -248.0701141357422, "loss": 0.6519, "rewards/accuracies": 0.625, "rewards/chosen": 0.43752652406692505, "rewards/margins": 0.2794763743877411, "rewards/rejected": 0.15805016458034515, "step": 3934 }, { "epoch": 0.6085443649719698, "grad_norm": 6.456355571746826, "learning_rate": 4.428628708901363e-06, "logits/chosen": 7.498924732208252, "logits/rejected": 6.414624214172363, "logps/chosen": -299.5986022949219, "logps/rejected": -290.5528869628906, "loss": 0.8254, "rewards/accuracies": 0.375, "rewards/chosen": 0.030011199414730072, "rewards/margins": -0.11222369968891144, "rewards/rejected": 0.1422349065542221, "step": 3935 }, { "epoch": 0.608699014111734, "grad_norm": 4.903226852416992, "learning_rate": 4.42834230725169e-06, "logits/chosen": 2.0587334632873535, "logits/rejected": 5.142673969268799, "logps/chosen": -212.19400024414062, "logps/rejected": -155.07481384277344, "loss": 0.7332, "rewards/accuracies": 0.25, "rewards/chosen": 0.18088267743587494, "rewards/margins": -0.03783514350652695, "rewards/rejected": 0.2187178134918213, "step": 3936 }, { "epoch": 0.6088536632514981, "grad_norm": 4.980144023895264, "learning_rate": 4.428055905602016e-06, "logits/chosen": 13.498974800109863, "logits/rejected": 12.166093826293945, "logps/chosen": -306.012939453125, "logps/rejected": -347.42620849609375, "loss": 0.6097, "rewards/accuracies": 0.625, "rewards/chosen": 0.2839524447917938, "rewards/margins": 0.23469415307044983, "rewards/rejected": 0.0492582842707634, "step": 3937 }, { "epoch": 0.6090083123912623, "grad_norm": 4.113332271575928, "learning_rate": 4.427769503952343e-06, "logits/chosen": 5.48469877243042, "logits/rejected": 0.37880033254623413, "logps/chosen": -242.65028381347656, "logps/rejected": -152.6143798828125, "loss": 0.473, "rewards/accuracies": 1.0, "rewards/chosen": 0.41076648235321045, "rewards/margins": 0.5718305110931396, "rewards/rejected": -0.1610640287399292, "step": 3938 }, { "epoch": 0.6091629615310264, "grad_norm": 4.880515098571777, "learning_rate": 4.42748310230267e-06, "logits/chosen": 6.751075744628906, "logits/rejected": 8.638193130493164, "logps/chosen": -166.15328979492188, "logps/rejected": -230.50582885742188, "loss": 0.6765, "rewards/accuracies": 0.5, "rewards/chosen": -0.33478519320487976, "rewards/margins": 0.0900709256529808, "rewards/rejected": -0.42485612630844116, "step": 3939 }, { "epoch": 0.6093176106707906, "grad_norm": 4.448638916015625, "learning_rate": 4.4271967006529955e-06, "logits/chosen": 7.269002914428711, "logits/rejected": 8.951229095458984, "logps/chosen": -194.74359130859375, "logps/rejected": -235.1671600341797, "loss": 0.6945, "rewards/accuracies": 0.5, "rewards/chosen": 0.15649251639842987, "rewards/margins": 0.09792394191026688, "rewards/rejected": 0.05856857821345329, "step": 3940 }, { "epoch": 0.6094722598105548, "grad_norm": 6.018690586090088, "learning_rate": 4.426910299003322e-06, "logits/chosen": 13.833003044128418, "logits/rejected": 8.66430950164795, "logps/chosen": -237.7458038330078, "logps/rejected": -161.79444885253906, "loss": 0.7577, "rewards/accuracies": 0.5, "rewards/chosen": 0.021770503371953964, "rewards/margins": 0.15440839529037476, "rewards/rejected": -0.1326378732919693, "step": 3941 }, { "epoch": 0.609626908950319, "grad_norm": 9.304879188537598, "learning_rate": 4.426623897353649e-06, "logits/chosen": 6.3647918701171875, "logits/rejected": 1.8037711381912231, "logps/chosen": -353.09991455078125, "logps/rejected": -228.76918029785156, "loss": 0.6942, "rewards/accuracies": 0.625, "rewards/chosen": 0.4195597767829895, "rewards/margins": 0.32777053117752075, "rewards/rejected": 0.09178925305604935, "step": 3942 }, { "epoch": 0.6097815580900832, "grad_norm": 5.764718532562256, "learning_rate": 4.4263374957039755e-06, "logits/chosen": 9.352285385131836, "logits/rejected": 8.429758071899414, "logps/chosen": -254.9012908935547, "logps/rejected": -251.05540466308594, "loss": 0.6218, "rewards/accuracies": 0.5, "rewards/chosen": 0.24047036468982697, "rewards/margins": 0.21459397673606873, "rewards/rejected": 0.025876376777887344, "step": 3943 }, { "epoch": 0.6099362072298473, "grad_norm": 2.95068621635437, "learning_rate": 4.426051094054302e-06, "logits/chosen": 9.840718269348145, "logits/rejected": 3.394331455230713, "logps/chosen": -453.9388427734375, "logps/rejected": -185.21783447265625, "loss": 0.409, "rewards/accuracies": 0.75, "rewards/chosen": 0.9666721820831299, "rewards/margins": 0.9936515092849731, "rewards/rejected": -0.02697935700416565, "step": 3944 }, { "epoch": 0.6100908563696115, "grad_norm": 3.6018004417419434, "learning_rate": 4.425764692404629e-06, "logits/chosen": 11.385409355163574, "logits/rejected": 2.0178732872009277, "logps/chosen": -191.98577880859375, "logps/rejected": -98.89240264892578, "loss": 0.538, "rewards/accuracies": 0.75, "rewards/chosen": 0.47789880633354187, "rewards/margins": 0.44668370485305786, "rewards/rejected": 0.031215094029903412, "step": 3945 }, { "epoch": 0.6102455055093756, "grad_norm": 5.9750566482543945, "learning_rate": 4.4254782907549554e-06, "logits/chosen": 8.962934494018555, "logits/rejected": 9.986520767211914, "logps/chosen": -217.88967895507812, "logps/rejected": -298.3156433105469, "loss": 0.7551, "rewards/accuracies": 0.375, "rewards/chosen": 0.1696626991033554, "rewards/margins": -0.06377942115068436, "rewards/rejected": 0.23344211280345917, "step": 3946 }, { "epoch": 0.6104001546491398, "grad_norm": 4.730971336364746, "learning_rate": 4.425191889105281e-06, "logits/chosen": 10.342905044555664, "logits/rejected": 12.241241455078125, "logps/chosen": -247.43345642089844, "logps/rejected": -272.8958740234375, "loss": 0.6278, "rewards/accuracies": 0.5, "rewards/chosen": 0.7114505171775818, "rewards/margins": 0.34041827917099, "rewards/rejected": 0.371032178401947, "step": 3947 }, { "epoch": 0.6105548037889039, "grad_norm": 5.059885501861572, "learning_rate": 4.424905487455608e-06, "logits/chosen": 11.473493576049805, "logits/rejected": 14.256406784057617, "logps/chosen": -295.53167724609375, "logps/rejected": -322.2238464355469, "loss": 0.6668, "rewards/accuracies": 0.375, "rewards/chosen": 0.09232574701309204, "rewards/margins": 0.2279464304447174, "rewards/rejected": -0.13562071323394775, "step": 3948 }, { "epoch": 0.6107094529286681, "grad_norm": 4.253401756286621, "learning_rate": 4.4246190858059345e-06, "logits/chosen": 12.653377532958984, "logits/rejected": 7.631697177886963, "logps/chosen": -273.0514831542969, "logps/rejected": -208.06675720214844, "loss": 0.6663, "rewards/accuracies": 0.5, "rewards/chosen": 0.3519936501979828, "rewards/margins": 0.13607236742973328, "rewards/rejected": 0.2159213125705719, "step": 3949 }, { "epoch": 0.6108641020684322, "grad_norm": 5.537108421325684, "learning_rate": 4.424332684156261e-06, "logits/chosen": 10.17874526977539, "logits/rejected": 11.40488052368164, "logps/chosen": -213.402587890625, "logps/rejected": -304.5474853515625, "loss": 0.726, "rewards/accuracies": 0.375, "rewards/chosen": 0.41043418645858765, "rewards/margins": 0.011886127293109894, "rewards/rejected": 0.39854809641838074, "step": 3950 }, { "epoch": 0.6110187512081964, "grad_norm": 4.392712593078613, "learning_rate": 4.424046282506588e-06, "logits/chosen": 9.43829345703125, "logits/rejected": 5.8715972900390625, "logps/chosen": -247.12969970703125, "logps/rejected": -268.78643798828125, "loss": 0.478, "rewards/accuracies": 0.75, "rewards/chosen": 0.4751456379890442, "rewards/margins": 0.6031975746154785, "rewards/rejected": -0.1280519962310791, "step": 3951 }, { "epoch": 0.6111734003479605, "grad_norm": 5.71186637878418, "learning_rate": 4.4237598808569145e-06, "logits/chosen": 9.687740325927734, "logits/rejected": 14.106892585754395, "logps/chosen": -191.24928283691406, "logps/rejected": -226.9354248046875, "loss": 0.6278, "rewards/accuracies": 0.75, "rewards/chosen": 0.3443165421485901, "rewards/margins": 0.2833443284034729, "rewards/rejected": 0.06097222864627838, "step": 3952 }, { "epoch": 0.6113280494877247, "grad_norm": 4.307219505310059, "learning_rate": 4.42347347920724e-06, "logits/chosen": 8.427547454833984, "logits/rejected": 5.390182018280029, "logps/chosen": -411.5386657714844, "logps/rejected": -280.1956787109375, "loss": 0.4709, "rewards/accuracies": 0.75, "rewards/chosen": 0.7386701107025146, "rewards/margins": 0.7951598167419434, "rewards/rejected": -0.056489646434783936, "step": 3953 }, { "epoch": 0.6114826986274888, "grad_norm": 5.931158542633057, "learning_rate": 4.423187077557567e-06, "logits/chosen": 8.853277206420898, "logits/rejected": 3.7948968410491943, "logps/chosen": -252.56930541992188, "logps/rejected": -229.06695556640625, "loss": 0.8123, "rewards/accuracies": 0.25, "rewards/chosen": 0.15185098350048065, "rewards/margins": -0.1505226045846939, "rewards/rejected": 0.30237358808517456, "step": 3954 }, { "epoch": 0.6116373477672531, "grad_norm": 5.277589321136475, "learning_rate": 4.422900675907894e-06, "logits/chosen": 10.294349670410156, "logits/rejected": 7.776795864105225, "logps/chosen": -240.8458251953125, "logps/rejected": -204.66903686523438, "loss": 0.6645, "rewards/accuracies": 0.875, "rewards/chosen": 0.3079814314842224, "rewards/margins": 0.18570742011070251, "rewards/rejected": 0.1222740188241005, "step": 3955 }, { "epoch": 0.6117919969070172, "grad_norm": 16.515262603759766, "learning_rate": 4.42261427425822e-06, "logits/chosen": 6.147159099578857, "logits/rejected": 9.732583045959473, "logps/chosen": -301.515869140625, "logps/rejected": -360.2158203125, "loss": 0.8199, "rewards/accuracies": 0.375, "rewards/chosen": -0.07122736424207687, "rewards/margins": -0.1844414919614792, "rewards/rejected": 0.11321412771940231, "step": 3956 }, { "epoch": 0.6119466460467814, "grad_norm": 5.09522819519043, "learning_rate": 4.422327872608547e-06, "logits/chosen": 6.902000427246094, "logits/rejected": 6.220500946044922, "logps/chosen": -207.66470336914062, "logps/rejected": -157.48452758789062, "loss": 0.7978, "rewards/accuracies": 0.375, "rewards/chosen": 0.28255677223205566, "rewards/margins": -0.09610196948051453, "rewards/rejected": 0.3786587119102478, "step": 3957 }, { "epoch": 0.6121012951865455, "grad_norm": 6.843777179718018, "learning_rate": 4.4220414709588735e-06, "logits/chosen": 8.429061889648438, "logits/rejected": 11.85678482055664, "logps/chosen": -159.63804626464844, "logps/rejected": -244.06492614746094, "loss": 0.7833, "rewards/accuracies": 0.625, "rewards/chosen": 0.15043459832668304, "rewards/margins": -0.03470522165298462, "rewards/rejected": 0.18513980507850647, "step": 3958 }, { "epoch": 0.6122559443263097, "grad_norm": 4.550302982330322, "learning_rate": 4.4217550693092e-06, "logits/chosen": 10.892744064331055, "logits/rejected": 9.70377254486084, "logps/chosen": -207.5938720703125, "logps/rejected": -183.10531616210938, "loss": 0.6456, "rewards/accuracies": 0.75, "rewards/chosen": 0.10899969935417175, "rewards/margins": 0.13289988040924072, "rewards/rejected": -0.02390018105506897, "step": 3959 }, { "epoch": 0.6124105934660738, "grad_norm": 9.154976844787598, "learning_rate": 4.421468667659526e-06, "logits/chosen": 11.393348693847656, "logits/rejected": 11.878911972045898, "logps/chosen": -281.6402282714844, "logps/rejected": -290.2956237792969, "loss": 0.5833, "rewards/accuracies": 0.625, "rewards/chosen": 0.4247601628303528, "rewards/margins": 0.3584325313568115, "rewards/rejected": 0.06632763147354126, "step": 3960 }, { "epoch": 0.612565242605838, "grad_norm": 3.601804494857788, "learning_rate": 4.421182266009853e-06, "logits/chosen": 12.146629333496094, "logits/rejected": 10.923734664916992, "logps/chosen": -258.5301513671875, "logps/rejected": -193.32644653320312, "loss": 0.5444, "rewards/accuracies": 0.75, "rewards/chosen": 0.8748998641967773, "rewards/margins": 0.47861260175704956, "rewards/rejected": 0.396287202835083, "step": 3961 }, { "epoch": 0.6127198917456022, "grad_norm": 6.186570644378662, "learning_rate": 4.420895864360179e-06, "logits/chosen": 9.150975227355957, "logits/rejected": 6.546099662780762, "logps/chosen": -353.0328063964844, "logps/rejected": -217.9554443359375, "loss": 0.7871, "rewards/accuracies": 0.625, "rewards/chosen": 0.33110055327415466, "rewards/margins": 0.06069934368133545, "rewards/rejected": 0.2704011797904968, "step": 3962 }, { "epoch": 0.6128745408853663, "grad_norm": 4.830526828765869, "learning_rate": 4.420609462710506e-06, "logits/chosen": 9.601766586303711, "logits/rejected": 5.484531879425049, "logps/chosen": -232.62954711914062, "logps/rejected": -177.02243041992188, "loss": 0.6411, "rewards/accuracies": 0.5, "rewards/chosen": 0.4963589012622833, "rewards/margins": 0.19993120431900024, "rewards/rejected": 0.2964276671409607, "step": 3963 }, { "epoch": 0.6130291900251305, "grad_norm": 3.7554080486297607, "learning_rate": 4.420323061060833e-06, "logits/chosen": 10.423748970031738, "logits/rejected": 10.842555046081543, "logps/chosen": -141.82188415527344, "logps/rejected": -170.78945922851562, "loss": 0.5048, "rewards/accuracies": 0.75, "rewards/chosen": 0.26879173517227173, "rewards/margins": 0.6932423710823059, "rewards/rejected": -0.4244506359100342, "step": 3964 }, { "epoch": 0.6131838391648946, "grad_norm": 5.461180210113525, "learning_rate": 4.420036659411158e-06, "logits/chosen": 7.354410171508789, "logits/rejected": 12.26761245727539, "logps/chosen": -189.40394592285156, "logps/rejected": -246.69393920898438, "loss": 0.7015, "rewards/accuracies": 0.625, "rewards/chosen": 0.3383534550666809, "rewards/margins": 0.03129100054502487, "rewards/rejected": 0.30706244707107544, "step": 3965 }, { "epoch": 0.6133384883046588, "grad_norm": 5.0663323402404785, "learning_rate": 4.419750257761485e-06, "logits/chosen": 12.438953399658203, "logits/rejected": 7.917116165161133, "logps/chosen": -244.00332641601562, "logps/rejected": -163.62026977539062, "loss": 0.8037, "rewards/accuracies": 0.5, "rewards/chosen": -0.03278389573097229, "rewards/margins": -0.03631211817264557, "rewards/rejected": 0.003528214991092682, "step": 3966 }, { "epoch": 0.613493137444423, "grad_norm": 9.220536231994629, "learning_rate": 4.419463856111812e-06, "logits/chosen": 10.85869312286377, "logits/rejected": 8.307962417602539, "logps/chosen": -281.2154541015625, "logps/rejected": -268.24786376953125, "loss": 0.5865, "rewards/accuracies": 0.625, "rewards/chosen": 0.6907461881637573, "rewards/margins": 0.2936241328716278, "rewards/rejected": 0.3971221148967743, "step": 3967 }, { "epoch": 0.6136477865841872, "grad_norm": 6.323410511016846, "learning_rate": 4.419177454462138e-06, "logits/chosen": 10.972429275512695, "logits/rejected": 6.477114677429199, "logps/chosen": -304.065185546875, "logps/rejected": -249.53591918945312, "loss": 0.7205, "rewards/accuracies": 0.375, "rewards/chosen": 0.4116255044937134, "rewards/margins": 0.025416608899831772, "rewards/rejected": 0.3862088918685913, "step": 3968 }, { "epoch": 0.6138024357239513, "grad_norm": 6.181182861328125, "learning_rate": 4.418891052812464e-06, "logits/chosen": 4.934808731079102, "logits/rejected": 6.155580520629883, "logps/chosen": -189.90966796875, "logps/rejected": -306.04119873046875, "loss": 0.6913, "rewards/accuracies": 0.5, "rewards/chosen": 0.36994311213493347, "rewards/margins": 0.058012351393699646, "rewards/rejected": 0.31193074584007263, "step": 3969 }, { "epoch": 0.6139570848637155, "grad_norm": 6.408806324005127, "learning_rate": 4.418604651162791e-06, "logits/chosen": 10.767641067504883, "logits/rejected": 5.702798843383789, "logps/chosen": -313.1101379394531, "logps/rejected": -293.96978759765625, "loss": 0.6186, "rewards/accuracies": 0.625, "rewards/chosen": 0.5324552655220032, "rewards/margins": 0.28967976570129395, "rewards/rejected": 0.2427755445241928, "step": 3970 }, { "epoch": 0.6141117340034796, "grad_norm": 6.188937664031982, "learning_rate": 4.4183182495131175e-06, "logits/chosen": 5.66517448425293, "logits/rejected": 2.293285846710205, "logps/chosen": -253.6072998046875, "logps/rejected": -204.28594970703125, "loss": 0.6067, "rewards/accuracies": 0.75, "rewards/chosen": 0.5332667231559753, "rewards/margins": 0.281089723110199, "rewards/rejected": 0.25217705965042114, "step": 3971 }, { "epoch": 0.6142663831432438, "grad_norm": 6.798548221588135, "learning_rate": 4.418031847863444e-06, "logits/chosen": 10.6843900680542, "logits/rejected": 8.478568077087402, "logps/chosen": -194.47265625, "logps/rejected": -195.5419921875, "loss": 0.8092, "rewards/accuracies": 0.125, "rewards/chosen": 0.19158917665481567, "rewards/margins": -0.20718078315258026, "rewards/rejected": 0.39876994490623474, "step": 3972 }, { "epoch": 0.6144210322830079, "grad_norm": 5.045307636260986, "learning_rate": 4.41774544621377e-06, "logits/chosen": 10.354766845703125, "logits/rejected": 5.917832851409912, "logps/chosen": -427.99493408203125, "logps/rejected": -338.61279296875, "loss": 0.6111, "rewards/accuracies": 0.75, "rewards/chosen": 0.8421720266342163, "rewards/margins": 0.27371490001678467, "rewards/rejected": 0.5684571266174316, "step": 3973 }, { "epoch": 0.6145756814227721, "grad_norm": 4.441810607910156, "learning_rate": 4.4174590445640966e-06, "logits/chosen": 11.248181343078613, "logits/rejected": 9.445834159851074, "logps/chosen": -250.52337646484375, "logps/rejected": -175.71202087402344, "loss": 0.6, "rewards/accuracies": 0.625, "rewards/chosen": 0.6608389616012573, "rewards/margins": 0.2901197075843811, "rewards/rejected": 0.3707192540168762, "step": 3974 }, { "epoch": 0.6147303305625362, "grad_norm": 6.860800266265869, "learning_rate": 4.417172642914423e-06, "logits/chosen": 8.25340461730957, "logits/rejected": 5.902225971221924, "logps/chosen": -324.9285888671875, "logps/rejected": -260.1461486816406, "loss": 0.7593, "rewards/accuracies": 0.125, "rewards/chosen": 0.4724811613559723, "rewards/margins": 0.021984606981277466, "rewards/rejected": 0.4504966139793396, "step": 3975 }, { "epoch": 0.6148849797023004, "grad_norm": 5.341482162475586, "learning_rate": 4.41688624126475e-06, "logits/chosen": 14.92526626586914, "logits/rejected": 9.183149337768555, "logps/chosen": -346.3665771484375, "logps/rejected": -261.3407287597656, "loss": 0.4572, "rewards/accuracies": 0.625, "rewards/chosen": 0.8901488780975342, "rewards/margins": 0.8458946347236633, "rewards/rejected": 0.04425421357154846, "step": 3976 }, { "epoch": 0.6150396288420645, "grad_norm": 7.1989665031433105, "learning_rate": 4.4165998396150765e-06, "logits/chosen": 12.811212539672852, "logits/rejected": 10.94886302947998, "logps/chosen": -499.0943298339844, "logps/rejected": -391.61492919921875, "loss": 0.8161, "rewards/accuracies": 0.25, "rewards/chosen": 0.40263786911964417, "rewards/margins": -0.16286352276802063, "rewards/rejected": 0.5655014514923096, "step": 3977 }, { "epoch": 0.6151942779818287, "grad_norm": 6.798708915710449, "learning_rate": 4.416313437965403e-06, "logits/chosen": 8.418795585632324, "logits/rejected": 7.710784912109375, "logps/chosen": -363.533203125, "logps/rejected": -294.5526123046875, "loss": 0.6379, "rewards/accuracies": 0.625, "rewards/chosen": 0.5380655527114868, "rewards/margins": 0.17283202707767487, "rewards/rejected": 0.36523354053497314, "step": 3978 }, { "epoch": 0.6153489271215928, "grad_norm": 3.110316514968872, "learning_rate": 4.416027036315729e-06, "logits/chosen": 10.87832260131836, "logits/rejected": 5.233730316162109, "logps/chosen": -192.003662109375, "logps/rejected": -155.4304656982422, "loss": 0.5194, "rewards/accuracies": 0.75, "rewards/chosen": 0.30637475848197937, "rewards/margins": 0.49352115392684937, "rewards/rejected": -0.18714639544487, "step": 3979 }, { "epoch": 0.6155035762613571, "grad_norm": 12.283734321594238, "learning_rate": 4.415740634666056e-06, "logits/chosen": 16.186311721801758, "logits/rejected": 6.734541416168213, "logps/chosen": -378.1627197265625, "logps/rejected": -228.70108032226562, "loss": 0.5701, "rewards/accuracies": 0.625, "rewards/chosen": 0.8108668327331543, "rewards/margins": 0.45011216402053833, "rewards/rejected": 0.36075466871261597, "step": 3980 }, { "epoch": 0.6156582254011213, "grad_norm": 5.618738174438477, "learning_rate": 4.415454233016382e-06, "logits/chosen": 10.618489265441895, "logits/rejected": 8.664163589477539, "logps/chosen": -302.9654541015625, "logps/rejected": -249.79965209960938, "loss": 0.5684, "rewards/accuracies": 0.625, "rewards/chosen": 0.6317172646522522, "rewards/margins": 0.4660932421684265, "rewards/rejected": 0.16562394797801971, "step": 3981 }, { "epoch": 0.6158128745408854, "grad_norm": 6.898849010467529, "learning_rate": 4.415167831366709e-06, "logits/chosen": 15.606817245483398, "logits/rejected": 5.793133735656738, "logps/chosen": -518.538330078125, "logps/rejected": -346.8183898925781, "loss": 0.8555, "rewards/accuracies": 0.5, "rewards/chosen": 0.20846271514892578, "rewards/margins": -0.21831157803535461, "rewards/rejected": 0.4267743229866028, "step": 3982 }, { "epoch": 0.6159675236806496, "grad_norm": 5.192005634307861, "learning_rate": 4.4148814297170356e-06, "logits/chosen": 6.179653644561768, "logits/rejected": 1.9027841091156006, "logps/chosen": -191.0103759765625, "logps/rejected": -96.84380340576172, "loss": 0.6513, "rewards/accuracies": 0.625, "rewards/chosen": 0.11486382782459259, "rewards/margins": 0.19265887141227722, "rewards/rejected": -0.07779502868652344, "step": 3983 }, { "epoch": 0.6161221728204137, "grad_norm": 5.486961364746094, "learning_rate": 4.414595028067362e-06, "logits/chosen": 11.87032699584961, "logits/rejected": 12.852231979370117, "logps/chosen": -181.870849609375, "logps/rejected": -230.91493225097656, "loss": 0.7214, "rewards/accuracies": 0.5, "rewards/chosen": -0.04374241083860397, "rewards/margins": 0.017684705555438995, "rewards/rejected": -0.06142710894346237, "step": 3984 }, { "epoch": 0.6162768219601779, "grad_norm": 6.306859970092773, "learning_rate": 4.414308626417689e-06, "logits/chosen": 12.358658790588379, "logits/rejected": 9.750887870788574, "logps/chosen": -325.25506591796875, "logps/rejected": -227.91432189941406, "loss": 0.7051, "rewards/accuracies": 0.625, "rewards/chosen": 0.4135579764842987, "rewards/margins": 0.16071553528308868, "rewards/rejected": 0.2528424561023712, "step": 3985 }, { "epoch": 0.616431471099942, "grad_norm": 5.853692531585693, "learning_rate": 4.414022224768015e-06, "logits/chosen": 10.070363998413086, "logits/rejected": 9.573240280151367, "logps/chosen": -213.6751708984375, "logps/rejected": -283.18560791015625, "loss": 0.7138, "rewards/accuracies": 0.625, "rewards/chosen": 0.5747483968734741, "rewards/margins": 0.16866031289100647, "rewards/rejected": 0.40608811378479004, "step": 3986 }, { "epoch": 0.6165861202397062, "grad_norm": 5.2673163414001465, "learning_rate": 4.413735823118341e-06, "logits/chosen": 9.505758285522461, "logits/rejected": 4.42695426940918, "logps/chosen": -299.49078369140625, "logps/rejected": -288.18914794921875, "loss": 0.5287, "rewards/accuracies": 0.875, "rewards/chosen": 0.6796090006828308, "rewards/margins": 0.44216328859329224, "rewards/rejected": 0.23744575679302216, "step": 3987 }, { "epoch": 0.6167407693794703, "grad_norm": 5.0639567375183105, "learning_rate": 4.413449421468668e-06, "logits/chosen": 15.185131072998047, "logits/rejected": 13.980964660644531, "logps/chosen": -265.08172607421875, "logps/rejected": -262.53045654296875, "loss": 0.6675, "rewards/accuracies": 0.5, "rewards/chosen": 0.1539907455444336, "rewards/margins": 0.1677388846874237, "rewards/rejected": -0.0137481689453125, "step": 3988 }, { "epoch": 0.6168954185192345, "grad_norm": 5.025876522064209, "learning_rate": 4.413163019818995e-06, "logits/chosen": 9.889678955078125, "logits/rejected": 10.165555000305176, "logps/chosen": -230.9527130126953, "logps/rejected": -238.69235229492188, "loss": 0.7475, "rewards/accuracies": 0.25, "rewards/chosen": 0.08680114895105362, "rewards/margins": -0.0775972381234169, "rewards/rejected": 0.1643984019756317, "step": 3989 }, { "epoch": 0.6170500676589986, "grad_norm": 3.8666720390319824, "learning_rate": 4.412876618169321e-06, "logits/chosen": 12.497709274291992, "logits/rejected": 5.2078657150268555, "logps/chosen": -226.739990234375, "logps/rejected": -129.91058349609375, "loss": 0.5501, "rewards/accuracies": 0.75, "rewards/chosen": 0.4635973274707794, "rewards/margins": 0.48007628321647644, "rewards/rejected": -0.016479015350341797, "step": 3990 }, { "epoch": 0.6172047167987628, "grad_norm": 4.434582233428955, "learning_rate": 4.412590216519648e-06, "logits/chosen": 8.910432815551758, "logits/rejected": 6.856439590454102, "logps/chosen": -252.7999267578125, "logps/rejected": -184.7388458251953, "loss": 0.5696, "rewards/accuracies": 0.875, "rewards/chosen": 0.6763100624084473, "rewards/margins": 0.4074946343898773, "rewards/rejected": 0.26881542801856995, "step": 3991 }, { "epoch": 0.6173593659385269, "grad_norm": 8.017807006835938, "learning_rate": 4.4123038148699746e-06, "logits/chosen": 4.168961524963379, "logits/rejected": 6.715488433837891, "logps/chosen": -360.7528991699219, "logps/rejected": -328.45361328125, "loss": 0.8931, "rewards/accuracies": 0.25, "rewards/chosen": 0.27519139647483826, "rewards/margins": -0.24730825424194336, "rewards/rejected": 0.522499680519104, "step": 3992 }, { "epoch": 0.6175140150782912, "grad_norm": 9.119096755981445, "learning_rate": 4.4120174132203e-06, "logits/chosen": 7.943830490112305, "logits/rejected": -0.4127311706542969, "logps/chosen": -330.6951599121094, "logps/rejected": -325.27099609375, "loss": 0.7636, "rewards/accuracies": 0.375, "rewards/chosen": 0.3674657940864563, "rewards/margins": -0.062451254576444626, "rewards/rejected": 0.4299170672893524, "step": 3993 }, { "epoch": 0.6176686642180553, "grad_norm": 6.174286365509033, "learning_rate": 4.411731011570627e-06, "logits/chosen": 6.646059989929199, "logits/rejected": 5.974409103393555, "logps/chosen": -303.280517578125, "logps/rejected": -227.81649780273438, "loss": 0.7623, "rewards/accuracies": 0.75, "rewards/chosen": 0.20779690146446228, "rewards/margins": -0.007047683000564575, "rewards/rejected": 0.21484455466270447, "step": 3994 }, { "epoch": 0.6178233133578195, "grad_norm": 6.199397087097168, "learning_rate": 4.411444609920954e-06, "logits/chosen": 9.876285552978516, "logits/rejected": 8.515664100646973, "logps/chosen": -356.68951416015625, "logps/rejected": -344.5235290527344, "loss": 0.7737, "rewards/accuracies": 0.375, "rewards/chosen": 0.5074409246444702, "rewards/margins": -0.11482200771570206, "rewards/rejected": 0.6222629547119141, "step": 3995 }, { "epoch": 0.6179779624975836, "grad_norm": 8.015007972717285, "learning_rate": 4.41115820827128e-06, "logits/chosen": 6.025833606719971, "logits/rejected": 7.431828498840332, "logps/chosen": -236.2277069091797, "logps/rejected": -310.51568603515625, "loss": 0.7423, "rewards/accuracies": 0.5, "rewards/chosen": 0.003694772720336914, "rewards/margins": -0.039074115455150604, "rewards/rejected": 0.04276890680193901, "step": 3996 }, { "epoch": 0.6181326116373478, "grad_norm": 5.4741411209106445, "learning_rate": 4.410871806621607e-06, "logits/chosen": 12.832688331604004, "logits/rejected": 7.965878486633301, "logps/chosen": -247.70278930664062, "logps/rejected": -225.88583374023438, "loss": 0.6795, "rewards/accuracies": 0.625, "rewards/chosen": 0.3956608772277832, "rewards/margins": 0.06082305312156677, "rewards/rejected": 0.33483782410621643, "step": 3997 }, { "epoch": 0.6182872607771119, "grad_norm": 4.838098526000977, "learning_rate": 4.410585404971934e-06, "logits/chosen": 15.044391632080078, "logits/rejected": 8.95964527130127, "logps/chosen": -299.5126647949219, "logps/rejected": -208.17086791992188, "loss": 0.5528, "rewards/accuracies": 0.75, "rewards/chosen": 0.8709566593170166, "rewards/margins": 0.38361066579818726, "rewards/rejected": 0.48734599351882935, "step": 3998 }, { "epoch": 0.6184419099168761, "grad_norm": 6.5764641761779785, "learning_rate": 4.4102990033222594e-06, "logits/chosen": 4.940681457519531, "logits/rejected": 6.823337554931641, "logps/chosen": -319.206298828125, "logps/rejected": -324.8540954589844, "loss": 0.7444, "rewards/accuracies": 0.375, "rewards/chosen": 0.334145188331604, "rewards/margins": -0.05645487457513809, "rewards/rejected": 0.3906000852584839, "step": 3999 }, { "epoch": 0.6185965590566402, "grad_norm": 5.284141540527344, "learning_rate": 4.410012601672586e-06, "logits/chosen": 10.129792213439941, "logits/rejected": 5.309225559234619, "logps/chosen": -277.09814453125, "logps/rejected": -245.0251922607422, "loss": 0.6838, "rewards/accuracies": 0.625, "rewards/chosen": 0.35479944944381714, "rewards/margins": 0.048466116189956665, "rewards/rejected": 0.3063333332538605, "step": 4000 }, { "epoch": 0.6187512081964044, "grad_norm": 6.510834693908691, "learning_rate": 4.409726200022913e-06, "logits/chosen": 10.96594524383545, "logits/rejected": 10.082406997680664, "logps/chosen": -377.9990539550781, "logps/rejected": -290.2900390625, "loss": 0.7381, "rewards/accuracies": 0.5, "rewards/chosen": 0.47586309909820557, "rewards/margins": 0.02584478259086609, "rewards/rejected": 0.4500183165073395, "step": 4001 }, { "epoch": 0.6189058573361685, "grad_norm": 5.866822242736816, "learning_rate": 4.409439798373239e-06, "logits/chosen": 10.911684036254883, "logits/rejected": 9.771059036254883, "logps/chosen": -249.86415100097656, "logps/rejected": -252.82192993164062, "loss": 0.8159, "rewards/accuracies": 0.375, "rewards/chosen": 0.22084388136863708, "rewards/margins": -0.16515156626701355, "rewards/rejected": 0.38599544763565063, "step": 4002 }, { "epoch": 0.6190605064759327, "grad_norm": 5.162505149841309, "learning_rate": 4.409153396723565e-06, "logits/chosen": 8.09380054473877, "logits/rejected": 10.095977783203125, "logps/chosen": -236.61373901367188, "logps/rejected": -283.0645751953125, "loss": 0.7665, "rewards/accuracies": 0.375, "rewards/chosen": 0.6227982044219971, "rewards/margins": -0.07890131324529648, "rewards/rejected": 0.7016994953155518, "step": 4003 }, { "epoch": 0.6192151556156968, "grad_norm": 6.186079978942871, "learning_rate": 4.408866995073892e-06, "logits/chosen": 5.812247276306152, "logits/rejected": 5.927313327789307, "logps/chosen": -295.0551452636719, "logps/rejected": -255.51995849609375, "loss": 0.846, "rewards/accuracies": 0.375, "rewards/chosen": 0.24706745147705078, "rewards/margins": -0.15385542809963226, "rewards/rejected": 0.40092286467552185, "step": 4004 }, { "epoch": 0.619369804755461, "grad_norm": 6.510254859924316, "learning_rate": 4.4085805934242185e-06, "logits/chosen": 8.036344528198242, "logits/rejected": 10.093371391296387, "logps/chosen": -309.7167053222656, "logps/rejected": -311.7046813964844, "loss": 0.6862, "rewards/accuracies": 0.5, "rewards/chosen": 0.6454997658729553, "rewards/margins": 0.07213295996189117, "rewards/rejected": 0.5733668208122253, "step": 4005 }, { "epoch": 0.6195244538952253, "grad_norm": 4.414552688598633, "learning_rate": 4.408294191774545e-06, "logits/chosen": 12.78913688659668, "logits/rejected": 12.12380599975586, "logps/chosen": -182.97825622558594, "logps/rejected": -152.61581420898438, "loss": 0.6497, "rewards/accuracies": 0.5, "rewards/chosen": 0.2689127027988434, "rewards/margins": 0.17303729057312012, "rewards/rejected": 0.09587539732456207, "step": 4006 }, { "epoch": 0.6196791030349894, "grad_norm": 4.505795955657959, "learning_rate": 4.408007790124871e-06, "logits/chosen": 10.55947208404541, "logits/rejected": 8.339699745178223, "logps/chosen": -320.95721435546875, "logps/rejected": -276.74407958984375, "loss": 0.586, "rewards/accuracies": 0.625, "rewards/chosen": 0.5985777974128723, "rewards/margins": 0.3434251844882965, "rewards/rejected": 0.2551526427268982, "step": 4007 }, { "epoch": 0.6198337521747536, "grad_norm": 6.069342613220215, "learning_rate": 4.407721388475198e-06, "logits/chosen": 7.596423149108887, "logits/rejected": 6.370104789733887, "logps/chosen": -249.0166778564453, "logps/rejected": -264.9747009277344, "loss": 0.665, "rewards/accuracies": 0.625, "rewards/chosen": 0.4278978407382965, "rewards/margins": 0.09891906380653381, "rewards/rejected": 0.3289787769317627, "step": 4008 }, { "epoch": 0.6199884013145177, "grad_norm": 4.244874000549316, "learning_rate": 4.407434986825524e-06, "logits/chosen": 10.794459342956543, "logits/rejected": 5.339234828948975, "logps/chosen": -305.938720703125, "logps/rejected": -218.4661865234375, "loss": 0.5758, "rewards/accuracies": 0.625, "rewards/chosen": 0.529402494430542, "rewards/margins": 0.36998555064201355, "rewards/rejected": 0.15941688418388367, "step": 4009 }, { "epoch": 0.6201430504542819, "grad_norm": 8.21546459197998, "learning_rate": 4.407148585175851e-06, "logits/chosen": 7.781423091888428, "logits/rejected": 8.992557525634766, "logps/chosen": -264.19403076171875, "logps/rejected": -339.052490234375, "loss": 0.8488, "rewards/accuracies": 0.625, "rewards/chosen": 0.09604862332344055, "rewards/margins": -0.21417336165905, "rewards/rejected": 0.3102220296859741, "step": 4010 }, { "epoch": 0.620297699594046, "grad_norm": 4.776425361633301, "learning_rate": 4.4068621835261775e-06, "logits/chosen": 12.80551528930664, "logits/rejected": 8.52711296081543, "logps/chosen": -364.6460266113281, "logps/rejected": -257.2583312988281, "loss": 0.5171, "rewards/accuracies": 0.625, "rewards/chosen": 0.8015909194946289, "rewards/margins": 0.5088747143745422, "rewards/rejected": 0.2927161455154419, "step": 4011 }, { "epoch": 0.6204523487338102, "grad_norm": 5.902646541595459, "learning_rate": 4.406575781876503e-06, "logits/chosen": 9.784926414489746, "logits/rejected": 9.356770515441895, "logps/chosen": -259.9383239746094, "logps/rejected": -343.2843017578125, "loss": 0.6867, "rewards/accuracies": 0.625, "rewards/chosen": 0.3118656575679779, "rewards/margins": 0.14351238310337067, "rewards/rejected": 0.16835325956344604, "step": 4012 }, { "epoch": 0.6206069978735743, "grad_norm": 4.823896408081055, "learning_rate": 4.40628938022683e-06, "logits/chosen": 14.486096382141113, "logits/rejected": 10.90367603302002, "logps/chosen": -234.39022827148438, "logps/rejected": -193.61355590820312, "loss": 0.5757, "rewards/accuracies": 0.875, "rewards/chosen": 0.6393522620201111, "rewards/margins": 0.27943816781044006, "rewards/rejected": 0.35991406440734863, "step": 4013 }, { "epoch": 0.6207616470133385, "grad_norm": 3.4552700519561768, "learning_rate": 4.406002978577157e-06, "logits/chosen": 13.32492733001709, "logits/rejected": 6.451300621032715, "logps/chosen": -245.94692993164062, "logps/rejected": -171.83734130859375, "loss": 0.4626, "rewards/accuracies": 0.75, "rewards/chosen": 0.5272878408432007, "rewards/margins": 0.6320104598999023, "rewards/rejected": -0.10472260415554047, "step": 4014 }, { "epoch": 0.6209162961531026, "grad_norm": 5.932234287261963, "learning_rate": 4.405716576927483e-06, "logits/chosen": 11.590948104858398, "logits/rejected": 9.118955612182617, "logps/chosen": -390.6847839355469, "logps/rejected": -273.2651062011719, "loss": 0.6525, "rewards/accuracies": 0.625, "rewards/chosen": 0.32982784509658813, "rewards/margins": 0.170156329870224, "rewards/rejected": 0.15967151522636414, "step": 4015 }, { "epoch": 0.6210709452928668, "grad_norm": 5.118629455566406, "learning_rate": 4.40543017527781e-06, "logits/chosen": 8.611971855163574, "logits/rejected": 1.606679081916809, "logps/chosen": -311.0571594238281, "logps/rejected": -192.13607788085938, "loss": 0.6249, "rewards/accuracies": 0.5, "rewards/chosen": 0.2589518427848816, "rewards/margins": 0.23573365807533264, "rewards/rejected": 0.023218199610710144, "step": 4016 }, { "epoch": 0.6212255944326309, "grad_norm": 7.144024848937988, "learning_rate": 4.405143773628137e-06, "logits/chosen": 8.36507797241211, "logits/rejected": 7.581270217895508, "logps/chosen": -235.99813842773438, "logps/rejected": -251.02427673339844, "loss": 0.8928, "rewards/accuracies": 0.375, "rewards/chosen": 0.06704816222190857, "rewards/margins": -0.21711713075637817, "rewards/rejected": 0.28416526317596436, "step": 4017 }, { "epoch": 0.6213802435723951, "grad_norm": 8.759623527526855, "learning_rate": 4.404857371978463e-06, "logits/chosen": 7.732784271240234, "logits/rejected": 5.147335052490234, "logps/chosen": -300.4581298828125, "logps/rejected": -253.42013549804688, "loss": 0.5774, "rewards/accuracies": 0.5, "rewards/chosen": 0.5996932983398438, "rewards/margins": 0.46005499362945557, "rewards/rejected": 0.13963833451271057, "step": 4018 }, { "epoch": 0.6215348927121593, "grad_norm": 5.850327491760254, "learning_rate": 4.404570970328789e-06, "logits/chosen": 7.610745429992676, "logits/rejected": 7.347476959228516, "logps/chosen": -195.70159912109375, "logps/rejected": -199.7276153564453, "loss": 0.7276, "rewards/accuracies": 0.375, "rewards/chosen": 0.21239474415779114, "rewards/margins": -0.008057206869125366, "rewards/rejected": 0.2204519361257553, "step": 4019 }, { "epoch": 0.6216895418519235, "grad_norm": 5.558006763458252, "learning_rate": 4.404284568679116e-06, "logits/chosen": 12.289063453674316, "logits/rejected": 12.644683837890625, "logps/chosen": -252.84823608398438, "logps/rejected": -280.3616943359375, "loss": 0.7419, "rewards/accuracies": 0.375, "rewards/chosen": 0.2592141330242157, "rewards/margins": -0.06418189406394958, "rewards/rejected": 0.3233960270881653, "step": 4020 }, { "epoch": 0.6218441909916876, "grad_norm": 5.637094497680664, "learning_rate": 4.403998167029442e-06, "logits/chosen": 12.32789421081543, "logits/rejected": 9.94038200378418, "logps/chosen": -253.4571533203125, "logps/rejected": -238.50840759277344, "loss": 0.8034, "rewards/accuracies": 0.25, "rewards/chosen": 0.5833390355110168, "rewards/margins": -0.07631255686283112, "rewards/rejected": 0.6596515774726868, "step": 4021 }, { "epoch": 0.6219988401314518, "grad_norm": 6.444872856140137, "learning_rate": 4.403711765379769e-06, "logits/chosen": 5.635624408721924, "logits/rejected": 3.9463272094726562, "logps/chosen": -175.76524353027344, "logps/rejected": -252.73910522460938, "loss": 0.6852, "rewards/accuracies": 0.25, "rewards/chosen": 0.32783323526382446, "rewards/margins": 0.0339665561914444, "rewards/rejected": 0.29386669397354126, "step": 4022 }, { "epoch": 0.6221534892712159, "grad_norm": 7.920750617980957, "learning_rate": 4.403425363730096e-06, "logits/chosen": 9.29922866821289, "logits/rejected": 10.711066246032715, "logps/chosen": -282.9151916503906, "logps/rejected": -272.07513427734375, "loss": 0.8038, "rewards/accuracies": 0.375, "rewards/chosen": 0.15416660904884338, "rewards/margins": -0.1464461386203766, "rewards/rejected": 0.30061274766921997, "step": 4023 }, { "epoch": 0.6223081384109801, "grad_norm": 5.705741882324219, "learning_rate": 4.403138962080422e-06, "logits/chosen": 6.605433464050293, "logits/rejected": 9.549003601074219, "logps/chosen": -237.51446533203125, "logps/rejected": -239.06573486328125, "loss": 0.8658, "rewards/accuracies": 0.375, "rewards/chosen": 0.04292944818735123, "rewards/margins": -0.2676040232181549, "rewards/rejected": 0.3105334937572479, "step": 4024 }, { "epoch": 0.6224627875507442, "grad_norm": 4.184696674346924, "learning_rate": 4.402852560430749e-06, "logits/chosen": 7.7875776290893555, "logits/rejected": 3.6724977493286133, "logps/chosen": -357.46630859375, "logps/rejected": -218.472412109375, "loss": 0.6519, "rewards/accuracies": 0.5, "rewards/chosen": 0.5142974257469177, "rewards/margins": 0.2307938039302826, "rewards/rejected": 0.2835036516189575, "step": 4025 }, { "epoch": 0.6226174366905084, "grad_norm": 4.74005651473999, "learning_rate": 4.402566158781075e-06, "logits/chosen": 4.800574779510498, "logits/rejected": 5.066928386688232, "logps/chosen": -243.83627319335938, "logps/rejected": -254.01856994628906, "loss": 0.6003, "rewards/accuracies": 0.625, "rewards/chosen": 0.027154691517353058, "rewards/margins": 0.3163312077522278, "rewards/rejected": -0.28917649388313293, "step": 4026 }, { "epoch": 0.6227720858302725, "grad_norm": 27.043251037597656, "learning_rate": 4.402279757131401e-06, "logits/chosen": 12.06676197052002, "logits/rejected": 6.024239540100098, "logps/chosen": -257.285400390625, "logps/rejected": -196.4248504638672, "loss": 0.5654, "rewards/accuracies": 0.875, "rewards/chosen": 0.29332152009010315, "rewards/margins": 0.32829222083091736, "rewards/rejected": -0.03497070074081421, "step": 4027 }, { "epoch": 0.6229267349700367, "grad_norm": 7.00987434387207, "learning_rate": 4.401993355481728e-06, "logits/chosen": 12.373955726623535, "logits/rejected": 6.466969966888428, "logps/chosen": -415.71502685546875, "logps/rejected": -338.3392639160156, "loss": 0.5655, "rewards/accuracies": 0.625, "rewards/chosen": 0.6827533841133118, "rewards/margins": 0.43671202659606934, "rewards/rejected": 0.24604134261608124, "step": 4028 }, { "epoch": 0.6230813841098009, "grad_norm": 4.14984655380249, "learning_rate": 4.401706953832055e-06, "logits/chosen": 10.959396362304688, "logits/rejected": 3.9779815673828125, "logps/chosen": -193.0467529296875, "logps/rejected": -148.32884216308594, "loss": 0.6356, "rewards/accuracies": 0.625, "rewards/chosen": 0.30270734429359436, "rewards/margins": 0.19289106130599976, "rewards/rejected": 0.10981626808643341, "step": 4029 }, { "epoch": 0.623236033249565, "grad_norm": 7.222524166107178, "learning_rate": 4.401420552182381e-06, "logits/chosen": 8.91360092163086, "logits/rejected": 9.961875915527344, "logps/chosen": -352.41119384765625, "logps/rejected": -377.68536376953125, "loss": 0.8253, "rewards/accuracies": 0.5, "rewards/chosen": 0.3565616011619568, "rewards/margins": -0.15135295689105988, "rewards/rejected": 0.5079146027565002, "step": 4030 }, { "epoch": 0.6233906823893293, "grad_norm": 7.554623603820801, "learning_rate": 4.401134150532708e-06, "logits/chosen": 10.2402925491333, "logits/rejected": 8.680259704589844, "logps/chosen": -317.49041748046875, "logps/rejected": -255.16287231445312, "loss": 0.6731, "rewards/accuracies": 0.75, "rewards/chosen": 0.29330506920814514, "rewards/margins": 0.13002590835094452, "rewards/rejected": 0.16327916085720062, "step": 4031 }, { "epoch": 0.6235453315290934, "grad_norm": 5.088848114013672, "learning_rate": 4.400847748883034e-06, "logits/chosen": 9.234837532043457, "logits/rejected": 9.693359375, "logps/chosen": -234.90647888183594, "logps/rejected": -227.0321044921875, "loss": 0.7181, "rewards/accuracies": 0.625, "rewards/chosen": 0.36205166578292847, "rewards/margins": 0.065810427069664, "rewards/rejected": 0.29624125361442566, "step": 4032 }, { "epoch": 0.6236999806688576, "grad_norm": 5.123340606689453, "learning_rate": 4.4005613472333605e-06, "logits/chosen": 12.701391220092773, "logits/rejected": 5.5076398849487305, "logps/chosen": -293.3648376464844, "logps/rejected": -204.4552764892578, "loss": 0.5675, "rewards/accuracies": 0.75, "rewards/chosen": 0.4183961749076843, "rewards/margins": 0.3845493495464325, "rewards/rejected": 0.03384685516357422, "step": 4033 }, { "epoch": 0.6238546298086217, "grad_norm": 5.9557204246521, "learning_rate": 4.400274945583687e-06, "logits/chosen": 5.805088520050049, "logits/rejected": 9.830362319946289, "logps/chosen": -257.8200378417969, "logps/rejected": -277.54986572265625, "loss": 0.7459, "rewards/accuracies": 0.375, "rewards/chosen": 0.33600106835365295, "rewards/margins": -0.013096760958433151, "rewards/rejected": 0.3490978181362152, "step": 4034 }, { "epoch": 0.6240092789483859, "grad_norm": 4.270277500152588, "learning_rate": 4.399988543934014e-06, "logits/chosen": 5.860166072845459, "logits/rejected": 3.655550479888916, "logps/chosen": -286.904541015625, "logps/rejected": -276.11834716796875, "loss": 0.5973, "rewards/accuracies": 0.5, "rewards/chosen": 0.2892366647720337, "rewards/margins": 0.3018868863582611, "rewards/rejected": -0.01265022624284029, "step": 4035 }, { "epoch": 0.62416392808815, "grad_norm": 5.476607799530029, "learning_rate": 4.39970214228434e-06, "logits/chosen": 9.59937572479248, "logits/rejected": 14.298587799072266, "logps/chosen": -233.6822967529297, "logps/rejected": -319.22747802734375, "loss": 0.6277, "rewards/accuracies": 0.625, "rewards/chosen": 0.2313644289970398, "rewards/margins": 0.19144420325756073, "rewards/rejected": 0.03992024064064026, "step": 4036 }, { "epoch": 0.6243185772279142, "grad_norm": 5.029600620269775, "learning_rate": 4.399415740634666e-06, "logits/chosen": 12.351079940795898, "logits/rejected": 1.7783212661743164, "logps/chosen": -258.0171203613281, "logps/rejected": -162.19139099121094, "loss": 0.5499, "rewards/accuracies": 0.875, "rewards/chosen": 0.2656036913394928, "rewards/margins": 0.3623029589653015, "rewards/rejected": -0.09669926762580872, "step": 4037 }, { "epoch": 0.6244732263676783, "grad_norm": 5.408899307250977, "learning_rate": 4.399129338984993e-06, "logits/chosen": 9.417326927185059, "logits/rejected": -1.510567307472229, "logps/chosen": -308.684814453125, "logps/rejected": -159.21141052246094, "loss": 0.6245, "rewards/accuracies": 0.75, "rewards/chosen": 0.29975539445877075, "rewards/margins": 0.18107320368289948, "rewards/rejected": 0.11868220567703247, "step": 4038 }, { "epoch": 0.6246278755074425, "grad_norm": 4.076916217803955, "learning_rate": 4.3988429373353195e-06, "logits/chosen": 6.813101291656494, "logits/rejected": 5.6339521408081055, "logps/chosen": -283.181396484375, "logps/rejected": -237.10426330566406, "loss": 0.534, "rewards/accuracies": 0.75, "rewards/chosen": 0.5305769443511963, "rewards/margins": 0.4034223258495331, "rewards/rejected": 0.1271546334028244, "step": 4039 }, { "epoch": 0.6247825246472066, "grad_norm": 13.659684181213379, "learning_rate": 4.398556535685646e-06, "logits/chosen": 10.600590705871582, "logits/rejected": 7.222653388977051, "logps/chosen": -281.54058837890625, "logps/rejected": -188.13075256347656, "loss": 0.475, "rewards/accuracies": 0.875, "rewards/chosen": 0.596403956413269, "rewards/margins": 0.7324687242507935, "rewards/rejected": -0.1360647827386856, "step": 4040 }, { "epoch": 0.6249371737869708, "grad_norm": 5.383350849151611, "learning_rate": 4.398270134035972e-06, "logits/chosen": 10.697227478027344, "logits/rejected": 7.29941463470459, "logps/chosen": -343.995361328125, "logps/rejected": -223.40553283691406, "loss": 0.6618, "rewards/accuracies": 0.5, "rewards/chosen": 0.6271936893463135, "rewards/margins": 0.23002690076828003, "rewards/rejected": 0.3971668481826782, "step": 4041 }, { "epoch": 0.6250918229267349, "grad_norm": 7.511831283569336, "learning_rate": 4.397983732386299e-06, "logits/chosen": 13.679967880249023, "logits/rejected": 3.9051451683044434, "logps/chosen": -299.9984436035156, "logps/rejected": -195.6130828857422, "loss": 0.5494, "rewards/accuracies": 0.875, "rewards/chosen": 0.4566035866737366, "rewards/margins": 0.34514856338500977, "rewards/rejected": 0.11145499348640442, "step": 4042 }, { "epoch": 0.6252464720664991, "grad_norm": 6.712210178375244, "learning_rate": 4.397697330736625e-06, "logits/chosen": 9.03857707977295, "logits/rejected": 6.271660327911377, "logps/chosen": -365.9111328125, "logps/rejected": -254.62643432617188, "loss": 0.7326, "rewards/accuracies": 0.375, "rewards/chosen": 0.346962571144104, "rewards/margins": 0.14806309342384338, "rewards/rejected": 0.19889947772026062, "step": 4043 }, { "epoch": 0.6254011212062633, "grad_norm": 5.4127678871154785, "learning_rate": 4.397410929086952e-06, "logits/chosen": 10.976731300354004, "logits/rejected": 9.926229476928711, "logps/chosen": -308.1781005859375, "logps/rejected": -348.260986328125, "loss": 0.6871, "rewards/accuracies": 0.375, "rewards/chosen": 0.36406442523002625, "rewards/margins": 0.24290120601654053, "rewards/rejected": 0.12116318196058273, "step": 4044 }, { "epoch": 0.6255557703460275, "grad_norm": 5.855021953582764, "learning_rate": 4.397124527437278e-06, "logits/chosen": 10.623845100402832, "logits/rejected": 11.452596664428711, "logps/chosen": -240.625, "logps/rejected": -219.32843017578125, "loss": 0.742, "rewards/accuracies": 0.5, "rewards/chosen": 0.2689090073108673, "rewards/margins": -0.040515296161174774, "rewards/rejected": 0.30942434072494507, "step": 4045 }, { "epoch": 0.6257104194857916, "grad_norm": 4.983871936798096, "learning_rate": 4.396838125787604e-06, "logits/chosen": 9.657818794250488, "logits/rejected": 4.532278060913086, "logps/chosen": -319.77886962890625, "logps/rejected": -217.75038146972656, "loss": 0.6808, "rewards/accuracies": 0.5, "rewards/chosen": 0.27850714325904846, "rewards/margins": 0.08938390761613846, "rewards/rejected": 0.1891232281923294, "step": 4046 }, { "epoch": 0.6258650686255558, "grad_norm": 5.406404972076416, "learning_rate": 4.396551724137931e-06, "logits/chosen": 12.965319633483887, "logits/rejected": 9.557802200317383, "logps/chosen": -244.3644256591797, "logps/rejected": -203.85362243652344, "loss": 0.6886, "rewards/accuracies": 0.5, "rewards/chosen": 0.2785509526729584, "rewards/margins": 0.1845540851354599, "rewards/rejected": 0.09399686753749847, "step": 4047 }, { "epoch": 0.62601971776532, "grad_norm": 5.415280818939209, "learning_rate": 4.396265322488258e-06, "logits/chosen": 8.632823944091797, "logits/rejected": 7.07003116607666, "logps/chosen": -297.716552734375, "logps/rejected": -301.20477294921875, "loss": 0.7321, "rewards/accuracies": 0.5, "rewards/chosen": 0.313380628824234, "rewards/margins": 0.017455842345952988, "rewards/rejected": 0.2959247827529907, "step": 4048 }, { "epoch": 0.6261743669050841, "grad_norm": 5.993331432342529, "learning_rate": 4.395978920838584e-06, "logits/chosen": 13.16100788116455, "logits/rejected": 4.71240758895874, "logps/chosen": -408.00927734375, "logps/rejected": -297.93756103515625, "loss": 0.6833, "rewards/accuracies": 0.75, "rewards/chosen": 0.3201228976249695, "rewards/margins": 0.15773311257362366, "rewards/rejected": 0.1623898148536682, "step": 4049 }, { "epoch": 0.6263290160448483, "grad_norm": 6.0911664962768555, "learning_rate": 4.395692519188911e-06, "logits/chosen": 8.533544540405273, "logits/rejected": 7.047549247741699, "logps/chosen": -297.1532897949219, "logps/rejected": -233.2585906982422, "loss": 0.8069, "rewards/accuracies": 0.375, "rewards/chosen": -0.2909713685512543, "rewards/margins": -0.14614449441432953, "rewards/rejected": -0.14482688903808594, "step": 4050 }, { "epoch": 0.6264836651846124, "grad_norm": 32.98212432861328, "learning_rate": 4.395406117539238e-06, "logits/chosen": 10.070265769958496, "logits/rejected": -1.7147183418273926, "logps/chosen": -233.75804138183594, "logps/rejected": -152.1121368408203, "loss": 0.5733, "rewards/accuracies": 0.625, "rewards/chosen": 0.29491937160491943, "rewards/margins": 0.48594149947166443, "rewards/rejected": -0.19102217257022858, "step": 4051 }, { "epoch": 0.6266383143243766, "grad_norm": 4.405770778656006, "learning_rate": 4.3951197158895634e-06, "logits/chosen": 15.662874221801758, "logits/rejected": 9.65223503112793, "logps/chosen": -370.09814453125, "logps/rejected": -266.93438720703125, "loss": 0.4927, "rewards/accuracies": 0.875, "rewards/chosen": 0.4853792190551758, "rewards/margins": 0.5229854583740234, "rewards/rejected": -0.037606243044137955, "step": 4052 }, { "epoch": 0.6267929634641407, "grad_norm": 3.3254263401031494, "learning_rate": 4.39483331423989e-06, "logits/chosen": 9.10347843170166, "logits/rejected": -0.4621303081512451, "logps/chosen": -229.28807067871094, "logps/rejected": -111.28083038330078, "loss": 0.4834, "rewards/accuracies": 0.75, "rewards/chosen": 0.2897731065750122, "rewards/margins": 0.61546790599823, "rewards/rejected": -0.32569482922554016, "step": 4053 }, { "epoch": 0.6269476126039049, "grad_norm": 5.133017539978027, "learning_rate": 4.394546912590217e-06, "logits/chosen": 9.82995891571045, "logits/rejected": 10.014596939086914, "logps/chosen": -294.539794921875, "logps/rejected": -291.3507080078125, "loss": 0.6141, "rewards/accuracies": 0.625, "rewards/chosen": 0.19719655811786652, "rewards/margins": 0.31432151794433594, "rewards/rejected": -0.11712498217821121, "step": 4054 }, { "epoch": 0.627102261743669, "grad_norm": 4.660998821258545, "learning_rate": 4.394260510940543e-06, "logits/chosen": 11.159553527832031, "logits/rejected": 12.038963317871094, "logps/chosen": -291.8470153808594, "logps/rejected": -262.78521728515625, "loss": 0.5008, "rewards/accuracies": 0.875, "rewards/chosen": 0.3734864592552185, "rewards/margins": 0.49905601143836975, "rewards/rejected": -0.12556958198547363, "step": 4055 }, { "epoch": 0.6272569108834332, "grad_norm": 4.688870906829834, "learning_rate": 4.39397410929087e-06, "logits/chosen": 5.18271541595459, "logits/rejected": 2.952927589416504, "logps/chosen": -308.16754150390625, "logps/rejected": -289.984619140625, "loss": 0.607, "rewards/accuracies": 0.625, "rewards/chosen": 0.3408759832382202, "rewards/margins": 0.26149341464042664, "rewards/rejected": 0.07938262820243835, "step": 4056 }, { "epoch": 0.6274115600231974, "grad_norm": 3.7789859771728516, "learning_rate": 4.393687707641197e-06, "logits/chosen": 11.14282512664795, "logits/rejected": 6.329801082611084, "logps/chosen": -234.9351348876953, "logps/rejected": -176.82781982421875, "loss": 0.6154, "rewards/accuracies": 0.75, "rewards/chosen": 0.08922234177589417, "rewards/margins": 0.22315536439418793, "rewards/rejected": -0.13393302261829376, "step": 4057 }, { "epoch": 0.6275662091629616, "grad_norm": 6.498647689819336, "learning_rate": 4.393401305991523e-06, "logits/chosen": 13.571723937988281, "logits/rejected": 11.705458641052246, "logps/chosen": -344.78802490234375, "logps/rejected": -301.2410888671875, "loss": 0.567, "rewards/accuracies": 0.75, "rewards/chosen": 0.26479947566986084, "rewards/margins": 0.43605488538742065, "rewards/rejected": -0.17125540971755981, "step": 4058 }, { "epoch": 0.6277208583027257, "grad_norm": 5.898116111755371, "learning_rate": 4.393114904341849e-06, "logits/chosen": 12.315786361694336, "logits/rejected": 13.710007667541504, "logps/chosen": -333.256591796875, "logps/rejected": -360.0888671875, "loss": 0.5931, "rewards/accuracies": 0.5, "rewards/chosen": 0.6876974105834961, "rewards/margins": 0.3988623023033142, "rewards/rejected": 0.2888351380825043, "step": 4059 }, { "epoch": 0.6278755074424899, "grad_norm": 5.135991096496582, "learning_rate": 4.392828502692176e-06, "logits/chosen": 11.135647773742676, "logits/rejected": 4.969450950622559, "logps/chosen": -283.25238037109375, "logps/rejected": -155.79037475585938, "loss": 0.7085, "rewards/accuracies": 0.5, "rewards/chosen": 0.06248144805431366, "rewards/margins": -0.007429918274283409, "rewards/rejected": 0.06991136819124222, "step": 4060 }, { "epoch": 0.628030156582254, "grad_norm": 5.357438087463379, "learning_rate": 4.3925421010425024e-06, "logits/chosen": 13.879589080810547, "logits/rejected": 10.412007331848145, "logps/chosen": -270.187744140625, "logps/rejected": -233.69711303710938, "loss": 0.4302, "rewards/accuracies": 0.875, "rewards/chosen": 0.4024237394332886, "rewards/margins": 0.7195690870285034, "rewards/rejected": -0.3171452581882477, "step": 4061 }, { "epoch": 0.6281848057220182, "grad_norm": 7.012094497680664, "learning_rate": 4.392255699392829e-06, "logits/chosen": 8.41109848022461, "logits/rejected": 1.721659779548645, "logps/chosen": -243.05487060546875, "logps/rejected": -321.6357421875, "loss": 0.6353, "rewards/accuracies": 0.75, "rewards/chosen": 0.2585796117782593, "rewards/margins": 0.31641823053359985, "rewards/rejected": -0.057838618755340576, "step": 4062 }, { "epoch": 0.6283394548617823, "grad_norm": 5.991780757904053, "learning_rate": 4.391969297743156e-06, "logits/chosen": 16.45221710205078, "logits/rejected": 10.865355491638184, "logps/chosen": -296.0362548828125, "logps/rejected": -326.06341552734375, "loss": 0.6332, "rewards/accuracies": 0.625, "rewards/chosen": 0.4141339063644409, "rewards/margins": 0.16587454080581665, "rewards/rejected": 0.24825933575630188, "step": 4063 }, { "epoch": 0.6284941040015465, "grad_norm": 5.383486270904541, "learning_rate": 4.391682896093482e-06, "logits/chosen": 12.996604919433594, "logits/rejected": 5.913437843322754, "logps/chosen": -351.9942626953125, "logps/rejected": -228.33493041992188, "loss": 0.6369, "rewards/accuracies": 0.625, "rewards/chosen": 0.46444129943847656, "rewards/margins": 0.2363877296447754, "rewards/rejected": 0.22805356979370117, "step": 4064 }, { "epoch": 0.6286487531413106, "grad_norm": 5.335546970367432, "learning_rate": 4.391396494443808e-06, "logits/chosen": 11.050395011901855, "logits/rejected": 10.575007438659668, "logps/chosen": -387.0365295410156, "logps/rejected": -293.0663146972656, "loss": 0.5247, "rewards/accuracies": 0.75, "rewards/chosen": 0.611580491065979, "rewards/margins": 0.46571481227874756, "rewards/rejected": 0.14586563408374786, "step": 4065 }, { "epoch": 0.6288034022810748, "grad_norm": 4.146840572357178, "learning_rate": 4.391110092794135e-06, "logits/chosen": 10.987333297729492, "logits/rejected": 8.577617645263672, "logps/chosen": -311.9043273925781, "logps/rejected": -271.08447265625, "loss": 0.5077, "rewards/accuracies": 0.875, "rewards/chosen": 0.6075385212898254, "rewards/margins": 0.4484216570854187, "rewards/rejected": 0.15911683440208435, "step": 4066 }, { "epoch": 0.6289580514208389, "grad_norm": 5.356069564819336, "learning_rate": 4.3908236911444615e-06, "logits/chosen": 16.6180419921875, "logits/rejected": 8.864065170288086, "logps/chosen": -377.5796813964844, "logps/rejected": -223.75234985351562, "loss": 0.4691, "rewards/accuracies": 0.875, "rewards/chosen": 0.7841887474060059, "rewards/margins": 0.6051515340805054, "rewards/rejected": 0.1790371835231781, "step": 4067 }, { "epoch": 0.6291127005606031, "grad_norm": 4.767258644104004, "learning_rate": 4.390537289494788e-06, "logits/chosen": 8.948322296142578, "logits/rejected": 5.09316873550415, "logps/chosen": -317.455078125, "logps/rejected": -229.35791015625, "loss": 0.5538, "rewards/accuracies": 0.75, "rewards/chosen": 0.5640243291854858, "rewards/margins": 0.328153133392334, "rewards/rejected": 0.23587122559547424, "step": 4068 }, { "epoch": 0.6292673497003672, "grad_norm": 4.488232612609863, "learning_rate": 4.390250887845115e-06, "logits/chosen": 14.293383598327637, "logits/rejected": 9.13192081451416, "logps/chosen": -356.42156982421875, "logps/rejected": -197.54701232910156, "loss": 0.5663, "rewards/accuracies": 0.75, "rewards/chosen": 0.6690505146980286, "rewards/margins": 0.31999778747558594, "rewards/rejected": 0.3490527272224426, "step": 4069 }, { "epoch": 0.6294219988401315, "grad_norm": 6.079133987426758, "learning_rate": 4.389964486195441e-06, "logits/chosen": 14.880195617675781, "logits/rejected": 9.341279983520508, "logps/chosen": -359.162353515625, "logps/rejected": -230.6212158203125, "loss": 0.6127, "rewards/accuracies": 0.625, "rewards/chosen": 0.33809083700180054, "rewards/margins": 0.32280534505844116, "rewards/rejected": 0.015285443514585495, "step": 4070 }, { "epoch": 0.6295766479798957, "grad_norm": 4.713767051696777, "learning_rate": 4.389678084545767e-06, "logits/chosen": 9.947871208190918, "logits/rejected": 4.430027008056641, "logps/chosen": -338.67694091796875, "logps/rejected": -253.3353729248047, "loss": 0.4061, "rewards/accuracies": 1.0, "rewards/chosen": 0.7556244730949402, "rewards/margins": 0.7415270209312439, "rewards/rejected": 0.014097407460212708, "step": 4071 }, { "epoch": 0.6297312971196598, "grad_norm": 9.3604097366333, "learning_rate": 4.389391682896094e-06, "logits/chosen": 9.743115425109863, "logits/rejected": 5.437849044799805, "logps/chosen": -323.4690246582031, "logps/rejected": -247.41807556152344, "loss": 0.8407, "rewards/accuracies": 0.5, "rewards/chosen": -0.014921359717845917, "rewards/margins": -0.16871388256549835, "rewards/rejected": 0.15379253029823303, "step": 4072 }, { "epoch": 0.629885946259424, "grad_norm": 4.237896919250488, "learning_rate": 4.3891052812464206e-06, "logits/chosen": 2.015172004699707, "logits/rejected": 7.489939212799072, "logps/chosen": -98.97224426269531, "logps/rejected": -151.1461944580078, "loss": 0.7955, "rewards/accuracies": 0.375, "rewards/chosen": 0.1738617718219757, "rewards/margins": -0.07680667191743851, "rewards/rejected": 0.2506684362888336, "step": 4073 }, { "epoch": 0.6300405953991881, "grad_norm": 4.616840839385986, "learning_rate": 4.388818879596747e-06, "logits/chosen": 8.80966854095459, "logits/rejected": 10.599469184875488, "logps/chosen": -164.808349609375, "logps/rejected": -170.6791534423828, "loss": 0.6441, "rewards/accuracies": 0.75, "rewards/chosen": 0.35915103554725647, "rewards/margins": 0.13075952231884003, "rewards/rejected": 0.22839152812957764, "step": 4074 }, { "epoch": 0.6301952445389523, "grad_norm": 5.706430912017822, "learning_rate": 4.388532477947073e-06, "logits/chosen": 8.524097442626953, "logits/rejected": 7.387247085571289, "logps/chosen": -170.9727783203125, "logps/rejected": -194.60433959960938, "loss": 0.6791, "rewards/accuracies": 0.625, "rewards/chosen": 0.30584239959716797, "rewards/margins": 0.08205842971801758, "rewards/rejected": 0.2237839698791504, "step": 4075 }, { "epoch": 0.6303498936787164, "grad_norm": 4.895020484924316, "learning_rate": 4.3882460762974e-06, "logits/chosen": 16.760425567626953, "logits/rejected": 12.785892486572266, "logps/chosen": -373.61981201171875, "logps/rejected": -346.08734130859375, "loss": 0.5195, "rewards/accuracies": 0.875, "rewards/chosen": 0.4349214732646942, "rewards/margins": 0.44925785064697266, "rewards/rejected": -0.014336392283439636, "step": 4076 }, { "epoch": 0.6305045428184806, "grad_norm": 6.131520748138428, "learning_rate": 4.387959674647726e-06, "logits/chosen": 6.4105610847473145, "logits/rejected": 9.859278678894043, "logps/chosen": -160.2409210205078, "logps/rejected": -226.63900756835938, "loss": 0.675, "rewards/accuracies": 0.5, "rewards/chosen": 0.2909572124481201, "rewards/margins": 0.16879379749298096, "rewards/rejected": 0.12216338515281677, "step": 4077 }, { "epoch": 0.6306591919582447, "grad_norm": 4.614214897155762, "learning_rate": 4.387673272998053e-06, "logits/chosen": 12.445802688598633, "logits/rejected": 7.001782417297363, "logps/chosen": -231.46791076660156, "logps/rejected": -178.32420349121094, "loss": 0.6319, "rewards/accuracies": 0.625, "rewards/chosen": 0.21844187378883362, "rewards/margins": 0.21396130323410034, "rewards/rejected": 0.004480551928281784, "step": 4078 }, { "epoch": 0.6308138410980089, "grad_norm": 6.138950347900391, "learning_rate": 4.387386871348379e-06, "logits/chosen": 10.218317031860352, "logits/rejected": 12.086372375488281, "logps/chosen": -453.27276611328125, "logps/rejected": -471.6017761230469, "loss": 0.77, "rewards/accuracies": 0.5, "rewards/chosen": 0.6691047549247742, "rewards/margins": -0.07518120110034943, "rewards/rejected": 0.7442859411239624, "step": 4079 }, { "epoch": 0.630968490237773, "grad_norm": 7.922305107116699, "learning_rate": 4.387100469698705e-06, "logits/chosen": 11.186609268188477, "logits/rejected": 4.4858479499816895, "logps/chosen": -272.8786315917969, "logps/rejected": -263.168701171875, "loss": 0.7476, "rewards/accuracies": 0.75, "rewards/chosen": 0.2294585257768631, "rewards/margins": -0.003894984722137451, "rewards/rejected": 0.23335354030132294, "step": 4080 }, { "epoch": 0.6311231393775372, "grad_norm": 5.0894012451171875, "learning_rate": 4.386814068049032e-06, "logits/chosen": 8.3685302734375, "logits/rejected": 9.309028625488281, "logps/chosen": -193.14651489257812, "logps/rejected": -212.30909729003906, "loss": 0.8284, "rewards/accuracies": 0.375, "rewards/chosen": 0.19910365343093872, "rewards/margins": -0.1602429896593094, "rewards/rejected": 0.3593466281890869, "step": 4081 }, { "epoch": 0.6312777885173013, "grad_norm": 4.277697563171387, "learning_rate": 4.386527666399359e-06, "logits/chosen": 8.325204849243164, "logits/rejected": 4.172941207885742, "logps/chosen": -358.3304443359375, "logps/rejected": -239.33416748046875, "loss": 0.5362, "rewards/accuracies": 0.75, "rewards/chosen": 0.6739838123321533, "rewards/margins": 0.4819828271865845, "rewards/rejected": 0.19200094044208527, "step": 4082 }, { "epoch": 0.6314324376570656, "grad_norm": 9.58552360534668, "learning_rate": 4.386241264749685e-06, "logits/chosen": 10.95848274230957, "logits/rejected": 6.122400283813477, "logps/chosen": -503.1493225097656, "logps/rejected": -393.48828125, "loss": 0.7519, "rewards/accuracies": 0.625, "rewards/chosen": 0.543312668800354, "rewards/margins": 0.0017075389623641968, "rewards/rejected": 0.5416051149368286, "step": 4083 }, { "epoch": 0.6315870867968297, "grad_norm": 4.633702754974365, "learning_rate": 4.385954863100012e-06, "logits/chosen": 11.489587783813477, "logits/rejected": 9.771651268005371, "logps/chosen": -132.3289337158203, "logps/rejected": -114.9928207397461, "loss": 0.658, "rewards/accuracies": 0.5, "rewards/chosen": 0.04910849779844284, "rewards/margins": 0.12203539907932281, "rewards/rejected": -0.07292690873146057, "step": 4084 }, { "epoch": 0.6317417359365939, "grad_norm": 30.265993118286133, "learning_rate": 4.385668461450338e-06, "logits/chosen": 11.734833717346191, "logits/rejected": 9.424222946166992, "logps/chosen": -279.73077392578125, "logps/rejected": -296.370849609375, "loss": 0.6579, "rewards/accuracies": 0.625, "rewards/chosen": 0.4311927855014801, "rewards/margins": 0.274163156747818, "rewards/rejected": 0.1570296287536621, "step": 4085 }, { "epoch": 0.631896385076358, "grad_norm": 5.297602653503418, "learning_rate": 4.3853820598006645e-06, "logits/chosen": 5.70874547958374, "logits/rejected": 4.449985504150391, "logps/chosen": -257.4253234863281, "logps/rejected": -226.8214111328125, "loss": 0.7094, "rewards/accuracies": 0.375, "rewards/chosen": 0.28552475571632385, "rewards/margins": 0.12024030089378357, "rewards/rejected": 0.1652844399213791, "step": 4086 }, { "epoch": 0.6320510342161222, "grad_norm": 5.243790149688721, "learning_rate": 4.385095658150991e-06, "logits/chosen": 15.595524787902832, "logits/rejected": 7.7643585205078125, "logps/chosen": -273.47283935546875, "logps/rejected": -200.00347900390625, "loss": 0.4896, "rewards/accuracies": 0.75, "rewards/chosen": 0.3304349482059479, "rewards/margins": 0.5725619196891785, "rewards/rejected": -0.2421269416809082, "step": 4087 }, { "epoch": 0.6322056833558863, "grad_norm": 5.842273235321045, "learning_rate": 4.384809256501318e-06, "logits/chosen": 9.634300231933594, "logits/rejected": 9.368896484375, "logps/chosen": -230.87673950195312, "logps/rejected": -249.20358276367188, "loss": 0.7919, "rewards/accuracies": 0.375, "rewards/chosen": 0.16327105462551117, "rewards/margins": -0.15904149413108826, "rewards/rejected": 0.32231253385543823, "step": 4088 }, { "epoch": 0.6323603324956505, "grad_norm": 7.393531322479248, "learning_rate": 4.384522854851644e-06, "logits/chosen": 11.060192108154297, "logits/rejected": 7.803918838500977, "logps/chosen": -329.99383544921875, "logps/rejected": -274.918212890625, "loss": 0.9994, "rewards/accuracies": 0.625, "rewards/chosen": 0.3079463243484497, "rewards/margins": -0.21172676980495453, "rewards/rejected": 0.5196730494499207, "step": 4089 }, { "epoch": 0.6325149816354146, "grad_norm": 3.7282092571258545, "learning_rate": 4.384236453201971e-06, "logits/chosen": 10.73511791229248, "logits/rejected": 9.05099105834961, "logps/chosen": -287.5132141113281, "logps/rejected": -176.15481567382812, "loss": 0.5454, "rewards/accuracies": 0.875, "rewards/chosen": 0.4524175524711609, "rewards/margins": 0.3680635392665863, "rewards/rejected": 0.08435405045747757, "step": 4090 }, { "epoch": 0.6326696307751788, "grad_norm": 6.695165157318115, "learning_rate": 4.383950051552297e-06, "logits/chosen": 13.514520645141602, "logits/rejected": 13.532526969909668, "logps/chosen": -264.879638671875, "logps/rejected": -300.1611633300781, "loss": 0.6222, "rewards/accuracies": 0.75, "rewards/chosen": 0.16812191903591156, "rewards/margins": 0.19048023223876953, "rewards/rejected": -0.022358322516083717, "step": 4091 }, { "epoch": 0.6328242799149429, "grad_norm": 5.201694965362549, "learning_rate": 4.3836636499026235e-06, "logits/chosen": 9.252204895019531, "logits/rejected": 10.041300773620605, "logps/chosen": -239.11080932617188, "logps/rejected": -212.36119079589844, "loss": 0.7278, "rewards/accuracies": 0.375, "rewards/chosen": 0.30925390124320984, "rewards/margins": -0.027025975286960602, "rewards/rejected": 0.33627986907958984, "step": 4092 }, { "epoch": 0.6329789290547071, "grad_norm": 4.844962120056152, "learning_rate": 4.38337724825295e-06, "logits/chosen": 8.327411651611328, "logits/rejected": 10.50060749053955, "logps/chosen": -261.7782287597656, "logps/rejected": -254.94830322265625, "loss": 0.7709, "rewards/accuracies": 0.5, "rewards/chosen": 0.5955994129180908, "rewards/margins": -0.06333044916391373, "rewards/rejected": 0.6589298844337463, "step": 4093 }, { "epoch": 0.6331335781944712, "grad_norm": 6.417834281921387, "learning_rate": 4.383090846603277e-06, "logits/chosen": 6.415332317352295, "logits/rejected": 7.511914253234863, "logps/chosen": -192.75027465820312, "logps/rejected": -206.2389373779297, "loss": 0.7683, "rewards/accuracies": 0.375, "rewards/chosen": 0.3792446553707123, "rewards/margins": -0.10934244096279144, "rewards/rejected": 0.4885871112346649, "step": 4094 }, { "epoch": 0.6332882273342354, "grad_norm": 3.8428287506103516, "learning_rate": 4.3828044449536035e-06, "logits/chosen": 8.29948616027832, "logits/rejected": 4.485544681549072, "logps/chosen": -221.4833984375, "logps/rejected": -184.5738525390625, "loss": 0.6488, "rewards/accuracies": 0.5, "rewards/chosen": 0.4823654294013977, "rewards/margins": 0.12600107491016388, "rewards/rejected": 0.356364369392395, "step": 4095 }, { "epoch": 0.6334428764739997, "grad_norm": 4.322994232177734, "learning_rate": 4.38251804330393e-06, "logits/chosen": 13.05551528930664, "logits/rejected": 13.500021934509277, "logps/chosen": -177.59197998046875, "logps/rejected": -201.31393432617188, "loss": 0.5887, "rewards/accuracies": 0.625, "rewards/chosen": 0.44487255811691284, "rewards/margins": 0.3228353261947632, "rewards/rejected": 0.12203723192214966, "step": 4096 }, { "epoch": 0.6335975256137638, "grad_norm": 6.2394866943359375, "learning_rate": 4.382231641654257e-06, "logits/chosen": 7.5533905029296875, "logits/rejected": 9.271539688110352, "logps/chosen": -274.69775390625, "logps/rejected": -303.3941345214844, "loss": 0.7027, "rewards/accuracies": 0.5, "rewards/chosen": -0.08952885121107101, "rewards/margins": 0.3045582175254822, "rewards/rejected": -0.3940870463848114, "step": 4097 }, { "epoch": 0.633752174753528, "grad_norm": 3.795670509338379, "learning_rate": 4.381945240004583e-06, "logits/chosen": 6.060182571411133, "logits/rejected": 9.485590934753418, "logps/chosen": -146.77719116210938, "logps/rejected": -228.85003662109375, "loss": 0.6222, "rewards/accuracies": 0.625, "rewards/chosen": 0.007003791630268097, "rewards/margins": 0.25495392084121704, "rewards/rejected": -0.24795013666152954, "step": 4098 }, { "epoch": 0.6339068238932921, "grad_norm": 4.512800693511963, "learning_rate": 4.381658838354909e-06, "logits/chosen": 8.659418106079102, "logits/rejected": 9.568912506103516, "logps/chosen": -236.9254608154297, "logps/rejected": -240.45501708984375, "loss": 0.6456, "rewards/accuracies": 0.5, "rewards/chosen": 0.1655310094356537, "rewards/margins": 0.13852977752685547, "rewards/rejected": 0.027001243084669113, "step": 4099 }, { "epoch": 0.6340614730330563, "grad_norm": 5.2828497886657715, "learning_rate": 4.381372436705236e-06, "logits/chosen": 8.062158584594727, "logits/rejected": 8.785603523254395, "logps/chosen": -270.3441162109375, "logps/rejected": -283.97796630859375, "loss": 0.5701, "rewards/accuracies": 0.75, "rewards/chosen": 0.43046730756759644, "rewards/margins": 0.370481014251709, "rewards/rejected": 0.059986360371112823, "step": 4100 }, { "epoch": 0.6342161221728204, "grad_norm": 5.401562213897705, "learning_rate": 4.3810860350555625e-06, "logits/chosen": 7.222257614135742, "logits/rejected": 8.367936134338379, "logps/chosen": -259.8377685546875, "logps/rejected": -279.1836242675781, "loss": 0.5507, "rewards/accuracies": 0.75, "rewards/chosen": 0.34457677602767944, "rewards/margins": 0.39129161834716797, "rewards/rejected": -0.046714894473552704, "step": 4101 }, { "epoch": 0.6343707713125846, "grad_norm": 4.979738712310791, "learning_rate": 4.380799633405889e-06, "logits/chosen": 13.21034049987793, "logits/rejected": 10.843297004699707, "logps/chosen": -248.76193237304688, "logps/rejected": -254.17855834960938, "loss": 0.7634, "rewards/accuracies": 0.625, "rewards/chosen": 0.4047614634037018, "rewards/margins": -0.03327387571334839, "rewards/rejected": 0.43803533911705017, "step": 4102 }, { "epoch": 0.6345254204523487, "grad_norm": 5.004741668701172, "learning_rate": 4.380513231756216e-06, "logits/chosen": 14.181822776794434, "logits/rejected": 6.330324172973633, "logps/chosen": -342.7789001464844, "logps/rejected": -332.1922912597656, "loss": 0.5057, "rewards/accuracies": 0.75, "rewards/chosen": 0.5983216762542725, "rewards/margins": 0.584115743637085, "rewards/rejected": 0.0142059326171875, "step": 4103 }, { "epoch": 0.6346800695921129, "grad_norm": 5.335628986358643, "learning_rate": 4.380226830106542e-06, "logits/chosen": 7.961426258087158, "logits/rejected": 5.05066442489624, "logps/chosen": -321.98577880859375, "logps/rejected": -229.30003356933594, "loss": 0.7195, "rewards/accuracies": 0.375, "rewards/chosen": 0.42834556102752686, "rewards/margins": 0.02657165378332138, "rewards/rejected": 0.40177392959594727, "step": 4104 }, { "epoch": 0.634834718731877, "grad_norm": 5.181117057800293, "learning_rate": 4.379940428456868e-06, "logits/chosen": 14.14529800415039, "logits/rejected": 10.406647682189941, "logps/chosen": -346.9060974121094, "logps/rejected": -245.6342010498047, "loss": 0.614, "rewards/accuracies": 0.625, "rewards/chosen": 0.47927939891815186, "rewards/margins": 0.27702128887176514, "rewards/rejected": 0.20225811004638672, "step": 4105 }, { "epoch": 0.6349893678716412, "grad_norm": 6.674239158630371, "learning_rate": 4.379654026807195e-06, "logits/chosen": 4.582878589630127, "logits/rejected": 7.766049861907959, "logps/chosen": -372.852783203125, "logps/rejected": -364.62176513671875, "loss": 0.6792, "rewards/accuracies": 0.625, "rewards/chosen": 0.3472209870815277, "rewards/margins": 0.18419525027275085, "rewards/rejected": 0.16302573680877686, "step": 4106 }, { "epoch": 0.6351440170114053, "grad_norm": 4.618480682373047, "learning_rate": 4.379367625157522e-06, "logits/chosen": 9.992536544799805, "logits/rejected": 10.62861156463623, "logps/chosen": -274.1743469238281, "logps/rejected": -179.41104125976562, "loss": 0.675, "rewards/accuracies": 0.625, "rewards/chosen": 0.20193880796432495, "rewards/margins": 0.09115152806043625, "rewards/rejected": 0.1107873022556305, "step": 4107 }, { "epoch": 0.6352986661511696, "grad_norm": 5.447495937347412, "learning_rate": 4.379081223507847e-06, "logits/chosen": 7.995439052581787, "logits/rejected": 1.4290084838867188, "logps/chosen": -284.39617919921875, "logps/rejected": -228.349609375, "loss": 0.5617, "rewards/accuracies": 0.625, "rewards/chosen": 0.22853010892868042, "rewards/margins": 0.33122774958610535, "rewards/rejected": -0.10269765555858612, "step": 4108 }, { "epoch": 0.6354533152909337, "grad_norm": 3.807565927505493, "learning_rate": 4.378794821858174e-06, "logits/chosen": 7.251883506774902, "logits/rejected": 4.4661054611206055, "logps/chosen": -242.60794067382812, "logps/rejected": -170.6915283203125, "loss": 0.644, "rewards/accuracies": 0.75, "rewards/chosen": 0.38709479570388794, "rewards/margins": 0.17864611744880676, "rewards/rejected": 0.20844869315624237, "step": 4109 }, { "epoch": 0.6356079644306979, "grad_norm": 4.537283420562744, "learning_rate": 4.378508420208501e-06, "logits/chosen": 8.381706237792969, "logits/rejected": 9.692851066589355, "logps/chosen": -206.86929321289062, "logps/rejected": -324.139404296875, "loss": 0.5956, "rewards/accuracies": 0.625, "rewards/chosen": 0.1472097933292389, "rewards/margins": 0.27765166759490967, "rewards/rejected": -0.1304418444633484, "step": 4110 }, { "epoch": 0.635762613570462, "grad_norm": 5.434293746948242, "learning_rate": 4.378222018558827e-06, "logits/chosen": 10.989086151123047, "logits/rejected": 10.933847427368164, "logps/chosen": -297.72991943359375, "logps/rejected": -222.47848510742188, "loss": 0.6294, "rewards/accuracies": 0.5, "rewards/chosen": 0.5885234475135803, "rewards/margins": 0.24290457367897034, "rewards/rejected": 0.3456188440322876, "step": 4111 }, { "epoch": 0.6359172627102262, "grad_norm": 3.9414892196655273, "learning_rate": 4.377935616909154e-06, "logits/chosen": 10.343132019042969, "logits/rejected": 11.0509614944458, "logps/chosen": -221.460693359375, "logps/rejected": -251.46202087402344, "loss": 0.6261, "rewards/accuracies": 0.625, "rewards/chosen": 0.41311532258987427, "rewards/margins": 0.24716883897781372, "rewards/rejected": 0.16594648361206055, "step": 4112 }, { "epoch": 0.6360719118499903, "grad_norm": 5.2202630043029785, "learning_rate": 4.37764921525948e-06, "logits/chosen": 12.073678016662598, "logits/rejected": 8.554848670959473, "logps/chosen": -357.453369140625, "logps/rejected": -381.09124755859375, "loss": 0.6563, "rewards/accuracies": 0.625, "rewards/chosen": 0.5166854858398438, "rewards/margins": 0.30638766288757324, "rewards/rejected": 0.21029780805110931, "step": 4113 }, { "epoch": 0.6362265609897545, "grad_norm": 4.668791770935059, "learning_rate": 4.3773628136098064e-06, "logits/chosen": 12.560453414916992, "logits/rejected": 9.434642791748047, "logps/chosen": -221.6604461669922, "logps/rejected": -155.72030639648438, "loss": 0.5726, "rewards/accuracies": 0.75, "rewards/chosen": 0.30154722929000854, "rewards/margins": 0.32892292737960815, "rewards/rejected": -0.027375690639019012, "step": 4114 }, { "epoch": 0.6363812101295186, "grad_norm": 17.79600715637207, "learning_rate": 4.377076411960133e-06, "logits/chosen": 11.8092041015625, "logits/rejected": 3.234706401824951, "logps/chosen": -257.2894287109375, "logps/rejected": -227.11520385742188, "loss": 0.683, "rewards/accuracies": 0.625, "rewards/chosen": 0.15519465506076813, "rewards/margins": 0.06854070723056793, "rewards/rejected": 0.0866539403796196, "step": 4115 }, { "epoch": 0.6365358592692828, "grad_norm": 5.477811813354492, "learning_rate": 4.37679001031046e-06, "logits/chosen": 6.463884353637695, "logits/rejected": 6.820122241973877, "logps/chosen": -223.0013427734375, "logps/rejected": -243.29583740234375, "loss": 0.6779, "rewards/accuracies": 0.375, "rewards/chosen": 0.3277086019515991, "rewards/margins": 0.08995497226715088, "rewards/rejected": 0.23775361478328705, "step": 4116 }, { "epoch": 0.636690508409047, "grad_norm": 8.808624267578125, "learning_rate": 4.376503608660786e-06, "logits/chosen": 8.11403751373291, "logits/rejected": 7.373326778411865, "logps/chosen": -274.62457275390625, "logps/rejected": -234.1453857421875, "loss": 0.5498, "rewards/accuracies": 0.75, "rewards/chosen": 0.14819729328155518, "rewards/margins": 0.3993549346923828, "rewards/rejected": -0.25115764141082764, "step": 4117 }, { "epoch": 0.6368451575488111, "grad_norm": 5.85345458984375, "learning_rate": 4.376217207011112e-06, "logits/chosen": 3.6756136417388916, "logits/rejected": 9.300027847290039, "logps/chosen": -240.51461791992188, "logps/rejected": -310.1883544921875, "loss": 0.6567, "rewards/accuracies": 0.75, "rewards/chosen": 0.3116907477378845, "rewards/margins": 0.17528238892555237, "rewards/rejected": 0.13640832901000977, "step": 4118 }, { "epoch": 0.6369998066885753, "grad_norm": 6.3791632652282715, "learning_rate": 4.375930805361439e-06, "logits/chosen": 8.829212188720703, "logits/rejected": 6.205772399902344, "logps/chosen": -354.41888427734375, "logps/rejected": -248.3518524169922, "loss": 0.6486, "rewards/accuracies": 0.75, "rewards/chosen": 0.26041820645332336, "rewards/margins": 0.2785182595252991, "rewards/rejected": -0.018100067973136902, "step": 4119 }, { "epoch": 0.6371544558283394, "grad_norm": 17.831085205078125, "learning_rate": 4.3756444037117655e-06, "logits/chosen": 6.502841472625732, "logits/rejected": 7.498453140258789, "logps/chosen": -224.22491455078125, "logps/rejected": -259.0049743652344, "loss": 0.6452, "rewards/accuracies": 0.5, "rewards/chosen": 0.4141100347042084, "rewards/margins": 0.12821297347545624, "rewards/rejected": 0.28589707612991333, "step": 4120 }, { "epoch": 0.6373091049681037, "grad_norm": 7.046199321746826, "learning_rate": 4.375358002062092e-06, "logits/chosen": 14.548553466796875, "logits/rejected": 7.386445999145508, "logps/chosen": -270.88934326171875, "logps/rejected": -172.00706481933594, "loss": 0.4265, "rewards/accuracies": 1.0, "rewards/chosen": 0.4424956440925598, "rewards/margins": 0.6944729685783386, "rewards/rejected": -0.2519773244857788, "step": 4121 }, { "epoch": 0.6374637541078678, "grad_norm": 4.747237205505371, "learning_rate": 4.375071600412419e-06, "logits/chosen": 11.472861289978027, "logits/rejected": 7.186437129974365, "logps/chosen": -238.30471801757812, "logps/rejected": -217.9029083251953, "loss": 0.5835, "rewards/accuracies": 0.5, "rewards/chosen": 0.4564840793609619, "rewards/margins": 0.30192387104034424, "rewards/rejected": 0.15456023812294006, "step": 4122 }, { "epoch": 0.637618403247632, "grad_norm": 4.763288497924805, "learning_rate": 4.3747851987627455e-06, "logits/chosen": 10.446076393127441, "logits/rejected": 3.2043774127960205, "logps/chosen": -384.4615173339844, "logps/rejected": -287.5572814941406, "loss": 0.5123, "rewards/accuracies": 0.875, "rewards/chosen": 0.501357913017273, "rewards/margins": 0.5178449153900146, "rewards/rejected": -0.01648702472448349, "step": 4123 }, { "epoch": 0.6377730523873961, "grad_norm": 11.977688789367676, "learning_rate": 4.374498797113071e-06, "logits/chosen": 16.81658172607422, "logits/rejected": 13.944828033447266, "logps/chosen": -328.14776611328125, "logps/rejected": -306.6591491699219, "loss": 0.7031, "rewards/accuracies": 0.5, "rewards/chosen": 0.39857369661331177, "rewards/margins": 0.07378018647432327, "rewards/rejected": 0.3247935175895691, "step": 4124 }, { "epoch": 0.6379277015271603, "grad_norm": 6.331130027770996, "learning_rate": 4.374212395463398e-06, "logits/chosen": 13.306492805480957, "logits/rejected": 7.61560583114624, "logps/chosen": -358.43426513671875, "logps/rejected": -205.383544921875, "loss": 0.767, "rewards/accuracies": 0.625, "rewards/chosen": 0.10752954334020615, "rewards/margins": 0.04026623070240021, "rewards/rejected": 0.06726332008838654, "step": 4125 }, { "epoch": 0.6380823506669244, "grad_norm": 6.325943946838379, "learning_rate": 4.3739259938137246e-06, "logits/chosen": 7.5689239501953125, "logits/rejected": 7.183380126953125, "logps/chosen": -171.04945373535156, "logps/rejected": -195.79745483398438, "loss": 0.7774, "rewards/accuracies": 0.375, "rewards/chosen": -0.3120458722114563, "rewards/margins": -0.043240297585725784, "rewards/rejected": -0.2688056230545044, "step": 4126 }, { "epoch": 0.6382369998066886, "grad_norm": 3.946017265319824, "learning_rate": 4.373639592164051e-06, "logits/chosen": 10.310894012451172, "logits/rejected": 11.078542709350586, "logps/chosen": -218.87835693359375, "logps/rejected": -229.44537353515625, "loss": 0.5646, "rewards/accuracies": 0.75, "rewards/chosen": 0.14756150543689728, "rewards/margins": 0.3348393440246582, "rewards/rejected": -0.18727785348892212, "step": 4127 }, { "epoch": 0.6383916489464527, "grad_norm": 3.8895344734191895, "learning_rate": 4.373353190514378e-06, "logits/chosen": 9.92867374420166, "logits/rejected": 3.5982136726379395, "logps/chosen": -198.17742919921875, "logps/rejected": -133.93963623046875, "loss": 0.6661, "rewards/accuracies": 0.5, "rewards/chosen": -0.00034789741039276123, "rewards/margins": 0.06967587769031525, "rewards/rejected": -0.0700237825512886, "step": 4128 }, { "epoch": 0.6385462980862169, "grad_norm": 3.924588441848755, "learning_rate": 4.3730667888647045e-06, "logits/chosen": 10.009392738342285, "logits/rejected": 6.078564643859863, "logps/chosen": -195.566162109375, "logps/rejected": -137.6949005126953, "loss": 0.6163, "rewards/accuracies": 0.625, "rewards/chosen": 0.20212063193321228, "rewards/margins": 0.3275145888328552, "rewards/rejected": -0.12539394199848175, "step": 4129 }, { "epoch": 0.638700947225981, "grad_norm": 5.081210613250732, "learning_rate": 4.372780387215031e-06, "logits/chosen": 6.342856407165527, "logits/rejected": 6.001924991607666, "logps/chosen": -323.96417236328125, "logps/rejected": -259.64434814453125, "loss": 0.6038, "rewards/accuracies": 0.625, "rewards/chosen": 0.4809591770172119, "rewards/margins": 0.31086859107017517, "rewards/rejected": 0.17009061574935913, "step": 4130 }, { "epoch": 0.6388555963657452, "grad_norm": 6.727889060974121, "learning_rate": 4.372493985565357e-06, "logits/chosen": 7.062030792236328, "logits/rejected": 11.667972564697266, "logps/chosen": -161.51248168945312, "logps/rejected": -234.14161682128906, "loss": 0.9305, "rewards/accuracies": 0.25, "rewards/chosen": -0.10976529866456985, "rewards/margins": -0.35832956433296204, "rewards/rejected": 0.2485642433166504, "step": 4131 }, { "epoch": 0.6390102455055093, "grad_norm": 3.9513015747070312, "learning_rate": 4.372207583915684e-06, "logits/chosen": 6.404477119445801, "logits/rejected": 1.3548694849014282, "logps/chosen": -203.42523193359375, "logps/rejected": -146.09397888183594, "loss": 0.5237, "rewards/accuracies": 0.5, "rewards/chosen": 0.07050585746765137, "rewards/margins": 0.4598422944545746, "rewards/rejected": -0.3893364667892456, "step": 4132 }, { "epoch": 0.6391648946452735, "grad_norm": 6.383387565612793, "learning_rate": 4.37192118226601e-06, "logits/chosen": 10.687776565551758, "logits/rejected": 6.707417964935303, "logps/chosen": -368.4574279785156, "logps/rejected": -281.74859619140625, "loss": 0.77, "rewards/accuracies": 0.375, "rewards/chosen": 0.12452947348356247, "rewards/margins": -0.053909800946712494, "rewards/rejected": 0.17843925952911377, "step": 4133 }, { "epoch": 0.6393195437850377, "grad_norm": 4.701065540313721, "learning_rate": 4.371634780616337e-06, "logits/chosen": 12.130321502685547, "logits/rejected": 5.955464839935303, "logps/chosen": -299.20086669921875, "logps/rejected": -204.91860961914062, "loss": 0.5892, "rewards/accuracies": 0.625, "rewards/chosen": 0.06123046204447746, "rewards/margins": 0.30267924070358276, "rewards/rejected": -0.241448774933815, "step": 4134 }, { "epoch": 0.6394741929248019, "grad_norm": 7.378091812133789, "learning_rate": 4.3713483789666636e-06, "logits/chosen": 7.9696478843688965, "logits/rejected": 5.825591087341309, "logps/chosen": -308.034423828125, "logps/rejected": -290.70965576171875, "loss": 0.9042, "rewards/accuracies": 0.25, "rewards/chosen": -0.09308815002441406, "rewards/margins": -0.31664466857910156, "rewards/rejected": 0.2235565185546875, "step": 4135 }, { "epoch": 0.639628842064566, "grad_norm": 15.90924072265625, "learning_rate": 4.37106197731699e-06, "logits/chosen": 6.8234052658081055, "logits/rejected": 6.639256954193115, "logps/chosen": -332.3922119140625, "logps/rejected": -335.5267639160156, "loss": 0.8146, "rewards/accuracies": 0.375, "rewards/chosen": 0.1977781355381012, "rewards/margins": -0.1273650974035263, "rewards/rejected": 0.3251432478427887, "step": 4136 }, { "epoch": 0.6397834912043302, "grad_norm": 4.474390983581543, "learning_rate": 4.370775575667317e-06, "logits/chosen": 12.286456108093262, "logits/rejected": 6.718454360961914, "logps/chosen": -298.04364013671875, "logps/rejected": -198.03721618652344, "loss": 0.6089, "rewards/accuracies": 0.75, "rewards/chosen": 0.33563196659088135, "rewards/margins": 0.21411949396133423, "rewards/rejected": 0.12151246517896652, "step": 4137 }, { "epoch": 0.6399381403440944, "grad_norm": 4.8385491371154785, "learning_rate": 4.370489174017643e-06, "logits/chosen": 10.93194580078125, "logits/rejected": 8.764164924621582, "logps/chosen": -333.0178527832031, "logps/rejected": -281.819091796875, "loss": 0.5366, "rewards/accuracies": 0.875, "rewards/chosen": 0.40943777561187744, "rewards/margins": 0.3770712912082672, "rewards/rejected": 0.03236646205186844, "step": 4138 }, { "epoch": 0.6400927894838585, "grad_norm": 5.442009925842285, "learning_rate": 4.370202772367969e-06, "logits/chosen": 14.258310317993164, "logits/rejected": 10.787344932556152, "logps/chosen": -314.5521545410156, "logps/rejected": -272.953369140625, "loss": 0.5668, "rewards/accuracies": 0.75, "rewards/chosen": 0.3717292547225952, "rewards/margins": 0.31740519404411316, "rewards/rejected": 0.05432405695319176, "step": 4139 }, { "epoch": 0.6402474386236227, "grad_norm": 4.450315952301025, "learning_rate": 4.369916370718296e-06, "logits/chosen": 11.830985069274902, "logits/rejected": 10.443124771118164, "logps/chosen": -175.95892333984375, "logps/rejected": -172.8510284423828, "loss": 0.7064, "rewards/accuracies": 0.5, "rewards/chosen": 0.17391079664230347, "rewards/margins": -0.0089874267578125, "rewards/rejected": 0.18289823830127716, "step": 4140 }, { "epoch": 0.6404020877633868, "grad_norm": 5.178179740905762, "learning_rate": 4.369629969068623e-06, "logits/chosen": 11.477242469787598, "logits/rejected": 6.701263427734375, "logps/chosen": -367.078125, "logps/rejected": -259.4224548339844, "loss": 0.5708, "rewards/accuracies": 0.5, "rewards/chosen": 0.3994961082935333, "rewards/margins": 0.43383142352104187, "rewards/rejected": -0.03433533012866974, "step": 4141 }, { "epoch": 0.640556736903151, "grad_norm": 4.926456451416016, "learning_rate": 4.369343567418948e-06, "logits/chosen": 9.36685848236084, "logits/rejected": 3.643974781036377, "logps/chosen": -264.55255126953125, "logps/rejected": -194.30076599121094, "loss": 0.6731, "rewards/accuracies": 0.625, "rewards/chosen": 0.20452560484409332, "rewards/margins": 0.18341843783855438, "rewards/rejected": 0.021107204258441925, "step": 4142 }, { "epoch": 0.6407113860429151, "grad_norm": 3.3367748260498047, "learning_rate": 4.369057165769275e-06, "logits/chosen": 11.512624740600586, "logits/rejected": 6.5777716636657715, "logps/chosen": -255.5541229248047, "logps/rejected": -189.23135375976562, "loss": 0.4946, "rewards/accuracies": 0.875, "rewards/chosen": 0.28150850534439087, "rewards/margins": 0.5028558969497681, "rewards/rejected": -0.2213473916053772, "step": 4143 }, { "epoch": 0.6408660351826793, "grad_norm": 4.676826000213623, "learning_rate": 4.368770764119602e-06, "logits/chosen": 8.315370559692383, "logits/rejected": 4.311043739318848, "logps/chosen": -316.3711242675781, "logps/rejected": -262.5533752441406, "loss": 0.4354, "rewards/accuracies": 0.75, "rewards/chosen": 0.48535990715026855, "rewards/margins": 0.7464770674705505, "rewards/rejected": -0.261117160320282, "step": 4144 }, { "epoch": 0.6410206843224434, "grad_norm": 6.268926620483398, "learning_rate": 4.368484362469928e-06, "logits/chosen": 12.007072448730469, "logits/rejected": 6.08164119720459, "logps/chosen": -357.03472900390625, "logps/rejected": -233.7212371826172, "loss": 0.5375, "rewards/accuracies": 0.625, "rewards/chosen": 0.2798842489719391, "rewards/margins": 0.6042281985282898, "rewards/rejected": -0.3243439197540283, "step": 4145 }, { "epoch": 0.6411753334622076, "grad_norm": 6.8585615158081055, "learning_rate": 4.368197960820255e-06, "logits/chosen": 8.012149810791016, "logits/rejected": 4.651320457458496, "logps/chosen": -277.5685729980469, "logps/rejected": -251.92120361328125, "loss": 0.6911, "rewards/accuracies": 0.5, "rewards/chosen": 0.022730447351932526, "rewards/margins": 0.04365553334355354, "rewards/rejected": -0.020925089716911316, "step": 4146 }, { "epoch": 0.6413299826019718, "grad_norm": 6.885255336761475, "learning_rate": 4.367911559170581e-06, "logits/chosen": 9.24830150604248, "logits/rejected": 11.41317367553711, "logps/chosen": -317.18365478515625, "logps/rejected": -357.7243347167969, "loss": 0.7713, "rewards/accuracies": 0.625, "rewards/chosen": 0.21779154241085052, "rewards/margins": 0.1615956574678421, "rewards/rejected": 0.056195855140686035, "step": 4147 }, { "epoch": 0.641484631741736, "grad_norm": 6.580196857452393, "learning_rate": 4.3676251575209075e-06, "logits/chosen": 12.149304389953613, "logits/rejected": 7.330109596252441, "logps/chosen": -377.93499755859375, "logps/rejected": -256.2394104003906, "loss": 0.4987, "rewards/accuracies": 0.875, "rewards/chosen": 0.4020899832248688, "rewards/margins": 0.47229433059692383, "rewards/rejected": -0.07020434737205505, "step": 4148 }, { "epoch": 0.6416392808815001, "grad_norm": 6.997609615325928, "learning_rate": 4.367338755871234e-06, "logits/chosen": 13.607803344726562, "logits/rejected": 7.595705032348633, "logps/chosen": -258.41131591796875, "logps/rejected": -247.26422119140625, "loss": 0.611, "rewards/accuracies": 0.75, "rewards/chosen": 0.3885740339756012, "rewards/margins": 0.2115287482738495, "rewards/rejected": 0.1770452857017517, "step": 4149 }, { "epoch": 0.6417939300212643, "grad_norm": 14.422523498535156, "learning_rate": 4.367052354221561e-06, "logits/chosen": 9.782825469970703, "logits/rejected": 6.187810897827148, "logps/chosen": -267.31878662109375, "logps/rejected": -216.63388061523438, "loss": 0.5862, "rewards/accuracies": 0.625, "rewards/chosen": 0.3532275855541229, "rewards/margins": 0.31848013401031494, "rewards/rejected": 0.03474743664264679, "step": 4150 }, { "epoch": 0.6419485791610284, "grad_norm": 5.5749077796936035, "learning_rate": 4.366765952571887e-06, "logits/chosen": 6.825323104858398, "logits/rejected": 9.243118286132812, "logps/chosen": -306.1999816894531, "logps/rejected": -338.7637939453125, "loss": 0.72, "rewards/accuracies": 0.5, "rewards/chosen": 0.07453508675098419, "rewards/margins": 0.00998525321483612, "rewards/rejected": 0.06454983353614807, "step": 4151 }, { "epoch": 0.6421032283007926, "grad_norm": 4.406378746032715, "learning_rate": 4.366479550922213e-06, "logits/chosen": 18.790006637573242, "logits/rejected": 10.795914649963379, "logps/chosen": -246.80125427246094, "logps/rejected": -188.6220703125, "loss": 0.6544, "rewards/accuracies": 0.5, "rewards/chosen": 0.16750024259090424, "rewards/margins": 0.2535654604434967, "rewards/rejected": -0.08606519550085068, "step": 4152 }, { "epoch": 0.6422578774405567, "grad_norm": 4.765065670013428, "learning_rate": 4.36619314927254e-06, "logits/chosen": 6.78310489654541, "logits/rejected": 4.059783458709717, "logps/chosen": -220.06101989746094, "logps/rejected": -161.9205322265625, "loss": 0.6182, "rewards/accuracies": 0.75, "rewards/chosen": 0.26454222202301025, "rewards/margins": 0.19134798645973206, "rewards/rejected": 0.073194220662117, "step": 4153 }, { "epoch": 0.6424125265803209, "grad_norm": 5.132529258728027, "learning_rate": 4.3659067476228665e-06, "logits/chosen": 10.343584060668945, "logits/rejected": 10.764266014099121, "logps/chosen": -337.9837951660156, "logps/rejected": -319.16815185546875, "loss": 0.5137, "rewards/accuracies": 0.875, "rewards/chosen": 0.6351209878921509, "rewards/margins": 0.5357867479324341, "rewards/rejected": 0.0993342399597168, "step": 4154 }, { "epoch": 0.642567175720085, "grad_norm": 4.381292819976807, "learning_rate": 4.365620345973193e-06, "logits/chosen": 6.6890869140625, "logits/rejected": 3.8603034019470215, "logps/chosen": -221.19137573242188, "logps/rejected": -180.6158905029297, "loss": 0.5325, "rewards/accuracies": 0.75, "rewards/chosen": 0.1360655277967453, "rewards/margins": 0.44591212272644043, "rewards/rejected": -0.30984658002853394, "step": 4155 }, { "epoch": 0.6427218248598492, "grad_norm": 5.729649066925049, "learning_rate": 4.36533394432352e-06, "logits/chosen": 10.13838005065918, "logits/rejected": 7.824774265289307, "logps/chosen": -287.04559326171875, "logps/rejected": -291.3411865234375, "loss": 0.737, "rewards/accuracies": 0.5, "rewards/chosen": 0.30130115151405334, "rewards/margins": -0.0588393434882164, "rewards/rejected": 0.36014050245285034, "step": 4156 }, { "epoch": 0.6428764739996133, "grad_norm": 3.3064358234405518, "learning_rate": 4.365047542673846e-06, "logits/chosen": 10.453022003173828, "logits/rejected": 4.2874555587768555, "logps/chosen": -217.84133911132812, "logps/rejected": -160.44595336914062, "loss": 0.4465, "rewards/accuracies": 0.75, "rewards/chosen": 0.4284857511520386, "rewards/margins": 0.7460076212882996, "rewards/rejected": -0.317521870136261, "step": 4157 }, { "epoch": 0.6430311231393775, "grad_norm": 5.186545372009277, "learning_rate": 4.364761141024172e-06, "logits/chosen": 7.370980262756348, "logits/rejected": 5.057917594909668, "logps/chosen": -223.27980041503906, "logps/rejected": -226.22323608398438, "loss": 0.5646, "rewards/accuracies": 0.625, "rewards/chosen": 0.1933669149875641, "rewards/margins": 0.36912909150123596, "rewards/rejected": -0.17576219141483307, "step": 4158 }, { "epoch": 0.6431857722791416, "grad_norm": 7.477304458618164, "learning_rate": 4.364474739374499e-06, "logits/chosen": 10.044393539428711, "logits/rejected": 12.840227127075195, "logps/chosen": -259.44842529296875, "logps/rejected": -343.2508544921875, "loss": 0.7668, "rewards/accuracies": 0.375, "rewards/chosen": 0.09966926276683807, "rewards/margins": -0.07662869989871979, "rewards/rejected": 0.17629796266555786, "step": 4159 }, { "epoch": 0.6433404214189059, "grad_norm": 4.798598289489746, "learning_rate": 4.364188337724826e-06, "logits/chosen": 11.262516975402832, "logits/rejected": 8.867025375366211, "logps/chosen": -219.349609375, "logps/rejected": -219.7161865234375, "loss": 0.6082, "rewards/accuracies": 0.75, "rewards/chosen": 0.4474555253982544, "rewards/margins": 0.23363742232322693, "rewards/rejected": 0.21381810307502747, "step": 4160 }, { "epoch": 0.6434950705586701, "grad_norm": 8.234296798706055, "learning_rate": 4.363901936075152e-06, "logits/chosen": 6.797112464904785, "logits/rejected": 4.032725811004639, "logps/chosen": -317.3948974609375, "logps/rejected": -278.6521301269531, "loss": 0.8229, "rewards/accuracies": 0.625, "rewards/chosen": 0.214555025100708, "rewards/margins": -0.07091964781284332, "rewards/rejected": 0.28547465801239014, "step": 4161 }, { "epoch": 0.6436497196984342, "grad_norm": 7.59962272644043, "learning_rate": 4.363615534425479e-06, "logits/chosen": 9.691587448120117, "logits/rejected": 6.5549702644348145, "logps/chosen": -318.69891357421875, "logps/rejected": -385.31280517578125, "loss": 0.7351, "rewards/accuracies": 0.625, "rewards/chosen": 0.22610989212989807, "rewards/margins": -0.055990688502788544, "rewards/rejected": 0.2821005582809448, "step": 4162 }, { "epoch": 0.6438043688381984, "grad_norm": 7.549393653869629, "learning_rate": 4.3633291327758055e-06, "logits/chosen": 6.954405784606934, "logits/rejected": 9.473011016845703, "logps/chosen": -393.3813171386719, "logps/rejected": -428.99505615234375, "loss": 0.7889, "rewards/accuracies": 0.5, "rewards/chosen": -0.0068602971732616425, "rewards/margins": 0.016147777438163757, "rewards/rejected": -0.023008093237876892, "step": 4163 }, { "epoch": 0.6439590179779625, "grad_norm": 5.818005084991455, "learning_rate": 4.363042731126131e-06, "logits/chosen": 8.630800247192383, "logits/rejected": 14.12895393371582, "logps/chosen": -236.03073120117188, "logps/rejected": -383.80047607421875, "loss": 0.5303, "rewards/accuracies": 0.625, "rewards/chosen": 0.24141044914722443, "rewards/margins": 0.4820749759674072, "rewards/rejected": -0.24066448211669922, "step": 4164 }, { "epoch": 0.6441136671177267, "grad_norm": 4.954399108886719, "learning_rate": 4.362756329476458e-06, "logits/chosen": 9.562021255493164, "logits/rejected": 5.233008861541748, "logps/chosen": -284.25341796875, "logps/rejected": -194.68319702148438, "loss": 0.6082, "rewards/accuracies": 0.5, "rewards/chosen": 0.3352839946746826, "rewards/margins": 0.26421594619750977, "rewards/rejected": 0.07106802612543106, "step": 4165 }, { "epoch": 0.6442683162574908, "grad_norm": 2.904010534286499, "learning_rate": 4.362469927826785e-06, "logits/chosen": 12.85045337677002, "logits/rejected": 11.012163162231445, "logps/chosen": -220.31076049804688, "logps/rejected": -179.41116333007812, "loss": 0.5487, "rewards/accuracies": 0.625, "rewards/chosen": 0.3166337013244629, "rewards/margins": 0.4388699531555176, "rewards/rejected": -0.12223625183105469, "step": 4166 }, { "epoch": 0.644422965397255, "grad_norm": 4.373697757720947, "learning_rate": 4.362183526177111e-06, "logits/chosen": 11.535093307495117, "logits/rejected": 6.671360015869141, "logps/chosen": -173.48941040039062, "logps/rejected": -137.11614990234375, "loss": 0.7012, "rewards/accuracies": 0.625, "rewards/chosen": 0.0783420279622078, "rewards/margins": 0.010277032852172852, "rewards/rejected": 0.06806500256061554, "step": 4167 }, { "epoch": 0.6445776145370191, "grad_norm": 7.098147869110107, "learning_rate": 4.361897124527438e-06, "logits/chosen": 11.066957473754883, "logits/rejected": 5.847835063934326, "logps/chosen": -542.644287109375, "logps/rejected": -420.784423828125, "loss": 0.5313, "rewards/accuracies": 0.625, "rewards/chosen": 0.5807441473007202, "rewards/margins": 0.4839019775390625, "rewards/rejected": 0.0968421921133995, "step": 4168 }, { "epoch": 0.6447322636767833, "grad_norm": 5.421589374542236, "learning_rate": 4.361610722877765e-06, "logits/chosen": 9.260985374450684, "logits/rejected": 7.635785102844238, "logps/chosen": -272.85833740234375, "logps/rejected": -278.8537902832031, "loss": 0.7274, "rewards/accuracies": 0.375, "rewards/chosen": -0.08183684945106506, "rewards/margins": -0.041879698634147644, "rewards/rejected": -0.039957139641046524, "step": 4169 }, { "epoch": 0.6448869128165474, "grad_norm": 13.882923126220703, "learning_rate": 4.361324321228091e-06, "logits/chosen": 5.356777191162109, "logits/rejected": 12.102420806884766, "logps/chosen": -246.50875854492188, "logps/rejected": -345.19415283203125, "loss": 1.1131, "rewards/accuracies": 0.125, "rewards/chosen": -0.2506532669067383, "rewards/margins": -0.6569260358810425, "rewards/rejected": 0.4062727093696594, "step": 4170 }, { "epoch": 0.6450415619563116, "grad_norm": 4.450241565704346, "learning_rate": 4.361037919578417e-06, "logits/chosen": 9.326611518859863, "logits/rejected": 5.942061424255371, "logps/chosen": -283.51800537109375, "logps/rejected": -245.8580322265625, "loss": 0.6022, "rewards/accuracies": 0.75, "rewards/chosen": 0.2722225785255432, "rewards/margins": 0.2200319617986679, "rewards/rejected": 0.052190594375133514, "step": 4171 }, { "epoch": 0.6451962110960758, "grad_norm": 5.066701889038086, "learning_rate": 4.360751517928744e-06, "logits/chosen": 10.758315086364746, "logits/rejected": 10.636816024780273, "logps/chosen": -234.97216796875, "logps/rejected": -190.72586059570312, "loss": 0.7421, "rewards/accuracies": 0.625, "rewards/chosen": -0.2692779302597046, "rewards/margins": -0.02789749950170517, "rewards/rejected": -0.24138040840625763, "step": 4172 }, { "epoch": 0.64535086023584, "grad_norm": 6.414374828338623, "learning_rate": 4.36046511627907e-06, "logits/chosen": 5.687942981719971, "logits/rejected": 8.49539566040039, "logps/chosen": -301.84619140625, "logps/rejected": -376.5115966796875, "loss": 0.6732, "rewards/accuracies": 0.625, "rewards/chosen": 0.4581771492958069, "rewards/margins": 0.07761573791503906, "rewards/rejected": 0.3805614411830902, "step": 4173 }, { "epoch": 0.6455055093756041, "grad_norm": 5.29481840133667, "learning_rate": 4.360178714629397e-06, "logits/chosen": 10.033263206481934, "logits/rejected": 4.910677433013916, "logps/chosen": -306.9974060058594, "logps/rejected": -265.02911376953125, "loss": 0.6484, "rewards/accuracies": 0.625, "rewards/chosen": 0.13457468152046204, "rewards/margins": 0.13367551565170288, "rewards/rejected": 0.0008991807699203491, "step": 4174 }, { "epoch": 0.6456601585153683, "grad_norm": 5.107746601104736, "learning_rate": 4.359892312979724e-06, "logits/chosen": 11.324872970581055, "logits/rejected": 10.652301788330078, "logps/chosen": -173.3695526123047, "logps/rejected": -146.36082458496094, "loss": 0.6692, "rewards/accuracies": 0.5, "rewards/chosen": 0.185777485370636, "rewards/margins": 0.07777142524719238, "rewards/rejected": 0.10800604522228241, "step": 4175 }, { "epoch": 0.6458148076551324, "grad_norm": 3.1758272647857666, "learning_rate": 4.3596059113300495e-06, "logits/chosen": 13.734806060791016, "logits/rejected": 7.527505874633789, "logps/chosen": -212.08401489257812, "logps/rejected": -142.43222045898438, "loss": 0.4843, "rewards/accuracies": 0.75, "rewards/chosen": 0.14847631752490997, "rewards/margins": 0.6263341903686523, "rewards/rejected": -0.47785788774490356, "step": 4176 }, { "epoch": 0.6459694567948966, "grad_norm": 6.027726650238037, "learning_rate": 4.359319509680376e-06, "logits/chosen": 10.803421020507812, "logits/rejected": 7.702727317810059, "logps/chosen": -344.2530517578125, "logps/rejected": -239.4901885986328, "loss": 0.7127, "rewards/accuracies": 0.75, "rewards/chosen": -0.1887633353471756, "rewards/margins": 0.030370034277439117, "rewards/rejected": -0.2191333770751953, "step": 4177 }, { "epoch": 0.6461241059346607, "grad_norm": 5.2316484451293945, "learning_rate": 4.359033108030703e-06, "logits/chosen": 7.239022254943848, "logits/rejected": 5.777777671813965, "logps/chosen": -302.7624816894531, "logps/rejected": -222.5479278564453, "loss": 0.5206, "rewards/accuracies": 0.625, "rewards/chosen": 0.1200920045375824, "rewards/margins": 0.4413255453109741, "rewards/rejected": -0.3212335705757141, "step": 4178 }, { "epoch": 0.6462787550744249, "grad_norm": 4.305361270904541, "learning_rate": 4.358746706381029e-06, "logits/chosen": 8.571762084960938, "logits/rejected": 10.736604690551758, "logps/chosen": -164.2189483642578, "logps/rejected": -203.00970458984375, "loss": 0.7198, "rewards/accuracies": 0.625, "rewards/chosen": -0.17898684740066528, "rewards/margins": -0.0001862645149230957, "rewards/rejected": -0.1788005828857422, "step": 4179 }, { "epoch": 0.646433404214189, "grad_norm": 5.800814628601074, "learning_rate": 4.358460304731355e-06, "logits/chosen": 5.612978935241699, "logits/rejected": 0.43852460384368896, "logps/chosen": -403.4559631347656, "logps/rejected": -282.67730712890625, "loss": 0.6122, "rewards/accuracies": 0.875, "rewards/chosen": 0.30948859453201294, "rewards/margins": 0.40233543515205383, "rewards/rejected": -0.09284687042236328, "step": 4180 }, { "epoch": 0.6465880533539532, "grad_norm": 4.620627403259277, "learning_rate": 4.358173903081682e-06, "logits/chosen": 12.765654563903809, "logits/rejected": 7.236401081085205, "logps/chosen": -258.3139343261719, "logps/rejected": -238.3670654296875, "loss": 0.5015, "rewards/accuracies": 0.75, "rewards/chosen": 0.36743491888046265, "rewards/margins": 0.5095010995864868, "rewards/rejected": -0.14206616580486298, "step": 4181 }, { "epoch": 0.6467427024937173, "grad_norm": 4.559831619262695, "learning_rate": 4.3578875014320085e-06, "logits/chosen": 12.864474296569824, "logits/rejected": 3.5196242332458496, "logps/chosen": -395.32879638671875, "logps/rejected": -253.01942443847656, "loss": 0.5143, "rewards/accuracies": 0.625, "rewards/chosen": 0.5292133688926697, "rewards/margins": 0.7272636890411377, "rewards/rejected": -0.19805032014846802, "step": 4182 }, { "epoch": 0.6468973516334815, "grad_norm": 7.63358211517334, "learning_rate": 4.357601099782335e-06, "logits/chosen": 13.847896575927734, "logits/rejected": 11.406129837036133, "logps/chosen": -362.0840148925781, "logps/rejected": -313.28045654296875, "loss": 0.6591, "rewards/accuracies": 0.75, "rewards/chosen": 0.44009914994239807, "rewards/margins": 0.24275389313697815, "rewards/rejected": 0.19734525680541992, "step": 4183 }, { "epoch": 0.6470520007732457, "grad_norm": 5.2918219566345215, "learning_rate": 4.357314698132662e-06, "logits/chosen": 10.877615928649902, "logits/rejected": 8.881293296813965, "logps/chosen": -247.1022491455078, "logps/rejected": -227.22557067871094, "loss": 0.6177, "rewards/accuracies": 0.5, "rewards/chosen": 0.3296305537223816, "rewards/margins": 0.2166401445865631, "rewards/rejected": 0.1129903793334961, "step": 4184 }, { "epoch": 0.6472066499130099, "grad_norm": 9.23476505279541, "learning_rate": 4.357028296482988e-06, "logits/chosen": 13.849346160888672, "logits/rejected": 6.874018669128418, "logps/chosen": -326.7024841308594, "logps/rejected": -259.6429443359375, "loss": 0.8702, "rewards/accuracies": 0.5, "rewards/chosen": -0.20097070932388306, "rewards/margins": -0.19209720194339752, "rewards/rejected": -0.00887349247932434, "step": 4185 }, { "epoch": 0.6473612990527741, "grad_norm": 6.325669288635254, "learning_rate": 4.356741894833314e-06, "logits/chosen": 7.294525146484375, "logits/rejected": 8.089756965637207, "logps/chosen": -322.8529052734375, "logps/rejected": -325.2636413574219, "loss": 0.664, "rewards/accuracies": 0.625, "rewards/chosen": 0.3753849267959595, "rewards/margins": 0.1250227987766266, "rewards/rejected": 0.2503621280193329, "step": 4186 }, { "epoch": 0.6475159481925382, "grad_norm": 4.741813659667969, "learning_rate": 4.356455493183641e-06, "logits/chosen": 13.535446166992188, "logits/rejected": 4.636565208435059, "logps/chosen": -367.2870788574219, "logps/rejected": -239.84005737304688, "loss": 0.5585, "rewards/accuracies": 0.75, "rewards/chosen": 0.31642550230026245, "rewards/margins": 0.44724729657173157, "rewards/rejected": -0.1308218091726303, "step": 4187 }, { "epoch": 0.6476705973323024, "grad_norm": 4.872653484344482, "learning_rate": 4.3561690915339676e-06, "logits/chosen": 9.763392448425293, "logits/rejected": 14.847209930419922, "logps/chosen": -214.76821899414062, "logps/rejected": -308.6475524902344, "loss": 0.5787, "rewards/accuracies": 0.75, "rewards/chosen": 0.37203243374824524, "rewards/margins": 0.3971463739871979, "rewards/rejected": -0.02511395514011383, "step": 4188 }, { "epoch": 0.6478252464720665, "grad_norm": 4.700504779815674, "learning_rate": 4.355882689884294e-06, "logits/chosen": 6.039549827575684, "logits/rejected": 0.44818228483200073, "logps/chosen": -300.58648681640625, "logps/rejected": -232.79554748535156, "loss": 0.4576, "rewards/accuracies": 0.875, "rewards/chosen": 0.6200546026229858, "rewards/margins": 0.6305831670761108, "rewards/rejected": -0.010528564453125, "step": 4189 }, { "epoch": 0.6479798956118307, "grad_norm": 4.852015018463135, "learning_rate": 4.35559628823462e-06, "logits/chosen": 8.171926498413086, "logits/rejected": 5.848831653594971, "logps/chosen": -205.46250915527344, "logps/rejected": -173.2490997314453, "loss": 0.6357, "rewards/accuracies": 0.625, "rewards/chosen": 0.42423662543296814, "rewards/margins": 0.21442672610282898, "rewards/rejected": 0.20980989933013916, "step": 4190 }, { "epoch": 0.6481345447515948, "grad_norm": 13.992420196533203, "learning_rate": 4.355309886584947e-06, "logits/chosen": 11.810670852661133, "logits/rejected": 7.921099662780762, "logps/chosen": -206.73741149902344, "logps/rejected": -152.72364807128906, "loss": 0.5261, "rewards/accuracies": 0.875, "rewards/chosen": 0.2047392874956131, "rewards/margins": 0.41251787543296814, "rewards/rejected": -0.20777854323387146, "step": 4191 }, { "epoch": 0.648289193891359, "grad_norm": 4.336026191711426, "learning_rate": 4.355023484935273e-06, "logits/chosen": 13.193326950073242, "logits/rejected": 6.914109230041504, "logps/chosen": -297.0978088378906, "logps/rejected": -223.58868408203125, "loss": 0.5482, "rewards/accuracies": 0.625, "rewards/chosen": 0.5427843332290649, "rewards/margins": 0.47084811329841614, "rewards/rejected": 0.0719362199306488, "step": 4192 }, { "epoch": 0.6484438430311231, "grad_norm": 5.0561981201171875, "learning_rate": 4.3547370832856e-06, "logits/chosen": 12.141061782836914, "logits/rejected": 2.2735249996185303, "logps/chosen": -280.3802490234375, "logps/rejected": -220.25477600097656, "loss": 0.4377, "rewards/accuracies": 0.875, "rewards/chosen": 0.30007994174957275, "rewards/margins": 0.8288781046867371, "rewards/rejected": -0.5287981033325195, "step": 4193 }, { "epoch": 0.6485984921708873, "grad_norm": 6.172568321228027, "learning_rate": 4.354450681635927e-06, "logits/chosen": 7.03871488571167, "logits/rejected": 11.420415878295898, "logps/chosen": -259.57501220703125, "logps/rejected": -418.56500244140625, "loss": 0.7309, "rewards/accuracies": 0.625, "rewards/chosen": 0.15208502113819122, "rewards/margins": 0.07502608001232147, "rewards/rejected": 0.07705894112586975, "step": 4194 }, { "epoch": 0.6487531413106514, "grad_norm": 5.374832630157471, "learning_rate": 4.354164279986253e-06, "logits/chosen": 12.77337646484375, "logits/rejected": 8.836723327636719, "logps/chosen": -365.0564880371094, "logps/rejected": -301.3245849609375, "loss": 0.5921, "rewards/accuracies": 0.75, "rewards/chosen": 0.4177880883216858, "rewards/margins": 0.3558177649974823, "rewards/rejected": 0.06197032332420349, "step": 4195 }, { "epoch": 0.6489077904504156, "grad_norm": 4.273466110229492, "learning_rate": 4.35387787833658e-06, "logits/chosen": 13.224781036376953, "logits/rejected": 10.768608093261719, "logps/chosen": -201.20626831054688, "logps/rejected": -198.96792602539062, "loss": 0.5346, "rewards/accuracies": 0.875, "rewards/chosen": 0.015607690438628197, "rewards/margins": 0.41390252113342285, "rewards/rejected": -0.3982948362827301, "step": 4196 }, { "epoch": 0.6490624395901797, "grad_norm": 5.9873809814453125, "learning_rate": 4.353591476686906e-06, "logits/chosen": 14.172565460205078, "logits/rejected": 9.2845458984375, "logps/chosen": -212.9275360107422, "logps/rejected": -157.1071014404297, "loss": 0.814, "rewards/accuracies": 0.5, "rewards/chosen": -0.04813404753804207, "rewards/margins": -0.1368042379617691, "rewards/rejected": 0.08867020159959793, "step": 4197 }, { "epoch": 0.649217088729944, "grad_norm": 3.5123026371002197, "learning_rate": 4.353305075037232e-06, "logits/chosen": 9.64143180847168, "logits/rejected": 8.916725158691406, "logps/chosen": -183.2423095703125, "logps/rejected": -201.08419799804688, "loss": 0.5009, "rewards/accuracies": 0.875, "rewards/chosen": 0.24202242493629456, "rewards/margins": 0.47435975074768066, "rewards/rejected": -0.2323373556137085, "step": 4198 }, { "epoch": 0.6493717378697081, "grad_norm": 6.294008731842041, "learning_rate": 4.353018673387559e-06, "logits/chosen": 8.272171974182129, "logits/rejected": 7.6526103019714355, "logps/chosen": -332.6680908203125, "logps/rejected": -330.3778381347656, "loss": 0.5758, "rewards/accuracies": 0.75, "rewards/chosen": 0.21828046441078186, "rewards/margins": 0.4001489281654358, "rewards/rejected": -0.18186840415000916, "step": 4199 }, { "epoch": 0.6495263870094723, "grad_norm": 4.988196849822998, "learning_rate": 4.352732271737886e-06, "logits/chosen": 3.080298662185669, "logits/rejected": 14.015260696411133, "logps/chosen": -206.3272705078125, "logps/rejected": -262.3306579589844, "loss": 0.7257, "rewards/accuracies": 0.625, "rewards/chosen": -0.030588969588279724, "rewards/margins": -0.026652857661247253, "rewards/rejected": -0.003936097025871277, "step": 4200 }, { "epoch": 0.6496810361492364, "grad_norm": 3.7875874042510986, "learning_rate": 4.352445870088212e-06, "logits/chosen": 8.165966033935547, "logits/rejected": 3.1907248497009277, "logps/chosen": -180.878173828125, "logps/rejected": -134.9561767578125, "loss": 0.559, "rewards/accuracies": 0.75, "rewards/chosen": 0.038340337574481964, "rewards/margins": 0.32512617111206055, "rewards/rejected": -0.2867858409881592, "step": 4201 }, { "epoch": 0.6498356852890006, "grad_norm": 5.892143249511719, "learning_rate": 4.352159468438539e-06, "logits/chosen": 12.405851364135742, "logits/rejected": 11.449054718017578, "logps/chosen": -295.89630126953125, "logps/rejected": -314.49774169921875, "loss": 0.5933, "rewards/accuracies": 0.75, "rewards/chosen": -0.048905763775110245, "rewards/margins": 0.2612883448600769, "rewards/rejected": -0.31019413471221924, "step": 4202 }, { "epoch": 0.6499903344287647, "grad_norm": 28.691387176513672, "learning_rate": 4.351873066788865e-06, "logits/chosen": 10.543848991394043, "logits/rejected": 9.813764572143555, "logps/chosen": -527.480224609375, "logps/rejected": -435.54315185546875, "loss": 0.8626, "rewards/accuracies": 0.5, "rewards/chosen": 0.23375284671783447, "rewards/margins": -0.08418767154216766, "rewards/rejected": 0.31794053316116333, "step": 4203 }, { "epoch": 0.6501449835685289, "grad_norm": 5.667877197265625, "learning_rate": 4.3515866651391914e-06, "logits/chosen": 10.827398300170898, "logits/rejected": 4.473579406738281, "logps/chosen": -403.90423583984375, "logps/rejected": -273.38189697265625, "loss": 0.5008, "rewards/accuracies": 0.75, "rewards/chosen": 0.2575262188911438, "rewards/margins": 0.7093676328659058, "rewards/rejected": -0.45184147357940674, "step": 4204 }, { "epoch": 0.650299632708293, "grad_norm": 5.46954345703125, "learning_rate": 4.351300263489518e-06, "logits/chosen": 13.069526672363281, "logits/rejected": 14.894620895385742, "logps/chosen": -300.318603515625, "logps/rejected": -305.42242431640625, "loss": 0.6536, "rewards/accuracies": 0.625, "rewards/chosen": 0.09276733547449112, "rewards/margins": 0.12498322874307632, "rewards/rejected": -0.03221588581800461, "step": 4205 }, { "epoch": 0.6504542818480572, "grad_norm": 5.898619651794434, "learning_rate": 4.351013861839845e-06, "logits/chosen": 5.566603660583496, "logits/rejected": 7.7404680252075195, "logps/chosen": -108.28042602539062, "logps/rejected": -171.0457763671875, "loss": 0.7786, "rewards/accuracies": 0.375, "rewards/chosen": -0.1528187096118927, "rewards/margins": -0.12764693796634674, "rewards/rejected": -0.025171803310513496, "step": 4206 }, { "epoch": 0.6506089309878214, "grad_norm": 5.101982116699219, "learning_rate": 4.350727460190171e-06, "logits/chosen": 9.90913200378418, "logits/rejected": 5.166670799255371, "logps/chosen": -283.5765380859375, "logps/rejected": -195.9736328125, "loss": 0.6274, "rewards/accuracies": 0.625, "rewards/chosen": 0.42315778136253357, "rewards/margins": 0.2216704785823822, "rewards/rejected": 0.20148730278015137, "step": 4207 }, { "epoch": 0.6507635801275855, "grad_norm": 5.125076770782471, "learning_rate": 4.350441058540498e-06, "logits/chosen": 8.263349533081055, "logits/rejected": 7.413053512573242, "logps/chosen": -385.040283203125, "logps/rejected": -288.6653137207031, "loss": 0.596, "rewards/accuracies": 0.5, "rewards/chosen": 0.2358895242214203, "rewards/margins": 0.3548605740070343, "rewards/rejected": -0.11897105723619461, "step": 4208 }, { "epoch": 0.6509182292673497, "grad_norm": 4.582486629486084, "learning_rate": 4.350154656890825e-06, "logits/chosen": 15.320573806762695, "logits/rejected": 8.661050796508789, "logps/chosen": -317.2214660644531, "logps/rejected": -250.9655303955078, "loss": 0.5543, "rewards/accuracies": 0.875, "rewards/chosen": 0.5238339900970459, "rewards/margins": 0.33945292234420776, "rewards/rejected": 0.18438109755516052, "step": 4209 }, { "epoch": 0.6510728784071138, "grad_norm": 5.058601379394531, "learning_rate": 4.3498682552411505e-06, "logits/chosen": 9.022960662841797, "logits/rejected": 13.238070487976074, "logps/chosen": -137.50807189941406, "logps/rejected": -229.68106079101562, "loss": 0.7082, "rewards/accuracies": 0.75, "rewards/chosen": -0.42664414644241333, "rewards/margins": 0.09030580520629883, "rewards/rejected": -0.5169499516487122, "step": 4210 }, { "epoch": 0.6512275275468781, "grad_norm": 5.154739856719971, "learning_rate": 4.349581853591477e-06, "logits/chosen": 12.505020141601562, "logits/rejected": 11.892744064331055, "logps/chosen": -305.08685302734375, "logps/rejected": -294.178955078125, "loss": 0.5936, "rewards/accuracies": 0.5, "rewards/chosen": 0.09012632817029953, "rewards/margins": 0.45098525285720825, "rewards/rejected": -0.3608589172363281, "step": 4211 }, { "epoch": 0.6513821766866422, "grad_norm": 6.85953426361084, "learning_rate": 4.349295451941804e-06, "logits/chosen": 11.5447416305542, "logits/rejected": 4.32448673248291, "logps/chosen": -315.2926940917969, "logps/rejected": -252.00189208984375, "loss": 0.6007, "rewards/accuracies": 0.5, "rewards/chosen": 0.3751358985900879, "rewards/margins": 0.44499102234840393, "rewards/rejected": -0.06985510140657425, "step": 4212 }, { "epoch": 0.6515368258264064, "grad_norm": 4.791493892669678, "learning_rate": 4.3490090502921304e-06, "logits/chosen": 12.159289360046387, "logits/rejected": 8.763517379760742, "logps/chosen": -220.3642578125, "logps/rejected": -247.2126007080078, "loss": 0.6161, "rewards/accuracies": 0.625, "rewards/chosen": 0.23202410340309143, "rewards/margins": 0.23403620719909668, "rewards/rejected": -0.0020121149718761444, "step": 4213 }, { "epoch": 0.6516914749661705, "grad_norm": 6.442799091339111, "learning_rate": 4.348722648642456e-06, "logits/chosen": 12.218147277832031, "logits/rejected": 10.689464569091797, "logps/chosen": -296.619140625, "logps/rejected": -269.91754150390625, "loss": 0.6472, "rewards/accuracies": 0.625, "rewards/chosen": 0.14552584290504456, "rewards/margins": 0.18558895587921143, "rewards/rejected": -0.040063098073005676, "step": 4214 }, { "epoch": 0.6518461241059347, "grad_norm": 4.191873073577881, "learning_rate": 4.348436246992783e-06, "logits/chosen": 5.560467720031738, "logits/rejected": 4.5959038734436035, "logps/chosen": -201.33773803710938, "logps/rejected": -183.98382568359375, "loss": 0.4289, "rewards/accuracies": 0.875, "rewards/chosen": 0.20138554275035858, "rewards/margins": 0.8380700349807739, "rewards/rejected": -0.6366845369338989, "step": 4215 }, { "epoch": 0.6520007732456988, "grad_norm": 4.2784247398376465, "learning_rate": 4.3481498453431095e-06, "logits/chosen": 15.070930480957031, "logits/rejected": 3.3545186519622803, "logps/chosen": -240.72650146484375, "logps/rejected": -161.81704711914062, "loss": 0.5057, "rewards/accuracies": 0.75, "rewards/chosen": 0.15401974320411682, "rewards/margins": 0.5519628524780273, "rewards/rejected": -0.39794304966926575, "step": 4216 }, { "epoch": 0.652155422385463, "grad_norm": 5.652374267578125, "learning_rate": 4.347863443693436e-06, "logits/chosen": 7.491048336029053, "logits/rejected": 4.3013482093811035, "logps/chosen": -324.8224182128906, "logps/rejected": -240.87069702148438, "loss": 0.6706, "rewards/accuracies": 0.5, "rewards/chosen": 0.07760705798864365, "rewards/margins": 0.20842379331588745, "rewards/rejected": -0.1308167427778244, "step": 4217 }, { "epoch": 0.6523100715252271, "grad_norm": 3.6855833530426025, "learning_rate": 4.347577042043762e-06, "logits/chosen": 5.974587440490723, "logits/rejected": 4.610760688781738, "logps/chosen": -259.886962890625, "logps/rejected": -249.80288696289062, "loss": 0.4882, "rewards/accuracies": 0.75, "rewards/chosen": 0.23094511032104492, "rewards/margins": 0.6611930727958679, "rewards/rejected": -0.430247962474823, "step": 4218 }, { "epoch": 0.6524647206649913, "grad_norm": 6.166820049285889, "learning_rate": 4.347290640394089e-06, "logits/chosen": 9.867472648620605, "logits/rejected": 9.522201538085938, "logps/chosen": -302.54522705078125, "logps/rejected": -267.424560546875, "loss": 0.7907, "rewards/accuracies": 0.375, "rewards/chosen": -0.42901021242141724, "rewards/margins": -0.06512265652418137, "rewards/rejected": -0.36388757824897766, "step": 4219 }, { "epoch": 0.6526193698047554, "grad_norm": 4.0333170890808105, "learning_rate": 4.347004238744415e-06, "logits/chosen": 14.675891876220703, "logits/rejected": 4.208190441131592, "logps/chosen": -303.3465576171875, "logps/rejected": -200.75021362304688, "loss": 0.5046, "rewards/accuracies": 0.875, "rewards/chosen": 0.2253396064043045, "rewards/margins": 0.4549718499183655, "rewards/rejected": -0.22963227331638336, "step": 4220 }, { "epoch": 0.6527740189445196, "grad_norm": 7.872147083282471, "learning_rate": 4.346717837094742e-06, "logits/chosen": 11.461825370788574, "logits/rejected": 9.13442611694336, "logps/chosen": -556.8522338867188, "logps/rejected": -439.05279541015625, "loss": 0.8367, "rewards/accuracies": 0.75, "rewards/chosen": -0.0441434346139431, "rewards/margins": -0.010390043258666992, "rewards/rejected": -0.033753395080566406, "step": 4221 }, { "epoch": 0.6529286680842837, "grad_norm": 4.7352495193481445, "learning_rate": 4.346431435445069e-06, "logits/chosen": 7.9646897315979, "logits/rejected": 7.855943202972412, "logps/chosen": -194.97967529296875, "logps/rejected": -223.33575439453125, "loss": 0.6245, "rewards/accuracies": 0.625, "rewards/chosen": -0.043425656855106354, "rewards/margins": 0.25277218222618103, "rewards/rejected": -0.2961978316307068, "step": 4222 }, { "epoch": 0.6530833172240479, "grad_norm": 6.273276329040527, "learning_rate": 4.346145033795394e-06, "logits/chosen": 8.105929374694824, "logits/rejected": 0.20725619792938232, "logps/chosen": -538.843505859375, "logps/rejected": -276.67919921875, "loss": 0.5755, "rewards/accuracies": 0.75, "rewards/chosen": -0.009483430534601212, "rewards/margins": 0.32109034061431885, "rewards/rejected": -0.33057376742362976, "step": 4223 }, { "epoch": 0.6532379663638122, "grad_norm": 5.0234694480896, "learning_rate": 4.345858632145721e-06, "logits/chosen": 8.780721664428711, "logits/rejected": 11.549026489257812, "logps/chosen": -175.511474609375, "logps/rejected": -238.11605834960938, "loss": 0.5695, "rewards/accuracies": 0.75, "rewards/chosen": 0.0908842384815216, "rewards/margins": 0.37255245447158813, "rewards/rejected": -0.28166818618774414, "step": 4224 }, { "epoch": 0.6533926155035763, "grad_norm": 5.550236225128174, "learning_rate": 4.345572230496048e-06, "logits/chosen": 1.433288335800171, "logits/rejected": 8.394508361816406, "logps/chosen": -238.88351440429688, "logps/rejected": -269.38739013671875, "loss": 0.7616, "rewards/accuracies": 0.5, "rewards/chosen": 0.0143667533993721, "rewards/margins": -0.013303359970450401, "rewards/rejected": 0.027670137584209442, "step": 4225 }, { "epoch": 0.6535472646433405, "grad_norm": 5.03551721572876, "learning_rate": 4.345285828846374e-06, "logits/chosen": 10.318571090698242, "logits/rejected": 12.024208068847656, "logps/chosen": -135.0189666748047, "logps/rejected": -179.40573120117188, "loss": 0.7662, "rewards/accuracies": 0.25, "rewards/chosen": -0.31906285881996155, "rewards/margins": -0.13008837401866913, "rewards/rejected": -0.18897446990013123, "step": 4226 }, { "epoch": 0.6537019137831046, "grad_norm": 4.7589898109436035, "learning_rate": 4.344999427196701e-06, "logits/chosen": 8.128241539001465, "logits/rejected": 4.892699241638184, "logps/chosen": -244.4506072998047, "logps/rejected": -226.9948272705078, "loss": 0.6246, "rewards/accuracies": 0.5, "rewards/chosen": 0.04702885448932648, "rewards/margins": 0.21536409854888916, "rewards/rejected": -0.1683352291584015, "step": 4227 }, { "epoch": 0.6538565629228688, "grad_norm": 6.265934467315674, "learning_rate": 4.344713025547028e-06, "logits/chosen": 6.535407543182373, "logits/rejected": 1.9626760482788086, "logps/chosen": -251.66062927246094, "logps/rejected": -258.51287841796875, "loss": 0.7185, "rewards/accuracies": 0.375, "rewards/chosen": -0.007463961839675903, "rewards/margins": 0.2547076344490051, "rewards/rejected": -0.26217156648635864, "step": 4228 }, { "epoch": 0.6540112120626329, "grad_norm": 5.701231479644775, "learning_rate": 4.344426623897354e-06, "logits/chosen": 12.259727478027344, "logits/rejected": 18.161714553833008, "logps/chosen": -229.688720703125, "logps/rejected": -247.94024658203125, "loss": 0.6685, "rewards/accuracies": 0.75, "rewards/chosen": -0.20073002576828003, "rewards/margins": 0.15554234385490417, "rewards/rejected": -0.356272429227829, "step": 4229 }, { "epoch": 0.6541658612023971, "grad_norm": 5.357940196990967, "learning_rate": 4.34414022224768e-06, "logits/chosen": 13.876051902770996, "logits/rejected": 9.536811828613281, "logps/chosen": -285.1541748046875, "logps/rejected": -298.2764892578125, "loss": 0.4744, "rewards/accuracies": 0.75, "rewards/chosen": 0.4411677122116089, "rewards/margins": 0.707834005355835, "rewards/rejected": -0.26666635274887085, "step": 4230 }, { "epoch": 0.6543205103421612, "grad_norm": 6.058716297149658, "learning_rate": 4.343853820598007e-06, "logits/chosen": 7.067928314208984, "logits/rejected": 12.358060836791992, "logps/chosen": -209.51629638671875, "logps/rejected": -309.0819396972656, "loss": 0.915, "rewards/accuracies": 0.375, "rewards/chosen": -0.11157140135765076, "rewards/margins": -0.14984466135501862, "rewards/rejected": 0.03827321529388428, "step": 4231 }, { "epoch": 0.6544751594819254, "grad_norm": 5.347863674163818, "learning_rate": 4.343567418948333e-06, "logits/chosen": 11.38589096069336, "logits/rejected": 11.748504638671875, "logps/chosen": -246.36474609375, "logps/rejected": -257.4831237792969, "loss": 0.6571, "rewards/accuracies": 0.5, "rewards/chosen": -0.08860114216804504, "rewards/margins": 0.31712016463279724, "rewards/rejected": -0.4057213068008423, "step": 4232 }, { "epoch": 0.6546298086216895, "grad_norm": 6.214756965637207, "learning_rate": 4.34328101729866e-06, "logits/chosen": 9.984513282775879, "logits/rejected": 8.25812816619873, "logps/chosen": -230.4915771484375, "logps/rejected": -182.3870391845703, "loss": 0.8067, "rewards/accuracies": 0.5, "rewards/chosen": 0.015525531023740768, "rewards/margins": -0.15137168765068054, "rewards/rejected": 0.16689720749855042, "step": 4233 }, { "epoch": 0.6547844577614537, "grad_norm": 5.609480857849121, "learning_rate": 4.342994615648987e-06, "logits/chosen": 12.323747634887695, "logits/rejected": 3.7065200805664062, "logps/chosen": -360.9742126464844, "logps/rejected": -235.20010375976562, "loss": 0.6359, "rewards/accuracies": 0.75, "rewards/chosen": 0.08385735750198364, "rewards/margins": 0.39086347818374634, "rewards/rejected": -0.3070061504840851, "step": 4234 }, { "epoch": 0.6549391069012178, "grad_norm": 4.826611042022705, "learning_rate": 4.342708213999313e-06, "logits/chosen": 13.70637321472168, "logits/rejected": 5.131006240844727, "logps/chosen": -428.180419921875, "logps/rejected": -234.23574829101562, "loss": 0.5878, "rewards/accuracies": 0.875, "rewards/chosen": 0.13843460381031036, "rewards/margins": 0.4423544406890869, "rewards/rejected": -0.30391985177993774, "step": 4235 }, { "epoch": 0.655093756040982, "grad_norm": 7.239215850830078, "learning_rate": 4.342421812349639e-06, "logits/chosen": 11.26409912109375, "logits/rejected": 4.930548667907715, "logps/chosen": -316.2461242675781, "logps/rejected": -234.1838836669922, "loss": 0.6595, "rewards/accuracies": 0.75, "rewards/chosen": -0.3526248037815094, "rewards/margins": 0.32214125990867615, "rewards/rejected": -0.6747660636901855, "step": 4236 }, { "epoch": 0.6552484051807462, "grad_norm": 5.34835147857666, "learning_rate": 4.342135410699966e-06, "logits/chosen": 10.50351333618164, "logits/rejected": 12.445616722106934, "logps/chosen": -305.2685241699219, "logps/rejected": -328.69464111328125, "loss": 0.6206, "rewards/accuracies": 0.375, "rewards/chosen": 0.06022585928440094, "rewards/margins": 0.3220090866088867, "rewards/rejected": -0.261783242225647, "step": 4237 }, { "epoch": 0.6554030543205104, "grad_norm": 5.125571250915527, "learning_rate": 4.3418490090502925e-06, "logits/chosen": 10.250349044799805, "logits/rejected": 5.923823356628418, "logps/chosen": -332.0323791503906, "logps/rejected": -289.42822265625, "loss": 0.6431, "rewards/accuracies": 0.75, "rewards/chosen": 0.3787103593349457, "rewards/margins": 0.19022627174854279, "rewards/rejected": 0.1884841024875641, "step": 4238 }, { "epoch": 0.6555577034602745, "grad_norm": 4.990293979644775, "learning_rate": 4.341562607400619e-06, "logits/chosen": 13.771848678588867, "logits/rejected": 3.848051071166992, "logps/chosen": -479.2643127441406, "logps/rejected": -284.7635498046875, "loss": 0.5347, "rewards/accuracies": 0.75, "rewards/chosen": 0.21116122603416443, "rewards/margins": 0.4802495241165161, "rewards/rejected": -0.2690882682800293, "step": 4239 }, { "epoch": 0.6557123526000387, "grad_norm": 5.700728416442871, "learning_rate": 4.341276205750946e-06, "logits/chosen": 7.67987585067749, "logits/rejected": 6.795275688171387, "logps/chosen": -296.4989318847656, "logps/rejected": -266.7330017089844, "loss": 0.6694, "rewards/accuracies": 0.625, "rewards/chosen": -0.16801680624485016, "rewards/margins": 0.16278572380542755, "rewards/rejected": -0.3308025598526001, "step": 4240 }, { "epoch": 0.6558670017398028, "grad_norm": 4.20328426361084, "learning_rate": 4.340989804101272e-06, "logits/chosen": 10.852849960327148, "logits/rejected": 9.256386756896973, "logps/chosen": -290.8721008300781, "logps/rejected": -245.87806701660156, "loss": 0.6535, "rewards/accuracies": 0.375, "rewards/chosen": 0.24068480730056763, "rewards/margins": 0.20167070627212524, "rewards/rejected": 0.03901408612728119, "step": 4241 }, { "epoch": 0.656021650879567, "grad_norm": 4.048861503601074, "learning_rate": 4.340703402451599e-06, "logits/chosen": 8.340597152709961, "logits/rejected": 8.36566162109375, "logps/chosen": -163.4769744873047, "logps/rejected": -135.12185668945312, "loss": 0.5529, "rewards/accuracies": 0.875, "rewards/chosen": 0.00452122837305069, "rewards/margins": 0.41734784841537476, "rewards/rejected": -0.4128265976905823, "step": 4242 }, { "epoch": 0.6561763000193311, "grad_norm": 4.394460201263428, "learning_rate": 4.340417000801925e-06, "logits/chosen": 6.019317150115967, "logits/rejected": 2.7620127201080322, "logps/chosen": -182.89154052734375, "logps/rejected": -104.48013305664062, "loss": 0.7581, "rewards/accuracies": 0.375, "rewards/chosen": -0.32511982321739197, "rewards/margins": 0.013522684574127197, "rewards/rejected": -0.33864250779151917, "step": 4243 }, { "epoch": 0.6563309491590953, "grad_norm": 7.645695686340332, "learning_rate": 4.3401305991522515e-06, "logits/chosen": 12.893644332885742, "logits/rejected": 12.326776504516602, "logps/chosen": -307.6593017578125, "logps/rejected": -297.2375183105469, "loss": 0.7687, "rewards/accuracies": 0.5, "rewards/chosen": 0.16537103056907654, "rewards/margins": -0.042695946991443634, "rewards/rejected": 0.20806699991226196, "step": 4244 }, { "epoch": 0.6564855982988594, "grad_norm": 6.866543292999268, "learning_rate": 4.339844197502578e-06, "logits/chosen": 7.2180094718933105, "logits/rejected": 12.493921279907227, "logps/chosen": -238.82546997070312, "logps/rejected": -335.80462646484375, "loss": 0.8607, "rewards/accuracies": 0.375, "rewards/chosen": -0.0891430526971817, "rewards/margins": -0.26331374049186707, "rewards/rejected": 0.17417068779468536, "step": 4245 }, { "epoch": 0.6566402474386236, "grad_norm": 3.879746675491333, "learning_rate": 4.339557795852905e-06, "logits/chosen": 12.361034393310547, "logits/rejected": 7.865085124969482, "logps/chosen": -253.78721618652344, "logps/rejected": -316.2119140625, "loss": 0.3618, "rewards/accuracies": 1.0, "rewards/chosen": 0.07004080712795258, "rewards/margins": 1.027457594871521, "rewards/rejected": -0.957416832447052, "step": 4246 }, { "epoch": 0.6567948965783877, "grad_norm": 6.232325077056885, "learning_rate": 4.3392713942032315e-06, "logits/chosen": 12.41917896270752, "logits/rejected": 4.786000728607178, "logps/chosen": -389.54949951171875, "logps/rejected": -281.8065490722656, "loss": 0.5207, "rewards/accuracies": 0.75, "rewards/chosen": 0.1898013949394226, "rewards/margins": 0.5042092800140381, "rewards/rejected": -0.3144078552722931, "step": 4247 }, { "epoch": 0.6569495457181519, "grad_norm": 7.850480556488037, "learning_rate": 4.338984992553557e-06, "logits/chosen": 10.405895233154297, "logits/rejected": 8.956665992736816, "logps/chosen": -301.396484375, "logps/rejected": -286.8551330566406, "loss": 0.6124, "rewards/accuracies": 0.75, "rewards/chosen": -0.29000720381736755, "rewards/margins": 0.27204352617263794, "rewards/rejected": -0.5620507597923279, "step": 4248 }, { "epoch": 0.6571041948579162, "grad_norm": 4.464251518249512, "learning_rate": 4.338698590903884e-06, "logits/chosen": 6.897383689880371, "logits/rejected": 8.609350204467773, "logps/chosen": -228.88211059570312, "logps/rejected": -246.05857849121094, "loss": 0.6018, "rewards/accuracies": 0.75, "rewards/chosen": 0.12348346412181854, "rewards/margins": 0.2385975867509842, "rewards/rejected": -0.11511412262916565, "step": 4249 }, { "epoch": 0.6572588439976803, "grad_norm": 4.924075603485107, "learning_rate": 4.338412189254211e-06, "logits/chosen": 7.3606157302856445, "logits/rejected": 7.459295272827148, "logps/chosen": -337.3214111328125, "logps/rejected": -300.5233154296875, "loss": 0.6147, "rewards/accuracies": 0.625, "rewards/chosen": 0.07398763298988342, "rewards/margins": 0.49985432624816895, "rewards/rejected": -0.4258667230606079, "step": 4250 }, { "epoch": 0.6574134931374445, "grad_norm": 5.979330062866211, "learning_rate": 4.338125787604537e-06, "logits/chosen": 10.71957778930664, "logits/rejected": 15.592201232910156, "logps/chosen": -293.1734924316406, "logps/rejected": -359.5371398925781, "loss": 0.6698, "rewards/accuracies": 0.625, "rewards/chosen": -0.027801796793937683, "rewards/margins": 0.26209214329719543, "rewards/rejected": -0.2898939251899719, "step": 4251 }, { "epoch": 0.6575681422772086, "grad_norm": 3.9610395431518555, "learning_rate": 4.337839385954863e-06, "logits/chosen": 6.776621341705322, "logits/rejected": -0.9367251396179199, "logps/chosen": -337.8732604980469, "logps/rejected": -216.66941833496094, "loss": 0.4719, "rewards/accuracies": 0.875, "rewards/chosen": -0.05533876270055771, "rewards/margins": 0.752065896987915, "rewards/rejected": -0.8074046969413757, "step": 4252 }, { "epoch": 0.6577227914169728, "grad_norm": 6.113455772399902, "learning_rate": 4.33755298430519e-06, "logits/chosen": 8.62043285369873, "logits/rejected": 5.310030937194824, "logps/chosen": -290.6543884277344, "logps/rejected": -278.75689697265625, "loss": 0.6221, "rewards/accuracies": 0.625, "rewards/chosen": 0.05785989761352539, "rewards/margins": 0.4392286539077759, "rewards/rejected": -0.3813687860965729, "step": 4253 }, { "epoch": 0.6578774405567369, "grad_norm": 7.48187255859375, "learning_rate": 4.337266582655516e-06, "logits/chosen": 13.339792251586914, "logits/rejected": 4.544426918029785, "logps/chosen": -404.67340087890625, "logps/rejected": -328.5487060546875, "loss": 0.5925, "rewards/accuracies": 0.625, "rewards/chosen": -0.22495344281196594, "rewards/margins": 0.2778828740119934, "rewards/rejected": -0.5028363466262817, "step": 4254 }, { "epoch": 0.6580320896965011, "grad_norm": 6.2130045890808105, "learning_rate": 4.336980181005843e-06, "logits/chosen": 9.765238761901855, "logits/rejected": 9.637968063354492, "logps/chosen": -267.8067932128906, "logps/rejected": -344.5045166015625, "loss": 0.8128, "rewards/accuracies": 0.5, "rewards/chosen": -0.20931066572666168, "rewards/margins": -0.15147018432617188, "rewards/rejected": -0.057840496301651, "step": 4255 }, { "epoch": 0.6581867388362652, "grad_norm": 5.443620681762695, "learning_rate": 4.336693779356169e-06, "logits/chosen": 5.635722637176514, "logits/rejected": 10.607624053955078, "logps/chosen": -183.70310974121094, "logps/rejected": -185.73695373535156, "loss": 0.7388, "rewards/accuracies": 0.375, "rewards/chosen": -0.34342384338378906, "rewards/margins": -0.03273334726691246, "rewards/rejected": -0.3106904923915863, "step": 4256 }, { "epoch": 0.6583413879760294, "grad_norm": 5.197975158691406, "learning_rate": 4.3364073777064954e-06, "logits/chosen": 8.877005577087402, "logits/rejected": 7.446663856506348, "logps/chosen": -285.80609130859375, "logps/rejected": -295.2646789550781, "loss": 0.5743, "rewards/accuracies": 0.625, "rewards/chosen": 0.24882307648658752, "rewards/margins": 0.31479302048683167, "rewards/rejected": -0.06596995890140533, "step": 4257 }, { "epoch": 0.6584960371157935, "grad_norm": 8.003013610839844, "learning_rate": 4.336120976056822e-06, "logits/chosen": 6.098358154296875, "logits/rejected": 7.598957061767578, "logps/chosen": -235.44683837890625, "logps/rejected": -255.38650512695312, "loss": 0.8314, "rewards/accuracies": 0.375, "rewards/chosen": -0.5272551774978638, "rewards/margins": -0.17554105818271637, "rewards/rejected": -0.3517141342163086, "step": 4258 }, { "epoch": 0.6586506862555577, "grad_norm": 5.2458577156066895, "learning_rate": 4.335834574407149e-06, "logits/chosen": 7.47066068649292, "logits/rejected": 8.930221557617188, "logps/chosen": -235.64450073242188, "logps/rejected": -229.45272827148438, "loss": 0.6839, "rewards/accuracies": 0.75, "rewards/chosen": -0.10166683793067932, "rewards/margins": 0.04539089649915695, "rewards/rejected": -0.14705774188041687, "step": 4259 }, { "epoch": 0.6588053353953218, "grad_norm": 12.956064224243164, "learning_rate": 4.335548172757475e-06, "logits/chosen": 2.390686273574829, "logits/rejected": 1.0085773468017578, "logps/chosen": -276.1402282714844, "logps/rejected": -279.8263854980469, "loss": 0.4848, "rewards/accuracies": 0.75, "rewards/chosen": 0.08727875351905823, "rewards/margins": 0.7267470359802246, "rewards/rejected": -0.6394683122634888, "step": 4260 }, { "epoch": 0.658959984535086, "grad_norm": 5.258927345275879, "learning_rate": 4.335261771107802e-06, "logits/chosen": 9.239931106567383, "logits/rejected": 7.764420032501221, "logps/chosen": -343.63128662109375, "logps/rejected": -236.1065216064453, "loss": 0.6428, "rewards/accuracies": 0.5, "rewards/chosen": 0.06139403581619263, "rewards/margins": 0.2517656087875366, "rewards/rejected": -0.190371572971344, "step": 4261 }, { "epoch": 0.6591146336748502, "grad_norm": 4.327454090118408, "learning_rate": 4.334975369458129e-06, "logits/chosen": 7.469304084777832, "logits/rejected": 2.7950024604797363, "logps/chosen": -233.54017639160156, "logps/rejected": -168.6626434326172, "loss": 0.5332, "rewards/accuracies": 0.75, "rewards/chosen": -0.21875320374965668, "rewards/margins": 0.5161008238792419, "rewards/rejected": -0.7348540425300598, "step": 4262 }, { "epoch": 0.6592692828146144, "grad_norm": 7.129507064819336, "learning_rate": 4.3346889678084545e-06, "logits/chosen": 0.9923268556594849, "logits/rejected": 3.7270820140838623, "logps/chosen": -203.24136352539062, "logps/rejected": -240.93637084960938, "loss": 0.9093, "rewards/accuracies": 0.5, "rewards/chosen": -0.5524977445602417, "rewards/margins": -0.25231146812438965, "rewards/rejected": -0.30018624663352966, "step": 4263 }, { "epoch": 0.6594239319543785, "grad_norm": 5.462010860443115, "learning_rate": 4.334402566158781e-06, "logits/chosen": 9.787393569946289, "logits/rejected": 12.91716194152832, "logps/chosen": -208.02406311035156, "logps/rejected": -236.16030883789062, "loss": 0.8064, "rewards/accuracies": 0.375, "rewards/chosen": -0.19643601775169373, "rewards/margins": -0.15476183593273163, "rewards/rejected": -0.041674189269542694, "step": 4264 }, { "epoch": 0.6595785810941427, "grad_norm": 4.6637067794799805, "learning_rate": 4.334116164509108e-06, "logits/chosen": 7.521944999694824, "logits/rejected": 5.622187614440918, "logps/chosen": -312.7679443359375, "logps/rejected": -226.11282348632812, "loss": 0.5584, "rewards/accuracies": 0.625, "rewards/chosen": 0.09454327076673508, "rewards/margins": 0.4101813733577728, "rewards/rejected": -0.31563809514045715, "step": 4265 }, { "epoch": 0.6597332302339068, "grad_norm": 5.969651699066162, "learning_rate": 4.3338297628594344e-06, "logits/chosen": 12.349713325500488, "logits/rejected": 16.498817443847656, "logps/chosen": -245.55372619628906, "logps/rejected": -292.7457275390625, "loss": 0.8999, "rewards/accuracies": 0.125, "rewards/chosen": -0.3214834928512573, "rewards/margins": -0.3493770360946655, "rewards/rejected": 0.02789352834224701, "step": 4266 }, { "epoch": 0.659887879373671, "grad_norm": 3.572495460510254, "learning_rate": 4.333543361209761e-06, "logits/chosen": 9.650606155395508, "logits/rejected": 10.575093269348145, "logps/chosen": -207.3527374267578, "logps/rejected": -262.22705078125, "loss": 0.4973, "rewards/accuracies": 0.875, "rewards/chosen": 0.24042657017707825, "rewards/margins": 0.6499561071395874, "rewards/rejected": -0.40952956676483154, "step": 4267 }, { "epoch": 0.6600425285134351, "grad_norm": 5.476774215698242, "learning_rate": 4.333256959560088e-06, "logits/chosen": 8.378408432006836, "logits/rejected": 6.034361362457275, "logps/chosen": -282.72552490234375, "logps/rejected": -283.5556335449219, "loss": 0.6397, "rewards/accuracies": 0.625, "rewards/chosen": 0.05240161344408989, "rewards/margins": 0.1941196620464325, "rewards/rejected": -0.1417180597782135, "step": 4268 }, { "epoch": 0.6601971776531993, "grad_norm": 4.192002773284912, "learning_rate": 4.3329705579104135e-06, "logits/chosen": 10.967140197753906, "logits/rejected": -2.1587822437286377, "logps/chosen": -217.4373779296875, "logps/rejected": -103.82136535644531, "loss": 0.5382, "rewards/accuracies": 0.75, "rewards/chosen": -0.2202337235212326, "rewards/margins": 0.514327347278595, "rewards/rejected": -0.734561026096344, "step": 4269 }, { "epoch": 0.6603518267929634, "grad_norm": 3.051832437515259, "learning_rate": 4.33268415626074e-06, "logits/chosen": 14.620075225830078, "logits/rejected": 2.9050979614257812, "logps/chosen": -289.6891174316406, "logps/rejected": -166.1214599609375, "loss": 0.374, "rewards/accuracies": 0.875, "rewards/chosen": 0.273196816444397, "rewards/margins": 1.0187900066375732, "rewards/rejected": -0.745593249797821, "step": 4270 }, { "epoch": 0.6605064759327276, "grad_norm": 4.67549991607666, "learning_rate": 4.332397754611067e-06, "logits/chosen": 10.286325454711914, "logits/rejected": 9.392341613769531, "logps/chosen": -225.73623657226562, "logps/rejected": -194.42324829101562, "loss": 0.6069, "rewards/accuracies": 0.375, "rewards/chosen": -0.168339803814888, "rewards/margins": 0.30439293384552, "rewards/rejected": -0.4727327823638916, "step": 4271 }, { "epoch": 0.6606611250724918, "grad_norm": 6.0189337730407715, "learning_rate": 4.3321113529613935e-06, "logits/chosen": 8.788934707641602, "logits/rejected": 5.407052993774414, "logps/chosen": -260.3956604003906, "logps/rejected": -202.4595184326172, "loss": 0.5524, "rewards/accuracies": 0.75, "rewards/chosen": -0.10741932690143585, "rewards/margins": 0.426333487033844, "rewards/rejected": -0.5337527990341187, "step": 4272 }, { "epoch": 0.6608157742122559, "grad_norm": 7.782709121704102, "learning_rate": 4.33182495131172e-06, "logits/chosen": 4.877094745635986, "logits/rejected": 9.291874885559082, "logps/chosen": -205.60812377929688, "logps/rejected": -262.8767395019531, "loss": 0.9725, "rewards/accuracies": 0.375, "rewards/chosen": -0.5181944370269775, "rewards/margins": -0.37633776664733887, "rewards/rejected": -0.14185667037963867, "step": 4273 }, { "epoch": 0.66097042335202, "grad_norm": 4.9661078453063965, "learning_rate": 4.331538549662047e-06, "logits/chosen": 10.442291259765625, "logits/rejected": 7.284004211425781, "logps/chosen": -304.2198486328125, "logps/rejected": -188.07688903808594, "loss": 0.5088, "rewards/accuracies": 0.625, "rewards/chosen": 0.16642338037490845, "rewards/margins": 0.5892209410667419, "rewards/rejected": -0.4227975904941559, "step": 4274 }, { "epoch": 0.6611250724917843, "grad_norm": 5.621748447418213, "learning_rate": 4.3312521480123734e-06, "logits/chosen": 1.8635404109954834, "logits/rejected": 7.7932281494140625, "logps/chosen": -124.76582336425781, "logps/rejected": -146.65882873535156, "loss": 0.7726, "rewards/accuracies": 0.5, "rewards/chosen": -0.5092434883117676, "rewards/margins": 0.07121413946151733, "rewards/rejected": -0.5804575681686401, "step": 4275 }, { "epoch": 0.6612797216315485, "grad_norm": 5.874105930328369, "learning_rate": 4.330965746362699e-06, "logits/chosen": 7.3795294761657715, "logits/rejected": 7.9654083251953125, "logps/chosen": -237.1683807373047, "logps/rejected": -271.6481018066406, "loss": 0.8578, "rewards/accuracies": 0.375, "rewards/chosen": -0.4786272644996643, "rewards/margins": -0.23775315284729004, "rewards/rejected": -0.24087411165237427, "step": 4276 }, { "epoch": 0.6614343707713126, "grad_norm": 6.900094985961914, "learning_rate": 4.330679344713026e-06, "logits/chosen": 9.951662063598633, "logits/rejected": 8.348591804504395, "logps/chosen": -312.5068359375, "logps/rejected": -278.4826354980469, "loss": 0.8105, "rewards/accuracies": 0.375, "rewards/chosen": -0.34500110149383545, "rewards/margins": -0.059378936886787415, "rewards/rejected": -0.28562214970588684, "step": 4277 }, { "epoch": 0.6615890199110768, "grad_norm": 6.97905158996582, "learning_rate": 4.3303929430633526e-06, "logits/chosen": 10.829272270202637, "logits/rejected": 11.108304977416992, "logps/chosen": -143.76087951660156, "logps/rejected": -167.43203735351562, "loss": 0.6705, "rewards/accuracies": 0.625, "rewards/chosen": -0.2697776257991791, "rewards/margins": 0.32433944940567017, "rewards/rejected": -0.5941171050071716, "step": 4278 }, { "epoch": 0.6617436690508409, "grad_norm": 5.425868034362793, "learning_rate": 4.330106541413679e-06, "logits/chosen": 9.530460357666016, "logits/rejected": 6.96138858795166, "logps/chosen": -223.8497772216797, "logps/rejected": -244.656494140625, "loss": 0.7347, "rewards/accuracies": 0.5, "rewards/chosen": -0.27032825350761414, "rewards/margins": -0.01611112430691719, "rewards/rejected": -0.25421711802482605, "step": 4279 }, { "epoch": 0.6618983181906051, "grad_norm": 4.473358631134033, "learning_rate": 4.329820139764006e-06, "logits/chosen": 11.572851181030273, "logits/rejected": 6.405040740966797, "logps/chosen": -260.8026428222656, "logps/rejected": -260.5738525390625, "loss": 0.5272, "rewards/accuracies": 0.875, "rewards/chosen": -0.11440470814704895, "rewards/margins": 0.5435508489608765, "rewards/rejected": -0.6579555869102478, "step": 4280 }, { "epoch": 0.6620529673303692, "grad_norm": 4.560245037078857, "learning_rate": 4.3295337381143325e-06, "logits/chosen": 8.806246757507324, "logits/rejected": 11.50917911529541, "logps/chosen": -256.7647705078125, "logps/rejected": -319.9904479980469, "loss": 0.5967, "rewards/accuracies": 0.625, "rewards/chosen": 0.011428158730268478, "rewards/margins": 0.3467073142528534, "rewards/rejected": -0.3352791368961334, "step": 4281 }, { "epoch": 0.6622076164701334, "grad_norm": 4.245309352874756, "learning_rate": 4.329247336464658e-06, "logits/chosen": 8.630610466003418, "logits/rejected": -2.6741414070129395, "logps/chosen": -241.89242553710938, "logps/rejected": -156.9348907470703, "loss": 0.6305, "rewards/accuracies": 0.75, "rewards/chosen": -0.03767205774784088, "rewards/margins": 0.18159788846969604, "rewards/rejected": -0.21926994621753693, "step": 4282 }, { "epoch": 0.6623622656098975, "grad_norm": 5.839372158050537, "learning_rate": 4.328960934814985e-06, "logits/chosen": 11.801851272583008, "logits/rejected": 9.311161041259766, "logps/chosen": -218.84344482421875, "logps/rejected": -238.715087890625, "loss": 0.7612, "rewards/accuracies": 0.375, "rewards/chosen": -0.27069205045700073, "rewards/margins": 0.1671137660741806, "rewards/rejected": -0.43780583143234253, "step": 4283 }, { "epoch": 0.6625169147496617, "grad_norm": 9.560470581054688, "learning_rate": 4.328674533165312e-06, "logits/chosen": 6.968115329742432, "logits/rejected": 12.529481887817383, "logps/chosen": -246.2449493408203, "logps/rejected": -286.8275451660156, "loss": 0.6173, "rewards/accuracies": 0.75, "rewards/chosen": -0.24121533334255219, "rewards/margins": 0.2748055160045624, "rewards/rejected": -0.5160208344459534, "step": 4284 }, { "epoch": 0.6626715638894258, "grad_norm": 5.678939342498779, "learning_rate": 4.328388131515638e-06, "logits/chosen": 9.270954132080078, "logits/rejected": 2.9593586921691895, "logps/chosen": -342.0398864746094, "logps/rejected": -291.84246826171875, "loss": 0.6204, "rewards/accuracies": 0.625, "rewards/chosen": 0.25605183839797974, "rewards/margins": 0.2629515826702118, "rewards/rejected": -0.006899736821651459, "step": 4285 }, { "epoch": 0.66282621302919, "grad_norm": 4.40380334854126, "learning_rate": 4.328101729865964e-06, "logits/chosen": 11.340071678161621, "logits/rejected": 1.762393832206726, "logps/chosen": -220.08724975585938, "logps/rejected": -103.06749725341797, "loss": 0.5901, "rewards/accuracies": 0.625, "rewards/chosen": -0.2431265115737915, "rewards/margins": 0.32804006338119507, "rewards/rejected": -0.5711665749549866, "step": 4286 }, { "epoch": 0.6629808621689541, "grad_norm": 4.827279090881348, "learning_rate": 4.327815328216291e-06, "logits/chosen": 6.844410419464111, "logits/rejected": 10.561067581176758, "logps/chosen": -229.61019897460938, "logps/rejected": -248.7239990234375, "loss": 0.6249, "rewards/accuracies": 0.625, "rewards/chosen": -0.19440661370754242, "rewards/margins": 0.1983400285243988, "rewards/rejected": -0.39274662733078003, "step": 4287 }, { "epoch": 0.6631355113087184, "grad_norm": 6.126718521118164, "learning_rate": 4.327528926566617e-06, "logits/chosen": 9.450445175170898, "logits/rejected": 7.474595069885254, "logps/chosen": -356.580078125, "logps/rejected": -327.7660217285156, "loss": 0.718, "rewards/accuracies": 0.5, "rewards/chosen": -0.18197079002857208, "rewards/margins": 0.013925556093454361, "rewards/rejected": -0.19589634239673615, "step": 4288 }, { "epoch": 0.6632901604484825, "grad_norm": 5.293323516845703, "learning_rate": 4.327242524916944e-06, "logits/chosen": 14.445408821105957, "logits/rejected": 11.202006340026855, "logps/chosen": -409.4034118652344, "logps/rejected": -336.47174072265625, "loss": 0.5409, "rewards/accuracies": 0.75, "rewards/chosen": 0.13379192352294922, "rewards/margins": 0.45122969150543213, "rewards/rejected": -0.3174377381801605, "step": 4289 }, { "epoch": 0.6634448095882467, "grad_norm": 4.291279315948486, "learning_rate": 4.32695612326727e-06, "logits/chosen": 15.586651802062988, "logits/rejected": 7.508134841918945, "logps/chosen": -445.3642272949219, "logps/rejected": -310.2279357910156, "loss": 0.449, "rewards/accuracies": 0.75, "rewards/chosen": 0.1344788521528244, "rewards/margins": 0.7428315281867981, "rewards/rejected": -0.6083526611328125, "step": 4290 }, { "epoch": 0.6635994587280108, "grad_norm": 22.539995193481445, "learning_rate": 4.3266697216175965e-06, "logits/chosen": 7.240866661071777, "logits/rejected": 3.2018611431121826, "logps/chosen": -428.97357177734375, "logps/rejected": -424.547119140625, "loss": 0.7442, "rewards/accuracies": 0.25, "rewards/chosen": 0.5236403346061707, "rewards/margins": -1.790374517440796e-05, "rewards/rejected": 0.523658275604248, "step": 4291 }, { "epoch": 0.663754107867775, "grad_norm": 5.53656530380249, "learning_rate": 4.326383319967923e-06, "logits/chosen": 10.309662818908691, "logits/rejected": -0.15265804529190063, "logps/chosen": -282.2122802734375, "logps/rejected": -171.11830139160156, "loss": 0.5686, "rewards/accuracies": 0.625, "rewards/chosen": -0.06645546108484268, "rewards/margins": 0.4247966408729553, "rewards/rejected": -0.49125218391418457, "step": 4292 }, { "epoch": 0.6639087570075392, "grad_norm": 5.4069294929504395, "learning_rate": 4.32609691831825e-06, "logits/chosen": 13.59645938873291, "logits/rejected": 9.281729698181152, "logps/chosen": -222.18453979492188, "logps/rejected": -152.1209259033203, "loss": 0.742, "rewards/accuracies": 0.375, "rewards/chosen": -0.32295939326286316, "rewards/margins": -0.025439254939556122, "rewards/rejected": -0.29752013087272644, "step": 4293 }, { "epoch": 0.6640634061473033, "grad_norm": 5.998040676116943, "learning_rate": 4.325810516668576e-06, "logits/chosen": 8.16534423828125, "logits/rejected": 7.85731315612793, "logps/chosen": -404.62542724609375, "logps/rejected": -334.340576171875, "loss": 0.6027, "rewards/accuracies": 0.625, "rewards/chosen": 0.05921955406665802, "rewards/margins": 0.3341347575187683, "rewards/rejected": -0.27491524815559387, "step": 4294 }, { "epoch": 0.6642180552870675, "grad_norm": 6.4560418128967285, "learning_rate": 4.325524115018903e-06, "logits/chosen": 6.789648056030273, "logits/rejected": 9.997292518615723, "logps/chosen": -206.29624938964844, "logps/rejected": -315.99078369140625, "loss": 0.6727, "rewards/accuracies": 0.75, "rewards/chosen": 0.011142492294311523, "rewards/margins": 0.1732703447341919, "rewards/rejected": -0.16212786734104156, "step": 4295 }, { "epoch": 0.6643727044268316, "grad_norm": 6.536364555358887, "learning_rate": 4.325237713369229e-06, "logits/chosen": 10.937895774841309, "logits/rejected": 10.581141471862793, "logps/chosen": -346.8070068359375, "logps/rejected": -271.35797119140625, "loss": 0.682, "rewards/accuracies": 0.5, "rewards/chosen": -0.2658529281616211, "rewards/margins": 0.08295775204896927, "rewards/rejected": -0.34881070256233215, "step": 4296 }, { "epoch": 0.6645273535665958, "grad_norm": 5.573677062988281, "learning_rate": 4.3249513117195555e-06, "logits/chosen": 12.538296699523926, "logits/rejected": 5.398101806640625, "logps/chosen": -374.0251770019531, "logps/rejected": -247.6549530029297, "loss": 0.5247, "rewards/accuracies": 0.625, "rewards/chosen": 0.1656876802444458, "rewards/margins": 0.5455344915390015, "rewards/rejected": -0.37984687089920044, "step": 4297 }, { "epoch": 0.6646820027063599, "grad_norm": 5.437753677368164, "learning_rate": 4.324664910069882e-06, "logits/chosen": 12.648083686828613, "logits/rejected": 12.451542854309082, "logps/chosen": -216.94808959960938, "logps/rejected": -206.02369689941406, "loss": 0.7463, "rewards/accuracies": 0.375, "rewards/chosen": -0.2890317440032959, "rewards/margins": 0.10365095734596252, "rewards/rejected": -0.3926827013492584, "step": 4298 }, { "epoch": 0.6648366518461241, "grad_norm": 9.735902786254883, "learning_rate": 4.324378508420209e-06, "logits/chosen": 4.4970879554748535, "logits/rejected": 9.336906433105469, "logps/chosen": -297.6566162109375, "logps/rejected": -491.55145263671875, "loss": 0.9495, "rewards/accuracies": 0.625, "rewards/chosen": -0.5635269284248352, "rewards/margins": -0.16001738607883453, "rewards/rejected": -0.4035094976425171, "step": 4299 }, { "epoch": 0.6649913009858882, "grad_norm": 5.67490291595459, "learning_rate": 4.3240921067705355e-06, "logits/chosen": 17.460716247558594, "logits/rejected": 5.8881378173828125, "logps/chosen": -357.1310729980469, "logps/rejected": -159.79966735839844, "loss": 0.709, "rewards/accuracies": 0.375, "rewards/chosen": -0.3228677809238434, "rewards/margins": 0.021592095494270325, "rewards/rejected": -0.3444598615169525, "step": 4300 }, { "epoch": 0.6651459501256525, "grad_norm": 4.3194403648376465, "learning_rate": 4.323805705120862e-06, "logits/chosen": 7.022460460662842, "logits/rejected": 7.644617080688477, "logps/chosen": -172.9615020751953, "logps/rejected": -145.02493286132812, "loss": 0.6729, "rewards/accuracies": 0.5, "rewards/chosen": -0.2406686246395111, "rewards/margins": 0.08030257374048233, "rewards/rejected": -0.32097119092941284, "step": 4301 }, { "epoch": 0.6653005992654166, "grad_norm": 4.654993534088135, "learning_rate": 4.323519303471188e-06, "logits/chosen": 12.267364501953125, "logits/rejected": 7.867785453796387, "logps/chosen": -273.8438720703125, "logps/rejected": -257.78265380859375, "loss": 0.4982, "rewards/accuracies": 0.75, "rewards/chosen": -0.07739315927028656, "rewards/margins": 0.5061399340629578, "rewards/rejected": -0.5835331082344055, "step": 4302 }, { "epoch": 0.6654552484051808, "grad_norm": 4.152048110961914, "learning_rate": 4.323232901821515e-06, "logits/chosen": 8.7733736038208, "logits/rejected": 2.4716219902038574, "logps/chosen": -257.78582763671875, "logps/rejected": -141.50247192382812, "loss": 0.6444, "rewards/accuracies": 0.5, "rewards/chosen": -0.24107535183429718, "rewards/margins": 0.20370645821094513, "rewards/rejected": -0.44478175044059753, "step": 4303 }, { "epoch": 0.6656098975449449, "grad_norm": 7.598574638366699, "learning_rate": 4.322946500171841e-06, "logits/chosen": 12.207249641418457, "logits/rejected": 2.4437835216522217, "logps/chosen": -417.4272155761719, "logps/rejected": -250.96444702148438, "loss": 0.6869, "rewards/accuracies": 0.5, "rewards/chosen": -0.053061626851558685, "rewards/margins": 0.200920969247818, "rewards/rejected": -0.2539826035499573, "step": 4304 }, { "epoch": 0.6657645466847091, "grad_norm": 3.6164698600769043, "learning_rate": 4.322660098522168e-06, "logits/chosen": 8.91761589050293, "logits/rejected": 1.9214706420898438, "logps/chosen": -292.17584228515625, "logps/rejected": -209.76954650878906, "loss": 0.433, "rewards/accuracies": 0.875, "rewards/chosen": 0.10923368483781815, "rewards/margins": 0.700827419757843, "rewards/rejected": -0.5915937423706055, "step": 4305 }, { "epoch": 0.6659191958244732, "grad_norm": 4.041393280029297, "learning_rate": 4.3223736968724945e-06, "logits/chosen": 9.307053565979004, "logits/rejected": 7.442022323608398, "logps/chosen": -292.13287353515625, "logps/rejected": -233.47157287597656, "loss": 0.573, "rewards/accuracies": 0.625, "rewards/chosen": 0.18764419853687286, "rewards/margins": 0.4046606421470642, "rewards/rejected": -0.21701645851135254, "step": 4306 }, { "epoch": 0.6660738449642374, "grad_norm": 5.438999176025391, "learning_rate": 4.322087295222821e-06, "logits/chosen": 12.889373779296875, "logits/rejected": 6.3512091636657715, "logps/chosen": -265.8472900390625, "logps/rejected": -161.15321350097656, "loss": 0.656, "rewards/accuracies": 0.5, "rewards/chosen": -0.21987418830394745, "rewards/margins": 0.19599267840385437, "rewards/rejected": -0.4158668518066406, "step": 4307 }, { "epoch": 0.6662284941040015, "grad_norm": 5.292679786682129, "learning_rate": 4.321800893573148e-06, "logits/chosen": 6.2649359703063965, "logits/rejected": 9.074081420898438, "logps/chosen": -343.22161865234375, "logps/rejected": -397.093505859375, "loss": 0.5503, "rewards/accuracies": 0.625, "rewards/chosen": 0.12882918119430542, "rewards/margins": 0.5189768671989441, "rewards/rejected": -0.39014768600463867, "step": 4308 }, { "epoch": 0.6663831432437657, "grad_norm": 5.072567939758301, "learning_rate": 4.321514491923474e-06, "logits/chosen": 9.413214683532715, "logits/rejected": 7.422295570373535, "logps/chosen": -225.78944396972656, "logps/rejected": -247.0975799560547, "loss": 0.6059, "rewards/accuracies": 0.75, "rewards/chosen": -0.3850398361682892, "rewards/margins": 0.3058340549468994, "rewards/rejected": -0.690873920917511, "step": 4309 }, { "epoch": 0.6665377923835298, "grad_norm": 5.56306791305542, "learning_rate": 4.3212280902738e-06, "logits/chosen": 4.979337215423584, "logits/rejected": 4.984701156616211, "logps/chosen": -295.468994140625, "logps/rejected": -372.0696105957031, "loss": 0.5649, "rewards/accuracies": 0.625, "rewards/chosen": 0.09733524918556213, "rewards/margins": 0.3926045000553131, "rewards/rejected": -0.2952692210674286, "step": 4310 }, { "epoch": 0.666692441523294, "grad_norm": 4.59020471572876, "learning_rate": 4.320941688624127e-06, "logits/chosen": 4.973601341247559, "logits/rejected": 3.915710926055908, "logps/chosen": -208.26828002929688, "logps/rejected": -206.94432067871094, "loss": 0.5671, "rewards/accuracies": 0.75, "rewards/chosen": 0.27557921409606934, "rewards/margins": 0.39918094873428345, "rewards/rejected": -0.12360171973705292, "step": 4311 }, { "epoch": 0.6668470906630581, "grad_norm": 6.568686485290527, "learning_rate": 4.320655286974454e-06, "logits/chosen": 6.38916015625, "logits/rejected": 10.807207107543945, "logps/chosen": -345.784423828125, "logps/rejected": -344.0394592285156, "loss": 0.798, "rewards/accuracies": 0.375, "rewards/chosen": -0.3488273620605469, "rewards/margins": -0.14115303754806519, "rewards/rejected": -0.2076743245124817, "step": 4312 }, { "epoch": 0.6670017398028224, "grad_norm": 5.443709850311279, "learning_rate": 4.32036888532478e-06, "logits/chosen": 4.333506107330322, "logits/rejected": 5.4422502517700195, "logps/chosen": -226.25418090820312, "logps/rejected": -231.79800415039062, "loss": 0.7051, "rewards/accuracies": 0.625, "rewards/chosen": -0.33224472403526306, "rewards/margins": 0.1515967845916748, "rewards/rejected": -0.48384150862693787, "step": 4313 }, { "epoch": 0.6671563889425866, "grad_norm": 4.064946174621582, "learning_rate": 4.320082483675107e-06, "logits/chosen": 11.330009460449219, "logits/rejected": 8.791619300842285, "logps/chosen": -274.75244140625, "logps/rejected": -220.40699768066406, "loss": 0.6556, "rewards/accuracies": 0.5, "rewards/chosen": 0.074774369597435, "rewards/margins": 0.19703851640224457, "rewards/rejected": -0.12226416170597076, "step": 4314 }, { "epoch": 0.6673110380823507, "grad_norm": 3.7393038272857666, "learning_rate": 4.319796082025433e-06, "logits/chosen": 15.827827453613281, "logits/rejected": 2.6591029167175293, "logps/chosen": -356.34600830078125, "logps/rejected": -202.3115997314453, "loss": 0.382, "rewards/accuracies": 1.0, "rewards/chosen": 0.0466192290186882, "rewards/margins": 0.8728972673416138, "rewards/rejected": -0.8262780904769897, "step": 4315 }, { "epoch": 0.6674656872221149, "grad_norm": 5.586511611938477, "learning_rate": 4.319509680375759e-06, "logits/chosen": 10.831878662109375, "logits/rejected": 5.411952018737793, "logps/chosen": -277.65252685546875, "logps/rejected": -193.48443603515625, "loss": 0.7262, "rewards/accuracies": 0.5, "rewards/chosen": -0.0832025408744812, "rewards/margins": 0.01356622576713562, "rewards/rejected": -0.09676878154277802, "step": 4316 }, { "epoch": 0.667620336361879, "grad_norm": 5.125912189483643, "learning_rate": 4.319223278726086e-06, "logits/chosen": 11.209256172180176, "logits/rejected": 6.126062393188477, "logps/chosen": -355.3177795410156, "logps/rejected": -279.807861328125, "loss": 0.5187, "rewards/accuracies": 0.75, "rewards/chosen": -0.029079005122184753, "rewards/margins": 0.5901710987091064, "rewards/rejected": -0.6192500591278076, "step": 4317 }, { "epoch": 0.6677749855016432, "grad_norm": 4.7180609703063965, "learning_rate": 4.318936877076413e-06, "logits/chosen": 11.925938606262207, "logits/rejected": 16.087594985961914, "logps/chosen": -222.77183532714844, "logps/rejected": -241.7539520263672, "loss": 0.7657, "rewards/accuracies": 0.625, "rewards/chosen": -0.14713388681411743, "rewards/margins": -0.03299078345298767, "rewards/rejected": -0.11414310336112976, "step": 4318 }, { "epoch": 0.6679296346414073, "grad_norm": 5.248588562011719, "learning_rate": 4.318650475426739e-06, "logits/chosen": 5.701940536499023, "logits/rejected": 2.073434829711914, "logps/chosen": -338.78961181640625, "logps/rejected": -226.2742462158203, "loss": 0.4987, "rewards/accuracies": 0.875, "rewards/chosen": 0.1251198798418045, "rewards/margins": 0.6047331094741821, "rewards/rejected": -0.4796132445335388, "step": 4319 }, { "epoch": 0.6680842837811715, "grad_norm": 8.054244041442871, "learning_rate": 4.318364073777065e-06, "logits/chosen": 9.712955474853516, "logits/rejected": 7.13339900970459, "logps/chosen": -343.57763671875, "logps/rejected": -331.231201171875, "loss": 0.7035, "rewards/accuracies": 0.375, "rewards/chosen": -0.4916762709617615, "rewards/margins": 0.04455384612083435, "rewards/rejected": -0.5362300872802734, "step": 4320 }, { "epoch": 0.6682389329209356, "grad_norm": 5.975429534912109, "learning_rate": 4.318077672127392e-06, "logits/chosen": 8.411194801330566, "logits/rejected": 4.47360897064209, "logps/chosen": -240.09014892578125, "logps/rejected": -204.2136688232422, "loss": 0.7309, "rewards/accuracies": 0.5, "rewards/chosen": -0.31958699226379395, "rewards/margins": 0.06880658864974976, "rewards/rejected": -0.3883935809135437, "step": 4321 }, { "epoch": 0.6683935820606998, "grad_norm": 7.254704475402832, "learning_rate": 4.317791270477718e-06, "logits/chosen": 8.755146980285645, "logits/rejected": 2.7581326961517334, "logps/chosen": -409.25262451171875, "logps/rejected": -284.9300842285156, "loss": 0.8369, "rewards/accuracies": 0.5, "rewards/chosen": -0.43356791138648987, "rewards/margins": -0.1361943781375885, "rewards/rejected": -0.29737353324890137, "step": 4322 }, { "epoch": 0.6685482312004639, "grad_norm": 7.898281097412109, "learning_rate": 4.317504868828045e-06, "logits/chosen": 8.513710021972656, "logits/rejected": 11.34129524230957, "logps/chosen": -278.28265380859375, "logps/rejected": -290.51800537109375, "loss": 0.6963, "rewards/accuracies": 0.625, "rewards/chosen": -0.19352112710475922, "rewards/margins": 0.0626763105392456, "rewards/rejected": -0.256197452545166, "step": 4323 }, { "epoch": 0.6687028803402281, "grad_norm": 5.849246025085449, "learning_rate": 4.317218467178371e-06, "logits/chosen": 12.781208992004395, "logits/rejected": 8.761346817016602, "logps/chosen": -222.95230102539062, "logps/rejected": -221.46905517578125, "loss": 0.6919, "rewards/accuracies": 0.5, "rewards/chosen": -0.4186118245124817, "rewards/margins": 0.0954810231924057, "rewards/rejected": -0.5140928030014038, "step": 4324 }, { "epoch": 0.6688575294799922, "grad_norm": 5.6869587898254395, "learning_rate": 4.3169320655286975e-06, "logits/chosen": 11.598677635192871, "logits/rejected": 8.498250007629395, "logps/chosen": -250.36099243164062, "logps/rejected": -244.9032745361328, "loss": 0.7611, "rewards/accuracies": 0.625, "rewards/chosen": -0.17815251648426056, "rewards/margins": -0.07841001451015472, "rewards/rejected": -0.09974252432584763, "step": 4325 }, { "epoch": 0.6690121786197565, "grad_norm": 5.009400844573975, "learning_rate": 4.316645663879024e-06, "logits/chosen": 12.952972412109375, "logits/rejected": 4.941444396972656, "logps/chosen": -316.83526611328125, "logps/rejected": -236.72406005859375, "loss": 0.645, "rewards/accuracies": 0.625, "rewards/chosen": -0.2974834442138672, "rewards/margins": 0.21905556321144104, "rewards/rejected": -0.5165389776229858, "step": 4326 }, { "epoch": 0.6691668277595206, "grad_norm": 5.019449710845947, "learning_rate": 4.316359262229351e-06, "logits/chosen": 9.394136428833008, "logits/rejected": 9.109855651855469, "logps/chosen": -247.48443603515625, "logps/rejected": -299.04351806640625, "loss": 0.5213, "rewards/accuracies": 0.625, "rewards/chosen": 0.10212001204490662, "rewards/margins": 0.6044638752937317, "rewards/rejected": -0.5023438930511475, "step": 4327 }, { "epoch": 0.6693214768992848, "grad_norm": 3.848778486251831, "learning_rate": 4.3160728605796775e-06, "logits/chosen": 12.363107681274414, "logits/rejected": 12.56724739074707, "logps/chosen": -160.00241088867188, "logps/rejected": -191.36505126953125, "loss": 0.5652, "rewards/accuracies": 0.75, "rewards/chosen": -0.08544743061065674, "rewards/margins": 0.46466854214668274, "rewards/rejected": -0.5501160025596619, "step": 4328 }, { "epoch": 0.6694761260390489, "grad_norm": 8.397557258605957, "learning_rate": 4.315786458930003e-06, "logits/chosen": 5.244521141052246, "logits/rejected": 1.1233984231948853, "logps/chosen": -292.2578430175781, "logps/rejected": -207.8626708984375, "loss": 0.9983, "rewards/accuracies": 0.375, "rewards/chosen": -0.7188864946365356, "rewards/margins": -0.4574933350086212, "rewards/rejected": -0.26139315962791443, "step": 4329 }, { "epoch": 0.6696307751788131, "grad_norm": 23.381149291992188, "learning_rate": 4.31550005728033e-06, "logits/chosen": 11.058280944824219, "logits/rejected": 4.830230236053467, "logps/chosen": -375.2982177734375, "logps/rejected": -342.3121032714844, "loss": 0.6342, "rewards/accuracies": 0.625, "rewards/chosen": 0.1727220118045807, "rewards/margins": 0.39396876096725464, "rewards/rejected": -0.22124677896499634, "step": 4330 }, { "epoch": 0.6697854243185772, "grad_norm": 8.040107727050781, "learning_rate": 4.3152136556306566e-06, "logits/chosen": 7.47574520111084, "logits/rejected": 3.1202948093414307, "logps/chosen": -261.68310546875, "logps/rejected": -232.18063354492188, "loss": 0.8969, "rewards/accuracies": 0.25, "rewards/chosen": -0.4198048412799835, "rewards/margins": -0.22327278554439545, "rewards/rejected": -0.19653202593326569, "step": 4331 }, { "epoch": 0.6699400734583414, "grad_norm": 4.902009010314941, "learning_rate": 4.314927253980983e-06, "logits/chosen": 8.587291717529297, "logits/rejected": 5.095376491546631, "logps/chosen": -260.6650695800781, "logps/rejected": -259.49822998046875, "loss": 0.4508, "rewards/accuracies": 0.875, "rewards/chosen": -0.2044256031513214, "rewards/margins": 0.6125047206878662, "rewards/rejected": -0.81693035364151, "step": 4332 }, { "epoch": 0.6700947225981055, "grad_norm": 7.022604465484619, "learning_rate": 4.31464085233131e-06, "logits/chosen": 6.559743881225586, "logits/rejected": 0.9831100702285767, "logps/chosen": -205.47491455078125, "logps/rejected": -126.33174896240234, "loss": 0.7315, "rewards/accuracies": 0.25, "rewards/chosen": -0.6412868499755859, "rewards/margins": -0.048475492745637894, "rewards/rejected": -0.5928113460540771, "step": 4333 }, { "epoch": 0.6702493717378697, "grad_norm": 6.118412017822266, "learning_rate": 4.3143544506816365e-06, "logits/chosen": 16.033222198486328, "logits/rejected": 7.6022539138793945, "logps/chosen": -440.2850341796875, "logps/rejected": -395.99530029296875, "loss": 0.588, "rewards/accuracies": 0.75, "rewards/chosen": 0.18403244018554688, "rewards/margins": 0.5059230923652649, "rewards/rejected": -0.3218906819820404, "step": 4334 }, { "epoch": 0.6704040208776338, "grad_norm": 5.674912929534912, "learning_rate": 4.314068049031962e-06, "logits/chosen": 12.06112003326416, "logits/rejected": -0.8558475971221924, "logps/chosen": -401.1207275390625, "logps/rejected": -296.3409423828125, "loss": 0.5808, "rewards/accuracies": 0.5, "rewards/chosen": -0.3929820954799652, "rewards/margins": 0.8912266492843628, "rewards/rejected": -1.2842087745666504, "step": 4335 }, { "epoch": 0.670558670017398, "grad_norm": 4.7469305992126465, "learning_rate": 4.313781647382289e-06, "logits/chosen": 4.4329328536987305, "logits/rejected": 3.6825971603393555, "logps/chosen": -181.5720672607422, "logps/rejected": -220.92105102539062, "loss": 0.5705, "rewards/accuracies": 0.625, "rewards/chosen": -0.3711830973625183, "rewards/margins": 0.3734095096588135, "rewards/rejected": -0.7445926666259766, "step": 4336 }, { "epoch": 0.6707133191571621, "grad_norm": 4.592440128326416, "learning_rate": 4.313495245732616e-06, "logits/chosen": 3.863528251647949, "logits/rejected": 7.95263671875, "logps/chosen": -168.6116485595703, "logps/rejected": -206.87734985351562, "loss": 0.6461, "rewards/accuracies": 0.5, "rewards/chosen": -0.012716487050056458, "rewards/margins": 0.20782911777496338, "rewards/rejected": -0.22054558992385864, "step": 4337 }, { "epoch": 0.6708679682969263, "grad_norm": 5.196115493774414, "learning_rate": 4.313208844082942e-06, "logits/chosen": 5.945255756378174, "logits/rejected": 2.1601343154907227, "logps/chosen": -439.0337829589844, "logps/rejected": -377.760986328125, "loss": 0.4033, "rewards/accuracies": 0.75, "rewards/chosen": 0.30622929334640503, "rewards/margins": 0.9300085306167603, "rewards/rejected": -0.6237791776657104, "step": 4338 }, { "epoch": 0.6710226174366906, "grad_norm": 8.12320613861084, "learning_rate": 4.312922442433269e-06, "logits/chosen": 11.290288925170898, "logits/rejected": 6.534239768981934, "logps/chosen": -225.21377563476562, "logps/rejected": -166.76809692382812, "loss": 0.7357, "rewards/accuracies": 0.375, "rewards/chosen": -0.3592991828918457, "rewards/margins": -0.04937276244163513, "rewards/rejected": -0.30992642045021057, "step": 4339 }, { "epoch": 0.6711772665764547, "grad_norm": 5.785003185272217, "learning_rate": 4.3126360407835956e-06, "logits/chosen": 10.92134952545166, "logits/rejected": 9.446928977966309, "logps/chosen": -324.3086853027344, "logps/rejected": -291.07965087890625, "loss": 0.6462, "rewards/accuracies": 0.625, "rewards/chosen": -0.11384683102369308, "rewards/margins": 0.19888365268707275, "rewards/rejected": -0.31273049116134644, "step": 4340 }, { "epoch": 0.6713319157162189, "grad_norm": 5.306368827819824, "learning_rate": 4.312349639133922e-06, "logits/chosen": 9.784195899963379, "logits/rejected": 5.088803768157959, "logps/chosen": -294.5977478027344, "logps/rejected": -251.1013641357422, "loss": 0.675, "rewards/accuracies": 0.5, "rewards/chosen": -0.169917032122612, "rewards/margins": 0.2355196177959442, "rewards/rejected": -0.405436635017395, "step": 4341 }, { "epoch": 0.671486564855983, "grad_norm": 5.676792621612549, "learning_rate": 4.312063237484248e-06, "logits/chosen": 8.502631187438965, "logits/rejected": 3.6801257133483887, "logps/chosen": -326.35186767578125, "logps/rejected": -221.66177368164062, "loss": 0.7225, "rewards/accuracies": 0.5, "rewards/chosen": -0.36491233110427856, "rewards/margins": 0.19939759373664856, "rewards/rejected": -0.5643098950386047, "step": 4342 }, { "epoch": 0.6716412139957472, "grad_norm": 6.94557523727417, "learning_rate": 4.311776835834575e-06, "logits/chosen": 11.946405410766602, "logits/rejected": 11.808531761169434, "logps/chosen": -373.92987060546875, "logps/rejected": -304.0584411621094, "loss": 0.8125, "rewards/accuracies": 0.375, "rewards/chosen": -0.1062505692243576, "rewards/margins": -0.066680908203125, "rewards/rejected": -0.039569661021232605, "step": 4343 }, { "epoch": 0.6717958631355113, "grad_norm": 4.628807067871094, "learning_rate": 4.311490434184901e-06, "logits/chosen": 7.7528228759765625, "logits/rejected": 6.748226165771484, "logps/chosen": -374.65594482421875, "logps/rejected": -311.2047424316406, "loss": 0.5592, "rewards/accuracies": 0.75, "rewards/chosen": -0.017075102776288986, "rewards/margins": 0.404699444770813, "rewards/rejected": -0.42177462577819824, "step": 4344 }, { "epoch": 0.6719505122752755, "grad_norm": 4.371573448181152, "learning_rate": 4.311204032535228e-06, "logits/chosen": 7.5436110496521, "logits/rejected": 6.836152076721191, "logps/chosen": -149.7579345703125, "logps/rejected": -152.1444854736328, "loss": 0.5987, "rewards/accuracies": 0.625, "rewards/chosen": 0.08918696641921997, "rewards/margins": 0.3966192305088043, "rewards/rejected": -0.30743223428726196, "step": 4345 }, { "epoch": 0.6721051614150396, "grad_norm": 5.504759311676025, "learning_rate": 4.310917630885555e-06, "logits/chosen": 12.810900688171387, "logits/rejected": 12.121976852416992, "logps/chosen": -285.6212158203125, "logps/rejected": -326.0976867675781, "loss": 0.7801, "rewards/accuracies": 0.5, "rewards/chosen": -0.24133431911468506, "rewards/margins": -0.037230681627988815, "rewards/rejected": -0.20410367846488953, "step": 4346 }, { "epoch": 0.6722598105548038, "grad_norm": 6.4040021896362305, "learning_rate": 4.310631229235881e-06, "logits/chosen": 5.288246154785156, "logits/rejected": 4.323495388031006, "logps/chosen": -283.30572509765625, "logps/rejected": -271.1005859375, "loss": 0.7062, "rewards/accuracies": 0.625, "rewards/chosen": -0.21802043914794922, "rewards/margins": 0.14184197783470154, "rewards/rejected": -0.35986241698265076, "step": 4347 }, { "epoch": 0.6724144596945679, "grad_norm": 6.111469268798828, "learning_rate": 4.310344827586207e-06, "logits/chosen": 5.314854145050049, "logits/rejected": 4.785194396972656, "logps/chosen": -314.21710205078125, "logps/rejected": -242.79913330078125, "loss": 0.7799, "rewards/accuracies": 0.5, "rewards/chosen": -0.2315208613872528, "rewards/margins": -0.022492021322250366, "rewards/rejected": -0.20902882516384125, "step": 4348 }, { "epoch": 0.6725691088343321, "grad_norm": 15.672969818115234, "learning_rate": 4.310058425936534e-06, "logits/chosen": 4.177640914916992, "logits/rejected": 8.71719741821289, "logps/chosen": -197.6545867919922, "logps/rejected": -265.61590576171875, "loss": 0.742, "rewards/accuracies": 0.625, "rewards/chosen": -0.36047232151031494, "rewards/margins": -0.0384647399187088, "rewards/rejected": -0.32200756669044495, "step": 4349 }, { "epoch": 0.6727237579740962, "grad_norm": 6.3508381843566895, "learning_rate": 4.30977202428686e-06, "logits/chosen": 7.652085304260254, "logits/rejected": 4.731571674346924, "logps/chosen": -242.31455993652344, "logps/rejected": -211.475830078125, "loss": 0.6908, "rewards/accuracies": 0.625, "rewards/chosen": -0.12293510884046555, "rewards/margins": 0.025870423763990402, "rewards/rejected": -0.14880552887916565, "step": 4350 }, { "epoch": 0.6728784071138604, "grad_norm": 7.533824920654297, "learning_rate": 4.309485622637187e-06, "logits/chosen": 7.912522792816162, "logits/rejected": 9.668782234191895, "logps/chosen": -229.46963500976562, "logps/rejected": -225.49020385742188, "loss": 0.7688, "rewards/accuracies": 0.5, "rewards/chosen": -0.48833540081977844, "rewards/margins": -0.060272425413131714, "rewards/rejected": -0.4280630052089691, "step": 4351 }, { "epoch": 0.6730330562536246, "grad_norm": 5.6148271560668945, "learning_rate": 4.309199220987514e-06, "logits/chosen": 9.664350509643555, "logits/rejected": 8.80941390991211, "logps/chosen": -293.586669921875, "logps/rejected": -265.803955078125, "loss": 0.626, "rewards/accuracies": 0.5, "rewards/chosen": 0.09197207540273666, "rewards/margins": 0.2634292244911194, "rewards/rejected": -0.17145714163780212, "step": 4352 }, { "epoch": 0.6731877053933888, "grad_norm": 4.790165901184082, "learning_rate": 4.3089128193378395e-06, "logits/chosen": 10.210770606994629, "logits/rejected": 7.330760955810547, "logps/chosen": -369.55419921875, "logps/rejected": -253.150146484375, "loss": 0.584, "rewards/accuracies": 0.625, "rewards/chosen": -0.1554819941520691, "rewards/margins": 0.39861422777175903, "rewards/rejected": -0.5540962219238281, "step": 4353 }, { "epoch": 0.6733423545331529, "grad_norm": 6.02699613571167, "learning_rate": 4.308626417688166e-06, "logits/chosen": 5.232302188873291, "logits/rejected": 4.87021541595459, "logps/chosen": -303.505615234375, "logps/rejected": -210.87477111816406, "loss": 0.7252, "rewards/accuracies": 0.375, "rewards/chosen": -2.489238977432251e-05, "rewards/margins": 0.21937112510204315, "rewards/rejected": -0.21939602494239807, "step": 4354 }, { "epoch": 0.6734970036729171, "grad_norm": 5.76442289352417, "learning_rate": 4.308340016038493e-06, "logits/chosen": 3.7333836555480957, "logits/rejected": 2.573406219482422, "logps/chosen": -224.85467529296875, "logps/rejected": -259.0929870605469, "loss": 0.6197, "rewards/accuracies": 0.375, "rewards/chosen": 0.017428487539291382, "rewards/margins": 0.3432026505470276, "rewards/rejected": -0.3257741928100586, "step": 4355 }, { "epoch": 0.6736516528126812, "grad_norm": 9.911994934082031, "learning_rate": 4.3080536143888194e-06, "logits/chosen": 4.855419158935547, "logits/rejected": 2.613642930984497, "logps/chosen": -386.39501953125, "logps/rejected": -298.31524658203125, "loss": 0.8119, "rewards/accuracies": 0.375, "rewards/chosen": -0.24843397736549377, "rewards/margins": -0.13192063570022583, "rewards/rejected": -0.11651334166526794, "step": 4356 }, { "epoch": 0.6738063019524454, "grad_norm": 7.51059627532959, "learning_rate": 4.307767212739146e-06, "logits/chosen": 13.94286060333252, "logits/rejected": 10.514548301696777, "logps/chosen": -498.1987609863281, "logps/rejected": -400.3186950683594, "loss": 0.6186, "rewards/accuracies": 0.75, "rewards/chosen": 0.10071755200624466, "rewards/margins": 0.19627895951271057, "rewards/rejected": -0.09556140005588531, "step": 4357 }, { "epoch": 0.6739609510922095, "grad_norm": 3.9615440368652344, "learning_rate": 4.307480811089472e-06, "logits/chosen": 10.846609115600586, "logits/rejected": 6.004369735717773, "logps/chosen": -285.78265380859375, "logps/rejected": -217.84197998046875, "loss": 0.5559, "rewards/accuracies": 0.75, "rewards/chosen": -0.06391362845897675, "rewards/margins": 0.3909119665622711, "rewards/rejected": -0.45482558012008667, "step": 4358 }, { "epoch": 0.6741156002319737, "grad_norm": 5.588890552520752, "learning_rate": 4.3071944094397985e-06, "logits/chosen": 7.391685962677002, "logits/rejected": 7.563086986541748, "logps/chosen": -258.9117736816406, "logps/rejected": -248.08059692382812, "loss": 0.8213, "rewards/accuracies": 0.375, "rewards/chosen": -0.2330484241247177, "rewards/margins": -0.08544944226741791, "rewards/rejected": -0.147598996758461, "step": 4359 }, { "epoch": 0.6742702493717379, "grad_norm": 13.001784324645996, "learning_rate": 4.306908007790125e-06, "logits/chosen": 5.21492862701416, "logits/rejected": 4.764323711395264, "logps/chosen": -274.4930419921875, "logps/rejected": -335.0644836425781, "loss": 0.8763, "rewards/accuracies": 0.5, "rewards/chosen": -0.4456023573875427, "rewards/margins": -0.12490373849868774, "rewards/rejected": -0.3206985592842102, "step": 4360 }, { "epoch": 0.674424898511502, "grad_norm": 8.679266929626465, "learning_rate": 4.306621606140452e-06, "logits/chosen": 12.838769912719727, "logits/rejected": 3.4350359439849854, "logps/chosen": -287.9986572265625, "logps/rejected": -241.20518493652344, "loss": 0.6003, "rewards/accuracies": 0.75, "rewards/chosen": -0.02727479487657547, "rewards/margins": 0.32795992493629456, "rewards/rejected": -0.3552347421646118, "step": 4361 }, { "epoch": 0.6745795476512662, "grad_norm": 3.8145883083343506, "learning_rate": 4.306335204490778e-06, "logits/chosen": 15.41836166381836, "logits/rejected": 6.221778869628906, "logps/chosen": -383.5130615234375, "logps/rejected": -244.79513549804688, "loss": 0.3953, "rewards/accuracies": 1.0, "rewards/chosen": 0.03227100148797035, "rewards/margins": 0.8833968639373779, "rewards/rejected": -0.8511258363723755, "step": 4362 }, { "epoch": 0.6747341967910303, "grad_norm": 5.615697383880615, "learning_rate": 4.306048802841104e-06, "logits/chosen": 11.220874786376953, "logits/rejected": 3.1929445266723633, "logps/chosen": -278.5677795410156, "logps/rejected": -189.5736846923828, "loss": 0.5682, "rewards/accuracies": 0.625, "rewards/chosen": -0.2512831687927246, "rewards/margins": 0.7028713822364807, "rewards/rejected": -0.9541544914245605, "step": 4363 }, { "epoch": 0.6748888459307945, "grad_norm": 6.706066131591797, "learning_rate": 4.305762401191431e-06, "logits/chosen": 11.945959091186523, "logits/rejected": 8.568140029907227, "logps/chosen": -273.2823486328125, "logps/rejected": -248.9542999267578, "loss": 0.6291, "rewards/accuracies": 0.625, "rewards/chosen": 0.08710212260484695, "rewards/margins": 0.17725151777267456, "rewards/rejected": -0.0901493951678276, "step": 4364 }, { "epoch": 0.6750434950705587, "grad_norm": 9.580127716064453, "learning_rate": 4.305475999541758e-06, "logits/chosen": 8.329070091247559, "logits/rejected": 5.84481143951416, "logps/chosen": -228.21717834472656, "logps/rejected": -219.42247009277344, "loss": 0.7962, "rewards/accuracies": 0.5, "rewards/chosen": -0.3564581274986267, "rewards/margins": 0.24120935797691345, "rewards/rejected": -0.5976675152778625, "step": 4365 }, { "epoch": 0.6751981442103229, "grad_norm": 5.860879421234131, "learning_rate": 4.305189597892084e-06, "logits/chosen": 7.781728744506836, "logits/rejected": 10.093608856201172, "logps/chosen": -242.03427124023438, "logps/rejected": -249.22378540039062, "loss": 0.7788, "rewards/accuracies": 0.375, "rewards/chosen": -0.2477954924106598, "rewards/margins": -0.13580816984176636, "rewards/rejected": -0.11198730021715164, "step": 4366 }, { "epoch": 0.675352793350087, "grad_norm": 4.436400890350342, "learning_rate": 4.304903196242411e-06, "logits/chosen": 10.72449016571045, "logits/rejected": 7.377851486206055, "logps/chosen": -267.89324951171875, "logps/rejected": -241.6751251220703, "loss": 0.5809, "rewards/accuracies": 0.625, "rewards/chosen": -0.27599436044692993, "rewards/margins": 0.36522021889686584, "rewards/rejected": -0.6412145495414734, "step": 4367 }, { "epoch": 0.6755074424898512, "grad_norm": 7.7967634201049805, "learning_rate": 4.304616794592737e-06, "logits/chosen": 7.131446361541748, "logits/rejected": 2.8030855655670166, "logps/chosen": -369.4346923828125, "logps/rejected": -317.81085205078125, "loss": 0.6587, "rewards/accuracies": 0.625, "rewards/chosen": -0.18229426443576813, "rewards/margins": 0.169584721326828, "rewards/rejected": -0.3518790006637573, "step": 4368 }, { "epoch": 0.6756620916296153, "grad_norm": 7.428574562072754, "learning_rate": 4.304330392943063e-06, "logits/chosen": 1.770752191543579, "logits/rejected": 8.010498046875, "logps/chosen": -322.97369384765625, "logps/rejected": -380.0075988769531, "loss": 0.8691, "rewards/accuracies": 0.375, "rewards/chosen": -0.5723614692687988, "rewards/margins": -0.19962920248508453, "rewards/rejected": -0.3727322518825531, "step": 4369 }, { "epoch": 0.6758167407693795, "grad_norm": 18.018117904663086, "learning_rate": 4.30404399129339e-06, "logits/chosen": 14.680455207824707, "logits/rejected": 6.410256385803223, "logps/chosen": -420.0663146972656, "logps/rejected": -229.6256561279297, "loss": 0.4546, "rewards/accuracies": 0.75, "rewards/chosen": 0.3136487305164337, "rewards/margins": 0.8922341465950012, "rewards/rejected": -0.5785854458808899, "step": 4370 }, { "epoch": 0.6759713899091436, "grad_norm": 5.81412935256958, "learning_rate": 4.303757589643717e-06, "logits/chosen": 9.64257526397705, "logits/rejected": 6.972983360290527, "logps/chosen": -270.0439147949219, "logps/rejected": -192.17625427246094, "loss": 0.6464, "rewards/accuracies": 0.625, "rewards/chosen": -0.24827784299850464, "rewards/margins": 0.1747988760471344, "rewards/rejected": -0.4230767488479614, "step": 4371 }, { "epoch": 0.6761260390489078, "grad_norm": 5.1410932540893555, "learning_rate": 4.303471187994043e-06, "logits/chosen": 11.954044342041016, "logits/rejected": 10.99544906616211, "logps/chosen": -274.9610595703125, "logps/rejected": -262.96014404296875, "loss": 0.6378, "rewards/accuracies": 0.625, "rewards/chosen": -0.4133809208869934, "rewards/margins": 0.28973305225372314, "rewards/rejected": -0.7031139731407166, "step": 4372 }, { "epoch": 0.6762806881886719, "grad_norm": 4.030061721801758, "learning_rate": 4.30318478634437e-06, "logits/chosen": 7.366610527038574, "logits/rejected": 2.4514148235321045, "logps/chosen": -207.94317626953125, "logps/rejected": -172.33828735351562, "loss": 0.5104, "rewards/accuracies": 0.875, "rewards/chosen": 0.24070709943771362, "rewards/margins": 0.5449966192245483, "rewards/rejected": -0.3042895495891571, "step": 4373 }, { "epoch": 0.6764353373284361, "grad_norm": 4.2293782234191895, "learning_rate": 4.302898384694697e-06, "logits/chosen": 14.109688758850098, "logits/rejected": 12.98661994934082, "logps/chosen": -342.3206787109375, "logps/rejected": -275.4928283691406, "loss": 0.5416, "rewards/accuracies": 0.625, "rewards/chosen": 0.3115765452384949, "rewards/margins": 0.5092012286186218, "rewards/rejected": -0.19762465357780457, "step": 4374 }, { "epoch": 0.6765899864682002, "grad_norm": 5.203789710998535, "learning_rate": 4.302611983045022e-06, "logits/chosen": 9.19174575805664, "logits/rejected": 10.340836524963379, "logps/chosen": -239.3130645751953, "logps/rejected": -267.2409973144531, "loss": 0.6258, "rewards/accuracies": 0.625, "rewards/chosen": -0.1554592251777649, "rewards/margins": 0.17030923068523407, "rewards/rejected": -0.32576844096183777, "step": 4375 }, { "epoch": 0.6767446356079644, "grad_norm": 4.06829309463501, "learning_rate": 4.302325581395349e-06, "logits/chosen": 11.189308166503906, "logits/rejected": 2.3020639419555664, "logps/chosen": -194.34716796875, "logps/rejected": -171.3843231201172, "loss": 0.6629, "rewards/accuracies": 0.5, "rewards/chosen": 0.11470726132392883, "rewards/margins": 0.12832361459732056, "rewards/rejected": -0.013616353273391724, "step": 4376 }, { "epoch": 0.6768992847477285, "grad_norm": 94.50373840332031, "learning_rate": 4.302039179745676e-06, "logits/chosen": 8.068161964416504, "logits/rejected": 11.06473159790039, "logps/chosen": -314.3012390136719, "logps/rejected": -436.3773193359375, "loss": 0.5777, "rewards/accuracies": 0.625, "rewards/chosen": 0.21490192413330078, "rewards/margins": 0.34530109167099, "rewards/rejected": -0.13039913773536682, "step": 4377 }, { "epoch": 0.6770539338874928, "grad_norm": 5.796728134155273, "learning_rate": 4.301752778096002e-06, "logits/chosen": 14.143423080444336, "logits/rejected": 9.135540008544922, "logps/chosen": -281.4481506347656, "logps/rejected": -257.16900634765625, "loss": 0.7034, "rewards/accuracies": 0.375, "rewards/chosen": -0.3008747100830078, "rewards/margins": 0.027569115161895752, "rewards/rejected": -0.32844382524490356, "step": 4378 }, { "epoch": 0.677208583027257, "grad_norm": 5.3141303062438965, "learning_rate": 4.301466376446329e-06, "logits/chosen": 3.780085802078247, "logits/rejected": 6.952800750732422, "logps/chosen": -178.5964813232422, "logps/rejected": -208.16461181640625, "loss": 0.7543, "rewards/accuracies": 0.375, "rewards/chosen": -0.08171045780181885, "rewards/margins": -0.07805463671684265, "rewards/rejected": -0.00365583598613739, "step": 4379 }, { "epoch": 0.6773632321670211, "grad_norm": 6.719045639038086, "learning_rate": 4.301179974796656e-06, "logits/chosen": 6.18869686126709, "logits/rejected": 7.674605846405029, "logps/chosen": -274.08355712890625, "logps/rejected": -256.7762451171875, "loss": 0.6847, "rewards/accuracies": 0.375, "rewards/chosen": -0.4902491569519043, "rewards/margins": 0.09670928120613098, "rewards/rejected": -0.5869584083557129, "step": 4380 }, { "epoch": 0.6775178813067853, "grad_norm": 5.575119495391846, "learning_rate": 4.3008935731469815e-06, "logits/chosen": 8.194402694702148, "logits/rejected": 9.91681957244873, "logps/chosen": -274.889404296875, "logps/rejected": -282.0581970214844, "loss": 0.7807, "rewards/accuracies": 0.625, "rewards/chosen": -0.28673964738845825, "rewards/margins": -0.11491774767637253, "rewards/rejected": -0.17182184755802155, "step": 4381 }, { "epoch": 0.6776725304465494, "grad_norm": 5.479812145233154, "learning_rate": 4.300607171497308e-06, "logits/chosen": 14.414141654968262, "logits/rejected": 13.416187286376953, "logps/chosen": -325.0621032714844, "logps/rejected": -312.00958251953125, "loss": 0.6873, "rewards/accuracies": 0.5, "rewards/chosen": -0.17287693917751312, "rewards/margins": 0.01997271180152893, "rewards/rejected": -0.19284962117671967, "step": 4382 }, { "epoch": 0.6778271795863136, "grad_norm": 5.18855094909668, "learning_rate": 4.300320769847635e-06, "logits/chosen": 11.120960235595703, "logits/rejected": 7.968018531799316, "logps/chosen": -376.40374755859375, "logps/rejected": -296.7601318359375, "loss": 0.6192, "rewards/accuracies": 0.75, "rewards/chosen": 0.49569687247276306, "rewards/margins": 0.2680850923061371, "rewards/rejected": 0.2276117503643036, "step": 4383 }, { "epoch": 0.6779818287260777, "grad_norm": 5.750880241394043, "learning_rate": 4.300034368197961e-06, "logits/chosen": 8.859659194946289, "logits/rejected": 3.8299648761749268, "logps/chosen": -240.78466796875, "logps/rejected": -204.93621826171875, "loss": 0.7089, "rewards/accuracies": 0.5, "rewards/chosen": -0.08001694083213806, "rewards/margins": 0.10315060615539551, "rewards/rejected": -0.18316754698753357, "step": 4384 }, { "epoch": 0.6781364778658419, "grad_norm": 7.315476894378662, "learning_rate": 4.299747966548288e-06, "logits/chosen": 9.85761833190918, "logits/rejected": 10.541047096252441, "logps/chosen": -297.17315673828125, "logps/rejected": -275.2012023925781, "loss": 0.8568, "rewards/accuracies": 0.25, "rewards/chosen": -0.16940820217132568, "rewards/margins": -0.22387093305587769, "rewards/rejected": 0.0544627383351326, "step": 4385 }, { "epoch": 0.678291127005606, "grad_norm": 4.195440769195557, "learning_rate": 4.299461564898615e-06, "logits/chosen": 7.4741082191467285, "logits/rejected": 5.230938911437988, "logps/chosen": -244.97683715820312, "logps/rejected": -198.05111694335938, "loss": 0.4738, "rewards/accuracies": 0.875, "rewards/chosen": 0.15811572968959808, "rewards/margins": 0.5458548665046692, "rewards/rejected": -0.3877390921115875, "step": 4386 }, { "epoch": 0.6784457761453702, "grad_norm": 6.165177822113037, "learning_rate": 4.2991751632489405e-06, "logits/chosen": 7.2447099685668945, "logits/rejected": 5.319435119628906, "logps/chosen": -305.50177001953125, "logps/rejected": -280.21044921875, "loss": 0.8067, "rewards/accuracies": 0.5, "rewards/chosen": 0.07919955253601074, "rewards/margins": 0.008561987429857254, "rewards/rejected": 0.07063756138086319, "step": 4387 }, { "epoch": 0.6786004252851343, "grad_norm": 3.8643977642059326, "learning_rate": 4.298888761599267e-06, "logits/chosen": 6.140326499938965, "logits/rejected": 3.7158539295196533, "logps/chosen": -178.57180786132812, "logps/rejected": -144.42684936523438, "loss": 0.6194, "rewards/accuracies": 0.75, "rewards/chosen": -0.142907053232193, "rewards/margins": 0.19747425615787506, "rewards/rejected": -0.34038129448890686, "step": 4388 }, { "epoch": 0.6787550744248985, "grad_norm": 4.747513294219971, "learning_rate": 4.298602359949594e-06, "logits/chosen": 4.594847679138184, "logits/rejected": 6.825936794281006, "logps/chosen": -285.394287109375, "logps/rejected": -273.9123840332031, "loss": 0.5593, "rewards/accuracies": 0.5, "rewards/chosen": -0.24942153692245483, "rewards/margins": 0.47802218794822693, "rewards/rejected": -0.7274438142776489, "step": 4389 }, { "epoch": 0.6789097235646627, "grad_norm": 8.338217735290527, "learning_rate": 4.2983159582999205e-06, "logits/chosen": 9.88192367553711, "logits/rejected": 12.31782054901123, "logps/chosen": -386.43475341796875, "logps/rejected": -344.5802001953125, "loss": 0.9233, "rewards/accuracies": 0.375, "rewards/chosen": 0.15460683405399323, "rewards/margins": -0.3263300061225891, "rewards/rejected": 0.48093682527542114, "step": 4390 }, { "epoch": 0.6790643727044269, "grad_norm": 6.41644287109375, "learning_rate": 4.298029556650246e-06, "logits/chosen": 4.8790507316589355, "logits/rejected": 6.105672836303711, "logps/chosen": -258.84197998046875, "logps/rejected": -250.13319396972656, "loss": 0.787, "rewards/accuracies": 0.375, "rewards/chosen": -0.020575448870658875, "rewards/margins": 0.12163665890693665, "rewards/rejected": -0.1422121226787567, "step": 4391 }, { "epoch": 0.679219021844191, "grad_norm": 7.0461273193359375, "learning_rate": 4.297743155000573e-06, "logits/chosen": 12.709023475646973, "logits/rejected": 7.563149452209473, "logps/chosen": -283.2870788574219, "logps/rejected": -267.26080322265625, "loss": 0.6535, "rewards/accuracies": 0.5, "rewards/chosen": 0.39243602752685547, "rewards/margins": 0.3009548485279083, "rewards/rejected": 0.09148116409778595, "step": 4392 }, { "epoch": 0.6793736709839552, "grad_norm": 4.431209564208984, "learning_rate": 4.2974567533508996e-06, "logits/chosen": 11.554564476013184, "logits/rejected": 8.213187217712402, "logps/chosen": -231.82083129882812, "logps/rejected": -216.91754150390625, "loss": 0.5243, "rewards/accuracies": 0.75, "rewards/chosen": -0.18199843168258667, "rewards/margins": 0.5338979959487915, "rewards/rejected": -0.7158964276313782, "step": 4393 }, { "epoch": 0.6795283201237193, "grad_norm": 5.199768543243408, "learning_rate": 4.297170351701226e-06, "logits/chosen": 8.7282133102417, "logits/rejected": 7.4432172775268555, "logps/chosen": -211.1273651123047, "logps/rejected": -268.0179138183594, "loss": 0.6277, "rewards/accuracies": 0.625, "rewards/chosen": -0.21734784543514252, "rewards/margins": 0.19855497777462006, "rewards/rejected": -0.4159027934074402, "step": 4394 }, { "epoch": 0.6796829692634835, "grad_norm": 6.761626243591309, "learning_rate": 4.296883950051553e-06, "logits/chosen": 7.202631950378418, "logits/rejected": 9.472408294677734, "logps/chosen": -301.03759765625, "logps/rejected": -423.3046569824219, "loss": 0.6538, "rewards/accuracies": 0.875, "rewards/chosen": -0.02093452960252762, "rewards/margins": 0.2627944052219391, "rewards/rejected": -0.2837289869785309, "step": 4395 }, { "epoch": 0.6798376184032476, "grad_norm": 5.459228515625, "learning_rate": 4.296597548401879e-06, "logits/chosen": 12.33895492553711, "logits/rejected": 9.177013397216797, "logps/chosen": -377.6455078125, "logps/rejected": -290.0875549316406, "loss": 0.6824, "rewards/accuracies": 0.625, "rewards/chosen": -0.047038257122039795, "rewards/margins": 0.1342509239912033, "rewards/rejected": -0.1812891960144043, "step": 4396 }, { "epoch": 0.6799922675430118, "grad_norm": 4.5779852867126465, "learning_rate": 4.296311146752205e-06, "logits/chosen": 14.320616722106934, "logits/rejected": 6.98124885559082, "logps/chosen": -268.4227294921875, "logps/rejected": -244.79818725585938, "loss": 0.633, "rewards/accuracies": 0.5, "rewards/chosen": 0.15360823273658752, "rewards/margins": 0.6148350238800049, "rewards/rejected": -0.46122676134109497, "step": 4397 }, { "epoch": 0.6801469166827759, "grad_norm": 3.5609493255615234, "learning_rate": 4.296024745102532e-06, "logits/chosen": 14.616093635559082, "logits/rejected": 7.603805065155029, "logps/chosen": -328.3641662597656, "logps/rejected": -206.73141479492188, "loss": 0.4058, "rewards/accuracies": 0.875, "rewards/chosen": 0.04530716314911842, "rewards/margins": 0.7912201881408691, "rewards/rejected": -0.745913028717041, "step": 4398 }, { "epoch": 0.6803015658225401, "grad_norm": 4.843770503997803, "learning_rate": 4.295738343452859e-06, "logits/chosen": 8.000632286071777, "logits/rejected": 7.5493364334106445, "logps/chosen": -281.0600891113281, "logps/rejected": -243.34136962890625, "loss": 0.6764, "rewards/accuracies": 0.5, "rewards/chosen": 0.2166353464126587, "rewards/margins": 0.11579858511686325, "rewards/rejected": 0.10083675384521484, "step": 4399 }, { "epoch": 0.6804562149623042, "grad_norm": 5.3286237716674805, "learning_rate": 4.295451941803185e-06, "logits/chosen": 16.993864059448242, "logits/rejected": 11.162191390991211, "logps/chosen": -320.25, "logps/rejected": -208.5397491455078, "loss": 0.6867, "rewards/accuracies": 0.5, "rewards/chosen": -0.1269436776638031, "rewards/margins": 0.10278471559286118, "rewards/rejected": -0.22972841560840607, "step": 4400 }, { "epoch": 0.6806108641020684, "grad_norm": 5.563046932220459, "learning_rate": 4.295165540153511e-06, "logits/chosen": 11.928476333618164, "logits/rejected": 5.195486068725586, "logps/chosen": -244.13023376464844, "logps/rejected": -165.9436492919922, "loss": 0.4953, "rewards/accuracies": 0.75, "rewards/chosen": 0.361529141664505, "rewards/margins": 0.5486453771591187, "rewards/rejected": -0.18711623549461365, "step": 4401 }, { "epoch": 0.6807655132418325, "grad_norm": 6.144144535064697, "learning_rate": 4.294879138503838e-06, "logits/chosen": 14.431041717529297, "logits/rejected": 7.683199882507324, "logps/chosen": -314.7454528808594, "logps/rejected": -218.9053955078125, "loss": 0.7225, "rewards/accuracies": 0.625, "rewards/chosen": -0.1462804675102234, "rewards/margins": 0.009972203522920609, "rewards/rejected": -0.1562526822090149, "step": 4402 }, { "epoch": 0.6809201623815968, "grad_norm": 4.993712425231934, "learning_rate": 4.294592736854164e-06, "logits/chosen": 14.189153671264648, "logits/rejected": 12.22339153289795, "logps/chosen": -332.0534362792969, "logps/rejected": -317.44464111328125, "loss": 0.5557, "rewards/accuracies": 0.625, "rewards/chosen": 0.07819414883852005, "rewards/margins": 0.40176600217819214, "rewards/rejected": -0.3235718607902527, "step": 4403 }, { "epoch": 0.681074811521361, "grad_norm": 5.071861743927002, "learning_rate": 4.294306335204491e-06, "logits/chosen": 17.42917251586914, "logits/rejected": 8.24921703338623, "logps/chosen": -359.1090087890625, "logps/rejected": -306.5159912109375, "loss": 0.5549, "rewards/accuracies": 0.625, "rewards/chosen": 0.2623753547668457, "rewards/margins": 0.3640523850917816, "rewards/rejected": -0.10167703032493591, "step": 4404 }, { "epoch": 0.6812294606611251, "grad_norm": 4.940032958984375, "learning_rate": 4.294019933554818e-06, "logits/chosen": 10.026111602783203, "logits/rejected": 9.074549674987793, "logps/chosen": -206.15184020996094, "logps/rejected": -166.01602172851562, "loss": 0.708, "rewards/accuracies": 0.5, "rewards/chosen": 0.22703701257705688, "rewards/margins": 0.013266075402498245, "rewards/rejected": 0.21377091109752655, "step": 4405 }, { "epoch": 0.6813841098008893, "grad_norm": 4.179180145263672, "learning_rate": 4.293733531905144e-06, "logits/chosen": 11.007497787475586, "logits/rejected": 3.752166509628296, "logps/chosen": -300.860595703125, "logps/rejected": -151.47885131835938, "loss": 0.54, "rewards/accuracies": 0.875, "rewards/chosen": 0.02710948884487152, "rewards/margins": 0.3848724961280823, "rewards/rejected": -0.35776302218437195, "step": 4406 }, { "epoch": 0.6815387589406534, "grad_norm": 5.589725494384766, "learning_rate": 4.293447130255471e-06, "logits/chosen": 7.565913677215576, "logits/rejected": 6.852021217346191, "logps/chosen": -312.3780517578125, "logps/rejected": -295.39202880859375, "loss": 0.6693, "rewards/accuracies": 0.375, "rewards/chosen": -0.07289613038301468, "rewards/margins": 0.2600077688694, "rewards/rejected": -0.3329039216041565, "step": 4407 }, { "epoch": 0.6816934080804176, "grad_norm": 5.711284637451172, "learning_rate": 4.293160728605797e-06, "logits/chosen": 13.750738143920898, "logits/rejected": 9.577877044677734, "logps/chosen": -294.51904296875, "logps/rejected": -234.39547729492188, "loss": 0.734, "rewards/accuracies": 0.625, "rewards/chosen": -0.1550583392381668, "rewards/margins": 0.0804409310221672, "rewards/rejected": -0.2354992926120758, "step": 4408 }, { "epoch": 0.6818480572201817, "grad_norm": 7.570682525634766, "learning_rate": 4.2928743269561234e-06, "logits/chosen": 12.190214157104492, "logits/rejected": 10.80136489868164, "logps/chosen": -347.2962341308594, "logps/rejected": -315.48724365234375, "loss": 0.9836, "rewards/accuracies": 0.25, "rewards/chosen": -0.4769257605075836, "rewards/margins": -0.2722587585449219, "rewards/rejected": -0.20466700196266174, "step": 4409 }, { "epoch": 0.6820027063599459, "grad_norm": 5.898629665374756, "learning_rate": 4.29258792530645e-06, "logits/chosen": 6.665280342102051, "logits/rejected": 14.026094436645508, "logps/chosen": -205.9501953125, "logps/rejected": -351.3404235839844, "loss": 0.6919, "rewards/accuracies": 0.5, "rewards/chosen": -0.2070237100124359, "rewards/margins": 0.20302194356918335, "rewards/rejected": -0.41004565358161926, "step": 4410 }, { "epoch": 0.68215735549971, "grad_norm": 7.967639446258545, "learning_rate": 4.292301523656777e-06, "logits/chosen": 6.183863639831543, "logits/rejected": 5.97257137298584, "logps/chosen": -256.78741455078125, "logps/rejected": -309.83544921875, "loss": 0.6602, "rewards/accuracies": 0.625, "rewards/chosen": -0.3181249499320984, "rewards/margins": 0.19149930775165558, "rewards/rejected": -0.509624183177948, "step": 4411 }, { "epoch": 0.6823120046394742, "grad_norm": 5.440560340881348, "learning_rate": 4.292015122007103e-06, "logits/chosen": 6.339186668395996, "logits/rejected": 6.893258094787598, "logps/chosen": -176.7098388671875, "logps/rejected": -170.19900512695312, "loss": 0.7526, "rewards/accuracies": 0.5, "rewards/chosen": 0.18195104598999023, "rewards/margins": -0.0760570615530014, "rewards/rejected": 0.25800809264183044, "step": 4412 }, { "epoch": 0.6824666537792383, "grad_norm": 4.359745502471924, "learning_rate": 4.29172872035743e-06, "logits/chosen": 14.661651611328125, "logits/rejected": 8.421096801757812, "logps/chosen": -322.3902587890625, "logps/rejected": -249.64199829101562, "loss": 0.4913, "rewards/accuracies": 0.625, "rewards/chosen": 0.26964256167411804, "rewards/margins": 0.714688777923584, "rewards/rejected": -0.4450462758541107, "step": 4413 }, { "epoch": 0.6826213029190025, "grad_norm": 4.002414703369141, "learning_rate": 4.291442318707756e-06, "logits/chosen": 10.898488998413086, "logits/rejected": 3.8738439083099365, "logps/chosen": -347.56207275390625, "logps/rejected": -221.91427612304688, "loss": 0.552, "rewards/accuracies": 0.625, "rewards/chosen": 0.2621403932571411, "rewards/margins": 0.5308627486228943, "rewards/rejected": -0.2687223255634308, "step": 4414 }, { "epoch": 0.6827759520587666, "grad_norm": 4.870889186859131, "learning_rate": 4.2911559170580825e-06, "logits/chosen": 9.475064277648926, "logits/rejected": 4.724165916442871, "logps/chosen": -274.3704528808594, "logps/rejected": -216.2876434326172, "loss": 0.5926, "rewards/accuracies": 0.625, "rewards/chosen": -0.11690820753574371, "rewards/margins": 0.4779534339904785, "rewards/rejected": -0.594861626625061, "step": 4415 }, { "epoch": 0.6829306011985309, "grad_norm": 6.941696643829346, "learning_rate": 4.290869515408409e-06, "logits/chosen": 14.138030052185059, "logits/rejected": 11.562395095825195, "logps/chosen": -292.364990234375, "logps/rejected": -236.08767700195312, "loss": 0.8252, "rewards/accuracies": 0.375, "rewards/chosen": -0.023112758994102478, "rewards/margins": -0.15580081939697266, "rewards/rejected": 0.13268806040287018, "step": 4416 }, { "epoch": 0.683085250338295, "grad_norm": 4.98858642578125, "learning_rate": 4.290583113758736e-06, "logits/chosen": 6.187417984008789, "logits/rejected": 7.744145393371582, "logps/chosen": -261.29656982421875, "logps/rejected": -330.3377685546875, "loss": 0.7879, "rewards/accuracies": 0.375, "rewards/chosen": -0.139823779463768, "rewards/margins": -0.07168054580688477, "rewards/rejected": -0.06814321875572205, "step": 4417 }, { "epoch": 0.6832398994780592, "grad_norm": 5.17976188659668, "learning_rate": 4.2902967121090624e-06, "logits/chosen": 12.808450698852539, "logits/rejected": 9.790185928344727, "logps/chosen": -235.89523315429688, "logps/rejected": -200.4786376953125, "loss": 0.7562, "rewards/accuracies": 0.625, "rewards/chosen": 0.06488892436027527, "rewards/margins": 0.04878142476081848, "rewards/rejected": 0.016107499599456787, "step": 4418 }, { "epoch": 0.6833945486178233, "grad_norm": 7.002490043640137, "learning_rate": 4.290010310459389e-06, "logits/chosen": 9.382917404174805, "logits/rejected": 8.479498863220215, "logps/chosen": -290.6116943359375, "logps/rejected": -265.64361572265625, "loss": 0.7379, "rewards/accuracies": 0.625, "rewards/chosen": -0.21291837096214294, "rewards/margins": 0.1116248369216919, "rewards/rejected": -0.32454317808151245, "step": 4419 }, { "epoch": 0.6835491977575875, "grad_norm": 4.761524677276611, "learning_rate": 4.289723908809716e-06, "logits/chosen": 10.212902069091797, "logits/rejected": 7.367902755737305, "logps/chosen": -253.96875, "logps/rejected": -220.33331298828125, "loss": 0.5865, "rewards/accuracies": 0.75, "rewards/chosen": 0.10504819452762604, "rewards/margins": 0.3236563205718994, "rewards/rejected": -0.21860815584659576, "step": 4420 }, { "epoch": 0.6837038468973516, "grad_norm": 6.993953704833984, "learning_rate": 4.2894375071600415e-06, "logits/chosen": 10.621439933776855, "logits/rejected": 6.5814056396484375, "logps/chosen": -222.0738525390625, "logps/rejected": -284.02703857421875, "loss": 0.5379, "rewards/accuracies": 0.625, "rewards/chosen": 0.2643647789955139, "rewards/margins": 0.5481058955192566, "rewards/rejected": -0.2837411165237427, "step": 4421 }, { "epoch": 0.6838584960371158, "grad_norm": 5.571416854858398, "learning_rate": 4.289151105510368e-06, "logits/chosen": 5.620655059814453, "logits/rejected": 7.134237766265869, "logps/chosen": -260.7199401855469, "logps/rejected": -263.5833435058594, "loss": 0.6576, "rewards/accuracies": 0.625, "rewards/chosen": 0.2956846356391907, "rewards/margins": 0.33304858207702637, "rewards/rejected": -0.03736397251486778, "step": 4422 }, { "epoch": 0.6840131451768799, "grad_norm": 5.099806308746338, "learning_rate": 4.288864703860695e-06, "logits/chosen": 6.836983680725098, "logits/rejected": -2.238745927810669, "logps/chosen": -316.50054931640625, "logps/rejected": -205.2968292236328, "loss": 0.5332, "rewards/accuracies": 0.75, "rewards/chosen": 0.2413993924856186, "rewards/margins": 0.5468438863754272, "rewards/rejected": -0.30544453859329224, "step": 4423 }, { "epoch": 0.6841677943166441, "grad_norm": 8.00323486328125, "learning_rate": 4.2885783022110215e-06, "logits/chosen": 11.033995628356934, "logits/rejected": 9.126618385314941, "logps/chosen": -422.2153625488281, "logps/rejected": -392.935791015625, "loss": 0.7761, "rewards/accuracies": 0.375, "rewards/chosen": 0.015468217432498932, "rewards/margins": -0.04781923443078995, "rewards/rejected": 0.06328745186328888, "step": 4424 }, { "epoch": 0.6843224434564082, "grad_norm": 4.680246353149414, "learning_rate": 4.288291900561347e-06, "logits/chosen": 11.454498291015625, "logits/rejected": 5.082576751708984, "logps/chosen": -283.6060485839844, "logps/rejected": -178.2688751220703, "loss": 0.6548, "rewards/accuracies": 0.5, "rewards/chosen": -0.08155956119298935, "rewards/margins": 0.22759394347667694, "rewards/rejected": -0.3091534972190857, "step": 4425 }, { "epoch": 0.6844770925961724, "grad_norm": 4.370017051696777, "learning_rate": 4.288005498911674e-06, "logits/chosen": 11.999460220336914, "logits/rejected": 8.618998527526855, "logps/chosen": -218.4921417236328, "logps/rejected": -193.80384826660156, "loss": 0.5025, "rewards/accuracies": 0.625, "rewards/chosen": 0.30304670333862305, "rewards/margins": 0.7256160974502563, "rewards/rejected": -0.4225694537162781, "step": 4426 }, { "epoch": 0.6846317417359365, "grad_norm": 3.3437304496765137, "learning_rate": 4.287719097262001e-06, "logits/chosen": 15.351802825927734, "logits/rejected": 8.893656730651855, "logps/chosen": -237.2900390625, "logps/rejected": -188.38681030273438, "loss": 0.3938, "rewards/accuracies": 0.75, "rewards/chosen": 0.5936269760131836, "rewards/margins": 0.8723742365837097, "rewards/rejected": -0.27874720096588135, "step": 4427 }, { "epoch": 0.6847863908757007, "grad_norm": 6.830431938171387, "learning_rate": 4.287432695612327e-06, "logits/chosen": 6.708425045013428, "logits/rejected": 5.250683784484863, "logps/chosen": -221.34515380859375, "logps/rejected": -232.97952270507812, "loss": 0.7828, "rewards/accuracies": 0.375, "rewards/chosen": -0.323251336812973, "rewards/margins": -0.041125714778900146, "rewards/rejected": -0.28212565183639526, "step": 4428 }, { "epoch": 0.684941040015465, "grad_norm": 3.639462471008301, "learning_rate": 4.287146293962653e-06, "logits/chosen": 9.48283576965332, "logits/rejected": 2.6893997192382812, "logps/chosen": -297.8948974609375, "logps/rejected": -135.90972900390625, "loss": 0.5344, "rewards/accuracies": 0.75, "rewards/chosen": 0.2145722359418869, "rewards/margins": 0.47692328691482544, "rewards/rejected": -0.26235103607177734, "step": 4429 }, { "epoch": 0.6850956891552291, "grad_norm": 7.06432580947876, "learning_rate": 4.28685989231298e-06, "logits/chosen": 10.755661964416504, "logits/rejected": 11.947458267211914, "logps/chosen": -287.0971374511719, "logps/rejected": -265.18096923828125, "loss": 0.8944, "rewards/accuracies": 0.25, "rewards/chosen": 0.08469399809837341, "rewards/margins": -0.3074387311935425, "rewards/rejected": 0.3921326994895935, "step": 4430 }, { "epoch": 0.6852503382949933, "grad_norm": 5.600857734680176, "learning_rate": 4.286573490663306e-06, "logits/chosen": 9.202146530151367, "logits/rejected": -0.888355553150177, "logps/chosen": -481.6180419921875, "logps/rejected": -187.26589965820312, "loss": 0.7872, "rewards/accuracies": 0.375, "rewards/chosen": -0.10225935280323029, "rewards/margins": 0.029102392494678497, "rewards/rejected": -0.13136178255081177, "step": 4431 }, { "epoch": 0.6854049874347574, "grad_norm": 6.241583824157715, "learning_rate": 4.286287089013633e-06, "logits/chosen": 10.553300857543945, "logits/rejected": 3.8909707069396973, "logps/chosen": -351.89794921875, "logps/rejected": -247.68299865722656, "loss": 0.792, "rewards/accuracies": 0.375, "rewards/chosen": 0.15536661446094513, "rewards/margins": -0.10992631316184998, "rewards/rejected": 0.2652929425239563, "step": 4432 }, { "epoch": 0.6855596365745216, "grad_norm": 6.158325672149658, "learning_rate": 4.28600068736396e-06, "logits/chosen": 10.460204124450684, "logits/rejected": 7.212018013000488, "logps/chosen": -250.78732299804688, "logps/rejected": -222.10556030273438, "loss": 0.6019, "rewards/accuracies": 0.5, "rewards/chosen": 0.1877039074897766, "rewards/margins": 0.23720739781856537, "rewards/rejected": -0.049503520131111145, "step": 4433 }, { "epoch": 0.6857142857142857, "grad_norm": 3.5610086917877197, "learning_rate": 4.2857142857142855e-06, "logits/chosen": 9.52037525177002, "logits/rejected": 6.160451412200928, "logps/chosen": -260.72686767578125, "logps/rejected": -179.30490112304688, "loss": 0.479, "rewards/accuracies": 0.625, "rewards/chosen": 0.18300504982471466, "rewards/margins": 0.7173242568969727, "rewards/rejected": -0.5343192219734192, "step": 4434 }, { "epoch": 0.6858689348540499, "grad_norm": 4.747145652770996, "learning_rate": 4.285427884064612e-06, "logits/chosen": 16.444198608398438, "logits/rejected": 6.017789840698242, "logps/chosen": -362.38812255859375, "logps/rejected": -233.85443115234375, "loss": 0.5094, "rewards/accuracies": 0.875, "rewards/chosen": 0.3864232897758484, "rewards/margins": 0.47597113251686096, "rewards/rejected": -0.08954782783985138, "step": 4435 }, { "epoch": 0.686023583993814, "grad_norm": 5.268344402313232, "learning_rate": 4.285141482414939e-06, "logits/chosen": 4.472221374511719, "logits/rejected": 10.768880844116211, "logps/chosen": -242.9765167236328, "logps/rejected": -352.6317138671875, "loss": 0.6258, "rewards/accuracies": 0.75, "rewards/chosen": 0.19073772430419922, "rewards/margins": 0.6525869369506836, "rewards/rejected": -0.4618492126464844, "step": 4436 }, { "epoch": 0.6861782331335782, "grad_norm": 6.1478352546691895, "learning_rate": 4.284855080765265e-06, "logits/chosen": 10.997254371643066, "logits/rejected": 7.679874420166016, "logps/chosen": -301.6500549316406, "logps/rejected": -271.2589111328125, "loss": 0.7048, "rewards/accuracies": 0.75, "rewards/chosen": -0.1435386687517166, "rewards/margins": 0.09836465120315552, "rewards/rejected": -0.24190330505371094, "step": 4437 }, { "epoch": 0.6863328822733423, "grad_norm": 6.086324214935303, "learning_rate": 4.284568679115592e-06, "logits/chosen": 11.764872550964355, "logits/rejected": 11.78868579864502, "logps/chosen": -252.14654541015625, "logps/rejected": -254.66934204101562, "loss": 0.7777, "rewards/accuracies": 0.625, "rewards/chosen": -0.23934301733970642, "rewards/margins": 0.22395600378513336, "rewards/rejected": -0.4632989764213562, "step": 4438 }, { "epoch": 0.6864875314131065, "grad_norm": 6.6365861892700195, "learning_rate": 4.284282277465919e-06, "logits/chosen": 5.342966079711914, "logits/rejected": 6.390328407287598, "logps/chosen": -190.566162109375, "logps/rejected": -236.94354248046875, "loss": 0.9509, "rewards/accuracies": 0.5, "rewards/chosen": -0.2988263964653015, "rewards/margins": -0.24544697999954224, "rewards/rejected": -0.05337939411401749, "step": 4439 }, { "epoch": 0.6866421805528706, "grad_norm": 3.7090659141540527, "learning_rate": 4.283995875816245e-06, "logits/chosen": 12.999027252197266, "logits/rejected": 6.8252997398376465, "logps/chosen": -150.48828125, "logps/rejected": -133.07794189453125, "loss": 0.619, "rewards/accuracies": 0.625, "rewards/chosen": -0.18166394531726837, "rewards/margins": 0.28471317887306213, "rewards/rejected": -0.4663771390914917, "step": 4440 }, { "epoch": 0.6867968296926348, "grad_norm": 3.3590099811553955, "learning_rate": 4.283709474166571e-06, "logits/chosen": 9.365459442138672, "logits/rejected": 8.170974731445312, "logps/chosen": -225.04531860351562, "logps/rejected": -187.85997009277344, "loss": 0.5743, "rewards/accuracies": 0.5, "rewards/chosen": -0.06690728664398193, "rewards/margins": 0.5250653624534607, "rewards/rejected": -0.5919726490974426, "step": 4441 }, { "epoch": 0.686951478832399, "grad_norm": 12.276659965515137, "learning_rate": 4.283423072516898e-06, "logits/chosen": 11.259847640991211, "logits/rejected": 3.8195507526397705, "logps/chosen": -296.73492431640625, "logps/rejected": -193.91964721679688, "loss": 0.4528, "rewards/accuracies": 0.875, "rewards/chosen": 0.16350306570529938, "rewards/margins": 0.7431578636169434, "rewards/rejected": -0.5796548128128052, "step": 4442 }, { "epoch": 0.6871061279721632, "grad_norm": 9.124277114868164, "learning_rate": 4.2831366708672245e-06, "logits/chosen": 12.566473960876465, "logits/rejected": 6.26070499420166, "logps/chosen": -391.4123840332031, "logps/rejected": -286.1532287597656, "loss": 0.7108, "rewards/accuracies": 0.625, "rewards/chosen": 0.1961219757795334, "rewards/margins": 0.22890543937683105, "rewards/rejected": -0.03278341144323349, "step": 4443 }, { "epoch": 0.6872607771119273, "grad_norm": 7.293119430541992, "learning_rate": 4.282850269217551e-06, "logits/chosen": 11.97053050994873, "logits/rejected": 10.036218643188477, "logps/chosen": -310.6858825683594, "logps/rejected": -325.09039306640625, "loss": 0.8062, "rewards/accuracies": 0.5, "rewards/chosen": 0.24150556325912476, "rewards/margins": -0.12163146585226059, "rewards/rejected": 0.36313700675964355, "step": 4444 }, { "epoch": 0.6874154262516915, "grad_norm": 5.329139232635498, "learning_rate": 4.282563867567878e-06, "logits/chosen": 7.069852828979492, "logits/rejected": 13.204593658447266, "logps/chosen": -260.12432861328125, "logps/rejected": -290.69818115234375, "loss": 0.6738, "rewards/accuracies": 0.75, "rewards/chosen": 0.2295803725719452, "rewards/margins": 0.1481151580810547, "rewards/rejected": 0.08146519958972931, "step": 4445 }, { "epoch": 0.6875700753914556, "grad_norm": 5.738361835479736, "learning_rate": 4.282277465918204e-06, "logits/chosen": 11.869058609008789, "logits/rejected": 3.2904181480407715, "logps/chosen": -269.22430419921875, "logps/rejected": -232.68667602539062, "loss": 0.679, "rewards/accuracies": 0.375, "rewards/chosen": 0.06167583167552948, "rewards/margins": 0.1215328648686409, "rewards/rejected": -0.05985704064369202, "step": 4446 }, { "epoch": 0.6877247245312198, "grad_norm": 6.327104568481445, "learning_rate": 4.28199106426853e-06, "logits/chosen": 11.759507179260254, "logits/rejected": 8.659193992614746, "logps/chosen": -330.90045166015625, "logps/rejected": -312.1015625, "loss": 0.6748, "rewards/accuracies": 0.625, "rewards/chosen": -0.029327020049095154, "rewards/margins": 0.07938119024038315, "rewards/rejected": -0.1087082028388977, "step": 4447 }, { "epoch": 0.687879373670984, "grad_norm": 6.324320316314697, "learning_rate": 4.281704662618857e-06, "logits/chosen": 6.9144415855407715, "logits/rejected": 6.886053085327148, "logps/chosen": -243.34698486328125, "logps/rejected": -234.27764892578125, "loss": 0.6951, "rewards/accuracies": 0.625, "rewards/chosen": 0.057009320706129074, "rewards/margins": 0.036147456616163254, "rewards/rejected": 0.020861878991127014, "step": 4448 }, { "epoch": 0.6880340228107481, "grad_norm": 6.772681713104248, "learning_rate": 4.2814182609691835e-06, "logits/chosen": 9.581363677978516, "logits/rejected": 9.872903823852539, "logps/chosen": -346.2281494140625, "logps/rejected": -332.0546875, "loss": 0.6641, "rewards/accuracies": 0.625, "rewards/chosen": 0.1287352740764618, "rewards/margins": 0.15438660979270935, "rewards/rejected": -0.025651350617408752, "step": 4449 }, { "epoch": 0.6881886719505123, "grad_norm": 5.153995037078857, "learning_rate": 4.28113185931951e-06, "logits/chosen": 6.044857978820801, "logits/rejected": -0.5891554355621338, "logps/chosen": -210.30276489257812, "logps/rejected": -151.76376342773438, "loss": 0.6379, "rewards/accuracies": 0.625, "rewards/chosen": 0.22512489557266235, "rewards/margins": 0.2831946015357971, "rewards/rejected": -0.058069705963134766, "step": 4450 }, { "epoch": 0.6883433210902764, "grad_norm": 3.6788439750671387, "learning_rate": 4.280845457669837e-06, "logits/chosen": 16.677631378173828, "logits/rejected": 10.06670093536377, "logps/chosen": -215.3868865966797, "logps/rejected": -158.34532165527344, "loss": 0.5268, "rewards/accuracies": 0.75, "rewards/chosen": 0.26501116156578064, "rewards/margins": 0.5089883208274841, "rewards/rejected": -0.24397718906402588, "step": 4451 }, { "epoch": 0.6884979702300406, "grad_norm": 7.015074729919434, "learning_rate": 4.2805590560201635e-06, "logits/chosen": 9.428455352783203, "logits/rejected": 11.390767097473145, "logps/chosen": -304.6409606933594, "logps/rejected": -299.46624755859375, "loss": 0.6543, "rewards/accuracies": 0.75, "rewards/chosen": 0.45304667949676514, "rewards/margins": 0.22036659717559814, "rewards/rejected": 0.232680082321167, "step": 4452 }, { "epoch": 0.6886526193698047, "grad_norm": 5.854787826538086, "learning_rate": 4.28027265437049e-06, "logits/chosen": 15.361802101135254, "logits/rejected": 9.837018013000488, "logps/chosen": -243.88218688964844, "logps/rejected": -166.57855224609375, "loss": 0.6084, "rewards/accuracies": 0.5, "rewards/chosen": 0.18128342926502228, "rewards/margins": 0.23061849176883698, "rewards/rejected": -0.0493350476026535, "step": 4453 }, { "epoch": 0.688807268509569, "grad_norm": 5.793326377868652, "learning_rate": 4.279986252720816e-06, "logits/chosen": 6.582826614379883, "logits/rejected": 6.552923202514648, "logps/chosen": -222.7197265625, "logps/rejected": -258.70953369140625, "loss": 0.5229, "rewards/accuracies": 0.75, "rewards/chosen": 0.3430105149745941, "rewards/margins": 0.5741848349571228, "rewards/rejected": -0.2311742752790451, "step": 4454 }, { "epoch": 0.6889619176493331, "grad_norm": 5.287814140319824, "learning_rate": 4.279699851071143e-06, "logits/chosen": 10.842239379882812, "logits/rejected": 4.468028545379639, "logps/chosen": -346.85540771484375, "logps/rejected": -279.16961669921875, "loss": 0.6007, "rewards/accuracies": 0.625, "rewards/chosen": 0.3884417414665222, "rewards/margins": 0.32020464539527893, "rewards/rejected": 0.06823711097240448, "step": 4455 }, { "epoch": 0.6891165667890973, "grad_norm": 3.6705517768859863, "learning_rate": 4.279413449421469e-06, "logits/chosen": 11.041379928588867, "logits/rejected": 5.593526840209961, "logps/chosen": -283.5278625488281, "logps/rejected": -227.07577514648438, "loss": 0.465, "rewards/accuracies": 0.75, "rewards/chosen": 0.21017169952392578, "rewards/margins": 0.7060978412628174, "rewards/rejected": -0.4959261417388916, "step": 4456 }, { "epoch": 0.6892712159288614, "grad_norm": 4.125463485717773, "learning_rate": 4.279127047771796e-06, "logits/chosen": 12.102180480957031, "logits/rejected": 6.286378383636475, "logps/chosen": -289.98516845703125, "logps/rejected": -212.45899963378906, "loss": 0.5931, "rewards/accuracies": 0.75, "rewards/chosen": 0.48257529735565186, "rewards/margins": 0.3367319107055664, "rewards/rejected": 0.14584335684776306, "step": 4457 }, { "epoch": 0.6894258650686256, "grad_norm": 4.032677173614502, "learning_rate": 4.2788406461221225e-06, "logits/chosen": 8.035992622375488, "logits/rejected": 3.0893349647521973, "logps/chosen": -298.1551818847656, "logps/rejected": -204.30711364746094, "loss": 0.5505, "rewards/accuracies": 0.625, "rewards/chosen": 0.5656207203865051, "rewards/margins": 0.3655545115470886, "rewards/rejected": 0.2000662088394165, "step": 4458 }, { "epoch": 0.6895805142083897, "grad_norm": 4.442121505737305, "learning_rate": 4.278554244472448e-06, "logits/chosen": 10.545802116394043, "logits/rejected": 8.706689834594727, "logps/chosen": -173.07183837890625, "logps/rejected": -166.60781860351562, "loss": 0.645, "rewards/accuracies": 0.75, "rewards/chosen": 0.2074240744113922, "rewards/margins": 0.12810516357421875, "rewards/rejected": 0.07931890338659286, "step": 4459 }, { "epoch": 0.6897351633481539, "grad_norm": 8.730128288269043, "learning_rate": 4.278267842822775e-06, "logits/chosen": 11.596196174621582, "logits/rejected": 6.471994876861572, "logps/chosen": -425.67608642578125, "logps/rejected": -317.4320983886719, "loss": 0.7719, "rewards/accuracies": 0.5, "rewards/chosen": 0.0530029833316803, "rewards/margins": -0.06280592083930969, "rewards/rejected": 0.11580891162157059, "step": 4460 }, { "epoch": 0.689889812487918, "grad_norm": 13.567523956298828, "learning_rate": 4.277981441173102e-06, "logits/chosen": 9.756856918334961, "logits/rejected": 6.920470237731934, "logps/chosen": -253.2501220703125, "logps/rejected": -212.18373107910156, "loss": 0.6299, "rewards/accuracies": 0.75, "rewards/chosen": 0.4337814450263977, "rewards/margins": 0.19981250166893005, "rewards/rejected": 0.23396892845630646, "step": 4461 }, { "epoch": 0.6900444616276822, "grad_norm": 11.470845222473145, "learning_rate": 4.277695039523428e-06, "logits/chosen": 5.811100006103516, "logits/rejected": 9.205923080444336, "logps/chosen": -353.174560546875, "logps/rejected": -362.16778564453125, "loss": 0.5478, "rewards/accuracies": 0.875, "rewards/chosen": 0.5000346899032593, "rewards/margins": 0.4216889441013336, "rewards/rejected": 0.07834573835134506, "step": 4462 }, { "epoch": 0.6901991107674463, "grad_norm": 3.72464919090271, "learning_rate": 4.277408637873754e-06, "logits/chosen": 16.846162796020508, "logits/rejected": 5.138298034667969, "logps/chosen": -342.4397888183594, "logps/rejected": -146.86964416503906, "loss": 0.4916, "rewards/accuracies": 0.625, "rewards/chosen": 0.09391441941261292, "rewards/margins": 0.794842541217804, "rewards/rejected": -0.7009280920028687, "step": 4463 }, { "epoch": 0.6903537599072105, "grad_norm": 5.917123317718506, "learning_rate": 4.277122236224081e-06, "logits/chosen": 8.068735122680664, "logits/rejected": 8.310803413391113, "logps/chosen": -179.04954528808594, "logps/rejected": -166.29823303222656, "loss": 0.6464, "rewards/accuracies": 0.375, "rewards/chosen": 6.006285548210144e-05, "rewards/margins": 0.1988905966281891, "rewards/rejected": -0.19883054494857788, "step": 4464 }, { "epoch": 0.6905084090469746, "grad_norm": 4.440203666687012, "learning_rate": 4.276835834574407e-06, "logits/chosen": 11.373984336853027, "logits/rejected": 0.045694708824157715, "logps/chosen": -251.7533416748047, "logps/rejected": -210.32533264160156, "loss": 0.4293, "rewards/accuracies": 0.75, "rewards/chosen": -0.00803305208683014, "rewards/margins": 0.9182306528091431, "rewards/rejected": -0.9262637495994568, "step": 4465 }, { "epoch": 0.6906630581867388, "grad_norm": 6.657516956329346, "learning_rate": 4.276549432924734e-06, "logits/chosen": 9.383832931518555, "logits/rejected": 7.182356834411621, "logps/chosen": -292.13641357421875, "logps/rejected": -170.91891479492188, "loss": 0.8599, "rewards/accuracies": 0.375, "rewards/chosen": -0.3271772265434265, "rewards/margins": -0.16560105979442596, "rewards/rejected": -0.16157618165016174, "step": 4466 }, { "epoch": 0.690817707326503, "grad_norm": 5.258181571960449, "learning_rate": 4.27626303127506e-06, "logits/chosen": 10.15674114227295, "logits/rejected": 6.404476642608643, "logps/chosen": -233.07278442382812, "logps/rejected": -255.77577209472656, "loss": 0.5231, "rewards/accuracies": 0.875, "rewards/chosen": 0.1105189397931099, "rewards/margins": 0.45449209213256836, "rewards/rejected": -0.34397315979003906, "step": 4467 }, { "epoch": 0.6909723564662672, "grad_norm": 5.729298114776611, "learning_rate": 4.2759766296253865e-06, "logits/chosen": 16.626935958862305, "logits/rejected": 12.685599327087402, "logps/chosen": -341.749267578125, "logps/rejected": -254.06082153320312, "loss": 0.6758, "rewards/accuracies": 0.625, "rewards/chosen": -0.12199517339468002, "rewards/margins": 0.13312430679798126, "rewards/rejected": -0.2551194727420807, "step": 4468 }, { "epoch": 0.6911270056060314, "grad_norm": 3.8000991344451904, "learning_rate": 4.275690227975713e-06, "logits/chosen": 10.480460166931152, "logits/rejected": 4.779432773590088, "logps/chosen": -187.3704376220703, "logps/rejected": -131.68356323242188, "loss": 0.542, "rewards/accuracies": 0.875, "rewards/chosen": 0.16134941577911377, "rewards/margins": 0.3910316228866577, "rewards/rejected": -0.22968220710754395, "step": 4469 }, { "epoch": 0.6912816547457955, "grad_norm": 4.3030500411987305, "learning_rate": 4.27540382632604e-06, "logits/chosen": 8.15811824798584, "logits/rejected": 7.014081954956055, "logps/chosen": -230.70645141601562, "logps/rejected": -187.0926513671875, "loss": 0.5585, "rewards/accuracies": 0.625, "rewards/chosen": 0.24688367545604706, "rewards/margins": 0.33801594376564026, "rewards/rejected": -0.091132253408432, "step": 4470 }, { "epoch": 0.6914363038855597, "grad_norm": 5.325830459594727, "learning_rate": 4.2751174246763664e-06, "logits/chosen": 11.083613395690918, "logits/rejected": 4.8147172927856445, "logps/chosen": -370.3558044433594, "logps/rejected": -254.15090942382812, "loss": 0.5852, "rewards/accuracies": 0.75, "rewards/chosen": 0.4289541244506836, "rewards/margins": 0.2925434708595276, "rewards/rejected": 0.136410653591156, "step": 4471 }, { "epoch": 0.6915909530253238, "grad_norm": 5.956872940063477, "learning_rate": 4.274831023026693e-06, "logits/chosen": 9.191061019897461, "logits/rejected": 8.84345817565918, "logps/chosen": -521.2496337890625, "logps/rejected": -432.1521301269531, "loss": 0.5759, "rewards/accuracies": 0.625, "rewards/chosen": 0.8666969537734985, "rewards/margins": 0.32008469104766846, "rewards/rejected": 0.5466122627258301, "step": 4472 }, { "epoch": 0.691745602165088, "grad_norm": 5.084835529327393, "learning_rate": 4.27454462137702e-06, "logits/chosen": 14.329225540161133, "logits/rejected": 16.042705535888672, "logps/chosen": -234.87545776367188, "logps/rejected": -271.8675842285156, "loss": 0.6391, "rewards/accuracies": 0.5, "rewards/chosen": 0.2103392779827118, "rewards/margins": 0.3674933612346649, "rewards/rejected": -0.15715409815311432, "step": 4473 }, { "epoch": 0.6919002513048521, "grad_norm": 5.046330451965332, "learning_rate": 4.2742582197273455e-06, "logits/chosen": 9.555638313293457, "logits/rejected": 6.8802809715271, "logps/chosen": -372.1163330078125, "logps/rejected": -311.79962158203125, "loss": 0.5371, "rewards/accuracies": 0.625, "rewards/chosen": 0.6100231409072876, "rewards/margins": 0.39819854497909546, "rewards/rejected": 0.21182462573051453, "step": 4474 }, { "epoch": 0.6920549004446163, "grad_norm": 4.712859630584717, "learning_rate": 4.273971818077672e-06, "logits/chosen": 5.801811695098877, "logits/rejected": 9.827659606933594, "logps/chosen": -122.18645477294922, "logps/rejected": -168.16775512695312, "loss": 0.857, "rewards/accuracies": 0.125, "rewards/chosen": -0.10234571993350983, "rewards/margins": -0.26679888367652893, "rewards/rejected": 0.1644531786441803, "step": 4475 }, { "epoch": 0.6922095495843804, "grad_norm": 4.788932800292969, "learning_rate": 4.273685416427999e-06, "logits/chosen": 6.0025129318237305, "logits/rejected": 6.455308437347412, "logps/chosen": -168.61260986328125, "logps/rejected": -158.684814453125, "loss": 0.6885, "rewards/accuracies": 0.5, "rewards/chosen": 0.12361730635166168, "rewards/margins": 0.22754812240600586, "rewards/rejected": -0.10393082350492477, "step": 4476 }, { "epoch": 0.6923641987241446, "grad_norm": 10.367257118225098, "learning_rate": 4.2733990147783255e-06, "logits/chosen": 6.191953182220459, "logits/rejected": 2.2458343505859375, "logps/chosen": -314.0201721191406, "logps/rejected": -335.0791015625, "loss": 0.6075, "rewards/accuracies": 0.75, "rewards/chosen": 0.3049100637435913, "rewards/margins": 0.3066188395023346, "rewards/rejected": -0.0017087459564208984, "step": 4477 }, { "epoch": 0.6925188478639087, "grad_norm": 5.876046657562256, "learning_rate": 4.273112613128652e-06, "logits/chosen": 7.488229751586914, "logits/rejected": 5.924365997314453, "logps/chosen": -311.04901123046875, "logps/rejected": -267.79742431640625, "loss": 0.7319, "rewards/accuracies": 0.5, "rewards/chosen": 0.31675636768341064, "rewards/margins": -0.012081734836101532, "rewards/rejected": 0.3288381099700928, "step": 4478 }, { "epoch": 0.6926734970036729, "grad_norm": 6.2538533210754395, "learning_rate": 4.272826211478979e-06, "logits/chosen": 8.13729190826416, "logits/rejected": 9.317706108093262, "logps/chosen": -293.9158020019531, "logps/rejected": -305.23419189453125, "loss": 0.6176, "rewards/accuracies": 0.5, "rewards/chosen": 0.32759976387023926, "rewards/margins": 0.29027268290519714, "rewards/rejected": 0.03732709586620331, "step": 4479 }, { "epoch": 0.6928281461434371, "grad_norm": 5.943842887878418, "learning_rate": 4.272539809829305e-06, "logits/chosen": 8.336824417114258, "logits/rejected": 7.882235527038574, "logps/chosen": -238.84783935546875, "logps/rejected": -273.76824951171875, "loss": 0.6015, "rewards/accuracies": 0.625, "rewards/chosen": -0.0010039806365966797, "rewards/margins": 0.2987978756427765, "rewards/rejected": -0.29980188608169556, "step": 4480 }, { "epoch": 0.6929827952832013, "grad_norm": 5.471213340759277, "learning_rate": 4.272253408179631e-06, "logits/chosen": 11.127744674682617, "logits/rejected": 5.473817825317383, "logps/chosen": -279.04754638671875, "logps/rejected": -292.2149658203125, "loss": 0.5747, "rewards/accuracies": 0.625, "rewards/chosen": 0.398656964302063, "rewards/margins": 0.31592652201652527, "rewards/rejected": 0.0827304795384407, "step": 4481 }, { "epoch": 0.6931374444229654, "grad_norm": 4.88809871673584, "learning_rate": 4.271967006529958e-06, "logits/chosen": 3.76755428314209, "logits/rejected": 2.7136266231536865, "logps/chosen": -179.02857971191406, "logps/rejected": -233.4971466064453, "loss": 0.556, "rewards/accuracies": 0.875, "rewards/chosen": 0.5539631843566895, "rewards/margins": 0.3540407419204712, "rewards/rejected": 0.19992247223854065, "step": 4482 }, { "epoch": 0.6932920935627296, "grad_norm": 5.8448591232299805, "learning_rate": 4.2716806048802846e-06, "logits/chosen": 13.463512420654297, "logits/rejected": 10.485298156738281, "logps/chosen": -413.3531494140625, "logps/rejected": -336.475341796875, "loss": 0.6576, "rewards/accuracies": 0.75, "rewards/chosen": 0.48680782318115234, "rewards/margins": 0.15102580189704895, "rewards/rejected": 0.3357820510864258, "step": 4483 }, { "epoch": 0.6934467427024937, "grad_norm": 4.641660213470459, "learning_rate": 4.271394203230611e-06, "logits/chosen": 11.000204086303711, "logits/rejected": 10.270837783813477, "logps/chosen": -292.2807922363281, "logps/rejected": -249.6904296875, "loss": 0.5762, "rewards/accuracies": 0.75, "rewards/chosen": 0.3724250793457031, "rewards/margins": 0.3019684851169586, "rewards/rejected": 0.0704566091299057, "step": 4484 }, { "epoch": 0.6936013918422579, "grad_norm": 4.448966979980469, "learning_rate": 4.271107801580938e-06, "logits/chosen": 10.849218368530273, "logits/rejected": 11.506805419921875, "logps/chosen": -271.62860107421875, "logps/rejected": -260.63311767578125, "loss": 0.5912, "rewards/accuracies": 0.625, "rewards/chosen": 0.29378825426101685, "rewards/margins": 0.4633154571056366, "rewards/rejected": -0.16952726244926453, "step": 4485 }, { "epoch": 0.693756040982022, "grad_norm": 5.159413814544678, "learning_rate": 4.2708213999312645e-06, "logits/chosen": -0.9197348356246948, "logits/rejected": 3.000253200531006, "logps/chosen": -192.28976440429688, "logps/rejected": -220.23855590820312, "loss": 0.769, "rewards/accuracies": 0.5, "rewards/chosen": -0.26910415291786194, "rewards/margins": 0.04591580480337143, "rewards/rejected": -0.31501996517181396, "step": 4486 }, { "epoch": 0.6939106901217862, "grad_norm": 6.626440525054932, "learning_rate": 4.27053499828159e-06, "logits/chosen": 6.265573501586914, "logits/rejected": 8.110580444335938, "logps/chosen": -260.4603271484375, "logps/rejected": -301.66131591796875, "loss": 0.884, "rewards/accuracies": 0.125, "rewards/chosen": 0.009654007852077484, "rewards/margins": -0.33339405059814453, "rewards/rejected": 0.3430480659008026, "step": 4487 }, { "epoch": 0.6940653392615503, "grad_norm": 4.198341369628906, "learning_rate": 4.270248596631917e-06, "logits/chosen": 12.62332534790039, "logits/rejected": 7.45077657699585, "logps/chosen": -217.82925415039062, "logps/rejected": -155.8576202392578, "loss": 0.6712, "rewards/accuracies": 0.625, "rewards/chosen": 0.432839035987854, "rewards/margins": 0.26763468980789185, "rewards/rejected": 0.16520433127880096, "step": 4488 }, { "epoch": 0.6942199884013145, "grad_norm": 4.832276821136475, "learning_rate": 4.269962194982244e-06, "logits/chosen": 10.574823379516602, "logits/rejected": 7.657534599304199, "logps/chosen": -199.66836547851562, "logps/rejected": -145.17208862304688, "loss": 0.6694, "rewards/accuracies": 0.625, "rewards/chosen": 0.13170365989208221, "rewards/margins": 0.12167917191982269, "rewards/rejected": 0.010024495422840118, "step": 4489 }, { "epoch": 0.6943746375410786, "grad_norm": 4.152619361877441, "learning_rate": 4.26967579333257e-06, "logits/chosen": 8.35746955871582, "logits/rejected": 5.2645721435546875, "logps/chosen": -326.18804931640625, "logps/rejected": -267.47222900390625, "loss": 0.5617, "rewards/accuracies": 0.625, "rewards/chosen": 0.5245493650436401, "rewards/margins": 0.3482816517353058, "rewards/rejected": 0.17626763880252838, "step": 4490 }, { "epoch": 0.6945292866808428, "grad_norm": 6.027318954467773, "learning_rate": 4.269389391682897e-06, "logits/chosen": 6.32731294631958, "logits/rejected": 7.404786109924316, "logps/chosen": -227.11331176757812, "logps/rejected": -283.74835205078125, "loss": 0.7601, "rewards/accuracies": 0.5, "rewards/chosen": 0.4080234467983246, "rewards/margins": -0.027156665921211243, "rewards/rejected": 0.4351801574230194, "step": 4491 }, { "epoch": 0.694683935820607, "grad_norm": 6.324120044708252, "learning_rate": 4.2691029900332236e-06, "logits/chosen": 8.313152313232422, "logits/rejected": 8.947222709655762, "logps/chosen": -255.2548828125, "logps/rejected": -286.03387451171875, "loss": 0.6289, "rewards/accuracies": 0.75, "rewards/chosen": 0.09388574957847595, "rewards/margins": 0.21506360173225403, "rewards/rejected": -0.12117785215377808, "step": 4492 }, { "epoch": 0.6948385849603712, "grad_norm": 4.870259761810303, "learning_rate": 4.268816588383549e-06, "logits/chosen": 7.434970378875732, "logits/rejected": 2.9332218170166016, "logps/chosen": -305.2400207519531, "logps/rejected": -239.05392456054688, "loss": 0.5754, "rewards/accuracies": 0.625, "rewards/chosen": 0.18469566106796265, "rewards/margins": 0.3774864673614502, "rewards/rejected": -0.19279080629348755, "step": 4493 }, { "epoch": 0.6949932341001354, "grad_norm": 6.050650596618652, "learning_rate": 4.268530186733876e-06, "logits/chosen": 8.938138008117676, "logits/rejected": 7.8709540367126465, "logps/chosen": -270.3529052734375, "logps/rejected": -231.974365234375, "loss": 0.6593, "rewards/accuracies": 0.875, "rewards/chosen": 0.2741074562072754, "rewards/margins": 0.08563751727342606, "rewards/rejected": 0.18846994638442993, "step": 4494 }, { "epoch": 0.6951478832398995, "grad_norm": 6.4017229080200195, "learning_rate": 4.268243785084203e-06, "logits/chosen": 14.082723617553711, "logits/rejected": 9.132933616638184, "logps/chosen": -396.4774169921875, "logps/rejected": -386.0800476074219, "loss": 0.6833, "rewards/accuracies": 0.625, "rewards/chosen": 0.35264596343040466, "rewards/margins": 0.24040357768535614, "rewards/rejected": 0.11224241554737091, "step": 4495 }, { "epoch": 0.6953025323796637, "grad_norm": 5.366425037384033, "learning_rate": 4.267957383434529e-06, "logits/chosen": 6.996615409851074, "logits/rejected": 4.5047478675842285, "logps/chosen": -385.19329833984375, "logps/rejected": -153.43960571289062, "loss": 0.4978, "rewards/accuracies": 0.75, "rewards/chosen": 0.7135995030403137, "rewards/margins": 0.548667848110199, "rewards/rejected": 0.16493159532546997, "step": 4496 }, { "epoch": 0.6954571815194278, "grad_norm": 5.529112815856934, "learning_rate": 4.267670981784855e-06, "logits/chosen": 8.390474319458008, "logits/rejected": 9.9286527633667, "logps/chosen": -238.40325927734375, "logps/rejected": -242.73394775390625, "loss": 0.6086, "rewards/accuracies": 0.75, "rewards/chosen": 0.18575145304203033, "rewards/margins": 0.2877410352230072, "rewards/rejected": -0.10198955237865448, "step": 4497 }, { "epoch": 0.695611830659192, "grad_norm": 5.971905708312988, "learning_rate": 4.267384580135182e-06, "logits/chosen": 13.26982307434082, "logits/rejected": 10.678499221801758, "logps/chosen": -267.1767578125, "logps/rejected": -237.23423767089844, "loss": 0.8925, "rewards/accuracies": 0.25, "rewards/chosen": 0.26273947954177856, "rewards/margins": -0.15909171104431152, "rewards/rejected": 0.4218311905860901, "step": 4498 }, { "epoch": 0.6957664797989561, "grad_norm": 4.168835163116455, "learning_rate": 4.267098178485508e-06, "logits/chosen": 9.130517959594727, "logits/rejected": 3.54677677154541, "logps/chosen": -227.73214721679688, "logps/rejected": -138.7923583984375, "loss": 0.4899, "rewards/accuracies": 0.875, "rewards/chosen": 0.4243078827857971, "rewards/margins": 0.5874795317649841, "rewards/rejected": -0.1631716787815094, "step": 4499 }, { "epoch": 0.6959211289387203, "grad_norm": 6.117710590362549, "learning_rate": 4.266811776835835e-06, "logits/chosen": 12.560050964355469, "logits/rejected": 11.859695434570312, "logps/chosen": -182.50404357910156, "logps/rejected": -286.2406311035156, "loss": 0.6277, "rewards/accuracies": 0.625, "rewards/chosen": 0.17282512784004211, "rewards/margins": 0.18937042355537415, "rewards/rejected": -0.01654529571533203, "step": 4500 }, { "epoch": 0.6960757780784844, "grad_norm": 5.799213409423828, "learning_rate": 4.266525375186161e-06, "logits/chosen": 11.16567611694336, "logits/rejected": 7.803576469421387, "logps/chosen": -298.4626159667969, "logps/rejected": -217.97315979003906, "loss": 0.6286, "rewards/accuracies": 0.625, "rewards/chosen": 0.3227972090244293, "rewards/margins": 0.20572544634342194, "rewards/rejected": 0.11707176268100739, "step": 4501 }, { "epoch": 0.6962304272182486, "grad_norm": 7.603994369506836, "learning_rate": 4.2662389735364875e-06, "logits/chosen": 6.379083633422852, "logits/rejected": 3.5388684272766113, "logps/chosen": -316.0343933105469, "logps/rejected": -299.93084716796875, "loss": 0.7439, "rewards/accuracies": 0.625, "rewards/chosen": 0.15197044610977173, "rewards/margins": 0.04690173268318176, "rewards/rejected": 0.10506869852542877, "step": 4502 }, { "epoch": 0.6963850763580127, "grad_norm": 6.077985763549805, "learning_rate": 4.265952571886814e-06, "logits/chosen": 6.600677490234375, "logits/rejected": 15.022510528564453, "logps/chosen": -209.51133728027344, "logps/rejected": -285.92681884765625, "loss": 0.8151, "rewards/accuracies": 0.375, "rewards/chosen": 0.0385466068983078, "rewards/margins": -0.17701105773448944, "rewards/rejected": 0.21555766463279724, "step": 4503 }, { "epoch": 0.6965397254977769, "grad_norm": 4.445544719696045, "learning_rate": 4.265666170237141e-06, "logits/chosen": 6.029338836669922, "logits/rejected": 7.2780375480651855, "logps/chosen": -214.21994018554688, "logps/rejected": -277.39996337890625, "loss": 0.5956, "rewards/accuracies": 0.625, "rewards/chosen": 0.15856796503067017, "rewards/margins": 0.385037362575531, "rewards/rejected": -0.22646939754486084, "step": 4504 }, { "epoch": 0.696694374637541, "grad_norm": 6.936688423156738, "learning_rate": 4.2653797685874675e-06, "logits/chosen": 4.115750789642334, "logits/rejected": 7.8610334396362305, "logps/chosen": -236.58029174804688, "logps/rejected": -278.53924560546875, "loss": 0.7073, "rewards/accuracies": 0.5, "rewards/chosen": 0.24960748851299286, "rewards/margins": 0.03195124864578247, "rewards/rejected": 0.2176562249660492, "step": 4505 }, { "epoch": 0.6968490237773053, "grad_norm": 4.63679313659668, "learning_rate": 4.265093366937793e-06, "logits/chosen": 14.686271667480469, "logits/rejected": 15.974813461303711, "logps/chosen": -236.45315551757812, "logps/rejected": -291.4210510253906, "loss": 0.5889, "rewards/accuracies": 0.625, "rewards/chosen": 0.4108043909072876, "rewards/margins": 0.3886268734931946, "rewards/rejected": 0.02217748761177063, "step": 4506 }, { "epoch": 0.6970036729170694, "grad_norm": 4.628255367279053, "learning_rate": 4.26480696528812e-06, "logits/chosen": 9.792828559875488, "logits/rejected": 6.230591297149658, "logps/chosen": -218.79173278808594, "logps/rejected": -169.16140747070312, "loss": 0.5344, "rewards/accuracies": 0.625, "rewards/chosen": 0.30802667140960693, "rewards/margins": 0.5733361840248108, "rewards/rejected": -0.26530954241752625, "step": 4507 }, { "epoch": 0.6971583220568336, "grad_norm": 3.8382160663604736, "learning_rate": 4.264520563638447e-06, "logits/chosen": 12.786280632019043, "logits/rejected": 3.418998956680298, "logps/chosen": -283.6490173339844, "logps/rejected": -180.78756713867188, "loss": 0.5065, "rewards/accuracies": 0.875, "rewards/chosen": 0.210466668009758, "rewards/margins": 0.4462009072303772, "rewards/rejected": -0.235734224319458, "step": 4508 }, { "epoch": 0.6973129711965977, "grad_norm": 5.651285171508789, "learning_rate": 4.264234161988773e-06, "logits/chosen": 14.744607925415039, "logits/rejected": 10.569001197814941, "logps/chosen": -315.6968688964844, "logps/rejected": -268.34527587890625, "loss": 0.7555, "rewards/accuracies": 0.625, "rewards/chosen": 0.14891844987869263, "rewards/margins": 0.03636118769645691, "rewards/rejected": 0.11255726218223572, "step": 4509 }, { "epoch": 0.6974676203363619, "grad_norm": 3.8856687545776367, "learning_rate": 4.2639477603391e-06, "logits/chosen": 13.814919471740723, "logits/rejected": 7.502386093139648, "logps/chosen": -260.0505065917969, "logps/rejected": -218.2507781982422, "loss": 0.4109, "rewards/accuracies": 1.0, "rewards/chosen": 0.7865480780601501, "rewards/margins": 0.7240950465202332, "rewards/rejected": 0.06245298683643341, "step": 4510 }, { "epoch": 0.697622269476126, "grad_norm": 7.40568733215332, "learning_rate": 4.2636613586894265e-06, "logits/chosen": 12.888557434082031, "logits/rejected": 8.701112747192383, "logps/chosen": -299.17144775390625, "logps/rejected": -253.7444305419922, "loss": 0.8301, "rewards/accuracies": 0.375, "rewards/chosen": 0.061250388622283936, "rewards/margins": -0.17605458199977875, "rewards/rejected": 0.2373049557209015, "step": 4511 }, { "epoch": 0.6977769186158902, "grad_norm": 4.889257907867432, "learning_rate": 4.263374957039753e-06, "logits/chosen": 7.409084796905518, "logits/rejected": 4.115428924560547, "logps/chosen": -305.44793701171875, "logps/rejected": -267.0001220703125, "loss": 0.5515, "rewards/accuracies": 0.625, "rewards/chosen": 0.3631269931793213, "rewards/margins": 0.39390960335731506, "rewards/rejected": -0.030782606452703476, "step": 4512 }, { "epoch": 0.6979315677556543, "grad_norm": 5.9372053146362305, "learning_rate": 4.263088555390079e-06, "logits/chosen": 14.701896667480469, "logits/rejected": 10.373376846313477, "logps/chosen": -351.3394775390625, "logps/rejected": -372.42669677734375, "loss": 0.729, "rewards/accuracies": 0.375, "rewards/chosen": 0.4987856149673462, "rewards/margins": 0.013319596648216248, "rewards/rejected": 0.48546600341796875, "step": 4513 }, { "epoch": 0.6980862168954185, "grad_norm": 16.87580680847168, "learning_rate": 4.262802153740406e-06, "logits/chosen": 7.63148832321167, "logits/rejected": 11.962331771850586, "logps/chosen": -214.32266235351562, "logps/rejected": -246.63441467285156, "loss": 0.5951, "rewards/accuracies": 0.625, "rewards/chosen": 0.1629994958639145, "rewards/margins": 0.31202423572540283, "rewards/rejected": -0.14902472496032715, "step": 4514 }, { "epoch": 0.6982408660351827, "grad_norm": 10.542787551879883, "learning_rate": 4.262515752090732e-06, "logits/chosen": 11.416427612304688, "logits/rejected": 10.310189247131348, "logps/chosen": -337.01336669921875, "logps/rejected": -369.6172790527344, "loss": 0.857, "rewards/accuracies": 0.5, "rewards/chosen": 0.20953340828418732, "rewards/margins": -0.14943121373653412, "rewards/rejected": 0.35896462202072144, "step": 4515 }, { "epoch": 0.6983955151749468, "grad_norm": 4.263949871063232, "learning_rate": 4.262229350441059e-06, "logits/chosen": 7.161815643310547, "logits/rejected": 5.741499900817871, "logps/chosen": -192.8360137939453, "logps/rejected": -168.21011352539062, "loss": 0.6509, "rewards/accuracies": 0.625, "rewards/chosen": 0.5079464912414551, "rewards/margins": 0.29513001441955566, "rewards/rejected": 0.21281644701957703, "step": 4516 }, { "epoch": 0.698550164314711, "grad_norm": 4.787829399108887, "learning_rate": 4.261942948791386e-06, "logits/chosen": 13.464323043823242, "logits/rejected": 7.8164381980896, "logps/chosen": -182.98135375976562, "logps/rejected": -145.70376586914062, "loss": 0.6729, "rewards/accuracies": 0.625, "rewards/chosen": 0.16753850877285004, "rewards/margins": 0.1046389490365982, "rewards/rejected": 0.06289957463741302, "step": 4517 }, { "epoch": 0.6987048134544751, "grad_norm": 5.972153663635254, "learning_rate": 4.261656547141712e-06, "logits/chosen": 7.715356826782227, "logits/rejected": 11.312923431396484, "logps/chosen": -270.59625244140625, "logps/rejected": -294.0712890625, "loss": 0.7713, "rewards/accuracies": 0.375, "rewards/chosen": 0.25535401701927185, "rewards/margins": -0.048941247165203094, "rewards/rejected": 0.30429530143737793, "step": 4518 }, { "epoch": 0.6988594625942394, "grad_norm": 5.7866363525390625, "learning_rate": 4.261370145492039e-06, "logits/chosen": 10.942037582397461, "logits/rejected": 16.701414108276367, "logps/chosen": -205.97909545898438, "logps/rejected": -247.9434356689453, "loss": 0.7537, "rewards/accuracies": 0.5, "rewards/chosen": -0.05205276608467102, "rewards/margins": 0.06624981015920639, "rewards/rejected": -0.1183025985956192, "step": 4519 }, { "epoch": 0.6990141117340035, "grad_norm": 5.484710216522217, "learning_rate": 4.261083743842365e-06, "logits/chosen": 8.549579620361328, "logits/rejected": 7.959667205810547, "logps/chosen": -248.42132568359375, "logps/rejected": -236.71444702148438, "loss": 0.7051, "rewards/accuracies": 0.25, "rewards/chosen": 0.25760984420776367, "rewards/margins": 0.07971373200416565, "rewards/rejected": 0.17789611220359802, "step": 4520 }, { "epoch": 0.6991687608737677, "grad_norm": 5.9600300788879395, "learning_rate": 4.260797342192691e-06, "logits/chosen": 13.78126049041748, "logits/rejected": 3.591838836669922, "logps/chosen": -281.55450439453125, "logps/rejected": -188.72952270507812, "loss": 0.5198, "rewards/accuracies": 0.5, "rewards/chosen": 0.16513997316360474, "rewards/margins": 0.6938029527664185, "rewards/rejected": -0.5286629796028137, "step": 4521 }, { "epoch": 0.6993234100135318, "grad_norm": 4.384075164794922, "learning_rate": 4.260510940543018e-06, "logits/chosen": 7.3213210105896, "logits/rejected": 0.5768899917602539, "logps/chosen": -242.21971130371094, "logps/rejected": -195.26075744628906, "loss": 0.6667, "rewards/accuracies": 0.5, "rewards/chosen": 0.41109585762023926, "rewards/margins": 0.12469722330570221, "rewards/rejected": 0.28639861941337585, "step": 4522 }, { "epoch": 0.699478059153296, "grad_norm": 5.961554050445557, "learning_rate": 4.260224538893345e-06, "logits/chosen": 10.137752532958984, "logits/rejected": 11.230949401855469, "logps/chosen": -345.00958251953125, "logps/rejected": -326.57818603515625, "loss": 0.715, "rewards/accuracies": 0.5, "rewards/chosen": 0.24702347815036774, "rewards/margins": 0.18671254813671112, "rewards/rejected": 0.060310930013656616, "step": 4523 }, { "epoch": 0.6996327082930601, "grad_norm": 5.912920951843262, "learning_rate": 4.259938137243671e-06, "logits/chosen": 9.817709922790527, "logits/rejected": 2.472628593444824, "logps/chosen": -267.9261779785156, "logps/rejected": -233.56130981445312, "loss": 0.7176, "rewards/accuracies": 0.375, "rewards/chosen": 0.2913530468940735, "rewards/margins": 0.18615540862083435, "rewards/rejected": 0.10519766807556152, "step": 4524 }, { "epoch": 0.6997873574328243, "grad_norm": 8.732048988342285, "learning_rate": 4.259651735593998e-06, "logits/chosen": 5.436592102050781, "logits/rejected": 8.967561721801758, "logps/chosen": -207.32400512695312, "logps/rejected": -276.990966796875, "loss": 1.112, "rewards/accuracies": 0.5, "rewards/chosen": -0.25100183486938477, "rewards/margins": -0.5311140418052673, "rewards/rejected": 0.28011220693588257, "step": 4525 }, { "epoch": 0.6999420065725884, "grad_norm": 7.210418701171875, "learning_rate": 4.259365333944324e-06, "logits/chosen": 11.290397644042969, "logits/rejected": 6.048158168792725, "logps/chosen": -329.5198974609375, "logps/rejected": -248.43145751953125, "loss": 0.699, "rewards/accuracies": 0.5, "rewards/chosen": 0.6017400622367859, "rewards/margins": 0.13417525589466095, "rewards/rejected": 0.4675648510456085, "step": 4526 }, { "epoch": 0.7000966557123526, "grad_norm": 4.823277950286865, "learning_rate": 4.25907893229465e-06, "logits/chosen": 14.772256851196289, "logits/rejected": 4.461668968200684, "logps/chosen": -449.2216796875, "logps/rejected": -301.8907165527344, "loss": 0.407, "rewards/accuracies": 0.875, "rewards/chosen": 0.6074482202529907, "rewards/margins": 0.7500501871109009, "rewards/rejected": -0.14260196685791016, "step": 4527 }, { "epoch": 0.7002513048521167, "grad_norm": 10.850847244262695, "learning_rate": 4.258792530644977e-06, "logits/chosen": 9.387843132019043, "logits/rejected": 15.987394332885742, "logps/chosen": -233.4283447265625, "logps/rejected": -317.0649719238281, "loss": 0.9578, "rewards/accuracies": 0.625, "rewards/chosen": -0.3869436979293823, "rewards/margins": -0.13617311418056488, "rewards/rejected": -0.25077056884765625, "step": 4528 }, { "epoch": 0.7004059539918809, "grad_norm": 6.553378582000732, "learning_rate": 4.258506128995304e-06, "logits/chosen": 9.833511352539062, "logits/rejected": 9.896178245544434, "logps/chosen": -260.8777770996094, "logps/rejected": -245.5105743408203, "loss": 0.8856, "rewards/accuracies": 0.25, "rewards/chosen": 0.5739567875862122, "rewards/margins": -0.2990604043006897, "rewards/rejected": 0.8730171918869019, "step": 4529 }, { "epoch": 0.700560603131645, "grad_norm": 5.9011054039001465, "learning_rate": 4.25821972734563e-06, "logits/chosen": 10.517602920532227, "logits/rejected": 11.96645450592041, "logps/chosen": -261.49993896484375, "logps/rejected": -333.6334533691406, "loss": 0.6194, "rewards/accuracies": 0.75, "rewards/chosen": 0.6023586392402649, "rewards/margins": 0.22985060513019562, "rewards/rejected": 0.37250804901123047, "step": 4530 }, { "epoch": 0.7007152522714093, "grad_norm": 8.298968315124512, "learning_rate": 4.257933325695956e-06, "logits/chosen": 5.801590919494629, "logits/rejected": 4.023548603057861, "logps/chosen": -252.943603515625, "logps/rejected": -258.43560791015625, "loss": 0.6664, "rewards/accuracies": 0.625, "rewards/chosen": 0.34438368678092957, "rewards/margins": 0.33953070640563965, "rewards/rejected": 0.004852950572967529, "step": 4531 }, { "epoch": 0.7008699014111734, "grad_norm": 5.263658046722412, "learning_rate": 4.257646924046283e-06, "logits/chosen": 11.740427017211914, "logits/rejected": 3.867795944213867, "logps/chosen": -329.8431701660156, "logps/rejected": -238.7526092529297, "loss": 0.4305, "rewards/accuracies": 0.75, "rewards/chosen": 0.477817565202713, "rewards/margins": 0.7964357137680054, "rewards/rejected": -0.31861817836761475, "step": 4532 }, { "epoch": 0.7010245505509376, "grad_norm": 4.682222366333008, "learning_rate": 4.2573605223966095e-06, "logits/chosen": 12.433629989624023, "logits/rejected": 4.082302570343018, "logps/chosen": -290.85711669921875, "logps/rejected": -162.19224548339844, "loss": 0.6679, "rewards/accuracies": 0.625, "rewards/chosen": 0.2377374768257141, "rewards/margins": 0.061184413731098175, "rewards/rejected": 0.17655307054519653, "step": 4533 }, { "epoch": 0.7011791996907017, "grad_norm": 5.300076961517334, "learning_rate": 4.257074120746936e-06, "logits/chosen": 5.017838954925537, "logits/rejected": 6.487076759338379, "logps/chosen": -194.14794921875, "logps/rejected": -207.20458984375, "loss": 0.7196, "rewards/accuracies": 0.625, "rewards/chosen": 0.4330114424228668, "rewards/margins": 0.04724682867527008, "rewards/rejected": 0.38576459884643555, "step": 4534 }, { "epoch": 0.7013338488304659, "grad_norm": 3.788860321044922, "learning_rate": 4.256787719097262e-06, "logits/chosen": 12.992966651916504, "logits/rejected": 11.000411033630371, "logps/chosen": -218.46665954589844, "logps/rejected": -216.50485229492188, "loss": 0.5025, "rewards/accuracies": 0.875, "rewards/chosen": 0.5065352916717529, "rewards/margins": 0.5044621229171753, "rewards/rejected": 0.0020730923861265182, "step": 4535 }, { "epoch": 0.70148849797023, "grad_norm": 6.75756311416626, "learning_rate": 4.2565013174475886e-06, "logits/chosen": 7.504397869110107, "logits/rejected": 8.28226375579834, "logps/chosen": -173.804443359375, "logps/rejected": -207.50404357910156, "loss": 0.7555, "rewards/accuracies": 0.5, "rewards/chosen": 0.4218415319919586, "rewards/margins": -0.07979655265808105, "rewards/rejected": 0.5016380548477173, "step": 4536 }, { "epoch": 0.7016431471099942, "grad_norm": 26.790225982666016, "learning_rate": 4.256214915797915e-06, "logits/chosen": 9.217204093933105, "logits/rejected": 14.140243530273438, "logps/chosen": -297.89923095703125, "logps/rejected": -343.83428955078125, "loss": 0.6641, "rewards/accuracies": 0.625, "rewards/chosen": 0.45408692955970764, "rewards/margins": 0.1876131147146225, "rewards/rejected": 0.26647382974624634, "step": 4537 }, { "epoch": 0.7017977962497584, "grad_norm": 6.070600509643555, "learning_rate": 4.255928514148242e-06, "logits/chosen": 12.289417266845703, "logits/rejected": 8.979168891906738, "logps/chosen": -249.48367309570312, "logps/rejected": -179.6365966796875, "loss": 0.7263, "rewards/accuracies": 0.625, "rewards/chosen": 0.29100480675697327, "rewards/margins": 0.11057285964488983, "rewards/rejected": 0.18043194711208344, "step": 4538 }, { "epoch": 0.7019524453895225, "grad_norm": 7.463078022003174, "learning_rate": 4.255642112498568e-06, "logits/chosen": 12.173538208007812, "logits/rejected": 12.051815032958984, "logps/chosen": -382.2598876953125, "logps/rejected": -344.8181457519531, "loss": 0.6769, "rewards/accuracies": 0.5, "rewards/chosen": 0.2353130429983139, "rewards/margins": 0.3194710910320282, "rewards/rejected": -0.0841580480337143, "step": 4539 }, { "epoch": 0.7021070945292867, "grad_norm": 4.894156455993652, "learning_rate": 4.255355710848894e-06, "logits/chosen": 8.359127044677734, "logits/rejected": 7.2022600173950195, "logps/chosen": -262.2000427246094, "logps/rejected": -276.2204284667969, "loss": 0.5581, "rewards/accuracies": 0.75, "rewards/chosen": 0.7289108037948608, "rewards/margins": 0.3931763768196106, "rewards/rejected": 0.33573436737060547, "step": 4540 }, { "epoch": 0.7022617436690508, "grad_norm": 9.222636222839355, "learning_rate": 4.255069309199221e-06, "logits/chosen": 8.670086860656738, "logits/rejected": 12.20486831665039, "logps/chosen": -280.91339111328125, "logps/rejected": -349.40313720703125, "loss": 0.9708, "rewards/accuracies": 0.25, "rewards/chosen": -0.18760338425636292, "rewards/margins": -0.36386236548423767, "rewards/rejected": 0.17625896632671356, "step": 4541 }, { "epoch": 0.702416392808815, "grad_norm": 4.30269193649292, "learning_rate": 4.254782907549548e-06, "logits/chosen": 12.84939956665039, "logits/rejected": 13.319472312927246, "logps/chosen": -176.6269989013672, "logps/rejected": -174.7431182861328, "loss": 0.6821, "rewards/accuracies": 0.625, "rewards/chosen": 0.3207028806209564, "rewards/margins": 0.09814807772636414, "rewards/rejected": 0.2225547879934311, "step": 4542 }, { "epoch": 0.7025710419485791, "grad_norm": 4.3327555656433105, "learning_rate": 4.254496505899874e-06, "logits/chosen": 11.346357345581055, "logits/rejected": 8.260499954223633, "logps/chosen": -236.2779083251953, "logps/rejected": -185.6102294921875, "loss": 0.4655, "rewards/accuracies": 0.875, "rewards/chosen": 0.32453033328056335, "rewards/margins": 0.6208864450454712, "rewards/rejected": -0.29635608196258545, "step": 4543 }, { "epoch": 0.7027256910883434, "grad_norm": 4.27791690826416, "learning_rate": 4.254210104250201e-06, "logits/chosen": 11.70185375213623, "logits/rejected": 5.190775394439697, "logps/chosen": -261.27142333984375, "logps/rejected": -179.78782653808594, "loss": 0.5447, "rewards/accuracies": 0.625, "rewards/chosen": 0.45501822233200073, "rewards/margins": 0.3926829397678375, "rewards/rejected": 0.0623352974653244, "step": 4544 }, { "epoch": 0.7028803402281075, "grad_norm": 6.794124126434326, "learning_rate": 4.2539237026005276e-06, "logits/chosen": 7.268523693084717, "logits/rejected": 3.076155424118042, "logps/chosen": -360.5343322753906, "logps/rejected": -230.9642791748047, "loss": 0.5687, "rewards/accuracies": 0.75, "rewards/chosen": 0.7111493349075317, "rewards/margins": 0.31361597776412964, "rewards/rejected": 0.3975334167480469, "step": 4545 }, { "epoch": 0.7030349893678717, "grad_norm": 3.671812057495117, "learning_rate": 4.253637300950853e-06, "logits/chosen": 11.911089897155762, "logits/rejected": 10.885863304138184, "logps/chosen": -102.51800537109375, "logps/rejected": -132.78990173339844, "loss": 0.6537, "rewards/accuracies": 0.75, "rewards/chosen": -0.05699014663696289, "rewards/margins": 0.1282472014427185, "rewards/rejected": -0.1852373331785202, "step": 4546 }, { "epoch": 0.7031896385076358, "grad_norm": 5.970956325531006, "learning_rate": 4.25335089930118e-06, "logits/chosen": 13.8966064453125, "logits/rejected": 10.703203201293945, "logps/chosen": -326.9024658203125, "logps/rejected": -328.2403564453125, "loss": 0.5502, "rewards/accuracies": 0.625, "rewards/chosen": 0.3544917106628418, "rewards/margins": 0.4477842152118683, "rewards/rejected": -0.09329252690076828, "step": 4547 }, { "epoch": 0.7033442876474, "grad_norm": 5.157553195953369, "learning_rate": 4.253064497651507e-06, "logits/chosen": 6.360335350036621, "logits/rejected": 6.4123311042785645, "logps/chosen": -350.803955078125, "logps/rejected": -315.10003662109375, "loss": 0.6344, "rewards/accuracies": 0.625, "rewards/chosen": 0.4565875232219696, "rewards/margins": 0.1502254605293274, "rewards/rejected": 0.3063620626926422, "step": 4548 }, { "epoch": 0.7034989367871641, "grad_norm": 7.361348628997803, "learning_rate": 4.252778096001833e-06, "logits/chosen": 9.062346458435059, "logits/rejected": 7.906216621398926, "logps/chosen": -260.3416748046875, "logps/rejected": -255.54330444335938, "loss": 0.7359, "rewards/accuracies": 0.5, "rewards/chosen": 0.2632283866405487, "rewards/margins": -0.004096284508705139, "rewards/rejected": 0.26732465624809265, "step": 4549 }, { "epoch": 0.7036535859269283, "grad_norm": 4.224150657653809, "learning_rate": 4.25249169435216e-06, "logits/chosen": 8.379314422607422, "logits/rejected": 9.338112831115723, "logps/chosen": -161.53048706054688, "logps/rejected": -202.90817260742188, "loss": 0.6719, "rewards/accuracies": 0.625, "rewards/chosen": 0.30146777629852295, "rewards/margins": 0.0955372303724289, "rewards/rejected": 0.20593053102493286, "step": 4550 }, { "epoch": 0.7038082350666924, "grad_norm": 5.291877269744873, "learning_rate": 4.252205292702487e-06, "logits/chosen": 14.643348693847656, "logits/rejected": 12.800247192382812, "logps/chosen": -372.36785888671875, "logps/rejected": -353.1993408203125, "loss": 0.6063, "rewards/accuracies": 0.75, "rewards/chosen": 0.5627739429473877, "rewards/margins": 0.23615151643753052, "rewards/rejected": 0.3266223669052124, "step": 4551 }, { "epoch": 0.7039628842064566, "grad_norm": 5.587003231048584, "learning_rate": 4.251918891052813e-06, "logits/chosen": 6.755594253540039, "logits/rejected": 9.65949535369873, "logps/chosen": -193.87896728515625, "logps/rejected": -224.65139770507812, "loss": 0.7177, "rewards/accuracies": 0.625, "rewards/chosen": 0.28870445489883423, "rewards/margins": -0.018207117915153503, "rewards/rejected": 0.3069115877151489, "step": 4552 }, { "epoch": 0.7041175333462207, "grad_norm": 3.8572256565093994, "learning_rate": 4.251632489403139e-06, "logits/chosen": 4.550288200378418, "logits/rejected": 5.042516708374023, "logps/chosen": -166.97640991210938, "logps/rejected": -166.8123321533203, "loss": 0.6115, "rewards/accuracies": 0.625, "rewards/chosen": 0.15429887175559998, "rewards/margins": 0.5061265230178833, "rewards/rejected": -0.3518276810646057, "step": 4553 }, { "epoch": 0.7042721824859849, "grad_norm": 5.494017124176025, "learning_rate": 4.251346087753466e-06, "logits/chosen": 10.74178409576416, "logits/rejected": 10.703782081604004, "logps/chosen": -271.8233642578125, "logps/rejected": -284.4439697265625, "loss": 0.6403, "rewards/accuracies": 0.375, "rewards/chosen": 0.10947389155626297, "rewards/margins": 0.2876623272895813, "rewards/rejected": -0.17818844318389893, "step": 4554 }, { "epoch": 0.704426831625749, "grad_norm": 3.6626198291778564, "learning_rate": 4.251059686103792e-06, "logits/chosen": 10.443584442138672, "logits/rejected": 9.946053504943848, "logps/chosen": -169.6202392578125, "logps/rejected": -165.19149780273438, "loss": 0.5786, "rewards/accuracies": 0.625, "rewards/chosen": 0.1825583428144455, "rewards/margins": 0.416953444480896, "rewards/rejected": -0.23439514636993408, "step": 4555 }, { "epoch": 0.7045814807655132, "grad_norm": 4.254640579223633, "learning_rate": 4.250773284454119e-06, "logits/chosen": 11.594375610351562, "logits/rejected": 2.9958701133728027, "logps/chosen": -288.03485107421875, "logps/rejected": -138.9364471435547, "loss": 0.5742, "rewards/accuracies": 0.625, "rewards/chosen": 0.16672661900520325, "rewards/margins": 0.3213419020175934, "rewards/rejected": -0.15461528301239014, "step": 4556 }, { "epoch": 0.7047361299052775, "grad_norm": 6.709202289581299, "learning_rate": 4.250486882804446e-06, "logits/chosen": 10.963682174682617, "logits/rejected": 13.264007568359375, "logps/chosen": -294.2309265136719, "logps/rejected": -347.676025390625, "loss": 0.8937, "rewards/accuracies": 0.125, "rewards/chosen": 0.11398278176784515, "rewards/margins": -0.3024635314941406, "rewards/rejected": 0.4164462983608246, "step": 4557 }, { "epoch": 0.7048907790450416, "grad_norm": 6.53480339050293, "learning_rate": 4.250200481154772e-06, "logits/chosen": 9.597980499267578, "logits/rejected": 2.7294387817382812, "logps/chosen": -363.687255859375, "logps/rejected": -285.1516418457031, "loss": 0.6744, "rewards/accuracies": 0.625, "rewards/chosen": 0.15482865273952484, "rewards/margins": 0.08394481241703033, "rewards/rejected": 0.0708838552236557, "step": 4558 }, { "epoch": 0.7050454281848058, "grad_norm": 5.466675758361816, "learning_rate": 4.249914079505098e-06, "logits/chosen": 4.729236602783203, "logits/rejected": 5.2614569664001465, "logps/chosen": -213.840576171875, "logps/rejected": -232.79869079589844, "loss": 0.7956, "rewards/accuracies": 0.5, "rewards/chosen": 0.06657245010137558, "rewards/margins": -0.09675539284944534, "rewards/rejected": 0.16332782804965973, "step": 4559 }, { "epoch": 0.7052000773245699, "grad_norm": 6.556961536407471, "learning_rate": 4.249627677855425e-06, "logits/chosen": 8.17802619934082, "logits/rejected": 5.780283451080322, "logps/chosen": -362.2308654785156, "logps/rejected": -322.38104248046875, "loss": 0.667, "rewards/accuracies": 0.375, "rewards/chosen": 0.36667710542678833, "rewards/margins": 0.12676569819450378, "rewards/rejected": 0.23991142213344574, "step": 4560 }, { "epoch": 0.7053547264643341, "grad_norm": 14.223852157592773, "learning_rate": 4.2493412762057514e-06, "logits/chosen": 9.246179580688477, "logits/rejected": 7.893303871154785, "logps/chosen": -296.88665771484375, "logps/rejected": -242.22293090820312, "loss": 0.7423, "rewards/accuracies": 0.625, "rewards/chosen": -0.22583410143852234, "rewards/margins": 0.11306314170360565, "rewards/rejected": -0.3388972282409668, "step": 4561 }, { "epoch": 0.7055093756040982, "grad_norm": 6.777234077453613, "learning_rate": 4.249054874556078e-06, "logits/chosen": 10.873583793640137, "logits/rejected": 5.482688903808594, "logps/chosen": -371.5252990722656, "logps/rejected": -287.93756103515625, "loss": 0.5071, "rewards/accuracies": 0.875, "rewards/chosen": 0.38150864839553833, "rewards/margins": 0.5027434825897217, "rewards/rejected": -0.12123481929302216, "step": 4562 }, { "epoch": 0.7056640247438624, "grad_norm": 6.712673187255859, "learning_rate": 4.248768472906405e-06, "logits/chosen": 14.129240989685059, "logits/rejected": 8.121871948242188, "logps/chosen": -416.73236083984375, "logps/rejected": -246.70632934570312, "loss": 0.8318, "rewards/accuracies": 0.5, "rewards/chosen": -0.06838780641555786, "rewards/margins": -0.18072742223739624, "rewards/rejected": 0.11233963072299957, "step": 4563 }, { "epoch": 0.7058186738836265, "grad_norm": 5.995640754699707, "learning_rate": 4.2484820712567305e-06, "logits/chosen": 11.535335540771484, "logits/rejected": 8.821805953979492, "logps/chosen": -216.32337951660156, "logps/rejected": -208.96617126464844, "loss": 0.5594, "rewards/accuracies": 0.625, "rewards/chosen": 0.16813993453979492, "rewards/margins": 0.6013048887252808, "rewards/rejected": -0.4331649839878082, "step": 4564 }, { "epoch": 0.7059733230233907, "grad_norm": 7.763742923736572, "learning_rate": 4.248195669607057e-06, "logits/chosen": 9.432860374450684, "logits/rejected": 10.351784706115723, "logps/chosen": -287.97015380859375, "logps/rejected": -296.40155029296875, "loss": 0.766, "rewards/accuracies": 0.5, "rewards/chosen": 0.3054032325744629, "rewards/margins": -0.06390313804149628, "rewards/rejected": 0.36930638551712036, "step": 4565 }, { "epoch": 0.7061279721631548, "grad_norm": 5.013143539428711, "learning_rate": 4.247909267957384e-06, "logits/chosen": 8.991588592529297, "logits/rejected": 5.45574426651001, "logps/chosen": -192.02626037597656, "logps/rejected": -190.56459045410156, "loss": 0.766, "rewards/accuracies": 0.5, "rewards/chosen": 0.11150780320167542, "rewards/margins": -0.07187265902757645, "rewards/rejected": 0.18338046967983246, "step": 4566 }, { "epoch": 0.706282621302919, "grad_norm": 14.881312370300293, "learning_rate": 4.2476228663077105e-06, "logits/chosen": 13.406326293945312, "logits/rejected": 6.856800556182861, "logps/chosen": -361.27410888671875, "logps/rejected": -320.4654541015625, "loss": 0.5216, "rewards/accuracies": 0.875, "rewards/chosen": 0.5811535716056824, "rewards/margins": 0.4521726965904236, "rewards/rejected": 0.12898090481758118, "step": 4567 }, { "epoch": 0.7064372704426831, "grad_norm": 4.174317359924316, "learning_rate": 4.247336464658037e-06, "logits/chosen": 10.269937515258789, "logits/rejected": 2.4715161323547363, "logps/chosen": -368.53521728515625, "logps/rejected": -279.296630859375, "loss": 0.4159, "rewards/accuracies": 0.875, "rewards/chosen": 0.7417339086532593, "rewards/margins": 0.7443634271621704, "rewards/rejected": -0.0026294589042663574, "step": 4568 }, { "epoch": 0.7065919195824473, "grad_norm": 5.458750247955322, "learning_rate": 4.247050063008363e-06, "logits/chosen": 9.789627075195312, "logits/rejected": 9.200784683227539, "logps/chosen": -288.3679504394531, "logps/rejected": -316.68804931640625, "loss": 0.5609, "rewards/accuracies": 0.75, "rewards/chosen": 0.314578115940094, "rewards/margins": 0.6339272260665894, "rewards/rejected": -0.319349080324173, "step": 4569 }, { "epoch": 0.7067465687222115, "grad_norm": 4.499334335327148, "learning_rate": 4.24676366135869e-06, "logits/chosen": 11.671292304992676, "logits/rejected": 11.885964393615723, "logps/chosen": -248.450439453125, "logps/rejected": -294.1128234863281, "loss": 0.5969, "rewards/accuracies": 0.625, "rewards/chosen": 0.2155788391828537, "rewards/margins": 0.39140212535858154, "rewards/rejected": -0.17582333087921143, "step": 4570 }, { "epoch": 0.7069012178619757, "grad_norm": 6.113247394561768, "learning_rate": 4.246477259709016e-06, "logits/chosen": 8.994807243347168, "logits/rejected": 7.239126205444336, "logps/chosen": -296.262451171875, "logps/rejected": -285.0254821777344, "loss": 0.7345, "rewards/accuracies": 0.5, "rewards/chosen": 0.2146557867527008, "rewards/margins": -0.03775966912508011, "rewards/rejected": 0.2524154484272003, "step": 4571 }, { "epoch": 0.7070558670017398, "grad_norm": 4.778618335723877, "learning_rate": 4.246190858059343e-06, "logits/chosen": 13.52690315246582, "logits/rejected": 9.544361114501953, "logps/chosen": -233.23886108398438, "logps/rejected": -230.8426971435547, "loss": 0.5206, "rewards/accuracies": 0.75, "rewards/chosen": 0.4327920079231262, "rewards/margins": 0.5751252174377441, "rewards/rejected": -0.1423332244157791, "step": 4572 }, { "epoch": 0.707210516141504, "grad_norm": 4.7092719078063965, "learning_rate": 4.245904456409669e-06, "logits/chosen": 10.643328666687012, "logits/rejected": 4.016560077667236, "logps/chosen": -327.07720947265625, "logps/rejected": -218.01927185058594, "loss": 0.5235, "rewards/accuracies": 0.75, "rewards/chosen": 0.44711053371429443, "rewards/margins": 0.5179625749588013, "rewards/rejected": -0.07085199654102325, "step": 4573 }, { "epoch": 0.7073651652812681, "grad_norm": 5.633538722991943, "learning_rate": 4.245618054759995e-06, "logits/chosen": 5.417980670928955, "logits/rejected": 8.159159660339355, "logps/chosen": -250.2317352294922, "logps/rejected": -192.50909423828125, "loss": 0.7402, "rewards/accuracies": 0.625, "rewards/chosen": -0.0016814544796943665, "rewards/margins": -0.03897682577371597, "rewards/rejected": 0.03729536756873131, "step": 4574 }, { "epoch": 0.7075198144210323, "grad_norm": 6.199712753295898, "learning_rate": 4.245331653110322e-06, "logits/chosen": 16.160507202148438, "logits/rejected": 17.126243591308594, "logps/chosen": -296.52191162109375, "logps/rejected": -364.4744567871094, "loss": 0.8055, "rewards/accuracies": 0.25, "rewards/chosen": 0.1539459228515625, "rewards/margins": -0.0059756748378276825, "rewards/rejected": 0.15992160141468048, "step": 4575 }, { "epoch": 0.7076744635607964, "grad_norm": 7.218504905700684, "learning_rate": 4.245045251460649e-06, "logits/chosen": 6.133431434631348, "logits/rejected": 7.881924152374268, "logps/chosen": -270.05755615234375, "logps/rejected": -246.61935424804688, "loss": 0.9605, "rewards/accuracies": 0.375, "rewards/chosen": -0.08289370685815811, "rewards/margins": -0.18835283815860748, "rewards/rejected": 0.10545916855335236, "step": 4576 }, { "epoch": 0.7078291127005606, "grad_norm": 4.109829902648926, "learning_rate": 4.244758849810975e-06, "logits/chosen": 8.774371147155762, "logits/rejected": 12.36960220336914, "logps/chosen": -149.04518127441406, "logps/rejected": -179.76773071289062, "loss": 0.643, "rewards/accuracies": 0.625, "rewards/chosen": -0.0977703109383583, "rewards/margins": 0.1250896006822586, "rewards/rejected": -0.2228599190711975, "step": 4577 }, { "epoch": 0.7079837618403247, "grad_norm": 4.731785774230957, "learning_rate": 4.244472448161302e-06, "logits/chosen": 9.13316535949707, "logits/rejected": 7.804808139801025, "logps/chosen": -190.30599975585938, "logps/rejected": -174.75729370117188, "loss": 0.6876, "rewards/accuracies": 0.375, "rewards/chosen": 0.3220677375793457, "rewards/margins": 0.10329968482255936, "rewards/rejected": 0.21876806020736694, "step": 4578 }, { "epoch": 0.7081384109800889, "grad_norm": 6.957160472869873, "learning_rate": 4.244186046511628e-06, "logits/chosen": 10.29033088684082, "logits/rejected": 13.34824275970459, "logps/chosen": -278.59149169921875, "logps/rejected": -298.6543884277344, "loss": 0.8957, "rewards/accuracies": 0.25, "rewards/chosen": -0.2270442098379135, "rewards/margins": -0.28402191400527954, "rewards/rejected": 0.05697770416736603, "step": 4579 }, { "epoch": 0.708293060119853, "grad_norm": 5.32158088684082, "learning_rate": 4.243899644861954e-06, "logits/chosen": 6.668206691741943, "logits/rejected": 12.51567554473877, "logps/chosen": -356.54107666015625, "logps/rejected": -335.79022216796875, "loss": 0.7461, "rewards/accuracies": 0.375, "rewards/chosen": 0.16742274165153503, "rewards/margins": 0.03726506978273392, "rewards/rejected": 0.13015766441822052, "step": 4580 }, { "epoch": 0.7084477092596172, "grad_norm": 4.515707015991211, "learning_rate": 4.243613243212281e-06, "logits/chosen": 6.752403259277344, "logits/rejected": 5.891953945159912, "logps/chosen": -285.4992370605469, "logps/rejected": -256.10455322265625, "loss": 0.5511, "rewards/accuracies": 0.625, "rewards/chosen": 0.48147740960121155, "rewards/margins": 0.47435086965560913, "rewards/rejected": 0.007126554846763611, "step": 4581 }, { "epoch": 0.7086023583993813, "grad_norm": 4.8283514976501465, "learning_rate": 4.243326841562608e-06, "logits/chosen": 8.763612747192383, "logits/rejected": 7.175118446350098, "logps/chosen": -274.317626953125, "logps/rejected": -355.62518310546875, "loss": 0.5318, "rewards/accuracies": 0.625, "rewards/chosen": 0.2706715166568756, "rewards/margins": 0.47458314895629883, "rewards/rejected": -0.2039116472005844, "step": 4582 }, { "epoch": 0.7087570075391456, "grad_norm": 3.562120199203491, "learning_rate": 4.243040439912934e-06, "logits/chosen": 11.488204956054688, "logits/rejected": -0.03138244152069092, "logps/chosen": -390.5075988769531, "logps/rejected": -213.0691375732422, "loss": 0.4098, "rewards/accuracies": 0.75, "rewards/chosen": 0.7288947105407715, "rewards/margins": 1.2655043601989746, "rewards/rejected": -0.5366096496582031, "step": 4583 }, { "epoch": 0.7089116566789098, "grad_norm": 4.430549621582031, "learning_rate": 4.242754038263261e-06, "logits/chosen": 7.564137935638428, "logits/rejected": 4.704463005065918, "logps/chosen": -288.6220703125, "logps/rejected": -213.3743896484375, "loss": 0.5586, "rewards/accuracies": 0.75, "rewards/chosen": 0.33583182096481323, "rewards/margins": 0.37405115365982056, "rewards/rejected": -0.038219302892684937, "step": 4584 }, { "epoch": 0.7090663058186739, "grad_norm": 5.988737106323242, "learning_rate": 4.242467636613588e-06, "logits/chosen": 10.479631423950195, "logits/rejected": 7.0845818519592285, "logps/chosen": -262.98291015625, "logps/rejected": -323.0245361328125, "loss": 0.4956, "rewards/accuracies": 0.75, "rewards/chosen": 0.3370293974876404, "rewards/margins": 0.536069393157959, "rewards/rejected": -0.19904004037380219, "step": 4585 }, { "epoch": 0.7092209549584381, "grad_norm": 4.775357723236084, "learning_rate": 4.2421812349639135e-06, "logits/chosen": 8.966789245605469, "logits/rejected": 6.047226905822754, "logps/chosen": -346.08551025390625, "logps/rejected": -287.6668395996094, "loss": 0.5307, "rewards/accuracies": 0.75, "rewards/chosen": 0.42627131938934326, "rewards/margins": 0.3979650139808655, "rewards/rejected": 0.02830635756254196, "step": 4586 }, { "epoch": 0.7093756040982022, "grad_norm": 5.712526321411133, "learning_rate": 4.24189483331424e-06, "logits/chosen": 8.538501739501953, "logits/rejected": 7.301070690155029, "logps/chosen": -293.10028076171875, "logps/rejected": -237.252197265625, "loss": 0.6693, "rewards/accuracies": 0.5, "rewards/chosen": 0.3256973326206207, "rewards/margins": 0.23136553168296814, "rewards/rejected": 0.0943317785859108, "step": 4587 }, { "epoch": 0.7095302532379664, "grad_norm": 7.103586673736572, "learning_rate": 4.241608431664567e-06, "logits/chosen": 13.467236518859863, "logits/rejected": 11.850404739379883, "logps/chosen": -375.1611633300781, "logps/rejected": -340.20477294921875, "loss": 0.7296, "rewards/accuracies": 0.375, "rewards/chosen": 0.34818315505981445, "rewards/margins": -0.039399243891239166, "rewards/rejected": 0.3875824213027954, "step": 4588 }, { "epoch": 0.7096849023777305, "grad_norm": 5.8927903175354, "learning_rate": 4.241322030014893e-06, "logits/chosen": 12.597597122192383, "logits/rejected": 10.961711883544922, "logps/chosen": -469.7466125488281, "logps/rejected": -342.7825927734375, "loss": 0.5975, "rewards/accuracies": 0.75, "rewards/chosen": 0.6095735430717468, "rewards/margins": 0.23591360449790955, "rewards/rejected": 0.3736599385738373, "step": 4589 }, { "epoch": 0.7098395515174947, "grad_norm": 5.878236770629883, "learning_rate": 4.24103562836522e-06, "logits/chosen": 14.659733772277832, "logits/rejected": 9.472429275512695, "logps/chosen": -356.6368408203125, "logps/rejected": -354.6163635253906, "loss": 0.6378, "rewards/accuracies": 0.5, "rewards/chosen": 0.46231406927108765, "rewards/margins": 0.3263733983039856, "rewards/rejected": 0.13594065606594086, "step": 4590 }, { "epoch": 0.7099942006572588, "grad_norm": 4.429261207580566, "learning_rate": 4.240749226715547e-06, "logits/chosen": 9.442913055419922, "logits/rejected": 13.063085556030273, "logps/chosen": -218.58944702148438, "logps/rejected": -260.20867919921875, "loss": 0.5776, "rewards/accuracies": 0.75, "rewards/chosen": 0.28602471947669983, "rewards/margins": 0.3126041293144226, "rewards/rejected": -0.026579387485980988, "step": 4591 }, { "epoch": 0.710148849797023, "grad_norm": 5.877252578735352, "learning_rate": 4.2404628250658725e-06, "logits/chosen": 8.935186386108398, "logits/rejected": 5.961345672607422, "logps/chosen": -301.419189453125, "logps/rejected": -236.7633056640625, "loss": 0.717, "rewards/accuracies": 0.5, "rewards/chosen": 0.08373585343360901, "rewards/margins": 0.06022048741579056, "rewards/rejected": 0.02351538836956024, "step": 4592 }, { "epoch": 0.7103034989367871, "grad_norm": 6.092513084411621, "learning_rate": 4.240176423416199e-06, "logits/chosen": 8.973538398742676, "logits/rejected": 10.432695388793945, "logps/chosen": -208.08888244628906, "logps/rejected": -238.11029052734375, "loss": 0.7768, "rewards/accuracies": 0.625, "rewards/chosen": 0.1211184561252594, "rewards/margins": -0.09161757677793503, "rewards/rejected": 0.21273604035377502, "step": 4593 }, { "epoch": 0.7104581480765513, "grad_norm": 3.9608352184295654, "learning_rate": 4.239890021766526e-06, "logits/chosen": 7.756915092468262, "logits/rejected": 7.772074222564697, "logps/chosen": -299.4737243652344, "logps/rejected": -256.7357177734375, "loss": 0.5155, "rewards/accuracies": 0.75, "rewards/chosen": 0.5204596519470215, "rewards/margins": 0.48946577310562134, "rewards/rejected": 0.030993878841400146, "step": 4594 }, { "epoch": 0.7106127972163155, "grad_norm": 5.682147026062012, "learning_rate": 4.2396036201168525e-06, "logits/chosen": 3.9371743202209473, "logits/rejected": 4.048566818237305, "logps/chosen": -167.6571807861328, "logps/rejected": -160.6783905029297, "loss": 0.7465, "rewards/accuracies": 0.375, "rewards/chosen": 0.12488966435194016, "rewards/margins": -0.04817222058773041, "rewards/rejected": 0.17306189239025116, "step": 4595 }, { "epoch": 0.7107674463560797, "grad_norm": 3.626577138900757, "learning_rate": 4.239317218467179e-06, "logits/chosen": 6.447562217712402, "logits/rejected": 5.775452613830566, "logps/chosen": -174.85342407226562, "logps/rejected": -174.7040252685547, "loss": 0.5041, "rewards/accuracies": 0.75, "rewards/chosen": 0.38952410221099854, "rewards/margins": 0.6370323896408081, "rewards/rejected": -0.24750833213329315, "step": 4596 }, { "epoch": 0.7109220954958438, "grad_norm": 7.288050651550293, "learning_rate": 4.239030816817506e-06, "logits/chosen": -1.1223838329315186, "logits/rejected": 5.900338649749756, "logps/chosen": -234.98805236816406, "logps/rejected": -334.6891784667969, "loss": 0.9103, "rewards/accuracies": 0.5, "rewards/chosen": -0.2509725093841553, "rewards/margins": -0.13586796820163727, "rewards/rejected": -0.1151045560836792, "step": 4597 }, { "epoch": 0.711076744635608, "grad_norm": 7.594099521636963, "learning_rate": 4.2387444151678316e-06, "logits/chosen": 9.304906845092773, "logits/rejected": 10.392935752868652, "logps/chosen": -296.36346435546875, "logps/rejected": -285.77001953125, "loss": 0.8777, "rewards/accuracies": 0.5, "rewards/chosen": -0.037966907024383545, "rewards/margins": -0.24119937419891357, "rewards/rejected": 0.20323246717453003, "step": 4598 }, { "epoch": 0.7112313937753721, "grad_norm": 7.77683687210083, "learning_rate": 4.238458013518158e-06, "logits/chosen": 14.195968627929688, "logits/rejected": 9.00336742401123, "logps/chosen": -478.3730163574219, "logps/rejected": -306.6780700683594, "loss": 0.8273, "rewards/accuracies": 0.625, "rewards/chosen": -0.003941148519515991, "rewards/margins": -0.007843591272830963, "rewards/rejected": 0.003902435302734375, "step": 4599 }, { "epoch": 0.7113860429151363, "grad_norm": 5.745849609375, "learning_rate": 4.238171611868485e-06, "logits/chosen": 13.648274421691895, "logits/rejected": 9.800228118896484, "logps/chosen": -389.63946533203125, "logps/rejected": -339.66668701171875, "loss": 0.5396, "rewards/accuracies": 0.625, "rewards/chosen": 0.3609994947910309, "rewards/margins": 0.46152400970458984, "rewards/rejected": -0.10052451491355896, "step": 4600 }, { "epoch": 0.7115406920549004, "grad_norm": 4.448723793029785, "learning_rate": 4.2378852102188115e-06, "logits/chosen": 8.844828605651855, "logits/rejected": 10.707019805908203, "logps/chosen": -193.85595703125, "logps/rejected": -253.77130126953125, "loss": 0.6463, "rewards/accuracies": 0.625, "rewards/chosen": 0.14980114996433258, "rewards/margins": 0.1919388622045517, "rewards/rejected": -0.042137712240219116, "step": 4601 }, { "epoch": 0.7116953411946646, "grad_norm": 4.077887058258057, "learning_rate": 4.237598808569137e-06, "logits/chosen": 10.013148307800293, "logits/rejected": 9.555876731872559, "logps/chosen": -211.83291625976562, "logps/rejected": -139.9344024658203, "loss": 0.6275, "rewards/accuracies": 0.75, "rewards/chosen": 0.06236010789871216, "rewards/margins": 0.22210922837257385, "rewards/rejected": -0.1597491353750229, "step": 4602 }, { "epoch": 0.7118499903344288, "grad_norm": 4.701518535614014, "learning_rate": 4.237312406919464e-06, "logits/chosen": 12.729211807250977, "logits/rejected": 7.968535900115967, "logps/chosen": -407.1088562011719, "logps/rejected": -274.4529113769531, "loss": 0.4895, "rewards/accuracies": 0.875, "rewards/chosen": 0.4684843122959137, "rewards/margins": 0.5429566502571106, "rewards/rejected": -0.0744723379611969, "step": 4603 }, { "epoch": 0.7120046394741929, "grad_norm": 4.916139125823975, "learning_rate": 4.237026005269791e-06, "logits/chosen": 13.155076026916504, "logits/rejected": 16.855268478393555, "logps/chosen": -256.2487487792969, "logps/rejected": -299.71038818359375, "loss": 0.5594, "rewards/accuracies": 0.75, "rewards/chosen": 0.4148678779602051, "rewards/margins": 0.35789576172828674, "rewards/rejected": 0.056972116231918335, "step": 4604 }, { "epoch": 0.712159288613957, "grad_norm": 8.865031242370605, "learning_rate": 4.236739603620117e-06, "logits/chosen": 12.798454284667969, "logits/rejected": 12.440105438232422, "logps/chosen": -420.20477294921875, "logps/rejected": -419.4029541015625, "loss": 0.7462, "rewards/accuracies": 0.625, "rewards/chosen": 0.11067429184913635, "rewards/margins": 0.044924937188625336, "rewards/rejected": 0.06574936211109161, "step": 4605 }, { "epoch": 0.7123139377537212, "grad_norm": 4.790214538574219, "learning_rate": 4.236453201970444e-06, "logits/chosen": 12.263076782226562, "logits/rejected": 10.843096733093262, "logps/chosen": -198.3968963623047, "logps/rejected": -203.72879028320312, "loss": 0.6445, "rewards/accuracies": 0.375, "rewards/chosen": 0.3282395601272583, "rewards/margins": 0.2692464292049408, "rewards/rejected": 0.058993108570575714, "step": 4606 }, { "epoch": 0.7124685868934854, "grad_norm": 4.335485935211182, "learning_rate": 4.23616680032077e-06, "logits/chosen": 11.603706359863281, "logits/rejected": 3.3118488788604736, "logps/chosen": -300.590087890625, "logps/rejected": -239.49508666992188, "loss": 0.5544, "rewards/accuracies": 0.875, "rewards/chosen": 0.313501238822937, "rewards/margins": 0.3790134787559509, "rewards/rejected": -0.06551218032836914, "step": 4607 }, { "epoch": 0.7126232360332496, "grad_norm": 6.42044734954834, "learning_rate": 4.235880398671096e-06, "logits/chosen": 10.737825393676758, "logits/rejected": 4.036214828491211, "logps/chosen": -308.0980529785156, "logps/rejected": -223.8427276611328, "loss": 0.7902, "rewards/accuracies": 0.375, "rewards/chosen": -0.005572028458118439, "rewards/margins": -0.05523768067359924, "rewards/rejected": 0.04966564476490021, "step": 4608 }, { "epoch": 0.7127778851730138, "grad_norm": 5.577699184417725, "learning_rate": 4.235593997021423e-06, "logits/chosen": 8.560925483703613, "logits/rejected": 10.230661392211914, "logps/chosen": -253.0076446533203, "logps/rejected": -341.5283508300781, "loss": 0.6643, "rewards/accuracies": 0.375, "rewards/chosen": 0.3910121023654938, "rewards/margins": 0.1334420144557953, "rewards/rejected": 0.2575700879096985, "step": 4609 }, { "epoch": 0.7129325343127779, "grad_norm": 3.967543840408325, "learning_rate": 4.23530759537175e-06, "logits/chosen": 10.803560256958008, "logits/rejected": 7.799156188964844, "logps/chosen": -146.0970001220703, "logps/rejected": -107.52326202392578, "loss": 0.6864, "rewards/accuracies": 0.625, "rewards/chosen": -0.05748012661933899, "rewards/margins": 0.10552510619163513, "rewards/rejected": -0.16300520300865173, "step": 4610 }, { "epoch": 0.7130871834525421, "grad_norm": 9.559494972229004, "learning_rate": 4.235021193722076e-06, "logits/chosen": 9.288444519042969, "logits/rejected": 4.61497163772583, "logps/chosen": -457.3798828125, "logps/rejected": -427.8670349121094, "loss": 0.7114, "rewards/accuracies": 0.5, "rewards/chosen": 0.21303023397922516, "rewards/margins": 0.1577335000038147, "rewards/rejected": 0.05529669672250748, "step": 4611 }, { "epoch": 0.7132418325923062, "grad_norm": 6.40587854385376, "learning_rate": 4.234734792072402e-06, "logits/chosen": 4.82703971862793, "logits/rejected": 9.51150131225586, "logps/chosen": -222.35855102539062, "logps/rejected": -232.75439453125, "loss": 0.7053, "rewards/accuracies": 0.75, "rewards/chosen": -0.06283684074878693, "rewards/margins": -0.000910535454750061, "rewards/rejected": -0.06192629784345627, "step": 4612 }, { "epoch": 0.7133964817320704, "grad_norm": 4.430289268493652, "learning_rate": 4.234448390422729e-06, "logits/chosen": 13.957395553588867, "logits/rejected": 7.47810173034668, "logps/chosen": -318.04803466796875, "logps/rejected": -252.24575805664062, "loss": 0.6093, "rewards/accuracies": 0.5, "rewards/chosen": 0.3107309341430664, "rewards/margins": 0.2963135838508606, "rewards/rejected": 0.014417361468076706, "step": 4613 }, { "epoch": 0.7135511308718345, "grad_norm": 7.0096116065979, "learning_rate": 4.2341619887730554e-06, "logits/chosen": 7.8340630531311035, "logits/rejected": 5.389813423156738, "logps/chosen": -309.28106689453125, "logps/rejected": -244.09130859375, "loss": 0.7142, "rewards/accuracies": 0.5, "rewards/chosen": 0.26365795731544495, "rewards/margins": 0.06978225708007812, "rewards/rejected": 0.19387570023536682, "step": 4614 }, { "epoch": 0.7137057800115987, "grad_norm": 4.990397930145264, "learning_rate": 4.233875587123382e-06, "logits/chosen": 5.352141857147217, "logits/rejected": 7.53994083404541, "logps/chosen": -228.5999755859375, "logps/rejected": -245.09378051757812, "loss": 0.6676, "rewards/accuracies": 0.625, "rewards/chosen": -0.11973323673009872, "rewards/margins": 0.23036159574985504, "rewards/rejected": -0.3500947952270508, "step": 4615 }, { "epoch": 0.7138604291513628, "grad_norm": 4.495199203491211, "learning_rate": 4.233589185473709e-06, "logits/chosen": 7.831066131591797, "logits/rejected": 7.557775020599365, "logps/chosen": -197.90415954589844, "logps/rejected": -194.01568603515625, "loss": 0.7217, "rewards/accuracies": 0.5, "rewards/chosen": 0.16256499290466309, "rewards/margins": 0.013051211833953857, "rewards/rejected": 0.14951378107070923, "step": 4616 }, { "epoch": 0.714015078291127, "grad_norm": 9.58165168762207, "learning_rate": 4.233302783824035e-06, "logits/chosen": 6.459289073944092, "logits/rejected": 10.197653770446777, "logps/chosen": -232.92166137695312, "logps/rejected": -413.5774841308594, "loss": 0.8116, "rewards/accuracies": 0.375, "rewards/chosen": -0.212818905711174, "rewards/margins": -0.16318702697753906, "rewards/rejected": -0.04963187873363495, "step": 4617 }, { "epoch": 0.7141697274308911, "grad_norm": 3.74113130569458, "learning_rate": 4.233016382174361e-06, "logits/chosen": 13.833148956298828, "logits/rejected": 13.798471450805664, "logps/chosen": -209.2418212890625, "logps/rejected": -219.6009521484375, "loss": 0.4913, "rewards/accuracies": 1.0, "rewards/chosen": 0.28905540704727173, "rewards/margins": 0.5732909440994263, "rewards/rejected": -0.28423553705215454, "step": 4618 }, { "epoch": 0.7143243765706553, "grad_norm": 6.600011825561523, "learning_rate": 4.232729980524688e-06, "logits/chosen": 10.8261079788208, "logits/rejected": 3.906608819961548, "logps/chosen": -304.7872314453125, "logps/rejected": -288.09423828125, "loss": 0.5966, "rewards/accuracies": 0.625, "rewards/chosen": 0.19084981083869934, "rewards/margins": 0.286954790353775, "rewards/rejected": -0.09610500931739807, "step": 4619 }, { "epoch": 0.7144790257104194, "grad_norm": 4.34542179107666, "learning_rate": 4.2324435788750145e-06, "logits/chosen": 9.040791511535645, "logits/rejected": 4.181394577026367, "logps/chosen": -263.59197998046875, "logps/rejected": -264.7336120605469, "loss": 0.5591, "rewards/accuracies": 0.625, "rewards/chosen": 0.07334579527378082, "rewards/margins": 0.3570653796195984, "rewards/rejected": -0.28371959924697876, "step": 4620 }, { "epoch": 0.7146336748501837, "grad_norm": 4.266055583953857, "learning_rate": 4.232157177225341e-06, "logits/chosen": 5.790135383605957, "logits/rejected": 9.477067947387695, "logps/chosen": -190.33055114746094, "logps/rejected": -220.47218322753906, "loss": 0.6193, "rewards/accuracies": 0.75, "rewards/chosen": 0.28691577911376953, "rewards/margins": 0.18353229761123657, "rewards/rejected": 0.10338349640369415, "step": 4621 }, { "epoch": 0.7147883239899478, "grad_norm": 5.435953140258789, "learning_rate": 4.231870775575668e-06, "logits/chosen": 8.715919494628906, "logits/rejected": 2.5127782821655273, "logps/chosen": -201.9743194580078, "logps/rejected": -159.32742309570312, "loss": 0.6293, "rewards/accuracies": 0.625, "rewards/chosen": 0.08574648201465607, "rewards/margins": 0.1784057766199112, "rewards/rejected": -0.09265930950641632, "step": 4622 }, { "epoch": 0.714942973129712, "grad_norm": 4.416068077087402, "learning_rate": 4.2315843739259944e-06, "logits/chosen": 12.662212371826172, "logits/rejected": 6.926520824432373, "logps/chosen": -344.5833435058594, "logps/rejected": -218.7684783935547, "loss": 0.5106, "rewards/accuracies": 0.875, "rewards/chosen": 0.5622336864471436, "rewards/margins": 0.43748739361763, "rewards/rejected": 0.12474632263183594, "step": 4623 }, { "epoch": 0.7150976222694762, "grad_norm": 5.635395050048828, "learning_rate": 4.231297972276321e-06, "logits/chosen": 12.82259464263916, "logits/rejected": 8.568951606750488, "logps/chosen": -359.47039794921875, "logps/rejected": -295.04132080078125, "loss": 0.5342, "rewards/accuracies": 0.625, "rewards/chosen": 0.5028526782989502, "rewards/margins": 0.5173433423042297, "rewards/rejected": -0.014490664005279541, "step": 4624 }, { "epoch": 0.7152522714092403, "grad_norm": 4.45168399810791, "learning_rate": 4.231011570626647e-06, "logits/chosen": 12.053682327270508, "logits/rejected": 10.460845947265625, "logps/chosen": -231.22801208496094, "logps/rejected": -206.77816772460938, "loss": 0.6051, "rewards/accuracies": 0.875, "rewards/chosen": 0.5290884971618652, "rewards/margins": 0.293659508228302, "rewards/rejected": 0.23542898893356323, "step": 4625 }, { "epoch": 0.7154069205490045, "grad_norm": 7.048733234405518, "learning_rate": 4.2307251689769735e-06, "logits/chosen": 9.767511367797852, "logits/rejected": 8.867424011230469, "logps/chosen": -269.475341796875, "logps/rejected": -320.96697998046875, "loss": 0.7632, "rewards/accuracies": 0.5, "rewards/chosen": 0.14557959139347076, "rewards/margins": -0.036857739090919495, "rewards/rejected": 0.18243733048439026, "step": 4626 }, { "epoch": 0.7155615696887686, "grad_norm": 5.3465399742126465, "learning_rate": 4.2304387673273e-06, "logits/chosen": 14.575580596923828, "logits/rejected": 9.753442764282227, "logps/chosen": -366.173583984375, "logps/rejected": -227.4510498046875, "loss": 0.7096, "rewards/accuracies": 0.625, "rewards/chosen": 0.48749876022338867, "rewards/margins": 0.12785844504833221, "rewards/rejected": 0.35964030027389526, "step": 4627 }, { "epoch": 0.7157162188285328, "grad_norm": 5.9558892250061035, "learning_rate": 4.230152365677627e-06, "logits/chosen": 11.542832374572754, "logits/rejected": 5.083554267883301, "logps/chosen": -437.64544677734375, "logps/rejected": -360.44879150390625, "loss": 0.5831, "rewards/accuracies": 0.75, "rewards/chosen": 0.230820432305336, "rewards/margins": 0.2564258575439453, "rewards/rejected": -0.025605440139770508, "step": 4628 }, { "epoch": 0.7158708679682969, "grad_norm": 5.45145320892334, "learning_rate": 4.2298659640279535e-06, "logits/chosen": 12.42699146270752, "logits/rejected": 16.88634490966797, "logps/chosen": -278.68701171875, "logps/rejected": -246.53636169433594, "loss": 0.7133, "rewards/accuracies": 0.375, "rewards/chosen": 0.11747746914625168, "rewards/margins": 0.06037537008523941, "rewards/rejected": 0.057102106511592865, "step": 4629 }, { "epoch": 0.7160255171080611, "grad_norm": 18.71869468688965, "learning_rate": 4.22957956237828e-06, "logits/chosen": 4.967696666717529, "logits/rejected": 3.1390843391418457, "logps/chosen": -243.71731567382812, "logps/rejected": -166.47952270507812, "loss": 0.7619, "rewards/accuracies": 0.625, "rewards/chosen": 0.12153066694736481, "rewards/margins": 0.003656625747680664, "rewards/rejected": 0.11787405610084534, "step": 4630 }, { "epoch": 0.7161801662478252, "grad_norm": 6.964846134185791, "learning_rate": 4.229293160728607e-06, "logits/chosen": 10.464961051940918, "logits/rejected": 10.341142654418945, "logps/chosen": -324.0904541015625, "logps/rejected": -323.0029296875, "loss": 0.5813, "rewards/accuracies": 0.75, "rewards/chosen": 0.30943137407302856, "rewards/margins": 0.511340856552124, "rewards/rejected": -0.20190943777561188, "step": 4631 }, { "epoch": 0.7163348153875894, "grad_norm": 14.7042818069458, "learning_rate": 4.229006759078933e-06, "logits/chosen": 16.729562759399414, "logits/rejected": 10.402524948120117, "logps/chosen": -522.3245849609375, "logps/rejected": -402.0496520996094, "loss": 0.5883, "rewards/accuracies": 0.625, "rewards/chosen": 0.5634576082229614, "rewards/margins": 0.3435225486755371, "rewards/rejected": 0.21993504464626312, "step": 4632 }, { "epoch": 0.7164894645273535, "grad_norm": 4.383731842041016, "learning_rate": 4.228720357429259e-06, "logits/chosen": 14.411029815673828, "logits/rejected": 12.076712608337402, "logps/chosen": -314.4481506347656, "logps/rejected": -322.5937805175781, "loss": 0.5238, "rewards/accuracies": 0.625, "rewards/chosen": 0.3533988893032074, "rewards/margins": 0.6665071845054626, "rewards/rejected": -0.31310826539993286, "step": 4633 }, { "epoch": 0.7166441136671178, "grad_norm": 3.7176642417907715, "learning_rate": 4.228433955779586e-06, "logits/chosen": 7.941164016723633, "logits/rejected": 4.0201849937438965, "logps/chosen": -217.4586944580078, "logps/rejected": -147.8682861328125, "loss": 0.5419, "rewards/accuracies": 0.5, "rewards/chosen": 0.25656378269195557, "rewards/margins": 0.4995496869087219, "rewards/rejected": -0.24298591911792755, "step": 4634 }, { "epoch": 0.7167987628068819, "grad_norm": 4.803718566894531, "learning_rate": 4.2281475541299126e-06, "logits/chosen": 12.475752830505371, "logits/rejected": 10.77780532836914, "logps/chosen": -246.08956909179688, "logps/rejected": -254.8748321533203, "loss": 0.7456, "rewards/accuracies": 0.625, "rewards/chosen": 0.4601503908634186, "rewards/margins": 0.028110364452004433, "rewards/rejected": 0.4320400357246399, "step": 4635 }, { "epoch": 0.7169534119466461, "grad_norm": 5.811635494232178, "learning_rate": 4.227861152480238e-06, "logits/chosen": 10.892448425292969, "logits/rejected": 9.648085594177246, "logps/chosen": -397.5242004394531, "logps/rejected": -386.5086669921875, "loss": 0.5113, "rewards/accuracies": 0.875, "rewards/chosen": 0.4311075210571289, "rewards/margins": 0.46603813767433167, "rewards/rejected": -0.03493061661720276, "step": 4636 }, { "epoch": 0.7171080610864102, "grad_norm": 4.786439418792725, "learning_rate": 4.227574750830565e-06, "logits/chosen": 8.258153915405273, "logits/rejected": 5.33173942565918, "logps/chosen": -197.9464874267578, "logps/rejected": -166.2730712890625, "loss": 0.6797, "rewards/accuracies": 0.625, "rewards/chosen": 0.03668379783630371, "rewards/margins": 0.18249371647834778, "rewards/rejected": -0.14580993354320526, "step": 4637 }, { "epoch": 0.7172627102261744, "grad_norm": 7.147068500518799, "learning_rate": 4.227288349180892e-06, "logits/chosen": 5.039703845977783, "logits/rejected": 7.041079044342041, "logps/chosen": -270.3241882324219, "logps/rejected": -277.89080810546875, "loss": 0.844, "rewards/accuracies": 0.375, "rewards/chosen": -0.49101442098617554, "rewards/margins": -0.22120852768421173, "rewards/rejected": -0.269805908203125, "step": 4638 }, { "epoch": 0.7174173593659385, "grad_norm": 5.5372185707092285, "learning_rate": 4.227001947531218e-06, "logits/chosen": 9.179319381713867, "logits/rejected": 7.193024635314941, "logps/chosen": -204.68038940429688, "logps/rejected": -262.3078918457031, "loss": 0.6036, "rewards/accuracies": 0.625, "rewards/chosen": -0.05145713686943054, "rewards/margins": 0.2607852518558502, "rewards/rejected": -0.31224238872528076, "step": 4639 }, { "epoch": 0.7175720085057027, "grad_norm": 5.4153828620910645, "learning_rate": 4.226715545881544e-06, "logits/chosen": 9.343610763549805, "logits/rejected": 6.311092376708984, "logps/chosen": -247.0800018310547, "logps/rejected": -206.5257110595703, "loss": 0.5378, "rewards/accuracies": 0.625, "rewards/chosen": 0.29292821884155273, "rewards/margins": 0.46754083037376404, "rewards/rejected": -0.1746126115322113, "step": 4640 }, { "epoch": 0.7177266576454668, "grad_norm": 7.227595329284668, "learning_rate": 4.226429144231871e-06, "logits/chosen": 10.881799697875977, "logits/rejected": 3.8781089782714844, "logps/chosen": -375.35711669921875, "logps/rejected": -315.3063659667969, "loss": 0.7193, "rewards/accuracies": 0.625, "rewards/chosen": 0.07003575563430786, "rewards/margins": 0.26165515184402466, "rewards/rejected": -0.1916193962097168, "step": 4641 }, { "epoch": 0.717881306785231, "grad_norm": 6.14045524597168, "learning_rate": 4.226142742582197e-06, "logits/chosen": 10.574788093566895, "logits/rejected": 7.586615562438965, "logps/chosen": -318.93682861328125, "logps/rejected": -244.3237762451172, "loss": 0.6617, "rewards/accuracies": 0.75, "rewards/chosen": 0.20660200715065002, "rewards/margins": 0.17695912718772888, "rewards/rejected": 0.02964286133646965, "step": 4642 }, { "epoch": 0.7180359559249951, "grad_norm": 6.854588031768799, "learning_rate": 4.225856340932524e-06, "logits/chosen": 8.998845100402832, "logits/rejected": 3.6664860248565674, "logps/chosen": -356.71990966796875, "logps/rejected": -257.1462707519531, "loss": 0.5208, "rewards/accuracies": 0.625, "rewards/chosen": 0.35594692826271057, "rewards/margins": 0.6809406876564026, "rewards/rejected": -0.32499369978904724, "step": 4643 }, { "epoch": 0.7181906050647593, "grad_norm": 7.132308483123779, "learning_rate": 4.225569939282851e-06, "logits/chosen": 14.064380645751953, "logits/rejected": 11.397700309753418, "logps/chosen": -268.5389709472656, "logps/rejected": -252.5906982421875, "loss": 0.7136, "rewards/accuracies": 0.5, "rewards/chosen": 0.20234090089797974, "rewards/margins": 0.04124408960342407, "rewards/rejected": 0.16109681129455566, "step": 4644 }, { "epoch": 0.7183452542045234, "grad_norm": 6.435413837432861, "learning_rate": 4.2252835376331765e-06, "logits/chosen": 12.554622650146484, "logits/rejected": 10.075553894042969, "logps/chosen": -355.69091796875, "logps/rejected": -313.96343994140625, "loss": 0.8989, "rewards/accuracies": 0.375, "rewards/chosen": 0.1648985594511032, "rewards/margins": -0.282002329826355, "rewards/rejected": 0.446900874376297, "step": 4645 }, { "epoch": 0.7184999033442876, "grad_norm": 8.379067420959473, "learning_rate": 4.224997135983503e-06, "logits/chosen": 10.873297691345215, "logits/rejected": 12.469922065734863, "logps/chosen": -329.8974304199219, "logps/rejected": -233.2132568359375, "loss": 0.8804, "rewards/accuracies": 0.5, "rewards/chosen": 0.07426971942186356, "rewards/margins": -0.17683526873588562, "rewards/rejected": 0.2511049509048462, "step": 4646 }, { "epoch": 0.7186545524840519, "grad_norm": 3.860957384109497, "learning_rate": 4.22471073433383e-06, "logits/chosen": 10.99325942993164, "logits/rejected": 9.064567565917969, "logps/chosen": -160.0193634033203, "logps/rejected": -152.13067626953125, "loss": 0.7023, "rewards/accuracies": 0.375, "rewards/chosen": 0.2436378449201584, "rewards/margins": 0.03730266913771629, "rewards/rejected": 0.2063351571559906, "step": 4647 }, { "epoch": 0.718809201623816, "grad_norm": 4.60730504989624, "learning_rate": 4.2244243326841565e-06, "logits/chosen": 8.819602012634277, "logits/rejected": 7.903059959411621, "logps/chosen": -260.6100158691406, "logps/rejected": -294.0946044921875, "loss": 0.5749, "rewards/accuracies": 0.625, "rewards/chosen": 0.36508047580718994, "rewards/margins": 0.49078208208084106, "rewards/rejected": -0.12570160627365112, "step": 4648 }, { "epoch": 0.7189638507635802, "grad_norm": 4.122908115386963, "learning_rate": 4.224137931034483e-06, "logits/chosen": 13.057272911071777, "logits/rejected": 9.321096420288086, "logps/chosen": -256.8499450683594, "logps/rejected": -233.92526245117188, "loss": 0.4515, "rewards/accuracies": 1.0, "rewards/chosen": 0.49239271879196167, "rewards/margins": 0.6381007432937622, "rewards/rejected": -0.14570806920528412, "step": 4649 }, { "epoch": 0.7191184999033443, "grad_norm": 5.219837188720703, "learning_rate": 4.22385152938481e-06, "logits/chosen": 9.81836986541748, "logits/rejected": 12.089348793029785, "logps/chosen": -241.3326416015625, "logps/rejected": -275.6637268066406, "loss": 0.8797, "rewards/accuracies": 0.25, "rewards/chosen": 0.24137966334819794, "rewards/margins": -0.23598501086235046, "rewards/rejected": 0.4773646593093872, "step": 4650 }, { "epoch": 0.7192731490431085, "grad_norm": 4.851894855499268, "learning_rate": 4.2235651277351356e-06, "logits/chosen": 3.2410387992858887, "logits/rejected": 6.405688762664795, "logps/chosen": -320.13201904296875, "logps/rejected": -431.6505432128906, "loss": 0.5887, "rewards/accuracies": 0.5, "rewards/chosen": 0.3114035725593567, "rewards/margins": 0.4934540092945099, "rewards/rejected": -0.182050421833992, "step": 4651 }, { "epoch": 0.7194277981828726, "grad_norm": 5.242563724517822, "learning_rate": 4.223278726085462e-06, "logits/chosen": 9.595367431640625, "logits/rejected": 4.99884557723999, "logps/chosen": -393.42254638671875, "logps/rejected": -262.14764404296875, "loss": 0.6548, "rewards/accuracies": 0.5, "rewards/chosen": 0.34950342774391174, "rewards/margins": 0.2511463761329651, "rewards/rejected": 0.09835705906152725, "step": 4652 }, { "epoch": 0.7195824473226368, "grad_norm": 5.531010627746582, "learning_rate": 4.222992324435789e-06, "logits/chosen": 9.550742149353027, "logits/rejected": 9.265010833740234, "logps/chosen": -223.27291870117188, "logps/rejected": -259.66156005859375, "loss": 0.5514, "rewards/accuracies": 0.875, "rewards/chosen": 0.4141909182071686, "rewards/margins": 0.48686161637306213, "rewards/rejected": -0.07267066836357117, "step": 4653 }, { "epoch": 0.7197370964624009, "grad_norm": 3.8310351371765137, "learning_rate": 4.2227059227861155e-06, "logits/chosen": 14.591660499572754, "logits/rejected": 12.950172424316406, "logps/chosen": -251.24322509765625, "logps/rejected": -225.15335083007812, "loss": 0.5734, "rewards/accuracies": 0.375, "rewards/chosen": 0.20353688299655914, "rewards/margins": 0.3828543424606323, "rewards/rejected": -0.179317444562912, "step": 4654 }, { "epoch": 0.7198917456021651, "grad_norm": 4.616705894470215, "learning_rate": 4.222419521136442e-06, "logits/chosen": 11.38825798034668, "logits/rejected": 6.519626140594482, "logps/chosen": -178.34844970703125, "logps/rejected": -152.76821899414062, "loss": 0.5532, "rewards/accuracies": 0.5, "rewards/chosen": 0.04259766265749931, "rewards/margins": 0.45947548747062683, "rewards/rejected": -0.416877806186676, "step": 4655 }, { "epoch": 0.7200463947419292, "grad_norm": 4.2925028800964355, "learning_rate": 4.222133119486769e-06, "logits/chosen": 2.5514652729034424, "logits/rejected": 6.725386619567871, "logps/chosen": -186.39608764648438, "logps/rejected": -179.72972106933594, "loss": 0.6288, "rewards/accuracies": 0.75, "rewards/chosen": 0.32997971773147583, "rewards/margins": 0.1427549123764038, "rewards/rejected": 0.18722479045391083, "step": 4656 }, { "epoch": 0.7202010438816934, "grad_norm": 9.864386558532715, "learning_rate": 4.2218467178370955e-06, "logits/chosen": 12.211067199707031, "logits/rejected": 6.216061115264893, "logps/chosen": -313.1650390625, "logps/rejected": -233.2894744873047, "loss": 0.561, "rewards/accuracies": 0.875, "rewards/chosen": 0.7663730382919312, "rewards/margins": 0.3916390538215637, "rewards/rejected": 0.37473392486572266, "step": 4657 }, { "epoch": 0.7203556930214575, "grad_norm": 6.886110305786133, "learning_rate": 4.221560316187421e-06, "logits/chosen": 10.027338027954102, "logits/rejected": 7.370658874511719, "logps/chosen": -346.3270263671875, "logps/rejected": -281.984619140625, "loss": 0.8335, "rewards/accuracies": 0.125, "rewards/chosen": 0.22282001376152039, "rewards/margins": 0.032382868230342865, "rewards/rejected": 0.19043713808059692, "step": 4658 }, { "epoch": 0.7205103421612217, "grad_norm": 7.485498905181885, "learning_rate": 4.221273914537748e-06, "logits/chosen": 8.284996032714844, "logits/rejected": 10.953027725219727, "logps/chosen": -166.54698181152344, "logps/rejected": -210.5655975341797, "loss": 0.8508, "rewards/accuracies": 0.375, "rewards/chosen": 0.060010701417922974, "rewards/margins": -0.2089838683605194, "rewards/rejected": 0.2689945697784424, "step": 4659 }, { "epoch": 0.7206649913009859, "grad_norm": 3.434138298034668, "learning_rate": 4.220987512888075e-06, "logits/chosen": 13.868374824523926, "logits/rejected": 5.818326473236084, "logps/chosen": -275.8595275878906, "logps/rejected": -181.86875915527344, "loss": 0.4357, "rewards/accuracies": 0.875, "rewards/chosen": 0.7116816639900208, "rewards/margins": 0.7430300116539001, "rewards/rejected": -0.031348370015621185, "step": 4660 }, { "epoch": 0.7208196404407501, "grad_norm": 6.918261528015137, "learning_rate": 4.220701111238401e-06, "logits/chosen": 10.387393951416016, "logits/rejected": 7.188056468963623, "logps/chosen": -411.4892578125, "logps/rejected": -403.06341552734375, "loss": 0.6397, "rewards/accuracies": 0.625, "rewards/chosen": 0.44164544343948364, "rewards/margins": 0.20920506119728088, "rewards/rejected": 0.23244038224220276, "step": 4661 }, { "epoch": 0.7209742895805142, "grad_norm": 5.1262054443359375, "learning_rate": 4.220414709588728e-06, "logits/chosen": 10.633928298950195, "logits/rejected": 10.21422004699707, "logps/chosen": -194.33642578125, "logps/rejected": -295.7193908691406, "loss": 0.5587, "rewards/accuracies": 0.75, "rewards/chosen": 0.28601887822151184, "rewards/margins": 0.4282449781894684, "rewards/rejected": -0.14222612977027893, "step": 4662 }, { "epoch": 0.7211289387202784, "grad_norm": 5.970149993896484, "learning_rate": 4.2201283079390545e-06, "logits/chosen": 7.638833522796631, "logits/rejected": 7.223271369934082, "logps/chosen": -344.19964599609375, "logps/rejected": -227.15353393554688, "loss": 0.7451, "rewards/accuracies": 0.5, "rewards/chosen": 0.4394124150276184, "rewards/margins": 0.0007085800170898438, "rewards/rejected": 0.43870383501052856, "step": 4663 }, { "epoch": 0.7212835878600425, "grad_norm": 7.072237491607666, "learning_rate": 4.219841906289381e-06, "logits/chosen": 12.829007148742676, "logits/rejected": 7.617791652679443, "logps/chosen": -472.68359375, "logps/rejected": -373.8777160644531, "loss": 0.7759, "rewards/accuracies": 0.25, "rewards/chosen": 0.2337905764579773, "rewards/margins": -0.07853439450263977, "rewards/rejected": 0.31232497096061707, "step": 4664 }, { "epoch": 0.7214382369998067, "grad_norm": 5.687023162841797, "learning_rate": 4.219555504639707e-06, "logits/chosen": 12.600521087646484, "logits/rejected": 7.528505325317383, "logps/chosen": -310.04071044921875, "logps/rejected": -344.87689208984375, "loss": 0.5636, "rewards/accuracies": 0.625, "rewards/chosen": 0.65812087059021, "rewards/margins": 0.33446481823921204, "rewards/rejected": 0.32365599274635315, "step": 4665 }, { "epoch": 0.7215928861395708, "grad_norm": 5.265872955322266, "learning_rate": 4.219269102990034e-06, "logits/chosen": 12.636493682861328, "logits/rejected": 13.619648933410645, "logps/chosen": -290.1864013671875, "logps/rejected": -267.1124267578125, "loss": 0.7236, "rewards/accuracies": 0.625, "rewards/chosen": 0.18583893775939941, "rewards/margins": 0.06740047037601471, "rewards/rejected": 0.1184384748339653, "step": 4666 }, { "epoch": 0.721747535279335, "grad_norm": 5.027142524719238, "learning_rate": 4.21898270134036e-06, "logits/chosen": 7.26239538192749, "logits/rejected": 1.9124733209609985, "logps/chosen": -240.86940002441406, "logps/rejected": -187.12486267089844, "loss": 0.6469, "rewards/accuracies": 0.625, "rewards/chosen": -0.1723259836435318, "rewards/margins": 0.5006253123283386, "rewards/rejected": -0.6729512810707092, "step": 4667 }, { "epoch": 0.7219021844190991, "grad_norm": 6.47714376449585, "learning_rate": 4.218696299690687e-06, "logits/chosen": 11.352617263793945, "logits/rejected": 10.03576374053955, "logps/chosen": -303.8058776855469, "logps/rejected": -187.26290893554688, "loss": 0.8221, "rewards/accuracies": 0.625, "rewards/chosen": 0.18106481432914734, "rewards/margins": -0.1451394408941269, "rewards/rejected": 0.32620424032211304, "step": 4668 }, { "epoch": 0.7220568335588633, "grad_norm": 7.10595178604126, "learning_rate": 4.218409898041014e-06, "logits/chosen": 9.532785415649414, "logits/rejected": 5.463510513305664, "logps/chosen": -305.3661193847656, "logps/rejected": -405.561279296875, "loss": 0.6468, "rewards/accuracies": 0.5, "rewards/chosen": 0.4265161454677582, "rewards/margins": 0.1635202318429947, "rewards/rejected": 0.2629959285259247, "step": 4669 }, { "epoch": 0.7222114826986274, "grad_norm": 5.666043758392334, "learning_rate": 4.218123496391339e-06, "logits/chosen": 14.798239707946777, "logits/rejected": 12.213676452636719, "logps/chosen": -252.88157653808594, "logps/rejected": -224.15078735351562, "loss": 0.7198, "rewards/accuracies": 0.625, "rewards/chosen": 0.144762322306633, "rewards/margins": 0.16181355714797974, "rewards/rejected": -0.01705123484134674, "step": 4670 }, { "epoch": 0.7223661318383916, "grad_norm": 4.627646446228027, "learning_rate": 4.217837094741666e-06, "logits/chosen": 8.296591758728027, "logits/rejected": 7.6592817306518555, "logps/chosen": -243.8062744140625, "logps/rejected": -255.21615600585938, "loss": 0.6379, "rewards/accuracies": 0.375, "rewards/chosen": 0.14714735746383667, "rewards/margins": 0.24118289351463318, "rewards/rejected": -0.0940355509519577, "step": 4671 }, { "epoch": 0.7225207809781559, "grad_norm": 5.429712295532227, "learning_rate": 4.217550693091993e-06, "logits/chosen": 11.345115661621094, "logits/rejected": 10.367274284362793, "logps/chosen": -258.7862854003906, "logps/rejected": -271.5745544433594, "loss": 0.6911, "rewards/accuracies": 0.625, "rewards/chosen": 0.23161429166793823, "rewards/margins": 0.35463401675224304, "rewards/rejected": -0.1230197325348854, "step": 4672 }, { "epoch": 0.72267543011792, "grad_norm": 6.383058547973633, "learning_rate": 4.217264291442319e-06, "logits/chosen": 11.830735206604004, "logits/rejected": 5.300699234008789, "logps/chosen": -394.55120849609375, "logps/rejected": -401.2049255371094, "loss": 0.5976, "rewards/accuracies": 0.625, "rewards/chosen": 0.6347014307975769, "rewards/margins": 0.24926090240478516, "rewards/rejected": 0.38544055819511414, "step": 4673 }, { "epoch": 0.7228300792576842, "grad_norm": 5.6670451164245605, "learning_rate": 4.216977889792645e-06, "logits/chosen": 10.207159042358398, "logits/rejected": 10.706491470336914, "logps/chosen": -358.0547180175781, "logps/rejected": -348.438232421875, "loss": 0.51, "rewards/accuracies": 0.75, "rewards/chosen": 0.6313265562057495, "rewards/margins": 0.44541940093040466, "rewards/rejected": 0.18590718507766724, "step": 4674 }, { "epoch": 0.7229847283974483, "grad_norm": 4.458642959594727, "learning_rate": 4.216691488142972e-06, "logits/chosen": 13.730224609375, "logits/rejected": 7.8176188468933105, "logps/chosen": -215.33334350585938, "logps/rejected": -151.25917053222656, "loss": 0.5262, "rewards/accuracies": 0.625, "rewards/chosen": 0.22707036137580872, "rewards/margins": 0.5649893879890442, "rewards/rejected": -0.3379189968109131, "step": 4675 }, { "epoch": 0.7231393775372125, "grad_norm": 5.969623565673828, "learning_rate": 4.2164050864932984e-06, "logits/chosen": 4.943775177001953, "logits/rejected": 6.249081611633301, "logps/chosen": -267.60107421875, "logps/rejected": -259.83892822265625, "loss": 0.6768, "rewards/accuracies": 0.625, "rewards/chosen": 0.24906936287879944, "rewards/margins": 0.10447268933057785, "rewards/rejected": 0.144596666097641, "step": 4676 }, { "epoch": 0.7232940266769766, "grad_norm": 7.427109718322754, "learning_rate": 4.216118684843625e-06, "logits/chosen": 3.4262521266937256, "logits/rejected": 4.808072090148926, "logps/chosen": -288.4163513183594, "logps/rejected": -293.7523193359375, "loss": 0.8102, "rewards/accuracies": 0.5, "rewards/chosen": -0.14433889091014862, "rewards/margins": -0.06348054111003876, "rewards/rejected": -0.08085831999778748, "step": 4677 }, { "epoch": 0.7234486758167408, "grad_norm": 5.3566083908081055, "learning_rate": 4.215832283193952e-06, "logits/chosen": 13.013839721679688, "logits/rejected": 11.353792190551758, "logps/chosen": -306.7126159667969, "logps/rejected": -241.7992706298828, "loss": 0.5584, "rewards/accuracies": 0.75, "rewards/chosen": 0.34898167848587036, "rewards/margins": 0.4810735285282135, "rewards/rejected": -0.13209183514118195, "step": 4678 }, { "epoch": 0.7236033249565049, "grad_norm": 4.50926399230957, "learning_rate": 4.2155458815442775e-06, "logits/chosen": 11.905725479125977, "logits/rejected": 6.3731560707092285, "logps/chosen": -228.26748657226562, "logps/rejected": -194.92642211914062, "loss": 0.5013, "rewards/accuracies": 0.875, "rewards/chosen": 0.46943017840385437, "rewards/margins": 0.5827988386154175, "rewards/rejected": -0.1133686751127243, "step": 4679 }, { "epoch": 0.7237579740962691, "grad_norm": 4.547010898590088, "learning_rate": 4.215259479894604e-06, "logits/chosen": 14.651841163635254, "logits/rejected": 13.051101684570312, "logps/chosen": -285.75732421875, "logps/rejected": -295.51983642578125, "loss": 0.6404, "rewards/accuracies": 0.5, "rewards/chosen": 0.3800147473812103, "rewards/margins": 0.20444633066654205, "rewards/rejected": 0.17556840181350708, "step": 4680 }, { "epoch": 0.7239126232360332, "grad_norm": 3.6450984477996826, "learning_rate": 4.214973078244931e-06, "logits/chosen": 6.817581653594971, "logits/rejected": 3.956143379211426, "logps/chosen": -161.10848999023438, "logps/rejected": -129.74124145507812, "loss": 0.6635, "rewards/accuracies": 0.75, "rewards/chosen": 0.15393391251564026, "rewards/margins": 0.06820394098758698, "rewards/rejected": 0.08572997897863388, "step": 4681 }, { "epoch": 0.7240672723757974, "grad_norm": 5.038077354431152, "learning_rate": 4.2146866765952575e-06, "logits/chosen": 6.799264907836914, "logits/rejected": 10.929970741271973, "logps/chosen": -167.8234100341797, "logps/rejected": -192.6121826171875, "loss": 0.7377, "rewards/accuracies": 0.625, "rewards/chosen": 0.043410152196884155, "rewards/margins": 0.10863767564296722, "rewards/rejected": -0.06522750109434128, "step": 4682 }, { "epoch": 0.7242219215155615, "grad_norm": 7.810519695281982, "learning_rate": 4.214400274945584e-06, "logits/chosen": 6.762552738189697, "logits/rejected": 7.165021896362305, "logps/chosen": -168.40550231933594, "logps/rejected": -168.51998901367188, "loss": 0.9501, "rewards/accuracies": 0.25, "rewards/chosen": -0.6140388250350952, "rewards/margins": -0.37804698944091797, "rewards/rejected": -0.23599186539649963, "step": 4683 }, { "epoch": 0.7243765706553257, "grad_norm": 6.186726093292236, "learning_rate": 4.21411387329591e-06, "logits/chosen": 6.37883996963501, "logits/rejected": 3.2008724212646484, "logps/chosen": -265.9684753417969, "logps/rejected": -208.27178955078125, "loss": 0.4794, "rewards/accuracies": 0.625, "rewards/chosen": 0.25573229789733887, "rewards/margins": 0.6330490112304688, "rewards/rejected": -0.3773166835308075, "step": 4684 }, { "epoch": 0.7245312197950899, "grad_norm": 7.938426494598389, "learning_rate": 4.213827471646237e-06, "logits/chosen": 1.4458963871002197, "logits/rejected": 4.18729305267334, "logps/chosen": -433.803955078125, "logps/rejected": -410.27362060546875, "loss": 0.6554, "rewards/accuracies": 0.75, "rewards/chosen": 0.33891814947128296, "rewards/margins": 0.16704228520393372, "rewards/rejected": 0.17187586426734924, "step": 4685 }, { "epoch": 0.7246858689348541, "grad_norm": 15.807042121887207, "learning_rate": 4.213541069996563e-06, "logits/chosen": 14.012039184570312, "logits/rejected": 8.866901397705078, "logps/chosen": -315.65911865234375, "logps/rejected": -259.985107421875, "loss": 0.7432, "rewards/accuracies": 0.5, "rewards/chosen": 0.3463870882987976, "rewards/margins": 0.02259824424982071, "rewards/rejected": 0.3237888514995575, "step": 4686 }, { "epoch": 0.7248405180746182, "grad_norm": 4.948505878448486, "learning_rate": 4.21325466834689e-06, "logits/chosen": 17.208398818969727, "logits/rejected": 8.785602569580078, "logps/chosen": -292.0123291015625, "logps/rejected": -170.625732421875, "loss": 0.69, "rewards/accuracies": 0.5, "rewards/chosen": 0.3512876331806183, "rewards/margins": 0.16835862398147583, "rewards/rejected": 0.18292903900146484, "step": 4687 }, { "epoch": 0.7249951672143824, "grad_norm": 4.025766849517822, "learning_rate": 4.2129682666972166e-06, "logits/chosen": 13.391169548034668, "logits/rejected": 10.078253746032715, "logps/chosen": -210.8770751953125, "logps/rejected": -198.12782287597656, "loss": 0.7102, "rewards/accuracies": 0.25, "rewards/chosen": 0.010708902031183243, "rewards/margins": 0.06500528752803802, "rewards/rejected": -0.05429638922214508, "step": 4688 }, { "epoch": 0.7251498163541465, "grad_norm": 6.953975200653076, "learning_rate": 4.212681865047543e-06, "logits/chosen": 12.4249906539917, "logits/rejected": 7.346518516540527, "logps/chosen": -282.6623229980469, "logps/rejected": -266.74237060546875, "loss": 0.6153, "rewards/accuracies": 0.875, "rewards/chosen": -0.018584638833999634, "rewards/margins": 0.38569384813308716, "rewards/rejected": -0.4042784869670868, "step": 4689 }, { "epoch": 0.7253044654939107, "grad_norm": 7.109914779663086, "learning_rate": 4.21239546339787e-06, "logits/chosen": 15.69974136352539, "logits/rejected": 11.655282974243164, "logps/chosen": -400.8044738769531, "logps/rejected": -367.01409912109375, "loss": 0.6945, "rewards/accuracies": 0.625, "rewards/chosen": 0.18244630098342896, "rewards/margins": 0.11077319830656052, "rewards/rejected": 0.07167311012744904, "step": 4690 }, { "epoch": 0.7254591146336749, "grad_norm": 7.34127950668335, "learning_rate": 4.212109061748196e-06, "logits/chosen": 8.716079711914062, "logits/rejected": 8.624029159545898, "logps/chosen": -246.34829711914062, "logps/rejected": -177.95440673828125, "loss": 0.5315, "rewards/accuracies": 0.75, "rewards/chosen": 0.2629864811897278, "rewards/margins": 0.4446890652179718, "rewards/rejected": -0.1817026138305664, "step": 4691 }, { "epoch": 0.725613763773439, "grad_norm": 5.421318531036377, "learning_rate": 4.211822660098522e-06, "logits/chosen": 11.135812759399414, "logits/rejected": 12.228429794311523, "logps/chosen": -169.8853759765625, "logps/rejected": -197.93788146972656, "loss": 0.6319, "rewards/accuracies": 0.5, "rewards/chosen": -0.26685449481010437, "rewards/margins": 0.20001932978630066, "rewards/rejected": -0.46687382459640503, "step": 4692 }, { "epoch": 0.7257684129132032, "grad_norm": 6.548890590667725, "learning_rate": 4.211536258448849e-06, "logits/chosen": 6.484709739685059, "logits/rejected": 6.935871124267578, "logps/chosen": -194.16201782226562, "logps/rejected": -214.46810913085938, "loss": 0.712, "rewards/accuracies": 0.375, "rewards/chosen": -0.3927028477191925, "rewards/margins": 0.12043533474206924, "rewards/rejected": -0.5131381750106812, "step": 4693 }, { "epoch": 0.7259230620529673, "grad_norm": 4.851171016693115, "learning_rate": 4.211249856799176e-06, "logits/chosen": 9.42538070678711, "logits/rejected": 8.364532470703125, "logps/chosen": -263.5284118652344, "logps/rejected": -320.7128601074219, "loss": 0.4806, "rewards/accuracies": 0.875, "rewards/chosen": 0.2540663778781891, "rewards/margins": 0.527794361114502, "rewards/rejected": -0.27372801303863525, "step": 4694 }, { "epoch": 0.7260777111927315, "grad_norm": 3.683863878250122, "learning_rate": 4.210963455149502e-06, "logits/chosen": 10.690038681030273, "logits/rejected": 7.809244632720947, "logps/chosen": -220.42422485351562, "logps/rejected": -141.40753173828125, "loss": 0.4945, "rewards/accuracies": 0.75, "rewards/chosen": 0.37099236249923706, "rewards/margins": 0.5335652828216553, "rewards/rejected": -0.1625729501247406, "step": 4695 }, { "epoch": 0.7262323603324956, "grad_norm": 9.078512191772461, "learning_rate": 4.210677053499829e-06, "logits/chosen": 3.4368064403533936, "logits/rejected": 1.8862788677215576, "logps/chosen": -289.85150146484375, "logps/rejected": -457.8827209472656, "loss": 0.6034, "rewards/accuracies": 0.625, "rewards/chosen": 0.2011779397726059, "rewards/margins": 0.2352014183998108, "rewards/rejected": -0.034023478627204895, "step": 4696 }, { "epoch": 0.7263870094722598, "grad_norm": 4.544132709503174, "learning_rate": 4.2103906518501556e-06, "logits/chosen": 9.603703498840332, "logits/rejected": 4.0646233558654785, "logps/chosen": -269.1265869140625, "logps/rejected": -249.3544921875, "loss": 0.5998, "rewards/accuracies": 0.75, "rewards/chosen": 0.2045774757862091, "rewards/margins": 0.2614637017250061, "rewards/rejected": -0.05688624083995819, "step": 4697 }, { "epoch": 0.726541658612024, "grad_norm": 4.941222190856934, "learning_rate": 4.210104250200481e-06, "logits/chosen": 8.435171127319336, "logits/rejected": 10.146812438964844, "logps/chosen": -295.2537536621094, "logps/rejected": -275.2083740234375, "loss": 0.7124, "rewards/accuracies": 0.5, "rewards/chosen": 0.31353017687797546, "rewards/margins": -0.003013603389263153, "rewards/rejected": 0.3165437579154968, "step": 4698 }, { "epoch": 0.7266963077517882, "grad_norm": 5.756451606750488, "learning_rate": 4.209817848550808e-06, "logits/chosen": 14.595026969909668, "logits/rejected": 8.859549522399902, "logps/chosen": -290.5458984375, "logps/rejected": -230.3878173828125, "loss": 0.6936, "rewards/accuracies": 0.75, "rewards/chosen": 0.151070237159729, "rewards/margins": 0.1971987932920456, "rewards/rejected": -0.04612858593463898, "step": 4699 }, { "epoch": 0.7268509568915523, "grad_norm": 6.423048496246338, "learning_rate": 4.209531446901135e-06, "logits/chosen": 7.840253829956055, "logits/rejected": 8.613065719604492, "logps/chosen": -315.06787109375, "logps/rejected": -353.32757568359375, "loss": 0.6718, "rewards/accuracies": 0.5, "rewards/chosen": 0.17087221145629883, "rewards/margins": 0.2379167228937149, "rewards/rejected": -0.06704450398683548, "step": 4700 }, { "epoch": 0.7270056060313165, "grad_norm": 6.030930995941162, "learning_rate": 4.209245045251461e-06, "logits/chosen": 11.919048309326172, "logits/rejected": 16.987382888793945, "logps/chosen": -240.172607421875, "logps/rejected": -270.4597473144531, "loss": 0.7472, "rewards/accuracies": 0.5, "rewards/chosen": 0.25187310576438904, "rewards/margins": -0.061295412480831146, "rewards/rejected": 0.3131685256958008, "step": 4701 }, { "epoch": 0.7271602551710806, "grad_norm": 4.687925815582275, "learning_rate": 4.208958643601788e-06, "logits/chosen": 13.742670059204102, "logits/rejected": 12.651939392089844, "logps/chosen": -253.6357421875, "logps/rejected": -240.97171020507812, "loss": 0.6148, "rewards/accuracies": 0.625, "rewards/chosen": 0.2723259925842285, "rewards/margins": 0.20202729105949402, "rewards/rejected": 0.0702986791729927, "step": 4702 }, { "epoch": 0.7273149043108448, "grad_norm": 5.41274881362915, "learning_rate": 4.208672241952115e-06, "logits/chosen": 12.720199584960938, "logits/rejected": 3.3738980293273926, "logps/chosen": -396.3179626464844, "logps/rejected": -265.2395324707031, "loss": 0.4783, "rewards/accuracies": 0.75, "rewards/chosen": 0.6759431958198547, "rewards/margins": 0.7797024846076965, "rewards/rejected": -0.1037592887878418, "step": 4703 }, { "epoch": 0.7274695534506089, "grad_norm": 5.377590179443359, "learning_rate": 4.20838584030244e-06, "logits/chosen": 12.636724472045898, "logits/rejected": 12.154258728027344, "logps/chosen": -269.744873046875, "logps/rejected": -206.98141479492188, "loss": 0.7226, "rewards/accuracies": 0.375, "rewards/chosen": 0.09787529706954956, "rewards/margins": -0.03338862210512161, "rewards/rejected": 0.13126391172409058, "step": 4704 }, { "epoch": 0.7276242025903731, "grad_norm": 4.086818218231201, "learning_rate": 4.208099438652767e-06, "logits/chosen": 5.638157367706299, "logits/rejected": 6.079058647155762, "logps/chosen": -269.8873596191406, "logps/rejected": -220.60565185546875, "loss": 0.4946, "rewards/accuracies": 0.625, "rewards/chosen": 0.4810035824775696, "rewards/margins": 0.5613747835159302, "rewards/rejected": -0.08037119358778, "step": 4705 }, { "epoch": 0.7277788517301372, "grad_norm": 7.9334893226623535, "learning_rate": 4.207813037003094e-06, "logits/chosen": 10.022344589233398, "logits/rejected": 3.5141170024871826, "logps/chosen": -441.4126281738281, "logps/rejected": -255.1077117919922, "loss": 0.6307, "rewards/accuracies": 0.625, "rewards/chosen": 0.2591535747051239, "rewards/margins": 0.3867121934890747, "rewards/rejected": -0.1275586187839508, "step": 4706 }, { "epoch": 0.7279335008699014, "grad_norm": 5.438113689422607, "learning_rate": 4.20752663535342e-06, "logits/chosen": 14.031675338745117, "logits/rejected": 6.951600074768066, "logps/chosen": -468.0054016113281, "logps/rejected": -335.5232849121094, "loss": 0.5249, "rewards/accuracies": 0.625, "rewards/chosen": 0.1453966200351715, "rewards/margins": 0.5273954272270203, "rewards/rejected": -0.38199883699417114, "step": 4707 }, { "epoch": 0.7280881500096655, "grad_norm": 3.688727617263794, "learning_rate": 4.207240233703746e-06, "logits/chosen": 14.206121444702148, "logits/rejected": 5.133615493774414, "logps/chosen": -288.57904052734375, "logps/rejected": -191.79388427734375, "loss": 0.5379, "rewards/accuracies": 0.5, "rewards/chosen": 0.17728619277477264, "rewards/margins": 0.4602702856063843, "rewards/rejected": -0.2829841077327728, "step": 4708 }, { "epoch": 0.7282427991494297, "grad_norm": 8.024949073791504, "learning_rate": 4.206953832054073e-06, "logits/chosen": 14.46672248840332, "logits/rejected": 7.849008560180664, "logps/chosen": -473.2665100097656, "logps/rejected": -273.0191650390625, "loss": 0.6551, "rewards/accuracies": 0.375, "rewards/chosen": 0.28687095642089844, "rewards/margins": 0.13161106407642365, "rewards/rejected": 0.155259907245636, "step": 4709 }, { "epoch": 0.7283974482891938, "grad_norm": 4.506289482116699, "learning_rate": 4.2066674304043995e-06, "logits/chosen": 5.828250885009766, "logits/rejected": 6.773597717285156, "logps/chosen": -179.40469360351562, "logps/rejected": -224.75875854492188, "loss": 0.6985, "rewards/accuracies": 0.375, "rewards/chosen": -0.0276484452188015, "rewards/margins": 0.016211219131946564, "rewards/rejected": -0.04385966807603836, "step": 4710 }, { "epoch": 0.7285520974289581, "grad_norm": 4.69533109664917, "learning_rate": 4.206381028754726e-06, "logits/chosen": 10.374015808105469, "logits/rejected": 9.881458282470703, "logps/chosen": -278.9754943847656, "logps/rejected": -288.93609619140625, "loss": 0.6551, "rewards/accuracies": 0.75, "rewards/chosen": 0.49047279357910156, "rewards/margins": 0.2534168064594269, "rewards/rejected": 0.23705598711967468, "step": 4711 }, { "epoch": 0.7287067465687223, "grad_norm": 8.948022842407227, "learning_rate": 4.206094627105052e-06, "logits/chosen": 11.090749740600586, "logits/rejected": 11.137782096862793, "logps/chosen": -235.52304077148438, "logps/rejected": -216.55130004882812, "loss": 0.6269, "rewards/accuracies": 0.625, "rewards/chosen": 0.055186688899993896, "rewards/margins": 0.3071196675300598, "rewards/rejected": -0.2519330084323883, "step": 4712 }, { "epoch": 0.7288613957084864, "grad_norm": 4.812723159790039, "learning_rate": 4.205808225455379e-06, "logits/chosen": 9.250722885131836, "logits/rejected": 6.440821647644043, "logps/chosen": -253.46392822265625, "logps/rejected": -308.44012451171875, "loss": 0.5141, "rewards/accuracies": 0.625, "rewards/chosen": 0.4054829478263855, "rewards/margins": 0.5554682612419128, "rewards/rejected": -0.14998534321784973, "step": 4713 }, { "epoch": 0.7290160448482506, "grad_norm": 7.435058116912842, "learning_rate": 4.205521823805705e-06, "logits/chosen": 6.702761173248291, "logits/rejected": 9.35500431060791, "logps/chosen": -401.7355651855469, "logps/rejected": -339.3660888671875, "loss": 0.6215, "rewards/accuracies": 0.5, "rewards/chosen": -0.12997551262378693, "rewards/margins": 0.24712444841861725, "rewards/rejected": -0.3770999312400818, "step": 4714 }, { "epoch": 0.7291706939880147, "grad_norm": 6.192553997039795, "learning_rate": 4.205235422156032e-06, "logits/chosen": 6.972842216491699, "logits/rejected": 10.829671859741211, "logps/chosen": -224.13345336914062, "logps/rejected": -254.299072265625, "loss": 0.7744, "rewards/accuracies": 0.625, "rewards/chosen": -0.15659284591674805, "rewards/margins": -0.07944852858781815, "rewards/rejected": -0.07714433968067169, "step": 4715 }, { "epoch": 0.7293253431277789, "grad_norm": 8.186197280883789, "learning_rate": 4.2049490205063585e-06, "logits/chosen": 13.802979469299316, "logits/rejected": 7.220629692077637, "logps/chosen": -434.03558349609375, "logps/rejected": -249.51254272460938, "loss": 0.7413, "rewards/accuracies": 0.5, "rewards/chosen": -0.09063348919153214, "rewards/margins": -0.06410132348537445, "rewards/rejected": -0.02653217688202858, "step": 4716 }, { "epoch": 0.729479992267543, "grad_norm": 7.190593719482422, "learning_rate": 4.204662618856684e-06, "logits/chosen": -1.2313786745071411, "logits/rejected": 8.329784393310547, "logps/chosen": -188.1888427734375, "logps/rejected": -320.2513732910156, "loss": 0.8256, "rewards/accuracies": 0.375, "rewards/chosen": 0.2141110897064209, "rewards/margins": -0.10777591168880463, "rewards/rejected": 0.3218870162963867, "step": 4717 }, { "epoch": 0.7296346414073072, "grad_norm": 3.9558451175689697, "learning_rate": 4.204376217207011e-06, "logits/chosen": 13.355031967163086, "logits/rejected": 14.219476699829102, "logps/chosen": -238.9228515625, "logps/rejected": -235.6582489013672, "loss": 0.4603, "rewards/accuracies": 1.0, "rewards/chosen": 0.39660704135894775, "rewards/margins": 0.5685549974441528, "rewards/rejected": -0.17194795608520508, "step": 4718 }, { "epoch": 0.7297892905470713, "grad_norm": 3.863690137863159, "learning_rate": 4.204089815557338e-06, "logits/chosen": 15.465452194213867, "logits/rejected": 3.9272823333740234, "logps/chosen": -221.74464416503906, "logps/rejected": -112.0164794921875, "loss": 0.5725, "rewards/accuracies": 0.75, "rewards/chosen": 0.20435886085033417, "rewards/margins": 0.30952098965644836, "rewards/rejected": -0.10516209900379181, "step": 4719 }, { "epoch": 0.7299439396868355, "grad_norm": 5.704815864562988, "learning_rate": 4.203803413907664e-06, "logits/chosen": 7.788920879364014, "logits/rejected": 9.360577583312988, "logps/chosen": -278.876220703125, "logps/rejected": -325.6363525390625, "loss": 0.4767, "rewards/accuracies": 1.0, "rewards/chosen": 0.30553552508354187, "rewards/margins": 0.5258297324180603, "rewards/rejected": -0.22029419243335724, "step": 4720 }, { "epoch": 0.7300985888265996, "grad_norm": 5.160268783569336, "learning_rate": 4.203517012257991e-06, "logits/chosen": 11.97552490234375, "logits/rejected": 4.361750602722168, "logps/chosen": -324.85284423828125, "logps/rejected": -205.0785675048828, "loss": 0.4747, "rewards/accuracies": 0.875, "rewards/chosen": 0.15700645744800568, "rewards/margins": 0.6302173733711243, "rewards/rejected": -0.4732109308242798, "step": 4721 }, { "epoch": 0.7302532379663638, "grad_norm": 9.462347030639648, "learning_rate": 4.203230610608318e-06, "logits/chosen": 9.682936668395996, "logits/rejected": 5.709912300109863, "logps/chosen": -255.21612548828125, "logps/rejected": -246.0841522216797, "loss": 0.7568, "rewards/accuracies": 0.625, "rewards/chosen": 0.016873463988304138, "rewards/margins": -0.029597178101539612, "rewards/rejected": 0.04647064581513405, "step": 4722 }, { "epoch": 0.7304078871061279, "grad_norm": 5.32291316986084, "learning_rate": 4.202944208958644e-06, "logits/chosen": 9.940293312072754, "logits/rejected": 6.945707321166992, "logps/chosen": -270.47711181640625, "logps/rejected": -200.5116424560547, "loss": 0.5554, "rewards/accuracies": 0.625, "rewards/chosen": 0.004439353942871094, "rewards/margins": 0.5420545339584351, "rewards/rejected": -0.5376152396202087, "step": 4723 }, { "epoch": 0.7305625362458922, "grad_norm": 5.322466850280762, "learning_rate": 4.20265780730897e-06, "logits/chosen": 11.245365142822266, "logits/rejected": 6.75719690322876, "logps/chosen": -281.06927490234375, "logps/rejected": -216.71633911132812, "loss": 0.6863, "rewards/accuracies": 0.5, "rewards/chosen": 0.44138044118881226, "rewards/margins": 0.4201873540878296, "rewards/rejected": 0.021193087100982666, "step": 4724 }, { "epoch": 0.7307171853856563, "grad_norm": 9.266231536865234, "learning_rate": 4.202371405659297e-06, "logits/chosen": 8.465092658996582, "logits/rejected": 13.018942832946777, "logps/chosen": -375.2256774902344, "logps/rejected": -444.3348693847656, "loss": 0.7855, "rewards/accuracies": 0.625, "rewards/chosen": 0.4741671085357666, "rewards/margins": -0.051219433546066284, "rewards/rejected": 0.5253865718841553, "step": 4725 }, { "epoch": 0.7308718345254205, "grad_norm": 7.655884265899658, "learning_rate": 4.202085004009623e-06, "logits/chosen": 11.626346588134766, "logits/rejected": 8.894448280334473, "logps/chosen": -267.4024658203125, "logps/rejected": -333.1418762207031, "loss": 0.7425, "rewards/accuracies": 0.625, "rewards/chosen": 0.20657792687416077, "rewards/margins": 0.060829345136880875, "rewards/rejected": 0.14574861526489258, "step": 4726 }, { "epoch": 0.7310264836651846, "grad_norm": 6.579062461853027, "learning_rate": 4.20179860235995e-06, "logits/chosen": 11.06484603881836, "logits/rejected": 11.69767951965332, "logps/chosen": -353.1047058105469, "logps/rejected": -390.7818908691406, "loss": 0.6783, "rewards/accuracies": 0.625, "rewards/chosen": 0.3880038857460022, "rewards/margins": 0.30388742685317993, "rewards/rejected": 0.08411645889282227, "step": 4727 }, { "epoch": 0.7311811328049488, "grad_norm": 5.405943393707275, "learning_rate": 4.201512200710277e-06, "logits/chosen": 5.7828521728515625, "logits/rejected": 0.461294025182724, "logps/chosen": -272.8395690917969, "logps/rejected": -191.7827606201172, "loss": 0.6459, "rewards/accuracies": 0.75, "rewards/chosen": 0.18535934388637543, "rewards/margins": 0.39121270179748535, "rewards/rejected": -0.20585334300994873, "step": 4728 }, { "epoch": 0.7313357819447129, "grad_norm": 5.422591209411621, "learning_rate": 4.201225799060603e-06, "logits/chosen": 9.699492454528809, "logits/rejected": 4.114933013916016, "logps/chosen": -381.24365234375, "logps/rejected": -239.6695556640625, "loss": 0.5684, "rewards/accuracies": 0.75, "rewards/chosen": 0.4555453062057495, "rewards/margins": 0.459337055683136, "rewards/rejected": -0.0037917643785476685, "step": 4729 }, { "epoch": 0.7314904310844771, "grad_norm": 4.382835865020752, "learning_rate": 4.200939397410929e-06, "logits/chosen": 11.869800567626953, "logits/rejected": 0.9620145559310913, "logps/chosen": -172.9580078125, "logps/rejected": -122.19593811035156, "loss": 0.6222, "rewards/accuracies": 0.75, "rewards/chosen": -0.09751349687576294, "rewards/margins": 0.22684121131896973, "rewards/rejected": -0.32435470819473267, "step": 4730 }, { "epoch": 0.7316450802242412, "grad_norm": 4.710762977600098, "learning_rate": 4.200652995761256e-06, "logits/chosen": 8.49771785736084, "logits/rejected": 6.4628214836120605, "logps/chosen": -198.4684600830078, "logps/rejected": -180.46646118164062, "loss": 0.5961, "rewards/accuracies": 0.625, "rewards/chosen": 0.13437537848949432, "rewards/margins": 0.4670109450817108, "rewards/rejected": -0.3326355516910553, "step": 4731 }, { "epoch": 0.7317997293640054, "grad_norm": 4.254397392272949, "learning_rate": 4.200366594111582e-06, "logits/chosen": 7.064027786254883, "logits/rejected": 5.760613441467285, "logps/chosen": -139.57577514648438, "logps/rejected": -183.42982482910156, "loss": 0.4471, "rewards/accuracies": 0.625, "rewards/chosen": 0.03424603492021561, "rewards/margins": 1.1380047798156738, "rewards/rejected": -1.1037589311599731, "step": 4732 }, { "epoch": 0.7319543785037695, "grad_norm": 4.80014181137085, "learning_rate": 4.200080192461909e-06, "logits/chosen": 10.552350044250488, "logits/rejected": 9.416030883789062, "logps/chosen": -274.9895324707031, "logps/rejected": -238.43426513671875, "loss": 0.7119, "rewards/accuracies": 0.625, "rewards/chosen": 0.2599753737449646, "rewards/margins": 0.06376844644546509, "rewards/rejected": 0.1962069272994995, "step": 4733 }, { "epoch": 0.7321090276435337, "grad_norm": 5.721470832824707, "learning_rate": 4.199793790812236e-06, "logits/chosen": 13.150030136108398, "logits/rejected": 11.381097793579102, "logps/chosen": -338.6885681152344, "logps/rejected": -347.7142028808594, "loss": 0.6256, "rewards/accuracies": 0.75, "rewards/chosen": 0.5123617649078369, "rewards/margins": 0.2433176338672638, "rewards/rejected": 0.26904410123825073, "step": 4734 }, { "epoch": 0.7322636767832978, "grad_norm": 8.358860969543457, "learning_rate": 4.199507389162562e-06, "logits/chosen": 9.067138671875, "logits/rejected": 9.469714164733887, "logps/chosen": -354.45556640625, "logps/rejected": -316.4116516113281, "loss": 0.8371, "rewards/accuracies": 0.5, "rewards/chosen": 0.11283457279205322, "rewards/margins": -0.10998454689979553, "rewards/rejected": 0.22281914949417114, "step": 4735 }, { "epoch": 0.7324183259230621, "grad_norm": 4.5861616134643555, "learning_rate": 4.199220987512889e-06, "logits/chosen": 13.042624473571777, "logits/rejected": 4.53838586807251, "logps/chosen": -462.93255615234375, "logps/rejected": -259.7115478515625, "loss": 0.438, "rewards/accuracies": 1.0, "rewards/chosen": 0.29311391711235046, "rewards/margins": 0.6601704359054565, "rewards/rejected": -0.3670564889907837, "step": 4736 }, { "epoch": 0.7325729750628263, "grad_norm": 6.264298439025879, "learning_rate": 4.198934585863215e-06, "logits/chosen": 6.475538730621338, "logits/rejected": 5.763686180114746, "logps/chosen": -136.54244995117188, "logps/rejected": -277.5910949707031, "loss": 0.6085, "rewards/accuracies": 0.5, "rewards/chosen": 0.07031980156898499, "rewards/margins": 0.27153682708740234, "rewards/rejected": -0.20121705532073975, "step": 4737 }, { "epoch": 0.7327276242025904, "grad_norm": 5.928725242614746, "learning_rate": 4.1986481842135415e-06, "logits/chosen": 7.112873554229736, "logits/rejected": 6.089903831481934, "logps/chosen": -181.59841918945312, "logps/rejected": -168.23574829101562, "loss": 0.7359, "rewards/accuracies": 0.375, "rewards/chosen": -0.16707392036914825, "rewards/margins": -0.03283804655075073, "rewards/rejected": -0.13423585891723633, "step": 4738 }, { "epoch": 0.7328822733423546, "grad_norm": 5.5603413581848145, "learning_rate": 4.198361782563868e-06, "logits/chosen": 10.272453308105469, "logits/rejected": 3.9385972023010254, "logps/chosen": -201.46035766601562, "logps/rejected": -181.38662719726562, "loss": 0.7557, "rewards/accuracies": 0.5, "rewards/chosen": -0.07115526497364044, "rewards/margins": -0.0715007334947586, "rewards/rejected": 0.00034546852111816406, "step": 4739 }, { "epoch": 0.7330369224821187, "grad_norm": 5.641921520233154, "learning_rate": 4.198075380914195e-06, "logits/chosen": 10.48651123046875, "logits/rejected": 8.48303508758545, "logps/chosen": -241.899658203125, "logps/rejected": -215.71957397460938, "loss": 0.6765, "rewards/accuracies": 0.375, "rewards/chosen": 0.1212267279624939, "rewards/margins": 0.1581042855978012, "rewards/rejected": -0.03687753528356552, "step": 4740 }, { "epoch": 0.7331915716218829, "grad_norm": 4.05303955078125, "learning_rate": 4.197788979264521e-06, "logits/chosen": 11.159069061279297, "logits/rejected": 7.462393760681152, "logps/chosen": -410.6734313964844, "logps/rejected": -311.5502014160156, "loss": 0.4606, "rewards/accuracies": 0.875, "rewards/chosen": 0.5157081484794617, "rewards/margins": 0.7549344301223755, "rewards/rejected": -0.239226296544075, "step": 4741 }, { "epoch": 0.733346220761647, "grad_norm": 5.8020339012146, "learning_rate": 4.197502577614847e-06, "logits/chosen": 12.310346603393555, "logits/rejected": 10.4533052444458, "logps/chosen": -382.34552001953125, "logps/rejected": -236.3104248046875, "loss": 0.6069, "rewards/accuracies": 0.5, "rewards/chosen": -0.0025434568524360657, "rewards/margins": 0.24380816519260406, "rewards/rejected": -0.24635162949562073, "step": 4742 }, { "epoch": 0.7335008699014112, "grad_norm": 6.338212966918945, "learning_rate": 4.197216175965174e-06, "logits/chosen": 9.718679428100586, "logits/rejected": 8.147027015686035, "logps/chosen": -296.2594299316406, "logps/rejected": -247.516845703125, "loss": 0.7865, "rewards/accuracies": 0.375, "rewards/chosen": -0.11442165821790695, "rewards/margins": 0.0996694266796112, "rewards/rejected": -0.21409109234809875, "step": 4743 }, { "epoch": 0.7336555190411753, "grad_norm": 5.800508499145508, "learning_rate": 4.1969297743155005e-06, "logits/chosen": 10.19094467163086, "logits/rejected": 10.416265487670898, "logps/chosen": -242.73402404785156, "logps/rejected": -208.94903564453125, "loss": 0.6988, "rewards/accuracies": 0.25, "rewards/chosen": 0.23169006407260895, "rewards/margins": 0.007804155349731445, "rewards/rejected": 0.2238859087228775, "step": 4744 }, { "epoch": 0.7338101681809395, "grad_norm": 5.29016637802124, "learning_rate": 4.196643372665827e-06, "logits/chosen": 13.938228607177734, "logits/rejected": 10.117415428161621, "logps/chosen": -378.0729675292969, "logps/rejected": -338.91815185546875, "loss": 0.5675, "rewards/accuracies": 0.625, "rewards/chosen": 0.39828574657440186, "rewards/margins": 0.3874906897544861, "rewards/rejected": 0.010795064270496368, "step": 4745 }, { "epoch": 0.7339648173207036, "grad_norm": 5.569178104400635, "learning_rate": 4.196356971016153e-06, "logits/chosen": 7.9367523193359375, "logits/rejected": 6.671178817749023, "logps/chosen": -352.7064208984375, "logps/rejected": -321.2003479003906, "loss": 0.6386, "rewards/accuracies": 0.75, "rewards/chosen": 0.08021535724401474, "rewards/margins": 0.21329127252101898, "rewards/rejected": -0.13307590782642365, "step": 4746 }, { "epoch": 0.7341194664604678, "grad_norm": 8.94774055480957, "learning_rate": 4.19607056936648e-06, "logits/chosen": 10.536354064941406, "logits/rejected": 4.0362935066223145, "logps/chosen": -517.2574462890625, "logps/rejected": -228.9273223876953, "loss": 0.6895, "rewards/accuracies": 0.625, "rewards/chosen": 0.14852410554885864, "rewards/margins": 0.19637462496757507, "rewards/rejected": -0.04785052686929703, "step": 4747 }, { "epoch": 0.7342741156002319, "grad_norm": 4.9800214767456055, "learning_rate": 4.195784167716806e-06, "logits/chosen": 13.256465911865234, "logits/rejected": 13.477039337158203, "logps/chosen": -220.45750427246094, "logps/rejected": -221.9154052734375, "loss": 0.6814, "rewards/accuracies": 0.875, "rewards/chosen": 0.09138321876525879, "rewards/margins": 0.09960539638996124, "rewards/rejected": -0.00822218507528305, "step": 4748 }, { "epoch": 0.7344287647399962, "grad_norm": 4.7813286781311035, "learning_rate": 4.195497766067133e-06, "logits/chosen": 11.675992965698242, "logits/rejected": 8.03116512298584, "logps/chosen": -236.76043701171875, "logps/rejected": -161.45748901367188, "loss": 0.6709, "rewards/accuracies": 0.625, "rewards/chosen": 0.004031464457511902, "rewards/margins": 0.19753924012184143, "rewards/rejected": -0.19350777566432953, "step": 4749 }, { "epoch": 0.7345834138797603, "grad_norm": 3.82041072845459, "learning_rate": 4.195211364417459e-06, "logits/chosen": 11.645238876342773, "logits/rejected": 8.816662788391113, "logps/chosen": -240.3917236328125, "logps/rejected": -207.9389190673828, "loss": 0.5737, "rewards/accuracies": 0.75, "rewards/chosen": 0.21743574738502502, "rewards/margins": 0.31267639994621277, "rewards/rejected": -0.09524065256118774, "step": 4750 }, { "epoch": 0.7347380630195245, "grad_norm": 5.4435505867004395, "learning_rate": 4.194924962767785e-06, "logits/chosen": 11.81915283203125, "logits/rejected": 16.080331802368164, "logps/chosen": -252.87496948242188, "logps/rejected": -351.0946960449219, "loss": 0.6349, "rewards/accuracies": 0.625, "rewards/chosen": -0.11683613061904907, "rewards/margins": 0.2595912516117096, "rewards/rejected": -0.37642738223075867, "step": 4751 }, { "epoch": 0.7348927121592886, "grad_norm": 6.407220363616943, "learning_rate": 4.194638561118112e-06, "logits/chosen": 11.030671119689941, "logits/rejected": 10.792478561401367, "logps/chosen": -316.3838195800781, "logps/rejected": -304.4842529296875, "loss": 0.7744, "rewards/accuracies": 0.375, "rewards/chosen": -0.02871304750442505, "rewards/margins": -0.08277063071727753, "rewards/rejected": 0.05405759811401367, "step": 4752 }, { "epoch": 0.7350473612990528, "grad_norm": 6.015807151794434, "learning_rate": 4.194352159468439e-06, "logits/chosen": 12.402322769165039, "logits/rejected": 11.337557792663574, "logps/chosen": -375.75665283203125, "logps/rejected": -358.983154296875, "loss": 0.5339, "rewards/accuracies": 0.75, "rewards/chosen": 0.5309127569198608, "rewards/margins": 0.4515155553817749, "rewards/rejected": 0.07939721643924713, "step": 4753 }, { "epoch": 0.7352020104388169, "grad_norm": 6.017291069030762, "learning_rate": 4.194065757818765e-06, "logits/chosen": 8.792664527893066, "logits/rejected": 6.598265171051025, "logps/chosen": -336.721923828125, "logps/rejected": -236.42062377929688, "loss": 0.6941, "rewards/accuracies": 0.5, "rewards/chosen": 0.33871591091156006, "rewards/margins": 0.2760820686817169, "rewards/rejected": 0.06263384222984314, "step": 4754 }, { "epoch": 0.7353566595785811, "grad_norm": 11.23551082611084, "learning_rate": 4.193779356169092e-06, "logits/chosen": 13.905418395996094, "logits/rejected": 12.497049331665039, "logps/chosen": -300.74884033203125, "logps/rejected": -259.16766357421875, "loss": 1.0682, "rewards/accuracies": 0.125, "rewards/chosen": 0.03809165954589844, "rewards/margins": -0.5893455743789673, "rewards/rejected": 0.6274372339248657, "step": 4755 }, { "epoch": 0.7355113087183452, "grad_norm": 5.389484405517578, "learning_rate": 4.193492954519419e-06, "logits/chosen": 13.51899528503418, "logits/rejected": 12.981062889099121, "logps/chosen": -326.459228515625, "logps/rejected": -315.577392578125, "loss": 0.6013, "rewards/accuracies": 0.75, "rewards/chosen": 0.4855082035064697, "rewards/margins": 0.24323587119579315, "rewards/rejected": 0.242272287607193, "step": 4756 }, { "epoch": 0.7356659578581094, "grad_norm": 6.513172626495361, "learning_rate": 4.1932065528697444e-06, "logits/chosen": 9.911660194396973, "logits/rejected": 8.243659973144531, "logps/chosen": -306.8182373046875, "logps/rejected": -256.8122863769531, "loss": 0.7903, "rewards/accuracies": 0.5, "rewards/chosen": 0.008855774998664856, "rewards/margins": -0.12659047544002533, "rewards/rejected": 0.13544625043869019, "step": 4757 }, { "epoch": 0.7358206069978735, "grad_norm": 60.23966598510742, "learning_rate": 4.192920151220071e-06, "logits/chosen": 8.791994094848633, "logits/rejected": 8.53953742980957, "logps/chosen": -418.1119384765625, "logps/rejected": -295.19207763671875, "loss": 0.8701, "rewards/accuracies": 0.25, "rewards/chosen": -0.4879385828971863, "rewards/margins": -0.28741228580474854, "rewards/rejected": -0.20052632689476013, "step": 4758 }, { "epoch": 0.7359752561376377, "grad_norm": 6.296421051025391, "learning_rate": 4.192633749570398e-06, "logits/chosen": 14.486000061035156, "logits/rejected": 10.41128158569336, "logps/chosen": -353.5166931152344, "logps/rejected": -281.64422607421875, "loss": 0.7174, "rewards/accuracies": 0.625, "rewards/chosen": 0.29859182238578796, "rewards/margins": 0.08769933879375458, "rewards/rejected": 0.2108924835920334, "step": 4759 }, { "epoch": 0.7361299052774019, "grad_norm": 5.47121000289917, "learning_rate": 4.192347347920724e-06, "logits/chosen": 9.593307495117188, "logits/rejected": 10.236595153808594, "logps/chosen": -160.1764373779297, "logps/rejected": -201.8399658203125, "loss": 0.7197, "rewards/accuracies": 0.625, "rewards/chosen": -0.28673186898231506, "rewards/margins": 0.010794736444950104, "rewards/rejected": -0.29752659797668457, "step": 4760 }, { "epoch": 0.736284554417166, "grad_norm": 4.923763275146484, "learning_rate": 4.192060946271051e-06, "logits/chosen": 11.913820266723633, "logits/rejected": 7.56839656829834, "logps/chosen": -241.36947631835938, "logps/rejected": -205.43011474609375, "loss": 0.7003, "rewards/accuracies": 0.5, "rewards/chosen": -0.09711894392967224, "rewards/margins": 0.06295973062515259, "rewards/rejected": -0.16007867455482483, "step": 4761 }, { "epoch": 0.7364392035569303, "grad_norm": 5.6501688957214355, "learning_rate": 4.191774544621378e-06, "logits/chosen": 7.237883567810059, "logits/rejected": 10.479668617248535, "logps/chosen": -323.89849853515625, "logps/rejected": -312.208984375, "loss": 0.8207, "rewards/accuracies": 0.375, "rewards/chosen": 0.18529434502124786, "rewards/margins": -0.1707799881696701, "rewards/rejected": 0.35607433319091797, "step": 4762 }, { "epoch": 0.7365938526966944, "grad_norm": 6.281984329223633, "learning_rate": 4.1914881429717035e-06, "logits/chosen": 13.688703536987305, "logits/rejected": 8.88680362701416, "logps/chosen": -439.0293273925781, "logps/rejected": -350.8081359863281, "loss": 0.7373, "rewards/accuracies": 0.5, "rewards/chosen": 0.26570481061935425, "rewards/margins": 0.019310586154460907, "rewards/rejected": 0.24639426171779633, "step": 4763 }, { "epoch": 0.7367485018364586, "grad_norm": 6.75985860824585, "learning_rate": 4.19120174132203e-06, "logits/chosen": 11.57260513305664, "logits/rejected": 12.854288101196289, "logps/chosen": -281.9943542480469, "logps/rejected": -294.39874267578125, "loss": 0.7733, "rewards/accuracies": 0.625, "rewards/chosen": 0.21525385975837708, "rewards/margins": 0.18043500185012817, "rewards/rejected": 0.0348188579082489, "step": 4764 }, { "epoch": 0.7369031509762227, "grad_norm": 6.307370662689209, "learning_rate": 4.190915339672357e-06, "logits/chosen": 11.254785537719727, "logits/rejected": 6.444768905639648, "logps/chosen": -269.56109619140625, "logps/rejected": -239.7490234375, "loss": 0.776, "rewards/accuracies": 0.375, "rewards/chosen": 0.33809754252433777, "rewards/margins": -0.055138155817985535, "rewards/rejected": 0.3932356834411621, "step": 4765 }, { "epoch": 0.7370578001159869, "grad_norm": 4.317245960235596, "learning_rate": 4.1906289380226834e-06, "logits/chosen": 7.762613773345947, "logits/rejected": 4.897261619567871, "logps/chosen": -210.5369110107422, "logps/rejected": -209.80514526367188, "loss": 0.5353, "rewards/accuracies": 0.625, "rewards/chosen": 0.011759907007217407, "rewards/margins": 0.45884081721305847, "rewards/rejected": -0.44708094000816345, "step": 4766 }, { "epoch": 0.737212449255751, "grad_norm": 7.767302989959717, "learning_rate": 4.19034253637301e-06, "logits/chosen": 12.339812278747559, "logits/rejected": 10.903360366821289, "logps/chosen": -319.3306884765625, "logps/rejected": -311.0950622558594, "loss": 0.5433, "rewards/accuracies": 0.75, "rewards/chosen": 0.4628310203552246, "rewards/margins": 0.5159217119216919, "rewards/rejected": -0.053090650588274, "step": 4767 }, { "epoch": 0.7373670983955152, "grad_norm": 5.3224406242370605, "learning_rate": 4.190056134723337e-06, "logits/chosen": 14.429060935974121, "logits/rejected": 14.173766136169434, "logps/chosen": -252.89291381835938, "logps/rejected": -274.78765869140625, "loss": 0.6876, "rewards/accuracies": 0.625, "rewards/chosen": -0.26096177101135254, "rewards/margins": 0.19047455489635468, "rewards/rejected": -0.4514363408088684, "step": 4768 }, { "epoch": 0.7375217475352793, "grad_norm": 5.439070701599121, "learning_rate": 4.189769733073663e-06, "logits/chosen": 11.795591354370117, "logits/rejected": 8.82680892944336, "logps/chosen": -266.1576232910156, "logps/rejected": -217.710205078125, "loss": 0.5542, "rewards/accuracies": 0.625, "rewards/chosen": 0.3184768855571747, "rewards/margins": 0.4833378195762634, "rewards/rejected": -0.16486096382141113, "step": 4769 }, { "epoch": 0.7376763966750435, "grad_norm": 4.673933029174805, "learning_rate": 4.189483331423989e-06, "logits/chosen": 12.298160552978516, "logits/rejected": 4.383668422698975, "logps/chosen": -269.30419921875, "logps/rejected": -130.8526153564453, "loss": 0.5515, "rewards/accuracies": 0.875, "rewards/chosen": 0.3152461647987366, "rewards/margins": 0.5219026803970337, "rewards/rejected": -0.20665651559829712, "step": 4770 }, { "epoch": 0.7378310458148076, "grad_norm": 4.872597694396973, "learning_rate": 4.189196929774316e-06, "logits/chosen": 9.257883071899414, "logits/rejected": 5.580127239227295, "logps/chosen": -308.6827392578125, "logps/rejected": -271.2395935058594, "loss": 0.5661, "rewards/accuracies": 0.75, "rewards/chosen": -0.05088486894965172, "rewards/margins": 0.5715640783309937, "rewards/rejected": -0.6224489808082581, "step": 4771 }, { "epoch": 0.7379856949545718, "grad_norm": 4.062547206878662, "learning_rate": 4.1889105281246425e-06, "logits/chosen": 13.11014461517334, "logits/rejected": 10.769596099853516, "logps/chosen": -329.4433898925781, "logps/rejected": -322.385986328125, "loss": 0.5676, "rewards/accuracies": 0.5, "rewards/chosen": 0.6099538207054138, "rewards/margins": 0.4774438142776489, "rewards/rejected": 0.1325100064277649, "step": 4772 }, { "epoch": 0.7381403440943359, "grad_norm": 7.330388069152832, "learning_rate": 4.188624126474969e-06, "logits/chosen": 11.050324440002441, "logits/rejected": 9.256900787353516, "logps/chosen": -226.64767456054688, "logps/rejected": -219.11105346679688, "loss": 0.847, "rewards/accuracies": 0.25, "rewards/chosen": 0.08482885360717773, "rewards/margins": -0.24830521643161774, "rewards/rejected": 0.3331340551376343, "step": 4773 }, { "epoch": 0.7382949932341001, "grad_norm": 8.037586212158203, "learning_rate": 4.188337724825296e-06, "logits/chosen": 16.929351806640625, "logits/rejected": 12.234892845153809, "logps/chosen": -501.61651611328125, "logps/rejected": -454.21514892578125, "loss": 0.7294, "rewards/accuracies": 0.5, "rewards/chosen": 0.52154541015625, "rewards/margins": 0.15335141122341156, "rewards/rejected": 0.368194043636322, "step": 4774 }, { "epoch": 0.7384496423738643, "grad_norm": 4.544396877288818, "learning_rate": 4.1880513231756224e-06, "logits/chosen": 8.279939651489258, "logits/rejected": 3.458927631378174, "logps/chosen": -303.1224060058594, "logps/rejected": -239.17803955078125, "loss": 0.5887, "rewards/accuracies": 0.625, "rewards/chosen": 0.03961247205734253, "rewards/margins": 0.48178744316101074, "rewards/rejected": -0.4421750009059906, "step": 4775 }, { "epoch": 0.7386042915136285, "grad_norm": 8.042491912841797, "learning_rate": 4.187764921525948e-06, "logits/chosen": 11.185622215270996, "logits/rejected": 9.732324600219727, "logps/chosen": -329.8443298339844, "logps/rejected": -279.06817626953125, "loss": 0.7629, "rewards/accuracies": 0.375, "rewards/chosen": -0.3160988688468933, "rewards/margins": -0.024421006441116333, "rewards/rejected": -0.29167789220809937, "step": 4776 }, { "epoch": 0.7387589406533926, "grad_norm": 4.627344608306885, "learning_rate": 4.187478519876275e-06, "logits/chosen": 8.658655166625977, "logits/rejected": 11.560956001281738, "logps/chosen": -211.19076538085938, "logps/rejected": -190.307373046875, "loss": 0.6018, "rewards/accuracies": 0.625, "rewards/chosen": 0.0053098164498806, "rewards/margins": 0.2644790709018707, "rewards/rejected": -0.25916922092437744, "step": 4777 }, { "epoch": 0.7389135897931568, "grad_norm": 4.292059898376465, "learning_rate": 4.1871921182266015e-06, "logits/chosen": 8.275537490844727, "logits/rejected": 3.691534996032715, "logps/chosen": -227.5767059326172, "logps/rejected": -151.46847534179688, "loss": 0.579, "rewards/accuracies": 0.625, "rewards/chosen": 0.25836095213890076, "rewards/margins": 0.37570351362228394, "rewards/rejected": -0.11734256148338318, "step": 4778 }, { "epoch": 0.739068238932921, "grad_norm": 5.199387550354004, "learning_rate": 4.186905716576928e-06, "logits/chosen": 12.26113510131836, "logits/rejected": 10.05549144744873, "logps/chosen": -325.6607971191406, "logps/rejected": -268.3236083984375, "loss": 0.5816, "rewards/accuracies": 0.75, "rewards/chosen": 0.3688335120677948, "rewards/margins": 0.30262085795402527, "rewards/rejected": 0.06621265411376953, "step": 4779 }, { "epoch": 0.7392228880726851, "grad_norm": 9.9437255859375, "learning_rate": 4.186619314927254e-06, "logits/chosen": 17.158878326416016, "logits/rejected": 16.86864471435547, "logps/chosen": -383.1300048828125, "logps/rejected": -381.4898681640625, "loss": 0.7604, "rewards/accuracies": 0.5, "rewards/chosen": 0.0322418250143528, "rewards/margins": -0.07112255692481995, "rewards/rejected": 0.10336437821388245, "step": 4780 }, { "epoch": 0.7393775372124493, "grad_norm": 4.513852596282959, "learning_rate": 4.186332913277581e-06, "logits/chosen": 10.36776065826416, "logits/rejected": 7.879003524780273, "logps/chosen": -179.33912658691406, "logps/rejected": -158.83901977539062, "loss": 0.7182, "rewards/accuracies": 0.625, "rewards/chosen": 0.020983316004276276, "rewards/margins": 0.04471863806247711, "rewards/rejected": -0.02373531460762024, "step": 4781 }, { "epoch": 0.7395321863522134, "grad_norm": 4.013879776000977, "learning_rate": 4.186046511627907e-06, "logits/chosen": 13.298583984375, "logits/rejected": 7.70428991317749, "logps/chosen": -355.2281494140625, "logps/rejected": -263.7358703613281, "loss": 0.4415, "rewards/accuracies": 0.875, "rewards/chosen": 0.35276365280151367, "rewards/margins": 0.7590234279632568, "rewards/rejected": -0.40625983476638794, "step": 4782 }, { "epoch": 0.7396868354919776, "grad_norm": 5.810451507568359, "learning_rate": 4.185760109978234e-06, "logits/chosen": 10.22127914428711, "logits/rejected": 12.51368522644043, "logps/chosen": -261.9689025878906, "logps/rejected": -249.46876525878906, "loss": 0.6827, "rewards/accuracies": 0.625, "rewards/chosen": 0.3779284656047821, "rewards/margins": 0.03404693305492401, "rewards/rejected": 0.3438815474510193, "step": 4783 }, { "epoch": 0.7398414846317417, "grad_norm": 5.822102069854736, "learning_rate": 4.18547370832856e-06, "logits/chosen": 8.113950729370117, "logits/rejected": 7.097465515136719, "logps/chosen": -358.13531494140625, "logps/rejected": -295.42572021484375, "loss": 0.6369, "rewards/accuracies": 0.625, "rewards/chosen": -0.02648501843214035, "rewards/margins": 0.7722447514533997, "rewards/rejected": -0.7987297177314758, "step": 4784 }, { "epoch": 0.7399961337715059, "grad_norm": 5.558611869812012, "learning_rate": 4.185187306678886e-06, "logits/chosen": 10.657334327697754, "logits/rejected": 9.81453800201416, "logps/chosen": -336.66241455078125, "logps/rejected": -271.5, "loss": 0.6709, "rewards/accuracies": 0.625, "rewards/chosen": -0.0750436782836914, "rewards/margins": 0.139068603515625, "rewards/rejected": -0.2141122817993164, "step": 4785 }, { "epoch": 0.74015078291127, "grad_norm": 3.9321529865264893, "learning_rate": 4.184900905029213e-06, "logits/chosen": 7.636336803436279, "logits/rejected": 9.096763610839844, "logps/chosen": -207.8610382080078, "logps/rejected": -231.0572509765625, "loss": 0.4856, "rewards/accuracies": 0.75, "rewards/chosen": 0.13540057837963104, "rewards/margins": 0.6203001141548157, "rewards/rejected": -0.4848995804786682, "step": 4786 }, { "epoch": 0.7403054320510342, "grad_norm": 5.173822402954102, "learning_rate": 4.18461450337954e-06, "logits/chosen": 7.526389122009277, "logits/rejected": 1.7991468906402588, "logps/chosen": -375.69525146484375, "logps/rejected": -292.9147644042969, "loss": 0.5748, "rewards/accuracies": 0.75, "rewards/chosen": 0.32168281078338623, "rewards/margins": 0.29248085618019104, "rewards/rejected": 0.0292019285261631, "step": 4787 }, { "epoch": 0.7404600811907984, "grad_norm": 10.130017280578613, "learning_rate": 4.184328101729866e-06, "logits/chosen": 5.063675880432129, "logits/rejected": 9.111383438110352, "logps/chosen": -440.70758056640625, "logps/rejected": -425.63262939453125, "loss": 0.8439, "rewards/accuracies": 0.375, "rewards/chosen": 0.05717173591256142, "rewards/margins": -0.10336827486753464, "rewards/rejected": 0.16054001450538635, "step": 4788 }, { "epoch": 0.7406147303305626, "grad_norm": 6.925905227661133, "learning_rate": 4.184041700080193e-06, "logits/chosen": 15.306264877319336, "logits/rejected": 8.550630569458008, "logps/chosen": -447.69024658203125, "logps/rejected": -315.3685302734375, "loss": 0.6885, "rewards/accuracies": 0.625, "rewards/chosen": 0.3664945662021637, "rewards/margins": 0.13312828540802002, "rewards/rejected": 0.23336631059646606, "step": 4789 }, { "epoch": 0.7407693794703267, "grad_norm": 19.495683670043945, "learning_rate": 4.183755298430519e-06, "logits/chosen": 13.757787704467773, "logits/rejected": 2.7380151748657227, "logps/chosen": -394.5092468261719, "logps/rejected": -162.16622924804688, "loss": 0.7054, "rewards/accuracies": 0.5, "rewards/chosen": 0.35929790139198303, "rewards/margins": 0.0788784995675087, "rewards/rejected": 0.28041940927505493, "step": 4790 }, { "epoch": 0.7409240286100909, "grad_norm": 3.8974475860595703, "learning_rate": 4.1834688967808455e-06, "logits/chosen": 12.83820629119873, "logits/rejected": 6.434313774108887, "logps/chosen": -326.9476318359375, "logps/rejected": -289.3776550292969, "loss": 0.5284, "rewards/accuracies": 0.75, "rewards/chosen": 0.583159327507019, "rewards/margins": 0.5215884447097778, "rewards/rejected": 0.06157088279724121, "step": 4791 }, { "epoch": 0.741078677749855, "grad_norm": 4.9556193351745605, "learning_rate": 4.183182495131172e-06, "logits/chosen": 10.742406845092773, "logits/rejected": 8.162736892700195, "logps/chosen": -273.11474609375, "logps/rejected": -232.29864501953125, "loss": 0.5548, "rewards/accuracies": 0.625, "rewards/chosen": 0.23834289610385895, "rewards/margins": 0.4092642664909363, "rewards/rejected": -0.17092138528823853, "step": 4792 }, { "epoch": 0.7412333268896192, "grad_norm": 5.485816478729248, "learning_rate": 4.182896093481499e-06, "logits/chosen": 5.3336076736450195, "logits/rejected": 11.637184143066406, "logps/chosen": -161.272705078125, "logps/rejected": -269.3109130859375, "loss": 0.6561, "rewards/accuracies": 0.75, "rewards/chosen": 0.06546381115913391, "rewards/margins": 0.2819777727127075, "rewards/rejected": -0.2165139615535736, "step": 4793 }, { "epoch": 0.7413879760293833, "grad_norm": 4.629825115203857, "learning_rate": 4.182609691831825e-06, "logits/chosen": 13.952739715576172, "logits/rejected": 9.817672729492188, "logps/chosen": -304.21795654296875, "logps/rejected": -265.7994384765625, "loss": 0.5055, "rewards/accuracies": 0.875, "rewards/chosen": 0.8277340531349182, "rewards/margins": 0.5605777502059937, "rewards/rejected": 0.2671562135219574, "step": 4794 }, { "epoch": 0.7415426251691475, "grad_norm": 3.8976757526397705, "learning_rate": 4.182323290182152e-06, "logits/chosen": 9.156255722045898, "logits/rejected": 2.176840305328369, "logps/chosen": -236.00279235839844, "logps/rejected": -153.2358856201172, "loss": 0.6101, "rewards/accuracies": 0.75, "rewards/chosen": 0.3663691282272339, "rewards/margins": 0.2540814280509949, "rewards/rejected": 0.11228771507740021, "step": 4795 }, { "epoch": 0.7416972743089116, "grad_norm": 3.306509017944336, "learning_rate": 4.182036888532478e-06, "logits/chosen": 6.861904621124268, "logits/rejected": 6.354181289672852, "logps/chosen": -117.49771881103516, "logps/rejected": -133.2139434814453, "loss": 0.6124, "rewards/accuracies": 0.75, "rewards/chosen": 0.14508453011512756, "rewards/margins": 0.18305107951164246, "rewards/rejected": -0.037966538220644, "step": 4796 }, { "epoch": 0.7418519234486758, "grad_norm": 6.028225421905518, "learning_rate": 4.1817504868828045e-06, "logits/chosen": 13.706073760986328, "logits/rejected": 12.922555923461914, "logps/chosen": -371.95849609375, "logps/rejected": -356.96209716796875, "loss": 0.6777, "rewards/accuracies": 0.5, "rewards/chosen": -0.14072629809379578, "rewards/margins": 0.14261507987976074, "rewards/rejected": -0.28334131836891174, "step": 4797 }, { "epoch": 0.7420065725884399, "grad_norm": 6.3322343826293945, "learning_rate": 4.181464085233131e-06, "logits/chosen": 11.921670913696289, "logits/rejected": 6.583761215209961, "logps/chosen": -435.3843078613281, "logps/rejected": -408.2420349121094, "loss": 0.6996, "rewards/accuracies": 0.375, "rewards/chosen": 0.6460952162742615, "rewards/margins": 0.06955307722091675, "rewards/rejected": 0.5765421986579895, "step": 4798 }, { "epoch": 0.7421612217282041, "grad_norm": 5.877707481384277, "learning_rate": 4.181177683583458e-06, "logits/chosen": 8.08895492553711, "logits/rejected": 6.9810791015625, "logps/chosen": -216.67037963867188, "logps/rejected": -233.91574096679688, "loss": 0.7302, "rewards/accuracies": 0.5, "rewards/chosen": 0.2697771191596985, "rewards/margins": -0.04122409597039223, "rewards/rejected": 0.3110012114048004, "step": 4799 }, { "epoch": 0.7423158708679682, "grad_norm": 3.527134656906128, "learning_rate": 4.1808912819337845e-06, "logits/chosen": 8.34594440460205, "logits/rejected": 5.2310309410095215, "logps/chosen": -162.30355834960938, "logps/rejected": -170.98753356933594, "loss": 0.5852, "rewards/accuracies": 0.75, "rewards/chosen": 0.6474559903144836, "rewards/margins": 0.2936580777168274, "rewards/rejected": 0.353797972202301, "step": 4800 }, { "epoch": 0.7424705200077325, "grad_norm": 4.963498592376709, "learning_rate": 4.180604880284111e-06, "logits/chosen": 11.898832321166992, "logits/rejected": 10.114301681518555, "logps/chosen": -314.96209716796875, "logps/rejected": -280.0485534667969, "loss": 0.5025, "rewards/accuracies": 0.75, "rewards/chosen": 0.5252709984779358, "rewards/margins": 0.5472241044044495, "rewards/rejected": -0.021953091025352478, "step": 4801 }, { "epoch": 0.7426251691474967, "grad_norm": 5.899979114532471, "learning_rate": 4.180318478634438e-06, "logits/chosen": 13.386738777160645, "logits/rejected": 12.98686695098877, "logps/chosen": -277.8770751953125, "logps/rejected": -260.10968017578125, "loss": 0.7403, "rewards/accuracies": 0.5, "rewards/chosen": 0.2905813455581665, "rewards/margins": -0.00035734474658966064, "rewards/rejected": 0.29093867540359497, "step": 4802 }, { "epoch": 0.7427798182872608, "grad_norm": 4.862941265106201, "learning_rate": 4.1800320769847636e-06, "logits/chosen": 13.305197715759277, "logits/rejected": 10.171258926391602, "logps/chosen": -203.8580780029297, "logps/rejected": -215.6347198486328, "loss": 0.5774, "rewards/accuracies": 0.625, "rewards/chosen": 0.10327376425266266, "rewards/margins": 0.3759494125843048, "rewards/rejected": -0.27267563343048096, "step": 4803 }, { "epoch": 0.742934467427025, "grad_norm": 4.671733856201172, "learning_rate": 4.17974567533509e-06, "logits/chosen": 10.614520072937012, "logits/rejected": 12.669438362121582, "logps/chosen": -211.83184814453125, "logps/rejected": -282.8984375, "loss": 0.6863, "rewards/accuracies": 0.5, "rewards/chosen": 0.23597446084022522, "rewards/margins": 0.07573352754116058, "rewards/rejected": 0.16024094820022583, "step": 4804 }, { "epoch": 0.7430891165667891, "grad_norm": 5.944483757019043, "learning_rate": 4.179459273685417e-06, "logits/chosen": 9.159265518188477, "logits/rejected": 10.275708198547363, "logps/chosen": -225.89813232421875, "logps/rejected": -230.29693603515625, "loss": 0.7234, "rewards/accuracies": 0.375, "rewards/chosen": 0.0193081796169281, "rewards/margins": 0.05501945689320564, "rewards/rejected": -0.035711269825696945, "step": 4805 }, { "epoch": 0.7432437657065533, "grad_norm": 6.932787895202637, "learning_rate": 4.1791728720357435e-06, "logits/chosen": 11.603013038635254, "logits/rejected": 5.67227029800415, "logps/chosen": -236.5516815185547, "logps/rejected": -243.91864013671875, "loss": 0.7817, "rewards/accuracies": 0.375, "rewards/chosen": -0.384068101644516, "rewards/margins": 0.005043424665927887, "rewards/rejected": -0.3891115188598633, "step": 4806 }, { "epoch": 0.7433984148463174, "grad_norm": 5.3307061195373535, "learning_rate": 4.17888647038607e-06, "logits/chosen": 9.669853210449219, "logits/rejected": 6.428501605987549, "logps/chosen": -289.4296569824219, "logps/rejected": -262.89837646484375, "loss": 0.5536, "rewards/accuracies": 0.875, "rewards/chosen": 0.5136330723762512, "rewards/margins": 0.3940231204032898, "rewards/rejected": 0.11960999667644501, "step": 4807 }, { "epoch": 0.7435530639860816, "grad_norm": 6.641125202178955, "learning_rate": 4.178600068736397e-06, "logits/chosen": 8.668749809265137, "logits/rejected": 6.366659164428711, "logps/chosen": -221.5707244873047, "logps/rejected": -266.85565185546875, "loss": 0.8163, "rewards/accuracies": 0.5, "rewards/chosen": -0.02597782015800476, "rewards/margins": -0.13528549671173096, "rewards/rejected": 0.1093076765537262, "step": 4808 }, { "epoch": 0.7437077131258457, "grad_norm": 6.800802230834961, "learning_rate": 4.178313667086723e-06, "logits/chosen": 15.106536865234375, "logits/rejected": 7.903476715087891, "logps/chosen": -354.17694091796875, "logps/rejected": -264.623046875, "loss": 0.6969, "rewards/accuracies": 0.5, "rewards/chosen": 0.343596875667572, "rewards/margins": 0.26012516021728516, "rewards/rejected": 0.08347168564796448, "step": 4809 }, { "epoch": 0.7438623622656099, "grad_norm": 4.17566442489624, "learning_rate": 4.178027265437049e-06, "logits/chosen": 12.727313995361328, "logits/rejected": 3.4485666751861572, "logps/chosen": -314.00048828125, "logps/rejected": -192.40771484375, "loss": 0.5045, "rewards/accuracies": 0.875, "rewards/chosen": 0.3906874656677246, "rewards/margins": 0.4727931320667267, "rewards/rejected": -0.08210568130016327, "step": 4810 }, { "epoch": 0.744017011405374, "grad_norm": 5.741244792938232, "learning_rate": 4.177740863787376e-06, "logits/chosen": 8.71609115600586, "logits/rejected": 10.898783683776855, "logps/chosen": -184.07550048828125, "logps/rejected": -218.75607299804688, "loss": 0.6761, "rewards/accuracies": 0.625, "rewards/chosen": -0.16181382536888123, "rewards/margins": 0.14146538078784943, "rewards/rejected": -0.30327919125556946, "step": 4811 }, { "epoch": 0.7441716605451382, "grad_norm": 4.6770710945129395, "learning_rate": 4.177454462137703e-06, "logits/chosen": 13.488677024841309, "logits/rejected": 12.424948692321777, "logps/chosen": -214.08004760742188, "logps/rejected": -195.35145568847656, "loss": 0.6578, "rewards/accuracies": 0.5, "rewards/chosen": -0.06640143692493439, "rewards/margins": 0.23079198598861694, "rewards/rejected": -0.2971934676170349, "step": 4812 }, { "epoch": 0.7443263096849024, "grad_norm": 10.468841552734375, "learning_rate": 4.177168060488029e-06, "logits/chosen": 9.1223726272583, "logits/rejected": 7.271476745605469, "logps/chosen": -333.78521728515625, "logps/rejected": -372.5546875, "loss": 0.8147, "rewards/accuracies": 0.375, "rewards/chosen": 0.023865804076194763, "rewards/margins": -0.09401731938123703, "rewards/rejected": 0.1178831234574318, "step": 4813 }, { "epoch": 0.7444809588246666, "grad_norm": 5.845936298370361, "learning_rate": 4.176881658838355e-06, "logits/chosen": 1.451494812965393, "logits/rejected": 2.8326592445373535, "logps/chosen": -233.25576782226562, "logps/rejected": -339.4666442871094, "loss": 0.7356, "rewards/accuracies": 0.375, "rewards/chosen": 0.10075896978378296, "rewards/margins": -0.02923470363020897, "rewards/rejected": 0.12999367713928223, "step": 4814 }, { "epoch": 0.7446356079644307, "grad_norm": 4.696289539337158, "learning_rate": 4.176595257188682e-06, "logits/chosen": 7.498291015625, "logits/rejected": 4.349799156188965, "logps/chosen": -220.40553283691406, "logps/rejected": -196.92861938476562, "loss": 0.6683, "rewards/accuracies": 0.625, "rewards/chosen": 0.22897286713123322, "rewards/margins": 0.07385443896055222, "rewards/rejected": 0.1551184207201004, "step": 4815 }, { "epoch": 0.7447902571041949, "grad_norm": 8.36641788482666, "learning_rate": 4.176308855539008e-06, "logits/chosen": 7.235175132751465, "logits/rejected": 0.3660110831260681, "logps/chosen": -352.1417541503906, "logps/rejected": -302.70416259765625, "loss": 0.5522, "rewards/accuracies": 0.75, "rewards/chosen": 0.1253022700548172, "rewards/margins": 0.3599855303764343, "rewards/rejected": -0.2346833050251007, "step": 4816 }, { "epoch": 0.744944906243959, "grad_norm": 5.706331253051758, "learning_rate": 4.176022453889335e-06, "logits/chosen": 10.604809761047363, "logits/rejected": 6.052546977996826, "logps/chosen": -284.2577209472656, "logps/rejected": -254.03941345214844, "loss": 0.5085, "rewards/accuracies": 0.75, "rewards/chosen": 0.559387743473053, "rewards/margins": 0.5088394284248352, "rewards/rejected": 0.050548337399959564, "step": 4817 }, { "epoch": 0.7450995553837232, "grad_norm": 6.503905296325684, "learning_rate": 4.175736052239661e-06, "logits/chosen": 13.16185188293457, "logits/rejected": -0.5226092338562012, "logps/chosen": -340.4718017578125, "logps/rejected": -172.41815185546875, "loss": 0.4691, "rewards/accuracies": 0.875, "rewards/chosen": 0.34280359745025635, "rewards/margins": 0.6189937591552734, "rewards/rejected": -0.2761901915073395, "step": 4818 }, { "epoch": 0.7452542045234873, "grad_norm": 4.531585693359375, "learning_rate": 4.1754496505899874e-06, "logits/chosen": 10.873838424682617, "logits/rejected": 3.611543655395508, "logps/chosen": -407.49169921875, "logps/rejected": -338.61065673828125, "loss": 0.5227, "rewards/accuracies": 0.75, "rewards/chosen": 0.6937202215194702, "rewards/margins": 0.5156177878379822, "rewards/rejected": 0.178102508187294, "step": 4819 }, { "epoch": 0.7454088536632515, "grad_norm": 7.156266689300537, "learning_rate": 4.175163248940314e-06, "logits/chosen": 6.630589485168457, "logits/rejected": 8.299389839172363, "logps/chosen": -245.29534912109375, "logps/rejected": -301.42254638671875, "loss": 0.7384, "rewards/accuracies": 0.5, "rewards/chosen": 0.2609957456588745, "rewards/margins": -0.009239532053470612, "rewards/rejected": 0.2702353000640869, "step": 4820 }, { "epoch": 0.7455635028030156, "grad_norm": 7.928884983062744, "learning_rate": 4.174876847290641e-06, "logits/chosen": 9.81922721862793, "logits/rejected": 11.316617965698242, "logps/chosen": -264.1991882324219, "logps/rejected": -226.83143615722656, "loss": 0.7047, "rewards/accuracies": 0.5, "rewards/chosen": 0.19673889875411987, "rewards/margins": 0.1209615170955658, "rewards/rejected": 0.07577738165855408, "step": 4821 }, { "epoch": 0.7457181519427798, "grad_norm": 3.063037395477295, "learning_rate": 4.174590445640967e-06, "logits/chosen": 7.547990798950195, "logits/rejected": 0.8254170417785645, "logps/chosen": -208.6787567138672, "logps/rejected": -159.78138732910156, "loss": 0.5017, "rewards/accuracies": 0.75, "rewards/chosen": 0.24274849891662598, "rewards/margins": 0.5459417700767517, "rewards/rejected": -0.30319327116012573, "step": 4822 }, { "epoch": 0.745872801082544, "grad_norm": 7.068705081939697, "learning_rate": 4.174304043991293e-06, "logits/chosen": 8.822977066040039, "logits/rejected": 7.628363609313965, "logps/chosen": -352.21978759765625, "logps/rejected": -293.8230895996094, "loss": 0.7331, "rewards/accuracies": 0.5, "rewards/chosen": 0.11144157499074936, "rewards/margins": -0.049187783151865005, "rewards/rejected": 0.16062936186790466, "step": 4823 }, { "epoch": 0.7460274502223081, "grad_norm": 4.903248310089111, "learning_rate": 4.17401764234162e-06, "logits/chosen": 13.733642578125, "logits/rejected": 8.761543273925781, "logps/chosen": -401.84228515625, "logps/rejected": -241.805908203125, "loss": 0.5121, "rewards/accuracies": 0.875, "rewards/chosen": 0.31087398529052734, "rewards/margins": 0.5189108848571777, "rewards/rejected": -0.208036869764328, "step": 4824 }, { "epoch": 0.7461820993620722, "grad_norm": 7.397091865539551, "learning_rate": 4.1737312406919465e-06, "logits/chosen": 6.712996959686279, "logits/rejected": 7.006630897521973, "logps/chosen": -339.0171203613281, "logps/rejected": -324.1947021484375, "loss": 0.9088, "rewards/accuracies": 0.125, "rewards/chosen": 0.20211324095726013, "rewards/margins": -0.37746018171310425, "rewards/rejected": 0.5795734524726868, "step": 4825 }, { "epoch": 0.7463367485018365, "grad_norm": 6.357495307922363, "learning_rate": 4.173444839042273e-06, "logits/chosen": 11.699857711791992, "logits/rejected": 6.020086288452148, "logps/chosen": -350.0740966796875, "logps/rejected": -228.97189331054688, "loss": 0.7088, "rewards/accuracies": 0.5, "rewards/chosen": 0.17709054052829742, "rewards/margins": 0.11982870101928711, "rewards/rejected": 0.05726185441017151, "step": 4826 }, { "epoch": 0.7464913976416007, "grad_norm": 5.133193016052246, "learning_rate": 4.1731584373926e-06, "logits/chosen": 5.878551959991455, "logits/rejected": 5.414133071899414, "logps/chosen": -207.69473266601562, "logps/rejected": -205.99545288085938, "loss": 0.5645, "rewards/accuracies": 0.5, "rewards/chosen": 0.13004469871520996, "rewards/margins": 0.6078673005104065, "rewards/rejected": -0.47782260179519653, "step": 4827 }, { "epoch": 0.7466460467813648, "grad_norm": 4.098268508911133, "learning_rate": 4.1728720357429264e-06, "logits/chosen": 15.177474975585938, "logits/rejected": 9.74886417388916, "logps/chosen": -346.4871520996094, "logps/rejected": -283.2125244140625, "loss": 0.4848, "rewards/accuracies": 0.875, "rewards/chosen": 0.8915232419967651, "rewards/margins": 0.6109054684638977, "rewards/rejected": 0.28061771392822266, "step": 4828 }, { "epoch": 0.746800695921129, "grad_norm": 3.595144510269165, "learning_rate": 4.172585634093252e-06, "logits/chosen": 15.834580421447754, "logits/rejected": 10.500226020812988, "logps/chosen": -216.09938049316406, "logps/rejected": -152.27227783203125, "loss": 0.535, "rewards/accuracies": 0.75, "rewards/chosen": 0.27926358580589294, "rewards/margins": 0.600127637386322, "rewards/rejected": -0.3208640515804291, "step": 4829 }, { "epoch": 0.7469553450608931, "grad_norm": 6.200868129730225, "learning_rate": 4.172299232443579e-06, "logits/chosen": 11.274935722351074, "logits/rejected": 13.317134857177734, "logps/chosen": -343.1924133300781, "logps/rejected": -367.15350341796875, "loss": 0.6022, "rewards/accuracies": 0.625, "rewards/chosen": 0.5902072191238403, "rewards/margins": 0.24841104447841644, "rewards/rejected": 0.3417961597442627, "step": 4830 }, { "epoch": 0.7471099942006573, "grad_norm": 4.361725807189941, "learning_rate": 4.1720128307939055e-06, "logits/chosen": 9.544358253479004, "logits/rejected": 3.976625442504883, "logps/chosen": -247.9324493408203, "logps/rejected": -150.41969299316406, "loss": 0.6071, "rewards/accuracies": 0.625, "rewards/chosen": -0.020336657762527466, "rewards/margins": 0.2519673705101013, "rewards/rejected": -0.2723039984703064, "step": 4831 }, { "epoch": 0.7472646433404214, "grad_norm": 6.279647350311279, "learning_rate": 4.171726429144232e-06, "logits/chosen": 9.54776382446289, "logits/rejected": 8.914756774902344, "logps/chosen": -292.3633117675781, "logps/rejected": -472.9407958984375, "loss": 0.5509, "rewards/accuracies": 0.75, "rewards/chosen": 0.3753019869327545, "rewards/margins": 0.43533727526664734, "rewards/rejected": -0.060035280883312225, "step": 4832 }, { "epoch": 0.7474192924801856, "grad_norm": 4.035058975219727, "learning_rate": 4.171440027494559e-06, "logits/chosen": 8.587701797485352, "logits/rejected": 6.294947624206543, "logps/chosen": -273.48870849609375, "logps/rejected": -292.9427795410156, "loss": 0.4349, "rewards/accuracies": 1.0, "rewards/chosen": 0.12033711373806, "rewards/margins": 0.6289465427398682, "rewards/rejected": -0.508609414100647, "step": 4833 }, { "epoch": 0.7475739416199497, "grad_norm": 5.26654577255249, "learning_rate": 4.1711536258448855e-06, "logits/chosen": 9.489533424377441, "logits/rejected": 4.669025897979736, "logps/chosen": -272.92901611328125, "logps/rejected": -265.3211669921875, "loss": 0.6321, "rewards/accuracies": 0.625, "rewards/chosen": 0.6449675559997559, "rewards/margins": 0.20125295221805573, "rewards/rejected": 0.44371461868286133, "step": 4834 }, { "epoch": 0.7477285907597139, "grad_norm": 3.6931159496307373, "learning_rate": 4.170867224195212e-06, "logits/chosen": 10.135820388793945, "logits/rejected": 6.047074317932129, "logps/chosen": -194.03077697753906, "logps/rejected": -157.4449462890625, "loss": 0.6093, "rewards/accuracies": 0.5, "rewards/chosen": 0.5226764678955078, "rewards/margins": 0.2899424731731415, "rewards/rejected": 0.23273403942584991, "step": 4835 }, { "epoch": 0.747883239899478, "grad_norm": 5.036673545837402, "learning_rate": 4.170580822545538e-06, "logits/chosen": 9.969597816467285, "logits/rejected": 7.037627696990967, "logps/chosen": -236.26939392089844, "logps/rejected": -289.58880615234375, "loss": 0.5056, "rewards/accuracies": 0.75, "rewards/chosen": 0.2306259274482727, "rewards/margins": 0.6020475029945374, "rewards/rejected": -0.37142154574394226, "step": 4836 }, { "epoch": 0.7480378890392422, "grad_norm": 6.106447219848633, "learning_rate": 4.170294420895865e-06, "logits/chosen": 16.301727294921875, "logits/rejected": 11.901161193847656, "logps/chosen": -326.8699645996094, "logps/rejected": -294.5316162109375, "loss": 0.7339, "rewards/accuracies": 0.625, "rewards/chosen": 0.6186148524284363, "rewards/margins": -0.006930194795131683, "rewards/rejected": 0.6255450248718262, "step": 4837 }, { "epoch": 0.7481925381790063, "grad_norm": 5.405377388000488, "learning_rate": 4.170008019246191e-06, "logits/chosen": 12.78651237487793, "logits/rejected": 10.472010612487793, "logps/chosen": -296.6016845703125, "logps/rejected": -242.8929443359375, "loss": 0.7105, "rewards/accuracies": 0.75, "rewards/chosen": 0.42188936471939087, "rewards/margins": 0.01715688221156597, "rewards/rejected": 0.40473246574401855, "step": 4838 }, { "epoch": 0.7483471873187706, "grad_norm": 6.251347541809082, "learning_rate": 4.169721617596518e-06, "logits/chosen": 12.433619499206543, "logits/rejected": 6.631613731384277, "logps/chosen": -364.5036315917969, "logps/rejected": -280.1646423339844, "loss": 0.743, "rewards/accuracies": 0.625, "rewards/chosen": 0.13661594688892365, "rewards/margins": 0.024664364755153656, "rewards/rejected": 0.11195158958435059, "step": 4839 }, { "epoch": 0.7485018364585347, "grad_norm": 4.574646949768066, "learning_rate": 4.1694352159468446e-06, "logits/chosen": 11.220207214355469, "logits/rejected": 3.9194297790527344, "logps/chosen": -220.86715698242188, "logps/rejected": -218.00772094726562, "loss": 0.6247, "rewards/accuracies": 0.75, "rewards/chosen": 0.20093420147895813, "rewards/margins": 0.4526696801185608, "rewards/rejected": -0.25173550844192505, "step": 4840 }, { "epoch": 0.7486564855982989, "grad_norm": 4.667858123779297, "learning_rate": 4.169148814297171e-06, "logits/chosen": 11.73930549621582, "logits/rejected": 13.379814147949219, "logps/chosen": -159.39662170410156, "logps/rejected": -231.64088439941406, "loss": 0.6245, "rewards/accuracies": 0.5, "rewards/chosen": 0.36498862504959106, "rewards/margins": 0.2093839943408966, "rewards/rejected": 0.15560461580753326, "step": 4841 }, { "epoch": 0.748811134738063, "grad_norm": 5.942371368408203, "learning_rate": 4.168862412647497e-06, "logits/chosen": 12.18507194519043, "logits/rejected": 9.20965576171875, "logps/chosen": -223.5433349609375, "logps/rejected": -231.08551025390625, "loss": 0.7216, "rewards/accuracies": 0.5, "rewards/chosen": -0.23112405836582184, "rewards/margins": 0.02223964035511017, "rewards/rejected": -0.253363698720932, "step": 4842 }, { "epoch": 0.7489657838778272, "grad_norm": 5.485408306121826, "learning_rate": 4.168576010997824e-06, "logits/chosen": 10.809326171875, "logits/rejected": 9.459688186645508, "logps/chosen": -314.85882568359375, "logps/rejected": -269.78179931640625, "loss": 0.5429, "rewards/accuracies": 0.75, "rewards/chosen": 0.2031661570072174, "rewards/margins": 0.38123080134391785, "rewards/rejected": -0.17806468904018402, "step": 4843 }, { "epoch": 0.7491204330175913, "grad_norm": 5.794851779937744, "learning_rate": 4.16828960934815e-06, "logits/chosen": 9.650164604187012, "logits/rejected": 8.985267639160156, "logps/chosen": -226.5543975830078, "logps/rejected": -186.87440490722656, "loss": 0.7557, "rewards/accuracies": 0.375, "rewards/chosen": 0.12609292566776276, "rewards/margins": -0.03848074749112129, "rewards/rejected": 0.16457365453243256, "step": 4844 }, { "epoch": 0.7492750821573555, "grad_norm": 3.718604564666748, "learning_rate": 4.168003207698477e-06, "logits/chosen": 5.795703411102295, "logits/rejected": 4.269153118133545, "logps/chosen": -262.18975830078125, "logps/rejected": -191.9886016845703, "loss": 0.5186, "rewards/accuracies": 0.625, "rewards/chosen": 0.16473212838172913, "rewards/margins": 0.9451456665992737, "rewards/rejected": -0.7804135084152222, "step": 4845 }, { "epoch": 0.7494297312971197, "grad_norm": 16.071382522583008, "learning_rate": 4.167716806048804e-06, "logits/chosen": 10.279264450073242, "logits/rejected": 7.207713603973389, "logps/chosen": -295.3358154296875, "logps/rejected": -282.45159912109375, "loss": 0.7848, "rewards/accuracies": 0.5, "rewards/chosen": -0.02912130206823349, "rewards/margins": 0.013496320694684982, "rewards/rejected": -0.04261759668588638, "step": 4846 }, { "epoch": 0.7495843804368838, "grad_norm": 5.455828666687012, "learning_rate": 4.167430404399129e-06, "logits/chosen": 10.817137718200684, "logits/rejected": 6.942222595214844, "logps/chosen": -298.0824890136719, "logps/rejected": -174.29147338867188, "loss": 0.4863, "rewards/accuracies": 0.75, "rewards/chosen": 0.3604160249233246, "rewards/margins": 0.6177363395690918, "rewards/rejected": -0.2573203146457672, "step": 4847 }, { "epoch": 0.749739029576648, "grad_norm": 4.1171488761901855, "learning_rate": 4.167144002749456e-06, "logits/chosen": 11.83259391784668, "logits/rejected": 10.357523918151855, "logps/chosen": -162.07606506347656, "logps/rejected": -166.74497985839844, "loss": 0.519, "rewards/accuracies": 0.75, "rewards/chosen": 0.39864879846572876, "rewards/margins": 0.4506915807723999, "rewards/rejected": -0.052042774856090546, "step": 4848 }, { "epoch": 0.7498936787164121, "grad_norm": 5.217754364013672, "learning_rate": 4.166857601099783e-06, "logits/chosen": 10.635445594787598, "logits/rejected": 9.232568740844727, "logps/chosen": -395.853759765625, "logps/rejected": -313.3909912109375, "loss": 0.4958, "rewards/accuracies": 0.875, "rewards/chosen": 0.6965374946594238, "rewards/margins": 0.5588996410369873, "rewards/rejected": 0.1376378983259201, "step": 4849 }, { "epoch": 0.7500483278561763, "grad_norm": 3.989359140396118, "learning_rate": 4.166571199450109e-06, "logits/chosen": 8.043920516967773, "logits/rejected": 7.287498474121094, "logps/chosen": -249.3497314453125, "logps/rejected": -227.15982055664062, "loss": 0.4563, "rewards/accuracies": 0.875, "rewards/chosen": 0.04019976034760475, "rewards/margins": 0.7730265855789185, "rewards/rejected": -0.732826828956604, "step": 4850 }, { "epoch": 0.7502029769959404, "grad_norm": 6.043792724609375, "learning_rate": 4.166284797800436e-06, "logits/chosen": 11.858987808227539, "logits/rejected": 8.757853507995605, "logps/chosen": -270.83612060546875, "logps/rejected": -207.2410125732422, "loss": 0.7772, "rewards/accuracies": 0.5, "rewards/chosen": 0.0764111578464508, "rewards/margins": 0.02477322518825531, "rewards/rejected": 0.05163794755935669, "step": 4851 }, { "epoch": 0.7503576261357047, "grad_norm": 5.700096130371094, "learning_rate": 4.165998396150762e-06, "logits/chosen": 7.036066055297852, "logits/rejected": 4.6242828369140625, "logps/chosen": -213.81365966796875, "logps/rejected": -245.6555633544922, "loss": 0.7766, "rewards/accuracies": 0.5, "rewards/chosen": -0.23110955953598022, "rewards/margins": 0.07529954612255096, "rewards/rejected": -0.30640915036201477, "step": 4852 }, { "epoch": 0.7505122752754688, "grad_norm": 4.637636661529541, "learning_rate": 4.1657119945010885e-06, "logits/chosen": 13.166892051696777, "logits/rejected": 10.624645233154297, "logps/chosen": -322.92352294921875, "logps/rejected": -270.46881103515625, "loss": 0.4958, "rewards/accuracies": 0.875, "rewards/chosen": 0.34987935423851013, "rewards/margins": 0.6453158259391785, "rewards/rejected": -0.2954365015029907, "step": 4853 }, { "epoch": 0.750666924415233, "grad_norm": 9.836350440979004, "learning_rate": 4.165425592851415e-06, "logits/chosen": 9.596490859985352, "logits/rejected": 10.918330192565918, "logps/chosen": -286.5840759277344, "logps/rejected": -240.1240692138672, "loss": 0.9357, "rewards/accuracies": 0.375, "rewards/chosen": 0.301012247800827, "rewards/margins": -0.24442550539970398, "rewards/rejected": 0.5454376935958862, "step": 4854 }, { "epoch": 0.7508215735549971, "grad_norm": 7.767127513885498, "learning_rate": 4.165139191201742e-06, "logits/chosen": 11.761005401611328, "logits/rejected": 6.494842529296875, "logps/chosen": -437.18896484375, "logps/rejected": -340.0881652832031, "loss": 0.8958, "rewards/accuracies": 0.375, "rewards/chosen": 0.29401540756225586, "rewards/margins": -0.11122598499059677, "rewards/rejected": 0.4052414298057556, "step": 4855 }, { "epoch": 0.7509762226947613, "grad_norm": 11.616569519042969, "learning_rate": 4.1648527895520676e-06, "logits/chosen": 10.883437156677246, "logits/rejected": 7.339916229248047, "logps/chosen": -434.20819091796875, "logps/rejected": -354.2981262207031, "loss": 0.9145, "rewards/accuracies": 0.25, "rewards/chosen": -0.17182910442352295, "rewards/margins": -0.322274774312973, "rewards/rejected": 0.15044564008712769, "step": 4856 }, { "epoch": 0.7511308718345254, "grad_norm": 4.164758682250977, "learning_rate": 4.164566387902394e-06, "logits/chosen": 13.085493087768555, "logits/rejected": 8.32006549835205, "logps/chosen": -255.8421630859375, "logps/rejected": -229.529296875, "loss": 0.5219, "rewards/accuracies": 0.875, "rewards/chosen": 0.3584100306034088, "rewards/margins": 0.478179931640625, "rewards/rejected": -0.11976990103721619, "step": 4857 }, { "epoch": 0.7512855209742896, "grad_norm": 5.33148717880249, "learning_rate": 4.164279986252721e-06, "logits/chosen": 11.501684188842773, "logits/rejected": 11.381439208984375, "logps/chosen": -183.98374938964844, "logps/rejected": -188.72727966308594, "loss": 0.8294, "rewards/accuracies": 0.375, "rewards/chosen": -0.09933124482631683, "rewards/margins": -0.2000461220741272, "rewards/rejected": 0.10071487724781036, "step": 4858 }, { "epoch": 0.7514401701140537, "grad_norm": 5.681978225708008, "learning_rate": 4.1639935846030475e-06, "logits/chosen": 8.126919746398926, "logits/rejected": 8.684080123901367, "logps/chosen": -308.3127136230469, "logps/rejected": -320.227294921875, "loss": 0.6212, "rewards/accuracies": 0.625, "rewards/chosen": 0.34130972623825073, "rewards/margins": 0.402667373418808, "rewards/rejected": -0.06135760247707367, "step": 4859 }, { "epoch": 0.7515948192538179, "grad_norm": 7.054001808166504, "learning_rate": 4.163707182953374e-06, "logits/chosen": 3.9391956329345703, "logits/rejected": 6.212490558624268, "logps/chosen": -265.00872802734375, "logps/rejected": -275.0775146484375, "loss": 0.8545, "rewards/accuracies": 0.5, "rewards/chosen": 0.037232063710689545, "rewards/margins": -0.17208650708198547, "rewards/rejected": 0.20931857824325562, "step": 4860 }, { "epoch": 0.751749468393582, "grad_norm": 5.3153862953186035, "learning_rate": 4.163420781303701e-06, "logits/chosen": 4.634160041809082, "logits/rejected": 5.175274848937988, "logps/chosen": -303.40240478515625, "logps/rejected": -213.849365234375, "loss": 0.6406, "rewards/accuracies": 0.625, "rewards/chosen": 0.4911089539527893, "rewards/margins": 0.17803442478179932, "rewards/rejected": 0.31307452917099, "step": 4861 }, { "epoch": 0.7519041175333462, "grad_norm": 5.403745651245117, "learning_rate": 4.163134379654027e-06, "logits/chosen": 11.666973114013672, "logits/rejected": 9.547372817993164, "logps/chosen": -263.4168395996094, "logps/rejected": -278.2130432128906, "loss": 0.5922, "rewards/accuracies": 0.5, "rewards/chosen": 0.6372042894363403, "rewards/margins": 0.48984840512275696, "rewards/rejected": 0.14735586941242218, "step": 4862 }, { "epoch": 0.7520587666731103, "grad_norm": 3.2964301109313965, "learning_rate": 4.162847978004353e-06, "logits/chosen": 10.638582229614258, "logits/rejected": -2.3800859451293945, "logps/chosen": -269.4823913574219, "logps/rejected": -146.95364379882812, "loss": 0.4316, "rewards/accuracies": 0.75, "rewards/chosen": 0.4664738178253174, "rewards/margins": 0.8757360577583313, "rewards/rejected": -0.4092622697353363, "step": 4863 }, { "epoch": 0.7522134158128745, "grad_norm": 94.45955657958984, "learning_rate": 4.16256157635468e-06, "logits/chosen": 8.801929473876953, "logits/rejected": 8.142520904541016, "logps/chosen": -333.4564208984375, "logps/rejected": -262.99566650390625, "loss": 0.8018, "rewards/accuracies": 0.25, "rewards/chosen": 0.27472934126853943, "rewards/margins": -0.17398786544799805, "rewards/rejected": 0.4487171769142151, "step": 4864 }, { "epoch": 0.7523680649526387, "grad_norm": 4.626436233520508, "learning_rate": 4.162275174705007e-06, "logits/chosen": 13.201923370361328, "logits/rejected": 9.123678207397461, "logps/chosen": -180.09149169921875, "logps/rejected": -172.35079956054688, "loss": 0.6479, "rewards/accuracies": 0.625, "rewards/chosen": 0.08253325521945953, "rewards/margins": 0.2728095054626465, "rewards/rejected": -0.19027625024318695, "step": 4865 }, { "epoch": 0.7525227140924029, "grad_norm": 4.145317077636719, "learning_rate": 4.161988773055333e-06, "logits/chosen": 10.445066452026367, "logits/rejected": 12.700387001037598, "logps/chosen": -182.2593994140625, "logps/rejected": -271.501220703125, "loss": 0.5114, "rewards/accuracies": 1.0, "rewards/chosen": 0.3474692106246948, "rewards/margins": 0.45338213443756104, "rewards/rejected": -0.10591289401054382, "step": 4866 }, { "epoch": 0.752677363232167, "grad_norm": 5.330689430236816, "learning_rate": 4.16170237140566e-06, "logits/chosen": 11.336567878723145, "logits/rejected": 7.443289279937744, "logps/chosen": -295.2197265625, "logps/rejected": -226.88449096679688, "loss": 0.6524, "rewards/accuracies": 0.625, "rewards/chosen": 0.4521089792251587, "rewards/margins": 0.15858136117458344, "rewards/rejected": 0.29352760314941406, "step": 4867 }, { "epoch": 0.7528320123719312, "grad_norm": 4.9363112449646, "learning_rate": 4.1614159697559865e-06, "logits/chosen": 5.8075032234191895, "logits/rejected": 2.988018751144409, "logps/chosen": -180.41346740722656, "logps/rejected": -247.77944946289062, "loss": 0.5332, "rewards/accuracies": 0.5, "rewards/chosen": 0.7195602059364319, "rewards/margins": 1.1486563682556152, "rewards/rejected": -0.4290962219238281, "step": 4868 }, { "epoch": 0.7529866615116954, "grad_norm": 4.186629772186279, "learning_rate": 4.161129568106312e-06, "logits/chosen": 9.417548179626465, "logits/rejected": 7.2234697341918945, "logps/chosen": -448.9929504394531, "logps/rejected": -399.49212646484375, "loss": 0.5829, "rewards/accuracies": 0.375, "rewards/chosen": 3.9507672786712646, "rewards/margins": 9.619233131408691, "rewards/rejected": -5.668464660644531, "step": 4869 }, { "epoch": 0.7531413106514595, "grad_norm": 6.474702835083008, "learning_rate": 4.160843166456639e-06, "logits/chosen": 8.779424667358398, "logits/rejected": 11.26167106628418, "logps/chosen": -249.2095947265625, "logps/rejected": -236.08193969726562, "loss": 0.8036, "rewards/accuracies": 0.375, "rewards/chosen": 0.14109763503074646, "rewards/margins": -0.14996615052223206, "rewards/rejected": 0.2910637855529785, "step": 4870 }, { "epoch": 0.7532959597912237, "grad_norm": 5.033507823944092, "learning_rate": 4.160556764806966e-06, "logits/chosen": 11.104290008544922, "logits/rejected": 12.713186264038086, "logps/chosen": -235.8220672607422, "logps/rejected": -264.89544677734375, "loss": 0.6415, "rewards/accuracies": 0.625, "rewards/chosen": 0.5147908926010132, "rewards/margins": 0.1764892041683197, "rewards/rejected": 0.33830171823501587, "step": 4871 }, { "epoch": 0.7534506089309878, "grad_norm": 4.59453010559082, "learning_rate": 4.160270363157292e-06, "logits/chosen": 8.122228622436523, "logits/rejected": 11.204286575317383, "logps/chosen": -242.2407684326172, "logps/rejected": -155.99050903320312, "loss": 0.685, "rewards/accuracies": 0.625, "rewards/chosen": 0.1730659306049347, "rewards/margins": 0.0686965212225914, "rewards/rejected": 0.10436940938234329, "step": 4872 }, { "epoch": 0.753605258070752, "grad_norm": 4.717090129852295, "learning_rate": 4.159983961507619e-06, "logits/chosen": 7.51284122467041, "logits/rejected": 6.261141300201416, "logps/chosen": -250.036376953125, "logps/rejected": -254.99609375, "loss": 0.5665, "rewards/accuracies": 0.75, "rewards/chosen": 0.42540812492370605, "rewards/margins": 0.387265145778656, "rewards/rejected": 0.03814297914505005, "step": 4873 }, { "epoch": 0.7537599072105161, "grad_norm": 4.785600662231445, "learning_rate": 4.159697559857946e-06, "logits/chosen": 13.60636043548584, "logits/rejected": 7.181684494018555, "logps/chosen": -343.3254699707031, "logps/rejected": -249.38449096679688, "loss": 0.53, "rewards/accuracies": 0.625, "rewards/chosen": 0.5707874298095703, "rewards/margins": 0.7022783756256104, "rewards/rejected": -0.13149090111255646, "step": 4874 }, { "epoch": 0.7539145563502803, "grad_norm": 4.169939994812012, "learning_rate": 4.159411158208271e-06, "logits/chosen": 11.354193687438965, "logits/rejected": 4.276512622833252, "logps/chosen": -234.9937744140625, "logps/rejected": -173.69639587402344, "loss": 0.5464, "rewards/accuracies": 0.5, "rewards/chosen": 0.23608791828155518, "rewards/margins": 0.7283867597579956, "rewards/rejected": -0.49229881167411804, "step": 4875 }, { "epoch": 0.7540692054900444, "grad_norm": 7.825748920440674, "learning_rate": 4.159124756558598e-06, "logits/chosen": 10.39487361907959, "logits/rejected": 2.73970890045166, "logps/chosen": -262.90191650390625, "logps/rejected": -191.63388061523438, "loss": 1.0615, "rewards/accuracies": 0.25, "rewards/chosen": 0.14341247081756592, "rewards/margins": -0.47041571140289307, "rewards/rejected": 0.613828182220459, "step": 4876 }, { "epoch": 0.7542238546298087, "grad_norm": 6.113379001617432, "learning_rate": 4.158838354908925e-06, "logits/chosen": 10.946430206298828, "logits/rejected": 8.744179725646973, "logps/chosen": -334.80487060546875, "logps/rejected": -305.72503662109375, "loss": 0.7291, "rewards/accuracies": 0.375, "rewards/chosen": 0.5106526017189026, "rewards/margins": -0.039550863206386566, "rewards/rejected": 0.5502034425735474, "step": 4877 }, { "epoch": 0.7543785037695728, "grad_norm": 7.409026145935059, "learning_rate": 4.158551953259251e-06, "logits/chosen": 6.310328006744385, "logits/rejected": 6.384535789489746, "logps/chosen": -255.12356567382812, "logps/rejected": -273.47314453125, "loss": 0.6123, "rewards/accuracies": 0.625, "rewards/chosen": 0.11168798804283142, "rewards/margins": 0.41121432185173035, "rewards/rejected": -0.2995263636112213, "step": 4878 }, { "epoch": 0.754533152909337, "grad_norm": 4.742378234863281, "learning_rate": 4.158265551609578e-06, "logits/chosen": 8.79690933227539, "logits/rejected": 8.849470138549805, "logps/chosen": -203.0412139892578, "logps/rejected": -232.00360107421875, "loss": 0.6605, "rewards/accuracies": 0.5, "rewards/chosen": 0.32467830181121826, "rewards/margins": 0.10746662318706512, "rewards/rejected": 0.21721170842647552, "step": 4879 }, { "epoch": 0.7546878020491011, "grad_norm": 5.249179363250732, "learning_rate": 4.157979149959905e-06, "logits/chosen": 14.662715911865234, "logits/rejected": 4.638828754425049, "logps/chosen": -282.33544921875, "logps/rejected": -217.3389892578125, "loss": 0.4491, "rewards/accuracies": 0.875, "rewards/chosen": 0.1111377701163292, "rewards/margins": 0.9911292195320129, "rewards/rejected": -0.8799915313720703, "step": 4880 }, { "epoch": 0.7548424511888653, "grad_norm": 6.396276950836182, "learning_rate": 4.1576927483102304e-06, "logits/chosen": 14.171765327453613, "logits/rejected": 8.38521957397461, "logps/chosen": -312.46807861328125, "logps/rejected": -293.8635559082031, "loss": 0.5882, "rewards/accuracies": 0.75, "rewards/chosen": 0.8231029510498047, "rewards/margins": 0.3185581564903259, "rewards/rejected": 0.5045448541641235, "step": 4881 }, { "epoch": 0.7549971003286294, "grad_norm": 9.577472686767578, "learning_rate": 4.157406346660557e-06, "logits/chosen": 13.956840515136719, "logits/rejected": 4.463374137878418, "logps/chosen": -371.04583740234375, "logps/rejected": -228.9930419921875, "loss": 0.68, "rewards/accuracies": 0.375, "rewards/chosen": 0.14048179984092712, "rewards/margins": 0.14574843645095825, "rewards/rejected": -0.005266614258289337, "step": 4882 }, { "epoch": 0.7551517494683936, "grad_norm": 4.399951934814453, "learning_rate": 4.157119945010884e-06, "logits/chosen": 12.402183532714844, "logits/rejected": 8.673035621643066, "logps/chosen": -256.32330322265625, "logps/rejected": -179.59190368652344, "loss": 0.6318, "rewards/accuracies": 0.5, "rewards/chosen": 0.44452208280563354, "rewards/margins": 0.21307125687599182, "rewards/rejected": 0.2314508557319641, "step": 4883 }, { "epoch": 0.7553063986081577, "grad_norm": 7.600841522216797, "learning_rate": 4.15683354336121e-06, "logits/chosen": 5.742386341094971, "logits/rejected": 0.9494252800941467, "logps/chosen": -208.89401245117188, "logps/rejected": -207.08868408203125, "loss": 0.7911, "rewards/accuracies": 0.5, "rewards/chosen": 0.04910773038864136, "rewards/margins": -0.043230995535850525, "rewards/rejected": 0.09233871102333069, "step": 4884 }, { "epoch": 0.7554610477479219, "grad_norm": 5.3923726081848145, "learning_rate": 4.156547141711536e-06, "logits/chosen": 6.722903251647949, "logits/rejected": 4.033544540405273, "logps/chosen": -243.59324645996094, "logps/rejected": -210.42825317382812, "loss": 0.792, "rewards/accuracies": 0.5, "rewards/chosen": 0.19359853863716125, "rewards/margins": 0.09448637068271637, "rewards/rejected": 0.09911217540502548, "step": 4885 }, { "epoch": 0.755615696887686, "grad_norm": 10.00224781036377, "learning_rate": 4.156260740061863e-06, "logits/chosen": 6.39294958114624, "logits/rejected": 11.800689697265625, "logps/chosen": -212.2391815185547, "logps/rejected": -241.23724365234375, "loss": 0.806, "rewards/accuracies": 0.375, "rewards/chosen": -0.04922452196478844, "rewards/margins": -0.1879214346408844, "rewards/rejected": 0.13869690895080566, "step": 4886 }, { "epoch": 0.7557703460274502, "grad_norm": 3.972947835922241, "learning_rate": 4.1559743384121895e-06, "logits/chosen": 6.580738067626953, "logits/rejected": -0.7640472054481506, "logps/chosen": -221.8458251953125, "logps/rejected": -157.6392364501953, "loss": 0.478, "rewards/accuracies": 0.875, "rewards/chosen": 0.2947428822517395, "rewards/margins": 0.6070636510848999, "rewards/rejected": -0.3123207092285156, "step": 4887 }, { "epoch": 0.7559249951672143, "grad_norm": 5.774746417999268, "learning_rate": 4.155687936762516e-06, "logits/chosen": 2.8305511474609375, "logits/rejected": 2.8302910327911377, "logps/chosen": -232.02593994140625, "logps/rejected": -163.4444580078125, "loss": 0.8295, "rewards/accuracies": 0.375, "rewards/chosen": -0.06917095929384232, "rewards/margins": -0.12995997071266174, "rewards/rejected": 0.06078901141881943, "step": 4888 }, { "epoch": 0.7560796443069785, "grad_norm": 6.0303120613098145, "learning_rate": 4.155401535112843e-06, "logits/chosen": 6.560315132141113, "logits/rejected": 2.1548962593078613, "logps/chosen": -331.7688293457031, "logps/rejected": -228.27127075195312, "loss": 0.6343, "rewards/accuracies": 0.5, "rewards/chosen": 0.38874709606170654, "rewards/margins": 0.2311120480298996, "rewards/rejected": 0.15763501822948456, "step": 4889 }, { "epoch": 0.7562342934467428, "grad_norm": 4.204606056213379, "learning_rate": 4.155115133463169e-06, "logits/chosen": 12.035921096801758, "logits/rejected": 5.750936508178711, "logps/chosen": -287.957763671875, "logps/rejected": -219.12298583984375, "loss": 0.6494, "rewards/accuracies": 0.5, "rewards/chosen": 0.2778784930706024, "rewards/margins": 0.3323845863342285, "rewards/rejected": -0.05450611561536789, "step": 4890 }, { "epoch": 0.7563889425865069, "grad_norm": 3.987344980239868, "learning_rate": 4.154828731813495e-06, "logits/chosen": 9.979681015014648, "logits/rejected": 5.062463283538818, "logps/chosen": -354.886474609375, "logps/rejected": -304.448486328125, "loss": 0.4847, "rewards/accuracies": 0.875, "rewards/chosen": 0.45751842856407166, "rewards/margins": 0.5771106481552124, "rewards/rejected": -0.11959227919578552, "step": 4891 }, { "epoch": 0.7565435917262711, "grad_norm": 7.338598728179932, "learning_rate": 4.154542330163822e-06, "logits/chosen": 18.9298152923584, "logits/rejected": 14.901196479797363, "logps/chosen": -301.9948425292969, "logps/rejected": -228.91709899902344, "loss": 0.6626, "rewards/accuracies": 0.75, "rewards/chosen": 0.6963154077529907, "rewards/margins": 0.15749777853488922, "rewards/rejected": 0.5388177037239075, "step": 4892 }, { "epoch": 0.7566982408660352, "grad_norm": 4.613230228424072, "learning_rate": 4.1542559285141486e-06, "logits/chosen": 9.807443618774414, "logits/rejected": 9.253124237060547, "logps/chosen": -233.16943359375, "logps/rejected": -243.26144409179688, "loss": 0.6306, "rewards/accuracies": 0.625, "rewards/chosen": 0.042470961809158325, "rewards/margins": 0.2960153818130493, "rewards/rejected": -0.253544420003891, "step": 4893 }, { "epoch": 0.7568528900057994, "grad_norm": 5.299074649810791, "learning_rate": 4.153969526864475e-06, "logits/chosen": 5.765031814575195, "logits/rejected": 4.580773830413818, "logps/chosen": -357.42559814453125, "logps/rejected": -328.19256591796875, "loss": 0.507, "rewards/accuracies": 0.75, "rewards/chosen": 0.42679429054260254, "rewards/margins": 0.5977314710617065, "rewards/rejected": -0.17093724012374878, "step": 4894 }, { "epoch": 0.7570075391455635, "grad_norm": 4.942300319671631, "learning_rate": 4.153683125214801e-06, "logits/chosen": 13.360185623168945, "logits/rejected": 13.061016082763672, "logps/chosen": -289.798583984375, "logps/rejected": -281.2418212890625, "loss": 0.7466, "rewards/accuracies": 0.375, "rewards/chosen": 0.4073914885520935, "rewards/margins": -0.07652539759874344, "rewards/rejected": 0.48391687870025635, "step": 4895 }, { "epoch": 0.7571621882853277, "grad_norm": 6.595679759979248, "learning_rate": 4.153396723565128e-06, "logits/chosen": 11.112854957580566, "logits/rejected": 13.922195434570312, "logps/chosen": -244.72744750976562, "logps/rejected": -300.64288330078125, "loss": 0.759, "rewards/accuracies": 0.5, "rewards/chosen": 0.14549341797828674, "rewards/margins": -0.07439480721950531, "rewards/rejected": 0.21988821029663086, "step": 4896 }, { "epoch": 0.7573168374250918, "grad_norm": 7.344395637512207, "learning_rate": 4.153110321915454e-06, "logits/chosen": 13.290486335754395, "logits/rejected": 12.03635311126709, "logps/chosen": -289.7325134277344, "logps/rejected": -271.93499755859375, "loss": 0.7853, "rewards/accuracies": 0.5, "rewards/chosen": 0.49479347467422485, "rewards/margins": -0.0040227919816970825, "rewards/rejected": 0.49881622195243835, "step": 4897 }, { "epoch": 0.757471486564856, "grad_norm": 8.114124298095703, "learning_rate": 4.152823920265781e-06, "logits/chosen": 4.685720920562744, "logits/rejected": 9.170136451721191, "logps/chosen": -252.373779296875, "logps/rejected": -266.15460205078125, "loss": 0.7666, "rewards/accuracies": 0.625, "rewards/chosen": 0.3568749725818634, "rewards/margins": 0.19227677583694458, "rewards/rejected": 0.16459818184375763, "step": 4898 }, { "epoch": 0.7576261357046201, "grad_norm": 5.119345188140869, "learning_rate": 4.152537518616108e-06, "logits/chosen": 9.796183586120605, "logits/rejected": 9.1947603225708, "logps/chosen": -284.006591796875, "logps/rejected": -208.79620361328125, "loss": 0.7262, "rewards/accuracies": 0.5, "rewards/chosen": 0.3368726968765259, "rewards/margins": -0.016489751636981964, "rewards/rejected": 0.35336247086524963, "step": 4899 }, { "epoch": 0.7577807848443843, "grad_norm": 7.347695827484131, "learning_rate": 4.152251116966434e-06, "logits/chosen": 13.330623626708984, "logits/rejected": 5.243101119995117, "logps/chosen": -312.1974182128906, "logps/rejected": -243.76007080078125, "loss": 0.7052, "rewards/accuracies": 0.5, "rewards/chosen": 0.12130396068096161, "rewards/margins": 0.14944599568843842, "rewards/rejected": -0.028142064809799194, "step": 4900 }, { "epoch": 0.7579354339841484, "grad_norm": 6.108462333679199, "learning_rate": 4.151964715316761e-06, "logits/chosen": 14.061036109924316, "logits/rejected": 5.555707931518555, "logps/chosen": -408.7244873046875, "logps/rejected": -292.62860107421875, "loss": 0.5616, "rewards/accuracies": 0.75, "rewards/chosen": 0.662522554397583, "rewards/margins": 0.5321530103683472, "rewards/rejected": 0.13036946952342987, "step": 4901 }, { "epoch": 0.7580900831239126, "grad_norm": 5.217168807983398, "learning_rate": 4.151678313667087e-06, "logits/chosen": 14.124031066894531, "logits/rejected": 6.348136901855469, "logps/chosen": -321.36614990234375, "logps/rejected": -320.43267822265625, "loss": 0.5652, "rewards/accuracies": 0.75, "rewards/chosen": 0.5383907556533813, "rewards/margins": 0.3861134946346283, "rewards/rejected": 0.15227729082107544, "step": 4902 }, { "epoch": 0.7582447322636768, "grad_norm": 4.985553741455078, "learning_rate": 4.151391912017413e-06, "logits/chosen": 9.301740646362305, "logits/rejected": 8.294329643249512, "logps/chosen": -249.47061157226562, "logps/rejected": -267.64862060546875, "loss": 0.6142, "rewards/accuracies": 0.75, "rewards/chosen": 0.6381760835647583, "rewards/margins": 0.23665611445903778, "rewards/rejected": 0.4015199542045593, "step": 4903 }, { "epoch": 0.758399381403441, "grad_norm": 5.5900726318359375, "learning_rate": 4.15110551036774e-06, "logits/chosen": 11.056108474731445, "logits/rejected": 9.100286483764648, "logps/chosen": -239.73574829101562, "logps/rejected": -198.02235412597656, "loss": 0.8239, "rewards/accuracies": 0.375, "rewards/chosen": 0.04010138288140297, "rewards/margins": -0.15031249821186066, "rewards/rejected": 0.19041383266448975, "step": 4904 }, { "epoch": 0.7585540305432051, "grad_norm": 6.506530284881592, "learning_rate": 4.150819108718067e-06, "logits/chosen": 10.831025123596191, "logits/rejected": 8.439030647277832, "logps/chosen": -418.8076477050781, "logps/rejected": -347.3546447753906, "loss": 0.5912, "rewards/accuracies": 0.625, "rewards/chosen": 0.6486164331436157, "rewards/margins": 0.33601999282836914, "rewards/rejected": 0.3125964105129242, "step": 4905 }, { "epoch": 0.7587086796829693, "grad_norm": 6.190465927124023, "learning_rate": 4.150532707068393e-06, "logits/chosen": 8.84827995300293, "logits/rejected": 10.607156753540039, "logps/chosen": -227.6044921875, "logps/rejected": -306.0880126953125, "loss": 0.6276, "rewards/accuracies": 0.75, "rewards/chosen": 0.1654651165008545, "rewards/margins": 0.1953257918357849, "rewards/rejected": -0.029860682785511017, "step": 4906 }, { "epoch": 0.7588633288227334, "grad_norm": 4.181465148925781, "learning_rate": 4.15024630541872e-06, "logits/chosen": 5.151773929595947, "logits/rejected": -0.595993161201477, "logps/chosen": -272.97430419921875, "logps/rejected": -232.081787109375, "loss": 0.6154, "rewards/accuracies": 0.625, "rewards/chosen": 0.45322006940841675, "rewards/margins": 0.47670498490333557, "rewards/rejected": -0.023484911769628525, "step": 4907 }, { "epoch": 0.7590179779624976, "grad_norm": 6.769428253173828, "learning_rate": 4.149959903769046e-06, "logits/chosen": 12.372825622558594, "logits/rejected": 8.445409774780273, "logps/chosen": -211.14593505859375, "logps/rejected": -201.21739196777344, "loss": 0.6324, "rewards/accuracies": 0.625, "rewards/chosen": 0.4704332947731018, "rewards/margins": 0.27743685245513916, "rewards/rejected": 0.19299644231796265, "step": 4908 }, { "epoch": 0.7591726271022617, "grad_norm": 5.690778732299805, "learning_rate": 4.1496735021193724e-06, "logits/chosen": 4.834717273712158, "logits/rejected": 4.586916923522949, "logps/chosen": -334.7077941894531, "logps/rejected": -335.7353515625, "loss": 0.5469, "rewards/accuracies": 0.625, "rewards/chosen": 0.40061330795288086, "rewards/margins": 0.5135318636894226, "rewards/rejected": -0.11291855573654175, "step": 4909 }, { "epoch": 0.7593272762420259, "grad_norm": 5.367953300476074, "learning_rate": 4.149387100469699e-06, "logits/chosen": 5.731888294219971, "logits/rejected": 5.723881244659424, "logps/chosen": -225.95489501953125, "logps/rejected": -199.14968872070312, "loss": 0.6806, "rewards/accuracies": 0.5, "rewards/chosen": 0.10567726939916611, "rewards/margins": 0.08735520392656326, "rewards/rejected": 0.018322043120861053, "step": 4910 }, { "epoch": 0.75948192538179, "grad_norm": 5.577286720275879, "learning_rate": 4.149100698820026e-06, "logits/chosen": 7.287262916564941, "logits/rejected": 5.463646411895752, "logps/chosen": -240.44281005859375, "logps/rejected": -217.14999389648438, "loss": 0.6415, "rewards/accuracies": 0.625, "rewards/chosen": 0.14128904044628143, "rewards/margins": 0.1739535927772522, "rewards/rejected": -0.03266454115509987, "step": 4911 }, { "epoch": 0.7596365745215542, "grad_norm": 8.50859546661377, "learning_rate": 4.148814297170352e-06, "logits/chosen": 11.617916107177734, "logits/rejected": 7.3536906242370605, "logps/chosen": -236.2200164794922, "logps/rejected": -234.49818420410156, "loss": 0.7667, "rewards/accuracies": 0.375, "rewards/chosen": 0.3820585310459137, "rewards/margins": -0.055295251309871674, "rewards/rejected": 0.43735381960868835, "step": 4912 }, { "epoch": 0.7597912236613183, "grad_norm": 6.611783027648926, "learning_rate": 4.148527895520679e-06, "logits/chosen": 10.420655250549316, "logits/rejected": 12.067861557006836, "logps/chosen": -221.20645141601562, "logps/rejected": -231.5289306640625, "loss": 0.7919, "rewards/accuracies": 0.375, "rewards/chosen": 0.08360320329666138, "rewards/margins": -0.04212473705410957, "rewards/rejected": 0.12572795152664185, "step": 4913 }, { "epoch": 0.7599458728010825, "grad_norm": 4.780416965484619, "learning_rate": 4.148241493871006e-06, "logits/chosen": 16.03663444519043, "logits/rejected": 9.475438117980957, "logps/chosen": -422.89898681640625, "logps/rejected": -320.68621826171875, "loss": 0.3936, "rewards/accuracies": 0.875, "rewards/chosen": 0.5728496313095093, "rewards/margins": 0.8516335487365723, "rewards/rejected": -0.27878400683403015, "step": 4914 }, { "epoch": 0.7601005219408467, "grad_norm": 4.702272415161133, "learning_rate": 4.1479550922213315e-06, "logits/chosen": 11.05862808227539, "logits/rejected": 6.111822128295898, "logps/chosen": -435.7619323730469, "logps/rejected": -346.0679931640625, "loss": 0.4264, "rewards/accuracies": 0.875, "rewards/chosen": 0.9095495939254761, "rewards/margins": 0.7445962429046631, "rewards/rejected": 0.16495341062545776, "step": 4915 }, { "epoch": 0.7602551710806109, "grad_norm": 5.261887550354004, "learning_rate": 4.147668690571658e-06, "logits/chosen": 11.336010932922363, "logits/rejected": 7.728522300720215, "logps/chosen": -321.0860290527344, "logps/rejected": -234.15423583984375, "loss": 0.6268, "rewards/accuracies": 0.625, "rewards/chosen": 0.3022048771381378, "rewards/margins": 0.20865270495414734, "rewards/rejected": 0.09355220943689346, "step": 4916 }, { "epoch": 0.7604098202203751, "grad_norm": 5.102192401885986, "learning_rate": 4.147382288921985e-06, "logits/chosen": 14.532776832580566, "logits/rejected": 8.315821647644043, "logps/chosen": -181.035400390625, "logps/rejected": -107.02165222167969, "loss": 0.7974, "rewards/accuracies": 0.625, "rewards/chosen": -0.2955061197280884, "rewards/margins": 0.03571687638759613, "rewards/rejected": -0.3312229514122009, "step": 4917 }, { "epoch": 0.7605644693601392, "grad_norm": 5.660706043243408, "learning_rate": 4.1470958872723114e-06, "logits/chosen": 7.079586505889893, "logits/rejected": 9.5824556350708, "logps/chosen": -245.2150421142578, "logps/rejected": -275.98974609375, "loss": 0.6828, "rewards/accuracies": 0.625, "rewards/chosen": 0.2076207995414734, "rewards/margins": 0.12682899832725525, "rewards/rejected": 0.08079180866479874, "step": 4918 }, { "epoch": 0.7607191184999034, "grad_norm": 6.165192604064941, "learning_rate": 4.146809485622637e-06, "logits/chosen": 9.167647361755371, "logits/rejected": 5.921746253967285, "logps/chosen": -331.52447509765625, "logps/rejected": -273.09051513671875, "loss": 0.5835, "rewards/accuracies": 0.625, "rewards/chosen": 0.532878577709198, "rewards/margins": 0.3147548735141754, "rewards/rejected": 0.21812373399734497, "step": 4919 }, { "epoch": 0.7608737676396675, "grad_norm": 6.272213459014893, "learning_rate": 4.146523083972964e-06, "logits/chosen": 7.812296390533447, "logits/rejected": 7.431384086608887, "logps/chosen": -363.254638671875, "logps/rejected": -287.0889587402344, "loss": 0.6778, "rewards/accuracies": 0.375, "rewards/chosen": 0.539750874042511, "rewards/margins": 0.38153934478759766, "rewards/rejected": 0.15821152925491333, "step": 4920 }, { "epoch": 0.7610284167794317, "grad_norm": 5.577705383300781, "learning_rate": 4.1462366823232905e-06, "logits/chosen": 8.577412605285645, "logits/rejected": 13.420360565185547, "logps/chosen": -178.31939697265625, "logps/rejected": -206.78103637695312, "loss": 0.8226, "rewards/accuracies": 0.375, "rewards/chosen": -0.0735921561717987, "rewards/margins": -0.20445388555526733, "rewards/rejected": 0.13086172938346863, "step": 4921 }, { "epoch": 0.7611830659191958, "grad_norm": 4.6788835525512695, "learning_rate": 4.145950280673617e-06, "logits/chosen": 8.623282432556152, "logits/rejected": 3.681525468826294, "logps/chosen": -333.3215026855469, "logps/rejected": -256.15643310546875, "loss": 0.5208, "rewards/accuracies": 0.75, "rewards/chosen": 0.5315885543823242, "rewards/margins": 0.4524400532245636, "rewards/rejected": 0.07914847135543823, "step": 4922 }, { "epoch": 0.76133771505896, "grad_norm": 27.451623916625977, "learning_rate": 4.145663879023943e-06, "logits/chosen": 11.408885955810547, "logits/rejected": 9.009562492370605, "logps/chosen": -230.19955444335938, "logps/rejected": -240.6341552734375, "loss": 0.5536, "rewards/accuracies": 0.875, "rewards/chosen": 0.010083436965942383, "rewards/margins": 0.3672701418399811, "rewards/rejected": -0.3571867048740387, "step": 4923 }, { "epoch": 0.7614923641987241, "grad_norm": 5.923617362976074, "learning_rate": 4.14537747737427e-06, "logits/chosen": 7.339748859405518, "logits/rejected": 12.718791007995605, "logps/chosen": -188.9129638671875, "logps/rejected": -276.8684997558594, "loss": 0.7762, "rewards/accuracies": 0.625, "rewards/chosen": -0.2744036316871643, "rewards/margins": -0.07441519945859909, "rewards/rejected": -0.1999884396791458, "step": 4924 }, { "epoch": 0.7616470133384883, "grad_norm": 5.329258918762207, "learning_rate": 4.145091075724596e-06, "logits/chosen": 6.880941390991211, "logits/rejected": 6.928739547729492, "logps/chosen": -229.04409790039062, "logps/rejected": -280.45587158203125, "loss": 0.7642, "rewards/accuracies": 0.5, "rewards/chosen": 0.12328854203224182, "rewards/margins": -0.035579435527324677, "rewards/rejected": 0.1588679850101471, "step": 4925 }, { "epoch": 0.7618016624782524, "grad_norm": 16.788772583007812, "learning_rate": 4.144804674074923e-06, "logits/chosen": 11.350167274475098, "logits/rejected": 11.149557113647461, "logps/chosen": -355.59490966796875, "logps/rejected": -518.6079711914062, "loss": 0.4683, "rewards/accuracies": 0.75, "rewards/chosen": 0.29341644048690796, "rewards/margins": 0.756689190864563, "rewards/rejected": -0.4632726311683655, "step": 4926 }, { "epoch": 0.7619563116180166, "grad_norm": 4.877502918243408, "learning_rate": 4.14451827242525e-06, "logits/chosen": 13.342842102050781, "logits/rejected": 11.937655448913574, "logps/chosen": -229.94586181640625, "logps/rejected": -241.38990783691406, "loss": 0.682, "rewards/accuracies": 0.5, "rewards/chosen": 0.057029012590646744, "rewards/margins": 0.0905604436993599, "rewards/rejected": -0.03353142738342285, "step": 4927 }, { "epoch": 0.7621109607577807, "grad_norm": 8.58271312713623, "learning_rate": 4.144231870775575e-06, "logits/chosen": 7.603719711303711, "logits/rejected": 7.8524370193481445, "logps/chosen": -161.15945434570312, "logps/rejected": -162.17681884765625, "loss": 1.0879, "rewards/accuracies": 0.5, "rewards/chosen": -0.05612868070602417, "rewards/margins": -0.42669743299484253, "rewards/rejected": 0.37056878209114075, "step": 4928 }, { "epoch": 0.762265609897545, "grad_norm": 4.385808944702148, "learning_rate": 4.143945469125902e-06, "logits/chosen": 11.460906982421875, "logits/rejected": 9.257423400878906, "logps/chosen": -238.97616577148438, "logps/rejected": -269.02203369140625, "loss": 0.5928, "rewards/accuracies": 0.625, "rewards/chosen": 0.3834801912307739, "rewards/margins": 0.4303295314311981, "rewards/rejected": -0.046849325299263, "step": 4929 }, { "epoch": 0.7624202590373091, "grad_norm": 3.991694211959839, "learning_rate": 4.143659067476229e-06, "logits/chosen": 7.582732200622559, "logits/rejected": 6.187265872955322, "logps/chosen": -147.570068359375, "logps/rejected": -144.92042541503906, "loss": 0.651, "rewards/accuracies": 0.5, "rewards/chosen": -0.1417761743068695, "rewards/margins": 0.13091130554676056, "rewards/rejected": -0.27268749475479126, "step": 4930 }, { "epoch": 0.7625749081770733, "grad_norm": 3.6845335960388184, "learning_rate": 4.143372665826555e-06, "logits/chosen": 7.688179016113281, "logits/rejected": 8.560306549072266, "logps/chosen": -236.12631225585938, "logps/rejected": -233.07296752929688, "loss": 0.6004, "rewards/accuracies": 0.625, "rewards/chosen": 0.3454280495643616, "rewards/margins": 0.23559586703777313, "rewards/rejected": 0.10983218997716904, "step": 4931 }, { "epoch": 0.7627295573168374, "grad_norm": 5.549973011016846, "learning_rate": 4.143086264176882e-06, "logits/chosen": 5.1850738525390625, "logits/rejected": 9.952247619628906, "logps/chosen": -300.2906494140625, "logps/rejected": -282.5285339355469, "loss": 0.6156, "rewards/accuracies": 0.75, "rewards/chosen": 0.10462726652622223, "rewards/margins": 0.1874665915966034, "rewards/rejected": -0.08283932507038116, "step": 4932 }, { "epoch": 0.7628842064566016, "grad_norm": 5.1061272621154785, "learning_rate": 4.142799862527209e-06, "logits/chosen": 8.262295722961426, "logits/rejected": 4.53387975692749, "logps/chosen": -290.8871154785156, "logps/rejected": -196.09738159179688, "loss": 0.6445, "rewards/accuracies": 0.625, "rewards/chosen": 0.3276713490486145, "rewards/margins": 0.24858100712299347, "rewards/rejected": 0.07909035682678223, "step": 4933 }, { "epoch": 0.7630388555963658, "grad_norm": 4.63276481628418, "learning_rate": 4.142513460877535e-06, "logits/chosen": 7.329858303070068, "logits/rejected": 5.278559684753418, "logps/chosen": -259.938720703125, "logps/rejected": -244.70745849609375, "loss": 0.5135, "rewards/accuracies": 0.75, "rewards/chosen": 0.6174301505088806, "rewards/margins": 0.5074489712715149, "rewards/rejected": 0.10998116433620453, "step": 4934 }, { "epoch": 0.7631935047361299, "grad_norm": 8.939595222473145, "learning_rate": 4.142227059227861e-06, "logits/chosen": 9.228285789489746, "logits/rejected": 12.601855278015137, "logps/chosen": -330.9233093261719, "logps/rejected": -389.1082763671875, "loss": 0.9763, "rewards/accuracies": 0.25, "rewards/chosen": 0.21591608226299286, "rewards/margins": -0.45536142587661743, "rewards/rejected": 0.6712775230407715, "step": 4935 }, { "epoch": 0.763348153875894, "grad_norm": 4.449682235717773, "learning_rate": 4.141940657578188e-06, "logits/chosen": 9.54892349243164, "logits/rejected": 9.508878707885742, "logps/chosen": -290.98773193359375, "logps/rejected": -252.12350463867188, "loss": 0.5775, "rewards/accuracies": 0.75, "rewards/chosen": 0.42847490310668945, "rewards/margins": 0.3792840242385864, "rewards/rejected": 0.04919089749455452, "step": 4936 }, { "epoch": 0.7635028030156582, "grad_norm": 5.2768025398254395, "learning_rate": 4.141654255928514e-06, "logits/chosen": 8.646503448486328, "logits/rejected": 11.98926067352295, "logps/chosen": -238.91754150390625, "logps/rejected": -274.1827392578125, "loss": 0.7279, "rewards/accuracies": 0.625, "rewards/chosen": 0.02121984213590622, "rewards/margins": -0.003342166543006897, "rewards/rejected": 0.024561986327171326, "step": 4937 }, { "epoch": 0.7636574521554224, "grad_norm": 6.944789409637451, "learning_rate": 4.141367854278841e-06, "logits/chosen": 7.973559379577637, "logits/rejected": 8.873360633850098, "logps/chosen": -279.97442626953125, "logps/rejected": -290.258056640625, "loss": 0.5769, "rewards/accuracies": 0.75, "rewards/chosen": 0.5695616006851196, "rewards/margins": 0.32906368374824524, "rewards/rejected": 0.24049793183803558, "step": 4938 }, { "epoch": 0.7638121012951865, "grad_norm": 4.392387390136719, "learning_rate": 4.141081452629168e-06, "logits/chosen": 15.18172550201416, "logits/rejected": 10.412322998046875, "logps/chosen": -241.785400390625, "logps/rejected": -210.549072265625, "loss": 0.4549, "rewards/accuracies": 0.875, "rewards/chosen": 0.5665448904037476, "rewards/margins": 0.665616512298584, "rewards/rejected": -0.09907159209251404, "step": 4939 }, { "epoch": 0.7639667504349507, "grad_norm": 4.9878644943237305, "learning_rate": 4.140795050979494e-06, "logits/chosen": 7.041317939758301, "logits/rejected": 4.476365566253662, "logps/chosen": -183.692138671875, "logps/rejected": -146.60140991210938, "loss": 0.6065, "rewards/accuracies": 0.75, "rewards/chosen": 0.14306746423244476, "rewards/margins": 0.34993869066238403, "rewards/rejected": -0.20687125623226166, "step": 4940 }, { "epoch": 0.7641213995747148, "grad_norm": 4.474164962768555, "learning_rate": 4.14050864932982e-06, "logits/chosen": 10.443655967712402, "logits/rejected": 9.39891529083252, "logps/chosen": -217.26190185546875, "logps/rejected": -156.72608947753906, "loss": 0.6492, "rewards/accuracies": 0.5, "rewards/chosen": 0.4619622528553009, "rewards/margins": 0.14141054451465607, "rewards/rejected": 0.32055172324180603, "step": 4941 }, { "epoch": 0.7642760487144791, "grad_norm": 7.124011039733887, "learning_rate": 4.140222247680147e-06, "logits/chosen": 14.211482048034668, "logits/rejected": 15.702484130859375, "logps/chosen": -313.53753662109375, "logps/rejected": -309.4259338378906, "loss": 0.725, "rewards/accuracies": 0.375, "rewards/chosen": 0.22411996126174927, "rewards/margins": 0.044881612062454224, "rewards/rejected": 0.17923830449581146, "step": 4942 }, { "epoch": 0.7644306978542432, "grad_norm": 14.382828712463379, "learning_rate": 4.1399358460304735e-06, "logits/chosen": 9.890050888061523, "logits/rejected": 6.561791896820068, "logps/chosen": -351.99639892578125, "logps/rejected": -202.32537841796875, "loss": 0.6888, "rewards/accuracies": 0.625, "rewards/chosen": 0.20855769515037537, "rewards/margins": 0.08406265079975128, "rewards/rejected": 0.12449502944946289, "step": 4943 }, { "epoch": 0.7645853469940074, "grad_norm": 6.3000993728637695, "learning_rate": 4.1396494443808e-06, "logits/chosen": 13.678356170654297, "logits/rejected": 5.869448184967041, "logps/chosen": -316.6748046875, "logps/rejected": -177.1873779296875, "loss": 0.5366, "rewards/accuracies": 0.625, "rewards/chosen": 0.3079023063182831, "rewards/margins": 0.7318978309631348, "rewards/rejected": -0.4239955246448517, "step": 4944 }, { "epoch": 0.7647399961337715, "grad_norm": 7.358652591705322, "learning_rate": 4.139363042731127e-06, "logits/chosen": 12.029150009155273, "logits/rejected": 7.779742240905762, "logps/chosen": -492.4407653808594, "logps/rejected": -350.1304931640625, "loss": 0.3968, "rewards/accuracies": 0.75, "rewards/chosen": 0.46798601746559143, "rewards/margins": 1.2406266927719116, "rewards/rejected": -0.7726407051086426, "step": 4945 }, { "epoch": 0.7648946452735357, "grad_norm": 6.211153984069824, "learning_rate": 4.139076641081453e-06, "logits/chosen": 11.553871154785156, "logits/rejected": 4.746696472167969, "logps/chosen": -402.80206298828125, "logps/rejected": -255.36207580566406, "loss": 0.5038, "rewards/accuracies": 0.625, "rewards/chosen": 0.8193070888519287, "rewards/margins": 0.6366795897483826, "rewards/rejected": 0.18262745440006256, "step": 4946 }, { "epoch": 0.7650492944132998, "grad_norm": 6.916305065155029, "learning_rate": 4.13879023943178e-06, "logits/chosen": 7.095345497131348, "logits/rejected": 4.810436248779297, "logps/chosen": -298.9525451660156, "logps/rejected": -263.8277587890625, "loss": 0.6291, "rewards/accuracies": 0.5, "rewards/chosen": 0.4684111475944519, "rewards/margins": 0.20465654134750366, "rewards/rejected": 0.26375460624694824, "step": 4947 }, { "epoch": 0.765203943553064, "grad_norm": 5.330521583557129, "learning_rate": 4.138503837782106e-06, "logits/chosen": 16.100194931030273, "logits/rejected": 10.801311492919922, "logps/chosen": -389.47113037109375, "logps/rejected": -285.7940979003906, "loss": 0.5101, "rewards/accuracies": 0.625, "rewards/chosen": 0.4962612986564636, "rewards/margins": 0.5578832626342773, "rewards/rejected": -0.06162194162607193, "step": 4948 }, { "epoch": 0.7653585926928281, "grad_norm": 6.446768283843994, "learning_rate": 4.1382174361324325e-06, "logits/chosen": 11.464221954345703, "logits/rejected": 12.036646842956543, "logps/chosen": -350.800537109375, "logps/rejected": -340.69647216796875, "loss": 0.6202, "rewards/accuracies": 0.625, "rewards/chosen": 0.2223339080810547, "rewards/margins": 0.40763378143310547, "rewards/rejected": -0.18529987335205078, "step": 4949 }, { "epoch": 0.7655132418325923, "grad_norm": 5.683520793914795, "learning_rate": 4.137931034482759e-06, "logits/chosen": 9.901934623718262, "logits/rejected": 10.866971969604492, "logps/chosen": -188.46136474609375, "logps/rejected": -220.008544921875, "loss": 0.8576, "rewards/accuracies": 0.375, "rewards/chosen": 0.2402886301279068, "rewards/margins": -0.23111507296562195, "rewards/rejected": 0.47140371799468994, "step": 4950 }, { "epoch": 0.7656678909723564, "grad_norm": 6.309943675994873, "learning_rate": 4.137644632833086e-06, "logits/chosen": 8.952722549438477, "logits/rejected": 3.296704053878784, "logps/chosen": -278.5149841308594, "logps/rejected": -235.2010955810547, "loss": 0.6499, "rewards/accuracies": 0.75, "rewards/chosen": -0.03517545759677887, "rewards/margins": 0.11505352705717087, "rewards/rejected": -0.15022897720336914, "step": 4951 }, { "epoch": 0.7658225401121206, "grad_norm": 6.074580669403076, "learning_rate": 4.1373582311834125e-06, "logits/chosen": 15.289661407470703, "logits/rejected": 13.359203338623047, "logps/chosen": -366.68023681640625, "logps/rejected": -341.90728759765625, "loss": 0.7817, "rewards/accuracies": 0.375, "rewards/chosen": 0.09628450125455856, "rewards/margins": -0.037759676575660706, "rewards/rejected": 0.13404417037963867, "step": 4952 }, { "epoch": 0.7659771892518847, "grad_norm": 4.777254581451416, "learning_rate": 4.137071829533738e-06, "logits/chosen": 8.840413093566895, "logits/rejected": 3.5354385375976562, "logps/chosen": -158.81382751464844, "logps/rejected": -146.3869171142578, "loss": 0.645, "rewards/accuracies": 0.625, "rewards/chosen": 0.1140359491109848, "rewards/margins": 0.21172145009040833, "rewards/rejected": -0.09768549352884293, "step": 4953 }, { "epoch": 0.766131838391649, "grad_norm": 6.983048915863037, "learning_rate": 4.136785427884065e-06, "logits/chosen": 6.146685600280762, "logits/rejected": 8.76120376586914, "logps/chosen": -278.0133056640625, "logps/rejected": -344.5935363769531, "loss": 0.9043, "rewards/accuracies": 0.375, "rewards/chosen": -0.20453988015651703, "rewards/margins": -0.2946438193321228, "rewards/rejected": 0.09010394662618637, "step": 4954 }, { "epoch": 0.7662864875314132, "grad_norm": 5.467168807983398, "learning_rate": 4.1364990262343916e-06, "logits/chosen": 10.983089447021484, "logits/rejected": 8.220662117004395, "logps/chosen": -386.868408203125, "logps/rejected": -316.7525634765625, "loss": 0.5906, "rewards/accuracies": 0.875, "rewards/chosen": 0.44696810841560364, "rewards/margins": 0.2936115562915802, "rewards/rejected": 0.15335655212402344, "step": 4955 }, { "epoch": 0.7664411366711773, "grad_norm": 6.071159362792969, "learning_rate": 4.136212624584718e-06, "logits/chosen": 9.73252010345459, "logits/rejected": 10.7449951171875, "logps/chosen": -280.9246520996094, "logps/rejected": -238.2398223876953, "loss": 0.7582, "rewards/accuracies": 0.625, "rewards/chosen": 0.10747911036014557, "rewards/margins": -0.057358723133802414, "rewards/rejected": 0.16483783721923828, "step": 4956 }, { "epoch": 0.7665957858109415, "grad_norm": 4.621884346008301, "learning_rate": 4.135926222935044e-06, "logits/chosen": 5.683119773864746, "logits/rejected": 11.96535873413086, "logps/chosen": -147.86508178710938, "logps/rejected": -208.42898559570312, "loss": 0.7092, "rewards/accuracies": 0.5, "rewards/chosen": -0.21912966668605804, "rewards/margins": 0.04856341332197189, "rewards/rejected": -0.26769304275512695, "step": 4957 }, { "epoch": 0.7667504349507056, "grad_norm": 4.188169479370117, "learning_rate": 4.135639821285371e-06, "logits/chosen": 14.546852111816406, "logits/rejected": 8.762707710266113, "logps/chosen": -266.2210693359375, "logps/rejected": -157.12014770507812, "loss": 0.7144, "rewards/accuracies": 0.625, "rewards/chosen": -0.0008868449367582798, "rewards/margins": 0.0835040882229805, "rewards/rejected": -0.08439092338085175, "step": 4958 }, { "epoch": 0.7669050840904698, "grad_norm": 5.067059516906738, "learning_rate": 4.135353419635697e-06, "logits/chosen": 14.334366798400879, "logits/rejected": 7.460714340209961, "logps/chosen": -253.41856384277344, "logps/rejected": -176.9868927001953, "loss": 0.6364, "rewards/accuracies": 0.625, "rewards/chosen": -0.1870420277118683, "rewards/margins": 0.16429150104522705, "rewards/rejected": -0.35133352875709534, "step": 4959 }, { "epoch": 0.7670597332302339, "grad_norm": 4.723127365112305, "learning_rate": 4.135067017986024e-06, "logits/chosen": 9.781179428100586, "logits/rejected": 7.438054084777832, "logps/chosen": -200.795654296875, "logps/rejected": -223.2666778564453, "loss": 0.6219, "rewards/accuracies": 0.75, "rewards/chosen": 0.1913708746433258, "rewards/margins": 0.24018707871437073, "rewards/rejected": -0.04881620779633522, "step": 4960 }, { "epoch": 0.7672143823699981, "grad_norm": 4.799639701843262, "learning_rate": 4.13478061633635e-06, "logits/chosen": 9.256240844726562, "logits/rejected": 4.518589973449707, "logps/chosen": -257.2502136230469, "logps/rejected": -213.87136840820312, "loss": 0.6418, "rewards/accuracies": 0.75, "rewards/chosen": 0.017122391611337662, "rewards/margins": 0.18628929555416107, "rewards/rejected": -0.1691669076681137, "step": 4961 }, { "epoch": 0.7673690315097622, "grad_norm": 9.123564720153809, "learning_rate": 4.1344942146866764e-06, "logits/chosen": 4.829556465148926, "logits/rejected": 6.611625671386719, "logps/chosen": -330.2411193847656, "logps/rejected": -341.55194091796875, "loss": 0.8158, "rewards/accuracies": 0.375, "rewards/chosen": -0.19843865931034088, "rewards/margins": -0.1875607669353485, "rewards/rejected": -0.010877883061766624, "step": 4962 }, { "epoch": 0.7675236806495264, "grad_norm": 4.474172115325928, "learning_rate": 4.134207813037003e-06, "logits/chosen": 8.630349159240723, "logits/rejected": 1.6360594034194946, "logps/chosen": -315.9197998046875, "logps/rejected": -197.06532287597656, "loss": 0.5598, "rewards/accuracies": 0.625, "rewards/chosen": 0.31645143032073975, "rewards/margins": 0.400278240442276, "rewards/rejected": -0.08382683992385864, "step": 4963 }, { "epoch": 0.7676783297892905, "grad_norm": 5.443153381347656, "learning_rate": 4.13392141138733e-06, "logits/chosen": 7.913963794708252, "logits/rejected": 8.156455993652344, "logps/chosen": -246.21231079101562, "logps/rejected": -281.65289306640625, "loss": 0.5524, "rewards/accuracies": 0.625, "rewards/chosen": 0.2347273826599121, "rewards/margins": 0.4020341634750366, "rewards/rejected": -0.1673067808151245, "step": 4964 }, { "epoch": 0.7678329789290547, "grad_norm": 4.753653526306152, "learning_rate": 4.133635009737656e-06, "logits/chosen": 10.860950469970703, "logits/rejected": 4.748505592346191, "logps/chosen": -373.42120361328125, "logps/rejected": -267.4067077636719, "loss": 0.5052, "rewards/accuracies": 0.75, "rewards/chosen": 0.2632021903991699, "rewards/margins": 0.5404651761054993, "rewards/rejected": -0.27726298570632935, "step": 4965 }, { "epoch": 0.7679876280688188, "grad_norm": 5.557091236114502, "learning_rate": 4.133348608087983e-06, "logits/chosen": 13.716362953186035, "logits/rejected": 9.751205444335938, "logps/chosen": -358.31390380859375, "logps/rejected": -269.47705078125, "loss": 0.5535, "rewards/accuracies": 0.875, "rewards/chosen": 0.5474659204483032, "rewards/margins": 0.36240923404693604, "rewards/rejected": 0.1850566864013672, "step": 4966 }, { "epoch": 0.7681422772085831, "grad_norm": 5.823578834533691, "learning_rate": 4.13306220643831e-06, "logits/chosen": 8.433234214782715, "logits/rejected": 4.309297561645508, "logps/chosen": -269.5269775390625, "logps/rejected": -251.9385986328125, "loss": 0.6165, "rewards/accuracies": 0.75, "rewards/chosen": 0.014511585235595703, "rewards/margins": 0.20919501781463623, "rewards/rejected": -0.19468346238136292, "step": 4967 }, { "epoch": 0.7682969263483472, "grad_norm": 3.7419159412384033, "learning_rate": 4.1327758047886355e-06, "logits/chosen": 8.293190002441406, "logits/rejected": 5.601429462432861, "logps/chosen": -213.33518981933594, "logps/rejected": -152.77780151367188, "loss": 0.5051, "rewards/accuracies": 0.875, "rewards/chosen": -0.006918720901012421, "rewards/margins": 0.6592031121253967, "rewards/rejected": -0.6661218404769897, "step": 4968 }, { "epoch": 0.7684515754881114, "grad_norm": 5.663111686706543, "learning_rate": 4.132489403138962e-06, "logits/chosen": 13.131458282470703, "logits/rejected": 11.536008834838867, "logps/chosen": -243.63339233398438, "logps/rejected": -233.37100219726562, "loss": 0.7241, "rewards/accuracies": 0.625, "rewards/chosen": 0.3481830954551697, "rewards/margins": 0.0019211731851100922, "rewards/rejected": 0.3462618887424469, "step": 4969 }, { "epoch": 0.7686062246278755, "grad_norm": 5.256002902984619, "learning_rate": 4.132203001489289e-06, "logits/chosen": 8.378101348876953, "logits/rejected": 12.492430686950684, "logps/chosen": -143.73837280273438, "logps/rejected": -193.728271484375, "loss": 0.7182, "rewards/accuracies": 0.625, "rewards/chosen": 0.2743394374847412, "rewards/margins": -0.015911810100078583, "rewards/rejected": 0.290251225233078, "step": 4970 }, { "epoch": 0.7687608737676397, "grad_norm": 4.80961275100708, "learning_rate": 4.1319165998396154e-06, "logits/chosen": 3.9095687866210938, "logits/rejected": 5.7025322914123535, "logps/chosen": -201.0200958251953, "logps/rejected": -230.4980926513672, "loss": 0.6261, "rewards/accuracies": 0.875, "rewards/chosen": 0.23607198894023895, "rewards/margins": 0.3503708839416504, "rewards/rejected": -0.11429891735315323, "step": 4971 }, { "epoch": 0.7689155229074038, "grad_norm": 7.354063034057617, "learning_rate": 4.131630198189942e-06, "logits/chosen": 9.680054664611816, "logits/rejected": 10.74063777923584, "logps/chosen": -301.8260803222656, "logps/rejected": -282.119140625, "loss": 0.7921, "rewards/accuracies": 0.375, "rewards/chosen": 0.08449564129114151, "rewards/margins": -0.02164582349359989, "rewards/rejected": 0.10614146292209625, "step": 4972 }, { "epoch": 0.769070172047168, "grad_norm": 5.525781631469727, "learning_rate": 4.131343796540269e-06, "logits/chosen": 7.839278221130371, "logits/rejected": 9.007648468017578, "logps/chosen": -162.90118408203125, "logps/rejected": -163.5792694091797, "loss": 0.7822, "rewards/accuracies": 0.5, "rewards/chosen": 0.127905935049057, "rewards/margins": -0.11784529685974121, "rewards/rejected": 0.24575123190879822, "step": 4973 }, { "epoch": 0.7692248211869321, "grad_norm": 5.639936923980713, "learning_rate": 4.1310573948905945e-06, "logits/chosen": 11.192437171936035, "logits/rejected": 5.996338844299316, "logps/chosen": -308.0993957519531, "logps/rejected": -177.96994018554688, "loss": 0.7569, "rewards/accuracies": 0.5, "rewards/chosen": 0.15776333212852478, "rewards/margins": -0.01452586054801941, "rewards/rejected": 0.1722891926765442, "step": 4974 }, { "epoch": 0.7693794703266963, "grad_norm": 4.952838897705078, "learning_rate": 4.130770993240921e-06, "logits/chosen": 12.394794464111328, "logits/rejected": 8.547128677368164, "logps/chosen": -340.3673095703125, "logps/rejected": -217.74176025390625, "loss": 0.6889, "rewards/accuracies": 0.625, "rewards/chosen": 0.20627279579639435, "rewards/margins": 0.035782910883426666, "rewards/rejected": 0.17048987746238708, "step": 4975 }, { "epoch": 0.7695341194664604, "grad_norm": 6.5808820724487305, "learning_rate": 4.130484591591248e-06, "logits/chosen": 12.566085815429688, "logits/rejected": 8.219300270080566, "logps/chosen": -344.2880859375, "logps/rejected": -275.83807373046875, "loss": 0.5452, "rewards/accuracies": 0.75, "rewards/chosen": 0.2173694670200348, "rewards/margins": 0.44629353284835815, "rewards/rejected": -0.22892408072948456, "step": 4976 }, { "epoch": 0.7696887686062246, "grad_norm": 8.133243560791016, "learning_rate": 4.1301981899415745e-06, "logits/chosen": 9.09133529663086, "logits/rejected": 8.781269073486328, "logps/chosen": -545.9765014648438, "logps/rejected": -423.1166687011719, "loss": 0.4537, "rewards/accuracies": 1.0, "rewards/chosen": 0.380987286567688, "rewards/margins": 0.6742621660232544, "rewards/rejected": -0.2932748794555664, "step": 4977 }, { "epoch": 0.7698434177459887, "grad_norm": 5.225375175476074, "learning_rate": 4.129911788291901e-06, "logits/chosen": 9.772926330566406, "logits/rejected": 4.344759941101074, "logps/chosen": -332.7449951171875, "logps/rejected": -240.3415985107422, "loss": 0.5543, "rewards/accuracies": 0.75, "rewards/chosen": 0.5672166347503662, "rewards/margins": 0.4154619872570038, "rewards/rejected": 0.15175466239452362, "step": 4978 }, { "epoch": 0.7699980668857529, "grad_norm": 6.085546016693115, "learning_rate": 4.129625386642228e-06, "logits/chosen": 9.420472145080566, "logits/rejected": 14.559541702270508, "logps/chosen": -210.8945770263672, "logps/rejected": -284.9475402832031, "loss": 0.7385, "rewards/accuracies": 0.5, "rewards/chosen": -0.27289485931396484, "rewards/margins": 0.004035197198390961, "rewards/rejected": -0.276930034160614, "step": 4979 }, { "epoch": 0.7701527160255172, "grad_norm": 5.08929967880249, "learning_rate": 4.1293389849925544e-06, "logits/chosen": 13.113638877868652, "logits/rejected": 5.629592418670654, "logps/chosen": -288.2409362792969, "logps/rejected": -213.57568359375, "loss": 0.5785, "rewards/accuracies": 0.625, "rewards/chosen": -0.036273956298828125, "rewards/margins": 0.4562690258026123, "rewards/rejected": -0.49254298210144043, "step": 4980 }, { "epoch": 0.7703073651652813, "grad_norm": 7.583585262298584, "learning_rate": 4.12905258334288e-06, "logits/chosen": 8.815882682800293, "logits/rejected": 7.21228551864624, "logps/chosen": -397.4370422363281, "logps/rejected": -377.3230895996094, "loss": 0.7379, "rewards/accuracies": 0.625, "rewards/chosen": 0.3927827775478363, "rewards/margins": 0.03347659483551979, "rewards/rejected": 0.3593061566352844, "step": 4981 }, { "epoch": 0.7704620143050455, "grad_norm": 7.3094377517700195, "learning_rate": 4.128766181693207e-06, "logits/chosen": 7.136903285980225, "logits/rejected": 7.766732215881348, "logps/chosen": -341.82818603515625, "logps/rejected": -320.99542236328125, "loss": 0.7132, "rewards/accuracies": 0.5, "rewards/chosen": 0.1656390130519867, "rewards/margins": 0.06980957090854645, "rewards/rejected": 0.09582944214344025, "step": 4982 }, { "epoch": 0.7706166634448096, "grad_norm": 4.940464973449707, "learning_rate": 4.1284797800435335e-06, "logits/chosen": 11.49021053314209, "logits/rejected": 8.019957542419434, "logps/chosen": -340.2213134765625, "logps/rejected": -349.8654479980469, "loss": 0.4907, "rewards/accuracies": 0.75, "rewards/chosen": 0.28009265661239624, "rewards/margins": 0.5583404302597046, "rewards/rejected": -0.2782478630542755, "step": 4983 }, { "epoch": 0.7707713125845738, "grad_norm": 3.208820104598999, "learning_rate": 4.12819337839386e-06, "logits/chosen": 9.834213256835938, "logits/rejected": 6.487322807312012, "logps/chosen": -180.23861694335938, "logps/rejected": -119.73690032958984, "loss": 0.5539, "rewards/accuracies": 0.75, "rewards/chosen": 0.3990585207939148, "rewards/margins": 0.389373242855072, "rewards/rejected": 0.009685254655778408, "step": 4984 }, { "epoch": 0.7709259617243379, "grad_norm": 5.376028060913086, "learning_rate": 4.127906976744187e-06, "logits/chosen": 6.716535568237305, "logits/rejected": 11.201910018920898, "logps/chosen": -239.73699951171875, "logps/rejected": -263.1300048828125, "loss": 0.778, "rewards/accuracies": 0.625, "rewards/chosen": 0.36066412925720215, "rewards/margins": -0.0027289018034934998, "rewards/rejected": 0.36339300870895386, "step": 4985 }, { "epoch": 0.7710806108641021, "grad_norm": 4.978393077850342, "learning_rate": 4.1276205750945135e-06, "logits/chosen": 9.114242553710938, "logits/rejected": 10.812358856201172, "logps/chosen": -188.74911499023438, "logps/rejected": -207.45289611816406, "loss": 0.5866, "rewards/accuracies": 0.625, "rewards/chosen": -0.26826804876327515, "rewards/margins": 0.4088376462459564, "rewards/rejected": -0.6771056652069092, "step": 4986 }, { "epoch": 0.7712352600038662, "grad_norm": 5.60716438293457, "learning_rate": 4.127334173444839e-06, "logits/chosen": 14.299121856689453, "logits/rejected": 9.205286026000977, "logps/chosen": -429.9416198730469, "logps/rejected": -403.2704772949219, "loss": 0.5664, "rewards/accuracies": 0.75, "rewards/chosen": 0.5752893686294556, "rewards/margins": 0.33691641688346863, "rewards/rejected": 0.23837293684482574, "step": 4987 }, { "epoch": 0.7713899091436304, "grad_norm": 5.608139991760254, "learning_rate": 4.127047771795166e-06, "logits/chosen": 9.980561256408691, "logits/rejected": 7.705402374267578, "logps/chosen": -340.0922546386719, "logps/rejected": -294.58392333984375, "loss": 0.5385, "rewards/accuracies": 0.75, "rewards/chosen": -0.027302458882331848, "rewards/margins": 0.45669716596603394, "rewards/rejected": -0.4839996099472046, "step": 4988 }, { "epoch": 0.7715445582833945, "grad_norm": 7.99901819229126, "learning_rate": 4.126761370145493e-06, "logits/chosen": 4.902597427368164, "logits/rejected": -0.06300020217895508, "logps/chosen": -292.90606689453125, "logps/rejected": -405.0431823730469, "loss": 0.7373, "rewards/accuracies": 0.625, "rewards/chosen": 0.08763332664966583, "rewards/margins": 0.19842568039894104, "rewards/rejected": -0.11079235374927521, "step": 4989 }, { "epoch": 0.7716992074231587, "grad_norm": 52.14344024658203, "learning_rate": 4.126474968495819e-06, "logits/chosen": 5.340859413146973, "logits/rejected": 5.4607038497924805, "logps/chosen": -248.3724365234375, "logps/rejected": -188.8089141845703, "loss": 0.5625, "rewards/accuracies": 0.75, "rewards/chosen": 0.04004386439919472, "rewards/margins": 0.39794838428497314, "rewards/rejected": -0.35790449380874634, "step": 4990 }, { "epoch": 0.7718538565629228, "grad_norm": 5.102663993835449, "learning_rate": 4.126188566846145e-06, "logits/chosen": 12.467475891113281, "logits/rejected": 8.76297664642334, "logps/chosen": -270.7680358886719, "logps/rejected": -276.18695068359375, "loss": 0.6774, "rewards/accuracies": 0.375, "rewards/chosen": 0.039369676262140274, "rewards/margins": 0.06542877852916718, "rewards/rejected": -0.0260591059923172, "step": 4991 }, { "epoch": 0.772008505702687, "grad_norm": 4.296579837799072, "learning_rate": 4.125902165196472e-06, "logits/chosen": 11.526104927062988, "logits/rejected": 11.35513687133789, "logps/chosen": -204.5261688232422, "logps/rejected": -185.63218688964844, "loss": 0.5754, "rewards/accuracies": 0.5, "rewards/chosen": 0.11636167764663696, "rewards/margins": 0.43150269985198975, "rewards/rejected": -0.3151410222053528, "step": 4992 }, { "epoch": 0.7721631548424512, "grad_norm": 6.3434739112854, "learning_rate": 4.125615763546798e-06, "logits/chosen": 13.317995071411133, "logits/rejected": 2.7605810165405273, "logps/chosen": -307.75518798828125, "logps/rejected": -178.917236328125, "loss": 0.7527, "rewards/accuracies": 0.5, "rewards/chosen": -0.10010546445846558, "rewards/margins": 0.09486135840415955, "rewards/rejected": -0.19496682286262512, "step": 4993 }, { "epoch": 0.7723178039822154, "grad_norm": 4.062440395355225, "learning_rate": 4.125329361897125e-06, "logits/chosen": 10.902128219604492, "logits/rejected": 5.421245098114014, "logps/chosen": -213.04019165039062, "logps/rejected": -215.32192993164062, "loss": 0.6008, "rewards/accuracies": 0.625, "rewards/chosen": 0.18839725852012634, "rewards/margins": 0.2639303207397461, "rewards/rejected": -0.07553304731845856, "step": 4994 }, { "epoch": 0.7724724531219795, "grad_norm": 6.132646560668945, "learning_rate": 4.125042960247451e-06, "logits/chosen": 17.396726608276367, "logits/rejected": 11.30938720703125, "logps/chosen": -468.0548095703125, "logps/rejected": -302.32281494140625, "loss": 0.5418, "rewards/accuracies": 0.875, "rewards/chosen": 0.07890496402978897, "rewards/margins": 0.4038117229938507, "rewards/rejected": -0.32490673661231995, "step": 4995 }, { "epoch": 0.7726271022617437, "grad_norm": 5.574467658996582, "learning_rate": 4.1247565585977775e-06, "logits/chosen": 11.290813446044922, "logits/rejected": 5.554466724395752, "logps/chosen": -268.62811279296875, "logps/rejected": -196.56036376953125, "loss": 0.7009, "rewards/accuracies": 0.5, "rewards/chosen": -0.06991252303123474, "rewards/margins": 0.1462816745042801, "rewards/rejected": -0.21619418263435364, "step": 4996 }, { "epoch": 0.7727817514015078, "grad_norm": 5.451486110687256, "learning_rate": 4.124470156948104e-06, "logits/chosen": 11.673201560974121, "logits/rejected": 6.237618446350098, "logps/chosen": -254.86947631835938, "logps/rejected": -244.02609252929688, "loss": 0.6469, "rewards/accuracies": 0.375, "rewards/chosen": -0.0310808178037405, "rewards/margins": 0.23386698961257935, "rewards/rejected": -0.2649478316307068, "step": 4997 }, { "epoch": 0.772936400541272, "grad_norm": 5.163326263427734, "learning_rate": 4.124183755298431e-06, "logits/chosen": 13.895020484924316, "logits/rejected": 7.872200012207031, "logps/chosen": -313.21148681640625, "logps/rejected": -273.2898864746094, "loss": 0.6221, "rewards/accuracies": 0.625, "rewards/chosen": 0.2681724727153778, "rewards/margins": 0.34868061542510986, "rewards/rejected": -0.08050814270973206, "step": 4998 }, { "epoch": 0.7730910496810361, "grad_norm": 4.0748419761657715, "learning_rate": 4.123897353648757e-06, "logits/chosen": 9.035528182983398, "logits/rejected": 3.976304054260254, "logps/chosen": -314.4695739746094, "logps/rejected": -270.3165588378906, "loss": 0.4937, "rewards/accuracies": 0.875, "rewards/chosen": -0.05133618414402008, "rewards/margins": 0.5926632285118103, "rewards/rejected": -0.643999457359314, "step": 4999 }, { "epoch": 0.7732456988208003, "grad_norm": 4.605780124664307, "learning_rate": 4.123610951999084e-06, "logits/chosen": 10.038984298706055, "logits/rejected": 10.29881763458252, "logps/chosen": -179.95416259765625, "logps/rejected": -219.22116088867188, "loss": 0.641, "rewards/accuracies": 0.625, "rewards/chosen": 0.023074060678482056, "rewards/margins": 0.1538427174091339, "rewards/rejected": -0.13076864182949066, "step": 5000 }, { "epoch": 0.7734003479605644, "grad_norm": 4.89199686050415, "learning_rate": 4.12332455034941e-06, "logits/chosen": 7.259953022003174, "logits/rejected": 8.65902328491211, "logps/chosen": -395.1523132324219, "logps/rejected": -487.6247253417969, "loss": 0.7161, "rewards/accuracies": 0.625, "rewards/chosen": 0.12864947319030762, "rewards/margins": 0.10570831596851349, "rewards/rejected": 0.022941168397665024, "step": 5001 }, { "epoch": 0.7735549971003286, "grad_norm": 7.77675724029541, "learning_rate": 4.1230381486997365e-06, "logits/chosen": 7.0220723152160645, "logits/rejected": 9.41964340209961, "logps/chosen": -211.3035430908203, "logps/rejected": -282.9604797363281, "loss": 0.7156, "rewards/accuracies": 0.5, "rewards/chosen": -0.04720047116279602, "rewards/margins": 0.0022997967898845673, "rewards/rejected": -0.049500271677970886, "step": 5002 }, { "epoch": 0.7737096462400928, "grad_norm": 5.483887672424316, "learning_rate": 4.122751747050063e-06, "logits/chosen": 7.901454925537109, "logits/rejected": 9.114794731140137, "logps/chosen": -306.2235412597656, "logps/rejected": -273.46429443359375, "loss": 0.7368, "rewards/accuracies": 0.5, "rewards/chosen": 0.01944485306739807, "rewards/margins": 0.17546787858009338, "rewards/rejected": -0.1560230404138565, "step": 5003 }, { "epoch": 0.7738642953798569, "grad_norm": 4.512119770050049, "learning_rate": 4.12246534540039e-06, "logits/chosen": 6.18450403213501, "logits/rejected": 5.381974697113037, "logps/chosen": -284.0148620605469, "logps/rejected": -230.7216796875, "loss": 0.5546, "rewards/accuracies": 0.75, "rewards/chosen": 0.31669148802757263, "rewards/margins": 0.45358434319496155, "rewards/rejected": -0.13689284026622772, "step": 5004 }, { "epoch": 0.774018944519621, "grad_norm": 5.064146041870117, "learning_rate": 4.1221789437507165e-06, "logits/chosen": 9.378414154052734, "logits/rejected": 8.250425338745117, "logps/chosen": -268.95367431640625, "logps/rejected": -256.632080078125, "loss": 0.7122, "rewards/accuracies": 0.625, "rewards/chosen": 0.2516672909259796, "rewards/margins": 0.044621050357818604, "rewards/rejected": 0.207046240568161, "step": 5005 }, { "epoch": 0.7741735936593853, "grad_norm": 5.158390045166016, "learning_rate": 4.121892542101043e-06, "logits/chosen": 6.5220489501953125, "logits/rejected": 7.144718647003174, "logps/chosen": -272.63958740234375, "logps/rejected": -232.56692504882812, "loss": 0.5436, "rewards/accuracies": 0.75, "rewards/chosen": 0.4590062201023102, "rewards/margins": 0.4421843886375427, "rewards/rejected": 0.01682186871767044, "step": 5006 }, { "epoch": 0.7743282427991495, "grad_norm": 4.248188495635986, "learning_rate": 4.121606140451369e-06, "logits/chosen": 10.51870346069336, "logits/rejected": 8.302328109741211, "logps/chosen": -318.8675537109375, "logps/rejected": -234.2373809814453, "loss": 0.5413, "rewards/accuracies": 0.875, "rewards/chosen": 0.2951411008834839, "rewards/margins": 0.3687536418437958, "rewards/rejected": -0.07361254096031189, "step": 5007 }, { "epoch": 0.7744828919389136, "grad_norm": 4.663799285888672, "learning_rate": 4.1213197388016956e-06, "logits/chosen": 7.878110885620117, "logits/rejected": 2.713817596435547, "logps/chosen": -159.81491088867188, "logps/rejected": -154.60916137695312, "loss": 0.6746, "rewards/accuracies": 0.5, "rewards/chosen": 0.191911518573761, "rewards/margins": 0.0977015495300293, "rewards/rejected": 0.09420996904373169, "step": 5008 }, { "epoch": 0.7746375410786778, "grad_norm": 4.316108226776123, "learning_rate": 4.121033337152022e-06, "logits/chosen": 13.941579818725586, "logits/rejected": 6.019278526306152, "logps/chosen": -273.7798767089844, "logps/rejected": -162.07249450683594, "loss": 0.5554, "rewards/accuracies": 0.75, "rewards/chosen": 0.3104780316352844, "rewards/margins": 0.3796347677707672, "rewards/rejected": -0.06915673613548279, "step": 5009 }, { "epoch": 0.7747921902184419, "grad_norm": 7.142570495605469, "learning_rate": 4.120746935502349e-06, "logits/chosen": 6.283424377441406, "logits/rejected": 9.530901908874512, "logps/chosen": -266.60418701171875, "logps/rejected": -279.9219055175781, "loss": 0.8331, "rewards/accuracies": 0.25, "rewards/chosen": -0.2871186435222626, "rewards/margins": -0.09328995645046234, "rewards/rejected": -0.19382868707180023, "step": 5010 }, { "epoch": 0.7749468393582061, "grad_norm": 6.1782050132751465, "learning_rate": 4.1204605338526755e-06, "logits/chosen": 14.791120529174805, "logits/rejected": 10.545404434204102, "logps/chosen": -489.9462890625, "logps/rejected": -317.0007629394531, "loss": 0.7888, "rewards/accuracies": 0.625, "rewards/chosen": -0.02662043832242489, "rewards/margins": 0.09979776293039322, "rewards/rejected": -0.12641818821430206, "step": 5011 }, { "epoch": 0.7751014884979702, "grad_norm": 6.908194065093994, "learning_rate": 4.120174132203002e-06, "logits/chosen": 8.393936157226562, "logits/rejected": 13.2125244140625, "logps/chosen": -309.270263671875, "logps/rejected": -380.57861328125, "loss": 0.773, "rewards/accuracies": 0.625, "rewards/chosen": 0.01415882259607315, "rewards/margins": -0.011274144053459167, "rewards/rejected": 0.025432981550693512, "step": 5012 }, { "epoch": 0.7752561376377344, "grad_norm": 712.8502197265625, "learning_rate": 4.119887730553329e-06, "logits/chosen": 2.06146240234375, "logits/rejected": 5.159744739532471, "logps/chosen": -181.00794982910156, "logps/rejected": -722.385986328125, "loss": 0.663, "rewards/accuracies": 0.625, "rewards/chosen": -0.15017251670360565, "rewards/margins": 0.28974276781082153, "rewards/rejected": -0.4399152994155884, "step": 5013 }, { "epoch": 0.7754107867774985, "grad_norm": 6.284900665283203, "learning_rate": 4.119601328903655e-06, "logits/chosen": 10.280906677246094, "logits/rejected": 13.962762832641602, "logps/chosen": -317.97283935546875, "logps/rejected": -346.02960205078125, "loss": 0.7059, "rewards/accuracies": 0.625, "rewards/chosen": 0.24447432160377502, "rewards/margins": 0.0720866397023201, "rewards/rejected": 0.17238768935203552, "step": 5014 }, { "epoch": 0.7755654359172627, "grad_norm": 4.670611381530762, "learning_rate": 4.119314927253981e-06, "logits/chosen": 8.168377876281738, "logits/rejected": 4.10474157333374, "logps/chosen": -415.0027160644531, "logps/rejected": -208.18939208984375, "loss": 0.4912, "rewards/accuracies": 0.75, "rewards/chosen": 0.02573414519429207, "rewards/margins": 0.544352650642395, "rewards/rejected": -0.5186185240745544, "step": 5015 }, { "epoch": 0.7757200850570268, "grad_norm": 8.4817476272583, "learning_rate": 4.119028525604308e-06, "logits/chosen": 10.971458435058594, "logits/rejected": 5.737685680389404, "logps/chosen": -208.27609252929688, "logps/rejected": -202.10458374023438, "loss": 0.7532, "rewards/accuracies": 0.5, "rewards/chosen": -0.27731525897979736, "rewards/margins": 0.05599093437194824, "rewards/rejected": -0.3333061933517456, "step": 5016 }, { "epoch": 0.775874734196791, "grad_norm": 6.134255886077881, "learning_rate": 4.118742123954635e-06, "logits/chosen": 5.332054615020752, "logits/rejected": 6.37807559967041, "logps/chosen": -237.30072021484375, "logps/rejected": -304.5001525878906, "loss": 0.755, "rewards/accuracies": 0.625, "rewards/chosen": -0.0508432611823082, "rewards/margins": 0.05668017268180847, "rewards/rejected": -0.10752344131469727, "step": 5017 }, { "epoch": 0.7760293833365552, "grad_norm": 3.983471632003784, "learning_rate": 4.118455722304961e-06, "logits/chosen": 10.750752449035645, "logits/rejected": 18.0624942779541, "logps/chosen": -201.6811065673828, "logps/rejected": -286.57427978515625, "loss": 0.6429, "rewards/accuracies": 0.375, "rewards/chosen": 0.19274616241455078, "rewards/margins": 0.41438019275665283, "rewards/rejected": -0.22163397073745728, "step": 5018 }, { "epoch": 0.7761840324763194, "grad_norm": 5.100841045379639, "learning_rate": 4.118169320655288e-06, "logits/chosen": 5.152981758117676, "logits/rejected": 6.58890438079834, "logps/chosen": -255.8887481689453, "logps/rejected": -262.0158386230469, "loss": 0.7641, "rewards/accuracies": 0.625, "rewards/chosen": 0.09908465296030045, "rewards/margins": -0.06049796938896179, "rewards/rejected": 0.15958262979984283, "step": 5019 }, { "epoch": 0.7763386816160835, "grad_norm": 5.116940021514893, "learning_rate": 4.117882919005614e-06, "logits/chosen": 10.215417861938477, "logits/rejected": 12.812397956848145, "logps/chosen": -168.50875854492188, "logps/rejected": -220.5176544189453, "loss": 0.7592, "rewards/accuracies": 0.5, "rewards/chosen": 0.09314832836389542, "rewards/margins": 0.09575970470905304, "rewards/rejected": -0.0026113614439964294, "step": 5020 }, { "epoch": 0.7764933307558477, "grad_norm": 3.7714180946350098, "learning_rate": 4.11759651735594e-06, "logits/chosen": 10.354387283325195, "logits/rejected": 7.624279975891113, "logps/chosen": -207.99615478515625, "logps/rejected": -215.74276733398438, "loss": 0.4377, "rewards/accuracies": 0.875, "rewards/chosen": 0.2874392569065094, "rewards/margins": 0.6683759689331055, "rewards/rejected": -0.38093677163124084, "step": 5021 }, { "epoch": 0.7766479798956119, "grad_norm": 4.5336713790893555, "learning_rate": 4.117310115706267e-06, "logits/chosen": 11.726278305053711, "logits/rejected": 8.242114067077637, "logps/chosen": -283.8611145019531, "logps/rejected": -244.39337158203125, "loss": 0.5896, "rewards/accuracies": 0.5, "rewards/chosen": 0.17777138948440552, "rewards/margins": 0.3340108394622803, "rewards/rejected": -0.15623946487903595, "step": 5022 }, { "epoch": 0.776802629035376, "grad_norm": 6.319726467132568, "learning_rate": 4.117023714056594e-06, "logits/chosen": 9.771453857421875, "logits/rejected": 6.453716278076172, "logps/chosen": -332.0880126953125, "logps/rejected": -237.7469940185547, "loss": 0.696, "rewards/accuracies": 0.5, "rewards/chosen": 0.2392318844795227, "rewards/margins": 0.017061710357666016, "rewards/rejected": 0.2221701592206955, "step": 5023 }, { "epoch": 0.7769572781751402, "grad_norm": 5.381030082702637, "learning_rate": 4.11673731240692e-06, "logits/chosen": 8.992779731750488, "logits/rejected": 8.476510047912598, "logps/chosen": -145.55551147460938, "logps/rejected": -119.64647674560547, "loss": 0.7134, "rewards/accuracies": 0.375, "rewards/chosen": 0.10848407447338104, "rewards/margins": -0.013696298003196716, "rewards/rejected": 0.12218037247657776, "step": 5024 }, { "epoch": 0.7771119273149043, "grad_norm": 6.5061187744140625, "learning_rate": 4.116450910757246e-06, "logits/chosen": 9.034185409545898, "logits/rejected": 7.38123893737793, "logps/chosen": -318.8076477050781, "logps/rejected": -249.76145935058594, "loss": 0.7415, "rewards/accuracies": 0.5, "rewards/chosen": 0.12011967599391937, "rewards/margins": 0.07943274080753326, "rewards/rejected": 0.04068693518638611, "step": 5025 }, { "epoch": 0.7772665764546685, "grad_norm": 5.886760234832764, "learning_rate": 4.116164509107573e-06, "logits/chosen": 13.575149536132812, "logits/rejected": 9.485139846801758, "logps/chosen": -245.511474609375, "logps/rejected": -216.506103515625, "loss": 0.6335, "rewards/accuracies": 0.625, "rewards/chosen": -0.20794911682605743, "rewards/margins": 0.1715211421251297, "rewards/rejected": -0.3794702887535095, "step": 5026 }, { "epoch": 0.7774212255944326, "grad_norm": 5.398969650268555, "learning_rate": 4.115878107457899e-06, "logits/chosen": 9.968337059020996, "logits/rejected": 6.6889166831970215, "logps/chosen": -240.43557739257812, "logps/rejected": -259.589111328125, "loss": 0.6504, "rewards/accuracies": 0.625, "rewards/chosen": -0.09291433542966843, "rewards/margins": 0.2952841520309448, "rewards/rejected": -0.38819852471351624, "step": 5027 }, { "epoch": 0.7775758747341968, "grad_norm": 4.397029876708984, "learning_rate": 4.115591705808226e-06, "logits/chosen": 5.869007587432861, "logits/rejected": 3.8941993713378906, "logps/chosen": -176.54193115234375, "logps/rejected": -143.39376831054688, "loss": 0.6702, "rewards/accuracies": 0.5, "rewards/chosen": -0.0621388703584671, "rewards/margins": 0.08066572993993759, "rewards/rejected": -0.14280462265014648, "step": 5028 }, { "epoch": 0.7777305238739609, "grad_norm": 3.883955955505371, "learning_rate": 4.115305304158552e-06, "logits/chosen": 13.221551895141602, "logits/rejected": 6.875918388366699, "logps/chosen": -257.21484375, "logps/rejected": -227.89378356933594, "loss": 0.5043, "rewards/accuracies": 1.0, "rewards/chosen": 0.2855057716369629, "rewards/margins": 0.44030851125717163, "rewards/rejected": -0.15480273962020874, "step": 5029 }, { "epoch": 0.7778851730137251, "grad_norm": 7.649988174438477, "learning_rate": 4.1150189025088785e-06, "logits/chosen": 10.078450202941895, "logits/rejected": 10.438804626464844, "logps/chosen": -258.0928955078125, "logps/rejected": -238.04818725585938, "loss": 0.6456, "rewards/accuracies": 0.5, "rewards/chosen": 0.05472786724567413, "rewards/margins": 0.3163161277770996, "rewards/rejected": -0.26158827543258667, "step": 5030 }, { "epoch": 0.7780398221534893, "grad_norm": 5.158097267150879, "learning_rate": 4.114732500859205e-06, "logits/chosen": 10.367234230041504, "logits/rejected": 11.947220802307129, "logps/chosen": -205.2589111328125, "logps/rejected": -246.32557678222656, "loss": 0.7401, "rewards/accuracies": 0.5, "rewards/chosen": -0.22857901453971863, "rewards/margins": -0.0002719275653362274, "rewards/rejected": -0.2283070683479309, "step": 5031 }, { "epoch": 0.7781944712932535, "grad_norm": 13.855603218078613, "learning_rate": 4.114446099209532e-06, "logits/chosen": 4.8544087409973145, "logits/rejected": 1.3936400413513184, "logps/chosen": -322.1920166015625, "logps/rejected": -280.5462341308594, "loss": 0.6983, "rewards/accuracies": 0.625, "rewards/chosen": 0.19767610728740692, "rewards/margins": 0.03756190091371536, "rewards/rejected": 0.16011419892311096, "step": 5032 }, { "epoch": 0.7783491204330176, "grad_norm": 4.3317670822143555, "learning_rate": 4.1141596975598584e-06, "logits/chosen": 10.71391773223877, "logits/rejected": 13.785128593444824, "logps/chosen": -265.25970458984375, "logps/rejected": -237.08724975585938, "loss": 0.582, "rewards/accuracies": 0.875, "rewards/chosen": -0.13994871079921722, "rewards/margins": 0.397346556186676, "rewards/rejected": -0.5372952222824097, "step": 5033 }, { "epoch": 0.7785037695727818, "grad_norm": 4.697053909301758, "learning_rate": 4.113873295910184e-06, "logits/chosen": 12.264410972595215, "logits/rejected": 11.431119918823242, "logps/chosen": -206.22607421875, "logps/rejected": -189.2880859375, "loss": 0.6978, "rewards/accuracies": 0.25, "rewards/chosen": 0.10574106872081757, "rewards/margins": 0.11151537299156189, "rewards/rejected": -0.005774319171905518, "step": 5034 }, { "epoch": 0.7786584187125459, "grad_norm": 8.809879302978516, "learning_rate": 4.113586894260511e-06, "logits/chosen": 6.489687919616699, "logits/rejected": 14.318650245666504, "logps/chosen": -238.0576171875, "logps/rejected": -327.53082275390625, "loss": 0.916, "rewards/accuracies": 0.375, "rewards/chosen": -0.6989277601242065, "rewards/margins": -0.2902526259422302, "rewards/rejected": -0.40867507457733154, "step": 5035 }, { "epoch": 0.7788130678523101, "grad_norm": 5.562313079833984, "learning_rate": 4.1133004926108375e-06, "logits/chosen": 6.82518196105957, "logits/rejected": 8.63303279876709, "logps/chosen": -273.5740051269531, "logps/rejected": -287.4134521484375, "loss": 0.7571, "rewards/accuracies": 0.5, "rewards/chosen": -0.10549216717481613, "rewards/margins": -0.03956685587763786, "rewards/rejected": -0.06592531502246857, "step": 5036 }, { "epoch": 0.7789677169920742, "grad_norm": 6.7447733879089355, "learning_rate": 4.113014090961164e-06, "logits/chosen": 9.13759708404541, "logits/rejected": 7.33263635635376, "logps/chosen": -317.3335876464844, "logps/rejected": -226.57843017578125, "loss": 0.6594, "rewards/accuracies": 0.375, "rewards/chosen": 0.23781397938728333, "rewards/margins": 0.34381064772605896, "rewards/rejected": -0.10599665343761444, "step": 5037 }, { "epoch": 0.7791223661318384, "grad_norm": 4.94521951675415, "learning_rate": 4.112727689311491e-06, "logits/chosen": 9.293581008911133, "logits/rejected": 7.0353474617004395, "logps/chosen": -171.9024658203125, "logps/rejected": -169.03271484375, "loss": 0.7697, "rewards/accuracies": 0.375, "rewards/chosen": -0.2942078411579132, "rewards/margins": -0.08927765488624573, "rewards/rejected": -0.20493021607398987, "step": 5038 }, { "epoch": 0.7792770152716025, "grad_norm": 4.07720422744751, "learning_rate": 4.1124412876618175e-06, "logits/chosen": 11.395575523376465, "logits/rejected": 5.534848690032959, "logps/chosen": -431.73687744140625, "logps/rejected": -300.9872741699219, "loss": 0.493, "rewards/accuracies": 1.0, "rewards/chosen": 0.45613712072372437, "rewards/margins": 0.5238387584686279, "rewards/rejected": -0.06770157814025879, "step": 5039 }, { "epoch": 0.7794316644113667, "grad_norm": 6.103538513183594, "learning_rate": 4.112154886012143e-06, "logits/chosen": 12.989797592163086, "logits/rejected": 8.313202857971191, "logps/chosen": -272.17584228515625, "logps/rejected": -225.73150634765625, "loss": 0.6352, "rewards/accuracies": 0.75, "rewards/chosen": 0.21702295541763306, "rewards/margins": 0.21988089382648468, "rewards/rejected": -0.0028579290956258774, "step": 5040 }, { "epoch": 0.7795863135511308, "grad_norm": 5.951514720916748, "learning_rate": 4.11186848436247e-06, "logits/chosen": 16.44672966003418, "logits/rejected": 13.903814315795898, "logps/chosen": -333.8041687011719, "logps/rejected": -317.7332763671875, "loss": 0.6947, "rewards/accuracies": 0.375, "rewards/chosen": 0.04458990693092346, "rewards/margins": 0.17886611819267273, "rewards/rejected": -0.13427621126174927, "step": 5041 }, { "epoch": 0.779740962690895, "grad_norm": 4.669124603271484, "learning_rate": 4.111582082712797e-06, "logits/chosen": 10.363603591918945, "logits/rejected": 4.938765525817871, "logps/chosen": -409.20855712890625, "logps/rejected": -240.4298553466797, "loss": 0.5331, "rewards/accuracies": 0.75, "rewards/chosen": 0.10699347406625748, "rewards/margins": 0.4335766136646271, "rewards/rejected": -0.3265831470489502, "step": 5042 }, { "epoch": 0.7798956118306591, "grad_norm": 5.747920513153076, "learning_rate": 4.111295681063123e-06, "logits/chosen": 16.73120880126953, "logits/rejected": 13.110562324523926, "logps/chosen": -293.6449279785156, "logps/rejected": -222.78025817871094, "loss": 0.7022, "rewards/accuracies": 0.375, "rewards/chosen": 0.27120092511177063, "rewards/margins": 0.1408054679632187, "rewards/rejected": 0.13039547204971313, "step": 5043 }, { "epoch": 0.7800502609704234, "grad_norm": 5.237953186035156, "learning_rate": 4.11100927941345e-06, "logits/chosen": 15.620123863220215, "logits/rejected": 10.183158874511719, "logps/chosen": -330.70416259765625, "logps/rejected": -286.80572509765625, "loss": 0.6073, "rewards/accuracies": 0.625, "rewards/chosen": 0.312949538230896, "rewards/margins": 0.25471144914627075, "rewards/rejected": 0.058238133788108826, "step": 5044 }, { "epoch": 0.7802049101101876, "grad_norm": 6.788024425506592, "learning_rate": 4.1107228777637766e-06, "logits/chosen": 13.35718059539795, "logits/rejected": 6.806447982788086, "logps/chosen": -387.2668762207031, "logps/rejected": -296.7503356933594, "loss": 0.5601, "rewards/accuracies": 0.625, "rewards/chosen": 0.24102340638637543, "rewards/margins": 0.4665447175502777, "rewards/rejected": -0.22552131116390228, "step": 5045 }, { "epoch": 0.7803595592499517, "grad_norm": 4.676558494567871, "learning_rate": 4.110436476114103e-06, "logits/chosen": 12.208867073059082, "logits/rejected": 8.325477600097656, "logps/chosen": -392.06427001953125, "logps/rejected": -307.0189514160156, "loss": 0.4081, "rewards/accuracies": 0.875, "rewards/chosen": 0.42378321290016174, "rewards/margins": 1.0710327625274658, "rewards/rejected": -0.6472495198249817, "step": 5046 }, { "epoch": 0.7805142083897159, "grad_norm": 5.9772443771362305, "learning_rate": 4.110150074464429e-06, "logits/chosen": 2.1557722091674805, "logits/rejected": 5.11979866027832, "logps/chosen": -220.51300048828125, "logps/rejected": -247.05026245117188, "loss": 0.6078, "rewards/accuracies": 0.625, "rewards/chosen": 0.13328826427459717, "rewards/margins": 0.20535486936569214, "rewards/rejected": -0.07206659018993378, "step": 5047 }, { "epoch": 0.78066885752948, "grad_norm": 7.3887200355529785, "learning_rate": 4.109863672814756e-06, "logits/chosen": 10.722755432128906, "logits/rejected": 8.231695175170898, "logps/chosen": -267.7508544921875, "logps/rejected": -228.03750610351562, "loss": 0.7786, "rewards/accuracies": 0.5, "rewards/chosen": -0.30812105536460876, "rewards/margins": -0.04417410120368004, "rewards/rejected": -0.2639469504356384, "step": 5048 }, { "epoch": 0.7808235066692442, "grad_norm": 4.58232307434082, "learning_rate": 4.109577271165082e-06, "logits/chosen": 11.72867202758789, "logits/rejected": -2.089128255844116, "logps/chosen": -307.6416015625, "logps/rejected": -166.57406616210938, "loss": 0.5575, "rewards/accuracies": 0.75, "rewards/chosen": 0.1341150403022766, "rewards/margins": 0.4138139486312866, "rewards/rejected": -0.2796989679336548, "step": 5049 }, { "epoch": 0.7809781558090083, "grad_norm": 7.005514621734619, "learning_rate": 4.109290869515409e-06, "logits/chosen": 15.507721900939941, "logits/rejected": 9.484916687011719, "logps/chosen": -319.23626708984375, "logps/rejected": -262.30194091796875, "loss": 0.6408, "rewards/accuracies": 0.625, "rewards/chosen": 0.11864430457353592, "rewards/margins": 0.15886190533638, "rewards/rejected": -0.04021759331226349, "step": 5050 }, { "epoch": 0.7811328049487725, "grad_norm": 6.510275840759277, "learning_rate": 4.109004467865736e-06, "logits/chosen": 8.56306266784668, "logits/rejected": 6.801578521728516, "logps/chosen": -410.47222900390625, "logps/rejected": -324.2342834472656, "loss": 0.5009, "rewards/accuracies": 0.75, "rewards/chosen": 0.605204164981842, "rewards/margins": 0.5338374376296997, "rewards/rejected": 0.07136678695678711, "step": 5051 }, { "epoch": 0.7812874540885366, "grad_norm": 3.914299488067627, "learning_rate": 4.108718066216062e-06, "logits/chosen": 10.392654418945312, "logits/rejected": 5.892790794372559, "logps/chosen": -228.93362426757812, "logps/rejected": -176.40707397460938, "loss": 0.6022, "rewards/accuracies": 0.75, "rewards/chosen": 0.47347593307495117, "rewards/margins": 0.2508716285228729, "rewards/rejected": 0.22260427474975586, "step": 5052 }, { "epoch": 0.7814421032283008, "grad_norm": 5.765299320220947, "learning_rate": 4.108431664566388e-06, "logits/chosen": 5.827978134155273, "logits/rejected": 8.501371383666992, "logps/chosen": -219.8353271484375, "logps/rejected": -256.9482727050781, "loss": 0.6881, "rewards/accuracies": 0.5, "rewards/chosen": 0.07321976870298386, "rewards/margins": 0.068837970495224, "rewards/rejected": 0.004381801933050156, "step": 5053 }, { "epoch": 0.7815967523680649, "grad_norm": 5.2162675857543945, "learning_rate": 4.108145262916715e-06, "logits/chosen": 9.194456100463867, "logits/rejected": 3.763007164001465, "logps/chosen": -386.1903381347656, "logps/rejected": -235.15545654296875, "loss": 0.608, "rewards/accuracies": 0.625, "rewards/chosen": 0.37361860275268555, "rewards/margins": 0.30699610710144043, "rewards/rejected": 0.06662251055240631, "step": 5054 }, { "epoch": 0.7817514015078291, "grad_norm": 5.34047794342041, "learning_rate": 4.107858861267041e-06, "logits/chosen": 6.982151985168457, "logits/rejected": 10.637344360351562, "logps/chosen": -189.2821502685547, "logps/rejected": -211.65943908691406, "loss": 0.7051, "rewards/accuracies": 0.625, "rewards/chosen": -0.06354260444641113, "rewards/margins": 0.2144639641046524, "rewards/rejected": -0.2780066132545471, "step": 5055 }, { "epoch": 0.7819060506475932, "grad_norm": 4.682369709014893, "learning_rate": 4.107572459617368e-06, "logits/chosen": 14.726280212402344, "logits/rejected": 8.44631290435791, "logps/chosen": -397.2965393066406, "logps/rejected": -244.98489379882812, "loss": 0.4395, "rewards/accuracies": 0.875, "rewards/chosen": 0.4837670624256134, "rewards/margins": 0.6795926094055176, "rewards/rejected": -0.19582557678222656, "step": 5056 }, { "epoch": 0.7820606997873575, "grad_norm": 3.8550431728363037, "learning_rate": 4.107286057967695e-06, "logits/chosen": 8.757742881774902, "logits/rejected": 1.8678923845291138, "logps/chosen": -263.2114562988281, "logps/rejected": -202.33299255371094, "loss": 0.5168, "rewards/accuracies": 0.75, "rewards/chosen": 0.23992905020713806, "rewards/margins": 0.5675321817398071, "rewards/rejected": -0.32760316133499146, "step": 5057 }, { "epoch": 0.7822153489271216, "grad_norm": 5.130674362182617, "learning_rate": 4.1069996563180205e-06, "logits/chosen": 14.660371780395508, "logits/rejected": 7.732184410095215, "logps/chosen": -340.6009826660156, "logps/rejected": -209.63592529296875, "loss": 0.735, "rewards/accuracies": 0.5, "rewards/chosen": 0.04329577833414078, "rewards/margins": 0.06805667281150818, "rewards/rejected": -0.024760913103818893, "step": 5058 }, { "epoch": 0.7823699980668858, "grad_norm": 5.810223579406738, "learning_rate": 4.106713254668347e-06, "logits/chosen": 12.289148330688477, "logits/rejected": 10.763877868652344, "logps/chosen": -275.8639831542969, "logps/rejected": -186.7761993408203, "loss": 0.6325, "rewards/accuracies": 0.5, "rewards/chosen": -0.09041323512792587, "rewards/margins": 0.33413732051849365, "rewards/rejected": -0.42455050349235535, "step": 5059 }, { "epoch": 0.7825246472066499, "grad_norm": 5.3058977127075195, "learning_rate": 4.106426853018674e-06, "logits/chosen": 8.109975814819336, "logits/rejected": 15.741321563720703, "logps/chosen": -267.0071716308594, "logps/rejected": -313.7628479003906, "loss": 0.7171, "rewards/accuracies": 0.375, "rewards/chosen": -0.08863870799541473, "rewards/margins": 0.036476440727710724, "rewards/rejected": -0.12511512637138367, "step": 5060 }, { "epoch": 0.7826792963464141, "grad_norm": 6.758449077606201, "learning_rate": 4.106140451369e-06, "logits/chosen": 10.126361846923828, "logits/rejected": 10.134641647338867, "logps/chosen": -277.5089416503906, "logps/rejected": -410.3080749511719, "loss": 0.8158, "rewards/accuracies": 0.5, "rewards/chosen": 0.1090768575668335, "rewards/margins": -0.08879843354225159, "rewards/rejected": 0.19787532091140747, "step": 5061 }, { "epoch": 0.7828339454861782, "grad_norm": 4.2877020835876465, "learning_rate": 4.105854049719327e-06, "logits/chosen": 9.756173133850098, "logits/rejected": 7.455391883850098, "logps/chosen": -276.5301513671875, "logps/rejected": -298.6921081542969, "loss": 0.4046, "rewards/accuracies": 0.875, "rewards/chosen": 0.09428445249795914, "rewards/margins": 0.8458055257797241, "rewards/rejected": -0.751521110534668, "step": 5062 }, { "epoch": 0.7829885946259424, "grad_norm": 6.585441589355469, "learning_rate": 4.105567648069653e-06, "logits/chosen": 9.223259925842285, "logits/rejected": 2.761556625366211, "logps/chosen": -285.99322509765625, "logps/rejected": -200.97621154785156, "loss": 0.6225, "rewards/accuracies": 0.625, "rewards/chosen": 0.24313923716545105, "rewards/margins": 0.35347050428390503, "rewards/rejected": -0.11033125221729279, "step": 5063 }, { "epoch": 0.7831432437657065, "grad_norm": 4.626454830169678, "learning_rate": 4.1052812464199795e-06, "logits/chosen": 16.85995864868164, "logits/rejected": 13.843454360961914, "logps/chosen": -341.9322509765625, "logps/rejected": -238.6962890625, "loss": 0.5362, "rewards/accuracies": 0.75, "rewards/chosen": -0.08032150566577911, "rewards/margins": 0.44775381684303284, "rewards/rejected": -0.5280753374099731, "step": 5064 }, { "epoch": 0.7832978929054707, "grad_norm": 6.047361850738525, "learning_rate": 4.104994844770306e-06, "logits/chosen": 11.13350772857666, "logits/rejected": 10.96049976348877, "logps/chosen": -196.30206298828125, "logps/rejected": -162.70474243164062, "loss": 0.7814, "rewards/accuracies": 0.375, "rewards/chosen": -0.15369367599487305, "rewards/margins": -0.06952162086963654, "rewards/rejected": -0.08417205512523651, "step": 5065 }, { "epoch": 0.7834525420452348, "grad_norm": 4.253897190093994, "learning_rate": 4.104708443120633e-06, "logits/chosen": 10.387052536010742, "logits/rejected": 5.422173500061035, "logps/chosen": -204.311279296875, "logps/rejected": -147.24813842773438, "loss": 0.6118, "rewards/accuracies": 0.75, "rewards/chosen": -0.10736402869224548, "rewards/margins": 0.255039781332016, "rewards/rejected": -0.3624038100242615, "step": 5066 }, { "epoch": 0.783607191184999, "grad_norm": 6.538941860198975, "learning_rate": 4.104422041470959e-06, "logits/chosen": 13.011324882507324, "logits/rejected": 7.646974563598633, "logps/chosen": -260.91448974609375, "logps/rejected": -188.34507751464844, "loss": 0.8237, "rewards/accuracies": 0.375, "rewards/chosen": 0.31144633889198303, "rewards/margins": -0.0360245555639267, "rewards/rejected": 0.3474709093570709, "step": 5067 }, { "epoch": 0.7837618403247631, "grad_norm": 5.236286163330078, "learning_rate": 4.104135639821285e-06, "logits/chosen": 6.552850723266602, "logits/rejected": 10.216680526733398, "logps/chosen": -174.12399291992188, "logps/rejected": -217.28062438964844, "loss": 0.6509, "rewards/accuracies": 0.625, "rewards/chosen": 0.3650899827480316, "rewards/margins": 0.11579195410013199, "rewards/rejected": 0.24929803609848022, "step": 5068 }, { "epoch": 0.7839164894645273, "grad_norm": 8.082655906677246, "learning_rate": 4.103849238171612e-06, "logits/chosen": 1.183900237083435, "logits/rejected": 10.036355018615723, "logps/chosen": -139.90013122558594, "logps/rejected": -330.72607421875, "loss": 0.6743, "rewards/accuracies": 0.5, "rewards/chosen": 0.2600708603858948, "rewards/margins": 0.3249843716621399, "rewards/rejected": -0.06491345912218094, "step": 5069 }, { "epoch": 0.7840711386042916, "grad_norm": 3.6693685054779053, "learning_rate": 4.103562836521939e-06, "logits/chosen": 6.519768238067627, "logits/rejected": 8.200125694274902, "logps/chosen": -176.1629638671875, "logps/rejected": -200.93399047851562, "loss": 0.5644, "rewards/accuracies": 0.75, "rewards/chosen": 0.42160022258758545, "rewards/margins": 0.35430848598480225, "rewards/rejected": 0.06729169934988022, "step": 5070 }, { "epoch": 0.7842257877440557, "grad_norm": 5.75965690612793, "learning_rate": 4.103276434872265e-06, "logits/chosen": 7.22062873840332, "logits/rejected": 11.40235710144043, "logps/chosen": -214.06446838378906, "logps/rejected": -284.55316162109375, "loss": 0.7208, "rewards/accuracies": 0.5, "rewards/chosen": 0.1937389373779297, "rewards/margins": -0.023232292383909225, "rewards/rejected": 0.21697121858596802, "step": 5071 }, { "epoch": 0.7843804368838199, "grad_norm": 5.923933506011963, "learning_rate": 4.102990033222592e-06, "logits/chosen": 11.293485641479492, "logits/rejected": 11.829269409179688, "logps/chosen": -268.4728698730469, "logps/rejected": -235.5037384033203, "loss": 0.7037, "rewards/accuracies": 0.625, "rewards/chosen": 0.030789926648139954, "rewards/margins": 0.024275191128253937, "rewards/rejected": 0.006514742970466614, "step": 5072 }, { "epoch": 0.784535086023584, "grad_norm": 5.9903764724731445, "learning_rate": 4.102703631572918e-06, "logits/chosen": 12.125068664550781, "logits/rejected": 4.556142807006836, "logps/chosen": -306.804931640625, "logps/rejected": -213.5801239013672, "loss": 0.6845, "rewards/accuracies": 0.625, "rewards/chosen": 0.08387833833694458, "rewards/margins": 0.22682486474514008, "rewards/rejected": -0.1429465264081955, "step": 5073 }, { "epoch": 0.7846897351633482, "grad_norm": 8.062975883483887, "learning_rate": 4.102417229923244e-06, "logits/chosen": 7.6582231521606445, "logits/rejected": 10.406158447265625, "logps/chosen": -362.16339111328125, "logps/rejected": -389.4421081542969, "loss": 0.8861, "rewards/accuracies": 0.375, "rewards/chosen": -0.22911453247070312, "rewards/margins": -0.24018210172653198, "rewards/rejected": 0.011067569255828857, "step": 5074 }, { "epoch": 0.7848443843031123, "grad_norm": 4.826886177062988, "learning_rate": 4.102130828273571e-06, "logits/chosen": 13.163078308105469, "logits/rejected": 6.655034065246582, "logps/chosen": -291.2995910644531, "logps/rejected": -205.60719299316406, "loss": 0.513, "rewards/accuracies": 0.75, "rewards/chosen": -0.07948438078165054, "rewards/margins": 0.5354306697845459, "rewards/rejected": -0.6149150133132935, "step": 5075 }, { "epoch": 0.7849990334428765, "grad_norm": 5.940065383911133, "learning_rate": 4.101844426623898e-06, "logits/chosen": 6.081146240234375, "logits/rejected": 8.027899742126465, "logps/chosen": -243.90274047851562, "logps/rejected": -253.07562255859375, "loss": 0.6405, "rewards/accuracies": 0.5, "rewards/chosen": -0.2440074384212494, "rewards/margins": 0.1856367290019989, "rewards/rejected": -0.42964422702789307, "step": 5076 }, { "epoch": 0.7851536825826406, "grad_norm": 4.854711532592773, "learning_rate": 4.101558024974224e-06, "logits/chosen": 10.162113189697266, "logits/rejected": 7.421090126037598, "logps/chosen": -240.82421875, "logps/rejected": -232.93780517578125, "loss": 0.5164, "rewards/accuracies": 0.875, "rewards/chosen": -0.08275684714317322, "rewards/margins": 0.4421387314796448, "rewards/rejected": -0.5248955488204956, "step": 5077 }, { "epoch": 0.7853083317224048, "grad_norm": 5.007492542266846, "learning_rate": 4.101271623324551e-06, "logits/chosen": 11.49973201751709, "logits/rejected": 12.11457633972168, "logps/chosen": -232.97732543945312, "logps/rejected": -226.9874267578125, "loss": 0.5829, "rewards/accuracies": 0.75, "rewards/chosen": 0.1904737502336502, "rewards/margins": 0.27222391963005066, "rewards/rejected": -0.08175017684698105, "step": 5078 }, { "epoch": 0.7854629808621689, "grad_norm": 4.608596324920654, "learning_rate": 4.100985221674878e-06, "logits/chosen": 10.496490478515625, "logits/rejected": 9.294227600097656, "logps/chosen": -182.33740234375, "logps/rejected": -103.50914001464844, "loss": 0.762, "rewards/accuracies": 0.5, "rewards/chosen": -0.2902719974517822, "rewards/margins": 0.05885876715183258, "rewards/rejected": -0.3491307497024536, "step": 5079 }, { "epoch": 0.7856176300019331, "grad_norm": 5.733582496643066, "learning_rate": 4.100698820025203e-06, "logits/chosen": 10.298238754272461, "logits/rejected": 4.564059734344482, "logps/chosen": -342.3865051269531, "logps/rejected": -224.36880493164062, "loss": 0.5779, "rewards/accuracies": 0.75, "rewards/chosen": 0.42482495307922363, "rewards/margins": 0.5499775409698486, "rewards/rejected": -0.125152587890625, "step": 5080 }, { "epoch": 0.7857722791416972, "grad_norm": 5.319206714630127, "learning_rate": 4.10041241837553e-06, "logits/chosen": 7.999185085296631, "logits/rejected": 3.193298101425171, "logps/chosen": -235.6840057373047, "logps/rejected": -216.12484741210938, "loss": 0.7459, "rewards/accuracies": 0.5, "rewards/chosen": -0.21852310001850128, "rewards/margins": 0.06859800219535828, "rewards/rejected": -0.28712111711502075, "step": 5081 }, { "epoch": 0.7859269282814615, "grad_norm": 4.088825225830078, "learning_rate": 4.100126016725857e-06, "logits/chosen": 12.149977684020996, "logits/rejected": 11.96504020690918, "logps/chosen": -165.03765869140625, "logps/rejected": -161.04461669921875, "loss": 0.6042, "rewards/accuracies": 0.75, "rewards/chosen": 0.09357132017612457, "rewards/margins": 0.3030185401439667, "rewards/rejected": -0.20944717526435852, "step": 5082 }, { "epoch": 0.7860815774212256, "grad_norm": 4.019356727600098, "learning_rate": 4.099839615076183e-06, "logits/chosen": 14.932348251342773, "logits/rejected": 7.120345115661621, "logps/chosen": -353.8023376464844, "logps/rejected": -201.31927490234375, "loss": 0.4418, "rewards/accuracies": 0.875, "rewards/chosen": 0.24206753075122833, "rewards/margins": 0.7625024318695068, "rewards/rejected": -0.5204348564147949, "step": 5083 }, { "epoch": 0.7862362265609898, "grad_norm": 5.034538269042969, "learning_rate": 4.09955321342651e-06, "logits/chosen": 10.297540664672852, "logits/rejected": 6.178125381469727, "logps/chosen": -334.7886962890625, "logps/rejected": -218.8343048095703, "loss": 0.5267, "rewards/accuracies": 0.625, "rewards/chosen": 0.44965869188308716, "rewards/margins": 0.44327741861343384, "rewards/rejected": 0.006381265819072723, "step": 5084 }, { "epoch": 0.7863908757007539, "grad_norm": 4.640108108520508, "learning_rate": 4.099266811776837e-06, "logits/chosen": 9.456334114074707, "logits/rejected": 5.47445821762085, "logps/chosen": -231.6034393310547, "logps/rejected": -152.19415283203125, "loss": 0.6264, "rewards/accuracies": 0.875, "rewards/chosen": 0.06949248909950256, "rewards/margins": 0.34694284200668335, "rewards/rejected": -0.2774503827095032, "step": 5085 }, { "epoch": 0.7865455248405181, "grad_norm": 4.491507053375244, "learning_rate": 4.0989804101271624e-06, "logits/chosen": 6.749477386474609, "logits/rejected": 1.9619042873382568, "logps/chosen": -241.92921447753906, "logps/rejected": -213.1754608154297, "loss": 0.5349, "rewards/accuracies": 0.75, "rewards/chosen": 0.2999480068683624, "rewards/margins": 0.6797693967819214, "rewards/rejected": -0.37982141971588135, "step": 5086 }, { "epoch": 0.7867001739802822, "grad_norm": 6.806582450866699, "learning_rate": 4.098694008477489e-06, "logits/chosen": 15.392090797424316, "logits/rejected": 5.7103705406188965, "logps/chosen": -442.79656982421875, "logps/rejected": -319.1934814453125, "loss": 0.752, "rewards/accuracies": 0.625, "rewards/chosen": 0.0019940361380577087, "rewards/margins": 0.27102822065353394, "rewards/rejected": -0.26903417706489563, "step": 5087 }, { "epoch": 0.7868548231200464, "grad_norm": 4.586902141571045, "learning_rate": 4.098407606827816e-06, "logits/chosen": 12.856761932373047, "logits/rejected": 10.159149169921875, "logps/chosen": -318.08782958984375, "logps/rejected": -234.09378051757812, "loss": 0.5427, "rewards/accuracies": 0.75, "rewards/chosen": 0.035257622599601746, "rewards/margins": 0.37344610691070557, "rewards/rejected": -0.338188499212265, "step": 5088 }, { "epoch": 0.7870094722598105, "grad_norm": 7.811407089233398, "learning_rate": 4.098121205178142e-06, "logits/chosen": 4.290340900421143, "logits/rejected": 5.923408031463623, "logps/chosen": -285.6490478515625, "logps/rejected": -298.68511962890625, "loss": 0.792, "rewards/accuracies": 0.5, "rewards/chosen": -0.13546524941921234, "rewards/margins": -0.04209838807582855, "rewards/rejected": -0.0933668464422226, "step": 5089 }, { "epoch": 0.7871641213995747, "grad_norm": 4.338775634765625, "learning_rate": 4.097834803528469e-06, "logits/chosen": 16.456586837768555, "logits/rejected": 6.50148868560791, "logps/chosen": -213.60464477539062, "logps/rejected": -145.39422607421875, "loss": 0.588, "rewards/accuracies": 0.5, "rewards/chosen": 0.018607236444950104, "rewards/margins": 0.30714306235313416, "rewards/rejected": -0.28853580355644226, "step": 5090 }, { "epoch": 0.7873187705393389, "grad_norm": 5.082007884979248, "learning_rate": 4.097548401878796e-06, "logits/chosen": 7.140385627746582, "logits/rejected": 4.2235307693481445, "logps/chosen": -301.2499084472656, "logps/rejected": -160.086181640625, "loss": 0.5724, "rewards/accuracies": 0.625, "rewards/chosen": 0.11972412467002869, "rewards/margins": 0.43268856406211853, "rewards/rejected": -0.31296437978744507, "step": 5091 }, { "epoch": 0.787473419679103, "grad_norm": 4.049871444702148, "learning_rate": 4.0972620002291215e-06, "logits/chosen": 9.526273727416992, "logits/rejected": 6.412437438964844, "logps/chosen": -380.4332275390625, "logps/rejected": -270.4364013671875, "loss": 0.5365, "rewards/accuracies": 0.625, "rewards/chosen": 0.4613839089870453, "rewards/margins": 0.615372896194458, "rewards/rejected": -0.15398894250392914, "step": 5092 }, { "epoch": 0.7876280688188672, "grad_norm": 9.423508644104004, "learning_rate": 4.096975598579448e-06, "logits/chosen": 7.798299789428711, "logits/rejected": 10.19260025024414, "logps/chosen": -607.5639038085938, "logps/rejected": -751.56982421875, "loss": 1.0192, "rewards/accuracies": 0.125, "rewards/chosen": -0.07918519526720047, "rewards/margins": -0.45033693313598633, "rewards/rejected": 0.37115171551704407, "step": 5093 }, { "epoch": 0.7877827179586313, "grad_norm": 3.54980206489563, "learning_rate": 4.096689196929775e-06, "logits/chosen": 10.68482780456543, "logits/rejected": 2.815458059310913, "logps/chosen": -336.3044128417969, "logps/rejected": -210.94725036621094, "loss": 0.3275, "rewards/accuracies": 1.0, "rewards/chosen": 0.9575498104095459, "rewards/margins": 1.014569878578186, "rewards/rejected": -0.057019997388124466, "step": 5094 }, { "epoch": 0.7879373670983956, "grad_norm": 9.095545768737793, "learning_rate": 4.0964027952801015e-06, "logits/chosen": 12.557228088378906, "logits/rejected": 3.9230995178222656, "logps/chosen": -305.9664001464844, "logps/rejected": -194.7684326171875, "loss": 0.793, "rewards/accuracies": 0.625, "rewards/chosen": 0.0077245235443115234, "rewards/margins": 0.0496039092540741, "rewards/rejected": -0.04187939688563347, "step": 5095 }, { "epoch": 0.7880920162381597, "grad_norm": 4.574399948120117, "learning_rate": 4.096116393630427e-06, "logits/chosen": 11.70435905456543, "logits/rejected": 10.439863204956055, "logps/chosen": -288.5580139160156, "logps/rejected": -218.48220825195312, "loss": 0.5238, "rewards/accuracies": 0.75, "rewards/chosen": 0.6886293888092041, "rewards/margins": 0.6195163726806641, "rewards/rejected": 0.06911302357912064, "step": 5096 }, { "epoch": 0.7882466653779239, "grad_norm": 4.24513053894043, "learning_rate": 4.095829991980754e-06, "logits/chosen": 12.63845157623291, "logits/rejected": 10.504179000854492, "logps/chosen": -288.8387756347656, "logps/rejected": -195.49526977539062, "loss": 0.5844, "rewards/accuracies": 0.625, "rewards/chosen": 0.003933288156986237, "rewards/margins": 0.3645409941673279, "rewards/rejected": -0.36060771346092224, "step": 5097 }, { "epoch": 0.788401314517688, "grad_norm": 4.105783462524414, "learning_rate": 4.0955435903310806e-06, "logits/chosen": 9.760673522949219, "logits/rejected": 7.60066032409668, "logps/chosen": -224.1059112548828, "logps/rejected": -147.4083251953125, "loss": 0.6594, "rewards/accuracies": 0.75, "rewards/chosen": 0.1093551516532898, "rewards/margins": 0.08854524791240692, "rewards/rejected": 0.020809918642044067, "step": 5098 }, { "epoch": 0.7885559636574522, "grad_norm": 5.452014923095703, "learning_rate": 4.095257188681407e-06, "logits/chosen": 5.918210029602051, "logits/rejected": 10.329328536987305, "logps/chosen": -227.80001831054688, "logps/rejected": -229.72103881835938, "loss": 0.5554, "rewards/accuracies": 0.875, "rewards/chosen": 0.37150678038597107, "rewards/margins": 0.3152625560760498, "rewards/rejected": 0.05624423176050186, "step": 5099 }, { "epoch": 0.7887106127972163, "grad_norm": 7.25813102722168, "learning_rate": 4.094970787031734e-06, "logits/chosen": 7.437568664550781, "logits/rejected": 8.877620697021484, "logps/chosen": -190.92784118652344, "logps/rejected": -252.36814880371094, "loss": 0.8061, "rewards/accuracies": 0.5, "rewards/chosen": -0.05465591326355934, "rewards/margins": -0.14600211381912231, "rewards/rejected": 0.09134618192911148, "step": 5100 }, { "epoch": 0.7888652619369805, "grad_norm": 3.238044261932373, "learning_rate": 4.09468438538206e-06, "logits/chosen": -0.017635047435760498, "logits/rejected": 9.20185661315918, "logps/chosen": -136.63595581054688, "logps/rejected": -190.55198669433594, "loss": 0.446, "rewards/accuracies": 0.875, "rewards/chosen": 0.3917364478111267, "rewards/margins": 0.644311249256134, "rewards/rejected": -0.2525748014450073, "step": 5101 }, { "epoch": 0.7890199110767446, "grad_norm": 3.8613359928131104, "learning_rate": 4.094397983732386e-06, "logits/chosen": 8.534120559692383, "logits/rejected": 1.7898999452590942, "logps/chosen": -237.05540466308594, "logps/rejected": -130.94361877441406, "loss": 0.5233, "rewards/accuracies": 0.875, "rewards/chosen": 0.6398761868476868, "rewards/margins": 0.3905426561832428, "rewards/rejected": 0.24933350086212158, "step": 5102 }, { "epoch": 0.7891745602165088, "grad_norm": 4.5816497802734375, "learning_rate": 4.094111582082713e-06, "logits/chosen": 10.0445556640625, "logits/rejected": 8.66766357421875, "logps/chosen": -223.98605346679688, "logps/rejected": -252.81219482421875, "loss": 0.643, "rewards/accuracies": 0.625, "rewards/chosen": 0.4634278118610382, "rewards/margins": 0.22173044085502625, "rewards/rejected": 0.24169737100601196, "step": 5103 }, { "epoch": 0.7893292093562729, "grad_norm": 4.215822219848633, "learning_rate": 4.09382518043304e-06, "logits/chosen": 14.475205421447754, "logits/rejected": 4.325428009033203, "logps/chosen": -317.0892333984375, "logps/rejected": -163.09774780273438, "loss": 0.5252, "rewards/accuracies": 0.75, "rewards/chosen": 0.6705270409584045, "rewards/margins": 0.9162802696228027, "rewards/rejected": -0.24575325846672058, "step": 5104 }, { "epoch": 0.7894838584960371, "grad_norm": 5.75499963760376, "learning_rate": 4.093538778783366e-06, "logits/chosen": 9.2464017868042, "logits/rejected": 14.420654296875, "logps/chosen": -211.19117736816406, "logps/rejected": -287.4402770996094, "loss": 0.7857, "rewards/accuracies": 0.25, "rewards/chosen": 0.037461668252944946, "rewards/margins": -0.08721613138914108, "rewards/rejected": 0.12467780709266663, "step": 5105 }, { "epoch": 0.7896385076358012, "grad_norm": 4.349009037017822, "learning_rate": 4.093252377133692e-06, "logits/chosen": 5.422112941741943, "logits/rejected": 6.471564292907715, "logps/chosen": -168.31227111816406, "logps/rejected": -185.10122680664062, "loss": 0.7265, "rewards/accuracies": 0.5, "rewards/chosen": 0.2771903872489929, "rewards/margins": -0.017520040273666382, "rewards/rejected": 0.2947104275226593, "step": 5106 }, { "epoch": 0.7897931567755654, "grad_norm": 5.1660332679748535, "learning_rate": 4.092965975484019e-06, "logits/chosen": 11.746912002563477, "logits/rejected": 5.832705974578857, "logps/chosen": -399.9925842285156, "logps/rejected": -297.90032958984375, "loss": 0.6077, "rewards/accuracies": 0.625, "rewards/chosen": 0.7811201214790344, "rewards/margins": 0.24734467267990112, "rewards/rejected": 0.5337754487991333, "step": 5107 }, { "epoch": 0.7899478059153296, "grad_norm": 4.787499904632568, "learning_rate": 4.092679573834345e-06, "logits/chosen": 10.443513870239258, "logits/rejected": 9.022887229919434, "logps/chosen": -380.3749084472656, "logps/rejected": -300.4557800292969, "loss": 0.5348, "rewards/accuracies": 0.875, "rewards/chosen": 0.7033333778381348, "rewards/margins": 0.40314429998397827, "rewards/rejected": 0.3001891076564789, "step": 5108 }, { "epoch": 0.7901024550550938, "grad_norm": 5.681580066680908, "learning_rate": 4.092393172184672e-06, "logits/chosen": 13.270523071289062, "logits/rejected": 14.695281982421875, "logps/chosen": -329.2462158203125, "logps/rejected": -335.0748291015625, "loss": 0.6725, "rewards/accuracies": 0.75, "rewards/chosen": 0.2909242510795593, "rewards/margins": 0.21132926642894745, "rewards/rejected": 0.07959498465061188, "step": 5109 }, { "epoch": 0.790257104194858, "grad_norm": 5.15778923034668, "learning_rate": 4.092106770534999e-06, "logits/chosen": 10.040604591369629, "logits/rejected": 10.57836627960205, "logps/chosen": -244.24546813964844, "logps/rejected": -229.80064392089844, "loss": 0.6493, "rewards/accuracies": 0.625, "rewards/chosen": 0.048752300441265106, "rewards/margins": 0.11303119361400604, "rewards/rejected": -0.06427889317274094, "step": 5110 }, { "epoch": 0.7904117533346221, "grad_norm": 7.556445121765137, "learning_rate": 4.091820368885325e-06, "logits/chosen": 11.767011642456055, "logits/rejected": 12.161941528320312, "logps/chosen": -199.0382080078125, "logps/rejected": -182.70046997070312, "loss": 0.8979, "rewards/accuracies": 0.5, "rewards/chosen": -0.29699739813804626, "rewards/margins": -0.30969393253326416, "rewards/rejected": 0.012696534395217896, "step": 5111 }, { "epoch": 0.7905664024743863, "grad_norm": 5.7087249755859375, "learning_rate": 4.091533967235652e-06, "logits/chosen": 8.876701354980469, "logits/rejected": 9.305632591247559, "logps/chosen": -332.8764953613281, "logps/rejected": -360.33740234375, "loss": 0.5545, "rewards/accuracies": 0.75, "rewards/chosen": 0.47623586654663086, "rewards/margins": 0.3553975224494934, "rewards/rejected": 0.12083835154771805, "step": 5112 }, { "epoch": 0.7907210516141504, "grad_norm": 5.002257823944092, "learning_rate": 4.091247565585978e-06, "logits/chosen": 12.8779878616333, "logits/rejected": 10.324104309082031, "logps/chosen": -277.0475769042969, "logps/rejected": -255.50823974609375, "loss": 0.6383, "rewards/accuracies": 0.625, "rewards/chosen": 0.33567050099372864, "rewards/margins": 0.17392273247241974, "rewards/rejected": 0.1617477387189865, "step": 5113 }, { "epoch": 0.7908757007539146, "grad_norm": 4.451407432556152, "learning_rate": 4.0909611639363044e-06, "logits/chosen": 11.414949417114258, "logits/rejected": 12.021656036376953, "logps/chosen": -256.81536865234375, "logps/rejected": -249.47708129882812, "loss": 0.7281, "rewards/accuracies": 0.375, "rewards/chosen": 0.03548100218176842, "rewards/margins": 0.14254681766033173, "rewards/rejected": -0.1070658415555954, "step": 5114 }, { "epoch": 0.7910303498936787, "grad_norm": 7.36456298828125, "learning_rate": 4.090674762286631e-06, "logits/chosen": 12.94161319732666, "logits/rejected": 6.897037506103516, "logps/chosen": -369.4624938964844, "logps/rejected": -324.5177001953125, "loss": 0.6475, "rewards/accuracies": 0.75, "rewards/chosen": 0.4120739996433258, "rewards/margins": 0.18249830603599548, "rewards/rejected": 0.22957567870616913, "step": 5115 }, { "epoch": 0.7911849990334429, "grad_norm": 5.778082370758057, "learning_rate": 4.090388360636958e-06, "logits/chosen": 7.975973129272461, "logits/rejected": 6.598692893981934, "logps/chosen": -174.73782348632812, "logps/rejected": -143.19839477539062, "loss": 0.929, "rewards/accuracies": 0.625, "rewards/chosen": -0.0850757583975792, "rewards/margins": -0.247012197971344, "rewards/rejected": 0.1619364321231842, "step": 5116 }, { "epoch": 0.791339648173207, "grad_norm": 4.2440643310546875, "learning_rate": 4.090101958987284e-06, "logits/chosen": 12.873610496520996, "logits/rejected": 11.144428253173828, "logps/chosen": -254.34454345703125, "logps/rejected": -219.94285583496094, "loss": 0.6213, "rewards/accuracies": 0.875, "rewards/chosen": 0.5841102600097656, "rewards/margins": 0.20584255456924438, "rewards/rejected": 0.37826770544052124, "step": 5117 }, { "epoch": 0.7914942973129712, "grad_norm": 6.00151252746582, "learning_rate": 4.089815557337611e-06, "logits/chosen": 11.584171295166016, "logits/rejected": 11.935012817382812, "logps/chosen": -537.4677124023438, "logps/rejected": -427.73199462890625, "loss": 0.587, "rewards/accuracies": 0.75, "rewards/chosen": 0.5674301385879517, "rewards/margins": 0.32733821868896484, "rewards/rejected": 0.24009190499782562, "step": 5118 }, { "epoch": 0.7916489464527353, "grad_norm": 14.695266723632812, "learning_rate": 4.089529155687937e-06, "logits/chosen": 10.891056060791016, "logits/rejected": 8.556668281555176, "logps/chosen": -186.0068359375, "logps/rejected": -155.95144653320312, "loss": 0.7171, "rewards/accuracies": 0.625, "rewards/chosen": 0.019355714321136475, "rewards/margins": 0.12493747472763062, "rewards/rejected": -0.10558176040649414, "step": 5119 }, { "epoch": 0.7918035955924995, "grad_norm": 10.410287857055664, "learning_rate": 4.0892427540382635e-06, "logits/chosen": 4.316266059875488, "logits/rejected": 2.9128799438476562, "logps/chosen": -330.39459228515625, "logps/rejected": -263.1448974609375, "loss": 0.6007, "rewards/accuracies": 0.5, "rewards/chosen": 0.3537082076072693, "rewards/margins": 0.2715470790863037, "rewards/rejected": 0.08216113597154617, "step": 5120 }, { "epoch": 0.7919582447322637, "grad_norm": 3.5756843090057373, "learning_rate": 4.08895635238859e-06, "logits/chosen": 10.93596076965332, "logits/rejected": 6.558873176574707, "logps/chosen": -249.20327758789062, "logps/rejected": -189.0355224609375, "loss": 0.6086, "rewards/accuracies": 0.5, "rewards/chosen": 0.23424673080444336, "rewards/margins": 0.43993523716926575, "rewards/rejected": -0.2056885063648224, "step": 5121 }, { "epoch": 0.7921128938720279, "grad_norm": 4.506368637084961, "learning_rate": 4.088669950738917e-06, "logits/chosen": 11.031534194946289, "logits/rejected": 5.076327323913574, "logps/chosen": -262.301513671875, "logps/rejected": -210.02330017089844, "loss": 0.6183, "rewards/accuracies": 0.625, "rewards/chosen": -0.1448369175195694, "rewards/margins": 0.4317597448825836, "rewards/rejected": -0.5765966773033142, "step": 5122 }, { "epoch": 0.792267543011792, "grad_norm": 4.79826545715332, "learning_rate": 4.0883835490892434e-06, "logits/chosen": 8.827932357788086, "logits/rejected": 4.778672218322754, "logps/chosen": -157.8374786376953, "logps/rejected": -113.94066619873047, "loss": 0.7055, "rewards/accuracies": 0.5, "rewards/chosen": 0.3114785850048065, "rewards/margins": 0.009424515999853611, "rewards/rejected": 0.3020540475845337, "step": 5123 }, { "epoch": 0.7924221921515562, "grad_norm": 4.824007034301758, "learning_rate": 4.08809714743957e-06, "logits/chosen": 12.169127464294434, "logits/rejected": 6.964886665344238, "logps/chosen": -341.558837890625, "logps/rejected": -306.52142333984375, "loss": 0.4289, "rewards/accuracies": 1.0, "rewards/chosen": 0.5295025110244751, "rewards/margins": 0.7243529558181763, "rewards/rejected": -0.19485044479370117, "step": 5124 }, { "epoch": 0.7925768412913203, "grad_norm": 8.1026611328125, "learning_rate": 4.087810745789897e-06, "logits/chosen": 6.729506492614746, "logits/rejected": 7.2052717208862305, "logps/chosen": -253.596923828125, "logps/rejected": -289.608154296875, "loss": 0.7858, "rewards/accuracies": 0.375, "rewards/chosen": 0.4378250241279602, "rewards/margins": -0.023422781378030777, "rewards/rejected": 0.4612478017807007, "step": 5125 }, { "epoch": 0.7927314904310845, "grad_norm": 5.579225540161133, "learning_rate": 4.0875243441402225e-06, "logits/chosen": 8.467966079711914, "logits/rejected": 8.919305801391602, "logps/chosen": -246.54071044921875, "logps/rejected": -256.5277404785156, "loss": 0.5381, "rewards/accuracies": 0.625, "rewards/chosen": 0.41607218980789185, "rewards/margins": 0.6485758423805237, "rewards/rejected": -0.23250368237495422, "step": 5126 }, { "epoch": 0.7928861395708486, "grad_norm": 4.772952556610107, "learning_rate": 4.087237942490549e-06, "logits/chosen": 15.768258094787598, "logits/rejected": 10.663614273071289, "logps/chosen": -383.0423278808594, "logps/rejected": -291.88128662109375, "loss": 0.581, "rewards/accuracies": 0.75, "rewards/chosen": 0.5903576612472534, "rewards/margins": 0.30960386991500854, "rewards/rejected": 0.2807537317276001, "step": 5127 }, { "epoch": 0.7930407887106128, "grad_norm": 4.501585483551025, "learning_rate": 4.086951540840876e-06, "logits/chosen": 8.701677322387695, "logits/rejected": 7.465191841125488, "logps/chosen": -406.61651611328125, "logps/rejected": -273.605224609375, "loss": 0.4708, "rewards/accuracies": 0.875, "rewards/chosen": 0.2780885398387909, "rewards/margins": 0.6573121547698975, "rewards/rejected": -0.37922361493110657, "step": 5128 }, { "epoch": 0.7931954378503769, "grad_norm": 4.429296016693115, "learning_rate": 4.0866651391912025e-06, "logits/chosen": 9.797746658325195, "logits/rejected": 10.280259132385254, "logps/chosen": -216.98837280273438, "logps/rejected": -240.43994140625, "loss": 0.529, "rewards/accuracies": 0.625, "rewards/chosen": 0.47766926884651184, "rewards/margins": 0.5691008567810059, "rewards/rejected": -0.09143156558275223, "step": 5129 }, { "epoch": 0.7933500869901411, "grad_norm": 4.896242618560791, "learning_rate": 4.086378737541528e-06, "logits/chosen": 5.520730495452881, "logits/rejected": 5.328020095825195, "logps/chosen": -193.28045654296875, "logps/rejected": -157.02879333496094, "loss": 0.7188, "rewards/accuracies": 0.625, "rewards/chosen": 0.24938923120498657, "rewards/margins": 0.11459577083587646, "rewards/rejected": 0.1347934603691101, "step": 5130 }, { "epoch": 0.7935047361299052, "grad_norm": 3.339292049407959, "learning_rate": 4.086092335891855e-06, "logits/chosen": 10.50387191772461, "logits/rejected": 11.079645156860352, "logps/chosen": -208.0048370361328, "logps/rejected": -234.5660400390625, "loss": 0.5039, "rewards/accuracies": 0.625, "rewards/chosen": 0.34642529487609863, "rewards/margins": 0.591510534286499, "rewards/rejected": -0.24508532881736755, "step": 5131 }, { "epoch": 0.7936593852696694, "grad_norm": 8.280366897583008, "learning_rate": 4.085805934242182e-06, "logits/chosen": 0.9430966377258301, "logits/rejected": 3.7082700729370117, "logps/chosen": -307.4939270019531, "logps/rejected": -318.5357971191406, "loss": 0.9763, "rewards/accuracies": 0.375, "rewards/chosen": 0.2641964852809906, "rewards/margins": -0.19423817098140717, "rewards/rejected": 0.45843470096588135, "step": 5132 }, { "epoch": 0.7938140344094335, "grad_norm": 18.444961547851562, "learning_rate": 4.085519532592508e-06, "logits/chosen": 7.743846893310547, "logits/rejected": 5.6279168128967285, "logps/chosen": -255.98977661132812, "logps/rejected": -207.76181030273438, "loss": 0.6992, "rewards/accuracies": 0.5, "rewards/chosen": 0.13744987547397614, "rewards/margins": 0.15704721212387085, "rewards/rejected": -0.019597336649894714, "step": 5133 }, { "epoch": 0.7939686835491978, "grad_norm": 5.020827770233154, "learning_rate": 4.085233130942834e-06, "logits/chosen": 9.89692211151123, "logits/rejected": 9.981078147888184, "logps/chosen": -234.182373046875, "logps/rejected": -193.4803466796875, "loss": 0.8592, "rewards/accuracies": 0.375, "rewards/chosen": -0.12400683760643005, "rewards/margins": -0.1671830117702484, "rewards/rejected": 0.043176181614398956, "step": 5134 }, { "epoch": 0.794123332688962, "grad_norm": 4.169794082641602, "learning_rate": 4.084946729293161e-06, "logits/chosen": 17.274953842163086, "logits/rejected": 12.514240264892578, "logps/chosen": -296.9609375, "logps/rejected": -228.280029296875, "loss": 0.5501, "rewards/accuracies": 0.75, "rewards/chosen": 0.5431222915649414, "rewards/margins": 0.3721500337123871, "rewards/rejected": 0.17097224295139313, "step": 5135 }, { "epoch": 0.7942779818287261, "grad_norm": 5.927578926086426, "learning_rate": 4.084660327643487e-06, "logits/chosen": 14.158815383911133, "logits/rejected": 12.72148609161377, "logps/chosen": -350.7445983886719, "logps/rejected": -361.63128662109375, "loss": 0.6859, "rewards/accuracies": 0.625, "rewards/chosen": 0.5648687481880188, "rewards/margins": 0.13774964213371277, "rewards/rejected": 0.42711907625198364, "step": 5136 }, { "epoch": 0.7944326309684903, "grad_norm": 7.3381195068359375, "learning_rate": 4.084373925993814e-06, "logits/chosen": 8.293642044067383, "logits/rejected": 14.127386093139648, "logps/chosen": -194.8069305419922, "logps/rejected": -332.56597900390625, "loss": 0.8826, "rewards/accuracies": 0.125, "rewards/chosen": -0.020099062472581863, "rewards/margins": -0.22488906979560852, "rewards/rejected": 0.20479002594947815, "step": 5137 }, { "epoch": 0.7945872801082544, "grad_norm": 5.287918567657471, "learning_rate": 4.084087524344141e-06, "logits/chosen": 6.866658687591553, "logits/rejected": 8.704523086547852, "logps/chosen": -183.92649841308594, "logps/rejected": -299.900146484375, "loss": 0.8506, "rewards/accuracies": 0.375, "rewards/chosen": -0.05032868683338165, "rewards/margins": -0.1025393009185791, "rewards/rejected": 0.05221062898635864, "step": 5138 }, { "epoch": 0.7947419292480186, "grad_norm": 4.71909236907959, "learning_rate": 4.0838011226944665e-06, "logits/chosen": 12.919755935668945, "logits/rejected": 9.056034088134766, "logps/chosen": -211.45774841308594, "logps/rejected": -192.09457397460938, "loss": 0.5835, "rewards/accuracies": 0.875, "rewards/chosen": -0.15688000619411469, "rewards/margins": 0.3297611474990845, "rewards/rejected": -0.48664113879203796, "step": 5139 }, { "epoch": 0.7948965783877827, "grad_norm": 5.384582042694092, "learning_rate": 4.083514721044793e-06, "logits/chosen": 11.911674499511719, "logits/rejected": 11.493669509887695, "logps/chosen": -175.20628356933594, "logps/rejected": -214.06173706054688, "loss": 0.8418, "rewards/accuracies": 0.25, "rewards/chosen": -0.22032225131988525, "rewards/margins": -0.1977851390838623, "rewards/rejected": -0.022537142038345337, "step": 5140 }, { "epoch": 0.7950512275275469, "grad_norm": 7.07741641998291, "learning_rate": 4.08322831939512e-06, "logits/chosen": 8.259150505065918, "logits/rejected": 12.604560852050781, "logps/chosen": -256.9228515625, "logps/rejected": -376.0733642578125, "loss": 0.9283, "rewards/accuracies": 0.5, "rewards/chosen": 0.2508412301540375, "rewards/margins": -0.310799777507782, "rewards/rejected": 0.5616410374641418, "step": 5141 }, { "epoch": 0.795205876667311, "grad_norm": 6.959361553192139, "learning_rate": 4.082941917745446e-06, "logits/chosen": 6.577674865722656, "logits/rejected": 6.3845601081848145, "logps/chosen": -282.6300354003906, "logps/rejected": -269.59259033203125, "loss": 0.8414, "rewards/accuracies": 0.625, "rewards/chosen": 0.5009900331497192, "rewards/margins": -0.14512671530246735, "rewards/rejected": 0.6461167335510254, "step": 5142 }, { "epoch": 0.7953605258070752, "grad_norm": 4.713140964508057, "learning_rate": 4.082655516095773e-06, "logits/chosen": 9.76624584197998, "logits/rejected": 7.391097545623779, "logps/chosen": -266.0717468261719, "logps/rejected": -271.6428527832031, "loss": 0.5893, "rewards/accuracies": 0.75, "rewards/chosen": 0.14975817501544952, "rewards/margins": 0.2693978250026703, "rewards/rejected": -0.11963966488838196, "step": 5143 }, { "epoch": 0.7955151749468393, "grad_norm": 5.34617805480957, "learning_rate": 4.0823691144461e-06, "logits/chosen": 12.2053861618042, "logits/rejected": 12.456269264221191, "logps/chosen": -198.55780029296875, "logps/rejected": -211.1479949951172, "loss": 0.765, "rewards/accuracies": 0.5, "rewards/chosen": -0.17930379509925842, "rewards/margins": -0.00016549229621887207, "rewards/rejected": -0.17913833260536194, "step": 5144 }, { "epoch": 0.7956698240866035, "grad_norm": 5.029722213745117, "learning_rate": 4.082082712796426e-06, "logits/chosen": 11.243890762329102, "logits/rejected": 9.897314071655273, "logps/chosen": -294.2127685546875, "logps/rejected": -262.984375, "loss": 0.6215, "rewards/accuracies": 0.625, "rewards/chosen": 0.2950325012207031, "rewards/margins": 0.19400739669799805, "rewards/rejected": 0.10102510452270508, "step": 5145 }, { "epoch": 0.7958244732263676, "grad_norm": 6.0550856590271, "learning_rate": 4.081796311146752e-06, "logits/chosen": 11.434615135192871, "logits/rejected": 5.453403949737549, "logps/chosen": -260.6808776855469, "logps/rejected": -228.1754913330078, "loss": 0.6661, "rewards/accuracies": 0.625, "rewards/chosen": 0.13278117775917053, "rewards/margins": 0.10964999347925186, "rewards/rejected": 0.023131176829338074, "step": 5146 }, { "epoch": 0.7959791223661319, "grad_norm": 4.228728294372559, "learning_rate": 4.081509909497079e-06, "logits/chosen": 11.719847679138184, "logits/rejected": 9.542858123779297, "logps/chosen": -258.9104309082031, "logps/rejected": -271.6875915527344, "loss": 0.4688, "rewards/accuracies": 0.875, "rewards/chosen": 0.3374248743057251, "rewards/margins": 0.5757278800010681, "rewards/rejected": -0.2383030354976654, "step": 5147 }, { "epoch": 0.796133771505896, "grad_norm": 5.485743045806885, "learning_rate": 4.0812235078474055e-06, "logits/chosen": 2.7935211658477783, "logits/rejected": 2.319593906402588, "logps/chosen": -298.6899719238281, "logps/rejected": -220.15921020507812, "loss": 0.7439, "rewards/accuracies": 0.5, "rewards/chosen": 0.12295404076576233, "rewards/margins": -0.014685973525047302, "rewards/rejected": 0.13763999938964844, "step": 5148 }, { "epoch": 0.7962884206456602, "grad_norm": 6.5264506340026855, "learning_rate": 4.080937106197732e-06, "logits/chosen": 9.92123794555664, "logits/rejected": 10.865812301635742, "logps/chosen": -254.81570434570312, "logps/rejected": -238.35000610351562, "loss": 0.7695, "rewards/accuracies": 0.375, "rewards/chosen": 0.34687894582748413, "rewards/margins": -0.08180607110261917, "rewards/rejected": 0.4286850094795227, "step": 5149 }, { "epoch": 0.7964430697854243, "grad_norm": 4.936883449554443, "learning_rate": 4.080650704548059e-06, "logits/chosen": 11.194367408752441, "logits/rejected": 11.334209442138672, "logps/chosen": -346.46539306640625, "logps/rejected": -296.0880432128906, "loss": 0.4961, "rewards/accuracies": 0.75, "rewards/chosen": -0.00530165433883667, "rewards/margins": 0.5895830392837524, "rewards/rejected": -0.5948846936225891, "step": 5150 }, { "epoch": 0.7965977189251885, "grad_norm": 5.078946590423584, "learning_rate": 4.080364302898385e-06, "logits/chosen": 6.028511047363281, "logits/rejected": 11.33392333984375, "logps/chosen": -187.18377685546875, "logps/rejected": -288.6493225097656, "loss": 0.7016, "rewards/accuracies": 0.25, "rewards/chosen": -0.08418698608875275, "rewards/margins": -0.006801655516028404, "rewards/rejected": -0.07738533616065979, "step": 5151 }, { "epoch": 0.7967523680649526, "grad_norm": 5.873189926147461, "learning_rate": 4.080077901248711e-06, "logits/chosen": 15.880168914794922, "logits/rejected": 10.889388084411621, "logps/chosen": -375.6282653808594, "logps/rejected": -247.41439819335938, "loss": 0.7363, "rewards/accuracies": 0.5, "rewards/chosen": 0.055830977857112885, "rewards/margins": 0.16678863763809204, "rewards/rejected": -0.11095762252807617, "step": 5152 }, { "epoch": 0.7969070172047168, "grad_norm": 5.783189296722412, "learning_rate": 4.079791499599038e-06, "logits/chosen": 12.504559516906738, "logits/rejected": 7.759441375732422, "logps/chosen": -251.83560180664062, "logps/rejected": -265.9884033203125, "loss": 0.6184, "rewards/accuracies": 0.625, "rewards/chosen": 0.09201879799365997, "rewards/margins": 0.30969229340553284, "rewards/rejected": -0.21767349541187286, "step": 5153 }, { "epoch": 0.797061666344481, "grad_norm": 5.160167217254639, "learning_rate": 4.0795050979493645e-06, "logits/chosen": 9.942333221435547, "logits/rejected": 5.539155006408691, "logps/chosen": -227.46876525878906, "logps/rejected": -260.2804260253906, "loss": 0.4565, "rewards/accuracies": 0.75, "rewards/chosen": 0.1997339278459549, "rewards/margins": 0.7215489149093628, "rewards/rejected": -0.5218150615692139, "step": 5154 }, { "epoch": 0.7972163154842451, "grad_norm": 4.398248195648193, "learning_rate": 4.079218696299691e-06, "logits/chosen": 11.980247497558594, "logits/rejected": 5.4819416999816895, "logps/chosen": -341.745361328125, "logps/rejected": -247.32220458984375, "loss": 0.5341, "rewards/accuracies": 0.625, "rewards/chosen": 0.6152309775352478, "rewards/margins": 0.5775164365768433, "rewards/rejected": 0.037714630365371704, "step": 5155 }, { "epoch": 0.7973709646240092, "grad_norm": 4.17824125289917, "learning_rate": 4.078932294650018e-06, "logits/chosen": 15.08259105682373, "logits/rejected": 12.27125072479248, "logps/chosen": -300.5875549316406, "logps/rejected": -242.35751342773438, "loss": 0.5091, "rewards/accuracies": 0.75, "rewards/chosen": 0.4908035695552826, "rewards/margins": 0.4971941113471985, "rewards/rejected": -0.006390571594238281, "step": 5156 }, { "epoch": 0.7975256137637734, "grad_norm": 4.464823246002197, "learning_rate": 4.0786458930003445e-06, "logits/chosen": 11.842748641967773, "logits/rejected": 6.18039083480835, "logps/chosen": -341.8172302246094, "logps/rejected": -256.6737976074219, "loss": 0.5122, "rewards/accuracies": 0.875, "rewards/chosen": 0.25513607263565063, "rewards/margins": 0.44512036442756653, "rewards/rejected": -0.18998433649539948, "step": 5157 }, { "epoch": 0.7976802629035376, "grad_norm": 3.3774492740631104, "learning_rate": 4.078359491350671e-06, "logits/chosen": 10.157923698425293, "logits/rejected": 9.001262664794922, "logps/chosen": -164.86631774902344, "logps/rejected": -212.5494384765625, "loss": 0.5367, "rewards/accuracies": 0.625, "rewards/chosen": 0.19584083557128906, "rewards/margins": 0.4724709391593933, "rewards/rejected": -0.27663013339042664, "step": 5158 }, { "epoch": 0.7978349120433018, "grad_norm": 4.394164085388184, "learning_rate": 4.078073089700997e-06, "logits/chosen": 7.729325294494629, "logits/rejected": 7.729151725769043, "logps/chosen": -284.0955810546875, "logps/rejected": -264.1944274902344, "loss": 0.5913, "rewards/accuracies": 0.75, "rewards/chosen": 0.5126651525497437, "rewards/margins": 0.28072088956832886, "rewards/rejected": 0.23194430768489838, "step": 5159 }, { "epoch": 0.797989561183066, "grad_norm": 4.99221658706665, "learning_rate": 4.0777866880513236e-06, "logits/chosen": 2.469189405441284, "logits/rejected": -1.3221299648284912, "logps/chosen": -259.8636169433594, "logps/rejected": -167.8727569580078, "loss": 0.588, "rewards/accuracies": 0.625, "rewards/chosen": 0.1812053620815277, "rewards/margins": 0.43950381875038147, "rewards/rejected": -0.25829845666885376, "step": 5160 }, { "epoch": 0.7981442103228301, "grad_norm": 4.671767234802246, "learning_rate": 4.07750028640165e-06, "logits/chosen": 9.269762992858887, "logits/rejected": 2.148052215576172, "logps/chosen": -201.21466064453125, "logps/rejected": -128.78848266601562, "loss": 0.5709, "rewards/accuracies": 0.75, "rewards/chosen": 0.11846806108951569, "rewards/margins": 0.3675863742828369, "rewards/rejected": -0.24911832809448242, "step": 5161 }, { "epoch": 0.7982988594625943, "grad_norm": 4.324918270111084, "learning_rate": 4.077213884751977e-06, "logits/chosen": 11.156030654907227, "logits/rejected": 7.358633041381836, "logps/chosen": -251.2011260986328, "logps/rejected": -219.09864807128906, "loss": 0.5234, "rewards/accuracies": 0.75, "rewards/chosen": 0.2468796819448471, "rewards/margins": 0.493645578622818, "rewards/rejected": -0.2467658817768097, "step": 5162 }, { "epoch": 0.7984535086023584, "grad_norm": 4.701037883758545, "learning_rate": 4.0769274831023035e-06, "logits/chosen": 7.528332710266113, "logits/rejected": 2.950017213821411, "logps/chosen": -323.69158935546875, "logps/rejected": -219.5825958251953, "loss": 0.4739, "rewards/accuracies": 0.875, "rewards/chosen": 0.10166644304990768, "rewards/margins": 0.587394654750824, "rewards/rejected": -0.4857281744480133, "step": 5163 }, { "epoch": 0.7986081577421226, "grad_norm": 10.261622428894043, "learning_rate": 4.076641081452629e-06, "logits/chosen": 13.693697929382324, "logits/rejected": 0.9454028606414795, "logps/chosen": -288.722900390625, "logps/rejected": -178.91441345214844, "loss": 0.7346, "rewards/accuracies": 0.5, "rewards/chosen": 0.1125628650188446, "rewards/margins": 0.14303874969482422, "rewards/rejected": -0.030475907027721405, "step": 5164 }, { "epoch": 0.7987628068818867, "grad_norm": 5.021134853363037, "learning_rate": 4.076354679802956e-06, "logits/chosen": 14.34484577178955, "logits/rejected": 11.020774841308594, "logps/chosen": -276.3098449707031, "logps/rejected": -214.42710876464844, "loss": 0.7184, "rewards/accuracies": 0.375, "rewards/chosen": 0.07126487791538239, "rewards/margins": 0.03213825076818466, "rewards/rejected": 0.039126645773649216, "step": 5165 }, { "epoch": 0.7989174560216509, "grad_norm": 5.3498215675354, "learning_rate": 4.076068278153283e-06, "logits/chosen": 8.067974090576172, "logits/rejected": 2.415658473968506, "logps/chosen": -351.56158447265625, "logps/rejected": -266.89569091796875, "loss": 0.5354, "rewards/accuracies": 0.5, "rewards/chosen": 0.2089136838912964, "rewards/margins": 0.5643680691719055, "rewards/rejected": -0.35545435547828674, "step": 5166 }, { "epoch": 0.799072105161415, "grad_norm": 4.583428382873535, "learning_rate": 4.075781876503609e-06, "logits/chosen": 8.18742847442627, "logits/rejected": 11.306123733520508, "logps/chosen": -252.96444702148438, "logps/rejected": -219.61077880859375, "loss": 0.6739, "rewards/accuracies": 0.5, "rewards/chosen": 0.09037430584430695, "rewards/margins": 0.24785694479942322, "rewards/rejected": -0.15748265385627747, "step": 5167 }, { "epoch": 0.7992267543011792, "grad_norm": 6.829429626464844, "learning_rate": 4.075495474853935e-06, "logits/chosen": 5.8625688552856445, "logits/rejected": 11.135774612426758, "logps/chosen": -226.4049835205078, "logps/rejected": -315.36285400390625, "loss": 0.8188, "rewards/accuracies": 0.375, "rewards/chosen": 0.18695230782032013, "rewards/margins": -0.19627505540847778, "rewards/rejected": 0.3832273483276367, "step": 5168 }, { "epoch": 0.7993814034409433, "grad_norm": 5.711610794067383, "learning_rate": 4.075209073204262e-06, "logits/chosen": 6.895674705505371, "logits/rejected": 2.1807947158813477, "logps/chosen": -200.90115356445312, "logps/rejected": -177.5023956298828, "loss": 0.7065, "rewards/accuracies": 0.5, "rewards/chosen": 0.1053914874792099, "rewards/margins": 0.018756121397018433, "rewards/rejected": 0.08663541078567505, "step": 5169 }, { "epoch": 0.7995360525807075, "grad_norm": 8.075103759765625, "learning_rate": 4.074922671554588e-06, "logits/chosen": 12.314367294311523, "logits/rejected": 8.375946044921875, "logps/chosen": -521.67529296875, "logps/rejected": -387.8454284667969, "loss": 0.7433, "rewards/accuracies": 0.5, "rewards/chosen": 0.09303244203329086, "rewards/margins": -0.03562374413013458, "rewards/rejected": 0.12865619361400604, "step": 5170 }, { "epoch": 0.7996907017204716, "grad_norm": 6.888089656829834, "learning_rate": 4.074636269904915e-06, "logits/chosen": 11.074056625366211, "logits/rejected": 11.427193641662598, "logps/chosen": -266.93377685546875, "logps/rejected": -346.3176574707031, "loss": 0.7831, "rewards/accuracies": 0.5, "rewards/chosen": 0.06091861426830292, "rewards/margins": -0.06390589475631714, "rewards/rejected": 0.12482450902462006, "step": 5171 }, { "epoch": 0.7998453508602359, "grad_norm": 4.3380560874938965, "learning_rate": 4.074349868255242e-06, "logits/chosen": 5.799299240112305, "logits/rejected": 6.195982933044434, "logps/chosen": -266.902099609375, "logps/rejected": -223.0730743408203, "loss": 0.6685, "rewards/accuracies": 0.375, "rewards/chosen": 0.16649918258190155, "rewards/margins": 0.17663513123989105, "rewards/rejected": -0.010135941207408905, "step": 5172 }, { "epoch": 0.8, "grad_norm": 6.414584636688232, "learning_rate": 4.0740634666055675e-06, "logits/chosen": 8.644225120544434, "logits/rejected": 7.706579208374023, "logps/chosen": -278.543701171875, "logps/rejected": -291.5439453125, "loss": 0.6804, "rewards/accuracies": 0.625, "rewards/chosen": 0.5179382562637329, "rewards/margins": 0.40932753682136536, "rewards/rejected": 0.10861071944236755, "step": 5173 }, { "epoch": 0.8001546491397642, "grad_norm": 5.999967098236084, "learning_rate": 4.073777064955894e-06, "logits/chosen": 11.229823112487793, "logits/rejected": 12.47011947631836, "logps/chosen": -307.052978515625, "logps/rejected": -309.1876220703125, "loss": 0.7303, "rewards/accuracies": 0.25, "rewards/chosen": 0.2495204359292984, "rewards/margins": 0.010356783866882324, "rewards/rejected": 0.23916365206241608, "step": 5174 }, { "epoch": 0.8003092982795283, "grad_norm": 6.367274761199951, "learning_rate": 4.073490663306221e-06, "logits/chosen": 10.578140258789062, "logits/rejected": 3.7615132331848145, "logps/chosen": -334.2843017578125, "logps/rejected": -194.01795959472656, "loss": 0.6853, "rewards/accuracies": 0.5, "rewards/chosen": -0.15161211788654327, "rewards/margins": 0.1886766105890274, "rewards/rejected": -0.3402886986732483, "step": 5175 }, { "epoch": 0.8004639474192925, "grad_norm": 3.986971855163574, "learning_rate": 4.0732042616565474e-06, "logits/chosen": 15.800336837768555, "logits/rejected": 9.159343719482422, "logps/chosen": -207.65200805664062, "logps/rejected": -136.0389862060547, "loss": 0.601, "rewards/accuracies": 0.5, "rewards/chosen": 0.12586450576782227, "rewards/margins": 0.3264733552932739, "rewards/rejected": -0.20060878992080688, "step": 5176 }, { "epoch": 0.8006185965590567, "grad_norm": 7.346532821655273, "learning_rate": 4.072917860006874e-06, "logits/chosen": 13.264067649841309, "logits/rejected": 13.319330215454102, "logps/chosen": -360.99481201171875, "logps/rejected": -331.4978942871094, "loss": 0.9524, "rewards/accuracies": 0.25, "rewards/chosen": 0.03138361871242523, "rewards/margins": -0.38953450322151184, "rewards/rejected": 0.42091816663742065, "step": 5177 }, { "epoch": 0.8007732456988208, "grad_norm": 5.181656837463379, "learning_rate": 4.0726314583572e-06, "logits/chosen": 14.349178314208984, "logits/rejected": 10.960301399230957, "logps/chosen": -283.50250244140625, "logps/rejected": -264.8973083496094, "loss": 0.7506, "rewards/accuracies": 0.25, "rewards/chosen": -0.07823530584573746, "rewards/margins": 0.05874181538820267, "rewards/rejected": -0.13697710633277893, "step": 5178 }, { "epoch": 0.800927894838585, "grad_norm": 6.109396457672119, "learning_rate": 4.0723450567075265e-06, "logits/chosen": 9.910364151000977, "logits/rejected": 7.063610553741455, "logps/chosen": -280.49462890625, "logps/rejected": -298.12835693359375, "loss": 0.6456, "rewards/accuracies": 0.5, "rewards/chosen": -0.14363422989845276, "rewards/margins": 0.1463633030653, "rewards/rejected": -0.28999754786491394, "step": 5179 }, { "epoch": 0.8010825439783491, "grad_norm": 5.099960803985596, "learning_rate": 4.072058655057853e-06, "logits/chosen": 9.606693267822266, "logits/rejected": 4.290958881378174, "logps/chosen": -226.71255493164062, "logps/rejected": -218.27362060546875, "loss": 0.6723, "rewards/accuracies": 0.625, "rewards/chosen": 0.12202448397874832, "rewards/margins": 0.11381072551012039, "rewards/rejected": 0.008213765919208527, "step": 5180 }, { "epoch": 0.8012371931181133, "grad_norm": 5.887612819671631, "learning_rate": 4.07177225340818e-06, "logits/chosen": 12.942381858825684, "logits/rejected": 11.690461158752441, "logps/chosen": -327.50445556640625, "logps/rejected": -358.5029296875, "loss": 0.5994, "rewards/accuracies": 0.5, "rewards/chosen": 0.09523968398571014, "rewards/margins": 0.32843297719955444, "rewards/rejected": -0.2331932932138443, "step": 5181 }, { "epoch": 0.8013918422578774, "grad_norm": 6.461954116821289, "learning_rate": 4.0714858517585065e-06, "logits/chosen": 12.46915054321289, "logits/rejected": 12.396735191345215, "logps/chosen": -220.73936462402344, "logps/rejected": -184.2044677734375, "loss": 0.883, "rewards/accuracies": 0.375, "rewards/chosen": -0.20647715032100677, "rewards/margins": -0.2857552170753479, "rewards/rejected": 0.07927803695201874, "step": 5182 }, { "epoch": 0.8015464913976416, "grad_norm": 5.9712018966674805, "learning_rate": 4.071199450108833e-06, "logits/chosen": 8.546089172363281, "logits/rejected": 7.132925033569336, "logps/chosen": -339.44329833984375, "logps/rejected": -394.236328125, "loss": 0.5822, "rewards/accuracies": 0.625, "rewards/chosen": 0.3559885025024414, "rewards/margins": 0.3481229543685913, "rewards/rejected": 0.007865525782108307, "step": 5183 }, { "epoch": 0.8017011405374057, "grad_norm": 5.764330863952637, "learning_rate": 4.07091304845916e-06, "logits/chosen": 6.275364398956299, "logits/rejected": 7.319153308868408, "logps/chosen": -281.5815124511719, "logps/rejected": -280.7547607421875, "loss": 0.7068, "rewards/accuracies": 0.375, "rewards/chosen": -0.09090570360422134, "rewards/margins": 0.3019055128097534, "rewards/rejected": -0.39281123876571655, "step": 5184 }, { "epoch": 0.80185578967717, "grad_norm": 5.568892478942871, "learning_rate": 4.070626646809486e-06, "logits/chosen": 8.021758079528809, "logits/rejected": 7.616845607757568, "logps/chosen": -361.82623291015625, "logps/rejected": -381.05108642578125, "loss": 0.5372, "rewards/accuracies": 0.75, "rewards/chosen": 0.5105154514312744, "rewards/margins": 0.5455015897750854, "rewards/rejected": -0.03498610854148865, "step": 5185 }, { "epoch": 0.8020104388169341, "grad_norm": 4.404518127441406, "learning_rate": 4.070340245159812e-06, "logits/chosen": 10.226493835449219, "logits/rejected": 6.699295520782471, "logps/chosen": -327.3751220703125, "logps/rejected": -282.7770690917969, "loss": 0.4766, "rewards/accuracies": 0.75, "rewards/chosen": 0.42840346693992615, "rewards/margins": 0.6867592930793762, "rewards/rejected": -0.2583558261394501, "step": 5186 }, { "epoch": 0.8021650879566983, "grad_norm": 4.824042797088623, "learning_rate": 4.070053843510139e-06, "logits/chosen": 8.825841903686523, "logits/rejected": 3.003796339035034, "logps/chosen": -361.38232421875, "logps/rejected": -277.3885192871094, "loss": 0.516, "rewards/accuracies": 0.875, "rewards/chosen": 0.17106600105762482, "rewards/margins": 0.45207738876342773, "rewards/rejected": -0.2810113728046417, "step": 5187 }, { "epoch": 0.8023197370964624, "grad_norm": 5.579853057861328, "learning_rate": 4.0697674418604655e-06, "logits/chosen": 5.236525058746338, "logits/rejected": 12.648462295532227, "logps/chosen": -198.24014282226562, "logps/rejected": -352.86212158203125, "loss": 0.7394, "rewards/accuracies": 0.5, "rewards/chosen": -0.0928381085395813, "rewards/margins": 0.029683902859687805, "rewards/rejected": -0.12252199649810791, "step": 5188 }, { "epoch": 0.8024743862362266, "grad_norm": 4.990927696228027, "learning_rate": 4.069481040210792e-06, "logits/chosen": 11.756256103515625, "logits/rejected": 5.746004104614258, "logps/chosen": -273.04345703125, "logps/rejected": -235.57907104492188, "loss": 0.6993, "rewards/accuracies": 0.625, "rewards/chosen": -0.17345009744167328, "rewards/margins": 0.11696793138980865, "rewards/rejected": -0.29041802883148193, "step": 5189 }, { "epoch": 0.8026290353759907, "grad_norm": 4.492560863494873, "learning_rate": 4.069194638561119e-06, "logits/chosen": 15.694756507873535, "logits/rejected": 9.705060958862305, "logps/chosen": -294.7191467285156, "logps/rejected": -239.3253173828125, "loss": 0.4475, "rewards/accuracies": 1.0, "rewards/chosen": 0.6990019083023071, "rewards/margins": 0.6343756914138794, "rewards/rejected": 0.06462621688842773, "step": 5190 }, { "epoch": 0.8027836845157549, "grad_norm": 5.97263765335083, "learning_rate": 4.0689082369114455e-06, "logits/chosen": 7.277877330780029, "logits/rejected": -0.9505197405815125, "logps/chosen": -315.8487548828125, "logps/rejected": -203.42868041992188, "loss": 0.5871, "rewards/accuracies": 0.75, "rewards/chosen": 0.4197215735912323, "rewards/margins": 0.539340615272522, "rewards/rejected": -0.11961911618709564, "step": 5191 }, { "epoch": 0.802938333655519, "grad_norm": 6.718442440032959, "learning_rate": 4.068621835261771e-06, "logits/chosen": 8.153020858764648, "logits/rejected": 6.387395858764648, "logps/chosen": -273.1218566894531, "logps/rejected": -268.9293518066406, "loss": 0.6694, "rewards/accuracies": 0.375, "rewards/chosen": 0.02637871913611889, "rewards/margins": 0.12925881147384644, "rewards/rejected": -0.10288010537624359, "step": 5192 }, { "epoch": 0.8030929827952832, "grad_norm": 4.039098262786865, "learning_rate": 4.068335433612098e-06, "logits/chosen": 6.684462547302246, "logits/rejected": 8.002167701721191, "logps/chosen": -171.2302703857422, "logps/rejected": -238.5595703125, "loss": 0.5871, "rewards/accuracies": 0.625, "rewards/chosen": 0.5200514793395996, "rewards/margins": 0.3217238187789917, "rewards/rejected": 0.1983276903629303, "step": 5193 }, { "epoch": 0.8032476319350473, "grad_norm": 4.782079696655273, "learning_rate": 4.068049031962425e-06, "logits/chosen": 7.347402572631836, "logits/rejected": 4.715786457061768, "logps/chosen": -186.6715087890625, "logps/rejected": -177.94876098632812, "loss": 0.584, "rewards/accuracies": 0.75, "rewards/chosen": -0.09450964629650116, "rewards/margins": 0.3566397428512573, "rewards/rejected": -0.4511494040489197, "step": 5194 }, { "epoch": 0.8034022810748115, "grad_norm": 8.323575973510742, "learning_rate": 4.067762630312751e-06, "logits/chosen": 8.670562744140625, "logits/rejected": 11.893360137939453, "logps/chosen": -425.72332763671875, "logps/rejected": -712.8861083984375, "loss": 0.736, "rewards/accuracies": 0.5, "rewards/chosen": -0.008545875549316406, "rewards/margins": -0.024723917245864868, "rewards/rejected": 0.016178037971258163, "step": 5195 }, { "epoch": 0.8035569302145756, "grad_norm": 5.844688415527344, "learning_rate": 4.067476228663078e-06, "logits/chosen": 6.96752405166626, "logits/rejected": 10.892728805541992, "logps/chosen": -225.2916717529297, "logps/rejected": -256.3316650390625, "loss": 0.8014, "rewards/accuracies": 0.375, "rewards/chosen": 0.19091683626174927, "rewards/margins": -0.16553544998168945, "rewards/rejected": 0.3564522862434387, "step": 5196 }, { "epoch": 0.8037115793543398, "grad_norm": 5.034675121307373, "learning_rate": 4.0671898270134046e-06, "logits/chosen": 8.807684898376465, "logits/rejected": 7.643858432769775, "logps/chosen": -209.61215209960938, "logps/rejected": -233.79409790039062, "loss": 0.6496, "rewards/accuracies": 0.75, "rewards/chosen": 0.12015505135059357, "rewards/margins": 0.2830027639865875, "rewards/rejected": -0.16284771263599396, "step": 5197 }, { "epoch": 0.803866228494104, "grad_norm": 8.784025192260742, "learning_rate": 4.06690342536373e-06, "logits/chosen": 14.334604263305664, "logits/rejected": 10.856712341308594, "logps/chosen": -381.6451416015625, "logps/rejected": -294.3848876953125, "loss": 0.7972, "rewards/accuracies": 0.625, "rewards/chosen": -0.4531240165233612, "rewards/margins": 0.15361306071281433, "rewards/rejected": -0.6067371368408203, "step": 5198 }, { "epoch": 0.8040208776338682, "grad_norm": 4.886275768280029, "learning_rate": 4.066617023714057e-06, "logits/chosen": 8.171455383300781, "logits/rejected": 6.467146873474121, "logps/chosen": -201.5238037109375, "logps/rejected": -208.74974060058594, "loss": 0.5851, "rewards/accuracies": 0.875, "rewards/chosen": 0.11793877929449081, "rewards/margins": 0.2462824583053589, "rewards/rejected": -0.12834367156028748, "step": 5199 }, { "epoch": 0.8041755267736324, "grad_norm": 15.686994552612305, "learning_rate": 4.066330622064384e-06, "logits/chosen": 8.719632148742676, "logits/rejected": 4.046745777130127, "logps/chosen": -185.4056396484375, "logps/rejected": -144.71392822265625, "loss": 0.6332, "rewards/accuracies": 0.625, "rewards/chosen": -0.002722442150115967, "rewards/margins": 0.20654967427253723, "rewards/rejected": -0.2092721164226532, "step": 5200 }, { "epoch": 0.8043301759133965, "grad_norm": 7.648826599121094, "learning_rate": 4.06604422041471e-06, "logits/chosen": 11.504674911499023, "logits/rejected": 11.624543190002441, "logps/chosen": -258.1611633300781, "logps/rejected": -226.80657958984375, "loss": 0.6269, "rewards/accuracies": 0.625, "rewards/chosen": 0.30315646529197693, "rewards/margins": 0.18724174797534943, "rewards/rejected": 0.1159147173166275, "step": 5201 }, { "epoch": 0.8044848250531607, "grad_norm": 8.368888854980469, "learning_rate": 4.065757818765036e-06, "logits/chosen": 11.93128776550293, "logits/rejected": 9.404498100280762, "logps/chosen": -400.0206298828125, "logps/rejected": -376.31103515625, "loss": 0.7915, "rewards/accuracies": 0.25, "rewards/chosen": 0.03614950552582741, "rewards/margins": -0.14478681981563568, "rewards/rejected": 0.18093635141849518, "step": 5202 }, { "epoch": 0.8046394741929248, "grad_norm": 4.903611183166504, "learning_rate": 4.065471417115363e-06, "logits/chosen": 9.074488639831543, "logits/rejected": 2.2496421337127686, "logps/chosen": -298.34027099609375, "logps/rejected": -234.4497528076172, "loss": 0.7267, "rewards/accuracies": 0.375, "rewards/chosen": 0.21946890652179718, "rewards/margins": 0.04027573764324188, "rewards/rejected": 0.1791931688785553, "step": 5203 }, { "epoch": 0.804794123332689, "grad_norm": 4.993732929229736, "learning_rate": 4.065185015465689e-06, "logits/chosen": 7.348865509033203, "logits/rejected": 8.633148193359375, "logps/chosen": -189.44631958007812, "logps/rejected": -194.25660705566406, "loss": 0.6801, "rewards/accuracies": 0.5, "rewards/chosen": 0.07526562362909317, "rewards/margins": 0.07058967649936676, "rewards/rejected": 0.004675954580307007, "step": 5204 }, { "epoch": 0.8049487724724531, "grad_norm": 4.015836238861084, "learning_rate": 4.064898613816016e-06, "logits/chosen": 15.522954940795898, "logits/rejected": 11.1107759475708, "logps/chosen": -212.26478576660156, "logps/rejected": -184.9337158203125, "loss": 0.6158, "rewards/accuracies": 0.5, "rewards/chosen": 0.28311431407928467, "rewards/margins": 0.28049972653388977, "rewards/rejected": 0.002614624798297882, "step": 5205 }, { "epoch": 0.8051034216122173, "grad_norm": 6.178126811981201, "learning_rate": 4.064612212166342e-06, "logits/chosen": 17.10051155090332, "logits/rejected": 9.199691772460938, "logps/chosen": -404.9537048339844, "logps/rejected": -358.705810546875, "loss": 0.7296, "rewards/accuracies": 0.375, "rewards/chosen": 0.2712125778198242, "rewards/margins": 0.037758439779281616, "rewards/rejected": 0.2334541529417038, "step": 5206 }, { "epoch": 0.8052580707519814, "grad_norm": 5.445850372314453, "learning_rate": 4.0643258105166685e-06, "logits/chosen": 15.465826034545898, "logits/rejected": 11.676907539367676, "logps/chosen": -260.40869140625, "logps/rejected": -220.15194702148438, "loss": 0.7879, "rewards/accuracies": 0.5, "rewards/chosen": -0.1517251878976822, "rewards/margins": -0.15694619715213776, "rewards/rejected": 0.005220990628004074, "step": 5207 }, { "epoch": 0.8054127198917456, "grad_norm": 10.264176368713379, "learning_rate": 4.064039408866995e-06, "logits/chosen": 10.345460891723633, "logits/rejected": 10.223146438598633, "logps/chosen": -510.39801025390625, "logps/rejected": -551.498291015625, "loss": 0.7476, "rewards/accuracies": 0.5, "rewards/chosen": 0.11689186096191406, "rewards/margins": -0.03275033086538315, "rewards/rejected": 0.1496421843767166, "step": 5208 }, { "epoch": 0.8055673690315097, "grad_norm": 5.232631206512451, "learning_rate": 4.063753007217322e-06, "logits/chosen": 12.069369316101074, "logits/rejected": 9.627613067626953, "logps/chosen": -326.07733154296875, "logps/rejected": -250.61163330078125, "loss": 0.5366, "rewards/accuracies": 0.625, "rewards/chosen": 0.26227816939353943, "rewards/margins": 0.47139719128608704, "rewards/rejected": -0.2091190218925476, "step": 5209 }, { "epoch": 0.8057220181712739, "grad_norm": 4.837009429931641, "learning_rate": 4.0634666055676485e-06, "logits/chosen": 9.050187110900879, "logits/rejected": 8.561442375183105, "logps/chosen": -376.6332092285156, "logps/rejected": -324.16778564453125, "loss": 0.4906, "rewards/accuracies": 0.875, "rewards/chosen": 0.6458479166030884, "rewards/margins": 0.6174514889717102, "rewards/rejected": 0.02839641273021698, "step": 5210 }, { "epoch": 0.8058766673110381, "grad_norm": 8.03756332397461, "learning_rate": 4.063180203917974e-06, "logits/chosen": 16.023622512817383, "logits/rejected": 9.541019439697266, "logps/chosen": -312.97705078125, "logps/rejected": -287.52032470703125, "loss": 0.5909, "rewards/accuracies": 0.625, "rewards/chosen": 0.0995698869228363, "rewards/margins": 0.44314491748809814, "rewards/rejected": -0.34357503056526184, "step": 5211 }, { "epoch": 0.8060313164508023, "grad_norm": 11.008404731750488, "learning_rate": 4.062893802268301e-06, "logits/chosen": 5.884028434753418, "logits/rejected": 8.004257202148438, "logps/chosen": -428.6747131347656, "logps/rejected": -402.5483093261719, "loss": 0.7213, "rewards/accuracies": 0.5, "rewards/chosen": -0.22617530822753906, "rewards/margins": 0.05461283028125763, "rewards/rejected": -0.2807881534099579, "step": 5212 }, { "epoch": 0.8061859655905664, "grad_norm": 5.181330680847168, "learning_rate": 4.0626074006186276e-06, "logits/chosen": 5.363044738769531, "logits/rejected": 10.80592155456543, "logps/chosen": -169.23484802246094, "logps/rejected": -206.7460479736328, "loss": 0.6371, "rewards/accuracies": 0.625, "rewards/chosen": 0.06880469620227814, "rewards/margins": 0.16563266515731812, "rewards/rejected": -0.09682796150445938, "step": 5213 }, { "epoch": 0.8063406147303306, "grad_norm": 6.119858264923096, "learning_rate": 4.062320998968954e-06, "logits/chosen": 9.56867790222168, "logits/rejected": 6.6196770668029785, "logps/chosen": -289.2655029296875, "logps/rejected": -191.4665069580078, "loss": 0.6223, "rewards/accuracies": 0.375, "rewards/chosen": -0.06160430610179901, "rewards/margins": 0.3181297481060028, "rewards/rejected": -0.379734069108963, "step": 5214 }, { "epoch": 0.8064952638700947, "grad_norm": 5.101420879364014, "learning_rate": 4.062034597319281e-06, "logits/chosen": 7.706490516662598, "logits/rejected": 7.361589431762695, "logps/chosen": -195.20339965820312, "logps/rejected": -166.02003479003906, "loss": 0.8156, "rewards/accuracies": 0.375, "rewards/chosen": 0.1350744217634201, "rewards/margins": -0.071823850274086, "rewards/rejected": 0.2068982720375061, "step": 5215 }, { "epoch": 0.8066499130098589, "grad_norm": 5.387447834014893, "learning_rate": 4.0617481956696075e-06, "logits/chosen": 13.040346145629883, "logits/rejected": 8.966412544250488, "logps/chosen": -241.52308654785156, "logps/rejected": -196.57492065429688, "loss": 0.6967, "rewards/accuracies": 0.375, "rewards/chosen": -0.006958387792110443, "rewards/margins": 0.011606879532337189, "rewards/rejected": -0.01856527104973793, "step": 5216 }, { "epoch": 0.806804562149623, "grad_norm": 5.955066204071045, "learning_rate": 4.061461794019934e-06, "logits/chosen": 10.688066482543945, "logits/rejected": 6.326800346374512, "logps/chosen": -287.8813171386719, "logps/rejected": -230.87631225585938, "loss": 0.6571, "rewards/accuracies": 0.5, "rewards/chosen": -0.34742462635040283, "rewards/margins": 0.12873634696006775, "rewards/rejected": -0.47616100311279297, "step": 5217 }, { "epoch": 0.8069592112893872, "grad_norm": 5.2139506340026855, "learning_rate": 4.06117539237026e-06, "logits/chosen": 12.241575241088867, "logits/rejected": 10.76771354675293, "logps/chosen": -346.0780334472656, "logps/rejected": -317.2270202636719, "loss": 0.6177, "rewards/accuracies": 0.625, "rewards/chosen": 0.056915730237960815, "rewards/margins": 0.30003562569618225, "rewards/rejected": -0.24311991035938263, "step": 5218 }, { "epoch": 0.8071138604291513, "grad_norm": 3.31199049949646, "learning_rate": 4.060888990720587e-06, "logits/chosen": 8.206533432006836, "logits/rejected": 9.445413589477539, "logps/chosen": -111.2761459350586, "logps/rejected": -130.78717041015625, "loss": 0.68, "rewards/accuracies": 0.5, "rewards/chosen": -0.4808102250099182, "rewards/margins": 0.10177576541900635, "rewards/rejected": -0.5825859904289246, "step": 5219 }, { "epoch": 0.8072685095689155, "grad_norm": 6.173686981201172, "learning_rate": 4.060602589070913e-06, "logits/chosen": 6.404994010925293, "logits/rejected": 4.3115410804748535, "logps/chosen": -341.13824462890625, "logps/rejected": -216.08740234375, "loss": 0.6169, "rewards/accuracies": 0.75, "rewards/chosen": 0.1019790843129158, "rewards/margins": 0.29108184576034546, "rewards/rejected": -0.18910276889801025, "step": 5220 }, { "epoch": 0.8074231587086796, "grad_norm": 6.065658092498779, "learning_rate": 4.06031618742124e-06, "logits/chosen": 10.73746395111084, "logits/rejected": 4.924856185913086, "logps/chosen": -339.2300720214844, "logps/rejected": -298.42193603515625, "loss": 0.4699, "rewards/accuracies": 0.75, "rewards/chosen": 0.23669365048408508, "rewards/margins": 0.6594338417053223, "rewards/rejected": -0.42274025082588196, "step": 5221 }, { "epoch": 0.8075778078484438, "grad_norm": 5.646334648132324, "learning_rate": 4.060029785771567e-06, "logits/chosen": 15.446451187133789, "logits/rejected": 10.548297882080078, "logps/chosen": -310.388916015625, "logps/rejected": -259.560302734375, "loss": 0.5496, "rewards/accuracies": 0.625, "rewards/chosen": 0.3516327738761902, "rewards/margins": 0.5002945065498352, "rewards/rejected": -0.14866170287132263, "step": 5222 }, { "epoch": 0.8077324569882081, "grad_norm": 4.586674213409424, "learning_rate": 4.059743384121893e-06, "logits/chosen": 15.566143035888672, "logits/rejected": 6.519749164581299, "logps/chosen": -305.5724182128906, "logps/rejected": -173.30209350585938, "loss": 0.4968, "rewards/accuracies": 0.875, "rewards/chosen": 0.22559680044651031, "rewards/margins": 0.49867165088653564, "rewards/rejected": -0.2730748653411865, "step": 5223 }, { "epoch": 0.8078871061279722, "grad_norm": 5.123507499694824, "learning_rate": 4.05945698247222e-06, "logits/chosen": 10.788810729980469, "logits/rejected": 12.295494079589844, "logps/chosen": -246.04440307617188, "logps/rejected": -208.91676330566406, "loss": 0.6507, "rewards/accuracies": 0.625, "rewards/chosen": 0.06893173605203629, "rewards/margins": 0.13411402702331543, "rewards/rejected": -0.06518229842185974, "step": 5224 }, { "epoch": 0.8080417552677364, "grad_norm": 5.3833465576171875, "learning_rate": 4.059170580822546e-06, "logits/chosen": 15.8935546875, "logits/rejected": 8.98311996459961, "logps/chosen": -345.6173095703125, "logps/rejected": -268.0266418457031, "loss": 0.6135, "rewards/accuracies": 0.5, "rewards/chosen": 0.2668483853340149, "rewards/margins": 0.32655489444732666, "rewards/rejected": -0.059706494212150574, "step": 5225 }, { "epoch": 0.8081964044075005, "grad_norm": 6.116227149963379, "learning_rate": 4.058884179172872e-06, "logits/chosen": 9.894168853759766, "logits/rejected": 6.375942707061768, "logps/chosen": -312.7122802734375, "logps/rejected": -237.5018310546875, "loss": 0.5712, "rewards/accuracies": 0.75, "rewards/chosen": 0.057060711085796356, "rewards/margins": 0.3607668876647949, "rewards/rejected": -0.30370616912841797, "step": 5226 }, { "epoch": 0.8083510535472647, "grad_norm": 4.162163257598877, "learning_rate": 4.058597777523199e-06, "logits/chosen": 14.81265640258789, "logits/rejected": 0.16877001523971558, "logps/chosen": -325.4465637207031, "logps/rejected": -185.43394470214844, "loss": 0.4337, "rewards/accuracies": 0.75, "rewards/chosen": -0.08990420401096344, "rewards/margins": 0.8772229552268982, "rewards/rejected": -0.9671271443367004, "step": 5227 }, { "epoch": 0.8085057026870288, "grad_norm": 7.506683349609375, "learning_rate": 4.058311375873526e-06, "logits/chosen": 8.362614631652832, "logits/rejected": 11.137846946716309, "logps/chosen": -246.98936462402344, "logps/rejected": -203.04798889160156, "loss": 0.6791, "rewards/accuracies": 0.625, "rewards/chosen": 0.015744924545288086, "rewards/margins": 0.12110400199890137, "rewards/rejected": -0.10535907745361328, "step": 5228 }, { "epoch": 0.808660351826793, "grad_norm": 4.190384864807129, "learning_rate": 4.058024974223852e-06, "logits/chosen": 15.405750274658203, "logits/rejected": 14.223522186279297, "logps/chosen": -324.0042724609375, "logps/rejected": -241.855224609375, "loss": 0.4907, "rewards/accuracies": 0.75, "rewards/chosen": 0.07716482877731323, "rewards/margins": 0.6256026029586792, "rewards/rejected": -0.548437774181366, "step": 5229 }, { "epoch": 0.8088150009665571, "grad_norm": 13.006925582885742, "learning_rate": 4.057738572574179e-06, "logits/chosen": 10.146551132202148, "logits/rejected": 7.837651252746582, "logps/chosen": -316.58941650390625, "logps/rejected": -239.23733520507812, "loss": 1.0397, "rewards/accuracies": 0.375, "rewards/chosen": -0.233009472489357, "rewards/margins": -0.43496549129486084, "rewards/rejected": 0.20195600390434265, "step": 5230 }, { "epoch": 0.8089696501063213, "grad_norm": 6.152152061462402, "learning_rate": 4.057452170924505e-06, "logits/chosen": 8.831681251525879, "logits/rejected": 2.48677396774292, "logps/chosen": -428.4947814941406, "logps/rejected": -440.7967224121094, "loss": 0.5935, "rewards/accuracies": 0.875, "rewards/chosen": 0.35941529273986816, "rewards/margins": 0.5527947545051575, "rewards/rejected": -0.1933795064687729, "step": 5231 }, { "epoch": 0.8091242992460854, "grad_norm": 5.133934497833252, "learning_rate": 4.057165769274831e-06, "logits/chosen": 10.606398582458496, "logits/rejected": 4.552722930908203, "logps/chosen": -274.0418701171875, "logps/rejected": -207.39620971679688, "loss": 0.5806, "rewards/accuracies": 0.75, "rewards/chosen": 0.29895949363708496, "rewards/margins": 0.3206798732280731, "rewards/rejected": -0.021720364689826965, "step": 5232 }, { "epoch": 0.8092789483858496, "grad_norm": 5.302076816558838, "learning_rate": 4.056879367625158e-06, "logits/chosen": 13.161714553833008, "logits/rejected": 3.288963794708252, "logps/chosen": -606.9154052734375, "logps/rejected": -284.2491760253906, "loss": 0.4936, "rewards/accuracies": 0.625, "rewards/chosen": 0.5333611369132996, "rewards/margins": 0.7942339181900024, "rewards/rejected": -0.2608727812767029, "step": 5233 }, { "epoch": 0.8094335975256137, "grad_norm": 4.196181297302246, "learning_rate": 4.056592965975485e-06, "logits/chosen": 8.693201065063477, "logits/rejected": 4.886995315551758, "logps/chosen": -333.12176513671875, "logps/rejected": -240.75408935546875, "loss": 0.4369, "rewards/accuracies": 0.875, "rewards/chosen": 0.4680570960044861, "rewards/margins": 0.6776407957077026, "rewards/rejected": -0.20958368480205536, "step": 5234 }, { "epoch": 0.8095882466653779, "grad_norm": 5.188414573669434, "learning_rate": 4.056306564325811e-06, "logits/chosen": 7.497532844543457, "logits/rejected": 9.022340774536133, "logps/chosen": -290.15472412109375, "logps/rejected": -330.5630187988281, "loss": 0.6783, "rewards/accuracies": 0.625, "rewards/chosen": 0.2860964834690094, "rewards/margins": 0.06554519385099411, "rewards/rejected": 0.22055131196975708, "step": 5235 }, { "epoch": 0.8097428958051421, "grad_norm": 19.002063751220703, "learning_rate": 4.056020162676137e-06, "logits/chosen": 3.7412750720977783, "logits/rejected": 8.126347541809082, "logps/chosen": -236.96688842773438, "logps/rejected": -321.1217041015625, "loss": 0.5616, "rewards/accuracies": 0.625, "rewards/chosen": 0.11160522699356079, "rewards/margins": 0.35723477602005005, "rewards/rejected": -0.24562956392765045, "step": 5236 }, { "epoch": 0.8098975449449063, "grad_norm": 9.314631462097168, "learning_rate": 4.055733761026464e-06, "logits/chosen": 11.819740295410156, "logits/rejected": 9.621925354003906, "logps/chosen": -459.09344482421875, "logps/rejected": -346.83538818359375, "loss": 0.6899, "rewards/accuracies": 0.625, "rewards/chosen": 0.2914165258407593, "rewards/margins": 0.2655884623527527, "rewards/rejected": 0.025828078389167786, "step": 5237 }, { "epoch": 0.8100521940846704, "grad_norm": 5.602231025695801, "learning_rate": 4.0554473593767904e-06, "logits/chosen": 2.766555070877075, "logits/rejected": 4.2995147705078125, "logps/chosen": -227.91781616210938, "logps/rejected": -281.0916748046875, "loss": 0.875, "rewards/accuracies": 0.5, "rewards/chosen": -0.061980172991752625, "rewards/margins": 0.04555127024650574, "rewards/rejected": -0.10753144323825836, "step": 5238 }, { "epoch": 0.8102068432244346, "grad_norm": 5.000399589538574, "learning_rate": 4.055160957727117e-06, "logits/chosen": 7.695864200592041, "logits/rejected": 6.486391067504883, "logps/chosen": -149.65419006347656, "logps/rejected": -163.77008056640625, "loss": 0.625, "rewards/accuracies": 0.625, "rewards/chosen": -0.35791176557540894, "rewards/margins": 0.18745939433574677, "rewards/rejected": -0.5453711748123169, "step": 5239 }, { "epoch": 0.8103614923641987, "grad_norm": 4.8953423500061035, "learning_rate": 4.054874556077443e-06, "logits/chosen": 9.88211727142334, "logits/rejected": 3.8782706260681152, "logps/chosen": -243.38905334472656, "logps/rejected": -215.76162719726562, "loss": 0.6898, "rewards/accuracies": 0.625, "rewards/chosen": 0.2254931628704071, "rewards/margins": 0.2197830080986023, "rewards/rejected": 0.0057101771235466, "step": 5240 }, { "epoch": 0.8105161415039629, "grad_norm": 4.812070369720459, "learning_rate": 4.0545881544277696e-06, "logits/chosen": 12.790386199951172, "logits/rejected": 8.769579887390137, "logps/chosen": -267.0860595703125, "logps/rejected": -227.5906982421875, "loss": 0.6181, "rewards/accuracies": 0.375, "rewards/chosen": -0.02703189104795456, "rewards/margins": 0.3609905242919922, "rewards/rejected": -0.38802242279052734, "step": 5241 }, { "epoch": 0.810670790643727, "grad_norm": 8.099353790283203, "learning_rate": 4.054301752778096e-06, "logits/chosen": 7.10730504989624, "logits/rejected": 4.798792839050293, "logps/chosen": -346.2955627441406, "logps/rejected": -336.1260070800781, "loss": 0.6816, "rewards/accuracies": 0.625, "rewards/chosen": 0.21830779314041138, "rewards/margins": 0.14087443053722382, "rewards/rejected": 0.07743337005376816, "step": 5242 }, { "epoch": 0.8108254397834912, "grad_norm": 5.594520092010498, "learning_rate": 4.054015351128423e-06, "logits/chosen": 9.29182243347168, "logits/rejected": 4.79741096496582, "logps/chosen": -268.1068115234375, "logps/rejected": -225.3123321533203, "loss": 0.6274, "rewards/accuracies": 0.75, "rewards/chosen": 0.11372890323400497, "rewards/margins": 0.37134867906570435, "rewards/rejected": -0.2576197385787964, "step": 5243 }, { "epoch": 0.8109800889232553, "grad_norm": 6.085484981536865, "learning_rate": 4.053728949478749e-06, "logits/chosen": 8.205678939819336, "logits/rejected": 8.612539291381836, "logps/chosen": -322.44305419921875, "logps/rejected": -254.89060974121094, "loss": 0.6224, "rewards/accuracies": 0.5, "rewards/chosen": -0.40665608644485474, "rewards/margins": 0.30481481552124023, "rewards/rejected": -0.711470901966095, "step": 5244 }, { "epoch": 0.8111347380630195, "grad_norm": 12.219535827636719, "learning_rate": 4.053442547829075e-06, "logits/chosen": 8.517681121826172, "logits/rejected": 8.036446571350098, "logps/chosen": -318.8990173339844, "logps/rejected": -342.68695068359375, "loss": 0.4727, "rewards/accuracies": 0.75, "rewards/chosen": 0.030153095722198486, "rewards/margins": 0.7724595069885254, "rewards/rejected": -0.7423064708709717, "step": 5245 }, { "epoch": 0.8112893872027837, "grad_norm": 6.658076286315918, "learning_rate": 4.053156146179402e-06, "logits/chosen": 11.386802673339844, "logits/rejected": 2.051666021347046, "logps/chosen": -341.3597106933594, "logps/rejected": -257.64520263671875, "loss": 0.5587, "rewards/accuracies": 0.75, "rewards/chosen": 0.3172401487827301, "rewards/margins": 0.3857433497905731, "rewards/rejected": -0.06850320100784302, "step": 5246 }, { "epoch": 0.8114440363425478, "grad_norm": 3.927082061767578, "learning_rate": 4.052869744529729e-06, "logits/chosen": 11.17611312866211, "logits/rejected": 7.619402885437012, "logps/chosen": -243.8050079345703, "logps/rejected": -237.6566925048828, "loss": 0.5277, "rewards/accuracies": 0.875, "rewards/chosen": 0.3991936147212982, "rewards/margins": 0.4035755395889282, "rewards/rejected": -0.004381915554404259, "step": 5247 }, { "epoch": 0.811598685482312, "grad_norm": 4.353519916534424, "learning_rate": 4.052583342880055e-06, "logits/chosen": 8.749837875366211, "logits/rejected": 9.650714874267578, "logps/chosen": -299.412109375, "logps/rejected": -283.8265380859375, "loss": 0.5378, "rewards/accuracies": 0.5, "rewards/chosen": 0.2868899703025818, "rewards/margins": 0.6394704580307007, "rewards/rejected": -0.3525804579257965, "step": 5248 }, { "epoch": 0.8117533346220762, "grad_norm": 5.908530235290527, "learning_rate": 4.052296941230382e-06, "logits/chosen": 0.9579715728759766, "logits/rejected": 5.862154960632324, "logps/chosen": -170.54190063476562, "logps/rejected": -192.50701904296875, "loss": 0.7956, "rewards/accuracies": 0.625, "rewards/chosen": -0.3323361277580261, "rewards/margins": -0.0383341908454895, "rewards/rejected": -0.29400190711021423, "step": 5249 }, { "epoch": 0.8119079837618404, "grad_norm": 6.58998441696167, "learning_rate": 4.0520105395807086e-06, "logits/chosen": 10.976187705993652, "logits/rejected": 9.866473197937012, "logps/chosen": -395.2896423339844, "logps/rejected": -298.64959716796875, "loss": 0.7724, "rewards/accuracies": 0.625, "rewards/chosen": 0.15527746081352234, "rewards/margins": 0.16969317197799683, "rewards/rejected": -0.014415740966796875, "step": 5250 }, { "epoch": 0.8120626329016045, "grad_norm": 3.6378870010375977, "learning_rate": 4.051724137931034e-06, "logits/chosen": 9.187033653259277, "logits/rejected": 0.9402756094932556, "logps/chosen": -189.7830810546875, "logps/rejected": -110.6497802734375, "loss": 0.4986, "rewards/accuracies": 0.875, "rewards/chosen": 0.04851336032152176, "rewards/margins": 0.49621251225471497, "rewards/rejected": -0.4476991593837738, "step": 5251 }, { "epoch": 0.8122172820413687, "grad_norm": 3.0564892292022705, "learning_rate": 4.051437736281361e-06, "logits/chosen": 11.147770881652832, "logits/rejected": 5.059230327606201, "logps/chosen": -132.44859313964844, "logps/rejected": -95.91191101074219, "loss": 0.5914, "rewards/accuracies": 0.75, "rewards/chosen": 0.3794665038585663, "rewards/margins": 0.28059878945350647, "rewards/rejected": 0.09886772930622101, "step": 5252 }, { "epoch": 0.8123719311811328, "grad_norm": 4.47088098526001, "learning_rate": 4.051151334631688e-06, "logits/chosen": 11.804327011108398, "logits/rejected": 5.642199516296387, "logps/chosen": -290.63037109375, "logps/rejected": -205.98760986328125, "loss": 0.6439, "rewards/accuracies": 0.75, "rewards/chosen": 0.30821478366851807, "rewards/margins": 0.27451837062835693, "rewards/rejected": 0.03369641304016113, "step": 5253 }, { "epoch": 0.812526580320897, "grad_norm": 5.9306840896606445, "learning_rate": 4.050864932982014e-06, "logits/chosen": 6.84375, "logits/rejected": 9.897790908813477, "logps/chosen": -157.9860382080078, "logps/rejected": -241.5548858642578, "loss": 0.753, "rewards/accuracies": 0.625, "rewards/chosen": -0.2498202621936798, "rewards/margins": 0.013972923159599304, "rewards/rejected": -0.26379314064979553, "step": 5254 }, { "epoch": 0.8126812294606611, "grad_norm": 5.209493637084961, "learning_rate": 4.050578531332341e-06, "logits/chosen": 11.861288070678711, "logits/rejected": 10.334795951843262, "logps/chosen": -292.6259765625, "logps/rejected": -265.9131164550781, "loss": 0.67, "rewards/accuracies": 0.625, "rewards/chosen": 0.009715085849165916, "rewards/margins": 0.07515254616737366, "rewards/rejected": -0.06543746590614319, "step": 5255 }, { "epoch": 0.8128358786004253, "grad_norm": 3.3782143592834473, "learning_rate": 4.050292129682668e-06, "logits/chosen": 10.3798828125, "logits/rejected": 11.115757942199707, "logps/chosen": -127.02919006347656, "logps/rejected": -146.3394317626953, "loss": 0.6686, "rewards/accuracies": 0.5, "rewards/chosen": 0.02231866866350174, "rewards/margins": 0.11710226535797119, "rewards/rejected": -0.09478359669446945, "step": 5256 }, { "epoch": 0.8129905277401894, "grad_norm": 5.4266252517700195, "learning_rate": 4.050005728032994e-06, "logits/chosen": 10.638373374938965, "logits/rejected": 5.4431352615356445, "logps/chosen": -286.39208984375, "logps/rejected": -243.83013916015625, "loss": 0.4835, "rewards/accuracies": 0.75, "rewards/chosen": 0.24860036373138428, "rewards/margins": 0.7253341674804688, "rewards/rejected": -0.4767337739467621, "step": 5257 }, { "epoch": 0.8131451768799536, "grad_norm": 18.151256561279297, "learning_rate": 4.04971932638332e-06, "logits/chosen": 6.482385635375977, "logits/rejected": 11.082147598266602, "logps/chosen": -245.0753173828125, "logps/rejected": -222.33807373046875, "loss": 0.7444, "rewards/accuracies": 0.25, "rewards/chosen": 0.28495582938194275, "rewards/margins": 0.008654996752738953, "rewards/rejected": 0.2763008177280426, "step": 5258 }, { "epoch": 0.8132998260197177, "grad_norm": 5.113682746887207, "learning_rate": 4.049432924733647e-06, "logits/chosen": 10.28695011138916, "logits/rejected": 9.846305847167969, "logps/chosen": -250.01727294921875, "logps/rejected": -239.14910888671875, "loss": 0.6061, "rewards/accuracies": 0.75, "rewards/chosen": 0.29099470376968384, "rewards/margins": 0.31740647554397583, "rewards/rejected": -0.02641179971396923, "step": 5259 }, { "epoch": 0.8134544751594819, "grad_norm": 4.1112775802612305, "learning_rate": 4.049146523083973e-06, "logits/chosen": 12.012083053588867, "logits/rejected": 7.357929229736328, "logps/chosen": -233.00326538085938, "logps/rejected": -156.93759155273438, "loss": 0.4742, "rewards/accuracies": 0.875, "rewards/chosen": 0.2811274528503418, "rewards/margins": 0.5422151684761047, "rewards/rejected": -0.26108771562576294, "step": 5260 }, { "epoch": 0.813609124299246, "grad_norm": 7.474087715148926, "learning_rate": 4.0488601214343e-06, "logits/chosen": 13.350547790527344, "logits/rejected": 9.982833862304688, "logps/chosen": -459.61865234375, "logps/rejected": -401.1523742675781, "loss": 0.7776, "rewards/accuracies": 0.625, "rewards/chosen": 0.3623935580253601, "rewards/margins": 0.008119970560073853, "rewards/rejected": 0.35427361726760864, "step": 5261 }, { "epoch": 0.8137637734390103, "grad_norm": 5.812631130218506, "learning_rate": 4.048573719784627e-06, "logits/chosen": 9.639986991882324, "logits/rejected": 7.043438911437988, "logps/chosen": -343.6244812011719, "logps/rejected": -254.70755004882812, "loss": 0.7194, "rewards/accuracies": 0.375, "rewards/chosen": 0.12544479966163635, "rewards/margins": 0.026358749717473984, "rewards/rejected": 0.09908603876829147, "step": 5262 }, { "epoch": 0.8139184225787744, "grad_norm": 3.132899045944214, "learning_rate": 4.048287318134953e-06, "logits/chosen": 10.487598419189453, "logits/rejected": 5.5532546043396, "logps/chosen": -271.3434753417969, "logps/rejected": -232.08139038085938, "loss": 0.3977, "rewards/accuracies": 0.75, "rewards/chosen": 0.4800993800163269, "rewards/margins": 0.8519445061683655, "rewards/rejected": -0.37184518575668335, "step": 5263 }, { "epoch": 0.8140730717185386, "grad_norm": 3.75488018989563, "learning_rate": 4.048000916485279e-06, "logits/chosen": 14.997414588928223, "logits/rejected": 6.509361267089844, "logps/chosen": -348.81103515625, "logps/rejected": -240.2603759765625, "loss": 0.4684, "rewards/accuracies": 0.75, "rewards/chosen": 0.32040759921073914, "rewards/margins": 0.6518306732177734, "rewards/rejected": -0.3314230740070343, "step": 5264 }, { "epoch": 0.8142277208583028, "grad_norm": 4.084811687469482, "learning_rate": 4.047714514835606e-06, "logits/chosen": 7.143255710601807, "logits/rejected": 2.140012741088867, "logps/chosen": -155.64756774902344, "logps/rejected": -101.34384155273438, "loss": 0.6055, "rewards/accuracies": 0.75, "rewards/chosen": -0.033007651567459106, "rewards/margins": 0.2969239354133606, "rewards/rejected": -0.3299315869808197, "step": 5265 }, { "epoch": 0.8143823699980669, "grad_norm": 5.536152362823486, "learning_rate": 4.0474281131859324e-06, "logits/chosen": 16.83504295349121, "logits/rejected": 8.692803382873535, "logps/chosen": -390.17431640625, "logps/rejected": -261.44927978515625, "loss": 0.6412, "rewards/accuracies": 0.75, "rewards/chosen": 0.13782596588134766, "rewards/margins": 0.1725536435842514, "rewards/rejected": -0.03472766652703285, "step": 5266 }, { "epoch": 0.814537019137831, "grad_norm": 7.197746276855469, "learning_rate": 4.047141711536259e-06, "logits/chosen": 11.130281448364258, "logits/rejected": 5.818378925323486, "logps/chosen": -465.3574523925781, "logps/rejected": -483.74114990234375, "loss": 0.5644, "rewards/accuracies": 0.75, "rewards/chosen": 0.20304261147975922, "rewards/margins": 0.5159529447555542, "rewards/rejected": -0.31291037797927856, "step": 5267 }, { "epoch": 0.8146916682775952, "grad_norm": 5.068341255187988, "learning_rate": 4.046855309886586e-06, "logits/chosen": 7.059698581695557, "logits/rejected": 8.794771194458008, "logps/chosen": -202.42514038085938, "logps/rejected": -188.94395446777344, "loss": 0.6312, "rewards/accuracies": 0.75, "rewards/chosen": -0.06027105450630188, "rewards/margins": 0.18109092116355896, "rewards/rejected": -0.24136200547218323, "step": 5268 }, { "epoch": 0.8148463174173594, "grad_norm": 17.070634841918945, "learning_rate": 4.046568908236912e-06, "logits/chosen": 9.194201469421387, "logits/rejected": 10.488310813903809, "logps/chosen": -214.42385864257812, "logps/rejected": -300.6953125, "loss": 0.6785, "rewards/accuracies": 0.375, "rewards/chosen": -0.13614827394485474, "rewards/margins": 0.292192280292511, "rewards/rejected": -0.42834052443504333, "step": 5269 }, { "epoch": 0.8150009665571235, "grad_norm": 4.295851230621338, "learning_rate": 4.046282506587238e-06, "logits/chosen": 12.289249420166016, "logits/rejected": 9.12799072265625, "logps/chosen": -284.0588073730469, "logps/rejected": -160.14846801757812, "loss": 0.6293, "rewards/accuracies": 0.875, "rewards/chosen": 0.36966472864151, "rewards/margins": 0.31363534927368164, "rewards/rejected": 0.056029416620731354, "step": 5270 }, { "epoch": 0.8151556156968877, "grad_norm": 6.943835258483887, "learning_rate": 4.045996104937565e-06, "logits/chosen": 8.417034149169922, "logits/rejected": 8.741055488586426, "logps/chosen": -347.30682373046875, "logps/rejected": -366.083740234375, "loss": 0.7137, "rewards/accuracies": 0.5, "rewards/chosen": 0.3768126368522644, "rewards/margins": 0.10149691253900528, "rewards/rejected": 0.27531570196151733, "step": 5271 }, { "epoch": 0.8153102648366518, "grad_norm": 6.641474723815918, "learning_rate": 4.0457097032878915e-06, "logits/chosen": 12.882699966430664, "logits/rejected": 11.63312816619873, "logps/chosen": -232.55972290039062, "logps/rejected": -255.19406127929688, "loss": 0.8741, "rewards/accuracies": 0.25, "rewards/chosen": -0.10579453408718109, "rewards/margins": -0.3159026801586151, "rewards/rejected": 0.21010813117027283, "step": 5272 }, { "epoch": 0.815464913976416, "grad_norm": 6.892787456512451, "learning_rate": 4.045423301638218e-06, "logits/chosen": 12.944417953491211, "logits/rejected": 11.377227783203125, "logps/chosen": -389.84710693359375, "logps/rejected": -344.00738525390625, "loss": 0.6537, "rewards/accuracies": 0.375, "rewards/chosen": 0.08367764949798584, "rewards/margins": 0.3446391820907593, "rewards/rejected": -0.26096153259277344, "step": 5273 }, { "epoch": 0.8156195631161801, "grad_norm": 7.29603910446167, "learning_rate": 4.045136899988544e-06, "logits/chosen": 7.714539527893066, "logits/rejected": 13.643836975097656, "logps/chosen": -235.70118713378906, "logps/rejected": -356.7230224609375, "loss": 1.0158, "rewards/accuracies": 0.375, "rewards/chosen": 0.09182576835155487, "rewards/margins": -0.4734768271446228, "rewards/rejected": 0.5653025507926941, "step": 5274 }, { "epoch": 0.8157742122559444, "grad_norm": 4.280092239379883, "learning_rate": 4.044850498338871e-06, "logits/chosen": 13.5042724609375, "logits/rejected": 10.427685737609863, "logps/chosen": -294.9332580566406, "logps/rejected": -253.32485961914062, "loss": 0.5618, "rewards/accuracies": 0.875, "rewards/chosen": 0.13593292236328125, "rewards/margins": 0.4069046974182129, "rewards/rejected": -0.27097177505493164, "step": 5275 }, { "epoch": 0.8159288613957085, "grad_norm": 4.629084587097168, "learning_rate": 4.044564096689197e-06, "logits/chosen": 9.6305570602417, "logits/rejected": 9.9197359085083, "logps/chosen": -396.084716796875, "logps/rejected": -289.13482666015625, "loss": 0.5654, "rewards/accuracies": 0.875, "rewards/chosen": 0.4238523244857788, "rewards/margins": 0.39534714818000793, "rewards/rejected": 0.028505191206932068, "step": 5276 }, { "epoch": 0.8160835105354727, "grad_norm": 6.249995708465576, "learning_rate": 4.044277695039524e-06, "logits/chosen": 7.059247970581055, "logits/rejected": 7.480415344238281, "logps/chosen": -309.423583984375, "logps/rejected": -180.4270477294922, "loss": 0.9654, "rewards/accuracies": 0.375, "rewards/chosen": -0.32799047231674194, "rewards/margins": -0.11884063482284546, "rewards/rejected": -0.2091498225927353, "step": 5277 }, { "epoch": 0.8162381596752368, "grad_norm": 3.9495480060577393, "learning_rate": 4.04399129338985e-06, "logits/chosen": 14.422968864440918, "logits/rejected": 5.2372660636901855, "logps/chosen": -439.24542236328125, "logps/rejected": -214.88162231445312, "loss": 0.3928, "rewards/accuracies": 0.875, "rewards/chosen": 0.4407007098197937, "rewards/margins": 1.2521064281463623, "rewards/rejected": -0.8114056587219238, "step": 5278 }, { "epoch": 0.816392808815001, "grad_norm": 6.598100662231445, "learning_rate": 4.043704891740176e-06, "logits/chosen": 9.808004379272461, "logits/rejected": 8.550536155700684, "logps/chosen": -219.38681030273438, "logps/rejected": -171.9385986328125, "loss": 0.934, "rewards/accuracies": 0.25, "rewards/chosen": -0.7619840502738953, "rewards/margins": -0.3532869815826416, "rewards/rejected": -0.40869706869125366, "step": 5279 }, { "epoch": 0.8165474579547651, "grad_norm": 4.516329765319824, "learning_rate": 4.043418490090503e-06, "logits/chosen": 9.034750938415527, "logits/rejected": 8.807994842529297, "logps/chosen": -255.7656707763672, "logps/rejected": -249.89437866210938, "loss": 0.4976, "rewards/accuracies": 0.875, "rewards/chosen": 0.14227935671806335, "rewards/margins": 0.6736327409744263, "rewards/rejected": -0.5313533544540405, "step": 5280 }, { "epoch": 0.8167021070945293, "grad_norm": 3.921006202697754, "learning_rate": 4.04313208844083e-06, "logits/chosen": 11.465631484985352, "logits/rejected": 7.1311869621276855, "logps/chosen": -256.7264709472656, "logps/rejected": -197.33151245117188, "loss": 0.4946, "rewards/accuracies": 0.875, "rewards/chosen": 0.35361552238464355, "rewards/margins": 0.5205506086349487, "rewards/rejected": -0.1669350564479828, "step": 5281 }, { "epoch": 0.8168567562342934, "grad_norm": 4.306658744812012, "learning_rate": 4.042845686791156e-06, "logits/chosen": 14.168390274047852, "logits/rejected": 13.578243255615234, "logps/chosen": -274.8170166015625, "logps/rejected": -331.6337585449219, "loss": 0.5469, "rewards/accuracies": 0.625, "rewards/chosen": 0.3678053617477417, "rewards/margins": 0.3964787423610687, "rewards/rejected": -0.02867337316274643, "step": 5282 }, { "epoch": 0.8170114053740576, "grad_norm": 6.740323066711426, "learning_rate": 4.042559285141483e-06, "logits/chosen": 18.647960662841797, "logits/rejected": 11.877840995788574, "logps/chosen": -392.2648010253906, "logps/rejected": -259.07025146484375, "loss": 0.7373, "rewards/accuracies": 0.5, "rewards/chosen": -0.08635292947292328, "rewards/margins": 0.15699368715286255, "rewards/rejected": -0.24334661662578583, "step": 5283 }, { "epoch": 0.8171660545138217, "grad_norm": 4.946192264556885, "learning_rate": 4.042272883491809e-06, "logits/chosen": 10.802740097045898, "logits/rejected": 8.902528762817383, "logps/chosen": -293.80413818359375, "logps/rejected": -310.8073425292969, "loss": 0.5727, "rewards/accuracies": 0.75, "rewards/chosen": 0.23894403874874115, "rewards/margins": 0.35257089138031006, "rewards/rejected": -0.1136268675327301, "step": 5284 }, { "epoch": 0.8173207036535859, "grad_norm": 5.884267807006836, "learning_rate": 4.041986481842135e-06, "logits/chosen": 14.318381309509277, "logits/rejected": 8.124183654785156, "logps/chosen": -250.32835388183594, "logps/rejected": -191.47222900390625, "loss": 0.6961, "rewards/accuracies": 0.625, "rewards/chosen": 0.015008017420768738, "rewards/margins": 0.2517485022544861, "rewards/rejected": -0.23674052953720093, "step": 5285 }, { "epoch": 0.81747535279335, "grad_norm": 5.399677753448486, "learning_rate": 4.041700080192462e-06, "logits/chosen": 11.85676097869873, "logits/rejected": 11.586560249328613, "logps/chosen": -219.27130126953125, "logps/rejected": -271.974609375, "loss": 0.6984, "rewards/accuracies": 0.5, "rewards/chosen": 0.17573201656341553, "rewards/margins": 0.2112746238708496, "rewards/rejected": -0.03554264083504677, "step": 5286 }, { "epoch": 0.8176300019331142, "grad_norm": 4.5546464920043945, "learning_rate": 4.041413678542789e-06, "logits/chosen": 4.765160083770752, "logits/rejected": 4.182767868041992, "logps/chosen": -184.91705322265625, "logps/rejected": -154.1429443359375, "loss": 0.8115, "rewards/accuracies": 0.375, "rewards/chosen": -0.21078996360301971, "rewards/margins": 0.0438825786113739, "rewards/rejected": -0.2546725571155548, "step": 5287 }, { "epoch": 0.8177846510728785, "grad_norm": 5.755075454711914, "learning_rate": 4.041127276893115e-06, "logits/chosen": 6.5628228187561035, "logits/rejected": 1.8516666889190674, "logps/chosen": -266.8765563964844, "logps/rejected": -285.1120910644531, "loss": 0.6808, "rewards/accuracies": 0.75, "rewards/chosen": -0.1825660765171051, "rewards/margins": 0.2668920159339905, "rewards/rejected": -0.44945812225341797, "step": 5288 }, { "epoch": 0.8179393002126426, "grad_norm": 4.75536584854126, "learning_rate": 4.040840875243442e-06, "logits/chosen": 14.11911678314209, "logits/rejected": 4.839972496032715, "logps/chosen": -297.9878845214844, "logps/rejected": -213.57167053222656, "loss": 0.6424, "rewards/accuracies": 0.625, "rewards/chosen": 0.11523404717445374, "rewards/margins": 0.14415527880191803, "rewards/rejected": -0.028921235352754593, "step": 5289 }, { "epoch": 0.8180939493524068, "grad_norm": 5.634769916534424, "learning_rate": 4.040554473593768e-06, "logits/chosen": 12.931995391845703, "logits/rejected": 8.381753921508789, "logps/chosen": -280.044189453125, "logps/rejected": -226.84364318847656, "loss": 0.692, "rewards/accuracies": 0.625, "rewards/chosen": -0.11437497287988663, "rewards/margins": 0.10387403517961502, "rewards/rejected": -0.21824902296066284, "step": 5290 }, { "epoch": 0.8182485984921709, "grad_norm": 7.441853046417236, "learning_rate": 4.0402680719440945e-06, "logits/chosen": 6.6387619972229, "logits/rejected": 7.478278160095215, "logps/chosen": -279.8251953125, "logps/rejected": -294.8862609863281, "loss": 0.9915, "rewards/accuracies": 0.5, "rewards/chosen": 0.013854900375008583, "rewards/margins": -0.3584097623825073, "rewards/rejected": 0.37226468324661255, "step": 5291 }, { "epoch": 0.8184032476319351, "grad_norm": 5.081742286682129, "learning_rate": 4.039981670294421e-06, "logits/chosen": 9.369760513305664, "logits/rejected": 6.442256927490234, "logps/chosen": -217.87554931640625, "logps/rejected": -192.88534545898438, "loss": 0.596, "rewards/accuracies": 0.625, "rewards/chosen": -0.08706074953079224, "rewards/margins": 0.4786318838596344, "rewards/rejected": -0.565692663192749, "step": 5292 }, { "epoch": 0.8185578967716992, "grad_norm": 7.251420021057129, "learning_rate": 4.039695268644748e-06, "logits/chosen": 10.684796333312988, "logits/rejected": 5.992951393127441, "logps/chosen": -263.9416198730469, "logps/rejected": -199.82528686523438, "loss": 0.931, "rewards/accuracies": 0.375, "rewards/chosen": -0.32957836985588074, "rewards/margins": -0.3344048857688904, "rewards/rejected": 0.004826489835977554, "step": 5293 }, { "epoch": 0.8187125459114634, "grad_norm": 4.681700706481934, "learning_rate": 4.039408866995074e-06, "logits/chosen": 6.999462127685547, "logits/rejected": 3.8563921451568604, "logps/chosen": -220.25628662109375, "logps/rejected": -216.92608642578125, "loss": 0.5645, "rewards/accuracies": 0.625, "rewards/chosen": 0.43038418889045715, "rewards/margins": 0.35112860798835754, "rewards/rejected": 0.07925557345151901, "step": 5294 }, { "epoch": 0.8188671950512275, "grad_norm": 3.659858226776123, "learning_rate": 4.039122465345401e-06, "logits/chosen": 10.38975715637207, "logits/rejected": 10.337723731994629, "logps/chosen": -140.5755157470703, "logps/rejected": -112.56028747558594, "loss": 0.5598, "rewards/accuracies": 0.625, "rewards/chosen": -0.024062322452664375, "rewards/margins": 0.34360113739967346, "rewards/rejected": -0.3676634430885315, "step": 5295 }, { "epoch": 0.8190218441909917, "grad_norm": 7.283785343170166, "learning_rate": 4.038836063695728e-06, "logits/chosen": 10.32509994506836, "logits/rejected": 4.493505954742432, "logps/chosen": -315.95123291015625, "logps/rejected": -268.0956726074219, "loss": 0.5022, "rewards/accuracies": 0.75, "rewards/chosen": 0.26954537630081177, "rewards/margins": 0.47916051745414734, "rewards/rejected": -0.20961514115333557, "step": 5296 }, { "epoch": 0.8191764933307558, "grad_norm": 4.99456262588501, "learning_rate": 4.0385496620460535e-06, "logits/chosen": 7.853828430175781, "logits/rejected": 6.3915252685546875, "logps/chosen": -231.78533935546875, "logps/rejected": -265.7840576171875, "loss": 0.5818, "rewards/accuracies": 0.625, "rewards/chosen": 0.29572269320487976, "rewards/margins": 0.33043473958969116, "rewards/rejected": -0.034712053835392, "step": 5297 }, { "epoch": 0.81933114247052, "grad_norm": 5.044380187988281, "learning_rate": 4.03826326039638e-06, "logits/chosen": 10.707877159118652, "logits/rejected": 9.763927459716797, "logps/chosen": -169.720458984375, "logps/rejected": -169.6317901611328, "loss": 0.6595, "rewards/accuracies": 0.625, "rewards/chosen": -0.31982043385505676, "rewards/margins": 0.34559178352355957, "rewards/rejected": -0.6654122471809387, "step": 5298 }, { "epoch": 0.8194857916102841, "grad_norm": 7.007102012634277, "learning_rate": 4.037976858746707e-06, "logits/chosen": 11.934805870056152, "logits/rejected": 7.741524696350098, "logps/chosen": -231.00210571289062, "logps/rejected": -144.3037109375, "loss": 0.626, "rewards/accuracies": 0.5, "rewards/chosen": 0.08430337905883789, "rewards/margins": 0.2393733561038971, "rewards/rejected": -0.1550699770450592, "step": 5299 }, { "epoch": 0.8196404407500484, "grad_norm": 4.550992012023926, "learning_rate": 4.0376904570970335e-06, "logits/chosen": 12.091131210327148, "logits/rejected": 10.928199768066406, "logps/chosen": -252.32498168945312, "logps/rejected": -225.9612274169922, "loss": 0.4719, "rewards/accuracies": 0.75, "rewards/chosen": 0.03519707918167114, "rewards/margins": 0.6677951216697693, "rewards/rejected": -0.6325980424880981, "step": 5300 }, { "epoch": 0.8197950898898125, "grad_norm": 5.415155410766602, "learning_rate": 4.03740405544736e-06, "logits/chosen": 15.42573070526123, "logits/rejected": 14.31202220916748, "logps/chosen": -442.5439758300781, "logps/rejected": -331.3834228515625, "loss": 0.5775, "rewards/accuracies": 0.875, "rewards/chosen": 0.2566392421722412, "rewards/margins": 0.26325926184654236, "rewards/rejected": -0.006620034575462341, "step": 5301 }, { "epoch": 0.8199497390295767, "grad_norm": 4.5388922691345215, "learning_rate": 4.037117653797687e-06, "logits/chosen": 4.377840995788574, "logits/rejected": 8.438913345336914, "logps/chosen": -297.1492919921875, "logps/rejected": -307.8794860839844, "loss": 0.5323, "rewards/accuracies": 0.625, "rewards/chosen": 0.3450619876384735, "rewards/margins": 0.5069870948791504, "rewards/rejected": -0.16192513704299927, "step": 5302 }, { "epoch": 0.8201043881693408, "grad_norm": 4.9085307121276855, "learning_rate": 4.0368312521480126e-06, "logits/chosen": 4.341270446777344, "logits/rejected": 3.215613603591919, "logps/chosen": -213.53590393066406, "logps/rejected": -195.8070068359375, "loss": 0.7238, "rewards/accuracies": 0.5, "rewards/chosen": -0.09362641721963882, "rewards/margins": 0.07092452049255371, "rewards/rejected": -0.16455093026161194, "step": 5303 }, { "epoch": 0.820259037309105, "grad_norm": 5.224401473999023, "learning_rate": 4.036544850498339e-06, "logits/chosen": 11.544118881225586, "logits/rejected": 12.571065902709961, "logps/chosen": -266.6258850097656, "logps/rejected": -251.29742431640625, "loss": 0.6853, "rewards/accuracies": 0.375, "rewards/chosen": 0.2527645230293274, "rewards/margins": 0.13452434539794922, "rewards/rejected": 0.11824017763137817, "step": 5304 }, { "epoch": 0.8204136864488691, "grad_norm": 5.437252521514893, "learning_rate": 4.036258448848666e-06, "logits/chosen": 9.601806640625, "logits/rejected": 10.409278869628906, "logps/chosen": -300.93115234375, "logps/rejected": -269.85565185546875, "loss": 0.6094, "rewards/accuracies": 0.875, "rewards/chosen": 0.006354421377182007, "rewards/margins": 0.2257767617702484, "rewards/rejected": -0.2194223701953888, "step": 5305 }, { "epoch": 0.8205683355886333, "grad_norm": 4.728679656982422, "learning_rate": 4.0359720471989925e-06, "logits/chosen": 10.311020851135254, "logits/rejected": 4.833508014678955, "logps/chosen": -271.4335632324219, "logps/rejected": -202.4356689453125, "loss": 0.5953, "rewards/accuracies": 0.875, "rewards/chosen": -0.04705129191279411, "rewards/margins": 0.2946092188358307, "rewards/rejected": -0.3416604995727539, "step": 5306 }, { "epoch": 0.8207229847283974, "grad_norm": 6.367340087890625, "learning_rate": 4.035685645549319e-06, "logits/chosen": 8.099177360534668, "logits/rejected": 5.124929428100586, "logps/chosen": -316.1790771484375, "logps/rejected": -246.45358276367188, "loss": 0.7259, "rewards/accuracies": 0.375, "rewards/chosen": 0.006880074739456177, "rewards/margins": 0.03051258623600006, "rewards/rejected": -0.02363251894712448, "step": 5307 }, { "epoch": 0.8208776338681616, "grad_norm": 6.976737022399902, "learning_rate": 4.035399243899645e-06, "logits/chosen": 9.827499389648438, "logits/rejected": 4.327454566955566, "logps/chosen": -378.16302490234375, "logps/rejected": -315.242431640625, "loss": 0.5744, "rewards/accuracies": 0.625, "rewards/chosen": 0.05373586714267731, "rewards/margins": 0.5628617405891418, "rewards/rejected": -0.509125828742981, "step": 5308 }, { "epoch": 0.8210322830079257, "grad_norm": 7.578852653503418, "learning_rate": 4.035112842249972e-06, "logits/chosen": 10.64267635345459, "logits/rejected": 8.26196002960205, "logps/chosen": -471.53509521484375, "logps/rejected": -434.52349853515625, "loss": 0.6787, "rewards/accuracies": 0.5, "rewards/chosen": -0.047881707549095154, "rewards/margins": 0.13492760062217712, "rewards/rejected": -0.18280930817127228, "step": 5309 }, { "epoch": 0.8211869321476899, "grad_norm": 7.028905391693115, "learning_rate": 4.034826440600298e-06, "logits/chosen": 10.830615997314453, "logits/rejected": 9.741681098937988, "logps/chosen": -254.65875244140625, "logps/rejected": -199.8251190185547, "loss": 0.859, "rewards/accuracies": 0.375, "rewards/chosen": -0.15158559381961823, "rewards/margins": -0.1550678312778473, "rewards/rejected": 0.0034822598099708557, "step": 5310 }, { "epoch": 0.821341581287454, "grad_norm": 7.716348171234131, "learning_rate": 4.034540038950625e-06, "logits/chosen": 8.212759017944336, "logits/rejected": 11.404217720031738, "logps/chosen": -245.264404296875, "logps/rejected": -337.47528076171875, "loss": 0.8757, "rewards/accuracies": 0.5, "rewards/chosen": -0.39241209626197815, "rewards/margins": -0.19383685290813446, "rewards/rejected": -0.1985752135515213, "step": 5311 }, { "epoch": 0.8214962304272182, "grad_norm": 6.5120530128479, "learning_rate": 4.034253637300951e-06, "logits/chosen": 10.246106147766113, "logits/rejected": 13.368188858032227, "logps/chosen": -291.8525390625, "logps/rejected": -256.69110107421875, "loss": 0.774, "rewards/accuracies": 0.5, "rewards/chosen": 0.17371977865695953, "rewards/margins": -0.06294122338294983, "rewards/rejected": 0.23666101694107056, "step": 5312 }, { "epoch": 0.8216508795669825, "grad_norm": 4.0749030113220215, "learning_rate": 4.033967235651277e-06, "logits/chosen": 11.509986877441406, "logits/rejected": -0.17578458786010742, "logps/chosen": -229.82313537597656, "logps/rejected": -96.24790954589844, "loss": 0.6169, "rewards/accuracies": 0.75, "rewards/chosen": -0.0189799964427948, "rewards/margins": 0.38654884696006775, "rewards/rejected": -0.40552881360054016, "step": 5313 }, { "epoch": 0.8218055287067466, "grad_norm": 7.076247215270996, "learning_rate": 4.033680834001604e-06, "logits/chosen": 4.171049118041992, "logits/rejected": 6.825776100158691, "logps/chosen": -265.8440856933594, "logps/rejected": -373.115966796875, "loss": 0.6174, "rewards/accuracies": 0.625, "rewards/chosen": -0.3364883363246918, "rewards/margins": 0.3720911145210266, "rewards/rejected": -0.7085794806480408, "step": 5314 }, { "epoch": 0.8219601778465108, "grad_norm": 5.649542331695557, "learning_rate": 4.033394432351931e-06, "logits/chosen": 4.286419868469238, "logits/rejected": 1.6393518447875977, "logps/chosen": -210.84759521484375, "logps/rejected": -150.1285400390625, "loss": 0.8277, "rewards/accuracies": 0.5, "rewards/chosen": -0.3767644762992859, "rewards/margins": -0.07719491422176361, "rewards/rejected": -0.2995695471763611, "step": 5315 }, { "epoch": 0.8221148269862749, "grad_norm": 5.2629313468933105, "learning_rate": 4.033108030702257e-06, "logits/chosen": 8.696675300598145, "logits/rejected": 1.9972820281982422, "logps/chosen": -290.17559814453125, "logps/rejected": -229.56788635253906, "loss": 0.5815, "rewards/accuracies": 0.625, "rewards/chosen": 0.17074839770793915, "rewards/margins": 0.30529817938804626, "rewards/rejected": -0.1345497965812683, "step": 5316 }, { "epoch": 0.8222694761260391, "grad_norm": 5.135610103607178, "learning_rate": 4.032821629052583e-06, "logits/chosen": 4.589008331298828, "logits/rejected": 2.625314712524414, "logps/chosen": -247.08274841308594, "logps/rejected": -252.1951141357422, "loss": 0.6366, "rewards/accuracies": 0.5, "rewards/chosen": -0.1581580936908722, "rewards/margins": 0.2367747724056244, "rewards/rejected": -0.39493289589881897, "step": 5317 }, { "epoch": 0.8224241252658032, "grad_norm": 5.244497299194336, "learning_rate": 4.03253522740291e-06, "logits/chosen": 10.05750560760498, "logits/rejected": 9.302133560180664, "logps/chosen": -330.64202880859375, "logps/rejected": -356.7399597167969, "loss": 0.6178, "rewards/accuracies": 0.625, "rewards/chosen": 0.13964241743087769, "rewards/margins": 0.3444845676422119, "rewards/rejected": -0.2048421949148178, "step": 5318 }, { "epoch": 0.8225787744055674, "grad_norm": 4.925677299499512, "learning_rate": 4.0322488257532364e-06, "logits/chosen": 8.956275939941406, "logits/rejected": 9.288116455078125, "logps/chosen": -236.534912109375, "logps/rejected": -240.97195434570312, "loss": 0.5538, "rewards/accuracies": 0.75, "rewards/chosen": 0.32586202025413513, "rewards/margins": 0.4305882453918457, "rewards/rejected": -0.10472622513771057, "step": 5319 }, { "epoch": 0.8227334235453315, "grad_norm": 6.542516231536865, "learning_rate": 4.031962424103563e-06, "logits/chosen": 10.820230484008789, "logits/rejected": 6.561944007873535, "logps/chosen": -329.16485595703125, "logps/rejected": -360.517578125, "loss": 0.7851, "rewards/accuracies": 0.5, "rewards/chosen": 0.2560485303401947, "rewards/margins": -0.034902364015579224, "rewards/rejected": 0.2909509241580963, "step": 5320 }, { "epoch": 0.8228880726850957, "grad_norm": 5.495143413543701, "learning_rate": 4.03167602245389e-06, "logits/chosen": 5.645079135894775, "logits/rejected": 3.5093014240264893, "logps/chosen": -162.95184326171875, "logps/rejected": -218.01318359375, "loss": 0.7335, "rewards/accuracies": 0.375, "rewards/chosen": -0.09918948262929916, "rewards/margins": 0.13423582911491394, "rewards/rejected": -0.2334252893924713, "step": 5321 }, { "epoch": 0.8230427218248598, "grad_norm": 4.503170013427734, "learning_rate": 4.031389620804216e-06, "logits/chosen": 12.930352210998535, "logits/rejected": 11.06631088256836, "logps/chosen": -252.0088348388672, "logps/rejected": -265.4640197753906, "loss": 0.4894, "rewards/accuracies": 0.75, "rewards/chosen": 0.4509038031101227, "rewards/margins": 0.6731268763542175, "rewards/rejected": -0.22222310304641724, "step": 5322 }, { "epoch": 0.823197370964624, "grad_norm": 5.0754618644714355, "learning_rate": 4.031103219154542e-06, "logits/chosen": 12.904048919677734, "logits/rejected": 13.595926284790039, "logps/chosen": -297.0815734863281, "logps/rejected": -336.881591796875, "loss": 0.5675, "rewards/accuracies": 0.75, "rewards/chosen": 0.2576300799846649, "rewards/margins": 0.3779858946800232, "rewards/rejected": -0.12035579979419708, "step": 5323 }, { "epoch": 0.8233520201043881, "grad_norm": 4.792301654815674, "learning_rate": 4.030816817504869e-06, "logits/chosen": 6.062042713165283, "logits/rejected": 6.047024726867676, "logps/chosen": -216.16600036621094, "logps/rejected": -217.94049072265625, "loss": 0.5538, "rewards/accuracies": 0.875, "rewards/chosen": -0.0012787804007530212, "rewards/margins": 0.3623438775539398, "rewards/rejected": -0.36362266540527344, "step": 5324 }, { "epoch": 0.8235066692441523, "grad_norm": 6.105286121368408, "learning_rate": 4.0305304158551955e-06, "logits/chosen": 8.504137992858887, "logits/rejected": 4.4132513999938965, "logps/chosen": -269.54205322265625, "logps/rejected": -230.5826873779297, "loss": 0.7152, "rewards/accuracies": 0.5, "rewards/chosen": -0.17474594712257385, "rewards/margins": 0.09901612997055054, "rewards/rejected": -0.2737621068954468, "step": 5325 }, { "epoch": 0.8236613183839165, "grad_norm": 6.389162540435791, "learning_rate": 4.030244014205522e-06, "logits/chosen": 12.092460632324219, "logits/rejected": 4.347359657287598, "logps/chosen": -302.00079345703125, "logps/rejected": -264.7982177734375, "loss": 0.7441, "rewards/accuracies": 0.375, "rewards/chosen": -0.28832319378852844, "rewards/margins": 0.2440381646156311, "rewards/rejected": -0.5323613882064819, "step": 5326 }, { "epoch": 0.8238159675236807, "grad_norm": 4.724972724914551, "learning_rate": 4.029957612555849e-06, "logits/chosen": 14.11539363861084, "logits/rejected": 8.483016967773438, "logps/chosen": -310.4341125488281, "logps/rejected": -225.14439392089844, "loss": 0.5358, "rewards/accuracies": 0.75, "rewards/chosen": 0.2828636169433594, "rewards/margins": 0.46324360370635986, "rewards/rejected": -0.1803799867630005, "step": 5327 }, { "epoch": 0.8239706166634448, "grad_norm": 4.871413230895996, "learning_rate": 4.0296712109061754e-06, "logits/chosen": 9.946161270141602, "logits/rejected": 8.035512924194336, "logps/chosen": -214.70883178710938, "logps/rejected": -246.97708129882812, "loss": 0.5714, "rewards/accuracies": 0.875, "rewards/chosen": -0.09428431838750839, "rewards/margins": 0.4720229208469391, "rewards/rejected": -0.5663072466850281, "step": 5328 }, { "epoch": 0.824125265803209, "grad_norm": 6.055809020996094, "learning_rate": 4.029384809256502e-06, "logits/chosen": 8.911832809448242, "logits/rejected": -0.19004011154174805, "logps/chosen": -309.13616943359375, "logps/rejected": -156.112548828125, "loss": 0.5901, "rewards/accuracies": 0.75, "rewards/chosen": -0.1844894140958786, "rewards/margins": 0.6713892817497253, "rewards/rejected": -0.8558787703514099, "step": 5329 }, { "epoch": 0.8242799149429731, "grad_norm": 6.95323371887207, "learning_rate": 4.029098407606828e-06, "logits/chosen": 5.750208854675293, "logits/rejected": 6.927001476287842, "logps/chosen": -283.3548583984375, "logps/rejected": -279.2781066894531, "loss": 0.7321, "rewards/accuracies": 0.5, "rewards/chosen": -0.10793409496545792, "rewards/margins": -0.02041912078857422, "rewards/rejected": -0.0875149667263031, "step": 5330 }, { "epoch": 0.8244345640827373, "grad_norm": 11.965617179870605, "learning_rate": 4.0288120059571545e-06, "logits/chosen": 9.658398628234863, "logits/rejected": 6.472871780395508, "logps/chosen": -323.000732421875, "logps/rejected": -251.18576049804688, "loss": 0.8512, "rewards/accuracies": 0.5, "rewards/chosen": -0.13315190374851227, "rewards/margins": -0.04851962625980377, "rewards/rejected": -0.0846322551369667, "step": 5331 }, { "epoch": 0.8245892132225014, "grad_norm": 5.235379219055176, "learning_rate": 4.028525604307481e-06, "logits/chosen": 14.443598747253418, "logits/rejected": 6.930086135864258, "logps/chosen": -325.97821044921875, "logps/rejected": -247.26937866210938, "loss": 0.5519, "rewards/accuracies": 0.625, "rewards/chosen": 0.12148480862379074, "rewards/margins": 0.40008780360221863, "rewards/rejected": -0.2786029875278473, "step": 5332 }, { "epoch": 0.8247438623622656, "grad_norm": 6.801800727844238, "learning_rate": 4.028239202657808e-06, "logits/chosen": 8.464888572692871, "logits/rejected": 8.957808494567871, "logps/chosen": -174.44998168945312, "logps/rejected": -134.204833984375, "loss": 0.923, "rewards/accuracies": 0.25, "rewards/chosen": -0.6006500720977783, "rewards/margins": -0.3253081440925598, "rewards/rejected": -0.2753419578075409, "step": 5333 }, { "epoch": 0.8248985115020298, "grad_norm": 21.104555130004883, "learning_rate": 4.0279528010081345e-06, "logits/chosen": 8.508782386779785, "logits/rejected": 6.074336051940918, "logps/chosen": -350.965576171875, "logps/rejected": -335.1566162109375, "loss": 0.7483, "rewards/accuracies": 0.625, "rewards/chosen": -0.05120614171028137, "rewards/margins": 0.011261433362960815, "rewards/rejected": -0.062467582523822784, "step": 5334 }, { "epoch": 0.8250531606417939, "grad_norm": 4.703070640563965, "learning_rate": 4.027666399358461e-06, "logits/chosen": 9.430997848510742, "logits/rejected": 6.76487922668457, "logps/chosen": -211.81646728515625, "logps/rejected": -172.1246337890625, "loss": 0.5257, "rewards/accuracies": 0.625, "rewards/chosen": -0.021838165819644928, "rewards/margins": 0.5281886458396912, "rewards/rejected": -0.5500267744064331, "step": 5335 }, { "epoch": 0.825207809781558, "grad_norm": 4.348498344421387, "learning_rate": 4.027379997708788e-06, "logits/chosen": 10.154484748840332, "logits/rejected": 8.764572143554688, "logps/chosen": -380.19146728515625, "logps/rejected": -300.4892272949219, "loss": 0.4293, "rewards/accuracies": 0.875, "rewards/chosen": 0.0502408891916275, "rewards/margins": 0.743482232093811, "rewards/rejected": -0.6932413578033447, "step": 5336 }, { "epoch": 0.8253624589213222, "grad_norm": 7.4166765213012695, "learning_rate": 4.027093596059114e-06, "logits/chosen": 5.941498756408691, "logits/rejected": 7.464786529541016, "logps/chosen": -380.4335632324219, "logps/rejected": -355.916748046875, "loss": 0.4548, "rewards/accuracies": 0.875, "rewards/chosen": -0.3901115953922272, "rewards/margins": 0.611159086227417, "rewards/rejected": -1.0012707710266113, "step": 5337 }, { "epoch": 0.8255171080610864, "grad_norm": 12.903608322143555, "learning_rate": 4.02680719440944e-06, "logits/chosen": 5.729025840759277, "logits/rejected": 6.0861430168151855, "logps/chosen": -304.4761962890625, "logps/rejected": -260.3463134765625, "loss": 0.7369, "rewards/accuracies": 0.375, "rewards/chosen": 0.26506900787353516, "rewards/margins": -0.010061264038085938, "rewards/rejected": 0.2751302719116211, "step": 5338 }, { "epoch": 0.8256717572008506, "grad_norm": 12.802170753479004, "learning_rate": 4.026520792759767e-06, "logits/chosen": 11.008221626281738, "logits/rejected": 8.043703079223633, "logps/chosen": -286.43231201171875, "logps/rejected": -238.50338745117188, "loss": 0.7821, "rewards/accuracies": 0.5, "rewards/chosen": 0.017602648586034775, "rewards/margins": -0.053995564579963684, "rewards/rejected": 0.07159822434186935, "step": 5339 }, { "epoch": 0.8258264063406148, "grad_norm": 7.474262714385986, "learning_rate": 4.0262343911100935e-06, "logits/chosen": 8.915708541870117, "logits/rejected": 5.9769439697265625, "logps/chosen": -217.97325134277344, "logps/rejected": -189.8278350830078, "loss": 0.5692, "rewards/accuracies": 0.75, "rewards/chosen": 0.22961267828941345, "rewards/margins": 0.4130922257900238, "rewards/rejected": -0.18347957730293274, "step": 5340 }, { "epoch": 0.8259810554803789, "grad_norm": 5.888516902923584, "learning_rate": 4.025947989460419e-06, "logits/chosen": 4.281300067901611, "logits/rejected": 6.6664276123046875, "logps/chosen": -192.68429565429688, "logps/rejected": -207.56834411621094, "loss": 0.7238, "rewards/accuracies": 0.375, "rewards/chosen": -0.1519346684217453, "rewards/margins": -0.01962270587682724, "rewards/rejected": -0.13231196999549866, "step": 5341 }, { "epoch": 0.8261357046201431, "grad_norm": 5.999971389770508, "learning_rate": 4.025661587810746e-06, "logits/chosen": 9.47360610961914, "logits/rejected": 6.125233173370361, "logps/chosen": -370.8904724121094, "logps/rejected": -328.52227783203125, "loss": 0.6236, "rewards/accuracies": 0.625, "rewards/chosen": 0.2609485685825348, "rewards/margins": 0.29614725708961487, "rewards/rejected": -0.03519868105649948, "step": 5342 }, { "epoch": 0.8262903537599072, "grad_norm": 5.769198894500732, "learning_rate": 4.025375186161073e-06, "logits/chosen": 8.71756362915039, "logits/rejected": 8.844842910766602, "logps/chosen": -381.8267822265625, "logps/rejected": -336.4114074707031, "loss": 0.5582, "rewards/accuracies": 0.875, "rewards/chosen": 0.37201958894729614, "rewards/margins": 0.45419007539749146, "rewards/rejected": -0.08217048645019531, "step": 5343 }, { "epoch": 0.8264450028996714, "grad_norm": 7.81317663192749, "learning_rate": 4.025088784511399e-06, "logits/chosen": 11.422042846679688, "logits/rejected": 9.726343154907227, "logps/chosen": -239.04881286621094, "logps/rejected": -180.1737060546875, "loss": 0.7536, "rewards/accuracies": 0.625, "rewards/chosen": -0.7132056951522827, "rewards/margins": 0.01957044005393982, "rewards/rejected": -0.7327761054039001, "step": 5344 }, { "epoch": 0.8265996520394355, "grad_norm": 8.058053016662598, "learning_rate": 4.024802382861726e-06, "logits/chosen": 7.095990180969238, "logits/rejected": 3.0499749183654785, "logps/chosen": -354.446533203125, "logps/rejected": -227.48739624023438, "loss": 0.7898, "rewards/accuracies": 0.5, "rewards/chosen": -0.023910678923130035, "rewards/margins": -0.1181541159749031, "rewards/rejected": 0.09424343705177307, "step": 5345 }, { "epoch": 0.8267543011791997, "grad_norm": 4.05272102355957, "learning_rate": 4.024515981212052e-06, "logits/chosen": 13.564950942993164, "logits/rejected": 14.703371047973633, "logps/chosen": -223.377197265625, "logps/rejected": -237.2510986328125, "loss": 0.5241, "rewards/accuracies": 0.75, "rewards/chosen": -0.15794432163238525, "rewards/margins": 0.4584812521934509, "rewards/rejected": -0.6164255738258362, "step": 5346 }, { "epoch": 0.8269089503189638, "grad_norm": 5.342638969421387, "learning_rate": 4.024229579562378e-06, "logits/chosen": 6.897091865539551, "logits/rejected": 5.012201309204102, "logps/chosen": -244.1290740966797, "logps/rejected": -230.30862426757812, "loss": 0.712, "rewards/accuracies": 0.5, "rewards/chosen": -0.21836552023887634, "rewards/margins": 0.21496419608592987, "rewards/rejected": -0.43332967162132263, "step": 5347 }, { "epoch": 0.827063599458728, "grad_norm": 9.905418395996094, "learning_rate": 4.023943177912705e-06, "logits/chosen": 10.802815437316895, "logits/rejected": 12.32828140258789, "logps/chosen": -237.32496643066406, "logps/rejected": -235.06253051757812, "loss": 0.9861, "rewards/accuracies": 0.375, "rewards/chosen": -0.2220773696899414, "rewards/margins": -0.436323344707489, "rewards/rejected": 0.2142459899187088, "step": 5348 }, { "epoch": 0.8272182485984921, "grad_norm": 9.778711318969727, "learning_rate": 4.023656776263032e-06, "logits/chosen": 5.947680950164795, "logits/rejected": 7.65714693069458, "logps/chosen": -183.94166564941406, "logps/rejected": -246.54600524902344, "loss": 0.8198, "rewards/accuracies": 0.375, "rewards/chosen": -0.18549171090126038, "rewards/margins": -0.13598228991031647, "rewards/rejected": -0.04950941354036331, "step": 5349 }, { "epoch": 0.8273728977382563, "grad_norm": 3.8925695419311523, "learning_rate": 4.0233703746133575e-06, "logits/chosen": 13.280454635620117, "logits/rejected": 14.779666900634766, "logps/chosen": -124.14584350585938, "logps/rejected": -169.04104614257812, "loss": 0.6426, "rewards/accuracies": 0.625, "rewards/chosen": -0.08064673095941544, "rewards/margins": 0.13690821826457977, "rewards/rejected": -0.2175549417734146, "step": 5350 }, { "epoch": 0.8275275468780204, "grad_norm": 5.325223445892334, "learning_rate": 4.023083972963684e-06, "logits/chosen": 11.94888687133789, "logits/rejected": 7.522892951965332, "logps/chosen": -271.1258239746094, "logps/rejected": -204.5643310546875, "loss": 0.6348, "rewards/accuracies": 0.5, "rewards/chosen": -0.15165206789970398, "rewards/margins": 0.35170578956604004, "rewards/rejected": -0.5033578872680664, "step": 5351 }, { "epoch": 0.8276821960177847, "grad_norm": 7.117510795593262, "learning_rate": 4.022797571314011e-06, "logits/chosen": 12.60094165802002, "logits/rejected": 8.933693885803223, "logps/chosen": -420.9251708984375, "logps/rejected": -347.2699279785156, "loss": 0.7923, "rewards/accuracies": 0.625, "rewards/chosen": 0.1477445811033249, "rewards/margins": -0.07621574401855469, "rewards/rejected": 0.2239602953195572, "step": 5352 }, { "epoch": 0.8278368451575489, "grad_norm": 3.30772066116333, "learning_rate": 4.0225111696643375e-06, "logits/chosen": 14.168424606323242, "logits/rejected": 6.840731143951416, "logps/chosen": -199.61041259765625, "logps/rejected": -145.72467041015625, "loss": 0.5171, "rewards/accuracies": 0.625, "rewards/chosen": -0.015547472983598709, "rewards/margins": 0.6037575602531433, "rewards/rejected": -0.6193050146102905, "step": 5353 }, { "epoch": 0.827991494297313, "grad_norm": 5.570367336273193, "learning_rate": 4.022224768014664e-06, "logits/chosen": 3.3675670623779297, "logits/rejected": 3.5403053760528564, "logps/chosen": -184.73187255859375, "logps/rejected": -248.59005737304688, "loss": 0.5706, "rewards/accuracies": 0.625, "rewards/chosen": -0.10427740216255188, "rewards/margins": 0.5916788578033447, "rewards/rejected": -0.695956289768219, "step": 5354 }, { "epoch": 0.8281461434370772, "grad_norm": 5.730072021484375, "learning_rate": 4.021938366364991e-06, "logits/chosen": 13.051298141479492, "logits/rejected": 9.186807632446289, "logps/chosen": -340.6981201171875, "logps/rejected": -257.09368896484375, "loss": 0.5716, "rewards/accuracies": 0.625, "rewards/chosen": 0.2891034483909607, "rewards/margins": 0.394697368144989, "rewards/rejected": -0.10559390485286713, "step": 5355 }, { "epoch": 0.8283007925768413, "grad_norm": 9.620980262756348, "learning_rate": 4.0216519647153166e-06, "logits/chosen": 13.854562759399414, "logits/rejected": 9.437166213989258, "logps/chosen": -301.5858154296875, "logps/rejected": -225.3343048095703, "loss": 0.7168, "rewards/accuracies": 0.625, "rewards/chosen": -0.16419801115989685, "rewards/margins": 0.04755708575248718, "rewards/rejected": -0.21175506711006165, "step": 5356 }, { "epoch": 0.8284554417166055, "grad_norm": 4.582464694976807, "learning_rate": 4.021365563065643e-06, "logits/chosen": 14.405271530151367, "logits/rejected": 12.518171310424805, "logps/chosen": -343.9298095703125, "logps/rejected": -275.76611328125, "loss": 0.6287, "rewards/accuracies": 0.625, "rewards/chosen": -0.006426818668842316, "rewards/margins": 0.25624704360961914, "rewards/rejected": -0.2626737952232361, "step": 5357 }, { "epoch": 0.8286100908563696, "grad_norm": 6.05879545211792, "learning_rate": 4.02107916141597e-06, "logits/chosen": 11.324037551879883, "logits/rejected": 12.489995956420898, "logps/chosen": -319.57025146484375, "logps/rejected": -272.546630859375, "loss": 0.8199, "rewards/accuracies": 0.625, "rewards/chosen": 0.08118820190429688, "rewards/margins": -0.05728644132614136, "rewards/rejected": 0.13847464323043823, "step": 5358 }, { "epoch": 0.8287647399961338, "grad_norm": 9.0266752243042, "learning_rate": 4.0207927597662965e-06, "logits/chosen": 6.157256126403809, "logits/rejected": 2.361009120941162, "logps/chosen": -302.79736328125, "logps/rejected": -203.93955993652344, "loss": 0.5531, "rewards/accuracies": 0.625, "rewards/chosen": 0.2322167456150055, "rewards/margins": 0.4372229278087616, "rewards/rejected": -0.2050061970949173, "step": 5359 }, { "epoch": 0.8289193891358979, "grad_norm": 5.840587615966797, "learning_rate": 4.020506358116623e-06, "logits/chosen": 10.24179458618164, "logits/rejected": 11.3818941116333, "logps/chosen": -300.0357360839844, "logps/rejected": -330.2537841796875, "loss": 0.7296, "rewards/accuracies": 0.375, "rewards/chosen": -0.1039886549115181, "rewards/margins": -0.024369336664676666, "rewards/rejected": -0.07961931079626083, "step": 5360 }, { "epoch": 0.8290740382756621, "grad_norm": 8.630515098571777, "learning_rate": 4.02021995646695e-06, "logits/chosen": 11.478658676147461, "logits/rejected": 12.549637794494629, "logps/chosen": -277.0334167480469, "logps/rejected": -265.60076904296875, "loss": 0.8354, "rewards/accuracies": 0.25, "rewards/chosen": -0.5861489772796631, "rewards/margins": -0.1759597361087799, "rewards/rejected": -0.41018927097320557, "step": 5361 }, { "epoch": 0.8292286874154262, "grad_norm": 4.0248589515686035, "learning_rate": 4.0199335548172765e-06, "logits/chosen": 9.13327693939209, "logits/rejected": 0.4751328229904175, "logps/chosen": -304.70947265625, "logps/rejected": -188.1190185546875, "loss": 0.5116, "rewards/accuracies": 0.75, "rewards/chosen": 0.184296116232872, "rewards/margins": 0.5303095579147339, "rewards/rejected": -0.3460134267807007, "step": 5362 }, { "epoch": 0.8293833365551904, "grad_norm": 7.228922367095947, "learning_rate": 4.019647153167602e-06, "logits/chosen": 8.904441833496094, "logits/rejected": 16.654111862182617, "logps/chosen": -289.78485107421875, "logps/rejected": -345.4756164550781, "loss": 0.7966, "rewards/accuracies": 0.5, "rewards/chosen": 0.034785330295562744, "rewards/margins": -0.10227788984775543, "rewards/rejected": 0.13706320524215698, "step": 5363 }, { "epoch": 0.8295379856949546, "grad_norm": 5.103830337524414, "learning_rate": 4.019360751517929e-06, "logits/chosen": 5.899242401123047, "logits/rejected": 9.79134750366211, "logps/chosen": -197.1291046142578, "logps/rejected": -245.04428100585938, "loss": 0.8407, "rewards/accuracies": 0.375, "rewards/chosen": -0.3621762692928314, "rewards/margins": -0.22488480806350708, "rewards/rejected": -0.13729147613048553, "step": 5364 }, { "epoch": 0.8296926348347188, "grad_norm": 5.657057285308838, "learning_rate": 4.0190743498682556e-06, "logits/chosen": 11.96318244934082, "logits/rejected": 14.269253730773926, "logps/chosen": -199.26263427734375, "logps/rejected": -203.2296905517578, "loss": 0.7917, "rewards/accuracies": 0.375, "rewards/chosen": -0.07637586444616318, "rewards/margins": -0.15539111196994781, "rewards/rejected": 0.07901525497436523, "step": 5365 }, { "epoch": 0.8298472839744829, "grad_norm": 6.404512405395508, "learning_rate": 4.018787948218582e-06, "logits/chosen": 8.980405807495117, "logits/rejected": 3.2507221698760986, "logps/chosen": -355.8680725097656, "logps/rejected": -227.14480590820312, "loss": 0.7279, "rewards/accuracies": 0.5, "rewards/chosen": -0.10106442123651505, "rewards/margins": 0.04736147075891495, "rewards/rejected": -0.14842589199543, "step": 5366 }, { "epoch": 0.8300019331142471, "grad_norm": 5.0833001136779785, "learning_rate": 4.018501546568909e-06, "logits/chosen": 10.452532768249512, "logits/rejected": 2.6104063987731934, "logps/chosen": -414.9434814453125, "logps/rejected": -260.2200622558594, "loss": 0.6069, "rewards/accuracies": 0.75, "rewards/chosen": 0.14669114351272583, "rewards/margins": 0.2586354911327362, "rewards/rejected": -0.11194434762001038, "step": 5367 }, { "epoch": 0.8301565822540112, "grad_norm": 10.146333694458008, "learning_rate": 4.0182151449192355e-06, "logits/chosen": 15.662110328674316, "logits/rejected": 4.235750198364258, "logps/chosen": -502.3944091796875, "logps/rejected": -241.30320739746094, "loss": 0.4973, "rewards/accuracies": 0.875, "rewards/chosen": 0.027603913098573685, "rewards/margins": 0.6622288227081299, "rewards/rejected": -0.6346248984336853, "step": 5368 }, { "epoch": 0.8303112313937754, "grad_norm": 5.116820335388184, "learning_rate": 4.017928743269562e-06, "logits/chosen": 10.789506912231445, "logits/rejected": 10.022411346435547, "logps/chosen": -319.52545166015625, "logps/rejected": -274.0902099609375, "loss": 0.682, "rewards/accuracies": 0.5, "rewards/chosen": -0.006325904279947281, "rewards/margins": 0.36327606439590454, "rewards/rejected": -0.3696020245552063, "step": 5369 }, { "epoch": 0.8304658805335395, "grad_norm": 4.79144811630249, "learning_rate": 4.017642341619888e-06, "logits/chosen": 11.066890716552734, "logits/rejected": 5.095754146575928, "logps/chosen": -225.28695678710938, "logps/rejected": -174.3505859375, "loss": 0.7012, "rewards/accuracies": 0.625, "rewards/chosen": -0.0682850331068039, "rewards/margins": 0.03783566504716873, "rewards/rejected": -0.10612067580223083, "step": 5370 }, { "epoch": 0.8306205296733037, "grad_norm": 5.753739356994629, "learning_rate": 4.017355939970215e-06, "logits/chosen": 12.702654838562012, "logits/rejected": 7.077354907989502, "logps/chosen": -318.09503173828125, "logps/rejected": -266.88018798828125, "loss": 0.6557, "rewards/accuracies": 0.625, "rewards/chosen": 0.016322467476129532, "rewards/margins": 0.15788981318473816, "rewards/rejected": -0.14156733453273773, "step": 5371 }, { "epoch": 0.8307751788130678, "grad_norm": 5.359127044677734, "learning_rate": 4.017069538320541e-06, "logits/chosen": 13.164688110351562, "logits/rejected": 9.30786418914795, "logps/chosen": -276.2366027832031, "logps/rejected": -177.9744110107422, "loss": 0.5139, "rewards/accuracies": 0.75, "rewards/chosen": 0.2154313623905182, "rewards/margins": 0.5429648756980896, "rewards/rejected": -0.3275335133075714, "step": 5372 }, { "epoch": 0.830929827952832, "grad_norm": 6.666069030761719, "learning_rate": 4.016783136670868e-06, "logits/chosen": 17.07319450378418, "logits/rejected": 13.02253246307373, "logps/chosen": -318.0179443359375, "logps/rejected": -253.59043884277344, "loss": 0.6691, "rewards/accuracies": 0.5, "rewards/chosen": -0.1107303574681282, "rewards/margins": 0.16250231862068176, "rewards/rejected": -0.27323266863822937, "step": 5373 }, { "epoch": 0.8310844770925961, "grad_norm": 6.0921630859375, "learning_rate": 4.016496735021195e-06, "logits/chosen": 6.870823860168457, "logits/rejected": 5.898431777954102, "logps/chosen": -375.41259765625, "logps/rejected": -329.1944885253906, "loss": 0.604, "rewards/accuracies": 0.75, "rewards/chosen": 0.1251259744167328, "rewards/margins": 0.2159103900194168, "rewards/rejected": -0.09078440815210342, "step": 5374 }, { "epoch": 0.8312391262323603, "grad_norm": 4.845489978790283, "learning_rate": 4.01621033337152e-06, "logits/chosen": 9.506854057312012, "logits/rejected": 4.547054290771484, "logps/chosen": -334.96435546875, "logps/rejected": -283.1689453125, "loss": 0.5931, "rewards/accuracies": 0.75, "rewards/chosen": -0.0023044124245643616, "rewards/margins": 0.37399858236312866, "rewards/rejected": -0.3763029873371124, "step": 5375 }, { "epoch": 0.8313937753721244, "grad_norm": 5.396146297454834, "learning_rate": 4.015923931721847e-06, "logits/chosen": 10.54942798614502, "logits/rejected": 9.277009010314941, "logps/chosen": -229.40316772460938, "logps/rejected": -194.76673889160156, "loss": 0.7013, "rewards/accuracies": 0.375, "rewards/chosen": 0.02864570915699005, "rewards/margins": 0.07281656563282013, "rewards/rejected": -0.04417085647583008, "step": 5376 }, { "epoch": 0.8315484245118887, "grad_norm": 4.1248955726623535, "learning_rate": 4.015637530072174e-06, "logits/chosen": 3.780885934829712, "logits/rejected": 5.9319682121276855, "logps/chosen": -319.5355529785156, "logps/rejected": -314.78515625, "loss": 0.4975, "rewards/accuracies": 0.75, "rewards/chosen": 0.18458938598632812, "rewards/margins": 0.6036069393157959, "rewards/rejected": -0.41901758313179016, "step": 5377 }, { "epoch": 0.8317030736516529, "grad_norm": 4.499195575714111, "learning_rate": 4.0153511284225e-06, "logits/chosen": 13.451874732971191, "logits/rejected": 1.7122514247894287, "logps/chosen": -389.09869384765625, "logps/rejected": -259.02337646484375, "loss": 0.401, "rewards/accuracies": 1.0, "rewards/chosen": 0.008623795583844185, "rewards/margins": 0.7962130308151245, "rewards/rejected": -0.7875891923904419, "step": 5378 }, { "epoch": 0.831857722791417, "grad_norm": 5.6987433433532715, "learning_rate": 4.015064726772826e-06, "logits/chosen": 9.704020500183105, "logits/rejected": 5.749418258666992, "logps/chosen": -228.26461791992188, "logps/rejected": -190.92189025878906, "loss": 0.7193, "rewards/accuracies": 0.5, "rewards/chosen": -0.13351833820343018, "rewards/margins": 0.033403925597667694, "rewards/rejected": -0.16692225635051727, "step": 5379 }, { "epoch": 0.8320123719311812, "grad_norm": 4.734670639038086, "learning_rate": 4.014778325123153e-06, "logits/chosen": 8.425107955932617, "logits/rejected": 5.822018623352051, "logps/chosen": -211.37890625, "logps/rejected": -259.53314208984375, "loss": 0.6281, "rewards/accuracies": 0.625, "rewards/chosen": -0.005975663661956787, "rewards/margins": 0.2646450698375702, "rewards/rejected": -0.270620733499527, "step": 5380 }, { "epoch": 0.8321670210709453, "grad_norm": 4.815316677093506, "learning_rate": 4.0144919234734794e-06, "logits/chosen": 7.501192092895508, "logits/rejected": 13.593565940856934, "logps/chosen": -135.0107879638672, "logps/rejected": -269.2069396972656, "loss": 0.7324, "rewards/accuracies": 0.5, "rewards/chosen": -0.36686083674430847, "rewards/margins": 0.268845796585083, "rewards/rejected": -0.6357066631317139, "step": 5381 }, { "epoch": 0.8323216702107095, "grad_norm": 3.9795141220092773, "learning_rate": 4.014205521823806e-06, "logits/chosen": 14.000324249267578, "logits/rejected": 10.978670120239258, "logps/chosen": -293.13153076171875, "logps/rejected": -239.74111938476562, "loss": 0.5501, "rewards/accuracies": 0.875, "rewards/chosen": -0.1538265347480774, "rewards/margins": 0.5825468301773071, "rewards/rejected": -0.7363733053207397, "step": 5382 }, { "epoch": 0.8324763193504736, "grad_norm": 5.09022855758667, "learning_rate": 4.013919120174133e-06, "logits/chosen": 13.133593559265137, "logits/rejected": 14.01787281036377, "logps/chosen": -226.3394012451172, "logps/rejected": -252.2895965576172, "loss": 0.6428, "rewards/accuracies": 0.625, "rewards/chosen": -0.2822531759738922, "rewards/margins": 0.1283479630947113, "rewards/rejected": -0.4106011390686035, "step": 5383 }, { "epoch": 0.8326309684902378, "grad_norm": 4.223243236541748, "learning_rate": 4.0136327185244585e-06, "logits/chosen": 17.141403198242188, "logits/rejected": 8.355865478515625, "logps/chosen": -236.6707305908203, "logps/rejected": -210.37130737304688, "loss": 0.5413, "rewards/accuracies": 0.75, "rewards/chosen": 0.28904491662979126, "rewards/margins": 0.44514426589012146, "rewards/rejected": -0.1560993492603302, "step": 5384 }, { "epoch": 0.8327856176300019, "grad_norm": 6.058457851409912, "learning_rate": 4.013346316874785e-06, "logits/chosen": 13.686336517333984, "logits/rejected": 4.966727256774902, "logps/chosen": -283.00848388671875, "logps/rejected": -223.71339416503906, "loss": 0.7023, "rewards/accuracies": 0.5, "rewards/chosen": -0.08856715261936188, "rewards/margins": 0.2917318046092987, "rewards/rejected": -0.38029900193214417, "step": 5385 }, { "epoch": 0.8329402667697661, "grad_norm": 12.94128704071045, "learning_rate": 4.013059915225112e-06, "logits/chosen": 14.132623672485352, "logits/rejected": 7.437209129333496, "logps/chosen": -323.65167236328125, "logps/rejected": -270.63580322265625, "loss": 0.5334, "rewards/accuracies": 0.5, "rewards/chosen": 0.3487892150878906, "rewards/margins": 0.625971257686615, "rewards/rejected": -0.2771819531917572, "step": 5386 }, { "epoch": 0.8330949159095302, "grad_norm": 6.800536632537842, "learning_rate": 4.0127735135754385e-06, "logits/chosen": 16.286441802978516, "logits/rejected": 9.720698356628418, "logps/chosen": -514.7172241210938, "logps/rejected": -367.98443603515625, "loss": 0.6303, "rewards/accuracies": 0.5, "rewards/chosen": 0.4346073269844055, "rewards/margins": 0.33411428332328796, "rewards/rejected": 0.10049304366111755, "step": 5387 }, { "epoch": 0.8332495650492944, "grad_norm": 4.872771739959717, "learning_rate": 4.012487111925765e-06, "logits/chosen": 7.491901397705078, "logits/rejected": 9.20926570892334, "logps/chosen": -218.38690185546875, "logps/rejected": -275.0525817871094, "loss": 0.7046, "rewards/accuracies": 0.5, "rewards/chosen": -0.17015504837036133, "rewards/margins": 0.08373256772756577, "rewards/rejected": -0.2538875937461853, "step": 5388 }, { "epoch": 0.8334042141890585, "grad_norm": 4.226747989654541, "learning_rate": 4.012200710276091e-06, "logits/chosen": 6.772472381591797, "logits/rejected": 3.7654976844787598, "logps/chosen": -163.06546020507812, "logps/rejected": -147.28819274902344, "loss": 0.6079, "rewards/accuracies": 0.375, "rewards/chosen": -0.006968967616558075, "rewards/margins": 0.3486133813858032, "rewards/rejected": -0.3555823564529419, "step": 5389 }, { "epoch": 0.8335588633288228, "grad_norm": 4.889275550842285, "learning_rate": 4.011914308626418e-06, "logits/chosen": 5.655824661254883, "logits/rejected": 4.966555595397949, "logps/chosen": -276.1043701171875, "logps/rejected": -224.635986328125, "loss": 0.5581, "rewards/accuracies": 0.75, "rewards/chosen": 0.005195764824748039, "rewards/margins": 0.41703474521636963, "rewards/rejected": -0.41183900833129883, "step": 5390 }, { "epoch": 0.8337135124685869, "grad_norm": 5.367610931396484, "learning_rate": 4.011627906976744e-06, "logits/chosen": 13.729022979736328, "logits/rejected": 10.7450590133667, "logps/chosen": -464.8741455078125, "logps/rejected": -347.12689208984375, "loss": 0.6865, "rewards/accuracies": 0.5, "rewards/chosen": 0.24824295938014984, "rewards/margins": 0.08843203634023666, "rewards/rejected": 0.15981093049049377, "step": 5391 }, { "epoch": 0.8338681616083511, "grad_norm": 5.969167232513428, "learning_rate": 4.011341505327071e-06, "logits/chosen": 8.183238983154297, "logits/rejected": 5.0609002113342285, "logps/chosen": -211.6408233642578, "logps/rejected": -207.94390869140625, "loss": 0.7592, "rewards/accuracies": 0.25, "rewards/chosen": -0.08726739883422852, "rewards/margins": -0.047663405537605286, "rewards/rejected": -0.039604008197784424, "step": 5392 }, { "epoch": 0.8340228107481152, "grad_norm": 5.681665897369385, "learning_rate": 4.0110551036773975e-06, "logits/chosen": 8.574899673461914, "logits/rejected": 2.3997669219970703, "logps/chosen": -315.51763916015625, "logps/rejected": -242.56893920898438, "loss": 0.5515, "rewards/accuracies": 0.625, "rewards/chosen": 0.1526292860507965, "rewards/margins": 0.7872132658958435, "rewards/rejected": -0.6345839500427246, "step": 5393 }, { "epoch": 0.8341774598878794, "grad_norm": 6.395241737365723, "learning_rate": 4.010768702027724e-06, "logits/chosen": 5.79974365234375, "logits/rejected": 8.506085395812988, "logps/chosen": -266.611328125, "logps/rejected": -286.57086181640625, "loss": 0.6994, "rewards/accuracies": 0.625, "rewards/chosen": 0.2074846625328064, "rewards/margins": 0.04301976412534714, "rewards/rejected": 0.16446489095687866, "step": 5394 }, { "epoch": 0.8343321090276435, "grad_norm": 4.886826038360596, "learning_rate": 4.010482300378051e-06, "logits/chosen": 11.948812484741211, "logits/rejected": 7.996578216552734, "logps/chosen": -289.35333251953125, "logps/rejected": -158.23025512695312, "loss": 0.6282, "rewards/accuracies": 0.5, "rewards/chosen": 0.33129453659057617, "rewards/margins": 0.30022209882736206, "rewards/rejected": 0.03107243776321411, "step": 5395 }, { "epoch": 0.8344867581674077, "grad_norm": 6.295848369598389, "learning_rate": 4.010195898728377e-06, "logits/chosen": 7.4329376220703125, "logits/rejected": 4.973328590393066, "logps/chosen": -355.7255554199219, "logps/rejected": -365.41748046875, "loss": 0.5476, "rewards/accuracies": 0.625, "rewards/chosen": 0.09848098456859589, "rewards/margins": 0.4621601998806, "rewards/rejected": -0.3636792302131653, "step": 5396 }, { "epoch": 0.8346414073071718, "grad_norm": 4.316558837890625, "learning_rate": 4.009909497078703e-06, "logits/chosen": 9.350947380065918, "logits/rejected": 3.718388080596924, "logps/chosen": -276.31097412109375, "logps/rejected": -163.05532836914062, "loss": 0.5391, "rewards/accuracies": 0.625, "rewards/chosen": 0.2558572292327881, "rewards/margins": 0.43711453676223755, "rewards/rejected": -0.18125732243061066, "step": 5397 }, { "epoch": 0.834796056446936, "grad_norm": 6.025699138641357, "learning_rate": 4.00962309542903e-06, "logits/chosen": 7.262233734130859, "logits/rejected": 3.3563954830169678, "logps/chosen": -263.9393310546875, "logps/rejected": -208.1671600341797, "loss": 0.642, "rewards/accuracies": 0.75, "rewards/chosen": 0.46710091829299927, "rewards/margins": 0.2902814745903015, "rewards/rejected": 0.17681945860385895, "step": 5398 }, { "epoch": 0.8349507055867001, "grad_norm": 5.9390082359313965, "learning_rate": 4.009336693779357e-06, "logits/chosen": 7.669254779815674, "logits/rejected": 4.8430280685424805, "logps/chosen": -201.2021026611328, "logps/rejected": -206.9154815673828, "loss": 0.7802, "rewards/accuracies": 0.375, "rewards/chosen": -0.13391587138175964, "rewards/margins": 0.0535183846950531, "rewards/rejected": -0.18743430078029633, "step": 5399 }, { "epoch": 0.8351053547264643, "grad_norm": 5.8911237716674805, "learning_rate": 4.009050292129683e-06, "logits/chosen": 13.248455047607422, "logits/rejected": 8.486472129821777, "logps/chosen": -409.1990966796875, "logps/rejected": -302.76611328125, "loss": 0.6375, "rewards/accuracies": 0.5, "rewards/chosen": 0.2739750146865845, "rewards/margins": 0.14663706719875336, "rewards/rejected": 0.12733793258666992, "step": 5400 }, { "epoch": 0.8352600038662285, "grad_norm": 5.067920207977295, "learning_rate": 4.00876389048001e-06, "logits/chosen": 14.423843383789062, "logits/rejected": 11.532532691955566, "logps/chosen": -367.6778869628906, "logps/rejected": -350.1405334472656, "loss": 0.5488, "rewards/accuracies": 0.75, "rewards/chosen": 0.38997527956962585, "rewards/margins": 0.5512980222702026, "rewards/rejected": -0.161322683095932, "step": 5401 }, { "epoch": 0.8354146530059926, "grad_norm": 3.7607522010803223, "learning_rate": 4.008477488830336e-06, "logits/chosen": 10.857397079467773, "logits/rejected": 7.8524980545043945, "logps/chosen": -348.26129150390625, "logps/rejected": -294.834716796875, "loss": 0.5272, "rewards/accuracies": 0.625, "rewards/chosen": 0.7445254325866699, "rewards/margins": 0.5306577682495117, "rewards/rejected": 0.2138676792383194, "step": 5402 }, { "epoch": 0.8355693021457569, "grad_norm": 5.832326889038086, "learning_rate": 4.008191087180662e-06, "logits/chosen": 14.55161190032959, "logits/rejected": 7.82150411605835, "logps/chosen": -408.7925720214844, "logps/rejected": -263.0992431640625, "loss": 0.6181, "rewards/accuracies": 0.375, "rewards/chosen": 0.41906851530075073, "rewards/margins": 0.30957135558128357, "rewards/rejected": 0.10949714481830597, "step": 5403 }, { "epoch": 0.835723951285521, "grad_norm": 5.645401954650879, "learning_rate": 4.007904685530989e-06, "logits/chosen": 11.063433647155762, "logits/rejected": 10.394119262695312, "logps/chosen": -271.5616455078125, "logps/rejected": -244.5500946044922, "loss": 0.7557, "rewards/accuracies": 0.5, "rewards/chosen": -0.03717225790023804, "rewards/margins": -0.03263280168175697, "rewards/rejected": -0.004539459943771362, "step": 5404 }, { "epoch": 0.8358786004252852, "grad_norm": 5.294131278991699, "learning_rate": 4.007618283881316e-06, "logits/chosen": 6.1546502113342285, "logits/rejected": 5.169168949127197, "logps/chosen": -269.3834533691406, "logps/rejected": -223.49252319335938, "loss": 0.8347, "rewards/accuracies": 0.625, "rewards/chosen": -0.08259715884923935, "rewards/margins": -0.02604052983224392, "rewards/rejected": -0.05655666068196297, "step": 5405 }, { "epoch": 0.8360332495650493, "grad_norm": 5.916450023651123, "learning_rate": 4.007331882231642e-06, "logits/chosen": 15.58582878112793, "logits/rejected": 15.447723388671875, "logps/chosen": -310.55780029296875, "logps/rejected": -316.4345397949219, "loss": 0.7655, "rewards/accuracies": 0.375, "rewards/chosen": 0.4645574390888214, "rewards/margins": -0.09004535526037216, "rewards/rejected": 0.5546028017997742, "step": 5406 }, { "epoch": 0.8361878987048135, "grad_norm": 2.8976948261260986, "learning_rate": 4.007045480581969e-06, "logits/chosen": 15.364934921264648, "logits/rejected": 13.665740966796875, "logps/chosen": -126.39093017578125, "logps/rejected": -133.12171936035156, "loss": 0.5433, "rewards/accuracies": 0.875, "rewards/chosen": 0.10509534180164337, "rewards/margins": 0.45479148626327515, "rewards/rejected": -0.34969615936279297, "step": 5407 }, { "epoch": 0.8363425478445776, "grad_norm": 6.552395820617676, "learning_rate": 4.006759078932296e-06, "logits/chosen": 8.795272827148438, "logits/rejected": 5.817706108093262, "logps/chosen": -299.75738525390625, "logps/rejected": -261.15863037109375, "loss": 0.6783, "rewards/accuracies": 0.625, "rewards/chosen": 0.17785143852233887, "rewards/margins": 0.30028846859931946, "rewards/rejected": -0.12243704497814178, "step": 5408 }, { "epoch": 0.8364971969843418, "grad_norm": 6.476000785827637, "learning_rate": 4.006472677282621e-06, "logits/chosen": 11.645604133605957, "logits/rejected": 8.61089038848877, "logps/chosen": -259.6112060546875, "logps/rejected": -225.9443359375, "loss": 0.8523, "rewards/accuracies": 0.5, "rewards/chosen": -0.005492404103279114, "rewards/margins": -0.264350563287735, "rewards/rejected": 0.2588581442832947, "step": 5409 }, { "epoch": 0.8366518461241059, "grad_norm": 3.7469305992126465, "learning_rate": 4.006186275632948e-06, "logits/chosen": 12.261303901672363, "logits/rejected": 5.355901718139648, "logps/chosen": -231.98089599609375, "logps/rejected": -210.46971130371094, "loss": 0.4566, "rewards/accuracies": 0.75, "rewards/chosen": 0.39939552545547485, "rewards/margins": 0.6560695171356201, "rewards/rejected": -0.25667402148246765, "step": 5410 }, { "epoch": 0.8368064952638701, "grad_norm": 11.923209190368652, "learning_rate": 4.005899873983275e-06, "logits/chosen": 15.726322174072266, "logits/rejected": 10.33785629272461, "logps/chosen": -393.6388854980469, "logps/rejected": -321.0372619628906, "loss": 0.6675, "rewards/accuracies": 0.375, "rewards/chosen": 0.38613855838775635, "rewards/margins": 0.17060332000255585, "rewards/rejected": 0.21553519368171692, "step": 5411 }, { "epoch": 0.8369611444036342, "grad_norm": 5.933987617492676, "learning_rate": 4.005613472333601e-06, "logits/chosen": 8.458877563476562, "logits/rejected": 8.396862030029297, "logps/chosen": -319.454833984375, "logps/rejected": -336.85076904296875, "loss": 0.6865, "rewards/accuracies": 0.5, "rewards/chosen": 0.1493656188249588, "rewards/margins": 0.13950181007385254, "rewards/rejected": 0.009863808751106262, "step": 5412 }, { "epoch": 0.8371157935433984, "grad_norm": 4.926679611206055, "learning_rate": 4.005327070683927e-06, "logits/chosen": 9.339546203613281, "logits/rejected": -0.37227004766464233, "logps/chosen": -329.81329345703125, "logps/rejected": -216.01385498046875, "loss": 0.5934, "rewards/accuracies": 0.75, "rewards/chosen": 0.24635376036167145, "rewards/margins": 0.3219704031944275, "rewards/rejected": -0.07561665028333664, "step": 5413 }, { "epoch": 0.8372704426831625, "grad_norm": 5.30418062210083, "learning_rate": 4.005040669034254e-06, "logits/chosen": 12.046310424804688, "logits/rejected": 10.75256633758545, "logps/chosen": -251.45831298828125, "logps/rejected": -233.2830047607422, "loss": 0.7095, "rewards/accuracies": 0.375, "rewards/chosen": 0.39050063490867615, "rewards/margins": 0.01399308443069458, "rewards/rejected": 0.37650758028030396, "step": 5414 }, { "epoch": 0.8374250918229267, "grad_norm": 5.1942219734191895, "learning_rate": 4.0047542673845805e-06, "logits/chosen": 16.091569900512695, "logits/rejected": 10.159652709960938, "logps/chosen": -324.17486572265625, "logps/rejected": -255.77749633789062, "loss": 0.4045, "rewards/accuracies": 1.0, "rewards/chosen": 0.2942097783088684, "rewards/margins": 0.8292987942695618, "rewards/rejected": -0.5350890159606934, "step": 5415 }, { "epoch": 0.8375797409626909, "grad_norm": 4.26779317855835, "learning_rate": 4.004467865734907e-06, "logits/chosen": 16.833147048950195, "logits/rejected": 12.745699882507324, "logps/chosen": -216.31436157226562, "logps/rejected": -178.08505249023438, "loss": 0.5441, "rewards/accuracies": 0.625, "rewards/chosen": 0.18688784539699554, "rewards/margins": 0.37858593463897705, "rewards/rejected": -0.1916980892419815, "step": 5416 }, { "epoch": 0.8377343901024551, "grad_norm": 6.008831977844238, "learning_rate": 4.004181464085233e-06, "logits/chosen": 10.574746131896973, "logits/rejected": 8.50964069366455, "logps/chosen": -200.4350128173828, "logps/rejected": -203.24606323242188, "loss": 0.5834, "rewards/accuracies": 0.625, "rewards/chosen": 0.32726478576660156, "rewards/margins": 0.3187827169895172, "rewards/rejected": 0.008482083678245544, "step": 5417 }, { "epoch": 0.8378890392422192, "grad_norm": 5.158947944641113, "learning_rate": 4.0038950624355596e-06, "logits/chosen": 9.645227432250977, "logits/rejected": 6.985991954803467, "logps/chosen": -328.030517578125, "logps/rejected": -256.05377197265625, "loss": 0.597, "rewards/accuracies": 0.625, "rewards/chosen": 0.6289145946502686, "rewards/margins": 0.3069944381713867, "rewards/rejected": 0.32192009687423706, "step": 5418 }, { "epoch": 0.8380436883819834, "grad_norm": 4.650661945343018, "learning_rate": 4.003608660785886e-06, "logits/chosen": 11.20956802368164, "logits/rejected": 2.6708712577819824, "logps/chosen": -333.18011474609375, "logps/rejected": -222.7096405029297, "loss": 0.4484, "rewards/accuracies": 0.875, "rewards/chosen": 0.40139809250831604, "rewards/margins": 0.7944390773773193, "rewards/rejected": -0.39304110407829285, "step": 5419 }, { "epoch": 0.8381983375217475, "grad_norm": 5.28973913192749, "learning_rate": 4.003322259136213e-06, "logits/chosen": 11.617319107055664, "logits/rejected": 11.456250190734863, "logps/chosen": -273.3245849609375, "logps/rejected": -217.79920959472656, "loss": 0.6176, "rewards/accuracies": 0.625, "rewards/chosen": 0.13470740616321564, "rewards/margins": 0.23251557350158691, "rewards/rejected": -0.09780816733837128, "step": 5420 }, { "epoch": 0.8383529866615117, "grad_norm": 5.416359901428223, "learning_rate": 4.0030358574865395e-06, "logits/chosen": 3.3947293758392334, "logits/rejected": 5.646587371826172, "logps/chosen": -227.69610595703125, "logps/rejected": -242.85885620117188, "loss": 0.6724, "rewards/accuracies": 0.5, "rewards/chosen": 0.14110738039016724, "rewards/margins": 0.07445431500673294, "rewards/rejected": 0.0666530653834343, "step": 5421 }, { "epoch": 0.8385076358012759, "grad_norm": 6.574779033660889, "learning_rate": 4.002749455836865e-06, "logits/chosen": 3.8974621295928955, "logits/rejected": 5.640439033508301, "logps/chosen": -249.04949951171875, "logps/rejected": -253.0032958984375, "loss": 0.6468, "rewards/accuracies": 0.5, "rewards/chosen": 0.3739694356918335, "rewards/margins": 0.20426106452941895, "rewards/rejected": 0.16970840096473694, "step": 5422 }, { "epoch": 0.83866228494104, "grad_norm": 4.975757122039795, "learning_rate": 4.002463054187192e-06, "logits/chosen": 11.546712875366211, "logits/rejected": 9.134991645812988, "logps/chosen": -287.65704345703125, "logps/rejected": -263.9325866699219, "loss": 0.7327, "rewards/accuracies": 0.5, "rewards/chosen": 0.26884832978248596, "rewards/margins": 0.012437999248504639, "rewards/rejected": 0.25641030073165894, "step": 5423 }, { "epoch": 0.8388169340808042, "grad_norm": 7.482711315155029, "learning_rate": 4.002176652537519e-06, "logits/chosen": 12.253576278686523, "logits/rejected": 10.279074668884277, "logps/chosen": -317.8530578613281, "logps/rejected": -239.5455322265625, "loss": 0.7639, "rewards/accuracies": 0.375, "rewards/chosen": 0.2441038191318512, "rewards/margins": 0.05510842800140381, "rewards/rejected": 0.1889953762292862, "step": 5424 }, { "epoch": 0.8389715832205683, "grad_norm": 5.61442232131958, "learning_rate": 4.001890250887845e-06, "logits/chosen": 2.2847537994384766, "logits/rejected": 3.24780535697937, "logps/chosen": -164.4940185546875, "logps/rejected": -216.36029052734375, "loss": 0.8194, "rewards/accuracies": 0.375, "rewards/chosen": -0.3095889985561371, "rewards/margins": -0.2146223783493042, "rewards/rejected": -0.09496665000915527, "step": 5425 }, { "epoch": 0.8391262323603325, "grad_norm": 4.330936431884766, "learning_rate": 4.001603849238172e-06, "logits/chosen": 8.27112102508545, "logits/rejected": 5.523753643035889, "logps/chosen": -232.79727172851562, "logps/rejected": -219.52406311035156, "loss": 0.5741, "rewards/accuracies": 0.625, "rewards/chosen": 0.11994314193725586, "rewards/margins": 0.5259772539138794, "rewards/rejected": -0.4060341417789459, "step": 5426 }, { "epoch": 0.8392808815000966, "grad_norm": 5.4964213371276855, "learning_rate": 4.001317447588499e-06, "logits/chosen": 7.427450180053711, "logits/rejected": 7.085107803344727, "logps/chosen": -263.59686279296875, "logps/rejected": -222.1402587890625, "loss": 0.6939, "rewards/accuracies": 0.375, "rewards/chosen": 0.256794810295105, "rewards/margins": 0.06467723101377487, "rewards/rejected": 0.1921176016330719, "step": 5427 }, { "epoch": 0.8394355306398608, "grad_norm": 6.141524314880371, "learning_rate": 4.001031045938825e-06, "logits/chosen": 9.463638305664062, "logits/rejected": 12.04014778137207, "logps/chosen": -197.3345947265625, "logps/rejected": -293.56268310546875, "loss": 0.7188, "rewards/accuracies": 0.5, "rewards/chosen": -0.08512932062149048, "rewards/margins": 0.06572775542736053, "rewards/rejected": -0.150857076048851, "step": 5428 }, { "epoch": 0.839590179779625, "grad_norm": 4.164482593536377, "learning_rate": 4.000744644289151e-06, "logits/chosen": 15.505621910095215, "logits/rejected": 6.917988300323486, "logps/chosen": -352.1197814941406, "logps/rejected": -238.9019012451172, "loss": 0.4952, "rewards/accuracies": 0.625, "rewards/chosen": 0.4008140563964844, "rewards/margins": 0.66022127866745, "rewards/rejected": -0.2594072222709656, "step": 5429 }, { "epoch": 0.8397448289193892, "grad_norm": 5.01873254776001, "learning_rate": 4.000458242639478e-06, "logits/chosen": 11.75703239440918, "logits/rejected": 9.634759902954102, "logps/chosen": -288.7507629394531, "logps/rejected": -269.29498291015625, "loss": 0.601, "rewards/accuracies": 0.625, "rewards/chosen": 0.22041340172290802, "rewards/margins": 0.3064883351325989, "rewards/rejected": -0.08607494086027145, "step": 5430 }, { "epoch": 0.8398994780591533, "grad_norm": 5.715124607086182, "learning_rate": 4.000171840989804e-06, "logits/chosen": 5.409604072570801, "logits/rejected": 10.110594749450684, "logps/chosen": -177.2064208984375, "logps/rejected": -220.87513732910156, "loss": 0.8621, "rewards/accuracies": 0.375, "rewards/chosen": -0.014394789934158325, "rewards/margins": -0.12481828778982162, "rewards/rejected": 0.1104234978556633, "step": 5431 }, { "epoch": 0.8400541271989175, "grad_norm": 5.102543354034424, "learning_rate": 3.999885439340131e-06, "logits/chosen": 12.225706100463867, "logits/rejected": 6.9984917640686035, "logps/chosen": -182.45681762695312, "logps/rejected": -111.1597671508789, "loss": 0.5693, "rewards/accuracies": 0.875, "rewards/chosen": 0.24482709169387817, "rewards/margins": 0.36885514855384827, "rewards/rejected": -0.12402810156345367, "step": 5432 }, { "epoch": 0.8402087763386816, "grad_norm": 4.921548843383789, "learning_rate": 3.999599037690458e-06, "logits/chosen": 9.975991249084473, "logits/rejected": 10.47380256652832, "logps/chosen": -187.17385864257812, "logps/rejected": -190.00796508789062, "loss": 0.643, "rewards/accuracies": 0.625, "rewards/chosen": -0.04586517810821533, "rewards/margins": 0.1925581395626068, "rewards/rejected": -0.23842331767082214, "step": 5433 }, { "epoch": 0.8403634254784458, "grad_norm": 3.8383398056030273, "learning_rate": 3.999312636040784e-06, "logits/chosen": 9.934160232543945, "logits/rejected": 6.6337385177612305, "logps/chosen": -200.88095092773438, "logps/rejected": -111.7099838256836, "loss": 0.5699, "rewards/accuracies": 0.625, "rewards/chosen": 0.018119312822818756, "rewards/margins": 0.45537564158439636, "rewards/rejected": -0.4372563362121582, "step": 5434 }, { "epoch": 0.8405180746182099, "grad_norm": 5.574609756469727, "learning_rate": 3.99902623439111e-06, "logits/chosen": 9.462666511535645, "logits/rejected": 3.1613376140594482, "logps/chosen": -348.7305908203125, "logps/rejected": -304.126708984375, "loss": 0.4777, "rewards/accuracies": 0.75, "rewards/chosen": 0.15324078500270844, "rewards/margins": 0.718546986579895, "rewards/rejected": -0.5653061866760254, "step": 5435 }, { "epoch": 0.8406727237579741, "grad_norm": 5.148168087005615, "learning_rate": 3.998739832741437e-06, "logits/chosen": 8.252252578735352, "logits/rejected": 6.443573474884033, "logps/chosen": -268.34515380859375, "logps/rejected": -227.71771240234375, "loss": 0.6833, "rewards/accuracies": 0.375, "rewards/chosen": 0.2763122618198395, "rewards/margins": 0.2445923089981079, "rewards/rejected": 0.031719960272312164, "step": 5436 }, { "epoch": 0.8408273728977382, "grad_norm": 4.207274436950684, "learning_rate": 3.998453431091763e-06, "logits/chosen": 11.124573707580566, "logits/rejected": 7.32269287109375, "logps/chosen": -285.9581604003906, "logps/rejected": -257.38043212890625, "loss": 0.4608, "rewards/accuracies": 1.0, "rewards/chosen": 0.49729377031326294, "rewards/margins": 0.576913595199585, "rewards/rejected": -0.07961972057819366, "step": 5437 }, { "epoch": 0.8409820220375024, "grad_norm": 5.801570892333984, "learning_rate": 3.99816702944209e-06, "logits/chosen": 7.315394401550293, "logits/rejected": 3.6569457054138184, "logps/chosen": -551.8583984375, "logps/rejected": -265.0906066894531, "loss": 0.6491, "rewards/accuracies": 0.5, "rewards/chosen": 0.08991736173629761, "rewards/margins": 0.22030098736286163, "rewards/rejected": -0.13038358092308044, "step": 5438 }, { "epoch": 0.8411366711772665, "grad_norm": 7.415975570678711, "learning_rate": 3.997880627792417e-06, "logits/chosen": 10.483821868896484, "logits/rejected": 5.198424339294434, "logps/chosen": -201.1514892578125, "logps/rejected": -182.17062377929688, "loss": 0.8435, "rewards/accuracies": 0.5, "rewards/chosen": -0.054277002811431885, "rewards/margins": -0.07612176239490509, "rewards/rejected": 0.021844759583473206, "step": 5439 }, { "epoch": 0.8412913203170307, "grad_norm": 13.179608345031738, "learning_rate": 3.997594226142743e-06, "logits/chosen": 12.54238510131836, "logits/rejected": 15.689638137817383, "logps/chosen": -337.2723083496094, "logps/rejected": -327.63751220703125, "loss": 0.571, "rewards/accuracies": 0.75, "rewards/chosen": -0.11863747239112854, "rewards/margins": 0.4537114202976227, "rewards/rejected": -0.572348952293396, "step": 5440 }, { "epoch": 0.841445969456795, "grad_norm": 7.264350414276123, "learning_rate": 3.99730782449307e-06, "logits/chosen": 11.289186477661133, "logits/rejected": 8.527827262878418, "logps/chosen": -338.5001220703125, "logps/rejected": -252.5107421875, "loss": 0.8086, "rewards/accuracies": 0.375, "rewards/chosen": 0.03689804673194885, "rewards/margins": -0.08825166523456573, "rewards/rejected": 0.12514972686767578, "step": 5441 }, { "epoch": 0.8416006185965591, "grad_norm": 5.569958686828613, "learning_rate": 3.997021422843396e-06, "logits/chosen": 8.517095565795898, "logits/rejected": 8.322416305541992, "logps/chosen": -301.5049133300781, "logps/rejected": -237.5975799560547, "loss": 0.6905, "rewards/accuracies": 0.375, "rewards/chosen": 0.19004316627979279, "rewards/margins": 0.10138611495494843, "rewards/rejected": 0.08865704387426376, "step": 5442 }, { "epoch": 0.8417552677363233, "grad_norm": 4.2139997482299805, "learning_rate": 3.9967350211937224e-06, "logits/chosen": 11.163293838500977, "logits/rejected": 5.496822357177734, "logps/chosen": -294.4971008300781, "logps/rejected": -254.0455322265625, "loss": 0.4135, "rewards/accuracies": 0.875, "rewards/chosen": 0.518298327922821, "rewards/margins": 0.8536103963851929, "rewards/rejected": -0.3353120684623718, "step": 5443 }, { "epoch": 0.8419099168760874, "grad_norm": 4.374655246734619, "learning_rate": 3.996448619544049e-06, "logits/chosen": 8.741077423095703, "logits/rejected": 7.998415946960449, "logps/chosen": -240.798828125, "logps/rejected": -243.75619506835938, "loss": 0.6448, "rewards/accuracies": 0.5, "rewards/chosen": 0.28527846932411194, "rewards/margins": 0.21384401619434357, "rewards/rejected": 0.07143445312976837, "step": 5444 }, { "epoch": 0.8420645660158516, "grad_norm": 5.06480073928833, "learning_rate": 3.996162217894376e-06, "logits/chosen": 9.56277847290039, "logits/rejected": 10.73475170135498, "logps/chosen": -332.46563720703125, "logps/rejected": -287.79486083984375, "loss": 0.6308, "rewards/accuracies": 0.625, "rewards/chosen": 0.19607073068618774, "rewards/margins": 0.18030638992786407, "rewards/rejected": 0.015764333307743073, "step": 5445 }, { "epoch": 0.8422192151556157, "grad_norm": 5.909814834594727, "learning_rate": 3.995875816244702e-06, "logits/chosen": 8.587616920471191, "logits/rejected": 6.354789733886719, "logps/chosen": -203.06478881835938, "logps/rejected": -216.36228942871094, "loss": 0.7618, "rewards/accuracies": 0.625, "rewards/chosen": -0.333962619304657, "rewards/margins": 0.006642401218414307, "rewards/rejected": -0.3406050205230713, "step": 5446 }, { "epoch": 0.8423738642953799, "grad_norm": 4.2088727951049805, "learning_rate": 3.995589414595028e-06, "logits/chosen": 9.15581226348877, "logits/rejected": 5.306465148925781, "logps/chosen": -240.82017517089844, "logps/rejected": -240.62744140625, "loss": 0.5462, "rewards/accuracies": 0.75, "rewards/chosen": 0.09779989719390869, "rewards/margins": 0.4776156544685364, "rewards/rejected": -0.3798157572746277, "step": 5447 }, { "epoch": 0.842528513435144, "grad_norm": 8.272249221801758, "learning_rate": 3.995303012945355e-06, "logits/chosen": 12.955611228942871, "logits/rejected": 7.64576530456543, "logps/chosen": -371.28948974609375, "logps/rejected": -270.0327453613281, "loss": 0.7689, "rewards/accuracies": 0.625, "rewards/chosen": -0.22182704508304596, "rewards/margins": 0.3040510416030884, "rewards/rejected": -0.5258780717849731, "step": 5448 }, { "epoch": 0.8426831625749082, "grad_norm": 5.398093223571777, "learning_rate": 3.9950166112956815e-06, "logits/chosen": 5.926149845123291, "logits/rejected": 6.374869346618652, "logps/chosen": -248.61924743652344, "logps/rejected": -289.9886779785156, "loss": 0.5821, "rewards/accuracies": 0.625, "rewards/chosen": 0.014869850128889084, "rewards/margins": 0.4561840891838074, "rewards/rejected": -0.4413142204284668, "step": 5449 }, { "epoch": 0.8428378117146723, "grad_norm": 3.6684465408325195, "learning_rate": 3.994730209646008e-06, "logits/chosen": 13.245660781860352, "logits/rejected": 5.497580528259277, "logps/chosen": -191.15628051757812, "logps/rejected": -127.91675567626953, "loss": 0.5407, "rewards/accuracies": 0.625, "rewards/chosen": 0.03709081560373306, "rewards/margins": 0.47077396512031555, "rewards/rejected": -0.4336831271648407, "step": 5450 }, { "epoch": 0.8429924608544365, "grad_norm": 124.80889129638672, "learning_rate": 3.994443807996334e-06, "logits/chosen": 7.019758224487305, "logits/rejected": 8.855713844299316, "logps/chosen": -274.4535217285156, "logps/rejected": -306.98382568359375, "loss": 0.6544, "rewards/accuracies": 0.375, "rewards/chosen": 0.17454136908054352, "rewards/margins": 0.4354098439216614, "rewards/rejected": -0.2608683705329895, "step": 5451 }, { "epoch": 0.8431471099942006, "grad_norm": 6.874704837799072, "learning_rate": 3.994157406346661e-06, "logits/chosen": 6.674954414367676, "logits/rejected": 2.596411943435669, "logps/chosen": -221.28607177734375, "logps/rejected": -212.7696075439453, "loss": 0.981, "rewards/accuracies": 0.5, "rewards/chosen": -0.24677175283432007, "rewards/margins": -0.3635064363479614, "rewards/rejected": 0.11673471331596375, "step": 5452 }, { "epoch": 0.8433017591339648, "grad_norm": 5.543606758117676, "learning_rate": 3.993871004696987e-06, "logits/chosen": 8.839470863342285, "logits/rejected": 9.854912757873535, "logps/chosen": -211.8182373046875, "logps/rejected": -238.90908813476562, "loss": 0.7424, "rewards/accuracies": 0.5, "rewards/chosen": -0.3842387795448303, "rewards/margins": -0.038873158395290375, "rewards/rejected": -0.34536558389663696, "step": 5453 }, { "epoch": 0.843456408273729, "grad_norm": 6.784116744995117, "learning_rate": 3.993584603047314e-06, "logits/chosen": 14.78835391998291, "logits/rejected": 13.23796558380127, "logps/chosen": -310.6944885253906, "logps/rejected": -251.83795166015625, "loss": 0.8677, "rewards/accuracies": 0.25, "rewards/chosen": -0.014471471309661865, "rewards/margins": -0.24712349474430084, "rewards/rejected": 0.2326519936323166, "step": 5454 }, { "epoch": 0.8436110574134932, "grad_norm": 4.349673271179199, "learning_rate": 3.99329820139764e-06, "logits/chosen": 10.144844055175781, "logits/rejected": 14.238162994384766, "logps/chosen": -157.75543212890625, "logps/rejected": -169.78427124023438, "loss": 0.6516, "rewards/accuracies": 0.625, "rewards/chosen": 0.18741624057292938, "rewards/margins": 0.2006918489933014, "rewards/rejected": -0.013275638222694397, "step": 5455 }, { "epoch": 0.8437657065532573, "grad_norm": 5.367730617523193, "learning_rate": 3.993011799747966e-06, "logits/chosen": 12.548636436462402, "logits/rejected": 10.133837699890137, "logps/chosen": -282.38360595703125, "logps/rejected": -311.2052307128906, "loss": 0.5331, "rewards/accuracies": 0.875, "rewards/chosen": 0.4988144040107727, "rewards/margins": 0.3848533034324646, "rewards/rejected": 0.11396108567714691, "step": 5456 }, { "epoch": 0.8439203556930215, "grad_norm": 6.205030918121338, "learning_rate": 3.992725398098293e-06, "logits/chosen": 12.612508773803711, "logits/rejected": 5.023171901702881, "logps/chosen": -454.2065734863281, "logps/rejected": -228.8165740966797, "loss": 0.662, "rewards/accuracies": 0.625, "rewards/chosen": 0.3579191267490387, "rewards/margins": 0.28423964977264404, "rewards/rejected": 0.07367949932813644, "step": 5457 }, { "epoch": 0.8440750048327856, "grad_norm": 4.7500901222229, "learning_rate": 3.99243899644862e-06, "logits/chosen": 8.069990158081055, "logits/rejected": 10.263045310974121, "logps/chosen": -149.6179962158203, "logps/rejected": -191.2418212890625, "loss": 0.7158, "rewards/accuracies": 0.375, "rewards/chosen": -0.16837653517723083, "rewards/margins": 0.043401919305324554, "rewards/rejected": -0.2117784470319748, "step": 5458 }, { "epoch": 0.8442296539725498, "grad_norm": 4.306687355041504, "learning_rate": 3.992152594798946e-06, "logits/chosen": 11.442225456237793, "logits/rejected": 1.4943844079971313, "logps/chosen": -242.65017700195312, "logps/rejected": -108.43907928466797, "loss": 0.5709, "rewards/accuracies": 0.75, "rewards/chosen": 0.12707114219665527, "rewards/margins": 0.47909146547317505, "rewards/rejected": -0.3520203232765198, "step": 5459 }, { "epoch": 0.8443843031123139, "grad_norm": 5.47991943359375, "learning_rate": 3.991866193149273e-06, "logits/chosen": 9.255265235900879, "logits/rejected": 6.019491195678711, "logps/chosen": -253.19265747070312, "logps/rejected": -192.83509826660156, "loss": 0.6754, "rewards/accuracies": 0.625, "rewards/chosen": 0.06651601195335388, "rewards/margins": 0.19208915531635284, "rewards/rejected": -0.12557317316532135, "step": 5460 }, { "epoch": 0.8445389522520781, "grad_norm": 3.825132369995117, "learning_rate": 3.9915797914996e-06, "logits/chosen": 8.347002983093262, "logits/rejected": 9.163528442382812, "logps/chosen": -165.01242065429688, "logps/rejected": -166.1040802001953, "loss": 0.6086, "rewards/accuracies": 0.625, "rewards/chosen": 0.13648171722888947, "rewards/margins": 0.3249170184135437, "rewards/rejected": -0.18843528628349304, "step": 5461 }, { "epoch": 0.8446936013918422, "grad_norm": 5.549593448638916, "learning_rate": 3.991293389849925e-06, "logits/chosen": 13.56130599975586, "logits/rejected": 12.508157730102539, "logps/chosen": -263.97509765625, "logps/rejected": -273.3756103515625, "loss": 0.6705, "rewards/accuracies": 0.5, "rewards/chosen": -0.050782106816768646, "rewards/margins": 0.08747228980064392, "rewards/rejected": -0.13825440406799316, "step": 5462 }, { "epoch": 0.8448482505316064, "grad_norm": 5.221488952636719, "learning_rate": 3.991006988200252e-06, "logits/chosen": 7.283550262451172, "logits/rejected": 4.725826263427734, "logps/chosen": -362.39312744140625, "logps/rejected": -299.189208984375, "loss": 0.5214, "rewards/accuracies": 0.625, "rewards/chosen": 0.6565374732017517, "rewards/margins": 0.79338538646698, "rewards/rejected": -0.13684791326522827, "step": 5463 }, { "epoch": 0.8450028996713705, "grad_norm": 3.4726195335388184, "learning_rate": 3.990720586550579e-06, "logits/chosen": 11.575824737548828, "logits/rejected": -0.2912936210632324, "logps/chosen": -329.842041015625, "logps/rejected": -176.5313262939453, "loss": 0.4885, "rewards/accuracies": 0.625, "rewards/chosen": 0.06805653870105743, "rewards/margins": 0.6268208026885986, "rewards/rejected": -0.5587642788887024, "step": 5464 }, { "epoch": 0.8451575488111347, "grad_norm": 5.900938034057617, "learning_rate": 3.990434184900905e-06, "logits/chosen": 13.171445846557617, "logits/rejected": 6.941216945648193, "logps/chosen": -353.9685363769531, "logps/rejected": -183.34910583496094, "loss": 0.4463, "rewards/accuracies": 0.75, "rewards/chosen": 0.4265161156654358, "rewards/margins": 0.8473890423774719, "rewards/rejected": -0.4208729565143585, "step": 5465 }, { "epoch": 0.8453121979508988, "grad_norm": 7.092343330383301, "learning_rate": 3.990147783251232e-06, "logits/chosen": 9.414999961853027, "logits/rejected": 9.273770332336426, "logps/chosen": -354.06866455078125, "logps/rejected": -328.0182189941406, "loss": 0.6045, "rewards/accuracies": 0.75, "rewards/chosen": 0.14364787936210632, "rewards/margins": 0.26097074151039124, "rewards/rejected": -0.11732286214828491, "step": 5466 }, { "epoch": 0.8454668470906631, "grad_norm": 6.088544845581055, "learning_rate": 3.989861381601559e-06, "logits/chosen": 8.224116325378418, "logits/rejected": 10.840134620666504, "logps/chosen": -418.8666687011719, "logps/rejected": -407.1133117675781, "loss": 0.5853, "rewards/accuracies": 0.625, "rewards/chosen": 0.14702008664608002, "rewards/margins": 0.394416481256485, "rewards/rejected": -0.24739637970924377, "step": 5467 }, { "epoch": 0.8456214962304273, "grad_norm": 5.374605178833008, "learning_rate": 3.9895749799518845e-06, "logits/chosen": 12.732425689697266, "logits/rejected": 14.39950180053711, "logps/chosen": -236.18177795410156, "logps/rejected": -282.9896240234375, "loss": 0.8467, "rewards/accuracies": 0.375, "rewards/chosen": -0.2567649781703949, "rewards/margins": -0.23266582190990448, "rewards/rejected": -0.024099163711071014, "step": 5468 }, { "epoch": 0.8457761453701914, "grad_norm": 5.545962333679199, "learning_rate": 3.989288578302211e-06, "logits/chosen": 12.432683944702148, "logits/rejected": 3.2952661514282227, "logps/chosen": -269.54656982421875, "logps/rejected": -229.50872802734375, "loss": 0.5971, "rewards/accuracies": 0.75, "rewards/chosen": 0.25189971923828125, "rewards/margins": 0.23972085118293762, "rewards/rejected": 0.012178882956504822, "step": 5469 }, { "epoch": 0.8459307945099556, "grad_norm": 3.770609140396118, "learning_rate": 3.989002176652538e-06, "logits/chosen": 12.660408973693848, "logits/rejected": 7.900226593017578, "logps/chosen": -268.8815612792969, "logps/rejected": -205.54701232910156, "loss": 0.5838, "rewards/accuracies": 0.875, "rewards/chosen": 0.3496103286743164, "rewards/margins": 0.25646916031837463, "rewards/rejected": 0.09314117580652237, "step": 5470 }, { "epoch": 0.8460854436497197, "grad_norm": 6.674996852874756, "learning_rate": 3.9887157750028644e-06, "logits/chosen": 9.373661041259766, "logits/rejected": 6.587751865386963, "logps/chosen": -425.7975158691406, "logps/rejected": -419.66815185546875, "loss": 0.5703, "rewards/accuracies": 0.75, "rewards/chosen": 0.4166502356529236, "rewards/margins": 0.4247686564922333, "rewards/rejected": -0.008118439465761185, "step": 5471 }, { "epoch": 0.8462400927894839, "grad_norm": 4.417994499206543, "learning_rate": 3.988429373353191e-06, "logits/chosen": 12.378002166748047, "logits/rejected": 14.436321258544922, "logps/chosen": -135.5165557861328, "logps/rejected": -164.73138427734375, "loss": 0.6432, "rewards/accuracies": 0.375, "rewards/chosen": -0.2739633619785309, "rewards/margins": 0.13503243029117584, "rewards/rejected": -0.40899577736854553, "step": 5472 }, { "epoch": 0.846394741929248, "grad_norm": 5.065564155578613, "learning_rate": 3.988142971703518e-06, "logits/chosen": 10.653124809265137, "logits/rejected": 6.704805374145508, "logps/chosen": -267.66009521484375, "logps/rejected": -214.47833251953125, "loss": 0.6208, "rewards/accuracies": 0.5, "rewards/chosen": 0.3979877829551697, "rewards/margins": 0.42701444029808044, "rewards/rejected": -0.029026679694652557, "step": 5473 }, { "epoch": 0.8465493910690122, "grad_norm": 7.963303089141846, "learning_rate": 3.987856570053844e-06, "logits/chosen": 8.149672508239746, "logits/rejected": 10.126824378967285, "logps/chosen": -187.68344116210938, "logps/rejected": -277.41009521484375, "loss": 0.8712, "rewards/accuracies": 0.25, "rewards/chosen": 0.0975956916809082, "rewards/margins": -0.25921422243118286, "rewards/rejected": 0.35680994391441345, "step": 5474 }, { "epoch": 0.8467040402087763, "grad_norm": 5.427379131317139, "learning_rate": 3.98757016840417e-06, "logits/chosen": 15.40723991394043, "logits/rejected": 12.819866180419922, "logps/chosen": -403.01910400390625, "logps/rejected": -333.90325927734375, "loss": 0.4324, "rewards/accuracies": 0.875, "rewards/chosen": 0.6557334661483765, "rewards/margins": 0.815754771232605, "rewards/rejected": -0.16002121567726135, "step": 5475 }, { "epoch": 0.8468586893485405, "grad_norm": 5.994505405426025, "learning_rate": 3.987283766754497e-06, "logits/chosen": 7.1236467361450195, "logits/rejected": 2.8771605491638184, "logps/chosen": -229.33168029785156, "logps/rejected": -191.91180419921875, "loss": 0.626, "rewards/accuracies": 0.625, "rewards/chosen": 0.03733201324939728, "rewards/margins": 0.17790646851062775, "rewards/rejected": -0.14057445526123047, "step": 5476 }, { "epoch": 0.8470133384883046, "grad_norm": 4.784754753112793, "learning_rate": 3.9869973651048235e-06, "logits/chosen": 12.940276145935059, "logits/rejected": 6.782338619232178, "logps/chosen": -316.0126647949219, "logps/rejected": -254.24484252929688, "loss": 0.5747, "rewards/accuracies": 0.75, "rewards/chosen": -0.018910042941570282, "rewards/margins": 0.3407229781150818, "rewards/rejected": -0.3596329689025879, "step": 5477 }, { "epoch": 0.8471679876280688, "grad_norm": 5.256471157073975, "learning_rate": 3.98671096345515e-06, "logits/chosen": 7.633310794830322, "logits/rejected": 4.674440383911133, "logps/chosen": -331.40875244140625, "logps/rejected": -204.9730682373047, "loss": 0.7333, "rewards/accuracies": 0.375, "rewards/chosen": 0.16876892745494843, "rewards/margins": 0.11264100670814514, "rewards/rejected": 0.05612790584564209, "step": 5478 }, { "epoch": 0.8473226367678329, "grad_norm": 5.068542957305908, "learning_rate": 3.986424561805477e-06, "logits/chosen": 9.508679389953613, "logits/rejected": 5.23380708694458, "logps/chosen": -208.66061401367188, "logps/rejected": -164.4246368408203, "loss": 0.7484, "rewards/accuracies": 0.375, "rewards/chosen": -0.20797139406204224, "rewards/margins": -0.022533897310495377, "rewards/rejected": -0.18543748557567596, "step": 5479 }, { "epoch": 0.8474772859075972, "grad_norm": 6.490504264831543, "learning_rate": 3.9861381601558034e-06, "logits/chosen": 8.677637100219727, "logits/rejected": 7.825351715087891, "logps/chosen": -232.0186004638672, "logps/rejected": -198.38323974609375, "loss": 0.6872, "rewards/accuracies": 0.5, "rewards/chosen": -0.35770681500434875, "rewards/margins": 0.12265169620513916, "rewards/rejected": -0.4803585112094879, "step": 5480 }, { "epoch": 0.8476319350473613, "grad_norm": 5.286553859710693, "learning_rate": 3.985851758506129e-06, "logits/chosen": 5.221107482910156, "logits/rejected": 6.806898593902588, "logps/chosen": -187.85260009765625, "logps/rejected": -245.42233276367188, "loss": 0.7391, "rewards/accuracies": 0.5, "rewards/chosen": -0.2765347361564636, "rewards/margins": -0.029204845428466797, "rewards/rejected": -0.24732989072799683, "step": 5481 }, { "epoch": 0.8477865841871255, "grad_norm": 7.118972301483154, "learning_rate": 3.985565356856456e-06, "logits/chosen": 14.64684772491455, "logits/rejected": 12.665660858154297, "logps/chosen": -263.484130859375, "logps/rejected": -300.7195739746094, "loss": 0.6834, "rewards/accuracies": 0.5, "rewards/chosen": 0.37521693110466003, "rewards/margins": 0.20982113480567932, "rewards/rejected": 0.16539576649665833, "step": 5482 }, { "epoch": 0.8479412333268896, "grad_norm": 6.252742767333984, "learning_rate": 3.9852789552067825e-06, "logits/chosen": 9.27850341796875, "logits/rejected": 8.052875518798828, "logps/chosen": -285.70111083984375, "logps/rejected": -285.9509582519531, "loss": 0.76, "rewards/accuracies": 0.75, "rewards/chosen": -0.04352791607379913, "rewards/margins": 0.22563757002353668, "rewards/rejected": -0.2691655158996582, "step": 5483 }, { "epoch": 0.8480958824666538, "grad_norm": 5.236358642578125, "learning_rate": 3.984992553557109e-06, "logits/chosen": 7.729684352874756, "logits/rejected": -1.0859134197235107, "logps/chosen": -260.7658996582031, "logps/rejected": -158.29518127441406, "loss": 0.5808, "rewards/accuracies": 0.625, "rewards/chosen": 0.14956846833229065, "rewards/margins": 0.349578857421875, "rewards/rejected": -0.20001041889190674, "step": 5484 }, { "epoch": 0.848250531606418, "grad_norm": 3.3107223510742188, "learning_rate": 3.984706151907435e-06, "logits/chosen": 5.789141654968262, "logits/rejected": 3.0061471462249756, "logps/chosen": -209.1846466064453, "logps/rejected": -131.57101440429688, "loss": 0.5561, "rewards/accuracies": 0.75, "rewards/chosen": -0.0344119668006897, "rewards/margins": 0.46683138608932495, "rewards/rejected": -0.5012433528900146, "step": 5485 }, { "epoch": 0.8484051807461821, "grad_norm": 3.3643791675567627, "learning_rate": 3.984419750257762e-06, "logits/chosen": 12.296547889709473, "logits/rejected": 5.911550045013428, "logps/chosen": -202.30465698242188, "logps/rejected": -137.86553955078125, "loss": 0.4844, "rewards/accuracies": 0.625, "rewards/chosen": 0.20803004503250122, "rewards/margins": 0.6443226337432861, "rewards/rejected": -0.4362925589084625, "step": 5486 }, { "epoch": 0.8485598298859462, "grad_norm": 22.514795303344727, "learning_rate": 3.984133348608088e-06, "logits/chosen": 9.195449829101562, "logits/rejected": 9.772502899169922, "logps/chosen": -239.89364624023438, "logps/rejected": -248.40615844726562, "loss": 0.796, "rewards/accuracies": 0.5, "rewards/chosen": -0.6489839553833008, "rewards/margins": -0.02284925803542137, "rewards/rejected": -0.6261346936225891, "step": 5487 }, { "epoch": 0.8487144790257104, "grad_norm": 5.163241863250732, "learning_rate": 3.983846946958415e-06, "logits/chosen": 9.574090957641602, "logits/rejected": 13.16737174987793, "logps/chosen": -165.08685302734375, "logps/rejected": -243.1929931640625, "loss": 0.5893, "rewards/accuracies": 0.5, "rewards/chosen": -0.10586822032928467, "rewards/margins": 0.3137997090816498, "rewards/rejected": -0.41966789960861206, "step": 5488 }, { "epoch": 0.8488691281654746, "grad_norm": 4.582966327667236, "learning_rate": 3.983560545308741e-06, "logits/chosen": 8.251498222351074, "logits/rejected": 1.7257134914398193, "logps/chosen": -363.6442565917969, "logps/rejected": -225.69203186035156, "loss": 0.4887, "rewards/accuracies": 0.75, "rewards/chosen": 0.14303795993328094, "rewards/margins": 0.595531702041626, "rewards/rejected": -0.4524937868118286, "step": 5489 }, { "epoch": 0.8490237773052387, "grad_norm": 5.71950626373291, "learning_rate": 3.983274143659067e-06, "logits/chosen": 9.648967742919922, "logits/rejected": 4.315744876861572, "logps/chosen": -304.9479675292969, "logps/rejected": -292.1197509765625, "loss": 0.5288, "rewards/accuracies": 0.625, "rewards/chosen": -0.0482361763715744, "rewards/margins": 0.7476270198822021, "rewards/rejected": -0.7958632707595825, "step": 5490 }, { "epoch": 0.8491784264450029, "grad_norm": 6.292933464050293, "learning_rate": 3.982987742009394e-06, "logits/chosen": 4.617978096008301, "logits/rejected": 6.700188636779785, "logps/chosen": -208.8612060546875, "logps/rejected": -267.6300048828125, "loss": 0.9422, "rewards/accuracies": 0.375, "rewards/chosen": -0.0003688819706439972, "rewards/margins": -0.3538159430027008, "rewards/rejected": 0.3534470796585083, "step": 5491 }, { "epoch": 0.849333075584767, "grad_norm": 3.921041250228882, "learning_rate": 3.982701340359721e-06, "logits/chosen": 9.975062370300293, "logits/rejected": 5.595370769500732, "logps/chosen": -248.07568359375, "logps/rejected": -264.8523254394531, "loss": 0.5387, "rewards/accuracies": 0.875, "rewards/chosen": -0.049535274505615234, "rewards/margins": 0.4183952808380127, "rewards/rejected": -0.46793055534362793, "step": 5492 }, { "epoch": 0.8494877247245313, "grad_norm": 4.795502185821533, "learning_rate": 3.982414938710047e-06, "logits/chosen": 12.421577453613281, "logits/rejected": 8.837335586547852, "logps/chosen": -251.47259521484375, "logps/rejected": -209.2999267578125, "loss": 0.6389, "rewards/accuracies": 0.5, "rewards/chosen": 0.08791498094797134, "rewards/margins": 0.2378964126110077, "rewards/rejected": -0.14998143911361694, "step": 5493 }, { "epoch": 0.8496423738642954, "grad_norm": 3.839031219482422, "learning_rate": 3.982128537060374e-06, "logits/chosen": 7.401708602905273, "logits/rejected": 5.659162998199463, "logps/chosen": -192.65206909179688, "logps/rejected": -117.4052734375, "loss": 0.5601, "rewards/accuracies": 0.75, "rewards/chosen": 0.03605546057224274, "rewards/margins": 0.369170606136322, "rewards/rejected": -0.3331151604652405, "step": 5494 }, { "epoch": 0.8497970230040596, "grad_norm": 6.970352649688721, "learning_rate": 3.9818421354107e-06, "logits/chosen": 8.382070541381836, "logits/rejected": 14.198983192443848, "logps/chosen": -212.7875213623047, "logps/rejected": -306.4270935058594, "loss": 0.9613, "rewards/accuracies": 0.25, "rewards/chosen": -0.38489991426467896, "rewards/margins": -0.437112033367157, "rewards/rejected": 0.05221214145421982, "step": 5495 }, { "epoch": 0.8499516721438237, "grad_norm": 3.9105520248413086, "learning_rate": 3.9815557337610265e-06, "logits/chosen": 10.03400993347168, "logits/rejected": 10.690797805786133, "logps/chosen": -231.88458251953125, "logps/rejected": -230.76101684570312, "loss": 0.6025, "rewards/accuracies": 0.625, "rewards/chosen": 0.06226252019405365, "rewards/margins": 0.3598465323448181, "rewards/rejected": -0.2975839674472809, "step": 5496 }, { "epoch": 0.8501063212835879, "grad_norm": 8.119980812072754, "learning_rate": 3.981269332111353e-06, "logits/chosen": 7.285583972930908, "logits/rejected": 3.6322736740112305, "logps/chosen": -426.3985290527344, "logps/rejected": -408.44232177734375, "loss": 0.7449, "rewards/accuracies": 0.75, "rewards/chosen": 0.1334976851940155, "rewards/margins": 0.354358971118927, "rewards/rejected": -0.22086124122142792, "step": 5497 }, { "epoch": 0.850260970423352, "grad_norm": 3.3217272758483887, "learning_rate": 3.98098293046168e-06, "logits/chosen": 12.459260940551758, "logits/rejected": 4.302353382110596, "logps/chosen": -217.92222595214844, "logps/rejected": -184.69979858398438, "loss": 0.4998, "rewards/accuracies": 0.875, "rewards/chosen": 0.09964251518249512, "rewards/margins": 0.46920478343963623, "rewards/rejected": -0.3695622682571411, "step": 5498 }, { "epoch": 0.8504156195631162, "grad_norm": 9.07116985321045, "learning_rate": 3.980696528812006e-06, "logits/chosen": 6.277608394622803, "logits/rejected": 9.21856689453125, "logps/chosen": -296.05181884765625, "logps/rejected": -331.8778991699219, "loss": 0.8306, "rewards/accuracies": 0.5, "rewards/chosen": -0.15516506135463715, "rewards/margins": -0.1964268982410431, "rewards/rejected": 0.04126181825995445, "step": 5499 }, { "epoch": 0.8505702687028803, "grad_norm": 6.438467025756836, "learning_rate": 3.980410127162333e-06, "logits/chosen": 6.978841304779053, "logits/rejected": 11.333194732666016, "logps/chosen": -189.72842407226562, "logps/rejected": -252.83270263671875, "loss": 0.9321, "rewards/accuracies": 0.375, "rewards/chosen": -0.20909695327281952, "rewards/margins": -0.3070370554924011, "rewards/rejected": 0.0979401096701622, "step": 5500 }, { "epoch": 0.8507249178426445, "grad_norm": 5.399377822875977, "learning_rate": 3.980123725512659e-06, "logits/chosen": 13.276514053344727, "logits/rejected": 18.137290954589844, "logps/chosen": -143.25082397460938, "logps/rejected": -185.884521484375, "loss": 0.9047, "rewards/accuracies": 0.125, "rewards/chosen": -0.2445690631866455, "rewards/margins": -0.3527756333351135, "rewards/rejected": 0.10820655524730682, "step": 5501 }, { "epoch": 0.8508795669824086, "grad_norm": 6.511845588684082, "learning_rate": 3.9798373238629855e-06, "logits/chosen": 8.543060302734375, "logits/rejected": 5.446573734283447, "logps/chosen": -250.5183563232422, "logps/rejected": -240.40802001953125, "loss": 0.7824, "rewards/accuracies": 0.25, "rewards/chosen": -0.08172449469566345, "rewards/margins": -0.10007509589195251, "rewards/rejected": 0.018350616097450256, "step": 5502 }, { "epoch": 0.8510342161221728, "grad_norm": 6.365176200866699, "learning_rate": 3.979550922213312e-06, "logits/chosen": 11.111099243164062, "logits/rejected": 5.582236289978027, "logps/chosen": -383.2252502441406, "logps/rejected": -310.8315124511719, "loss": 0.7168, "rewards/accuracies": 0.5, "rewards/chosen": -0.16108235716819763, "rewards/margins": 0.22597134113311768, "rewards/rejected": -0.3870536983013153, "step": 5503 }, { "epoch": 0.8511888652619369, "grad_norm": 9.928916931152344, "learning_rate": 3.979264520563639e-06, "logits/chosen": 8.950950622558594, "logits/rejected": 6.616206169128418, "logps/chosen": -383.1068420410156, "logps/rejected": -409.5474548339844, "loss": 0.5915, "rewards/accuracies": 0.75, "rewards/chosen": 0.3428735136985779, "rewards/margins": 0.25947338342666626, "rewards/rejected": 0.08340016007423401, "step": 5504 }, { "epoch": 0.8513435144017012, "grad_norm": 3.189671277999878, "learning_rate": 3.9789781189139655e-06, "logits/chosen": 14.187501907348633, "logits/rejected": 8.953895568847656, "logps/chosen": -227.4154815673828, "logps/rejected": -184.2188262939453, "loss": 0.4707, "rewards/accuracies": 0.875, "rewards/chosen": 0.46153968572616577, "rewards/margins": 0.5768137574195862, "rewards/rejected": -0.1152740940451622, "step": 5505 }, { "epoch": 0.8514981635414653, "grad_norm": 5.075366973876953, "learning_rate": 3.978691717264292e-06, "logits/chosen": 14.241186141967773, "logits/rejected": 10.220351219177246, "logps/chosen": -198.8037109375, "logps/rejected": -168.76805114746094, "loss": 0.6364, "rewards/accuracies": 0.375, "rewards/chosen": -0.06963552534580231, "rewards/margins": 0.26270973682403564, "rewards/rejected": -0.33234524726867676, "step": 5506 }, { "epoch": 0.8516528126812295, "grad_norm": 5.984999656677246, "learning_rate": 3.978405315614619e-06, "logits/chosen": 12.944738388061523, "logits/rejected": 11.130903244018555, "logps/chosen": -344.90496826171875, "logps/rejected": -307.46441650390625, "loss": 0.6923, "rewards/accuracies": 0.5, "rewards/chosen": 0.39944949746131897, "rewards/margins": 0.28340253233909607, "rewards/rejected": 0.11604700237512589, "step": 5507 }, { "epoch": 0.8518074618209936, "grad_norm": 5.058184623718262, "learning_rate": 3.9781189139649446e-06, "logits/chosen": 14.496612548828125, "logits/rejected": 3.7888152599334717, "logps/chosen": -352.7911376953125, "logps/rejected": -240.57614135742188, "loss": 0.5169, "rewards/accuracies": 0.75, "rewards/chosen": 0.22777345776557922, "rewards/margins": 0.5655710101127625, "rewards/rejected": -0.33779752254486084, "step": 5508 }, { "epoch": 0.8519621109607578, "grad_norm": 6.677819728851318, "learning_rate": 3.977832512315271e-06, "logits/chosen": 9.146625518798828, "logits/rejected": 4.415740966796875, "logps/chosen": -337.3669128417969, "logps/rejected": -276.8423767089844, "loss": 0.6121, "rewards/accuracies": 0.5, "rewards/chosen": 0.3911222219467163, "rewards/margins": 0.36787116527557373, "rewards/rejected": 0.023251086473464966, "step": 5509 }, { "epoch": 0.852116760100522, "grad_norm": 5.219935894012451, "learning_rate": 3.977546110665598e-06, "logits/chosen": 11.380074501037598, "logits/rejected": 4.655664920806885, "logps/chosen": -284.1026916503906, "logps/rejected": -216.97323608398438, "loss": 0.5087, "rewards/accuracies": 0.875, "rewards/chosen": 0.21835654973983765, "rewards/margins": 0.5672281980514526, "rewards/rejected": -0.3488716781139374, "step": 5510 }, { "epoch": 0.8522714092402861, "grad_norm": 5.133790493011475, "learning_rate": 3.9772597090159245e-06, "logits/chosen": 12.784741401672363, "logits/rejected": 9.135361671447754, "logps/chosen": -365.32843017578125, "logps/rejected": -298.96746826171875, "loss": 0.6463, "rewards/accuracies": 0.75, "rewards/chosen": -0.10990868508815765, "rewards/margins": 0.13372841477394104, "rewards/rejected": -0.2436370998620987, "step": 5511 }, { "epoch": 0.8524260583800503, "grad_norm": 4.976015567779541, "learning_rate": 3.976973307366251e-06, "logits/chosen": 8.681876182556152, "logits/rejected": 7.761447429656982, "logps/chosen": -191.5873565673828, "logps/rejected": -204.71768188476562, "loss": 0.6727, "rewards/accuracies": 0.375, "rewards/chosen": 0.3618316054344177, "rewards/margins": 0.12575922906398773, "rewards/rejected": 0.2360723614692688, "step": 5512 }, { "epoch": 0.8525807075198144, "grad_norm": 4.589407920837402, "learning_rate": 3.976686905716578e-06, "logits/chosen": 16.618200302124023, "logits/rejected": 9.696112632751465, "logps/chosen": -324.1329040527344, "logps/rejected": -209.81588745117188, "loss": 0.4338, "rewards/accuracies": 0.875, "rewards/chosen": -0.02694815769791603, "rewards/margins": 0.8358587026596069, "rewards/rejected": -0.8628067970275879, "step": 5513 }, { "epoch": 0.8527353566595786, "grad_norm": 5.786118030548096, "learning_rate": 3.976400504066904e-06, "logits/chosen": 11.021668434143066, "logits/rejected": 10.574874877929688, "logps/chosen": -247.92266845703125, "logps/rejected": -264.9029541015625, "loss": 0.6241, "rewards/accuracies": 0.625, "rewards/chosen": 0.09197451174259186, "rewards/margins": 0.3161090612411499, "rewards/rejected": -0.22413453459739685, "step": 5514 }, { "epoch": 0.8528900057993427, "grad_norm": 5.676928520202637, "learning_rate": 3.97611410241723e-06, "logits/chosen": 12.500770568847656, "logits/rejected": 11.121211051940918, "logps/chosen": -450.8160095214844, "logps/rejected": -376.83343505859375, "loss": 0.5175, "rewards/accuracies": 0.75, "rewards/chosen": 0.6583291292190552, "rewards/margins": 0.5020024180412292, "rewards/rejected": 0.15632666647434235, "step": 5515 }, { "epoch": 0.8530446549391069, "grad_norm": 4.453126907348633, "learning_rate": 3.975827700767557e-06, "logits/chosen": 10.449207305908203, "logits/rejected": 12.167027473449707, "logps/chosen": -183.93203735351562, "logps/rejected": -222.5181884765625, "loss": 0.6508, "rewards/accuracies": 0.625, "rewards/chosen": -0.00059538334608078, "rewards/margins": 0.28539708256721497, "rewards/rejected": -0.28599247336387634, "step": 5516 }, { "epoch": 0.853199304078871, "grad_norm": 9.975476264953613, "learning_rate": 3.9755412991178836e-06, "logits/chosen": 8.739119529724121, "logits/rejected": 9.14430046081543, "logps/chosen": -360.94146728515625, "logps/rejected": -317.3536376953125, "loss": 1.188, "rewards/accuracies": 0.375, "rewards/chosen": -0.4631873369216919, "rewards/margins": -0.5738186240196228, "rewards/rejected": 0.1106313019990921, "step": 5517 }, { "epoch": 0.8533539532186353, "grad_norm": 3.940706253051758, "learning_rate": 3.97525489746821e-06, "logits/chosen": 12.042640686035156, "logits/rejected": 13.598318099975586, "logps/chosen": -179.56272888183594, "logps/rejected": -218.2500762939453, "loss": 0.5224, "rewards/accuracies": 1.0, "rewards/chosen": 0.07911482453346252, "rewards/margins": 0.39709216356277466, "rewards/rejected": -0.31797733902931213, "step": 5518 }, { "epoch": 0.8535086023583994, "grad_norm": 7.980309963226318, "learning_rate": 3.974968495818536e-06, "logits/chosen": 11.663081169128418, "logits/rejected": 10.500115394592285, "logps/chosen": -397.35565185546875, "logps/rejected": -308.91632080078125, "loss": 0.6316, "rewards/accuracies": 0.625, "rewards/chosen": 0.2902355194091797, "rewards/margins": 0.2300301343202591, "rewards/rejected": 0.060205407440662384, "step": 5519 }, { "epoch": 0.8536632514981636, "grad_norm": 4.050093650817871, "learning_rate": 3.974682094168863e-06, "logits/chosen": 11.713811874389648, "logits/rejected": 7.592785835266113, "logps/chosen": -199.0465087890625, "logps/rejected": -182.48623657226562, "loss": 0.5093, "rewards/accuracies": 0.625, "rewards/chosen": -0.004756644368171692, "rewards/margins": 0.5292977094650269, "rewards/rejected": -0.5340542793273926, "step": 5520 }, { "epoch": 0.8538179006379277, "grad_norm": 5.195960521697998, "learning_rate": 3.974395692519189e-06, "logits/chosen": 5.48490571975708, "logits/rejected": 7.278876304626465, "logps/chosen": -100.11968231201172, "logps/rejected": -161.72340393066406, "loss": 0.8406, "rewards/accuracies": 0.375, "rewards/chosen": -0.07327794283628464, "rewards/margins": -0.2435295432806015, "rewards/rejected": 0.17025160789489746, "step": 5521 }, { "epoch": 0.8539725497776919, "grad_norm": 4.676542282104492, "learning_rate": 3.974109290869516e-06, "logits/chosen": 2.9857027530670166, "logits/rejected": 7.063844680786133, "logps/chosen": -223.51419067382812, "logps/rejected": -311.634521484375, "loss": 0.4021, "rewards/accuracies": 1.0, "rewards/chosen": -0.03967433422803879, "rewards/margins": 0.9175422191619873, "rewards/rejected": -0.9572165012359619, "step": 5522 }, { "epoch": 0.854127198917456, "grad_norm": 7.71112060546875, "learning_rate": 3.973822889219842e-06, "logits/chosen": 12.697504043579102, "logits/rejected": 8.772088050842285, "logps/chosen": -165.67213439941406, "logps/rejected": -203.76319885253906, "loss": 0.632, "rewards/accuracies": 0.625, "rewards/chosen": 0.30290815234184265, "rewards/margins": 0.159666508436203, "rewards/rejected": 0.14324164390563965, "step": 5523 }, { "epoch": 0.8542818480572202, "grad_norm": 3.8125832080841064, "learning_rate": 3.9735364875701684e-06, "logits/chosen": 10.960256576538086, "logits/rejected": 3.113831043243408, "logps/chosen": -309.34332275390625, "logps/rejected": -163.04061889648438, "loss": 0.5579, "rewards/accuracies": 0.625, "rewards/chosen": 0.2807078957557678, "rewards/margins": 0.43664461374282837, "rewards/rejected": -0.15593671798706055, "step": 5524 }, { "epoch": 0.8544364971969843, "grad_norm": 4.3831329345703125, "learning_rate": 3.973250085920495e-06, "logits/chosen": 14.845224380493164, "logits/rejected": 5.573022842407227, "logps/chosen": -321.0080871582031, "logps/rejected": -181.04718017578125, "loss": 0.5544, "rewards/accuracies": 0.75, "rewards/chosen": 0.13004538416862488, "rewards/margins": 0.4373641014099121, "rewards/rejected": -0.30731871724128723, "step": 5525 }, { "epoch": 0.8545911463367485, "grad_norm": 4.656836032867432, "learning_rate": 3.972963684270822e-06, "logits/chosen": 7.565646648406982, "logits/rejected": 3.1633729934692383, "logps/chosen": -341.6214599609375, "logps/rejected": -219.31301879882812, "loss": 0.5371, "rewards/accuracies": 0.75, "rewards/chosen": 0.3157382011413574, "rewards/margins": 0.41551926732063293, "rewards/rejected": -0.0997810810804367, "step": 5526 }, { "epoch": 0.8547457954765126, "grad_norm": 5.701992511749268, "learning_rate": 3.972677282621148e-06, "logits/chosen": 5.455742359161377, "logits/rejected": 5.645936965942383, "logps/chosen": -277.1202392578125, "logps/rejected": -262.00091552734375, "loss": 0.5569, "rewards/accuracies": 0.625, "rewards/chosen": -0.06556187570095062, "rewards/margins": 0.34338244795799255, "rewards/rejected": -0.408944308757782, "step": 5527 }, { "epoch": 0.8549004446162768, "grad_norm": 13.386775016784668, "learning_rate": 3.972390880971474e-06, "logits/chosen": 12.9055814743042, "logits/rejected": 10.631471633911133, "logps/chosen": -345.20465087890625, "logps/rejected": -305.3077697753906, "loss": 0.8612, "rewards/accuracies": 0.25, "rewards/chosen": -0.14449329674243927, "rewards/margins": -0.14074286818504333, "rewards/rejected": -0.003750436007976532, "step": 5528 }, { "epoch": 0.8550550937560409, "grad_norm": 5.1428375244140625, "learning_rate": 3.972104479321801e-06, "logits/chosen": 12.146710395812988, "logits/rejected": 8.29996395111084, "logps/chosen": -635.9072875976562, "logps/rejected": -470.0387878417969, "loss": 0.4645, "rewards/accuracies": 0.75, "rewards/chosen": 0.6608940362930298, "rewards/margins": 0.6956630349159241, "rewards/rejected": -0.034769050776958466, "step": 5529 }, { "epoch": 0.8552097428958051, "grad_norm": 5.171927452087402, "learning_rate": 3.9718180776721275e-06, "logits/chosen": 13.20880126953125, "logits/rejected": 8.684325218200684, "logps/chosen": -362.83544921875, "logps/rejected": -350.0487976074219, "loss": 0.6053, "rewards/accuracies": 0.625, "rewards/chosen": 0.11922894418239594, "rewards/margins": 0.4723970293998718, "rewards/rejected": -0.3531681299209595, "step": 5530 }, { "epoch": 0.8553643920355694, "grad_norm": 6.222297191619873, "learning_rate": 3.971531676022454e-06, "logits/chosen": 13.412002563476562, "logits/rejected": 9.023429870605469, "logps/chosen": -277.14251708984375, "logps/rejected": -238.841064453125, "loss": 0.5226, "rewards/accuracies": 0.75, "rewards/chosen": 0.2784581184387207, "rewards/margins": 0.5299692153930664, "rewards/rejected": -0.2515110969543457, "step": 5531 }, { "epoch": 0.8555190411753335, "grad_norm": 4.826961517333984, "learning_rate": 3.971245274372781e-06, "logits/chosen": 10.715517044067383, "logits/rejected": 6.7913498878479, "logps/chosen": -196.56402587890625, "logps/rejected": -125.03688049316406, "loss": 0.5586, "rewards/accuracies": 0.875, "rewards/chosen": 0.05702914670109749, "rewards/margins": 0.40519261360168457, "rewards/rejected": -0.3481634855270386, "step": 5532 }, { "epoch": 0.8556736903150977, "grad_norm": 6.214565753936768, "learning_rate": 3.9709588727231074e-06, "logits/chosen": 12.208765983581543, "logits/rejected": 9.875855445861816, "logps/chosen": -317.4407958984375, "logps/rejected": -296.9223937988281, "loss": 0.8089, "rewards/accuracies": 0.375, "rewards/chosen": -0.08436611294746399, "rewards/margins": -0.17837321758270264, "rewards/rejected": 0.09400712698698044, "step": 5533 }, { "epoch": 0.8558283394548618, "grad_norm": 5.382976531982422, "learning_rate": 3.970672471073433e-06, "logits/chosen": 5.199872970581055, "logits/rejected": 9.805152893066406, "logps/chosen": -213.81668090820312, "logps/rejected": -259.80584716796875, "loss": 0.8172, "rewards/accuracies": 0.5, "rewards/chosen": -0.03748548403382301, "rewards/margins": -0.16475293040275574, "rewards/rejected": 0.12726745009422302, "step": 5534 }, { "epoch": 0.855982988594626, "grad_norm": 5.463367938995361, "learning_rate": 3.97038606942376e-06, "logits/chosen": 12.242330551147461, "logits/rejected": 11.819622993469238, "logps/chosen": -224.96087646484375, "logps/rejected": -259.28826904296875, "loss": 0.7398, "rewards/accuracies": 0.5, "rewards/chosen": 0.03340083360671997, "rewards/margins": 0.008428975939750671, "rewards/rejected": 0.024971872568130493, "step": 5535 }, { "epoch": 0.8561376377343901, "grad_norm": 4.299546241760254, "learning_rate": 3.9700996677740865e-06, "logits/chosen": 15.45047378540039, "logits/rejected": 8.400930404663086, "logps/chosen": -197.11111450195312, "logps/rejected": -145.38729858398438, "loss": 0.5645, "rewards/accuracies": 0.625, "rewards/chosen": -0.3550064265727997, "rewards/margins": 0.3712439239025116, "rewards/rejected": -0.7262503504753113, "step": 5536 }, { "epoch": 0.8562922868741543, "grad_norm": 6.327221393585205, "learning_rate": 3.969813266124413e-06, "logits/chosen": 9.867925643920898, "logits/rejected": 8.902490615844727, "logps/chosen": -389.11212158203125, "logps/rejected": -294.26513671875, "loss": 0.6609, "rewards/accuracies": 0.375, "rewards/chosen": 0.1277143359184265, "rewards/margins": 0.196881502866745, "rewards/rejected": -0.06916715949773788, "step": 5537 }, { "epoch": 0.8564469360139184, "grad_norm": 5.8160834312438965, "learning_rate": 3.96952686447474e-06, "logits/chosen": 15.131919860839844, "logits/rejected": 8.25251579284668, "logps/chosen": -322.2184753417969, "logps/rejected": -238.88633728027344, "loss": 0.6919, "rewards/accuracies": 0.5, "rewards/chosen": 0.12332974374294281, "rewards/margins": 0.16262176632881165, "rewards/rejected": -0.03929205983877182, "step": 5538 }, { "epoch": 0.8566015851536826, "grad_norm": 5.321799278259277, "learning_rate": 3.9692404628250665e-06, "logits/chosen": 8.743563652038574, "logits/rejected": 5.112298488616943, "logps/chosen": -353.24847412109375, "logps/rejected": -263.61798095703125, "loss": 0.7137, "rewards/accuracies": 0.5, "rewards/chosen": 0.19939926266670227, "rewards/margins": 0.09951724112033844, "rewards/rejected": 0.09988205134868622, "step": 5539 }, { "epoch": 0.8567562342934467, "grad_norm": 10.051793098449707, "learning_rate": 3.968954061175393e-06, "logits/chosen": 9.454984664916992, "logits/rejected": 3.158181667327881, "logps/chosen": -438.4866943359375, "logps/rejected": -303.512939453125, "loss": 0.4547, "rewards/accuracies": 0.875, "rewards/chosen": 0.29820510745048523, "rewards/margins": 0.7262248992919922, "rewards/rejected": -0.42801979184150696, "step": 5540 }, { "epoch": 0.8569108834332109, "grad_norm": 17.073593139648438, "learning_rate": 3.968667659525719e-06, "logits/chosen": 8.447195053100586, "logits/rejected": 8.953034400939941, "logps/chosen": -340.5556640625, "logps/rejected": -355.7740478515625, "loss": 0.7351, "rewards/accuracies": 0.5, "rewards/chosen": 0.30880415439605713, "rewards/margins": 0.05526772141456604, "rewards/rejected": 0.2535364031791687, "step": 5541 }, { "epoch": 0.857065532572975, "grad_norm": 4.253938674926758, "learning_rate": 3.968381257876046e-06, "logits/chosen": 6.562461853027344, "logits/rejected": -2.93577241897583, "logps/chosen": -265.87109375, "logps/rejected": -153.58255004882812, "loss": 0.5355, "rewards/accuracies": 0.875, "rewards/chosen": -0.09016941487789154, "rewards/margins": 0.4087334871292114, "rewards/rejected": -0.49890291690826416, "step": 5542 }, { "epoch": 0.8572201817127392, "grad_norm": 6.389232158660889, "learning_rate": 3.968094856226372e-06, "logits/chosen": 7.3160247802734375, "logits/rejected": 6.972973823547363, "logps/chosen": -222.66339111328125, "logps/rejected": -282.0250244140625, "loss": 0.6935, "rewards/accuracies": 0.75, "rewards/chosen": -0.0887586697936058, "rewards/margins": 0.025575678795576096, "rewards/rejected": -0.1143343448638916, "step": 5543 }, { "epoch": 0.8573748308525034, "grad_norm": 4.663178443908691, "learning_rate": 3.967808454576699e-06, "logits/chosen": 11.413351058959961, "logits/rejected": 7.847943305969238, "logps/chosen": -230.12515258789062, "logps/rejected": -210.52798461914062, "loss": 0.5126, "rewards/accuracies": 0.75, "rewards/chosen": 0.31653425097465515, "rewards/margins": 0.5554164052009583, "rewards/rejected": -0.2388821840286255, "step": 5544 }, { "epoch": 0.8575294799922676, "grad_norm": 7.371320724487305, "learning_rate": 3.9675220529270255e-06, "logits/chosen": 9.153878211975098, "logits/rejected": 4.771054267883301, "logps/chosen": -431.10205078125, "logps/rejected": -367.1356201171875, "loss": 0.6264, "rewards/accuracies": 0.5, "rewards/chosen": 0.6836245059967041, "rewards/margins": 0.19900822639465332, "rewards/rejected": 0.48461630940437317, "step": 5545 }, { "epoch": 0.8576841291320317, "grad_norm": 8.207365036010742, "learning_rate": 3.967235651277352e-06, "logits/chosen": 7.745014667510986, "logits/rejected": 5.892283916473389, "logps/chosen": -226.06863403320312, "logps/rejected": -262.7561340332031, "loss": 0.9527, "rewards/accuracies": 0.375, "rewards/chosen": -0.24970795214176178, "rewards/margins": -0.36114442348480225, "rewards/rejected": 0.11143648624420166, "step": 5546 }, { "epoch": 0.8578387782717959, "grad_norm": 6.068310737609863, "learning_rate": 3.966949249627678e-06, "logits/chosen": 2.7964437007904053, "logits/rejected": 4.375521183013916, "logps/chosen": -208.13751220703125, "logps/rejected": -257.7275695800781, "loss": 0.8231, "rewards/accuracies": 0.625, "rewards/chosen": 0.14024151861667633, "rewards/margins": 0.09700223803520203, "rewards/rejected": 0.043239206075668335, "step": 5547 }, { "epoch": 0.85799342741156, "grad_norm": 5.413476467132568, "learning_rate": 3.966662847978005e-06, "logits/chosen": 9.429909706115723, "logits/rejected": 11.490013122558594, "logps/chosen": -258.3641052246094, "logps/rejected": -279.36212158203125, "loss": 0.6794, "rewards/accuracies": 0.625, "rewards/chosen": 0.25293999910354614, "rewards/margins": 0.045966241508722305, "rewards/rejected": 0.20697373151779175, "step": 5548 }, { "epoch": 0.8581480765513242, "grad_norm": 4.993340492248535, "learning_rate": 3.966376446328331e-06, "logits/chosen": 12.423839569091797, "logits/rejected": 12.750974655151367, "logps/chosen": -280.7548828125, "logps/rejected": -346.6853332519531, "loss": 0.4456, "rewards/accuracies": 1.0, "rewards/chosen": 0.35837188363075256, "rewards/margins": 0.6557400226593018, "rewards/rejected": -0.2973681390285492, "step": 5549 }, { "epoch": 0.8583027256910883, "grad_norm": 5.79370641708374, "learning_rate": 3.966090044678658e-06, "logits/chosen": 9.242609024047852, "logits/rejected": 10.357810974121094, "logps/chosen": -323.60565185546875, "logps/rejected": -347.3231506347656, "loss": 0.637, "rewards/accuracies": 0.875, "rewards/chosen": 0.15418893098831177, "rewards/margins": 0.4306488335132599, "rewards/rejected": -0.2764599919319153, "step": 5550 }, { "epoch": 0.8584573748308525, "grad_norm": 7.5426201820373535, "learning_rate": 3.965803643028985e-06, "logits/chosen": 9.522414207458496, "logits/rejected": 12.636218070983887, "logps/chosen": -266.0033264160156, "logps/rejected": -283.789794921875, "loss": 0.8981, "rewards/accuracies": 0.375, "rewards/chosen": -0.19298070669174194, "rewards/margins": -0.1364603340625763, "rewards/rejected": -0.05652038753032684, "step": 5551 }, { "epoch": 0.8586120239706166, "grad_norm": 4.220952033996582, "learning_rate": 3.96551724137931e-06, "logits/chosen": 8.761380195617676, "logits/rejected": 3.7925281524658203, "logps/chosen": -250.84353637695312, "logps/rejected": -210.23773193359375, "loss": 0.4722, "rewards/accuracies": 0.875, "rewards/chosen": 0.050732336938381195, "rewards/margins": 0.6292870044708252, "rewards/rejected": -0.5785546898841858, "step": 5552 }, { "epoch": 0.8587666731103808, "grad_norm": 5.072403430938721, "learning_rate": 3.965230839729637e-06, "logits/chosen": 8.99654483795166, "logits/rejected": 14.076211929321289, "logps/chosen": -164.7877655029297, "logps/rejected": -205.15432739257812, "loss": 0.6751, "rewards/accuracies": 0.5, "rewards/chosen": -0.23361565172672272, "rewards/margins": 0.12223657965660095, "rewards/rejected": -0.3558522164821625, "step": 5553 }, { "epoch": 0.858921322250145, "grad_norm": 4.6037774085998535, "learning_rate": 3.964944438079964e-06, "logits/chosen": 12.717657089233398, "logits/rejected": 10.634328842163086, "logps/chosen": -290.24505615234375, "logps/rejected": -316.93035888671875, "loss": 0.5442, "rewards/accuracies": 0.875, "rewards/chosen": 0.1433984786272049, "rewards/margins": 0.42021113634109497, "rewards/rejected": -0.2768126428127289, "step": 5554 }, { "epoch": 0.8590759713899091, "grad_norm": 6.171657562255859, "learning_rate": 3.96465803643029e-06, "logits/chosen": 6.50289249420166, "logits/rejected": 8.421377182006836, "logps/chosen": -162.61032104492188, "logps/rejected": -209.62774658203125, "loss": 0.7573, "rewards/accuracies": 0.375, "rewards/chosen": 0.1106221154332161, "rewards/margins": -0.07369647175073624, "rewards/rejected": 0.18431857228279114, "step": 5555 }, { "epoch": 0.8592306205296733, "grad_norm": 7.367908000946045, "learning_rate": 3.964371634780617e-06, "logits/chosen": 12.182393074035645, "logits/rejected": 5.837953567504883, "logps/chosen": -336.4083251953125, "logps/rejected": -213.71844482421875, "loss": 0.684, "rewards/accuracies": 0.5, "rewards/chosen": 0.1285104751586914, "rewards/margins": 0.3372896611690521, "rewards/rejected": -0.20877915620803833, "step": 5556 }, { "epoch": 0.8593852696694375, "grad_norm": 5.022037982940674, "learning_rate": 3.964085233130943e-06, "logits/chosen": 8.136197090148926, "logits/rejected": 7.256975173950195, "logps/chosen": -219.56837463378906, "logps/rejected": -241.2669219970703, "loss": 0.5404, "rewards/accuracies": 0.75, "rewards/chosen": 0.22837886214256287, "rewards/margins": 0.4187639057636261, "rewards/rejected": -0.19038507342338562, "step": 5557 }, { "epoch": 0.8595399188092017, "grad_norm": 6.981671333312988, "learning_rate": 3.9637988314812695e-06, "logits/chosen": 11.872611999511719, "logits/rejected": 12.234930038452148, "logps/chosen": -341.8452453613281, "logps/rejected": -340.52899169921875, "loss": 0.9542, "rewards/accuracies": 0.125, "rewards/chosen": -0.12858542799949646, "rewards/margins": -0.380593478679657, "rewards/rejected": 0.2520080506801605, "step": 5558 }, { "epoch": 0.8596945679489658, "grad_norm": 5.184417724609375, "learning_rate": 3.963512429831596e-06, "logits/chosen": 6.285898208618164, "logits/rejected": 6.663991928100586, "logps/chosen": -205.7534637451172, "logps/rejected": -235.4987030029297, "loss": 0.7474, "rewards/accuracies": 0.5, "rewards/chosen": 0.21914659440517426, "rewards/margins": 0.003766484558582306, "rewards/rejected": 0.21538008749485016, "step": 5559 }, { "epoch": 0.85984921708873, "grad_norm": 5.698225498199463, "learning_rate": 3.963226028181923e-06, "logits/chosen": 11.049238204956055, "logits/rejected": 9.75180435180664, "logps/chosen": -255.01080322265625, "logps/rejected": -263.9441223144531, "loss": 0.7398, "rewards/accuracies": 0.5, "rewards/chosen": 0.17442665994167328, "rewards/margins": 0.054497167468070984, "rewards/rejected": 0.11992951482534409, "step": 5560 }, { "epoch": 0.8600038662284941, "grad_norm": 7.597950458526611, "learning_rate": 3.9629396265322486e-06, "logits/chosen": 6.745762825012207, "logits/rejected": 5.756608963012695, "logps/chosen": -251.48046875, "logps/rejected": -241.15567016601562, "loss": 0.8848, "rewards/accuracies": 0.5, "rewards/chosen": -0.17406368255615234, "rewards/margins": 0.1296660602092743, "rewards/rejected": -0.30372974276542664, "step": 5561 }, { "epoch": 0.8601585153682583, "grad_norm": 3.4851796627044678, "learning_rate": 3.962653224882575e-06, "logits/chosen": 11.194986343383789, "logits/rejected": 8.162862777709961, "logps/chosen": -114.60517120361328, "logps/rejected": -111.16508483886719, "loss": 0.576, "rewards/accuracies": 0.875, "rewards/chosen": -0.027560904622077942, "rewards/margins": 0.317198246717453, "rewards/rejected": -0.34475910663604736, "step": 5562 }, { "epoch": 0.8603131645080224, "grad_norm": 7.40164852142334, "learning_rate": 3.962366823232902e-06, "logits/chosen": 12.411735534667969, "logits/rejected": 8.850687026977539, "logps/chosen": -337.29547119140625, "logps/rejected": -277.8260498046875, "loss": 0.6801, "rewards/accuracies": 0.5, "rewards/chosen": 0.36499306559562683, "rewards/margins": 0.21913452446460724, "rewards/rejected": 0.14585858583450317, "step": 5563 }, { "epoch": 0.8604678136477866, "grad_norm": 6.116249084472656, "learning_rate": 3.9620804215832285e-06, "logits/chosen": 12.2783203125, "logits/rejected": 10.857599258422852, "logps/chosen": -281.6092224121094, "logps/rejected": -212.8323974609375, "loss": 0.6946, "rewards/accuracies": 0.5, "rewards/chosen": -0.11243562400341034, "rewards/margins": 0.053007036447525024, "rewards/rejected": -0.16544266045093536, "step": 5564 }, { "epoch": 0.8606224627875507, "grad_norm": 5.1273603439331055, "learning_rate": 3.961794019933555e-06, "logits/chosen": 8.237610816955566, "logits/rejected": 7.821701526641846, "logps/chosen": -234.32177734375, "logps/rejected": -317.19769287109375, "loss": 0.5606, "rewards/accuracies": 0.75, "rewards/chosen": 0.16736702620983124, "rewards/margins": 0.33171504735946655, "rewards/rejected": -0.1643480360507965, "step": 5565 }, { "epoch": 0.8607771119273149, "grad_norm": 4.789910316467285, "learning_rate": 3.961507618283882e-06, "logits/chosen": 11.612488746643066, "logits/rejected": 10.5182523727417, "logps/chosen": -265.38177490234375, "logps/rejected": -242.57489013671875, "loss": 0.6635, "rewards/accuracies": 0.5, "rewards/chosen": 0.013011261820793152, "rewards/margins": 0.13539275527000427, "rewards/rejected": -0.12238150089979172, "step": 5566 }, { "epoch": 0.860931761067079, "grad_norm": 5.014251708984375, "learning_rate": 3.961221216634208e-06, "logits/chosen": 16.432933807373047, "logits/rejected": 13.618185043334961, "logps/chosen": -384.0850524902344, "logps/rejected": -331.5423583984375, "loss": 0.5402, "rewards/accuracies": 0.75, "rewards/chosen": 0.3585062026977539, "rewards/margins": 0.6304289102554321, "rewards/rejected": -0.27192267775535583, "step": 5567 }, { "epoch": 0.8610864102068432, "grad_norm": 6.250051975250244, "learning_rate": 3.960934814984534e-06, "logits/chosen": 12.310720443725586, "logits/rejected": 13.226346015930176, "logps/chosen": -320.4024658203125, "logps/rejected": -314.7881774902344, "loss": 0.8323, "rewards/accuracies": 0.25, "rewards/chosen": -0.0634918212890625, "rewards/margins": -0.19194220006465912, "rewards/rejected": 0.12845037877559662, "step": 5568 }, { "epoch": 0.8612410593466073, "grad_norm": 5.12235164642334, "learning_rate": 3.960648413334861e-06, "logits/chosen": 6.9342122077941895, "logits/rejected": 7.804890155792236, "logps/chosen": -284.4322814941406, "logps/rejected": -261.19305419921875, "loss": 0.6083, "rewards/accuracies": 0.5, "rewards/chosen": -0.039760202169418335, "rewards/margins": 0.21222585439682007, "rewards/rejected": -0.2519860863685608, "step": 5569 }, { "epoch": 0.8613957084863716, "grad_norm": 22.274707794189453, "learning_rate": 3.9603620116851876e-06, "logits/chosen": 10.584257125854492, "logits/rejected": 13.976146697998047, "logps/chosen": -151.39248657226562, "logps/rejected": -187.773681640625, "loss": 0.785, "rewards/accuracies": 0.5, "rewards/chosen": -0.3866022527217865, "rewards/margins": -0.1313094198703766, "rewards/rejected": -0.2552928328514099, "step": 5570 }, { "epoch": 0.8615503576261357, "grad_norm": 6.046761512756348, "learning_rate": 3.960075610035514e-06, "logits/chosen": 13.19694709777832, "logits/rejected": 9.149131774902344, "logps/chosen": -230.0457305908203, "logps/rejected": -177.5976104736328, "loss": 0.781, "rewards/accuracies": 0.375, "rewards/chosen": -0.19006797671318054, "rewards/margins": -0.06863032281398773, "rewards/rejected": -0.12143763899803162, "step": 5571 }, { "epoch": 0.8617050067658999, "grad_norm": 5.000414848327637, "learning_rate": 3.959789208385841e-06, "logits/chosen": 11.03077507019043, "logits/rejected": 13.187887191772461, "logps/chosen": -260.22027587890625, "logps/rejected": -252.41555786132812, "loss": 0.7057, "rewards/accuracies": 0.625, "rewards/chosen": -0.16556119918823242, "rewards/margins": 0.00532994419336319, "rewards/rejected": -0.1708911508321762, "step": 5572 }, { "epoch": 0.861859655905664, "grad_norm": 9.176785469055176, "learning_rate": 3.9595028067361675e-06, "logits/chosen": 8.673894882202148, "logits/rejected": 8.565500259399414, "logps/chosen": -279.72021484375, "logps/rejected": -264.1474609375, "loss": 1.0208, "rewards/accuracies": 0.625, "rewards/chosen": -0.4105193614959717, "rewards/margins": -0.3400534391403198, "rewards/rejected": -0.07046595215797424, "step": 5573 }, { "epoch": 0.8620143050454282, "grad_norm": 6.136605739593506, "learning_rate": 3.959216405086493e-06, "logits/chosen": 11.717804908752441, "logits/rejected": 11.446734428405762, "logps/chosen": -269.6417236328125, "logps/rejected": -265.83331298828125, "loss": 0.7581, "rewards/accuracies": 0.375, "rewards/chosen": 0.019072622060775757, "rewards/margins": -0.07733035087585449, "rewards/rejected": 0.09640297293663025, "step": 5574 }, { "epoch": 0.8621689541851923, "grad_norm": 5.139334678649902, "learning_rate": 3.95893000343682e-06, "logits/chosen": 14.313995361328125, "logits/rejected": 11.819242477416992, "logps/chosen": -344.8808898925781, "logps/rejected": -280.67645263671875, "loss": 0.6363, "rewards/accuracies": 0.5, "rewards/chosen": 0.488066703081131, "rewards/margins": 0.472256064414978, "rewards/rejected": 0.015810586512088776, "step": 5575 }, { "epoch": 0.8623236033249565, "grad_norm": 6.363095283508301, "learning_rate": 3.958643601787147e-06, "logits/chosen": 13.745997428894043, "logits/rejected": 3.4037387371063232, "logps/chosen": -333.07745361328125, "logps/rejected": -218.85836791992188, "loss": 0.4101, "rewards/accuracies": 0.875, "rewards/chosen": 0.12555581331253052, "rewards/margins": 0.7948639392852783, "rewards/rejected": -0.6693081259727478, "step": 5576 }, { "epoch": 0.8624782524647207, "grad_norm": 6.558548927307129, "learning_rate": 3.958357200137473e-06, "logits/chosen": 1.0127133131027222, "logits/rejected": 5.681978225708008, "logps/chosen": -249.9398651123047, "logps/rejected": -289.20001220703125, "loss": 0.6633, "rewards/accuracies": 0.625, "rewards/chosen": 0.16100487112998962, "rewards/margins": 0.11872687190771103, "rewards/rejected": 0.042277999222278595, "step": 5577 }, { "epoch": 0.8626329016044848, "grad_norm": 4.4048871994018555, "learning_rate": 3.9580707984878e-06, "logits/chosen": 10.924642562866211, "logits/rejected": 4.229212760925293, "logps/chosen": -335.284912109375, "logps/rejected": -203.130126953125, "loss": 0.655, "rewards/accuracies": 0.625, "rewards/chosen": -0.08222848176956177, "rewards/margins": 0.22194786369800568, "rewards/rejected": -0.30417633056640625, "step": 5578 }, { "epoch": 0.862787550744249, "grad_norm": 5.373351573944092, "learning_rate": 3.957784396838127e-06, "logits/chosen": 6.795207977294922, "logits/rejected": 12.02531623840332, "logps/chosen": -209.53326416015625, "logps/rejected": -254.7356414794922, "loss": 0.7737, "rewards/accuracies": 0.25, "rewards/chosen": -0.04380173981189728, "rewards/margins": -0.11935292929410934, "rewards/rejected": 0.07555118203163147, "step": 5579 }, { "epoch": 0.8629421998840131, "grad_norm": 6.247703552246094, "learning_rate": 3.957497995188452e-06, "logits/chosen": 7.563823699951172, "logits/rejected": 5.765118598937988, "logps/chosen": -443.6858825683594, "logps/rejected": -397.2836608886719, "loss": 0.5705, "rewards/accuracies": 0.875, "rewards/chosen": 0.12699198722839355, "rewards/margins": 0.49848002195358276, "rewards/rejected": -0.3714880347251892, "step": 5580 }, { "epoch": 0.8630968490237773, "grad_norm": 8.227096557617188, "learning_rate": 3.957211593538779e-06, "logits/chosen": 9.86913013458252, "logits/rejected": 7.2057623863220215, "logps/chosen": -324.13116455078125, "logps/rejected": -367.2266845703125, "loss": 0.8363, "rewards/accuracies": 0.5, "rewards/chosen": -0.022597193717956543, "rewards/margins": 0.06384515762329102, "rewards/rejected": -0.08644236624240875, "step": 5581 }, { "epoch": 0.8632514981635415, "grad_norm": 4.9398393630981445, "learning_rate": 3.956925191889106e-06, "logits/chosen": 3.505120038986206, "logits/rejected": 8.423637390136719, "logps/chosen": -156.98138427734375, "logps/rejected": -226.23301696777344, "loss": 0.6654, "rewards/accuracies": 0.5, "rewards/chosen": -0.10237536579370499, "rewards/margins": 0.09464340656995773, "rewards/rejected": -0.1970188021659851, "step": 5582 }, { "epoch": 0.8634061473033057, "grad_norm": 11.51438045501709, "learning_rate": 3.956638790239432e-06, "logits/chosen": 9.482044219970703, "logits/rejected": 14.614742279052734, "logps/chosen": -244.4484100341797, "logps/rejected": -456.899658203125, "loss": 0.6966, "rewards/accuracies": 0.5, "rewards/chosen": -0.08227504789829254, "rewards/margins": 0.058049432933330536, "rewards/rejected": -0.14032451808452606, "step": 5583 }, { "epoch": 0.8635607964430698, "grad_norm": 5.607166767120361, "learning_rate": 3.956352388589759e-06, "logits/chosen": 10.76272964477539, "logits/rejected": 1.8757203817367554, "logps/chosen": -357.39892578125, "logps/rejected": -284.69171142578125, "loss": 0.4959, "rewards/accuracies": 0.75, "rewards/chosen": 0.13323983550071716, "rewards/margins": 0.5907565355300903, "rewards/rejected": -0.45751672983169556, "step": 5584 }, { "epoch": 0.863715445582834, "grad_norm": 7.2526984214782715, "learning_rate": 3.956065986940086e-06, "logits/chosen": 11.311847686767578, "logits/rejected": 12.392753601074219, "logps/chosen": -331.0661315917969, "logps/rejected": -373.85162353515625, "loss": 0.8958, "rewards/accuracies": 0.5, "rewards/chosen": 0.025860585272312164, "rewards/margins": -0.1390220820903778, "rewards/rejected": 0.16488268971443176, "step": 5585 }, { "epoch": 0.8638700947225981, "grad_norm": 3.3673830032348633, "learning_rate": 3.9557795852904114e-06, "logits/chosen": 7.557724952697754, "logits/rejected": 4.005731582641602, "logps/chosen": -113.14189147949219, "logps/rejected": -146.0245819091797, "loss": 0.5504, "rewards/accuracies": 0.625, "rewards/chosen": -0.28499364852905273, "rewards/margins": 0.4255397319793701, "rewards/rejected": -0.7105333805084229, "step": 5586 }, { "epoch": 0.8640247438623623, "grad_norm": 7.194338321685791, "learning_rate": 3.955493183640738e-06, "logits/chosen": 9.403700828552246, "logits/rejected": 7.309061527252197, "logps/chosen": -275.8131103515625, "logps/rejected": -313.4444885253906, "loss": 0.683, "rewards/accuracies": 0.375, "rewards/chosen": 0.42857396602630615, "rewards/margins": 0.0807749330997467, "rewards/rejected": 0.34779900312423706, "step": 5587 }, { "epoch": 0.8641793930021264, "grad_norm": 3.8534152507781982, "learning_rate": 3.955206781991065e-06, "logits/chosen": 17.8963565826416, "logits/rejected": 8.14344596862793, "logps/chosen": -427.930908203125, "logps/rejected": -356.53076171875, "loss": 0.435, "rewards/accuracies": 0.875, "rewards/chosen": 0.5800073742866516, "rewards/margins": 0.8146600723266602, "rewards/rejected": -0.23465272784233093, "step": 5588 }, { "epoch": 0.8643340421418906, "grad_norm": 6.4218525886535645, "learning_rate": 3.954920380341391e-06, "logits/chosen": 6.947747707366943, "logits/rejected": 6.141756534576416, "logps/chosen": -360.1333923339844, "logps/rejected": -388.59979248046875, "loss": 0.6759, "rewards/accuracies": 0.625, "rewards/chosen": 0.24942654371261597, "rewards/margins": 0.14833489060401917, "rewards/rejected": 0.10109168291091919, "step": 5589 }, { "epoch": 0.8644886912816547, "grad_norm": 4.075565338134766, "learning_rate": 3.954633978691717e-06, "logits/chosen": 12.835847854614258, "logits/rejected": 5.379994869232178, "logps/chosen": -265.06915283203125, "logps/rejected": -136.9203338623047, "loss": 0.5726, "rewards/accuracies": 0.625, "rewards/chosen": -0.05200263857841492, "rewards/margins": 0.5894217491149902, "rewards/rejected": -0.6414244174957275, "step": 5590 }, { "epoch": 0.8646433404214189, "grad_norm": 4.9312920570373535, "learning_rate": 3.954347577042044e-06, "logits/chosen": 7.500208854675293, "logits/rejected": 8.466938972473145, "logps/chosen": -114.65245056152344, "logps/rejected": -227.3726806640625, "loss": 0.5527, "rewards/accuracies": 0.625, "rewards/chosen": -0.3188348114490509, "rewards/margins": 0.5198426842689514, "rewards/rejected": -0.8386775255203247, "step": 5591 }, { "epoch": 0.864797989561183, "grad_norm": 6.288552284240723, "learning_rate": 3.9540611753923705e-06, "logits/chosen": 12.835363388061523, "logits/rejected": 5.822674751281738, "logps/chosen": -318.30364990234375, "logps/rejected": -362.590576171875, "loss": 0.6417, "rewards/accuracies": 0.625, "rewards/chosen": -0.07603712379932404, "rewards/margins": 0.2552597224712372, "rewards/rejected": -0.3312968611717224, "step": 5592 }, { "epoch": 0.8649526387009472, "grad_norm": 7.344549179077148, "learning_rate": 3.953774773742697e-06, "logits/chosen": 16.82768440246582, "logits/rejected": 6.916017532348633, "logps/chosen": -210.23316955566406, "logps/rejected": -137.35305786132812, "loss": 0.8574, "rewards/accuracies": 0.375, "rewards/chosen": -0.8811869025230408, "rewards/margins": -0.18916422128677368, "rewards/rejected": -0.6920227408409119, "step": 5593 }, { "epoch": 0.8651072878407113, "grad_norm": 5.030068397521973, "learning_rate": 3.953488372093024e-06, "logits/chosen": 8.664188385009766, "logits/rejected": 10.605673789978027, "logps/chosen": -349.632080078125, "logps/rejected": -383.5105285644531, "loss": 0.5603, "rewards/accuracies": 0.75, "rewards/chosen": 0.24142371118068695, "rewards/margins": 0.6926233172416687, "rewards/rejected": -0.45119965076446533, "step": 5594 }, { "epoch": 0.8652619369804756, "grad_norm": 5.0279669761657715, "learning_rate": 3.95320197044335e-06, "logits/chosen": 10.762035369873047, "logits/rejected": 6.500424861907959, "logps/chosen": -365.1126403808594, "logps/rejected": -279.74798583984375, "loss": 0.5475, "rewards/accuracies": 0.75, "rewards/chosen": 0.6005088090896606, "rewards/margins": 0.4703788757324219, "rewards/rejected": 0.13012991845607758, "step": 5595 }, { "epoch": 0.8654165861202398, "grad_norm": 5.001711368560791, "learning_rate": 3.952915568793676e-06, "logits/chosen": 9.891135215759277, "logits/rejected": 4.358508586883545, "logps/chosen": -412.11480712890625, "logps/rejected": -239.86184692382812, "loss": 0.5274, "rewards/accuracies": 0.875, "rewards/chosen": 0.2786291241645813, "rewards/margins": 0.6756082773208618, "rewards/rejected": -0.3969791531562805, "step": 5596 }, { "epoch": 0.8655712352600039, "grad_norm": 5.172476291656494, "learning_rate": 3.952629167144003e-06, "logits/chosen": 9.6688232421875, "logits/rejected": 9.194499969482422, "logps/chosen": -373.3966369628906, "logps/rejected": -409.0218200683594, "loss": 0.5085, "rewards/accuracies": 0.75, "rewards/chosen": 0.4827442765235901, "rewards/margins": 0.6946344375610352, "rewards/rejected": -0.21189017593860626, "step": 5597 }, { "epoch": 0.865725884399768, "grad_norm": 4.925397872924805, "learning_rate": 3.9523427654943296e-06, "logits/chosen": 12.273463249206543, "logits/rejected": 8.57210922241211, "logps/chosen": -258.656982421875, "logps/rejected": -196.41998291015625, "loss": 0.6451, "rewards/accuracies": 0.75, "rewards/chosen": 0.2659473419189453, "rewards/margins": 0.1428723931312561, "rewards/rejected": 0.12307494878768921, "step": 5598 }, { "epoch": 0.8658805335395322, "grad_norm": 42.34637451171875, "learning_rate": 3.952056363844656e-06, "logits/chosen": 8.770404815673828, "logits/rejected": 4.005621910095215, "logps/chosen": -176.989990234375, "logps/rejected": -140.04248046875, "loss": 0.5476, "rewards/accuracies": 0.625, "rewards/chosen": 0.2061765044927597, "rewards/margins": 0.5282948017120361, "rewards/rejected": -0.32211834192276, "step": 5599 }, { "epoch": 0.8660351826792964, "grad_norm": 4.553174018859863, "learning_rate": 3.951769962194982e-06, "logits/chosen": 5.2121663093566895, "logits/rejected": 7.479856967926025, "logps/chosen": -164.1041259765625, "logps/rejected": -164.49722290039062, "loss": 0.5913, "rewards/accuracies": 0.75, "rewards/chosen": 0.08756618201732635, "rewards/margins": 0.3333725929260254, "rewards/rejected": -0.24580645561218262, "step": 5600 }, { "epoch": 0.8661898318190605, "grad_norm": 4.731479644775391, "learning_rate": 3.951483560545309e-06, "logits/chosen": 15.040358543395996, "logits/rejected": 14.302785873413086, "logps/chosen": -190.0380401611328, "logps/rejected": -220.01815795898438, "loss": 0.5428, "rewards/accuracies": 0.875, "rewards/chosen": -0.10195541381835938, "rewards/margins": 0.42290830612182617, "rewards/rejected": -0.5248637199401855, "step": 5601 }, { "epoch": 0.8663444809588247, "grad_norm": 5.872387886047363, "learning_rate": 3.951197158895635e-06, "logits/chosen": 6.359010696411133, "logits/rejected": 11.898884773254395, "logps/chosen": -141.8868408203125, "logps/rejected": -228.91729736328125, "loss": 0.694, "rewards/accuracies": 0.625, "rewards/chosen": -0.050789739936590195, "rewards/margins": 0.01831177994608879, "rewards/rejected": -0.06910151988267899, "step": 5602 }, { "epoch": 0.8664991300985888, "grad_norm": 6.231602668762207, "learning_rate": 3.950910757245962e-06, "logits/chosen": 7.6184306144714355, "logits/rejected": 8.309837341308594, "logps/chosen": -298.68377685546875, "logps/rejected": -300.8892517089844, "loss": 0.6997, "rewards/accuracies": 0.375, "rewards/chosen": 0.20585650205612183, "rewards/margins": 0.12166690826416016, "rewards/rejected": 0.08418960869312286, "step": 5603 }, { "epoch": 0.866653779238353, "grad_norm": 5.718890190124512, "learning_rate": 3.950624355596289e-06, "logits/chosen": 1.0394971370697021, "logits/rejected": 13.917007446289062, "logps/chosen": -162.19444274902344, "logps/rejected": -358.68890380859375, "loss": 0.7912, "rewards/accuracies": 0.5, "rewards/chosen": -0.1972808688879013, "rewards/margins": -0.05365484952926636, "rewards/rejected": -0.14362603425979614, "step": 5604 }, { "epoch": 0.8668084283781171, "grad_norm": 5.780733108520508, "learning_rate": 3.950337953946615e-06, "logits/chosen": 11.858189582824707, "logits/rejected": 9.82012939453125, "logps/chosen": -419.78668212890625, "logps/rejected": -427.4334716796875, "loss": 0.5878, "rewards/accuracies": 0.625, "rewards/chosen": 0.4508832097053528, "rewards/margins": 0.28657639026641846, "rewards/rejected": 0.16430678963661194, "step": 5605 }, { "epoch": 0.8669630775178813, "grad_norm": 6.165585041046143, "learning_rate": 3.950051552296942e-06, "logits/chosen": 7.159930229187012, "logits/rejected": 4.129848480224609, "logps/chosen": -241.71958923339844, "logps/rejected": -197.82447814941406, "loss": 0.6946, "rewards/accuracies": 0.75, "rewards/chosen": 0.11882977187633514, "rewards/margins": 0.1422731578350067, "rewards/rejected": -0.023443374782800674, "step": 5606 }, { "epoch": 0.8671177266576454, "grad_norm": 5.085695743560791, "learning_rate": 3.949765150647268e-06, "logits/chosen": 5.326354503631592, "logits/rejected": 8.356669425964355, "logps/chosen": -175.46498107910156, "logps/rejected": -190.8360595703125, "loss": 0.58, "rewards/accuracies": 0.75, "rewards/chosen": 0.08924822509288788, "rewards/margins": 0.3447166681289673, "rewards/rejected": -0.2554684281349182, "step": 5607 }, { "epoch": 0.8672723757974097, "grad_norm": 6.473757266998291, "learning_rate": 3.949478748997594e-06, "logits/chosen": 11.082825660705566, "logits/rejected": 7.568131446838379, "logps/chosen": -333.39996337890625, "logps/rejected": -304.8360595703125, "loss": 0.5413, "rewards/accuracies": 0.75, "rewards/chosen": 0.3528112769126892, "rewards/margins": 0.3843561112880707, "rewards/rejected": -0.03154488652944565, "step": 5608 }, { "epoch": 0.8674270249371738, "grad_norm": 6.504826545715332, "learning_rate": 3.949192347347921e-06, "logits/chosen": 13.074699401855469, "logits/rejected": 12.940622329711914, "logps/chosen": -283.00299072265625, "logps/rejected": -252.4912567138672, "loss": 0.6719, "rewards/accuracies": 0.625, "rewards/chosen": 0.13514652848243713, "rewards/margins": 0.18675857782363892, "rewards/rejected": -0.05161203444004059, "step": 5609 }, { "epoch": 0.867581674076938, "grad_norm": 4.671186923980713, "learning_rate": 3.948905945698248e-06, "logits/chosen": 13.153820037841797, "logits/rejected": 9.608548164367676, "logps/chosen": -301.005126953125, "logps/rejected": -267.2201232910156, "loss": 0.5344, "rewards/accuracies": 0.75, "rewards/chosen": 0.4216276705265045, "rewards/margins": 0.5195762515068054, "rewards/rejected": -0.09794855862855911, "step": 5610 }, { "epoch": 0.8677363232167021, "grad_norm": 4.416604042053223, "learning_rate": 3.948619544048574e-06, "logits/chosen": 9.490065574645996, "logits/rejected": 10.032252311706543, "logps/chosen": -159.22503662109375, "logps/rejected": -164.47659301757812, "loss": 0.7279, "rewards/accuracies": 0.625, "rewards/chosen": 0.08413050323724747, "rewards/margins": -0.009891770780086517, "rewards/rejected": 0.09402228146791458, "step": 5611 }, { "epoch": 0.8678909723564663, "grad_norm": 7.097733020782471, "learning_rate": 3.948333142398901e-06, "logits/chosen": 7.061870098114014, "logits/rejected": 3.3953373432159424, "logps/chosen": -273.488037109375, "logps/rejected": -203.53004455566406, "loss": 0.6395, "rewards/accuracies": 0.5, "rewards/chosen": -0.10962973535060883, "rewards/margins": 0.29717832803726196, "rewards/rejected": -0.406808078289032, "step": 5612 }, { "epoch": 0.8680456214962304, "grad_norm": 5.786071300506592, "learning_rate": 3.948046740749227e-06, "logits/chosen": 8.510398864746094, "logits/rejected": 7.187309265136719, "logps/chosen": -231.5532989501953, "logps/rejected": -181.15170288085938, "loss": 0.7664, "rewards/accuracies": 0.375, "rewards/chosen": 0.16049346327781677, "rewards/margins": -0.06298433244228363, "rewards/rejected": 0.2234778106212616, "step": 5613 }, { "epoch": 0.8682002706359946, "grad_norm": 5.439120292663574, "learning_rate": 3.947760339099553e-06, "logits/chosen": 8.747971534729004, "logits/rejected": 3.8913488388061523, "logps/chosen": -351.14007568359375, "logps/rejected": -262.868896484375, "loss": 0.644, "rewards/accuracies": 0.5, "rewards/chosen": -0.00694364495575428, "rewards/margins": 0.29663002490997314, "rewards/rejected": -0.3035736680030823, "step": 5614 }, { "epoch": 0.8683549197757587, "grad_norm": 3.9751415252685547, "learning_rate": 3.94747393744988e-06, "logits/chosen": 13.17003345489502, "logits/rejected": 12.629056930541992, "logps/chosen": -141.6834716796875, "logps/rejected": -136.32272338867188, "loss": 0.5992, "rewards/accuracies": 0.625, "rewards/chosen": -0.049347929656505585, "rewards/margins": 0.3328120708465576, "rewards/rejected": -0.3821600079536438, "step": 5615 }, { "epoch": 0.8685095689155229, "grad_norm": 5.343812942504883, "learning_rate": 3.947187535800207e-06, "logits/chosen": 11.60512924194336, "logits/rejected": 9.868955612182617, "logps/chosen": -314.24542236328125, "logps/rejected": -249.23345947265625, "loss": 0.7507, "rewards/accuracies": 0.25, "rewards/chosen": 0.0857694149017334, "rewards/margins": -0.0817815363407135, "rewards/rejected": 0.1675509512424469, "step": 5616 }, { "epoch": 0.868664218055287, "grad_norm": 5.257479190826416, "learning_rate": 3.946901134150533e-06, "logits/chosen": 11.600604057312012, "logits/rejected": 4.496735095977783, "logps/chosen": -415.73004150390625, "logps/rejected": -289.48992919921875, "loss": 0.5595, "rewards/accuracies": 0.75, "rewards/chosen": 0.43422386050224304, "rewards/margins": 0.5791343450546265, "rewards/rejected": -0.1449105441570282, "step": 5617 }, { "epoch": 0.8688188671950512, "grad_norm": 4.118588447570801, "learning_rate": 3.94661473250086e-06, "logits/chosen": 11.091497421264648, "logits/rejected": 9.375388145446777, "logps/chosen": -259.69384765625, "logps/rejected": -225.29159545898438, "loss": 0.6482, "rewards/accuracies": 0.5, "rewards/chosen": 0.3469921946525574, "rewards/margins": 0.124681755900383, "rewards/rejected": 0.22231043875217438, "step": 5618 }, { "epoch": 0.8689735163348153, "grad_norm": 5.650290012359619, "learning_rate": 3.946328330851187e-06, "logits/chosen": 8.009944915771484, "logits/rejected": 8.996103286743164, "logps/chosen": -313.7467956542969, "logps/rejected": -293.6455078125, "loss": 0.6783, "rewards/accuracies": 0.75, "rewards/chosen": 0.4786514341831207, "rewards/margins": 0.3097921311855316, "rewards/rejected": 0.1688593029975891, "step": 5619 }, { "epoch": 0.8691281654745795, "grad_norm": 7.998353481292725, "learning_rate": 3.9460419292015125e-06, "logits/chosen": 1.233825922012329, "logits/rejected": 3.0593180656433105, "logps/chosen": -317.2923583984375, "logps/rejected": -242.02273559570312, "loss": 0.743, "rewards/accuracies": 0.375, "rewards/chosen": 0.030298329889774323, "rewards/margins": 0.052943140268325806, "rewards/rejected": -0.022644802927970886, "step": 5620 }, { "epoch": 0.8692828146143438, "grad_norm": 5.124409198760986, "learning_rate": 3.945755527551839e-06, "logits/chosen": 15.041266441345215, "logits/rejected": 14.017416000366211, "logps/chosen": -180.35031127929688, "logps/rejected": -206.74505615234375, "loss": 0.6208, "rewards/accuracies": 0.5, "rewards/chosen": -0.2330726683139801, "rewards/margins": 0.26515519618988037, "rewards/rejected": -0.4982278645038605, "step": 5621 }, { "epoch": 0.8694374637541079, "grad_norm": 7.9604034423828125, "learning_rate": 3.945469125902166e-06, "logits/chosen": 16.699312210083008, "logits/rejected": 14.102435111999512, "logps/chosen": -297.41998291015625, "logps/rejected": -306.90478515625, "loss": 0.7426, "rewards/accuracies": 0.625, "rewards/chosen": -0.16882115602493286, "rewards/margins": -0.031309328973293304, "rewards/rejected": -0.13751183450222015, "step": 5622 }, { "epoch": 0.8695921128938721, "grad_norm": 6.771982192993164, "learning_rate": 3.9451827242524924e-06, "logits/chosen": 9.043725967407227, "logits/rejected": 6.445558071136475, "logps/chosen": -343.36566162109375, "logps/rejected": -282.7401428222656, "loss": 0.5892, "rewards/accuracies": 0.625, "rewards/chosen": 0.08827246725559235, "rewards/margins": 0.3778069317340851, "rewards/rejected": -0.2895345091819763, "step": 5623 }, { "epoch": 0.8697467620336362, "grad_norm": 8.483500480651855, "learning_rate": 3.944896322602818e-06, "logits/chosen": 17.842782974243164, "logits/rejected": 13.857070922851562, "logps/chosen": -245.7689971923828, "logps/rejected": -201.0692138671875, "loss": 0.7439, "rewards/accuracies": 0.375, "rewards/chosen": -0.33216142654418945, "rewards/margins": -0.051038503646850586, "rewards/rejected": -0.28112292289733887, "step": 5624 }, { "epoch": 0.8699014111734004, "grad_norm": 5.0938873291015625, "learning_rate": 3.944609920953145e-06, "logits/chosen": 15.27180290222168, "logits/rejected": 9.57136344909668, "logps/chosen": -295.0552673339844, "logps/rejected": -199.18890380859375, "loss": 0.7411, "rewards/accuracies": 0.5, "rewards/chosen": 0.0026002079248428345, "rewards/margins": -0.04800724238157272, "rewards/rejected": 0.050607435405254364, "step": 5625 }, { "epoch": 0.8700560603131645, "grad_norm": 7.9792890548706055, "learning_rate": 3.9443235193034715e-06, "logits/chosen": 8.136433601379395, "logits/rejected": 7.820018768310547, "logps/chosen": -403.07952880859375, "logps/rejected": -342.0752868652344, "loss": 0.6737, "rewards/accuracies": 0.5, "rewards/chosen": -0.12141789495944977, "rewards/margins": 0.1150195375084877, "rewards/rejected": -0.23643741011619568, "step": 5626 }, { "epoch": 0.8702107094529287, "grad_norm": 4.639564514160156, "learning_rate": 3.944037117653798e-06, "logits/chosen": 11.050708770751953, "logits/rejected": 16.36686134338379, "logps/chosen": -254.51023864746094, "logps/rejected": -277.595703125, "loss": 0.5577, "rewards/accuracies": 0.625, "rewards/chosen": 0.10722842812538147, "rewards/margins": 0.4551927149295807, "rewards/rejected": -0.3479642868041992, "step": 5627 }, { "epoch": 0.8703653585926928, "grad_norm": 8.111231803894043, "learning_rate": 3.943750716004124e-06, "logits/chosen": 12.284795761108398, "logits/rejected": 10.267173767089844, "logps/chosen": -269.4294738769531, "logps/rejected": -257.981201171875, "loss": 0.7234, "rewards/accuracies": 0.625, "rewards/chosen": 0.0955488383769989, "rewards/margins": 0.1421506106853485, "rewards/rejected": -0.04660177230834961, "step": 5628 }, { "epoch": 0.870520007732457, "grad_norm": 7.362261772155762, "learning_rate": 3.943464314354451e-06, "logits/chosen": 11.454507827758789, "logits/rejected": 10.433797836303711, "logps/chosen": -286.57373046875, "logps/rejected": -350.7206115722656, "loss": 0.6955, "rewards/accuracies": 0.5, "rewards/chosen": 0.5244772434234619, "rewards/margins": 0.109484001994133, "rewards/rejected": 0.4149932265281677, "step": 5629 }, { "epoch": 0.8706746568722211, "grad_norm": 6.0964436531066895, "learning_rate": 3.943177912704777e-06, "logits/chosen": 10.769762992858887, "logits/rejected": 9.031015396118164, "logps/chosen": -622.0875854492188, "logps/rejected": -565.76416015625, "loss": 0.6787, "rewards/accuracies": 0.625, "rewards/chosen": 0.5956319570541382, "rewards/margins": 0.06504993140697479, "rewards/rejected": 0.530582070350647, "step": 5630 }, { "epoch": 0.8708293060119853, "grad_norm": 5.355895519256592, "learning_rate": 3.942891511055104e-06, "logits/chosen": 9.512357711791992, "logits/rejected": 10.41882038116455, "logps/chosen": -236.83792114257812, "logps/rejected": -218.12615966796875, "loss": 0.7215, "rewards/accuracies": 0.5, "rewards/chosen": 0.22888503968715668, "rewards/margins": 0.095341257750988, "rewards/rejected": 0.13354375958442688, "step": 5631 }, { "epoch": 0.8709839551517494, "grad_norm": 5.857627868652344, "learning_rate": 3.942605109405431e-06, "logits/chosen": 8.529337882995605, "logits/rejected": 7.599100589752197, "logps/chosen": -244.71400451660156, "logps/rejected": -272.49542236328125, "loss": 0.6863, "rewards/accuracies": 0.375, "rewards/chosen": 0.046622369438409805, "rewards/margins": 0.08016528189182281, "rewards/rejected": -0.033542923629283905, "step": 5632 }, { "epoch": 0.8711386042915136, "grad_norm": 5.319549560546875, "learning_rate": 3.942318707755756e-06, "logits/chosen": 14.629606246948242, "logits/rejected": 9.831968307495117, "logps/chosen": -271.996337890625, "logps/rejected": -241.22518920898438, "loss": 0.5885, "rewards/accuracies": 0.625, "rewards/chosen": 0.11663269251585007, "rewards/margins": 0.32205379009246826, "rewards/rejected": -0.2054210901260376, "step": 5633 }, { "epoch": 0.8712932534312778, "grad_norm": 4.89212703704834, "learning_rate": 3.942032306106083e-06, "logits/chosen": 7.2878217697143555, "logits/rejected": 11.789358139038086, "logps/chosen": -153.80325317382812, "logps/rejected": -209.60549926757812, "loss": 0.7105, "rewards/accuracies": 0.5, "rewards/chosen": -0.1948097050189972, "rewards/margins": 0.10725729912519455, "rewards/rejected": -0.30206698179244995, "step": 5634 }, { "epoch": 0.871447902571042, "grad_norm": 4.66943883895874, "learning_rate": 3.94174590445641e-06, "logits/chosen": 5.970928192138672, "logits/rejected": 6.47063684463501, "logps/chosen": -300.7245788574219, "logps/rejected": -234.2465057373047, "loss": 0.5905, "rewards/accuracies": 0.75, "rewards/chosen": 0.06971701979637146, "rewards/margins": 0.25740116834640503, "rewards/rejected": -0.18768411874771118, "step": 5635 }, { "epoch": 0.8716025517108061, "grad_norm": 4.462173938751221, "learning_rate": 3.941459502806736e-06, "logits/chosen": 15.418695449829102, "logits/rejected": 11.841038703918457, "logps/chosen": -269.9836730957031, "logps/rejected": -214.45849609375, "loss": 0.6207, "rewards/accuracies": 0.625, "rewards/chosen": 0.203358456492424, "rewards/margins": 0.3614543080329895, "rewards/rejected": -0.15809586644172668, "step": 5636 }, { "epoch": 0.8717572008505703, "grad_norm": 5.354763031005859, "learning_rate": 3.941173101157063e-06, "logits/chosen": 14.487762451171875, "logits/rejected": 12.11870002746582, "logps/chosen": -256.5377197265625, "logps/rejected": -271.5584411621094, "loss": 0.5547, "rewards/accuracies": 0.75, "rewards/chosen": 0.023633763194084167, "rewards/margins": 0.42893093824386597, "rewards/rejected": -0.4052972197532654, "step": 5637 }, { "epoch": 0.8719118499903344, "grad_norm": 3.8580222129821777, "learning_rate": 3.94088669950739e-06, "logits/chosen": 14.341715812683105, "logits/rejected": 9.587904930114746, "logps/chosen": -213.06634521484375, "logps/rejected": -190.93557739257812, "loss": 0.4934, "rewards/accuracies": 1.0, "rewards/chosen": 0.2575652301311493, "rewards/margins": 0.4646955728530884, "rewards/rejected": -0.2071303427219391, "step": 5638 }, { "epoch": 0.8720664991300986, "grad_norm": 5.44061279296875, "learning_rate": 3.940600297857716e-06, "logits/chosen": 15.466172218322754, "logits/rejected": 11.639422416687012, "logps/chosen": -345.5414733886719, "logps/rejected": -268.7454528808594, "loss": 0.5077, "rewards/accuracies": 0.875, "rewards/chosen": 0.2716158926486969, "rewards/margins": 0.5531377792358398, "rewards/rejected": -0.28152191638946533, "step": 5639 }, { "epoch": 0.8722211482698627, "grad_norm": 13.47822380065918, "learning_rate": 3.940313896208042e-06, "logits/chosen": 11.579065322875977, "logits/rejected": 6.654143810272217, "logps/chosen": -470.0531005859375, "logps/rejected": -327.0259704589844, "loss": 0.5623, "rewards/accuracies": 0.625, "rewards/chosen": 0.3596084713935852, "rewards/margins": 0.4161161184310913, "rewards/rejected": -0.05650767683982849, "step": 5640 }, { "epoch": 0.8723757974096269, "grad_norm": 9.235669136047363, "learning_rate": 3.940027494558369e-06, "logits/chosen": 10.109981536865234, "logits/rejected": 8.653867721557617, "logps/chosen": -327.7939758300781, "logps/rejected": -338.1885986328125, "loss": 0.6744, "rewards/accuracies": 0.625, "rewards/chosen": -0.31724536418914795, "rewards/margins": 0.29116860032081604, "rewards/rejected": -0.6084139943122864, "step": 5641 }, { "epoch": 0.872530446549391, "grad_norm": 3.829383134841919, "learning_rate": 3.939741092908695e-06, "logits/chosen": 8.609882354736328, "logits/rejected": 4.51651668548584, "logps/chosen": -444.54119873046875, "logps/rejected": -325.2269287109375, "loss": 0.4444, "rewards/accuracies": 0.875, "rewards/chosen": 0.5772438645362854, "rewards/margins": 0.7134607434272766, "rewards/rejected": -0.1362168937921524, "step": 5642 }, { "epoch": 0.8726850956891552, "grad_norm": 5.789526462554932, "learning_rate": 3.939454691259022e-06, "logits/chosen": 4.465651035308838, "logits/rejected": 1.058051347732544, "logps/chosen": -271.0077209472656, "logps/rejected": -176.4681854248047, "loss": 0.7499, "rewards/accuracies": 0.375, "rewards/chosen": -0.15213823318481445, "rewards/margins": 0.029533851891756058, "rewards/rejected": -0.1816720813512802, "step": 5643 }, { "epoch": 0.8728397448289194, "grad_norm": 4.762213230133057, "learning_rate": 3.939168289609349e-06, "logits/chosen": 5.643044471740723, "logits/rejected": 5.361799716949463, "logps/chosen": -182.87486267089844, "logps/rejected": -193.81788635253906, "loss": 0.6077, "rewards/accuracies": 0.625, "rewards/chosen": 0.016397807747125626, "rewards/margins": 0.20616284012794495, "rewards/rejected": -0.1897650510072708, "step": 5644 }, { "epoch": 0.8729943939686835, "grad_norm": 5.213296890258789, "learning_rate": 3.938881887959675e-06, "logits/chosen": 9.429723739624023, "logits/rejected": 9.652655601501465, "logps/chosen": -248.0252685546875, "logps/rejected": -296.86798095703125, "loss": 0.6675, "rewards/accuracies": 0.375, "rewards/chosen": -0.06651425361633301, "rewards/margins": 0.1570776402950287, "rewards/rejected": -0.2235919088125229, "step": 5645 }, { "epoch": 0.8731490431084478, "grad_norm": 5.00590705871582, "learning_rate": 3.938595486310001e-06, "logits/chosen": 11.406298637390137, "logits/rejected": 4.312463760375977, "logps/chosen": -290.09100341796875, "logps/rejected": -232.20513916015625, "loss": 0.4868, "rewards/accuracies": 0.75, "rewards/chosen": 0.032554298639297485, "rewards/margins": 0.666236400604248, "rewards/rejected": -0.6336820721626282, "step": 5646 }, { "epoch": 0.8733036922482119, "grad_norm": 5.848501682281494, "learning_rate": 3.938309084660328e-06, "logits/chosen": 8.37657642364502, "logits/rejected": 0.4146420955657959, "logps/chosen": -219.15805053710938, "logps/rejected": -121.03724670410156, "loss": 0.699, "rewards/accuracies": 0.375, "rewards/chosen": -0.04975175857543945, "rewards/margins": 0.10176274180412292, "rewards/rejected": -0.15151448547840118, "step": 5647 }, { "epoch": 0.8734583413879761, "grad_norm": 8.237857818603516, "learning_rate": 3.9380226830106544e-06, "logits/chosen": 9.774773597717285, "logits/rejected": 4.954183578491211, "logps/chosen": -381.2142333984375, "logps/rejected": -292.44952392578125, "loss": 0.9024, "rewards/accuracies": 0.375, "rewards/chosen": -0.3442268371582031, "rewards/margins": -0.25781023502349854, "rewards/rejected": -0.08641661703586578, "step": 5648 }, { "epoch": 0.8736129905277402, "grad_norm": 6.017556190490723, "learning_rate": 3.937736281360981e-06, "logits/chosen": 7.605977535247803, "logits/rejected": 7.767765045166016, "logps/chosen": -300.46478271484375, "logps/rejected": -304.66912841796875, "loss": 0.8732, "rewards/accuracies": 0.375, "rewards/chosen": 0.3434692323207855, "rewards/margins": -0.21501006186008453, "rewards/rejected": 0.5584793090820312, "step": 5649 }, { "epoch": 0.8737676396675044, "grad_norm": 4.724082946777344, "learning_rate": 3.937449879711308e-06, "logits/chosen": 12.63094711303711, "logits/rejected": 15.25442886352539, "logps/chosen": -277.34796142578125, "logps/rejected": -263.77099609375, "loss": 0.641, "rewards/accuracies": 0.625, "rewards/chosen": 0.17564235627651215, "rewards/margins": 0.15760484337806702, "rewards/rejected": 0.01803751103579998, "step": 5650 }, { "epoch": 0.8739222888072685, "grad_norm": 4.252694129943848, "learning_rate": 3.937163478061634e-06, "logits/chosen": 9.061813354492188, "logits/rejected": 8.071301460266113, "logps/chosen": -234.41494750976562, "logps/rejected": -245.01809692382812, "loss": 0.6188, "rewards/accuracies": 0.5, "rewards/chosen": 0.0609808973968029, "rewards/margins": 0.26599031686782837, "rewards/rejected": -0.20500941574573517, "step": 5651 }, { "epoch": 0.8740769379470327, "grad_norm": 5.34287166595459, "learning_rate": 3.936877076411961e-06, "logits/chosen": 13.25833797454834, "logits/rejected": 10.553860664367676, "logps/chosen": -249.4315643310547, "logps/rejected": -243.89340209960938, "loss": 0.5934, "rewards/accuracies": 0.75, "rewards/chosen": 0.182373508810997, "rewards/margins": 0.5319761633872986, "rewards/rejected": -0.3496026396751404, "step": 5652 }, { "epoch": 0.8742315870867968, "grad_norm": 5.252841472625732, "learning_rate": 3.936590674762287e-06, "logits/chosen": 17.859966278076172, "logits/rejected": 2.896955966949463, "logps/chosen": -349.1850280761719, "logps/rejected": -167.3238525390625, "loss": 0.5405, "rewards/accuracies": 0.625, "rewards/chosen": 0.13819772005081177, "rewards/margins": 0.7922736406326294, "rewards/rejected": -0.6540759801864624, "step": 5653 }, { "epoch": 0.874386236226561, "grad_norm": 4.111155033111572, "learning_rate": 3.9363042731126135e-06, "logits/chosen": 12.214452743530273, "logits/rejected": 11.434420585632324, "logps/chosen": -239.00567626953125, "logps/rejected": -260.8659973144531, "loss": 0.6608, "rewards/accuracies": 0.5, "rewards/chosen": 0.2804913818836212, "rewards/margins": 0.15111541748046875, "rewards/rejected": 0.12937593460083008, "step": 5654 }, { "epoch": 0.8745408853663251, "grad_norm": 6.461802005767822, "learning_rate": 3.93601787146294e-06, "logits/chosen": 9.54474925994873, "logits/rejected": 10.665502548217773, "logps/chosen": -245.351318359375, "logps/rejected": -277.05181884765625, "loss": 0.6939, "rewards/accuracies": 0.5, "rewards/chosen": -0.009486563503742218, "rewards/margins": 0.1848127543926239, "rewards/rejected": -0.19429929554462433, "step": 5655 }, { "epoch": 0.8746955345060893, "grad_norm": 3.8820958137512207, "learning_rate": 3.935731469813267e-06, "logits/chosen": 4.143945217132568, "logits/rejected": 3.234565019607544, "logps/chosen": -203.73922729492188, "logps/rejected": -270.5245361328125, "loss": 0.486, "rewards/accuracies": 0.75, "rewards/chosen": -0.16650941967964172, "rewards/margins": 0.6335764527320862, "rewards/rejected": -0.8000859022140503, "step": 5656 }, { "epoch": 0.8748501836458534, "grad_norm": 4.7538347244262695, "learning_rate": 3.9354450681635935e-06, "logits/chosen": 11.000550270080566, "logits/rejected": 6.898792266845703, "logps/chosen": -361.6807556152344, "logps/rejected": -219.75794982910156, "loss": 0.6001, "rewards/accuracies": 0.75, "rewards/chosen": -0.009472116827964783, "rewards/margins": 0.2839500308036804, "rewards/rejected": -0.293422132730484, "step": 5657 }, { "epoch": 0.8750048327856176, "grad_norm": 4.112393856048584, "learning_rate": 3.935158666513919e-06, "logits/chosen": 11.376110076904297, "logits/rejected": 4.066917896270752, "logps/chosen": -268.251708984375, "logps/rejected": -192.46917724609375, "loss": 0.5249, "rewards/accuracies": 0.625, "rewards/chosen": -0.07191857695579529, "rewards/margins": 0.5339853167533875, "rewards/rejected": -0.6059039235115051, "step": 5658 }, { "epoch": 0.8751594819253818, "grad_norm": 4.6746745109558105, "learning_rate": 3.934872264864246e-06, "logits/chosen": 7.891869068145752, "logits/rejected": 6.6777143478393555, "logps/chosen": -162.88125610351562, "logps/rejected": -173.12864685058594, "loss": 0.7765, "rewards/accuracies": 0.625, "rewards/chosen": -0.14955027401447296, "rewards/margins": 0.012018300592899323, "rewards/rejected": -0.16156858205795288, "step": 5659 }, { "epoch": 0.875314131065146, "grad_norm": 6.297798156738281, "learning_rate": 3.9345858632145726e-06, "logits/chosen": 6.34410285949707, "logits/rejected": 8.712772369384766, "logps/chosen": -265.5455322265625, "logps/rejected": -333.2878112792969, "loss": 0.6323, "rewards/accuracies": 0.75, "rewards/chosen": 0.0799713209271431, "rewards/margins": 0.1753229796886444, "rewards/rejected": -0.09535164386034012, "step": 5660 }, { "epoch": 0.8754687802049101, "grad_norm": 5.930227756500244, "learning_rate": 3.934299461564899e-06, "logits/chosen": 7.926520347595215, "logits/rejected": 3.0510663986206055, "logps/chosen": -369.01934814453125, "logps/rejected": -275.99578857421875, "loss": 0.6328, "rewards/accuracies": 0.5, "rewards/chosen": 0.07866773009300232, "rewards/margins": 0.19924335181713104, "rewards/rejected": -0.12057562172412872, "step": 5661 }, { "epoch": 0.8756234293446743, "grad_norm": 8.665170669555664, "learning_rate": 3.934013059915225e-06, "logits/chosen": 13.285758018493652, "logits/rejected": 13.363130569458008, "logps/chosen": -504.457275390625, "logps/rejected": -454.870849609375, "loss": 0.6126, "rewards/accuracies": 0.75, "rewards/chosen": 0.28258201479911804, "rewards/margins": 0.3246647119522095, "rewards/rejected": -0.04208267107605934, "step": 5662 }, { "epoch": 0.8757780784844384, "grad_norm": 3.694821834564209, "learning_rate": 3.933726658265552e-06, "logits/chosen": 8.232464790344238, "logits/rejected": 6.61293888092041, "logps/chosen": -234.60890197753906, "logps/rejected": -176.3289337158203, "loss": 0.555, "rewards/accuracies": 0.625, "rewards/chosen": -0.12071295827627182, "rewards/margins": 0.33618611097335815, "rewards/rejected": -0.4568990468978882, "step": 5663 }, { "epoch": 0.8759327276242026, "grad_norm": 6.1637139320373535, "learning_rate": 3.933440256615878e-06, "logits/chosen": 10.9147367477417, "logits/rejected": 8.75466537475586, "logps/chosen": -434.5308837890625, "logps/rejected": -262.6238098144531, "loss": 0.7895, "rewards/accuracies": 0.5, "rewards/chosen": -0.04773131012916565, "rewards/margins": 0.059706032276153564, "rewards/rejected": -0.10743732750415802, "step": 5664 }, { "epoch": 0.8760873767639668, "grad_norm": 4.539864540100098, "learning_rate": 3.933153854966205e-06, "logits/chosen": 13.912259101867676, "logits/rejected": 2.1627326011657715, "logps/chosen": -329.2405700683594, "logps/rejected": -151.69346618652344, "loss": 0.5578, "rewards/accuracies": 0.875, "rewards/chosen": -0.14639337360858917, "rewards/margins": 0.3467675447463989, "rewards/rejected": -0.4931609034538269, "step": 5665 }, { "epoch": 0.8762420259037309, "grad_norm": 4.697590351104736, "learning_rate": 3.932867453316531e-06, "logits/chosen": 9.148768424987793, "logits/rejected": 0.09035599231719971, "logps/chosen": -268.40838623046875, "logps/rejected": -214.3895721435547, "loss": 0.6741, "rewards/accuracies": 0.75, "rewards/chosen": 0.139594167470932, "rewards/margins": 0.378447562456131, "rewards/rejected": -0.23885338008403778, "step": 5666 }, { "epoch": 0.876396675043495, "grad_norm": 5.498101711273193, "learning_rate": 3.932581051666857e-06, "logits/chosen": 16.134925842285156, "logits/rejected": 12.661490440368652, "logps/chosen": -350.7439270019531, "logps/rejected": -291.5743713378906, "loss": 0.5646, "rewards/accuracies": 0.75, "rewards/chosen": 0.213811457157135, "rewards/margins": 0.4235631227493286, "rewards/rejected": -0.20975171029567719, "step": 5667 }, { "epoch": 0.8765513241832592, "grad_norm": 6.963108062744141, "learning_rate": 3.932294650017184e-06, "logits/chosen": 9.436728477478027, "logits/rejected": 11.135045051574707, "logps/chosen": -338.39715576171875, "logps/rejected": -404.2248840332031, "loss": 0.8649, "rewards/accuracies": 0.375, "rewards/chosen": 0.0227874293923378, "rewards/margins": -0.1967308521270752, "rewards/rejected": 0.21951822936534882, "step": 5668 }, { "epoch": 0.8767059733230234, "grad_norm": 5.140109539031982, "learning_rate": 3.932008248367511e-06, "logits/chosen": 11.411271095275879, "logits/rejected": 8.633234977722168, "logps/chosen": -493.7334289550781, "logps/rejected": -347.44873046875, "loss": 0.4756, "rewards/accuracies": 0.625, "rewards/chosen": 0.48280030488967896, "rewards/margins": 0.7372726202011108, "rewards/rejected": -0.2544722259044647, "step": 5669 }, { "epoch": 0.8768606224627875, "grad_norm": 7.385674953460693, "learning_rate": 3.931721846717837e-06, "logits/chosen": 10.652819633483887, "logits/rejected": 4.669861793518066, "logps/chosen": -313.36444091796875, "logps/rejected": -223.39328002929688, "loss": 0.7038, "rewards/accuracies": 0.625, "rewards/chosen": 0.143855482339859, "rewards/margins": 0.1621609777212143, "rewards/rejected": -0.01830551028251648, "step": 5670 }, { "epoch": 0.8770152716025517, "grad_norm": 7.921140193939209, "learning_rate": 3.931435445068164e-06, "logits/chosen": 10.085105895996094, "logits/rejected": 10.221999168395996, "logps/chosen": -346.9088439941406, "logps/rejected": -318.07415771484375, "loss": 0.7316, "rewards/accuracies": 0.5, "rewards/chosen": 0.12066832184791565, "rewards/margins": -0.0223359614610672, "rewards/rejected": 0.14300426840782166, "step": 5671 }, { "epoch": 0.8771699207423159, "grad_norm": 5.412511348724365, "learning_rate": 3.931149043418491e-06, "logits/chosen": 13.754417419433594, "logits/rejected": 11.952595710754395, "logps/chosen": -257.49273681640625, "logps/rejected": -247.9320068359375, "loss": 0.7382, "rewards/accuracies": 0.5, "rewards/chosen": 0.23533602058887482, "rewards/margins": 0.04562690854072571, "rewards/rejected": 0.18970909714698792, "step": 5672 }, { "epoch": 0.8773245698820801, "grad_norm": 6.484980583190918, "learning_rate": 3.9308626417688165e-06, "logits/chosen": -0.3907392919063568, "logits/rejected": 6.294769763946533, "logps/chosen": -194.39942932128906, "logps/rejected": -307.7702331542969, "loss": 0.8776, "rewards/accuracies": 0.375, "rewards/chosen": -0.15546885132789612, "rewards/margins": -0.11805954575538635, "rewards/rejected": -0.03740927577018738, "step": 5673 }, { "epoch": 0.8774792190218442, "grad_norm": 5.181357383728027, "learning_rate": 3.930576240119143e-06, "logits/chosen": 13.858302116394043, "logits/rejected": 10.649045944213867, "logps/chosen": -180.3077392578125, "logps/rejected": -191.67022705078125, "loss": 0.6879, "rewards/accuracies": 0.375, "rewards/chosen": -0.22265706956386566, "rewards/margins": 0.16413095593452454, "rewards/rejected": -0.3867880403995514, "step": 5674 }, { "epoch": 0.8776338681616084, "grad_norm": 4.592799186706543, "learning_rate": 3.93028983846947e-06, "logits/chosen": 16.543363571166992, "logits/rejected": 12.259946823120117, "logps/chosen": -267.28802490234375, "logps/rejected": -229.10971069335938, "loss": 0.5494, "rewards/accuracies": 0.75, "rewards/chosen": -0.025543205440044403, "rewards/margins": 0.3987596333026886, "rewards/rejected": -0.4243028163909912, "step": 5675 }, { "epoch": 0.8777885173013725, "grad_norm": 6.9234938621521, "learning_rate": 3.9300034368197964e-06, "logits/chosen": 9.289552688598633, "logits/rejected": 7.248505115509033, "logps/chosen": -296.2676086425781, "logps/rejected": -239.59759521484375, "loss": 0.7121, "rewards/accuracies": 0.5, "rewards/chosen": -0.08736447989940643, "rewards/margins": 0.1253564953804016, "rewards/rejected": -0.21272096037864685, "step": 5676 }, { "epoch": 0.8779431664411367, "grad_norm": 4.872354030609131, "learning_rate": 3.929717035170123e-06, "logits/chosen": 7.813759803771973, "logits/rejected": 2.984287738800049, "logps/chosen": -214.008056640625, "logps/rejected": -204.40634155273438, "loss": 0.7225, "rewards/accuracies": 0.75, "rewards/chosen": 0.041722774505615234, "rewards/margins": 0.04574348032474518, "rewards/rejected": -0.004020705819129944, "step": 5677 }, { "epoch": 0.8780978155809008, "grad_norm": 5.934500694274902, "learning_rate": 3.92943063352045e-06, "logits/chosen": 16.50546646118164, "logits/rejected": 10.927936553955078, "logps/chosen": -327.83599853515625, "logps/rejected": -323.0665588378906, "loss": 0.6838, "rewards/accuracies": 0.5, "rewards/chosen": 0.18454760313034058, "rewards/margins": 0.1856900453567505, "rewards/rejected": -0.0011423975229263306, "step": 5678 }, { "epoch": 0.878252464720665, "grad_norm": 5.392816543579102, "learning_rate": 3.9291442318707755e-06, "logits/chosen": 7.035706520080566, "logits/rejected": 7.192243576049805, "logps/chosen": -134.77679443359375, "logps/rejected": -176.96095275878906, "loss": 0.7881, "rewards/accuracies": 0.5, "rewards/chosen": 0.09383855015039444, "rewards/margins": 0.017092328518629074, "rewards/rejected": 0.07674622535705566, "step": 5679 }, { "epoch": 0.8784071138604291, "grad_norm": 6.367462635040283, "learning_rate": 3.928857830221102e-06, "logits/chosen": 8.42098331451416, "logits/rejected": 9.098801612854004, "logps/chosen": -298.13629150390625, "logps/rejected": -310.239013671875, "loss": 0.7174, "rewards/accuracies": 0.5, "rewards/chosen": 0.26808756589889526, "rewards/margins": 0.11679258942604065, "rewards/rejected": 0.1512949913740158, "step": 5680 }, { "epoch": 0.8785617630001933, "grad_norm": 3.8521969318389893, "learning_rate": 3.928571428571429e-06, "logits/chosen": 14.077341079711914, "logits/rejected": 8.14858627319336, "logps/chosen": -282.34661865234375, "logps/rejected": -224.75416564941406, "loss": 0.397, "rewards/accuracies": 0.875, "rewards/chosen": 0.0699823871254921, "rewards/margins": 0.8541668653488159, "rewards/rejected": -0.784184455871582, "step": 5681 }, { "epoch": 0.8787164121399574, "grad_norm": 4.985754013061523, "learning_rate": 3.9282850269217555e-06, "logits/chosen": 11.141313552856445, "logits/rejected": 4.546165943145752, "logps/chosen": -293.9993896484375, "logps/rejected": -202.7480926513672, "loss": 0.5562, "rewards/accuracies": 0.5, "rewards/chosen": 0.6147188544273376, "rewards/margins": 0.6592667102813721, "rewards/rejected": -0.044547900557518005, "step": 5682 }, { "epoch": 0.8788710612797216, "grad_norm": 6.047011375427246, "learning_rate": 3.927998625272082e-06, "logits/chosen": 9.215534210205078, "logits/rejected": 7.423741340637207, "logps/chosen": -348.14825439453125, "logps/rejected": -331.0448913574219, "loss": 0.5222, "rewards/accuracies": 0.875, "rewards/chosen": -0.03747911751270294, "rewards/margins": 0.4158361256122589, "rewards/rejected": -0.45331525802612305, "step": 5683 }, { "epoch": 0.8790257104194857, "grad_norm": 5.067002296447754, "learning_rate": 3.927712223622409e-06, "logits/chosen": 7.173776149749756, "logits/rejected": 5.573089122772217, "logps/chosen": -223.5181884765625, "logps/rejected": -255.6622314453125, "loss": 0.5584, "rewards/accuracies": 0.625, "rewards/chosen": 0.060227587819099426, "rewards/margins": 0.3956969082355499, "rewards/rejected": -0.3354693651199341, "step": 5684 }, { "epoch": 0.87918035955925, "grad_norm": 6.530332565307617, "learning_rate": 3.9274258219727354e-06, "logits/chosen": 12.846552848815918, "logits/rejected": 9.800727844238281, "logps/chosen": -264.124755859375, "logps/rejected": -253.51968383789062, "loss": 0.686, "rewards/accuracies": 0.75, "rewards/chosen": 0.1744282841682434, "rewards/margins": 0.0831167995929718, "rewards/rejected": 0.09131143987178802, "step": 5685 }, { "epoch": 0.8793350086990142, "grad_norm": 4.751879692077637, "learning_rate": 3.927139420323061e-06, "logits/chosen": 14.078407287597656, "logits/rejected": 9.540026664733887, "logps/chosen": -317.36529541015625, "logps/rejected": -232.094970703125, "loss": 0.5829, "rewards/accuracies": 0.625, "rewards/chosen": 0.26495128870010376, "rewards/margins": 0.38752806186676025, "rewards/rejected": -0.12257680296897888, "step": 5686 }, { "epoch": 0.8794896578387783, "grad_norm": 5.118988513946533, "learning_rate": 3.926853018673388e-06, "logits/chosen": 6.926477909088135, "logits/rejected": 6.143775939941406, "logps/chosen": -294.31646728515625, "logps/rejected": -249.96615600585938, "loss": 0.6908, "rewards/accuracies": 0.625, "rewards/chosen": 0.36341917514801025, "rewards/margins": 0.05482121556997299, "rewards/rejected": 0.30859795212745667, "step": 5687 }, { "epoch": 0.8796443069785425, "grad_norm": 4.586796283721924, "learning_rate": 3.9265666170237145e-06, "logits/chosen": 4.956618785858154, "logits/rejected": 6.146323204040527, "logps/chosen": -208.92919921875, "logps/rejected": -222.78135681152344, "loss": 0.7469, "rewards/accuracies": 0.125, "rewards/chosen": -0.05154542997479439, "rewards/margins": -0.0685061439871788, "rewards/rejected": 0.016960717737674713, "step": 5688 }, { "epoch": 0.8797989561183066, "grad_norm": 5.894932270050049, "learning_rate": 3.926280215374041e-06, "logits/chosen": 4.68338680267334, "logits/rejected": 8.852193832397461, "logps/chosen": -275.29559326171875, "logps/rejected": -287.2887268066406, "loss": 0.6729, "rewards/accuracies": 0.625, "rewards/chosen": -0.12761251628398895, "rewards/margins": 0.17982949316501617, "rewards/rejected": -0.3074420094490051, "step": 5689 }, { "epoch": 0.8799536052580708, "grad_norm": 4.680460453033447, "learning_rate": 3.925993813724368e-06, "logits/chosen": 16.064149856567383, "logits/rejected": 12.807035446166992, "logps/chosen": -341.8258056640625, "logps/rejected": -276.86669921875, "loss": 0.6504, "rewards/accuracies": 0.5, "rewards/chosen": 0.20412616431713104, "rewards/margins": 0.14597684144973755, "rewards/rejected": 0.05814933776855469, "step": 5690 }, { "epoch": 0.8801082543978349, "grad_norm": 5.986044406890869, "learning_rate": 3.9257074120746945e-06, "logits/chosen": 11.95112419128418, "logits/rejected": 10.669514656066895, "logps/chosen": -356.47271728515625, "logps/rejected": -364.63519287109375, "loss": 0.7306, "rewards/accuracies": 0.5, "rewards/chosen": 0.45785582065582275, "rewards/margins": 0.03142721951007843, "rewards/rejected": 0.4264286160469055, "step": 5691 }, { "epoch": 0.8802629035375991, "grad_norm": 5.993568420410156, "learning_rate": 3.92542101042502e-06, "logits/chosen": 9.345330238342285, "logits/rejected": 4.395379066467285, "logps/chosen": -404.22662353515625, "logps/rejected": -324.53973388671875, "loss": 0.5773, "rewards/accuracies": 0.625, "rewards/chosen": 0.33714932203292847, "rewards/margins": 0.46598827838897705, "rewards/rejected": -0.1288389265537262, "step": 5692 }, { "epoch": 0.8804175526773632, "grad_norm": 4.783054351806641, "learning_rate": 3.925134608775347e-06, "logits/chosen": 8.3233060836792, "logits/rejected": 4.597684860229492, "logps/chosen": -290.25286865234375, "logps/rejected": -197.84854125976562, "loss": 0.728, "rewards/accuracies": 0.375, "rewards/chosen": -0.08984370529651642, "rewards/margins": 0.0006747245788574219, "rewards/rejected": -0.09051842987537384, "step": 5693 }, { "epoch": 0.8805722018171274, "grad_norm": 3.8297502994537354, "learning_rate": 3.924848207125674e-06, "logits/chosen": 10.921207427978516, "logits/rejected": 9.71729850769043, "logps/chosen": -211.61270141601562, "logps/rejected": -220.1630859375, "loss": 0.5926, "rewards/accuracies": 0.75, "rewards/chosen": 0.42336779832839966, "rewards/margins": 0.35031557083129883, "rewards/rejected": 0.07305224239826202, "step": 5694 }, { "epoch": 0.8807268509568915, "grad_norm": 4.451205730438232, "learning_rate": 3.924561805476e-06, "logits/chosen": 17.75222396850586, "logits/rejected": 13.98066520690918, "logps/chosen": -236.87969970703125, "logps/rejected": -178.76641845703125, "loss": 0.4782, "rewards/accuracies": 0.75, "rewards/chosen": 0.2390918731689453, "rewards/margins": 0.609368085861206, "rewards/rejected": -0.37027621269226074, "step": 5695 }, { "epoch": 0.8808815000966557, "grad_norm": 3.727084159851074, "learning_rate": 3.924275403826326e-06, "logits/chosen": 8.494039535522461, "logits/rejected": 6.410273551940918, "logps/chosen": -199.05967712402344, "logps/rejected": -192.14984130859375, "loss": 0.5722, "rewards/accuracies": 0.625, "rewards/chosen": 0.2859953045845032, "rewards/margins": 0.3050665259361267, "rewards/rejected": -0.01907125487923622, "step": 5696 }, { "epoch": 0.8810361492364198, "grad_norm": 4.612589359283447, "learning_rate": 3.923989002176653e-06, "logits/chosen": 12.652214050292969, "logits/rejected": 4.704769134521484, "logps/chosen": -366.7347717285156, "logps/rejected": -176.88043212890625, "loss": 0.548, "rewards/accuracies": 0.5, "rewards/chosen": 0.318561851978302, "rewards/margins": 0.4347693920135498, "rewards/rejected": -0.116207554936409, "step": 5697 }, { "epoch": 0.8811907983761841, "grad_norm": 5.3449201583862305, "learning_rate": 3.923702600526979e-06, "logits/chosen": 11.487663269042969, "logits/rejected": 7.070026874542236, "logps/chosen": -287.8294677734375, "logps/rejected": -233.54437255859375, "loss": 0.642, "rewards/accuracies": 0.375, "rewards/chosen": 0.25714826583862305, "rewards/margins": 0.23449267446994781, "rewards/rejected": 0.022655636072158813, "step": 5698 }, { "epoch": 0.8813454475159482, "grad_norm": 5.4208149909973145, "learning_rate": 3.923416198877306e-06, "logits/chosen": 7.438830375671387, "logits/rejected": 5.745214462280273, "logps/chosen": -216.25875854492188, "logps/rejected": -319.15142822265625, "loss": 0.662, "rewards/accuracies": 0.5, "rewards/chosen": -0.1105307787656784, "rewards/margins": 0.19088268280029297, "rewards/rejected": -0.30141347646713257, "step": 5699 }, { "epoch": 0.8815000966557124, "grad_norm": 4.36551570892334, "learning_rate": 3.923129797227632e-06, "logits/chosen": 15.135406494140625, "logits/rejected": 13.00969123840332, "logps/chosen": -311.62030029296875, "logps/rejected": -293.10028076171875, "loss": 0.5552, "rewards/accuracies": 0.75, "rewards/chosen": 0.2520088851451874, "rewards/margins": 0.3791600465774536, "rewards/rejected": -0.12715116143226624, "step": 5700 }, { "epoch": 0.8816547457954765, "grad_norm": 4.1058807373046875, "learning_rate": 3.9228433955779585e-06, "logits/chosen": 10.676985740661621, "logits/rejected": 8.554781913757324, "logps/chosen": -196.91278076171875, "logps/rejected": -148.56385803222656, "loss": 0.5721, "rewards/accuracies": 0.875, "rewards/chosen": 0.23484423756599426, "rewards/margins": 0.2858145534992218, "rewards/rejected": -0.050970323383808136, "step": 5701 }, { "epoch": 0.8818093949352407, "grad_norm": 2.8860790729522705, "learning_rate": 3.922556993928285e-06, "logits/chosen": 7.610790729522705, "logits/rejected": -1.929937481880188, "logps/chosen": -182.20364379882812, "logps/rejected": -91.02428436279297, "loss": 0.5321, "rewards/accuracies": 0.75, "rewards/chosen": 0.08037164062261581, "rewards/margins": 0.3965491056442261, "rewards/rejected": -0.31617751717567444, "step": 5702 }, { "epoch": 0.8819640440750048, "grad_norm": 5.045615196228027, "learning_rate": 3.922270592278612e-06, "logits/chosen": 3.3387739658355713, "logits/rejected": 6.531632900238037, "logps/chosen": -164.33871459960938, "logps/rejected": -179.22242736816406, "loss": 0.7349, "rewards/accuracies": 0.625, "rewards/chosen": -0.17465294897556305, "rewards/margins": 0.003287695348262787, "rewards/rejected": -0.17794065177440643, "step": 5703 }, { "epoch": 0.882118693214769, "grad_norm": 5.3651604652404785, "learning_rate": 3.921984190628938e-06, "logits/chosen": 14.905363082885742, "logits/rejected": 1.7832754850387573, "logps/chosen": -386.7444152832031, "logps/rejected": -242.4635772705078, "loss": 0.5232, "rewards/accuracies": 0.75, "rewards/chosen": 0.21647655963897705, "rewards/margins": 0.44515755772590637, "rewards/rejected": -0.22868099808692932, "step": 5704 }, { "epoch": 0.8822733423545331, "grad_norm": 37.49298095703125, "learning_rate": 3.921697788979264e-06, "logits/chosen": 13.31360912322998, "logits/rejected": 8.313511848449707, "logps/chosen": -226.02801513671875, "logps/rejected": -132.7248992919922, "loss": 0.5479, "rewards/accuracies": 0.875, "rewards/chosen": 0.14264078438282013, "rewards/margins": 0.3591480851173401, "rewards/rejected": -0.21650728583335876, "step": 5705 }, { "epoch": 0.8824279914942973, "grad_norm": 4.866557598114014, "learning_rate": 3.921411387329591e-06, "logits/chosen": 11.690808296203613, "logits/rejected": 7.529850959777832, "logps/chosen": -358.360595703125, "logps/rejected": -228.60592651367188, "loss": 0.5891, "rewards/accuracies": 0.75, "rewards/chosen": 0.049704648554325104, "rewards/margins": 0.3491722345352173, "rewards/rejected": -0.2994675636291504, "step": 5706 }, { "epoch": 0.8825826406340614, "grad_norm": 5.29364538192749, "learning_rate": 3.9211249856799175e-06, "logits/chosen": 17.279762268066406, "logits/rejected": 10.129562377929688, "logps/chosen": -462.92352294921875, "logps/rejected": -326.7575988769531, "loss": 0.4071, "rewards/accuracies": 0.875, "rewards/chosen": 0.6093364953994751, "rewards/margins": 1.1377660036087036, "rewards/rejected": -0.5284295678138733, "step": 5707 }, { "epoch": 0.8827372897738256, "grad_norm": 5.2459611892700195, "learning_rate": 3.920838584030244e-06, "logits/chosen": 8.722107887268066, "logits/rejected": 6.616268157958984, "logps/chosen": -188.29290771484375, "logps/rejected": -202.31494140625, "loss": 0.9364, "rewards/accuracies": 0.5, "rewards/chosen": -0.32921215891838074, "rewards/margins": -0.29155609011650085, "rewards/rejected": -0.037656065076589584, "step": 5708 }, { "epoch": 0.8828919389135897, "grad_norm": 5.211967945098877, "learning_rate": 3.920552182380571e-06, "logits/chosen": 11.724498748779297, "logits/rejected": 10.188246726989746, "logps/chosen": -298.84246826171875, "logps/rejected": -312.59564208984375, "loss": 0.491, "rewards/accuracies": 0.75, "rewards/chosen": 0.3389541506767273, "rewards/margins": 0.5990217924118042, "rewards/rejected": -0.2600676417350769, "step": 5709 }, { "epoch": 0.8830465880533539, "grad_norm": 5.4087233543396, "learning_rate": 3.9202657807308975e-06, "logits/chosen": 16.124073028564453, "logits/rejected": 10.48360538482666, "logps/chosen": -246.78402709960938, "logps/rejected": -162.92636108398438, "loss": 0.7441, "rewards/accuracies": 0.5, "rewards/chosen": -0.2183852195739746, "rewards/margins": -0.05012936517596245, "rewards/rejected": -0.16825586557388306, "step": 5710 }, { "epoch": 0.8832012371931182, "grad_norm": 4.968470573425293, "learning_rate": 3.919979379081224e-06, "logits/chosen": 11.219609260559082, "logits/rejected": 8.278921127319336, "logps/chosen": -244.74037170410156, "logps/rejected": -215.74349975585938, "loss": 0.7217, "rewards/accuracies": 0.625, "rewards/chosen": -0.19195738434791565, "rewards/margins": 0.07371539622545242, "rewards/rejected": -0.2656727731227875, "step": 5711 }, { "epoch": 0.8833558863328823, "grad_norm": 7.693360805511475, "learning_rate": 3.91969297743155e-06, "logits/chosen": 9.648031234741211, "logits/rejected": 10.574610710144043, "logps/chosen": -344.81488037109375, "logps/rejected": -352.55194091796875, "loss": 0.845, "rewards/accuracies": 0.5, "rewards/chosen": 0.23703914880752563, "rewards/margins": -0.1912335455417633, "rewards/rejected": 0.42827269434928894, "step": 5712 }, { "epoch": 0.8835105354726465, "grad_norm": 7.614741802215576, "learning_rate": 3.9194065757818766e-06, "logits/chosen": 16.03765869140625, "logits/rejected": 12.23193359375, "logps/chosen": -316.5546875, "logps/rejected": -289.1417236328125, "loss": 0.8436, "rewards/accuracies": 0.25, "rewards/chosen": -0.3965412974357605, "rewards/margins": -0.14649152755737305, "rewards/rejected": -0.25004979968070984, "step": 5713 }, { "epoch": 0.8836651846124106, "grad_norm": 7.736001491546631, "learning_rate": 3.919120174132203e-06, "logits/chosen": 10.7076416015625, "logits/rejected": 10.2206392288208, "logps/chosen": -207.13267517089844, "logps/rejected": -234.1348419189453, "loss": 0.7957, "rewards/accuracies": 0.5, "rewards/chosen": -0.3026137351989746, "rewards/margins": 0.07845886051654816, "rewards/rejected": -0.3810725808143616, "step": 5714 }, { "epoch": 0.8838198337521748, "grad_norm": 11.667096138000488, "learning_rate": 3.91883377248253e-06, "logits/chosen": 12.753061294555664, "logits/rejected": 7.801212310791016, "logps/chosen": -373.0035400390625, "logps/rejected": -300.8188171386719, "loss": 0.6794, "rewards/accuracies": 0.75, "rewards/chosen": 0.614719033241272, "rewards/margins": 0.26208117604255676, "rewards/rejected": 0.3526378870010376, "step": 5715 }, { "epoch": 0.8839744828919389, "grad_norm": 8.16006088256836, "learning_rate": 3.9185473708328565e-06, "logits/chosen": 10.51950740814209, "logits/rejected": -3.701554298400879, "logps/chosen": -326.212890625, "logps/rejected": -200.83639526367188, "loss": 0.64, "rewards/accuracies": 0.5, "rewards/chosen": 0.3709743320941925, "rewards/margins": 0.5845832228660583, "rewards/rejected": -0.21360892057418823, "step": 5716 }, { "epoch": 0.8841291320317031, "grad_norm": 4.286136150360107, "learning_rate": 3.918260969183183e-06, "logits/chosen": 4.369946479797363, "logits/rejected": 7.638784885406494, "logps/chosen": -140.1514434814453, "logps/rejected": -136.88760375976562, "loss": 0.6431, "rewards/accuracies": 0.625, "rewards/chosen": -0.1485108882188797, "rewards/margins": 0.15217891335487366, "rewards/rejected": -0.30068981647491455, "step": 5717 }, { "epoch": 0.8842837811714672, "grad_norm": 7.765321731567383, "learning_rate": 3.91797456753351e-06, "logits/chosen": 13.634510040283203, "logits/rejected": 9.603250503540039, "logps/chosen": -371.98248291015625, "logps/rejected": -296.15875244140625, "loss": 0.7404, "rewards/accuracies": 0.5, "rewards/chosen": -0.1119348481297493, "rewards/margins": 0.08819341659545898, "rewards/rejected": -0.20012828707695007, "step": 5718 }, { "epoch": 0.8844384303112314, "grad_norm": 5.180447101593018, "learning_rate": 3.917688165883836e-06, "logits/chosen": 8.412239074707031, "logits/rejected": 5.719845771789551, "logps/chosen": -300.8465881347656, "logps/rejected": -276.9864807128906, "loss": 0.5265, "rewards/accuracies": 0.75, "rewards/chosen": 0.5363882780075073, "rewards/margins": 0.6356222629547119, "rewards/rejected": -0.0992339700460434, "step": 5719 }, { "epoch": 0.8845930794509955, "grad_norm": 4.950469017028809, "learning_rate": 3.917401764234162e-06, "logits/chosen": 11.615145683288574, "logits/rejected": 12.291160583496094, "logps/chosen": -340.7977600097656, "logps/rejected": -322.55218505859375, "loss": 0.5434, "rewards/accuracies": 0.875, "rewards/chosen": 0.5119456052780151, "rewards/margins": 0.4473438858985901, "rewards/rejected": 0.06460171937942505, "step": 5720 }, { "epoch": 0.8847477285907597, "grad_norm": 5.036688327789307, "learning_rate": 3.917115362584489e-06, "logits/chosen": 14.608207702636719, "logits/rejected": 7.88964319229126, "logps/chosen": -471.6127014160156, "logps/rejected": -324.5352783203125, "loss": 0.4216, "rewards/accuracies": 0.625, "rewards/chosen": 0.5948439836502075, "rewards/margins": 0.9432607293128967, "rewards/rejected": -0.3484167456626892, "step": 5721 }, { "epoch": 0.8849023777305238, "grad_norm": 4.146422863006592, "learning_rate": 3.9168289609348156e-06, "logits/chosen": 9.466646194458008, "logits/rejected": 12.533526420593262, "logps/chosen": -202.91134643554688, "logps/rejected": -194.45562744140625, "loss": 0.6314, "rewards/accuracies": 0.625, "rewards/chosen": 0.11016635596752167, "rewards/margins": 0.28474920988082886, "rewards/rejected": -0.17458288371562958, "step": 5722 }, { "epoch": 0.8850570268702881, "grad_norm": 5.789351940155029, "learning_rate": 3.916542559285142e-06, "logits/chosen": 7.4438910484313965, "logits/rejected": 11.713520050048828, "logps/chosen": -217.23736572265625, "logps/rejected": -242.3692626953125, "loss": 0.7695, "rewards/accuracies": 0.25, "rewards/chosen": -0.07172223925590515, "rewards/margins": 0.01283930242061615, "rewards/rejected": -0.0845615416765213, "step": 5723 }, { "epoch": 0.8852116760100522, "grad_norm": 5.6492390632629395, "learning_rate": 3.916256157635469e-06, "logits/chosen": 3.7442309856414795, "logits/rejected": 8.91292953491211, "logps/chosen": -223.8847198486328, "logps/rejected": -335.2643737792969, "loss": 0.7253, "rewards/accuracies": 0.625, "rewards/chosen": -0.10682099312543869, "rewards/margins": 0.18941956758499146, "rewards/rejected": -0.29624056816101074, "step": 5724 }, { "epoch": 0.8853663251498164, "grad_norm": 4.877742290496826, "learning_rate": 3.915969755985795e-06, "logits/chosen": 5.995549201965332, "logits/rejected": 3.7759718894958496, "logps/chosen": -186.341796875, "logps/rejected": -207.14224243164062, "loss": 0.6035, "rewards/accuracies": 0.5, "rewards/chosen": 0.009882714599370956, "rewards/margins": 0.33444270491600037, "rewards/rejected": -0.3245599865913391, "step": 5725 }, { "epoch": 0.8855209742895805, "grad_norm": 4.757693767547607, "learning_rate": 3.915683354336121e-06, "logits/chosen": 12.983833312988281, "logits/rejected": 4.540615081787109, "logps/chosen": -430.4539794921875, "logps/rejected": -351.2604675292969, "loss": 0.4529, "rewards/accuracies": 0.75, "rewards/chosen": 0.7119522094726562, "rewards/margins": 0.949677586555481, "rewards/rejected": -0.2377253770828247, "step": 5726 }, { "epoch": 0.8856756234293447, "grad_norm": 5.418318748474121, "learning_rate": 3.915396952686448e-06, "logits/chosen": 10.195610046386719, "logits/rejected": 6.444699764251709, "logps/chosen": -363.6402893066406, "logps/rejected": -317.5598449707031, "loss": 0.6229, "rewards/accuracies": 0.5, "rewards/chosen": 0.38743603229522705, "rewards/margins": 0.34168070554733276, "rewards/rejected": 0.0457552969455719, "step": 5727 }, { "epoch": 0.8858302725691088, "grad_norm": 4.7921953201293945, "learning_rate": 3.915110551036775e-06, "logits/chosen": 8.705930709838867, "logits/rejected": 1.3356387615203857, "logps/chosen": -271.5472412109375, "logps/rejected": -156.489990234375, "loss": 0.5629, "rewards/accuracies": 0.75, "rewards/chosen": 0.3196978271007538, "rewards/margins": 0.37005728483200073, "rewards/rejected": -0.050359442830085754, "step": 5728 }, { "epoch": 0.885984921708873, "grad_norm": 6.967679023742676, "learning_rate": 3.914824149387101e-06, "logits/chosen": 15.88611888885498, "logits/rejected": 11.138936996459961, "logps/chosen": -433.5938720703125, "logps/rejected": -398.22882080078125, "loss": 0.7217, "rewards/accuracies": 0.375, "rewards/chosen": 0.3259948492050171, "rewards/margins": -0.030917182564735413, "rewards/rejected": 0.3569120466709137, "step": 5729 }, { "epoch": 0.8861395708486371, "grad_norm": 7.154730796813965, "learning_rate": 3.914537747737427e-06, "logits/chosen": 14.754705429077148, "logits/rejected": 11.588245391845703, "logps/chosen": -321.4205322265625, "logps/rejected": -268.4865417480469, "loss": 0.7381, "rewards/accuracies": 0.5, "rewards/chosen": 0.08195889741182327, "rewards/margins": -0.04528498277068138, "rewards/rejected": 0.12724387645721436, "step": 5730 }, { "epoch": 0.8862942199884013, "grad_norm": 6.6600165367126465, "learning_rate": 3.914251346087754e-06, "logits/chosen": 9.972471237182617, "logits/rejected": 13.445955276489258, "logps/chosen": -244.17820739746094, "logps/rejected": -263.0948791503906, "loss": 0.777, "rewards/accuracies": 0.375, "rewards/chosen": 0.05811205506324768, "rewards/margins": -0.11703595519065857, "rewards/rejected": 0.17514801025390625, "step": 5731 }, { "epoch": 0.8864488691281655, "grad_norm": 8.647895812988281, "learning_rate": 3.91396494443808e-06, "logits/chosen": 16.506153106689453, "logits/rejected": 12.974069595336914, "logps/chosen": -459.3907775878906, "logps/rejected": -287.63702392578125, "loss": 0.7383, "rewards/accuracies": 0.375, "rewards/chosen": -0.1404932141304016, "rewards/margins": 0.07582530379295349, "rewards/rejected": -0.21631845831871033, "step": 5732 }, { "epoch": 0.8866035182679296, "grad_norm": 4.1455979347229, "learning_rate": 3.913678542788407e-06, "logits/chosen": 12.931025505065918, "logits/rejected": 13.822722434997559, "logps/chosen": -224.34390258789062, "logps/rejected": -162.61351013183594, "loss": 0.7132, "rewards/accuracies": 0.625, "rewards/chosen": -0.08715471625328064, "rewards/margins": -0.007643356919288635, "rewards/rejected": -0.079511359333992, "step": 5733 }, { "epoch": 0.8867581674076938, "grad_norm": 5.531655311584473, "learning_rate": 3.913392141138733e-06, "logits/chosen": 10.088375091552734, "logits/rejected": 5.220111846923828, "logps/chosen": -190.84645080566406, "logps/rejected": -187.39700317382812, "loss": 0.5899, "rewards/accuracies": 0.5, "rewards/chosen": -0.250689834356308, "rewards/margins": 0.32124096155166626, "rewards/rejected": -0.5719308257102966, "step": 5734 }, { "epoch": 0.8869128165474579, "grad_norm": 4.449574947357178, "learning_rate": 3.9131057394890595e-06, "logits/chosen": 6.834588050842285, "logits/rejected": 3.3841464519500732, "logps/chosen": -339.1780090332031, "logps/rejected": -362.3229675292969, "loss": 0.4309, "rewards/accuracies": 1.0, "rewards/chosen": 0.22625428438186646, "rewards/margins": 0.7454162240028381, "rewards/rejected": -0.5191619992256165, "step": 5735 }, { "epoch": 0.8870674656872222, "grad_norm": 8.110933303833008, "learning_rate": 3.912819337839386e-06, "logits/chosen": 12.218389511108398, "logits/rejected": 11.65676212310791, "logps/chosen": -240.7242431640625, "logps/rejected": -225.2874298095703, "loss": 0.8025, "rewards/accuracies": 0.625, "rewards/chosen": -0.0749661922454834, "rewards/margins": -0.06264910846948624, "rewards/rejected": -0.012317083775997162, "step": 5736 }, { "epoch": 0.8872221148269863, "grad_norm": 4.405845642089844, "learning_rate": 3.912532936189713e-06, "logits/chosen": 11.866279602050781, "logits/rejected": 9.245824813842773, "logps/chosen": -177.52191162109375, "logps/rejected": -194.79713439941406, "loss": 0.6054, "rewards/accuracies": 0.75, "rewards/chosen": -0.01137109100818634, "rewards/margins": 0.23183490335941315, "rewards/rejected": -0.2432059943675995, "step": 5737 }, { "epoch": 0.8873767639667505, "grad_norm": 5.082192420959473, "learning_rate": 3.912246534540039e-06, "logits/chosen": 10.702702522277832, "logits/rejected": 7.364471912384033, "logps/chosen": -271.68499755859375, "logps/rejected": -224.69876098632812, "loss": 0.589, "rewards/accuracies": 0.625, "rewards/chosen": -0.03307408094406128, "rewards/margins": 0.3397447466850281, "rewards/rejected": -0.37281882762908936, "step": 5738 }, { "epoch": 0.8875314131065146, "grad_norm": 4.297824382781982, "learning_rate": 3.911960132890365e-06, "logits/chosen": 7.88686466217041, "logits/rejected": 8.273067474365234, "logps/chosen": -246.26055908203125, "logps/rejected": -325.49468994140625, "loss": 0.5338, "rewards/accuracies": 0.75, "rewards/chosen": -0.057716239243745804, "rewards/margins": 0.4108290672302246, "rewards/rejected": -0.4685452878475189, "step": 5739 }, { "epoch": 0.8876860622462788, "grad_norm": 5.887892246246338, "learning_rate": 3.911673731240692e-06, "logits/chosen": 13.80660629272461, "logits/rejected": 10.44577693939209, "logps/chosen": -252.92649841308594, "logps/rejected": -194.51593017578125, "loss": 0.7662, "rewards/accuracies": 0.5, "rewards/chosen": 0.02310333400964737, "rewards/margins": -0.09983141720294952, "rewards/rejected": 0.1229347512125969, "step": 5740 }, { "epoch": 0.8878407113860429, "grad_norm": 6.024567127227783, "learning_rate": 3.9113873295910185e-06, "logits/chosen": 13.409719467163086, "logits/rejected": 5.196177005767822, "logps/chosen": -345.5506896972656, "logps/rejected": -216.5962371826172, "loss": 0.6392, "rewards/accuracies": 0.375, "rewards/chosen": -0.1940809190273285, "rewards/margins": 0.18006497621536255, "rewards/rejected": -0.37414589524269104, "step": 5741 }, { "epoch": 0.8879953605258071, "grad_norm": 7.745041847229004, "learning_rate": 3.911100927941345e-06, "logits/chosen": 8.966459274291992, "logits/rejected": 12.393768310546875, "logps/chosen": -259.3885498046875, "logps/rejected": -249.9786834716797, "loss": 1.1557, "rewards/accuracies": 0.375, "rewards/chosen": -0.6753278970718384, "rewards/margins": -0.6296943426132202, "rewards/rejected": -0.04563350975513458, "step": 5742 }, { "epoch": 0.8881500096655712, "grad_norm": 4.841660976409912, "learning_rate": 3.910814526291672e-06, "logits/chosen": 11.803377151489258, "logits/rejected": 6.779069900512695, "logps/chosen": -313.079345703125, "logps/rejected": -284.3074951171875, "loss": 0.4704, "rewards/accuracies": 0.75, "rewards/chosen": 0.1785726547241211, "rewards/margins": 0.9311029314994812, "rewards/rejected": -0.7525302767753601, "step": 5743 }, { "epoch": 0.8883046588053354, "grad_norm": 23.435745239257812, "learning_rate": 3.9105281246419985e-06, "logits/chosen": 14.145280838012695, "logits/rejected": 8.06881332397461, "logps/chosen": -397.1736755371094, "logps/rejected": -345.9590759277344, "loss": 0.5141, "rewards/accuracies": 0.75, "rewards/chosen": -0.23405838012695312, "rewards/margins": 0.568458080291748, "rewards/rejected": -0.8025164604187012, "step": 5744 }, { "epoch": 0.8884593079450995, "grad_norm": 6.681275367736816, "learning_rate": 3.910241722992324e-06, "logits/chosen": 9.403277397155762, "logits/rejected": 8.474343299865723, "logps/chosen": -235.55935668945312, "logps/rejected": -205.58026123046875, "loss": 0.6553, "rewards/accuracies": 0.375, "rewards/chosen": 0.03649824857711792, "rewards/margins": 0.1222074031829834, "rewards/rejected": -0.08570914715528488, "step": 5745 }, { "epoch": 0.8886139570848637, "grad_norm": 5.971920013427734, "learning_rate": 3.909955321342651e-06, "logits/chosen": 8.480854034423828, "logits/rejected": 11.802057266235352, "logps/chosen": -224.52545166015625, "logps/rejected": -267.1866760253906, "loss": 0.7041, "rewards/accuracies": 0.625, "rewards/chosen": -0.15403667092323303, "rewards/margins": 0.03144526481628418, "rewards/rejected": -0.1854819357395172, "step": 5746 }, { "epoch": 0.8887686062246278, "grad_norm": 5.7973551750183105, "learning_rate": 3.909668919692978e-06, "logits/chosen": 9.143149375915527, "logits/rejected": 5.627012252807617, "logps/chosen": -232.7069091796875, "logps/rejected": -234.0638885498047, "loss": 0.5269, "rewards/accuracies": 0.75, "rewards/chosen": -0.021738380193710327, "rewards/margins": 0.5361195206642151, "rewards/rejected": -0.5578579306602478, "step": 5747 }, { "epoch": 0.888923255364392, "grad_norm": 4.7776713371276855, "learning_rate": 3.909382518043304e-06, "logits/chosen": 14.661382675170898, "logits/rejected": 9.264200210571289, "logps/chosen": -228.35877990722656, "logps/rejected": -161.07286071777344, "loss": 0.5439, "rewards/accuracies": 0.75, "rewards/chosen": -0.030674919486045837, "rewards/margins": 0.5089076161384583, "rewards/rejected": -0.5395825505256653, "step": 5748 }, { "epoch": 0.8890779045041562, "grad_norm": 5.406339168548584, "learning_rate": 3.909096116393631e-06, "logits/chosen": 11.576033592224121, "logits/rejected": 7.081048011779785, "logps/chosen": -290.57452392578125, "logps/rejected": -216.6158447265625, "loss": 0.6072, "rewards/accuracies": 0.875, "rewards/chosen": 0.18026159703731537, "rewards/margins": 0.21082177758216858, "rewards/rejected": -0.030560161918401718, "step": 5749 }, { "epoch": 0.8892325536439204, "grad_norm": 4.738779544830322, "learning_rate": 3.9088097147439575e-06, "logits/chosen": 10.026796340942383, "logits/rejected": 7.0431084632873535, "logps/chosen": -335.5976867675781, "logps/rejected": -239.91140747070312, "loss": 0.5771, "rewards/accuracies": 0.875, "rewards/chosen": 0.20376934111118317, "rewards/margins": 0.35572391748428345, "rewards/rejected": -0.1519545614719391, "step": 5750 }, { "epoch": 0.8893872027836845, "grad_norm": 12.458568572998047, "learning_rate": 3.908523313094284e-06, "logits/chosen": 9.851884841918945, "logits/rejected": 9.982420921325684, "logps/chosen": -349.2794189453125, "logps/rejected": -337.798095703125, "loss": 1.131, "rewards/accuracies": 0.25, "rewards/chosen": -0.013603691011667252, "rewards/margins": -0.6422320604324341, "rewards/rejected": 0.6286283731460571, "step": 5751 }, { "epoch": 0.8895418519234487, "grad_norm": 5.155664443969727, "learning_rate": 3.90823691144461e-06, "logits/chosen": 10.588802337646484, "logits/rejected": 4.128095626831055, "logps/chosen": -350.32098388671875, "logps/rejected": -300.9133605957031, "loss": 0.4947, "rewards/accuracies": 0.875, "rewards/chosen": 0.27958473563194275, "rewards/margins": 0.4854336380958557, "rewards/rejected": -0.20584887266159058, "step": 5752 }, { "epoch": 0.8896965010632129, "grad_norm": 5.581695079803467, "learning_rate": 3.907950509794937e-06, "logits/chosen": 11.199490547180176, "logits/rejected": 7.459531307220459, "logps/chosen": -308.09283447265625, "logps/rejected": -214.06842041015625, "loss": 0.7749, "rewards/accuracies": 0.5, "rewards/chosen": -0.0329490564763546, "rewards/margins": -0.07803630828857422, "rewards/rejected": 0.04508723318576813, "step": 5753 }, { "epoch": 0.889851150202977, "grad_norm": 4.957764625549316, "learning_rate": 3.907664108145263e-06, "logits/chosen": 11.860799789428711, "logits/rejected": 8.487844467163086, "logps/chosen": -267.4550476074219, "logps/rejected": -239.7709197998047, "loss": 0.6991, "rewards/accuracies": 0.625, "rewards/chosen": 0.19502225518226624, "rewards/margins": 0.0836663544178009, "rewards/rejected": 0.11135593801736832, "step": 5754 }, { "epoch": 0.8900057993427412, "grad_norm": 5.874998092651367, "learning_rate": 3.90737770649559e-06, "logits/chosen": 8.549209594726562, "logits/rejected": 5.060385704040527, "logps/chosen": -258.3175048828125, "logps/rejected": -243.79141235351562, "loss": 0.7516, "rewards/accuracies": 0.625, "rewards/chosen": -0.1748601794242859, "rewards/margins": 0.006876006722450256, "rewards/rejected": -0.18173615634441376, "step": 5755 }, { "epoch": 0.8901604484825053, "grad_norm": 5.554418563842773, "learning_rate": 3.907091304845917e-06, "logits/chosen": 9.065581321716309, "logits/rejected": 7.799722671508789, "logps/chosen": -354.9833984375, "logps/rejected": -343.6627197265625, "loss": 0.6716, "rewards/accuracies": 0.625, "rewards/chosen": 0.40501782298088074, "rewards/margins": 0.25420254468917847, "rewards/rejected": 0.1508152335882187, "step": 5756 }, { "epoch": 0.8903150976222695, "grad_norm": 7.55493688583374, "learning_rate": 3.906804903196243e-06, "logits/chosen": 8.936965942382812, "logits/rejected": 5.624577522277832, "logps/chosen": -358.93316650390625, "logps/rejected": -289.907470703125, "loss": 0.8718, "rewards/accuracies": 0.375, "rewards/chosen": -0.06171342730522156, "rewards/margins": -0.2696279287338257, "rewards/rejected": 0.20791450142860413, "step": 5757 }, { "epoch": 0.8904697467620336, "grad_norm": 5.616508483886719, "learning_rate": 3.906518501546569e-06, "logits/chosen": 13.807989120483398, "logits/rejected": 3.765812397003174, "logps/chosen": -280.65093994140625, "logps/rejected": -157.28773498535156, "loss": 0.6601, "rewards/accuracies": 0.625, "rewards/chosen": 0.03565177693963051, "rewards/margins": 0.21808186173439026, "rewards/rejected": -0.18243007361888885, "step": 5758 }, { "epoch": 0.8906243959017978, "grad_norm": 4.631516933441162, "learning_rate": 3.906232099896896e-06, "logits/chosen": 11.538749694824219, "logits/rejected": 4.234883785247803, "logps/chosen": -353.98907470703125, "logps/rejected": -145.6070556640625, "loss": 0.4823, "rewards/accuracies": 0.75, "rewards/chosen": 0.3437071740627289, "rewards/margins": 0.6516937017440796, "rewards/rejected": -0.3079864978790283, "step": 5759 }, { "epoch": 0.8907790450415619, "grad_norm": 4.433241844177246, "learning_rate": 3.905945698247222e-06, "logits/chosen": 11.608865737915039, "logits/rejected": 9.918044090270996, "logps/chosen": -279.2489013671875, "logps/rejected": -190.66151428222656, "loss": 0.4964, "rewards/accuracies": 0.875, "rewards/chosen": 0.14460617303848267, "rewards/margins": 0.5654255151748657, "rewards/rejected": -0.42081934213638306, "step": 5760 }, { "epoch": 0.8909336941813261, "grad_norm": 5.150976657867432, "learning_rate": 3.905659296597549e-06, "logits/chosen": 12.157896041870117, "logits/rejected": 8.949045181274414, "logps/chosen": -330.6917724609375, "logps/rejected": -210.3186492919922, "loss": 0.5486, "rewards/accuracies": 0.75, "rewards/chosen": -0.04945487901568413, "rewards/margins": 0.4112991690635681, "rewards/rejected": -0.4607540965080261, "step": 5761 }, { "epoch": 0.8910883433210903, "grad_norm": 8.179576873779297, "learning_rate": 3.905372894947876e-06, "logits/chosen": 1.7145663499832153, "logits/rejected": 4.757074356079102, "logps/chosen": -290.9599609375, "logps/rejected": -332.01544189453125, "loss": 0.9075, "rewards/accuracies": 0.375, "rewards/chosen": 0.11309175193309784, "rewards/margins": -0.2051432579755783, "rewards/rejected": 0.31823503971099854, "step": 5762 }, { "epoch": 0.8912429924608545, "grad_norm": 5.04812479019165, "learning_rate": 3.9050864932982015e-06, "logits/chosen": 9.727849960327148, "logits/rejected": 4.04802942276001, "logps/chosen": -345.1971130371094, "logps/rejected": -179.4881134033203, "loss": 0.6163, "rewards/accuracies": 0.625, "rewards/chosen": 0.1700761318206787, "rewards/margins": 0.2689133286476135, "rewards/rejected": -0.09883718937635422, "step": 5763 }, { "epoch": 0.8913976416006186, "grad_norm": 4.066697120666504, "learning_rate": 3.904800091648528e-06, "logits/chosen": 12.286554336547852, "logits/rejected": 9.58411979675293, "logps/chosen": -320.670166015625, "logps/rejected": -276.7817077636719, "loss": 0.5486, "rewards/accuracies": 0.875, "rewards/chosen": 0.3345256745815277, "rewards/margins": 0.34571850299835205, "rewards/rejected": -0.011192798614501953, "step": 5764 }, { "epoch": 0.8915522907403828, "grad_norm": 5.4556427001953125, "learning_rate": 3.904513689998855e-06, "logits/chosen": 8.078201293945312, "logits/rejected": 4.984035491943359, "logps/chosen": -221.33428955078125, "logps/rejected": -187.317626953125, "loss": 0.6514, "rewards/accuracies": 0.5, "rewards/chosen": 0.03998975828289986, "rewards/margins": 0.2150421440601349, "rewards/rejected": -0.17505237460136414, "step": 5765 }, { "epoch": 0.8917069398801469, "grad_norm": 5.378304958343506, "learning_rate": 3.904227288349181e-06, "logits/chosen": 9.374171257019043, "logits/rejected": 10.058323860168457, "logps/chosen": -180.64031982421875, "logps/rejected": -172.59909057617188, "loss": 0.8512, "rewards/accuracies": 0.375, "rewards/chosen": -0.6234707832336426, "rewards/margins": -0.24978072941303253, "rewards/rejected": -0.37369006872177124, "step": 5766 }, { "epoch": 0.8918615890199111, "grad_norm": 6.537511348724365, "learning_rate": 3.903940886699508e-06, "logits/chosen": 6.561960220336914, "logits/rejected": 5.24894905090332, "logps/chosen": -378.9321594238281, "logps/rejected": -304.2655029296875, "loss": 0.7731, "rewards/accuracies": 0.375, "rewards/chosen": -0.023377999663352966, "rewards/margins": -0.09391117095947266, "rewards/rejected": 0.07053318619728088, "step": 5767 }, { "epoch": 0.8920162381596752, "grad_norm": 6.960522174835205, "learning_rate": 3.903654485049834e-06, "logits/chosen": 11.047513008117676, "logits/rejected": 6.493597984313965, "logps/chosen": -433.89202880859375, "logps/rejected": -383.3857421875, "loss": 0.7824, "rewards/accuracies": 0.5, "rewards/chosen": 0.3854230046272278, "rewards/margins": 0.029317021369934082, "rewards/rejected": 0.3561059534549713, "step": 5768 }, { "epoch": 0.8921708872994394, "grad_norm": 3.61096453666687, "learning_rate": 3.9033680834001605e-06, "logits/chosen": 16.009328842163086, "logits/rejected": 10.315284729003906, "logps/chosen": -265.6707458496094, "logps/rejected": -232.4981689453125, "loss": 0.4998, "rewards/accuracies": 0.875, "rewards/chosen": 0.15180665254592896, "rewards/margins": 0.5562466382980347, "rewards/rejected": -0.4044400453567505, "step": 5769 }, { "epoch": 0.8923255364392035, "grad_norm": 5.762599468231201, "learning_rate": 3.903081681750487e-06, "logits/chosen": 11.761422157287598, "logits/rejected": 6.324789524078369, "logps/chosen": -334.6190185546875, "logps/rejected": -259.453125, "loss": 0.5703, "rewards/accuracies": 0.625, "rewards/chosen": 0.10923900455236435, "rewards/margins": 0.4044661819934845, "rewards/rejected": -0.29522716999053955, "step": 5770 }, { "epoch": 0.8924801855789677, "grad_norm": 5.068279266357422, "learning_rate": 3.902795280100814e-06, "logits/chosen": 8.493657112121582, "logits/rejected": 8.671353340148926, "logps/chosen": -228.67816162109375, "logps/rejected": -275.82794189453125, "loss": 0.7354, "rewards/accuracies": 0.5, "rewards/chosen": 0.20717626810073853, "rewards/margins": 0.13601268827915192, "rewards/rejected": 0.07116355746984482, "step": 5771 }, { "epoch": 0.8926348347187318, "grad_norm": 47.15262985229492, "learning_rate": 3.90250887845114e-06, "logits/chosen": 7.927428245544434, "logits/rejected": 10.981467247009277, "logps/chosen": -226.99713134765625, "logps/rejected": -213.72265625, "loss": 0.8563, "rewards/accuracies": 0.625, "rewards/chosen": -0.08466038107872009, "rewards/margins": 0.02425430715084076, "rewards/rejected": -0.10891470313072205, "step": 5772 }, { "epoch": 0.892789483858496, "grad_norm": 6.1558685302734375, "learning_rate": 3.902222476801466e-06, "logits/chosen": 8.256296157836914, "logits/rejected": 7.800989627838135, "logps/chosen": -254.19973754882812, "logps/rejected": -296.0955810546875, "loss": 0.637, "rewards/accuracies": 0.625, "rewards/chosen": 0.18366794288158417, "rewards/margins": 0.22860854864120483, "rewards/rejected": -0.044940609484910965, "step": 5773 }, { "epoch": 0.8929441329982601, "grad_norm": 4.605040073394775, "learning_rate": 3.901936075151793e-06, "logits/chosen": 9.867841720581055, "logits/rejected": 10.04358196258545, "logps/chosen": -263.62872314453125, "logps/rejected": -207.45558166503906, "loss": 0.7324, "rewards/accuracies": 0.625, "rewards/chosen": -0.24722671508789062, "rewards/margins": -0.013812467455863953, "rewards/rejected": -0.23341424763202667, "step": 5774 }, { "epoch": 0.8930987821380244, "grad_norm": 10.984442710876465, "learning_rate": 3.9016496735021196e-06, "logits/chosen": 2.1906890869140625, "logits/rejected": 8.064613342285156, "logps/chosen": -209.891357421875, "logps/rejected": -261.04620361328125, "loss": 1.0817, "rewards/accuracies": 0.375, "rewards/chosen": -0.08369524776935577, "rewards/margins": -0.5420249700546265, "rewards/rejected": 0.4583296477794647, "step": 5775 }, { "epoch": 0.8932534312777886, "grad_norm": 5.522519111633301, "learning_rate": 3.901363271852446e-06, "logits/chosen": 7.920083999633789, "logits/rejected": 5.469483375549316, "logps/chosen": -245.44558715820312, "logps/rejected": -223.60574340820312, "loss": 0.5873, "rewards/accuracies": 0.75, "rewards/chosen": 0.1464330554008484, "rewards/margins": 0.3499521017074585, "rewards/rejected": -0.2035190612077713, "step": 5776 }, { "epoch": 0.8934080804175527, "grad_norm": 4.455384731292725, "learning_rate": 3.901076870202773e-06, "logits/chosen": 9.459773063659668, "logits/rejected": 8.106334686279297, "logps/chosen": -174.80857849121094, "logps/rejected": -115.38701629638672, "loss": 0.6269, "rewards/accuracies": 0.625, "rewards/chosen": 0.02055474743247032, "rewards/margins": 0.16783945262432098, "rewards/rejected": -0.14728470146656036, "step": 5777 }, { "epoch": 0.8935627295573169, "grad_norm": 4.525018692016602, "learning_rate": 3.900790468553099e-06, "logits/chosen": 11.904264450073242, "logits/rejected": 8.026816368103027, "logps/chosen": -293.42333984375, "logps/rejected": -207.0321502685547, "loss": 0.5365, "rewards/accuracies": 0.875, "rewards/chosen": 0.35818254947662354, "rewards/margins": 0.4342919588088989, "rewards/rejected": -0.07610940933227539, "step": 5778 }, { "epoch": 0.893717378697081, "grad_norm": 3.6988890171051025, "learning_rate": 3.900504066903425e-06, "logits/chosen": 12.633952140808105, "logits/rejected": 7.129212379455566, "logps/chosen": -356.6917419433594, "logps/rejected": -252.245849609375, "loss": 0.3564, "rewards/accuracies": 1.0, "rewards/chosen": 0.42897891998291016, "rewards/margins": 0.9579540491104126, "rewards/rejected": -0.5289751291275024, "step": 5779 }, { "epoch": 0.8938720278368452, "grad_norm": 6.133105754852295, "learning_rate": 3.900217665253752e-06, "logits/chosen": 5.24315881729126, "logits/rejected": 1.84628164768219, "logps/chosen": -300.00250244140625, "logps/rejected": -213.4097900390625, "loss": 0.748, "rewards/accuracies": 0.5, "rewards/chosen": 0.02803216129541397, "rewards/margins": -0.03245110064744949, "rewards/rejected": 0.06048326566815376, "step": 5780 }, { "epoch": 0.8940266769766093, "grad_norm": 6.531915664672852, "learning_rate": 3.899931263604079e-06, "logits/chosen": 9.105352401733398, "logits/rejected": 11.369983673095703, "logps/chosen": -336.5491638183594, "logps/rejected": -376.9744567871094, "loss": 0.7547, "rewards/accuracies": 0.375, "rewards/chosen": 0.08375931531190872, "rewards/margins": -0.029708579182624817, "rewards/rejected": 0.11346788704395294, "step": 5781 }, { "epoch": 0.8941813261163735, "grad_norm": 5.715112686157227, "learning_rate": 3.899644861954405e-06, "logits/chosen": 13.731454849243164, "logits/rejected": 9.523012161254883, "logps/chosen": -313.4877624511719, "logps/rejected": -247.133056640625, "loss": 0.6316, "rewards/accuracies": 0.75, "rewards/chosen": 0.2693215608596802, "rewards/margins": 0.21356190741062164, "rewards/rejected": 0.05575963854789734, "step": 5782 }, { "epoch": 0.8943359752561376, "grad_norm": 4.2412261962890625, "learning_rate": 3.899358460304732e-06, "logits/chosen": 9.01489543914795, "logits/rejected": 12.498173713684082, "logps/chosen": -167.2577667236328, "logps/rejected": -204.22789001464844, "loss": 0.6329, "rewards/accuracies": 0.75, "rewards/chosen": 0.13652697205543518, "rewards/margins": 0.1878114640712738, "rewards/rejected": -0.05128450319170952, "step": 5783 }, { "epoch": 0.8944906243959018, "grad_norm": 5.601959228515625, "learning_rate": 3.899072058655059e-06, "logits/chosen": 7.988924026489258, "logits/rejected": 8.908060073852539, "logps/chosen": -235.0262451171875, "logps/rejected": -220.17633056640625, "loss": 0.6915, "rewards/accuracies": 0.75, "rewards/chosen": 0.22519434988498688, "rewards/margins": 0.06586436927318573, "rewards/rejected": 0.15933001041412354, "step": 5784 }, { "epoch": 0.8946452735356659, "grad_norm": 4.819338798522949, "learning_rate": 3.898785657005384e-06, "logits/chosen": 4.816971778869629, "logits/rejected": 0.19012141227722168, "logps/chosen": -347.17156982421875, "logps/rejected": -243.0165557861328, "loss": 0.5642, "rewards/accuracies": 0.75, "rewards/chosen": 0.12997885048389435, "rewards/margins": 0.5381138920783997, "rewards/rejected": -0.4081350862979889, "step": 5785 }, { "epoch": 0.8947999226754301, "grad_norm": 4.035464286804199, "learning_rate": 3.898499255355711e-06, "logits/chosen": 9.308698654174805, "logits/rejected": 10.726720809936523, "logps/chosen": -282.947021484375, "logps/rejected": -245.0333709716797, "loss": 0.4728, "rewards/accuracies": 0.875, "rewards/chosen": 0.5799012780189514, "rewards/margins": 0.6192495822906494, "rewards/rejected": -0.03934831917285919, "step": 5786 }, { "epoch": 0.8949545718151943, "grad_norm": 6.591187953948975, "learning_rate": 3.898212853706038e-06, "logits/chosen": 14.6987943649292, "logits/rejected": 11.750274658203125, "logps/chosen": -337.1551513671875, "logps/rejected": -299.08544921875, "loss": 0.6348, "rewards/accuracies": 0.625, "rewards/chosen": 0.5019106864929199, "rewards/margins": 0.2939349412918091, "rewards/rejected": 0.20797570049762726, "step": 5787 }, { "epoch": 0.8951092209549585, "grad_norm": 5.288928508758545, "learning_rate": 3.897926452056364e-06, "logits/chosen": 10.313579559326172, "logits/rejected": 7.051687717437744, "logps/chosen": -322.61590576171875, "logps/rejected": -221.70352172851562, "loss": 0.6925, "rewards/accuracies": 0.5, "rewards/chosen": 0.046183258295059204, "rewards/margins": 0.06858716905117035, "rewards/rejected": -0.022403907030820847, "step": 5788 }, { "epoch": 0.8952638700947226, "grad_norm": 5.243613243103027, "learning_rate": 3.897640050406691e-06, "logits/chosen": 9.921998023986816, "logits/rejected": 5.1040825843811035, "logps/chosen": -206.79026794433594, "logps/rejected": -135.08091735839844, "loss": 0.7962, "rewards/accuracies": 0.125, "rewards/chosen": -0.22975994646549225, "rewards/margins": -0.17974111437797546, "rewards/rejected": -0.05001883953809738, "step": 5789 }, { "epoch": 0.8954185192344868, "grad_norm": 5.887124538421631, "learning_rate": 3.897353648757018e-06, "logits/chosen": 10.311300277709961, "logits/rejected": 8.953064918518066, "logps/chosen": -312.102294921875, "logps/rejected": -292.21954345703125, "loss": 0.7181, "rewards/accuracies": 0.25, "rewards/chosen": 0.3187662363052368, "rewards/margins": 0.08487636595964432, "rewards/rejected": 0.2338898479938507, "step": 5790 }, { "epoch": 0.8955731683742509, "grad_norm": 6.83258581161499, "learning_rate": 3.8970672471073434e-06, "logits/chosen": 10.823326110839844, "logits/rejected": 8.36181354522705, "logps/chosen": -184.07888793945312, "logps/rejected": -190.6837158203125, "loss": 0.665, "rewards/accuracies": 0.625, "rewards/chosen": 0.15010690689086914, "rewards/margins": 0.10632428526878357, "rewards/rejected": 0.043782614171504974, "step": 5791 }, { "epoch": 0.8957278175140151, "grad_norm": 6.705944061279297, "learning_rate": 3.89678084545767e-06, "logits/chosen": 8.686849594116211, "logits/rejected": 9.22749137878418, "logps/chosen": -331.6884460449219, "logps/rejected": -321.5303649902344, "loss": 0.6811, "rewards/accuracies": 0.375, "rewards/chosen": 0.16520243883132935, "rewards/margins": 0.06841883063316345, "rewards/rejected": 0.0967836007475853, "step": 5792 }, { "epoch": 0.8958824666537792, "grad_norm": 3.462826728820801, "learning_rate": 3.896494443807997e-06, "logits/chosen": 7.829308032989502, "logits/rejected": 4.918103218078613, "logps/chosen": -273.01983642578125, "logps/rejected": -204.1581268310547, "loss": 0.4569, "rewards/accuracies": 0.875, "rewards/chosen": 0.057319071143865585, "rewards/margins": 0.6962177157402039, "rewards/rejected": -0.6388986706733704, "step": 5793 }, { "epoch": 0.8960371157935434, "grad_norm": 6.72977352142334, "learning_rate": 3.896208042158323e-06, "logits/chosen": 10.112924575805664, "logits/rejected": 10.463385581970215, "logps/chosen": -262.98236083984375, "logps/rejected": -287.0904541015625, "loss": 0.8473, "rewards/accuracies": 0.25, "rewards/chosen": -0.1544014811515808, "rewards/margins": -0.26306837797164917, "rewards/rejected": 0.10866688936948776, "step": 5794 }, { "epoch": 0.8961917649333075, "grad_norm": 5.962997913360596, "learning_rate": 3.89592164050865e-06, "logits/chosen": 16.793039321899414, "logits/rejected": 10.594056129455566, "logps/chosen": -273.9061279296875, "logps/rejected": -188.57601928710938, "loss": 0.7511, "rewards/accuracies": 0.5, "rewards/chosen": -0.05273417755961418, "rewards/margins": -0.07121656835079193, "rewards/rejected": 0.018482400104403496, "step": 5795 }, { "epoch": 0.8963464140730717, "grad_norm": 8.835264205932617, "learning_rate": 3.895635238858977e-06, "logits/chosen": 6.181654453277588, "logits/rejected": 8.423907279968262, "logps/chosen": -205.4691925048828, "logps/rejected": -234.89208984375, "loss": 0.8139, "rewards/accuracies": 0.375, "rewards/chosen": -0.1505850851535797, "rewards/margins": -0.1840323507785797, "rewards/rejected": 0.033447265625, "step": 5796 }, { "epoch": 0.8965010632128358, "grad_norm": 29.765905380249023, "learning_rate": 3.8953488372093025e-06, "logits/chosen": 12.476085662841797, "logits/rejected": 11.049947738647461, "logps/chosen": -304.8553466796875, "logps/rejected": -280.56365966796875, "loss": 0.854, "rewards/accuracies": 0.25, "rewards/chosen": 0.13271498680114746, "rewards/margins": 0.13122093677520752, "rewards/rejected": 0.0014940500259399414, "step": 5797 }, { "epoch": 0.8966557123526, "grad_norm": 6.187057018280029, "learning_rate": 3.895062435559629e-06, "logits/chosen": 12.39517879486084, "logits/rejected": 14.127275466918945, "logps/chosen": -351.138671875, "logps/rejected": -495.7914733886719, "loss": 0.4518, "rewards/accuracies": 1.0, "rewards/chosen": 0.30968108773231506, "rewards/margins": 0.6748337745666504, "rewards/rejected": -0.36515265703201294, "step": 5798 }, { "epoch": 0.8968103614923641, "grad_norm": 4.8687639236450195, "learning_rate": 3.894776033909956e-06, "logits/chosen": 15.382537841796875, "logits/rejected": 12.328469276428223, "logps/chosen": -268.01007080078125, "logps/rejected": -227.9391326904297, "loss": 0.6959, "rewards/accuracies": 0.375, "rewards/chosen": -0.13318169116973877, "rewards/margins": 0.2268151044845581, "rewards/rejected": -0.3599967360496521, "step": 5799 }, { "epoch": 0.8969650106321284, "grad_norm": 5.825780391693115, "learning_rate": 3.8944896322602824e-06, "logits/chosen": 13.089054107666016, "logits/rejected": 10.024188041687012, "logps/chosen": -313.15887451171875, "logps/rejected": -254.19517517089844, "loss": 0.8508, "rewards/accuracies": 0.375, "rewards/chosen": 0.09526987373828888, "rewards/margins": -0.2415209859609604, "rewards/rejected": 0.3367908298969269, "step": 5800 }, { "epoch": 0.8971196597718926, "grad_norm": 4.953996181488037, "learning_rate": 3.894203230610609e-06, "logits/chosen": 11.274621963500977, "logits/rejected": 11.288661003112793, "logps/chosen": -317.3144226074219, "logps/rejected": -294.0598449707031, "loss": 0.6051, "rewards/accuracies": 0.625, "rewards/chosen": 0.5526087284088135, "rewards/margins": 0.27658313512802124, "rewards/rejected": 0.27602559328079224, "step": 5801 }, { "epoch": 0.8972743089116567, "grad_norm": 8.640522003173828, "learning_rate": 3.893916828960935e-06, "logits/chosen": 10.786994934082031, "logits/rejected": 9.769344329833984, "logps/chosen": -152.9961395263672, "logps/rejected": -166.99082946777344, "loss": 0.56, "rewards/accuracies": 0.75, "rewards/chosen": 0.025615647435188293, "rewards/margins": 0.329357773065567, "rewards/rejected": -0.30374211072921753, "step": 5802 }, { "epoch": 0.8974289580514209, "grad_norm": 31.095870971679688, "learning_rate": 3.8936304273112616e-06, "logits/chosen": 8.2977933883667, "logits/rejected": 6.608639717102051, "logps/chosen": -332.3779296875, "logps/rejected": -331.2200622558594, "loss": 0.91, "rewards/accuracies": 0.375, "rewards/chosen": -0.22739753127098083, "rewards/margins": -0.25158873200416565, "rewards/rejected": 0.024191195145249367, "step": 5803 }, { "epoch": 0.897583607191185, "grad_norm": 4.276152610778809, "learning_rate": 3.893344025661588e-06, "logits/chosen": 11.328424453735352, "logits/rejected": 8.60206127166748, "logps/chosen": -260.8816833496094, "logps/rejected": -265.866455078125, "loss": 0.5412, "rewards/accuracies": 0.75, "rewards/chosen": 0.5953232049942017, "rewards/margins": 0.5436924695968628, "rewards/rejected": 0.051630690693855286, "step": 5804 }, { "epoch": 0.8977382563309492, "grad_norm": 6.091341495513916, "learning_rate": 3.893057624011915e-06, "logits/chosen": 6.324405670166016, "logits/rejected": 5.587436199188232, "logps/chosen": -358.77520751953125, "logps/rejected": -389.6812744140625, "loss": 0.6475, "rewards/accuracies": 0.625, "rewards/chosen": 0.4374438226222992, "rewards/margins": 0.21011047065258026, "rewards/rejected": 0.22733336687088013, "step": 5805 }, { "epoch": 0.8978929054707133, "grad_norm": 5.2883124351501465, "learning_rate": 3.892771222362241e-06, "logits/chosen": 13.2877197265625, "logits/rejected": 10.083975791931152, "logps/chosen": -251.19290161132812, "logps/rejected": -229.6358642578125, "loss": 0.5871, "rewards/accuracies": 0.75, "rewards/chosen": -0.06448793411254883, "rewards/margins": 0.31439948081970215, "rewards/rejected": -0.378887414932251, "step": 5806 }, { "epoch": 0.8980475546104775, "grad_norm": 8.017000198364258, "learning_rate": 3.892484820712567e-06, "logits/chosen": 6.96412992477417, "logits/rejected": 9.324800491333008, "logps/chosen": -255.52098083496094, "logps/rejected": -245.35235595703125, "loss": 0.6797, "rewards/accuracies": 0.625, "rewards/chosen": 0.1656692624092102, "rewards/margins": 0.05077696591615677, "rewards/rejected": 0.11489230394363403, "step": 5807 }, { "epoch": 0.8982022037502416, "grad_norm": 5.281316757202148, "learning_rate": 3.892198419062894e-06, "logits/chosen": 8.622862815856934, "logits/rejected": 5.965861797332764, "logps/chosen": -274.3324279785156, "logps/rejected": -249.67575073242188, "loss": 0.6743, "rewards/accuracies": 0.375, "rewards/chosen": 0.601416826248169, "rewards/margins": 0.10115213692188263, "rewards/rejected": 0.5002647042274475, "step": 5808 }, { "epoch": 0.8983568528900058, "grad_norm": 5.0237908363342285, "learning_rate": 3.891912017413221e-06, "logits/chosen": 9.30994987487793, "logits/rejected": 7.526909828186035, "logps/chosen": -326.1006164550781, "logps/rejected": -221.7826385498047, "loss": 0.6668, "rewards/accuracies": 0.625, "rewards/chosen": -0.1453217715024948, "rewards/margins": 0.3931927978992462, "rewards/rejected": -0.5385145545005798, "step": 5809 }, { "epoch": 0.8985115020297699, "grad_norm": 4.522352695465088, "learning_rate": 3.891625615763547e-06, "logits/chosen": 7.3874053955078125, "logits/rejected": 5.093265533447266, "logps/chosen": -389.14251708984375, "logps/rejected": -225.38217163085938, "loss": 0.4979, "rewards/accuracies": 0.875, "rewards/chosen": 0.35242289304733276, "rewards/margins": 0.5075194239616394, "rewards/rejected": -0.15509653091430664, "step": 5810 }, { "epoch": 0.8986661511695341, "grad_norm": 9.168158531188965, "learning_rate": 3.891339214113873e-06, "logits/chosen": 11.820075035095215, "logits/rejected": 7.127833366394043, "logps/chosen": -189.8061981201172, "logps/rejected": -175.92935180664062, "loss": 0.7752, "rewards/accuracies": 0.5, "rewards/chosen": -0.08858819305896759, "rewards/margins": -0.014167554676532745, "rewards/rejected": -0.07442066073417664, "step": 5811 }, { "epoch": 0.8988208003092982, "grad_norm": 2.488330125808716, "learning_rate": 3.8910528124642e-06, "logits/chosen": 8.111259460449219, "logits/rejected": 6.724429130554199, "logps/chosen": -159.6066436767578, "logps/rejected": -91.45460510253906, "loss": 0.5147, "rewards/accuracies": 0.75, "rewards/chosen": 0.31005510687828064, "rewards/margins": 0.46373051404953003, "rewards/rejected": -0.15367542207241058, "step": 5812 }, { "epoch": 0.8989754494490625, "grad_norm": 6.011414527893066, "learning_rate": 3.890766410814526e-06, "logits/chosen": 13.762782096862793, "logits/rejected": 9.7354736328125, "logps/chosen": -298.726806640625, "logps/rejected": -300.033203125, "loss": 0.7355, "rewards/accuracies": 0.375, "rewards/chosen": 0.02225351333618164, "rewards/margins": 0.25399595499038696, "rewards/rejected": -0.23174244165420532, "step": 5813 }, { "epoch": 0.8991300985888266, "grad_norm": 4.9309916496276855, "learning_rate": 3.890480009164853e-06, "logits/chosen": 9.000768661499023, "logits/rejected": 3.162428379058838, "logps/chosen": -305.7404479980469, "logps/rejected": -220.20472717285156, "loss": 0.6613, "rewards/accuracies": 0.375, "rewards/chosen": -0.14583033323287964, "rewards/margins": 0.1452416628599167, "rewards/rejected": -0.2910720109939575, "step": 5814 }, { "epoch": 0.8992847477285908, "grad_norm": 5.011508464813232, "learning_rate": 3.89019360751518e-06, "logits/chosen": 8.8680419921875, "logits/rejected": 3.670546531677246, "logps/chosen": -301.1461181640625, "logps/rejected": -215.93003845214844, "loss": 0.6633, "rewards/accuracies": 0.5, "rewards/chosen": 0.23907470703125, "rewards/margins": 0.1076871007680893, "rewards/rejected": 0.1313876211643219, "step": 5815 }, { "epoch": 0.899439396868355, "grad_norm": 5.040525913238525, "learning_rate": 3.889907205865506e-06, "logits/chosen": 9.04183578491211, "logits/rejected": 5.215781211853027, "logps/chosen": -350.3896789550781, "logps/rejected": -216.79725646972656, "loss": 0.593, "rewards/accuracies": 0.75, "rewards/chosen": 0.42782342433929443, "rewards/margins": 0.27642130851745605, "rewards/rejected": 0.151402086019516, "step": 5816 }, { "epoch": 0.8995940460081191, "grad_norm": 4.033872127532959, "learning_rate": 3.889620804215832e-06, "logits/chosen": 6.798552989959717, "logits/rejected": 8.213788986206055, "logps/chosen": -226.74070739746094, "logps/rejected": -302.1864013671875, "loss": 0.4446, "rewards/accuracies": 0.875, "rewards/chosen": 0.2932452857494354, "rewards/margins": 0.6816126108169556, "rewards/rejected": -0.38836729526519775, "step": 5817 }, { "epoch": 0.8997486951478832, "grad_norm": 4.856017589569092, "learning_rate": 3.889334402566159e-06, "logits/chosen": 5.278564453125, "logits/rejected": 0.8399578332901001, "logps/chosen": -245.49124145507812, "logps/rejected": -242.11532592773438, "loss": 0.548, "rewards/accuracies": 0.625, "rewards/chosen": 0.2413884997367859, "rewards/margins": 0.5936083793640137, "rewards/rejected": -0.352219820022583, "step": 5818 }, { "epoch": 0.8999033442876474, "grad_norm": 4.567702770233154, "learning_rate": 3.889048000916485e-06, "logits/chosen": 11.040934562683105, "logits/rejected": 11.924667358398438, "logps/chosen": -220.0098876953125, "logps/rejected": -237.9245147705078, "loss": 0.6992, "rewards/accuracies": 0.375, "rewards/chosen": 0.12206556648015976, "rewards/margins": 0.167752206325531, "rewards/rejected": -0.045686691999435425, "step": 5819 }, { "epoch": 0.9000579934274116, "grad_norm": 5.837713718414307, "learning_rate": 3.888761599266812e-06, "logits/chosen": 5.77709436416626, "logits/rejected": 9.078289031982422, "logps/chosen": -188.1656494140625, "logps/rejected": -232.5315704345703, "loss": 0.6351, "rewards/accuracies": 0.625, "rewards/chosen": 0.18178953230381012, "rewards/margins": 0.17593033611774445, "rewards/rejected": 0.0058591775596141815, "step": 5820 }, { "epoch": 0.9002126425671757, "grad_norm": 5.338743686676025, "learning_rate": 3.888475197617139e-06, "logits/chosen": 8.580313682556152, "logits/rejected": 3.0343730449676514, "logps/chosen": -292.9908142089844, "logps/rejected": -265.82525634765625, "loss": 0.7346, "rewards/accuracies": 0.375, "rewards/chosen": 0.029426276683807373, "rewards/margins": 0.10101652145385742, "rewards/rejected": -0.07159022241830826, "step": 5821 }, { "epoch": 0.9003672917069399, "grad_norm": 5.040890216827393, "learning_rate": 3.888188795967465e-06, "logits/chosen": 8.20743179321289, "logits/rejected": 3.6761863231658936, "logps/chosen": -258.45263671875, "logps/rejected": -293.16754150390625, "loss": 0.5834, "rewards/accuracies": 0.625, "rewards/chosen": 0.4921242594718933, "rewards/margins": 0.4046463370323181, "rewards/rejected": 0.08747787773609161, "step": 5822 }, { "epoch": 0.900521940846704, "grad_norm": 8.992168426513672, "learning_rate": 3.887902394317792e-06, "logits/chosen": 7.730225086212158, "logits/rejected": 7.373764991760254, "logps/chosen": -226.39979553222656, "logps/rejected": -205.3111572265625, "loss": 0.8276, "rewards/accuracies": 0.75, "rewards/chosen": -0.023194849491119385, "rewards/margins": -0.12436148524284363, "rewards/rejected": 0.10116663575172424, "step": 5823 }, { "epoch": 0.9006765899864682, "grad_norm": 5.892737865447998, "learning_rate": 3.887615992668118e-06, "logits/chosen": 9.87824821472168, "logits/rejected": 10.255525588989258, "logps/chosen": -288.4353942871094, "logps/rejected": -303.8541259765625, "loss": 0.7292, "rewards/accuracies": 0.375, "rewards/chosen": 0.40973442792892456, "rewards/margins": -0.03363390266895294, "rewards/rejected": 0.4433683156967163, "step": 5824 }, { "epoch": 0.9008312391262323, "grad_norm": 6.744109153747559, "learning_rate": 3.8873295910184445e-06, "logits/chosen": 4.24338436126709, "logits/rejected": 5.201857089996338, "logps/chosen": -239.29898071289062, "logps/rejected": -419.9927978515625, "loss": 0.5876, "rewards/accuracies": 0.625, "rewards/chosen": 0.19418013095855713, "rewards/margins": 0.2867938280105591, "rewards/rejected": -0.09261371195316315, "step": 5825 }, { "epoch": 0.9009858882659966, "grad_norm": 6.569641590118408, "learning_rate": 3.887043189368771e-06, "logits/chosen": 13.829446792602539, "logits/rejected": 6.065988540649414, "logps/chosen": -368.6732177734375, "logps/rejected": -219.236083984375, "loss": 0.7739, "rewards/accuracies": 0.5, "rewards/chosen": -0.09992039203643799, "rewards/margins": -0.09991222620010376, "rewards/rejected": -8.158385753631592e-06, "step": 5826 }, { "epoch": 0.9011405374057607, "grad_norm": 5.553526878356934, "learning_rate": 3.886756787719098e-06, "logits/chosen": 16.383296966552734, "logits/rejected": 12.941035270690918, "logps/chosen": -240.0606689453125, "logps/rejected": -224.10797119140625, "loss": 0.7538, "rewards/accuracies": 0.375, "rewards/chosen": -0.0914207398891449, "rewards/margins": -0.08539792150259018, "rewards/rejected": -0.006022820249199867, "step": 5827 }, { "epoch": 0.9012951865455249, "grad_norm": 4.780274391174316, "learning_rate": 3.8864703860694244e-06, "logits/chosen": 10.414852142333984, "logits/rejected": 11.135489463806152, "logps/chosen": -263.7305603027344, "logps/rejected": -256.6584777832031, "loss": 0.4665, "rewards/accuracies": 1.0, "rewards/chosen": 0.5535441637039185, "rewards/margins": 0.56716388463974, "rewards/rejected": -0.013619698584079742, "step": 5828 }, { "epoch": 0.901449835685289, "grad_norm": 6.247519016265869, "learning_rate": 3.886183984419751e-06, "logits/chosen": 9.281834602355957, "logits/rejected": 7.712191581726074, "logps/chosen": -241.29293823242188, "logps/rejected": -276.02960205078125, "loss": 0.6365, "rewards/accuracies": 0.625, "rewards/chosen": -0.0068801045417785645, "rewards/margins": 0.23522864282131195, "rewards/rejected": -0.24210873246192932, "step": 5829 }, { "epoch": 0.9016044848250532, "grad_norm": 5.402223587036133, "learning_rate": 3.885897582770078e-06, "logits/chosen": 7.422797203063965, "logits/rejected": 5.952421188354492, "logps/chosen": -231.46493530273438, "logps/rejected": -184.68133544921875, "loss": 0.6886, "rewards/accuracies": 0.625, "rewards/chosen": -0.10784335434436798, "rewards/margins": 0.022812752053141594, "rewards/rejected": -0.13065610826015472, "step": 5830 }, { "epoch": 0.9017591339648173, "grad_norm": 4.328973293304443, "learning_rate": 3.8856111811204035e-06, "logits/chosen": 6.162834167480469, "logits/rejected": 6.610774993896484, "logps/chosen": -272.64263916015625, "logps/rejected": -234.24659729003906, "loss": 0.5011, "rewards/accuracies": 0.875, "rewards/chosen": 0.5779678821563721, "rewards/margins": 0.5311887264251709, "rewards/rejected": 0.04677905887365341, "step": 5831 }, { "epoch": 0.9019137831045815, "grad_norm": 6.595884799957275, "learning_rate": 3.88532477947073e-06, "logits/chosen": 9.275840759277344, "logits/rejected": 9.328700065612793, "logps/chosen": -373.49810791015625, "logps/rejected": -321.27667236328125, "loss": 0.8494, "rewards/accuracies": 0.375, "rewards/chosen": 0.19307613372802734, "rewards/margins": -0.23838338255882263, "rewards/rejected": 0.43145954608917236, "step": 5832 }, { "epoch": 0.9020684322443456, "grad_norm": 5.678833961486816, "learning_rate": 3.885038377821057e-06, "logits/chosen": 5.063775539398193, "logits/rejected": -0.3743937611579895, "logps/chosen": -266.460205078125, "logps/rejected": -161.34249877929688, "loss": 0.5961, "rewards/accuracies": 0.5, "rewards/chosen": 0.0388980358839035, "rewards/margins": 0.3764452338218689, "rewards/rejected": -0.3375471830368042, "step": 5833 }, { "epoch": 0.9022230813841098, "grad_norm": 7.250502586364746, "learning_rate": 3.8847519761713835e-06, "logits/chosen": 5.888790130615234, "logits/rejected": 15.33763313293457, "logps/chosen": -192.1502227783203, "logps/rejected": -295.3578796386719, "loss": 0.9184, "rewards/accuracies": 0.375, "rewards/chosen": -0.28777143359184265, "rewards/margins": -0.33882731199264526, "rewards/rejected": 0.0510559044778347, "step": 5834 }, { "epoch": 0.9023777305238739, "grad_norm": 4.430229663848877, "learning_rate": 3.884465574521709e-06, "logits/chosen": 10.761627197265625, "logits/rejected": 9.028371810913086, "logps/chosen": -318.77203369140625, "logps/rejected": -245.9497833251953, "loss": 0.4878, "rewards/accuracies": 0.875, "rewards/chosen": 0.41721418499946594, "rewards/margins": 0.5311620831489563, "rewards/rejected": -0.11394791305065155, "step": 5835 }, { "epoch": 0.9025323796636381, "grad_norm": 5.535238265991211, "learning_rate": 3.884179172872036e-06, "logits/chosen": 9.515908241271973, "logits/rejected": 8.492105484008789, "logps/chosen": -295.0663146972656, "logps/rejected": -241.23912048339844, "loss": 0.6869, "rewards/accuracies": 0.5, "rewards/chosen": 0.07690748572349548, "rewards/margins": 0.11782458424568176, "rewards/rejected": -0.04091709852218628, "step": 5836 }, { "epoch": 0.9026870288034022, "grad_norm": 6.513454437255859, "learning_rate": 3.883892771222363e-06, "logits/chosen": 5.844347953796387, "logits/rejected": 9.307866096496582, "logps/chosen": -309.8131103515625, "logps/rejected": -323.50836181640625, "loss": 0.8676, "rewards/accuracies": 0.5, "rewards/chosen": -0.19861078262329102, "rewards/margins": -0.1942564845085144, "rewards/rejected": -0.0043542832136154175, "step": 5837 }, { "epoch": 0.9028416779431664, "grad_norm": 3.3172013759613037, "learning_rate": 3.883606369572689e-06, "logits/chosen": 6.828248023986816, "logits/rejected": 5.912540435791016, "logps/chosen": -203.55136108398438, "logps/rejected": -220.48294067382812, "loss": 0.6378, "rewards/accuracies": 0.375, "rewards/chosen": 0.16925562918186188, "rewards/margins": 0.2474106401205063, "rewards/rejected": -0.078155018389225, "step": 5838 }, { "epoch": 0.9029963270829306, "grad_norm": 4.391114234924316, "learning_rate": 3.883319967923016e-06, "logits/chosen": 15.383584976196289, "logits/rejected": 10.069034576416016, "logps/chosen": -320.87017822265625, "logps/rejected": -264.7410888671875, "loss": 0.5722, "rewards/accuracies": 0.625, "rewards/chosen": 0.18732596933841705, "rewards/margins": 0.34394752979278564, "rewards/rejected": -0.15662159025669098, "step": 5839 }, { "epoch": 0.9031509762226948, "grad_norm": 5.451707363128662, "learning_rate": 3.883033566273342e-06, "logits/chosen": 13.697030067443848, "logits/rejected": 13.811174392700195, "logps/chosen": -322.304931640625, "logps/rejected": -282.4361877441406, "loss": 0.578, "rewards/accuracies": 0.875, "rewards/chosen": 0.28093957901000977, "rewards/margins": 0.3700934648513794, "rewards/rejected": -0.08915385603904724, "step": 5840 }, { "epoch": 0.903305625362459, "grad_norm": 6.3685221672058105, "learning_rate": 3.882747164623668e-06, "logits/chosen": 8.442115783691406, "logits/rejected": 4.736579895019531, "logps/chosen": -252.9369659423828, "logps/rejected": -242.43760681152344, "loss": 0.5933, "rewards/accuracies": 0.75, "rewards/chosen": 0.2573397755622864, "rewards/margins": 0.3026355803012848, "rewards/rejected": -0.0452958345413208, "step": 5841 }, { "epoch": 0.9034602745022231, "grad_norm": 4.195534706115723, "learning_rate": 3.882460762973995e-06, "logits/chosen": 5.978053092956543, "logits/rejected": 6.568967819213867, "logps/chosen": -168.0675811767578, "logps/rejected": -186.30543518066406, "loss": 0.7227, "rewards/accuracies": 0.5, "rewards/chosen": 0.06088666617870331, "rewards/margins": -0.024186521768569946, "rewards/rejected": 0.08507319539785385, "step": 5842 }, { "epoch": 0.9036149236419873, "grad_norm": 3.672985076904297, "learning_rate": 3.882174361324322e-06, "logits/chosen": 9.157450675964355, "logits/rejected": 9.113726615905762, "logps/chosen": -236.30455017089844, "logps/rejected": -247.42835998535156, "loss": 0.5047, "rewards/accuracies": 0.75, "rewards/chosen": 0.00868912786245346, "rewards/margins": 0.6104499101638794, "rewards/rejected": -0.601760745048523, "step": 5843 }, { "epoch": 0.9037695727817514, "grad_norm": 5.554665565490723, "learning_rate": 3.8818879596746474e-06, "logits/chosen": 8.053369522094727, "logits/rejected": 5.807934284210205, "logps/chosen": -224.80792236328125, "logps/rejected": -194.58795166015625, "loss": 0.735, "rewards/accuracies": 0.25, "rewards/chosen": 0.18455220758914948, "rewards/margins": 0.12116733938455582, "rewards/rejected": 0.06338487565517426, "step": 5844 }, { "epoch": 0.9039242219215156, "grad_norm": 4.206684589385986, "learning_rate": 3.881601558024974e-06, "logits/chosen": 9.244462966918945, "logits/rejected": 5.271476745605469, "logps/chosen": -325.0238037109375, "logps/rejected": -303.20501708984375, "loss": 0.4118, "rewards/accuracies": 1.0, "rewards/chosen": 0.37349826097488403, "rewards/margins": 0.764920175075531, "rewards/rejected": -0.3914218842983246, "step": 5845 }, { "epoch": 0.9040788710612797, "grad_norm": 5.245149612426758, "learning_rate": 3.881315156375301e-06, "logits/chosen": 11.899967193603516, "logits/rejected": 1.9415113925933838, "logps/chosen": -196.98587036132812, "logps/rejected": -125.8809585571289, "loss": 0.6777, "rewards/accuracies": 0.75, "rewards/chosen": 0.04752540588378906, "rewards/margins": 0.27655839920043945, "rewards/rejected": -0.2290329486131668, "step": 5846 }, { "epoch": 0.9042335202010439, "grad_norm": 4.382696151733398, "learning_rate": 3.881028754725627e-06, "logits/chosen": 12.819002151489258, "logits/rejected": 5.692922592163086, "logps/chosen": -309.71807861328125, "logps/rejected": -232.36790466308594, "loss": 0.5486, "rewards/accuracies": 0.625, "rewards/chosen": 0.03396128863096237, "rewards/margins": 0.487964004278183, "rewards/rejected": -0.4540027379989624, "step": 5847 }, { "epoch": 0.904388169340808, "grad_norm": 5.878704071044922, "learning_rate": 3.880742353075954e-06, "logits/chosen": 13.20284652709961, "logits/rejected": 4.059720516204834, "logps/chosen": -360.00994873046875, "logps/rejected": -260.7804870605469, "loss": 0.6414, "rewards/accuracies": 0.625, "rewards/chosen": 0.18025809526443481, "rewards/margins": 0.1990213394165039, "rewards/rejected": -0.018763268366456032, "step": 5848 }, { "epoch": 0.9045428184805722, "grad_norm": 5.185843467712402, "learning_rate": 3.880455951426281e-06, "logits/chosen": 13.477865219116211, "logits/rejected": 11.918599128723145, "logps/chosen": -309.1504821777344, "logps/rejected": -299.9570617675781, "loss": 0.535, "rewards/accuracies": 0.75, "rewards/chosen": -0.16140177845954895, "rewards/margins": 0.5257657170295715, "rewards/rejected": -0.6871674656867981, "step": 5849 }, { "epoch": 0.9046974676203363, "grad_norm": 5.3445281982421875, "learning_rate": 3.8801695497766065e-06, "logits/chosen": 6.33530855178833, "logits/rejected": 12.506031036376953, "logps/chosen": -176.27145385742188, "logps/rejected": -191.63589477539062, "loss": 0.6987, "rewards/accuracies": 0.625, "rewards/chosen": 0.11392469704151154, "rewards/margins": 0.0710691586136818, "rewards/rejected": 0.04285553842782974, "step": 5850 }, { "epoch": 0.9048521167601005, "grad_norm": 6.441342830657959, "learning_rate": 3.879883148126933e-06, "logits/chosen": 7.998721599578857, "logits/rejected": 11.967599868774414, "logps/chosen": -260.7688903808594, "logps/rejected": -303.0032958984375, "loss": 0.73, "rewards/accuracies": 0.625, "rewards/chosen": 0.389772891998291, "rewards/margins": 0.03484584391117096, "rewards/rejected": 0.35492706298828125, "step": 5851 }, { "epoch": 0.9050067658998647, "grad_norm": 3.743983745574951, "learning_rate": 3.87959674647726e-06, "logits/chosen": 13.090093612670898, "logits/rejected": 6.495992660522461, "logps/chosen": -415.82757568359375, "logps/rejected": -282.7840576171875, "loss": 0.4181, "rewards/accuracies": 0.875, "rewards/chosen": 0.42210546135902405, "rewards/margins": 0.7916517853736877, "rewards/rejected": -0.3695463538169861, "step": 5852 }, { "epoch": 0.9051614150396289, "grad_norm": 9.290580749511719, "learning_rate": 3.8793103448275865e-06, "logits/chosen": 4.171314716339111, "logits/rejected": 7.288641929626465, "logps/chosen": -276.3918151855469, "logps/rejected": -342.0833740234375, "loss": 0.7821, "rewards/accuracies": 0.375, "rewards/chosen": 0.2479313164949417, "rewards/margins": -0.11495855450630188, "rewards/rejected": 0.3628898859024048, "step": 5853 }, { "epoch": 0.905316064179393, "grad_norm": 3.8928864002227783, "learning_rate": 3.879023943177913e-06, "logits/chosen": 10.509458541870117, "logits/rejected": 9.40723991394043, "logps/chosen": -231.02806091308594, "logps/rejected": -235.6477813720703, "loss": 0.496, "rewards/accuracies": 0.875, "rewards/chosen": 0.5517292618751526, "rewards/margins": 0.5025173425674438, "rewards/rejected": 0.04921187460422516, "step": 5854 }, { "epoch": 0.9054707133191572, "grad_norm": 4.945900917053223, "learning_rate": 3.87873754152824e-06, "logits/chosen": 10.313854217529297, "logits/rejected": 8.259241104125977, "logps/chosen": -382.12152099609375, "logps/rejected": -315.88916015625, "loss": 0.5726, "rewards/accuracies": 0.5, "rewards/chosen": -0.04192980006337166, "rewards/margins": 0.41825219988822937, "rewards/rejected": -0.46018195152282715, "step": 5855 }, { "epoch": 0.9056253624589213, "grad_norm": 3.7689552307128906, "learning_rate": 3.878451139878566e-06, "logits/chosen": 16.6031551361084, "logits/rejected": 9.461174964904785, "logps/chosen": -236.73219299316406, "logps/rejected": -127.3540267944336, "loss": 0.5949, "rewards/accuracies": 0.625, "rewards/chosen": 0.21787521243095398, "rewards/margins": 0.31660696864128113, "rewards/rejected": -0.09873175621032715, "step": 5856 }, { "epoch": 0.9057800115986855, "grad_norm": 5.5852837562561035, "learning_rate": 3.878164738228892e-06, "logits/chosen": 14.486068725585938, "logits/rejected": 9.360355377197266, "logps/chosen": -342.8150939941406, "logps/rejected": -335.119140625, "loss": 0.5652, "rewards/accuracies": 0.875, "rewards/chosen": 0.3054373562335968, "rewards/margins": 0.2915671467781067, "rewards/rejected": 0.013870231807231903, "step": 5857 }, { "epoch": 0.9059346607384496, "grad_norm": 3.898383140563965, "learning_rate": 3.877878336579219e-06, "logits/chosen": 4.096169471740723, "logits/rejected": 6.356062412261963, "logps/chosen": -142.49746704101562, "logps/rejected": -168.0295867919922, "loss": 0.572, "rewards/accuracies": 0.75, "rewards/chosen": -0.19904474914073944, "rewards/margins": 0.2963623106479645, "rewards/rejected": -0.4954070448875427, "step": 5858 }, { "epoch": 0.9060893098782138, "grad_norm": 5.5752763748168945, "learning_rate": 3.8775919349295455e-06, "logits/chosen": 10.031557083129883, "logits/rejected": 7.413594722747803, "logps/chosen": -392.9423828125, "logps/rejected": -298.0181884765625, "loss": 0.6128, "rewards/accuracies": 0.75, "rewards/chosen": 0.3351958990097046, "rewards/margins": 0.42352375388145447, "rewards/rejected": -0.08832788467407227, "step": 5859 }, { "epoch": 0.9062439590179779, "grad_norm": 4.158720970153809, "learning_rate": 3.877305533279872e-06, "logits/chosen": 14.963878631591797, "logits/rejected": 8.858963012695312, "logps/chosen": -257.9062194824219, "logps/rejected": -264.0928955078125, "loss": 0.5223, "rewards/accuracies": 0.875, "rewards/chosen": 0.037215426564216614, "rewards/margins": 0.46270790696144104, "rewards/rejected": -0.4254924952983856, "step": 5860 }, { "epoch": 0.9063986081577421, "grad_norm": 5.162186145782471, "learning_rate": 3.877019131630199e-06, "logits/chosen": 7.1378889083862305, "logits/rejected": 10.812788009643555, "logps/chosen": -273.01812744140625, "logps/rejected": -304.15380859375, "loss": 0.6644, "rewards/accuracies": 0.5, "rewards/chosen": 0.12640608847141266, "rewards/margins": 0.11050014197826385, "rewards/rejected": 0.015905946493148804, "step": 5861 }, { "epoch": 0.9065532572975062, "grad_norm": 4.256577968597412, "learning_rate": 3.8767327299805255e-06, "logits/chosen": 13.968976974487305, "logits/rejected": 7.623793601989746, "logps/chosen": -260.9290771484375, "logps/rejected": -183.25228881835938, "loss": 0.5722, "rewards/accuracies": 0.625, "rewards/chosen": 0.2127974033355713, "rewards/margins": 0.6174132823944092, "rewards/rejected": -0.4046158790588379, "step": 5862 }, { "epoch": 0.9067079064372704, "grad_norm": 7.413290023803711, "learning_rate": 3.876446328330852e-06, "logits/chosen": 10.745857238769531, "logits/rejected": 8.528212547302246, "logps/chosen": -255.7159423828125, "logps/rejected": -263.3093566894531, "loss": 0.7381, "rewards/accuracies": 0.625, "rewards/chosen": 0.4664587676525116, "rewards/margins": 0.037835799157619476, "rewards/rejected": 0.4286229610443115, "step": 5863 }, { "epoch": 0.9068625555770347, "grad_norm": 6.548617362976074, "learning_rate": 3.876159926681178e-06, "logits/chosen": 9.338762283325195, "logits/rejected": 9.32027530670166, "logps/chosen": -258.281494140625, "logps/rejected": -240.11874389648438, "loss": 0.686, "rewards/accuracies": 0.625, "rewards/chosen": 0.19616562128067017, "rewards/margins": 0.14268246293067932, "rewards/rejected": 0.05348314344882965, "step": 5864 }, { "epoch": 0.9070172047167988, "grad_norm": 5.089291572570801, "learning_rate": 3.8758735250315046e-06, "logits/chosen": 12.281780242919922, "logits/rejected": 4.380681037902832, "logps/chosen": -362.73504638671875, "logps/rejected": -196.89303588867188, "loss": 0.6017, "rewards/accuracies": 0.75, "rewards/chosen": 0.02687111310660839, "rewards/margins": 0.3171910047531128, "rewards/rejected": -0.29031985998153687, "step": 5865 }, { "epoch": 0.907171853856563, "grad_norm": 6.04975700378418, "learning_rate": 3.875587123381831e-06, "logits/chosen": 8.875252723693848, "logits/rejected": 7.564862251281738, "logps/chosen": -418.327392578125, "logps/rejected": -330.35064697265625, "loss": 0.6601, "rewards/accuracies": 0.5, "rewards/chosen": 0.367825984954834, "rewards/margins": 0.1807415783405304, "rewards/rejected": 0.1870843768119812, "step": 5866 }, { "epoch": 0.9073265029963271, "grad_norm": 6.029545307159424, "learning_rate": 3.875300721732158e-06, "logits/chosen": 11.076995849609375, "logits/rejected": 12.436990737915039, "logps/chosen": -222.04310607910156, "logps/rejected": -245.8939208984375, "loss": 0.7228, "rewards/accuracies": 0.625, "rewards/chosen": 0.08270444720983505, "rewards/margins": 0.06755058467388153, "rewards/rejected": 0.015153884887695312, "step": 5867 }, { "epoch": 0.9074811521360913, "grad_norm": 5.62288236618042, "learning_rate": 3.8750143200824845e-06, "logits/chosen": 3.1694464683532715, "logits/rejected": 4.831650733947754, "logps/chosen": -183.48605346679688, "logps/rejected": -235.12632751464844, "loss": 0.8222, "rewards/accuracies": 0.375, "rewards/chosen": 0.017087791115045547, "rewards/margins": -0.1821080446243286, "rewards/rejected": 0.19919581711292267, "step": 5868 }, { "epoch": 0.9076358012758554, "grad_norm": 6.17611837387085, "learning_rate": 3.87472791843281e-06, "logits/chosen": 9.089207649230957, "logits/rejected": 13.079728126525879, "logps/chosen": -273.21484375, "logps/rejected": -363.84228515625, "loss": 0.7125, "rewards/accuracies": 0.75, "rewards/chosen": 0.07337423413991928, "rewards/margins": 0.09915758669376373, "rewards/rejected": -0.025783345103263855, "step": 5869 }, { "epoch": 0.9077904504156196, "grad_norm": 5.846744537353516, "learning_rate": 3.874441516783137e-06, "logits/chosen": 10.480364799499512, "logits/rejected": 9.511070251464844, "logps/chosen": -344.05462646484375, "logps/rejected": -262.875, "loss": 0.6223, "rewards/accuracies": 0.625, "rewards/chosen": 0.04386405646800995, "rewards/margins": 0.2818721532821655, "rewards/rejected": -0.23800812661647797, "step": 5870 }, { "epoch": 0.9079450995553837, "grad_norm": 6.28924036026001, "learning_rate": 3.874155115133464e-06, "logits/chosen": 10.907773971557617, "logits/rejected": 11.629669189453125, "logps/chosen": -269.60296630859375, "logps/rejected": -312.3037109375, "loss": 0.8594, "rewards/accuracies": 0.375, "rewards/chosen": 0.22646427154541016, "rewards/margins": -0.22511149942874908, "rewards/rejected": 0.45157578587532043, "step": 5871 }, { "epoch": 0.9080997486951479, "grad_norm": 6.138974666595459, "learning_rate": 3.87386871348379e-06, "logits/chosen": 8.09500503540039, "logits/rejected": 4.935572147369385, "logps/chosen": -317.02227783203125, "logps/rejected": -300.6366882324219, "loss": 0.7205, "rewards/accuracies": 0.5, "rewards/chosen": 0.3320663571357727, "rewards/margins": 0.04267704486846924, "rewards/rejected": 0.28938931226730347, "step": 5872 }, { "epoch": 0.908254397834912, "grad_norm": 5.642059326171875, "learning_rate": 3.873582311834116e-06, "logits/chosen": 12.37524127960205, "logits/rejected": 10.193174362182617, "logps/chosen": -277.6837463378906, "logps/rejected": -266.68084716796875, "loss": 0.6788, "rewards/accuracies": 0.5, "rewards/chosen": 0.08328705281019211, "rewards/margins": 0.1363345831632614, "rewards/rejected": -0.05304756388068199, "step": 5873 }, { "epoch": 0.9084090469746762, "grad_norm": 6.166357517242432, "learning_rate": 3.873295910184443e-06, "logits/chosen": 6.426780700683594, "logits/rejected": 9.414957046508789, "logps/chosen": -209.12893676757812, "logps/rejected": -238.20860290527344, "loss": 0.8466, "rewards/accuracies": 0.25, "rewards/chosen": -0.14154444634914398, "rewards/margins": -0.1155434399843216, "rewards/rejected": -0.026000961661338806, "step": 5874 }, { "epoch": 0.9085636961144403, "grad_norm": 6.2518510818481445, "learning_rate": 3.873009508534769e-06, "logits/chosen": 14.87118148803711, "logits/rejected": 8.895995140075684, "logps/chosen": -314.4089660644531, "logps/rejected": -231.42095947265625, "loss": 0.6886, "rewards/accuracies": 0.375, "rewards/chosen": 0.13724622130393982, "rewards/margins": 0.10886593163013458, "rewards/rejected": 0.028380297124385834, "step": 5875 }, { "epoch": 0.9087183452542045, "grad_norm": 6.266149044036865, "learning_rate": 3.872723106885096e-06, "logits/chosen": 12.845640182495117, "logits/rejected": 10.37359619140625, "logps/chosen": -322.1435546875, "logps/rejected": -274.1820068359375, "loss": 0.739, "rewards/accuracies": 0.625, "rewards/chosen": -0.2960757613182068, "rewards/margins": 0.018304161727428436, "rewards/rejected": -0.3143799304962158, "step": 5876 }, { "epoch": 0.9088729943939687, "grad_norm": 5.608229160308838, "learning_rate": 3.872436705235423e-06, "logits/chosen": 15.693495750427246, "logits/rejected": 10.589967727661133, "logps/chosen": -354.5541076660156, "logps/rejected": -338.3266906738281, "loss": 0.6245, "rewards/accuracies": 0.75, "rewards/chosen": 0.5042687654495239, "rewards/margins": 0.1902521252632141, "rewards/rejected": 0.3140166401863098, "step": 5877 }, { "epoch": 0.9090276435337329, "grad_norm": 5.700530052185059, "learning_rate": 3.8721503035857485e-06, "logits/chosen": 9.475207328796387, "logits/rejected": 9.510040283203125, "logps/chosen": -368.9979553222656, "logps/rejected": -413.1342468261719, "loss": 0.7633, "rewards/accuracies": 0.625, "rewards/chosen": 0.24829746782779694, "rewards/margins": 0.0990840345621109, "rewards/rejected": 0.14921341836452484, "step": 5878 }, { "epoch": 0.909182292673497, "grad_norm": 5.58975076675415, "learning_rate": 3.871863901936075e-06, "logits/chosen": 6.354435920715332, "logits/rejected": 6.9265336990356445, "logps/chosen": -191.07354736328125, "logps/rejected": -258.0149841308594, "loss": 0.5965, "rewards/accuracies": 0.5, "rewards/chosen": 0.16558726131916046, "rewards/margins": 0.3360985517501831, "rewards/rejected": -0.17051129043102264, "step": 5879 }, { "epoch": 0.9093369418132612, "grad_norm": 8.578877449035645, "learning_rate": 3.871577500286402e-06, "logits/chosen": 13.14071273803711, "logits/rejected": 9.55809211730957, "logps/chosen": -295.3739013671875, "logps/rejected": -231.53436279296875, "loss": 0.7492, "rewards/accuracies": 0.5, "rewards/chosen": 0.4801799952983856, "rewards/margins": 0.06567084789276123, "rewards/rejected": 0.4145091474056244, "step": 5880 }, { "epoch": 0.9094915909530253, "grad_norm": 5.274852275848389, "learning_rate": 3.8712910986367284e-06, "logits/chosen": 10.39631462097168, "logits/rejected": 9.390652656555176, "logps/chosen": -281.65899658203125, "logps/rejected": -277.39471435546875, "loss": 0.6186, "rewards/accuracies": 0.5, "rewards/chosen": 0.27159056067466736, "rewards/margins": 0.44433337450027466, "rewards/rejected": -0.1727428287267685, "step": 5881 }, { "epoch": 0.9096462400927895, "grad_norm": 4.362977504730225, "learning_rate": 3.871004696987055e-06, "logits/chosen": 8.973688125610352, "logits/rejected": 12.19929027557373, "logps/chosen": -153.68785095214844, "logps/rejected": -198.5580291748047, "loss": 0.7528, "rewards/accuracies": 0.625, "rewards/chosen": 0.24013786017894745, "rewards/margins": -0.07577338814735413, "rewards/rejected": 0.31591126322746277, "step": 5882 }, { "epoch": 0.9098008892325536, "grad_norm": 4.9892096519470215, "learning_rate": 3.870718295337381e-06, "logits/chosen": 10.454109191894531, "logits/rejected": 9.95296859741211, "logps/chosen": -255.88088989257812, "logps/rejected": -205.77029418945312, "loss": 0.5946, "rewards/accuracies": 0.875, "rewards/chosen": 0.33802539110183716, "rewards/margins": 0.385031521320343, "rewards/rejected": -0.04700613021850586, "step": 5883 }, { "epoch": 0.9099555383723178, "grad_norm": 5.3680644035339355, "learning_rate": 3.8704318936877075e-06, "logits/chosen": 9.20842456817627, "logits/rejected": 3.709805488586426, "logps/chosen": -236.03872680664062, "logps/rejected": -221.62564086914062, "loss": 0.697, "rewards/accuracies": 0.5, "rewards/chosen": 0.03382416069507599, "rewards/margins": 0.3379474878311157, "rewards/rejected": -0.30412331223487854, "step": 5884 }, { "epoch": 0.910110187512082, "grad_norm": 5.74845552444458, "learning_rate": 3.870145492038034e-06, "logits/chosen": 12.581165313720703, "logits/rejected": 5.730310916900635, "logps/chosen": -263.79754638671875, "logps/rejected": -205.05313110351562, "loss": 0.5877, "rewards/accuracies": 0.625, "rewards/chosen": 0.15203207731246948, "rewards/margins": 0.2605578601360321, "rewards/rejected": -0.10852579772472382, "step": 5885 }, { "epoch": 0.9102648366518461, "grad_norm": 3.768998384475708, "learning_rate": 3.869859090388361e-06, "logits/chosen": 12.344903945922852, "logits/rejected": 14.888874053955078, "logps/chosen": -270.77239990234375, "logps/rejected": -257.6913146972656, "loss": 0.4446, "rewards/accuracies": 0.75, "rewards/chosen": 0.4103223979473114, "rewards/margins": 0.7548952102661133, "rewards/rejected": -0.3445727229118347, "step": 5886 }, { "epoch": 0.9104194857916103, "grad_norm": 4.497020721435547, "learning_rate": 3.8695726887386875e-06, "logits/chosen": 10.86084270477295, "logits/rejected": 14.221529960632324, "logps/chosen": -197.7698516845703, "logps/rejected": -290.35101318359375, "loss": 0.6059, "rewards/accuracies": 0.625, "rewards/chosen": -0.017513558268547058, "rewards/margins": 0.29999542236328125, "rewards/rejected": -0.3175089955329895, "step": 5887 }, { "epoch": 0.9105741349313744, "grad_norm": 4.394694805145264, "learning_rate": 3.869286287089014e-06, "logits/chosen": 8.712041854858398, "logits/rejected": 6.049680709838867, "logps/chosen": -336.4422607421875, "logps/rejected": -312.62054443359375, "loss": 0.4668, "rewards/accuracies": 0.875, "rewards/chosen": 0.33791497349739075, "rewards/margins": 0.6791126728057861, "rewards/rejected": -0.3411977291107178, "step": 5888 }, { "epoch": 0.9107287840711386, "grad_norm": 4.4446563720703125, "learning_rate": 3.868999885439341e-06, "logits/chosen": 15.691184997558594, "logits/rejected": 8.593006134033203, "logps/chosen": -347.0106506347656, "logps/rejected": -222.3578643798828, "loss": 0.4786, "rewards/accuracies": 0.875, "rewards/chosen": 0.34931623935699463, "rewards/margins": 0.5407199859619141, "rewards/rejected": -0.19140376150608063, "step": 5889 }, { "epoch": 0.9108834332109028, "grad_norm": 5.76023530960083, "learning_rate": 3.868713483789667e-06, "logits/chosen": 11.842556953430176, "logits/rejected": 13.940601348876953, "logps/chosen": -237.77659606933594, "logps/rejected": -233.53759765625, "loss": 0.7998, "rewards/accuracies": 0.5, "rewards/chosen": 0.1934671401977539, "rewards/margins": -0.1268077939748764, "rewards/rejected": 0.3202749490737915, "step": 5890 }, { "epoch": 0.911038082350667, "grad_norm": 6.6792311668396, "learning_rate": 3.868427082139993e-06, "logits/chosen": 8.31523323059082, "logits/rejected": 9.795251846313477, "logps/chosen": -261.92724609375, "logps/rejected": -340.0951232910156, "loss": 0.8456, "rewards/accuracies": 0.5, "rewards/chosen": -0.06994685530662537, "rewards/margins": -0.193409726023674, "rewards/rejected": 0.12346287071704865, "step": 5891 }, { "epoch": 0.9111927314904311, "grad_norm": 5.544681549072266, "learning_rate": 3.86814068049032e-06, "logits/chosen": 11.096994400024414, "logits/rejected": 8.1788330078125, "logps/chosen": -404.7717590332031, "logps/rejected": -305.47772216796875, "loss": 0.5715, "rewards/accuracies": 0.75, "rewards/chosen": 0.307533860206604, "rewards/margins": 0.3805437386035919, "rewards/rejected": -0.07300987839698792, "step": 5892 }, { "epoch": 0.9113473806301953, "grad_norm": 5.960513114929199, "learning_rate": 3.8678542788406465e-06, "logits/chosen": 5.858979225158691, "logits/rejected": 7.962894439697266, "logps/chosen": -254.59347534179688, "logps/rejected": -332.9234313964844, "loss": 0.4901, "rewards/accuracies": 0.875, "rewards/chosen": 0.24142858386039734, "rewards/margins": 0.6056240797042847, "rewards/rejected": -0.3641955554485321, "step": 5893 }, { "epoch": 0.9115020297699594, "grad_norm": 4.539766311645508, "learning_rate": 3.867567877190973e-06, "logits/chosen": 10.890776634216309, "logits/rejected": 6.910697937011719, "logps/chosen": -329.326904296875, "logps/rejected": -285.2637939453125, "loss": 0.4738, "rewards/accuracies": 0.875, "rewards/chosen": 0.6203383207321167, "rewards/margins": 0.5936251878738403, "rewards/rejected": 0.026713073253631592, "step": 5894 }, { "epoch": 0.9116566789097236, "grad_norm": 4.375959873199463, "learning_rate": 3.8672814755413e-06, "logits/chosen": 5.006954193115234, "logits/rejected": 5.969168186187744, "logps/chosen": -217.16885375976562, "logps/rejected": -249.6975555419922, "loss": 0.532, "rewards/accuracies": 0.75, "rewards/chosen": 0.15714392066001892, "rewards/margins": 0.4601404666900635, "rewards/rejected": -0.30299657583236694, "step": 5895 }, { "epoch": 0.9118113280494877, "grad_norm": 6.452846527099609, "learning_rate": 3.8669950738916265e-06, "logits/chosen": 10.5885591506958, "logits/rejected": 14.107398986816406, "logps/chosen": -332.09906005859375, "logps/rejected": -306.437255859375, "loss": 0.742, "rewards/accuracies": 0.375, "rewards/chosen": 0.19397488236427307, "rewards/margins": -0.062388621270656586, "rewards/rejected": 0.25636351108551025, "step": 5896 }, { "epoch": 0.9119659771892519, "grad_norm": 4.224374771118164, "learning_rate": 3.866708672241952e-06, "logits/chosen": 8.025949478149414, "logits/rejected": 0.833151638507843, "logps/chosen": -243.8278350830078, "logps/rejected": -207.10089111328125, "loss": 0.5728, "rewards/accuracies": 0.625, "rewards/chosen": -0.21796277165412903, "rewards/margins": 0.8589696884155273, "rewards/rejected": -1.076932430267334, "step": 5897 }, { "epoch": 0.912120626329016, "grad_norm": 4.2984819412231445, "learning_rate": 3.866422270592279e-06, "logits/chosen": 9.800004005432129, "logits/rejected": 6.534969806671143, "logps/chosen": -131.1072540283203, "logps/rejected": -105.47775268554688, "loss": 0.7154, "rewards/accuracies": 0.25, "rewards/chosen": 0.0369708277285099, "rewards/margins": 0.01552751287817955, "rewards/rejected": 0.02144332230091095, "step": 5898 }, { "epoch": 0.9122752754687802, "grad_norm": 4.24132776260376, "learning_rate": 3.866135868942606e-06, "logits/chosen": 11.291203498840332, "logits/rejected": 4.246890068054199, "logps/chosen": -347.5997314453125, "logps/rejected": -274.2054443359375, "loss": 0.4576, "rewards/accuracies": 0.75, "rewards/chosen": 0.272003173828125, "rewards/margins": 0.7611892223358154, "rewards/rejected": -0.48918601870536804, "step": 5899 }, { "epoch": 0.9124299246085443, "grad_norm": 5.092874526977539, "learning_rate": 3.865849467292932e-06, "logits/chosen": 9.607381820678711, "logits/rejected": 1.9911748170852661, "logps/chosen": -386.2210693359375, "logps/rejected": -442.4115295410156, "loss": 0.4891, "rewards/accuracies": 0.75, "rewards/chosen": 0.49228185415267944, "rewards/margins": 0.9390621185302734, "rewards/rejected": -0.4467802047729492, "step": 5900 }, { "epoch": 0.9125845737483085, "grad_norm": 4.684938907623291, "learning_rate": 3.865563065643259e-06, "logits/chosen": 12.4434814453125, "logits/rejected": 11.634963989257812, "logps/chosen": -195.53292846679688, "logps/rejected": -226.75164794921875, "loss": 0.6766, "rewards/accuracies": 0.75, "rewards/chosen": -0.07983823120594025, "rewards/margins": 0.26623111963272095, "rewards/rejected": -0.3460693359375, "step": 5901 }, { "epoch": 0.9127392228880726, "grad_norm": 5.0986857414245605, "learning_rate": 3.8652766639935855e-06, "logits/chosen": 11.001699447631836, "logits/rejected": 2.162646532058716, "logps/chosen": -364.4613342285156, "logps/rejected": -277.11102294921875, "loss": 0.5448, "rewards/accuracies": 0.75, "rewards/chosen": 0.3906462788581848, "rewards/margins": 0.4935569763183594, "rewards/rejected": -0.10291068255901337, "step": 5902 }, { "epoch": 0.9128938720278369, "grad_norm": 6.443974018096924, "learning_rate": 3.864990262343911e-06, "logits/chosen": 15.372228622436523, "logits/rejected": 9.23705005645752, "logps/chosen": -422.90985107421875, "logps/rejected": -329.5019836425781, "loss": 0.6644, "rewards/accuracies": 0.75, "rewards/chosen": 0.29591548442840576, "rewards/margins": 0.1617671698331833, "rewards/rejected": 0.13414831459522247, "step": 5903 }, { "epoch": 0.913048521167601, "grad_norm": 6.59765625, "learning_rate": 3.864703860694238e-06, "logits/chosen": 4.117753982543945, "logits/rejected": 6.603634357452393, "logps/chosen": -264.6317138671875, "logps/rejected": -238.9684600830078, "loss": 0.8735, "rewards/accuracies": 0.5, "rewards/chosen": -0.20861637592315674, "rewards/margins": -0.1493082046508789, "rewards/rejected": -0.059308186173439026, "step": 5904 }, { "epoch": 0.9132031703073652, "grad_norm": 4.567427158355713, "learning_rate": 3.864417459044565e-06, "logits/chosen": 12.141523361206055, "logits/rejected": 5.422127723693848, "logps/chosen": -270.4252624511719, "logps/rejected": -132.70730590820312, "loss": 0.5819, "rewards/accuracies": 0.625, "rewards/chosen": 0.030228987336158752, "rewards/margins": 0.2846803665161133, "rewards/rejected": -0.2544513940811157, "step": 5905 }, { "epoch": 0.9133578194471293, "grad_norm": 6.439109802246094, "learning_rate": 3.864131057394891e-06, "logits/chosen": 6.0170111656188965, "logits/rejected": 9.03499984741211, "logps/chosen": -294.7020263671875, "logps/rejected": -388.0574645996094, "loss": 0.8228, "rewards/accuracies": 0.375, "rewards/chosen": -0.10948237776756287, "rewards/margins": -0.07519664615392685, "rewards/rejected": -0.03428572416305542, "step": 5906 }, { "epoch": 0.9135124685868935, "grad_norm": 5.855745315551758, "learning_rate": 3.863844655745217e-06, "logits/chosen": 17.104736328125, "logits/rejected": 8.900671005249023, "logps/chosen": -527.9058837890625, "logps/rejected": -407.5986328125, "loss": 0.4497, "rewards/accuracies": 0.875, "rewards/chosen": 0.4642106890678406, "rewards/margins": 0.6380539536476135, "rewards/rejected": -0.17384320497512817, "step": 5907 }, { "epoch": 0.9136671177266577, "grad_norm": 4.556571960449219, "learning_rate": 3.863558254095544e-06, "logits/chosen": 13.010801315307617, "logits/rejected": 10.369705200195312, "logps/chosen": -308.9446716308594, "logps/rejected": -252.33395385742188, "loss": 0.6248, "rewards/accuracies": 0.625, "rewards/chosen": -0.04721641540527344, "rewards/margins": 0.19226133823394775, "rewards/rejected": -0.23947773873806, "step": 5908 }, { "epoch": 0.9138217668664218, "grad_norm": 4.743622779846191, "learning_rate": 3.86327185244587e-06, "logits/chosen": 10.117725372314453, "logits/rejected": 9.849259376525879, "logps/chosen": -194.17526245117188, "logps/rejected": -159.54159545898438, "loss": 0.7004, "rewards/accuracies": 0.5, "rewards/chosen": 0.03017682209610939, "rewards/margins": 0.05196913704276085, "rewards/rejected": -0.02179231494665146, "step": 5909 }, { "epoch": 0.913976416006186, "grad_norm": 6.324093818664551, "learning_rate": 3.862985450796197e-06, "logits/chosen": 9.268342971801758, "logits/rejected": 8.057497024536133, "logps/chosen": -260.33953857421875, "logps/rejected": -235.00332641601562, "loss": 0.6996, "rewards/accuracies": 0.375, "rewards/chosen": -0.15262578427791595, "rewards/margins": 0.05134010314941406, "rewards/rejected": -0.20396587252616882, "step": 5910 }, { "epoch": 0.9141310651459501, "grad_norm": 4.399489879608154, "learning_rate": 3.862699049146523e-06, "logits/chosen": 10.362421035766602, "logits/rejected": 10.820565223693848, "logps/chosen": -272.98870849609375, "logps/rejected": -253.4434356689453, "loss": 0.5946, "rewards/accuracies": 0.5, "rewards/chosen": -0.1616211086511612, "rewards/margins": 0.3922400176525116, "rewards/rejected": -0.553861141204834, "step": 5911 }, { "epoch": 0.9142857142857143, "grad_norm": 9.468118667602539, "learning_rate": 3.8624126474968495e-06, "logits/chosen": 12.71480941772461, "logits/rejected": 8.210349082946777, "logps/chosen": -309.13043212890625, "logps/rejected": -287.5068359375, "loss": 0.7842, "rewards/accuracies": 0.625, "rewards/chosen": 0.1380024552345276, "rewards/margins": -0.030953746289014816, "rewards/rejected": 0.16895617544651031, "step": 5912 }, { "epoch": 0.9144403634254784, "grad_norm": 5.364417552947998, "learning_rate": 3.862126245847176e-06, "logits/chosen": 14.358708381652832, "logits/rejected": 6.679542064666748, "logps/chosen": -288.8837890625, "logps/rejected": -221.76528930664062, "loss": 0.5227, "rewards/accuracies": 0.625, "rewards/chosen": 0.10908961296081543, "rewards/margins": 0.518195629119873, "rewards/rejected": -0.40910604596138, "step": 5913 }, { "epoch": 0.9145950125652426, "grad_norm": 6.501421928405762, "learning_rate": 3.861839844197503e-06, "logits/chosen": 11.75555419921875, "logits/rejected": 7.3360724449157715, "logps/chosen": -246.612548828125, "logps/rejected": -224.14794921875, "loss": 0.617, "rewards/accuracies": 0.5, "rewards/chosen": 0.19812843203544617, "rewards/margins": 0.30944639444351196, "rewards/rejected": -0.11131791770458221, "step": 5914 }, { "epoch": 0.9147496617050067, "grad_norm": 4.851212024688721, "learning_rate": 3.8615534425478295e-06, "logits/chosen": 15.381839752197266, "logits/rejected": 9.44843864440918, "logps/chosen": -325.95123291015625, "logps/rejected": -271.8412780761719, "loss": 0.5835, "rewards/accuracies": 0.75, "rewards/chosen": 0.2488618940114975, "rewards/margins": 0.48829320073127747, "rewards/rejected": -0.23943127691745758, "step": 5915 }, { "epoch": 0.914904310844771, "grad_norm": 5.3454484939575195, "learning_rate": 3.861267040898155e-06, "logits/chosen": 11.534419059753418, "logits/rejected": 7.670881271362305, "logps/chosen": -373.47320556640625, "logps/rejected": -303.9528503417969, "loss": 0.6034, "rewards/accuracies": 0.625, "rewards/chosen": -0.04454078525304794, "rewards/margins": 0.23381319642066956, "rewards/rejected": -0.2783539891242981, "step": 5916 }, { "epoch": 0.9150589599845351, "grad_norm": 5.922972202301025, "learning_rate": 3.860980639248482e-06, "logits/chosen": 8.902524948120117, "logits/rejected": 9.386190414428711, "logps/chosen": -277.7146301269531, "logps/rejected": -339.0614929199219, "loss": 0.6622, "rewards/accuracies": 0.5, "rewards/chosen": 0.05004728212952614, "rewards/margins": 0.17682011425495148, "rewards/rejected": -0.12677285075187683, "step": 5917 }, { "epoch": 0.9152136091242993, "grad_norm": 5.290755748748779, "learning_rate": 3.8606942375988086e-06, "logits/chosen": 5.35184383392334, "logits/rejected": 9.185018539428711, "logps/chosen": -261.62103271484375, "logps/rejected": -321.3396301269531, "loss": 0.5852, "rewards/accuracies": 0.375, "rewards/chosen": -0.05494198948144913, "rewards/margins": 0.5391534566879272, "rewards/rejected": -0.5940955281257629, "step": 5918 }, { "epoch": 0.9153682582640634, "grad_norm": 5.035971164703369, "learning_rate": 3.860407835949135e-06, "logits/chosen": 15.949684143066406, "logits/rejected": 11.24844741821289, "logps/chosen": -246.62939453125, "logps/rejected": -151.89537048339844, "loss": 0.5858, "rewards/accuracies": 0.5, "rewards/chosen": 0.5066784024238586, "rewards/margins": 0.3790939450263977, "rewards/rejected": 0.1275845170021057, "step": 5919 }, { "epoch": 0.9155229074038276, "grad_norm": 5.3687825202941895, "learning_rate": 3.860121434299462e-06, "logits/chosen": 13.22360897064209, "logits/rejected": 14.694915771484375, "logps/chosen": -269.3085021972656, "logps/rejected": -269.97723388671875, "loss": 0.5464, "rewards/accuracies": 0.75, "rewards/chosen": 0.17001762986183167, "rewards/margins": 0.401035338640213, "rewards/rejected": -0.23101767897605896, "step": 5920 }, { "epoch": 0.9156775565435917, "grad_norm": 4.238692760467529, "learning_rate": 3.8598350326497885e-06, "logits/chosen": 11.878721237182617, "logits/rejected": 6.834425449371338, "logps/chosen": -354.71038818359375, "logps/rejected": -227.5897979736328, "loss": 0.5358, "rewards/accuracies": 0.625, "rewards/chosen": 0.29697972536087036, "rewards/margins": 0.5982266664505005, "rewards/rejected": -0.3012469410896301, "step": 5921 }, { "epoch": 0.9158322056833559, "grad_norm": 8.2029390335083, "learning_rate": 3.859548631000115e-06, "logits/chosen": 7.202447891235352, "logits/rejected": 2.3025941848754883, "logps/chosen": -375.6297607421875, "logps/rejected": -220.82632446289062, "loss": 0.7654, "rewards/accuracies": 0.25, "rewards/chosen": 0.05733557045459747, "rewards/margins": -0.06681393086910248, "rewards/rejected": 0.12414951622486115, "step": 5922 }, { "epoch": 0.91598685482312, "grad_norm": 7.002408504486084, "learning_rate": 3.859262229350441e-06, "logits/chosen": 15.865678787231445, "logits/rejected": 15.669532775878906, "logps/chosen": -254.5007781982422, "logps/rejected": -230.11817932128906, "loss": 0.7696, "rewards/accuracies": 0.625, "rewards/chosen": 0.12614861130714417, "rewards/margins": -0.035456568002700806, "rewards/rejected": 0.16160517930984497, "step": 5923 }, { "epoch": 0.9161415039628842, "grad_norm": 3.7600300312042236, "learning_rate": 3.858975827700768e-06, "logits/chosen": 6.476556777954102, "logits/rejected": 5.155506134033203, "logps/chosen": -163.3370361328125, "logps/rejected": -174.42233276367188, "loss": 0.6208, "rewards/accuracies": 0.625, "rewards/chosen": 0.23719027638435364, "rewards/margins": 0.20608875155448914, "rewards/rejected": 0.031101537868380547, "step": 5924 }, { "epoch": 0.9162961531026483, "grad_norm": 3.0437915325164795, "learning_rate": 3.858689426051094e-06, "logits/chosen": 7.21820068359375, "logits/rejected": 5.366580963134766, "logps/chosen": -165.77870178222656, "logps/rejected": -122.99116516113281, "loss": 0.5162, "rewards/accuracies": 0.75, "rewards/chosen": 0.12377265840768814, "rewards/margins": 0.4593256711959839, "rewards/rejected": -0.33555299043655396, "step": 5925 }, { "epoch": 0.9164508022424125, "grad_norm": 5.067497253417969, "learning_rate": 3.858403024401421e-06, "logits/chosen": 12.549297332763672, "logits/rejected": 11.871038436889648, "logps/chosen": -286.08685302734375, "logps/rejected": -315.29034423828125, "loss": 0.7401, "rewards/accuracies": 0.5, "rewards/chosen": 0.2791282534599304, "rewards/margins": 0.01311815157532692, "rewards/rejected": 0.2660101056098938, "step": 5926 }, { "epoch": 0.9166054513821766, "grad_norm": 4.377481460571289, "learning_rate": 3.8581166227517476e-06, "logits/chosen": 5.702720642089844, "logits/rejected": 3.9241890907287598, "logps/chosen": -137.35012817382812, "logps/rejected": -145.62823486328125, "loss": 0.6581, "rewards/accuracies": 0.625, "rewards/chosen": -0.04117536544799805, "rewards/margins": 0.11607442796230316, "rewards/rejected": -0.1572497934103012, "step": 5927 }, { "epoch": 0.9167601005219409, "grad_norm": 4.919307708740234, "learning_rate": 3.857830221102074e-06, "logits/chosen": 6.303287029266357, "logits/rejected": 13.36454963684082, "logps/chosen": -148.81932067871094, "logps/rejected": -218.31182861328125, "loss": 0.6832, "rewards/accuracies": 0.375, "rewards/chosen": -0.14241819083690643, "rewards/margins": 0.10381338000297546, "rewards/rejected": -0.2462315559387207, "step": 5928 }, { "epoch": 0.916914749661705, "grad_norm": 5.774654865264893, "learning_rate": 3.8575438194524e-06, "logits/chosen": 15.903022766113281, "logits/rejected": 9.915826797485352, "logps/chosen": -309.3786926269531, "logps/rejected": -261.5390625, "loss": 0.6905, "rewards/accuracies": 0.625, "rewards/chosen": 0.043460264801979065, "rewards/margins": 0.12520970404148102, "rewards/rejected": -0.08174943923950195, "step": 5929 }, { "epoch": 0.9170693988014692, "grad_norm": 5.834843635559082, "learning_rate": 3.857257417802727e-06, "logits/chosen": 6.180651664733887, "logits/rejected": 7.186861991882324, "logps/chosen": -256.467529296875, "logps/rejected": -198.36305236816406, "loss": 0.714, "rewards/accuracies": 0.375, "rewards/chosen": 0.016269681975245476, "rewards/margins": 0.0899326503276825, "rewards/rejected": -0.07366294413805008, "step": 5930 }, { "epoch": 0.9172240479412334, "grad_norm": 6.222607612609863, "learning_rate": 3.856971016153053e-06, "logits/chosen": 8.585904121398926, "logits/rejected": 5.8289570808410645, "logps/chosen": -186.5392608642578, "logps/rejected": -183.44021606445312, "loss": 0.7348, "rewards/accuracies": 0.5, "rewards/chosen": 0.10585668683052063, "rewards/margins": 0.008877936750650406, "rewards/rejected": 0.09697875380516052, "step": 5931 }, { "epoch": 0.9173786970809975, "grad_norm": 5.199121952056885, "learning_rate": 3.85668461450338e-06, "logits/chosen": 15.467013359069824, "logits/rejected": 5.116355895996094, "logps/chosen": -318.99383544921875, "logps/rejected": -254.9661102294922, "loss": 0.5685, "rewards/accuracies": 0.625, "rewards/chosen": 0.36116892099380493, "rewards/margins": 0.5343852043151855, "rewards/rejected": -0.17321628332138062, "step": 5932 }, { "epoch": 0.9175333462207617, "grad_norm": 4.401479244232178, "learning_rate": 3.856398212853707e-06, "logits/chosen": 9.046539306640625, "logits/rejected": 2.07232403755188, "logps/chosen": -253.52285766601562, "logps/rejected": -140.69761657714844, "loss": 0.5358, "rewards/accuracies": 0.625, "rewards/chosen": 0.02295185625553131, "rewards/margins": 0.4381467401981354, "rewards/rejected": -0.41519486904144287, "step": 5933 }, { "epoch": 0.9176879953605258, "grad_norm": 9.792787551879883, "learning_rate": 3.856111811204033e-06, "logits/chosen": 10.339614868164062, "logits/rejected": 9.148236274719238, "logps/chosen": -342.1416931152344, "logps/rejected": -340.17510986328125, "loss": 0.8286, "rewards/accuracies": 0.75, "rewards/chosen": 0.08393508195877075, "rewards/margins": -0.0599772185087204, "rewards/rejected": 0.14391233026981354, "step": 5934 }, { "epoch": 0.91784264450029, "grad_norm": 5.708592414855957, "learning_rate": 3.85582540955436e-06, "logits/chosen": 10.877514839172363, "logits/rejected": 8.588447570800781, "logps/chosen": -283.3267822265625, "logps/rejected": -257.55224609375, "loss": 0.5222, "rewards/accuracies": 0.625, "rewards/chosen": 0.2284901738166809, "rewards/margins": 0.8872479200363159, "rewards/rejected": -0.6587576866149902, "step": 5935 }, { "epoch": 0.9179972936400541, "grad_norm": 4.462575912475586, "learning_rate": 3.855539007904686e-06, "logits/chosen": 10.294260025024414, "logits/rejected": 5.6089019775390625, "logps/chosen": -266.9420166015625, "logps/rejected": -242.7266082763672, "loss": 0.5873, "rewards/accuracies": 0.75, "rewards/chosen": 0.40647435188293457, "rewards/margins": 0.2888174057006836, "rewards/rejected": 0.11765693873167038, "step": 5936 }, { "epoch": 0.9181519427798183, "grad_norm": 6.262191295623779, "learning_rate": 3.855252606255012e-06, "logits/chosen": 9.659331321716309, "logits/rejected": 7.160659313201904, "logps/chosen": -371.291259765625, "logps/rejected": -301.55078125, "loss": 0.6773, "rewards/accuracies": 0.375, "rewards/chosen": 0.009488284587860107, "rewards/margins": 0.218671977519989, "rewards/rejected": -0.2091836929321289, "step": 5937 }, { "epoch": 0.9183065919195824, "grad_norm": 13.311737060546875, "learning_rate": 3.854966204605339e-06, "logits/chosen": 11.907447814941406, "logits/rejected": 8.653641700744629, "logps/chosen": -319.3544006347656, "logps/rejected": -288.51898193359375, "loss": 0.9044, "rewards/accuracies": 0.375, "rewards/chosen": -0.2930832803249359, "rewards/margins": -0.2596186697483063, "rewards/rejected": -0.03346461430191994, "step": 5938 }, { "epoch": 0.9184612410593466, "grad_norm": 6.035009384155273, "learning_rate": 3.854679802955666e-06, "logits/chosen": 5.040353775024414, "logits/rejected": 5.486931324005127, "logps/chosen": -365.8705139160156, "logps/rejected": -309.94818115234375, "loss": 0.6754, "rewards/accuracies": 0.625, "rewards/chosen": 0.0919039398431778, "rewards/margins": 0.09457303583621979, "rewards/rejected": -0.002669095993041992, "step": 5939 }, { "epoch": 0.9186158901991107, "grad_norm": 5.269845962524414, "learning_rate": 3.854393401305992e-06, "logits/chosen": 7.8071208000183105, "logits/rejected": 9.440631866455078, "logps/chosen": -239.56668090820312, "logps/rejected": -271.0215148925781, "loss": 0.6969, "rewards/accuracies": 0.375, "rewards/chosen": 0.15914659202098846, "rewards/margins": 0.028732866048812866, "rewards/rejected": 0.1304137408733368, "step": 5940 }, { "epoch": 0.918770539338875, "grad_norm": 5.278595924377441, "learning_rate": 3.854106999656318e-06, "logits/chosen": 11.516739845275879, "logits/rejected": 13.211091995239258, "logps/chosen": -325.21099853515625, "logps/rejected": -339.77764892578125, "loss": 0.4958, "rewards/accuracies": 0.75, "rewards/chosen": 0.31322020292282104, "rewards/margins": 0.6078562140464783, "rewards/rejected": -0.29463598132133484, "step": 5941 }, { "epoch": 0.9189251884786391, "grad_norm": 6.398715019226074, "learning_rate": 3.853820598006645e-06, "logits/chosen": 11.936965942382812, "logits/rejected": 10.497695922851562, "logps/chosen": -364.0721435546875, "logps/rejected": -253.64527893066406, "loss": 0.7408, "rewards/accuracies": 0.625, "rewards/chosen": -0.3417280912399292, "rewards/margins": 0.04365827143192291, "rewards/rejected": -0.3853863775730133, "step": 5942 }, { "epoch": 0.9190798376184033, "grad_norm": 4.045505046844482, "learning_rate": 3.8535341963569714e-06, "logits/chosen": 14.519649505615234, "logits/rejected": 15.048075675964355, "logps/chosen": -236.2891845703125, "logps/rejected": -254.0170440673828, "loss": 0.554, "rewards/accuracies": 0.5, "rewards/chosen": 0.19047680497169495, "rewards/margins": 0.47173410654067993, "rewards/rejected": -0.281257301568985, "step": 5943 }, { "epoch": 0.9192344867581674, "grad_norm": 5.424352645874023, "learning_rate": 3.853247794707298e-06, "logits/chosen": 7.910563945770264, "logits/rejected": 9.230586051940918, "logps/chosen": -217.4111328125, "logps/rejected": -269.77642822265625, "loss": 0.7234, "rewards/accuracies": 0.5, "rewards/chosen": 0.17150217294692993, "rewards/margins": 0.04834473133087158, "rewards/rejected": 0.12315745651721954, "step": 5944 }, { "epoch": 0.9193891358979316, "grad_norm": 4.2203168869018555, "learning_rate": 3.852961393057624e-06, "logits/chosen": 9.357605934143066, "logits/rejected": 8.5084228515625, "logps/chosen": -207.44784545898438, "logps/rejected": -242.84083557128906, "loss": 0.6284, "rewards/accuracies": 0.625, "rewards/chosen": -0.16369342803955078, "rewards/margins": 0.37120023369789124, "rewards/rejected": -0.5348936915397644, "step": 5945 }, { "epoch": 0.9195437850376957, "grad_norm": 4.9508185386657715, "learning_rate": 3.8526749914079505e-06, "logits/chosen": 14.602723121643066, "logits/rejected": 7.486633777618408, "logps/chosen": -271.6171875, "logps/rejected": -204.06320190429688, "loss": 0.5517, "rewards/accuracies": 0.625, "rewards/chosen": 0.1853051632642746, "rewards/margins": 0.44909024238586426, "rewards/rejected": -0.26378506422042847, "step": 5946 }, { "epoch": 0.9196984341774599, "grad_norm": 8.3158597946167, "learning_rate": 3.852388589758277e-06, "logits/chosen": 8.789648056030273, "logits/rejected": 5.727154731750488, "logps/chosen": -338.49835205078125, "logps/rejected": -257.8027038574219, "loss": 0.688, "rewards/accuracies": 0.75, "rewards/chosen": 0.31732234358787537, "rewards/margins": 0.16338233649730682, "rewards/rejected": 0.15394000709056854, "step": 5947 }, { "epoch": 0.919853083317224, "grad_norm": 6.242115497589111, "learning_rate": 3.852102188108604e-06, "logits/chosen": 13.601849555969238, "logits/rejected": 12.952322959899902, "logps/chosen": -258.85833740234375, "logps/rejected": -302.5117492675781, "loss": 0.5806, "rewards/accuracies": 0.625, "rewards/chosen": 0.3359399437904358, "rewards/margins": 0.3445708155632019, "rewards/rejected": -0.008630847558379173, "step": 5948 }, { "epoch": 0.9200077324569882, "grad_norm": 3.889660596847534, "learning_rate": 3.85181578645893e-06, "logits/chosen": 8.088129997253418, "logits/rejected": 10.209095001220703, "logps/chosen": -214.66986083984375, "logps/rejected": -237.7310028076172, "loss": 0.5545, "rewards/accuracies": 0.625, "rewards/chosen": 0.30456647276878357, "rewards/margins": 0.37158316373825073, "rewards/rejected": -0.06701669096946716, "step": 5949 }, { "epoch": 0.9201623815967523, "grad_norm": 5.8114333152771, "learning_rate": 3.851529384809256e-06, "logits/chosen": 8.929306030273438, "logits/rejected": 7.519437789916992, "logps/chosen": -251.84629821777344, "logps/rejected": -217.41412353515625, "loss": 0.6704, "rewards/accuracies": 0.625, "rewards/chosen": 0.004251718521118164, "rewards/margins": 0.0848056823015213, "rewards/rejected": -0.08055394887924194, "step": 5950 }, { "epoch": 0.9203170307365165, "grad_norm": 8.590255737304688, "learning_rate": 3.851242983159583e-06, "logits/chosen": 12.934130668640137, "logits/rejected": 6.063616752624512, "logps/chosen": -369.6166687011719, "logps/rejected": -229.67901611328125, "loss": 0.6688, "rewards/accuracies": 0.5, "rewards/chosen": 0.11090479046106339, "rewards/margins": 0.3181837201118469, "rewards/rejected": -0.20727893710136414, "step": 5951 }, { "epoch": 0.9204716798762806, "grad_norm": 4.726585865020752, "learning_rate": 3.85095658150991e-06, "logits/chosen": 9.413644790649414, "logits/rejected": 11.746776580810547, "logps/chosen": -241.02403259277344, "logps/rejected": -286.5113830566406, "loss": 0.6274, "rewards/accuracies": 0.5, "rewards/chosen": -0.08068778365850449, "rewards/margins": 0.2278514951467514, "rewards/rejected": -0.3085392713546753, "step": 5952 }, { "epoch": 0.9206263290160448, "grad_norm": 4.609578609466553, "learning_rate": 3.850670179860236e-06, "logits/chosen": 17.141637802124023, "logits/rejected": 10.48354721069336, "logps/chosen": -382.8043212890625, "logps/rejected": -249.50013732910156, "loss": 0.5196, "rewards/accuracies": 0.75, "rewards/chosen": 0.40035080909729004, "rewards/margins": 0.6527730226516724, "rewards/rejected": -0.25242215394973755, "step": 5953 }, { "epoch": 0.9207809781558091, "grad_norm": 5.546847343444824, "learning_rate": 3.850383778210563e-06, "logits/chosen": 9.014184951782227, "logits/rejected": 7.124309539794922, "logps/chosen": -359.6578674316406, "logps/rejected": -309.4601745605469, "loss": 0.6796, "rewards/accuracies": 0.375, "rewards/chosen": 0.27597880363464355, "rewards/margins": 0.14819855988025665, "rewards/rejected": 0.1277802288532257, "step": 5954 }, { "epoch": 0.9209356272955732, "grad_norm": 4.403045177459717, "learning_rate": 3.8500973765608896e-06, "logits/chosen": 5.863503932952881, "logits/rejected": 4.2348856925964355, "logps/chosen": -212.52920532226562, "logps/rejected": -211.0526123046875, "loss": 0.7037, "rewards/accuracies": 0.625, "rewards/chosen": 0.03576654940843582, "rewards/margins": 0.21357953548431396, "rewards/rejected": -0.17781296372413635, "step": 5955 }, { "epoch": 0.9210902764353374, "grad_norm": 6.843137741088867, "learning_rate": 3.849810974911215e-06, "logits/chosen": 13.903402328491211, "logits/rejected": 9.994194030761719, "logps/chosen": -333.7403259277344, "logps/rejected": -288.69317626953125, "loss": 0.829, "rewards/accuracies": 0.5, "rewards/chosen": -0.10667981952428818, "rewards/margins": 0.06349324434995651, "rewards/rejected": -0.1701730489730835, "step": 5956 }, { "epoch": 0.9212449255751015, "grad_norm": 5.5774946212768555, "learning_rate": 3.849524573261542e-06, "logits/chosen": 15.12047004699707, "logits/rejected": 12.097946166992188, "logps/chosen": -265.2837829589844, "logps/rejected": -175.3000946044922, "loss": 0.7397, "rewards/accuracies": 0.5, "rewards/chosen": 0.2028406262397766, "rewards/margins": 0.029902443289756775, "rewards/rejected": 0.17293816804885864, "step": 5957 }, { "epoch": 0.9213995747148657, "grad_norm": 5.4411115646362305, "learning_rate": 3.849238171611869e-06, "logits/chosen": 14.200237274169922, "logits/rejected": 12.430276870727539, "logps/chosen": -250.39157104492188, "logps/rejected": -207.62936401367188, "loss": 0.7891, "rewards/accuracies": 0.25, "rewards/chosen": 0.21793919801712036, "rewards/margins": -0.14933212101459503, "rewards/rejected": 0.3672713041305542, "step": 5958 }, { "epoch": 0.9215542238546298, "grad_norm": 4.988310813903809, "learning_rate": 3.848951769962195e-06, "logits/chosen": 13.829568862915039, "logits/rejected": 12.126517295837402, "logps/chosen": -286.1348571777344, "logps/rejected": -296.5434875488281, "loss": 0.6016, "rewards/accuracies": 0.75, "rewards/chosen": -0.1906820386648178, "rewards/margins": 0.2569863498210907, "rewards/rejected": -0.4476683735847473, "step": 5959 }, { "epoch": 0.921708872994394, "grad_norm": 5.089449882507324, "learning_rate": 3.848665368312522e-06, "logits/chosen": 13.649700164794922, "logits/rejected": 9.107711791992188, "logps/chosen": -356.34674072265625, "logps/rejected": -247.37521362304688, "loss": 0.6604, "rewards/accuracies": 0.5, "rewards/chosen": 0.29121553897857666, "rewards/margins": 0.229379802942276, "rewards/rejected": 0.061835743486881256, "step": 5960 }, { "epoch": 0.9218635221341581, "grad_norm": 3.6978254318237305, "learning_rate": 3.848378966662849e-06, "logits/chosen": 6.951534748077393, "logits/rejected": 6.844333171844482, "logps/chosen": -208.95864868164062, "logps/rejected": -170.73056030273438, "loss": 0.5061, "rewards/accuracies": 0.875, "rewards/chosen": 0.29118815064430237, "rewards/margins": 0.5795993804931641, "rewards/rejected": -0.2884112596511841, "step": 5961 }, { "epoch": 0.9220181712739223, "grad_norm": 16.355939865112305, "learning_rate": 3.848092565013174e-06, "logits/chosen": 9.058082580566406, "logits/rejected": 12.13011360168457, "logps/chosen": -275.96343994140625, "logps/rejected": -326.87603759765625, "loss": 0.749, "rewards/accuracies": 0.625, "rewards/chosen": -0.15806210041046143, "rewards/margins": 0.3714718818664551, "rewards/rejected": -0.5295339226722717, "step": 5962 }, { "epoch": 0.9221728204136864, "grad_norm": 5.769424915313721, "learning_rate": 3.847806163363501e-06, "logits/chosen": 8.52417278289795, "logits/rejected": 6.214466094970703, "logps/chosen": -401.5508117675781, "logps/rejected": -279.8078918457031, "loss": 0.4864, "rewards/accuracies": 0.875, "rewards/chosen": 0.13646875321865082, "rewards/margins": 0.5443332195281982, "rewards/rejected": -0.407864511013031, "step": 5963 }, { "epoch": 0.9223274695534506, "grad_norm": 7.583956241607666, "learning_rate": 3.847519761713828e-06, "logits/chosen": 9.999289512634277, "logits/rejected": 10.591829299926758, "logps/chosen": -365.614990234375, "logps/rejected": -267.7789611816406, "loss": 0.5999, "rewards/accuracies": 0.625, "rewards/chosen": 0.1878223419189453, "rewards/margins": 0.3166043162345886, "rewards/rejected": -0.1287819892168045, "step": 5964 }, { "epoch": 0.9224821186932147, "grad_norm": 3.909832715988159, "learning_rate": 3.847233360064154e-06, "logits/chosen": 13.395157814025879, "logits/rejected": 13.810171127319336, "logps/chosen": -283.162109375, "logps/rejected": -237.293701171875, "loss": 0.5571, "rewards/accuracies": 0.75, "rewards/chosen": 0.36247557401657104, "rewards/margins": 0.371268630027771, "rewards/rejected": -0.008793100714683533, "step": 5965 }, { "epoch": 0.9226367678329789, "grad_norm": 4.247903347015381, "learning_rate": 3.846946958414481e-06, "logits/chosen": 11.090770721435547, "logits/rejected": 11.543726921081543, "logps/chosen": -356.1608581542969, "logps/rejected": -360.84185791015625, "loss": 0.4669, "rewards/accuracies": 0.75, "rewards/chosen": 0.35332566499710083, "rewards/margins": 0.6636320352554321, "rewards/rejected": -0.3103064000606537, "step": 5966 }, { "epoch": 0.9227914169727431, "grad_norm": 5.168519973754883, "learning_rate": 3.846660556764808e-06, "logits/chosen": 8.626436233520508, "logits/rejected": 7.0696187019348145, "logps/chosen": -189.49253845214844, "logps/rejected": -165.54818725585938, "loss": 0.6043, "rewards/accuracies": 0.625, "rewards/chosen": -0.39061564207077026, "rewards/margins": 0.35810378193855286, "rewards/rejected": -0.7487194538116455, "step": 5967 }, { "epoch": 0.9229460661125073, "grad_norm": 5.606865406036377, "learning_rate": 3.846374155115134e-06, "logits/chosen": 8.776806831359863, "logits/rejected": 8.227149963378906, "logps/chosen": -234.8128662109375, "logps/rejected": -283.83160400390625, "loss": 0.7573, "rewards/accuracies": 0.5, "rewards/chosen": 0.34023475646972656, "rewards/margins": 0.0626526027917862, "rewards/rejected": 0.27758219838142395, "step": 5968 }, { "epoch": 0.9231007152522714, "grad_norm": 3.8609025478363037, "learning_rate": 3.84608775346546e-06, "logits/chosen": 9.259864807128906, "logits/rejected": 6.362096786499023, "logps/chosen": -265.30224609375, "logps/rejected": -256.0496520996094, "loss": 0.4642, "rewards/accuracies": 0.625, "rewards/chosen": 0.2841939926147461, "rewards/margins": 0.7123348712921143, "rewards/rejected": -0.42814087867736816, "step": 5969 }, { "epoch": 0.9232553643920356, "grad_norm": 5.903758525848389, "learning_rate": 3.845801351815787e-06, "logits/chosen": 6.796444416046143, "logits/rejected": 13.478812217712402, "logps/chosen": -170.37298583984375, "logps/rejected": -305.61248779296875, "loss": 0.805, "rewards/accuracies": 0.625, "rewards/chosen": -0.29077470302581787, "rewards/margins": -0.08672347664833069, "rewards/rejected": -0.204051211476326, "step": 5970 }, { "epoch": 0.9234100135317997, "grad_norm": 4.833823204040527, "learning_rate": 3.845514950166113e-06, "logits/chosen": 5.403933048248291, "logits/rejected": 1.7210121154785156, "logps/chosen": -154.13180541992188, "logps/rejected": -133.4947509765625, "loss": 0.6119, "rewards/accuracies": 0.625, "rewards/chosen": -0.23658888041973114, "rewards/margins": 0.20584726333618164, "rewards/rejected": -0.4424360990524292, "step": 5971 }, { "epoch": 0.9235646626715639, "grad_norm": 5.080250263214111, "learning_rate": 3.84522854851644e-06, "logits/chosen": 8.055213928222656, "logits/rejected": 8.364986419677734, "logps/chosen": -205.92010498046875, "logps/rejected": -193.29132080078125, "loss": 0.7308, "rewards/accuracies": 0.375, "rewards/chosen": 0.06329239159822464, "rewards/margins": -0.020917341113090515, "rewards/rejected": 0.08420972526073456, "step": 5972 }, { "epoch": 0.923719311811328, "grad_norm": 5.192558288574219, "learning_rate": 3.844942146866767e-06, "logits/chosen": 13.305277824401855, "logits/rejected": 12.780332565307617, "logps/chosen": -311.9449462890625, "logps/rejected": -338.5521240234375, "loss": 0.5333, "rewards/accuracies": 0.625, "rewards/chosen": 0.3909967541694641, "rewards/margins": 0.5475971698760986, "rewards/rejected": -0.15660040080547333, "step": 5973 }, { "epoch": 0.9238739609510922, "grad_norm": 3.564321994781494, "learning_rate": 3.844655745217093e-06, "logits/chosen": 18.885189056396484, "logits/rejected": 14.983673095703125, "logps/chosen": -187.3491973876953, "logps/rejected": -158.0996551513672, "loss": 0.6028, "rewards/accuracies": 0.75, "rewards/chosen": -0.033954039216041565, "rewards/margins": 0.22972875833511353, "rewards/rejected": -0.2636827826499939, "step": 5974 }, { "epoch": 0.9240286100908564, "grad_norm": 8.08517074584961, "learning_rate": 3.844369343567419e-06, "logits/chosen": 5.152029037475586, "logits/rejected": 7.747014045715332, "logps/chosen": -220.47872924804688, "logps/rejected": -245.01150512695312, "loss": 0.5951, "rewards/accuracies": 0.5, "rewards/chosen": -0.2685032784938812, "rewards/margins": 0.4616629481315613, "rewards/rejected": -0.7301662564277649, "step": 5975 }, { "epoch": 0.9241832592306205, "grad_norm": 4.957554340362549, "learning_rate": 3.844082941917746e-06, "logits/chosen": 9.247166633605957, "logits/rejected": 10.735408782958984, "logps/chosen": -235.8817901611328, "logps/rejected": -248.5756072998047, "loss": 0.7305, "rewards/accuracies": 0.5, "rewards/chosen": -0.06319206953048706, "rewards/margins": 0.09371539950370789, "rewards/rejected": -0.15690746903419495, "step": 5976 }, { "epoch": 0.9243379083703847, "grad_norm": 8.51071834564209, "learning_rate": 3.8437965402680725e-06, "logits/chosen": 9.490609169006348, "logits/rejected": 12.374410629272461, "logps/chosen": -270.05328369140625, "logps/rejected": -283.78564453125, "loss": 0.7867, "rewards/accuracies": 0.375, "rewards/chosen": 0.013409137725830078, "rewards/margins": -0.048030681908130646, "rewards/rejected": 0.06143981218338013, "step": 5977 }, { "epoch": 0.9244925575101488, "grad_norm": 4.739641189575195, "learning_rate": 3.843510138618399e-06, "logits/chosen": 14.059650421142578, "logits/rejected": -0.6608867645263672, "logps/chosen": -497.4606018066406, "logps/rejected": -231.14060974121094, "loss": 0.5057, "rewards/accuracies": 0.75, "rewards/chosen": 0.17075282335281372, "rewards/margins": 0.8179172873497009, "rewards/rejected": -0.6471644639968872, "step": 5978 }, { "epoch": 0.924647206649913, "grad_norm": 9.917501449584961, "learning_rate": 3.843223736968725e-06, "logits/chosen": 14.094247817993164, "logits/rejected": 13.558454513549805, "logps/chosen": -285.1767272949219, "logps/rejected": -284.9686584472656, "loss": 0.7405, "rewards/accuracies": 0.5, "rewards/chosen": 0.1509305238723755, "rewards/margins": 0.0009683221578598022, "rewards/rejected": 0.14996221661567688, "step": 5979 }, { "epoch": 0.9248018557896772, "grad_norm": 4.1342997550964355, "learning_rate": 3.8429373353190516e-06, "logits/chosen": 16.070533752441406, "logits/rejected": 15.911742210388184, "logps/chosen": -157.46766662597656, "logps/rejected": -163.3729248046875, "loss": 0.5542, "rewards/accuracies": 0.625, "rewards/chosen": -0.13045649230480194, "rewards/margins": 0.38117191195487976, "rewards/rejected": -0.5116283297538757, "step": 5980 }, { "epoch": 0.9249565049294414, "grad_norm": 5.723352909088135, "learning_rate": 3.842650933669378e-06, "logits/chosen": 16.820104598999023, "logits/rejected": 7.088027477264404, "logps/chosen": -330.1671447753906, "logps/rejected": -253.65802001953125, "loss": 0.578, "rewards/accuracies": 0.5, "rewards/chosen": 0.3902343809604645, "rewards/margins": 0.4019431471824646, "rewards/rejected": -0.011708717793226242, "step": 5981 }, { "epoch": 0.9251111540692055, "grad_norm": 5.285297393798828, "learning_rate": 3.842364532019705e-06, "logits/chosen": 11.568995475769043, "logits/rejected": 8.384541511535645, "logps/chosen": -205.7856903076172, "logps/rejected": -201.41253662109375, "loss": 0.6246, "rewards/accuracies": 0.5, "rewards/chosen": -0.05940427631139755, "rewards/margins": 0.35890018939971924, "rewards/rejected": -0.4183045029640198, "step": 5982 }, { "epoch": 0.9252658032089697, "grad_norm": 6.304208755493164, "learning_rate": 3.842078130370031e-06, "logits/chosen": 14.559545516967773, "logits/rejected": 12.454434394836426, "logps/chosen": -480.0831604003906, "logps/rejected": -322.0045166015625, "loss": 0.4961, "rewards/accuracies": 0.75, "rewards/chosen": 0.09261521697044373, "rewards/margins": 0.5517861843109131, "rewards/rejected": -0.4591709077358246, "step": 5983 }, { "epoch": 0.9254204523487338, "grad_norm": 6.11890172958374, "learning_rate": 3.841791728720357e-06, "logits/chosen": 5.396515846252441, "logits/rejected": 9.605737686157227, "logps/chosen": -216.80996704101562, "logps/rejected": -212.20599365234375, "loss": 0.7952, "rewards/accuracies": 0.125, "rewards/chosen": -0.09389863163232803, "rewards/margins": -0.15854188799858093, "rewards/rejected": 0.0646432489156723, "step": 5984 }, { "epoch": 0.925575101488498, "grad_norm": 5.103549003601074, "learning_rate": 3.841505327070684e-06, "logits/chosen": 12.546974182128906, "logits/rejected": 9.806419372558594, "logps/chosen": -250.4193572998047, "logps/rejected": -211.29010009765625, "loss": 0.6849, "rewards/accuracies": 0.5, "rewards/chosen": -0.07964477688074112, "rewards/margins": 0.041126541793346405, "rewards/rejected": -0.12077131122350693, "step": 5985 }, { "epoch": 0.9257297506282621, "grad_norm": 4.3281378746032715, "learning_rate": 3.841218925421011e-06, "logits/chosen": 9.79928207397461, "logits/rejected": 9.200427055358887, "logps/chosen": -307.8592529296875, "logps/rejected": -291.7019958496094, "loss": 0.4705, "rewards/accuracies": 0.875, "rewards/chosen": 0.35281121730804443, "rewards/margins": 0.8667663335800171, "rewards/rejected": -0.5139550566673279, "step": 5986 }, { "epoch": 0.9258843997680263, "grad_norm": 4.6955647468566895, "learning_rate": 3.840932523771337e-06, "logits/chosen": 7.133037090301514, "logits/rejected": 7.298615455627441, "logps/chosen": -288.6138000488281, "logps/rejected": -287.91217041015625, "loss": 0.6542, "rewards/accuracies": 0.5, "rewards/chosen": 0.30169397592544556, "rewards/margins": 0.14161929488182068, "rewards/rejected": 0.16007468104362488, "step": 5987 }, { "epoch": 0.9260390489077904, "grad_norm": 4.4902191162109375, "learning_rate": 3.840646122121664e-06, "logits/chosen": 10.88308048248291, "logits/rejected": 5.6794047355651855, "logps/chosen": -302.8494873046875, "logps/rejected": -225.64649963378906, "loss": 0.6366, "rewards/accuracies": 0.75, "rewards/chosen": 0.040824323892593384, "rewards/margins": 0.3924453854560852, "rewards/rejected": -0.35162103176116943, "step": 5988 }, { "epoch": 0.9261936980475546, "grad_norm": 5.543754577636719, "learning_rate": 3.84035972047199e-06, "logits/chosen": 9.354480743408203, "logits/rejected": 10.920613288879395, "logps/chosen": -291.64715576171875, "logps/rejected": -277.1886291503906, "loss": 0.6078, "rewards/accuracies": 0.625, "rewards/chosen": -0.01307515799999237, "rewards/margins": 0.37407273054122925, "rewards/rejected": -0.3871479034423828, "step": 5989 }, { "epoch": 0.9263483471873187, "grad_norm": 5.527048110961914, "learning_rate": 3.840073318822316e-06, "logits/chosen": 11.379961013793945, "logits/rejected": 12.641226768493652, "logps/chosen": -258.747802734375, "logps/rejected": -245.22509765625, "loss": 0.853, "rewards/accuracies": 0.625, "rewards/chosen": -0.2707340717315674, "rewards/margins": -0.22588051855564117, "rewards/rejected": -0.04485354945063591, "step": 5990 }, { "epoch": 0.9265029963270829, "grad_norm": 6.397703170776367, "learning_rate": 3.839786917172643e-06, "logits/chosen": 14.814423561096191, "logits/rejected": 7.18342399597168, "logps/chosen": -282.2193603515625, "logps/rejected": -206.79129028320312, "loss": 0.6437, "rewards/accuracies": 0.5, "rewards/chosen": 0.4099922180175781, "rewards/margins": 0.16729727387428284, "rewards/rejected": 0.2426949441432953, "step": 5991 }, { "epoch": 0.926657645466847, "grad_norm": 4.245517730712891, "learning_rate": 3.83950051552297e-06, "logits/chosen": 10.150803565979004, "logits/rejected": 9.948104858398438, "logps/chosen": -192.3197784423828, "logps/rejected": -233.9940185546875, "loss": 0.6158, "rewards/accuracies": 0.5, "rewards/chosen": -0.003999426960945129, "rewards/margins": 0.23117296397686005, "rewards/rejected": -0.2351723611354828, "step": 5992 }, { "epoch": 0.9268122946066113, "grad_norm": 6.12477970123291, "learning_rate": 3.839214113873296e-06, "logits/chosen": 11.006741523742676, "logits/rejected": 13.550213813781738, "logps/chosen": -315.9485168457031, "logps/rejected": -282.2869567871094, "loss": 0.7184, "rewards/accuracies": 0.375, "rewards/chosen": -0.22746402025222778, "rewards/margins": -0.004456795752048492, "rewards/rejected": -0.2230072170495987, "step": 5993 }, { "epoch": 0.9269669437463754, "grad_norm": 4.8840413093566895, "learning_rate": 3.838927712223623e-06, "logits/chosen": 10.316544532775879, "logits/rejected": 6.839597702026367, "logps/chosen": -316.9295654296875, "logps/rejected": -254.8017578125, "loss": 0.5098, "rewards/accuracies": 0.875, "rewards/chosen": 0.11888179183006287, "rewards/margins": 0.5479996800422668, "rewards/rejected": -0.4291178584098816, "step": 5994 }, { "epoch": 0.9271215928861396, "grad_norm": 4.272189617156982, "learning_rate": 3.838641310573949e-06, "logits/chosen": 5.318933486938477, "logits/rejected": 4.6727800369262695, "logps/chosen": -140.0816650390625, "logps/rejected": -173.6703338623047, "loss": 0.5935, "rewards/accuracies": 0.75, "rewards/chosen": -0.2964593470096588, "rewards/margins": 0.2374880313873291, "rewards/rejected": -0.5339474081993103, "step": 5995 }, { "epoch": 0.9272762420259038, "grad_norm": 4.730823040008545, "learning_rate": 3.8383549089242754e-06, "logits/chosen": 9.433661460876465, "logits/rejected": 7.357000350952148, "logps/chosen": -299.11700439453125, "logps/rejected": -236.88186645507812, "loss": 0.6115, "rewards/accuracies": 0.75, "rewards/chosen": 0.2665528953075409, "rewards/margins": 0.2866978645324707, "rewards/rejected": -0.0201449915766716, "step": 5996 }, { "epoch": 0.9274308911656679, "grad_norm": 4.293432235717773, "learning_rate": 3.838068507274602e-06, "logits/chosen": 9.619799613952637, "logits/rejected": 9.015604972839355, "logps/chosen": -232.1720733642578, "logps/rejected": -190.91671752929688, "loss": 0.5996, "rewards/accuracies": 0.75, "rewards/chosen": -0.10083504766225815, "rewards/margins": 0.2952927052974701, "rewards/rejected": -0.39612776041030884, "step": 5997 }, { "epoch": 0.927585540305432, "grad_norm": 4.396122455596924, "learning_rate": 3.837782105624929e-06, "logits/chosen": 10.244634628295898, "logits/rejected": 7.831020832061768, "logps/chosen": -182.12704467773438, "logps/rejected": -192.9267578125, "loss": 0.5556, "rewards/accuracies": 0.75, "rewards/chosen": 0.05804350972175598, "rewards/margins": 0.33055567741394043, "rewards/rejected": -0.27251213788986206, "step": 5998 }, { "epoch": 0.9277401894451962, "grad_norm": 5.335066795349121, "learning_rate": 3.837495703975255e-06, "logits/chosen": 14.32138729095459, "logits/rejected": 11.036077499389648, "logps/chosen": -366.6612854003906, "logps/rejected": -260.768310546875, "loss": 0.6549, "rewards/accuracies": 0.625, "rewards/chosen": 0.0031122565269470215, "rewards/margins": 0.13330931961536407, "rewards/rejected": -0.13019704818725586, "step": 5999 }, { "epoch": 0.9278948385849604, "grad_norm": 4.370915412902832, "learning_rate": 3.837209302325582e-06, "logits/chosen": 12.220867156982422, "logits/rejected": 6.463516712188721, "logps/chosen": -325.37176513671875, "logps/rejected": -208.3942108154297, "loss": 0.5755, "rewards/accuracies": 0.75, "rewards/chosen": 0.07932300865650177, "rewards/margins": 0.38254308700561523, "rewards/rejected": -0.30322009325027466, "step": 6000 }, { "epoch": 0.9280494877247245, "grad_norm": 3.832820415496826, "learning_rate": 3.836922900675909e-06, "logits/chosen": 10.285869598388672, "logits/rejected": 6.5575151443481445, "logps/chosen": -257.0646667480469, "logps/rejected": -216.92608642578125, "loss": 0.4541, "rewards/accuracies": 0.75, "rewards/chosen": -0.07314136624336243, "rewards/margins": 0.8702197074890137, "rewards/rejected": -0.9433611631393433, "step": 6001 }, { "epoch": 0.9282041368644887, "grad_norm": 7.224663734436035, "learning_rate": 3.8366364990262345e-06, "logits/chosen": 7.140823841094971, "logits/rejected": 6.941685676574707, "logps/chosen": -377.1507263183594, "logps/rejected": -332.791748046875, "loss": 0.6105, "rewards/accuracies": 0.75, "rewards/chosen": -0.23440855741500854, "rewards/margins": 0.3616694509983063, "rewards/rejected": -0.5960780382156372, "step": 6002 }, { "epoch": 0.9283587860042528, "grad_norm": 8.36377239227295, "learning_rate": 3.836350097376561e-06, "logits/chosen": 6.744920253753662, "logits/rejected": 5.525350093841553, "logps/chosen": -200.383056640625, "logps/rejected": -213.25692749023438, "loss": 0.787, "rewards/accuracies": 0.375, "rewards/chosen": -0.14215537905693054, "rewards/margins": 0.03371515870094299, "rewards/rejected": -0.17587052285671234, "step": 6003 }, { "epoch": 0.928513435144017, "grad_norm": 5.147937297821045, "learning_rate": 3.836063695726888e-06, "logits/chosen": 6.726301193237305, "logits/rejected": 6.76882791519165, "logps/chosen": -270.8572998046875, "logps/rejected": -261.0835876464844, "loss": 0.7157, "rewards/accuracies": 0.5, "rewards/chosen": 0.08769246190786362, "rewards/margins": 0.031356245279312134, "rewards/rejected": 0.05633620172739029, "step": 6004 }, { "epoch": 0.9286680842837812, "grad_norm": 4.630476951599121, "learning_rate": 3.8357772940772144e-06, "logits/chosen": 5.05290412902832, "logits/rejected": 3.440218687057495, "logps/chosen": -194.867919921875, "logps/rejected": -182.01553344726562, "loss": 0.7222, "rewards/accuracies": 0.375, "rewards/chosen": 0.12103088945150375, "rewards/margins": -0.00909436121582985, "rewards/rejected": 0.1301252692937851, "step": 6005 }, { "epoch": 0.9288227334235454, "grad_norm": 4.764076232910156, "learning_rate": 3.835490892427541e-06, "logits/chosen": 11.23635482788086, "logits/rejected": 6.176011562347412, "logps/chosen": -233.3962860107422, "logps/rejected": -148.70950317382812, "loss": 0.6845, "rewards/accuracies": 0.5, "rewards/chosen": 0.06881700456142426, "rewards/margins": 0.32831257581710815, "rewards/rejected": -0.2594955563545227, "step": 6006 }, { "epoch": 0.9289773825633095, "grad_norm": 6.077795028686523, "learning_rate": 3.835204490777868e-06, "logits/chosen": 8.089153289794922, "logits/rejected": 4.023036479949951, "logps/chosen": -477.6796569824219, "logps/rejected": -347.79107666015625, "loss": 0.5066, "rewards/accuracies": 0.75, "rewards/chosen": 0.3078598976135254, "rewards/margins": 0.5196900367736816, "rewards/rejected": -0.21183015406131744, "step": 6007 }, { "epoch": 0.9291320317030737, "grad_norm": 4.766157150268555, "learning_rate": 3.8349180891281936e-06, "logits/chosen": 5.765253067016602, "logits/rejected": 5.693391799926758, "logps/chosen": -184.94967651367188, "logps/rejected": -208.50437927246094, "loss": 0.778, "rewards/accuracies": 0.625, "rewards/chosen": 0.13897529244422913, "rewards/margins": -0.08087249100208282, "rewards/rejected": 0.21984776854515076, "step": 6008 }, { "epoch": 0.9292866808428378, "grad_norm": 6.012340068817139, "learning_rate": 3.83463168747852e-06, "logits/chosen": 9.131631851196289, "logits/rejected": 8.53108024597168, "logps/chosen": -264.8885498046875, "logps/rejected": -322.9024353027344, "loss": 0.6786, "rewards/accuracies": 0.625, "rewards/chosen": -0.20992150902748108, "rewards/margins": 0.4160435199737549, "rewards/rejected": -0.6259649991989136, "step": 6009 }, { "epoch": 0.929441329982602, "grad_norm": 17.134231567382812, "learning_rate": 3.834345285828847e-06, "logits/chosen": 12.58877182006836, "logits/rejected": 7.450962543487549, "logps/chosen": -321.4927673339844, "logps/rejected": -252.15084838867188, "loss": 0.6573, "rewards/accuracies": 0.625, "rewards/chosen": -0.044093236327171326, "rewards/margins": 0.4173487722873688, "rewards/rejected": -0.4614419937133789, "step": 6010 }, { "epoch": 0.9295959791223661, "grad_norm": 6.025030136108398, "learning_rate": 3.8340588841791735e-06, "logits/chosen": 13.786745071411133, "logits/rejected": 15.04443645477295, "logps/chosen": -261.65142822265625, "logps/rejected": -323.10760498046875, "loss": 0.6626, "rewards/accuracies": 0.5, "rewards/chosen": 0.2919447720050812, "rewards/margins": 0.08822473138570786, "rewards/rejected": 0.2037200629711151, "step": 6011 }, { "epoch": 0.9297506282621303, "grad_norm": 5.432513236999512, "learning_rate": 3.8337724825295e-06, "logits/chosen": 6.158177852630615, "logits/rejected": 7.188523292541504, "logps/chosen": -307.4617919921875, "logps/rejected": -262.46612548828125, "loss": 0.6134, "rewards/accuracies": 0.625, "rewards/chosen": -0.0014916956424713135, "rewards/margins": 0.4702872335910797, "rewards/rejected": -0.4717789888381958, "step": 6012 }, { "epoch": 0.9299052774018944, "grad_norm": 14.94957160949707, "learning_rate": 3.833486080879826e-06, "logits/chosen": 5.800658702850342, "logits/rejected": 4.967113494873047, "logps/chosen": -284.8963928222656, "logps/rejected": -262.14190673828125, "loss": 0.5591, "rewards/accuracies": 0.625, "rewards/chosen": 0.32444095611572266, "rewards/margins": 0.47640955448150635, "rewards/rejected": -0.15196862816810608, "step": 6013 }, { "epoch": 0.9300599265416586, "grad_norm": 5.186890602111816, "learning_rate": 3.833199679230153e-06, "logits/chosen": 11.036134719848633, "logits/rejected": 10.278372764587402, "logps/chosen": -253.8583984375, "logps/rejected": -208.4478759765625, "loss": 0.6822, "rewards/accuracies": 0.5, "rewards/chosen": 0.3863145709037781, "rewards/margins": 0.2542487382888794, "rewards/rejected": 0.1320658177137375, "step": 6014 }, { "epoch": 0.9302145756814227, "grad_norm": 4.433958053588867, "learning_rate": 3.832913277580479e-06, "logits/chosen": 13.264066696166992, "logits/rejected": 8.640970230102539, "logps/chosen": -235.05982971191406, "logps/rejected": -168.89462280273438, "loss": 0.6085, "rewards/accuracies": 0.625, "rewards/chosen": -0.23281946778297424, "rewards/margins": 0.4022805690765381, "rewards/rejected": -0.6351000666618347, "step": 6015 }, { "epoch": 0.9303692248211869, "grad_norm": 8.066131591796875, "learning_rate": 3.832626875930806e-06, "logits/chosen": 8.911860466003418, "logits/rejected": 3.7579660415649414, "logps/chosen": -238.1811065673828, "logps/rejected": -202.03439331054688, "loss": 0.776, "rewards/accuracies": 0.875, "rewards/chosen": -0.10820291936397552, "rewards/margins": 0.14072877168655396, "rewards/rejected": -0.24893172085285187, "step": 6016 }, { "epoch": 0.930523873960951, "grad_norm": 4.749118328094482, "learning_rate": 3.832340474281132e-06, "logits/chosen": 5.73783016204834, "logits/rejected": 2.6617510318756104, "logps/chosen": -285.53875732421875, "logps/rejected": -221.54373168945312, "loss": 0.6347, "rewards/accuracies": 0.5, "rewards/chosen": 0.47561997175216675, "rewards/margins": 0.3651784360408783, "rewards/rejected": 0.11044152081012726, "step": 6017 }, { "epoch": 0.9306785231007153, "grad_norm": 4.501606464385986, "learning_rate": 3.832054072631458e-06, "logits/chosen": 12.063766479492188, "logits/rejected": 3.245598554611206, "logps/chosen": -200.2242431640625, "logps/rejected": -154.25167846679688, "loss": 0.6233, "rewards/accuracies": 0.625, "rewards/chosen": 0.3054177165031433, "rewards/margins": 0.3008345663547516, "rewards/rejected": 0.004583191126585007, "step": 6018 }, { "epoch": 0.9308331722404795, "grad_norm": 4.452945232391357, "learning_rate": 3.831767670981785e-06, "logits/chosen": 10.99527359008789, "logits/rejected": 7.252123832702637, "logps/chosen": -374.6976013183594, "logps/rejected": -229.2428741455078, "loss": 0.4409, "rewards/accuracies": 0.875, "rewards/chosen": 0.4849901795387268, "rewards/margins": 0.7554462552070618, "rewards/rejected": -0.2704560160636902, "step": 6019 }, { "epoch": 0.9309878213802436, "grad_norm": 5.7374396324157715, "learning_rate": 3.831481269332112e-06, "logits/chosen": 8.0463228225708, "logits/rejected": 9.23757553100586, "logps/chosen": -277.8264465332031, "logps/rejected": -231.28671264648438, "loss": 0.9093, "rewards/accuracies": 0.375, "rewards/chosen": 0.10356885194778442, "rewards/margins": -0.1309281885623932, "rewards/rejected": 0.23449702560901642, "step": 6020 }, { "epoch": 0.9311424705200078, "grad_norm": 4.379790782928467, "learning_rate": 3.831194867682438e-06, "logits/chosen": 12.132953643798828, "logits/rejected": 10.656715393066406, "logps/chosen": -326.63165283203125, "logps/rejected": -277.4964599609375, "loss": 0.4916, "rewards/accuracies": 0.75, "rewards/chosen": 0.2004929482936859, "rewards/margins": 0.5663213133811951, "rewards/rejected": -0.36582839488983154, "step": 6021 }, { "epoch": 0.9312971196597719, "grad_norm": 3.0467631816864014, "learning_rate": 3.830908466032764e-06, "logits/chosen": 11.022439956665039, "logits/rejected": 10.373337745666504, "logps/chosen": -272.3958740234375, "logps/rejected": -215.5037384033203, "loss": 0.4691, "rewards/accuracies": 0.75, "rewards/chosen": 0.5104539394378662, "rewards/margins": 0.6950963735580444, "rewards/rejected": -0.18464237451553345, "step": 6022 }, { "epoch": 0.9314517687995361, "grad_norm": 6.030575275421143, "learning_rate": 3.830622064383091e-06, "logits/chosen": 14.003541946411133, "logits/rejected": 8.356953620910645, "logps/chosen": -453.41717529296875, "logps/rejected": -328.6996154785156, "loss": 0.5302, "rewards/accuracies": 0.75, "rewards/chosen": 0.3017389178276062, "rewards/margins": 0.447161465883255, "rewards/rejected": -0.1454225480556488, "step": 6023 }, { "epoch": 0.9316064179393002, "grad_norm": 5.45608377456665, "learning_rate": 3.830335662733417e-06, "logits/chosen": 12.766246795654297, "logits/rejected": 10.134567260742188, "logps/chosen": -348.9058837890625, "logps/rejected": -261.0465393066406, "loss": 0.626, "rewards/accuracies": 0.625, "rewards/chosen": 0.33152657747268677, "rewards/margins": 0.39144694805145264, "rewards/rejected": -0.059920430183410645, "step": 6024 }, { "epoch": 0.9317610670790644, "grad_norm": 5.8412909507751465, "learning_rate": 3.830049261083744e-06, "logits/chosen": 9.497193336486816, "logits/rejected": 14.345690727233887, "logps/chosen": -365.80523681640625, "logps/rejected": -380.3077392578125, "loss": 0.7554, "rewards/accuracies": 0.625, "rewards/chosen": 0.1632295846939087, "rewards/margins": 0.27112072706222534, "rewards/rejected": -0.10789114236831665, "step": 6025 }, { "epoch": 0.9319157162188285, "grad_norm": 4.8470258712768555, "learning_rate": 3.829762859434071e-06, "logits/chosen": 9.572096824645996, "logits/rejected": 11.61083984375, "logps/chosen": -195.73455810546875, "logps/rejected": -183.56591796875, "loss": 0.6392, "rewards/accuracies": 0.75, "rewards/chosen": -0.24269239604473114, "rewards/margins": 0.12330552190542221, "rewards/rejected": -0.36599791049957275, "step": 6026 }, { "epoch": 0.9320703653585927, "grad_norm": 4.497620582580566, "learning_rate": 3.829476457784397e-06, "logits/chosen": 11.514942169189453, "logits/rejected": 11.367023468017578, "logps/chosen": -249.30511474609375, "logps/rejected": -213.90652465820312, "loss": 0.5604, "rewards/accuracies": 0.75, "rewards/chosen": -0.057696253061294556, "rewards/margins": 0.4380154609680176, "rewards/rejected": -0.49571171402931213, "step": 6027 }, { "epoch": 0.9322250144983568, "grad_norm": 4.109735012054443, "learning_rate": 3.829190056134723e-06, "logits/chosen": 16.47292709350586, "logits/rejected": 9.034542083740234, "logps/chosen": -342.5322570800781, "logps/rejected": -204.951171875, "loss": 0.4944, "rewards/accuracies": 0.875, "rewards/chosen": 0.35756605863571167, "rewards/margins": 0.5707290172576904, "rewards/rejected": -0.21316289901733398, "step": 6028 }, { "epoch": 0.932379663638121, "grad_norm": 5.297967433929443, "learning_rate": 3.82890365448505e-06, "logits/chosen": 9.922212600708008, "logits/rejected": 8.710733413696289, "logps/chosen": -213.2489013671875, "logps/rejected": -236.77659606933594, "loss": 0.5968, "rewards/accuracies": 0.625, "rewards/chosen": -0.11869316548109055, "rewards/margins": 0.36253565549850464, "rewards/rejected": -0.4812288284301758, "step": 6029 }, { "epoch": 0.9325343127778851, "grad_norm": 4.6720290184021, "learning_rate": 3.8286172528353765e-06, "logits/chosen": 11.735729217529297, "logits/rejected": 9.072715759277344, "logps/chosen": -384.2177429199219, "logps/rejected": -239.77377319335938, "loss": 0.5031, "rewards/accuracies": 0.75, "rewards/chosen": 0.4991775155067444, "rewards/margins": 0.5800157785415649, "rewards/rejected": -0.08083821088075638, "step": 6030 }, { "epoch": 0.9326889619176494, "grad_norm": 10.191884994506836, "learning_rate": 3.828330851185703e-06, "logits/chosen": 9.411463737487793, "logits/rejected": 3.467341899871826, "logps/chosen": -330.3621520996094, "logps/rejected": -248.29336547851562, "loss": 0.8695, "rewards/accuracies": 0.5, "rewards/chosen": 0.010299697518348694, "rewards/margins": -0.19282718002796173, "rewards/rejected": 0.20312686264514923, "step": 6031 }, { "epoch": 0.9328436110574135, "grad_norm": 4.5380072593688965, "learning_rate": 3.82804444953603e-06, "logits/chosen": 9.85280704498291, "logits/rejected": 6.435073375701904, "logps/chosen": -326.41607666015625, "logps/rejected": -208.88665771484375, "loss": 0.6914, "rewards/accuracies": 0.5, "rewards/chosen": 0.28879299759864807, "rewards/margins": 0.2380891740322113, "rewards/rejected": 0.050703808665275574, "step": 6032 }, { "epoch": 0.9329982601971777, "grad_norm": 5.611965179443359, "learning_rate": 3.8277580478863564e-06, "logits/chosen": 11.191861152648926, "logits/rejected": 2.6972930431365967, "logps/chosen": -359.6614990234375, "logps/rejected": -290.51641845703125, "loss": 0.5934, "rewards/accuracies": 0.625, "rewards/chosen": 0.3615747392177582, "rewards/margins": 0.376809298992157, "rewards/rejected": -0.015234556049108505, "step": 6033 }, { "epoch": 0.9331529093369418, "grad_norm": 4.315613269805908, "learning_rate": 3.827471646236683e-06, "logits/chosen": 10.832747459411621, "logits/rejected": 12.062925338745117, "logps/chosen": -167.652587890625, "logps/rejected": -157.68569946289062, "loss": 0.7155, "rewards/accuracies": 0.5, "rewards/chosen": 0.0033126845955848694, "rewards/margins": -0.0034381896257400513, "rewards/rejected": 0.006750866770744324, "step": 6034 }, { "epoch": 0.933307558476706, "grad_norm": 6.593498706817627, "learning_rate": 3.827185244587009e-06, "logits/chosen": 9.622892379760742, "logits/rejected": 9.4996976852417, "logps/chosen": -308.12884521484375, "logps/rejected": -322.00311279296875, "loss": 0.6496, "rewards/accuracies": 0.625, "rewards/chosen": 0.3257409930229187, "rewards/margins": 0.250812828540802, "rewards/rejected": 0.07492820918560028, "step": 6035 }, { "epoch": 0.9334622076164701, "grad_norm": 5.7150654792785645, "learning_rate": 3.8268988429373355e-06, "logits/chosen": 8.950979232788086, "logits/rejected": 10.60782527923584, "logps/chosen": -384.05035400390625, "logps/rejected": -281.7973327636719, "loss": 0.6524, "rewards/accuracies": 0.5, "rewards/chosen": 0.3711523115634918, "rewards/margins": 0.24722784757614136, "rewards/rejected": 0.12392445653676987, "step": 6036 }, { "epoch": 0.9336168567562343, "grad_norm": 6.262704849243164, "learning_rate": 3.826612441287662e-06, "logits/chosen": 12.922733306884766, "logits/rejected": 12.037979125976562, "logps/chosen": -238.59129333496094, "logps/rejected": -261.8547058105469, "loss": 0.8978, "rewards/accuracies": 0.25, "rewards/chosen": -0.15271368622779846, "rewards/margins": -0.30616745352745056, "rewards/rejected": 0.1534537672996521, "step": 6037 }, { "epoch": 0.9337715058959984, "grad_norm": 4.9652180671691895, "learning_rate": 3.826326039637989e-06, "logits/chosen": 13.282856941223145, "logits/rejected": 12.026641845703125, "logps/chosen": -240.10231018066406, "logps/rejected": -215.92483520507812, "loss": 0.7116, "rewards/accuracies": 0.5, "rewards/chosen": 0.1337796151638031, "rewards/margins": -0.016256965696811676, "rewards/rejected": 0.15003658831119537, "step": 6038 }, { "epoch": 0.9339261550357626, "grad_norm": 4.477086067199707, "learning_rate": 3.8260396379883155e-06, "logits/chosen": 11.503787994384766, "logits/rejected": 9.353680610656738, "logps/chosen": -265.6096496582031, "logps/rejected": -266.1465148925781, "loss": 0.5894, "rewards/accuracies": 0.5, "rewards/chosen": 0.43273651599884033, "rewards/margins": 0.5105540752410889, "rewards/rejected": -0.07781754434108734, "step": 6039 }, { "epoch": 0.9340808041755267, "grad_norm": 4.011292457580566, "learning_rate": 3.825753236338642e-06, "logits/chosen": 9.134407043457031, "logits/rejected": 11.923297882080078, "logps/chosen": -215.97825622558594, "logps/rejected": -263.9239807128906, "loss": 0.5358, "rewards/accuracies": 0.75, "rewards/chosen": 0.12924326956272125, "rewards/margins": 0.5040192604064941, "rewards/rejected": -0.3747760057449341, "step": 6040 }, { "epoch": 0.9342354533152909, "grad_norm": 4.179714202880859, "learning_rate": 3.825466834688968e-06, "logits/chosen": 11.143575668334961, "logits/rejected": 5.20336389541626, "logps/chosen": -250.85903930664062, "logps/rejected": -153.57920837402344, "loss": 0.5451, "rewards/accuracies": 0.625, "rewards/chosen": 0.06176067516207695, "rewards/margins": 0.5256879925727844, "rewards/rejected": -0.463927298784256, "step": 6041 }, { "epoch": 0.934390102455055, "grad_norm": 7.819858074188232, "learning_rate": 3.825180433039295e-06, "logits/chosen": 8.802535057067871, "logits/rejected": 7.068612575531006, "logps/chosen": -299.29998779296875, "logps/rejected": -242.37384033203125, "loss": 0.6275, "rewards/accuracies": 0.75, "rewards/chosen": 0.23822559416294098, "rewards/margins": 0.22409909963607788, "rewards/rejected": 0.014126542955636978, "step": 6042 }, { "epoch": 0.9345447515948192, "grad_norm": 4.5175700187683105, "learning_rate": 3.824894031389621e-06, "logits/chosen": 8.470870018005371, "logits/rejected": 2.412821054458618, "logps/chosen": -333.9119873046875, "logps/rejected": -278.8467712402344, "loss": 0.4997, "rewards/accuracies": 0.75, "rewards/chosen": 0.6869453191757202, "rewards/margins": 0.5555365085601807, "rewards/rejected": 0.13140885531902313, "step": 6043 }, { "epoch": 0.9346994007345835, "grad_norm": 3.8975155353546143, "learning_rate": 3.824607629739948e-06, "logits/chosen": 10.772109031677246, "logits/rejected": 4.530871391296387, "logps/chosen": -269.3810729980469, "logps/rejected": -169.63943481445312, "loss": 0.4885, "rewards/accuracies": 0.875, "rewards/chosen": 0.2432047724723816, "rewards/margins": 0.5404318571090698, "rewards/rejected": -0.29722708463668823, "step": 6044 }, { "epoch": 0.9348540498743476, "grad_norm": 4.505472183227539, "learning_rate": 3.8243212280902745e-06, "logits/chosen": 10.762072563171387, "logits/rejected": 6.81305456161499, "logps/chosen": -285.3082580566406, "logps/rejected": -218.459716796875, "loss": 0.549, "rewards/accuracies": 0.875, "rewards/chosen": 0.16315041482448578, "rewards/margins": 0.3377542495727539, "rewards/rejected": -0.17460383474826813, "step": 6045 }, { "epoch": 0.9350086990141118, "grad_norm": 6.288514614105225, "learning_rate": 3.8240348264406e-06, "logits/chosen": 14.718301773071289, "logits/rejected": 10.2052583694458, "logps/chosen": -398.0087585449219, "logps/rejected": -323.6663513183594, "loss": 0.652, "rewards/accuracies": 0.625, "rewards/chosen": 0.16363297402858734, "rewards/margins": 0.11178320646286011, "rewards/rejected": 0.05184975266456604, "step": 6046 }, { "epoch": 0.9351633481538759, "grad_norm": 3.1541645526885986, "learning_rate": 3.823748424790927e-06, "logits/chosen": 8.760577201843262, "logits/rejected": 0.8657584190368652, "logps/chosen": -368.43115234375, "logps/rejected": -197.75477600097656, "loss": 0.4479, "rewards/accuracies": 0.875, "rewards/chosen": 0.5673789978027344, "rewards/margins": 0.6440527439117432, "rewards/rejected": -0.07667368650436401, "step": 6047 }, { "epoch": 0.9353179972936401, "grad_norm": 3.8411777019500732, "learning_rate": 3.823462023141254e-06, "logits/chosen": 12.408902168273926, "logits/rejected": 13.599173545837402, "logps/chosen": -140.28466796875, "logps/rejected": -187.16244506835938, "loss": 0.6748, "rewards/accuracies": 0.625, "rewards/chosen": 0.22700080275535583, "rewards/margins": 0.06837469339370728, "rewards/rejected": 0.15862610936164856, "step": 6048 }, { "epoch": 0.9354726464334042, "grad_norm": 8.164158821105957, "learning_rate": 3.82317562149158e-06, "logits/chosen": 11.915801048278809, "logits/rejected": 8.787885665893555, "logps/chosen": -402.47857666015625, "logps/rejected": -297.17352294921875, "loss": 0.8091, "rewards/accuracies": 0.25, "rewards/chosen": 0.07263347506523132, "rewards/margins": -0.1701229214668274, "rewards/rejected": 0.24275636672973633, "step": 6049 }, { "epoch": 0.9356272955731684, "grad_norm": 5.679912567138672, "learning_rate": 3.822889219841907e-06, "logits/chosen": 6.447786331176758, "logits/rejected": 9.144700050354004, "logps/chosen": -264.0130615234375, "logps/rejected": -363.205810546875, "loss": 0.5765, "rewards/accuracies": 0.75, "rewards/chosen": 0.38308852910995483, "rewards/margins": 0.42537885904312134, "rewards/rejected": -0.042290303856134415, "step": 6050 }, { "epoch": 0.9357819447129325, "grad_norm": 5.169233798980713, "learning_rate": 3.822602818192233e-06, "logits/chosen": 13.977466583251953, "logits/rejected": 9.791933059692383, "logps/chosen": -239.18670654296875, "logps/rejected": -220.9244384765625, "loss": 0.5799, "rewards/accuracies": 0.75, "rewards/chosen": 0.18723654747009277, "rewards/margins": 0.26936832070350647, "rewards/rejected": -0.0821317583322525, "step": 6051 }, { "epoch": 0.9359365938526967, "grad_norm": 4.727330207824707, "learning_rate": 3.822316416542559e-06, "logits/chosen": 13.885272979736328, "logits/rejected": 16.182043075561523, "logps/chosen": -169.0514678955078, "logps/rejected": -222.49009704589844, "loss": 0.7323, "rewards/accuracies": 0.375, "rewards/chosen": -0.1628856211900711, "rewards/margins": 0.0253386739641428, "rewards/rejected": -0.18822431564331055, "step": 6052 }, { "epoch": 0.9360912429924608, "grad_norm": 4.583122730255127, "learning_rate": 3.822030014892886e-06, "logits/chosen": 11.220190048217773, "logits/rejected": 7.90618896484375, "logps/chosen": -248.93777465820312, "logps/rejected": -276.22998046875, "loss": 0.5869, "rewards/accuracies": 0.5, "rewards/chosen": 0.07014027237892151, "rewards/margins": 0.3012443780899048, "rewards/rejected": -0.23110409080982208, "step": 6053 }, { "epoch": 0.936245892132225, "grad_norm": 4.106428146362305, "learning_rate": 3.821743613243213e-06, "logits/chosen": 11.332945823669434, "logits/rejected": 4.969393253326416, "logps/chosen": -176.0836181640625, "logps/rejected": -134.5875244140625, "loss": 0.6316, "rewards/accuracies": 0.5, "rewards/chosen": -0.10211662948131561, "rewards/margins": 0.2919144034385681, "rewards/rejected": -0.39403101801872253, "step": 6054 }, { "epoch": 0.9364005412719891, "grad_norm": 5.967907905578613, "learning_rate": 3.8214572115935385e-06, "logits/chosen": 12.463626861572266, "logits/rejected": 11.256778717041016, "logps/chosen": -327.74005126953125, "logps/rejected": -361.98406982421875, "loss": 0.6383, "rewards/accuracies": 0.75, "rewards/chosen": 0.38031715154647827, "rewards/margins": 0.2521805167198181, "rewards/rejected": 0.12813663482666016, "step": 6055 }, { "epoch": 0.9365551904117533, "grad_norm": 6.8066277503967285, "learning_rate": 3.821170809943865e-06, "logits/chosen": 12.9143705368042, "logits/rejected": 12.729548454284668, "logps/chosen": -337.443603515625, "logps/rejected": -350.5340270996094, "loss": 0.7835, "rewards/accuracies": 0.625, "rewards/chosen": 0.135735422372818, "rewards/margins": 0.06516237556934357, "rewards/rejected": 0.07057303935289383, "step": 6056 }, { "epoch": 0.9367098395515175, "grad_norm": 3.171182155609131, "learning_rate": 3.820884408294192e-06, "logits/chosen": 10.704608917236328, "logits/rejected": 7.606919288635254, "logps/chosen": -251.07205200195312, "logps/rejected": -167.4860382080078, "loss": 0.521, "rewards/accuracies": 0.5, "rewards/chosen": 0.28730398416519165, "rewards/margins": 0.5352724194526672, "rewards/rejected": -0.24796846508979797, "step": 6057 }, { "epoch": 0.9368644886912817, "grad_norm": 3.9153659343719482, "learning_rate": 3.8205980066445185e-06, "logits/chosen": 13.292579650878906, "logits/rejected": 4.033205032348633, "logps/chosen": -332.9898986816406, "logps/rejected": -177.48500061035156, "loss": 0.5533, "rewards/accuracies": 0.5, "rewards/chosen": 0.03870735317468643, "rewards/margins": 0.5156199336051941, "rewards/rejected": -0.47691261768341064, "step": 6058 }, { "epoch": 0.9370191378310458, "grad_norm": 8.662125587463379, "learning_rate": 3.820311604994845e-06, "logits/chosen": 9.569156646728516, "logits/rejected": 7.637282371520996, "logps/chosen": -230.04583740234375, "logps/rejected": -171.99783325195312, "loss": 0.8238, "rewards/accuracies": 0.625, "rewards/chosen": -0.3394660949707031, "rewards/margins": -0.02765965461730957, "rewards/rejected": -0.31180644035339355, "step": 6059 }, { "epoch": 0.93717378697081, "grad_norm": 7.341385364532471, "learning_rate": 3.820025203345172e-06, "logits/chosen": 13.23616886138916, "logits/rejected": 9.569148063659668, "logps/chosen": -266.386962890625, "logps/rejected": -241.69935607910156, "loss": 0.9053, "rewards/accuracies": 0.25, "rewards/chosen": 0.1514882594347, "rewards/margins": -0.14549335837364197, "rewards/rejected": 0.2969816327095032, "step": 6060 }, { "epoch": 0.9373284361105741, "grad_norm": 5.778408527374268, "learning_rate": 3.8197388016954976e-06, "logits/chosen": 14.685229301452637, "logits/rejected": 12.554548263549805, "logps/chosen": -272.43682861328125, "logps/rejected": -260.2255859375, "loss": 0.7574, "rewards/accuracies": 0.375, "rewards/chosen": 0.26109209656715393, "rewards/margins": -0.09590382128953934, "rewards/rejected": 0.35699591040611267, "step": 6061 }, { "epoch": 0.9374830852503383, "grad_norm": 5.86389684677124, "learning_rate": 3.819452400045824e-06, "logits/chosen": 8.837570190429688, "logits/rejected": 9.727023124694824, "logps/chosen": -297.515869140625, "logps/rejected": -223.49392700195312, "loss": 0.6368, "rewards/accuracies": 0.5, "rewards/chosen": 0.43487799167633057, "rewards/margins": 0.1827138513326645, "rewards/rejected": 0.25216415524482727, "step": 6062 }, { "epoch": 0.9376377343901025, "grad_norm": 3.4504144191741943, "learning_rate": 3.819165998396151e-06, "logits/chosen": 14.197935104370117, "logits/rejected": 7.041372299194336, "logps/chosen": -151.49386596679688, "logps/rejected": -108.96153259277344, "loss": 0.5901, "rewards/accuracies": 0.625, "rewards/chosen": -0.013797953724861145, "rewards/margins": 0.34054338932037354, "rewards/rejected": -0.3543413579463959, "step": 6063 }, { "epoch": 0.9377923835298666, "grad_norm": 6.328835964202881, "learning_rate": 3.8188795967464775e-06, "logits/chosen": 9.975637435913086, "logits/rejected": 9.145719528198242, "logps/chosen": -313.7372741699219, "logps/rejected": -304.69183349609375, "loss": 0.9328, "rewards/accuracies": 0.25, "rewards/chosen": -0.023795247077941895, "rewards/margins": -0.28483957052230835, "rewards/rejected": 0.26104432344436646, "step": 6064 }, { "epoch": 0.9379470326696308, "grad_norm": 4.681914806365967, "learning_rate": 3.818593195096804e-06, "logits/chosen": 9.851226806640625, "logits/rejected": 11.388118743896484, "logps/chosen": -260.6351623535156, "logps/rejected": -293.7640380859375, "loss": 0.5494, "rewards/accuracies": 0.75, "rewards/chosen": 0.32276809215545654, "rewards/margins": 0.41228392720222473, "rewards/rejected": -0.08951583504676819, "step": 6065 }, { "epoch": 0.9381016818093949, "grad_norm": 3.847282886505127, "learning_rate": 3.818306793447131e-06, "logits/chosen": 10.958084106445312, "logits/rejected": 11.48143482208252, "logps/chosen": -191.4639892578125, "logps/rejected": -200.89566040039062, "loss": 0.6415, "rewards/accuracies": 0.625, "rewards/chosen": 0.13334208726882935, "rewards/margins": 0.14758911728858948, "rewards/rejected": -0.014247044920921326, "step": 6066 }, { "epoch": 0.9382563309491591, "grad_norm": 4.838980197906494, "learning_rate": 3.8180203917974575e-06, "logits/chosen": 17.168556213378906, "logits/rejected": 5.975695610046387, "logps/chosen": -384.2066650390625, "logps/rejected": -271.62554931640625, "loss": 0.5127, "rewards/accuracies": 0.75, "rewards/chosen": 0.6038050055503845, "rewards/margins": 0.6593232750892639, "rewards/rejected": -0.055518247187137604, "step": 6067 }, { "epoch": 0.9384109800889232, "grad_norm": 4.785552501678467, "learning_rate": 3.817733990147783e-06, "logits/chosen": 11.991065979003906, "logits/rejected": 7.702507972717285, "logps/chosen": -329.9935302734375, "logps/rejected": -191.23634338378906, "loss": 0.4522, "rewards/accuracies": 1.0, "rewards/chosen": 0.6737090349197388, "rewards/margins": 0.5773491263389587, "rewards/rejected": 0.09635983407497406, "step": 6068 }, { "epoch": 0.9385656292286875, "grad_norm": 7.1877899169921875, "learning_rate": 3.81744758849811e-06, "logits/chosen": 8.11004638671875, "logits/rejected": 9.369216918945312, "logps/chosen": -236.0681610107422, "logps/rejected": -290.5652160644531, "loss": 0.7017, "rewards/accuracies": 0.5, "rewards/chosen": -0.4204835295677185, "rewards/margins": 0.12054291367530823, "rewards/rejected": -0.5410264730453491, "step": 6069 }, { "epoch": 0.9387202783684516, "grad_norm": 5.705470085144043, "learning_rate": 3.8171611868484366e-06, "logits/chosen": 16.26763343811035, "logits/rejected": 13.144468307495117, "logps/chosen": -245.2537078857422, "logps/rejected": -232.14541625976562, "loss": 0.5803, "rewards/accuracies": 0.75, "rewards/chosen": 0.4970560371875763, "rewards/margins": 0.4018312990665436, "rewards/rejected": 0.0952247753739357, "step": 6070 }, { "epoch": 0.9388749275082158, "grad_norm": 4.4256415367126465, "learning_rate": 3.816874785198763e-06, "logits/chosen": 8.794835090637207, "logits/rejected": 7.541773319244385, "logps/chosen": -191.7821044921875, "logps/rejected": -188.01748657226562, "loss": 0.6287, "rewards/accuracies": 0.5, "rewards/chosen": -0.05795135349035263, "rewards/margins": 0.21488073468208313, "rewards/rejected": -0.27283209562301636, "step": 6071 }, { "epoch": 0.9390295766479799, "grad_norm": 4.758356094360352, "learning_rate": 3.81658838354909e-06, "logits/chosen": 11.676168441772461, "logits/rejected": 7.672636985778809, "logps/chosen": -316.0868835449219, "logps/rejected": -287.555908203125, "loss": 0.5652, "rewards/accuracies": 0.625, "rewards/chosen": 0.4077572822570801, "rewards/margins": 0.376626580953598, "rewards/rejected": 0.031130697578191757, "step": 6072 }, { "epoch": 0.9391842257877441, "grad_norm": 5.94786262512207, "learning_rate": 3.8163019818994165e-06, "logits/chosen": 13.635997772216797, "logits/rejected": 7.333440780639648, "logps/chosen": -344.91094970703125, "logps/rejected": -302.4306335449219, "loss": 0.7249, "rewards/accuracies": 0.5, "rewards/chosen": 0.34345197677612305, "rewards/margins": 0.05055246502161026, "rewards/rejected": 0.2928995192050934, "step": 6073 }, { "epoch": 0.9393388749275082, "grad_norm": 6.516737461090088, "learning_rate": 3.816015580249742e-06, "logits/chosen": 13.521687507629395, "logits/rejected": 13.969501495361328, "logps/chosen": -278.9033203125, "logps/rejected": -287.1125183105469, "loss": 0.7128, "rewards/accuracies": 0.5, "rewards/chosen": 0.44379115104675293, "rewards/margins": 0.16020014882087708, "rewards/rejected": 0.28359100222587585, "step": 6074 }, { "epoch": 0.9394935240672724, "grad_norm": 5.933638095855713, "learning_rate": 3.815729178600069e-06, "logits/chosen": 12.309395790100098, "logits/rejected": 11.905566215515137, "logps/chosen": -193.1526641845703, "logps/rejected": -209.6673583984375, "loss": 0.7389, "rewards/accuracies": 0.5, "rewards/chosen": 0.11351699382066727, "rewards/margins": 0.03330119699239731, "rewards/rejected": 0.08021579682826996, "step": 6075 }, { "epoch": 0.9396481732070365, "grad_norm": 6.3787994384765625, "learning_rate": 3.815442776950396e-06, "logits/chosen": 10.8314208984375, "logits/rejected": 11.874008178710938, "logps/chosen": -286.8882751464844, "logps/rejected": -338.9561462402344, "loss": 0.8067, "rewards/accuracies": 0.5, "rewards/chosen": 0.27103862166404724, "rewards/margins": -0.13938254117965698, "rewards/rejected": 0.4104211926460266, "step": 6076 }, { "epoch": 0.9398028223468007, "grad_norm": 3.216580390930176, "learning_rate": 3.815156375300722e-06, "logits/chosen": 10.877608299255371, "logits/rejected": 5.905982494354248, "logps/chosen": -182.88221740722656, "logps/rejected": -146.63233947753906, "loss": 0.516, "rewards/accuracies": 0.75, "rewards/chosen": 0.14734011888504028, "rewards/margins": 0.4875781238079071, "rewards/rejected": -0.34023797512054443, "step": 6077 }, { "epoch": 0.9399574714865648, "grad_norm": 5.839911460876465, "learning_rate": 3.814869973651049e-06, "logits/chosen": 1.2418125867843628, "logits/rejected": 7.115954875946045, "logps/chosen": -325.61602783203125, "logps/rejected": -316.3528747558594, "loss": 0.623, "rewards/accuracies": 0.625, "rewards/chosen": 0.10578656941652298, "rewards/margins": 0.3688156008720398, "rewards/rejected": -0.2630290687084198, "step": 6078 }, { "epoch": 0.940112120626329, "grad_norm": 4.351027965545654, "learning_rate": 3.814583572001375e-06, "logits/chosen": 5.610586643218994, "logits/rejected": 4.684720516204834, "logps/chosen": -177.2313690185547, "logps/rejected": -200.40921020507812, "loss": 0.5593, "rewards/accuracies": 0.875, "rewards/chosen": 0.174510195851326, "rewards/margins": 0.36411920189857483, "rewards/rejected": -0.18960900604724884, "step": 6079 }, { "epoch": 0.9402667697660931, "grad_norm": 11.34358024597168, "learning_rate": 3.814297170351702e-06, "logits/chosen": 6.915814399719238, "logits/rejected": 7.635409355163574, "logps/chosen": -285.6097106933594, "logps/rejected": -237.17234802246094, "loss": 0.5961, "rewards/accuracies": 0.625, "rewards/chosen": 0.12571869790554047, "rewards/margins": 0.337399423122406, "rewards/rejected": -0.21168071031570435, "step": 6080 }, { "epoch": 0.9404214189058573, "grad_norm": 4.962376117706299, "learning_rate": 3.8140107687020276e-06, "logits/chosen": 10.566926956176758, "logits/rejected": 6.878274440765381, "logps/chosen": -437.80218505859375, "logps/rejected": -356.24957275390625, "loss": 0.445, "rewards/accuracies": 0.875, "rewards/chosen": 0.8575985431671143, "rewards/margins": 0.6839499473571777, "rewards/rejected": 0.17364856600761414, "step": 6081 }, { "epoch": 0.9405760680456215, "grad_norm": 4.927545070648193, "learning_rate": 3.8137243670523543e-06, "logits/chosen": 4.8792195320129395, "logits/rejected": 2.1346592903137207, "logps/chosen": -267.8224792480469, "logps/rejected": -223.33038330078125, "loss": 0.613, "rewards/accuracies": 0.75, "rewards/chosen": 0.7079000473022461, "rewards/margins": 0.3419094383716583, "rewards/rejected": 0.36599063873291016, "step": 6082 }, { "epoch": 0.9407307171853857, "grad_norm": 4.992504596710205, "learning_rate": 3.813437965402681e-06, "logits/chosen": 11.298521041870117, "logits/rejected": 4.287998199462891, "logps/chosen": -350.1596984863281, "logps/rejected": -250.62960815429688, "loss": 0.514, "rewards/accuracies": 0.75, "rewards/chosen": 0.6133615970611572, "rewards/margins": 0.43809372186660767, "rewards/rejected": 0.17526786029338837, "step": 6083 }, { "epoch": 0.9408853663251499, "grad_norm": 6.099196434020996, "learning_rate": 3.8131515637530076e-06, "logits/chosen": 9.103509902954102, "logits/rejected": 11.175631523132324, "logps/chosen": -253.2087860107422, "logps/rejected": -263.1287536621094, "loss": 0.8713, "rewards/accuracies": 0.25, "rewards/chosen": 0.239552304148674, "rewards/margins": -0.28746098279953003, "rewards/rejected": 0.5270133018493652, "step": 6084 }, { "epoch": 0.941040015464914, "grad_norm": 5.9227399826049805, "learning_rate": 3.812865162103334e-06, "logits/chosen": 6.880556583404541, "logits/rejected": 7.891871929168701, "logps/chosen": -301.21710205078125, "logps/rejected": -268.6335754394531, "loss": 0.8614, "rewards/accuracies": 0.375, "rewards/chosen": 0.22740042209625244, "rewards/margins": -0.18442678451538086, "rewards/rejected": 0.4118272066116333, "step": 6085 }, { "epoch": 0.9411946646046782, "grad_norm": 5.366952419281006, "learning_rate": 3.812578760453661e-06, "logits/chosen": 13.178825378417969, "logits/rejected": 10.543661117553711, "logps/chosen": -234.63943481445312, "logps/rejected": -252.57705688476562, "loss": 0.6222, "rewards/accuracies": 0.5, "rewards/chosen": 0.3527628183364868, "rewards/margins": 0.2799367904663086, "rewards/rejected": 0.07282600551843643, "step": 6086 }, { "epoch": 0.9413493137444423, "grad_norm": 5.538469314575195, "learning_rate": 3.812292358803987e-06, "logits/chosen": 1.831620454788208, "logits/rejected": 4.772876262664795, "logps/chosen": -230.24127197265625, "logps/rejected": -295.0673828125, "loss": 0.6083, "rewards/accuracies": 0.625, "rewards/chosen": 0.005966514348983765, "rewards/margins": 0.242758110165596, "rewards/rejected": -0.23679161071777344, "step": 6087 }, { "epoch": 0.9415039628842065, "grad_norm": 3.7193217277526855, "learning_rate": 3.8120059571543133e-06, "logits/chosen": 7.188810348510742, "logits/rejected": 8.100234985351562, "logps/chosen": -219.13656616210938, "logps/rejected": -254.40017700195312, "loss": 0.4915, "rewards/accuracies": 0.625, "rewards/chosen": 0.2356186956167221, "rewards/margins": 0.5997982621192932, "rewards/rejected": -0.3641796112060547, "step": 6088 }, { "epoch": 0.9416586120239706, "grad_norm": 5.5721025466918945, "learning_rate": 3.81171955550464e-06, "logits/chosen": 7.171139717102051, "logits/rejected": 6.843477725982666, "logps/chosen": -338.8473815917969, "logps/rejected": -287.5997619628906, "loss": 0.6182, "rewards/accuracies": 0.5, "rewards/chosen": 0.6582000851631165, "rewards/margins": 0.37820136547088623, "rewards/rejected": 0.27999868988990784, "step": 6089 }, { "epoch": 0.9418132611637348, "grad_norm": 3.9159910678863525, "learning_rate": 3.8114331538549666e-06, "logits/chosen": 11.181341171264648, "logits/rejected": 4.574030876159668, "logps/chosen": -191.98849487304688, "logps/rejected": -146.96788024902344, "loss": 0.6727, "rewards/accuracies": 0.375, "rewards/chosen": 0.30301570892333984, "rewards/margins": 0.1540481299161911, "rewards/rejected": 0.14896757900714874, "step": 6090 }, { "epoch": 0.9419679103034989, "grad_norm": 7.600465774536133, "learning_rate": 3.8111467522052933e-06, "logits/chosen": 14.485508918762207, "logits/rejected": 12.474799156188965, "logps/chosen": -342.322998046875, "logps/rejected": -291.435791015625, "loss": 0.901, "rewards/accuracies": 0.375, "rewards/chosen": -0.10375480353832245, "rewards/margins": -0.2649831771850586, "rewards/rejected": 0.16122838854789734, "step": 6091 }, { "epoch": 0.9421225594432631, "grad_norm": 8.614843368530273, "learning_rate": 3.8108603505556195e-06, "logits/chosen": 7.101761817932129, "logits/rejected": 6.240938663482666, "logps/chosen": -339.32220458984375, "logps/rejected": -212.43060302734375, "loss": 0.8266, "rewards/accuracies": 0.5, "rewards/chosen": -0.04470086097717285, "rewards/margins": -0.16797193884849548, "rewards/rejected": 0.12327107787132263, "step": 6092 }, { "epoch": 0.9422772085830272, "grad_norm": 5.7017316818237305, "learning_rate": 3.810573948905946e-06, "logits/chosen": 4.3470892906188965, "logits/rejected": 7.0994720458984375, "logps/chosen": -192.86744689941406, "logps/rejected": -246.5413055419922, "loss": 0.8215, "rewards/accuracies": 0.5, "rewards/chosen": 0.24562174081802368, "rewards/margins": -0.1625586450099945, "rewards/rejected": 0.4081803560256958, "step": 6093 }, { "epoch": 0.9424318577227914, "grad_norm": 6.6136369705200195, "learning_rate": 3.8102875472562724e-06, "logits/chosen": 6.410834312438965, "logits/rejected": 10.590200424194336, "logps/chosen": -231.900390625, "logps/rejected": -253.52920532226562, "loss": 0.8541, "rewards/accuracies": 0.375, "rewards/chosen": 0.06740179657936096, "rewards/margins": -0.18243303894996643, "rewards/rejected": 0.2498348355293274, "step": 6094 }, { "epoch": 0.9425865068625556, "grad_norm": 4.242220878601074, "learning_rate": 3.810001145606599e-06, "logits/chosen": 9.677803039550781, "logits/rejected": 8.787566184997559, "logps/chosen": -178.03857421875, "logps/rejected": -151.91616821289062, "loss": 0.7275, "rewards/accuracies": 0.5, "rewards/chosen": 0.06072451174259186, "rewards/margins": 0.04228389263153076, "rewards/rejected": 0.0184406116604805, "step": 6095 }, { "epoch": 0.9427411560023198, "grad_norm": 4.677660942077637, "learning_rate": 3.8097147439569252e-06, "logits/chosen": 12.098503112792969, "logits/rejected": 9.779885292053223, "logps/chosen": -262.5589294433594, "logps/rejected": -266.28314208984375, "loss": 0.5837, "rewards/accuracies": 0.75, "rewards/chosen": 0.44803160429000854, "rewards/margins": 0.3180495500564575, "rewards/rejected": 0.1299821138381958, "step": 6096 }, { "epoch": 0.9428958051420839, "grad_norm": 5.403985500335693, "learning_rate": 3.809428342307252e-06, "logits/chosen": 7.742856979370117, "logits/rejected": 10.280571937561035, "logps/chosen": -233.95733642578125, "logps/rejected": -210.7534637451172, "loss": 0.6166, "rewards/accuracies": 0.625, "rewards/chosen": 0.4494001567363739, "rewards/margins": 0.24035492539405823, "rewards/rejected": 0.20904523134231567, "step": 6097 }, { "epoch": 0.9430504542818481, "grad_norm": 6.647812843322754, "learning_rate": 3.8091419406575785e-06, "logits/chosen": 9.193872451782227, "logits/rejected": 7.398804187774658, "logps/chosen": -319.1348876953125, "logps/rejected": -316.2547607421875, "loss": 0.5713, "rewards/accuracies": 0.75, "rewards/chosen": 0.35655656456947327, "rewards/margins": 0.5459267497062683, "rewards/rejected": -0.18937018513679504, "step": 6098 }, { "epoch": 0.9432051034216122, "grad_norm": 4.627180099487305, "learning_rate": 3.808855539007905e-06, "logits/chosen": 10.844181060791016, "logits/rejected": 9.718484878540039, "logps/chosen": -343.73284912109375, "logps/rejected": -208.0472412109375, "loss": 0.5383, "rewards/accuracies": 0.625, "rewards/chosen": 0.6060517430305481, "rewards/margins": 0.5087236166000366, "rewards/rejected": 0.0973280817270279, "step": 6099 }, { "epoch": 0.9433597525613764, "grad_norm": 4.272436141967773, "learning_rate": 3.808569137358232e-06, "logits/chosen": 15.333287239074707, "logits/rejected": 8.599126815795898, "logps/chosen": -409.5078125, "logps/rejected": -248.9840850830078, "loss": 0.5638, "rewards/accuracies": 0.625, "rewards/chosen": 0.5476647615432739, "rewards/margins": 0.40758851170539856, "rewards/rejected": 0.14007622003555298, "step": 6100 }, { "epoch": 0.9435144017011405, "grad_norm": 5.972644329071045, "learning_rate": 3.8082827357085576e-06, "logits/chosen": 12.754161834716797, "logits/rejected": 15.414602279663086, "logps/chosen": -317.50360107421875, "logps/rejected": -427.9605407714844, "loss": 0.682, "rewards/accuracies": 0.625, "rewards/chosen": 0.13516905903816223, "rewards/margins": 0.07610650360584259, "rewards/rejected": 0.059062570333480835, "step": 6101 }, { "epoch": 0.9436690508409047, "grad_norm": 5.319756031036377, "learning_rate": 3.8079963340588843e-06, "logits/chosen": 15.805214881896973, "logits/rejected": 11.866049766540527, "logps/chosen": -309.2769775390625, "logps/rejected": -225.6322021484375, "loss": 0.5775, "rewards/accuracies": 0.625, "rewards/chosen": 0.057625673711299896, "rewards/margins": 0.42817145586013794, "rewards/rejected": -0.37054574489593506, "step": 6102 }, { "epoch": 0.9438236999806688, "grad_norm": 6.35297966003418, "learning_rate": 3.807709932409211e-06, "logits/chosen": 10.88467788696289, "logits/rejected": 10.55798053741455, "logps/chosen": -336.02789306640625, "logps/rejected": -345.5227355957031, "loss": 0.5511, "rewards/accuracies": 0.75, "rewards/chosen": 0.5578911900520325, "rewards/margins": 0.3338162302970886, "rewards/rejected": 0.22407494485378265, "step": 6103 }, { "epoch": 0.943978349120433, "grad_norm": 5.715827465057373, "learning_rate": 3.8074235307595376e-06, "logits/chosen": 11.487932205200195, "logits/rejected": 8.79894733428955, "logps/chosen": -313.4056396484375, "logps/rejected": -209.98672485351562, "loss": 0.6531, "rewards/accuracies": 0.625, "rewards/chosen": 0.3565572500228882, "rewards/margins": 0.2715810239315033, "rewards/rejected": 0.08497625589370728, "step": 6104 }, { "epoch": 0.9441329982601971, "grad_norm": 5.52899169921875, "learning_rate": 3.8071371291098642e-06, "logits/chosen": 5.1163787841796875, "logits/rejected": 11.167240142822266, "logps/chosen": -175.9130859375, "logps/rejected": -261.94427490234375, "loss": 0.8449, "rewards/accuracies": 0.5, "rewards/chosen": 0.2729036211967468, "rewards/margins": -0.11569945514202118, "rewards/rejected": 0.3886030912399292, "step": 6105 }, { "epoch": 0.9442876473999613, "grad_norm": 4.608291149139404, "learning_rate": 3.806850727460191e-06, "logits/chosen": 12.11892318725586, "logits/rejected": 9.762636184692383, "logps/chosen": -295.04425048828125, "logps/rejected": -226.96890258789062, "loss": 0.5679, "rewards/accuracies": 0.75, "rewards/chosen": 0.2966315746307373, "rewards/margins": 0.4821425974369049, "rewards/rejected": -0.1855110228061676, "step": 6106 }, { "epoch": 0.9444422965397254, "grad_norm": 6.040639400482178, "learning_rate": 3.8065643258105167e-06, "logits/chosen": 13.903783798217773, "logits/rejected": 10.05005168914795, "logps/chosen": -280.8214111328125, "logps/rejected": -167.11900329589844, "loss": 0.6999, "rewards/accuracies": 0.375, "rewards/chosen": 0.19398802518844604, "rewards/margins": 0.08902202546596527, "rewards/rejected": 0.10496602952480316, "step": 6107 }, { "epoch": 0.9445969456794897, "grad_norm": 4.522506237030029, "learning_rate": 3.8062779241608434e-06, "logits/chosen": 6.287240505218506, "logits/rejected": 2.296971559524536, "logps/chosen": -287.56463623046875, "logps/rejected": -189.1895294189453, "loss": 0.5238, "rewards/accuracies": 0.875, "rewards/chosen": 0.4894564151763916, "rewards/margins": 0.45365530252456665, "rewards/rejected": 0.035801127552986145, "step": 6108 }, { "epoch": 0.9447515948192539, "grad_norm": 8.307732582092285, "learning_rate": 3.80599152251117e-06, "logits/chosen": 9.483098983764648, "logits/rejected": 9.91816520690918, "logps/chosen": -280.9747009277344, "logps/rejected": -264.1375732421875, "loss": 0.6628, "rewards/accuracies": 0.625, "rewards/chosen": 0.4822605848312378, "rewards/margins": 0.41404256224632263, "rewards/rejected": 0.06821804493665695, "step": 6109 }, { "epoch": 0.944906243959018, "grad_norm": 676.383056640625, "learning_rate": 3.8057051208614967e-06, "logits/chosen": 9.731789588928223, "logits/rejected": 9.675384521484375, "logps/chosen": -244.6284942626953, "logps/rejected": -293.3633117675781, "loss": 1.1458, "rewards/accuracies": 0.625, "rewards/chosen": 0.3688059449195862, "rewards/margins": 0.02286475896835327, "rewards/rejected": 0.3459411859512329, "step": 6110 }, { "epoch": 0.9450608930987822, "grad_norm": 5.440345764160156, "learning_rate": 3.805418719211823e-06, "logits/chosen": 6.476787567138672, "logits/rejected": 6.469910144805908, "logps/chosen": -195.49354553222656, "logps/rejected": -263.1617431640625, "loss": 0.6224, "rewards/accuracies": 0.75, "rewards/chosen": 0.3218001127243042, "rewards/margins": 0.48463016748428345, "rewards/rejected": -0.16283002495765686, "step": 6111 }, { "epoch": 0.9452155422385463, "grad_norm": 5.384084224700928, "learning_rate": 3.8051323175621495e-06, "logits/chosen": 10.614958763122559, "logits/rejected": 17.449613571166992, "logps/chosen": -210.1707763671875, "logps/rejected": -241.51287841796875, "loss": 0.7528, "rewards/accuracies": 0.375, "rewards/chosen": -0.14073190093040466, "rewards/margins": -0.02024676650762558, "rewards/rejected": -0.12048512697219849, "step": 6112 }, { "epoch": 0.9453701913783105, "grad_norm": 3.032682418823242, "learning_rate": 3.804845915912476e-06, "logits/chosen": 16.2205810546875, "logits/rejected": 10.663674354553223, "logps/chosen": -183.4769287109375, "logps/rejected": -158.5052032470703, "loss": 0.4465, "rewards/accuracies": 0.875, "rewards/chosen": 0.44304996728897095, "rewards/margins": 0.6766156554222107, "rewards/rejected": -0.23356571793556213, "step": 6113 }, { "epoch": 0.9455248405180746, "grad_norm": 3.896304130554199, "learning_rate": 3.8045595142628024e-06, "logits/chosen": 9.357583999633789, "logits/rejected": 5.256113052368164, "logps/chosen": -227.8314971923828, "logps/rejected": -113.90750122070312, "loss": 0.5927, "rewards/accuracies": 0.625, "rewards/chosen": 0.39485710859298706, "rewards/margins": 0.32727789878845215, "rewards/rejected": 0.0675792321562767, "step": 6114 }, { "epoch": 0.9456794896578388, "grad_norm": 5.4467878341674805, "learning_rate": 3.8042731126131286e-06, "logits/chosen": 13.243876457214355, "logits/rejected": 14.673941612243652, "logps/chosen": -239.44859313964844, "logps/rejected": -331.18841552734375, "loss": 0.8322, "rewards/accuracies": 0.625, "rewards/chosen": 0.19260179996490479, "rewards/margins": -0.10337914526462555, "rewards/rejected": 0.29598093032836914, "step": 6115 }, { "epoch": 0.9458341387976029, "grad_norm": 5.094240665435791, "learning_rate": 3.8039867109634553e-06, "logits/chosen": 7.527679443359375, "logits/rejected": 9.080537796020508, "logps/chosen": -299.16094970703125, "logps/rejected": -308.7109375, "loss": 0.6061, "rewards/accuracies": 0.75, "rewards/chosen": 0.5531708002090454, "rewards/margins": 0.29709121584892273, "rewards/rejected": 0.2560795843601227, "step": 6116 }, { "epoch": 0.9459887879373671, "grad_norm": 6.777413368225098, "learning_rate": 3.803700309313782e-06, "logits/chosen": 11.91340446472168, "logits/rejected": 9.278749465942383, "logps/chosen": -274.78533935546875, "logps/rejected": -275.62799072265625, "loss": 0.6441, "rewards/accuracies": 0.75, "rewards/chosen": 0.5145151019096375, "rewards/margins": 0.42700961232185364, "rewards/rejected": 0.0875055193901062, "step": 6117 }, { "epoch": 0.9461434370771312, "grad_norm": 4.660823345184326, "learning_rate": 3.8034139076641086e-06, "logits/chosen": 11.751721382141113, "logits/rejected": 9.275279998779297, "logps/chosen": -344.2777404785156, "logps/rejected": -276.0328063964844, "loss": 0.5992, "rewards/accuracies": 0.625, "rewards/chosen": 0.5876486897468567, "rewards/margins": 0.23932303488254547, "rewards/rejected": 0.34832563996315, "step": 6118 }, { "epoch": 0.9462980862168954, "grad_norm": 4.778783321380615, "learning_rate": 3.8031275060144352e-06, "logits/chosen": 4.699098587036133, "logits/rejected": 3.591595411300659, "logps/chosen": -241.07110595703125, "logps/rejected": -215.99533081054688, "loss": 0.5314, "rewards/accuracies": 0.875, "rewards/chosen": 0.42230841517448425, "rewards/margins": 0.5223271250724792, "rewards/rejected": -0.1000187024474144, "step": 6119 }, { "epoch": 0.9464527353566595, "grad_norm": 14.398313522338867, "learning_rate": 3.802841104364762e-06, "logits/chosen": 9.755953788757324, "logits/rejected": 3.4724786281585693, "logps/chosen": -384.8707275390625, "logps/rejected": -315.34637451171875, "loss": 0.489, "rewards/accuracies": 0.625, "rewards/chosen": 0.6170616149902344, "rewards/margins": 0.7081850171089172, "rewards/rejected": -0.09112338721752167, "step": 6120 }, { "epoch": 0.9466073844964238, "grad_norm": 6.513869762420654, "learning_rate": 3.8025547027150877e-06, "logits/chosen": 11.69550895690918, "logits/rejected": 9.572566032409668, "logps/chosen": -411.56231689453125, "logps/rejected": -394.0745544433594, "loss": 0.6479, "rewards/accuracies": 0.5, "rewards/chosen": 0.7219116687774658, "rewards/margins": 0.21830222010612488, "rewards/rejected": 0.5036094188690186, "step": 6121 }, { "epoch": 0.9467620336361879, "grad_norm": 9.223554611206055, "learning_rate": 3.8022683010654143e-06, "logits/chosen": 8.417282104492188, "logits/rejected": 1.5517919063568115, "logps/chosen": -401.50653076171875, "logps/rejected": -220.24920654296875, "loss": 0.7144, "rewards/accuracies": 0.375, "rewards/chosen": 0.5609298348426819, "rewards/margins": -0.005590818822383881, "rewards/rejected": 0.5665207505226135, "step": 6122 }, { "epoch": 0.9469166827759521, "grad_norm": 4.91739559173584, "learning_rate": 3.801981899415741e-06, "logits/chosen": 12.335090637207031, "logits/rejected": 8.075563430786133, "logps/chosen": -376.4961853027344, "logps/rejected": -298.4585876464844, "loss": 0.4626, "rewards/accuracies": 1.0, "rewards/chosen": 0.4171767830848694, "rewards/margins": 0.680549681186676, "rewards/rejected": -0.26337286829948425, "step": 6123 }, { "epoch": 0.9470713319157162, "grad_norm": 5.677758693695068, "learning_rate": 3.8016954977660676e-06, "logits/chosen": 7.166735649108887, "logits/rejected": 7.543004989624023, "logps/chosen": -225.05581665039062, "logps/rejected": -182.28384399414062, "loss": 0.6979, "rewards/accuracies": 0.625, "rewards/chosen": 0.07324963063001633, "rewards/margins": 0.08322267234325409, "rewards/rejected": -0.009973041713237762, "step": 6124 }, { "epoch": 0.9472259810554804, "grad_norm": 4.433850288391113, "learning_rate": 3.8014090961163943e-06, "logits/chosen": 11.126750946044922, "logits/rejected": 5.822546005249023, "logps/chosen": -173.7269744873047, "logps/rejected": -157.27572631835938, "loss": 0.5867, "rewards/accuracies": 0.625, "rewards/chosen": 0.15981268882751465, "rewards/margins": 0.26645272970199585, "rewards/rejected": -0.1066400557756424, "step": 6125 }, { "epoch": 0.9473806301952445, "grad_norm": 10.051560401916504, "learning_rate": 3.8011226944667205e-06, "logits/chosen": 11.969038963317871, "logits/rejected": 8.733071327209473, "logps/chosen": -423.1612548828125, "logps/rejected": -361.1314697265625, "loss": 0.8901, "rewards/accuracies": 0.25, "rewards/chosen": 0.006579972803592682, "rewards/margins": -0.19934165477752686, "rewards/rejected": 0.20592162013053894, "step": 6126 }, { "epoch": 0.9475352793350087, "grad_norm": 3.7855629920959473, "learning_rate": 3.8008362928170467e-06, "logits/chosen": 14.899744987487793, "logits/rejected": 5.31304931640625, "logps/chosen": -254.46495056152344, "logps/rejected": -168.96116638183594, "loss": 0.4594, "rewards/accuracies": 0.875, "rewards/chosen": 0.58144211769104, "rewards/margins": 0.6862486600875854, "rewards/rejected": -0.10480650514364243, "step": 6127 }, { "epoch": 0.9476899284747728, "grad_norm": 3.807384490966797, "learning_rate": 3.8005498911673734e-06, "logits/chosen": 12.657123565673828, "logits/rejected": 8.67054557800293, "logps/chosen": -226.19183349609375, "logps/rejected": -198.3279266357422, "loss": 0.5347, "rewards/accuracies": 0.875, "rewards/chosen": 0.42361658811569214, "rewards/margins": 0.40676647424697876, "rewards/rejected": 0.016850091516971588, "step": 6128 }, { "epoch": 0.947844577614537, "grad_norm": 5.2223005294799805, "learning_rate": 3.8002634895177e-06, "logits/chosen": 10.917251586914062, "logits/rejected": 5.476696968078613, "logps/chosen": -456.75872802734375, "logps/rejected": -325.453369140625, "loss": 0.4865, "rewards/accuracies": 0.75, "rewards/chosen": 0.7777464389801025, "rewards/margins": 0.8205711841583252, "rewards/rejected": -0.042824745178222656, "step": 6129 }, { "epoch": 0.9479992267543011, "grad_norm": 4.111788272857666, "learning_rate": 3.7999770878680263e-06, "logits/chosen": 9.898801803588867, "logits/rejected": 9.476448059082031, "logps/chosen": -158.30126953125, "logps/rejected": -193.73118591308594, "loss": 0.6639, "rewards/accuracies": 0.625, "rewards/chosen": 0.146633580327034, "rewards/margins": 0.10084643959999084, "rewards/rejected": 0.04578714072704315, "step": 6130 }, { "epoch": 0.9481538758940653, "grad_norm": 4.172186374664307, "learning_rate": 3.799690686218353e-06, "logits/chosen": 6.938317775726318, "logits/rejected": 3.4027481079101562, "logps/chosen": -217.08563232421875, "logps/rejected": -181.32907104492188, "loss": 0.4901, "rewards/accuracies": 0.875, "rewards/chosen": 0.3057669401168823, "rewards/margins": 0.6542397737503052, "rewards/rejected": -0.34847280383110046, "step": 6131 }, { "epoch": 0.9483085250338295, "grad_norm": 4.117425918579102, "learning_rate": 3.7994042845686796e-06, "logits/chosen": 10.764884948730469, "logits/rejected": 6.007167816162109, "logps/chosen": -184.376220703125, "logps/rejected": -184.09844970703125, "loss": 0.635, "rewards/accuracies": 0.75, "rewards/chosen": 0.32048383355140686, "rewards/margins": 0.19168972969055176, "rewards/rejected": 0.1287941038608551, "step": 6132 }, { "epoch": 0.9484631741735936, "grad_norm": 4.116074562072754, "learning_rate": 3.7991178829190062e-06, "logits/chosen": 12.23703384399414, "logits/rejected": 8.825287818908691, "logps/chosen": -320.6624450683594, "logps/rejected": -288.0609436035156, "loss": 0.446, "rewards/accuracies": 0.875, "rewards/chosen": 0.4983493685722351, "rewards/margins": 0.6485175490379333, "rewards/rejected": -0.15016812086105347, "step": 6133 }, { "epoch": 0.9486178233133579, "grad_norm": 4.125493049621582, "learning_rate": 3.798831481269332e-06, "logits/chosen": 18.443208694458008, "logits/rejected": 12.027984619140625, "logps/chosen": -235.64654541015625, "logps/rejected": -183.31814575195312, "loss": 0.6387, "rewards/accuracies": 0.75, "rewards/chosen": 0.3844262957572937, "rewards/margins": 0.22453317046165466, "rewards/rejected": 0.15989314019680023, "step": 6134 }, { "epoch": 0.948772472453122, "grad_norm": 7.958446502685547, "learning_rate": 3.7985450796196587e-06, "logits/chosen": 8.696544647216797, "logits/rejected": 9.067924499511719, "logps/chosen": -302.22381591796875, "logps/rejected": -315.93084716796875, "loss": 0.8832, "rewards/accuracies": 0.25, "rewards/chosen": 0.1506291925907135, "rewards/margins": -0.26111355423927307, "rewards/rejected": 0.4117427468299866, "step": 6135 }, { "epoch": 0.9489271215928862, "grad_norm": 10.748108863830566, "learning_rate": 3.7982586779699853e-06, "logits/chosen": 10.577842712402344, "logits/rejected": 6.776095390319824, "logps/chosen": -488.1216735839844, "logps/rejected": -299.84271240234375, "loss": 0.7171, "rewards/accuracies": 0.625, "rewards/chosen": 0.23031625151634216, "rewards/margins": 0.13431869447231293, "rewards/rejected": 0.09599751234054565, "step": 6136 }, { "epoch": 0.9490817707326503, "grad_norm": 4.6449503898620605, "learning_rate": 3.797972276320312e-06, "logits/chosen": 9.781865119934082, "logits/rejected": 13.294943809509277, "logps/chosen": -247.5777130126953, "logps/rejected": -308.8779602050781, "loss": 0.605, "rewards/accuracies": 0.625, "rewards/chosen": 0.3307187557220459, "rewards/margins": 0.2628345191478729, "rewards/rejected": 0.06788422167301178, "step": 6137 }, { "epoch": 0.9492364198724145, "grad_norm": 6.828583240509033, "learning_rate": 3.7976858746706386e-06, "logits/chosen": 16.520654678344727, "logits/rejected": 7.7576212882995605, "logps/chosen": -311.0717468261719, "logps/rejected": -252.43626403808594, "loss": 0.714, "rewards/accuracies": 0.5, "rewards/chosen": 0.4419318735599518, "rewards/margins": 0.2562493681907654, "rewards/rejected": 0.18568256497383118, "step": 6138 }, { "epoch": 0.9493910690121786, "grad_norm": 7.355157852172852, "learning_rate": 3.7973994730209653e-06, "logits/chosen": 7.037096977233887, "logits/rejected": 5.259735107421875, "logps/chosen": -355.94232177734375, "logps/rejected": -266.8430480957031, "loss": 0.5736, "rewards/accuracies": 0.75, "rewards/chosen": 0.23144692182540894, "rewards/margins": 0.4259180426597595, "rewards/rejected": -0.19447113573551178, "step": 6139 }, { "epoch": 0.9495457181519428, "grad_norm": 3.692997455596924, "learning_rate": 3.797113071371291e-06, "logits/chosen": 12.380500793457031, "logits/rejected": 6.629700660705566, "logps/chosen": -140.75379943847656, "logps/rejected": -127.47859191894531, "loss": 0.6221, "rewards/accuracies": 0.625, "rewards/chosen": -0.06310685724020004, "rewards/margins": 0.17889685928821564, "rewards/rejected": -0.24200373888015747, "step": 6140 }, { "epoch": 0.9497003672917069, "grad_norm": 4.56638240814209, "learning_rate": 3.7968266697216177e-06, "logits/chosen": 8.284974098205566, "logits/rejected": 8.240362167358398, "logps/chosen": -230.05026245117188, "logps/rejected": -264.61456298828125, "loss": 0.6489, "rewards/accuracies": 0.75, "rewards/chosen": 0.2506457567214966, "rewards/margins": 0.37028414011001587, "rewards/rejected": -0.11963840574026108, "step": 6141 }, { "epoch": 0.9498550164314711, "grad_norm": 4.7954607009887695, "learning_rate": 3.7965402680719444e-06, "logits/chosen": 12.753697395324707, "logits/rejected": 9.30424690246582, "logps/chosen": -320.78839111328125, "logps/rejected": -298.1253662109375, "loss": 0.4109, "rewards/accuracies": 1.0, "rewards/chosen": 0.5831942558288574, "rewards/margins": 0.7153184413909912, "rewards/rejected": -0.1321241855621338, "step": 6142 }, { "epoch": 0.9500096655712352, "grad_norm": 5.0751447677612305, "learning_rate": 3.796253866422271e-06, "logits/chosen": 12.804666519165039, "logits/rejected": 11.85648250579834, "logps/chosen": -305.0457763671875, "logps/rejected": -298.6139831542969, "loss": 0.6151, "rewards/accuracies": 0.625, "rewards/chosen": 0.5067696571350098, "rewards/margins": 0.4775269329547882, "rewards/rejected": 0.02924273908138275, "step": 6143 }, { "epoch": 0.9501643147109994, "grad_norm": 5.078519344329834, "learning_rate": 3.7959674647725977e-06, "logits/chosen": 10.292719841003418, "logits/rejected": 3.6246705055236816, "logps/chosen": -258.091064453125, "logps/rejected": -162.05177307128906, "loss": 0.6891, "rewards/accuracies": 0.5, "rewards/chosen": 0.010790683329105377, "rewards/margins": 0.0842796117067337, "rewards/rejected": -0.07348892092704773, "step": 6144 }, { "epoch": 0.9503189638507635, "grad_norm": 4.760468482971191, "learning_rate": 3.795681063122924e-06, "logits/chosen": 8.143221855163574, "logits/rejected": 7.942566394805908, "logps/chosen": -381.5123596191406, "logps/rejected": -391.5032043457031, "loss": 0.4413, "rewards/accuracies": 1.0, "rewards/chosen": 0.4306606948375702, "rewards/margins": 0.6841031312942505, "rewards/rejected": -0.25344234704971313, "step": 6145 }, { "epoch": 0.9504736129905278, "grad_norm": 4.661147117614746, "learning_rate": 3.7953946614732506e-06, "logits/chosen": 13.569191932678223, "logits/rejected": 10.540250778198242, "logps/chosen": -283.15802001953125, "logps/rejected": -212.72402954101562, "loss": 0.6584, "rewards/accuracies": 0.375, "rewards/chosen": 0.4299682676792145, "rewards/margins": 0.12703828513622284, "rewards/rejected": 0.30292996764183044, "step": 6146 }, { "epoch": 0.950628262130292, "grad_norm": 6.309093952178955, "learning_rate": 3.795108259823577e-06, "logits/chosen": 5.467181205749512, "logits/rejected": 7.628287315368652, "logps/chosen": -189.5636444091797, "logps/rejected": -207.5373992919922, "loss": 0.7883, "rewards/accuracies": 0.375, "rewards/chosen": -0.017991885542869568, "rewards/margins": -0.1218787431716919, "rewards/rejected": 0.10388685762882233, "step": 6147 }, { "epoch": 0.9507829112700561, "grad_norm": 8.071401596069336, "learning_rate": 3.7948218581739034e-06, "logits/chosen": 16.6971435546875, "logits/rejected": 5.629417419433594, "logps/chosen": -382.31268310546875, "logps/rejected": -348.284423828125, "loss": 0.6747, "rewards/accuracies": 0.625, "rewards/chosen": 0.20832709968090057, "rewards/margins": 0.4912237823009491, "rewards/rejected": -0.2828966975212097, "step": 6148 }, { "epoch": 0.9509375604098202, "grad_norm": 3.390028715133667, "learning_rate": 3.7945354565242297e-06, "logits/chosen": 8.839740753173828, "logits/rejected": 9.218704223632812, "logps/chosen": -233.2584991455078, "logps/rejected": -196.21310424804688, "loss": 0.4673, "rewards/accuracies": 1.0, "rewards/chosen": 0.18338054418563843, "rewards/margins": 0.6705355644226074, "rewards/rejected": -0.4871550500392914, "step": 6149 }, { "epoch": 0.9510922095495844, "grad_norm": 6.178202152252197, "learning_rate": 3.7942490548745563e-06, "logits/chosen": 13.983511924743652, "logits/rejected": 0.584989070892334, "logps/chosen": -532.3519287109375, "logps/rejected": -201.24252319335938, "loss": 0.4103, "rewards/accuracies": 1.0, "rewards/chosen": 0.8076561093330383, "rewards/margins": 0.8006258606910706, "rewards/rejected": 0.0070302411913871765, "step": 6150 }, { "epoch": 0.9512468586893486, "grad_norm": 8.772147178649902, "learning_rate": 3.793962653224883e-06, "logits/chosen": 9.899749755859375, "logits/rejected": 10.163934707641602, "logps/chosen": -356.1944580078125, "logps/rejected": -356.65728759765625, "loss": 0.6872, "rewards/accuracies": 0.625, "rewards/chosen": 0.6404454708099365, "rewards/margins": 0.06865201890468597, "rewards/rejected": 0.5717933773994446, "step": 6151 }, { "epoch": 0.9514015078291127, "grad_norm": 5.855581283569336, "learning_rate": 3.7936762515752096e-06, "logits/chosen": 9.701614379882812, "logits/rejected": 3.668771266937256, "logps/chosen": -521.6398315429688, "logps/rejected": -287.03717041015625, "loss": 0.5234, "rewards/accuracies": 0.75, "rewards/chosen": 0.7385131120681763, "rewards/margins": 0.5150148272514343, "rewards/rejected": 0.22349828481674194, "step": 6152 }, { "epoch": 0.9515561569688769, "grad_norm": 5.712414264678955, "learning_rate": 3.7933898499255354e-06, "logits/chosen": 11.184454917907715, "logits/rejected": 10.683685302734375, "logps/chosen": -225.55450439453125, "logps/rejected": -198.64552307128906, "loss": 0.5853, "rewards/accuracies": 0.625, "rewards/chosen": 0.03424198180437088, "rewards/margins": 0.3246713876724243, "rewards/rejected": -0.29042938351631165, "step": 6153 }, { "epoch": 0.951710806108641, "grad_norm": 4.311692714691162, "learning_rate": 3.793103448275862e-06, "logits/chosen": 8.396364212036133, "logits/rejected": 4.56004524230957, "logps/chosen": -221.5716552734375, "logps/rejected": -204.5906524658203, "loss": 0.6164, "rewards/accuracies": 0.625, "rewards/chosen": 0.3391536772251129, "rewards/margins": 0.42422419786453247, "rewards/rejected": -0.08507047593593597, "step": 6154 }, { "epoch": 0.9518654552484052, "grad_norm": 5.062068462371826, "learning_rate": 3.7928170466261887e-06, "logits/chosen": 16.395055770874023, "logits/rejected": 11.11180305480957, "logps/chosen": -361.14288330078125, "logps/rejected": -298.46856689453125, "loss": 0.6231, "rewards/accuracies": 0.625, "rewards/chosen": 0.32938405871391296, "rewards/margins": 0.5155172944068909, "rewards/rejected": -0.1861332654953003, "step": 6155 }, { "epoch": 0.9520201043881693, "grad_norm": 4.368488788604736, "learning_rate": 3.7925306449765154e-06, "logits/chosen": 17.074222564697266, "logits/rejected": 16.759662628173828, "logps/chosen": -274.2134094238281, "logps/rejected": -220.26956176757812, "loss": 0.6121, "rewards/accuracies": 0.5, "rewards/chosen": 0.4574798345565796, "rewards/margins": 0.2169291377067566, "rewards/rejected": 0.2405507117509842, "step": 6156 }, { "epoch": 0.9521747535279335, "grad_norm": 5.707125663757324, "learning_rate": 3.792244243326842e-06, "logits/chosen": 10.741851806640625, "logits/rejected": 7.960721015930176, "logps/chosen": -227.63397216796875, "logps/rejected": -239.238037109375, "loss": 0.7078, "rewards/accuracies": 0.625, "rewards/chosen": 0.4618026912212372, "rewards/margins": 0.06386394798755646, "rewards/rejected": 0.39793872833251953, "step": 6157 }, { "epoch": 0.9523294026676976, "grad_norm": 5.398306369781494, "learning_rate": 3.7919578416771687e-06, "logits/chosen": 11.080409049987793, "logits/rejected": 11.525304794311523, "logps/chosen": -238.9967041015625, "logps/rejected": -221.96751403808594, "loss": 0.756, "rewards/accuracies": 0.5, "rewards/chosen": 0.04042952135205269, "rewards/margins": 0.07515348494052887, "rewards/rejected": -0.03472394496202469, "step": 6158 }, { "epoch": 0.9524840518074619, "grad_norm": 5.277289867401123, "learning_rate": 3.791671440027495e-06, "logits/chosen": 10.193986892700195, "logits/rejected": 5.847362041473389, "logps/chosen": -267.63067626953125, "logps/rejected": -198.78640747070312, "loss": 0.5537, "rewards/accuracies": 0.625, "rewards/chosen": 0.33028534054756165, "rewards/margins": 0.4747997522354126, "rewards/rejected": -0.14451441168785095, "step": 6159 }, { "epoch": 0.952638700947226, "grad_norm": 4.350001335144043, "learning_rate": 3.791385038377821e-06, "logits/chosen": 12.001304626464844, "logits/rejected": 4.2634968757629395, "logps/chosen": -276.31109619140625, "logps/rejected": -235.59461975097656, "loss": 0.4967, "rewards/accuracies": 0.875, "rewards/chosen": 0.12909665703773499, "rewards/margins": 0.5830200910568237, "rewards/rejected": -0.45392337441444397, "step": 6160 }, { "epoch": 0.9527933500869902, "grad_norm": 4.104081630706787, "learning_rate": 3.7910986367281478e-06, "logits/chosen": 12.183959007263184, "logits/rejected": 8.322110176086426, "logps/chosen": -262.2836608886719, "logps/rejected": -221.3241729736328, "loss": 0.549, "rewards/accuracies": 0.75, "rewards/chosen": 0.3626824617385864, "rewards/margins": 0.41054120659828186, "rewards/rejected": -0.04785875231027603, "step": 6161 }, { "epoch": 0.9529479992267543, "grad_norm": 5.812829971313477, "learning_rate": 3.7908122350784744e-06, "logits/chosen": 12.200241088867188, "logits/rejected": 1.9696413278579712, "logps/chosen": -390.1821594238281, "logps/rejected": -200.35595703125, "loss": 0.5202, "rewards/accuracies": 0.625, "rewards/chosen": 0.5052471160888672, "rewards/margins": 0.7562521696090698, "rewards/rejected": -0.2510051131248474, "step": 6162 }, { "epoch": 0.9531026483665185, "grad_norm": 6.0550537109375, "learning_rate": 3.790525833428801e-06, "logits/chosen": 4.950176239013672, "logits/rejected": 5.137077331542969, "logps/chosen": -167.5026092529297, "logps/rejected": -226.3450927734375, "loss": 0.6834, "rewards/accuracies": 0.625, "rewards/chosen": 0.12146544456481934, "rewards/margins": 0.12269788235425949, "rewards/rejected": -0.0012324228882789612, "step": 6163 }, { "epoch": 0.9532572975062826, "grad_norm": 4.608234882354736, "learning_rate": 3.7902394317791273e-06, "logits/chosen": 10.137441635131836, "logits/rejected": 12.505643844604492, "logps/chosen": -195.520263671875, "logps/rejected": -243.80435180664062, "loss": 0.6525, "rewards/accuracies": 0.5, "rewards/chosen": 0.20495612919330597, "rewards/margins": 0.1427948772907257, "rewards/rejected": 0.06216125190258026, "step": 6164 }, { "epoch": 0.9534119466460468, "grad_norm": 14.737418174743652, "learning_rate": 3.789953030129454e-06, "logits/chosen": 9.711441993713379, "logits/rejected": 13.04696273803711, "logps/chosen": -274.9349060058594, "logps/rejected": -430.790771484375, "loss": 0.7203, "rewards/accuracies": 0.5, "rewards/chosen": 0.6060918569564819, "rewards/margins": 0.20980969071388245, "rewards/rejected": 0.3962821960449219, "step": 6165 }, { "epoch": 0.9535665957858109, "grad_norm": 5.359762668609619, "learning_rate": 3.7896666284797806e-06, "logits/chosen": 12.074807167053223, "logits/rejected": 4.143256187438965, "logps/chosen": -400.29571533203125, "logps/rejected": -267.4970397949219, "loss": 0.5314, "rewards/accuracies": 0.875, "rewards/chosen": 0.6141083836555481, "rewards/margins": 0.5339053869247437, "rewards/rejected": 0.08020296692848206, "step": 6166 }, { "epoch": 0.9537212449255751, "grad_norm": 3.6389474868774414, "learning_rate": 3.789380226830107e-06, "logits/chosen": 9.88829517364502, "logits/rejected": 8.14225959777832, "logps/chosen": -225.5856475830078, "logps/rejected": -222.03958129882812, "loss": 0.5914, "rewards/accuracies": 0.5, "rewards/chosen": -0.012787438929080963, "rewards/margins": 0.5036100149154663, "rewards/rejected": -0.5163974761962891, "step": 6167 }, { "epoch": 0.9538758940653392, "grad_norm": 5.345178127288818, "learning_rate": 3.789093825180433e-06, "logits/chosen": 5.077747344970703, "logits/rejected": 10.080281257629395, "logps/chosen": -220.88548278808594, "logps/rejected": -289.1187744140625, "loss": 0.7962, "rewards/accuracies": 0.25, "rewards/chosen": -0.07675837725400925, "rewards/margins": -0.1315731555223465, "rewards/rejected": 0.05481477081775665, "step": 6168 }, { "epoch": 0.9540305432051034, "grad_norm": 5.9572038650512695, "learning_rate": 3.7888074235307597e-06, "logits/chosen": 8.516749382019043, "logits/rejected": 5.46872091293335, "logps/chosen": -235.18028259277344, "logps/rejected": -214.20684814453125, "loss": 0.6963, "rewards/accuracies": 0.5, "rewards/chosen": -0.038121797144412994, "rewards/margins": 0.22291134297847748, "rewards/rejected": -0.26103317737579346, "step": 6169 }, { "epoch": 0.9541851923448675, "grad_norm": 5.583144187927246, "learning_rate": 3.7885210218810864e-06, "logits/chosen": 8.929736137390137, "logits/rejected": 6.0205488204956055, "logps/chosen": -301.0149230957031, "logps/rejected": -250.39556884765625, "loss": 0.7552, "rewards/accuracies": 0.375, "rewards/chosen": 0.10005579888820648, "rewards/margins": 0.1535303145647049, "rewards/rejected": -0.053474511951208115, "step": 6170 }, { "epoch": 0.9543398414846317, "grad_norm": 5.039641380310059, "learning_rate": 3.788234620231413e-06, "logits/chosen": 8.63708209991455, "logits/rejected": 6.5884857177734375, "logps/chosen": -256.0694885253906, "logps/rejected": -198.71670532226562, "loss": 0.5418, "rewards/accuracies": 0.75, "rewards/chosen": 0.13345059752464294, "rewards/margins": 0.4428868591785431, "rewards/rejected": -0.30943626165390015, "step": 6171 }, { "epoch": 0.954494490624396, "grad_norm": 8.050763130187988, "learning_rate": 3.7879482185817397e-06, "logits/chosen": 10.67708683013916, "logits/rejected": 2.730160713195801, "logps/chosen": -350.8143005371094, "logps/rejected": -204.71275329589844, "loss": 0.7992, "rewards/accuracies": 0.75, "rewards/chosen": -0.09011248499155045, "rewards/margins": 0.10813485085964203, "rewards/rejected": -0.19824734330177307, "step": 6172 }, { "epoch": 0.9546491397641601, "grad_norm": 7.7803473472595215, "learning_rate": 3.7876618169320655e-06, "logits/chosen": 10.17917251586914, "logits/rejected": 8.727293014526367, "logps/chosen": -297.3963317871094, "logps/rejected": -324.045654296875, "loss": 0.6007, "rewards/accuracies": 0.75, "rewards/chosen": 0.650406539440155, "rewards/margins": 0.38630932569503784, "rewards/rejected": 0.2640972137451172, "step": 6173 }, { "epoch": 0.9548037889039243, "grad_norm": 5.165964126586914, "learning_rate": 3.787375415282392e-06, "logits/chosen": 10.358448028564453, "logits/rejected": 6.400697231292725, "logps/chosen": -265.35589599609375, "logps/rejected": -244.11416625976562, "loss": 0.6577, "rewards/accuracies": 0.625, "rewards/chosen": 0.040180496871471405, "rewards/margins": 0.28275009989738464, "rewards/rejected": -0.24256959557533264, "step": 6174 }, { "epoch": 0.9549584380436884, "grad_norm": 6.588400840759277, "learning_rate": 3.7870890136327188e-06, "logits/chosen": 5.818270206451416, "logits/rejected": 7.853864669799805, "logps/chosen": -174.64486694335938, "logps/rejected": -222.53135681152344, "loss": 0.5996, "rewards/accuracies": 0.5, "rewards/chosen": -0.05821056663990021, "rewards/margins": 0.34612321853637695, "rewards/rejected": -0.40433377027511597, "step": 6175 }, { "epoch": 0.9551130871834526, "grad_norm": 5.107873439788818, "learning_rate": 3.7868026119830454e-06, "logits/chosen": 8.837618827819824, "logits/rejected": 6.268115043640137, "logps/chosen": -227.54653930664062, "logps/rejected": -236.0309295654297, "loss": 0.6414, "rewards/accuracies": 0.5, "rewards/chosen": 0.15765684843063354, "rewards/margins": 0.2781776189804077, "rewards/rejected": -0.12052074819803238, "step": 6176 }, { "epoch": 0.9552677363232167, "grad_norm": 4.786239147186279, "learning_rate": 3.786516210333372e-06, "logits/chosen": 1.8756210803985596, "logits/rejected": -2.148780107498169, "logps/chosen": -235.91082763671875, "logps/rejected": -128.50123596191406, "loss": 0.7035, "rewards/accuracies": 0.5, "rewards/chosen": -0.1964249610900879, "rewards/margins": 0.03667372837662697, "rewards/rejected": -0.23309871554374695, "step": 6177 }, { "epoch": 0.9554223854629809, "grad_norm": 10.871891021728516, "learning_rate": 3.7862298086836983e-06, "logits/chosen": 7.380008697509766, "logits/rejected": 2.810032606124878, "logps/chosen": -306.8412780761719, "logps/rejected": -235.4676513671875, "loss": 0.5255, "rewards/accuracies": 0.75, "rewards/chosen": 0.22980064153671265, "rewards/margins": 0.43018412590026855, "rewards/rejected": -0.20038345456123352, "step": 6178 }, { "epoch": 0.955577034602745, "grad_norm": 4.260034561157227, "learning_rate": 3.785943407034025e-06, "logits/chosen": 8.592061042785645, "logits/rejected": 8.915990829467773, "logps/chosen": -356.1918029785156, "logps/rejected": -364.6982421875, "loss": 0.4934, "rewards/accuracies": 0.75, "rewards/chosen": 0.6894409656524658, "rewards/margins": 0.5635682344436646, "rewards/rejected": 0.12587273120880127, "step": 6179 }, { "epoch": 0.9557316837425092, "grad_norm": 6.339696884155273, "learning_rate": 3.785657005384351e-06, "logits/chosen": 9.19437026977539, "logits/rejected": 3.1364874839782715, "logps/chosen": -212.06027221679688, "logps/rejected": -192.9576416015625, "loss": 0.7342, "rewards/accuracies": 0.375, "rewards/chosen": 0.2538849413394928, "rewards/margins": 0.009813845157623291, "rewards/rejected": 0.2440710961818695, "step": 6180 }, { "epoch": 0.9558863328822733, "grad_norm": 5.144402027130127, "learning_rate": 3.785370603734678e-06, "logits/chosen": 9.853436470031738, "logits/rejected": 10.003225326538086, "logps/chosen": -89.36921691894531, "logps/rejected": -147.92208862304688, "loss": 0.8273, "rewards/accuracies": 0.375, "rewards/chosen": -0.24263063073158264, "rewards/margins": -0.17359329760074615, "rewards/rejected": -0.06903731822967529, "step": 6181 }, { "epoch": 0.9560409820220375, "grad_norm": 5.819518089294434, "learning_rate": 3.7850842020850045e-06, "logits/chosen": 7.742002487182617, "logits/rejected": 9.171527862548828, "logps/chosen": -275.7296142578125, "logps/rejected": -245.9471435546875, "loss": 0.8142, "rewards/accuracies": 0.5, "rewards/chosen": 0.04904394969344139, "rewards/margins": -0.1914726197719574, "rewards/rejected": 0.2405165433883667, "step": 6182 }, { "epoch": 0.9561956311618016, "grad_norm": 5.64265251159668, "learning_rate": 3.7847978004353307e-06, "logits/chosen": 7.825226306915283, "logits/rejected": 6.729127883911133, "logps/chosen": -238.1453399658203, "logps/rejected": -211.0230712890625, "loss": 0.6599, "rewards/accuracies": 0.75, "rewards/chosen": 0.34128665924072266, "rewards/margins": 0.2665756344795227, "rewards/rejected": 0.07471100986003876, "step": 6183 }, { "epoch": 0.9563502803015658, "grad_norm": 5.056849002838135, "learning_rate": 3.7845113987856574e-06, "logits/chosen": 12.205657005310059, "logits/rejected": 6.569552421569824, "logps/chosen": -350.52886962890625, "logps/rejected": -301.33282470703125, "loss": 0.7055, "rewards/accuracies": 0.5, "rewards/chosen": 0.31120777130126953, "rewards/margins": 0.4939754903316498, "rewards/rejected": -0.18276770412921906, "step": 6184 }, { "epoch": 0.95650492944133, "grad_norm": 3.610272169113159, "learning_rate": 3.784224997135984e-06, "logits/chosen": 8.418853759765625, "logits/rejected": 3.3167402744293213, "logps/chosen": -216.83267211914062, "logps/rejected": -166.833984375, "loss": 0.4795, "rewards/accuracies": 0.75, "rewards/chosen": 0.015908867120742798, "rewards/margins": 0.7096530795097351, "rewards/rejected": -0.6937441825866699, "step": 6185 }, { "epoch": 0.9566595785810942, "grad_norm": 6.288115978240967, "learning_rate": 3.7839385954863102e-06, "logits/chosen": 8.40555477142334, "logits/rejected": 11.845731735229492, "logps/chosen": -384.36956787109375, "logps/rejected": -371.1394958496094, "loss": 0.795, "rewards/accuracies": 0.5, "rewards/chosen": 0.3408474028110504, "rewards/margins": -0.05401533842086792, "rewards/rejected": 0.3948627710342407, "step": 6186 }, { "epoch": 0.9568142277208583, "grad_norm": 4.930776596069336, "learning_rate": 3.7836521938366365e-06, "logits/chosen": 12.268728256225586, "logits/rejected": 13.093002319335938, "logps/chosen": -280.7565612792969, "logps/rejected": -270.92138671875, "loss": 0.5424, "rewards/accuracies": 0.75, "rewards/chosen": 0.4865604043006897, "rewards/margins": 0.3804599940776825, "rewards/rejected": 0.10610037297010422, "step": 6187 }, { "epoch": 0.9569688768606225, "grad_norm": 4.21857213973999, "learning_rate": 3.783365792186963e-06, "logits/chosen": 11.802602767944336, "logits/rejected": 8.7615327835083, "logps/chosen": -397.9958801269531, "logps/rejected": -360.99920654296875, "loss": 0.548, "rewards/accuracies": 0.875, "rewards/chosen": 0.4543735980987549, "rewards/margins": 0.47703033685684204, "rewards/rejected": -0.022656723856925964, "step": 6188 }, { "epoch": 0.9571235260003866, "grad_norm": 3.6183483600616455, "learning_rate": 3.7830793905372898e-06, "logits/chosen": 13.680797576904297, "logits/rejected": 6.239417552947998, "logps/chosen": -246.63763427734375, "logps/rejected": -165.08819580078125, "loss": 0.5639, "rewards/accuracies": 0.75, "rewards/chosen": 0.004466649144887924, "rewards/margins": 0.5195215940475464, "rewards/rejected": -0.5150550007820129, "step": 6189 }, { "epoch": 0.9572781751401508, "grad_norm": 4.1929497718811035, "learning_rate": 3.7827929888876164e-06, "logits/chosen": 15.130483627319336, "logits/rejected": 6.626381874084473, "logps/chosen": -229.71731567382812, "logps/rejected": -158.8823699951172, "loss": 0.4868, "rewards/accuracies": 0.625, "rewards/chosen": 0.3587179183959961, "rewards/margins": 0.6306701898574829, "rewards/rejected": -0.27195224165916443, "step": 6190 }, { "epoch": 0.9574328242799149, "grad_norm": 2.6293959617614746, "learning_rate": 3.782506587237943e-06, "logits/chosen": 9.32863712310791, "logits/rejected": 6.863626003265381, "logps/chosen": -132.62428283691406, "logps/rejected": -149.93482971191406, "loss": 0.4542, "rewards/accuracies": 1.0, "rewards/chosen": -0.05582243204116821, "rewards/margins": 0.7524368166923523, "rewards/rejected": -0.8082591891288757, "step": 6191 }, { "epoch": 0.9575874734196791, "grad_norm": 6.161720275878906, "learning_rate": 3.7822201855882697e-06, "logits/chosen": 9.685081481933594, "logits/rejected": 8.40473461151123, "logps/chosen": -318.6549072265625, "logps/rejected": -264.1510009765625, "loss": 0.672, "rewards/accuracies": 0.75, "rewards/chosen": 0.1546720564365387, "rewards/margins": 0.23635368049144745, "rewards/rejected": -0.08168165385723114, "step": 6192 }, { "epoch": 0.9577421225594432, "grad_norm": 5.166300296783447, "learning_rate": 3.7819337839385955e-06, "logits/chosen": 9.75143814086914, "logits/rejected": 14.300973892211914, "logps/chosen": -264.825927734375, "logps/rejected": -337.2331848144531, "loss": 0.6335, "rewards/accuracies": 0.75, "rewards/chosen": 0.7513271570205688, "rewards/margins": 0.26507991552352905, "rewards/rejected": 0.4862472116947174, "step": 6193 }, { "epoch": 0.9578967716992074, "grad_norm": 4.775003910064697, "learning_rate": 3.781647382288922e-06, "logits/chosen": 15.092397689819336, "logits/rejected": 7.727599143981934, "logps/chosen": -329.2802734375, "logps/rejected": -236.48779296875, "loss": 0.4964, "rewards/accuracies": 0.875, "rewards/chosen": 0.3986269235610962, "rewards/margins": 0.5121880173683167, "rewards/rejected": -0.11356109380722046, "step": 6194 }, { "epoch": 0.9580514208389715, "grad_norm": 5.808637619018555, "learning_rate": 3.781360980639249e-06, "logits/chosen": 11.492547035217285, "logits/rejected": 10.122631072998047, "logps/chosen": -434.10394287109375, "logps/rejected": -356.5558166503906, "loss": 0.599, "rewards/accuracies": 0.75, "rewards/chosen": 0.2299526184797287, "rewards/margins": 0.31451255083084106, "rewards/rejected": -0.08455993235111237, "step": 6195 }, { "epoch": 0.9582060699787357, "grad_norm": 5.995190143585205, "learning_rate": 3.7810745789895755e-06, "logits/chosen": 16.781414031982422, "logits/rejected": 18.06316375732422, "logps/chosen": -212.47698974609375, "logps/rejected": -326.875, "loss": 0.7496, "rewards/accuracies": 0.375, "rewards/chosen": 0.13011418282985687, "rewards/margins": -0.05014096572995186, "rewards/rejected": 0.18025514483451843, "step": 6196 }, { "epoch": 0.9583607191184998, "grad_norm": 3.9708070755004883, "learning_rate": 3.7807881773399017e-06, "logits/chosen": 12.563114166259766, "logits/rejected": 10.903181076049805, "logps/chosen": -291.62823486328125, "logps/rejected": -236.60528564453125, "loss": 0.3895, "rewards/accuracies": 1.0, "rewards/chosen": 0.45499327778816223, "rewards/margins": 0.7704517841339111, "rewards/rejected": -0.31545859575271606, "step": 6197 }, { "epoch": 0.9585153682582641, "grad_norm": 3.8150579929351807, "learning_rate": 3.7805017756902283e-06, "logits/chosen": 9.56097412109375, "logits/rejected": 5.480674743652344, "logps/chosen": -245.415283203125, "logps/rejected": -165.5003662109375, "loss": 0.5566, "rewards/accuracies": 0.625, "rewards/chosen": 0.10580796748399734, "rewards/margins": 0.3801415264606476, "rewards/rejected": -0.27433356642723083, "step": 6198 }, { "epoch": 0.9586700173980283, "grad_norm": 5.646664619445801, "learning_rate": 3.780215374040555e-06, "logits/chosen": 13.170984268188477, "logits/rejected": 10.892875671386719, "logps/chosen": -308.5303039550781, "logps/rejected": -326.3078308105469, "loss": 0.4202, "rewards/accuracies": 0.875, "rewards/chosen": 0.28127679228782654, "rewards/margins": 1.1327598094940186, "rewards/rejected": -0.8514830470085144, "step": 6199 }, { "epoch": 0.9588246665377924, "grad_norm": 4.061943531036377, "learning_rate": 3.7799289723908812e-06, "logits/chosen": 11.620902061462402, "logits/rejected": 0.8329916596412659, "logps/chosen": -353.20745849609375, "logps/rejected": -243.18264770507812, "loss": 0.5421, "rewards/accuracies": 0.625, "rewards/chosen": 0.4509489834308624, "rewards/margins": 0.6272865533828735, "rewards/rejected": -0.17633754014968872, "step": 6200 }, { "epoch": 0.9589793156775566, "grad_norm": 4.095134735107422, "learning_rate": 3.779642570741208e-06, "logits/chosen": 14.328485488891602, "logits/rejected": 9.460273742675781, "logps/chosen": -271.25164794921875, "logps/rejected": -169.11851501464844, "loss": 0.6445, "rewards/accuracies": 0.75, "rewards/chosen": -0.13385413587093353, "rewards/margins": 0.18290254473686218, "rewards/rejected": -0.3167566657066345, "step": 6201 }, { "epoch": 0.9591339648173207, "grad_norm": 6.392009735107422, "learning_rate": 3.779356169091534e-06, "logits/chosen": 9.518019676208496, "logits/rejected": 11.726667404174805, "logps/chosen": -303.9400634765625, "logps/rejected": -253.27401733398438, "loss": 0.8503, "rewards/accuracies": 0.25, "rewards/chosen": 0.15070396661758423, "rewards/margins": -0.2587589621543884, "rewards/rejected": 0.40946295857429504, "step": 6202 }, { "epoch": 0.9592886139570849, "grad_norm": 5.753974437713623, "learning_rate": 3.7790697674418607e-06, "logits/chosen": 14.135196685791016, "logits/rejected": 12.58681869506836, "logps/chosen": -390.4419860839844, "logps/rejected": -381.3349609375, "loss": 0.663, "rewards/accuracies": 0.5, "rewards/chosen": 0.3868066966533661, "rewards/margins": 0.22663907706737518, "rewards/rejected": 0.1601676046848297, "step": 6203 }, { "epoch": 0.959443263096849, "grad_norm": 4.343027591705322, "learning_rate": 3.7787833657921874e-06, "logits/chosen": 8.071842193603516, "logits/rejected": 1.7248845100402832, "logps/chosen": -190.04464721679688, "logps/rejected": -120.87312316894531, "loss": 0.6401, "rewards/accuracies": 0.5, "rewards/chosen": 0.06427059322595596, "rewards/margins": 0.20327618718147278, "rewards/rejected": -0.13900557160377502, "step": 6204 }, { "epoch": 0.9595979122366132, "grad_norm": 5.777004241943359, "learning_rate": 3.778496964142514e-06, "logits/chosen": 14.482356071472168, "logits/rejected": 7.85591459274292, "logps/chosen": -297.9575500488281, "logps/rejected": -229.93800354003906, "loss": 0.5911, "rewards/accuracies": 0.5, "rewards/chosen": 0.13670288026332855, "rewards/margins": 0.46076586842536926, "rewards/rejected": -0.3240630030632019, "step": 6205 }, { "epoch": 0.9597525613763773, "grad_norm": 4.93242073059082, "learning_rate": 3.77821056249284e-06, "logits/chosen": 9.400463104248047, "logits/rejected": 7.245129108428955, "logps/chosen": -291.78363037109375, "logps/rejected": -218.31008911132812, "loss": 0.5752, "rewards/accuracies": 0.75, "rewards/chosen": 0.08678723871707916, "rewards/margins": 0.2959372401237488, "rewards/rejected": -0.2091500163078308, "step": 6206 }, { "epoch": 0.9599072105161415, "grad_norm": 4.211946487426758, "learning_rate": 3.7779241608431665e-06, "logits/chosen": 10.71906852722168, "logits/rejected": 3.749788284301758, "logps/chosen": -268.0580749511719, "logps/rejected": -261.02862548828125, "loss": 0.5619, "rewards/accuracies": 0.75, "rewards/chosen": 0.1457514762878418, "rewards/margins": 0.45661646127700806, "rewards/rejected": -0.31086498498916626, "step": 6207 }, { "epoch": 0.9600618596559056, "grad_norm": 5.2096076011657715, "learning_rate": 3.777637759193493e-06, "logits/chosen": 12.332616806030273, "logits/rejected": 11.804502487182617, "logps/chosen": -292.0572814941406, "logps/rejected": -254.89215087890625, "loss": 0.5512, "rewards/accuracies": 0.5, "rewards/chosen": 0.3568549156188965, "rewards/margins": 0.5626353621482849, "rewards/rejected": -0.20578041672706604, "step": 6208 }, { "epoch": 0.9602165087956698, "grad_norm": 7.805737018585205, "learning_rate": 3.77735135754382e-06, "logits/chosen": 13.04304313659668, "logits/rejected": 15.592020988464355, "logps/chosen": -224.40077209472656, "logps/rejected": -244.50022888183594, "loss": 0.9041, "rewards/accuracies": 0.25, "rewards/chosen": -0.36959657073020935, "rewards/margins": -0.30037039518356323, "rewards/rejected": -0.06922616809606552, "step": 6209 }, { "epoch": 0.960371157935434, "grad_norm": 4.945275783538818, "learning_rate": 3.7770649558941465e-06, "logits/chosen": 8.065134048461914, "logits/rejected": 8.350366592407227, "logps/chosen": -288.42657470703125, "logps/rejected": -210.79345703125, "loss": 0.5046, "rewards/accuracies": 0.875, "rewards/chosen": 0.38806694746017456, "rewards/margins": 0.5291779637336731, "rewards/rejected": -0.14111098647117615, "step": 6210 }, { "epoch": 0.9605258070751982, "grad_norm": 7.0027008056640625, "learning_rate": 3.776778554244473e-06, "logits/chosen": 6.472238540649414, "logits/rejected": 3.75234055519104, "logps/chosen": -376.69647216796875, "logps/rejected": -263.144287109375, "loss": 0.805, "rewards/accuracies": 0.375, "rewards/chosen": -0.06863565742969513, "rewards/margins": -0.14633022248744965, "rewards/rejected": 0.07769455015659332, "step": 6211 }, { "epoch": 0.9606804562149623, "grad_norm": 5.786815166473389, "learning_rate": 3.7764921525947993e-06, "logits/chosen": 9.991025924682617, "logits/rejected": 6.760941982269287, "logps/chosen": -304.273681640625, "logps/rejected": -298.0568542480469, "loss": 0.5337, "rewards/accuracies": 0.75, "rewards/chosen": 0.6838828325271606, "rewards/margins": 0.43535536527633667, "rewards/rejected": 0.2485274374485016, "step": 6212 }, { "epoch": 0.9608351053547265, "grad_norm": 3.9415769577026367, "learning_rate": 3.7762057509451256e-06, "logits/chosen": 14.743974685668945, "logits/rejected": 10.124166488647461, "logps/chosen": -247.69381713867188, "logps/rejected": -194.5726318359375, "loss": 0.486, "rewards/accuracies": 0.625, "rewards/chosen": 0.2417970597743988, "rewards/margins": 0.7244962453842163, "rewards/rejected": -0.48269912600517273, "step": 6213 }, { "epoch": 0.9609897544944906, "grad_norm": 4.849062442779541, "learning_rate": 3.775919349295452e-06, "logits/chosen": 11.756913185119629, "logits/rejected": 13.900941848754883, "logps/chosen": -237.27456665039062, "logps/rejected": -282.3472595214844, "loss": 0.5531, "rewards/accuracies": 0.75, "rewards/chosen": 0.3385724127292633, "rewards/margins": 0.43315306305885315, "rewards/rejected": -0.09458065032958984, "step": 6214 }, { "epoch": 0.9611444036342548, "grad_norm": 3.8358962535858154, "learning_rate": 3.775632947645779e-06, "logits/chosen": 12.525490760803223, "logits/rejected": 7.20210075378418, "logps/chosen": -240.63632202148438, "logps/rejected": -164.78302001953125, "loss": 0.5478, "rewards/accuracies": 0.875, "rewards/chosen": -0.10352945327758789, "rewards/margins": 0.39274662733078003, "rewards/rejected": -0.4962760806083679, "step": 6215 }, { "epoch": 0.961299052774019, "grad_norm": 4.672080039978027, "learning_rate": 3.775346545996105e-06, "logits/chosen": 11.450679779052734, "logits/rejected": 12.03573989868164, "logps/chosen": -345.100830078125, "logps/rejected": -326.0154724121094, "loss": 0.5363, "rewards/accuracies": 0.75, "rewards/chosen": -0.06310184299945831, "rewards/margins": 0.542306661605835, "rewards/rejected": -0.6054084897041321, "step": 6216 }, { "epoch": 0.9614537019137831, "grad_norm": 4.900850296020508, "learning_rate": 3.7750601443464317e-06, "logits/chosen": 10.704215049743652, "logits/rejected": 5.836019992828369, "logps/chosen": -280.88079833984375, "logps/rejected": -286.5028991699219, "loss": 0.5755, "rewards/accuracies": 0.75, "rewards/chosen": 0.3014638423919678, "rewards/margins": 0.29908487200737, "rewards/rejected": 0.0023789890110492706, "step": 6217 }, { "epoch": 0.9616083510535473, "grad_norm": 4.7244133949279785, "learning_rate": 3.7747737426967584e-06, "logits/chosen": 10.71621322631836, "logits/rejected": 9.23169994354248, "logps/chosen": -262.21112060546875, "logps/rejected": -267.5535888671875, "loss": 0.5859, "rewards/accuracies": 0.75, "rewards/chosen": 0.37306928634643555, "rewards/margins": 0.35109177231788635, "rewards/rejected": 0.021977512165904045, "step": 6218 }, { "epoch": 0.9617630001933114, "grad_norm": 6.141534328460693, "learning_rate": 3.7744873410470846e-06, "logits/chosen": 7.8809614181518555, "logits/rejected": 6.794828414916992, "logps/chosen": -287.1963806152344, "logps/rejected": -373.8373718261719, "loss": 0.5956, "rewards/accuracies": 0.75, "rewards/chosen": 0.08312735706567764, "rewards/margins": 0.26163122057914734, "rewards/rejected": -0.1785038709640503, "step": 6219 }, { "epoch": 0.9619176493330756, "grad_norm": 3.6746222972869873, "learning_rate": 3.7742009393974113e-06, "logits/chosen": 10.175399780273438, "logits/rejected": 7.9590654373168945, "logps/chosen": -261.11688232421875, "logps/rejected": -242.82693481445312, "loss": 0.4584, "rewards/accuracies": 0.875, "rewards/chosen": 0.5537927150726318, "rewards/margins": 0.6324170231819153, "rewards/rejected": -0.07862435281276703, "step": 6220 }, { "epoch": 0.9620722984728397, "grad_norm": 5.872509956359863, "learning_rate": 3.7739145377477375e-06, "logits/chosen": 12.021138191223145, "logits/rejected": 10.698677062988281, "logps/chosen": -432.6414489746094, "logps/rejected": -367.2095947265625, "loss": 0.5876, "rewards/accuracies": 0.625, "rewards/chosen": 0.21761779487133026, "rewards/margins": 0.47728434205055237, "rewards/rejected": -0.2596665620803833, "step": 6221 }, { "epoch": 0.9622269476126039, "grad_norm": 5.82351541519165, "learning_rate": 3.773628136098064e-06, "logits/chosen": 10.622352600097656, "logits/rejected": 3.973573684692383, "logps/chosen": -402.18402099609375, "logps/rejected": -312.0247497558594, "loss": 0.5743, "rewards/accuracies": 0.625, "rewards/chosen": 0.5350500345230103, "rewards/margins": 0.4204881489276886, "rewards/rejected": 0.11456190049648285, "step": 6222 }, { "epoch": 0.9623815967523681, "grad_norm": 8.601079940795898, "learning_rate": 3.773341734448391e-06, "logits/chosen": 7.874594688415527, "logits/rejected": 11.27859878540039, "logps/chosen": -254.940185546875, "logps/rejected": -291.0589294433594, "loss": 0.796, "rewards/accuracies": 0.375, "rewards/chosen": -0.15610548853874207, "rewards/margins": -0.10389894992113113, "rewards/rejected": -0.052206531167030334, "step": 6223 }, { "epoch": 0.9625362458921323, "grad_norm": 5.773099422454834, "learning_rate": 3.7730553327987174e-06, "logits/chosen": 9.543684959411621, "logits/rejected": 8.454202651977539, "logps/chosen": -267.7629699707031, "logps/rejected": -300.6573181152344, "loss": 0.5254, "rewards/accuracies": 0.875, "rewards/chosen": 0.4540160298347473, "rewards/margins": 0.45007288455963135, "rewards/rejected": 0.003943160176277161, "step": 6224 }, { "epoch": 0.9626908950318964, "grad_norm": 3.9194095134735107, "learning_rate": 3.772768931149044e-06, "logits/chosen": 9.256590843200684, "logits/rejected": 5.746793746948242, "logps/chosen": -217.80056762695312, "logps/rejected": -167.87939453125, "loss": 0.6648, "rewards/accuracies": 0.375, "rewards/chosen": 0.1267896294593811, "rewards/margins": 0.2920009195804596, "rewards/rejected": -0.1652112901210785, "step": 6225 }, { "epoch": 0.9628455441716606, "grad_norm": 5.422349452972412, "learning_rate": 3.77248252949937e-06, "logits/chosen": 12.005634307861328, "logits/rejected": 6.595765590667725, "logps/chosen": -276.6656494140625, "logps/rejected": -280.4333190917969, "loss": 0.5374, "rewards/accuracies": 0.75, "rewards/chosen": 0.03590288385748863, "rewards/margins": 0.42755985260009766, "rewards/rejected": -0.3916569948196411, "step": 6226 }, { "epoch": 0.9630001933114247, "grad_norm": 4.353426456451416, "learning_rate": 3.7721961278496965e-06, "logits/chosen": 9.1546630859375, "logits/rejected": 8.671625137329102, "logps/chosen": -241.7147216796875, "logps/rejected": -221.46524047851562, "loss": 0.5177, "rewards/accuracies": 0.625, "rewards/chosen": 0.1428285539150238, "rewards/margins": 0.7597874402999878, "rewards/rejected": -0.6169588565826416, "step": 6227 }, { "epoch": 0.9631548424511889, "grad_norm": 5.999122142791748, "learning_rate": 3.771909726200023e-06, "logits/chosen": 6.420291900634766, "logits/rejected": 5.207700729370117, "logps/chosen": -376.8318176269531, "logps/rejected": -318.32037353515625, "loss": 0.7101, "rewards/accuracies": 0.75, "rewards/chosen": 0.2463313192129135, "rewards/margins": 0.1491008996963501, "rewards/rejected": 0.09723042696714401, "step": 6228 }, { "epoch": 0.963309491590953, "grad_norm": 7.111025810241699, "learning_rate": 3.77162332455035e-06, "logits/chosen": 11.377235412597656, "logits/rejected": 10.18140697479248, "logps/chosen": -328.5623474121094, "logps/rejected": -288.66314697265625, "loss": 0.8211, "rewards/accuracies": 0.5, "rewards/chosen": -0.15537777543067932, "rewards/margins": -0.17616042494773865, "rewards/rejected": 0.020782656967639923, "step": 6229 }, { "epoch": 0.9634641407307172, "grad_norm": 4.652571678161621, "learning_rate": 3.7713369229006765e-06, "logits/chosen": 6.284856796264648, "logits/rejected": 5.213893890380859, "logps/chosen": -258.7974548339844, "logps/rejected": -237.41958618164062, "loss": 0.6938, "rewards/accuracies": 0.5, "rewards/chosen": 0.11941994726657867, "rewards/margins": 0.07685817033052444, "rewards/rejected": 0.04256176948547363, "step": 6230 }, { "epoch": 0.9636187898704813, "grad_norm": 7.093327522277832, "learning_rate": 3.7710505212510027e-06, "logits/chosen": 15.114252090454102, "logits/rejected": 8.233548164367676, "logps/chosen": -311.77203369140625, "logps/rejected": -221.2399139404297, "loss": 0.8362, "rewards/accuracies": 0.5, "rewards/chosen": -0.0834583267569542, "rewards/margins": -0.0853218138217926, "rewards/rejected": 0.0018634870648384094, "step": 6231 }, { "epoch": 0.9637734390102455, "grad_norm": 5.479827880859375, "learning_rate": 3.7707641196013294e-06, "logits/chosen": 5.144112586975098, "logits/rejected": 11.315292358398438, "logps/chosen": -233.96780395507812, "logps/rejected": -245.29150390625, "loss": 0.822, "rewards/accuracies": 0.375, "rewards/chosen": -0.4076550006866455, "rewards/margins": -0.08142456412315369, "rewards/rejected": -0.32623040676116943, "step": 6232 }, { "epoch": 0.9639280881500096, "grad_norm": 4.323971748352051, "learning_rate": 3.7704777179516556e-06, "logits/chosen": 5.676105499267578, "logits/rejected": 6.429553031921387, "logps/chosen": -205.0814208984375, "logps/rejected": -209.31210327148438, "loss": 0.7604, "rewards/accuracies": 0.5, "rewards/chosen": -0.15172801911830902, "rewards/margins": -0.003660343587398529, "rewards/rejected": -0.1480676829814911, "step": 6233 }, { "epoch": 0.9640827372897738, "grad_norm": 5.070718288421631, "learning_rate": 3.7701913163019822e-06, "logits/chosen": 7.9707136154174805, "logits/rejected": 10.688069343566895, "logps/chosen": -198.2361602783203, "logps/rejected": -239.82376098632812, "loss": 0.6574, "rewards/accuracies": 0.5, "rewards/chosen": -0.04830784350633621, "rewards/margins": 0.2938075065612793, "rewards/rejected": -0.3421153724193573, "step": 6234 }, { "epoch": 0.9642373864295379, "grad_norm": 16.31546401977539, "learning_rate": 3.7699049146523085e-06, "logits/chosen": 10.313432693481445, "logits/rejected": 8.43807315826416, "logps/chosen": -279.4935607910156, "logps/rejected": -275.16363525390625, "loss": 0.6778, "rewards/accuracies": 0.625, "rewards/chosen": -0.2216387689113617, "rewards/margins": 0.13679268956184387, "rewards/rejected": -0.3584314286708832, "step": 6235 }, { "epoch": 0.9643920355693022, "grad_norm": 6.370844841003418, "learning_rate": 3.769618513002635e-06, "logits/chosen": 16.11417579650879, "logits/rejected": 12.088629722595215, "logps/chosen": -417.5743408203125, "logps/rejected": -374.2684020996094, "loss": 0.5429, "rewards/accuracies": 0.75, "rewards/chosen": 0.503851592540741, "rewards/margins": 0.448061466217041, "rewards/rejected": 0.05579013377428055, "step": 6236 }, { "epoch": 0.9645466847090663, "grad_norm": 6.4553680419921875, "learning_rate": 3.7693321113529618e-06, "logits/chosen": 13.063579559326172, "logits/rejected": 12.430404663085938, "logps/chosen": -345.1760559082031, "logps/rejected": -258.789794921875, "loss": 0.6749, "rewards/accuracies": 0.75, "rewards/chosen": -0.1946934163570404, "rewards/margins": 0.17828252911567688, "rewards/rejected": -0.3729759454727173, "step": 6237 }, { "epoch": 0.9647013338488305, "grad_norm": 5.131139278411865, "learning_rate": 3.7690457097032884e-06, "logits/chosen": 10.330784797668457, "logits/rejected": 4.305326461791992, "logps/chosen": -324.40020751953125, "logps/rejected": -192.52452087402344, "loss": 0.5927, "rewards/accuracies": 0.75, "rewards/chosen": 0.10897751152515411, "rewards/margins": 0.44921454787254333, "rewards/rejected": -0.340237021446228, "step": 6238 }, { "epoch": 0.9648559829885947, "grad_norm": 4.922372817993164, "learning_rate": 3.7687593080536147e-06, "logits/chosen": 10.259038925170898, "logits/rejected": 8.114009857177734, "logps/chosen": -297.686767578125, "logps/rejected": -347.5851135253906, "loss": 0.5775, "rewards/accuracies": 0.625, "rewards/chosen": 0.2805238664150238, "rewards/margins": 0.39249420166015625, "rewards/rejected": -0.11197035014629364, "step": 6239 }, { "epoch": 0.9650106321283588, "grad_norm": 3.8152194023132324, "learning_rate": 3.768472906403941e-06, "logits/chosen": 10.1238374710083, "logits/rejected": 6.693074703216553, "logps/chosen": -341.34783935546875, "logps/rejected": -176.46409606933594, "loss": 0.4395, "rewards/accuracies": 0.75, "rewards/chosen": 0.5498703718185425, "rewards/margins": 0.79653400182724, "rewards/rejected": -0.24666360020637512, "step": 6240 }, { "epoch": 0.965165281268123, "grad_norm": 4.972837924957275, "learning_rate": 3.7681865047542675e-06, "logits/chosen": 12.047258377075195, "logits/rejected": 9.121379852294922, "logps/chosen": -197.76177978515625, "logps/rejected": -180.068603515625, "loss": 0.7013, "rewards/accuracies": 0.625, "rewards/chosen": 0.14627385139465332, "rewards/margins": 0.03559955954551697, "rewards/rejected": 0.11067428439855576, "step": 6241 }, { "epoch": 0.9653199304078871, "grad_norm": 4.085970878601074, "learning_rate": 3.767900103104594e-06, "logits/chosen": 17.535829544067383, "logits/rejected": 7.910150527954102, "logps/chosen": -266.6650390625, "logps/rejected": -184.14718627929688, "loss": 0.5101, "rewards/accuracies": 0.75, "rewards/chosen": 0.3464258015155792, "rewards/margins": 0.6349284052848816, "rewards/rejected": -0.2885025441646576, "step": 6242 }, { "epoch": 0.9654745795476513, "grad_norm": 5.301179885864258, "learning_rate": 3.767613701454921e-06, "logits/chosen": 8.30402660369873, "logits/rejected": 5.971802234649658, "logps/chosen": -406.5301818847656, "logps/rejected": -317.0602722167969, "loss": 0.572, "rewards/accuracies": 0.625, "rewards/chosen": 0.19353963434696198, "rewards/margins": 0.4953781068325043, "rewards/rejected": -0.3018384873867035, "step": 6243 }, { "epoch": 0.9656292286874154, "grad_norm": 7.259011745452881, "learning_rate": 3.7673272998052475e-06, "logits/chosen": 6.027864933013916, "logits/rejected": 9.42466926574707, "logps/chosen": -259.1260681152344, "logps/rejected": -274.513427734375, "loss": 0.8888, "rewards/accuracies": 0.25, "rewards/chosen": -0.03978786617517471, "rewards/margins": -0.30730140209198, "rewards/rejected": 0.2675135135650635, "step": 6244 }, { "epoch": 0.9657838778271796, "grad_norm": 5.502675533294678, "learning_rate": 3.767040898155574e-06, "logits/chosen": 8.866201400756836, "logits/rejected": 6.491358757019043, "logps/chosen": -362.2781982421875, "logps/rejected": -276.3226623535156, "loss": 0.6315, "rewards/accuracies": 0.75, "rewards/chosen": 0.652718186378479, "rewards/margins": 0.38082462549209595, "rewards/rejected": 0.27189356088638306, "step": 6245 }, { "epoch": 0.9659385269669437, "grad_norm": 7.042604923248291, "learning_rate": 3.7667544965059e-06, "logits/chosen": 9.896074295043945, "logits/rejected": 9.05870246887207, "logps/chosen": -289.7452392578125, "logps/rejected": -254.51736450195312, "loss": 0.8295, "rewards/accuracies": 0.375, "rewards/chosen": -0.4651522636413574, "rewards/margins": -0.10872002691030502, "rewards/rejected": -0.3564322590827942, "step": 6246 }, { "epoch": 0.9660931761067079, "grad_norm": 5.6107025146484375, "learning_rate": 3.7664680948562266e-06, "logits/chosen": 9.470053672790527, "logits/rejected": 9.284296035766602, "logps/chosen": -297.4190673828125, "logps/rejected": -284.5372009277344, "loss": 0.6573, "rewards/accuracies": 0.75, "rewards/chosen": 0.12754374742507935, "rewards/margins": 0.10900935530662537, "rewards/rejected": 0.01853439211845398, "step": 6247 }, { "epoch": 0.966247825246472, "grad_norm": 3.7968590259552, "learning_rate": 3.7661816932065532e-06, "logits/chosen": 10.05697250366211, "logits/rejected": 8.074006080627441, "logps/chosen": -277.1243896484375, "logps/rejected": -253.9668731689453, "loss": 0.4872, "rewards/accuracies": 0.75, "rewards/chosen": 0.47266513109207153, "rewards/margins": 0.6206865906715393, "rewards/rejected": -0.14802150428295135, "step": 6248 }, { "epoch": 0.9664024743862363, "grad_norm": 2.798936128616333, "learning_rate": 3.76589529155688e-06, "logits/chosen": 8.21577262878418, "logits/rejected": 1.5273282527923584, "logps/chosen": -186.09304809570312, "logps/rejected": -128.33729553222656, "loss": 0.4976, "rewards/accuracies": 0.625, "rewards/chosen": 0.29868149757385254, "rewards/margins": 0.6515520811080933, "rewards/rejected": -0.3528705835342407, "step": 6249 }, { "epoch": 0.9665571235260004, "grad_norm": 6.064682483673096, "learning_rate": 3.765608889907206e-06, "logits/chosen": 7.74521541595459, "logits/rejected": 11.561120986938477, "logps/chosen": -292.43035888671875, "logps/rejected": -317.5081787109375, "loss": 0.7125, "rewards/accuracies": 0.5, "rewards/chosen": 0.37806808948516846, "rewards/margins": 0.13698609173297882, "rewards/rejected": 0.24108201265335083, "step": 6250 }, { "epoch": 0.9667117726657646, "grad_norm": 4.284787654876709, "learning_rate": 3.7653224882575328e-06, "logits/chosen": 14.056763648986816, "logits/rejected": 3.4579153060913086, "logps/chosen": -230.45591735839844, "logps/rejected": -126.14958953857422, "loss": 0.5697, "rewards/accuracies": 0.625, "rewards/chosen": 0.17201842367649078, "rewards/margins": 0.5434473156929016, "rewards/rejected": -0.371428906917572, "step": 6251 }, { "epoch": 0.9668664218055287, "grad_norm": 4.843050956726074, "learning_rate": 3.765036086607859e-06, "logits/chosen": 16.6568660736084, "logits/rejected": 7.945697784423828, "logps/chosen": -305.6340637207031, "logps/rejected": -204.84315490722656, "loss": 0.6595, "rewards/accuracies": 0.625, "rewards/chosen": 0.34388524293899536, "rewards/margins": 0.1610361933708191, "rewards/rejected": 0.18284901976585388, "step": 6252 }, { "epoch": 0.9670210709452929, "grad_norm": 8.145442008972168, "learning_rate": 3.7647496849581856e-06, "logits/chosen": 8.537132263183594, "logits/rejected": 11.795557022094727, "logps/chosen": -303.1133117675781, "logps/rejected": -279.73284912109375, "loss": 0.9007, "rewards/accuracies": 0.25, "rewards/chosen": -0.2750663161277771, "rewards/margins": -0.2852938771247864, "rewards/rejected": 0.01022757962346077, "step": 6253 }, { "epoch": 0.967175720085057, "grad_norm": 10.77951717376709, "learning_rate": 3.764463283308512e-06, "logits/chosen": 10.265498161315918, "logits/rejected": 10.32868480682373, "logps/chosen": -298.82305908203125, "logps/rejected": -321.20587158203125, "loss": 0.9603, "rewards/accuracies": 0.375, "rewards/chosen": 0.20694218575954437, "rewards/margins": -0.32499951124191284, "rewards/rejected": 0.5319416522979736, "step": 6254 }, { "epoch": 0.9673303692248212, "grad_norm": 5.997251510620117, "learning_rate": 3.7641768816588385e-06, "logits/chosen": 5.200666904449463, "logits/rejected": 4.120314598083496, "logps/chosen": -254.244873046875, "logps/rejected": -196.6265869140625, "loss": 0.7239, "rewards/accuracies": 0.625, "rewards/chosen": -0.35333359241485596, "rewards/margins": 0.07666229456663132, "rewards/rejected": -0.42999589443206787, "step": 6255 }, { "epoch": 0.9674850183645853, "grad_norm": 5.547662734985352, "learning_rate": 3.763890480009165e-06, "logits/chosen": 9.776832580566406, "logits/rejected": 9.78276538848877, "logps/chosen": -156.112060546875, "logps/rejected": -188.59878540039062, "loss": 0.6949, "rewards/accuracies": 0.375, "rewards/chosen": -0.12008310854434967, "rewards/margins": 0.10368173569440842, "rewards/rejected": -0.2237648367881775, "step": 6256 }, { "epoch": 0.9676396675043495, "grad_norm": 6.655076026916504, "learning_rate": 3.763604078359492e-06, "logits/chosen": 16.87818145751953, "logits/rejected": 8.104897499084473, "logps/chosen": -452.58721923828125, "logps/rejected": -302.3865966796875, "loss": 0.6135, "rewards/accuracies": 0.5, "rewards/chosen": 0.6017551422119141, "rewards/margins": 0.4058828353881836, "rewards/rejected": 0.19587230682373047, "step": 6257 }, { "epoch": 0.9677943166441136, "grad_norm": 4.455080509185791, "learning_rate": 3.7633176767098185e-06, "logits/chosen": 12.935771942138672, "logits/rejected": 8.161739349365234, "logps/chosen": -287.3878173828125, "logps/rejected": -207.23658752441406, "loss": 0.6114, "rewards/accuracies": 0.625, "rewards/chosen": 0.28862935304641724, "rewards/margins": 0.36560189723968506, "rewards/rejected": -0.07697255909442902, "step": 6258 }, { "epoch": 0.9679489657838778, "grad_norm": 7.170156955718994, "learning_rate": 3.7630312750601443e-06, "logits/chosen": 8.677175521850586, "logits/rejected": 11.884601593017578, "logps/chosen": -215.62002563476562, "logps/rejected": -243.57623291015625, "loss": 0.9372, "rewards/accuracies": 0.375, "rewards/chosen": -0.3639761209487915, "rewards/margins": -0.3351106643676758, "rewards/rejected": -0.028865426778793335, "step": 6259 }, { "epoch": 0.9681036149236419, "grad_norm": 8.850513458251953, "learning_rate": 3.762744873410471e-06, "logits/chosen": 9.936614036560059, "logits/rejected": 10.412424087524414, "logps/chosen": -217.98822021484375, "logps/rejected": -176.16763305664062, "loss": 0.808, "rewards/accuracies": 0.375, "rewards/chosen": -0.19781073927879333, "rewards/margins": -0.11795802414417267, "rewards/rejected": -0.07985273003578186, "step": 6260 }, { "epoch": 0.9682582640634061, "grad_norm": 6.450249195098877, "learning_rate": 3.7624584717607976e-06, "logits/chosen": 7.904399871826172, "logits/rejected": 9.053790092468262, "logps/chosen": -238.248779296875, "logps/rejected": -361.2785339355469, "loss": 0.7736, "rewards/accuracies": 0.5, "rewards/chosen": 0.4528582990169525, "rewards/margins": 0.10984626412391663, "rewards/rejected": 0.3430120646953583, "step": 6261 }, { "epoch": 0.9684129132031704, "grad_norm": 5.9091010093688965, "learning_rate": 3.7621720701111242e-06, "logits/chosen": 5.1943535804748535, "logits/rejected": 4.121196746826172, "logps/chosen": -189.3777313232422, "logps/rejected": -268.4761657714844, "loss": 0.5993, "rewards/accuracies": 0.75, "rewards/chosen": 0.20213675498962402, "rewards/margins": 0.31495893001556396, "rewards/rejected": -0.11282214522361755, "step": 6262 }, { "epoch": 0.9685675623429345, "grad_norm": 6.606688976287842, "learning_rate": 3.761885668461451e-06, "logits/chosen": 16.05550765991211, "logits/rejected": 12.101274490356445, "logps/chosen": -400.423583984375, "logps/rejected": -331.2559814453125, "loss": 0.7084, "rewards/accuracies": 0.5, "rewards/chosen": 0.37084275484085083, "rewards/margins": 0.27033746242523193, "rewards/rejected": 0.1005052849650383, "step": 6263 }, { "epoch": 0.9687222114826987, "grad_norm": 5.956660747528076, "learning_rate": 3.7615992668117775e-06, "logits/chosen": 10.192949295043945, "logits/rejected": 8.260550498962402, "logps/chosen": -317.9083557128906, "logps/rejected": -233.95382690429688, "loss": 0.9477, "rewards/accuracies": 0.5, "rewards/chosen": -0.41795188188552856, "rewards/margins": -0.08529482781887054, "rewards/rejected": -0.33265700936317444, "step": 6264 }, { "epoch": 0.9688768606224628, "grad_norm": 4.748476982116699, "learning_rate": 3.7613128651621033e-06, "logits/chosen": 6.996815204620361, "logits/rejected": 6.674928665161133, "logps/chosen": -235.6258087158203, "logps/rejected": -249.1686553955078, "loss": 0.6297, "rewards/accuracies": 0.5, "rewards/chosen": 0.043549057096242905, "rewards/margins": 0.21402449905872345, "rewards/rejected": -0.17047543823719025, "step": 6265 }, { "epoch": 0.969031509762227, "grad_norm": 6.7631330490112305, "learning_rate": 3.76102646351243e-06, "logits/chosen": 14.424544334411621, "logits/rejected": 17.116493225097656, "logps/chosen": -285.1363220214844, "logps/rejected": -267.27081298828125, "loss": 0.9648, "rewards/accuracies": 0.375, "rewards/chosen": -0.5097063779830933, "rewards/margins": -0.3229352533817291, "rewards/rejected": -0.18677110970020294, "step": 6266 }, { "epoch": 0.9691861589019911, "grad_norm": 5.223404407501221, "learning_rate": 3.7607400618627566e-06, "logits/chosen": 10.831860542297363, "logits/rejected": 10.007755279541016, "logps/chosen": -157.50570678710938, "logps/rejected": -178.83848571777344, "loss": 0.5841, "rewards/accuracies": 0.875, "rewards/chosen": -0.235112726688385, "rewards/margins": 0.3109947144985199, "rewards/rejected": -0.5461074709892273, "step": 6267 }, { "epoch": 0.9693408080417553, "grad_norm": 4.666409015655518, "learning_rate": 3.7604536602130833e-06, "logits/chosen": 10.217757225036621, "logits/rejected": 5.7212629318237305, "logps/chosen": -191.2835693359375, "logps/rejected": -166.89266967773438, "loss": 0.7029, "rewards/accuracies": 0.375, "rewards/chosen": -0.3665688633918762, "rewards/margins": 0.2122378647327423, "rewards/rejected": -0.5788067579269409, "step": 6268 }, { "epoch": 0.9694954571815194, "grad_norm": 4.502893447875977, "learning_rate": 3.7601672585634095e-06, "logits/chosen": 10.026115417480469, "logits/rejected": 6.265719413757324, "logps/chosen": -190.56192016601562, "logps/rejected": -157.34432983398438, "loss": 0.6764, "rewards/accuracies": 0.625, "rewards/chosen": 0.021848969161510468, "rewards/margins": 0.127274751663208, "rewards/rejected": -0.10542578250169754, "step": 6269 }, { "epoch": 0.9696501063212836, "grad_norm": 4.055923938751221, "learning_rate": 3.759880856913736e-06, "logits/chosen": 10.522934913635254, "logits/rejected": 5.213428497314453, "logps/chosen": -296.31005859375, "logps/rejected": -256.1463928222656, "loss": 0.4316, "rewards/accuracies": 1.0, "rewards/chosen": 0.5916823148727417, "rewards/margins": 0.8471230268478394, "rewards/rejected": -0.25544074177742004, "step": 6270 }, { "epoch": 0.9698047554610477, "grad_norm": 4.781545162200928, "learning_rate": 3.759594455264063e-06, "logits/chosen": 13.486408233642578, "logits/rejected": 12.499540328979492, "logps/chosen": -281.8381042480469, "logps/rejected": -267.7893981933594, "loss": 0.5145, "rewards/accuracies": 1.0, "rewards/chosen": 0.59675532579422, "rewards/margins": 0.4308159053325653, "rewards/rejected": 0.16593945026397705, "step": 6271 }, { "epoch": 0.9699594046008119, "grad_norm": 4.990830421447754, "learning_rate": 3.759308053614389e-06, "logits/chosen": 11.217247009277344, "logits/rejected": 9.477635383605957, "logps/chosen": -221.22340393066406, "logps/rejected": -221.83580017089844, "loss": 0.6782, "rewards/accuracies": 0.5, "rewards/chosen": 0.022788338363170624, "rewards/margins": 0.10545320063829422, "rewards/rejected": -0.08266487717628479, "step": 6272 }, { "epoch": 0.970114053740576, "grad_norm": 4.796380519866943, "learning_rate": 3.7590216519647153e-06, "logits/chosen": 7.505277156829834, "logits/rejected": 6.669300079345703, "logps/chosen": -220.4984130859375, "logps/rejected": -211.16049194335938, "loss": 0.6993, "rewards/accuracies": 0.75, "rewards/chosen": 0.11744508892297745, "rewards/margins": 0.10261349380016327, "rewards/rejected": 0.014831595122814178, "step": 6273 }, { "epoch": 0.9702687028803403, "grad_norm": 12.667013168334961, "learning_rate": 3.758735250315042e-06, "logits/chosen": 12.283042907714844, "logits/rejected": 9.919509887695312, "logps/chosen": -240.1666259765625, "logps/rejected": -269.742431640625, "loss": 0.7003, "rewards/accuracies": 0.5, "rewards/chosen": -0.04436378926038742, "rewards/margins": 0.08059270679950714, "rewards/rejected": -0.12495648115873337, "step": 6274 }, { "epoch": 0.9704233520201044, "grad_norm": 5.496799468994141, "learning_rate": 3.7584488486653686e-06, "logits/chosen": 10.627809524536133, "logits/rejected": 8.267110824584961, "logps/chosen": -335.3692321777344, "logps/rejected": -222.99029541015625, "loss": 0.6714, "rewards/accuracies": 0.625, "rewards/chosen": 0.46367499232292175, "rewards/margins": 0.13004744052886963, "rewards/rejected": 0.33362752199172974, "step": 6275 }, { "epoch": 0.9705780011598686, "grad_norm": 5.949850082397461, "learning_rate": 3.7581624470156952e-06, "logits/chosen": 13.366804122924805, "logits/rejected": 7.2980146408081055, "logps/chosen": -316.492431640625, "logps/rejected": -297.769287109375, "loss": 0.5956, "rewards/accuracies": 0.625, "rewards/chosen": 0.12766170501708984, "rewards/margins": 0.5112359523773193, "rewards/rejected": -0.3835742175579071, "step": 6276 }, { "epoch": 0.9707326502996327, "grad_norm": 4.195515155792236, "learning_rate": 3.757876045366022e-06, "logits/chosen": 12.662261962890625, "logits/rejected": 10.821195602416992, "logps/chosen": -239.55706787109375, "logps/rejected": -261.9835205078125, "loss": 0.4488, "rewards/accuracies": 1.0, "rewards/chosen": -0.04185633733868599, "rewards/margins": 0.6447039842605591, "rewards/rejected": -0.6865603923797607, "step": 6277 }, { "epoch": 0.9708872994393969, "grad_norm": 6.776169300079346, "learning_rate": 3.7575896437163485e-06, "logits/chosen": 6.963130950927734, "logits/rejected": 7.097776412963867, "logps/chosen": -207.04531860351562, "logps/rejected": -228.71939086914062, "loss": 0.7167, "rewards/accuracies": 0.375, "rewards/chosen": -0.04304493963718414, "rewards/margins": 0.06501109898090363, "rewards/rejected": -0.10805603116750717, "step": 6278 }, { "epoch": 0.971041948579161, "grad_norm": 5.149867057800293, "learning_rate": 3.7573032420666743e-06, "logits/chosen": 9.821857452392578, "logits/rejected": 8.230743408203125, "logps/chosen": -224.55416870117188, "logps/rejected": -295.2489929199219, "loss": 0.5014, "rewards/accuracies": 0.625, "rewards/chosen": 0.39226651191711426, "rewards/margins": 0.7520332336425781, "rewards/rejected": -0.35976678133010864, "step": 6279 }, { "epoch": 0.9711965977189252, "grad_norm": 6.54766321182251, "learning_rate": 3.757016840417001e-06, "logits/chosen": 9.347705841064453, "logits/rejected": 4.830849647521973, "logps/chosen": -319.0818176269531, "logps/rejected": -241.9410400390625, "loss": 0.6269, "rewards/accuracies": 0.5, "rewards/chosen": 0.25212106108665466, "rewards/margins": 0.28301262855529785, "rewards/rejected": -0.030891556292772293, "step": 6280 }, { "epoch": 0.9713512468586893, "grad_norm": 5.549529552459717, "learning_rate": 3.7567304387673276e-06, "logits/chosen": 12.870904922485352, "logits/rejected": 9.637121200561523, "logps/chosen": -336.7303771972656, "logps/rejected": -274.7347717285156, "loss": 0.689, "rewards/accuracies": 0.625, "rewards/chosen": 0.20548829436302185, "rewards/margins": 0.1625308394432068, "rewards/rejected": 0.04295744746923447, "step": 6281 }, { "epoch": 0.9715058959984535, "grad_norm": 9.188798904418945, "learning_rate": 3.7564440371176543e-06, "logits/chosen": 11.511979103088379, "logits/rejected": 7.0975189208984375, "logps/chosen": -355.4588623046875, "logps/rejected": -344.9566650390625, "loss": 0.8716, "rewards/accuracies": 0.5, "rewards/chosen": -0.11412334442138672, "rewards/margins": -0.12623050808906555, "rewards/rejected": 0.012107163667678833, "step": 6282 }, { "epoch": 0.9716605451382176, "grad_norm": 3.688019037246704, "learning_rate": 3.756157635467981e-06, "logits/chosen": 13.157388687133789, "logits/rejected": 4.678380966186523, "logps/chosen": -258.9842834472656, "logps/rejected": -191.83261108398438, "loss": 0.5084, "rewards/accuracies": 0.75, "rewards/chosen": 0.529140830039978, "rewards/margins": 0.5969865918159485, "rewards/rejected": -0.06784577667713165, "step": 6283 }, { "epoch": 0.9718151942779818, "grad_norm": 4.0534281730651855, "learning_rate": 3.755871233818307e-06, "logits/chosen": 11.310357093811035, "logits/rejected": 12.62339973449707, "logps/chosen": -237.59552001953125, "logps/rejected": -293.6082763671875, "loss": 0.4773, "rewards/accuracies": 0.875, "rewards/chosen": 0.4341612756252289, "rewards/margins": 0.6125265955924988, "rewards/rejected": -0.1783653348684311, "step": 6284 }, { "epoch": 0.971969843417746, "grad_norm": 5.446765899658203, "learning_rate": 3.7555848321686334e-06, "logits/chosen": 10.807339668273926, "logits/rejected": 5.760083198547363, "logps/chosen": -329.4555969238281, "logps/rejected": -267.57373046875, "loss": 0.6303, "rewards/accuracies": 0.625, "rewards/chosen": 0.2157566100358963, "rewards/margins": 0.2430887371301651, "rewards/rejected": -0.027332112193107605, "step": 6285 }, { "epoch": 0.9721244925575101, "grad_norm": 6.091634750366211, "learning_rate": 3.75529843051896e-06, "logits/chosen": 11.049680709838867, "logits/rejected": 11.144487380981445, "logps/chosen": -259.19287109375, "logps/rejected": -274.60821533203125, "loss": 0.8043, "rewards/accuracies": 0.125, "rewards/chosen": 0.28198039531707764, "rewards/margins": -0.1819252073764801, "rewards/rejected": 0.46390560269355774, "step": 6286 }, { "epoch": 0.9722791416972744, "grad_norm": 4.244925022125244, "learning_rate": 3.7550120288692867e-06, "logits/chosen": 7.915404319763184, "logits/rejected": 7.382715702056885, "logps/chosen": -170.4591522216797, "logps/rejected": -210.62298583984375, "loss": 0.621, "rewards/accuracies": 0.75, "rewards/chosen": 0.01643562689423561, "rewards/margins": 0.2778860032558441, "rewards/rejected": -0.2614504098892212, "step": 6287 }, { "epoch": 0.9724337908370385, "grad_norm": 7.048520088195801, "learning_rate": 3.754725627219613e-06, "logits/chosen": 12.771485328674316, "logits/rejected": 5.459840774536133, "logps/chosen": -341.1960754394531, "logps/rejected": -299.1044921875, "loss": 0.3689, "rewards/accuracies": 1.0, "rewards/chosen": 0.391074001789093, "rewards/margins": 0.8669734001159668, "rewards/rejected": -0.47589945793151855, "step": 6288 }, { "epoch": 0.9725884399768027, "grad_norm": 6.387578964233398, "learning_rate": 3.7544392255699396e-06, "logits/chosen": 11.089256286621094, "logits/rejected": 7.219018936157227, "logps/chosen": -340.5226745605469, "logps/rejected": -197.89361572265625, "loss": 0.7196, "rewards/accuracies": 0.5, "rewards/chosen": -0.11683791875839233, "rewards/margins": 0.037959348410367966, "rewards/rejected": -0.1547972708940506, "step": 6289 }, { "epoch": 0.9727430891165668, "grad_norm": 4.7493696212768555, "learning_rate": 3.754152823920266e-06, "logits/chosen": 12.237720489501953, "logits/rejected": 3.726443290710449, "logps/chosen": -321.08154296875, "logps/rejected": -214.3508758544922, "loss": 0.5117, "rewards/accuracies": 1.0, "rewards/chosen": 0.2756778597831726, "rewards/margins": 0.4415963292121887, "rewards/rejected": -0.16591843962669373, "step": 6290 }, { "epoch": 0.972897738256331, "grad_norm": 9.014803886413574, "learning_rate": 3.753866422270593e-06, "logits/chosen": 6.828606605529785, "logits/rejected": 5.5821967124938965, "logps/chosen": -307.0285339355469, "logps/rejected": -301.3689880371094, "loss": 0.5959, "rewards/accuracies": 0.75, "rewards/chosen": 0.2567369341850281, "rewards/margins": 0.38547590374946594, "rewards/rejected": -0.12873896956443787, "step": 6291 }, { "epoch": 0.9730523873960951, "grad_norm": 8.48596477508545, "learning_rate": 3.7535800206209187e-06, "logits/chosen": 4.945058822631836, "logits/rejected": 4.95685338973999, "logps/chosen": -204.084228515625, "logps/rejected": -258.9480285644531, "loss": 0.8064, "rewards/accuracies": 0.375, "rewards/chosen": 0.3339933454990387, "rewards/margins": 0.06666679680347443, "rewards/rejected": 0.2673265337944031, "step": 6292 }, { "epoch": 0.9732070365358593, "grad_norm": 3.631401538848877, "learning_rate": 3.7532936189712453e-06, "logits/chosen": 13.18494987487793, "logits/rejected": 9.509491920471191, "logps/chosen": -179.29742431640625, "logps/rejected": -132.405029296875, "loss": 0.6282, "rewards/accuracies": 0.5, "rewards/chosen": 0.004099570214748383, "rewards/margins": 0.46705377101898193, "rewards/rejected": -0.46295422315597534, "step": 6293 }, { "epoch": 0.9733616856756234, "grad_norm": 4.574029922485352, "learning_rate": 3.753007217321572e-06, "logits/chosen": 10.251144409179688, "logits/rejected": 12.60718822479248, "logps/chosen": -375.3944091796875, "logps/rejected": -291.18341064453125, "loss": 0.5058, "rewards/accuracies": 0.625, "rewards/chosen": 0.44762903451919556, "rewards/margins": 0.7367630004882812, "rewards/rejected": -0.2891339957714081, "step": 6294 }, { "epoch": 0.9735163348153876, "grad_norm": 4.10559606552124, "learning_rate": 3.7527208156718986e-06, "logits/chosen": 8.01666259765625, "logits/rejected": 8.446287155151367, "logps/chosen": -166.8321075439453, "logps/rejected": -226.93145751953125, "loss": 0.589, "rewards/accuracies": 0.5, "rewards/chosen": -0.0058164894580841064, "rewards/margins": 0.5392737984657288, "rewards/rejected": -0.5450902581214905, "step": 6295 }, { "epoch": 0.9736709839551517, "grad_norm": 6.397380828857422, "learning_rate": 3.7524344140222253e-06, "logits/chosen": 8.823354721069336, "logits/rejected": 14.64541244506836, "logps/chosen": -217.61114501953125, "logps/rejected": -306.6552429199219, "loss": 0.8856, "rewards/accuracies": 0.625, "rewards/chosen": -0.17376859486103058, "rewards/margins": -0.03956770896911621, "rewards/rejected": -0.13420085608959198, "step": 6296 }, { "epoch": 0.9738256330949159, "grad_norm": 8.119359970092773, "learning_rate": 3.752148012372552e-06, "logits/chosen": 16.643131256103516, "logits/rejected": 14.099477767944336, "logps/chosen": -293.25189208984375, "logps/rejected": -257.774169921875, "loss": 0.8966, "rewards/accuracies": 0.375, "rewards/chosen": -0.1111118495464325, "rewards/margins": -0.11099110543727875, "rewards/rejected": -0.00012074783444404602, "step": 6297 }, { "epoch": 0.97398028223468, "grad_norm": 5.610093593597412, "learning_rate": 3.7518616107228777e-06, "logits/chosen": 12.070931434631348, "logits/rejected": 9.110671043395996, "logps/chosen": -239.3953857421875, "logps/rejected": -190.05569458007812, "loss": 0.6075, "rewards/accuracies": 0.625, "rewards/chosen": 0.1395576298236847, "rewards/margins": 0.3216792643070221, "rewards/rejected": -0.1821216642856598, "step": 6298 }, { "epoch": 0.9741349313744442, "grad_norm": 4.453665733337402, "learning_rate": 3.7515752090732044e-06, "logits/chosen": 12.264678955078125, "logits/rejected": 7.147149085998535, "logps/chosen": -319.1675109863281, "logps/rejected": -209.34268188476562, "loss": 0.5107, "rewards/accuracies": 0.75, "rewards/chosen": 0.392706036567688, "rewards/margins": 0.45684176683425903, "rewards/rejected": -0.06413574516773224, "step": 6299 }, { "epoch": 0.9742895805142084, "grad_norm": 5.393481254577637, "learning_rate": 3.751288807423531e-06, "logits/chosen": 12.595806121826172, "logits/rejected": 9.223859786987305, "logps/chosen": -296.7220458984375, "logps/rejected": -285.6002197265625, "loss": 0.6413, "rewards/accuracies": 0.625, "rewards/chosen": 0.3037174642086029, "rewards/margins": 0.1938306838274002, "rewards/rejected": 0.10988680273294449, "step": 6300 }, { "epoch": 0.9744442296539726, "grad_norm": 4.3994526863098145, "learning_rate": 3.7510024057738577e-06, "logits/chosen": 12.864940643310547, "logits/rejected": 8.482186317443848, "logps/chosen": -239.47463989257812, "logps/rejected": -251.9224853515625, "loss": 0.4344, "rewards/accuracies": 1.0, "rewards/chosen": -0.05874108523130417, "rewards/margins": 0.6514422297477722, "rewards/rejected": -0.710183322429657, "step": 6301 }, { "epoch": 0.9745988787937367, "grad_norm": 5.376049041748047, "learning_rate": 3.7507160041241843e-06, "logits/chosen": 13.081180572509766, "logits/rejected": 12.240509033203125, "logps/chosen": -333.3944091796875, "logps/rejected": -307.56640625, "loss": 0.6369, "rewards/accuracies": 0.75, "rewards/chosen": 0.5191308259963989, "rewards/margins": 0.15374480187892914, "rewards/rejected": 0.3653860092163086, "step": 6302 }, { "epoch": 0.9747535279335009, "grad_norm": 4.008645534515381, "learning_rate": 3.7504296024745105e-06, "logits/chosen": 7.70807409286499, "logits/rejected": 3.812920093536377, "logps/chosen": -269.4783935546875, "logps/rejected": -174.51019287109375, "loss": 0.5031, "rewards/accuracies": 0.625, "rewards/chosen": -0.01090259850025177, "rewards/margins": 0.6251951456069946, "rewards/rejected": -0.63609778881073, "step": 6303 }, { "epoch": 0.974908177073265, "grad_norm": 5.73533296585083, "learning_rate": 3.750143200824837e-06, "logits/chosen": 11.198104858398438, "logits/rejected": 15.782443046569824, "logps/chosen": -220.4842529296875, "logps/rejected": -290.2976379394531, "loss": 0.7058, "rewards/accuracies": 0.625, "rewards/chosen": -0.2851814031600952, "rewards/margins": 0.029813483357429504, "rewards/rejected": -0.3149949014186859, "step": 6304 }, { "epoch": 0.9750628262130292, "grad_norm": 5.344424724578857, "learning_rate": 3.7498567991751634e-06, "logits/chosen": 10.123014450073242, "logits/rejected": 5.762684345245361, "logps/chosen": -327.5176696777344, "logps/rejected": -204.54148864746094, "loss": 0.7542, "rewards/accuracies": 0.625, "rewards/chosen": 0.12442417442798615, "rewards/margins": -0.027842365205287933, "rewards/rejected": 0.15226653218269348, "step": 6305 }, { "epoch": 0.9752174753527934, "grad_norm": 7.004614353179932, "learning_rate": 3.74957039752549e-06, "logits/chosen": 4.719689846038818, "logits/rejected": 5.728633403778076, "logps/chosen": -403.8475036621094, "logps/rejected": -365.57611083984375, "loss": 0.6849, "rewards/accuracies": 0.5, "rewards/chosen": 0.08424302935600281, "rewards/margins": 0.18592345714569092, "rewards/rejected": -0.10168042778968811, "step": 6306 }, { "epoch": 0.9753721244925575, "grad_norm": 4.442594051361084, "learning_rate": 3.7492839958758163e-06, "logits/chosen": 16.242652893066406, "logits/rejected": 10.898727416992188, "logps/chosen": -348.8134460449219, "logps/rejected": -268.0030517578125, "loss": 0.5538, "rewards/accuracies": 0.75, "rewards/chosen": 0.27345019578933716, "rewards/margins": 0.46818414330482483, "rewards/rejected": -0.19473400712013245, "step": 6307 }, { "epoch": 0.9755267736323217, "grad_norm": 4.056453704833984, "learning_rate": 3.748997594226143e-06, "logits/chosen": 9.053031921386719, "logits/rejected": 9.082283020019531, "logps/chosen": -296.16461181640625, "logps/rejected": -258.0118408203125, "loss": 0.5295, "rewards/accuracies": 0.875, "rewards/chosen": 0.3165741562843323, "rewards/margins": 0.49262741208076477, "rewards/rejected": -0.1760532259941101, "step": 6308 }, { "epoch": 0.9756814227720858, "grad_norm": 4.034596920013428, "learning_rate": 3.7487111925764696e-06, "logits/chosen": 12.421431541442871, "logits/rejected": 5.468685150146484, "logps/chosen": -323.6148986816406, "logps/rejected": -247.73236083984375, "loss": 0.4609, "rewards/accuracies": 0.75, "rewards/chosen": 0.7340185642242432, "rewards/margins": 0.7449280619621277, "rewards/rejected": -0.010909520089626312, "step": 6309 }, { "epoch": 0.97583607191185, "grad_norm": 7.380002498626709, "learning_rate": 3.7484247909267962e-06, "logits/chosen": 7.1760993003845215, "logits/rejected": 7.825562000274658, "logps/chosen": -402.88702392578125, "logps/rejected": -314.72381591796875, "loss": 0.6748, "rewards/accuracies": 0.5, "rewards/chosen": 0.5270090103149414, "rewards/margins": 0.19681420922279358, "rewards/rejected": 0.3301948308944702, "step": 6310 }, { "epoch": 0.9759907210516141, "grad_norm": 3.603492021560669, "learning_rate": 3.748138389277123e-06, "logits/chosen": 10.86450481414795, "logits/rejected": 3.8220930099487305, "logps/chosen": -231.05758666992188, "logps/rejected": -205.30532836914062, "loss": 0.5533, "rewards/accuracies": 0.625, "rewards/chosen": 0.21636362373828888, "rewards/margins": 0.5296019911766052, "rewards/rejected": -0.31323838233947754, "step": 6311 }, { "epoch": 0.9761453701913783, "grad_norm": 5.500557899475098, "learning_rate": 3.7478519876274487e-06, "logits/chosen": 5.7570977210998535, "logits/rejected": 3.496809244155884, "logps/chosen": -190.95701599121094, "logps/rejected": -257.35772705078125, "loss": 0.6752, "rewards/accuracies": 0.5, "rewards/chosen": -0.3326965570449829, "rewards/margins": 0.16002917289733887, "rewards/rejected": -0.492725670337677, "step": 6312 }, { "epoch": 0.9763000193311425, "grad_norm": 3.8797390460968018, "learning_rate": 3.7475655859777754e-06, "logits/chosen": 12.644182205200195, "logits/rejected": 7.577823638916016, "logps/chosen": -379.1197509765625, "logps/rejected": -282.9969482421875, "loss": 0.4467, "rewards/accuracies": 0.875, "rewards/chosen": 0.1979825794696808, "rewards/margins": 0.6604365706443787, "rewards/rejected": -0.46245402097702026, "step": 6313 }, { "epoch": 0.9764546684709067, "grad_norm": 3.9358632564544678, "learning_rate": 3.747279184328102e-06, "logits/chosen": 10.954132080078125, "logits/rejected": 11.247160911560059, "logps/chosen": -290.91741943359375, "logps/rejected": -241.4364471435547, "loss": 0.5067, "rewards/accuracies": 0.75, "rewards/chosen": 0.37206947803497314, "rewards/margins": 0.552484393119812, "rewards/rejected": -0.18041495978832245, "step": 6314 }, { "epoch": 0.9766093176106708, "grad_norm": 5.466884613037109, "learning_rate": 3.7469927826784287e-06, "logits/chosen": 4.846158981323242, "logits/rejected": 6.689931392669678, "logps/chosen": -251.2412567138672, "logps/rejected": -266.4517517089844, "loss": 0.7856, "rewards/accuracies": 0.625, "rewards/chosen": 0.06358089298009872, "rewards/margins": -0.060526855289936066, "rewards/rejected": 0.12410774827003479, "step": 6315 }, { "epoch": 0.976763966750435, "grad_norm": 6.830657005310059, "learning_rate": 3.7467063810287553e-06, "logits/chosen": 13.615756034851074, "logits/rejected": 8.434019088745117, "logps/chosen": -330.7833557128906, "logps/rejected": -315.7149658203125, "loss": 0.6189, "rewards/accuracies": 0.625, "rewards/chosen": 0.38659992814064026, "rewards/margins": 0.360819935798645, "rewards/rejected": 0.025780007243156433, "step": 6316 }, { "epoch": 0.9769186158901991, "grad_norm": 5.3813910484313965, "learning_rate": 3.746419979379082e-06, "logits/chosen": 10.657546997070312, "logits/rejected": 10.771195411682129, "logps/chosen": -222.75119018554688, "logps/rejected": -309.4671630859375, "loss": 0.5296, "rewards/accuracies": 0.875, "rewards/chosen": 0.2782250642776489, "rewards/margins": 0.45868441462516785, "rewards/rejected": -0.1804593950510025, "step": 6317 }, { "epoch": 0.9770732650299633, "grad_norm": 5.064642906188965, "learning_rate": 3.7461335777294078e-06, "logits/chosen": 11.744096755981445, "logits/rejected": 11.42077922821045, "logps/chosen": -246.2164306640625, "logps/rejected": -269.1891784667969, "loss": 0.6044, "rewards/accuracies": 0.875, "rewards/chosen": 0.31924593448638916, "rewards/margins": 0.44632774591445923, "rewards/rejected": -0.12708181142807007, "step": 6318 }, { "epoch": 0.9772279141697274, "grad_norm": 6.542880535125732, "learning_rate": 3.7458471760797344e-06, "logits/chosen": 8.96570873260498, "logits/rejected": 2.1790828704833984, "logps/chosen": -452.2339172363281, "logps/rejected": -301.28253173828125, "loss": 0.5202, "rewards/accuracies": 0.75, "rewards/chosen": 0.7694894075393677, "rewards/margins": 0.588054895401001, "rewards/rejected": 0.18143445253372192, "step": 6319 }, { "epoch": 0.9773825633094916, "grad_norm": 5.794060230255127, "learning_rate": 3.745560774430061e-06, "logits/chosen": 11.890701293945312, "logits/rejected": 11.94587516784668, "logps/chosen": -277.8301696777344, "logps/rejected": -370.354248046875, "loss": 0.7621, "rewards/accuracies": 0.5, "rewards/chosen": 0.055869489908218384, "rewards/margins": -0.0626932829618454, "rewards/rejected": 0.11856281757354736, "step": 6320 }, { "epoch": 0.9775372124492557, "grad_norm": 4.301764011383057, "learning_rate": 3.7452743727803877e-06, "logits/chosen": 11.885927200317383, "logits/rejected": 11.155097961425781, "logps/chosen": -203.3512725830078, "logps/rejected": -293.7401123046875, "loss": 0.5359, "rewards/accuracies": 0.75, "rewards/chosen": -0.15808498859405518, "rewards/margins": 0.5096316337585449, "rewards/rejected": -0.6677165627479553, "step": 6321 }, { "epoch": 0.9776918615890199, "grad_norm": 5.338297367095947, "learning_rate": 3.744987971130714e-06, "logits/chosen": 10.313863754272461, "logits/rejected": 5.111762046813965, "logps/chosen": -382.3363342285156, "logps/rejected": -243.29466247558594, "loss": 0.6486, "rewards/accuracies": 0.75, "rewards/chosen": 0.037276655435562134, "rewards/margins": 0.24384687840938568, "rewards/rejected": -0.20657017827033997, "step": 6322 }, { "epoch": 0.977846510728784, "grad_norm": 3.9889814853668213, "learning_rate": 3.7447015694810406e-06, "logits/chosen": 6.860918045043945, "logits/rejected": 10.217616081237793, "logps/chosen": -210.58311462402344, "logps/rejected": -235.1737518310547, "loss": 0.4993, "rewards/accuracies": 0.875, "rewards/chosen": 0.10423905402421951, "rewards/margins": 0.521422803401947, "rewards/rejected": -0.4171837866306305, "step": 6323 }, { "epoch": 0.9780011598685482, "grad_norm": 4.534695148468018, "learning_rate": 3.7444151678313672e-06, "logits/chosen": 5.336467742919922, "logits/rejected": 5.143927574157715, "logps/chosen": -235.08445739746094, "logps/rejected": -248.03250122070312, "loss": 0.6127, "rewards/accuracies": 0.75, "rewards/chosen": 0.38200390338897705, "rewards/margins": 0.3354136347770691, "rewards/rejected": 0.04659028351306915, "step": 6324 }, { "epoch": 0.9781558090083123, "grad_norm": 6.216179370880127, "learning_rate": 3.7441287661816935e-06, "logits/chosen": 7.222482681274414, "logits/rejected": 10.387896537780762, "logps/chosen": -293.2083740234375, "logps/rejected": -327.56201171875, "loss": 0.4955, "rewards/accuracies": 0.75, "rewards/chosen": 0.07214432209730148, "rewards/margins": 0.5247325897216797, "rewards/rejected": -0.4525882601737976, "step": 6325 }, { "epoch": 0.9783104581480766, "grad_norm": 5.459686279296875, "learning_rate": 3.7438423645320197e-06, "logits/chosen": 16.51071548461914, "logits/rejected": 8.343562126159668, "logps/chosen": -266.8075866699219, "logps/rejected": -244.42324829101562, "loss": 0.5517, "rewards/accuracies": 0.75, "rewards/chosen": 0.17756357789039612, "rewards/margins": 0.5416630506515503, "rewards/rejected": -0.36409950256347656, "step": 6326 }, { "epoch": 0.9784651072878408, "grad_norm": 5.218517780303955, "learning_rate": 3.7435559628823463e-06, "logits/chosen": 13.929750442504883, "logits/rejected": 8.82845687866211, "logps/chosen": -417.3029479980469, "logps/rejected": -352.20751953125, "loss": 0.5191, "rewards/accuracies": 0.75, "rewards/chosen": 0.3881669342517853, "rewards/margins": 0.5378721952438354, "rewards/rejected": -0.14970529079437256, "step": 6327 }, { "epoch": 0.9786197564276049, "grad_norm": 4.251748085021973, "learning_rate": 3.743269561232673e-06, "logits/chosen": 11.406187057495117, "logits/rejected": 6.826017379760742, "logps/chosen": -208.38363647460938, "logps/rejected": -186.21038818359375, "loss": 0.4853, "rewards/accuracies": 0.875, "rewards/chosen": 0.1649625301361084, "rewards/margins": 0.5692854523658752, "rewards/rejected": -0.40432295203208923, "step": 6328 }, { "epoch": 0.978774405567369, "grad_norm": 5.656533241271973, "learning_rate": 3.7429831595829996e-06, "logits/chosen": 11.392833709716797, "logits/rejected": 8.460349082946777, "logps/chosen": -315.70086669921875, "logps/rejected": -250.5478973388672, "loss": 0.6484, "rewards/accuracies": 0.75, "rewards/chosen": -0.03816051781177521, "rewards/margins": 0.4078497886657715, "rewards/rejected": -0.4460102915763855, "step": 6329 }, { "epoch": 0.9789290547071332, "grad_norm": 6.576191425323486, "learning_rate": 3.7426967579333263e-06, "logits/chosen": 14.664199829101562, "logits/rejected": 14.87417221069336, "logps/chosen": -287.00469970703125, "logps/rejected": -283.00994873046875, "loss": 0.8008, "rewards/accuracies": 0.375, "rewards/chosen": -0.8382795453071594, "rewards/margins": 0.001187305897474289, "rewards/rejected": -0.8394668102264404, "step": 6330 }, { "epoch": 0.9790837038468974, "grad_norm": 4.4208984375, "learning_rate": 3.742410356283652e-06, "logits/chosen": 8.74218463897705, "logits/rejected": 9.795685768127441, "logps/chosen": -262.88226318359375, "logps/rejected": -216.15997314453125, "loss": 0.682, "rewards/accuracies": 0.625, "rewards/chosen": 0.3701019287109375, "rewards/margins": 0.08830776065587997, "rewards/rejected": 0.28179416060447693, "step": 6331 }, { "epoch": 0.9792383529866615, "grad_norm": 4.583682537078857, "learning_rate": 3.7421239546339787e-06, "logits/chosen": 11.940658569335938, "logits/rejected": 5.56840181350708, "logps/chosen": -402.09283447265625, "logps/rejected": -268.4326477050781, "loss": 0.4906, "rewards/accuracies": 0.75, "rewards/chosen": 0.505084753036499, "rewards/margins": 0.9302171468734741, "rewards/rejected": -0.4251323640346527, "step": 6332 }, { "epoch": 0.9793930021264257, "grad_norm": 4.248706817626953, "learning_rate": 3.7418375529843054e-06, "logits/chosen": 11.651357650756836, "logits/rejected": 5.169594764709473, "logps/chosen": -423.9834899902344, "logps/rejected": -283.9046936035156, "loss": 0.4152, "rewards/accuracies": 0.75, "rewards/chosen": 0.4368492066860199, "rewards/margins": 0.8613738417625427, "rewards/rejected": -0.42452460527420044, "step": 6333 }, { "epoch": 0.9795476512661898, "grad_norm": 3.337502956390381, "learning_rate": 3.741551151334632e-06, "logits/chosen": 10.139825820922852, "logits/rejected": 7.991117477416992, "logps/chosen": -232.88397216796875, "logps/rejected": -200.31248474121094, "loss": 0.4523, "rewards/accuracies": 0.75, "rewards/chosen": -0.46951746940612793, "rewards/margins": 0.6828499436378479, "rewards/rejected": -1.152367353439331, "step": 6334 }, { "epoch": 0.979702300405954, "grad_norm": 8.208161354064941, "learning_rate": 3.7412647496849587e-06, "logits/chosen": 8.249188423156738, "logits/rejected": 8.633781433105469, "logps/chosen": -392.4921875, "logps/rejected": -305.61383056640625, "loss": 0.6339, "rewards/accuracies": 0.75, "rewards/chosen": 0.2545856535434723, "rewards/margins": 0.16935941576957703, "rewards/rejected": 0.08522625267505646, "step": 6335 }, { "epoch": 0.9798569495457181, "grad_norm": 5.370692253112793, "learning_rate": 3.7409783480352853e-06, "logits/chosen": 10.284990310668945, "logits/rejected": 8.454547882080078, "logps/chosen": -375.0708312988281, "logps/rejected": -283.7245788574219, "loss": 0.6497, "rewards/accuracies": 0.75, "rewards/chosen": 0.23219729959964752, "rewards/margins": 0.33492040634155273, "rewards/rejected": -0.1027231216430664, "step": 6336 }, { "epoch": 0.9800115986854823, "grad_norm": 7.273519039154053, "learning_rate": 3.7406919463856116e-06, "logits/chosen": 14.320666313171387, "logits/rejected": 12.81432056427002, "logps/chosen": -365.43475341796875, "logps/rejected": -246.53311157226562, "loss": 0.8923, "rewards/accuracies": 0.375, "rewards/chosen": -0.32235288619995117, "rewards/margins": -0.14710262417793274, "rewards/rejected": -0.17525026202201843, "step": 6337 }, { "epoch": 0.9801662478252464, "grad_norm": 4.570174694061279, "learning_rate": 3.740405544735938e-06, "logits/chosen": 13.6356840133667, "logits/rejected": 8.28172779083252, "logps/chosen": -341.417724609375, "logps/rejected": -264.3694763183594, "loss": 0.523, "rewards/accuracies": 0.75, "rewards/chosen": 0.3188232183456421, "rewards/margins": 0.8184434175491333, "rewards/rejected": -0.499620258808136, "step": 6338 }, { "epoch": 0.9803208969650107, "grad_norm": 9.591302871704102, "learning_rate": 3.7401191430862645e-06, "logits/chosen": 4.8775482177734375, "logits/rejected": 6.110818386077881, "logps/chosen": -141.5069122314453, "logps/rejected": -298.1834411621094, "loss": 0.7242, "rewards/accuracies": 0.5, "rewards/chosen": -0.3294106125831604, "rewards/margins": 0.20954623818397522, "rewards/rejected": -0.5389568209648132, "step": 6339 }, { "epoch": 0.9804755461047748, "grad_norm": 6.365485668182373, "learning_rate": 3.739832741436591e-06, "logits/chosen": 7.626447677612305, "logits/rejected": 9.57275390625, "logps/chosen": -249.103271484375, "logps/rejected": -248.5445556640625, "loss": 0.7672, "rewards/accuracies": 0.5, "rewards/chosen": -0.010637475177645683, "rewards/margins": -0.06676015257835388, "rewards/rejected": 0.05612267926335335, "step": 6340 }, { "epoch": 0.980630195244539, "grad_norm": 6.013866901397705, "learning_rate": 3.7395463397869173e-06, "logits/chosen": 15.039027214050293, "logits/rejected": 10.024212837219238, "logps/chosen": -308.22174072265625, "logps/rejected": -277.9765319824219, "loss": 0.6094, "rewards/accuracies": 0.625, "rewards/chosen": -0.10356522351503372, "rewards/margins": 0.4523034393787384, "rewards/rejected": -0.5558686256408691, "step": 6341 }, { "epoch": 0.9807848443843031, "grad_norm": 8.10254192352295, "learning_rate": 3.739259938137244e-06, "logits/chosen": 11.551238059997559, "logits/rejected": 1.3056763410568237, "logps/chosen": -341.2266845703125, "logps/rejected": -220.62278747558594, "loss": 0.7973, "rewards/accuracies": 0.25, "rewards/chosen": -0.03634757548570633, "rewards/margins": -0.10511816293001175, "rewards/rejected": 0.06877059489488602, "step": 6342 }, { "epoch": 0.9809394935240673, "grad_norm": 6.16043758392334, "learning_rate": 3.7389735364875706e-06, "logits/chosen": 5.587350845336914, "logits/rejected": 3.4804418087005615, "logps/chosen": -188.5323486328125, "logps/rejected": -241.19775390625, "loss": 0.579, "rewards/accuracies": 0.625, "rewards/chosen": 0.3693351149559021, "rewards/margins": 0.5227667093276978, "rewards/rejected": -0.15343162417411804, "step": 6343 }, { "epoch": 0.9810941426638314, "grad_norm": 5.35908317565918, "learning_rate": 3.7386871348378973e-06, "logits/chosen": 7.207553386688232, "logits/rejected": 5.906684398651123, "logps/chosen": -210.07369995117188, "logps/rejected": -238.39297485351562, "loss": 0.7538, "rewards/accuracies": 0.5, "rewards/chosen": 0.03599214926362038, "rewards/margins": 0.04232324659824371, "rewards/rejected": -0.006331082433462143, "step": 6344 }, { "epoch": 0.9812487918035956, "grad_norm": 4.146737575531006, "learning_rate": 3.738400733188223e-06, "logits/chosen": 7.1606035232543945, "logits/rejected": 8.163259506225586, "logps/chosen": -143.42739868164062, "logps/rejected": -185.353271484375, "loss": 0.6346, "rewards/accuracies": 0.625, "rewards/chosen": 0.09329091012477875, "rewards/margins": 0.22248223423957825, "rewards/rejected": -0.1291913092136383, "step": 6345 }, { "epoch": 0.9814034409433597, "grad_norm": 4.873830795288086, "learning_rate": 3.7381143315385497e-06, "logits/chosen": 15.064471244812012, "logits/rejected": 10.58346176147461, "logps/chosen": -322.13592529296875, "logps/rejected": -242.6052703857422, "loss": 0.6514, "rewards/accuracies": 0.625, "rewards/chosen": 0.15427017211914062, "rewards/margins": 0.19796046614646912, "rewards/rejected": -0.04369029402732849, "step": 6346 }, { "epoch": 0.9815580900831239, "grad_norm": 4.298923492431641, "learning_rate": 3.7378279298888764e-06, "logits/chosen": 9.079001426696777, "logits/rejected": 4.134699821472168, "logps/chosen": -292.2060546875, "logps/rejected": -202.46417236328125, "loss": 0.4833, "rewards/accuracies": 0.75, "rewards/chosen": 0.32802486419677734, "rewards/margins": 0.5648854970932007, "rewards/rejected": -0.23686060309410095, "step": 6347 }, { "epoch": 0.981712739222888, "grad_norm": 9.15503978729248, "learning_rate": 3.737541528239203e-06, "logits/chosen": 14.616005897521973, "logits/rejected": 7.228925704956055, "logps/chosen": -310.8160400390625, "logps/rejected": -213.77389526367188, "loss": 0.7195, "rewards/accuracies": 0.5, "rewards/chosen": 0.07918335497379303, "rewards/margins": 0.22846432030200958, "rewards/rejected": -0.14928096532821655, "step": 6348 }, { "epoch": 0.9818673883626522, "grad_norm": 7.727329730987549, "learning_rate": 3.7372551265895297e-06, "logits/chosen": 10.335702896118164, "logits/rejected": 8.534158706665039, "logps/chosen": -396.2249755859375, "logps/rejected": -378.1708679199219, "loss": 0.7248, "rewards/accuracies": 0.5, "rewards/chosen": 0.49631375074386597, "rewards/margins": 0.06901147961616516, "rewards/rejected": 0.4273022711277008, "step": 6349 }, { "epoch": 0.9820220375024163, "grad_norm": 5.398975849151611, "learning_rate": 3.7369687249398563e-06, "logits/chosen": 8.378582000732422, "logits/rejected": 12.507745742797852, "logps/chosen": -305.9085388183594, "logps/rejected": -333.411865234375, "loss": 0.5532, "rewards/accuracies": 0.75, "rewards/chosen": 0.17840319871902466, "rewards/margins": 0.5202721357345581, "rewards/rejected": -0.3418689966201782, "step": 6350 }, { "epoch": 0.9821766866421806, "grad_norm": 5.5790019035339355, "learning_rate": 3.736682323290182e-06, "logits/chosen": 13.69083023071289, "logits/rejected": 6.97307825088501, "logps/chosen": -358.3127746582031, "logps/rejected": -206.38015747070312, "loss": 0.5706, "rewards/accuracies": 0.75, "rewards/chosen": 0.2394128292798996, "rewards/margins": 0.3754867911338806, "rewards/rejected": -0.1360739767551422, "step": 6351 }, { "epoch": 0.9823313357819448, "grad_norm": 5.558178424835205, "learning_rate": 3.736395921640509e-06, "logits/chosen": 10.22050666809082, "logits/rejected": 8.757943153381348, "logps/chosen": -345.7509460449219, "logps/rejected": -301.1663513183594, "loss": 0.6601, "rewards/accuracies": 0.625, "rewards/chosen": 0.28731414675712585, "rewards/margins": 0.13952180743217468, "rewards/rejected": 0.14779233932495117, "step": 6352 }, { "epoch": 0.9824859849217089, "grad_norm": 4.343262672424316, "learning_rate": 3.7361095199908354e-06, "logits/chosen": 8.431159019470215, "logits/rejected": 5.432050704956055, "logps/chosen": -168.1685333251953, "logps/rejected": -182.77088928222656, "loss": 0.6361, "rewards/accuracies": 0.375, "rewards/chosen": -0.10684751719236374, "rewards/margins": 0.2605520486831665, "rewards/rejected": -0.3673996031284332, "step": 6353 }, { "epoch": 0.9826406340614731, "grad_norm": 6.614505290985107, "learning_rate": 3.735823118341162e-06, "logits/chosen": 4.245138168334961, "logits/rejected": 4.8325276374816895, "logps/chosen": -252.76461791992188, "logps/rejected": -242.34823608398438, "loss": 0.6661, "rewards/accuracies": 0.75, "rewards/chosen": -0.20296013355255127, "rewards/margins": 0.23311752080917358, "rewards/rejected": -0.43607765436172485, "step": 6354 }, { "epoch": 0.9827952832012372, "grad_norm": 7.216494560241699, "learning_rate": 3.7355367166914887e-06, "logits/chosen": 8.744220733642578, "logits/rejected": 9.35415267944336, "logps/chosen": -209.74905395507812, "logps/rejected": -250.81924438476562, "loss": 0.7311, "rewards/accuracies": 0.375, "rewards/chosen": -0.19596849381923676, "rewards/margins": 0.05969156324863434, "rewards/rejected": -0.2556600570678711, "step": 6355 }, { "epoch": 0.9829499323410014, "grad_norm": 5.547492027282715, "learning_rate": 3.735250315041815e-06, "logits/chosen": 10.118589401245117, "logits/rejected": 5.989643573760986, "logps/chosen": -265.45489501953125, "logps/rejected": -189.0480194091797, "loss": 0.5531, "rewards/accuracies": 0.75, "rewards/chosen": 0.530346155166626, "rewards/margins": 0.6523032784461975, "rewards/rejected": -0.12195707857608795, "step": 6356 }, { "epoch": 0.9831045814807655, "grad_norm": 3.405036449432373, "learning_rate": 3.7349639133921416e-06, "logits/chosen": 6.900462627410889, "logits/rejected": 8.3464994430542, "logps/chosen": -157.25973510742188, "logps/rejected": -232.02215576171875, "loss": 0.435, "rewards/accuracies": 0.75, "rewards/chosen": 0.3262181580066681, "rewards/margins": 1.0053631067276, "rewards/rejected": -0.6791449189186096, "step": 6357 }, { "epoch": 0.9832592306205297, "grad_norm": 6.441837310791016, "learning_rate": 3.734677511742468e-06, "logits/chosen": 11.748625755310059, "logits/rejected": 13.682984352111816, "logps/chosen": -368.30633544921875, "logps/rejected": -377.2191162109375, "loss": 0.649, "rewards/accuracies": 0.5, "rewards/chosen": 0.09854501485824585, "rewards/margins": 0.3158285915851593, "rewards/rejected": -0.21728357672691345, "step": 6358 }, { "epoch": 0.9834138797602938, "grad_norm": 6.330887317657471, "learning_rate": 3.7343911100927945e-06, "logits/chosen": 9.120230674743652, "logits/rejected": 10.975332260131836, "logps/chosen": -350.6260681152344, "logps/rejected": -413.9373779296875, "loss": 0.6913, "rewards/accuracies": 0.625, "rewards/chosen": -0.02962927520275116, "rewards/margins": 0.3403489589691162, "rewards/rejected": -0.36997824907302856, "step": 6359 }, { "epoch": 0.983568528900058, "grad_norm": 5.733031272888184, "learning_rate": 3.7341047084431207e-06, "logits/chosen": 8.469575881958008, "logits/rejected": 7.300873279571533, "logps/chosen": -290.756591796875, "logps/rejected": -237.67581176757812, "loss": 0.6744, "rewards/accuracies": 0.375, "rewards/chosen": 0.1746126264333725, "rewards/margins": 0.17614364624023438, "rewards/rejected": -0.0015310049057006836, "step": 6360 }, { "epoch": 0.9837231780398221, "grad_norm": 6.685068607330322, "learning_rate": 3.7338183067934474e-06, "logits/chosen": 5.318485260009766, "logits/rejected": 10.301240921020508, "logps/chosen": -225.68707275390625, "logps/rejected": -325.6181640625, "loss": 0.8425, "rewards/accuracies": 0.375, "rewards/chosen": -0.1301746368408203, "rewards/margins": -0.15783292055130005, "rewards/rejected": 0.027658268809318542, "step": 6361 }, { "epoch": 0.9838778271795863, "grad_norm": 5.311933994293213, "learning_rate": 3.733531905143774e-06, "logits/chosen": 15.729347229003906, "logits/rejected": 11.071685791015625, "logps/chosen": -256.0406188964844, "logps/rejected": -199.35235595703125, "loss": 0.6575, "rewards/accuracies": 0.625, "rewards/chosen": -0.060159068554639816, "rewards/margins": 0.16870784759521484, "rewards/rejected": -0.22886691987514496, "step": 6362 }, { "epoch": 0.9840324763193504, "grad_norm": 3.684779644012451, "learning_rate": 3.7332455034941007e-06, "logits/chosen": 9.606433868408203, "logits/rejected": 0.7240857481956482, "logps/chosen": -279.3525390625, "logps/rejected": -215.30224609375, "loss": 0.4508, "rewards/accuracies": 0.875, "rewards/chosen": -0.24538739025592804, "rewards/margins": 0.7554906010627747, "rewards/rejected": -1.0008779764175415, "step": 6363 }, { "epoch": 0.9841871254591147, "grad_norm": 4.594018459320068, "learning_rate": 3.7329591018444265e-06, "logits/chosen": 9.479605674743652, "logits/rejected": 11.16738224029541, "logps/chosen": -212.99578857421875, "logps/rejected": -239.20811462402344, "loss": 0.5402, "rewards/accuracies": 0.75, "rewards/chosen": 0.37108105421066284, "rewards/margins": 0.531692624092102, "rewards/rejected": -0.16061154007911682, "step": 6364 }, { "epoch": 0.9843417745988788, "grad_norm": 5.41662073135376, "learning_rate": 3.732672700194753e-06, "logits/chosen": 9.314374923706055, "logits/rejected": 8.568046569824219, "logps/chosen": -228.67742919921875, "logps/rejected": -239.1463623046875, "loss": 0.6967, "rewards/accuracies": 0.625, "rewards/chosen": 0.025880619883537292, "rewards/margins": 0.07058162987232208, "rewards/rejected": -0.044700995087623596, "step": 6365 }, { "epoch": 0.984496423738643, "grad_norm": 4.842507839202881, "learning_rate": 3.7323862985450798e-06, "logits/chosen": 16.975248336791992, "logits/rejected": 8.563737869262695, "logps/chosen": -381.3269958496094, "logps/rejected": -256.0245361328125, "loss": 0.4658, "rewards/accuracies": 0.625, "rewards/chosen": 0.4928404688835144, "rewards/margins": 0.8768168687820435, "rewards/rejected": -0.38397639989852905, "step": 6366 }, { "epoch": 0.9846510728784071, "grad_norm": 5.3084211349487305, "learning_rate": 3.7320998968954064e-06, "logits/chosen": 15.943305969238281, "logits/rejected": 8.33459186553955, "logps/chosen": -354.7040100097656, "logps/rejected": -254.4704132080078, "loss": 0.597, "rewards/accuracies": 0.875, "rewards/chosen": -0.08817901462316513, "rewards/margins": 0.3635047972202301, "rewards/rejected": -0.45168381929397583, "step": 6367 }, { "epoch": 0.9848057220181713, "grad_norm": 4.902894020080566, "learning_rate": 3.731813495245733e-06, "logits/chosen": 6.656890392303467, "logits/rejected": 11.914179801940918, "logps/chosen": -222.10264587402344, "logps/rejected": -264.1852111816406, "loss": 0.6316, "rewards/accuracies": 0.625, "rewards/chosen": 0.05056753009557724, "rewards/margins": 0.2905178964138031, "rewards/rejected": -0.23995037376880646, "step": 6368 }, { "epoch": 0.9849603711579354, "grad_norm": 5.321906566619873, "learning_rate": 3.7315270935960597e-06, "logits/chosen": 6.409091949462891, "logits/rejected": 8.225178718566895, "logps/chosen": -210.30987548828125, "logps/rejected": -222.08755493164062, "loss": 0.7868, "rewards/accuracies": 0.625, "rewards/chosen": 0.16246500611305237, "rewards/margins": 0.0187017060816288, "rewards/rejected": 0.14376330375671387, "step": 6369 }, { "epoch": 0.9851150202976996, "grad_norm": 4.566802501678467, "learning_rate": 3.731240691946386e-06, "logits/chosen": 13.245766639709473, "logits/rejected": 9.584945678710938, "logps/chosen": -310.4659729003906, "logps/rejected": -279.9970703125, "loss": 0.6395, "rewards/accuracies": 0.75, "rewards/chosen": 0.3515969216823578, "rewards/margins": 0.15906305611133575, "rewards/rejected": 0.19253388047218323, "step": 6370 }, { "epoch": 0.9852696694374637, "grad_norm": 5.151280879974365, "learning_rate": 3.730954290296712e-06, "logits/chosen": 11.174251556396484, "logits/rejected": 5.913450241088867, "logps/chosen": -372.3224792480469, "logps/rejected": -259.20526123046875, "loss": 0.4793, "rewards/accuracies": 0.875, "rewards/chosen": 0.3799913227558136, "rewards/margins": 0.5719354152679443, "rewards/rejected": -0.19194403290748596, "step": 6371 }, { "epoch": 0.9854243185772279, "grad_norm": 5.12819242477417, "learning_rate": 3.730667888647039e-06, "logits/chosen": 14.985021591186523, "logits/rejected": 10.003158569335938, "logps/chosen": -271.2546081542969, "logps/rejected": -266.3619384765625, "loss": 0.6395, "rewards/accuracies": 0.75, "rewards/chosen": -0.08971020579338074, "rewards/margins": 0.15500064194202423, "rewards/rejected": -0.24471083283424377, "step": 6372 }, { "epoch": 0.985578967716992, "grad_norm": 7.795731067657471, "learning_rate": 3.7303814869973655e-06, "logits/chosen": 9.79568099975586, "logits/rejected": 10.994924545288086, "logps/chosen": -419.13677978515625, "logps/rejected": -419.67138671875, "loss": 0.8165, "rewards/accuracies": 0.375, "rewards/chosen": 0.22752055525779724, "rewards/margins": 0.06996190547943115, "rewards/rejected": 0.1575586497783661, "step": 6373 }, { "epoch": 0.9857336168567562, "grad_norm": 5.015719890594482, "learning_rate": 3.730095085347692e-06, "logits/chosen": 15.121077537536621, "logits/rejected": 4.451775550842285, "logps/chosen": -273.3724060058594, "logps/rejected": -159.63136291503906, "loss": 0.6138, "rewards/accuracies": 0.75, "rewards/chosen": 0.22033709287643433, "rewards/margins": 0.2666897177696228, "rewards/rejected": -0.0463525764644146, "step": 6374 }, { "epoch": 0.9858882659965204, "grad_norm": 7.012768745422363, "learning_rate": 3.7298086836980184e-06, "logits/chosen": 7.010719299316406, "logits/rejected": 9.252592086791992, "logps/chosen": -277.5211486816406, "logps/rejected": -393.36322021484375, "loss": 0.9625, "rewards/accuracies": 0.375, "rewards/chosen": -0.3574404716491699, "rewards/margins": -0.12064673006534576, "rewards/rejected": -0.23679374158382416, "step": 6375 }, { "epoch": 0.9860429151362845, "grad_norm": 4.6598639488220215, "learning_rate": 3.729522282048345e-06, "logits/chosen": 11.543252944946289, "logits/rejected": 6.669504165649414, "logps/chosen": -148.09378051757812, "logps/rejected": -122.16343688964844, "loss": 0.6663, "rewards/accuracies": 0.625, "rewards/chosen": -0.09077329933643341, "rewards/margins": 0.14782200753688812, "rewards/rejected": -0.23859530687332153, "step": 6376 }, { "epoch": 0.9861975642760488, "grad_norm": 5.08552885055542, "learning_rate": 3.7292358803986712e-06, "logits/chosen": 10.157306671142578, "logits/rejected": 10.05604076385498, "logps/chosen": -278.8956298828125, "logps/rejected": -225.61953735351562, "loss": 0.668, "rewards/accuracies": 0.375, "rewards/chosen": 0.013445764780044556, "rewards/margins": 0.14905597269535065, "rewards/rejected": -0.1356101930141449, "step": 6377 }, { "epoch": 0.9863522134158129, "grad_norm": 6.076529502868652, "learning_rate": 3.728949478748998e-06, "logits/chosen": 7.508108615875244, "logits/rejected": 4.79211950302124, "logps/chosen": -382.4896240234375, "logps/rejected": -307.8619384765625, "loss": 0.8144, "rewards/accuracies": 0.375, "rewards/chosen": 0.1903144121170044, "rewards/margins": -0.1058650016784668, "rewards/rejected": 0.2961793839931488, "step": 6378 }, { "epoch": 0.9865068625555771, "grad_norm": 6.188473224639893, "learning_rate": 3.728663077099324e-06, "logits/chosen": 13.722238540649414, "logits/rejected": 11.189976692199707, "logps/chosen": -229.97079467773438, "logps/rejected": -174.31863403320312, "loss": 0.7357, "rewards/accuracies": 0.5, "rewards/chosen": -0.3522096276283264, "rewards/margins": -0.037701189517974854, "rewards/rejected": -0.31450843811035156, "step": 6379 }, { "epoch": 0.9866615116953412, "grad_norm": 3.196591854095459, "learning_rate": 3.7283766754496508e-06, "logits/chosen": 18.061851501464844, "logits/rejected": 10.70038890838623, "logps/chosen": -296.8410339355469, "logps/rejected": -143.9546356201172, "loss": 0.5163, "rewards/accuracies": 0.5, "rewards/chosen": 0.33662357926368713, "rewards/margins": 0.871543288230896, "rewards/rejected": -0.5349197387695312, "step": 6380 }, { "epoch": 0.9868161608351054, "grad_norm": 5.040510654449463, "learning_rate": 3.7280902737999774e-06, "logits/chosen": 10.511635780334473, "logits/rejected": 7.7900285720825195, "logps/chosen": -256.0610656738281, "logps/rejected": -270.65484619140625, "loss": 0.5722, "rewards/accuracies": 0.875, "rewards/chosen": 0.6627236008644104, "rewards/margins": 0.4329145550727844, "rewards/rejected": 0.2298090159893036, "step": 6381 }, { "epoch": 0.9869708099748695, "grad_norm": 4.601862907409668, "learning_rate": 3.727803872150304e-06, "logits/chosen": 7.746264934539795, "logits/rejected": 5.125948429107666, "logps/chosen": -207.99002075195312, "logps/rejected": -240.99896240234375, "loss": 0.651, "rewards/accuracies": 0.5, "rewards/chosen": -0.47464340925216675, "rewards/margins": 0.2699340879917145, "rewards/rejected": -0.7445774674415588, "step": 6382 }, { "epoch": 0.9871254591146337, "grad_norm": 5.484795093536377, "learning_rate": 3.7275174705006307e-06, "logits/chosen": 13.179880142211914, "logits/rejected": 12.174570083618164, "logps/chosen": -267.025634765625, "logps/rejected": -188.67352294921875, "loss": 0.7008, "rewards/accuracies": 0.5, "rewards/chosen": 0.12739549577236176, "rewards/margins": 0.06455676257610321, "rewards/rejected": 0.06283873319625854, "step": 6383 }, { "epoch": 0.9872801082543978, "grad_norm": 4.959286689758301, "learning_rate": 3.7272310688509565e-06, "logits/chosen": 12.184050559997559, "logits/rejected": 8.341327667236328, "logps/chosen": -327.19879150390625, "logps/rejected": -264.320556640625, "loss": 0.5498, "rewards/accuracies": 0.75, "rewards/chosen": 0.3330177366733551, "rewards/margins": 0.5039314031600952, "rewards/rejected": -0.1709136962890625, "step": 6384 }, { "epoch": 0.987434757394162, "grad_norm": 3.609959363937378, "learning_rate": 3.726944667201283e-06, "logits/chosen": 8.2381591796875, "logits/rejected": 9.955843925476074, "logps/chosen": -185.87957763671875, "logps/rejected": -232.37850952148438, "loss": 0.5972, "rewards/accuracies": 0.75, "rewards/chosen": 0.1679568886756897, "rewards/margins": 0.257804274559021, "rewards/rejected": -0.08984740823507309, "step": 6385 }, { "epoch": 0.9875894065339261, "grad_norm": 4.931064128875732, "learning_rate": 3.72665826555161e-06, "logits/chosen": 13.199957847595215, "logits/rejected": 7.943856239318848, "logps/chosen": -175.22314453125, "logps/rejected": -154.5943145751953, "loss": 0.656, "rewards/accuracies": 0.625, "rewards/chosen": 0.04112076759338379, "rewards/margins": 0.0937330350279808, "rewards/rejected": -0.05261225998401642, "step": 6386 }, { "epoch": 0.9877440556736903, "grad_norm": 3.6509485244750977, "learning_rate": 3.7263718639019365e-06, "logits/chosen": 12.731666564941406, "logits/rejected": 6.374953269958496, "logps/chosen": -232.49203491210938, "logps/rejected": -191.49087524414062, "loss": 0.4798, "rewards/accuracies": 0.875, "rewards/chosen": 0.10531330108642578, "rewards/margins": 0.6045680642127991, "rewards/rejected": -0.4992547035217285, "step": 6387 }, { "epoch": 0.9878987048134544, "grad_norm": 5.918891906738281, "learning_rate": 3.726085462252263e-06, "logits/chosen": 14.738401412963867, "logits/rejected": 7.014528751373291, "logps/chosen": -442.99920654296875, "logps/rejected": -271.3382568359375, "loss": 0.6167, "rewards/accuracies": 0.375, "rewards/chosen": 0.3283092677593231, "rewards/margins": 0.2893158197402954, "rewards/rejected": 0.038993436843156815, "step": 6388 }, { "epoch": 0.9880533539532186, "grad_norm": 5.170039653778076, "learning_rate": 3.7257990606025894e-06, "logits/chosen": 9.440051078796387, "logits/rejected": 12.597487449645996, "logps/chosen": -180.85267639160156, "logps/rejected": -208.2803192138672, "loss": 0.6388, "rewards/accuracies": 0.625, "rewards/chosen": 0.09109164774417877, "rewards/margins": 0.2528671622276306, "rewards/rejected": -0.16177549958229065, "step": 6389 }, { "epoch": 0.9882080030929828, "grad_norm": 5.9500932693481445, "learning_rate": 3.725512658952916e-06, "logits/chosen": 10.363550186157227, "logits/rejected": 9.956448554992676, "logps/chosen": -300.74749755859375, "logps/rejected": -325.7415466308594, "loss": 0.7747, "rewards/accuracies": 0.25, "rewards/chosen": -0.15665781497955322, "rewards/margins": -0.08261716365814209, "rewards/rejected": -0.07404066622257233, "step": 6390 }, { "epoch": 0.988362652232747, "grad_norm": 5.4790449142456055, "learning_rate": 3.7252262573032422e-06, "logits/chosen": 11.44119644165039, "logits/rejected": 9.993799209594727, "logps/chosen": -244.72235107421875, "logps/rejected": -254.03421020507812, "loss": 0.6352, "rewards/accuracies": 0.5, "rewards/chosen": 0.26246488094329834, "rewards/margins": 0.17048987746238708, "rewards/rejected": 0.09197502583265305, "step": 6391 }, { "epoch": 0.9885173013725111, "grad_norm": 6.128749370574951, "learning_rate": 3.724939855653569e-06, "logits/chosen": 8.635459899902344, "logits/rejected": 3.4009971618652344, "logps/chosen": -204.38153076171875, "logps/rejected": -212.85047912597656, "loss": 0.7206, "rewards/accuracies": 0.5, "rewards/chosen": -0.28203368186950684, "rewards/margins": 0.08916183561086655, "rewards/rejected": -0.371195524930954, "step": 6392 }, { "epoch": 0.9886719505122753, "grad_norm": 5.082852363586426, "learning_rate": 3.7246534540038955e-06, "logits/chosen": 6.198980331420898, "logits/rejected": 9.080013275146484, "logps/chosen": -210.5640411376953, "logps/rejected": -230.71670532226562, "loss": 0.741, "rewards/accuracies": 0.625, "rewards/chosen": 0.1753840446472168, "rewards/margins": -0.05014643445611, "rewards/rejected": 0.2255304902791977, "step": 6393 }, { "epoch": 0.9888265996520395, "grad_norm": 7.022514343261719, "learning_rate": 3.7243670523542218e-06, "logits/chosen": 15.562702178955078, "logits/rejected": 8.763711929321289, "logps/chosen": -299.00506591796875, "logps/rejected": -256.720458984375, "loss": 0.6607, "rewards/accuracies": 0.625, "rewards/chosen": 0.03995572030544281, "rewards/margins": 0.23928508162498474, "rewards/rejected": -0.19932936131954193, "step": 6394 }, { "epoch": 0.9889812487918036, "grad_norm": 5.80563497543335, "learning_rate": 3.7240806507045484e-06, "logits/chosen": 9.469968795776367, "logits/rejected": 2.30733060836792, "logps/chosen": -257.93316650390625, "logps/rejected": -222.91473388671875, "loss": 0.6314, "rewards/accuracies": 0.75, "rewards/chosen": 0.33021676540374756, "rewards/margins": 0.43539756536483765, "rewards/rejected": -0.1051807701587677, "step": 6395 }, { "epoch": 0.9891358979315678, "grad_norm": 6.67752742767334, "learning_rate": 3.723794249054875e-06, "logits/chosen": 7.952746391296387, "logits/rejected": 9.51457691192627, "logps/chosen": -260.056884765625, "logps/rejected": -264.37542724609375, "loss": 0.9161, "rewards/accuracies": 0.375, "rewards/chosen": -0.05107274651527405, "rewards/margins": -0.2795703113079071, "rewards/rejected": 0.22849754989147186, "step": 6396 }, { "epoch": 0.9892905470713319, "grad_norm": 3.7259817123413086, "learning_rate": 3.7235078474052013e-06, "logits/chosen": 15.1904935836792, "logits/rejected": 10.23059368133545, "logps/chosen": -257.2881774902344, "logps/rejected": -251.77818298339844, "loss": 0.4748, "rewards/accuracies": 0.875, "rewards/chosen": 0.39571958780288696, "rewards/margins": 0.5983229279518127, "rewards/rejected": -0.2026033103466034, "step": 6397 }, { "epoch": 0.9894451962110961, "grad_norm": 7.705357551574707, "learning_rate": 3.7232214457555275e-06, "logits/chosen": 5.060511589050293, "logits/rejected": 5.846954345703125, "logps/chosen": -366.26324462890625, "logps/rejected": -292.3350830078125, "loss": 0.8663, "rewards/accuracies": 0.25, "rewards/chosen": -0.05128952115774155, "rewards/margins": -0.2110341489315033, "rewards/rejected": 0.15974465012550354, "step": 6398 }, { "epoch": 0.9895998453508602, "grad_norm": 5.061436653137207, "learning_rate": 3.722935044105854e-06, "logits/chosen": 7.7066569328308105, "logits/rejected": 6.9301862716674805, "logps/chosen": -261.54443359375, "logps/rejected": -212.9409942626953, "loss": 0.68, "rewards/accuracies": 0.625, "rewards/chosen": 0.3508901000022888, "rewards/margins": 0.15891200304031372, "rewards/rejected": 0.1919780671596527, "step": 6399 }, { "epoch": 0.9897544944906244, "grad_norm": 6.117293357849121, "learning_rate": 3.722648642456181e-06, "logits/chosen": 12.491506576538086, "logits/rejected": 10.824533462524414, "logps/chosen": -463.5404052734375, "logps/rejected": -453.656982421875, "loss": 0.6627, "rewards/accuracies": 0.625, "rewards/chosen": 0.36115768551826477, "rewards/margins": 0.27406203746795654, "rewards/rejected": 0.08709569275379181, "step": 6400 }, { "epoch": 0.9899091436303885, "grad_norm": 8.071745872497559, "learning_rate": 3.7223622408065075e-06, "logits/chosen": 9.31758975982666, "logits/rejected": 6.549283981323242, "logps/chosen": -209.12582397460938, "logps/rejected": -175.58926391601562, "loss": 0.5469, "rewards/accuracies": 0.625, "rewards/chosen": 0.17268727719783783, "rewards/margins": 0.4429420828819275, "rewards/rejected": -0.2702547609806061, "step": 6401 }, { "epoch": 0.9900637927701527, "grad_norm": 6.005964279174805, "learning_rate": 3.722075839156834e-06, "logits/chosen": 7.940710544586182, "logits/rejected": 7.991883277893066, "logps/chosen": -260.8432922363281, "logps/rejected": -264.8493957519531, "loss": 0.7184, "rewards/accuracies": 0.625, "rewards/chosen": 0.28549882769584656, "rewards/margins": 0.10374833643436432, "rewards/rejected": 0.18175050616264343, "step": 6402 }, { "epoch": 0.9902184419099169, "grad_norm": 4.348447799682617, "learning_rate": 3.7217894375071608e-06, "logits/chosen": 10.062132835388184, "logits/rejected": 9.699281692504883, "logps/chosen": -239.81065368652344, "logps/rejected": -228.9882354736328, "loss": 0.6238, "rewards/accuracies": 0.75, "rewards/chosen": 0.2647506892681122, "rewards/margins": 0.3034078776836395, "rewards/rejected": -0.03865720331668854, "step": 6403 }, { "epoch": 0.9903730910496811, "grad_norm": 7.457139492034912, "learning_rate": 3.7215030358574866e-06, "logits/chosen": 10.277219772338867, "logits/rejected": 12.910514831542969, "logps/chosen": -314.7123718261719, "logps/rejected": -374.315185546875, "loss": 0.8161, "rewards/accuracies": 0.125, "rewards/chosen": 0.15039664506912231, "rewards/margins": -0.1926751285791397, "rewards/rejected": 0.34307175874710083, "step": 6404 }, { "epoch": 0.9905277401894452, "grad_norm": 5.699006080627441, "learning_rate": 3.7212166342078132e-06, "logits/chosen": 6.843673229217529, "logits/rejected": 6.50965690612793, "logps/chosen": -220.083740234375, "logps/rejected": -224.9158477783203, "loss": 0.6835, "rewards/accuracies": 0.5, "rewards/chosen": 0.15800824761390686, "rewards/margins": 0.042635876685380936, "rewards/rejected": 0.11537237465381622, "step": 6405 }, { "epoch": 0.9906823893292094, "grad_norm": 6.266790866851807, "learning_rate": 3.72093023255814e-06, "logits/chosen": 7.589700698852539, "logits/rejected": 13.214097023010254, "logps/chosen": -244.7571258544922, "logps/rejected": -375.617431640625, "loss": 0.8402, "rewards/accuracies": 0.375, "rewards/chosen": 0.18168388307094574, "rewards/margins": -0.1142912358045578, "rewards/rejected": 0.29597511887550354, "step": 6406 }, { "epoch": 0.9908370384689735, "grad_norm": 3.9138331413269043, "learning_rate": 3.7206438309084665e-06, "logits/chosen": 2.3229284286499023, "logits/rejected": 2.293281078338623, "logps/chosen": -172.07394409179688, "logps/rejected": -131.80897521972656, "loss": 0.6227, "rewards/accuracies": 0.75, "rewards/chosen": 0.40869075059890747, "rewards/margins": 0.2160681039094925, "rewards/rejected": 0.19262266159057617, "step": 6407 }, { "epoch": 0.9909916876087377, "grad_norm": 8.796308517456055, "learning_rate": 3.7203574292587927e-06, "logits/chosen": 14.957677841186523, "logits/rejected": 15.079998016357422, "logps/chosen": -272.4525146484375, "logps/rejected": -240.1488494873047, "loss": 0.8273, "rewards/accuracies": 0.625, "rewards/chosen": 0.04069849103689194, "rewards/margins": 0.1002734899520874, "rewards/rejected": -0.05957496166229248, "step": 6408 }, { "epoch": 0.9911463367485018, "grad_norm": 7.710762977600098, "learning_rate": 3.7200710276091194e-06, "logits/chosen": 17.11355972290039, "logits/rejected": 8.132133483886719, "logps/chosen": -466.312744140625, "logps/rejected": -349.2236328125, "loss": 0.5881, "rewards/accuracies": 0.75, "rewards/chosen": 0.6155927777290344, "rewards/margins": 0.3932848572731018, "rewards/rejected": 0.22230792045593262, "step": 6409 }, { "epoch": 0.991300985888266, "grad_norm": 8.482393264770508, "learning_rate": 3.7197846259594456e-06, "logits/chosen": 11.441797256469727, "logits/rejected": 14.680935859680176, "logps/chosen": -317.5072937011719, "logps/rejected": -345.8556823730469, "loss": 0.9578, "rewards/accuracies": 0.5, "rewards/chosen": -0.35917553305625916, "rewards/margins": -0.274425745010376, "rewards/rejected": -0.08474978804588318, "step": 6410 }, { "epoch": 0.9914556350280301, "grad_norm": 4.098133563995361, "learning_rate": 3.7194982243097723e-06, "logits/chosen": 10.93844223022461, "logits/rejected": 8.813115119934082, "logps/chosen": -180.91848754882812, "logps/rejected": -167.08865356445312, "loss": 0.7275, "rewards/accuracies": 0.5, "rewards/chosen": -0.06862376630306244, "rewards/margins": 0.0667669028043747, "rewards/rejected": -0.13539066910743713, "step": 6411 }, { "epoch": 0.9916102841677943, "grad_norm": 6.639828205108643, "learning_rate": 3.719211822660099e-06, "logits/chosen": 10.014822006225586, "logits/rejected": 9.669681549072266, "logps/chosen": -249.07492065429688, "logps/rejected": -293.29071044921875, "loss": 0.655, "rewards/accuracies": 0.75, "rewards/chosen": 0.04954013228416443, "rewards/margins": 0.3228171467781067, "rewards/rejected": -0.2732769846916199, "step": 6412 }, { "epoch": 0.9917649333075584, "grad_norm": 6.708457946777344, "learning_rate": 3.718925421010425e-06, "logits/chosen": 6.201162815093994, "logits/rejected": 2.6435225009918213, "logps/chosen": -197.08804321289062, "logps/rejected": -141.50880432128906, "loss": 0.6185, "rewards/accuracies": 0.75, "rewards/chosen": -0.48373717069625854, "rewards/margins": 0.31828323006629944, "rewards/rejected": -0.8020203709602356, "step": 6413 }, { "epoch": 0.9919195824473226, "grad_norm": 5.775219917297363, "learning_rate": 3.718639019360752e-06, "logits/chosen": 10.714132308959961, "logits/rejected": 9.037699699401855, "logps/chosen": -355.2578125, "logps/rejected": -285.55352783203125, "loss": 0.6524, "rewards/accuracies": 0.375, "rewards/chosen": 0.48623791337013245, "rewards/margins": 0.14448612928390503, "rewards/rejected": 0.34175175428390503, "step": 6414 }, { "epoch": 0.9920742315870869, "grad_norm": 3.9897866249084473, "learning_rate": 3.7183526177110785e-06, "logits/chosen": 7.48845100402832, "logits/rejected": 7.3719987869262695, "logps/chosen": -206.3500213623047, "logps/rejected": -230.9735107421875, "loss": 0.6759, "rewards/accuracies": 0.625, "rewards/chosen": -0.04097408801317215, "rewards/margins": 0.06924095749855042, "rewards/rejected": -0.11021503806114197, "step": 6415 }, { "epoch": 0.992228880726851, "grad_norm": 4.605623722076416, "learning_rate": 3.718066216061405e-06, "logits/chosen": 10.593321800231934, "logits/rejected": 8.16114616394043, "logps/chosen": -251.10311889648438, "logps/rejected": -222.08673095703125, "loss": 0.6673, "rewards/accuracies": 0.5, "rewards/chosen": -0.2484852820634842, "rewards/margins": 0.12412064522504807, "rewards/rejected": -0.37260594964027405, "step": 6416 }, { "epoch": 0.9923835298666152, "grad_norm": 12.407342910766602, "learning_rate": 3.717779814411731e-06, "logits/chosen": 10.985397338867188, "logits/rejected": 0.05024600028991699, "logps/chosen": -426.27069091796875, "logps/rejected": -237.6470947265625, "loss": 0.5892, "rewards/accuracies": 0.75, "rewards/chosen": 0.5563673377037048, "rewards/margins": 0.2939397990703583, "rewards/rejected": 0.26242750883102417, "step": 6417 }, { "epoch": 0.9925381790063793, "grad_norm": 5.3696160316467285, "learning_rate": 3.7174934127620576e-06, "logits/chosen": 10.513065338134766, "logits/rejected": 10.9337158203125, "logps/chosen": -373.86004638671875, "logps/rejected": -255.8009490966797, "loss": 0.5089, "rewards/accuracies": 0.875, "rewards/chosen": 0.532660186290741, "rewards/margins": 0.6481893062591553, "rewards/rejected": -0.11552904546260834, "step": 6418 }, { "epoch": 0.9926928281461435, "grad_norm": 8.462489128112793, "learning_rate": 3.717207011112384e-06, "logits/chosen": 5.9502129554748535, "logits/rejected": 4.107422351837158, "logps/chosen": -295.1842041015625, "logps/rejected": -249.59805297851562, "loss": 0.9288, "rewards/accuracies": 0.375, "rewards/chosen": -0.04321698099374771, "rewards/margins": -0.24786341190338135, "rewards/rejected": 0.20464642345905304, "step": 6419 }, { "epoch": 0.9928474772859076, "grad_norm": 5.031231880187988, "learning_rate": 3.716920609462711e-06, "logits/chosen": 11.654215812683105, "logits/rejected": 2.93538761138916, "logps/chosen": -221.47122192382812, "logps/rejected": -166.4352569580078, "loss": 0.7836, "rewards/accuracies": 0.375, "rewards/chosen": -0.14523737132549286, "rewards/margins": -0.05829368531703949, "rewards/rejected": -0.08694367110729218, "step": 6420 }, { "epoch": 0.9930021264256718, "grad_norm": 3.2796337604522705, "learning_rate": 3.7166342078130375e-06, "logits/chosen": 14.392969131469727, "logits/rejected": 10.050641059875488, "logps/chosen": -253.76919555664062, "logps/rejected": -219.97955322265625, "loss": 0.4514, "rewards/accuracies": 1.0, "rewards/chosen": 0.25053876638412476, "rewards/margins": 0.6669167280197144, "rewards/rejected": -0.416377991437912, "step": 6421 }, { "epoch": 0.9931567755654359, "grad_norm": 8.043817520141602, "learning_rate": 3.716347806163364e-06, "logits/chosen": 8.410589218139648, "logits/rejected": 6.314308166503906, "logps/chosen": -350.14617919921875, "logps/rejected": -325.44989013671875, "loss": 0.8105, "rewards/accuracies": 0.375, "rewards/chosen": 0.3739131689071655, "rewards/margins": -0.14903900027275085, "rewards/rejected": 0.5229521989822388, "step": 6422 }, { "epoch": 0.9933114247052001, "grad_norm": 9.700401306152344, "learning_rate": 3.7160614045136904e-06, "logits/chosen": 9.112405776977539, "logits/rejected": 13.738035202026367, "logps/chosen": -228.12466430664062, "logps/rejected": -249.49899291992188, "loss": 0.6306, "rewards/accuracies": 0.625, "rewards/chosen": 0.055996183305978775, "rewards/margins": 0.4456881880760193, "rewards/rejected": -0.3896920382976532, "step": 6423 }, { "epoch": 0.9934660738449642, "grad_norm": 5.776155948638916, "learning_rate": 3.7157750028640166e-06, "logits/chosen": 4.318787574768066, "logits/rejected": 5.837235450744629, "logps/chosen": -330.2237548828125, "logps/rejected": -355.15399169921875, "loss": 0.5104, "rewards/accuracies": 0.75, "rewards/chosen": 0.6251097917556763, "rewards/margins": 0.5988390445709229, "rewards/rejected": 0.026270776987075806, "step": 6424 }, { "epoch": 0.9936207229847284, "grad_norm": 5.2671918869018555, "learning_rate": 3.7154886012143433e-06, "logits/chosen": 9.176443099975586, "logits/rejected": 6.544194221496582, "logps/chosen": -290.18756103515625, "logps/rejected": -340.17877197265625, "loss": 0.7401, "rewards/accuracies": 0.5, "rewards/chosen": -0.05302000045776367, "rewards/margins": 0.08814285695552826, "rewards/rejected": -0.14116287231445312, "step": 6425 }, { "epoch": 0.9937753721244925, "grad_norm": 4.734760284423828, "learning_rate": 3.71520219956467e-06, "logits/chosen": 10.48302173614502, "logits/rejected": 5.032235622406006, "logps/chosen": -413.1797790527344, "logps/rejected": -419.94580078125, "loss": 0.4113, "rewards/accuracies": 0.875, "rewards/chosen": 0.3933395445346832, "rewards/margins": 0.9376549124717712, "rewards/rejected": -0.5443153381347656, "step": 6426 }, { "epoch": 0.9939300212642567, "grad_norm": 4.734888076782227, "learning_rate": 3.714915797914996e-06, "logits/chosen": 5.717303276062012, "logits/rejected": 7.294363021850586, "logps/chosen": -229.66470336914062, "logps/rejected": -305.2263488769531, "loss": 0.5005, "rewards/accuracies": 0.875, "rewards/chosen": 0.16094055771827698, "rewards/margins": 0.473614364862442, "rewards/rejected": -0.31267380714416504, "step": 6427 }, { "epoch": 0.9940846704040209, "grad_norm": 5.623244762420654, "learning_rate": 3.714629396265323e-06, "logits/chosen": 10.129432678222656, "logits/rejected": 9.351290702819824, "logps/chosen": -254.37786865234375, "logps/rejected": -250.22079467773438, "loss": 0.6417, "rewards/accuracies": 0.625, "rewards/chosen": 0.2449115812778473, "rewards/margins": 0.2890045642852783, "rewards/rejected": -0.04409298300743103, "step": 6428 }, { "epoch": 0.9942393195437851, "grad_norm": 4.211686611175537, "learning_rate": 3.7143429946156494e-06, "logits/chosen": 13.407751083374023, "logits/rejected": 5.79897403717041, "logps/chosen": -330.6243896484375, "logps/rejected": -232.6272735595703, "loss": 0.5101, "rewards/accuracies": 0.625, "rewards/chosen": 0.46750152111053467, "rewards/margins": 0.5535997152328491, "rewards/rejected": -0.08609818667173386, "step": 6429 }, { "epoch": 0.9943939686835492, "grad_norm": 4.994174480438232, "learning_rate": 3.7140565929659757e-06, "logits/chosen": 13.413518905639648, "logits/rejected": 3.5449743270874023, "logps/chosen": -363.74908447265625, "logps/rejected": -267.62506103515625, "loss": 0.5234, "rewards/accuracies": 0.75, "rewards/chosen": 0.6236370205879211, "rewards/margins": 0.6098806262016296, "rewards/rejected": 0.01375637948513031, "step": 6430 }, { "epoch": 0.9945486178233134, "grad_norm": 6.460704326629639, "learning_rate": 3.7137701913163023e-06, "logits/chosen": 5.026034355163574, "logits/rejected": 2.93806529045105, "logps/chosen": -335.23638916015625, "logps/rejected": -322.83624267578125, "loss": 0.4922, "rewards/accuracies": 0.625, "rewards/chosen": 0.8510804772377014, "rewards/margins": 0.7431129217147827, "rewards/rejected": 0.1079675704240799, "step": 6431 }, { "epoch": 0.9947032669630775, "grad_norm": 3.4955148696899414, "learning_rate": 3.7134837896666285e-06, "logits/chosen": 17.210689544677734, "logits/rejected": 10.415397644042969, "logps/chosen": -398.5893859863281, "logps/rejected": -245.32540893554688, "loss": 0.4282, "rewards/accuracies": 0.875, "rewards/chosen": 0.7730404138565063, "rewards/margins": 0.7427493333816528, "rewards/rejected": 0.030291080474853516, "step": 6432 }, { "epoch": 0.9948579161028417, "grad_norm": 4.404941558837891, "learning_rate": 3.713197388016955e-06, "logits/chosen": 6.819891452789307, "logits/rejected": 8.653105735778809, "logps/chosen": -166.37310791015625, "logps/rejected": -177.69534301757812, "loss": 0.6709, "rewards/accuracies": 0.625, "rewards/chosen": 0.11002130061388016, "rewards/margins": 0.10400070250034332, "rewards/rejected": 0.006020592525601387, "step": 6433 }, { "epoch": 0.9950125652426058, "grad_norm": 4.815541744232178, "learning_rate": 3.712910986367282e-06, "logits/chosen": 11.207900047302246, "logits/rejected": 11.828226089477539, "logps/chosen": -241.86944580078125, "logps/rejected": -332.5943908691406, "loss": 0.5174, "rewards/accuracies": 0.625, "rewards/chosen": 0.1279597282409668, "rewards/margins": 0.5327659845352173, "rewards/rejected": -0.4048061966896057, "step": 6434 }, { "epoch": 0.99516721438237, "grad_norm": 4.53037691116333, "learning_rate": 3.7126245847176085e-06, "logits/chosen": 14.962818145751953, "logits/rejected": 6.390637397766113, "logps/chosen": -401.25030517578125, "logps/rejected": -224.285888671875, "loss": 0.5032, "rewards/accuracies": 0.75, "rewards/chosen": 0.48805028200149536, "rewards/margins": 0.5336525440216064, "rewards/rejected": -0.0456022247672081, "step": 6435 }, { "epoch": 0.9953218635221341, "grad_norm": 3.8327882289886475, "learning_rate": 3.712338183067935e-06, "logits/chosen": 12.019838333129883, "logits/rejected": 8.912884712219238, "logps/chosen": -171.60142517089844, "logps/rejected": -163.62643432617188, "loss": 0.6241, "rewards/accuracies": 0.625, "rewards/chosen": 0.1824900507926941, "rewards/margins": 0.2052711546421051, "rewards/rejected": -0.022781088948249817, "step": 6436 }, { "epoch": 0.9954765126618983, "grad_norm": 5.008584499359131, "learning_rate": 3.712051781418261e-06, "logits/chosen": 7.2660417556762695, "logits/rejected": 9.870080947875977, "logps/chosen": -280.6383056640625, "logps/rejected": -295.5782165527344, "loss": 0.504, "rewards/accuracies": 0.75, "rewards/chosen": 0.40721285343170166, "rewards/margins": 0.5243091583251953, "rewards/rejected": -0.11709632724523544, "step": 6437 }, { "epoch": 0.9956311618016624, "grad_norm": 4.102077484130859, "learning_rate": 3.7117653797685876e-06, "logits/chosen": 12.455120086669922, "logits/rejected": 10.793878555297852, "logps/chosen": -253.3858184814453, "logps/rejected": -227.75735473632812, "loss": 0.5701, "rewards/accuracies": 0.75, "rewards/chosen": 0.19202375411987305, "rewards/margins": 0.38915860652923584, "rewards/rejected": -0.19713488221168518, "step": 6438 }, { "epoch": 0.9957858109414266, "grad_norm": 6.331699371337891, "learning_rate": 3.7114789781189143e-06, "logits/chosen": 10.446334838867188, "logits/rejected": 9.925003051757812, "logps/chosen": -340.41937255859375, "logps/rejected": -330.2668762207031, "loss": 0.7328, "rewards/accuracies": 0.625, "rewards/chosen": 0.3427719175815582, "rewards/margins": -0.032489921897649765, "rewards/rejected": 0.3752618134021759, "step": 6439 }, { "epoch": 0.9959404600811907, "grad_norm": 10.979613304138184, "learning_rate": 3.711192576469241e-06, "logits/chosen": 6.557989120483398, "logits/rejected": 13.282819747924805, "logps/chosen": -223.29698181152344, "logps/rejected": -329.47576904296875, "loss": 0.9979, "rewards/accuracies": 0.25, "rewards/chosen": 0.015458628535270691, "rewards/margins": -0.5006383061408997, "rewards/rejected": 0.5160968899726868, "step": 6440 }, { "epoch": 0.996095109220955, "grad_norm": 4.591195583343506, "learning_rate": 3.7109061748195676e-06, "logits/chosen": 9.61746883392334, "logits/rejected": 6.124847412109375, "logps/chosen": -323.44793701171875, "logps/rejected": -257.9283447265625, "loss": 0.6567, "rewards/accuracies": 0.5, "rewards/chosen": 0.32818177342414856, "rewards/margins": 0.32907921075820923, "rewards/rejected": -0.0008974462980404496, "step": 6441 }, { "epoch": 0.9962497583607192, "grad_norm": 5.878073692321777, "learning_rate": 3.7106197731698938e-06, "logits/chosen": 14.631660461425781, "logits/rejected": 7.4148945808410645, "logps/chosen": -274.1687316894531, "logps/rejected": -194.74220275878906, "loss": 0.6431, "rewards/accuracies": 0.75, "rewards/chosen": 0.37658941745758057, "rewards/margins": 0.3262397050857544, "rewards/rejected": 0.050349727272987366, "step": 6442 }, { "epoch": 0.9964044075004833, "grad_norm": 6.323184967041016, "learning_rate": 3.71033337152022e-06, "logits/chosen": 10.861124038696289, "logits/rejected": 8.038949012756348, "logps/chosen": -304.2377624511719, "logps/rejected": -333.73284912109375, "loss": 0.668, "rewards/accuracies": 0.5, "rewards/chosen": 0.40864118933677673, "rewards/margins": 0.26527321338653564, "rewards/rejected": 0.1433679461479187, "step": 6443 }, { "epoch": 0.9965590566402475, "grad_norm": 4.15866756439209, "learning_rate": 3.7100469698705467e-06, "logits/chosen": 8.23521614074707, "logits/rejected": 8.099372863769531, "logps/chosen": -296.7837829589844, "logps/rejected": -335.0313720703125, "loss": 0.4556, "rewards/accuracies": 0.75, "rewards/chosen": 0.5893731713294983, "rewards/margins": 0.7047889828681946, "rewards/rejected": -0.11541580408811569, "step": 6444 }, { "epoch": 0.9967137057800116, "grad_norm": 5.668601036071777, "learning_rate": 3.7097605682208733e-06, "logits/chosen": 11.344316482543945, "logits/rejected": 10.135232925415039, "logps/chosen": -314.1343994140625, "logps/rejected": -243.55874633789062, "loss": 0.8067, "rewards/accuracies": 0.5, "rewards/chosen": 0.1590518057346344, "rewards/margins": -0.08058004081249237, "rewards/rejected": 0.23963184654712677, "step": 6445 }, { "epoch": 0.9968683549197758, "grad_norm": 3.9897096157073975, "learning_rate": 3.7094741665711995e-06, "logits/chosen": 8.347829818725586, "logits/rejected": 2.55678391456604, "logps/chosen": -323.57257080078125, "logps/rejected": -188.37205505371094, "loss": 0.5162, "rewards/accuracies": 0.625, "rewards/chosen": 0.32635608315467834, "rewards/margins": 0.5472850799560547, "rewards/rejected": -0.22092899680137634, "step": 6446 }, { "epoch": 0.9970230040595399, "grad_norm": 5.015946388244629, "learning_rate": 3.709187764921526e-06, "logits/chosen": 12.791887283325195, "logits/rejected": 11.704107284545898, "logps/chosen": -274.552001953125, "logps/rejected": -276.5045166015625, "loss": 0.6409, "rewards/accuracies": 0.5, "rewards/chosen": 0.04246644675731659, "rewards/margins": 0.2217501848936081, "rewards/rejected": -0.17928370833396912, "step": 6447 }, { "epoch": 0.9971776531993041, "grad_norm": 4.478327751159668, "learning_rate": 3.708901363271853e-06, "logits/chosen": 9.696907997131348, "logits/rejected": 9.220049858093262, "logps/chosen": -258.6571350097656, "logps/rejected": -249.5697784423828, "loss": 0.5514, "rewards/accuracies": 0.75, "rewards/chosen": 0.526727557182312, "rewards/margins": 0.5402780175209045, "rewards/rejected": -0.01355036348104477, "step": 6448 }, { "epoch": 0.9973323023390682, "grad_norm": 4.411247730255127, "learning_rate": 3.7086149616221795e-06, "logits/chosen": 8.781852722167969, "logits/rejected": 1.7044378519058228, "logps/chosen": -294.9023742675781, "logps/rejected": -150.32318115234375, "loss": 0.5621, "rewards/accuracies": 0.5, "rewards/chosen": 0.2328437864780426, "rewards/margins": 0.5416602492332458, "rewards/rejected": -0.30881643295288086, "step": 6449 }, { "epoch": 0.9974869514788324, "grad_norm": 5.800382137298584, "learning_rate": 3.7083285599725057e-06, "logits/chosen": 6.159079074859619, "logits/rejected": 6.400328636169434, "logps/chosen": -259.21734619140625, "logps/rejected": -194.0346221923828, "loss": 0.6565, "rewards/accuracies": 0.625, "rewards/chosen": 0.392267107963562, "rewards/margins": 0.1629045605659485, "rewards/rejected": 0.22936254739761353, "step": 6450 }, { "epoch": 0.9976416006185965, "grad_norm": 5.962613582611084, "learning_rate": 3.708042158322832e-06, "logits/chosen": 7.412317276000977, "logits/rejected": 8.949893951416016, "logps/chosen": -263.3511047363281, "logps/rejected": -292.3740234375, "loss": 0.7336, "rewards/accuracies": 0.625, "rewards/chosen": 0.3386686444282532, "rewards/margins": 0.12247180938720703, "rewards/rejected": 0.21619683504104614, "step": 6451 }, { "epoch": 0.9977962497583607, "grad_norm": 4.726262092590332, "learning_rate": 3.7077557566731586e-06, "logits/chosen": 16.00439453125, "logits/rejected": 9.305023193359375, "logps/chosen": -332.0745849609375, "logps/rejected": -252.96035766601562, "loss": 0.5894, "rewards/accuracies": 0.625, "rewards/chosen": 0.39330828189849854, "rewards/margins": 0.32374680042266846, "rewards/rejected": 0.06956146657466888, "step": 6452 }, { "epoch": 0.9979508988981248, "grad_norm": 5.195104122161865, "learning_rate": 3.7074693550234852e-06, "logits/chosen": 6.627902984619141, "logits/rejected": 5.248289108276367, "logps/chosen": -255.76513671875, "logps/rejected": -230.42604064941406, "loss": 0.5402, "rewards/accuracies": 0.625, "rewards/chosen": 0.20039893686771393, "rewards/margins": 0.45336925983428955, "rewards/rejected": -0.2529703676700592, "step": 6453 }, { "epoch": 0.9981055480378891, "grad_norm": 6.634561061859131, "learning_rate": 3.707182953373812e-06, "logits/chosen": 10.792223930358887, "logits/rejected": 13.415390014648438, "logps/chosen": -272.1614990234375, "logps/rejected": -303.35064697265625, "loss": 0.714, "rewards/accuracies": 0.625, "rewards/chosen": 0.37170207500457764, "rewards/margins": 0.03073558583855629, "rewards/rejected": 0.34096652269363403, "step": 6454 }, { "epoch": 0.9982601971776532, "grad_norm": 7.098182201385498, "learning_rate": 3.7068965517241385e-06, "logits/chosen": 16.01780128479004, "logits/rejected": 5.407187461853027, "logps/chosen": -504.06689453125, "logps/rejected": -242.10687255859375, "loss": 0.5645, "rewards/accuracies": 0.625, "rewards/chosen": 0.5858308672904968, "rewards/margins": 0.41286909580230713, "rewards/rejected": 0.17296181619167328, "step": 6455 }, { "epoch": 0.9984148463174174, "grad_norm": 3.831732988357544, "learning_rate": 3.706610150074465e-06, "logits/chosen": 11.377111434936523, "logits/rejected": 6.26080322265625, "logps/chosen": -276.1357727050781, "logps/rejected": -206.07383728027344, "loss": 0.4207, "rewards/accuracies": 0.75, "rewards/chosen": 0.5700348615646362, "rewards/margins": 1.223379373550415, "rewards/rejected": -0.6533443331718445, "step": 6456 }, { "epoch": 0.9985694954571815, "grad_norm": 4.787290573120117, "learning_rate": 3.706323748424791e-06, "logits/chosen": 16.607322692871094, "logits/rejected": 7.273774147033691, "logps/chosen": -397.1927490234375, "logps/rejected": -220.79031372070312, "loss": 0.5305, "rewards/accuracies": 0.75, "rewards/chosen": 0.7793780565261841, "rewards/margins": 0.6378126740455627, "rewards/rejected": 0.14156541228294373, "step": 6457 }, { "epoch": 0.9987241445969457, "grad_norm": 5.1946282386779785, "learning_rate": 3.7060373467751176e-06, "logits/chosen": 11.702022552490234, "logits/rejected": 10.066984176635742, "logps/chosen": -258.2348937988281, "logps/rejected": -200.14053344726562, "loss": 0.6547, "rewards/accuracies": 0.5, "rewards/chosen": -0.04380466043949127, "rewards/margins": 0.16082774102687836, "rewards/rejected": -0.20463237166404724, "step": 6458 }, { "epoch": 0.9988787937367098, "grad_norm": 5.404623508453369, "learning_rate": 3.7057509451254443e-06, "logits/chosen": 13.3839693069458, "logits/rejected": 5.842410087585449, "logps/chosen": -310.8735046386719, "logps/rejected": -230.002685546875, "loss": 0.6934, "rewards/accuracies": 0.375, "rewards/chosen": -0.07032537460327148, "rewards/margins": 0.07508254051208496, "rewards/rejected": -0.14540790021419525, "step": 6459 }, { "epoch": 0.999033442876474, "grad_norm": 6.139119625091553, "learning_rate": 3.705464543475771e-06, "logits/chosen": 13.555074691772461, "logits/rejected": 5.720902919769287, "logps/chosen": -255.6298065185547, "logps/rejected": -132.44500732421875, "loss": 0.6642, "rewards/accuracies": 0.75, "rewards/chosen": 0.1491905152797699, "rewards/margins": 0.14715290069580078, "rewards/rejected": 0.002037620171904564, "step": 6460 }, { "epoch": 0.9991880920162381, "grad_norm": 8.389126777648926, "learning_rate": 3.705178141826097e-06, "logits/chosen": 11.02554702758789, "logits/rejected": 14.100635528564453, "logps/chosen": -392.8874816894531, "logps/rejected": -400.7939453125, "loss": 0.7983, "rewards/accuracies": 0.25, "rewards/chosen": 0.5057023763656616, "rewards/margins": -0.11814691126346588, "rewards/rejected": 0.6238492727279663, "step": 6461 }, { "epoch": 0.9993427411560023, "grad_norm": 6.983454704284668, "learning_rate": 3.704891740176424e-06, "logits/chosen": 14.922819137573242, "logits/rejected": 10.502524375915527, "logps/chosen": -318.099365234375, "logps/rejected": -246.74215698242188, "loss": 0.8399, "rewards/accuracies": 0.25, "rewards/chosen": 0.03999558091163635, "rewards/margins": -0.18530333042144775, "rewards/rejected": 0.2252988964319229, "step": 6462 }, { "epoch": 0.9994973902957665, "grad_norm": 7.0249810218811035, "learning_rate": 3.70460533852675e-06, "logits/chosen": 6.691718578338623, "logits/rejected": 8.100693702697754, "logps/chosen": -226.1967010498047, "logps/rejected": -215.714599609375, "loss": 0.9998, "rewards/accuracies": 0.25, "rewards/chosen": -0.4663422405719757, "rewards/margins": -0.38877493143081665, "rewards/rejected": -0.07756730169057846, "step": 6463 }, { "epoch": 0.9996520394355306, "grad_norm": 3.797253131866455, "learning_rate": 3.7043189368770767e-06, "logits/chosen": 7.48823356628418, "logits/rejected": 7.208713531494141, "logps/chosen": -194.60357666015625, "logps/rejected": -145.7886962890625, "loss": 0.7152, "rewards/accuracies": 0.5, "rewards/chosen": 0.04407081753015518, "rewards/margins": 0.002133004367351532, "rewards/rejected": 0.04193783923983574, "step": 6464 }, { "epoch": 0.9998066885752948, "grad_norm": 4.286912441253662, "learning_rate": 3.704032535227403e-06, "logits/chosen": 8.498443603515625, "logits/rejected": 3.455369710922241, "logps/chosen": -286.6949462890625, "logps/rejected": -241.1104736328125, "loss": 0.507, "rewards/accuracies": 0.75, "rewards/chosen": 0.5331889986991882, "rewards/margins": 0.5734596252441406, "rewards/rejected": -0.04027062654495239, "step": 6465 }, { "epoch": 0.9999613377150589, "grad_norm": 5.675995826721191, "learning_rate": 3.7037461335777296e-06, "logits/chosen": -1.0441932678222656, "logits/rejected": 8.65644359588623, "logps/chosen": -132.24937438964844, "logps/rejected": -179.43557739257812, "loss": 0.7638, "rewards/accuracies": 0.5, "rewards/chosen": -0.33726200461387634, "rewards/margins": 0.05548731982707977, "rewards/rejected": -0.3927493393421173, "step": 6466 }, { "epoch": 1.000115986854823, "grad_norm": 4.397193431854248, "learning_rate": 3.7034597319280562e-06, "logits/chosen": 9.970890045166016, "logits/rejected": 11.026260375976562, "logps/chosen": -324.40606689453125, "logps/rejected": -262.9850158691406, "loss": 0.4699, "rewards/accuracies": 0.75, "rewards/chosen": 0.6350075602531433, "rewards/margins": 0.6038157343864441, "rewards/rejected": 0.031191788613796234, "step": 6467 }, { "epoch": 1.0002706359945872, "grad_norm": 6.071857929229736, "learning_rate": 3.703173330278383e-06, "logits/chosen": 8.23035717010498, "logits/rejected": 14.393836975097656, "logps/chosen": -337.9178466796875, "logps/rejected": -361.6004638671875, "loss": 0.8434, "rewards/accuracies": 0.5, "rewards/chosen": 0.37275755405426025, "rewards/margins": 0.04437033832073212, "rewards/rejected": 0.32838720083236694, "step": 6468 }, { "epoch": 1.0004252851343514, "grad_norm": 4.508437633514404, "learning_rate": 3.7028869286287095e-06, "logits/chosen": 17.907562255859375, "logits/rejected": 15.179469108581543, "logps/chosen": -322.15289306640625, "logps/rejected": -252.47393798828125, "loss": 0.5076, "rewards/accuracies": 0.625, "rewards/chosen": 0.22953186929225922, "rewards/margins": 0.56075519323349, "rewards/rejected": -0.33122333884239197, "step": 6469 }, { "epoch": 1.0005799342741155, "grad_norm": 4.127502918243408, "learning_rate": 3.7026005269790353e-06, "logits/chosen": 13.710186004638672, "logits/rejected": 13.976655960083008, "logps/chosen": -283.73956298828125, "logps/rejected": -212.97613525390625, "loss": 0.5532, "rewards/accuracies": 0.875, "rewards/chosen": 0.4209112226963043, "rewards/margins": 0.35665363073349, "rewards/rejected": 0.06425757706165314, "step": 6470 }, { "epoch": 1.0007345834138797, "grad_norm": 4.5542893409729, "learning_rate": 3.702314125329362e-06, "logits/chosen": 4.959611892700195, "logits/rejected": 0.7296081781387329, "logps/chosen": -257.67987060546875, "logps/rejected": -291.7732238769531, "loss": 0.5917, "rewards/accuracies": 0.5, "rewards/chosen": 0.06830978393554688, "rewards/margins": 0.38829559087753296, "rewards/rejected": -0.3199857771396637, "step": 6471 }, { "epoch": 1.0008892325536438, "grad_norm": 4.960212707519531, "learning_rate": 3.7020277236796886e-06, "logits/chosen": 8.817639350891113, "logits/rejected": 11.630827903747559, "logps/chosen": -372.274169921875, "logps/rejected": -452.58489990234375, "loss": 0.5445, "rewards/accuracies": 0.75, "rewards/chosen": 0.4633787274360657, "rewards/margins": 0.37530338764190674, "rewards/rejected": 0.08807535469532013, "step": 6472 }, { "epoch": 1.0010438816934082, "grad_norm": 4.946148872375488, "learning_rate": 3.7017413220300153e-06, "logits/chosen": 10.475448608398438, "logits/rejected": 10.050158500671387, "logps/chosen": -321.27252197265625, "logps/rejected": -258.4363708496094, "loss": 0.5804, "rewards/accuracies": 0.5, "rewards/chosen": 0.5828481912612915, "rewards/margins": 0.3572867512702942, "rewards/rejected": 0.22556143999099731, "step": 6473 }, { "epoch": 1.0011985308331723, "grad_norm": 6.328547954559326, "learning_rate": 3.701454920380342e-06, "logits/chosen": 11.512105941772461, "logits/rejected": 11.806161880493164, "logps/chosen": -430.54644775390625, "logps/rejected": -394.33184814453125, "loss": 0.6867, "rewards/accuracies": 0.25, "rewards/chosen": 0.549159824848175, "rewards/margins": 0.20954646170139313, "rewards/rejected": 0.3396133482456207, "step": 6474 }, { "epoch": 1.0013531799729365, "grad_norm": 4.63679313659668, "learning_rate": 3.7011685187306686e-06, "logits/chosen": 13.474045753479004, "logits/rejected": 7.575530529022217, "logps/chosen": -397.6506042480469, "logps/rejected": -262.03228759765625, "loss": 0.5233, "rewards/accuracies": 0.75, "rewards/chosen": 0.7780610918998718, "rewards/margins": 0.461338073015213, "rewards/rejected": 0.3167230486869812, "step": 6475 }, { "epoch": 1.0015078291127006, "grad_norm": 4.678894519805908, "learning_rate": 3.7008821170809944e-06, "logits/chosen": 12.240632057189941, "logits/rejected": 10.878813743591309, "logps/chosen": -238.29595947265625, "logps/rejected": -245.44129943847656, "loss": 0.5433, "rewards/accuracies": 0.75, "rewards/chosen": 0.2810564935207367, "rewards/margins": 0.4207093119621277, "rewards/rejected": -0.139652818441391, "step": 6476 }, { "epoch": 1.0016624782524648, "grad_norm": 6.588015556335449, "learning_rate": 3.700595715431321e-06, "logits/chosen": 13.789307594299316, "logits/rejected": 11.070528030395508, "logps/chosen": -438.00311279296875, "logps/rejected": -324.98626708984375, "loss": 0.8311, "rewards/accuracies": 0.25, "rewards/chosen": 0.2357904613018036, "rewards/margins": -0.24124449491500854, "rewards/rejected": 0.47703495621681213, "step": 6477 }, { "epoch": 1.001817127392229, "grad_norm": 6.812483310699463, "learning_rate": 3.7003093137816477e-06, "logits/chosen": 7.506626605987549, "logits/rejected": 12.467564582824707, "logps/chosen": -218.64212036132812, "logps/rejected": -279.65966796875, "loss": 0.6789, "rewards/accuracies": 0.75, "rewards/chosen": 0.14739733934402466, "rewards/margins": 0.07972268760204315, "rewards/rejected": 0.06767462939023972, "step": 6478 }, { "epoch": 1.001971776531993, "grad_norm": 5.825155735015869, "learning_rate": 3.7000229121319743e-06, "logits/chosen": 6.795217037200928, "logits/rejected": 6.0769453048706055, "logps/chosen": -345.43841552734375, "logps/rejected": -275.205322265625, "loss": 0.6548, "rewards/accuracies": 0.625, "rewards/chosen": 0.15224283933639526, "rewards/margins": 0.16189280152320862, "rewards/rejected": -0.00964994728565216, "step": 6479 }, { "epoch": 1.0021264256717572, "grad_norm": 7.262514114379883, "learning_rate": 3.6997365104823006e-06, "logits/chosen": 8.445148468017578, "logits/rejected": 8.914729118347168, "logps/chosen": -329.12689208984375, "logps/rejected": -333.1185302734375, "loss": 0.8121, "rewards/accuracies": 0.5, "rewards/chosen": 0.598840594291687, "rewards/margins": 0.02515360713005066, "rewards/rejected": 0.5736870169639587, "step": 6480 }, { "epoch": 1.0022810748115214, "grad_norm": 5.612082481384277, "learning_rate": 3.6994501088326272e-06, "logits/chosen": 7.604832172393799, "logits/rejected": 9.593935012817383, "logps/chosen": -380.4143981933594, "logps/rejected": -401.48046875, "loss": 0.58, "rewards/accuracies": 0.5, "rewards/chosen": 0.8835973739624023, "rewards/margins": 0.3783763647079468, "rewards/rejected": 0.505220890045166, "step": 6481 }, { "epoch": 1.0024357239512856, "grad_norm": 4.28853178024292, "learning_rate": 3.699163707182954e-06, "logits/chosen": 14.142362594604492, "logits/rejected": 6.589823246002197, "logps/chosen": -319.54144287109375, "logps/rejected": -227.5568084716797, "loss": 0.5133, "rewards/accuracies": 0.75, "rewards/chosen": 0.35826224088668823, "rewards/margins": 0.6096003651618958, "rewards/rejected": -0.25133809447288513, "step": 6482 }, { "epoch": 1.0025903730910497, "grad_norm": 3.6368038654327393, "learning_rate": 3.69887730553328e-06, "logits/chosen": 5.254798889160156, "logits/rejected": 9.603643417358398, "logps/chosen": -226.14010620117188, "logps/rejected": -282.77825927734375, "loss": 0.478, "rewards/accuracies": 0.875, "rewards/chosen": 0.31996646523475647, "rewards/margins": 0.5870798826217651, "rewards/rejected": -0.2671133875846863, "step": 6483 }, { "epoch": 1.0027450222308139, "grad_norm": 5.984035968780518, "learning_rate": 3.6985909038836063e-06, "logits/chosen": 8.93764591217041, "logits/rejected": 5.029354095458984, "logps/chosen": -266.23577880859375, "logps/rejected": -251.21987915039062, "loss": 0.6095, "rewards/accuracies": 0.75, "rewards/chosen": 0.26544857025146484, "rewards/margins": 0.28799566626548767, "rewards/rejected": -0.02254711091518402, "step": 6484 }, { "epoch": 1.002899671370578, "grad_norm": 4.384690284729004, "learning_rate": 3.698304502233933e-06, "logits/chosen": 10.72787094116211, "logits/rejected": 14.027435302734375, "logps/chosen": -176.72085571289062, "logps/rejected": -270.55035400390625, "loss": 0.5231, "rewards/accuracies": 0.875, "rewards/chosen": -0.0743076354265213, "rewards/margins": 0.468065470457077, "rewards/rejected": -0.5423730611801147, "step": 6485 }, { "epoch": 1.0030543205103422, "grad_norm": 3.422541379928589, "learning_rate": 3.6980181005842596e-06, "logits/chosen": 11.922115325927734, "logits/rejected": 8.016584396362305, "logps/chosen": -183.350830078125, "logps/rejected": -150.6525115966797, "loss": 0.4901, "rewards/accuracies": 0.625, "rewards/chosen": 0.3783113956451416, "rewards/margins": 0.588878870010376, "rewards/rejected": -0.21056750416755676, "step": 6486 }, { "epoch": 1.0032089696501063, "grad_norm": 4.68432092666626, "learning_rate": 3.6977316989345863e-06, "logits/chosen": 10.02033805847168, "logits/rejected": 5.156567573547363, "logps/chosen": -203.0462188720703, "logps/rejected": -165.84323120117188, "loss": 0.5739, "rewards/accuracies": 0.75, "rewards/chosen": 0.10937246680259705, "rewards/margins": 0.40428417921066284, "rewards/rejected": -0.2949116826057434, "step": 6487 }, { "epoch": 1.0033636187898705, "grad_norm": 5.173445701599121, "learning_rate": 3.697445297284913e-06, "logits/chosen": 11.687442779541016, "logits/rejected": 5.338320732116699, "logps/chosen": -287.1285400390625, "logps/rejected": -190.30963134765625, "loss": 0.6449, "rewards/accuracies": 0.5, "rewards/chosen": 0.3509214520454407, "rewards/margins": 0.16770482063293457, "rewards/rejected": 0.1832166314125061, "step": 6488 }, { "epoch": 1.0035182679296346, "grad_norm": 5.891007900238037, "learning_rate": 3.6971588956352387e-06, "logits/chosen": 6.7640204429626465, "logits/rejected": 3.843388080596924, "logps/chosen": -215.14576721191406, "logps/rejected": -199.46826171875, "loss": 0.6882, "rewards/accuracies": 0.75, "rewards/chosen": 0.33893540501594543, "rewards/margins": 0.14164628088474274, "rewards/rejected": 0.1972891390323639, "step": 6489 }, { "epoch": 1.0036729170693988, "grad_norm": 5.027500629425049, "learning_rate": 3.6968724939855654e-06, "logits/chosen": 10.829145431518555, "logits/rejected": 11.652154922485352, "logps/chosen": -218.57725524902344, "logps/rejected": -252.7308349609375, "loss": 0.6982, "rewards/accuracies": 0.5, "rewards/chosen": 0.1749027669429779, "rewards/margins": 0.012367707677185535, "rewards/rejected": 0.16253504157066345, "step": 6490 }, { "epoch": 1.003827566209163, "grad_norm": 7.334293365478516, "learning_rate": 3.696586092335892e-06, "logits/chosen": 12.625438690185547, "logits/rejected": 7.769203186035156, "logps/chosen": -351.1131591796875, "logps/rejected": -291.02716064453125, "loss": 0.8311, "rewards/accuracies": 0.5, "rewards/chosen": -0.007216166704893112, "rewards/margins": -0.17327935993671417, "rewards/rejected": 0.16606321930885315, "step": 6491 }, { "epoch": 1.003982215348927, "grad_norm": 4.9731974601745605, "learning_rate": 3.6962996906862187e-06, "logits/chosen": 7.9905500411987305, "logits/rejected": 8.718603134155273, "logps/chosen": -169.21807861328125, "logps/rejected": -189.5457763671875, "loss": 0.5797, "rewards/accuracies": 0.5, "rewards/chosen": 0.3706709146499634, "rewards/margins": 0.425445556640625, "rewards/rejected": -0.05477464199066162, "step": 6492 }, { "epoch": 1.0041368644886912, "grad_norm": 6.196478366851807, "learning_rate": 3.6960132890365453e-06, "logits/chosen": 10.227004051208496, "logits/rejected": 3.1333799362182617, "logps/chosen": -393.52923583984375, "logps/rejected": -251.47586059570312, "loss": 0.6181, "rewards/accuracies": 0.625, "rewards/chosen": 0.34924450516700745, "rewards/margins": 0.3547116219997406, "rewards/rejected": -0.005467124283313751, "step": 6493 }, { "epoch": 1.0042915136284554, "grad_norm": 4.1911773681640625, "learning_rate": 3.695726887386872e-06, "logits/chosen": 11.043564796447754, "logits/rejected": 9.156267166137695, "logps/chosen": -256.17724609375, "logps/rejected": -248.79600524902344, "loss": 0.623, "rewards/accuracies": 0.875, "rewards/chosen": 0.3548438549041748, "rewards/margins": 0.2840927541255951, "rewards/rejected": 0.0707511305809021, "step": 6494 }, { "epoch": 1.0044461627682195, "grad_norm": 3.3736093044281006, "learning_rate": 3.695440485737198e-06, "logits/chosen": 15.362344741821289, "logits/rejected": 13.852359771728516, "logps/chosen": -192.2646942138672, "logps/rejected": -168.83152770996094, "loss": 0.5499, "rewards/accuracies": 0.75, "rewards/chosen": 0.18855062127113342, "rewards/margins": 0.3880029320716858, "rewards/rejected": -0.19945232570171356, "step": 6495 }, { "epoch": 1.0046008119079837, "grad_norm": 6.9795241355896, "learning_rate": 3.6951540840875244e-06, "logits/chosen": 3.5728375911712646, "logits/rejected": 7.5879950523376465, "logps/chosen": -208.23135375976562, "logps/rejected": -328.0044250488281, "loss": 1.0147, "rewards/accuracies": 0.5, "rewards/chosen": 0.3019470274448395, "rewards/margins": -0.3014880120754242, "rewards/rejected": 0.6034350395202637, "step": 6496 }, { "epoch": 1.0047554610477478, "grad_norm": 3.7983055114746094, "learning_rate": 3.694867682437851e-06, "logits/chosen": 10.055536270141602, "logits/rejected": 11.805242538452148, "logps/chosen": -255.73080444335938, "logps/rejected": -238.3896942138672, "loss": 0.5697, "rewards/accuracies": 0.625, "rewards/chosen": 0.2134089171886444, "rewards/margins": 0.39890730381011963, "rewards/rejected": -0.18549838662147522, "step": 6497 }, { "epoch": 1.004910110187512, "grad_norm": 4.407406330108643, "learning_rate": 3.6945812807881777e-06, "logits/chosen": 14.96390438079834, "logits/rejected": 10.439229965209961, "logps/chosen": -223.6092529296875, "logps/rejected": -179.12969970703125, "loss": 0.6297, "rewards/accuracies": 0.625, "rewards/chosen": -0.13989132642745972, "rewards/margins": 0.32183021306991577, "rewards/rejected": -0.4617215692996979, "step": 6498 }, { "epoch": 1.0050647593272763, "grad_norm": 4.386175155639648, "learning_rate": 3.694294879138504e-06, "logits/chosen": 5.19125509262085, "logits/rejected": 7.769558429718018, "logps/chosen": -168.78756713867188, "logps/rejected": -208.15203857421875, "loss": 0.716, "rewards/accuracies": 0.5, "rewards/chosen": -0.22360999882221222, "rewards/margins": 0.043396957218647, "rewards/rejected": -0.2670069634914398, "step": 6499 }, { "epoch": 1.0052194084670405, "grad_norm": 3.7246615886688232, "learning_rate": 3.6940084774888306e-06, "logits/chosen": 12.95360279083252, "logits/rejected": 11.05101203918457, "logps/chosen": -186.3341064453125, "logps/rejected": -143.59999084472656, "loss": 0.5254, "rewards/accuracies": 0.875, "rewards/chosen": 0.4003264605998993, "rewards/margins": 0.4193662703037262, "rewards/rejected": -0.0190398208796978, "step": 6500 } ], "logging_steps": 1, "max_steps": 19398, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 2, "trial_name": null, "trial_params": null }