diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,97533 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 1.0052194084670405, + "eval_steps": 500, + "global_step": 6500, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.00015464913976416007, + "grad_norm": 6.911925792694092, + "learning_rate": 2.577319587628866e-09, + "logits/chosen": 7.725544452667236, + "logits/rejected": 7.458861827850342, + "logps/chosen": -328.6910400390625, + "logps/rejected": -265.0786437988281, + "loss": 0.7306, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.0007827766239643097, + "rewards/margins": -0.06521320343017578, + "rewards/rejected": 0.06599598377943039, + "step": 1 + }, + { + "epoch": 0.00030929827952832015, + "grad_norm": 6.559286594390869, + "learning_rate": 5.154639175257732e-09, + "logits/chosen": 10.706411361694336, + "logits/rejected": 9.259780883789062, + "logps/chosen": -513.0947265625, + "logps/rejected": -438.3065185546875, + "loss": 0.6917, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.006189629435539246, + "rewards/margins": 0.016045667231082916, + "rewards/rejected": -0.009856035001575947, + "step": 2 + }, + { + "epoch": 0.00046394741929248017, + "grad_norm": 4.6717209815979, + "learning_rate": 7.731958762886597e-09, + "logits/chosen": 5.7964887619018555, + "logits/rejected": 7.366323471069336, + "logps/chosen": -346.95306396484375, + "logps/rejected": -268.921630859375, + "loss": 0.696, + "rewards/accuracies": 0.25, + "rewards/chosen": 0.007051087915897369, + "rewards/margins": 0.0025309547781944275, + "rewards/rejected": 0.004520131275057793, + "step": 3 + }, + { + "epoch": 0.0006185965590566403, + "grad_norm": 3.4333865642547607, + "learning_rate": 1.0309278350515464e-08, + "logits/chosen": 14.677587509155273, + "logits/rejected": 10.515243530273438, + "logps/chosen": -245.28768920898438, + "logps/rejected": -242.6073455810547, + "loss": 0.6476, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.03533508628606796, + "rewards/margins": 0.09676864743232727, + "rewards/rejected": -0.06143355742096901, + "step": 4 + }, + { + "epoch": 0.0007732456988208003, + "grad_norm": 5.887388706207275, + "learning_rate": 1.2886597938144331e-08, + "logits/chosen": 8.125778198242188, + "logits/rejected": 7.515656471252441, + "logps/chosen": -264.28338623046875, + "logps/rejected": -239.83038330078125, + "loss": 0.7056, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.01714172586798668, + "rewards/margins": -0.02102193795144558, + "rewards/rejected": 0.03816366195678711, + "step": 5 + }, + { + "epoch": 0.0009278948385849603, + "grad_norm": 6.934746742248535, + "learning_rate": 1.5463917525773195e-08, + "logits/chosen": 8.793354988098145, + "logits/rejected": 12.097320556640625, + "logps/chosen": -251.8073272705078, + "logps/rejected": -353.7068176269531, + "loss": 0.699, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.04178829491138458, + "rewards/margins": -0.008631706237792969, + "rewards/rejected": -0.03315658122301102, + "step": 6 + }, + { + "epoch": 0.0010825439783491205, + "grad_norm": 5.932006359100342, + "learning_rate": 1.8041237113402063e-08, + "logits/chosen": 13.62399673461914, + "logits/rejected": 4.940817832946777, + "logps/chosen": -301.8253479003906, + "logps/rejected": -327.65887451171875, + "loss": 0.7116, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.031215764582157135, + "rewards/margins": -0.029828118160367012, + "rewards/rejected": -0.0013876445591449738, + "step": 7 + }, + { + "epoch": 0.0012371931181132806, + "grad_norm": 5.266278266906738, + "learning_rate": 2.061855670103093e-08, + "logits/chosen": 5.989433765411377, + "logits/rejected": 6.766448020935059, + "logps/chosen": -217.256591796875, + "logps/rejected": -272.15509033203125, + "loss": 0.7083, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.0018736126367002726, + "rewards/margins": -0.028320670127868652, + "rewards/rejected": 0.03019428253173828, + "step": 8 + }, + { + "epoch": 0.0013918422578774407, + "grad_norm": 4.636612892150879, + "learning_rate": 2.3195876288659797e-08, + "logits/chosen": 7.120231628417969, + "logits/rejected": 7.554953575134277, + "logps/chosen": -209.84974670410156, + "logps/rejected": -197.9786834716797, + "loss": 0.7263, + "rewards/accuracies": 0.25, + "rewards/chosen": -0.016598273068666458, + "rewards/margins": -0.06356807053089142, + "rewards/rejected": 0.04696979373693466, + "step": 9 + }, + { + "epoch": 0.0015464913976416005, + "grad_norm": 4.485163688659668, + "learning_rate": 2.5773195876288662e-08, + "logits/chosen": 6.194683074951172, + "logits/rejected": 4.8854875564575195, + "logps/chosen": -246.56932067871094, + "logps/rejected": -217.247314453125, + "loss": 0.706, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.023969482630491257, + "rewards/margins": -0.020935650914907455, + "rewards/rejected": -0.003033827990293503, + "step": 10 + }, + { + "epoch": 0.0017011405374057606, + "grad_norm": 7.249931812286377, + "learning_rate": 2.8350515463917528e-08, + "logits/chosen": 15.518509864807129, + "logits/rejected": 11.09177017211914, + "logps/chosen": -274.49652099609375, + "logps/rejected": -226.093994140625, + "loss": 0.718, + "rewards/accuracies": 0.25, + "rewards/chosen": -0.005039501935243607, + "rewards/margins": -0.04767618328332901, + "rewards/rejected": 0.0426366813480854, + "step": 11 + }, + { + "epoch": 0.0018557896771699207, + "grad_norm": 4.163851737976074, + "learning_rate": 3.092783505154639e-08, + "logits/chosen": 12.197102546691895, + "logits/rejected": 8.043967247009277, + "logps/chosen": -261.67230224609375, + "logps/rejected": -195.1876220703125, + "loss": 0.683, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.008725881576538086, + "rewards/margins": 0.022560596466064453, + "rewards/rejected": -0.013834714889526367, + "step": 12 + }, + { + "epoch": 0.002010438816934081, + "grad_norm": 5.067342758178711, + "learning_rate": 3.350515463917526e-08, + "logits/chosen": 17.780834197998047, + "logits/rejected": 17.275726318359375, + "logps/chosen": -296.10980224609375, + "logps/rejected": -271.4646301269531, + "loss": 0.6786, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.03504469618201256, + "rewards/margins": 0.03791084885597229, + "rewards/rejected": -0.002866152673959732, + "step": 13 + }, + { + "epoch": 0.002165087956698241, + "grad_norm": 5.199130535125732, + "learning_rate": 3.608247422680413e-08, + "logits/chosen": 6.776096343994141, + "logits/rejected": 14.16385555267334, + "logps/chosen": -232.74832153320312, + "logps/rejected": -261.77581787109375, + "loss": 0.7054, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.06902992725372314, + "rewards/margins": -0.022028755396604538, + "rewards/rejected": -0.047001175582408905, + "step": 14 + }, + { + "epoch": 0.002319737096462401, + "grad_norm": 4.540201663970947, + "learning_rate": 3.865979381443299e-08, + "logits/chosen": 10.701871871948242, + "logits/rejected": 13.761884689331055, + "logps/chosen": -287.3067626953125, + "logps/rejected": -248.40289306640625, + "loss": 0.7116, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.0004139924421906471, + "rewards/margins": -0.03300056606531143, + "rewards/rejected": 0.03258657455444336, + "step": 15 + }, + { + "epoch": 0.002474386236226561, + "grad_norm": 4.610867977142334, + "learning_rate": 4.123711340206186e-08, + "logits/chosen": 12.358034133911133, + "logits/rejected": -0.12671267986297607, + "logps/chosen": -275.07647705078125, + "logps/rejected": -172.946533203125, + "loss": 0.7131, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.008443592116236687, + "rewards/margins": -0.03740517795085907, + "rewards/rejected": 0.028961585834622383, + "step": 16 + }, + { + "epoch": 0.0026290353759907212, + "grad_norm": 3.652007579803467, + "learning_rate": 4.381443298969072e-08, + "logits/chosen": 16.298797607421875, + "logits/rejected": 7.837356090545654, + "logps/chosen": -196.04367065429688, + "logps/rejected": -154.35011291503906, + "loss": 0.6704, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.012971021234989166, + "rewards/margins": 0.049596380442380905, + "rewards/rejected": -0.03662535920739174, + "step": 17 + }, + { + "epoch": 0.0027836845157548813, + "grad_norm": 5.670866012573242, + "learning_rate": 4.6391752577319594e-08, + "logits/chosen": 16.421899795532227, + "logits/rejected": 9.962812423706055, + "logps/chosen": -472.7196044921875, + "logps/rejected": -284.9296875, + "loss": 0.6217, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.12304907292127609, + "rewards/margins": 0.15355949103832245, + "rewards/rejected": -0.030510425567626953, + "step": 18 + }, + { + "epoch": 0.002938333655519041, + "grad_norm": 6.218260288238525, + "learning_rate": 4.896907216494846e-08, + "logits/chosen": 10.792370796203613, + "logits/rejected": 3.2917964458465576, + "logps/chosen": -410.13140869140625, + "logps/rejected": -323.98284912109375, + "loss": 0.6851, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.0450771301984787, + "rewards/margins": 0.019390054047107697, + "rewards/rejected": 0.025687076151371002, + "step": 19 + }, + { + "epoch": 0.003092982795283201, + "grad_norm": 5.937564373016357, + "learning_rate": 5.1546391752577325e-08, + "logits/chosen": 2.615875244140625, + "logits/rejected": 6.465677261352539, + "logps/chosen": -194.7277374267578, + "logps/rejected": -249.28128051757812, + "loss": 0.6757, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.02864077314734459, + "rewards/margins": 0.03823430836200714, + "rewards/rejected": -0.009593534283339977, + "step": 20 + }, + { + "epoch": 0.003247631935047361, + "grad_norm": 4.729965686798096, + "learning_rate": 5.412371134020619e-08, + "logits/chosen": 9.202268600463867, + "logits/rejected": 11.467805862426758, + "logps/chosen": -208.8218994140625, + "logps/rejected": -304.7249755859375, + "loss": 0.7092, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.007246019318699837, + "rewards/margins": -0.0298062302172184, + "rewards/rejected": 0.037052251398563385, + "step": 21 + }, + { + "epoch": 0.003402281074811521, + "grad_norm": 4.760029315948486, + "learning_rate": 5.6701030927835055e-08, + "logits/chosen": 9.577291488647461, + "logits/rejected": 12.896297454833984, + "logps/chosen": -231.807373046875, + "logps/rejected": -322.9843444824219, + "loss": 0.6655, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.048436544835567474, + "rewards/margins": 0.058693885803222656, + "rewards/rejected": -0.010257341898977757, + "step": 22 + }, + { + "epoch": 0.0035569302145756813, + "grad_norm": 5.684564113616943, + "learning_rate": 5.927835051546392e-08, + "logits/chosen": 11.468250274658203, + "logits/rejected": 2.6983911991119385, + "logps/chosen": -363.1484375, + "logps/rejected": -283.53765869140625, + "loss": 0.7331, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.030769065022468567, + "rewards/margins": -0.06990260630846024, + "rewards/rejected": 0.039133548736572266, + "step": 23 + }, + { + "epoch": 0.0037115793543398413, + "grad_norm": 7.322030067443848, + "learning_rate": 6.185567010309278e-08, + "logits/chosen": 8.942168235778809, + "logits/rejected": 10.792451858520508, + "logps/chosen": -243.0218963623047, + "logps/rejected": -256.6020812988281, + "loss": 0.7369, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.10762663185596466, + "rewards/margins": -0.07802677154541016, + "rewards/rejected": -0.029599856585264206, + "step": 24 + }, + { + "epoch": 0.0038662284941040014, + "grad_norm": 6.209859371185303, + "learning_rate": 6.443298969072165e-08, + "logits/chosen": 12.64309024810791, + "logits/rejected": 3.72812557220459, + "logps/chosen": -305.7120361328125, + "logps/rejected": -228.66659545898438, + "loss": 0.7297, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.02528085745871067, + "rewards/margins": -0.06709671020507812, + "rewards/rejected": 0.041815854609012604, + "step": 25 + }, + { + "epoch": 0.004020877633868162, + "grad_norm": 4.138147830963135, + "learning_rate": 6.701030927835052e-08, + "logits/chosen": 8.450196266174316, + "logits/rejected": 8.557428359985352, + "logps/chosen": -277.9859313964844, + "logps/rejected": -268.0521545410156, + "loss": 0.7216, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.004189919680356979, + "rewards/margins": -0.05424795299768448, + "rewards/rejected": 0.0500580370426178, + "step": 26 + }, + { + "epoch": 0.004175526773632322, + "grad_norm": 5.362881183624268, + "learning_rate": 6.95876288659794e-08, + "logits/chosen": 8.368182182312012, + "logits/rejected": 7.4664459228515625, + "logps/chosen": -285.5622253417969, + "logps/rejected": -252.48875427246094, + "loss": 0.7122, + "rewards/accuracies": 0.25, + "rewards/chosen": -0.0396575927734375, + "rewards/margins": -0.03567342832684517, + "rewards/rejected": -0.0039841653779149055, + "step": 27 + }, + { + "epoch": 0.004330175913396482, + "grad_norm": 5.1329827308654785, + "learning_rate": 7.216494845360825e-08, + "logits/chosen": 5.287141799926758, + "logits/rejected": 9.286478042602539, + "logps/chosen": -184.01339721679688, + "logps/rejected": -349.4659118652344, + "loss": 0.7461, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.045447640120983124, + "rewards/margins": -0.09351148456335068, + "rewards/rejected": 0.04806385189294815, + "step": 28 + }, + { + "epoch": 0.004484825053160642, + "grad_norm": 5.443169116973877, + "learning_rate": 7.474226804123713e-08, + "logits/chosen": 2.246333599090576, + "logits/rejected": 9.324928283691406, + "logps/chosen": -206.28512573242188, + "logps/rejected": -225.4776153564453, + "loss": 0.7056, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.034403469413518906, + "rewards/margins": -0.022684622555971146, + "rewards/rejected": -0.01171884499490261, + "step": 29 + }, + { + "epoch": 0.004639474192924802, + "grad_norm": 5.341912269592285, + "learning_rate": 7.731958762886598e-08, + "logits/chosen": 10.89834213256836, + "logits/rejected": 2.3132970333099365, + "logps/chosen": -408.22601318359375, + "logps/rejected": -238.31373596191406, + "loss": 0.7202, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.023410415276885033, + "rewards/margins": -0.048658084124326706, + "rewards/rejected": 0.025247670710086823, + "step": 30 + }, + { + "epoch": 0.004794123332688962, + "grad_norm": 4.035882949829102, + "learning_rate": 7.989690721649484e-08, + "logits/chosen": 7.057154655456543, + "logits/rejected": 0.49073028564453125, + "logps/chosen": -306.31097412109375, + "logps/rejected": -182.73757934570312, + "loss": 0.6574, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.073747918009758, + "rewards/margins": 0.07770189642906189, + "rewards/rejected": -0.003953981213271618, + "step": 31 + }, + { + "epoch": 0.004948772472453122, + "grad_norm": 5.315826416015625, + "learning_rate": 8.247422680412371e-08, + "logits/chosen": 9.184189796447754, + "logits/rejected": 7.225438117980957, + "logps/chosen": -260.186279296875, + "logps/rejected": -234.22914123535156, + "loss": 0.7139, + "rewards/accuracies": 0.125, + "rewards/chosen": -0.07834196835756302, + "rewards/margins": -0.0400027260184288, + "rewards/rejected": -0.03833923488855362, + "step": 32 + }, + { + "epoch": 0.005103421612217282, + "grad_norm": 4.139220714569092, + "learning_rate": 8.505154639175257e-08, + "logits/chosen": 9.457880020141602, + "logits/rejected": 9.91469955444336, + "logps/chosen": -265.49456787109375, + "logps/rejected": -215.19534301757812, + "loss": 0.6897, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.0047846343368291855, + "rewards/margins": 0.010850241407752037, + "rewards/rejected": -0.006065608002245426, + "step": 33 + }, + { + "epoch": 0.0052580707519814425, + "grad_norm": 8.514988899230957, + "learning_rate": 8.762886597938144e-08, + "logits/chosen": 10.960465431213379, + "logits/rejected": 11.944863319396973, + "logps/chosen": -364.357421875, + "logps/rejected": -378.77349853515625, + "loss": 0.7028, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.0046242717653512955, + "rewards/margins": -0.014529705047607422, + "rewards/rejected": 0.009905435144901276, + "step": 34 + }, + { + "epoch": 0.005412719891745603, + "grad_norm": 5.189547061920166, + "learning_rate": 9.02061855670103e-08, + "logits/chosen": 10.958404541015625, + "logits/rejected": 10.693275451660156, + "logps/chosen": -273.6878967285156, + "logps/rejected": -254.7626190185547, + "loss": 0.6742, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.04495172202587128, + "rewards/margins": 0.04239773750305176, + "rewards/rejected": 0.0025539863854646683, + "step": 35 + }, + { + "epoch": 0.005567369031509763, + "grad_norm": 6.511617660522461, + "learning_rate": 9.278350515463919e-08, + "logits/chosen": 9.624488830566406, + "logits/rejected": 8.125849723815918, + "logps/chosen": -309.81689453125, + "logps/rejected": -246.7415771484375, + "loss": 0.6828, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.025867179036140442, + "rewards/margins": 0.021593429148197174, + "rewards/rejected": 0.004273748956620693, + "step": 36 + }, + { + "epoch": 0.005722018171273923, + "grad_norm": 4.336499214172363, + "learning_rate": 9.536082474226806e-08, + "logits/chosen": 9.214275360107422, + "logits/rejected": 2.736907482147217, + "logps/chosen": -182.815673828125, + "logps/rejected": -139.5016632080078, + "loss": 0.7167, + "rewards/accuracies": 0.125, + "rewards/chosen": -0.00035815173760056496, + "rewards/margins": -0.045297957956790924, + "rewards/rejected": 0.044939808547496796, + "step": 37 + }, + { + "epoch": 0.005876667311038082, + "grad_norm": 5.187273025512695, + "learning_rate": 9.793814432989692e-08, + "logits/chosen": 10.795100212097168, + "logits/rejected": 11.301115036010742, + "logps/chosen": -276.6474304199219, + "logps/rejected": -295.5259704589844, + "loss": 0.6834, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.00455322302877903, + "rewards/margins": 0.02165413089096546, + "rewards/rejected": -0.017100907862186432, + "step": 38 + }, + { + "epoch": 0.006031316450802242, + "grad_norm": 20.21113395690918, + "learning_rate": 1.0051546391752579e-07, + "logits/chosen": 10.748891830444336, + "logits/rejected": 8.51008415222168, + "logps/chosen": -377.041015625, + "logps/rejected": -276.65789794921875, + "loss": 0.6994, + "rewards/accuracies": 0.25, + "rewards/chosen": -0.013005254790186882, + "rewards/margins": -0.004641342908143997, + "rewards/rejected": -0.008363913744688034, + "step": 39 + }, + { + "epoch": 0.006185965590566402, + "grad_norm": 4.1608147621154785, + "learning_rate": 1.0309278350515465e-07, + "logits/chosen": 15.16433048248291, + "logits/rejected": 12.891679763793945, + "logps/chosen": -272.31048583984375, + "logps/rejected": -207.171875, + "loss": 0.7103, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.010014916770160198, + "rewards/margins": -0.028248880058526993, + "rewards/rejected": 0.01823396608233452, + "step": 40 + }, + { + "epoch": 0.006340614730330562, + "grad_norm": 4.690361499786377, + "learning_rate": 1.0567010309278352e-07, + "logits/chosen": 11.853530883789062, + "logits/rejected": 13.110629081726074, + "logps/chosen": -251.1317138671875, + "logps/rejected": -312.3910827636719, + "loss": 0.74, + "rewards/accuracies": 0.125, + "rewards/chosen": -0.03138899803161621, + "rewards/margins": -0.08699889481067657, + "rewards/rejected": 0.055609896779060364, + "step": 41 + }, + { + "epoch": 0.006495263870094722, + "grad_norm": 4.178880214691162, + "learning_rate": 1.0824742268041238e-07, + "logits/chosen": 6.848409175872803, + "logits/rejected": 8.219043731689453, + "logps/chosen": -166.265380859375, + "logps/rejected": -169.47552490234375, + "loss": 0.6972, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.003542447928339243, + "rewards/margins": -0.005707143805921078, + "rewards/rejected": 0.009249591268599033, + "step": 42 + }, + { + "epoch": 0.006649913009858882, + "grad_norm": 4.671844005584717, + "learning_rate": 1.1082474226804125e-07, + "logits/chosen": 9.44178581237793, + "logits/rejected": 2.4399619102478027, + "logps/chosen": -252.87725830078125, + "logps/rejected": -205.69924926757812, + "loss": 0.7013, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.0025249021127820015, + "rewards/margins": -0.012448456138372421, + "rewards/rejected": 0.009923554956912994, + "step": 43 + }, + { + "epoch": 0.006804562149623042, + "grad_norm": 7.647763252258301, + "learning_rate": 1.1340206185567011e-07, + "logits/chosen": 9.81114673614502, + "logits/rejected": 2.1354012489318848, + "logps/chosen": -230.75701904296875, + "logps/rejected": -146.80470275878906, + "loss": 0.7639, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.05395350605249405, + "rewards/margins": -0.12842561304569244, + "rewards/rejected": 0.0744720995426178, + "step": 44 + }, + { + "epoch": 0.0069592112893872024, + "grad_norm": 3.828352451324463, + "learning_rate": 1.1597938144329898e-07, + "logits/chosen": 12.649640083312988, + "logits/rejected": 10.158154487609863, + "logps/chosen": -221.09107971191406, + "logps/rejected": -145.47027587890625, + "loss": 0.6753, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.012367580085992813, + "rewards/margins": 0.037506103515625, + "rewards/rejected": -0.02513851970434189, + "step": 45 + }, + { + "epoch": 0.0071138604291513625, + "grad_norm": 4.946978569030762, + "learning_rate": 1.1855670103092784e-07, + "logits/chosen": 10.738189697265625, + "logits/rejected": 11.961038589477539, + "logps/chosen": -219.86959838867188, + "logps/rejected": -224.05934143066406, + "loss": 0.7233, + "rewards/accuracies": 0.125, + "rewards/chosen": 0.019327593967318535, + "rewards/margins": -0.05806604027748108, + "rewards/rejected": 0.07739362865686417, + "step": 46 + }, + { + "epoch": 0.007268509568915523, + "grad_norm": 4.380319595336914, + "learning_rate": 1.211340206185567e-07, + "logits/chosen": 17.339160919189453, + "logits/rejected": 6.111804962158203, + "logps/chosen": -201.5426025390625, + "logps/rejected": -145.981201171875, + "loss": 0.6785, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.055193666368722916, + "rewards/margins": 0.03219471126794815, + "rewards/rejected": 0.022998955100774765, + "step": 47 + }, + { + "epoch": 0.007423158708679683, + "grad_norm": 3.666557550430298, + "learning_rate": 1.2371134020618556e-07, + "logits/chosen": 13.23106861114502, + "logits/rejected": 8.200860977172852, + "logps/chosen": -166.75164794921875, + "logps/rejected": -111.83175659179688, + "loss": 0.6843, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.0284881591796875, + "rewards/margins": 0.019208334386348724, + "rewards/rejected": 0.009279822930693626, + "step": 48 + }, + { + "epoch": 0.007577807848443843, + "grad_norm": 5.851673126220703, + "learning_rate": 1.2628865979381446e-07, + "logits/chosen": 4.434458255767822, + "logits/rejected": 5.869389057159424, + "logps/chosen": -293.1557922363281, + "logps/rejected": -324.98394775390625, + "loss": 0.6812, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.02338256873190403, + "rewards/margins": 0.02982187271118164, + "rewards/rejected": -0.006439303979277611, + "step": 49 + }, + { + "epoch": 0.007732456988208003, + "grad_norm": 7.123608112335205, + "learning_rate": 1.288659793814433e-07, + "logits/chosen": 12.065396308898926, + "logits/rejected": 6.792296886444092, + "logps/chosen": -243.808349609375, + "logps/rejected": -256.5181579589844, + "loss": 0.661, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.03321099281311035, + "rewards/margins": 0.06735959649085999, + "rewards/rejected": -0.03414859622716904, + "step": 50 + }, + { + "epoch": 0.007887106127972164, + "grad_norm": 4.990226745605469, + "learning_rate": 1.3144329896907217e-07, + "logits/chosen": 9.566271781921387, + "logits/rejected": 13.786304473876953, + "logps/chosen": -238.0357666015625, + "logps/rejected": -297.1591796875, + "loss": 0.752, + "rewards/accuracies": 0.125, + "rewards/chosen": -0.042253829538822174, + "rewards/margins": -0.10889139026403427, + "rewards/rejected": 0.0666375607252121, + "step": 51 + }, + { + "epoch": 0.008041755267736324, + "grad_norm": 3.86415696144104, + "learning_rate": 1.3402061855670105e-07, + "logits/chosen": 2.9946484565734863, + "logits/rejected": 2.850429058074951, + "logps/chosen": -228.489501953125, + "logps/rejected": -198.38775634765625, + "loss": 0.6746, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.07479849457740784, + "rewards/margins": 0.04300132393836975, + "rewards/rejected": 0.031797170639038086, + "step": 52 + }, + { + "epoch": 0.008196404407500484, + "grad_norm": 9.43787956237793, + "learning_rate": 1.3659793814432992e-07, + "logits/chosen": 8.113728523254395, + "logits/rejected": 4.7881245613098145, + "logps/chosen": -421.5853576660156, + "logps/rejected": -274.0136413574219, + "loss": 0.6726, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.0017354898154735565, + "rewards/margins": 0.04958948865532875, + "rewards/rejected": -0.047853998839855194, + "step": 53 + }, + { + "epoch": 0.008351053547264644, + "grad_norm": 5.2647881507873535, + "learning_rate": 1.391752577319588e-07, + "logits/chosen": 13.694591522216797, + "logits/rejected": 8.33656120300293, + "logps/chosen": -310.445068359375, + "logps/rejected": -200.9225311279297, + "loss": 0.6577, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.05243654549121857, + "rewards/margins": 0.07473216205835342, + "rewards/rejected": -0.022295618429780006, + "step": 54 + }, + { + "epoch": 0.008505702687028804, + "grad_norm": 5.5423665046691895, + "learning_rate": 1.4175257731958764e-07, + "logits/chosen": 14.974117279052734, + "logits/rejected": 8.096324920654297, + "logps/chosen": -318.1484680175781, + "logps/rejected": -251.4805908203125, + "loss": 0.7033, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.0038333898410201073, + "rewards/margins": -0.017112065106630325, + "rewards/rejected": 0.020945454016327858, + "step": 55 + }, + { + "epoch": 0.008660351826792964, + "grad_norm": 5.278740406036377, + "learning_rate": 1.443298969072165e-07, + "logits/chosen": 6.886699676513672, + "logits/rejected": 5.67666482925415, + "logps/chosen": -430.3440246582031, + "logps/rejected": -272.5577392578125, + "loss": 0.7056, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.008675767108798027, + "rewards/margins": -0.023654652759432793, + "rewards/rejected": 0.014978885650634766, + "step": 56 + }, + { + "epoch": 0.008815000966557124, + "grad_norm": 7.0525336265563965, + "learning_rate": 1.4690721649484538e-07, + "logits/chosen": 11.573299407958984, + "logits/rejected": 6.818861484527588, + "logps/chosen": -327.9615173339844, + "logps/rejected": -329.19232177734375, + "loss": 0.6403, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.06640644371509552, + "rewards/margins": 0.11603251099586487, + "rewards/rejected": -0.04962606728076935, + "step": 57 + }, + { + "epoch": 0.008969650106321284, + "grad_norm": 5.868617534637451, + "learning_rate": 1.4948453608247425e-07, + "logits/chosen": 13.533453941345215, + "logits/rejected": 12.22661018371582, + "logps/chosen": -325.85284423828125, + "logps/rejected": -333.2220764160156, + "loss": 0.695, + "rewards/accuracies": 0.25, + "rewards/chosen": 0.026420211419463158, + "rewards/margins": -0.0008131072390824556, + "rewards/rejected": 0.027233313769102097, + "step": 58 + }, + { + "epoch": 0.009124299246085444, + "grad_norm": 3.9673924446105957, + "learning_rate": 1.520618556701031e-07, + "logits/chosen": 11.48481559753418, + "logits/rejected": -0.8207135200500488, + "logps/chosen": -279.1275939941406, + "logps/rejected": -135.3128662109375, + "loss": 0.7097, + "rewards/accuracies": 0.25, + "rewards/chosen": 0.0018380163237452507, + "rewards/margins": -0.03214993700385094, + "rewards/rejected": 0.033987950533628464, + "step": 59 + }, + { + "epoch": 0.009278948385849604, + "grad_norm": 4.902527809143066, + "learning_rate": 1.5463917525773197e-07, + "logits/chosen": 12.110644340515137, + "logits/rejected": 10.14527702331543, + "logps/chosen": -291.07928466796875, + "logps/rejected": -293.1893310546875, + "loss": 0.6902, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.0004943609237670898, + "rewards/margins": 0.01345379650592804, + "rewards/rejected": -0.01394815556704998, + "step": 60 + }, + { + "epoch": 0.009433597525613764, + "grad_norm": 3.8037991523742676, + "learning_rate": 1.5721649484536084e-07, + "logits/chosen": 12.840753555297852, + "logits/rejected": 5.81508207321167, + "logps/chosen": -178.47412109375, + "logps/rejected": -151.3430633544922, + "loss": 0.7173, + "rewards/accuracies": 0.125, + "rewards/chosen": -0.001392459962517023, + "rewards/margins": -0.04688852280378342, + "rewards/rejected": 0.04549605771899223, + "step": 61 + }, + { + "epoch": 0.009588246665377925, + "grad_norm": 3.996386766433716, + "learning_rate": 1.5979381443298969e-07, + "logits/chosen": 6.25969123840332, + "logits/rejected": 4.946198463439941, + "logps/chosen": -213.48226928710938, + "logps/rejected": -213.9498291015625, + "loss": 0.6963, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.007098199799656868, + "rewards/margins": -0.0002037547528743744, + "rewards/rejected": -0.006894445046782494, + "step": 62 + }, + { + "epoch": 0.009742895805142085, + "grad_norm": 5.785022258758545, + "learning_rate": 1.6237113402061858e-07, + "logits/chosen": 10.98247241973877, + "logits/rejected": 8.06905460357666, + "logps/chosen": -291.7880859375, + "logps/rejected": -269.6845397949219, + "loss": 0.7234, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.0038303378969430923, + "rewards/margins": -0.05777034908533096, + "rewards/rejected": 0.0616006925702095, + "step": 63 + }, + { + "epoch": 0.009897544944906245, + "grad_norm": 4.6899871826171875, + "learning_rate": 1.6494845360824743e-07, + "logits/chosen": 6.98136043548584, + "logits/rejected": 8.341474533081055, + "logps/chosen": -186.41683959960938, + "logps/rejected": -244.73849487304688, + "loss": 0.6532, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.011904047802090645, + "rewards/margins": 0.08552099019289017, + "rewards/rejected": -0.07361693680286407, + "step": 64 + }, + { + "epoch": 0.010052194084670405, + "grad_norm": 8.800151824951172, + "learning_rate": 1.675257731958763e-07, + "logits/chosen": 0.23321041464805603, + "logits/rejected": 6.668447971343994, + "logps/chosen": -258.418701171875, + "logps/rejected": -374.27886962890625, + "loss": 0.6568, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.05098380893468857, + "rewards/margins": 0.08276348561048508, + "rewards/rejected": -0.03177966922521591, + "step": 65 + }, + { + "epoch": 0.010206843224434565, + "grad_norm": 5.204796314239502, + "learning_rate": 1.7010309278350515e-07, + "logits/chosen": 2.947244167327881, + "logits/rejected": 3.6227288246154785, + "logps/chosen": -145.93878173828125, + "logps/rejected": -151.84219360351562, + "loss": 0.709, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.0032607070170342922, + "rewards/margins": -0.02523498237133026, + "rewards/rejected": 0.02197427675127983, + "step": 66 + }, + { + "epoch": 0.010361492364198725, + "grad_norm": 5.2081732749938965, + "learning_rate": 1.7268041237113404e-07, + "logits/chosen": 8.921749114990234, + "logits/rejected": 8.524456024169922, + "logps/chosen": -222.26181030273438, + "logps/rejected": -163.75942993164062, + "loss": 0.7049, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.01697554439306259, + "rewards/margins": -0.022300314158201218, + "rewards/rejected": 0.005324769299477339, + "step": 67 + }, + { + "epoch": 0.010516141503962885, + "grad_norm": 4.866977214813232, + "learning_rate": 1.752577319587629e-07, + "logits/chosen": 12.838434219360352, + "logits/rejected": 11.409696578979492, + "logps/chosen": -328.6891784667969, + "logps/rejected": -283.2961120605469, + "loss": 0.6979, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.004480741918087006, + "rewards/margins": -0.0014948388561606407, + "rewards/rejected": 0.0059755826368927956, + "step": 68 + }, + { + "epoch": 0.010670790643727045, + "grad_norm": 7.5063700675964355, + "learning_rate": 1.7783505154639176e-07, + "logits/chosen": 8.556709289550781, + "logits/rejected": 8.246336936950684, + "logps/chosen": -422.247802734375, + "logps/rejected": -360.22540283203125, + "loss": 0.7694, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.06881704181432724, + "rewards/margins": -0.13730239868164062, + "rewards/rejected": 0.06848535686731339, + "step": 69 + }, + { + "epoch": 0.010825439783491205, + "grad_norm": 5.279910087585449, + "learning_rate": 1.804123711340206e-07, + "logits/chosen": 12.430498123168945, + "logits/rejected": 7.914384365081787, + "logps/chosen": -238.27554321289062, + "logps/rejected": -174.36917114257812, + "loss": 0.677, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.019179202616214752, + "rewards/margins": 0.034891657531261444, + "rewards/rejected": -0.015712453052401543, + "step": 70 + }, + { + "epoch": 0.010980088923255365, + "grad_norm": 5.323324680328369, + "learning_rate": 1.829896907216495e-07, + "logits/chosen": 7.749332427978516, + "logits/rejected": 4.102944374084473, + "logps/chosen": -226.2488555908203, + "logps/rejected": -150.99114990234375, + "loss": 0.7017, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.055611733347177505, + "rewards/margins": -0.013157534413039684, + "rewards/rejected": -0.04245419427752495, + "step": 71 + }, + { + "epoch": 0.011134738063019525, + "grad_norm": 7.350645065307617, + "learning_rate": 1.8556701030927838e-07, + "logits/chosen": 8.904353141784668, + "logits/rejected": 7.493212699890137, + "logps/chosen": -319.02978515625, + "logps/rejected": -309.9676513671875, + "loss": 0.6731, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.0036481861025094986, + "rewards/margins": 0.0438537634909153, + "rewards/rejected": -0.047501951456069946, + "step": 72 + }, + { + "epoch": 0.011289387202783685, + "grad_norm": 5.224555969238281, + "learning_rate": 1.8814432989690722e-07, + "logits/chosen": 12.686028480529785, + "logits/rejected": 8.858221054077148, + "logps/chosen": -298.44476318359375, + "logps/rejected": -260.1572265625, + "loss": 0.7105, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.03465289995074272, + "rewards/margins": -0.03115687146782875, + "rewards/rejected": -0.003496028482913971, + "step": 73 + }, + { + "epoch": 0.011444036342547845, + "grad_norm": 4.9264044761657715, + "learning_rate": 1.9072164948453612e-07, + "logits/chosen": 7.611832618713379, + "logits/rejected": 11.591278076171875, + "logps/chosen": -361.92724609375, + "logps/rejected": -452.9621887207031, + "loss": 0.6952, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.03055468015372753, + "rewards/margins": 0.006420614197850227, + "rewards/rejected": 0.024134064093232155, + "step": 74 + }, + { + "epoch": 0.011598685482312004, + "grad_norm": 6.524920463562012, + "learning_rate": 1.9329896907216497e-07, + "logits/chosen": 10.921269416809082, + "logits/rejected": 7.254299163818359, + "logps/chosen": -237.08285522460938, + "logps/rejected": -226.239501953125, + "loss": 0.7351, + "rewards/accuracies": 0.125, + "rewards/chosen": -0.060632992535829544, + "rewards/margins": -0.08086157590150833, + "rewards/rejected": 0.02022857591509819, + "step": 75 + }, + { + "epoch": 0.011753334622076164, + "grad_norm": 5.861778736114502, + "learning_rate": 1.9587628865979384e-07, + "logits/chosen": 5.802974224090576, + "logits/rejected": 4.184164524078369, + "logps/chosen": -245.282958984375, + "logps/rejected": -231.754150390625, + "loss": 0.6817, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.0020096758380532265, + "rewards/margins": 0.026843644678592682, + "rewards/rejected": -0.02483396604657173, + "step": 76 + }, + { + "epoch": 0.011907983761840324, + "grad_norm": 5.2542724609375, + "learning_rate": 1.9845360824742268e-07, + "logits/chosen": 12.744388580322266, + "logits/rejected": 8.454718589782715, + "logps/chosen": -358.8239440917969, + "logps/rejected": -302.8267822265625, + "loss": 0.6689, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.03047761879861355, + "rewards/margins": 0.05145206302404404, + "rewards/rejected": -0.020974446088075638, + "step": 77 + }, + { + "epoch": 0.012062632901604484, + "grad_norm": 4.325829029083252, + "learning_rate": 2.0103092783505158e-07, + "logits/chosen": 9.856746673583984, + "logits/rejected": 12.297477722167969, + "logps/chosen": -209.46218872070312, + "logps/rejected": -284.36248779296875, + "loss": 0.6958, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.0018074996769428253, + "rewards/margins": -0.0029071811586618423, + "rewards/rejected": 0.004714678041636944, + "step": 78 + }, + { + "epoch": 0.012217282041368644, + "grad_norm": 4.3449273109436035, + "learning_rate": 2.0360824742268043e-07, + "logits/chosen": 10.401052474975586, + "logits/rejected": 0.46581029891967773, + "logps/chosen": -286.626953125, + "logps/rejected": -175.97720336914062, + "loss": 0.6794, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.012897204607725143, + "rewards/margins": 0.030520722270011902, + "rewards/rejected": -0.017623521387577057, + "step": 79 + }, + { + "epoch": 0.012371931181132804, + "grad_norm": 5.984966278076172, + "learning_rate": 2.061855670103093e-07, + "logits/chosen": 15.044057846069336, + "logits/rejected": 9.719198226928711, + "logps/chosen": -321.7178955078125, + "logps/rejected": -354.29620361328125, + "loss": 0.7061, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.03402872383594513, + "rewards/margins": -0.020597314462065697, + "rewards/rejected": -0.013431406579911709, + "step": 80 + }, + { + "epoch": 0.012526580320896964, + "grad_norm": 3.8173110485076904, + "learning_rate": 2.0876288659793814e-07, + "logits/chosen": 8.523874282836914, + "logits/rejected": 5.635761260986328, + "logps/chosen": -167.03334045410156, + "logps/rejected": -187.72418212890625, + "loss": 0.664, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.0035264017060399055, + "rewards/margins": 0.06371049582958221, + "rewards/rejected": -0.06018409878015518, + "step": 81 + }, + { + "epoch": 0.012681229460661124, + "grad_norm": 4.351160049438477, + "learning_rate": 2.1134020618556704e-07, + "logits/chosen": 6.980379104614258, + "logits/rejected": 10.344731330871582, + "logps/chosen": -148.7363739013672, + "logps/rejected": -154.00201416015625, + "loss": 0.7276, + "rewards/accuracies": 0.25, + "rewards/chosen": -0.020682476460933685, + "rewards/margins": -0.06525816023349762, + "rewards/rejected": 0.04457569122314453, + "step": 82 + }, + { + "epoch": 0.012835878600425284, + "grad_norm": 5.728026390075684, + "learning_rate": 2.139175257731959e-07, + "logits/chosen": 9.606285095214844, + "logits/rejected": 8.50086784362793, + "logps/chosen": -352.49932861328125, + "logps/rejected": -303.3203125, + "loss": 0.6661, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.013780402019619942, + "rewards/margins": 0.057223130017519, + "rewards/rejected": -0.07100353389978409, + "step": 83 + }, + { + "epoch": 0.012990527740189444, + "grad_norm": 9.317477226257324, + "learning_rate": 2.1649484536082476e-07, + "logits/chosen": 10.684642791748047, + "logits/rejected": 7.32394552230835, + "logps/chosen": -330.871826171875, + "logps/rejected": -251.26858520507812, + "loss": 0.751, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.0837281197309494, + "rewards/margins": -0.10600291192531586, + "rewards/rejected": 0.02227477915585041, + "step": 84 + }, + { + "epoch": 0.013145176879953605, + "grad_norm": 6.320870399475098, + "learning_rate": 2.190721649484536e-07, + "logits/chosen": 6.818489074707031, + "logits/rejected": 4.175490379333496, + "logps/chosen": -322.9170837402344, + "logps/rejected": -286.32928466796875, + "loss": 0.7179, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.03987712785601616, + "rewards/margins": -0.04615812748670578, + "rewards/rejected": 0.006280993577092886, + "step": 85 + }, + { + "epoch": 0.013299826019717765, + "grad_norm": 9.204850196838379, + "learning_rate": 2.216494845360825e-07, + "logits/chosen": 8.331786155700684, + "logits/rejected": 7.814069747924805, + "logps/chosen": -486.9302978515625, + "logps/rejected": -450.00140380859375, + "loss": 0.7185, + "rewards/accuracies": 0.25, + "rewards/chosen": -0.035970304161310196, + "rewards/margins": -0.03715171292424202, + "rewards/rejected": 0.0011814087629318237, + "step": 86 + }, + { + "epoch": 0.013454475159481925, + "grad_norm": 5.094347953796387, + "learning_rate": 2.2422680412371135e-07, + "logits/chosen": 9.036510467529297, + "logits/rejected": 5.101694107055664, + "logps/chosen": -305.684326171875, + "logps/rejected": -170.80503845214844, + "loss": 0.6963, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.014848662540316582, + "rewards/margins": 0.0014084815047681332, + "rewards/rejected": 0.013440179638564587, + "step": 87 + }, + { + "epoch": 0.013609124299246085, + "grad_norm": 3.4267141819000244, + "learning_rate": 2.2680412371134022e-07, + "logits/chosen": 17.177186965942383, + "logits/rejected": 10.287654876708984, + "logps/chosen": -189.2765655517578, + "logps/rejected": -184.96090698242188, + "loss": 0.6992, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.013287735171616077, + "rewards/margins": -0.011234856210649014, + "rewards/rejected": -0.002052880357950926, + "step": 88 + }, + { + "epoch": 0.013763773439010245, + "grad_norm": 3.644435167312622, + "learning_rate": 2.2938144329896907e-07, + "logits/chosen": 8.002180099487305, + "logits/rejected": 12.51270866394043, + "logps/chosen": -109.40216064453125, + "logps/rejected": -209.42178344726562, + "loss": 0.7122, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.029180288314819336, + "rewards/margins": -0.0355406329035759, + "rewards/rejected": 0.006360341794788837, + "step": 89 + }, + { + "epoch": 0.013918422578774405, + "grad_norm": 4.993024826049805, + "learning_rate": 2.3195876288659797e-07, + "logits/chosen": 13.116409301757812, + "logits/rejected": 8.573915481567383, + "logps/chosen": -285.0827331542969, + "logps/rejected": -235.10255432128906, + "loss": 0.6964, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.019628314301371574, + "rewards/margins": -0.0007615312933921814, + "rewards/rejected": 0.020389841869473457, + "step": 90 + }, + { + "epoch": 0.014073071718538565, + "grad_norm": 3.503422498703003, + "learning_rate": 2.345360824742268e-07, + "logits/chosen": 11.814714431762695, + "logits/rejected": 7.204892635345459, + "logps/chosen": -193.9886474609375, + "logps/rejected": -129.6933135986328, + "loss": 0.687, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.024440957233309746, + "rewards/margins": 0.014166689477860928, + "rewards/rejected": -0.0386076457798481, + "step": 91 + }, + { + "epoch": 0.014227720858302725, + "grad_norm": 4.424459934234619, + "learning_rate": 2.3711340206185568e-07, + "logits/chosen": 8.881943702697754, + "logits/rejected": 9.744006156921387, + "logps/chosen": -303.65576171875, + "logps/rejected": -298.8911437988281, + "loss": 0.7193, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.0425170436501503, + "rewards/margins": -0.048736244440078735, + "rewards/rejected": 0.006219197064638138, + "step": 92 + }, + { + "epoch": 0.014382369998066885, + "grad_norm": 5.523021697998047, + "learning_rate": 2.3969072164948455e-07, + "logits/chosen": 10.548042297363281, + "logits/rejected": 7.480461120605469, + "logps/chosen": -267.7251892089844, + "logps/rejected": -251.8106689453125, + "loss": 0.712, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.004119347780942917, + "rewards/margins": -0.032270170748233795, + "rewards/rejected": 0.02815082296729088, + "step": 93 + }, + { + "epoch": 0.014537019137831045, + "grad_norm": 6.018385887145996, + "learning_rate": 2.422680412371134e-07, + "logits/chosen": 9.545321464538574, + "logits/rejected": 5.171817779541016, + "logps/chosen": -192.32418823242188, + "logps/rejected": -162.64511108398438, + "loss": 0.7192, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.01340007595717907, + "rewards/margins": -0.04911398887634277, + "rewards/rejected": 0.06251406669616699, + "step": 94 + }, + { + "epoch": 0.014691668277595205, + "grad_norm": 3.7559831142425537, + "learning_rate": 2.448453608247423e-07, + "logits/chosen": 12.315309524536133, + "logits/rejected": 13.280659675598145, + "logps/chosen": -205.85284423828125, + "logps/rejected": -168.16867065429688, + "loss": 0.6783, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.014152051880955696, + "rewards/margins": 0.03300756961107254, + "rewards/rejected": -0.01885552518069744, + "step": 95 + }, + { + "epoch": 0.014846317417359365, + "grad_norm": 5.631770133972168, + "learning_rate": 2.474226804123711e-07, + "logits/chosen": 8.322446823120117, + "logits/rejected": 5.429612636566162, + "logps/chosen": -250.402587890625, + "logps/rejected": -206.419189453125, + "loss": 0.6821, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.05188605934381485, + "rewards/margins": 0.024941157549619675, + "rewards/rejected": 0.026944901794195175, + "step": 96 + }, + { + "epoch": 0.015000966557123525, + "grad_norm": 4.542779922485352, + "learning_rate": 2.5000000000000004e-07, + "logits/chosen": 3.9670815467834473, + "logits/rejected": 7.176328182220459, + "logps/chosen": -354.7858581542969, + "logps/rejected": -319.0560607910156, + "loss": 0.6375, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.08435390144586563, + "rewards/margins": 0.11990010738372803, + "rewards/rejected": -0.035546205937862396, + "step": 97 + }, + { + "epoch": 0.015155615696887685, + "grad_norm": 8.824295997619629, + "learning_rate": 2.525773195876289e-07, + "logits/chosen": 6.885655403137207, + "logits/rejected": 7.994781494140625, + "logps/chosen": -394.6037292480469, + "logps/rejected": -472.6541748046875, + "loss": 0.6897, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.004859543405473232, + "rewards/margins": 0.012601710855960846, + "rewards/rejected": -0.007742166519165039, + "step": 98 + }, + { + "epoch": 0.015310264836651846, + "grad_norm": 4.0067219734191895, + "learning_rate": 2.5515463917525773e-07, + "logits/chosen": 7.246218681335449, + "logits/rejected": 0.9063689708709717, + "logps/chosen": -260.90106201171875, + "logps/rejected": -166.2843475341797, + "loss": 0.6978, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.02266242541372776, + "rewards/margins": -0.007983684539794922, + "rewards/rejected": -0.014678740873932838, + "step": 99 + }, + { + "epoch": 0.015464913976416006, + "grad_norm": 5.003303527832031, + "learning_rate": 2.577319587628866e-07, + "logits/chosen": 6.180871486663818, + "logits/rejected": 2.7873902320861816, + "logps/chosen": -300.70928955078125, + "logps/rejected": -213.31500244140625, + "loss": 0.6601, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.06647412478923798, + "rewards/margins": 0.06903731822967529, + "rewards/rejected": -0.0025631911121308804, + "step": 100 + }, + { + "epoch": 0.015619563116180166, + "grad_norm": 6.092252731323242, + "learning_rate": 2.603092783505155e-07, + "logits/chosen": 16.338642120361328, + "logits/rejected": 9.950223922729492, + "logps/chosen": -262.9970703125, + "logps/rejected": -246.46461486816406, + "loss": 0.6718, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.037627674639225006, + "rewards/margins": 0.05207538604736328, + "rewards/rejected": -0.01444771233946085, + "step": 101 + }, + { + "epoch": 0.015774212255944327, + "grad_norm": 4.586551189422607, + "learning_rate": 2.6288659793814435e-07, + "logits/chosen": 9.419777870178223, + "logits/rejected": 1.2569756507873535, + "logps/chosen": -283.7944030761719, + "logps/rejected": -207.13519287109375, + "loss": 0.705, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.0035183895379304886, + "rewards/margins": -0.018809601664543152, + "rewards/rejected": 0.015291215851902962, + "step": 102 + }, + { + "epoch": 0.015928861395708486, + "grad_norm": 5.104458332061768, + "learning_rate": 2.654639175257732e-07, + "logits/chosen": 9.748336791992188, + "logits/rejected": 10.516401290893555, + "logps/chosen": -267.2437438964844, + "logps/rejected": -233.54745483398438, + "loss": 0.7415, + "rewards/accuracies": 0.25, + "rewards/chosen": -0.04259653016924858, + "rewards/margins": -0.09110765904188156, + "rewards/rejected": 0.04851112514734268, + "step": 103 + }, + { + "epoch": 0.016083510535472648, + "grad_norm": 4.945676803588867, + "learning_rate": 2.680412371134021e-07, + "logits/chosen": 6.208408355712891, + "logits/rejected": 11.386116027832031, + "logps/chosen": -256.3028259277344, + "logps/rejected": -299.6868591308594, + "loss": 0.6756, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.011820506304502487, + "rewards/margins": 0.037409354001283646, + "rewards/rejected": -0.02558884769678116, + "step": 104 + }, + { + "epoch": 0.016238159675236806, + "grad_norm": 5.4342875480651855, + "learning_rate": 2.7061855670103096e-07, + "logits/chosen": 12.268007278442383, + "logits/rejected": 7.494583606719971, + "logps/chosen": -398.30792236328125, + "logps/rejected": -256.7729797363281, + "loss": 0.6882, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.0017288196831941605, + "rewards/margins": 0.018518351018428802, + "rewards/rejected": -0.020247172564268112, + "step": 105 + }, + { + "epoch": 0.016392808815000968, + "grad_norm": 5.318408966064453, + "learning_rate": 2.7319587628865984e-07, + "logits/chosen": 7.161886215209961, + "logits/rejected": 5.1986494064331055, + "logps/chosen": -309.04693603515625, + "logps/rejected": -280.927490234375, + "loss": 0.6737, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.0156279094517231, + "rewards/margins": 0.04376005753874779, + "rewards/rejected": -0.028132153674960136, + "step": 106 + }, + { + "epoch": 0.016547457954765126, + "grad_norm": 5.483700275421143, + "learning_rate": 2.7577319587628865e-07, + "logits/chosen": 10.534073829650879, + "logits/rejected": 11.566827774047852, + "logps/chosen": -250.28680419921875, + "logps/rejected": -362.1705322265625, + "loss": 0.6835, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.008404970169067383, + "rewards/margins": 0.022122427821159363, + "rewards/rejected": -0.01371745951473713, + "step": 107 + }, + { + "epoch": 0.016702107094529288, + "grad_norm": 4.226380825042725, + "learning_rate": 2.783505154639176e-07, + "logits/chosen": 8.030059814453125, + "logits/rejected": 6.6773810386657715, + "logps/chosen": -275.9827880859375, + "logps/rejected": -259.358642578125, + "loss": 0.6674, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.030481815338134766, + "rewards/margins": 0.056189581751823425, + "rewards/rejected": -0.02570776641368866, + "step": 108 + }, + { + "epoch": 0.016856756234293446, + "grad_norm": 4.288832664489746, + "learning_rate": 2.809278350515464e-07, + "logits/chosen": 5.9005126953125, + "logits/rejected": 3.222754716873169, + "logps/chosen": -239.56863403320312, + "logps/rejected": -230.7248992919922, + "loss": 0.7088, + "rewards/accuracies": 0.25, + "rewards/chosen": 0.019434787333011627, + "rewards/margins": -0.029011959210038185, + "rewards/rejected": 0.04844675213098526, + "step": 109 + }, + { + "epoch": 0.017011405374057608, + "grad_norm": 5.753316879272461, + "learning_rate": 2.8350515463917527e-07, + "logits/chosen": 9.93628215789795, + "logits/rejected": 5.556370735168457, + "logps/chosen": -377.8464050292969, + "logps/rejected": -261.0926513671875, + "loss": 0.695, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.008871268481016159, + "rewards/margins": -0.001321074552834034, + "rewards/rejected": 0.010192345827817917, + "step": 110 + }, + { + "epoch": 0.017166054513821766, + "grad_norm": 5.302252292633057, + "learning_rate": 2.8608247422680414e-07, + "logits/chosen": 9.056117057800293, + "logits/rejected": 7.192343711853027, + "logps/chosen": -256.74346923828125, + "logps/rejected": -260.11798095703125, + "loss": 0.6637, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.028095673769712448, + "rewards/margins": 0.06538401544094086, + "rewards/rejected": -0.03728833422064781, + "step": 111 + }, + { + "epoch": 0.017320703653585928, + "grad_norm": 4.046882152557373, + "learning_rate": 2.88659793814433e-07, + "logits/chosen": 12.280829429626465, + "logits/rejected": 7.815947532653809, + "logps/chosen": -211.55368041992188, + "logps/rejected": -204.84384155273438, + "loss": 0.6957, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.01647648960351944, + "rewards/margins": -0.004299450665712357, + "rewards/rejected": -0.012177038937807083, + "step": 112 + }, + { + "epoch": 0.017475352793350087, + "grad_norm": 6.47496223449707, + "learning_rate": 2.912371134020619e-07, + "logits/chosen": 8.501127243041992, + "logits/rejected": 0.23458325862884521, + "logps/chosen": -377.19732666015625, + "logps/rejected": -308.78424072265625, + "loss": 0.7093, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.07137012481689453, + "rewards/margins": -0.030649472028017044, + "rewards/rejected": -0.04072065278887749, + "step": 113 + }, + { + "epoch": 0.01763000193311425, + "grad_norm": 5.730913162231445, + "learning_rate": 2.9381443298969076e-07, + "logits/chosen": 13.995811462402344, + "logits/rejected": -0.11919510364532471, + "logps/chosen": -325.1755676269531, + "logps/rejected": -133.4680633544922, + "loss": 0.6689, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.017102956771850586, + "rewards/margins": 0.05096147209405899, + "rewards/rejected": -0.033858511596918106, + "step": 114 + }, + { + "epoch": 0.017784651072878407, + "grad_norm": 4.757335186004639, + "learning_rate": 2.963917525773196e-07, + "logits/chosen": 10.673439025878906, + "logits/rejected": 2.0020837783813477, + "logps/chosen": -271.75323486328125, + "logps/rejected": -191.2560272216797, + "loss": 0.6589, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.06459856033325195, + "rewards/margins": 0.08755870163440704, + "rewards/rejected": -0.02296013943850994, + "step": 115 + }, + { + "epoch": 0.01793930021264257, + "grad_norm": 6.322711944580078, + "learning_rate": 2.989690721649485e-07, + "logits/chosen": 19.059375762939453, + "logits/rejected": 16.80080223083496, + "logps/chosen": -327.9287109375, + "logps/rejected": -344.2548522949219, + "loss": 0.6679, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.055001161992549896, + "rewards/margins": 0.057523250579833984, + "rewards/rejected": -0.002522086724638939, + "step": 116 + }, + { + "epoch": 0.018093949352406727, + "grad_norm": 5.038933277130127, + "learning_rate": 3.0154639175257737e-07, + "logits/chosen": 7.062486171722412, + "logits/rejected": 3.4042725563049316, + "logps/chosen": -319.0403137207031, + "logps/rejected": -235.81442260742188, + "loss": 0.7182, + "rewards/accuracies": 0.25, + "rewards/chosen": -0.028659485280513763, + "rewards/margins": -0.047144271433353424, + "rewards/rejected": 0.018484782427549362, + "step": 117 + }, + { + "epoch": 0.01824859849217089, + "grad_norm": 3.1864757537841797, + "learning_rate": 3.041237113402062e-07, + "logits/chosen": 8.75621223449707, + "logits/rejected": 5.865131378173828, + "logps/chosen": -162.49127197265625, + "logps/rejected": -152.47247314453125, + "loss": 0.6563, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.052471160888671875, + "rewards/margins": 0.07654944062232971, + "rewards/rejected": -0.02407827600836754, + "step": 118 + }, + { + "epoch": 0.018403247631935047, + "grad_norm": 6.459438323974609, + "learning_rate": 3.0670103092783506e-07, + "logits/chosen": 4.881914138793945, + "logits/rejected": 4.98189115524292, + "logps/chosen": -414.2001037597656, + "logps/rejected": -396.5462341308594, + "loss": 0.7469, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.08090643584728241, + "rewards/margins": -0.09828218817710876, + "rewards/rejected": 0.017375757917761803, + "step": 119 + }, + { + "epoch": 0.01855789677169921, + "grad_norm": 6.3285980224609375, + "learning_rate": 3.0927835051546394e-07, + "logits/chosen": 12.205191612243652, + "logits/rejected": 9.061829566955566, + "logps/chosen": -264.37127685546875, + "logps/rejected": -220.64028930664062, + "loss": 0.6812, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.02196674421429634, + "rewards/margins": 0.028208348900079727, + "rewards/rejected": -0.00624160747975111, + "step": 120 + }, + { + "epoch": 0.018712545911463367, + "grad_norm": 5.029258728027344, + "learning_rate": 3.118556701030928e-07, + "logits/chosen": 8.74778938293457, + "logits/rejected": 6.037238121032715, + "logps/chosen": -285.9646301269531, + "logps/rejected": -211.68907165527344, + "loss": 0.7156, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.010250378400087357, + "rewards/margins": -0.03964881971478462, + "rewards/rejected": 0.029398441314697266, + "step": 121 + }, + { + "epoch": 0.01886719505122753, + "grad_norm": 5.2786173820495605, + "learning_rate": 3.144329896907217e-07, + "logits/chosen": 9.570748329162598, + "logits/rejected": 13.717671394348145, + "logps/chosen": -218.76718139648438, + "logps/rejected": -259.5052795410156, + "loss": 0.69, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.02082233503460884, + "rewards/margins": 0.009538032114505768, + "rewards/rejected": -0.03036036714911461, + "step": 122 + }, + { + "epoch": 0.019021844190991687, + "grad_norm": 4.859240531921387, + "learning_rate": 3.1701030927835055e-07, + "logits/chosen": 9.398500442504883, + "logits/rejected": 8.594369888305664, + "logps/chosen": -285.2724609375, + "logps/rejected": -272.74517822265625, + "loss": 0.7316, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.037125684320926666, + "rewards/margins": -0.07307553291320801, + "rewards/rejected": 0.03594984859228134, + "step": 123 + }, + { + "epoch": 0.01917649333075585, + "grad_norm": 4.039507865905762, + "learning_rate": 3.1958762886597937e-07, + "logits/chosen": 12.342098236083984, + "logits/rejected": 13.515018463134766, + "logps/chosen": -147.92825317382812, + "logps/rejected": -214.56097412109375, + "loss": 0.6894, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.0400967113673687, + "rewards/margins": 0.008935356512665749, + "rewards/rejected": 0.0311613529920578, + "step": 124 + }, + { + "epoch": 0.019331142470520007, + "grad_norm": 3.4822394847869873, + "learning_rate": 3.2216494845360824e-07, + "logits/chosen": 4.24094295501709, + "logits/rejected": 9.235977172851562, + "logps/chosen": -123.66722106933594, + "logps/rejected": -160.98106384277344, + "loss": 0.6765, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.013030624017119408, + "rewards/margins": 0.035941220819950104, + "rewards/rejected": -0.022910594940185547, + "step": 125 + }, + { + "epoch": 0.01948579161028417, + "grad_norm": 5.78380823135376, + "learning_rate": 3.2474226804123717e-07, + "logits/chosen": 7.895442962646484, + "logits/rejected": 7.685380935668945, + "logps/chosen": -298.37115478515625, + "logps/rejected": -292.2838439941406, + "loss": 0.6899, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.006781863979995251, + "rewards/margins": 0.007892562076449394, + "rewards/rejected": -0.01467442698776722, + "step": 126 + }, + { + "epoch": 0.019640440750048328, + "grad_norm": 4.466803550720215, + "learning_rate": 3.2731958762886604e-07, + "logits/chosen": 10.08015251159668, + "logits/rejected": 11.464984893798828, + "logps/chosen": -205.8199462890625, + "logps/rejected": -266.9953308105469, + "loss": 0.6638, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.02930777333676815, + "rewards/margins": 0.06376784294843674, + "rewards/rejected": -0.03446006774902344, + "step": 127 + }, + { + "epoch": 0.01979508988981249, + "grad_norm": 7.772876262664795, + "learning_rate": 3.2989690721649486e-07, + "logits/chosen": 11.536581039428711, + "logits/rejected": 13.485919952392578, + "logps/chosen": -402.9762878417969, + "logps/rejected": -324.923828125, + "loss": 0.7065, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.07936153560876846, + "rewards/margins": -0.02284088358283043, + "rewards/rejected": 0.1022024154663086, + "step": 128 + }, + { + "epoch": 0.019949739029576648, + "grad_norm": 6.40769624710083, + "learning_rate": 3.3247422680412373e-07, + "logits/chosen": 14.489065170288086, + "logits/rejected": 13.18747329711914, + "logps/chosen": -463.24761962890625, + "logps/rejected": -356.75726318359375, + "loss": 0.6877, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.012290572747588158, + "rewards/margins": 0.017167475074529648, + "rewards/rejected": -0.004876900464296341, + "step": 129 + }, + { + "epoch": 0.02010438816934081, + "grad_norm": 4.053723335266113, + "learning_rate": 3.350515463917526e-07, + "logits/chosen": 2.3003878593444824, + "logits/rejected": 11.692012786865234, + "logps/chosen": -221.3137969970703, + "logps/rejected": -297.11968994140625, + "loss": 0.6608, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.008316326886415482, + "rewards/margins": 0.07795529812574387, + "rewards/rejected": -0.06963896751403809, + "step": 130 + }, + { + "epoch": 0.020259037309104968, + "grad_norm": 6.093809604644775, + "learning_rate": 3.3762886597938147e-07, + "logits/chosen": 12.096522331237793, + "logits/rejected": 2.8824214935302734, + "logps/chosen": -351.3785705566406, + "logps/rejected": -216.28469848632812, + "loss": 0.6721, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.029314802959561348, + "rewards/margins": 0.052289389073848724, + "rewards/rejected": -0.022974587976932526, + "step": 131 + }, + { + "epoch": 0.02041368644886913, + "grad_norm": 3.7649548053741455, + "learning_rate": 3.402061855670103e-07, + "logits/chosen": 16.990177154541016, + "logits/rejected": 15.147708892822266, + "logps/chosen": -196.39218139648438, + "logps/rejected": -199.55445861816406, + "loss": 0.6732, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.0535799041390419, + "rewards/margins": 0.04155528545379639, + "rewards/rejected": 0.01202461775392294, + "step": 132 + }, + { + "epoch": 0.020568335588633288, + "grad_norm": 4.7234416007995605, + "learning_rate": 3.427835051546392e-07, + "logits/chosen": 10.940417289733887, + "logits/rejected": 6.862434387207031, + "logps/chosen": -322.29254150390625, + "logps/rejected": -309.9874267578125, + "loss": 0.6892, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.008367825299501419, + "rewards/margins": 0.009527873247861862, + "rewards/rejected": -0.0011600481811910868, + "step": 133 + }, + { + "epoch": 0.02072298472839745, + "grad_norm": 4.684025287628174, + "learning_rate": 3.453608247422681e-07, + "logits/chosen": 13.74543571472168, + "logits/rejected": 10.144643783569336, + "logps/chosen": -331.08648681640625, + "logps/rejected": -277.6265869140625, + "loss": 0.7038, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.0042053223587572575, + "rewards/margins": -0.019487954676151276, + "rewards/rejected": 0.01528263185173273, + "step": 134 + }, + { + "epoch": 0.020877633868161608, + "grad_norm": 5.05457067489624, + "learning_rate": 3.4793814432989696e-07, + "logits/chosen": 10.365510940551758, + "logits/rejected": 5.414824962615967, + "logps/chosen": -296.9322509765625, + "logps/rejected": -221.81578063964844, + "loss": 0.6748, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.046361736953258514, + "rewards/margins": 0.04304616525769234, + "rewards/rejected": 0.003315567970275879, + "step": 135 + }, + { + "epoch": 0.02103228300792577, + "grad_norm": 4.940082550048828, + "learning_rate": 3.505154639175258e-07, + "logits/chosen": 10.648636817932129, + "logits/rejected": 8.979429244995117, + "logps/chosen": -273.0692138671875, + "logps/rejected": -231.90045166015625, + "loss": 0.6676, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.006854342296719551, + "rewards/margins": 0.057478807866573334, + "rewards/rejected": -0.05062446370720863, + "step": 136 + }, + { + "epoch": 0.02118693214768993, + "grad_norm": 7.012190818786621, + "learning_rate": 3.5309278350515465e-07, + "logits/chosen": 8.252876281738281, + "logits/rejected": 5.904921531677246, + "logps/chosen": -306.2396240234375, + "logps/rejected": -305.6393737792969, + "loss": 0.689, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.02063150331377983, + "rewards/margins": 0.0295425858348608, + "rewards/rejected": -0.00891108624637127, + "step": 137 + }, + { + "epoch": 0.02134158128745409, + "grad_norm": 5.608420372009277, + "learning_rate": 3.556701030927835e-07, + "logits/chosen": 13.231416702270508, + "logits/rejected": 9.319743156433105, + "logps/chosen": -292.5795593261719, + "logps/rejected": -225.84927368164062, + "loss": 0.6754, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.01856699027121067, + "rewards/margins": 0.03866300731897354, + "rewards/rejected": -0.05722999572753906, + "step": 138 + }, + { + "epoch": 0.02149623042721825, + "grad_norm": 5.710394382476807, + "learning_rate": 3.582474226804124e-07, + "logits/chosen": 19.21364974975586, + "logits/rejected": 11.254575729370117, + "logps/chosen": -354.6864013671875, + "logps/rejected": -268.7855224609375, + "loss": 0.7212, + "rewards/accuracies": 0.25, + "rewards/chosen": -0.03736724704504013, + "rewards/margins": -0.051706213504076004, + "rewards/rejected": 0.014338970184326172, + "step": 139 + }, + { + "epoch": 0.02165087956698241, + "grad_norm": 4.571855545043945, + "learning_rate": 3.608247422680412e-07, + "logits/chosen": 14.058659553527832, + "logits/rejected": 10.861377716064453, + "logps/chosen": -280.15386962890625, + "logps/rejected": -282.8993225097656, + "loss": 0.7012, + "rewards/accuracies": 0.25, + "rewards/chosen": 0.04230537265539169, + "rewards/margins": -0.011055950075387955, + "rewards/rejected": 0.05336131900548935, + "step": 140 + }, + { + "epoch": 0.02180552870674657, + "grad_norm": 5.263911247253418, + "learning_rate": 3.6340206185567014e-07, + "logits/chosen": 7.461472034454346, + "logits/rejected": 1.0462312698364258, + "logps/chosen": -330.1807861328125, + "logps/rejected": -182.9571533203125, + "loss": 0.6717, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.008874009363353252, + "rewards/margins": 0.04460351541638374, + "rewards/rejected": -0.03572950139641762, + "step": 141 + }, + { + "epoch": 0.02196017784651073, + "grad_norm": 4.290580749511719, + "learning_rate": 3.65979381443299e-07, + "logits/chosen": 7.0978007316589355, + "logits/rejected": 7.093575954437256, + "logps/chosen": -177.14627075195312, + "logps/rejected": -217.9191436767578, + "loss": 0.6974, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.030620671808719635, + "rewards/margins": -0.0039480216801166534, + "rewards/rejected": -0.026672648265957832, + "step": 142 + }, + { + "epoch": 0.02211482698627489, + "grad_norm": 3.931922197341919, + "learning_rate": 3.685567010309279e-07, + "logits/chosen": 8.608698844909668, + "logits/rejected": 5.749289035797119, + "logps/chosen": -220.7203369140625, + "logps/rejected": -195.58392333984375, + "loss": 0.6808, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.016330527141690254, + "rewards/margins": 0.028108548372983932, + "rewards/rejected": -0.011778019368648529, + "step": 143 + }, + { + "epoch": 0.02226947612603905, + "grad_norm": 4.380866527557373, + "learning_rate": 3.7113402061855675e-07, + "logits/chosen": 12.03700065612793, + "logits/rejected": 9.799747467041016, + "logps/chosen": -183.94830322265625, + "logps/rejected": -159.5745086669922, + "loss": 0.6842, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.041863348335027695, + "rewards/margins": 0.02260288968682289, + "rewards/rejected": 0.019260454922914505, + "step": 144 + }, + { + "epoch": 0.02242412526580321, + "grad_norm": 13.73681354522705, + "learning_rate": 3.737113402061856e-07, + "logits/chosen": 13.341185569763184, + "logits/rejected": 4.3040289878845215, + "logps/chosen": -417.81817626953125, + "logps/rejected": -280.46783447265625, + "loss": 0.6873, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.014989567920565605, + "rewards/margins": 0.014561032876372337, + "rewards/rejected": 0.0004285350441932678, + "step": 145 + }, + { + "epoch": 0.02257877440556737, + "grad_norm": 5.867197513580322, + "learning_rate": 3.7628865979381445e-07, + "logits/chosen": 7.845378398895264, + "logits/rejected": 7.68946647644043, + "logps/chosen": -204.5855255126953, + "logps/rejected": -263.0268859863281, + "loss": 0.6922, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.04747460037469864, + "rewards/margins": 0.003543257713317871, + "rewards/rejected": -0.05101785808801651, + "step": 146 + }, + { + "epoch": 0.02273342354533153, + "grad_norm": 7.118080139160156, + "learning_rate": 3.788659793814433e-07, + "logits/chosen": 15.928805351257324, + "logits/rejected": 6.229506015777588, + "logps/chosen": -341.1634826660156, + "logps/rejected": -129.547119140625, + "loss": 0.7244, + "rewards/accuracies": 0.25, + "rewards/chosen": -0.04797687754034996, + "rewards/margins": -0.05064563825726509, + "rewards/rejected": 0.00266876257956028, + "step": 147 + }, + { + "epoch": 0.02288807268509569, + "grad_norm": 7.006904602050781, + "learning_rate": 3.8144329896907224e-07, + "logits/chosen": 13.481695175170898, + "logits/rejected": 8.497061729431152, + "logps/chosen": -340.57769775390625, + "logps/rejected": -283.00592041015625, + "loss": 0.6955, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.00449838861823082, + "rewards/margins": 0.0006693825125694275, + "rewards/rejected": -0.005167771130800247, + "step": 148 + }, + { + "epoch": 0.02304272182485985, + "grad_norm": 9.270798683166504, + "learning_rate": 3.8402061855670106e-07, + "logits/chosen": 6.4740190505981445, + "logits/rejected": 0.8017644286155701, + "logps/chosen": -213.53318786621094, + "logps/rejected": -273.9697265625, + "loss": 0.6893, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.03169379383325577, + "rewards/margins": 0.014923380687832832, + "rewards/rejected": 0.016770416870713234, + "step": 149 + }, + { + "epoch": 0.023197370964624008, + "grad_norm": 3.4471242427825928, + "learning_rate": 3.8659793814432993e-07, + "logits/chosen": 9.30126953125, + "logits/rejected": 6.892515182495117, + "logps/chosen": -193.80108642578125, + "logps/rejected": -131.40599060058594, + "loss": 0.7022, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.009104728698730469, + "rewards/margins": -0.01559281162917614, + "rewards/rejected": 0.00648808479309082, + "step": 150 + }, + { + "epoch": 0.02335202010438817, + "grad_norm": 5.857764720916748, + "learning_rate": 3.891752577319588e-07, + "logits/chosen": 9.284612655639648, + "logits/rejected": 6.896724224090576, + "logps/chosen": -316.2874450683594, + "logps/rejected": -264.9364013671875, + "loss": 0.6794, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.0007912623696029186, + "rewards/margins": 0.03216905891895294, + "rewards/rejected": -0.03137779235839844, + "step": 151 + }, + { + "epoch": 0.023506669244152328, + "grad_norm": 4.955041885375977, + "learning_rate": 3.917525773195877e-07, + "logits/chosen": 7.255969047546387, + "logits/rejected": 12.712529182434082, + "logps/chosen": -261.017333984375, + "logps/rejected": -297.79144287109375, + "loss": 0.6967, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.02873850055038929, + "rewards/margins": -0.004970931448042393, + "rewards/rejected": -0.023767568171024323, + "step": 152 + }, + { + "epoch": 0.02366131838391649, + "grad_norm": 6.317713260650635, + "learning_rate": 3.943298969072165e-07, + "logits/chosen": 11.194629669189453, + "logits/rejected": 10.033746719360352, + "logps/chosen": -334.9357604980469, + "logps/rejected": -294.2626953125, + "loss": 0.7025, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.029807284474372864, + "rewards/margins": -0.008442975580692291, + "rewards/rejected": -0.021364308893680573, + "step": 153 + }, + { + "epoch": 0.023815967523680648, + "grad_norm": 4.404287338256836, + "learning_rate": 3.9690721649484537e-07, + "logits/chosen": 14.251007080078125, + "logits/rejected": 7.5094709396362305, + "logps/chosen": -293.74658203125, + "logps/rejected": -288.35394287109375, + "loss": 0.658, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.06871271133422852, + "rewards/margins": 0.07606048882007599, + "rewards/rejected": -0.007347774691879749, + "step": 154 + }, + { + "epoch": 0.02397061666344481, + "grad_norm": 4.846568584442139, + "learning_rate": 3.9948453608247424e-07, + "logits/chosen": 6.9730119705200195, + "logits/rejected": 6.351569175720215, + "logps/chosen": -270.74835205078125, + "logps/rejected": -237.1637420654297, + "loss": 0.7098, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.035985566675662994, + "rewards/margins": -0.03157832846045494, + "rewards/rejected": -0.004407238215208054, + "step": 155 + }, + { + "epoch": 0.024125265803208968, + "grad_norm": 6.149508953094482, + "learning_rate": 4.0206185567010316e-07, + "logits/chosen": 7.503884792327881, + "logits/rejected": 5.384027004241943, + "logps/chosen": -232.51004028320312, + "logps/rejected": -208.80050659179688, + "loss": 0.699, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.012560082599520683, + "rewards/margins": -0.008591175079345703, + "rewards/rejected": -0.0039689065888524055, + "step": 156 + }, + { + "epoch": 0.02427991494297313, + "grad_norm": 7.02464485168457, + "learning_rate": 4.0463917525773204e-07, + "logits/chosen": 12.675241470336914, + "logits/rejected": 13.16816234588623, + "logps/chosen": -362.6820068359375, + "logps/rejected": -485.51751708984375, + "loss": 0.7168, + "rewards/accuracies": 0.25, + "rewards/chosen": 0.007289508357644081, + "rewards/margins": -0.034657854586839676, + "rewards/rejected": 0.041947364807128906, + "step": 157 + }, + { + "epoch": 0.024434564082737288, + "grad_norm": 3.8207221031188965, + "learning_rate": 4.0721649484536085e-07, + "logits/chosen": 7.228775978088379, + "logits/rejected": 6.786842346191406, + "logps/chosen": -178.40481567382812, + "logps/rejected": -187.9625244140625, + "loss": 0.6889, + "rewards/accuracies": 0.25, + "rewards/chosen": 0.030375385656952858, + "rewards/margins": 0.010330486111342907, + "rewards/rejected": 0.020044900476932526, + "step": 158 + }, + { + "epoch": 0.02458921322250145, + "grad_norm": 4.20653772354126, + "learning_rate": 4.0979381443298973e-07, + "logits/chosen": 3.8242712020874023, + "logits/rejected": 5.5489091873168945, + "logps/chosen": -272.1354675292969, + "logps/rejected": -266.16693115234375, + "loss": 0.6443, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.05270209163427353, + "rewards/margins": 0.10416822880506516, + "rewards/rejected": -0.05146613344550133, + "step": 159 + }, + { + "epoch": 0.02474386236226561, + "grad_norm": 4.795764446258545, + "learning_rate": 4.123711340206186e-07, + "logits/chosen": 4.660576820373535, + "logits/rejected": 7.1966328620910645, + "logps/chosen": -228.36221313476562, + "logps/rejected": -224.59349060058594, + "loss": 0.6512, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.03494858741760254, + "rewards/margins": 0.08641910552978516, + "rewards/rejected": -0.05147051811218262, + "step": 160 + }, + { + "epoch": 0.02489851150202977, + "grad_norm": 5.600543975830078, + "learning_rate": 4.149484536082474e-07, + "logits/chosen": 9.707596778869629, + "logits/rejected": 6.820638656616211, + "logps/chosen": -264.181396484375, + "logps/rejected": -214.16131591796875, + "loss": 0.7034, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.0313660129904747, + "rewards/margins": -0.01497330516576767, + "rewards/rejected": -0.016392705962061882, + "step": 161 + }, + { + "epoch": 0.02505316064179393, + "grad_norm": 6.325645446777344, + "learning_rate": 4.175257731958763e-07, + "logits/chosen": 11.53718376159668, + "logits/rejected": 14.02033805847168, + "logps/chosen": -348.6204833984375, + "logps/rejected": -331.4289245605469, + "loss": 0.6892, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.007296848110854626, + "rewards/margins": 0.010853290557861328, + "rewards/rejected": -0.01815013960003853, + "step": 162 + }, + { + "epoch": 0.02520780978155809, + "grad_norm": 4.813772678375244, + "learning_rate": 4.201030927835052e-07, + "logits/chosen": 15.513379096984863, + "logits/rejected": 9.078393936157227, + "logps/chosen": -295.4510498046875, + "logps/rejected": -229.6817626953125, + "loss": 0.7497, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.03236312419176102, + "rewards/margins": -0.09920233488082886, + "rewards/rejected": 0.06683921813964844, + "step": 163 + }, + { + "epoch": 0.02536245892132225, + "grad_norm": 5.510994911193848, + "learning_rate": 4.226804123711341e-07, + "logits/chosen": 10.846382141113281, + "logits/rejected": 11.770069122314453, + "logps/chosen": -189.88009643554688, + "logps/rejected": -156.14712524414062, + "loss": 0.655, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.05986461788415909, + "rewards/margins": 0.08025836944580078, + "rewards/rejected": -0.020393753424286842, + "step": 164 + }, + { + "epoch": 0.02551710806108641, + "grad_norm": 4.027902126312256, + "learning_rate": 4.2525773195876296e-07, + "logits/chosen": 10.94518756866455, + "logits/rejected": 9.858892440795898, + "logps/chosen": -273.0108337402344, + "logps/rejected": -244.79905700683594, + "loss": 0.6941, + "rewards/accuracies": 0.25, + "rewards/chosen": -0.012119006365537643, + "rewards/margins": 0.005785701796412468, + "rewards/rejected": -0.01790471374988556, + "step": 165 + }, + { + "epoch": 0.02567175720085057, + "grad_norm": 4.807586669921875, + "learning_rate": 4.278350515463918e-07, + "logits/chosen": 8.141124725341797, + "logits/rejected": 6.653397560119629, + "logps/chosen": -297.3433532714844, + "logps/rejected": -242.2059326171875, + "loss": 0.6815, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.03848304599523544, + "rewards/margins": 0.025315478444099426, + "rewards/rejected": 0.013167573139071465, + "step": 166 + }, + { + "epoch": 0.02582640634061473, + "grad_norm": 4.420519828796387, + "learning_rate": 4.3041237113402065e-07, + "logits/chosen": 11.321237564086914, + "logits/rejected": 11.26107120513916, + "logps/chosen": -194.85035705566406, + "logps/rejected": -196.01620483398438, + "loss": 0.7045, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.0019239448010921478, + "rewards/margins": -0.017923161387443542, + "rewards/rejected": 0.01984710618853569, + "step": 167 + }, + { + "epoch": 0.02598105548037889, + "grad_norm": 6.189317226409912, + "learning_rate": 4.329896907216495e-07, + "logits/chosen": 12.190542221069336, + "logits/rejected": 11.473383903503418, + "logps/chosen": -288.4436950683594, + "logps/rejected": -383.51922607421875, + "loss": 0.7033, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.0007077232003211975, + "rewards/margins": -0.017600730061531067, + "rewards/rejected": 0.018308449536561966, + "step": 168 + }, + { + "epoch": 0.02613570462014305, + "grad_norm": 4.858217239379883, + "learning_rate": 4.3556701030927834e-07, + "logits/chosen": 13.371498107910156, + "logits/rejected": 8.25893783569336, + "logps/chosen": -208.8777618408203, + "logps/rejected": -223.69036865234375, + "loss": 0.7022, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.03538203611969948, + "rewards/margins": -0.01615438610315323, + "rewards/rejected": -0.01922765001654625, + "step": 169 + }, + { + "epoch": 0.02629035375990721, + "grad_norm": 6.036569595336914, + "learning_rate": 4.381443298969072e-07, + "logits/chosen": 8.494549751281738, + "logits/rejected": 9.896512031555176, + "logps/chosen": -156.01968383789062, + "logps/rejected": -187.5347442626953, + "loss": 0.7163, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.0037443162873387337, + "rewards/margins": -0.04112754017114639, + "rewards/rejected": 0.037383221089839935, + "step": 170 + }, + { + "epoch": 0.02644500289967137, + "grad_norm": 5.20986795425415, + "learning_rate": 4.4072164948453614e-07, + "logits/chosen": 13.111475944519043, + "logits/rejected": 6.484719753265381, + "logps/chosen": -409.5218505859375, + "logps/rejected": -234.3643035888672, + "loss": 0.683, + "rewards/accuracies": 0.25, + "rewards/chosen": -0.01825409010052681, + "rewards/margins": 0.024442581459879875, + "rewards/rejected": -0.04269666597247124, + "step": 171 + }, + { + "epoch": 0.02659965203943553, + "grad_norm": 5.2855658531188965, + "learning_rate": 4.43298969072165e-07, + "logits/chosen": 4.768103122711182, + "logits/rejected": 6.6989593505859375, + "logps/chosen": -260.90045166015625, + "logps/rejected": -291.176513671875, + "loss": 0.6919, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.02348909340798855, + "rewards/margins": 0.013461204245686531, + "rewards/rejected": 0.010027887299656868, + "step": 172 + }, + { + "epoch": 0.02675430117919969, + "grad_norm": 6.277451992034912, + "learning_rate": 4.458762886597939e-07, + "logits/chosen": 10.987936973571777, + "logits/rejected": 13.236594200134277, + "logps/chosen": -335.50177001953125, + "logps/rejected": -352.0394287109375, + "loss": 0.6882, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.005882358178496361, + "rewards/margins": 0.01384363230317831, + "rewards/rejected": -0.019725989550352097, + "step": 173 + }, + { + "epoch": 0.02690895031896385, + "grad_norm": 4.070093154907227, + "learning_rate": 4.484536082474227e-07, + "logits/chosen": 6.1086859703063965, + "logits/rejected": 6.355241775512695, + "logps/chosen": -255.8524932861328, + "logps/rejected": -230.45578002929688, + "loss": 0.7045, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.0077735427767038345, + "rewards/margins": -0.017763851210474968, + "rewards/rejected": 0.009990310296416283, + "step": 174 + }, + { + "epoch": 0.02706359945872801, + "grad_norm": 7.5643181800842285, + "learning_rate": 4.5103092783505157e-07, + "logits/chosen": 8.44739055633545, + "logits/rejected": 8.935912132263184, + "logps/chosen": -179.26397705078125, + "logps/rejected": -202.01805114746094, + "loss": 0.6886, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.017644383013248444, + "rewards/margins": 0.01310274749994278, + "rewards/rejected": 0.004541635047644377, + "step": 175 + }, + { + "epoch": 0.02721824859849217, + "grad_norm": 4.193178653717041, + "learning_rate": 4.5360824742268044e-07, + "logits/chosen": 8.576276779174805, + "logits/rejected": 11.404003143310547, + "logps/chosen": -219.38568115234375, + "logps/rejected": -182.7236328125, + "loss": 0.7097, + "rewards/accuracies": 0.25, + "rewards/chosen": -0.002615641802549362, + "rewards/margins": -0.031035995110869408, + "rewards/rejected": 0.028420355170965195, + "step": 176 + }, + { + "epoch": 0.02737289773825633, + "grad_norm": 4.314047813415527, + "learning_rate": 4.561855670103093e-07, + "logits/chosen": 6.486111164093018, + "logits/rejected": 14.272531509399414, + "logps/chosen": -110.76365661621094, + "logps/rejected": -200.30520629882812, + "loss": 0.7035, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.014495277777314186, + "rewards/margins": -0.0194854736328125, + "rewards/rejected": 0.03398074954748154, + "step": 177 + }, + { + "epoch": 0.02752754687802049, + "grad_norm": 9.834793090820312, + "learning_rate": 4.5876288659793813e-07, + "logits/chosen": 11.759220123291016, + "logits/rejected": 13.046407699584961, + "logps/chosen": -258.09619140625, + "logps/rejected": -273.06842041015625, + "loss": 0.7337, + "rewards/accuracies": 0.25, + "rewards/chosen": -0.057515814900398254, + "rewards/margins": -0.07578323036432266, + "rewards/rejected": 0.018267419189214706, + "step": 178 + }, + { + "epoch": 0.02768219601778465, + "grad_norm": 4.0035786628723145, + "learning_rate": 4.6134020618556706e-07, + "logits/chosen": 4.870166301727295, + "logits/rejected": 7.615548133850098, + "logps/chosen": -199.38555908203125, + "logps/rejected": -275.7410888671875, + "loss": 0.701, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.01069631427526474, + "rewards/margins": -0.014157723635435104, + "rewards/rejected": 0.0034614093601703644, + "step": 179 + }, + { + "epoch": 0.02783684515754881, + "grad_norm": 7.824204444885254, + "learning_rate": 4.6391752577319593e-07, + "logits/chosen": 8.274697303771973, + "logits/rejected": 12.119913101196289, + "logps/chosen": -322.6107482910156, + "logps/rejected": -353.1990966796875, + "loss": 0.6565, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.03774423524737358, + "rewards/margins": 0.07958593219518661, + "rewards/rejected": -0.04184170067310333, + "step": 180 + }, + { + "epoch": 0.02799149429731297, + "grad_norm": 7.327850341796875, + "learning_rate": 4.664948453608248e-07, + "logits/chosen": 13.005859375, + "logits/rejected": 8.920368194580078, + "logps/chosen": -304.26446533203125, + "logps/rejected": -248.5641632080078, + "loss": 0.6879, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.0012749661691486835, + "rewards/margins": 0.018329523503780365, + "rewards/rejected": -0.017054561525583267, + "step": 181 + }, + { + "epoch": 0.02814614343707713, + "grad_norm": 4.1393256187438965, + "learning_rate": 4.690721649484536e-07, + "logits/chosen": 9.603414535522461, + "logits/rejected": 6.550224304199219, + "logps/chosen": -229.20713806152344, + "logps/rejected": -226.37852478027344, + "loss": 0.7114, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.019595623016357422, + "rewards/margins": -0.029221773147583008, + "rewards/rejected": 0.009626151993870735, + "step": 182 + }, + { + "epoch": 0.02830079257684129, + "grad_norm": 4.45011568069458, + "learning_rate": 4.716494845360825e-07, + "logits/chosen": 7.681704044342041, + "logits/rejected": 5.8652238845825195, + "logps/chosen": -272.65020751953125, + "logps/rejected": -244.9304962158203, + "loss": 0.6791, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.0519106425344944, + "rewards/margins": 0.03381602466106415, + "rewards/rejected": 0.018094610422849655, + "step": 183 + }, + { + "epoch": 0.02845544171660545, + "grad_norm": 7.654642105102539, + "learning_rate": 4.7422680412371136e-07, + "logits/chosen": 9.277904510498047, + "logits/rejected": 9.229616165161133, + "logps/chosen": -276.8092346191406, + "logps/rejected": -291.9592590332031, + "loss": 0.6564, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.031248953193426132, + "rewards/margins": 0.07713842391967773, + "rewards/rejected": -0.0458894744515419, + "step": 184 + }, + { + "epoch": 0.028610090856369612, + "grad_norm": 6.350154399871826, + "learning_rate": 4.7680412371134024e-07, + "logits/chosen": 16.259220123291016, + "logits/rejected": 13.335022926330566, + "logps/chosen": -268.2291259765625, + "logps/rejected": -270.746337890625, + "loss": 0.7032, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.030719853937625885, + "rewards/margins": -0.015138531103730202, + "rewards/rejected": -0.015581320971250534, + "step": 185 + }, + { + "epoch": 0.02876473999613377, + "grad_norm": 5.605805397033691, + "learning_rate": 4.793814432989691e-07, + "logits/chosen": 7.675743579864502, + "logits/rejected": 4.859321117401123, + "logps/chosen": -338.41265869140625, + "logps/rejected": -347.5958557128906, + "loss": 0.5978, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.07572059333324432, + "rewards/margins": 0.21087026596069336, + "rewards/rejected": -0.13514965772628784, + "step": 186 + }, + { + "epoch": 0.028919389135897932, + "grad_norm": 5.362581729888916, + "learning_rate": 4.81958762886598e-07, + "logits/chosen": 15.630083084106445, + "logits/rejected": 13.24819564819336, + "logps/chosen": -297.172119140625, + "logps/rejected": -249.78953552246094, + "loss": 0.7117, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.012043287977576256, + "rewards/margins": -0.03339576721191406, + "rewards/rejected": 0.021352481096982956, + "step": 187 + }, + { + "epoch": 0.02907403827566209, + "grad_norm": 4.199508190155029, + "learning_rate": 4.845360824742269e-07, + "logits/chosen": 9.092329025268555, + "logits/rejected": 10.078535079956055, + "logps/chosen": -225.06993103027344, + "logps/rejected": -267.7950439453125, + "loss": 0.7125, + "rewards/accuracies": 0.25, + "rewards/chosen": -0.010802840813994408, + "rewards/margins": -0.03644533455371857, + "rewards/rejected": 0.02564249187707901, + "step": 188 + }, + { + "epoch": 0.029228687415426252, + "grad_norm": 3.7857048511505127, + "learning_rate": 4.871134020618557e-07, + "logits/chosen": 0.11447806656360626, + "logits/rejected": 2.68275785446167, + "logps/chosen": -143.5679168701172, + "logps/rejected": -192.89923095703125, + "loss": 0.719, + "rewards/accuracies": 0.25, + "rewards/chosen": -0.035102128982543945, + "rewards/margins": -0.04992819204926491, + "rewards/rejected": 0.014826059341430664, + "step": 189 + }, + { + "epoch": 0.02938333655519041, + "grad_norm": 6.220125675201416, + "learning_rate": 4.896907216494846e-07, + "logits/chosen": 12.432947158813477, + "logits/rejected": 12.735036849975586, + "logps/chosen": -249.94854736328125, + "logps/rejected": -289.95025634765625, + "loss": 0.6844, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.005805587396025658, + "rewards/margins": 0.02017221227288246, + "rewards/rejected": -0.014366628602147102, + "step": 190 + }, + { + "epoch": 0.029537985694954572, + "grad_norm": 4.018984317779541, + "learning_rate": 4.922680412371135e-07, + "logits/chosen": 11.0584716796875, + "logits/rejected": 8.789422988891602, + "logps/chosen": -234.2735595703125, + "logps/rejected": -233.0352783203125, + "loss": 0.7069, + "rewards/accuracies": 0.25, + "rewards/chosen": 0.01151285320520401, + "rewards/margins": -0.023624751716852188, + "rewards/rejected": 0.0351376049220562, + "step": 191 + }, + { + "epoch": 0.02969263483471873, + "grad_norm": 5.126651763916016, + "learning_rate": 4.948453608247422e-07, + "logits/chosen": 10.567253112792969, + "logits/rejected": 4.80112361907959, + "logps/chosen": -329.8632507324219, + "logps/rejected": -254.94168090820312, + "loss": 0.6513, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.06606252491474152, + "rewards/margins": 0.09130598604679108, + "rewards/rejected": -0.025243476033210754, + "step": 192 + }, + { + "epoch": 0.029847283974482892, + "grad_norm": 4.853884220123291, + "learning_rate": 4.974226804123711e-07, + "logits/chosen": 15.442766189575195, + "logits/rejected": 9.313232421875, + "logps/chosen": -335.0982971191406, + "logps/rejected": -212.04495239257812, + "loss": 0.7052, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.051625922322273254, + "rewards/margins": -0.019226882606744766, + "rewards/rejected": -0.03239903599023819, + "step": 193 + }, + { + "epoch": 0.03000193311424705, + "grad_norm": 3.774740695953369, + "learning_rate": 5.000000000000001e-07, + "logits/chosen": 4.590182304382324, + "logits/rejected": 8.030871391296387, + "logps/chosen": -155.51739501953125, + "logps/rejected": -200.157958984375, + "loss": 0.6698, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.03606202453374863, + "rewards/margins": 0.05163295194506645, + "rewards/rejected": -0.015570924617350101, + "step": 194 + }, + { + "epoch": 0.030156582254011213, + "grad_norm": 5.68532133102417, + "learning_rate": 5.02577319587629e-07, + "logits/chosen": 10.800435066223145, + "logits/rejected": 5.137495040893555, + "logps/chosen": -477.038818359375, + "logps/rejected": -351.06561279296875, + "loss": 0.6281, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.04695453867316246, + "rewards/margins": 0.1407017707824707, + "rewards/rejected": -0.09374723583459854, + "step": 195 + }, + { + "epoch": 0.03031123139377537, + "grad_norm": 5.503288269042969, + "learning_rate": 5.051546391752578e-07, + "logits/chosen": 8.266733169555664, + "logits/rejected": 8.95901870727539, + "logps/chosen": -233.74664306640625, + "logps/rejected": -250.41323852539062, + "loss": 0.7025, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.005411338992416859, + "rewards/margins": -0.01662764512002468, + "rewards/rejected": 0.011216306127607822, + "step": 196 + }, + { + "epoch": 0.030465880533539533, + "grad_norm": 5.579014778137207, + "learning_rate": 5.077319587628866e-07, + "logits/chosen": 9.309625625610352, + "logits/rejected": 3.965503692626953, + "logps/chosen": -299.0317077636719, + "logps/rejected": -270.5412902832031, + "loss": 0.6949, + "rewards/accuracies": 0.25, + "rewards/chosen": 0.04466180503368378, + "rewards/margins": 0.0016595353372395039, + "rewards/rejected": 0.04300227016210556, + "step": 197 + }, + { + "epoch": 0.03062052967330369, + "grad_norm": 5.246983528137207, + "learning_rate": 5.103092783505155e-07, + "logits/chosen": 12.451875686645508, + "logits/rejected": 13.854093551635742, + "logps/chosen": -242.00267028808594, + "logps/rejected": -255.1998291015625, + "loss": 0.7272, + "rewards/accuracies": 0.25, + "rewards/chosen": -0.02913236990571022, + "rewards/margins": -0.06588325649499893, + "rewards/rejected": 0.03675089031457901, + "step": 198 + }, + { + "epoch": 0.030775178813067853, + "grad_norm": 4.322015762329102, + "learning_rate": 5.128865979381443e-07, + "logits/chosen": 10.467855453491211, + "logits/rejected": 5.929073333740234, + "logps/chosen": -309.23040771484375, + "logps/rejected": -255.4852294921875, + "loss": 0.6511, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.05621838942170143, + "rewards/margins": 0.08871493488550186, + "rewards/rejected": -0.03249654546380043, + "step": 199 + }, + { + "epoch": 0.03092982795283201, + "grad_norm": 5.768648147583008, + "learning_rate": 5.154639175257732e-07, + "logits/chosen": 8.117884635925293, + "logits/rejected": 2.875415802001953, + "logps/chosen": -289.96588134765625, + "logps/rejected": -215.07467651367188, + "loss": 0.7327, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.02657940238714218, + "rewards/margins": -0.07221557199954987, + "rewards/rejected": 0.04563617706298828, + "step": 200 + }, + { + "epoch": 0.031084477092596173, + "grad_norm": 5.3595075607299805, + "learning_rate": 5.180412371134022e-07, + "logits/chosen": 12.535530090332031, + "logits/rejected": 8.698311805725098, + "logps/chosen": -297.65924072265625, + "logps/rejected": -240.9003448486328, + "loss": 0.6846, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.03645692020654678, + "rewards/margins": 0.02216940000653267, + "rewards/rejected": 0.01428751926869154, + "step": 201 + }, + { + "epoch": 0.03123912623236033, + "grad_norm": 5.111512184143066, + "learning_rate": 5.20618556701031e-07, + "logits/chosen": 9.871399879455566, + "logits/rejected": 14.595333099365234, + "logps/chosen": -206.845703125, + "logps/rejected": -191.87661743164062, + "loss": 0.7088, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.005719520151615143, + "rewards/margins": -0.029749490320682526, + "rewards/rejected": 0.03546901047229767, + "step": 202 + }, + { + "epoch": 0.03139377537212449, + "grad_norm": 5.261501312255859, + "learning_rate": 5.231958762886598e-07, + "logits/chosen": 7.539558410644531, + "logits/rejected": 6.129846572875977, + "logps/chosen": -231.51771545410156, + "logps/rejected": -193.36862182617188, + "loss": 0.7117, + "rewards/accuracies": 0.25, + "rewards/chosen": 0.005268574692308903, + "rewards/margins": -0.03355374187231064, + "rewards/rejected": 0.03882231563329697, + "step": 203 + }, + { + "epoch": 0.031548424511888655, + "grad_norm": 5.315875053405762, + "learning_rate": 5.257731958762887e-07, + "logits/chosen": 10.683394432067871, + "logits/rejected": 5.820173740386963, + "logps/chosen": -249.38955688476562, + "logps/rejected": -200.80557250976562, + "loss": 0.7192, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.012097455561161041, + "rewards/margins": -0.04487933963537216, + "rewards/rejected": 0.0569767951965332, + "step": 204 + }, + { + "epoch": 0.03170307365165281, + "grad_norm": 4.840034484863281, + "learning_rate": 5.283505154639176e-07, + "logits/chosen": 13.294004440307617, + "logits/rejected": 4.9387664794921875, + "logps/chosen": -343.0023193359375, + "logps/rejected": -164.44154357910156, + "loss": 0.6756, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.02777595818042755, + "rewards/margins": 0.038086939603090286, + "rewards/rejected": -0.010310984216630459, + "step": 205 + }, + { + "epoch": 0.03185772279141697, + "grad_norm": 8.74624252319336, + "learning_rate": 5.309278350515464e-07, + "logits/chosen": 7.686714172363281, + "logits/rejected": 9.148231506347656, + "logps/chosen": -210.98745727539062, + "logps/rejected": -289.07427978515625, + "loss": 0.7007, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.01983022876083851, + "rewards/margins": -0.009432818740606308, + "rewards/rejected": -0.0103974100202322, + "step": 206 + }, + { + "epoch": 0.03201237193118113, + "grad_norm": 5.937737464904785, + "learning_rate": 5.335051546391753e-07, + "logits/chosen": 7.729095458984375, + "logits/rejected": 5.516740798950195, + "logps/chosen": -329.46875, + "logps/rejected": -308.0854187011719, + "loss": 0.6882, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.0182005874812603, + "rewards/margins": 0.010803531855344772, + "rewards/rejected": 0.007397056557238102, + "step": 207 + }, + { + "epoch": 0.032167021070945295, + "grad_norm": 3.5201690196990967, + "learning_rate": 5.360824742268042e-07, + "logits/chosen": 12.320011138916016, + "logits/rejected": 7.052798271179199, + "logps/chosen": -216.28662109375, + "logps/rejected": -148.6501922607422, + "loss": 0.6765, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.0409000888466835, + "rewards/margins": 0.041211411356925964, + "rewards/rejected": -0.0003113262355327606, + "step": 208 + }, + { + "epoch": 0.032321670210709454, + "grad_norm": 3.8653488159179688, + "learning_rate": 5.386597938144331e-07, + "logits/chosen": 11.556921005249023, + "logits/rejected": 12.459211349487305, + "logps/chosen": -202.27687072753906, + "logps/rejected": -191.4716796875, + "loss": 0.6586, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.004669379908591509, + "rewards/margins": 0.07291088253259659, + "rewards/rejected": -0.06824149936437607, + "step": 209 + }, + { + "epoch": 0.03247631935047361, + "grad_norm": 7.129583835601807, + "learning_rate": 5.412371134020619e-07, + "logits/chosen": 7.569952964782715, + "logits/rejected": 2.0636281967163086, + "logps/chosen": -320.1714172363281, + "logps/rejected": -294.3085021972656, + "loss": 0.7065, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.02710290253162384, + "rewards/margins": -0.020142219960689545, + "rewards/rejected": -0.006960679776966572, + "step": 210 + }, + { + "epoch": 0.03263096849023777, + "grad_norm": 4.896057605743408, + "learning_rate": 5.438144329896908e-07, + "logits/chosen": 14.750947952270508, + "logits/rejected": 12.198291778564453, + "logps/chosen": -309.416259765625, + "logps/rejected": -275.0567626953125, + "loss": 0.6486, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.0052623748779296875, + "rewards/margins": 0.09564967453479767, + "rewards/rejected": -0.09038729965686798, + "step": 211 + }, + { + "epoch": 0.032785617630001936, + "grad_norm": 5.7905168533325195, + "learning_rate": 5.463917525773197e-07, + "logits/chosen": 5.347293853759766, + "logits/rejected": 5.936334133148193, + "logps/chosen": -276.7212829589844, + "logps/rejected": -216.69508361816406, + "loss": 0.7259, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.05456046760082245, + "rewards/margins": -0.05719555914402008, + "rewards/rejected": 0.002635098062455654, + "step": 212 + }, + { + "epoch": 0.032940266769766094, + "grad_norm": 6.859431743621826, + "learning_rate": 5.489690721649485e-07, + "logits/chosen": 9.798736572265625, + "logits/rejected": 7.827033996582031, + "logps/chosen": -310.09796142578125, + "logps/rejected": -202.17762756347656, + "loss": 0.6818, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.030898097902536392, + "rewards/margins": 0.033220648765563965, + "rewards/rejected": -0.0023225508630275726, + "step": 213 + }, + { + "epoch": 0.03309491590953025, + "grad_norm": 4.4510273933410645, + "learning_rate": 5.515463917525773e-07, + "logits/chosen": 8.931305885314941, + "logits/rejected": 7.163075923919678, + "logps/chosen": -142.5396728515625, + "logps/rejected": -132.72410583496094, + "loss": 0.6885, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.01383285690099001, + "rewards/margins": 0.01216497365385294, + "rewards/rejected": 0.0016678813844919205, + "step": 214 + }, + { + "epoch": 0.03324956504929441, + "grad_norm": 4.577381134033203, + "learning_rate": 5.541237113402062e-07, + "logits/chosen": 6.717674255371094, + "logits/rejected": 8.618230819702148, + "logps/chosen": -213.08270263671875, + "logps/rejected": -289.16326904296875, + "loss": 0.7375, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.04158239811658859, + "rewards/margins": -0.07986173778772354, + "rewards/rejected": 0.03827934339642525, + "step": 215 + }, + { + "epoch": 0.033404214189058576, + "grad_norm": 6.901246070861816, + "learning_rate": 5.567010309278352e-07, + "logits/chosen": 10.95913028717041, + "logits/rejected": 10.665098190307617, + "logps/chosen": -184.7791748046875, + "logps/rejected": -208.35800170898438, + "loss": 0.6516, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.03146687150001526, + "rewards/margins": 0.08955011516809464, + "rewards/rejected": -0.05808325111865997, + "step": 216 + }, + { + "epoch": 0.033558863328822734, + "grad_norm": 5.082674026489258, + "learning_rate": 5.59278350515464e-07, + "logits/chosen": 14.751993179321289, + "logits/rejected": 6.697968482971191, + "logps/chosen": -324.7636413574219, + "logps/rejected": -241.14578247070312, + "loss": 0.6883, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.048441458493471146, + "rewards/margins": 0.015271376818418503, + "rewards/rejected": -0.06371283531188965, + "step": 217 + }, + { + "epoch": 0.03371351246858689, + "grad_norm": 3.8480961322784424, + "learning_rate": 5.618556701030928e-07, + "logits/chosen": 10.180124282836914, + "logits/rejected": 12.340204238891602, + "logps/chosen": -134.39517211914062, + "logps/rejected": -149.95010375976562, + "loss": 0.7153, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.004220293834805489, + "rewards/margins": -0.04071822762489319, + "rewards/rejected": 0.03649792820215225, + "step": 218 + }, + { + "epoch": 0.03386816160835105, + "grad_norm": 4.407406806945801, + "learning_rate": 5.644329896907217e-07, + "logits/chosen": 16.62118148803711, + "logits/rejected": 7.807868957519531, + "logps/chosen": -354.60162353515625, + "logps/rejected": -201.27975463867188, + "loss": 0.714, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.028145790100097656, + "rewards/margins": -0.03343295678496361, + "rewards/rejected": 0.06157875061035156, + "step": 219 + }, + { + "epoch": 0.034022810748115216, + "grad_norm": 4.289771556854248, + "learning_rate": 5.670103092783505e-07, + "logits/chosen": 7.658382415771484, + "logits/rejected": 5.6317572593688965, + "logps/chosen": -205.30947875976562, + "logps/rejected": -259.0213623046875, + "loss": 0.6738, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.03346576914191246, + "rewards/margins": 0.042540889233350754, + "rewards/rejected": -0.009075116366147995, + "step": 220 + }, + { + "epoch": 0.034177459887879375, + "grad_norm": 3.378429651260376, + "learning_rate": 5.695876288659794e-07, + "logits/chosen": 5.626931190490723, + "logits/rejected": 5.373292922973633, + "logps/chosen": -171.7481689453125, + "logps/rejected": -139.57618713378906, + "loss": 0.7024, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.008703136816620827, + "rewards/margins": -0.017125798389315605, + "rewards/rejected": 0.008422663435339928, + "step": 221 + }, + { + "epoch": 0.03433210902764353, + "grad_norm": 4.4845075607299805, + "learning_rate": 5.721649484536083e-07, + "logits/chosen": 3.8155980110168457, + "logits/rejected": 8.531209945678711, + "logps/chosen": -170.51113891601562, + "logps/rejected": -235.72474670410156, + "loss": 0.7049, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.005229093134403229, + "rewards/margins": -0.018719002604484558, + "rewards/rejected": 0.023948095738887787, + "step": 222 + }, + { + "epoch": 0.03448675816740769, + "grad_norm": 5.294673919677734, + "learning_rate": 5.747422680412372e-07, + "logits/chosen": 6.5246686935424805, + "logits/rejected": 9.144453048706055, + "logps/chosen": -277.46026611328125, + "logps/rejected": -325.8945007324219, + "loss": 0.6959, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.011956976726651192, + "rewards/margins": 0.002624470740556717, + "rewards/rejected": -0.01458144560456276, + "step": 223 + }, + { + "epoch": 0.034641407307171856, + "grad_norm": 4.342596054077148, + "learning_rate": 5.77319587628866e-07, + "logits/chosen": 12.956892013549805, + "logits/rejected": 6.823075294494629, + "logps/chosen": -302.9952087402344, + "logps/rejected": -165.18283081054688, + "loss": 0.6869, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.030018998309969902, + "rewards/margins": 0.017669297754764557, + "rewards/rejected": -0.04768829792737961, + "step": 224 + }, + { + "epoch": 0.034796056446936015, + "grad_norm": 4.966875076293945, + "learning_rate": 5.798969072164949e-07, + "logits/chosen": 10.153735160827637, + "logits/rejected": 7.376662254333496, + "logps/chosen": -285.5647888183594, + "logps/rejected": -221.4811553955078, + "loss": 0.7308, + "rewards/accuracies": 0.25, + "rewards/chosen": -0.006377171725034714, + "rewards/margins": -0.07181944698095322, + "rewards/rejected": 0.0654422789812088, + "step": 225 + }, + { + "epoch": 0.03495070558670017, + "grad_norm": 4.671102523803711, + "learning_rate": 5.824742268041238e-07, + "logits/chosen": 13.67463207244873, + "logits/rejected": 5.030215263366699, + "logps/chosen": -282.1994934082031, + "logps/rejected": -198.82147216796875, + "loss": 0.6852, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.0008883010596036911, + "rewards/margins": 0.022807549685239792, + "rewards/rejected": -0.02191925048828125, + "step": 226 + }, + { + "epoch": 0.03510535472646433, + "grad_norm": 6.9201436042785645, + "learning_rate": 5.850515463917526e-07, + "logits/chosen": 3.512515068054199, + "logits/rejected": 7.149104595184326, + "logps/chosen": -278.8149719238281, + "logps/rejected": -391.1086730957031, + "loss": 0.7129, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.027152635157108307, + "rewards/margins": -0.02888527885079384, + "rewards/rejected": 0.0017326362431049347, + "step": 227 + }, + { + "epoch": 0.0352600038662285, + "grad_norm": 3.4967703819274902, + "learning_rate": 5.876288659793815e-07, + "logits/chosen": 15.019588470458984, + "logits/rejected": 11.224769592285156, + "logps/chosen": -215.16358947753906, + "logps/rejected": -185.71273803710938, + "loss": 0.6679, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.02270641177892685, + "rewards/margins": 0.05463428422808647, + "rewards/rejected": -0.03192787244915962, + "step": 228 + }, + { + "epoch": 0.035414653005992655, + "grad_norm": 5.382562637329102, + "learning_rate": 5.902061855670104e-07, + "logits/chosen": 6.3345417976379395, + "logits/rejected": 6.99592399597168, + "logps/chosen": -278.01287841796875, + "logps/rejected": -231.80795288085938, + "loss": 0.7005, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.03036472760140896, + "rewards/margins": -0.013205980882048607, + "rewards/rejected": -0.01715874671936035, + "step": 229 + }, + { + "epoch": 0.03556930214575681, + "grad_norm": 4.758025646209717, + "learning_rate": 5.927835051546392e-07, + "logits/chosen": 8.463251113891602, + "logits/rejected": 12.116561889648438, + "logps/chosen": -191.79452514648438, + "logps/rejected": -224.87753295898438, + "loss": 0.7104, + "rewards/accuracies": 0.25, + "rewards/chosen": -0.002331519266590476, + "rewards/margins": -0.033104680478572845, + "rewards/rejected": 0.030773162841796875, + "step": 230 + }, + { + "epoch": 0.03572395128552097, + "grad_norm": 3.2386176586151123, + "learning_rate": 5.95360824742268e-07, + "logits/chosen": 9.068927764892578, + "logits/rejected": 9.351752281188965, + "logps/chosen": -136.34466552734375, + "logps/rejected": -155.36605834960938, + "loss": 0.6806, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.004687094129621983, + "rewards/margins": 0.026478338986635208, + "rewards/rejected": -0.02179124392569065, + "step": 231 + }, + { + "epoch": 0.03587860042528514, + "grad_norm": 5.229002475738525, + "learning_rate": 5.97938144329897e-07, + "logits/chosen": 14.541337013244629, + "logits/rejected": 5.303619384765625, + "logps/chosen": -345.19927978515625, + "logps/rejected": -286.30828857421875, + "loss": 0.6369, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.08268857002258301, + "rewards/margins": 0.11939717084169388, + "rewards/rejected": -0.03670859336853027, + "step": 232 + }, + { + "epoch": 0.036033249565049295, + "grad_norm": 6.893274784088135, + "learning_rate": 6.005154639175259e-07, + "logits/chosen": 14.66867733001709, + "logits/rejected": 9.734107971191406, + "logps/chosen": -410.5635986328125, + "logps/rejected": -300.903076171875, + "loss": 0.7371, + "rewards/accuracies": 0.25, + "rewards/chosen": -0.07901516556739807, + "rewards/margins": -0.07119274139404297, + "rewards/rejected": -0.007822412997484207, + "step": 233 + }, + { + "epoch": 0.036187898704813454, + "grad_norm": 3.7327115535736084, + "learning_rate": 6.030927835051547e-07, + "logits/chosen": 7.433411598205566, + "logits/rejected": 9.584771156311035, + "logps/chosen": -154.3632049560547, + "logps/rejected": -192.30227661132812, + "loss": 0.6981, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.01377263106405735, + "rewards/margins": -0.008358431980013847, + "rewards/rejected": -0.005414200946688652, + "step": 234 + }, + { + "epoch": 0.03634254784457761, + "grad_norm": 4.205161094665527, + "learning_rate": 6.056701030927835e-07, + "logits/chosen": 8.071340560913086, + "logits/rejected": 8.306831359863281, + "logps/chosen": -181.2935791015625, + "logps/rejected": -179.22610473632812, + "loss": 0.7108, + "rewards/accuracies": 0.25, + "rewards/chosen": -0.003616809844970703, + "rewards/margins": -0.0334671288728714, + "rewards/rejected": 0.029850320890545845, + "step": 235 + }, + { + "epoch": 0.03649719698434178, + "grad_norm": 4.360382080078125, + "learning_rate": 6.082474226804124e-07, + "logits/chosen": 11.336736679077148, + "logits/rejected": 9.09284496307373, + "logps/chosen": -337.3224182128906, + "logps/rejected": -279.7801208496094, + "loss": 0.7039, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.018046284094452858, + "rewards/margins": -0.020105741918087006, + "rewards/rejected": 0.03815202787518501, + "step": 236 + }, + { + "epoch": 0.036651846124105936, + "grad_norm": 6.259074687957764, + "learning_rate": 6.108247422680413e-07, + "logits/chosen": 8.761602401733398, + "logits/rejected": 10.941136360168457, + "logps/chosen": -396.19451904296875, + "logps/rejected": -382.68408203125, + "loss": 0.7346, + "rewards/accuracies": 0.125, + "rewards/chosen": -0.018232012167572975, + "rewards/margins": -0.07538881152868271, + "rewards/rejected": 0.05715680494904518, + "step": 237 + }, + { + "epoch": 0.036806495263870094, + "grad_norm": 6.0970988273620605, + "learning_rate": 6.134020618556701e-07, + "logits/chosen": 6.664066791534424, + "logits/rejected": 4.940464973449707, + "logps/chosen": -354.9864501953125, + "logps/rejected": -308.2416687011719, + "loss": 0.6844, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.002811622340232134, + "rewards/margins": 0.020966840907931328, + "rewards/rejected": -0.023778462782502174, + "step": 238 + }, + { + "epoch": 0.03696114440363425, + "grad_norm": 4.786613941192627, + "learning_rate": 6.15979381443299e-07, + "logits/chosen": 10.859687805175781, + "logits/rejected": 12.618412017822266, + "logps/chosen": -223.54574584960938, + "logps/rejected": -236.51113891601562, + "loss": 0.705, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.04124817997217178, + "rewards/margins": -0.017923451960086823, + "rewards/rejected": -0.02332472801208496, + "step": 239 + }, + { + "epoch": 0.03711579354339842, + "grad_norm": 6.639766693115234, + "learning_rate": 6.185567010309279e-07, + "logits/chosen": 4.527590751647949, + "logits/rejected": 4.626595497131348, + "logps/chosen": -315.76593017578125, + "logps/rejected": -259.0128173828125, + "loss": 0.7226, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.024262474849820137, + "rewards/margins": -0.055610038340091705, + "rewards/rejected": 0.03134756162762642, + "step": 240 + }, + { + "epoch": 0.037270442683162576, + "grad_norm": 5.480623722076416, + "learning_rate": 6.211340206185567e-07, + "logits/chosen": 12.011322021484375, + "logits/rejected": 15.609920501708984, + "logps/chosen": -293.7601318359375, + "logps/rejected": -340.34759521484375, + "loss": 0.7107, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.01647777482867241, + "rewards/margins": -0.02978820912539959, + "rewards/rejected": 0.04626598209142685, + "step": 241 + }, + { + "epoch": 0.037425091822926734, + "grad_norm": 5.76165771484375, + "learning_rate": 6.237113402061856e-07, + "logits/chosen": 12.008529663085938, + "logits/rejected": 5.540915489196777, + "logps/chosen": -328.7164001464844, + "logps/rejected": -246.66229248046875, + "loss": 0.7215, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.03191695362329483, + "rewards/margins": -0.05326266586780548, + "rewards/rejected": 0.02134571224451065, + "step": 242 + }, + { + "epoch": 0.03757974096269089, + "grad_norm": 5.17066764831543, + "learning_rate": 6.262886597938145e-07, + "logits/chosen": 9.700451850891113, + "logits/rejected": 8.016971588134766, + "logps/chosen": -217.6426239013672, + "logps/rejected": -221.5758056640625, + "loss": 0.6952, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.013753032311797142, + "rewards/margins": -0.002923298627138138, + "rewards/rejected": -0.010829731822013855, + "step": 243 + }, + { + "epoch": 0.03773439010245506, + "grad_norm": 11.010458946228027, + "learning_rate": 6.288659793814434e-07, + "logits/chosen": 7.685364723205566, + "logits/rejected": 1.084984302520752, + "logps/chosen": -314.3756103515625, + "logps/rejected": -188.3623504638672, + "loss": 0.7075, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.007253644987940788, + "rewards/margins": -0.021234802901744843, + "rewards/rejected": 0.028488444164395332, + "step": 244 + }, + { + "epoch": 0.037889039242219216, + "grad_norm": 5.415032386779785, + "learning_rate": 6.314432989690722e-07, + "logits/chosen": 9.269636154174805, + "logits/rejected": 10.793008804321289, + "logps/chosen": -294.50616455078125, + "logps/rejected": -287.6456298828125, + "loss": 0.6644, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.03605537861585617, + "rewards/margins": 0.06008339300751686, + "rewards/rejected": -0.02402801625430584, + "step": 245 + }, + { + "epoch": 0.038043688381983375, + "grad_norm": 5.652324676513672, + "learning_rate": 6.340206185567011e-07, + "logits/chosen": 12.311849594116211, + "logits/rejected": 7.074306964874268, + "logps/chosen": -409.453125, + "logps/rejected": -253.30751037597656, + "loss": 0.6632, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.0446925163269043, + "rewards/margins": 0.06195654720067978, + "rewards/rejected": -0.01726403459906578, + "step": 246 + }, + { + "epoch": 0.03819833752174753, + "grad_norm": 4.350164413452148, + "learning_rate": 6.365979381443299e-07, + "logits/chosen": 6.472908020019531, + "logits/rejected": 3.3387234210968018, + "logps/chosen": -300.912353515625, + "logps/rejected": -208.2216796875, + "loss": 0.6648, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.014641453512012959, + "rewards/margins": 0.060414668172597885, + "rewards/rejected": -0.07505612075328827, + "step": 247 + }, + { + "epoch": 0.0383529866615117, + "grad_norm": 4.907051086425781, + "learning_rate": 6.391752577319587e-07, + "logits/chosen": 6.359959602355957, + "logits/rejected": 8.720989227294922, + "logps/chosen": -236.83102416992188, + "logps/rejected": -220.8046417236328, + "loss": 0.6988, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.011784838512539864, + "rewards/margins": -0.0011364268139004707, + "rewards/rejected": -0.010648416355252266, + "step": 248 + }, + { + "epoch": 0.03850763580127586, + "grad_norm": 5.66964864730835, + "learning_rate": 6.417525773195876e-07, + "logits/chosen": 17.11200714111328, + "logits/rejected": 17.57602882385254, + "logps/chosen": -354.4012451171875, + "logps/rejected": -323.81201171875, + "loss": 0.6798, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.05915484577417374, + "rewards/margins": 0.03593145310878754, + "rewards/rejected": 0.023223400115966797, + "step": 249 + }, + { + "epoch": 0.038662284941040015, + "grad_norm": 4.988640785217285, + "learning_rate": 6.443298969072165e-07, + "logits/chosen": 11.416837692260742, + "logits/rejected": 8.00868034362793, + "logps/chosen": -242.99166870117188, + "logps/rejected": -235.29714965820312, + "loss": 0.6918, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.028706837445497513, + "rewards/margins": 0.004238747991621494, + "rewards/rejected": -0.03294558823108673, + "step": 250 + }, + { + "epoch": 0.03881693408080417, + "grad_norm": 4.878240585327148, + "learning_rate": 6.469072164948455e-07, + "logits/chosen": 5.145282745361328, + "logits/rejected": 4.16807746887207, + "logps/chosen": -214.65066528320312, + "logps/rejected": -169.9105224609375, + "loss": 0.6899, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.01110463123768568, + "rewards/margins": 0.013231130316853523, + "rewards/rejected": -0.024335766211152077, + "step": 251 + }, + { + "epoch": 0.03897158322056834, + "grad_norm": 4.538815498352051, + "learning_rate": 6.494845360824743e-07, + "logits/chosen": 11.637895584106445, + "logits/rejected": 13.668797492980957, + "logps/chosen": -278.2616882324219, + "logps/rejected": -258.5806884765625, + "loss": 0.7241, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.03656420856714249, + "rewards/margins": -0.0572175532579422, + "rewards/rejected": 0.020653344690799713, + "step": 252 + }, + { + "epoch": 0.0391262323603325, + "grad_norm": 4.906022548675537, + "learning_rate": 6.520618556701032e-07, + "logits/chosen": 9.867277145385742, + "logits/rejected": 7.840575218200684, + "logps/chosen": -349.86749267578125, + "logps/rejected": -285.179443359375, + "loss": 0.6982, + "rewards/accuracies": 0.25, + "rewards/chosen": -0.03592377156019211, + "rewards/margins": -0.004194688051939011, + "rewards/rejected": -0.0317290797829628, + "step": 253 + }, + { + "epoch": 0.039280881500096655, + "grad_norm": 6.599225044250488, + "learning_rate": 6.546391752577321e-07, + "logits/chosen": 15.538134574890137, + "logits/rejected": 11.52098274230957, + "logps/chosen": -367.9677734375, + "logps/rejected": -349.4126281738281, + "loss": 0.6764, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.03877105936408043, + "rewards/margins": 0.03812885284423828, + "rewards/rejected": 0.0006422027945518494, + "step": 254 + }, + { + "epoch": 0.039435530639860814, + "grad_norm": 5.181053638458252, + "learning_rate": 6.57216494845361e-07, + "logits/chosen": 12.436885833740234, + "logits/rejected": 9.671371459960938, + "logps/chosen": -382.39788818359375, + "logps/rejected": -313.9125061035156, + "loss": 0.7045, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.004657313227653503, + "rewards/margins": -0.014827582985162735, + "rewards/rejected": 0.019484899938106537, + "step": 255 + }, + { + "epoch": 0.03959017977962498, + "grad_norm": 5.534814357757568, + "learning_rate": 6.597938144329897e-07, + "logits/chosen": 11.357664108276367, + "logits/rejected": 1.0162296295166016, + "logps/chosen": -395.32830810546875, + "logps/rejected": -249.87985229492188, + "loss": 0.6698, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.020236967131495476, + "rewards/margins": 0.05510273575782776, + "rewards/rejected": -0.07533970475196838, + "step": 256 + }, + { + "epoch": 0.03974482891938914, + "grad_norm": 5.347562313079834, + "learning_rate": 6.623711340206186e-07, + "logits/chosen": 15.50752067565918, + "logits/rejected": 7.486250877380371, + "logps/chosen": -333.0701904296875, + "logps/rejected": -243.05026245117188, + "loss": 0.7043, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.019951246678829193, + "rewards/margins": -0.01807079091668129, + "rewards/rejected": -0.0018804557621479034, + "step": 257 + }, + { + "epoch": 0.039899478059153295, + "grad_norm": 4.464232444763184, + "learning_rate": 6.649484536082475e-07, + "logits/chosen": 5.619850158691406, + "logits/rejected": 1.327756643295288, + "logps/chosen": -208.21044921875, + "logps/rejected": -188.10911560058594, + "loss": 0.6919, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.0006053922697901726, + "rewards/margins": 0.004115653224289417, + "rewards/rejected": -0.0047210450284183025, + "step": 258 + }, + { + "epoch": 0.040054127198917454, + "grad_norm": 6.004766464233398, + "learning_rate": 6.675257731958763e-07, + "logits/chosen": 11.703359603881836, + "logits/rejected": 9.75662612915039, + "logps/chosen": -456.85687255859375, + "logps/rejected": -357.17144775390625, + "loss": 0.715, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.047693539410829544, + "rewards/margins": -0.03385457396507263, + "rewards/rejected": 0.08154811710119247, + "step": 259 + }, + { + "epoch": 0.04020877633868162, + "grad_norm": 5.586493015289307, + "learning_rate": 6.701030927835052e-07, + "logits/chosen": 13.049908638000488, + "logits/rejected": 13.037406921386719, + "logps/chosen": -289.798828125, + "logps/rejected": -279.8524169921875, + "loss": 0.7376, + "rewards/accuracies": 0.125, + "rewards/chosen": 0.0055467598140239716, + "rewards/margins": -0.0849161148071289, + "rewards/rejected": 0.09046287834644318, + "step": 260 + }, + { + "epoch": 0.04036342547844578, + "grad_norm": 5.428988456726074, + "learning_rate": 6.726804123711341e-07, + "logits/chosen": 13.159706115722656, + "logits/rejected": 16.09222412109375, + "logps/chosen": -339.9249267578125, + "logps/rejected": -432.504638671875, + "loss": 0.7467, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.030592726543545723, + "rewards/margins": -0.09264698624610901, + "rewards/rejected": 0.06205425783991814, + "step": 261 + }, + { + "epoch": 0.040518074618209936, + "grad_norm": 3.8464701175689697, + "learning_rate": 6.752577319587629e-07, + "logits/chosen": 11.381383895874023, + "logits/rejected": 3.2776682376861572, + "logps/chosen": -203.27291870117188, + "logps/rejected": -171.57546997070312, + "loss": 0.7256, + "rewards/accuracies": 0.25, + "rewards/chosen": -0.0026408201083540916, + "rewards/margins": -0.06274910271167755, + "rewards/rejected": 0.060108281672000885, + "step": 262 + }, + { + "epoch": 0.040672723757974094, + "grad_norm": 4.425137519836426, + "learning_rate": 6.778350515463917e-07, + "logits/chosen": 14.23583698272705, + "logits/rejected": 5.952187538146973, + "logps/chosen": -278.9794921875, + "logps/rejected": -276.9984436035156, + "loss": 0.7253, + "rewards/accuracies": 0.125, + "rewards/chosen": -0.03361053392291069, + "rewards/margins": -0.061350345611572266, + "rewards/rejected": 0.027739809826016426, + "step": 263 + }, + { + "epoch": 0.04082737289773826, + "grad_norm": 6.176183700561523, + "learning_rate": 6.804123711340206e-07, + "logits/chosen": 7.840469837188721, + "logits/rejected": 8.058832168579102, + "logps/chosen": -375.3706359863281, + "logps/rejected": -321.70721435546875, + "loss": 0.7301, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.025685694068670273, + "rewards/margins": -0.06539702415466309, + "rewards/rejected": 0.03971133381128311, + "step": 264 + }, + { + "epoch": 0.04098202203750242, + "grad_norm": 6.129113674163818, + "learning_rate": 6.829896907216495e-07, + "logits/chosen": 16.604860305786133, + "logits/rejected": 7.964756965637207, + "logps/chosen": -387.2036437988281, + "logps/rejected": -248.52195739746094, + "loss": 0.731, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.06861067563295364, + "rewards/margins": -0.06551866978406906, + "rewards/rejected": -0.003092002123594284, + "step": 265 + }, + { + "epoch": 0.041136671177266576, + "grad_norm": 5.237852096557617, + "learning_rate": 6.855670103092784e-07, + "logits/chosen": 8.166175842285156, + "logits/rejected": 7.472873210906982, + "logps/chosen": -352.65679931640625, + "logps/rejected": -317.54736328125, + "loss": 0.6935, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.008534002117812634, + "rewards/margins": 0.010436061769723892, + "rewards/rejected": -0.018970057368278503, + "step": 266 + }, + { + "epoch": 0.041291320317030734, + "grad_norm": 4.047346115112305, + "learning_rate": 6.881443298969073e-07, + "logits/chosen": 10.062591552734375, + "logits/rejected": 7.389678478240967, + "logps/chosen": -203.36083984375, + "logps/rejected": -213.27920532226562, + "loss": 0.6698, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.047116756439208984, + "rewards/margins": 0.051412105560302734, + "rewards/rejected": -0.00429534912109375, + "step": 267 + }, + { + "epoch": 0.0414459694567949, + "grad_norm": 4.589973449707031, + "learning_rate": 6.907216494845362e-07, + "logits/chosen": 9.534613609313965, + "logits/rejected": 5.6274333000183105, + "logps/chosen": -252.291015625, + "logps/rejected": -247.4940948486328, + "loss": 0.6792, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.019533302634954453, + "rewards/margins": 0.02815093845129013, + "rewards/rejected": -0.047684237360954285, + "step": 268 + }, + { + "epoch": 0.04160061859655906, + "grad_norm": 6.799015045166016, + "learning_rate": 6.93298969072165e-07, + "logits/chosen": 7.078848361968994, + "logits/rejected": 11.0949068069458, + "logps/chosen": -197.40658569335938, + "logps/rejected": -237.89590454101562, + "loss": 0.6889, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.031607821583747864, + "rewards/margins": 0.010406684130430222, + "rewards/rejected": 0.021201133728027344, + "step": 269 + }, + { + "epoch": 0.041755267736323216, + "grad_norm": 4.157468318939209, + "learning_rate": 6.958762886597939e-07, + "logits/chosen": 7.125399589538574, + "logits/rejected": 11.697006225585938, + "logps/chosen": -170.443359375, + "logps/rejected": -197.20567321777344, + "loss": 0.7335, + "rewards/accuracies": 0.25, + "rewards/chosen": -0.0439486987888813, + "rewards/margins": -0.07237127423286438, + "rewards/rejected": 0.02842256985604763, + "step": 270 + }, + { + "epoch": 0.041909916876087375, + "grad_norm": 5.464375019073486, + "learning_rate": 6.984536082474228e-07, + "logits/chosen": 10.274873733520508, + "logits/rejected": 7.8898773193359375, + "logps/chosen": -222.9231414794922, + "logps/rejected": -151.10484313964844, + "loss": 0.7088, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.0074091204442083836, + "rewards/margins": -0.02668764814734459, + "rewards/rejected": 0.019278524443507195, + "step": 271 + }, + { + "epoch": 0.04206456601585154, + "grad_norm": 6.094438076019287, + "learning_rate": 7.010309278350516e-07, + "logits/chosen": 4.080531120300293, + "logits/rejected": 4.95831298828125, + "logps/chosen": -315.23846435546875, + "logps/rejected": -244.72308349609375, + "loss": 0.6854, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.06399526447057724, + "rewards/margins": 0.017319299280643463, + "rewards/rejected": 0.04667596518993378, + "step": 272 + }, + { + "epoch": 0.0422192151556157, + "grad_norm": 6.01910400390625, + "learning_rate": 7.036082474226804e-07, + "logits/chosen": 9.406524658203125, + "logits/rejected": 11.486480712890625, + "logps/chosen": -404.70562744140625, + "logps/rejected": -401.83380126953125, + "loss": 0.6725, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.004156113136559725, + "rewards/margins": 0.044912248849868774, + "rewards/rejected": -0.04075613245368004, + "step": 273 + }, + { + "epoch": 0.04237386429537986, + "grad_norm": 5.294518947601318, + "learning_rate": 7.061855670103093e-07, + "logits/chosen": 15.296045303344727, + "logits/rejected": 8.236815452575684, + "logps/chosen": -243.39500427246094, + "logps/rejected": -190.75949096679688, + "loss": 0.7023, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.01391210500150919, + "rewards/margins": -0.013545418158173561, + "rewards/rejected": -0.0003666868433356285, + "step": 274 + }, + { + "epoch": 0.042528513435144015, + "grad_norm": 4.91370964050293, + "learning_rate": 7.087628865979382e-07, + "logits/chosen": 11.261940956115723, + "logits/rejected": 5.535235404968262, + "logps/chosen": -381.68475341796875, + "logps/rejected": -241.4693603515625, + "loss": 0.6714, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.06995878368616104, + "rewards/margins": 0.04795096069574356, + "rewards/rejected": 0.02200782299041748, + "step": 275 + }, + { + "epoch": 0.04268316257490818, + "grad_norm": 8.425891876220703, + "learning_rate": 7.11340206185567e-07, + "logits/chosen": 8.027288436889648, + "logits/rejected": 7.0562896728515625, + "logps/chosen": -257.78424072265625, + "logps/rejected": -268.9976806640625, + "loss": 0.6929, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.016646670177578926, + "rewards/margins": 0.004961486905813217, + "rewards/rejected": -0.02160816267132759, + "step": 276 + }, + { + "epoch": 0.04283781171467234, + "grad_norm": 6.820143222808838, + "learning_rate": 7.139175257731959e-07, + "logits/chosen": 11.263406753540039, + "logits/rejected": 3.7569408416748047, + "logps/chosen": -443.0341491699219, + "logps/rejected": -337.4405822753906, + "loss": 0.7387, + "rewards/accuracies": 0.25, + "rewards/chosen": 0.01112661324441433, + "rewards/margins": -0.08719973266124725, + "rewards/rejected": 0.09832635521888733, + "step": 277 + }, + { + "epoch": 0.0429924608544365, + "grad_norm": 4.918961524963379, + "learning_rate": 7.164948453608248e-07, + "logits/chosen": 12.5675630569458, + "logits/rejected": 6.431389808654785, + "logps/chosen": -301.914306640625, + "logps/rejected": -271.76953125, + "loss": 0.6813, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.0032967086881399155, + "rewards/margins": 0.027314042672514915, + "rewards/rejected": -0.03061075322329998, + "step": 278 + }, + { + "epoch": 0.043147109994200655, + "grad_norm": 6.123851299285889, + "learning_rate": 7.190721649484537e-07, + "logits/chosen": 2.2918896675109863, + "logits/rejected": 6.765120983123779, + "logps/chosen": -304.2936706542969, + "logps/rejected": -402.9510192871094, + "loss": 0.6498, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.07635727524757385, + "rewards/margins": 0.09249601513147354, + "rewards/rejected": -0.016138743609189987, + "step": 279 + }, + { + "epoch": 0.04330175913396482, + "grad_norm": 4.967680931091309, + "learning_rate": 7.216494845360824e-07, + "logits/chosen": 14.142841339111328, + "logits/rejected": 6.342775821685791, + "logps/chosen": -384.2119140625, + "logps/rejected": -277.9646301269531, + "loss": 0.6662, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.04666900634765625, + "rewards/margins": 0.06462021172046661, + "rewards/rejected": -0.017951201647520065, + "step": 280 + }, + { + "epoch": 0.04345640827372898, + "grad_norm": 6.099856853485107, + "learning_rate": 7.242268041237115e-07, + "logits/chosen": 12.001784324645996, + "logits/rejected": 3.5784199237823486, + "logps/chosen": -309.82696533203125, + "logps/rejected": -272.38836669921875, + "loss": 0.7136, + "rewards/accuracies": 0.25, + "rewards/chosen": -0.02949538454413414, + "rewards/margins": -0.031954433768987656, + "rewards/rejected": 0.0024590478278696537, + "step": 281 + }, + { + "epoch": 0.04361105741349314, + "grad_norm": 6.103199005126953, + "learning_rate": 7.268041237113403e-07, + "logits/chosen": 13.201889038085938, + "logits/rejected": 6.912469863891602, + "logps/chosen": -382.53936767578125, + "logps/rejected": -298.7481689453125, + "loss": 0.6957, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.008629227057099342, + "rewards/margins": -0.00011920928955078125, + "rewards/rejected": 0.008748434484004974, + "step": 282 + }, + { + "epoch": 0.043765706553257296, + "grad_norm": 3.697726249694824, + "learning_rate": 7.293814432989691e-07, + "logits/chosen": 5.166966915130615, + "logits/rejected": 4.032721519470215, + "logps/chosen": -201.18789672851562, + "logps/rejected": -185.63388061523438, + "loss": 0.7003, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.02250685729086399, + "rewards/margins": -0.01213073544204235, + "rewards/rejected": 0.03463759273290634, + "step": 283 + }, + { + "epoch": 0.04392035569302146, + "grad_norm": 5.440157890319824, + "learning_rate": 7.31958762886598e-07, + "logits/chosen": 5.483170509338379, + "logits/rejected": 0.9315651655197144, + "logps/chosen": -324.3583679199219, + "logps/rejected": -193.5889892578125, + "loss": 0.7359, + "rewards/accuracies": 0.25, + "rewards/chosen": -0.03743145614862442, + "rewards/margins": -0.07862356305122375, + "rewards/rejected": 0.041192103177309036, + "step": 284 + }, + { + "epoch": 0.04407500483278562, + "grad_norm": 4.959395408630371, + "learning_rate": 7.345360824742269e-07, + "logits/chosen": 7.315350532531738, + "logits/rejected": 5.967403888702393, + "logps/chosen": -235.76220703125, + "logps/rejected": -225.611328125, + "loss": 0.7132, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.041570376604795456, + "rewards/margins": -0.029239557683467865, + "rewards/rejected": 0.07080993801355362, + "step": 285 + }, + { + "epoch": 0.04422965397254978, + "grad_norm": 6.410409927368164, + "learning_rate": 7.371134020618558e-07, + "logits/chosen": 14.67337417602539, + "logits/rejected": 14.789937019348145, + "logps/chosen": -299.0250244140625, + "logps/rejected": -256.5709228515625, + "loss": 0.731, + "rewards/accuracies": 0.25, + "rewards/chosen": -0.009676171466708183, + "rewards/margins": -0.07034854590892792, + "rewards/rejected": 0.06067238003015518, + "step": 286 + }, + { + "epoch": 0.044384303112313936, + "grad_norm": 3.8615458011627197, + "learning_rate": 7.396907216494846e-07, + "logits/chosen": 12.82430648803711, + "logits/rejected": 6.008855819702148, + "logps/chosen": -278.73486328125, + "logps/rejected": -257.6081848144531, + "loss": 0.6434, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.0827404037117958, + "rewards/margins": 0.11119689792394638, + "rewards/rejected": -0.028456497937440872, + "step": 287 + }, + { + "epoch": 0.0445389522520781, + "grad_norm": 6.949217319488525, + "learning_rate": 7.422680412371135e-07, + "logits/chosen": 11.780693054199219, + "logits/rejected": 9.067593574523926, + "logps/chosen": -508.15875244140625, + "logps/rejected": -371.2984619140625, + "loss": 0.7018, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.006142043508589268, + "rewards/margins": -0.010315701365470886, + "rewards/rejected": 0.016457747668027878, + "step": 288 + }, + { + "epoch": 0.04469360139184226, + "grad_norm": 5.891060829162598, + "learning_rate": 7.448453608247423e-07, + "logits/chosen": 6.376235008239746, + "logits/rejected": 9.458555221557617, + "logps/chosen": -388.1113586425781, + "logps/rejected": -407.88543701171875, + "loss": 0.6802, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.011972811073064804, + "rewards/margins": 0.03756694495677948, + "rewards/rejected": -0.025594139471650124, + "step": 289 + }, + { + "epoch": 0.04484825053160642, + "grad_norm": 7.497729778289795, + "learning_rate": 7.474226804123711e-07, + "logits/chosen": 8.1383638381958, + "logits/rejected": 6.186180114746094, + "logps/chosen": -200.9911651611328, + "logps/rejected": -196.11199951171875, + "loss": 0.7483, + "rewards/accuracies": 0.25, + "rewards/chosen": -0.03496842086315155, + "rewards/margins": -0.09396882355213165, + "rewards/rejected": 0.059000395238399506, + "step": 290 + }, + { + "epoch": 0.045002899671370576, + "grad_norm": 4.562068462371826, + "learning_rate": 7.5e-07, + "logits/chosen": 10.212431907653809, + "logits/rejected": 4.131237030029297, + "logps/chosen": -192.6217041015625, + "logps/rejected": -162.54302978515625, + "loss": 0.6536, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.04490475356578827, + "rewards/margins": 0.08690089732408524, + "rewards/rejected": -0.04199614375829697, + "step": 291 + }, + { + "epoch": 0.04515754881113474, + "grad_norm": 4.859351634979248, + "learning_rate": 7.525773195876289e-07, + "logits/chosen": 11.998026847839355, + "logits/rejected": 6.427470684051514, + "logps/chosen": -307.7152099609375, + "logps/rejected": -237.17135620117188, + "loss": 0.6581, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.09181271493434906, + "rewards/margins": 0.08123352378606796, + "rewards/rejected": 0.0105791836977005, + "step": 292 + }, + { + "epoch": 0.0453121979508989, + "grad_norm": 5.752589225769043, + "learning_rate": 7.551546391752578e-07, + "logits/chosen": 8.069319725036621, + "logits/rejected": 9.863545417785645, + "logps/chosen": -327.6481018066406, + "logps/rejected": -369.81683349609375, + "loss": 0.7216, + "rewards/accuracies": 0.25, + "rewards/chosen": -0.033083729445934296, + "rewards/margins": -0.0516609251499176, + "rewards/rejected": 0.018577193841338158, + "step": 293 + }, + { + "epoch": 0.04546684709066306, + "grad_norm": 5.523726463317871, + "learning_rate": 7.577319587628866e-07, + "logits/chosen": 15.03489875793457, + "logits/rejected": 10.836637496948242, + "logps/chosen": -348.3163146972656, + "logps/rejected": -263.0343017578125, + "loss": 0.6694, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.03772168233990669, + "rewards/margins": 0.05891909450292587, + "rewards/rejected": -0.02119741216301918, + "step": 294 + }, + { + "epoch": 0.045621496230427216, + "grad_norm": 4.027552604675293, + "learning_rate": 7.603092783505155e-07, + "logits/chosen": 9.441871643066406, + "logits/rejected": 9.530956268310547, + "logps/chosen": -234.0558319091797, + "logps/rejected": -234.98226928710938, + "loss": 0.6858, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.02280271053314209, + "rewards/margins": 0.02082827314734459, + "rewards/rejected": -0.04363097995519638, + "step": 295 + }, + { + "epoch": 0.04577614537019138, + "grad_norm": 6.074437141418457, + "learning_rate": 7.628865979381445e-07, + "logits/chosen": 12.259597778320312, + "logits/rejected": 5.049566268920898, + "logps/chosen": -371.72308349609375, + "logps/rejected": -179.959716796875, + "loss": 0.7025, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.012221671640872955, + "rewards/margins": -0.013094663619995117, + "rewards/rejected": 0.025316333398222923, + "step": 296 + }, + { + "epoch": 0.04593079450995554, + "grad_norm": 5.406341552734375, + "learning_rate": 7.654639175257734e-07, + "logits/chosen": 9.372140884399414, + "logits/rejected": 10.361873626708984, + "logps/chosen": -328.73101806640625, + "logps/rejected": -280.20892333984375, + "loss": 0.6424, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.045784760266542435, + "rewards/margins": 0.10740404576063156, + "rewards/rejected": -0.06161927804350853, + "step": 297 + }, + { + "epoch": 0.0460854436497197, + "grad_norm": 4.657041549682617, + "learning_rate": 7.680412371134021e-07, + "logits/chosen": 6.708735466003418, + "logits/rejected": 8.383312225341797, + "logps/chosen": -229.92178344726562, + "logps/rejected": -261.3010559082031, + "loss": 0.6886, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.012757347896695137, + "rewards/margins": 0.011767909862101078, + "rewards/rejected": 0.0009894361719489098, + "step": 298 + }, + { + "epoch": 0.04624009278948386, + "grad_norm": 5.440919399261475, + "learning_rate": 7.70618556701031e-07, + "logits/chosen": 10.24075984954834, + "logits/rejected": 11.400381088256836, + "logps/chosen": -268.0219421386719, + "logps/rejected": -261.69720458984375, + "loss": 0.6787, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.04366712644696236, + "rewards/margins": 0.03244920074939728, + "rewards/rejected": 0.011217927560210228, + "step": 299 + }, + { + "epoch": 0.046394741929248015, + "grad_norm": 10.694855690002441, + "learning_rate": 7.731958762886599e-07, + "logits/chosen": 9.533609390258789, + "logits/rejected": 4.375674724578857, + "logps/chosen": -258.9429931640625, + "logps/rejected": -183.02110290527344, + "loss": 0.6859, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.025099849328398705, + "rewards/margins": 0.01829042285680771, + "rewards/rejected": 0.006809425540268421, + "step": 300 + }, + { + "epoch": 0.04654939106901218, + "grad_norm": 4.438869476318359, + "learning_rate": 7.757731958762887e-07, + "logits/chosen": 9.963016510009766, + "logits/rejected": 11.539241790771484, + "logps/chosen": -311.2004089355469, + "logps/rejected": -357.8814697265625, + "loss": 0.6657, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.025063039734959602, + "rewards/margins": 0.06273975223302841, + "rewards/rejected": -0.03767671808600426, + "step": 301 + }, + { + "epoch": 0.04670404020877634, + "grad_norm": 6.975356101989746, + "learning_rate": 7.783505154639176e-07, + "logits/chosen": 9.927818298339844, + "logits/rejected": 3.5872130393981934, + "logps/chosen": -310.9517517089844, + "logps/rejected": -217.1642303466797, + "loss": 0.6634, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.05728526785969734, + "rewards/margins": 0.06350240856409073, + "rewards/rejected": -0.006217147223651409, + "step": 302 + }, + { + "epoch": 0.0468586893485405, + "grad_norm": 5.248288154602051, + "learning_rate": 7.809278350515465e-07, + "logits/chosen": 12.73100471496582, + "logits/rejected": 12.335542678833008, + "logps/chosen": -340.8487548828125, + "logps/rejected": -341.8193359375, + "loss": 0.6971, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.03075290098786354, + "rewards/margins": -0.006798101589083672, + "rewards/rejected": -0.02395479753613472, + "step": 303 + }, + { + "epoch": 0.047013338488304655, + "grad_norm": 3.1768858432769775, + "learning_rate": 7.835051546391754e-07, + "logits/chosen": 10.36426067352295, + "logits/rejected": 8.524162292480469, + "logps/chosen": -169.6994171142578, + "logps/rejected": -157.38575744628906, + "loss": 0.6774, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.04603080824017525, + "rewards/margins": 0.03396458923816681, + "rewards/rejected": 0.012066220864653587, + "step": 304 + }, + { + "epoch": 0.04716798762806882, + "grad_norm": 5.595256328582764, + "learning_rate": 7.860824742268041e-07, + "logits/chosen": 9.441547393798828, + "logits/rejected": 5.497175216674805, + "logps/chosen": -383.0271911621094, + "logps/rejected": -286.5113525390625, + "loss": 0.6937, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.04546399414539337, + "rewards/margins": 0.000604487955570221, + "rewards/rejected": 0.04485950618982315, + "step": 305 + }, + { + "epoch": 0.04732263676783298, + "grad_norm": 5.533136367797852, + "learning_rate": 7.88659793814433e-07, + "logits/chosen": 10.888647079467773, + "logits/rejected": 6.140551567077637, + "logps/chosen": -298.96881103515625, + "logps/rejected": -317.203125, + "loss": 0.686, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.02645902708172798, + "rewards/margins": 0.015329839661717415, + "rewards/rejected": 0.011129188351333141, + "step": 306 + }, + { + "epoch": 0.04747728590759714, + "grad_norm": 4.928723335266113, + "learning_rate": 7.912371134020619e-07, + "logits/chosen": 9.593779563903809, + "logits/rejected": 11.967325210571289, + "logps/chosen": -312.11602783203125, + "logps/rejected": -284.61907958984375, + "loss": 0.7073, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.04487323760986328, + "rewards/margins": -0.02668905258178711, + "rewards/rejected": -0.018184185028076172, + "step": 307 + }, + { + "epoch": 0.047631935047361296, + "grad_norm": 6.147202491760254, + "learning_rate": 7.938144329896907e-07, + "logits/chosen": 9.028889656066895, + "logits/rejected": 11.248432159423828, + "logps/chosen": -355.0299987792969, + "logps/rejected": -317.34173583984375, + "loss": 0.6661, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.024903394281864166, + "rewards/margins": 0.05873899534344673, + "rewards/rejected": -0.033835604786872864, + "step": 308 + }, + { + "epoch": 0.04778658418712546, + "grad_norm": 5.863432884216309, + "learning_rate": 7.963917525773196e-07, + "logits/chosen": 16.674617767333984, + "logits/rejected": 12.86128044128418, + "logps/chosen": -284.236572265625, + "logps/rejected": -285.61669921875, + "loss": 0.7351, + "rewards/accuracies": 0.25, + "rewards/chosen": -0.06699486076831818, + "rewards/margins": -0.07974996417760849, + "rewards/rejected": 0.012755108065903187, + "step": 309 + }, + { + "epoch": 0.04794123332688962, + "grad_norm": 4.738588333129883, + "learning_rate": 7.989690721649485e-07, + "logits/chosen": 14.011253356933594, + "logits/rejected": 7.4572930335998535, + "logps/chosen": -300.88555908203125, + "logps/rejected": -234.4568328857422, + "loss": 0.6884, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.015485147945582867, + "rewards/margins": 0.014288333244621754, + "rewards/rejected": 0.001196814700961113, + "step": 310 + }, + { + "epoch": 0.04809588246665378, + "grad_norm": 4.169150352478027, + "learning_rate": 8.015463917525775e-07, + "logits/chosen": 5.935704231262207, + "logits/rejected": 4.86208963394165, + "logps/chosen": -197.8755340576172, + "logps/rejected": -205.51852416992188, + "loss": 0.6872, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.031170319765806198, + "rewards/margins": 0.012457729317247868, + "rewards/rejected": 0.018712591379880905, + "step": 311 + }, + { + "epoch": 0.048250531606417936, + "grad_norm": 6.766357421875, + "learning_rate": 8.041237113402063e-07, + "logits/chosen": 12.25485610961914, + "logits/rejected": 5.660464286804199, + "logps/chosen": -340.59417724609375, + "logps/rejected": -208.60427856445312, + "loss": 0.7113, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.017274286597967148, + "rewards/margins": -0.03250467777252197, + "rewards/rejected": 0.015230393968522549, + "step": 312 + }, + { + "epoch": 0.0484051807461821, + "grad_norm": 4.594578266143799, + "learning_rate": 8.067010309278352e-07, + "logits/chosen": 16.12421417236328, + "logits/rejected": 8.62697696685791, + "logps/chosen": -265.9131774902344, + "logps/rejected": -187.4433135986328, + "loss": 0.6859, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.014047149568796158, + "rewards/margins": 0.015713075175881386, + "rewards/rejected": -0.001665925607085228, + "step": 313 + }, + { + "epoch": 0.04855982988594626, + "grad_norm": 3.4818480014801025, + "learning_rate": 8.092783505154641e-07, + "logits/chosen": 13.8653564453125, + "logits/rejected": 6.9018049240112305, + "logps/chosen": -192.21493530273438, + "logps/rejected": -151.43930053710938, + "loss": 0.6701, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.02290806919336319, + "rewards/margins": 0.050026800483465195, + "rewards/rejected": -0.027118727564811707, + "step": 314 + }, + { + "epoch": 0.04871447902571042, + "grad_norm": 7.035672187805176, + "learning_rate": 8.118556701030928e-07, + "logits/chosen": 3.897305488586426, + "logits/rejected": 3.538041353225708, + "logps/chosen": -207.46163940429688, + "logps/rejected": -209.98765563964844, + "loss": 0.691, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.024081282317638397, + "rewards/margins": 0.009569570422172546, + "rewards/rejected": -0.033650852739810944, + "step": 315 + }, + { + "epoch": 0.048869128165474576, + "grad_norm": 7.0732340812683105, + "learning_rate": 8.144329896907217e-07, + "logits/chosen": 9.977954864501953, + "logits/rejected": 10.643462181091309, + "logps/chosen": -367.1545715332031, + "logps/rejected": -323.3158264160156, + "loss": 0.7364, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.059644319117069244, + "rewards/margins": -0.07982683181762695, + "rewards/rejected": 0.02018251270055771, + "step": 316 + }, + { + "epoch": 0.04902377730523874, + "grad_norm": 8.79511833190918, + "learning_rate": 8.170103092783506e-07, + "logits/chosen": 3.738468647003174, + "logits/rejected": 11.525028228759766, + "logps/chosen": -165.37423706054688, + "logps/rejected": -243.92523193359375, + "loss": 0.7152, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.0034049982205033302, + "rewards/margins": -0.03902578353881836, + "rewards/rejected": 0.042430780827999115, + "step": 317 + }, + { + "epoch": 0.0491784264450029, + "grad_norm": 4.1245951652526855, + "learning_rate": 8.195876288659795e-07, + "logits/chosen": 3.153686285018921, + "logits/rejected": 5.225451469421387, + "logps/chosen": -225.98306274414062, + "logps/rejected": -253.50357055664062, + "loss": 0.7139, + "rewards/accuracies": 0.25, + "rewards/chosen": -0.02681265026330948, + "rewards/margins": -0.039389848709106445, + "rewards/rejected": 0.012577196583151817, + "step": 318 + }, + { + "epoch": 0.04933307558476706, + "grad_norm": 5.788033962249756, + "learning_rate": 8.221649484536083e-07, + "logits/chosen": 14.813587188720703, + "logits/rejected": 14.116519927978516, + "logps/chosen": -235.96603393554688, + "logps/rejected": -232.52171325683594, + "loss": 0.6833, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.00020151096396148205, + "rewards/margins": 0.022872546687722206, + "rewards/rejected": -0.023074055090546608, + "step": 319 + }, + { + "epoch": 0.04948772472453122, + "grad_norm": 5.623997211456299, + "learning_rate": 8.247422680412372e-07, + "logits/chosen": 6.207259178161621, + "logits/rejected": -4.1912994384765625, + "logps/chosen": -315.69964599609375, + "logps/rejected": -167.5287628173828, + "loss": 0.689, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.04370555654168129, + "rewards/margins": 0.009932899847626686, + "rewards/rejected": 0.03377266228199005, + "step": 320 + }, + { + "epoch": 0.04964237386429538, + "grad_norm": 3.6895346641540527, + "learning_rate": 8.273195876288661e-07, + "logits/chosen": 11.903623580932617, + "logits/rejected": 8.772470474243164, + "logps/chosen": -230.1386260986328, + "logps/rejected": -203.36868286132812, + "loss": 0.7055, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.007718563079833984, + "rewards/margins": -0.02218818850815296, + "rewards/rejected": 0.014469623565673828, + "step": 321 + }, + { + "epoch": 0.04979702300405954, + "grad_norm": 4.731302261352539, + "learning_rate": 8.298969072164948e-07, + "logits/chosen": 13.479562759399414, + "logits/rejected": 6.947115898132324, + "logps/chosen": -373.03350830078125, + "logps/rejected": -224.48995971679688, + "loss": 0.6551, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.05569906532764435, + "rewards/margins": 0.08135590702295303, + "rewards/rejected": -0.025656841695308685, + "step": 322 + }, + { + "epoch": 0.0499516721438237, + "grad_norm": 4.305988788604736, + "learning_rate": 8.324742268041237e-07, + "logits/chosen": 9.580376625061035, + "logits/rejected": 0.8384977579116821, + "logps/chosen": -253.3452606201172, + "logps/rejected": -160.5526123046875, + "loss": 0.691, + "rewards/accuracies": 0.25, + "rewards/chosen": -0.0063598137348890305, + "rewards/margins": 0.01699228212237358, + "rewards/rejected": -0.02335209771990776, + "step": 323 + }, + { + "epoch": 0.05010632128358786, + "grad_norm": 4.122289657592773, + "learning_rate": 8.350515463917526e-07, + "logits/chosen": 14.331572532653809, + "logits/rejected": 11.76861572265625, + "logps/chosen": -265.4837341308594, + "logps/rejected": -267.43017578125, + "loss": 0.6954, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.0019929646514356136, + "rewards/margins": -0.002494524698704481, + "rewards/rejected": 0.004487491212785244, + "step": 324 + }, + { + "epoch": 0.05026097042335202, + "grad_norm": 6.475592613220215, + "learning_rate": 8.376288659793815e-07, + "logits/chosen": 10.418147087097168, + "logits/rejected": 2.835232734680176, + "logps/chosen": -562.50048828125, + "logps/rejected": -400.02081298828125, + "loss": 0.6729, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.04315614700317383, + "rewards/margins": 0.04643592983484268, + "rewards/rejected": -0.0032797809690237045, + "step": 325 + }, + { + "epoch": 0.05041561956311618, + "grad_norm": 4.158835411071777, + "learning_rate": 8.402061855670104e-07, + "logits/chosen": 17.390661239624023, + "logits/rejected": 7.49455451965332, + "logps/chosen": -279.53399658203125, + "logps/rejected": -149.30152893066406, + "loss": 0.6685, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.025096513330936432, + "rewards/margins": 0.057434868067502975, + "rewards/rejected": -0.08253137767314911, + "step": 326 + }, + { + "epoch": 0.05057026870288034, + "grad_norm": 9.585373878479004, + "learning_rate": 8.427835051546393e-07, + "logits/chosen": 8.694064140319824, + "logits/rejected": 3.928234100341797, + "logps/chosen": -238.50270080566406, + "logps/rejected": -144.52503967285156, + "loss": 0.6718, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.012478066608309746, + "rewards/margins": 0.04846363142132759, + "rewards/rejected": -0.035985566675662994, + "step": 327 + }, + { + "epoch": 0.0507249178426445, + "grad_norm": 4.9702301025390625, + "learning_rate": 8.453608247422682e-07, + "logits/chosen": 11.128905296325684, + "logits/rejected": 13.74989128112793, + "logps/chosen": -309.1045227050781, + "logps/rejected": -372.2152404785156, + "loss": 0.7297, + "rewards/accuracies": 0.25, + "rewards/chosen": -0.0483829528093338, + "rewards/margins": -0.06745920330286026, + "rewards/rejected": 0.019076254218816757, + "step": 328 + }, + { + "epoch": 0.05087956698240866, + "grad_norm": 9.810717582702637, + "learning_rate": 8.47938144329897e-07, + "logits/chosen": 6.248475551605225, + "logits/rejected": 4.073984146118164, + "logps/chosen": -238.91769409179688, + "logps/rejected": -212.77735900878906, + "loss": 0.6707, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.009897114709019661, + "rewards/margins": 0.053171947598457336, + "rewards/rejected": -0.06306906044483185, + "step": 329 + }, + { + "epoch": 0.05103421612217282, + "grad_norm": 5.923515319824219, + "learning_rate": 8.505154639175259e-07, + "logits/chosen": 8.707401275634766, + "logits/rejected": 12.377830505371094, + "logps/chosen": -207.44644165039062, + "logps/rejected": -218.87484741210938, + "loss": 0.6938, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.006574677303433418, + "rewards/margins": 0.0008665574714541435, + "rewards/rejected": -0.007441233843564987, + "step": 330 + }, + { + "epoch": 0.05118886526193698, + "grad_norm": 5.5287981033325195, + "learning_rate": 8.530927835051547e-07, + "logits/chosen": 11.679425239562988, + "logits/rejected": 7.240042209625244, + "logps/chosen": -285.194580078125, + "logps/rejected": -235.5878143310547, + "loss": 0.6694, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.0436800979077816, + "rewards/margins": 0.053113702684640884, + "rewards/rejected": -0.009433603845536709, + "step": 331 + }, + { + "epoch": 0.05134351440170114, + "grad_norm": 6.355861186981201, + "learning_rate": 8.556701030927836e-07, + "logits/chosen": 11.296228408813477, + "logits/rejected": 9.15643310546875, + "logps/chosen": -325.03143310546875, + "logps/rejected": -298.59246826171875, + "loss": 0.6962, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.004544067662209272, + "rewards/margins": -0.002562951296567917, + "rewards/rejected": -0.001981116831302643, + "step": 332 + }, + { + "epoch": 0.0514981635414653, + "grad_norm": 5.970607757568359, + "learning_rate": 8.582474226804124e-07, + "logits/chosen": 10.666913986206055, + "logits/rejected": 12.998638153076172, + "logps/chosen": -465.88665771484375, + "logps/rejected": -456.6786804199219, + "loss": 0.7013, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.0484885647892952, + "rewards/margins": -0.012132979929447174, + "rewards/rejected": 0.06062154844403267, + "step": 333 + }, + { + "epoch": 0.05165281268122946, + "grad_norm": 5.521275043487549, + "learning_rate": 8.608247422680413e-07, + "logits/chosen": 14.129186630249023, + "logits/rejected": 2.064903974533081, + "logps/chosen": -357.00714111328125, + "logps/rejected": -193.00653076171875, + "loss": 0.6755, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.027088262140750885, + "rewards/margins": 0.03876437991857529, + "rewards/rejected": -0.0116761215031147, + "step": 334 + }, + { + "epoch": 0.05180746182099362, + "grad_norm": 4.094925880432129, + "learning_rate": 8.634020618556702e-07, + "logits/chosen": 11.846107482910156, + "logits/rejected": 7.880058288574219, + "logps/chosen": -278.2975158691406, + "logps/rejected": -205.12425231933594, + "loss": 0.6777, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.01932988129556179, + "rewards/margins": 0.03342390060424805, + "rewards/rejected": -0.014094019308686256, + "step": 335 + }, + { + "epoch": 0.05196211096075778, + "grad_norm": 6.341585636138916, + "learning_rate": 8.65979381443299e-07, + "logits/chosen": 17.249914169311523, + "logits/rejected": 7.681830406188965, + "logps/chosen": -381.96405029296875, + "logps/rejected": -283.91192626953125, + "loss": 0.7154, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.11012067645788193, + "rewards/margins": -0.034940049052238464, + "rewards/rejected": -0.07518062740564346, + "step": 336 + }, + { + "epoch": 0.05211676010052194, + "grad_norm": 4.551817893981934, + "learning_rate": 8.685567010309279e-07, + "logits/chosen": 6.682127952575684, + "logits/rejected": 7.055165767669678, + "logps/chosen": -267.300048828125, + "logps/rejected": -249.1988067626953, + "loss": 0.7096, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.008320286870002747, + "rewards/margins": -0.023122448474168777, + "rewards/rejected": 0.03144273906946182, + "step": 337 + }, + { + "epoch": 0.0522714092402861, + "grad_norm": 6.360701560974121, + "learning_rate": 8.711340206185567e-07, + "logits/chosen": 10.616209030151367, + "logits/rejected": 11.711845397949219, + "logps/chosen": -366.0268859863281, + "logps/rejected": -393.9095764160156, + "loss": 0.7314, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.015015266835689545, + "rewards/margins": -0.06660046428442001, + "rewards/rejected": 0.05158519744873047, + "step": 338 + }, + { + "epoch": 0.05242605838005026, + "grad_norm": 5.620662212371826, + "learning_rate": 8.737113402061856e-07, + "logits/chosen": 15.40963363647461, + "logits/rejected": 14.104247093200684, + "logps/chosen": -314.3692626953125, + "logps/rejected": -269.7205810546875, + "loss": 0.6943, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.09499354660511017, + "rewards/margins": -0.0008111950010061264, + "rewards/rejected": 0.09580473601818085, + "step": 339 + }, + { + "epoch": 0.05258070751981442, + "grad_norm": 5.203341007232666, + "learning_rate": 8.762886597938144e-07, + "logits/chosen": 4.358460426330566, + "logits/rejected": 1.727196455001831, + "logps/chosen": -201.946044921875, + "logps/rejected": -136.56553649902344, + "loss": 0.7446, + "rewards/accuracies": 0.25, + "rewards/chosen": -0.05658016353845596, + "rewards/margins": -0.0963442325592041, + "rewards/rejected": 0.03976407274603844, + "step": 340 + }, + { + "epoch": 0.05273535665957858, + "grad_norm": 5.324661731719971, + "learning_rate": 8.788659793814433e-07, + "logits/chosen": 8.280628204345703, + "logits/rejected": 8.178197860717773, + "logps/chosen": -370.210205078125, + "logps/rejected": -334.6656799316406, + "loss": 0.6759, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.05555377155542374, + "rewards/margins": 0.03767814487218857, + "rewards/rejected": 0.01787562295794487, + "step": 341 + }, + { + "epoch": 0.05289000579934274, + "grad_norm": 7.864620685577393, + "learning_rate": 8.814432989690723e-07, + "logits/chosen": 6.158729076385498, + "logits/rejected": 4.1754961013793945, + "logps/chosen": -478.3458557128906, + "logps/rejected": -256.2119140625, + "loss": 0.7192, + "rewards/accuracies": 0.25, + "rewards/chosen": 0.014479974284768105, + "rewards/margins": -0.044060613960027695, + "rewards/rejected": 0.05854058265686035, + "step": 342 + }, + { + "epoch": 0.0530446549391069, + "grad_norm": 4.414979457855225, + "learning_rate": 8.840206185567011e-07, + "logits/chosen": 10.996040344238281, + "logits/rejected": 1.509592890739441, + "logps/chosen": -261.48602294921875, + "logps/rejected": -142.6768341064453, + "loss": 0.6914, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.019737720489501953, + "rewards/margins": 0.0056035518646240234, + "rewards/rejected": 0.01413416862487793, + "step": 343 + }, + { + "epoch": 0.05319930407887106, + "grad_norm": 4.6226091384887695, + "learning_rate": 8.8659793814433e-07, + "logits/chosen": 10.649694442749023, + "logits/rejected": 9.918621063232422, + "logps/chosen": -309.9231262207031, + "logps/rejected": -225.6995086669922, + "loss": 0.6927, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.006951688788831234, + "rewards/margins": 0.0020817983895540237, + "rewards/rejected": 0.004869889467954636, + "step": 344 + }, + { + "epoch": 0.053353953218635224, + "grad_norm": 4.1718878746032715, + "learning_rate": 8.891752577319589e-07, + "logits/chosen": 7.7937397956848145, + "logits/rejected": 8.358108520507812, + "logps/chosen": -270.802734375, + "logps/rejected": -261.93414306640625, + "loss": 0.6993, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.005881977267563343, + "rewards/margins": -0.009798050858080387, + "rewards/rejected": 0.01568002626299858, + "step": 345 + }, + { + "epoch": 0.05350860235839938, + "grad_norm": 5.7086501121521, + "learning_rate": 8.917525773195878e-07, + "logits/chosen": 12.025075912475586, + "logits/rejected": 0.27682721614837646, + "logps/chosen": -311.3431396484375, + "logps/rejected": -198.30581665039062, + "loss": 0.7112, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.05789661407470703, + "rewards/margins": -0.03056936338543892, + "rewards/rejected": -0.027327250689268112, + "step": 346 + }, + { + "epoch": 0.05366325149816354, + "grad_norm": 3.6815907955169678, + "learning_rate": 8.943298969072166e-07, + "logits/chosen": 6.982710838317871, + "logits/rejected": 6.452592849731445, + "logps/chosen": -211.49989318847656, + "logps/rejected": -182.47181701660156, + "loss": 0.6772, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.060828495770692825, + "rewards/margins": 0.035726070404052734, + "rewards/rejected": 0.02510242722928524, + "step": 347 + }, + { + "epoch": 0.0538179006379277, + "grad_norm": 4.95511531829834, + "learning_rate": 8.969072164948454e-07, + "logits/chosen": 14.31574821472168, + "logits/rejected": 16.112995147705078, + "logps/chosen": -292.53240966796875, + "logps/rejected": -384.8315124511719, + "loss": 0.7026, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.043743617832660675, + "rewards/margins": -0.008153248578310013, + "rewards/rejected": -0.035590361803770065, + "step": 348 + }, + { + "epoch": 0.053972549777691864, + "grad_norm": 6.050301551818848, + "learning_rate": 8.994845360824743e-07, + "logits/chosen": 10.782508850097656, + "logits/rejected": 10.924026489257812, + "logps/chosen": -328.68798828125, + "logps/rejected": -301.2613525390625, + "loss": 0.6922, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.008078956976532936, + "rewards/margins": 0.005491733551025391, + "rewards/rejected": 0.0025872234255075455, + "step": 349 + }, + { + "epoch": 0.05412719891745602, + "grad_norm": 4.550487041473389, + "learning_rate": 9.020618556701031e-07, + "logits/chosen": 7.750008583068848, + "logits/rejected": 6.9923810958862305, + "logps/chosen": -279.5172424316406, + "logps/rejected": -223.89041137695312, + "loss": 0.6841, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.0246523879468441, + "rewards/margins": 0.018850183114409447, + "rewards/rejected": 0.00580220390111208, + "step": 350 + }, + { + "epoch": 0.05428184805722018, + "grad_norm": 11.858559608459473, + "learning_rate": 9.04639175257732e-07, + "logits/chosen": 8.763167381286621, + "logits/rejected": 11.137678146362305, + "logps/chosen": -227.9757080078125, + "logps/rejected": -290.70648193359375, + "loss": 0.678, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.05212879180908203, + "rewards/margins": 0.039518166333436966, + "rewards/rejected": 0.01261062454432249, + "step": 351 + }, + { + "epoch": 0.05443649719698434, + "grad_norm": 5.670365810394287, + "learning_rate": 9.072164948453609e-07, + "logits/chosen": 13.587382316589355, + "logits/rejected": 10.373586654663086, + "logps/chosen": -338.270263671875, + "logps/rejected": -274.0187072753906, + "loss": 0.6978, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.028385069221258163, + "rewards/margins": -0.007985593751072884, + "rewards/rejected": 0.0363706573843956, + "step": 352 + }, + { + "epoch": 0.054591146336748504, + "grad_norm": 5.200290203094482, + "learning_rate": 9.097938144329898e-07, + "logits/chosen": 5.452160835266113, + "logits/rejected": 4.716856956481934, + "logps/chosen": -306.956787109375, + "logps/rejected": -302.99609375, + "loss": 0.6808, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.02313823625445366, + "rewards/margins": 0.031882576644420624, + "rewards/rejected": -0.008744340389966965, + "step": 353 + }, + { + "epoch": 0.05474579547651266, + "grad_norm": 4.956686496734619, + "learning_rate": 9.123711340206186e-07, + "logits/chosen": 15.669855117797852, + "logits/rejected": 5.030588626861572, + "logps/chosen": -255.5647735595703, + "logps/rejected": -134.20985412597656, + "loss": 0.6934, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.0011940496042370796, + "rewards/margins": 0.000958440825343132, + "rewards/rejected": -0.0021524904295802116, + "step": 354 + }, + { + "epoch": 0.05490044461627682, + "grad_norm": 5.248405933380127, + "learning_rate": 9.149484536082474e-07, + "logits/chosen": 12.432308197021484, + "logits/rejected": 10.941661834716797, + "logps/chosen": -366.5779113769531, + "logps/rejected": -343.7878112792969, + "loss": 0.7115, + "rewards/accuracies": 0.25, + "rewards/chosen": 0.0014962200075387955, + "rewards/margins": -0.022401336580514908, + "rewards/rejected": 0.023897551000118256, + "step": 355 + }, + { + "epoch": 0.05505509375604098, + "grad_norm": 4.469074726104736, + "learning_rate": 9.175257731958763e-07, + "logits/chosen": 6.67618989944458, + "logits/rejected": 10.412915229797363, + "logps/chosen": -209.1573028564453, + "logps/rejected": -242.23846435546875, + "loss": 0.6896, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.002662944607436657, + "rewards/margins": 0.011094575747847557, + "rewards/rejected": -0.013757515698671341, + "step": 356 + }, + { + "epoch": 0.055209742895805144, + "grad_norm": 4.443343162536621, + "learning_rate": 9.201030927835052e-07, + "logits/chosen": 12.131492614746094, + "logits/rejected": 9.86172866821289, + "logps/chosen": -294.4602355957031, + "logps/rejected": -263.77783203125, + "loss": 0.6849, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.010724161751568317, + "rewards/margins": 0.018154192715883255, + "rewards/rejected": -0.028878355398774147, + "step": 357 + }, + { + "epoch": 0.0553643920355693, + "grad_norm": 8.816765785217285, + "learning_rate": 9.226804123711341e-07, + "logits/chosen": 12.643805503845215, + "logits/rejected": 2.08780574798584, + "logps/chosen": -519.277587890625, + "logps/rejected": -241.80743408203125, + "loss": 0.7157, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.04626712203025818, + "rewards/margins": -0.03952989727258682, + "rewards/rejected": -0.0067372312769293785, + "step": 358 + }, + { + "epoch": 0.05551904117533346, + "grad_norm": 5.776669979095459, + "learning_rate": 9.25257731958763e-07, + "logits/chosen": 12.58725357055664, + "logits/rejected": 8.617329597473145, + "logps/chosen": -403.50390625, + "logps/rejected": -354.3662109375, + "loss": 0.6816, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.003914071246981621, + "rewards/margins": 0.027027608826756477, + "rewards/rejected": -0.03094167448580265, + "step": 359 + }, + { + "epoch": 0.05567369031509762, + "grad_norm": 6.46565055847168, + "learning_rate": 9.278350515463919e-07, + "logits/chosen": 9.653300285339355, + "logits/rejected": 9.97469711303711, + "logps/chosen": -298.61175537109375, + "logps/rejected": -283.46844482421875, + "loss": 0.6743, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.027576066553592682, + "rewards/margins": 0.0417148619890213, + "rewards/rejected": -0.06929092109203339, + "step": 360 + }, + { + "epoch": 0.055828339454861785, + "grad_norm": 6.373989105224609, + "learning_rate": 9.304123711340207e-07, + "logits/chosen": 8.019002914428711, + "logits/rejected": 11.971508979797363, + "logps/chosen": -237.29595947265625, + "logps/rejected": -309.50732421875, + "loss": 0.685, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.004565784707665443, + "rewards/margins": 0.017758727073669434, + "rewards/rejected": -0.013192940503358841, + "step": 361 + }, + { + "epoch": 0.05598298859462594, + "grad_norm": 5.160031795501709, + "learning_rate": 9.329896907216496e-07, + "logits/chosen": 8.270573616027832, + "logits/rejected": 12.278438568115234, + "logps/chosen": -298.4097900390625, + "logps/rejected": -364.43438720703125, + "loss": 0.6691, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.03375225141644478, + "rewards/margins": 0.0528591126203537, + "rewards/rejected": -0.08661137521266937, + "step": 362 + }, + { + "epoch": 0.0561376377343901, + "grad_norm": 3.9054408073425293, + "learning_rate": 9.355670103092785e-07, + "logits/chosen": 11.082379341125488, + "logits/rejected": 4.30116081237793, + "logps/chosen": -269.73187255859375, + "logps/rejected": -168.511474609375, + "loss": 0.6715, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.024539470672607422, + "rewards/margins": 0.04987955838441849, + "rewards/rejected": -0.025340082123875618, + "step": 363 + }, + { + "epoch": 0.05629228687415426, + "grad_norm": 5.839847564697266, + "learning_rate": 9.381443298969072e-07, + "logits/chosen": 6.032514572143555, + "logits/rejected": 7.4395222663879395, + "logps/chosen": -265.63519287109375, + "logps/rejected": -252.85031127929688, + "loss": 0.6561, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.07532262802124023, + "rewards/margins": 0.07692494988441467, + "rewards/rejected": -0.0016023144125938416, + "step": 364 + }, + { + "epoch": 0.056446936013918425, + "grad_norm": 3.9480528831481934, + "learning_rate": 9.407216494845361e-07, + "logits/chosen": 9.79598617553711, + "logits/rejected": 9.774332046508789, + "logps/chosen": -236.4007568359375, + "logps/rejected": -228.12460327148438, + "loss": 0.6995, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.008272027596831322, + "rewards/margins": -0.009929515421390533, + "rewards/rejected": 0.018201543018221855, + "step": 365 + }, + { + "epoch": 0.05660158515368258, + "grad_norm": 4.725208759307861, + "learning_rate": 9.43298969072165e-07, + "logits/chosen": 10.195878982543945, + "logits/rejected": 9.985554695129395, + "logps/chosen": -287.7193298339844, + "logps/rejected": -245.8714141845703, + "loss": 0.6939, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.00581636605784297, + "rewards/margins": 0.002316952683031559, + "rewards/rejected": -0.008133318275213242, + "step": 366 + }, + { + "epoch": 0.05675623429344674, + "grad_norm": 4.951709270477295, + "learning_rate": 9.458762886597939e-07, + "logits/chosen": 14.735419273376465, + "logits/rejected": 9.899681091308594, + "logps/chosen": -285.44085693359375, + "logps/rejected": -273.3875732421875, + "loss": 0.7172, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.04300088807940483, + "rewards/margins": -0.04202437400817871, + "rewards/rejected": -0.0009765159338712692, + "step": 367 + }, + { + "epoch": 0.0569108834332109, + "grad_norm": 4.781497001647949, + "learning_rate": 9.484536082474227e-07, + "logits/chosen": 15.991767883300781, + "logits/rejected": 8.424766540527344, + "logps/chosen": -238.76235961914062, + "logps/rejected": -197.5927734375, + "loss": 0.7247, + "rewards/accuracies": 0.125, + "rewards/chosen": -0.021326255053281784, + "rewards/margins": -0.060691025108098984, + "rewards/rejected": 0.0393647700548172, + "step": 368 + }, + { + "epoch": 0.057065532572975065, + "grad_norm": 5.740688800811768, + "learning_rate": 9.510309278350516e-07, + "logits/chosen": 10.063053131103516, + "logits/rejected": 2.9771573543548584, + "logps/chosen": -334.138427734375, + "logps/rejected": -214.73902893066406, + "loss": 0.6442, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.06613150238990784, + "rewards/margins": 0.10492897033691406, + "rewards/rejected": -0.03879747539758682, + "step": 369 + }, + { + "epoch": 0.057220181712739224, + "grad_norm": 4.534903049468994, + "learning_rate": 9.536082474226805e-07, + "logits/chosen": 11.396720886230469, + "logits/rejected": 6.763251781463623, + "logps/chosen": -216.68621826171875, + "logps/rejected": -183.06671142578125, + "loss": 0.6859, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.03284458816051483, + "rewards/margins": 0.017432022839784622, + "rewards/rejected": -0.050276611000299454, + "step": 370 + }, + { + "epoch": 0.05737483085250338, + "grad_norm": 7.5135016441345215, + "learning_rate": 9.561855670103093e-07, + "logits/chosen": 9.614764213562012, + "logits/rejected": 8.616850852966309, + "logps/chosen": -319.6884765625, + "logps/rejected": -315.4322509765625, + "loss": 0.7099, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.027391627430915833, + "rewards/margins": -0.02809848263859749, + "rewards/rejected": 0.05549011379480362, + "step": 371 + }, + { + "epoch": 0.05752947999226754, + "grad_norm": 6.570878505706787, + "learning_rate": 9.587628865979382e-07, + "logits/chosen": 6.676506042480469, + "logits/rejected": 3.8560619354248047, + "logps/chosen": -217.4125213623047, + "logps/rejected": -242.31954956054688, + "loss": 0.6844, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.0360744446516037, + "rewards/margins": 0.02077498659491539, + "rewards/rejected": 0.015299463644623756, + "step": 372 + }, + { + "epoch": 0.057684129132031706, + "grad_norm": 3.395559310913086, + "learning_rate": 9.61340206185567e-07, + "logits/chosen": 7.980839729309082, + "logits/rejected": 13.185905456542969, + "logps/chosen": -166.61965942382812, + "logps/rejected": -147.8430633544922, + "loss": 0.7083, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.025401689112186432, + "rewards/margins": -0.02807903103530407, + "rewards/rejected": 0.0026773447170853615, + "step": 373 + }, + { + "epoch": 0.057838778271795864, + "grad_norm": 4.8726911544799805, + "learning_rate": 9.63917525773196e-07, + "logits/chosen": 12.443926811218262, + "logits/rejected": 15.207575798034668, + "logps/chosen": -322.5223083496094, + "logps/rejected": -278.626953125, + "loss": 0.6744, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.0005522725405171514, + "rewards/margins": 0.03925461694598198, + "rewards/rejected": -0.0387023463845253, + "step": 374 + }, + { + "epoch": 0.05799342741156002, + "grad_norm": 5.701084136962891, + "learning_rate": 9.664948453608248e-07, + "logits/chosen": 13.062406539916992, + "logits/rejected": 4.93897819519043, + "logps/chosen": -365.8446960449219, + "logps/rejected": -220.0662078857422, + "loss": 0.6676, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.08141865581274033, + "rewards/margins": 0.054802462458610535, + "rewards/rejected": 0.026616191491484642, + "step": 375 + }, + { + "epoch": 0.05814807655132418, + "grad_norm": 4.8684844970703125, + "learning_rate": 9.690721649484537e-07, + "logits/chosen": 14.162793159484863, + "logits/rejected": 10.294622421264648, + "logps/chosen": -398.8664855957031, + "logps/rejected": -331.074951171875, + "loss": 0.7148, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.047443680465221405, + "rewards/margins": -0.03860035538673401, + "rewards/rejected": -0.00884332600980997, + "step": 376 + }, + { + "epoch": 0.058302725691088346, + "grad_norm": 5.442417621612549, + "learning_rate": 9.716494845360826e-07, + "logits/chosen": 9.5631103515625, + "logits/rejected": 9.802350044250488, + "logps/chosen": -315.45654296875, + "logps/rejected": -310.20556640625, + "loss": 0.6863, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.013831520453095436, + "rewards/margins": 0.017550181597471237, + "rewards/rejected": -0.0037186630070209503, + "step": 377 + }, + { + "epoch": 0.058457374830852504, + "grad_norm": 6.687643051147461, + "learning_rate": 9.742268041237114e-07, + "logits/chosen": 14.423138618469238, + "logits/rejected": 13.543318748474121, + "logps/chosen": -313.0621643066406, + "logps/rejected": -310.69012451171875, + "loss": 0.6815, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.03683624416589737, + "rewards/margins": 0.0302339568734169, + "rewards/rejected": 0.006602289155125618, + "step": 378 + }, + { + "epoch": 0.05861202397061666, + "grad_norm": 5.6178812980651855, + "learning_rate": 9.768041237113403e-07, + "logits/chosen": 9.242607116699219, + "logits/rejected": 8.472833633422852, + "logps/chosen": -381.8070983886719, + "logps/rejected": -400.93890380859375, + "loss": 0.6787, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.040642257779836655, + "rewards/margins": 0.036738112568855286, + "rewards/rejected": 0.003904152661561966, + "step": 379 + }, + { + "epoch": 0.05876667311038082, + "grad_norm": 3.849717378616333, + "learning_rate": 9.793814432989692e-07, + "logits/chosen": 6.134887218475342, + "logits/rejected": 7.432956218719482, + "logps/chosen": -202.69830322265625, + "logps/rejected": -218.01083374023438, + "loss": 0.6905, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.003968000411987305, + "rewards/margins": 0.01186499372124672, + "rewards/rejected": -0.015832997858524323, + "step": 380 + }, + { + "epoch": 0.058921322250144986, + "grad_norm": 4.675786972045898, + "learning_rate": 9.81958762886598e-07, + "logits/chosen": 13.579373359680176, + "logits/rejected": 11.059019088745117, + "logps/chosen": -277.2948913574219, + "logps/rejected": -249.16050720214844, + "loss": 0.6668, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.03067338652908802, + "rewards/margins": 0.05679219216108322, + "rewards/rejected": -0.0261188056319952, + "step": 381 + }, + { + "epoch": 0.059075971389909145, + "grad_norm": 6.958254814147949, + "learning_rate": 9.84536082474227e-07, + "logits/chosen": 13.963619232177734, + "logits/rejected": 11.869502067565918, + "logps/chosen": -339.5461120605469, + "logps/rejected": -330.08624267578125, + "loss": 0.7129, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.03049173578619957, + "rewards/margins": -0.03578352928161621, + "rewards/rejected": 0.005291796289384365, + "step": 382 + }, + { + "epoch": 0.0592306205296733, + "grad_norm": 5.348432540893555, + "learning_rate": 9.871134020618558e-07, + "logits/chosen": 11.107304573059082, + "logits/rejected": 8.949459075927734, + "logps/chosen": -354.39605712890625, + "logps/rejected": -313.8327331542969, + "loss": 0.6789, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.012913036160171032, + "rewards/margins": 0.030461497604846954, + "rewards/rejected": -0.017548464238643646, + "step": 383 + }, + { + "epoch": 0.05938526966943746, + "grad_norm": 6.003533363342285, + "learning_rate": 9.896907216494845e-07, + "logits/chosen": 5.539496421813965, + "logits/rejected": 10.436893463134766, + "logps/chosen": -300.509765625, + "logps/rejected": -333.1765441894531, + "loss": 0.6981, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.00812673382461071, + "rewards/margins": -0.008057691156864166, + "rewards/rejected": 0.016184426844120026, + "step": 384 + }, + { + "epoch": 0.05953991880920163, + "grad_norm": 4.731614112854004, + "learning_rate": 9.922680412371133e-07, + "logits/chosen": 9.498493194580078, + "logits/rejected": 5.924538612365723, + "logps/chosen": -375.8652648925781, + "logps/rejected": -273.45050048828125, + "loss": 0.7049, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.008946752175688744, + "rewards/margins": -0.018858052790164948, + "rewards/rejected": 0.027804803103208542, + "step": 385 + }, + { + "epoch": 0.059694567948965785, + "grad_norm": 4.286332607269287, + "learning_rate": 9.948453608247422e-07, + "logits/chosen": 14.273489952087402, + "logits/rejected": 7.135898113250732, + "logps/chosen": -252.98452758789062, + "logps/rejected": -209.14920043945312, + "loss": 0.7084, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.0075575849041342735, + "rewards/margins": -0.018668843433260918, + "rewards/rejected": 0.026226425543427467, + "step": 386 + }, + { + "epoch": 0.05984921708872994, + "grad_norm": 3.7653305530548096, + "learning_rate": 9.974226804123713e-07, + "logits/chosen": 9.252832412719727, + "logits/rejected": 8.167980194091797, + "logps/chosen": -131.02125549316406, + "logps/rejected": -157.9849853515625, + "loss": 0.6939, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.006353235337883234, + "rewards/margins": -4.544481635093689e-05, + "rewards/rejected": -0.006307792849838734, + "step": 387 + }, + { + "epoch": 0.0600038662284941, + "grad_norm": 5.94622278213501, + "learning_rate": 1.0000000000000002e-06, + "logits/chosen": 15.253323554992676, + "logits/rejected": 13.738725662231445, + "logps/chosen": -405.9993896484375, + "logps/rejected": -355.36004638671875, + "loss": 0.6922, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.010611534118652344, + "rewards/margins": 0.009074783883988857, + "rewards/rejected": -0.019686317071318626, + "step": 388 + }, + { + "epoch": 0.06015851536825827, + "grad_norm": 5.627812385559082, + "learning_rate": 1.002577319587629e-06, + "logits/chosen": 7.367169380187988, + "logits/rejected": 11.624138832092285, + "logps/chosen": -234.81594848632812, + "logps/rejected": -327.74371337890625, + "loss": 0.7235, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.04003448784351349, + "rewards/margins": -0.055303286761045456, + "rewards/rejected": 0.015268802642822266, + "step": 389 + }, + { + "epoch": 0.060313164508022425, + "grad_norm": 4.381931781768799, + "learning_rate": 1.005154639175258e-06, + "logits/chosen": 10.94035530090332, + "logits/rejected": 4.489120006561279, + "logps/chosen": -360.4523620605469, + "logps/rejected": -250.97390747070312, + "loss": 0.6892, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.0281643345952034, + "rewards/margins": 0.011478759348392487, + "rewards/rejected": -0.039643097668886185, + "step": 390 + }, + { + "epoch": 0.060467813647786584, + "grad_norm": 5.554924964904785, + "learning_rate": 1.0077319587628868e-06, + "logits/chosen": 10.382328033447266, + "logits/rejected": 10.860540390014648, + "logps/chosen": -305.2872009277344, + "logps/rejected": -389.4974670410156, + "loss": 0.6972, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.019063852727413177, + "rewards/margins": -0.0033590756356716156, + "rewards/rejected": 0.02242293208837509, + "step": 391 + }, + { + "epoch": 0.06062246278755074, + "grad_norm": 6.395705223083496, + "learning_rate": 1.0103092783505157e-06, + "logits/chosen": 8.464802742004395, + "logits/rejected": 6.494001388549805, + "logps/chosen": -419.99530029296875, + "logps/rejected": -296.424560546875, + "loss": 0.692, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.020837876945734024, + "rewards/margins": 0.009765293449163437, + "rewards/rejected": -0.03060317412018776, + "step": 392 + }, + { + "epoch": 0.06077711192731491, + "grad_norm": 4.983519077301025, + "learning_rate": 1.0128865979381445e-06, + "logits/chosen": 14.822391510009766, + "logits/rejected": 12.447269439697266, + "logps/chosen": -376.1868896484375, + "logps/rejected": -295.7694396972656, + "loss": 0.6795, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.06785649806261063, + "rewards/margins": 0.0322817787528038, + "rewards/rejected": 0.03557472676038742, + "step": 393 + }, + { + "epoch": 0.060931761067079065, + "grad_norm": 46.94654083251953, + "learning_rate": 1.0154639175257732e-06, + "logits/chosen": 10.589373588562012, + "logits/rejected": 3.4049954414367676, + "logps/chosen": -168.27459716796875, + "logps/rejected": -220.32583618164062, + "loss": 0.6713, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.04598045349121094, + "rewards/margins": 0.045572809875011444, + "rewards/rejected": 0.0004076478071510792, + "step": 394 + }, + { + "epoch": 0.061086410206843224, + "grad_norm": 5.823394775390625, + "learning_rate": 1.018041237113402e-06, + "logits/chosen": 7.790483474731445, + "logits/rejected": 12.002765655517578, + "logps/chosen": -220.02870178222656, + "logps/rejected": -320.35577392578125, + "loss": 0.7181, + "rewards/accuracies": 0.25, + "rewards/chosen": 0.003933548927307129, + "rewards/margins": -0.045825716108083725, + "rewards/rejected": 0.04975926876068115, + "step": 395 + }, + { + "epoch": 0.06124105934660738, + "grad_norm": 4.505932807922363, + "learning_rate": 1.020618556701031e-06, + "logits/chosen": 9.639538764953613, + "logits/rejected": 7.1267900466918945, + "logps/chosen": -224.4041748046875, + "logps/rejected": -176.70773315429688, + "loss": 0.6818, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.02993164025247097, + "rewards/margins": 0.02464001253247261, + "rewards/rejected": 0.00529162771999836, + "step": 396 + }, + { + "epoch": 0.06139570848637155, + "grad_norm": 5.793320178985596, + "learning_rate": 1.0231958762886598e-06, + "logits/chosen": 12.236940383911133, + "logits/rejected": 11.046968460083008, + "logps/chosen": -365.28973388671875, + "logps/rejected": -373.0707702636719, + "loss": 0.6593, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.052540771663188934, + "rewards/margins": 0.08348321914672852, + "rewards/rejected": -0.030942440032958984, + "step": 397 + }, + { + "epoch": 0.061550357626135706, + "grad_norm": 4.560193061828613, + "learning_rate": 1.0257731958762887e-06, + "logits/chosen": 10.098865509033203, + "logits/rejected": 8.987324714660645, + "logps/chosen": -281.2982177734375, + "logps/rejected": -271.20001220703125, + "loss": 0.6421, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.031172750517725945, + "rewards/margins": 0.10895471274852753, + "rewards/rejected": -0.07778196781873703, + "step": 398 + }, + { + "epoch": 0.061705006765899864, + "grad_norm": 3.6590681076049805, + "learning_rate": 1.0283505154639175e-06, + "logits/chosen": 3.6563708782196045, + "logits/rejected": 9.398565292358398, + "logps/chosen": -106.84200286865234, + "logps/rejected": -124.43630981445312, + "loss": 0.7112, + "rewards/accuracies": 0.25, + "rewards/chosen": -0.018233537673950195, + "rewards/margins": -0.03455691412091255, + "rewards/rejected": 0.016323376446962357, + "step": 399 + }, + { + "epoch": 0.06185965590566402, + "grad_norm": 5.60069465637207, + "learning_rate": 1.0309278350515464e-06, + "logits/chosen": 11.04920768737793, + "logits/rejected": 12.640439987182617, + "logps/chosen": -289.4459228515625, + "logps/rejected": -264.5858154296875, + "loss": 0.6786, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.01958027109503746, + "rewards/margins": 0.035243794322013855, + "rewards/rejected": -0.015663526952266693, + "step": 400 + }, + { + "epoch": 0.06201430504542819, + "grad_norm": 5.548246383666992, + "learning_rate": 1.0335051546391753e-06, + "logits/chosen": 8.104073524475098, + "logits/rejected": 9.847138404846191, + "logps/chosen": -281.1682434082031, + "logps/rejected": -272.9338684082031, + "loss": 0.6835, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.009098435752093792, + "rewards/margins": 0.023684311658143997, + "rewards/rejected": -0.01458587683737278, + "step": 401 + }, + { + "epoch": 0.062168954185192346, + "grad_norm": 4.97169828414917, + "learning_rate": 1.0360824742268044e-06, + "logits/chosen": 15.493053436279297, + "logits/rejected": 2.851205348968506, + "logps/chosen": -234.340576171875, + "logps/rejected": -148.24948120117188, + "loss": 0.7244, + "rewards/accuracies": 0.25, + "rewards/chosen": -0.010373781435191631, + "rewards/margins": -0.057037945836782455, + "rewards/rejected": 0.046664170920848846, + "step": 402 + }, + { + "epoch": 0.062323603324956504, + "grad_norm": 5.792660713195801, + "learning_rate": 1.038659793814433e-06, + "logits/chosen": 13.377229690551758, + "logits/rejected": 8.254720687866211, + "logps/chosen": -370.7276611328125, + "logps/rejected": -287.15008544921875, + "loss": 0.6679, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.06946516036987305, + "rewards/margins": 0.05564098805189133, + "rewards/rejected": 0.013824177905917168, + "step": 403 + }, + { + "epoch": 0.06247825246472066, + "grad_norm": 3.400301933288574, + "learning_rate": 1.041237113402062e-06, + "logits/chosen": 7.354280471801758, + "logits/rejected": 4.088191032409668, + "logps/chosen": -177.28558349609375, + "logps/rejected": -170.65638732910156, + "loss": 0.6603, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.031157229095697403, + "rewards/margins": 0.07195504009723663, + "rewards/rejected": -0.04079780727624893, + "step": 404 + }, + { + "epoch": 0.06263290160448483, + "grad_norm": 4.521085262298584, + "learning_rate": 1.0438144329896908e-06, + "logits/chosen": 15.999117851257324, + "logits/rejected": 7.129674434661865, + "logps/chosen": -287.62994384765625, + "logps/rejected": -186.813720703125, + "loss": 0.6759, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.010795880109071732, + "rewards/margins": 0.04042024165391922, + "rewards/rejected": -0.05121612548828125, + "step": 405 + }, + { + "epoch": 0.06278755074424898, + "grad_norm": 3.939335823059082, + "learning_rate": 1.0463917525773196e-06, + "logits/chosen": 14.739656448364258, + "logits/rejected": 12.417524337768555, + "logps/chosen": -256.73895263671875, + "logps/rejected": -189.15966796875, + "loss": 0.6896, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.030969714745879173, + "rewards/margins": 0.013964174315333366, + "rewards/rejected": 0.017005540430545807, + "step": 406 + }, + { + "epoch": 0.06294219988401314, + "grad_norm": 4.425426483154297, + "learning_rate": 1.0489690721649485e-06, + "logits/chosen": 11.847260475158691, + "logits/rejected": 7.343934535980225, + "logps/chosen": -186.3024444580078, + "logps/rejected": -154.26666259765625, + "loss": 0.7195, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.04618697240948677, + "rewards/margins": -0.04906139522790909, + "rewards/rejected": 0.0028744228184223175, + "step": 407 + }, + { + "epoch": 0.06309684902377731, + "grad_norm": 6.185189247131348, + "learning_rate": 1.0515463917525774e-06, + "logits/chosen": 5.740227699279785, + "logits/rejected": 9.117938041687012, + "logps/chosen": -327.19561767578125, + "logps/rejected": -353.5487060546875, + "loss": 0.6835, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.005328178405761719, + "rewards/margins": 0.024118423461914062, + "rewards/rejected": -0.018790245056152344, + "step": 408 + }, + { + "epoch": 0.06325149816354146, + "grad_norm": 4.2316508293151855, + "learning_rate": 1.0541237113402063e-06, + "logits/chosen": 7.834961891174316, + "logits/rejected": 7.438101291656494, + "logps/chosen": -191.24432373046875, + "logps/rejected": -211.92042541503906, + "loss": 0.6478, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.06866011768579483, + "rewards/margins": 0.10433387756347656, + "rewards/rejected": -0.03567375987768173, + "step": 409 + }, + { + "epoch": 0.06340614730330563, + "grad_norm": 5.276022911071777, + "learning_rate": 1.0567010309278351e-06, + "logits/chosen": 7.849164009094238, + "logits/rejected": 8.43691635131836, + "logps/chosen": -310.4337158203125, + "logps/rejected": -318.9949951171875, + "loss": 0.7336, + "rewards/accuracies": 0.25, + "rewards/chosen": -0.044051505625247955, + "rewards/margins": -0.0714423730969429, + "rewards/rejected": 0.027390865609049797, + "step": 410 + }, + { + "epoch": 0.06356079644306979, + "grad_norm": 5.893574237823486, + "learning_rate": 1.059278350515464e-06, + "logits/chosen": 6.151208877563477, + "logits/rejected": 6.023972511291504, + "logps/chosen": -233.06085205078125, + "logps/rejected": -253.8050537109375, + "loss": 0.6646, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.05469723045825958, + "rewards/margins": 0.05942115932703018, + "rewards/rejected": -0.004723930731415749, + "step": 411 + }, + { + "epoch": 0.06371544558283394, + "grad_norm": 4.401363372802734, + "learning_rate": 1.0618556701030929e-06, + "logits/chosen": 12.957690238952637, + "logits/rejected": 9.535648345947266, + "logps/chosen": -304.7477722167969, + "logps/rejected": -230.508056640625, + "loss": 0.7193, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.0379658006131649, + "rewards/margins": -0.04407656192779541, + "rewards/rejected": 0.006110764108598232, + "step": 412 + }, + { + "epoch": 0.06387009472259811, + "grad_norm": 3.616117477416992, + "learning_rate": 1.0644329896907218e-06, + "logits/chosen": 11.035181999206543, + "logits/rejected": 11.399690628051758, + "logps/chosen": -139.2234344482422, + "logps/rejected": -147.9811248779297, + "loss": 0.7337, + "rewards/accuracies": 0.125, + "rewards/chosen": -0.03679187595844269, + "rewards/margins": -0.07756879925727844, + "rewards/rejected": 0.040776923298835754, + "step": 413 + }, + { + "epoch": 0.06402474386236226, + "grad_norm": 4.700901985168457, + "learning_rate": 1.0670103092783506e-06, + "logits/chosen": 7.755059719085693, + "logits/rejected": 18.040864944458008, + "logps/chosen": -175.4807891845703, + "logps/rejected": -251.79000854492188, + "loss": 0.7011, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.009173011407256126, + "rewards/margins": -0.012544157914817333, + "rewards/rejected": 0.003371143713593483, + "step": 414 + }, + { + "epoch": 0.06417939300212643, + "grad_norm": 3.2760438919067383, + "learning_rate": 1.0695876288659795e-06, + "logits/chosen": 6.526534080505371, + "logits/rejected": 7.675030708312988, + "logps/chosen": -130.85191345214844, + "logps/rejected": -144.63307189941406, + "loss": 0.6737, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.06054973974823952, + "rewards/margins": 0.04247203469276428, + "rewards/rejected": 0.018077708780765533, + "step": 415 + }, + { + "epoch": 0.06433404214189059, + "grad_norm": 6.944642543792725, + "learning_rate": 1.0721649484536084e-06, + "logits/chosen": 9.642186164855957, + "logits/rejected": 6.517422676086426, + "logps/chosen": -372.90985107421875, + "logps/rejected": -269.1368103027344, + "loss": 0.7411, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.04121983051300049, + "rewards/margins": -0.08632919937372208, + "rewards/rejected": 0.04510936886072159, + "step": 416 + }, + { + "epoch": 0.06448869128165474, + "grad_norm": 5.0813140869140625, + "learning_rate": 1.0747422680412372e-06, + "logits/chosen": 12.539143562316895, + "logits/rejected": 10.774240493774414, + "logps/chosen": -312.68560791015625, + "logps/rejected": -258.262939453125, + "loss": 0.7357, + "rewards/accuracies": 0.25, + "rewards/chosen": -0.08162746578454971, + "rewards/margins": -0.07941237092018127, + "rewards/rejected": -0.0022150976583361626, + "step": 417 + }, + { + "epoch": 0.06464334042141891, + "grad_norm": 4.718321800231934, + "learning_rate": 1.0773195876288661e-06, + "logits/chosen": 11.76107120513916, + "logits/rejected": 1.2383140325546265, + "logps/chosen": -248.8386688232422, + "logps/rejected": -154.86441040039062, + "loss": 0.7073, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.017749402672052383, + "rewards/margins": -0.019788123667240143, + "rewards/rejected": 0.037537530064582825, + "step": 418 + }, + { + "epoch": 0.06479798956118307, + "grad_norm": 3.8745291233062744, + "learning_rate": 1.079896907216495e-06, + "logits/chosen": 8.4127836227417, + "logits/rejected": 10.399481773376465, + "logps/chosen": -229.52426147460938, + "logps/rejected": -256.41644287109375, + "loss": 0.6635, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.00891094096004963, + "rewards/margins": 0.07323947548866272, + "rewards/rejected": -0.06432852894067764, + "step": 419 + }, + { + "epoch": 0.06495263870094722, + "grad_norm": 7.327706336975098, + "learning_rate": 1.0824742268041239e-06, + "logits/chosen": 11.710941314697266, + "logits/rejected": 4.853361129760742, + "logps/chosen": -187.626220703125, + "logps/rejected": -176.29119873046875, + "loss": 0.6926, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.020308634266257286, + "rewards/margins": 0.006362845189869404, + "rewards/rejected": 0.013945795595645905, + "step": 420 + }, + { + "epoch": 0.06510728784071139, + "grad_norm": 5.054439544677734, + "learning_rate": 1.0850515463917527e-06, + "logits/chosen": 7.633783340454102, + "logits/rejected": 9.53019905090332, + "logps/chosen": -332.91033935546875, + "logps/rejected": -347.2127990722656, + "loss": 0.6818, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.0553162582218647, + "rewards/margins": 0.02460308186709881, + "rewards/rejected": 0.030713174492120743, + "step": 421 + }, + { + "epoch": 0.06526193698047554, + "grad_norm": 4.469846725463867, + "learning_rate": 1.0876288659793816e-06, + "logits/chosen": 12.601400375366211, + "logits/rejected": 6.941188335418701, + "logps/chosen": -297.45391845703125, + "logps/rejected": -243.72630310058594, + "loss": 0.722, + "rewards/accuracies": 0.25, + "rewards/chosen": -0.02540598064661026, + "rewards/margins": -0.05200149863958359, + "rewards/rejected": 0.026595521718263626, + "step": 422 + }, + { + "epoch": 0.0654165861202397, + "grad_norm": 3.918545961380005, + "learning_rate": 1.0902061855670105e-06, + "logits/chosen": 10.874977111816406, + "logits/rejected": 5.981527328491211, + "logps/chosen": -258.41192626953125, + "logps/rejected": -224.7599639892578, + "loss": 0.6514, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.07436370849609375, + "rewards/margins": 0.08920204639434814, + "rewards/rejected": -0.01483833882957697, + "step": 423 + }, + { + "epoch": 0.06557123526000387, + "grad_norm": 5.158308029174805, + "learning_rate": 1.0927835051546393e-06, + "logits/chosen": 13.492680549621582, + "logits/rejected": 7.164361953735352, + "logps/chosen": -385.2947998046875, + "logps/rejected": -204.1776885986328, + "loss": 0.685, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.006291484460234642, + "rewards/margins": 0.01792421191930771, + "rewards/rejected": -0.011632728390395641, + "step": 424 + }, + { + "epoch": 0.06572588439976802, + "grad_norm": 6.421451568603516, + "learning_rate": 1.0953608247422682e-06, + "logits/chosen": 14.007320404052734, + "logits/rejected": 14.686290740966797, + "logps/chosen": -375.481689453125, + "logps/rejected": -330.3765869140625, + "loss": 0.6328, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.02172689512372017, + "rewards/margins": 0.12835693359375, + "rewards/rejected": -0.10663004219532013, + "step": 425 + }, + { + "epoch": 0.06588053353953219, + "grad_norm": 5.324506759643555, + "learning_rate": 1.097938144329897e-06, + "logits/chosen": 9.088860511779785, + "logits/rejected": 9.94772720336914, + "logps/chosen": -412.4358825683594, + "logps/rejected": -346.6433410644531, + "loss": 0.6596, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.02656860277056694, + "rewards/margins": 0.08164462447166443, + "rewards/rejected": -0.05507602542638779, + "step": 426 + }, + { + "epoch": 0.06603518267929635, + "grad_norm": 10.140251159667969, + "learning_rate": 1.1005154639175257e-06, + "logits/chosen": 12.245105743408203, + "logits/rejected": 8.658808708190918, + "logps/chosen": -461.3382568359375, + "logps/rejected": -376.05560302734375, + "loss": 0.7176, + "rewards/accuracies": 0.25, + "rewards/chosen": -0.06021628528833389, + "rewards/margins": -0.04364652931690216, + "rewards/rejected": -0.01656975969672203, + "step": 427 + }, + { + "epoch": 0.0661898318190605, + "grad_norm": 3.773886203765869, + "learning_rate": 1.1030927835051546e-06, + "logits/chosen": 10.883367538452148, + "logits/rejected": 6.539021015167236, + "logps/chosen": -315.97607421875, + "logps/rejected": -244.35726928710938, + "loss": 0.6606, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.045493461191654205, + "rewards/margins": 0.07271971553564072, + "rewards/rejected": -0.02722625620663166, + "step": 428 + }, + { + "epoch": 0.06634448095882467, + "grad_norm": 5.636545658111572, + "learning_rate": 1.1056701030927835e-06, + "logits/chosen": 5.362805366516113, + "logits/rejected": 1.5850732326507568, + "logps/chosen": -365.0799255371094, + "logps/rejected": -250.87652587890625, + "loss": 0.7222, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.08530960232019424, + "rewards/margins": -0.055381014943122864, + "rewards/rejected": -0.02992858737707138, + "step": 429 + }, + { + "epoch": 0.06649913009858882, + "grad_norm": 5.54054594039917, + "learning_rate": 1.1082474226804124e-06, + "logits/chosen": 6.456350803375244, + "logits/rejected": 5.648205757141113, + "logps/chosen": -317.45147705078125, + "logps/rejected": -265.6895751953125, + "loss": 0.6542, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.04060354083776474, + "rewards/margins": 0.08768625557422638, + "rewards/rejected": -0.04708271473646164, + "step": 430 + }, + { + "epoch": 0.06665377923835299, + "grad_norm": 4.973996162414551, + "learning_rate": 1.1108247422680412e-06, + "logits/chosen": 14.110208511352539, + "logits/rejected": 10.118903160095215, + "logps/chosen": -272.4558410644531, + "logps/rejected": -246.0497283935547, + "loss": 0.7095, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.015845393761992455, + "rewards/margins": -0.029163219034671783, + "rewards/rejected": 0.01331782341003418, + "step": 431 + }, + { + "epoch": 0.06680842837811715, + "grad_norm": 7.229224681854248, + "learning_rate": 1.1134020618556703e-06, + "logits/chosen": 14.000032424926758, + "logits/rejected": 11.125240325927734, + "logps/chosen": -401.0099792480469, + "logps/rejected": -432.0771484375, + "loss": 0.7027, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.0053405724465847015, + "rewards/margins": -0.01319103129208088, + "rewards/rejected": 0.007850456051528454, + "step": 432 + }, + { + "epoch": 0.0669630775178813, + "grad_norm": 5.13885498046875, + "learning_rate": 1.1159793814432992e-06, + "logits/chosen": 15.442996978759766, + "logits/rejected": 11.216538429260254, + "logps/chosen": -385.5250549316406, + "logps/rejected": -268.22015380859375, + "loss": 0.719, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.004207659512758255, + "rewards/margins": -0.04783777892589569, + "rewards/rejected": 0.052045442163944244, + "step": 433 + }, + { + "epoch": 0.06711772665764547, + "grad_norm": 4.798912048339844, + "learning_rate": 1.118556701030928e-06, + "logits/chosen": 11.21569538116455, + "logits/rejected": 10.157526016235352, + "logps/chosen": -246.6448974609375, + "logps/rejected": -230.35739135742188, + "loss": 0.7533, + "rewards/accuracies": 0.0, + "rewards/chosen": -0.031946610659360886, + "rewards/margins": -0.11454010009765625, + "rewards/rejected": 0.08259349316358566, + "step": 434 + }, + { + "epoch": 0.06727237579740963, + "grad_norm": 4.816092491149902, + "learning_rate": 1.121134020618557e-06, + "logits/chosen": 6.873818397521973, + "logits/rejected": 2.828075408935547, + "logps/chosen": -271.92919921875, + "logps/rejected": -181.554443359375, + "loss": 0.72, + "rewards/accuracies": 0.125, + "rewards/chosen": -0.03334550932049751, + "rewards/margins": -0.05021844431757927, + "rewards/rejected": 0.01687292940914631, + "step": 435 + }, + { + "epoch": 0.06742702493717379, + "grad_norm": 4.457712173461914, + "learning_rate": 1.1237113402061856e-06, + "logits/chosen": 13.320279121398926, + "logits/rejected": 9.802282333374023, + "logps/chosen": -330.33319091796875, + "logps/rejected": -269.45849609375, + "loss": 0.6793, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.017458343878388405, + "rewards/margins": 0.03067474253475666, + "rewards/rejected": -0.013216400519013405, + "step": 436 + }, + { + "epoch": 0.06758167407693795, + "grad_norm": 4.063880920410156, + "learning_rate": 1.1262886597938145e-06, + "logits/chosen": 16.130605697631836, + "logits/rejected": 12.348139762878418, + "logps/chosen": -198.15565490722656, + "logps/rejected": -178.13626098632812, + "loss": 0.6857, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.02518201246857643, + "rewards/margins": 0.015363645739853382, + "rewards/rejected": 0.0098183648660779, + "step": 437 + }, + { + "epoch": 0.0677363232167021, + "grad_norm": 4.596961498260498, + "learning_rate": 1.1288659793814433e-06, + "logits/chosen": 9.14218807220459, + "logits/rejected": 10.569528579711914, + "logps/chosen": -193.8854217529297, + "logps/rejected": -218.80386352539062, + "loss": 0.7241, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.02099437825381756, + "rewards/margins": -0.05657162517309189, + "rewards/rejected": 0.03557724878191948, + "step": 438 + }, + { + "epoch": 0.06789097235646627, + "grad_norm": 5.912600517272949, + "learning_rate": 1.1314432989690722e-06, + "logits/chosen": 9.41981315612793, + "logits/rejected": 6.537056922912598, + "logps/chosen": -351.82135009765625, + "logps/rejected": -227.25894165039062, + "loss": 0.7363, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.13925524055957794, + "rewards/margins": -0.07983632385730743, + "rewards/rejected": -0.05941891670227051, + "step": 439 + }, + { + "epoch": 0.06804562149623043, + "grad_norm": 6.625514984130859, + "learning_rate": 1.134020618556701e-06, + "logits/chosen": 11.954994201660156, + "logits/rejected": 8.536954879760742, + "logps/chosen": -305.7756652832031, + "logps/rejected": -244.58151245117188, + "loss": 0.7504, + "rewards/accuracies": 0.25, + "rewards/chosen": -0.08066359162330627, + "rewards/margins": -0.10602931678295135, + "rewards/rejected": 0.025365734472870827, + "step": 440 + }, + { + "epoch": 0.06820027063599458, + "grad_norm": 7.536952018737793, + "learning_rate": 1.13659793814433e-06, + "logits/chosen": 13.029542922973633, + "logits/rejected": 7.037688732147217, + "logps/chosen": -576.3472290039062, + "logps/rejected": -304.11407470703125, + "loss": 0.756, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.03906955569982529, + "rewards/margins": -0.1104925125837326, + "rewards/rejected": 0.07142295688390732, + "step": 441 + }, + { + "epoch": 0.06835491977575875, + "grad_norm": 4.61942720413208, + "learning_rate": 1.1391752577319588e-06, + "logits/chosen": 10.842082977294922, + "logits/rejected": 12.782186508178711, + "logps/chosen": -230.60238647460938, + "logps/rejected": -241.21810913085938, + "loss": 0.7052, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.007932376116514206, + "rewards/margins": -0.01750480942428112, + "rewards/rejected": 0.009572433307766914, + "step": 442 + }, + { + "epoch": 0.06850956891552291, + "grad_norm": 6.207898139953613, + "learning_rate": 1.1417525773195877e-06, + "logits/chosen": 5.723091125488281, + "logits/rejected": 9.703177452087402, + "logps/chosen": -364.4405822753906, + "logps/rejected": -386.5208740234375, + "loss": 0.7192, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.0430697463452816, + "rewards/margins": -0.04799194633960724, + "rewards/rejected": 0.004922197666019201, + "step": 443 + }, + { + "epoch": 0.06866421805528707, + "grad_norm": 5.925559043884277, + "learning_rate": 1.1443298969072166e-06, + "logits/chosen": 10.710570335388184, + "logits/rejected": 5.233012676239014, + "logps/chosen": -327.94757080078125, + "logps/rejected": -242.01473999023438, + "loss": 0.6889, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.022325709462165833, + "rewards/margins": 0.011135056614875793, + "rewards/rejected": -0.03346076235175133, + "step": 444 + }, + { + "epoch": 0.06881886719505123, + "grad_norm": 4.352199554443359, + "learning_rate": 1.1469072164948454e-06, + "logits/chosen": 19.635128021240234, + "logits/rejected": 11.797924041748047, + "logps/chosen": -264.49005126953125, + "logps/rejected": -145.75155639648438, + "loss": 0.6844, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.002335714176297188, + "rewards/margins": 0.018510818481445312, + "rewards/rejected": -0.016175102442502975, + "step": 445 + }, + { + "epoch": 0.06897351633481538, + "grad_norm": 12.978840827941895, + "learning_rate": 1.1494845360824743e-06, + "logits/chosen": 13.32224178314209, + "logits/rejected": 10.29178237915039, + "logps/chosen": -213.24143981933594, + "logps/rejected": -228.46661376953125, + "loss": 0.6915, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.007376480847597122, + "rewards/margins": 0.005146408919245005, + "rewards/rejected": 0.002230070997029543, + "step": 446 + }, + { + "epoch": 0.06912816547457955, + "grad_norm": 4.917621612548828, + "learning_rate": 1.1520618556701032e-06, + "logits/chosen": 8.363395690917969, + "logits/rejected": 8.210826873779297, + "logps/chosen": -360.5667419433594, + "logps/rejected": -286.4749755859375, + "loss": 0.665, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.01968412473797798, + "rewards/margins": 0.06538467854261398, + "rewards/rejected": -0.0457005500793457, + "step": 447 + }, + { + "epoch": 0.06928281461434371, + "grad_norm": 5.113491535186768, + "learning_rate": 1.154639175257732e-06, + "logits/chosen": 2.8186323642730713, + "logits/rejected": 4.038246154785156, + "logps/chosen": -273.90380859375, + "logps/rejected": -269.4390563964844, + "loss": 0.6874, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.04487667232751846, + "rewards/margins": 0.02400503121316433, + "rewards/rejected": 0.020871637389063835, + "step": 448 + }, + { + "epoch": 0.06943746375410786, + "grad_norm": 5.9082159996032715, + "learning_rate": 1.157216494845361e-06, + "logits/chosen": 6.520223617553711, + "logits/rejected": 7.196210861206055, + "logps/chosen": -242.16896057128906, + "logps/rejected": -244.6864776611328, + "loss": 0.7472, + "rewards/accuracies": 0.25, + "rewards/chosen": -0.07740268856287003, + "rewards/margins": -0.10075444728136063, + "rewards/rejected": 0.0233517624437809, + "step": 449 + }, + { + "epoch": 0.06959211289387203, + "grad_norm": 4.350265979766846, + "learning_rate": 1.1597938144329898e-06, + "logits/chosen": 6.269900321960449, + "logits/rejected": 5.303152084350586, + "logps/chosen": -223.11007690429688, + "logps/rejected": -233.52423095703125, + "loss": 0.6804, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.0032989010214805603, + "rewards/margins": 0.03347807005047798, + "rewards/rejected": -0.030179165303707123, + "step": 450 + }, + { + "epoch": 0.06974676203363618, + "grad_norm": 5.940832614898682, + "learning_rate": 1.1623711340206187e-06, + "logits/chosen": 9.687047004699707, + "logits/rejected": 12.396611213684082, + "logps/chosen": -251.50350952148438, + "logps/rejected": -299.99945068359375, + "loss": 0.6704, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.06874170899391174, + "rewards/margins": 0.050896354019641876, + "rewards/rejected": 0.01784534379839897, + "step": 451 + }, + { + "epoch": 0.06990141117340035, + "grad_norm": 5.027914047241211, + "learning_rate": 1.1649484536082475e-06, + "logits/chosen": 5.539091110229492, + "logits/rejected": 5.175711631774902, + "logps/chosen": -217.0523681640625, + "logps/rejected": -277.43768310546875, + "loss": 0.7147, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.008119489066302776, + "rewards/margins": -0.03714494779706001, + "rewards/rejected": 0.029025457799434662, + "step": 452 + }, + { + "epoch": 0.07005606031316451, + "grad_norm": 4.091969966888428, + "learning_rate": 1.1675257731958764e-06, + "logits/chosen": 9.525016784667969, + "logits/rejected": 6.513009071350098, + "logps/chosen": -183.4822998046875, + "logps/rejected": -162.91171264648438, + "loss": 0.6526, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.011124372482299805, + "rewards/margins": 0.08528690040111542, + "rewards/rejected": -0.07416252791881561, + "step": 453 + }, + { + "epoch": 0.07021070945292866, + "grad_norm": 4.9361419677734375, + "learning_rate": 1.1701030927835053e-06, + "logits/chosen": 15.196216583251953, + "logits/rejected": 8.254142761230469, + "logps/chosen": -341.0078125, + "logps/rejected": -272.251220703125, + "loss": 0.7192, + "rewards/accuracies": 0.25, + "rewards/chosen": -0.027997970581054688, + "rewards/margins": -0.04884038120508194, + "rewards/rejected": 0.020842408761382103, + "step": 454 + }, + { + "epoch": 0.07036535859269283, + "grad_norm": 5.737484931945801, + "learning_rate": 1.1726804123711342e-06, + "logits/chosen": 8.865446090698242, + "logits/rejected": 12.857247352600098, + "logps/chosen": -264.6304931640625, + "logps/rejected": -265.1905517578125, + "loss": 0.6574, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.0091384407132864, + "rewards/margins": 0.07763008773326874, + "rewards/rejected": -0.06849164515733719, + "step": 455 + }, + { + "epoch": 0.070520007732457, + "grad_norm": 5.029343128204346, + "learning_rate": 1.175257731958763e-06, + "logits/chosen": 6.768464088439941, + "logits/rejected": 4.105677604675293, + "logps/chosen": -300.0268249511719, + "logps/rejected": -210.82620239257812, + "loss": 0.6701, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.006981231272220612, + "rewards/margins": 0.048430733382701874, + "rewards/rejected": -0.04144950211048126, + "step": 456 + }, + { + "epoch": 0.07067465687222114, + "grad_norm": 6.606870651245117, + "learning_rate": 1.177835051546392e-06, + "logits/chosen": 9.166215896606445, + "logits/rejected": 2.806367874145508, + "logps/chosen": -352.10003662109375, + "logps/rejected": -343.111328125, + "loss": 0.7365, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.06574497371912003, + "rewards/margins": -0.08144745975732803, + "rewards/rejected": 0.01570248417556286, + "step": 457 + }, + { + "epoch": 0.07082930601198531, + "grad_norm": 7.202205657958984, + "learning_rate": 1.1804123711340208e-06, + "logits/chosen": 3.586365222930908, + "logits/rejected": 0.8253377676010132, + "logps/chosen": -360.13494873046875, + "logps/rejected": -324.2682189941406, + "loss": 0.7161, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.060147859156131744, + "rewards/margins": -0.04070886969566345, + "rewards/rejected": -0.019438982009887695, + "step": 458 + }, + { + "epoch": 0.07098395515174946, + "grad_norm": 5.205109596252441, + "learning_rate": 1.1829896907216496e-06, + "logits/chosen": 11.185762405395508, + "logits/rejected": 14.334480285644531, + "logps/chosen": -299.57733154296875, + "logps/rejected": -290.84576416015625, + "loss": 0.6919, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.0498957633972168, + "rewards/margins": 0.009350206702947617, + "rewards/rejected": 0.04054555669426918, + "step": 459 + }, + { + "epoch": 0.07113860429151363, + "grad_norm": 5.1602020263671875, + "learning_rate": 1.1855670103092783e-06, + "logits/chosen": 12.370702743530273, + "logits/rejected": 8.091865539550781, + "logps/chosen": -318.81549072265625, + "logps/rejected": -297.52362060546875, + "loss": 0.6797, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.04072318226099014, + "rewards/margins": 0.0316314697265625, + "rewards/rejected": -0.07235465198755264, + "step": 460 + }, + { + "epoch": 0.07129325343127779, + "grad_norm": 5.133179664611816, + "learning_rate": 1.1881443298969072e-06, + "logits/chosen": 3.630934000015259, + "logits/rejected": 11.368070602416992, + "logps/chosen": -270.8773498535156, + "logps/rejected": -243.56610107421875, + "loss": 0.7303, + "rewards/accuracies": 0.125, + "rewards/chosen": -0.03770842403173447, + "rewards/margins": -0.07174105942249298, + "rewards/rejected": 0.034032631665468216, + "step": 461 + }, + { + "epoch": 0.07144790257104194, + "grad_norm": 5.185516834259033, + "learning_rate": 1.190721649484536e-06, + "logits/chosen": 7.575233459472656, + "logits/rejected": 4.580668926239014, + "logps/chosen": -204.6844482421875, + "logps/rejected": -197.384765625, + "loss": 0.6914, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.0013845921494066715, + "rewards/margins": 0.006259298417717218, + "rewards/rejected": -0.007643889635801315, + "step": 462 + }, + { + "epoch": 0.07160255171080611, + "grad_norm": 6.377435207366943, + "learning_rate": 1.1932989690721651e-06, + "logits/chosen": 4.983016014099121, + "logits/rejected": 0.05048179626464844, + "logps/chosen": -319.78411865234375, + "logps/rejected": -216.42721557617188, + "loss": 0.6942, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.03390403091907501, + "rewards/margins": -0.0013873083516955376, + "rewards/rejected": -0.0325167179107666, + "step": 463 + }, + { + "epoch": 0.07175720085057027, + "grad_norm": 5.488708972930908, + "learning_rate": 1.195876288659794e-06, + "logits/chosen": 9.706180572509766, + "logits/rejected": 8.32201099395752, + "logps/chosen": -288.8235778808594, + "logps/rejected": -275.0206298828125, + "loss": 0.7092, + "rewards/accuracies": 0.25, + "rewards/chosen": 0.02388305775821209, + "rewards/margins": -0.030299853533506393, + "rewards/rejected": 0.054182909429073334, + "step": 464 + }, + { + "epoch": 0.07191184999033443, + "grad_norm": 8.434926986694336, + "learning_rate": 1.1984536082474229e-06, + "logits/chosen": 7.797680377960205, + "logits/rejected": 9.956478118896484, + "logps/chosen": -395.3771667480469, + "logps/rejected": -477.75323486328125, + "loss": 0.6461, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.04650573432445526, + "rewards/margins": 0.10006103664636612, + "rewards/rejected": -0.05355529859662056, + "step": 465 + }, + { + "epoch": 0.07206649913009859, + "grad_norm": 4.710972309112549, + "learning_rate": 1.2010309278350517e-06, + "logits/chosen": 12.573779106140137, + "logits/rejected": 8.880084991455078, + "logps/chosen": -196.49710083007812, + "logps/rejected": -228.15316772460938, + "loss": 0.7172, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.05125265195965767, + "rewards/margins": -0.04457240551710129, + "rewards/rejected": -0.006680251099169254, + "step": 466 + }, + { + "epoch": 0.07222114826986274, + "grad_norm": 5.636977195739746, + "learning_rate": 1.2036082474226806e-06, + "logits/chosen": 12.430391311645508, + "logits/rejected": 10.582672119140625, + "logps/chosen": -465.9703063964844, + "logps/rejected": -376.74609375, + "loss": 0.6615, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.03753151744604111, + "rewards/margins": 0.07629828155040741, + "rewards/rejected": -0.11382980644702911, + "step": 467 + }, + { + "epoch": 0.07237579740962691, + "grad_norm": 5.817788600921631, + "learning_rate": 1.2061855670103095e-06, + "logits/chosen": 10.093749046325684, + "logits/rejected": 9.10906982421875, + "logps/chosen": -375.3330078125, + "logps/rejected": -334.5609436035156, + "loss": 0.7106, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.04627704620361328, + "rewards/margins": -0.03184280917048454, + "rewards/rejected": -0.014434242621064186, + "step": 468 + }, + { + "epoch": 0.07253044654939107, + "grad_norm": 3.9110538959503174, + "learning_rate": 1.2087628865979382e-06, + "logits/chosen": 6.988757610321045, + "logits/rejected": 3.1502225399017334, + "logps/chosen": -252.61614990234375, + "logps/rejected": -187.54251098632812, + "loss": 0.6927, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.016286754980683327, + "rewards/margins": 0.0049581993371248245, + "rewards/rejected": -0.02124495431780815, + "step": 469 + }, + { + "epoch": 0.07268509568915522, + "grad_norm": 5.507916450500488, + "learning_rate": 1.211340206185567e-06, + "logits/chosen": 9.607152938842773, + "logits/rejected": 11.154939651489258, + "logps/chosen": -304.7886047363281, + "logps/rejected": -338.859130859375, + "loss": 0.6901, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.029600191861391068, + "rewards/margins": 0.012299539521336555, + "rewards/rejected": -0.041899729520082474, + "step": 470 + }, + { + "epoch": 0.07283974482891939, + "grad_norm": 18.582128524780273, + "learning_rate": 1.213917525773196e-06, + "logits/chosen": 9.084453582763672, + "logits/rejected": 1.4555106163024902, + "logps/chosen": -508.26171875, + "logps/rejected": -334.971923828125, + "loss": 0.7142, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.05624312907457352, + "rewards/margins": -0.0235869362950325, + "rewards/rejected": -0.032656192779541016, + "step": 471 + }, + { + "epoch": 0.07299439396868355, + "grad_norm": 5.251086711883545, + "learning_rate": 1.2164948453608248e-06, + "logits/chosen": 4.806502342224121, + "logits/rejected": 5.997603893280029, + "logps/chosen": -318.2728271484375, + "logps/rejected": -330.8165588378906, + "loss": 0.6601, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.024497367441654205, + "rewards/margins": 0.07905077934265137, + "rewards/rejected": -0.05455341190099716, + "step": 472 + }, + { + "epoch": 0.0731490431084477, + "grad_norm": 7.7333149909973145, + "learning_rate": 1.2190721649484536e-06, + "logits/chosen": 8.06280517578125, + "logits/rejected": 6.172645568847656, + "logps/chosen": -264.54034423828125, + "logps/rejected": -267.8489990234375, + "loss": 0.7209, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.05796775966882706, + "rewards/margins": -0.051055099815130234, + "rewards/rejected": -0.00691266031935811, + "step": 473 + }, + { + "epoch": 0.07330369224821187, + "grad_norm": 5.668310642242432, + "learning_rate": 1.2216494845360825e-06, + "logits/chosen": 14.605587005615234, + "logits/rejected": 11.51723861694336, + "logps/chosen": -415.44781494140625, + "logps/rejected": -361.1017761230469, + "loss": 0.7256, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.11909104138612747, + "rewards/margins": -0.05120258405804634, + "rewards/rejected": -0.06788845360279083, + "step": 474 + }, + { + "epoch": 0.07345834138797602, + "grad_norm": 4.868416786193848, + "learning_rate": 1.2242268041237114e-06, + "logits/chosen": 13.368406295776367, + "logits/rejected": 9.429543495178223, + "logps/chosen": -296.8177795410156, + "logps/rejected": -251.13247680664062, + "loss": 0.6958, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.06994041800498962, + "rewards/margins": -0.003677511587738991, + "rewards/rejected": -0.06626291573047638, + "step": 475 + }, + { + "epoch": 0.07361299052774019, + "grad_norm": 5.10068416595459, + "learning_rate": 1.2268041237113403e-06, + "logits/chosen": 10.763989448547363, + "logits/rejected": 11.519791603088379, + "logps/chosen": -302.250732421875, + "logps/rejected": -260.015380859375, + "loss": 0.6773, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.01711585558950901, + "rewards/margins": 0.034076668322086334, + "rewards/rejected": -0.051192522048950195, + "step": 476 + }, + { + "epoch": 0.07376763966750435, + "grad_norm": 4.7022705078125, + "learning_rate": 1.2293814432989691e-06, + "logits/chosen": 14.871297836303711, + "logits/rejected": 8.740377426147461, + "logps/chosen": -415.8657531738281, + "logps/rejected": -297.3988342285156, + "loss": 0.6549, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.046979717910289764, + "rewards/margins": 0.0824311226606369, + "rewards/rejected": -0.035451412200927734, + "step": 477 + }, + { + "epoch": 0.0739222888072685, + "grad_norm": 4.601221561431885, + "learning_rate": 1.231958762886598e-06, + "logits/chosen": 14.473128318786621, + "logits/rejected": 15.302721977233887, + "logps/chosen": -240.88668823242188, + "logps/rejected": -258.47900390625, + "loss": 0.748, + "rewards/accuracies": 0.25, + "rewards/chosen": -0.0831640213727951, + "rewards/margins": -0.10379353165626526, + "rewards/rejected": 0.020629502832889557, + "step": 478 + }, + { + "epoch": 0.07407693794703267, + "grad_norm": 5.6209716796875, + "learning_rate": 1.2345360824742269e-06, + "logits/chosen": 8.16950798034668, + "logits/rejected": 4.3024492263793945, + "logps/chosen": -281.9227294921875, + "logps/rejected": -232.9622802734375, + "loss": 0.6899, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.011417914181947708, + "rewards/margins": 0.010905311442911625, + "rewards/rejected": -0.022323228418827057, + "step": 479 + }, + { + "epoch": 0.07423158708679684, + "grad_norm": 4.501147747039795, + "learning_rate": 1.2371134020618557e-06, + "logits/chosen": 9.206696510314941, + "logits/rejected": 10.740234375, + "logps/chosen": -264.6712951660156, + "logps/rejected": -250.37405395507812, + "loss": 0.6617, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.003394031897187233, + "rewards/margins": 0.07045107334852219, + "rewards/rejected": -0.07384509593248367, + "step": 480 + }, + { + "epoch": 0.07438623622656099, + "grad_norm": 13.334127426147461, + "learning_rate": 1.2396907216494846e-06, + "logits/chosen": 12.364567756652832, + "logits/rejected": 9.056158065795898, + "logps/chosen": -301.6614685058594, + "logps/rejected": -292.27978515625, + "loss": 0.6771, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.009432125836610794, + "rewards/margins": 0.03644128143787384, + "rewards/rejected": -0.027009155601263046, + "step": 481 + }, + { + "epoch": 0.07454088536632515, + "grad_norm": 4.799774169921875, + "learning_rate": 1.2422680412371135e-06, + "logits/chosen": 12.694483757019043, + "logits/rejected": 8.58286190032959, + "logps/chosen": -259.81097412109375, + "logps/rejected": -252.54266357421875, + "loss": 0.7058, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.04981064796447754, + "rewards/margins": -0.022566698491573334, + "rewards/rejected": -0.027243951335549355, + "step": 482 + }, + { + "epoch": 0.0746955345060893, + "grad_norm": 3.73966646194458, + "learning_rate": 1.2448453608247424e-06, + "logits/chosen": 9.738327980041504, + "logits/rejected": 10.59136962890625, + "logps/chosen": -193.85232543945312, + "logps/rejected": -163.93902587890625, + "loss": 0.707, + "rewards/accuracies": 0.25, + "rewards/chosen": -0.033843282610177994, + "rewards/margins": -0.026288272812962532, + "rewards/rejected": -0.007555009797215462, + "step": 483 + }, + { + "epoch": 0.07485018364585347, + "grad_norm": 4.871867656707764, + "learning_rate": 1.2474226804123712e-06, + "logits/chosen": 15.306303977966309, + "logits/rejected": 13.005790710449219, + "logps/chosen": -267.81549072265625, + "logps/rejected": -257.08367919921875, + "loss": 0.7025, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.021944332867860794, + "rewards/margins": -0.01678919792175293, + "rewards/rejected": -0.00515513401478529, + "step": 484 + }, + { + "epoch": 0.07500483278561763, + "grad_norm": 6.419384956359863, + "learning_rate": 1.25e-06, + "logits/chosen": 5.472255229949951, + "logits/rejected": 0.6009783148765564, + "logps/chosen": -323.40191650390625, + "logps/rejected": -193.5068817138672, + "loss": 0.723, + "rewards/accuracies": 0.125, + "rewards/chosen": -0.06473977863788605, + "rewards/margins": -0.05760948732495308, + "rewards/rejected": -0.007130288984626532, + "step": 485 + }, + { + "epoch": 0.07515948192538179, + "grad_norm": 6.550868034362793, + "learning_rate": 1.252577319587629e-06, + "logits/chosen": 8.418712615966797, + "logits/rejected": 3.8301689624786377, + "logps/chosen": -363.01776123046875, + "logps/rejected": -280.04931640625, + "loss": 0.6518, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.07135991752147675, + "rewards/margins": 0.08949494361877441, + "rewards/rejected": -0.01813502237200737, + "step": 486 + }, + { + "epoch": 0.07531413106514595, + "grad_norm": 3.9579107761383057, + "learning_rate": 1.2551546391752578e-06, + "logits/chosen": 9.789752960205078, + "logits/rejected": 8.411087989807129, + "logps/chosen": -275.6781005859375, + "logps/rejected": -144.12982177734375, + "loss": 0.6806, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.0124449972063303, + "rewards/margins": 0.026842165738344193, + "rewards/rejected": -0.014397167600691319, + "step": 487 + }, + { + "epoch": 0.07546878020491012, + "grad_norm": 5.32539176940918, + "learning_rate": 1.2577319587628867e-06, + "logits/chosen": 11.784067153930664, + "logits/rejected": 10.029605865478516, + "logps/chosen": -335.7101135253906, + "logps/rejected": -288.75372314453125, + "loss": 0.7238, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.08159056305885315, + "rewards/margins": -0.055178213864564896, + "rewards/rejected": -0.026412345468997955, + "step": 488 + }, + { + "epoch": 0.07562342934467427, + "grad_norm": 5.820296287536621, + "learning_rate": 1.2603092783505156e-06, + "logits/chosen": 11.580249786376953, + "logits/rejected": 6.299615859985352, + "logps/chosen": -277.8828125, + "logps/rejected": -212.2325439453125, + "loss": 0.717, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.057082418352365494, + "rewards/margins": -0.041176747530698776, + "rewards/rejected": -0.01590566709637642, + "step": 489 + }, + { + "epoch": 0.07577807848443843, + "grad_norm": 5.272955894470215, + "learning_rate": 1.2628865979381445e-06, + "logits/chosen": 10.399672508239746, + "logits/rejected": 5.922280788421631, + "logps/chosen": -187.53143310546875, + "logps/rejected": -154.47177124023438, + "loss": 0.6927, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.04404468834400177, + "rewards/margins": 0.005482863634824753, + "rewards/rejected": -0.04952755197882652, + "step": 490 + }, + { + "epoch": 0.07593272762420258, + "grad_norm": 4.080560207366943, + "learning_rate": 1.2654639175257733e-06, + "logits/chosen": 9.95780086517334, + "logits/rejected": 11.035721778869629, + "logps/chosen": -161.07461547851562, + "logps/rejected": -181.50311279296875, + "loss": 0.7322, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.023262454196810722, + "rewards/margins": -0.07226769626140594, + "rewards/rejected": 0.04900524765253067, + "step": 491 + }, + { + "epoch": 0.07608737676396675, + "grad_norm": 4.837400436401367, + "learning_rate": 1.2680412371134022e-06, + "logits/chosen": 13.23857307434082, + "logits/rejected": 14.329360961914062, + "logps/chosen": -399.9242248535156, + "logps/rejected": -387.9140625, + "loss": 0.658, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.044596292078495026, + "rewards/margins": 0.07503624260425568, + "rewards/rejected": -0.1196325272321701, + "step": 492 + }, + { + "epoch": 0.07624202590373091, + "grad_norm": 7.8216118812561035, + "learning_rate": 1.2706185567010309e-06, + "logits/chosen": 8.370171546936035, + "logits/rejected": 8.059374809265137, + "logps/chosen": -409.1068115234375, + "logps/rejected": -309.05181884765625, + "loss": 0.6999, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.0683976262807846, + "rewards/margins": -0.007261945866048336, + "rewards/rejected": -0.061135679483413696, + "step": 493 + }, + { + "epoch": 0.07639667504349507, + "grad_norm": 5.2427496910095215, + "learning_rate": 1.2731958762886597e-06, + "logits/chosen": 6.7537031173706055, + "logits/rejected": 10.230915069580078, + "logps/chosen": -225.65072631835938, + "logps/rejected": -272.75469970703125, + "loss": 0.6956, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.03593377768993378, + "rewards/margins": 0.005887121893465519, + "rewards/rejected": 0.030046656727790833, + "step": 494 + }, + { + "epoch": 0.07655132418325923, + "grad_norm": 4.877942085266113, + "learning_rate": 1.2757731958762886e-06, + "logits/chosen": 18.830055236816406, + "logits/rejected": 10.248369216918945, + "logps/chosen": -288.5787658691406, + "logps/rejected": -205.6837158203125, + "loss": 0.7313, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.047826193273067474, + "rewards/margins": -0.07010393589735031, + "rewards/rejected": 0.02227773703634739, + "step": 495 + }, + { + "epoch": 0.0767059733230234, + "grad_norm": 4.733504772186279, + "learning_rate": 1.2783505154639175e-06, + "logits/chosen": 5.889375686645508, + "logits/rejected": 2.9468088150024414, + "logps/chosen": -275.24468994140625, + "logps/rejected": -284.8700866699219, + "loss": 0.6523, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.07246246188879013, + "rewards/margins": 0.08885151147842407, + "rewards/rejected": -0.016389036551117897, + "step": 496 + }, + { + "epoch": 0.07686062246278755, + "grad_norm": 4.67637825012207, + "learning_rate": 1.2809278350515464e-06, + "logits/chosen": 9.688321113586426, + "logits/rejected": 3.4134669303894043, + "logps/chosen": -266.0443115234375, + "logps/rejected": -193.4385986328125, + "loss": 0.7056, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.08322501182556152, + "rewards/margins": -0.021423693746328354, + "rewards/rejected": -0.06180131435394287, + "step": 497 + }, + { + "epoch": 0.07701527160255171, + "grad_norm": 5.164875030517578, + "learning_rate": 1.2835051546391752e-06, + "logits/chosen": 14.148039817810059, + "logits/rejected": 13.427299499511719, + "logps/chosen": -314.2421875, + "logps/rejected": -299.50311279296875, + "loss": 0.6856, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.029363252222537994, + "rewards/margins": 0.01818694919347763, + "rewards/rejected": -0.047550201416015625, + "step": 498 + }, + { + "epoch": 0.07716992074231586, + "grad_norm": 7.062736511230469, + "learning_rate": 1.286082474226804e-06, + "logits/chosen": 13.982134819030762, + "logits/rejected": 7.812150478363037, + "logps/chosen": -365.7272033691406, + "logps/rejected": -315.32684326171875, + "loss": 0.732, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.06968193501234055, + "rewards/margins": -0.07258491963148117, + "rewards/rejected": 0.0029029827564954758, + "step": 499 + }, + { + "epoch": 0.07732456988208003, + "grad_norm": 4.314591407775879, + "learning_rate": 1.288659793814433e-06, + "logits/chosen": 5.615242958068848, + "logits/rejected": 11.519858360290527, + "logps/chosen": -196.22201538085938, + "logps/rejected": -251.9803009033203, + "loss": 0.7353, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.05603757128119469, + "rewards/margins": -0.07838129997253418, + "rewards/rejected": 0.022343730553984642, + "step": 500 + }, + { + "epoch": 0.0774792190218442, + "grad_norm": 4.882893085479736, + "learning_rate": 1.291237113402062e-06, + "logits/chosen": 7.686586380004883, + "logits/rejected": 6.984974384307861, + "logps/chosen": -271.8753967285156, + "logps/rejected": -204.36770629882812, + "loss": 0.6561, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.04684881865978241, + "rewards/margins": 0.08257794380187988, + "rewards/rejected": -0.035729121416807175, + "step": 501 + }, + { + "epoch": 0.07763386816160835, + "grad_norm": 6.275918960571289, + "learning_rate": 1.293814432989691e-06, + "logits/chosen": 7.644626140594482, + "logits/rejected": 3.437337875366211, + "logps/chosen": -290.195068359375, + "logps/rejected": -223.37295532226562, + "loss": 0.7042, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.059528157114982605, + "rewards/margins": -0.018102407455444336, + "rewards/rejected": -0.04142574965953827, + "step": 502 + }, + { + "epoch": 0.07778851730137251, + "grad_norm": 3.9693758487701416, + "learning_rate": 1.2963917525773198e-06, + "logits/chosen": 8.159713745117188, + "logits/rejected": 10.401387214660645, + "logps/chosen": -174.96224975585938, + "logps/rejected": -226.48019409179688, + "loss": 0.6738, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.062317825853824615, + "rewards/margins": 0.041219066828489304, + "rewards/rejected": -0.10353689640760422, + "step": 503 + }, + { + "epoch": 0.07794316644113668, + "grad_norm": 4.535689353942871, + "learning_rate": 1.2989690721649487e-06, + "logits/chosen": 6.409902572631836, + "logits/rejected": 12.520186424255371, + "logps/chosen": -159.933349609375, + "logps/rejected": -203.27352905273438, + "loss": 0.7004, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.05015776678919792, + "rewards/margins": -0.012445949949324131, + "rewards/rejected": -0.03771181032061577, + "step": 504 + }, + { + "epoch": 0.07809781558090083, + "grad_norm": 5.357067108154297, + "learning_rate": 1.3015463917525775e-06, + "logits/chosen": 7.758027076721191, + "logits/rejected": 12.18793773651123, + "logps/chosen": -215.1466522216797, + "logps/rejected": -241.67506408691406, + "loss": 0.7558, + "rewards/accuracies": 0.25, + "rewards/chosen": -0.0855511724948883, + "rewards/margins": -0.11551766842603683, + "rewards/rejected": 0.02996649779379368, + "step": 505 + }, + { + "epoch": 0.078252464720665, + "grad_norm": 5.963812828063965, + "learning_rate": 1.3041237113402064e-06, + "logits/chosen": 9.35734748840332, + "logits/rejected": 8.196986198425293, + "logps/chosen": -345.1124572753906, + "logps/rejected": -295.1198425292969, + "loss": 0.6454, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.02703724056482315, + "rewards/margins": 0.10452951490879059, + "rewards/rejected": -0.07749228924512863, + "step": 506 + }, + { + "epoch": 0.07840711386042915, + "grad_norm": 5.133436679840088, + "learning_rate": 1.3067010309278353e-06, + "logits/chosen": 11.193446159362793, + "logits/rejected": 11.346586227416992, + "logps/chosen": -319.44683837890625, + "logps/rejected": -258.2880859375, + "loss": 0.7167, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.006875228136777878, + "rewards/margins": -0.03874626010656357, + "rewards/rejected": 0.045621491968631744, + "step": 507 + }, + { + "epoch": 0.07856176300019331, + "grad_norm": 5.560775279998779, + "learning_rate": 1.3092783505154642e-06, + "logits/chosen": 8.42133903503418, + "logits/rejected": 13.05972671508789, + "logps/chosen": -256.1662292480469, + "logps/rejected": -285.46466064453125, + "loss": 0.6702, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.009373044595122337, + "rewards/margins": 0.05296625941991806, + "rewards/rejected": -0.04359322041273117, + "step": 508 + }, + { + "epoch": 0.07871641213995748, + "grad_norm": 5.174693584442139, + "learning_rate": 1.311855670103093e-06, + "logits/chosen": 14.411124229431152, + "logits/rejected": 7.032424449920654, + "logps/chosen": -344.90020751953125, + "logps/rejected": -208.19241333007812, + "loss": 0.6845, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.027427388355135918, + "rewards/margins": 0.022199105471372604, + "rewards/rejected": -0.04962649196386337, + "step": 509 + }, + { + "epoch": 0.07887106127972163, + "grad_norm": 6.354555606842041, + "learning_rate": 1.314432989690722e-06, + "logits/chosen": 10.666659355163574, + "logits/rejected": 7.222296714782715, + "logps/chosen": -426.1317138671875, + "logps/rejected": -366.894775390625, + "loss": 0.7071, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.03403759002685547, + "rewards/margins": -0.022094538435339928, + "rewards/rejected": -0.01194305531680584, + "step": 510 + }, + { + "epoch": 0.07902571041948579, + "grad_norm": 4.84645938873291, + "learning_rate": 1.3170103092783506e-06, + "logits/chosen": 11.212747573852539, + "logits/rejected": 8.291570663452148, + "logps/chosen": -244.2994384765625, + "logps/rejected": -194.34815979003906, + "loss": 0.6755, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.004062940366566181, + "rewards/margins": 0.04598658159375191, + "rewards/rejected": -0.050049517303705215, + "step": 511 + }, + { + "epoch": 0.07918035955924996, + "grad_norm": 5.589974403381348, + "learning_rate": 1.3195876288659794e-06, + "logits/chosen": 11.877053260803223, + "logits/rejected": 6.465781211853027, + "logps/chosen": -297.2283020019531, + "logps/rejected": -201.77923583984375, + "loss": 0.7144, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.05955009162425995, + "rewards/margins": -0.037588972598314285, + "rewards/rejected": -0.021961115300655365, + "step": 512 + }, + { + "epoch": 0.07933500869901411, + "grad_norm": 8.377959251403809, + "learning_rate": 1.3221649484536083e-06, + "logits/chosen": 9.283735275268555, + "logits/rejected": 9.296442031860352, + "logps/chosen": -477.893310546875, + "logps/rejected": -347.45343017578125, + "loss": 0.7207, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.05752735584974289, + "rewards/margins": -0.03290129452943802, + "rewards/rejected": -0.02462606132030487, + "step": 513 + }, + { + "epoch": 0.07948965783877827, + "grad_norm": 3.3914053440093994, + "learning_rate": 1.3247422680412372e-06, + "logits/chosen": 5.411382675170898, + "logits/rejected": 6.597458839416504, + "logps/chosen": -170.07208251953125, + "logps/rejected": -188.1590576171875, + "loss": 0.6755, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.011275816708803177, + "rewards/margins": 0.04031095653772354, + "rewards/rejected": -0.05158677324652672, + "step": 514 + }, + { + "epoch": 0.07964430697854243, + "grad_norm": 7.341994762420654, + "learning_rate": 1.327319587628866e-06, + "logits/chosen": 9.048376083374023, + "logits/rejected": 7.71989631652832, + "logps/chosen": -410.7899169921875, + "logps/rejected": -339.4091796875, + "loss": 0.734, + "rewards/accuracies": 0.25, + "rewards/chosen": -0.0873725414276123, + "rewards/margins": -0.07666054368019104, + "rewards/rejected": -0.010712003335356712, + "step": 515 + }, + { + "epoch": 0.07979895611830659, + "grad_norm": 5.503435134887695, + "learning_rate": 1.329896907216495e-06, + "logits/chosen": 11.117693901062012, + "logits/rejected": 10.440019607543945, + "logps/chosen": -401.076416015625, + "logps/rejected": -438.62042236328125, + "loss": 0.732, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.11154460906982422, + "rewards/margins": -0.07412925362586975, + "rewards/rejected": -0.037415362894535065, + "step": 516 + }, + { + "epoch": 0.07995360525807076, + "grad_norm": 4.144252777099609, + "learning_rate": 1.3324742268041238e-06, + "logits/chosen": 12.842230796813965, + "logits/rejected": 4.817409038543701, + "logps/chosen": -260.1328430175781, + "logps/rejected": -125.98538208007812, + "loss": 0.7327, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.06426072120666504, + "rewards/margins": -0.0740605890750885, + "rewards/rejected": 0.00979986134916544, + "step": 517 + }, + { + "epoch": 0.08010825439783491, + "grad_norm": 4.73619270324707, + "learning_rate": 1.3350515463917527e-06, + "logits/chosen": 11.06511116027832, + "logits/rejected": 6.746665954589844, + "logps/chosen": -269.87432861328125, + "logps/rejected": -219.50643920898438, + "loss": 0.6768, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.02751307561993599, + "rewards/margins": 0.037844181060791016, + "rewards/rejected": -0.0653572604060173, + "step": 518 + }, + { + "epoch": 0.08026290353759907, + "grad_norm": 4.447077751159668, + "learning_rate": 1.3376288659793815e-06, + "logits/chosen": 6.431484222412109, + "logits/rejected": 9.76042366027832, + "logps/chosen": -227.97085571289062, + "logps/rejected": -296.36651611328125, + "loss": 0.6371, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.026898100972175598, + "rewards/margins": 0.11998605728149414, + "rewards/rejected": -0.09308796375989914, + "step": 519 + }, + { + "epoch": 0.08041755267736324, + "grad_norm": 5.414590835571289, + "learning_rate": 1.3402061855670104e-06, + "logits/chosen": 10.79727554321289, + "logits/rejected": 3.2325119972229004, + "logps/chosen": -248.14511108398438, + "logps/rejected": -175.93927001953125, + "loss": 0.6965, + "rewards/accuracies": 0.25, + "rewards/chosen": -0.04287712648510933, + "rewards/margins": -0.0063204774633049965, + "rewards/rejected": -0.03655664995312691, + "step": 520 + }, + { + "epoch": 0.08057220181712739, + "grad_norm": 5.638577938079834, + "learning_rate": 1.3427835051546393e-06, + "logits/chosen": 10.725425720214844, + "logits/rejected": 3.684582233428955, + "logps/chosen": -338.5487365722656, + "logps/rejected": -221.99310302734375, + "loss": 0.6994, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.019631575793027878, + "rewards/margins": -0.007610607892274857, + "rewards/rejected": 0.027242185547947884, + "step": 521 + }, + { + "epoch": 0.08072685095689155, + "grad_norm": 5.674072265625, + "learning_rate": 1.3453608247422681e-06, + "logits/chosen": 9.33547592163086, + "logits/rejected": 15.28099250793457, + "logps/chosen": -229.82672119140625, + "logps/rejected": -376.06597900390625, + "loss": 0.653, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.02353219874203205, + "rewards/margins": 0.09013090282678604, + "rewards/rejected": -0.11366310715675354, + "step": 522 + }, + { + "epoch": 0.0808815000966557, + "grad_norm": 5.811645984649658, + "learning_rate": 1.347938144329897e-06, + "logits/chosen": 12.050795555114746, + "logits/rejected": 5.4996337890625, + "logps/chosen": -403.49053955078125, + "logps/rejected": -293.321533203125, + "loss": 0.7094, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.061141159385442734, + "rewards/margins": -0.0239457655698061, + "rewards/rejected": -0.03719539940357208, + "step": 523 + }, + { + "epoch": 0.08103614923641987, + "grad_norm": 6.312408447265625, + "learning_rate": 1.3505154639175259e-06, + "logits/chosen": 10.700739860534668, + "logits/rejected": 1.81795334815979, + "logps/chosen": -483.1976623535156, + "logps/rejected": -274.3953857421875, + "loss": 0.6914, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.024495694786310196, + "rewards/margins": 0.007960964925587177, + "rewards/rejected": -0.0324566587805748, + "step": 524 + }, + { + "epoch": 0.08119079837618404, + "grad_norm": 4.696097373962402, + "learning_rate": 1.3530927835051548e-06, + "logits/chosen": 8.211674690246582, + "logits/rejected": 1.1401091814041138, + "logps/chosen": -264.7847595214844, + "logps/rejected": -209.84481811523438, + "loss": 0.7006, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.044145919382572174, + "rewards/margins": -0.00855002086609602, + "rewards/rejected": -0.03559589385986328, + "step": 525 + }, + { + "epoch": 0.08134544751594819, + "grad_norm": 5.374699592590332, + "learning_rate": 1.3556701030927834e-06, + "logits/chosen": 8.196305274963379, + "logits/rejected": 11.697808265686035, + "logps/chosen": -214.82339477539062, + "logps/rejected": -317.4474182128906, + "loss": 0.7165, + "rewards/accuracies": 0.125, + "rewards/chosen": -0.02326676994562149, + "rewards/margins": -0.040444038808345795, + "rewards/rejected": 0.017177274450659752, + "step": 526 + }, + { + "epoch": 0.08150009665571235, + "grad_norm": 4.186614990234375, + "learning_rate": 1.3582474226804123e-06, + "logits/chosen": 7.532651424407959, + "logits/rejected": 8.822759628295898, + "logps/chosen": -210.77732849121094, + "logps/rejected": -226.58636474609375, + "loss": 0.6827, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.026653384789824486, + "rewards/margins": 0.024219894781708717, + "rewards/rejected": 0.002433490939438343, + "step": 527 + }, + { + "epoch": 0.08165474579547652, + "grad_norm": 3.884063482284546, + "learning_rate": 1.3608247422680412e-06, + "logits/chosen": 12.936111450195312, + "logits/rejected": 8.74923324584961, + "logps/chosen": -274.812744140625, + "logps/rejected": -207.52850341796875, + "loss": 0.6762, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.018146181479096413, + "rewards/margins": 0.04085822403430939, + "rewards/rejected": -0.05900440737605095, + "step": 528 + }, + { + "epoch": 0.08180939493524067, + "grad_norm": 4.786996841430664, + "learning_rate": 1.36340206185567e-06, + "logits/chosen": 12.112577438354492, + "logits/rejected": 10.60428237915039, + "logps/chosen": -309.19598388671875, + "logps/rejected": -299.437744140625, + "loss": 0.7045, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.0023768888786435127, + "rewards/margins": -0.009706975892186165, + "rewards/rejected": 0.007330084219574928, + "step": 529 + }, + { + "epoch": 0.08196404407500484, + "grad_norm": 6.0366973876953125, + "learning_rate": 1.365979381443299e-06, + "logits/chosen": 11.771197319030762, + "logits/rejected": 8.55543041229248, + "logps/chosen": -305.8131103515625, + "logps/rejected": -220.6937713623047, + "loss": 0.7013, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.018656635656952858, + "rewards/margins": -0.014781379140913486, + "rewards/rejected": -0.003875256050378084, + "step": 530 + }, + { + "epoch": 0.08211869321476899, + "grad_norm": 3.151768207550049, + "learning_rate": 1.368556701030928e-06, + "logits/chosen": 6.802198886871338, + "logits/rejected": 9.858036041259766, + "logps/chosen": -151.63719177246094, + "logps/rejected": -151.3238067626953, + "loss": 0.6807, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.006170701235532761, + "rewards/margins": 0.028571534901857376, + "rewards/rejected": -0.03474223613739014, + "step": 531 + }, + { + "epoch": 0.08227334235453315, + "grad_norm": 5.344779968261719, + "learning_rate": 1.3711340206185569e-06, + "logits/chosen": 11.66389274597168, + "logits/rejected": 9.489412307739258, + "logps/chosen": -285.146484375, + "logps/rejected": -234.99050903320312, + "loss": 0.715, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.07612819969654083, + "rewards/margins": -0.039783380925655365, + "rewards/rejected": -0.03634481504559517, + "step": 532 + }, + { + "epoch": 0.08242799149429732, + "grad_norm": 5.25248908996582, + "learning_rate": 1.3737113402061857e-06, + "logits/chosen": 10.95274829864502, + "logits/rejected": 8.533432960510254, + "logps/chosen": -344.74951171875, + "logps/rejected": -303.6307067871094, + "loss": 0.703, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.036418817937374115, + "rewards/margins": -0.016526460647583008, + "rewards/rejected": -0.019892359152436256, + "step": 533 + }, + { + "epoch": 0.08258264063406147, + "grad_norm": 6.047691822052002, + "learning_rate": 1.3762886597938146e-06, + "logits/chosen": 13.251497268676758, + "logits/rejected": 6.974300861358643, + "logps/chosen": -297.87652587890625, + "logps/rejected": -203.72567749023438, + "loss": 0.713, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.060970306396484375, + "rewards/margins": -0.03670930862426758, + "rewards/rejected": -0.024260997772216797, + "step": 534 + }, + { + "epoch": 0.08273728977382563, + "grad_norm": 6.274545192718506, + "learning_rate": 1.3788659793814435e-06, + "logits/chosen": 9.905800819396973, + "logits/rejected": 11.10840129852295, + "logps/chosen": -370.43463134765625, + "logps/rejected": -324.4618225097656, + "loss": 0.7433, + "rewards/accuracies": 0.0, + "rewards/chosen": -0.08701552450656891, + "rewards/margins": -0.09692764282226562, + "rewards/rejected": 0.00991210900247097, + "step": 535 + }, + { + "epoch": 0.0828919389135898, + "grad_norm": 6.441309452056885, + "learning_rate": 1.3814432989690724e-06, + "logits/chosen": 13.563328742980957, + "logits/rejected": 9.363935470581055, + "logps/chosen": -342.4273986816406, + "logps/rejected": -271.4912414550781, + "loss": 0.7163, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.01731271669268608, + "rewards/margins": -0.03972053527832031, + "rewards/rejected": 0.022407814860343933, + "step": 536 + }, + { + "epoch": 0.08304658805335395, + "grad_norm": 5.751824378967285, + "learning_rate": 1.3840206185567012e-06, + "logits/chosen": 4.694397926330566, + "logits/rejected": 9.730031967163086, + "logps/chosen": -216.31333923339844, + "logps/rejected": -267.26898193359375, + "loss": 0.7978, + "rewards/accuracies": 0.125, + "rewards/chosen": -0.18462657928466797, + "rewards/margins": -0.18465977907180786, + "rewards/rejected": 3.318674862384796e-05, + "step": 537 + }, + { + "epoch": 0.08320123719311812, + "grad_norm": 3.7181038856506348, + "learning_rate": 1.38659793814433e-06, + "logits/chosen": 10.303196907043457, + "logits/rejected": 7.523611068725586, + "logps/chosen": -217.4943389892578, + "logps/rejected": -219.6516571044922, + "loss": 0.6568, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.02828398160636425, + "rewards/margins": 0.07878727465867996, + "rewards/rejected": -0.10707125812768936, + "step": 538 + }, + { + "epoch": 0.08335588633288227, + "grad_norm": 5.212622165679932, + "learning_rate": 1.389175257731959e-06, + "logits/chosen": 13.040685653686523, + "logits/rejected": 11.43275260925293, + "logps/chosen": -367.2395324707031, + "logps/rejected": -331.2080078125, + "loss": 0.6972, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.027867890894412994, + "rewards/margins": -0.0011987686157226562, + "rewards/rejected": -0.02666911855340004, + "step": 539 + }, + { + "epoch": 0.08351053547264643, + "grad_norm": 5.434782981872559, + "learning_rate": 1.3917525773195878e-06, + "logits/chosen": 15.580764770507812, + "logits/rejected": 8.096982955932617, + "logps/chosen": -358.6407165527344, + "logps/rejected": -276.34918212890625, + "loss": 0.7091, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.055728793144226074, + "rewards/margins": -0.021243112161755562, + "rewards/rejected": -0.034485675394535065, + "step": 540 + }, + { + "epoch": 0.0836651846124106, + "grad_norm": 4.162454605102539, + "learning_rate": 1.3943298969072167e-06, + "logits/chosen": 11.824553489685059, + "logits/rejected": 8.227606773376465, + "logps/chosen": -147.75181579589844, + "logps/rejected": -146.95799255371094, + "loss": 0.6799, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.008574152365326881, + "rewards/margins": 0.030722906813025475, + "rewards/rejected": -0.039297059178352356, + "step": 541 + }, + { + "epoch": 0.08381983375217475, + "grad_norm": 4.5789384841918945, + "learning_rate": 1.3969072164948456e-06, + "logits/chosen": 11.601738929748535, + "logits/rejected": 5.445502758026123, + "logps/chosen": -345.2118225097656, + "logps/rejected": -195.30538940429688, + "loss": 0.7202, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.04216070473194122, + "rewards/margins": -0.04864849895238876, + "rewards/rejected": 0.006487798877060413, + "step": 542 + }, + { + "epoch": 0.08397448289193891, + "grad_norm": 5.624594688415527, + "learning_rate": 1.3994845360824745e-06, + "logits/chosen": 5.455178737640381, + "logits/rejected": 7.166283130645752, + "logps/chosen": -264.54278564453125, + "logps/rejected": -264.7269592285156, + "loss": 0.7131, + "rewards/accuracies": 0.25, + "rewards/chosen": -0.02523994818329811, + "rewards/margins": -0.03223336115479469, + "rewards/rejected": 0.006993414834141731, + "step": 543 + }, + { + "epoch": 0.08412913203170308, + "grad_norm": 16.302080154418945, + "learning_rate": 1.4020618556701031e-06, + "logits/chosen": 10.338399887084961, + "logits/rejected": 8.731912612915039, + "logps/chosen": -262.99542236328125, + "logps/rejected": -217.37112426757812, + "loss": 0.6435, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.017029715701937675, + "rewards/margins": 0.10734157264232635, + "rewards/rejected": -0.09031186252832413, + "step": 544 + }, + { + "epoch": 0.08428378117146723, + "grad_norm": 5.504406929016113, + "learning_rate": 1.404639175257732e-06, + "logits/chosen": 4.236848831176758, + "logits/rejected": 1.2474476099014282, + "logps/chosen": -275.2344055175781, + "logps/rejected": -255.11891174316406, + "loss": 0.7035, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.011736463755369186, + "rewards/margins": -0.013480950146913528, + "rewards/rejected": 0.0017444845288991928, + "step": 545 + }, + { + "epoch": 0.0844384303112314, + "grad_norm": 3.7987728118896484, + "learning_rate": 1.4072164948453609e-06, + "logits/chosen": 10.348388671875, + "logits/rejected": 1.9263609647750854, + "logps/chosen": -193.91937255859375, + "logps/rejected": -115.6342544555664, + "loss": 0.6972, + "rewards/accuracies": 0.25, + "rewards/chosen": -0.03842787817120552, + "rewards/margins": -0.007029199041426182, + "rewards/rejected": -0.03139868006110191, + "step": 546 + }, + { + "epoch": 0.08459307945099555, + "grad_norm": 4.090860843658447, + "learning_rate": 1.4097938144329897e-06, + "logits/chosen": 10.532543182373047, + "logits/rejected": 5.598051071166992, + "logps/chosen": -320.3072204589844, + "logps/rejected": -237.58522033691406, + "loss": 0.7144, + "rewards/accuracies": 0.125, + "rewards/chosen": -0.05214262008666992, + "rewards/margins": -0.03299293294548988, + "rewards/rejected": -0.01914968341588974, + "step": 547 + }, + { + "epoch": 0.08474772859075971, + "grad_norm": 6.043724536895752, + "learning_rate": 1.4123711340206186e-06, + "logits/chosen": 13.783498764038086, + "logits/rejected": 11.574289321899414, + "logps/chosen": -316.70062255859375, + "logps/rejected": -304.0550537109375, + "loss": 0.7216, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.07665050029754639, + "rewards/margins": -0.05147576332092285, + "rewards/rejected": -0.025174735113978386, + "step": 548 + }, + { + "epoch": 0.08490237773052388, + "grad_norm": 4.6276044845581055, + "learning_rate": 1.4149484536082475e-06, + "logits/chosen": 9.441754341125488, + "logits/rejected": 9.189139366149902, + "logps/chosen": -269.8464050292969, + "logps/rejected": -245.50872802734375, + "loss": 0.6757, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.058960575610399246, + "rewards/margins": 0.04430823773145676, + "rewards/rejected": -0.1032688170671463, + "step": 549 + }, + { + "epoch": 0.08505702687028803, + "grad_norm": 5.975076675415039, + "learning_rate": 1.4175257731958764e-06, + "logits/chosen": 9.283742904663086, + "logits/rejected": 2.3773369789123535, + "logps/chosen": -307.9149169921875, + "logps/rejected": -230.32321166992188, + "loss": 0.6745, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.041390422731637955, + "rewards/margins": 0.043630022555589676, + "rewards/rejected": -0.08502044528722763, + "step": 550 + }, + { + "epoch": 0.0852116760100522, + "grad_norm": 4.116117477416992, + "learning_rate": 1.4201030927835052e-06, + "logits/chosen": 8.857202529907227, + "logits/rejected": 4.9324140548706055, + "logps/chosen": -160.62957763671875, + "logps/rejected": -131.80567932128906, + "loss": 0.6457, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.04518408700823784, + "rewards/margins": 0.09874320030212402, + "rewards/rejected": -0.053559109568595886, + "step": 551 + }, + { + "epoch": 0.08536632514981636, + "grad_norm": 5.2435302734375, + "learning_rate": 1.422680412371134e-06, + "logits/chosen": 13.257979393005371, + "logits/rejected": 12.709647178649902, + "logps/chosen": -226.0782928466797, + "logps/rejected": -280.5997009277344, + "loss": 0.6833, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.03819599002599716, + "rewards/margins": 0.020241308957338333, + "rewards/rejected": -0.05843730270862579, + "step": 552 + }, + { + "epoch": 0.08552097428958051, + "grad_norm": 9.359949111938477, + "learning_rate": 1.425257731958763e-06, + "logits/chosen": 14.066431045532227, + "logits/rejected": 9.620611190795898, + "logps/chosen": -301.92462158203125, + "logps/rejected": -208.7350311279297, + "loss": 0.6852, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.02114868350327015, + "rewards/margins": 0.01965189166367054, + "rewards/rejected": -0.04080057144165039, + "step": 553 + }, + { + "epoch": 0.08567562342934468, + "grad_norm": 5.54028844833374, + "learning_rate": 1.4278350515463918e-06, + "logits/chosen": 13.469223976135254, + "logits/rejected": 13.597623825073242, + "logps/chosen": -279.43377685546875, + "logps/rejected": -301.3321533203125, + "loss": 0.7009, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.07416782528162003, + "rewards/margins": -0.010303354822099209, + "rewards/rejected": -0.06386446952819824, + "step": 554 + }, + { + "epoch": 0.08583027256910883, + "grad_norm": 5.318565368652344, + "learning_rate": 1.4304123711340207e-06, + "logits/chosen": 3.1903958320617676, + "logits/rejected": 12.17082405090332, + "logps/chosen": -221.4241943359375, + "logps/rejected": -349.0831298828125, + "loss": 0.682, + "rewards/accuracies": 0.25, + "rewards/chosen": -0.01637106016278267, + "rewards/margins": 0.034010179340839386, + "rewards/rejected": -0.05038123577833176, + "step": 555 + }, + { + "epoch": 0.085984921708873, + "grad_norm": 4.948302745819092, + "learning_rate": 1.4329896907216496e-06, + "logits/chosen": 6.0074872970581055, + "logits/rejected": 5.867912292480469, + "logps/chosen": -257.4555969238281, + "logps/rejected": -264.7852783203125, + "loss": 0.7198, + "rewards/accuracies": 0.25, + "rewards/chosen": -0.010210896842181683, + "rewards/margins": -0.04925103485584259, + "rewards/rejected": 0.03904014080762863, + "step": 556 + }, + { + "epoch": 0.08613957084863716, + "grad_norm": 5.331164360046387, + "learning_rate": 1.4355670103092785e-06, + "logits/chosen": 7.234327793121338, + "logits/rejected": 4.675284385681152, + "logps/chosen": -262.0631408691406, + "logps/rejected": -226.23211669921875, + "loss": 0.7169, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.03545413166284561, + "rewards/margins": -0.041504621505737305, + "rewards/rejected": 0.006050491239875555, + "step": 557 + }, + { + "epoch": 0.08629421998840131, + "grad_norm": 5.373672962188721, + "learning_rate": 1.4381443298969073e-06, + "logits/chosen": 3.5728745460510254, + "logits/rejected": 3.163386106491089, + "logps/chosen": -229.23838806152344, + "logps/rejected": -206.3196563720703, + "loss": 0.6944, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.007729053497314453, + "rewards/margins": 0.0011762618087232113, + "rewards/rejected": 0.006552794016897678, + "step": 558 + }, + { + "epoch": 0.08644886912816548, + "grad_norm": 4.6300506591796875, + "learning_rate": 1.440721649484536e-06, + "logits/chosen": 13.43500804901123, + "logits/rejected": 8.804279327392578, + "logps/chosen": -281.1897277832031, + "logps/rejected": -220.85140991210938, + "loss": 0.6911, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.05231967195868492, + "rewards/margins": 0.00836491584777832, + "rewards/rejected": -0.06068458408117294, + "step": 559 + }, + { + "epoch": 0.08660351826792964, + "grad_norm": 3.967717409133911, + "learning_rate": 1.4432989690721649e-06, + "logits/chosen": 13.546951293945312, + "logits/rejected": 12.877527236938477, + "logps/chosen": -261.8711242675781, + "logps/rejected": -196.81423950195312, + "loss": 0.6834, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.020794298499822617, + "rewards/margins": 0.023573974147439003, + "rewards/rejected": -0.002779675181955099, + "step": 560 + }, + { + "epoch": 0.08675816740769379, + "grad_norm": 4.789272785186768, + "learning_rate": 1.4458762886597942e-06, + "logits/chosen": 12.408659934997559, + "logits/rejected": 7.128690719604492, + "logps/chosen": -334.94708251953125, + "logps/rejected": -242.84063720703125, + "loss": 0.7051, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.06501893699169159, + "rewards/margins": -0.019919585436582565, + "rewards/rejected": -0.04509935528039932, + "step": 561 + }, + { + "epoch": 0.08691281654745796, + "grad_norm": 4.86594295501709, + "learning_rate": 1.448453608247423e-06, + "logits/chosen": 15.593953132629395, + "logits/rejected": 10.727025985717773, + "logps/chosen": -242.47654724121094, + "logps/rejected": -193.03759765625, + "loss": 0.6682, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.04373032972216606, + "rewards/margins": 0.054003361612558365, + "rewards/rejected": -0.01027302723377943, + "step": 562 + }, + { + "epoch": 0.08706746568722211, + "grad_norm": 3.5607645511627197, + "learning_rate": 1.4510309278350517e-06, + "logits/chosen": 9.887504577636719, + "logits/rejected": 6.421091079711914, + "logps/chosen": -208.6868438720703, + "logps/rejected": -188.00584411621094, + "loss": 0.6621, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.011667155660688877, + "rewards/margins": 0.06540470570325851, + "rewards/rejected": -0.07707186043262482, + "step": 563 + }, + { + "epoch": 0.08722211482698627, + "grad_norm": 6.123905181884766, + "learning_rate": 1.4536082474226806e-06, + "logits/chosen": 4.700023651123047, + "logits/rejected": 10.194036483764648, + "logps/chosen": -284.0270690917969, + "logps/rejected": -314.90087890625, + "loss": 0.791, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.12916384637355804, + "rewards/margins": -0.1683189868927002, + "rewards/rejected": 0.039155151695013046, + "step": 564 + }, + { + "epoch": 0.08737676396675044, + "grad_norm": 7.623630046844482, + "learning_rate": 1.4561855670103094e-06, + "logits/chosen": 10.971306800842285, + "logits/rejected": 11.86168098449707, + "logps/chosen": -307.81231689453125, + "logps/rejected": -314.9770812988281, + "loss": 0.7005, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.018035219982266426, + "rewards/margins": -0.013626815751194954, + "rewards/rejected": -0.004408406559377909, + "step": 565 + }, + { + "epoch": 0.08753141310651459, + "grad_norm": 4.394433975219727, + "learning_rate": 1.4587628865979383e-06, + "logits/chosen": 13.23172378540039, + "logits/rejected": 12.297246932983398, + "logps/chosen": -322.37738037109375, + "logps/rejected": -309.84039306640625, + "loss": 0.6609, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.01140308566391468, + "rewards/margins": 0.07004957646131516, + "rewards/rejected": -0.058646488934755325, + "step": 566 + }, + { + "epoch": 0.08768606224627876, + "grad_norm": 4.123403072357178, + "learning_rate": 1.4613402061855672e-06, + "logits/chosen": 8.478995323181152, + "logits/rejected": 5.38689661026001, + "logps/chosen": -178.5983123779297, + "logps/rejected": -173.1959228515625, + "loss": 0.6682, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.020140552893280983, + "rewards/margins": 0.05587553605437279, + "rewards/rejected": -0.07601609081029892, + "step": 567 + }, + { + "epoch": 0.08784071138604292, + "grad_norm": 4.850125312805176, + "learning_rate": 1.463917525773196e-06, + "logits/chosen": 5.15814733505249, + "logits/rejected": 6.131396293640137, + "logps/chosen": -235.00698852539062, + "logps/rejected": -256.5286865234375, + "loss": 0.6906, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.039864830672740936, + "rewards/margins": 0.00997433066368103, + "rewards/rejected": -0.04983916133642197, + "step": 568 + }, + { + "epoch": 0.08799536052580707, + "grad_norm": 5.217621803283691, + "learning_rate": 1.466494845360825e-06, + "logits/chosen": 12.082124710083008, + "logits/rejected": 9.792362213134766, + "logps/chosen": -338.691162109375, + "logps/rejected": -304.8896179199219, + "loss": 0.7155, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.10010968148708344, + "rewards/margins": -0.03405303880572319, + "rewards/rejected": -0.06605663895606995, + "step": 569 + }, + { + "epoch": 0.08815000966557124, + "grad_norm": 9.45602798461914, + "learning_rate": 1.4690721649484538e-06, + "logits/chosen": 6.290523052215576, + "logits/rejected": 7.201108455657959, + "logps/chosen": -331.61712646484375, + "logps/rejected": -379.7377624511719, + "loss": 0.6739, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.003652568906545639, + "rewards/margins": 0.049550626426935196, + "rewards/rejected": -0.053203195333480835, + "step": 570 + }, + { + "epoch": 0.08830465880533539, + "grad_norm": 6.101128101348877, + "learning_rate": 1.4716494845360827e-06, + "logits/chosen": 6.905837535858154, + "logits/rejected": 3.919721841812134, + "logps/chosen": -336.28363037109375, + "logps/rejected": -318.51483154296875, + "loss": 0.7239, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.0894021987915039, + "rewards/margins": -0.05167561024427414, + "rewards/rejected": -0.037726595997810364, + "step": 571 + }, + { + "epoch": 0.08845930794509956, + "grad_norm": 3.39512038230896, + "learning_rate": 1.4742268041237115e-06, + "logits/chosen": 9.016313552856445, + "logits/rejected": 7.709306716918945, + "logps/chosen": -157.89161682128906, + "logps/rejected": -154.44366455078125, + "loss": 0.6827, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.014220332726836205, + "rewards/margins": 0.02199702337384224, + "rewards/rejected": -0.007776690647006035, + "step": 572 + }, + { + "epoch": 0.08861395708486372, + "grad_norm": 4.189332962036133, + "learning_rate": 1.4768041237113404e-06, + "logits/chosen": 11.938968658447266, + "logits/rejected": 10.110847473144531, + "logps/chosen": -209.30987548828125, + "logps/rejected": -176.3541259765625, + "loss": 0.6913, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.04447899013757706, + "rewards/margins": 0.005109596997499466, + "rewards/rejected": -0.049588583409786224, + "step": 573 + }, + { + "epoch": 0.08876860622462787, + "grad_norm": 5.407228469848633, + "learning_rate": 1.4793814432989693e-06, + "logits/chosen": 10.162530899047852, + "logits/rejected": 9.525054931640625, + "logps/chosen": -213.1966552734375, + "logps/rejected": -269.3774719238281, + "loss": 0.667, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.03099069744348526, + "rewards/margins": 0.05701952055096626, + "rewards/rejected": -0.08801022171974182, + "step": 574 + }, + { + "epoch": 0.08892325536439204, + "grad_norm": 4.868646144866943, + "learning_rate": 1.4819587628865981e-06, + "logits/chosen": 6.9227375984191895, + "logits/rejected": 5.648509979248047, + "logps/chosen": -292.35418701171875, + "logps/rejected": -233.95372009277344, + "loss": 0.726, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.12666283547878265, + "rewards/margins": -0.049906060099601746, + "rewards/rejected": -0.07675676792860031, + "step": 575 + }, + { + "epoch": 0.0890779045041562, + "grad_norm": 3.63031268119812, + "learning_rate": 1.484536082474227e-06, + "logits/chosen": 13.012777328491211, + "logits/rejected": 8.049663543701172, + "logps/chosen": -146.19189453125, + "logps/rejected": -133.70364379882812, + "loss": 0.698, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.0466887503862381, + "rewards/margins": -0.0050859469920396805, + "rewards/rejected": -0.04160280153155327, + "step": 576 + }, + { + "epoch": 0.08923255364392035, + "grad_norm": 7.05629825592041, + "learning_rate": 1.4871134020618557e-06, + "logits/chosen": 9.393556594848633, + "logits/rejected": 9.363550186157227, + "logps/chosen": -175.64712524414062, + "logps/rejected": -230.2144317626953, + "loss": 0.7652, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.06522531062364578, + "rewards/margins": -0.11029025912284851, + "rewards/rejected": 0.04506495222449303, + "step": 577 + }, + { + "epoch": 0.08938720278368452, + "grad_norm": 6.8438591957092285, + "learning_rate": 1.4896907216494846e-06, + "logits/chosen": 7.139414310455322, + "logits/rejected": 5.4244914054870605, + "logps/chosen": -305.4336853027344, + "logps/rejected": -273.1817932128906, + "loss": 0.7222, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.032219935208559036, + "rewards/margins": -0.05352919548749924, + "rewards/rejected": 0.021309256553649902, + "step": 578 + }, + { + "epoch": 0.08954185192344867, + "grad_norm": 5.016536712646484, + "learning_rate": 1.4922680412371134e-06, + "logits/chosen": 12.598133087158203, + "logits/rejected": 8.138943672180176, + "logps/chosen": -274.45623779296875, + "logps/rejected": -223.34591674804688, + "loss": 0.6702, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.008155249990522861, + "rewards/margins": 0.05107593908905983, + "rewards/rejected": -0.05923118814826012, + "step": 579 + }, + { + "epoch": 0.08969650106321284, + "grad_norm": 5.520704746246338, + "learning_rate": 1.4948453608247423e-06, + "logits/chosen": 9.072236061096191, + "logits/rejected": 11.72704029083252, + "logps/chosen": -310.29144287109375, + "logps/rejected": -319.53228759765625, + "loss": 0.7474, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.11069688946008682, + "rewards/margins": -0.09896030277013779, + "rewards/rejected": -0.011736582964658737, + "step": 580 + }, + { + "epoch": 0.089851150202977, + "grad_norm": 4.271003246307373, + "learning_rate": 1.4974226804123712e-06, + "logits/chosen": 12.873431205749512, + "logits/rejected": 11.894736289978027, + "logps/chosen": -254.03485107421875, + "logps/rejected": -251.77838134765625, + "loss": 0.697, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.030272291973233223, + "rewards/margins": -0.002185058780014515, + "rewards/rejected": -0.028087232261896133, + "step": 581 + }, + { + "epoch": 0.09000579934274115, + "grad_norm": 4.902781963348389, + "learning_rate": 1.5e-06, + "logits/chosen": 9.885713577270508, + "logits/rejected": 8.76339054107666, + "logps/chosen": -377.46044921875, + "logps/rejected": -339.8441162109375, + "loss": 0.6819, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.033589839935302734, + "rewards/margins": 0.02693825401365757, + "rewards/rejected": 0.006651591509580612, + "step": 582 + }, + { + "epoch": 0.09016044848250532, + "grad_norm": 7.76738977432251, + "learning_rate": 1.502577319587629e-06, + "logits/chosen": 8.139245986938477, + "logits/rejected": -1.6834871768951416, + "logps/chosen": -399.81158447265625, + "logps/rejected": -163.51123046875, + "loss": 0.6969, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.0031184665858745575, + "rewards/margins": 1.3081356883049011e-05, + "rewards/rejected": 0.003105376847088337, + "step": 583 + }, + { + "epoch": 0.09031509762226948, + "grad_norm": 5.397767066955566, + "learning_rate": 1.5051546391752578e-06, + "logits/chosen": 7.014987945556641, + "logits/rejected": 9.233689308166504, + "logps/chosen": -257.14874267578125, + "logps/rejected": -223.935546875, + "loss": 0.735, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.012377787381410599, + "rewards/margins": -0.0755234807729721, + "rewards/rejected": 0.08790126442909241, + "step": 584 + }, + { + "epoch": 0.09046974676203363, + "grad_norm": 6.752077579498291, + "learning_rate": 1.5077319587628867e-06, + "logits/chosen": 8.48318862915039, + "logits/rejected": 8.355714797973633, + "logps/chosen": -221.28858947753906, + "logps/rejected": -293.10003662109375, + "loss": 0.6709, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.016498947516083717, + "rewards/margins": 0.05205293744802475, + "rewards/rejected": -0.035553980618715286, + "step": 585 + }, + { + "epoch": 0.0906243959017978, + "grad_norm": 5.544448375701904, + "learning_rate": 1.5103092783505155e-06, + "logits/chosen": 11.244234085083008, + "logits/rejected": 8.31943130493164, + "logps/chosen": -368.56195068359375, + "logps/rejected": -313.887451171875, + "loss": 0.6863, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.03733672946691513, + "rewards/margins": 0.0167783722281456, + "rewards/rejected": -0.05411510542035103, + "step": 586 + }, + { + "epoch": 0.09077904504156195, + "grad_norm": 6.980535507202148, + "learning_rate": 1.5128865979381444e-06, + "logits/chosen": 12.466981887817383, + "logits/rejected": 10.62402057647705, + "logps/chosen": -485.16552734375, + "logps/rejected": -321.82177734375, + "loss": 0.6935, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.02911529503762722, + "rewards/margins": 0.006746768951416016, + "rewards/rejected": 0.022368527948856354, + "step": 587 + }, + { + "epoch": 0.09093369418132612, + "grad_norm": 5.6725945472717285, + "learning_rate": 1.5154639175257733e-06, + "logits/chosen": 10.236791610717773, + "logits/rejected": 11.348725318908691, + "logps/chosen": -290.7213439941406, + "logps/rejected": -306.6461181640625, + "loss": 0.6822, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.04713492467999458, + "rewards/margins": 0.02363448217511177, + "rewards/rejected": 0.023500442504882812, + "step": 588 + }, + { + "epoch": 0.09108834332109028, + "grad_norm": 3.99106764793396, + "learning_rate": 1.5180412371134021e-06, + "logits/chosen": 12.880867004394531, + "logits/rejected": 1.6681647300720215, + "logps/chosen": -228.91464233398438, + "logps/rejected": -94.42927551269531, + "loss": 0.7188, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.042986344546079636, + "rewards/margins": -0.048634838312864304, + "rewards/rejected": 0.005648494698107243, + "step": 589 + }, + { + "epoch": 0.09124299246085443, + "grad_norm": 4.398662090301514, + "learning_rate": 1.520618556701031e-06, + "logits/chosen": 10.31039810180664, + "logits/rejected": 11.152352333068848, + "logps/chosen": -245.16677856445312, + "logps/rejected": -235.45811462402344, + "loss": 0.6564, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.02379927784204483, + "rewards/margins": 0.08172422647476196, + "rewards/rejected": -0.1055234968662262, + "step": 590 + }, + { + "epoch": 0.0913976416006186, + "grad_norm": 7.149019241333008, + "learning_rate": 1.5231958762886599e-06, + "logits/chosen": 2.468039035797119, + "logits/rejected": 6.3760905265808105, + "logps/chosen": -195.27056884765625, + "logps/rejected": -318.6320495605469, + "loss": 0.6857, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.02985386922955513, + "rewards/margins": 0.02219838835299015, + "rewards/rejected": 0.007655477151274681, + "step": 591 + }, + { + "epoch": 0.09155229074038276, + "grad_norm": 5.477923393249512, + "learning_rate": 1.525773195876289e-06, + "logits/chosen": 9.466489791870117, + "logits/rejected": 11.058809280395508, + "logps/chosen": -297.7689208984375, + "logps/rejected": -338.0509948730469, + "loss": 0.6585, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.018276499584317207, + "rewards/margins": 0.07651957869529724, + "rewards/rejected": -0.05824308097362518, + "step": 592 + }, + { + "epoch": 0.09170693988014691, + "grad_norm": 5.714723110198975, + "learning_rate": 1.5283505154639178e-06, + "logits/chosen": 14.133981704711914, + "logits/rejected": 7.50548791885376, + "logps/chosen": -311.7084045410156, + "logps/rejected": -237.20822143554688, + "loss": 0.6725, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.031813692301511765, + "rewards/margins": 0.049206286668777466, + "rewards/rejected": -0.08101997524499893, + "step": 593 + }, + { + "epoch": 0.09186158901991108, + "grad_norm": 3.7252426147460938, + "learning_rate": 1.5309278350515467e-06, + "logits/chosen": 5.907700538635254, + "logits/rejected": 6.212896347045898, + "logps/chosen": -138.7192840576172, + "logps/rejected": -149.2027130126953, + "loss": 0.6663, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.038338325917720795, + "rewards/margins": 0.05797116458415985, + "rewards/rejected": -0.019632840529084206, + "step": 594 + }, + { + "epoch": 0.09201623815967523, + "grad_norm": 6.494401454925537, + "learning_rate": 1.5335051546391756e-06, + "logits/chosen": 6.552602767944336, + "logits/rejected": 6.334468364715576, + "logps/chosen": -301.9626159667969, + "logps/rejected": -302.86895751953125, + "loss": 0.6654, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.024748800322413445, + "rewards/margins": 0.060784436762332916, + "rewards/rejected": -0.03603563457727432, + "step": 595 + }, + { + "epoch": 0.0921708872994394, + "grad_norm": 7.3843464851379395, + "learning_rate": 1.5360824742268042e-06, + "logits/chosen": 11.448627471923828, + "logits/rejected": 2.547853469848633, + "logps/chosen": -354.51171875, + "logps/rejected": -222.20797729492188, + "loss": 0.6837, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.0057009682059288025, + "rewards/margins": 0.040397725999355316, + "rewards/rejected": -0.03469677269458771, + "step": 596 + }, + { + "epoch": 0.09232553643920356, + "grad_norm": 4.499715805053711, + "learning_rate": 1.5386597938144331e-06, + "logits/chosen": 11.953042030334473, + "logits/rejected": 9.38812255859375, + "logps/chosen": -294.1684265136719, + "logps/rejected": -175.8557891845703, + "loss": 0.6909, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.054490186274051666, + "rewards/margins": 0.009673496708273888, + "rewards/rejected": -0.0641636773943901, + "step": 597 + }, + { + "epoch": 0.09248018557896771, + "grad_norm": 6.344499588012695, + "learning_rate": 1.541237113402062e-06, + "logits/chosen": 12.79347038269043, + "logits/rejected": 8.125385284423828, + "logps/chosen": -376.8821105957031, + "logps/rejected": -276.41351318359375, + "loss": 0.7106, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.08567304909229279, + "rewards/margins": -0.03087618388235569, + "rewards/rejected": -0.054796863347291946, + "step": 598 + }, + { + "epoch": 0.09263483471873188, + "grad_norm": 5.350180149078369, + "learning_rate": 1.5438144329896909e-06, + "logits/chosen": 9.485483169555664, + "logits/rejected": 0.19389140605926514, + "logps/chosen": -300.34283447265625, + "logps/rejected": -278.94049072265625, + "loss": 0.6956, + "rewards/accuracies": 0.25, + "rewards/chosen": 0.0005204216577112675, + "rewards/margins": -0.00031356606632471085, + "rewards/rejected": 0.0008339891210198402, + "step": 599 + }, + { + "epoch": 0.09278948385849603, + "grad_norm": 4.526947498321533, + "learning_rate": 1.5463917525773197e-06, + "logits/chosen": 10.212084770202637, + "logits/rejected": 5.363928318023682, + "logps/chosen": -274.0177917480469, + "logps/rejected": -252.82843017578125, + "loss": 0.6389, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.048618413507938385, + "rewards/margins": 0.11915703117847443, + "rewards/rejected": -0.07053861767053604, + "step": 600 + }, + { + "epoch": 0.0929441329982602, + "grad_norm": 4.0233378410339355, + "learning_rate": 1.5489690721649486e-06, + "logits/chosen": 8.673198699951172, + "logits/rejected": 2.4412617683410645, + "logps/chosen": -289.31805419921875, + "logps/rejected": -250.50515747070312, + "loss": 0.6486, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.025545261800289154, + "rewards/margins": 0.10764918476343155, + "rewards/rejected": -0.0821039155125618, + "step": 601 + }, + { + "epoch": 0.09309878213802436, + "grad_norm": 8.741628646850586, + "learning_rate": 1.5515463917525775e-06, + "logits/chosen": 11.945576667785645, + "logits/rejected": 8.788779258728027, + "logps/chosen": -715.8478393554688, + "logps/rejected": -467.29986572265625, + "loss": 0.6618, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.03313750401139259, + "rewards/margins": 0.10495051741600037, + "rewards/rejected": -0.07181301712989807, + "step": 602 + }, + { + "epoch": 0.09325343127778851, + "grad_norm": 4.780418395996094, + "learning_rate": 1.5541237113402063e-06, + "logits/chosen": 7.057133674621582, + "logits/rejected": 9.991583824157715, + "logps/chosen": -225.34634399414062, + "logps/rejected": -225.10385131835938, + "loss": 0.7096, + "rewards/accuracies": 0.25, + "rewards/chosen": -0.04378471523523331, + "rewards/margins": -0.019158653914928436, + "rewards/rejected": -0.02462606318295002, + "step": 603 + }, + { + "epoch": 0.09340808041755268, + "grad_norm": 4.472261905670166, + "learning_rate": 1.5567010309278352e-06, + "logits/chosen": 9.863222122192383, + "logits/rejected": 7.078555107116699, + "logps/chosen": -223.77841186523438, + "logps/rejected": -211.4258270263672, + "loss": 0.6931, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.014171219430863857, + "rewards/margins": 0.0021557817235589027, + "rewards/rejected": 0.012015435844659805, + "step": 604 + }, + { + "epoch": 0.09356272955731684, + "grad_norm": 5.152721405029297, + "learning_rate": 1.559278350515464e-06, + "logits/chosen": 11.461219787597656, + "logits/rejected": 9.197368621826172, + "logps/chosen": -310.2313232421875, + "logps/rejected": -313.9874572753906, + "loss": 0.6654, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.05733604356646538, + "rewards/margins": 0.060371968895196915, + "rewards/rejected": -0.003035927191376686, + "step": 605 + }, + { + "epoch": 0.093717378697081, + "grad_norm": 3.8857390880584717, + "learning_rate": 1.561855670103093e-06, + "logits/chosen": 14.510835647583008, + "logits/rejected": 8.955690383911133, + "logps/chosen": -185.84414672851562, + "logps/rejected": -135.18988037109375, + "loss": 0.6965, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.042194414883852005, + "rewards/margins": -0.002108335494995117, + "rewards/rejected": -0.04008607938885689, + "step": 606 + }, + { + "epoch": 0.09387202783684516, + "grad_norm": 6.44877815246582, + "learning_rate": 1.5644329896907218e-06, + "logits/chosen": 13.071895599365234, + "logits/rejected": 6.534764766693115, + "logps/chosen": -347.78436279296875, + "logps/rejected": -371.85028076171875, + "loss": 0.6985, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.03660164028406143, + "rewards/margins": -0.0078088222071528435, + "rewards/rejected": -0.028792815282940865, + "step": 607 + }, + { + "epoch": 0.09402667697660931, + "grad_norm": 5.619678497314453, + "learning_rate": 1.5670103092783507e-06, + "logits/chosen": 14.796981811523438, + "logits/rejected": 11.97298812866211, + "logps/chosen": -281.37017822265625, + "logps/rejected": -267.58111572265625, + "loss": 0.7588, + "rewards/accuracies": 0.25, + "rewards/chosen": -0.07332611083984375, + "rewards/margins": -0.11931552737951279, + "rewards/rejected": 0.045989420264959335, + "step": 608 + }, + { + "epoch": 0.09418132611637348, + "grad_norm": 5.50604248046875, + "learning_rate": 1.5695876288659796e-06, + "logits/chosen": 7.825628757476807, + "logits/rejected": 4.102483749389648, + "logps/chosen": -341.34710693359375, + "logps/rejected": -278.0054931640625, + "loss": 0.671, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.016111426055431366, + "rewards/margins": 0.05413079261779785, + "rewards/rejected": -0.038019370287656784, + "step": 609 + }, + { + "epoch": 0.09433597525613764, + "grad_norm": 5.484696388244629, + "learning_rate": 1.5721649484536082e-06, + "logits/chosen": 6.812306880950928, + "logits/rejected": 6.816822052001953, + "logps/chosen": -336.73529052734375, + "logps/rejected": -343.827392578125, + "loss": 0.6566, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.008094217628240585, + "rewards/margins": 0.0756252333521843, + "rewards/rejected": -0.06753101944923401, + "step": 610 + }, + { + "epoch": 0.09449062439590179, + "grad_norm": 5.3768110275268555, + "learning_rate": 1.5747422680412371e-06, + "logits/chosen": 11.924095153808594, + "logits/rejected": 10.34201431274414, + "logps/chosen": -258.12548828125, + "logps/rejected": -195.8258056640625, + "loss": 0.7354, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.03276486694812775, + "rewards/margins": -0.0749281495809555, + "rewards/rejected": 0.04216327518224716, + "step": 611 + }, + { + "epoch": 0.09464527353566596, + "grad_norm": 4.8351826667785645, + "learning_rate": 1.577319587628866e-06, + "logits/chosen": 7.209897994995117, + "logits/rejected": 8.280296325683594, + "logps/chosen": -252.28445434570312, + "logps/rejected": -274.72479248046875, + "loss": 0.6394, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.01845426671206951, + "rewards/margins": 0.116290383040905, + "rewards/rejected": -0.13474464416503906, + "step": 612 + }, + { + "epoch": 0.09479992267543012, + "grad_norm": 5.207186698913574, + "learning_rate": 1.5798969072164949e-06, + "logits/chosen": 9.675490379333496, + "logits/rejected": 6.493882179260254, + "logps/chosen": -496.61614990234375, + "logps/rejected": -297.22283935546875, + "loss": 0.6582, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.028522584587335587, + "rewards/margins": 0.0827580988407135, + "rewards/rejected": -0.054235510528087616, + "step": 613 + }, + { + "epoch": 0.09495457181519427, + "grad_norm": 4.982122898101807, + "learning_rate": 1.5824742268041237e-06, + "logits/chosen": 11.553534507751465, + "logits/rejected": 10.82922649383545, + "logps/chosen": -243.1285400390625, + "logps/rejected": -209.518310546875, + "loss": 0.714, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.007285021711140871, + "rewards/margins": -0.03921153396368027, + "rewards/rejected": 0.031926512718200684, + "step": 614 + }, + { + "epoch": 0.09510922095495844, + "grad_norm": 5.356812477111816, + "learning_rate": 1.5850515463917526e-06, + "logits/chosen": 12.095643997192383, + "logits/rejected": 4.967586517333984, + "logps/chosen": -262.9018249511719, + "logps/rejected": -209.69000244140625, + "loss": 0.6472, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.005501079838722944, + "rewards/margins": 0.10470442473888397, + "rewards/rejected": -0.09920335561037064, + "step": 615 + }, + { + "epoch": 0.09526387009472259, + "grad_norm": 4.781611919403076, + "learning_rate": 1.5876288659793815e-06, + "logits/chosen": 11.37546157836914, + "logits/rejected": 8.753242492675781, + "logps/chosen": -236.43675231933594, + "logps/rejected": -231.66348266601562, + "loss": 0.7134, + "rewards/accuracies": 0.25, + "rewards/chosen": -0.05269956216216087, + "rewards/margins": -0.037966012954711914, + "rewards/rejected": -0.014733552932739258, + "step": 616 + }, + { + "epoch": 0.09541851923448676, + "grad_norm": 5.962507247924805, + "learning_rate": 1.5902061855670103e-06, + "logits/chosen": 11.31015682220459, + "logits/rejected": 6.8168182373046875, + "logps/chosen": -370.357421875, + "logps/rejected": -332.8713684082031, + "loss": 0.7109, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.06651440262794495, + "rewards/margins": -0.029712533578276634, + "rewards/rejected": -0.036801863461732864, + "step": 617 + }, + { + "epoch": 0.09557316837425092, + "grad_norm": 4.861644268035889, + "learning_rate": 1.5927835051546392e-06, + "logits/chosen": 8.432106018066406, + "logits/rejected": -0.2939087152481079, + "logps/chosen": -329.53631591796875, + "logps/rejected": -212.93565368652344, + "loss": 0.7326, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.009197043254971504, + "rewards/margins": -0.06665685027837753, + "rewards/rejected": 0.07585389912128448, + "step": 618 + }, + { + "epoch": 0.09572781751401507, + "grad_norm": 5.125204086303711, + "learning_rate": 1.595360824742268e-06, + "logits/chosen": 5.196965217590332, + "logits/rejected": 4.761816024780273, + "logps/chosen": -197.16281127929688, + "logps/rejected": -129.31732177734375, + "loss": 0.6997, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.0391143299639225, + "rewards/margins": -0.01229917909950018, + "rewards/rejected": -0.026815149933099747, + "step": 619 + }, + { + "epoch": 0.09588246665377924, + "grad_norm": 8.327315330505371, + "learning_rate": 1.597938144329897e-06, + "logits/chosen": 10.22437858581543, + "logits/rejected": 7.134114742279053, + "logps/chosen": -340.1286315917969, + "logps/rejected": -298.02813720703125, + "loss": 0.6509, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.029244422912597656, + "rewards/margins": 0.0943283662199974, + "rewards/rejected": -0.12357278168201447, + "step": 620 + }, + { + "epoch": 0.0960371157935434, + "grad_norm": 5.642324447631836, + "learning_rate": 1.6005154639175258e-06, + "logits/chosen": 13.739320755004883, + "logits/rejected": 12.696159362792969, + "logps/chosen": -314.5406799316406, + "logps/rejected": -323.3088073730469, + "loss": 0.6784, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.011061572469770908, + "rewards/margins": 0.040704868733882904, + "rewards/rejected": -0.051766443997621536, + "step": 621 + }, + { + "epoch": 0.09619176493330756, + "grad_norm": 5.283963203430176, + "learning_rate": 1.603092783505155e-06, + "logits/chosen": 13.151761054992676, + "logits/rejected": 17.402347564697266, + "logps/chosen": -220.0209197998047, + "logps/rejected": -191.966552734375, + "loss": 0.6695, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.033338069915771484, + "rewards/margins": 0.052843473851680756, + "rewards/rejected": -0.01950540393590927, + "step": 622 + }, + { + "epoch": 0.09634641407307172, + "grad_norm": 4.335860729217529, + "learning_rate": 1.6056701030927838e-06, + "logits/chosen": 6.806107521057129, + "logits/rejected": 15.09583854675293, + "logps/chosen": -181.46981811523438, + "logps/rejected": -268.5600891113281, + "loss": 0.7306, + "rewards/accuracies": 0.25, + "rewards/chosen": -0.03473300859332085, + "rewards/margins": -0.06548719108104706, + "rewards/rejected": 0.03075418621301651, + "step": 623 + }, + { + "epoch": 0.09650106321283587, + "grad_norm": 5.092329978942871, + "learning_rate": 1.6082474226804127e-06, + "logits/chosen": 10.802000045776367, + "logits/rejected": 10.009002685546875, + "logps/chosen": -210.10116577148438, + "logps/rejected": -251.5255889892578, + "loss": 0.6683, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.005318784154951572, + "rewards/margins": 0.05525808781385422, + "rewards/rejected": -0.06057686731219292, + "step": 624 + }, + { + "epoch": 0.09665571235260004, + "grad_norm": 4.9923248291015625, + "learning_rate": 1.6108247422680415e-06, + "logits/chosen": 5.184527397155762, + "logits/rejected": 8.153695106506348, + "logps/chosen": -206.96498107910156, + "logps/rejected": -223.87667846679688, + "loss": 0.6807, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.036101486533880234, + "rewards/margins": 0.03194420412182808, + "rewards/rejected": -0.06804568320512772, + "step": 625 + }, + { + "epoch": 0.0968103614923642, + "grad_norm": 4.65030574798584, + "learning_rate": 1.6134020618556704e-06, + "logits/chosen": 14.926023483276367, + "logits/rejected": 13.717376708984375, + "logps/chosen": -177.5963592529297, + "logps/rejected": -249.663330078125, + "loss": 0.6647, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.006681108847260475, + "rewards/margins": 0.05900819972157478, + "rewards/rejected": -0.06568930298089981, + "step": 626 + }, + { + "epoch": 0.09696501063212835, + "grad_norm": 4.603990077972412, + "learning_rate": 1.6159793814432993e-06, + "logits/chosen": 14.098691940307617, + "logits/rejected": 6.207292556762695, + "logps/chosen": -364.30755615234375, + "logps/rejected": -272.8124084472656, + "loss": 0.6847, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.021910574287176132, + "rewards/margins": 0.02428731881082058, + "rewards/rejected": -0.04619789496064186, + "step": 627 + }, + { + "epoch": 0.09711965977189252, + "grad_norm": 5.58837890625, + "learning_rate": 1.6185567010309281e-06, + "logits/chosen": 9.047807693481445, + "logits/rejected": 4.971400260925293, + "logps/chosen": -262.41424560546875, + "logps/rejected": -293.35064697265625, + "loss": 0.71, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.03303651884198189, + "rewards/margins": -0.030684994533658028, + "rewards/rejected": -0.0023515233770012856, + "step": 628 + }, + { + "epoch": 0.09727430891165668, + "grad_norm": 5.33598518371582, + "learning_rate": 1.6211340206185568e-06, + "logits/chosen": 5.499825477600098, + "logits/rejected": 4.790665626525879, + "logps/chosen": -268.12353515625, + "logps/rejected": -268.49896240234375, + "loss": 0.7246, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.07068347930908203, + "rewards/margins": -0.06015462800860405, + "rewards/rejected": -0.010528851300477982, + "step": 629 + }, + { + "epoch": 0.09742895805142084, + "grad_norm": 4.960086822509766, + "learning_rate": 1.6237113402061857e-06, + "logits/chosen": 15.52161693572998, + "logits/rejected": 11.660200119018555, + "logps/chosen": -326.958251953125, + "logps/rejected": -333.1241455078125, + "loss": 0.6395, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.03655433654785156, + "rewards/margins": 0.1183067262172699, + "rewards/rejected": -0.08175239711999893, + "step": 630 + }, + { + "epoch": 0.097583607191185, + "grad_norm": 4.868167400360107, + "learning_rate": 1.6262886597938145e-06, + "logits/chosen": 14.511956214904785, + "logits/rejected": 6.359712600708008, + "logps/chosen": -269.334228515625, + "logps/rejected": -196.94873046875, + "loss": 0.6468, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.08865445107221603, + "rewards/margins": 0.10222003608942032, + "rewards/rejected": -0.013565592467784882, + "step": 631 + }, + { + "epoch": 0.09773825633094915, + "grad_norm": 6.033456325531006, + "learning_rate": 1.6288659793814434e-06, + "logits/chosen": 13.104146957397461, + "logits/rejected": 8.012776374816895, + "logps/chosen": -276.33807373046875, + "logps/rejected": -290.249267578125, + "loss": 0.6742, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.03361959382891655, + "rewards/margins": 0.041159387677907944, + "rewards/rejected": -0.07477898895740509, + "step": 632 + }, + { + "epoch": 0.09789290547071332, + "grad_norm": 7.156194686889648, + "learning_rate": 1.6314432989690723e-06, + "logits/chosen": 14.901607513427734, + "logits/rejected": 10.479061126708984, + "logps/chosen": -433.1152648925781, + "logps/rejected": -299.2436828613281, + "loss": 0.7059, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.003666020929813385, + "rewards/margins": -0.020360376685857773, + "rewards/rejected": 0.016694355756044388, + "step": 633 + }, + { + "epoch": 0.09804755461047748, + "grad_norm": 3.93912672996521, + "learning_rate": 1.6340206185567012e-06, + "logits/chosen": 9.796585083007812, + "logits/rejected": 4.748625755310059, + "logps/chosen": -230.2073974609375, + "logps/rejected": -189.3337860107422, + "loss": 0.6884, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.04565896838903427, + "rewards/margins": 0.024111414328217506, + "rewards/rejected": 0.021547559648752213, + "step": 634 + }, + { + "epoch": 0.09820220375024163, + "grad_norm": 7.107681751251221, + "learning_rate": 1.63659793814433e-06, + "logits/chosen": 8.632162094116211, + "logits/rejected": 9.427631378173828, + "logps/chosen": -387.70245361328125, + "logps/rejected": -285.6554870605469, + "loss": 0.7229, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.026623060926795006, + "rewards/margins": -0.05328959971666336, + "rewards/rejected": 0.02666654624044895, + "step": 635 + }, + { + "epoch": 0.0983568528900058, + "grad_norm": 4.889127254486084, + "learning_rate": 1.639175257731959e-06, + "logits/chosen": 8.18308162689209, + "logits/rejected": 13.358589172363281, + "logps/chosen": -200.55355834960938, + "logps/rejected": -285.7563781738281, + "loss": 0.7322, + "rewards/accuracies": 0.25, + "rewards/chosen": -0.004796029534190893, + "rewards/margins": -0.06852416694164276, + "rewards/rejected": 0.06372814625501633, + "step": 636 + }, + { + "epoch": 0.09851150202976997, + "grad_norm": 4.188296794891357, + "learning_rate": 1.6417525773195878e-06, + "logits/chosen": 9.515541076660156, + "logits/rejected": 0.9121682643890381, + "logps/chosen": -191.35414123535156, + "logps/rejected": -132.52049255371094, + "loss": 0.692, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.05044122040271759, + "rewards/margins": 0.003530503250658512, + "rewards/rejected": -0.05397172272205353, + "step": 637 + }, + { + "epoch": 0.09866615116953412, + "grad_norm": 3.6274330615997314, + "learning_rate": 1.6443298969072167e-06, + "logits/chosen": 9.701925277709961, + "logits/rejected": 3.2490687370300293, + "logps/chosen": -225.98117065429688, + "logps/rejected": -140.39572143554688, + "loss": 0.7037, + "rewards/accuracies": 0.125, + "rewards/chosen": -0.046563006937503815, + "rewards/margins": -0.019072817638516426, + "rewards/rejected": -0.02749018743634224, + "step": 638 + }, + { + "epoch": 0.09882080030929828, + "grad_norm": 5.348845958709717, + "learning_rate": 1.6469072164948455e-06, + "logits/chosen": 11.14402961730957, + "logits/rejected": 5.489739418029785, + "logps/chosen": -414.48193359375, + "logps/rejected": -297.72247314453125, + "loss": 0.6965, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.02482414059340954, + "rewards/margins": -0.0009044390171766281, + "rewards/rejected": -0.02391970530152321, + "step": 639 + }, + { + "epoch": 0.09897544944906243, + "grad_norm": 5.5072174072265625, + "learning_rate": 1.6494845360824744e-06, + "logits/chosen": 6.948462963104248, + "logits/rejected": 10.435422897338867, + "logps/chosen": -357.62353515625, + "logps/rejected": -391.70892333984375, + "loss": 0.6993, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.003362540155649185, + "rewards/margins": 0.0031153932213783264, + "rewards/rejected": -0.006477933377027512, + "step": 640 + }, + { + "epoch": 0.0991300985888266, + "grad_norm": 4.387056350708008, + "learning_rate": 1.6520618556701033e-06, + "logits/chosen": 10.224788665771484, + "logits/rejected": 6.8633503913879395, + "logps/chosen": -137.84393310546875, + "logps/rejected": -127.134765625, + "loss": 0.7039, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.011342001147568226, + "rewards/margins": -0.017363524064421654, + "rewards/rejected": 0.006021523382514715, + "step": 641 + }, + { + "epoch": 0.09928474772859076, + "grad_norm": 2.762166976928711, + "learning_rate": 1.6546391752577321e-06, + "logits/chosen": 6.713349342346191, + "logits/rejected": 10.584786415100098, + "logps/chosen": -73.66946411132812, + "logps/rejected": -101.34318542480469, + "loss": 0.6699, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.001994467107579112, + "rewards/margins": 0.04827606678009033, + "rewards/rejected": -0.05027053505182266, + "step": 642 + }, + { + "epoch": 0.09943939686835492, + "grad_norm": 4.7673020362854, + "learning_rate": 1.6572164948453608e-06, + "logits/chosen": 7.310449600219727, + "logits/rejected": 10.991437911987305, + "logps/chosen": -191.6696319580078, + "logps/rejected": -239.61160278320312, + "loss": 0.6844, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.03263092041015625, + "rewards/margins": 0.024889947846531868, + "rewards/rejected": 0.007740974426269531, + "step": 643 + }, + { + "epoch": 0.09959404600811908, + "grad_norm": 5.100652694702148, + "learning_rate": 1.6597938144329897e-06, + "logits/chosen": 10.107490539550781, + "logits/rejected": 10.81308650970459, + "logps/chosen": -223.90249633789062, + "logps/rejected": -318.9140930175781, + "loss": 0.681, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.007338476367294788, + "rewards/margins": 0.030842041596770287, + "rewards/rejected": -0.03818051889538765, + "step": 644 + }, + { + "epoch": 0.09974869514788325, + "grad_norm": 5.188139915466309, + "learning_rate": 1.6623711340206185e-06, + "logits/chosen": 11.427698135375977, + "logits/rejected": 8.363601684570312, + "logps/chosen": -278.658447265625, + "logps/rejected": -254.81475830078125, + "loss": 0.674, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.024146556854248047, + "rewards/margins": 0.04017200693488121, + "rewards/rejected": -0.016025448217988014, + "step": 645 + }, + { + "epoch": 0.0999033442876474, + "grad_norm": 4.890308856964111, + "learning_rate": 1.6649484536082474e-06, + "logits/chosen": 12.572355270385742, + "logits/rejected": 6.571372032165527, + "logps/chosen": -324.322509765625, + "logps/rejected": -228.4810791015625, + "loss": 0.7045, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.03328218683600426, + "rewards/margins": -0.017046835273504257, + "rewards/rejected": -0.0162353515625, + "step": 646 + }, + { + "epoch": 0.10005799342741156, + "grad_norm": 4.256762981414795, + "learning_rate": 1.6675257731958763e-06, + "logits/chosen": 7.919941425323486, + "logits/rejected": 4.48151969909668, + "logps/chosen": -269.82080078125, + "logps/rejected": -290.96575927734375, + "loss": 0.6229, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.06998326629400253, + "rewards/margins": 0.15377354621887207, + "rewards/rejected": -0.08379028737545013, + "step": 647 + }, + { + "epoch": 0.10021264256717571, + "grad_norm": 4.747322082519531, + "learning_rate": 1.6701030927835052e-06, + "logits/chosen": 14.111015319824219, + "logits/rejected": 10.853255271911621, + "logps/chosen": -317.4160461425781, + "logps/rejected": -211.2255859375, + "loss": 0.6949, + "rewards/accuracies": 0.25, + "rewards/chosen": -0.029404643923044205, + "rewards/margins": 0.002740904688835144, + "rewards/rejected": -0.03214554861187935, + "step": 648 + }, + { + "epoch": 0.10036729170693988, + "grad_norm": 3.3639962673187256, + "learning_rate": 1.672680412371134e-06, + "logits/chosen": 9.712835311889648, + "logits/rejected": 9.351411819458008, + "logps/chosen": -151.31210327148438, + "logps/rejected": -135.27659606933594, + "loss": 0.6971, + "rewards/accuracies": 0.25, + "rewards/chosen": 0.003421616042032838, + "rewards/margins": -0.005889464169740677, + "rewards/rejected": 0.009311079978942871, + "step": 649 + }, + { + "epoch": 0.10052194084670404, + "grad_norm": 4.250289440155029, + "learning_rate": 1.675257731958763e-06, + "logits/chosen": 8.930456161499023, + "logits/rejected": 12.005864143371582, + "logps/chosen": -177.62753295898438, + "logps/rejected": -221.66717529296875, + "loss": 0.6454, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.008941221982240677, + "rewards/margins": 0.10039810836315155, + "rewards/rejected": -0.09145689010620117, + "step": 650 + }, + { + "epoch": 0.1006765899864682, + "grad_norm": 6.616263389587402, + "learning_rate": 1.6778350515463918e-06, + "logits/chosen": 9.84302043914795, + "logits/rejected": 5.0720624923706055, + "logps/chosen": -214.24847412109375, + "logps/rejected": -151.21328735351562, + "loss": 0.7356, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.10174660384654999, + "rewards/margins": -0.07674827426671982, + "rewards/rejected": -0.024998335167765617, + "step": 651 + }, + { + "epoch": 0.10083123912623236, + "grad_norm": 4.11628532409668, + "learning_rate": 1.6804123711340209e-06, + "logits/chosen": 1.6993416547775269, + "logits/rejected": 0.3504621982574463, + "logps/chosen": -307.0272216796875, + "logps/rejected": -255.17030334472656, + "loss": 0.6447, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.05690937116742134, + "rewards/margins": 0.1046469658613205, + "rewards/rejected": -0.04773760214447975, + "step": 652 + }, + { + "epoch": 0.10098588826599653, + "grad_norm": 4.548579692840576, + "learning_rate": 1.6829896907216497e-06, + "logits/chosen": 10.458688735961914, + "logits/rejected": 6.558831691741943, + "logps/chosen": -195.62979125976562, + "logps/rejected": -202.48150634765625, + "loss": 0.6612, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.007487392984330654, + "rewards/margins": 0.06877279281616211, + "rewards/rejected": -0.06128539890050888, + "step": 653 + }, + { + "epoch": 0.10114053740576068, + "grad_norm": 7.213531970977783, + "learning_rate": 1.6855670103092786e-06, + "logits/chosen": 6.283636093139648, + "logits/rejected": 7.394184589385986, + "logps/chosen": -334.258544921875, + "logps/rejected": -553.8726196289062, + "loss": 0.7252, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.04854417219758034, + "rewards/margins": -0.05938858911395073, + "rewards/rejected": 0.010844423435628414, + "step": 654 + }, + { + "epoch": 0.10129518654552484, + "grad_norm": 5.0834879875183105, + "learning_rate": 1.6881443298969075e-06, + "logits/chosen": 11.197104454040527, + "logits/rejected": -6.347384452819824, + "logps/chosen": -488.41802978515625, + "logps/rejected": -232.65145874023438, + "loss": 0.648, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.07724142074584961, + "rewards/margins": 0.10025287419557571, + "rewards/rejected": -0.023011445999145508, + "step": 655 + }, + { + "epoch": 0.101449835685289, + "grad_norm": 4.407314777374268, + "learning_rate": 1.6907216494845363e-06, + "logits/chosen": 6.098791122436523, + "logits/rejected": 6.52764368057251, + "logps/chosen": -220.281494140625, + "logps/rejected": -245.36526489257812, + "loss": 0.6444, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.020389366894960403, + "rewards/margins": 0.10595996677875519, + "rewards/rejected": -0.08557059615850449, + "step": 656 + }, + { + "epoch": 0.10160448482505316, + "grad_norm": 5.5342583656311035, + "learning_rate": 1.6932989690721652e-06, + "logits/chosen": 9.752843856811523, + "logits/rejected": 5.9225263595581055, + "logps/chosen": -354.33734130859375, + "logps/rejected": -258.33380126953125, + "loss": 0.7011, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.06359577178955078, + "rewards/margins": -0.005949879065155983, + "rewards/rejected": -0.05764589458703995, + "step": 657 + }, + { + "epoch": 0.10175913396481732, + "grad_norm": 4.831279754638672, + "learning_rate": 1.695876288659794e-06, + "logits/chosen": 15.619799613952637, + "logits/rejected": 7.549403190612793, + "logps/chosen": -356.50537109375, + "logps/rejected": -279.78076171875, + "loss": 0.6546, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.03341083601117134, + "rewards/margins": 0.08175259083509445, + "rewards/rejected": -0.04834175109863281, + "step": 658 + }, + { + "epoch": 0.10191378310458148, + "grad_norm": 5.875309944152832, + "learning_rate": 1.698453608247423e-06, + "logits/chosen": 11.178624153137207, + "logits/rejected": 8.83117389678955, + "logps/chosen": -359.5606689453125, + "logps/rejected": -378.2495422363281, + "loss": 0.7065, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.008427286520600319, + "rewards/margins": -0.01331019401550293, + "rewards/rejected": 0.0217374786734581, + "step": 659 + }, + { + "epoch": 0.10206843224434564, + "grad_norm": 4.892007350921631, + "learning_rate": 1.7010309278350518e-06, + "logits/chosen": 16.60784339904785, + "logits/rejected": 10.378694534301758, + "logps/chosen": -181.0744171142578, + "logps/rejected": -120.24176025390625, + "loss": 0.7297, + "rewards/accuracies": 0.25, + "rewards/chosen": -0.04470715671777725, + "rewards/margins": -0.06933944672346115, + "rewards/rejected": 0.0246322862803936, + "step": 660 + }, + { + "epoch": 0.1022230813841098, + "grad_norm": 5.259274005889893, + "learning_rate": 1.7036082474226807e-06, + "logits/chosen": 7.3079938888549805, + "logits/rejected": 10.6466646194458, + "logps/chosen": -266.22613525390625, + "logps/rejected": -331.6214904785156, + "loss": 0.7575, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.013918399810791016, + "rewards/margins": -0.11277685314416885, + "rewards/rejected": 0.09885846078395844, + "step": 661 + }, + { + "epoch": 0.10237773052387396, + "grad_norm": 5.762076377868652, + "learning_rate": 1.7061855670103094e-06, + "logits/chosen": 12.200499534606934, + "logits/rejected": 8.767509460449219, + "logps/chosen": -254.0989990234375, + "logps/rejected": -182.6186065673828, + "loss": 0.7156, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.045737892389297485, + "rewards/margins": -0.03786543384194374, + "rewards/rejected": -0.007872462272644043, + "step": 662 + }, + { + "epoch": 0.10253237966363812, + "grad_norm": 4.698825359344482, + "learning_rate": 1.7087628865979382e-06, + "logits/chosen": 11.65015983581543, + "logits/rejected": 7.005955696105957, + "logps/chosen": -339.91546630859375, + "logps/rejected": -220.9632568359375, + "loss": 0.6895, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.017162036150693893, + "rewards/margins": 0.011417672038078308, + "rewards/rejected": -0.02857971377670765, + "step": 663 + }, + { + "epoch": 0.10268702880340227, + "grad_norm": 5.734563827514648, + "learning_rate": 1.7113402061855671e-06, + "logits/chosen": 5.212826728820801, + "logits/rejected": 7.359377861022949, + "logps/chosen": -257.7365417480469, + "logps/rejected": -309.5773010253906, + "loss": 0.6746, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.04576730728149414, + "rewards/margins": 0.04579916596412659, + "rewards/rejected": -3.1853094696998596e-05, + "step": 664 + }, + { + "epoch": 0.10284167794316644, + "grad_norm": 5.3968586921691895, + "learning_rate": 1.713917525773196e-06, + "logits/chosen": 9.994443893432617, + "logits/rejected": 10.042808532714844, + "logps/chosen": -311.7440185546875, + "logps/rejected": -281.3293151855469, + "loss": 0.6683, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.0023292554542422295, + "rewards/margins": 0.06412233412265778, + "rewards/rejected": -0.06179308891296387, + "step": 665 + }, + { + "epoch": 0.1029963270829306, + "grad_norm": 4.270318031311035, + "learning_rate": 1.7164948453608249e-06, + "logits/chosen": 7.720704555511475, + "logits/rejected": 11.80801010131836, + "logps/chosen": -180.4175567626953, + "logps/rejected": -247.20968627929688, + "loss": 0.71, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.027014685794711113, + "rewards/margins": -0.02740330807864666, + "rewards/rejected": 0.054417990148067474, + "step": 666 + }, + { + "epoch": 0.10315097622269476, + "grad_norm": 4.354268550872803, + "learning_rate": 1.7190721649484537e-06, + "logits/chosen": 14.810428619384766, + "logits/rejected": 10.384760856628418, + "logps/chosen": -247.15892028808594, + "logps/rejected": -205.6116180419922, + "loss": 0.6377, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.0993863046169281, + "rewards/margins": 0.12247291207313538, + "rewards/rejected": -0.02308659814298153, + "step": 667 + }, + { + "epoch": 0.10330562536245892, + "grad_norm": 6.416478633880615, + "learning_rate": 1.7216494845360826e-06, + "logits/chosen": 12.943609237670898, + "logits/rejected": 10.954690933227539, + "logps/chosen": -314.3682861328125, + "logps/rejected": -257.19830322265625, + "loss": 0.6339, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.09125775843858719, + "rewards/margins": 0.1301651895046234, + "rewards/rejected": -0.038907431066036224, + "step": 668 + }, + { + "epoch": 0.10346027450222309, + "grad_norm": 5.524900436401367, + "learning_rate": 1.7242268041237115e-06, + "logits/chosen": 9.195277214050293, + "logits/rejected": 9.423467636108398, + "logps/chosen": -292.70257568359375, + "logps/rejected": -316.78643798828125, + "loss": 0.7015, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.011797618120908737, + "rewards/margins": -0.01085028238594532, + "rewards/rejected": -0.0009473334066569805, + "step": 669 + }, + { + "epoch": 0.10361492364198724, + "grad_norm": 9.757253646850586, + "learning_rate": 1.7268041237113403e-06, + "logits/chosen": 14.634187698364258, + "logits/rejected": 5.348187446594238, + "logps/chosen": -465.4376220703125, + "logps/rejected": -342.2071228027344, + "loss": 0.7478, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.03725433349609375, + "rewards/margins": -0.094207763671875, + "rewards/rejected": 0.05695343390107155, + "step": 670 + }, + { + "epoch": 0.1037695727817514, + "grad_norm": 5.383145332336426, + "learning_rate": 1.7293814432989692e-06, + "logits/chosen": 13.826841354370117, + "logits/rejected": 9.040946960449219, + "logps/chosen": -269.61590576171875, + "logps/rejected": -259.8312072753906, + "loss": 0.6992, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.01482701301574707, + "rewards/margins": -0.011121200397610664, + "rewards/rejected": 0.025948215276002884, + "step": 671 + }, + { + "epoch": 0.10392422192151556, + "grad_norm": 5.767911911010742, + "learning_rate": 1.731958762886598e-06, + "logits/chosen": 12.13133430480957, + "logits/rejected": 7.115202903747559, + "logps/chosen": -259.7037658691406, + "logps/rejected": -172.8024139404297, + "loss": 0.7274, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.07644939422607422, + "rewards/margins": -0.05740680545568466, + "rewards/rejected": -0.019042588770389557, + "step": 672 + }, + { + "epoch": 0.10407887106127972, + "grad_norm": 6.355388164520264, + "learning_rate": 1.734536082474227e-06, + "logits/chosen": 10.470122337341309, + "logits/rejected": 10.729490280151367, + "logps/chosen": -353.16351318359375, + "logps/rejected": -359.7750244140625, + "loss": 0.6569, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.10434875637292862, + "rewards/margins": 0.0798313170671463, + "rewards/rejected": 0.024517439305782318, + "step": 673 + }, + { + "epoch": 0.10423352020104389, + "grad_norm": 5.094385623931885, + "learning_rate": 1.7371134020618558e-06, + "logits/chosen": 11.618303298950195, + "logits/rejected": 9.52016830444336, + "logps/chosen": -290.1989440917969, + "logps/rejected": -239.11581420898438, + "loss": 0.6721, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.03366727754473686, + "rewards/margins": 0.04694719612598419, + "rewards/rejected": -0.013279914855957031, + "step": 674 + }, + { + "epoch": 0.10438816934080804, + "grad_norm": 7.462923526763916, + "learning_rate": 1.7396907216494847e-06, + "logits/chosen": 14.113037109375, + "logits/rejected": 17.088916778564453, + "logps/chosen": -433.5478515625, + "logps/rejected": -552.8109741210938, + "loss": 0.7075, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.010138891637325287, + "rewards/margins": 0.004543498158454895, + "rewards/rejected": -0.014682380482554436, + "step": 675 + }, + { + "epoch": 0.1045428184805722, + "grad_norm": 5.817153453826904, + "learning_rate": 1.7422680412371134e-06, + "logits/chosen": 12.25714111328125, + "logits/rejected": 10.902167320251465, + "logps/chosen": -438.2728271484375, + "logps/rejected": -402.656982421875, + "loss": 0.6757, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.005818936973810196, + "rewards/margins": 0.03894777595996857, + "rewards/rejected": -0.03312883526086807, + "step": 676 + }, + { + "epoch": 0.10469746762033637, + "grad_norm": 4.1766204833984375, + "learning_rate": 1.7448453608247422e-06, + "logits/chosen": 11.384872436523438, + "logits/rejected": 9.269880294799805, + "logps/chosen": -235.53416442871094, + "logps/rejected": -199.64015197753906, + "loss": 0.6744, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.01969142258167267, + "rewards/margins": 0.040616437792778015, + "rewards/rejected": -0.060307860374450684, + "step": 677 + }, + { + "epoch": 0.10485211676010052, + "grad_norm": 7.616860866546631, + "learning_rate": 1.747422680412371e-06, + "logits/chosen": 12.98716926574707, + "logits/rejected": 6.189037799835205, + "logps/chosen": -346.7720031738281, + "logps/rejected": -234.07763671875, + "loss": 0.7155, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.031049348413944244, + "rewards/margins": -0.0368557944893837, + "rewards/rejected": 0.06790514290332794, + "step": 678 + }, + { + "epoch": 0.10500676589986468, + "grad_norm": 8.19212818145752, + "learning_rate": 1.75e-06, + "logits/chosen": 12.169953346252441, + "logits/rejected": 7.735172271728516, + "logps/chosen": -531.0531616210938, + "logps/rejected": -340.606689453125, + "loss": 0.6756, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.06909886002540588, + "rewards/margins": 0.045296572148799896, + "rewards/rejected": 0.02380228042602539, + "step": 679 + }, + { + "epoch": 0.10516141503962884, + "grad_norm": 4.897167205810547, + "learning_rate": 1.7525773195876288e-06, + "logits/chosen": 8.591468811035156, + "logits/rejected": 11.312469482421875, + "logps/chosen": -280.17218017578125, + "logps/rejected": -281.02801513671875, + "loss": 0.6602, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.04004840925335884, + "rewards/margins": 0.07665366679430008, + "rewards/rejected": -0.03660526126623154, + "step": 680 + }, + { + "epoch": 0.105316064179393, + "grad_norm": 5.469785213470459, + "learning_rate": 1.7551546391752577e-06, + "logits/chosen": 12.214111328125, + "logits/rejected": 6.799766540527344, + "logps/chosen": -305.38201904296875, + "logps/rejected": -276.7591857910156, + "loss": 0.6804, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.015477752313017845, + "rewards/margins": 0.033103086054325104, + "rewards/rejected": -0.0485808402299881, + "step": 681 + }, + { + "epoch": 0.10547071331915717, + "grad_norm": 5.973385334014893, + "learning_rate": 1.7577319587628866e-06, + "logits/chosen": 12.681440353393555, + "logits/rejected": 6.3310699462890625, + "logps/chosen": -371.85302734375, + "logps/rejected": -261.76910400390625, + "loss": 0.702, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.002875950187444687, + "rewards/margins": -0.013464685529470444, + "rewards/rejected": 0.01634063757956028, + "step": 682 + }, + { + "epoch": 0.10562536245892132, + "grad_norm": 4.383820056915283, + "learning_rate": 1.7603092783505157e-06, + "logits/chosen": 10.694823265075684, + "logits/rejected": 7.794074058532715, + "logps/chosen": -264.9433898925781, + "logps/rejected": -207.11183166503906, + "loss": 0.6724, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.006166078150272369, + "rewards/margins": 0.04402055963873863, + "rewards/rejected": -0.03785448148846626, + "step": 683 + }, + { + "epoch": 0.10578001159868548, + "grad_norm": 5.311039447784424, + "learning_rate": 1.7628865979381445e-06, + "logits/chosen": 7.801418304443359, + "logits/rejected": -0.11302995681762695, + "logps/chosen": -291.1820983886719, + "logps/rejected": -183.00271606445312, + "loss": 0.6769, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.009747695177793503, + "rewards/margins": 0.03845924139022827, + "rewards/rejected": -0.04820694774389267, + "step": 684 + }, + { + "epoch": 0.10593466073844965, + "grad_norm": 5.3003830909729, + "learning_rate": 1.7654639175257734e-06, + "logits/chosen": 10.9329252243042, + "logits/rejected": 4.558310508728027, + "logps/chosen": -395.85552978515625, + "logps/rejected": -277.6243591308594, + "loss": 0.7033, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.0415351428091526, + "rewards/margins": -0.015120504423975945, + "rewards/rejected": 0.0566556453704834, + "step": 685 + }, + { + "epoch": 0.1060893098782138, + "grad_norm": 8.903987884521484, + "learning_rate": 1.7680412371134023e-06, + "logits/chosen": 8.132436752319336, + "logits/rejected": 6.30673360824585, + "logps/chosen": -285.2313232421875, + "logps/rejected": -332.7186279296875, + "loss": 0.7038, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.013564299792051315, + "rewards/margins": -0.01806449331343174, + "rewards/rejected": 0.004500195384025574, + "step": 686 + }, + { + "epoch": 0.10624395901797797, + "grad_norm": 2.749605178833008, + "learning_rate": 1.7706185567010312e-06, + "logits/chosen": 10.753188133239746, + "logits/rejected": 11.1934814453125, + "logps/chosen": -112.49217987060547, + "logps/rejected": -118.42501831054688, + "loss": 0.6618, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.0041128164157271385, + "rewards/margins": 0.06557545065879822, + "rewards/rejected": -0.0614626407623291, + "step": 687 + }, + { + "epoch": 0.10639860815774212, + "grad_norm": 6.521875858306885, + "learning_rate": 1.77319587628866e-06, + "logits/chosen": 11.76104736328125, + "logits/rejected": 14.46867561340332, + "logps/chosen": -386.2987976074219, + "logps/rejected": -358.4405212402344, + "loss": 0.6871, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.0009296387434005737, + "rewards/margins": 0.020166778936982155, + "rewards/rejected": -0.019237138330936432, + "step": 688 + }, + { + "epoch": 0.10655325729750628, + "grad_norm": 4.162387371063232, + "learning_rate": 1.775773195876289e-06, + "logits/chosen": 13.494083404541016, + "logits/rejected": 4.4459075927734375, + "logps/chosen": -311.7069396972656, + "logps/rejected": -192.32937622070312, + "loss": 0.7064, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.0037535633891820908, + "rewards/margins": -0.01547413133084774, + "rewards/rejected": 0.01922769472002983, + "step": 689 + }, + { + "epoch": 0.10670790643727045, + "grad_norm": 3.5574254989624023, + "learning_rate": 1.7783505154639178e-06, + "logits/chosen": 9.295692443847656, + "logits/rejected": 13.15600299835205, + "logps/chosen": -152.50228881835938, + "logps/rejected": -183.5394287109375, + "loss": 0.6742, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.02263064496219158, + "rewards/margins": 0.03988990932703018, + "rewards/rejected": -0.06252054870128632, + "step": 690 + }, + { + "epoch": 0.1068625555770346, + "grad_norm": 3.871260643005371, + "learning_rate": 1.7809278350515466e-06, + "logits/chosen": 13.814468383789062, + "logits/rejected": 4.13873815536499, + "logps/chosen": -223.12741088867188, + "logps/rejected": -127.20864868164062, + "loss": 0.6302, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.08524029701948166, + "rewards/margins": 0.14162299036979675, + "rewards/rejected": -0.05638270452618599, + "step": 691 + }, + { + "epoch": 0.10701720471679876, + "grad_norm": 4.686367988586426, + "learning_rate": 1.7835051546391755e-06, + "logits/chosen": 9.703445434570312, + "logits/rejected": 6.450917720794678, + "logps/chosen": -242.64956665039062, + "logps/rejected": -234.71563720703125, + "loss": 0.6345, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.09268367290496826, + "rewards/margins": 0.1274859756231308, + "rewards/rejected": -0.03480229526758194, + "step": 692 + }, + { + "epoch": 0.10717185385656293, + "grad_norm": 7.197606563568115, + "learning_rate": 1.7860824742268044e-06, + "logits/chosen": 6.909630298614502, + "logits/rejected": 4.325741767883301, + "logps/chosen": -202.50653076171875, + "logps/rejected": -201.484130859375, + "loss": 0.7627, + "rewards/accuracies": 0.25, + "rewards/chosen": -0.02727665938436985, + "rewards/margins": -0.12527775764465332, + "rewards/rejected": 0.09800110757350922, + "step": 693 + }, + { + "epoch": 0.10732650299632708, + "grad_norm": 7.277336120605469, + "learning_rate": 1.7886597938144333e-06, + "logits/chosen": 9.90390396118164, + "logits/rejected": 6.36905574798584, + "logps/chosen": -280.7347412109375, + "logps/rejected": -233.89019775390625, + "loss": 0.6857, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.04039278253912926, + "rewards/margins": 0.016063379123806953, + "rewards/rejected": 0.024329401552677155, + "step": 694 + }, + { + "epoch": 0.10748115213609125, + "grad_norm": 5.291078567504883, + "learning_rate": 1.791237113402062e-06, + "logits/chosen": 9.95087718963623, + "logits/rejected": 11.674186706542969, + "logps/chosen": -219.01148986816406, + "logps/rejected": -252.99777221679688, + "loss": 0.6548, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.0033414606004953384, + "rewards/margins": 0.08014824986457825, + "rewards/rejected": -0.07680678367614746, + "step": 695 + }, + { + "epoch": 0.1076358012758554, + "grad_norm": 7.976032733917236, + "learning_rate": 1.7938144329896908e-06, + "logits/chosen": 3.961123466491699, + "logits/rejected": 9.905696868896484, + "logps/chosen": -193.1763458251953, + "logps/rejected": -311.06683349609375, + "loss": 0.7636, + "rewards/accuracies": 0.25, + "rewards/chosen": -0.07428845763206482, + "rewards/margins": -0.13129006326198578, + "rewards/rejected": 0.057001590728759766, + "step": 696 + }, + { + "epoch": 0.10779045041561956, + "grad_norm": 5.6274094581604, + "learning_rate": 1.7963917525773197e-06, + "logits/chosen": 13.91313362121582, + "logits/rejected": 13.364702224731445, + "logps/chosen": -237.36709594726562, + "logps/rejected": -210.49732971191406, + "loss": 0.7022, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.02926046960055828, + "rewards/margins": -0.01489575020968914, + "rewards/rejected": 0.04415621981024742, + "step": 697 + }, + { + "epoch": 0.10794509955538373, + "grad_norm": 3.6515707969665527, + "learning_rate": 1.7989690721649485e-06, + "logits/chosen": 6.404356956481934, + "logits/rejected": 9.36177921295166, + "logps/chosen": -97.04611206054688, + "logps/rejected": -156.2027587890625, + "loss": 0.6748, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.003941057715564966, + "rewards/margins": 0.040685079991817474, + "rewards/rejected": -0.0446261391043663, + "step": 698 + }, + { + "epoch": 0.10809974869514788, + "grad_norm": 4.04915714263916, + "learning_rate": 1.8015463917525774e-06, + "logits/chosen": 12.119953155517578, + "logits/rejected": 11.199554443359375, + "logps/chosen": -272.0041198730469, + "logps/rejected": -259.0806579589844, + "loss": 0.67, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.06366066634654999, + "rewards/margins": 0.0506453737616539, + "rewards/rejected": 0.013015293516218662, + "step": 699 + }, + { + "epoch": 0.10825439783491204, + "grad_norm": 4.469096660614014, + "learning_rate": 1.8041237113402063e-06, + "logits/chosen": 7.191540241241455, + "logits/rejected": 12.124711990356445, + "logps/chosen": -120.91340637207031, + "logps/rejected": -182.94583129882812, + "loss": 0.7016, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.01614956744015217, + "rewards/margins": -0.01321706548333168, + "rewards/rejected": 0.029366634786128998, + "step": 700 + }, + { + "epoch": 0.10840904697467621, + "grad_norm": 5.043667316436768, + "learning_rate": 1.8067010309278352e-06, + "logits/chosen": 5.647159576416016, + "logits/rejected": 5.3715715408325195, + "logps/chosen": -273.57574462890625, + "logps/rejected": -243.84884643554688, + "loss": 0.7001, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.00802641175687313, + "rewards/margins": -0.008949998766183853, + "rewards/rejected": 0.016976404935121536, + "step": 701 + }, + { + "epoch": 0.10856369611444036, + "grad_norm": 7.200451850891113, + "learning_rate": 1.809278350515464e-06, + "logits/chosen": 7.708015441894531, + "logits/rejected": 10.075786590576172, + "logps/chosen": -223.64666748046875, + "logps/rejected": -216.2239532470703, + "loss": 0.7707, + "rewards/accuracies": 0.25, + "rewards/chosen": -0.038020942360162735, + "rewards/margins": -0.13916154205799103, + "rewards/rejected": 0.101140595972538, + "step": 702 + }, + { + "epoch": 0.10871834525420453, + "grad_norm": 8.820755004882812, + "learning_rate": 1.811855670103093e-06, + "logits/chosen": 12.685633659362793, + "logits/rejected": 6.820279121398926, + "logps/chosen": -452.3470153808594, + "logps/rejected": -323.22589111328125, + "loss": 0.7096, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.03092489391565323, + "rewards/margins": -0.021825987845659256, + "rewards/rejected": -0.009098910726606846, + "step": 703 + }, + { + "epoch": 0.10887299439396868, + "grad_norm": 3.7269973754882812, + "learning_rate": 1.8144329896907218e-06, + "logits/chosen": 12.842632293701172, + "logits/rejected": 8.11235237121582, + "logps/chosen": -265.1856994628906, + "logps/rejected": -196.07656860351562, + "loss": 0.6798, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.010847426950931549, + "rewards/margins": 0.03411087766289711, + "rewards/rejected": -0.02326345443725586, + "step": 704 + }, + { + "epoch": 0.10902764353373284, + "grad_norm": 7.225241184234619, + "learning_rate": 1.8170103092783506e-06, + "logits/chosen": 11.678240776062012, + "logits/rejected": 2.29213547706604, + "logps/chosen": -378.49603271484375, + "logps/rejected": -187.14480590820312, + "loss": 0.6924, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.037880994379520416, + "rewards/margins": 0.0048796930350363255, + "rewards/rejected": 0.03300130367279053, + "step": 705 + }, + { + "epoch": 0.10918229267349701, + "grad_norm": 4.838374137878418, + "learning_rate": 1.8195876288659795e-06, + "logits/chosen": 12.484935760498047, + "logits/rejected": 12.068343162536621, + "logps/chosen": -351.9620056152344, + "logps/rejected": -334.5666809082031, + "loss": 0.6531, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.06834468990564346, + "rewards/margins": 0.09771782159805298, + "rewards/rejected": -0.02937312424182892, + "step": 706 + }, + { + "epoch": 0.10933694181326116, + "grad_norm": 5.253668308258057, + "learning_rate": 1.8221649484536084e-06, + "logits/chosen": 12.10118579864502, + "logits/rejected": 3.0983409881591797, + "logps/chosen": -213.44265747070312, + "logps/rejected": -169.13650512695312, + "loss": 0.6807, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.053801730275154114, + "rewards/margins": 0.028064537793397903, + "rewards/rejected": -0.08186626434326172, + "step": 707 + }, + { + "epoch": 0.10949159095302533, + "grad_norm": 6.283231258392334, + "learning_rate": 1.8247422680412373e-06, + "logits/chosen": 7.627786159515381, + "logits/rejected": 6.472894668579102, + "logps/chosen": -293.1684265136719, + "logps/rejected": -301.99786376953125, + "loss": 0.7124, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.004636000841856003, + "rewards/margins": -0.023963116109371185, + "rewards/rejected": 0.02859911322593689, + "step": 708 + }, + { + "epoch": 0.10964624009278949, + "grad_norm": 5.071584224700928, + "learning_rate": 1.827319587628866e-06, + "logits/chosen": 7.017948150634766, + "logits/rejected": 5.0013837814331055, + "logps/chosen": -201.0968475341797, + "logps/rejected": -217.12728881835938, + "loss": 0.7346, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.007269810885190964, + "rewards/margins": -0.07340269535779953, + "rewards/rejected": 0.0806725025177002, + "step": 709 + }, + { + "epoch": 0.10980088923255364, + "grad_norm": 6.132742881774902, + "learning_rate": 1.8298969072164948e-06, + "logits/chosen": 10.921630859375, + "logits/rejected": 5.869736671447754, + "logps/chosen": -330.6422119140625, + "logps/rejected": -225.01193237304688, + "loss": 0.7306, + "rewards/accuracies": 0.25, + "rewards/chosen": -0.03784565627574921, + "rewards/margins": -0.06417790055274963, + "rewards/rejected": 0.02633224055171013, + "step": 710 + }, + { + "epoch": 0.10995553837231781, + "grad_norm": 4.467646598815918, + "learning_rate": 1.8324742268041237e-06, + "logits/chosen": 14.898259162902832, + "logits/rejected": 10.122255325317383, + "logps/chosen": -230.3477325439453, + "logps/rejected": -226.500244140625, + "loss": 0.6848, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.04691576585173607, + "rewards/margins": 0.023256495594978333, + "rewards/rejected": -0.0701722651720047, + "step": 711 + }, + { + "epoch": 0.11011018751208196, + "grad_norm": 5.877440452575684, + "learning_rate": 1.8350515463917525e-06, + "logits/chosen": 7.848925590515137, + "logits/rejected": 9.357367515563965, + "logps/chosen": -344.7926330566406, + "logps/rejected": -372.2003173828125, + "loss": 0.6958, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.057058047503232956, + "rewards/margins": 0.013203656300902367, + "rewards/rejected": 0.04385438188910484, + "step": 712 + }, + { + "epoch": 0.11026483665184612, + "grad_norm": 3.61445951461792, + "learning_rate": 1.8376288659793818e-06, + "logits/chosen": 10.201498985290527, + "logits/rejected": 10.242499351501465, + "logps/chosen": -230.43814086914062, + "logps/rejected": -193.33441162109375, + "loss": 0.6518, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.0902554988861084, + "rewards/margins": 0.09338449686765671, + "rewards/rejected": -0.0031290054321289062, + "step": 713 + }, + { + "epoch": 0.11041948579161029, + "grad_norm": 7.836570739746094, + "learning_rate": 1.8402061855670105e-06, + "logits/chosen": 10.343636512756348, + "logits/rejected": 12.921567916870117, + "logps/chosen": -268.2269287109375, + "logps/rejected": -278.09881591796875, + "loss": 0.7247, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.03335551917552948, + "rewards/margins": -0.04644375666975975, + "rewards/rejected": 0.013088226318359375, + "step": 714 + }, + { + "epoch": 0.11057413493137444, + "grad_norm": 5.876894950866699, + "learning_rate": 1.8427835051546394e-06, + "logits/chosen": 7.584066867828369, + "logits/rejected": 12.958361625671387, + "logps/chosen": -184.07427978515625, + "logps/rejected": -342.74554443359375, + "loss": 0.6851, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.01125945895910263, + "rewards/margins": 0.022399526089429855, + "rewards/rejected": -0.03365898132324219, + "step": 715 + }, + { + "epoch": 0.1107287840711386, + "grad_norm": 4.255242347717285, + "learning_rate": 1.8453608247422682e-06, + "logits/chosen": 10.220253944396973, + "logits/rejected": 10.728243827819824, + "logps/chosen": -196.3232421875, + "logps/rejected": -233.96810913085938, + "loss": 0.6754, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.028787709772586823, + "rewards/margins": 0.03922424837946892, + "rewards/rejected": -0.06801195442676544, + "step": 716 + }, + { + "epoch": 0.11088343321090277, + "grad_norm": 5.621160984039307, + "learning_rate": 1.847938144329897e-06, + "logits/chosen": 11.877320289611816, + "logits/rejected": 10.266227722167969, + "logps/chosen": -220.4231719970703, + "logps/rejected": -188.4232177734375, + "loss": 0.7308, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.07582726329565048, + "rewards/margins": -0.06901021301746368, + "rewards/rejected": -0.00681705679744482, + "step": 717 + }, + { + "epoch": 0.11103808235066692, + "grad_norm": 4.846379280090332, + "learning_rate": 1.850515463917526e-06, + "logits/chosen": 11.470077514648438, + "logits/rejected": 4.535397529602051, + "logps/chosen": -249.92218017578125, + "logps/rejected": -232.28977966308594, + "loss": 0.6641, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.012532854452729225, + "rewards/margins": 0.06642108410596848, + "rewards/rejected": -0.0538882240653038, + "step": 718 + }, + { + "epoch": 0.11119273149043109, + "grad_norm": 6.110698699951172, + "learning_rate": 1.8530927835051548e-06, + "logits/chosen": 15.015692710876465, + "logits/rejected": 9.169832229614258, + "logps/chosen": -241.60145568847656, + "logps/rejected": -238.036376953125, + "loss": 0.7212, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.04998607933521271, + "rewards/margins": -0.03686261177062988, + "rewards/rejected": -0.0131234647706151, + "step": 719 + }, + { + "epoch": 0.11134738063019524, + "grad_norm": 4.128805637359619, + "learning_rate": 1.8556701030927837e-06, + "logits/chosen": 5.779862880706787, + "logits/rejected": 3.782573938369751, + "logps/chosen": -206.86532592773438, + "logps/rejected": -224.34893798828125, + "loss": 0.6883, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.08036962151527405, + "rewards/margins": 0.012198328971862793, + "rewards/rejected": 0.06817128509283066, + "step": 720 + }, + { + "epoch": 0.1115020297699594, + "grad_norm": 5.853085041046143, + "learning_rate": 1.8582474226804126e-06, + "logits/chosen": 7.352654457092285, + "logits/rejected": 7.837530136108398, + "logps/chosen": -309.820556640625, + "logps/rejected": -359.3823547363281, + "loss": 0.69, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.028368379920721054, + "rewards/margins": 0.011280491948127747, + "rewards/rejected": 0.017087887972593307, + "step": 721 + }, + { + "epoch": 0.11165667890972357, + "grad_norm": 5.479710578918457, + "learning_rate": 1.8608247422680415e-06, + "logits/chosen": 4.995455741882324, + "logits/rejected": 5.5761942863464355, + "logps/chosen": -201.00523376464844, + "logps/rejected": -191.72509765625, + "loss": 0.7481, + "rewards/accuracies": 0.25, + "rewards/chosen": -0.040320709347724915, + "rewards/margins": -0.09750836342573166, + "rewards/rejected": 0.057187654078006744, + "step": 722 + }, + { + "epoch": 0.11181132804948772, + "grad_norm": 5.180700302124023, + "learning_rate": 1.8634020618556703e-06, + "logits/chosen": 12.719510078430176, + "logits/rejected": 9.608434677124023, + "logps/chosen": -365.6546630859375, + "logps/rejected": -337.07769775390625, + "loss": 0.6768, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.014326428063213825, + "rewards/margins": 0.044106535613536835, + "rewards/rejected": -0.029780101031064987, + "step": 723 + }, + { + "epoch": 0.11196597718925189, + "grad_norm": 4.572819232940674, + "learning_rate": 1.8659793814432992e-06, + "logits/chosen": 11.909309387207031, + "logits/rejected": 5.242926120758057, + "logps/chosen": -318.7596435546875, + "logps/rejected": -204.9488983154297, + "loss": 0.6155, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.10008469223976135, + "rewards/margins": 0.16630098223686218, + "rewards/rejected": -0.06621628254652023, + "step": 724 + }, + { + "epoch": 0.11212062632901605, + "grad_norm": 5.882225513458252, + "learning_rate": 1.868556701030928e-06, + "logits/chosen": 10.94326114654541, + "logits/rejected": 7.2069549560546875, + "logps/chosen": -338.3840637207031, + "logps/rejected": -263.2120361328125, + "loss": 0.6733, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.02367725595831871, + "rewards/margins": 0.04506239667534828, + "rewards/rejected": -0.06873965263366699, + "step": 725 + }, + { + "epoch": 0.1122752754687802, + "grad_norm": 6.208929061889648, + "learning_rate": 1.871134020618557e-06, + "logits/chosen": 7.1215925216674805, + "logits/rejected": 2.3828399181365967, + "logps/chosen": -271.33428955078125, + "logps/rejected": -259.35662841796875, + "loss": 0.7052, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.005681419745087624, + "rewards/margins": -0.008107852190732956, + "rewards/rejected": 0.013789273798465729, + "step": 726 + }, + { + "epoch": 0.11242992460854437, + "grad_norm": 6.331758975982666, + "learning_rate": 1.8737113402061858e-06, + "logits/chosen": 10.625401496887207, + "logits/rejected": 7.8632588386535645, + "logps/chosen": -236.84521484375, + "logps/rejected": -169.7083282470703, + "loss": 0.7339, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.0017783879302442074, + "rewards/margins": -0.07529734075069427, + "rewards/rejected": 0.07351894676685333, + "step": 727 + }, + { + "epoch": 0.11258457374830852, + "grad_norm": 5.382650375366211, + "learning_rate": 1.8762886597938145e-06, + "logits/chosen": 5.8831658363342285, + "logits/rejected": 5.561832427978516, + "logps/chosen": -288.421875, + "logps/rejected": -250.149658203125, + "loss": 0.6735, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.09198927879333496, + "rewards/margins": 0.051101118326187134, + "rewards/rejected": 0.040888167917728424, + "step": 728 + }, + { + "epoch": 0.11273922288807268, + "grad_norm": 5.3047614097595215, + "learning_rate": 1.8788659793814434e-06, + "logits/chosen": 9.696928024291992, + "logits/rejected": 13.497712135314941, + "logps/chosen": -191.14898681640625, + "logps/rejected": -281.43035888671875, + "loss": 0.713, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.04756645858287811, + "rewards/margins": -0.031801559031009674, + "rewards/rejected": 0.07936801016330719, + "step": 729 + }, + { + "epoch": 0.11289387202783685, + "grad_norm": 4.395238876342773, + "learning_rate": 1.8814432989690722e-06, + "logits/chosen": 6.414369583129883, + "logits/rejected": 1.8793587684631348, + "logps/chosen": -217.67242431640625, + "logps/rejected": -179.20553588867188, + "loss": 0.7062, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.0620153471827507, + "rewards/margins": -0.024518823251128197, + "rewards/rejected": -0.03749651834368706, + "step": 730 + }, + { + "epoch": 0.113048521167601, + "grad_norm": 5.196818828582764, + "learning_rate": 1.884020618556701e-06, + "logits/chosen": 3.1169824600219727, + "logits/rejected": 4.476332187652588, + "logps/chosen": -310.5924377441406, + "logps/rejected": -323.5171203613281, + "loss": 0.606, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.10536368191242218, + "rewards/margins": 0.19147217273712158, + "rewards/rejected": -0.0861084908246994, + "step": 731 + }, + { + "epoch": 0.11320317030736517, + "grad_norm": 5.124047756195068, + "learning_rate": 1.88659793814433e-06, + "logits/chosen": 13.44982624053955, + "logits/rejected": 6.7886738777160645, + "logps/chosen": -374.034423828125, + "logps/rejected": -250.93060302734375, + "loss": 0.6665, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.04483547434210777, + "rewards/margins": 0.06021089479327202, + "rewards/rejected": -0.015375422313809395, + "step": 732 + }, + { + "epoch": 0.11335781944712933, + "grad_norm": 4.819911479949951, + "learning_rate": 1.8891752577319588e-06, + "logits/chosen": 12.030155181884766, + "logits/rejected": 2.9154651165008545, + "logps/chosen": -333.56658935546875, + "logps/rejected": -184.98818969726562, + "loss": 0.6634, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.04062338173389435, + "rewards/margins": 0.06568670272827148, + "rewards/rejected": -0.025063324719667435, + "step": 733 + }, + { + "epoch": 0.11351246858689348, + "grad_norm": 4.5744147300720215, + "learning_rate": 1.8917525773195877e-06, + "logits/chosen": 8.956939697265625, + "logits/rejected": 14.21845817565918, + "logps/chosen": -185.28526306152344, + "logps/rejected": -251.44049072265625, + "loss": 0.6308, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.09377098083496094, + "rewards/margins": 0.13898378610610962, + "rewards/rejected": -0.04521279036998749, + "step": 734 + }, + { + "epoch": 0.11366711772665765, + "grad_norm": 11.274410247802734, + "learning_rate": 1.8943298969072166e-06, + "logits/chosen": 8.833852767944336, + "logits/rejected": 11.25501537322998, + "logps/chosen": -244.5928192138672, + "logps/rejected": -269.6267395019531, + "loss": 0.7713, + "rewards/accuracies": 0.125, + "rewards/chosen": -0.03720083460211754, + "rewards/margins": -0.14621639251708984, + "rewards/rejected": 0.1090155616402626, + "step": 735 + }, + { + "epoch": 0.1138217668664218, + "grad_norm": 5.071261882781982, + "learning_rate": 1.8969072164948455e-06, + "logits/chosen": 11.275022506713867, + "logits/rejected": 11.953080177307129, + "logps/chosen": -285.617919921875, + "logps/rejected": -308.9759216308594, + "loss": 0.6671, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.03621811047196388, + "rewards/margins": 0.06337056308984756, + "rewards/rejected": -0.027152445167303085, + "step": 736 + }, + { + "epoch": 0.11397641600618597, + "grad_norm": 7.943196773529053, + "learning_rate": 1.8994845360824743e-06, + "logits/chosen": 10.303644180297852, + "logits/rejected": 11.87435531616211, + "logps/chosen": -214.60906982421875, + "logps/rejected": -205.105712890625, + "loss": 0.702, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.043195489794015884, + "rewards/margins": -0.006416483782231808, + "rewards/rejected": 0.049611978232860565, + "step": 737 + }, + { + "epoch": 0.11413106514595013, + "grad_norm": 8.889657020568848, + "learning_rate": 1.9020618556701032e-06, + "logits/chosen": 13.355319023132324, + "logits/rejected": 11.208335876464844, + "logps/chosen": -397.4225769042969, + "logps/rejected": -385.44598388671875, + "loss": 0.6435, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.07396354526281357, + "rewards/margins": 0.11955565214157104, + "rewards/rejected": -0.045592114329338074, + "step": 738 + }, + { + "epoch": 0.11428571428571428, + "grad_norm": 4.7317633628845215, + "learning_rate": 1.904639175257732e-06, + "logits/chosen": 6.910393714904785, + "logits/rejected": 8.529176712036133, + "logps/chosen": -223.09652709960938, + "logps/rejected": -248.20338439941406, + "loss": 0.714, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.030397707596421242, + "rewards/margins": -0.03460369259119034, + "rewards/rejected": 0.06500139832496643, + "step": 739 + }, + { + "epoch": 0.11444036342547845, + "grad_norm": 4.626914024353027, + "learning_rate": 1.907216494845361e-06, + "logits/chosen": 3.751765012741089, + "logits/rejected": 5.63156270980835, + "logps/chosen": -172.82577514648438, + "logps/rejected": -230.5620574951172, + "loss": 0.7216, + "rewards/accuracies": 0.25, + "rewards/chosen": -0.04254341125488281, + "rewards/margins": -0.0542665459215641, + "rewards/rejected": 0.011723138391971588, + "step": 740 + }, + { + "epoch": 0.11459501256524261, + "grad_norm": 5.269464492797852, + "learning_rate": 1.90979381443299e-06, + "logits/chosen": 6.896068572998047, + "logits/rejected": 4.070405960083008, + "logps/chosen": -253.23797607421875, + "logps/rejected": -240.21958923339844, + "loss": 0.6954, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.04485926777124405, + "rewards/margins": 9.636580944061279e-05, + "rewards/rejected": 0.04476289823651314, + "step": 741 + }, + { + "epoch": 0.11474966170500676, + "grad_norm": 5.3275370597839355, + "learning_rate": 1.9123711340206187e-06, + "logits/chosen": 13.170534133911133, + "logits/rejected": 9.266887664794922, + "logps/chosen": -320.3880615234375, + "logps/rejected": -287.1246643066406, + "loss": 0.6922, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.022713851183652878, + "rewards/margins": 0.01951933279633522, + "rewards/rejected": -0.0422331802546978, + "step": 742 + }, + { + "epoch": 0.11490431084477093, + "grad_norm": 3.633965492248535, + "learning_rate": 1.9149484536082476e-06, + "logits/chosen": 12.454980850219727, + "logits/rejected": 0.3428466320037842, + "logps/chosen": -177.30487060546875, + "logps/rejected": -80.21995544433594, + "loss": 0.6473, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.07775764167308807, + "rewards/margins": 0.09602270275354385, + "rewards/rejected": -0.01826505735516548, + "step": 743 + }, + { + "epoch": 0.11505895998453508, + "grad_norm": 7.145014762878418, + "learning_rate": 1.9175257731958764e-06, + "logits/chosen": 11.250179290771484, + "logits/rejected": 14.61050796508789, + "logps/chosen": -230.84727478027344, + "logps/rejected": -364.293701171875, + "loss": 0.6433, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.010029507800936699, + "rewards/margins": 0.10356760025024414, + "rewards/rejected": -0.11359710991382599, + "step": 744 + }, + { + "epoch": 0.11521360912429925, + "grad_norm": 5.105704307556152, + "learning_rate": 1.9201030927835053e-06, + "logits/chosen": 10.922404289245605, + "logits/rejected": 13.398233413696289, + "logps/chosen": -254.3232421875, + "logps/rejected": -341.56536865234375, + "loss": 0.7304, + "rewards/accuracies": 0.25, + "rewards/chosen": -0.08438950031995773, + "rewards/margins": -0.06996269524097443, + "rewards/rejected": -0.014426801353693008, + "step": 745 + }, + { + "epoch": 0.11536825826406341, + "grad_norm": 4.958789348602295, + "learning_rate": 1.922680412371134e-06, + "logits/chosen": 14.878204345703125, + "logits/rejected": 6.612778186798096, + "logps/chosen": -401.16156005859375, + "logps/rejected": -341.08294677734375, + "loss": 0.6621, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.047301530838012695, + "rewards/margins": 0.08037014305591583, + "rewards/rejected": -0.03306861221790314, + "step": 746 + }, + { + "epoch": 0.11552290740382756, + "grad_norm": 5.68366003036499, + "learning_rate": 1.925257731958763e-06, + "logits/chosen": 8.169615745544434, + "logits/rejected": 12.711262702941895, + "logps/chosen": -352.0115051269531, + "logps/rejected": -413.6510925292969, + "loss": 0.6664, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.0029344093054533005, + "rewards/margins": 0.07123227417469025, + "rewards/rejected": -0.0682978630065918, + "step": 747 + }, + { + "epoch": 0.11567755654359173, + "grad_norm": 5.375355243682861, + "learning_rate": 1.927835051546392e-06, + "logits/chosen": 13.744950294494629, + "logits/rejected": 14.151660919189453, + "logps/chosen": -311.1435546875, + "logps/rejected": -313.9859619140625, + "loss": 0.7034, + "rewards/accuracies": 0.25, + "rewards/chosen": 0.03731326758861542, + "rewards/margins": -0.014317415654659271, + "rewards/rejected": 0.05163068696856499, + "step": 748 + }, + { + "epoch": 0.11583220568335588, + "grad_norm": 5.679323673248291, + "learning_rate": 1.930412371134021e-06, + "logits/chosen": 10.623978614807129, + "logits/rejected": 10.223285675048828, + "logps/chosen": -310.0151062011719, + "logps/rejected": -344.6908264160156, + "loss": 0.6705, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.06432309746742249, + "rewards/margins": 0.055307816714048386, + "rewards/rejected": 0.009015275165438652, + "step": 749 + }, + { + "epoch": 0.11598685482312004, + "grad_norm": 5.993717193603516, + "learning_rate": 1.9329896907216497e-06, + "logits/chosen": 12.796170234680176, + "logits/rejected": 8.972481727600098, + "logps/chosen": -377.73797607421875, + "logps/rejected": -269.9862365722656, + "loss": 0.6818, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.04607239365577698, + "rewards/margins": 0.02355222962796688, + "rewards/rejected": 0.022520162165164948, + "step": 750 + }, + { + "epoch": 0.11614150396288421, + "grad_norm": 6.605037212371826, + "learning_rate": 1.9355670103092785e-06, + "logits/chosen": 5.111069679260254, + "logits/rejected": 3.73654842376709, + "logps/chosen": -196.4358367919922, + "logps/rejected": -231.71827697753906, + "loss": 0.647, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.05673956498503685, + "rewards/margins": 0.09972162544727325, + "rewards/rejected": -0.042982056736946106, + "step": 751 + }, + { + "epoch": 0.11629615310264836, + "grad_norm": 6.3636860847473145, + "learning_rate": 1.9381443298969074e-06, + "logits/chosen": 9.221412658691406, + "logits/rejected": 5.37880277633667, + "logps/chosen": -341.12127685546875, + "logps/rejected": -292.807373046875, + "loss": 0.7052, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.03568115085363388, + "rewards/margins": -0.017798233777284622, + "rewards/rejected": -0.017882922664284706, + "step": 752 + }, + { + "epoch": 0.11645080224241253, + "grad_norm": 6.042775630950928, + "learning_rate": 1.9407216494845363e-06, + "logits/chosen": 10.474544525146484, + "logits/rejected": 12.03170394897461, + "logps/chosen": -216.24810791015625, + "logps/rejected": -325.63226318359375, + "loss": 0.6611, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.03577017784118652, + "rewards/margins": 0.0670805424451828, + "rewards/rejected": -0.10285072773694992, + "step": 753 + }, + { + "epoch": 0.11660545138217669, + "grad_norm": 3.8389718532562256, + "learning_rate": 1.943298969072165e-06, + "logits/chosen": 8.815988540649414, + "logits/rejected": 2.8289542198181152, + "logps/chosen": -184.33404541015625, + "logps/rejected": -172.734375, + "loss": 0.6911, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.003516819328069687, + "rewards/margins": 0.005642890930175781, + "rewards/rejected": -0.009159708395600319, + "step": 754 + }, + { + "epoch": 0.11676010052194084, + "grad_norm": 4.905189514160156, + "learning_rate": 1.945876288659794e-06, + "logits/chosen": 7.152502059936523, + "logits/rejected": 7.8270649909973145, + "logps/chosen": -264.0049133300781, + "logps/rejected": -260.3672790527344, + "loss": 0.6685, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.056339167058467865, + "rewards/margins": 0.05267782509326935, + "rewards/rejected": 0.0036613456904888153, + "step": 755 + }, + { + "epoch": 0.11691474966170501, + "grad_norm": 3.4967939853668213, + "learning_rate": 1.948453608247423e-06, + "logits/chosen": 7.135779857635498, + "logits/rejected": 5.624508857727051, + "logps/chosen": -190.70980834960938, + "logps/rejected": -167.38829040527344, + "loss": 0.6475, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.05304960906505585, + "rewards/margins": 0.10574927181005478, + "rewards/rejected": -0.05269966274499893, + "step": 756 + }, + { + "epoch": 0.11706939880146916, + "grad_norm": 5.922613620758057, + "learning_rate": 1.9510309278350518e-06, + "logits/chosen": 9.0873384475708, + "logits/rejected": 7.914632320404053, + "logps/chosen": -404.3426208496094, + "logps/rejected": -358.2450866699219, + "loss": 0.6516, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.12389259040355682, + "rewards/margins": 0.09641342610120773, + "rewards/rejected": 0.027479171752929688, + "step": 757 + }, + { + "epoch": 0.11722404794123333, + "grad_norm": 50.92366409301758, + "learning_rate": 1.9536082474226806e-06, + "logits/chosen": 7.9488325119018555, + "logits/rejected": 13.052627563476562, + "logps/chosen": -163.4320068359375, + "logps/rejected": -236.76510620117188, + "loss": 0.6034, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.05622458457946777, + "rewards/margins": 0.20247511565685272, + "rewards/rejected": -0.14625054597854614, + "step": 758 + }, + { + "epoch": 0.11737869708099749, + "grad_norm": 4.3590240478515625, + "learning_rate": 1.9561855670103095e-06, + "logits/chosen": 8.652853012084961, + "logits/rejected": 3.7794785499572754, + "logps/chosen": -310.9944763183594, + "logps/rejected": -201.96273803710938, + "loss": 0.6033, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.1254536658525467, + "rewards/margins": 0.19502457976341248, + "rewards/rejected": -0.06957092136144638, + "step": 759 + }, + { + "epoch": 0.11753334622076164, + "grad_norm": 4.805168628692627, + "learning_rate": 1.9587628865979384e-06, + "logits/chosen": 11.73290729522705, + "logits/rejected": 10.892315864562988, + "logps/chosen": -280.9211730957031, + "logps/rejected": -273.315673828125, + "loss": 0.6826, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.07339353859424591, + "rewards/margins": 0.022656060755252838, + "rewards/rejected": 0.050737474113702774, + "step": 760 + }, + { + "epoch": 0.11768799536052581, + "grad_norm": 4.360302925109863, + "learning_rate": 1.9613402061855673e-06, + "logits/chosen": 8.350854873657227, + "logits/rejected": 10.155590057373047, + "logps/chosen": -282.90374755859375, + "logps/rejected": -316.4969482421875, + "loss": 0.6799, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.01883707195520401, + "rewards/margins": 0.0349605567753315, + "rewards/rejected": -0.016123484820127487, + "step": 761 + }, + { + "epoch": 0.11784264450028997, + "grad_norm": 5.7184624671936035, + "learning_rate": 1.963917525773196e-06, + "logits/chosen": 10.24630069732666, + "logits/rejected": 14.16028881072998, + "logps/chosen": -329.63446044921875, + "logps/rejected": -369.6271057128906, + "loss": 0.6652, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.04944153130054474, + "rewards/margins": 0.09121362119913101, + "rewards/rejected": -0.04177207872271538, + "step": 762 + }, + { + "epoch": 0.11799729364005412, + "grad_norm": 6.68346643447876, + "learning_rate": 1.966494845360825e-06, + "logits/chosen": 11.568010330200195, + "logits/rejected": 4.642280578613281, + "logps/chosen": -424.32977294921875, + "logps/rejected": -360.83892822265625, + "loss": 0.6274, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.17120762169361115, + "rewards/margins": 0.1653505265712738, + "rewards/rejected": 0.00585708674043417, + "step": 763 + }, + { + "epoch": 0.11815194277981829, + "grad_norm": 5.750217914581299, + "learning_rate": 1.969072164948454e-06, + "logits/chosen": 16.134275436401367, + "logits/rejected": 11.000141143798828, + "logps/chosen": -325.0711669921875, + "logps/rejected": -225.18186950683594, + "loss": 0.6873, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.06829166412353516, + "rewards/margins": 0.018773367628455162, + "rewards/rejected": 0.04951830208301544, + "step": 764 + }, + { + "epoch": 0.11830659191958244, + "grad_norm": 25.471750259399414, + "learning_rate": 1.9716494845360827e-06, + "logits/chosen": 5.202812194824219, + "logits/rejected": 9.177970886230469, + "logps/chosen": -228.17567443847656, + "logps/rejected": -225.7200164794922, + "loss": 0.7583, + "rewards/accuracies": 0.25, + "rewards/chosen": 0.009062983095645905, + "rewards/margins": -0.11821872740983963, + "rewards/rejected": 0.12728172540664673, + "step": 765 + }, + { + "epoch": 0.1184612410593466, + "grad_norm": 6.5312676429748535, + "learning_rate": 1.9742268041237116e-06, + "logits/chosen": 9.137779235839844, + "logits/rejected": 9.747842788696289, + "logps/chosen": -275.69580078125, + "logps/rejected": -245.17681884765625, + "loss": 0.7522, + "rewards/accuracies": 0.25, + "rewards/chosen": 0.023091748356819153, + "rewards/margins": -0.08754248917102814, + "rewards/rejected": 0.1106342300772667, + "step": 766 + }, + { + "epoch": 0.11861589019911077, + "grad_norm": 4.121193885803223, + "learning_rate": 1.9768041237113405e-06, + "logits/chosen": 11.731197357177734, + "logits/rejected": 5.214690208435059, + "logps/chosen": -307.0433654785156, + "logps/rejected": -128.8443145751953, + "loss": 0.689, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.007788658142089844, + "rewards/margins": 0.011392402462661266, + "rewards/rejected": -0.019181059673428535, + "step": 767 + }, + { + "epoch": 0.11877053933887492, + "grad_norm": 5.948873996734619, + "learning_rate": 1.979381443298969e-06, + "logits/chosen": -1.1010417938232422, + "logits/rejected": 3.06386137008667, + "logps/chosen": -277.4606628417969, + "logps/rejected": -318.19659423828125, + "loss": 0.6732, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.09069681167602539, + "rewards/margins": 0.05126165971159935, + "rewards/rejected": 0.03943514823913574, + "step": 768 + }, + { + "epoch": 0.11892518847863909, + "grad_norm": 4.787690162658691, + "learning_rate": 1.981958762886598e-06, + "logits/chosen": 11.404735565185547, + "logits/rejected": 8.035576820373535, + "logps/chosen": -182.26968383789062, + "logps/rejected": -146.99777221679688, + "loss": 0.7037, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.037906453013420105, + "rewards/margins": -0.004474181216210127, + "rewards/rejected": -0.03343227133154869, + "step": 769 + }, + { + "epoch": 0.11907983761840325, + "grad_norm": 5.287943363189697, + "learning_rate": 1.9845360824742267e-06, + "logits/chosen": 14.088287353515625, + "logits/rejected": 11.371744155883789, + "logps/chosen": -342.7298889160156, + "logps/rejected": -270.8135070800781, + "loss": 0.6924, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.09487209469079971, + "rewards/margins": 0.00558633916079998, + "rewards/rejected": 0.08928576111793518, + "step": 770 + }, + { + "epoch": 0.1192344867581674, + "grad_norm": 4.892053127288818, + "learning_rate": 1.9871134020618556e-06, + "logits/chosen": 5.921697616577148, + "logits/rejected": 2.2698230743408203, + "logps/chosen": -271.9732666015625, + "logps/rejected": -201.50393676757812, + "loss": 0.7266, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.0035150516778230667, + "rewards/margins": -0.049635402858257294, + "rewards/rejected": 0.05315046012401581, + "step": 771 + }, + { + "epoch": 0.11938913589793157, + "grad_norm": 4.96153450012207, + "learning_rate": 1.9896907216494844e-06, + "logits/chosen": 6.176779747009277, + "logits/rejected": 2.5378684997558594, + "logps/chosen": -200.56024169921875, + "logps/rejected": -211.8494110107422, + "loss": 0.659, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.04171309247612953, + "rewards/margins": 0.0731162503361702, + "rewards/rejected": -0.03140316158533096, + "step": 772 + }, + { + "epoch": 0.11954378503769572, + "grad_norm": 6.2721381187438965, + "learning_rate": 1.9922680412371137e-06, + "logits/chosen": 7.779531478881836, + "logits/rejected": 5.811192989349365, + "logps/chosen": -355.29254150390625, + "logps/rejected": -382.0240173339844, + "loss": 0.7882, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.011780645698308945, + "rewards/margins": -0.17687024176120758, + "rewards/rejected": 0.18865087628364563, + "step": 773 + }, + { + "epoch": 0.11969843417745989, + "grad_norm": 4.7964653968811035, + "learning_rate": 1.9948453608247426e-06, + "logits/chosen": 12.759563446044922, + "logits/rejected": 8.546693801879883, + "logps/chosen": -227.78878784179688, + "logps/rejected": -184.16769409179688, + "loss": 0.636, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.1331547200679779, + "rewards/margins": 0.1289692223072052, + "rewards/rejected": 0.004185512661933899, + "step": 774 + }, + { + "epoch": 0.11985308331722405, + "grad_norm": 5.230913162231445, + "learning_rate": 1.9974226804123715e-06, + "logits/chosen": 8.235557556152344, + "logits/rejected": 11.511534690856934, + "logps/chosen": -229.26487731933594, + "logps/rejected": -298.178466796875, + "loss": 0.676, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.07019595801830292, + "rewards/margins": 0.04300350695848465, + "rewards/rejected": 0.02719244733452797, + "step": 775 + }, + { + "epoch": 0.1200077324569882, + "grad_norm": 4.418065547943115, + "learning_rate": 2.0000000000000003e-06, + "logits/chosen": 10.97363567352295, + "logits/rejected": 11.506315231323242, + "logps/chosen": -261.0842590332031, + "logps/rejected": -276.09649658203125, + "loss": 0.6842, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.012931514531373978, + "rewards/margins": 0.030180953443050385, + "rewards/rejected": -0.043112464249134064, + "step": 776 + }, + { + "epoch": 0.12016238159675237, + "grad_norm": 4.583871364593506, + "learning_rate": 2.002577319587629e-06, + "logits/chosen": 5.414307594299316, + "logits/rejected": 3.444096565246582, + "logps/chosen": -315.27557373046875, + "logps/rejected": -214.92742919921875, + "loss": 0.6907, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.07584243267774582, + "rewards/margins": 0.010672946460545063, + "rewards/rejected": 0.06516948342323303, + "step": 777 + }, + { + "epoch": 0.12031703073651653, + "grad_norm": 4.775980472564697, + "learning_rate": 2.005154639175258e-06, + "logits/chosen": 6.203409671783447, + "logits/rejected": 5.393270015716553, + "logps/chosen": -152.17747497558594, + "logps/rejected": -174.61561584472656, + "loss": 0.7065, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.10393404960632324, + "rewards/margins": -0.022334814071655273, + "rewards/rejected": -0.08159923553466797, + "step": 778 + }, + { + "epoch": 0.12047167987628069, + "grad_norm": 4.170057773590088, + "learning_rate": 2.007731958762887e-06, + "logits/chosen": 8.063127517700195, + "logits/rejected": 7.265625953674316, + "logps/chosen": -179.56381225585938, + "logps/rejected": -182.98263549804688, + "loss": 0.6838, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.10676445066928864, + "rewards/margins": 0.028914690017700195, + "rewards/rejected": 0.07784977555274963, + "step": 779 + }, + { + "epoch": 0.12062632901604485, + "grad_norm": 4.094460964202881, + "learning_rate": 2.010309278350516e-06, + "logits/chosen": 13.347660064697266, + "logits/rejected": 9.314810752868652, + "logps/chosen": -252.34799194335938, + "logps/rejected": -241.8000946044922, + "loss": 0.6382, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.08473721146583557, + "rewards/margins": 0.1212126761674881, + "rewards/rejected": -0.036475468426942825, + "step": 780 + }, + { + "epoch": 0.120780978155809, + "grad_norm": 3.916102647781372, + "learning_rate": 2.0128865979381447e-06, + "logits/chosen": 2.2494542598724365, + "logits/rejected": 9.365983963012695, + "logps/chosen": -129.24871826171875, + "logps/rejected": -147.7412109375, + "loss": 0.6704, + "rewards/accuracies": 0.625, + "rewards/chosen": -8.373335003852844e-05, + "rewards/margins": 0.04763312637805939, + "rewards/rejected": -0.04771685600280762, + "step": 781 + }, + { + "epoch": 0.12093562729557317, + "grad_norm": 4.596820831298828, + "learning_rate": 2.0154639175257736e-06, + "logits/chosen": 14.319719314575195, + "logits/rejected": 13.616143226623535, + "logps/chosen": -244.30213928222656, + "logps/rejected": -255.2755584716797, + "loss": 0.6644, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.05731811374425888, + "rewards/margins": 0.060666944831609726, + "rewards/rejected": -0.003348827362060547, + "step": 782 + }, + { + "epoch": 0.12109027643533733, + "grad_norm": 7.918374061584473, + "learning_rate": 2.0180412371134024e-06, + "logits/chosen": 14.829571723937988, + "logits/rejected": 12.51628589630127, + "logps/chosen": -223.22036743164062, + "logps/rejected": -236.76748657226562, + "loss": 0.7175, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.007377816364169121, + "rewards/margins": -0.03495025634765625, + "rewards/rejected": 0.04232807084918022, + "step": 783 + }, + { + "epoch": 0.12124492557510148, + "grad_norm": 4.7898030281066895, + "learning_rate": 2.0206185567010313e-06, + "logits/chosen": 9.497858047485352, + "logits/rejected": 12.780324935913086, + "logps/chosen": -190.3990936279297, + "logps/rejected": -213.04229736328125, + "loss": 0.6751, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.00687398761510849, + "rewards/margins": 0.04763360694050789, + "rewards/rejected": -0.0407596156001091, + "step": 784 + }, + { + "epoch": 0.12139957471486565, + "grad_norm": 5.661484718322754, + "learning_rate": 2.02319587628866e-06, + "logits/chosen": 8.925309181213379, + "logits/rejected": 6.319267272949219, + "logps/chosen": -194.45753479003906, + "logps/rejected": -164.46499633789062, + "loss": 0.6892, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.0011563310399651527, + "rewards/margins": 0.009312868118286133, + "rewards/rejected": -0.010469197295606136, + "step": 785 + }, + { + "epoch": 0.12155422385462981, + "grad_norm": 6.201214790344238, + "learning_rate": 2.025773195876289e-06, + "logits/chosen": 15.013701438903809, + "logits/rejected": 11.145907402038574, + "logps/chosen": -433.2425231933594, + "logps/rejected": -408.227294921875, + "loss": 0.669, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.03738289326429367, + "rewards/margins": 0.051407914608716965, + "rewards/rejected": -0.014025024138391018, + "step": 786 + }, + { + "epoch": 0.12170887299439397, + "grad_norm": 4.733052730560303, + "learning_rate": 2.0283505154639175e-06, + "logits/chosen": 8.976184844970703, + "logits/rejected": 0.9876854419708252, + "logps/chosen": -195.0241241455078, + "logps/rejected": -126.92036437988281, + "loss": 0.6738, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.032509900629520416, + "rewards/margins": 0.04624588415026665, + "rewards/rejected": -0.01373598538339138, + "step": 787 + }, + { + "epoch": 0.12186352213415813, + "grad_norm": 5.2377777099609375, + "learning_rate": 2.0309278350515464e-06, + "logits/chosen": 9.633549690246582, + "logits/rejected": 8.667489051818848, + "logps/chosen": -242.53811645507812, + "logps/rejected": -229.4419708251953, + "loss": 0.7222, + "rewards/accuracies": 0.125, + "rewards/chosen": 0.029599763453006744, + "rewards/margins": -0.0503387451171875, + "rewards/rejected": 0.07993850857019424, + "step": 788 + }, + { + "epoch": 0.12201817127392228, + "grad_norm": 6.034938335418701, + "learning_rate": 2.0335051546391752e-06, + "logits/chosen": 7.839482307434082, + "logits/rejected": 6.206631660461426, + "logps/chosen": -354.34478759765625, + "logps/rejected": -278.4581298828125, + "loss": 0.6565, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.08779926598072052, + "rewards/margins": 0.08366823196411133, + "rewards/rejected": 0.004131029359996319, + "step": 789 + }, + { + "epoch": 0.12217282041368645, + "grad_norm": 4.621279239654541, + "learning_rate": 2.036082474226804e-06, + "logits/chosen": 13.00024700164795, + "logits/rejected": 13.73876953125, + "logps/chosen": -241.05104064941406, + "logps/rejected": -224.72862243652344, + "loss": 0.7827, + "rewards/accuracies": 0.0, + "rewards/chosen": -0.03783273696899414, + "rewards/margins": -0.16726845502853394, + "rewards/rejected": 0.129435732960701, + "step": 790 + }, + { + "epoch": 0.12232746955345061, + "grad_norm": 5.090161323547363, + "learning_rate": 2.038659793814433e-06, + "logits/chosen": 6.883066177368164, + "logits/rejected": 3.850621223449707, + "logps/chosen": -175.68406677246094, + "logps/rejected": -196.53976440429688, + "loss": 0.6936, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.0826881155371666, + "rewards/margins": 0.017871970310807228, + "rewards/rejected": 0.06481613963842392, + "step": 791 + }, + { + "epoch": 0.12248211869321476, + "grad_norm": 5.466822147369385, + "learning_rate": 2.041237113402062e-06, + "logits/chosen": 7.592716217041016, + "logits/rejected": 11.036369323730469, + "logps/chosen": -282.68658447265625, + "logps/rejected": -372.5218200683594, + "loss": 0.6922, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.02853524498641491, + "rewards/margins": 0.007233594078570604, + "rewards/rejected": 0.02130165323615074, + "step": 792 + }, + { + "epoch": 0.12263676783297893, + "grad_norm": 5.506161212921143, + "learning_rate": 2.0438144329896907e-06, + "logits/chosen": 4.128499507904053, + "logits/rejected": 5.9841389656066895, + "logps/chosen": -328.60919189453125, + "logps/rejected": -304.7389221191406, + "loss": 0.7031, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.04923431575298309, + "rewards/margins": -0.006314259022474289, + "rewards/rejected": 0.055548571050167084, + "step": 793 + }, + { + "epoch": 0.1227914169727431, + "grad_norm": 5.09592866897583, + "learning_rate": 2.0463917525773196e-06, + "logits/chosen": 11.008781433105469, + "logits/rejected": 13.028718948364258, + "logps/chosen": -283.5499267578125, + "logps/rejected": -314.2843322753906, + "loss": 0.693, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.06488609313964844, + "rewards/margins": 0.0037350673228502274, + "rewards/rejected": 0.06115102767944336, + "step": 794 + }, + { + "epoch": 0.12294606611250725, + "grad_norm": 4.265661239624023, + "learning_rate": 2.0489690721649485e-06, + "logits/chosen": 13.848640441894531, + "logits/rejected": 11.360034942626953, + "logps/chosen": -296.02288818359375, + "logps/rejected": -225.19744873046875, + "loss": 0.6794, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.017357969656586647, + "rewards/margins": 0.03893432393670082, + "rewards/rejected": -0.021576358005404472, + "step": 795 + }, + { + "epoch": 0.12310071525227141, + "grad_norm": 4.471963882446289, + "learning_rate": 2.0515463917525773e-06, + "logits/chosen": 4.800446510314941, + "logits/rejected": 12.012195587158203, + "logps/chosen": -187.22003173828125, + "logps/rejected": -246.9071044921875, + "loss": 0.6755, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.020716626197099686, + "rewards/margins": 0.04033312946557999, + "rewards/rejected": -0.019616510719060898, + "step": 796 + }, + { + "epoch": 0.12325536439203556, + "grad_norm": 5.098708629608154, + "learning_rate": 2.0541237113402062e-06, + "logits/chosen": 8.195775985717773, + "logits/rejected": 15.901062965393066, + "logps/chosen": -197.41647338867188, + "logps/rejected": -285.45166015625, + "loss": 0.6833, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.004872034303843975, + "rewards/margins": 0.040291640907526016, + "rewards/rejected": -0.035419613122940063, + "step": 797 + }, + { + "epoch": 0.12341001353179973, + "grad_norm": 5.737367153167725, + "learning_rate": 2.056701030927835e-06, + "logits/chosen": 13.195758819580078, + "logits/rejected": 3.952853202819824, + "logps/chosen": -405.4150085449219, + "logps/rejected": -279.95404052734375, + "loss": 0.6469, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.10786724090576172, + "rewards/margins": 0.1071086898446083, + "rewards/rejected": 0.0007585529237985611, + "step": 798 + }, + { + "epoch": 0.1235646626715639, + "grad_norm": 4.648750305175781, + "learning_rate": 2.059278350515464e-06, + "logits/chosen": 6.546786785125732, + "logits/rejected": 10.23299503326416, + "logps/chosen": -195.3010711669922, + "logps/rejected": -216.12734985351562, + "loss": 0.6892, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.04346632957458496, + "rewards/margins": 0.010684346780180931, + "rewards/rejected": 0.03278198093175888, + "step": 799 + }, + { + "epoch": 0.12371931181132804, + "grad_norm": 5.58543062210083, + "learning_rate": 2.061855670103093e-06, + "logits/chosen": 9.791015625, + "logits/rejected": 9.70248031616211, + "logps/chosen": -317.1544189453125, + "logps/rejected": -342.6253356933594, + "loss": 0.6238, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.13601836562156677, + "rewards/margins": 0.15982751548290253, + "rewards/rejected": -0.023809146136045456, + "step": 800 + }, + { + "epoch": 0.12387396095109221, + "grad_norm": 5.999906063079834, + "learning_rate": 2.0644329896907217e-06, + "logits/chosen": 15.210868835449219, + "logits/rejected": 10.143805503845215, + "logps/chosen": -367.5404052734375, + "logps/rejected": -262.6044006347656, + "loss": 0.6791, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.11988945305347443, + "rewards/margins": 0.05030715465545654, + "rewards/rejected": 0.06958229839801788, + "step": 801 + }, + { + "epoch": 0.12402861009085638, + "grad_norm": 12.851231575012207, + "learning_rate": 2.0670103092783506e-06, + "logits/chosen": 18.283771514892578, + "logits/rejected": 9.252113342285156, + "logps/chosen": -371.8199768066406, + "logps/rejected": -262.4722900390625, + "loss": 0.6784, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.03439944237470627, + "rewards/margins": 0.03801487386226654, + "rewards/rejected": -0.003615431487560272, + "step": 802 + }, + { + "epoch": 0.12418325923062053, + "grad_norm": 3.7239251136779785, + "learning_rate": 2.0695876288659794e-06, + "logits/chosen": 14.889500617980957, + "logits/rejected": 16.419139862060547, + "logps/chosen": -185.39463806152344, + "logps/rejected": -192.43190002441406, + "loss": 0.7052, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.02721242792904377, + "rewards/margins": -0.021084880456328392, + "rewards/rejected": -0.00612754886969924, + "step": 803 + }, + { + "epoch": 0.12433790837038469, + "grad_norm": 7.316143035888672, + "learning_rate": 2.0721649484536087e-06, + "logits/chosen": 8.062252044677734, + "logits/rejected": 7.828424453735352, + "logps/chosen": -358.885986328125, + "logps/rejected": -353.7328186035156, + "loss": 0.74, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.08713152259588242, + "rewards/margins": -0.061764907091856, + "rewards/rejected": -0.02536662295460701, + "step": 804 + }, + { + "epoch": 0.12449255751014884, + "grad_norm": 5.290778636932373, + "learning_rate": 2.0747422680412376e-06, + "logits/chosen": 8.164313316345215, + "logits/rejected": 7.313644886016846, + "logps/chosen": -380.3600158691406, + "logps/rejected": -335.2882080078125, + "loss": 0.6627, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.0015501962043344975, + "rewards/margins": 0.06636910140514374, + "rewards/rejected": -0.0679192990064621, + "step": 805 + }, + { + "epoch": 0.12464720664991301, + "grad_norm": 4.303194046020508, + "learning_rate": 2.077319587628866e-06, + "logits/chosen": 6.468660831451416, + "logits/rejected": 9.87783432006836, + "logps/chosen": -166.953857421875, + "logps/rejected": -272.2720642089844, + "loss": 0.6919, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.01073012501001358, + "rewards/margins": 0.0056481375358998775, + "rewards/rejected": -0.016378259286284447, + "step": 806 + }, + { + "epoch": 0.12480185578967717, + "grad_norm": 5.501675128936768, + "learning_rate": 2.079896907216495e-06, + "logits/chosen": 15.556056022644043, + "logits/rejected": 11.417365074157715, + "logps/chosen": -347.63348388671875, + "logps/rejected": -239.30458068847656, + "loss": 0.6314, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.12454967200756073, + "rewards/margins": 0.13267840445041656, + "rewards/rejected": -0.008128738962113857, + "step": 807 + }, + { + "epoch": 0.12495650492944133, + "grad_norm": 4.462109088897705, + "learning_rate": 2.082474226804124e-06, + "logits/chosen": 11.569940567016602, + "logits/rejected": 4.980714797973633, + "logps/chosen": -225.48834228515625, + "logps/rejected": -188.32470703125, + "loss": 0.6275, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.1012401133775711, + "rewards/margins": 0.14932766556739807, + "rewards/rejected": -0.04808754846453667, + "step": 808 + }, + { + "epoch": 0.12511115406920548, + "grad_norm": 5.018195629119873, + "learning_rate": 2.0850515463917527e-06, + "logits/chosen": 6.377014636993408, + "logits/rejected": 9.590864181518555, + "logps/chosen": -304.1554870605469, + "logps/rejected": -264.3926696777344, + "loss": 0.7024, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.04383482784032822, + "rewards/margins": -0.0163697712123394, + "rewards/rejected": 0.060204602777957916, + "step": 809 + }, + { + "epoch": 0.12526580320896966, + "grad_norm": 5.525274753570557, + "learning_rate": 2.0876288659793816e-06, + "logits/chosen": 11.017049789428711, + "logits/rejected": 6.923498630523682, + "logps/chosen": -299.4324035644531, + "logps/rejected": -224.84326171875, + "loss": 0.7309, + "rewards/accuracies": 0.25, + "rewards/chosen": 0.019978739321231842, + "rewards/margins": -0.06723172962665558, + "rewards/rejected": 0.08721046894788742, + "step": 810 + }, + { + "epoch": 0.1254204523487338, + "grad_norm": 5.988023281097412, + "learning_rate": 2.0902061855670104e-06, + "logits/chosen": 12.080341339111328, + "logits/rejected": 8.940905570983887, + "logps/chosen": -288.75872802734375, + "logps/rejected": -245.6265411376953, + "loss": 0.6778, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.010688398033380508, + "rewards/margins": 0.042087554931640625, + "rewards/rejected": -0.03139915689826012, + "step": 811 + }, + { + "epoch": 0.12557510148849796, + "grad_norm": 5.466331481933594, + "learning_rate": 2.0927835051546393e-06, + "logits/chosen": 3.9428858757019043, + "logits/rejected": 6.832188129425049, + "logps/chosen": -237.5424041748047, + "logps/rejected": -221.52450561523438, + "loss": 0.7522, + "rewards/accuracies": 0.125, + "rewards/chosen": 0.008899642154574394, + "rewards/margins": -0.10910601913928986, + "rewards/rejected": 0.1180056557059288, + "step": 812 + }, + { + "epoch": 0.12572975062826214, + "grad_norm": 5.38535213470459, + "learning_rate": 2.095360824742268e-06, + "logits/chosen": 11.530954360961914, + "logits/rejected": 10.034501075744629, + "logps/chosen": -230.23403930664062, + "logps/rejected": -247.35653686523438, + "loss": 0.7041, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.010055923834443092, + "rewards/margins": -0.01503563392907381, + "rewards/rejected": 0.004979707300662994, + "step": 813 + }, + { + "epoch": 0.1258843997680263, + "grad_norm": 4.561618328094482, + "learning_rate": 2.097938144329897e-06, + "logits/chosen": 17.721664428710938, + "logits/rejected": 13.507036209106445, + "logps/chosen": -297.8882751464844, + "logps/rejected": -251.82601928710938, + "loss": 0.6516, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.11350574344396591, + "rewards/margins": 0.09214649349451065, + "rewards/rejected": 0.02135925367474556, + "step": 814 + }, + { + "epoch": 0.12603904890779044, + "grad_norm": 5.475361347198486, + "learning_rate": 2.100515463917526e-06, + "logits/chosen": 12.706809997558594, + "logits/rejected": 4.335636138916016, + "logps/chosen": -334.39837646484375, + "logps/rejected": -292.5032958984375, + "loss": 0.6386, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.03400973975658417, + "rewards/margins": 0.11597838997840881, + "rewards/rejected": -0.08196864277124405, + "step": 815 + }, + { + "epoch": 0.12619369804755462, + "grad_norm": 7.0570878982543945, + "learning_rate": 2.1030927835051548e-06, + "logits/chosen": 9.889067649841309, + "logits/rejected": 8.568881034851074, + "logps/chosen": -315.8548583984375, + "logps/rejected": -297.20184326171875, + "loss": 0.7286, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.037375811487436295, + "rewards/margins": -0.06345844268798828, + "rewards/rejected": 0.026082634925842285, + "step": 816 + }, + { + "epoch": 0.12634834718731877, + "grad_norm": 8.126748085021973, + "learning_rate": 2.1056701030927837e-06, + "logits/chosen": 4.589727401733398, + "logits/rejected": 9.561027526855469, + "logps/chosen": -325.199462890625, + "logps/rejected": -371.70489501953125, + "loss": 0.6273, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.06847696006298065, + "rewards/margins": 0.1443004608154297, + "rewards/rejected": -0.07582350075244904, + "step": 817 + }, + { + "epoch": 0.12650299632708292, + "grad_norm": 4.674380779266357, + "learning_rate": 2.1082474226804125e-06, + "logits/chosen": 7.388611793518066, + "logits/rejected": 4.8958024978637695, + "logps/chosen": -284.6629333496094, + "logps/rejected": -235.03884887695312, + "loss": 0.6684, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.008503109216690063, + "rewards/margins": 0.05912017822265625, + "rewards/rejected": -0.05061707645654678, + "step": 818 + }, + { + "epoch": 0.1266576454668471, + "grad_norm": 4.499520301818848, + "learning_rate": 2.1108247422680414e-06, + "logits/chosen": 13.807245254516602, + "logits/rejected": 9.02981185913086, + "logps/chosen": -238.164794921875, + "logps/rejected": -246.54641723632812, + "loss": 0.7396, + "rewards/accuracies": 0.125, + "rewards/chosen": -0.010256385430693626, + "rewards/margins": -0.08858489990234375, + "rewards/rejected": 0.07832851260900497, + "step": 819 + }, + { + "epoch": 0.12681229460661125, + "grad_norm": 5.532468795776367, + "learning_rate": 2.1134020618556703e-06, + "logits/chosen": 13.084516525268555, + "logits/rejected": 10.398052215576172, + "logps/chosen": -327.9105224609375, + "logps/rejected": -317.53192138671875, + "loss": 0.6004, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.21336251497268677, + "rewards/margins": 0.21609102189540863, + "rewards/rejected": -0.002728504128754139, + "step": 820 + }, + { + "epoch": 0.1269669437463754, + "grad_norm": 9.940056800842285, + "learning_rate": 2.115979381443299e-06, + "logits/chosen": 3.2517383098602295, + "logits/rejected": 8.900216102600098, + "logps/chosen": -285.9754943847656, + "logps/rejected": -209.3849334716797, + "loss": 0.6975, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.033056069165468216, + "rewards/margins": 0.0024034082889556885, + "rewards/rejected": 0.030652664601802826, + "step": 821 + }, + { + "epoch": 0.12712159288613958, + "grad_norm": 5.591113090515137, + "learning_rate": 2.118556701030928e-06, + "logits/chosen": 2.3430514335632324, + "logits/rejected": 3.8197216987609863, + "logps/chosen": -508.2909240722656, + "logps/rejected": -183.2859649658203, + "loss": 0.654, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.006513596512377262, + "rewards/margins": 0.08492724597454071, + "rewards/rejected": -0.0914408415555954, + "step": 822 + }, + { + "epoch": 0.12727624202590374, + "grad_norm": 6.254790306091309, + "learning_rate": 2.121134020618557e-06, + "logits/chosen": 8.330195426940918, + "logits/rejected": 2.063925266265869, + "logps/chosen": -289.0109558105469, + "logps/rejected": -219.625732421875, + "loss": 0.7446, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.036389730870723724, + "rewards/margins": -0.06864052265882492, + "rewards/rejected": 0.0322507843375206, + "step": 823 + }, + { + "epoch": 0.1274308911656679, + "grad_norm": 5.575355052947998, + "learning_rate": 2.1237113402061858e-06, + "logits/chosen": 7.975765705108643, + "logits/rejected": 10.704412460327148, + "logps/chosen": -236.17141723632812, + "logps/rejected": -391.30511474609375, + "loss": 0.6372, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.0528411902487278, + "rewards/margins": 0.11911468207836151, + "rewards/rejected": -0.06627349555492401, + "step": 824 + }, + { + "epoch": 0.12758554030543204, + "grad_norm": 5.477494716644287, + "learning_rate": 2.1262886597938146e-06, + "logits/chosen": 9.324621200561523, + "logits/rejected": 1.4057573080062866, + "logps/chosen": -315.09405517578125, + "logps/rejected": -252.90664672851562, + "loss": 0.6818, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.030775122344493866, + "rewards/margins": 0.026944708079099655, + "rewards/rejected": 0.003830414265394211, + "step": 825 + }, + { + "epoch": 0.12774018944519622, + "grad_norm": 5.634899616241455, + "learning_rate": 2.1288659793814435e-06, + "logits/chosen": 10.121933937072754, + "logits/rejected": 9.240137100219727, + "logps/chosen": -308.15411376953125, + "logps/rejected": -332.68438720703125, + "loss": 0.7094, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.052181147038936615, + "rewards/margins": -0.028965899720788002, + "rewards/rejected": -0.023215247318148613, + "step": 826 + }, + { + "epoch": 0.12789483858496037, + "grad_norm": 4.997093200683594, + "learning_rate": 2.1314432989690724e-06, + "logits/chosen": 12.90583610534668, + "logits/rejected": 10.992410659790039, + "logps/chosen": -287.5904541015625, + "logps/rejected": -243.84765625, + "loss": 0.6812, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.06390509754419327, + "rewards/margins": 0.043041277676820755, + "rewards/rejected": 0.020863819867372513, + "step": 827 + }, + { + "epoch": 0.12804948772472452, + "grad_norm": 11.93228530883789, + "learning_rate": 2.1340206185567012e-06, + "logits/chosen": 9.490852355957031, + "logits/rejected": 4.092257022857666, + "logps/chosen": -424.6783447265625, + "logps/rejected": -347.0323791503906, + "loss": 0.7611, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.06445684283971786, + "rewards/margins": -0.12477999180555344, + "rewards/rejected": 0.06032313406467438, + "step": 828 + }, + { + "epoch": 0.1282041368644887, + "grad_norm": 4.208880424499512, + "learning_rate": 2.13659793814433e-06, + "logits/chosen": 11.985099792480469, + "logits/rejected": 11.756420135498047, + "logps/chosen": -215.1483917236328, + "logps/rejected": -232.8167266845703, + "loss": 0.6597, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.06749783456325531, + "rewards/margins": 0.06984911113977432, + "rewards/rejected": -0.002351284958422184, + "step": 829 + }, + { + "epoch": 0.12835878600425285, + "grad_norm": 5.282952308654785, + "learning_rate": 2.139175257731959e-06, + "logits/chosen": 11.35549545288086, + "logits/rejected": 11.96148681640625, + "logps/chosen": -281.9228820800781, + "logps/rejected": -311.50067138671875, + "loss": 0.6736, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.014527034014463425, + "rewards/margins": 0.04262495040893555, + "rewards/rejected": -0.05715198814868927, + "step": 830 + }, + { + "epoch": 0.128513435144017, + "grad_norm": 5.628005504608154, + "learning_rate": 2.141752577319588e-06, + "logits/chosen": 14.49488639831543, + "logits/rejected": 7.180295944213867, + "logps/chosen": -225.1959228515625, + "logps/rejected": -151.926025390625, + "loss": 0.6973, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.06507306545972824, + "rewards/margins": -0.0008908025920391083, + "rewards/rejected": -0.06418225169181824, + "step": 831 + }, + { + "epoch": 0.12866808428378118, + "grad_norm": 4.31758975982666, + "learning_rate": 2.1443298969072167e-06, + "logits/chosen": 6.613127708435059, + "logits/rejected": 10.01853084564209, + "logps/chosen": -148.9969940185547, + "logps/rejected": -172.67262268066406, + "loss": 0.6707, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.05273721367120743, + "rewards/margins": 0.04980647563934326, + "rewards/rejected": 0.0029307371005415916, + "step": 832 + }, + { + "epoch": 0.12882273342354533, + "grad_norm": 5.901344299316406, + "learning_rate": 2.1469072164948456e-06, + "logits/chosen": 11.104122161865234, + "logits/rejected": 10.231244087219238, + "logps/chosen": -287.00421142578125, + "logps/rejected": -311.8900146484375, + "loss": 0.7569, + "rewards/accuracies": 0.25, + "rewards/chosen": -0.04850844666361809, + "rewards/margins": -0.10779397189617157, + "rewards/rejected": 0.059285521507263184, + "step": 833 + }, + { + "epoch": 0.12897738256330948, + "grad_norm": 5.326846122741699, + "learning_rate": 2.1494845360824745e-06, + "logits/chosen": 11.311027526855469, + "logits/rejected": 6.194657325744629, + "logps/chosen": -275.8129577636719, + "logps/rejected": -249.55160522460938, + "loss": 0.6156, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.02035531774163246, + "rewards/margins": 0.20545989274978638, + "rewards/rejected": -0.1851045787334442, + "step": 834 + }, + { + "epoch": 0.12913203170307366, + "grad_norm": 5.7647318840026855, + "learning_rate": 2.1520618556701033e-06, + "logits/chosen": 13.287602424621582, + "logits/rejected": 6.345489501953125, + "logps/chosen": -336.2106628417969, + "logps/rejected": -238.15841674804688, + "loss": 0.6602, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.011835005134344101, + "rewards/margins": 0.07760138809680939, + "rewards/rejected": -0.08943638950586319, + "step": 835 + }, + { + "epoch": 0.12928668084283781, + "grad_norm": 5.190736770629883, + "learning_rate": 2.1546391752577322e-06, + "logits/chosen": 14.597845077514648, + "logits/rejected": 14.97640609741211, + "logps/chosen": -305.5723876953125, + "logps/rejected": -259.9925537109375, + "loss": 0.6951, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.04437980800867081, + "rewards/margins": 0.008184665814042091, + "rewards/rejected": -0.052564479410648346, + "step": 836 + }, + { + "epoch": 0.12944132998260197, + "grad_norm": 5.012102127075195, + "learning_rate": 2.157216494845361e-06, + "logits/chosen": 10.570706367492676, + "logits/rejected": 4.804713726043701, + "logps/chosen": -239.45379638671875, + "logps/rejected": -216.71066284179688, + "loss": 0.6994, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.006371835246682167, + "rewards/margins": -0.0052504995837807655, + "rewards/rejected": 0.011622333899140358, + "step": 837 + }, + { + "epoch": 0.12959597912236615, + "grad_norm": 5.319317817687988, + "learning_rate": 2.15979381443299e-06, + "logits/chosen": 6.073154449462891, + "logits/rejected": -1.7007324695587158, + "logps/chosen": -325.26202392578125, + "logps/rejected": -249.67059326171875, + "loss": 0.668, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.08003725856542587, + "rewards/margins": 0.0574459582567215, + "rewards/rejected": 0.022591307759284973, + "step": 838 + }, + { + "epoch": 0.1297506282621303, + "grad_norm": 5.549462795257568, + "learning_rate": 2.162371134020619e-06, + "logits/chosen": 12.748003005981445, + "logits/rejected": 8.749460220336914, + "logps/chosen": -365.42523193359375, + "logps/rejected": -306.39288330078125, + "loss": 0.6868, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.08983974903821945, + "rewards/margins": 0.017597097903490067, + "rewards/rejected": 0.07224264740943909, + "step": 839 + }, + { + "epoch": 0.12990527740189445, + "grad_norm": 5.197721481323242, + "learning_rate": 2.1649484536082477e-06, + "logits/chosen": 15.839910507202148, + "logits/rejected": 9.044678688049316, + "logps/chosen": -357.5267639160156, + "logps/rejected": -239.7066650390625, + "loss": 0.6307, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.13030031323432922, + "rewards/margins": 0.13541758060455322, + "rewards/rejected": -0.005117248743772507, + "step": 840 + }, + { + "epoch": 0.1300599265416586, + "grad_norm": 7.483394622802734, + "learning_rate": 2.1675257731958766e-06, + "logits/chosen": 9.625455856323242, + "logits/rejected": 5.541503429412842, + "logps/chosen": -275.71551513671875, + "logps/rejected": -236.4689178466797, + "loss": 0.6559, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.01684422791004181, + "rewards/margins": 0.08393378555774689, + "rewards/rejected": -0.1007780134677887, + "step": 841 + }, + { + "epoch": 0.13021457568142278, + "grad_norm": 4.375672817230225, + "learning_rate": 2.1701030927835055e-06, + "logits/chosen": 12.223703384399414, + "logits/rejected": 4.873989105224609, + "logps/chosen": -272.21771240234375, + "logps/rejected": -177.20758056640625, + "loss": 0.6927, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.00025720614939928055, + "rewards/margins": 0.004235647618770599, + "rewards/rejected": -0.0044928546994924545, + "step": 842 + }, + { + "epoch": 0.13036922482118693, + "grad_norm": 4.192330360412598, + "learning_rate": 2.1726804123711343e-06, + "logits/chosen": 9.940579414367676, + "logits/rejected": 11.158378601074219, + "logps/chosen": -214.2007293701172, + "logps/rejected": -230.5918731689453, + "loss": 0.6812, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.005637500435113907, + "rewards/margins": 0.028319569304585457, + "rewards/rejected": -0.0226820707321167, + "step": 843 + }, + { + "epoch": 0.13052387396095108, + "grad_norm": 7.0680952072143555, + "learning_rate": 2.175257731958763e-06, + "logits/chosen": 9.581306457519531, + "logits/rejected": 7.343588829040527, + "logps/chosen": -320.553955078125, + "logps/rejected": -280.4522705078125, + "loss": 0.6899, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.07376085221767426, + "rewards/margins": 0.012072324752807617, + "rewards/rejected": 0.06168852001428604, + "step": 844 + }, + { + "epoch": 0.13067852310071526, + "grad_norm": 6.187963008880615, + "learning_rate": 2.177835051546392e-06, + "logits/chosen": 8.123754501342773, + "logits/rejected": 8.557390213012695, + "logps/chosen": -197.9710235595703, + "logps/rejected": -185.9560089111328, + "loss": 0.7158, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.016050808131694794, + "rewards/margins": -0.030372004956007004, + "rewards/rejected": 0.014321183785796165, + "step": 845 + }, + { + "epoch": 0.1308331722404794, + "grad_norm": 4.1606268882751465, + "learning_rate": 2.180412371134021e-06, + "logits/chosen": 9.032398223876953, + "logits/rejected": 2.874891757965088, + "logps/chosen": -290.2815856933594, + "logps/rejected": -169.91775512695312, + "loss": 0.5965, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.19966769218444824, + "rewards/margins": 0.22657713294029236, + "rewards/rejected": -0.026909448206424713, + "step": 846 + }, + { + "epoch": 0.13098782138024356, + "grad_norm": 5.486983299255371, + "learning_rate": 2.18298969072165e-06, + "logits/chosen": 8.518402099609375, + "logits/rejected": 7.906534671783447, + "logps/chosen": -315.7825927734375, + "logps/rejected": -397.38824462890625, + "loss": 0.6529, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.1356370449066162, + "rewards/margins": 0.08912573009729385, + "rewards/rejected": 0.046511318534612656, + "step": 847 + }, + { + "epoch": 0.13114247052000774, + "grad_norm": 5.048002243041992, + "learning_rate": 2.1855670103092787e-06, + "logits/chosen": 5.9496002197265625, + "logits/rejected": 2.853168487548828, + "logps/chosen": -420.3154296875, + "logps/rejected": -264.4692687988281, + "loss": 0.6216, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.16529923677444458, + "rewards/margins": 0.15697579085826874, + "rewards/rejected": 0.008323431946337223, + "step": 848 + }, + { + "epoch": 0.1312971196597719, + "grad_norm": 6.496761322021484, + "learning_rate": 2.1881443298969076e-06, + "logits/chosen": 9.39857292175293, + "logits/rejected": 7.443991661071777, + "logps/chosen": -356.5985107421875, + "logps/rejected": -357.9806213378906, + "loss": 0.6649, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.03705472871661186, + "rewards/margins": 0.06678623706102371, + "rewards/rejected": -0.02973151206970215, + "step": 849 + }, + { + "epoch": 0.13145176879953605, + "grad_norm": 5.437524318695068, + "learning_rate": 2.1907216494845364e-06, + "logits/chosen": 12.375151634216309, + "logits/rejected": 9.113358497619629, + "logps/chosen": -290.90386962890625, + "logps/rejected": -338.252685546875, + "loss": 0.6832, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.010050198063254356, + "rewards/margins": 0.039315223693847656, + "rewards/rejected": -0.02926502376794815, + "step": 850 + }, + { + "epoch": 0.13160641793930022, + "grad_norm": 5.165545463562012, + "learning_rate": 2.1932989690721653e-06, + "logits/chosen": 6.2288618087768555, + "logits/rejected": 10.506753921508789, + "logps/chosen": -294.6086120605469, + "logps/rejected": -350.2838134765625, + "loss": 0.6546, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.14319303631782532, + "rewards/margins": 0.08622626960277557, + "rewards/rejected": 0.05696675926446915, + "step": 851 + }, + { + "epoch": 0.13176106707906438, + "grad_norm": 8.821476936340332, + "learning_rate": 2.195876288659794e-06, + "logits/chosen": 15.098304748535156, + "logits/rejected": 10.107544898986816, + "logps/chosen": -405.1445617675781, + "logps/rejected": -450.9846496582031, + "loss": 0.7068, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.2196439653635025, + "rewards/margins": -0.013928033411502838, + "rewards/rejected": 0.23357200622558594, + "step": 852 + }, + { + "epoch": 0.13191571621882853, + "grad_norm": 6.702816963195801, + "learning_rate": 2.1984536082474226e-06, + "logits/chosen": 8.889533996582031, + "logits/rejected": 6.199413299560547, + "logps/chosen": -368.3157958984375, + "logps/rejected": -259.1598205566406, + "loss": 0.7288, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.017586207017302513, + "rewards/margins": -0.057279422879219055, + "rewards/rejected": 0.07486562430858612, + "step": 853 + }, + { + "epoch": 0.1320703653585927, + "grad_norm": 5.448475360870361, + "learning_rate": 2.2010309278350515e-06, + "logits/chosen": 5.404626369476318, + "logits/rejected": 7.960400581359863, + "logps/chosen": -207.77020263671875, + "logps/rejected": -236.05502319335938, + "loss": 0.6922, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.0670294314622879, + "rewards/margins": 0.009743781760334969, + "rewards/rejected": 0.057285647839307785, + "step": 854 + }, + { + "epoch": 0.13222501449835686, + "grad_norm": 5.092534065246582, + "learning_rate": 2.2036082474226804e-06, + "logits/chosen": 14.00489616394043, + "logits/rejected": 4.81197452545166, + "logps/chosen": -330.98785400390625, + "logps/rejected": -217.79623413085938, + "loss": 0.6497, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.08687266707420349, + "rewards/margins": 0.09883155673742294, + "rewards/rejected": -0.011958885937929153, + "step": 855 + }, + { + "epoch": 0.132379663638121, + "grad_norm": 5.820626258850098, + "learning_rate": 2.2061855670103092e-06, + "logits/chosen": 13.154562950134277, + "logits/rejected": 3.9785842895507812, + "logps/chosen": -245.25205993652344, + "logps/rejected": -226.92996215820312, + "loss": 0.7128, + "rewards/accuracies": 0.25, + "rewards/chosen": -0.08124828338623047, + "rewards/margins": -0.03117704577744007, + "rewards/rejected": -0.05007123947143555, + "step": 856 + }, + { + "epoch": 0.13253431277788516, + "grad_norm": 5.1035542488098145, + "learning_rate": 2.208762886597938e-06, + "logits/chosen": 15.868261337280273, + "logits/rejected": 9.982535362243652, + "logps/chosen": -284.6107177734375, + "logps/rejected": -245.8839874267578, + "loss": 0.6598, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.09444428235292435, + "rewards/margins": 0.08277445286512375, + "rewards/rejected": 0.0116698257625103, + "step": 857 + }, + { + "epoch": 0.13268896191764934, + "grad_norm": 5.5282745361328125, + "learning_rate": 2.211340206185567e-06, + "logits/chosen": 9.167112350463867, + "logits/rejected": 13.077407836914062, + "logps/chosen": -206.6728973388672, + "logps/rejected": -246.09349060058594, + "loss": 0.7257, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.003094123676419258, + "rewards/margins": -0.053266741335392, + "rewards/rejected": 0.05017261952161789, + "step": 858 + }, + { + "epoch": 0.1328436110574135, + "grad_norm": 6.413069725036621, + "learning_rate": 2.213917525773196e-06, + "logits/chosen": 9.0770845413208, + "logits/rejected": 5.942964553833008, + "logps/chosen": -518.3092651367188, + "logps/rejected": -350.59393310546875, + "loss": 0.7058, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.08394508063793182, + "rewards/margins": -0.005816606339067221, + "rewards/rejected": 0.08976168930530548, + "step": 859 + }, + { + "epoch": 0.13299826019717764, + "grad_norm": 4.293361186981201, + "learning_rate": 2.2164948453608247e-06, + "logits/chosen": 12.792933464050293, + "logits/rejected": 9.101966857910156, + "logps/chosen": -232.47250366210938, + "logps/rejected": -184.42225646972656, + "loss": 0.6937, + "rewards/accuracies": 0.25, + "rewards/chosen": 0.07570510357618332, + "rewards/margins": 0.006333686411380768, + "rewards/rejected": 0.06937141716480255, + "step": 860 + }, + { + "epoch": 0.13315290933694182, + "grad_norm": 4.916195869445801, + "learning_rate": 2.2190721649484536e-06, + "logits/chosen": 7.360235214233398, + "logits/rejected": 7.22442626953125, + "logps/chosen": -252.9958953857422, + "logps/rejected": -253.37078857421875, + "loss": 0.608, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.04272756725549698, + "rewards/margins": 0.20476779341697693, + "rewards/rejected": -0.16204023361206055, + "step": 861 + }, + { + "epoch": 0.13330755847670597, + "grad_norm": 5.882905006408691, + "learning_rate": 2.2216494845360825e-06, + "logits/chosen": 9.378471374511719, + "logits/rejected": 11.289730072021484, + "logps/chosen": -259.3278503417969, + "logps/rejected": -263.862548828125, + "loss": 0.7256, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.029156304895877838, + "rewards/margins": -0.05369654297828674, + "rewards/rejected": 0.024540234357118607, + "step": 862 + }, + { + "epoch": 0.13346220761647012, + "grad_norm": 6.041230201721191, + "learning_rate": 2.2242268041237113e-06, + "logits/chosen": 3.6784310340881348, + "logits/rejected": 12.983327865600586, + "logps/chosen": -144.37530517578125, + "logps/rejected": -295.48150634765625, + "loss": 0.6475, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.05102301016449928, + "rewards/margins": 0.09967866539955139, + "rewards/rejected": -0.04865565150976181, + "step": 863 + }, + { + "epoch": 0.1336168567562343, + "grad_norm": 5.762960433959961, + "learning_rate": 2.2268041237113406e-06, + "logits/chosen": 6.7540740966796875, + "logits/rejected": 3.442861557006836, + "logps/chosen": -254.0617218017578, + "logps/rejected": -221.40017700195312, + "loss": 0.701, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.09227581322193146, + "rewards/margins": -0.004321906715631485, + "rewards/rejected": 0.09659771621227264, + "step": 864 + }, + { + "epoch": 0.13377150589599845, + "grad_norm": 6.7018723487854, + "learning_rate": 2.2293814432989695e-06, + "logits/chosen": 11.998213768005371, + "logits/rejected": 8.129022598266602, + "logps/chosen": -427.28228759765625, + "logps/rejected": -294.0709228515625, + "loss": 0.7175, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.0322965644299984, + "rewards/margins": 0.006228595972061157, + "rewards/rejected": -0.03852515667676926, + "step": 865 + }, + { + "epoch": 0.1339261550357626, + "grad_norm": 6.30493688583374, + "learning_rate": 2.2319587628865984e-06, + "logits/chosen": 12.574810028076172, + "logits/rejected": 11.15985107421875, + "logps/chosen": -337.1407470703125, + "logps/rejected": -246.75933837890625, + "loss": 0.7369, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.006773374974727631, + "rewards/margins": -0.07670469582080841, + "rewards/rejected": 0.08347807824611664, + "step": 866 + }, + { + "epoch": 0.13408080417552679, + "grad_norm": 4.71605920791626, + "learning_rate": 2.2345360824742272e-06, + "logits/chosen": 7.984208106994629, + "logits/rejected": 5.614459991455078, + "logps/chosen": -264.9400634765625, + "logps/rejected": -245.4064483642578, + "loss": 0.6501, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.07264199107885361, + "rewards/margins": 0.09673988819122314, + "rewards/rejected": -0.02409788966178894, + "step": 867 + }, + { + "epoch": 0.13423545331529094, + "grad_norm": 4.266623020172119, + "learning_rate": 2.237113402061856e-06, + "logits/chosen": 10.277667999267578, + "logits/rejected": 12.970866203308105, + "logps/chosen": -192.5612030029297, + "logps/rejected": -283.6071472167969, + "loss": 0.6569, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.0014364225789904594, + "rewards/margins": 0.09040756523609161, + "rewards/rejected": -0.09184398502111435, + "step": 868 + }, + { + "epoch": 0.1343901024550551, + "grad_norm": 5.543199062347412, + "learning_rate": 2.239690721649485e-06, + "logits/chosen": 13.170656204223633, + "logits/rejected": 9.070703506469727, + "logps/chosen": -437.1995849609375, + "logps/rejected": -411.743896484375, + "loss": 0.6599, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.07504886388778687, + "rewards/margins": 0.07696985453367233, + "rewards/rejected": -0.0019209831953048706, + "step": 869 + }, + { + "epoch": 0.13454475159481927, + "grad_norm": 4.192004203796387, + "learning_rate": 2.242268041237114e-06, + "logits/chosen": 8.867194175720215, + "logits/rejected": 7.47512674331665, + "logps/chosen": -178.85935974121094, + "logps/rejected": -190.47662353515625, + "loss": 0.6896, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.09408316761255264, + "rewards/margins": 0.009184072725474834, + "rewards/rejected": -0.10326724499464035, + "step": 870 + }, + { + "epoch": 0.13469940073458342, + "grad_norm": 9.250446319580078, + "learning_rate": 2.2448453608247427e-06, + "logits/chosen": 13.235067367553711, + "logits/rejected": 5.132692337036133, + "logps/chosen": -263.7756042480469, + "logps/rejected": -132.05821228027344, + "loss": 0.7728, + "rewards/accuracies": 0.25, + "rewards/chosen": -0.08982138335704803, + "rewards/margins": -0.1378098577260971, + "rewards/rejected": 0.04798846319317818, + "step": 871 + }, + { + "epoch": 0.13485404987434757, + "grad_norm": 5.271758556365967, + "learning_rate": 2.247422680412371e-06, + "logits/chosen": 5.6582207679748535, + "logits/rejected": 2.5591440200805664, + "logps/chosen": -274.581298828125, + "logps/rejected": -215.13096618652344, + "loss": 0.6764, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.10240302234888077, + "rewards/margins": 0.04517187923192978, + "rewards/rejected": 0.05723114311695099, + "step": 872 + }, + { + "epoch": 0.13500869901411172, + "grad_norm": 5.767563819885254, + "learning_rate": 2.25e-06, + "logits/chosen": 14.18519401550293, + "logits/rejected": 13.963915824890137, + "logps/chosen": -360.17596435546875, + "logps/rejected": -314.26702880859375, + "loss": 0.7058, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.013453193940222263, + "rewards/margins": -0.005100056529045105, + "rewards/rejected": -0.00835314393043518, + "step": 873 + }, + { + "epoch": 0.1351633481538759, + "grad_norm": 4.836509704589844, + "learning_rate": 2.252577319587629e-06, + "logits/chosen": 12.89808177947998, + "logits/rejected": 9.826332092285156, + "logps/chosen": -304.12188720703125, + "logps/rejected": -229.79782104492188, + "loss": 0.6395, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.08812665939331055, + "rewards/margins": 0.13255575299263, + "rewards/rejected": -0.04442910850048065, + "step": 874 + }, + { + "epoch": 0.13531799729364005, + "grad_norm": 5.204434394836426, + "learning_rate": 2.255154639175258e-06, + "logits/chosen": 5.15670919418335, + "logits/rejected": 4.148692607879639, + "logps/chosen": -327.5721740722656, + "logps/rejected": -258.166748046875, + "loss": 0.6624, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.13059142231941223, + "rewards/margins": 0.07351868599653244, + "rewards/rejected": 0.05707273632287979, + "step": 875 + }, + { + "epoch": 0.1354726464334042, + "grad_norm": 4.169265270233154, + "learning_rate": 2.2577319587628867e-06, + "logits/chosen": 16.191415786743164, + "logits/rejected": 13.364799499511719, + "logps/chosen": -314.1923828125, + "logps/rejected": -269.66839599609375, + "loss": 0.6483, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.09650377929210663, + "rewards/margins": 0.1148630827665329, + "rewards/rejected": -0.01835930347442627, + "step": 876 + }, + { + "epoch": 0.13562729557316838, + "grad_norm": 6.274011135101318, + "learning_rate": 2.2603092783505155e-06, + "logits/chosen": 8.142695426940918, + "logits/rejected": 5.887526988983154, + "logps/chosen": -328.4047546386719, + "logps/rejected": -203.423095703125, + "loss": 0.6657, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.15985208749771118, + "rewards/margins": 0.06586318463087082, + "rewards/rejected": 0.09398889541625977, + "step": 877 + }, + { + "epoch": 0.13578194471293253, + "grad_norm": 7.283665180206299, + "learning_rate": 2.2628865979381444e-06, + "logits/chosen": 11.808151245117188, + "logits/rejected": 7.962191581726074, + "logps/chosen": -460.52899169921875, + "logps/rejected": -322.98748779296875, + "loss": 0.5729, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.39074021577835083, + "rewards/margins": 0.3238605856895447, + "rewards/rejected": 0.06687964498996735, + "step": 878 + }, + { + "epoch": 0.13593659385269669, + "grad_norm": 5.229115962982178, + "learning_rate": 2.2654639175257733e-06, + "logits/chosen": 14.614744186401367, + "logits/rejected": 7.676170825958252, + "logps/chosen": -330.6083679199219, + "logps/rejected": -216.42282104492188, + "loss": 0.6678, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.13434715569019318, + "rewards/margins": 0.0567832887172699, + "rewards/rejected": 0.07756385952234268, + "step": 879 + }, + { + "epoch": 0.13609124299246086, + "grad_norm": 4.286057472229004, + "learning_rate": 2.268041237113402e-06, + "logits/chosen": 12.034272193908691, + "logits/rejected": 9.07854175567627, + "logps/chosen": -202.08837890625, + "logps/rejected": -171.132080078125, + "loss": 0.6994, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.039801403880119324, + "rewards/margins": 0.0036340728402137756, + "rewards/rejected": 0.03616733476519585, + "step": 880 + }, + { + "epoch": 0.13624589213222502, + "grad_norm": 4.793503761291504, + "learning_rate": 2.270618556701031e-06, + "logits/chosen": 10.853227615356445, + "logits/rejected": 7.406322956085205, + "logps/chosen": -288.4029846191406, + "logps/rejected": -249.38790893554688, + "loss": 0.7017, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.11276023834943771, + "rewards/margins": -0.008917592465877533, + "rewards/rejected": 0.12167783081531525, + "step": 881 + }, + { + "epoch": 0.13640054127198917, + "grad_norm": 5.04573917388916, + "learning_rate": 2.27319587628866e-06, + "logits/chosen": 8.643777847290039, + "logits/rejected": 11.178739547729492, + "logps/chosen": -215.95718383789062, + "logps/rejected": -273.3276062011719, + "loss": 0.6896, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.11197976768016815, + "rewards/margins": 0.020266003906726837, + "rewards/rejected": 0.09171376377344131, + "step": 882 + }, + { + "epoch": 0.13655519041175335, + "grad_norm": 4.487614631652832, + "learning_rate": 2.2757731958762888e-06, + "logits/chosen": 9.808544158935547, + "logits/rejected": 2.70812726020813, + "logps/chosen": -235.29429626464844, + "logps/rejected": -155.73947143554688, + "loss": 0.6418, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.05298071354627609, + "rewards/margins": 0.10992717742919922, + "rewards/rejected": -0.05694647133350372, + "step": 883 + }, + { + "epoch": 0.1367098395515175, + "grad_norm": 4.402811527252197, + "learning_rate": 2.2783505154639176e-06, + "logits/chosen": 8.570183753967285, + "logits/rejected": 5.887582302093506, + "logps/chosen": -272.9149169921875, + "logps/rejected": -246.6571807861328, + "loss": 0.6751, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.010329247452318668, + "rewards/margins": 0.04991140961647034, + "rewards/rejected": -0.06024065613746643, + "step": 884 + }, + { + "epoch": 0.13686448869128165, + "grad_norm": 5.491387844085693, + "learning_rate": 2.2809278350515465e-06, + "logits/chosen": 11.135345458984375, + "logits/rejected": -0.7600812911987305, + "logps/chosen": -212.52496337890625, + "logps/rejected": -116.70137786865234, + "loss": 0.6569, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.07405834645032883, + "rewards/margins": 0.08941201865673065, + "rewards/rejected": -0.015353678725659847, + "step": 885 + }, + { + "epoch": 0.13701913783104583, + "grad_norm": 9.304813385009766, + "learning_rate": 2.2835051546391754e-06, + "logits/chosen": 10.337177276611328, + "logits/rejected": 4.320044994354248, + "logps/chosen": -332.7056579589844, + "logps/rejected": -221.1385498046875, + "loss": 0.7446, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.06646871566772461, + "rewards/margins": -0.09340272098779678, + "rewards/rejected": 0.159871444106102, + "step": 886 + }, + { + "epoch": 0.13717378697080998, + "grad_norm": 12.956573486328125, + "learning_rate": 2.2860824742268043e-06, + "logits/chosen": 7.518157958984375, + "logits/rejected": 11.64919662475586, + "logps/chosen": -184.48733520507812, + "logps/rejected": -200.37107849121094, + "loss": 0.6808, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.1380397379398346, + "rewards/margins": 0.0472596138715744, + "rewards/rejected": 0.0907801166176796, + "step": 887 + }, + { + "epoch": 0.13732843611057413, + "grad_norm": 4.373281002044678, + "learning_rate": 2.288659793814433e-06, + "logits/chosen": 9.1815185546875, + "logits/rejected": 7.211836814880371, + "logps/chosen": -246.74102783203125, + "logps/rejected": -241.49554443359375, + "loss": 0.6733, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.14692078530788422, + "rewards/margins": 0.04744339734315872, + "rewards/rejected": 0.0994773805141449, + "step": 888 + }, + { + "epoch": 0.13748308525033828, + "grad_norm": 4.062875747680664, + "learning_rate": 2.291237113402062e-06, + "logits/chosen": 10.741153717041016, + "logits/rejected": 10.303879737854004, + "logps/chosen": -160.2999725341797, + "logps/rejected": -162.9789276123047, + "loss": 0.6677, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.05793919786810875, + "rewards/margins": 0.057959653437137604, + "rewards/rejected": -2.045556902885437e-05, + "step": 889 + }, + { + "epoch": 0.13763773439010246, + "grad_norm": 5.453105926513672, + "learning_rate": 2.293814432989691e-06, + "logits/chosen": 6.773975372314453, + "logits/rejected": 7.228199481964111, + "logps/chosen": -283.3519287109375, + "logps/rejected": -223.96343994140625, + "loss": 0.6572, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.03751359134912491, + "rewards/margins": 0.08041834831237793, + "rewards/rejected": -0.04290475323796272, + "step": 890 + }, + { + "epoch": 0.1377923835298666, + "grad_norm": 4.047957897186279, + "learning_rate": 2.2963917525773198e-06, + "logits/chosen": 9.822409629821777, + "logits/rejected": 4.881087303161621, + "logps/chosen": -235.6240997314453, + "logps/rejected": -185.0804901123047, + "loss": 0.5943, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.13318605720996857, + "rewards/margins": 0.21671128273010254, + "rewards/rejected": -0.08352524042129517, + "step": 891 + }, + { + "epoch": 0.13794703266963076, + "grad_norm": 7.919179916381836, + "learning_rate": 2.2989690721649486e-06, + "logits/chosen": 2.247959852218628, + "logits/rejected": 7.165493965148926, + "logps/chosen": -437.8634033203125, + "logps/rejected": -280.78741455078125, + "loss": 0.6935, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.018958479166030884, + "rewards/margins": 0.030875300988554955, + "rewards/rejected": -0.011916822753846645, + "step": 892 + }, + { + "epoch": 0.13810168180939494, + "grad_norm": 4.436543941497803, + "learning_rate": 2.3015463917525775e-06, + "logits/chosen": 9.745589256286621, + "logits/rejected": 9.891400337219238, + "logps/chosen": -170.15150451660156, + "logps/rejected": -185.68832397460938, + "loss": 0.6224, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.13712917268276215, + "rewards/margins": 0.1751626431941986, + "rewards/rejected": -0.03803347796201706, + "step": 893 + }, + { + "epoch": 0.1382563309491591, + "grad_norm": 6.057544231414795, + "learning_rate": 2.3041237113402064e-06, + "logits/chosen": 12.952285766601562, + "logits/rejected": 10.196444511413574, + "logps/chosen": -366.32781982421875, + "logps/rejected": -299.0440673828125, + "loss": 0.6722, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.2091427445411682, + "rewards/margins": 0.08215785026550293, + "rewards/rejected": 0.1269848644733429, + "step": 894 + }, + { + "epoch": 0.13841098008892325, + "grad_norm": 4.52533483505249, + "learning_rate": 2.3067010309278352e-06, + "logits/chosen": 8.695606231689453, + "logits/rejected": 5.417052745819092, + "logps/chosen": -237.46693420410156, + "logps/rejected": -207.29637145996094, + "loss": 0.6279, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.16573992371559143, + "rewards/margins": 0.14171624183654785, + "rewards/rejected": 0.024023674428462982, + "step": 895 + }, + { + "epoch": 0.13856562922868743, + "grad_norm": 5.627161026000977, + "learning_rate": 2.309278350515464e-06, + "logits/chosen": 12.578010559082031, + "logits/rejected": 8.984378814697266, + "logps/chosen": -293.34515380859375, + "logps/rejected": -285.53143310546875, + "loss": 0.7107, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.07159089297056198, + "rewards/margins": -0.012142401188611984, + "rewards/rejected": 0.08373329788446426, + "step": 896 + }, + { + "epoch": 0.13872027836845158, + "grad_norm": 7.131906032562256, + "learning_rate": 2.311855670103093e-06, + "logits/chosen": 10.172256469726562, + "logits/rejected": 11.816521644592285, + "logps/chosen": -396.30206298828125, + "logps/rejected": -454.7700500488281, + "loss": 0.7719, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.006964873522520065, + "rewards/margins": -0.11543674767017365, + "rewards/rejected": 0.12240162491798401, + "step": 897 + }, + { + "epoch": 0.13887492750821573, + "grad_norm": 4.766530513763428, + "learning_rate": 2.314432989690722e-06, + "logits/chosen": 14.655416488647461, + "logits/rejected": 4.846123695373535, + "logps/chosen": -326.9876403808594, + "logps/rejected": -204.0702667236328, + "loss": 0.6333, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.1927243173122406, + "rewards/margins": 0.1624782681465149, + "rewards/rejected": 0.0302460677921772, + "step": 898 + }, + { + "epoch": 0.1390295766479799, + "grad_norm": 14.805150985717773, + "learning_rate": 2.3170103092783507e-06, + "logits/chosen": 13.82388973236084, + "logits/rejected": -2.2094528675079346, + "logps/chosen": -423.536376953125, + "logps/rejected": -366.45751953125, + "loss": 0.6195, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.2512151598930359, + "rewards/margins": 0.1839398741722107, + "rewards/rejected": 0.06727529317140579, + "step": 899 + }, + { + "epoch": 0.13918422578774406, + "grad_norm": 8.064802169799805, + "learning_rate": 2.3195876288659796e-06, + "logits/chosen": 2.946336030960083, + "logits/rejected": 5.952096939086914, + "logps/chosen": -200.036865234375, + "logps/rejected": -207.7844696044922, + "loss": 0.6429, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.11251163482666016, + "rewards/margins": 0.11533677577972412, + "rewards/rejected": -0.0028251418843865395, + "step": 900 + }, + { + "epoch": 0.1393388749275082, + "grad_norm": 3.7710654735565186, + "learning_rate": 2.3221649484536085e-06, + "logits/chosen": 7.682502746582031, + "logits/rejected": 6.702328205108643, + "logps/chosen": -202.08265686035156, + "logps/rejected": -177.52894592285156, + "loss": 0.6666, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.08657179772853851, + "rewards/margins": 0.058722883462905884, + "rewards/rejected": 0.02784891426563263, + "step": 901 + }, + { + "epoch": 0.13949352406727236, + "grad_norm": 5.264594554901123, + "learning_rate": 2.3247422680412373e-06, + "logits/chosen": 12.423885345458984, + "logits/rejected": 9.60908317565918, + "logps/chosen": -234.58888244628906, + "logps/rejected": -198.98355102539062, + "loss": 0.7939, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.0695679634809494, + "rewards/margins": -0.1825190633535385, + "rewards/rejected": 0.11295108497142792, + "step": 902 + }, + { + "epoch": 0.13964817320703654, + "grad_norm": 4.88804817199707, + "learning_rate": 2.3273195876288662e-06, + "logits/chosen": 6.858978271484375, + "logits/rejected": 7.906588554382324, + "logps/chosen": -266.4230041503906, + "logps/rejected": -273.89398193359375, + "loss": 0.671, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.08675932884216309, + "rewards/margins": 0.05689845606684685, + "rewards/rejected": 0.02986087277531624, + "step": 903 + }, + { + "epoch": 0.1398028223468007, + "grad_norm": 5.948923587799072, + "learning_rate": 2.329896907216495e-06, + "logits/chosen": 10.992117881774902, + "logits/rejected": 6.869849681854248, + "logps/chosen": -309.8206481933594, + "logps/rejected": -274.9728698730469, + "loss": 0.6462, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.08765468746423721, + "rewards/margins": 0.12816838920116425, + "rewards/rejected": -0.04051370918750763, + "step": 904 + }, + { + "epoch": 0.13995747148656484, + "grad_norm": 4.1175031661987305, + "learning_rate": 2.332474226804124e-06, + "logits/chosen": 11.287524223327637, + "logits/rejected": 7.089748382568359, + "logps/chosen": -247.5906524658203, + "logps/rejected": -220.42759704589844, + "loss": 0.6206, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.09364671260118484, + "rewards/margins": 0.16723881661891937, + "rewards/rejected": -0.07359209656715393, + "step": 905 + }, + { + "epoch": 0.14011212062632902, + "grad_norm": 6.615764141082764, + "learning_rate": 2.335051546391753e-06, + "logits/chosen": 12.893607139587402, + "logits/rejected": 8.206303596496582, + "logps/chosen": -479.9052429199219, + "logps/rejected": -372.7142333984375, + "loss": 0.6399, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.2191700041294098, + "rewards/margins": 0.14063653349876404, + "rewards/rejected": 0.07853345572948456, + "step": 906 + }, + { + "epoch": 0.14026676976609317, + "grad_norm": 4.0273823738098145, + "learning_rate": 2.3376288659793817e-06, + "logits/chosen": 5.282630443572998, + "logits/rejected": 6.603236198425293, + "logps/chosen": -156.4139862060547, + "logps/rejected": -171.40670776367188, + "loss": 0.6445, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.06814494729042053, + "rewards/margins": 0.11642909049987793, + "rewards/rejected": -0.048284150660037994, + "step": 907 + }, + { + "epoch": 0.14042141890585733, + "grad_norm": 7.370420932769775, + "learning_rate": 2.3402061855670106e-06, + "logits/chosen": 8.733667373657227, + "logits/rejected": 4.371000289916992, + "logps/chosen": -297.81390380859375, + "logps/rejected": -174.73419189453125, + "loss": 0.8273, + "rewards/accuracies": 0.0, + "rewards/chosen": -0.11691976338624954, + "rewards/margins": -0.24147753417491913, + "rewards/rejected": 0.12455777823925018, + "step": 908 + }, + { + "epoch": 0.1405760680456215, + "grad_norm": 5.129427433013916, + "learning_rate": 2.3427835051546394e-06, + "logits/chosen": 9.334989547729492, + "logits/rejected": 10.864970207214355, + "logps/chosen": -289.06121826171875, + "logps/rejected": -258.4640197753906, + "loss": 0.677, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.03410263732075691, + "rewards/margins": 0.038875989615917206, + "rewards/rejected": -0.004773356020450592, + "step": 909 + }, + { + "epoch": 0.14073071718538566, + "grad_norm": 6.479569911956787, + "learning_rate": 2.3453608247422683e-06, + "logits/chosen": 15.983110427856445, + "logits/rejected": 11.189767837524414, + "logps/chosen": -448.77374267578125, + "logps/rejected": -292.93084716796875, + "loss": 0.7404, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.11327371001243591, + "rewards/margins": -0.028903141617774963, + "rewards/rejected": 0.14217686653137207, + "step": 910 + }, + { + "epoch": 0.1408853663251498, + "grad_norm": 10.384605407714844, + "learning_rate": 2.347938144329897e-06, + "logits/chosen": 13.814935684204102, + "logits/rejected": 12.97150707244873, + "logps/chosen": -205.5994110107422, + "logps/rejected": -240.3565216064453, + "loss": 0.7341, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.015475651249289513, + "rewards/margins": -0.06577196717262268, + "rewards/rejected": 0.08124761283397675, + "step": 911 + }, + { + "epoch": 0.141040015464914, + "grad_norm": 4.507956027984619, + "learning_rate": 2.350515463917526e-06, + "logits/chosen": 13.520294189453125, + "logits/rejected": 10.504230499267578, + "logps/chosen": -213.5272979736328, + "logps/rejected": -166.77674865722656, + "loss": 0.6763, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.038066767156124115, + "rewards/margins": 0.054728999733924866, + "rewards/rejected": -0.09279575943946838, + "step": 912 + }, + { + "epoch": 0.14119466460467814, + "grad_norm": 5.875748634338379, + "learning_rate": 2.353092783505155e-06, + "logits/chosen": 14.746333122253418, + "logits/rejected": 9.695646286010742, + "logps/chosen": -336.8843078613281, + "logps/rejected": -286.27386474609375, + "loss": 0.6768, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.11925964802503586, + "rewards/margins": 0.04703053832054138, + "rewards/rejected": 0.07222909480333328, + "step": 913 + }, + { + "epoch": 0.1413493137444423, + "grad_norm": 7.717573165893555, + "learning_rate": 2.355670103092784e-06, + "logits/chosen": 10.052270889282227, + "logits/rejected": 11.42772388458252, + "logps/chosen": -309.8962097167969, + "logps/rejected": -310.22540283203125, + "loss": 0.8004, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.023360159248113632, + "rewards/margins": -0.15259763598442078, + "rewards/rejected": 0.1759577989578247, + "step": 914 + }, + { + "epoch": 0.14150396288420647, + "grad_norm": 6.7534613609313965, + "learning_rate": 2.3582474226804127e-06, + "logits/chosen": 5.907132148742676, + "logits/rejected": 10.063715934753418, + "logps/chosen": -264.49053955078125, + "logps/rejected": -345.8446044921875, + "loss": 0.6768, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.043520018458366394, + "rewards/margins": 0.04153170436620712, + "rewards/rejected": 0.0019883131608366966, + "step": 915 + }, + { + "epoch": 0.14165861202397062, + "grad_norm": 6.351677894592285, + "learning_rate": 2.3608247422680415e-06, + "logits/chosen": 8.423628807067871, + "logits/rejected": 8.350414276123047, + "logps/chosen": -267.1183776855469, + "logps/rejected": -242.9943084716797, + "loss": 0.7203, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.02573833242058754, + "rewards/margins": -0.041520215570926666, + "rewards/rejected": 0.0672585517168045, + "step": 916 + }, + { + "epoch": 0.14181326116373477, + "grad_norm": 5.402352809906006, + "learning_rate": 2.3634020618556704e-06, + "logits/chosen": 8.543071746826172, + "logits/rejected": 6.9717206954956055, + "logps/chosen": -259.31573486328125, + "logps/rejected": -240.54495239257812, + "loss": 0.7229, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.004346180707216263, + "rewards/margins": -0.04667573422193527, + "rewards/rejected": 0.04232954978942871, + "step": 917 + }, + { + "epoch": 0.14196791030349892, + "grad_norm": 6.482724189758301, + "learning_rate": 2.3659793814432993e-06, + "logits/chosen": 10.48664379119873, + "logits/rejected": 2.9964404106140137, + "logps/chosen": -276.0944519042969, + "logps/rejected": -153.4932403564453, + "loss": 0.6688, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.031136134639382362, + "rewards/margins": 0.057294465601444244, + "rewards/rejected": -0.02615833468735218, + "step": 918 + }, + { + "epoch": 0.1421225594432631, + "grad_norm": 5.261061668395996, + "learning_rate": 2.3685567010309277e-06, + "logits/chosen": 8.841046333312988, + "logits/rejected": 12.002863883972168, + "logps/chosen": -242.5989990234375, + "logps/rejected": -229.36228942871094, + "loss": 0.7317, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.08840465545654297, + "rewards/margins": -0.05870799720287323, + "rewards/rejected": 0.1471126675605774, + "step": 919 + }, + { + "epoch": 0.14227720858302725, + "grad_norm": 7.081851482391357, + "learning_rate": 2.3711340206185566e-06, + "logits/chosen": 10.484249114990234, + "logits/rejected": 10.433899879455566, + "logps/chosen": -339.4588623046875, + "logps/rejected": -397.8988037109375, + "loss": 0.6804, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.20025300979614258, + "rewards/margins": 0.05834160000085831, + "rewards/rejected": 0.14191141724586487, + "step": 920 + }, + { + "epoch": 0.1424318577227914, + "grad_norm": 4.927915096282959, + "learning_rate": 2.3737113402061855e-06, + "logits/chosen": 8.86042308807373, + "logits/rejected": 8.4859619140625, + "logps/chosen": -257.8925476074219, + "logps/rejected": -276.67095947265625, + "loss": 0.696, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.11479340493679047, + "rewards/margins": 0.002673577517271042, + "rewards/rejected": 0.11211982369422913, + "step": 921 + }, + { + "epoch": 0.14258650686255558, + "grad_norm": 5.104118824005127, + "learning_rate": 2.3762886597938144e-06, + "logits/chosen": 7.407227516174316, + "logits/rejected": 10.410012245178223, + "logps/chosen": -181.71759033203125, + "logps/rejected": -241.81973266601562, + "loss": 0.7042, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.045998621731996536, + "rewards/margins": -0.004401572048664093, + "rewards/rejected": 0.050400182604789734, + "step": 922 + }, + { + "epoch": 0.14274115600231974, + "grad_norm": 3.99607253074646, + "learning_rate": 2.3788659793814432e-06, + "logits/chosen": 13.399528503417969, + "logits/rejected": 10.451894760131836, + "logps/chosen": -239.62503051757812, + "logps/rejected": -196.5133056640625, + "loss": 0.6565, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.13701358437538147, + "rewards/margins": 0.08184747397899628, + "rewards/rejected": 0.055166102945804596, + "step": 923 + }, + { + "epoch": 0.1428958051420839, + "grad_norm": 4.764994144439697, + "learning_rate": 2.381443298969072e-06, + "logits/chosen": 7.255903720855713, + "logits/rejected": 4.167789936065674, + "logps/chosen": -216.55809020996094, + "logps/rejected": -162.16822814941406, + "loss": 0.6769, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.12852072715759277, + "rewards/margins": 0.0515337809920311, + "rewards/rejected": 0.07698693871498108, + "step": 924 + }, + { + "epoch": 0.14305045428184807, + "grad_norm": 6.34387731552124, + "learning_rate": 2.3840206185567014e-06, + "logits/chosen": 11.356218338012695, + "logits/rejected": 6.107921600341797, + "logps/chosen": -232.00921630859375, + "logps/rejected": -195.72372436523438, + "loss": 0.6688, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.04922781139612198, + "rewards/margins": 0.07268504798412323, + "rewards/rejected": -0.0234572384506464, + "step": 925 + }, + { + "epoch": 0.14320510342161222, + "grad_norm": 8.591683387756348, + "learning_rate": 2.3865979381443303e-06, + "logits/chosen": 4.65860652923584, + "logits/rejected": 3.262638568878174, + "logps/chosen": -295.3010559082031, + "logps/rejected": -261.58953857421875, + "loss": 0.6899, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.05151667818427086, + "rewards/margins": 0.06415456533432007, + "rewards/rejected": -0.012637898325920105, + "step": 926 + }, + { + "epoch": 0.14335975256137637, + "grad_norm": 4.3053879737854, + "learning_rate": 2.389175257731959e-06, + "logits/chosen": 11.637528419494629, + "logits/rejected": 5.480941295623779, + "logps/chosen": -199.94320678710938, + "logps/rejected": -131.50418090820312, + "loss": 0.656, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.01199965551495552, + "rewards/margins": 0.08124446868896484, + "rewards/rejected": -0.06924481689929962, + "step": 927 + }, + { + "epoch": 0.14351440170114055, + "grad_norm": 4.252354621887207, + "learning_rate": 2.391752577319588e-06, + "logits/chosen": 8.545031547546387, + "logits/rejected": 5.23971700668335, + "logps/chosen": -209.64871215820312, + "logps/rejected": -168.820556640625, + "loss": 0.6862, + "rewards/accuracies": 0.25, + "rewards/chosen": 0.12477627396583557, + "rewards/margins": 0.04728969931602478, + "rewards/rejected": 0.0774865597486496, + "step": 928 + }, + { + "epoch": 0.1436690508409047, + "grad_norm": 4.484482765197754, + "learning_rate": 2.394329896907217e-06, + "logits/chosen": 11.50522518157959, + "logits/rejected": 6.6114325523376465, + "logps/chosen": -207.19639587402344, + "logps/rejected": -205.10179138183594, + "loss": 0.6751, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.1282355785369873, + "rewards/margins": 0.05071539431810379, + "rewards/rejected": 0.07752018421888351, + "step": 929 + }, + { + "epoch": 0.14382369998066885, + "grad_norm": 6.674232482910156, + "learning_rate": 2.3969072164948458e-06, + "logits/chosen": 12.247712135314941, + "logits/rejected": 9.308394432067871, + "logps/chosen": -376.05029296875, + "logps/rejected": -356.26300048828125, + "loss": 0.6671, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.1441488265991211, + "rewards/margins": 0.06759348511695862, + "rewards/rejected": 0.07655534893274307, + "step": 930 + }, + { + "epoch": 0.14397834912043303, + "grad_norm": 6.294856071472168, + "learning_rate": 2.3994845360824746e-06, + "logits/chosen": 9.637535095214844, + "logits/rejected": 6.786552429199219, + "logps/chosen": -359.70330810546875, + "logps/rejected": -263.89666748046875, + "loss": 0.6523, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.12031211704015732, + "rewards/margins": 0.11500054597854614, + "rewards/rejected": 0.005311585962772369, + "step": 931 + }, + { + "epoch": 0.14413299826019718, + "grad_norm": 7.539927005767822, + "learning_rate": 2.4020618556701035e-06, + "logits/chosen": 12.931175231933594, + "logits/rejected": 6.994305610656738, + "logps/chosen": -425.36505126953125, + "logps/rejected": -248.13870239257812, + "loss": 0.681, + "rewards/accuracies": 0.25, + "rewards/chosen": -0.020102309063076973, + "rewards/margins": 0.04040088132023811, + "rewards/rejected": -0.060503195971250534, + "step": 932 + }, + { + "epoch": 0.14428764739996133, + "grad_norm": 4.166161060333252, + "learning_rate": 2.4046391752577324e-06, + "logits/chosen": 13.583078384399414, + "logits/rejected": 8.128865242004395, + "logps/chosen": -266.9848937988281, + "logps/rejected": -185.58139038085938, + "loss": 0.6447, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.11701817810535431, + "rewards/margins": 0.11121020466089249, + "rewards/rejected": 0.005807972513139248, + "step": 933 + }, + { + "epoch": 0.14444229653972548, + "grad_norm": 4.818428993225098, + "learning_rate": 2.4072164948453612e-06, + "logits/chosen": 9.980958938598633, + "logits/rejected": 6.4126715660095215, + "logps/chosen": -236.86404418945312, + "logps/rejected": -190.4046630859375, + "loss": 0.7017, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.17218610644340515, + "rewards/margins": -0.009835533797740936, + "rewards/rejected": 0.1820216178894043, + "step": 934 + }, + { + "epoch": 0.14459694567948966, + "grad_norm": 4.900993347167969, + "learning_rate": 2.40979381443299e-06, + "logits/chosen": 16.548980712890625, + "logits/rejected": 13.31427001953125, + "logps/chosen": -278.5684814453125, + "logps/rejected": -290.00640869140625, + "loss": 0.7218, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.08795805275440216, + "rewards/margins": -0.0549248643219471, + "rewards/rejected": 0.14288291335105896, + "step": 935 + }, + { + "epoch": 0.14475159481925381, + "grad_norm": 5.169083595275879, + "learning_rate": 2.412371134020619e-06, + "logits/chosen": 13.235276222229004, + "logits/rejected": 8.044557571411133, + "logps/chosen": -321.1685485839844, + "logps/rejected": -253.15933227539062, + "loss": 0.6085, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.3566027581691742, + "rewards/margins": 0.19021005928516388, + "rewards/rejected": 0.1663927137851715, + "step": 936 + }, + { + "epoch": 0.14490624395901797, + "grad_norm": 5.428519248962402, + "learning_rate": 2.414948453608248e-06, + "logits/chosen": 13.332061767578125, + "logits/rejected": 6.041755676269531, + "logps/chosen": -327.3083801269531, + "logps/rejected": -182.99327087402344, + "loss": 0.6194, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.03844413906335831, + "rewards/margins": 0.16897039115428925, + "rewards/rejected": -0.13052625954151154, + "step": 937 + }, + { + "epoch": 0.14506089309878215, + "grad_norm": 4.59999418258667, + "learning_rate": 2.4175257731958763e-06, + "logits/chosen": 7.4586687088012695, + "logits/rejected": 11.942691802978516, + "logps/chosen": -253.45135498046875, + "logps/rejected": -333.246826171875, + "loss": 0.6754, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.07155981659889221, + "rewards/margins": 0.04269304499030113, + "rewards/rejected": 0.028866764158010483, + "step": 938 + }, + { + "epoch": 0.1452155422385463, + "grad_norm": 4.321788787841797, + "learning_rate": 2.420103092783505e-06, + "logits/chosen": 10.987452507019043, + "logits/rejected": 5.7488250732421875, + "logps/chosen": -190.94424438476562, + "logps/rejected": -192.43084716796875, + "loss": 0.6624, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.06689973175525665, + "rewards/margins": 0.07895570248365402, + "rewards/rejected": -0.01205596886575222, + "step": 939 + }, + { + "epoch": 0.14537019137831045, + "grad_norm": 7.607157230377197, + "learning_rate": 2.422680412371134e-06, + "logits/chosen": 9.883402824401855, + "logits/rejected": 6.4782843589782715, + "logps/chosen": -209.8402099609375, + "logps/rejected": -215.99461364746094, + "loss": 0.722, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.12372054904699326, + "rewards/margins": -0.02321319654583931, + "rewards/rejected": 0.14693374931812286, + "step": 940 + }, + { + "epoch": 0.14552484051807463, + "grad_norm": 4.300503730773926, + "learning_rate": 2.425257731958763e-06, + "logits/chosen": 11.408533096313477, + "logits/rejected": 12.018718719482422, + "logps/chosen": -218.31005859375, + "logps/rejected": -237.34414672851562, + "loss": 0.6743, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.10544481873512268, + "rewards/margins": 0.042931847274303436, + "rewards/rejected": 0.06251297146081924, + "step": 941 + }, + { + "epoch": 0.14567948965783878, + "grad_norm": 3.9465460777282715, + "learning_rate": 2.427835051546392e-06, + "logits/chosen": 11.535292625427246, + "logits/rejected": 7.290932655334473, + "logps/chosen": -203.159912109375, + "logps/rejected": -154.8922882080078, + "loss": 0.6419, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.12058965861797333, + "rewards/margins": 0.11256952583789825, + "rewards/rejected": 0.008020136505365372, + "step": 942 + }, + { + "epoch": 0.14583413879760293, + "grad_norm": 7.470331192016602, + "learning_rate": 2.4304123711340207e-06, + "logits/chosen": 6.8543548583984375, + "logits/rejected": 3.3807930946350098, + "logps/chosen": -217.67507934570312, + "logps/rejected": -208.99429321289062, + "loss": 0.6889, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.16164270043373108, + "rewards/margins": 0.032285548746585846, + "rewards/rejected": 0.12935715913772583, + "step": 943 + }, + { + "epoch": 0.1459887879373671, + "grad_norm": 10.592182159423828, + "learning_rate": 2.4329896907216495e-06, + "logits/chosen": 12.615392684936523, + "logits/rejected": 5.006398677825928, + "logps/chosen": -511.2911376953125, + "logps/rejected": -286.5256042480469, + "loss": 0.6697, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.17020931839942932, + "rewards/margins": 0.05973578244447708, + "rewards/rejected": 0.11047354340553284, + "step": 944 + }, + { + "epoch": 0.14614343707713126, + "grad_norm": 5.410158157348633, + "learning_rate": 2.4355670103092784e-06, + "logits/chosen": 10.480123519897461, + "logits/rejected": 7.627450942993164, + "logps/chosen": -333.2989196777344, + "logps/rejected": -249.7787628173828, + "loss": 0.7145, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.14121408760547638, + "rewards/margins": -0.02265309914946556, + "rewards/rejected": 0.16386719048023224, + "step": 945 + }, + { + "epoch": 0.1462980862168954, + "grad_norm": 6.3184404373168945, + "learning_rate": 2.4381443298969073e-06, + "logits/chosen": 7.971908092498779, + "logits/rejected": 11.381896018981934, + "logps/chosen": -269.70904541015625, + "logps/rejected": -297.00341796875, + "loss": 0.6728, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.3306790292263031, + "rewards/margins": 0.054555416107177734, + "rewards/rejected": 0.27612361311912537, + "step": 946 + }, + { + "epoch": 0.1464527353566596, + "grad_norm": 4.406201362609863, + "learning_rate": 2.440721649484536e-06, + "logits/chosen": 15.634912490844727, + "logits/rejected": 8.238141059875488, + "logps/chosen": -226.30343627929688, + "logps/rejected": -154.6337890625, + "loss": 0.6622, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.25033968687057495, + "rewards/margins": 0.07584743201732635, + "rewards/rejected": 0.1744922399520874, + "step": 947 + }, + { + "epoch": 0.14660738449642374, + "grad_norm": 5.298978328704834, + "learning_rate": 2.443298969072165e-06, + "logits/chosen": 10.781770706176758, + "logits/rejected": 7.479361057281494, + "logps/chosen": -321.77734375, + "logps/rejected": -233.26644897460938, + "loss": 0.6438, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.12140927463769913, + "rewards/margins": 0.12022820115089417, + "rewards/rejected": 0.0011810734868049622, + "step": 948 + }, + { + "epoch": 0.1467620336361879, + "grad_norm": 4.054439067840576, + "learning_rate": 2.445876288659794e-06, + "logits/chosen": 9.1417818069458, + "logits/rejected": 5.940650939941406, + "logps/chosen": -210.92037963867188, + "logps/rejected": -197.6656951904297, + "loss": 0.6272, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.11081576347351074, + "rewards/margins": 0.1395270824432373, + "rewards/rejected": -0.028711318969726562, + "step": 949 + }, + { + "epoch": 0.14691668277595205, + "grad_norm": 4.954822540283203, + "learning_rate": 2.4484536082474228e-06, + "logits/chosen": 8.866207122802734, + "logits/rejected": 7.824855804443359, + "logps/chosen": -202.92568969726562, + "logps/rejected": -220.5973663330078, + "loss": 0.7237, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.1889454871416092, + "rewards/margins": -0.035764746367931366, + "rewards/rejected": 0.22471022605895996, + "step": 950 + }, + { + "epoch": 0.14707133191571622, + "grad_norm": 4.961806297302246, + "learning_rate": 2.4510309278350516e-06, + "logits/chosen": 6.981586456298828, + "logits/rejected": 3.450075626373291, + "logps/chosen": -274.5780944824219, + "logps/rejected": -225.8917694091797, + "loss": 0.6571, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.1524912416934967, + "rewards/margins": 0.07941422611474991, + "rewards/rejected": 0.07307702302932739, + "step": 951 + }, + { + "epoch": 0.14722598105548038, + "grad_norm": 4.451239585876465, + "learning_rate": 2.4536082474226805e-06, + "logits/chosen": 11.398658752441406, + "logits/rejected": 0.2986936569213867, + "logps/chosen": -301.5375061035156, + "logps/rejected": -183.1888885498047, + "loss": 0.6436, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.22814522683620453, + "rewards/margins": 0.12175770103931427, + "rewards/rejected": 0.10638751834630966, + "step": 952 + }, + { + "epoch": 0.14738063019524453, + "grad_norm": 9.133742332458496, + "learning_rate": 2.4561855670103094e-06, + "logits/chosen": 11.362016677856445, + "logits/rejected": 10.932202339172363, + "logps/chosen": -322.4869079589844, + "logps/rejected": -290.24273681640625, + "loss": 0.6921, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.12620776891708374, + "rewards/margins": 0.011840078979730606, + "rewards/rejected": 0.11436767131090164, + "step": 953 + }, + { + "epoch": 0.1475352793350087, + "grad_norm": 4.968396186828613, + "learning_rate": 2.4587628865979383e-06, + "logits/chosen": 11.792253494262695, + "logits/rejected": 11.920696258544922, + "logps/chosen": -252.83316040039062, + "logps/rejected": -294.8987731933594, + "loss": 0.7015, + "rewards/accuracies": 0.25, + "rewards/chosen": 0.14753074944019318, + "rewards/margins": -0.0014636069536209106, + "rewards/rejected": 0.1489943563938141, + "step": 954 + }, + { + "epoch": 0.14768992847477286, + "grad_norm": 4.555266857147217, + "learning_rate": 2.4613402061855676e-06, + "logits/chosen": 9.884590148925781, + "logits/rejected": 7.298913478851318, + "logps/chosen": -258.48516845703125, + "logps/rejected": -205.30691528320312, + "loss": 0.6705, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.19206787645816803, + "rewards/margins": 0.048691462725400925, + "rewards/rejected": 0.1433764100074768, + "step": 955 + }, + { + "epoch": 0.147844577614537, + "grad_norm": 7.090183734893799, + "learning_rate": 2.463917525773196e-06, + "logits/chosen": 10.31575870513916, + "logits/rejected": 11.121665000915527, + "logps/chosen": -268.80474853515625, + "logps/rejected": -319.9278564453125, + "loss": 0.738, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.20418377220630646, + "rewards/margins": -0.06507459282875061, + "rewards/rejected": 0.2692583501338959, + "step": 956 + }, + { + "epoch": 0.1479992267543012, + "grad_norm": 18.6771297454834, + "learning_rate": 2.466494845360825e-06, + "logits/chosen": 13.731008529663086, + "logits/rejected": 8.2936429977417, + "logps/chosen": -417.11334228515625, + "logps/rejected": -276.3769836425781, + "loss": 0.7469, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.30361098051071167, + "rewards/margins": -0.09678135812282562, + "rewards/rejected": 0.4003923535346985, + "step": 957 + }, + { + "epoch": 0.14815387589406534, + "grad_norm": 5.036862850189209, + "learning_rate": 2.4690721649484537e-06, + "logits/chosen": 5.275806427001953, + "logits/rejected": 12.655960083007812, + "logps/chosen": -152.2441864013672, + "logps/rejected": -290.1501159667969, + "loss": 0.68, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.20953655242919922, + "rewards/margins": 0.030674651265144348, + "rewards/rejected": 0.17886190116405487, + "step": 958 + }, + { + "epoch": 0.1483085250338295, + "grad_norm": 7.2273101806640625, + "learning_rate": 2.4716494845360826e-06, + "logits/chosen": 11.772583961486816, + "logits/rejected": 7.524477005004883, + "logps/chosen": -416.39324951171875, + "logps/rejected": -290.85516357421875, + "loss": 0.7542, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.06137719005346298, + "rewards/margins": -0.1101866289973259, + "rewards/rejected": 0.17156381905078888, + "step": 959 + }, + { + "epoch": 0.14846317417359367, + "grad_norm": 5.530838966369629, + "learning_rate": 2.4742268041237115e-06, + "logits/chosen": 12.333992004394531, + "logits/rejected": 6.999913215637207, + "logps/chosen": -341.40716552734375, + "logps/rejected": -283.63714599609375, + "loss": 0.6264, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.29755330085754395, + "rewards/margins": 0.1672951579093933, + "rewards/rejected": 0.13025812804698944, + "step": 960 + }, + { + "epoch": 0.14861782331335782, + "grad_norm": 5.298342227935791, + "learning_rate": 2.4768041237113404e-06, + "logits/chosen": 12.743539810180664, + "logits/rejected": 6.548881530761719, + "logps/chosen": -312.2908630371094, + "logps/rejected": -227.13064575195312, + "loss": 0.6071, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.3252825438976288, + "rewards/margins": 0.20396390557289124, + "rewards/rejected": 0.12131863087415695, + "step": 961 + }, + { + "epoch": 0.14877247245312197, + "grad_norm": 5.345992088317871, + "learning_rate": 2.4793814432989692e-06, + "logits/chosen": 6.8151116371154785, + "logits/rejected": 9.889668464660645, + "logps/chosen": -192.15992736816406, + "logps/rejected": -223.66824340820312, + "loss": 0.7355, + "rewards/accuracies": 0.125, + "rewards/chosen": 0.10050278156995773, + "rewards/margins": -0.07099009305238724, + "rewards/rejected": 0.17149285972118378, + "step": 962 + }, + { + "epoch": 0.14892712159288615, + "grad_norm": 5.785558700561523, + "learning_rate": 2.481958762886598e-06, + "logits/chosen": 5.8352203369140625, + "logits/rejected": 8.236224174499512, + "logps/chosen": -288.91510009765625, + "logps/rejected": -382.5042724609375, + "loss": 0.651, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.3705422580242157, + "rewards/margins": 0.12195216119289398, + "rewards/rejected": 0.24859008193016052, + "step": 963 + }, + { + "epoch": 0.1490817707326503, + "grad_norm": 5.374144554138184, + "learning_rate": 2.484536082474227e-06, + "logits/chosen": 16.47252655029297, + "logits/rejected": 11.239618301391602, + "logps/chosen": -274.526611328125, + "logps/rejected": -278.0457763671875, + "loss": 0.7024, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.21998809278011322, + "rewards/margins": -0.016491007059812546, + "rewards/rejected": 0.23647911846637726, + "step": 964 + }, + { + "epoch": 0.14923641987241446, + "grad_norm": 4.982558727264404, + "learning_rate": 2.487113402061856e-06, + "logits/chosen": 6.582665920257568, + "logits/rejected": 5.46577262878418, + "logps/chosen": -309.485107421875, + "logps/rejected": -185.4723663330078, + "loss": 0.6505, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.28469952940940857, + "rewards/margins": 0.10418701171875, + "rewards/rejected": 0.18051251769065857, + "step": 965 + }, + { + "epoch": 0.1493910690121786, + "grad_norm": 5.450310707092285, + "learning_rate": 2.4896907216494847e-06, + "logits/chosen": 15.865681648254395, + "logits/rejected": 8.391288757324219, + "logps/chosen": -343.5689392089844, + "logps/rejected": -308.62872314453125, + "loss": 0.6467, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.3063068389892578, + "rewards/margins": 0.09905394911766052, + "rewards/rejected": 0.2072528898715973, + "step": 966 + }, + { + "epoch": 0.14954571815194279, + "grad_norm": 16.31694221496582, + "learning_rate": 2.4922680412371136e-06, + "logits/chosen": 10.372198104858398, + "logits/rejected": 5.018614292144775, + "logps/chosen": -255.9956512451172, + "logps/rejected": -165.75328063964844, + "loss": 0.757, + "rewards/accuracies": 0.25, + "rewards/chosen": 0.04015684127807617, + "rewards/margins": -0.10746342688798904, + "rewards/rejected": 0.14762026071548462, + "step": 967 + }, + { + "epoch": 0.14970036729170694, + "grad_norm": 5.886409282684326, + "learning_rate": 2.4948453608247425e-06, + "logits/chosen": 9.90416145324707, + "logits/rejected": 7.3132476806640625, + "logps/chosen": -318.612548828125, + "logps/rejected": -303.4840087890625, + "loss": 0.5518, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.5386236310005188, + "rewards/margins": 0.31132185459136963, + "rewards/rejected": 0.22730179131031036, + "step": 968 + }, + { + "epoch": 0.1498550164314711, + "grad_norm": 9.646836280822754, + "learning_rate": 2.4974226804123713e-06, + "logits/chosen": 14.051658630371094, + "logits/rejected": 7.878078460693359, + "logps/chosen": -494.6185607910156, + "logps/rejected": -331.8997802734375, + "loss": 0.7778, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.27219200134277344, + "rewards/margins": -0.10797785967588425, + "rewards/rejected": 0.3801698684692383, + "step": 969 + }, + { + "epoch": 0.15000966557123527, + "grad_norm": 4.787323474884033, + "learning_rate": 2.5e-06, + "logits/chosen": 13.235980987548828, + "logits/rejected": 9.642841339111328, + "logps/chosen": -281.2373046875, + "logps/rejected": -190.06008911132812, + "loss": 0.6396, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.2701765298843384, + "rewards/margins": 0.14296922087669373, + "rewards/rejected": 0.12720732390880585, + "step": 970 + }, + { + "epoch": 0.15016431471099942, + "grad_norm": 4.692470550537109, + "learning_rate": 2.502577319587629e-06, + "logits/chosen": 9.996617317199707, + "logits/rejected": 9.307639122009277, + "logps/chosen": -247.04067993164062, + "logps/rejected": -222.12002563476562, + "loss": 0.6645, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.27485352754592896, + "rewards/margins": 0.07333764433860779, + "rewards/rejected": 0.20151585340499878, + "step": 971 + }, + { + "epoch": 0.15031896385076357, + "grad_norm": 4.71809720993042, + "learning_rate": 2.505154639175258e-06, + "logits/chosen": 5.420339107513428, + "logits/rejected": 3.3367981910705566, + "logps/chosen": -228.22998046875, + "logps/rejected": -163.241943359375, + "loss": 0.6923, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.14215907454490662, + "rewards/margins": 0.008592421188950539, + "rewards/rejected": 0.13356666266918182, + "step": 972 + }, + { + "epoch": 0.15047361299052775, + "grad_norm": 6.231609344482422, + "learning_rate": 2.507731958762887e-06, + "logits/chosen": 8.344324111938477, + "logits/rejected": 5.516124725341797, + "logps/chosen": -328.7281494140625, + "logps/rejected": -370.429443359375, + "loss": 0.6591, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.29244059324264526, + "rewards/margins": 0.09757347404956818, + "rewards/rejected": 0.19486714899539948, + "step": 973 + }, + { + "epoch": 0.1506282621302919, + "grad_norm": 7.182100296020508, + "learning_rate": 2.5103092783505157e-06, + "logits/chosen": 7.675025939941406, + "logits/rejected": 1.1511075496673584, + "logps/chosen": -493.1493225097656, + "logps/rejected": -235.66189575195312, + "loss": 0.7178, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.11076326668262482, + "rewards/margins": -0.0384642668068409, + "rewards/rejected": 0.14922752976417542, + "step": 974 + }, + { + "epoch": 0.15078291127005605, + "grad_norm": 6.490096092224121, + "learning_rate": 2.5128865979381446e-06, + "logits/chosen": 11.676152229309082, + "logits/rejected": 5.522094249725342, + "logps/chosen": -347.0611877441406, + "logps/rejected": -304.6292419433594, + "loss": 0.6271, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.4794609546661377, + "rewards/margins": 0.15278568863868713, + "rewards/rejected": 0.3266752362251282, + "step": 975 + }, + { + "epoch": 0.15093756040982023, + "grad_norm": 3.932523488998413, + "learning_rate": 2.5154639175257734e-06, + "logits/chosen": 8.104815483093262, + "logits/rejected": 7.596432685852051, + "logps/chosen": -187.56895446777344, + "logps/rejected": -168.3927459716797, + "loss": 0.7214, + "rewards/accuracies": 0.25, + "rewards/chosen": 0.22360765933990479, + "rewards/margins": -0.04936189949512482, + "rewards/rejected": 0.2729695737361908, + "step": 976 + }, + { + "epoch": 0.15109220954958438, + "grad_norm": 4.268491268157959, + "learning_rate": 2.5180412371134023e-06, + "logits/chosen": 5.396533489227295, + "logits/rejected": 8.581335067749023, + "logps/chosen": -397.6007080078125, + "logps/rejected": -239.90518188476562, + "loss": 0.6384, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.2249927520751953, + "rewards/margins": 0.1317373365163803, + "rewards/rejected": 0.0932554304599762, + "step": 977 + }, + { + "epoch": 0.15124685868934853, + "grad_norm": 4.023215293884277, + "learning_rate": 2.520618556701031e-06, + "logits/chosen": 10.017741203308105, + "logits/rejected": 9.831050872802734, + "logps/chosen": -144.67971801757812, + "logps/rejected": -177.0681610107422, + "loss": 0.6578, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.22600960731506348, + "rewards/margins": 0.08895401656627655, + "rewards/rejected": 0.13705560564994812, + "step": 978 + }, + { + "epoch": 0.1514015078291127, + "grad_norm": 4.044947624206543, + "learning_rate": 2.52319587628866e-06, + "logits/chosen": 13.742888450622559, + "logits/rejected": 6.837340354919434, + "logps/chosen": -289.7095642089844, + "logps/rejected": -149.99945068359375, + "loss": 0.5796, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.3312702775001526, + "rewards/margins": 0.2694663405418396, + "rewards/rejected": 0.0618039146065712, + "step": 979 + }, + { + "epoch": 0.15155615696887687, + "grad_norm": 4.179415702819824, + "learning_rate": 2.525773195876289e-06, + "logits/chosen": 9.955364227294922, + "logits/rejected": 7.088047027587891, + "logps/chosen": -235.14581298828125, + "logps/rejected": -220.76541137695312, + "loss": 0.6716, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.22456562519073486, + "rewards/margins": 0.0635136216878891, + "rewards/rejected": 0.16105200350284576, + "step": 980 + }, + { + "epoch": 0.15171080610864102, + "grad_norm": 5.6151275634765625, + "learning_rate": 2.528350515463918e-06, + "logits/chosen": 16.08432960510254, + "logits/rejected": 9.709497451782227, + "logps/chosen": -319.97003173828125, + "logps/rejected": -290.68487548828125, + "loss": 0.7048, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.2854556143283844, + "rewards/margins": 0.00053425133228302, + "rewards/rejected": 0.2849213778972626, + "step": 981 + }, + { + "epoch": 0.15186545524840517, + "grad_norm": 5.1181769371032715, + "learning_rate": 2.5309278350515467e-06, + "logits/chosen": 11.758771896362305, + "logits/rejected": 11.075485229492188, + "logps/chosen": -256.3808898925781, + "logps/rejected": -235.63259887695312, + "loss": 0.6035, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.3456147313117981, + "rewards/margins": 0.23047533631324768, + "rewards/rejected": 0.11513939499855042, + "step": 982 + }, + { + "epoch": 0.15202010438816935, + "grad_norm": 5.064818859100342, + "learning_rate": 2.5335051546391755e-06, + "logits/chosen": 4.538435935974121, + "logits/rejected": 0.7975058555603027, + "logps/chosen": -129.87490844726562, + "logps/rejected": -97.656982421875, + "loss": 0.7387, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.11231029778718948, + "rewards/margins": -0.08480727672576904, + "rewards/rejected": 0.19711756706237793, + "step": 983 + }, + { + "epoch": 0.1521747535279335, + "grad_norm": 5.55540132522583, + "learning_rate": 2.5360824742268044e-06, + "logits/chosen": 7.44645881652832, + "logits/rejected": 3.70943284034729, + "logps/chosen": -354.41949462890625, + "logps/rejected": -403.79052734375, + "loss": 0.6173, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.3005375862121582, + "rewards/margins": 0.18250522017478943, + "rewards/rejected": 0.11803236603736877, + "step": 984 + }, + { + "epoch": 0.15232940266769765, + "grad_norm": 5.49982213973999, + "learning_rate": 2.538659793814433e-06, + "logits/chosen": 7.697822093963623, + "logits/rejected": 6.096981048583984, + "logps/chosen": -307.3810119628906, + "logps/rejected": -237.4229736328125, + "loss": 0.7645, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.1885399967432022, + "rewards/margins": -0.1180962473154068, + "rewards/rejected": 0.306636244058609, + "step": 985 + }, + { + "epoch": 0.15248405180746183, + "grad_norm": 4.034977912902832, + "learning_rate": 2.5412371134020617e-06, + "logits/chosen": 7.3290815353393555, + "logits/rejected": 5.644658088684082, + "logps/chosen": -197.15191650390625, + "logps/rejected": -149.66915893554688, + "loss": 0.6645, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.2998509407043457, + "rewards/margins": 0.06729154288768768, + "rewards/rejected": 0.23255939781665802, + "step": 986 + }, + { + "epoch": 0.15263870094722598, + "grad_norm": 4.73818302154541, + "learning_rate": 2.5438144329896906e-06, + "logits/chosen": 12.465482711791992, + "logits/rejected": 7.708515644073486, + "logps/chosen": -249.0347900390625, + "logps/rejected": -167.53024291992188, + "loss": 0.6615, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.4345059096813202, + "rewards/margins": 0.10229988396167755, + "rewards/rejected": 0.3322060704231262, + "step": 987 + }, + { + "epoch": 0.15279335008699013, + "grad_norm": 14.134047508239746, + "learning_rate": 2.5463917525773195e-06, + "logits/chosen": 11.476678848266602, + "logits/rejected": 2.8608508110046387, + "logps/chosen": -265.569580078125, + "logps/rejected": -235.99075317382812, + "loss": 0.6952, + "rewards/accuracies": 0.25, + "rewards/chosen": 0.25356826186180115, + "rewards/margins": 0.0375649631023407, + "rewards/rejected": 0.21600332856178284, + "step": 988 + }, + { + "epoch": 0.1529479992267543, + "grad_norm": 6.283043384552002, + "learning_rate": 2.5489690721649483e-06, + "logits/chosen": 9.685466766357422, + "logits/rejected": 4.425581455230713, + "logps/chosen": -383.323486328125, + "logps/rejected": -348.57958984375, + "loss": 0.6695, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.46346864104270935, + "rewards/margins": 0.06451568007469177, + "rewards/rejected": 0.3989529609680176, + "step": 989 + }, + { + "epoch": 0.15310264836651846, + "grad_norm": 5.045665740966797, + "learning_rate": 2.5515463917525772e-06, + "logits/chosen": 16.158935546875, + "logits/rejected": 7.748905658721924, + "logps/chosen": -225.56414794921875, + "logps/rejected": -144.9650421142578, + "loss": 0.6233, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.22278550267219543, + "rewards/margins": 0.1727612167596817, + "rewards/rejected": 0.05002429336309433, + "step": 990 + }, + { + "epoch": 0.1532572975062826, + "grad_norm": 4.56468391418457, + "learning_rate": 2.554123711340206e-06, + "logits/chosen": 6.196763038635254, + "logits/rejected": 9.28365421295166, + "logps/chosen": -205.75367736816406, + "logps/rejected": -211.6551055908203, + "loss": 0.7291, + "rewards/accuracies": 0.25, + "rewards/chosen": 0.2946475148200989, + "rewards/margins": -0.06527872383594513, + "rewards/rejected": 0.3599262535572052, + "step": 991 + }, + { + "epoch": 0.1534119466460468, + "grad_norm": 4.6295061111450195, + "learning_rate": 2.556701030927835e-06, + "logits/chosen": 12.202375411987305, + "logits/rejected": 11.065433502197266, + "logps/chosen": -208.89736938476562, + "logps/rejected": -234.885009765625, + "loss": 0.6338, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.3158901631832123, + "rewards/margins": 0.13359761238098145, + "rewards/rejected": 0.18229256570339203, + "step": 992 + }, + { + "epoch": 0.15356659578581094, + "grad_norm": 9.558801651000977, + "learning_rate": 2.559278350515464e-06, + "logits/chosen": 11.489753723144531, + "logits/rejected": 4.942627429962158, + "logps/chosen": -340.67266845703125, + "logps/rejected": -239.7880859375, + "loss": 0.6952, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.25585898756980896, + "rewards/margins": -0.0014899037778377533, + "rewards/rejected": 0.257348895072937, + "step": 993 + }, + { + "epoch": 0.1537212449255751, + "grad_norm": 5.386810779571533, + "learning_rate": 2.5618556701030927e-06, + "logits/chosen": 10.71761703491211, + "logits/rejected": 5.424169540405273, + "logps/chosen": -273.7106018066406, + "logps/rejected": -142.55484008789062, + "loss": 0.7423, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.3274467885494232, + "rewards/margins": -0.07112008333206177, + "rewards/rejected": 0.3985668420791626, + "step": 994 + }, + { + "epoch": 0.15387589406533927, + "grad_norm": 5.826381206512451, + "learning_rate": 2.5644329896907216e-06, + "logits/chosen": 14.185192108154297, + "logits/rejected": 17.662160873413086, + "logps/chosen": -178.9151611328125, + "logps/rejected": -290.31744384765625, + "loss": 0.6422, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.33144572377204895, + "rewards/margins": 0.11291008442640305, + "rewards/rejected": 0.2185356169939041, + "step": 995 + }, + { + "epoch": 0.15403054320510343, + "grad_norm": 9.259116172790527, + "learning_rate": 2.5670103092783504e-06, + "logits/chosen": 12.790938377380371, + "logits/rejected": 12.904714584350586, + "logps/chosen": -454.6855163574219, + "logps/rejected": -398.2311096191406, + "loss": 0.7704, + "rewards/accuracies": 0.25, + "rewards/chosen": 0.28311118483543396, + "rewards/margins": -0.14085274934768677, + "rewards/rejected": 0.4239639639854431, + "step": 996 + }, + { + "epoch": 0.15418519234486758, + "grad_norm": 6.129758834838867, + "learning_rate": 2.5695876288659793e-06, + "logits/chosen": 6.6786346435546875, + "logits/rejected": 10.564947128295898, + "logps/chosen": -281.6134338378906, + "logps/rejected": -294.8741149902344, + "loss": 0.7838, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.4145020544528961, + "rewards/margins": -0.14903785288333893, + "rewards/rejected": 0.5635399222373962, + "step": 997 + }, + { + "epoch": 0.15433984148463173, + "grad_norm": 6.357661724090576, + "learning_rate": 2.572164948453608e-06, + "logits/chosen": 12.67042350769043, + "logits/rejected": 7.238968372344971, + "logps/chosen": -203.879150390625, + "logps/rejected": -163.60195922851562, + "loss": 0.6808, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.23579970002174377, + "rewards/margins": 0.03664977848529816, + "rewards/rejected": 0.19914992153644562, + "step": 998 + }, + { + "epoch": 0.1544944906243959, + "grad_norm": 4.323291301727295, + "learning_rate": 2.574742268041237e-06, + "logits/chosen": 18.202255249023438, + "logits/rejected": 6.99099063873291, + "logps/chosen": -288.840576171875, + "logps/rejected": -187.57894897460938, + "loss": 0.5939, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.5485618710517883, + "rewards/margins": 0.22565820813179016, + "rewards/rejected": 0.32290369272232056, + "step": 999 + }, + { + "epoch": 0.15464913976416006, + "grad_norm": 4.010663032531738, + "learning_rate": 2.577319587628866e-06, + "logits/chosen": 4.965423583984375, + "logits/rejected": 4.6642866134643555, + "logps/chosen": -201.75933837890625, + "logps/rejected": -229.17315673828125, + "loss": 0.648, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.3071884512901306, + "rewards/margins": 0.10149803757667542, + "rewards/rejected": 0.2056904137134552, + "step": 1000 + }, + { + "epoch": 0.1548037889039242, + "grad_norm": 3.9429001808166504, + "learning_rate": 2.5798969072164952e-06, + "logits/chosen": 8.92806625366211, + "logits/rejected": 7.9105072021484375, + "logps/chosen": -161.7950439453125, + "logps/rejected": -157.60714721679688, + "loss": 0.7161, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.3454486131668091, + "rewards/margins": -0.041734881699085236, + "rewards/rejected": 0.3871834874153137, + "step": 1001 + }, + { + "epoch": 0.1549584380436884, + "grad_norm": 5.263321399688721, + "learning_rate": 2.582474226804124e-06, + "logits/chosen": 13.339742660522461, + "logits/rejected": 11.67384147644043, + "logps/chosen": -297.85296630859375, + "logps/rejected": -260.2172546386719, + "loss": 0.6301, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.4796229600906372, + "rewards/margins": 0.14002437889575958, + "rewards/rejected": 0.33959856629371643, + "step": 1002 + }, + { + "epoch": 0.15511308718345254, + "grad_norm": 4.655385971069336, + "learning_rate": 2.585051546391753e-06, + "logits/chosen": 11.733928680419922, + "logits/rejected": 4.142250061035156, + "logps/chosen": -368.67431640625, + "logps/rejected": -214.8388671875, + "loss": 0.58, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.6160106658935547, + "rewards/margins": 0.27698814868927, + "rewards/rejected": 0.33902251720428467, + "step": 1003 + }, + { + "epoch": 0.1552677363232167, + "grad_norm": 8.346925735473633, + "learning_rate": 2.587628865979382e-06, + "logits/chosen": 9.91627311706543, + "logits/rejected": 6.078705787658691, + "logps/chosen": -324.4015197753906, + "logps/rejected": -352.47760009765625, + "loss": 0.6009, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.43736210465431213, + "rewards/margins": 0.2276042103767395, + "rewards/rejected": 0.20975790917873383, + "step": 1004 + }, + { + "epoch": 0.15542238546298087, + "grad_norm": 5.191592693328857, + "learning_rate": 2.5902061855670107e-06, + "logits/chosen": 11.86085319519043, + "logits/rejected": 6.025583267211914, + "logps/chosen": -260.1004638671875, + "logps/rejected": -238.47109985351562, + "loss": 0.6702, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.32853877544403076, + "rewards/margins": 0.07583131641149521, + "rewards/rejected": 0.25270742177963257, + "step": 1005 + }, + { + "epoch": 0.15557703460274502, + "grad_norm": 5.581071376800537, + "learning_rate": 2.5927835051546396e-06, + "logits/chosen": 17.071813583374023, + "logits/rejected": 12.390661239624023, + "logps/chosen": -301.09259033203125, + "logps/rejected": -250.86766052246094, + "loss": 0.6759, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.4589708149433136, + "rewards/margins": 0.05198876932263374, + "rewards/rejected": 0.40698206424713135, + "step": 1006 + }, + { + "epoch": 0.15573168374250917, + "grad_norm": 4.009683609008789, + "learning_rate": 2.5953608247422685e-06, + "logits/chosen": 5.798210144042969, + "logits/rejected": 5.148221015930176, + "logps/chosen": -243.46795654296875, + "logps/rejected": -227.66705322265625, + "loss": 0.6779, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.33121249079704285, + "rewards/margins": 0.05117817595601082, + "rewards/rejected": 0.28003430366516113, + "step": 1007 + }, + { + "epoch": 0.15588633288227335, + "grad_norm": 5.627560615539551, + "learning_rate": 2.5979381443298973e-06, + "logits/chosen": 13.577735900878906, + "logits/rejected": 5.309107303619385, + "logps/chosen": -187.17648315429688, + "logps/rejected": -99.2226333618164, + "loss": 0.6701, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.32158276438713074, + "rewards/margins": 0.05657311528921127, + "rewards/rejected": 0.26500964164733887, + "step": 1008 + }, + { + "epoch": 0.1560409820220375, + "grad_norm": 5.927374839782715, + "learning_rate": 2.600515463917526e-06, + "logits/chosen": 12.391993522644043, + "logits/rejected": 7.203285217285156, + "logps/chosen": -290.9713134765625, + "logps/rejected": -267.4643249511719, + "loss": 0.7314, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.5320847034454346, + "rewards/margins": -0.02508671209216118, + "rewards/rejected": 0.5571714043617249, + "step": 1009 + }, + { + "epoch": 0.15619563116180166, + "grad_norm": 12.79286003112793, + "learning_rate": 2.603092783505155e-06, + "logits/chosen": 8.318655014038086, + "logits/rejected": 10.794252395629883, + "logps/chosen": -168.07498168945312, + "logps/rejected": -164.96377563476562, + "loss": 0.73, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.2749417722225189, + "rewards/margins": -0.061866626143455505, + "rewards/rejected": 0.3368084132671356, + "step": 1010 + }, + { + "epoch": 0.15635028030156584, + "grad_norm": 5.886778354644775, + "learning_rate": 2.605670103092784e-06, + "logits/chosen": 6.2451090812683105, + "logits/rejected": 3.7573976516723633, + "logps/chosen": -396.6445617675781, + "logps/rejected": -256.5961608886719, + "loss": 0.65, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.46838143467903137, + "rewards/margins": 0.10984040051698685, + "rewards/rejected": 0.35854101181030273, + "step": 1011 + }, + { + "epoch": 0.15650492944133, + "grad_norm": 9.064360618591309, + "learning_rate": 2.608247422680413e-06, + "logits/chosen": 10.707468032836914, + "logits/rejected": 11.799666404724121, + "logps/chosen": -229.15879821777344, + "logps/rejected": -270.58349609375, + "loss": 0.6652, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.2975532114505768, + "rewards/margins": 0.06605391204357147, + "rewards/rejected": 0.2314992994070053, + "step": 1012 + }, + { + "epoch": 0.15665957858109414, + "grad_norm": 6.990705966949463, + "learning_rate": 2.6108247422680417e-06, + "logits/chosen": 8.056679725646973, + "logits/rejected": 10.398065567016602, + "logps/chosen": -277.4003601074219, + "logps/rejected": -353.6678771972656, + "loss": 0.7056, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.48444175720214844, + "rewards/margins": 0.002009958028793335, + "rewards/rejected": 0.4824318289756775, + "step": 1013 + }, + { + "epoch": 0.1568142277208583, + "grad_norm": 4.032031059265137, + "learning_rate": 2.6134020618556706e-06, + "logits/chosen": 12.851255416870117, + "logits/rejected": 1.686974287033081, + "logps/chosen": -285.05657958984375, + "logps/rejected": -106.9432373046875, + "loss": 0.6104, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.5892062187194824, + "rewards/margins": 0.20518063008785248, + "rewards/rejected": 0.38402560353279114, + "step": 1014 + }, + { + "epoch": 0.15696887686062247, + "grad_norm": 5.427288055419922, + "learning_rate": 2.6159793814432994e-06, + "logits/chosen": 6.395529747009277, + "logits/rejected": 7.157956123352051, + "logps/chosen": -206.54147338867188, + "logps/rejected": -208.5509796142578, + "loss": 0.7323, + "rewards/accuracies": 0.25, + "rewards/chosen": 0.4238506555557251, + "rewards/margins": -0.048391249030828476, + "rewards/rejected": 0.4722418785095215, + "step": 1015 + }, + { + "epoch": 0.15712352600038662, + "grad_norm": 4.94955587387085, + "learning_rate": 2.6185567010309283e-06, + "logits/chosen": 13.247678756713867, + "logits/rejected": 8.358420372009277, + "logps/chosen": -250.1439666748047, + "logps/rejected": -183.1035919189453, + "loss": 0.6645, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.3969514071941376, + "rewards/margins": 0.07653766125440598, + "rewards/rejected": 0.3204137682914734, + "step": 1016 + }, + { + "epoch": 0.15727817514015077, + "grad_norm": 4.708651065826416, + "learning_rate": 2.621134020618557e-06, + "logits/chosen": 14.124601364135742, + "logits/rejected": 12.939817428588867, + "logps/chosen": -257.5981140136719, + "logps/rejected": -253.38003540039062, + "loss": 0.616, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.6638963222503662, + "rewards/margins": 0.18651682138442993, + "rewards/rejected": 0.4773794710636139, + "step": 1017 + }, + { + "epoch": 0.15743282427991495, + "grad_norm": 6.371457576751709, + "learning_rate": 2.623711340206186e-06, + "logits/chosen": 10.791617393493652, + "logits/rejected": 10.859783172607422, + "logps/chosen": -287.60955810546875, + "logps/rejected": -307.47918701171875, + "loss": 0.6261, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.4228264093399048, + "rewards/margins": 0.1539658159017563, + "rewards/rejected": 0.2688605785369873, + "step": 1018 + }, + { + "epoch": 0.1575874734196791, + "grad_norm": 5.202495098114014, + "learning_rate": 2.626288659793815e-06, + "logits/chosen": 5.763272762298584, + "logits/rejected": 9.891640663146973, + "logps/chosen": -236.70794677734375, + "logps/rejected": -297.4866943359375, + "loss": 0.5873, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.48614585399627686, + "rewards/margins": 0.2402709424495697, + "rewards/rejected": 0.24587489664554596, + "step": 1019 + }, + { + "epoch": 0.15774212255944325, + "grad_norm": 5.173934459686279, + "learning_rate": 2.628865979381444e-06, + "logits/chosen": 10.963274002075195, + "logits/rejected": 7.169569969177246, + "logps/chosen": -262.1689758300781, + "logps/rejected": -244.37644958496094, + "loss": 0.7165, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.47549164295196533, + "rewards/margins": -0.037476323544979095, + "rewards/rejected": 0.5129680037498474, + "step": 1020 + }, + { + "epoch": 0.15789677169920743, + "grad_norm": 6.314521789550781, + "learning_rate": 2.6314432989690727e-06, + "logits/chosen": 10.755579948425293, + "logits/rejected": 2.741549491882324, + "logps/chosen": -288.4797668457031, + "logps/rejected": -283.5696716308594, + "loss": 0.6343, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.5280244946479797, + "rewards/margins": 0.13582980632781982, + "rewards/rejected": 0.3921946585178375, + "step": 1021 + }, + { + "epoch": 0.15805142083897158, + "grad_norm": 5.325987815856934, + "learning_rate": 2.634020618556701e-06, + "logits/chosen": 4.969079971313477, + "logits/rejected": 12.771631240844727, + "logps/chosen": -227.09994506835938, + "logps/rejected": -374.99560546875, + "loss": 0.6831, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.509530246257782, + "rewards/margins": 0.029446369037032127, + "rewards/rejected": 0.4800838828086853, + "step": 1022 + }, + { + "epoch": 0.15820606997873574, + "grad_norm": 4.745158672332764, + "learning_rate": 2.63659793814433e-06, + "logits/chosen": 7.745759963989258, + "logits/rejected": 14.217255592346191, + "logps/chosen": -131.89381408691406, + "logps/rejected": -202.36746215820312, + "loss": 0.6719, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.4468061625957489, + "rewards/margins": 0.047152094542980194, + "rewards/rejected": 0.3996540606021881, + "step": 1023 + }, + { + "epoch": 0.15836071911849992, + "grad_norm": 8.940340042114258, + "learning_rate": 2.639175257731959e-06, + "logits/chosen": 3.577395439147949, + "logits/rejected": 11.726722717285156, + "logps/chosen": -166.974609375, + "logps/rejected": -214.47677612304688, + "loss": 0.7807, + "rewards/accuracies": 0.25, + "rewards/chosen": 0.17657072842121124, + "rewards/margins": -0.15617960691452026, + "rewards/rejected": 0.3327503204345703, + "step": 1024 + }, + { + "epoch": 0.15851536825826407, + "grad_norm": 5.5178608894348145, + "learning_rate": 2.6417525773195877e-06, + "logits/chosen": 11.245694160461426, + "logits/rejected": 6.978783130645752, + "logps/chosen": -297.09613037109375, + "logps/rejected": -197.52920532226562, + "loss": 0.7745, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.30007678270339966, + "rewards/margins": -0.11804361641407013, + "rewards/rejected": 0.4181203842163086, + "step": 1025 + }, + { + "epoch": 0.15867001739802822, + "grad_norm": 5.789656639099121, + "learning_rate": 2.6443298969072166e-06, + "logits/chosen": 10.028810501098633, + "logits/rejected": 10.683966636657715, + "logps/chosen": -307.38726806640625, + "logps/rejected": -258.4380798339844, + "loss": 0.7284, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.4536043107509613, + "rewards/margins": -0.051944829523563385, + "rewards/rejected": 0.5055491328239441, + "step": 1026 + }, + { + "epoch": 0.1588246665377924, + "grad_norm": 5.341628551483154, + "learning_rate": 2.6469072164948455e-06, + "logits/chosen": 3.1536104679107666, + "logits/rejected": 8.352611541748047, + "logps/chosen": -207.67640686035156, + "logps/rejected": -323.90362548828125, + "loss": 0.7076, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.26018840074539185, + "rewards/margins": -0.027193743735551834, + "rewards/rejected": 0.2873821258544922, + "step": 1027 + }, + { + "epoch": 0.15897931567755655, + "grad_norm": 6.4074835777282715, + "learning_rate": 2.6494845360824743e-06, + "logits/chosen": 6.083681583404541, + "logits/rejected": 10.88270378112793, + "logps/chosen": -280.3446044921875, + "logps/rejected": -292.8846740722656, + "loss": 0.8587, + "rewards/accuracies": 0.25, + "rewards/chosen": 0.11750331521034241, + "rewards/margins": -0.26196616888046265, + "rewards/rejected": 0.37946951389312744, + "step": 1028 + }, + { + "epoch": 0.1591339648173207, + "grad_norm": 7.285872936248779, + "learning_rate": 2.6520618556701032e-06, + "logits/chosen": 11.835395812988281, + "logits/rejected": 2.4885847568511963, + "logps/chosen": -610.2842407226562, + "logps/rejected": -328.08502197265625, + "loss": 0.7197, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.4710925221443176, + "rewards/margins": 0.00777001678943634, + "rewards/rejected": 0.4633224904537201, + "step": 1029 + }, + { + "epoch": 0.15928861395708485, + "grad_norm": 6.519668102264404, + "learning_rate": 2.654639175257732e-06, + "logits/chosen": 7.799099922180176, + "logits/rejected": 11.39924430847168, + "logps/chosen": -314.4749755859375, + "logps/rejected": -299.37939453125, + "loss": 0.7686, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.21852943301200867, + "rewards/margins": -0.13165950775146484, + "rewards/rejected": 0.3501889407634735, + "step": 1030 + }, + { + "epoch": 0.15944326309684903, + "grad_norm": 6.97037935256958, + "learning_rate": 2.657216494845361e-06, + "logits/chosen": 5.605464935302734, + "logits/rejected": 4.981527328491211, + "logps/chosen": -276.9613037109375, + "logps/rejected": -366.38836669921875, + "loss": 0.7046, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.49983900785446167, + "rewards/margins": 0.019785024225711823, + "rewards/rejected": 0.48005399107933044, + "step": 1031 + }, + { + "epoch": 0.15959791223661318, + "grad_norm": 6.064166069030762, + "learning_rate": 2.65979381443299e-06, + "logits/chosen": 7.023273468017578, + "logits/rejected": 4.334907054901123, + "logps/chosen": -277.53472900390625, + "logps/rejected": -236.01010131835938, + "loss": 0.6762, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.6226499676704407, + "rewards/margins": 0.05863867700099945, + "rewards/rejected": 0.56401127576828, + "step": 1032 + }, + { + "epoch": 0.15975256137637733, + "grad_norm": 11.71610164642334, + "learning_rate": 2.6623711340206187e-06, + "logits/chosen": 13.209150314331055, + "logits/rejected": 6.806638240814209, + "logps/chosen": -648.9050903320312, + "logps/rejected": -339.5271911621094, + "loss": 0.8801, + "rewards/accuracies": 0.25, + "rewards/chosen": 0.2154412418603897, + "rewards/margins": -0.2950975298881531, + "rewards/rejected": 0.5105387568473816, + "step": 1033 + }, + { + "epoch": 0.1599072105161415, + "grad_norm": 5.398703098297119, + "learning_rate": 2.6649484536082476e-06, + "logits/chosen": 10.184659957885742, + "logits/rejected": 4.911975860595703, + "logps/chosen": -350.12945556640625, + "logps/rejected": -298.5102233886719, + "loss": 0.6045, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.566582202911377, + "rewards/margins": 0.21806208789348602, + "rewards/rejected": 0.34852010011672974, + "step": 1034 + }, + { + "epoch": 0.16006185965590566, + "grad_norm": 4.126475811004639, + "learning_rate": 2.6675257731958765e-06, + "logits/chosen": 8.251382827758789, + "logits/rejected": 5.719577789306641, + "logps/chosen": -183.52490234375, + "logps/rejected": -196.65928649902344, + "loss": 0.696, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.32802990078926086, + "rewards/margins": 0.0051604341715574265, + "rewards/rejected": 0.3228694200515747, + "step": 1035 + }, + { + "epoch": 0.16021650879566982, + "grad_norm": 6.685421466827393, + "learning_rate": 2.6701030927835053e-06, + "logits/chosen": 10.221075057983398, + "logits/rejected": 6.6625213623046875, + "logps/chosen": -383.50048828125, + "logps/rejected": -333.1712341308594, + "loss": 0.7448, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.5246036648750305, + "rewards/margins": -0.0873337984085083, + "rewards/rejected": 0.6119374632835388, + "step": 1036 + }, + { + "epoch": 0.160371157935434, + "grad_norm": 5.351931571960449, + "learning_rate": 2.672680412371134e-06, + "logits/chosen": 7.194896697998047, + "logits/rejected": 5.363762855529785, + "logps/chosen": -177.01670837402344, + "logps/rejected": -250.59585571289062, + "loss": 0.6751, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.15219125151634216, + "rewards/margins": 0.056029822677373886, + "rewards/rejected": 0.09616144001483917, + "step": 1037 + }, + { + "epoch": 0.16052580707519815, + "grad_norm": 7.467764854431152, + "learning_rate": 2.675257731958763e-06, + "logits/chosen": 12.162158012390137, + "logits/rejected": 13.805460929870605, + "logps/chosen": -376.8035583496094, + "logps/rejected": -349.16436767578125, + "loss": 0.8115, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.26126694679260254, + "rewards/margins": -0.19738717377185822, + "rewards/rejected": 0.45865413546562195, + "step": 1038 + }, + { + "epoch": 0.1606804562149623, + "grad_norm": 6.053816795349121, + "learning_rate": 2.677835051546392e-06, + "logits/chosen": 10.336067199707031, + "logits/rejected": 11.565001487731934, + "logps/chosen": -330.21539306640625, + "logps/rejected": -445.71380615234375, + "loss": 0.7374, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.44596052169799805, + "rewards/margins": -0.07219447940587997, + "rewards/rejected": 0.5181549787521362, + "step": 1039 + }, + { + "epoch": 0.16083510535472648, + "grad_norm": 5.0752034187316895, + "learning_rate": 2.680412371134021e-06, + "logits/chosen": 8.184542655944824, + "logits/rejected": 7.182358741760254, + "logps/chosen": -216.462158203125, + "logps/rejected": -159.28610229492188, + "loss": 0.7403, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.2973100244998932, + "rewards/margins": -0.08360461890697479, + "rewards/rejected": 0.3809146285057068, + "step": 1040 + }, + { + "epoch": 0.16098975449449063, + "grad_norm": 5.078243732452393, + "learning_rate": 2.6829896907216497e-06, + "logits/chosen": 8.63563060760498, + "logits/rejected": 9.155065536499023, + "logps/chosen": -328.1712341308594, + "logps/rejected": -258.8982849121094, + "loss": 0.6608, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.4773038923740387, + "rewards/margins": 0.08635596930980682, + "rewards/rejected": 0.3909479081630707, + "step": 1041 + }, + { + "epoch": 0.16114440363425478, + "grad_norm": 5.658509731292725, + "learning_rate": 2.6855670103092786e-06, + "logits/chosen": 9.052953720092773, + "logits/rejected": 8.630266189575195, + "logps/chosen": -299.23150634765625, + "logps/rejected": -259.9249572753906, + "loss": 0.6643, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.37799960374832153, + "rewards/margins": 0.08504701405763626, + "rewards/rejected": 0.29295259714126587, + "step": 1042 + }, + { + "epoch": 0.16129905277401896, + "grad_norm": 6.068130016326904, + "learning_rate": 2.6881443298969074e-06, + "logits/chosen": 9.236040115356445, + "logits/rejected": 5.622822284698486, + "logps/chosen": -266.4791564941406, + "logps/rejected": -219.8063507080078, + "loss": 0.8015, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.3484756052494049, + "rewards/margins": -0.1862947940826416, + "rewards/rejected": 0.5347704291343689, + "step": 1043 + }, + { + "epoch": 0.1614537019137831, + "grad_norm": 4.1639180183410645, + "learning_rate": 2.6907216494845363e-06, + "logits/chosen": 12.256208419799805, + "logits/rejected": 10.92822265625, + "logps/chosen": -206.211669921875, + "logps/rejected": -272.11505126953125, + "loss": 0.6241, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.4437619149684906, + "rewards/margins": 0.1814982295036316, + "rewards/rejected": 0.262263685464859, + "step": 1044 + }, + { + "epoch": 0.16160835105354726, + "grad_norm": 3.9855000972747803, + "learning_rate": 2.693298969072165e-06, + "logits/chosen": 13.220599174499512, + "logits/rejected": 8.862778663635254, + "logps/chosen": -220.43321228027344, + "logps/rejected": -182.53016662597656, + "loss": 0.6332, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.36090487241744995, + "rewards/margins": 0.14688324928283691, + "rewards/rejected": 0.21402163803577423, + "step": 1045 + }, + { + "epoch": 0.1617630001933114, + "grad_norm": 6.5949387550354, + "learning_rate": 2.695876288659794e-06, + "logits/chosen": 7.444316864013672, + "logits/rejected": 2.5016591548919678, + "logps/chosen": -429.6894226074219, + "logps/rejected": -334.570556640625, + "loss": 0.6245, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.5750318765640259, + "rewards/margins": 0.1503545194864273, + "rewards/rejected": 0.42467740178108215, + "step": 1046 + }, + { + "epoch": 0.1619176493330756, + "grad_norm": 4.156692981719971, + "learning_rate": 2.698453608247423e-06, + "logits/chosen": -1.1847783327102661, + "logits/rejected": 10.830698013305664, + "logps/chosen": -122.63531494140625, + "logps/rejected": -227.61268615722656, + "loss": 0.6908, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.4892508387565613, + "rewards/margins": 0.028514418751001358, + "rewards/rejected": 0.4607364237308502, + "step": 1047 + }, + { + "epoch": 0.16207229847283974, + "grad_norm": 7.735217094421387, + "learning_rate": 2.7010309278350518e-06, + "logits/chosen": 9.772907257080078, + "logits/rejected": 10.852302551269531, + "logps/chosen": -292.5924987792969, + "logps/rejected": -311.0618896484375, + "loss": 0.7872, + "rewards/accuracies": 0.125, + "rewards/chosen": 0.23518210649490356, + "rewards/margins": -0.16996806859970093, + "rewards/rejected": 0.4051501750946045, + "step": 1048 + }, + { + "epoch": 0.1622269476126039, + "grad_norm": 8.69966983795166, + "learning_rate": 2.7036082474226807e-06, + "logits/chosen": 14.306394577026367, + "logits/rejected": 7.621552467346191, + "logps/chosen": -365.2208251953125, + "logps/rejected": -231.18020629882812, + "loss": 0.6779, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.5280090570449829, + "rewards/margins": 0.05756300687789917, + "rewards/rejected": 0.47044605016708374, + "step": 1049 + }, + { + "epoch": 0.16238159675236807, + "grad_norm": 7.324671268463135, + "learning_rate": 2.7061855670103095e-06, + "logits/chosen": 11.567021369934082, + "logits/rejected": 11.947700500488281, + "logps/chosen": -284.952392578125, + "logps/rejected": -280.4742431640625, + "loss": 0.6987, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.2622503340244293, + "rewards/margins": 0.004250619560480118, + "rewards/rejected": 0.2579997181892395, + "step": 1050 + }, + { + "epoch": 0.16253624589213223, + "grad_norm": 5.369792938232422, + "learning_rate": 2.708762886597938e-06, + "logits/chosen": 12.093174934387207, + "logits/rejected": 6.71939754486084, + "logps/chosen": -366.75848388671875, + "logps/rejected": -303.83197021484375, + "loss": 0.6249, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.49151867628097534, + "rewards/margins": 0.18071764707565308, + "rewards/rejected": 0.31080102920532227, + "step": 1051 + }, + { + "epoch": 0.16269089503189638, + "grad_norm": 5.221163749694824, + "learning_rate": 2.711340206185567e-06, + "logits/chosen": 6.979132175445557, + "logits/rejected": 10.874004364013672, + "logps/chosen": -154.37570190429688, + "logps/rejected": -187.8915557861328, + "loss": 0.6768, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.4858044385910034, + "rewards/margins": 0.03911856934428215, + "rewards/rejected": 0.4466858506202698, + "step": 1052 + }, + { + "epoch": 0.16284554417166056, + "grad_norm": 3.984731912612915, + "learning_rate": 2.7139175257731957e-06, + "logits/chosen": 12.964475631713867, + "logits/rejected": 5.655695915222168, + "logps/chosen": -177.99411010742188, + "logps/rejected": -157.4450225830078, + "loss": 0.6622, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.319799542427063, + "rewards/margins": 0.08091521263122559, + "rewards/rejected": 0.23888429999351501, + "step": 1053 + }, + { + "epoch": 0.1630001933114247, + "grad_norm": 4.853552341461182, + "learning_rate": 2.7164948453608246e-06, + "logits/chosen": 10.205548286437988, + "logits/rejected": 4.836327075958252, + "logps/chosen": -281.8667907714844, + "logps/rejected": -237.60916137695312, + "loss": 0.642, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.3726174831390381, + "rewards/margins": 0.11989913880825043, + "rewards/rejected": 0.25271835923194885, + "step": 1054 + }, + { + "epoch": 0.16315484245118886, + "grad_norm": 4.472945690155029, + "learning_rate": 2.7190721649484535e-06, + "logits/chosen": 12.537957191467285, + "logits/rejected": 7.656261920928955, + "logps/chosen": -182.21368408203125, + "logps/rejected": -169.21478271484375, + "loss": 0.7051, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.3439086675643921, + "rewards/margins": -0.007755044847726822, + "rewards/rejected": 0.351663738489151, + "step": 1055 + }, + { + "epoch": 0.16330949159095304, + "grad_norm": 4.707562446594238, + "learning_rate": 2.7216494845360823e-06, + "logits/chosen": 12.646970748901367, + "logits/rejected": 7.461970329284668, + "logps/chosen": -301.0125732421875, + "logps/rejected": -219.021240234375, + "loss": 0.6498, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.6307682991027832, + "rewards/margins": 0.12720398604869843, + "rewards/rejected": 0.5035642981529236, + "step": 1056 + }, + { + "epoch": 0.1634641407307172, + "grad_norm": 43.98398971557617, + "learning_rate": 2.724226804123711e-06, + "logits/chosen": 11.250030517578125, + "logits/rejected": 6.399295806884766, + "logps/chosen": -223.62542724609375, + "logps/rejected": -166.40493774414062, + "loss": 0.6384, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.45335260033607483, + "rewards/margins": 0.12117159366607666, + "rewards/rejected": 0.33218103647232056, + "step": 1057 + }, + { + "epoch": 0.16361878987048134, + "grad_norm": 5.9794487953186035, + "learning_rate": 2.72680412371134e-06, + "logits/chosen": 6.903171062469482, + "logits/rejected": 11.396451950073242, + "logps/chosen": -210.54454040527344, + "logps/rejected": -237.95388793945312, + "loss": 0.7177, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.24321851134300232, + "rewards/margins": -0.03795791044831276, + "rewards/rejected": 0.2811764180660248, + "step": 1058 + }, + { + "epoch": 0.1637734390102455, + "grad_norm": 10.869904518127441, + "learning_rate": 2.729381443298969e-06, + "logits/chosen": 7.053282737731934, + "logits/rejected": 5.508359432220459, + "logps/chosen": -281.413818359375, + "logps/rejected": -377.7446594238281, + "loss": 0.695, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.3586414158344269, + "rewards/margins": 0.05114555358886719, + "rewards/rejected": 0.3074958324432373, + "step": 1059 + }, + { + "epoch": 0.16392808815000967, + "grad_norm": 4.94764518737793, + "learning_rate": 2.731958762886598e-06, + "logits/chosen": 8.505027770996094, + "logits/rejected": 5.51999568939209, + "logps/chosen": -311.41778564453125, + "logps/rejected": -285.7543029785156, + "loss": 0.6836, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.4464007616043091, + "rewards/margins": 0.06450801342725754, + "rewards/rejected": 0.3818928003311157, + "step": 1060 + }, + { + "epoch": 0.16408273728977382, + "grad_norm": 4.890074253082275, + "learning_rate": 2.734536082474227e-06, + "logits/chosen": 9.940583229064941, + "logits/rejected": 11.309616088867188, + "logps/chosen": -253.55035400390625, + "logps/rejected": -276.87957763671875, + "loss": 0.6167, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.44649219512939453, + "rewards/margins": 0.16762083768844604, + "rewards/rejected": 0.2788713574409485, + "step": 1061 + }, + { + "epoch": 0.16423738642953797, + "grad_norm": 4.957050323486328, + "learning_rate": 2.737113402061856e-06, + "logits/chosen": 12.855979919433594, + "logits/rejected": 12.664342880249023, + "logps/chosen": -226.63661193847656, + "logps/rejected": -215.68629455566406, + "loss": 0.7752, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.3890753984451294, + "rewards/margins": -0.13341045379638672, + "rewards/rejected": 0.5224858522415161, + "step": 1062 + }, + { + "epoch": 0.16439203556930215, + "grad_norm": 5.184538841247559, + "learning_rate": 2.739690721649485e-06, + "logits/chosen": 3.3324735164642334, + "logits/rejected": 4.615572452545166, + "logps/chosen": -212.3704071044922, + "logps/rejected": -233.98492431640625, + "loss": 0.7259, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.18181224167346954, + "rewards/margins": -0.05426469445228577, + "rewards/rejected": 0.2360769510269165, + "step": 1063 + }, + { + "epoch": 0.1645466847090663, + "grad_norm": 4.704477787017822, + "learning_rate": 2.7422680412371137e-06, + "logits/chosen": 9.060919761657715, + "logits/rejected": 5.374079704284668, + "logps/chosen": -182.55682373046875, + "logps/rejected": -149.00543212890625, + "loss": 0.7116, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.2934325933456421, + "rewards/margins": -0.0037285350263118744, + "rewards/rejected": 0.29716113209724426, + "step": 1064 + }, + { + "epoch": 0.16470133384883046, + "grad_norm": 6.550030708312988, + "learning_rate": 2.7448453608247426e-06, + "logits/chosen": 11.847772598266602, + "logits/rejected": 13.012176513671875, + "logps/chosen": -229.9514923095703, + "logps/rejected": -250.713134765625, + "loss": 0.7289, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.26736605167388916, + "rewards/margins": -0.054277244955301285, + "rewards/rejected": 0.32164329290390015, + "step": 1065 + }, + { + "epoch": 0.16485598298859463, + "grad_norm": 5.203851699829102, + "learning_rate": 2.7474226804123715e-06, + "logits/chosen": 4.889805316925049, + "logits/rejected": 4.330077648162842, + "logps/chosen": -178.52366638183594, + "logps/rejected": -208.51792907714844, + "loss": 0.6948, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.5053683519363403, + "rewards/margins": 0.017319679260253906, + "rewards/rejected": 0.4880486726760864, + "step": 1066 + }, + { + "epoch": 0.1650106321283588, + "grad_norm": 7.30642032623291, + "learning_rate": 2.7500000000000004e-06, + "logits/chosen": 6.800315856933594, + "logits/rejected": 7.008328437805176, + "logps/chosen": -274.5916442871094, + "logps/rejected": -264.5065612792969, + "loss": 0.7605, + "rewards/accuracies": 0.25, + "rewards/chosen": 0.21765394508838654, + "rewards/margins": -0.11245432496070862, + "rewards/rejected": 0.33010828495025635, + "step": 1067 + }, + { + "epoch": 0.16516528126812294, + "grad_norm": 15.721375465393066, + "learning_rate": 2.7525773195876292e-06, + "logits/chosen": 7.161839485168457, + "logits/rejected": 5.509325981140137, + "logps/chosen": -136.89422607421875, + "logps/rejected": -159.31271362304688, + "loss": 0.6906, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.21712228655815125, + "rewards/margins": 0.025398138910531998, + "rewards/rejected": 0.19172416627407074, + "step": 1068 + }, + { + "epoch": 0.16531993040788712, + "grad_norm": 4.264392852783203, + "learning_rate": 2.755154639175258e-06, + "logits/chosen": 10.415778160095215, + "logits/rejected": 4.979681491851807, + "logps/chosen": -245.2407684326172, + "logps/rejected": -208.16725158691406, + "loss": 0.57, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.48708388209342957, + "rewards/margins": 0.2765290141105652, + "rewards/rejected": 0.21055488288402557, + "step": 1069 + }, + { + "epoch": 0.16547457954765127, + "grad_norm": 5.592239856719971, + "learning_rate": 2.757731958762887e-06, + "logits/chosen": 11.432573318481445, + "logits/rejected": 4.233443737030029, + "logps/chosen": -293.978515625, + "logps/rejected": -223.2623291015625, + "loss": 0.6272, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.3736454248428345, + "rewards/margins": 0.17266154289245605, + "rewards/rejected": 0.2009839117527008, + "step": 1070 + }, + { + "epoch": 0.16562922868741542, + "grad_norm": 5.213015079498291, + "learning_rate": 2.760309278350516e-06, + "logits/chosen": 7.342867374420166, + "logits/rejected": 6.233464241027832, + "logps/chosen": -204.68927001953125, + "logps/rejected": -212.27098083496094, + "loss": 0.7297, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.37975209951400757, + "rewards/margins": -0.0384133905172348, + "rewards/rejected": 0.41816550493240356, + "step": 1071 + }, + { + "epoch": 0.1657838778271796, + "grad_norm": 4.677437782287598, + "learning_rate": 2.7628865979381447e-06, + "logits/chosen": 11.655706405639648, + "logits/rejected": 9.31197738647461, + "logps/chosen": -340.36053466796875, + "logps/rejected": -271.4561767578125, + "loss": 0.6443, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.37624871730804443, + "rewards/margins": 0.14655783772468567, + "rewards/rejected": 0.22969086468219757, + "step": 1072 + }, + { + "epoch": 0.16593852696694375, + "grad_norm": 4.539829254150391, + "learning_rate": 2.7654639175257736e-06, + "logits/chosen": 5.378650665283203, + "logits/rejected": 2.23707914352417, + "logps/chosen": -308.83624267578125, + "logps/rejected": -245.52114868164062, + "loss": 0.6274, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.3348231315612793, + "rewards/margins": 0.15908174216747284, + "rewards/rejected": 0.17574140429496765, + "step": 1073 + }, + { + "epoch": 0.1660931761067079, + "grad_norm": 4.869034290313721, + "learning_rate": 2.7680412371134025e-06, + "logits/chosen": 10.294317245483398, + "logits/rejected": 5.702367305755615, + "logps/chosen": -208.36639404296875, + "logps/rejected": -130.01109313964844, + "loss": 0.7049, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.2122596800327301, + "rewards/margins": -0.021868279203772545, + "rewards/rejected": 0.2341279685497284, + "step": 1074 + }, + { + "epoch": 0.16624782524647205, + "grad_norm": 5.645140647888184, + "learning_rate": 2.7706185567010313e-06, + "logits/chosen": 10.160078048706055, + "logits/rejected": 5.1125054359436035, + "logps/chosen": -215.4303436279297, + "logps/rejected": -137.60739135742188, + "loss": 0.7799, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.13486647605895996, + "rewards/margins": -0.15635111927986145, + "rewards/rejected": 0.2912175953388214, + "step": 1075 + }, + { + "epoch": 0.16640247438623623, + "grad_norm": 7.067988395690918, + "learning_rate": 2.77319587628866e-06, + "logits/chosen": 9.530256271362305, + "logits/rejected": 10.991371154785156, + "logps/chosen": -354.59027099609375, + "logps/rejected": -343.3446960449219, + "loss": 0.6565, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.43891656398773193, + "rewards/margins": 0.08997649699449539, + "rewards/rejected": 0.3489401042461395, + "step": 1076 + }, + { + "epoch": 0.16655712352600038, + "grad_norm": 4.581558704376221, + "learning_rate": 2.775773195876289e-06, + "logits/chosen": 10.148000717163086, + "logits/rejected": 10.160100936889648, + "logps/chosen": -256.7032165527344, + "logps/rejected": -243.78848266601562, + "loss": 0.6565, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.35880598425865173, + "rewards/margins": 0.09301841259002686, + "rewards/rejected": 0.2657875418663025, + "step": 1077 + }, + { + "epoch": 0.16671177266576453, + "grad_norm": 3.985835313796997, + "learning_rate": 2.778350515463918e-06, + "logits/chosen": 4.755812644958496, + "logits/rejected": 10.252729415893555, + "logps/chosen": -196.23898315429688, + "logps/rejected": -246.3616943359375, + "loss": 0.5999, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.4461737871170044, + "rewards/margins": 0.20455661416053772, + "rewards/rejected": 0.24161720275878906, + "step": 1078 + }, + { + "epoch": 0.16686642180552871, + "grad_norm": 7.236598491668701, + "learning_rate": 2.780927835051547e-06, + "logits/chosen": 7.2391839027404785, + "logits/rejected": 7.341989040374756, + "logps/chosen": -326.91363525390625, + "logps/rejected": -255.23779296875, + "loss": 0.7417, + "rewards/accuracies": 0.25, + "rewards/chosen": 0.24083857238292694, + "rewards/margins": -0.07181355357170105, + "rewards/rejected": 0.3126521408557892, + "step": 1079 + }, + { + "epoch": 0.16702107094529287, + "grad_norm": 3.996279001235962, + "learning_rate": 2.7835051546391757e-06, + "logits/chosen": 4.7879438400268555, + "logits/rejected": 3.020608425140381, + "logps/chosen": -211.53652954101562, + "logps/rejected": -131.73794555664062, + "loss": 0.656, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.17334146797657013, + "rewards/margins": 0.07940573990345001, + "rewards/rejected": 0.09393573552370071, + "step": 1080 + }, + { + "epoch": 0.16717572008505702, + "grad_norm": 5.130178451538086, + "learning_rate": 2.7860824742268046e-06, + "logits/chosen": 8.511739730834961, + "logits/rejected": 12.763547897338867, + "logps/chosen": -210.34835815429688, + "logps/rejected": -323.07696533203125, + "loss": 0.6607, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.29380524158477783, + "rewards/margins": 0.07359995692968369, + "rewards/rejected": 0.22020526230335236, + "step": 1081 + }, + { + "epoch": 0.1673303692248212, + "grad_norm": 5.002078533172607, + "learning_rate": 2.7886597938144334e-06, + "logits/chosen": 15.420188903808594, + "logits/rejected": 8.079397201538086, + "logps/chosen": -307.494384765625, + "logps/rejected": -237.58912658691406, + "loss": 0.644, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.4491264224052429, + "rewards/margins": 0.10898799449205399, + "rewards/rejected": 0.34013843536376953, + "step": 1082 + }, + { + "epoch": 0.16748501836458535, + "grad_norm": 6.499565124511719, + "learning_rate": 2.7912371134020623e-06, + "logits/chosen": 5.123217582702637, + "logits/rejected": 9.870834350585938, + "logps/chosen": -262.5680847167969, + "logps/rejected": -320.9296569824219, + "loss": 0.7234, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.23025751113891602, + "rewards/margins": -0.0433901809155941, + "rewards/rejected": 0.2736476957798004, + "step": 1083 + }, + { + "epoch": 0.1676396675043495, + "grad_norm": 5.860519886016846, + "learning_rate": 2.793814432989691e-06, + "logits/chosen": 12.18691635131836, + "logits/rejected": 13.408405303955078, + "logps/chosen": -318.7335205078125, + "logps/rejected": -346.5426025390625, + "loss": 0.5911, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.3901365399360657, + "rewards/margins": 0.24204924702644348, + "rewards/rejected": 0.1480872929096222, + "step": 1084 + }, + { + "epoch": 0.16779431664411368, + "grad_norm": 5.866950988769531, + "learning_rate": 2.79639175257732e-06, + "logits/chosen": 11.779134750366211, + "logits/rejected": 8.920845031738281, + "logps/chosen": -418.793212890625, + "logps/rejected": -367.96392822265625, + "loss": 0.6016, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.44371214509010315, + "rewards/margins": 0.2046000063419342, + "rewards/rejected": 0.23911209404468536, + "step": 1085 + }, + { + "epoch": 0.16794896578387783, + "grad_norm": 9.693355560302734, + "learning_rate": 2.798969072164949e-06, + "logits/chosen": 7.844634056091309, + "logits/rejected": 2.2951765060424805, + "logps/chosen": -282.4667663574219, + "logps/rejected": -279.37127685546875, + "loss": 0.6956, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.3938562273979187, + "rewards/margins": 0.016431525349617004, + "rewards/rejected": 0.3774247467517853, + "step": 1086 + }, + { + "epoch": 0.16810361492364198, + "grad_norm": 4.724522590637207, + "learning_rate": 2.801546391752578e-06, + "logits/chosen": 10.812071800231934, + "logits/rejected": 5.6684064865112305, + "logps/chosen": -259.1080627441406, + "logps/rejected": -203.34054565429688, + "loss": 0.6614, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.27513280510902405, + "rewards/margins": 0.09737339615821838, + "rewards/rejected": 0.17775940895080566, + "step": 1087 + }, + { + "epoch": 0.16825826406340616, + "grad_norm": 7.932330131530762, + "learning_rate": 2.8041237113402062e-06, + "logits/chosen": 14.963586807250977, + "logits/rejected": 7.2815446853637695, + "logps/chosen": -418.70245361328125, + "logps/rejected": -304.7147521972656, + "loss": 0.7348, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.21449965238571167, + "rewards/margins": -0.05291939526796341, + "rewards/rejected": 0.2674190402030945, + "step": 1088 + }, + { + "epoch": 0.1684129132031703, + "grad_norm": 4.958596229553223, + "learning_rate": 2.806701030927835e-06, + "logits/chosen": 14.71570873260498, + "logits/rejected": 12.743423461914062, + "logps/chosen": -280.58978271484375, + "logps/rejected": -268.813232421875, + "loss": 0.7072, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.40674203634262085, + "rewards/margins": -0.01915750280022621, + "rewards/rejected": 0.4258995056152344, + "step": 1089 + }, + { + "epoch": 0.16856756234293446, + "grad_norm": 4.17123556137085, + "learning_rate": 2.809278350515464e-06, + "logits/chosen": 9.626422882080078, + "logits/rejected": 11.575504302978516, + "logps/chosen": -164.32858276367188, + "logps/rejected": -125.93989562988281, + "loss": 0.7044, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.19911348819732666, + "rewards/margins": -0.016457343474030495, + "rewards/rejected": 0.2155708372592926, + "step": 1090 + }, + { + "epoch": 0.16872221148269861, + "grad_norm": 5.6398396492004395, + "learning_rate": 2.811855670103093e-06, + "logits/chosen": 9.416651725769043, + "logits/rejected": 11.358421325683594, + "logps/chosen": -247.67654418945312, + "logps/rejected": -291.9112548828125, + "loss": 0.7478, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.2780582308769226, + "rewards/margins": -0.07273931801319122, + "rewards/rejected": 0.350797563791275, + "step": 1091 + }, + { + "epoch": 0.1688768606224628, + "grad_norm": 5.406024932861328, + "learning_rate": 2.8144329896907217e-06, + "logits/chosen": 5.060151100158691, + "logits/rejected": 10.967604637145996, + "logps/chosen": -271.1733093261719, + "logps/rejected": -326.7130432128906, + "loss": 0.6533, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.5075684189796448, + "rewards/margins": 0.10499919950962067, + "rewards/rejected": 0.4025692045688629, + "step": 1092 + }, + { + "epoch": 0.16903150976222694, + "grad_norm": 3.9346923828125, + "learning_rate": 2.8170103092783506e-06, + "logits/chosen": 12.876140594482422, + "logits/rejected": 8.337103843688965, + "logps/chosen": -301.7906494140625, + "logps/rejected": -180.44461059570312, + "loss": 0.5736, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.36086779832839966, + "rewards/margins": 0.26002708077430725, + "rewards/rejected": 0.10084071010351181, + "step": 1093 + }, + { + "epoch": 0.1691861589019911, + "grad_norm": 6.346408367156982, + "learning_rate": 2.8195876288659795e-06, + "logits/chosen": 6.796268939971924, + "logits/rejected": 2.912069320678711, + "logps/chosen": -311.06317138671875, + "logps/rejected": -282.170654296875, + "loss": 0.696, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.3297578692436218, + "rewards/margins": 0.01216554269194603, + "rewards/rejected": 0.3175923228263855, + "step": 1094 + }, + { + "epoch": 0.16934080804175528, + "grad_norm": 5.333662033081055, + "learning_rate": 2.8221649484536083e-06, + "logits/chosen": 10.003667831420898, + "logits/rejected": 5.769536972045898, + "logps/chosen": -159.35244750976562, + "logps/rejected": -166.38031005859375, + "loss": 0.7419, + "rewards/accuracies": 0.125, + "rewards/chosen": 0.22861595451831818, + "rewards/margins": -0.09019675105810165, + "rewards/rejected": 0.31881269812583923, + "step": 1095 + }, + { + "epoch": 0.16949545718151943, + "grad_norm": 4.490705490112305, + "learning_rate": 2.8247422680412372e-06, + "logits/chosen": 13.215551376342773, + "logits/rejected": 6.94949197769165, + "logps/chosen": -136.20623779296875, + "logps/rejected": -99.8686294555664, + "loss": 0.6786, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.18240973353385925, + "rewards/margins": 0.03176534175872803, + "rewards/rejected": 0.15064439177513123, + "step": 1096 + }, + { + "epoch": 0.16965010632128358, + "grad_norm": 4.48181676864624, + "learning_rate": 2.827319587628866e-06, + "logits/chosen": 15.186075210571289, + "logits/rejected": 5.635807991027832, + "logps/chosen": -359.2030029296875, + "logps/rejected": -208.1219024658203, + "loss": 0.6312, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.3525662422180176, + "rewards/margins": 0.13948187232017517, + "rewards/rejected": 0.2130843997001648, + "step": 1097 + }, + { + "epoch": 0.16980475546104776, + "grad_norm": 5.749701976776123, + "learning_rate": 2.829896907216495e-06, + "logits/chosen": 9.422935485839844, + "logits/rejected": 9.543334007263184, + "logps/chosen": -287.0223388671875, + "logps/rejected": -192.75335693359375, + "loss": 0.7258, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.2940881848335266, + "rewards/margins": -0.041253600269556046, + "rewards/rejected": 0.33534178137779236, + "step": 1098 + }, + { + "epoch": 0.1699594046008119, + "grad_norm": 5.74893856048584, + "learning_rate": 2.832474226804124e-06, + "logits/chosen": 6.8382568359375, + "logits/rejected": 8.938178062438965, + "logps/chosen": -209.59371948242188, + "logps/rejected": -229.0655517578125, + "loss": 0.6306, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.24713760614395142, + "rewards/margins": 0.15542274713516235, + "rewards/rejected": 0.09171485900878906, + "step": 1099 + }, + { + "epoch": 0.17011405374057606, + "grad_norm": 6.221514701843262, + "learning_rate": 2.8350515463917527e-06, + "logits/chosen": 8.15424633026123, + "logits/rejected": 4.358251571655273, + "logps/chosen": -324.70751953125, + "logps/rejected": -235.7432861328125, + "loss": 0.7359, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.27727362513542175, + "rewards/margins": -0.07042375206947327, + "rewards/rejected": 0.3476974070072174, + "step": 1100 + }, + { + "epoch": 0.17026870288034024, + "grad_norm": 8.049890518188477, + "learning_rate": 2.8376288659793816e-06, + "logits/chosen": 14.120906829833984, + "logits/rejected": 1.464968204498291, + "logps/chosen": -474.0608825683594, + "logps/rejected": -195.5423583984375, + "loss": 0.641, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.2594659924507141, + "rewards/margins": 0.14085495471954346, + "rewards/rejected": 0.11861105263233185, + "step": 1101 + }, + { + "epoch": 0.1704233520201044, + "grad_norm": 6.466252326965332, + "learning_rate": 2.8402061855670104e-06, + "logits/chosen": 9.82530689239502, + "logits/rejected": 6.830409049987793, + "logps/chosen": -265.00286865234375, + "logps/rejected": -204.07083129882812, + "loss": 0.7323, + "rewards/accuracies": 0.25, + "rewards/chosen": 0.14010316133499146, + "rewards/margins": -0.0686800479888916, + "rewards/rejected": 0.20878319442272186, + "step": 1102 + }, + { + "epoch": 0.17057800115986854, + "grad_norm": 6.551792144775391, + "learning_rate": 2.8427835051546393e-06, + "logits/chosen": 4.779054641723633, + "logits/rejected": 9.211370468139648, + "logps/chosen": -218.4180145263672, + "logps/rejected": -287.0144958496094, + "loss": 0.7409, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.28126710653305054, + "rewards/margins": -0.07243074476718903, + "rewards/rejected": 0.35369786620140076, + "step": 1103 + }, + { + "epoch": 0.17073265029963272, + "grad_norm": 7.692407131195068, + "learning_rate": 2.845360824742268e-06, + "logits/chosen": 3.636638641357422, + "logits/rejected": 4.813644886016846, + "logps/chosen": -321.5621643066406, + "logps/rejected": -285.57684326171875, + "loss": 0.5944, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.41143059730529785, + "rewards/margins": 0.21557457745075226, + "rewards/rejected": 0.1958560347557068, + "step": 1104 + }, + { + "epoch": 0.17088729943939687, + "grad_norm": 4.704918384552002, + "learning_rate": 2.847938144329897e-06, + "logits/chosen": 12.658109664916992, + "logits/rejected": 3.816486120223999, + "logps/chosen": -253.438232421875, + "logps/rejected": -136.7063751220703, + "loss": 0.6994, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.23000502586364746, + "rewards/margins": -0.010163930244743824, + "rewards/rejected": 0.240168958902359, + "step": 1105 + }, + { + "epoch": 0.17104194857916102, + "grad_norm": 5.638489246368408, + "learning_rate": 2.850515463917526e-06, + "logits/chosen": 8.123842239379883, + "logits/rejected": 9.553081512451172, + "logps/chosen": -294.9290466308594, + "logps/rejected": -389.6015930175781, + "loss": 0.6294, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.38644325733184814, + "rewards/margins": 0.192067950963974, + "rewards/rejected": 0.19437527656555176, + "step": 1106 + }, + { + "epoch": 0.17119659771892518, + "grad_norm": 4.879226207733154, + "learning_rate": 2.853092783505155e-06, + "logits/chosen": 8.909822463989258, + "logits/rejected": 2.842954158782959, + "logps/chosen": -259.98614501953125, + "logps/rejected": -186.7456817626953, + "loss": 0.6814, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.3965057134628296, + "rewards/margins": 0.040973421186208725, + "rewards/rejected": 0.35553228855133057, + "step": 1107 + }, + { + "epoch": 0.17135124685868935, + "grad_norm": 4.824392795562744, + "learning_rate": 2.8556701030927837e-06, + "logits/chosen": 13.868636131286621, + "logits/rejected": 5.040534019470215, + "logps/chosen": -368.81231689453125, + "logps/rejected": -231.47988891601562, + "loss": 0.6243, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.3407599925994873, + "rewards/margins": 0.14472103118896484, + "rewards/rejected": 0.19603893160820007, + "step": 1108 + }, + { + "epoch": 0.1715058959984535, + "grad_norm": 3.7489826679229736, + "learning_rate": 2.8582474226804125e-06, + "logits/chosen": 7.300200462341309, + "logits/rejected": 10.822530746459961, + "logps/chosen": -109.24542236328125, + "logps/rejected": -153.12359619140625, + "loss": 0.7127, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.27764689922332764, + "rewards/margins": -0.02867947146296501, + "rewards/rejected": 0.30632635951042175, + "step": 1109 + }, + { + "epoch": 0.17166054513821766, + "grad_norm": 4.981750011444092, + "learning_rate": 2.8608247422680414e-06, + "logits/chosen": 6.557028293609619, + "logits/rejected": 8.688417434692383, + "logps/chosen": -260.2157287597656, + "logps/rejected": -247.9229736328125, + "loss": 0.6951, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.36442968249320984, + "rewards/margins": 0.024903878569602966, + "rewards/rejected": 0.3395257890224457, + "step": 1110 + }, + { + "epoch": 0.17181519427798184, + "grad_norm": 4.8829026222229, + "learning_rate": 2.8634020618556703e-06, + "logits/chosen": 11.387208938598633, + "logits/rejected": 15.943961143493652, + "logps/chosen": -182.64720153808594, + "logps/rejected": -205.38931274414062, + "loss": 0.7763, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.16148997843265533, + "rewards/margins": -0.13857947289943695, + "rewards/rejected": 0.3000694513320923, + "step": 1111 + }, + { + "epoch": 0.171969843417746, + "grad_norm": 6.7271647453308105, + "learning_rate": 2.865979381443299e-06, + "logits/chosen": 10.159669876098633, + "logits/rejected": 5.57767391204834, + "logps/chosen": -355.318603515625, + "logps/rejected": -253.44464111328125, + "loss": 0.6631, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.4220062792301178, + "rewards/margins": 0.06916923820972443, + "rewards/rejected": 0.3528370261192322, + "step": 1112 + }, + { + "epoch": 0.17212449255751014, + "grad_norm": 8.745165824890137, + "learning_rate": 2.868556701030928e-06, + "logits/chosen": 5.87391471862793, + "logits/rejected": 12.86990737915039, + "logps/chosen": -257.3597106933594, + "logps/rejected": -343.64068603515625, + "loss": 0.6151, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.3618261218070984, + "rewards/margins": 0.18963623046875, + "rewards/rejected": 0.17218990623950958, + "step": 1113 + }, + { + "epoch": 0.17227914169727432, + "grad_norm": 6.6859636306762695, + "learning_rate": 2.871134020618557e-06, + "logits/chosen": 10.206993103027344, + "logits/rejected": 9.294522285461426, + "logps/chosen": -225.59593200683594, + "logps/rejected": -213.65155029296875, + "loss": 0.7244, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.2583962678909302, + "rewards/margins": -0.0537845604121685, + "rewards/rejected": 0.3121808171272278, + "step": 1114 + }, + { + "epoch": 0.17243379083703847, + "grad_norm": 5.0290846824646, + "learning_rate": 2.8737113402061858e-06, + "logits/chosen": 13.735154151916504, + "logits/rejected": 10.959716796875, + "logps/chosen": -353.849853515625, + "logps/rejected": -318.1784362792969, + "loss": 0.6468, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.5507327914237976, + "rewards/margins": 0.10231511294841766, + "rewards/rejected": 0.44841769337654114, + "step": 1115 + }, + { + "epoch": 0.17258843997680262, + "grad_norm": 6.560153961181641, + "learning_rate": 2.8762886597938146e-06, + "logits/chosen": 14.087587356567383, + "logits/rejected": 8.3621826171875, + "logps/chosen": -329.96136474609375, + "logps/rejected": -228.5741424560547, + "loss": 0.78, + "rewards/accuracies": 0.25, + "rewards/chosen": 0.263899564743042, + "rewards/margins": -0.1511520892381668, + "rewards/rejected": 0.4150516390800476, + "step": 1116 + }, + { + "epoch": 0.1727430891165668, + "grad_norm": 6.531588554382324, + "learning_rate": 2.878865979381443e-06, + "logits/chosen": 7.251614093780518, + "logits/rejected": 8.09341812133789, + "logps/chosen": -284.9375305175781, + "logps/rejected": -341.9988098144531, + "loss": 0.6242, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.3623020350933075, + "rewards/margins": 0.14683209359645844, + "rewards/rejected": 0.21546995639801025, + "step": 1117 + }, + { + "epoch": 0.17289773825633095, + "grad_norm": 5.873194694519043, + "learning_rate": 2.881443298969072e-06, + "logits/chosen": 10.672133445739746, + "logits/rejected": 6.375923156738281, + "logps/chosen": -315.1903076171875, + "logps/rejected": -234.17271423339844, + "loss": 0.713, + "rewards/accuracies": 0.25, + "rewards/chosen": 0.359017938375473, + "rewards/margins": -0.000476837158203125, + "rewards/rejected": 0.35949477553367615, + "step": 1118 + }, + { + "epoch": 0.1730523873960951, + "grad_norm": 5.246539115905762, + "learning_rate": 2.884020618556701e-06, + "logits/chosen": 6.875603675842285, + "logits/rejected": 9.460760116577148, + "logps/chosen": -220.30201721191406, + "logps/rejected": -241.61851501464844, + "loss": 0.7387, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.3561538755893707, + "rewards/margins": -0.05357912927865982, + "rewards/rejected": 0.40973299741744995, + "step": 1119 + }, + { + "epoch": 0.17320703653585928, + "grad_norm": 3.860882043838501, + "learning_rate": 2.8865979381443297e-06, + "logits/chosen": 8.976292610168457, + "logits/rejected": 10.67917537689209, + "logps/chosen": -201.78549194335938, + "logps/rejected": -191.02020263671875, + "loss": 0.7099, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.34864211082458496, + "rewards/margins": -0.011206664144992828, + "rewards/rejected": 0.3598487675189972, + "step": 1120 + }, + { + "epoch": 0.17336168567562343, + "grad_norm": 4.3847880363464355, + "learning_rate": 2.8891752577319586e-06, + "logits/chosen": 9.891502380371094, + "logits/rejected": 1.0228139162063599, + "logps/chosen": -268.3402099609375, + "logps/rejected": -168.21463012695312, + "loss": 0.6232, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.38970279693603516, + "rewards/margins": 0.18353596329689026, + "rewards/rejected": 0.2061668336391449, + "step": 1121 + }, + { + "epoch": 0.17351633481538759, + "grad_norm": 5.330456733703613, + "learning_rate": 2.8917525773195883e-06, + "logits/chosen": 10.398155212402344, + "logits/rejected": 12.598572731018066, + "logps/chosen": -226.34432983398438, + "logps/rejected": -263.0143127441406, + "loss": 0.7005, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.2910397946834564, + "rewards/margins": 0.01652168482542038, + "rewards/rejected": 0.27451813220977783, + "step": 1122 + }, + { + "epoch": 0.17367098395515174, + "grad_norm": 4.6225810050964355, + "learning_rate": 2.894329896907217e-06, + "logits/chosen": 15.732912063598633, + "logits/rejected": 6.839818477630615, + "logps/chosen": -323.67706298828125, + "logps/rejected": -197.86285400390625, + "loss": 0.5815, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.5045099258422852, + "rewards/margins": 0.2553408741950989, + "rewards/rejected": 0.24916905164718628, + "step": 1123 + }, + { + "epoch": 0.17382563309491592, + "grad_norm": 5.172116756439209, + "learning_rate": 2.896907216494846e-06, + "logits/chosen": 6.7666850090026855, + "logits/rejected": 10.524429321289062, + "logps/chosen": -238.834228515625, + "logps/rejected": -263.2930908203125, + "loss": 0.6824, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.2878759205341339, + "rewards/margins": 0.0442294105887413, + "rewards/rejected": 0.24364647269248962, + "step": 1124 + }, + { + "epoch": 0.17398028223468007, + "grad_norm": 6.477171897888184, + "learning_rate": 2.899484536082475e-06, + "logits/chosen": 8.801326751708984, + "logits/rejected": 10.45910930633545, + "logps/chosen": -305.6610412597656, + "logps/rejected": -239.3955535888672, + "loss": 0.7493, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.3966098129749298, + "rewards/margins": -0.09666098654270172, + "rewards/rejected": 0.4932708144187927, + "step": 1125 + }, + { + "epoch": 0.17413493137444422, + "grad_norm": 11.301116943359375, + "learning_rate": 2.9020618556701034e-06, + "logits/chosen": 2.8457140922546387, + "logits/rejected": 3.8694403171539307, + "logps/chosen": -293.6243591308594, + "logps/rejected": -235.6193084716797, + "loss": 0.6714, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.34122511744499207, + "rewards/margins": 0.07552716135978699, + "rewards/rejected": 0.2656979560852051, + "step": 1126 + }, + { + "epoch": 0.1742895805142084, + "grad_norm": 12.629762649536133, + "learning_rate": 2.9046391752577322e-06, + "logits/chosen": 5.864099502563477, + "logits/rejected": 4.938446521759033, + "logps/chosen": -306.47955322265625, + "logps/rejected": -223.91578674316406, + "loss": 0.6425, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.36849290132522583, + "rewards/margins": 0.11496633291244507, + "rewards/rejected": 0.2535265386104584, + "step": 1127 + }, + { + "epoch": 0.17444422965397255, + "grad_norm": 5.387455463409424, + "learning_rate": 2.907216494845361e-06, + "logits/chosen": 11.163328170776367, + "logits/rejected": 9.876559257507324, + "logps/chosen": -310.73931884765625, + "logps/rejected": -248.8851318359375, + "loss": 0.675, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.45418691635131836, + "rewards/margins": 0.06985370814800262, + "rewards/rejected": 0.38433319330215454, + "step": 1128 + }, + { + "epoch": 0.1745988787937367, + "grad_norm": 5.634506702423096, + "learning_rate": 2.90979381443299e-06, + "logits/chosen": 6.626094818115234, + "logits/rejected": 4.87941837310791, + "logps/chosen": -353.708984375, + "logps/rejected": -228.18756103515625, + "loss": 0.613, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.47076427936553955, + "rewards/margins": 0.22619284689426422, + "rewards/rejected": 0.24457146227359772, + "step": 1129 + }, + { + "epoch": 0.17475352793350088, + "grad_norm": 6.415378093719482, + "learning_rate": 2.912371134020619e-06, + "logits/chosen": 11.546716690063477, + "logits/rejected": 11.263907432556152, + "logps/chosen": -309.4408874511719, + "logps/rejected": -282.27777099609375, + "loss": 0.702, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.3375720977783203, + "rewards/margins": 0.0003494769334793091, + "rewards/rejected": 0.3372226357460022, + "step": 1130 + }, + { + "epoch": 0.17490817707326503, + "grad_norm": 5.494746685028076, + "learning_rate": 2.9149484536082477e-06, + "logits/chosen": 10.173686981201172, + "logits/rejected": 8.87675666809082, + "logps/chosen": -261.99468994140625, + "logps/rejected": -225.0587158203125, + "loss": 0.7565, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.2946622967720032, + "rewards/margins": -0.10347646474838257, + "rewards/rejected": 0.39813879132270813, + "step": 1131 + }, + { + "epoch": 0.17506282621302918, + "grad_norm": 5.844377517700195, + "learning_rate": 2.9175257731958766e-06, + "logits/chosen": 3.5838165283203125, + "logits/rejected": 6.299263954162598, + "logps/chosen": -200.82269287109375, + "logps/rejected": -343.9166259765625, + "loss": 0.7216, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.1865125596523285, + "rewards/margins": -0.0347355417907238, + "rewards/rejected": 0.22124812006950378, + "step": 1132 + }, + { + "epoch": 0.17521747535279336, + "grad_norm": 5.7626872062683105, + "learning_rate": 2.9201030927835055e-06, + "logits/chosen": 10.046320915222168, + "logits/rejected": 8.99346923828125, + "logps/chosen": -319.71990966796875, + "logps/rejected": -292.3989562988281, + "loss": 0.7177, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.343557745218277, + "rewards/margins": -0.04012775793671608, + "rewards/rejected": 0.38368549942970276, + "step": 1133 + }, + { + "epoch": 0.1753721244925575, + "grad_norm": 4.6312079429626465, + "learning_rate": 2.9226804123711343e-06, + "logits/chosen": 6.430576801300049, + "logits/rejected": 7.720532417297363, + "logps/chosen": -159.00131225585938, + "logps/rejected": -147.68310546875, + "loss": 0.7829, + "rewards/accuracies": 0.25, + "rewards/chosen": 0.22012931108474731, + "rewards/margins": -0.1513059437274933, + "rewards/rejected": 0.3714352250099182, + "step": 1134 + }, + { + "epoch": 0.17552677363232166, + "grad_norm": 7.163835048675537, + "learning_rate": 2.9252577319587632e-06, + "logits/chosen": 3.8095626831054688, + "logits/rejected": 6.109020709991455, + "logps/chosen": -269.7215881347656, + "logps/rejected": -268.4971008300781, + "loss": 0.7648, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.30219149589538574, + "rewards/margins": -0.11694401502609253, + "rewards/rejected": 0.41913551092147827, + "step": 1135 + }, + { + "epoch": 0.17568142277208584, + "grad_norm": 6.825829982757568, + "learning_rate": 2.927835051546392e-06, + "logits/chosen": 10.532018661499023, + "logits/rejected": 9.871192932128906, + "logps/chosen": -245.007080078125, + "logps/rejected": -341.94451904296875, + "loss": 0.8056, + "rewards/accuracies": 0.25, + "rewards/chosen": 0.23278719186782837, + "rewards/margins": -0.19790929555892944, + "rewards/rejected": 0.4306964874267578, + "step": 1136 + }, + { + "epoch": 0.17583607191185, + "grad_norm": 4.912806987762451, + "learning_rate": 2.930412371134021e-06, + "logits/chosen": 12.713714599609375, + "logits/rejected": 3.8755369186401367, + "logps/chosen": -305.22222900390625, + "logps/rejected": -143.66517639160156, + "loss": 0.6406, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.3633865714073181, + "rewards/margins": 0.12963023781776428, + "rewards/rejected": 0.23375634849071503, + "step": 1137 + }, + { + "epoch": 0.17599072105161415, + "grad_norm": 4.775628089904785, + "learning_rate": 2.93298969072165e-06, + "logits/chosen": 7.143723487854004, + "logits/rejected": 2.2544302940368652, + "logps/chosen": -290.8276672363281, + "logps/rejected": -177.09397888183594, + "loss": 0.6548, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.3753347098827362, + "rewards/margins": 0.08654476702213287, + "rewards/rejected": 0.28878992795944214, + "step": 1138 + }, + { + "epoch": 0.1761453701913783, + "grad_norm": 5.060400485992432, + "learning_rate": 2.9355670103092787e-06, + "logits/chosen": 5.091097354888916, + "logits/rejected": 11.51400375366211, + "logps/chosen": -189.26113891601562, + "logps/rejected": -309.8868408203125, + "loss": 0.708, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.37924912571907043, + "rewards/margins": -0.02081029862165451, + "rewards/rejected": 0.40005940198898315, + "step": 1139 + }, + { + "epoch": 0.17630001933114248, + "grad_norm": 7.438137531280518, + "learning_rate": 2.9381443298969076e-06, + "logits/chosen": 9.425956726074219, + "logits/rejected": 6.930305480957031, + "logps/chosen": -290.3516540527344, + "logps/rejected": -205.43338012695312, + "loss": 0.666, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.3329712152481079, + "rewards/margins": 0.06372203677892685, + "rewards/rejected": 0.2692491412162781, + "step": 1140 + }, + { + "epoch": 0.17645466847090663, + "grad_norm": 8.290541648864746, + "learning_rate": 2.9407216494845364e-06, + "logits/chosen": 13.327945709228516, + "logits/rejected": 10.471671104431152, + "logps/chosen": -341.0982971191406, + "logps/rejected": -255.54840087890625, + "loss": 0.6337, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.4476722776889801, + "rewards/margins": 0.1451585739850998, + "rewards/rejected": 0.3025137186050415, + "step": 1141 + }, + { + "epoch": 0.17660931761067078, + "grad_norm": 7.709554672241211, + "learning_rate": 2.9432989690721653e-06, + "logits/chosen": 11.955734252929688, + "logits/rejected": 5.034045219421387, + "logps/chosen": -382.2080993652344, + "logps/rejected": -312.8702392578125, + "loss": 0.6873, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.3577193319797516, + "rewards/margins": 0.031980231404304504, + "rewards/rejected": 0.3257391154766083, + "step": 1142 + }, + { + "epoch": 0.17676396675043496, + "grad_norm": 4.033751010894775, + "learning_rate": 2.945876288659794e-06, + "logits/chosen": 12.809724807739258, + "logits/rejected": 10.267937660217285, + "logps/chosen": -276.56463623046875, + "logps/rejected": -139.99610900878906, + "loss": 0.6773, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.3085514307022095, + "rewards/margins": 0.037268564105033875, + "rewards/rejected": 0.2712828516960144, + "step": 1143 + }, + { + "epoch": 0.1769186158901991, + "grad_norm": 6.057156562805176, + "learning_rate": 2.948453608247423e-06, + "logits/chosen": 8.920923233032227, + "logits/rejected": 6.629421234130859, + "logps/chosen": -249.32882690429688, + "logps/rejected": -200.7079620361328, + "loss": 0.7066, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.29187577962875366, + "rewards/margins": -0.02302558720111847, + "rewards/rejected": 0.31490135192871094, + "step": 1144 + }, + { + "epoch": 0.17707326502996326, + "grad_norm": 5.577688694000244, + "learning_rate": 2.951030927835052e-06, + "logits/chosen": 7.342004299163818, + "logits/rejected": 5.811574459075928, + "logps/chosen": -296.17120361328125, + "logps/rejected": -296.16351318359375, + "loss": 0.7219, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.4578924775123596, + "rewards/margins": -0.040445707738399506, + "rewards/rejected": 0.4983382225036621, + "step": 1145 + }, + { + "epoch": 0.17722791416972744, + "grad_norm": 4.61027193069458, + "learning_rate": 2.953608247422681e-06, + "logits/chosen": 9.528794288635254, + "logits/rejected": 8.37557315826416, + "logps/chosen": -165.78585815429688, + "logps/rejected": -181.5104522705078, + "loss": 0.6326, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.379351407289505, + "rewards/margins": 0.15150676667690277, + "rewards/rejected": 0.22784467041492462, + "step": 1146 + }, + { + "epoch": 0.1773825633094916, + "grad_norm": 6.47112512588501, + "learning_rate": 2.9561855670103097e-06, + "logits/chosen": 11.794828414916992, + "logits/rejected": 10.370773315429688, + "logps/chosen": -407.6488342285156, + "logps/rejected": -350.3975830078125, + "loss": 0.6214, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.5501800775527954, + "rewards/margins": 0.1756875067949295, + "rewards/rejected": 0.3744925558567047, + "step": 1147 + }, + { + "epoch": 0.17753721244925574, + "grad_norm": 7.51181697845459, + "learning_rate": 2.9587628865979385e-06, + "logits/chosen": 14.049497604370117, + "logits/rejected": 11.448801040649414, + "logps/chosen": -395.26617431640625, + "logps/rejected": -311.9085388183594, + "loss": 0.6729, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.5568580627441406, + "rewards/margins": 0.08066901564598083, + "rewards/rejected": 0.4761890470981598, + "step": 1148 + }, + { + "epoch": 0.17769186158901992, + "grad_norm": 4.4073967933654785, + "learning_rate": 2.9613402061855674e-06, + "logits/chosen": 8.318864822387695, + "logits/rejected": 3.232300043106079, + "logps/chosen": -328.7777099609375, + "logps/rejected": -202.08578491210938, + "loss": 0.6601, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.42732587456703186, + "rewards/margins": 0.08265604823827744, + "rewards/rejected": 0.34466981887817383, + "step": 1149 + }, + { + "epoch": 0.17784651072878407, + "grad_norm": 6.041601181030273, + "learning_rate": 2.9639175257731963e-06, + "logits/chosen": 8.695036888122559, + "logits/rejected": 11.523868560791016, + "logps/chosen": -244.824951171875, + "logps/rejected": -202.55184936523438, + "loss": 0.7253, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.3459719717502594, + "rewards/margins": -0.03434457629919052, + "rewards/rejected": 0.38031652569770813, + "step": 1150 + }, + { + "epoch": 0.17800115986854823, + "grad_norm": 5.251492977142334, + "learning_rate": 2.966494845360825e-06, + "logits/chosen": 9.244641304016113, + "logits/rejected": 16.107168197631836, + "logps/chosen": -190.07156372070312, + "logps/rejected": -262.0315856933594, + "loss": 0.6743, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.31787556409835815, + "rewards/margins": 0.0451195053756237, + "rewards/rejected": 0.27275604009628296, + "step": 1151 + }, + { + "epoch": 0.1781558090083124, + "grad_norm": 5.284173488616943, + "learning_rate": 2.969072164948454e-06, + "logits/chosen": 5.584968566894531, + "logits/rejected": 10.666986465454102, + "logps/chosen": -219.20596313476562, + "logps/rejected": -246.28956604003906, + "loss": 0.6795, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.3630213737487793, + "rewards/margins": 0.031061813235282898, + "rewards/rejected": 0.3319595754146576, + "step": 1152 + }, + { + "epoch": 0.17831045814807656, + "grad_norm": 5.383566379547119, + "learning_rate": 2.971649484536083e-06, + "logits/chosen": 10.384988784790039, + "logits/rejected": 3.793872833251953, + "logps/chosen": -376.3614501953125, + "logps/rejected": -208.56626892089844, + "loss": 0.6429, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.36733898520469666, + "rewards/margins": 0.11935469508171082, + "rewards/rejected": 0.24798427522182465, + "step": 1153 + }, + { + "epoch": 0.1784651072878407, + "grad_norm": 3.970797538757324, + "learning_rate": 2.9742268041237114e-06, + "logits/chosen": 11.370634078979492, + "logits/rejected": 8.494200706481934, + "logps/chosen": -254.89219665527344, + "logps/rejected": -215.02896118164062, + "loss": 0.6171, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.5302475094795227, + "rewards/margins": 0.1801958680152893, + "rewards/rejected": 0.3500516414642334, + "step": 1154 + }, + { + "epoch": 0.17861975642760486, + "grad_norm": 5.383657455444336, + "learning_rate": 2.9768041237113402e-06, + "logits/chosen": 9.273427963256836, + "logits/rejected": 8.824771881103516, + "logps/chosen": -274.83673095703125, + "logps/rejected": -364.8233947753906, + "loss": 0.6692, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.38373440504074097, + "rewards/margins": 0.07081535458564758, + "rewards/rejected": 0.31291908025741577, + "step": 1155 + }, + { + "epoch": 0.17877440556736904, + "grad_norm": 6.889599323272705, + "learning_rate": 2.979381443298969e-06, + "logits/chosen": 14.462203979492188, + "logits/rejected": 12.723499298095703, + "logps/chosen": -342.1315612792969, + "logps/rejected": -350.3135681152344, + "loss": 0.7387, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.33119669556617737, + "rewards/margins": -0.08083860576152802, + "rewards/rejected": 0.4120352864265442, + "step": 1156 + }, + { + "epoch": 0.1789290547071332, + "grad_norm": 6.282871723175049, + "learning_rate": 2.981958762886598e-06, + "logits/chosen": 6.331623077392578, + "logits/rejected": 8.313983917236328, + "logps/chosen": -333.8280029296875, + "logps/rejected": -313.6961364746094, + "loss": 0.6366, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.6078802943229675, + "rewards/margins": 0.142788365483284, + "rewards/rejected": 0.4650919437408447, + "step": 1157 + }, + { + "epoch": 0.17908370384689734, + "grad_norm": 5.11278772354126, + "learning_rate": 2.984536082474227e-06, + "logits/chosen": 9.76057243347168, + "logits/rejected": 7.916048049926758, + "logps/chosen": -324.56512451171875, + "logps/rejected": -254.02684020996094, + "loss": 0.6605, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.39771729707717896, + "rewards/margins": 0.11197453737258911, + "rewards/rejected": 0.28574275970458984, + "step": 1158 + }, + { + "epoch": 0.17923835298666152, + "grad_norm": 4.654809474945068, + "learning_rate": 2.9871134020618557e-06, + "logits/chosen": 18.386581420898438, + "logits/rejected": 5.525882244110107, + "logps/chosen": -347.45355224609375, + "logps/rejected": -181.9476318359375, + "loss": 0.6651, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.3984912633895874, + "rewards/margins": 0.0668419897556305, + "rewards/rejected": 0.3316493332386017, + "step": 1159 + }, + { + "epoch": 0.17939300212642567, + "grad_norm": 4.14867639541626, + "learning_rate": 2.9896907216494846e-06, + "logits/chosen": 6.37682580947876, + "logits/rejected": 5.399381637573242, + "logps/chosen": -189.04971313476562, + "logps/rejected": -172.76202392578125, + "loss": 0.6191, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.4838985800743103, + "rewards/margins": 0.16113080084323883, + "rewards/rejected": 0.3227677643299103, + "step": 1160 + }, + { + "epoch": 0.17954765126618982, + "grad_norm": 3.794466495513916, + "learning_rate": 2.9922680412371135e-06, + "logits/chosen": 15.56403923034668, + "logits/rejected": 8.906413078308105, + "logps/chosen": -182.099853515625, + "logps/rejected": -131.7032928466797, + "loss": 0.6389, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.32996755838394165, + "rewards/margins": 0.13160750269889832, + "rewards/rejected": 0.19836005568504333, + "step": 1161 + }, + { + "epoch": 0.179702300405954, + "grad_norm": 4.580986022949219, + "learning_rate": 2.9948453608247423e-06, + "logits/chosen": 11.547233581542969, + "logits/rejected": 13.655698776245117, + "logps/chosen": -212.57530212402344, + "logps/rejected": -214.5176239013672, + "loss": 0.7137, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.3403546214103699, + "rewards/margins": -0.02418498322367668, + "rewards/rejected": 0.36453962326049805, + "step": 1162 + }, + { + "epoch": 0.17985694954571815, + "grad_norm": 5.870129108428955, + "learning_rate": 2.997422680412371e-06, + "logits/chosen": 15.221019744873047, + "logits/rejected": 8.538546562194824, + "logps/chosen": -321.4717102050781, + "logps/rejected": -206.94219970703125, + "loss": 0.7296, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.18510447442531586, + "rewards/margins": -0.06214247643947601, + "rewards/rejected": 0.24724695086479187, + "step": 1163 + }, + { + "epoch": 0.1800115986854823, + "grad_norm": 4.725268363952637, + "learning_rate": 3e-06, + "logits/chosen": 11.91286849975586, + "logits/rejected": 7.065916061401367, + "logps/chosen": -343.61468505859375, + "logps/rejected": -212.65185546875, + "loss": 0.6219, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.47155943512916565, + "rewards/margins": 0.24598152935504913, + "rewards/rejected": 0.22557789087295532, + "step": 1164 + }, + { + "epoch": 0.18016624782524648, + "grad_norm": 4.854676723480225, + "learning_rate": 3.002577319587629e-06, + "logits/chosen": 13.181015968322754, + "logits/rejected": 15.521026611328125, + "logps/chosen": -220.15182495117188, + "logps/rejected": -260.1375427246094, + "loss": 0.6744, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.27434659004211426, + "rewards/margins": 0.04805588722229004, + "rewards/rejected": 0.22629070281982422, + "step": 1165 + }, + { + "epoch": 0.18032089696501064, + "grad_norm": 4.67647647857666, + "learning_rate": 3.005154639175258e-06, + "logits/chosen": 13.731133460998535, + "logits/rejected": 9.85840892791748, + "logps/chosen": -312.1427001953125, + "logps/rejected": -219.77969360351562, + "loss": 0.6634, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.5383355021476746, + "rewards/margins": 0.07219067215919495, + "rewards/rejected": 0.466144859790802, + "step": 1166 + }, + { + "epoch": 0.1804755461047748, + "grad_norm": 4.745318412780762, + "learning_rate": 3.0077319587628867e-06, + "logits/chosen": 9.040209770202637, + "logits/rejected": 10.990416526794434, + "logps/chosen": -316.904541015625, + "logps/rejected": -230.6239471435547, + "loss": 0.6518, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.4587368369102478, + "rewards/margins": 0.09942308813333511, + "rewards/rejected": 0.3593137264251709, + "step": 1167 + }, + { + "epoch": 0.18063019524453897, + "grad_norm": 4.76668643951416, + "learning_rate": 3.0103092783505156e-06, + "logits/chosen": 15.52818489074707, + "logits/rejected": 10.53999137878418, + "logps/chosen": -279.0848388671875, + "logps/rejected": -184.01812744140625, + "loss": 0.6641, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.3073820173740387, + "rewards/margins": 0.07401542365550995, + "rewards/rejected": 0.23336659371852875, + "step": 1168 + }, + { + "epoch": 0.18078484438430312, + "grad_norm": 4.650648593902588, + "learning_rate": 3.0128865979381444e-06, + "logits/chosen": 12.051068305969238, + "logits/rejected": 10.280696868896484, + "logps/chosen": -339.3119812011719, + "logps/rejected": -288.9632873535156, + "loss": 0.5826, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.6108314990997314, + "rewards/margins": 0.28709107637405396, + "rewards/rejected": 0.3237403929233551, + "step": 1169 + }, + { + "epoch": 0.18093949352406727, + "grad_norm": 8.303998947143555, + "learning_rate": 3.0154639175257733e-06, + "logits/chosen": 7.365882873535156, + "logits/rejected": 3.311863899230957, + "logps/chosen": -291.14337158203125, + "logps/rejected": -276.2372741699219, + "loss": 0.7404, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.4151206910610199, + "rewards/margins": -0.07277338951826096, + "rewards/rejected": 0.48789405822753906, + "step": 1170 + }, + { + "epoch": 0.18109414266383142, + "grad_norm": 11.77899169921875, + "learning_rate": 3.018041237113402e-06, + "logits/chosen": 14.795269966125488, + "logits/rejected": 14.246530532836914, + "logps/chosen": -393.1174621582031, + "logps/rejected": -318.1690979003906, + "loss": 0.7647, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.34867727756500244, + "rewards/margins": -0.12560062110424042, + "rewards/rejected": 0.47427788376808167, + "step": 1171 + }, + { + "epoch": 0.1812487918035956, + "grad_norm": 4.764474391937256, + "learning_rate": 3.020618556701031e-06, + "logits/chosen": 15.29426383972168, + "logits/rejected": 10.25067138671875, + "logps/chosen": -290.40289306640625, + "logps/rejected": -196.67935180664062, + "loss": 0.656, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.3114931881427765, + "rewards/margins": 0.12225961685180664, + "rewards/rejected": 0.18923360109329224, + "step": 1172 + }, + { + "epoch": 0.18140344094335975, + "grad_norm": 4.871850967407227, + "learning_rate": 3.02319587628866e-06, + "logits/chosen": 8.374554634094238, + "logits/rejected": 4.219167709350586, + "logps/chosen": -257.68853759765625, + "logps/rejected": -267.280517578125, + "loss": 0.67, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.4222041666507721, + "rewards/margins": 0.06348080933094025, + "rewards/rejected": 0.35872337222099304, + "step": 1173 + }, + { + "epoch": 0.1815580900831239, + "grad_norm": 4.874150276184082, + "learning_rate": 3.025773195876289e-06, + "logits/chosen": 14.099828720092773, + "logits/rejected": 4.365835189819336, + "logps/chosen": -254.84820556640625, + "logps/rejected": -155.9647216796875, + "loss": 0.5837, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.381515234708786, + "rewards/margins": 0.23910629749298096, + "rewards/rejected": 0.14240892231464386, + "step": 1174 + }, + { + "epoch": 0.18171273922288808, + "grad_norm": 6.128303527832031, + "learning_rate": 3.0283505154639177e-06, + "logits/chosen": 5.875522136688232, + "logits/rejected": 7.177199840545654, + "logps/chosen": -165.80751037597656, + "logps/rejected": -208.37188720703125, + "loss": 0.772, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.29461461305618286, + "rewards/margins": -0.13151991367340088, + "rewards/rejected": 0.42613452672958374, + "step": 1175 + }, + { + "epoch": 0.18186738836265223, + "grad_norm": 5.21152400970459, + "learning_rate": 3.0309278350515465e-06, + "logits/chosen": 10.468564987182617, + "logits/rejected": 6.813868522644043, + "logps/chosen": -280.7398681640625, + "logps/rejected": -233.3089599609375, + "loss": 0.655, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.5198568105697632, + "rewards/margins": 0.12020877748727798, + "rewards/rejected": 0.3996480107307434, + "step": 1176 + }, + { + "epoch": 0.18202203750241638, + "grad_norm": 8.711030960083008, + "learning_rate": 3.0335051546391754e-06, + "logits/chosen": 9.909565925598145, + "logits/rejected": 7.333517551422119, + "logps/chosen": -401.7020568847656, + "logps/rejected": -566.3073120117188, + "loss": 0.754, + "rewards/accuracies": 0.25, + "rewards/chosen": 0.3427141606807709, + "rewards/margins": -0.08943916857242584, + "rewards/rejected": 0.4321533143520355, + "step": 1177 + }, + { + "epoch": 0.18217668664218056, + "grad_norm": 5.159784317016602, + "learning_rate": 3.0360824742268043e-06, + "logits/chosen": 5.915364742279053, + "logits/rejected": 5.309688568115234, + "logps/chosen": -226.29086303710938, + "logps/rejected": -190.01730346679688, + "loss": 0.6341, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.46011900901794434, + "rewards/margins": 0.15468966960906982, + "rewards/rejected": 0.3054293394088745, + "step": 1178 + }, + { + "epoch": 0.18233133578194471, + "grad_norm": 4.520188808441162, + "learning_rate": 3.038659793814433e-06, + "logits/chosen": 5.355957984924316, + "logits/rejected": 6.04193639755249, + "logps/chosen": -238.64024353027344, + "logps/rejected": -228.92774963378906, + "loss": 0.7198, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.4141625761985779, + "rewards/margins": -0.03612041845917702, + "rewards/rejected": 0.4502830505371094, + "step": 1179 + }, + { + "epoch": 0.18248598492170887, + "grad_norm": 5.92887544631958, + "learning_rate": 3.041237113402062e-06, + "logits/chosen": 7.3638529777526855, + "logits/rejected": 4.4720258712768555, + "logps/chosen": -288.26507568359375, + "logps/rejected": -205.216064453125, + "loss": 0.6825, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.238226979970932, + "rewards/margins": 0.04762476682662964, + "rewards/rejected": 0.19060221314430237, + "step": 1180 + }, + { + "epoch": 0.18264063406147305, + "grad_norm": 7.217073917388916, + "learning_rate": 3.043814432989691e-06, + "logits/chosen": 10.551058769226074, + "logits/rejected": 4.731176376342773, + "logps/chosen": -338.35711669921875, + "logps/rejected": -216.30242919921875, + "loss": 0.6928, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.4426460266113281, + "rewards/margins": 0.03709197789430618, + "rewards/rejected": 0.40555405616760254, + "step": 1181 + }, + { + "epoch": 0.1827952832012372, + "grad_norm": 6.286205768585205, + "learning_rate": 3.0463917525773198e-06, + "logits/chosen": 8.504984855651855, + "logits/rejected": 4.316731929779053, + "logps/chosen": -219.84622192382812, + "logps/rejected": -222.9134063720703, + "loss": 0.6741, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.38894233107566833, + "rewards/margins": 0.06511226296424866, + "rewards/rejected": 0.3238300681114197, + "step": 1182 + }, + { + "epoch": 0.18294993234100135, + "grad_norm": 4.606963634490967, + "learning_rate": 3.048969072164949e-06, + "logits/chosen": 8.149803161621094, + "logits/rejected": 4.0450053215026855, + "logps/chosen": -265.49542236328125, + "logps/rejected": -237.5177764892578, + "loss": 0.6337, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.6777053475379944, + "rewards/margins": 0.13223743438720703, + "rewards/rejected": 0.5454679131507874, + "step": 1183 + }, + { + "epoch": 0.18310458148076553, + "grad_norm": 5.380277156829834, + "learning_rate": 3.051546391752578e-06, + "logits/chosen": 13.869263648986816, + "logits/rejected": 9.955480575561523, + "logps/chosen": -388.0926818847656, + "logps/rejected": -282.67889404296875, + "loss": 0.604, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.5288920402526855, + "rewards/margins": 0.19544801115989685, + "rewards/rejected": 0.3334440290927887, + "step": 1184 + }, + { + "epoch": 0.18325923062052968, + "grad_norm": 6.7408952713012695, + "learning_rate": 3.054123711340207e-06, + "logits/chosen": 11.916411399841309, + "logits/rejected": 10.430109024047852, + "logps/chosen": -248.553955078125, + "logps/rejected": -242.81214904785156, + "loss": 0.7607, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.27385321259498596, + "rewards/margins": -0.11242671310901642, + "rewards/rejected": 0.3862799108028412, + "step": 1185 + }, + { + "epoch": 0.18341387976029383, + "grad_norm": 9.486915588378906, + "learning_rate": 3.0567010309278357e-06, + "logits/chosen": 11.307969093322754, + "logits/rejected": 9.538265228271484, + "logps/chosen": -348.06982421875, + "logps/rejected": -356.6627197265625, + "loss": 0.7011, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.5180324912071228, + "rewards/margins": 0.012578964233398438, + "rewards/rejected": 0.5054535865783691, + "step": 1186 + }, + { + "epoch": 0.18356852890005798, + "grad_norm": 5.4604716300964355, + "learning_rate": 3.0592783505154646e-06, + "logits/chosen": 7.491862773895264, + "logits/rejected": 10.485586166381836, + "logps/chosen": -236.80398559570312, + "logps/rejected": -288.0054626464844, + "loss": 0.6647, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.4369448721408844, + "rewards/margins": 0.11426103860139847, + "rewards/rejected": 0.32268381118774414, + "step": 1187 + }, + { + "epoch": 0.18372317803982216, + "grad_norm": 24.633495330810547, + "learning_rate": 3.0618556701030934e-06, + "logits/chosen": 18.332578659057617, + "logits/rejected": 9.687328338623047, + "logps/chosen": -360.6594543457031, + "logps/rejected": -236.90896606445312, + "loss": 0.6789, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.4057636260986328, + "rewards/margins": 0.03886289522051811, + "rewards/rejected": 0.3669007122516632, + "step": 1188 + }, + { + "epoch": 0.1838778271795863, + "grad_norm": 5.3053154945373535, + "learning_rate": 3.0644329896907223e-06, + "logits/chosen": 17.353923797607422, + "logits/rejected": 16.056013107299805, + "logps/chosen": -226.23492431640625, + "logps/rejected": -236.54217529296875, + "loss": 0.6326, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.500227689743042, + "rewards/margins": 0.1318841576576233, + "rewards/rejected": 0.3683435618877411, + "step": 1189 + }, + { + "epoch": 0.18403247631935046, + "grad_norm": 8.136807441711426, + "learning_rate": 3.067010309278351e-06, + "logits/chosen": 4.9448747634887695, + "logits/rejected": 9.623682975769043, + "logps/chosen": -205.90737915039062, + "logps/rejected": -155.84613037109375, + "loss": 0.8436, + "rewards/accuracies": 0.25, + "rewards/chosen": 0.2626339793205261, + "rewards/margins": -0.2467816025018692, + "rewards/rejected": 0.5094155669212341, + "step": 1190 + }, + { + "epoch": 0.18418712545911464, + "grad_norm": 4.0371527671813965, + "learning_rate": 3.06958762886598e-06, + "logits/chosen": 3.384756088256836, + "logits/rejected": 5.1296796798706055, + "logps/chosen": -156.49325561523438, + "logps/rejected": -174.53515625, + "loss": 0.6863, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.412746399641037, + "rewards/margins": 0.04412619024515152, + "rewards/rejected": 0.3686201870441437, + "step": 1191 + }, + { + "epoch": 0.1843417745988788, + "grad_norm": 4.892765998840332, + "learning_rate": 3.0721649484536085e-06, + "logits/chosen": 10.358184814453125, + "logits/rejected": 7.822065353393555, + "logps/chosen": -171.96621704101562, + "logps/rejected": -118.33464813232422, + "loss": 0.685, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.39216411113739014, + "rewards/margins": 0.027994196861982346, + "rewards/rejected": 0.3641698956489563, + "step": 1192 + }, + { + "epoch": 0.18449642373864295, + "grad_norm": 21.119739532470703, + "learning_rate": 3.0747422680412374e-06, + "logits/chosen": 12.397688865661621, + "logits/rejected": 11.894867897033691, + "logps/chosen": -306.3863220214844, + "logps/rejected": -322.08416748046875, + "loss": 0.782, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.3565625548362732, + "rewards/margins": -0.12484966218471527, + "rewards/rejected": 0.48141220211982727, + "step": 1193 + }, + { + "epoch": 0.18465107287840712, + "grad_norm": 5.164624214172363, + "learning_rate": 3.0773195876288662e-06, + "logits/chosen": 5.650229454040527, + "logits/rejected": 4.6545562744140625, + "logps/chosen": -207.19813537597656, + "logps/rejected": -236.67950439453125, + "loss": 0.6101, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.6313309073448181, + "rewards/margins": 0.20105575025081635, + "rewards/rejected": 0.43027520179748535, + "step": 1194 + }, + { + "epoch": 0.18480572201817128, + "grad_norm": 10.106759071350098, + "learning_rate": 3.079896907216495e-06, + "logits/chosen": 6.211137294769287, + "logits/rejected": 8.031717300415039, + "logps/chosen": -208.40060424804688, + "logps/rejected": -265.97882080078125, + "loss": 0.6533, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.4110247790813446, + "rewards/margins": 0.1256294846534729, + "rewards/rejected": 0.2853952944278717, + "step": 1195 + }, + { + "epoch": 0.18496037115793543, + "grad_norm": 4.772614479064941, + "learning_rate": 3.082474226804124e-06, + "logits/chosen": 7.721717357635498, + "logits/rejected": 5.436555862426758, + "logps/chosen": -225.6160888671875, + "logps/rejected": -152.51763916015625, + "loss": 0.73, + "rewards/accuracies": 0.125, + "rewards/chosen": 0.38208839297294617, + "rewards/margins": -0.06981611251831055, + "rewards/rejected": 0.4519044756889343, + "step": 1196 + }, + { + "epoch": 0.1851150202976996, + "grad_norm": 5.007623195648193, + "learning_rate": 3.085051546391753e-06, + "logits/chosen": 9.873635292053223, + "logits/rejected": 5.753932476043701, + "logps/chosen": -270.7194519042969, + "logps/rejected": -234.20751953125, + "loss": 0.6565, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.34522420167922974, + "rewards/margins": 0.0871821939945221, + "rewards/rejected": 0.25804200768470764, + "step": 1197 + }, + { + "epoch": 0.18526966943746376, + "grad_norm": 6.744195938110352, + "learning_rate": 3.0876288659793817e-06, + "logits/chosen": 15.802885055541992, + "logits/rejected": 15.260936737060547, + "logps/chosen": -300.5866394042969, + "logps/rejected": -264.56549072265625, + "loss": 0.6644, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.5173041224479675, + "rewards/margins": 0.0647246390581131, + "rewards/rejected": 0.4525794982910156, + "step": 1198 + }, + { + "epoch": 0.1854243185772279, + "grad_norm": 6.954531192779541, + "learning_rate": 3.0902061855670106e-06, + "logits/chosen": 12.912381172180176, + "logits/rejected": 9.995681762695312, + "logps/chosen": -394.2403564453125, + "logps/rejected": -345.6117248535156, + "loss": 0.688, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.49471521377563477, + "rewards/margins": 0.028119433671236038, + "rewards/rejected": 0.46659573912620544, + "step": 1199 + }, + { + "epoch": 0.18557896771699206, + "grad_norm": 6.823832035064697, + "learning_rate": 3.0927835051546395e-06, + "logits/chosen": 5.388941287994385, + "logits/rejected": 7.926120281219482, + "logps/chosen": -224.07943725585938, + "logps/rejected": -279.90966796875, + "loss": 0.7515, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.4284442663192749, + "rewards/margins": -0.0993957370519638, + "rewards/rejected": 0.5278400182723999, + "step": 1200 + }, + { + "epoch": 0.18573361685675624, + "grad_norm": 5.436979293823242, + "learning_rate": 3.0953608247422683e-06, + "logits/chosen": 12.300765991210938, + "logits/rejected": 9.664896011352539, + "logps/chosen": -286.91607666015625, + "logps/rejected": -291.53656005859375, + "loss": 0.6764, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.6235439777374268, + "rewards/margins": 0.048819400370121, + "rewards/rejected": 0.574724555015564, + "step": 1201 + }, + { + "epoch": 0.1858882659965204, + "grad_norm": 6.066813945770264, + "learning_rate": 3.097938144329897e-06, + "logits/chosen": 9.628548622131348, + "logits/rejected": 16.925989151000977, + "logps/chosen": -183.95472717285156, + "logps/rejected": -290.9954833984375, + "loss": 0.7769, + "rewards/accuracies": 0.25, + "rewards/chosen": 0.42231979966163635, + "rewards/margins": -0.1516256332397461, + "rewards/rejected": 0.5739454030990601, + "step": 1202 + }, + { + "epoch": 0.18604291513628454, + "grad_norm": 5.672567844390869, + "learning_rate": 3.100515463917526e-06, + "logits/chosen": 13.347369194030762, + "logits/rejected": 12.573772430419922, + "logps/chosen": -325.94793701171875, + "logps/rejected": -342.54583740234375, + "loss": 0.6911, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.561053991317749, + "rewards/margins": 0.01670600287616253, + "rewards/rejected": 0.5443479418754578, + "step": 1203 + }, + { + "epoch": 0.18619756427604872, + "grad_norm": 5.7790398597717285, + "learning_rate": 3.103092783505155e-06, + "logits/chosen": 7.422928810119629, + "logits/rejected": 6.094414710998535, + "logps/chosen": -242.73480224609375, + "logps/rejected": -189.88925170898438, + "loss": 0.6946, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.36713293194770813, + "rewards/margins": 0.010114848613739014, + "rewards/rejected": 0.3570181131362915, + "step": 1204 + }, + { + "epoch": 0.18635221341581287, + "grad_norm": 7.573155403137207, + "learning_rate": 3.105670103092784e-06, + "logits/chosen": 5.264227867126465, + "logits/rejected": 7.8597540855407715, + "logps/chosen": -314.0546569824219, + "logps/rejected": -322.2362365722656, + "loss": 0.7352, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.415477454662323, + "rewards/margins": -0.06558741629123688, + "rewards/rejected": 0.4810648560523987, + "step": 1205 + }, + { + "epoch": 0.18650686255557702, + "grad_norm": 5.678481578826904, + "learning_rate": 3.1082474226804127e-06, + "logits/chosen": 10.003520965576172, + "logits/rejected": 10.643257141113281, + "logps/chosen": -250.01773071289062, + "logps/rejected": -241.93426513671875, + "loss": 0.6846, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.32810544967651367, + "rewards/margins": 0.031289104372262955, + "rewards/rejected": 0.296816349029541, + "step": 1206 + }, + { + "epoch": 0.1866615116953412, + "grad_norm": 5.1704864501953125, + "learning_rate": 3.1108247422680416e-06, + "logits/chosen": 10.481098175048828, + "logits/rejected": 7.659857273101807, + "logps/chosen": -228.67388916015625, + "logps/rejected": -189.83267211914062, + "loss": 0.71, + "rewards/accuracies": 0.25, + "rewards/chosen": 0.3601820170879364, + "rewards/margins": -0.01919415593147278, + "rewards/rejected": 0.3793761432170868, + "step": 1207 + }, + { + "epoch": 0.18681616083510535, + "grad_norm": 5.7654571533203125, + "learning_rate": 3.1134020618556704e-06, + "logits/chosen": 6.949052810668945, + "logits/rejected": 0.6345968246459961, + "logps/chosen": -270.25494384765625, + "logps/rejected": -184.03750610351562, + "loss": 0.7517, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.3889699876308441, + "rewards/margins": -0.09388561546802521, + "rewards/rejected": 0.48285558819770813, + "step": 1208 + }, + { + "epoch": 0.1869708099748695, + "grad_norm": 4.760369777679443, + "learning_rate": 3.1159793814432993e-06, + "logits/chosen": 14.434054374694824, + "logits/rejected": 9.79536247253418, + "logps/chosen": -289.9521789550781, + "logps/rejected": -228.3650665283203, + "loss": 0.6686, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.4275573790073395, + "rewards/margins": 0.0646824836730957, + "rewards/rejected": 0.3628748953342438, + "step": 1209 + }, + { + "epoch": 0.18712545911463369, + "grad_norm": 14.572488784790039, + "learning_rate": 3.118556701030928e-06, + "logits/chosen": 11.860732078552246, + "logits/rejected": 9.801314353942871, + "logps/chosen": -328.2403564453125, + "logps/rejected": -261.2964782714844, + "loss": 0.6285, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.5980567932128906, + "rewards/margins": 0.14208680391311646, + "rewards/rejected": 0.45596998929977417, + "step": 1210 + }, + { + "epoch": 0.18728010825439784, + "grad_norm": 6.36720609664917, + "learning_rate": 3.121134020618557e-06, + "logits/chosen": 9.14716911315918, + "logits/rejected": 6.533000946044922, + "logps/chosen": -297.1690673828125, + "logps/rejected": -278.9299011230469, + "loss": 0.7287, + "rewards/accuracies": 0.25, + "rewards/chosen": 0.3750815987586975, + "rewards/margins": -0.047883618623018265, + "rewards/rejected": 0.4229651987552643, + "step": 1211 + }, + { + "epoch": 0.187434757394162, + "grad_norm": 7.197250843048096, + "learning_rate": 3.123711340206186e-06, + "logits/chosen": 8.806530952453613, + "logits/rejected": 8.954459190368652, + "logps/chosen": -300.0014953613281, + "logps/rejected": -336.94805908203125, + "loss": 0.7026, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.37373867630958557, + "rewards/margins": 0.03315524384379387, + "rewards/rejected": 0.3405834436416626, + "step": 1212 + }, + { + "epoch": 0.18758940653392617, + "grad_norm": 5.8357696533203125, + "learning_rate": 3.126288659793815e-06, + "logits/chosen": 9.556649208068848, + "logits/rejected": 11.319966316223145, + "logps/chosen": -358.9806823730469, + "logps/rejected": -330.43853759765625, + "loss": 0.7371, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.4974590539932251, + "rewards/margins": -0.07199737429618835, + "rewards/rejected": 0.5694563984870911, + "step": 1213 + }, + { + "epoch": 0.18774405567369032, + "grad_norm": 6.848079204559326, + "learning_rate": 3.1288659793814437e-06, + "logits/chosen": 6.142574310302734, + "logits/rejected": 5.4072957038879395, + "logps/chosen": -269.4252014160156, + "logps/rejected": -296.4049987792969, + "loss": 0.6496, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.6798981428146362, + "rewards/margins": 0.1462177187204361, + "rewards/rejected": 0.5336804389953613, + "step": 1214 + }, + { + "epoch": 0.18789870481345447, + "grad_norm": 6.2556071281433105, + "learning_rate": 3.1314432989690725e-06, + "logits/chosen": 11.529394149780273, + "logits/rejected": 5.767757892608643, + "logps/chosen": -280.46600341796875, + "logps/rejected": -160.28274536132812, + "loss": 0.7714, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.24274583160877228, + "rewards/margins": -0.14023223519325256, + "rewards/rejected": 0.38297808170318604, + "step": 1215 + }, + { + "epoch": 0.18805335395321862, + "grad_norm": 2.86610746383667, + "learning_rate": 3.1340206185567014e-06, + "logits/chosen": 6.791140079498291, + "logits/rejected": 7.40846061706543, + "logps/chosen": -108.73541259765625, + "logps/rejected": -108.48594665527344, + "loss": 0.6359, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.4238719642162323, + "rewards/margins": 0.12224177271127701, + "rewards/rejected": 0.3016301989555359, + "step": 1216 + }, + { + "epoch": 0.1882080030929828, + "grad_norm": 6.95395040512085, + "learning_rate": 3.1365979381443303e-06, + "logits/chosen": 10.178174018859863, + "logits/rejected": 8.106420516967773, + "logps/chosen": -393.7174987792969, + "logps/rejected": -381.8161926269531, + "loss": 0.679, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.5753136873245239, + "rewards/margins": 0.10919182747602463, + "rewards/rejected": 0.4661218523979187, + "step": 1217 + }, + { + "epoch": 0.18836265223274695, + "grad_norm": 4.500452041625977, + "learning_rate": 3.139175257731959e-06, + "logits/chosen": 8.35544490814209, + "logits/rejected": 4.280854225158691, + "logps/chosen": -264.40826416015625, + "logps/rejected": -249.41998291015625, + "loss": 0.6094, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.4063566327095032, + "rewards/margins": 0.19208449125289917, + "rewards/rejected": 0.2142721563577652, + "step": 1218 + }, + { + "epoch": 0.1885173013725111, + "grad_norm": 4.626194953918457, + "learning_rate": 3.141752577319588e-06, + "logits/chosen": 8.580806732177734, + "logits/rejected": 9.48959732055664, + "logps/chosen": -319.87115478515625, + "logps/rejected": -339.5631103515625, + "loss": 0.6076, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.5394479036331177, + "rewards/margins": 0.20765885710716248, + "rewards/rejected": 0.3317890167236328, + "step": 1219 + }, + { + "epoch": 0.18867195051227528, + "grad_norm": 5.603726863861084, + "learning_rate": 3.1443298969072165e-06, + "logits/chosen": 9.180947303771973, + "logits/rejected": 0.005242586135864258, + "logps/chosen": -313.40899658203125, + "logps/rejected": -191.90228271484375, + "loss": 0.6322, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.5424249172210693, + "rewards/margins": 0.13756096363067627, + "rewards/rejected": 0.40486395359039307, + "step": 1220 + }, + { + "epoch": 0.18882659965203943, + "grad_norm": 9.603288650512695, + "learning_rate": 3.1469072164948453e-06, + "logits/chosen": 7.6078715324401855, + "logits/rejected": 10.072007179260254, + "logps/chosen": -247.25083923339844, + "logps/rejected": -269.4310302734375, + "loss": 0.632, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.4233908951282501, + "rewards/margins": 0.14118510484695435, + "rewards/rejected": 0.2822057604789734, + "step": 1221 + }, + { + "epoch": 0.18898124879180359, + "grad_norm": 3.6513845920562744, + "learning_rate": 3.1494845360824742e-06, + "logits/chosen": 12.510441780090332, + "logits/rejected": 8.736970901489258, + "logps/chosen": -235.2794647216797, + "logps/rejected": -152.3460693359375, + "loss": 0.6386, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.48582226037979126, + "rewards/margins": 0.12270636111497879, + "rewards/rejected": 0.36311590671539307, + "step": 1222 + }, + { + "epoch": 0.18913589793156776, + "grad_norm": 4.3698506355285645, + "learning_rate": 3.152061855670103e-06, + "logits/chosen": 5.830020904541016, + "logits/rejected": 0.8380942940711975, + "logps/chosen": -228.67303466796875, + "logps/rejected": -148.95106506347656, + "loss": 0.6044, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.48227614164352417, + "rewards/margins": 0.20268318057060242, + "rewards/rejected": 0.27959299087524414, + "step": 1223 + }, + { + "epoch": 0.18929054707133192, + "grad_norm": 5.827017307281494, + "learning_rate": 3.154639175257732e-06, + "logits/chosen": 9.446216583251953, + "logits/rejected": 10.732011795043945, + "logps/chosen": -257.458251953125, + "logps/rejected": -302.6133728027344, + "loss": 0.7338, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.34902212023735046, + "rewards/margins": -0.057440612465143204, + "rewards/rejected": 0.40646272897720337, + "step": 1224 + }, + { + "epoch": 0.18944519621109607, + "grad_norm": 8.706448554992676, + "learning_rate": 3.157216494845361e-06, + "logits/chosen": 11.037115097045898, + "logits/rejected": 7.214148044586182, + "logps/chosen": -283.21307373046875, + "logps/rejected": -283.8851623535156, + "loss": 0.6702, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.5103629231452942, + "rewards/margins": 0.06468954682350159, + "rewards/rejected": 0.445673406124115, + "step": 1225 + }, + { + "epoch": 0.18959984535086025, + "grad_norm": 4.980746746063232, + "learning_rate": 3.1597938144329897e-06, + "logits/chosen": 6.920391082763672, + "logits/rejected": 7.818106174468994, + "logps/chosen": -322.8192138671875, + "logps/rejected": -290.6893310546875, + "loss": 0.63, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.5367587804794312, + "rewards/margins": 0.14732728898525238, + "rewards/rejected": 0.3894314765930176, + "step": 1226 + }, + { + "epoch": 0.1897544944906244, + "grad_norm": 5.300254821777344, + "learning_rate": 3.1623711340206186e-06, + "logits/chosen": 8.445317268371582, + "logits/rejected": 9.44327163696289, + "logps/chosen": -203.99066162109375, + "logps/rejected": -152.672119140625, + "loss": 0.7431, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.46075350046157837, + "rewards/margins": -0.08466558158397675, + "rewards/rejected": 0.5454190969467163, + "step": 1227 + }, + { + "epoch": 0.18990914363038855, + "grad_norm": 31.06129264831543, + "learning_rate": 3.1649484536082475e-06, + "logits/chosen": 6.146291732788086, + "logits/rejected": 9.227516174316406, + "logps/chosen": -198.13046264648438, + "logps/rejected": -260.4383544921875, + "loss": 0.7542, + "rewards/accuracies": 0.25, + "rewards/chosen": 0.503790557384491, + "rewards/margins": -0.1043115183711052, + "rewards/rejected": 0.6081020832061768, + "step": 1228 + }, + { + "epoch": 0.19006379277015273, + "grad_norm": 9.248373031616211, + "learning_rate": 3.1675257731958763e-06, + "logits/chosen": 9.757558822631836, + "logits/rejected": 7.950119495391846, + "logps/chosen": -297.04473876953125, + "logps/rejected": -293.9345703125, + "loss": 0.6587, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.6458636522293091, + "rewards/margins": 0.0953555479645729, + "rewards/rejected": 0.5505080223083496, + "step": 1229 + }, + { + "epoch": 0.19021844190991688, + "grad_norm": 19.740877151489258, + "learning_rate": 3.170103092783505e-06, + "logits/chosen": 9.515580177307129, + "logits/rejected": -1.551758885383606, + "logps/chosen": -217.4691619873047, + "logps/rejected": -115.7433853149414, + "loss": 0.7155, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.29984092712402344, + "rewards/margins": -0.03631095588207245, + "rewards/rejected": 0.3361518979072571, + "step": 1230 + }, + { + "epoch": 0.19037309104968103, + "grad_norm": 5.421351909637451, + "learning_rate": 3.172680412371134e-06, + "logits/chosen": 8.8172025680542, + "logits/rejected": 7.618391513824463, + "logps/chosen": -246.85678100585938, + "logps/rejected": -225.1162567138672, + "loss": 0.7273, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.38675397634506226, + "rewards/margins": -0.05506296083331108, + "rewards/rejected": 0.44181695580482483, + "step": 1231 + }, + { + "epoch": 0.19052774018944518, + "grad_norm": 7.741054534912109, + "learning_rate": 3.175257731958763e-06, + "logits/chosen": 11.476730346679688, + "logits/rejected": 12.512069702148438, + "logps/chosen": -417.05908203125, + "logps/rejected": -348.0390319824219, + "loss": 0.731, + "rewards/accuracies": 0.25, + "rewards/chosen": 0.4446180462837219, + "rewards/margins": -0.05932272970676422, + "rewards/rejected": 0.5039408206939697, + "step": 1232 + }, + { + "epoch": 0.19068238932920936, + "grad_norm": 4.535299777984619, + "learning_rate": 3.177835051546392e-06, + "logits/chosen": 10.029155731201172, + "logits/rejected": 5.178633689880371, + "logps/chosen": -294.2568054199219, + "logps/rejected": -243.51840209960938, + "loss": 0.6316, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.5292582511901855, + "rewards/margins": 0.14780116081237793, + "rewards/rejected": 0.3814570903778076, + "step": 1233 + }, + { + "epoch": 0.1908370384689735, + "grad_norm": 4.163630962371826, + "learning_rate": 3.1804123711340207e-06, + "logits/chosen": 8.796285629272461, + "logits/rejected": 1.4558358192443848, + "logps/chosen": -301.8804931640625, + "logps/rejected": -201.09512329101562, + "loss": 0.5625, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.655165433883667, + "rewards/margins": 0.3351425230503082, + "rewards/rejected": 0.32002294063568115, + "step": 1234 + }, + { + "epoch": 0.19099168760873766, + "grad_norm": 6.68710994720459, + "learning_rate": 3.1829896907216496e-06, + "logits/chosen": 8.119000434875488, + "logits/rejected": 7.294620513916016, + "logps/chosen": -353.65087890625, + "logps/rejected": -350.862060546875, + "loss": 0.7298, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.5050303936004639, + "rewards/margins": -0.05927343666553497, + "rewards/rejected": 0.5643037557601929, + "step": 1235 + }, + { + "epoch": 0.19114633674850184, + "grad_norm": 5.4455342292785645, + "learning_rate": 3.1855670103092784e-06, + "logits/chosen": 14.876818656921387, + "logits/rejected": 13.735360145568848, + "logps/chosen": -282.4361572265625, + "logps/rejected": -353.4479675292969, + "loss": 0.5893, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.6284136772155762, + "rewards/margins": 0.24710312485694885, + "rewards/rejected": 0.3813105523586273, + "step": 1236 + }, + { + "epoch": 0.191300985888266, + "grad_norm": 10.938210487365723, + "learning_rate": 3.1881443298969073e-06, + "logits/chosen": 11.34907341003418, + "logits/rejected": 5.414515495300293, + "logps/chosen": -398.83441162109375, + "logps/rejected": -237.18492126464844, + "loss": 0.6393, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.6420078873634338, + "rewards/margins": 0.11836714297533035, + "rewards/rejected": 0.5236407518386841, + "step": 1237 + }, + { + "epoch": 0.19145563502803015, + "grad_norm": 8.826349258422852, + "learning_rate": 3.190721649484536e-06, + "logits/chosen": 13.579442024230957, + "logits/rejected": 9.15294361114502, + "logps/chosen": -294.3518371582031, + "logps/rejected": -255.98394775390625, + "loss": 0.7309, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.4393633306026459, + "rewards/margins": -0.05169167369604111, + "rewards/rejected": 0.4910550117492676, + "step": 1238 + }, + { + "epoch": 0.19161028416779433, + "grad_norm": 5.464394569396973, + "learning_rate": 3.193298969072165e-06, + "logits/chosen": 3.160524368286133, + "logits/rejected": 0.9966862201690674, + "logps/chosen": -233.94293212890625, + "logps/rejected": -200.45401000976562, + "loss": 0.7004, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.5308198928833008, + "rewards/margins": 0.007534883916378021, + "rewards/rejected": 0.5232850313186646, + "step": 1239 + }, + { + "epoch": 0.19176493330755848, + "grad_norm": 4.881063938140869, + "learning_rate": 3.195876288659794e-06, + "logits/chosen": 11.210578918457031, + "logits/rejected": 7.351213455200195, + "logps/chosen": -193.02114868164062, + "logps/rejected": -181.2538299560547, + "loss": 0.6749, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.4692145586013794, + "rewards/margins": 0.05012582987546921, + "rewards/rejected": 0.4190887212753296, + "step": 1240 + }, + { + "epoch": 0.19191958244732263, + "grad_norm": 63.044219970703125, + "learning_rate": 3.1984536082474228e-06, + "logits/chosen": 12.134191513061523, + "logits/rejected": 14.954732894897461, + "logps/chosen": -229.34231567382812, + "logps/rejected": -278.4432373046875, + "loss": 0.7254, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.3644210398197174, + "rewards/margins": -0.05073036998510361, + "rewards/rejected": 0.4151514172554016, + "step": 1241 + }, + { + "epoch": 0.1920742315870868, + "grad_norm": 4.016191482543945, + "learning_rate": 3.2010309278350517e-06, + "logits/chosen": 15.895033836364746, + "logits/rejected": 8.987711906433105, + "logps/chosen": -294.3758850097656, + "logps/rejected": -147.92990112304688, + "loss": 0.5742, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.6089287400245667, + "rewards/margins": 0.2810121178627014, + "rewards/rejected": 0.32791662216186523, + "step": 1242 + }, + { + "epoch": 0.19222888072685096, + "grad_norm": 5.765580177307129, + "learning_rate": 3.203608247422681e-06, + "logits/chosen": 8.613408088684082, + "logits/rejected": 8.338961601257324, + "logps/chosen": -296.4032897949219, + "logps/rejected": -270.2394104003906, + "loss": 0.6631, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.4387373924255371, + "rewards/margins": 0.07200220972299576, + "rewards/rejected": 0.36673516035079956, + "step": 1243 + }, + { + "epoch": 0.1923835298666151, + "grad_norm": 5.679923057556152, + "learning_rate": 3.20618556701031e-06, + "logits/chosen": 5.362502574920654, + "logits/rejected": 12.569731712341309, + "logps/chosen": -167.60635375976562, + "logps/rejected": -258.2720642089844, + "loss": 0.7329, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.4995642304420471, + "rewards/margins": -0.06724506616592407, + "rewards/rejected": 0.5668092966079712, + "step": 1244 + }, + { + "epoch": 0.1925381790063793, + "grad_norm": 4.202012062072754, + "learning_rate": 3.2087628865979387e-06, + "logits/chosen": 8.0896635055542, + "logits/rejected": 10.821783065795898, + "logps/chosen": -194.33839416503906, + "logps/rejected": -237.69244384765625, + "loss": 0.7228, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.45260074734687805, + "rewards/margins": -0.04802299290895462, + "rewards/rejected": 0.5006237626075745, + "step": 1245 + }, + { + "epoch": 0.19269282814614344, + "grad_norm": 4.239159107208252, + "learning_rate": 3.2113402061855676e-06, + "logits/chosen": 7.96912145614624, + "logits/rejected": 9.134454727172852, + "logps/chosen": -176.5997314453125, + "logps/rejected": -213.2306671142578, + "loss": 0.7185, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.5998779535293579, + "rewards/margins": -0.03788008168339729, + "rewards/rejected": 0.6377581357955933, + "step": 1246 + }, + { + "epoch": 0.1928474772859076, + "grad_norm": 4.657351493835449, + "learning_rate": 3.2139175257731964e-06, + "logits/chosen": 7.918962478637695, + "logits/rejected": 3.499849796295166, + "logps/chosen": -213.512939453125, + "logps/rejected": -168.80560302734375, + "loss": 0.6266, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.5484223365783691, + "rewards/margins": 0.158765509724617, + "rewards/rejected": 0.38965684175491333, + "step": 1247 + }, + { + "epoch": 0.19300212642567174, + "grad_norm": 4.860681056976318, + "learning_rate": 3.2164948453608253e-06, + "logits/chosen": 11.14093017578125, + "logits/rejected": 6.807981967926025, + "logps/chosen": -201.78338623046875, + "logps/rejected": -175.76773071289062, + "loss": 0.6059, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.49234187602996826, + "rewards/margins": 0.195110023021698, + "rewards/rejected": 0.29723188281059265, + "step": 1248 + }, + { + "epoch": 0.19315677556543592, + "grad_norm": 26.690044403076172, + "learning_rate": 3.219072164948454e-06, + "logits/chosen": 11.988876342773438, + "logits/rejected": 7.563211441040039, + "logps/chosen": -397.692626953125, + "logps/rejected": -368.33111572265625, + "loss": 0.6718, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.5251701474189758, + "rewards/margins": 0.06899290531873703, + "rewards/rejected": 0.4561772346496582, + "step": 1249 + }, + { + "epoch": 0.19331142470520007, + "grad_norm": 4.7454071044921875, + "learning_rate": 3.221649484536083e-06, + "logits/chosen": 9.304478645324707, + "logits/rejected": 9.801323890686035, + "logps/chosen": -174.55299377441406, + "logps/rejected": -251.90016174316406, + "loss": 0.6694, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.40585270524024963, + "rewards/margins": 0.05353386700153351, + "rewards/rejected": 0.3523188531398773, + "step": 1250 + }, + { + "epoch": 0.19346607384496423, + "grad_norm": 6.183058738708496, + "learning_rate": 3.224226804123712e-06, + "logits/chosen": 14.203052520751953, + "logits/rejected": 9.77611255645752, + "logps/chosen": -393.35284423828125, + "logps/rejected": -310.80517578125, + "loss": 0.608, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.6375166177749634, + "rewards/margins": 0.20667657256126404, + "rewards/rejected": 0.43084004521369934, + "step": 1251 + }, + { + "epoch": 0.1936207229847284, + "grad_norm": 10.06793212890625, + "learning_rate": 3.226804123711341e-06, + "logits/chosen": 7.5367817878723145, + "logits/rejected": 3.743190050125122, + "logps/chosen": -313.5653076171875, + "logps/rejected": -307.9297180175781, + "loss": 0.6956, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.43930351734161377, + "rewards/margins": 0.10602889209985733, + "rewards/rejected": 0.33327460289001465, + "step": 1252 + }, + { + "epoch": 0.19377537212449256, + "grad_norm": 5.206816673278809, + "learning_rate": 3.2293814432989697e-06, + "logits/chosen": 3.010655403137207, + "logits/rejected": 11.41939926147461, + "logps/chosen": -150.63723754882812, + "logps/rejected": -224.94993591308594, + "loss": 0.7624, + "rewards/accuracies": 0.125, + "rewards/chosen": 0.47446364164352417, + "rewards/margins": -0.12096692621707916, + "rewards/rejected": 0.5954306125640869, + "step": 1253 + }, + { + "epoch": 0.1939300212642567, + "grad_norm": 5.481898784637451, + "learning_rate": 3.2319587628865985e-06, + "logits/chosen": 12.624159812927246, + "logits/rejected": 6.349588394165039, + "logps/chosen": -281.23651123046875, + "logps/rejected": -243.25228881835938, + "loss": 0.6732, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.4826972484588623, + "rewards/margins": 0.05940217897295952, + "rewards/rejected": 0.4232950806617737, + "step": 1254 + }, + { + "epoch": 0.1940846704040209, + "grad_norm": 4.178683280944824, + "learning_rate": 3.2345360824742274e-06, + "logits/chosen": 12.260648727416992, + "logits/rejected": 3.160780668258667, + "logps/chosen": -265.9971618652344, + "logps/rejected": -153.32199096679688, + "loss": 0.6417, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.47502070665359497, + "rewards/margins": 0.11583016812801361, + "rewards/rejected": 0.35919055342674255, + "step": 1255 + }, + { + "epoch": 0.19423931954378504, + "grad_norm": 6.976434707641602, + "learning_rate": 3.2371134020618563e-06, + "logits/chosen": 11.857710838317871, + "logits/rejected": 7.212756633758545, + "logps/chosen": -316.90765380859375, + "logps/rejected": -269.6816101074219, + "loss": 0.6918, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.6193729639053345, + "rewards/margins": 0.024546809494495392, + "rewards/rejected": 0.5948261022567749, + "step": 1256 + }, + { + "epoch": 0.1943939686835492, + "grad_norm": 5.100480556488037, + "learning_rate": 3.239690721649485e-06, + "logits/chosen": 7.107606410980225, + "logits/rejected": 9.467611312866211, + "logps/chosen": -309.34173583984375, + "logps/rejected": -270.22894287109375, + "loss": 0.6226, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.5951852798461914, + "rewards/margins": 0.1706523299217224, + "rewards/rejected": 0.4245328903198242, + "step": 1257 + }, + { + "epoch": 0.19454861782331337, + "grad_norm": 5.747494220733643, + "learning_rate": 3.2422680412371136e-06, + "logits/chosen": 13.994686126708984, + "logits/rejected": 13.255468368530273, + "logps/chosen": -256.3341064453125, + "logps/rejected": -238.83697509765625, + "loss": 0.7348, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.6035789251327515, + "rewards/margins": -0.07494983077049255, + "rewards/rejected": 0.6785287857055664, + "step": 1258 + }, + { + "epoch": 0.19470326696307752, + "grad_norm": 5.3106889724731445, + "learning_rate": 3.2448453608247425e-06, + "logits/chosen": 10.338050842285156, + "logits/rejected": 8.256830215454102, + "logps/chosen": -187.38653564453125, + "logps/rejected": -230.9452362060547, + "loss": 0.7212, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.4010266661643982, + "rewards/margins": -0.031024547293782234, + "rewards/rejected": 0.43205124139785767, + "step": 1259 + }, + { + "epoch": 0.19485791610284167, + "grad_norm": 6.178385257720947, + "learning_rate": 3.2474226804123714e-06, + "logits/chosen": 9.199402809143066, + "logits/rejected": 15.232884407043457, + "logps/chosen": -333.17889404296875, + "logps/rejected": -418.5166320800781, + "loss": 0.7099, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.8721463680267334, + "rewards/margins": -0.0024234652519226074, + "rewards/rejected": 0.8745697736740112, + "step": 1260 + }, + { + "epoch": 0.19501256524260585, + "grad_norm": 6.837693214416504, + "learning_rate": 3.2500000000000002e-06, + "logits/chosen": 12.279129028320312, + "logits/rejected": 6.870036602020264, + "logps/chosen": -329.546142578125, + "logps/rejected": -227.9161834716797, + "loss": 0.7091, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.40584298968315125, + "rewards/margins": -0.016572382301092148, + "rewards/rejected": 0.4224153757095337, + "step": 1261 + }, + { + "epoch": 0.19516721438237, + "grad_norm": 35.677696228027344, + "learning_rate": 3.252577319587629e-06, + "logits/chosen": 5.335585594177246, + "logits/rejected": 6.641610622406006, + "logps/chosen": -217.2725372314453, + "logps/rejected": -204.86941528320312, + "loss": 0.7201, + "rewards/accuracies": 0.25, + "rewards/chosen": 0.5300777554512024, + "rewards/margins": -0.0398736447095871, + "rewards/rejected": 0.5699514150619507, + "step": 1262 + }, + { + "epoch": 0.19532186352213415, + "grad_norm": 9.768061637878418, + "learning_rate": 3.255154639175258e-06, + "logits/chosen": 8.994683265686035, + "logits/rejected": 6.920213222503662, + "logps/chosen": -593.1331176757812, + "logps/rejected": -482.54583740234375, + "loss": 0.6406, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.7754771113395691, + "rewards/margins": 0.14648504555225372, + "rewards/rejected": 0.6289920806884766, + "step": 1263 + }, + { + "epoch": 0.1954765126618983, + "grad_norm": 5.485254764556885, + "learning_rate": 3.257731958762887e-06, + "logits/chosen": 15.874935150146484, + "logits/rejected": 5.449057579040527, + "logps/chosen": -473.91400146484375, + "logps/rejected": -279.2198181152344, + "loss": 0.6103, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.6073750853538513, + "rewards/margins": 0.2410089075565338, + "rewards/rejected": 0.3663662075996399, + "step": 1264 + }, + { + "epoch": 0.19563116180166248, + "grad_norm": 5.843264102935791, + "learning_rate": 3.2603092783505157e-06, + "logits/chosen": 16.228412628173828, + "logits/rejected": 13.174749374389648, + "logps/chosen": -384.05303955078125, + "logps/rejected": -392.5466613769531, + "loss": 0.597, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.7092897891998291, + "rewards/margins": 0.22083836793899536, + "rewards/rejected": 0.48845142126083374, + "step": 1265 + }, + { + "epoch": 0.19578581094142664, + "grad_norm": 4.950313091278076, + "learning_rate": 3.2628865979381446e-06, + "logits/chosen": 8.358231544494629, + "logits/rejected": 5.719239711761475, + "logps/chosen": -209.3634490966797, + "logps/rejected": -101.864990234375, + "loss": 0.6857, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.30012694001197815, + "rewards/margins": 0.0218371395021677, + "rewards/rejected": 0.278289794921875, + "step": 1266 + }, + { + "epoch": 0.1959404600811908, + "grad_norm": 5.50861120223999, + "learning_rate": 3.2654639175257735e-06, + "logits/chosen": 12.006473541259766, + "logits/rejected": 6.123589515686035, + "logps/chosen": -273.00140380859375, + "logps/rejected": -228.69090270996094, + "loss": 0.6964, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.6794090270996094, + "rewards/margins": 0.04387977719306946, + "rewards/rejected": 0.6355292797088623, + "step": 1267 + }, + { + "epoch": 0.19609510922095497, + "grad_norm": 6.035185813903809, + "learning_rate": 3.2680412371134023e-06, + "logits/chosen": 10.129387855529785, + "logits/rejected": 8.775681495666504, + "logps/chosen": -365.5245666503906, + "logps/rejected": -323.67236328125, + "loss": 0.6466, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.568237841129303, + "rewards/margins": 0.1666661500930786, + "rewards/rejected": 0.4015716314315796, + "step": 1268 + }, + { + "epoch": 0.19624975836071912, + "grad_norm": 5.689505100250244, + "learning_rate": 3.270618556701031e-06, + "logits/chosen": 10.030207633972168, + "logits/rejected": 7.495403289794922, + "logps/chosen": -305.68170166015625, + "logps/rejected": -283.243408203125, + "loss": 0.6295, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.7296551465988159, + "rewards/margins": 0.14317269623279572, + "rewards/rejected": 0.5864824652671814, + "step": 1269 + }, + { + "epoch": 0.19640440750048327, + "grad_norm": 5.353102684020996, + "learning_rate": 3.27319587628866e-06, + "logits/chosen": 11.297200202941895, + "logits/rejected": 10.937126159667969, + "logps/chosen": -315.881591796875, + "logps/rejected": -327.8677062988281, + "loss": 0.6324, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.6738921403884888, + "rewards/margins": 0.13873621821403503, + "rewards/rejected": 0.5351558923721313, + "step": 1270 + }, + { + "epoch": 0.19655905664024745, + "grad_norm": 5.362412929534912, + "learning_rate": 3.275773195876289e-06, + "logits/chosen": 8.960491180419922, + "logits/rejected": 8.659473419189453, + "logps/chosen": -223.84324645996094, + "logps/rejected": -226.14895629882812, + "loss": 0.6768, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.5809481143951416, + "rewards/margins": 0.04989110305905342, + "rewards/rejected": 0.5310570001602173, + "step": 1271 + }, + { + "epoch": 0.1967137057800116, + "grad_norm": 4.925468921661377, + "learning_rate": 3.278350515463918e-06, + "logits/chosen": 13.636338233947754, + "logits/rejected": 12.647415161132812, + "logps/chosen": -178.77841186523438, + "logps/rejected": -187.86346435546875, + "loss": 0.6352, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.5157396793365479, + "rewards/margins": 0.148208349943161, + "rewards/rejected": 0.367531418800354, + "step": 1272 + }, + { + "epoch": 0.19686835491977575, + "grad_norm": 5.796316146850586, + "learning_rate": 3.2809278350515467e-06, + "logits/chosen": 14.344132423400879, + "logits/rejected": 6.3060760498046875, + "logps/chosen": -433.5544738769531, + "logps/rejected": -325.4803161621094, + "loss": 0.6271, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.8045179843902588, + "rewards/margins": 0.1832927167415619, + "rewards/rejected": 0.6212251782417297, + "step": 1273 + }, + { + "epoch": 0.19702300405953993, + "grad_norm": 5.151941299438477, + "learning_rate": 3.2835051546391756e-06, + "logits/chosen": 9.456802368164062, + "logits/rejected": 11.307422637939453, + "logps/chosen": -346.2414245605469, + "logps/rejected": -380.61273193359375, + "loss": 0.6291, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.8784075379371643, + "rewards/margins": 0.19248169660568237, + "rewards/rejected": 0.6859257817268372, + "step": 1274 + }, + { + "epoch": 0.19717765319930408, + "grad_norm": 6.6808013916015625, + "learning_rate": 3.2860824742268044e-06, + "logits/chosen": 3.3954012393951416, + "logits/rejected": 4.781131744384766, + "logps/chosen": -305.18524169921875, + "logps/rejected": -295.1188049316406, + "loss": 0.7924, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.5000433325767517, + "rewards/margins": -0.11500897258520126, + "rewards/rejected": 0.6150522828102112, + "step": 1275 + }, + { + "epoch": 0.19733230233906823, + "grad_norm": 7.2388386726379395, + "learning_rate": 3.2886597938144333e-06, + "logits/chosen": 6.953413009643555, + "logits/rejected": 7.740036964416504, + "logps/chosen": -232.2669677734375, + "logps/rejected": -260.6564025878906, + "loss": 0.7105, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.43705442547798157, + "rewards/margins": -0.006439249962568283, + "rewards/rejected": 0.44349366426467896, + "step": 1276 + }, + { + "epoch": 0.1974869514788324, + "grad_norm": 4.207391262054443, + "learning_rate": 3.291237113402062e-06, + "logits/chosen": 14.358092308044434, + "logits/rejected": 14.29216194152832, + "logps/chosen": -242.71578979492188, + "logps/rejected": -167.500732421875, + "loss": 0.6449, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.49289676547050476, + "rewards/margins": 0.11610956490039825, + "rewards/rejected": 0.3767872452735901, + "step": 1277 + }, + { + "epoch": 0.19764160061859656, + "grad_norm": 4.676634788513184, + "learning_rate": 3.293814432989691e-06, + "logits/chosen": 13.802257537841797, + "logits/rejected": 11.306392669677734, + "logps/chosen": -276.5202941894531, + "logps/rejected": -213.8582763671875, + "loss": 0.6764, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.3753712773323059, + "rewards/margins": 0.04441824555397034, + "rewards/rejected": 0.33095303177833557, + "step": 1278 + }, + { + "epoch": 0.19779624975836071, + "grad_norm": 5.706592082977295, + "learning_rate": 3.29639175257732e-06, + "logits/chosen": 8.070343017578125, + "logits/rejected": 16.978242874145508, + "logps/chosen": -158.07843017578125, + "logps/rejected": -311.79840087890625, + "loss": 0.671, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.4136684536933899, + "rewards/margins": 0.05436870455741882, + "rewards/rejected": 0.35929977893829346, + "step": 1279 + }, + { + "epoch": 0.19795089889812487, + "grad_norm": 4.6041107177734375, + "learning_rate": 3.298969072164949e-06, + "logits/chosen": 10.371328353881836, + "logits/rejected": 10.11214542388916, + "logps/chosen": -292.2493896484375, + "logps/rejected": -311.4103698730469, + "loss": 0.6225, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.5738158226013184, + "rewards/margins": 0.22375929355621338, + "rewards/rejected": 0.35005655884742737, + "step": 1280 + }, + { + "epoch": 0.19810554803788905, + "grad_norm": 5.371640205383301, + "learning_rate": 3.3015463917525777e-06, + "logits/chosen": 5.121698379516602, + "logits/rejected": 7.031170845031738, + "logps/chosen": -263.9494934082031, + "logps/rejected": -246.33132934570312, + "loss": 0.6014, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.6553155183792114, + "rewards/margins": 0.20567500591278076, + "rewards/rejected": 0.44964051246643066, + "step": 1281 + }, + { + "epoch": 0.1982601971776532, + "grad_norm": 3.934953212738037, + "learning_rate": 3.3041237113402065e-06, + "logits/chosen": 9.958637237548828, + "logits/rejected": 2.769880771636963, + "logps/chosen": -342.2831115722656, + "logps/rejected": -206.97698974609375, + "loss": 0.6192, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.546492338180542, + "rewards/margins": 0.18888181447982788, + "rewards/rejected": 0.35761046409606934, + "step": 1282 + }, + { + "epoch": 0.19841484631741735, + "grad_norm": 6.036837577819824, + "learning_rate": 3.3067010309278354e-06, + "logits/chosen": 12.089359283447266, + "logits/rejected": 7.27589225769043, + "logps/chosen": -411.790771484375, + "logps/rejected": -327.0368957519531, + "loss": 0.5481, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.7499051094055176, + "rewards/margins": 0.33648520708084106, + "rewards/rejected": 0.4134199619293213, + "step": 1283 + }, + { + "epoch": 0.19856949545718153, + "grad_norm": 5.0055766105651855, + "learning_rate": 3.3092783505154643e-06, + "logits/chosen": 3.3996565341949463, + "logits/rejected": 12.273038864135742, + "logps/chosen": -231.321533203125, + "logps/rejected": -298.3727111816406, + "loss": 0.5749, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.6002375483512878, + "rewards/margins": 0.28655147552490234, + "rewards/rejected": 0.3136860728263855, + "step": 1284 + }, + { + "epoch": 0.19872414459694568, + "grad_norm": 5.848355293273926, + "learning_rate": 3.311855670103093e-06, + "logits/chosen": 9.540555953979492, + "logits/rejected": 1.333407998085022, + "logps/chosen": -249.20144653320312, + "logps/rejected": -209.50518798828125, + "loss": 0.6305, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.6329747438430786, + "rewards/margins": 0.14627324044704437, + "rewards/rejected": 0.4867015480995178, + "step": 1285 + }, + { + "epoch": 0.19887879373670983, + "grad_norm": 7.092436790466309, + "learning_rate": 3.3144329896907216e-06, + "logits/chosen": 7.742453098297119, + "logits/rejected": 13.437719345092773, + "logps/chosen": -212.93783569335938, + "logps/rejected": -277.6034851074219, + "loss": 0.6316, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.6184993982315063, + "rewards/margins": 0.15867950022220612, + "rewards/rejected": 0.45981988310813904, + "step": 1286 + }, + { + "epoch": 0.199033442876474, + "grad_norm": 5.362861156463623, + "learning_rate": 3.3170103092783505e-06, + "logits/chosen": 6.933655738830566, + "logits/rejected": 9.977997779846191, + "logps/chosen": -257.5059814453125, + "logps/rejected": -220.40133666992188, + "loss": 0.6184, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.723708987236023, + "rewards/margins": 0.20602166652679443, + "rewards/rejected": 0.5176873207092285, + "step": 1287 + }, + { + "epoch": 0.19918809201623816, + "grad_norm": 4.728067874908447, + "learning_rate": 3.3195876288659793e-06, + "logits/chosen": 12.968427658081055, + "logits/rejected": 5.33281946182251, + "logps/chosen": -248.13919067382812, + "logps/rejected": -142.6277618408203, + "loss": 0.617, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.6389952898025513, + "rewards/margins": 0.19254299998283386, + "rewards/rejected": 0.4464523196220398, + "step": 1288 + }, + { + "epoch": 0.1993427411560023, + "grad_norm": 5.08624792098999, + "learning_rate": 3.3221649484536082e-06, + "logits/chosen": 6.731633186340332, + "logits/rejected": 10.717260360717773, + "logps/chosen": -183.9955291748047, + "logps/rejected": -258.29541015625, + "loss": 0.7536, + "rewards/accuracies": 0.25, + "rewards/chosen": 0.5029999613761902, + "rewards/margins": -0.05758479982614517, + "rewards/rejected": 0.5605847835540771, + "step": 1289 + }, + { + "epoch": 0.1994973902957665, + "grad_norm": 4.8729567527771, + "learning_rate": 3.324742268041237e-06, + "logits/chosen": 9.901467323303223, + "logits/rejected": 4.756739616394043, + "logps/chosen": -338.7579650878906, + "logps/rejected": -251.5039520263672, + "loss": 0.553, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.7929681539535522, + "rewards/margins": 0.35089725255966187, + "rewards/rejected": 0.44207093119621277, + "step": 1290 + }, + { + "epoch": 0.19965203943553064, + "grad_norm": 4.968284606933594, + "learning_rate": 3.327319587628866e-06, + "logits/chosen": 15.702203750610352, + "logits/rejected": 8.607516288757324, + "logps/chosen": -228.546875, + "logps/rejected": -177.3660125732422, + "loss": 0.6942, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.5267329812049866, + "rewards/margins": 0.03915111720561981, + "rewards/rejected": 0.48758184909820557, + "step": 1291 + }, + { + "epoch": 0.1998066885752948, + "grad_norm": 3.717909097671509, + "learning_rate": 3.329896907216495e-06, + "logits/chosen": 12.51144027709961, + "logits/rejected": 8.418695449829102, + "logps/chosen": -137.5762176513672, + "logps/rejected": -131.330322265625, + "loss": 0.6422, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.4770965576171875, + "rewards/margins": 0.11830544471740723, + "rewards/rejected": 0.3587911128997803, + "step": 1292 + }, + { + "epoch": 0.19996133771505897, + "grad_norm": 4.210697174072266, + "learning_rate": 3.3324742268041237e-06, + "logits/chosen": 10.194799423217773, + "logits/rejected": 2.809494972229004, + "logps/chosen": -253.68234252929688, + "logps/rejected": -188.58297729492188, + "loss": 0.5157, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.681464433670044, + "rewards/margins": 0.41363441944122314, + "rewards/rejected": 0.26783010363578796, + "step": 1293 + }, + { + "epoch": 0.20011598685482312, + "grad_norm": 5.845391750335693, + "learning_rate": 3.3350515463917526e-06, + "logits/chosen": 12.989534378051758, + "logits/rejected": 12.628778457641602, + "logps/chosen": -300.50445556640625, + "logps/rejected": -337.2967529296875, + "loss": 0.7478, + "rewards/accuracies": 0.25, + "rewards/chosen": 0.6544358134269714, + "rewards/margins": -0.1000395342707634, + "rewards/rejected": 0.7544753551483154, + "step": 1294 + }, + { + "epoch": 0.20027063599458728, + "grad_norm": 7.5394368171691895, + "learning_rate": 3.3376288659793814e-06, + "logits/chosen": 5.752139568328857, + "logits/rejected": 7.404584884643555, + "logps/chosen": -339.4193115234375, + "logps/rejected": -278.47821044921875, + "loss": 0.6552, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.8480408787727356, + "rewards/margins": 0.10558734089136124, + "rewards/rejected": 0.7424535155296326, + "step": 1295 + }, + { + "epoch": 0.20042528513435143, + "grad_norm": 5.130138874053955, + "learning_rate": 3.3402061855670103e-06, + "logits/chosen": 7.999868392944336, + "logits/rejected": 4.941250801086426, + "logps/chosen": -254.15121459960938, + "logps/rejected": -243.2579345703125, + "loss": 0.6662, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.7615885734558105, + "rewards/margins": 0.09718703478574753, + "rewards/rejected": 0.6644015312194824, + "step": 1296 + }, + { + "epoch": 0.2005799342741156, + "grad_norm": 5.25628662109375, + "learning_rate": 3.342783505154639e-06, + "logits/chosen": 4.310413360595703, + "logits/rejected": 14.12138843536377, + "logps/chosen": -165.95352172851562, + "logps/rejected": -278.21661376953125, + "loss": 0.8053, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.3964526057243347, + "rewards/margins": -0.184920534491539, + "rewards/rejected": 0.5813732147216797, + "step": 1297 + }, + { + "epoch": 0.20073458341387976, + "grad_norm": 6.754405975341797, + "learning_rate": 3.345360824742268e-06, + "logits/chosen": 3.988142490386963, + "logits/rejected": 4.422004699707031, + "logps/chosen": -258.9167175292969, + "logps/rejected": -285.71490478515625, + "loss": 0.6882, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.4739401936531067, + "rewards/margins": 0.06007649004459381, + "rewards/rejected": 0.4138637185096741, + "step": 1298 + }, + { + "epoch": 0.2008892325536439, + "grad_norm": 7.065251350402832, + "learning_rate": 3.347938144329897e-06, + "logits/chosen": 11.343421936035156, + "logits/rejected": 3.5668082237243652, + "logps/chosen": -391.7399597167969, + "logps/rejected": -301.20721435546875, + "loss": 0.5808, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.9373821020126343, + "rewards/margins": 0.2596184015274048, + "rewards/rejected": 0.6777637004852295, + "step": 1299 + }, + { + "epoch": 0.2010438816934081, + "grad_norm": 6.205452919006348, + "learning_rate": 3.350515463917526e-06, + "logits/chosen": 9.35049057006836, + "logits/rejected": 7.737802505493164, + "logps/chosen": -208.43252563476562, + "logps/rejected": -178.4341583251953, + "loss": 0.6672, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.6666086316108704, + "rewards/margins": 0.07263593375682831, + "rewards/rejected": 0.5939726829528809, + "step": 1300 + }, + { + "epoch": 0.20119853083317224, + "grad_norm": 6.771202087402344, + "learning_rate": 3.3530927835051547e-06, + "logits/chosen": 4.866030693054199, + "logits/rejected": 7.370279312133789, + "logps/chosen": -217.40887451171875, + "logps/rejected": -311.69830322265625, + "loss": 0.722, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.5795423984527588, + "rewards/margins": 0.01841604709625244, + "rewards/rejected": 0.5611263513565063, + "step": 1301 + }, + { + "epoch": 0.2013531799729364, + "grad_norm": 5.767834186553955, + "learning_rate": 3.3556701030927835e-06, + "logits/chosen": 12.459856986999512, + "logits/rejected": 6.914124965667725, + "logps/chosen": -316.07208251953125, + "logps/rejected": -298.9940185546875, + "loss": 0.6578, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.7994516491889954, + "rewards/margins": 0.17962822318077087, + "rewards/rejected": 0.6198234558105469, + "step": 1302 + }, + { + "epoch": 0.20150782911270057, + "grad_norm": 5.931360721588135, + "learning_rate": 3.3582474226804124e-06, + "logits/chosen": 11.712615966796875, + "logits/rejected": 11.847522735595703, + "logps/chosen": -362.90533447265625, + "logps/rejected": -348.126708984375, + "loss": 0.6551, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.4957813322544098, + "rewards/margins": 0.10481216013431549, + "rewards/rejected": 0.3909691870212555, + "step": 1303 + }, + { + "epoch": 0.20166247825246472, + "grad_norm": 5.806168079376221, + "learning_rate": 3.3608247422680417e-06, + "logits/chosen": 13.309556007385254, + "logits/rejected": 8.737560272216797, + "logps/chosen": -259.7633972167969, + "logps/rejected": -284.0533447265625, + "loss": 0.6523, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.9872535467147827, + "rewards/margins": 0.12479966133832932, + "rewards/rejected": 0.8624539375305176, + "step": 1304 + }, + { + "epoch": 0.20181712739222887, + "grad_norm": 4.399667263031006, + "learning_rate": 3.3634020618556706e-06, + "logits/chosen": 9.206022262573242, + "logits/rejected": 9.837970733642578, + "logps/chosen": -335.8686218261719, + "logps/rejected": -273.716796875, + "loss": 0.6568, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.7067378759384155, + "rewards/margins": 0.1392958164215088, + "rewards/rejected": 0.5674420595169067, + "step": 1305 + }, + { + "epoch": 0.20197177653199305, + "grad_norm": 5.653664588928223, + "learning_rate": 3.3659793814432995e-06, + "logits/chosen": 7.799934387207031, + "logits/rejected": 8.471553802490234, + "logps/chosen": -242.89889526367188, + "logps/rejected": -230.40338134765625, + "loss": 0.6197, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.8361656069755554, + "rewards/margins": 0.17100675404071808, + "rewards/rejected": 0.6651588678359985, + "step": 1306 + }, + { + "epoch": 0.2021264256717572, + "grad_norm": 7.008354663848877, + "learning_rate": 3.3685567010309283e-06, + "logits/chosen": 4.932295799255371, + "logits/rejected": 2.5523951053619385, + "logps/chosen": -341.0740661621094, + "logps/rejected": -248.2135467529297, + "loss": 0.7656, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.6207624673843384, + "rewards/margins": -0.10794610530138016, + "rewards/rejected": 0.7287085652351379, + "step": 1307 + }, + { + "epoch": 0.20228107481152136, + "grad_norm": 5.2920732498168945, + "learning_rate": 3.371134020618557e-06, + "logits/chosen": 14.190934181213379, + "logits/rejected": 7.582371711730957, + "logps/chosen": -409.98773193359375, + "logps/rejected": -309.5728759765625, + "loss": 0.6637, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.8439762592315674, + "rewards/margins": 0.21576416492462158, + "rewards/rejected": 0.6282120943069458, + "step": 1308 + }, + { + "epoch": 0.20243572395128553, + "grad_norm": 7.5023298263549805, + "learning_rate": 3.373711340206186e-06, + "logits/chosen": 5.844460487365723, + "logits/rejected": 3.437920570373535, + "logps/chosen": -261.376953125, + "logps/rejected": -236.91403198242188, + "loss": 0.7436, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.4867055416107178, + "rewards/margins": -0.08074112236499786, + "rewards/rejected": 0.5674466490745544, + "step": 1309 + }, + { + "epoch": 0.20259037309104969, + "grad_norm": 4.355799674987793, + "learning_rate": 3.376288659793815e-06, + "logits/chosen": 11.189939498901367, + "logits/rejected": 5.724081039428711, + "logps/chosen": -259.7492370605469, + "logps/rejected": -170.89703369140625, + "loss": 0.6478, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.5804348587989807, + "rewards/margins": 0.10685832798480988, + "rewards/rejected": 0.47357654571533203, + "step": 1310 + }, + { + "epoch": 0.20274502223081384, + "grad_norm": 4.544366836547852, + "learning_rate": 3.378865979381444e-06, + "logits/chosen": 8.01708698272705, + "logits/rejected": 8.608769416809082, + "logps/chosen": -258.0055847167969, + "logps/rejected": -274.4293212890625, + "loss": 0.6794, + "rewards/accuracies": 0.25, + "rewards/chosen": 0.49057745933532715, + "rewards/margins": 0.07700784504413605, + "rewards/rejected": 0.41356968879699707, + "step": 1311 + }, + { + "epoch": 0.202899671370578, + "grad_norm": 6.106559753417969, + "learning_rate": 3.3814432989690727e-06, + "logits/chosen": 6.4856672286987305, + "logits/rejected": 11.039149284362793, + "logps/chosen": -169.0098876953125, + "logps/rejected": -348.0252990722656, + "loss": 0.7136, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.5099309086799622, + "rewards/margins": 0.0011292845010757446, + "rewards/rejected": 0.5088015794754028, + "step": 1312 + }, + { + "epoch": 0.20305432051034217, + "grad_norm": 5.431767463684082, + "learning_rate": 3.3840206185567016e-06, + "logits/chosen": 5.8516340255737305, + "logits/rejected": 5.124634265899658, + "logps/chosen": -259.8133850097656, + "logps/rejected": -274.2174072265625, + "loss": 0.6135, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.782206118106842, + "rewards/margins": 0.2607690095901489, + "rewards/rejected": 0.5214371681213379, + "step": 1313 + }, + { + "epoch": 0.20320896965010632, + "grad_norm": 6.771440029144287, + "learning_rate": 3.3865979381443304e-06, + "logits/chosen": 12.835247039794922, + "logits/rejected": 6.702828884124756, + "logps/chosen": -286.2394104003906, + "logps/rejected": -162.53269958496094, + "loss": 0.7807, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.4118680953979492, + "rewards/margins": -0.1132233515381813, + "rewards/rejected": 0.5250914096832275, + "step": 1314 + }, + { + "epoch": 0.20336361878987047, + "grad_norm": 5.3595805168151855, + "learning_rate": 3.3891752577319593e-06, + "logits/chosen": 14.908011436462402, + "logits/rejected": 13.651371955871582, + "logps/chosen": -303.52569580078125, + "logps/rejected": -280.60687255859375, + "loss": 0.6852, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.5502185821533203, + "rewards/margins": 0.04772016033530235, + "rewards/rejected": 0.5024983882904053, + "step": 1315 + }, + { + "epoch": 0.20351826792963465, + "grad_norm": 10.267784118652344, + "learning_rate": 3.391752577319588e-06, + "logits/chosen": 6.735694408416748, + "logits/rejected": 8.959792137145996, + "logps/chosen": -251.3719482421875, + "logps/rejected": -246.74952697753906, + "loss": 0.901, + "rewards/accuracies": 0.125, + "rewards/chosen": 0.31730565428733826, + "rewards/margins": -0.3489902913570404, + "rewards/rejected": 0.6662959456443787, + "step": 1316 + }, + { + "epoch": 0.2036729170693988, + "grad_norm": 7.04838228225708, + "learning_rate": 3.394329896907217e-06, + "logits/chosen": 2.9848341941833496, + "logits/rejected": 4.147202014923096, + "logps/chosen": -267.9589538574219, + "logps/rejected": -267.35107421875, + "loss": 0.6618, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.4270978569984436, + "rewards/margins": 0.11878165602684021, + "rewards/rejected": 0.308316171169281, + "step": 1317 + }, + { + "epoch": 0.20382756620916295, + "grad_norm": 6.5067925453186035, + "learning_rate": 3.396907216494846e-06, + "logits/chosen": 9.304525375366211, + "logits/rejected": 8.927728652954102, + "logps/chosen": -520.3414916992188, + "logps/rejected": -377.6937561035156, + "loss": 0.61, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.6300702691078186, + "rewards/margins": 0.22060082852840424, + "rewards/rejected": 0.4094694256782532, + "step": 1318 + }, + { + "epoch": 0.20398221534892713, + "grad_norm": 5.2436442375183105, + "learning_rate": 3.399484536082475e-06, + "logits/chosen": 7.295438766479492, + "logits/rejected": 4.511636257171631, + "logps/chosen": -197.48475646972656, + "logps/rejected": -201.05982971191406, + "loss": 0.7343, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.5716656446456909, + "rewards/margins": -0.042453303933143616, + "rewards/rejected": 0.6141189932823181, + "step": 1319 + }, + { + "epoch": 0.20413686448869128, + "grad_norm": 7.030094146728516, + "learning_rate": 3.4020618556701037e-06, + "logits/chosen": 9.978487014770508, + "logits/rejected": 10.432518005371094, + "logps/chosen": -385.94537353515625, + "logps/rejected": -363.43292236328125, + "loss": 0.653, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.5859411954879761, + "rewards/margins": 0.10076151043176651, + "rewards/rejected": 0.4851796627044678, + "step": 1320 + }, + { + "epoch": 0.20429151362845543, + "grad_norm": 5.305937767028809, + "learning_rate": 3.4046391752577325e-06, + "logits/chosen": 14.626575469970703, + "logits/rejected": 7.095052242279053, + "logps/chosen": -364.23223876953125, + "logps/rejected": -269.1084289550781, + "loss": 0.5275, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.684237539768219, + "rewards/margins": 0.39492228627204895, + "rewards/rejected": 0.28931522369384766, + "step": 1321 + }, + { + "epoch": 0.2044461627682196, + "grad_norm": 4.810122013092041, + "learning_rate": 3.4072164948453614e-06, + "logits/chosen": 13.464654922485352, + "logits/rejected": 11.766478538513184, + "logps/chosen": -308.0333251953125, + "logps/rejected": -298.7428283691406, + "loss": 0.6219, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.5712913274765015, + "rewards/margins": 0.189250648021698, + "rewards/rejected": 0.38204070925712585, + "step": 1322 + }, + { + "epoch": 0.20460081190798377, + "grad_norm": 5.0442891120910645, + "learning_rate": 3.4097938144329903e-06, + "logits/chosen": 15.703763008117676, + "logits/rejected": 8.28512954711914, + "logps/chosen": -356.3396301269531, + "logps/rejected": -245.67477416992188, + "loss": 0.6153, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.585950493812561, + "rewards/margins": 0.1931637078523636, + "rewards/rejected": 0.39278680086135864, + "step": 1323 + }, + { + "epoch": 0.20475546104774792, + "grad_norm": 6.674186706542969, + "learning_rate": 3.4123711340206187e-06, + "logits/chosen": 10.403979301452637, + "logits/rejected": 10.42829704284668, + "logps/chosen": -263.7735595703125, + "logps/rejected": -225.29261779785156, + "loss": 0.5342, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.6374138593673706, + "rewards/margins": 0.41126564145088196, + "rewards/rejected": 0.22614821791648865, + "step": 1324 + }, + { + "epoch": 0.2049101101875121, + "grad_norm": 8.718478202819824, + "learning_rate": 3.4149484536082476e-06, + "logits/chosen": 5.965991973876953, + "logits/rejected": 7.266295433044434, + "logps/chosen": -371.6241149902344, + "logps/rejected": -343.738525390625, + "loss": 0.737, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.5868619084358215, + "rewards/margins": -0.06999720633029938, + "rewards/rejected": 0.6568591594696045, + "step": 1325 + }, + { + "epoch": 0.20506475932727625, + "grad_norm": 5.564294338226318, + "learning_rate": 3.4175257731958765e-06, + "logits/chosen": 7.479012489318848, + "logits/rejected": 9.721044540405273, + "logps/chosen": -169.46725463867188, + "logps/rejected": -233.14649963378906, + "loss": 0.7074, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.2817467153072357, + "rewards/margins": 0.0021076202392578125, + "rewards/rejected": 0.2796390652656555, + "step": 1326 + }, + { + "epoch": 0.2052194084670404, + "grad_norm": 5.872929096221924, + "learning_rate": 3.4201030927835053e-06, + "logits/chosen": 11.168549537658691, + "logits/rejected": 11.150811195373535, + "logps/chosen": -280.85418701171875, + "logps/rejected": -228.03378295898438, + "loss": 0.7117, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.42225030064582825, + "rewards/margins": -0.010264307260513306, + "rewards/rejected": 0.43251463770866394, + "step": 1327 + }, + { + "epoch": 0.20537405760680455, + "grad_norm": 5.55307149887085, + "learning_rate": 3.4226804123711342e-06, + "logits/chosen": 12.75794792175293, + "logits/rejected": 9.967142105102539, + "logps/chosen": -332.015380859375, + "logps/rejected": -330.7392578125, + "loss": 0.6744, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.6682014465332031, + "rewards/margins": 0.06450548768043518, + "rewards/rejected": 0.6036959886550903, + "step": 1328 + }, + { + "epoch": 0.20552870674656873, + "grad_norm": 6.932251930236816, + "learning_rate": 3.425257731958763e-06, + "logits/chosen": 12.506941795349121, + "logits/rejected": 5.496970176696777, + "logps/chosen": -540.5363159179688, + "logps/rejected": -341.3646240234375, + "loss": 0.703, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.7784515619277954, + "rewards/margins": -0.0071449726819992065, + "rewards/rejected": 0.785596489906311, + "step": 1329 + }, + { + "epoch": 0.20568335588633288, + "grad_norm": 6.123511791229248, + "learning_rate": 3.427835051546392e-06, + "logits/chosen": 14.895683288574219, + "logits/rejected": 11.09814167022705, + "logps/chosen": -384.80328369140625, + "logps/rejected": -344.2386474609375, + "loss": 0.6329, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.4796812832355499, + "rewards/margins": 0.17634254693984985, + "rewards/rejected": 0.3033387362957001, + "step": 1330 + }, + { + "epoch": 0.20583800502609703, + "grad_norm": 5.52587890625, + "learning_rate": 3.430412371134021e-06, + "logits/chosen": 7.867013931274414, + "logits/rejected": 8.348973274230957, + "logps/chosen": -235.122802734375, + "logps/rejected": -251.46774291992188, + "loss": 0.7049, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.6932826042175293, + "rewards/margins": 0.015125781297683716, + "rewards/rejected": 0.6781567335128784, + "step": 1331 + }, + { + "epoch": 0.2059926541658612, + "grad_norm": 4.428856372833252, + "learning_rate": 3.4329896907216497e-06, + "logits/chosen": 12.31286907196045, + "logits/rejected": 10.521028518676758, + "logps/chosen": -394.11236572265625, + "logps/rejected": -353.8339538574219, + "loss": 0.5127, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.7203955054283142, + "rewards/margins": 0.4415269196033478, + "rewards/rejected": 0.27886858582496643, + "step": 1332 + }, + { + "epoch": 0.20614730330562536, + "grad_norm": 5.749904632568359, + "learning_rate": 3.4355670103092786e-06, + "logits/chosen": 8.381030082702637, + "logits/rejected": 3.0804190635681152, + "logps/chosen": -355.7543640136719, + "logps/rejected": -166.9155731201172, + "loss": 0.6921, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.378226101398468, + "rewards/margins": 0.024712078273296356, + "rewards/rejected": 0.35351401567459106, + "step": 1333 + }, + { + "epoch": 0.2063019524453895, + "grad_norm": 6.69580602645874, + "learning_rate": 3.4381443298969074e-06, + "logits/chosen": 6.467792987823486, + "logits/rejected": 6.528688907623291, + "logps/chosen": -282.5873718261719, + "logps/rejected": -274.0964050292969, + "loss": 0.6595, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.42681685090065, + "rewards/margins": 0.10628613829612732, + "rewards/rejected": 0.3205307126045227, + "step": 1334 + }, + { + "epoch": 0.2064566015851537, + "grad_norm": 8.357756614685059, + "learning_rate": 3.4407216494845363e-06, + "logits/chosen": 7.871211051940918, + "logits/rejected": 7.975522518157959, + "logps/chosen": -134.2395782470703, + "logps/rejected": -170.60951232910156, + "loss": 0.8547, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.2632666230201721, + "rewards/margins": -0.14537882804870605, + "rewards/rejected": 0.4086454212665558, + "step": 1335 + }, + { + "epoch": 0.20661125072491784, + "grad_norm": 7.131282329559326, + "learning_rate": 3.443298969072165e-06, + "logits/chosen": 7.78883695602417, + "logits/rejected": 8.601940155029297, + "logps/chosen": -363.1313781738281, + "logps/rejected": -415.93212890625, + "loss": 0.7514, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.4316656291484833, + "rewards/margins": -0.039987191557884216, + "rewards/rejected": 0.4716528356075287, + "step": 1336 + }, + { + "epoch": 0.206765899864682, + "grad_norm": 17.950986862182617, + "learning_rate": 3.445876288659794e-06, + "logits/chosen": 10.061782836914062, + "logits/rejected": 2.5823841094970703, + "logps/chosen": -387.6070251464844, + "logps/rejected": -216.02276611328125, + "loss": 0.8459, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.3303816020488739, + "rewards/margins": -0.14370346069335938, + "rewards/rejected": 0.4740850627422333, + "step": 1337 + }, + { + "epoch": 0.20692054900444617, + "grad_norm": 9.825039863586426, + "learning_rate": 3.448453608247423e-06, + "logits/chosen": 11.339818000793457, + "logits/rejected": 3.227383613586426, + "logps/chosen": -448.6005554199219, + "logps/rejected": -308.6451416015625, + "loss": 0.7428, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.49702179431915283, + "rewards/margins": -0.05179423838853836, + "rewards/rejected": 0.5488160252571106, + "step": 1338 + }, + { + "epoch": 0.20707519814421033, + "grad_norm": 5.399560451507568, + "learning_rate": 3.451030927835052e-06, + "logits/chosen": 7.201487064361572, + "logits/rejected": 8.292201042175293, + "logps/chosen": -222.55514526367188, + "logps/rejected": -231.8472137451172, + "loss": 0.6329, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.42050811648368835, + "rewards/margins": 0.13930931687355042, + "rewards/rejected": 0.28119879961013794, + "step": 1339 + }, + { + "epoch": 0.20722984728397448, + "grad_norm": 5.372221946716309, + "learning_rate": 3.4536082474226807e-06, + "logits/chosen": 9.994074821472168, + "logits/rejected": 2.6565442085266113, + "logps/chosen": -508.5455627441406, + "logps/rejected": -259.623291015625, + "loss": 0.646, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.46773701906204224, + "rewards/margins": 0.11834974586963654, + "rewards/rejected": 0.3493872880935669, + "step": 1340 + }, + { + "epoch": 0.20738449642373866, + "grad_norm": 5.956606864929199, + "learning_rate": 3.4561855670103095e-06, + "logits/chosen": 14.832313537597656, + "logits/rejected": 10.273258209228516, + "logps/chosen": -332.6368103027344, + "logps/rejected": -294.1409606933594, + "loss": 0.7089, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.40060776472091675, + "rewards/margins": 0.005407616496086121, + "rewards/rejected": 0.3952001929283142, + "step": 1341 + }, + { + "epoch": 0.2075391455635028, + "grad_norm": 4.652444362640381, + "learning_rate": 3.4587628865979384e-06, + "logits/chosen": 9.282845497131348, + "logits/rejected": 5.308042526245117, + "logps/chosen": -177.93292236328125, + "logps/rejected": -149.10462951660156, + "loss": 0.7116, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.6127756834030151, + "rewards/margins": 0.0020549073815345764, + "rewards/rejected": 0.6107207536697388, + "step": 1342 + }, + { + "epoch": 0.20769379470326696, + "grad_norm": 7.147805213928223, + "learning_rate": 3.4613402061855673e-06, + "logits/chosen": 5.852838516235352, + "logits/rejected": 8.764505386352539, + "logps/chosen": -235.6710205078125, + "logps/rejected": -307.25634765625, + "loss": 0.6973, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.4334206283092499, + "rewards/margins": 0.016242746263742447, + "rewards/rejected": 0.41717788577079773, + "step": 1343 + }, + { + "epoch": 0.2078484438430311, + "grad_norm": 6.007249355316162, + "learning_rate": 3.463917525773196e-06, + "logits/chosen": 6.329930305480957, + "logits/rejected": 6.019890308380127, + "logps/chosen": -336.0419921875, + "logps/rejected": -291.0050354003906, + "loss": 0.673, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.41354864835739136, + "rewards/margins": 0.1292746663093567, + "rewards/rejected": 0.28427398204803467, + "step": 1344 + }, + { + "epoch": 0.2080030929827953, + "grad_norm": 3.8515822887420654, + "learning_rate": 3.466494845360825e-06, + "logits/chosen": 11.170055389404297, + "logits/rejected": 6.0909929275512695, + "logps/chosen": -184.79251098632812, + "logps/rejected": -128.2896270751953, + "loss": 0.6112, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.5677140951156616, + "rewards/margins": 0.19397355616092682, + "rewards/rejected": 0.3737405240535736, + "step": 1345 + }, + { + "epoch": 0.20815774212255944, + "grad_norm": 6.491765022277832, + "learning_rate": 3.469072164948454e-06, + "logits/chosen": 10.483287811279297, + "logits/rejected": 3.665989398956299, + "logps/chosen": -382.7060241699219, + "logps/rejected": -187.4184112548828, + "loss": 0.6964, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.3527681529521942, + "rewards/margins": 0.021226219832897186, + "rewards/rejected": 0.33154192566871643, + "step": 1346 + }, + { + "epoch": 0.2083123912623236, + "grad_norm": 4.384526252746582, + "learning_rate": 3.4716494845360828e-06, + "logits/chosen": 6.553007125854492, + "logits/rejected": 3.0628886222839355, + "logps/chosen": -240.4276123046875, + "logps/rejected": -196.03707885742188, + "loss": 0.5373, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.6536540389060974, + "rewards/margins": 0.36939775943756104, + "rewards/rejected": 0.28425630927085876, + "step": 1347 + }, + { + "epoch": 0.20846704040208777, + "grad_norm": 4.414042949676514, + "learning_rate": 3.4742268041237117e-06, + "logits/chosen": 12.046533584594727, + "logits/rejected": 6.874114036560059, + "logps/chosen": -192.94686889648438, + "logps/rejected": -105.77291107177734, + "loss": 0.6606, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.4994664490222931, + "rewards/margins": 0.07198914885520935, + "rewards/rejected": 0.42747727036476135, + "step": 1348 + }, + { + "epoch": 0.20862168954185192, + "grad_norm": 4.539097309112549, + "learning_rate": 3.4768041237113405e-06, + "logits/chosen": 5.563868999481201, + "logits/rejected": 4.425388813018799, + "logps/chosen": -207.09054565429688, + "logps/rejected": -190.4141082763672, + "loss": 0.6657, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.5783560276031494, + "rewards/margins": 0.07256454229354858, + "rewards/rejected": 0.5057914853096008, + "step": 1349 + }, + { + "epoch": 0.20877633868161607, + "grad_norm": 6.892353534698486, + "learning_rate": 3.4793814432989694e-06, + "logits/chosen": 8.358343124389648, + "logits/rejected": 6.98340368270874, + "logps/chosen": -257.2492980957031, + "logps/rejected": -248.3079833984375, + "loss": 0.6452, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.43171054124832153, + "rewards/margins": 0.14787976443767548, + "rewards/rejected": 0.28383079171180725, + "step": 1350 + }, + { + "epoch": 0.20893098782138025, + "grad_norm": 8.68060302734375, + "learning_rate": 3.4819587628865983e-06, + "logits/chosen": 11.185699462890625, + "logits/rejected": 6.291648864746094, + "logps/chosen": -488.0755615234375, + "logps/rejected": -431.4562072753906, + "loss": 0.6475, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.71002197265625, + "rewards/margins": 0.15664920210838318, + "rewards/rejected": 0.5533727407455444, + "step": 1351 + }, + { + "epoch": 0.2090856369611444, + "grad_norm": 4.891376495361328, + "learning_rate": 3.4845360824742267e-06, + "logits/chosen": 9.884033203125, + "logits/rejected": 9.315380096435547, + "logps/chosen": -254.7776641845703, + "logps/rejected": -262.2553405761719, + "loss": 0.5941, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.5134794116020203, + "rewards/margins": 0.23740099370479584, + "rewards/rejected": 0.2760784327983856, + "step": 1352 + }, + { + "epoch": 0.20924028610090856, + "grad_norm": 6.2578301429748535, + "learning_rate": 3.4871134020618556e-06, + "logits/chosen": 6.928276062011719, + "logits/rejected": 3.1283023357391357, + "logps/chosen": -270.92510986328125, + "logps/rejected": -207.70321655273438, + "loss": 0.7315, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.45973482728004456, + "rewards/margins": -0.043284885585308075, + "rewards/rejected": 0.5030196905136108, + "step": 1353 + }, + { + "epoch": 0.20939493524067274, + "grad_norm": 4.849658489227295, + "learning_rate": 3.4896907216494845e-06, + "logits/chosen": 11.711618423461914, + "logits/rejected": 7.391726493835449, + "logps/chosen": -290.62371826171875, + "logps/rejected": -274.7575378417969, + "loss": 0.6334, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.29147475957870483, + "rewards/margins": 0.17306284606456757, + "rewards/rejected": 0.11841192096471786, + "step": 1354 + }, + { + "epoch": 0.2095495843804369, + "grad_norm": 4.921021461486816, + "learning_rate": 3.4922680412371133e-06, + "logits/chosen": 11.806081771850586, + "logits/rejected": 9.388275146484375, + "logps/chosen": -246.70620727539062, + "logps/rejected": -272.8097839355469, + "loss": 0.5829, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.629475474357605, + "rewards/margins": 0.24401850998401642, + "rewards/rejected": 0.38545700907707214, + "step": 1355 + }, + { + "epoch": 0.20970423352020104, + "grad_norm": 8.295026779174805, + "learning_rate": 3.494845360824742e-06, + "logits/chosen": 4.895730495452881, + "logits/rejected": 6.708144664764404, + "logps/chosen": -280.4863586425781, + "logps/rejected": -356.3783264160156, + "loss": 0.6362, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.7940717935562134, + "rewards/margins": 0.15280523896217346, + "rewards/rejected": 0.6412665843963623, + "step": 1356 + }, + { + "epoch": 0.2098588826599652, + "grad_norm": 4.719005107879639, + "learning_rate": 3.497422680412371e-06, + "logits/chosen": 4.938675880432129, + "logits/rejected": 6.426571369171143, + "logps/chosen": -206.31689453125, + "logps/rejected": -209.0506591796875, + "loss": 0.6483, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.43130356073379517, + "rewards/margins": 0.10776080936193466, + "rewards/rejected": 0.3235427737236023, + "step": 1357 + }, + { + "epoch": 0.21001353179972937, + "grad_norm": 5.728810787200928, + "learning_rate": 3.5e-06, + "logits/chosen": 7.770626068115234, + "logits/rejected": 9.35832405090332, + "logps/chosen": -329.96270751953125, + "logps/rejected": -399.1640625, + "loss": 0.6299, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.4822203516960144, + "rewards/margins": 0.176669642329216, + "rewards/rejected": 0.3055507242679596, + "step": 1358 + }, + { + "epoch": 0.21016818093949352, + "grad_norm": 7.7981109619140625, + "learning_rate": 3.502577319587629e-06, + "logits/chosen": 6.6221466064453125, + "logits/rejected": 9.354161262512207, + "logps/chosen": -378.2397155761719, + "logps/rejected": -354.189453125, + "loss": 0.9013, + "rewards/accuracies": 0.125, + "rewards/chosen": 0.4871975779533386, + "rewards/margins": -0.3183588981628418, + "rewards/rejected": 0.8055565357208252, + "step": 1359 + }, + { + "epoch": 0.21032283007925767, + "grad_norm": 4.4969024658203125, + "learning_rate": 3.5051546391752577e-06, + "logits/chosen": 10.313762664794922, + "logits/rejected": 5.391558647155762, + "logps/chosen": -311.0745544433594, + "logps/rejected": -229.59690856933594, + "loss": 0.5052, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.6098133325576782, + "rewards/margins": 0.4761411249637604, + "rewards/rejected": 0.13367222249507904, + "step": 1360 + }, + { + "epoch": 0.21047747921902185, + "grad_norm": 4.826947212219238, + "learning_rate": 3.5077319587628866e-06, + "logits/chosen": 11.115373611450195, + "logits/rejected": 6.972024440765381, + "logps/chosen": -221.2061004638672, + "logps/rejected": -173.62017822265625, + "loss": 0.6988, + "rewards/accuracies": 0.25, + "rewards/chosen": 0.3351612091064453, + "rewards/margins": 0.009484857320785522, + "rewards/rejected": 0.3256763517856598, + "step": 1361 + }, + { + "epoch": 0.210632128358786, + "grad_norm": 5.5508809089660645, + "learning_rate": 3.5103092783505154e-06, + "logits/chosen": 10.264398574829102, + "logits/rejected": 8.313765525817871, + "logps/chosen": -345.8274841308594, + "logps/rejected": -303.15081787109375, + "loss": 0.6368, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.4567398130893707, + "rewards/margins": 0.15106360614299774, + "rewards/rejected": 0.3056762218475342, + "step": 1362 + }, + { + "epoch": 0.21078677749855015, + "grad_norm": 3.8625824451446533, + "learning_rate": 3.5128865979381443e-06, + "logits/chosen": 6.132232189178467, + "logits/rejected": 8.672809600830078, + "logps/chosen": -114.29231262207031, + "logps/rejected": -125.7341537475586, + "loss": 0.6784, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.3751299977302551, + "rewards/margins": 0.03930889442563057, + "rewards/rejected": 0.33582109212875366, + "step": 1363 + }, + { + "epoch": 0.21094142663831433, + "grad_norm": 42.03532791137695, + "learning_rate": 3.515463917525773e-06, + "logits/chosen": 9.723971366882324, + "logits/rejected": 9.946759223937988, + "logps/chosen": -252.67922973632812, + "logps/rejected": -264.53125, + "loss": 0.6371, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.601596474647522, + "rewards/margins": 0.12183618545532227, + "rewards/rejected": 0.4797602593898773, + "step": 1364 + }, + { + "epoch": 0.21109607577807848, + "grad_norm": 4.07283878326416, + "learning_rate": 3.5180412371134025e-06, + "logits/chosen": 13.018229484558105, + "logits/rejected": 3.55332612991333, + "logps/chosen": -213.94863891601562, + "logps/rejected": -103.7365493774414, + "loss": 0.5938, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.43834927678108215, + "rewards/margins": 0.2561228573322296, + "rewards/rejected": 0.18222640454769135, + "step": 1365 + }, + { + "epoch": 0.21125072491784264, + "grad_norm": 5.292239665985107, + "learning_rate": 3.5206185567010313e-06, + "logits/chosen": 10.229877471923828, + "logits/rejected": 0.362331748008728, + "logps/chosen": -386.6089172363281, + "logps/rejected": -212.48472595214844, + "loss": 0.6455, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.539689302444458, + "rewards/margins": 0.14695516228675842, + "rewards/rejected": 0.3927341401576996, + "step": 1366 + }, + { + "epoch": 0.21140537405760682, + "grad_norm": 5.216536998748779, + "learning_rate": 3.5231958762886602e-06, + "logits/chosen": 6.677067756652832, + "logits/rejected": 4.888310432434082, + "logps/chosen": -201.47525024414062, + "logps/rejected": -202.88449096679688, + "loss": 0.746, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.35402822494506836, + "rewards/margins": -0.07672037929296494, + "rewards/rejected": 0.4307486116886139, + "step": 1367 + }, + { + "epoch": 0.21156002319737097, + "grad_norm": 5.612256050109863, + "learning_rate": 3.525773195876289e-06, + "logits/chosen": 10.84018325805664, + "logits/rejected": 5.113863945007324, + "logps/chosen": -355.4080810546875, + "logps/rejected": -215.42857360839844, + "loss": 0.7804, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.5365468859672546, + "rewards/margins": -0.1097065880894661, + "rewards/rejected": 0.6462534666061401, + "step": 1368 + }, + { + "epoch": 0.21171467233713512, + "grad_norm": 26.769519805908203, + "learning_rate": 3.528350515463918e-06, + "logits/chosen": 8.440407752990723, + "logits/rejected": 10.98375129699707, + "logps/chosen": -406.82733154296875, + "logps/rejected": -433.5931396484375, + "loss": 0.6864, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.5005760192871094, + "rewards/margins": 0.028843581676483154, + "rewards/rejected": 0.4717324376106262, + "step": 1369 + }, + { + "epoch": 0.2118693214768993, + "grad_norm": 6.7686004638671875, + "learning_rate": 3.530927835051547e-06, + "logits/chosen": 4.020563125610352, + "logits/rejected": 6.296971797943115, + "logps/chosen": -223.6477508544922, + "logps/rejected": -333.93701171875, + "loss": 0.6866, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.5287067890167236, + "rewards/margins": 0.0202481746673584, + "rewards/rejected": 0.5084586143493652, + "step": 1370 + }, + { + "epoch": 0.21202397061666345, + "grad_norm": 17.801685333251953, + "learning_rate": 3.5335051546391757e-06, + "logits/chosen": 9.04471206665039, + "logits/rejected": 4.763840198516846, + "logps/chosen": -397.53448486328125, + "logps/rejected": -329.04559326171875, + "loss": 0.7461, + "rewards/accuracies": 0.25, + "rewards/chosen": 0.37218376994132996, + "rewards/margins": -0.07964642345905304, + "rewards/rejected": 0.4518301784992218, + "step": 1371 + }, + { + "epoch": 0.2121786197564276, + "grad_norm": 4.7403669357299805, + "learning_rate": 3.5360824742268046e-06, + "logits/chosen": 10.435708045959473, + "logits/rejected": 7.7940168380737305, + "logps/chosen": -220.83447265625, + "logps/rejected": -167.44602966308594, + "loss": 0.628, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.5339148044586182, + "rewards/margins": 0.15386907756328583, + "rewards/rejected": 0.38004571199417114, + "step": 1372 + }, + { + "epoch": 0.21233326889619175, + "grad_norm": 6.455552577972412, + "learning_rate": 3.5386597938144334e-06, + "logits/chosen": 17.841386795043945, + "logits/rejected": 10.11112117767334, + "logps/chosen": -340.7460632324219, + "logps/rejected": -238.43994140625, + "loss": 0.6542, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.42541542649269104, + "rewards/margins": 0.08890493959188461, + "rewards/rejected": 0.33651047945022583, + "step": 1373 + }, + { + "epoch": 0.21248791803595593, + "grad_norm": 5.383336067199707, + "learning_rate": 3.5412371134020623e-06, + "logits/chosen": 6.971219062805176, + "logits/rejected": 8.764307975769043, + "logps/chosen": -170.48658752441406, + "logps/rejected": -164.9232177734375, + "loss": 0.6703, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.3233797550201416, + "rewards/margins": 0.053184788674116135, + "rewards/rejected": 0.27019497752189636, + "step": 1374 + }, + { + "epoch": 0.21264256717572008, + "grad_norm": 5.377937316894531, + "learning_rate": 3.543814432989691e-06, + "logits/chosen": 10.733101844787598, + "logits/rejected": 3.9719767570495605, + "logps/chosen": -411.1726989746094, + "logps/rejected": -304.9864501953125, + "loss": 0.5804, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.5369677543640137, + "rewards/margins": 0.31531763076782227, + "rewards/rejected": 0.2216501235961914, + "step": 1375 + }, + { + "epoch": 0.21279721631548423, + "grad_norm": 4.059070110321045, + "learning_rate": 3.54639175257732e-06, + "logits/chosen": 8.050950050354004, + "logits/rejected": 9.078701972961426, + "logps/chosen": -210.8147430419922, + "logps/rejected": -284.099609375, + "loss": 0.5366, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.6488400101661682, + "rewards/margins": 0.40098732709884644, + "rewards/rejected": 0.24785271286964417, + "step": 1376 + }, + { + "epoch": 0.2129518654552484, + "grad_norm": 4.732486248016357, + "learning_rate": 3.548969072164949e-06, + "logits/chosen": 9.83571720123291, + "logits/rejected": 8.933853149414062, + "logps/chosen": -200.92828369140625, + "logps/rejected": -192.86416625976562, + "loss": 0.7852, + "rewards/accuracies": 0.125, + "rewards/chosen": 0.41808220744132996, + "rewards/margins": -0.1393115222454071, + "rewards/rejected": 0.5573937296867371, + "step": 1377 + }, + { + "epoch": 0.21310651459501256, + "grad_norm": 6.51684045791626, + "learning_rate": 3.551546391752578e-06, + "logits/chosen": 8.409055709838867, + "logits/rejected": 5.54688024520874, + "logps/chosen": -302.28753662109375, + "logps/rejected": -228.40945434570312, + "loss": 0.7138, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.5649549961090088, + "rewards/margins": -0.01456449180841446, + "rewards/rejected": 0.579519510269165, + "step": 1378 + }, + { + "epoch": 0.21326116373477672, + "grad_norm": 6.601457118988037, + "learning_rate": 3.5541237113402067e-06, + "logits/chosen": 3.236544609069824, + "logits/rejected": 10.034658432006836, + "logps/chosen": -251.71966552734375, + "logps/rejected": -375.01556396484375, + "loss": 0.782, + "rewards/accuracies": 0.25, + "rewards/chosen": 0.29124915599823, + "rewards/margins": -0.10657037049531937, + "rewards/rejected": 0.39781951904296875, + "step": 1379 + }, + { + "epoch": 0.2134158128745409, + "grad_norm": 5.969194412231445, + "learning_rate": 3.5567010309278356e-06, + "logits/chosen": 14.562919616699219, + "logits/rejected": 12.604349136352539, + "logps/chosen": -420.05902099609375, + "logps/rejected": -321.63702392578125, + "loss": 0.67, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.6438345909118652, + "rewards/margins": 0.0644078180193901, + "rewards/rejected": 0.5794267654418945, + "step": 1380 + }, + { + "epoch": 0.21357046201430505, + "grad_norm": 7.169862270355225, + "learning_rate": 3.5592783505154644e-06, + "logits/chosen": 5.652187824249268, + "logits/rejected": 10.172074317932129, + "logps/chosen": -304.4931945800781, + "logps/rejected": -318.00958251953125, + "loss": 0.7506, + "rewards/accuracies": 0.25, + "rewards/chosen": 0.28241586685180664, + "rewards/margins": -0.07487916946411133, + "rewards/rejected": 0.35729503631591797, + "step": 1381 + }, + { + "epoch": 0.2137251111540692, + "grad_norm": 4.332362174987793, + "learning_rate": 3.5618556701030933e-06, + "logits/chosen": 12.813555717468262, + "logits/rejected": 9.093693733215332, + "logps/chosen": -227.2550811767578, + "logps/rejected": -211.45059204101562, + "loss": 0.5874, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.4965195655822754, + "rewards/margins": 0.289533793926239, + "rewards/rejected": 0.20698577165603638, + "step": 1382 + }, + { + "epoch": 0.21387976029383338, + "grad_norm": 5.209338188171387, + "learning_rate": 3.564432989690722e-06, + "logits/chosen": 7.237485885620117, + "logits/rejected": 15.047758102416992, + "logps/chosen": -287.3205871582031, + "logps/rejected": -337.389892578125, + "loss": 0.6525, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.463872492313385, + "rewards/margins": 0.12451709806919098, + "rewards/rejected": 0.3393554091453552, + "step": 1383 + }, + { + "epoch": 0.21403440943359753, + "grad_norm": 5.4234490394592285, + "learning_rate": 3.567010309278351e-06, + "logits/chosen": 11.620508193969727, + "logits/rejected": 4.772186279296875, + "logps/chosen": -295.1038513183594, + "logps/rejected": -208.3180694580078, + "loss": 0.6859, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.46672195196151733, + "rewards/margins": 0.04274645820260048, + "rewards/rejected": 0.42397546768188477, + "step": 1384 + }, + { + "epoch": 0.21418905857336168, + "grad_norm": 5.287847995758057, + "learning_rate": 3.56958762886598e-06, + "logits/chosen": 8.704248428344727, + "logits/rejected": 6.20478630065918, + "logps/chosen": -248.81182861328125, + "logps/rejected": -213.886474609375, + "loss": 0.6694, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.19882261753082275, + "rewards/margins": 0.06858278065919876, + "rewards/rejected": 0.1302398294210434, + "step": 1385 + }, + { + "epoch": 0.21434370771312586, + "grad_norm": 5.387875080108643, + "learning_rate": 3.5721649484536088e-06, + "logits/chosen": 7.495245456695557, + "logits/rejected": 5.227661609649658, + "logps/chosen": -212.9635009765625, + "logps/rejected": -222.36404418945312, + "loss": 0.6765, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.38130658864974976, + "rewards/margins": 0.08010635524988174, + "rewards/rejected": 0.301200270652771, + "step": 1386 + }, + { + "epoch": 0.21449835685289, + "grad_norm": 5.341694355010986, + "learning_rate": 3.5747422680412377e-06, + "logits/chosen": 14.002464294433594, + "logits/rejected": 3.797640800476074, + "logps/chosen": -352.9326171875, + "logps/rejected": -213.20823669433594, + "loss": 0.5679, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.548779308795929, + "rewards/margins": 0.2892724871635437, + "rewards/rejected": 0.25950682163238525, + "step": 1387 + }, + { + "epoch": 0.21465300599265416, + "grad_norm": 4.783133029937744, + "learning_rate": 3.5773195876288665e-06, + "logits/chosen": 14.291772842407227, + "logits/rejected": 6.613865375518799, + "logps/chosen": -233.8211669921875, + "logps/rejected": -121.76593017578125, + "loss": 0.6629, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.44137075543403625, + "rewards/margins": 0.0829995647072792, + "rewards/rejected": 0.35837119817733765, + "step": 1388 + }, + { + "epoch": 0.2148076551324183, + "grad_norm": 4.695958137512207, + "learning_rate": 3.5798969072164954e-06, + "logits/chosen": 9.069969177246094, + "logits/rejected": 12.631689071655273, + "logps/chosen": -154.82574462890625, + "logps/rejected": -233.6972198486328, + "loss": 0.7544, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.31659260392189026, + "rewards/margins": -0.0945776104927063, + "rewards/rejected": 0.41117021441459656, + "step": 1389 + }, + { + "epoch": 0.2149623042721825, + "grad_norm": 17.922128677368164, + "learning_rate": 3.582474226804124e-06, + "logits/chosen": 11.884603500366211, + "logits/rejected": 7.742044448852539, + "logps/chosen": -313.76458740234375, + "logps/rejected": -242.19577026367188, + "loss": 0.5482, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.7540775537490845, + "rewards/margins": 0.4058167040348053, + "rewards/rejected": 0.3482608199119568, + "step": 1390 + }, + { + "epoch": 0.21511695341194664, + "grad_norm": 7.593837738037109, + "learning_rate": 3.5850515463917527e-06, + "logits/chosen": 11.985014915466309, + "logits/rejected": 12.171064376831055, + "logps/chosen": -424.50408935546875, + "logps/rejected": -381.389404296875, + "loss": 0.6374, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.4164169430732727, + "rewards/margins": 0.12984275817871094, + "rewards/rejected": 0.28657418489456177, + "step": 1391 + }, + { + "epoch": 0.2152716025517108, + "grad_norm": 4.750051021575928, + "learning_rate": 3.5876288659793816e-06, + "logits/chosen": 11.197408676147461, + "logits/rejected": 8.01699447631836, + "logps/chosen": -249.84201049804688, + "logps/rejected": -241.58238220214844, + "loss": 0.6087, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.5981346368789673, + "rewards/margins": 0.2080833911895752, + "rewards/rejected": 0.3900512754917145, + "step": 1392 + }, + { + "epoch": 0.21542625169147497, + "grad_norm": 6.1215314865112305, + "learning_rate": 3.5902061855670105e-06, + "logits/chosen": 7.208744049072266, + "logits/rejected": 0.5497065782546997, + "logps/chosen": -353.7894287109375, + "logps/rejected": -192.8113555908203, + "loss": 0.6358, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.42695724964141846, + "rewards/margins": 0.12696388363838196, + "rewards/rejected": 0.2999933362007141, + "step": 1393 + }, + { + "epoch": 0.21558090083123913, + "grad_norm": 5.062748908996582, + "learning_rate": 3.5927835051546393e-06, + "logits/chosen": 15.541715621948242, + "logits/rejected": 12.296457290649414, + "logps/chosen": -359.38653564453125, + "logps/rejected": -321.9974670410156, + "loss": 0.5524, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.6806190609931946, + "rewards/margins": 0.37431812286376953, + "rewards/rejected": 0.30630093812942505, + "step": 1394 + }, + { + "epoch": 0.21573554997100328, + "grad_norm": 5.828248023986816, + "learning_rate": 3.595360824742268e-06, + "logits/chosen": 14.44190788269043, + "logits/rejected": 4.765242576599121, + "logps/chosen": -308.0531311035156, + "logps/rejected": -229.64662170410156, + "loss": 0.6615, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.32577210664749146, + "rewards/margins": 0.09610196202993393, + "rewards/rejected": 0.22967013716697693, + "step": 1395 + }, + { + "epoch": 0.21589019911076746, + "grad_norm": 3.8994383811950684, + "learning_rate": 3.597938144329897e-06, + "logits/chosen": 9.809871673583984, + "logits/rejected": 4.176308631896973, + "logps/chosen": -268.6292724609375, + "logps/rejected": -149.78109741210938, + "loss": 0.5801, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.5215524435043335, + "rewards/margins": 0.2858830690383911, + "rewards/rejected": 0.23566940426826477, + "step": 1396 + }, + { + "epoch": 0.2160448482505316, + "grad_norm": 4.680583477020264, + "learning_rate": 3.600515463917526e-06, + "logits/chosen": 9.197677612304688, + "logits/rejected": 8.19057559967041, + "logps/chosen": -301.29156494140625, + "logps/rejected": -311.92864990234375, + "loss": 0.5896, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.6327707767486572, + "rewards/margins": 0.2566205859184265, + "rewards/rejected": 0.37615013122558594, + "step": 1397 + }, + { + "epoch": 0.21619949739029576, + "grad_norm": 5.29899263381958, + "learning_rate": 3.603092783505155e-06, + "logits/chosen": 9.07162094116211, + "logits/rejected": 9.636396408081055, + "logps/chosen": -226.06842041015625, + "logps/rejected": -219.00634765625, + "loss": 0.7619, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.41803330183029175, + "rewards/margins": -0.1127861887216568, + "rewards/rejected": 0.5308195352554321, + "step": 1398 + }, + { + "epoch": 0.21635414653005994, + "grad_norm": 4.626092910766602, + "learning_rate": 3.6056701030927837e-06, + "logits/chosen": 2.620670795440674, + "logits/rejected": 2.706982135772705, + "logps/chosen": -153.52186584472656, + "logps/rejected": -179.35394287109375, + "loss": 0.8015, + "rewards/accuracies": 0.25, + "rewards/chosen": 0.2736055254936218, + "rewards/margins": -0.18276971578598022, + "rewards/rejected": 0.45637527108192444, + "step": 1399 + }, + { + "epoch": 0.2165087956698241, + "grad_norm": 6.05400276184082, + "learning_rate": 3.6082474226804126e-06, + "logits/chosen": 8.269233703613281, + "logits/rejected": 4.95513916015625, + "logps/chosen": -277.6695251464844, + "logps/rejected": -196.7139892578125, + "loss": 0.6571, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.39224058389663696, + "rewards/margins": 0.08993808180093765, + "rewards/rejected": 0.3023025095462799, + "step": 1400 + }, + { + "epoch": 0.21666344480958824, + "grad_norm": 4.07888126373291, + "learning_rate": 3.6108247422680414e-06, + "logits/chosen": 6.532243251800537, + "logits/rejected": 6.481266021728516, + "logps/chosen": -173.1339111328125, + "logps/rejected": -186.14576721191406, + "loss": 0.6752, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.24564766883850098, + "rewards/margins": 0.05224600434303284, + "rewards/rejected": 0.19340167939662933, + "step": 1401 + }, + { + "epoch": 0.21681809394935242, + "grad_norm": 4.770934104919434, + "learning_rate": 3.6134020618556703e-06, + "logits/chosen": 3.8904683589935303, + "logits/rejected": 0.7848445177078247, + "logps/chosen": -332.042724609375, + "logps/rejected": -232.2508544921875, + "loss": 0.678, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.3198614716529846, + "rewards/margins": 0.047458574175834656, + "rewards/rejected": 0.27240291237831116, + "step": 1402 + }, + { + "epoch": 0.21697274308911657, + "grad_norm": 5.7592034339904785, + "learning_rate": 3.615979381443299e-06, + "logits/chosen": 10.140690803527832, + "logits/rejected": 14.099028587341309, + "logps/chosen": -210.9591064453125, + "logps/rejected": -241.9478302001953, + "loss": 0.762, + "rewards/accuracies": 0.25, + "rewards/chosen": 0.3259265422821045, + "rewards/margins": -0.05784143507480621, + "rewards/rejected": 0.3837679922580719, + "step": 1403 + }, + { + "epoch": 0.21712739222888072, + "grad_norm": 4.688066482543945, + "learning_rate": 3.618556701030928e-06, + "logits/chosen": 12.091397285461426, + "logits/rejected": 11.99979305267334, + "logps/chosen": -185.437744140625, + "logps/rejected": -193.048095703125, + "loss": 0.7284, + "rewards/accuracies": 0.25, + "rewards/chosen": 0.30165940523147583, + "rewards/margins": -0.04808884486556053, + "rewards/rejected": 0.3497482240200043, + "step": 1404 + }, + { + "epoch": 0.21728204136864487, + "grad_norm": 4.5660858154296875, + "learning_rate": 3.621134020618557e-06, + "logits/chosen": 7.162897109985352, + "logits/rejected": 5.639487266540527, + "logps/chosen": -178.85079956054688, + "logps/rejected": -166.9153289794922, + "loss": 0.6449, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.31257164478302, + "rewards/margins": 0.1051870733499527, + "rewards/rejected": 0.20738458633422852, + "step": 1405 + }, + { + "epoch": 0.21743669050840905, + "grad_norm": 3.9991378784179688, + "learning_rate": 3.623711340206186e-06, + "logits/chosen": 8.645508766174316, + "logits/rejected": 4.361780643463135, + "logps/chosen": -186.33148193359375, + "logps/rejected": -149.33509826660156, + "loss": 0.58, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.4362901449203491, + "rewards/margins": 0.26957952976226807, + "rewards/rejected": 0.16671063005924225, + "step": 1406 + }, + { + "epoch": 0.2175913396481732, + "grad_norm": 4.87369966506958, + "learning_rate": 3.6262886597938147e-06, + "logits/chosen": 13.010942459106445, + "logits/rejected": 9.216177940368652, + "logps/chosen": -339.8858642578125, + "logps/rejected": -251.97030639648438, + "loss": 0.5785, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.4198285937309265, + "rewards/margins": 0.2869997024536133, + "rewards/rejected": 0.13282892107963562, + "step": 1407 + }, + { + "epoch": 0.21774598878793736, + "grad_norm": 5.114838123321533, + "learning_rate": 3.6288659793814435e-06, + "logits/chosen": 15.066329002380371, + "logits/rejected": 13.697265625, + "logps/chosen": -224.92408752441406, + "logps/rejected": -230.80490112304688, + "loss": 0.6566, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.362186074256897, + "rewards/margins": 0.07888708263635635, + "rewards/rejected": 0.28329896926879883, + "step": 1408 + }, + { + "epoch": 0.21790063792770153, + "grad_norm": 7.391239643096924, + "learning_rate": 3.6314432989690724e-06, + "logits/chosen": 8.528319358825684, + "logits/rejected": 8.180903434753418, + "logps/chosen": -283.19189453125, + "logps/rejected": -282.02960205078125, + "loss": 0.5061, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.4431118369102478, + "rewards/margins": 0.46919894218444824, + "rewards/rejected": -0.026087090373039246, + "step": 1409 + }, + { + "epoch": 0.2180552870674657, + "grad_norm": 5.6612043380737305, + "learning_rate": 3.6340206185567013e-06, + "logits/chosen": 13.293410301208496, + "logits/rejected": 9.366764068603516, + "logps/chosen": -275.1622314453125, + "logps/rejected": -240.09393310546875, + "loss": 0.6125, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.3947445750236511, + "rewards/margins": 0.2122238278388977, + "rewards/rejected": 0.1825207769870758, + "step": 1410 + }, + { + "epoch": 0.21820993620722984, + "grad_norm": 8.534502029418945, + "learning_rate": 3.63659793814433e-06, + "logits/chosen": 15.37127685546875, + "logits/rejected": 17.36404037475586, + "logps/chosen": -227.03945922851562, + "logps/rejected": -260.7655334472656, + "loss": 0.8373, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.35013309121131897, + "rewards/margins": -0.20924700796604156, + "rewards/rejected": 0.5593801140785217, + "step": 1411 + }, + { + "epoch": 0.21836458534699402, + "grad_norm": 5.936330795288086, + "learning_rate": 3.639175257731959e-06, + "logits/chosen": 6.021060943603516, + "logits/rejected": 7.963349342346191, + "logps/chosen": -233.08522033691406, + "logps/rejected": -316.3594970703125, + "loss": 0.5922, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.41357719898223877, + "rewards/margins": 0.2534785270690918, + "rewards/rejected": 0.16009865701198578, + "step": 1412 + }, + { + "epoch": 0.21851923448675817, + "grad_norm": 7.7762017250061035, + "learning_rate": 3.641752577319588e-06, + "logits/chosen": 6.270167350769043, + "logits/rejected": 11.790924072265625, + "logps/chosen": -188.77398681640625, + "logps/rejected": -274.53271484375, + "loss": 0.7211, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.4920801520347595, + "rewards/margins": 0.054075688123703, + "rewards/rejected": 0.4380044639110565, + "step": 1413 + }, + { + "epoch": 0.21867388362652232, + "grad_norm": 4.668298721313477, + "learning_rate": 3.6443298969072168e-06, + "logits/chosen": 11.339103698730469, + "logits/rejected": 3.700221300125122, + "logps/chosen": -292.1308288574219, + "logps/rejected": -210.3017578125, + "loss": 0.6201, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.49662187695503235, + "rewards/margins": 0.21353943645954132, + "rewards/rejected": 0.28308239579200745, + "step": 1414 + }, + { + "epoch": 0.2188285327662865, + "grad_norm": 5.152096271514893, + "learning_rate": 3.6469072164948456e-06, + "logits/chosen": 8.295866966247559, + "logits/rejected": 9.64769172668457, + "logps/chosen": -204.8546600341797, + "logps/rejected": -184.64170837402344, + "loss": 0.7069, + "rewards/accuracies": 0.125, + "rewards/chosen": 0.1396666020154953, + "rewards/margins": -0.005854126065969467, + "rewards/rejected": 0.14552073180675507, + "step": 1415 + }, + { + "epoch": 0.21898318190605065, + "grad_norm": 6.844688415527344, + "learning_rate": 3.6494845360824745e-06, + "logits/chosen": 15.606342315673828, + "logits/rejected": 12.615938186645508, + "logps/chosen": -369.7417297363281, + "logps/rejected": -294.941650390625, + "loss": 0.7012, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.2298278510570526, + "rewards/margins": 0.002103324979543686, + "rewards/rejected": 0.22772449254989624, + "step": 1416 + }, + { + "epoch": 0.2191378310458148, + "grad_norm": 8.374110221862793, + "learning_rate": 3.6520618556701034e-06, + "logits/chosen": 3.5735723972320557, + "logits/rejected": 5.957098007202148, + "logps/chosen": -299.9869079589844, + "logps/rejected": -386.76800537109375, + "loss": 0.7991, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.35375627875328064, + "rewards/margins": -0.1731717586517334, + "rewards/rejected": 0.5269280672073364, + "step": 1417 + }, + { + "epoch": 0.21929248018557898, + "grad_norm": 6.793594837188721, + "learning_rate": 3.654639175257732e-06, + "logits/chosen": 3.817567825317383, + "logits/rejected": 11.726768493652344, + "logps/chosen": -182.70130920410156, + "logps/rejected": -328.7852478027344, + "loss": 0.7583, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.4035118818283081, + "rewards/margins": -0.04737842082977295, + "rewards/rejected": 0.45089030265808105, + "step": 1418 + }, + { + "epoch": 0.21944712932534313, + "grad_norm": 5.400929927825928, + "learning_rate": 3.6572164948453607e-06, + "logits/chosen": 10.78718376159668, + "logits/rejected": 11.716590881347656, + "logps/chosen": -312.64202880859375, + "logps/rejected": -300.51251220703125, + "loss": 0.5112, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.638419508934021, + "rewards/margins": 0.43650394678115845, + "rewards/rejected": 0.20191554725170135, + "step": 1419 + }, + { + "epoch": 0.21960177846510728, + "grad_norm": 4.759858131408691, + "learning_rate": 3.6597938144329896e-06, + "logits/chosen": 16.28638458251953, + "logits/rejected": 9.044527053833008, + "logps/chosen": -236.13827514648438, + "logps/rejected": -170.5867462158203, + "loss": 0.6857, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.336395263671875, + "rewards/margins": 0.03040466457605362, + "rewards/rejected": 0.305990606546402, + "step": 1420 + }, + { + "epoch": 0.21975642760487143, + "grad_norm": 5.24135684967041, + "learning_rate": 3.6623711340206185e-06, + "logits/chosen": 13.941750526428223, + "logits/rejected": 5.963501930236816, + "logps/chosen": -323.9901123046875, + "logps/rejected": -213.09153747558594, + "loss": 0.645, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.29366081953048706, + "rewards/margins": 0.11821715533733368, + "rewards/rejected": 0.17544367909431458, + "step": 1421 + }, + { + "epoch": 0.21991107674463561, + "grad_norm": 5.48724365234375, + "learning_rate": 3.6649484536082473e-06, + "logits/chosen": 11.891497611999512, + "logits/rejected": 7.095841884613037, + "logps/chosen": -316.9349060058594, + "logps/rejected": -240.37310791015625, + "loss": 0.6301, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.4686710834503174, + "rewards/margins": 0.16829286515712738, + "rewards/rejected": 0.3003782331943512, + "step": 1422 + }, + { + "epoch": 0.22006572588439977, + "grad_norm": 4.1777849197387695, + "learning_rate": 3.667525773195876e-06, + "logits/chosen": 10.771716117858887, + "logits/rejected": 11.025160789489746, + "logps/chosen": -175.10061645507812, + "logps/rejected": -187.09262084960938, + "loss": 0.6045, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.12658023834228516, + "rewards/margins": 0.2283097505569458, + "rewards/rejected": -0.10172948986291885, + "step": 1423 + }, + { + "epoch": 0.22022037502416392, + "grad_norm": 5.127035140991211, + "learning_rate": 3.670103092783505e-06, + "logits/chosen": 11.211700439453125, + "logits/rejected": 15.031888961791992, + "logps/chosen": -225.87051391601562, + "logps/rejected": -338.299560546875, + "loss": 0.6782, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.04646292328834534, + "rewards/margins": 0.07809567451477051, + "rewards/rejected": -0.03163275122642517, + "step": 1424 + }, + { + "epoch": 0.2203750241639281, + "grad_norm": 6.30739164352417, + "learning_rate": 3.6726804123711348e-06, + "logits/chosen": 14.393754005432129, + "logits/rejected": 10.955628395080566, + "logps/chosen": -375.6960754394531, + "logps/rejected": -276.3924865722656, + "loss": 0.6804, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.20841285586357117, + "rewards/margins": 0.06851330399513245, + "rewards/rejected": 0.13989953696727753, + "step": 1425 + }, + { + "epoch": 0.22052967330369225, + "grad_norm": 5.809107303619385, + "learning_rate": 3.6752577319587637e-06, + "logits/chosen": 12.49923038482666, + "logits/rejected": 11.387870788574219, + "logps/chosen": -314.86712646484375, + "logps/rejected": -211.35494995117188, + "loss": 0.7159, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.2514320909976959, + "rewards/margins": -0.03221062570810318, + "rewards/rejected": 0.2836427390575409, + "step": 1426 + }, + { + "epoch": 0.2206843224434564, + "grad_norm": 7.781691551208496, + "learning_rate": 3.677835051546392e-06, + "logits/chosen": 4.049658298492432, + "logits/rejected": 2.064927816390991, + "logps/chosen": -278.684326171875, + "logps/rejected": -251.45425415039062, + "loss": 0.6375, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.15351057052612305, + "rewards/margins": 0.21297302842140198, + "rewards/rejected": -0.059462450444698334, + "step": 1427 + }, + { + "epoch": 0.22083897158322058, + "grad_norm": 5.709812641143799, + "learning_rate": 3.680412371134021e-06, + "logits/chosen": 10.702022552490234, + "logits/rejected": 12.592925071716309, + "logps/chosen": -189.70355224609375, + "logps/rejected": -218.53134155273438, + "loss": 0.7302, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.23434460163116455, + "rewards/margins": -0.034587450325489044, + "rewards/rejected": 0.268932044506073, + "step": 1428 + }, + { + "epoch": 0.22099362072298473, + "grad_norm": 5.003627300262451, + "learning_rate": 3.68298969072165e-06, + "logits/chosen": 3.042603015899658, + "logits/rejected": 4.243104457855225, + "logps/chosen": -237.20458984375, + "logps/rejected": -270.1956787109375, + "loss": 0.6838, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.23203736543655396, + "rewards/margins": 0.0267547108232975, + "rewards/rejected": 0.20528264343738556, + "step": 1429 + }, + { + "epoch": 0.22114826986274888, + "grad_norm": 5.373926639556885, + "learning_rate": 3.6855670103092787e-06, + "logits/chosen": 12.566391944885254, + "logits/rejected": 8.30225944519043, + "logps/chosen": -256.839599609375, + "logps/rejected": -220.32296752929688, + "loss": 0.6062, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.2936462163925171, + "rewards/margins": 0.1967245191335678, + "rewards/rejected": 0.09692173451185226, + "step": 1430 + }, + { + "epoch": 0.22130291900251306, + "grad_norm": 5.836172580718994, + "learning_rate": 3.6881443298969076e-06, + "logits/chosen": 12.115289688110352, + "logits/rejected": 5.989727020263672, + "logps/chosen": -417.7890625, + "logps/rejected": -242.09805297851562, + "loss": 0.6706, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.30823010206222534, + "rewards/margins": 0.05545293912291527, + "rewards/rejected": 0.2527771592140198, + "step": 1431 + }, + { + "epoch": 0.2214575681422772, + "grad_norm": 6.998680591583252, + "learning_rate": 3.6907216494845365e-06, + "logits/chosen": 10.992440223693848, + "logits/rejected": 7.924320697784424, + "logps/chosen": -262.57159423828125, + "logps/rejected": -178.3026885986328, + "loss": 0.6565, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.2293817102909088, + "rewards/margins": 0.08067440986633301, + "rewards/rejected": 0.1487073004245758, + "step": 1432 + }, + { + "epoch": 0.22161221728204136, + "grad_norm": 4.474908351898193, + "learning_rate": 3.6932989690721653e-06, + "logits/chosen": 8.56814956665039, + "logits/rejected": 6.0948615074157715, + "logps/chosen": -236.75711059570312, + "logps/rejected": -175.08424377441406, + "loss": 0.7114, + "rewards/accuracies": 0.125, + "rewards/chosen": 0.3214804530143738, + "rewards/margins": 0.00786089338362217, + "rewards/rejected": 0.31361955404281616, + "step": 1433 + }, + { + "epoch": 0.22176686642180554, + "grad_norm": 4.403102874755859, + "learning_rate": 3.695876288659794e-06, + "logits/chosen": 3.6690077781677246, + "logits/rejected": 3.438993453979492, + "logps/chosen": -182.67446899414062, + "logps/rejected": -156.81800842285156, + "loss": 0.7076, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.13388250768184662, + "rewards/margins": 0.002867031842470169, + "rewards/rejected": 0.13101546466350555, + "step": 1434 + }, + { + "epoch": 0.2219215155615697, + "grad_norm": 4.940154552459717, + "learning_rate": 3.698453608247423e-06, + "logits/chosen": 10.204668998718262, + "logits/rejected": 8.290806770324707, + "logps/chosen": -271.18048095703125, + "logps/rejected": -234.708740234375, + "loss": 0.5917, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.47779157757759094, + "rewards/margins": 0.29514938592910767, + "rewards/rejected": 0.18264217674732208, + "step": 1435 + }, + { + "epoch": 0.22207616470133384, + "grad_norm": 8.411388397216797, + "learning_rate": 3.701030927835052e-06, + "logits/chosen": 6.597256660461426, + "logits/rejected": 8.360712051391602, + "logps/chosen": -239.87237548828125, + "logps/rejected": -249.18927001953125, + "loss": 0.6604, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.31735408306121826, + "rewards/margins": 0.07976202666759491, + "rewards/rejected": 0.23759204149246216, + "step": 1436 + }, + { + "epoch": 0.222230813841098, + "grad_norm": 5.877204895019531, + "learning_rate": 3.703608247422681e-06, + "logits/chosen": 7.842079162597656, + "logits/rejected": 7.477012634277344, + "logps/chosen": -381.36895751953125, + "logps/rejected": -303.085205078125, + "loss": 0.6412, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.5196260809898376, + "rewards/margins": 0.12545932829380035, + "rewards/rejected": 0.3941667675971985, + "step": 1437 + }, + { + "epoch": 0.22238546298086218, + "grad_norm": 5.436801910400391, + "learning_rate": 3.7061855670103097e-06, + "logits/chosen": 9.974928855895996, + "logits/rejected": 7.617944717407227, + "logps/chosen": -273.0373229980469, + "logps/rejected": -241.4461212158203, + "loss": 0.5863, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.3655317425727844, + "rewards/margins": 0.3113168478012085, + "rewards/rejected": 0.05421486496925354, + "step": 1438 + }, + { + "epoch": 0.22254011212062633, + "grad_norm": 4.737017631530762, + "learning_rate": 3.7087628865979386e-06, + "logits/chosen": 7.763040065765381, + "logits/rejected": 14.306317329406738, + "logps/chosen": -206.13400268554688, + "logps/rejected": -321.5020446777344, + "loss": 0.6455, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.11589355766773224, + "rewards/margins": 0.14583788812160492, + "rewards/rejected": -0.029944326728582382, + "step": 1439 + }, + { + "epoch": 0.22269476126039048, + "grad_norm": 8.504049301147461, + "learning_rate": 3.7113402061855674e-06, + "logits/chosen": 11.967203140258789, + "logits/rejected": 9.252222061157227, + "logps/chosen": -363.0977783203125, + "logps/rejected": -408.66375732421875, + "loss": 0.732, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.22312985360622406, + "rewards/margins": -0.062021404504776, + "rewards/rejected": 0.28515127301216125, + "step": 1440 + }, + { + "epoch": 0.22284941040015466, + "grad_norm": 3.8094871044158936, + "learning_rate": 3.7139175257731963e-06, + "logits/chosen": 4.198802947998047, + "logits/rejected": 8.757087707519531, + "logps/chosen": -148.41961669921875, + "logps/rejected": -214.957275390625, + "loss": 0.7111, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.24298730492591858, + "rewards/margins": 0.02833227440714836, + "rewards/rejected": 0.21465502679347992, + "step": 1441 + }, + { + "epoch": 0.2230040595399188, + "grad_norm": 6.074606418609619, + "learning_rate": 3.716494845360825e-06, + "logits/chosen": 13.379493713378906, + "logits/rejected": 13.085268020629883, + "logps/chosen": -331.3122863769531, + "logps/rejected": -305.8445739746094, + "loss": 0.7226, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.20761454105377197, + "rewards/margins": -0.04384039342403412, + "rewards/rejected": 0.2514549195766449, + "step": 1442 + }, + { + "epoch": 0.22315870867968296, + "grad_norm": 5.571346282958984, + "learning_rate": 3.719072164948454e-06, + "logits/chosen": 12.065376281738281, + "logits/rejected": 8.790617942810059, + "logps/chosen": -226.50143432617188, + "logps/rejected": -194.60081481933594, + "loss": 0.7055, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.06936287879943848, + "rewards/margins": 0.01522497832775116, + "rewards/rejected": 0.054137907922267914, + "step": 1443 + }, + { + "epoch": 0.22331335781944714, + "grad_norm": 6.028081893920898, + "learning_rate": 3.721649484536083e-06, + "logits/chosen": 6.155329704284668, + "logits/rejected": 10.264471054077148, + "logps/chosen": -200.39776611328125, + "logps/rejected": -286.0333251953125, + "loss": 0.7078, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.3994177579879761, + "rewards/margins": -0.009107328951358795, + "rewards/rejected": 0.4085250496864319, + "step": 1444 + }, + { + "epoch": 0.2234680069592113, + "grad_norm": 4.44863224029541, + "learning_rate": 3.724226804123712e-06, + "logits/chosen": 10.437271118164062, + "logits/rejected": 6.324764728546143, + "logps/chosen": -210.79452514648438, + "logps/rejected": -231.02670288085938, + "loss": 0.6873, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.20896272361278534, + "rewards/margins": 0.026787996292114258, + "rewards/rejected": 0.18217472732067108, + "step": 1445 + }, + { + "epoch": 0.22362265609897544, + "grad_norm": 5.622677326202393, + "learning_rate": 3.7268041237113407e-06, + "logits/chosen": 9.252970695495605, + "logits/rejected": 13.275067329406738, + "logps/chosen": -173.14688110351562, + "logps/rejected": -266.2110595703125, + "loss": 0.6823, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.22605973482131958, + "rewards/margins": 0.06749209016561508, + "rewards/rejected": 0.1585676372051239, + "step": 1446 + }, + { + "epoch": 0.22377730523873962, + "grad_norm": 5.240253925323486, + "learning_rate": 3.7293814432989695e-06, + "logits/chosen": 11.51059627532959, + "logits/rejected": 3.2109203338623047, + "logps/chosen": -386.291259765625, + "logps/rejected": -248.99964904785156, + "loss": 0.6739, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.34667378664016724, + "rewards/margins": 0.09923753142356873, + "rewards/rejected": 0.24743622541427612, + "step": 1447 + }, + { + "epoch": 0.22393195437850377, + "grad_norm": 5.03073263168335, + "learning_rate": 3.7319587628865984e-06, + "logits/chosen": 13.226451873779297, + "logits/rejected": 7.708680152893066, + "logps/chosen": -265.1357421875, + "logps/rejected": -203.337646484375, + "loss": 0.6255, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.2357672154903412, + "rewards/margins": 0.16455897688865662, + "rewards/rejected": 0.07120823860168457, + "step": 1448 + }, + { + "epoch": 0.22408660351826792, + "grad_norm": 5.565544128417969, + "learning_rate": 3.7345360824742273e-06, + "logits/chosen": 9.389707565307617, + "logits/rejected": 6.84724760055542, + "logps/chosen": -293.00640869140625, + "logps/rejected": -324.0738220214844, + "loss": 0.5909, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.5901398658752441, + "rewards/margins": 0.2525695264339447, + "rewards/rejected": 0.33757033944129944, + "step": 1449 + }, + { + "epoch": 0.2242412526580321, + "grad_norm": 4.719089031219482, + "learning_rate": 3.737113402061856e-06, + "logits/chosen": 9.208586692810059, + "logits/rejected": 5.907774925231934, + "logps/chosen": -249.88706970214844, + "logps/rejected": -190.32154846191406, + "loss": 0.7511, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.17082425951957703, + "rewards/margins": -0.06860889494419098, + "rewards/rejected": 0.239433154463768, + "step": 1450 + }, + { + "epoch": 0.22439590179779625, + "grad_norm": 6.521057605743408, + "learning_rate": 3.739690721649485e-06, + "logits/chosen": 6.830074310302734, + "logits/rejected": 11.48775863647461, + "logps/chosen": -336.54241943359375, + "logps/rejected": -379.4432678222656, + "loss": 0.739, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.07159577310085297, + "rewards/margins": -0.02558770775794983, + "rewards/rejected": 0.0971834659576416, + "step": 1451 + }, + { + "epoch": 0.2245505509375604, + "grad_norm": 4.088323593139648, + "learning_rate": 3.742268041237114e-06, + "logits/chosen": 4.787644386291504, + "logits/rejected": 7.796564102172852, + "logps/chosen": -143.172119140625, + "logps/rejected": -124.1253662109375, + "loss": 0.7437, + "rewards/accuracies": 0.25, + "rewards/chosen": 0.3385204076766968, + "rewards/margins": -0.07669384777545929, + "rewards/rejected": 0.41521427035331726, + "step": 1452 + }, + { + "epoch": 0.22470520007732456, + "grad_norm": 10.13058853149414, + "learning_rate": 3.7448453608247428e-06, + "logits/chosen": 7.847586154937744, + "logits/rejected": 9.587234497070312, + "logps/chosen": -302.4804382324219, + "logps/rejected": -334.10821533203125, + "loss": 0.6763, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.20742465555667877, + "rewards/margins": 0.05256490036845207, + "rewards/rejected": 0.1548597514629364, + "step": 1453 + }, + { + "epoch": 0.22485984921708874, + "grad_norm": 5.530125141143799, + "learning_rate": 3.7474226804123716e-06, + "logits/chosen": 10.282870292663574, + "logits/rejected": 9.538536071777344, + "logps/chosen": -282.2089538574219, + "logps/rejected": -224.3106231689453, + "loss": 0.6578, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.19963115453720093, + "rewards/margins": 0.1490197479724884, + "rewards/rejected": 0.050611402839422226, + "step": 1454 + }, + { + "epoch": 0.2250144983568529, + "grad_norm": 7.162295341491699, + "learning_rate": 3.7500000000000005e-06, + "logits/chosen": 8.318441390991211, + "logits/rejected": 4.61562967300415, + "logps/chosen": -332.745361328125, + "logps/rejected": -363.1005859375, + "loss": 0.6525, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.3675723969936371, + "rewards/margins": 0.15564587712287903, + "rewards/rejected": 0.21192653477191925, + "step": 1455 + }, + { + "epoch": 0.22516914749661704, + "grad_norm": 6.704213619232178, + "learning_rate": 3.752577319587629e-06, + "logits/chosen": 8.838983535766602, + "logits/rejected": 10.01030158996582, + "logps/chosen": -271.49639892578125, + "logps/rejected": -283.4617614746094, + "loss": 0.7733, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.17520073056221008, + "rewards/margins": -0.12454129755496979, + "rewards/rejected": 0.29974204301834106, + "step": 1456 + }, + { + "epoch": 0.22532379663638122, + "grad_norm": 5.502676010131836, + "learning_rate": 3.755154639175258e-06, + "logits/chosen": 10.801664352416992, + "logits/rejected": 7.115739822387695, + "logps/chosen": -193.4436798095703, + "logps/rejected": -200.88377380371094, + "loss": 0.6639, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.15627413988113403, + "rewards/margins": 0.08754958212375641, + "rewards/rejected": 0.06872454285621643, + "step": 1457 + }, + { + "epoch": 0.22547844577614537, + "grad_norm": 4.756232261657715, + "learning_rate": 3.7577319587628867e-06, + "logits/chosen": 5.954425811767578, + "logits/rejected": 8.172614097595215, + "logps/chosen": -216.77645874023438, + "logps/rejected": -235.64361572265625, + "loss": 0.6287, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.28982970118522644, + "rewards/margins": 0.17168134450912476, + "rewards/rejected": 0.11814837902784348, + "step": 1458 + }, + { + "epoch": 0.22563309491590952, + "grad_norm": 6.785560607910156, + "learning_rate": 3.7603092783505156e-06, + "logits/chosen": 7.969601631164551, + "logits/rejected": 2.9462082386016846, + "logps/chosen": -277.36767578125, + "logps/rejected": -198.71920776367188, + "loss": 0.6588, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.33390480279922485, + "rewards/margins": 0.08317037671804428, + "rewards/rejected": 0.2507344186306, + "step": 1459 + }, + { + "epoch": 0.2257877440556737, + "grad_norm": 6.045064926147461, + "learning_rate": 3.7628865979381445e-06, + "logits/chosen": 7.3320207595825195, + "logits/rejected": 14.427106857299805, + "logps/chosen": -232.15289306640625, + "logps/rejected": -355.29205322265625, + "loss": 0.6108, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.25549107789993286, + "rewards/margins": 0.2516169548034668, + "rewards/rejected": 0.0038741156458854675, + "step": 1460 + }, + { + "epoch": 0.22594239319543785, + "grad_norm": 6.161303520202637, + "learning_rate": 3.7654639175257733e-06, + "logits/chosen": 10.21287727355957, + "logits/rejected": 9.032976150512695, + "logps/chosen": -242.93142700195312, + "logps/rejected": -217.16339111328125, + "loss": 0.75, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.0685734823346138, + "rewards/margins": -0.027738407254219055, + "rewards/rejected": 0.09631189703941345, + "step": 1461 + }, + { + "epoch": 0.226097042335202, + "grad_norm": 4.703522205352783, + "learning_rate": 3.768041237113402e-06, + "logits/chosen": 9.26286792755127, + "logits/rejected": 7.608859062194824, + "logps/chosen": -172.74000549316406, + "logps/rejected": -154.00189208984375, + "loss": 0.7552, + "rewards/accuracies": 0.25, + "rewards/chosen": 0.17624440789222717, + "rewards/margins": -0.10906385630369186, + "rewards/rejected": 0.28530827164649963, + "step": 1462 + }, + { + "epoch": 0.22625169147496618, + "grad_norm": 6.46061372756958, + "learning_rate": 3.770618556701031e-06, + "logits/chosen": 7.008559703826904, + "logits/rejected": 6.716447830200195, + "logps/chosen": -260.0907897949219, + "logps/rejected": -231.19032287597656, + "loss": 0.5768, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.3687230944633484, + "rewards/margins": 0.28190720081329346, + "rewards/rejected": 0.08681588619947433, + "step": 1463 + }, + { + "epoch": 0.22640634061473033, + "grad_norm": 6.741631507873535, + "learning_rate": 3.77319587628866e-06, + "logits/chosen": 12.439546585083008, + "logits/rejected": 11.839517593383789, + "logps/chosen": -370.86474609375, + "logps/rejected": -357.56500244140625, + "loss": 0.6303, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.23264950513839722, + "rewards/margins": 0.14716365933418274, + "rewards/rejected": 0.08548584580421448, + "step": 1464 + }, + { + "epoch": 0.22656098975449449, + "grad_norm": 5.3718695640563965, + "learning_rate": 3.775773195876289e-06, + "logits/chosen": 8.539482116699219, + "logits/rejected": 7.709983825683594, + "logps/chosen": -218.16976928710938, + "logps/rejected": -207.93258666992188, + "loss": 0.6249, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.4782260060310364, + "rewards/margins": 0.1768878996372223, + "rewards/rejected": 0.3013381063938141, + "step": 1465 + }, + { + "epoch": 0.22671563889425866, + "grad_norm": 3.3755064010620117, + "learning_rate": 3.7783505154639177e-06, + "logits/chosen": 11.33003044128418, + "logits/rejected": 8.022276878356934, + "logps/chosen": -228.516845703125, + "logps/rejected": -172.59207153320312, + "loss": 0.5431, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.1908881813287735, + "rewards/margins": 0.37673819065093994, + "rewards/rejected": -0.18584999442100525, + "step": 1466 + }, + { + "epoch": 0.22687028803402282, + "grad_norm": 8.281699180603027, + "learning_rate": 3.7809278350515466e-06, + "logits/chosen": 12.606637954711914, + "logits/rejected": 7.0932230949401855, + "logps/chosen": -289.2562255859375, + "logps/rejected": -218.06861877441406, + "loss": 0.8437, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.2584618628025055, + "rewards/margins": -0.2069428563117981, + "rewards/rejected": -0.051519013941287994, + "step": 1467 + }, + { + "epoch": 0.22702493717378697, + "grad_norm": 4.887508392333984, + "learning_rate": 3.7835051546391754e-06, + "logits/chosen": 13.424625396728516, + "logits/rejected": 12.916078567504883, + "logps/chosen": -259.698486328125, + "logps/rejected": -217.0360107421875, + "loss": 0.6681, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.026994463056325912, + "rewards/margins": 0.06224951893091202, + "rewards/rejected": -0.035255055874586105, + "step": 1468 + }, + { + "epoch": 0.22717958631355112, + "grad_norm": 5.405309677124023, + "learning_rate": 3.7860824742268043e-06, + "logits/chosen": 8.76015853881836, + "logits/rejected": 6.77261209487915, + "logps/chosen": -233.000732421875, + "logps/rejected": -218.54171752929688, + "loss": 0.6836, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.23566150665283203, + "rewards/margins": 0.040636539459228516, + "rewards/rejected": 0.19502496719360352, + "step": 1469 + }, + { + "epoch": 0.2273342354533153, + "grad_norm": 4.913394927978516, + "learning_rate": 3.788659793814433e-06, + "logits/chosen": 6.526886463165283, + "logits/rejected": -3.7138564586639404, + "logps/chosen": -435.60552978515625, + "logps/rejected": -183.29588317871094, + "loss": 0.4968, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.3711024522781372, + "rewards/margins": 0.5004416704177856, + "rewards/rejected": -0.12933921813964844, + "step": 1470 + }, + { + "epoch": 0.22748888459307945, + "grad_norm": 6.3503570556640625, + "learning_rate": 3.791237113402062e-06, + "logits/chosen": 10.012077331542969, + "logits/rejected": 5.791125774383545, + "logps/chosen": -344.0557556152344, + "logps/rejected": -249.77151489257812, + "loss": 0.5983, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.285371869802475, + "rewards/margins": 0.2382340282201767, + "rewards/rejected": 0.04713783040642738, + "step": 1471 + }, + { + "epoch": 0.2276435337328436, + "grad_norm": 5.755252361297607, + "learning_rate": 3.793814432989691e-06, + "logits/chosen": 9.312189102172852, + "logits/rejected": 3.8381175994873047, + "logps/chosen": -344.6442565917969, + "logps/rejected": -243.95826721191406, + "loss": 0.6072, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.4146071672439575, + "rewards/margins": 0.20414811372756958, + "rewards/rejected": 0.21045905351638794, + "step": 1472 + }, + { + "epoch": 0.22779818287260778, + "grad_norm": 6.811728000640869, + "learning_rate": 3.79639175257732e-06, + "logits/chosen": 6.243160724639893, + "logits/rejected": 6.7085442543029785, + "logps/chosen": -238.72018432617188, + "logps/rejected": -319.30352783203125, + "loss": 0.8162, + "rewards/accuracies": 0.125, + "rewards/chosen": 0.13786697387695312, + "rewards/margins": -0.17182514071464539, + "rewards/rejected": 0.3096920847892761, + "step": 1473 + }, + { + "epoch": 0.22795283201237193, + "grad_norm": 5.297938346862793, + "learning_rate": 3.7989690721649487e-06, + "logits/chosen": 5.921926498413086, + "logits/rejected": 9.10495662689209, + "logps/chosen": -323.8555603027344, + "logps/rejected": -271.42218017578125, + "loss": 0.6342, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.11444978415966034, + "rewards/margins": 0.1501908302307129, + "rewards/rejected": -0.03574104607105255, + "step": 1474 + }, + { + "epoch": 0.22810748115213608, + "grad_norm": 5.245988845825195, + "learning_rate": 3.8015463917525775e-06, + "logits/chosen": 10.41042709350586, + "logits/rejected": 2.9716429710388184, + "logps/chosen": -257.4392395019531, + "logps/rejected": -169.25694274902344, + "loss": 0.7139, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.11719532310962677, + "rewards/margins": -0.026326656341552734, + "rewards/rejected": 0.1435219645500183, + "step": 1475 + }, + { + "epoch": 0.22826213029190026, + "grad_norm": 6.935062885284424, + "learning_rate": 3.8041237113402064e-06, + "logits/chosen": 6.816610336303711, + "logits/rejected": 6.755310535430908, + "logps/chosen": -374.6769714355469, + "logps/rejected": -278.3578796386719, + "loss": 0.6354, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.2937660217285156, + "rewards/margins": 0.15558524429798126, + "rewards/rejected": 0.13818077743053436, + "step": 1476 + }, + { + "epoch": 0.2284167794316644, + "grad_norm": 6.507308006286621, + "learning_rate": 3.8067010309278353e-06, + "logits/chosen": 8.174610137939453, + "logits/rejected": 6.285118103027344, + "logps/chosen": -449.8434753417969, + "logps/rejected": -493.5425109863281, + "loss": 0.6402, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.34505289793014526, + "rewards/margins": 0.13829079270362854, + "rewards/rejected": 0.20676209032535553, + "step": 1477 + }, + { + "epoch": 0.22857142857142856, + "grad_norm": 8.636043548583984, + "learning_rate": 3.809278350515464e-06, + "logits/chosen": 7.702323913574219, + "logits/rejected": 10.521150588989258, + "logps/chosen": -389.50396728515625, + "logps/rejected": -573.4085693359375, + "loss": 0.6624, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.1923900544643402, + "rewards/margins": 0.1506527066230774, + "rewards/rejected": 0.04173736646771431, + "step": 1478 + }, + { + "epoch": 0.22872607771119274, + "grad_norm": 7.029668807983398, + "learning_rate": 3.811855670103093e-06, + "logits/chosen": 9.470576286315918, + "logits/rejected": 8.039572715759277, + "logps/chosen": -410.92645263671875, + "logps/rejected": -317.221435546875, + "loss": 0.7133, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.4764593243598938, + "rewards/margins": -0.013340139761567116, + "rewards/rejected": 0.48979946970939636, + "step": 1479 + }, + { + "epoch": 0.2288807268509569, + "grad_norm": 6.377582550048828, + "learning_rate": 3.814432989690722e-06, + "logits/chosen": 11.972147941589355, + "logits/rejected": 8.803773880004883, + "logps/chosen": -282.05181884765625, + "logps/rejected": -225.65359497070312, + "loss": 0.719, + "rewards/accuracies": 0.25, + "rewards/chosen": 0.12003964930772781, + "rewards/margins": -0.035159334540367126, + "rewards/rejected": 0.15519899129867554, + "step": 1480 + }, + { + "epoch": 0.22903537599072105, + "grad_norm": 5.947975158691406, + "learning_rate": 3.81701030927835e-06, + "logits/chosen": 9.577737808227539, + "logits/rejected": 12.643302917480469, + "logps/chosen": -175.74168395996094, + "logps/rejected": -218.39828491210938, + "loss": 0.786, + "rewards/accuracies": 0.25, + "rewards/chosen": 0.07232853770256042, + "rewards/margins": -0.1332847774028778, + "rewards/rejected": 0.20561333000659943, + "step": 1481 + }, + { + "epoch": 0.22919002513048523, + "grad_norm": 7.358123302459717, + "learning_rate": 3.81958762886598e-06, + "logits/chosen": 5.825680255889893, + "logits/rejected": 9.34192180633545, + "logps/chosen": -247.2740020751953, + "logps/rejected": -237.767333984375, + "loss": 0.7652, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.18771938979625702, + "rewards/margins": -0.1045319065451622, + "rewards/rejected": 0.2922512888908386, + "step": 1482 + }, + { + "epoch": 0.22934467427024938, + "grad_norm": 4.354575157165527, + "learning_rate": 3.822164948453608e-06, + "logits/chosen": 9.432476043701172, + "logits/rejected": 7.647014617919922, + "logps/chosen": -209.530029296875, + "logps/rejected": -231.0635528564453, + "loss": 0.6061, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.1467059701681137, + "rewards/margins": 0.19666936993598938, + "rewards/rejected": -0.04996339604258537, + "step": 1483 + }, + { + "epoch": 0.22949932341001353, + "grad_norm": 6.312913417816162, + "learning_rate": 3.824742268041237e-06, + "logits/chosen": 13.776065826416016, + "logits/rejected": 6.200751781463623, + "logps/chosen": -339.8843078613281, + "logps/rejected": -319.6251220703125, + "loss": 0.6722, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.008113861083984375, + "rewards/margins": 0.09153690189123154, + "rewards/rejected": -0.08342304080724716, + "step": 1484 + }, + { + "epoch": 0.22965397254977768, + "grad_norm": 4.326000690460205, + "learning_rate": 3.827319587628866e-06, + "logits/chosen": 9.664262771606445, + "logits/rejected": 10.277108192443848, + "logps/chosen": -214.204833984375, + "logps/rejected": -218.7750244140625, + "loss": 0.6324, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.2436470091342926, + "rewards/margins": 0.14362019300460815, + "rewards/rejected": 0.10002681612968445, + "step": 1485 + }, + { + "epoch": 0.22980862168954186, + "grad_norm": 7.29660177230835, + "learning_rate": 3.829896907216495e-06, + "logits/chosen": 9.311269760131836, + "logits/rejected": 5.835186958312988, + "logps/chosen": -378.5521240234375, + "logps/rejected": -303.630126953125, + "loss": 0.6023, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.44335824251174927, + "rewards/margins": 0.24892520904541016, + "rewards/rejected": 0.19443301856517792, + "step": 1486 + }, + { + "epoch": 0.229963270829306, + "grad_norm": 5.4405341148376465, + "learning_rate": 3.832474226804124e-06, + "logits/chosen": 4.626574516296387, + "logits/rejected": 11.722896575927734, + "logps/chosen": -240.33250427246094, + "logps/rejected": -279.8541259765625, + "loss": 0.6903, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.21309161186218262, + "rewards/margins": 0.041017621755599976, + "rewards/rejected": 0.17207399010658264, + "step": 1487 + }, + { + "epoch": 0.23011791996907016, + "grad_norm": 4.598001480102539, + "learning_rate": 3.835051546391753e-06, + "logits/chosen": 6.937432765960693, + "logits/rejected": 10.293776512145996, + "logps/chosen": -172.78076171875, + "logps/rejected": -203.2093048095703, + "loss": 0.6964, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.09580355137586594, + "rewards/margins": 0.031737860292196274, + "rewards/rejected": 0.06406569480895996, + "step": 1488 + }, + { + "epoch": 0.23027256910883434, + "grad_norm": 5.912476062774658, + "learning_rate": 3.837628865979382e-06, + "logits/chosen": 15.034684181213379, + "logits/rejected": 14.747718811035156, + "logps/chosen": -296.0722961425781, + "logps/rejected": -346.833251953125, + "loss": 0.7026, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.2537423372268677, + "rewards/margins": 0.13809485733509064, + "rewards/rejected": 0.11564750969409943, + "step": 1489 + }, + { + "epoch": 0.2304272182485985, + "grad_norm": 5.391479969024658, + "learning_rate": 3.840206185567011e-06, + "logits/chosen": 12.513710021972656, + "logits/rejected": 8.448549270629883, + "logps/chosen": -347.7033996582031, + "logps/rejected": -295.8345947265625, + "loss": 0.59, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.19974061846733093, + "rewards/margins": 0.23782959580421448, + "rewards/rejected": -0.03808898478746414, + "step": 1490 + }, + { + "epoch": 0.23058186738836264, + "grad_norm": 6.188045501708984, + "learning_rate": 3.84278350515464e-06, + "logits/chosen": 13.082235336303711, + "logits/rejected": 6.8130106925964355, + "logps/chosen": -305.37542724609375, + "logps/rejected": -244.87841796875, + "loss": 0.7601, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.03320226073265076, + "rewards/margins": -0.10829858481884003, + "rewards/rejected": 0.14150084555149078, + "step": 1491 + }, + { + "epoch": 0.23073651652812682, + "grad_norm": 4.750273704528809, + "learning_rate": 3.845360824742268e-06, + "logits/chosen": 11.077557563781738, + "logits/rejected": 11.949806213378906, + "logps/chosen": -176.37054443359375, + "logps/rejected": -263.3591613769531, + "loss": 0.7083, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.2062620222568512, + "rewards/margins": 0.019134104251861572, + "rewards/rejected": 0.18712788820266724, + "step": 1492 + }, + { + "epoch": 0.23089116566789097, + "grad_norm": 5.911018371582031, + "learning_rate": 3.847938144329898e-06, + "logits/chosen": 9.231491088867188, + "logits/rejected": 6.493816375732422, + "logps/chosen": -251.95294189453125, + "logps/rejected": -196.33901977539062, + "loss": 0.6706, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.05813436210155487, + "rewards/margins": 0.06341734528541565, + "rewards/rejected": -0.12155171483755112, + "step": 1493 + }, + { + "epoch": 0.23104581480765513, + "grad_norm": 7.735345363616943, + "learning_rate": 3.850515463917526e-06, + "logits/chosen": 10.80581283569336, + "logits/rejected": 6.760698318481445, + "logps/chosen": -273.9007263183594, + "logps/rejected": -231.0111083984375, + "loss": 0.6673, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.3082673251628876, + "rewards/margins": 0.09262268245220184, + "rewards/rejected": 0.21564464271068573, + "step": 1494 + }, + { + "epoch": 0.2312004639474193, + "grad_norm": 7.347454071044922, + "learning_rate": 3.853092783505155e-06, + "logits/chosen": 11.645853042602539, + "logits/rejected": 4.602846145629883, + "logps/chosen": -281.2930603027344, + "logps/rejected": -199.41683959960938, + "loss": 0.6123, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.3027874231338501, + "rewards/margins": 0.22355104982852936, + "rewards/rejected": 0.07923637330532074, + "step": 1495 + }, + { + "epoch": 0.23135511308718346, + "grad_norm": 6.737334251403809, + "learning_rate": 3.855670103092784e-06, + "logits/chosen": 7.279929161071777, + "logits/rejected": 9.469220161437988, + "logps/chosen": -248.23397827148438, + "logps/rejected": -327.2369079589844, + "loss": 0.7361, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.16058771312236786, + "rewards/margins": -0.058486759662628174, + "rewards/rejected": -0.10210093855857849, + "step": 1496 + }, + { + "epoch": 0.2315097622269476, + "grad_norm": 5.270401954650879, + "learning_rate": 3.858247422680413e-06, + "logits/chosen": 10.151684761047363, + "logits/rejected": 9.549076080322266, + "logps/chosen": -310.0138244628906, + "logps/rejected": -265.59857177734375, + "loss": 0.6461, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.3578866422176361, + "rewards/margins": 0.1330842673778534, + "rewards/rejected": 0.2248024046421051, + "step": 1497 + }, + { + "epoch": 0.23166441136671176, + "grad_norm": 8.023092269897461, + "learning_rate": 3.860824742268042e-06, + "logits/chosen": 12.492981910705566, + "logits/rejected": 5.868310928344727, + "logps/chosen": -334.49578857421875, + "logps/rejected": -284.79058837890625, + "loss": 0.7028, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.5174224376678467, + "rewards/margins": -0.006380315870046616, + "rewards/rejected": 0.5238027572631836, + "step": 1498 + }, + { + "epoch": 0.23181906050647594, + "grad_norm": 6.564235687255859, + "learning_rate": 3.863402061855671e-06, + "logits/chosen": 9.212175369262695, + "logits/rejected": 8.734397888183594, + "logps/chosen": -211.65870666503906, + "logps/rejected": -230.56903076171875, + "loss": 0.802, + "rewards/accuracies": 0.25, + "rewards/chosen": 0.11530418694019318, + "rewards/margins": -0.17269396781921387, + "rewards/rejected": 0.28799813985824585, + "step": 1499 + }, + { + "epoch": 0.2319737096462401, + "grad_norm": 6.599989891052246, + "learning_rate": 3.865979381443299e-06, + "logits/chosen": 9.26891803741455, + "logits/rejected": 9.027416229248047, + "logps/chosen": -320.5238037109375, + "logps/rejected": -285.834716796875, + "loss": 0.7892, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.17476335167884827, + "rewards/margins": -0.09738259762525558, + "rewards/rejected": 0.27214595675468445, + "step": 1500 + }, + { + "epoch": 0.23212835878600424, + "grad_norm": 6.970137596130371, + "learning_rate": 3.868556701030929e-06, + "logits/chosen": 3.8541297912597656, + "logits/rejected": 11.664453506469727, + "logps/chosen": -336.1994323730469, + "logps/rejected": -305.06134033203125, + "loss": 0.7175, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.2546382248401642, + "rewards/margins": 0.001676023006439209, + "rewards/rejected": 0.252962201833725, + "step": 1501 + }, + { + "epoch": 0.23228300792576842, + "grad_norm": 7.253159523010254, + "learning_rate": 3.871134020618557e-06, + "logits/chosen": 9.755655288696289, + "logits/rejected": 10.065564155578613, + "logps/chosen": -256.7037048339844, + "logps/rejected": -261.17730712890625, + "loss": 0.8747, + "rewards/accuracies": 0.25, + "rewards/chosen": 0.03782206028699875, + "rewards/margins": -0.2745472192764282, + "rewards/rejected": 0.31236928701400757, + "step": 1502 + }, + { + "epoch": 0.23243765706553257, + "grad_norm": 5.231308460235596, + "learning_rate": 3.873711340206186e-06, + "logits/chosen": 12.124849319458008, + "logits/rejected": 8.130109786987305, + "logps/chosen": -254.45901489257812, + "logps/rejected": -222.69906616210938, + "loss": 0.5712, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.4752688705921173, + "rewards/margins": 0.31232118606567383, + "rewards/rejected": 0.1629476547241211, + "step": 1503 + }, + { + "epoch": 0.23259230620529672, + "grad_norm": 9.625741958618164, + "learning_rate": 3.876288659793815e-06, + "logits/chosen": 9.822915077209473, + "logits/rejected": 10.296433448791504, + "logps/chosen": -315.6681823730469, + "logps/rejected": -281.40643310546875, + "loss": 0.8795, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.02320127747952938, + "rewards/margins": -0.3095248341560364, + "rewards/rejected": 0.28632354736328125, + "step": 1504 + }, + { + "epoch": 0.2327469553450609, + "grad_norm": 5.206031322479248, + "learning_rate": 3.878865979381444e-06, + "logits/chosen": 11.777912139892578, + "logits/rejected": 0.42134547233581543, + "logps/chosen": -331.1743469238281, + "logps/rejected": -156.06776428222656, + "loss": 0.5898, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.2575674057006836, + "rewards/margins": 0.2603178024291992, + "rewards/rejected": -0.002750396728515625, + "step": 1505 + }, + { + "epoch": 0.23290160448482505, + "grad_norm": 6.149837017059326, + "learning_rate": 3.8814432989690726e-06, + "logits/chosen": 4.658173561096191, + "logits/rejected": 8.412554740905762, + "logps/chosen": -245.06982421875, + "logps/rejected": -283.7773132324219, + "loss": 0.7421, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.140394926071167, + "rewards/margins": -0.061921752989292145, + "rewards/rejected": 0.20231667160987854, + "step": 1506 + }, + { + "epoch": 0.2330562536245892, + "grad_norm": 4.144974708557129, + "learning_rate": 3.884020618556701e-06, + "logits/chosen": 12.493021011352539, + "logits/rejected": 9.275261878967285, + "logps/chosen": -278.2568664550781, + "logps/rejected": -251.72659301757812, + "loss": 0.569, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.47479915618896484, + "rewards/margins": 0.30153805017471313, + "rewards/rejected": 0.1732611209154129, + "step": 1507 + }, + { + "epoch": 0.23321090276435338, + "grad_norm": 4.9788594245910645, + "learning_rate": 3.88659793814433e-06, + "logits/chosen": 12.702082633972168, + "logits/rejected": 10.610586166381836, + "logps/chosen": -335.4980163574219, + "logps/rejected": -266.1140441894531, + "loss": 0.5811, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.38709479570388794, + "rewards/margins": 0.2924399673938751, + "rewards/rejected": 0.0946548581123352, + "step": 1508 + }, + { + "epoch": 0.23336555190411754, + "grad_norm": 7.7753987312316895, + "learning_rate": 3.889175257731959e-06, + "logits/chosen": 9.48106861114502, + "logits/rejected": 5.862563133239746, + "logps/chosen": -365.45733642578125, + "logps/rejected": -320.953125, + "loss": 0.8501, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.253966361284256, + "rewards/margins": -0.19125162065029144, + "rewards/rejected": 0.4452179968357086, + "step": 1509 + }, + { + "epoch": 0.2335202010438817, + "grad_norm": 4.381181240081787, + "learning_rate": 3.891752577319588e-06, + "logits/chosen": 8.655179023742676, + "logits/rejected": 0.4067535400390625, + "logps/chosen": -216.61383056640625, + "logps/rejected": -192.57862854003906, + "loss": 0.5729, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.5711411833763123, + "rewards/margins": 0.3399847447872162, + "rewards/rejected": 0.23115646839141846, + "step": 1510 + }, + { + "epoch": 0.23367485018364587, + "grad_norm": 12.981870651245117, + "learning_rate": 3.8943298969072165e-06, + "logits/chosen": 8.160552978515625, + "logits/rejected": 8.916869163513184, + "logps/chosen": -225.50807189941406, + "logps/rejected": -312.6629943847656, + "loss": 0.6959, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.30918991565704346, + "rewards/margins": 0.038649603724479675, + "rewards/rejected": 0.2705402970314026, + "step": 1511 + }, + { + "epoch": 0.23382949932341002, + "grad_norm": 6.409939765930176, + "learning_rate": 3.896907216494846e-06, + "logits/chosen": 8.442054748535156, + "logits/rejected": 6.3834228515625, + "logps/chosen": -229.24179077148438, + "logps/rejected": -286.79840087890625, + "loss": 0.7896, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.1021028533577919, + "rewards/margins": -0.11848890781402588, + "rewards/rejected": 0.01638607680797577, + "step": 1512 + }, + { + "epoch": 0.23398414846317417, + "grad_norm": 4.915167808532715, + "learning_rate": 3.899484536082474e-06, + "logits/chosen": 7.4133172035217285, + "logits/rejected": 3.7280473709106445, + "logps/chosen": -211.74615478515625, + "logps/rejected": -198.40516662597656, + "loss": 0.6095, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.18608593940734863, + "rewards/margins": 0.2144806981086731, + "rewards/rejected": -0.02839474007487297, + "step": 1513 + }, + { + "epoch": 0.23413879760293832, + "grad_norm": 5.191720485687256, + "learning_rate": 3.9020618556701035e-06, + "logits/chosen": 15.940607070922852, + "logits/rejected": 6.936572074890137, + "logps/chosen": -336.0983581542969, + "logps/rejected": -260.76580810546875, + "loss": 0.6357, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.40377309918403625, + "rewards/margins": 0.27017515897750854, + "rewards/rejected": 0.1335979551076889, + "step": 1514 + }, + { + "epoch": 0.2342934467427025, + "grad_norm": 4.383785247802734, + "learning_rate": 3.904639175257732e-06, + "logits/chosen": 1.3904391527175903, + "logits/rejected": -3.068742275238037, + "logps/chosen": -199.25372314453125, + "logps/rejected": -123.5685806274414, + "loss": 0.5633, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.5240488648414612, + "rewards/margins": 0.3650621771812439, + "rewards/rejected": 0.15898671746253967, + "step": 1515 + }, + { + "epoch": 0.23444809588246665, + "grad_norm": 7.265319347381592, + "learning_rate": 3.907216494845361e-06, + "logits/chosen": 13.460977554321289, + "logits/rejected": 6.150660514831543, + "logps/chosen": -406.9249267578125, + "logps/rejected": -235.89553833007812, + "loss": 0.6503, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.5234739780426025, + "rewards/margins": 0.1150905191898346, + "rewards/rejected": 0.40838345885276794, + "step": 1516 + }, + { + "epoch": 0.2346027450222308, + "grad_norm": 7.195724964141846, + "learning_rate": 3.90979381443299e-06, + "logits/chosen": 8.465538024902344, + "logits/rejected": 6.386586666107178, + "logps/chosen": -360.35247802734375, + "logps/rejected": -264.21124267578125, + "loss": 0.7, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.5524378418922424, + "rewards/margins": 0.0303567573428154, + "rewards/rejected": 0.5220810770988464, + "step": 1517 + }, + { + "epoch": 0.23475739416199498, + "grad_norm": 7.234546184539795, + "learning_rate": 3.912371134020619e-06, + "logits/chosen": 15.706295013427734, + "logits/rejected": 10.237701416015625, + "logps/chosen": -426.7939758300781, + "logps/rejected": -433.022216796875, + "loss": 0.6667, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.3625059127807617, + "rewards/margins": 0.06743927299976349, + "rewards/rejected": 0.2950666546821594, + "step": 1518 + }, + { + "epoch": 0.23491204330175913, + "grad_norm": 5.0474534034729, + "learning_rate": 3.9149484536082475e-06, + "logits/chosen": 10.240234375, + "logits/rejected": 4.4165825843811035, + "logps/chosen": -183.04933166503906, + "logps/rejected": -149.90525817871094, + "loss": 0.7293, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.20261268317699432, + "rewards/margins": -0.04941230267286301, + "rewards/rejected": 0.25202497839927673, + "step": 1519 + }, + { + "epoch": 0.23506669244152328, + "grad_norm": 5.936202526092529, + "learning_rate": 3.917525773195877e-06, + "logits/chosen": 10.639086723327637, + "logits/rejected": 3.121960401535034, + "logps/chosen": -246.2490692138672, + "logps/rejected": -164.32725524902344, + "loss": 0.7013, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.36068159341812134, + "rewards/margins": -0.00103764608502388, + "rewards/rejected": 0.3617192506790161, + "step": 1520 + }, + { + "epoch": 0.23522134158128746, + "grad_norm": 6.167348861694336, + "learning_rate": 3.920103092783505e-06, + "logits/chosen": 11.816542625427246, + "logits/rejected": 4.546465873718262, + "logps/chosen": -280.189453125, + "logps/rejected": -210.26174926757812, + "loss": 0.6743, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.12677177786827087, + "rewards/margins": 0.15038365125656128, + "rewards/rejected": -0.02361188270151615, + "step": 1521 + }, + { + "epoch": 0.23537599072105161, + "grad_norm": 5.172669887542725, + "learning_rate": 3.9226804123711345e-06, + "logits/chosen": 10.670907974243164, + "logits/rejected": 7.24764347076416, + "logps/chosen": -278.16339111328125, + "logps/rejected": -181.60498046875, + "loss": 0.6806, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.4874260127544403, + "rewards/margins": 0.043611686676740646, + "rewards/rejected": 0.44381433725357056, + "step": 1522 + }, + { + "epoch": 0.23553063986081577, + "grad_norm": 5.252559185028076, + "learning_rate": 3.925257731958763e-06, + "logits/chosen": 8.412662506103516, + "logits/rejected": 3.091952323913574, + "logps/chosen": -214.8580322265625, + "logps/rejected": -172.47805786132812, + "loss": 0.6562, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.4008265733718872, + "rewards/margins": 0.09591773897409439, + "rewards/rejected": 0.3049088716506958, + "step": 1523 + }, + { + "epoch": 0.23568528900057995, + "grad_norm": 5.445528984069824, + "learning_rate": 3.927835051546392e-06, + "logits/chosen": 11.759615898132324, + "logits/rejected": 6.603482246398926, + "logps/chosen": -347.9300537109375, + "logps/rejected": -254.6511993408203, + "loss": 0.6081, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.6747207045555115, + "rewards/margins": 0.21302491426467896, + "rewards/rejected": 0.4616957902908325, + "step": 1524 + }, + { + "epoch": 0.2358399381403441, + "grad_norm": 4.5485734939575195, + "learning_rate": 3.930412371134021e-06, + "logits/chosen": 10.335458755493164, + "logits/rejected": 2.6694533824920654, + "logps/chosen": -356.0323486328125, + "logps/rejected": -181.03753662109375, + "loss": 0.5395, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.6421796083450317, + "rewards/margins": 0.38767755031585693, + "rewards/rejected": 0.2545021176338196, + "step": 1525 + }, + { + "epoch": 0.23599458728010825, + "grad_norm": 5.563239097595215, + "learning_rate": 3.93298969072165e-06, + "logits/chosen": 9.825559616088867, + "logits/rejected": 8.412909507751465, + "logps/chosen": -270.6493225097656, + "logps/rejected": -206.90167236328125, + "loss": 0.7916, + "rewards/accuracies": 0.25, + "rewards/chosen": 0.30442261695861816, + "rewards/margins": -0.13934195041656494, + "rewards/rejected": 0.4437645673751831, + "step": 1526 + }, + { + "epoch": 0.23614923641987243, + "grad_norm": 5.295621395111084, + "learning_rate": 3.9355670103092784e-06, + "logits/chosen": 10.532032012939453, + "logits/rejected": 6.786726951599121, + "logps/chosen": -329.4215087890625, + "logps/rejected": -267.92071533203125, + "loss": 0.6152, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.26189011335372925, + "rewards/margins": 0.19583845138549805, + "rewards/rejected": 0.06605168431997299, + "step": 1527 + }, + { + "epoch": 0.23630388555963658, + "grad_norm": 8.2406587600708, + "learning_rate": 3.938144329896908e-06, + "logits/chosen": 6.929096221923828, + "logits/rejected": 5.003818035125732, + "logps/chosen": -304.64306640625, + "logps/rejected": -254.5154571533203, + "loss": 0.6335, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.5827102065086365, + "rewards/margins": 0.13970284163951874, + "rewards/rejected": 0.44300737977027893, + "step": 1528 + }, + { + "epoch": 0.23645853469940073, + "grad_norm": 4.663034915924072, + "learning_rate": 3.940721649484536e-06, + "logits/chosen": 9.873632431030273, + "logits/rejected": 3.9288434982299805, + "logps/chosen": -293.5548095703125, + "logps/rejected": -181.2091522216797, + "loss": 0.5975, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.6812810897827148, + "rewards/margins": 0.29897189140319824, + "rewards/rejected": 0.382309228181839, + "step": 1529 + }, + { + "epoch": 0.23661318383916488, + "grad_norm": 4.7728118896484375, + "learning_rate": 3.9432989690721655e-06, + "logits/chosen": 12.11138916015625, + "logits/rejected": 7.631097793579102, + "logps/chosen": -212.34243774414062, + "logps/rejected": -150.57159423828125, + "loss": 0.7554, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.4381111264228821, + "rewards/margins": -0.07770747691392899, + "rewards/rejected": 0.5158185958862305, + "step": 1530 + }, + { + "epoch": 0.23676783297892906, + "grad_norm": 6.447408199310303, + "learning_rate": 3.945876288659794e-06, + "logits/chosen": 7.343404293060303, + "logits/rejected": 6.306171894073486, + "logps/chosen": -202.13076782226562, + "logps/rejected": -227.48036193847656, + "loss": 0.6646, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.4075896441936493, + "rewards/margins": 0.08385476469993591, + "rewards/rejected": 0.3237348794937134, + "step": 1531 + }, + { + "epoch": 0.2369224821186932, + "grad_norm": 5.694151401519775, + "learning_rate": 3.948453608247423e-06, + "logits/chosen": 2.9482839107513428, + "logits/rejected": 9.376598358154297, + "logps/chosen": -183.54022216796875, + "logps/rejected": -232.29525756835938, + "loss": 0.7505, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.519823431968689, + "rewards/margins": -0.08198332041501999, + "rewards/rejected": 0.6018067598342896, + "step": 1532 + }, + { + "epoch": 0.23707713125845736, + "grad_norm": 4.7428717613220215, + "learning_rate": 3.951030927835052e-06, + "logits/chosen": 9.520221710205078, + "logits/rejected": 9.238412857055664, + "logps/chosen": -289.4178161621094, + "logps/rejected": -280.3835754394531, + "loss": 0.5764, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.5405789017677307, + "rewards/margins": 0.27859997749328613, + "rewards/rejected": 0.2619789242744446, + "step": 1533 + }, + { + "epoch": 0.23723178039822154, + "grad_norm": 4.616872310638428, + "learning_rate": 3.953608247422681e-06, + "logits/chosen": 12.716232299804688, + "logits/rejected": 8.899825096130371, + "logps/chosen": -259.2904052734375, + "logps/rejected": -192.69430541992188, + "loss": 0.6037, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.46938180923461914, + "rewards/margins": 0.20717325806617737, + "rewards/rejected": 0.26220858097076416, + "step": 1534 + }, + { + "epoch": 0.2373864295379857, + "grad_norm": 5.762049198150635, + "learning_rate": 3.956185567010309e-06, + "logits/chosen": 9.231230735778809, + "logits/rejected": 7.194056510925293, + "logps/chosen": -315.4071044921875, + "logps/rejected": -275.0879821777344, + "loss": 0.6795, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.4359944462776184, + "rewards/margins": 0.048519887030124664, + "rewards/rejected": 0.38747456669807434, + "step": 1535 + }, + { + "epoch": 0.23754107867774985, + "grad_norm": 5.51515007019043, + "learning_rate": 3.958762886597938e-06, + "logits/chosen": 6.62996244430542, + "logits/rejected": 8.82288932800293, + "logps/chosen": -298.00604248046875, + "logps/rejected": -337.2183837890625, + "loss": 0.648, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.6336236000061035, + "rewards/margins": 0.13676565885543823, + "rewards/rejected": 0.4968579411506653, + "step": 1536 + }, + { + "epoch": 0.23769572781751402, + "grad_norm": 7.043104648590088, + "learning_rate": 3.961340206185567e-06, + "logits/chosen": 10.491825103759766, + "logits/rejected": 9.131352424621582, + "logps/chosen": -365.07122802734375, + "logps/rejected": -265.52874755859375, + "loss": 0.8316, + "rewards/accuracies": 0.25, + "rewards/chosen": 0.23324403166770935, + "rewards/margins": -0.23364277184009552, + "rewards/rejected": 0.46688681840896606, + "step": 1537 + }, + { + "epoch": 0.23785037695727818, + "grad_norm": 4.385149955749512, + "learning_rate": 3.963917525773196e-06, + "logits/chosen": 7.493856430053711, + "logits/rejected": 1.1993488073349, + "logps/chosen": -378.59686279296875, + "logps/rejected": -259.1129150390625, + "loss": 0.5343, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.6846151947975159, + "rewards/margins": 0.36997056007385254, + "rewards/rejected": 0.31464463472366333, + "step": 1538 + }, + { + "epoch": 0.23800502609704233, + "grad_norm": 6.514186859130859, + "learning_rate": 3.966494845360825e-06, + "logits/chosen": 6.022164821624756, + "logits/rejected": 10.044828414916992, + "logps/chosen": -297.1440124511719, + "logps/rejected": -300.332763671875, + "loss": 0.6612, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.358091801404953, + "rewards/margins": 0.09648928046226501, + "rewards/rejected": 0.261602520942688, + "step": 1539 + }, + { + "epoch": 0.2381596752368065, + "grad_norm": 4.723688125610352, + "learning_rate": 3.969072164948453e-06, + "logits/chosen": 6.895518779754639, + "logits/rejected": 2.1096956729888916, + "logps/chosen": -210.67108154296875, + "logps/rejected": -185.7576141357422, + "loss": 0.5632, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.6370294094085693, + "rewards/margins": 0.2994311451911926, + "rewards/rejected": 0.3375983238220215, + "step": 1540 + }, + { + "epoch": 0.23831432437657066, + "grad_norm": 4.970569610595703, + "learning_rate": 3.971649484536083e-06, + "logits/chosen": 11.017690658569336, + "logits/rejected": 7.6477484703063965, + "logps/chosen": -181.72262573242188, + "logps/rejected": -161.9139404296875, + "loss": 0.7472, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.5567940473556519, + "rewards/margins": -0.009918492287397385, + "rewards/rejected": 0.5667125582695007, + "step": 1541 + }, + { + "epoch": 0.2384689735163348, + "grad_norm": 6.497038841247559, + "learning_rate": 3.974226804123711e-06, + "logits/chosen": 12.549454689025879, + "logits/rejected": 8.95539665222168, + "logps/chosen": -235.93719482421875, + "logps/rejected": -244.67599487304688, + "loss": 0.7134, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.23067083954811096, + "rewards/margins": -0.0008360408246517181, + "rewards/rejected": 0.23150688409805298, + "step": 1542 + }, + { + "epoch": 0.238623622656099, + "grad_norm": 5.013777732849121, + "learning_rate": 3.97680412371134e-06, + "logits/chosen": 7.2159318923950195, + "logits/rejected": 6.313971042633057, + "logps/chosen": -193.31979370117188, + "logps/rejected": -206.64132690429688, + "loss": 0.6661, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.40325412154197693, + "rewards/margins": 0.09369264543056488, + "rewards/rejected": 0.30956146121025085, + "step": 1543 + }, + { + "epoch": 0.23877827179586314, + "grad_norm": 6.626040935516357, + "learning_rate": 3.979381443298969e-06, + "logits/chosen": 11.671626091003418, + "logits/rejected": 7.379290580749512, + "logps/chosen": -303.1752014160156, + "logps/rejected": -229.61151123046875, + "loss": 0.5585, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.5849947929382324, + "rewards/margins": 0.34085673093795776, + "rewards/rejected": 0.24413806200027466, + "step": 1544 + }, + { + "epoch": 0.2389329209356273, + "grad_norm": 5.476341724395752, + "learning_rate": 3.981958762886598e-06, + "logits/chosen": 10.761817932128906, + "logits/rejected": 11.846903800964355, + "logps/chosen": -233.01882934570312, + "logps/rejected": -271.3255615234375, + "loss": 0.6231, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.4730564057826996, + "rewards/margins": 0.20200592279434204, + "rewards/rejected": 0.27105045318603516, + "step": 1545 + }, + { + "epoch": 0.23908757007539144, + "grad_norm": 7.053686618804932, + "learning_rate": 3.9845360824742274e-06, + "logits/chosen": 6.4414215087890625, + "logits/rejected": 9.58053970336914, + "logps/chosen": -257.3913879394531, + "logps/rejected": -323.2955322265625, + "loss": 0.68, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.4482116401195526, + "rewards/margins": 0.047430846840143204, + "rewards/rejected": 0.4007807970046997, + "step": 1546 + }, + { + "epoch": 0.23924221921515562, + "grad_norm": 5.480428218841553, + "learning_rate": 3.987113402061856e-06, + "logits/chosen": 9.976581573486328, + "logits/rejected": 9.315201759338379, + "logps/chosen": -245.08792114257812, + "logps/rejected": -224.33184814453125, + "loss": 0.6179, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.5003190636634827, + "rewards/margins": 0.197228342294693, + "rewards/rejected": 0.3030906617641449, + "step": 1547 + }, + { + "epoch": 0.23939686835491977, + "grad_norm": 4.120488166809082, + "learning_rate": 3.989690721649485e-06, + "logits/chosen": 7.97170877456665, + "logits/rejected": 6.064027786254883, + "logps/chosen": -177.2056884765625, + "logps/rejected": -172.6649169921875, + "loss": 0.5638, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.7762455940246582, + "rewards/margins": 0.30764156579971313, + "rewards/rejected": 0.46860408782958984, + "step": 1548 + }, + { + "epoch": 0.23955151749468392, + "grad_norm": 5.3417067527771, + "learning_rate": 3.992268041237114e-06, + "logits/chosen": 7.547265529632568, + "logits/rejected": 3.9034485816955566, + "logps/chosen": -249.04042053222656, + "logps/rejected": -248.63888549804688, + "loss": 0.6835, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.40392619371414185, + "rewards/margins": 0.10767553746700287, + "rewards/rejected": 0.2962506413459778, + "step": 1549 + }, + { + "epoch": 0.2397061666344481, + "grad_norm": 6.094179630279541, + "learning_rate": 3.994845360824743e-06, + "logits/chosen": 11.436868667602539, + "logits/rejected": 7.787228107452393, + "logps/chosen": -346.1475830078125, + "logps/rejected": -321.69775390625, + "loss": 0.634, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.44328147172927856, + "rewards/margins": 0.18709611892700195, + "rewards/rejected": 0.2561853528022766, + "step": 1550 + }, + { + "epoch": 0.23986081577421225, + "grad_norm": 5.648887634277344, + "learning_rate": 3.997422680412371e-06, + "logits/chosen": 2.649181842803955, + "logits/rejected": 6.367504119873047, + "logps/chosen": -287.20831298828125, + "logps/rejected": -307.246337890625, + "loss": 0.684, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.4901849925518036, + "rewards/margins": 0.05955544486641884, + "rewards/rejected": 0.43062952160835266, + "step": 1551 + }, + { + "epoch": 0.2400154649139764, + "grad_norm": 5.494936466217041, + "learning_rate": 4.000000000000001e-06, + "logits/chosen": 12.190316200256348, + "logits/rejected": 10.302474021911621, + "logps/chosen": -343.9850158691406, + "logps/rejected": -264.27911376953125, + "loss": 0.5388, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.6995195746421814, + "rewards/margins": 0.39650246500968933, + "rewards/rejected": 0.30301710963249207, + "step": 1552 + }, + { + "epoch": 0.24017011405374059, + "grad_norm": 4.425969123840332, + "learning_rate": 4.002577319587629e-06, + "logits/chosen": 9.866296768188477, + "logits/rejected": 4.985841274261475, + "logps/chosen": -296.0503234863281, + "logps/rejected": -240.78343200683594, + "loss": 0.6032, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.5350544452667236, + "rewards/margins": 0.196751207113266, + "rewards/rejected": 0.33830320835113525, + "step": 1553 + }, + { + "epoch": 0.24032476319350474, + "grad_norm": 8.744105339050293, + "learning_rate": 4.005154639175258e-06, + "logits/chosen": 7.268885612487793, + "logits/rejected": 6.854941368103027, + "logps/chosen": -415.1798095703125, + "logps/rejected": -364.4555969238281, + "loss": 0.7227, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.9483609199523926, + "rewards/margins": 0.011875815689563751, + "rewards/rejected": 0.9364850521087646, + "step": 1554 + }, + { + "epoch": 0.2404794123332689, + "grad_norm": 7.5815653800964355, + "learning_rate": 4.007731958762887e-06, + "logits/chosen": 12.504404067993164, + "logits/rejected": 11.693572998046875, + "logps/chosen": -306.8857727050781, + "logps/rejected": -252.85137939453125, + "loss": 0.6413, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.4932171106338501, + "rewards/margins": 0.21040642261505127, + "rewards/rejected": 0.28281062841415405, + "step": 1555 + }, + { + "epoch": 0.24063406147303307, + "grad_norm": 4.477160453796387, + "learning_rate": 4.010309278350516e-06, + "logits/chosen": 7.074172019958496, + "logits/rejected": 9.082114219665527, + "logps/chosen": -247.4729766845703, + "logps/rejected": -216.42205810546875, + "loss": 0.5979, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.4961368441581726, + "rewards/margins": 0.21538829803466797, + "rewards/rejected": 0.280748575925827, + "step": 1556 + }, + { + "epoch": 0.24078871061279722, + "grad_norm": 3.7664945125579834, + "learning_rate": 4.012886597938145e-06, + "logits/chosen": 3.4420535564422607, + "logits/rejected": 6.456167221069336, + "logps/chosen": -156.06997680664062, + "logps/rejected": -170.4356689453125, + "loss": 0.6902, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.2626434862613678, + "rewards/margins": 0.10600829124450684, + "rewards/rejected": 0.15663519501686096, + "step": 1557 + }, + { + "epoch": 0.24094335975256137, + "grad_norm": 7.399625778198242, + "learning_rate": 4.015463917525774e-06, + "logits/chosen": 12.656184196472168, + "logits/rejected": 12.132808685302734, + "logps/chosen": -348.25836181640625, + "logps/rejected": -312.4849853515625, + "loss": 0.7158, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.5322256088256836, + "rewards/margins": 0.015666592866182327, + "rewards/rejected": 0.5165590643882751, + "step": 1558 + }, + { + "epoch": 0.24109800889232555, + "grad_norm": 3.438904285430908, + "learning_rate": 4.018041237113402e-06, + "logits/chosen": 9.07430648803711, + "logits/rejected": 5.470651626586914, + "logps/chosen": -165.8836669921875, + "logps/rejected": -133.9071502685547, + "loss": 0.5751, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.45373183488845825, + "rewards/margins": 0.283252090215683, + "rewards/rejected": 0.17047977447509766, + "step": 1559 + }, + { + "epoch": 0.2412526580320897, + "grad_norm": 6.311559200286865, + "learning_rate": 4.020618556701032e-06, + "logits/chosen": 10.098783493041992, + "logits/rejected": 3.9479897022247314, + "logps/chosen": -333.0740966796875, + "logps/rejected": -276.79388427734375, + "loss": 0.624, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.7639670372009277, + "rewards/margins": 0.24557381868362427, + "rewards/rejected": 0.5183932781219482, + "step": 1560 + }, + { + "epoch": 0.24140730717185385, + "grad_norm": 5.065113544464111, + "learning_rate": 4.02319587628866e-06, + "logits/chosen": 8.530557632446289, + "logits/rejected": 0.9309936165809631, + "logps/chosen": -230.94143676757812, + "logps/rejected": -145.53173828125, + "loss": 0.6795, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.27160483598709106, + "rewards/margins": 0.06093420833349228, + "rewards/rejected": 0.2106706202030182, + "step": 1561 + }, + { + "epoch": 0.241561956311618, + "grad_norm": 5.3548197746276855, + "learning_rate": 4.025773195876289e-06, + "logits/chosen": 7.3155717849731445, + "logits/rejected": 11.198726654052734, + "logps/chosen": -193.5297088623047, + "logps/rejected": -182.76806640625, + "loss": 0.7579, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.2433968484401703, + "rewards/margins": -0.09672746807336807, + "rewards/rejected": 0.34012433886528015, + "step": 1562 + }, + { + "epoch": 0.24171660545138218, + "grad_norm": 8.268684387207031, + "learning_rate": 4.028350515463918e-06, + "logits/chosen": 11.107728004455566, + "logits/rejected": 8.472933769226074, + "logps/chosen": -338.319580078125, + "logps/rejected": -301.05352783203125, + "loss": 0.7797, + "rewards/accuracies": 0.25, + "rewards/chosen": 0.5623660683631897, + "rewards/margins": -0.13360103964805603, + "rewards/rejected": 0.6959670782089233, + "step": 1563 + }, + { + "epoch": 0.24187125459114633, + "grad_norm": 3.832423210144043, + "learning_rate": 4.030927835051547e-06, + "logits/chosen": 10.927206039428711, + "logits/rejected": 2.791125774383545, + "logps/chosen": -234.20565795898438, + "logps/rejected": -149.89859008789062, + "loss": 0.5153, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.534905195236206, + "rewards/margins": 0.48860862851142883, + "rewards/rejected": 0.04629654437303543, + "step": 1564 + }, + { + "epoch": 0.24202590373091049, + "grad_norm": 5.430241584777832, + "learning_rate": 4.033505154639176e-06, + "logits/chosen": 6.212075710296631, + "logits/rejected": 6.049513816833496, + "logps/chosen": -185.7310791015625, + "logps/rejected": -189.97540283203125, + "loss": 0.7401, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.26261845231056213, + "rewards/margins": -0.06871247291564941, + "rewards/rejected": 0.33133092522621155, + "step": 1565 + }, + { + "epoch": 0.24218055287067466, + "grad_norm": 6.031858921051025, + "learning_rate": 4.036082474226805e-06, + "logits/chosen": 9.077997207641602, + "logits/rejected": 3.846890449523926, + "logps/chosen": -250.1879119873047, + "logps/rejected": -238.61819458007812, + "loss": 0.6995, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.3761819303035736, + "rewards/margins": 0.13276588916778564, + "rewards/rejected": 0.24341602623462677, + "step": 1566 + }, + { + "epoch": 0.24233520201043882, + "grad_norm": 5.185763359069824, + "learning_rate": 4.038659793814433e-06, + "logits/chosen": 11.459232330322266, + "logits/rejected": 5.572287559509277, + "logps/chosen": -200.27467346191406, + "logps/rejected": -147.2150115966797, + "loss": 0.6437, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.43723392486572266, + "rewards/margins": 0.1866198629140854, + "rewards/rejected": 0.25061407685279846, + "step": 1567 + }, + { + "epoch": 0.24248985115020297, + "grad_norm": 5.5745110511779785, + "learning_rate": 4.041237113402063e-06, + "logits/chosen": 9.668834686279297, + "logits/rejected": 9.785215377807617, + "logps/chosen": -385.4316101074219, + "logps/rejected": -336.98590087890625, + "loss": 0.5286, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.6820379495620728, + "rewards/margins": 0.40019628405570984, + "rewards/rejected": 0.2818416655063629, + "step": 1568 + }, + { + "epoch": 0.24264450028996715, + "grad_norm": 19.378620147705078, + "learning_rate": 4.043814432989691e-06, + "logits/chosen": 9.61506175994873, + "logits/rejected": 5.413683891296387, + "logps/chosen": -289.4730224609375, + "logps/rejected": -198.4378204345703, + "loss": 0.5973, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.535370409488678, + "rewards/margins": 0.2744349241256714, + "rewards/rejected": 0.2609354853630066, + "step": 1569 + }, + { + "epoch": 0.2427991494297313, + "grad_norm": 4.232359409332275, + "learning_rate": 4.04639175257732e-06, + "logits/chosen": 9.682815551757812, + "logits/rejected": 7.381643295288086, + "logps/chosen": -207.211181640625, + "logps/rejected": -205.03805541992188, + "loss": 0.6202, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.5808166265487671, + "rewards/margins": 0.1779598742723465, + "rewards/rejected": 0.4028567671775818, + "step": 1570 + }, + { + "epoch": 0.24295379856949545, + "grad_norm": 4.199387073516846, + "learning_rate": 4.048969072164949e-06, + "logits/chosen": 8.136574745178223, + "logits/rejected": 9.329063415527344, + "logps/chosen": -140.48373413085938, + "logps/rejected": -144.8646697998047, + "loss": 0.6798, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.40834593772888184, + "rewards/margins": 0.05896200239658356, + "rewards/rejected": 0.3493839204311371, + "step": 1571 + }, + { + "epoch": 0.24310844770925963, + "grad_norm": 6.335525035858154, + "learning_rate": 4.051546391752578e-06, + "logits/chosen": 7.455455303192139, + "logits/rejected": 3.4150402545928955, + "logps/chosen": -470.0790100097656, + "logps/rejected": -354.06585693359375, + "loss": 0.6715, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.620119571685791, + "rewards/margins": 0.07851359993219376, + "rewards/rejected": 0.5416059494018555, + "step": 1572 + }, + { + "epoch": 0.24326309684902378, + "grad_norm": 6.335812091827393, + "learning_rate": 4.0541237113402066e-06, + "logits/chosen": 11.296810150146484, + "logits/rejected": 12.278915405273438, + "logps/chosen": -304.5184631347656, + "logps/rejected": -305.2149353027344, + "loss": 0.6959, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.5601184368133545, + "rewards/margins": 0.08246137946844101, + "rewards/rejected": 0.4776570200920105, + "step": 1573 + }, + { + "epoch": 0.24341774598878793, + "grad_norm": 5.249716758728027, + "learning_rate": 4.056701030927835e-06, + "logits/chosen": 5.735694885253906, + "logits/rejected": 3.0682520866394043, + "logps/chosen": -306.17108154296875, + "logps/rejected": -201.2200927734375, + "loss": 0.5644, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.7355351448059082, + "rewards/margins": 0.3275569975376129, + "rewards/rejected": 0.40797820687294006, + "step": 1574 + }, + { + "epoch": 0.2435723951285521, + "grad_norm": 5.277350902557373, + "learning_rate": 4.059278350515464e-06, + "logits/chosen": 15.300821304321289, + "logits/rejected": 13.396979331970215, + "logps/chosen": -234.18836975097656, + "logps/rejected": -215.06817626953125, + "loss": 0.5788, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.42278116941452026, + "rewards/margins": 0.30266231298446655, + "rewards/rejected": 0.1201188713312149, + "step": 1575 + }, + { + "epoch": 0.24372704426831626, + "grad_norm": 7.821404933929443, + "learning_rate": 4.061855670103093e-06, + "logits/chosen": 9.823543548583984, + "logits/rejected": 9.900336265563965, + "logps/chosen": -257.73492431640625, + "logps/rejected": -306.6255187988281, + "loss": 0.6965, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.35069897770881653, + "rewards/margins": 0.030589543282985687, + "rewards/rejected": 0.320109486579895, + "step": 1576 + }, + { + "epoch": 0.2438816934080804, + "grad_norm": 7.5015997886657715, + "learning_rate": 4.064432989690722e-06, + "logits/chosen": 10.402287483215332, + "logits/rejected": 6.180578708648682, + "logps/chosen": -367.4396057128906, + "logps/rejected": -246.1971435546875, + "loss": 0.6785, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.33299723267555237, + "rewards/margins": 0.07326700538396835, + "rewards/rejected": 0.2597302496433258, + "step": 1577 + }, + { + "epoch": 0.24403634254784456, + "grad_norm": 18.336650848388672, + "learning_rate": 4.0670103092783505e-06, + "logits/chosen": 11.310956954956055, + "logits/rejected": 10.829543113708496, + "logps/chosen": -177.29788208007812, + "logps/rejected": -208.85240173339844, + "loss": 0.6101, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.424050897359848, + "rewards/margins": 0.24967622756958008, + "rewards/rejected": 0.17437466979026794, + "step": 1578 + }, + { + "epoch": 0.24419099168760874, + "grad_norm": 5.507360458374023, + "learning_rate": 4.06958762886598e-06, + "logits/chosen": 3.5101687908172607, + "logits/rejected": 0.9511356949806213, + "logps/chosen": -216.30337524414062, + "logps/rejected": -138.9408416748047, + "loss": 0.6354, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.4257768988609314, + "rewards/margins": 0.20511355996131897, + "rewards/rejected": 0.220663383603096, + "step": 1579 + }, + { + "epoch": 0.2443456408273729, + "grad_norm": 5.62885046005249, + "learning_rate": 4.072164948453608e-06, + "logits/chosen": 0.9026474952697754, + "logits/rejected": 5.950027942657471, + "logps/chosen": -160.66421508789062, + "logps/rejected": -215.285888671875, + "loss": 0.6083, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.3418775796890259, + "rewards/margins": 0.24033251404762268, + "rewards/rejected": 0.101545050740242, + "step": 1580 + }, + { + "epoch": 0.24450028996713705, + "grad_norm": 4.013876914978027, + "learning_rate": 4.0747422680412375e-06, + "logits/chosen": 13.218616485595703, + "logits/rejected": 9.569124221801758, + "logps/chosen": -304.29290771484375, + "logps/rejected": -217.02273559570312, + "loss": 0.5242, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.7105241417884827, + "rewards/margins": 0.42840376496315, + "rewards/rejected": 0.2821202874183655, + "step": 1581 + }, + { + "epoch": 0.24465493910690123, + "grad_norm": 6.176245212554932, + "learning_rate": 4.077319587628866e-06, + "logits/chosen": 6.742888450622559, + "logits/rejected": 9.035784721374512, + "logps/chosen": -348.742919921875, + "logps/rejected": -335.8526611328125, + "loss": 0.7004, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.25292858481407166, + "rewards/margins": 0.051133204251527786, + "rewards/rejected": 0.20179539918899536, + "step": 1582 + }, + { + "epoch": 0.24480958824666538, + "grad_norm": 7.001703262329102, + "learning_rate": 4.079896907216495e-06, + "logits/chosen": 13.059893608093262, + "logits/rejected": 6.234818935394287, + "logps/chosen": -288.611572265625, + "logps/rejected": -171.21954345703125, + "loss": 0.7763, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.16351842880249023, + "rewards/margins": -0.08241356909275055, + "rewards/rejected": 0.24593199789524078, + "step": 1583 + }, + { + "epoch": 0.24496423738642953, + "grad_norm": 6.992018222808838, + "learning_rate": 4.082474226804124e-06, + "logits/chosen": 11.671810150146484, + "logits/rejected": 12.481635093688965, + "logps/chosen": -272.3096618652344, + "logps/rejected": -287.1587829589844, + "loss": 0.8005, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.5078313946723938, + "rewards/margins": -0.13844740390777588, + "rewards/rejected": 0.6462787985801697, + "step": 1584 + }, + { + "epoch": 0.2451188865261937, + "grad_norm": 6.328573703765869, + "learning_rate": 4.085051546391753e-06, + "logits/chosen": 11.106008529663086, + "logits/rejected": 9.432999610900879, + "logps/chosen": -264.5981140136719, + "logps/rejected": -257.2050476074219, + "loss": 0.8491, + "rewards/accuracies": 0.25, + "rewards/chosen": 0.18957120180130005, + "rewards/margins": -0.2706175148487091, + "rewards/rejected": 0.46018871665000916, + "step": 1585 + }, + { + "epoch": 0.24527353566595786, + "grad_norm": 7.428191661834717, + "learning_rate": 4.0876288659793815e-06, + "logits/chosen": 9.431912422180176, + "logits/rejected": 9.949995994567871, + "logps/chosen": -282.7473449707031, + "logps/rejected": -262.3726806640625, + "loss": 0.7759, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.34010347723960876, + "rewards/margins": -0.1167859435081482, + "rewards/rejected": 0.45688945055007935, + "step": 1586 + }, + { + "epoch": 0.245428184805722, + "grad_norm": 4.919670581817627, + "learning_rate": 4.090206185567011e-06, + "logits/chosen": 10.43871021270752, + "logits/rejected": 7.3854875564575195, + "logps/chosen": -300.2728576660156, + "logps/rejected": -241.9805145263672, + "loss": 0.6257, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.4826265275478363, + "rewards/margins": 0.1631772518157959, + "rewards/rejected": 0.3194493055343628, + "step": 1587 + }, + { + "epoch": 0.2455828339454862, + "grad_norm": 6.834775924682617, + "learning_rate": 4.092783505154639e-06, + "logits/chosen": 6.900688648223877, + "logits/rejected": 2.7163596153259277, + "logps/chosen": -490.18011474609375, + "logps/rejected": -290.9938049316406, + "loss": 0.5916, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.5109007358551025, + "rewards/margins": 0.28936702013015747, + "rewards/rejected": 0.2215336710214615, + "step": 1588 + }, + { + "epoch": 0.24573748308525034, + "grad_norm": 17.95108413696289, + "learning_rate": 4.0953608247422685e-06, + "logits/chosen": 10.227059364318848, + "logits/rejected": 3.6857471466064453, + "logps/chosen": -287.2636413574219, + "logps/rejected": -164.34503173828125, + "loss": 0.7411, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.24053683876991272, + "rewards/margins": -0.06568513810634613, + "rewards/rejected": 0.30622199177742004, + "step": 1589 + }, + { + "epoch": 0.2458921322250145, + "grad_norm": 6.457596778869629, + "learning_rate": 4.097938144329897e-06, + "logits/chosen": 8.670398712158203, + "logits/rejected": 12.373117446899414, + "logps/chosen": -273.8775329589844, + "logps/rejected": -326.6858215332031, + "loss": 0.6257, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.629082977771759, + "rewards/margins": 0.16207952797412872, + "rewards/rejected": 0.4670034348964691, + "step": 1590 + }, + { + "epoch": 0.24604678136477867, + "grad_norm": 5.513363361358643, + "learning_rate": 4.100515463917526e-06, + "logits/chosen": 16.222354888916016, + "logits/rejected": 9.223556518554688, + "logps/chosen": -337.9263916015625, + "logps/rejected": -254.47622680664062, + "loss": 0.5971, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.42576923966407776, + "rewards/margins": 0.25148630142211914, + "rewards/rejected": 0.17428293824195862, + "step": 1591 + }, + { + "epoch": 0.24620143050454282, + "grad_norm": 5.461479187011719, + "learning_rate": 4.103092783505155e-06, + "logits/chosen": 13.338172912597656, + "logits/rejected": 7.191806793212891, + "logps/chosen": -331.81536865234375, + "logps/rejected": -265.1946105957031, + "loss": 0.4908, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.7693902850151062, + "rewards/margins": 0.5378403663635254, + "rewards/rejected": 0.23154990375041962, + "step": 1592 + }, + { + "epoch": 0.24635607964430697, + "grad_norm": 4.133162975311279, + "learning_rate": 4.105670103092784e-06, + "logits/chosen": 18.246402740478516, + "logits/rejected": 11.348210334777832, + "logps/chosen": -275.86944580078125, + "logps/rejected": -222.93460083007812, + "loss": 0.5855, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.6855422258377075, + "rewards/margins": 0.24522744119167328, + "rewards/rejected": 0.44031476974487305, + "step": 1593 + }, + { + "epoch": 0.24651072878407113, + "grad_norm": 5.335321426391602, + "learning_rate": 4.1082474226804124e-06, + "logits/chosen": 10.178993225097656, + "logits/rejected": 10.26882553100586, + "logps/chosen": -178.4753875732422, + "logps/rejected": -188.35284423828125, + "loss": 0.7575, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.25981560349464417, + "rewards/margins": -0.07432928681373596, + "rewards/rejected": 0.3341448903083801, + "step": 1594 + }, + { + "epoch": 0.2466653779238353, + "grad_norm": 7.256267070770264, + "learning_rate": 4.110824742268042e-06, + "logits/chosen": 7.20297908782959, + "logits/rejected": 5.310847759246826, + "logps/chosen": -175.30859375, + "logps/rejected": -192.25233459472656, + "loss": 0.7024, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.4013350307941437, + "rewards/margins": 0.052470821887254715, + "rewards/rejected": 0.34886422753334045, + "step": 1595 + }, + { + "epoch": 0.24682002706359946, + "grad_norm": 5.77985143661499, + "learning_rate": 4.11340206185567e-06, + "logits/chosen": 4.736292839050293, + "logits/rejected": 12.441851615905762, + "logps/chosen": -263.7264709472656, + "logps/rejected": -282.23565673828125, + "loss": 0.6815, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.24189285933971405, + "rewards/margins": 0.05961134284734726, + "rewards/rejected": 0.18228153884410858, + "step": 1596 + }, + { + "epoch": 0.2469746762033636, + "grad_norm": 7.368546485900879, + "learning_rate": 4.1159793814432995e-06, + "logits/chosen": 5.932742118835449, + "logits/rejected": 4.723309516906738, + "logps/chosen": -504.2133483886719, + "logps/rejected": -400.3802795410156, + "loss": 0.696, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.46605217456817627, + "rewards/margins": 0.01820240169763565, + "rewards/rejected": 0.4478497803211212, + "step": 1597 + }, + { + "epoch": 0.2471293253431278, + "grad_norm": 6.774259567260742, + "learning_rate": 4.118556701030928e-06, + "logits/chosen": 16.846004486083984, + "logits/rejected": 13.96391773223877, + "logps/chosen": -404.5115051269531, + "logps/rejected": -248.13941955566406, + "loss": 0.6575, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.3905647397041321, + "rewards/margins": 0.2489706128835678, + "rewards/rejected": 0.14159414172172546, + "step": 1598 + }, + { + "epoch": 0.24728397448289194, + "grad_norm": 6.341897487640381, + "learning_rate": 4.121134020618557e-06, + "logits/chosen": 8.28592300415039, + "logits/rejected": 4.471281051635742, + "logps/chosen": -482.963134765625, + "logps/rejected": -298.1980895996094, + "loss": 0.579, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.8686650991439819, + "rewards/margins": 0.3004007637500763, + "rewards/rejected": 0.5682643055915833, + "step": 1599 + }, + { + "epoch": 0.2474386236226561, + "grad_norm": 5.28187894821167, + "learning_rate": 4.123711340206186e-06, + "logits/chosen": 16.991960525512695, + "logits/rejected": 8.26224422454834, + "logps/chosen": -344.2832336425781, + "logps/rejected": -244.84840393066406, + "loss": 0.7102, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.17857977747917175, + "rewards/margins": 0.028863973915576935, + "rewards/rejected": 0.14971581101417542, + "step": 1600 + }, + { + "epoch": 0.24759327276242027, + "grad_norm": 11.196612358093262, + "learning_rate": 4.126288659793815e-06, + "logits/chosen": 6.732871055603027, + "logits/rejected": 7.686327934265137, + "logps/chosen": -247.2836151123047, + "logps/rejected": -235.54293823242188, + "loss": 0.5831, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.4772689640522003, + "rewards/margins": 0.3054541349411011, + "rewards/rejected": 0.17181482911109924, + "step": 1601 + }, + { + "epoch": 0.24774792190218442, + "grad_norm": 8.007533073425293, + "learning_rate": 4.128865979381443e-06, + "logits/chosen": 7.7536725997924805, + "logits/rejected": 7.5432515144348145, + "logps/chosen": -322.0111999511719, + "logps/rejected": -283.08660888671875, + "loss": 0.7439, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.212103933095932, + "rewards/margins": -0.03596443682909012, + "rewards/rejected": 0.24806839227676392, + "step": 1602 + }, + { + "epoch": 0.24790257104194857, + "grad_norm": 5.018898010253906, + "learning_rate": 4.131443298969072e-06, + "logits/chosen": 8.799638748168945, + "logits/rejected": 11.236151695251465, + "logps/chosen": -209.7759246826172, + "logps/rejected": -249.03305053710938, + "loss": 0.6216, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.3298637270927429, + "rewards/margins": 0.18953871726989746, + "rewards/rejected": 0.14032503962516785, + "step": 1603 + }, + { + "epoch": 0.24805722018171275, + "grad_norm": 6.016971111297607, + "learning_rate": 4.134020618556701e-06, + "logits/chosen": 7.859749794006348, + "logits/rejected": 11.439133644104004, + "logps/chosen": -217.74844360351562, + "logps/rejected": -226.37728881835938, + "loss": 0.6472, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.35840949416160583, + "rewards/margins": 0.13885095715522766, + "rewards/rejected": 0.21955853700637817, + "step": 1604 + }, + { + "epoch": 0.2482118693214769, + "grad_norm": 6.286149024963379, + "learning_rate": 4.13659793814433e-06, + "logits/chosen": 10.11738109588623, + "logits/rejected": 7.317766189575195, + "logps/chosen": -195.7477569580078, + "logps/rejected": -173.75634765625, + "loss": 0.7227, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.17697674036026, + "rewards/margins": -0.02410898357629776, + "rewards/rejected": 0.20108571648597717, + "step": 1605 + }, + { + "epoch": 0.24836651846124105, + "grad_norm": 6.834414958953857, + "learning_rate": 4.139175257731959e-06, + "logits/chosen": 8.190263748168945, + "logits/rejected": 7.97369384765625, + "logps/chosen": -357.20166015625, + "logps/rejected": -356.8304443359375, + "loss": 0.6895, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.43308526277542114, + "rewards/margins": 0.13444869220256805, + "rewards/rejected": 0.2986365556716919, + "step": 1606 + }, + { + "epoch": 0.24852116760100523, + "grad_norm": 5.023197650909424, + "learning_rate": 4.141752577319588e-06, + "logits/chosen": 7.65443754196167, + "logits/rejected": 8.107499122619629, + "logps/chosen": -194.73854064941406, + "logps/rejected": -196.713623046875, + "loss": 0.6928, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.29810890555381775, + "rewards/margins": 0.02460518479347229, + "rewards/rejected": 0.27350372076034546, + "step": 1607 + }, + { + "epoch": 0.24867581674076938, + "grad_norm": 5.9183831214904785, + "learning_rate": 4.1443298969072175e-06, + "logits/chosen": 11.174837112426758, + "logits/rejected": 8.177064895629883, + "logps/chosen": -351.370849609375, + "logps/rejected": -272.376220703125, + "loss": 0.6632, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.32531893253326416, + "rewards/margins": 0.1618170589208603, + "rewards/rejected": 0.1635018289089203, + "step": 1608 + }, + { + "epoch": 0.24883046588053354, + "grad_norm": 8.947794914245605, + "learning_rate": 4.146907216494846e-06, + "logits/chosen": 10.086214065551758, + "logits/rejected": 5.1015400886535645, + "logps/chosen": -352.83734130859375, + "logps/rejected": -262.85809326171875, + "loss": 0.8886, + "rewards/accuracies": 0.125, + "rewards/chosen": 0.2161579132080078, + "rewards/margins": -0.3356893062591553, + "rewards/rejected": 0.5518472194671631, + "step": 1609 + }, + { + "epoch": 0.2489851150202977, + "grad_norm": 6.343766212463379, + "learning_rate": 4.149484536082475e-06, + "logits/chosen": 12.456357955932617, + "logits/rejected": 9.871862411499023, + "logps/chosen": -263.312744140625, + "logps/rejected": -190.8375244140625, + "loss": 0.7309, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.43510711193084717, + "rewards/margins": -0.004523627460002899, + "rewards/rejected": 0.43963077664375305, + "step": 1610 + }, + { + "epoch": 0.24913976416006187, + "grad_norm": 4.517666816711426, + "learning_rate": 4.152061855670104e-06, + "logits/chosen": 6.7022552490234375, + "logits/rejected": 5.9243669509887695, + "logps/chosen": -301.82916259765625, + "logps/rejected": -282.73187255859375, + "loss": 0.617, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.5187374949455261, + "rewards/margins": 0.2564624845981598, + "rewards/rejected": 0.2622750401496887, + "step": 1611 + }, + { + "epoch": 0.24929441329982602, + "grad_norm": 4.882069110870361, + "learning_rate": 4.154639175257732e-06, + "logits/chosen": 9.98108196258545, + "logits/rejected": 7.616365432739258, + "logps/chosen": -230.05496215820312, + "logps/rejected": -234.039306640625, + "loss": 0.642, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.49911755323410034, + "rewards/margins": 0.19729860126972198, + "rewards/rejected": 0.30181896686553955, + "step": 1612 + }, + { + "epoch": 0.24944906243959017, + "grad_norm": 83.67471313476562, + "learning_rate": 4.1572164948453614e-06, + "logits/chosen": 10.120527267456055, + "logits/rejected": 8.16556167602539, + "logps/chosen": -299.514892578125, + "logps/rejected": -271.455810546875, + "loss": 0.6731, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.421031653881073, + "rewards/margins": 0.13261628150939941, + "rewards/rejected": 0.2884153723716736, + "step": 1613 + }, + { + "epoch": 0.24960371157935435, + "grad_norm": 5.759856224060059, + "learning_rate": 4.15979381443299e-06, + "logits/chosen": 11.615106582641602, + "logits/rejected": 10.498109817504883, + "logps/chosen": -306.879638671875, + "logps/rejected": -309.7626647949219, + "loss": 0.7423, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.590875506401062, + "rewards/margins": -0.003452412784099579, + "rewards/rejected": 0.5943279266357422, + "step": 1614 + }, + { + "epoch": 0.2497583607191185, + "grad_norm": 5.235974311828613, + "learning_rate": 4.162371134020619e-06, + "logits/chosen": 7.874249458312988, + "logits/rejected": 4.454874038696289, + "logps/chosen": -294.3907775878906, + "logps/rejected": -220.292724609375, + "loss": 0.5762, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.6139842867851257, + "rewards/margins": 0.27210158109664917, + "rewards/rejected": 0.34188273549079895, + "step": 1615 + }, + { + "epoch": 0.24991300985888265, + "grad_norm": 5.683008670806885, + "learning_rate": 4.164948453608248e-06, + "logits/chosen": 12.201066970825195, + "logits/rejected": 8.841693878173828, + "logps/chosen": -486.6582946777344, + "logps/rejected": -356.3330078125, + "loss": 0.5924, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.7200773358345032, + "rewards/margins": 0.32216161489486694, + "rewards/rejected": 0.3979157507419586, + "step": 1616 + }, + { + "epoch": 0.2500676589986468, + "grad_norm": 5.7631731033325195, + "learning_rate": 4.167525773195877e-06, + "logits/chosen": 4.660618782043457, + "logits/rejected": 0.7919836044311523, + "logps/chosen": -215.94320678710938, + "logps/rejected": -204.84751892089844, + "loss": 0.7062, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.22886334359645844, + "rewards/margins": 0.019197996705770493, + "rewards/rejected": 0.20966535806655884, + "step": 1617 + }, + { + "epoch": 0.25022230813841095, + "grad_norm": 8.097609519958496, + "learning_rate": 4.170103092783505e-06, + "logits/chosen": 8.240747451782227, + "logits/rejected": 10.663824081420898, + "logps/chosen": -277.2115478515625, + "logps/rejected": -318.309326171875, + "loss": 0.7297, + "rewards/accuracies": 0.25, + "rewards/chosen": 0.40421128273010254, + "rewards/margins": -0.0374966636300087, + "rewards/rejected": 0.44170790910720825, + "step": 1618 + }, + { + "epoch": 0.25037695727817516, + "grad_norm": 4.644769191741943, + "learning_rate": 4.172680412371135e-06, + "logits/chosen": 13.678262710571289, + "logits/rejected": 2.361966133117676, + "logps/chosen": -355.73724365234375, + "logps/rejected": -234.37550354003906, + "loss": 0.5003, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.40933820605278015, + "rewards/margins": 0.539950430393219, + "rewards/rejected": -0.13061223924160004, + "step": 1619 + }, + { + "epoch": 0.2505316064179393, + "grad_norm": 7.742588996887207, + "learning_rate": 4.175257731958763e-06, + "logits/chosen": 9.044269561767578, + "logits/rejected": 2.665898084640503, + "logps/chosen": -373.65533447265625, + "logps/rejected": -328.0936279296875, + "loss": 0.7179, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.44949454069137573, + "rewards/margins": 0.12218938022851944, + "rewards/rejected": 0.3273051977157593, + "step": 1620 + }, + { + "epoch": 0.25068625555770346, + "grad_norm": 4.420767784118652, + "learning_rate": 4.177835051546392e-06, + "logits/chosen": 9.100753784179688, + "logits/rejected": 9.952964782714844, + "logps/chosen": -195.91946411132812, + "logps/rejected": -197.62229919433594, + "loss": 0.6165, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.0967133492231369, + "rewards/margins": 0.1918657422065735, + "rewards/rejected": -0.09515240788459778, + "step": 1621 + }, + { + "epoch": 0.2508409046974676, + "grad_norm": 6.149387359619141, + "learning_rate": 4.180412371134021e-06, + "logits/chosen": 5.239181995391846, + "logits/rejected": 5.390114784240723, + "logps/chosen": -296.5237121582031, + "logps/rejected": -221.3077850341797, + "loss": 0.6636, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.31191134452819824, + "rewards/margins": 0.16515719890594482, + "rewards/rejected": 0.14675411581993103, + "step": 1622 + }, + { + "epoch": 0.25099555383723177, + "grad_norm": 4.210749626159668, + "learning_rate": 4.18298969072165e-06, + "logits/chosen": 10.543679237365723, + "logits/rejected": 10.2120361328125, + "logps/chosen": -275.5555419921875, + "logps/rejected": -276.81646728515625, + "loss": 0.6211, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.528279721736908, + "rewards/margins": 0.18649163842201233, + "rewards/rejected": 0.34178805351257324, + "step": 1623 + }, + { + "epoch": 0.2511502029769959, + "grad_norm": 17.685640335083008, + "learning_rate": 4.185567010309279e-06, + "logits/chosen": 7.824277877807617, + "logits/rejected": 13.62466812133789, + "logps/chosen": -158.25189208984375, + "logps/rejected": -357.7012939453125, + "loss": 0.7004, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.4482974112033844, + "rewards/margins": 0.056787874549627304, + "rewards/rejected": 0.3915095329284668, + "step": 1624 + }, + { + "epoch": 0.2513048521167601, + "grad_norm": 5.255959987640381, + "learning_rate": 4.188144329896908e-06, + "logits/chosen": 2.517392635345459, + "logits/rejected": 4.677642822265625, + "logps/chosen": -170.2711181640625, + "logps/rejected": -164.18605041503906, + "loss": 0.8421, + "rewards/accuracies": 0.25, + "rewards/chosen": 0.19714275002479553, + "rewards/margins": -0.2503814697265625, + "rewards/rejected": 0.44752421975135803, + "step": 1625 + }, + { + "epoch": 0.2514595012565243, + "grad_norm": 5.370728969573975, + "learning_rate": 4.190721649484536e-06, + "logits/chosen": 9.427894592285156, + "logits/rejected": 3.8264122009277344, + "logps/chosen": -317.9552001953125, + "logps/rejected": -252.83407592773438, + "loss": 0.6689, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.5245187878608704, + "rewards/margins": 0.14788249135017395, + "rewards/rejected": 0.3766363263130188, + "step": 1626 + }, + { + "epoch": 0.2516141503962884, + "grad_norm": 7.949696063995361, + "learning_rate": 4.193298969072166e-06, + "logits/chosen": 12.46045970916748, + "logits/rejected": 10.630002975463867, + "logps/chosen": -449.3725280761719, + "logps/rejected": -302.653564453125, + "loss": 0.6417, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.06579238176345825, + "rewards/margins": 0.2672802209854126, + "rewards/rejected": -0.20148782432079315, + "step": 1627 + }, + { + "epoch": 0.2517687995360526, + "grad_norm": 5.2793965339660645, + "learning_rate": 4.195876288659794e-06, + "logits/chosen": 9.032638549804688, + "logits/rejected": 3.4847865104675293, + "logps/chosen": -363.011962890625, + "logps/rejected": -237.36341857910156, + "loss": 0.6662, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.3056390881538391, + "rewards/margins": 0.1222473755478859, + "rewards/rejected": 0.1833917200565338, + "step": 1628 + }, + { + "epoch": 0.25192344867581673, + "grad_norm": 4.780789852142334, + "learning_rate": 4.198453608247423e-06, + "logits/chosen": 2.772305488586426, + "logits/rejected": 10.86107349395752, + "logps/chosen": -136.4967498779297, + "logps/rejected": -205.36329650878906, + "loss": 0.685, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.30957695841789246, + "rewards/margins": 0.06535394489765167, + "rewards/rejected": 0.2442229986190796, + "step": 1629 + }, + { + "epoch": 0.2520780978155809, + "grad_norm": 5.505533695220947, + "learning_rate": 4.201030927835052e-06, + "logits/chosen": 5.628686904907227, + "logits/rejected": 8.052618026733398, + "logps/chosen": -294.0024108886719, + "logps/rejected": -278.51141357421875, + "loss": 0.7394, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.30035290122032166, + "rewards/margins": -0.0683484673500061, + "rewards/rejected": 0.36870136857032776, + "step": 1630 + }, + { + "epoch": 0.2522327469553451, + "grad_norm": 6.067412376403809, + "learning_rate": 4.203608247422681e-06, + "logits/chosen": 3.856903553009033, + "logits/rejected": 9.425829887390137, + "logps/chosen": -223.6259765625, + "logps/rejected": -271.5736083984375, + "loss": 0.5983, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.28897133469581604, + "rewards/margins": 0.2328396588563919, + "rewards/rejected": 0.056131646037101746, + "step": 1631 + }, + { + "epoch": 0.25238739609510924, + "grad_norm": 5.93890380859375, + "learning_rate": 4.2061855670103096e-06, + "logits/chosen": 0.9251695871353149, + "logits/rejected": 7.725450038909912, + "logps/chosen": -178.5623779296875, + "logps/rejected": -275.084716796875, + "loss": 0.7078, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.32795462012290955, + "rewards/margins": -0.009829465299844742, + "rewards/rejected": 0.3377840518951416, + "step": 1632 + }, + { + "epoch": 0.2525420452348734, + "grad_norm": 4.634039402008057, + "learning_rate": 4.208762886597939e-06, + "logits/chosen": 10.779047012329102, + "logits/rejected": 3.873975992202759, + "logps/chosen": -457.6269226074219, + "logps/rejected": -299.7712097167969, + "loss": 0.4737, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.7059726715087891, + "rewards/margins": 0.5996737480163574, + "rewards/rejected": 0.10629893094301224, + "step": 1633 + }, + { + "epoch": 0.25269669437463754, + "grad_norm": 4.973537921905518, + "learning_rate": 4.211340206185567e-06, + "logits/chosen": 11.1174955368042, + "logits/rejected": 9.119796752929688, + "logps/chosen": -235.68455505371094, + "logps/rejected": -231.4317626953125, + "loss": 0.6717, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.34784913063049316, + "rewards/margins": 0.09272685647010803, + "rewards/rejected": 0.2551223039627075, + "step": 1634 + }, + { + "epoch": 0.2528513435144017, + "grad_norm": 4.96306037902832, + "learning_rate": 4.213917525773197e-06, + "logits/chosen": 11.122123718261719, + "logits/rejected": 3.534989356994629, + "logps/chosen": -298.7386474609375, + "logps/rejected": -162.73345947265625, + "loss": 0.781, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.1640453338623047, + "rewards/margins": -0.07453343272209167, + "rewards/rejected": 0.23857876658439636, + "step": 1635 + }, + { + "epoch": 0.25300599265416585, + "grad_norm": 6.668501853942871, + "learning_rate": 4.216494845360825e-06, + "logits/chosen": 9.434202194213867, + "logits/rejected": 11.486614227294922, + "logps/chosen": -228.00320434570312, + "logps/rejected": -258.9673767089844, + "loss": 0.7452, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.11207906901836395, + "rewards/margins": -0.08272166550159454, + "rewards/rejected": 0.1948007494211197, + "step": 1636 + }, + { + "epoch": 0.25316064179393, + "grad_norm": 10.877275466918945, + "learning_rate": 4.219072164948454e-06, + "logits/chosen": 10.078407287597656, + "logits/rejected": 5.6256422996521, + "logps/chosen": -330.3979187011719, + "logps/rejected": -288.82073974609375, + "loss": 0.6321, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.2939951419830322, + "rewards/margins": 0.22671625018119812, + "rewards/rejected": 0.0672789141535759, + "step": 1637 + }, + { + "epoch": 0.2533152909336942, + "grad_norm": 6.319427013397217, + "learning_rate": 4.221649484536083e-06, + "logits/chosen": 7.259459495544434, + "logits/rejected": 7.920317649841309, + "logps/chosen": -268.65576171875, + "logps/rejected": -293.05499267578125, + "loss": 0.669, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.3166123628616333, + "rewards/margins": 0.06800252199172974, + "rewards/rejected": 0.24860984086990356, + "step": 1638 + }, + { + "epoch": 0.25346994007345836, + "grad_norm": 8.284077644348145, + "learning_rate": 4.224226804123711e-06, + "logits/chosen": 11.431381225585938, + "logits/rejected": 13.871949195861816, + "logps/chosen": -375.19586181640625, + "logps/rejected": -450.5081481933594, + "loss": 0.8782, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.4328592121601105, + "rewards/margins": -0.18595139682292938, + "rewards/rejected": 0.6188106536865234, + "step": 1639 + }, + { + "epoch": 0.2536245892132225, + "grad_norm": 6.5860185623168945, + "learning_rate": 4.2268041237113405e-06, + "logits/chosen": 14.755675315856934, + "logits/rejected": 9.009217262268066, + "logps/chosen": -259.04412841796875, + "logps/rejected": -217.031005859375, + "loss": 0.5198, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.486317902803421, + "rewards/margins": 0.48018601536750793, + "rewards/rejected": 0.0061319246888160706, + "step": 1640 + }, + { + "epoch": 0.25377923835298666, + "grad_norm": 5.292786598205566, + "learning_rate": 4.229381443298969e-06, + "logits/chosen": 12.739208221435547, + "logits/rejected": 10.681746482849121, + "logps/chosen": -283.26129150390625, + "logps/rejected": -278.7433776855469, + "loss": 0.7217, + "rewards/accuracies": 0.25, + "rewards/chosen": 0.2046753317117691, + "rewards/margins": 0.019237905740737915, + "rewards/rejected": 0.18543744087219238, + "step": 1641 + }, + { + "epoch": 0.2539338874927508, + "grad_norm": 7.914367198944092, + "learning_rate": 4.231958762886598e-06, + "logits/chosen": 8.906744956970215, + "logits/rejected": 2.5963854789733887, + "logps/chosen": -577.107666015625, + "logps/rejected": -331.62530517578125, + "loss": 0.7303, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.3582886755466461, + "rewards/margins": -0.004259783774614334, + "rewards/rejected": 0.36254844069480896, + "step": 1642 + }, + { + "epoch": 0.25408853663251496, + "grad_norm": 7.514982223510742, + "learning_rate": 4.234536082474227e-06, + "logits/chosen": 11.91797924041748, + "logits/rejected": 6.141276836395264, + "logps/chosen": -412.68017578125, + "logps/rejected": -341.97161865234375, + "loss": 0.6345, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.5013439655303955, + "rewards/margins": 0.20196658372879028, + "rewards/rejected": 0.29937744140625, + "step": 1643 + }, + { + "epoch": 0.25424318577227917, + "grad_norm": 6.684088706970215, + "learning_rate": 4.237113402061856e-06, + "logits/chosen": 11.187084197998047, + "logits/rejected": 0.9069557189941406, + "logps/chosen": -429.87554931640625, + "logps/rejected": -237.06298828125, + "loss": 0.7121, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.22557444870471954, + "rewards/margins": -0.005273483693599701, + "rewards/rejected": 0.23084792494773865, + "step": 1644 + }, + { + "epoch": 0.2543978349120433, + "grad_norm": 7.095033645629883, + "learning_rate": 4.2396907216494845e-06, + "logits/chosen": 11.935456275939941, + "logits/rejected": 10.36489200592041, + "logps/chosen": -277.1987609863281, + "logps/rejected": -232.415771484375, + "loss": 0.834, + "rewards/accuracies": 0.25, + "rewards/chosen": 0.11732234060764313, + "rewards/margins": -0.24008171260356903, + "rewards/rejected": 0.35740405321121216, + "step": 1645 + }, + { + "epoch": 0.25455248405180747, + "grad_norm": 3.8879542350769043, + "learning_rate": 4.242268041237114e-06, + "logits/chosen": 12.882658958435059, + "logits/rejected": 8.600262641906738, + "logps/chosen": -179.5771942138672, + "logps/rejected": -165.47805786132812, + "loss": 0.6328, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.32318249344825745, + "rewards/margins": 0.19653114676475525, + "rewards/rejected": 0.126651331782341, + "step": 1646 + }, + { + "epoch": 0.2547071331915716, + "grad_norm": 6.627065181732178, + "learning_rate": 4.244845360824742e-06, + "logits/chosen": 6.602793216705322, + "logits/rejected": 5.097395896911621, + "logps/chosen": -158.929443359375, + "logps/rejected": -182.35226440429688, + "loss": 0.6806, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.37746188044548035, + "rewards/margins": 0.06501216441392899, + "rewards/rejected": 0.31244972348213196, + "step": 1647 + }, + { + "epoch": 0.2548617823313358, + "grad_norm": 6.520936489105225, + "learning_rate": 4.2474226804123715e-06, + "logits/chosen": 8.824220657348633, + "logits/rejected": 14.6446533203125, + "logps/chosen": -217.00958251953125, + "logps/rejected": -292.0254821777344, + "loss": 0.6742, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.5182183980941772, + "rewards/margins": 0.17121267318725586, + "rewards/rejected": 0.347005695104599, + "step": 1648 + }, + { + "epoch": 0.2550164314710999, + "grad_norm": 5.7352728843688965, + "learning_rate": 4.25e-06, + "logits/chosen": 10.118045806884766, + "logits/rejected": 10.122156143188477, + "logps/chosen": -234.01589965820312, + "logps/rejected": -286.669677734375, + "loss": 0.7225, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.3616633415222168, + "rewards/margins": 0.03921976685523987, + "rewards/rejected": 0.32244354486465454, + "step": 1649 + }, + { + "epoch": 0.2551710806108641, + "grad_norm": 5.220893383026123, + "learning_rate": 4.252577319587629e-06, + "logits/chosen": 11.87497615814209, + "logits/rejected": 8.589349746704102, + "logps/chosen": -211.21163940429688, + "logps/rejected": -181.89675903320312, + "loss": 0.6012, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.4932010769844055, + "rewards/margins": 0.21084357798099518, + "rewards/rejected": 0.2823575437068939, + "step": 1650 + }, + { + "epoch": 0.2553257297506283, + "grad_norm": 7.583624362945557, + "learning_rate": 4.255154639175258e-06, + "logits/chosen": 7.972025394439697, + "logits/rejected": 6.4408159255981445, + "logps/chosen": -251.74795532226562, + "logps/rejected": -201.86666870117188, + "loss": 0.8614, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.28842028975486755, + "rewards/margins": -0.2272483855485916, + "rewards/rejected": 0.5156686902046204, + "step": 1651 + }, + { + "epoch": 0.25548037889039243, + "grad_norm": 5.173532962799072, + "learning_rate": 4.257731958762887e-06, + "logits/chosen": 6.8554182052612305, + "logits/rejected": 2.7411251068115234, + "logps/chosen": -272.21905517578125, + "logps/rejected": -204.11715698242188, + "loss": 0.7193, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.3607516288757324, + "rewards/margins": 0.019206956028938293, + "rewards/rejected": 0.3415446877479553, + "step": 1652 + }, + { + "epoch": 0.2556350280301566, + "grad_norm": 5.8570122718811035, + "learning_rate": 4.2603092783505155e-06, + "logits/chosen": 8.766138076782227, + "logits/rejected": 9.980649948120117, + "logps/chosen": -356.2545166015625, + "logps/rejected": -280.7750244140625, + "loss": 0.6481, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.3571777939796448, + "rewards/margins": 0.11741085350513458, + "rewards/rejected": 0.23976698517799377, + "step": 1653 + }, + { + "epoch": 0.25578967716992074, + "grad_norm": 5.639509201049805, + "learning_rate": 4.262886597938145e-06, + "logits/chosen": 6.747480869293213, + "logits/rejected": 2.5935261249542236, + "logps/chosen": -275.094482421875, + "logps/rejected": -267.2625427246094, + "loss": 0.6513, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.37122949957847595, + "rewards/margins": 0.1012352854013443, + "rewards/rejected": 0.26999419927597046, + "step": 1654 + }, + { + "epoch": 0.2559443263096849, + "grad_norm": 4.119788646697998, + "learning_rate": 4.265463917525773e-06, + "logits/chosen": 13.585538864135742, + "logits/rejected": 9.719527244567871, + "logps/chosen": -247.7025146484375, + "logps/rejected": -235.82984924316406, + "loss": 0.5361, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.6978378295898438, + "rewards/margins": 0.3924921154975891, + "rewards/rejected": 0.30534565448760986, + "step": 1655 + }, + { + "epoch": 0.25609897544944904, + "grad_norm": 4.179389476776123, + "learning_rate": 4.2680412371134025e-06, + "logits/chosen": 13.3646240234375, + "logits/rejected": 6.416746139526367, + "logps/chosen": -302.365478515625, + "logps/rejected": -181.3973388671875, + "loss": 0.5955, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.5110149383544922, + "rewards/margins": 0.23698779940605164, + "rewards/rejected": 0.27402716875076294, + "step": 1656 + }, + { + "epoch": 0.25625362458921325, + "grad_norm": 11.048225402832031, + "learning_rate": 4.270618556701031e-06, + "logits/chosen": 5.747959136962891, + "logits/rejected": 6.425881385803223, + "logps/chosen": -410.8418273925781, + "logps/rejected": -422.99749755859375, + "loss": 0.6099, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.3581824004650116, + "rewards/margins": 0.2130957990884781, + "rewards/rejected": 0.1450866162776947, + "step": 1657 + }, + { + "epoch": 0.2564082737289774, + "grad_norm": 4.898638725280762, + "learning_rate": 4.27319587628866e-06, + "logits/chosen": 11.426324844360352, + "logits/rejected": 4.630359172821045, + "logps/chosen": -333.5100402832031, + "logps/rejected": -225.53717041015625, + "loss": 0.6479, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.4307011663913727, + "rewards/margins": 0.12580576539039612, + "rewards/rejected": 0.30489540100097656, + "step": 1658 + }, + { + "epoch": 0.25656292286874155, + "grad_norm": 6.974332809448242, + "learning_rate": 4.275773195876289e-06, + "logits/chosen": 11.037900924682617, + "logits/rejected": 9.508896827697754, + "logps/chosen": -236.75973510742188, + "logps/rejected": -258.734375, + "loss": 0.7501, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.5625379085540771, + "rewards/margins": 0.05542746186256409, + "rewards/rejected": 0.5071104764938354, + "step": 1659 + }, + { + "epoch": 0.2567175720085057, + "grad_norm": 4.777845859527588, + "learning_rate": 4.278350515463918e-06, + "logits/chosen": 12.794593811035156, + "logits/rejected": 14.51907730102539, + "logps/chosen": -234.52081298828125, + "logps/rejected": -265.2019958496094, + "loss": 0.5942, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.5771160125732422, + "rewards/margins": 0.2502594590187073, + "rewards/rejected": 0.3268565535545349, + "step": 1660 + }, + { + "epoch": 0.25687222114826985, + "grad_norm": 9.301328659057617, + "learning_rate": 4.2809278350515464e-06, + "logits/chosen": 6.797787189483643, + "logits/rejected": 0.6090431213378906, + "logps/chosen": -408.44281005859375, + "logps/rejected": -304.8644714355469, + "loss": 0.6822, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.2010902464389801, + "rewards/margins": 0.07193639874458313, + "rewards/rejected": 0.12915386259555817, + "step": 1661 + }, + { + "epoch": 0.257026870288034, + "grad_norm": 5.022156715393066, + "learning_rate": 4.283505154639176e-06, + "logits/chosen": 9.151747703552246, + "logits/rejected": 8.361489295959473, + "logps/chosen": -152.9624481201172, + "logps/rejected": -217.64102172851562, + "loss": 0.7814, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.3426362872123718, + "rewards/margins": -0.09035389125347137, + "rewards/rejected": 0.4329901933670044, + "step": 1662 + }, + { + "epoch": 0.25718151942779816, + "grad_norm": 6.543351650238037, + "learning_rate": 4.286082474226804e-06, + "logits/chosen": 4.54701566696167, + "logits/rejected": 3.7109899520874023, + "logps/chosen": -293.29534912109375, + "logps/rejected": -317.0552062988281, + "loss": 0.7445, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.5780367851257324, + "rewards/margins": -0.05578899383544922, + "rewards/rejected": 0.6338257789611816, + "step": 1663 + }, + { + "epoch": 0.25733616856756236, + "grad_norm": 5.040768146514893, + "learning_rate": 4.2886597938144335e-06, + "logits/chosen": 8.072590827941895, + "logits/rejected": 7.120708465576172, + "logps/chosen": -211.13418579101562, + "logps/rejected": -194.79547119140625, + "loss": 0.6281, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.49554014205932617, + "rewards/margins": 0.20108658075332642, + "rewards/rejected": 0.29445356130599976, + "step": 1664 + }, + { + "epoch": 0.2574908177073265, + "grad_norm": 7.257230281829834, + "learning_rate": 4.291237113402062e-06, + "logits/chosen": 10.890982627868652, + "logits/rejected": 5.684013843536377, + "logps/chosen": -232.88424682617188, + "logps/rejected": -133.4580078125, + "loss": 0.8374, + "rewards/accuracies": 0.25, + "rewards/chosen": 0.12326832860708237, + "rewards/margins": -0.23687195777893066, + "rewards/rejected": 0.36014029383659363, + "step": 1665 + }, + { + "epoch": 0.25764546684709067, + "grad_norm": 6.2694268226623535, + "learning_rate": 4.293814432989691e-06, + "logits/chosen": 10.840110778808594, + "logits/rejected": 8.990663528442383, + "logps/chosen": -203.58351135253906, + "logps/rejected": -160.192138671875, + "loss": 0.7588, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.26135969161987305, + "rewards/margins": -0.06229672580957413, + "rewards/rejected": 0.3236564099788666, + "step": 1666 + }, + { + "epoch": 0.2578001159868548, + "grad_norm": 4.647719860076904, + "learning_rate": 4.29639175257732e-06, + "logits/chosen": 3.5154929161071777, + "logits/rejected": 9.406538009643555, + "logps/chosen": -167.55267333984375, + "logps/rejected": -214.77041625976562, + "loss": 0.674, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.1570068895816803, + "rewards/margins": 0.09085395187139511, + "rewards/rejected": 0.06615294516086578, + "step": 1667 + }, + { + "epoch": 0.25795476512661897, + "grad_norm": 10.691215515136719, + "learning_rate": 4.298969072164949e-06, + "logits/chosen": 8.109671592712402, + "logits/rejected": 11.143040657043457, + "logps/chosen": -233.75888061523438, + "logps/rejected": -306.23980712890625, + "loss": 0.6186, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.3036283254623413, + "rewards/margins": 0.21426591277122498, + "rewards/rejected": 0.08936242759227753, + "step": 1668 + }, + { + "epoch": 0.2581094142663831, + "grad_norm": 6.247630596160889, + "learning_rate": 4.301546391752578e-06, + "logits/chosen": 6.889999866485596, + "logits/rejected": 3.4866364002227783, + "logps/chosen": -317.400634765625, + "logps/rejected": -247.3701171875, + "loss": 0.7172, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.2345121055841446, + "rewards/margins": -0.020523402839899063, + "rewards/rejected": 0.25503548979759216, + "step": 1669 + }, + { + "epoch": 0.2582640634061473, + "grad_norm": 7.309544563293457, + "learning_rate": 4.304123711340207e-06, + "logits/chosen": 12.075065612792969, + "logits/rejected": 9.863668441772461, + "logps/chosen": -461.16357421875, + "logps/rejected": -365.98004150390625, + "loss": 0.6289, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.5054642558097839, + "rewards/margins": 0.2024945318698883, + "rewards/rejected": 0.30296972393989563, + "step": 1670 + }, + { + "epoch": 0.2584187125459115, + "grad_norm": 5.27731990814209, + "learning_rate": 4.306701030927836e-06, + "logits/chosen": 10.16232681274414, + "logits/rejected": 12.613861083984375, + "logps/chosen": -293.225341796875, + "logps/rejected": -207.11866760253906, + "loss": 0.7588, + "rewards/accuracies": 0.25, + "rewards/chosen": 0.4922961890697479, + "rewards/margins": -0.09473396092653275, + "rewards/rejected": 0.5870301723480225, + "step": 1671 + }, + { + "epoch": 0.25857336168567563, + "grad_norm": 3.647665023803711, + "learning_rate": 4.3092783505154644e-06, + "logits/chosen": 9.743517875671387, + "logits/rejected": 5.776437759399414, + "logps/chosen": -228.10964965820312, + "logps/rejected": -208.38943481445312, + "loss": 0.5856, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.37795019149780273, + "rewards/margins": 0.2556622624397278, + "rewards/rejected": 0.12228794395923615, + "step": 1672 + }, + { + "epoch": 0.2587280108254398, + "grad_norm": 7.569202899932861, + "learning_rate": 4.311855670103094e-06, + "logits/chosen": 10.79423713684082, + "logits/rejected": 8.504213333129883, + "logps/chosen": -243.462646484375, + "logps/rejected": -225.04257202148438, + "loss": 0.7559, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.018120139837265015, + "rewards/margins": -0.08692573010921478, + "rewards/rejected": 0.1050458699464798, + "step": 1673 + }, + { + "epoch": 0.25888265996520393, + "grad_norm": 44.99276351928711, + "learning_rate": 4.314432989690722e-06, + "logits/chosen": 14.782356262207031, + "logits/rejected": 7.280447483062744, + "logps/chosen": -210.545166015625, + "logps/rejected": -175.98812866210938, + "loss": 0.6504, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.01798591949045658, + "rewards/margins": 0.16718244552612305, + "rewards/rejected": -0.18516835570335388, + "step": 1674 + }, + { + "epoch": 0.2590373091049681, + "grad_norm": 5.283841133117676, + "learning_rate": 4.3170103092783515e-06, + "logits/chosen": 12.421058654785156, + "logits/rejected": 8.152573585510254, + "logps/chosen": -263.1407775878906, + "logps/rejected": -192.9669647216797, + "loss": 0.6822, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.19494442641735077, + "rewards/margins": 0.02758745476603508, + "rewards/rejected": 0.1673569679260254, + "step": 1675 + }, + { + "epoch": 0.2591919582447323, + "grad_norm": 5.319368839263916, + "learning_rate": 4.31958762886598e-06, + "logits/chosen": 8.562289237976074, + "logits/rejected": 10.118972778320312, + "logps/chosen": -312.8533935546875, + "logps/rejected": -229.1541748046875, + "loss": 0.661, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.4450657367706299, + "rewards/margins": 0.13138996064662933, + "rewards/rejected": 0.31367579102516174, + "step": 1676 + }, + { + "epoch": 0.25934660738449644, + "grad_norm": 5.762267112731934, + "learning_rate": 4.322164948453608e-06, + "logits/chosen": 12.108770370483398, + "logits/rejected": 6.452144622802734, + "logps/chosen": -373.93341064453125, + "logps/rejected": -272.74200439453125, + "loss": 0.5323, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.47330552339553833, + "rewards/margins": 0.3820696473121643, + "rewards/rejected": 0.09123587608337402, + "step": 1677 + }, + { + "epoch": 0.2595012565242606, + "grad_norm": 5.103640556335449, + "learning_rate": 4.324742268041238e-06, + "logits/chosen": 15.442018508911133, + "logits/rejected": 5.498007297515869, + "logps/chosen": -192.68496704101562, + "logps/rejected": -119.8779067993164, + "loss": 0.7747, + "rewards/accuracies": 0.25, + "rewards/chosen": 0.3068602979183197, + "rewards/margins": -0.13264621794223785, + "rewards/rejected": 0.43950650095939636, + "step": 1678 + }, + { + "epoch": 0.25965590566402474, + "grad_norm": 6.467741012573242, + "learning_rate": 4.327319587628866e-06, + "logits/chosen": 4.662561416625977, + "logits/rejected": 7.070420265197754, + "logps/chosen": -190.84994506835938, + "logps/rejected": -263.6214599609375, + "loss": 0.7372, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.4969891309738159, + "rewards/margins": -0.020562991499900818, + "rewards/rejected": 0.5175521373748779, + "step": 1679 + }, + { + "epoch": 0.2598105548037889, + "grad_norm": 5.728135108947754, + "learning_rate": 4.329896907216495e-06, + "logits/chosen": 9.04445743560791, + "logits/rejected": 9.94682788848877, + "logps/chosen": -303.6062316894531, + "logps/rejected": -279.8541564941406, + "loss": 0.7454, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.444322407245636, + "rewards/margins": -0.039310432970523834, + "rewards/rejected": 0.4836328625679016, + "step": 1680 + }, + { + "epoch": 0.25996520394355305, + "grad_norm": 7.5171799659729, + "learning_rate": 4.332474226804124e-06, + "logits/chosen": 7.127584934234619, + "logits/rejected": 6.6687822341918945, + "logps/chosen": -326.9767150878906, + "logps/rejected": -347.5093688964844, + "loss": 0.6549, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.5778396725654602, + "rewards/margins": 0.191556915640831, + "rewards/rejected": 0.386282742023468, + "step": 1681 + }, + { + "epoch": 0.2601198530833172, + "grad_norm": 7.1069016456604, + "learning_rate": 4.335051546391753e-06, + "logits/chosen": 11.272130966186523, + "logits/rejected": 9.587542533874512, + "logps/chosen": -218.4287872314453, + "logps/rejected": -246.72174072265625, + "loss": 0.7376, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.16508060693740845, + "rewards/margins": -0.0521341934800148, + "rewards/rejected": 0.21721479296684265, + "step": 1682 + }, + { + "epoch": 0.2602745022230814, + "grad_norm": 9.286980628967285, + "learning_rate": 4.337628865979382e-06, + "logits/chosen": 9.217291831970215, + "logits/rejected": 12.489313125610352, + "logps/chosen": -252.23968505859375, + "logps/rejected": -276.44757080078125, + "loss": 0.8488, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.1445694863796234, + "rewards/margins": -0.14328508079051971, + "rewards/rejected": -0.0012844055891036987, + "step": 1683 + }, + { + "epoch": 0.26042915136284556, + "grad_norm": 4.6686553955078125, + "learning_rate": 4.340206185567011e-06, + "logits/chosen": 11.10117244720459, + "logits/rejected": 6.254631519317627, + "logps/chosen": -384.20159912109375, + "logps/rejected": -287.5000305175781, + "loss": 0.5521, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.558184027671814, + "rewards/margins": 0.44858017563819885, + "rewards/rejected": 0.1096038818359375, + "step": 1684 + }, + { + "epoch": 0.2605838005026097, + "grad_norm": 4.128678798675537, + "learning_rate": 4.342783505154639e-06, + "logits/chosen": 7.401800155639648, + "logits/rejected": 4.638880252838135, + "logps/chosen": -181.85731506347656, + "logps/rejected": -173.14053344726562, + "loss": 0.6075, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.3325316309928894, + "rewards/margins": 0.18382640182971954, + "rewards/rejected": 0.14870524406433105, + "step": 1685 + }, + { + "epoch": 0.26073844964237386, + "grad_norm": 5.629097938537598, + "learning_rate": 4.345360824742269e-06, + "logits/chosen": 12.161772727966309, + "logits/rejected": 3.5483956336975098, + "logps/chosen": -326.270263671875, + "logps/rejected": -203.72274780273438, + "loss": 0.5637, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.6203939318656921, + "rewards/margins": 0.31491580605506897, + "rewards/rejected": 0.30547815561294556, + "step": 1686 + }, + { + "epoch": 0.260893098782138, + "grad_norm": 4.986377239227295, + "learning_rate": 4.347938144329897e-06, + "logits/chosen": 7.248554706573486, + "logits/rejected": 3.0901105403900146, + "logps/chosen": -230.0486297607422, + "logps/rejected": -199.36036682128906, + "loss": 0.6536, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.2983247637748718, + "rewards/margins": 0.15213394165039062, + "rewards/rejected": 0.1461908370256424, + "step": 1687 + }, + { + "epoch": 0.26104774792190216, + "grad_norm": 5.384634017944336, + "learning_rate": 4.350515463917526e-06, + "logits/chosen": 10.910951614379883, + "logits/rejected": 7.930371284484863, + "logps/chosen": -265.8597106933594, + "logps/rejected": -133.46841430664062, + "loss": 0.7254, + "rewards/accuracies": 0.25, + "rewards/chosen": 0.2534804940223694, + "rewards/margins": 0.0074874237179756165, + "rewards/rejected": 0.24599304795265198, + "step": 1688 + }, + { + "epoch": 0.26120239706166637, + "grad_norm": 6.221289157867432, + "learning_rate": 4.353092783505155e-06, + "logits/chosen": 2.6747543811798096, + "logits/rejected": 8.717193603515625, + "logps/chosen": -171.8345947265625, + "logps/rejected": -222.59078979492188, + "loss": 0.7153, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.19862869381904602, + "rewards/margins": 0.056714288890361786, + "rewards/rejected": 0.14191439747810364, + "step": 1689 + }, + { + "epoch": 0.2613570462014305, + "grad_norm": 6.161037921905518, + "learning_rate": 4.355670103092784e-06, + "logits/chosen": 15.940458297729492, + "logits/rejected": 7.757214546203613, + "logps/chosen": -455.1711730957031, + "logps/rejected": -309.407470703125, + "loss": 0.5328, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.7765721678733826, + "rewards/margins": 0.3917813301086426, + "rewards/rejected": 0.3847908079624176, + "step": 1690 + }, + { + "epoch": 0.26151169534119467, + "grad_norm": 5.468064785003662, + "learning_rate": 4.358247422680413e-06, + "logits/chosen": 11.471540451049805, + "logits/rejected": 12.728561401367188, + "logps/chosen": -262.777587890625, + "logps/rejected": -268.7331237792969, + "loss": 0.6184, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.304311603307724, + "rewards/margins": 0.16495707631111145, + "rewards/rejected": 0.13935452699661255, + "step": 1691 + }, + { + "epoch": 0.2616663444809588, + "grad_norm": 5.665408134460449, + "learning_rate": 4.360824742268042e-06, + "logits/chosen": 8.737391471862793, + "logits/rejected": 7.977653503417969, + "logps/chosen": -258.18414306640625, + "logps/rejected": -266.08135986328125, + "loss": 0.6747, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.505414605140686, + "rewards/margins": 0.05552331358194351, + "rewards/rejected": 0.44989126920700073, + "step": 1692 + }, + { + "epoch": 0.261820993620723, + "grad_norm": 4.014686107635498, + "learning_rate": 4.36340206185567e-06, + "logits/chosen": 16.23103141784668, + "logits/rejected": 4.625252723693848, + "logps/chosen": -248.75076293945312, + "logps/rejected": -174.1435089111328, + "loss": 0.5379, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.5051984786987305, + "rewards/margins": 0.39069533348083496, + "rewards/rejected": 0.11450319737195969, + "step": 1693 + }, + { + "epoch": 0.2619756427604871, + "grad_norm": 5.6513352394104, + "learning_rate": 4.3659793814433e-06, + "logits/chosen": 11.060733795166016, + "logits/rejected": 10.457305908203125, + "logps/chosen": -246.74212646484375, + "logps/rejected": -221.2449951171875, + "loss": 0.7504, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.3163064122200012, + "rewards/margins": -0.08438654243946075, + "rewards/rejected": 0.4006929397583008, + "step": 1694 + }, + { + "epoch": 0.2621302919002513, + "grad_norm": 5.130451679229736, + "learning_rate": 4.368556701030928e-06, + "logits/chosen": 12.811918258666992, + "logits/rejected": 9.328104972839355, + "logps/chosen": -163.4132843017578, + "logps/rejected": -154.5377197265625, + "loss": 0.7893, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.14657628536224365, + "rewards/margins": -0.1565559357404709, + "rewards/rejected": 0.30313220620155334, + "step": 1695 + }, + { + "epoch": 0.2622849410400155, + "grad_norm": 5.4119648933410645, + "learning_rate": 4.371134020618557e-06, + "logits/chosen": 11.776997566223145, + "logits/rejected": 3.36003041267395, + "logps/chosen": -283.3559875488281, + "logps/rejected": -194.5707244873047, + "loss": 0.5654, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.34145188331604004, + "rewards/margins": 0.34150564670562744, + "rewards/rejected": -5.374103784561157e-05, + "step": 1696 + }, + { + "epoch": 0.26243959017977964, + "grad_norm": 5.5374603271484375, + "learning_rate": 4.373711340206186e-06, + "logits/chosen": 10.599066734313965, + "logits/rejected": 10.56446361541748, + "logps/chosen": -321.669189453125, + "logps/rejected": -271.11395263671875, + "loss": 0.7352, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.5816910266876221, + "rewards/margins": 0.0010752901434898376, + "rewards/rejected": 0.580615758895874, + "step": 1697 + }, + { + "epoch": 0.2625942393195438, + "grad_norm": 4.938562393188477, + "learning_rate": 4.376288659793815e-06, + "logits/chosen": 11.090388298034668, + "logits/rejected": 13.657381057739258, + "logps/chosen": -226.67254638671875, + "logps/rejected": -220.35751342773438, + "loss": 0.6667, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.38853949308395386, + "rewards/margins": 0.07145550847053528, + "rewards/rejected": 0.3170839846134186, + "step": 1698 + }, + { + "epoch": 0.26274888845930794, + "grad_norm": 6.904003620147705, + "learning_rate": 4.3788659793814436e-06, + "logits/chosen": 5.284897327423096, + "logits/rejected": 8.073166847229004, + "logps/chosen": -289.9504089355469, + "logps/rejected": -265.20428466796875, + "loss": 0.6191, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.41621050238609314, + "rewards/margins": 0.22259020805358887, + "rewards/rejected": 0.19362029433250427, + "step": 1699 + }, + { + "epoch": 0.2629035375990721, + "grad_norm": 6.463097095489502, + "learning_rate": 4.381443298969073e-06, + "logits/chosen": 10.876604080200195, + "logits/rejected": 13.045934677124023, + "logps/chosen": -255.5340118408203, + "logps/rejected": -292.7287902832031, + "loss": 0.8986, + "rewards/accuracies": 0.25, + "rewards/chosen": 0.08803338557481766, + "rewards/margins": -0.3250718116760254, + "rewards/rejected": 0.41310518980026245, + "step": 1700 + }, + { + "epoch": 0.26305818673883624, + "grad_norm": 7.426027774810791, + "learning_rate": 4.384020618556701e-06, + "logits/chosen": 11.007621765136719, + "logits/rejected": 6.974452972412109, + "logps/chosen": -433.1227111816406, + "logps/rejected": -328.71759033203125, + "loss": 0.7557, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.39031946659088135, + "rewards/margins": -0.06136823073029518, + "rewards/rejected": 0.45168769359588623, + "step": 1701 + }, + { + "epoch": 0.26321283587860045, + "grad_norm": 8.258471488952637, + "learning_rate": 4.386597938144331e-06, + "logits/chosen": 11.523098945617676, + "logits/rejected": 6.325174331665039, + "logps/chosen": -483.7724304199219, + "logps/rejected": -299.81927490234375, + "loss": 0.7251, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.20430830121040344, + "rewards/margins": -0.03184375911951065, + "rewards/rejected": 0.23615208268165588, + "step": 1702 + }, + { + "epoch": 0.2633674850183646, + "grad_norm": 7.3353986740112305, + "learning_rate": 4.389175257731959e-06, + "logits/chosen": 7.863901138305664, + "logits/rejected": 13.702529907226562, + "logps/chosen": -255.89523315429688, + "logps/rejected": -355.883056640625, + "loss": 0.8622, + "rewards/accuracies": 0.25, + "rewards/chosen": 0.09760475158691406, + "rewards/margins": -0.2865573763847351, + "rewards/rejected": 0.38416212797164917, + "step": 1703 + }, + { + "epoch": 0.26352213415812875, + "grad_norm": 5.180741310119629, + "learning_rate": 4.391752577319588e-06, + "logits/chosen": 16.025365829467773, + "logits/rejected": 7.518298149108887, + "logps/chosen": -442.07183837890625, + "logps/rejected": -320.32086181640625, + "loss": 0.5579, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.6019032001495361, + "rewards/margins": 0.3831135630607605, + "rewards/rejected": 0.21878957748413086, + "step": 1704 + }, + { + "epoch": 0.2636767832978929, + "grad_norm": 5.708284378051758, + "learning_rate": 4.394329896907217e-06, + "logits/chosen": 6.307971000671387, + "logits/rejected": 4.043267726898193, + "logps/chosen": -271.7491149902344, + "logps/rejected": -216.861083984375, + "loss": 0.6418, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.1972552090883255, + "rewards/margins": 0.2408042699098587, + "rewards/rejected": -0.0435490608215332, + "step": 1705 + }, + { + "epoch": 0.26383143243765705, + "grad_norm": 4.850820064544678, + "learning_rate": 4.396907216494845e-06, + "logits/chosen": 10.822961807250977, + "logits/rejected": 7.7307538986206055, + "logps/chosen": -199.04415893554688, + "logps/rejected": -173.9315948486328, + "loss": 0.6792, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.45624539256095886, + "rewards/margins": 0.10952162742614746, + "rewards/rejected": 0.3467237651348114, + "step": 1706 + }, + { + "epoch": 0.2639860815774212, + "grad_norm": 7.11625862121582, + "learning_rate": 4.3994845360824745e-06, + "logits/chosen": 7.0849151611328125, + "logits/rejected": 7.574535369873047, + "logps/chosen": -406.97882080078125, + "logps/rejected": -318.5047607421875, + "loss": 0.7184, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.2544782757759094, + "rewards/margins": 0.004982903599739075, + "rewards/rejected": 0.24949535727500916, + "step": 1707 + }, + { + "epoch": 0.2641407307171854, + "grad_norm": 4.960830211639404, + "learning_rate": 4.402061855670103e-06, + "logits/chosen": 9.605134963989258, + "logits/rejected": 13.243675231933594, + "logps/chosen": -202.01596069335938, + "logps/rejected": -214.9547119140625, + "loss": 0.5788, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.5835309624671936, + "rewards/margins": 0.27158015966415405, + "rewards/rejected": 0.31195080280303955, + "step": 1708 + }, + { + "epoch": 0.26429537985694956, + "grad_norm": 5.407628059387207, + "learning_rate": 4.404639175257732e-06, + "logits/chosen": 10.823848724365234, + "logits/rejected": 3.217395782470703, + "logps/chosen": -341.1961669921875, + "logps/rejected": -243.29244995117188, + "loss": 0.6608, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.4170490503311157, + "rewards/margins": 0.11604584753513336, + "rewards/rejected": 0.3010031580924988, + "step": 1709 + }, + { + "epoch": 0.2644500289967137, + "grad_norm": 5.524591445922852, + "learning_rate": 4.407216494845361e-06, + "logits/chosen": 5.351621150970459, + "logits/rejected": 6.015663146972656, + "logps/chosen": -225.0330047607422, + "logps/rejected": -236.33151245117188, + "loss": 0.6225, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.3774985373020172, + "rewards/margins": 0.1686972975730896, + "rewards/rejected": 0.20880120992660522, + "step": 1710 + }, + { + "epoch": 0.26460467813647787, + "grad_norm": 6.06163215637207, + "learning_rate": 4.40979381443299e-06, + "logits/chosen": 9.44446849822998, + "logits/rejected": 11.471393585205078, + "logps/chosen": -210.42742919921875, + "logps/rejected": -231.7503662109375, + "loss": 0.8547, + "rewards/accuracies": 0.25, + "rewards/chosen": 0.4026919901371002, + "rewards/margins": -0.2805594801902771, + "rewards/rejected": 0.6832515001296997, + "step": 1711 + }, + { + "epoch": 0.264759327276242, + "grad_norm": 7.141834259033203, + "learning_rate": 4.4123711340206185e-06, + "logits/chosen": 3.8508729934692383, + "logits/rejected": -0.2764413356781006, + "logps/chosen": -281.7173767089844, + "logps/rejected": -257.2532958984375, + "loss": 0.678, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.3090146780014038, + "rewards/margins": 0.06591783463954926, + "rewards/rejected": 0.24309682846069336, + "step": 1712 + }, + { + "epoch": 0.26491397641600617, + "grad_norm": 5.941365718841553, + "learning_rate": 4.414948453608248e-06, + "logits/chosen": 9.955275535583496, + "logits/rejected": 13.661100387573242, + "logps/chosen": -323.3267517089844, + "logps/rejected": -266.95220947265625, + "loss": 0.6119, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.3876960277557373, + "rewards/margins": 0.17844253778457642, + "rewards/rejected": 0.20925350487232208, + "step": 1713 + }, + { + "epoch": 0.2650686255557703, + "grad_norm": 3.762624740600586, + "learning_rate": 4.417525773195876e-06, + "logits/chosen": 15.55923843383789, + "logits/rejected": 10.97128677368164, + "logps/chosen": -271.4499206542969, + "logps/rejected": -244.6823272705078, + "loss": 0.5464, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.7321747541427612, + "rewards/margins": 0.3452012240886688, + "rewards/rejected": 0.38697350025177, + "step": 1714 + }, + { + "epoch": 0.2652232746955345, + "grad_norm": 4.991570472717285, + "learning_rate": 4.4201030927835055e-06, + "logits/chosen": 9.879316329956055, + "logits/rejected": 9.301066398620605, + "logps/chosen": -282.86724853515625, + "logps/rejected": -261.4393310546875, + "loss": 0.6509, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.4787258803844452, + "rewards/margins": 0.12435813248157501, + "rewards/rejected": 0.354367733001709, + "step": 1715 + }, + { + "epoch": 0.2653779238352987, + "grad_norm": 5.48406982421875, + "learning_rate": 4.422680412371134e-06, + "logits/chosen": 13.096549034118652, + "logits/rejected": 9.426155090332031, + "logps/chosen": -362.8670959472656, + "logps/rejected": -337.4403381347656, + "loss": 0.5968, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.5546704530715942, + "rewards/margins": 0.23997437953948975, + "rewards/rejected": 0.3146960735321045, + "step": 1716 + }, + { + "epoch": 0.26553257297506283, + "grad_norm": 4.979221343994141, + "learning_rate": 4.425257731958763e-06, + "logits/chosen": 10.707191467285156, + "logits/rejected": 7.892974376678467, + "logps/chosen": -299.4150085449219, + "logps/rejected": -316.5184631347656, + "loss": 0.5376, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.626193642616272, + "rewards/margins": 0.36955520510673523, + "rewards/rejected": 0.25663843750953674, + "step": 1717 + }, + { + "epoch": 0.265687222114827, + "grad_norm": 6.365272045135498, + "learning_rate": 4.427835051546392e-06, + "logits/chosen": 16.767621994018555, + "logits/rejected": 8.144224166870117, + "logps/chosen": -425.71563720703125, + "logps/rejected": -292.3013916015625, + "loss": 0.614, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.7393825054168701, + "rewards/margins": 0.20860975980758667, + "rewards/rejected": 0.5307728052139282, + "step": 1718 + }, + { + "epoch": 0.26584187125459113, + "grad_norm": 16.35604476928711, + "learning_rate": 4.430412371134021e-06, + "logits/chosen": 13.156147003173828, + "logits/rejected": 13.855541229248047, + "logps/chosen": -273.77984619140625, + "logps/rejected": -252.74386596679688, + "loss": 0.7051, + "rewards/accuracies": 0.25, + "rewards/chosen": 0.5535398721694946, + "rewards/margins": -0.001130755990743637, + "rewards/rejected": 0.5546705722808838, + "step": 1719 + }, + { + "epoch": 0.2659965203943553, + "grad_norm": 5.132080554962158, + "learning_rate": 4.4329896907216494e-06, + "logits/chosen": 10.233236312866211, + "logits/rejected": 11.238988876342773, + "logps/chosen": -325.6272888183594, + "logps/rejected": -342.97979736328125, + "loss": 0.679, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.3435250520706177, + "rewards/margins": 0.08480892330408096, + "rewards/rejected": 0.2587161064147949, + "step": 1720 + }, + { + "epoch": 0.2661511695341195, + "grad_norm": 5.789462566375732, + "learning_rate": 4.435567010309279e-06, + "logits/chosen": 11.562353134155273, + "logits/rejected": 9.545722007751465, + "logps/chosen": -341.5428466796875, + "logps/rejected": -295.9103088378906, + "loss": 0.4605, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.6406839489936829, + "rewards/margins": 0.5698068737983704, + "rewards/rejected": 0.07087710499763489, + "step": 1721 + }, + { + "epoch": 0.26630581867388364, + "grad_norm": 4.944628715515137, + "learning_rate": 4.438144329896907e-06, + "logits/chosen": 7.905723571777344, + "logits/rejected": 1.8851712942123413, + "logps/chosen": -295.4744567871094, + "logps/rejected": -176.78048706054688, + "loss": 0.5922, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.5240675210952759, + "rewards/margins": 0.23503799736499786, + "rewards/rejected": 0.2890295088291168, + "step": 1722 + }, + { + "epoch": 0.2664604678136478, + "grad_norm": 5.386043071746826, + "learning_rate": 4.4407216494845365e-06, + "logits/chosen": 7.1710920333862305, + "logits/rejected": 7.8755693435668945, + "logps/chosen": -266.4452209472656, + "logps/rejected": -274.92462158203125, + "loss": 0.6017, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.8069710731506348, + "rewards/margins": 0.21230602264404297, + "rewards/rejected": 0.5946650505065918, + "step": 1723 + }, + { + "epoch": 0.26661511695341195, + "grad_norm": 5.013392448425293, + "learning_rate": 4.443298969072165e-06, + "logits/chosen": 9.766303062438965, + "logits/rejected": 6.226452827453613, + "logps/chosen": -284.0867614746094, + "logps/rejected": -198.92022705078125, + "loss": 0.6465, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.37502872943878174, + "rewards/margins": 0.19307051599025726, + "rewards/rejected": 0.18195819854736328, + "step": 1724 + }, + { + "epoch": 0.2667697660931761, + "grad_norm": 5.038034915924072, + "learning_rate": 4.445876288659794e-06, + "logits/chosen": 11.282554626464844, + "logits/rejected": 6.617173194885254, + "logps/chosen": -264.6533203125, + "logps/rejected": -198.9580535888672, + "loss": 0.5856, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.4753740429878235, + "rewards/margins": 0.35207927227020264, + "rewards/rejected": 0.12329478561878204, + "step": 1725 + }, + { + "epoch": 0.26692441523294025, + "grad_norm": 5.749571323394775, + "learning_rate": 4.448453608247423e-06, + "logits/chosen": 10.442389488220215, + "logits/rejected": 14.080352783203125, + "logps/chosen": -257.47186279296875, + "logps/rejected": -313.631591796875, + "loss": 0.7405, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.4114357829093933, + "rewards/margins": -0.018024668097496033, + "rewards/rejected": 0.42946046590805054, + "step": 1726 + }, + { + "epoch": 0.2670790643727044, + "grad_norm": 5.406755447387695, + "learning_rate": 4.451030927835052e-06, + "logits/chosen": 13.89488410949707, + "logits/rejected": 7.5984344482421875, + "logps/chosen": -538.5991821289062, + "logps/rejected": -347.7793884277344, + "loss": 0.5461, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.8351243138313293, + "rewards/margins": 0.4403015375137329, + "rewards/rejected": 0.39482277631759644, + "step": 1727 + }, + { + "epoch": 0.2672337135124686, + "grad_norm": 5.1755452156066895, + "learning_rate": 4.453608247422681e-06, + "logits/chosen": 14.283370971679688, + "logits/rejected": 9.318902015686035, + "logps/chosen": -230.26361083984375, + "logps/rejected": -198.26577758789062, + "loss": 0.6505, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.22499942779541016, + "rewards/margins": 0.12262983620166779, + "rewards/rejected": 0.10236959159374237, + "step": 1728 + }, + { + "epoch": 0.26738836265223276, + "grad_norm": 7.17335844039917, + "learning_rate": 4.45618556701031e-06, + "logits/chosen": -0.17161251604557037, + "logits/rejected": 4.816195964813232, + "logps/chosen": -227.96697998046875, + "logps/rejected": -295.21270751953125, + "loss": 0.8224, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.39172661304473877, + "rewards/margins": -0.1557048112154007, + "rewards/rejected": 0.5474314093589783, + "step": 1729 + }, + { + "epoch": 0.2675430117919969, + "grad_norm": 5.123386859893799, + "learning_rate": 4.458762886597939e-06, + "logits/chosen": 10.307762145996094, + "logits/rejected": 5.519076824188232, + "logps/chosen": -287.9240417480469, + "logps/rejected": -260.66473388671875, + "loss": 0.6445, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.2875879406929016, + "rewards/margins": 0.20955267548561096, + "rewards/rejected": 0.07803526520729065, + "step": 1730 + }, + { + "epoch": 0.26769766093176106, + "grad_norm": 6.955427646636963, + "learning_rate": 4.4613402061855675e-06, + "logits/chosen": 10.058084487915039, + "logits/rejected": 7.790159225463867, + "logps/chosen": -392.1209716796875, + "logps/rejected": -346.4644470214844, + "loss": 0.8549, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.5513603687286377, + "rewards/margins": -0.15822352468967438, + "rewards/rejected": 0.7095838189125061, + "step": 1731 + }, + { + "epoch": 0.2678523100715252, + "grad_norm": 6.139858245849609, + "learning_rate": 4.463917525773197e-06, + "logits/chosen": 10.285011291503906, + "logits/rejected": 11.999436378479004, + "logps/chosen": -199.24026489257812, + "logps/rejected": -302.02435302734375, + "loss": 0.6686, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.06514232605695724, + "rewards/margins": 0.11715100705623627, + "rewards/rejected": -0.052008673548698425, + "step": 1732 + }, + { + "epoch": 0.26800695921128936, + "grad_norm": 23.301212310791016, + "learning_rate": 4.466494845360825e-06, + "logits/chosen": 13.485984802246094, + "logits/rejected": 12.20804500579834, + "logps/chosen": -209.71197509765625, + "logps/rejected": -210.00933837890625, + "loss": 0.7298, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.11429931223392487, + "rewards/margins": -0.04964572936296463, + "rewards/rejected": 0.1639450639486313, + "step": 1733 + }, + { + "epoch": 0.26816160835105357, + "grad_norm": 3.736797332763672, + "learning_rate": 4.4690721649484545e-06, + "logits/chosen": 8.650613784790039, + "logits/rejected": 6.061467170715332, + "logps/chosen": -176.56268310546875, + "logps/rejected": -146.519287109375, + "loss": 0.5244, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.4185592532157898, + "rewards/margins": 0.5009682178497314, + "rewards/rejected": -0.08240897953510284, + "step": 1734 + }, + { + "epoch": 0.2683162574908177, + "grad_norm": 6.25091552734375, + "learning_rate": 4.471649484536083e-06, + "logits/chosen": 12.991402626037598, + "logits/rejected": 9.105416297912598, + "logps/chosen": -230.97344970703125, + "logps/rejected": -192.5790252685547, + "loss": 0.7408, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.10231724381446838, + "rewards/margins": -0.06344317644834518, + "rewards/rejected": 0.16576042771339417, + "step": 1735 + }, + { + "epoch": 0.2684709066305819, + "grad_norm": 5.054879188537598, + "learning_rate": 4.474226804123712e-06, + "logits/chosen": 7.346775054931641, + "logits/rejected": 5.71868896484375, + "logps/chosen": -269.82110595703125, + "logps/rejected": -210.7763671875, + "loss": 0.6934, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.510530948638916, + "rewards/margins": 0.0674944818019867, + "rewards/rejected": 0.44303640723228455, + "step": 1736 + }, + { + "epoch": 0.268625555770346, + "grad_norm": 8.783943176269531, + "learning_rate": 4.476804123711341e-06, + "logits/chosen": 8.429193496704102, + "logits/rejected": 7.957465171813965, + "logps/chosen": -388.2986145019531, + "logps/rejected": -327.13800048828125, + "loss": 0.8703, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.16317768394947052, + "rewards/margins": -0.11072373390197754, + "rewards/rejected": 0.27390143275260925, + "step": 1737 + }, + { + "epoch": 0.2687802049101102, + "grad_norm": 5.733576774597168, + "learning_rate": 4.47938144329897e-06, + "logits/chosen": 4.806110858917236, + "logits/rejected": 3.667363405227661, + "logps/chosen": -382.98919677734375, + "logps/rejected": -364.3193359375, + "loss": 0.6953, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.5008050203323364, + "rewards/margins": 0.13074997067451477, + "rewards/rejected": 0.37005510926246643, + "step": 1738 + }, + { + "epoch": 0.26893485404987433, + "grad_norm": 10.084681510925293, + "learning_rate": 4.4819587628865984e-06, + "logits/chosen": 12.417266845703125, + "logits/rejected": 7.827620506286621, + "logps/chosen": -422.2778625488281, + "logps/rejected": -310.5722961425781, + "loss": 0.4593, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.6576128602027893, + "rewards/margins": 0.6134498715400696, + "rewards/rejected": 0.044163040816783905, + "step": 1739 + }, + { + "epoch": 0.26908950318963853, + "grad_norm": 5.354651927947998, + "learning_rate": 4.484536082474228e-06, + "logits/chosen": 11.033258438110352, + "logits/rejected": 10.85450267791748, + "logps/chosen": -260.737060546875, + "logps/rejected": -284.7623596191406, + "loss": 0.6446, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.5706996917724609, + "rewards/margins": 0.18445071578025818, + "rewards/rejected": 0.38624900579452515, + "step": 1740 + }, + { + "epoch": 0.2692441523294027, + "grad_norm": 6.8571553230285645, + "learning_rate": 4.487113402061856e-06, + "logits/chosen": 9.52800464630127, + "logits/rejected": 8.58477783203125, + "logps/chosen": -331.73193359375, + "logps/rejected": -230.53656005859375, + "loss": 0.6536, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.28750234842300415, + "rewards/margins": 0.09903968870639801, + "rewards/rejected": 0.18846264481544495, + "step": 1741 + }, + { + "epoch": 0.26939880146916684, + "grad_norm": 5.678842544555664, + "learning_rate": 4.4896907216494855e-06, + "logits/chosen": 9.500005722045898, + "logits/rejected": 4.818561553955078, + "logps/chosen": -299.9071044921875, + "logps/rejected": -218.9936981201172, + "loss": 0.721, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.051835354417562485, + "rewards/margins": 0.0384562611579895, + "rewards/rejected": 0.013379089534282684, + "step": 1742 + }, + { + "epoch": 0.269553450608931, + "grad_norm": 8.079463005065918, + "learning_rate": 4.492268041237114e-06, + "logits/chosen": 10.020328521728516, + "logits/rejected": 10.742426872253418, + "logps/chosen": -280.5080261230469, + "logps/rejected": -320.8731689453125, + "loss": 0.8585, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.3448030650615692, + "rewards/margins": -0.21688544750213623, + "rewards/rejected": 0.5616885423660278, + "step": 1743 + }, + { + "epoch": 0.26970809974869514, + "grad_norm": 5.063076019287109, + "learning_rate": 4.494845360824742e-06, + "logits/chosen": 12.055166244506836, + "logits/rejected": 10.694090843200684, + "logps/chosen": -306.79461669921875, + "logps/rejected": -290.21490478515625, + "loss": 0.6368, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.375826895236969, + "rewards/margins": 0.17593315243721008, + "rewards/rejected": 0.1998937726020813, + "step": 1744 + }, + { + "epoch": 0.2698627488884593, + "grad_norm": 5.686788558959961, + "learning_rate": 4.497422680412372e-06, + "logits/chosen": 13.565349578857422, + "logits/rejected": 12.07940673828125, + "logps/chosen": -241.08929443359375, + "logps/rejected": -207.55294799804688, + "loss": 0.749, + "rewards/accuracies": 0.125, + "rewards/chosen": 0.19779139757156372, + "rewards/margins": -0.10124626755714417, + "rewards/rejected": 0.2990376651287079, + "step": 1745 + }, + { + "epoch": 0.27001739802822344, + "grad_norm": 3.8289053440093994, + "learning_rate": 4.5e-06, + "logits/chosen": 9.720939636230469, + "logits/rejected": 6.20883846282959, + "logps/chosen": -198.0955810546875, + "logps/rejected": -175.57017517089844, + "loss": 0.6088, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.38892191648483276, + "rewards/margins": 0.22377412021160126, + "rewards/rejected": 0.1651478111743927, + "step": 1746 + }, + { + "epoch": 0.27017204716798765, + "grad_norm": 5.094119548797607, + "learning_rate": 4.502577319587629e-06, + "logits/chosen": 7.088400840759277, + "logits/rejected": 8.52591323852539, + "logps/chosen": -238.23483276367188, + "logps/rejected": -290.15411376953125, + "loss": 0.6651, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.2936222553253174, + "rewards/margins": 0.09353528171777725, + "rewards/rejected": 0.20008698105812073, + "step": 1747 + }, + { + "epoch": 0.2703266963077518, + "grad_norm": 7.345438480377197, + "learning_rate": 4.505154639175258e-06, + "logits/chosen": 3.275681495666504, + "logits/rejected": 3.0518741607666016, + "logps/chosen": -223.5284423828125, + "logps/rejected": -251.5111083984375, + "loss": 0.6543, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.3825116753578186, + "rewards/margins": 0.174207866191864, + "rewards/rejected": 0.2083037942647934, + "step": 1748 + }, + { + "epoch": 0.27048134544751595, + "grad_norm": 5.7420973777771, + "learning_rate": 4.507731958762887e-06, + "logits/chosen": 13.98678207397461, + "logits/rejected": 13.518287658691406, + "logps/chosen": -310.6715393066406, + "logps/rejected": -307.84368896484375, + "loss": 0.6516, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.29958975315093994, + "rewards/margins": 0.1392795592546463, + "rewards/rejected": 0.16031017899513245, + "step": 1749 + }, + { + "epoch": 0.2706359945872801, + "grad_norm": 13.066579818725586, + "learning_rate": 4.510309278350516e-06, + "logits/chosen": 8.892087936401367, + "logits/rejected": 14.950610160827637, + "logps/chosen": -230.78277587890625, + "logps/rejected": -292.1122131347656, + "loss": 0.7221, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.22716808319091797, + "rewards/margins": -0.04069451242685318, + "rewards/rejected": 0.26786261796951294, + "step": 1750 + }, + { + "epoch": 0.27079064372704426, + "grad_norm": 6.974279880523682, + "learning_rate": 4.512886597938145e-06, + "logits/chosen": 10.362371444702148, + "logits/rejected": 7.3704938888549805, + "logps/chosen": -261.9510192871094, + "logps/rejected": -275.69244384765625, + "loss": 0.6753, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.27854663133621216, + "rewards/margins": 0.07175316661596298, + "rewards/rejected": 0.20679347217082977, + "step": 1751 + }, + { + "epoch": 0.2709452928668084, + "grad_norm": 6.656082630157471, + "learning_rate": 4.515463917525773e-06, + "logits/chosen": 8.17276668548584, + "logits/rejected": 9.863580703735352, + "logps/chosen": -323.19586181640625, + "logps/rejected": -431.58294677734375, + "loss": 0.732, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.8858366012573242, + "rewards/margins": 0.03014180064201355, + "rewards/rejected": 0.8556947708129883, + "step": 1752 + }, + { + "epoch": 0.2710999420065726, + "grad_norm": 5.26326322555542, + "learning_rate": 4.518041237113403e-06, + "logits/chosen": 14.49161434173584, + "logits/rejected": 6.8541083335876465, + "logps/chosen": -285.4656982421875, + "logps/rejected": -214.22303771972656, + "loss": 0.5935, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.5470476746559143, + "rewards/margins": 0.2501283884048462, + "rewards/rejected": 0.2969193458557129, + "step": 1753 + }, + { + "epoch": 0.27125459114633677, + "grad_norm": 5.698396682739258, + "learning_rate": 4.520618556701031e-06, + "logits/chosen": 4.595928192138672, + "logits/rejected": 4.694220066070557, + "logps/chosen": -200.26171875, + "logps/rejected": -192.99400329589844, + "loss": 0.7333, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.34854525327682495, + "rewards/margins": -0.011685170233249664, + "rewards/rejected": 0.3602304458618164, + "step": 1754 + }, + { + "epoch": 0.2714092402861009, + "grad_norm": 4.87807035446167, + "learning_rate": 4.52319587628866e-06, + "logits/chosen": 10.981599807739258, + "logits/rejected": 4.471719264984131, + "logps/chosen": -234.43411254882812, + "logps/rejected": -153.1018524169922, + "loss": 0.6299, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.5472139120101929, + "rewards/margins": 0.15522579848766327, + "rewards/rejected": 0.3919881582260132, + "step": 1755 + }, + { + "epoch": 0.27156388942586507, + "grad_norm": 54.97401428222656, + "learning_rate": 4.525773195876289e-06, + "logits/chosen": 10.627900123596191, + "logits/rejected": 11.327165603637695, + "logps/chosen": -336.5604248046875, + "logps/rejected": -403.004150390625, + "loss": 0.627, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.6421036124229431, + "rewards/margins": 0.16009390354156494, + "rewards/rejected": 0.4820097088813782, + "step": 1756 + }, + { + "epoch": 0.2717185385656292, + "grad_norm": 4.172118186950684, + "learning_rate": 4.528350515463918e-06, + "logits/chosen": 14.645185470581055, + "logits/rejected": 10.396769523620605, + "logps/chosen": -321.38385009765625, + "logps/rejected": -217.132568359375, + "loss": 0.5333, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.4707261025905609, + "rewards/margins": 0.38813915848731995, + "rewards/rejected": 0.08258695900440216, + "step": 1757 + }, + { + "epoch": 0.27187318770539337, + "grad_norm": 10.270281791687012, + "learning_rate": 4.5309278350515466e-06, + "logits/chosen": 8.66783618927002, + "logits/rejected": 7.582070350646973, + "logps/chosen": -408.40521240234375, + "logps/rejected": -355.104248046875, + "loss": 0.8924, + "rewards/accuracies": 0.25, + "rewards/chosen": 0.47501498460769653, + "rewards/margins": -0.2812090516090393, + "rewards/rejected": 0.7562240362167358, + "step": 1758 + }, + { + "epoch": 0.2720278368451575, + "grad_norm": 6.416598796844482, + "learning_rate": 4.533505154639176e-06, + "logits/chosen": 11.456398010253906, + "logits/rejected": 6.370075702667236, + "logps/chosen": -250.26211547851562, + "logps/rejected": -201.88763427734375, + "loss": 0.5553, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.2968330383300781, + "rewards/margins": 0.4610915184020996, + "rewards/rejected": -0.16425848007202148, + "step": 1759 + }, + { + "epoch": 0.27218248598492173, + "grad_norm": 4.307449817657471, + "learning_rate": 4.536082474226804e-06, + "logits/chosen": 11.395735740661621, + "logits/rejected": 8.634366989135742, + "logps/chosen": -230.93817138671875, + "logps/rejected": -216.629638671875, + "loss": 0.5468, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.47546061873435974, + "rewards/margins": 0.4234132766723633, + "rewards/rejected": 0.05204734578728676, + "step": 1760 + }, + { + "epoch": 0.2723371351246859, + "grad_norm": 6.60194730758667, + "learning_rate": 4.538659793814434e-06, + "logits/chosen": 14.477359771728516, + "logits/rejected": 10.578109741210938, + "logps/chosen": -456.6590576171875, + "logps/rejected": -266.31964111328125, + "loss": 0.6187, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.22925835847854614, + "rewards/margins": 0.21333390474319458, + "rewards/rejected": 0.01592445559799671, + "step": 1761 + }, + { + "epoch": 0.27249178426445003, + "grad_norm": 4.966465473175049, + "learning_rate": 4.541237113402062e-06, + "logits/chosen": 11.965703010559082, + "logits/rejected": 8.348986625671387, + "logps/chosen": -150.67515563964844, + "logps/rejected": -164.79669189453125, + "loss": 0.5886, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.5295576453208923, + "rewards/margins": 0.2687874436378479, + "rewards/rejected": 0.2607702314853668, + "step": 1762 + }, + { + "epoch": 0.2726464334042142, + "grad_norm": 11.325162887573242, + "learning_rate": 4.543814432989691e-06, + "logits/chosen": 6.723527908325195, + "logits/rejected": 10.936582565307617, + "logps/chosen": -386.983154296875, + "logps/rejected": -415.4788818359375, + "loss": 0.7443, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.3540104329586029, + "rewards/margins": -0.019354645162820816, + "rewards/rejected": 0.37336504459381104, + "step": 1763 + }, + { + "epoch": 0.27280108254397833, + "grad_norm": 7.192172527313232, + "learning_rate": 4.54639175257732e-06, + "logits/chosen": 11.676755905151367, + "logits/rejected": 9.913164138793945, + "logps/chosen": -252.0538330078125, + "logps/rejected": -227.24188232421875, + "loss": 0.6357, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.58184814453125, + "rewards/margins": 0.23793716728687286, + "rewards/rejected": 0.34391099214553833, + "step": 1764 + }, + { + "epoch": 0.2729557316837425, + "grad_norm": 5.468150615692139, + "learning_rate": 4.548969072164949e-06, + "logits/chosen": 9.721074104309082, + "logits/rejected": 13.080572128295898, + "logps/chosen": -173.80316162109375, + "logps/rejected": -202.72247314453125, + "loss": 0.683, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.380825936794281, + "rewards/margins": 0.049785513430833817, + "rewards/rejected": 0.3310404419898987, + "step": 1765 + }, + { + "epoch": 0.2731103808235067, + "grad_norm": 4.978980541229248, + "learning_rate": 4.5515463917525776e-06, + "logits/chosen": 14.374263763427734, + "logits/rejected": 5.3594255447387695, + "logps/chosen": -211.71661376953125, + "logps/rejected": -182.83963012695312, + "loss": 0.6036, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.3469385504722595, + "rewards/margins": 0.28878986835479736, + "rewards/rejected": 0.05814870446920395, + "step": 1766 + }, + { + "epoch": 0.27326502996327084, + "grad_norm": 6.417335033416748, + "learning_rate": 4.554123711340207e-06, + "logits/chosen": 8.773098945617676, + "logits/rejected": 8.000666618347168, + "logps/chosen": -441.27325439453125, + "logps/rejected": -332.311279296875, + "loss": 0.6698, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.45418065786361694, + "rewards/margins": 0.11314989626407623, + "rewards/rejected": 0.3410307466983795, + "step": 1767 + }, + { + "epoch": 0.273419679103035, + "grad_norm": 6.2654595375061035, + "learning_rate": 4.556701030927835e-06, + "logits/chosen": 11.711050033569336, + "logits/rejected": 7.446597099304199, + "logps/chosen": -515.9819946289062, + "logps/rejected": -352.4246520996094, + "loss": 0.6114, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.5711075067520142, + "rewards/margins": 0.2978874444961548, + "rewards/rejected": 0.2732200622558594, + "step": 1768 + }, + { + "epoch": 0.27357432824279915, + "grad_norm": 10.886341094970703, + "learning_rate": 4.559278350515465e-06, + "logits/chosen": 7.859816551208496, + "logits/rejected": 2.989755153656006, + "logps/chosen": -160.63906860351562, + "logps/rejected": -102.91426086425781, + "loss": 0.703, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.3161394000053406, + "rewards/margins": 0.0051191262900829315, + "rewards/rejected": 0.3110203146934509, + "step": 1769 + }, + { + "epoch": 0.2737289773825633, + "grad_norm": 7.261675834655762, + "learning_rate": 4.561855670103093e-06, + "logits/chosen": 12.162296295166016, + "logits/rejected": 9.816460609436035, + "logps/chosen": -343.8886413574219, + "logps/rejected": -308.63885498046875, + "loss": 0.7363, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.3564913272857666, + "rewards/margins": 0.03462439030408859, + "rewards/rejected": 0.3218669295310974, + "step": 1770 + }, + { + "epoch": 0.27388362652232745, + "grad_norm": 6.246756553649902, + "learning_rate": 4.5644329896907215e-06, + "logits/chosen": 9.481365203857422, + "logits/rejected": 10.303899765014648, + "logps/chosen": -266.0, + "logps/rejected": -248.3848419189453, + "loss": 0.5617, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.37504637241363525, + "rewards/margins": 0.3056323826313019, + "rewards/rejected": 0.06941400468349457, + "step": 1771 + }, + { + "epoch": 0.27403827566209166, + "grad_norm": 3.724700689315796, + "learning_rate": 4.567010309278351e-06, + "logits/chosen": 8.440160751342773, + "logits/rejected": 7.323162078857422, + "logps/chosen": -147.2929229736328, + "logps/rejected": -155.32650756835938, + "loss": 0.5384, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.4414490759372711, + "rewards/margins": 0.3574179410934448, + "rewards/rejected": 0.0840311050415039, + "step": 1772 + }, + { + "epoch": 0.2741929248018558, + "grad_norm": 5.508972644805908, + "learning_rate": 4.569587628865979e-06, + "logits/chosen": 4.529531478881836, + "logits/rejected": 5.403200626373291, + "logps/chosen": -319.9095458984375, + "logps/rejected": -235.11407470703125, + "loss": 0.5968, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.37142059206962585, + "rewards/margins": 0.21721532940864563, + "rewards/rejected": 0.15420524775981903, + "step": 1773 + }, + { + "epoch": 0.27434757394161996, + "grad_norm": 9.819826126098633, + "learning_rate": 4.5721649484536085e-06, + "logits/chosen": 11.389838218688965, + "logits/rejected": 11.219137191772461, + "logps/chosen": -307.8130798339844, + "logps/rejected": -353.7999267578125, + "loss": 0.6348, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.41786760091781616, + "rewards/margins": 0.16911393404006958, + "rewards/rejected": 0.24875369668006897, + "step": 1774 + }, + { + "epoch": 0.2745022230813841, + "grad_norm": 5.779160499572754, + "learning_rate": 4.574742268041237e-06, + "logits/chosen": 8.296561241149902, + "logits/rejected": 4.77658748626709, + "logps/chosen": -416.005859375, + "logps/rejected": -371.0732116699219, + "loss": 0.67, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.5942328572273254, + "rewards/margins": 0.1252460479736328, + "rewards/rejected": 0.468986839056015, + "step": 1775 + }, + { + "epoch": 0.27465687222114826, + "grad_norm": 4.862935543060303, + "learning_rate": 4.577319587628866e-06, + "logits/chosen": 7.9415602684021, + "logits/rejected": 6.139190673828125, + "logps/chosen": -208.67913818359375, + "logps/rejected": -175.70506286621094, + "loss": 0.6236, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.5619739890098572, + "rewards/margins": 0.18216942250728607, + "rewards/rejected": 0.3798045516014099, + "step": 1776 + }, + { + "epoch": 0.2748115213609124, + "grad_norm": 10.83504867553711, + "learning_rate": 4.579896907216495e-06, + "logits/chosen": 7.566333770751953, + "logits/rejected": 4.853797912597656, + "logps/chosen": -219.80227661132812, + "logps/rejected": -215.96945190429688, + "loss": 0.51, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.5173207521438599, + "rewards/margins": 0.589341402053833, + "rewards/rejected": -0.07202073931694031, + "step": 1777 + }, + { + "epoch": 0.27496617050067657, + "grad_norm": 6.655467510223389, + "learning_rate": 4.582474226804124e-06, + "logits/chosen": 9.469337463378906, + "logits/rejected": 4.3183512687683105, + "logps/chosen": -294.57684326171875, + "logps/rejected": -245.7779998779297, + "loss": 0.7966, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.17119817435741425, + "rewards/margins": -0.1174437552690506, + "rewards/rejected": 0.28864192962646484, + "step": 1778 + }, + { + "epoch": 0.2751208196404408, + "grad_norm": 6.520418643951416, + "learning_rate": 4.5850515463917525e-06, + "logits/chosen": 10.906291961669922, + "logits/rejected": 5.719893932342529, + "logps/chosen": -338.7460632324219, + "logps/rejected": -327.1804504394531, + "loss": 0.617, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.38511353731155396, + "rewards/margins": 0.22053347527980804, + "rewards/rejected": 0.1645800620317459, + "step": 1779 + }, + { + "epoch": 0.2752754687802049, + "grad_norm": 4.582620620727539, + "learning_rate": 4.587628865979382e-06, + "logits/chosen": 4.078237056732178, + "logits/rejected": 6.392640113830566, + "logps/chosen": -219.04946899414062, + "logps/rejected": -252.43405151367188, + "loss": 0.6023, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.5643264651298523, + "rewards/margins": 0.25029510259628296, + "rewards/rejected": 0.3140313923358917, + "step": 1780 + }, + { + "epoch": 0.2754301179199691, + "grad_norm": 5.459748268127441, + "learning_rate": 4.59020618556701e-06, + "logits/chosen": 8.431791305541992, + "logits/rejected": 7.698966026306152, + "logps/chosen": -231.88125610351562, + "logps/rejected": -203.00279235839844, + "loss": 0.6682, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.3660655915737152, + "rewards/margins": 0.09622454643249512, + "rewards/rejected": 0.2698410451412201, + "step": 1781 + }, + { + "epoch": 0.2755847670597332, + "grad_norm": 6.183064937591553, + "learning_rate": 4.5927835051546395e-06, + "logits/chosen": 13.087644577026367, + "logits/rejected": 11.896936416625977, + "logps/chosen": -232.48736572265625, + "logps/rejected": -258.69830322265625, + "loss": 0.6748, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.29107093811035156, + "rewards/margins": 0.14245158433914185, + "rewards/rejected": 0.14861935377120972, + "step": 1782 + }, + { + "epoch": 0.2757394161994974, + "grad_norm": 4.173166751861572, + "learning_rate": 4.595360824742268e-06, + "logits/chosen": 8.520410537719727, + "logits/rejected": 7.409262657165527, + "logps/chosen": -190.48806762695312, + "logps/rejected": -168.40975952148438, + "loss": 0.6975, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.25488972663879395, + "rewards/margins": 0.07498779892921448, + "rewards/rejected": 0.17990194261074066, + "step": 1783 + }, + { + "epoch": 0.27589406533926153, + "grad_norm": 6.094630718231201, + "learning_rate": 4.597938144329897e-06, + "logits/chosen": 12.645772933959961, + "logits/rejected": 6.3182172775268555, + "logps/chosen": -419.4778137207031, + "logps/rejected": -291.7328186035156, + "loss": 0.6718, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.40914487838745117, + "rewards/margins": 0.07191429287195206, + "rewards/rejected": 0.3372305929660797, + "step": 1784 + }, + { + "epoch": 0.27604871447902574, + "grad_norm": 4.593143463134766, + "learning_rate": 4.600515463917526e-06, + "logits/chosen": 10.098987579345703, + "logits/rejected": 7.184027671813965, + "logps/chosen": -288.43408203125, + "logps/rejected": -263.3219909667969, + "loss": 0.6541, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.7223830223083496, + "rewards/margins": 0.2738035023212433, + "rewards/rejected": 0.4485795199871063, + "step": 1785 + }, + { + "epoch": 0.2762033636187899, + "grad_norm": 6.917372703552246, + "learning_rate": 4.603092783505155e-06, + "logits/chosen": 10.330402374267578, + "logits/rejected": 5.444697380065918, + "logps/chosen": -301.9897155761719, + "logps/rejected": -222.5584716796875, + "loss": 0.7968, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.07506342977285385, + "rewards/margins": -0.13927708566188812, + "rewards/rejected": 0.21434050798416138, + "step": 1786 + }, + { + "epoch": 0.27635801275855404, + "grad_norm": 3.787792205810547, + "learning_rate": 4.6056701030927834e-06, + "logits/chosen": 14.770713806152344, + "logits/rejected": 7.072144508361816, + "logps/chosen": -293.251953125, + "logps/rejected": -208.4970703125, + "loss": 0.4716, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.8451826572418213, + "rewards/margins": 0.5348148345947266, + "rewards/rejected": 0.31036776304244995, + "step": 1787 + }, + { + "epoch": 0.2765126618983182, + "grad_norm": 13.960423469543457, + "learning_rate": 4.608247422680413e-06, + "logits/chosen": 13.204172134399414, + "logits/rejected": 7.987703323364258, + "logps/chosen": -413.2759094238281, + "logps/rejected": -338.0579833984375, + "loss": 0.6093, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.38115859031677246, + "rewards/margins": 0.26606765389442444, + "rewards/rejected": 0.11509095132350922, + "step": 1788 + }, + { + "epoch": 0.27666731103808234, + "grad_norm": 4.321527481079102, + "learning_rate": 4.610824742268042e-06, + "logits/chosen": 7.346288204193115, + "logits/rejected": 5.108887672424316, + "logps/chosen": -173.19715881347656, + "logps/rejected": -155.81553649902344, + "loss": 0.7, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.552959680557251, + "rewards/margins": 0.058785922825336456, + "rewards/rejected": 0.4941737949848175, + "step": 1789 + }, + { + "epoch": 0.2768219601778465, + "grad_norm": 5.214527130126953, + "learning_rate": 4.6134020618556705e-06, + "logits/chosen": 4.459333896636963, + "logits/rejected": 0.02829456329345703, + "logps/chosen": -238.23802185058594, + "logps/rejected": -156.39300537109375, + "loss": 0.7568, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.5082700848579407, + "rewards/margins": 0.04841124638915062, + "rewards/rejected": 0.45985886454582214, + "step": 1790 + }, + { + "epoch": 0.27697660931761064, + "grad_norm": 5.841790676116943, + "learning_rate": 4.6159793814433e-06, + "logits/chosen": 13.923933029174805, + "logits/rejected": 8.109094619750977, + "logps/chosen": -308.5247802734375, + "logps/rejected": -241.20712280273438, + "loss": 0.6594, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.26008549332618713, + "rewards/margins": 0.18404704332351685, + "rewards/rejected": 0.07603845000267029, + "step": 1791 + }, + { + "epoch": 0.27713125845737485, + "grad_norm": 10.888556480407715, + "learning_rate": 4.618556701030928e-06, + "logits/chosen": 7.914096832275391, + "logits/rejected": 9.503911972045898, + "logps/chosen": -255.510986328125, + "logps/rejected": -272.71563720703125, + "loss": 0.8884, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.12503138184547424, + "rewards/margins": -0.29305964708328247, + "rewards/rejected": 0.1680283099412918, + "step": 1792 + }, + { + "epoch": 0.277285907597139, + "grad_norm": 4.766130447387695, + "learning_rate": 4.6211340206185575e-06, + "logits/chosen": 13.7325439453125, + "logits/rejected": 11.023902893066406, + "logps/chosen": -231.98545837402344, + "logps/rejected": -183.91726684570312, + "loss": 0.6273, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.5380759835243225, + "rewards/margins": 0.15609519183635712, + "rewards/rejected": 0.3819808065891266, + "step": 1793 + }, + { + "epoch": 0.27744055673690315, + "grad_norm": 5.318512916564941, + "learning_rate": 4.623711340206186e-06, + "logits/chosen": 7.283623218536377, + "logits/rejected": 9.433982849121094, + "logps/chosen": -169.26589965820312, + "logps/rejected": -164.61131286621094, + "loss": 0.7234, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.09434748440980911, + "rewards/margins": -0.03460858017206192, + "rewards/rejected": 0.12895606458187103, + "step": 1794 + }, + { + "epoch": 0.2775952058766673, + "grad_norm": 4.173285007476807, + "learning_rate": 4.626288659793815e-06, + "logits/chosen": 5.179205894470215, + "logits/rejected": 6.072615623474121, + "logps/chosen": -168.74044799804688, + "logps/rejected": -209.5545654296875, + "loss": 0.5635, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.6600791215896606, + "rewards/margins": 0.40953323245048523, + "rewards/rejected": 0.2505458891391754, + "step": 1795 + }, + { + "epoch": 0.27774985501643146, + "grad_norm": 7.360849857330322, + "learning_rate": 4.628865979381444e-06, + "logits/chosen": 15.077102661132812, + "logits/rejected": 5.993697643280029, + "logps/chosen": -399.30975341796875, + "logps/rejected": -319.9261779785156, + "loss": 0.4653, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.6223754286766052, + "rewards/margins": 0.6062986254692078, + "rewards/rejected": 0.01607675850391388, + "step": 1796 + }, + { + "epoch": 0.2779045041561956, + "grad_norm": 5.941712856292725, + "learning_rate": 4.631443298969073e-06, + "logits/chosen": 9.63193416595459, + "logits/rejected": 6.623553276062012, + "logps/chosen": -276.166748046875, + "logps/rejected": -252.10830688476562, + "loss": 0.6072, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.3636169135570526, + "rewards/margins": 0.33589398860931396, + "rewards/rejected": 0.02772293984889984, + "step": 1797 + }, + { + "epoch": 0.2780591532959598, + "grad_norm": 5.1528167724609375, + "learning_rate": 4.6340206185567015e-06, + "logits/chosen": 12.019081115722656, + "logits/rejected": 7.0318098068237305, + "logps/chosen": -384.81573486328125, + "logps/rejected": -245.92637634277344, + "loss": 0.5801, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.7144142389297485, + "rewards/margins": 0.3055454194545746, + "rewards/rejected": 0.40886878967285156, + "step": 1798 + }, + { + "epoch": 0.27821380243572397, + "grad_norm": 6.089789390563965, + "learning_rate": 4.636597938144331e-06, + "logits/chosen": 11.860133171081543, + "logits/rejected": 4.922367095947266, + "logps/chosen": -321.1929016113281, + "logps/rejected": -220.14512634277344, + "loss": 0.6868, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.6045522093772888, + "rewards/margins": 0.04126105457544327, + "rewards/rejected": 0.5632911920547485, + "step": 1799 + }, + { + "epoch": 0.2783684515754881, + "grad_norm": 6.841023921966553, + "learning_rate": 4.639175257731959e-06, + "logits/chosen": 12.983312606811523, + "logits/rejected": 6.042972564697266, + "logps/chosen": -296.8444519042969, + "logps/rejected": -144.22396850585938, + "loss": 0.7715, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.19157515466213226, + "rewards/margins": -0.0874970555305481, + "rewards/rejected": 0.27907222509384155, + "step": 1800 + }, + { + "epoch": 0.27852310071525227, + "grad_norm": 5.250925540924072, + "learning_rate": 4.6417525773195885e-06, + "logits/chosen": 11.337623596191406, + "logits/rejected": 4.912841796875, + "logps/chosen": -263.2588195800781, + "logps/rejected": -176.6988525390625, + "loss": 0.6328, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.24549323320388794, + "rewards/margins": 0.21916508674621582, + "rewards/rejected": 0.02632814645767212, + "step": 1801 + }, + { + "epoch": 0.2786777498550164, + "grad_norm": 6.687630653381348, + "learning_rate": 4.644329896907217e-06, + "logits/chosen": 9.23815631866455, + "logits/rejected": 14.233956336975098, + "logps/chosen": -333.1272277832031, + "logps/rejected": -381.56103515625, + "loss": 0.8792, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.43065834045410156, + "rewards/margins": -0.2751563787460327, + "rewards/rejected": 0.7058147192001343, + "step": 1802 + }, + { + "epoch": 0.2788323989947806, + "grad_norm": 4.982458591461182, + "learning_rate": 4.646907216494846e-06, + "logits/chosen": 13.7312650680542, + "logits/rejected": 8.046867370605469, + "logps/chosen": -303.96014404296875, + "logps/rejected": -254.2519073486328, + "loss": 0.4666, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.3461725115776062, + "rewards/margins": 0.7006341218948364, + "rewards/rejected": -0.3544616103172302, + "step": 1803 + }, + { + "epoch": 0.2789870481345447, + "grad_norm": 6.613254070281982, + "learning_rate": 4.649484536082475e-06, + "logits/chosen": 7.195912837982178, + "logits/rejected": 12.799966812133789, + "logps/chosen": -305.4556579589844, + "logps/rejected": -274.7067565917969, + "loss": 0.6977, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.3151063919067383, + "rewards/margins": 0.08902379870414734, + "rewards/rejected": 0.22608262300491333, + "step": 1804 + }, + { + "epoch": 0.27914169727430893, + "grad_norm": 4.40196418762207, + "learning_rate": 4.652061855670104e-06, + "logits/chosen": 7.241674423217773, + "logits/rejected": 5.626523017883301, + "logps/chosen": -208.08758544921875, + "logps/rejected": -208.23648071289062, + "loss": 0.581, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.3735140860080719, + "rewards/margins": 0.26652076840400696, + "rewards/rejected": 0.10699333250522614, + "step": 1805 + }, + { + "epoch": 0.2792963464140731, + "grad_norm": 6.872039794921875, + "learning_rate": 4.6546391752577324e-06, + "logits/chosen": 6.387746810913086, + "logits/rejected": 2.7451484203338623, + "logps/chosen": -401.534423828125, + "logps/rejected": -312.84368896484375, + "loss": 0.6895, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.5557008981704712, + "rewards/margins": 0.01845632866024971, + "rewards/rejected": 0.5372445583343506, + "step": 1806 + }, + { + "epoch": 0.27945099555383723, + "grad_norm": 5.204464912414551, + "learning_rate": 4.657216494845362e-06, + "logits/chosen": 15.71424388885498, + "logits/rejected": 2.1320204734802246, + "logps/chosen": -498.5172424316406, + "logps/rejected": -208.55564880371094, + "loss": 0.517, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.5891244411468506, + "rewards/margins": 0.5190377235412598, + "rewards/rejected": 0.07008671760559082, + "step": 1807 + }, + { + "epoch": 0.2796056446936014, + "grad_norm": 37.6791877746582, + "learning_rate": 4.65979381443299e-06, + "logits/chosen": 14.20694637298584, + "logits/rejected": 7.893401145935059, + "logps/chosen": -529.0643920898438, + "logps/rejected": -397.0866394042969, + "loss": 0.6454, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.9732526540756226, + "rewards/margins": 0.1775982677936554, + "rewards/rejected": 0.7956543564796448, + "step": 1808 + }, + { + "epoch": 0.27976029383336554, + "grad_norm": 6.8347649574279785, + "learning_rate": 4.662371134020619e-06, + "logits/chosen": 8.797388076782227, + "logits/rejected": 4.059911727905273, + "logps/chosen": -364.923828125, + "logps/rejected": -330.72515869140625, + "loss": 0.5717, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.4942196011543274, + "rewards/margins": 0.3306804597377777, + "rewards/rejected": 0.1635391265153885, + "step": 1809 + }, + { + "epoch": 0.2799149429731297, + "grad_norm": 5.941415786743164, + "learning_rate": 4.664948453608248e-06, + "logits/chosen": -0.28004634380340576, + "logits/rejected": 6.157761573791504, + "logps/chosen": -142.97694396972656, + "logps/rejected": -224.66497802734375, + "loss": 0.7229, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.23308923840522766, + "rewards/margins": 0.00542113184928894, + "rewards/rejected": 0.22766809165477753, + "step": 1810 + }, + { + "epoch": 0.2800695921128939, + "grad_norm": 5.795331001281738, + "learning_rate": 4.667525773195876e-06, + "logits/chosen": 9.838809967041016, + "logits/rejected": 11.281718254089355, + "logps/chosen": -306.9436340332031, + "logps/rejected": -324.8358459472656, + "loss": 0.766, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.5886789560317993, + "rewards/margins": -0.11536659300327301, + "rewards/rejected": 0.7040455341339111, + "step": 1811 + }, + { + "epoch": 0.28022424125265805, + "grad_norm": 4.510373592376709, + "learning_rate": 4.670103092783506e-06, + "logits/chosen": 13.319091796875, + "logits/rejected": 6.25773286819458, + "logps/chosen": -308.00250244140625, + "logps/rejected": -214.00946044921875, + "loss": 0.5636, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.4302256405353546, + "rewards/margins": 0.29232674837112427, + "rewards/rejected": 0.13789892196655273, + "step": 1812 + }, + { + "epoch": 0.2803788903924222, + "grad_norm": 5.403369903564453, + "learning_rate": 4.672680412371134e-06, + "logits/chosen": 10.388248443603516, + "logits/rejected": 4.79567813873291, + "logps/chosen": -357.135009765625, + "logps/rejected": -273.6259460449219, + "loss": 0.5983, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.47838470339775085, + "rewards/margins": 0.30037403106689453, + "rewards/rejected": 0.17801065742969513, + "step": 1813 + }, + { + "epoch": 0.28053353953218635, + "grad_norm": 4.630216121673584, + "learning_rate": 4.675257731958763e-06, + "logits/chosen": 7.771177291870117, + "logits/rejected": 5.1385111808776855, + "logps/chosen": -280.0901794433594, + "logps/rejected": -209.920166015625, + "loss": 0.6896, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.3488076627254486, + "rewards/margins": 0.022928372025489807, + "rewards/rejected": 0.3258792757987976, + "step": 1814 + }, + { + "epoch": 0.2806881886719505, + "grad_norm": 5.719762325286865, + "learning_rate": 4.677835051546392e-06, + "logits/chosen": 10.391979217529297, + "logits/rejected": 4.174276351928711, + "logps/chosen": -435.1410217285156, + "logps/rejected": -276.4684753417969, + "loss": 0.6236, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.5524491667747498, + "rewards/margins": 0.23280295729637146, + "rewards/rejected": 0.3196461796760559, + "step": 1815 + }, + { + "epoch": 0.28084283781171465, + "grad_norm": 5.505342960357666, + "learning_rate": 4.680412371134021e-06, + "logits/chosen": 1.3984167575836182, + "logits/rejected": 3.2833070755004883, + "logps/chosen": -188.52151489257812, + "logps/rejected": -235.49627685546875, + "loss": 0.5879, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.4416334331035614, + "rewards/margins": 0.3539799749851227, + "rewards/rejected": 0.08765344321727753, + "step": 1816 + }, + { + "epoch": 0.28099748695147886, + "grad_norm": 6.206616401672363, + "learning_rate": 4.68298969072165e-06, + "logits/chosen": 11.167319297790527, + "logits/rejected": 8.613869667053223, + "logps/chosen": -426.288330078125, + "logps/rejected": -329.3836975097656, + "loss": 0.4729, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.613318681716919, + "rewards/margins": 0.54759681224823, + "rewards/rejected": 0.06572189927101135, + "step": 1817 + }, + { + "epoch": 0.281152136091243, + "grad_norm": 9.026396751403809, + "learning_rate": 4.685567010309279e-06, + "logits/chosen": 8.833276748657227, + "logits/rejected": 8.582449913024902, + "logps/chosen": -230.39968872070312, + "logps/rejected": -245.29591369628906, + "loss": 0.7819, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.46692636609077454, + "rewards/margins": -0.04104386270046234, + "rewards/rejected": 0.5079702138900757, + "step": 1818 + }, + { + "epoch": 0.28130678523100716, + "grad_norm": 6.190130233764648, + "learning_rate": 4.688144329896907e-06, + "logits/chosen": 6.207818508148193, + "logits/rejected": 4.249599933624268, + "logps/chosen": -252.45883178710938, + "logps/rejected": -162.86692810058594, + "loss": 0.752, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.35861286520957947, + "rewards/margins": -0.08175475895404816, + "rewards/rejected": 0.4403676390647888, + "step": 1819 + }, + { + "epoch": 0.2814614343707713, + "grad_norm": 6.999055862426758, + "learning_rate": 4.690721649484537e-06, + "logits/chosen": 12.285684585571289, + "logits/rejected": 6.994052886962891, + "logps/chosen": -317.63763427734375, + "logps/rejected": -270.59002685546875, + "loss": 0.5903, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.48645782470703125, + "rewards/margins": 0.2407977283000946, + "rewards/rejected": 0.24566012620925903, + "step": 1820 + }, + { + "epoch": 0.28161608351053546, + "grad_norm": 6.097827434539795, + "learning_rate": 4.693298969072165e-06, + "logits/chosen": 10.929293632507324, + "logits/rejected": 3.2800440788269043, + "logps/chosen": -340.63427734375, + "logps/rejected": -256.7178649902344, + "loss": 0.6055, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.5508156418800354, + "rewards/margins": 0.24711717665195465, + "rewards/rejected": 0.30369845032691956, + "step": 1821 + }, + { + "epoch": 0.2817707326502996, + "grad_norm": 4.320054054260254, + "learning_rate": 4.695876288659794e-06, + "logits/chosen": 13.914053916931152, + "logits/rejected": 9.445515632629395, + "logps/chosen": -247.6464385986328, + "logps/rejected": -210.24453735351562, + "loss": 0.5064, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.5620380640029907, + "rewards/margins": 0.49877631664276123, + "rewards/rejected": 0.06326168775558472, + "step": 1822 + }, + { + "epoch": 0.28192538179006377, + "grad_norm": 4.688449382781982, + "learning_rate": 4.698453608247423e-06, + "logits/chosen": 11.72197151184082, + "logits/rejected": 7.280068874359131, + "logps/chosen": -272.1938171386719, + "logps/rejected": -223.8502960205078, + "loss": 0.6593, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.7122691869735718, + "rewards/margins": 0.14574995636940002, + "rewards/rejected": 0.5665192604064941, + "step": 1823 + }, + { + "epoch": 0.282080030929828, + "grad_norm": 5.492824554443359, + "learning_rate": 4.701030927835052e-06, + "logits/chosen": 14.48817253112793, + "logits/rejected": 10.518872261047363, + "logps/chosen": -266.54486083984375, + "logps/rejected": -227.7613525390625, + "loss": 0.6416, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.4933539628982544, + "rewards/margins": 0.15884561836719513, + "rewards/rejected": 0.33450832962989807, + "step": 1824 + }, + { + "epoch": 0.2822346800695921, + "grad_norm": 4.968796253204346, + "learning_rate": 4.7036082474226806e-06, + "logits/chosen": 10.975472450256348, + "logits/rejected": 7.610395431518555, + "logps/chosen": -382.6282043457031, + "logps/rejected": -240.187744140625, + "loss": 0.5279, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.985701858997345, + "rewards/margins": 0.4036686420440674, + "rewards/rejected": 0.5820331573486328, + "step": 1825 + }, + { + "epoch": 0.2823893292093563, + "grad_norm": 5.8526835441589355, + "learning_rate": 4.70618556701031e-06, + "logits/chosen": 9.63501262664795, + "logits/rejected": 5.556789398193359, + "logps/chosen": -248.28903198242188, + "logps/rejected": -235.29010009765625, + "loss": 0.7531, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.3147670030593872, + "rewards/margins": -0.08388163149356842, + "rewards/rejected": 0.39864861965179443, + "step": 1826 + }, + { + "epoch": 0.28254397834912043, + "grad_norm": 4.779936790466309, + "learning_rate": 4.708762886597938e-06, + "logits/chosen": 6.816708564758301, + "logits/rejected": 9.040908813476562, + "logps/chosen": -182.03797912597656, + "logps/rejected": -177.436279296875, + "loss": 0.7321, + "rewards/accuracies": 0.25, + "rewards/chosen": 0.4913061261177063, + "rewards/margins": -0.052229247987270355, + "rewards/rejected": 0.5435353517532349, + "step": 1827 + }, + { + "epoch": 0.2826986274888846, + "grad_norm": 5.755284786224365, + "learning_rate": 4.711340206185568e-06, + "logits/chosen": 8.66589641571045, + "logits/rejected": 8.729596138000488, + "logps/chosen": -295.18707275390625, + "logps/rejected": -263.3929748535156, + "loss": 0.6528, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.40980178117752075, + "rewards/margins": 0.1322551816701889, + "rewards/rejected": 0.27754661440849304, + "step": 1828 + }, + { + "epoch": 0.28285327662864873, + "grad_norm": 8.393353462219238, + "learning_rate": 4.713917525773196e-06, + "logits/chosen": 2.3168864250183105, + "logits/rejected": 2.603562593460083, + "logps/chosen": -203.56890869140625, + "logps/rejected": -175.8870086669922, + "loss": 0.7487, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.37378817796707153, + "rewards/margins": -0.08337822556495667, + "rewards/rejected": 0.4571663737297058, + "step": 1829 + }, + { + "epoch": 0.28300792576841294, + "grad_norm": 7.22050142288208, + "learning_rate": 4.716494845360825e-06, + "logits/chosen": 10.363870620727539, + "logits/rejected": 8.915002822875977, + "logps/chosen": -404.7255554199219, + "logps/rejected": -308.6307373046875, + "loss": 0.7054, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.6660505533218384, + "rewards/margins": 0.04919108748435974, + "rewards/rejected": 0.6168594360351562, + "step": 1830 + }, + { + "epoch": 0.2831625749081771, + "grad_norm": 7.997409820556641, + "learning_rate": 4.719072164948454e-06, + "logits/chosen": 7.335951805114746, + "logits/rejected": 6.4761810302734375, + "logps/chosen": -368.7034912109375, + "logps/rejected": -321.3094482421875, + "loss": 0.7032, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.33495789766311646, + "rewards/margins": 0.0901574194431305, + "rewards/rejected": 0.24480049312114716, + "step": 1831 + }, + { + "epoch": 0.28331722404794124, + "grad_norm": 3.9359004497528076, + "learning_rate": 4.721649484536083e-06, + "logits/chosen": 8.565652847290039, + "logits/rejected": 7.981936931610107, + "logps/chosen": -174.45608520507812, + "logps/rejected": -187.18368530273438, + "loss": 0.5904, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.6424105167388916, + "rewards/margins": 0.2401329129934311, + "rewards/rejected": 0.4022775888442993, + "step": 1832 + }, + { + "epoch": 0.2834718731877054, + "grad_norm": 3.5301573276519775, + "learning_rate": 4.7242268041237115e-06, + "logits/chosen": 14.39261531829834, + "logits/rejected": 6.3522047996521, + "logps/chosen": -205.04884338378906, + "logps/rejected": -107.8465576171875, + "loss": 0.5299, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.5573626756668091, + "rewards/margins": 0.4881877303123474, + "rewards/rejected": 0.06917493045330048, + "step": 1833 + }, + { + "epoch": 0.28362652232746954, + "grad_norm": 5.176958084106445, + "learning_rate": 4.726804123711341e-06, + "logits/chosen": 10.465843200683594, + "logits/rejected": 10.067968368530273, + "logps/chosen": -261.00506591796875, + "logps/rejected": -348.6839599609375, + "loss": 0.6793, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.563470721244812, + "rewards/margins": 0.05134439468383789, + "rewards/rejected": 0.5121262669563293, + "step": 1834 + }, + { + "epoch": 0.2837811714672337, + "grad_norm": 6.1779327392578125, + "learning_rate": 4.729381443298969e-06, + "logits/chosen": 11.66494369506836, + "logits/rejected": 8.29582691192627, + "logps/chosen": -332.11004638671875, + "logps/rejected": -339.4618835449219, + "loss": 0.623, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.7681221961975098, + "rewards/margins": 0.1700955480337143, + "rewards/rejected": 0.5980266332626343, + "step": 1835 + }, + { + "epoch": 0.28393582060699785, + "grad_norm": 4.645990371704102, + "learning_rate": 4.731958762886599e-06, + "logits/chosen": 10.935059547424316, + "logits/rejected": 8.363554000854492, + "logps/chosen": -213.8449249267578, + "logps/rejected": -161.9983673095703, + "loss": 0.6058, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.28287580609321594, + "rewards/margins": 0.2302890419960022, + "rewards/rejected": 0.05258675664663315, + "step": 1836 + }, + { + "epoch": 0.28409046974676205, + "grad_norm": 6.875711917877197, + "learning_rate": 4.734536082474227e-06, + "logits/chosen": 15.596538543701172, + "logits/rejected": 12.098448753356934, + "logps/chosen": -333.8212890625, + "logps/rejected": -198.9110565185547, + "loss": 0.8139, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.32294559478759766, + "rewards/margins": -0.16767026484012604, + "rewards/rejected": 0.4906158745288849, + "step": 1837 + }, + { + "epoch": 0.2842451188865262, + "grad_norm": 6.2596282958984375, + "learning_rate": 4.7371134020618555e-06, + "logits/chosen": 9.050671577453613, + "logits/rejected": 7.256229877471924, + "logps/chosen": -256.604248046875, + "logps/rejected": -235.84396362304688, + "loss": 0.7985, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.36071914434432983, + "rewards/margins": -0.14179196953773499, + "rewards/rejected": 0.5025111436843872, + "step": 1838 + }, + { + "epoch": 0.28439976802629036, + "grad_norm": 4.4672980308532715, + "learning_rate": 4.739690721649485e-06, + "logits/chosen": 11.085832595825195, + "logits/rejected": 6.049576759338379, + "logps/chosen": -252.97964477539062, + "logps/rejected": -170.05857849121094, + "loss": 0.6416, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.6068791747093201, + "rewards/margins": 0.1697457730770111, + "rewards/rejected": 0.43713343143463135, + "step": 1839 + }, + { + "epoch": 0.2845544171660545, + "grad_norm": 5.493722915649414, + "learning_rate": 4.742268041237113e-06, + "logits/chosen": 10.334281921386719, + "logits/rejected": 11.191232681274414, + "logps/chosen": -214.6047821044922, + "logps/rejected": -284.8372802734375, + "loss": 0.6452, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.2926071286201477, + "rewards/margins": 0.15418806672096252, + "rewards/rejected": 0.13841906189918518, + "step": 1840 + }, + { + "epoch": 0.28470906630581866, + "grad_norm": 6.68534517288208, + "learning_rate": 4.7448453608247425e-06, + "logits/chosen": 6.496179580688477, + "logits/rejected": 11.83876895904541, + "logps/chosen": -270.593017578125, + "logps/rejected": -339.4285888671875, + "loss": 0.5293, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.6822841763496399, + "rewards/margins": 0.3938106596469879, + "rewards/rejected": 0.288473516702652, + "step": 1841 + }, + { + "epoch": 0.2848637154455828, + "grad_norm": 5.296319961547852, + "learning_rate": 4.747422680412371e-06, + "logits/chosen": 9.883523941040039, + "logits/rejected": 5.976771354675293, + "logps/chosen": -202.47079467773438, + "logps/rejected": -153.80572509765625, + "loss": 0.7006, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.5124492645263672, + "rewards/margins": 0.010732997208833694, + "rewards/rejected": 0.5017163157463074, + "step": 1842 + }, + { + "epoch": 0.285018364585347, + "grad_norm": 4.896380424499512, + "learning_rate": 4.75e-06, + "logits/chosen": 11.31805419921875, + "logits/rejected": 7.822057247161865, + "logps/chosen": -185.64405822753906, + "logps/rejected": -175.93832397460938, + "loss": 0.6689, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.2248258739709854, + "rewards/margins": 0.09107781946659088, + "rewards/rejected": 0.13374805450439453, + "step": 1843 + }, + { + "epoch": 0.28517301372511117, + "grad_norm": 4.91972541809082, + "learning_rate": 4.752577319587629e-06, + "logits/chosen": 8.028700828552246, + "logits/rejected": 2.651580810546875, + "logps/chosen": -279.5573425292969, + "logps/rejected": -196.71580505371094, + "loss": 0.63, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.5947310924530029, + "rewards/margins": 0.2853817939758301, + "rewards/rejected": 0.30934929847717285, + "step": 1844 + }, + { + "epoch": 0.2853276628648753, + "grad_norm": 5.742367267608643, + "learning_rate": 4.755154639175258e-06, + "logits/chosen": 9.682886123657227, + "logits/rejected": 13.92170238494873, + "logps/chosen": -242.05584716796875, + "logps/rejected": -283.5407409667969, + "loss": 0.7506, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.5419736504554749, + "rewards/margins": -0.02542208880186081, + "rewards/rejected": 0.5673957467079163, + "step": 1845 + }, + { + "epoch": 0.28548231200463947, + "grad_norm": 4.9652323722839355, + "learning_rate": 4.7577319587628865e-06, + "logits/chosen": 6.738117694854736, + "logits/rejected": 4.107222557067871, + "logps/chosen": -251.4000701904297, + "logps/rejected": -215.6869354248047, + "loss": 0.5736, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.4638819098472595, + "rewards/margins": 0.2936471104621887, + "rewards/rejected": 0.1702347695827484, + "step": 1846 + }, + { + "epoch": 0.2856369611444036, + "grad_norm": 3.9702770709991455, + "learning_rate": 4.760309278350516e-06, + "logits/chosen": 10.768385887145996, + "logits/rejected": 8.962892532348633, + "logps/chosen": -194.072998046875, + "logps/rejected": -190.41769409179688, + "loss": 0.6338, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.39902105927467346, + "rewards/margins": 0.17166250944137573, + "rewards/rejected": 0.22735854983329773, + "step": 1847 + }, + { + "epoch": 0.2857916102841678, + "grad_norm": 5.993771076202393, + "learning_rate": 4.762886597938144e-06, + "logits/chosen": 7.632426738739014, + "logits/rejected": 12.546123504638672, + "logps/chosen": -137.77102661132812, + "logps/rejected": -260.28582763671875, + "loss": 0.6888, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.49250122904777527, + "rewards/margins": 0.11576317250728607, + "rewards/rejected": 0.3767380714416504, + "step": 1848 + }, + { + "epoch": 0.285946259423932, + "grad_norm": 5.581829071044922, + "learning_rate": 4.7654639175257735e-06, + "logits/chosen": 7.275015830993652, + "logits/rejected": 7.232879161834717, + "logps/chosen": -192.13311767578125, + "logps/rejected": -218.48243713378906, + "loss": 0.7322, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.37595364451408386, + "rewards/margins": -0.05832003802061081, + "rewards/rejected": 0.4342736601829529, + "step": 1849 + }, + { + "epoch": 0.28610090856369613, + "grad_norm": 6.8089704513549805, + "learning_rate": 4.768041237113403e-06, + "logits/chosen": 7.087652683258057, + "logits/rejected": 6.997608661651611, + "logps/chosen": -263.650634765625, + "logps/rejected": -193.8986053466797, + "loss": 0.7268, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.5429936647415161, + "rewards/margins": 0.028792712837457657, + "rewards/rejected": 0.5142009258270264, + "step": 1850 + }, + { + "epoch": 0.2862555577034603, + "grad_norm": 7.340090751647949, + "learning_rate": 4.770618556701031e-06, + "logits/chosen": 4.927595615386963, + "logits/rejected": 3.9277572631835938, + "logps/chosen": -224.3555908203125, + "logps/rejected": -273.0819091796875, + "loss": 0.7241, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.6165887713432312, + "rewards/margins": 0.16151177883148193, + "rewards/rejected": 0.4550769627094269, + "step": 1851 + }, + { + "epoch": 0.28641020684322444, + "grad_norm": 5.625838279724121, + "learning_rate": 4.7731958762886605e-06, + "logits/chosen": 8.615274429321289, + "logits/rejected": 9.036361694335938, + "logps/chosen": -275.8540954589844, + "logps/rejected": -293.46234130859375, + "loss": 0.6642, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.34709426760673523, + "rewards/margins": 0.1716088503599167, + "rewards/rejected": 0.17548543214797974, + "step": 1852 + }, + { + "epoch": 0.2865648559829886, + "grad_norm": 6.167919158935547, + "learning_rate": 4.775773195876289e-06, + "logits/chosen": 12.999015808105469, + "logits/rejected": 6.3859992027282715, + "logps/chosen": -378.6021728515625, + "logps/rejected": -347.91046142578125, + "loss": 0.5097, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.8138599395751953, + "rewards/margins": 0.42903411388397217, + "rewards/rejected": 0.38482582569122314, + "step": 1853 + }, + { + "epoch": 0.28671950512275274, + "grad_norm": 5.481585502624512, + "learning_rate": 4.778350515463918e-06, + "logits/chosen": 7.518723964691162, + "logits/rejected": 3.4797585010528564, + "logps/chosen": -199.45236206054688, + "logps/rejected": -189.0595703125, + "loss": 0.757, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.13073891401290894, + "rewards/margins": 0.033858686685562134, + "rewards/rejected": 0.0968802273273468, + "step": 1854 + }, + { + "epoch": 0.2868741542625169, + "grad_norm": 4.221721649169922, + "learning_rate": 4.780927835051547e-06, + "logits/chosen": 11.214179039001465, + "logits/rejected": 5.374087333679199, + "logps/chosen": -234.01914978027344, + "logps/rejected": -249.1956329345703, + "loss": 0.5659, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.8257064819335938, + "rewards/margins": 0.4029068946838379, + "rewards/rejected": 0.42279961705207825, + "step": 1855 + }, + { + "epoch": 0.2870288034022811, + "grad_norm": 3.967857599258423, + "learning_rate": 4.783505154639176e-06, + "logits/chosen": 13.628289222717285, + "logits/rejected": 5.328762054443359, + "logps/chosen": -294.2099609375, + "logps/rejected": -188.9038543701172, + "loss": 0.588, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.807749330997467, + "rewards/margins": 0.36114585399627686, + "rewards/rejected": 0.4466035068035126, + "step": 1856 + }, + { + "epoch": 0.28718345254204525, + "grad_norm": 5.563789367675781, + "learning_rate": 4.7860824742268045e-06, + "logits/chosen": 9.868415832519531, + "logits/rejected": 11.074239730834961, + "logps/chosen": -306.042236328125, + "logps/rejected": -333.7354736328125, + "loss": 0.6442, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.5254322290420532, + "rewards/margins": 0.25586453080177307, + "rewards/rejected": 0.26956766843795776, + "step": 1857 + }, + { + "epoch": 0.2873381016818094, + "grad_norm": 6.030600547790527, + "learning_rate": 4.788659793814434e-06, + "logits/chosen": 7.580175399780273, + "logits/rejected": 0.6681503057479858, + "logps/chosen": -274.1636962890625, + "logps/rejected": -180.44439697265625, + "loss": 0.6774, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.3951334059238434, + "rewards/margins": 0.0551123172044754, + "rewards/rejected": 0.3400210738182068, + "step": 1858 + }, + { + "epoch": 0.28749275082157355, + "grad_norm": 4.292391300201416, + "learning_rate": 4.791237113402062e-06, + "logits/chosen": 6.318345069885254, + "logits/rejected": 3.2153501510620117, + "logps/chosen": -246.58587646484375, + "logps/rejected": -190.0252685546875, + "loss": 0.6109, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.49242615699768066, + "rewards/margins": 0.2143019437789917, + "rewards/rejected": 0.27812421321868896, + "step": 1859 + }, + { + "epoch": 0.2876473999613377, + "grad_norm": 4.7412872314453125, + "learning_rate": 4.7938144329896915e-06, + "logits/chosen": 10.898558616638184, + "logits/rejected": 8.342809677124023, + "logps/chosen": -209.5616455078125, + "logps/rejected": -228.09487915039062, + "loss": 0.6185, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.7684593200683594, + "rewards/margins": 0.1832033097743988, + "rewards/rejected": 0.5852559804916382, + "step": 1860 + }, + { + "epoch": 0.28780204910110185, + "grad_norm": 5.757569313049316, + "learning_rate": 4.79639175257732e-06, + "logits/chosen": 8.592177391052246, + "logits/rejected": 9.75408935546875, + "logps/chosen": -274.6553955078125, + "logps/rejected": -263.5838317871094, + "loss": 0.5883, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.5710976123809814, + "rewards/margins": 0.2770535349845886, + "rewards/rejected": 0.2940441071987152, + "step": 1861 + }, + { + "epoch": 0.28795669824086606, + "grad_norm": 4.772401809692383, + "learning_rate": 4.798969072164949e-06, + "logits/chosen": 8.272906303405762, + "logits/rejected": 9.007293701171875, + "logps/chosen": -242.4165496826172, + "logps/rejected": -223.509521484375, + "loss": 0.6601, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.6363809108734131, + "rewards/margins": 0.11577824503183365, + "rewards/rejected": 0.5206026434898376, + "step": 1862 + }, + { + "epoch": 0.2881113473806302, + "grad_norm": 10.107269287109375, + "learning_rate": 4.801546391752578e-06, + "logits/chosen": 8.460515975952148, + "logits/rejected": 12.47944450378418, + "logps/chosen": -233.12344360351562, + "logps/rejected": -396.3954772949219, + "loss": 0.6955, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.6309150457382202, + "rewards/margins": 0.03176078945398331, + "rewards/rejected": 0.5991542339324951, + "step": 1863 + }, + { + "epoch": 0.28826599652039436, + "grad_norm": 5.095467567443848, + "learning_rate": 4.804123711340207e-06, + "logits/chosen": 12.573497772216797, + "logits/rejected": 9.259232521057129, + "logps/chosen": -263.8731689453125, + "logps/rejected": -244.2327880859375, + "loss": 0.6588, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.7351876497268677, + "rewards/margins": 0.18546277284622192, + "rewards/rejected": 0.5497248768806458, + "step": 1864 + }, + { + "epoch": 0.2884206456601585, + "grad_norm": 6.954251289367676, + "learning_rate": 4.8067010309278354e-06, + "logits/chosen": 3.989964008331299, + "logits/rejected": 2.697913885116577, + "logps/chosen": -398.9283752441406, + "logps/rejected": -371.2256774902344, + "loss": 0.8019, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.6110036969184875, + "rewards/margins": -0.13032497465610504, + "rewards/rejected": 0.7413287162780762, + "step": 1865 + }, + { + "epoch": 0.28857529479992267, + "grad_norm": 12.53165054321289, + "learning_rate": 4.809278350515465e-06, + "logits/chosen": 10.261015892028809, + "logits/rejected": 5.023041725158691, + "logps/chosen": -328.63525390625, + "logps/rejected": -256.99017333984375, + "loss": 0.6777, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.76494300365448, + "rewards/margins": 0.14583933353424072, + "rewards/rejected": 0.6191036701202393, + "step": 1866 + }, + { + "epoch": 0.2887299439396868, + "grad_norm": 8.203725814819336, + "learning_rate": 4.811855670103093e-06, + "logits/chosen": 11.475428581237793, + "logits/rejected": 14.639174461364746, + "logps/chosen": -224.88278198242188, + "logps/rejected": -345.712646484375, + "loss": 0.7935, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.7569916844367981, + "rewards/margins": -0.15340301394462585, + "rewards/rejected": 0.9103946685791016, + "step": 1867 + }, + { + "epoch": 0.28888459307945097, + "grad_norm": 5.280242443084717, + "learning_rate": 4.8144329896907225e-06, + "logits/chosen": 9.474067687988281, + "logits/rejected": 5.182483196258545, + "logps/chosen": -268.6368408203125, + "logps/rejected": -176.23895263671875, + "loss": 0.6656, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.7429871559143066, + "rewards/margins": 0.15090298652648926, + "rewards/rejected": 0.5920841097831726, + "step": 1868 + }, + { + "epoch": 0.2890392422192152, + "grad_norm": 6.1685357093811035, + "learning_rate": 4.817010309278351e-06, + "logits/chosen": 7.478416442871094, + "logits/rejected": 5.823670864105225, + "logps/chosen": -300.9277038574219, + "logps/rejected": -227.3800506591797, + "loss": 0.672, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.6526117324829102, + "rewards/margins": 0.08100330829620361, + "rewards/rejected": 0.5716084241867065, + "step": 1869 + }, + { + "epoch": 0.2891938913589793, + "grad_norm": 4.832686901092529, + "learning_rate": 4.81958762886598e-06, + "logits/chosen": 9.126526832580566, + "logits/rejected": 7.351092338562012, + "logps/chosen": -213.6137237548828, + "logps/rejected": -182.9091339111328, + "loss": 0.6143, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.32643720507621765, + "rewards/margins": 0.24625521898269653, + "rewards/rejected": 0.08018197864294052, + "step": 1870 + }, + { + "epoch": 0.2893485404987435, + "grad_norm": 4.023343086242676, + "learning_rate": 4.822164948453609e-06, + "logits/chosen": 10.134033203125, + "logits/rejected": 10.816076278686523, + "logps/chosen": -245.91146850585938, + "logps/rejected": -273.0650939941406, + "loss": 0.4692, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.7666264772415161, + "rewards/margins": 0.5872600078582764, + "rewards/rejected": 0.17936649918556213, + "step": 1871 + }, + { + "epoch": 0.28950318963850763, + "grad_norm": 5.908870220184326, + "learning_rate": 4.824742268041238e-06, + "logits/chosen": 4.060463905334473, + "logits/rejected": 5.546267986297607, + "logps/chosen": -270.0245361328125, + "logps/rejected": -274.52679443359375, + "loss": 0.684, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.46871280670166016, + "rewards/margins": 0.10605449229478836, + "rewards/rejected": 0.3626583218574524, + "step": 1872 + }, + { + "epoch": 0.2896578387782718, + "grad_norm": 8.977509498596191, + "learning_rate": 4.827319587628866e-06, + "logits/chosen": 7.3169145584106445, + "logits/rejected": 7.2450175285339355, + "logps/chosen": -226.78680419921875, + "logps/rejected": -215.60064697265625, + "loss": 0.682, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.4504333436489105, + "rewards/margins": 0.06812896579504013, + "rewards/rejected": 0.3823044002056122, + "step": 1873 + }, + { + "epoch": 0.28981248791803593, + "grad_norm": 6.160987854003906, + "learning_rate": 4.829896907216496e-06, + "logits/chosen": 10.08566665649414, + "logits/rejected": 15.144140243530273, + "logps/chosen": -248.6840362548828, + "logps/rejected": -341.2016296386719, + "loss": 0.7478, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.6291478872299194, + "rewards/margins": -0.07631361484527588, + "rewards/rejected": 0.7054615020751953, + "step": 1874 + }, + { + "epoch": 0.28996713705780014, + "grad_norm": 14.102324485778809, + "learning_rate": 4.832474226804124e-06, + "logits/chosen": 7.883795738220215, + "logits/rejected": 9.165726661682129, + "logps/chosen": -233.27447509765625, + "logps/rejected": -258.0277404785156, + "loss": 0.9088, + "rewards/accuracies": 0.25, + "rewards/chosen": 0.5611255764961243, + "rewards/margins": -0.3357929289340973, + "rewards/rejected": 0.8969184756278992, + "step": 1875 + }, + { + "epoch": 0.2901217861975643, + "grad_norm": 4.813839912414551, + "learning_rate": 4.835051546391753e-06, + "logits/chosen": 13.958651542663574, + "logits/rejected": 1.5933709144592285, + "logps/chosen": -415.9864807128906, + "logps/rejected": -271.37457275390625, + "loss": 0.4467, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.8756765127182007, + "rewards/margins": 0.6316781044006348, + "rewards/rejected": 0.2439984381198883, + "step": 1876 + }, + { + "epoch": 0.29027643533732844, + "grad_norm": 4.900857448577881, + "learning_rate": 4.837628865979382e-06, + "logits/chosen": 12.156620025634766, + "logits/rejected": 8.674437522888184, + "logps/chosen": -307.1737060546875, + "logps/rejected": -233.92822265625, + "loss": 0.5752, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.7308698296546936, + "rewards/margins": 0.3228812515735626, + "rewards/rejected": 0.407988578081131, + "step": 1877 + }, + { + "epoch": 0.2904310844770926, + "grad_norm": 9.24219799041748, + "learning_rate": 4.84020618556701e-06, + "logits/chosen": 3.6516213417053223, + "logits/rejected": 3.7382593154907227, + "logps/chosen": -181.85975646972656, + "logps/rejected": -220.06683349609375, + "loss": 0.7359, + "rewards/accuracies": 0.25, + "rewards/chosen": 0.41353029012680054, + "rewards/margins": -0.05325116962194443, + "rewards/rejected": 0.4667814373970032, + "step": 1878 + }, + { + "epoch": 0.29058573361685675, + "grad_norm": 5.075562477111816, + "learning_rate": 4.84278350515464e-06, + "logits/chosen": 7.875941276550293, + "logits/rejected": 4.466923713684082, + "logps/chosen": -266.3796691894531, + "logps/rejected": -259.0067138671875, + "loss": 0.5886, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.390921026468277, + "rewards/margins": 0.2779941260814667, + "rewards/rejected": 0.11292685568332672, + "step": 1879 + }, + { + "epoch": 0.2907403827566209, + "grad_norm": 5.127997398376465, + "learning_rate": 4.845360824742268e-06, + "logits/chosen": 7.927471160888672, + "logits/rejected": 7.130305290222168, + "logps/chosen": -258.9188232421875, + "logps/rejected": -238.37164306640625, + "loss": 0.6294, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.6842255592346191, + "rewards/margins": 0.17241010069847107, + "rewards/rejected": 0.5118154287338257, + "step": 1880 + }, + { + "epoch": 0.2908950318963851, + "grad_norm": 10.512694358825684, + "learning_rate": 4.847938144329897e-06, + "logits/chosen": 9.732919692993164, + "logits/rejected": 5.461235523223877, + "logps/chosen": -250.4842987060547, + "logps/rejected": -300.2943420410156, + "loss": 0.9948, + "rewards/accuracies": 0.125, + "rewards/chosen": 0.21167299151420593, + "rewards/margins": -0.4778497517108917, + "rewards/rejected": 0.6895227432250977, + "step": 1881 + }, + { + "epoch": 0.29104968103614925, + "grad_norm": 7.691608905792236, + "learning_rate": 4.850515463917526e-06, + "logits/chosen": 12.284557342529297, + "logits/rejected": 9.69810962677002, + "logps/chosen": -337.4669494628906, + "logps/rejected": -307.0381774902344, + "loss": 0.7999, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.49800485372543335, + "rewards/margins": -0.14114035665988922, + "rewards/rejected": 0.6391451954841614, + "step": 1882 + }, + { + "epoch": 0.2912043301759134, + "grad_norm": 4.9191813468933105, + "learning_rate": 4.853092783505155e-06, + "logits/chosen": 6.480351448059082, + "logits/rejected": 7.531797409057617, + "logps/chosen": -363.82794189453125, + "logps/rejected": -362.02984619140625, + "loss": 0.4703, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.7637132406234741, + "rewards/margins": 0.538051962852478, + "rewards/rejected": 0.2256612479686737, + "step": 1883 + }, + { + "epoch": 0.29135897931567756, + "grad_norm": 6.706927299499512, + "learning_rate": 4.855670103092784e-06, + "logits/chosen": 7.001772880554199, + "logits/rejected": 8.997282028198242, + "logps/chosen": -242.09133911132812, + "logps/rejected": -302.67266845703125, + "loss": 0.6685, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.3338572680950165, + "rewards/margins": 0.17152796685695648, + "rewards/rejected": 0.16232930123806, + "step": 1884 + }, + { + "epoch": 0.2915136284554417, + "grad_norm": 4.6081109046936035, + "learning_rate": 4.858247422680413e-06, + "logits/chosen": 10.130657196044922, + "logits/rejected": 7.838085651397705, + "logps/chosen": -268.6007080078125, + "logps/rejected": -249.39320373535156, + "loss": 0.6622, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.5536366701126099, + "rewards/margins": 0.0809537023305893, + "rewards/rejected": 0.4726829528808594, + "step": 1885 + }, + { + "epoch": 0.29166827759520586, + "grad_norm": 5.161870002746582, + "learning_rate": 4.860824742268041e-06, + "logits/chosen": 8.898221969604492, + "logits/rejected": 8.204389572143555, + "logps/chosen": -246.04754638671875, + "logps/rejected": -285.5431823730469, + "loss": 0.6054, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.5714199542999268, + "rewards/margins": 0.24117514491081238, + "rewards/rejected": 0.33024483919143677, + "step": 1886 + }, + { + "epoch": 0.29182292673497, + "grad_norm": 5.844018936157227, + "learning_rate": 4.863402061855671e-06, + "logits/chosen": 2.492987632751465, + "logits/rejected": 2.259042501449585, + "logps/chosen": -177.7725830078125, + "logps/rejected": -200.97930908203125, + "loss": 0.7019, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.3963301479816437, + "rewards/margins": 0.000283166766166687, + "rewards/rejected": 0.3960469961166382, + "step": 1887 + }, + { + "epoch": 0.2919775758747342, + "grad_norm": 6.1057000160217285, + "learning_rate": 4.865979381443299e-06, + "logits/chosen": 11.475907325744629, + "logits/rejected": 4.0576653480529785, + "logps/chosen": -369.1065673828125, + "logps/rejected": -285.66680908203125, + "loss": 0.4942, + "rewards/accuracies": 0.75, + "rewards/chosen": 1.0629470348358154, + "rewards/margins": 0.6337527632713318, + "rewards/rejected": 0.4291943311691284, + "step": 1888 + }, + { + "epoch": 0.29213222501449837, + "grad_norm": 7.132784843444824, + "learning_rate": 4.868556701030928e-06, + "logits/chosen": 8.809747695922852, + "logits/rejected": 8.460738182067871, + "logps/chosen": -268.1192626953125, + "logps/rejected": -250.61138916015625, + "loss": 0.6827, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.5380210876464844, + "rewards/margins": 0.03575601428747177, + "rewards/rejected": 0.5022650957107544, + "step": 1889 + }, + { + "epoch": 0.2922868741542625, + "grad_norm": 6.852219104766846, + "learning_rate": 4.871134020618557e-06, + "logits/chosen": 4.585185527801514, + "logits/rejected": 6.711069583892822, + "logps/chosen": -231.81878662109375, + "logps/rejected": -285.2042236328125, + "loss": 0.8898, + "rewards/accuracies": 0.25, + "rewards/chosen": 0.20117750763893127, + "rewards/margins": -0.3282470703125, + "rewards/rejected": 0.5294245481491089, + "step": 1890 + }, + { + "epoch": 0.2924415232940267, + "grad_norm": 5.409084796905518, + "learning_rate": 4.873711340206186e-06, + "logits/chosen": 9.516378402709961, + "logits/rejected": 6.283413410186768, + "logps/chosen": -245.01898193359375, + "logps/rejected": -178.39122009277344, + "loss": 0.6912, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.4946885406970978, + "rewards/margins": 0.02823571488261223, + "rewards/rejected": 0.46645283699035645, + "step": 1891 + }, + { + "epoch": 0.2925961724337908, + "grad_norm": 7.7536540031433105, + "learning_rate": 4.8762886597938146e-06, + "logits/chosen": 10.342660903930664, + "logits/rejected": 12.251018524169922, + "logps/chosen": -301.5098876953125, + "logps/rejected": -310.0304870605469, + "loss": 0.6272, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.42624300718307495, + "rewards/margins": 0.20878592133522034, + "rewards/rejected": 0.2174571007490158, + "step": 1892 + }, + { + "epoch": 0.292750821573555, + "grad_norm": 6.0120768547058105, + "learning_rate": 4.878865979381444e-06, + "logits/chosen": 2.740140199661255, + "logits/rejected": 7.8863372802734375, + "logps/chosen": -238.9498748779297, + "logps/rejected": -266.8082580566406, + "loss": 0.8034, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.48856353759765625, + "rewards/margins": -0.156171977519989, + "rewards/rejected": 0.6447355151176453, + "step": 1893 + }, + { + "epoch": 0.2929054707133192, + "grad_norm": 5.141645908355713, + "learning_rate": 4.881443298969072e-06, + "logits/chosen": 11.683637619018555, + "logits/rejected": 5.295614242553711, + "logps/chosen": -350.488525390625, + "logps/rejected": -293.3786926269531, + "loss": 0.6445, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.3900063633918762, + "rewards/margins": 0.25055134296417236, + "rewards/rejected": 0.13945499062538147, + "step": 1894 + }, + { + "epoch": 0.29306011985308333, + "grad_norm": 7.291069984436035, + "learning_rate": 4.884020618556702e-06, + "logits/chosen": 5.5757598876953125, + "logits/rejected": 7.077906608581543, + "logps/chosen": -228.49021911621094, + "logps/rejected": -351.3414306640625, + "loss": 0.8233, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.13726310431957245, + "rewards/margins": -0.2067205309867859, + "rewards/rejected": 0.34398362040519714, + "step": 1895 + }, + { + "epoch": 0.2932147689928475, + "grad_norm": 4.559737205505371, + "learning_rate": 4.88659793814433e-06, + "logits/chosen": 9.19395923614502, + "logits/rejected": 6.853704452514648, + "logps/chosen": -184.25628662109375, + "logps/rejected": -172.0687255859375, + "loss": 0.6758, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.3070237338542938, + "rewards/margins": 0.07533854991197586, + "rewards/rejected": 0.23168519139289856, + "step": 1896 + }, + { + "epoch": 0.29336941813261164, + "grad_norm": 5.506324768066406, + "learning_rate": 4.889175257731959e-06, + "logits/chosen": 11.05807113647461, + "logits/rejected": 11.027753829956055, + "logps/chosen": -261.2674865722656, + "logps/rejected": -253.5176239013672, + "loss": 0.6262, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.4731980860233307, + "rewards/margins": 0.2658166289329529, + "rewards/rejected": 0.2073814421892166, + "step": 1897 + }, + { + "epoch": 0.2935240672723758, + "grad_norm": 4.869597434997559, + "learning_rate": 4.891752577319588e-06, + "logits/chosen": 6.269387245178223, + "logits/rejected": -1.3710306882858276, + "logps/chosen": -396.888916015625, + "logps/rejected": -213.6097412109375, + "loss": 0.5333, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.6167778372764587, + "rewards/margins": 0.44955456256866455, + "rewards/rejected": 0.1672232747077942, + "step": 1898 + }, + { + "epoch": 0.29367871641213994, + "grad_norm": 5.392108917236328, + "learning_rate": 4.894329896907217e-06, + "logits/chosen": 13.807971954345703, + "logits/rejected": 9.324326515197754, + "logps/chosen": -290.9385681152344, + "logps/rejected": -183.27549743652344, + "loss": 0.7525, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.24965815246105194, + "rewards/margins": -0.06177006661891937, + "rewards/rejected": 0.3114282190799713, + "step": 1899 + }, + { + "epoch": 0.2938333655519041, + "grad_norm": 6.8372907638549805, + "learning_rate": 4.8969072164948455e-06, + "logits/chosen": 15.559110641479492, + "logits/rejected": 12.51849365234375, + "logps/chosen": -301.92877197265625, + "logps/rejected": -255.7155303955078, + "loss": 0.7602, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.32814091444015503, + "rewards/margins": -0.06982055306434631, + "rewards/rejected": 0.39796149730682373, + "step": 1900 + }, + { + "epoch": 0.2939880146916683, + "grad_norm": 25.758333206176758, + "learning_rate": 4.899484536082475e-06, + "logits/chosen": 5.710659027099609, + "logits/rejected": 8.67061710357666, + "logps/chosen": -101.15560913085938, + "logps/rejected": -179.40164184570312, + "loss": 0.7277, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.008172348141670227, + "rewards/margins": -0.021855982020497322, + "rewards/rejected": 0.013683632016181946, + "step": 1901 + }, + { + "epoch": 0.29414266383143245, + "grad_norm": 7.125190734863281, + "learning_rate": 4.902061855670103e-06, + "logits/chosen": 10.322349548339844, + "logits/rejected": 5.1161346435546875, + "logps/chosen": -475.27099609375, + "logps/rejected": -407.8321533203125, + "loss": 0.6615, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.4830693006515503, + "rewards/margins": 0.10783567279577255, + "rewards/rejected": 0.3752336800098419, + "step": 1902 + }, + { + "epoch": 0.2942973129711966, + "grad_norm": 5.718871593475342, + "learning_rate": 4.904639175257732e-06, + "logits/chosen": 11.220110893249512, + "logits/rejected": 4.4469146728515625, + "logps/chosen": -397.81866455078125, + "logps/rejected": -423.42266845703125, + "loss": 0.571, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.5649611949920654, + "rewards/margins": 0.33048421144485474, + "rewards/rejected": 0.23447701334953308, + "step": 1903 + }, + { + "epoch": 0.29445196211096075, + "grad_norm": 5.313636302947998, + "learning_rate": 4.907216494845361e-06, + "logits/chosen": 13.76280403137207, + "logits/rejected": 7.907626152038574, + "logps/chosen": -296.14886474609375, + "logps/rejected": -302.18231201171875, + "loss": 0.5519, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.5160866975784302, + "rewards/margins": 0.4192940592765808, + "rewards/rejected": 0.09679260104894638, + "step": 1904 + }, + { + "epoch": 0.2946066112507249, + "grad_norm": 5.146751880645752, + "learning_rate": 4.9097938144329895e-06, + "logits/chosen": 4.972388744354248, + "logits/rejected": 8.123661994934082, + "logps/chosen": -188.42562866210938, + "logps/rejected": -231.59475708007812, + "loss": 0.6479, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.2599448263645172, + "rewards/margins": 0.15737327933311462, + "rewards/rejected": 0.10257153958082199, + "step": 1905 + }, + { + "epoch": 0.29476126039048905, + "grad_norm": 4.919521331787109, + "learning_rate": 4.912371134020619e-06, + "logits/chosen": 9.509730339050293, + "logits/rejected": 6.672048091888428, + "logps/chosen": -227.66221618652344, + "logps/rejected": -260.90985107421875, + "loss": 0.7182, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.5440791845321655, + "rewards/margins": 0.03063793107867241, + "rewards/rejected": 0.513441264629364, + "step": 1906 + }, + { + "epoch": 0.29491590953025326, + "grad_norm": 6.301992416381836, + "learning_rate": 4.914948453608247e-06, + "logits/chosen": 10.11520004272461, + "logits/rejected": 7.37161922454834, + "logps/chosen": -329.0301513671875, + "logps/rejected": -180.2372589111328, + "loss": 0.7617, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.19168326258659363, + "rewards/margins": -0.044951409101486206, + "rewards/rejected": 0.23663464188575745, + "step": 1907 + }, + { + "epoch": 0.2950705586700174, + "grad_norm": 7.471937656402588, + "learning_rate": 4.9175257731958765e-06, + "logits/chosen": 15.103532791137695, + "logits/rejected": 11.24356460571289, + "logps/chosen": -305.7756652832031, + "logps/rejected": -300.25408935546875, + "loss": 0.4992, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.7134256362915039, + "rewards/margins": 0.472978800535202, + "rewards/rejected": 0.24044686555862427, + "step": 1908 + }, + { + "epoch": 0.29522520780978156, + "grad_norm": 5.336564540863037, + "learning_rate": 4.920103092783505e-06, + "logits/chosen": 8.325382232666016, + "logits/rejected": 9.150381088256836, + "logps/chosen": -185.45504760742188, + "logps/rejected": -240.892333984375, + "loss": 0.7211, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.3793754577636719, + "rewards/margins": 0.0637800320982933, + "rewards/rejected": 0.31559544801712036, + "step": 1909 + }, + { + "epoch": 0.2953798569495457, + "grad_norm": 5.9475998878479, + "learning_rate": 4.922680412371135e-06, + "logits/chosen": 9.271333694458008, + "logits/rejected": 7.484991073608398, + "logps/chosen": -305.6849670410156, + "logps/rejected": -296.37451171875, + "loss": 0.7072, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.3814612925052643, + "rewards/margins": 0.027897104620933533, + "rewards/rejected": 0.35356417298316956, + "step": 1910 + }, + { + "epoch": 0.29553450608930987, + "grad_norm": 5.959292888641357, + "learning_rate": 4.9252577319587635e-06, + "logits/chosen": 6.174468040466309, + "logits/rejected": 1.7129467725753784, + "logps/chosen": -331.75115966796875, + "logps/rejected": -309.6365661621094, + "loss": 0.6681, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.6077805757522583, + "rewards/margins": 0.12058689445257187, + "rewards/rejected": 0.48719361424446106, + "step": 1911 + }, + { + "epoch": 0.295689155229074, + "grad_norm": 5.185881614685059, + "learning_rate": 4.927835051546392e-06, + "logits/chosen": 14.473819732666016, + "logits/rejected": 6.585643768310547, + "logps/chosen": -234.30099487304688, + "logps/rejected": -189.82171630859375, + "loss": 0.6641, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.3584960699081421, + "rewards/margins": 0.11982384324073792, + "rewards/rejected": 0.23867221176624298, + "step": 1912 + }, + { + "epoch": 0.2958438043688382, + "grad_norm": 5.288536548614502, + "learning_rate": 4.930412371134021e-06, + "logits/chosen": 17.776643753051758, + "logits/rejected": 13.918618202209473, + "logps/chosen": -286.01763916015625, + "logps/rejected": -267.4415283203125, + "loss": 0.5566, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.7876964807510376, + "rewards/margins": 0.341988205909729, + "rewards/rejected": 0.4457082748413086, + "step": 1913 + }, + { + "epoch": 0.2959984535086024, + "grad_norm": 8.25245475769043, + "learning_rate": 4.93298969072165e-06, + "logits/chosen": 5.127202987670898, + "logits/rejected": 8.262370109558105, + "logps/chosen": -217.8651123046875, + "logps/rejected": -223.11862182617188, + "loss": 0.7562, + "rewards/accuracies": 0.25, + "rewards/chosen": 0.240508571267128, + "rewards/margins": -0.1099180281162262, + "rewards/rejected": 0.3504266142845154, + "step": 1914 + }, + { + "epoch": 0.29615310264836653, + "grad_norm": 4.382015705108643, + "learning_rate": 4.935567010309279e-06, + "logits/chosen": 2.3040614128112793, + "logits/rejected": 10.225667953491211, + "logps/chosen": -132.89752197265625, + "logps/rejected": -221.07054138183594, + "loss": 0.6706, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.23351004719734192, + "rewards/margins": 0.11221817135810852, + "rewards/rejected": 0.121291883289814, + "step": 1915 + }, + { + "epoch": 0.2963077517881307, + "grad_norm": 5.382826328277588, + "learning_rate": 4.9381443298969075e-06, + "logits/chosen": 9.483293533325195, + "logits/rejected": 6.829405307769775, + "logps/chosen": -416.3201904296875, + "logps/rejected": -375.0426025390625, + "loss": 0.5482, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.6910586357116699, + "rewards/margins": 0.46107226610183716, + "rewards/rejected": 0.22998639941215515, + "step": 1916 + }, + { + "epoch": 0.29646240092789483, + "grad_norm": 5.007412910461426, + "learning_rate": 4.940721649484537e-06, + "logits/chosen": 7.947233200073242, + "logits/rejected": 3.218405246734619, + "logps/chosen": -222.14443969726562, + "logps/rejected": -171.01058959960938, + "loss": 0.715, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.18438078463077545, + "rewards/margins": 0.0009992271661758423, + "rewards/rejected": 0.18338152766227722, + "step": 1917 + }, + { + "epoch": 0.296617050067659, + "grad_norm": 4.753635883331299, + "learning_rate": 4.943298969072165e-06, + "logits/chosen": 6.59550666809082, + "logits/rejected": 8.799602508544922, + "logps/chosen": -198.42910766601562, + "logps/rejected": -260.1075744628906, + "loss": 0.6509, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.2855777144432068, + "rewards/margins": 0.1866045594215393, + "rewards/rejected": 0.09897316992282867, + "step": 1918 + }, + { + "epoch": 0.29677169920742313, + "grad_norm": 5.054181098937988, + "learning_rate": 4.9458762886597945e-06, + "logits/chosen": 8.46769905090332, + "logits/rejected": 6.968910217285156, + "logps/chosen": -210.31954956054688, + "logps/rejected": -207.2449951171875, + "loss": 0.7131, + "rewards/accuracies": 0.125, + "rewards/chosen": 0.26277339458465576, + "rewards/margins": -0.032272592186927795, + "rewards/rejected": 0.29504600167274475, + "step": 1919 + }, + { + "epoch": 0.29692634834718734, + "grad_norm": 4.9403581619262695, + "learning_rate": 4.948453608247423e-06, + "logits/chosen": 5.502076625823975, + "logits/rejected": 5.065120220184326, + "logps/chosen": -233.93626403808594, + "logps/rejected": -219.66500854492188, + "loss": 0.6379, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.15127001702785492, + "rewards/margins": 0.28466981649398804, + "rewards/rejected": -0.1333998143672943, + "step": 1920 + }, + { + "epoch": 0.2970809974869515, + "grad_norm": 5.581308841705322, + "learning_rate": 4.951030927835052e-06, + "logits/chosen": 8.548807144165039, + "logits/rejected": 6.752242565155029, + "logps/chosen": -239.69493103027344, + "logps/rejected": -214.94403076171875, + "loss": 0.7011, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.33907243609428406, + "rewards/margins": 0.01254080981016159, + "rewards/rejected": 0.32653161883354187, + "step": 1921 + }, + { + "epoch": 0.29723564662671564, + "grad_norm": 7.465334892272949, + "learning_rate": 4.953608247422681e-06, + "logits/chosen": 8.181343078613281, + "logits/rejected": 7.318568229675293, + "logps/chosen": -257.0267333984375, + "logps/rejected": -235.25161743164062, + "loss": 0.7066, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.18755990266799927, + "rewards/margins": 0.05862199515104294, + "rewards/rejected": 0.12893790006637573, + "step": 1922 + }, + { + "epoch": 0.2973902957664798, + "grad_norm": 6.766858100891113, + "learning_rate": 4.95618556701031e-06, + "logits/chosen": 10.215593338012695, + "logits/rejected": 9.005621910095215, + "logps/chosen": -339.88751220703125, + "logps/rejected": -298.79998779296875, + "loss": 0.8354, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.3374425768852234, + "rewards/margins": -0.14313651621341705, + "rewards/rejected": 0.48057910799980164, + "step": 1923 + }, + { + "epoch": 0.29754494490624395, + "grad_norm": 7.537698745727539, + "learning_rate": 4.9587628865979385e-06, + "logits/chosen": 3.9117486476898193, + "logits/rejected": 8.305133819580078, + "logps/chosen": -304.59515380859375, + "logps/rejected": -311.7738037109375, + "loss": 0.6832, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.1922599971294403, + "rewards/margins": 0.0949392318725586, + "rewards/rejected": 0.09732075035572052, + "step": 1924 + }, + { + "epoch": 0.2976995940460081, + "grad_norm": 6.216429710388184, + "learning_rate": 4.961340206185568e-06, + "logits/chosen": 11.453048706054688, + "logits/rejected": 8.348071098327637, + "logps/chosen": -244.04605102539062, + "logps/rejected": -194.7714385986328, + "loss": 0.6168, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.3159163296222687, + "rewards/margins": 0.2807968258857727, + "rewards/rejected": 0.03511953726410866, + "step": 1925 + }, + { + "epoch": 0.2978542431857723, + "grad_norm": 5.361112117767334, + "learning_rate": 4.963917525773196e-06, + "logits/chosen": 8.843154907226562, + "logits/rejected": 6.045343399047852, + "logps/chosen": -215.91107177734375, + "logps/rejected": -151.71206665039062, + "loss": 0.7189, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.15566149353981018, + "rewards/margins": -0.0055234357714653015, + "rewards/rejected": 0.16118493676185608, + "step": 1926 + }, + { + "epoch": 0.29800889232553646, + "grad_norm": 4.3612589836120605, + "learning_rate": 4.9664948453608255e-06, + "logits/chosen": 9.401978492736816, + "logits/rejected": 4.440295219421387, + "logps/chosen": -277.6397399902344, + "logps/rejected": -153.89511108398438, + "loss": 0.5945, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.2461000382900238, + "rewards/margins": 0.24713626503944397, + "rewards/rejected": -0.001036219298839569, + "step": 1927 + }, + { + "epoch": 0.2981635414653006, + "grad_norm": 6.060153484344482, + "learning_rate": 4.969072164948454e-06, + "logits/chosen": 3.191788673400879, + "logits/rejected": 9.357587814331055, + "logps/chosen": -161.32582092285156, + "logps/rejected": -197.40357971191406, + "loss": 0.7306, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.21285487711429596, + "rewards/margins": -0.030037857592105865, + "rewards/rejected": 0.24289274215698242, + "step": 1928 + }, + { + "epoch": 0.29831819060506476, + "grad_norm": 4.475562572479248, + "learning_rate": 4.971649484536083e-06, + "logits/chosen": 9.300079345703125, + "logits/rejected": 4.895201206207275, + "logps/chosen": -266.2282409667969, + "logps/rejected": -223.19154357910156, + "loss": 0.6439, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.2918722629547119, + "rewards/margins": 0.12380611151456833, + "rewards/rejected": 0.16806615889072418, + "step": 1929 + }, + { + "epoch": 0.2984728397448289, + "grad_norm": 5.8332014083862305, + "learning_rate": 4.974226804123712e-06, + "logits/chosen": 4.643984317779541, + "logits/rejected": 5.1854248046875, + "logps/chosen": -207.89447021484375, + "logps/rejected": -229.30625915527344, + "loss": 0.671, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.19573122262954712, + "rewards/margins": 0.0773552656173706, + "rewards/rejected": 0.11837596446275711, + "step": 1930 + }, + { + "epoch": 0.29862748888459306, + "grad_norm": 5.776227951049805, + "learning_rate": 4.976804123711341e-06, + "logits/chosen": 18.28095817565918, + "logits/rejected": 6.923507213592529, + "logps/chosen": -321.23638916015625, + "logps/rejected": -317.8025817871094, + "loss": 0.6399, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.5662283897399902, + "rewards/margins": 0.18982170522212982, + "rewards/rejected": 0.3764066994190216, + "step": 1931 + }, + { + "epoch": 0.2987821380243572, + "grad_norm": 6.289240837097168, + "learning_rate": 4.9793814432989694e-06, + "logits/chosen": 8.18239974975586, + "logits/rejected": 8.755838394165039, + "logps/chosen": -270.99853515625, + "logps/rejected": -321.1993103027344, + "loss": 0.6784, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.31380495429039, + "rewards/margins": 0.25641822814941406, + "rewards/rejected": 0.05738672614097595, + "step": 1932 + }, + { + "epoch": 0.2989367871641214, + "grad_norm": 4.342260837554932, + "learning_rate": 4.981958762886599e-06, + "logits/chosen": 11.738896369934082, + "logits/rejected": -0.8477178812026978, + "logps/chosen": -318.3328552246094, + "logps/rejected": -168.71299743652344, + "loss": 0.5549, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.6941225528717041, + "rewards/margins": 0.32181793451309204, + "rewards/rejected": 0.3723045587539673, + "step": 1933 + }, + { + "epoch": 0.29909143630388557, + "grad_norm": 3.701807737350464, + "learning_rate": 4.984536082474227e-06, + "logits/chosen": 9.444266319274902, + "logits/rejected": 5.12848424911499, + "logps/chosen": -149.723876953125, + "logps/rejected": -140.1483154296875, + "loss": 0.6138, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.0888395830988884, + "rewards/margins": 0.236131489276886, + "rewards/rejected": -0.147291898727417, + "step": 1934 + }, + { + "epoch": 0.2992460854436497, + "grad_norm": 6.159956455230713, + "learning_rate": 4.9871134020618565e-06, + "logits/chosen": 10.32263469696045, + "logits/rejected": 12.635894775390625, + "logps/chosen": -237.2451934814453, + "logps/rejected": -269.63775634765625, + "loss": 0.7376, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.1999063491821289, + "rewards/margins": -0.04331938549876213, + "rewards/rejected": 0.24322575330734253, + "step": 1935 + }, + { + "epoch": 0.2994007345834139, + "grad_norm": 5.250171184539795, + "learning_rate": 4.989690721649485e-06, + "logits/chosen": 9.438034057617188, + "logits/rejected": 7.704252243041992, + "logps/chosen": -328.2623291015625, + "logps/rejected": -293.2724609375, + "loss": 0.5856, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.10132008790969849, + "rewards/margins": 0.2781168818473816, + "rewards/rejected": -0.1767968088388443, + "step": 1936 + }, + { + "epoch": 0.299555383723178, + "grad_norm": 5.326220989227295, + "learning_rate": 4.992268041237114e-06, + "logits/chosen": 9.928227424621582, + "logits/rejected": 4.256413459777832, + "logps/chosen": -305.5445556640625, + "logps/rejected": -260.82257080078125, + "loss": 0.6563, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.3811005651950836, + "rewards/margins": 0.1830521821975708, + "rewards/rejected": 0.19804838299751282, + "step": 1937 + }, + { + "epoch": 0.2997100328629422, + "grad_norm": 5.169743537902832, + "learning_rate": 4.994845360824743e-06, + "logits/chosen": 9.099005699157715, + "logits/rejected": 7.959979057312012, + "logps/chosen": -203.81283569335938, + "logps/rejected": -144.13304138183594, + "loss": 0.6044, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.26745331287384033, + "rewards/margins": 0.2917126417160034, + "rewards/rejected": -0.024259347468614578, + "step": 1938 + }, + { + "epoch": 0.2998646820027064, + "grad_norm": 6.053447246551514, + "learning_rate": 4.997422680412372e-06, + "logits/chosen": 9.17519760131836, + "logits/rejected": 8.03541374206543, + "logps/chosen": -342.9123840332031, + "logps/rejected": -308.0648193359375, + "loss": 0.6035, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.4306587278842926, + "rewards/margins": 0.2345738410949707, + "rewards/rejected": 0.1960848867893219, + "step": 1939 + }, + { + "epoch": 0.30001933114247054, + "grad_norm": 10.721685409545898, + "learning_rate": 5e-06, + "logits/chosen": 11.93785572052002, + "logits/rejected": 9.35814380645752, + "logps/chosen": -360.3119201660156, + "logps/rejected": -281.695068359375, + "loss": 0.7666, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.33686235547065735, + "rewards/margins": -0.09953603893518448, + "rewards/rejected": 0.436398446559906, + "step": 1940 + }, + { + "epoch": 0.3001739802822347, + "grad_norm": 7.545107841491699, + "learning_rate": 4.999713598350327e-06, + "logits/chosen": 13.678266525268555, + "logits/rejected": 5.557208061218262, + "logps/chosen": -290.08184814453125, + "logps/rejected": -289.8662414550781, + "loss": 0.7128, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.10415621101856232, + "rewards/margins": 0.02573418617248535, + "rewards/rejected": 0.07842201739549637, + "step": 1941 + }, + { + "epoch": 0.30032862942199884, + "grad_norm": 6.301037788391113, + "learning_rate": 4.999427196700654e-06, + "logits/chosen": 18.108572006225586, + "logits/rejected": 5.521419048309326, + "logps/chosen": -401.7284851074219, + "logps/rejected": -217.22152709960938, + "loss": 0.6597, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.3661457896232605, + "rewards/margins": 0.10110605508089066, + "rewards/rejected": 0.26503968238830566, + "step": 1942 + }, + { + "epoch": 0.300483278561763, + "grad_norm": 6.519114017486572, + "learning_rate": 4.99914079505098e-06, + "logits/chosen": 8.60999870300293, + "logits/rejected": 8.846593856811523, + "logps/chosen": -323.40234375, + "logps/rejected": -315.4803161621094, + "loss": 0.8658, + "rewards/accuracies": 0.25, + "rewards/chosen": 0.3486313819885254, + "rewards/margins": -0.27207040786743164, + "rewards/rejected": 0.6207018494606018, + "step": 1943 + }, + { + "epoch": 0.30063792770152714, + "grad_norm": 8.513544082641602, + "learning_rate": 4.998854393401306e-06, + "logits/chosen": 6.264193534851074, + "logits/rejected": 6.085964202880859, + "logps/chosen": -300.51007080078125, + "logps/rejected": -305.0381774902344, + "loss": 0.8527, + "rewards/accuracies": 0.25, + "rewards/chosen": 0.02136526256799698, + "rewards/margins": -0.23535224795341492, + "rewards/rejected": 0.2567175030708313, + "step": 1944 + }, + { + "epoch": 0.3007925768412913, + "grad_norm": 8.875266075134277, + "learning_rate": 4.998567991751633e-06, + "logits/chosen": 12.945000648498535, + "logits/rejected": 6.4350266456604, + "logps/chosen": -488.75390625, + "logps/rejected": -327.65692138671875, + "loss": 0.4758, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.7236288189888, + "rewards/margins": 0.513248085975647, + "rewards/rejected": 0.21038076281547546, + "step": 1945 + }, + { + "epoch": 0.3009472259810555, + "grad_norm": 6.929609775543213, + "learning_rate": 4.9982815901019595e-06, + "logits/chosen": 9.850441932678223, + "logits/rejected": 6.299242973327637, + "logps/chosen": -321.49749755859375, + "logps/rejected": -262.7158508300781, + "loss": 0.7369, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.2698798179626465, + "rewards/margins": -0.0516773946583271, + "rewards/rejected": 0.3215572237968445, + "step": 1946 + }, + { + "epoch": 0.30110187512081965, + "grad_norm": 6.095489501953125, + "learning_rate": 4.997995188452286e-06, + "logits/chosen": 11.506305694580078, + "logits/rejected": 14.566590309143066, + "logps/chosen": -274.4747619628906, + "logps/rejected": -304.9311218261719, + "loss": 0.6408, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.35204535722732544, + "rewards/margins": 0.1680188626050949, + "rewards/rejected": 0.18402647972106934, + "step": 1947 + }, + { + "epoch": 0.3012565242605838, + "grad_norm": 8.11747932434082, + "learning_rate": 4.997708786802613e-06, + "logits/chosen": 9.627846717834473, + "logits/rejected": 10.565401077270508, + "logps/chosen": -280.70074462890625, + "logps/rejected": -286.8978271484375, + "loss": 0.9348, + "rewards/accuracies": 0.25, + "rewards/chosen": 0.27578890323638916, + "rewards/margins": -0.39433878660202026, + "rewards/rejected": 0.6701276898384094, + "step": 1948 + }, + { + "epoch": 0.30141117340034795, + "grad_norm": 5.044299125671387, + "learning_rate": 4.997422385152939e-06, + "logits/chosen": 7.704577445983887, + "logits/rejected": 11.479877471923828, + "logps/chosen": -190.8633575439453, + "logps/rejected": -219.56350708007812, + "loss": 0.6736, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.0736057311296463, + "rewards/margins": 0.11645365506410599, + "rewards/rejected": -0.042847927659749985, + "step": 1949 + }, + { + "epoch": 0.3015658225401121, + "grad_norm": 5.353774070739746, + "learning_rate": 4.997135983503265e-06, + "logits/chosen": -0.5301303863525391, + "logits/rejected": 2.583714723587036, + "logps/chosen": -161.01283264160156, + "logps/rejected": -195.58291625976562, + "loss": 0.6867, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.001950286328792572, + "rewards/margins": 0.060441188514232635, + "rewards/rejected": -0.05849090963602066, + "step": 1950 + }, + { + "epoch": 0.30172047167987626, + "grad_norm": 11.192580223083496, + "learning_rate": 4.996849581853592e-06, + "logits/chosen": 14.546045303344727, + "logits/rejected": 7.657373428344727, + "logps/chosen": -268.5328674316406, + "logps/rejected": -163.61904907226562, + "loss": 0.6853, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.22746187448501587, + "rewards/margins": 0.06721755117177963, + "rewards/rejected": 0.16024431586265564, + "step": 1951 + }, + { + "epoch": 0.30187512081964046, + "grad_norm": 5.781955242156982, + "learning_rate": 4.9965631802039185e-06, + "logits/chosen": 5.6796345710754395, + "logits/rejected": 5.839045524597168, + "logps/chosen": -244.78334045410156, + "logps/rejected": -272.55352783203125, + "loss": 0.6228, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.37983834743499756, + "rewards/margins": 0.24405992031097412, + "rewards/rejected": 0.13577842712402344, + "step": 1952 + }, + { + "epoch": 0.3020297699594046, + "grad_norm": 3.289320230484009, + "learning_rate": 4.996276778554245e-06, + "logits/chosen": 7.486164569854736, + "logits/rejected": 6.053140640258789, + "logps/chosen": -139.509765625, + "logps/rejected": -154.30303955078125, + "loss": 0.5658, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.3910551369190216, + "rewards/margins": 0.35984504222869873, + "rewards/rejected": 0.03121008723974228, + "step": 1953 + }, + { + "epoch": 0.30218441909916877, + "grad_norm": 6.5685882568359375, + "learning_rate": 4.995990376904572e-06, + "logits/chosen": 6.637234687805176, + "logits/rejected": 4.8490309715271, + "logps/chosen": -253.0272979736328, + "logps/rejected": -243.67648315429688, + "loss": 0.8978, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.18919944763183594, + "rewards/margins": -0.27830982208251953, + "rewards/rejected": 0.46750926971435547, + "step": 1954 + }, + { + "epoch": 0.3023390682389329, + "grad_norm": 10.512470245361328, + "learning_rate": 4.995703975254898e-06, + "logits/chosen": 10.211843490600586, + "logits/rejected": 9.98330020904541, + "logps/chosen": -292.5981140136719, + "logps/rejected": -256.3668212890625, + "loss": 0.7808, + "rewards/accuracies": 0.25, + "rewards/chosen": 0.46049395203590393, + "rewards/margins": -0.12379749119281769, + "rewards/rejected": 0.5842914581298828, + "step": 1955 + }, + { + "epoch": 0.30249371737869707, + "grad_norm": 7.871715068817139, + "learning_rate": 4.995417573605224e-06, + "logits/chosen": 10.148628234863281, + "logits/rejected": 10.65623664855957, + "logps/chosen": -237.61911010742188, + "logps/rejected": -261.0722351074219, + "loss": 0.8023, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.018633782863616943, + "rewards/margins": -0.12444295734167099, + "rewards/rejected": 0.14307674765586853, + "step": 1956 + }, + { + "epoch": 0.3026483665184612, + "grad_norm": 4.232603549957275, + "learning_rate": 4.995131171955551e-06, + "logits/chosen": 8.947652816772461, + "logits/rejected": 7.929649829864502, + "logps/chosen": -158.567138671875, + "logps/rejected": -170.98976135253906, + "loss": 0.5916, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.2423844337463379, + "rewards/margins": 0.2443198263645172, + "rewards/rejected": -0.0019353926181793213, + "step": 1957 + }, + { + "epoch": 0.3028030156582254, + "grad_norm": 6.2412614822387695, + "learning_rate": 4.9948447703058776e-06, + "logits/chosen": 9.776228904724121, + "logits/rejected": 5.629969120025635, + "logps/chosen": -281.0980529785156, + "logps/rejected": -257.5005798339844, + "loss": 0.5469, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.4187763035297394, + "rewards/margins": 0.3742130398750305, + "rewards/rejected": 0.04456329345703125, + "step": 1958 + }, + { + "epoch": 0.3029576647979896, + "grad_norm": 3.4976234436035156, + "learning_rate": 4.994558368656204e-06, + "logits/chosen": 8.489951133728027, + "logits/rejected": 10.002816200256348, + "logps/chosen": -223.14892578125, + "logps/rejected": -228.04150390625, + "loss": 0.5302, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.6216352581977844, + "rewards/margins": 0.41556477546691895, + "rewards/rejected": 0.20607048273086548, + "step": 1959 + }, + { + "epoch": 0.30311231393775373, + "grad_norm": 103.04702758789062, + "learning_rate": 4.99427196700653e-06, + "logits/chosen": 6.267934799194336, + "logits/rejected": 10.490842819213867, + "logps/chosen": -259.6253662109375, + "logps/rejected": -306.8897399902344, + "loss": 0.5228, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.5385517477989197, + "rewards/margins": 0.44119182229042053, + "rewards/rejected": 0.09735993295907974, + "step": 1960 + }, + { + "epoch": 0.3032669630775179, + "grad_norm": 6.962917804718018, + "learning_rate": 4.993985565356857e-06, + "logits/chosen": 10.575376510620117, + "logits/rejected": 11.073034286499023, + "logps/chosen": -247.47543334960938, + "logps/rejected": -222.78102111816406, + "loss": 0.8459, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.11868830025196075, + "rewards/margins": -0.2354481816291809, + "rewards/rejected": 0.35413646697998047, + "step": 1961 + }, + { + "epoch": 0.30342161221728203, + "grad_norm": 4.687536239624023, + "learning_rate": 4.993699163707183e-06, + "logits/chosen": 7.472748279571533, + "logits/rejected": 5.770908355712891, + "logps/chosen": -238.845947265625, + "logps/rejected": -196.5159912109375, + "loss": 0.6528, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.4907844066619873, + "rewards/margins": 0.1519974172115326, + "rewards/rejected": 0.3387869596481323, + "step": 1962 + }, + { + "epoch": 0.3035762613570462, + "grad_norm": 8.46551513671875, + "learning_rate": 4.99341276205751e-06, + "logits/chosen": 4.822601318359375, + "logits/rejected": 3.2210443019866943, + "logps/chosen": -292.7070007324219, + "logps/rejected": -211.46051025390625, + "loss": 0.8919, + "rewards/accuracies": 0.25, + "rewards/chosen": -0.25995302200317383, + "rewards/margins": -0.2796034812927246, + "rewards/rejected": 0.01965045928955078, + "step": 1963 + }, + { + "epoch": 0.30373091049681034, + "grad_norm": 4.55012321472168, + "learning_rate": 4.993126360407836e-06, + "logits/chosen": 12.848557472229004, + "logits/rejected": 2.4359664916992188, + "logps/chosen": -373.224609375, + "logps/rejected": -177.35702514648438, + "loss": 0.4911, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.6293210387229919, + "rewards/margins": 0.5611113905906677, + "rewards/rejected": 0.06820964813232422, + "step": 1964 + }, + { + "epoch": 0.30388555963657454, + "grad_norm": 4.771052837371826, + "learning_rate": 4.9928399587581624e-06, + "logits/chosen": 7.136575222015381, + "logits/rejected": 7.365444183349609, + "logps/chosen": -253.6295928955078, + "logps/rejected": -266.80560302734375, + "loss": 0.617, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.2940073013305664, + "rewards/margins": 0.3670746386051178, + "rewards/rejected": -0.07306733727455139, + "step": 1965 + }, + { + "epoch": 0.3040402087763387, + "grad_norm": 5.675067901611328, + "learning_rate": 4.992553557108489e-06, + "logits/chosen": 10.713343620300293, + "logits/rejected": 11.298481941223145, + "logps/chosen": -244.64675903320312, + "logps/rejected": -202.63841247558594, + "loss": 0.8748, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.009420007467269897, + "rewards/margins": -0.24955837428569794, + "rewards/rejected": 0.24013838171958923, + "step": 1966 + }, + { + "epoch": 0.30419485791610285, + "grad_norm": 4.353074073791504, + "learning_rate": 4.992267155458816e-06, + "logits/chosen": 11.110458374023438, + "logits/rejected": 7.546187400817871, + "logps/chosen": -278.967041015625, + "logps/rejected": -242.59310913085938, + "loss": 0.5162, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.456194669008255, + "rewards/margins": 0.42083263397216797, + "rewards/rejected": 0.03536204993724823, + "step": 1967 + }, + { + "epoch": 0.304349507055867, + "grad_norm": 4.183549404144287, + "learning_rate": 4.991980753809142e-06, + "logits/chosen": 9.97763442993164, + "logits/rejected": 8.955684661865234, + "logps/chosen": -226.4222869873047, + "logps/rejected": -195.60052490234375, + "loss": 0.5923, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.4704108238220215, + "rewards/margins": 0.25863757729530334, + "rewards/rejected": 0.21177320182323456, + "step": 1968 + }, + { + "epoch": 0.30450415619563115, + "grad_norm": 5.6945977210998535, + "learning_rate": 4.991694352159469e-06, + "logits/chosen": 13.426898956298828, + "logits/rejected": 14.656112670898438, + "logps/chosen": -337.1523742675781, + "logps/rejected": -321.0163879394531, + "loss": 0.6999, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.38470345735549927, + "rewards/margins": 0.03220170736312866, + "rewards/rejected": 0.3525017499923706, + "step": 1969 + }, + { + "epoch": 0.3046588053353953, + "grad_norm": 7.493653297424316, + "learning_rate": 4.991407950509795e-06, + "logits/chosen": 11.03613567352295, + "logits/rejected": 9.68317985534668, + "logps/chosen": -349.5970458984375, + "logps/rejected": -320.911865234375, + "loss": 0.6386, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.30209922790527344, + "rewards/margins": 0.154245063662529, + "rewards/rejected": 0.14785417914390564, + "step": 1970 + }, + { + "epoch": 0.3048134544751595, + "grad_norm": 5.85058069229126, + "learning_rate": 4.9911215488601215e-06, + "logits/chosen": 11.295326232910156, + "logits/rejected": 10.579447746276855, + "logps/chosen": -189.58340454101562, + "logps/rejected": -276.0993957519531, + "loss": 0.6909, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.18556396663188934, + "rewards/margins": 0.020555786788463593, + "rewards/rejected": 0.16500815749168396, + "step": 1971 + }, + { + "epoch": 0.30496810361492366, + "grad_norm": 5.871170520782471, + "learning_rate": 4.990835147210448e-06, + "logits/chosen": 11.124595642089844, + "logits/rejected": 5.645484447479248, + "logps/chosen": -366.6029357910156, + "logps/rejected": -198.0446014404297, + "loss": 0.5875, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.446881502866745, + "rewards/margins": 0.35985517501831055, + "rewards/rejected": 0.08702629804611206, + "step": 1972 + }, + { + "epoch": 0.3051227527546878, + "grad_norm": 4.633442401885986, + "learning_rate": 4.990548745560775e-06, + "logits/chosen": 8.628091812133789, + "logits/rejected": 3.560645580291748, + "logps/chosen": -284.3542175292969, + "logps/rejected": -215.97756958007812, + "loss": 0.5286, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.5714645385742188, + "rewards/margins": 0.4734707474708557, + "rewards/rejected": 0.09799380600452423, + "step": 1973 + }, + { + "epoch": 0.30527740189445196, + "grad_norm": 3.6263229846954346, + "learning_rate": 4.9902623439111014e-06, + "logits/chosen": 2.573881149291992, + "logits/rejected": 7.097978591918945, + "logps/chosen": -172.2783660888672, + "logps/rejected": -212.84161376953125, + "loss": 0.5485, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.3548279106616974, + "rewards/margins": 0.3845118284225464, + "rewards/rejected": -0.029683917760849, + "step": 1974 + }, + { + "epoch": 0.3054320510342161, + "grad_norm": 3.9928176403045654, + "learning_rate": 4.989975942261428e-06, + "logits/chosen": 13.164188385009766, + "logits/rejected": 10.088249206542969, + "logps/chosen": -272.97381591796875, + "logps/rejected": -258.22918701171875, + "loss": 0.5234, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.41110047698020935, + "rewards/margins": 0.4962027370929718, + "rewards/rejected": -0.08510227501392365, + "step": 1975 + }, + { + "epoch": 0.30558670017398026, + "grad_norm": 8.181061744689941, + "learning_rate": 4.989689540611755e-06, + "logits/chosen": 8.641823768615723, + "logits/rejected": 9.229660987854004, + "logps/chosen": -395.608642578125, + "logps/rejected": -354.9344787597656, + "loss": 0.6836, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.2017725110054016, + "rewards/margins": 0.15754592418670654, + "rewards/rejected": 0.04422654211521149, + "step": 1976 + }, + { + "epoch": 0.3057413493137444, + "grad_norm": 6.364742279052734, + "learning_rate": 4.9894031389620805e-06, + "logits/chosen": 13.63949966430664, + "logits/rejected": 2.9821300506591797, + "logps/chosen": -414.233642578125, + "logps/rejected": -171.18295288085938, + "loss": 0.6809, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.26278382539749146, + "rewards/margins": 0.10288484394550323, + "rewards/rejected": 0.15989898145198822, + "step": 1977 + }, + { + "epoch": 0.3058959984535086, + "grad_norm": 5.2550578117370605, + "learning_rate": 4.989116737312407e-06, + "logits/chosen": 8.25478744506836, + "logits/rejected": 8.853614807128906, + "logps/chosen": -172.90084838867188, + "logps/rejected": -190.1629638671875, + "loss": 0.7209, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.057640835642814636, + "rewards/margins": -0.021668102592229843, + "rewards/rejected": -0.03597274422645569, + "step": 1978 + }, + { + "epoch": 0.3060506475932728, + "grad_norm": 5.316424369812012, + "learning_rate": 4.988830335662734e-06, + "logits/chosen": 9.975484848022461, + "logits/rejected": 0.7701832056045532, + "logps/chosen": -228.9788818359375, + "logps/rejected": -135.40347290039062, + "loss": 0.63, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.1471642255783081, + "rewards/margins": 0.2743827998638153, + "rewards/rejected": -0.1272185742855072, + "step": 1979 + }, + { + "epoch": 0.3062052967330369, + "grad_norm": 5.436166763305664, + "learning_rate": 4.9885439340130605e-06, + "logits/chosen": 12.710076332092285, + "logits/rejected": 10.410833358764648, + "logps/chosen": -184.4937286376953, + "logps/rejected": -161.0513916015625, + "loss": 0.6438, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.1457688808441162, + "rewards/margins": 0.2016940414905548, + "rewards/rejected": -0.055925153195858, + "step": 1980 + }, + { + "epoch": 0.3063599458728011, + "grad_norm": 4.560668468475342, + "learning_rate": 4.988257532363387e-06, + "logits/chosen": 8.936497688293457, + "logits/rejected": 4.412057399749756, + "logps/chosen": -239.14283752441406, + "logps/rejected": -176.81141662597656, + "loss": 0.5607, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.18006639182567596, + "rewards/margins": 0.3544565439224243, + "rewards/rejected": -0.17439012229442596, + "step": 1981 + }, + { + "epoch": 0.3065145950125652, + "grad_norm": 4.5908918380737305, + "learning_rate": 4.987971130713714e-06, + "logits/chosen": 10.937694549560547, + "logits/rejected": 5.790031909942627, + "logps/chosen": -342.538818359375, + "logps/rejected": -272.4270324707031, + "loss": 0.4938, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.3979566693305969, + "rewards/margins": 0.512667179107666, + "rewards/rejected": -0.11471052467823029, + "step": 1982 + }, + { + "epoch": 0.3066692441523294, + "grad_norm": 8.673511505126953, + "learning_rate": 4.98768472906404e-06, + "logits/chosen": 9.652889251708984, + "logits/rejected": 6.896554470062256, + "logps/chosen": -346.6098327636719, + "logps/rejected": -264.585205078125, + "loss": 0.893, + "rewards/accuracies": 0.25, + "rewards/chosen": 0.30448389053344727, + "rewards/margins": -0.21031302213668823, + "rewards/rejected": 0.5147969126701355, + "step": 1983 + }, + { + "epoch": 0.3068238932920936, + "grad_norm": 4.490819454193115, + "learning_rate": 4.987398327414366e-06, + "logits/chosen": 8.901927947998047, + "logits/rejected": 6.203165054321289, + "logps/chosen": -181.65650939941406, + "logps/rejected": -208.1868133544922, + "loss": 0.5695, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.1907789260149002, + "rewards/margins": 0.34390294551849365, + "rewards/rejected": -0.15312398970127106, + "step": 1984 + }, + { + "epoch": 0.30697854243185774, + "grad_norm": 5.110992908477783, + "learning_rate": 4.987111925764693e-06, + "logits/chosen": 12.324292182922363, + "logits/rejected": 7.515582084655762, + "logps/chosen": -207.66912841796875, + "logps/rejected": -211.3025360107422, + "loss": 0.7212, + "rewards/accuracies": 0.25, + "rewards/chosen": 0.12857075035572052, + "rewards/margins": 0.013825636357069016, + "rewards/rejected": 0.1147451102733612, + "step": 1985 + }, + { + "epoch": 0.3071331915716219, + "grad_norm": 4.301996231079102, + "learning_rate": 4.9868255241150196e-06, + "logits/chosen": 4.550024509429932, + "logits/rejected": 7.1157121658325195, + "logps/chosen": -144.9849090576172, + "logps/rejected": -201.24571228027344, + "loss": 0.565, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.18839310109615326, + "rewards/margins": 0.3174161911010742, + "rewards/rejected": -0.12902307510375977, + "step": 1986 + }, + { + "epoch": 0.30728784071138604, + "grad_norm": 9.139196395874023, + "learning_rate": 4.986539122465346e-06, + "logits/chosen": 10.21019458770752, + "logits/rejected": 9.454524993896484, + "logps/chosen": -217.67738342285156, + "logps/rejected": -216.4231414794922, + "loss": 0.5819, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.34340447187423706, + "rewards/margins": 0.2638482451438904, + "rewards/rejected": 0.07955625653266907, + "step": 1987 + }, + { + "epoch": 0.3074424898511502, + "grad_norm": 7.220220565795898, + "learning_rate": 4.986252720815673e-06, + "logits/chosen": 11.023681640625, + "logits/rejected": 6.770074367523193, + "logps/chosen": -287.69525146484375, + "logps/rejected": -212.20657348632812, + "loss": 0.7852, + "rewards/accuracies": 0.25, + "rewards/chosen": -0.09922752529382706, + "rewards/margins": -0.13665685057640076, + "rewards/rejected": 0.0374293252825737, + "step": 1988 + }, + { + "epoch": 0.30759713899091434, + "grad_norm": 5.522611141204834, + "learning_rate": 4.985966319165999e-06, + "logits/chosen": 7.377992630004883, + "logits/rejected": 8.722118377685547, + "logps/chosen": -282.89727783203125, + "logps/rejected": -243.05999755859375, + "loss": 0.7548, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.10561446845531464, + "rewards/margins": -0.08198529481887817, + "rewards/rejected": 0.187599778175354, + "step": 1989 + }, + { + "epoch": 0.30775178813067855, + "grad_norm": 4.899139404296875, + "learning_rate": 4.985679917516325e-06, + "logits/chosen": 8.418660163879395, + "logits/rejected": 2.672091007232666, + "logps/chosen": -389.9258117675781, + "logps/rejected": -245.32838439941406, + "loss": 0.5632, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.495248019695282, + "rewards/margins": 0.34115707874298096, + "rewards/rejected": 0.15409094095230103, + "step": 1990 + }, + { + "epoch": 0.3079064372704427, + "grad_norm": 7.869766712188721, + "learning_rate": 4.985393515866652e-06, + "logits/chosen": 11.306015014648438, + "logits/rejected": 10.009170532226562, + "logps/chosen": -280.33416748046875, + "logps/rejected": -291.0931091308594, + "loss": 0.9103, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.2642689049243927, + "rewards/margins": -0.325820654630661, + "rewards/rejected": 0.5900895595550537, + "step": 1991 + }, + { + "epoch": 0.30806108641020685, + "grad_norm": 5.085631370544434, + "learning_rate": 4.985107114216979e-06, + "logits/chosen": 6.397157669067383, + "logits/rejected": 5.009548187255859, + "logps/chosen": -215.83450317382812, + "logps/rejected": -214.5556640625, + "loss": 0.7128, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.13566569983959198, + "rewards/margins": -0.004653111100196838, + "rewards/rejected": 0.14031882584095, + "step": 1992 + }, + { + "epoch": 0.308215735549971, + "grad_norm": 4.604753017425537, + "learning_rate": 4.984820712567304e-06, + "logits/chosen": 11.945621490478516, + "logits/rejected": 6.725895881652832, + "logps/chosen": -266.83837890625, + "logps/rejected": -229.85540771484375, + "loss": 0.548, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.4130420684814453, + "rewards/margins": 0.3493255376815796, + "rewards/rejected": 0.06371650099754333, + "step": 1993 + }, + { + "epoch": 0.30837038468973516, + "grad_norm": 5.085018157958984, + "learning_rate": 4.984534310917631e-06, + "logits/chosen": 12.733266830444336, + "logits/rejected": 6.141596794128418, + "logps/chosen": -377.250244140625, + "logps/rejected": -258.7804260253906, + "loss": 0.5826, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.48643437027931213, + "rewards/margins": 0.3208006024360657, + "rewards/rejected": 0.16563378274440765, + "step": 1994 + }, + { + "epoch": 0.3085250338294993, + "grad_norm": 7.331588268280029, + "learning_rate": 4.984247909267958e-06, + "logits/chosen": 8.357823371887207, + "logits/rejected": 6.840785026550293, + "logps/chosen": -364.28765869140625, + "logps/rejected": -366.29766845703125, + "loss": 0.622, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.099069744348526, + "rewards/margins": 0.1920163631439209, + "rewards/rejected": -0.0929466187953949, + "step": 1995 + }, + { + "epoch": 0.30867968296926346, + "grad_norm": 5.524991035461426, + "learning_rate": 4.983961507618284e-06, + "logits/chosen": 10.343605995178223, + "logits/rejected": 4.031404495239258, + "logps/chosen": -281.15313720703125, + "logps/rejected": -186.6189727783203, + "loss": 0.708, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.0052339062094688416, + "rewards/margins": 0.013379998505115509, + "rewards/rejected": -0.008146099746227264, + "step": 1996 + }, + { + "epoch": 0.30883433210902766, + "grad_norm": 13.873757362365723, + "learning_rate": 4.983675105968611e-06, + "logits/chosen": 4.512668609619141, + "logits/rejected": 6.311601638793945, + "logps/chosen": -237.05955505371094, + "logps/rejected": -390.25628662109375, + "loss": 0.5836, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.26084285974502563, + "rewards/margins": 0.4081624746322632, + "rewards/rejected": -0.14731964468955994, + "step": 1997 + }, + { + "epoch": 0.3089889812487918, + "grad_norm": 7.652311325073242, + "learning_rate": 4.983388704318937e-06, + "logits/chosen": 9.728889465332031, + "logits/rejected": 6.985532760620117, + "logps/chosen": -296.9800109863281, + "logps/rejected": -201.91685485839844, + "loss": 0.7697, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.09284285455942154, + "rewards/margins": 0.024631395936012268, + "rewards/rejected": 0.06821145862340927, + "step": 1998 + }, + { + "epoch": 0.30914363038855597, + "grad_norm": 4.929397106170654, + "learning_rate": 4.9831023026692635e-06, + "logits/chosen": 7.380917549133301, + "logits/rejected": 5.23086404800415, + "logps/chosen": -254.52708435058594, + "logps/rejected": -243.0704345703125, + "loss": 0.5669, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.31868162751197815, + "rewards/margins": 0.33252009749412537, + "rewards/rejected": -0.013838473707437515, + "step": 1999 + }, + { + "epoch": 0.3092982795283201, + "grad_norm": 5.268299579620361, + "learning_rate": 4.98281590101959e-06, + "logits/chosen": 11.959253311157227, + "logits/rejected": 8.007569313049316, + "logps/chosen": -248.97425842285156, + "logps/rejected": -166.68801879882812, + "loss": 0.6806, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.1765538901090622, + "rewards/margins": 0.061085328459739685, + "rewards/rejected": 0.11546854674816132, + "step": 2000 + }, + { + "epoch": 0.30945292866808427, + "grad_norm": 6.657409191131592, + "learning_rate": 4.982529499369917e-06, + "logits/chosen": 6.553297996520996, + "logits/rejected": 7.741517543792725, + "logps/chosen": -261.2109069824219, + "logps/rejected": -257.0432434082031, + "loss": 0.7359, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.39706581830978394, + "rewards/margins": -0.0038317739963531494, + "rewards/rejected": 0.4008976221084595, + "step": 2001 + }, + { + "epoch": 0.3096075778078484, + "grad_norm": 7.364019870758057, + "learning_rate": 4.982243097720243e-06, + "logits/chosen": 3.949641227722168, + "logits/rejected": 3.9044790267944336, + "logps/chosen": -209.78517150878906, + "logps/rejected": -234.69386291503906, + "loss": 0.5677, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.19461870193481445, + "rewards/margins": 0.3288024067878723, + "rewards/rejected": -0.13418368995189667, + "step": 2002 + }, + { + "epoch": 0.30976222694761263, + "grad_norm": 6.282543659210205, + "learning_rate": 4.981956696070569e-06, + "logits/chosen": 7.29435920715332, + "logits/rejected": 6.744174957275391, + "logps/chosen": -283.6720886230469, + "logps/rejected": -359.3691711425781, + "loss": 0.7836, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.2742149233818054, + "rewards/margins": -0.022047381848096848, + "rewards/rejected": 0.29626235365867615, + "step": 2003 + }, + { + "epoch": 0.3099168760873768, + "grad_norm": 6.137124538421631, + "learning_rate": 4.981670294420896e-06, + "logits/chosen": 5.864310264587402, + "logits/rejected": 8.316765785217285, + "logps/chosen": -232.30392456054688, + "logps/rejected": -281.315673828125, + "loss": 0.7373, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.3055242896080017, + "rewards/margins": 0.02452573925256729, + "rewards/rejected": 0.2809985280036926, + "step": 2004 + }, + { + "epoch": 0.31007152522714093, + "grad_norm": 3.532426118850708, + "learning_rate": 4.9813838927712225e-06, + "logits/chosen": 7.614628791809082, + "logits/rejected": 3.5947980880737305, + "logps/chosen": -214.77496337890625, + "logps/rejected": -170.39398193359375, + "loss": 0.5666, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.402915358543396, + "rewards/margins": 0.3127111792564392, + "rewards/rejected": 0.0902041494846344, + "step": 2005 + }, + { + "epoch": 0.3102261743669051, + "grad_norm": 6.598997592926025, + "learning_rate": 4.981097491121549e-06, + "logits/chosen": 10.780252456665039, + "logits/rejected": 6.045149803161621, + "logps/chosen": -336.3409729003906, + "logps/rejected": -240.98468017578125, + "loss": 0.6383, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.18612727522850037, + "rewards/margins": 0.15972900390625, + "rewards/rejected": 0.026398273184895515, + "step": 2006 + }, + { + "epoch": 0.31038082350666923, + "grad_norm": 8.307952880859375, + "learning_rate": 4.980811089471876e-06, + "logits/chosen": 9.119329452514648, + "logits/rejected": 8.917032241821289, + "logps/chosen": -273.92578125, + "logps/rejected": -295.33203125, + "loss": 0.7864, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.23545576632022858, + "rewards/margins": 0.06440198421478271, + "rewards/rejected": 0.17105376720428467, + "step": 2007 + }, + { + "epoch": 0.3105354726464334, + "grad_norm": 3.9343833923339844, + "learning_rate": 4.9805246878222025e-06, + "logits/chosen": 11.872191429138184, + "logits/rejected": 3.4664392471313477, + "logps/chosen": -250.74996948242188, + "logps/rejected": -228.10499572753906, + "loss": 0.475, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.3100832998752594, + "rewards/margins": 0.6007246971130371, + "rewards/rejected": -0.2906413972377777, + "step": 2008 + }, + { + "epoch": 0.31069012178619754, + "grad_norm": 8.843900680541992, + "learning_rate": 4.980238286172529e-06, + "logits/chosen": 7.892078399658203, + "logits/rejected": 12.621454238891602, + "logps/chosen": -366.3011169433594, + "logps/rejected": -580.13427734375, + "loss": 0.8792, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.20299360156059265, + "rewards/margins": -0.2374470829963684, + "rewards/rejected": 0.44044065475463867, + "step": 2009 + }, + { + "epoch": 0.31084477092596174, + "grad_norm": 5.974053382873535, + "learning_rate": 4.979951884522855e-06, + "logits/chosen": 11.918599128723145, + "logits/rejected": 5.2390217781066895, + "logps/chosen": -217.66104125976562, + "logps/rejected": -163.15391540527344, + "loss": 0.8386, + "rewards/accuracies": 0.25, + "rewards/chosen": 0.13871696591377258, + "rewards/margins": -0.2529628276824951, + "rewards/rejected": 0.3916797637939453, + "step": 2010 + }, + { + "epoch": 0.3109994200657259, + "grad_norm": 4.927247047424316, + "learning_rate": 4.979665482873182e-06, + "logits/chosen": 14.197017669677734, + "logits/rejected": 11.9215726852417, + "logps/chosen": -214.18751525878906, + "logps/rejected": -193.50003051757812, + "loss": 0.5485, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.3066695034503937, + "rewards/margins": 0.3455147445201874, + "rewards/rejected": -0.03884520381689072, + "step": 2011 + }, + { + "epoch": 0.31115406920549005, + "grad_norm": 11.098302841186523, + "learning_rate": 4.979379081223508e-06, + "logits/chosen": 7.39481782913208, + "logits/rejected": 5.808822154998779, + "logps/chosen": -275.3327331542969, + "logps/rejected": -235.2631378173828, + "loss": 0.7748, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.6115728616714478, + "rewards/margins": -0.04276243597269058, + "rewards/rejected": 0.6543353199958801, + "step": 2012 + }, + { + "epoch": 0.3113087183452542, + "grad_norm": 4.694543838500977, + "learning_rate": 4.979092679573835e-06, + "logits/chosen": 10.235353469848633, + "logits/rejected": 4.523075103759766, + "logps/chosen": -281.56561279296875, + "logps/rejected": -220.22119140625, + "loss": 0.4915, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.6348758935928345, + "rewards/margins": 0.4904972314834595, + "rewards/rejected": 0.14437872171401978, + "step": 2013 + }, + { + "epoch": 0.31146336748501835, + "grad_norm": 5.491322040557861, + "learning_rate": 4.9788062779241615e-06, + "logits/chosen": 10.438916206359863, + "logits/rejected": 12.557527542114258, + "logps/chosen": -183.73220825195312, + "logps/rejected": -216.85330200195312, + "loss": 0.7605, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.18518799543380737, + "rewards/margins": -0.10930729657411575, + "rewards/rejected": 0.29449528455734253, + "step": 2014 + }, + { + "epoch": 0.3116180166247825, + "grad_norm": 3.6118359565734863, + "learning_rate": 4.978519876274488e-06, + "logits/chosen": 10.015729904174805, + "logits/rejected": 7.5527215003967285, + "logps/chosen": -132.3544464111328, + "logps/rejected": -97.63077545166016, + "loss": 0.6793, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.030257228761911392, + "rewards/margins": 0.08355279266834259, + "rewards/rejected": -0.11381002515554428, + "step": 2015 + }, + { + "epoch": 0.3117726657645467, + "grad_norm": 5.444869518280029, + "learning_rate": 4.978233474624814e-06, + "logits/chosen": 12.174051284790039, + "logits/rejected": 3.7872884273529053, + "logps/chosen": -430.60711669921875, + "logps/rejected": -239.57635498046875, + "loss": 0.4853, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.5860814452171326, + "rewards/margins": 0.5089577436447144, + "rewards/rejected": 0.07712364941835403, + "step": 2016 + }, + { + "epoch": 0.31192731490431086, + "grad_norm": 7.583166122436523, + "learning_rate": 4.977947072975141e-06, + "logits/chosen": 10.321952819824219, + "logits/rejected": 7.761547565460205, + "logps/chosen": -301.8120422363281, + "logps/rejected": -195.99807739257812, + "loss": 0.6006, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.23609209060668945, + "rewards/margins": 0.32064980268478394, + "rewards/rejected": -0.08455768972635269, + "step": 2017 + }, + { + "epoch": 0.312081964044075, + "grad_norm": 4.882416725158691, + "learning_rate": 4.977660671325467e-06, + "logits/chosen": 7.85862398147583, + "logits/rejected": 6.824814796447754, + "logps/chosen": -201.79331970214844, + "logps/rejected": -167.5405731201172, + "loss": 0.6951, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.020072750747203827, + "rewards/margins": 0.015002727508544922, + "rewards/rejected": -0.03507547825574875, + "step": 2018 + }, + { + "epoch": 0.31223661318383916, + "grad_norm": 5.949890613555908, + "learning_rate": 4.977374269675794e-06, + "logits/chosen": 10.985013008117676, + "logits/rejected": 8.955205917358398, + "logps/chosen": -351.3401794433594, + "logps/rejected": -263.6220703125, + "loss": 0.546, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.44436824321746826, + "rewards/margins": 0.33157527446746826, + "rewards/rejected": 0.11279293894767761, + "step": 2019 + }, + { + "epoch": 0.3123912623236033, + "grad_norm": 7.030670166015625, + "learning_rate": 4.977087868026121e-06, + "logits/chosen": 11.158683776855469, + "logits/rejected": 4.009839057922363, + "logps/chosen": -267.39544677734375, + "logps/rejected": -220.5675048828125, + "loss": 0.7079, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.14739905297756195, + "rewards/margins": 0.21238264441490173, + "rewards/rejected": -0.06498361378908157, + "step": 2020 + }, + { + "epoch": 0.31254591146336747, + "grad_norm": 111.6714859008789, + "learning_rate": 4.976801466376447e-06, + "logits/chosen": 9.698179244995117, + "logits/rejected": 14.525428771972656, + "logps/chosen": -230.2836151123047, + "logps/rejected": -270.64727783203125, + "loss": 0.6364, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.09449329972267151, + "rewards/margins": 0.21789181232452393, + "rewards/rejected": -0.12339849770069122, + "step": 2021 + }, + { + "epoch": 0.31270056060313167, + "grad_norm": 7.74885368347168, + "learning_rate": 4.976515064726774e-06, + "logits/chosen": 9.741405487060547, + "logits/rejected": 11.121164321899414, + "logps/chosen": -280.96734619140625, + "logps/rejected": -271.37615966796875, + "loss": 0.8575, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.183790922164917, + "rewards/margins": -0.17563287913799286, + "rewards/rejected": 0.35942378640174866, + "step": 2022 + }, + { + "epoch": 0.3128552097428958, + "grad_norm": 4.498934268951416, + "learning_rate": 4.9762286630771e-06, + "logits/chosen": 7.058158874511719, + "logits/rejected": 5.465290069580078, + "logps/chosen": -163.27890014648438, + "logps/rejected": -159.01113891601562, + "loss": 0.6766, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.01532740518450737, + "rewards/margins": 0.14587508141994476, + "rewards/rejected": -0.1305476576089859, + "step": 2023 + }, + { + "epoch": 0.31300985888266, + "grad_norm": 4.784653663635254, + "learning_rate": 4.975942261427426e-06, + "logits/chosen": 10.165705680847168, + "logits/rejected": 7.393817901611328, + "logps/chosen": -228.40866088867188, + "logps/rejected": -236.97006225585938, + "loss": 0.6559, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.3274020254611969, + "rewards/margins": 0.11454229801893234, + "rewards/rejected": 0.21285971999168396, + "step": 2024 + }, + { + "epoch": 0.3131645080224241, + "grad_norm": 6.6084184646606445, + "learning_rate": 4.975655859777753e-06, + "logits/chosen": 10.696527481079102, + "logits/rejected": 10.428831100463867, + "logps/chosen": -250.44305419921875, + "logps/rejected": -298.8981018066406, + "loss": 0.6702, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.22198249399662018, + "rewards/margins": 0.17556850612163544, + "rewards/rejected": 0.046414002776145935, + "step": 2025 + }, + { + "epoch": 0.3133191571621883, + "grad_norm": 3.688711166381836, + "learning_rate": 4.97536945812808e-06, + "logits/chosen": 7.6138458251953125, + "logits/rejected": -0.48515424132347107, + "logps/chosen": -278.2098693847656, + "logps/rejected": -166.37527465820312, + "loss": 0.5956, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.05551151931285858, + "rewards/margins": 0.31901147961616516, + "rewards/rejected": -0.2634999752044678, + "step": 2026 + }, + { + "epoch": 0.31347380630195243, + "grad_norm": 9.181396484375, + "learning_rate": 4.9750830564784054e-06, + "logits/chosen": 9.64320182800293, + "logits/rejected": 7.418163299560547, + "logps/chosen": -329.3189697265625, + "logps/rejected": -323.0888671875, + "loss": 0.657, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.395430326461792, + "rewards/margins": 0.10457059741020203, + "rewards/rejected": 0.2908596992492676, + "step": 2027 + }, + { + "epoch": 0.3136284554417166, + "grad_norm": 8.308619499206543, + "learning_rate": 4.974796654828732e-06, + "logits/chosen": 8.056121826171875, + "logits/rejected": 8.806220054626465, + "logps/chosen": -351.40740966796875, + "logps/rejected": -317.88018798828125, + "loss": 0.7981, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.19664278626441956, + "rewards/margins": -0.08082132041454315, + "rewards/rejected": 0.2774640917778015, + "step": 2028 + }, + { + "epoch": 0.3137831045814808, + "grad_norm": 5.15437126159668, + "learning_rate": 4.974510253179059e-06, + "logits/chosen": 8.79755687713623, + "logits/rejected": 2.5902252197265625, + "logps/chosen": -226.75881958007812, + "logps/rejected": -212.37757873535156, + "loss": 0.5451, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.10303307324647903, + "rewards/margins": 0.3763677477836609, + "rewards/rejected": -0.27333468198776245, + "step": 2029 + }, + { + "epoch": 0.31393775372124494, + "grad_norm": 7.080493927001953, + "learning_rate": 4.974223851529385e-06, + "logits/chosen": 10.762468338012695, + "logits/rejected": 8.365921974182129, + "logps/chosen": -407.4847106933594, + "logps/rejected": -316.06561279296875, + "loss": 0.8173, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.12943677604198456, + "rewards/margins": -0.1618298590183258, + "rewards/rejected": 0.29126664996147156, + "step": 2030 + }, + { + "epoch": 0.3140924028610091, + "grad_norm": 6.306060314178467, + "learning_rate": 4.973937449879711e-06, + "logits/chosen": 4.155032157897949, + "logits/rejected": 1.6732174158096313, + "logps/chosen": -344.86700439453125, + "logps/rejected": -301.115234375, + "loss": 0.6702, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.3862399458885193, + "rewards/margins": 0.11926203221082687, + "rewards/rejected": 0.26697787642478943, + "step": 2031 + }, + { + "epoch": 0.31424705200077324, + "grad_norm": 5.579253673553467, + "learning_rate": 4.973651048230038e-06, + "logits/chosen": 3.562178373336792, + "logits/rejected": 10.578177452087402, + "logps/chosen": -175.40646362304688, + "logps/rejected": -236.90765380859375, + "loss": 0.7272, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.08180347084999084, + "rewards/margins": -0.008615009486675262, + "rewards/rejected": 0.09041848033666611, + "step": 2032 + }, + { + "epoch": 0.3144017011405374, + "grad_norm": 5.391582489013672, + "learning_rate": 4.9733646465803645e-06, + "logits/chosen": 14.033145904541016, + "logits/rejected": 6.150842666625977, + "logps/chosen": -436.641357421875, + "logps/rejected": -314.458984375, + "loss": 0.5474, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.36052456498146057, + "rewards/margins": 0.4293084144592285, + "rewards/rejected": -0.06878386437892914, + "step": 2033 + }, + { + "epoch": 0.31455635028030154, + "grad_norm": 6.181769371032715, + "learning_rate": 4.973078244930691e-06, + "logits/chosen": 8.313186645507812, + "logits/rejected": 15.240447998046875, + "logps/chosen": -142.17416381835938, + "logps/rejected": -193.90313720703125, + "loss": 0.7031, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.09663400053977966, + "rewards/margins": 0.08778846263885498, + "rewards/rejected": 0.008845508098602295, + "step": 2034 + }, + { + "epoch": 0.31471099942006575, + "grad_norm": 6.115557670593262, + "learning_rate": 4.972791843281018e-06, + "logits/chosen": 6.296451568603516, + "logits/rejected": 12.146219253540039, + "logps/chosen": -215.63710021972656, + "logps/rejected": -330.9129638671875, + "loss": 0.5741, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.3868020176887512, + "rewards/margins": 0.3071805238723755, + "rewards/rejected": 0.07962150871753693, + "step": 2035 + }, + { + "epoch": 0.3148656485598299, + "grad_norm": 8.364924430847168, + "learning_rate": 4.972505441631344e-06, + "logits/chosen": 1.0487046241760254, + "logits/rejected": 2.093881845474243, + "logps/chosen": -232.45321655273438, + "logps/rejected": -247.24136352539062, + "loss": 0.7716, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.3331920802593231, + "rewards/margins": -0.089888796210289, + "rewards/rejected": 0.4230808615684509, + "step": 2036 + }, + { + "epoch": 0.31502029769959405, + "grad_norm": 4.711363315582275, + "learning_rate": 4.97221903998167e-06, + "logits/chosen": 9.087247848510742, + "logits/rejected": 4.578160762786865, + "logps/chosen": -352.3943786621094, + "logps/rejected": -262.8236083984375, + "loss": 0.5877, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.3747193217277527, + "rewards/margins": 0.28360098600387573, + "rewards/rejected": 0.09111829102039337, + "step": 2037 + }, + { + "epoch": 0.3151749468393582, + "grad_norm": 4.907288551330566, + "learning_rate": 4.971932638331997e-06, + "logits/chosen": 15.42646598815918, + "logits/rejected": 13.73568344116211, + "logps/chosen": -289.27386474609375, + "logps/rejected": -324.1131591796875, + "loss": 0.5009, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.18994426727294922, + "rewards/margins": 0.5177173614501953, + "rewards/rejected": -0.3277730941772461, + "step": 2038 + }, + { + "epoch": 0.31532959597912236, + "grad_norm": 6.172505855560303, + "learning_rate": 4.9716462366823236e-06, + "logits/chosen": 11.385139465332031, + "logits/rejected": 7.40675687789917, + "logps/chosen": -332.9402160644531, + "logps/rejected": -225.0248260498047, + "loss": 0.652, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.3862195909023285, + "rewards/margins": 0.15352395176887512, + "rewards/rejected": 0.23269563913345337, + "step": 2039 + }, + { + "epoch": 0.3154842451188865, + "grad_norm": 6.155210971832275, + "learning_rate": 4.97135983503265e-06, + "logits/chosen": 13.192663192749023, + "logits/rejected": 14.762581825256348, + "logps/chosen": -284.51519775390625, + "logps/rejected": -294.5789794921875, + "loss": 0.732, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.006152346730232239, + "rewards/margins": -0.018543139100074768, + "rewards/rejected": 0.012390803545713425, + "step": 2040 + }, + { + "epoch": 0.31563889425865066, + "grad_norm": 4.210750579833984, + "learning_rate": 4.971073433382977e-06, + "logits/chosen": 7.417428016662598, + "logits/rejected": 4.093751907348633, + "logps/chosen": -226.21636962890625, + "logps/rejected": -159.87815856933594, + "loss": 0.7023, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.03874000534415245, + "rewards/margins": 0.03634684532880783, + "rewards/rejected": -0.07508685439825058, + "step": 2041 + }, + { + "epoch": 0.31579354339841487, + "grad_norm": 6.30993127822876, + "learning_rate": 4.9707870317333035e-06, + "logits/chosen": 11.614374160766602, + "logits/rejected": 6.277976036071777, + "logps/chosen": -370.63507080078125, + "logps/rejected": -303.1798095703125, + "loss": 0.6582, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.3636024594306946, + "rewards/margins": 0.3033856153488159, + "rewards/rejected": 0.060216791927814484, + "step": 2042 + }, + { + "epoch": 0.315948192538179, + "grad_norm": 4.3987717628479, + "learning_rate": 4.970500630083629e-06, + "logits/chosen": 6.204204082489014, + "logits/rejected": 3.2409186363220215, + "logps/chosen": -231.05934143066406, + "logps/rejected": -186.857666015625, + "loss": 0.5877, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.1683204621076584, + "rewards/margins": 0.318449467420578, + "rewards/rejected": -0.15012900531291962, + "step": 2043 + }, + { + "epoch": 0.31610284167794317, + "grad_norm": 8.114767074584961, + "learning_rate": 4.970214228433956e-06, + "logits/chosen": 10.443578720092773, + "logits/rejected": 8.904316902160645, + "logps/chosen": -264.67864990234375, + "logps/rejected": -261.36260986328125, + "loss": 0.834, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.031170370057225227, + "rewards/margins": -0.15537044405937195, + "rewards/rejected": 0.18654079735279083, + "step": 2044 + }, + { + "epoch": 0.3162574908177073, + "grad_norm": 6.54292106628418, + "learning_rate": 4.969927826784283e-06, + "logits/chosen": 6.789311408996582, + "logits/rejected": 10.426478385925293, + "logps/chosen": -267.3919372558594, + "logps/rejected": -362.5985107421875, + "loss": 0.6467, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.17277130484580994, + "rewards/margins": 0.3037240505218506, + "rewards/rejected": -0.13095274567604065, + "step": 2045 + }, + { + "epoch": 0.31641213995747147, + "grad_norm": 5.540056228637695, + "learning_rate": 4.969641425134609e-06, + "logits/chosen": 4.076648712158203, + "logits/rejected": 7.316946983337402, + "logps/chosen": -192.0807647705078, + "logps/rejected": -181.27587890625, + "loss": 0.7152, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.08584483712911606, + "rewards/margins": 0.024266544729471207, + "rewards/rejected": -0.11011138558387756, + "step": 2046 + }, + { + "epoch": 0.3165667890972356, + "grad_norm": 4.6728739738464355, + "learning_rate": 4.969355023484936e-06, + "logits/chosen": 7.171144485473633, + "logits/rejected": 4.804661750793457, + "logps/chosen": -156.41595458984375, + "logps/rejected": -121.79785919189453, + "loss": 0.6667, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.02266831323504448, + "rewards/margins": 0.1850113570690155, + "rewards/rejected": -0.16234304010868073, + "step": 2047 + }, + { + "epoch": 0.31672143823699983, + "grad_norm": 4.945480823516846, + "learning_rate": 4.9690686218352626e-06, + "logits/chosen": 13.925334930419922, + "logits/rejected": 11.48580265045166, + "logps/chosen": -336.47216796875, + "logps/rejected": -339.148193359375, + "loss": 0.5536, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.35403376817703247, + "rewards/margins": 0.39997369050979614, + "rewards/rejected": -0.04593987762928009, + "step": 2048 + }, + { + "epoch": 0.316876087376764, + "grad_norm": 7.455511569976807, + "learning_rate": 4.968782220185588e-06, + "logits/chosen": 14.020986557006836, + "logits/rejected": 6.704688549041748, + "logps/chosen": -441.7218933105469, + "logps/rejected": -265.552734375, + "loss": 0.6791, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.02594432234764099, + "rewards/margins": 0.11928005516529083, + "rewards/rejected": -0.09333573281764984, + "step": 2049 + }, + { + "epoch": 0.31703073651652813, + "grad_norm": 7.04271125793457, + "learning_rate": 4.968495818535915e-06, + "logits/chosen": 4.9354448318481445, + "logits/rejected": 4.99284553527832, + "logps/chosen": -371.6170654296875, + "logps/rejected": -294.0312194824219, + "loss": 0.6185, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.4960813522338867, + "rewards/margins": 0.2775948643684387, + "rewards/rejected": 0.21848651766777039, + "step": 2050 + }, + { + "epoch": 0.3171853856562923, + "grad_norm": 5.3045854568481445, + "learning_rate": 4.968209416886242e-06, + "logits/chosen": 10.03846263885498, + "logits/rejected": 8.394601821899414, + "logps/chosen": -254.42642211914062, + "logps/rejected": -289.1803894042969, + "loss": 0.4709, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.330724835395813, + "rewards/margins": 0.6609184741973877, + "rewards/rejected": -0.3301936388015747, + "step": 2051 + }, + { + "epoch": 0.31734003479605644, + "grad_norm": 7.794218063354492, + "learning_rate": 4.967923015236568e-06, + "logits/chosen": 6.5661940574646, + "logits/rejected": 4.515810489654541, + "logps/chosen": -221.69247436523438, + "logps/rejected": -225.12625122070312, + "loss": 0.9342, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.217045396566391, + "rewards/margins": -0.34060293436050415, + "rewards/rejected": 0.12355758249759674, + "step": 2052 + }, + { + "epoch": 0.3174946839358206, + "grad_norm": 6.268409729003906, + "learning_rate": 4.967636613586895e-06, + "logits/chosen": 9.71400260925293, + "logits/rejected": 4.3478102684021, + "logps/chosen": -243.1073760986328, + "logps/rejected": -232.1676483154297, + "loss": 0.6565, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.15780669450759888, + "rewards/margins": 0.15356865525245667, + "rewards/rejected": 0.004238061606884003, + "step": 2053 + }, + { + "epoch": 0.3176493330755848, + "grad_norm": 4.044487953186035, + "learning_rate": 4.967350211937222e-06, + "logits/chosen": 7.557414531707764, + "logits/rejected": 12.595425605773926, + "logps/chosen": -142.3138427734375, + "logps/rejected": -183.90841674804688, + "loss": 0.5899, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.06036997586488724, + "rewards/margins": 0.31504297256469727, + "rewards/rejected": -0.3754129409790039, + "step": 2054 + }, + { + "epoch": 0.31780398221534895, + "grad_norm": 5.1918721199035645, + "learning_rate": 4.967063810287548e-06, + "logits/chosen": 7.354862213134766, + "logits/rejected": 10.796515464782715, + "logps/chosen": -263.85601806640625, + "logps/rejected": -288.7096862792969, + "loss": 0.6433, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.17918884754180908, + "rewards/margins": 0.18749159574508667, + "rewards/rejected": -0.008302763104438782, + "step": 2055 + }, + { + "epoch": 0.3179586313551131, + "grad_norm": 7.184632301330566, + "learning_rate": 4.966777408637874e-06, + "logits/chosen": 7.668334007263184, + "logits/rejected": 7.771810054779053, + "logps/chosen": -276.070556640625, + "logps/rejected": -262.524658203125, + "loss": 0.7717, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.099912628531456, + "rewards/margins": -0.04781359061598778, + "rewards/rejected": -0.05209903419017792, + "step": 2056 + }, + { + "epoch": 0.31811328049487725, + "grad_norm": 5.591556072235107, + "learning_rate": 4.966491006988201e-06, + "logits/chosen": 6.896025657653809, + "logits/rejected": 4.624721050262451, + "logps/chosen": -220.68096923828125, + "logps/rejected": -269.56719970703125, + "loss": 0.6795, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.028016850352287292, + "rewards/margins": 0.1233828067779541, + "rewards/rejected": -0.1513996571302414, + "step": 2057 + }, + { + "epoch": 0.3182679296346414, + "grad_norm": 9.149316787719727, + "learning_rate": 4.966204605338527e-06, + "logits/chosen": 13.482200622558594, + "logits/rejected": 8.05477237701416, + "logps/chosen": -411.701416015625, + "logps/rejected": -340.6821594238281, + "loss": 0.9006, + "rewards/accuracies": 0.25, + "rewards/chosen": 0.006598569452762604, + "rewards/margins": -0.28060245513916016, + "rewards/rejected": 0.2872009873390198, + "step": 2058 + }, + { + "epoch": 0.31842257877440555, + "grad_norm": 6.2234392166137695, + "learning_rate": 4.965918203688854e-06, + "logits/chosen": 10.994446754455566, + "logits/rejected": 6.4054975509643555, + "logps/chosen": -376.44976806640625, + "logps/rejected": -220.66795349121094, + "loss": 0.5575, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.2972099184989929, + "rewards/margins": 0.5810242891311646, + "rewards/rejected": -0.28381434082984924, + "step": 2059 + }, + { + "epoch": 0.3185772279141697, + "grad_norm": 7.8359456062316895, + "learning_rate": 4.965631802039181e-06, + "logits/chosen": 10.574647903442383, + "logits/rejected": 4.935401916503906, + "logps/chosen": -500.7237548828125, + "logps/rejected": -347.1833190917969, + "loss": 0.5659, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.1308046281337738, + "rewards/margins": 0.42363840341567993, + "rewards/rejected": -0.29283377528190613, + "step": 2060 + }, + { + "epoch": 0.3187318770539339, + "grad_norm": 5.31462287902832, + "learning_rate": 4.9653454003895065e-06, + "logits/chosen": 8.97025203704834, + "logits/rejected": 6.5046186447143555, + "logps/chosen": -378.29754638671875, + "logps/rejected": -308.29364013671875, + "loss": 0.5806, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.22091758251190186, + "rewards/margins": 0.31939026713371277, + "rewards/rejected": -0.09847268462181091, + "step": 2061 + }, + { + "epoch": 0.31888652619369806, + "grad_norm": 4.882965564727783, + "learning_rate": 4.965058998739833e-06, + "logits/chosen": 9.242798805236816, + "logits/rejected": -0.6929713487625122, + "logps/chosen": -308.7655944824219, + "logps/rejected": -159.95748901367188, + "loss": 0.5731, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.10764069855213165, + "rewards/margins": 0.3808231055736542, + "rewards/rejected": -0.27318239212036133, + "step": 2062 + }, + { + "epoch": 0.3190411753334622, + "grad_norm": 3.991692304611206, + "learning_rate": 4.96477259709016e-06, + "logits/chosen": 10.273489952087402, + "logits/rejected": 7.59541130065918, + "logps/chosen": -256.82427978515625, + "logps/rejected": -248.18356323242188, + "loss": 0.6164, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.13332833349704742, + "rewards/margins": 0.3333299160003662, + "rewards/rejected": -0.20000162720680237, + "step": 2063 + }, + { + "epoch": 0.31919582447322636, + "grad_norm": 5.288827419281006, + "learning_rate": 4.9644861954404864e-06, + "logits/chosen": 12.139446258544922, + "logits/rejected": 7.976694107055664, + "logps/chosen": -392.28106689453125, + "logps/rejected": -291.5730895996094, + "loss": 0.6064, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.12893258035182953, + "rewards/margins": 0.22405657172203064, + "rewards/rejected": -0.0951240062713623, + "step": 2064 + }, + { + "epoch": 0.3193504736129905, + "grad_norm": 4.785806179046631, + "learning_rate": 4.964199793790812e-06, + "logits/chosen": 11.824102401733398, + "logits/rejected": 11.754941940307617, + "logps/chosen": -252.7989959716797, + "logps/rejected": -201.30653381347656, + "loss": 0.7488, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.009758569300174713, + "rewards/margins": -0.08507233113050461, + "rewards/rejected": 0.09483090043067932, + "step": 2065 + }, + { + "epoch": 0.31950512275275467, + "grad_norm": 7.117434501647949, + "learning_rate": 4.963913392141139e-06, + "logits/chosen": 7.4334235191345215, + "logits/rejected": 11.768050193786621, + "logps/chosen": -244.57630920410156, + "logps/rejected": -266.88580322265625, + "loss": 0.6925, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.05017123371362686, + "rewards/margins": 0.10714679956436157, + "rewards/rejected": -0.15731802582740784, + "step": 2066 + }, + { + "epoch": 0.3196597718925189, + "grad_norm": 5.985042572021484, + "learning_rate": 4.9636269904914655e-06, + "logits/chosen": 8.437536239624023, + "logits/rejected": 4.00594425201416, + "logps/chosen": -195.0152587890625, + "logps/rejected": -184.0806884765625, + "loss": 0.6619, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.11138291656970978, + "rewards/margins": 0.23827090859413147, + "rewards/rejected": -0.1268879920244217, + "step": 2067 + }, + { + "epoch": 0.319814421032283, + "grad_norm": 6.469642162322998, + "learning_rate": 4.963340588841792e-06, + "logits/chosen": 2.145012855529785, + "logits/rejected": 6.518708229064941, + "logps/chosen": -277.6102600097656, + "logps/rejected": -319.8463439941406, + "loss": 0.7123, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.30444109439849854, + "rewards/margins": 0.010153524577617645, + "rewards/rejected": -0.3145946264266968, + "step": 2068 + }, + { + "epoch": 0.3199690701720472, + "grad_norm": 4.855186462402344, + "learning_rate": 4.963054187192118e-06, + "logits/chosen": 12.217367172241211, + "logits/rejected": 11.836400985717773, + "logps/chosen": -322.51153564453125, + "logps/rejected": -326.32000732421875, + "loss": 0.4969, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.4644565284252167, + "rewards/margins": 0.5568170547485352, + "rewards/rejected": -0.0923604965209961, + "step": 2069 + }, + { + "epoch": 0.32012371931181133, + "grad_norm": 4.596843719482422, + "learning_rate": 4.962767785542445e-06, + "logits/chosen": 9.516210556030273, + "logits/rejected": 8.359548568725586, + "logps/chosen": -255.34906005859375, + "logps/rejected": -230.70223999023438, + "loss": 0.5925, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.11020180583000183, + "rewards/margins": 0.25397002696990967, + "rewards/rejected": -0.14376823604106903, + "step": 2070 + }, + { + "epoch": 0.3202783684515755, + "grad_norm": 4.529107570648193, + "learning_rate": 4.962481383892771e-06, + "logits/chosen": 0.8398746252059937, + "logits/rejected": 4.05929708480835, + "logps/chosen": -163.35752868652344, + "logps/rejected": -171.0614013671875, + "loss": 0.6696, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.03319668769836426, + "rewards/margins": 0.05870075523853302, + "rewards/rejected": -0.09189744293689728, + "step": 2071 + }, + { + "epoch": 0.32043301759133963, + "grad_norm": 6.0078840255737305, + "learning_rate": 4.962194982243098e-06, + "logits/chosen": 2.13967227935791, + "logits/rejected": 8.819110870361328, + "logps/chosen": -144.24813842773438, + "logps/rejected": -197.061279296875, + "loss": 0.7275, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.16000080108642578, + "rewards/margins": 0.0023308396339416504, + "rewards/rejected": -0.16233164072036743, + "step": 2072 + }, + { + "epoch": 0.3205876667311038, + "grad_norm": 6.134139060974121, + "learning_rate": 4.961908580593425e-06, + "logits/chosen": 7.407665729522705, + "logits/rejected": 7.107675075531006, + "logps/chosen": -384.6820373535156, + "logps/rejected": -256.6728820800781, + "loss": 0.7827, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.07551416754722595, + "rewards/margins": -0.08895669132471085, + "rewards/rejected": 0.013442512601613998, + "step": 2073 + }, + { + "epoch": 0.320742315870868, + "grad_norm": 8.088295936584473, + "learning_rate": 4.961622178943751e-06, + "logits/chosen": 2.8475310802459717, + "logits/rejected": 3.932326316833496, + "logps/chosen": -308.1170959472656, + "logps/rejected": -241.9561309814453, + "loss": 0.5189, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.2706906497478485, + "rewards/margins": 0.5015276074409485, + "rewards/rejected": -0.23083695769309998, + "step": 2074 + }, + { + "epoch": 0.32089696501063214, + "grad_norm": 45.37759780883789, + "learning_rate": 4.961335777294078e-06, + "logits/chosen": 9.011927604675293, + "logits/rejected": 8.506170272827148, + "logps/chosen": -374.6376953125, + "logps/rejected": -316.80126953125, + "loss": 0.6749, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.10515378415584564, + "rewards/margins": 0.11395111680030823, + "rewards/rejected": -0.21910487115383148, + "step": 2075 + }, + { + "epoch": 0.3210516141503963, + "grad_norm": 5.602989196777344, + "learning_rate": 4.961049375644404e-06, + "logits/chosen": 12.401848793029785, + "logits/rejected": 12.414278030395508, + "logps/chosen": -281.40625, + "logps/rejected": -253.7079315185547, + "loss": 0.7303, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.0323820635676384, + "rewards/margins": 0.0015699826180934906, + "rewards/rejected": 0.030812077224254608, + "step": 2076 + }, + { + "epoch": 0.32120626329016044, + "grad_norm": 6.0530290603637695, + "learning_rate": 4.96076297399473e-06, + "logits/chosen": 9.598310470581055, + "logits/rejected": 4.026642799377441, + "logps/chosen": -376.65087890625, + "logps/rejected": -279.203369140625, + "loss": 0.6874, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.1759847104549408, + "rewards/margins": 0.05274325981736183, + "rewards/rejected": -0.22872796654701233, + "step": 2077 + }, + { + "epoch": 0.3213609124299246, + "grad_norm": 10.415705680847168, + "learning_rate": 4.960476572345057e-06, + "logits/chosen": 7.924570083618164, + "logits/rejected": 5.085732936859131, + "logps/chosen": -295.445068359375, + "logps/rejected": -214.3765411376953, + "loss": 0.7776, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.24705390632152557, + "rewards/margins": -0.08882950991392136, + "rewards/rejected": -0.15822440385818481, + "step": 2078 + }, + { + "epoch": 0.32151556156968875, + "grad_norm": 5.727771282196045, + "learning_rate": 4.960190170695384e-06, + "logits/chosen": 5.952622413635254, + "logits/rejected": 4.558812618255615, + "logps/chosen": -222.62860107421875, + "logps/rejected": -184.36595153808594, + "loss": 0.8128, + "rewards/accuracies": 0.25, + "rewards/chosen": -0.4535398781299591, + "rewards/margins": -0.16294801235198975, + "rewards/rejected": -0.290591835975647, + "step": 2079 + }, + { + "epoch": 0.32167021070945295, + "grad_norm": 6.961986541748047, + "learning_rate": 4.95990376904571e-06, + "logits/chosen": 11.362753868103027, + "logits/rejected": 8.551691055297852, + "logps/chosen": -270.36566162109375, + "logps/rejected": -272.29693603515625, + "loss": 0.7045, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.04510479047894478, + "rewards/margins": 0.051329128444194794, + "rewards/rejected": -0.09643393009901047, + "step": 2080 + }, + { + "epoch": 0.3218248598492171, + "grad_norm": 8.04045295715332, + "learning_rate": 4.959617367396037e-06, + "logits/chosen": 5.838197231292725, + "logits/rejected": 2.3217861652374268, + "logps/chosen": -303.8896484375, + "logps/rejected": -261.91864013671875, + "loss": 0.5366, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.18876473605632782, + "rewards/margins": 0.40919047594070435, + "rewards/rejected": -0.22042575478553772, + "step": 2081 + }, + { + "epoch": 0.32197950898898126, + "grad_norm": 6.574918270111084, + "learning_rate": 4.959330965746363e-06, + "logits/chosen": 9.111629486083984, + "logits/rejected": 4.173726558685303, + "logps/chosen": -306.8055419921875, + "logps/rejected": -274.29632568359375, + "loss": 0.701, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.16950541734695435, + "rewards/margins": 0.027989469468593597, + "rewards/rejected": -0.19749489426612854, + "step": 2082 + }, + { + "epoch": 0.3221341581287454, + "grad_norm": 7.2142791748046875, + "learning_rate": 4.959044564096689e-06, + "logits/chosen": 16.955080032348633, + "logits/rejected": 16.498762130737305, + "logps/chosen": -366.94305419921875, + "logps/rejected": -268.07958984375, + "loss": 0.6393, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.3865606188774109, + "rewards/margins": 0.18941573798656464, + "rewards/rejected": 0.19714492559432983, + "step": 2083 + }, + { + "epoch": 0.32228880726850956, + "grad_norm": 6.034248352050781, + "learning_rate": 4.958758162447016e-06, + "logits/chosen": 11.447286605834961, + "logits/rejected": 10.902953147888184, + "logps/chosen": -280.1450500488281, + "logps/rejected": -301.2906799316406, + "loss": 0.6028, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.019470401108264923, + "rewards/margins": 0.23493099212646484, + "rewards/rejected": -0.21546059846878052, + "step": 2084 + }, + { + "epoch": 0.3224434564082737, + "grad_norm": 6.518868446350098, + "learning_rate": 4.958471760797343e-06, + "logits/chosen": 6.861526966094971, + "logits/rejected": 7.466789722442627, + "logps/chosen": -229.49166870117188, + "logps/rejected": -219.41001892089844, + "loss": 0.7894, + "rewards/accuracies": 0.25, + "rewards/chosen": -0.057122763246297836, + "rewards/margins": -0.11546182632446289, + "rewards/rejected": 0.05833907425403595, + "step": 2085 + }, + { + "epoch": 0.3225981055480379, + "grad_norm": 90.26467895507812, + "learning_rate": 4.958185359147669e-06, + "logits/chosen": 7.893308639526367, + "logits/rejected": 3.7543699741363525, + "logps/chosen": -231.8357391357422, + "logps/rejected": -215.47564697265625, + "loss": 0.5828, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.07351937890052795, + "rewards/margins": 0.3292279839515686, + "rewards/rejected": -0.40274736285209656, + "step": 2086 + }, + { + "epoch": 0.32275275468780207, + "grad_norm": 6.308873176574707, + "learning_rate": 4.957898957497996e-06, + "logits/chosen": 13.796135902404785, + "logits/rejected": 9.136918067932129, + "logps/chosen": -247.26329040527344, + "logps/rejected": -213.85240173339844, + "loss": 0.6556, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.03864327073097229, + "rewards/margins": 0.11595988273620605, + "rewards/rejected": -0.15460315346717834, + "step": 2087 + }, + { + "epoch": 0.3229074038275662, + "grad_norm": 9.705131530761719, + "learning_rate": 4.957612555848323e-06, + "logits/chosen": 8.198763847351074, + "logits/rejected": 10.425496101379395, + "logps/chosen": -239.85275268554688, + "logps/rejected": -360.1874084472656, + "loss": 0.7244, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.087799072265625, + "rewards/margins": 0.17272567749023438, + "rewards/rejected": -0.08492660522460938, + "step": 2088 + }, + { + "epoch": 0.32306205296733037, + "grad_norm": 6.444095611572266, + "learning_rate": 4.9573261541986485e-06, + "logits/chosen": 12.79790210723877, + "logits/rejected": 4.606011867523193, + "logps/chosen": -301.3984375, + "logps/rejected": -238.3819580078125, + "loss": 0.7282, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.3333694338798523, + "rewards/margins": -0.024694059044122696, + "rewards/rejected": -0.3086753785610199, + "step": 2089 + }, + { + "epoch": 0.3232167021070945, + "grad_norm": 14.251069068908691, + "learning_rate": 4.957039752548975e-06, + "logits/chosen": 14.770837783813477, + "logits/rejected": 0.0905866026878357, + "logps/chosen": -520.4288940429688, + "logps/rejected": -313.33428955078125, + "loss": 0.6629, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.1710100919008255, + "rewards/margins": 0.1557396948337555, + "rewards/rejected": 0.015270419418811798, + "step": 2090 + }, + { + "epoch": 0.3233713512468587, + "grad_norm": 5.565276622772217, + "learning_rate": 4.956753350899302e-06, + "logits/chosen": 4.931995868682861, + "logits/rejected": 9.044991493225098, + "logps/chosen": -241.9562225341797, + "logps/rejected": -231.12319946289062, + "loss": 0.7162, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.1074863001704216, + "rewards/margins": 0.3090801239013672, + "rewards/rejected": -0.4165664315223694, + "step": 2091 + }, + { + "epoch": 0.3235260003866228, + "grad_norm": 4.336131572723389, + "learning_rate": 4.956466949249628e-06, + "logits/chosen": 11.24570083618164, + "logits/rejected": 10.222318649291992, + "logps/chosen": -217.121337890625, + "logps/rejected": -206.05752563476562, + "loss": 0.5962, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.13777370750904083, + "rewards/margins": 0.28516972064971924, + "rewards/rejected": -0.42294347286224365, + "step": 2092 + }, + { + "epoch": 0.32368064952638703, + "grad_norm": 5.697318077087402, + "learning_rate": 4.956180547599955e-06, + "logits/chosen": 6.302378177642822, + "logits/rejected": 10.067934036254883, + "logps/chosen": -289.6753845214844, + "logps/rejected": -320.90655517578125, + "loss": 0.6154, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.2821327745914459, + "rewards/margins": 0.1861279457807541, + "rewards/rejected": 0.09600482136011124, + "step": 2093 + }, + { + "epoch": 0.3238352986661512, + "grad_norm": 7.901885032653809, + "learning_rate": 4.955894145950282e-06, + "logits/chosen": 16.094385147094727, + "logits/rejected": 2.6949076652526855, + "logps/chosen": -439.53515625, + "logps/rejected": -155.47381591796875, + "loss": 0.5518, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.10470141470432281, + "rewards/margins": 0.508083701133728, + "rewards/rejected": -0.40338224172592163, + "step": 2094 + }, + { + "epoch": 0.32398994780591533, + "grad_norm": 4.967139720916748, + "learning_rate": 4.9556077443006075e-06, + "logits/chosen": 10.589612007141113, + "logits/rejected": 10.220674514770508, + "logps/chosen": -315.0977478027344, + "logps/rejected": -268.65277099609375, + "loss": 0.5815, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.048830799758434296, + "rewards/margins": 0.24980440735816956, + "rewards/rejected": -0.20097360014915466, + "step": 2095 + }, + { + "epoch": 0.3241445969456795, + "grad_norm": 5.359178066253662, + "learning_rate": 4.955321342650934e-06, + "logits/chosen": 7.773028373718262, + "logits/rejected": 4.351597785949707, + "logps/chosen": -226.37010192871094, + "logps/rejected": -161.7034912109375, + "loss": 0.6527, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.08516424894332886, + "rewards/margins": 0.09899123013019562, + "rewards/rejected": -0.013826975598931313, + "step": 2096 + }, + { + "epoch": 0.32429924608544364, + "grad_norm": 3.8178317546844482, + "learning_rate": 4.955034941001261e-06, + "logits/chosen": 15.25799560546875, + "logits/rejected": 8.078344345092773, + "logps/chosen": -173.55859375, + "logps/rejected": -186.33509826660156, + "loss": 0.6671, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.07011398673057556, + "rewards/margins": 0.10311809182167053, + "rewards/rejected": -0.1732320785522461, + "step": 2097 + }, + { + "epoch": 0.3244538952252078, + "grad_norm": 4.950554847717285, + "learning_rate": 4.9547485393515875e-06, + "logits/chosen": 14.872785568237305, + "logits/rejected": 8.911056518554688, + "logps/chosen": -358.3226623535156, + "logps/rejected": -257.03985595703125, + "loss": 0.5664, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.23612824082374573, + "rewards/margins": 0.49434345960617065, + "rewards/rejected": -0.2582152485847473, + "step": 2098 + }, + { + "epoch": 0.324608544364972, + "grad_norm": 5.108302116394043, + "learning_rate": 4.954462137701913e-06, + "logits/chosen": 11.643625259399414, + "logits/rejected": 6.648677825927734, + "logps/chosen": -287.53985595703125, + "logps/rejected": -146.73568725585938, + "loss": 0.7027, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.23673954606056213, + "rewards/margins": -0.0164263267070055, + "rewards/rejected": -0.22031322121620178, + "step": 2099 + }, + { + "epoch": 0.32476319350473615, + "grad_norm": 4.536948204040527, + "learning_rate": 4.95417573605224e-06, + "logits/chosen": 10.21849250793457, + "logits/rejected": 3.383326292037964, + "logps/chosen": -306.2857360839844, + "logps/rejected": -190.88919067382812, + "loss": 0.4834, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.44766801595687866, + "rewards/margins": 0.6022131443023682, + "rewards/rejected": -0.1545451581478119, + "step": 2100 + }, + { + "epoch": 0.3249178426445003, + "grad_norm": 5.082120418548584, + "learning_rate": 4.9538893344025666e-06, + "logits/chosen": 7.061097621917725, + "logits/rejected": 3.894728660583496, + "logps/chosen": -364.2991943359375, + "logps/rejected": -309.91064453125, + "loss": 0.4424, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.3529106378555298, + "rewards/margins": 0.6648345589637756, + "rewards/rejected": -0.31192389130592346, + "step": 2101 + }, + { + "epoch": 0.32507249178426445, + "grad_norm": 4.058323860168457, + "learning_rate": 4.953602932752893e-06, + "logits/chosen": 9.08685302734375, + "logits/rejected": 9.114982604980469, + "logps/chosen": -183.080322265625, + "logps/rejected": -221.96896362304688, + "loss": 0.5011, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.20131412148475647, + "rewards/margins": 0.6041401028633118, + "rewards/rejected": -0.4028259813785553, + "step": 2102 + }, + { + "epoch": 0.3252271409240286, + "grad_norm": 4.774957656860352, + "learning_rate": 4.953316531103219e-06, + "logits/chosen": 13.167054176330566, + "logits/rejected": 5.450692176818848, + "logps/chosen": -331.02880859375, + "logps/rejected": -213.2152862548828, + "loss": 0.5538, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.455463707447052, + "rewards/margins": 0.3619253933429718, + "rewards/rejected": 0.09353828430175781, + "step": 2103 + }, + { + "epoch": 0.32538179006379275, + "grad_norm": 4.387048244476318, + "learning_rate": 4.953030129453546e-06, + "logits/chosen": 8.454203605651855, + "logits/rejected": 10.961136817932129, + "logps/chosen": -245.9151611328125, + "logps/rejected": -292.20928955078125, + "loss": 0.5568, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.021159403026103973, + "rewards/margins": 0.33990398049354553, + "rewards/rejected": -0.31874457001686096, + "step": 2104 + }, + { + "epoch": 0.3255364392035569, + "grad_norm": 5.282306671142578, + "learning_rate": 4.952743727803872e-06, + "logits/chosen": 11.518450736999512, + "logits/rejected": 7.982766151428223, + "logps/chosen": -334.3492431640625, + "logps/rejected": -217.17800903320312, + "loss": 0.6692, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.03269042819738388, + "rewards/margins": 0.11648471653461456, + "rewards/rejected": -0.14917513728141785, + "step": 2105 + }, + { + "epoch": 0.3256910883433211, + "grad_norm": 5.2052202224731445, + "learning_rate": 4.952457326154199e-06, + "logits/chosen": 12.729175567626953, + "logits/rejected": 6.6468095779418945, + "logps/chosen": -306.32366943359375, + "logps/rejected": -265.7822570800781, + "loss": 0.5525, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.09600372612476349, + "rewards/margins": 0.3471567928791046, + "rewards/rejected": -0.2511530816555023, + "step": 2106 + }, + { + "epoch": 0.32584573748308526, + "grad_norm": 4.601687431335449, + "learning_rate": 4.952170924504526e-06, + "logits/chosen": 13.283109664916992, + "logits/rejected": 3.378521203994751, + "logps/chosen": -374.67254638671875, + "logps/rejected": -253.61602783203125, + "loss": 0.5105, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.42353230714797974, + "rewards/margins": 0.49716857075691223, + "rewards/rejected": -0.0736362412571907, + "step": 2107 + }, + { + "epoch": 0.3260003866228494, + "grad_norm": 4.700244426727295, + "learning_rate": 4.9518845228548514e-06, + "logits/chosen": 6.466777801513672, + "logits/rejected": 3.8909730911254883, + "logps/chosen": -231.74525451660156, + "logps/rejected": -174.89566040039062, + "loss": 0.6818, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.32801496982574463, + "rewards/margins": 0.11791111528873444, + "rewards/rejected": 0.2101038247346878, + "step": 2108 + }, + { + "epoch": 0.32615503576261357, + "grad_norm": 4.501307010650635, + "learning_rate": 4.951598121205178e-06, + "logits/chosen": 13.879536628723145, + "logits/rejected": 7.336886882781982, + "logps/chosen": -202.54266357421875, + "logps/rejected": -116.31472778320312, + "loss": 0.6563, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.013379007577896118, + "rewards/margins": 0.16892746090888977, + "rewards/rejected": -0.1823064386844635, + "step": 2109 + }, + { + "epoch": 0.3263096849023777, + "grad_norm": 7.5384602546691895, + "learning_rate": 4.951311719555505e-06, + "logits/chosen": 13.817075729370117, + "logits/rejected": 5.907853603363037, + "logps/chosen": -234.12368774414062, + "logps/rejected": -131.2285919189453, + "loss": 0.7606, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.1672285497188568, + "rewards/margins": -0.025172285735607147, + "rewards/rejected": -0.14205628633499146, + "step": 2110 + }, + { + "epoch": 0.32646433404214187, + "grad_norm": 6.5641093254089355, + "learning_rate": 4.951025317905831e-06, + "logits/chosen": 8.768404006958008, + "logits/rejected": 13.349371910095215, + "logps/chosen": -275.34429931640625, + "logps/rejected": -348.5492248535156, + "loss": 0.7386, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.18378829956054688, + "rewards/margins": -0.06050229072570801, + "rewards/rejected": -0.12328600883483887, + "step": 2111 + }, + { + "epoch": 0.3266189831819061, + "grad_norm": 7.226702690124512, + "learning_rate": 4.950738916256158e-06, + "logits/chosen": 8.845691680908203, + "logits/rejected": 12.88252067565918, + "logps/chosen": -301.1474609375, + "logps/rejected": -445.46343994140625, + "loss": 0.7479, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.08916091918945312, + "rewards/margins": -0.03999757766723633, + "rewards/rejected": 0.12915846705436707, + "step": 2112 + }, + { + "epoch": 0.3267736323216702, + "grad_norm": 7.249813079833984, + "learning_rate": 4.950452514606485e-06, + "logits/chosen": 5.492649078369141, + "logits/rejected": 0.48049187660217285, + "logps/chosen": -425.01708984375, + "logps/rejected": -296.83221435546875, + "loss": 0.7061, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.30592823028564453, + "rewards/margins": 0.052317190915346146, + "rewards/rejected": 0.2536110579967499, + "step": 2113 + }, + { + "epoch": 0.3269282814614344, + "grad_norm": 6.455074310302734, + "learning_rate": 4.950166112956811e-06, + "logits/chosen": 7.3708882331848145, + "logits/rejected": 11.505752563476562, + "logps/chosen": -282.5945739746094, + "logps/rejected": -288.6684265136719, + "loss": 0.6424, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.12609276175498962, + "rewards/margins": 0.15550580620765686, + "rewards/rejected": -0.029413044452667236, + "step": 2114 + }, + { + "epoch": 0.32708293060119853, + "grad_norm": 6.537588119506836, + "learning_rate": 4.949879711307137e-06, + "logits/chosen": 13.17055606842041, + "logits/rejected": 5.763317108154297, + "logps/chosen": -323.1358947753906, + "logps/rejected": -250.46908569335938, + "loss": 0.5604, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.3411043584346771, + "rewards/margins": 0.4357706606388092, + "rewards/rejected": -0.09466633945703506, + "step": 2115 + }, + { + "epoch": 0.3272375797409627, + "grad_norm": 5.728628158569336, + "learning_rate": 4.949593309657464e-06, + "logits/chosen": 11.254746437072754, + "logits/rejected": 9.19675064086914, + "logps/chosen": -234.3589630126953, + "logps/rejected": -301.9439697265625, + "loss": 0.6734, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.1750057190656662, + "rewards/margins": 0.3434600830078125, + "rewards/rejected": -0.1684543341398239, + "step": 2116 + }, + { + "epoch": 0.32739222888072683, + "grad_norm": 4.271417140960693, + "learning_rate": 4.9493069080077904e-06, + "logits/chosen": 9.724249839782715, + "logits/rejected": 6.8040266036987305, + "logps/chosen": -313.16461181640625, + "logps/rejected": -248.35000610351562, + "loss": 0.5831, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.024228574708104134, + "rewards/margins": 0.28434285521507263, + "rewards/rejected": -0.26011431217193604, + "step": 2117 + }, + { + "epoch": 0.327546878020491, + "grad_norm": 7.3210554122924805, + "learning_rate": 4.949020506358117e-06, + "logits/chosen": 3.5579299926757812, + "logits/rejected": 4.765020847320557, + "logps/chosen": -287.75970458984375, + "logps/rejected": -297.7496032714844, + "loss": 0.7942, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.10287874937057495, + "rewards/margins": -0.12194245308637619, + "rewards/rejected": 0.019063711166381836, + "step": 2118 + }, + { + "epoch": 0.3277015271602552, + "grad_norm": 6.927490234375, + "learning_rate": 4.948734104708444e-06, + "logits/chosen": 12.66522216796875, + "logits/rejected": 7.755368232727051, + "logps/chosen": -276.4278259277344, + "logps/rejected": -245.3193359375, + "loss": 0.6959, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.06859144568443298, + "rewards/margins": 0.12725737690925598, + "rewards/rejected": -0.05866594240069389, + "step": 2119 + }, + { + "epoch": 0.32785617630001934, + "grad_norm": 15.658063888549805, + "learning_rate": 4.94844770305877e-06, + "logits/chosen": 12.226007461547852, + "logits/rejected": 9.363908767700195, + "logps/chosen": -143.8397216796875, + "logps/rejected": -190.3950653076172, + "loss": 0.8341, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.6046377420425415, + "rewards/margins": -0.2315879464149475, + "rewards/rejected": -0.3730497658252716, + "step": 2120 + }, + { + "epoch": 0.3280108254397835, + "grad_norm": 4.808434963226318, + "learning_rate": 4.948161301409097e-06, + "logits/chosen": 7.353221416473389, + "logits/rejected": 10.257070541381836, + "logps/chosen": -173.77374267578125, + "logps/rejected": -243.19540405273438, + "loss": 0.6458, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.04191150516271591, + "rewards/margins": 0.2779221534729004, + "rewards/rejected": -0.2360106259584427, + "step": 2121 + }, + { + "epoch": 0.32816547457954764, + "grad_norm": 6.093598365783691, + "learning_rate": 4.947874899759423e-06, + "logits/chosen": 12.379583358764648, + "logits/rejected": 7.348960876464844, + "logps/chosen": -349.2428894042969, + "logps/rejected": -271.2073059082031, + "loss": 0.6332, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.2535230815410614, + "rewards/margins": 0.37280115485191345, + "rewards/rejected": -0.11927807331085205, + "step": 2122 + }, + { + "epoch": 0.3283201237193118, + "grad_norm": 6.790486812591553, + "learning_rate": 4.9475884981097495e-06, + "logits/chosen": 11.065315246582031, + "logits/rejected": 5.394992351531982, + "logps/chosen": -319.71978759765625, + "logps/rejected": -227.23475646972656, + "loss": 0.7013, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.2551566958427429, + "rewards/margins": 0.07567586749792099, + "rewards/rejected": 0.17948083579540253, + "step": 2123 + }, + { + "epoch": 0.32847477285907595, + "grad_norm": 6.010940074920654, + "learning_rate": 4.947302096460076e-06, + "logits/chosen": 10.837716102600098, + "logits/rejected": 6.9498820304870605, + "logps/chosen": -429.13482666015625, + "logps/rejected": -370.83343505859375, + "loss": 0.7371, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.30295678973197937, + "rewards/margins": 0.16916626691818237, + "rewards/rejected": 0.1337904930114746, + "step": 2124 + }, + { + "epoch": 0.32862942199884015, + "grad_norm": 6.506544589996338, + "learning_rate": 4.947015694810403e-06, + "logits/chosen": 2.0541067123413086, + "logits/rejected": 1.5029680728912354, + "logps/chosen": -227.41168212890625, + "logps/rejected": -219.27052307128906, + "loss": 0.6139, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.010286381468176842, + "rewards/margins": 0.1993921399116516, + "rewards/rejected": -0.18910574913024902, + "step": 2125 + }, + { + "epoch": 0.3287840711386043, + "grad_norm": 4.318284511566162, + "learning_rate": 4.9467292931607294e-06, + "logits/chosen": 8.311731338500977, + "logits/rejected": 1.882987380027771, + "logps/chosen": -206.57373046875, + "logps/rejected": -185.872314453125, + "loss": 0.5713, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.03734751045703888, + "rewards/margins": 0.38800737261772156, + "rewards/rejected": -0.42535486817359924, + "step": 2126 + }, + { + "epoch": 0.32893872027836846, + "grad_norm": 6.992857456207275, + "learning_rate": 4.946442891511056e-06, + "logits/chosen": 10.2127046585083, + "logits/rejected": 8.646617889404297, + "logps/chosen": -376.332275390625, + "logps/rejected": -331.8555908203125, + "loss": 0.7944, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.23878727853298187, + "rewards/margins": -0.11294031888246536, + "rewards/rejected": -0.1258469521999359, + "step": 2127 + }, + { + "epoch": 0.3290933694181326, + "grad_norm": 4.463717937469482, + "learning_rate": 4.946156489861382e-06, + "logits/chosen": 10.69537353515625, + "logits/rejected": 4.715465068817139, + "logps/chosen": -296.04986572265625, + "logps/rejected": -245.56149291992188, + "loss": 0.5613, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.549194872379303, + "rewards/margins": 0.33023345470428467, + "rewards/rejected": 0.2189614474773407, + "step": 2128 + }, + { + "epoch": 0.32924801855789676, + "grad_norm": 4.673308849334717, + "learning_rate": 4.9458700882117085e-06, + "logits/chosen": 11.814477920532227, + "logits/rejected": 4.963963508605957, + "logps/chosen": -307.353759765625, + "logps/rejected": -207.64404296875, + "loss": 0.5448, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.2856033444404602, + "rewards/margins": 0.5145837068557739, + "rewards/rejected": -0.22898036241531372, + "step": 2129 + }, + { + "epoch": 0.3294026676976609, + "grad_norm": 8.315261840820312, + "learning_rate": 4.945583686562035e-06, + "logits/chosen": 11.927656173706055, + "logits/rejected": 9.667914390563965, + "logps/chosen": -384.94854736328125, + "logps/rejected": -338.69158935546875, + "loss": 0.6677, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.2725120484828949, + "rewards/margins": 0.13412266969680786, + "rewards/rejected": 0.13838940858840942, + "step": 2130 + }, + { + "epoch": 0.3295573168374251, + "grad_norm": 4.120357036590576, + "learning_rate": 4.945297284912362e-06, + "logits/chosen": 10.142644882202148, + "logits/rejected": 6.500965118408203, + "logps/chosen": -223.88516235351562, + "logps/rejected": -137.29342651367188, + "loss": 0.6292, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.005709746852517128, + "rewards/margins": 0.16655507683753967, + "rewards/rejected": -0.17226482927799225, + "step": 2131 + }, + { + "epoch": 0.32971196597718927, + "grad_norm": 5.709237575531006, + "learning_rate": 4.9450108832626885e-06, + "logits/chosen": 8.734935760498047, + "logits/rejected": 3.5999698638916016, + "logps/chosen": -222.15179443359375, + "logps/rejected": -206.25152587890625, + "loss": 0.712, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.12729153037071228, + "rewards/margins": 0.18759658932685852, + "rewards/rejected": -0.3148880898952484, + "step": 2132 + }, + { + "epoch": 0.3298666151169534, + "grad_norm": 7.291140556335449, + "learning_rate": 4.944724481613014e-06, + "logits/chosen": 4.563351631164551, + "logits/rejected": 5.958873271942139, + "logps/chosen": -266.71563720703125, + "logps/rejected": -329.9641418457031, + "loss": 0.6278, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.0905306339263916, + "rewards/margins": 0.22541368007659912, + "rewards/rejected": -0.3159443140029907, + "step": 2133 + }, + { + "epoch": 0.3300212642567176, + "grad_norm": 6.44389533996582, + "learning_rate": 4.944438079963341e-06, + "logits/chosen": 4.952027320861816, + "logits/rejected": 5.0103888511657715, + "logps/chosen": -237.36703491210938, + "logps/rejected": -254.44178771972656, + "loss": 0.7994, + "rewards/accuracies": 0.25, + "rewards/chosen": 0.16160213947296143, + "rewards/margins": -0.07677437365055084, + "rewards/rejected": 0.23837654292583466, + "step": 2134 + }, + { + "epoch": 0.3301759133964817, + "grad_norm": 16.3490047454834, + "learning_rate": 4.944151678313668e-06, + "logits/chosen": 8.593355178833008, + "logits/rejected": 7.395482063293457, + "logps/chosen": -335.38397216796875, + "logps/rejected": -294.83551025390625, + "loss": 0.8929, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.19003048539161682, + "rewards/margins": -0.3215496838092804, + "rewards/rejected": 0.511580228805542, + "step": 2135 + }, + { + "epoch": 0.3303305625362459, + "grad_norm": 5.849039077758789, + "learning_rate": 4.943865276663994e-06, + "logits/chosen": 18.099807739257812, + "logits/rejected": 12.95901870727539, + "logps/chosen": -231.89871215820312, + "logps/rejected": -217.69552612304688, + "loss": 0.6823, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.13253240287303925, + "rewards/margins": 0.10790696740150452, + "rewards/rejected": 0.024625442922115326, + "step": 2136 + }, + { + "epoch": 0.33048521167601, + "grad_norm": 19.142820358276367, + "learning_rate": 4.94357887501432e-06, + "logits/chosen": 12.757999420166016, + "logits/rejected": 0.3978739082813263, + "logps/chosen": -303.5205993652344, + "logps/rejected": -153.62709045410156, + "loss": 0.5001, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.392508327960968, + "rewards/margins": 0.4961417317390442, + "rewards/rejected": -0.10363340377807617, + "step": 2137 + }, + { + "epoch": 0.33063986081577423, + "grad_norm": 5.831638813018799, + "learning_rate": 4.943292473364647e-06, + "logits/chosen": 1.585843801498413, + "logits/rejected": 2.117445707321167, + "logps/chosen": -205.6033172607422, + "logps/rejected": -255.28086853027344, + "loss": 0.7501, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.10074068605899811, + "rewards/margins": -0.06774191558361053, + "rewards/rejected": -0.03299874812364578, + "step": 2138 + }, + { + "epoch": 0.3307945099555384, + "grad_norm": 7.719616889953613, + "learning_rate": 4.943006071714973e-06, + "logits/chosen": 12.479236602783203, + "logits/rejected": 10.195213317871094, + "logps/chosen": -280.171630859375, + "logps/rejected": -305.09393310546875, + "loss": 0.777, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.22542619705200195, + "rewards/margins": -0.02217789739370346, + "rewards/rejected": 0.247604101896286, + "step": 2139 + }, + { + "epoch": 0.33094915909530254, + "grad_norm": 4.695708751678467, + "learning_rate": 4.9427196700653e-06, + "logits/chosen": 10.320137023925781, + "logits/rejected": 8.84394359588623, + "logps/chosen": -252.85000610351562, + "logps/rejected": -201.3157958984375, + "loss": 0.6776, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.3142176568508148, + "rewards/margins": 0.13354313373565674, + "rewards/rejected": 0.18067452311515808, + "step": 2140 + }, + { + "epoch": 0.3311038082350667, + "grad_norm": 6.1608171463012695, + "learning_rate": 4.942433268415626e-06, + "logits/chosen": 6.467574119567871, + "logits/rejected": 6.092167377471924, + "logps/chosen": -216.0699005126953, + "logps/rejected": -213.14077758789062, + "loss": 0.8243, + "rewards/accuracies": 0.25, + "rewards/chosen": -0.17924192547798157, + "rewards/margins": -0.18616530299186707, + "rewards/rejected": 0.006923386827111244, + "step": 2141 + }, + { + "epoch": 0.33125845737483084, + "grad_norm": 4.709331512451172, + "learning_rate": 4.9421468667659525e-06, + "logits/chosen": 12.148876190185547, + "logits/rejected": 10.136076927185059, + "logps/chosen": -248.241455078125, + "logps/rejected": -265.0379333496094, + "loss": 0.6733, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.036957353353500366, + "rewards/margins": 0.10949775576591492, + "rewards/rejected": -0.07254038006067276, + "step": 2142 + }, + { + "epoch": 0.331413106514595, + "grad_norm": 13.91828727722168, + "learning_rate": 4.941860465116279e-06, + "logits/chosen": 10.493155479431152, + "logits/rejected": 8.240067481994629, + "logps/chosen": -205.35140991210938, + "logps/rejected": -286.3068542480469, + "loss": 0.8099, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.07062984257936478, + "rewards/margins": -0.1602824628353119, + "rewards/rejected": 0.23091231286525726, + "step": 2143 + }, + { + "epoch": 0.3315677556543592, + "grad_norm": 5.486730575561523, + "learning_rate": 4.941574063466606e-06, + "logits/chosen": 10.753125190734863, + "logits/rejected": 5.998003005981445, + "logps/chosen": -145.19281005859375, + "logps/rejected": -147.0620574951172, + "loss": 0.6293, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.012790344655513763, + "rewards/margins": 0.17342935502529144, + "rewards/rejected": -0.1862196922302246, + "step": 2144 + }, + { + "epoch": 0.33172240479412335, + "grad_norm": 5.468768119812012, + "learning_rate": 4.941287661816932e-06, + "logits/chosen": 10.622962951660156, + "logits/rejected": 6.332726955413818, + "logps/chosen": -341.4171447753906, + "logps/rejected": -252.20851135253906, + "loss": 0.6374, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.3033198416233063, + "rewards/margins": 0.21081838011741638, + "rewards/rejected": 0.0925014540553093, + "step": 2145 + }, + { + "epoch": 0.3318770539338875, + "grad_norm": 6.681360244750977, + "learning_rate": 4.941001260167259e-06, + "logits/chosen": 6.566615581512451, + "logits/rejected": 8.766490936279297, + "logps/chosen": -307.0692138671875, + "logps/rejected": -330.61456298828125, + "loss": 0.6946, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.11982184648513794, + "rewards/margins": 0.0829787403345108, + "rewards/rejected": 0.036843106150627136, + "step": 2146 + }, + { + "epoch": 0.33203170307365165, + "grad_norm": 5.155014514923096, + "learning_rate": 4.940714858517586e-06, + "logits/chosen": 8.016521453857422, + "logits/rejected": 3.768669843673706, + "logps/chosen": -331.3190612792969, + "logps/rejected": -281.0326232910156, + "loss": 0.5944, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.2365173101425171, + "rewards/margins": 0.2611900269985199, + "rewards/rejected": -0.024672742933034897, + "step": 2147 + }, + { + "epoch": 0.3321863522134158, + "grad_norm": 4.372710704803467, + "learning_rate": 4.9404284568679115e-06, + "logits/chosen": 9.412343978881836, + "logits/rejected": 7.2935357093811035, + "logps/chosen": -228.67184448242188, + "logps/rejected": -178.893310546875, + "loss": 0.6754, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.01220519095659256, + "rewards/margins": 0.10863519459962845, + "rewards/rejected": -0.09643000364303589, + "step": 2148 + }, + { + "epoch": 0.33234100135317995, + "grad_norm": 6.063559532165527, + "learning_rate": 4.940142055218238e-06, + "logits/chosen": 12.230186462402344, + "logits/rejected": 8.85799789428711, + "logps/chosen": -325.6956787109375, + "logps/rejected": -299.7931213378906, + "loss": 0.6154, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.35750043392181396, + "rewards/margins": 0.3025447428226471, + "rewards/rejected": 0.05495566874742508, + "step": 2149 + }, + { + "epoch": 0.3324956504929441, + "grad_norm": 5.640564441680908, + "learning_rate": 4.939855653568565e-06, + "logits/chosen": 12.984869003295898, + "logits/rejected": 4.921185493469238, + "logps/chosen": -243.9294891357422, + "logps/rejected": -144.32144165039062, + "loss": 0.6189, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.23765462636947632, + "rewards/margins": 0.19950953125953674, + "rewards/rejected": 0.03814505785703659, + "step": 2150 + }, + { + "epoch": 0.3326502996327083, + "grad_norm": 4.923940658569336, + "learning_rate": 4.9395692519188915e-06, + "logits/chosen": 15.85429859161377, + "logits/rejected": 12.531364440917969, + "logps/chosen": -255.62393188476562, + "logps/rejected": -256.057861328125, + "loss": 0.6162, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.11795315891504288, + "rewards/margins": 0.23732790350914001, + "rewards/rejected": -0.11937475204467773, + "step": 2151 + }, + { + "epoch": 0.33280494877247246, + "grad_norm": 4.349578857421875, + "learning_rate": 4.939282850269218e-06, + "logits/chosen": 4.954311370849609, + "logits/rejected": 7.276858806610107, + "logps/chosen": -260.74139404296875, + "logps/rejected": -193.87864685058594, + "loss": 0.5965, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.3750847578048706, + "rewards/margins": 0.27639806270599365, + "rewards/rejected": 0.09868665784597397, + "step": 2152 + }, + { + "epoch": 0.3329595979122366, + "grad_norm": 4.945383071899414, + "learning_rate": 4.938996448619545e-06, + "logits/chosen": 10.00120735168457, + "logits/rejected": 6.675858974456787, + "logps/chosen": -278.8822021484375, + "logps/rejected": -257.4023742675781, + "loss": 0.4772, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.27263402938842773, + "rewards/margins": 0.6318854093551636, + "rewards/rejected": -0.35925135016441345, + "step": 2153 + }, + { + "epoch": 0.33311424705200077, + "grad_norm": 7.046670436859131, + "learning_rate": 4.938710046969871e-06, + "logits/chosen": 8.616357803344727, + "logits/rejected": 8.351316452026367, + "logps/chosen": -418.61322021484375, + "logps/rejected": -293.4689025878906, + "loss": 0.8005, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.19877421855926514, + "rewards/margins": -0.12812680006027222, + "rewards/rejected": 0.32690104842185974, + "step": 2154 + }, + { + "epoch": 0.3332688961917649, + "grad_norm": 6.001054286956787, + "learning_rate": 4.938423645320197e-06, + "logits/chosen": 8.804516792297363, + "logits/rejected": 8.200180053710938, + "logps/chosen": -309.0278015136719, + "logps/rejected": -242.1177520751953, + "loss": 0.7059, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.1057676374912262, + "rewards/margins": 0.0562465637922287, + "rewards/rejected": 0.0495210662484169, + "step": 2155 + }, + { + "epoch": 0.33342354533152907, + "grad_norm": 8.740618705749512, + "learning_rate": 4.938137243670524e-06, + "logits/chosen": 9.079069137573242, + "logits/rejected": 5.190154552459717, + "logps/chosen": -274.79742431640625, + "logps/rejected": -216.66575622558594, + "loss": 0.7554, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.0934847816824913, + "rewards/margins": -0.02816416695713997, + "rewards/rejected": 0.12164896726608276, + "step": 2156 + }, + { + "epoch": 0.3335781944712933, + "grad_norm": 5.348598480224609, + "learning_rate": 4.9378508420208505e-06, + "logits/chosen": 13.66172981262207, + "logits/rejected": 9.360877990722656, + "logps/chosen": -383.59332275390625, + "logps/rejected": -299.9488525390625, + "loss": 0.5302, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.578677773475647, + "rewards/margins": 0.43235549330711365, + "rewards/rejected": 0.14632225036621094, + "step": 2157 + }, + { + "epoch": 0.33373284361105743, + "grad_norm": 7.374210834503174, + "learning_rate": 4.937564440371177e-06, + "logits/chosen": 13.268245697021484, + "logits/rejected": 6.78769588470459, + "logps/chosen": -332.8704528808594, + "logps/rejected": -289.2275085449219, + "loss": 0.6614, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.23434966802597046, + "rewards/margins": 0.10861466079950333, + "rewards/rejected": 0.12573499977588654, + "step": 2158 + }, + { + "epoch": 0.3338874927508216, + "grad_norm": 7.228075981140137, + "learning_rate": 4.937278038721504e-06, + "logits/chosen": 11.237176895141602, + "logits/rejected": 7.574673175811768, + "logps/chosen": -277.8772277832031, + "logps/rejected": -312.7956848144531, + "loss": 0.8184, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.10549846291542053, + "rewards/margins": -0.21149574220180511, + "rewards/rejected": 0.31699422001838684, + "step": 2159 + }, + { + "epoch": 0.33404214189058573, + "grad_norm": 6.167690277099609, + "learning_rate": 4.9369916370718305e-06, + "logits/chosen": 10.053128242492676, + "logits/rejected": 4.656221866607666, + "logps/chosen": -305.4530944824219, + "logps/rejected": -212.2640380859375, + "loss": 0.7537, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.026428520679473877, + "rewards/margins": -0.06817140430212021, + "rewards/rejected": 0.04174289107322693, + "step": 2160 + }, + { + "epoch": 0.3341967910303499, + "grad_norm": 5.511512279510498, + "learning_rate": 4.936705235422156e-06, + "logits/chosen": 9.553276062011719, + "logits/rejected": 11.722877502441406, + "logps/chosen": -261.0096130371094, + "logps/rejected": -209.82296752929688, + "loss": 0.5279, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.3160475492477417, + "rewards/margins": 0.5216274261474609, + "rewards/rejected": -0.20557984709739685, + "step": 2161 + }, + { + "epoch": 0.33435144017011403, + "grad_norm": 8.41563606262207, + "learning_rate": 4.936418833772483e-06, + "logits/chosen": 8.621390342712402, + "logits/rejected": 7.7258687019348145, + "logps/chosen": -335.4895935058594, + "logps/rejected": -345.52081298828125, + "loss": 0.709, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.4417629539966583, + "rewards/margins": -0.0197154451161623, + "rewards/rejected": 0.46147841215133667, + "step": 2162 + }, + { + "epoch": 0.33450608930987824, + "grad_norm": 4.94342041015625, + "learning_rate": 4.93613243212281e-06, + "logits/chosen": 6.482465744018555, + "logits/rejected": 4.99796199798584, + "logps/chosen": -288.6210021972656, + "logps/rejected": -260.36444091796875, + "loss": 0.6365, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.4152541160583496, + "rewards/margins": 0.14538335800170898, + "rewards/rejected": 0.2698707580566406, + "step": 2163 + }, + { + "epoch": 0.3346607384496424, + "grad_norm": 7.682459831237793, + "learning_rate": 4.935846030473136e-06, + "logits/chosen": 14.980879783630371, + "logits/rejected": 9.9533052444458, + "logps/chosen": -410.21746826171875, + "logps/rejected": -413.191650390625, + "loss": 0.7724, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.16669350862503052, + "rewards/margins": -0.06279012560844421, + "rewards/rejected": 0.22948360443115234, + "step": 2164 + }, + { + "epoch": 0.33481538758940654, + "grad_norm": 5.459781646728516, + "learning_rate": 4.935559628823463e-06, + "logits/chosen": 13.500865936279297, + "logits/rejected": 13.78359603881836, + "logps/chosen": -333.18414306640625, + "logps/rejected": -322.5073547363281, + "loss": 0.643, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.17819365859031677, + "rewards/margins": 0.20343102514743805, + "rewards/rejected": -0.02523735910654068, + "step": 2165 + }, + { + "epoch": 0.3349700367291707, + "grad_norm": 9.41622543334961, + "learning_rate": 4.935273227173789e-06, + "logits/chosen": 9.599180221557617, + "logits/rejected": 2.0726683139801025, + "logps/chosen": -274.6078186035156, + "logps/rejected": -159.72003173828125, + "loss": 0.7232, + "rewards/accuracies": 0.25, + "rewards/chosen": 0.06719403713941574, + "rewards/margins": 0.02843063324689865, + "rewards/rejected": 0.038763418793678284, + "step": 2166 + }, + { + "epoch": 0.33512468586893485, + "grad_norm": 4.893899440765381, + "learning_rate": 4.934986825524115e-06, + "logits/chosen": 7.2651872634887695, + "logits/rejected": 4.972268581390381, + "logps/chosen": -427.45635986328125, + "logps/rejected": -338.7460632324219, + "loss": 0.5612, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.6526381969451904, + "rewards/margins": 0.38484787940979004, + "rewards/rejected": 0.2677903175354004, + "step": 2167 + }, + { + "epoch": 0.335279335008699, + "grad_norm": 4.368253231048584, + "learning_rate": 4.934700423874442e-06, + "logits/chosen": 5.152036190032959, + "logits/rejected": 5.321083068847656, + "logps/chosen": -211.33250427246094, + "logps/rejected": -188.63491821289062, + "loss": 0.7066, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.22109776735305786, + "rewards/margins": 0.04243742674589157, + "rewards/rejected": 0.1786603331565857, + "step": 2168 + }, + { + "epoch": 0.33543398414846315, + "grad_norm": 5.0637407302856445, + "learning_rate": 4.934414022224769e-06, + "logits/chosen": 10.351551055908203, + "logits/rejected": 6.950599670410156, + "logps/chosen": -304.55194091796875, + "logps/rejected": -267.5262451171875, + "loss": 0.6939, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.1781468391418457, + "rewards/margins": 0.03295135870575905, + "rewards/rejected": 0.14519548416137695, + "step": 2169 + }, + { + "epoch": 0.33558863328822736, + "grad_norm": 6.258752822875977, + "learning_rate": 4.934127620575095e-06, + "logits/chosen": 6.840770244598389, + "logits/rejected": 14.694913864135742, + "logps/chosen": -328.0792541503906, + "logps/rejected": -494.0980529785156, + "loss": 0.7807, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.35887566208839417, + "rewards/margins": -0.006628043949604034, + "rewards/rejected": 0.3655036985874176, + "step": 2170 + }, + { + "epoch": 0.3357432824279915, + "grad_norm": 6.289960861206055, + "learning_rate": 4.933841218925421e-06, + "logits/chosen": 2.8483967781066895, + "logits/rejected": 4.20119571685791, + "logps/chosen": -266.65350341796875, + "logps/rejected": -330.10186767578125, + "loss": 0.5466, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.10865803062915802, + "rewards/margins": 0.442324161529541, + "rewards/rejected": -0.5509821772575378, + "step": 2171 + }, + { + "epoch": 0.33589793156775566, + "grad_norm": 7.433115005493164, + "learning_rate": 4.933554817275748e-06, + "logits/chosen": 4.327009677886963, + "logits/rejected": 9.659065246582031, + "logps/chosen": -175.13323974609375, + "logps/rejected": -283.44146728515625, + "loss": 0.9958, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.10162576287984848, + "rewards/margins": -0.4215138852596283, + "rewards/rejected": 0.3198881149291992, + "step": 2172 + }, + { + "epoch": 0.3360525807075198, + "grad_norm": 5.919958114624023, + "learning_rate": 4.933268415626074e-06, + "logits/chosen": 9.624743461608887, + "logits/rejected": 6.929082870483398, + "logps/chosen": -343.0086669921875, + "logps/rejected": -272.3357238769531, + "loss": 0.676, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.35381561517715454, + "rewards/margins": 0.09450992196798325, + "rewards/rejected": 0.2593056857585907, + "step": 2173 + }, + { + "epoch": 0.33620722984728396, + "grad_norm": 5.625269889831543, + "learning_rate": 4.932982013976401e-06, + "logits/chosen": 9.900564193725586, + "logits/rejected": 9.90178108215332, + "logps/chosen": -278.70416259765625, + "logps/rejected": -254.29090881347656, + "loss": 0.6639, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.2910246253013611, + "rewards/margins": 0.1581825464963913, + "rewards/rejected": 0.1328420639038086, + "step": 2174 + }, + { + "epoch": 0.3363618789870481, + "grad_norm": 7.171331882476807, + "learning_rate": 4.932695612326727e-06, + "logits/chosen": 12.84779167175293, + "logits/rejected": 10.251667022705078, + "logps/chosen": -303.6091613769531, + "logps/rejected": -233.2792510986328, + "loss": 0.8089, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.06807422637939453, + "rewards/margins": -0.10673590749502182, + "rewards/rejected": 0.17481011152267456, + "step": 2175 + }, + { + "epoch": 0.3365165281268123, + "grad_norm": 7.200506210327148, + "learning_rate": 4.9324092106770535e-06, + "logits/chosen": 10.479219436645508, + "logits/rejected": 13.919760704040527, + "logps/chosen": -317.1406555175781, + "logps/rejected": -386.9718933105469, + "loss": 0.8333, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.18851739168167114, + "rewards/margins": -0.15954551100730896, + "rewards/rejected": 0.3480629026889801, + "step": 2176 + }, + { + "epoch": 0.33667117726657647, + "grad_norm": 4.878973960876465, + "learning_rate": 4.93212280902738e-06, + "logits/chosen": 7.161655426025391, + "logits/rejected": 11.909017562866211, + "logps/chosen": -175.854736328125, + "logps/rejected": -215.295166015625, + "loss": 0.6436, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.0897265076637268, + "rewards/margins": 0.13689425587654114, + "rewards/rejected": -0.22662077844142914, + "step": 2177 + }, + { + "epoch": 0.3368258264063406, + "grad_norm": 3.977186918258667, + "learning_rate": 4.931836407377707e-06, + "logits/chosen": 12.02243423461914, + "logits/rejected": 7.770849227905273, + "logps/chosen": -293.2452392578125, + "logps/rejected": -252.00106811523438, + "loss": 0.5136, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.3526585102081299, + "rewards/margins": 0.46740198135375977, + "rewards/rejected": -0.11474344879388809, + "step": 2178 + }, + { + "epoch": 0.3369804755461048, + "grad_norm": 6.393489360809326, + "learning_rate": 4.9315500057280334e-06, + "logits/chosen": 10.56814193725586, + "logits/rejected": 9.51702880859375, + "logps/chosen": -194.17556762695312, + "logps/rejected": -158.67721557617188, + "loss": 0.673, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.08892497420310974, + "rewards/margins": 0.10081363469362259, + "rewards/rejected": -0.18973861634731293, + "step": 2179 + }, + { + "epoch": 0.3371351246858689, + "grad_norm": 5.057383060455322, + "learning_rate": 4.93126360407836e-06, + "logits/chosen": 15.431549072265625, + "logits/rejected": 8.38217830657959, + "logps/chosen": -275.71258544921875, + "logps/rejected": -220.55601501464844, + "loss": 0.5836, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.05329176038503647, + "rewards/margins": 0.2801361083984375, + "rewards/rejected": -0.22684431076049805, + "step": 2180 + }, + { + "epoch": 0.3372897738256331, + "grad_norm": 4.324843883514404, + "learning_rate": 4.930977202428686e-06, + "logits/chosen": 10.693016052246094, + "logits/rejected": 7.600887775421143, + "logps/chosen": -344.5273132324219, + "logps/rejected": -274.1141357421875, + "loss": 0.5364, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.16971053183078766, + "rewards/margins": 0.38878917694091797, + "rewards/rejected": -0.2190786600112915, + "step": 2181 + }, + { + "epoch": 0.33744442296539723, + "grad_norm": 7.040586471557617, + "learning_rate": 4.9306908007790125e-06, + "logits/chosen": 12.739887237548828, + "logits/rejected": 11.451869010925293, + "logps/chosen": -263.77716064453125, + "logps/rejected": -203.22349548339844, + "loss": 0.6895, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.3552180230617523, + "rewards/margins": 0.11035862565040588, + "rewards/rejected": -0.4655766487121582, + "step": 2182 + }, + { + "epoch": 0.33759907210516144, + "grad_norm": 6.448904037475586, + "learning_rate": 4.930404399129339e-06, + "logits/chosen": 10.3959379196167, + "logits/rejected": 7.854723930358887, + "logps/chosen": -364.96221923828125, + "logps/rejected": -341.5102844238281, + "loss": 0.5505, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.5349284410476685, + "rewards/margins": 0.3714437186717987, + "rewards/rejected": -0.9063721895217896, + "step": 2183 + }, + { + "epoch": 0.3377537212449256, + "grad_norm": 5.008645534515381, + "learning_rate": 4.930117997479666e-06, + "logits/chosen": 10.696040153503418, + "logits/rejected": 10.032888412475586, + "logps/chosen": -176.92333984375, + "logps/rejected": -195.1748046875, + "loss": 0.7449, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.02067551203072071, + "rewards/margins": -0.03131965547800064, + "rewards/rejected": 0.051995180547237396, + "step": 2184 + }, + { + "epoch": 0.33790837038468974, + "grad_norm": 5.502933502197266, + "learning_rate": 4.9298315958299925e-06, + "logits/chosen": 8.644451141357422, + "logits/rejected": 7.277812957763672, + "logps/chosen": -224.171630859375, + "logps/rejected": -212.43795776367188, + "loss": 0.7228, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.11053973436355591, + "rewards/margins": -0.032625388354063034, + "rewards/rejected": 0.14316511154174805, + "step": 2185 + }, + { + "epoch": 0.3380630195244539, + "grad_norm": 5.761327743530273, + "learning_rate": 4.929545194180319e-06, + "logits/chosen": 11.314397811889648, + "logits/rejected": 6.3425140380859375, + "logps/chosen": -294.193603515625, + "logps/rejected": -187.38900756835938, + "loss": 0.7198, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.20676729083061218, + "rewards/margins": 0.06161829084157944, + "rewards/rejected": 0.14514902234077454, + "step": 2186 + }, + { + "epoch": 0.33821766866421804, + "grad_norm": 5.611957550048828, + "learning_rate": 4.929258792530646e-06, + "logits/chosen": 7.5964579582214355, + "logits/rejected": 12.299043655395508, + "logps/chosen": -167.5782012939453, + "logps/rejected": -275.7806396484375, + "loss": 0.7945, + "rewards/accuracies": 0.125, + "rewards/chosen": 0.03267532214522362, + "rewards/margins": -0.1259826272726059, + "rewards/rejected": 0.158657968044281, + "step": 2187 + }, + { + "epoch": 0.3383723178039822, + "grad_norm": 4.5019426345825195, + "learning_rate": 4.928972390880972e-06, + "logits/chosen": 9.627721786499023, + "logits/rejected": 10.180431365966797, + "logps/chosen": -216.41009521484375, + "logps/rejected": -252.80288696289062, + "loss": 0.4925, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.07169594615697861, + "rewards/margins": 0.5863649845123291, + "rewards/rejected": -0.5146690607070923, + "step": 2188 + }, + { + "epoch": 0.3385269669437464, + "grad_norm": 5.478781700134277, + "learning_rate": 4.928685989231298e-06, + "logits/chosen": 13.346943855285645, + "logits/rejected": 8.802850723266602, + "logps/chosen": -250.389892578125, + "logps/rejected": -213.4042510986328, + "loss": 0.6149, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.08631086349487305, + "rewards/margins": 0.3143469989299774, + "rewards/rejected": -0.4006578326225281, + "step": 2189 + }, + { + "epoch": 0.33868161608351055, + "grad_norm": 8.719413757324219, + "learning_rate": 4.928399587581625e-06, + "logits/chosen": 14.230716705322266, + "logits/rejected": 10.871758460998535, + "logps/chosen": -336.6778259277344, + "logps/rejected": -285.1334228515625, + "loss": 0.744, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.07175198197364807, + "rewards/margins": -0.08417779952287674, + "rewards/rejected": 0.1559297740459442, + "step": 2190 + }, + { + "epoch": 0.3388362652232747, + "grad_norm": 5.874048709869385, + "learning_rate": 4.9281131859319516e-06, + "logits/chosen": 15.300064086914062, + "logits/rejected": 15.285356521606445, + "logps/chosen": -464.52099609375, + "logps/rejected": -439.29705810546875, + "loss": 0.5821, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.15989167988300323, + "rewards/margins": 0.3341308832168579, + "rewards/rejected": -0.1742391735315323, + "step": 2191 + }, + { + "epoch": 0.33899091436303885, + "grad_norm": 6.36256742477417, + "learning_rate": 4.927826784282278e-06, + "logits/chosen": 8.372645378112793, + "logits/rejected": 6.759990692138672, + "logps/chosen": -180.9885711669922, + "logps/rejected": -180.5035400390625, + "loss": 0.9234, + "rewards/accuracies": 0.25, + "rewards/chosen": -0.34161269664764404, + "rewards/margins": -0.36969423294067383, + "rewards/rejected": 0.028081543743610382, + "step": 2192 + }, + { + "epoch": 0.339145563502803, + "grad_norm": 4.614841461181641, + "learning_rate": 4.927540382632605e-06, + "logits/chosen": 10.616250991821289, + "logits/rejected": 3.9560906887054443, + "logps/chosen": -281.41162109375, + "logps/rejected": -225.94224548339844, + "loss": 0.5956, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.007724002003669739, + "rewards/margins": 0.2634137272834778, + "rewards/rejected": -0.25568974018096924, + "step": 2193 + }, + { + "epoch": 0.33930021264256716, + "grad_norm": 5.53109073638916, + "learning_rate": 4.927253980982931e-06, + "logits/chosen": 9.959586143493652, + "logits/rejected": 9.219850540161133, + "logps/chosen": -254.1416473388672, + "logps/rejected": -333.04266357421875, + "loss": 0.6569, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.01833067089319229, + "rewards/margins": 0.1865755319595337, + "rewards/rejected": -0.1682448387145996, + "step": 2194 + }, + { + "epoch": 0.33945486178233136, + "grad_norm": 3.877243757247925, + "learning_rate": 4.926967579333257e-06, + "logits/chosen": 13.282426834106445, + "logits/rejected": 3.8752942085266113, + "logps/chosen": -415.6174011230469, + "logps/rejected": -287.8602600097656, + "loss": 0.4, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.4146263003349304, + "rewards/margins": 0.9070665836334229, + "rewards/rejected": -0.4924403131008148, + "step": 2195 + }, + { + "epoch": 0.3396095109220955, + "grad_norm": 8.95366096496582, + "learning_rate": 4.926681177683584e-06, + "logits/chosen": 8.199341773986816, + "logits/rejected": 6.678696632385254, + "logps/chosen": -372.427001953125, + "logps/rejected": -428.2519836425781, + "loss": 0.8344, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.3477782607078552, + "rewards/margins": -0.14664092659950256, + "rewards/rejected": -0.20113736391067505, + "step": 2196 + }, + { + "epoch": 0.33976416006185967, + "grad_norm": 4.847280979156494, + "learning_rate": 4.926394776033911e-06, + "logits/chosen": 9.063350677490234, + "logits/rejected": 8.378971099853516, + "logps/chosen": -245.4602508544922, + "logps/rejected": -283.3411865234375, + "loss": 0.4844, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.08104472607374191, + "rewards/margins": 0.5104107856750488, + "rewards/rejected": -0.5914554595947266, + "step": 2197 + }, + { + "epoch": 0.3399188092016238, + "grad_norm": 4.669248104095459, + "learning_rate": 4.926108374384237e-06, + "logits/chosen": 9.406679153442383, + "logits/rejected": 11.379308700561523, + "logps/chosen": -363.9560546875, + "logps/rejected": -322.1780090332031, + "loss": 0.5854, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.01684923656284809, + "rewards/margins": 0.2974766790866852, + "rewards/rejected": -0.28062745928764343, + "step": 2198 + }, + { + "epoch": 0.34007345834138797, + "grad_norm": 4.05111837387085, + "learning_rate": 4.925821972734564e-06, + "logits/chosen": 6.986756801605225, + "logits/rejected": 2.258618116378784, + "logps/chosen": -189.69117736816406, + "logps/rejected": -121.53974914550781, + "loss": 0.6285, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.07507272064685822, + "rewards/margins": 0.18000581860542297, + "rewards/rejected": -0.10493309795856476, + "step": 2199 + }, + { + "epoch": 0.3402281074811521, + "grad_norm": 5.7093329429626465, + "learning_rate": 4.92553557108489e-06, + "logits/chosen": 8.678044319152832, + "logits/rejected": 6.821519374847412, + "logps/chosen": -267.0439453125, + "logps/rejected": -271.0375671386719, + "loss": 0.5964, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.015613652765750885, + "rewards/margins": 0.2787485718727112, + "rewards/rejected": -0.29436224699020386, + "step": 2200 + }, + { + "epoch": 0.34038275662091627, + "grad_norm": 5.728244304656982, + "learning_rate": 4.925249169435216e-06, + "logits/chosen": 7.073269844055176, + "logits/rejected": 6.776679992675781, + "logps/chosen": -295.5339050292969, + "logps/rejected": -302.1988525390625, + "loss": 0.6306, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.1701340675354004, + "rewards/margins": 0.1837719976902008, + "rewards/rejected": -0.3539060652256012, + "step": 2201 + }, + { + "epoch": 0.3405374057606805, + "grad_norm": 4.501295566558838, + "learning_rate": 4.924962767785543e-06, + "logits/chosen": 7.147919654846191, + "logits/rejected": 8.420303344726562, + "logps/chosen": -247.03421020507812, + "logps/rejected": -244.56967163085938, + "loss": 0.6329, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.1667885184288025, + "rewards/margins": 0.1886894255876541, + "rewards/rejected": -0.3554779589176178, + "step": 2202 + }, + { + "epoch": 0.34069205490044463, + "grad_norm": 9.208796501159668, + "learning_rate": 4.92467636613587e-06, + "logits/chosen": 10.132715225219727, + "logits/rejected": 13.70450210571289, + "logps/chosen": -383.57269287109375, + "logps/rejected": -455.46966552734375, + "loss": 0.8123, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.004334352910518646, + "rewards/margins": -0.1488373577594757, + "rewards/rejected": 0.14450302720069885, + "step": 2203 + }, + { + "epoch": 0.3408467040402088, + "grad_norm": 6.116032600402832, + "learning_rate": 4.9243899644861955e-06, + "logits/chosen": 8.444271087646484, + "logits/rejected": 8.459249496459961, + "logps/chosen": -304.50775146484375, + "logps/rejected": -287.3347473144531, + "loss": 0.6582, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.13697025179862976, + "rewards/margins": 0.2546781599521637, + "rewards/rejected": -0.11770791560411453, + "step": 2204 + }, + { + "epoch": 0.34100135317997293, + "grad_norm": 5.337936878204346, + "learning_rate": 4.924103562836522e-06, + "logits/chosen": 13.895332336425781, + "logits/rejected": 11.867212295532227, + "logps/chosen": -291.1551513671875, + "logps/rejected": -318.891845703125, + "loss": 0.5838, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.05068197846412659, + "rewards/margins": 0.5032179951667786, + "rewards/rejected": -0.5539000034332275, + "step": 2205 + }, + { + "epoch": 0.3411560023197371, + "grad_norm": 6.2754807472229, + "learning_rate": 4.923817161186849e-06, + "logits/chosen": 9.783641815185547, + "logits/rejected": 3.7100839614868164, + "logps/chosen": -288.7557373046875, + "logps/rejected": -171.37759399414062, + "loss": 0.7735, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.10352823138237, + "rewards/margins": 0.23346653580665588, + "rewards/rejected": -0.3369947373867035, + "step": 2206 + }, + { + "epoch": 0.34131065145950124, + "grad_norm": 5.5735554695129395, + "learning_rate": 4.923530759537175e-06, + "logits/chosen": 4.470004081726074, + "logits/rejected": 10.329557418823242, + "logps/chosen": -280.0700988769531, + "logps/rejected": -381.5083923339844, + "loss": 0.666, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.01585289090871811, + "rewards/margins": 0.15275263786315918, + "rewards/rejected": -0.1686055213212967, + "step": 2207 + }, + { + "epoch": 0.34146530059926544, + "grad_norm": 5.168330669403076, + "learning_rate": 4.923244357887502e-06, + "logits/chosen": 6.794681072235107, + "logits/rejected": 2.8319389820098877, + "logps/chosen": -300.73040771484375, + "logps/rejected": -267.44580078125, + "loss": 0.6125, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.12931528687477112, + "rewards/margins": 0.24279488623142242, + "rewards/rejected": -0.1134796068072319, + "step": 2208 + }, + { + "epoch": 0.3416199497390296, + "grad_norm": 19.52543830871582, + "learning_rate": 4.922957956237828e-06, + "logits/chosen": 11.238948822021484, + "logits/rejected": 8.896709442138672, + "logps/chosen": -376.65240478515625, + "logps/rejected": -315.87274169921875, + "loss": 0.7895, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.013472366146743298, + "rewards/margins": -0.12151608616113663, + "rewards/rejected": 0.10804371535778046, + "step": 2209 + }, + { + "epoch": 0.34177459887879375, + "grad_norm": 5.254403591156006, + "learning_rate": 4.9226715545881545e-06, + "logits/chosen": 8.904789924621582, + "logits/rejected": 6.793849468231201, + "logps/chosen": -289.3121032714844, + "logps/rejected": -208.19049072265625, + "loss": 0.75, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.07712142169475555, + "rewards/margins": -0.037978239357471466, + "rewards/rejected": -0.03914318233728409, + "step": 2210 + }, + { + "epoch": 0.3419292480185579, + "grad_norm": 4.339156150817871, + "learning_rate": 4.922385152938481e-06, + "logits/chosen": 6.7839436531066895, + "logits/rejected": 1.5634078979492188, + "logps/chosen": -257.8121337890625, + "logps/rejected": -244.70849609375, + "loss": 0.429, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.10001610964536667, + "rewards/margins": 0.7864616513252258, + "rewards/rejected": -0.6864455938339233, + "step": 2211 + }, + { + "epoch": 0.34208389715832205, + "grad_norm": 8.168787002563477, + "learning_rate": 4.922098751288808e-06, + "logits/chosen": 6.811740875244141, + "logits/rejected": 3.902513027191162, + "logps/chosen": -348.7326354980469, + "logps/rejected": -273.1086120605469, + "loss": 0.8642, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.2507321536540985, + "rewards/margins": -0.21108895540237427, + "rewards/rejected": -0.03964319825172424, + "step": 2212 + }, + { + "epoch": 0.3422385462980862, + "grad_norm": 5.378510475158691, + "learning_rate": 4.9218123496391345e-06, + "logits/chosen": 8.771527290344238, + "logits/rejected": 10.911977767944336, + "logps/chosen": -254.5963592529297, + "logps/rejected": -284.21697998046875, + "loss": 0.5783, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.008564659394323826, + "rewards/margins": 0.3100131154060364, + "rewards/rejected": -0.31857776641845703, + "step": 2213 + }, + { + "epoch": 0.34239319543785035, + "grad_norm": 5.584686756134033, + "learning_rate": 4.92152594798946e-06, + "logits/chosen": 8.772775650024414, + "logits/rejected": 2.0024702548980713, + "logps/chosen": -287.03704833984375, + "logps/rejected": -234.1224365234375, + "loss": 0.6686, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.25973814725875854, + "rewards/margins": 0.09461994469165802, + "rewards/rejected": -0.35435810685157776, + "step": 2214 + }, + { + "epoch": 0.34254784457761456, + "grad_norm": 6.358705997467041, + "learning_rate": 4.921239546339787e-06, + "logits/chosen": 13.620830535888672, + "logits/rejected": 9.011594772338867, + "logps/chosen": -362.0309143066406, + "logps/rejected": -261.93896484375, + "loss": 0.7534, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.0829969272017479, + "rewards/margins": 0.05566546320915222, + "rewards/rejected": 0.027331486344337463, + "step": 2215 + }, + { + "epoch": 0.3427024937173787, + "grad_norm": 9.422063827514648, + "learning_rate": 4.920953144690114e-06, + "logits/chosen": 12.300759315490723, + "logits/rejected": 9.482490539550781, + "logps/chosen": -384.3802490234375, + "logps/rejected": -325.10321044921875, + "loss": 0.5804, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.015626903623342514, + "rewards/margins": 0.28644314408302307, + "rewards/rejected": -0.3020700514316559, + "step": 2216 + }, + { + "epoch": 0.34285714285714286, + "grad_norm": 5.594699859619141, + "learning_rate": 4.92066674304044e-06, + "logits/chosen": 13.004911422729492, + "logits/rejected": 11.286243438720703, + "logps/chosen": -273.7823791503906, + "logps/rejected": -270.719482421875, + "loss": 0.7184, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.0906803160905838, + "rewards/margins": 0.11050395667552948, + "rewards/rejected": -0.2011842578649521, + "step": 2217 + }, + { + "epoch": 0.343011791996907, + "grad_norm": 5.89525032043457, + "learning_rate": 4.920380341390767e-06, + "logits/chosen": 11.978071212768555, + "logits/rejected": 3.3232693672180176, + "logps/chosen": -273.1345520019531, + "logps/rejected": -219.61477661132812, + "loss": 0.697, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.17827454209327698, + "rewards/margins": 0.06826652586460114, + "rewards/rejected": -0.2465410828590393, + "step": 2218 + }, + { + "epoch": 0.34316644113667116, + "grad_norm": 7.048234462738037, + "learning_rate": 4.9200939397410935e-06, + "logits/chosen": 7.120995998382568, + "logits/rejected": 1.8566869497299194, + "logps/chosen": -237.83474731445312, + "logps/rejected": -157.23245239257812, + "loss": 0.686, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.1351737082004547, + "rewards/margins": 0.06153935194015503, + "rewards/rejected": -0.19671306014060974, + "step": 2219 + }, + { + "epoch": 0.3433210902764353, + "grad_norm": 5.469114303588867, + "learning_rate": 4.919807538091419e-06, + "logits/chosen": 12.084733963012695, + "logits/rejected": 5.589107990264893, + "logps/chosen": -326.6589050292969, + "logps/rejected": -289.4862365722656, + "loss": 0.5376, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.03627218306064606, + "rewards/margins": 0.37036412954330444, + "rewards/rejected": -0.4066363275051117, + "step": 2220 + }, + { + "epoch": 0.3434757394161995, + "grad_norm": 11.814653396606445, + "learning_rate": 4.919521136441746e-06, + "logits/chosen": 14.363693237304688, + "logits/rejected": 8.811527252197266, + "logps/chosen": -452.9128723144531, + "logps/rejected": -419.9919738769531, + "loss": 0.5306, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.24756795167922974, + "rewards/margins": 0.41560888290405273, + "rewards/rejected": -0.168040931224823, + "step": 2221 + }, + { + "epoch": 0.3436303885559637, + "grad_norm": 25.004858016967773, + "learning_rate": 4.919234734792073e-06, + "logits/chosen": 7.665427207946777, + "logits/rejected": 7.383940696716309, + "logps/chosen": -231.5487518310547, + "logps/rejected": -214.76895141601562, + "loss": 0.6867, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.0779341608285904, + "rewards/margins": 0.05487308278679848, + "rewards/rejected": 0.023061085492372513, + "step": 2222 + }, + { + "epoch": 0.3437850376957278, + "grad_norm": 9.273181915283203, + "learning_rate": 4.918948333142399e-06, + "logits/chosen": 9.621376037597656, + "logits/rejected": 6.972739219665527, + "logps/chosen": -274.7452087402344, + "logps/rejected": -231.89039611816406, + "loss": 0.7466, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.07981610298156738, + "rewards/margins": -0.04024926945567131, + "rewards/rejected": -0.03956683352589607, + "step": 2223 + }, + { + "epoch": 0.343939686835492, + "grad_norm": 4.530200004577637, + "learning_rate": 4.918661931492726e-06, + "logits/chosen": 11.825055122375488, + "logits/rejected": 7.726041793823242, + "logps/chosen": -233.1574249267578, + "logps/rejected": -174.32904052734375, + "loss": 0.6889, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.11028782278299332, + "rewards/margins": 0.04926891252398491, + "rewards/rejected": -0.15955673158168793, + "step": 2224 + }, + { + "epoch": 0.3440943359752561, + "grad_norm": 5.863092422485352, + "learning_rate": 4.918375529843053e-06, + "logits/chosen": 11.903350830078125, + "logits/rejected": 5.086323261260986, + "logps/chosen": -315.8117370605469, + "logps/rejected": -285.3423767089844, + "loss": 0.6754, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.3649864196777344, + "rewards/margins": 0.11556059122085571, + "rewards/rejected": -0.4805470108985901, + "step": 2225 + }, + { + "epoch": 0.3442489851150203, + "grad_norm": 4.307432651519775, + "learning_rate": 4.918089128193379e-06, + "logits/chosen": 8.137996673583984, + "logits/rejected": 4.582399845123291, + "logps/chosen": -192.79525756835938, + "logps/rejected": -139.80307006835938, + "loss": 0.6747, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.1812305897474289, + "rewards/margins": 0.06533050537109375, + "rewards/rejected": -0.24656111001968384, + "step": 2226 + }, + { + "epoch": 0.3444036342547845, + "grad_norm": 5.5591912269592285, + "learning_rate": 4.917802726543705e-06, + "logits/chosen": 11.140790939331055, + "logits/rejected": 11.527606964111328, + "logps/chosen": -274.5819396972656, + "logps/rejected": -245.5169219970703, + "loss": 0.5775, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.013919852674007416, + "rewards/margins": 0.40499410033226013, + "rewards/rejected": -0.41891396045684814, + "step": 2227 + }, + { + "epoch": 0.34455828339454864, + "grad_norm": 4.2306318283081055, + "learning_rate": 4.917516324894032e-06, + "logits/chosen": 5.727298259735107, + "logits/rejected": 4.192427635192871, + "logps/chosen": -219.2635498046875, + "logps/rejected": -179.5087127685547, + "loss": 0.5566, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.06968345493078232, + "rewards/margins": 0.3410758972167969, + "rewards/rejected": -0.27139243483543396, + "step": 2228 + }, + { + "epoch": 0.3447129325343128, + "grad_norm": 4.568408966064453, + "learning_rate": 4.917229923244358e-06, + "logits/chosen": 8.758159637451172, + "logits/rejected": 4.163976192474365, + "logps/chosen": -338.8127136230469, + "logps/rejected": -240.6302947998047, + "loss": 0.6097, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.06348161399364471, + "rewards/margins": 0.32506924867630005, + "rewards/rejected": -0.38855087757110596, + "step": 2229 + }, + { + "epoch": 0.34486758167407694, + "grad_norm": 5.886579990386963, + "learning_rate": 4.916943521594685e-06, + "logits/chosen": 6.055108547210693, + "logits/rejected": 6.719015121459961, + "logps/chosen": -235.1964874267578, + "logps/rejected": -243.42535400390625, + "loss": 0.5802, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.09445696324110031, + "rewards/margins": 0.28427010774612427, + "rewards/rejected": -0.3787270486354828, + "step": 2230 + }, + { + "epoch": 0.3450222308138411, + "grad_norm": 76.57919311523438, + "learning_rate": 4.916657119945012e-06, + "logits/chosen": 6.070754528045654, + "logits/rejected": 4.538302421569824, + "logps/chosen": -206.94229125976562, + "logps/rejected": -160.03150939941406, + "loss": 0.6597, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.2322043478488922, + "rewards/margins": 0.10275064408779144, + "rewards/rejected": -0.33495500683784485, + "step": 2231 + }, + { + "epoch": 0.34517687995360524, + "grad_norm": 5.143581867218018, + "learning_rate": 4.916370718295338e-06, + "logits/chosen": 10.92449951171875, + "logits/rejected": 6.929837226867676, + "logps/chosen": -268.9730224609375, + "logps/rejected": -217.0604248046875, + "loss": 0.6188, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.10262413322925568, + "rewards/margins": 0.2187485694885254, + "rewards/rejected": -0.32137271761894226, + "step": 2232 + }, + { + "epoch": 0.3453315290933694, + "grad_norm": 4.1480207443237305, + "learning_rate": 4.916084316645665e-06, + "logits/chosen": 14.199913024902344, + "logits/rejected": 6.951969623565674, + "logps/chosen": -331.29266357421875, + "logps/rejected": -209.20079040527344, + "loss": 0.5474, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.05355348438024521, + "rewards/margins": 0.39276790618896484, + "rewards/rejected": -0.44632136821746826, + "step": 2233 + }, + { + "epoch": 0.3454861782331336, + "grad_norm": 4.425983428955078, + "learning_rate": 4.915797914995991e-06, + "logits/chosen": 7.606254577636719, + "logits/rejected": 2.658592939376831, + "logps/chosen": -302.90618896484375, + "logps/rejected": -203.21453857421875, + "loss": 0.5824, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.08009238541126251, + "rewards/margins": 0.3808273673057556, + "rewards/rejected": -0.46091973781585693, + "step": 2234 + }, + { + "epoch": 0.34564082737289775, + "grad_norm": 6.441736221313477, + "learning_rate": 4.915511513346317e-06, + "logits/chosen": 8.51942253112793, + "logits/rejected": 9.007296562194824, + "logps/chosen": -459.0113830566406, + "logps/rejected": -360.12518310546875, + "loss": 0.6071, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.048676781356334686, + "rewards/margins": 0.34012749791145325, + "rewards/rejected": -0.2914506793022156, + "step": 2235 + }, + { + "epoch": 0.3457954765126619, + "grad_norm": 5.1384406089782715, + "learning_rate": 4.915225111696644e-06, + "logits/chosen": 4.124293327331543, + "logits/rejected": 4.67480993270874, + "logps/chosen": -317.9867248535156, + "logps/rejected": -262.2640380859375, + "loss": 0.6127, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.1634017527103424, + "rewards/margins": 0.25820791721343994, + "rewards/rejected": -0.09480614960193634, + "step": 2236 + }, + { + "epoch": 0.34595012565242605, + "grad_norm": 5.26631498336792, + "learning_rate": 4.914938710046971e-06, + "logits/chosen": 6.151358127593994, + "logits/rejected": 6.918093681335449, + "logps/chosen": -232.6698760986328, + "logps/rejected": -234.5141143798828, + "loss": 0.6538, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.15083113312721252, + "rewards/margins": 0.1375989317893982, + "rewards/rejected": -0.2884300649166107, + "step": 2237 + }, + { + "epoch": 0.3461047747921902, + "grad_norm": 5.205026626586914, + "learning_rate": 4.9146523083972965e-06, + "logits/chosen": 10.508944511413574, + "logits/rejected": 9.416999816894531, + "logps/chosen": -276.3905334472656, + "logps/rejected": -253.36036682128906, + "loss": 0.6187, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.1749594658613205, + "rewards/margins": 0.273596853017807, + "rewards/rejected": -0.09863739460706711, + "step": 2238 + }, + { + "epoch": 0.34625942393195436, + "grad_norm": 5.7534003257751465, + "learning_rate": 4.914365906747623e-06, + "logits/chosen": 11.587326049804688, + "logits/rejected": 10.811962127685547, + "logps/chosen": -147.7239990234375, + "logps/rejected": -181.17127990722656, + "loss": 0.7236, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.3307652473449707, + "rewards/margins": 0.07926079630851746, + "rewards/rejected": -0.41002607345581055, + "step": 2239 + }, + { + "epoch": 0.34641407307171856, + "grad_norm": 3.9810519218444824, + "learning_rate": 4.91407950509795e-06, + "logits/chosen": 12.147308349609375, + "logits/rejected": 11.696866035461426, + "logps/chosen": -157.50863647460938, + "logps/rejected": -172.9642791748047, + "loss": 0.6447, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.23018760979175568, + "rewards/margins": 0.27839189767837524, + "rewards/rejected": -0.5085794925689697, + "step": 2240 + }, + { + "epoch": 0.3465687222114827, + "grad_norm": 6.633463382720947, + "learning_rate": 4.9137931034482765e-06, + "logits/chosen": 9.445849418640137, + "logits/rejected": -0.09200930595397949, + "logps/chosen": -380.5946350097656, + "logps/rejected": -251.05003356933594, + "loss": 0.7112, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.2328595221042633, + "rewards/margins": 0.06577625870704651, + "rewards/rejected": -0.2986357808113098, + "step": 2241 + }, + { + "epoch": 0.34672337135124687, + "grad_norm": 5.461822032928467, + "learning_rate": 4.913506701798602e-06, + "logits/chosen": 9.661162376403809, + "logits/rejected": 9.147989273071289, + "logps/chosen": -206.82998657226562, + "logps/rejected": -261.5982360839844, + "loss": 0.7271, + "rewards/accuracies": 0.25, + "rewards/chosen": 0.009399129077792168, + "rewards/margins": -0.04206313192844391, + "rewards/rejected": 0.051462262868881226, + "step": 2242 + }, + { + "epoch": 0.346878020491011, + "grad_norm": 5.272211074829102, + "learning_rate": 4.913220300148929e-06, + "logits/chosen": 7.325489521026611, + "logits/rejected": 8.20553970336914, + "logps/chosen": -260.3209228515625, + "logps/rejected": -252.24371337890625, + "loss": 0.6031, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.050969406962394714, + "rewards/margins": 0.32596737146377563, + "rewards/rejected": -0.37693679332733154, + "step": 2243 + }, + { + "epoch": 0.34703266963077517, + "grad_norm": 4.581593990325928, + "learning_rate": 4.9129338984992556e-06, + "logits/chosen": 7.456971168518066, + "logits/rejected": 7.230869770050049, + "logps/chosen": -212.39593505859375, + "logps/rejected": -179.48623657226562, + "loss": 0.6884, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.19984811544418335, + "rewards/margins": 0.06016957014799118, + "rewards/rejected": -0.2600176930427551, + "step": 2244 + }, + { + "epoch": 0.3471873187705393, + "grad_norm": 5.243470668792725, + "learning_rate": 4.912647496849582e-06, + "logits/chosen": 9.719179153442383, + "logits/rejected": 11.93798828125, + "logps/chosen": -278.61029052734375, + "logps/rejected": -315.90972900390625, + "loss": 0.5682, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.3115367293357849, + "rewards/margins": 0.36510318517684937, + "rewards/rejected": -0.05356644466519356, + "step": 2245 + }, + { + "epoch": 0.3473419679103035, + "grad_norm": 5.636280059814453, + "learning_rate": 4.912361095199909e-06, + "logits/chosen": 14.947442054748535, + "logits/rejected": 11.073668479919434, + "logps/chosen": -288.8595275878906, + "logps/rejected": -238.2893829345703, + "loss": 0.6711, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.1511373519897461, + "rewards/margins": 0.12412319332361221, + "rewards/rejected": -0.2752605676651001, + "step": 2246 + }, + { + "epoch": 0.3474966170500677, + "grad_norm": 4.914024829864502, + "learning_rate": 4.912074693550235e-06, + "logits/chosen": 9.521003723144531, + "logits/rejected": 10.034378051757812, + "logps/chosen": -232.0491943359375, + "logps/rejected": -207.06263732910156, + "loss": 0.7122, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.010371115058660507, + "rewards/margins": 0.057499319314956665, + "rewards/rejected": -0.06787042319774628, + "step": 2247 + }, + { + "epoch": 0.34765126618983183, + "grad_norm": 5.5847320556640625, + "learning_rate": 4.911788291900561e-06, + "logits/chosen": 9.698742866516113, + "logits/rejected": 6.02374792098999, + "logps/chosen": -261.2921142578125, + "logps/rejected": -183.23049926757812, + "loss": 0.6925, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.021566370502114296, + "rewards/margins": 0.14653608202934265, + "rewards/rejected": -0.1681024730205536, + "step": 2248 + }, + { + "epoch": 0.347805915329596, + "grad_norm": 5.414330005645752, + "learning_rate": 4.911501890250888e-06, + "logits/chosen": 11.729146003723145, + "logits/rejected": 8.237777709960938, + "logps/chosen": -256.3666687011719, + "logps/rejected": -278.5428466796875, + "loss": 0.8102, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.174337700009346, + "rewards/margins": -0.10303632915019989, + "rewards/rejected": -0.07130137830972672, + "step": 2249 + }, + { + "epoch": 0.34796056446936013, + "grad_norm": 5.685588359832764, + "learning_rate": 4.911215488601215e-06, + "logits/chosen": 7.431952953338623, + "logits/rejected": 8.43886947631836, + "logps/chosen": -270.51116943359375, + "logps/rejected": -275.28558349609375, + "loss": 0.6361, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.13253358006477356, + "rewards/margins": 0.16085144877433777, + "rewards/rejected": -0.02831786870956421, + "step": 2250 + }, + { + "epoch": 0.3481152136091243, + "grad_norm": 29.71629524230957, + "learning_rate": 4.910929086951541e-06, + "logits/chosen": 10.248223304748535, + "logits/rejected": 11.20692253112793, + "logps/chosen": -231.45877075195312, + "logps/rejected": -249.94818115234375, + "loss": 0.6803, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.1668645441532135, + "rewards/margins": 0.14421576261520386, + "rewards/rejected": -0.31108030676841736, + "step": 2251 + }, + { + "epoch": 0.34826986274888844, + "grad_norm": 7.318175315856934, + "learning_rate": 4.910642685301868e-06, + "logits/chosen": 9.091716766357422, + "logits/rejected": 7.413163661956787, + "logps/chosen": -349.4971923828125, + "logps/rejected": -264.2409973144531, + "loss": 0.7543, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.16747255623340607, + "rewards/margins": 0.02091556042432785, + "rewards/rejected": -0.18838812410831451, + "step": 2252 + }, + { + "epoch": 0.34842451188865264, + "grad_norm": 7.042592525482178, + "learning_rate": 4.910356283652194e-06, + "logits/chosen": 8.61669921875, + "logits/rejected": 13.160215377807617, + "logps/chosen": -283.1356506347656, + "logps/rejected": -348.42083740234375, + "loss": 0.8397, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.24656841158866882, + "rewards/margins": -0.16773390769958496, + "rewards/rejected": -0.07883447408676147, + "step": 2253 + }, + { + "epoch": 0.3485791610284168, + "grad_norm": 4.277012348175049, + "learning_rate": 4.91006988200252e-06, + "logits/chosen": 9.668946266174316, + "logits/rejected": 5.8493757247924805, + "logps/chosen": -251.37216186523438, + "logps/rejected": -202.52203369140625, + "loss": 0.606, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.021043196320533752, + "rewards/margins": 0.2766110599040985, + "rewards/rejected": -0.25556787848472595, + "step": 2254 + }, + { + "epoch": 0.34873381016818095, + "grad_norm": 5.632655620574951, + "learning_rate": 4.909783480352847e-06, + "logits/chosen": 3.3474035263061523, + "logits/rejected": 0.006913661956787109, + "logps/chosen": -265.8349914550781, + "logps/rejected": -197.7389678955078, + "loss": 0.6076, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.09871774911880493, + "rewards/margins": 0.3149941563606262, + "rewards/rejected": -0.41371190547943115, + "step": 2255 + }, + { + "epoch": 0.3488884593079451, + "grad_norm": 4.305490493774414, + "learning_rate": 4.909497078703174e-06, + "logits/chosen": 9.300392150878906, + "logits/rejected": 4.30137825012207, + "logps/chosen": -421.07952880859375, + "logps/rejected": -352.4152526855469, + "loss": 0.5095, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.34856438636779785, + "rewards/margins": 0.4709594249725342, + "rewards/rejected": -0.12239501625299454, + "step": 2256 + }, + { + "epoch": 0.34904310844770925, + "grad_norm": 3.666188955307007, + "learning_rate": 4.9092106770535e-06, + "logits/chosen": 13.80720043182373, + "logits/rejected": 9.943268775939941, + "logps/chosen": -293.3656005859375, + "logps/rejected": -204.82345581054688, + "loss": 0.528, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.10769538581371307, + "rewards/margins": 0.4935707449913025, + "rewards/rejected": -0.6012661457061768, + "step": 2257 + }, + { + "epoch": 0.3491977575874734, + "grad_norm": 6.167476654052734, + "learning_rate": 4.908924275403827e-06, + "logits/chosen": 11.450634002685547, + "logits/rejected": 14.47146224975586, + "logps/chosen": -269.498779296875, + "logps/rejected": -457.2481689453125, + "loss": 0.6449, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.1913461685180664, + "rewards/margins": 0.16590233147144318, + "rewards/rejected": -0.3572485148906708, + "step": 2258 + }, + { + "epoch": 0.34935240672723755, + "grad_norm": 5.208954334259033, + "learning_rate": 4.908637873754154e-06, + "logits/chosen": 4.9774370193481445, + "logits/rejected": 5.742297649383545, + "logps/chosen": -249.7729949951172, + "logps/rejected": -218.14566040039062, + "loss": 0.7735, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.28843870759010315, + "rewards/margins": -0.06605186313390732, + "rewards/rejected": -0.22238683700561523, + "step": 2259 + }, + { + "epoch": 0.34950705586700176, + "grad_norm": 7.198918342590332, + "learning_rate": 4.9083514721044794e-06, + "logits/chosen": 5.99508810043335, + "logits/rejected": 7.031045913696289, + "logps/chosen": -267.08453369140625, + "logps/rejected": -258.5322570800781, + "loss": 0.754, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.14965543150901794, + "rewards/margins": -0.06985634565353394, + "rewards/rejected": -0.07979907840490341, + "step": 2260 + }, + { + "epoch": 0.3496617050067659, + "grad_norm": 6.3062310218811035, + "learning_rate": 4.908065070454806e-06, + "logits/chosen": 9.616310119628906, + "logits/rejected": 4.98814058303833, + "logps/chosen": -346.2275085449219, + "logps/rejected": -287.83349609375, + "loss": 0.7748, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.34294837713241577, + "rewards/margins": -0.04744395613670349, + "rewards/rejected": -0.2955043911933899, + "step": 2261 + }, + { + "epoch": 0.34981635414653006, + "grad_norm": 6.942808151245117, + "learning_rate": 4.907778668805133e-06, + "logits/chosen": 14.936433792114258, + "logits/rejected": 18.18695640563965, + "logps/chosen": -272.3729248046875, + "logps/rejected": -328.1497802734375, + "loss": 0.7993, + "rewards/accuracies": 0.25, + "rewards/chosen": -0.32496678829193115, + "rewards/margins": -0.14285282790660858, + "rewards/rejected": -0.18211393058300018, + "step": 2262 + }, + { + "epoch": 0.3499710032862942, + "grad_norm": 5.081689357757568, + "learning_rate": 4.907492267155459e-06, + "logits/chosen": 8.887516975402832, + "logits/rejected": 1.470055103302002, + "logps/chosen": -240.7576904296875, + "logps/rejected": -160.95785522460938, + "loss": 0.5616, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.012402251362800598, + "rewards/margins": 0.3340449929237366, + "rewards/rejected": -0.3216427266597748, + "step": 2263 + }, + { + "epoch": 0.35012565242605836, + "grad_norm": 6.787292957305908, + "learning_rate": 4.907205865505786e-06, + "logits/chosen": 12.554620742797852, + "logits/rejected": 5.174549102783203, + "logps/chosen": -236.28616333007812, + "logps/rejected": -209.17721557617188, + "loss": 0.8091, + "rewards/accuracies": 0.25, + "rewards/chosen": -0.33284130692481995, + "rewards/margins": -0.141166090965271, + "rewards/rejected": -0.19167517125606537, + "step": 2264 + }, + { + "epoch": 0.3502803015658225, + "grad_norm": 5.783636093139648, + "learning_rate": 4.906919463856113e-06, + "logits/chosen": 7.47484016418457, + "logits/rejected": 12.06732177734375, + "logps/chosen": -332.9858703613281, + "logps/rejected": -283.30963134765625, + "loss": 0.7008, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.2375369518995285, + "rewards/margins": 0.04651055112481117, + "rewards/rejected": -0.2840474843978882, + "step": 2265 + }, + { + "epoch": 0.3504349507055867, + "grad_norm": 5.621984004974365, + "learning_rate": 4.906633062206439e-06, + "logits/chosen": 11.89096450805664, + "logits/rejected": 6.294903755187988, + "logps/chosen": -354.2367858886719, + "logps/rejected": -260.3260498046875, + "loss": 0.6839, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.02276735007762909, + "rewards/margins": 0.07391981780529022, + "rewards/rejected": -0.051152464002370834, + "step": 2266 + }, + { + "epoch": 0.3505895998453509, + "grad_norm": 4.112491607666016, + "learning_rate": 4.906346660556765e-06, + "logits/chosen": 8.1788911819458, + "logits/rejected": 1.4742443561553955, + "logps/chosen": -210.604248046875, + "logps/rejected": -129.72760009765625, + "loss": 0.6717, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.28726643323898315, + "rewards/margins": 0.07727478444576263, + "rewards/rejected": -0.364541232585907, + "step": 2267 + }, + { + "epoch": 0.350744248985115, + "grad_norm": 5.981924057006836, + "learning_rate": 4.906060258907092e-06, + "logits/chosen": 8.20511531829834, + "logits/rejected": 7.404324054718018, + "logps/chosen": -259.75482177734375, + "logps/rejected": -266.87823486328125, + "loss": 0.5223, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.12701082229614258, + "rewards/margins": 0.4615393877029419, + "rewards/rejected": -0.33452853560447693, + "step": 2268 + }, + { + "epoch": 0.3508988981248792, + "grad_norm": 5.859411239624023, + "learning_rate": 4.9057738572574184e-06, + "logits/chosen": 8.135759353637695, + "logits/rejected": 2.4925124645233154, + "logps/chosen": -301.67962646484375, + "logps/rejected": -244.54283142089844, + "loss": 0.651, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.06861220300197601, + "rewards/margins": 0.16191913187503815, + "rewards/rejected": -0.23053131997585297, + "step": 2269 + }, + { + "epoch": 0.35105354726464333, + "grad_norm": 4.8766279220581055, + "learning_rate": 4.905487455607745e-06, + "logits/chosen": 8.630785942077637, + "logits/rejected": 6.301517963409424, + "logps/chosen": -250.0829620361328, + "logps/rejected": -205.16307067871094, + "loss": 0.6342, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.13005010783672333, + "rewards/margins": 0.23674780130386353, + "rewards/rejected": -0.36679789423942566, + "step": 2270 + }, + { + "epoch": 0.3512081964044075, + "grad_norm": 6.100093841552734, + "learning_rate": 4.905201053958072e-06, + "logits/chosen": 10.4290771484375, + "logits/rejected": 5.6397786140441895, + "logps/chosen": -329.4725341796875, + "logps/rejected": -310.0322265625, + "loss": 0.705, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.009967800229787827, + "rewards/margins": 0.027770325541496277, + "rewards/rejected": -0.0377381294965744, + "step": 2271 + }, + { + "epoch": 0.3513628455441717, + "grad_norm": 5.898587226867676, + "learning_rate": 4.9049146523083975e-06, + "logits/chosen": 8.528572082519531, + "logits/rejected": 5.490638732910156, + "logps/chosen": -269.6099853515625, + "logps/rejected": -209.4634246826172, + "loss": 0.602, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.1952863335609436, + "rewards/margins": 0.3484611213207245, + "rewards/rejected": -0.5437474250793457, + "step": 2272 + }, + { + "epoch": 0.35151749468393584, + "grad_norm": 7.538588523864746, + "learning_rate": 4.904628250658724e-06, + "logits/chosen": 13.407453536987305, + "logits/rejected": 11.88882064819336, + "logps/chosen": -409.357421875, + "logps/rejected": -364.2845458984375, + "loss": 0.6862, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.15313872694969177, + "rewards/margins": 0.17631609737873077, + "rewards/rejected": -0.32945480942726135, + "step": 2273 + }, + { + "epoch": 0.3516721438237, + "grad_norm": 5.733290672302246, + "learning_rate": 4.904341849009051e-06, + "logits/chosen": 8.919346809387207, + "logits/rejected": 11.112564086914062, + "logps/chosen": -239.14163208007812, + "logps/rejected": -251.54750061035156, + "loss": 0.7497, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.6215527057647705, + "rewards/margins": -0.05928035080432892, + "rewards/rejected": -0.5622723698616028, + "step": 2274 + }, + { + "epoch": 0.35182679296346414, + "grad_norm": 4.8864216804504395, + "learning_rate": 4.9040554473593775e-06, + "logits/chosen": 13.415364265441895, + "logits/rejected": 12.386824607849121, + "logps/chosen": -370.70404052734375, + "logps/rejected": -351.29815673828125, + "loss": 0.4881, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.12877292931079865, + "rewards/margins": 0.6097846031188965, + "rewards/rejected": -0.48101168870925903, + "step": 2275 + }, + { + "epoch": 0.3519814421032283, + "grad_norm": 5.105952739715576, + "learning_rate": 4.903769045709703e-06, + "logits/chosen": 10.409931182861328, + "logits/rejected": 6.689871788024902, + "logps/chosen": -291.099609375, + "logps/rejected": -251.84365844726562, + "loss": 0.6403, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.15651798248291016, + "rewards/margins": 0.18619213998317719, + "rewards/rejected": -0.34271010756492615, + "step": 2276 + }, + { + "epoch": 0.35213609124299244, + "grad_norm": 8.522160530090332, + "learning_rate": 4.90348264406003e-06, + "logits/chosen": 2.8824234008789062, + "logits/rejected": 4.933001518249512, + "logps/chosen": -381.6719970703125, + "logps/rejected": -445.5632629394531, + "loss": 0.7482, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.1836051046848297, + "rewards/margins": 0.06200967729091644, + "rewards/rejected": -0.24561476707458496, + "step": 2277 + }, + { + "epoch": 0.3522907403827566, + "grad_norm": 5.22220516204834, + "learning_rate": 4.903196242410357e-06, + "logits/chosen": 14.69093132019043, + "logits/rejected": 8.37982177734375, + "logps/chosen": -263.33050537109375, + "logps/rejected": -180.16409301757812, + "loss": 0.7331, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.22532400488853455, + "rewards/margins": -0.01858241856098175, + "rewards/rejected": -0.2067415714263916, + "step": 2278 + }, + { + "epoch": 0.3524453895225208, + "grad_norm": 4.423905849456787, + "learning_rate": 4.902909840760683e-06, + "logits/chosen": 12.967979431152344, + "logits/rejected": 10.532442092895508, + "logps/chosen": -367.1943664550781, + "logps/rejected": -423.98193359375, + "loss": 0.4146, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.0062543898820877075, + "rewards/margins": 0.765472412109375, + "rewards/rejected": -0.7592180371284485, + "step": 2279 + }, + { + "epoch": 0.35260003866228495, + "grad_norm": 5.410378932952881, + "learning_rate": 4.902623439111009e-06, + "logits/chosen": 10.105243682861328, + "logits/rejected": 14.415291786193848, + "logps/chosen": -278.61737060546875, + "logps/rejected": -319.6083984375, + "loss": 0.6428, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.31036606431007385, + "rewards/margins": 0.13710956275463104, + "rewards/rejected": -0.4474756121635437, + "step": 2280 + }, + { + "epoch": 0.3527546878020491, + "grad_norm": 6.157070159912109, + "learning_rate": 4.902337037461336e-06, + "logits/chosen": 10.1044282913208, + "logits/rejected": 10.5338716506958, + "logps/chosen": -369.8498840332031, + "logps/rejected": -346.7928466796875, + "loss": 0.711, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.28167325258255005, + "rewards/margins": 0.026620671153068542, + "rewards/rejected": -0.308293879032135, + "step": 2281 + }, + { + "epoch": 0.35290933694181326, + "grad_norm": 5.407443523406982, + "learning_rate": 4.902050635811662e-06, + "logits/chosen": 13.642023086547852, + "logits/rejected": 6.602418422698975, + "logps/chosen": -405.72283935546875, + "logps/rejected": -314.2531433105469, + "loss": 0.5513, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.10360391438007355, + "rewards/margins": 0.4180288314819336, + "rewards/rejected": -0.521632730960846, + "step": 2282 + }, + { + "epoch": 0.3530639860815774, + "grad_norm": 5.393918037414551, + "learning_rate": 4.901764234161989e-06, + "logits/chosen": 8.057491302490234, + "logits/rejected": 9.188366889953613, + "logps/chosen": -284.2396240234375, + "logps/rejected": -279.5758056640625, + "loss": 0.716, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.39083796739578247, + "rewards/margins": 0.004945278167724609, + "rewards/rejected": -0.3957832455635071, + "step": 2283 + }, + { + "epoch": 0.35321863522134156, + "grad_norm": 6.1068196296691895, + "learning_rate": 4.901477832512316e-06, + "logits/chosen": 7.461799144744873, + "logits/rejected": 8.400287628173828, + "logps/chosen": -166.0146484375, + "logps/rejected": -312.38836669921875, + "loss": 0.8087, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.32630789279937744, + "rewards/margins": -0.14760400354862213, + "rewards/rejected": -0.1787038892507553, + "step": 2284 + }, + { + "epoch": 0.35337328436110577, + "grad_norm": 6.6303391456604, + "learning_rate": 4.901191430862642e-06, + "logits/chosen": 8.566883087158203, + "logits/rejected": 6.3126020431518555, + "logps/chosen": -232.5137939453125, + "logps/rejected": -220.58419799804688, + "loss": 0.8915, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.18723392486572266, + "rewards/margins": -0.20679838955402374, + "rewards/rejected": 0.019564446061849594, + "step": 2285 + }, + { + "epoch": 0.3535279335008699, + "grad_norm": 5.50675106048584, + "learning_rate": 4.900905029212968e-06, + "logits/chosen": 10.489021301269531, + "logits/rejected": 9.962228775024414, + "logps/chosen": -235.81460571289062, + "logps/rejected": -209.7769012451172, + "loss": 0.6718, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.3101990222930908, + "rewards/margins": 0.11363208293914795, + "rewards/rejected": -0.42383110523223877, + "step": 2286 + }, + { + "epoch": 0.35368258264063407, + "grad_norm": 8.460928916931152, + "learning_rate": 4.900618627563295e-06, + "logits/chosen": 5.3383283615112305, + "logits/rejected": 4.417959213256836, + "logps/chosen": -326.1287536621094, + "logps/rejected": -305.16351318359375, + "loss": 0.7241, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.3920285403728485, + "rewards/margins": -0.0179438553750515, + "rewards/rejected": -0.3740846514701843, + "step": 2287 + }, + { + "epoch": 0.3538372317803982, + "grad_norm": 4.480382442474365, + "learning_rate": 4.900332225913621e-06, + "logits/chosen": 6.651538848876953, + "logits/rejected": 8.158883094787598, + "logps/chosen": -377.5992431640625, + "logps/rejected": -313.6252136230469, + "loss": 0.5973, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.03289628028869629, + "rewards/margins": 0.39587706327438354, + "rewards/rejected": -0.42877334356307983, + "step": 2288 + }, + { + "epoch": 0.35399188092016237, + "grad_norm": 3.833733081817627, + "learning_rate": 4.900045824263948e-06, + "logits/chosen": 13.12617015838623, + "logits/rejected": 9.737539291381836, + "logps/chosen": -224.25856018066406, + "logps/rejected": -223.23565673828125, + "loss": 0.5379, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.067721888422966, + "rewards/margins": 0.3821626901626587, + "rewards/rejected": -0.4498845934867859, + "step": 2289 + }, + { + "epoch": 0.3541465300599265, + "grad_norm": 8.063943862915039, + "learning_rate": 4.899759422614275e-06, + "logits/chosen": 8.838645935058594, + "logits/rejected": 8.395843505859375, + "logps/chosen": -322.738037109375, + "logps/rejected": -291.0431213378906, + "loss": 0.8257, + "rewards/accuracies": 0.25, + "rewards/chosen": -0.36284393072128296, + "rewards/margins": -0.09391230344772339, + "rewards/rejected": -0.26893162727355957, + "step": 2290 + }, + { + "epoch": 0.3543011791996907, + "grad_norm": 5.950629234313965, + "learning_rate": 4.899473020964601e-06, + "logits/chosen": 8.83854866027832, + "logits/rejected": 7.512988090515137, + "logps/chosen": -295.1648254394531, + "logps/rejected": -230.810791015625, + "loss": 0.7297, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.4573357105255127, + "rewards/margins": -0.018091723322868347, + "rewards/rejected": -0.43924397230148315, + "step": 2291 + }, + { + "epoch": 0.3544558283394549, + "grad_norm": 5.183650493621826, + "learning_rate": 4.899186619314928e-06, + "logits/chosen": 8.526028633117676, + "logits/rejected": 7.30854606628418, + "logps/chosen": -186.0430450439453, + "logps/rejected": -195.26051330566406, + "loss": 0.6957, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.05327737331390381, + "rewards/margins": 0.03432953357696533, + "rewards/rejected": -0.08760692179203033, + "step": 2292 + }, + { + "epoch": 0.35461047747921903, + "grad_norm": 5.21556282043457, + "learning_rate": 4.898900217665254e-06, + "logits/chosen": 10.062601089477539, + "logits/rejected": 5.812929153442383, + "logps/chosen": -251.52444458007812, + "logps/rejected": -237.75575256347656, + "loss": 0.6474, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.09459376335144043, + "rewards/margins": 0.15834593772888184, + "rewards/rejected": -0.25293970108032227, + "step": 2293 + }, + { + "epoch": 0.3547651266189832, + "grad_norm": 7.025015354156494, + "learning_rate": 4.8986138160155805e-06, + "logits/chosen": 14.059396743774414, + "logits/rejected": 5.465035915374756, + "logps/chosen": -467.369384765625, + "logps/rejected": -299.49078369140625, + "loss": 0.7008, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.033702850341796875, + "rewards/margins": 0.15374523401260376, + "rewards/rejected": -0.12004238367080688, + "step": 2294 + }, + { + "epoch": 0.35491977575874734, + "grad_norm": 6.39870023727417, + "learning_rate": 4.898327414365907e-06, + "logits/chosen": 8.50955581665039, + "logits/rejected": 2.735879898071289, + "logps/chosen": -303.5803527832031, + "logps/rejected": -185.3979949951172, + "loss": 0.5488, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.16922327876091003, + "rewards/margins": 0.3481117784976959, + "rewards/rejected": -0.517335057258606, + "step": 2295 + }, + { + "epoch": 0.3550744248985115, + "grad_norm": 5.158504009246826, + "learning_rate": 4.898041012716234e-06, + "logits/chosen": 11.480414390563965, + "logits/rejected": 9.00924301147461, + "logps/chosen": -316.7840576171875, + "logps/rejected": -314.14239501953125, + "loss": 0.5916, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.03316483274102211, + "rewards/margins": 0.45047634840011597, + "rewards/rejected": -0.41731148958206177, + "step": 2296 + }, + { + "epoch": 0.35522907403827564, + "grad_norm": 6.351102352142334, + "learning_rate": 4.89775461106656e-06, + "logits/chosen": 7.808524131774902, + "logits/rejected": 10.521486282348633, + "logps/chosen": -189.92694091796875, + "logps/rejected": -234.21682739257812, + "loss": 0.6965, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.09540493786334991, + "rewards/margins": 0.0428440198302269, + "rewards/rejected": -0.13824895024299622, + "step": 2297 + }, + { + "epoch": 0.35538372317803985, + "grad_norm": 9.060208320617676, + "learning_rate": 4.897468209416887e-06, + "logits/chosen": 5.821726322174072, + "logits/rejected": 7.238389015197754, + "logps/chosen": -336.39300537109375, + "logps/rejected": -371.8484802246094, + "loss": 0.8943, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.27768027782440186, + "rewards/margins": -0.24758805334568024, + "rewards/rejected": -0.030092239379882812, + "step": 2298 + }, + { + "epoch": 0.355538372317804, + "grad_norm": 4.79667854309082, + "learning_rate": 4.897181807767214e-06, + "logits/chosen": 5.603011608123779, + "logits/rejected": 0.2982349395751953, + "logps/chosen": -256.0111083984375, + "logps/rejected": -255.9495849609375, + "loss": 0.5096, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.21172352135181427, + "rewards/margins": 0.49048200249671936, + "rewards/rejected": -0.7022055387496948, + "step": 2299 + }, + { + "epoch": 0.35569302145756815, + "grad_norm": 7.194302558898926, + "learning_rate": 4.8968954061175395e-06, + "logits/chosen": 9.853011131286621, + "logits/rejected": 10.196281433105469, + "logps/chosen": -308.0816955566406, + "logps/rejected": -375.0216064453125, + "loss": 0.7722, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.28454896807670593, + "rewards/margins": -0.07456383109092712, + "rewards/rejected": -0.2099851667881012, + "step": 2300 + }, + { + "epoch": 0.3558476705973323, + "grad_norm": 8.1284761428833, + "learning_rate": 4.896609004467866e-06, + "logits/chosen": 7.840907096862793, + "logits/rejected": 1.9497251510620117, + "logps/chosen": -343.1737976074219, + "logps/rejected": -227.4717559814453, + "loss": 0.6274, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.11587969213724136, + "rewards/margins": 0.396634578704834, + "rewards/rejected": -0.5125142335891724, + "step": 2301 + }, + { + "epoch": 0.35600231973709645, + "grad_norm": 3.761204481124878, + "learning_rate": 4.896322602818193e-06, + "logits/chosen": 10.910774230957031, + "logits/rejected": 1.5914866924285889, + "logps/chosen": -166.80218505859375, + "logps/rejected": -92.43940734863281, + "loss": 0.6409, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.1651858389377594, + "rewards/margins": 0.17134442925453186, + "rewards/rejected": -0.33653026819229126, + "step": 2302 + }, + { + "epoch": 0.3561569688768606, + "grad_norm": 5.362090587615967, + "learning_rate": 4.8960362011685195e-06, + "logits/chosen": 12.594182014465332, + "logits/rejected": 8.193999290466309, + "logps/chosen": -305.1719970703125, + "logps/rejected": -297.2008056640625, + "loss": 0.5383, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.061117835342884064, + "rewards/margins": 0.36943426728248596, + "rewards/rejected": -0.3083164095878601, + "step": 2303 + }, + { + "epoch": 0.3563116180166248, + "grad_norm": 7.27452278137207, + "learning_rate": 4.895749799518846e-06, + "logits/chosen": 5.877128601074219, + "logits/rejected": 3.7415316104888916, + "logps/chosen": -336.67230224609375, + "logps/rejected": -371.3394775390625, + "loss": 0.7349, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.2061605006456375, + "rewards/margins": 0.18396373093128204, + "rewards/rejected": 0.02219676598906517, + "step": 2304 + }, + { + "epoch": 0.35646626715638896, + "grad_norm": 6.953784942626953, + "learning_rate": 4.895463397869173e-06, + "logits/chosen": 10.48383903503418, + "logits/rejected": 10.955982208251953, + "logps/chosen": -341.8731689453125, + "logps/rejected": -298.1326599121094, + "loss": 0.7131, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.011556625366210938, + "rewards/margins": 0.10991425812244415, + "rewards/rejected": -0.12147089093923569, + "step": 2305 + }, + { + "epoch": 0.3566209162961531, + "grad_norm": 7.077308177947998, + "learning_rate": 4.8951769962194986e-06, + "logits/chosen": 8.61199951171875, + "logits/rejected": 6.04293966293335, + "logps/chosen": -280.35284423828125, + "logps/rejected": -333.212646484375, + "loss": 0.6358, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.14419767260551453, + "rewards/margins": 0.2684920132160187, + "rewards/rejected": -0.4126897156238556, + "step": 2306 + }, + { + "epoch": 0.35677556543591726, + "grad_norm": 6.757407188415527, + "learning_rate": 4.894890594569825e-06, + "logits/chosen": 8.389930725097656, + "logits/rejected": 3.219229221343994, + "logps/chosen": -285.2623596191406, + "logps/rejected": -250.55419921875, + "loss": 0.7, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.33544251322746277, + "rewards/margins": 0.04589072987437248, + "rewards/rejected": -0.38133326172828674, + "step": 2307 + }, + { + "epoch": 0.3569302145756814, + "grad_norm": 21.499324798583984, + "learning_rate": 4.894604192920152e-06, + "logits/chosen": 10.170759201049805, + "logits/rejected": 8.207755088806152, + "logps/chosen": -246.21438598632812, + "logps/rejected": -275.232421875, + "loss": 0.7691, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.08954611420631409, + "rewards/margins": -0.11354275792837143, + "rewards/rejected": 0.023996641859412193, + "step": 2308 + }, + { + "epoch": 0.35708486371544557, + "grad_norm": 3.6801435947418213, + "learning_rate": 4.8943177912704785e-06, + "logits/chosen": 8.054658889770508, + "logits/rejected": 7.391181468963623, + "logps/chosen": -185.26820373535156, + "logps/rejected": -186.87498474121094, + "loss": 0.6266, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.06425218284130096, + "rewards/margins": 0.19823215901851654, + "rewards/rejected": -0.2624843716621399, + "step": 2309 + }, + { + "epoch": 0.3572395128552097, + "grad_norm": 6.398554801940918, + "learning_rate": 4.894031389620804e-06, + "logits/chosen": 0.9814324378967285, + "logits/rejected": 6.214434623718262, + "logps/chosen": -226.17984008789062, + "logps/rejected": -302.9360656738281, + "loss": 0.544, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.29893386363983154, + "rewards/margins": 0.42912763357162476, + "rewards/rejected": -0.7280614376068115, + "step": 2310 + }, + { + "epoch": 0.3573941619949739, + "grad_norm": 7.246306419372559, + "learning_rate": 4.893744987971131e-06, + "logits/chosen": 12.344511985778809, + "logits/rejected": 8.692779541015625, + "logps/chosen": -453.1650695800781, + "logps/rejected": -329.99603271484375, + "loss": 0.6425, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.17128944396972656, + "rewards/margins": 0.2065943032503128, + "rewards/rejected": -0.37788376212120056, + "step": 2311 + }, + { + "epoch": 0.3575488111347381, + "grad_norm": 6.005757808685303, + "learning_rate": 4.893458586321458e-06, + "logits/chosen": 7.452842712402344, + "logits/rejected": 4.272350788116455, + "logps/chosen": -278.6836853027344, + "logps/rejected": -273.89813232421875, + "loss": 0.6309, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.3194414973258972, + "rewards/margins": 0.35720765590667725, + "rewards/rejected": -0.6766491532325745, + "step": 2312 + }, + { + "epoch": 0.3577034602745022, + "grad_norm": 5.901392459869385, + "learning_rate": 4.893172184671784e-06, + "logits/chosen": 11.812231063842773, + "logits/rejected": 13.108213424682617, + "logps/chosen": -379.9096374511719, + "logps/rejected": -374.9116516113281, + "loss": 0.7209, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.18073472380638123, + "rewards/margins": 0.039542943239212036, + "rewards/rejected": -0.22027769684791565, + "step": 2313 + }, + { + "epoch": 0.3578581094142664, + "grad_norm": 5.417755126953125, + "learning_rate": 4.89288578302211e-06, + "logits/chosen": 13.69114875793457, + "logits/rejected": 13.75348949432373, + "logps/chosen": -401.909423828125, + "logps/rejected": -433.63507080078125, + "loss": 0.6856, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.020201489329338074, + "rewards/margins": 0.22761499881744385, + "rewards/rejected": -0.2074134796857834, + "step": 2314 + }, + { + "epoch": 0.35801275855403053, + "grad_norm": 6.451322078704834, + "learning_rate": 4.892599381372437e-06, + "logits/chosen": 9.982239723205566, + "logits/rejected": 5.295004844665527, + "logps/chosen": -264.6368408203125, + "logps/rejected": -252.98638916015625, + "loss": 0.6977, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.5814623832702637, + "rewards/margins": 0.03783878684043884, + "rewards/rejected": -0.6193010807037354, + "step": 2315 + }, + { + "epoch": 0.3581674076937947, + "grad_norm": 7.075787544250488, + "learning_rate": 4.892312979722763e-06, + "logits/chosen": 11.914676666259766, + "logits/rejected": 13.889586448669434, + "logps/chosen": -335.5152587890625, + "logps/rejected": -384.6313171386719, + "loss": 0.7642, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.018859509378671646, + "rewards/margins": 0.03121916949748993, + "rewards/rejected": -0.05007869005203247, + "step": 2316 + }, + { + "epoch": 0.3583220568335589, + "grad_norm": 4.819097995758057, + "learning_rate": 4.89202657807309e-06, + "logits/chosen": 13.529319763183594, + "logits/rejected": 6.044322967529297, + "logps/chosen": -290.4903869628906, + "logps/rejected": -226.93785095214844, + "loss": 0.6716, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.5643563866615295, + "rewards/margins": 0.13221922516822815, + "rewards/rejected": -0.6965756416320801, + "step": 2317 + }, + { + "epoch": 0.35847670597332304, + "grad_norm": 9.079181671142578, + "learning_rate": 4.891740176423417e-06, + "logits/chosen": 11.359947204589844, + "logits/rejected": 4.493671894073486, + "logps/chosen": -481.7691345214844, + "logps/rejected": -331.90264892578125, + "loss": 0.736, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.3059215843677521, + "rewards/margins": -0.00804688036441803, + "rewards/rejected": -0.29787468910217285, + "step": 2318 + }, + { + "epoch": 0.3586313551130872, + "grad_norm": 3.0993034839630127, + "learning_rate": 4.8914537747737425e-06, + "logits/chosen": 12.550003051757812, + "logits/rejected": 4.218654632568359, + "logps/chosen": -308.19268798828125, + "logps/rejected": -203.6925048828125, + "loss": 0.4321, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.3049798011779785, + "rewards/margins": 0.6847801804542542, + "rewards/rejected": -0.37980034947395325, + "step": 2319 + }, + { + "epoch": 0.35878600425285134, + "grad_norm": 5.668067455291748, + "learning_rate": 4.891167373124069e-06, + "logits/chosen": 13.877278327941895, + "logits/rejected": 11.128886222839355, + "logps/chosen": -254.1451416015625, + "logps/rejected": -327.8868713378906, + "loss": 0.603, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.2491917908191681, + "rewards/margins": 0.33042365312576294, + "rewards/rejected": -0.5796154141426086, + "step": 2320 + }, + { + "epoch": 0.3589406533926155, + "grad_norm": 4.336638927459717, + "learning_rate": 4.890880971474396e-06, + "logits/chosen": 8.338726997375488, + "logits/rejected": -1.4895474910736084, + "logps/chosen": -273.78668212890625, + "logps/rejected": -113.2089614868164, + "loss": 0.4632, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.01187887042760849, + "rewards/margins": 0.5928508639335632, + "rewards/rejected": -0.5809720158576965, + "step": 2321 + }, + { + "epoch": 0.35909530253237965, + "grad_norm": 5.571256160736084, + "learning_rate": 4.8905945698247224e-06, + "logits/chosen": 9.629920959472656, + "logits/rejected": 5.384699821472168, + "logps/chosen": -329.11309814453125, + "logps/rejected": -267.02130126953125, + "loss": 0.616, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.0476585328578949, + "rewards/margins": 0.1954512596130371, + "rewards/rejected": -0.243109792470932, + "step": 2322 + }, + { + "epoch": 0.3592499516721438, + "grad_norm": 6.020268440246582, + "learning_rate": 4.890308168175049e-06, + "logits/chosen": 10.552306175231934, + "logits/rejected": 9.562264442443848, + "logps/chosen": -261.8734436035156, + "logps/rejected": -296.9286193847656, + "loss": 0.7023, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.1796172857284546, + "rewards/margins": 0.09834901988506317, + "rewards/rejected": -0.27796632051467896, + "step": 2323 + }, + { + "epoch": 0.359404600811908, + "grad_norm": 6.4720377922058105, + "learning_rate": 4.890021766525376e-06, + "logits/chosen": 12.219856262207031, + "logits/rejected": 9.231483459472656, + "logps/chosen": -243.53668212890625, + "logps/rejected": -217.1230926513672, + "loss": 0.763, + "rewards/accuracies": 0.25, + "rewards/chosen": -0.12192372977733612, + "rewards/margins": -0.11754226684570312, + "rewards/rejected": -0.004381466656923294, + "step": 2324 + }, + { + "epoch": 0.35955924995167216, + "grad_norm": 3.857593297958374, + "learning_rate": 4.889735364875702e-06, + "logits/chosen": 9.698766708374023, + "logits/rejected": 12.826196670532227, + "logps/chosen": -186.67410278320312, + "logps/rejected": -218.66256713867188, + "loss": 0.564, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.1999448835849762, + "rewards/margins": 0.2949702739715576, + "rewards/rejected": -0.4949151873588562, + "step": 2325 + }, + { + "epoch": 0.3597138990914363, + "grad_norm": 6.291881561279297, + "learning_rate": 4.889448963226028e-06, + "logits/chosen": 16.033544540405273, + "logits/rejected": 10.01887035369873, + "logps/chosen": -243.99578857421875, + "logps/rejected": -255.91879272460938, + "loss": 0.6742, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.3948141932487488, + "rewards/margins": 0.2785394489765167, + "rewards/rejected": -0.6733536124229431, + "step": 2326 + }, + { + "epoch": 0.35986854823120046, + "grad_norm": 5.036777019500732, + "learning_rate": 4.889162561576355e-06, + "logits/chosen": 10.51666259765625, + "logits/rejected": 6.418890476226807, + "logps/chosen": -241.8315887451172, + "logps/rejected": -179.93482971191406, + "loss": 0.672, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.19013568758964539, + "rewards/margins": 0.14670856297016144, + "rewards/rejected": -0.33684423565864563, + "step": 2327 + }, + { + "epoch": 0.3600231973709646, + "grad_norm": 5.67700719833374, + "learning_rate": 4.8888761599266815e-06, + "logits/chosen": 10.7471923828125, + "logits/rejected": 7.230803966522217, + "logps/chosen": -392.7158508300781, + "logps/rejected": -280.47772216796875, + "loss": 0.564, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.021594032645225525, + "rewards/margins": 0.4154955744743347, + "rewards/rejected": -0.3939015567302704, + "step": 2328 + }, + { + "epoch": 0.36017784651072876, + "grad_norm": 5.598464488983154, + "learning_rate": 4.888589758277008e-06, + "logits/chosen": 10.959207534790039, + "logits/rejected": 12.219639778137207, + "logps/chosen": -306.4432373046875, + "logps/rejected": -337.88043212890625, + "loss": 0.5788, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.25032198429107666, + "rewards/margins": 0.29896223545074463, + "rewards/rejected": -0.5492841601371765, + "step": 2329 + }, + { + "epoch": 0.36033249565049297, + "grad_norm": 6.4484028816223145, + "learning_rate": 4.888303356627335e-06, + "logits/chosen": 16.292165756225586, + "logits/rejected": 6.625369071960449, + "logps/chosen": -425.8658752441406, + "logps/rejected": -284.1050109863281, + "loss": 0.6799, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.3909912109375, + "rewards/margins": 0.10917416214942932, + "rewards/rejected": -0.5001653432846069, + "step": 2330 + }, + { + "epoch": 0.3604871447902571, + "grad_norm": 6.582698822021484, + "learning_rate": 4.8880169549776614e-06, + "logits/chosen": 2.9579524993896484, + "logits/rejected": 7.169062614440918, + "logps/chosen": -157.89012145996094, + "logps/rejected": -239.32408142089844, + "loss": 1.0264, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.6352663040161133, + "rewards/margins": -0.39971378445625305, + "rewards/rejected": -0.23555254936218262, + "step": 2331 + }, + { + "epoch": 0.36064179393002127, + "grad_norm": 5.106259346008301, + "learning_rate": 4.887730553327987e-06, + "logits/chosen": 15.27623176574707, + "logits/rejected": 6.068172931671143, + "logps/chosen": -382.0967712402344, + "logps/rejected": -254.91404724121094, + "loss": 0.5111, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.12590059638023376, + "rewards/margins": 0.4322192072868347, + "rewards/rejected": -0.5581198334693909, + "step": 2332 + }, + { + "epoch": 0.3607964430697854, + "grad_norm": 4.261303424835205, + "learning_rate": 4.887444151678314e-06, + "logits/chosen": 9.271157264709473, + "logits/rejected": 9.21375846862793, + "logps/chosen": -189.50863647460938, + "logps/rejected": -156.10137939453125, + "loss": 0.5979, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.2089763581752777, + "rewards/margins": 0.24481363594532013, + "rewards/rejected": -0.45379000902175903, + "step": 2333 + }, + { + "epoch": 0.3609510922095496, + "grad_norm": 6.183831214904785, + "learning_rate": 4.8871577500286405e-06, + "logits/chosen": 5.848118782043457, + "logits/rejected": 12.195114135742188, + "logps/chosen": -231.59710693359375, + "logps/rejected": -307.4557189941406, + "loss": 0.8476, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.3755185008049011, + "rewards/margins": -0.1300891786813736, + "rewards/rejected": -0.24542932212352753, + "step": 2334 + }, + { + "epoch": 0.3611057413493137, + "grad_norm": 5.124051570892334, + "learning_rate": 4.886871348378967e-06, + "logits/chosen": 10.781159400939941, + "logits/rejected": 12.605043411254883, + "logps/chosen": -223.93026733398438, + "logps/rejected": -209.89682006835938, + "loss": 0.7497, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.32741034030914307, + "rewards/margins": -0.05141666904091835, + "rewards/rejected": -0.2759937047958374, + "step": 2335 + }, + { + "epoch": 0.36126039048907793, + "grad_norm": 5.250643253326416, + "learning_rate": 4.886584946729294e-06, + "logits/chosen": 8.486542701721191, + "logits/rejected": 4.996033668518066, + "logps/chosen": -232.08023071289062, + "logps/rejected": -136.31983947753906, + "loss": 0.4867, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.005316736176609993, + "rewards/margins": 0.5012938380241394, + "rewards/rejected": -0.506610631942749, + "step": 2336 + }, + { + "epoch": 0.3614150396288421, + "grad_norm": 6.31569766998291, + "learning_rate": 4.8862985450796205e-06, + "logits/chosen": 8.633461952209473, + "logits/rejected": 3.6754908561706543, + "logps/chosen": -357.68524169921875, + "logps/rejected": -220.08511352539062, + "loss": 0.6646, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.07489103823900223, + "rewards/margins": 0.12376195192337036, + "rewards/rejected": -0.198652982711792, + "step": 2337 + }, + { + "epoch": 0.36156968876860623, + "grad_norm": 5.512584686279297, + "learning_rate": 4.886012143429947e-06, + "logits/chosen": 10.981819152832031, + "logits/rejected": -1.7955965995788574, + "logps/chosen": -284.3398742675781, + "logps/rejected": -160.30117797851562, + "loss": 0.6178, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.3555864989757538, + "rewards/margins": 0.2697451710700989, + "rewards/rejected": -0.6253317594528198, + "step": 2338 + }, + { + "epoch": 0.3617243379083704, + "grad_norm": 6.299797534942627, + "learning_rate": 4.885725741780273e-06, + "logits/chosen": 12.408297538757324, + "logits/rejected": 12.135342597961426, + "logps/chosen": -288.1808776855469, + "logps/rejected": -345.37286376953125, + "loss": 0.667, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.1627824902534485, + "rewards/margins": 0.1001102402806282, + "rewards/rejected": -0.2628927230834961, + "step": 2339 + }, + { + "epoch": 0.36187898704813454, + "grad_norm": 5.398398399353027, + "learning_rate": 4.8854393401306e-06, + "logits/chosen": 8.593003273010254, + "logits/rejected": 8.416049003601074, + "logps/chosen": -227.72084045410156, + "logps/rejected": -214.4921417236328, + "loss": 0.6238, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.19532723724842072, + "rewards/margins": 0.3638247847557068, + "rewards/rejected": -0.5591520071029663, + "step": 2340 + }, + { + "epoch": 0.3620336361878987, + "grad_norm": 4.384174823760986, + "learning_rate": 4.885152938480926e-06, + "logits/chosen": 9.611751556396484, + "logits/rejected": 8.927783012390137, + "logps/chosen": -322.15179443359375, + "logps/rejected": -262.297119140625, + "loss": 0.6181, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.258592426776886, + "rewards/margins": 0.2849966287612915, + "rewards/rejected": -0.5435889959335327, + "step": 2341 + }, + { + "epoch": 0.36218828532766284, + "grad_norm": 7.699501991271973, + "learning_rate": 4.884866536831253e-06, + "logits/chosen": 18.6249942779541, + "logits/rejected": 11.507184982299805, + "logps/chosen": -368.5857849121094, + "logps/rejected": -314.67877197265625, + "loss": 0.8746, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.5849729776382446, + "rewards/margins": -0.209983229637146, + "rewards/rejected": -0.37498971819877625, + "step": 2342 + }, + { + "epoch": 0.36234293446742705, + "grad_norm": 7.476219654083252, + "learning_rate": 4.8845801351815796e-06, + "logits/chosen": 6.921806335449219, + "logits/rejected": 9.927090644836426, + "logps/chosen": -314.04937744140625, + "logps/rejected": -340.1742248535156, + "loss": 0.805, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.34801045060157776, + "rewards/margins": -0.1438456028699875, + "rewards/rejected": -0.20416483283042908, + "step": 2343 + }, + { + "epoch": 0.3624975836071912, + "grad_norm": 4.399616241455078, + "learning_rate": 4.884293733531905e-06, + "logits/chosen": 14.571311950683594, + "logits/rejected": 6.413580417633057, + "logps/chosen": -318.14996337890625, + "logps/rejected": -269.2610778808594, + "loss": 0.5276, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.30601176619529724, + "rewards/margins": 0.5159397125244141, + "rewards/rejected": -0.8219515085220337, + "step": 2344 + }, + { + "epoch": 0.36265223274695535, + "grad_norm": 7.860400199890137, + "learning_rate": 4.884007331882232e-06, + "logits/chosen": 12.45594310760498, + "logits/rejected": 8.262201309204102, + "logps/chosen": -275.4473571777344, + "logps/rejected": -226.03073120117188, + "loss": 0.6237, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.15561650693416595, + "rewards/margins": 0.3029440641403198, + "rewards/rejected": -0.458560585975647, + "step": 2345 + }, + { + "epoch": 0.3628068818867195, + "grad_norm": 8.57366943359375, + "learning_rate": 4.883720930232559e-06, + "logits/chosen": 5.303665637969971, + "logits/rejected": 6.4736247062683105, + "logps/chosen": -275.08843994140625, + "logps/rejected": -369.48492431640625, + "loss": 0.78, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.28122615814208984, + "rewards/margins": -0.032385632395744324, + "rewards/rejected": -0.24884052574634552, + "step": 2346 + }, + { + "epoch": 0.36296153102648365, + "grad_norm": 3.795870065689087, + "learning_rate": 4.883434528582885e-06, + "logits/chosen": 6.8698906898498535, + "logits/rejected": 3.725409984588623, + "logps/chosen": -170.1791534423828, + "logps/rejected": -148.29632568359375, + "loss": 0.564, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.3769446313381195, + "rewards/margins": 0.33366379141807556, + "rewards/rejected": -0.7106083631515503, + "step": 2347 + }, + { + "epoch": 0.3631161801662478, + "grad_norm": 7.328372001647949, + "learning_rate": 4.883148126933211e-06, + "logits/chosen": 4.369106292724609, + "logits/rejected": 12.003111839294434, + "logps/chosen": -333.47235107421875, + "logps/rejected": -472.6842956542969, + "loss": 0.7566, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.17366792261600494, + "rewards/margins": 0.07455785572528839, + "rewards/rejected": -0.24822577834129333, + "step": 2348 + }, + { + "epoch": 0.363270829306012, + "grad_norm": 6.609291076660156, + "learning_rate": 4.882861725283538e-06, + "logits/chosen": 10.910910606384277, + "logits/rejected": 8.999237060546875, + "logps/chosen": -274.8633728027344, + "logps/rejected": -322.33367919921875, + "loss": 0.7171, + "rewards/accuracies": 0.25, + "rewards/chosen": -0.06630589812994003, + "rewards/margins": 0.023826494812965393, + "rewards/rejected": -0.09013240039348602, + "step": 2349 + }, + { + "epoch": 0.36342547844577616, + "grad_norm": 4.94293737411499, + "learning_rate": 4.882575323633864e-06, + "logits/chosen": 12.231851577758789, + "logits/rejected": 1.6587518453598022, + "logps/chosen": -244.8386993408203, + "logps/rejected": -133.571533203125, + "loss": 0.6544, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.3842792212963104, + "rewards/margins": 0.11548030376434326, + "rewards/rejected": -0.4997595250606537, + "step": 2350 + }, + { + "epoch": 0.3635801275855403, + "grad_norm": 4.537017822265625, + "learning_rate": 4.882288921984191e-06, + "logits/chosen": 5.542438507080078, + "logits/rejected": 7.889125823974609, + "logps/chosen": -269.3973693847656, + "logps/rejected": -283.2834167480469, + "loss": 0.6178, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.12994347512722015, + "rewards/margins": 0.25747787952423096, + "rewards/rejected": -0.1275343894958496, + "step": 2351 + }, + { + "epoch": 0.36373477672530447, + "grad_norm": 13.991962432861328, + "learning_rate": 4.882002520334517e-06, + "logits/chosen": 10.358453750610352, + "logits/rejected": 5.808764934539795, + "logps/chosen": -215.3419647216797, + "logps/rejected": -172.7190704345703, + "loss": 0.6002, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.11588548123836517, + "rewards/margins": 0.2355099618434906, + "rewards/rejected": -0.3513954281806946, + "step": 2352 + }, + { + "epoch": 0.3638894258650686, + "grad_norm": 5.103969097137451, + "learning_rate": 4.8817161186848435e-06, + "logits/chosen": 7.68189811706543, + "logits/rejected": 6.0498127937316895, + "logps/chosen": -184.1732635498047, + "logps/rejected": -217.8790740966797, + "loss": 0.7151, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.35028061270713806, + "rewards/margins": 0.060576677322387695, + "rewards/rejected": -0.41085729002952576, + "step": 2353 + }, + { + "epoch": 0.36404407500483277, + "grad_norm": 5.266611576080322, + "learning_rate": 4.88142971703517e-06, + "logits/chosen": 6.056519508361816, + "logits/rejected": 6.682257652282715, + "logps/chosen": -220.18661499023438, + "logps/rejected": -272.0639343261719, + "loss": 0.6582, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.012373637408018112, + "rewards/margins": 0.12627770006656647, + "rewards/rejected": -0.11390408128499985, + "step": 2354 + }, + { + "epoch": 0.3641987241445969, + "grad_norm": 2.9377174377441406, + "learning_rate": 4.881143315385497e-06, + "logits/chosen": 7.589189529418945, + "logits/rejected": 3.4054646492004395, + "logps/chosen": -171.55177307128906, + "logps/rejected": -164.3984375, + "loss": 0.5127, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.14273259043693542, + "rewards/margins": 0.5173119306564331, + "rewards/rejected": -0.6600445508956909, + "step": 2355 + }, + { + "epoch": 0.3643533732843611, + "grad_norm": 6.725290775299072, + "learning_rate": 4.8808569137358235e-06, + "logits/chosen": 14.152974128723145, + "logits/rejected": 13.475658416748047, + "logps/chosen": -367.3597106933594, + "logps/rejected": -272.80126953125, + "loss": 0.8065, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.0828234925866127, + "rewards/margins": -0.1337573379278183, + "rewards/rejected": 0.0509338453412056, + "step": 2356 + }, + { + "epoch": 0.3645080224241253, + "grad_norm": 3.9029901027679443, + "learning_rate": 4.88057051208615e-06, + "logits/chosen": 11.248859405517578, + "logits/rejected": 8.602968215942383, + "logps/chosen": -293.4554443359375, + "logps/rejected": -330.5849304199219, + "loss": 0.5125, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.08047932386398315, + "rewards/margins": 0.4764372408390045, + "rewards/rejected": -0.39595791697502136, + "step": 2357 + }, + { + "epoch": 0.36466267156388943, + "grad_norm": 6.206760406494141, + "learning_rate": 4.880284110436477e-06, + "logits/chosen": 9.542941093444824, + "logits/rejected": 12.10243034362793, + "logps/chosen": -347.6027526855469, + "logps/rejected": -366.81890869140625, + "loss": 0.7131, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.0746513307094574, + "rewards/margins": 0.07401637732982635, + "rewards/rejected": -0.14866770803928375, + "step": 2358 + }, + { + "epoch": 0.3648173207036536, + "grad_norm": 5.568342208862305, + "learning_rate": 4.8799977087868026e-06, + "logits/chosen": 5.272181987762451, + "logits/rejected": 7.458010196685791, + "logps/chosen": -190.6249542236328, + "logps/rejected": -248.82757568359375, + "loss": 0.6727, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.31546923518180847, + "rewards/margins": 0.07132832705974579, + "rewards/rejected": -0.38679754734039307, + "step": 2359 + }, + { + "epoch": 0.36497196984341773, + "grad_norm": 5.3074774742126465, + "learning_rate": 4.879711307137129e-06, + "logits/chosen": 9.64173412322998, + "logits/rejected": 8.144506454467773, + "logps/chosen": -255.3279266357422, + "logps/rejected": -238.05764770507812, + "loss": 0.6663, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.3361782431602478, + "rewards/margins": 0.13661952316761017, + "rewards/rejected": -0.4727977514266968, + "step": 2360 + }, + { + "epoch": 0.3651266189831819, + "grad_norm": 6.0828351974487305, + "learning_rate": 4.879424905487456e-06, + "logits/chosen": 10.668529510498047, + "logits/rejected": 9.489906311035156, + "logps/chosen": -321.1646423339844, + "logps/rejected": -282.66729736328125, + "loss": 0.7068, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.24036678671836853, + "rewards/margins": 0.06111770123243332, + "rewards/rejected": -0.30148449540138245, + "step": 2361 + }, + { + "epoch": 0.3652812681229461, + "grad_norm": 6.2184529304504395, + "learning_rate": 4.8791385038377825e-06, + "logits/chosen": 10.269166946411133, + "logits/rejected": 10.164972305297852, + "logps/chosen": -302.8077697753906, + "logps/rejected": -338.8974609375, + "loss": 0.6137, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.09644690155982971, + "rewards/margins": 0.20035693049430847, + "rewards/rejected": -0.2968038022518158, + "step": 2362 + }, + { + "epoch": 0.36543591726271024, + "grad_norm": 5.861464023590088, + "learning_rate": 4.878852102188109e-06, + "logits/chosen": 6.682910919189453, + "logits/rejected": 1.3267126083374023, + "logps/chosen": -203.043212890625, + "logps/rejected": -184.43333435058594, + "loss": 0.7404, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.5007709264755249, + "rewards/margins": -0.03474564850330353, + "rewards/rejected": -0.4660252630710602, + "step": 2363 + }, + { + "epoch": 0.3655905664024744, + "grad_norm": 5.164547443389893, + "learning_rate": 4.878565700538436e-06, + "logits/chosen": 6.862906455993652, + "logits/rejected": 7.993058204650879, + "logps/chosen": -294.6205749511719, + "logps/rejected": -302.4521789550781, + "loss": 0.67, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.05370015650987625, + "rewards/margins": 0.2336564064025879, + "rewards/rejected": -0.17995625734329224, + "step": 2364 + }, + { + "epoch": 0.36574521554223854, + "grad_norm": 7.211198329925537, + "learning_rate": 4.878279298888762e-06, + "logits/chosen": 10.631901741027832, + "logits/rejected": 6.959458827972412, + "logps/chosen": -410.842041015625, + "logps/rejected": -338.8072814941406, + "loss": 0.6009, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.06388301402330399, + "rewards/margins": 0.31291818618774414, + "rewards/rejected": -0.3768012225627899, + "step": 2365 + }, + { + "epoch": 0.3658998646820027, + "grad_norm": 7.287830352783203, + "learning_rate": 4.877992897239088e-06, + "logits/chosen": 9.525850296020508, + "logits/rejected": 4.018980503082275, + "logps/chosen": -241.68467712402344, + "logps/rejected": -166.04310607910156, + "loss": 0.7677, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.29303646087646484, + "rewards/margins": -0.014094941318035126, + "rewards/rejected": -0.2789415121078491, + "step": 2366 + }, + { + "epoch": 0.36605451382176685, + "grad_norm": 4.5175652503967285, + "learning_rate": 4.877706495589415e-06, + "logits/chosen": 7.682052135467529, + "logits/rejected": 7.023406982421875, + "logps/chosen": -197.8852996826172, + "logps/rejected": -216.0009002685547, + "loss": 0.5906, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.14389914274215698, + "rewards/margins": 0.3550753891468048, + "rewards/rejected": -0.21117626130580902, + "step": 2367 + }, + { + "epoch": 0.36620916296153105, + "grad_norm": 4.615246295928955, + "learning_rate": 4.877420093939742e-06, + "logits/chosen": 13.965363502502441, + "logits/rejected": 11.511537551879883, + "logps/chosen": -302.0827941894531, + "logps/rejected": -216.3037567138672, + "loss": 0.657, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.055520229041576385, + "rewards/margins": 0.1335691511631012, + "rewards/rejected": -0.18908938765525818, + "step": 2368 + }, + { + "epoch": 0.3663638121012952, + "grad_norm": 6.347133159637451, + "learning_rate": 4.877133692290068e-06, + "logits/chosen": 10.524088859558105, + "logits/rejected": 13.706088066101074, + "logps/chosen": -275.6937255859375, + "logps/rejected": -296.3673400878906, + "loss": 0.7062, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.12865515053272247, + "rewards/margins": 0.10073898732662201, + "rewards/rejected": -0.22939416766166687, + "step": 2369 + }, + { + "epoch": 0.36651846124105936, + "grad_norm": 4.301425457000732, + "learning_rate": 4.876847290640395e-06, + "logits/chosen": 10.749343872070312, + "logits/rejected": 3.797736167907715, + "logps/chosen": -294.8398132324219, + "logps/rejected": -196.67572021484375, + "loss": 0.5578, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.1838666945695877, + "rewards/margins": 0.3971025049686432, + "rewards/rejected": -0.21323581039905548, + "step": 2370 + }, + { + "epoch": 0.3666731103808235, + "grad_norm": 4.686715602874756, + "learning_rate": 4.8765608889907215e-06, + "logits/chosen": 10.325092315673828, + "logits/rejected": 0.4748508930206299, + "logps/chosen": -273.77069091796875, + "logps/rejected": -163.8691864013672, + "loss": 0.634, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.09108364582061768, + "rewards/margins": 0.28942739963531494, + "rewards/rejected": -0.3805110454559326, + "step": 2371 + }, + { + "epoch": 0.36682775952058766, + "grad_norm": 3.7439653873443604, + "learning_rate": 4.876274487341047e-06, + "logits/chosen": 4.917524337768555, + "logits/rejected": 3.206486701965332, + "logps/chosen": -254.50897216796875, + "logps/rejected": -143.85720825195312, + "loss": 0.576, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.05421806871891022, + "rewards/margins": 0.29991936683654785, + "rewards/rejected": -0.3541374206542969, + "step": 2372 + }, + { + "epoch": 0.3669824086603518, + "grad_norm": 5.920764446258545, + "learning_rate": 4.875988085691374e-06, + "logits/chosen": 12.630250930786133, + "logits/rejected": 12.027786254882812, + "logps/chosen": -293.1812744140625, + "logps/rejected": -277.10992431640625, + "loss": 0.7128, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.2882464528083801, + "rewards/margins": 0.04276639223098755, + "rewards/rejected": -0.3310127854347229, + "step": 2373 + }, + { + "epoch": 0.36713705780011596, + "grad_norm": 5.011660575866699, + "learning_rate": 4.875701684041701e-06, + "logits/chosen": 5.424834251403809, + "logits/rejected": 3.492262363433838, + "logps/chosen": -185.93032836914062, + "logps/rejected": -192.091552734375, + "loss": 0.6556, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.12888306379318237, + "rewards/margins": 0.15367940068244934, + "rewards/rejected": -0.2825624942779541, + "step": 2374 + }, + { + "epoch": 0.36729170693988017, + "grad_norm": 5.110328197479248, + "learning_rate": 4.875415282392027e-06, + "logits/chosen": 6.217820167541504, + "logits/rejected": 5.6821064949035645, + "logps/chosen": -278.514892578125, + "logps/rejected": -290.5674743652344, + "loss": 0.6137, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.2135058492422104, + "rewards/margins": 0.29518839716911316, + "rewards/rejected": -0.5086942315101624, + "step": 2375 + }, + { + "epoch": 0.3674463560796443, + "grad_norm": 5.255284309387207, + "learning_rate": 4.875128880742354e-06, + "logits/chosen": 10.826190948486328, + "logits/rejected": 6.227410793304443, + "logps/chosen": -356.1262512207031, + "logps/rejected": -232.88088989257812, + "loss": 0.7445, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.030341394245624542, + "rewards/margins": 0.10422481596469879, + "rewards/rejected": -0.13456621766090393, + "step": 2376 + }, + { + "epoch": 0.36760100521940847, + "grad_norm": 5.0341997146606445, + "learning_rate": 4.87484247909268e-06, + "logits/chosen": 12.82275390625, + "logits/rejected": 9.950830459594727, + "logps/chosen": -204.505615234375, + "logps/rejected": -177.79937744140625, + "loss": 0.6338, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.276117205619812, + "rewards/margins": 0.14440016448497772, + "rewards/rejected": -0.42051735520362854, + "step": 2377 + }, + { + "epoch": 0.3677556543591726, + "grad_norm": 5.252141952514648, + "learning_rate": 4.874556077443006e-06, + "logits/chosen": 14.976068496704102, + "logits/rejected": 1.7651383876800537, + "logps/chosen": -469.15869140625, + "logps/rejected": -251.12576293945312, + "loss": 0.5003, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.083179771900177, + "rewards/margins": 0.5186883211135864, + "rewards/rejected": -0.4355085492134094, + "step": 2378 + }, + { + "epoch": 0.3679103034989368, + "grad_norm": 4.702194690704346, + "learning_rate": 4.874269675793333e-06, + "logits/chosen": 8.767407417297363, + "logits/rejected": 8.445731163024902, + "logps/chosen": -224.59815979003906, + "logps/rejected": -246.44595336914062, + "loss": 0.6852, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.09728317707777023, + "rewards/margins": 0.060402654111385345, + "rewards/rejected": -0.15768583118915558, + "step": 2379 + }, + { + "epoch": 0.3680649526387009, + "grad_norm": 12.987638473510742, + "learning_rate": 4.87398327414366e-06, + "logits/chosen": 9.733770370483398, + "logits/rejected": 10.07154655456543, + "logps/chosen": -270.3354187011719, + "logps/rejected": -327.7171936035156, + "loss": 0.5639, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.12341442704200745, + "rewards/margins": 0.36424195766448975, + "rewards/rejected": -0.2408275604248047, + "step": 2380 + }, + { + "epoch": 0.36821960177846513, + "grad_norm": 6.640517234802246, + "learning_rate": 4.873696872493986e-06, + "logits/chosen": 1.9106736183166504, + "logits/rejected": 7.916538238525391, + "logps/chosen": -173.9552001953125, + "logps/rejected": -203.43161010742188, + "loss": 0.8338, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.48268765211105347, + "rewards/margins": -0.17412656545639038, + "rewards/rejected": -0.3085610866546631, + "step": 2381 + }, + { + "epoch": 0.3683742509182293, + "grad_norm": 8.06789779663086, + "learning_rate": 4.873410470844312e-06, + "logits/chosen": 7.257105827331543, + "logits/rejected": 11.474893569946289, + "logps/chosen": -364.2514953613281, + "logps/rejected": -368.940673828125, + "loss": 0.6514, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.47181302309036255, + "rewards/margins": 0.10804357379674911, + "rewards/rejected": -0.5798565745353699, + "step": 2382 + }, + { + "epoch": 0.36852890005799344, + "grad_norm": 6.6192307472229, + "learning_rate": 4.873124069194639e-06, + "logits/chosen": 4.267529010772705, + "logits/rejected": -0.2083573341369629, + "logps/chosen": -307.0151672363281, + "logps/rejected": -195.8062286376953, + "loss": 0.6424, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.02369135618209839, + "rewards/margins": 0.4160001575946808, + "rewards/rejected": -0.43969154357910156, + "step": 2383 + }, + { + "epoch": 0.3686835491977576, + "grad_norm": 5.832785606384277, + "learning_rate": 4.8728376675449654e-06, + "logits/chosen": 5.619063377380371, + "logits/rejected": 6.483266353607178, + "logps/chosen": -140.34205627441406, + "logps/rejected": -136.52633666992188, + "loss": 0.8751, + "rewards/accuracies": 0.125, + "rewards/chosen": -0.4540998339653015, + "rewards/margins": -0.3129621744155884, + "rewards/rejected": -0.14113768935203552, + "step": 2384 + }, + { + "epoch": 0.36883819833752174, + "grad_norm": 4.609332084655762, + "learning_rate": 4.872551265895292e-06, + "logits/chosen": 12.109814643859863, + "logits/rejected": 9.321410179138184, + "logps/chosen": -209.52783203125, + "logps/rejected": -260.81695556640625, + "loss": 0.6112, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.10003555566072464, + "rewards/margins": 0.21664945781230927, + "rewards/rejected": -0.3166849911212921, + "step": 2385 + }, + { + "epoch": 0.3689928474772859, + "grad_norm": 4.604510307312012, + "learning_rate": 4.872264864245618e-06, + "logits/chosen": 7.773797035217285, + "logits/rejected": 9.025056838989258, + "logps/chosen": -202.4029083251953, + "logps/rejected": -193.79739379882812, + "loss": 0.6154, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.1768815517425537, + "rewards/margins": 0.20046842098236084, + "rewards/rejected": -0.37734994292259216, + "step": 2386 + }, + { + "epoch": 0.36914749661705004, + "grad_norm": 4.75264835357666, + "learning_rate": 4.8719784625959445e-06, + "logits/chosen": 5.044244289398193, + "logits/rejected": 2.677712917327881, + "logps/chosen": -225.36862182617188, + "logps/rejected": -164.3058319091797, + "loss": 0.6358, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.3549291491508484, + "rewards/margins": 0.17416629195213318, + "rewards/rejected": -0.5290954113006592, + "step": 2387 + }, + { + "epoch": 0.36930214575681425, + "grad_norm": 4.902527332305908, + "learning_rate": 4.871692060946271e-06, + "logits/chosen": 10.700467109680176, + "logits/rejected": 4.135343074798584, + "logps/chosen": -281.79925537109375, + "logps/rejected": -200.09530639648438, + "loss": 0.5722, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.10641193389892578, + "rewards/margins": 0.34592026472091675, + "rewards/rejected": -0.23950831592082977, + "step": 2388 + }, + { + "epoch": 0.3694567948965784, + "grad_norm": 4.677819728851318, + "learning_rate": 4.871405659296598e-06, + "logits/chosen": 12.49563217163086, + "logits/rejected": 9.802335739135742, + "logps/chosen": -274.1383361816406, + "logps/rejected": -216.48519897460938, + "loss": 0.6362, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.11770892888307571, + "rewards/margins": 0.1549261510372162, + "rewards/rejected": -0.03721722960472107, + "step": 2389 + }, + { + "epoch": 0.36961144403634255, + "grad_norm": 4.814740180969238, + "learning_rate": 4.8711192576469245e-06, + "logits/chosen": 16.692867279052734, + "logits/rejected": 10.169358253479004, + "logps/chosen": -285.1744689941406, + "logps/rejected": -209.13375854492188, + "loss": 0.6241, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.0999113991856575, + "rewards/margins": 0.19803360104560852, + "rewards/rejected": -0.09812220931053162, + "step": 2390 + }, + { + "epoch": 0.3697660931761067, + "grad_norm": 6.51661491394043, + "learning_rate": 4.870832855997251e-06, + "logits/chosen": 9.130752563476562, + "logits/rejected": 11.86257553100586, + "logps/chosen": -385.63970947265625, + "logps/rejected": -411.24090576171875, + "loss": 0.7663, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.060430534183979034, + "rewards/margins": -0.021870076656341553, + "rewards/rejected": 0.0823005735874176, + "step": 2391 + }, + { + "epoch": 0.36992074231587085, + "grad_norm": 4.617303371429443, + "learning_rate": 4.870546454347577e-06, + "logits/chosen": 9.277838706970215, + "logits/rejected": 5.246686935424805, + "logps/chosen": -242.91604614257812, + "logps/rejected": -205.66676330566406, + "loss": 0.5409, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.03974518924951553, + "rewards/margins": 0.4512771964073181, + "rewards/rejected": -0.49102237820625305, + "step": 2392 + }, + { + "epoch": 0.370075391455635, + "grad_norm": 4.115781307220459, + "learning_rate": 4.870260052697904e-06, + "logits/chosen": 11.460028648376465, + "logits/rejected": 9.533269882202148, + "logps/chosen": -216.5787353515625, + "logps/rejected": -219.90652465820312, + "loss": 0.55, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.21053925156593323, + "rewards/margins": 0.5041624307632446, + "rewards/rejected": -0.2936231791973114, + "step": 2393 + }, + { + "epoch": 0.3702300405953992, + "grad_norm": 5.619273662567139, + "learning_rate": 4.86997365104823e-06, + "logits/chosen": 12.299894332885742, + "logits/rejected": 10.783432006835938, + "logps/chosen": -297.86395263671875, + "logps/rejected": -242.69082641601562, + "loss": 0.7247, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.12371206283569336, + "rewards/margins": 0.06363438069820404, + "rewards/rejected": -0.1873464584350586, + "step": 2394 + }, + { + "epoch": 0.37038468973516336, + "grad_norm": 4.4429612159729, + "learning_rate": 4.869687249398557e-06, + "logits/chosen": 15.541686058044434, + "logits/rejected": 8.054214477539062, + "logps/chosen": -297.373046875, + "logps/rejected": -159.5595703125, + "loss": 0.608, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.04231991618871689, + "rewards/margins": 0.29553380608558655, + "rewards/rejected": -0.33785372972488403, + "step": 2395 + }, + { + "epoch": 0.3705393388749275, + "grad_norm": 5.448227405548096, + "learning_rate": 4.8694008477488836e-06, + "logits/chosen": 8.33600902557373, + "logits/rejected": 2.7957589626312256, + "logps/chosen": -371.14532470703125, + "logps/rejected": -300.59466552734375, + "loss": 0.5739, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.01331009715795517, + "rewards/margins": 0.35207265615463257, + "rewards/rejected": -0.3387625813484192, + "step": 2396 + }, + { + "epoch": 0.37069398801469167, + "grad_norm": 7.116875171661377, + "learning_rate": 4.86911444609921e-06, + "logits/chosen": 4.180120468139648, + "logits/rejected": 9.486620903015137, + "logps/chosen": -265.0079345703125, + "logps/rejected": -320.7835693359375, + "loss": 0.8892, + "rewards/accuracies": 0.125, + "rewards/chosen": -0.5311102271080017, + "rewards/margins": -0.3332226872444153, + "rewards/rejected": -0.19788752496242523, + "step": 2397 + }, + { + "epoch": 0.3708486371544558, + "grad_norm": 5.047080039978027, + "learning_rate": 4.868828044449536e-06, + "logits/chosen": 5.533426284790039, + "logits/rejected": 5.468631744384766, + "logps/chosen": -334.0582275390625, + "logps/rejected": -331.41571044921875, + "loss": 0.5213, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.1340743452310562, + "rewards/margins": 0.5435271263122559, + "rewards/rejected": -0.4094528555870056, + "step": 2398 + }, + { + "epoch": 0.37100328629421997, + "grad_norm": 5.215960502624512, + "learning_rate": 4.868541642799863e-06, + "logits/chosen": 10.979272842407227, + "logits/rejected": 3.5393314361572266, + "logps/chosen": -346.67572021484375, + "logps/rejected": -270.45391845703125, + "loss": 0.5911, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.19033685326576233, + "rewards/margins": 0.30940285325050354, + "rewards/rejected": -0.4997396767139435, + "step": 2399 + }, + { + "epoch": 0.3711579354339841, + "grad_norm": 7.44083833694458, + "learning_rate": 4.868255241150189e-06, + "logits/chosen": 9.197869300842285, + "logits/rejected": 10.546732902526855, + "logps/chosen": -301.154541015625, + "logps/rejected": -325.6441345214844, + "loss": 0.6718, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.20265166461467743, + "rewards/margins": 0.08321147412061691, + "rewards/rejected": 0.11944020539522171, + "step": 2400 + }, + { + "epoch": 0.3713125845737483, + "grad_norm": 7.2116780281066895, + "learning_rate": 4.867968839500516e-06, + "logits/chosen": 9.496402740478516, + "logits/rejected": 7.272639274597168, + "logps/chosen": -392.7807922363281, + "logps/rejected": -390.8681640625, + "loss": 0.7087, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.08842869102954865, + "rewards/margins": 0.24314594268798828, + "rewards/rejected": -0.15471723675727844, + "step": 2401 + }, + { + "epoch": 0.3714672337135125, + "grad_norm": 4.874665260314941, + "learning_rate": 4.867682437850843e-06, + "logits/chosen": 13.588971138000488, + "logits/rejected": 12.379103660583496, + "logps/chosen": -324.69183349609375, + "logps/rejected": -274.97918701171875, + "loss": 0.5758, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.14593347907066345, + "rewards/margins": 0.3790302872657776, + "rewards/rejected": -0.23309679329395294, + "step": 2402 + }, + { + "epoch": 0.37162188285327663, + "grad_norm": 9.878633499145508, + "learning_rate": 4.867396036201169e-06, + "logits/chosen": 10.561646461486816, + "logits/rejected": 7.264138221740723, + "logps/chosen": -426.41949462890625, + "logps/rejected": -375.1007080078125, + "loss": 0.7279, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.0992315337061882, + "rewards/margins": 0.0367194339632988, + "rewards/rejected": 0.06251209229230881, + "step": 2403 + }, + { + "epoch": 0.3717765319930408, + "grad_norm": 7.184102535247803, + "learning_rate": 4.867109634551496e-06, + "logits/chosen": 9.5424165725708, + "logits/rejected": 7.9329328536987305, + "logps/chosen": -302.94561767578125, + "logps/rejected": -281.15411376953125, + "loss": 0.6258, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.08207236230373383, + "rewards/margins": 0.327249675989151, + "rewards/rejected": -0.24517729878425598, + "step": 2404 + }, + { + "epoch": 0.37193118113280493, + "grad_norm": 5.745555877685547, + "learning_rate": 4.866823232901822e-06, + "logits/chosen": 8.223381996154785, + "logits/rejected": 8.593246459960938, + "logps/chosen": -252.44442749023438, + "logps/rejected": -260.95721435546875, + "loss": 0.8353, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.38044825196266174, + "rewards/margins": -0.16518260538578033, + "rewards/rejected": -0.2152656763792038, + "step": 2405 + }, + { + "epoch": 0.3720858302725691, + "grad_norm": 6.383440971374512, + "learning_rate": 4.866536831252148e-06, + "logits/chosen": 9.163786888122559, + "logits/rejected": 10.457295417785645, + "logps/chosen": -276.6761474609375, + "logps/rejected": -272.112060546875, + "loss": 0.7696, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.10289005935192108, + "rewards/margins": -0.04424424469470978, + "rewards/rejected": -0.0586458183825016, + "step": 2406 + }, + { + "epoch": 0.3722404794123333, + "grad_norm": 7.81800651550293, + "learning_rate": 4.866250429602475e-06, + "logits/chosen": 6.324779033660889, + "logits/rejected": 3.4595487117767334, + "logps/chosen": -265.68890380859375, + "logps/rejected": -251.76902770996094, + "loss": 0.7418, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.07356545329093933, + "rewards/margins": -0.036809131503105164, + "rewards/rejected": -0.036756325513124466, + "step": 2407 + }, + { + "epoch": 0.37239512855209744, + "grad_norm": 6.409412860870361, + "learning_rate": 4.865964027952802e-06, + "logits/chosen": 10.697577476501465, + "logits/rejected": 12.21921157836914, + "logps/chosen": -208.1806640625, + "logps/rejected": -283.0392761230469, + "loss": 0.8412, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.22442729771137238, + "rewards/margins": -0.1961127668619156, + "rewards/rejected": -0.028314542025327682, + "step": 2408 + }, + { + "epoch": 0.3725497776918616, + "grad_norm": 4.9999284744262695, + "learning_rate": 4.865677626303128e-06, + "logits/chosen": 10.129697799682617, + "logits/rejected": 9.321220397949219, + "logps/chosen": -251.20555114746094, + "logps/rejected": -240.29701232910156, + "loss": 0.7, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.19308196008205414, + "rewards/margins": 0.11935662478208542, + "rewards/rejected": -0.31243857741355896, + "step": 2409 + }, + { + "epoch": 0.37270442683162575, + "grad_norm": 5.70963191986084, + "learning_rate": 4.865391224653455e-06, + "logits/chosen": 12.110647201538086, + "logits/rejected": 8.521402359008789, + "logps/chosen": -345.49847412109375, + "logps/rejected": -242.37579345703125, + "loss": 0.616, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.011140245944261551, + "rewards/margins": 0.18487143516540527, + "rewards/rejected": -0.19601169228553772, + "step": 2410 + }, + { + "epoch": 0.3728590759713899, + "grad_norm": 3.2742412090301514, + "learning_rate": 4.865104823003781e-06, + "logits/chosen": 11.43698501586914, + "logits/rejected": 9.223467826843262, + "logps/chosen": -274.5985412597656, + "logps/rejected": -199.3978271484375, + "loss": 0.4854, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.13046349585056305, + "rewards/margins": 0.6204354763031006, + "rewards/rejected": -0.48997196555137634, + "step": 2411 + }, + { + "epoch": 0.37301372511115405, + "grad_norm": 5.112938404083252, + "learning_rate": 4.864818421354107e-06, + "logits/chosen": 14.252082824707031, + "logits/rejected": 10.206888198852539, + "logps/chosen": -368.84783935546875, + "logps/rejected": -245.28164672851562, + "loss": 0.5694, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.00153336301445961, + "rewards/margins": 0.3470481038093567, + "rewards/rejected": -0.345514714717865, + "step": 2412 + }, + { + "epoch": 0.37316837425091826, + "grad_norm": 4.096931457519531, + "learning_rate": 4.864532019704434e-06, + "logits/chosen": 7.786017417907715, + "logits/rejected": 0.8664898872375488, + "logps/chosen": -304.21197509765625, + "logps/rejected": -188.03717041015625, + "loss": 0.475, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.26504290103912354, + "rewards/margins": 0.6168004274368286, + "rewards/rejected": -0.3517575263977051, + "step": 2413 + }, + { + "epoch": 0.3733230233906824, + "grad_norm": 10.065600395202637, + "learning_rate": 4.864245618054761e-06, + "logits/chosen": 5.228086471557617, + "logits/rejected": 7.355319023132324, + "logps/chosen": -334.7033386230469, + "logps/rejected": -249.94747924804688, + "loss": 0.7697, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.12815171480178833, + "rewards/margins": -0.11044847965240479, + "rewards/rejected": -0.017703257501125336, + "step": 2414 + }, + { + "epoch": 0.37347767253044656, + "grad_norm": 5.694726467132568, + "learning_rate": 4.8639592164050865e-06, + "logits/chosen": 11.951788902282715, + "logits/rejected": 5.954407691955566, + "logps/chosen": -337.16973876953125, + "logps/rejected": -208.03347778320312, + "loss": 0.7349, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.11614008247852325, + "rewards/margins": -0.00870351493358612, + "rewards/rejected": -0.10743656754493713, + "step": 2415 + }, + { + "epoch": 0.3736323216702107, + "grad_norm": 4.3964033126831055, + "learning_rate": 4.863672814755413e-06, + "logits/chosen": 12.928388595581055, + "logits/rejected": 13.472946166992188, + "logps/chosen": -200.55270385742188, + "logps/rejected": -194.20721435546875, + "loss": 0.6371, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.10276656597852707, + "rewards/margins": 0.15031331777572632, + "rewards/rejected": -0.047546759247779846, + "step": 2416 + }, + { + "epoch": 0.37378697080997486, + "grad_norm": 5.179884433746338, + "learning_rate": 4.86338641310574e-06, + "logits/chosen": 13.089771270751953, + "logits/rejected": 16.394187927246094, + "logps/chosen": -353.4322814941406, + "logps/rejected": -350.3993835449219, + "loss": 0.7536, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.02358187735080719, + "rewards/margins": -0.027330204844474792, + "rewards/rejected": 0.0037483256310224533, + "step": 2417 + }, + { + "epoch": 0.373941619949739, + "grad_norm": 5.609171390533447, + "learning_rate": 4.8631000114560665e-06, + "logits/chosen": 10.013446807861328, + "logits/rejected": 6.899214744567871, + "logps/chosen": -330.164306640625, + "logps/rejected": -232.083984375, + "loss": 0.7513, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.13165056705474854, + "rewards/margins": -0.04262585937976837, + "rewards/rejected": -0.08902469277381897, + "step": 2418 + }, + { + "epoch": 0.37409626908950316, + "grad_norm": 5.190798282623291, + "learning_rate": 4.862813609806393e-06, + "logits/chosen": 5.626438140869141, + "logits/rejected": 10.89083194732666, + "logps/chosen": -138.60348510742188, + "logps/rejected": -195.47848510742188, + "loss": 0.7349, + "rewards/accuracies": 0.25, + "rewards/chosen": -0.09010524302721024, + "rewards/margins": 0.030107304453849792, + "rewards/rejected": -0.12021255493164062, + "step": 2419 + }, + { + "epoch": 0.37425091822926737, + "grad_norm": 6.036233901977539, + "learning_rate": 4.862527208156719e-06, + "logits/chosen": 3.1176884174346924, + "logits/rejected": 8.94989013671875, + "logps/chosen": -161.10765075683594, + "logps/rejected": -262.7567138671875, + "loss": 0.7549, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.24192127585411072, + "rewards/margins": -0.046264126896858215, + "rewards/rejected": -0.1956571489572525, + "step": 2420 + }, + { + "epoch": 0.3744055673690315, + "grad_norm": 5.315489768981934, + "learning_rate": 4.862240806507046e-06, + "logits/chosen": 10.71999740600586, + "logits/rejected": 4.455151557922363, + "logps/chosen": -387.28582763671875, + "logps/rejected": -201.41685485839844, + "loss": 0.6543, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.0010783225297927856, + "rewards/margins": 0.2618444561958313, + "rewards/rejected": -0.2629227638244629, + "step": 2421 + }, + { + "epoch": 0.3745602165087957, + "grad_norm": 6.784698963165283, + "learning_rate": 4.861954404857372e-06, + "logits/chosen": 5.973268985748291, + "logits/rejected": 2.865466833114624, + "logps/chosen": -401.97113037109375, + "logps/rejected": -291.6532287597656, + "loss": 0.5707, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.09425545483827591, + "rewards/margins": 0.30102798342704773, + "rewards/rejected": -0.20677253603935242, + "step": 2422 + }, + { + "epoch": 0.3747148656485598, + "grad_norm": 4.686197757720947, + "learning_rate": 4.861668003207699e-06, + "logits/chosen": 11.251947402954102, + "logits/rejected": 4.704822540283203, + "logps/chosen": -337.48992919921875, + "logps/rejected": -234.16897583007812, + "loss": 0.5442, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.038409046828746796, + "rewards/margins": 0.35179638862609863, + "rewards/rejected": -0.31338736414909363, + "step": 2423 + }, + { + "epoch": 0.374869514788324, + "grad_norm": 4.958866119384766, + "learning_rate": 4.8613816015580255e-06, + "logits/chosen": 10.249595642089844, + "logits/rejected": 0.1355055570602417, + "logps/chosen": -347.5803527832031, + "logps/rejected": -247.2126922607422, + "loss": 0.5016, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.09505057334899902, + "rewards/margins": 0.4816757142543793, + "rewards/rejected": -0.38662517070770264, + "step": 2424 + }, + { + "epoch": 0.37502416392808813, + "grad_norm": 6.1127424240112305, + "learning_rate": 4.861095199908351e-06, + "logits/chosen": 11.60125732421875, + "logits/rejected": 9.832794189453125, + "logps/chosen": -767.0415649414062, + "logps/rejected": -606.1099243164062, + "loss": 0.5094, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.5839893221855164, + "rewards/margins": 0.4297370910644531, + "rewards/rejected": 0.15425223112106323, + "step": 2425 + }, + { + "epoch": 0.37517881306785233, + "grad_norm": 4.395288467407227, + "learning_rate": 4.860808798258678e-06, + "logits/chosen": 1.1999735832214355, + "logits/rejected": 4.366621971130371, + "logps/chosen": -201.7080841064453, + "logps/rejected": -238.09596252441406, + "loss": 0.6664, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.14504578709602356, + "rewards/margins": 0.07806803286075592, + "rewards/rejected": -0.22311383485794067, + "step": 2426 + }, + { + "epoch": 0.3753334622076165, + "grad_norm": 6.2342753410339355, + "learning_rate": 4.860522396609005e-06, + "logits/chosen": 5.233345031738281, + "logits/rejected": 5.837580680847168, + "logps/chosen": -215.23550415039062, + "logps/rejected": -286.7138671875, + "loss": 0.7454, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.1587119996547699, + "rewards/margins": 0.019587822258472443, + "rewards/rejected": -0.17829981446266174, + "step": 2427 + }, + { + "epoch": 0.37548811134738064, + "grad_norm": 5.123315811157227, + "learning_rate": 4.860235994959331e-06, + "logits/chosen": 11.131426811218262, + "logits/rejected": 8.473309516906738, + "logps/chosen": -197.91627502441406, + "logps/rejected": -202.65553283691406, + "loss": 0.6451, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.2181018888950348, + "rewards/margins": 0.13306578993797302, + "rewards/rejected": -0.3511676788330078, + "step": 2428 + }, + { + "epoch": 0.3756427604871448, + "grad_norm": 6.883605003356934, + "learning_rate": 4.859949593309658e-06, + "logits/chosen": 8.799736976623535, + "logits/rejected": 9.347648620605469, + "logps/chosen": -324.57952880859375, + "logps/rejected": -336.3131408691406, + "loss": 0.77, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.09100465476512909, + "rewards/margins": -0.06209847331047058, + "rewards/rejected": -0.028906196355819702, + "step": 2429 + }, + { + "epoch": 0.37579740962690894, + "grad_norm": 4.618454456329346, + "learning_rate": 4.859663191659985e-06, + "logits/chosen": 10.595348358154297, + "logits/rejected": 8.491671562194824, + "logps/chosen": -296.3447265625, + "logps/rejected": -270.54449462890625, + "loss": 0.5738, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.03315233811736107, + "rewards/margins": 0.33365505933761597, + "rewards/rejected": -0.36680740118026733, + "step": 2430 + }, + { + "epoch": 0.3759520587666731, + "grad_norm": 5.708675384521484, + "learning_rate": 4.85937679001031e-06, + "logits/chosen": 7.179444313049316, + "logits/rejected": 5.314072608947754, + "logps/chosen": -188.41534423828125, + "logps/rejected": -206.8900909423828, + "loss": 0.7006, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.2678079605102539, + "rewards/margins": 0.08242912590503693, + "rewards/rejected": -0.350237101316452, + "step": 2431 + }, + { + "epoch": 0.37610670790643724, + "grad_norm": 4.498577117919922, + "learning_rate": 4.859090388360637e-06, + "logits/chosen": 7.766833305358887, + "logits/rejected": -1.2014284133911133, + "logps/chosen": -277.64276123046875, + "logps/rejected": -139.09707641601562, + "loss": 0.727, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.07459845393896103, + "rewards/margins": 0.03489711880683899, + "rewards/rejected": -0.10949559509754181, + "step": 2432 + }, + { + "epoch": 0.37626135704620145, + "grad_norm": 5.34918737411499, + "learning_rate": 4.858803986710964e-06, + "logits/chosen": 2.9574499130249023, + "logits/rejected": 3.4283151626586914, + "logps/chosen": -268.2252197265625, + "logps/rejected": -314.6051025390625, + "loss": 0.6752, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.04532814398407936, + "rewards/margins": 0.08371267467737198, + "rewards/rejected": -0.03838452696800232, + "step": 2433 + }, + { + "epoch": 0.3764160061859656, + "grad_norm": 5.0474395751953125, + "learning_rate": 4.85851758506129e-06, + "logits/chosen": 5.905858516693115, + "logits/rejected": 4.116021156311035, + "logps/chosen": -210.8483123779297, + "logps/rejected": -264.8016662597656, + "loss": 0.6722, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.15677852928638458, + "rewards/margins": 0.1294841766357422, + "rewards/rejected": -0.28626272082328796, + "step": 2434 + }, + { + "epoch": 0.37657065532572975, + "grad_norm": 8.921321868896484, + "learning_rate": 4.858231183411617e-06, + "logits/chosen": 9.516305923461914, + "logits/rejected": 8.695402145385742, + "logps/chosen": -342.0606689453125, + "logps/rejected": -241.3375701904297, + "loss": 0.8331, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.14257775247097015, + "rewards/margins": -0.19309987127780914, + "rewards/rejected": 0.05052214860916138, + "step": 2435 + }, + { + "epoch": 0.3767253044654939, + "grad_norm": 7.870149612426758, + "learning_rate": 4.857944781761944e-06, + "logits/chosen": 11.549324035644531, + "logits/rejected": 7.833742618560791, + "logps/chosen": -425.5721130371094, + "logps/rejected": -362.63372802734375, + "loss": 0.7933, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.21955886483192444, + "rewards/margins": -0.036484137177467346, + "rewards/rejected": 0.256043016910553, + "step": 2436 + }, + { + "epoch": 0.37687995360525806, + "grad_norm": 4.738528728485107, + "learning_rate": 4.85765838011227e-06, + "logits/chosen": 14.041961669921875, + "logits/rejected": 10.339014053344727, + "logps/chosen": -249.52871704101562, + "logps/rejected": -181.6715087890625, + "loss": 0.6657, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.020669549703598022, + "rewards/margins": 0.11426806449890137, + "rewards/rejected": -0.09359850734472275, + "step": 2437 + }, + { + "epoch": 0.3770346027450222, + "grad_norm": 4.823914527893066, + "learning_rate": 4.857371978462596e-06, + "logits/chosen": 7.354625701904297, + "logits/rejected": 12.840993881225586, + "logps/chosen": -101.60289764404297, + "logps/rejected": -157.95004272460938, + "loss": 0.8454, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.2161734700202942, + "rewards/margins": -0.22304527461528778, + "rewards/rejected": 0.006871797144412994, + "step": 2438 + }, + { + "epoch": 0.3771892518847864, + "grad_norm": 6.762789249420166, + "learning_rate": 4.857085576812923e-06, + "logits/chosen": 4.13044548034668, + "logits/rejected": 2.03116512298584, + "logps/chosen": -288.9937744140625, + "logps/rejected": -257.204833984375, + "loss": 0.7641, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.09213514626026154, + "rewards/margins": -0.05110941827297211, + "rewards/rejected": -0.04102573171257973, + "step": 2439 + }, + { + "epoch": 0.37734390102455057, + "grad_norm": 6.0918803215026855, + "learning_rate": 4.856799175163249e-06, + "logits/chosen": 14.06298542022705, + "logits/rejected": 10.342275619506836, + "logps/chosen": -300.19561767578125, + "logps/rejected": -284.68487548828125, + "loss": 0.6534, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.1113671213388443, + "rewards/margins": 0.11857785284519196, + "rewards/rejected": -0.007210716605186462, + "step": 2440 + }, + { + "epoch": 0.3774985501643147, + "grad_norm": 5.471554756164551, + "learning_rate": 4.856512773513576e-06, + "logits/chosen": 12.553479194641113, + "logits/rejected": 3.8274283409118652, + "logps/chosen": -362.3578796386719, + "logps/rejected": -190.26821899414062, + "loss": 0.6007, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.05194535851478577, + "rewards/margins": 0.22257833182811737, + "rewards/rejected": -0.1706329882144928, + "step": 2441 + }, + { + "epoch": 0.37765319930407887, + "grad_norm": 6.16045618057251, + "learning_rate": 4.856226371863903e-06, + "logits/chosen": 9.616735458374023, + "logits/rejected": 8.711997985839844, + "logps/chosen": -199.96336364746094, + "logps/rejected": -198.77276611328125, + "loss": 0.6736, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.19526158273220062, + "rewards/margins": 0.12442216277122498, + "rewards/rejected": -0.3196837306022644, + "step": 2442 + }, + { + "epoch": 0.377807848443843, + "grad_norm": 6.503495216369629, + "learning_rate": 4.855939970214229e-06, + "logits/chosen": 5.232485294342041, + "logits/rejected": 5.998983860015869, + "logps/chosen": -391.4177551269531, + "logps/rejected": -344.562255859375, + "loss": 0.5262, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.430552214384079, + "rewards/margins": 0.4784107208251953, + "rewards/rejected": -0.047858476638793945, + "step": 2443 + }, + { + "epoch": 0.37796249758360717, + "grad_norm": 4.429693222045898, + "learning_rate": 4.855653568564555e-06, + "logits/chosen": 14.097070693969727, + "logits/rejected": 3.1066911220550537, + "logps/chosen": -283.040771484375, + "logps/rejected": -140.26065063476562, + "loss": 0.6073, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.18656319379806519, + "rewards/margins": 0.22773678600788116, + "rewards/rejected": -0.41429999470710754, + "step": 2444 + }, + { + "epoch": 0.3781171467233714, + "grad_norm": 5.513053894042969, + "learning_rate": 4.855367166914882e-06, + "logits/chosen": 4.974415302276611, + "logits/rejected": 7.12093448638916, + "logps/chosen": -225.90109252929688, + "logps/rejected": -246.17764282226562, + "loss": 0.6731, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.09877672791481018, + "rewards/margins": 0.09649492800235748, + "rewards/rejected": 0.0022817999124526978, + "step": 2445 + }, + { + "epoch": 0.37827179586313553, + "grad_norm": 4.6857829093933105, + "learning_rate": 4.8550807652652085e-06, + "logits/chosen": 5.275303840637207, + "logits/rejected": 10.315199851989746, + "logps/chosen": -151.8195343017578, + "logps/rejected": -182.37969970703125, + "loss": 0.693, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.21341568231582642, + "rewards/margins": 0.06520751118659973, + "rewards/rejected": -0.27862319350242615, + "step": 2446 + }, + { + "epoch": 0.3784264450028997, + "grad_norm": 17.664249420166016, + "learning_rate": 4.854794363615535e-06, + "logits/chosen": 5.6250715255737305, + "logits/rejected": 3.5292861461639404, + "logps/chosen": -325.349609375, + "logps/rejected": -256.1471862792969, + "loss": 0.7529, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.06405648589134216, + "rewards/margins": -0.07598079741001129, + "rewards/rejected": 0.14003726840019226, + "step": 2447 + }, + { + "epoch": 0.37858109414266383, + "grad_norm": 10.483590126037598, + "learning_rate": 4.854507961965862e-06, + "logits/chosen": 12.203974723815918, + "logits/rejected": 10.922534942626953, + "logps/chosen": -227.8511505126953, + "logps/rejected": -169.13790893554688, + "loss": 0.6498, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.09709373116493225, + "rewards/margins": 0.1330793797969818, + "rewards/rejected": -0.23017311096191406, + "step": 2448 + }, + { + "epoch": 0.378735743282428, + "grad_norm": 11.239873886108398, + "learning_rate": 4.8542215603161876e-06, + "logits/chosen": 7.109556198120117, + "logits/rejected": 7.122159957885742, + "logps/chosen": -376.6725158691406, + "logps/rejected": -348.2467956542969, + "loss": 0.6928, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.19504088163375854, + "rewards/margins": 0.0822812169790268, + "rewards/rejected": 0.11275969445705414, + "step": 2449 + }, + { + "epoch": 0.37889039242219213, + "grad_norm": 5.834151268005371, + "learning_rate": 4.853935158666514e-06, + "logits/chosen": 11.965173721313477, + "logits/rejected": 6.965633392333984, + "logps/chosen": -371.366943359375, + "logps/rejected": -285.85308837890625, + "loss": 0.6524, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.18527603149414062, + "rewards/margins": 0.11812802404165268, + "rewards/rejected": 0.06714802980422974, + "step": 2450 + }, + { + "epoch": 0.3790450415619563, + "grad_norm": 9.095908164978027, + "learning_rate": 4.853648757016841e-06, + "logits/chosen": 7.213448524475098, + "logits/rejected": 8.360918998718262, + "logps/chosen": -254.16258239746094, + "logps/rejected": -268.68841552734375, + "loss": 0.7383, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.054283831268548965, + "rewards/margins": -0.04590270295739174, + "rewards/rejected": 0.10018652677536011, + "step": 2451 + }, + { + "epoch": 0.3791996907017205, + "grad_norm": 8.800716400146484, + "learning_rate": 4.8533623553671675e-06, + "logits/chosen": 5.24588680267334, + "logits/rejected": 5.768891334533691, + "logps/chosen": -191.69677734375, + "logps/rejected": -211.8994598388672, + "loss": 0.6129, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.0009166207164525986, + "rewards/margins": 0.23153649270534515, + "rewards/rejected": -0.230619877576828, + "step": 2452 + }, + { + "epoch": 0.37935433984148464, + "grad_norm": 5.712619781494141, + "learning_rate": 4.853075953717493e-06, + "logits/chosen": 13.179658889770508, + "logits/rejected": 13.897315979003906, + "logps/chosen": -435.13336181640625, + "logps/rejected": -484.74798583984375, + "loss": 0.5517, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.49636709690093994, + "rewards/margins": 0.5442224740982056, + "rewards/rejected": -0.047855377197265625, + "step": 2453 + }, + { + "epoch": 0.3795089889812488, + "grad_norm": 7.024582862854004, + "learning_rate": 4.85278955206782e-06, + "logits/chosen": 9.524391174316406, + "logits/rejected": 15.482491493225098, + "logps/chosen": -312.1413269042969, + "logps/rejected": -390.7240295410156, + "loss": 0.7866, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.007749810814857483, + "rewards/margins": -0.103228360414505, + "rewards/rejected": 0.11097818613052368, + "step": 2454 + }, + { + "epoch": 0.37966363812101295, + "grad_norm": 4.870251655578613, + "learning_rate": 4.852503150418147e-06, + "logits/chosen": 12.390083312988281, + "logits/rejected": 12.206705093383789, + "logps/chosen": -229.89613342285156, + "logps/rejected": -292.3168029785156, + "loss": 0.6252, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.15041351318359375, + "rewards/margins": 0.30681112408638, + "rewards/rejected": -0.15639764070510864, + "step": 2455 + }, + { + "epoch": 0.3798182872607771, + "grad_norm": 4.591142177581787, + "learning_rate": 4.852216748768473e-06, + "logits/chosen": 9.408971786499023, + "logits/rejected": 6.130207538604736, + "logps/chosen": -273.287841796875, + "logps/rejected": -245.18118286132812, + "loss": 0.6123, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.10999423265457153, + "rewards/margins": 0.2851707637310028, + "rewards/rejected": -0.17517653107643127, + "step": 2456 + }, + { + "epoch": 0.37997293640054125, + "grad_norm": 5.637016296386719, + "learning_rate": 4.8519303471188e-06, + "logits/chosen": 13.771567344665527, + "logits/rejected": 7.371804237365723, + "logps/chosen": -322.08746337890625, + "logps/rejected": -251.43751525878906, + "loss": 0.6241, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.0772399827837944, + "rewards/margins": 0.1934891790151596, + "rewards/rejected": -0.2707291841506958, + "step": 2457 + }, + { + "epoch": 0.38012758554030546, + "grad_norm": 15.902688026428223, + "learning_rate": 4.851643945469126e-06, + "logits/chosen": 10.44197940826416, + "logits/rejected": 7.801851272583008, + "logps/chosen": -333.44769287109375, + "logps/rejected": -323.7859802246094, + "loss": 0.674, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.1803637444972992, + "rewards/margins": 0.09277015924453735, + "rewards/rejected": 0.08759360760450363, + "step": 2458 + }, + { + "epoch": 0.3802822346800696, + "grad_norm": 6.0199408531188965, + "learning_rate": 4.851357543819452e-06, + "logits/chosen": 8.649627685546875, + "logits/rejected": 7.019885063171387, + "logps/chosen": -264.76947021484375, + "logps/rejected": -215.9906463623047, + "loss": 0.6809, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.0520319938659668, + "rewards/margins": 0.09728287905454636, + "rewards/rejected": -0.04525090008974075, + "step": 2459 + }, + { + "epoch": 0.38043688381983376, + "grad_norm": 4.741609573364258, + "learning_rate": 4.851071142169779e-06, + "logits/chosen": 13.077438354492188, + "logits/rejected": 7.652851581573486, + "logps/chosen": -356.4844665527344, + "logps/rejected": -272.8990478515625, + "loss": 0.6353, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.07975941151380539, + "rewards/margins": 0.18288972973823547, + "rewards/rejected": -0.1031302809715271, + "step": 2460 + }, + { + "epoch": 0.3805915329595979, + "grad_norm": 5.356295585632324, + "learning_rate": 4.850784740520106e-06, + "logits/chosen": 11.079095840454102, + "logits/rejected": 7.96843147277832, + "logps/chosen": -177.5699005126953, + "logps/rejected": -200.76068115234375, + "loss": 0.7554, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.18139201402664185, + "rewards/margins": 0.04228571057319641, + "rewards/rejected": -0.22367773950099945, + "step": 2461 + }, + { + "epoch": 0.38074618209936206, + "grad_norm": 6.1840972900390625, + "learning_rate": 4.850498338870432e-06, + "logits/chosen": 5.411108493804932, + "logits/rejected": 6.190216541290283, + "logps/chosen": -240.19711303710938, + "logps/rejected": -190.1748504638672, + "loss": 0.7966, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.07822180539369583, + "rewards/margins": -0.12411999702453613, + "rewards/rejected": 0.20234179496765137, + "step": 2462 + }, + { + "epoch": 0.3809008312391262, + "grad_norm": 4.115326404571533, + "learning_rate": 4.850211937220759e-06, + "logits/chosen": 10.64490032196045, + "logits/rejected": 5.670012474060059, + "logps/chosen": -299.14886474609375, + "logps/rejected": -327.2231750488281, + "loss": 0.416, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.2648065686225891, + "rewards/margins": 0.9659033417701721, + "rewards/rejected": -0.7010967135429382, + "step": 2463 + }, + { + "epoch": 0.38105548037889037, + "grad_norm": 4.51612663269043, + "learning_rate": 4.849925535571085e-06, + "logits/chosen": 8.579599380493164, + "logits/rejected": 6.506267547607422, + "logps/chosen": -248.03672790527344, + "logps/rejected": -188.42611694335938, + "loss": 0.5647, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.1253599226474762, + "rewards/margins": 0.31172657012939453, + "rewards/rejected": -0.18636667728424072, + "step": 2464 + }, + { + "epoch": 0.3812101295186546, + "grad_norm": 5.703372478485107, + "learning_rate": 4.8496391339214114e-06, + "logits/chosen": 5.371494293212891, + "logits/rejected": 8.029120445251465, + "logps/chosen": -282.0240783691406, + "logps/rejected": -277.1210632324219, + "loss": 0.6906, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.1797756552696228, + "rewards/margins": 0.046551287174224854, + "rewards/rejected": 0.13322439789772034, + "step": 2465 + }, + { + "epoch": 0.3813647786584187, + "grad_norm": 5.621588230133057, + "learning_rate": 4.849352732271738e-06, + "logits/chosen": 5.82005500793457, + "logits/rejected": 8.25497055053711, + "logps/chosen": -256.2855224609375, + "logps/rejected": -251.10400390625, + "loss": 0.6562, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.20535221695899963, + "rewards/margins": 0.19338560104370117, + "rewards/rejected": 0.011966601014137268, + "step": 2466 + }, + { + "epoch": 0.3815194277981829, + "grad_norm": 36.747825622558594, + "learning_rate": 4.849066330622065e-06, + "logits/chosen": 15.000533103942871, + "logits/rejected": 9.875008583068848, + "logps/chosen": -308.3725891113281, + "logps/rejected": -317.973876953125, + "loss": 0.5727, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.18519611656665802, + "rewards/margins": 0.35210517048835754, + "rewards/rejected": -0.16690905392169952, + "step": 2467 + }, + { + "epoch": 0.381674076937947, + "grad_norm": 5.15939474105835, + "learning_rate": 4.848779928972391e-06, + "logits/chosen": 9.518121719360352, + "logits/rejected": 3.5567731857299805, + "logps/chosen": -364.44757080078125, + "logps/rejected": -281.7632141113281, + "loss": 0.6447, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.27831774950027466, + "rewards/margins": 0.18005478382110596, + "rewards/rejected": 0.0982629805803299, + "step": 2468 + }, + { + "epoch": 0.3818287260777112, + "grad_norm": 5.8277506828308105, + "learning_rate": 4.848493527322718e-06, + "logits/chosen": 8.182151794433594, + "logits/rejected": 4.919912338256836, + "logps/chosen": -291.82757568359375, + "logps/rejected": -301.93280029296875, + "loss": 0.6882, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.16954413056373596, + "rewards/margins": 0.06865277141332626, + "rewards/rejected": 0.1008913516998291, + "step": 2469 + }, + { + "epoch": 0.38198337521747533, + "grad_norm": 7.3568115234375, + "learning_rate": 4.848207125673045e-06, + "logits/chosen": 9.19233226776123, + "logits/rejected": 5.596482276916504, + "logps/chosen": -397.4821472167969, + "logps/rejected": -285.5025939941406, + "loss": 0.7785, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.2325807511806488, + "rewards/margins": -0.005638808012008667, + "rewards/rejected": 0.23821955919265747, + "step": 2470 + }, + { + "epoch": 0.38213802435723954, + "grad_norm": 19.54067611694336, + "learning_rate": 4.8479207240233705e-06, + "logits/chosen": 4.173725128173828, + "logits/rejected": 1.8053724765777588, + "logps/chosen": -428.13427734375, + "logps/rejected": -344.1834716796875, + "loss": 0.7293, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.44601812958717346, + "rewards/margins": 0.024462975561618805, + "rewards/rejected": 0.42155516147613525, + "step": 2471 + }, + { + "epoch": 0.3822926734970037, + "grad_norm": 4.856367588043213, + "learning_rate": 4.847634322373697e-06, + "logits/chosen": 12.701238632202148, + "logits/rejected": 15.662541389465332, + "logps/chosen": -146.4423828125, + "logps/rejected": -178.07342529296875, + "loss": 0.8074, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.1506718248128891, + "rewards/margins": -0.19828256964683533, + "rewards/rejected": 0.04761075600981712, + "step": 2472 + }, + { + "epoch": 0.38244732263676784, + "grad_norm": 6.804742813110352, + "learning_rate": 4.847347920724024e-06, + "logits/chosen": 7.160064697265625, + "logits/rejected": 2.0367302894592285, + "logps/chosen": -269.0357666015625, + "logps/rejected": -201.4021759033203, + "loss": 0.6672, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.27771008014678955, + "rewards/margins": 0.09715823829174042, + "rewards/rejected": 0.18055182695388794, + "step": 2473 + }, + { + "epoch": 0.382601971776532, + "grad_norm": 5.815539360046387, + "learning_rate": 4.8470615190743504e-06, + "logits/chosen": 4.998912811279297, + "logits/rejected": 5.028353214263916, + "logps/chosen": -246.58224487304688, + "logps/rejected": -313.84161376953125, + "loss": 0.8263, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.034151893109083176, + "rewards/margins": -0.08819104731082916, + "rewards/rejected": 0.12234293669462204, + "step": 2474 + }, + { + "epoch": 0.38275662091629614, + "grad_norm": 6.337403297424316, + "learning_rate": 4.846775117424677e-06, + "logits/chosen": 9.842008590698242, + "logits/rejected": 5.350711822509766, + "logps/chosen": -342.2659912109375, + "logps/rejected": -242.08145141601562, + "loss": 0.5456, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.2532733082771301, + "rewards/margins": 0.4525088667869568, + "rewards/rejected": -0.19923552870750427, + "step": 2475 + }, + { + "epoch": 0.3829112700560603, + "grad_norm": 5.715733051300049, + "learning_rate": 4.846488715775004e-06, + "logits/chosen": 8.07754898071289, + "logits/rejected": 8.362151145935059, + "logps/chosen": -272.25372314453125, + "logps/rejected": -207.8125, + "loss": 0.6929, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.21087270975112915, + "rewards/margins": 0.14852365851402283, + "rewards/rejected": 0.06234902888536453, + "step": 2476 + }, + { + "epoch": 0.3830659191958245, + "grad_norm": 5.360029697418213, + "learning_rate": 4.8462023141253295e-06, + "logits/chosen": 8.912494659423828, + "logits/rejected": 12.23288345336914, + "logps/chosen": -274.897216796875, + "logps/rejected": -328.8795166015625, + "loss": 0.6353, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.28889456391334534, + "rewards/margins": 0.2081594169139862, + "rewards/rejected": 0.08073515444993973, + "step": 2477 + }, + { + "epoch": 0.38322056833558865, + "grad_norm": 8.825865745544434, + "learning_rate": 4.845915912475656e-06, + "logits/chosen": 3.672886848449707, + "logits/rejected": 5.712207794189453, + "logps/chosen": -229.70065307617188, + "logps/rejected": -348.7716064453125, + "loss": 0.6415, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.09264783561229706, + "rewards/margins": 0.2627146244049072, + "rewards/rejected": -0.17006680369377136, + "step": 2478 + }, + { + "epoch": 0.3833752174753528, + "grad_norm": 3.5883078575134277, + "learning_rate": 4.845629510825983e-06, + "logits/chosen": 12.030282020568848, + "logits/rejected": 4.84902286529541, + "logps/chosen": -209.53594970703125, + "logps/rejected": -129.4523162841797, + "loss": 0.5904, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.07043170183897018, + "rewards/margins": 0.27338847517967224, + "rewards/rejected": -0.20295676589012146, + "step": 2479 + }, + { + "epoch": 0.38352986661511695, + "grad_norm": 4.4601731300354, + "learning_rate": 4.8453431091763095e-06, + "logits/chosen": 11.873638153076172, + "logits/rejected": 9.215685844421387, + "logps/chosen": -130.216064453125, + "logps/rejected": -109.36691284179688, + "loss": 0.7171, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.19904479384422302, + "rewards/margins": 0.024827249348163605, + "rewards/rejected": -0.22387205064296722, + "step": 2480 + }, + { + "epoch": 0.3836845157548811, + "grad_norm": 6.0697221755981445, + "learning_rate": 4.845056707526636e-06, + "logits/chosen": 7.541437149047852, + "logits/rejected": 7.243613243103027, + "logps/chosen": -165.21231079101562, + "logps/rejected": -193.79469299316406, + "loss": 0.845, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.06775610893964767, + "rewards/margins": -0.18812447786331177, + "rewards/rejected": 0.12036838382482529, + "step": 2481 + }, + { + "epoch": 0.38383916489464526, + "grad_norm": 6.477343559265137, + "learning_rate": 4.844770305876963e-06, + "logits/chosen": 6.0937886238098145, + "logits/rejected": 5.872834205627441, + "logps/chosen": -290.815673828125, + "logps/rejected": -307.48541259765625, + "loss": 0.708, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.037440016865730286, + "rewards/margins": 0.047222621738910675, + "rewards/rejected": -0.08466263115406036, + "step": 2482 + }, + { + "epoch": 0.3839938140344094, + "grad_norm": 4.505267143249512, + "learning_rate": 4.844483904227289e-06, + "logits/chosen": 11.938836097717285, + "logits/rejected": 6.161751747131348, + "logps/chosen": -235.750244140625, + "logps/rejected": -174.49386596679688, + "loss": 0.6523, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.07571306079626083, + "rewards/margins": 0.1427125334739685, + "rewards/rejected": -0.06699948757886887, + "step": 2483 + }, + { + "epoch": 0.3841484631741736, + "grad_norm": 5.465560436248779, + "learning_rate": 4.844197502577615e-06, + "logits/chosen": 8.367072105407715, + "logits/rejected": 4.855196952819824, + "logps/chosen": -237.4248809814453, + "logps/rejected": -219.48329162597656, + "loss": 0.7158, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.11514270305633545, + "rewards/margins": 0.10784518718719482, + "rewards/rejected": 0.00729752704501152, + "step": 2484 + }, + { + "epoch": 0.38430311231393777, + "grad_norm": 5.986687660217285, + "learning_rate": 4.843911100927942e-06, + "logits/chosen": 7.6505913734436035, + "logits/rejected": 5.7743659019470215, + "logps/chosen": -259.51507568359375, + "logps/rejected": -233.96424865722656, + "loss": 0.6931, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.24423623085021973, + "rewards/margins": 0.1518401801586151, + "rewards/rejected": 0.09239606559276581, + "step": 2485 + }, + { + "epoch": 0.3844577614537019, + "grad_norm": 5.960478782653809, + "learning_rate": 4.8436246992782685e-06, + "logits/chosen": 14.987554550170898, + "logits/rejected": 12.703981399536133, + "logps/chosen": -341.366943359375, + "logps/rejected": -366.9012451171875, + "loss": 0.6442, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.15539970993995667, + "rewards/margins": 0.16267462074756622, + "rewards/rejected": -0.007274903357028961, + "step": 2486 + }, + { + "epoch": 0.38461241059346607, + "grad_norm": 5.444278717041016, + "learning_rate": 4.843338297628594e-06, + "logits/chosen": 5.063711166381836, + "logits/rejected": 3.09645938873291, + "logps/chosen": -187.91961669921875, + "logps/rejected": -200.99444580078125, + "loss": 0.7127, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.022503569722175598, + "rewards/margins": 0.046679943799972534, + "rewards/rejected": -0.02417636290192604, + "step": 2487 + }, + { + "epoch": 0.3847670597332302, + "grad_norm": 7.042248249053955, + "learning_rate": 4.843051895978921e-06, + "logits/chosen": 12.392899513244629, + "logits/rejected": 10.489843368530273, + "logps/chosen": -368.0589599609375, + "logps/rejected": -347.74932861328125, + "loss": 0.6701, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.3248254060745239, + "rewards/margins": 0.09664908051490784, + "rewards/rejected": 0.2281763106584549, + "step": 2488 + }, + { + "epoch": 0.3849217088729944, + "grad_norm": 5.045039176940918, + "learning_rate": 4.842765494329248e-06, + "logits/chosen": 10.553121566772461, + "logits/rejected": 7.557656764984131, + "logps/chosen": -272.0636291503906, + "logps/rejected": -231.60507202148438, + "loss": 0.6309, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.3065963685512543, + "rewards/margins": 0.24395127594470978, + "rewards/rejected": 0.0626450926065445, + "step": 2489 + }, + { + "epoch": 0.3850763580127586, + "grad_norm": 6.289703369140625, + "learning_rate": 4.842479092679574e-06, + "logits/chosen": 6.633242607116699, + "logits/rejected": -0.1066126823425293, + "logps/chosen": -318.16986083984375, + "logps/rejected": -186.30484008789062, + "loss": 0.5141, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.061443231999874115, + "rewards/margins": 0.5793236494064331, + "rewards/rejected": -0.5178803205490112, + "step": 2490 + }, + { + "epoch": 0.38523100715252273, + "grad_norm": 5.394789695739746, + "learning_rate": 4.842192691029901e-06, + "logits/chosen": 8.646957397460938, + "logits/rejected": 7.70578670501709, + "logps/chosen": -262.5252990722656, + "logps/rejected": -265.5787353515625, + "loss": 0.6112, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.2964068055152893, + "rewards/margins": 0.18525764346122742, + "rewards/rejected": 0.1111491322517395, + "step": 2491 + }, + { + "epoch": 0.3853856562922869, + "grad_norm": 6.342586994171143, + "learning_rate": 4.841906289380227e-06, + "logits/chosen": 9.51095962524414, + "logits/rejected": 9.00922966003418, + "logps/chosen": -263.4512634277344, + "logps/rejected": -284.76458740234375, + "loss": 0.7145, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.007275670766830444, + "rewards/margins": 0.02656612917780876, + "rewards/rejected": -0.019290447235107422, + "step": 2492 + }, + { + "epoch": 0.38554030543205103, + "grad_norm": 5.253087043762207, + "learning_rate": 4.841619887730553e-06, + "logits/chosen": 7.767867088317871, + "logits/rejected": 7.674994468688965, + "logps/chosen": -205.35792541503906, + "logps/rejected": -202.5142822265625, + "loss": 0.6979, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.08806352317333221, + "rewards/margins": 0.10078594088554382, + "rewards/rejected": -0.012722387909889221, + "step": 2493 + }, + { + "epoch": 0.3856949545718152, + "grad_norm": 5.032226085662842, + "learning_rate": 4.84133348608088e-06, + "logits/chosen": 5.237201690673828, + "logits/rejected": 12.96186637878418, + "logps/chosen": -205.50302124023438, + "logps/rejected": -314.1517333984375, + "loss": 0.641, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.003928437829017639, + "rewards/margins": 0.21162468194961548, + "rewards/rejected": -0.20769624412059784, + "step": 2494 + }, + { + "epoch": 0.38584960371157934, + "grad_norm": 10.512274742126465, + "learning_rate": 4.841047084431207e-06, + "logits/chosen": 13.693273544311523, + "logits/rejected": 12.609396934509277, + "logps/chosen": -348.7419738769531, + "logps/rejected": -311.8406982421875, + "loss": 0.6647, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.2086874097585678, + "rewards/margins": 0.2260618507862091, + "rewards/rejected": -0.017374418675899506, + "step": 2495 + }, + { + "epoch": 0.3860042528513435, + "grad_norm": 5.769652366638184, + "learning_rate": 4.840760682781533e-06, + "logits/chosen": 10.47608470916748, + "logits/rejected": 7.6666669845581055, + "logps/chosen": -507.9436950683594, + "logps/rejected": -390.9814758300781, + "loss": 0.5936, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.5202432870864868, + "rewards/margins": 0.29551029205322266, + "rewards/rejected": 0.22473296523094177, + "step": 2496 + }, + { + "epoch": 0.3861589019911077, + "grad_norm": 5.407959938049316, + "learning_rate": 4.840474281131859e-06, + "logits/chosen": 8.83450698852539, + "logits/rejected": 7.334245681762695, + "logps/chosen": -352.4127197265625, + "logps/rejected": -279.8899230957031, + "loss": 0.6927, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.0615481361746788, + "rewards/margins": 0.04724645987153053, + "rewards/rejected": 0.01430167630314827, + "step": 2497 + }, + { + "epoch": 0.38631355113087185, + "grad_norm": 3.314016342163086, + "learning_rate": 4.840187879482186e-06, + "logits/chosen": 10.245983123779297, + "logits/rejected": 0.1744976043701172, + "logps/chosen": -343.8799743652344, + "logps/rejected": -170.34158325195312, + "loss": 0.5391, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.33750298619270325, + "rewards/margins": 0.4623255729675293, + "rewards/rejected": -0.12482261657714844, + "step": 2498 + }, + { + "epoch": 0.386468200270636, + "grad_norm": 6.9106316566467285, + "learning_rate": 4.8399014778325125e-06, + "logits/chosen": 5.494254112243652, + "logits/rejected": 7.414394378662109, + "logps/chosen": -225.8417510986328, + "logps/rejected": -301.1886291503906, + "loss": 0.8414, + "rewards/accuracies": 0.25, + "rewards/chosen": -0.174442857503891, + "rewards/margins": -0.21674901247024536, + "rewards/rejected": 0.04230612516403198, + "step": 2499 + }, + { + "epoch": 0.38662284941040015, + "grad_norm": 4.893411159515381, + "learning_rate": 4.839615076182839e-06, + "logits/chosen": 8.914921760559082, + "logits/rejected": 5.699570178985596, + "logps/chosen": -204.90841674804688, + "logps/rejected": -216.63169860839844, + "loss": 0.6283, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.20216476917266846, + "rewards/margins": 0.14855243265628815, + "rewards/rejected": 0.05361231788992882, + "step": 2500 + }, + { + "epoch": 0.3867774985501643, + "grad_norm": 4.795331001281738, + "learning_rate": 4.839328674533166e-06, + "logits/chosen": 10.227174758911133, + "logits/rejected": 8.926871299743652, + "logps/chosen": -218.94346618652344, + "logps/rejected": -223.26644897460938, + "loss": 0.6811, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.2945233881473541, + "rewards/margins": 0.07627392560243607, + "rewards/rejected": 0.21824945509433746, + "step": 2501 + }, + { + "epoch": 0.38693214768992845, + "grad_norm": 5.449828624725342, + "learning_rate": 4.839042272883492e-06, + "logits/chosen": 7.624882698059082, + "logits/rejected": 2.345890760421753, + "logps/chosen": -359.16888427734375, + "logps/rejected": -282.7822570800781, + "loss": 0.5046, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.16612538695335388, + "rewards/margins": 0.46880584955215454, + "rewards/rejected": -0.30268049240112305, + "step": 2502 + }, + { + "epoch": 0.38708679682969266, + "grad_norm": 3.7475388050079346, + "learning_rate": 4.838755871233819e-06, + "logits/chosen": 9.804908752441406, + "logits/rejected": 2.026059865951538, + "logps/chosen": -201.38687133789062, + "logps/rejected": -124.04072570800781, + "loss": 0.6381, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.06194836646318436, + "rewards/margins": 0.1669876128435135, + "rewards/rejected": -0.10503923892974854, + "step": 2503 + }, + { + "epoch": 0.3872414459694568, + "grad_norm": 5.21207332611084, + "learning_rate": 4.838469469584145e-06, + "logits/chosen": 9.508917808532715, + "logits/rejected": 7.36872673034668, + "logps/chosen": -221.48583984375, + "logps/rejected": -193.13037109375, + "loss": 0.6793, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.30535149574279785, + "rewards/margins": 0.050060346722602844, + "rewards/rejected": -0.3554118573665619, + "step": 2504 + }, + { + "epoch": 0.38739609510922096, + "grad_norm": 5.580845355987549, + "learning_rate": 4.8381830679344715e-06, + "logits/chosen": 11.653223991394043, + "logits/rejected": -0.8787755966186523, + "logps/chosen": -359.5242614746094, + "logps/rejected": -245.37086486816406, + "loss": 0.5623, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.33282265067100525, + "rewards/margins": 0.374828577041626, + "rewards/rejected": -0.04200592264533043, + "step": 2505 + }, + { + "epoch": 0.3875507442489851, + "grad_norm": 5.351850986480713, + "learning_rate": 4.837896666284798e-06, + "logits/chosen": 12.371397972106934, + "logits/rejected": 8.496321678161621, + "logps/chosen": -293.55316162109375, + "logps/rejected": -218.14512634277344, + "loss": 0.6581, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.09970217198133469, + "rewards/margins": 0.16105279326438904, + "rewards/rejected": -0.061350636184215546, + "step": 2506 + }, + { + "epoch": 0.38770539338874926, + "grad_norm": 5.87265682220459, + "learning_rate": 4.837610264635125e-06, + "logits/chosen": 12.582254409790039, + "logits/rejected": 6.969643592834473, + "logps/chosen": -349.2928161621094, + "logps/rejected": -305.8788146972656, + "loss": 0.5681, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.3168983459472656, + "rewards/margins": 0.368699848651886, + "rewards/rejected": -0.051801517605781555, + "step": 2507 + }, + { + "epoch": 0.3878600425285134, + "grad_norm": 6.033074855804443, + "learning_rate": 4.8373238629854515e-06, + "logits/chosen": 6.3576765060424805, + "logits/rejected": 6.106639862060547, + "logps/chosen": -257.6221008300781, + "logps/rejected": -257.79742431640625, + "loss": 0.7602, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.023577619343996048, + "rewards/margins": 0.003074638545513153, + "rewards/rejected": -0.02665223926305771, + "step": 2508 + }, + { + "epoch": 0.3880146916682776, + "grad_norm": 3.952921152114868, + "learning_rate": 4.837037461335778e-06, + "logits/chosen": 14.972173690795898, + "logits/rejected": 8.424567222595215, + "logps/chosen": -254.81655883789062, + "logps/rejected": -218.99913024902344, + "loss": 0.5634, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.19369012117385864, + "rewards/margins": 0.3433961868286133, + "rewards/rejected": -0.14970608055591583, + "step": 2509 + }, + { + "epoch": 0.3881693408080418, + "grad_norm": 5.90886926651001, + "learning_rate": 4.836751059686104e-06, + "logits/chosen": 11.192469596862793, + "logits/rejected": 10.767522811889648, + "logps/chosen": -257.12591552734375, + "logps/rejected": -272.29693603515625, + "loss": 0.7189, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.32702887058258057, + "rewards/margins": 0.006801605224609375, + "rewards/rejected": 0.3202272653579712, + "step": 2510 + }, + { + "epoch": 0.3883239899478059, + "grad_norm": 5.425315856933594, + "learning_rate": 4.8364646580364306e-06, + "logits/chosen": 13.872915267944336, + "logits/rejected": 7.135364532470703, + "logps/chosen": -265.2104187011719, + "logps/rejected": -195.73187255859375, + "loss": 0.6287, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.16727611422538757, + "rewards/margins": 0.20995265245437622, + "rewards/rejected": -0.04267653822898865, + "step": 2511 + }, + { + "epoch": 0.3884786390875701, + "grad_norm": 6.344537734985352, + "learning_rate": 4.836178256386757e-06, + "logits/chosen": 6.726906776428223, + "logits/rejected": 11.165605545043945, + "logps/chosen": -174.9613800048828, + "logps/rejected": -220.12994384765625, + "loss": 0.9263, + "rewards/accuracies": 0.25, + "rewards/chosen": -0.37487179040908813, + "rewards/margins": -0.375477135181427, + "rewards/rejected": 0.0006053522229194641, + "step": 2512 + }, + { + "epoch": 0.38863328822733423, + "grad_norm": 6.683211803436279, + "learning_rate": 4.835891854737084e-06, + "logits/chosen": 11.192131042480469, + "logits/rejected": 10.389376640319824, + "logps/chosen": -368.70855712890625, + "logps/rejected": -358.7967529296875, + "loss": 0.6662, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.13360156118869781, + "rewards/margins": 0.13726729154586792, + "rewards/rejected": -0.003665730357170105, + "step": 2513 + }, + { + "epoch": 0.3887879373670984, + "grad_norm": 4.948020935058594, + "learning_rate": 4.8356054530874105e-06, + "logits/chosen": 7.507864475250244, + "logits/rejected": 4.774501800537109, + "logps/chosen": -231.83563232421875, + "logps/rejected": -231.54940795898438, + "loss": 0.6304, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.1089787483215332, + "rewards/margins": 0.16259777545928955, + "rewards/rejected": -0.05361901968717575, + "step": 2514 + }, + { + "epoch": 0.38894258650686253, + "grad_norm": 5.454336643218994, + "learning_rate": 4.835319051437737e-06, + "logits/chosen": 7.162698745727539, + "logits/rejected": 9.290777206420898, + "logps/chosen": -347.8484191894531, + "logps/rejected": -345.4284362792969, + "loss": 0.5257, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.2902758717536926, + "rewards/margins": 0.4465555250644684, + "rewards/rejected": -0.15627965331077576, + "step": 2515 + }, + { + "epoch": 0.38909723564662674, + "grad_norm": 5.687977313995361, + "learning_rate": 4.835032649788064e-06, + "logits/chosen": 9.118745803833008, + "logits/rejected": 10.99682331085205, + "logps/chosen": -278.5380859375, + "logps/rejected": -344.48211669921875, + "loss": 0.777, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.16848069429397583, + "rewards/margins": -0.13640569150447845, + "rewards/rejected": 0.3048864006996155, + "step": 2516 + }, + { + "epoch": 0.3892518847863909, + "grad_norm": 6.410073757171631, + "learning_rate": 4.83474624813839e-06, + "logits/chosen": 10.351490020751953, + "logits/rejected": 2.3732974529266357, + "logps/chosen": -315.24359130859375, + "logps/rejected": -167.97608947753906, + "loss": 0.5684, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.2100776880979538, + "rewards/margins": 0.39044898748397827, + "rewards/rejected": -0.18037129938602448, + "step": 2517 + }, + { + "epoch": 0.38940653392615504, + "grad_norm": 7.10020637512207, + "learning_rate": 4.834459846488716e-06, + "logits/chosen": 9.872756958007812, + "logits/rejected": 14.278976440429688, + "logps/chosen": -280.0938720703125, + "logps/rejected": -345.4791259765625, + "loss": 0.8875, + "rewards/accuracies": 0.25, + "rewards/chosen": -0.17021316289901733, + "rewards/margins": -0.2938002347946167, + "rewards/rejected": 0.12358702719211578, + "step": 2518 + }, + { + "epoch": 0.3895611830659192, + "grad_norm": 5.2860236167907715, + "learning_rate": 4.834173444839043e-06, + "logits/chosen": 10.695123672485352, + "logits/rejected": 7.5514116287231445, + "logps/chosen": -253.3675537109375, + "logps/rejected": -164.79969787597656, + "loss": 0.6995, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.15195079147815704, + "rewards/margins": 0.03273802995681763, + "rewards/rejected": 0.11921275407075882, + "step": 2519 + }, + { + "epoch": 0.38971583220568334, + "grad_norm": 7.132540225982666, + "learning_rate": 4.83388704318937e-06, + "logits/chosen": 4.442205905914307, + "logits/rejected": 0.9681739211082458, + "logps/chosen": -318.96722412109375, + "logps/rejected": -290.3237609863281, + "loss": 0.5952, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.23839467763900757, + "rewards/margins": 0.4139556884765625, + "rewards/rejected": -0.17556098103523254, + "step": 2520 + }, + { + "epoch": 0.3898704813454475, + "grad_norm": 20.26692771911621, + "learning_rate": 4.833600641539695e-06, + "logits/chosen": 9.219143867492676, + "logits/rejected": 9.805583953857422, + "logps/chosen": -395.576171875, + "logps/rejected": -369.728759765625, + "loss": 0.6226, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.442513108253479, + "rewards/margins": 0.17555636167526245, + "rewards/rejected": 0.26695671677589417, + "step": 2521 + }, + { + "epoch": 0.3900251304852117, + "grad_norm": 5.81402587890625, + "learning_rate": 4.833314239890022e-06, + "logits/chosen": 10.73791217803955, + "logits/rejected": 10.264984130859375, + "logps/chosen": -362.8399658203125, + "logps/rejected": -313.6953125, + "loss": 0.7329, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.18623968958854675, + "rewards/margins": 0.09422679245471954, + "rewards/rejected": 0.09201288223266602, + "step": 2522 + }, + { + "epoch": 0.39017977962497585, + "grad_norm": 6.399228096008301, + "learning_rate": 4.833027838240349e-06, + "logits/chosen": 5.795349597930908, + "logits/rejected": 10.752036094665527, + "logps/chosen": -251.02041625976562, + "logps/rejected": -336.62774658203125, + "loss": 0.7317, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.2808384895324707, + "rewards/margins": 0.027870308607816696, + "rewards/rejected": 0.2529681921005249, + "step": 2523 + }, + { + "epoch": 0.39033442876474, + "grad_norm": 4.551957130432129, + "learning_rate": 4.832741436590675e-06, + "logits/chosen": 8.038311004638672, + "logits/rejected": 4.359333515167236, + "logps/chosen": -267.29779052734375, + "logps/rejected": -271.3774719238281, + "loss": 0.6954, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.11396326124668121, + "rewards/margins": 0.15602554380893707, + "rewards/rejected": -0.04206228256225586, + "step": 2524 + }, + { + "epoch": 0.39048907790450416, + "grad_norm": 6.611552715301514, + "learning_rate": 4.832455034941001e-06, + "logits/chosen": 7.531148433685303, + "logits/rejected": 5.5594482421875, + "logps/chosen": -367.160888671875, + "logps/rejected": -371.1960144042969, + "loss": 0.6031, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.3096430003643036, + "rewards/margins": 0.4614374041557312, + "rewards/rejected": -0.15179443359375, + "step": 2525 + }, + { + "epoch": 0.3906437270442683, + "grad_norm": 4.8326239585876465, + "learning_rate": 4.832168633291328e-06, + "logits/chosen": 12.293390274047852, + "logits/rejected": 13.100914001464844, + "logps/chosen": -208.90545654296875, + "logps/rejected": -263.8213806152344, + "loss": 0.6037, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.00018724799156188965, + "rewards/margins": 0.2105756253004074, + "rewards/rejected": -0.21038836240768433, + "step": 2526 + }, + { + "epoch": 0.39079837618403246, + "grad_norm": 4.485903739929199, + "learning_rate": 4.8318822316416544e-06, + "logits/chosen": 5.969025611877441, + "logits/rejected": 9.473962783813477, + "logps/chosen": -228.4339141845703, + "logps/rejected": -265.7933044433594, + "loss": 0.6509, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.09652829170227051, + "rewards/margins": 0.15099148452281952, + "rewards/rejected": -0.24751979112625122, + "step": 2527 + }, + { + "epoch": 0.3909530253237966, + "grad_norm": 4.098492622375488, + "learning_rate": 4.831595829991981e-06, + "logits/chosen": 10.857067108154297, + "logits/rejected": 6.082857608795166, + "logps/chosen": -309.5762023925781, + "logps/rejected": -260.5160217285156, + "loss": 0.5213, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.45469456911087036, + "rewards/margins": 0.4366970360279083, + "rewards/rejected": 0.017997555434703827, + "step": 2528 + }, + { + "epoch": 0.3911076744635608, + "grad_norm": 4.350097179412842, + "learning_rate": 4.831309428342308e-06, + "logits/chosen": 12.32647705078125, + "logits/rejected": 6.666042327880859, + "logps/chosen": -278.85089111328125, + "logps/rejected": -184.8273468017578, + "loss": 0.5979, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.15506428480148315, + "rewards/margins": 0.36954832077026367, + "rewards/rejected": -0.21448402106761932, + "step": 2529 + }, + { + "epoch": 0.39126232360332497, + "grad_norm": 8.040081977844238, + "learning_rate": 4.8310230266926335e-06, + "logits/chosen": 9.168654441833496, + "logits/rejected": 2.2769346237182617, + "logps/chosen": -503.75732421875, + "logps/rejected": -430.2760009765625, + "loss": 0.6192, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.3419133126735687, + "rewards/margins": 0.25796353816986084, + "rewards/rejected": 0.08394977450370789, + "step": 2530 + }, + { + "epoch": 0.3914169727430891, + "grad_norm": 3.809129238128662, + "learning_rate": 4.83073662504296e-06, + "logits/chosen": 8.37309741973877, + "logits/rejected": 4.278587818145752, + "logps/chosen": -232.55032348632812, + "logps/rejected": -192.03054809570312, + "loss": 0.4953, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.3572920560836792, + "rewards/margins": 0.48990529775619507, + "rewards/rejected": -0.13261322677135468, + "step": 2531 + }, + { + "epoch": 0.39157162188285327, + "grad_norm": 10.449230194091797, + "learning_rate": 4.830450223393287e-06, + "logits/chosen": 1.190462589263916, + "logits/rejected": -2.7900595664978027, + "logps/chosen": -254.5145721435547, + "logps/rejected": -143.00790405273438, + "loss": 0.7438, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.19025275111198425, + "rewards/margins": -0.023232147097587585, + "rewards/rejected": -0.16702061891555786, + "step": 2532 + }, + { + "epoch": 0.3917262710226174, + "grad_norm": 6.154085159301758, + "learning_rate": 4.8301638217436135e-06, + "logits/chosen": 11.404205322265625, + "logits/rejected": 14.83556842803955, + "logps/chosen": -359.0123596191406, + "logps/rejected": -366.4436950683594, + "loss": 0.7402, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.003390517085790634, + "rewards/margins": -0.0783708393573761, + "rewards/rejected": 0.08176136016845703, + "step": 2533 + }, + { + "epoch": 0.3918809201623816, + "grad_norm": 4.602861404418945, + "learning_rate": 4.82987742009394e-06, + "logits/chosen": 13.853038787841797, + "logits/rejected": 10.441329956054688, + "logps/chosen": -239.82672119140625, + "logps/rejected": -197.84478759765625, + "loss": 0.5633, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.2919125556945801, + "rewards/margins": 0.5329559445381165, + "rewards/rejected": -0.24104338884353638, + "step": 2534 + }, + { + "epoch": 0.3920355693021458, + "grad_norm": 5.717794895172119, + "learning_rate": 4.829591018444267e-06, + "logits/chosen": 6.497745037078857, + "logits/rejected": 9.020249366760254, + "logps/chosen": -285.145751953125, + "logps/rejected": -373.0025634765625, + "loss": 0.616, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.1580706685781479, + "rewards/margins": 0.2852175831794739, + "rewards/rejected": -0.12714692950248718, + "step": 2535 + }, + { + "epoch": 0.39219021844190993, + "grad_norm": 4.775069713592529, + "learning_rate": 4.8293046167945934e-06, + "logits/chosen": 10.217897415161133, + "logits/rejected": 7.607025146484375, + "logps/chosen": -306.3009948730469, + "logps/rejected": -244.65377807617188, + "loss": 0.6108, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.41449278593063354, + "rewards/margins": 0.252946674823761, + "rewards/rejected": 0.16154614090919495, + "step": 2536 + }, + { + "epoch": 0.3923448675816741, + "grad_norm": 8.611421585083008, + "learning_rate": 4.829018215144919e-06, + "logits/chosen": 8.86595630645752, + "logits/rejected": 3.967453956604004, + "logps/chosen": -289.7342529296875, + "logps/rejected": -245.5177459716797, + "loss": 0.9922, + "rewards/accuracies": 0.125, + "rewards/chosen": -0.28698402643203735, + "rewards/margins": -0.48094645142555237, + "rewards/rejected": 0.19396242499351501, + "step": 2537 + }, + { + "epoch": 0.39249951672143824, + "grad_norm": 5.9621782302856445, + "learning_rate": 4.828731813495246e-06, + "logits/chosen": 7.559358596801758, + "logits/rejected": 2.3897948265075684, + "logps/chosen": -267.21783447265625, + "logps/rejected": -213.94320678710938, + "loss": 0.6933, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.03697100281715393, + "rewards/margins": 0.06150517985224724, + "rewards/rejected": -0.02453417330980301, + "step": 2538 + }, + { + "epoch": 0.3926541658612024, + "grad_norm": 5.176915645599365, + "learning_rate": 4.8284454118455725e-06, + "logits/chosen": 9.754781723022461, + "logits/rejected": 7.905757427215576, + "logps/chosen": -243.39599609375, + "logps/rejected": -216.9149169921875, + "loss": 0.5959, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.015784405171871185, + "rewards/margins": 0.2242448776960373, + "rewards/rejected": -0.20846045017242432, + "step": 2539 + }, + { + "epoch": 0.39280881500096654, + "grad_norm": 6.611644744873047, + "learning_rate": 4.828159010195899e-06, + "logits/chosen": 10.470954895019531, + "logits/rejected": 5.715402603149414, + "logps/chosen": -257.9112854003906, + "logps/rejected": -214.12939453125, + "loss": 0.7329, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.05962109938263893, + "rewards/margins": 0.0030518919229507446, + "rewards/rejected": -0.06267299503087997, + "step": 2540 + }, + { + "epoch": 0.39296346414073074, + "grad_norm": 4.98043155670166, + "learning_rate": 4.827872608546226e-06, + "logits/chosen": 6.524811267852783, + "logits/rejected": 8.880760192871094, + "logps/chosen": -196.36111450195312, + "logps/rejected": -215.7021484375, + "loss": 0.6784, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.11949749290943146, + "rewards/margins": 0.09556332230567932, + "rewards/rejected": 0.02393416315317154, + "step": 2541 + }, + { + "epoch": 0.3931181132804949, + "grad_norm": 6.939080715179443, + "learning_rate": 4.8275862068965525e-06, + "logits/chosen": 12.446451187133789, + "logits/rejected": 8.930511474609375, + "logps/chosen": -355.4482421875, + "logps/rejected": -376.34698486328125, + "loss": 0.9135, + "rewards/accuracies": 0.25, + "rewards/chosen": -0.06933679431676865, + "rewards/margins": -0.26839303970336914, + "rewards/rejected": 0.1990562379360199, + "step": 2542 + }, + { + "epoch": 0.39327276242025905, + "grad_norm": 5.849193096160889, + "learning_rate": 4.827299805246878e-06, + "logits/chosen": 9.99445915222168, + "logits/rejected": 4.4871110916137695, + "logps/chosen": -388.3697509765625, + "logps/rejected": -235.374267578125, + "loss": 0.5368, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.16586171090602875, + "rewards/margins": 0.4051932990550995, + "rewards/rejected": -0.23933160305023193, + "step": 2543 + }, + { + "epoch": 0.3934274115600232, + "grad_norm": 4.978914737701416, + "learning_rate": 4.827013403597205e-06, + "logits/chosen": 6.21391487121582, + "logits/rejected": 6.505616188049316, + "logps/chosen": -233.50897216796875, + "logps/rejected": -194.19171142578125, + "loss": 0.6895, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.11609721183776855, + "rewards/margins": 0.03347183018922806, + "rewards/rejected": 0.0826253816485405, + "step": 2544 + }, + { + "epoch": 0.39358206069978735, + "grad_norm": 6.409840106964111, + "learning_rate": 4.826727001947532e-06, + "logits/chosen": 8.295774459838867, + "logits/rejected": 2.168236494064331, + "logps/chosen": -216.91693115234375, + "logps/rejected": -179.8815155029297, + "loss": 0.8029, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.5232242941856384, + "rewards/margins": -0.07691755890846252, + "rewards/rejected": -0.4463067054748535, + "step": 2545 + }, + { + "epoch": 0.3937367098395515, + "grad_norm": 5.3077073097229, + "learning_rate": 4.826440600297858e-06, + "logits/chosen": 7.847306251525879, + "logits/rejected": 6.454771518707275, + "logps/chosen": -305.9904479980469, + "logps/rejected": -312.0799560546875, + "loss": 0.59, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.18082857131958008, + "rewards/margins": 0.29177790880203247, + "rewards/rejected": -0.47260648012161255, + "step": 2546 + }, + { + "epoch": 0.39389135897931565, + "grad_norm": 4.155510902404785, + "learning_rate": 4.826154198648185e-06, + "logits/chosen": 10.758145332336426, + "logits/rejected": 11.782370567321777, + "logps/chosen": -229.51902770996094, + "logps/rejected": -218.76719665527344, + "loss": 0.6103, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.01856493577361107, + "rewards/margins": 0.24198251962661743, + "rewards/rejected": -0.2605474591255188, + "step": 2547 + }, + { + "epoch": 0.39404600811907986, + "grad_norm": 4.827736854553223, + "learning_rate": 4.8258677969985116e-06, + "logits/chosen": 11.421258926391602, + "logits/rejected": 10.085678100585938, + "logps/chosen": -259.4638671875, + "logps/rejected": -254.18930053710938, + "loss": 0.628, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.0431852862238884, + "rewards/margins": 0.2988182306289673, + "rewards/rejected": -0.3420035243034363, + "step": 2548 + }, + { + "epoch": 0.394200657258844, + "grad_norm": 3.9456310272216797, + "learning_rate": 4.825581395348838e-06, + "logits/chosen": 5.618587493896484, + "logits/rejected": 3.565340042114258, + "logps/chosen": -206.11956787109375, + "logps/rejected": -254.37168884277344, + "loss": 0.5384, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.12535491585731506, + "rewards/margins": 0.4303221106529236, + "rewards/rejected": -0.3049672245979309, + "step": 2549 + }, + { + "epoch": 0.39435530639860816, + "grad_norm": 6.977304458618164, + "learning_rate": 4.825294993699164e-06, + "logits/chosen": 6.748723030090332, + "logits/rejected": 4.894326686859131, + "logps/chosen": -328.8692932128906, + "logps/rejected": -311.1910400390625, + "loss": 0.6805, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.23720084130764008, + "rewards/margins": 0.09849634766578674, + "rewards/rejected": 0.13870447874069214, + "step": 2550 + }, + { + "epoch": 0.3945099555383723, + "grad_norm": 4.787998199462891, + "learning_rate": 4.825008592049491e-06, + "logits/chosen": 3.9990077018737793, + "logits/rejected": 5.439508438110352, + "logps/chosen": -238.10952758789062, + "logps/rejected": -234.81243896484375, + "loss": 0.6523, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.09690293669700623, + "rewards/margins": 0.14090105891227722, + "rewards/rejected": -0.043998122215270996, + "step": 2551 + }, + { + "epoch": 0.39466460467813647, + "grad_norm": 4.951591968536377, + "learning_rate": 4.824722190399817e-06, + "logits/chosen": 17.354629516601562, + "logits/rejected": 7.6628875732421875, + "logps/chosen": -285.8321533203125, + "logps/rejected": -166.01473999023438, + "loss": 0.7237, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.11677198112010956, + "rewards/margins": 0.06048339605331421, + "rewards/rejected": 0.05628858506679535, + "step": 2552 + }, + { + "epoch": 0.3948192538179006, + "grad_norm": 6.390881061553955, + "learning_rate": 4.824435788750144e-06, + "logits/chosen": 9.896942138671875, + "logits/rejected": 6.755846977233887, + "logps/chosen": -333.674072265625, + "logps/rejected": -318.6329040527344, + "loss": 0.6244, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.24484139680862427, + "rewards/margins": 0.20574340224266052, + "rewards/rejected": 0.03909797966480255, + "step": 2553 + }, + { + "epoch": 0.3949739029576648, + "grad_norm": 6.148293972015381, + "learning_rate": 4.824149387100471e-06, + "logits/chosen": 10.40079116821289, + "logits/rejected": 6.3178019523620605, + "logps/chosen": -325.0823974609375, + "logps/rejected": -211.243896484375, + "loss": 0.7218, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.08466396480798721, + "rewards/margins": 0.008593171834945679, + "rewards/rejected": 0.07607080042362213, + "step": 2554 + }, + { + "epoch": 0.395128552097429, + "grad_norm": 5.462231159210205, + "learning_rate": 4.823862985450796e-06, + "logits/chosen": 11.206364631652832, + "logits/rejected": 5.20212459564209, + "logps/chosen": -345.7375793457031, + "logps/rejected": -241.34317016601562, + "loss": 0.5237, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.2157306969165802, + "rewards/margins": 0.43911927938461304, + "rewards/rejected": -0.22338858246803284, + "step": 2555 + }, + { + "epoch": 0.3952832012371931, + "grad_norm": 6.3288187980651855, + "learning_rate": 4.823576583801123e-06, + "logits/chosen": 2.3407130241394043, + "logits/rejected": 10.840267181396484, + "logps/chosen": -244.71420288085938, + "logps/rejected": -304.001708984375, + "loss": 0.6252, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.3951888680458069, + "rewards/margins": 0.27616190910339355, + "rewards/rejected": 0.11902695149183273, + "step": 2556 + }, + { + "epoch": 0.3954378503769573, + "grad_norm": 5.270389556884766, + "learning_rate": 4.82329018215145e-06, + "logits/chosen": 6.011882305145264, + "logits/rejected": 8.376742362976074, + "logps/chosen": -194.97410583496094, + "logps/rejected": -157.25685119628906, + "loss": 0.8573, + "rewards/accuracies": 0.25, + "rewards/chosen": -0.2020559161901474, + "rewards/margins": -0.22629514336585999, + "rewards/rejected": 0.02423921972513199, + "step": 2557 + }, + { + "epoch": 0.39559249951672143, + "grad_norm": 5.609000205993652, + "learning_rate": 4.823003780501776e-06, + "logits/chosen": 6.962016582489014, + "logits/rejected": 6.735646724700928, + "logps/chosen": -230.80062866210938, + "logps/rejected": -273.57666015625, + "loss": 0.6835, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.11239200830459595, + "rewards/margins": 0.12234941124916077, + "rewards/rejected": -0.009957395493984222, + "step": 2558 + }, + { + "epoch": 0.3957471486564856, + "grad_norm": 5.15081262588501, + "learning_rate": 4.822717378852102e-06, + "logits/chosen": 10.177614212036133, + "logits/rejected": 5.896312236785889, + "logps/chosen": -327.22991943359375, + "logps/rejected": -244.12928771972656, + "loss": 0.6143, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.10368603467941284, + "rewards/margins": 0.1988917589187622, + "rewards/rejected": -0.09520575404167175, + "step": 2559 + }, + { + "epoch": 0.39590179779624973, + "grad_norm": 5.369154930114746, + "learning_rate": 4.822430977202429e-06, + "logits/chosen": 12.63347053527832, + "logits/rejected": 5.854383945465088, + "logps/chosen": -336.1552429199219, + "logps/rejected": -227.9302215576172, + "loss": 0.6189, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.3282032012939453, + "rewards/margins": 0.19863511621952057, + "rewards/rejected": 0.12956809997558594, + "step": 2560 + }, + { + "epoch": 0.39605644693601394, + "grad_norm": 10.721266746520996, + "learning_rate": 4.8221445755527555e-06, + "logits/chosen": 9.032099723815918, + "logits/rejected": 8.536942481994629, + "logps/chosen": -345.3525390625, + "logps/rejected": -327.91558837890625, + "loss": 0.6886, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.2865002751350403, + "rewards/margins": 0.10141459852457047, + "rewards/rejected": 0.18508568406105042, + "step": 2561 + }, + { + "epoch": 0.3962110960757781, + "grad_norm": 6.783220291137695, + "learning_rate": 4.821858173903082e-06, + "logits/chosen": 7.5616960525512695, + "logits/rejected": 5.122775554656982, + "logps/chosen": -215.74496459960938, + "logps/rejected": -163.34182739257812, + "loss": 0.674, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.037495702505111694, + "rewards/margins": 0.08999853581190109, + "rewards/rejected": -0.052502818405628204, + "step": 2562 + }, + { + "epoch": 0.39636574521554224, + "grad_norm": 4.724612236022949, + "learning_rate": 4.821571772253408e-06, + "logits/chosen": 8.274641990661621, + "logits/rejected": 1.1090553998947144, + "logps/chosen": -338.54632568359375, + "logps/rejected": -219.84732055664062, + "loss": 0.5656, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.48839202523231506, + "rewards/margins": 0.36774560809135437, + "rewards/rejected": 0.12064642459154129, + "step": 2563 + }, + { + "epoch": 0.3965203943553064, + "grad_norm": 5.487222194671631, + "learning_rate": 4.8212853706037346e-06, + "logits/chosen": 13.157936096191406, + "logits/rejected": 9.264841079711914, + "logps/chosen": -352.5703430175781, + "logps/rejected": -301.8311462402344, + "loss": 0.5706, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.3016829490661621, + "rewards/margins": 0.3600809574127197, + "rewards/rejected": -0.058398015797138214, + "step": 2564 + }, + { + "epoch": 0.39667504349507055, + "grad_norm": 13.406632423400879, + "learning_rate": 4.820998968954061e-06, + "logits/chosen": 10.260294914245605, + "logits/rejected": 9.320372581481934, + "logps/chosen": -391.321044921875, + "logps/rejected": -369.4396057128906, + "loss": 0.6286, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.23625889420509338, + "rewards/margins": 0.2528381943702698, + "rewards/rejected": -0.016579288989305496, + "step": 2565 + }, + { + "epoch": 0.3968296926348347, + "grad_norm": 5.284693717956543, + "learning_rate": 4.820712567304388e-06, + "logits/chosen": 12.277772903442383, + "logits/rejected": 15.693747520446777, + "logps/chosen": -270.925048828125, + "logps/rejected": -298.5973815917969, + "loss": 0.6, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.32151374220848083, + "rewards/margins": 0.2964085638523102, + "rewards/rejected": 0.025105183944106102, + "step": 2566 + }, + { + "epoch": 0.3969843417745989, + "grad_norm": 3.9091832637786865, + "learning_rate": 4.8204261656547145e-06, + "logits/chosen": 11.815536499023438, + "logits/rejected": 11.500572204589844, + "logps/chosen": -223.6182861328125, + "logps/rejected": -239.14381408691406, + "loss": 0.5381, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.09688058495521545, + "rewards/margins": 0.3870798349380493, + "rewards/rejected": -0.29019924998283386, + "step": 2567 + }, + { + "epoch": 0.39713899091436305, + "grad_norm": 47.65726852416992, + "learning_rate": 4.820139764005041e-06, + "logits/chosen": 7.426468849182129, + "logits/rejected": 7.232683181762695, + "logps/chosen": -318.4400634765625, + "logps/rejected": -312.878662109375, + "loss": 0.6273, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.25258469581604004, + "rewards/margins": 0.16212821006774902, + "rewards/rejected": 0.09045649319887161, + "step": 2568 + }, + { + "epoch": 0.3972936400541272, + "grad_norm": 6.041717052459717, + "learning_rate": 4.819853362355368e-06, + "logits/chosen": 11.273836135864258, + "logits/rejected": 12.104462623596191, + "logps/chosen": -286.3793640136719, + "logps/rejected": -298.6015319824219, + "loss": 0.666, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.18093900382518768, + "rewards/margins": 0.10206803679466248, + "rewards/rejected": 0.07887096703052521, + "step": 2569 + }, + { + "epoch": 0.39744828919389136, + "grad_norm": 7.853445053100586, + "learning_rate": 4.819566960705694e-06, + "logits/chosen": 10.768254280090332, + "logits/rejected": 7.515294551849365, + "logps/chosen": -415.8226623535156, + "logps/rejected": -295.79388427734375, + "loss": 0.7132, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.1591348648071289, + "rewards/margins": 0.0030603818595409393, + "rewards/rejected": 0.15607449412345886, + "step": 2570 + }, + { + "epoch": 0.3976029383336555, + "grad_norm": 2.905759811401367, + "learning_rate": 4.81928055905602e-06, + "logits/chosen": 7.475478172302246, + "logits/rejected": 3.498634099960327, + "logps/chosen": -190.98486328125, + "logps/rejected": -158.92971801757812, + "loss": 0.4804, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.14990076422691345, + "rewards/margins": 0.6052706241607666, + "rewards/rejected": -0.45536985993385315, + "step": 2571 + }, + { + "epoch": 0.39775758747341966, + "grad_norm": 3.9463183879852295, + "learning_rate": 4.818994157406347e-06, + "logits/chosen": 13.09033489227295, + "logits/rejected": 9.920575141906738, + "logps/chosen": -195.91567993164062, + "logps/rejected": -171.99148559570312, + "loss": 0.6382, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.11033868789672852, + "rewards/margins": 0.18787871301174164, + "rewards/rejected": -0.07754003256559372, + "step": 2572 + }, + { + "epoch": 0.3979122366131838, + "grad_norm": 6.394832611083984, + "learning_rate": 4.818707755756674e-06, + "logits/chosen": 11.797989845275879, + "logits/rejected": 3.8984971046447754, + "logps/chosen": -306.2153625488281, + "logps/rejected": -279.37811279296875, + "loss": 0.677, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.272072434425354, + "rewards/margins": 0.1564340889453888, + "rewards/rejected": 0.11563833057880402, + "step": 2573 + }, + { + "epoch": 0.398066885752948, + "grad_norm": 5.186899185180664, + "learning_rate": 4.818421354107e-06, + "logits/chosen": 8.865129470825195, + "logits/rejected": 3.626326560974121, + "logps/chosen": -368.5791931152344, + "logps/rejected": -319.8849182128906, + "loss": 0.5948, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.3122440278530121, + "rewards/margins": 0.21599093079566956, + "rewards/rejected": 0.09625311195850372, + "step": 2574 + }, + { + "epoch": 0.39822153489271217, + "grad_norm": 9.321632385253906, + "learning_rate": 4.818134952457327e-06, + "logits/chosen": 9.455666542053223, + "logits/rejected": 1.7659724950790405, + "logps/chosen": -331.2635803222656, + "logps/rejected": -242.7117462158203, + "loss": 0.6646, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.048567600548267365, + "rewards/margins": 0.11471579968929291, + "rewards/rejected": -0.06614819169044495, + "step": 2575 + }, + { + "epoch": 0.3983761840324763, + "grad_norm": 4.7136664390563965, + "learning_rate": 4.817848550807653e-06, + "logits/chosen": 10.74659538269043, + "logits/rejected": 11.096921920776367, + "logps/chosen": -293.65875244140625, + "logps/rejected": -247.22581481933594, + "loss": 0.6089, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.2365730255842209, + "rewards/margins": 0.2898835837841034, + "rewards/rejected": -0.0533105731010437, + "step": 2576 + }, + { + "epoch": 0.3985308331722405, + "grad_norm": 4.790188312530518, + "learning_rate": 4.817562149157979e-06, + "logits/chosen": 16.982070922851562, + "logits/rejected": 6.605229377746582, + "logps/chosen": -309.56585693359375, + "logps/rejected": -224.42669677734375, + "loss": 0.5229, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.4406345784664154, + "rewards/margins": 0.4002976417541504, + "rewards/rejected": 0.04033694788813591, + "step": 2577 + }, + { + "epoch": 0.3986854823120046, + "grad_norm": 3.9144034385681152, + "learning_rate": 4.817275747508306e-06, + "logits/chosen": 12.170516014099121, + "logits/rejected": 7.328759670257568, + "logps/chosen": -259.0719909667969, + "logps/rejected": -173.70632934570312, + "loss": 0.6003, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.35708701610565186, + "rewards/margins": 0.2333994209766388, + "rewards/rejected": 0.12368758767843246, + "step": 2578 + }, + { + "epoch": 0.3988401314517688, + "grad_norm": 5.627710819244385, + "learning_rate": 4.816989345858633e-06, + "logits/chosen": 9.870912551879883, + "logits/rejected": 10.388496398925781, + "logps/chosen": -261.9947204589844, + "logps/rejected": -227.18063354492188, + "loss": 0.8076, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.06697288155555725, + "rewards/margins": -0.13728280365467072, + "rewards/rejected": 0.20425567030906677, + "step": 2579 + }, + { + "epoch": 0.398994780591533, + "grad_norm": 4.179599285125732, + "learning_rate": 4.816702944208959e-06, + "logits/chosen": 10.428400993347168, + "logits/rejected": 3.407343864440918, + "logps/chosen": -225.10702514648438, + "logps/rejected": -121.5801773071289, + "loss": 0.6205, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.15225005149841309, + "rewards/margins": 0.17652475833892822, + "rewards/rejected": -0.024274736642837524, + "step": 2580 + }, + { + "epoch": 0.39914942973129713, + "grad_norm": 3.1337106227874756, + "learning_rate": 4.816416542559286e-06, + "logits/chosen": 10.498144149780273, + "logits/rejected": 5.005775451660156, + "logps/chosen": -181.6263427734375, + "logps/rejected": -121.93711853027344, + "loss": 0.5385, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.33230575919151306, + "rewards/margins": 0.42928481101989746, + "rewards/rejected": -0.09697907418012619, + "step": 2581 + }, + { + "epoch": 0.3993040788710613, + "grad_norm": 4.620580196380615, + "learning_rate": 4.816130140909613e-06, + "logits/chosen": 8.123567581176758, + "logits/rejected": 1.9313759803771973, + "logps/chosen": -300.9485778808594, + "logps/rejected": -255.19776916503906, + "loss": 0.5571, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.3754594922065735, + "rewards/margins": 0.35716065764427185, + "rewards/rejected": 0.018298834562301636, + "step": 2582 + }, + { + "epoch": 0.39945872801082544, + "grad_norm": 4.154801845550537, + "learning_rate": 4.815843739259938e-06, + "logits/chosen": 11.134706497192383, + "logits/rejected": 5.583846092224121, + "logps/chosen": -248.82884216308594, + "logps/rejected": -157.994384765625, + "loss": 0.5464, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.35838425159454346, + "rewards/margins": 0.3369155526161194, + "rewards/rejected": 0.021468685939908028, + "step": 2583 + }, + { + "epoch": 0.3996133771505896, + "grad_norm": 9.80989933013916, + "learning_rate": 4.815557337610265e-06, + "logits/chosen": 11.670622825622559, + "logits/rejected": 9.94440746307373, + "logps/chosen": -289.3100280761719, + "logps/rejected": -256.14117431640625, + "loss": 0.8284, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.249995619058609, + "rewards/margins": -0.196833997964859, + "rewards/rejected": 0.44682958722114563, + "step": 2584 + }, + { + "epoch": 0.39976802629035374, + "grad_norm": 10.237334251403809, + "learning_rate": 4.815270935960592e-06, + "logits/chosen": 11.217272758483887, + "logits/rejected": 9.922136306762695, + "logps/chosen": -227.38626098632812, + "logps/rejected": -226.56753540039062, + "loss": 0.6419, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.3075547218322754, + "rewards/margins": 0.17571119964122772, + "rewards/rejected": 0.13184353709220886, + "step": 2585 + }, + { + "epoch": 0.39992267543011795, + "grad_norm": 7.261939525604248, + "learning_rate": 4.814984534310918e-06, + "logits/chosen": 9.860733032226562, + "logits/rejected": 7.640769004821777, + "logps/chosen": -213.65786743164062, + "logps/rejected": -224.3811798095703, + "loss": 0.5289, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.30839210748672485, + "rewards/margins": 0.37613165378570557, + "rewards/rejected": -0.06773953884840012, + "step": 2586 + }, + { + "epoch": 0.4000773245698821, + "grad_norm": 6.840734004974365, + "learning_rate": 4.814698132661245e-06, + "logits/chosen": 13.316822052001953, + "logits/rejected": 9.663893699645996, + "logps/chosen": -434.3657531738281, + "logps/rejected": -384.26287841796875, + "loss": 0.6405, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.38900166749954224, + "rewards/margins": 0.28886890411376953, + "rewards/rejected": 0.10013273358345032, + "step": 2587 + }, + { + "epoch": 0.40023197370964625, + "grad_norm": 5.709846019744873, + "learning_rate": 4.814411731011572e-06, + "logits/chosen": 13.386789321899414, + "logits/rejected": 10.11871337890625, + "logps/chosen": -342.57373046875, + "logps/rejected": -325.471435546875, + "loss": 0.6783, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.5118557214736938, + "rewards/margins": 0.26147741079330444, + "rewards/rejected": 0.250378280878067, + "step": 2588 + }, + { + "epoch": 0.4003866228494104, + "grad_norm": 5.633150100708008, + "learning_rate": 4.8141253293618974e-06, + "logits/chosen": 7.937169551849365, + "logits/rejected": 5.875890731811523, + "logps/chosen": -365.937255859375, + "logps/rejected": -312.9879150390625, + "loss": 0.6544, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.5335246920585632, + "rewards/margins": 0.2694160044193268, + "rewards/rejected": 0.26410868763923645, + "step": 2589 + }, + { + "epoch": 0.40054127198917455, + "grad_norm": 4.867786407470703, + "learning_rate": 4.813838927712224e-06, + "logits/chosen": 7.035674571990967, + "logits/rejected": 5.373874187469482, + "logps/chosen": -268.3909912109375, + "logps/rejected": -227.382568359375, + "loss": 0.6692, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.4062163531780243, + "rewards/margins": 0.14622172713279724, + "rewards/rejected": 0.25999459624290466, + "step": 2590 + }, + { + "epoch": 0.4006959211289387, + "grad_norm": 6.201251983642578, + "learning_rate": 4.813552526062551e-06, + "logits/chosen": 9.580256462097168, + "logits/rejected": 8.534006118774414, + "logps/chosen": -276.1340026855469, + "logps/rejected": -275.29229736328125, + "loss": 0.7921, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.18257032334804535, + "rewards/margins": -0.1590750366449356, + "rewards/rejected": 0.34164533019065857, + "step": 2591 + }, + { + "epoch": 0.40085057026870285, + "grad_norm": 6.719576358795166, + "learning_rate": 4.813266124412877e-06, + "logits/chosen": 9.957694053649902, + "logits/rejected": 5.490819454193115, + "logps/chosen": -234.45053100585938, + "logps/rejected": -248.4857177734375, + "loss": 0.776, + "rewards/accuracies": 0.25, + "rewards/chosen": 0.2196652591228485, + "rewards/margins": -0.14617657661437988, + "rewards/rejected": 0.365841805934906, + "step": 2592 + }, + { + "epoch": 0.40100521940846706, + "grad_norm": 5.586428642272949, + "learning_rate": 4.812979722763203e-06, + "logits/chosen": 11.509471893310547, + "logits/rejected": 13.922584533691406, + "logps/chosen": -242.5986328125, + "logps/rejected": -238.77996826171875, + "loss": 0.6286, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.41750866174697876, + "rewards/margins": 0.2210138887166977, + "rewards/rejected": 0.19649478793144226, + "step": 2593 + }, + { + "epoch": 0.4011598685482312, + "grad_norm": 5.766911506652832, + "learning_rate": 4.81269332111353e-06, + "logits/chosen": 14.337347984313965, + "logits/rejected": 11.379358291625977, + "logps/chosen": -355.22607421875, + "logps/rejected": -276.8265380859375, + "loss": 0.5631, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.7882957458496094, + "rewards/margins": 0.39532890915870667, + "rewards/rejected": 0.3929668664932251, + "step": 2594 + }, + { + "epoch": 0.40131451768799536, + "grad_norm": 9.718783378601074, + "learning_rate": 4.8124069194638565e-06, + "logits/chosen": 0.02197432518005371, + "logits/rejected": 6.0790910720825195, + "logps/chosen": -186.4334259033203, + "logps/rejected": -474.4280700683594, + "loss": 0.9066, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.012886762619018555, + "rewards/margins": -0.28824201226234436, + "rewards/rejected": 0.3011287748813629, + "step": 2595 + }, + { + "epoch": 0.4014691668277595, + "grad_norm": 4.246051788330078, + "learning_rate": 4.812120517814183e-06, + "logits/chosen": 3.56642484664917, + "logits/rejected": 3.160365104675293, + "logps/chosen": -156.90469360351562, + "logps/rejected": -136.982666015625, + "loss": 0.6667, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.01299332082271576, + "rewards/margins": 0.1608019918203354, + "rewards/rejected": -0.17379531264305115, + "step": 2596 + }, + { + "epoch": 0.40162381596752367, + "grad_norm": 5.396277904510498, + "learning_rate": 4.811834116164509e-06, + "logits/chosen": 10.982786178588867, + "logits/rejected": 7.774756908416748, + "logps/chosen": -270.47857666015625, + "logps/rejected": -193.02224731445312, + "loss": 0.6256, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.1582435965538025, + "rewards/margins": 0.20662838220596313, + "rewards/rejected": -0.04838474094867706, + "step": 2597 + }, + { + "epoch": 0.4017784651072878, + "grad_norm": 4.19468355178833, + "learning_rate": 4.811547714514836e-06, + "logits/chosen": 9.862807273864746, + "logits/rejected": 8.71035385131836, + "logps/chosen": -209.9136199951172, + "logps/rejected": -217.19625854492188, + "loss": 0.5702, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.19352124631404877, + "rewards/margins": 0.3395649790763855, + "rewards/rejected": -0.14604373276233673, + "step": 2598 + }, + { + "epoch": 0.401933114247052, + "grad_norm": 7.871066570281982, + "learning_rate": 4.811261312865162e-06, + "logits/chosen": 11.740633010864258, + "logits/rejected": 6.135998249053955, + "logps/chosen": -309.8606262207031, + "logps/rejected": -196.49359130859375, + "loss": 0.6767, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.1325240135192871, + "rewards/margins": 0.080214723944664, + "rewards/rejected": 0.05230928957462311, + "step": 2599 + }, + { + "epoch": 0.4020877633868162, + "grad_norm": 4.6933817863464355, + "learning_rate": 4.810974911215489e-06, + "logits/chosen": 14.111406326293945, + "logits/rejected": 6.670742034912109, + "logps/chosen": -349.8078308105469, + "logps/rejected": -197.30316162109375, + "loss": 0.6643, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.28924280405044556, + "rewards/margins": 0.29984050989151, + "rewards/rejected": -0.010597705841064453, + "step": 2600 + }, + { + "epoch": 0.40224241252658033, + "grad_norm": 6.063241004943848, + "learning_rate": 4.8106885095658156e-06, + "logits/chosen": 7.733075141906738, + "logits/rejected": 2.757169246673584, + "logps/chosen": -329.32977294921875, + "logps/rejected": -236.11361694335938, + "loss": 0.613, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.3940313160419464, + "rewards/margins": 0.2367030680179596, + "rewards/rejected": 0.157328262925148, + "step": 2601 + }, + { + "epoch": 0.4023970616663445, + "grad_norm": 8.753619194030762, + "learning_rate": 4.810402107916142e-06, + "logits/chosen": 6.964667320251465, + "logits/rejected": 8.155755043029785, + "logps/chosen": -378.2042236328125, + "logps/rejected": -301.0556945800781, + "loss": 0.7407, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.3217959403991699, + "rewards/margins": -0.07635515928268433, + "rewards/rejected": 0.39815109968185425, + "step": 2602 + }, + { + "epoch": 0.40255171080610863, + "grad_norm": 3.9061129093170166, + "learning_rate": 4.810115706266468e-06, + "logits/chosen": 13.372148513793945, + "logits/rejected": 8.747743606567383, + "logps/chosen": -202.840576171875, + "logps/rejected": -186.17889404296875, + "loss": 0.5977, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.4059521555900574, + "rewards/margins": 0.25259929895401, + "rewards/rejected": 0.15335288643836975, + "step": 2603 + }, + { + "epoch": 0.4027063599458728, + "grad_norm": 4.528071880340576, + "learning_rate": 4.809829304616795e-06, + "logits/chosen": 10.3316011428833, + "logits/rejected": 7.996026992797852, + "logps/chosen": -213.27682495117188, + "logps/rejected": -189.6713409423828, + "loss": 0.6296, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.15922811627388, + "rewards/margins": 0.18942369520664215, + "rewards/rejected": -0.030195560306310654, + "step": 2604 + }, + { + "epoch": 0.40286100908563693, + "grad_norm": 4.940518379211426, + "learning_rate": 4.809542902967121e-06, + "logits/chosen": 8.851593971252441, + "logits/rejected": 10.100085258483887, + "logps/chosen": -212.69921875, + "logps/rejected": -194.36570739746094, + "loss": 0.7309, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.2609790861606598, + "rewards/margins": -0.042949870228767395, + "rewards/rejected": 0.3039289712905884, + "step": 2605 + }, + { + "epoch": 0.40301565822540114, + "grad_norm": 4.7839531898498535, + "learning_rate": 4.809256501317448e-06, + "logits/chosen": 7.9479193687438965, + "logits/rejected": 2.2056541442871094, + "logps/chosen": -292.2350769042969, + "logps/rejected": -195.654541015625, + "loss": 0.5978, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.3009796440601349, + "rewards/margins": 0.26149171590805054, + "rewards/rejected": 0.03948793560266495, + "step": 2606 + }, + { + "epoch": 0.4031703073651653, + "grad_norm": 6.859809398651123, + "learning_rate": 4.808970099667775e-06, + "logits/chosen": 9.672496795654297, + "logits/rejected": 6.029942035675049, + "logps/chosen": -316.6988525390625, + "logps/rejected": -227.61627197265625, + "loss": 0.5111, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.33837226033210754, + "rewards/margins": 0.5145007371902466, + "rewards/rejected": -0.17612850666046143, + "step": 2607 + }, + { + "epoch": 0.40332495650492944, + "grad_norm": 6.114803791046143, + "learning_rate": 4.808683698018101e-06, + "logits/chosen": 5.3322858810424805, + "logits/rejected": 6.1041412353515625, + "logps/chosen": -294.4306945800781, + "logps/rejected": -294.32177734375, + "loss": 0.7115, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.32256880402565, + "rewards/margins": 0.12169989943504333, + "rewards/rejected": 0.2008688747882843, + "step": 2608 + }, + { + "epoch": 0.4034796056446936, + "grad_norm": 8.884920120239258, + "learning_rate": 4.808397296368427e-06, + "logits/chosen": 9.282898902893066, + "logits/rejected": 9.19538402557373, + "logps/chosen": -281.532958984375, + "logps/rejected": -261.17083740234375, + "loss": 0.8956, + "rewards/accuracies": 0.25, + "rewards/chosen": 0.3274007737636566, + "rewards/margins": -0.25450506806373596, + "rewards/rejected": 0.5819058418273926, + "step": 2609 + }, + { + "epoch": 0.40363425478445775, + "grad_norm": 4.668889999389648, + "learning_rate": 4.808110894718754e-06, + "logits/chosen": 10.31496810913086, + "logits/rejected": 2.831465482711792, + "logps/chosen": -376.6248779296875, + "logps/rejected": -279.6778869628906, + "loss": 0.4939, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.8887655735015869, + "rewards/margins": 0.5485795140266418, + "rewards/rejected": 0.3401860296726227, + "step": 2610 + }, + { + "epoch": 0.4037889039242219, + "grad_norm": 4.835183620452881, + "learning_rate": 4.80782449306908e-06, + "logits/chosen": 7.978914260864258, + "logits/rejected": 1.4610515832901, + "logps/chosen": -259.80926513671875, + "logps/rejected": -255.30917358398438, + "loss": 0.4105, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.6189538240432739, + "rewards/margins": 0.8281218409538269, + "rewards/rejected": -0.20916807651519775, + "step": 2611 + }, + { + "epoch": 0.4039435530639861, + "grad_norm": 9.31905746459961, + "learning_rate": 4.807538091419407e-06, + "logits/chosen": 2.4142608642578125, + "logits/rejected": 2.5926458835601807, + "logps/chosen": -309.0591735839844, + "logps/rejected": -271.2691345214844, + "loss": 0.9097, + "rewards/accuracies": 0.25, + "rewards/chosen": 0.06348896026611328, + "rewards/margins": -0.361579954624176, + "rewards/rejected": 0.4250689148902893, + "step": 2612 + }, + { + "epoch": 0.40409820220375026, + "grad_norm": 6.038558483123779, + "learning_rate": 4.807251689769734e-06, + "logits/chosen": 4.386850357055664, + "logits/rejected": 5.926063537597656, + "logps/chosen": -333.52166748046875, + "logps/rejected": -362.61676025390625, + "loss": 0.6401, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.280229777097702, + "rewards/margins": 0.27903711795806885, + "rewards/rejected": 0.0011926740407943726, + "step": 2613 + }, + { + "epoch": 0.4042528513435144, + "grad_norm": 5.022916316986084, + "learning_rate": 4.80696528812006e-06, + "logits/chosen": 14.886497497558594, + "logits/rejected": 12.660100936889648, + "logps/chosen": -256.1756896972656, + "logps/rejected": -247.79827880859375, + "loss": 0.7212, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.3640306293964386, + "rewards/margins": 0.06014080345630646, + "rewards/rejected": 0.3038898706436157, + "step": 2614 + }, + { + "epoch": 0.40440750048327856, + "grad_norm": 4.518743515014648, + "learning_rate": 4.806678886470387e-06, + "logits/chosen": 8.15795612335205, + "logits/rejected": 9.440757751464844, + "logps/chosen": -163.04135131835938, + "logps/rejected": -181.46841430664062, + "loss": 0.6411, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.2222786545753479, + "rewards/margins": 0.19500431418418884, + "rewards/rejected": 0.02727438509464264, + "step": 2615 + }, + { + "epoch": 0.4045621496230427, + "grad_norm": 7.617118835449219, + "learning_rate": 4.806392484820713e-06, + "logits/chosen": 6.026540756225586, + "logits/rejected": 4.109331130981445, + "logps/chosen": -213.71939086914062, + "logps/rejected": -217.4627227783203, + "loss": 0.7957, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.11376433074474335, + "rewards/margins": -0.11360128223896027, + "rewards/rejected": 0.2273656129837036, + "step": 2616 + }, + { + "epoch": 0.40471679876280686, + "grad_norm": 5.676124572753906, + "learning_rate": 4.8061060831710394e-06, + "logits/chosen": 13.312227249145508, + "logits/rejected": 12.515303611755371, + "logps/chosen": -350.6089172363281, + "logps/rejected": -317.6327209472656, + "loss": 0.6383, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.4327283203601837, + "rewards/margins": 0.14337772130966187, + "rewards/rejected": 0.28935056924819946, + "step": 2617 + }, + { + "epoch": 0.40487144790257107, + "grad_norm": 5.661144256591797, + "learning_rate": 4.805819681521366e-06, + "logits/chosen": 9.972670555114746, + "logits/rejected": 3.328108072280884, + "logps/chosen": -333.13201904296875, + "logps/rejected": -234.4336700439453, + "loss": 0.5544, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.5826784372329712, + "rewards/margins": 0.3760630488395691, + "rewards/rejected": 0.2066154032945633, + "step": 2618 + }, + { + "epoch": 0.4050260970423352, + "grad_norm": 6.063949108123779, + "learning_rate": 4.805533279871693e-06, + "logits/chosen": 4.357425689697266, + "logits/rejected": 6.712894439697266, + "logps/chosen": -262.0039367675781, + "logps/rejected": -239.83370971679688, + "loss": 0.6836, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.009914321824908257, + "rewards/margins": 0.14029037952423096, + "rewards/rejected": -0.13037605583667755, + "step": 2619 + }, + { + "epoch": 0.40518074618209937, + "grad_norm": 4.06412935256958, + "learning_rate": 4.805246878222019e-06, + "logits/chosen": 7.4035258293151855, + "logits/rejected": 7.448153018951416, + "logps/chosen": -297.70245361328125, + "logps/rejected": -214.7616729736328, + "loss": 0.5293, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.1439451277256012, + "rewards/margins": 0.4220367968082428, + "rewards/rejected": -0.2780916690826416, + "step": 2620 + }, + { + "epoch": 0.4053353953218635, + "grad_norm": 5.077789306640625, + "learning_rate": 4.804960476572346e-06, + "logits/chosen": 8.757404327392578, + "logits/rejected": 3.2138757705688477, + "logps/chosen": -282.9140319824219, + "logps/rejected": -219.0904998779297, + "loss": 0.6875, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.3097047805786133, + "rewards/margins": 0.06329468637704849, + "rewards/rejected": 0.24641010165214539, + "step": 2621 + }, + { + "epoch": 0.4054900444616277, + "grad_norm": 5.799759387969971, + "learning_rate": 4.804674074922672e-06, + "logits/chosen": 5.377039909362793, + "logits/rejected": 5.550652027130127, + "logps/chosen": -234.64694213867188, + "logps/rejected": -226.31576538085938, + "loss": 0.7461, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.0468912273645401, + "rewards/margins": -0.034545764327049255, + "rewards/rejected": 0.08143696188926697, + "step": 2622 + }, + { + "epoch": 0.4056446936013918, + "grad_norm": 5.164262294769287, + "learning_rate": 4.8043876732729985e-06, + "logits/chosen": 6.502854824066162, + "logits/rejected": 5.388241767883301, + "logps/chosen": -364.510986328125, + "logps/rejected": -197.87681579589844, + "loss": 0.6442, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.4631475806236267, + "rewards/margins": 0.4379481077194214, + "rewards/rejected": 0.025199517607688904, + "step": 2623 + }, + { + "epoch": 0.405799342741156, + "grad_norm": 4.712680816650391, + "learning_rate": 4.804101271623325e-06, + "logits/chosen": 5.3691301345825195, + "logits/rejected": 3.0496113300323486, + "logps/chosen": -385.042236328125, + "logps/rejected": -355.14483642578125, + "loss": 0.5768, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.21682611107826233, + "rewards/margins": 0.3757406771183014, + "rewards/rejected": -0.15891456604003906, + "step": 2624 + }, + { + "epoch": 0.4059539918809202, + "grad_norm": 11.883728981018066, + "learning_rate": 4.803814869973652e-06, + "logits/chosen": 4.919912338256836, + "logits/rejected": -0.008937597274780273, + "logps/chosen": -489.31768798828125, + "logps/rejected": -355.6535339355469, + "loss": 0.7418, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.05142766237258911, + "rewards/margins": -0.03966262564063072, + "rewards/rejected": -0.011765042319893837, + "step": 2625 + }, + { + "epoch": 0.40610864102068434, + "grad_norm": 4.274256229400635, + "learning_rate": 4.8035284683239784e-06, + "logits/chosen": 9.499042510986328, + "logits/rejected": 4.244202136993408, + "logps/chosen": -248.77346801757812, + "logps/rejected": -177.20687866210938, + "loss": 0.5185, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.09489268809556961, + "rewards/margins": 0.44498834013938904, + "rewards/rejected": -0.3500956594944, + "step": 2626 + }, + { + "epoch": 0.4062632901604485, + "grad_norm": 5.311555862426758, + "learning_rate": 4.803242066674304e-06, + "logits/chosen": 15.270570755004883, + "logits/rejected": 11.609674453735352, + "logps/chosen": -236.3300018310547, + "logps/rejected": -225.8714141845703, + "loss": 0.5434, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.16436053812503815, + "rewards/margins": 0.4649355709552765, + "rewards/rejected": -0.3005750775337219, + "step": 2627 + }, + { + "epoch": 0.40641793930021264, + "grad_norm": 4.342464447021484, + "learning_rate": 4.802955665024631e-06, + "logits/chosen": 9.718295097351074, + "logits/rejected": 13.8018798828125, + "logps/chosen": -207.02001953125, + "logps/rejected": -273.1556396484375, + "loss": 0.5812, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.11753726005554199, + "rewards/margins": 0.29417183995246887, + "rewards/rejected": -0.17663460969924927, + "step": 2628 + }, + { + "epoch": 0.4065725884399768, + "grad_norm": 6.279725074768066, + "learning_rate": 4.8026692633749575e-06, + "logits/chosen": 7.422210216522217, + "logits/rejected": 9.758340835571289, + "logps/chosen": -162.49378967285156, + "logps/rejected": -157.6685791015625, + "loss": 0.8373, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.26207125186920166, + "rewards/margins": -0.23835879564285278, + "rewards/rejected": -0.023712441325187683, + "step": 2629 + }, + { + "epoch": 0.40672723757974094, + "grad_norm": 4.622989177703857, + "learning_rate": 4.802382861725284e-06, + "logits/chosen": 12.836143493652344, + "logits/rejected": 1.408252477645874, + "logps/chosen": -403.2530517578125, + "logps/rejected": -228.9989776611328, + "loss": 0.4593, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.13168393075466156, + "rewards/margins": 0.768669843673706, + "rewards/rejected": -0.6369858980178833, + "step": 2630 + }, + { + "epoch": 0.40688188671950515, + "grad_norm": 6.396973609924316, + "learning_rate": 4.80209646007561e-06, + "logits/chosen": 9.866778373718262, + "logits/rejected": 6.122995853424072, + "logps/chosen": -472.73065185546875, + "logps/rejected": -324.6229248046875, + "loss": 0.6309, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.3928345739841461, + "rewards/margins": 0.23249207437038422, + "rewards/rejected": 0.1603425145149231, + "step": 2631 + }, + { + "epoch": 0.4070365358592693, + "grad_norm": 7.1897149085998535, + "learning_rate": 4.801810058425937e-06, + "logits/chosen": 11.339544296264648, + "logits/rejected": 11.42724323272705, + "logps/chosen": -363.2203063964844, + "logps/rejected": -357.7216796875, + "loss": 0.7904, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.15831291675567627, + "rewards/margins": -0.0312308669090271, + "rewards/rejected": -0.12708204984664917, + "step": 2632 + }, + { + "epoch": 0.40719118499903345, + "grad_norm": 4.733573913574219, + "learning_rate": 4.801523656776263e-06, + "logits/chosen": 6.047045707702637, + "logits/rejected": 9.2461576461792, + "logps/chosen": -215.75318908691406, + "logps/rejected": -263.97296142578125, + "loss": 0.6357, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.2662281095981598, + "rewards/margins": 0.14469946920871735, + "rewards/rejected": 0.12152862548828125, + "step": 2633 + }, + { + "epoch": 0.4073458341387976, + "grad_norm": 4.460962772369385, + "learning_rate": 4.80123725512659e-06, + "logits/chosen": 13.84663200378418, + "logits/rejected": 4.173087120056152, + "logps/chosen": -275.0989685058594, + "logps/rejected": -222.12890625, + "loss": 0.4643, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.21884170174598694, + "rewards/margins": 0.5685914158821106, + "rewards/rejected": -0.34974968433380127, + "step": 2634 + }, + { + "epoch": 0.40750048327856175, + "grad_norm": 5.4968414306640625, + "learning_rate": 4.800950853476917e-06, + "logits/chosen": 6.543176651000977, + "logits/rejected": -2.3892300128936768, + "logps/chosen": -268.4636535644531, + "logps/rejected": -199.86373901367188, + "loss": 0.5065, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.26971369981765747, + "rewards/margins": 0.47401466965675354, + "rewards/rejected": -0.20430096983909607, + "step": 2635 + }, + { + "epoch": 0.4076551324183259, + "grad_norm": 5.363725185394287, + "learning_rate": 4.800664451827242e-06, + "logits/chosen": 5.589339733123779, + "logits/rejected": 7.286928176879883, + "logps/chosen": -202.7041015625, + "logps/rejected": -239.35354614257812, + "loss": 0.7613, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.08437521755695343, + "rewards/margins": -0.10510352998971939, + "rewards/rejected": 0.18947875499725342, + "step": 2636 + }, + { + "epoch": 0.40780978155809006, + "grad_norm": 7.565221786499023, + "learning_rate": 4.800378050177569e-06, + "logits/chosen": 13.128049850463867, + "logits/rejected": 11.742080688476562, + "logps/chosen": -259.989990234375, + "logps/rejected": -255.3491973876953, + "loss": 0.751, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.12680941820144653, + "rewards/margins": -0.020378492772579193, + "rewards/rejected": -0.10643090307712555, + "step": 2637 + }, + { + "epoch": 0.40796443069785426, + "grad_norm": 4.714408874511719, + "learning_rate": 4.800091648527896e-06, + "logits/chosen": 11.26417350769043, + "logits/rejected": 6.868785381317139, + "logps/chosen": -329.751953125, + "logps/rejected": -261.6939697265625, + "loss": 0.6478, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.2920898497104645, + "rewards/margins": 0.14915743470191956, + "rewards/rejected": 0.14293241500854492, + "step": 2638 + }, + { + "epoch": 0.4081190798376184, + "grad_norm": 5.228346347808838, + "learning_rate": 4.799805246878222e-06, + "logits/chosen": 7.224055290222168, + "logits/rejected": 5.372251987457275, + "logps/chosen": -233.1369171142578, + "logps/rejected": -186.95213317871094, + "loss": 0.5775, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.00942116230726242, + "rewards/margins": 0.2986801266670227, + "rewards/rejected": -0.2892589569091797, + "step": 2639 + }, + { + "epoch": 0.40827372897738257, + "grad_norm": 4.8835530281066895, + "learning_rate": 4.799518845228549e-06, + "logits/chosen": 13.223508834838867, + "logits/rejected": 7.286744594573975, + "logps/chosen": -426.3082275390625, + "logps/rejected": -247.29444885253906, + "loss": 0.5832, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.5898993015289307, + "rewards/margins": 0.34897100925445557, + "rewards/rejected": 0.2409282624721527, + "step": 2640 + }, + { + "epoch": 0.4084283781171467, + "grad_norm": 7.119494915008545, + "learning_rate": 4.799232443578876e-06, + "logits/chosen": 10.611406326293945, + "logits/rejected": 2.7520837783813477, + "logps/chosen": -247.2782440185547, + "logps/rejected": -170.2180938720703, + "loss": 0.6579, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.1309681087732315, + "rewards/margins": 0.15472464263439178, + "rewards/rejected": -0.2856927514076233, + "step": 2641 + }, + { + "epoch": 0.40858302725691087, + "grad_norm": 6.773077487945557, + "learning_rate": 4.7989460419292014e-06, + "logits/chosen": 7.102503776550293, + "logits/rejected": 7.385689735412598, + "logps/chosen": -303.64093017578125, + "logps/rejected": -308.61395263671875, + "loss": 0.7539, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.11752213537693024, + "rewards/margins": 0.013698004186153412, + "rewards/rejected": 0.10382413864135742, + "step": 2642 + }, + { + "epoch": 0.408737676396675, + "grad_norm": 5.577019214630127, + "learning_rate": 4.798659640279528e-06, + "logits/chosen": 6.515275955200195, + "logits/rejected": 10.408912658691406, + "logps/chosen": -130.23463439941406, + "logps/rejected": -216.39822387695312, + "loss": 0.7539, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.21503695845603943, + "rewards/margins": -0.09368164837360382, + "rewards/rejected": -0.12135529518127441, + "step": 2643 + }, + { + "epoch": 0.4088923255364392, + "grad_norm": 11.766060829162598, + "learning_rate": 4.798373238629855e-06, + "logits/chosen": 12.002068519592285, + "logits/rejected": 13.626076698303223, + "logps/chosen": -221.24728393554688, + "logps/rejected": -255.05416870117188, + "loss": 0.9005, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.05107155442237854, + "rewards/margins": -0.2006019502878189, + "rewards/rejected": 0.14953042566776276, + "step": 2644 + }, + { + "epoch": 0.4090469746762034, + "grad_norm": 6.128371715545654, + "learning_rate": 4.798086836980181e-06, + "logits/chosen": 8.59428882598877, + "logits/rejected": 4.739449501037598, + "logps/chosen": -284.60009765625, + "logps/rejected": -167.88858032226562, + "loss": 0.7036, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.022728145122528076, + "rewards/margins": 0.016990456730127335, + "rewards/rejected": 0.00573769211769104, + "step": 2645 + }, + { + "epoch": 0.40920162381596753, + "grad_norm": 4.945323467254639, + "learning_rate": 4.797800435330508e-06, + "logits/chosen": 11.016851425170898, + "logits/rejected": 8.419404029846191, + "logps/chosen": -281.6387023925781, + "logps/rejected": -245.2271728515625, + "loss": 0.5763, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.2981964349746704, + "rewards/margins": 0.4338471293449402, + "rewards/rejected": -0.13565070927143097, + "step": 2646 + }, + { + "epoch": 0.4093562729557317, + "grad_norm": 11.966965675354004, + "learning_rate": 4.797514033680835e-06, + "logits/chosen": 5.151798725128174, + "logits/rejected": 5.54546594619751, + "logps/chosen": -281.3016052246094, + "logps/rejected": -193.2689666748047, + "loss": 0.7505, + "rewards/accuracies": 0.25, + "rewards/chosen": -0.013181231915950775, + "rewards/margins": -0.08431143313646317, + "rewards/rejected": 0.07113020122051239, + "step": 2647 + }, + { + "epoch": 0.40951092209549583, + "grad_norm": 5.401256561279297, + "learning_rate": 4.797227632031161e-06, + "logits/chosen": 9.029970169067383, + "logits/rejected": 3.045306444168091, + "logps/chosen": -268.4500732421875, + "logps/rejected": -243.4966278076172, + "loss": 0.5756, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.21103864908218384, + "rewards/margins": 0.3457384407520294, + "rewards/rejected": -0.13469980657100677, + "step": 2648 + }, + { + "epoch": 0.40966557123526, + "grad_norm": 4.6939496994018555, + "learning_rate": 4.796941230381487e-06, + "logits/chosen": 8.520849227905273, + "logits/rejected": 5.825819969177246, + "logps/chosen": -251.12374877929688, + "logps/rejected": -164.05984497070312, + "loss": 0.6412, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.3347279727458954, + "rewards/margins": 0.24735581874847412, + "rewards/rejected": 0.08737210929393768, + "step": 2649 + }, + { + "epoch": 0.4098202203750242, + "grad_norm": 4.260602951049805, + "learning_rate": 4.796654828731814e-06, + "logits/chosen": 12.482410430908203, + "logits/rejected": 3.0571346282958984, + "logps/chosen": -291.24835205078125, + "logps/rejected": -181.38148498535156, + "loss": 0.5202, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.14117297530174255, + "rewards/margins": 0.47244641184806824, + "rewards/rejected": -0.3312734067440033, + "step": 2650 + }, + { + "epoch": 0.40997486951478834, + "grad_norm": 4.5313944816589355, + "learning_rate": 4.7963684270821405e-06, + "logits/chosen": 3.3460144996643066, + "logits/rejected": 1.4234497547149658, + "logps/chosen": -162.12477111816406, + "logps/rejected": -195.40274047851562, + "loss": 0.6493, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.017408132553100586, + "rewards/margins": 0.1096906065940857, + "rewards/rejected": -0.09228246659040451, + "step": 2651 + }, + { + "epoch": 0.4101295186545525, + "grad_norm": 5.835301399230957, + "learning_rate": 4.796082025432467e-06, + "logits/chosen": 6.379386901855469, + "logits/rejected": 9.844743728637695, + "logps/chosen": -225.2978515625, + "logps/rejected": -215.4749755859375, + "loss": 0.8894, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.14622831344604492, + "rewards/margins": -0.23944847285747528, + "rewards/rejected": 0.09322013705968857, + "step": 2652 + }, + { + "epoch": 0.41028416779431665, + "grad_norm": 6.756196975708008, + "learning_rate": 4.795795623782794e-06, + "logits/chosen": 7.943758010864258, + "logits/rejected": 10.78510570526123, + "logps/chosen": -227.11619567871094, + "logps/rejected": -299.67352294921875, + "loss": 0.9121, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.2748430371284485, + "rewards/margins": -0.18574418127536774, + "rewards/rejected": -0.08909883350133896, + "step": 2653 + }, + { + "epoch": 0.4104388169340808, + "grad_norm": 4.1023101806640625, + "learning_rate": 4.79550922213312e-06, + "logits/chosen": 10.251218795776367, + "logits/rejected": 7.312133312225342, + "logps/chosen": -278.2147216796875, + "logps/rejected": -253.19630432128906, + "loss": 0.5215, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.3653113842010498, + "rewards/margins": 0.5224370360374451, + "rewards/rejected": -0.15712566673755646, + "step": 2654 + }, + { + "epoch": 0.41059346607384495, + "grad_norm": 6.181456565856934, + "learning_rate": 4.795222820483446e-06, + "logits/chosen": 7.096429347991943, + "logits/rejected": 10.792226791381836, + "logps/chosen": -251.61062622070312, + "logps/rejected": -300.1155090332031, + "loss": 0.7853, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.10043714940547943, + "rewards/margins": -0.014385506510734558, + "rewards/rejected": 0.11482267826795578, + "step": 2655 + }, + { + "epoch": 0.4107481152136091, + "grad_norm": 6.432643413543701, + "learning_rate": 4.794936418833773e-06, + "logits/chosen": 9.615537643432617, + "logits/rejected": 11.772068977355957, + "logps/chosen": -269.7298889160156, + "logps/rejected": -342.2661437988281, + "loss": 0.8283, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.07405209541320801, + "rewards/margins": -0.17221638560295105, + "rewards/rejected": 0.24626848101615906, + "step": 2656 + }, + { + "epoch": 0.4109027643533733, + "grad_norm": 5.000306129455566, + "learning_rate": 4.7946500171840995e-06, + "logits/chosen": 8.370254516601562, + "logits/rejected": 7.60905122756958, + "logps/chosen": -266.63787841796875, + "logps/rejected": -197.1949462890625, + "loss": 0.6641, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.09593873471021652, + "rewards/margins": 0.08923462778329849, + "rewards/rejected": 0.0067041050642728806, + "step": 2657 + }, + { + "epoch": 0.41105741349313746, + "grad_norm": 6.106516361236572, + "learning_rate": 4.794363615534426e-06, + "logits/chosen": 12.595562934875488, + "logits/rejected": 5.683802604675293, + "logps/chosen": -344.0003662109375, + "logps/rejected": -277.79376220703125, + "loss": 0.6124, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.35268765687942505, + "rewards/margins": 0.23292341828346252, + "rewards/rejected": 0.11976423859596252, + "step": 2658 + }, + { + "epoch": 0.4112120626329016, + "grad_norm": 5.12757682800293, + "learning_rate": 4.794077213884753e-06, + "logits/chosen": 7.615944862365723, + "logits/rejected": 13.078064918518066, + "logps/chosen": -235.4261932373047, + "logps/rejected": -291.4898376464844, + "loss": 0.6256, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.20176124572753906, + "rewards/margins": 0.19874128699302673, + "rewards/rejected": 0.003019958734512329, + "step": 2659 + }, + { + "epoch": 0.41136671177266576, + "grad_norm": 3.7776591777801514, + "learning_rate": 4.793790812235079e-06, + "logits/chosen": 8.574359893798828, + "logits/rejected": 2.1253788471221924, + "logps/chosen": -256.5044250488281, + "logps/rejected": -172.62010192871094, + "loss": 0.5411, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.12476853281259537, + "rewards/margins": 0.513027548789978, + "rewards/rejected": -0.38825899362564087, + "step": 2660 + }, + { + "epoch": 0.4115213609124299, + "grad_norm": 8.783904075622559, + "learning_rate": 4.793504410585405e-06, + "logits/chosen": 12.412147521972656, + "logits/rejected": 8.608375549316406, + "logps/chosen": -575.7140502929688, + "logps/rejected": -449.0717468261719, + "loss": 0.7288, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.13889333605766296, + "rewards/margins": -0.002595890313386917, + "rewards/rejected": 0.1414892077445984, + "step": 2661 + }, + { + "epoch": 0.41167601005219406, + "grad_norm": 4.376724720001221, + "learning_rate": 4.793218008935732e-06, + "logits/chosen": 13.176595687866211, + "logits/rejected": 4.990189552307129, + "logps/chosen": -277.080322265625, + "logps/rejected": -187.01858520507812, + "loss": 0.6347, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.06547775119543076, + "rewards/margins": 0.19429269433021545, + "rewards/rejected": -0.1288149356842041, + "step": 2662 + }, + { + "epoch": 0.41183065919195827, + "grad_norm": 5.452521800994873, + "learning_rate": 4.7929316072860586e-06, + "logits/chosen": 12.210566520690918, + "logits/rejected": 5.91943359375, + "logps/chosen": -445.83416748046875, + "logps/rejected": -314.81378173828125, + "loss": 0.5722, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.34836122393608093, + "rewards/margins": 0.37429410219192505, + "rewards/rejected": -0.025932878255844116, + "step": 2663 + }, + { + "epoch": 0.4119853083317224, + "grad_norm": 3.8195464611053467, + "learning_rate": 4.792645205636385e-06, + "logits/chosen": 10.620893478393555, + "logits/rejected": 11.452146530151367, + "logps/chosen": -142.13693237304688, + "logps/rejected": -149.37869262695312, + "loss": 0.6116, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.12462678551673889, + "rewards/margins": 0.21381878852844238, + "rewards/rejected": -0.3384455740451813, + "step": 2664 + }, + { + "epoch": 0.4121399574714866, + "grad_norm": 3.477503776550293, + "learning_rate": 4.792358803986711e-06, + "logits/chosen": 9.744803428649902, + "logits/rejected": 7.133264541625977, + "logps/chosen": -216.46688842773438, + "logps/rejected": -205.63217163085938, + "loss": 0.5829, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.09257908910512924, + "rewards/margins": 0.29249659180641174, + "rewards/rejected": -0.1999175101518631, + "step": 2665 + }, + { + "epoch": 0.4122946066112507, + "grad_norm": 4.316352844238281, + "learning_rate": 4.792072402337038e-06, + "logits/chosen": 12.296398162841797, + "logits/rejected": 5.777896881103516, + "logps/chosen": -380.7566833496094, + "logps/rejected": -261.357421875, + "loss": 0.5551, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.2547001838684082, + "rewards/margins": 0.38686132431030273, + "rewards/rejected": -0.13216114044189453, + "step": 2666 + }, + { + "epoch": 0.4124492557510149, + "grad_norm": 4.193288326263428, + "learning_rate": 4.791786000687364e-06, + "logits/chosen": 2.1880016326904297, + "logits/rejected": 6.825901031494141, + "logps/chosen": -130.9357452392578, + "logps/rejected": -174.4420166015625, + "loss": 0.6474, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.25551897287368774, + "rewards/margins": 0.13312536478042603, + "rewards/rejected": -0.38864439725875854, + "step": 2667 + }, + { + "epoch": 0.412603904890779, + "grad_norm": 7.0151519775390625, + "learning_rate": 4.791499599037691e-06, + "logits/chosen": 11.798147201538086, + "logits/rejected": 7.212460994720459, + "logps/chosen": -265.9297790527344, + "logps/rejected": -176.83584594726562, + "loss": 0.6224, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.21902886033058167, + "rewards/margins": 0.3132617473602295, + "rewards/rejected": -0.09423283487558365, + "step": 2668 + }, + { + "epoch": 0.4127585540305432, + "grad_norm": 5.6192708015441895, + "learning_rate": 4.791213197388017e-06, + "logits/chosen": 10.524545669555664, + "logits/rejected": 12.62818717956543, + "logps/chosen": -235.33456420898438, + "logps/rejected": -340.3367004394531, + "loss": 0.6069, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.2112850397825241, + "rewards/margins": 0.30751287937164307, + "rewards/rejected": -0.09622783958911896, + "step": 2669 + }, + { + "epoch": 0.4129132031703074, + "grad_norm": 6.5952019691467285, + "learning_rate": 4.7909267957383434e-06, + "logits/chosen": 8.100357055664062, + "logits/rejected": 6.451948165893555, + "logps/chosen": -295.75579833984375, + "logps/rejected": -313.97100830078125, + "loss": 0.6492, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.08507698774337769, + "rewards/margins": 0.11653098464012146, + "rewards/rejected": -0.03145400807261467, + "step": 2670 + }, + { + "epoch": 0.41306785231007154, + "grad_norm": 6.971599102020264, + "learning_rate": 4.79064039408867e-06, + "logits/chosen": 10.957653045654297, + "logits/rejected": 15.8810453414917, + "logps/chosen": -211.7451629638672, + "logps/rejected": -206.51112365722656, + "loss": 0.803, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.16833312809467316, + "rewards/margins": -0.09789979457855225, + "rewards/rejected": -0.07043331861495972, + "step": 2671 + }, + { + "epoch": 0.4132225014498357, + "grad_norm": 4.678345680236816, + "learning_rate": 4.790353992438997e-06, + "logits/chosen": 9.228696823120117, + "logits/rejected": 9.208124160766602, + "logps/chosen": -190.8668670654297, + "logps/rejected": -212.9173126220703, + "loss": 0.6254, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.0887783020734787, + "rewards/margins": 0.23429659008979797, + "rewards/rejected": -0.1455182582139969, + "step": 2672 + }, + { + "epoch": 0.41337715058959984, + "grad_norm": 5.304049968719482, + "learning_rate": 4.790067590789323e-06, + "logits/chosen": 7.364456653594971, + "logits/rejected": 7.57342529296875, + "logps/chosen": -194.13265991210938, + "logps/rejected": -197.5412139892578, + "loss": 0.7354, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.036277107894420624, + "rewards/margins": -0.020879503339529037, + "rewards/rejected": -0.01539759710431099, + "step": 2673 + }, + { + "epoch": 0.413531799729364, + "grad_norm": 5.420442581176758, + "learning_rate": 4.78978118913965e-06, + "logits/chosen": 8.265457153320312, + "logits/rejected": 3.7377610206604004, + "logps/chosen": -293.60052490234375, + "logps/rejected": -181.9404296875, + "loss": 0.6926, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.0356023907661438, + "rewards/margins": 0.11932381987571716, + "rewards/rejected": -0.15492619574069977, + "step": 2674 + }, + { + "epoch": 0.41368644886912814, + "grad_norm": 5.860610008239746, + "learning_rate": 4.789494787489976e-06, + "logits/chosen": 6.292856216430664, + "logits/rejected": 7.976338863372803, + "logps/chosen": -262.763916015625, + "logps/rejected": -285.5487060546875, + "loss": 0.6629, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.04891301691532135, + "rewards/margins": 0.24372237920761108, + "rewards/rejected": -0.19480934739112854, + "step": 2675 + }, + { + "epoch": 0.41384109800889235, + "grad_norm": 7.555635452270508, + "learning_rate": 4.7892083858403025e-06, + "logits/chosen": 14.162042617797852, + "logits/rejected": 10.465352058410645, + "logps/chosen": -332.97528076171875, + "logps/rejected": -391.433349609375, + "loss": 0.5916, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.06017686426639557, + "rewards/margins": 0.30433887243270874, + "rewards/rejected": -0.24416202306747437, + "step": 2676 + }, + { + "epoch": 0.4139957471486565, + "grad_norm": 6.041835308074951, + "learning_rate": 4.788921984190629e-06, + "logits/chosen": 7.283111572265625, + "logits/rejected": 2.9537463188171387, + "logps/chosen": -291.0113220214844, + "logps/rejected": -180.46897888183594, + "loss": 0.6104, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.14422979950904846, + "rewards/margins": 0.18445970118045807, + "rewards/rejected": -0.3286895155906677, + "step": 2677 + }, + { + "epoch": 0.41415039628842065, + "grad_norm": 4.959870338439941, + "learning_rate": 4.788635582540956e-06, + "logits/chosen": 11.053314208984375, + "logits/rejected": 10.915766716003418, + "logps/chosen": -206.71646118164062, + "logps/rejected": -220.74928283691406, + "loss": 0.6833, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.023371180519461632, + "rewards/margins": 0.1551080048084259, + "rewards/rejected": -0.1317368447780609, + "step": 2678 + }, + { + "epoch": 0.4143050454281848, + "grad_norm": 5.344726085662842, + "learning_rate": 4.7883491808912824e-06, + "logits/chosen": 11.77116584777832, + "logits/rejected": 5.133759021759033, + "logps/chosen": -336.708251953125, + "logps/rejected": -335.4376220703125, + "loss": 0.6371, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.18181678652763367, + "rewards/margins": 0.2018643319606781, + "rewards/rejected": -0.02004757523536682, + "step": 2679 + }, + { + "epoch": 0.41445969456794896, + "grad_norm": 5.46013879776001, + "learning_rate": 4.788062779241609e-06, + "logits/chosen": 6.188963890075684, + "logits/rejected": 7.754343032836914, + "logps/chosen": -220.2676239013672, + "logps/rejected": -192.42047119140625, + "loss": 0.7615, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.24339866638183594, + "rewards/margins": 0.06646917760372162, + "rewards/rejected": -0.30986788868904114, + "step": 2680 + }, + { + "epoch": 0.4146143437077131, + "grad_norm": 6.969061374664307, + "learning_rate": 4.787776377591936e-06, + "logits/chosen": 9.019637107849121, + "logits/rejected": 6.001626014709473, + "logps/chosen": -383.18328857421875, + "logps/rejected": -338.8443603515625, + "loss": 0.6931, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.00911402702331543, + "rewards/margins": 0.10183516144752502, + "rewards/rejected": -0.092721126973629, + "step": 2681 + }, + { + "epoch": 0.4147689928474773, + "grad_norm": 5.407541751861572, + "learning_rate": 4.7874899759422615e-06, + "logits/chosen": 6.0520524978637695, + "logits/rejected": 5.730321407318115, + "logps/chosen": -280.2451171875, + "logps/rejected": -221.66537475585938, + "loss": 0.7308, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.21384906768798828, + "rewards/margins": 0.059087567031383514, + "rewards/rejected": -0.2729366421699524, + "step": 2682 + }, + { + "epoch": 0.41492364198724146, + "grad_norm": 3.819279909133911, + "learning_rate": 4.787203574292588e-06, + "logits/chosen": 5.031618118286133, + "logits/rejected": 2.9987237453460693, + "logps/chosen": -186.62782287597656, + "logps/rejected": -162.77984619140625, + "loss": 0.49, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.2552463412284851, + "rewards/margins": 0.5567307472229004, + "rewards/rejected": -0.3014844059944153, + "step": 2683 + }, + { + "epoch": 0.4150782911270056, + "grad_norm": 5.949801445007324, + "learning_rate": 4.786917172642915e-06, + "logits/chosen": 10.622173309326172, + "logits/rejected": 9.031545639038086, + "logps/chosen": -301.82354736328125, + "logps/rejected": -338.3058166503906, + "loss": 0.6469, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.13022488355636597, + "rewards/margins": 0.18733233213424683, + "rewards/rejected": -0.05710745230317116, + "step": 2684 + }, + { + "epoch": 0.41523294026676977, + "grad_norm": 5.231034278869629, + "learning_rate": 4.7866307709932415e-06, + "logits/chosen": 7.656411170959473, + "logits/rejected": 7.173588275909424, + "logps/chosen": -267.7090148925781, + "logps/rejected": -265.0752258300781, + "loss": 0.5862, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.17060573399066925, + "rewards/margins": 0.28513848781585693, + "rewards/rejected": -0.11453276872634888, + "step": 2685 + }, + { + "epoch": 0.4153875894065339, + "grad_norm": 6.427034854888916, + "learning_rate": 4.786344369343568e-06, + "logits/chosen": 7.376415252685547, + "logits/rejected": 7.928295135498047, + "logps/chosen": -248.8770294189453, + "logps/rejected": -314.45965576171875, + "loss": 0.6753, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.08439035713672638, + "rewards/margins": 0.08876930177211761, + "rewards/rejected": -0.004378937184810638, + "step": 2686 + }, + { + "epoch": 0.41554223854629807, + "grad_norm": 5.761512279510498, + "learning_rate": 4.786057967693895e-06, + "logits/chosen": 4.417949676513672, + "logits/rejected": 7.835887908935547, + "logps/chosen": -222.18365478515625, + "logps/rejected": -251.08248901367188, + "loss": 0.7497, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.026735767722129822, + "rewards/margins": -0.0775153860449791, + "rewards/rejected": 0.050779618322849274, + "step": 2687 + }, + { + "epoch": 0.4156968876860622, + "grad_norm": 4.53059720993042, + "learning_rate": 4.785771566044221e-06, + "logits/chosen": 12.640189170837402, + "logits/rejected": 5.8292646408081055, + "logps/chosen": -327.42340087890625, + "logps/rejected": -217.5017547607422, + "loss": 0.5616, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.48154470324516296, + "rewards/margins": 0.3492458164691925, + "rewards/rejected": 0.13229885697364807, + "step": 2688 + }, + { + "epoch": 0.41585153682582643, + "grad_norm": 4.212057590484619, + "learning_rate": 4.785485164394547e-06, + "logits/chosen": 13.842592239379883, + "logits/rejected": 7.01350212097168, + "logps/chosen": -367.5593566894531, + "logps/rejected": -265.8381652832031, + "loss": 0.4781, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.3196007311344147, + "rewards/margins": 0.6862646341323853, + "rewards/rejected": -0.36666393280029297, + "step": 2689 + }, + { + "epoch": 0.4160061859655906, + "grad_norm": 5.01682186126709, + "learning_rate": 4.785198762744874e-06, + "logits/chosen": 15.273069381713867, + "logits/rejected": 4.664915084838867, + "logps/chosen": -540.3794555664062, + "logps/rejected": -164.26556396484375, + "loss": 0.549, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.2815825045108795, + "rewards/margins": 0.3410099744796753, + "rewards/rejected": -0.059427469968795776, + "step": 2690 + }, + { + "epoch": 0.41616083510535473, + "grad_norm": 4.6780219078063965, + "learning_rate": 4.7849123610952005e-06, + "logits/chosen": 11.537569999694824, + "logits/rejected": 6.925926685333252, + "logps/chosen": -293.36236572265625, + "logps/rejected": -263.9833984375, + "loss": 0.5832, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.10317935794591904, + "rewards/margins": 0.3733992576599121, + "rewards/rejected": -0.2702198922634125, + "step": 2691 + }, + { + "epoch": 0.4163154842451189, + "grad_norm": 5.195574760437012, + "learning_rate": 4.784625959445527e-06, + "logits/chosen": 8.589856147766113, + "logits/rejected": 3.791367292404175, + "logps/chosen": -295.43475341796875, + "logps/rejected": -266.682861328125, + "loss": 0.4888, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.13618162274360657, + "rewards/margins": 0.6136234402656555, + "rewards/rejected": -0.7498050928115845, + "step": 2692 + }, + { + "epoch": 0.41647013338488303, + "grad_norm": 4.422612190246582, + "learning_rate": 4.784339557795854e-06, + "logits/chosen": 10.778101921081543, + "logits/rejected": 8.08666706085205, + "logps/chosen": -283.65826416015625, + "logps/rejected": -219.94129943847656, + "loss": 0.5003, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.18425625562667847, + "rewards/margins": 0.6662859916687012, + "rewards/rejected": -0.8505423069000244, + "step": 2693 + }, + { + "epoch": 0.4166247825246472, + "grad_norm": 6.373537540435791, + "learning_rate": 4.78405315614618e-06, + "logits/chosen": 11.748650550842285, + "logits/rejected": 5.870093822479248, + "logps/chosen": -463.2820129394531, + "logps/rejected": -372.34283447265625, + "loss": 0.5473, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.22026681900024414, + "rewards/margins": 0.5614244937896729, + "rewards/rejected": -0.34115761518478394, + "step": 2694 + }, + { + "epoch": 0.4167794316644114, + "grad_norm": 6.2711896896362305, + "learning_rate": 4.783766754496506e-06, + "logits/chosen": 6.283385276794434, + "logits/rejected": 4.548773288726807, + "logps/chosen": -265.0761413574219, + "logps/rejected": -314.5213623046875, + "loss": 0.7434, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.05093931779265404, + "rewards/margins": -0.032683007419109344, + "rewards/rejected": 0.08362233638763428, + "step": 2695 + }, + { + "epoch": 0.41693408080417554, + "grad_norm": 5.825169563293457, + "learning_rate": 4.783480352846833e-06, + "logits/chosen": 3.1713123321533203, + "logits/rejected": 4.2689971923828125, + "logps/chosen": -262.2401123046875, + "logps/rejected": -265.037109375, + "loss": 0.5856, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.14730964601039886, + "rewards/margins": 0.2898842692375183, + "rewards/rejected": -0.4371938705444336, + "step": 2696 + }, + { + "epoch": 0.4170887299439397, + "grad_norm": 5.386709213256836, + "learning_rate": 4.78319395119716e-06, + "logits/chosen": 6.719409465789795, + "logits/rejected": 5.37785530090332, + "logps/chosen": -203.09637451171875, + "logps/rejected": -224.99574279785156, + "loss": 0.7086, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.10430718958377838, + "rewards/margins": 0.03172749653458595, + "rewards/rejected": -0.13603466749191284, + "step": 2697 + }, + { + "epoch": 0.41724337908370385, + "grad_norm": 6.9432525634765625, + "learning_rate": 4.782907549547485e-06, + "logits/chosen": 8.518115043640137, + "logits/rejected": 4.223296642303467, + "logps/chosen": -364.51220703125, + "logps/rejected": -264.8878479003906, + "loss": 0.6808, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.020981982350349426, + "rewards/margins": 0.1203949898481369, + "rewards/rejected": -0.09941300749778748, + "step": 2698 + }, + { + "epoch": 0.417398028223468, + "grad_norm": 3.7915279865264893, + "learning_rate": 4.782621147897812e-06, + "logits/chosen": 4.608911991119385, + "logits/rejected": -3.6935369968414307, + "logps/chosen": -225.62054443359375, + "logps/rejected": -175.47084045410156, + "loss": 0.4539, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.11719133704900742, + "rewards/margins": 0.7321680784225464, + "rewards/rejected": -0.6149767637252808, + "step": 2699 + }, + { + "epoch": 0.41755267736323215, + "grad_norm": 5.045612335205078, + "learning_rate": 4.782334746248139e-06, + "logits/chosen": 6.090024471282959, + "logits/rejected": 5.915399551391602, + "logps/chosen": -267.00274658203125, + "logps/rejected": -267.0867004394531, + "loss": 0.5902, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.056253623217344284, + "rewards/margins": 0.2599541246891022, + "rewards/rejected": -0.31620773673057556, + "step": 2700 + }, + { + "epoch": 0.4177073265029963, + "grad_norm": 6.319419860839844, + "learning_rate": 4.782048344598465e-06, + "logits/chosen": 11.35316276550293, + "logits/rejected": -1.763852596282959, + "logps/chosen": -402.2293395996094, + "logps/rejected": -195.75038146972656, + "loss": 0.6435, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.2728157937526703, + "rewards/margins": 0.25665366649627686, + "rewards/rejected": -0.5294694900512695, + "step": 2701 + }, + { + "epoch": 0.4178619756427605, + "grad_norm": 6.286221504211426, + "learning_rate": 4.781761942948792e-06, + "logits/chosen": 8.281381607055664, + "logits/rejected": 7.197473526000977, + "logps/chosen": -289.343017578125, + "logps/rejected": -248.4585723876953, + "loss": 0.6846, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.14469853043556213, + "rewards/margins": 0.046721309423446655, + "rewards/rejected": -0.1914198398590088, + "step": 2702 + }, + { + "epoch": 0.41801662478252466, + "grad_norm": 5.062898635864258, + "learning_rate": 4.781475541299118e-06, + "logits/chosen": 11.330883026123047, + "logits/rejected": -0.18748599290847778, + "logps/chosen": -462.5653381347656, + "logps/rejected": -245.14747619628906, + "loss": 0.5265, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.49622058868408203, + "rewards/margins": 0.517587423324585, + "rewards/rejected": -0.02136683464050293, + "step": 2703 + }, + { + "epoch": 0.4181712739222888, + "grad_norm": 4.331087112426758, + "learning_rate": 4.7811891396494445e-06, + "logits/chosen": 12.954935073852539, + "logits/rejected": 13.768539428710938, + "logps/chosen": -214.82540893554688, + "logps/rejected": -249.99374389648438, + "loss": 0.6415, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.03949194401502609, + "rewards/margins": 0.11332221329212189, + "rewards/rejected": -0.0738302618265152, + "step": 2704 + }, + { + "epoch": 0.41832592306205296, + "grad_norm": 4.418230056762695, + "learning_rate": 4.780902737999771e-06, + "logits/chosen": 10.763041496276855, + "logits/rejected": 8.676929473876953, + "logps/chosen": -308.13031005859375, + "logps/rejected": -288.34442138671875, + "loss": 0.4531, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.13142453134059906, + "rewards/margins": 0.5743554830551147, + "rewards/rejected": -0.44293099641799927, + "step": 2705 + }, + { + "epoch": 0.4184805722018171, + "grad_norm": 5.628113269805908, + "learning_rate": 4.780616336350098e-06, + "logits/chosen": 8.764001846313477, + "logits/rejected": 6.977086544036865, + "logps/chosen": -185.44674682617188, + "logps/rejected": -193.530517578125, + "loss": 0.8034, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.2779386639595032, + "rewards/margins": -0.16259542107582092, + "rewards/rejected": -0.11534324288368225, + "step": 2706 + }, + { + "epoch": 0.41863522134158127, + "grad_norm": 4.679571628570557, + "learning_rate": 4.780329934700424e-06, + "logits/chosen": 14.376748085021973, + "logits/rejected": 12.397417068481445, + "logps/chosen": -323.1869812011719, + "logps/rejected": -276.1672668457031, + "loss": 0.6319, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.20652762055397034, + "rewards/margins": 0.2234170138835907, + "rewards/rejected": -0.01688939332962036, + "step": 2707 + }, + { + "epoch": 0.41878987048134547, + "grad_norm": 5.90140438079834, + "learning_rate": 4.78004353305075e-06, + "logits/chosen": 14.55916976928711, + "logits/rejected": 12.281283378601074, + "logps/chosen": -320.0032043457031, + "logps/rejected": -313.7568664550781, + "loss": 0.729, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.14047031104564667, + "rewards/margins": 0.04747982323169708, + "rewards/rejected": -0.18795014917850494, + "step": 2708 + }, + { + "epoch": 0.4189445196211096, + "grad_norm": 5.6167473793029785, + "learning_rate": 4.779757131401077e-06, + "logits/chosen": 9.743917465209961, + "logits/rejected": 7.455842971801758, + "logps/chosen": -329.2607727050781, + "logps/rejected": -256.277587890625, + "loss": 0.605, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.0670848935842514, + "rewards/margins": 0.254169762134552, + "rewards/rejected": -0.3212546408176422, + "step": 2709 + }, + { + "epoch": 0.4190991687608738, + "grad_norm": 9.54301643371582, + "learning_rate": 4.7794707297514035e-06, + "logits/chosen": 9.22585678100586, + "logits/rejected": 6.81712532043457, + "logps/chosen": -280.6036071777344, + "logps/rejected": -287.950927734375, + "loss": 0.5549, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.21857662498950958, + "rewards/margins": 0.42286786437034607, + "rewards/rejected": -0.2042911946773529, + "step": 2710 + }, + { + "epoch": 0.4192538179006379, + "grad_norm": 7.372808456420898, + "learning_rate": 4.77918432810173e-06, + "logits/chosen": 10.510807991027832, + "logits/rejected": 6.736120223999023, + "logps/chosen": -263.6864013671875, + "logps/rejected": -315.4885559082031, + "loss": 0.8089, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.11675453186035156, + "rewards/margins": -0.10326775908470154, + "rewards/rejected": -0.013486750423908234, + "step": 2711 + }, + { + "epoch": 0.4194084670404021, + "grad_norm": 5.351847171783447, + "learning_rate": 4.778897926452057e-06, + "logits/chosen": 12.728840827941895, + "logits/rejected": 9.63144588470459, + "logps/chosen": -285.2611083984375, + "logps/rejected": -231.7850341796875, + "loss": 0.7402, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.2631918787956238, + "rewards/margins": -0.04113469272851944, + "rewards/rejected": 0.304326593875885, + "step": 2712 + }, + { + "epoch": 0.41956311618016623, + "grad_norm": 6.6852803230285645, + "learning_rate": 4.7786115248023835e-06, + "logits/chosen": -0.5082281827926636, + "logits/rejected": 2.6910643577575684, + "logps/chosen": -205.9815216064453, + "logps/rejected": -227.4179229736328, + "loss": 0.5989, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.10439737141132355, + "rewards/margins": 0.257002055644989, + "rewards/rejected": -0.15260466933250427, + "step": 2713 + }, + { + "epoch": 0.4197177653199304, + "grad_norm": 4.424941062927246, + "learning_rate": 4.77832512315271e-06, + "logits/chosen": 14.498956680297852, + "logits/rejected": 12.126620292663574, + "logps/chosen": -296.50799560546875, + "logps/rejected": -249.41639709472656, + "loss": 0.4657, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.2681650221347809, + "rewards/margins": 0.7185261249542236, + "rewards/rejected": -0.45036107301712036, + "step": 2714 + }, + { + "epoch": 0.4198724144596946, + "grad_norm": 8.645773887634277, + "learning_rate": 4.778038721503036e-06, + "logits/chosen": 7.685153961181641, + "logits/rejected": 8.528714179992676, + "logps/chosen": -344.1767578125, + "logps/rejected": -278.14825439453125, + "loss": 0.8658, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.5705689191818237, + "rewards/margins": -0.10321656614542007, + "rewards/rejected": -0.46735236048698425, + "step": 2715 + }, + { + "epoch": 0.42002706359945874, + "grad_norm": 5.718757629394531, + "learning_rate": 4.7777523198533626e-06, + "logits/chosen": 11.820518493652344, + "logits/rejected": 7.936539173126221, + "logps/chosen": -295.724853515625, + "logps/rejected": -233.44459533691406, + "loss": 0.7412, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.09451419115066528, + "rewards/margins": -0.010176517069339752, + "rewards/rejected": -0.08433766663074493, + "step": 2716 + }, + { + "epoch": 0.4201817127392229, + "grad_norm": 8.840944290161133, + "learning_rate": 4.777465918203689e-06, + "logits/chosen": 11.691082000732422, + "logits/rejected": 4.844560146331787, + "logps/chosen": -317.56500244140625, + "logps/rejected": -262.15966796875, + "loss": 0.5694, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.23842407763004303, + "rewards/margins": 0.3859137296676636, + "rewards/rejected": -0.14748963713645935, + "step": 2717 + }, + { + "epoch": 0.42033636187898704, + "grad_norm": 4.4735493659973145, + "learning_rate": 4.777179516554016e-06, + "logits/chosen": 14.208394050598145, + "logits/rejected": 6.30283784866333, + "logps/chosen": -357.7921142578125, + "logps/rejected": -321.6224060058594, + "loss": 0.4229, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.18252897262573242, + "rewards/margins": 0.895096480846405, + "rewards/rejected": -0.7125673890113831, + "step": 2718 + }, + { + "epoch": 0.4204910110187512, + "grad_norm": 5.160461902618408, + "learning_rate": 4.7768931149043425e-06, + "logits/chosen": 7.7447052001953125, + "logits/rejected": 2.7460551261901855, + "logps/chosen": -241.39859008789062, + "logps/rejected": -226.61546325683594, + "loss": 0.5778, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.13145749270915985, + "rewards/margins": 0.2700778543949127, + "rewards/rejected": -0.4015353322029114, + "step": 2719 + }, + { + "epoch": 0.42064566015851534, + "grad_norm": 5.89119815826416, + "learning_rate": 4.776606713254669e-06, + "logits/chosen": 9.629146575927734, + "logits/rejected": 12.043877601623535, + "logps/chosen": -205.89869689941406, + "logps/rejected": -200.8662872314453, + "loss": 0.8836, + "rewards/accuracies": 0.125, + "rewards/chosen": -0.3858489990234375, + "rewards/margins": -0.2856106758117676, + "rewards/rejected": -0.10023832321166992, + "step": 2720 + }, + { + "epoch": 0.42080030929827955, + "grad_norm": 8.336956024169922, + "learning_rate": 4.776320311604995e-06, + "logits/chosen": 6.635457992553711, + "logits/rejected": 4.65915584564209, + "logps/chosen": -427.4053039550781, + "logps/rejected": -474.78729248046875, + "loss": 0.5444, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.3576936721801758, + "rewards/margins": 0.3945466876029968, + "rewards/rejected": -0.03685302287340164, + "step": 2721 + }, + { + "epoch": 0.4209549584380437, + "grad_norm": 5.169402599334717, + "learning_rate": 4.776033909955322e-06, + "logits/chosen": 7.739950180053711, + "logits/rejected": 5.9468278884887695, + "logps/chosen": -302.6673583984375, + "logps/rejected": -233.45791625976562, + "loss": 0.589, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.024704553186893463, + "rewards/margins": 0.2438000738620758, + "rewards/rejected": -0.21909551322460175, + "step": 2722 + }, + { + "epoch": 0.42110960757780785, + "grad_norm": 4.91262674331665, + "learning_rate": 4.775747508305648e-06, + "logits/chosen": 12.377195358276367, + "logits/rejected": 12.8310546875, + "logps/chosen": -260.36761474609375, + "logps/rejected": -311.017333984375, + "loss": 0.6212, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.2523934245109558, + "rewards/margins": 0.28162336349487305, + "rewards/rejected": -0.02922992780804634, + "step": 2723 + }, + { + "epoch": 0.421264256717572, + "grad_norm": 6.092989921569824, + "learning_rate": 4.775461106655975e-06, + "logits/chosen": 9.967718124389648, + "logits/rejected": 7.425543785095215, + "logps/chosen": -416.6290283203125, + "logps/rejected": -356.41510009765625, + "loss": 0.6552, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.1927390992641449, + "rewards/margins": 0.1469866931438446, + "rewards/rejected": 0.04575243219733238, + "step": 2724 + }, + { + "epoch": 0.42141890585733616, + "grad_norm": 3.4592785835266113, + "learning_rate": 4.775174705006302e-06, + "logits/chosen": 9.121768951416016, + "logits/rejected": 3.0271830558776855, + "logps/chosen": -259.8033447265625, + "logps/rejected": -153.08914184570312, + "loss": 0.5117, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.06965846568346024, + "rewards/margins": 0.5046188235282898, + "rewards/rejected": -0.43496036529541016, + "step": 2725 + }, + { + "epoch": 0.4215735549971003, + "grad_norm": 4.231537818908691, + "learning_rate": 4.774888303356628e-06, + "logits/chosen": 8.927241325378418, + "logits/rejected": 3.7615315914154053, + "logps/chosen": -255.95484924316406, + "logps/rejected": -205.25143432617188, + "loss": 0.5355, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.3888920843601227, + "rewards/margins": 0.5090867280960083, + "rewards/rejected": -0.12019462883472443, + "step": 2726 + }, + { + "epoch": 0.4217282041368645, + "grad_norm": 4.578822612762451, + "learning_rate": 4.774601901706955e-06, + "logits/chosen": 13.02111530303955, + "logits/rejected": 12.118375778198242, + "logps/chosen": -377.34454345703125, + "logps/rejected": -291.754150390625, + "loss": 0.4507, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.3327385187149048, + "rewards/margins": 0.7138979434967041, + "rewards/rejected": -0.3811594247817993, + "step": 2727 + }, + { + "epoch": 0.42188285327662867, + "grad_norm": 4.192107677459717, + "learning_rate": 4.774315500057281e-06, + "logits/chosen": 11.796497344970703, + "logits/rejected": 3.850597381591797, + "logps/chosen": -337.1534118652344, + "logps/rejected": -192.74920654296875, + "loss": 0.492, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.25665825605392456, + "rewards/margins": 0.7990729212760925, + "rewards/rejected": -0.542414665222168, + "step": 2728 + }, + { + "epoch": 0.4220375024163928, + "grad_norm": 6.299352645874023, + "learning_rate": 4.774029098407607e-06, + "logits/chosen": 6.67086124420166, + "logits/rejected": 5.477456569671631, + "logps/chosen": -231.43360900878906, + "logps/rejected": -253.68856811523438, + "loss": 0.693, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.16059312224388123, + "rewards/margins": 0.030339710414409637, + "rewards/rejected": -0.19093284010887146, + "step": 2729 + }, + { + "epoch": 0.42219215155615697, + "grad_norm": 5.316195964813232, + "learning_rate": 4.773742696757934e-06, + "logits/chosen": 7.674101829528809, + "logits/rejected": 6.588985919952393, + "logps/chosen": -246.37860107421875, + "logps/rejected": -202.45803833007812, + "loss": 0.6541, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.1592136025428772, + "rewards/margins": 0.2028958797454834, + "rewards/rejected": -0.043682292103767395, + "step": 2730 + }, + { + "epoch": 0.4223468006959211, + "grad_norm": 7.087975025177002, + "learning_rate": 4.773456295108261e-06, + "logits/chosen": 8.891820907592773, + "logits/rejected": -0.0876539945602417, + "logps/chosen": -336.699951171875, + "logps/rejected": -224.86866760253906, + "loss": 0.828, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.07742289453744888, + "rewards/margins": -0.12086465954780579, + "rewards/rejected": 0.0434417761862278, + "step": 2731 + }, + { + "epoch": 0.42250144983568527, + "grad_norm": 4.648256301879883, + "learning_rate": 4.7731698934585864e-06, + "logits/chosen": 8.062427520751953, + "logits/rejected": 7.9964447021484375, + "logps/chosen": -209.71705627441406, + "logps/rejected": -194.92779541015625, + "loss": 0.546, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.24536506831645966, + "rewards/margins": 0.35593506693840027, + "rewards/rejected": -0.11057000607252121, + "step": 2732 + }, + { + "epoch": 0.4226560989754494, + "grad_norm": 6.438325881958008, + "learning_rate": 4.772883491808913e-06, + "logits/chosen": 10.194450378417969, + "logits/rejected": 6.947749137878418, + "logps/chosen": -349.1863098144531, + "logps/rejected": -264.14288330078125, + "loss": 0.6243, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.11034897714853287, + "rewards/margins": 0.2750949561595917, + "rewards/rejected": -0.16474595665931702, + "step": 2733 + }, + { + "epoch": 0.42281074811521363, + "grad_norm": 7.391777038574219, + "learning_rate": 4.77259709015924e-06, + "logits/chosen": 9.822986602783203, + "logits/rejected": 10.895857810974121, + "logps/chosen": -327.1275634765625, + "logps/rejected": -291.96185302734375, + "loss": 0.8514, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.07280893623828888, + "rewards/margins": -0.06047758460044861, + "rewards/rejected": -0.012331336736679077, + "step": 2734 + }, + { + "epoch": 0.4229653972549778, + "grad_norm": 7.118979454040527, + "learning_rate": 4.772310688509566e-06, + "logits/chosen": 9.729959487915039, + "logits/rejected": 7.374753475189209, + "logps/chosen": -159.21572875976562, + "logps/rejected": -147.21315002441406, + "loss": 0.6117, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.05866682529449463, + "rewards/margins": 0.341379314661026, + "rewards/rejected": -0.400046169757843, + "step": 2735 + }, + { + "epoch": 0.42312004639474193, + "grad_norm": 7.149129390716553, + "learning_rate": 4.772024286859892e-06, + "logits/chosen": 1.4898169040679932, + "logits/rejected": 4.360662460327148, + "logps/chosen": -315.31103515625, + "logps/rejected": -289.0843505859375, + "loss": 0.795, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.12909287214279175, + "rewards/margins": -0.0901762917637825, + "rewards/rejected": 0.21926920115947723, + "step": 2736 + }, + { + "epoch": 0.4232746955345061, + "grad_norm": 4.002818584442139, + "learning_rate": 4.771737885210219e-06, + "logits/chosen": 13.609821319580078, + "logits/rejected": 11.763874053955078, + "logps/chosen": -232.28904724121094, + "logps/rejected": -215.25314331054688, + "loss": 0.5194, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.07355241477489471, + "rewards/margins": 0.41456544399261475, + "rewards/rejected": -0.3410130739212036, + "step": 2737 + }, + { + "epoch": 0.42342934467427024, + "grad_norm": 6.173304557800293, + "learning_rate": 4.7714514835605455e-06, + "logits/chosen": 7.67469596862793, + "logits/rejected": 6.299157619476318, + "logps/chosen": -343.06353759765625, + "logps/rejected": -363.572998046875, + "loss": 0.6818, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.04554472118616104, + "rewards/margins": 0.09363280236721039, + "rewards/rejected": -0.04808807373046875, + "step": 2738 + }, + { + "epoch": 0.4235839938140344, + "grad_norm": 16.468717575073242, + "learning_rate": 4.771165081910872e-06, + "logits/chosen": 7.102536201477051, + "logits/rejected": 5.167606830596924, + "logps/chosen": -300.58056640625, + "logps/rejected": -406.4028015136719, + "loss": 0.9323, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.28924620151519775, + "rewards/margins": -0.31365644931793213, + "rewards/rejected": 0.024410255253314972, + "step": 2739 + }, + { + "epoch": 0.4237386429537986, + "grad_norm": 4.076026439666748, + "learning_rate": 4.770878680261199e-06, + "logits/chosen": 5.457551956176758, + "logits/rejected": 7.410901069641113, + "logps/chosen": -210.39723205566406, + "logps/rejected": -216.5321807861328, + "loss": 0.5563, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.22935137152671814, + "rewards/margins": 0.40673646330833435, + "rewards/rejected": -0.6360877752304077, + "step": 2740 + }, + { + "epoch": 0.42389329209356275, + "grad_norm": 5.155257225036621, + "learning_rate": 4.770592278611525e-06, + "logits/chosen": 12.875899314880371, + "logits/rejected": 6.465787887573242, + "logps/chosen": -288.6136779785156, + "logps/rejected": -194.6484832763672, + "loss": 0.6288, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.08096771687269211, + "rewards/margins": 0.21717016398906708, + "rewards/rejected": -0.2981378734111786, + "step": 2741 + }, + { + "epoch": 0.4240479412333269, + "grad_norm": 4.6979804039001465, + "learning_rate": 4.770305876961851e-06, + "logits/chosen": 8.04547119140625, + "logits/rejected": 8.203258514404297, + "logps/chosen": -443.29681396484375, + "logps/rejected": -622.5830078125, + "loss": 0.415, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.5322229266166687, + "rewards/margins": 0.7668886780738831, + "rewards/rejected": -0.23466576635837555, + "step": 2742 + }, + { + "epoch": 0.42420259037309105, + "grad_norm": 4.5598673820495605, + "learning_rate": 4.770019475312178e-06, + "logits/chosen": 5.1866135597229, + "logits/rejected": 10.363449096679688, + "logps/chosen": -173.93594360351562, + "logps/rejected": -195.01580810546875, + "loss": 0.7764, + "rewards/accuracies": 0.25, + "rewards/chosen": -0.375881552696228, + "rewards/margins": -0.12658080458641052, + "rewards/rejected": -0.24930071830749512, + "step": 2743 + }, + { + "epoch": 0.4243572395128552, + "grad_norm": 4.098367691040039, + "learning_rate": 4.7697330736625045e-06, + "logits/chosen": 10.002933502197266, + "logits/rejected": 8.631038665771484, + "logps/chosen": -159.1553192138672, + "logps/rejected": -164.67227172851562, + "loss": 0.6615, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.03972359746694565, + "rewards/margins": 0.12288478761911392, + "rewards/rejected": -0.16260838508605957, + "step": 2744 + }, + { + "epoch": 0.42451188865261935, + "grad_norm": 5.395009994506836, + "learning_rate": 4.769446672012831e-06, + "logits/chosen": 5.2226338386535645, + "logits/rejected": 2.028899908065796, + "logps/chosen": -316.1319885253906, + "logps/rejected": -284.5406494140625, + "loss": 0.5824, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.4411565661430359, + "rewards/margins": 0.27461639046669006, + "rewards/rejected": -0.7157729864120483, + "step": 2745 + }, + { + "epoch": 0.4246665377923835, + "grad_norm": 4.257770538330078, + "learning_rate": 4.769160270363158e-06, + "logits/chosen": 8.514404296875, + "logits/rejected": 9.623875617980957, + "logps/chosen": -272.18939208984375, + "logps/rejected": -290.9097595214844, + "loss": 0.6206, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.10785437375307083, + "rewards/margins": 0.306676983833313, + "rewards/rejected": -0.19882263243198395, + "step": 2746 + }, + { + "epoch": 0.4248211869321477, + "grad_norm": 9.1245698928833, + "learning_rate": 4.7688738687134845e-06, + "logits/chosen": 11.109613418579102, + "logits/rejected": 5.702695846557617, + "logps/chosen": -410.4515075683594, + "logps/rejected": -320.5570983886719, + "loss": 0.7255, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.06999436020851135, + "rewards/margins": 0.008247099816799164, + "rewards/rejected": 0.06174727529287338, + "step": 2747 + }, + { + "epoch": 0.42497583607191186, + "grad_norm": 3.340402126312256, + "learning_rate": 4.76858746706381e-06, + "logits/chosen": 9.341480255126953, + "logits/rejected": 5.266529083251953, + "logps/chosen": -221.44529724121094, + "logps/rejected": -157.51185607910156, + "loss": 0.4104, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.2109074592590332, + "rewards/margins": 0.7160725593566895, + "rewards/rejected": -0.9269800186157227, + "step": 2748 + }, + { + "epoch": 0.425130485211676, + "grad_norm": 7.31988525390625, + "learning_rate": 4.768301065414137e-06, + "logits/chosen": 11.002527236938477, + "logits/rejected": 8.636887550354004, + "logps/chosen": -331.5923156738281, + "logps/rejected": -307.7025146484375, + "loss": 0.8956, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.3867785930633545, + "rewards/margins": -0.22611822187900543, + "rewards/rejected": -0.16066037118434906, + "step": 2749 + }, + { + "epoch": 0.42528513435144016, + "grad_norm": 5.898653030395508, + "learning_rate": 4.768014663764464e-06, + "logits/chosen": 13.53886604309082, + "logits/rejected": 8.136905670166016, + "logps/chosen": -326.3847961425781, + "logps/rejected": -262.4565124511719, + "loss": 0.644, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.40300822257995605, + "rewards/margins": 0.22316938638687134, + "rewards/rejected": -0.6261776089668274, + "step": 2750 + }, + { + "epoch": 0.4254397834912043, + "grad_norm": 5.246368885040283, + "learning_rate": 4.76772826211479e-06, + "logits/chosen": 8.735239028930664, + "logits/rejected": 4.377988815307617, + "logps/chosen": -263.0289306640625, + "logps/rejected": -202.62985229492188, + "loss": 0.5937, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.18815650045871735, + "rewards/margins": 0.24281644821166992, + "rewards/rejected": -0.43097296357154846, + "step": 2751 + }, + { + "epoch": 0.42559443263096847, + "grad_norm": 6.4957451820373535, + "learning_rate": 4.767441860465117e-06, + "logits/chosen": 14.04420280456543, + "logits/rejected": 8.219314575195312, + "logps/chosen": -389.6460876464844, + "logps/rejected": -237.43838500976562, + "loss": 0.7016, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.16693763434886932, + "rewards/margins": 0.04170088469982147, + "rewards/rejected": -0.208638533949852, + "step": 2752 + }, + { + "epoch": 0.4257490817707327, + "grad_norm": 8.037784576416016, + "learning_rate": 4.7671554588154436e-06, + "logits/chosen": 14.647222518920898, + "logits/rejected": 9.967000007629395, + "logps/chosen": -342.6969909667969, + "logps/rejected": -270.9526672363281, + "loss": 0.8012, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.3012911379337311, + "rewards/margins": 0.06553798168897629, + "rewards/rejected": -0.3668290972709656, + "step": 2753 + }, + { + "epoch": 0.4259037309104968, + "grad_norm": 5.5696797370910645, + "learning_rate": 4.766869057165769e-06, + "logits/chosen": 3.4403927326202393, + "logits/rejected": -1.1014904975891113, + "logps/chosen": -204.20413208007812, + "logps/rejected": -120.4216537475586, + "loss": 0.707, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.5116927623748779, + "rewards/margins": 0.09660043567419052, + "rewards/rejected": -0.6082931756973267, + "step": 2754 + }, + { + "epoch": 0.426058380050261, + "grad_norm": 5.144669055938721, + "learning_rate": 4.766582655516096e-06, + "logits/chosen": 14.354291915893555, + "logits/rejected": 13.039499282836914, + "logps/chosen": -303.9670104980469, + "logps/rejected": -264.73101806640625, + "loss": 0.684, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.30414876341819763, + "rewards/margins": 0.07078682631254196, + "rewards/rejected": -0.3749356269836426, + "step": 2755 + }, + { + "epoch": 0.42621302919002513, + "grad_norm": 6.493957996368408, + "learning_rate": 4.766296253866423e-06, + "logits/chosen": 6.75232458114624, + "logits/rejected": 8.511398315429688, + "logps/chosen": -335.8753356933594, + "logps/rejected": -300.45843505859375, + "loss": 0.7413, + "rewards/accuracies": 0.25, + "rewards/chosen": -0.0030376510694622993, + "rewards/margins": -0.026888374239206314, + "rewards/rejected": 0.023850727826356888, + "step": 2756 + }, + { + "epoch": 0.4263676783297893, + "grad_norm": 7.644586086273193, + "learning_rate": 4.766009852216749e-06, + "logits/chosen": 10.150111198425293, + "logits/rejected": 7.354369640350342, + "logps/chosen": -292.9273681640625, + "logps/rejected": -298.7638854980469, + "loss": 0.7193, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.14902573823928833, + "rewards/margins": 0.040667399764060974, + "rewards/rejected": -0.1896931231021881, + "step": 2757 + }, + { + "epoch": 0.42652232746955343, + "grad_norm": 7.182144641876221, + "learning_rate": 4.765723450567076e-06, + "logits/chosen": 1.1237726211547852, + "logits/rejected": 8.013139724731445, + "logps/chosen": -141.20968627929688, + "logps/rejected": -239.74183654785156, + "loss": 0.8645, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.21673187613487244, + "rewards/margins": -0.17473876476287842, + "rewards/rejected": -0.041993118822574615, + "step": 2758 + }, + { + "epoch": 0.42667697660931764, + "grad_norm": 7.0834197998046875, + "learning_rate": 4.765437048917403e-06, + "logits/chosen": 10.693586349487305, + "logits/rejected": 2.623507022857666, + "logps/chosen": -400.997314453125, + "logps/rejected": -294.77459716796875, + "loss": 0.5706, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.3296867609024048, + "rewards/margins": 0.3362666368484497, + "rewards/rejected": -0.006579883396625519, + "step": 2759 + }, + { + "epoch": 0.4268316257490818, + "grad_norm": 5.904839515686035, + "learning_rate": 4.765150647267729e-06, + "logits/chosen": 12.633384704589844, + "logits/rejected": 8.203807830810547, + "logps/chosen": -336.92047119140625, + "logps/rejected": -308.80755615234375, + "loss": 0.6768, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.09725838899612427, + "rewards/margins": 0.06765101104974747, + "rewards/rejected": -0.16490939259529114, + "step": 2760 + }, + { + "epoch": 0.42698627488884594, + "grad_norm": 5.89691686630249, + "learning_rate": 4.764864245618055e-06, + "logits/chosen": 7.833645343780518, + "logits/rejected": 9.76827335357666, + "logps/chosen": -285.94732666015625, + "logps/rejected": -266.9113464355469, + "loss": 0.5762, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.004704661667346954, + "rewards/margins": 0.28045085072517395, + "rewards/rejected": -0.2757461965084076, + "step": 2761 + }, + { + "epoch": 0.4271409240286101, + "grad_norm": 9.509416580200195, + "learning_rate": 4.764577843968382e-06, + "logits/chosen": 7.200796604156494, + "logits/rejected": 5.4527153968811035, + "logps/chosen": -250.1338348388672, + "logps/rejected": -241.51318359375, + "loss": 0.8927, + "rewards/accuracies": 0.25, + "rewards/chosen": -0.21443305909633636, + "rewards/margins": -0.30618977546691895, + "rewards/rejected": 0.09175673872232437, + "step": 2762 + }, + { + "epoch": 0.42729557316837424, + "grad_norm": 7.374832630157471, + "learning_rate": 4.764291442318708e-06, + "logits/chosen": 7.869853496551514, + "logits/rejected": 13.285655975341797, + "logps/chosen": -254.0676727294922, + "logps/rejected": -334.44476318359375, + "loss": 0.7752, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.10692453384399414, + "rewards/margins": 0.12878704071044922, + "rewards/rejected": -0.23571158945560455, + "step": 2763 + }, + { + "epoch": 0.4274502223081384, + "grad_norm": 3.639901638031006, + "learning_rate": 4.764005040669035e-06, + "logits/chosen": 6.623315811157227, + "logits/rejected": 3.7466959953308105, + "logps/chosen": -162.18270874023438, + "logps/rejected": -151.36611938476562, + "loss": 0.6373, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.0015748068690299988, + "rewards/margins": 0.17304383218288422, + "rewards/rejected": -0.17146901786327362, + "step": 2764 + }, + { + "epoch": 0.42760487144790255, + "grad_norm": 15.84953498840332, + "learning_rate": 4.763718639019362e-06, + "logits/chosen": 10.884163856506348, + "logits/rejected": 7.472978115081787, + "logps/chosen": -397.96728515625, + "logps/rejected": -329.92913818359375, + "loss": 0.5627, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.07396392524242401, + "rewards/margins": 0.39607954025268555, + "rewards/rejected": -0.32211560010910034, + "step": 2765 + }, + { + "epoch": 0.42775952058766675, + "grad_norm": 4.597046375274658, + "learning_rate": 4.7634322373696875e-06, + "logits/chosen": 8.468798637390137, + "logits/rejected": 5.423861503601074, + "logps/chosen": -267.7794494628906, + "logps/rejected": -226.85537719726562, + "loss": 0.4637, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.2799021005630493, + "rewards/margins": 0.6052736639976501, + "rewards/rejected": -0.32537156343460083, + "step": 2766 + }, + { + "epoch": 0.4279141697274309, + "grad_norm": 6.724148273468018, + "learning_rate": 4.763145835720014e-06, + "logits/chosen": 8.814096450805664, + "logits/rejected": 9.682229995727539, + "logps/chosen": -223.65939331054688, + "logps/rejected": -263.681396484375, + "loss": 0.7809, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.246595099568367, + "rewards/margins": -0.036822013556957245, + "rewards/rejected": -0.20977307856082916, + "step": 2767 + }, + { + "epoch": 0.42806881886719506, + "grad_norm": 6.475399494171143, + "learning_rate": 4.762859434070341e-06, + "logits/chosen": 6.6114068031311035, + "logits/rejected": 9.897765159606934, + "logps/chosen": -256.59771728515625, + "logps/rejected": -311.9908447265625, + "loss": 0.7774, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.36203208565711975, + "rewards/margins": -0.07748544216156006, + "rewards/rejected": -0.2845466434955597, + "step": 2768 + }, + { + "epoch": 0.4282234680069592, + "grad_norm": 4.478710651397705, + "learning_rate": 4.762573032420667e-06, + "logits/chosen": 5.682971954345703, + "logits/rejected": 4.280372619628906, + "logps/chosen": -219.49554443359375, + "logps/rejected": -241.3962860107422, + "loss": 0.6291, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.15944606065750122, + "rewards/margins": 0.16063222289085388, + "rewards/rejected": -0.3200782835483551, + "step": 2769 + }, + { + "epoch": 0.42837811714672336, + "grad_norm": 5.020727634429932, + "learning_rate": 4.762286630770993e-06, + "logits/chosen": 11.01222038269043, + "logits/rejected": 9.80000114440918, + "logps/chosen": -200.78025817871094, + "logps/rejected": -194.8872528076172, + "loss": 0.5965, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.6253675818443298, + "rewards/margins": 0.2561553418636322, + "rewards/rejected": -0.8815228939056396, + "step": 2770 + }, + { + "epoch": 0.4285327662864875, + "grad_norm": 6.611806869506836, + "learning_rate": 4.76200022912132e-06, + "logits/chosen": 7.793523788452148, + "logits/rejected": 6.686113357543945, + "logps/chosen": -297.10504150390625, + "logps/rejected": -245.47140502929688, + "loss": 0.7107, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.05979418754577637, + "rewards/margins": 0.03167299926280975, + "rewards/rejected": -0.09146718680858612, + "step": 2771 + }, + { + "epoch": 0.4286874154262517, + "grad_norm": 4.4195637702941895, + "learning_rate": 4.7617138274716465e-06, + "logits/chosen": 8.466676712036133, + "logits/rejected": 8.881500244140625, + "logps/chosen": -208.73162841796875, + "logps/rejected": -234.2777557373047, + "loss": 0.5949, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.037338923662900925, + "rewards/margins": 0.26364660263061523, + "rewards/rejected": -0.22630766034126282, + "step": 2772 + }, + { + "epoch": 0.42884206456601587, + "grad_norm": 4.510226726531982, + "learning_rate": 4.761427425821973e-06, + "logits/chosen": 4.968331813812256, + "logits/rejected": 1.514676809310913, + "logps/chosen": -172.02102661132812, + "logps/rejected": -167.14883422851562, + "loss": 0.5744, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.45994484424591064, + "rewards/margins": 0.28233641386032104, + "rewards/rejected": -0.7422811985015869, + "step": 2773 + }, + { + "epoch": 0.42899671370578, + "grad_norm": 7.163201808929443, + "learning_rate": 4.761141024172299e-06, + "logits/chosen": 8.524158477783203, + "logits/rejected": 10.98208236694336, + "logps/chosen": -208.82626342773438, + "logps/rejected": -274.8758544921875, + "loss": 0.7989, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.32244065403938293, + "rewards/margins": -0.13323114812374115, + "rewards/rejected": -0.18920952081680298, + "step": 2774 + }, + { + "epoch": 0.42915136284554417, + "grad_norm": 7.631000518798828, + "learning_rate": 4.760854622522626e-06, + "logits/chosen": 5.421298503875732, + "logits/rejected": 11.113502502441406, + "logps/chosen": -211.71424865722656, + "logps/rejected": -270.81817626953125, + "loss": 0.9257, + "rewards/accuracies": 0.25, + "rewards/chosen": -0.49365323781967163, + "rewards/margins": -0.3654343783855438, + "rewards/rejected": -0.1282188445329666, + "step": 2775 + }, + { + "epoch": 0.4293060119853083, + "grad_norm": 5.42144775390625, + "learning_rate": 4.760568220872952e-06, + "logits/chosen": 11.68799114227295, + "logits/rejected": 4.073337078094482, + "logps/chosen": -258.0840759277344, + "logps/rejected": -200.898681640625, + "loss": 0.6138, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.03412303328514099, + "rewards/margins": 0.3629974126815796, + "rewards/rejected": -0.3288743495941162, + "step": 2776 + }, + { + "epoch": 0.4294606611250725, + "grad_norm": 8.292977333068848, + "learning_rate": 4.760281819223279e-06, + "logits/chosen": 17.479278564453125, + "logits/rejected": 11.391890525817871, + "logps/chosen": -438.58154296875, + "logps/rejected": -329.58953857421875, + "loss": 0.5731, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.05540771037340164, + "rewards/margins": 0.4602375328540802, + "rewards/rejected": -0.40482980012893677, + "step": 2777 + }, + { + "epoch": 0.4296153102648366, + "grad_norm": 9.318918228149414, + "learning_rate": 4.759995417573606e-06, + "logits/chosen": 13.366079330444336, + "logits/rejected": 13.685798645019531, + "logps/chosen": -386.84356689453125, + "logps/rejected": -404.6522521972656, + "loss": 0.9005, + "rewards/accuracies": 0.125, + "rewards/chosen": -0.30128782987594604, + "rewards/margins": -0.3135948181152344, + "rewards/rejected": 0.012306969612836838, + "step": 2778 + }, + { + "epoch": 0.42976995940460083, + "grad_norm": 6.1490159034729, + "learning_rate": 4.759709015923932e-06, + "logits/chosen": 14.408186912536621, + "logits/rejected": 10.164176940917969, + "logps/chosen": -382.7894287109375, + "logps/rejected": -365.68572998046875, + "loss": 0.6623, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.1661672592163086, + "rewards/margins": 0.12371157854795456, + "rewards/rejected": -0.28987884521484375, + "step": 2779 + }, + { + "epoch": 0.429924608544365, + "grad_norm": 5.8148674964904785, + "learning_rate": 4.759422614274258e-06, + "logits/chosen": 5.316891193389893, + "logits/rejected": 7.019569396972656, + "logps/chosen": -225.36041259765625, + "logps/rejected": -211.029296875, + "loss": 0.6179, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.021791886538267136, + "rewards/margins": 0.19866633415222168, + "rewards/rejected": -0.2204582393169403, + "step": 2780 + }, + { + "epoch": 0.43007925768412913, + "grad_norm": 7.198339462280273, + "learning_rate": 4.759136212624585e-06, + "logits/chosen": 0.17101365327835083, + "logits/rejected": 4.186741828918457, + "logps/chosen": -231.4630126953125, + "logps/rejected": -214.36569213867188, + "loss": 0.889, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.22137005627155304, + "rewards/margins": -0.26349198818206787, + "rewards/rejected": 0.04212189465761185, + "step": 2781 + }, + { + "epoch": 0.4302339068238933, + "grad_norm": 4.7100419998168945, + "learning_rate": 4.758849810974911e-06, + "logits/chosen": 7.900914192199707, + "logits/rejected": 1.3058967590332031, + "logps/chosen": -265.0892333984375, + "logps/rejected": -199.54608154296875, + "loss": 0.5855, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.014899879693984985, + "rewards/margins": 0.41165053844451904, + "rewards/rejected": -0.4265504479408264, + "step": 2782 + }, + { + "epoch": 0.43038855596365744, + "grad_norm": 3.7495269775390625, + "learning_rate": 4.758563409325238e-06, + "logits/chosen": 10.084142684936523, + "logits/rejected": 10.360098838806152, + "logps/chosen": -258.8919982910156, + "logps/rejected": -308.36767578125, + "loss": 0.4645, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.16352024674415588, + "rewards/margins": 0.7146258354187012, + "rewards/rejected": -0.8781461715698242, + "step": 2783 + }, + { + "epoch": 0.4305432051034216, + "grad_norm": 5.727351188659668, + "learning_rate": 4.758277007675565e-06, + "logits/chosen": 9.807077407836914, + "logits/rejected": 12.750134468078613, + "logps/chosen": -231.79595947265625, + "logps/rejected": -226.55422973632812, + "loss": 0.6484, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.06301812827587128, + "rewards/margins": 0.1178392767906189, + "rewards/rejected": -0.18085739016532898, + "step": 2784 + }, + { + "epoch": 0.4306978542431858, + "grad_norm": 5.970767974853516, + "learning_rate": 4.757990606025891e-06, + "logits/chosen": 7.597332954406738, + "logits/rejected": 7.153900146484375, + "logps/chosen": -261.1533508300781, + "logps/rejected": -346.2449645996094, + "loss": 0.6197, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.13725723326206207, + "rewards/margins": 0.39906907081604004, + "rewards/rejected": -0.5363263487815857, + "step": 2785 + }, + { + "epoch": 0.43085250338294995, + "grad_norm": 7.419806003570557, + "learning_rate": 4.757704204376218e-06, + "logits/chosen": 4.775491237640381, + "logits/rejected": 3.671041250228882, + "logps/chosen": -330.25640869140625, + "logps/rejected": -289.3252868652344, + "loss": 0.8059, + "rewards/accuracies": 0.25, + "rewards/chosen": -0.49421679973602295, + "rewards/margins": -0.12825129926204681, + "rewards/rejected": -0.36596548557281494, + "step": 2786 + }, + { + "epoch": 0.4310071525227141, + "grad_norm": 5.847108364105225, + "learning_rate": 4.757417802726544e-06, + "logits/chosen": 9.564638137817383, + "logits/rejected": 5.162547588348389, + "logps/chosen": -210.74020385742188, + "logps/rejected": -176.9446258544922, + "loss": 0.7796, + "rewards/accuracies": 0.25, + "rewards/chosen": -0.5194675922393799, + "rewards/margins": -0.12075714766979218, + "rewards/rejected": -0.3987103998661041, + "step": 2787 + }, + { + "epoch": 0.43116180166247825, + "grad_norm": 8.376914978027344, + "learning_rate": 4.75713140107687e-06, + "logits/chosen": 14.467245101928711, + "logits/rejected": 14.03645133972168, + "logps/chosen": -341.5711669921875, + "logps/rejected": -375.37548828125, + "loss": 0.7979, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.3634641766548157, + "rewards/margins": -0.10142546892166138, + "rewards/rejected": -0.2620387077331543, + "step": 2788 + }, + { + "epoch": 0.4313164508022424, + "grad_norm": 5.915359973907471, + "learning_rate": 4.756844999427197e-06, + "logits/chosen": 10.123217582702637, + "logits/rejected": 3.3616385459899902, + "logps/chosen": -269.1231689453125, + "logps/rejected": -189.5192108154297, + "loss": 0.6006, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.040657080709934235, + "rewards/margins": 0.22448387742042542, + "rewards/rejected": -0.26514095067977905, + "step": 2789 + }, + { + "epoch": 0.43147109994200655, + "grad_norm": 6.117700576782227, + "learning_rate": 4.756558597777524e-06, + "logits/chosen": 13.525590896606445, + "logits/rejected": 4.828794956207275, + "logps/chosen": -455.7845458984375, + "logps/rejected": -342.3318786621094, + "loss": 0.4531, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.12162887305021286, + "rewards/margins": 0.7615545988082886, + "rewards/rejected": -0.6399257183074951, + "step": 2790 + }, + { + "epoch": 0.43162574908177076, + "grad_norm": 6.3880181312561035, + "learning_rate": 4.75627219612785e-06, + "logits/chosen": 4.344879150390625, + "logits/rejected": 8.407329559326172, + "logps/chosen": -195.07583618164062, + "logps/rejected": -236.76116943359375, + "loss": 0.9007, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.5900249481201172, + "rewards/margins": -0.2911919057369232, + "rewards/rejected": -0.29883310198783875, + "step": 2791 + }, + { + "epoch": 0.4317803982215349, + "grad_norm": 4.570748805999756, + "learning_rate": 4.755985794478177e-06, + "logits/chosen": 8.43942642211914, + "logits/rejected": 7.202610015869141, + "logps/chosen": -160.795654296875, + "logps/rejected": -120.71747589111328, + "loss": 0.6239, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.26789963245391846, + "rewards/margins": 0.19951948523521423, + "rewards/rejected": -0.4674190878868103, + "step": 2792 + }, + { + "epoch": 0.43193504736129906, + "grad_norm": 7.541436195373535, + "learning_rate": 4.755699392828504e-06, + "logits/chosen": 6.8108367919921875, + "logits/rejected": 8.16777229309082, + "logps/chosen": -251.1971893310547, + "logps/rejected": -302.0908203125, + "loss": 0.809, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.5668714046478271, + "rewards/margins": -0.15265551209449768, + "rewards/rejected": -0.41421589255332947, + "step": 2793 + }, + { + "epoch": 0.4320896965010632, + "grad_norm": 5.5171098709106445, + "learning_rate": 4.7554129911788294e-06, + "logits/chosen": 8.900758743286133, + "logits/rejected": 10.663663864135742, + "logps/chosen": -213.49937438964844, + "logps/rejected": -206.3883819580078, + "loss": 0.7854, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.3913537263870239, + "rewards/margins": -0.15273170173168182, + "rewards/rejected": -0.23862199485301971, + "step": 2794 + }, + { + "epoch": 0.43224434564082737, + "grad_norm": 4.662868976593018, + "learning_rate": 4.755126589529156e-06, + "logits/chosen": 10.188685417175293, + "logits/rejected": 6.852789878845215, + "logps/chosen": -402.2821044921875, + "logps/rejected": -321.64715576171875, + "loss": 0.5504, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.12097405642271042, + "rewards/margins": 0.6118749380111694, + "rewards/rejected": -0.7328490018844604, + "step": 2795 + }, + { + "epoch": 0.4323989947805915, + "grad_norm": 6.1262078285217285, + "learning_rate": 4.754840187879483e-06, + "logits/chosen": 4.809596061706543, + "logits/rejected": 6.813551425933838, + "logps/chosen": -212.06600952148438, + "logps/rejected": -278.2464904785156, + "loss": 0.6351, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.49957358837127686, + "rewards/margins": 0.17979955673217773, + "rewards/rejected": -0.6793731451034546, + "step": 2796 + }, + { + "epoch": 0.43255364392035567, + "grad_norm": 7.292859077453613, + "learning_rate": 4.754553786229809e-06, + "logits/chosen": 11.588642120361328, + "logits/rejected": 8.493711471557617, + "logps/chosen": -362.13037109375, + "logps/rejected": -222.6273651123047, + "loss": 0.6902, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.5309098958969116, + "rewards/margins": 0.03510220721364021, + "rewards/rejected": -0.5660121440887451, + "step": 2797 + }, + { + "epoch": 0.4327082930601199, + "grad_norm": 4.58829927444458, + "learning_rate": 4.754267384580136e-06, + "logits/chosen": 10.057673454284668, + "logits/rejected": 13.428799629211426, + "logps/chosen": -166.45016479492188, + "logps/rejected": -250.3113250732422, + "loss": 0.5886, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.44345489144325256, + "rewards/margins": 0.23104631900787354, + "rewards/rejected": -0.6745012402534485, + "step": 2798 + }, + { + "epoch": 0.432862942199884, + "grad_norm": 4.8461127281188965, + "learning_rate": 4.753980982930463e-06, + "logits/chosen": 9.331192016601562, + "logits/rejected": -5.255367755889893, + "logps/chosen": -269.07080078125, + "logps/rejected": -113.04387664794922, + "loss": 0.6732, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.07054150849580765, + "rewards/margins": 0.24580320715904236, + "rewards/rejected": -0.1752616912126541, + "step": 2799 + }, + { + "epoch": 0.4330175913396482, + "grad_norm": 5.574551105499268, + "learning_rate": 4.7536945812807885e-06, + "logits/chosen": 10.010777473449707, + "logits/rejected": 6.791634559631348, + "logps/chosen": -336.68402099609375, + "logps/rejected": -296.0834655761719, + "loss": 0.4608, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.34524527192115784, + "rewards/margins": 0.6705272793769836, + "rewards/rejected": -0.3252819776535034, + "step": 2800 + }, + { + "epoch": 0.43317224047941233, + "grad_norm": 6.900188446044922, + "learning_rate": 4.753408179631115e-06, + "logits/chosen": 5.616233825683594, + "logits/rejected": 5.674280643463135, + "logps/chosen": -214.99447631835938, + "logps/rejected": -224.55767822265625, + "loss": 0.685, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.06657677888870239, + "rewards/margins": 0.04779187589883804, + "rewards/rejected": -0.11436866223812103, + "step": 2801 + }, + { + "epoch": 0.4333268896191765, + "grad_norm": 5.857180118560791, + "learning_rate": 4.753121777981442e-06, + "logits/chosen": 8.992115020751953, + "logits/rejected": 8.140558242797852, + "logps/chosen": -225.6531982421875, + "logps/rejected": -241.54335021972656, + "loss": 0.5998, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.18340301513671875, + "rewards/margins": 0.3001747131347656, + "rewards/rejected": -0.48357775807380676, + "step": 2802 + }, + { + "epoch": 0.43348153875894063, + "grad_norm": 13.215141296386719, + "learning_rate": 4.7528353763317685e-06, + "logits/chosen": 7.431465148925781, + "logits/rejected": 2.723891496658325, + "logps/chosen": -303.5306701660156, + "logps/rejected": -220.5635986328125, + "loss": 0.8223, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.08407649397850037, + "rewards/margins": 0.09192532300949097, + "rewards/rejected": -0.007848832756280899, + "step": 2803 + }, + { + "epoch": 0.43363618789870484, + "grad_norm": 4.5164103507995605, + "learning_rate": 4.752548974682094e-06, + "logits/chosen": 8.535188674926758, + "logits/rejected": 7.057811737060547, + "logps/chosen": -312.494873046875, + "logps/rejected": -323.5174560546875, + "loss": 0.4802, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.00788029283285141, + "rewards/margins": 0.5675268173217773, + "rewards/rejected": -0.559646487236023, + "step": 2804 + }, + { + "epoch": 0.433790837038469, + "grad_norm": 5.599987983703613, + "learning_rate": 4.752262573032421e-06, + "logits/chosen": 12.378469467163086, + "logits/rejected": 3.429933547973633, + "logps/chosen": -301.00347900390625, + "logps/rejected": -180.8714141845703, + "loss": 0.73, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.37512171268463135, + "rewards/margins": 0.00924713909626007, + "rewards/rejected": -0.3843688368797302, + "step": 2805 + }, + { + "epoch": 0.43394548617823314, + "grad_norm": 6.036183834075928, + "learning_rate": 4.7519761713827476e-06, + "logits/chosen": 11.895010948181152, + "logits/rejected": 7.091794967651367, + "logps/chosen": -292.4738464355469, + "logps/rejected": -212.21270751953125, + "loss": 0.6228, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.36054885387420654, + "rewards/margins": 0.26990586519241333, + "rewards/rejected": -0.6304547786712646, + "step": 2806 + }, + { + "epoch": 0.4341001353179973, + "grad_norm": 6.840506553649902, + "learning_rate": 4.751689769733074e-06, + "logits/chosen": 4.852654457092285, + "logits/rejected": 2.638561248779297, + "logps/chosen": -264.4471740722656, + "logps/rejected": -238.7056121826172, + "loss": 0.8123, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.12147527933120728, + "rewards/margins": -0.16497759521007538, + "rewards/rejected": 0.04350230097770691, + "step": 2807 + }, + { + "epoch": 0.43425478445776144, + "grad_norm": 7.319533824920654, + "learning_rate": 4.7514033680834e-06, + "logits/chosen": 7.211735248565674, + "logits/rejected": 1.5934264659881592, + "logps/chosen": -304.82257080078125, + "logps/rejected": -213.3721466064453, + "loss": 0.808, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.3614337742328644, + "rewards/margins": -0.13401289284229279, + "rewards/rejected": -0.2274208962917328, + "step": 2808 + }, + { + "epoch": 0.4344094335975256, + "grad_norm": 7.605015754699707, + "learning_rate": 4.751116966433727e-06, + "logits/chosen": 5.696356296539307, + "logits/rejected": 3.17057728767395, + "logps/chosen": -355.5419006347656, + "logps/rejected": -240.73291015625, + "loss": 0.8108, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.20709803700447083, + "rewards/margins": 0.03326638042926788, + "rewards/rejected": -0.2403644174337387, + "step": 2809 + }, + { + "epoch": 0.43456408273728975, + "grad_norm": 4.371910095214844, + "learning_rate": 4.750830564784053e-06, + "logits/chosen": 14.828933715820312, + "logits/rejected": 11.163362503051758, + "logps/chosen": -291.53472900390625, + "logps/rejected": -258.2392578125, + "loss": 0.4902, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.13775327801704407, + "rewards/margins": 0.5750832557678223, + "rewards/rejected": -0.4373300075531006, + "step": 2810 + }, + { + "epoch": 0.43471873187705395, + "grad_norm": 6.886568069458008, + "learning_rate": 4.75054416313438e-06, + "logits/chosen": 9.987464904785156, + "logits/rejected": 6.372430801391602, + "logps/chosen": -282.7420349121094, + "logps/rejected": -230.234375, + "loss": 0.8763, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.39969274401664734, + "rewards/margins": -0.12942704558372498, + "rewards/rejected": -0.27026569843292236, + "step": 2811 + }, + { + "epoch": 0.4348733810168181, + "grad_norm": 6.040317058563232, + "learning_rate": 4.750257761484707e-06, + "logits/chosen": 14.01717758178711, + "logits/rejected": 8.728301048278809, + "logps/chosen": -246.2141571044922, + "logps/rejected": -220.49880981445312, + "loss": 0.707, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.25272905826568604, + "rewards/margins": 0.14834925532341003, + "rewards/rejected": -0.40107834339141846, + "step": 2812 + }, + { + "epoch": 0.43502803015658226, + "grad_norm": 6.753664970397949, + "learning_rate": 4.749971359835032e-06, + "logits/chosen": 15.51312255859375, + "logits/rejected": 10.871882438659668, + "logps/chosen": -388.32086181640625, + "logps/rejected": -313.42144775390625, + "loss": 0.5732, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.021067030727863312, + "rewards/margins": 0.5964447259902954, + "rewards/rejected": -0.5753776431083679, + "step": 2813 + }, + { + "epoch": 0.4351826792963464, + "grad_norm": 6.621103286743164, + "learning_rate": 4.749684958185359e-06, + "logits/chosen": 16.63807487487793, + "logits/rejected": 3.980222463607788, + "logps/chosen": -451.0174560546875, + "logps/rejected": -295.88629150390625, + "loss": 0.6123, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.025811776518821716, + "rewards/margins": 0.34195002913475037, + "rewards/rejected": -0.3677617907524109, + "step": 2814 + }, + { + "epoch": 0.43533732843611056, + "grad_norm": 5.459160327911377, + "learning_rate": 4.749398556535686e-06, + "logits/chosen": 11.013882637023926, + "logits/rejected": 8.598686218261719, + "logps/chosen": -280.7895812988281, + "logps/rejected": -251.208251953125, + "loss": 0.6891, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.5252373218536377, + "rewards/margins": 0.2069089114665985, + "rewards/rejected": -0.7321462631225586, + "step": 2815 + }, + { + "epoch": 0.4354919775758747, + "grad_norm": 22.75695037841797, + "learning_rate": 4.749112154886012e-06, + "logits/chosen": 8.6644926071167, + "logits/rejected": 7.899404525756836, + "logps/chosen": -186.85472106933594, + "logps/rejected": -175.57276916503906, + "loss": 0.6322, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.21507593989372253, + "rewards/margins": 0.36467692255973816, + "rewards/rejected": -0.5797528028488159, + "step": 2816 + }, + { + "epoch": 0.4356466267156389, + "grad_norm": 5.813061237335205, + "learning_rate": 4.748825753236339e-06, + "logits/chosen": 11.378844261169434, + "logits/rejected": 5.882504463195801, + "logps/chosen": -236.85659790039062, + "logps/rejected": -165.74310302734375, + "loss": 0.6468, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.14460229873657227, + "rewards/margins": 0.2838023006916046, + "rewards/rejected": -0.4284045696258545, + "step": 2817 + }, + { + "epoch": 0.43580127585540307, + "grad_norm": 5.486540794372559, + "learning_rate": 4.748539351586666e-06, + "logits/chosen": 12.64401626586914, + "logits/rejected": 8.501300811767578, + "logps/chosen": -360.9309387207031, + "logps/rejected": -238.2698974609375, + "loss": 0.6023, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.018212974071502686, + "rewards/margins": 0.44703519344329834, + "rewards/rejected": -0.465248167514801, + "step": 2818 + }, + { + "epoch": 0.4359559249951672, + "grad_norm": 4.105498790740967, + "learning_rate": 4.748252949936992e-06, + "logits/chosen": 8.176006317138672, + "logits/rejected": 3.262908458709717, + "logps/chosen": -326.33197021484375, + "logps/rejected": -244.0655059814453, + "loss": 0.5462, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.08322440087795258, + "rewards/margins": 0.5353583097457886, + "rewards/rejected": -0.4521338939666748, + "step": 2819 + }, + { + "epoch": 0.4361105741349314, + "grad_norm": 5.889348030090332, + "learning_rate": 4.747966548287318e-06, + "logits/chosen": 6.981510639190674, + "logits/rejected": 2.598665475845337, + "logps/chosen": -312.58270263671875, + "logps/rejected": -209.3723907470703, + "loss": 0.5675, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.08875159919261932, + "rewards/margins": 0.5042783617973328, + "rewards/rejected": -0.5930299758911133, + "step": 2820 + }, + { + "epoch": 0.4362652232746955, + "grad_norm": 7.233922481536865, + "learning_rate": 4.747680146637645e-06, + "logits/chosen": 13.439886093139648, + "logits/rejected": 8.223491668701172, + "logps/chosen": -302.23590087890625, + "logps/rejected": -223.22503662109375, + "loss": 0.6514, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.22159802913665771, + "rewards/margins": 0.38722625374794006, + "rewards/rejected": -0.6088243126869202, + "step": 2821 + }, + { + "epoch": 0.4364198724144597, + "grad_norm": 5.701486110687256, + "learning_rate": 4.7473937449879714e-06, + "logits/chosen": 7.395998954772949, + "logits/rejected": 5.183539390563965, + "logps/chosen": -242.25369262695312, + "logps/rejected": -202.43270874023438, + "loss": 0.729, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.2294432818889618, + "rewards/margins": -0.01485791802406311, + "rewards/rejected": -0.21458537876605988, + "step": 2822 + }, + { + "epoch": 0.4365745215542239, + "grad_norm": 6.663989543914795, + "learning_rate": 4.747107343338298e-06, + "logits/chosen": 14.649640083312988, + "logits/rejected": 13.448821067810059, + "logps/chosen": -389.4757080078125, + "logps/rejected": -348.6912841796875, + "loss": 0.8359, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.027445029467344284, + "rewards/margins": -0.13992060720920563, + "rewards/rejected": 0.11247558891773224, + "step": 2823 + }, + { + "epoch": 0.43672917069398803, + "grad_norm": 4.454593658447266, + "learning_rate": 4.746820941688625e-06, + "logits/chosen": 12.053659439086914, + "logits/rejected": 12.979633331298828, + "logps/chosen": -189.28482055664062, + "logps/rejected": -211.57125854492188, + "loss": 0.6615, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.048810768872499466, + "rewards/margins": 0.28202423453330994, + "rewards/rejected": -0.23321343958377838, + "step": 2824 + }, + { + "epoch": 0.4368838198337522, + "grad_norm": 4.350654602050781, + "learning_rate": 4.746534540038951e-06, + "logits/chosen": 11.314224243164062, + "logits/rejected": 7.191380500793457, + "logps/chosen": -184.46389770507812, + "logps/rejected": -124.232177734375, + "loss": 0.6688, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.13321954011917114, + "rewards/margins": 0.22559450566768646, + "rewards/rejected": -0.3588140308856964, + "step": 2825 + }, + { + "epoch": 0.43703846897351634, + "grad_norm": 4.4734649658203125, + "learning_rate": 4.746248138389278e-06, + "logits/chosen": 13.605212211608887, + "logits/rejected": 10.309515953063965, + "logps/chosen": -289.1519775390625, + "logps/rejected": -278.0863952636719, + "loss": 0.632, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.18803460896015167, + "rewards/margins": 0.2650759816169739, + "rewards/rejected": -0.07704134285449982, + "step": 2826 + }, + { + "epoch": 0.4371931181132805, + "grad_norm": 5.424923896789551, + "learning_rate": 4.745961736739604e-06, + "logits/chosen": 9.14299201965332, + "logits/rejected": 2.465217113494873, + "logps/chosen": -312.8207702636719, + "logps/rejected": -308.5585632324219, + "loss": 0.5008, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.1549084633588791, + "rewards/margins": 0.6946346759796143, + "rewards/rejected": -0.5397261381149292, + "step": 2827 + }, + { + "epoch": 0.43734776725304464, + "grad_norm": 4.931646823883057, + "learning_rate": 4.7456753350899305e-06, + "logits/chosen": 12.082155227661133, + "logits/rejected": 12.245010375976562, + "logps/chosen": -328.2345275878906, + "logps/rejected": -312.2367858886719, + "loss": 0.6869, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.02204418182373047, + "rewards/margins": 0.13654464483261108, + "rewards/rejected": -0.11450046300888062, + "step": 2828 + }, + { + "epoch": 0.4375024163928088, + "grad_norm": 5.402237892150879, + "learning_rate": 4.745388933440257e-06, + "logits/chosen": 4.226941108703613, + "logits/rejected": 7.545684814453125, + "logps/chosen": -247.03013610839844, + "logps/rejected": -343.6475524902344, + "loss": 0.5657, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.02545851469039917, + "rewards/margins": 0.4725891649723053, + "rewards/rejected": -0.4471306800842285, + "step": 2829 + }, + { + "epoch": 0.437657065532573, + "grad_norm": 7.002410888671875, + "learning_rate": 4.745102531790584e-06, + "logits/chosen": 2.431375741958618, + "logits/rejected": 4.3468017578125, + "logps/chosen": -320.364990234375, + "logps/rejected": -299.33636474609375, + "loss": 0.703, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.1979716718196869, + "rewards/margins": 0.05305410921573639, + "rewards/rejected": -0.2510257959365845, + "step": 2830 + }, + { + "epoch": 0.43781171467233715, + "grad_norm": 5.792113304138184, + "learning_rate": 4.7448161301409104e-06, + "logits/chosen": 3.8318676948547363, + "logits/rejected": 5.64227819442749, + "logps/chosen": -224.19606018066406, + "logps/rejected": -263.5726318359375, + "loss": 0.6752, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.2541836202144623, + "rewards/margins": 0.05528821051120758, + "rewards/rejected": -0.30947184562683105, + "step": 2831 + }, + { + "epoch": 0.4379663638121013, + "grad_norm": 6.636241436004639, + "learning_rate": 4.744529728491237e-06, + "logits/chosen": 10.726471900939941, + "logits/rejected": 8.35133171081543, + "logps/chosen": -432.3463134765625, + "logps/rejected": -384.0025634765625, + "loss": 0.6922, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.3270619511604309, + "rewards/margins": 0.1283237338066101, + "rewards/rejected": 0.1987381875514984, + "step": 2832 + }, + { + "epoch": 0.43812101295186545, + "grad_norm": 6.241170406341553, + "learning_rate": 4.744243326841563e-06, + "logits/chosen": 11.469947814941406, + "logits/rejected": 8.822287559509277, + "logps/chosen": -292.97149658203125, + "logps/rejected": -263.7149658203125, + "loss": 0.6589, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.4599659740924835, + "rewards/margins": 0.15655073523521423, + "rewards/rejected": -0.6165167093276978, + "step": 2833 + }, + { + "epoch": 0.4382756620916296, + "grad_norm": 6.51314115524292, + "learning_rate": 4.7439569251918895e-06, + "logits/chosen": 14.622237205505371, + "logits/rejected": 9.365476608276367, + "logps/chosen": -258.34130859375, + "logps/rejected": -245.51507568359375, + "loss": 0.6062, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.07307276129722595, + "rewards/margins": 0.4358707070350647, + "rewards/rejected": -0.36279794573783875, + "step": 2834 + }, + { + "epoch": 0.43843031123139375, + "grad_norm": 5.3864593505859375, + "learning_rate": 4.743670523542216e-06, + "logits/chosen": 13.829833984375, + "logits/rejected": 5.317742347717285, + "logps/chosen": -223.54281616210938, + "logps/rejected": -174.62063598632812, + "loss": 0.7263, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.31119033694267273, + "rewards/margins": 0.16185730695724487, + "rewards/rejected": -0.4730476140975952, + "step": 2835 + }, + { + "epoch": 0.43858496037115796, + "grad_norm": 5.920149803161621, + "learning_rate": 4.743384121892543e-06, + "logits/chosen": 10.097222328186035, + "logits/rejected": 7.227667808532715, + "logps/chosen": -217.132080078125, + "logps/rejected": -186.79025268554688, + "loss": 0.7485, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.20261025428771973, + "rewards/margins": -0.0316656157374382, + "rewards/rejected": -0.17094466090202332, + "step": 2836 + }, + { + "epoch": 0.4387396095109221, + "grad_norm": 3.3589744567871094, + "learning_rate": 4.7430977202428695e-06, + "logits/chosen": 12.82377815246582, + "logits/rejected": 7.432671070098877, + "logps/chosen": -341.67144775390625, + "logps/rejected": -225.3497772216797, + "loss": 0.46, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.05353812873363495, + "rewards/margins": 0.6709260940551758, + "rewards/rejected": -0.6173880100250244, + "step": 2837 + }, + { + "epoch": 0.43889425865068626, + "grad_norm": 4.675251007080078, + "learning_rate": 4.742811318593195e-06, + "logits/chosen": 16.821693420410156, + "logits/rejected": 13.110118865966797, + "logps/chosen": -333.9576110839844, + "logps/rejected": -315.6451110839844, + "loss": 0.4815, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.10564003884792328, + "rewards/margins": 0.7636474967002869, + "rewards/rejected": -0.6580074429512024, + "step": 2838 + }, + { + "epoch": 0.4390489077904504, + "grad_norm": 5.882259368896484, + "learning_rate": 4.742524916943522e-06, + "logits/chosen": 6.317479133605957, + "logits/rejected": 10.260385513305664, + "logps/chosen": -229.90170288085938, + "logps/rejected": -252.86549377441406, + "loss": 0.5991, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.21342070400714874, + "rewards/margins": 0.35805028676986694, + "rewards/rejected": -0.5714709758758545, + "step": 2839 + }, + { + "epoch": 0.43920355693021457, + "grad_norm": 6.055997848510742, + "learning_rate": 4.742238515293849e-06, + "logits/chosen": 16.44618034362793, + "logits/rejected": 6.760526657104492, + "logps/chosen": -368.89093017578125, + "logps/rejected": -266.67120361328125, + "loss": 0.5047, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.1716228574514389, + "rewards/margins": 0.5712116956710815, + "rewards/rejected": -0.7428345680236816, + "step": 2840 + }, + { + "epoch": 0.4393582060699787, + "grad_norm": 6.78181791305542, + "learning_rate": 4.741952113644175e-06, + "logits/chosen": 4.90526008605957, + "logits/rejected": 3.522831916809082, + "logps/chosen": -288.4462585449219, + "logps/rejected": -307.6053466796875, + "loss": 0.7357, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.16314341127872467, + "rewards/margins": -0.04299222677946091, + "rewards/rejected": -0.12015116959810257, + "step": 2841 + }, + { + "epoch": 0.43951285520974287, + "grad_norm": 5.552892208099365, + "learning_rate": 4.741665711994501e-06, + "logits/chosen": 13.207711219787598, + "logits/rejected": 7.869141578674316, + "logps/chosen": -348.44677734375, + "logps/rejected": -238.80465698242188, + "loss": 0.624, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.20664730668067932, + "rewards/margins": 0.18964087963104248, + "rewards/rejected": 0.017006421461701393, + "step": 2842 + }, + { + "epoch": 0.4396675043495071, + "grad_norm": 8.40849781036377, + "learning_rate": 4.741379310344828e-06, + "logits/chosen": 6.449000358581543, + "logits/rejected": 13.801970481872559, + "logps/chosen": -252.5522003173828, + "logps/rejected": -338.7855224609375, + "loss": 0.609, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.24829742312431335, + "rewards/margins": 0.2070353627204895, + "rewards/rejected": -0.45533275604248047, + "step": 2843 + }, + { + "epoch": 0.43982215348927123, + "grad_norm": 5.977924823760986, + "learning_rate": 4.741092908695154e-06, + "logits/chosen": 10.545989990234375, + "logits/rejected": 9.57856273651123, + "logps/chosen": -256.90362548828125, + "logps/rejected": -298.98876953125, + "loss": 0.6774, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.09113011509180069, + "rewards/margins": 0.3714331388473511, + "rewards/rejected": -0.2803030014038086, + "step": 2844 + }, + { + "epoch": 0.4399768026290354, + "grad_norm": 4.474948406219482, + "learning_rate": 4.740806507045481e-06, + "logits/chosen": 6.020287990570068, + "logits/rejected": 6.616371154785156, + "logps/chosen": -229.8592529296875, + "logps/rejected": -273.079833984375, + "loss": 0.5296, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.09826397150754929, + "rewards/margins": 0.45664700865745544, + "rewards/rejected": -0.5549110174179077, + "step": 2845 + }, + { + "epoch": 0.44013145176879953, + "grad_norm": 5.890885829925537, + "learning_rate": 4.740520105395807e-06, + "logits/chosen": 13.939849853515625, + "logits/rejected": 10.829734802246094, + "logps/chosen": -317.510009765625, + "logps/rejected": -270.29644775390625, + "loss": 0.7285, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.20510119199752808, + "rewards/margins": -0.011336132884025574, + "rewards/rejected": 0.21643735468387604, + "step": 2846 + }, + { + "epoch": 0.4402861009085637, + "grad_norm": 5.421611309051514, + "learning_rate": 4.7402337037461335e-06, + "logits/chosen": 15.615734100341797, + "logits/rejected": 10.996831893920898, + "logps/chosen": -395.7857971191406, + "logps/rejected": -286.41119384765625, + "loss": 0.5148, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.42165595293045044, + "rewards/margins": 0.45399296283721924, + "rewards/rejected": -0.0323369987308979, + "step": 2847 + }, + { + "epoch": 0.44044075004832783, + "grad_norm": 4.428069591522217, + "learning_rate": 4.73994730209646e-06, + "logits/chosen": 11.204302787780762, + "logits/rejected": 4.326806545257568, + "logps/chosen": -277.58074951171875, + "logps/rejected": -194.81951904296875, + "loss": 0.5133, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.38343364000320435, + "rewards/margins": 0.5048680901527405, + "rewards/rejected": -0.12143450230360031, + "step": 2848 + }, + { + "epoch": 0.44059539918809204, + "grad_norm": 6.054380416870117, + "learning_rate": 4.739660900446787e-06, + "logits/chosen": 6.838757514953613, + "logits/rejected": 1.1049487590789795, + "logps/chosen": -302.639404296875, + "logps/rejected": -203.12832641601562, + "loss": 0.6411, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.3010578751564026, + "rewards/margins": 0.2759125828742981, + "rewards/rejected": 0.025145292282104492, + "step": 2849 + }, + { + "epoch": 0.4407500483278562, + "grad_norm": 4.552211761474609, + "learning_rate": 4.739374498797113e-06, + "logits/chosen": 5.224865436553955, + "logits/rejected": -1.8634520769119263, + "logps/chosen": -224.8545379638672, + "logps/rejected": -157.0994110107422, + "loss": 0.6149, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.10895290225744247, + "rewards/margins": 0.39938727021217346, + "rewards/rejected": -0.2904343605041504, + "step": 2850 + }, + { + "epoch": 0.44090469746762034, + "grad_norm": 5.609575271606445, + "learning_rate": 4.73908809714744e-06, + "logits/chosen": 9.490568161010742, + "logits/rejected": 7.982059001922607, + "logps/chosen": -423.14227294921875, + "logps/rejected": -411.0915832519531, + "loss": 0.7401, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.2742403745651245, + "rewards/margins": 0.0011831223964691162, + "rewards/rejected": 0.2730572521686554, + "step": 2851 + }, + { + "epoch": 0.4410593466073845, + "grad_norm": 6.922028064727783, + "learning_rate": 4.738801695497767e-06, + "logits/chosen": 6.390465259552002, + "logits/rejected": 12.317313194274902, + "logps/chosen": -209.84487915039062, + "logps/rejected": -293.3603210449219, + "loss": 0.9191, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.09546690434217453, + "rewards/margins": -0.3679962754249573, + "rewards/rejected": 0.2725293040275574, + "step": 2852 + }, + { + "epoch": 0.44121399574714865, + "grad_norm": 5.7474517822265625, + "learning_rate": 4.7385152938480925e-06, + "logits/chosen": 14.656574249267578, + "logits/rejected": 11.81248664855957, + "logps/chosen": -393.20941162109375, + "logps/rejected": -329.83905029296875, + "loss": 0.6291, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.3799876868724823, + "rewards/margins": 0.2613934278488159, + "rewards/rejected": 0.11859427392482758, + "step": 2853 + }, + { + "epoch": 0.4413686448869128, + "grad_norm": 5.243478775024414, + "learning_rate": 4.738228892198419e-06, + "logits/chosen": 8.810315132141113, + "logits/rejected": 5.108021259307861, + "logps/chosen": -298.6282958984375, + "logps/rejected": -248.97943115234375, + "loss": 0.5645, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.18958550691604614, + "rewards/margins": 0.35238659381866455, + "rewards/rejected": -0.16280107200145721, + "step": 2854 + }, + { + "epoch": 0.44152329402667695, + "grad_norm": 4.685748100280762, + "learning_rate": 4.737942490548746e-06, + "logits/chosen": 6.761685371398926, + "logits/rejected": 6.452707767486572, + "logps/chosen": -197.73789978027344, + "logps/rejected": -193.42196655273438, + "loss": 0.713, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.055004216730594635, + "rewards/margins": 0.0031135305762290955, + "rewards/rejected": 0.05189068615436554, + "step": 2855 + }, + { + "epoch": 0.44167794316644116, + "grad_norm": 3.807940721511841, + "learning_rate": 4.7376560888990725e-06, + "logits/chosen": 13.604524612426758, + "logits/rejected": 6.8091630935668945, + "logps/chosen": -358.82720947265625, + "logps/rejected": -285.5086669921875, + "loss": 0.3906, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.7971546649932861, + "rewards/margins": 0.861739993095398, + "rewards/rejected": -0.0645853579044342, + "step": 2856 + }, + { + "epoch": 0.4418325923062053, + "grad_norm": 9.43445110321045, + "learning_rate": 4.737369687249399e-06, + "logits/chosen": 4.530782222747803, + "logits/rejected": 4.439804553985596, + "logps/chosen": -345.72705078125, + "logps/rejected": -370.6066589355469, + "loss": 0.9023, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.012003429234027863, + "rewards/margins": -0.31864115595817566, + "rewards/rejected": 0.3306445777416229, + "step": 2857 + }, + { + "epoch": 0.44198724144596946, + "grad_norm": 5.291871547698975, + "learning_rate": 4.737083285599726e-06, + "logits/chosen": 6.8617634773254395, + "logits/rejected": 6.9650678634643555, + "logps/chosen": -291.0069274902344, + "logps/rejected": -243.76925659179688, + "loss": 0.7181, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.23095805943012238, + "rewards/margins": 0.05102190375328064, + "rewards/rejected": 0.17993612587451935, + "step": 2858 + }, + { + "epoch": 0.4421418905857336, + "grad_norm": 6.2592997550964355, + "learning_rate": 4.736796883950052e-06, + "logits/chosen": 7.91221809387207, + "logits/rejected": 1.1641794443130493, + "logps/chosen": -255.61685180664062, + "logps/rejected": -223.37796020507812, + "loss": 0.6719, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.12598839402198792, + "rewards/margins": 0.2353305071592331, + "rewards/rejected": -0.3613188862800598, + "step": 2859 + }, + { + "epoch": 0.44229653972549776, + "grad_norm": 5.587939262390137, + "learning_rate": 4.736510482300378e-06, + "logits/chosen": 13.008611679077148, + "logits/rejected": 6.4830241203308105, + "logps/chosen": -363.98974609375, + "logps/rejected": -269.67724609375, + "loss": 0.5449, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.4826095700263977, + "rewards/margins": 0.5990228056907654, + "rewards/rejected": -0.11641322821378708, + "step": 2860 + }, + { + "epoch": 0.4424511888652619, + "grad_norm": 6.520329475402832, + "learning_rate": 4.736224080650705e-06, + "logits/chosen": 6.634753227233887, + "logits/rejected": 1.4946156740188599, + "logps/chosen": -333.1385192871094, + "logps/rejected": -314.5243835449219, + "loss": 0.7414, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.028812743723392487, + "rewards/margins": 0.25756770372390747, + "rewards/rejected": -0.2287549376487732, + "step": 2861 + }, + { + "epoch": 0.4426058380050261, + "grad_norm": 4.743101596832275, + "learning_rate": 4.7359376790010315e-06, + "logits/chosen": 5.956636428833008, + "logits/rejected": 4.081417083740234, + "logps/chosen": -215.38755798339844, + "logps/rejected": -246.11183166503906, + "loss": 0.5464, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.011276345700025558, + "rewards/margins": 0.41927570104599, + "rewards/rejected": -0.43055206537246704, + "step": 2862 + }, + { + "epoch": 0.44276048714479027, + "grad_norm": 6.915797710418701, + "learning_rate": 4.735651277351358e-06, + "logits/chosen": 7.433876037597656, + "logits/rejected": 8.607778549194336, + "logps/chosen": -335.3842468261719, + "logps/rejected": -365.8195495605469, + "loss": 0.6955, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.27969077229499817, + "rewards/margins": 0.04275818169116974, + "rewards/rejected": 0.23693259060382843, + "step": 2863 + }, + { + "epoch": 0.4429151362845544, + "grad_norm": 8.339519500732422, + "learning_rate": 4.735364875701685e-06, + "logits/chosen": 6.564026832580566, + "logits/rejected": 4.65277099609375, + "logps/chosen": -240.2432861328125, + "logps/rejected": -315.1837158203125, + "loss": 0.7565, + "rewards/accuracies": 0.25, + "rewards/chosen": 0.027648955583572388, + "rewards/margins": 0.10455577075481415, + "rewards/rejected": -0.07690680772066116, + "step": 2864 + }, + { + "epoch": 0.4430697854243186, + "grad_norm": 7.633686542510986, + "learning_rate": 4.7350784740520115e-06, + "logits/chosen": 8.107829093933105, + "logits/rejected": 5.75361442565918, + "logps/chosen": -299.97369384765625, + "logps/rejected": -305.9490051269531, + "loss": 0.7386, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.10715429484844208, + "rewards/margins": 0.0795750766992569, + "rewards/rejected": -0.18672938644886017, + "step": 2865 + }, + { + "epoch": 0.4432244345640827, + "grad_norm": 4.541005611419678, + "learning_rate": 4.734792072402337e-06, + "logits/chosen": 6.470778465270996, + "logits/rejected": 7.629186630249023, + "logps/chosen": -208.55015563964844, + "logps/rejected": -222.7590789794922, + "loss": 0.6301, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.01057577133178711, + "rewards/margins": 0.20005419850349426, + "rewards/rejected": -0.18947842717170715, + "step": 2866 + }, + { + "epoch": 0.4433790837038469, + "grad_norm": 3.8445582389831543, + "learning_rate": 4.734505670752664e-06, + "logits/chosen": 13.53183364868164, + "logits/rejected": 8.576092720031738, + "logps/chosen": -319.5737609863281, + "logps/rejected": -236.65890502929688, + "loss": 0.4916, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.44668495655059814, + "rewards/margins": 0.614438533782959, + "rewards/rejected": -0.16775354743003845, + "step": 2867 + }, + { + "epoch": 0.4435337328436111, + "grad_norm": 3.8433218002319336, + "learning_rate": 4.7342192691029906e-06, + "logits/chosen": 11.131497383117676, + "logits/rejected": 8.642338752746582, + "logps/chosen": -379.417236328125, + "logps/rejected": -328.353515625, + "loss": 0.5224, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.39090633392333984, + "rewards/margins": 0.48262134194374084, + "rewards/rejected": -0.091715008020401, + "step": 2868 + }, + { + "epoch": 0.44368838198337524, + "grad_norm": 6.492985248565674, + "learning_rate": 4.733932867453317e-06, + "logits/chosen": 14.70721435546875, + "logits/rejected": 10.450970649719238, + "logps/chosen": -430.2880554199219, + "logps/rejected": -311.1890563964844, + "loss": 0.6463, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.14477311074733734, + "rewards/margins": 0.19298803806304932, + "rewards/rejected": -0.04821494221687317, + "step": 2869 + }, + { + "epoch": 0.4438430311231394, + "grad_norm": 6.110063076019287, + "learning_rate": 4.733646465803644e-06, + "logits/chosen": 7.2248921394348145, + "logits/rejected": 12.311100006103516, + "logps/chosen": -249.59519958496094, + "logps/rejected": -327.64544677734375, + "loss": 0.869, + "rewards/accuracies": 0.25, + "rewards/chosen": -0.14278936386108398, + "rewards/margins": -0.2975122928619385, + "rewards/rejected": 0.1547229290008545, + "step": 2870 + }, + { + "epoch": 0.44399768026290354, + "grad_norm": 7.39842414855957, + "learning_rate": 4.73336006415397e-06, + "logits/chosen": 11.0711669921875, + "logits/rejected": 2.909083843231201, + "logps/chosen": -305.2994384765625, + "logps/rejected": -229.380615234375, + "loss": 0.7751, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.04197189211845398, + "rewards/margins": 0.14328841865062714, + "rewards/rejected": -0.10131651163101196, + "step": 2871 + }, + { + "epoch": 0.4441523294026677, + "grad_norm": 4.908721446990967, + "learning_rate": 4.733073662504296e-06, + "logits/chosen": 8.833524703979492, + "logits/rejected": 8.912151336669922, + "logps/chosen": -206.72425842285156, + "logps/rejected": -327.9241638183594, + "loss": 0.5799, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.1936500072479248, + "rewards/margins": 0.2807844877243042, + "rewards/rejected": -0.0871344804763794, + "step": 2872 + }, + { + "epoch": 0.44430697854243184, + "grad_norm": 3.709312915802002, + "learning_rate": 4.732787260854623e-06, + "logits/chosen": 10.40578556060791, + "logits/rejected": 10.66954231262207, + "logps/chosen": -296.4912109375, + "logps/rejected": -288.18365478515625, + "loss": 0.5902, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.4365907907485962, + "rewards/margins": 0.40280190110206604, + "rewards/rejected": 0.03378888964653015, + "step": 2873 + }, + { + "epoch": 0.444461627682196, + "grad_norm": 4.196516036987305, + "learning_rate": 4.73250085920495e-06, + "logits/chosen": 6.399055004119873, + "logits/rejected": 8.222249984741211, + "logps/chosen": -161.1379852294922, + "logps/rejected": -190.58763122558594, + "loss": 0.6653, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.29221320152282715, + "rewards/margins": 0.09238973259925842, + "rewards/rejected": 0.19982348382472992, + "step": 2874 + }, + { + "epoch": 0.4446162768219602, + "grad_norm": 3.84584379196167, + "learning_rate": 4.732214457555276e-06, + "logits/chosen": 9.932168960571289, + "logits/rejected": 8.567069053649902, + "logps/chosen": -199.1146697998047, + "logps/rejected": -188.4136962890625, + "loss": 0.6714, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.05072886496782303, + "rewards/margins": 0.12173128873109818, + "rewards/rejected": -0.1724601686000824, + "step": 2875 + }, + { + "epoch": 0.44477092596172435, + "grad_norm": 7.2381591796875, + "learning_rate": 4.731928055905602e-06, + "logits/chosen": 4.177708148956299, + "logits/rejected": 0.7572007179260254, + "logps/chosen": -238.58975219726562, + "logps/rejected": -212.23751831054688, + "loss": 0.4967, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.26416513323783875, + "rewards/margins": 0.6271116733551025, + "rewards/rejected": -0.3629465401172638, + "step": 2876 + }, + { + "epoch": 0.4449255751014885, + "grad_norm": 4.084290027618408, + "learning_rate": 4.731641654255929e-06, + "logits/chosen": 11.849916458129883, + "logits/rejected": 9.19716739654541, + "logps/chosen": -207.31561279296875, + "logps/rejected": -205.89488220214844, + "loss": 0.5158, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.21161805093288422, + "rewards/margins": 0.5698764324188232, + "rewards/rejected": -0.35825836658477783, + "step": 2877 + }, + { + "epoch": 0.44508022424125265, + "grad_norm": 4.484077453613281, + "learning_rate": 4.731355252606255e-06, + "logits/chosen": 11.6097993850708, + "logits/rejected": 5.1055097579956055, + "logps/chosen": -326.378173828125, + "logps/rejected": -210.7898712158203, + "loss": 0.4662, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.4242036044597626, + "rewards/margins": 0.5861848592758179, + "rewards/rejected": -0.16198131442070007, + "step": 2878 + }, + { + "epoch": 0.4452348733810168, + "grad_norm": 6.345067977905273, + "learning_rate": 4.731068850956582e-06, + "logits/chosen": 14.856422424316406, + "logits/rejected": 8.679730415344238, + "logps/chosen": -344.90924072265625, + "logps/rejected": -295.05096435546875, + "loss": 0.7574, + "rewards/accuracies": 0.25, + "rewards/chosen": -0.2681482434272766, + "rewards/margins": -0.04513958841562271, + "rewards/rejected": -0.2230086326599121, + "step": 2879 + }, + { + "epoch": 0.44538952252078096, + "grad_norm": 5.224836826324463, + "learning_rate": 4.730782449306908e-06, + "logits/chosen": 12.31800651550293, + "logits/rejected": 5.714697360992432, + "logps/chosen": -284.20989990234375, + "logps/rejected": -223.97613525390625, + "loss": 0.5641, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.1688869595527649, + "rewards/margins": 0.37766990065574646, + "rewards/rejected": -0.20878297090530396, + "step": 2880 + }, + { + "epoch": 0.44554417166054516, + "grad_norm": 8.624768257141113, + "learning_rate": 4.7304960476572345e-06, + "logits/chosen": 11.577617645263672, + "logits/rejected": 7.260366439819336, + "logps/chosen": -387.6743469238281, + "logps/rejected": -306.60614013671875, + "loss": 0.8103, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.02204202115535736, + "rewards/margins": -0.09120999276638031, + "rewards/rejected": 0.06916798651218414, + "step": 2881 + }, + { + "epoch": 0.4456988208003093, + "grad_norm": 4.786019802093506, + "learning_rate": 4.730209646007561e-06, + "logits/chosen": 8.661097526550293, + "logits/rejected": 3.394134521484375, + "logps/chosen": -293.8501281738281, + "logps/rejected": -209.88909912109375, + "loss": 0.6564, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.21976405382156372, + "rewards/margins": 0.349984347820282, + "rewards/rejected": -0.13022029399871826, + "step": 2882 + }, + { + "epoch": 0.44585346994007347, + "grad_norm": 5.908203601837158, + "learning_rate": 4.729923244357888e-06, + "logits/chosen": 11.351553916931152, + "logits/rejected": 9.534684181213379, + "logps/chosen": -214.1370086669922, + "logps/rejected": -250.79733276367188, + "loss": 0.7505, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.20431514084339142, + "rewards/margins": -0.060582488775253296, + "rewards/rejected": -0.14373263716697693, + "step": 2883 + }, + { + "epoch": 0.4460081190798376, + "grad_norm": 4.996636390686035, + "learning_rate": 4.7296368427082144e-06, + "logits/chosen": 12.525541305541992, + "logits/rejected": 10.830989837646484, + "logps/chosen": -268.62042236328125, + "logps/rejected": -227.0744171142578, + "loss": 0.6361, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.4013764560222626, + "rewards/margins": 0.19802626967430115, + "rewards/rejected": 0.20335017144680023, + "step": 2884 + }, + { + "epoch": 0.44616276821960177, + "grad_norm": 4.845539569854736, + "learning_rate": 4.729350441058541e-06, + "logits/chosen": 8.097562789916992, + "logits/rejected": 4.926815032958984, + "logps/chosen": -145.10507202148438, + "logps/rejected": -147.08103942871094, + "loss": 0.7837, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.06862720847129822, + "rewards/margins": -0.09219758212566376, + "rewards/rejected": 0.023570358753204346, + "step": 2885 + }, + { + "epoch": 0.4463174173593659, + "grad_norm": 3.833444833755493, + "learning_rate": 4.729064039408867e-06, + "logits/chosen": 11.570324897766113, + "logits/rejected": 3.198279857635498, + "logps/chosen": -240.45358276367188, + "logps/rejected": -182.78855895996094, + "loss": 0.5184, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.4192468225955963, + "rewards/margins": 0.4428374469280243, + "rewards/rejected": -0.023590609431266785, + "step": 2886 + }, + { + "epoch": 0.44647206649913007, + "grad_norm": 6.880417346954346, + "learning_rate": 4.7287776377591935e-06, + "logits/chosen": 14.952377319335938, + "logits/rejected": 6.515292167663574, + "logps/chosen": -471.180419921875, + "logps/rejected": -306.2154235839844, + "loss": 0.6907, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.017264170572161674, + "rewards/margins": 0.1398581862449646, + "rewards/rejected": -0.15712234377861023, + "step": 2887 + }, + { + "epoch": 0.4466267156388943, + "grad_norm": 6.328181743621826, + "learning_rate": 4.72849123610952e-06, + "logits/chosen": 5.250241756439209, + "logits/rejected": 8.796283721923828, + "logps/chosen": -282.8057556152344, + "logps/rejected": -321.9102478027344, + "loss": 0.7552, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.23810362815856934, + "rewards/margins": -0.07757376879453659, + "rewards/rejected": 0.3156774044036865, + "step": 2888 + }, + { + "epoch": 0.44678136477865843, + "grad_norm": 5.26899528503418, + "learning_rate": 4.728204834459847e-06, + "logits/chosen": 8.859580993652344, + "logits/rejected": 12.903532981872559, + "logps/chosen": -283.0203857421875, + "logps/rejected": -275.4232177734375, + "loss": 0.6869, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.17853814363479614, + "rewards/margins": 0.049526508897542953, + "rewards/rejected": 0.1290116310119629, + "step": 2889 + }, + { + "epoch": 0.4469360139184226, + "grad_norm": 7.279280662536621, + "learning_rate": 4.7279184328101735e-06, + "logits/chosen": 7.685791969299316, + "logits/rejected": 10.467572212219238, + "logps/chosen": -234.00697326660156, + "logps/rejected": -311.71002197265625, + "loss": 0.8521, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.07433100044727325, + "rewards/margins": -0.19696417450904846, + "rewards/rejected": 0.2712951898574829, + "step": 2890 + }, + { + "epoch": 0.44709066305818673, + "grad_norm": 6.728243827819824, + "learning_rate": 4.7276320311605e-06, + "logits/chosen": 7.963534832000732, + "logits/rejected": 6.767995357513428, + "logps/chosen": -333.11199951171875, + "logps/rejected": -330.29608154296875, + "loss": 0.8359, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.24529170989990234, + "rewards/margins": -0.007717058062553406, + "rewards/rejected": 0.25300878286361694, + "step": 2891 + }, + { + "epoch": 0.4472453121979509, + "grad_norm": 5.864964962005615, + "learning_rate": 4.727345629510826e-06, + "logits/chosen": 4.441109657287598, + "logits/rejected": 6.527352809906006, + "logps/chosen": -211.81524658203125, + "logps/rejected": -303.90679931640625, + "loss": 0.7232, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.17581599950790405, + "rewards/margins": -0.0022001415491104126, + "rewards/rejected": 0.17801614105701447, + "step": 2892 + }, + { + "epoch": 0.44739996133771504, + "grad_norm": 5.700013637542725, + "learning_rate": 4.727059227861153e-06, + "logits/chosen": 14.111230850219727, + "logits/rejected": 4.846957206726074, + "logps/chosen": -390.2244873046875, + "logps/rejected": -273.7905578613281, + "loss": 0.4364, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.48367539048194885, + "rewards/margins": 0.8490333557128906, + "rewards/rejected": -0.36535799503326416, + "step": 2893 + }, + { + "epoch": 0.44755461047747924, + "grad_norm": 8.39551830291748, + "learning_rate": 4.726772826211479e-06, + "logits/chosen": 6.948728561401367, + "logits/rejected": 5.054862976074219, + "logps/chosen": -444.1217346191406, + "logps/rejected": -310.09161376953125, + "loss": 0.8162, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.14990711212158203, + "rewards/margins": 0.020329445600509644, + "rewards/rejected": 0.12957763671875, + "step": 2894 + }, + { + "epoch": 0.4477092596172434, + "grad_norm": 6.756582736968994, + "learning_rate": 4.726486424561806e-06, + "logits/chosen": 8.98362922668457, + "logits/rejected": 10.83120059967041, + "logps/chosen": -225.34942626953125, + "logps/rejected": -311.18133544921875, + "loss": 0.7712, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.08209677040576935, + "rewards/margins": -0.11063957214355469, + "rewards/rejected": 0.19273634254932404, + "step": 2895 + }, + { + "epoch": 0.44786390875700754, + "grad_norm": 6.001872539520264, + "learning_rate": 4.7262000229121325e-06, + "logits/chosen": 8.821746826171875, + "logits/rejected": 10.922311782836914, + "logps/chosen": -282.3558349609375, + "logps/rejected": -290.3909912109375, + "loss": 0.6489, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.11806097626686096, + "rewards/margins": 0.3032827079296112, + "rewards/rejected": -0.18522171676158905, + "step": 2896 + }, + { + "epoch": 0.4480185578967717, + "grad_norm": 4.782827854156494, + "learning_rate": 4.725913621262459e-06, + "logits/chosen": 6.726935386657715, + "logits/rejected": 5.239489555358887, + "logps/chosen": -283.99639892578125, + "logps/rejected": -273.02099609375, + "loss": 0.5221, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.19964337348937988, + "rewards/margins": 0.4158599078655243, + "rewards/rejected": -0.2162165492773056, + "step": 2897 + }, + { + "epoch": 0.44817320703653585, + "grad_norm": 5.477623462677002, + "learning_rate": 4.725627219612786e-06, + "logits/chosen": 14.605005264282227, + "logits/rejected": 12.554754257202148, + "logps/chosen": -345.5086669921875, + "logps/rejected": -252.9807586669922, + "loss": 0.639, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.48494040966033936, + "rewards/margins": 0.18280187249183655, + "rewards/rejected": 0.3021385669708252, + "step": 2898 + }, + { + "epoch": 0.4483278561763, + "grad_norm": 3.8706207275390625, + "learning_rate": 4.725340817963112e-06, + "logits/chosen": 8.385713577270508, + "logits/rejected": 9.839127540588379, + "logps/chosen": -174.33877563476562, + "logps/rejected": -173.85263061523438, + "loss": 0.4903, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.2907941937446594, + "rewards/margins": 0.5425559282302856, + "rewards/rejected": -0.2517617344856262, + "step": 2899 + }, + { + "epoch": 0.4484825053160642, + "grad_norm": 3.753722667694092, + "learning_rate": 4.725054416313438e-06, + "logits/chosen": 12.366252899169922, + "logits/rejected": 4.724295616149902, + "logps/chosen": -434.60870361328125, + "logps/rejected": -274.9190368652344, + "loss": 0.3718, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.2013658583164215, + "rewards/margins": 1.0279592275619507, + "rewards/rejected": -0.8265933394432068, + "step": 2900 + }, + { + "epoch": 0.44863715445582836, + "grad_norm": 6.165163516998291, + "learning_rate": 4.724768014663765e-06, + "logits/chosen": 8.658722877502441, + "logits/rejected": 7.678178787231445, + "logps/chosen": -170.8721160888672, + "logps/rejected": -172.74989318847656, + "loss": 0.9867, + "rewards/accuracies": 0.0, + "rewards/chosen": -0.27297621965408325, + "rewards/margins": -0.4973093867301941, + "rewards/rejected": 0.22433319687843323, + "step": 2901 + }, + { + "epoch": 0.4487918035955925, + "grad_norm": 6.165961742401123, + "learning_rate": 4.724481613014092e-06, + "logits/chosen": 8.833492279052734, + "logits/rejected": 2.277181625366211, + "logps/chosen": -241.00518798828125, + "logps/rejected": -152.55735778808594, + "loss": 0.7116, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.021444957703351974, + "rewards/margins": -0.00022698938846588135, + "rewards/rejected": -0.021217960864305496, + "step": 2902 + }, + { + "epoch": 0.44894645273535666, + "grad_norm": 9.235562324523926, + "learning_rate": 4.724195211364418e-06, + "logits/chosen": 12.745402336120605, + "logits/rejected": 5.982780456542969, + "logps/chosen": -386.70623779296875, + "logps/rejected": -264.0520324707031, + "loss": 0.654, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.6716396808624268, + "rewards/margins": 0.4295971393585205, + "rewards/rejected": 0.24204254150390625, + "step": 2903 + }, + { + "epoch": 0.4491011018751208, + "grad_norm": 6.493156909942627, + "learning_rate": 4.723908809714745e-06, + "logits/chosen": 12.71495246887207, + "logits/rejected": 10.095673561096191, + "logps/chosen": -310.887939453125, + "logps/rejected": -335.0145263671875, + "loss": 0.6425, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.12912635505199432, + "rewards/margins": 0.1881587952375412, + "rewards/rejected": -0.05903243273496628, + "step": 2904 + }, + { + "epoch": 0.44925575101488496, + "grad_norm": 8.074202537536621, + "learning_rate": 4.723622408065071e-06, + "logits/chosen": 6.4809064865112305, + "logits/rejected": 6.001316547393799, + "logps/chosen": -368.92486572265625, + "logps/rejected": -392.9451599121094, + "loss": 0.8898, + "rewards/accuracies": 0.25, + "rewards/chosen": 0.3877643942832947, + "rewards/margins": -0.2901355028152466, + "rewards/rejected": 0.6778998970985413, + "step": 2905 + }, + { + "epoch": 0.4494104001546491, + "grad_norm": 6.152915000915527, + "learning_rate": 4.723336006415397e-06, + "logits/chosen": 4.247409820556641, + "logits/rejected": 11.452526092529297, + "logps/chosen": -176.74588012695312, + "logps/rejected": -200.24981689453125, + "loss": 0.7763, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.08430720865726471, + "rewards/margins": -0.09208157658576965, + "rewards/rejected": 0.007774388417601585, + "step": 2906 + }, + { + "epoch": 0.4495650492944133, + "grad_norm": 3.5732929706573486, + "learning_rate": 4.723049604765724e-06, + "logits/chosen": 8.244793891906738, + "logits/rejected": 7.020036697387695, + "logps/chosen": -210.1520538330078, + "logps/rejected": -169.88589477539062, + "loss": 0.4803, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.03771691024303436, + "rewards/margins": 0.5712445378303528, + "rewards/rejected": -0.5335276126861572, + "step": 2907 + }, + { + "epoch": 0.4497196984341775, + "grad_norm": 5.908124923706055, + "learning_rate": 4.722763203116051e-06, + "logits/chosen": 6.088689804077148, + "logits/rejected": 4.099931240081787, + "logps/chosen": -207.8154296875, + "logps/rejected": -217.70750427246094, + "loss": 0.9709, + "rewards/accuracies": 0.125, + "rewards/chosen": -0.11870688199996948, + "rewards/margins": -0.4534386396408081, + "rewards/rejected": 0.3347318172454834, + "step": 2908 + }, + { + "epoch": 0.4498743475739416, + "grad_norm": 6.127265930175781, + "learning_rate": 4.7224768014663765e-06, + "logits/chosen": 11.193788528442383, + "logits/rejected": 10.419739723205566, + "logps/chosen": -279.6499938964844, + "logps/rejected": -285.610107421875, + "loss": 0.7198, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.1776106357574463, + "rewards/margins": 0.06285585463047028, + "rewards/rejected": 0.11475477367639542, + "step": 2909 + }, + { + "epoch": 0.4500289967137058, + "grad_norm": 7.298559665679932, + "learning_rate": 4.722190399816703e-06, + "logits/chosen": 6.826419830322266, + "logits/rejected": 5.780760288238525, + "logps/chosen": -189.8668212890625, + "logps/rejected": -177.15286254882812, + "loss": 1.1304, + "rewards/accuracies": 0.125, + "rewards/chosen": -0.3428311049938202, + "rewards/margins": -0.6570719480514526, + "rewards/rejected": 0.31424081325531006, + "step": 2910 + }, + { + "epoch": 0.4501836458534699, + "grad_norm": 3.988098621368408, + "learning_rate": 4.72190399816703e-06, + "logits/chosen": 4.990813255310059, + "logits/rejected": 4.173823356628418, + "logps/chosen": -241.4462890625, + "logps/rejected": -195.440185546875, + "loss": 0.5807, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.08225131034851074, + "rewards/margins": 0.3180958330631256, + "rewards/rejected": -0.40034714341163635, + "step": 2911 + }, + { + "epoch": 0.4503382949932341, + "grad_norm": 6.4938130378723145, + "learning_rate": 4.721617596517356e-06, + "logits/chosen": 12.63523006439209, + "logits/rejected": 5.449173927307129, + "logps/chosen": -310.046630859375, + "logps/rejected": -224.2534942626953, + "loss": 0.7482, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.06133061647415161, + "rewards/margins": -0.0026338621973991394, + "rewards/rejected": -0.058696746826171875, + "step": 2912 + }, + { + "epoch": 0.4504929441329983, + "grad_norm": 12.178394317626953, + "learning_rate": 4.721331194867683e-06, + "logits/chosen": 9.63875961303711, + "logits/rejected": 6.1392316818237305, + "logps/chosen": -288.7957763671875, + "logps/rejected": -261.6700439453125, + "loss": 0.7464, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.07077684253454208, + "rewards/margins": 0.054892003536224365, + "rewards/rejected": 0.01588483154773712, + "step": 2913 + }, + { + "epoch": 0.45064759327276244, + "grad_norm": 6.8520612716674805, + "learning_rate": 4.721044793218009e-06, + "logits/chosen": 11.555852890014648, + "logits/rejected": 9.000267028808594, + "logps/chosen": -354.20050048828125, + "logps/rejected": -288.676513671875, + "loss": 0.6133, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.16141340136528015, + "rewards/margins": 0.2694016993045807, + "rewards/rejected": -0.10798830538988113, + "step": 2914 + }, + { + "epoch": 0.4508022424125266, + "grad_norm": 6.7219438552856445, + "learning_rate": 4.7207583915683355e-06, + "logits/chosen": 7.465461254119873, + "logits/rejected": 13.928438186645508, + "logps/chosen": -259.7578125, + "logps/rejected": -347.8528137207031, + "loss": 0.8802, + "rewards/accuracies": 0.25, + "rewards/chosen": 0.0214422345161438, + "rewards/margins": -0.19802358746528625, + "rewards/rejected": 0.21946582198143005, + "step": 2915 + }, + { + "epoch": 0.45095689155229074, + "grad_norm": 5.846672534942627, + "learning_rate": 4.720471989918662e-06, + "logits/chosen": 7.874068737030029, + "logits/rejected": 11.163753509521484, + "logps/chosen": -341.92974853515625, + "logps/rejected": -356.7088928222656, + "loss": 0.6387, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.18847458064556122, + "rewards/margins": 0.26053744554519653, + "rewards/rejected": -0.07206287235021591, + "step": 2916 + }, + { + "epoch": 0.4511115406920549, + "grad_norm": 3.4652023315429688, + "learning_rate": 4.720185588268989e-06, + "logits/chosen": 11.5037202835083, + "logits/rejected": 4.1251983642578125, + "logps/chosen": -203.9507293701172, + "logps/rejected": -116.57904052734375, + "loss": 0.6846, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.03298850357532501, + "rewards/margins": 0.11044225841760635, + "rewards/rejected": -0.07745376229286194, + "step": 2917 + }, + { + "epoch": 0.45126618983181904, + "grad_norm": 4.189902305603027, + "learning_rate": 4.7198991866193155e-06, + "logits/chosen": 11.779868125915527, + "logits/rejected": -1.0590565204620361, + "logps/chosen": -261.41375732421875, + "logps/rejected": -124.47135925292969, + "loss": 0.5819, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.049158379435539246, + "rewards/margins": 0.5056743025779724, + "rewards/rejected": -0.456515908241272, + "step": 2918 + }, + { + "epoch": 0.4514208389715832, + "grad_norm": 4.399352550506592, + "learning_rate": 4.719612784969641e-06, + "logits/chosen": 10.58271312713623, + "logits/rejected": 10.691996574401855, + "logps/chosen": -217.08486938476562, + "logps/rejected": -211.27662658691406, + "loss": 0.6223, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.0179380364716053, + "rewards/margins": 0.1928594410419464, + "rewards/rejected": -0.1749214231967926, + "step": 2919 + }, + { + "epoch": 0.4515754881113474, + "grad_norm": 5.943427562713623, + "learning_rate": 4.719326383319968e-06, + "logits/chosen": 6.621442794799805, + "logits/rejected": 4.642213344573975, + "logps/chosen": -247.37417602539062, + "logps/rejected": -218.21728515625, + "loss": 0.6181, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.10349779576063156, + "rewards/margins": 0.2949559688568115, + "rewards/rejected": -0.19145813584327698, + "step": 2920 + }, + { + "epoch": 0.45173013725111155, + "grad_norm": 3.9639053344726562, + "learning_rate": 4.7190399816702946e-06, + "logits/chosen": 7.8957438468933105, + "logits/rejected": -1.8594489097595215, + "logps/chosen": -338.7899169921875, + "logps/rejected": -201.40090942382812, + "loss": 0.5404, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.4964830279350281, + "rewards/margins": 0.4400922656059265, + "rewards/rejected": 0.05639071762561798, + "step": 2921 + }, + { + "epoch": 0.4518847863908757, + "grad_norm": 5.448639392852783, + "learning_rate": 4.718753580020621e-06, + "logits/chosen": 6.615348815917969, + "logits/rejected": 2.5457427501678467, + "logps/chosen": -345.60064697265625, + "logps/rejected": -294.2164611816406, + "loss": 0.5164, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.34713125228881836, + "rewards/margins": 0.5121694207191467, + "rewards/rejected": -0.16503816843032837, + "step": 2922 + }, + { + "epoch": 0.45203943553063985, + "grad_norm": 7.111696720123291, + "learning_rate": 4.718467178370948e-06, + "logits/chosen": 8.019306182861328, + "logits/rejected": 7.517104625701904, + "logps/chosen": -350.3014221191406, + "logps/rejected": -288.8633117675781, + "loss": 0.5941, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.288277804851532, + "rewards/margins": 0.28707897663116455, + "rewards/rejected": 0.0011988431215286255, + "step": 2923 + }, + { + "epoch": 0.452194084670404, + "grad_norm": 5.23995304107666, + "learning_rate": 4.7181807767212745e-06, + "logits/chosen": 8.124153137207031, + "logits/rejected": 9.436251640319824, + "logps/chosen": -284.27801513671875, + "logps/rejected": -277.70611572265625, + "loss": 0.7135, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.08599749207496643, + "rewards/margins": -0.021200411021709442, + "rewards/rejected": 0.10719789564609528, + "step": 2924 + }, + { + "epoch": 0.45234873381016816, + "grad_norm": 5.256038188934326, + "learning_rate": 4.7178943750716e-06, + "logits/chosen": 8.66554069519043, + "logits/rejected": 4.414836406707764, + "logps/chosen": -181.9287109375, + "logps/rejected": -150.93508911132812, + "loss": 0.6235, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.0687284767627716, + "rewards/margins": 0.2929876744747162, + "rewards/rejected": -0.3617161512374878, + "step": 2925 + }, + { + "epoch": 0.45250338294993236, + "grad_norm": 7.359097003936768, + "learning_rate": 4.717607973421927e-06, + "logits/chosen": 1.096543550491333, + "logits/rejected": 6.549772262573242, + "logps/chosen": -290.3392028808594, + "logps/rejected": -384.2798156738281, + "loss": 0.8194, + "rewards/accuracies": 0.125, + "rewards/chosen": 0.26173877716064453, + "rewards/margins": -0.1917702704668045, + "rewards/rejected": 0.45350903272628784, + "step": 2926 + }, + { + "epoch": 0.4526580320896965, + "grad_norm": 4.2451629638671875, + "learning_rate": 4.717321571772254e-06, + "logits/chosen": 9.908976554870605, + "logits/rejected": 6.200870037078857, + "logps/chosen": -174.06968688964844, + "logps/rejected": -154.8437042236328, + "loss": 0.6444, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.03125952184200287, + "rewards/margins": 0.20702381432056427, + "rewards/rejected": -0.23828335106372833, + "step": 2927 + }, + { + "epoch": 0.45281268122946067, + "grad_norm": 4.497978210449219, + "learning_rate": 4.71703517012258e-06, + "logits/chosen": 9.630134582519531, + "logits/rejected": 3.6748063564300537, + "logps/chosen": -295.34820556640625, + "logps/rejected": -263.46453857421875, + "loss": 0.6878, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.1658172607421875, + "rewards/margins": 0.2573878765106201, + "rewards/rejected": -0.091570645570755, + "step": 2928 + }, + { + "epoch": 0.4529673303692248, + "grad_norm": 9.614076614379883, + "learning_rate": 4.716748768472907e-06, + "logits/chosen": 9.055042266845703, + "logits/rejected": 5.912835121154785, + "logps/chosen": -337.821044921875, + "logps/rejected": -322.4443359375, + "loss": 0.8793, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.38718166947364807, + "rewards/margins": -0.21364375948905945, + "rewards/rejected": -0.17353789508342743, + "step": 2929 + }, + { + "epoch": 0.45312197950898897, + "grad_norm": 6.579684734344482, + "learning_rate": 4.716462366823234e-06, + "logits/chosen": 9.525078773498535, + "logits/rejected": 4.042651653289795, + "logps/chosen": -430.9586486816406, + "logps/rejected": -267.54595947265625, + "loss": 0.533, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.36265188455581665, + "rewards/margins": 0.6446887254714966, + "rewards/rejected": -0.28203684091567993, + "step": 2930 + }, + { + "epoch": 0.4532766286487531, + "grad_norm": 4.843046188354492, + "learning_rate": 4.71617596517356e-06, + "logits/chosen": 9.071455001831055, + "logits/rejected": 3.2884366512298584, + "logps/chosen": -255.28253173828125, + "logps/rejected": -168.43502807617188, + "loss": 0.7336, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.09355826675891876, + "rewards/margins": 0.007359735667705536, + "rewards/rejected": 0.08619852364063263, + "step": 2931 + }, + { + "epoch": 0.45343127778851733, + "grad_norm": 5.47656774520874, + "learning_rate": 4.715889563523886e-06, + "logits/chosen": 13.489498138427734, + "logits/rejected": 12.307373046875, + "logps/chosen": -225.63623046875, + "logps/rejected": -243.85015869140625, + "loss": 0.6866, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.09337463974952698, + "rewards/margins": 0.06104620546102524, + "rewards/rejected": -0.1544208526611328, + "step": 2932 + }, + { + "epoch": 0.4535859269282815, + "grad_norm": 5.834225177764893, + "learning_rate": 4.715603161874213e-06, + "logits/chosen": 9.431708335876465, + "logits/rejected": 7.638908386230469, + "logps/chosen": -302.6466064453125, + "logps/rejected": -247.7978057861328, + "loss": 0.5391, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.09487934410572052, + "rewards/margins": 0.44622403383255005, + "rewards/rejected": -0.3513447046279907, + "step": 2933 + }, + { + "epoch": 0.45374057606804563, + "grad_norm": 4.184797286987305, + "learning_rate": 4.715316760224539e-06, + "logits/chosen": 8.781228065490723, + "logits/rejected": 6.778829574584961, + "logps/chosen": -264.7762451171875, + "logps/rejected": -270.5966796875, + "loss": 0.5381, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.38937321305274963, + "rewards/margins": 0.5425171256065369, + "rewards/rejected": -0.15314389765262604, + "step": 2934 + }, + { + "epoch": 0.4538952252078098, + "grad_norm": 7.629965305328369, + "learning_rate": 4.715030358574866e-06, + "logits/chosen": 8.832340240478516, + "logits/rejected": 7.788751602172852, + "logps/chosen": -261.5520324707031, + "logps/rejected": -257.4610900878906, + "loss": 0.5413, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.2996445298194885, + "rewards/margins": 0.5069663524627686, + "rewards/rejected": -0.20732179284095764, + "step": 2935 + }, + { + "epoch": 0.45404987434757393, + "grad_norm": 18.59811019897461, + "learning_rate": 4.714743956925193e-06, + "logits/chosen": 8.237777709960938, + "logits/rejected": 6.447390556335449, + "logps/chosen": -283.7626647949219, + "logps/rejected": -213.61549377441406, + "loss": 0.6715, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.2837170958518982, + "rewards/margins": 0.1279708445072174, + "rewards/rejected": 0.15574628114700317, + "step": 2936 + }, + { + "epoch": 0.4542045234873381, + "grad_norm": 5.235077857971191, + "learning_rate": 4.714457555275519e-06, + "logits/chosen": 13.669167518615723, + "logits/rejected": 10.029403686523438, + "logps/chosen": -276.7439270019531, + "logps/rejected": -259.11639404296875, + "loss": 0.5792, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.1466926783323288, + "rewards/margins": 0.3439185321331024, + "rewards/rejected": -0.19722583889961243, + "step": 2937 + }, + { + "epoch": 0.45435917262710224, + "grad_norm": 8.17039680480957, + "learning_rate": 4.714171153625846e-06, + "logits/chosen": 12.728302955627441, + "logits/rejected": 10.468025207519531, + "logps/chosen": -398.0960693359375, + "logps/rejected": -307.9277648925781, + "loss": 0.7742, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.39349180459976196, + "rewards/margins": -0.043975263833999634, + "rewards/rejected": -0.3495165705680847, + "step": 2938 + }, + { + "epoch": 0.45451382176686644, + "grad_norm": 4.526334285736084, + "learning_rate": 4.713884751976172e-06, + "logits/chosen": 8.52957820892334, + "logits/rejected": 5.860341548919678, + "logps/chosen": -276.20379638671875, + "logps/rejected": -196.57733154296875, + "loss": 0.599, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.05858264118432999, + "rewards/margins": 0.25034815073013306, + "rewards/rejected": -0.30893078446388245, + "step": 2939 + }, + { + "epoch": 0.4546684709066306, + "grad_norm": 5.230543613433838, + "learning_rate": 4.713598350326498e-06, + "logits/chosen": 9.470664024353027, + "logits/rejected": 6.667529582977295, + "logps/chosen": -258.9997253417969, + "logps/rejected": -284.2755126953125, + "loss": 0.6478, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.1312621384859085, + "rewards/margins": 0.14021340012550354, + "rewards/rejected": -0.008951276540756226, + "step": 2940 + }, + { + "epoch": 0.45482312004639475, + "grad_norm": 3.7914252281188965, + "learning_rate": 4.713311948676825e-06, + "logits/chosen": 9.263826370239258, + "logits/rejected": -0.4324500560760498, + "logps/chosen": -217.45944213867188, + "logps/rejected": -125.88097381591797, + "loss": 0.6231, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.0592634491622448, + "rewards/margins": 0.2843110263347626, + "rewards/rejected": -0.3435744643211365, + "step": 2941 + }, + { + "epoch": 0.4549777691861589, + "grad_norm": 9.012472152709961, + "learning_rate": 4.713025547027152e-06, + "logits/chosen": 6.490154266357422, + "logits/rejected": 6.397916316986084, + "logps/chosen": -307.388671875, + "logps/rejected": -282.1990966796875, + "loss": 0.9156, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.1721625179052353, + "rewards/margins": -0.19464808702468872, + "rewards/rejected": 0.022485554218292236, + "step": 2942 + }, + { + "epoch": 0.45513241832592305, + "grad_norm": 6.005348205566406, + "learning_rate": 4.7127391453774775e-06, + "logits/chosen": 13.595619201660156, + "logits/rejected": 8.500800132751465, + "logps/chosen": -287.17926025390625, + "logps/rejected": -235.2257843017578, + "loss": 0.6991, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.12554672360420227, + "rewards/margins": 0.049536775797605515, + "rewards/rejected": 0.07600995153188705, + "step": 2943 + }, + { + "epoch": 0.4552870674656872, + "grad_norm": 7.570594787597656, + "learning_rate": 4.712452743727804e-06, + "logits/chosen": 10.391090393066406, + "logits/rejected": 5.073013782501221, + "logps/chosen": -282.4736633300781, + "logps/rejected": -215.39175415039062, + "loss": 0.6653, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.293901264667511, + "rewards/margins": 0.13103628158569336, + "rewards/rejected": -0.42493754625320435, + "step": 2944 + }, + { + "epoch": 0.4554417166054514, + "grad_norm": 5.320469856262207, + "learning_rate": 4.712166342078131e-06, + "logits/chosen": 10.800678253173828, + "logits/rejected": 7.414029121398926, + "logps/chosen": -260.4130859375, + "logps/rejected": -228.25308227539062, + "loss": 0.6105, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.12191639840602875, + "rewards/margins": 0.27259302139282227, + "rewards/rejected": -0.1506766378879547, + "step": 2945 + }, + { + "epoch": 0.45559636574521556, + "grad_norm": 7.1692795753479, + "learning_rate": 4.7118799404284574e-06, + "logits/chosen": 5.790440082550049, + "logits/rejected": 9.175068855285645, + "logps/chosen": -303.69293212890625, + "logps/rejected": -370.9253845214844, + "loss": 0.948, + "rewards/accuracies": 0.125, + "rewards/chosen": -0.057733919471502304, + "rewards/margins": -0.36218681931495667, + "rewards/rejected": 0.30445292592048645, + "step": 2946 + }, + { + "epoch": 0.4557510148849797, + "grad_norm": 5.838233470916748, + "learning_rate": 4.711593538778783e-06, + "logits/chosen": 13.011774063110352, + "logits/rejected": 8.782716751098633, + "logps/chosen": -422.3138122558594, + "logps/rejected": -327.1571044921875, + "loss": 0.6231, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.583827018737793, + "rewards/margins": 0.25265467166900635, + "rewards/rejected": 0.3311723470687866, + "step": 2947 + }, + { + "epoch": 0.45590566402474386, + "grad_norm": 7.294370174407959, + "learning_rate": 4.71130713712911e-06, + "logits/chosen": 9.52261734008789, + "logits/rejected": 10.294153213500977, + "logps/chosen": -422.61273193359375, + "logps/rejected": -390.1136474609375, + "loss": 0.569, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.5218958258628845, + "rewards/margins": 0.4072917699813843, + "rewards/rejected": 0.11460405588150024, + "step": 2948 + }, + { + "epoch": 0.456060313164508, + "grad_norm": 6.742400169372559, + "learning_rate": 4.7110207354794366e-06, + "logits/chosen": 9.81273365020752, + "logits/rejected": 7.5383405685424805, + "logps/chosen": -310.8827819824219, + "logps/rejected": -266.2939758300781, + "loss": 0.6422, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.4784916043281555, + "rewards/margins": 0.23882469534873962, + "rewards/rejected": 0.2396669089794159, + "step": 2949 + }, + { + "epoch": 0.45621496230427216, + "grad_norm": 6.285976409912109, + "learning_rate": 4.710734333829763e-06, + "logits/chosen": 15.320674896240234, + "logits/rejected": 13.500925064086914, + "logps/chosen": -305.8255310058594, + "logps/rejected": -246.95144653320312, + "loss": 0.6347, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.2369910180568695, + "rewards/margins": 0.14823904633522034, + "rewards/rejected": 0.08875197917222977, + "step": 2950 + }, + { + "epoch": 0.4563696114440363, + "grad_norm": 10.00040054321289, + "learning_rate": 4.71044793218009e-06, + "logits/chosen": 12.439031600952148, + "logits/rejected": 8.216947555541992, + "logps/chosen": -416.4801025390625, + "logps/rejected": -335.2454528808594, + "loss": 0.6619, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.06558550894260406, + "rewards/margins": 0.15731868147850037, + "rewards/rejected": -0.0917331725358963, + "step": 2951 + }, + { + "epoch": 0.4565242605838005, + "grad_norm": 4.993187427520752, + "learning_rate": 4.710161530530416e-06, + "logits/chosen": 5.436563968658447, + "logits/rejected": 4.8355488777160645, + "logps/chosen": -309.2663879394531, + "logps/rejected": -268.904541015625, + "loss": 0.639, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.20571264624595642, + "rewards/margins": 0.14895473420619965, + "rewards/rejected": 0.05675792321562767, + "step": 2952 + }, + { + "epoch": 0.4566789097235647, + "grad_norm": 3.816739559173584, + "learning_rate": 4.709875128880742e-06, + "logits/chosen": 10.772808074951172, + "logits/rejected": 12.702529907226562, + "logps/chosen": -130.58863830566406, + "logps/rejected": -108.5149154663086, + "loss": 0.6891, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.06780728697776794, + "rewards/margins": 0.02490680105984211, + "rewards/rejected": -0.0927141010761261, + "step": 2953 + }, + { + "epoch": 0.4568335588633288, + "grad_norm": 8.872064590454102, + "learning_rate": 4.709588727231069e-06, + "logits/chosen": 10.774144172668457, + "logits/rejected": 9.176715850830078, + "logps/chosen": -183.29100036621094, + "logps/rejected": -157.44570922851562, + "loss": 0.5443, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.378996878862381, + "rewards/margins": 0.3657678961753845, + "rewards/rejected": 0.013228986412286758, + "step": 2954 + }, + { + "epoch": 0.456988208003093, + "grad_norm": 11.244800567626953, + "learning_rate": 4.709302325581396e-06, + "logits/chosen": 12.992457389831543, + "logits/rejected": 5.650915145874023, + "logps/chosen": -223.0264892578125, + "logps/rejected": -215.3613739013672, + "loss": 0.7114, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.029163725674152374, + "rewards/margins": 0.1944197714328766, + "rewards/rejected": -0.22358348965644836, + "step": 2955 + }, + { + "epoch": 0.45714285714285713, + "grad_norm": 6.742232322692871, + "learning_rate": 4.709015923931722e-06, + "logits/chosen": 5.833223819732666, + "logits/rejected": -0.9751105308532715, + "logps/chosen": -503.0262756347656, + "logps/rejected": -311.10504150390625, + "loss": 0.6434, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.26541581749916077, + "rewards/margins": 0.1657085418701172, + "rewards/rejected": 0.09970728307962418, + "step": 2956 + }, + { + "epoch": 0.4572975062826213, + "grad_norm": 5.457131385803223, + "learning_rate": 4.708729522282049e-06, + "logits/chosen": 5.667617321014404, + "logits/rejected": 9.613898277282715, + "logps/chosen": -177.75857543945312, + "logps/rejected": -226.108642578125, + "loss": 0.8716, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.24148930609226227, + "rewards/margins": -0.23205962777137756, + "rewards/rejected": -0.009429693222045898, + "step": 2957 + }, + { + "epoch": 0.4574521554223855, + "grad_norm": 6.827175617218018, + "learning_rate": 4.708443120632375e-06, + "logits/chosen": 8.793025016784668, + "logits/rejected": 11.108979225158691, + "logps/chosen": -138.37109375, + "logps/rejected": -147.51718139648438, + "loss": 0.8412, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.14409655332565308, + "rewards/margins": -0.19285935163497925, + "rewards/rejected": 0.04876277595758438, + "step": 2958 + }, + { + "epoch": 0.45760680456214964, + "grad_norm": 5.0028157234191895, + "learning_rate": 4.708156718982701e-06, + "logits/chosen": 7.638659477233887, + "logits/rejected": 9.501580238342285, + "logps/chosen": -193.59210205078125, + "logps/rejected": -217.2323760986328, + "loss": 0.7786, + "rewards/accuracies": 0.25, + "rewards/chosen": -0.15716448426246643, + "rewards/margins": -0.04010576009750366, + "rewards/rejected": -0.11705875396728516, + "step": 2959 + }, + { + "epoch": 0.4577614537019138, + "grad_norm": 5.470632553100586, + "learning_rate": 4.707870317333028e-06, + "logits/chosen": 8.95091724395752, + "logits/rejected": 8.36347484588623, + "logps/chosen": -276.8647155761719, + "logps/rejected": -222.46798706054688, + "loss": 0.7394, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.07018567621707916, + "rewards/margins": -0.04856785014271736, + "rewards/rejected": 0.11875350773334503, + "step": 2960 + }, + { + "epoch": 0.45791610284167794, + "grad_norm": 10.072470664978027, + "learning_rate": 4.707583915683355e-06, + "logits/chosen": 6.957911491394043, + "logits/rejected": 8.475503921508789, + "logps/chosen": -246.808837890625, + "logps/rejected": -275.784912109375, + "loss": 0.7485, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.049744416028261185, + "rewards/margins": -0.09210437536239624, + "rewards/rejected": 0.14184880256652832, + "step": 2961 + }, + { + "epoch": 0.4580707519814421, + "grad_norm": 5.873750686645508, + "learning_rate": 4.707297514033681e-06, + "logits/chosen": 8.11273193359375, + "logits/rejected": 11.537481307983398, + "logps/chosen": -180.97439575195312, + "logps/rejected": -252.358154296875, + "loss": 0.7115, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.13970740139484406, + "rewards/margins": 0.11542198061943054, + "rewards/rejected": 0.024285420775413513, + "step": 2962 + }, + { + "epoch": 0.45822540112120624, + "grad_norm": 5.016502857208252, + "learning_rate": 4.707011112384008e-06, + "logits/chosen": 9.926589012145996, + "logits/rejected": 8.350566864013672, + "logps/chosen": -256.6161804199219, + "logps/rejected": -244.02792358398438, + "loss": 0.5445, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.28623121976852417, + "rewards/margins": 0.3518240451812744, + "rewards/rejected": -0.06559281051158905, + "step": 2963 + }, + { + "epoch": 0.45838005026097045, + "grad_norm": 6.145665168762207, + "learning_rate": 4.706724710734335e-06, + "logits/chosen": 9.292642593383789, + "logits/rejected": 7.054403305053711, + "logps/chosen": -302.0745849609375, + "logps/rejected": -288.5801086425781, + "loss": 0.5826, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.17600078880786896, + "rewards/margins": 0.39004552364349365, + "rewards/rejected": -0.2140447497367859, + "step": 2964 + }, + { + "epoch": 0.4585346994007346, + "grad_norm": 5.751436233520508, + "learning_rate": 4.70643830908466e-06, + "logits/chosen": 11.112475395202637, + "logits/rejected": 10.395916938781738, + "logps/chosen": -268.319091796875, + "logps/rejected": -253.51795959472656, + "loss": 0.7416, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.17272084951400757, + "rewards/margins": 0.050621889531612396, + "rewards/rejected": 0.12209896743297577, + "step": 2965 + }, + { + "epoch": 0.45868934854049875, + "grad_norm": 4.8773040771484375, + "learning_rate": 4.706151907434987e-06, + "logits/chosen": 9.408148765563965, + "logits/rejected": 5.81047248840332, + "logps/chosen": -355.807861328125, + "logps/rejected": -242.07968139648438, + "loss": 0.543, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.3769363462924957, + "rewards/margins": 0.4655241072177887, + "rewards/rejected": -0.08858775347471237, + "step": 2966 + }, + { + "epoch": 0.4588439976802629, + "grad_norm": 6.105955600738525, + "learning_rate": 4.705865505785314e-06, + "logits/chosen": 7.414273262023926, + "logits/rejected": 4.827837944030762, + "logps/chosen": -419.0539245605469, + "logps/rejected": -320.00335693359375, + "loss": 0.519, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.37135595083236694, + "rewards/margins": 0.4644581079483032, + "rewards/rejected": -0.09310220181941986, + "step": 2967 + }, + { + "epoch": 0.45899864682002706, + "grad_norm": 6.328551292419434, + "learning_rate": 4.70557910413564e-06, + "logits/chosen": 7.7021942138671875, + "logits/rejected": 6.684210777282715, + "logps/chosen": -291.421142578125, + "logps/rejected": -328.996337890625, + "loss": 0.678, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.04359845817089081, + "rewards/margins": 0.07960616052150726, + "rewards/rejected": -0.03600768744945526, + "step": 2968 + }, + { + "epoch": 0.4591532959597912, + "grad_norm": 4.896423816680908, + "learning_rate": 4.705292702485967e-06, + "logits/chosen": 11.680112838745117, + "logits/rejected": 6.156183242797852, + "logps/chosen": -274.7214050292969, + "logps/rejected": -170.7608642578125, + "loss": 0.5834, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.22536210715770721, + "rewards/margins": 0.2836277484893799, + "rewards/rejected": -0.05826563760638237, + "step": 2969 + }, + { + "epoch": 0.45930794509955536, + "grad_norm": 6.469099998474121, + "learning_rate": 4.705006300836294e-06, + "logits/chosen": 10.331438064575195, + "logits/rejected": 8.041559219360352, + "logps/chosen": -336.5439453125, + "logps/rejected": -307.2086486816406, + "loss": 0.644, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.03885392099618912, + "rewards/margins": 0.17932654917240143, + "rewards/rejected": -0.1404726207256317, + "step": 2970 + }, + { + "epoch": 0.45946259423931957, + "grad_norm": 5.671962261199951, + "learning_rate": 4.70471989918662e-06, + "logits/chosen": 10.851548194885254, + "logits/rejected": 11.975399017333984, + "logps/chosen": -193.5721893310547, + "logps/rejected": -231.0876922607422, + "loss": 0.7137, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.0047724246978759766, + "rewards/margins": -0.009501226246356964, + "rewards/rejected": 0.014273647218942642, + "step": 2971 + }, + { + "epoch": 0.4596172433790837, + "grad_norm": 5.729818820953369, + "learning_rate": 4.704433497536946e-06, + "logits/chosen": 8.65424919128418, + "logits/rejected": 6.078248977661133, + "logps/chosen": -308.5588684082031, + "logps/rejected": -304.5982971191406, + "loss": 0.5868, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.1789310723543167, + "rewards/margins": 0.34274667501449585, + "rewards/rejected": -0.16381561756134033, + "step": 2972 + }, + { + "epoch": 0.45977189251884787, + "grad_norm": 9.466398239135742, + "learning_rate": 4.704147095887273e-06, + "logits/chosen": 15.809646606445312, + "logits/rejected": 12.176141738891602, + "logps/chosen": -345.842529296875, + "logps/rejected": -261.44024658203125, + "loss": 0.6041, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.07272882759571075, + "rewards/margins": 0.2708126902580261, + "rewards/rejected": -0.19808383285999298, + "step": 2973 + }, + { + "epoch": 0.459926541658612, + "grad_norm": 4.588893413543701, + "learning_rate": 4.703860694237599e-06, + "logits/chosen": 5.180237293243408, + "logits/rejected": 1.8059334754943848, + "logps/chosen": -415.674560546875, + "logps/rejected": -228.9871063232422, + "loss": 0.5759, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.42974549531936646, + "rewards/margins": 0.3696592450141907, + "rewards/rejected": 0.060086242854595184, + "step": 2974 + }, + { + "epoch": 0.46008119079837617, + "grad_norm": 4.644771575927734, + "learning_rate": 4.703574292587926e-06, + "logits/chosen": 6.418425559997559, + "logits/rejected": 4.221429347991943, + "logps/chosen": -132.37106323242188, + "logps/rejected": -136.19680786132812, + "loss": 0.6838, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.03858347237110138, + "rewards/margins": 0.09676718711853027, + "rewards/rejected": -0.058183714747428894, + "step": 2975 + }, + { + "epoch": 0.4602358399381403, + "grad_norm": 6.267160415649414, + "learning_rate": 4.703287890938253e-06, + "logits/chosen": 5.934901237487793, + "logits/rejected": 9.727717399597168, + "logps/chosen": -217.17721557617188, + "logps/rejected": -239.1723175048828, + "loss": 0.7471, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.0834803581237793, + "rewards/margins": 0.008295901119709015, + "rewards/rejected": -0.0917762815952301, + "step": 2976 + }, + { + "epoch": 0.46039048907790453, + "grad_norm": 3.273151159286499, + "learning_rate": 4.7030014892885785e-06, + "logits/chosen": 13.827066421508789, + "logits/rejected": 7.5672807693481445, + "logps/chosen": -153.92709350585938, + "logps/rejected": -124.07454681396484, + "loss": 0.5005, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.3807571530342102, + "rewards/margins": 0.49577221274375916, + "rewards/rejected": -0.11501505970954895, + "step": 2977 + }, + { + "epoch": 0.4605451382176687, + "grad_norm": 5.983728408813477, + "learning_rate": 4.702715087638905e-06, + "logits/chosen": 13.610491752624512, + "logits/rejected": 7.699213981628418, + "logps/chosen": -319.1959228515625, + "logps/rejected": -244.608642578125, + "loss": 0.6476, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.4514175057411194, + "rewards/margins": 0.2528921067714691, + "rewards/rejected": 0.19852539896965027, + "step": 2978 + }, + { + "epoch": 0.46069978735743283, + "grad_norm": 5.342053413391113, + "learning_rate": 4.702428685989232e-06, + "logits/chosen": 11.79953384399414, + "logits/rejected": 7.764245986938477, + "logps/chosen": -305.5440979003906, + "logps/rejected": -251.59214782714844, + "loss": 0.6267, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.1859557330608368, + "rewards/margins": 0.2612136900424957, + "rewards/rejected": -0.07525796443223953, + "step": 2979 + }, + { + "epoch": 0.460854436497197, + "grad_norm": 5.773763656616211, + "learning_rate": 4.7021422843395585e-06, + "logits/chosen": 12.285804748535156, + "logits/rejected": 12.791536331176758, + "logps/chosen": -244.81736755371094, + "logps/rejected": -237.22161865234375, + "loss": 0.7447, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.004154682159423828, + "rewards/margins": 0.12794196605682373, + "rewards/rejected": -0.12378731369972229, + "step": 2980 + }, + { + "epoch": 0.46100908563696114, + "grad_norm": 8.02035140991211, + "learning_rate": 4.701855882689884e-06, + "logits/chosen": 12.456417083740234, + "logits/rejected": 7.1342315673828125, + "logps/chosen": -371.58917236328125, + "logps/rejected": -272.7286376953125, + "loss": 0.7906, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.19836415350437164, + "rewards/margins": -0.05249124765396118, + "rewards/rejected": -0.14587293565273285, + "step": 2981 + }, + { + "epoch": 0.4611637347767253, + "grad_norm": 6.104375839233398, + "learning_rate": 4.701569481040211e-06, + "logits/chosen": 11.011655807495117, + "logits/rejected": 10.13815689086914, + "logps/chosen": -356.41497802734375, + "logps/rejected": -345.7293701171875, + "loss": 0.648, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.31433993577957153, + "rewards/margins": 0.15907107293605804, + "rewards/rejected": 0.1552688628435135, + "step": 2982 + }, + { + "epoch": 0.46131838391648944, + "grad_norm": 8.492865562438965, + "learning_rate": 4.701283079390538e-06, + "logits/chosen": 14.472335815429688, + "logits/rejected": 8.111994743347168, + "logps/chosen": -416.2086181640625, + "logps/rejected": -369.51458740234375, + "loss": 0.7069, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.1842903196811676, + "rewards/margins": 0.04865151643753052, + "rewards/rejected": 0.13563883304595947, + "step": 2983 + }, + { + "epoch": 0.46147303305625365, + "grad_norm": 4.215407371520996, + "learning_rate": 4.700996677740864e-06, + "logits/chosen": 11.113818168640137, + "logits/rejected": 9.38661003112793, + "logps/chosen": -231.343505859375, + "logps/rejected": -197.98866271972656, + "loss": 0.6674, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.08780211210250854, + "rewards/margins": 0.08906412124633789, + "rewards/rejected": -0.0012619979679584503, + "step": 2984 + }, + { + "epoch": 0.4616276821960178, + "grad_norm": 8.360186576843262, + "learning_rate": 4.700710276091191e-06, + "logits/chosen": 11.55423641204834, + "logits/rejected": 7.486073017120361, + "logps/chosen": -370.11126708984375, + "logps/rejected": -306.4674987792969, + "loss": 0.4897, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.07265090942382812, + "rewards/margins": 0.5572525262832642, + "rewards/rejected": -0.6299034357070923, + "step": 2985 + }, + { + "epoch": 0.46178233133578195, + "grad_norm": 5.565553665161133, + "learning_rate": 4.700423874441517e-06, + "logits/chosen": 18.98154067993164, + "logits/rejected": 10.315452575683594, + "logps/chosen": -302.1778259277344, + "logps/rejected": -257.02337646484375, + "loss": 0.6857, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.035533711314201355, + "rewards/margins": 0.07154185324907303, + "rewards/rejected": -0.10707554966211319, + "step": 2986 + }, + { + "epoch": 0.4619369804755461, + "grad_norm": 7.741940498352051, + "learning_rate": 4.700137472791843e-06, + "logits/chosen": 7.266263961791992, + "logits/rejected": 6.052780628204346, + "logps/chosen": -197.5580291748047, + "logps/rejected": -155.3468017578125, + "loss": 0.8848, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.10885835438966751, + "rewards/margins": -0.2579139471054077, + "rewards/rejected": 0.36677226424217224, + "step": 2987 + }, + { + "epoch": 0.46209162961531025, + "grad_norm": 5.416033744812012, + "learning_rate": 4.69985107114217e-06, + "logits/chosen": 9.60762882232666, + "logits/rejected": 8.453782081604004, + "logps/chosen": -237.88233947753906, + "logps/rejected": -262.14422607421875, + "loss": 0.6655, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.1277083456516266, + "rewards/margins": 0.12136536836624146, + "rewards/rejected": 0.006342977285385132, + "step": 2988 + }, + { + "epoch": 0.4622462787550744, + "grad_norm": 4.104170322418213, + "learning_rate": 4.699564669492497e-06, + "logits/chosen": 10.760554313659668, + "logits/rejected": 4.247102737426758, + "logps/chosen": -253.0543212890625, + "logps/rejected": -199.67893981933594, + "loss": 0.5376, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.12506359815597534, + "rewards/margins": 0.3998783528804779, + "rewards/rejected": -0.2748147249221802, + "step": 2989 + }, + { + "epoch": 0.4624009278948386, + "grad_norm": 7.315491199493408, + "learning_rate": 4.699278267842823e-06, + "logits/chosen": 4.25111198425293, + "logits/rejected": 3.1076107025146484, + "logps/chosen": -207.8371124267578, + "logps/rejected": -184.70236206054688, + "loss": 0.5232, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.5599367618560791, + "rewards/margins": 0.49828776717185974, + "rewards/rejected": 0.06164901703596115, + "step": 2990 + }, + { + "epoch": 0.46255557703460276, + "grad_norm": 6.039941310882568, + "learning_rate": 4.698991866193149e-06, + "logits/chosen": 14.211645126342773, + "logits/rejected": 9.0758638381958, + "logps/chosen": -402.0782470703125, + "logps/rejected": -302.526123046875, + "loss": 0.6627, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.14402027428150177, + "rewards/margins": 0.11809063702821732, + "rewards/rejected": 0.02592964470386505, + "step": 2991 + }, + { + "epoch": 0.4627102261743669, + "grad_norm": 9.251330375671387, + "learning_rate": 4.698705464543476e-06, + "logits/chosen": 8.933084487915039, + "logits/rejected": 7.97614049911499, + "logps/chosen": -380.0074768066406, + "logps/rejected": -392.1715393066406, + "loss": 0.8386, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.03924284875392914, + "rewards/margins": -0.2019944190979004, + "rewards/rejected": 0.16275157034397125, + "step": 2992 + }, + { + "epoch": 0.46286487531413106, + "grad_norm": 5.163235664367676, + "learning_rate": 4.698419062893802e-06, + "logits/chosen": 12.297382354736328, + "logits/rejected": 7.0777788162231445, + "logps/chosen": -309.34100341796875, + "logps/rejected": -221.50167846679688, + "loss": 0.5401, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.46429651975631714, + "rewards/margins": 0.3811018168926239, + "rewards/rejected": 0.08319471776485443, + "step": 2993 + }, + { + "epoch": 0.4630195244538952, + "grad_norm": 4.26530122756958, + "learning_rate": 4.698132661244129e-06, + "logits/chosen": 16.082658767700195, + "logits/rejected": 12.471482276916504, + "logps/chosen": -221.74954223632812, + "logps/rejected": -208.36878967285156, + "loss": 0.5515, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.4126954674720764, + "rewards/margins": 0.3598061501979828, + "rewards/rejected": 0.05288929119706154, + "step": 2994 + }, + { + "epoch": 0.46317417359365937, + "grad_norm": 7.143664836883545, + "learning_rate": 4.697846259594456e-06, + "logits/chosen": 10.269268989562988, + "logits/rejected": 4.97338342666626, + "logps/chosen": -290.7821044921875, + "logps/rejected": -291.57269287109375, + "loss": 0.7233, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.0773792564868927, + "rewards/margins": -0.0046173036098480225, + "rewards/rejected": -0.07276192307472229, + "step": 2995 + }, + { + "epoch": 0.4633288227334235, + "grad_norm": 3.841068744659424, + "learning_rate": 4.697559857944782e-06, + "logits/chosen": 11.763745307922363, + "logits/rejected": 10.870201110839844, + "logps/chosen": -222.72222900390625, + "logps/rejected": -204.68389892578125, + "loss": 0.4981, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.1590469777584076, + "rewards/margins": 0.48615387082099915, + "rewards/rejected": -0.32710689306259155, + "step": 2996 + }, + { + "epoch": 0.4634834718731877, + "grad_norm": 5.703514099121094, + "learning_rate": 4.697273456295109e-06, + "logits/chosen": 11.960027694702148, + "logits/rejected": 6.618836402893066, + "logps/chosen": -256.2616271972656, + "logps/rejected": -279.80126953125, + "loss": 0.638, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.3547555208206177, + "rewards/margins": 0.3694687783718109, + "rewards/rejected": -0.014713302254676819, + "step": 2997 + }, + { + "epoch": 0.4636381210129519, + "grad_norm": 4.851461887359619, + "learning_rate": 4.696987054645435e-06, + "logits/chosen": 8.914794921875, + "logits/rejected": 8.558012008666992, + "logps/chosen": -239.77220153808594, + "logps/rejected": -263.8995361328125, + "loss": 0.6449, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.26013949513435364, + "rewards/margins": 0.17149752378463745, + "rewards/rejected": 0.08864197880029678, + "step": 2998 + }, + { + "epoch": 0.463792770152716, + "grad_norm": 3.604166269302368, + "learning_rate": 4.6967006529957614e-06, + "logits/chosen": 11.806922912597656, + "logits/rejected": 11.061351776123047, + "logps/chosen": -198.9353485107422, + "logps/rejected": -139.0316619873047, + "loss": 0.5342, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.14822879433631897, + "rewards/margins": 0.40380263328552246, + "rewards/rejected": -0.2555738389492035, + "step": 2999 + }, + { + "epoch": 0.4639474192924802, + "grad_norm": 5.562335968017578, + "learning_rate": 4.696414251346088e-06, + "logits/chosen": 5.255735874176025, + "logits/rejected": 9.8038330078125, + "logps/chosen": -140.96401977539062, + "logps/rejected": -195.1973876953125, + "loss": 0.9344, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.49991849064826965, + "rewards/margins": -0.3037942349910736, + "rewards/rejected": -0.19612424075603485, + "step": 3000 + }, + { + "epoch": 0.46410206843224433, + "grad_norm": 4.986563205718994, + "learning_rate": 4.696127849696415e-06, + "logits/chosen": 10.662957191467285, + "logits/rejected": 8.625347137451172, + "logps/chosen": -311.1173400878906, + "logps/rejected": -277.9862060546875, + "loss": 0.5527, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.4443734288215637, + "rewards/margins": 0.40489834547042847, + "rewards/rejected": 0.03947506099939346, + "step": 3001 + }, + { + "epoch": 0.4642567175720085, + "grad_norm": 4.071439266204834, + "learning_rate": 4.695841448046741e-06, + "logits/chosen": 12.221359252929688, + "logits/rejected": 1.933159351348877, + "logps/chosen": -331.10601806640625, + "logps/rejected": -141.78421020507812, + "loss": 0.6015, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.13682307302951813, + "rewards/margins": 0.2776743173599243, + "rewards/rejected": -0.14085125923156738, + "step": 3002 + }, + { + "epoch": 0.4644113667117727, + "grad_norm": 6.970469951629639, + "learning_rate": 4.695555046397068e-06, + "logits/chosen": -0.08822989463806152, + "logits/rejected": 4.778539657592773, + "logps/chosen": -205.90802001953125, + "logps/rejected": -223.74896240234375, + "loss": 0.9828, + "rewards/accuracies": 0.25, + "rewards/chosen": -0.19687604904174805, + "rewards/margins": -0.3774961531162262, + "rewards/rejected": 0.18062010407447815, + "step": 3003 + }, + { + "epoch": 0.46456601585153684, + "grad_norm": 3.9793319702148438, + "learning_rate": 4.695268644747394e-06, + "logits/chosen": 10.422563552856445, + "logits/rejected": 3.5683414936065674, + "logps/chosen": -317.6117858886719, + "logps/rejected": -221.690673828125, + "loss": 0.5104, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.3746354281902313, + "rewards/margins": 0.5272427797317505, + "rewards/rejected": -0.15260739624500275, + "step": 3004 + }, + { + "epoch": 0.464720664991301, + "grad_norm": 7.266453266143799, + "learning_rate": 4.6949822430977205e-06, + "logits/chosen": 8.791247367858887, + "logits/rejected": 9.776388168334961, + "logps/chosen": -283.5511474609375, + "logps/rejected": -285.2673034667969, + "loss": 0.7696, + "rewards/accuracies": 0.25, + "rewards/chosen": 0.14489275217056274, + "rewards/margins": -0.13763217628002167, + "rewards/rejected": 0.2825249135494232, + "step": 3005 + }, + { + "epoch": 0.46487531413106514, + "grad_norm": 5.865922927856445, + "learning_rate": 4.694695841448047e-06, + "logits/chosen": 9.802221298217773, + "logits/rejected": 10.44054889678955, + "logps/chosen": -250.34982299804688, + "logps/rejected": -335.50982666015625, + "loss": 0.6637, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.018009856343269348, + "rewards/margins": 0.12503968179225922, + "rewards/rejected": -0.14304950833320618, + "step": 3006 + }, + { + "epoch": 0.4650299632708293, + "grad_norm": 5.864901542663574, + "learning_rate": 4.694409439798374e-06, + "logits/chosen": 10.353107452392578, + "logits/rejected": 10.873077392578125, + "logps/chosen": -276.3033752441406, + "logps/rejected": -278.56768798828125, + "loss": 0.7559, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.2895589768886566, + "rewards/margins": -0.09286236017942429, + "rewards/rejected": 0.3824213743209839, + "step": 3007 + }, + { + "epoch": 0.46518461241059345, + "grad_norm": 6.005579471588135, + "learning_rate": 4.6941230381487005e-06, + "logits/chosen": 14.687593460083008, + "logits/rejected": 9.673669815063477, + "logps/chosen": -321.59295654296875, + "logps/rejected": -254.62124633789062, + "loss": 0.6107, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.25148218870162964, + "rewards/margins": 0.27360886335372925, + "rewards/rejected": -0.022126667201519012, + "step": 3008 + }, + { + "epoch": 0.46533926155035765, + "grad_norm": 5.281496524810791, + "learning_rate": 4.693836636499027e-06, + "logits/chosen": 18.5379695892334, + "logits/rejected": 7.6601996421813965, + "logps/chosen": -305.9548034667969, + "logps/rejected": -192.962158203125, + "loss": 0.6505, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.1769767850637436, + "rewards/margins": 0.16448505222797394, + "rewards/rejected": 0.012491695582866669, + "step": 3009 + }, + { + "epoch": 0.4654939106901218, + "grad_norm": 5.080384731292725, + "learning_rate": 4.693550234849354e-06, + "logits/chosen": 9.558121681213379, + "logits/rejected": 7.163371562957764, + "logps/chosen": -228.67665100097656, + "logps/rejected": -184.52972412109375, + "loss": 0.6783, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.05075579881668091, + "rewards/margins": 0.0759708434343338, + "rewards/rejected": -0.02521505393087864, + "step": 3010 + }, + { + "epoch": 0.46564855982988596, + "grad_norm": 11.415779113769531, + "learning_rate": 4.6932638331996796e-06, + "logits/chosen": 13.355599403381348, + "logits/rejected": 8.992387771606445, + "logps/chosen": -350.96917724609375, + "logps/rejected": -305.41729736328125, + "loss": 0.7469, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.23132705688476562, + "rewards/margins": 0.04031284525990486, + "rewards/rejected": 0.19101420044898987, + "step": 3011 + }, + { + "epoch": 0.4658032089696501, + "grad_norm": 6.177041530609131, + "learning_rate": 4.692977431550006e-06, + "logits/chosen": 7.324940204620361, + "logits/rejected": 6.873721599578857, + "logps/chosen": -333.996337890625, + "logps/rejected": -265.8457336425781, + "loss": 0.6312, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.06832566857337952, + "rewards/margins": 0.1922735720872879, + "rewards/rejected": -0.12394791096448898, + "step": 3012 + }, + { + "epoch": 0.46595785810941426, + "grad_norm": 5.809593200683594, + "learning_rate": 4.692691029900333e-06, + "logits/chosen": 8.676753997802734, + "logits/rejected": 6.138293266296387, + "logps/chosen": -261.811279296875, + "logps/rejected": -249.48580932617188, + "loss": 0.6832, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.12032841891050339, + "rewards/margins": 0.08426617830991745, + "rewards/rejected": -0.20459461212158203, + "step": 3013 + }, + { + "epoch": 0.4661125072491784, + "grad_norm": 3.7026448249816895, + "learning_rate": 4.6924046282506595e-06, + "logits/chosen": 8.78897762298584, + "logits/rejected": 7.335955619812012, + "logps/chosen": -135.98748779296875, + "logps/rejected": -135.40573120117188, + "loss": 0.6172, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.013244912028312683, + "rewards/margins": 0.21129214763641357, + "rewards/rejected": -0.22453705966472626, + "step": 3014 + }, + { + "epoch": 0.46626715638894256, + "grad_norm": 5.66171932220459, + "learning_rate": 4.692118226600985e-06, + "logits/chosen": 11.891119003295898, + "logits/rejected": 8.937435150146484, + "logps/chosen": -323.4237060546875, + "logps/rejected": -344.78924560546875, + "loss": 0.6464, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.05019301176071167, + "rewards/margins": 0.135787695646286, + "rewards/rejected": -0.08559465408325195, + "step": 3015 + }, + { + "epoch": 0.46642180552870677, + "grad_norm": 5.276484489440918, + "learning_rate": 4.691831824951312e-06, + "logits/chosen": 14.987350463867188, + "logits/rejected": -1.9259618520736694, + "logps/chosen": -478.56903076171875, + "logps/rejected": -213.12319946289062, + "loss": 0.413, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.5866913199424744, + "rewards/margins": 0.9898380637168884, + "rewards/rejected": -0.40314677357673645, + "step": 3016 + }, + { + "epoch": 0.4665764546684709, + "grad_norm": 6.515838623046875, + "learning_rate": 4.691545423301639e-06, + "logits/chosen": 11.047337532043457, + "logits/rejected": 5.699032306671143, + "logps/chosen": -327.6561279296875, + "logps/rejected": -249.29318237304688, + "loss": 0.5543, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.47137489914894104, + "rewards/margins": 0.3869286775588989, + "rewards/rejected": 0.08444623649120331, + "step": 3017 + }, + { + "epoch": 0.46673110380823507, + "grad_norm": 7.234194278717041, + "learning_rate": 4.691259021651965e-06, + "logits/chosen": 13.772784233093262, + "logits/rejected": 10.25424575805664, + "logps/chosen": -281.24603271484375, + "logps/rejected": -284.25775146484375, + "loss": 0.7704, + "rewards/accuracies": 0.25, + "rewards/chosen": 0.11060438305139542, + "rewards/margins": -0.09794869273900986, + "rewards/rejected": 0.20855306088924408, + "step": 3018 + }, + { + "epoch": 0.4668857529479992, + "grad_norm": 4.7292656898498535, + "learning_rate": 4.690972620002291e-06, + "logits/chosen": 9.881389617919922, + "logits/rejected": 6.236710548400879, + "logps/chosen": -348.5396728515625, + "logps/rejected": -274.4770202636719, + "loss": 0.5974, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.4725329577922821, + "rewards/margins": 0.28675374388694763, + "rewards/rejected": 0.18577921390533447, + "step": 3019 + }, + { + "epoch": 0.4670404020877634, + "grad_norm": 7.421738147735596, + "learning_rate": 4.690686218352618e-06, + "logits/chosen": 8.122907638549805, + "logits/rejected": 4.668972969055176, + "logps/chosen": -331.0155029296875, + "logps/rejected": -245.2442626953125, + "loss": 0.793, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.3071400821208954, + "rewards/margins": -0.0647820383310318, + "rewards/rejected": -0.24235805869102478, + "step": 3020 + }, + { + "epoch": 0.4671950512275275, + "grad_norm": 7.01359748840332, + "learning_rate": 4.690399816702944e-06, + "logits/chosen": 12.085275650024414, + "logits/rejected": 8.904709815979004, + "logps/chosen": -352.9685974121094, + "logps/rejected": -317.2801208496094, + "loss": 0.6991, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.38205835223197937, + "rewards/margins": 0.054469138383865356, + "rewards/rejected": 0.327589213848114, + "step": 3021 + }, + { + "epoch": 0.46734970036729173, + "grad_norm": 4.579090595245361, + "learning_rate": 4.690113415053271e-06, + "logits/chosen": 12.5736722946167, + "logits/rejected": 5.3734450340271, + "logps/chosen": -318.8518371582031, + "logps/rejected": -185.54391479492188, + "loss": 0.603, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.2514013350009918, + "rewards/margins": 0.23206956684589386, + "rewards/rejected": 0.019331790506839752, + "step": 3022 + }, + { + "epoch": 0.4675043495070559, + "grad_norm": 7.670943260192871, + "learning_rate": 4.689827013403598e-06, + "logits/chosen": 14.403982162475586, + "logits/rejected": 13.06241512298584, + "logps/chosen": -388.9197998046875, + "logps/rejected": -356.6515808105469, + "loss": 0.8136, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.25854986906051636, + "rewards/margins": -0.17356276512145996, + "rewards/rejected": 0.4321126341819763, + "step": 3023 + }, + { + "epoch": 0.46765899864682003, + "grad_norm": 11.595117568969727, + "learning_rate": 4.6895406117539235e-06, + "logits/chosen": 4.267294883728027, + "logits/rejected": 4.490293979644775, + "logps/chosen": -425.6158447265625, + "logps/rejected": -370.4498291015625, + "loss": 0.7636, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.424464613199234, + "rewards/margins": -0.048869915306568146, + "rewards/rejected": 0.47333452105522156, + "step": 3024 + }, + { + "epoch": 0.4678136477865842, + "grad_norm": 4.990597248077393, + "learning_rate": 4.68925421010425e-06, + "logits/chosen": 10.850885391235352, + "logits/rejected": 6.21992826461792, + "logps/chosen": -383.0733642578125, + "logps/rejected": -230.14834594726562, + "loss": 0.639, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.24378754198551178, + "rewards/margins": 0.22680333256721497, + "rewards/rejected": 0.01698426529765129, + "step": 3025 + }, + { + "epoch": 0.46796829692634834, + "grad_norm": 6.668668270111084, + "learning_rate": 4.688967808454577e-06, + "logits/chosen": 8.512245178222656, + "logits/rejected": 6.942404747009277, + "logps/chosen": -318.826904296875, + "logps/rejected": -267.2339782714844, + "loss": 0.8185, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.19500266015529633, + "rewards/margins": -0.12995214760303497, + "rewards/rejected": 0.3249548077583313, + "step": 3026 + }, + { + "epoch": 0.4681229460661125, + "grad_norm": 4.147992134094238, + "learning_rate": 4.6886814068049034e-06, + "logits/chosen": 11.03657054901123, + "logits/rejected": 7.150003910064697, + "logps/chosen": -236.04449462890625, + "logps/rejected": -226.43711853027344, + "loss": 0.5467, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.07402047514915466, + "rewards/margins": 0.44944873452186584, + "rewards/rejected": -0.3754281997680664, + "step": 3027 + }, + { + "epoch": 0.46827759520587664, + "grad_norm": 7.048837184906006, + "learning_rate": 4.68839500515523e-06, + "logits/chosen": 10.253664016723633, + "logits/rejected": 11.200864791870117, + "logps/chosen": -248.10501098632812, + "logps/rejected": -248.26441955566406, + "loss": 0.8741, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.024630114436149597, + "rewards/margins": -0.2512737810611725, + "rewards/rejected": 0.2266436666250229, + "step": 3028 + }, + { + "epoch": 0.46843224434564085, + "grad_norm": 5.828037738800049, + "learning_rate": 4.688108603505557e-06, + "logits/chosen": 13.932458877563477, + "logits/rejected": 16.701068878173828, + "logps/chosen": -376.8634338378906, + "logps/rejected": -373.6258239746094, + "loss": 0.7036, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.4477007985115051, + "rewards/margins": -0.00186949223279953, + "rewards/rejected": 0.44957026839256287, + "step": 3029 + }, + { + "epoch": 0.468586893485405, + "grad_norm": 4.968430519104004, + "learning_rate": 4.687822201855883e-06, + "logits/chosen": 6.363219738006592, + "logits/rejected": 6.011447906494141, + "logps/chosen": -275.84844970703125, + "logps/rejected": -273.5904541015625, + "loss": 0.6512, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.12790071964263916, + "rewards/margins": 0.09668491035699844, + "rewards/rejected": 0.031215816736221313, + "step": 3030 + }, + { + "epoch": 0.46874154262516915, + "grad_norm": 4.819473743438721, + "learning_rate": 4.687535800206209e-06, + "logits/chosen": 12.618279457092285, + "logits/rejected": 6.812953472137451, + "logps/chosen": -247.2542724609375, + "logps/rejected": -192.78634643554688, + "loss": 0.6133, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.14485806226730347, + "rewards/margins": 0.21640536189079285, + "rewards/rejected": -0.07154726982116699, + "step": 3031 + }, + { + "epoch": 0.4688961917649333, + "grad_norm": 5.328410625457764, + "learning_rate": 4.687249398556536e-06, + "logits/chosen": 9.071479797363281, + "logits/rejected": 9.877046585083008, + "logps/chosen": -262.59552001953125, + "logps/rejected": -262.7901916503906, + "loss": 0.745, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.10335926711559296, + "rewards/margins": -0.009878858923912048, + "rewards/rejected": 0.1132381483912468, + "step": 3032 + }, + { + "epoch": 0.46905084090469745, + "grad_norm": 4.913109302520752, + "learning_rate": 4.6869629969068625e-06, + "logits/chosen": 11.790393829345703, + "logits/rejected": 11.277243614196777, + "logps/chosen": -250.356201171875, + "logps/rejected": -268.9931945800781, + "loss": 0.4965, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.13035041093826294, + "rewards/margins": 0.6155429482460022, + "rewards/rejected": -0.48519250750541687, + "step": 3033 + }, + { + "epoch": 0.4692054900444616, + "grad_norm": 5.327846527099609, + "learning_rate": 4.686676595257189e-06, + "logits/chosen": 7.431042671203613, + "logits/rejected": 5.577322959899902, + "logps/chosen": -286.6590881347656, + "logps/rejected": -267.8074645996094, + "loss": 0.6545, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.29984867572784424, + "rewards/margins": 0.13244876265525818, + "rewards/rejected": 0.16739995777606964, + "step": 3034 + }, + { + "epoch": 0.4693601391842258, + "grad_norm": 7.3163018226623535, + "learning_rate": 4.686390193607516e-06, + "logits/chosen": 10.322932243347168, + "logits/rejected": 13.13760757446289, + "logps/chosen": -274.2729797363281, + "logps/rejected": -312.0372009277344, + "loss": 0.7162, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.34363099932670593, + "rewards/margins": 0.14714661240577698, + "rewards/rejected": 0.19648438692092896, + "step": 3035 + }, + { + "epoch": 0.46951478832398996, + "grad_norm": 4.8995361328125, + "learning_rate": 4.6861037919578424e-06, + "logits/chosen": 10.715742111206055, + "logits/rejected": 12.697279930114746, + "logps/chosen": -138.29400634765625, + "logps/rejected": -190.53488159179688, + "loss": 0.7841, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.18836352229118347, + "rewards/margins": -0.018577218055725098, + "rewards/rejected": -0.1697862595319748, + "step": 3036 + }, + { + "epoch": 0.4696694374637541, + "grad_norm": 5.382208824157715, + "learning_rate": 4.685817390308168e-06, + "logits/chosen": 11.535953521728516, + "logits/rejected": 9.934057235717773, + "logps/chosen": -234.19854736328125, + "logps/rejected": -294.6527099609375, + "loss": 0.715, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.07564811408519745, + "rewards/margins": 0.17139311134815216, + "rewards/rejected": -0.09574498236179352, + "step": 3037 + }, + { + "epoch": 0.46982408660351827, + "grad_norm": 4.8675618171691895, + "learning_rate": 4.685530988658495e-06, + "logits/chosen": 10.951421737670898, + "logits/rejected": 8.434406280517578, + "logps/chosen": -255.77389526367188, + "logps/rejected": -174.0486297607422, + "loss": 0.6993, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.3492065370082855, + "rewards/margins": 0.11506514251232147, + "rewards/rejected": 0.23414140939712524, + "step": 3038 + }, + { + "epoch": 0.4699787357432824, + "grad_norm": 5.73569917678833, + "learning_rate": 4.6852445870088215e-06, + "logits/chosen": 13.290253639221191, + "logits/rejected": 7.9289984703063965, + "logps/chosen": -452.88470458984375, + "logps/rejected": -278.16680908203125, + "loss": 0.4814, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.6856586933135986, + "rewards/margins": 0.6601459980010986, + "rewards/rejected": 0.025512687861919403, + "step": 3039 + }, + { + "epoch": 0.47013338488304657, + "grad_norm": 8.164809226989746, + "learning_rate": 4.684958185359148e-06, + "logits/chosen": 9.7992582321167, + "logits/rejected": 5.111950874328613, + "logps/chosen": -339.24969482421875, + "logps/rejected": -269.45684814453125, + "loss": 0.6226, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.4777964651584625, + "rewards/margins": 0.29934245347976685, + "rewards/rejected": 0.17845401167869568, + "step": 3040 + }, + { + "epoch": 0.4702880340228108, + "grad_norm": 9.046942710876465, + "learning_rate": 4.684671783709475e-06, + "logits/chosen": 4.686356067657471, + "logits/rejected": 6.7059760093688965, + "logps/chosen": -340.31695556640625, + "logps/rejected": -312.0832214355469, + "loss": 0.7959, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.43454858660697937, + "rewards/margins": -0.13616794347763062, + "rewards/rejected": 0.5707165002822876, + "step": 3041 + }, + { + "epoch": 0.4704426831625749, + "grad_norm": 6.626064777374268, + "learning_rate": 4.6843853820598015e-06, + "logits/chosen": 11.066731452941895, + "logits/rejected": 6.454031944274902, + "logps/chosen": -315.42852783203125, + "logps/rejected": -274.3255615234375, + "loss": 0.5617, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.6192066669464111, + "rewards/margins": 0.5136699080467224, + "rewards/rejected": 0.10553675144910812, + "step": 3042 + }, + { + "epoch": 0.4705973323023391, + "grad_norm": 4.641066074371338, + "learning_rate": 4.684098980410128e-06, + "logits/chosen": 11.110712051391602, + "logits/rejected": 7.017316818237305, + "logps/chosen": -262.4399719238281, + "logps/rejected": -189.96127319335938, + "loss": 0.6281, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.22620946168899536, + "rewards/margins": 0.19253702461719513, + "rewards/rejected": 0.03367242217063904, + "step": 3043 + }, + { + "epoch": 0.47075198144210323, + "grad_norm": 4.517951965332031, + "learning_rate": 4.683812578760454e-06, + "logits/chosen": 9.379972457885742, + "logits/rejected": 7.071934700012207, + "logps/chosen": -318.04022216796875, + "logps/rejected": -336.4866943359375, + "loss": 0.5734, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.5838018655776978, + "rewards/margins": 0.3498693108558655, + "rewards/rejected": 0.23393258452415466, + "step": 3044 + }, + { + "epoch": 0.4709066305818674, + "grad_norm": 4.883569240570068, + "learning_rate": 4.683526177110781e-06, + "logits/chosen": 13.97895622253418, + "logits/rejected": 7.639683723449707, + "logps/chosen": -254.38832092285156, + "logps/rejected": -179.5138702392578, + "loss": 0.6596, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.3233962953090668, + "rewards/margins": 0.10601967573165894, + "rewards/rejected": 0.21737661957740784, + "step": 3045 + }, + { + "epoch": 0.47106127972163153, + "grad_norm": 4.058988094329834, + "learning_rate": 4.683239775461107e-06, + "logits/chosen": 6.2645063400268555, + "logits/rejected": 6.5252790451049805, + "logps/chosen": -204.92681884765625, + "logps/rejected": -201.12493896484375, + "loss": 0.6349, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.255244642496109, + "rewards/margins": 0.30357885360717773, + "rewards/rejected": -0.04833421856164932, + "step": 3046 + }, + { + "epoch": 0.4712159288613957, + "grad_norm": 6.851743221282959, + "learning_rate": 4.682953373811434e-06, + "logits/chosen": 6.320741653442383, + "logits/rejected": 7.543875694274902, + "logps/chosen": -245.53750610351562, + "logps/rejected": -233.72482299804688, + "loss": 0.716, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.14921225607395172, + "rewards/margins": 0.006156541407108307, + "rewards/rejected": -0.15536880493164062, + "step": 3047 + }, + { + "epoch": 0.4713705780011599, + "grad_norm": 5.082075119018555, + "learning_rate": 4.6826669721617605e-06, + "logits/chosen": 11.506973266601562, + "logits/rejected": 4.627630710601807, + "logps/chosen": -390.1814270019531, + "logps/rejected": -311.8535461425781, + "loss": 0.5567, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.36454325914382935, + "rewards/margins": 0.5128042697906494, + "rewards/rejected": -0.14826098084449768, + "step": 3048 + }, + { + "epoch": 0.47152522714092404, + "grad_norm": 11.612847328186035, + "learning_rate": 4.682380570512086e-06, + "logits/chosen": 7.6846537590026855, + "logits/rejected": 5.296227931976318, + "logps/chosen": -224.20394897460938, + "logps/rejected": -196.99183654785156, + "loss": 0.787, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.02309267222881317, + "rewards/margins": -0.1418527215719223, + "rewards/rejected": 0.16494536399841309, + "step": 3049 + }, + { + "epoch": 0.4716798762806882, + "grad_norm": 6.362085819244385, + "learning_rate": 4.682094168862413e-06, + "logits/chosen": 4.7474164962768555, + "logits/rejected": 9.70876693725586, + "logps/chosen": -262.0009765625, + "logps/rejected": -298.1358642578125, + "loss": 0.8143, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.12552708387374878, + "rewards/margins": 0.05059318244457245, + "rewards/rejected": 0.07493391633033752, + "step": 3050 + }, + { + "epoch": 0.47183452542045234, + "grad_norm": 7.122706890106201, + "learning_rate": 4.68180776721274e-06, + "logits/chosen": 3.781843900680542, + "logits/rejected": 9.156026840209961, + "logps/chosen": -198.87661743164062, + "logps/rejected": -236.2912139892578, + "loss": 0.7254, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.02089890092611313, + "rewards/margins": 0.03717374801635742, + "rewards/rejected": -0.01627485454082489, + "step": 3051 + }, + { + "epoch": 0.4719891745602165, + "grad_norm": 5.172514915466309, + "learning_rate": 4.681521365563066e-06, + "logits/chosen": 10.050740242004395, + "logits/rejected": 4.665450096130371, + "logps/chosen": -160.89524841308594, + "logps/rejected": -143.93515014648438, + "loss": 0.6421, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.09480108320713043, + "rewards/margins": 0.1491040587425232, + "rewards/rejected": -0.05430297553539276, + "step": 3052 + }, + { + "epoch": 0.47214382369998065, + "grad_norm": 4.853285789489746, + "learning_rate": 4.681234963913392e-06, + "logits/chosen": 10.171934127807617, + "logits/rejected": 8.72216510772705, + "logps/chosen": -221.22750854492188, + "logps/rejected": -248.62179565429688, + "loss": 0.6148, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.248515784740448, + "rewards/margins": 0.20848342776298523, + "rewards/rejected": 0.04003235325217247, + "step": 3053 + }, + { + "epoch": 0.47229847283974485, + "grad_norm": 5.922132968902588, + "learning_rate": 4.680948562263719e-06, + "logits/chosen": 10.94228458404541, + "logits/rejected": 7.261427879333496, + "logps/chosen": -189.93734741210938, + "logps/rejected": -164.9632568359375, + "loss": 0.7664, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.11140406876802444, + "rewards/margins": -0.004519656300544739, + "rewards/rejected": -0.1068844348192215, + "step": 3054 + }, + { + "epoch": 0.472453121979509, + "grad_norm": 8.111992835998535, + "learning_rate": 4.680662160614045e-06, + "logits/chosen": 6.163878440856934, + "logits/rejected": 8.51284122467041, + "logps/chosen": -324.60675048828125, + "logps/rejected": -290.0404052734375, + "loss": 0.7231, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.3316325545310974, + "rewards/margins": 0.031124353408813477, + "rewards/rejected": 0.3005082309246063, + "step": 3055 + }, + { + "epoch": 0.47260777111927316, + "grad_norm": 5.61262845993042, + "learning_rate": 4.680375758964372e-06, + "logits/chosen": 2.4881088733673096, + "logits/rejected": 6.924922466278076, + "logps/chosen": -178.37986755371094, + "logps/rejected": -235.50222778320312, + "loss": 0.7066, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.17450298368930817, + "rewards/margins": -0.002118505537509918, + "rewards/rejected": 0.17662151157855988, + "step": 3056 + }, + { + "epoch": 0.4727624202590373, + "grad_norm": 8.181360244750977, + "learning_rate": 4.680089357314698e-06, + "logits/chosen": 8.793135643005371, + "logits/rejected": 5.763635635375977, + "logps/chosen": -348.526123046875, + "logps/rejected": -321.23480224609375, + "loss": 0.8251, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.03598213568329811, + "rewards/margins": -0.15068131685256958, + "rewards/rejected": 0.1866634488105774, + "step": 3057 + }, + { + "epoch": 0.47291706939880146, + "grad_norm": 5.557372570037842, + "learning_rate": 4.6798029556650245e-06, + "logits/chosen": 9.104660034179688, + "logits/rejected": 8.948171615600586, + "logps/chosen": -189.0396728515625, + "logps/rejected": -201.1279296875, + "loss": 0.753, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.06726804375648499, + "rewards/margins": -0.09653091430664062, + "rewards/rejected": 0.1637989580631256, + "step": 3058 + }, + { + "epoch": 0.4730717185385656, + "grad_norm": 15.182119369506836, + "learning_rate": 4.679516554015351e-06, + "logits/chosen": 17.72298812866211, + "logits/rejected": 14.906938552856445, + "logps/chosen": -252.43881225585938, + "logps/rejected": -295.6755065917969, + "loss": 0.7163, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.43697845935821533, + "rewards/margins": 0.05405254662036896, + "rewards/rejected": 0.3829258978366852, + "step": 3059 + }, + { + "epoch": 0.47322636767832976, + "grad_norm": 50.033668518066406, + "learning_rate": 4.679230152365678e-06, + "logits/chosen": 6.469582557678223, + "logits/rejected": 10.854336738586426, + "logps/chosen": -340.3044738769531, + "logps/rejected": -407.75933837890625, + "loss": 0.8822, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.16917115449905396, + "rewards/margins": -0.2311612367630005, + "rewards/rejected": 0.40033239126205444, + "step": 3060 + }, + { + "epoch": 0.47338101681809397, + "grad_norm": 6.531163215637207, + "learning_rate": 4.6789437507160045e-06, + "logits/chosen": 5.292582035064697, + "logits/rejected": 5.018515110015869, + "logps/chosen": -207.46969604492188, + "logps/rejected": -217.6377716064453, + "loss": 0.7886, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.07034319639205933, + "rewards/margins": -0.15117892622947693, + "rewards/rejected": 0.0808357447385788, + "step": 3061 + }, + { + "epoch": 0.4735356659578581, + "grad_norm": 4.9883246421813965, + "learning_rate": 4.678657349066331e-06, + "logits/chosen": 7.317048072814941, + "logits/rejected": 8.744040489196777, + "logps/chosen": -271.0904235839844, + "logps/rejected": -397.14715576171875, + "loss": 0.5064, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.25454989075660706, + "rewards/margins": 0.527854859828949, + "rewards/rejected": -0.27330493927001953, + "step": 3062 + }, + { + "epoch": 0.47369031509762227, + "grad_norm": 6.8340678215026855, + "learning_rate": 4.678370947416658e-06, + "logits/chosen": 5.69686222076416, + "logits/rejected": 10.370161056518555, + "logps/chosen": -319.1346130371094, + "logps/rejected": -314.53448486328125, + "loss": 0.7169, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.3967639207839966, + "rewards/margins": 0.13788823783397675, + "rewards/rejected": 0.25887566804885864, + "step": 3063 + }, + { + "epoch": 0.4738449642373864, + "grad_norm": 5.961577892303467, + "learning_rate": 4.6780845457669836e-06, + "logits/chosen": 11.684815406799316, + "logits/rejected": 9.028783798217773, + "logps/chosen": -246.92117309570312, + "logps/rejected": -227.92347717285156, + "loss": 0.7111, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.41813087463378906, + "rewards/margins": 0.13988414406776428, + "rewards/rejected": 0.27824676036834717, + "step": 3064 + }, + { + "epoch": 0.4739996133771506, + "grad_norm": 5.843420505523682, + "learning_rate": 4.67779814411731e-06, + "logits/chosen": 6.879061222076416, + "logits/rejected": 7.678164005279541, + "logps/chosen": -298.0958557128906, + "logps/rejected": -362.84759521484375, + "loss": 0.7892, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.1783698946237564, + "rewards/margins": 0.08849802613258362, + "rewards/rejected": 0.08987187594175339, + "step": 3065 + }, + { + "epoch": 0.4741542625169147, + "grad_norm": 7.636201858520508, + "learning_rate": 4.677511742467637e-06, + "logits/chosen": 8.735830307006836, + "logits/rejected": 9.869014739990234, + "logps/chosen": -253.0509490966797, + "logps/rejected": -264.879638671875, + "loss": 0.8386, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.22172188758850098, + "rewards/margins": -0.18404985964298248, + "rewards/rejected": 0.40577173233032227, + "step": 3066 + }, + { + "epoch": 0.47430891165667893, + "grad_norm": 9.161300659179688, + "learning_rate": 4.6772253408179635e-06, + "logits/chosen": 8.81488037109375, + "logits/rejected": 6.985283851623535, + "logps/chosen": -306.99237060546875, + "logps/rejected": -262.3344421386719, + "loss": 1.0115, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.07892848551273346, + "rewards/margins": -0.41635429859161377, + "rewards/rejected": 0.3374258279800415, + "step": 3067 + }, + { + "epoch": 0.4744635607964431, + "grad_norm": 4.1316304206848145, + "learning_rate": 4.67693893916829e-06, + "logits/chosen": 10.428282737731934, + "logits/rejected": 7.2699689865112305, + "logps/chosen": -244.53489685058594, + "logps/rejected": -213.11257934570312, + "loss": 0.5632, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.08608321845531464, + "rewards/margins": 0.3916701674461365, + "rewards/rejected": -0.30558690428733826, + "step": 3068 + }, + { + "epoch": 0.47461820993620724, + "grad_norm": 5.726138591766357, + "learning_rate": 4.676652537518617e-06, + "logits/chosen": 10.898289680480957, + "logits/rejected": 10.581116676330566, + "logps/chosen": -273.69757080078125, + "logps/rejected": -260.623046875, + "loss": 0.7086, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.010938942432403564, + "rewards/margins": 0.28959521651268005, + "rewards/rejected": -0.2786563038825989, + "step": 3069 + }, + { + "epoch": 0.4747728590759714, + "grad_norm": 5.0053558349609375, + "learning_rate": 4.676366135868943e-06, + "logits/chosen": 8.677099227905273, + "logits/rejected": 4.523341178894043, + "logps/chosen": -354.2060241699219, + "logps/rejected": -284.7891845703125, + "loss": 0.5478, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.3860313892364502, + "rewards/margins": 0.3774925172328949, + "rewards/rejected": 0.008538894355297089, + "step": 3070 + }, + { + "epoch": 0.47492750821573554, + "grad_norm": 7.444076061248779, + "learning_rate": 4.676079734219269e-06, + "logits/chosen": 3.152534008026123, + "logits/rejected": 8.387313842773438, + "logps/chosen": -183.9143524169922, + "logps/rejected": -293.35693359375, + "loss": 0.6525, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.054882604628801346, + "rewards/margins": 0.16610418260097504, + "rewards/rejected": -0.11122157424688339, + "step": 3071 + }, + { + "epoch": 0.4750821573554997, + "grad_norm": 4.019036769866943, + "learning_rate": 4.675793332569596e-06, + "logits/chosen": 11.028069496154785, + "logits/rejected": 6.256389617919922, + "logps/chosen": -176.41415405273438, + "logps/rejected": -96.75796508789062, + "loss": 0.6107, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.09741778671741486, + "rewards/margins": 0.2165750116109848, + "rewards/rejected": -0.31399279832839966, + "step": 3072 + }, + { + "epoch": 0.4752368064952639, + "grad_norm": 6.574222564697266, + "learning_rate": 4.6755069309199226e-06, + "logits/chosen": 4.31530237197876, + "logits/rejected": 4.590390682220459, + "logps/chosen": -215.89593505859375, + "logps/rejected": -269.44268798828125, + "loss": 0.7181, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.06494760513305664, + "rewards/margins": 0.033523403108119965, + "rewards/rejected": 0.031424202024936676, + "step": 3073 + }, + { + "epoch": 0.47539145563502805, + "grad_norm": 5.839081764221191, + "learning_rate": 4.675220529270249e-06, + "logits/chosen": 8.054490089416504, + "logits/rejected": 9.482805252075195, + "logps/chosen": -225.25442504882812, + "logps/rejected": -272.8577880859375, + "loss": 0.7333, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.1386970579624176, + "rewards/margins": -0.01759222522377968, + "rewards/rejected": 0.15628927946090698, + "step": 3074 + }, + { + "epoch": 0.4755461047747922, + "grad_norm": 4.601315021514893, + "learning_rate": 4.674934127620576e-06, + "logits/chosen": 13.395176887512207, + "logits/rejected": 7.851316452026367, + "logps/chosen": -209.703857421875, + "logps/rejected": -181.48806762695312, + "loss": 0.6291, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.2169143110513687, + "rewards/margins": 0.2594432234764099, + "rewards/rejected": -0.042528897523880005, + "step": 3075 + }, + { + "epoch": 0.47570075391455635, + "grad_norm": 7.562048435211182, + "learning_rate": 4.6746477259709025e-06, + "logits/chosen": 4.879905700683594, + "logits/rejected": 9.656997680664062, + "logps/chosen": -223.97235107421875, + "logps/rejected": -282.97393798828125, + "loss": 0.8576, + "rewards/accuracies": 0.25, + "rewards/chosen": 0.23566828668117523, + "rewards/margins": -0.16209626197814941, + "rewards/rejected": 0.39776450395584106, + "step": 3076 + }, + { + "epoch": 0.4758554030543205, + "grad_norm": 7.674280166625977, + "learning_rate": 4.674361324321228e-06, + "logits/chosen": 8.150238990783691, + "logits/rejected": 2.8680195808410645, + "logps/chosen": -320.40460205078125, + "logps/rejected": -282.52117919921875, + "loss": 0.7678, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.1763329654932022, + "rewards/margins": -0.020856082439422607, + "rewards/rejected": 0.197189062833786, + "step": 3077 + }, + { + "epoch": 0.47601005219408465, + "grad_norm": 6.089390277862549, + "learning_rate": 4.674074922671555e-06, + "logits/chosen": 11.534485816955566, + "logits/rejected": 7.353596210479736, + "logps/chosen": -311.7340393066406, + "logps/rejected": -248.7273406982422, + "loss": 0.5112, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.47398436069488525, + "rewards/margins": 0.4283140003681183, + "rewards/rejected": 0.045670315623283386, + "step": 3078 + }, + { + "epoch": 0.4761647013338488, + "grad_norm": 5.227465629577637, + "learning_rate": 4.673788521021882e-06, + "logits/chosen": 7.246560096740723, + "logits/rejected": 5.192907333374023, + "logps/chosen": -256.4664611816406, + "logps/rejected": -264.9912109375, + "loss": 0.6469, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.21067595481872559, + "rewards/margins": 0.12050056457519531, + "rewards/rejected": 0.09017538279294968, + "step": 3079 + }, + { + "epoch": 0.476319350473613, + "grad_norm": 4.536075592041016, + "learning_rate": 4.673502119372208e-06, + "logits/chosen": 9.53080940246582, + "logits/rejected": 2.5608153343200684, + "logps/chosen": -220.08416748046875, + "logps/rejected": -171.38954162597656, + "loss": 0.5955, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.4440489113330841, + "rewards/margins": 0.24446091055870056, + "rewards/rejected": 0.19958800077438354, + "step": 3080 + }, + { + "epoch": 0.47647399961337716, + "grad_norm": 4.754151821136475, + "learning_rate": 4.673215717722535e-06, + "logits/chosen": 10.98740005493164, + "logits/rejected": 6.7599592208862305, + "logps/chosen": -270.01141357421875, + "logps/rejected": -188.2509765625, + "loss": 0.6372, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.3542192578315735, + "rewards/margins": 0.216253861784935, + "rewards/rejected": 0.1379653811454773, + "step": 3081 + }, + { + "epoch": 0.4766286487531413, + "grad_norm": 6.546957015991211, + "learning_rate": 4.672929316072862e-06, + "logits/chosen": 12.19477367401123, + "logits/rejected": 6.244563102722168, + "logps/chosen": -320.218017578125, + "logps/rejected": -249.3705291748047, + "loss": 0.5361, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.5429012775421143, + "rewards/margins": 0.4676016867160797, + "rewards/rejected": 0.07529956102371216, + "step": 3082 + }, + { + "epoch": 0.47678329789290547, + "grad_norm": 4.286785125732422, + "learning_rate": 4.672642914423187e-06, + "logits/chosen": 6.530529975891113, + "logits/rejected": 7.054716110229492, + "logps/chosen": -214.75149536132812, + "logps/rejected": -237.53419494628906, + "loss": 0.6876, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.311136394739151, + "rewards/margins": 0.0889199748635292, + "rewards/rejected": 0.2222164124250412, + "step": 3083 + }, + { + "epoch": 0.4769379470326696, + "grad_norm": 5.053194046020508, + "learning_rate": 4.672356512773514e-06, + "logits/chosen": 8.8037109375, + "logits/rejected": 9.088529586791992, + "logps/chosen": -208.4940948486328, + "logps/rejected": -206.71035766601562, + "loss": 0.6826, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.23251323401927948, + "rewards/margins": 0.0745190978050232, + "rewards/rejected": 0.1579941362142563, + "step": 3084 + }, + { + "epoch": 0.47709259617243377, + "grad_norm": 7.402444839477539, + "learning_rate": 4.672070111123841e-06, + "logits/chosen": 10.791864395141602, + "logits/rejected": 5.92660665512085, + "logps/chosen": -299.4687805175781, + "logps/rejected": -192.10618591308594, + "loss": 0.6938, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.13676252961158752, + "rewards/margins": 0.23840409517288208, + "rewards/rejected": -0.10164155811071396, + "step": 3085 + }, + { + "epoch": 0.477247245312198, + "grad_norm": 5.109952926635742, + "learning_rate": 4.671783709474167e-06, + "logits/chosen": 10.46800708770752, + "logits/rejected": 6.0344038009643555, + "logps/chosen": -211.48513793945312, + "logps/rejected": -251.4752655029297, + "loss": 0.5387, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.3065238296985626, + "rewards/margins": 0.5590267181396484, + "rewards/rejected": -0.2525028884410858, + "step": 3086 + }, + { + "epoch": 0.4774018944519621, + "grad_norm": 4.210285663604736, + "learning_rate": 4.671497307824493e-06, + "logits/chosen": 11.207535743713379, + "logits/rejected": 10.102213859558105, + "logps/chosen": -234.4876708984375, + "logps/rejected": -266.76043701171875, + "loss": 0.6449, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.3969913721084595, + "rewards/margins": 0.1619933545589447, + "rewards/rejected": 0.23499798774719238, + "step": 3087 + }, + { + "epoch": 0.4775565435917263, + "grad_norm": 5.691965579986572, + "learning_rate": 4.67121090617482e-06, + "logits/chosen": 5.82404899597168, + "logits/rejected": 7.568882465362549, + "logps/chosen": -306.07879638671875, + "logps/rejected": -267.31787109375, + "loss": 0.6646, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.2845696210861206, + "rewards/margins": 0.133896604180336, + "rewards/rejected": 0.1506730020046234, + "step": 3088 + }, + { + "epoch": 0.47771119273149043, + "grad_norm": 37.72473907470703, + "learning_rate": 4.6709245045251464e-06, + "logits/chosen": 12.321680068969727, + "logits/rejected": -0.4262933135032654, + "logps/chosen": -320.69903564453125, + "logps/rejected": -139.259033203125, + "loss": 0.4984, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.2670469880104065, + "rewards/margins": 0.48887258768081665, + "rewards/rejected": -0.22182559967041016, + "step": 3089 + }, + { + "epoch": 0.4778658418712546, + "grad_norm": 6.362565517425537, + "learning_rate": 4.670638102875473e-06, + "logits/chosen": 10.374752044677734, + "logits/rejected": 10.374979019165039, + "logps/chosen": -248.73681640625, + "logps/rejected": -266.8424987792969, + "loss": 0.5548, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.10653285682201385, + "rewards/margins": 0.3668130934238434, + "rewards/rejected": -0.26028022170066833, + "step": 3090 + }, + { + "epoch": 0.47802049101101873, + "grad_norm": 4.842832565307617, + "learning_rate": 4.670351701225799e-06, + "logits/chosen": 11.556646347045898, + "logits/rejected": 1.400526523590088, + "logps/chosen": -485.8091735839844, + "logps/rejected": -235.7618408203125, + "loss": 0.4774, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.5398233532905579, + "rewards/margins": 0.5973602533340454, + "rewards/rejected": -0.05753690004348755, + "step": 3091 + }, + { + "epoch": 0.4781751401507829, + "grad_norm": 6.038628578186035, + "learning_rate": 4.6700652995761255e-06, + "logits/chosen": 15.006936073303223, + "logits/rejected": 14.903401374816895, + "logps/chosen": -363.13494873046875, + "logps/rejected": -310.9366455078125, + "loss": 0.7972, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.046074531972408295, + "rewards/margins": -0.10337035357952118, + "rewards/rejected": 0.057295799255371094, + "step": 3092 + }, + { + "epoch": 0.4783297892905471, + "grad_norm": 7.2743144035339355, + "learning_rate": 4.669778897926452e-06, + "logits/chosen": 7.723608016967773, + "logits/rejected": 5.050766944885254, + "logps/chosen": -418.19622802734375, + "logps/rejected": -267.413330078125, + "loss": 0.7688, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.26131361722946167, + "rewards/margins": -0.11878012865781784, + "rewards/rejected": 0.3800937235355377, + "step": 3093 + }, + { + "epoch": 0.47848443843031124, + "grad_norm": 5.995887756347656, + "learning_rate": 4.669492496276779e-06, + "logits/chosen": 8.6649751663208, + "logits/rejected": 5.4912028312683105, + "logps/chosen": -234.44033813476562, + "logps/rejected": -187.11074829101562, + "loss": 0.6416, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.5310311317443848, + "rewards/margins": 0.19231672585010529, + "rewards/rejected": 0.3387144207954407, + "step": 3094 + }, + { + "epoch": 0.4786390875700754, + "grad_norm": 4.676825046539307, + "learning_rate": 4.6692060946271055e-06, + "logits/chosen": 11.066549301147461, + "logits/rejected": 3.7531991004943848, + "logps/chosen": -258.84423828125, + "logps/rejected": -168.07275390625, + "loss": 0.6382, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.16205647587776184, + "rewards/margins": 0.1746605932712555, + "rewards/rejected": -0.012604091316461563, + "step": 3095 + }, + { + "epoch": 0.47879373670983955, + "grad_norm": 4.744071960449219, + "learning_rate": 4.668919692977432e-06, + "logits/chosen": 12.52891731262207, + "logits/rejected": 7.974610328674316, + "logps/chosen": -285.4501647949219, + "logps/rejected": -218.45010375976562, + "loss": 0.6795, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.20454664528369904, + "rewards/margins": 0.0979498028755188, + "rewards/rejected": 0.10659685730934143, + "step": 3096 + }, + { + "epoch": 0.4789483858496037, + "grad_norm": 41.192081451416016, + "learning_rate": 4.668633291327758e-06, + "logits/chosen": 11.171636581420898, + "logits/rejected": 10.808284759521484, + "logps/chosen": -268.31732177734375, + "logps/rejected": -313.15496826171875, + "loss": 0.6801, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.3524947166442871, + "rewards/margins": 0.05625344067811966, + "rewards/rejected": 0.29624128341674805, + "step": 3097 + }, + { + "epoch": 0.47910303498936785, + "grad_norm": 12.082710266113281, + "learning_rate": 4.668346889678085e-06, + "logits/chosen": 9.04279899597168, + "logits/rejected": 4.530684471130371, + "logps/chosen": -303.1335754394531, + "logps/rejected": -236.166015625, + "loss": 0.5328, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.25709667801856995, + "rewards/margins": 0.4197738766670227, + "rewards/rejected": -0.16267719864845276, + "step": 3098 + }, + { + "epoch": 0.47925768412913206, + "grad_norm": 7.9044189453125, + "learning_rate": 4.668060488028411e-06, + "logits/chosen": 10.09089469909668, + "logits/rejected": 11.316904067993164, + "logps/chosen": -302.5939636230469, + "logps/rejected": -319.34088134765625, + "loss": 0.7032, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.12135867029428482, + "rewards/margins": 0.13457375764846802, + "rewards/rejected": -0.013215094804763794, + "step": 3099 + }, + { + "epoch": 0.4794123332688962, + "grad_norm": 5.916541576385498, + "learning_rate": 4.667774086378738e-06, + "logits/chosen": 7.774745941162109, + "logits/rejected": 4.12262487411499, + "logps/chosen": -205.13259887695312, + "logps/rejected": -114.50479125976562, + "loss": 0.688, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.07930092513561249, + "rewards/margins": 0.05635065212845802, + "rewards/rejected": 0.022950269281864166, + "step": 3100 + }, + { + "epoch": 0.47956698240866036, + "grad_norm": 4.6126322746276855, + "learning_rate": 4.6674876847290645e-06, + "logits/chosen": 4.103279113769531, + "logits/rejected": 3.9995923042297363, + "logps/chosen": -266.60174560546875, + "logps/rejected": -200.19781494140625, + "loss": 0.584, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.426532506942749, + "rewards/margins": 0.2903444766998291, + "rewards/rejected": 0.13618803024291992, + "step": 3101 + }, + { + "epoch": 0.4797216315484245, + "grad_norm": 5.278357982635498, + "learning_rate": 4.667201283079391e-06, + "logits/chosen": 7.119441509246826, + "logits/rejected": 1.8866794109344482, + "logps/chosen": -224.30630493164062, + "logps/rejected": -163.84725952148438, + "loss": 0.6944, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.0046348609030246735, + "rewards/margins": 0.06837233901023865, + "rewards/rejected": -0.06373748928308487, + "step": 3102 + }, + { + "epoch": 0.47987628068818866, + "grad_norm": 6.516919136047363, + "learning_rate": 4.666914881429717e-06, + "logits/chosen": 10.329404830932617, + "logits/rejected": 5.345468997955322, + "logps/chosen": -314.9106140136719, + "logps/rejected": -253.62203979492188, + "loss": 0.6378, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.25726640224456787, + "rewards/margins": 0.16195009648799896, + "rewards/rejected": 0.0953163281083107, + "step": 3103 + }, + { + "epoch": 0.4800309298279528, + "grad_norm": 5.054995059967041, + "learning_rate": 4.666628479780044e-06, + "logits/chosen": 12.634053230285645, + "logits/rejected": 6.672888278961182, + "logps/chosen": -312.3268127441406, + "logps/rejected": -250.08203125, + "loss": 0.5894, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.4982706308364868, + "rewards/margins": 0.334505558013916, + "rewards/rejected": 0.1637650430202484, + "step": 3104 + }, + { + "epoch": 0.480185578967717, + "grad_norm": 6.274186611175537, + "learning_rate": 4.66634207813037e-06, + "logits/chosen": 7.107688903808594, + "logits/rejected": 7.325886249542236, + "logps/chosen": -256.94720458984375, + "logps/rejected": -267.4947204589844, + "loss": 0.6929, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.31862062215805054, + "rewards/margins": 0.05860920250415802, + "rewards/rejected": 0.2600114047527313, + "step": 3105 + }, + { + "epoch": 0.48034022810748117, + "grad_norm": 5.772373199462891, + "learning_rate": 4.666055676480697e-06, + "logits/chosen": 12.983613014221191, + "logits/rejected": 13.001771926879883, + "logps/chosen": -300.5957946777344, + "logps/rejected": -301.8417663574219, + "loss": 0.6318, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.28683149814605713, + "rewards/margins": 0.21356192231178284, + "rewards/rejected": 0.07326958328485489, + "step": 3106 + }, + { + "epoch": 0.4804948772472453, + "grad_norm": 3.7809906005859375, + "learning_rate": 4.665769274831024e-06, + "logits/chosen": 9.988656997680664, + "logits/rejected": 8.445301055908203, + "logps/chosen": -290.3752746582031, + "logps/rejected": -163.20372009277344, + "loss": 0.529, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.36333411931991577, + "rewards/margins": 0.4124641418457031, + "rewards/rejected": -0.04913001134991646, + "step": 3107 + }, + { + "epoch": 0.4806495263870095, + "grad_norm": 6.993603706359863, + "learning_rate": 4.66548287318135e-06, + "logits/chosen": 8.771278381347656, + "logits/rejected": 13.76442813873291, + "logps/chosen": -264.7418518066406, + "logps/rejected": -280.3789978027344, + "loss": 0.6834, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.27477264404296875, + "rewards/margins": 0.11957436800003052, + "rewards/rejected": 0.15519830584526062, + "step": 3108 + }, + { + "epoch": 0.4808041755267736, + "grad_norm": 4.009907245635986, + "learning_rate": 4.665196471531677e-06, + "logits/chosen": 6.039556980133057, + "logits/rejected": 3.0460400581359863, + "logps/chosen": -363.201171875, + "logps/rejected": -243.82952880859375, + "loss": 0.5105, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.3379444479942322, + "rewards/margins": 0.6768665313720703, + "rewards/rejected": -0.3389221131801605, + "step": 3109 + }, + { + "epoch": 0.4809588246665378, + "grad_norm": 6.993696689605713, + "learning_rate": 4.664910069882003e-06, + "logits/chosen": 13.721200942993164, + "logits/rejected": 7.4661865234375, + "logps/chosen": -361.32061767578125, + "logps/rejected": -225.691162109375, + "loss": 0.6726, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.2556111216545105, + "rewards/margins": 0.18139737844467163, + "rewards/rejected": 0.07421379536390305, + "step": 3110 + }, + { + "epoch": 0.48111347380630193, + "grad_norm": 9.799798965454102, + "learning_rate": 4.664623668232329e-06, + "logits/chosen": 11.079540252685547, + "logits/rejected": 11.499974250793457, + "logps/chosen": -441.81695556640625, + "logps/rejected": -403.2650146484375, + "loss": 0.767, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.2043754607439041, + "rewards/margins": -0.07378693670034409, + "rewards/rejected": 0.2781623899936676, + "step": 3111 + }, + { + "epoch": 0.48126812294606613, + "grad_norm": 6.90119743347168, + "learning_rate": 4.664337266582656e-06, + "logits/chosen": 7.199223518371582, + "logits/rejected": 14.866822242736816, + "logps/chosen": -310.2446594238281, + "logps/rejected": -423.78887939453125, + "loss": 0.6777, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.08916039019823074, + "rewards/margins": 0.09727483242750168, + "rewards/rejected": -0.008114442229270935, + "step": 3112 + }, + { + "epoch": 0.4814227720858303, + "grad_norm": 5.121906757354736, + "learning_rate": 4.664050864932983e-06, + "logits/chosen": 8.25510025024414, + "logits/rejected": 5.587943077087402, + "logps/chosen": -331.591796875, + "logps/rejected": -258.9278564453125, + "loss": 0.5827, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.2942407727241516, + "rewards/margins": 0.4519074857234955, + "rewards/rejected": -0.15766669809818268, + "step": 3113 + }, + { + "epoch": 0.48157742122559444, + "grad_norm": 4.946910381317139, + "learning_rate": 4.663764463283309e-06, + "logits/chosen": 13.50670337677002, + "logits/rejected": 9.783855438232422, + "logps/chosen": -238.26123046875, + "logps/rejected": -217.48915100097656, + "loss": 0.5451, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.49375224113464355, + "rewards/margins": 0.44415006041526794, + "rewards/rejected": 0.049602165818214417, + "step": 3114 + }, + { + "epoch": 0.4817320703653586, + "grad_norm": 5.88555383682251, + "learning_rate": 4.663478061633636e-06, + "logits/chosen": 10.317214965820312, + "logits/rejected": 4.8607611656188965, + "logps/chosen": -271.45263671875, + "logps/rejected": -218.02114868164062, + "loss": 0.6784, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.19734486937522888, + "rewards/margins": 0.042775336652994156, + "rewards/rejected": 0.15456953644752502, + "step": 3115 + }, + { + "epoch": 0.48188671950512274, + "grad_norm": 5.0947265625, + "learning_rate": 4.663191659983962e-06, + "logits/chosen": 8.713740348815918, + "logits/rejected": 6.296026706695557, + "logps/chosen": -223.08221435546875, + "logps/rejected": -154.70620727539062, + "loss": 0.7356, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.1956629753112793, + "rewards/margins": -0.03696922957897186, + "rewards/rejected": 0.23263218998908997, + "step": 3116 + }, + { + "epoch": 0.4820413686448869, + "grad_norm": 10.16594123840332, + "learning_rate": 4.662905258334288e-06, + "logits/chosen": 9.132078170776367, + "logits/rejected": 9.844566345214844, + "logps/chosen": -546.7945556640625, + "logps/rejected": -361.9330749511719, + "loss": 0.7085, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.26457977294921875, + "rewards/margins": 0.07653006166219711, + "rewards/rejected": 0.18804970383644104, + "step": 3117 + }, + { + "epoch": 0.4821960177846511, + "grad_norm": 5.411478042602539, + "learning_rate": 4.662618856684615e-06, + "logits/chosen": 7.599648475646973, + "logits/rejected": 9.483412742614746, + "logps/chosen": -241.53836059570312, + "logps/rejected": -250.0540313720703, + "loss": 0.63, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.41807055473327637, + "rewards/margins": 0.1949913650751114, + "rewards/rejected": 0.22307920455932617, + "step": 3118 + }, + { + "epoch": 0.48235066692441525, + "grad_norm": 3.5089199542999268, + "learning_rate": 4.662332455034942e-06, + "logits/chosen": 8.58030891418457, + "logits/rejected": 5.935327529907227, + "logps/chosen": -163.31246948242188, + "logps/rejected": -102.3130874633789, + "loss": 0.6235, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.045124948024749756, + "rewards/margins": 0.2460692673921585, + "rewards/rejected": -0.20094431936740875, + "step": 3119 + }, + { + "epoch": 0.4825053160641794, + "grad_norm": 8.25362491607666, + "learning_rate": 4.662046053385268e-06, + "logits/chosen": 14.522830963134766, + "logits/rejected": 9.616324424743652, + "logps/chosen": -417.04278564453125, + "logps/rejected": -322.84259033203125, + "loss": 0.7294, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.18802796304225922, + "rewards/margins": -0.04016497731208801, + "rewards/rejected": 0.22819297015666962, + "step": 3120 + }, + { + "epoch": 0.48265996520394355, + "grad_norm": 5.709573745727539, + "learning_rate": 4.661759651735594e-06, + "logits/chosen": 11.128902435302734, + "logits/rejected": 8.42249870300293, + "logps/chosen": -273.7630615234375, + "logps/rejected": -234.2310333251953, + "loss": 0.7539, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.2798891067504883, + "rewards/margins": 0.09342312812805176, + "rewards/rejected": 0.18646597862243652, + "step": 3121 + }, + { + "epoch": 0.4828146143437077, + "grad_norm": 6.523487091064453, + "learning_rate": 4.661473250085921e-06, + "logits/chosen": 7.685569763183594, + "logits/rejected": 7.032012939453125, + "logps/chosen": -301.07135009765625, + "logps/rejected": -433.1878356933594, + "loss": 0.565, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.4107076823711395, + "rewards/margins": 0.39451009035110474, + "rewards/rejected": 0.016197582706809044, + "step": 3122 + }, + { + "epoch": 0.48296926348347186, + "grad_norm": 7.286925792694092, + "learning_rate": 4.6611868484362475e-06, + "logits/chosen": 8.256193161010742, + "logits/rejected": 10.909571647644043, + "logps/chosen": -384.4923095703125, + "logps/rejected": -283.5874938964844, + "loss": 0.8165, + "rewards/accuracies": 0.125, + "rewards/chosen": 0.16667160391807556, + "rewards/margins": -0.22564469277858734, + "rewards/rejected": 0.3923163115978241, + "step": 3123 + }, + { + "epoch": 0.483123912623236, + "grad_norm": 4.95969295501709, + "learning_rate": 4.660900446786574e-06, + "logits/chosen": 8.335537910461426, + "logits/rejected": 6.66776704788208, + "logps/chosen": -242.01260375976562, + "logps/rejected": -229.59786987304688, + "loss": 0.5817, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.5110227465629578, + "rewards/margins": 0.28857529163360596, + "rewards/rejected": 0.2224474549293518, + "step": 3124 + }, + { + "epoch": 0.4832785617630002, + "grad_norm": 8.299555778503418, + "learning_rate": 4.6606140451369e-06, + "logits/chosen": 14.352217674255371, + "logits/rejected": 7.983645439147949, + "logps/chosen": -324.21917724609375, + "logps/rejected": -255.78677368164062, + "loss": 0.3975, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.5353757739067078, + "rewards/margins": 0.8145855069160461, + "rewards/rejected": -0.27920979261398315, + "step": 3125 + }, + { + "epoch": 0.48343321090276437, + "grad_norm": 5.962520599365234, + "learning_rate": 4.6603276434872266e-06, + "logits/chosen": 10.823653221130371, + "logits/rejected": 8.550436019897461, + "logps/chosen": -284.08294677734375, + "logps/rejected": -339.2457275390625, + "loss": 0.5209, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.606833815574646, + "rewards/margins": 0.5953049659729004, + "rewards/rejected": 0.0115288645029068, + "step": 3126 + }, + { + "epoch": 0.4835878600425285, + "grad_norm": 11.590056419372559, + "learning_rate": 4.660041241837553e-06, + "logits/chosen": 5.133481979370117, + "logits/rejected": 6.446111679077148, + "logps/chosen": -315.7890930175781, + "logps/rejected": -257.5774841308594, + "loss": 0.7486, + "rewards/accuracies": 0.25, + "rewards/chosen": -0.008068151772022247, + "rewards/margins": -0.045098286122083664, + "rewards/rejected": 0.03703012689948082, + "step": 3127 + }, + { + "epoch": 0.48374250918229267, + "grad_norm": 6.511941432952881, + "learning_rate": 4.65975484018788e-06, + "logits/chosen": 10.51563549041748, + "logits/rejected": 8.67074203491211, + "logps/chosen": -248.95477294921875, + "logps/rejected": -211.943603515625, + "loss": 0.758, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.1153748631477356, + "rewards/margins": -0.07697518169879913, + "rewards/rejected": 0.19235001504421234, + "step": 3128 + }, + { + "epoch": 0.4838971583220568, + "grad_norm": 4.336550235748291, + "learning_rate": 4.6594684385382065e-06, + "logits/chosen": 6.033234596252441, + "logits/rejected": 5.192317962646484, + "logps/chosen": -204.49119567871094, + "logps/rejected": -197.9831085205078, + "loss": 0.6913, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.030600735917687416, + "rewards/margins": 0.06455759704113007, + "rewards/rejected": -0.0339568667113781, + "step": 3129 + }, + { + "epoch": 0.48405180746182097, + "grad_norm": 4.572023868560791, + "learning_rate": 4.659182036888532e-06, + "logits/chosen": 10.968120574951172, + "logits/rejected": 7.156067848205566, + "logps/chosen": -209.73492431640625, + "logps/rejected": -198.540283203125, + "loss": 0.5592, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.15447193384170532, + "rewards/margins": 0.328130841255188, + "rewards/rejected": -0.17365893721580505, + "step": 3130 + }, + { + "epoch": 0.4842064566015852, + "grad_norm": 5.495441436767578, + "learning_rate": 4.658895635238859e-06, + "logits/chosen": 8.866323471069336, + "logits/rejected": 8.06518268585205, + "logps/chosen": -191.22218322753906, + "logps/rejected": -159.1036834716797, + "loss": 0.7118, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.4054487645626068, + "rewards/margins": 0.004098765552043915, + "rewards/rejected": 0.4013499617576599, + "step": 3131 + }, + { + "epoch": 0.48436110574134933, + "grad_norm": 5.050384521484375, + "learning_rate": 4.658609233589186e-06, + "logits/chosen": 10.044139862060547, + "logits/rejected": 8.329888343811035, + "logps/chosen": -292.9063415527344, + "logps/rejected": -307.84405517578125, + "loss": 0.5093, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.39067453145980835, + "rewards/margins": 0.4525166153907776, + "rewards/rejected": -0.06184205785393715, + "step": 3132 + }, + { + "epoch": 0.4845157548811135, + "grad_norm": 4.396191596984863, + "learning_rate": 4.658322831939512e-06, + "logits/chosen": 5.640777587890625, + "logits/rejected": 2.405721426010132, + "logps/chosen": -1031.5565185546875, + "logps/rejected": -176.4678192138672, + "loss": 0.609, + "rewards/accuracies": 0.375, + "rewards/chosen": 9.063767433166504, + "rewards/margins": 8.928215026855469, + "rewards/rejected": 0.1355513483285904, + "step": 3133 + }, + { + "epoch": 0.48467040402087763, + "grad_norm": 5.602945804595947, + "learning_rate": 4.658036430289839e-06, + "logits/chosen": 7.063930511474609, + "logits/rejected": 6.749483585357666, + "logps/chosen": -145.97476196289062, + "logps/rejected": -172.72418212890625, + "loss": 0.8493, + "rewards/accuracies": 0.25, + "rewards/chosen": -0.026788663119077682, + "rewards/margins": -0.2280847728252411, + "rewards/rejected": 0.20129609107971191, + "step": 3134 + }, + { + "epoch": 0.4848250531606418, + "grad_norm": 4.217652797698975, + "learning_rate": 4.657750028640166e-06, + "logits/chosen": 12.316527366638184, + "logits/rejected": 2.074108600616455, + "logps/chosen": -162.15072631835938, + "logps/rejected": -92.1205062866211, + "loss": 0.6599, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.14895698428153992, + "rewards/margins": 0.11452591419219971, + "rewards/rejected": 0.034431055188179016, + "step": 3135 + }, + { + "epoch": 0.48497970230040593, + "grad_norm": 4.476449012756348, + "learning_rate": 4.657463626990491e-06, + "logits/chosen": 9.3092041015625, + "logits/rejected": 1.613875389099121, + "logps/chosen": -330.0496520996094, + "logps/rejected": -241.52801513671875, + "loss": 0.5316, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.5261832475662231, + "rewards/margins": 0.5591482520103455, + "rewards/rejected": -0.0329650416970253, + "step": 3136 + }, + { + "epoch": 0.48513435144017014, + "grad_norm": 6.643457412719727, + "learning_rate": 4.657177225340818e-06, + "logits/chosen": 14.627910614013672, + "logits/rejected": 7.467987060546875, + "logps/chosen": -342.03985595703125, + "logps/rejected": -284.0477294921875, + "loss": 0.5862, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.12968263030052185, + "rewards/margins": 0.5245844125747681, + "rewards/rejected": -0.3949018120765686, + "step": 3137 + }, + { + "epoch": 0.4852890005799343, + "grad_norm": 5.43419075012207, + "learning_rate": 4.656890823691145e-06, + "logits/chosen": 4.246822357177734, + "logits/rejected": 9.969505310058594, + "logps/chosen": -177.4728546142578, + "logps/rejected": -272.5858154296875, + "loss": 0.8379, + "rewards/accuracies": 0.25, + "rewards/chosen": 0.1840386688709259, + "rewards/margins": -0.21330195665359497, + "rewards/rejected": 0.3973406255245209, + "step": 3138 + }, + { + "epoch": 0.48544364971969844, + "grad_norm": 4.850854396820068, + "learning_rate": 4.656604422041471e-06, + "logits/chosen": 12.65326976776123, + "logits/rejected": 6.869013786315918, + "logps/chosen": -353.0364990234375, + "logps/rejected": -236.920166015625, + "loss": 0.5683, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.5645352602005005, + "rewards/margins": 0.3963402807712555, + "rewards/rejected": 0.168194979429245, + "step": 3139 + }, + { + "epoch": 0.4855982988594626, + "grad_norm": 4.66309928894043, + "learning_rate": 4.656318020391798e-06, + "logits/chosen": 9.907949447631836, + "logits/rejected": 7.054296493530273, + "logps/chosen": -235.88304138183594, + "logps/rejected": -217.68917846679688, + "loss": 0.6477, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.35523533821105957, + "rewards/margins": 0.14800453186035156, + "rewards/rejected": 0.20723077654838562, + "step": 3140 + }, + { + "epoch": 0.48575294799922675, + "grad_norm": 5.229705810546875, + "learning_rate": 4.656031618742125e-06, + "logits/chosen": 12.638096809387207, + "logits/rejected": 10.311125755310059, + "logps/chosen": -237.84715270996094, + "logps/rejected": -209.0557098388672, + "loss": 0.749, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.181694895029068, + "rewards/margins": -0.012629128992557526, + "rewards/rejected": 0.1943240463733673, + "step": 3141 + }, + { + "epoch": 0.4859075971389909, + "grad_norm": 11.435564041137695, + "learning_rate": 4.655745217092451e-06, + "logits/chosen": 13.609848022460938, + "logits/rejected": 6.833785533905029, + "logps/chosen": -416.77032470703125, + "logps/rejected": -280.5407409667969, + "loss": 0.7254, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.23872433602809906, + "rewards/margins": 0.01831910014152527, + "rewards/rejected": 0.22040525078773499, + "step": 3142 + }, + { + "epoch": 0.48606224627875505, + "grad_norm": 7.797854423522949, + "learning_rate": 4.655458815442777e-06, + "logits/chosen": 8.242634773254395, + "logits/rejected": 6.678414821624756, + "logps/chosen": -360.640380859375, + "logps/rejected": -309.0814514160156, + "loss": 0.8073, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.1506882756948471, + "rewards/margins": -0.08424608409404755, + "rewards/rejected": -0.06644222140312195, + "step": 3143 + }, + { + "epoch": 0.48621689541851926, + "grad_norm": 6.6474761962890625, + "learning_rate": 4.655172413793104e-06, + "logits/chosen": 13.172595024108887, + "logits/rejected": 12.744207382202148, + "logps/chosen": -293.0497741699219, + "logps/rejected": -338.942626953125, + "loss": 0.6977, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.2808595597743988, + "rewards/margins": 0.04525240510702133, + "rewards/rejected": 0.23560716211795807, + "step": 3144 + }, + { + "epoch": 0.4863715445582834, + "grad_norm": 5.822055816650391, + "learning_rate": 4.65488601214343e-06, + "logits/chosen": 16.076597213745117, + "logits/rejected": 6.6547088623046875, + "logps/chosen": -340.1954040527344, + "logps/rejected": -194.8856201171875, + "loss": 0.5939, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.3909834623336792, + "rewards/margins": 0.38622355461120605, + "rewards/rejected": 0.004759877920150757, + "step": 3145 + }, + { + "epoch": 0.48652619369804756, + "grad_norm": 4.3357648849487305, + "learning_rate": 4.654599610493757e-06, + "logits/chosen": 14.991737365722656, + "logits/rejected": 10.983709335327148, + "logps/chosen": -189.89471435546875, + "logps/rejected": -186.35240173339844, + "loss": 0.5711, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.36932802200317383, + "rewards/margins": 0.2914316952228546, + "rewards/rejected": 0.0778963565826416, + "step": 3146 + }, + { + "epoch": 0.4866808428378117, + "grad_norm": 8.602401733398438, + "learning_rate": 4.654313208844084e-06, + "logits/chosen": 14.080583572387695, + "logits/rejected": 8.30788803100586, + "logps/chosen": -537.536376953125, + "logps/rejected": -455.89764404296875, + "loss": 0.5965, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.7892827987670898, + "rewards/margins": 0.2590695321559906, + "rewards/rejected": 0.5302132964134216, + "step": 3147 + }, + { + "epoch": 0.48683549197757586, + "grad_norm": 7.847988128662109, + "learning_rate": 4.65402680719441e-06, + "logits/chosen": 8.244237899780273, + "logits/rejected": 9.512640953063965, + "logps/chosen": -360.02996826171875, + "logps/rejected": -377.20526123046875, + "loss": 0.7297, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.4286859631538391, + "rewards/margins": 0.07802101969718933, + "rewards/rejected": 0.3506649434566498, + "step": 3148 + }, + { + "epoch": 0.48699014111734, + "grad_norm": 4.725911617279053, + "learning_rate": 4.653740405544736e-06, + "logits/chosen": 13.503066062927246, + "logits/rejected": 9.251468658447266, + "logps/chosen": -334.5111083984375, + "logps/rejected": -200.09103393554688, + "loss": 0.6043, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.4679725170135498, + "rewards/margins": 0.25458309054374695, + "rewards/rejected": 0.21338944137096405, + "step": 3149 + }, + { + "epoch": 0.4871447902571042, + "grad_norm": 6.002705097198486, + "learning_rate": 4.653454003895063e-06, + "logits/chosen": 12.51889419555664, + "logits/rejected": 5.8789801597595215, + "logps/chosen": -398.3709411621094, + "logps/rejected": -269.5645446777344, + "loss": 0.6609, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.28691694140434265, + "rewards/margins": 0.16540218889713287, + "rewards/rejected": 0.1215147003531456, + "step": 3150 + }, + { + "epoch": 0.4872994393968684, + "grad_norm": 7.415189266204834, + "learning_rate": 4.6531676022453894e-06, + "logits/chosen": 10.190999984741211, + "logits/rejected": 7.825841903686523, + "logps/chosen": -262.91522216796875, + "logps/rejected": -288.96209716796875, + "loss": 0.7193, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.19332566857337952, + "rewards/margins": 0.13973182439804077, + "rewards/rejected": 0.05359382927417755, + "step": 3151 + }, + { + "epoch": 0.4874540885366325, + "grad_norm": 5.6488752365112305, + "learning_rate": 4.652881200595716e-06, + "logits/chosen": 11.01989459991455, + "logits/rejected": 8.425956726074219, + "logps/chosen": -260.1990661621094, + "logps/rejected": -230.9673614501953, + "loss": 0.7021, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.09329281747341156, + "rewards/margins": 0.04618797451257706, + "rewards/rejected": 0.04710483178496361, + "step": 3152 + }, + { + "epoch": 0.4876087376763967, + "grad_norm": 6.773861408233643, + "learning_rate": 4.652594798946043e-06, + "logits/chosen": 5.366913318634033, + "logits/rejected": 4.152656555175781, + "logps/chosen": -252.30450439453125, + "logps/rejected": -186.62954711914062, + "loss": 0.6081, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.1350730061531067, + "rewards/margins": 0.20778274536132812, + "rewards/rejected": -0.07270975410938263, + "step": 3153 + }, + { + "epoch": 0.4877633868161608, + "grad_norm": 5.900341033935547, + "learning_rate": 4.6523083972963686e-06, + "logits/chosen": 9.167007446289062, + "logits/rejected": 10.83575439453125, + "logps/chosen": -315.940673828125, + "logps/rejected": -373.966064453125, + "loss": 0.6625, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.5532617568969727, + "rewards/margins": 0.16774307191371918, + "rewards/rejected": 0.38551872968673706, + "step": 3154 + }, + { + "epoch": 0.487918035955925, + "grad_norm": 6.796382904052734, + "learning_rate": 4.652021995646695e-06, + "logits/chosen": 7.657487869262695, + "logits/rejected": 10.32063102722168, + "logps/chosen": -299.8795471191406, + "logps/rejected": -416.1364440917969, + "loss": 0.7348, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.42327243089675903, + "rewards/margins": -0.06519460678100586, + "rewards/rejected": 0.4884670376777649, + "step": 3155 + }, + { + "epoch": 0.48807268509568913, + "grad_norm": 6.500053405761719, + "learning_rate": 4.651735593997022e-06, + "logits/chosen": 9.578104019165039, + "logits/rejected": 7.073984622955322, + "logps/chosen": -276.14117431640625, + "logps/rejected": -262.5292663574219, + "loss": 0.8523, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.36339902877807617, + "rewards/margins": -0.15660923719406128, + "rewards/rejected": 0.5200082659721375, + "step": 3156 + }, + { + "epoch": 0.48822733423545334, + "grad_norm": 7.831830978393555, + "learning_rate": 4.6514491923473485e-06, + "logits/chosen": 3.694526195526123, + "logits/rejected": 11.560588836669922, + "logps/chosen": -200.13336181640625, + "logps/rejected": -293.12005615234375, + "loss": 0.982, + "rewards/accuracies": 0.25, + "rewards/chosen": -0.06465218216180801, + "rewards/margins": -0.4228639006614685, + "rewards/rejected": 0.3582116961479187, + "step": 3157 + }, + { + "epoch": 0.4883819833752175, + "grad_norm": 7.800491809844971, + "learning_rate": 4.651162790697675e-06, + "logits/chosen": 6.872100830078125, + "logits/rejected": 5.318763732910156, + "logps/chosen": -365.7567443847656, + "logps/rejected": -265.0841979980469, + "loss": 0.4845, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.5878764390945435, + "rewards/margins": 0.6039544939994812, + "rewards/rejected": -0.01607809215784073, + "step": 3158 + }, + { + "epoch": 0.48853663251498164, + "grad_norm": 4.411405086517334, + "learning_rate": 4.650876389048001e-06, + "logits/chosen": 10.610103607177734, + "logits/rejected": 8.967996597290039, + "logps/chosen": -219.71969604492188, + "logps/rejected": -195.4951171875, + "loss": 0.6501, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.2500418722629547, + "rewards/margins": 0.16353864967823029, + "rewards/rejected": 0.08650322258472443, + "step": 3159 + }, + { + "epoch": 0.4886912816547458, + "grad_norm": 6.110515117645264, + "learning_rate": 4.650589987398328e-06, + "logits/chosen": 7.2000579833984375, + "logits/rejected": 12.480628967285156, + "logps/chosen": -256.54443359375, + "logps/rejected": -297.2021179199219, + "loss": 0.7546, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.25119417905807495, + "rewards/margins": 0.04521968960762024, + "rewards/rejected": 0.2059745192527771, + "step": 3160 + }, + { + "epoch": 0.48884593079450994, + "grad_norm": 5.841609001159668, + "learning_rate": 4.650303585748654e-06, + "logits/chosen": 7.3679518699646, + "logits/rejected": 9.284711837768555, + "logps/chosen": -259.32470703125, + "logps/rejected": -282.07989501953125, + "loss": 0.658, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.13073723018169403, + "rewards/margins": 0.11859878152608871, + "rewards/rejected": 0.01213844120502472, + "step": 3161 + }, + { + "epoch": 0.4890005799342741, + "grad_norm": 7.9683427810668945, + "learning_rate": 4.650017184098981e-06, + "logits/chosen": 9.648505210876465, + "logits/rejected": 8.252156257629395, + "logps/chosen": -234.68856811523438, + "logps/rejected": -224.43719482421875, + "loss": 0.8463, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.12476016581058502, + "rewards/margins": -0.0468250997364521, + "rewards/rejected": -0.07793506234884262, + "step": 3162 + }, + { + "epoch": 0.4891552290740383, + "grad_norm": 5.005395889282227, + "learning_rate": 4.649730782449307e-06, + "logits/chosen": 4.40004825592041, + "logits/rejected": 2.7033019065856934, + "logps/chosen": -219.5928955078125, + "logps/rejected": -152.5099639892578, + "loss": 0.6907, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.24326755106449127, + "rewards/margins": 0.040911633521318436, + "rewards/rejected": 0.20235590636730194, + "step": 3163 + }, + { + "epoch": 0.48930987821380245, + "grad_norm": 7.610771179199219, + "learning_rate": 4.649444380799633e-06, + "logits/chosen": 1.3592698574066162, + "logits/rejected": 4.451530456542969, + "logps/chosen": -313.43927001953125, + "logps/rejected": -281.161376953125, + "loss": 0.8932, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.06496445834636688, + "rewards/margins": -0.21391120553016663, + "rewards/rejected": 0.2788756191730499, + "step": 3164 + }, + { + "epoch": 0.4894645273535666, + "grad_norm": 10.818938255310059, + "learning_rate": 4.64915797914996e-06, + "logits/chosen": 11.451922416687012, + "logits/rejected": 11.464862823486328, + "logps/chosen": -435.517822265625, + "logps/rejected": -528.1263427734375, + "loss": 0.7433, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.6088907718658447, + "rewards/margins": 0.010804370045661926, + "rewards/rejected": 0.598086416721344, + "step": 3165 + }, + { + "epoch": 0.48961917649333075, + "grad_norm": 6.297966480255127, + "learning_rate": 4.648871577500287e-06, + "logits/chosen": 10.342071533203125, + "logits/rejected": 10.34482192993164, + "logps/chosen": -300.5644836425781, + "logps/rejected": -302.13092041015625, + "loss": 0.7791, + "rewards/accuracies": 0.25, + "rewards/chosen": 0.011102981865406036, + "rewards/margins": -0.022024869918823242, + "rewards/rejected": 0.033127881586551666, + "step": 3166 + }, + { + "epoch": 0.4897738256330949, + "grad_norm": 5.925624370574951, + "learning_rate": 4.648585175850613e-06, + "logits/chosen": 8.92153549194336, + "logits/rejected": 6.804349899291992, + "logps/chosen": -229.2291259765625, + "logps/rejected": -202.8915252685547, + "loss": 0.6693, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.051148418337106705, + "rewards/margins": 0.12040044367313385, + "rewards/rejected": -0.06925200670957565, + "step": 3167 + }, + { + "epoch": 0.48992847477285906, + "grad_norm": 5.773336410522461, + "learning_rate": 4.64829877420094e-06, + "logits/chosen": 6.840940475463867, + "logits/rejected": 4.913928985595703, + "logps/chosen": -275.9788818359375, + "logps/rejected": -224.2779541015625, + "loss": 0.6525, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.4275149405002594, + "rewards/margins": 0.0990627259016037, + "rewards/rejected": 0.3284522294998169, + "step": 3168 + }, + { + "epoch": 0.4900831239126232, + "grad_norm": 5.766694068908691, + "learning_rate": 4.648012372551266e-06, + "logits/chosen": 12.817164421081543, + "logits/rejected": 6.663919925689697, + "logps/chosen": -316.7397155761719, + "logps/rejected": -290.2966003417969, + "loss": 0.54, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.5047233700752258, + "rewards/margins": 0.44056177139282227, + "rewards/rejected": 0.06416158378124237, + "step": 3169 + }, + { + "epoch": 0.4902377730523874, + "grad_norm": 4.884330749511719, + "learning_rate": 4.647725970901592e-06, + "logits/chosen": 11.26691722869873, + "logits/rejected": 10.823739051818848, + "logps/chosen": -155.59881591796875, + "logps/rejected": -202.57444763183594, + "loss": 0.7954, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.28951627016067505, + "rewards/margins": -0.1497904360294342, + "rewards/rejected": -0.13972583413124084, + "step": 3170 + }, + { + "epoch": 0.49039242219215157, + "grad_norm": 8.208195686340332, + "learning_rate": 4.647439569251919e-06, + "logits/chosen": 9.708375930786133, + "logits/rejected": 5.051123142242432, + "logps/chosen": -358.4781799316406, + "logps/rejected": -281.9560546875, + "loss": 0.7367, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.5789129137992859, + "rewards/margins": -0.017590772360563278, + "rewards/rejected": 0.596503734588623, + "step": 3171 + }, + { + "epoch": 0.4905470713319157, + "grad_norm": 6.089240550994873, + "learning_rate": 4.647153167602246e-06, + "logits/chosen": 17.972990036010742, + "logits/rejected": 13.444500923156738, + "logps/chosen": -266.7465515136719, + "logps/rejected": -316.9468994140625, + "loss": 0.7546, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.42336201667785645, + "rewards/margins": -0.08778373897075653, + "rewards/rejected": 0.5111457705497742, + "step": 3172 + }, + { + "epoch": 0.49070172047167987, + "grad_norm": 3.864051580429077, + "learning_rate": 4.646866765952572e-06, + "logits/chosen": 12.79916763305664, + "logits/rejected": 10.045812606811523, + "logps/chosen": -215.0010986328125, + "logps/rejected": -207.05117797851562, + "loss": 0.5086, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.4178192913532257, + "rewards/margins": 0.527061939239502, + "rewards/rejected": -0.10924268513917923, + "step": 3173 + }, + { + "epoch": 0.490856369611444, + "grad_norm": 6.854030132293701, + "learning_rate": 4.646580364302899e-06, + "logits/chosen": 11.282418251037598, + "logits/rejected": 9.218419075012207, + "logps/chosen": -289.53265380859375, + "logps/rejected": -269.23040771484375, + "loss": 0.7765, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.37288475036621094, + "rewards/margins": 0.04157883673906326, + "rewards/rejected": 0.3313058912754059, + "step": 3174 + }, + { + "epoch": 0.4910110187512082, + "grad_norm": 4.960069179534912, + "learning_rate": 4.646293962653226e-06, + "logits/chosen": 7.7244462966918945, + "logits/rejected": 5.769742965698242, + "logps/chosen": -307.2171630859375, + "logps/rejected": -230.10629272460938, + "loss": 0.5998, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.029868803918361664, + "rewards/margins": 0.2932475507259369, + "rewards/rejected": -0.263378769159317, + "step": 3175 + }, + { + "epoch": 0.4911656678909724, + "grad_norm": 6.106284141540527, + "learning_rate": 4.6460075610035515e-06, + "logits/chosen": 12.172809600830078, + "logits/rejected": 9.250649452209473, + "logps/chosen": -306.5214538574219, + "logps/rejected": -255.6414337158203, + "loss": 0.5988, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.6228266954421997, + "rewards/margins": 0.2738649547100067, + "rewards/rejected": 0.34896180033683777, + "step": 3176 + }, + { + "epoch": 0.49132031703073653, + "grad_norm": 4.351642608642578, + "learning_rate": 4.645721159353878e-06, + "logits/chosen": 8.836203575134277, + "logits/rejected": 6.279273986816406, + "logps/chosen": -252.66725158691406, + "logps/rejected": -173.04774475097656, + "loss": 0.6449, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.22835348546504974, + "rewards/margins": 0.219268798828125, + "rewards/rejected": 0.009084686636924744, + "step": 3177 + }, + { + "epoch": 0.4914749661705007, + "grad_norm": 4.168735027313232, + "learning_rate": 4.645434757704205e-06, + "logits/chosen": 8.805900573730469, + "logits/rejected": 3.201615571975708, + "logps/chosen": -207.68418884277344, + "logps/rejected": -149.4184112548828, + "loss": 0.7084, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.14393866062164307, + "rewards/margins": 0.10638261586427689, + "rewards/rejected": 0.037556007504463196, + "step": 3178 + }, + { + "epoch": 0.49162961531026483, + "grad_norm": 5.432239055633545, + "learning_rate": 4.6451483560545314e-06, + "logits/chosen": 5.614684104919434, + "logits/rejected": 8.299880027770996, + "logps/chosen": -233.6148681640625, + "logps/rejected": -350.77996826171875, + "loss": 0.6205, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.2374475598335266, + "rewards/margins": 0.3464166224002838, + "rewards/rejected": -0.1089690625667572, + "step": 3179 + }, + { + "epoch": 0.491784264450029, + "grad_norm": 5.511128902435303, + "learning_rate": 4.644861954404858e-06, + "logits/chosen": 8.252463340759277, + "logits/rejected": 8.249320030212402, + "logps/chosen": -331.7830505371094, + "logps/rejected": -377.23779296875, + "loss": 0.6224, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.2984260320663452, + "rewards/margins": 0.250837117433548, + "rewards/rejected": 0.04758892208337784, + "step": 3180 + }, + { + "epoch": 0.49193891358979314, + "grad_norm": 5.1030659675598145, + "learning_rate": 4.644575552755185e-06, + "logits/chosen": 5.189993858337402, + "logits/rejected": -2.7467448711395264, + "logps/chosen": -299.9344177246094, + "logps/rejected": -180.72970581054688, + "loss": 0.5793, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.38742536306381226, + "rewards/margins": 0.26406800746917725, + "rewards/rejected": 0.123357392847538, + "step": 3181 + }, + { + "epoch": 0.49209356272955734, + "grad_norm": 4.83907413482666, + "learning_rate": 4.6442891511055105e-06, + "logits/chosen": 11.629040718078613, + "logits/rejected": 6.019209384918213, + "logps/chosen": -326.4150695800781, + "logps/rejected": -218.77005004882812, + "loss": 0.4972, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.4648973047733307, + "rewards/margins": 0.5192514657974243, + "rewards/rejected": -0.05435418337583542, + "step": 3182 + }, + { + "epoch": 0.4922482118693215, + "grad_norm": 4.269180774688721, + "learning_rate": 4.644002749455837e-06, + "logits/chosen": 6.746547222137451, + "logits/rejected": 2.949510335922241, + "logps/chosen": -256.04364013671875, + "logps/rejected": -221.83444213867188, + "loss": 0.5363, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.25615254044532776, + "rewards/margins": 0.3992369472980499, + "rewards/rejected": -0.14308443665504456, + "step": 3183 + }, + { + "epoch": 0.49240286100908565, + "grad_norm": 2.823241949081421, + "learning_rate": 4.643716347806164e-06, + "logits/chosen": 9.455652236938477, + "logits/rejected": 4.490926265716553, + "logps/chosen": -201.31967163085938, + "logps/rejected": -177.3224639892578, + "loss": 0.4472, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.23773860931396484, + "rewards/margins": 0.7670780420303345, + "rewards/rejected": -0.5293394327163696, + "step": 3184 + }, + { + "epoch": 0.4925575101488498, + "grad_norm": 4.3402276039123535, + "learning_rate": 4.6434299461564905e-06, + "logits/chosen": 11.96626091003418, + "logits/rejected": 1.8950477838516235, + "logps/chosen": -319.28594970703125, + "logps/rejected": -180.89744567871094, + "loss": 0.4621, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.2507093846797943, + "rewards/margins": 0.6179460287094116, + "rewards/rejected": -0.3672366142272949, + "step": 3185 + }, + { + "epoch": 0.49271215928861395, + "grad_norm": 5.199821472167969, + "learning_rate": 4.643143544506817e-06, + "logits/chosen": 11.641837120056152, + "logits/rejected": 8.420984268188477, + "logps/chosen": -279.00006103515625, + "logps/rejected": -258.0652160644531, + "loss": 0.5754, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.4567892551422119, + "rewards/margins": 0.274496853351593, + "rewards/rejected": 0.1822923719882965, + "step": 3186 + }, + { + "epoch": 0.4928668084283781, + "grad_norm": 3.9842569828033447, + "learning_rate": 4.642857142857144e-06, + "logits/chosen": 10.269495964050293, + "logits/rejected": 1.7518670558929443, + "logps/chosen": -236.9564208984375, + "logps/rejected": -165.79795837402344, + "loss": 0.475, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.30555787682533264, + "rewards/margins": 0.6471152305603027, + "rewards/rejected": -0.3415573537349701, + "step": 3187 + }, + { + "epoch": 0.49302145756814225, + "grad_norm": 6.792036056518555, + "learning_rate": 4.64257074120747e-06, + "logits/chosen": 13.500274658203125, + "logits/rejected": 9.345292091369629, + "logps/chosen": -404.97894287109375, + "logps/rejected": -392.1279602050781, + "loss": 0.621, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.6801363229751587, + "rewards/margins": 0.20369091629981995, + "rewards/rejected": 0.47644540667533875, + "step": 3188 + }, + { + "epoch": 0.49317610670790646, + "grad_norm": 5.124135971069336, + "learning_rate": 4.642284339557796e-06, + "logits/chosen": 4.360634803771973, + "logits/rejected": 4.045827865600586, + "logps/chosen": -233.89944458007812, + "logps/rejected": -305.7484130859375, + "loss": 0.536, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.38344401121139526, + "rewards/margins": 0.4704377353191376, + "rewards/rejected": -0.08699370920658112, + "step": 3189 + }, + { + "epoch": 0.4933307558476706, + "grad_norm": 7.059565544128418, + "learning_rate": 4.641997937908123e-06, + "logits/chosen": 11.31144905090332, + "logits/rejected": 9.760826110839844, + "logps/chosen": -311.3580322265625, + "logps/rejected": -326.622314453125, + "loss": 0.9138, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.26804038882255554, + "rewards/margins": -0.30372685194015503, + "rewards/rejected": 0.5717672109603882, + "step": 3190 + }, + { + "epoch": 0.49348540498743476, + "grad_norm": 5.41403865814209, + "learning_rate": 4.6417115362584495e-06, + "logits/chosen": 9.400973320007324, + "logits/rejected": 7.57589054107666, + "logps/chosen": -223.21302795410156, + "logps/rejected": -247.32305908203125, + "loss": 0.622, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.2883512079715729, + "rewards/margins": 0.27353349328041077, + "rewards/rejected": 0.014817751944065094, + "step": 3191 + }, + { + "epoch": 0.4936400541271989, + "grad_norm": 7.883642196655273, + "learning_rate": 4.641425134608775e-06, + "logits/chosen": 6.131500720977783, + "logits/rejected": 10.128080368041992, + "logps/chosen": -331.9299621582031, + "logps/rejected": -339.8541259765625, + "loss": 0.8455, + "rewards/accuracies": 0.25, + "rewards/chosen": 0.32346320152282715, + "rewards/margins": -0.18751011788845062, + "rewards/rejected": 0.510973334312439, + "step": 3192 + }, + { + "epoch": 0.49379470326696306, + "grad_norm": 4.2896623611450195, + "learning_rate": 4.641138732959102e-06, + "logits/chosen": 13.842658042907715, + "logits/rejected": 10.419378280639648, + "logps/chosen": -323.53662109375, + "logps/rejected": -273.93701171875, + "loss": 0.4599, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.4829898774623871, + "rewards/margins": 0.6405640244483948, + "rewards/rejected": -0.15757417678833008, + "step": 3193 + }, + { + "epoch": 0.4939493524067272, + "grad_norm": 4.135184288024902, + "learning_rate": 4.640852331309429e-06, + "logits/chosen": 12.738903999328613, + "logits/rejected": 4.654675483703613, + "logps/chosen": -346.5086364746094, + "logps/rejected": -260.3005065917969, + "loss": 0.4967, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.36599692702293396, + "rewards/margins": 0.7874415516853333, + "rewards/rejected": -0.4214445948600769, + "step": 3194 + }, + { + "epoch": 0.4941040015464914, + "grad_norm": 5.662519454956055, + "learning_rate": 4.640565929659755e-06, + "logits/chosen": 9.974206924438477, + "logits/rejected": 9.11992359161377, + "logps/chosen": -386.9579162597656, + "logps/rejected": -398.414794921875, + "loss": 0.5141, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.3070283532142639, + "rewards/margins": 0.45626509189605713, + "rewards/rejected": -0.14923667907714844, + "step": 3195 + }, + { + "epoch": 0.4942586506862556, + "grad_norm": 6.255503177642822, + "learning_rate": 4.640279528010082e-06, + "logits/chosen": 10.668098449707031, + "logits/rejected": 7.344058036804199, + "logps/chosen": -454.4828186035156, + "logps/rejected": -437.55487060546875, + "loss": 0.5768, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.4442010819911957, + "rewards/margins": 0.41534993052482605, + "rewards/rejected": 0.02885115146636963, + "step": 3196 + }, + { + "epoch": 0.4944132998260197, + "grad_norm": 5.766711235046387, + "learning_rate": 4.639993126360408e-06, + "logits/chosen": 11.857966423034668, + "logits/rejected": 6.200558185577393, + "logps/chosen": -309.28753662109375, + "logps/rejected": -250.5145721435547, + "loss": 0.6096, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.05699712410569191, + "rewards/margins": 0.23993702232837677, + "rewards/rejected": -0.2969341576099396, + "step": 3197 + }, + { + "epoch": 0.4945679489657839, + "grad_norm": 6.10903787612915, + "learning_rate": 4.639706724710734e-06, + "logits/chosen": 4.986599922180176, + "logits/rejected": 6.530762195587158, + "logps/chosen": -194.1409149169922, + "logps/rejected": -208.06642150878906, + "loss": 0.7198, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.34600967168807983, + "rewards/margins": -0.02386576682329178, + "rewards/rejected": 0.369875431060791, + "step": 3198 + }, + { + "epoch": 0.49472259810554803, + "grad_norm": 4.649064540863037, + "learning_rate": 4.639420323061061e-06, + "logits/chosen": 6.362103462219238, + "logits/rejected": 8.3464937210083, + "logps/chosen": -400.4107666015625, + "logps/rejected": -313.1363525390625, + "loss": 0.5027, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.5108106136322021, + "rewards/margins": 0.5735217332839966, + "rewards/rejected": -0.0627111941576004, + "step": 3199 + }, + { + "epoch": 0.4948772472453122, + "grad_norm": 4.862316608428955, + "learning_rate": 4.639133921411388e-06, + "logits/chosen": 9.388262748718262, + "logits/rejected": 6.110173225402832, + "logps/chosen": -237.64242553710938, + "logps/rejected": -212.5164794921875, + "loss": 0.589, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.1735171377658844, + "rewards/margins": 0.26268139481544495, + "rewards/rejected": -0.08916424959897995, + "step": 3200 + }, + { + "epoch": 0.49503189638507633, + "grad_norm": 5.882922649383545, + "learning_rate": 4.638847519761714e-06, + "logits/chosen": 9.987051010131836, + "logits/rejected": 5.234467506408691, + "logps/chosen": -330.4905090332031, + "logps/rejected": -279.2564697265625, + "loss": 0.7626, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.011462017893791199, + "rewards/margins": -0.028851233422756195, + "rewards/rejected": 0.040313251316547394, + "step": 3201 + }, + { + "epoch": 0.49518654552484054, + "grad_norm": 6.285687446594238, + "learning_rate": 4.63856111811204e-06, + "logits/chosen": 10.027826309204102, + "logits/rejected": 8.446650505065918, + "logps/chosen": -290.5234680175781, + "logps/rejected": -292.0268249511719, + "loss": 0.7152, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.0013534873723983765, + "rewards/margins": 0.042198844254016876, + "rewards/rejected": -0.043552324175834656, + "step": 3202 + }, + { + "epoch": 0.4953411946646047, + "grad_norm": 6.7544989585876465, + "learning_rate": 4.638274716462367e-06, + "logits/chosen": 12.055245399475098, + "logits/rejected": 8.946240425109863, + "logps/chosen": -200.11856079101562, + "logps/rejected": -188.32176208496094, + "loss": 0.8951, + "rewards/accuracies": 0.25, + "rewards/chosen": -0.2900071144104004, + "rewards/margins": -0.3132253885269165, + "rewards/rejected": 0.023218244314193726, + "step": 3203 + }, + { + "epoch": 0.49549584380436884, + "grad_norm": 4.476693630218506, + "learning_rate": 4.6379883148126935e-06, + "logits/chosen": 12.43474006652832, + "logits/rejected": 6.799140930175781, + "logps/chosen": -425.25323486328125, + "logps/rejected": -358.7868347167969, + "loss": 0.5003, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.46089911460876465, + "rewards/margins": 0.7050890922546387, + "rewards/rejected": -0.24418994784355164, + "step": 3204 + }, + { + "epoch": 0.495650492944133, + "grad_norm": 4.653582572937012, + "learning_rate": 4.63770191316302e-06, + "logits/chosen": 16.08995246887207, + "logits/rejected": 12.875402450561523, + "logps/chosen": -155.30169677734375, + "logps/rejected": -124.24064636230469, + "loss": 0.6236, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.17245537042617798, + "rewards/margins": 0.19474969804286957, + "rewards/rejected": -0.02229432761669159, + "step": 3205 + }, + { + "epoch": 0.49580514208389714, + "grad_norm": 5.727625370025635, + "learning_rate": 4.637415511513347e-06, + "logits/chosen": 16.610809326171875, + "logits/rejected": 13.747440338134766, + "logps/chosen": -362.3914794921875, + "logps/rejected": -314.66455078125, + "loss": 0.7018, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.21794280409812927, + "rewards/margins": 0.04338674619793892, + "rewards/rejected": 0.17455606162548065, + "step": 3206 + }, + { + "epoch": 0.4959597912236613, + "grad_norm": 5.008937358856201, + "learning_rate": 4.637129109863673e-06, + "logits/chosen": 9.213848114013672, + "logits/rejected": 12.177340507507324, + "logps/chosen": -185.40524291992188, + "logps/rejected": -249.67605590820312, + "loss": 0.6861, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.2635473310947418, + "rewards/margins": 0.030886556953191757, + "rewards/rejected": 0.23266077041625977, + "step": 3207 + }, + { + "epoch": 0.4961144403634255, + "grad_norm": 9.41383171081543, + "learning_rate": 4.636842708214e-06, + "logits/chosen": 8.661371231079102, + "logits/rejected": 10.524496078491211, + "logps/chosen": -266.9195251464844, + "logps/rejected": -356.602783203125, + "loss": 0.7901, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.0417538583278656, + "rewards/margins": 0.020147234201431274, + "rewards/rejected": 0.021606631577014923, + "step": 3208 + }, + { + "epoch": 0.49626908950318965, + "grad_norm": 8.27237606048584, + "learning_rate": 4.636556306564326e-06, + "logits/chosen": 5.4649457931518555, + "logits/rejected": 9.136170387268066, + "logps/chosen": -283.4508056640625, + "logps/rejected": -325.275390625, + "loss": 0.8573, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.07023191452026367, + "rewards/margins": -0.14170588552951813, + "rewards/rejected": 0.07147398591041565, + "step": 3209 + }, + { + "epoch": 0.4964237386429538, + "grad_norm": 5.10293436050415, + "learning_rate": 4.6362699049146525e-06, + "logits/chosen": 9.117206573486328, + "logits/rejected": 10.583005905151367, + "logps/chosen": -253.10757446289062, + "logps/rejected": -258.6315612792969, + "loss": 0.7278, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.12601019442081451, + "rewards/margins": -0.016238290816545486, + "rewards/rejected": 0.1422484815120697, + "step": 3210 + }, + { + "epoch": 0.49657838778271796, + "grad_norm": 4.5167012214660645, + "learning_rate": 4.635983503264979e-06, + "logits/chosen": 11.27578353881836, + "logits/rejected": 1.932388186454773, + "logps/chosen": -255.1419219970703, + "logps/rejected": -204.13180541992188, + "loss": 0.6155, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.3284369111061096, + "rewards/margins": 0.3237423300743103, + "rewards/rejected": 0.0046945735812187195, + "step": 3211 + }, + { + "epoch": 0.4967330369224821, + "grad_norm": 4.347433090209961, + "learning_rate": 4.635697101615306e-06, + "logits/chosen": 7.651025772094727, + "logits/rejected": 1.1188621520996094, + "logps/chosen": -285.5934143066406, + "logps/rejected": -179.17422485351562, + "loss": 0.5516, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.1909923404455185, + "rewards/margins": 0.3861883878707886, + "rewards/rejected": -0.19519607722759247, + "step": 3212 + }, + { + "epoch": 0.49688768606224626, + "grad_norm": 7.997645378112793, + "learning_rate": 4.6354106999656325e-06, + "logits/chosen": 8.994619369506836, + "logits/rejected": 7.7450337409973145, + "logps/chosen": -333.566650390625, + "logps/rejected": -257.5031433105469, + "loss": 0.7936, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.25449323654174805, + "rewards/margins": -0.037055641412734985, + "rewards/rejected": 0.29154887795448303, + "step": 3213 + }, + { + "epoch": 0.49704233520201047, + "grad_norm": 7.42900276184082, + "learning_rate": 4.635124298315959e-06, + "logits/chosen": 11.444816589355469, + "logits/rejected": 12.119190216064453, + "logps/chosen": -322.518798828125, + "logps/rejected": -293.8309631347656, + "loss": 0.7328, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.04818095266819, + "rewards/margins": 0.07652051746845245, + "rewards/rejected": -0.028339581564068794, + "step": 3214 + }, + { + "epoch": 0.4971969843417746, + "grad_norm": 8.868440628051758, + "learning_rate": 4.634837896666285e-06, + "logits/chosen": 9.840538024902344, + "logits/rejected": 2.1293115615844727, + "logps/chosen": -237.59378051757812, + "logps/rejected": -132.53939819335938, + "loss": 0.6684, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.053783148527145386, + "rewards/margins": 0.21836845576763153, + "rewards/rejected": -0.16458530724048615, + "step": 3215 + }, + { + "epoch": 0.49735163348153877, + "grad_norm": 5.591974258422852, + "learning_rate": 4.6345514950166116e-06, + "logits/chosen": 11.873985290527344, + "logits/rejected": 8.65584945678711, + "logps/chosen": -371.9913330078125, + "logps/rejected": -289.956787109375, + "loss": 0.5866, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.1580563485622406, + "rewards/margins": 0.36342161893844604, + "rewards/rejected": -0.5214779376983643, + "step": 3216 + }, + { + "epoch": 0.4975062826213029, + "grad_norm": 6.619940280914307, + "learning_rate": 4.634265093366938e-06, + "logits/chosen": 4.399501800537109, + "logits/rejected": 3.6414170265197754, + "logps/chosen": -326.87677001953125, + "logps/rejected": -306.3594665527344, + "loss": 0.6275, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.1975744366645813, + "rewards/margins": 0.16901811957359314, + "rewards/rejected": 0.028556296601891518, + "step": 3217 + }, + { + "epoch": 0.49766093176106707, + "grad_norm": 7.987555503845215, + "learning_rate": 4.633978691717265e-06, + "logits/chosen": 9.403387069702148, + "logits/rejected": 4.45906925201416, + "logps/chosen": -250.8699188232422, + "logps/rejected": -169.99916076660156, + "loss": 0.6851, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.15247373282909393, + "rewards/margins": 0.11631850153207779, + "rewards/rejected": 0.03615522384643555, + "step": 3218 + }, + { + "epoch": 0.4978155809008312, + "grad_norm": 5.61239767074585, + "learning_rate": 4.6336922900675915e-06, + "logits/chosen": 11.234419822692871, + "logits/rejected": 3.178260564804077, + "logps/chosen": -260.33056640625, + "logps/rejected": -203.0194854736328, + "loss": 0.7129, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.218332439661026, + "rewards/margins": 0.044287826865911484, + "rewards/rejected": 0.17404459416866302, + "step": 3219 + }, + { + "epoch": 0.4979702300405954, + "grad_norm": 5.32751989364624, + "learning_rate": 4.633405888417918e-06, + "logits/chosen": 15.610055923461914, + "logits/rejected": 6.273090362548828, + "logps/chosen": -328.39111328125, + "logps/rejected": -183.8865966796875, + "loss": 0.6169, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.35059186816215515, + "rewards/margins": 0.274660587310791, + "rewards/rejected": 0.07593126595020294, + "step": 3220 + }, + { + "epoch": 0.4981248791803596, + "grad_norm": 5.928618431091309, + "learning_rate": 4.633119486768245e-06, + "logits/chosen": 8.828058242797852, + "logits/rejected": 8.824657440185547, + "logps/chosen": -220.4580078125, + "logps/rejected": -187.80079650878906, + "loss": 0.8025, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.0530797615647316, + "rewards/margins": -0.08219853788614273, + "rewards/rejected": 0.029118768870830536, + "step": 3221 + }, + { + "epoch": 0.49827952832012373, + "grad_norm": 7.4839768409729, + "learning_rate": 4.632833085118571e-06, + "logits/chosen": 7.095251560211182, + "logits/rejected": 10.842124938964844, + "logps/chosen": -244.18775939941406, + "logps/rejected": -382.8251647949219, + "loss": 0.9543, + "rewards/accuracies": 0.25, + "rewards/chosen": 0.0409204363822937, + "rewards/margins": -0.2923834025859833, + "rewards/rejected": 0.3333038091659546, + "step": 3222 + }, + { + "epoch": 0.4984341774598879, + "grad_norm": 6.719126224517822, + "learning_rate": 4.632546683468897e-06, + "logits/chosen": 10.39794921875, + "logits/rejected": 6.886472225189209, + "logps/chosen": -375.6524353027344, + "logps/rejected": -253.48709106445312, + "loss": 0.6206, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.22531074285507202, + "rewards/margins": 0.44207724928855896, + "rewards/rejected": -0.21676646173000336, + "step": 3223 + }, + { + "epoch": 0.49858882659965204, + "grad_norm": 3.482813835144043, + "learning_rate": 4.632260281819224e-06, + "logits/chosen": 12.072370529174805, + "logits/rejected": 9.614316940307617, + "logps/chosen": -145.61524963378906, + "logps/rejected": -108.59986877441406, + "loss": 0.6405, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.2467600405216217, + "rewards/margins": 0.18540742993354797, + "rewards/rejected": 0.06135261058807373, + "step": 3224 + }, + { + "epoch": 0.4987434757394162, + "grad_norm": 5.7496185302734375, + "learning_rate": 4.6319738801695506e-06, + "logits/chosen": 11.51933765411377, + "logits/rejected": 10.159708976745605, + "logps/chosen": -283.84857177734375, + "logps/rejected": -276.3330993652344, + "loss": 0.7419, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.2894914746284485, + "rewards/margins": -0.02282160520553589, + "rewards/rejected": 0.3123130798339844, + "step": 3225 + }, + { + "epoch": 0.49889812487918034, + "grad_norm": 6.08125638961792, + "learning_rate": 4.631687478519876e-06, + "logits/chosen": 13.310603141784668, + "logits/rejected": 5.548508167266846, + "logps/chosen": -322.26513671875, + "logps/rejected": -239.59014892578125, + "loss": 0.6613, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.2916352450847626, + "rewards/margins": 0.08447933197021484, + "rewards/rejected": 0.20715588331222534, + "step": 3226 + }, + { + "epoch": 0.49905277401894454, + "grad_norm": 7.443875789642334, + "learning_rate": 4.631401076870203e-06, + "logits/chosen": 12.941253662109375, + "logits/rejected": 2.983773708343506, + "logps/chosen": -415.9276428222656, + "logps/rejected": -337.6793518066406, + "loss": 0.6947, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.5055520534515381, + "rewards/margins": 0.03528820723295212, + "rewards/rejected": 0.47026389837265015, + "step": 3227 + }, + { + "epoch": 0.4992074231587087, + "grad_norm": 5.3274617195129395, + "learning_rate": 4.63111467522053e-06, + "logits/chosen": 12.25723648071289, + "logits/rejected": 8.610937118530273, + "logps/chosen": -294.1002502441406, + "logps/rejected": -237.87545776367188, + "loss": 0.5788, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.44150614738464355, + "rewards/margins": 0.3152434825897217, + "rewards/rejected": 0.12626267969608307, + "step": 3228 + }, + { + "epoch": 0.49936207229847285, + "grad_norm": 6.431368350982666, + "learning_rate": 4.630828273570856e-06, + "logits/chosen": -0.9038457870483398, + "logits/rejected": 7.840060234069824, + "logps/chosen": -206.0345458984375, + "logps/rejected": -259.9377136230469, + "loss": 0.8315, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.2623577117919922, + "rewards/margins": -0.23195143043994904, + "rewards/rejected": -0.030406277626752853, + "step": 3229 + }, + { + "epoch": 0.499516721438237, + "grad_norm": 5.165670871734619, + "learning_rate": 4.630541871921182e-06, + "logits/chosen": 12.120144844055176, + "logits/rejected": 6.662358283996582, + "logps/chosen": -279.2125244140625, + "logps/rejected": -231.15696716308594, + "loss": 0.6535, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.16808763146400452, + "rewards/margins": 0.2122688889503479, + "rewards/rejected": -0.044181257486343384, + "step": 3230 + }, + { + "epoch": 0.49967137057800115, + "grad_norm": 6.1574625968933105, + "learning_rate": 4.630255470271509e-06, + "logits/chosen": 6.7483391761779785, + "logits/rejected": 8.390557289123535, + "logps/chosen": -319.6745910644531, + "logps/rejected": -254.29249572753906, + "loss": 0.6291, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.4285506308078766, + "rewards/margins": 0.1769622564315796, + "rewards/rejected": 0.2515884041786194, + "step": 3231 + }, + { + "epoch": 0.4998260197177653, + "grad_norm": 5.711991310119629, + "learning_rate": 4.6299690686218354e-06, + "logits/chosen": 6.523872375488281, + "logits/rejected": 4.62911319732666, + "logps/chosen": -248.3011474609375, + "logps/rejected": -189.9203338623047, + "loss": 0.6659, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.023024767637252808, + "rewards/margins": 0.10130847990512848, + "rewards/rejected": -0.07828371226787567, + "step": 3232 + }, + { + "epoch": 0.49998066885752945, + "grad_norm": 3.901844024658203, + "learning_rate": 4.629682666972162e-06, + "logits/chosen": 11.681109428405762, + "logits/rejected": 6.818157196044922, + "logps/chosen": -258.7403564453125, + "logps/rejected": -197.45782470703125, + "loss": 0.5584, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.4649995267391205, + "rewards/margins": 0.3458278775215149, + "rewards/rejected": 0.1191716194152832, + "step": 3233 + }, + { + "epoch": 0.5001353179972936, + "grad_norm": 4.7333173751831055, + "learning_rate": 4.629396265322489e-06, + "logits/chosen": 10.792336463928223, + "logits/rejected": 7.659724712371826, + "logps/chosen": -287.09613037109375, + "logps/rejected": -233.54824829101562, + "loss": 0.6903, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.009859800338745117, + "rewards/margins": 0.02727828174829483, + "rewards/rejected": -0.017418459057807922, + "step": 3234 + }, + { + "epoch": 0.5002899671370578, + "grad_norm": 5.64075231552124, + "learning_rate": 4.6291098636728145e-06, + "logits/chosen": 10.718565940856934, + "logits/rejected": 8.840188980102539, + "logps/chosen": -301.17474365234375, + "logps/rejected": -238.9667510986328, + "loss": 0.7236, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.09668870270252228, + "rewards/margins": -0.04182238504290581, + "rewards/rejected": 0.13851109147071838, + "step": 3235 + }, + { + "epoch": 0.5004446162768219, + "grad_norm": 4.832586765289307, + "learning_rate": 4.628823462023141e-06, + "logits/chosen": 7.720620632171631, + "logits/rejected": 6.52126407623291, + "logps/chosen": -214.824462890625, + "logps/rejected": -195.15853881835938, + "loss": 0.6475, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.30634641647338867, + "rewards/margins": 0.14172527194023132, + "rewards/rejected": 0.16462109982967377, + "step": 3236 + }, + { + "epoch": 0.5005992654165862, + "grad_norm": 5.773725509643555, + "learning_rate": 4.628537060373468e-06, + "logits/chosen": 6.365090370178223, + "logits/rejected": 5.241100311279297, + "logps/chosen": -196.10340881347656, + "logps/rejected": -185.79940795898438, + "loss": 0.74, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.029765471816062927, + "rewards/margins": 0.015344507992267609, + "rewards/rejected": 0.014420978724956512, + "step": 3237 + }, + { + "epoch": 0.5007539145563503, + "grad_norm": 4.231893062591553, + "learning_rate": 4.6282506587237945e-06, + "logits/chosen": 6.115300178527832, + "logits/rejected": 1.6248770952224731, + "logps/chosen": -308.56719970703125, + "logps/rejected": -176.61923217773438, + "loss": 0.6071, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.5727317333221436, + "rewards/margins": 0.4009518027305603, + "rewards/rejected": 0.17177993059158325, + "step": 3238 + }, + { + "epoch": 0.5009085636961145, + "grad_norm": 4.538451671600342, + "learning_rate": 4.627964257074121e-06, + "logits/chosen": 1.1103272438049316, + "logits/rejected": 2.763864755630493, + "logps/chosen": -202.9613037109375, + "logps/rejected": -152.6993865966797, + "loss": 0.7266, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.05628015100955963, + "rewards/margins": -0.0032441522926092148, + "rewards/rejected": 0.05952432006597519, + "step": 3239 + }, + { + "epoch": 0.5010632128358786, + "grad_norm": 7.519440174102783, + "learning_rate": 4.627677855424448e-06, + "logits/chosen": 9.228011131286621, + "logits/rejected": 3.3821310997009277, + "logps/chosen": -324.1173095703125, + "logps/rejected": -292.279541015625, + "loss": 0.6601, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.44543153047561646, + "rewards/margins": 0.13470172882080078, + "rewards/rejected": 0.3107297718524933, + "step": 3240 + }, + { + "epoch": 0.5012178619756428, + "grad_norm": 4.642898082733154, + "learning_rate": 4.6273914537747744e-06, + "logits/chosen": 7.2239789962768555, + "logits/rejected": 4.543787002563477, + "logps/chosen": -243.32937622070312, + "logps/rejected": -190.94973754882812, + "loss": 0.5622, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.29530858993530273, + "rewards/margins": 0.3539070785045624, + "rewards/rejected": -0.05859847366809845, + "step": 3241 + }, + { + "epoch": 0.5013725111154069, + "grad_norm": 5.249964714050293, + "learning_rate": 4.6271050521251e-06, + "logits/chosen": 12.115538597106934, + "logits/rejected": 5.638140678405762, + "logps/chosen": -313.64910888671875, + "logps/rejected": -287.5634765625, + "loss": 0.5777, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.08723636716604233, + "rewards/margins": 0.554078221321106, + "rewards/rejected": -0.46684184670448303, + "step": 3242 + }, + { + "epoch": 0.5015271602551711, + "grad_norm": 6.670085906982422, + "learning_rate": 4.626818650475427e-06, + "logits/chosen": 3.2348008155822754, + "logits/rejected": 3.5412447452545166, + "logps/chosen": -223.87423706054688, + "logps/rejected": -235.9828643798828, + "loss": 0.6999, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.3629200756549835, + "rewards/margins": 0.028505805879831314, + "rewards/rejected": 0.3344142436981201, + "step": 3243 + }, + { + "epoch": 0.5016818093949352, + "grad_norm": 5.689203262329102, + "learning_rate": 4.6265322488257535e-06, + "logits/chosen": 8.295392990112305, + "logits/rejected": 6.412006855010986, + "logps/chosen": -209.7498321533203, + "logps/rejected": -158.74578857421875, + "loss": 0.64, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.15107396245002747, + "rewards/margins": 0.18129417300224304, + "rewards/rejected": -0.03022022545337677, + "step": 3244 + }, + { + "epoch": 0.5018364585346994, + "grad_norm": 9.3799467086792, + "learning_rate": 4.62624584717608e-06, + "logits/chosen": 6.568096160888672, + "logits/rejected": 5.438168048858643, + "logps/chosen": -303.2627868652344, + "logps/rejected": -278.31903076171875, + "loss": 0.7963, + "rewards/accuracies": 0.25, + "rewards/chosen": 0.13152244687080383, + "rewards/margins": -0.10410839319229126, + "rewards/rejected": 0.2356308400630951, + "step": 3245 + }, + { + "epoch": 0.5019911076744635, + "grad_norm": 4.707427024841309, + "learning_rate": 4.625959445526407e-06, + "logits/chosen": 4.344818115234375, + "logits/rejected": 7.874292850494385, + "logps/chosen": -207.7290802001953, + "logps/rejected": -212.3821258544922, + "loss": 0.7722, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.10631751269102097, + "rewards/margins": -0.10347947478294373, + "rewards/rejected": -0.002838030457496643, + "step": 3246 + }, + { + "epoch": 0.5021457568142277, + "grad_norm": 4.798895835876465, + "learning_rate": 4.6256730438767335e-06, + "logits/chosen": 12.562665939331055, + "logits/rejected": 7.572888374328613, + "logps/chosen": -337.30364990234375, + "logps/rejected": -263.802978515625, + "loss": 0.6221, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.5721014738082886, + "rewards/margins": 0.22942838072776794, + "rewards/rejected": 0.342673122882843, + "step": 3247 + }, + { + "epoch": 0.5023004059539918, + "grad_norm": 4.289575576782227, + "learning_rate": 4.625386642227059e-06, + "logits/chosen": 5.521953105926514, + "logits/rejected": 4.95518684387207, + "logps/chosen": -215.48492431640625, + "logps/rejected": -236.41635131835938, + "loss": 0.6166, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.2674950361251831, + "rewards/margins": 0.18388274312019348, + "rewards/rejected": 0.08361230790615082, + "step": 3248 + }, + { + "epoch": 0.502455055093756, + "grad_norm": 6.433024883270264, + "learning_rate": 4.625100240577386e-06, + "logits/chosen": 12.959646224975586, + "logits/rejected": 8.154869079589844, + "logps/chosen": -374.6189880371094, + "logps/rejected": -339.1207580566406, + "loss": 0.6991, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.45588111877441406, + "rewards/margins": 0.07262136042118073, + "rewards/rejected": 0.38325977325439453, + "step": 3249 + }, + { + "epoch": 0.5026097042335202, + "grad_norm": 7.297660827636719, + "learning_rate": 4.624813838927713e-06, + "logits/chosen": 11.110713958740234, + "logits/rejected": 9.868202209472656, + "logps/chosen": -401.1929931640625, + "logps/rejected": -391.1473083496094, + "loss": 0.9218, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.4221614897251129, + "rewards/margins": -0.2851749360561371, + "rewards/rejected": 0.70733642578125, + "step": 3250 + }, + { + "epoch": 0.5027643533732844, + "grad_norm": 5.466851234436035, + "learning_rate": 4.624527437278039e-06, + "logits/chosen": 8.885056495666504, + "logits/rejected": 9.736762046813965, + "logps/chosen": -208.41220092773438, + "logps/rejected": -236.73855590820312, + "loss": 0.7338, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.3178447186946869, + "rewards/margins": 0.09304095804691315, + "rewards/rejected": 0.22480377554893494, + "step": 3251 + }, + { + "epoch": 0.5029190025130486, + "grad_norm": 3.7154295444488525, + "learning_rate": 4.624241035628366e-06, + "logits/chosen": 8.286853790283203, + "logits/rejected": 3.569833278656006, + "logps/chosen": -286.64752197265625, + "logps/rejected": -210.27001953125, + "loss": 0.5011, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.5680737495422363, + "rewards/margins": 0.6208111047744751, + "rewards/rejected": -0.05273742228746414, + "step": 3252 + }, + { + "epoch": 0.5030736516528127, + "grad_norm": 6.122959136962891, + "learning_rate": 4.6239546339786925e-06, + "logits/chosen": 9.299665451049805, + "logits/rejected": 10.038492202758789, + "logps/chosen": -141.4488067626953, + "logps/rejected": -174.90399169921875, + "loss": 0.7626, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.1682216227054596, + "rewards/margins": -0.08915533125400543, + "rewards/rejected": -0.07906626909971237, + "step": 3253 + }, + { + "epoch": 0.5032283007925769, + "grad_norm": 3.8736491203308105, + "learning_rate": 4.623668232329019e-06, + "logits/chosen": 8.56025505065918, + "logits/rejected": 4.659777641296387, + "logps/chosen": -242.66452026367188, + "logps/rejected": -202.87985229492188, + "loss": 0.5057, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.04008864611387253, + "rewards/margins": 0.5544979572296143, + "rewards/rejected": -0.5945866107940674, + "step": 3254 + }, + { + "epoch": 0.503382949932341, + "grad_norm": 6.088406085968018, + "learning_rate": 4.623381830679345e-06, + "logits/chosen": 6.944226264953613, + "logits/rejected": 3.575235366821289, + "logps/chosen": -218.08468627929688, + "logps/rejected": -215.81625366210938, + "loss": 0.7874, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.17197920382022858, + "rewards/margins": -0.05819801986217499, + "rewards/rejected": 0.23017722368240356, + "step": 3255 + }, + { + "epoch": 0.5035375990721052, + "grad_norm": 4.040688991546631, + "learning_rate": 4.623095429029672e-06, + "logits/chosen": 12.327072143554688, + "logits/rejected": 4.876599311828613, + "logps/chosen": -188.86550903320312, + "logps/rejected": -155.1163787841797, + "loss": 0.662, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.19586950540542603, + "rewards/margins": 0.11446381360292435, + "rewards/rejected": 0.08140568435192108, + "step": 3256 + }, + { + "epoch": 0.5036922482118693, + "grad_norm": 4.168957233428955, + "learning_rate": 4.622809027379998e-06, + "logits/chosen": 9.042384147644043, + "logits/rejected": 6.491412162780762, + "logps/chosen": -215.65042114257812, + "logps/rejected": -173.77108764648438, + "loss": 0.6335, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.39675989747047424, + "rewards/margins": 0.29335811734199524, + "rewards/rejected": 0.10340175777673721, + "step": 3257 + }, + { + "epoch": 0.5038468973516335, + "grad_norm": 5.800944805145264, + "learning_rate": 4.622522625730325e-06, + "logits/chosen": 11.838663101196289, + "logits/rejected": 11.668035507202148, + "logps/chosen": -265.31072998046875, + "logps/rejected": -239.455810546875, + "loss": 0.6991, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.07556433975696564, + "rewards/margins": 0.046987056732177734, + "rewards/rejected": 0.028577271848917007, + "step": 3258 + }, + { + "epoch": 0.5040015464913976, + "grad_norm": 9.113494873046875, + "learning_rate": 4.622236224080652e-06, + "logits/chosen": 5.679702281951904, + "logits/rejected": 8.43759822845459, + "logps/chosen": -347.2708740234375, + "logps/rejected": -365.0799255371094, + "loss": 0.6248, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.7220704555511475, + "rewards/margins": 0.165435791015625, + "rewards/rejected": 0.5566346049308777, + "step": 3259 + }, + { + "epoch": 0.5041561956311618, + "grad_norm": 35.40195083618164, + "learning_rate": 4.621949822430977e-06, + "logits/chosen": 9.104483604431152, + "logits/rejected": 7.996103763580322, + "logps/chosen": -295.86627197265625, + "logps/rejected": -294.913818359375, + "loss": 0.7372, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.21118071675300598, + "rewards/margins": 0.10836414247751236, + "rewards/rejected": 0.10281655192375183, + "step": 3260 + }, + { + "epoch": 0.5043108447709259, + "grad_norm": 4.244663715362549, + "learning_rate": 4.621663420781304e-06, + "logits/chosen": 6.868988513946533, + "logits/rejected": 6.118412494659424, + "logps/chosen": -130.43975830078125, + "logps/rejected": -125.55975341796875, + "loss": 0.639, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.020024150609970093, + "rewards/margins": 0.1453128159046173, + "rewards/rejected": -0.1653369963169098, + "step": 3261 + }, + { + "epoch": 0.5044654939106902, + "grad_norm": 30.666452407836914, + "learning_rate": 4.621377019131631e-06, + "logits/chosen": 4.70530366897583, + "logits/rejected": 6.695226192474365, + "logps/chosen": -281.1080627441406, + "logps/rejected": -340.7483215332031, + "loss": 0.6458, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.5474357604980469, + "rewards/margins": 0.12058459967374802, + "rewards/rejected": 0.4268511235713959, + "step": 3262 + }, + { + "epoch": 0.5046201430504543, + "grad_norm": 4.437601566314697, + "learning_rate": 4.621090617481957e-06, + "logits/chosen": 9.145987510681152, + "logits/rejected": 4.361265182495117, + "logps/chosen": -293.39300537109375, + "logps/rejected": -210.44580078125, + "loss": 0.6133, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.4394845962524414, + "rewards/margins": 0.18916437029838562, + "rewards/rejected": 0.2503202259540558, + "step": 3263 + }, + { + "epoch": 0.5047747921902185, + "grad_norm": 6.351206302642822, + "learning_rate": 4.620804215832283e-06, + "logits/chosen": 4.6507720947265625, + "logits/rejected": 11.356451034545898, + "logps/chosen": -210.76272583007812, + "logps/rejected": -263.29034423828125, + "loss": 0.8746, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.09049965441226959, + "rewards/margins": -0.3004557490348816, + "rewards/rejected": 0.3909553289413452, + "step": 3264 + }, + { + "epoch": 0.5049294413299826, + "grad_norm": 4.6295905113220215, + "learning_rate": 4.62051781418261e-06, + "logits/chosen": 10.884513854980469, + "logits/rejected": 7.245157241821289, + "logps/chosen": -254.57241821289062, + "logps/rejected": -186.95785522460938, + "loss": 0.5839, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.6100372076034546, + "rewards/margins": 0.29260027408599854, + "rewards/rejected": 0.31743699312210083, + "step": 3265 + }, + { + "epoch": 0.5050840904697468, + "grad_norm": 4.323636054992676, + "learning_rate": 4.6202314125329365e-06, + "logits/chosen": 6.880917549133301, + "logits/rejected": -0.4303410053253174, + "logps/chosen": -194.7524871826172, + "logps/rejected": -135.21209716796875, + "loss": 0.6091, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.27044790983200073, + "rewards/margins": 0.22092950344085693, + "rewards/rejected": 0.04951842129230499, + "step": 3266 + }, + { + "epoch": 0.5052387396095109, + "grad_norm": 5.352099418640137, + "learning_rate": 4.619945010883263e-06, + "logits/chosen": 1.3101264238357544, + "logits/rejected": 7.754352569580078, + "logps/chosen": -173.9705810546875, + "logps/rejected": -257.4837646484375, + "loss": 0.606, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.3631083369255066, + "rewards/margins": 0.26010382175445557, + "rewards/rejected": 0.1030045598745346, + "step": 3267 + }, + { + "epoch": 0.5053933887492751, + "grad_norm": 5.960330963134766, + "learning_rate": 4.619658609233589e-06, + "logits/chosen": 12.323392868041992, + "logits/rejected": 9.421552658081055, + "logps/chosen": -248.0334014892578, + "logps/rejected": -188.06338500976562, + "loss": 0.6137, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.5610530972480774, + "rewards/margins": 0.2127675712108612, + "rewards/rejected": 0.3482855260372162, + "step": 3268 + }, + { + "epoch": 0.5055480378890392, + "grad_norm": 4.314411640167236, + "learning_rate": 4.6193722075839156e-06, + "logits/chosen": 7.391791343688965, + "logits/rejected": 8.912692070007324, + "logps/chosen": -163.3362579345703, + "logps/rejected": -181.1661376953125, + "loss": 0.7793, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.07061205059289932, + "rewards/margins": -0.0177382230758667, + "rewards/rejected": -0.052873801440000534, + "step": 3269 + }, + { + "epoch": 0.5057026870288034, + "grad_norm": 4.295561790466309, + "learning_rate": 4.619085805934242e-06, + "logits/chosen": 11.544595718383789, + "logits/rejected": 0.17121952772140503, + "logps/chosen": -272.5765380859375, + "logps/rejected": -145.60745239257812, + "loss": 0.5283, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.5199115872383118, + "rewards/margins": 0.4205535650253296, + "rewards/rejected": 0.09935805201530457, + "step": 3270 + }, + { + "epoch": 0.5058573361685675, + "grad_norm": 5.221998691558838, + "learning_rate": 4.618799404284569e-06, + "logits/chosen": 9.329689025878906, + "logits/rejected": 9.840164184570312, + "logps/chosen": -291.57855224609375, + "logps/rejected": -225.47178649902344, + "loss": 0.5341, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.6414833664894104, + "rewards/margins": 0.5150984525680542, + "rewards/rejected": 0.12638498842716217, + "step": 3271 + }, + { + "epoch": 0.5060119853083317, + "grad_norm": 4.863838195800781, + "learning_rate": 4.6185130026348955e-06, + "logits/chosen": 9.67735767364502, + "logits/rejected": 9.155167579650879, + "logps/chosen": -229.3268585205078, + "logps/rejected": -150.3831787109375, + "loss": 0.7025, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.2899929881095886, + "rewards/margins": 0.037098973989486694, + "rewards/rejected": 0.25289401412010193, + "step": 3272 + }, + { + "epoch": 0.5061666344480958, + "grad_norm": 6.14678955078125, + "learning_rate": 4.618226600985222e-06, + "logits/chosen": 15.29581069946289, + "logits/rejected": 16.354097366333008, + "logps/chosen": -238.01573181152344, + "logps/rejected": -286.6297607421875, + "loss": 0.6868, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.33115795254707336, + "rewards/margins": 0.09647694230079651, + "rewards/rejected": 0.23468102514743805, + "step": 3273 + }, + { + "epoch": 0.50632128358786, + "grad_norm": 4.934665679931641, + "learning_rate": 4.617940199335549e-06, + "logits/chosen": 6.422244071960449, + "logits/rejected": 7.959965705871582, + "logps/chosen": -263.961669921875, + "logps/rejected": -270.8356018066406, + "loss": 0.6776, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.5538961887359619, + "rewards/margins": 0.11184568703174591, + "rewards/rejected": 0.4420505166053772, + "step": 3274 + }, + { + "epoch": 0.5064759327276243, + "grad_norm": 7.779551029205322, + "learning_rate": 4.617653797685875e-06, + "logits/chosen": 10.395113945007324, + "logits/rejected": 5.497433662414551, + "logps/chosen": -435.391845703125, + "logps/rejected": -340.43768310546875, + "loss": 0.6619, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.5168371200561523, + "rewards/margins": 0.2224440574645996, + "rewards/rejected": 0.29439306259155273, + "step": 3275 + }, + { + "epoch": 0.5066305818673884, + "grad_norm": 4.753671646118164, + "learning_rate": 4.617367396036201e-06, + "logits/chosen": 11.554730415344238, + "logits/rejected": 8.039701461791992, + "logps/chosen": -244.26229858398438, + "logps/rejected": -231.60797119140625, + "loss": 0.5446, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.5520535707473755, + "rewards/margins": 0.40712639689445496, + "rewards/rejected": 0.1449272185564041, + "step": 3276 + }, + { + "epoch": 0.5067852310071526, + "grad_norm": 4.9572978019714355, + "learning_rate": 4.617080994386528e-06, + "logits/chosen": 7.847681522369385, + "logits/rejected": 4.94571590423584, + "logps/chosen": -216.69163513183594, + "logps/rejected": -176.18067932128906, + "loss": 0.6321, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.5498632192611694, + "rewards/margins": 0.2688031494617462, + "rewards/rejected": 0.2810600996017456, + "step": 3277 + }, + { + "epoch": 0.5069398801469167, + "grad_norm": 8.32699966430664, + "learning_rate": 4.6167945927368546e-06, + "logits/chosen": 11.8165283203125, + "logits/rejected": 14.487762451171875, + "logps/chosen": -217.32606506347656, + "logps/rejected": -264.3685302734375, + "loss": 0.766, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.20697329938411713, + "rewards/margins": 0.015448138117790222, + "rewards/rejected": 0.19152513146400452, + "step": 3278 + }, + { + "epoch": 0.5070945292866809, + "grad_norm": 10.894147872924805, + "learning_rate": 4.616508191087181e-06, + "logits/chosen": 16.20415496826172, + "logits/rejected": 15.814085006713867, + "logps/chosen": -353.5644836425781, + "logps/rejected": -269.7220458984375, + "loss": 0.7998, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.3779914975166321, + "rewards/margins": 0.05745086073875427, + "rewards/rejected": 0.3205406069755554, + "step": 3279 + }, + { + "epoch": 0.507249178426445, + "grad_norm": 4.648820400238037, + "learning_rate": 4.616221789437508e-06, + "logits/chosen": 13.635940551757812, + "logits/rejected": 5.431562423706055, + "logps/chosen": -266.88037109375, + "logps/rejected": -215.22857666015625, + "loss": 0.4517, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.6006661653518677, + "rewards/margins": 0.6850570440292358, + "rewards/rejected": -0.08439093083143234, + "step": 3280 + }, + { + "epoch": 0.5074038275662092, + "grad_norm": 5.254561424255371, + "learning_rate": 4.615935387787834e-06, + "logits/chosen": 12.200284957885742, + "logits/rejected": 10.553935050964355, + "logps/chosen": -267.6666259765625, + "logps/rejected": -237.68408203125, + "loss": 0.6642, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.5550568103790283, + "rewards/margins": 0.20258420705795288, + "rewards/rejected": 0.35247254371643066, + "step": 3281 + }, + { + "epoch": 0.5075584767059733, + "grad_norm": 5.4262237548828125, + "learning_rate": 4.61564898613816e-06, + "logits/chosen": 14.365741729736328, + "logits/rejected": 8.671972274780273, + "logps/chosen": -278.0113220214844, + "logps/rejected": -161.2666473388672, + "loss": 0.6495, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.5143827199935913, + "rewards/margins": 0.23593389987945557, + "rewards/rejected": 0.27844882011413574, + "step": 3282 + }, + { + "epoch": 0.5077131258457375, + "grad_norm": 5.696606159210205, + "learning_rate": 4.615362584488487e-06, + "logits/chosen": 1.919853925704956, + "logits/rejected": 3.4774177074432373, + "logps/chosen": -281.231689453125, + "logps/rejected": -219.40054321289062, + "loss": 0.759, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.24876460433006287, + "rewards/margins": -0.013046935200691223, + "rewards/rejected": 0.2618115544319153, + "step": 3283 + }, + { + "epoch": 0.5078677749855016, + "grad_norm": 3.7839126586914062, + "learning_rate": 4.615076182838814e-06, + "logits/chosen": 9.875275611877441, + "logits/rejected": 6.973371505737305, + "logps/chosen": -203.93936157226562, + "logps/rejected": -180.84817504882812, + "loss": 0.5449, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.3947101831436157, + "rewards/margins": 0.46642735600471497, + "rewards/rejected": -0.07171717286109924, + "step": 3284 + }, + { + "epoch": 0.5080224241252658, + "grad_norm": 6.045161724090576, + "learning_rate": 4.61478978118914e-06, + "logits/chosen": 5.759314060211182, + "logits/rejected": 11.320684432983398, + "logps/chosen": -226.06271362304688, + "logps/rejected": -297.14166259765625, + "loss": 0.7682, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.37793922424316406, + "rewards/margins": 0.025321558117866516, + "rewards/rejected": 0.35261765122413635, + "step": 3285 + }, + { + "epoch": 0.5081770732650299, + "grad_norm": 7.158720016479492, + "learning_rate": 4.614503379539467e-06, + "logits/chosen": 9.620000839233398, + "logits/rejected": 3.800607681274414, + "logps/chosen": -236.81251525878906, + "logps/rejected": -180.69149780273438, + "loss": 0.6608, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.020839691162109375, + "rewards/margins": 0.1508420705795288, + "rewards/rejected": -0.13000234961509705, + "step": 3286 + }, + { + "epoch": 0.5083317224047941, + "grad_norm": 6.190760612487793, + "learning_rate": 4.614216977889794e-06, + "logits/chosen": 7.540580749511719, + "logits/rejected": 0.6460055112838745, + "logps/chosen": -337.2153625488281, + "logps/rejected": -218.33355712890625, + "loss": 0.6894, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.19868355989456177, + "rewards/margins": 0.17706742882728577, + "rewards/rejected": 0.021616123616695404, + "step": 3287 + }, + { + "epoch": 0.5084863715445583, + "grad_norm": 4.635907173156738, + "learning_rate": 4.613930576240119e-06, + "logits/chosen": 5.867904186248779, + "logits/rejected": 7.914802074432373, + "logps/chosen": -332.451416015625, + "logps/rejected": -448.4495849609375, + "loss": 0.581, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.44966018199920654, + "rewards/margins": 0.3103483021259308, + "rewards/rejected": 0.13931189477443695, + "step": 3288 + }, + { + "epoch": 0.5086410206843225, + "grad_norm": 4.746682167053223, + "learning_rate": 4.613644174590446e-06, + "logits/chosen": 4.311805725097656, + "logits/rejected": 1.1995904445648193, + "logps/chosen": -185.853759765625, + "logps/rejected": -133.92442321777344, + "loss": 0.6182, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.23295500874519348, + "rewards/margins": 0.3026370406150818, + "rewards/rejected": -0.06968198716640472, + "step": 3289 + }, + { + "epoch": 0.5087956698240866, + "grad_norm": 5.019349098205566, + "learning_rate": 4.613357772940773e-06, + "logits/chosen": 9.831005096435547, + "logits/rejected": 0.4108743667602539, + "logps/chosen": -401.7609558105469, + "logps/rejected": -276.01409912109375, + "loss": 0.5898, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.6003977060317993, + "rewards/margins": 0.5014404058456421, + "rewards/rejected": 0.09895730018615723, + "step": 3290 + }, + { + "epoch": 0.5089503189638508, + "grad_norm": 5.8945770263671875, + "learning_rate": 4.613071371291099e-06, + "logits/chosen": 8.602256774902344, + "logits/rejected": 12.297430992126465, + "logps/chosen": -326.4326171875, + "logps/rejected": -419.716796875, + "loss": 0.6549, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.6179942488670349, + "rewards/margins": 0.18271782994270325, + "rewards/rejected": 0.4352763891220093, + "step": 3291 + }, + { + "epoch": 0.5091049681036149, + "grad_norm": 6.074856758117676, + "learning_rate": 4.612784969641426e-06, + "logits/chosen": 12.51699161529541, + "logits/rejected": 8.815010070800781, + "logps/chosen": -259.63934326171875, + "logps/rejected": -244.95547485351562, + "loss": 0.6704, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.27018746733665466, + "rewards/margins": 0.0648518055677414, + "rewards/rejected": 0.20533566176891327, + "step": 3292 + }, + { + "epoch": 0.5092596172433791, + "grad_norm": 11.851131439208984, + "learning_rate": 4.612498567991753e-06, + "logits/chosen": 9.136540412902832, + "logits/rejected": 13.335267066955566, + "logps/chosen": -393.1720275878906, + "logps/rejected": -356.2487487792969, + "loss": 0.7954, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.6250507831573486, + "rewards/margins": -0.14003705978393555, + "rewards/rejected": 0.7650877833366394, + "step": 3293 + }, + { + "epoch": 0.5094142663831432, + "grad_norm": 5.108269214630127, + "learning_rate": 4.6122121663420784e-06, + "logits/chosen": 8.016669273376465, + "logits/rejected": 5.554108142852783, + "logps/chosen": -357.6695556640625, + "logps/rejected": -236.93116760253906, + "loss": 0.6403, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.4220203161239624, + "rewards/margins": 0.3163836896419525, + "rewards/rejected": 0.1056365966796875, + "step": 3294 + }, + { + "epoch": 0.5095689155229074, + "grad_norm": 5.631624221801758, + "learning_rate": 4.611925764692405e-06, + "logits/chosen": 13.836071968078613, + "logits/rejected": 9.977951049804688, + "logps/chosen": -519.4583129882812, + "logps/rejected": -336.7672119140625, + "loss": 0.5277, + "rewards/accuracies": 0.875, + "rewards/chosen": 1.0055391788482666, + "rewards/margins": 0.6009042263031006, + "rewards/rejected": 0.404634952545166, + "step": 3295 + }, + { + "epoch": 0.5097235646626715, + "grad_norm": 6.327399253845215, + "learning_rate": 4.611639363042732e-06, + "logits/chosen": 4.6474761962890625, + "logits/rejected": 7.623525619506836, + "logps/chosen": -261.93402099609375, + "logps/rejected": -304.2960205078125, + "loss": 0.7615, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.17954427003860474, + "rewards/margins": -0.1032150387763977, + "rewards/rejected": -0.07632923126220703, + "step": 3296 + }, + { + "epoch": 0.5098782138024357, + "grad_norm": 5.544502258300781, + "learning_rate": 4.611352961393058e-06, + "logits/chosen": 11.478700637817383, + "logits/rejected": 7.552406311035156, + "logps/chosen": -302.0330505371094, + "logps/rejected": -292.5581970214844, + "loss": 0.5639, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.2996593415737152, + "rewards/margins": 0.44985684752464294, + "rewards/rejected": -0.15019750595092773, + "step": 3297 + }, + { + "epoch": 0.5100328629421998, + "grad_norm": 6.386444568634033, + "learning_rate": 4.611066559743384e-06, + "logits/chosen": 12.244367599487305, + "logits/rejected": 8.603316307067871, + "logps/chosen": -356.3914489746094, + "logps/rejected": -236.01129150390625, + "loss": 0.7436, + "rewards/accuracies": 0.25, + "rewards/chosen": 0.15625444054603577, + "rewards/margins": 0.1432238221168518, + "rewards/rejected": 0.013030633330345154, + "step": 3298 + }, + { + "epoch": 0.510187512081964, + "grad_norm": 5.056331634521484, + "learning_rate": 4.610780158093711e-06, + "logits/chosen": 10.681868553161621, + "logits/rejected": 10.155205726623535, + "logps/chosen": -355.4100341796875, + "logps/rejected": -333.34466552734375, + "loss": 0.5839, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.219159796833992, + "rewards/margins": 0.30675363540649414, + "rewards/rejected": -0.08759383857250214, + "step": 3299 + }, + { + "epoch": 0.5103421612217282, + "grad_norm": 5.038776397705078, + "learning_rate": 4.6104937564440375e-06, + "logits/chosen": 6.918275833129883, + "logits/rejected": 7.548701286315918, + "logps/chosen": -251.03842163085938, + "logps/rejected": -225.3931121826172, + "loss": 0.7674, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.31759369373321533, + "rewards/margins": -0.07449433207511902, + "rewards/rejected": 0.39208802580833435, + "step": 3300 + }, + { + "epoch": 0.5104968103614924, + "grad_norm": 6.149869918823242, + "learning_rate": 4.610207354794364e-06, + "logits/chosen": 13.585990905761719, + "logits/rejected": 14.574674606323242, + "logps/chosen": -308.7471618652344, + "logps/rejected": -375.1549072265625, + "loss": 0.7064, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.3792675733566284, + "rewards/margins": 0.05164957046508789, + "rewards/rejected": 0.3276180326938629, + "step": 3301 + }, + { + "epoch": 0.5106514595012566, + "grad_norm": 8.68946361541748, + "learning_rate": 4.60992095314469e-06, + "logits/chosen": 4.943545341491699, + "logits/rejected": 10.283045768737793, + "logps/chosen": -335.4933776855469, + "logps/rejected": -314.6751403808594, + "loss": 0.9695, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.09260483086109161, + "rewards/margins": -0.36909058690071106, + "rewards/rejected": 0.4616954028606415, + "step": 3302 + }, + { + "epoch": 0.5108061086410207, + "grad_norm": 5.4128899574279785, + "learning_rate": 4.609634551495017e-06, + "logits/chosen": 11.946561813354492, + "logits/rejected": 11.99673080444336, + "logps/chosen": -246.31185913085938, + "logps/rejected": -264.9508056640625, + "loss": 0.6506, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.27053916454315186, + "rewards/margins": 0.17488783597946167, + "rewards/rejected": 0.09565134346485138, + "step": 3303 + }, + { + "epoch": 0.5109607577807849, + "grad_norm": 4.730881214141846, + "learning_rate": 4.609348149845343e-06, + "logits/chosen": 8.126791000366211, + "logits/rejected": 6.974100112915039, + "logps/chosen": -326.1795959472656, + "logps/rejected": -277.41552734375, + "loss": 0.552, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.5544772148132324, + "rewards/margins": 0.3648064136505127, + "rewards/rejected": 0.18967080116271973, + "step": 3304 + }, + { + "epoch": 0.511115406920549, + "grad_norm": 7.931200981140137, + "learning_rate": 4.60906174819567e-06, + "logits/chosen": 8.23427677154541, + "logits/rejected": 3.824476480484009, + "logps/chosen": -448.0661926269531, + "logps/rejected": -372.74859619140625, + "loss": 0.6634, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.5778120756149292, + "rewards/margins": 0.3968660533428192, + "rewards/rejected": 0.1809459924697876, + "step": 3305 + }, + { + "epoch": 0.5112700560603132, + "grad_norm": 6.869718074798584, + "learning_rate": 4.6087753465459965e-06, + "logits/chosen": 7.93504524230957, + "logits/rejected": 3.603963851928711, + "logps/chosen": -453.483154296875, + "logps/rejected": -320.5999755859375, + "loss": 0.5766, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.38236987590789795, + "rewards/margins": 0.3331788182258606, + "rewards/rejected": 0.049191076308488846, + "step": 3306 + }, + { + "epoch": 0.5114247052000773, + "grad_norm": 5.299606800079346, + "learning_rate": 4.608488944896322e-06, + "logits/chosen": 13.84046745300293, + "logits/rejected": 11.413540840148926, + "logps/chosen": -307.7435302734375, + "logps/rejected": -404.3388366699219, + "loss": 0.5461, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.5440571904182434, + "rewards/margins": 0.43519482016563416, + "rewards/rejected": 0.10886240005493164, + "step": 3307 + }, + { + "epoch": 0.5115793543398415, + "grad_norm": 7.436204433441162, + "learning_rate": 4.608202543246649e-06, + "logits/chosen": 3.9916067123413086, + "logits/rejected": 8.170459747314453, + "logps/chosen": -259.1344909667969, + "logps/rejected": -317.2101135253906, + "loss": 0.8142, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.07210784405469894, + "rewards/margins": -0.2072063535451889, + "rewards/rejected": 0.27931419014930725, + "step": 3308 + }, + { + "epoch": 0.5117340034796056, + "grad_norm": 6.892382621765137, + "learning_rate": 4.607916141596976e-06, + "logits/chosen": 8.205277442932129, + "logits/rejected": 12.968907356262207, + "logps/chosen": -364.7828063964844, + "logps/rejected": -320.7512512207031, + "loss": 0.7796, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.10809154808521271, + "rewards/margins": -0.11546183377504349, + "rewards/rejected": 0.2235533744096756, + "step": 3309 + }, + { + "epoch": 0.5118886526193698, + "grad_norm": 4.5968546867370605, + "learning_rate": 4.607629739947302e-06, + "logits/chosen": 14.16615104675293, + "logits/rejected": 12.471616744995117, + "logps/chosen": -332.67010498046875, + "logps/rejected": -372.7623291015625, + "loss": 0.5008, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.2503507733345032, + "rewards/margins": 0.4937131702899933, + "rewards/rejected": -0.2433624267578125, + "step": 3310 + }, + { + "epoch": 0.5120433017591339, + "grad_norm": 4.575873374938965, + "learning_rate": 4.607343338297629e-06, + "logits/chosen": 10.643853187561035, + "logits/rejected": 10.89056396484375, + "logps/chosen": -165.5745391845703, + "logps/rejected": -197.70614624023438, + "loss": 0.6435, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.1332719326019287, + "rewards/margins": 0.2689007520675659, + "rewards/rejected": -0.13562878966331482, + "step": 3311 + }, + { + "epoch": 0.5121979508988981, + "grad_norm": 6.630035877227783, + "learning_rate": 4.607056936647956e-06, + "logits/chosen": 11.515083312988281, + "logits/rejected": 8.150697708129883, + "logps/chosen": -243.21372985839844, + "logps/rejected": -236.29318237304688, + "loss": 0.6737, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.053349412977695465, + "rewards/margins": 0.11289205402135849, + "rewards/rejected": -0.05954265594482422, + "step": 3312 + }, + { + "epoch": 0.5123526000386622, + "grad_norm": 6.082345962524414, + "learning_rate": 4.606770534998282e-06, + "logits/chosen": 7.6710944175720215, + "logits/rejected": 3.204085350036621, + "logps/chosen": -255.6961669921875, + "logps/rejected": -213.114990234375, + "loss": 0.7178, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.062436312437057495, + "rewards/margins": 0.08954112231731415, + "rewards/rejected": -0.027104809880256653, + "step": 3313 + }, + { + "epoch": 0.5125072491784265, + "grad_norm": 7.639415740966797, + "learning_rate": 4.606484133348608e-06, + "logits/chosen": 8.23381233215332, + "logits/rejected": 9.277178764343262, + "logps/chosen": -283.31695556640625, + "logps/rejected": -261.18572998046875, + "loss": 0.6992, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.2526858448982239, + "rewards/margins": 0.08515415340662003, + "rewards/rejected": 0.16753168404102325, + "step": 3314 + }, + { + "epoch": 0.5126618983181906, + "grad_norm": 8.525605201721191, + "learning_rate": 4.606197731698935e-06, + "logits/chosen": 13.060556411743164, + "logits/rejected": 9.44193172454834, + "logps/chosen": -409.4605712890625, + "logps/rejected": -380.96112060546875, + "loss": 0.7424, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.34490615129470825, + "rewards/margins": 0.16014891862869263, + "rewards/rejected": 0.18475723266601562, + "step": 3315 + }, + { + "epoch": 0.5128165474579548, + "grad_norm": 8.903631210327148, + "learning_rate": 4.605911330049261e-06, + "logits/chosen": 7.798725128173828, + "logits/rejected": 4.855804920196533, + "logps/chosen": -328.53460693359375, + "logps/rejected": -388.38330078125, + "loss": 0.8374, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.0003664987161755562, + "rewards/margins": -0.14090563356876373, + "rewards/rejected": 0.14053915441036224, + "step": 3316 + }, + { + "epoch": 0.512971196597719, + "grad_norm": 7.961757183074951, + "learning_rate": 4.605624928399588e-06, + "logits/chosen": 6.386938095092773, + "logits/rejected": 9.763251304626465, + "logps/chosen": -301.98199462890625, + "logps/rejected": -417.2717590332031, + "loss": 0.5636, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.5083305835723877, + "rewards/margins": 0.4255778193473816, + "rewards/rejected": 0.08275280892848969, + "step": 3317 + }, + { + "epoch": 0.5131258457374831, + "grad_norm": 5.357951641082764, + "learning_rate": 4.605338526749915e-06, + "logits/chosen": 8.939440727233887, + "logits/rejected": 0.5143543481826782, + "logps/chosen": -330.86712646484375, + "logps/rejected": -191.1465301513672, + "loss": 0.4968, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.19912022352218628, + "rewards/margins": 0.7738736867904663, + "rewards/rejected": -0.5747535228729248, + "step": 3318 + }, + { + "epoch": 0.5132804948772473, + "grad_norm": 7.023497104644775, + "learning_rate": 4.605052125100241e-06, + "logits/chosen": 10.614002227783203, + "logits/rejected": 4.7218146324157715, + "logps/chosen": -517.1453857421875, + "logps/rejected": -291.04052734375, + "loss": 0.7068, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.3102092742919922, + "rewards/margins": -0.0038158856332302094, + "rewards/rejected": 0.3140251636505127, + "step": 3319 + }, + { + "epoch": 0.5134351440170114, + "grad_norm": 4.474532127380371, + "learning_rate": 4.604765723450568e-06, + "logits/chosen": 16.762693405151367, + "logits/rejected": 10.661742210388184, + "logps/chosen": -319.2034606933594, + "logps/rejected": -233.9828338623047, + "loss": 0.5909, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.4443679451942444, + "rewards/margins": 0.2620030641555786, + "rewards/rejected": 0.18236495554447174, + "step": 3320 + }, + { + "epoch": 0.5135897931567756, + "grad_norm": 5.109394550323486, + "learning_rate": 4.604479321800894e-06, + "logits/chosen": 12.434944152832031, + "logits/rejected": 5.510983943939209, + "logps/chosen": -251.61648559570312, + "logps/rejected": -178.4016571044922, + "loss": 0.6458, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.4217732548713684, + "rewards/margins": 0.22006481885910034, + "rewards/rejected": 0.20170843601226807, + "step": 3321 + }, + { + "epoch": 0.5137444422965397, + "grad_norm": 6.105430603027344, + "learning_rate": 4.60419292015122e-06, + "logits/chosen": 10.395490646362305, + "logits/rejected": 3.4469900131225586, + "logps/chosen": -355.70263671875, + "logps/rejected": -247.75942993164062, + "loss": 0.7775, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.20129719376564026, + "rewards/margins": -0.10168847441673279, + "rewards/rejected": 0.30298566818237305, + "step": 3322 + }, + { + "epoch": 0.5138990914363039, + "grad_norm": 14.232503890991211, + "learning_rate": 4.603906518501547e-06, + "logits/chosen": 7.293386936187744, + "logits/rejected": 8.30022144317627, + "logps/chosen": -228.21954345703125, + "logps/rejected": -216.17160034179688, + "loss": 0.7305, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.18266472220420837, + "rewards/margins": 0.0026621222496032715, + "rewards/rejected": 0.1800025999546051, + "step": 3323 + }, + { + "epoch": 0.514053740576068, + "grad_norm": 5.143128871917725, + "learning_rate": 4.603620116851874e-06, + "logits/chosen": 15.941125869750977, + "logits/rejected": 13.429931640625, + "logps/chosen": -310.6986083984375, + "logps/rejected": -228.2845458984375, + "loss": 0.5077, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.4222492575645447, + "rewards/margins": 0.46035271883010864, + "rewards/rejected": -0.038103483617305756, + "step": 3324 + }, + { + "epoch": 0.5142083897158322, + "grad_norm": 7.872115135192871, + "learning_rate": 4.6033337152022e-06, + "logits/chosen": 9.733478546142578, + "logits/rejected": 6.192249298095703, + "logps/chosen": -280.38580322265625, + "logps/rejected": -262.6074523925781, + "loss": 0.9302, + "rewards/accuracies": 0.125, + "rewards/chosen": -0.3527243733406067, + "rewards/margins": -0.3561922311782837, + "rewards/rejected": 0.003467857837677002, + "step": 3325 + }, + { + "epoch": 0.5143630388555963, + "grad_norm": 5.421661853790283, + "learning_rate": 4.603047313552527e-06, + "logits/chosen": 12.775594711303711, + "logits/rejected": 10.362990379333496, + "logps/chosen": -279.92083740234375, + "logps/rejected": -250.9683837890625, + "loss": 0.7951, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.36717861890792847, + "rewards/margins": 0.12689314782619476, + "rewards/rejected": 0.2402855008840561, + "step": 3326 + }, + { + "epoch": 0.5145176879953606, + "grad_norm": 7.026094913482666, + "learning_rate": 4.602760911902853e-06, + "logits/chosen": 7.578186988830566, + "logits/rejected": 7.290694713592529, + "logps/chosen": -297.53546142578125, + "logps/rejected": -408.7726135253906, + "loss": 0.6782, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.511529266834259, + "rewards/margins": 0.09652864933013916, + "rewards/rejected": 0.41500064730644226, + "step": 3327 + }, + { + "epoch": 0.5146723371351247, + "grad_norm": 7.281806468963623, + "learning_rate": 4.6024745102531795e-06, + "logits/chosen": 12.974857330322266, + "logits/rejected": 11.966880798339844, + "logps/chosen": -411.5673828125, + "logps/rejected": -498.97314453125, + "loss": 0.6552, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.7406171560287476, + "rewards/margins": 0.14281733334064484, + "rewards/rejected": 0.5977998375892639, + "step": 3328 + }, + { + "epoch": 0.5148269862748889, + "grad_norm": 5.58329963684082, + "learning_rate": 4.602188108603506e-06, + "logits/chosen": 10.643851280212402, + "logits/rejected": 3.0388851165771484, + "logps/chosen": -371.23309326171875, + "logps/rejected": -224.8085479736328, + "loss": 0.4527, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.5755599737167358, + "rewards/margins": 0.6589735746383667, + "rewards/rejected": -0.08341364562511444, + "step": 3329 + }, + { + "epoch": 0.514981635414653, + "grad_norm": 4.907145977020264, + "learning_rate": 4.601901706953833e-06, + "logits/chosen": 6.897857189178467, + "logits/rejected": 3.476606845855713, + "logps/chosen": -325.2074279785156, + "logps/rejected": -177.27366638183594, + "loss": 0.6531, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.19923964142799377, + "rewards/margins": 0.28254789113998413, + "rewards/rejected": -0.08330824226140976, + "step": 3330 + }, + { + "epoch": 0.5151362845544172, + "grad_norm": 10.427337646484375, + "learning_rate": 4.601615305304159e-06, + "logits/chosen": -0.9577646255493164, + "logits/rejected": 1.4538899660110474, + "logps/chosen": -331.46014404296875, + "logps/rejected": -329.8386535644531, + "loss": 0.7563, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.2879761755466461, + "rewards/margins": 0.08168478310108185, + "rewards/rejected": -0.36966097354888916, + "step": 3331 + }, + { + "epoch": 0.5152909336941813, + "grad_norm": 4.053294658660889, + "learning_rate": 4.601328903654485e-06, + "logits/chosen": 12.008349418640137, + "logits/rejected": 2.210422992706299, + "logps/chosen": -409.9730224609375, + "logps/rejected": -272.80633544921875, + "loss": 0.4101, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.5077674984931946, + "rewards/margins": 0.8883147239685059, + "rewards/rejected": -0.38054725527763367, + "step": 3332 + }, + { + "epoch": 0.5154455828339455, + "grad_norm": 6.067572593688965, + "learning_rate": 4.601042502004812e-06, + "logits/chosen": 8.359210014343262, + "logits/rejected": 0.7269017100334167, + "logps/chosen": -355.2889099121094, + "logps/rejected": -178.30001831054688, + "loss": 0.6275, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.5712300539016724, + "rewards/margins": 0.21494920551776886, + "rewards/rejected": 0.3562808036804199, + "step": 3333 + }, + { + "epoch": 0.5156002319737096, + "grad_norm": 7.381783962249756, + "learning_rate": 4.6007561003551385e-06, + "logits/chosen": 11.462834358215332, + "logits/rejected": 12.725983619689941, + "logps/chosen": -288.2518310546875, + "logps/rejected": -265.99114990234375, + "loss": 0.8291, + "rewards/accuracies": 0.25, + "rewards/chosen": 0.13342642784118652, + "rewards/margins": -0.15023750066757202, + "rewards/rejected": 0.28366392850875854, + "step": 3334 + }, + { + "epoch": 0.5157548811134738, + "grad_norm": 7.987733364105225, + "learning_rate": 4.600469698705465e-06, + "logits/chosen": 8.43124008178711, + "logits/rejected": 12.709207534790039, + "logps/chosen": -236.6010284423828, + "logps/rejected": -281.1996154785156, + "loss": 0.732, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.10303372144699097, + "rewards/margins": -0.03188496083021164, + "rewards/rejected": 0.1349186897277832, + "step": 3335 + }, + { + "epoch": 0.5159095302532379, + "grad_norm": 5.715920925140381, + "learning_rate": 4.600183297055791e-06, + "logits/chosen": 5.269281387329102, + "logits/rejected": 6.160981178283691, + "logps/chosen": -193.3437042236328, + "logps/rejected": -258.7533264160156, + "loss": 0.7888, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.19255928695201874, + "rewards/margins": -0.03225240111351013, + "rewards/rejected": 0.22481170296669006, + "step": 3336 + }, + { + "epoch": 0.5160641793930021, + "grad_norm": 4.292242050170898, + "learning_rate": 4.599896895406118e-06, + "logits/chosen": 5.063158988952637, + "logits/rejected": 8.99674129486084, + "logps/chosen": -203.1636962890625, + "logps/rejected": -198.94207763671875, + "loss": 0.646, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.3192750811576843, + "rewards/margins": 0.2594013214111328, + "rewards/rejected": 0.05987377092242241, + "step": 3337 + }, + { + "epoch": 0.5162188285327662, + "grad_norm": 7.232443809509277, + "learning_rate": 4.599610493756444e-06, + "logits/chosen": 9.894434928894043, + "logits/rejected": 3.0244178771972656, + "logps/chosen": -450.18695068359375, + "logps/rejected": -269.5166015625, + "loss": 0.4681, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.7104825973510742, + "rewards/margins": 0.8683890700340271, + "rewards/rejected": -0.15790653228759766, + "step": 3338 + }, + { + "epoch": 0.5163734776725305, + "grad_norm": 5.255432605743408, + "learning_rate": 4.599324092106771e-06, + "logits/chosen": 12.322887420654297, + "logits/rejected": 7.271556854248047, + "logps/chosen": -212.72640991210938, + "logps/rejected": -202.61166381835938, + "loss": 0.6583, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.293515682220459, + "rewards/margins": 0.10923933237791061, + "rewards/rejected": 0.18427634239196777, + "step": 3339 + }, + { + "epoch": 0.5165281268122947, + "grad_norm": 3.8906614780426025, + "learning_rate": 4.599037690457097e-06, + "logits/chosen": 9.34835433959961, + "logits/rejected": 13.081023216247559, + "logps/chosen": -117.5382080078125, + "logps/rejected": -186.06973266601562, + "loss": 0.587, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.037637557834386826, + "rewards/margins": 0.29056259989738464, + "rewards/rejected": -0.2529250383377075, + "step": 3340 + }, + { + "epoch": 0.5166827759520588, + "grad_norm": 4.393553256988525, + "learning_rate": 4.598751288807423e-06, + "logits/chosen": 10.058629989624023, + "logits/rejected": 9.304224014282227, + "logps/chosen": -272.46649169921875, + "logps/rejected": -260.66510009765625, + "loss": 0.5814, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.6412093639373779, + "rewards/margins": 0.27334436774253845, + "rewards/rejected": 0.3678649663925171, + "step": 3341 + }, + { + "epoch": 0.516837425091823, + "grad_norm": 5.0212507247924805, + "learning_rate": 4.59846488715775e-06, + "logits/chosen": 9.178473472595215, + "logits/rejected": 5.965265274047852, + "logps/chosen": -260.2046203613281, + "logps/rejected": -217.31329345703125, + "loss": 0.6553, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.35200709104537964, + "rewards/margins": 0.11002466082572937, + "rewards/rejected": 0.24198244512081146, + "step": 3342 + }, + { + "epoch": 0.5169920742315871, + "grad_norm": 3.9991116523742676, + "learning_rate": 4.598178485508077e-06, + "logits/chosen": 15.431526184082031, + "logits/rejected": 7.793521404266357, + "logps/chosen": -221.8224334716797, + "logps/rejected": -197.96881103515625, + "loss": 0.5477, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.4227851927280426, + "rewards/margins": 0.5931785106658936, + "rewards/rejected": -0.17039328813552856, + "step": 3343 + }, + { + "epoch": 0.5171467233713513, + "grad_norm": 6.6603803634643555, + "learning_rate": 4.597892083858403e-06, + "logits/chosen": 0.7117253541946411, + "logits/rejected": 4.9379072189331055, + "logps/chosen": -267.120361328125, + "logps/rejected": -328.4352111816406, + "loss": 0.7424, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.09934262186288834, + "rewards/margins": 0.012568570673465729, + "rewards/rejected": 0.0867740660905838, + "step": 3344 + }, + { + "epoch": 0.5173013725111154, + "grad_norm": 6.0258307456970215, + "learning_rate": 4.59760568220873e-06, + "logits/chosen": 7.265829086303711, + "logits/rejected": 5.597441673278809, + "logps/chosen": -251.51492309570312, + "logps/rejected": -212.12924194335938, + "loss": 0.7808, + "rewards/accuracies": 0.125, + "rewards/chosen": 0.04207973554730415, + "rewards/margins": -0.03954887390136719, + "rewards/rejected": 0.08162861317396164, + "step": 3345 + }, + { + "epoch": 0.5174560216508796, + "grad_norm": 5.801704406738281, + "learning_rate": 4.597319280559057e-06, + "logits/chosen": 9.716731071472168, + "logits/rejected": 11.044365882873535, + "logps/chosen": -249.14271545410156, + "logps/rejected": -266.17303466796875, + "loss": 0.7735, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.14653760194778442, + "rewards/margins": -0.052437543869018555, + "rewards/rejected": 0.19897514581680298, + "step": 3346 + }, + { + "epoch": 0.5176106707906437, + "grad_norm": 5.435059547424316, + "learning_rate": 4.5970328789093824e-06, + "logits/chosen": 6.191805839538574, + "logits/rejected": 8.739500999450684, + "logps/chosen": -301.4412841796875, + "logps/rejected": -370.99737548828125, + "loss": 0.6259, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.2873663902282715, + "rewards/margins": 0.16811160743236542, + "rewards/rejected": 0.11925478279590607, + "step": 3347 + }, + { + "epoch": 0.5177653199304079, + "grad_norm": 3.7823760509490967, + "learning_rate": 4.596746477259709e-06, + "logits/chosen": 11.955760955810547, + "logits/rejected": 10.665847778320312, + "logps/chosen": -227.86294555664062, + "logps/rejected": -178.990966796875, + "loss": 0.6769, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.0681418627500534, + "rewards/margins": 0.12963783740997314, + "rewards/rejected": -0.061495959758758545, + "step": 3348 + }, + { + "epoch": 0.517919969070172, + "grad_norm": 6.7825608253479, + "learning_rate": 4.596460075610036e-06, + "logits/chosen": 8.116400718688965, + "logits/rejected": 4.816107273101807, + "logps/chosen": -218.87765502929688, + "logps/rejected": -189.20150756835938, + "loss": 0.6806, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.09532195329666138, + "rewards/margins": 0.07856465876102448, + "rewards/rejected": 0.016757257282733917, + "step": 3349 + }, + { + "epoch": 0.5180746182099362, + "grad_norm": 6.819690704345703, + "learning_rate": 4.596173673960362e-06, + "logits/chosen": 7.481340408325195, + "logits/rejected": 4.962677001953125, + "logps/chosen": -323.698974609375, + "logps/rejected": -308.75494384765625, + "loss": 0.8075, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.45470166206359863, + "rewards/margins": -0.1253925859928131, + "rewards/rejected": 0.5800942182540894, + "step": 3350 + }, + { + "epoch": 0.5182292673497003, + "grad_norm": 5.916538238525391, + "learning_rate": 4.595887272310689e-06, + "logits/chosen": 8.10086441040039, + "logits/rejected": 1.8858840465545654, + "logps/chosen": -286.738037109375, + "logps/rejected": -225.82388305664062, + "loss": 0.6632, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.44926026463508606, + "rewards/margins": 0.10890054702758789, + "rewards/rejected": 0.3403596878051758, + "step": 3351 + }, + { + "epoch": 0.5183839164894646, + "grad_norm": 5.650180339813232, + "learning_rate": 4.595600870661016e-06, + "logits/chosen": 9.513080596923828, + "logits/rejected": 6.768671035766602, + "logps/chosen": -297.4496154785156, + "logps/rejected": -318.6204528808594, + "loss": 0.6118, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.10306416451931, + "rewards/margins": 0.3363010585308075, + "rewards/rejected": -0.2332368791103363, + "step": 3352 + }, + { + "epoch": 0.5185385656292287, + "grad_norm": 5.260039806365967, + "learning_rate": 4.595314469011342e-06, + "logits/chosen": 5.763668537139893, + "logits/rejected": 4.022637367248535, + "logps/chosen": -473.63323974609375, + "logps/rejected": -402.4539489746094, + "loss": 0.5797, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.8152771592140198, + "rewards/margins": 0.462116539478302, + "rewards/rejected": 0.3531606197357178, + "step": 3353 + }, + { + "epoch": 0.5186932147689929, + "grad_norm": 4.126369953155518, + "learning_rate": 4.595028067361668e-06, + "logits/chosen": 8.767402648925781, + "logits/rejected": -1.541495442390442, + "logps/chosen": -365.54766845703125, + "logps/rejected": -259.57464599609375, + "loss": 0.451, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.473431795835495, + "rewards/margins": 0.6864514350891113, + "rewards/rejected": -0.21301960945129395, + "step": 3354 + }, + { + "epoch": 0.518847863908757, + "grad_norm": 5.820002555847168, + "learning_rate": 4.594741665711995e-06, + "logits/chosen": 4.751580238342285, + "logits/rejected": 8.376527786254883, + "logps/chosen": -217.77182006835938, + "logps/rejected": -243.0459747314453, + "loss": 0.9261, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.05957531929016113, + "rewards/margins": -0.34137678146362305, + "rewards/rejected": 0.4009520709514618, + "step": 3355 + }, + { + "epoch": 0.5190025130485212, + "grad_norm": 3.4872448444366455, + "learning_rate": 4.5944552640623214e-06, + "logits/chosen": 12.761938095092773, + "logits/rejected": 2.555190086364746, + "logps/chosen": -360.17889404296875, + "logps/rejected": -196.4600830078125, + "loss": 0.4265, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.641636848449707, + "rewards/margins": 0.8073151111602783, + "rewards/rejected": -0.1656782180070877, + "step": 3356 + }, + { + "epoch": 0.5191571621882853, + "grad_norm": 3.894822835922241, + "learning_rate": 4.594168862412648e-06, + "logits/chosen": 1.6901307106018066, + "logits/rejected": 1.0637298822402954, + "logps/chosen": -216.74249267578125, + "logps/rejected": -229.15560913085938, + "loss": 0.4955, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.39243441820144653, + "rewards/margins": 0.5352980494499207, + "rewards/rejected": -0.14286363124847412, + "step": 3357 + }, + { + "epoch": 0.5193118113280495, + "grad_norm": 5.256778240203857, + "learning_rate": 4.593882460762975e-06, + "logits/chosen": 11.411818504333496, + "logits/rejected": 2.7224555015563965, + "logps/chosen": -304.2349853515625, + "logps/rejected": -188.3083953857422, + "loss": 0.6893, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.18156929314136505, + "rewards/margins": 0.09899642318487167, + "rewards/rejected": 0.08257286995649338, + "step": 3358 + }, + { + "epoch": 0.5194664604678136, + "grad_norm": 5.881741046905518, + "learning_rate": 4.593596059113301e-06, + "logits/chosen": 10.966991424560547, + "logits/rejected": 6.728796005249023, + "logps/chosen": -302.0898742675781, + "logps/rejected": -256.14801025390625, + "loss": 0.6459, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.3486071527004242, + "rewards/margins": 0.163596972823143, + "rewards/rejected": 0.18501019477844238, + "step": 3359 + }, + { + "epoch": 0.5196211096075778, + "grad_norm": 3.081699848175049, + "learning_rate": 4.593309657463627e-06, + "logits/chosen": 5.417734622955322, + "logits/rejected": 1.3187285661697388, + "logps/chosen": -310.59417724609375, + "logps/rejected": -180.80331420898438, + "loss": 0.5039, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.30422377586364746, + "rewards/margins": 0.5703117847442627, + "rewards/rejected": -0.2660880386829376, + "step": 3360 + }, + { + "epoch": 0.5197757587473419, + "grad_norm": 4.984732151031494, + "learning_rate": 4.593023255813954e-06, + "logits/chosen": 4.960865497589111, + "logits/rejected": 4.9250569343566895, + "logps/chosen": -172.64959716796875, + "logps/rejected": -216.44898986816406, + "loss": 0.7157, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.31895291805267334, + "rewards/margins": 0.05662061274051666, + "rewards/rejected": 0.2623322606086731, + "step": 3361 + }, + { + "epoch": 0.5199304078871061, + "grad_norm": 5.086485385894775, + "learning_rate": 4.5927368541642805e-06, + "logits/chosen": 11.037814140319824, + "logits/rejected": 8.57933235168457, + "logps/chosen": -306.3039855957031, + "logps/rejected": -306.47149658203125, + "loss": 0.6079, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.5484976768493652, + "rewards/margins": 0.22478577494621277, + "rewards/rejected": 0.32371193170547485, + "step": 3362 + }, + { + "epoch": 0.5200850570268702, + "grad_norm": 4.9316816329956055, + "learning_rate": 4.592450452514607e-06, + "logits/chosen": 10.591773986816406, + "logits/rejected": 2.9878017902374268, + "logps/chosen": -382.12664794921875, + "logps/rejected": -290.87548828125, + "loss": 0.4458, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.8783547878265381, + "rewards/margins": 0.6642925143241882, + "rewards/rejected": 0.21406233310699463, + "step": 3363 + }, + { + "epoch": 0.5202397061666344, + "grad_norm": 5.727143287658691, + "learning_rate": 4.592164050864934e-06, + "logits/chosen": 13.092672348022461, + "logits/rejected": 8.68062973022461, + "logps/chosen": -336.43170166015625, + "logps/rejected": -313.5290832519531, + "loss": 0.5498, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.43714067339897156, + "rewards/margins": 0.5199013948440552, + "rewards/rejected": -0.08276071399450302, + "step": 3364 + }, + { + "epoch": 0.5203943553063987, + "grad_norm": 5.615415096282959, + "learning_rate": 4.59187764921526e-06, + "logits/chosen": 9.310783386230469, + "logits/rejected": 6.376582622528076, + "logps/chosen": -387.70037841796875, + "logps/rejected": -378.4019470214844, + "loss": 0.5943, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.5919109582901001, + "rewards/margins": 0.2893992066383362, + "rewards/rejected": 0.3025117516517639, + "step": 3365 + }, + { + "epoch": 0.5205490044461628, + "grad_norm": 7.270103454589844, + "learning_rate": 4.591591247565586e-06, + "logits/chosen": 6.2316131591796875, + "logits/rejected": 5.721863746643066, + "logps/chosen": -199.2571258544922, + "logps/rejected": -165.15481567382812, + "loss": 0.8633, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.0933922529220581, + "rewards/margins": -0.24353134632110596, + "rewards/rejected": 0.15013909339904785, + "step": 3366 + }, + { + "epoch": 0.520703653585927, + "grad_norm": 5.480018615722656, + "learning_rate": 4.591304845915913e-06, + "logits/chosen": 11.577028274536133, + "logits/rejected": 7.851813316345215, + "logps/chosen": -281.567138671875, + "logps/rejected": -266.55657958984375, + "loss": 0.6757, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.04081106185913086, + "rewards/margins": 0.052083492279052734, + "rewards/rejected": -0.011272445321083069, + "step": 3367 + }, + { + "epoch": 0.5208583027256911, + "grad_norm": 6.0521111488342285, + "learning_rate": 4.5910184442662396e-06, + "logits/chosen": 7.320011138916016, + "logits/rejected": 6.893837928771973, + "logps/chosen": -211.69017028808594, + "logps/rejected": -263.49395751953125, + "loss": 0.7222, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.32181161642074585, + "rewards/margins": 0.07572829723358154, + "rewards/rejected": 0.2460833489894867, + "step": 3368 + }, + { + "epoch": 0.5210129518654553, + "grad_norm": 6.400270938873291, + "learning_rate": 4.590732042616566e-06, + "logits/chosen": 7.621551990509033, + "logits/rejected": 4.2501606941223145, + "logps/chosen": -395.25115966796875, + "logps/rejected": -238.61761474609375, + "loss": 0.7021, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.493667334318161, + "rewards/margins": 0.09907799959182739, + "rewards/rejected": 0.39458930492401123, + "step": 3369 + }, + { + "epoch": 0.5211676010052194, + "grad_norm": 4.112985134124756, + "learning_rate": 4.590445640966892e-06, + "logits/chosen": 6.5976457595825195, + "logits/rejected": 5.435474872589111, + "logps/chosen": -232.96754455566406, + "logps/rejected": -203.96286010742188, + "loss": 0.7434, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.21193014085292816, + "rewards/margins": 0.00992593914270401, + "rewards/rejected": 0.20200420916080475, + "step": 3370 + }, + { + "epoch": 0.5213222501449836, + "grad_norm": 6.024825572967529, + "learning_rate": 4.590159239317219e-06, + "logits/chosen": 10.6073637008667, + "logits/rejected": 8.498064041137695, + "logps/chosen": -237.94537353515625, + "logps/rejected": -226.89024353027344, + "loss": 0.5978, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.39795076847076416, + "rewards/margins": 0.23435693979263306, + "rewards/rejected": 0.16359379887580872, + "step": 3371 + }, + { + "epoch": 0.5214768992847477, + "grad_norm": 5.085453510284424, + "learning_rate": 4.589872837667545e-06, + "logits/chosen": 9.58914852142334, + "logits/rejected": 8.148595809936523, + "logps/chosen": -299.7940368652344, + "logps/rejected": -241.12063598632812, + "loss": 0.4255, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.38754168152809143, + "rewards/margins": 0.7746517062187195, + "rewards/rejected": -0.38711005449295044, + "step": 3372 + }, + { + "epoch": 0.5216315484245119, + "grad_norm": 4.598538875579834, + "learning_rate": 4.589586436017872e-06, + "logits/chosen": 15.125432968139648, + "logits/rejected": 10.811992645263672, + "logps/chosen": -261.6843566894531, + "logps/rejected": -214.27566528320312, + "loss": 0.5474, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.4445863962173462, + "rewards/margins": 0.391093909740448, + "rewards/rejected": 0.05349244922399521, + "step": 3373 + }, + { + "epoch": 0.521786197564276, + "grad_norm": 6.244147777557373, + "learning_rate": 4.589300034368198e-06, + "logits/chosen": 15.704962730407715, + "logits/rejected": 7.775631904602051, + "logps/chosen": -323.9530944824219, + "logps/rejected": -188.38279724121094, + "loss": 0.6641, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.15005464851856232, + "rewards/margins": 0.17448121309280396, + "rewards/rejected": -0.3245358467102051, + "step": 3374 + }, + { + "epoch": 0.5219408467040402, + "grad_norm": 6.626112461090088, + "learning_rate": 4.589013632718524e-06, + "logits/chosen": 11.426918029785156, + "logits/rejected": 6.568182945251465, + "logps/chosen": -284.2164001464844, + "logps/rejected": -270.18804931640625, + "loss": 0.7606, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.2913309931755066, + "rewards/margins": -0.08676622062921524, + "rewards/rejected": 0.3780972361564636, + "step": 3375 + }, + { + "epoch": 0.5220954958438043, + "grad_norm": 6.0998640060424805, + "learning_rate": 4.588727231068851e-06, + "logits/chosen": 7.51963996887207, + "logits/rejected": 11.44102668762207, + "logps/chosen": -272.22381591796875, + "logps/rejected": -297.0274658203125, + "loss": 0.7891, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.07508973777294159, + "rewards/margins": -0.09064282476902008, + "rewards/rejected": 0.16573259234428406, + "step": 3376 + }, + { + "epoch": 0.5222501449835685, + "grad_norm": 6.202936172485352, + "learning_rate": 4.588440829419178e-06, + "logits/chosen": 17.11812400817871, + "logits/rejected": 7.117197036743164, + "logps/chosen": -509.7786560058594, + "logps/rejected": -291.5841979980469, + "loss": 0.5102, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.5514532327651978, + "rewards/margins": 0.664260983467102, + "rewards/rejected": -0.1128077581524849, + "step": 3377 + }, + { + "epoch": 0.5224047941233327, + "grad_norm": 5.566222190856934, + "learning_rate": 4.588154427769504e-06, + "logits/chosen": 12.539167404174805, + "logits/rejected": 4.024435043334961, + "logps/chosen": -279.06842041015625, + "logps/rejected": -186.0841827392578, + "loss": 0.7426, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.1587502360343933, + "rewards/margins": -0.03681080415844917, + "rewards/rejected": 0.19556105136871338, + "step": 3378 + }, + { + "epoch": 0.5225594432630969, + "grad_norm": 5.379266738891602, + "learning_rate": 4.587868026119831e-06, + "logits/chosen": 10.733598709106445, + "logits/rejected": 4.975314140319824, + "logps/chosen": -332.7852783203125, + "logps/rejected": -287.1891784667969, + "loss": 0.5737, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.2933945953845978, + "rewards/margins": 0.4398816227912903, + "rewards/rejected": -0.1464870274066925, + "step": 3379 + }, + { + "epoch": 0.522714092402861, + "grad_norm": 6.186374664306641, + "learning_rate": 4.587581624470157e-06, + "logits/chosen": 11.564859390258789, + "logits/rejected": 4.0376667976379395, + "logps/chosen": -571.50341796875, + "logps/rejected": -389.6288757324219, + "loss": 0.5752, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.8618119955062866, + "rewards/margins": 0.44151562452316284, + "rewards/rejected": 0.42029643058776855, + "step": 3380 + }, + { + "epoch": 0.5228687415426252, + "grad_norm": 7.375655651092529, + "learning_rate": 4.5872952228204835e-06, + "logits/chosen": 10.973668098449707, + "logits/rejected": 11.470388412475586, + "logps/chosen": -261.96881103515625, + "logps/rejected": -259.1724548339844, + "loss": 0.7135, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.10988417267799377, + "rewards/margins": 0.1347062885761261, + "rewards/rejected": -0.024822130799293518, + "step": 3381 + }, + { + "epoch": 0.5230233906823893, + "grad_norm": 6.438285827636719, + "learning_rate": 4.58700882117081e-06, + "logits/chosen": 6.218245506286621, + "logits/rejected": 6.388080596923828, + "logps/chosen": -267.11468505859375, + "logps/rejected": -264.1803894042969, + "loss": 0.7052, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.22538027167320251, + "rewards/margins": 0.21441617608070374, + "rewards/rejected": 0.01096411794424057, + "step": 3382 + }, + { + "epoch": 0.5231780398221535, + "grad_norm": 5.4786272048950195, + "learning_rate": 4.586722419521137e-06, + "logits/chosen": 4.929106712341309, + "logits/rejected": 5.664292812347412, + "logps/chosen": -212.44558715820312, + "logps/rejected": -242.03884887695312, + "loss": 0.8249, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.12968482077121735, + "rewards/margins": -0.06922565400600433, + "rewards/rejected": 0.19891047477722168, + "step": 3383 + }, + { + "epoch": 0.5233326889619176, + "grad_norm": 4.984866142272949, + "learning_rate": 4.5864360178714634e-06, + "logits/chosen": 8.332356452941895, + "logits/rejected": 4.080158710479736, + "logps/chosen": -284.6562805175781, + "logps/rejected": -196.8278350830078, + "loss": 0.765, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.23435664176940918, + "rewards/margins": -0.049413904547691345, + "rewards/rejected": 0.2837705612182617, + "step": 3384 + }, + { + "epoch": 0.5234873381016818, + "grad_norm": 5.085991859436035, + "learning_rate": 4.58614961622179e-06, + "logits/chosen": 10.116312026977539, + "logits/rejected": 3.499004364013672, + "logps/chosen": -246.60089111328125, + "logps/rejected": -172.03099060058594, + "loss": 0.7529, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.2066326141357422, + "rewards/margins": 0.06853661686182022, + "rewards/rejected": 0.13809600472450256, + "step": 3385 + }, + { + "epoch": 0.523641987241446, + "grad_norm": 7.175114631652832, + "learning_rate": 4.585863214572117e-06, + "logits/chosen": 10.580463409423828, + "logits/rejected": 8.281279563903809, + "logps/chosen": -340.380615234375, + "logps/rejected": -265.46368408203125, + "loss": 0.6188, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.3850410580635071, + "rewards/margins": 0.3041542172431946, + "rewards/rejected": 0.0808868482708931, + "step": 3386 + }, + { + "epoch": 0.5237966363812101, + "grad_norm": 7.444200038909912, + "learning_rate": 4.5855768129224425e-06, + "logits/chosen": 12.52717113494873, + "logits/rejected": 5.678231239318848, + "logps/chosen": -221.00648498535156, + "logps/rejected": -146.95504760742188, + "loss": 0.6194, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.11436930298805237, + "rewards/margins": 0.2839950621128082, + "rewards/rejected": -0.16962575912475586, + "step": 3387 + }, + { + "epoch": 0.5239512855209743, + "grad_norm": 3.909456729888916, + "learning_rate": 4.585290411272769e-06, + "logits/chosen": 13.567419052124023, + "logits/rejected": 5.5791850090026855, + "logps/chosen": -289.1203308105469, + "logps/rejected": -195.16036987304688, + "loss": 0.4575, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.37114086747169495, + "rewards/margins": 0.6506978273391724, + "rewards/rejected": -0.27955693006515503, + "step": 3388 + }, + { + "epoch": 0.5241059346607384, + "grad_norm": 5.847412109375, + "learning_rate": 4.585004009623096e-06, + "logits/chosen": 12.366923332214355, + "logits/rejected": 11.392325401306152, + "logps/chosen": -413.7122802734375, + "logps/rejected": -339.08795166015625, + "loss": 0.5241, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.7155740261077881, + "rewards/margins": 0.6349714994430542, + "rewards/rejected": 0.08060257136821747, + "step": 3389 + }, + { + "epoch": 0.5242605838005026, + "grad_norm": 9.251568794250488, + "learning_rate": 4.5847176079734225e-06, + "logits/chosen": 6.035253524780273, + "logits/rejected": 6.236891269683838, + "logps/chosen": -358.50164794921875, + "logps/rejected": -369.5618896484375, + "loss": 0.8535, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.30767059326171875, + "rewards/margins": -0.08816948533058167, + "rewards/rejected": 0.3958400785923004, + "step": 3390 + }, + { + "epoch": 0.5244152329402668, + "grad_norm": 8.34519100189209, + "learning_rate": 4.584431206323749e-06, + "logits/chosen": 7.675492286682129, + "logits/rejected": 5.672919273376465, + "logps/chosen": -238.48988342285156, + "logps/rejected": -189.22886657714844, + "loss": 0.5139, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.4910629093647003, + "rewards/margins": 0.5237280130386353, + "rewards/rejected": -0.03266506642103195, + "step": 3391 + }, + { + "epoch": 0.524569882080031, + "grad_norm": 3.7723278999328613, + "learning_rate": 4.584144804674076e-06, + "logits/chosen": 7.779267311096191, + "logits/rejected": 3.61149001121521, + "logps/chosen": -201.44027709960938, + "logps/rejected": -175.6761474609375, + "loss": 0.4699, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.4658648371696472, + "rewards/margins": 0.5744634866714478, + "rewards/rejected": -0.10859866440296173, + "step": 3392 + }, + { + "epoch": 0.5247245312197951, + "grad_norm": 5.506012916564941, + "learning_rate": 4.583858403024402e-06, + "logits/chosen": 9.021503448486328, + "logits/rejected": 4.083602428436279, + "logps/chosen": -348.5492248535156, + "logps/rejected": -223.51123046875, + "loss": 0.763, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.17847681045532227, + "rewards/margins": 0.0058135539293289185, + "rewards/rejected": 0.17266327142715454, + "step": 3393 + }, + { + "epoch": 0.5248791803595593, + "grad_norm": 4.694247245788574, + "learning_rate": 4.583572001374728e-06, + "logits/chosen": 4.244956970214844, + "logits/rejected": 7.447844505310059, + "logps/chosen": -195.48538208007812, + "logps/rejected": -140.19839477539062, + "loss": 0.6707, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.05313679203391075, + "rewards/margins": 0.1356012225151062, + "rewards/rejected": -0.08246441185474396, + "step": 3394 + }, + { + "epoch": 0.5250338294993234, + "grad_norm": 5.010329723358154, + "learning_rate": 4.583285599725055e-06, + "logits/chosen": 6.858229637145996, + "logits/rejected": 7.083197593688965, + "logps/chosen": -181.20245361328125, + "logps/rejected": -239.50001525878906, + "loss": 0.7389, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.08626832813024521, + "rewards/margins": 0.036448098719120026, + "rewards/rejected": -0.12271644175052643, + "step": 3395 + }, + { + "epoch": 0.5251884786390876, + "grad_norm": 4.788662433624268, + "learning_rate": 4.5829991980753815e-06, + "logits/chosen": 5.641383171081543, + "logits/rejected": 1.2096834182739258, + "logps/chosen": -355.52496337890625, + "logps/rejected": -265.81683349609375, + "loss": 0.5497, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.7591531276702881, + "rewards/margins": 0.43359097838401794, + "rewards/rejected": 0.32556214928627014, + "step": 3396 + }, + { + "epoch": 0.5253431277788517, + "grad_norm": 6.415170192718506, + "learning_rate": 4.582712796425708e-06, + "logits/chosen": 8.305414199829102, + "logits/rejected": 7.885847091674805, + "logps/chosen": -283.60858154296875, + "logps/rejected": -255.82012939453125, + "loss": 0.8096, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.3064641058444977, + "rewards/margins": -0.15541419386863708, + "rewards/rejected": 0.46187829971313477, + "step": 3397 + }, + { + "epoch": 0.5254977769186159, + "grad_norm": 8.35158634185791, + "learning_rate": 4.582426394776035e-06, + "logits/chosen": 12.839118003845215, + "logits/rejected": 7.774646759033203, + "logps/chosen": -310.63714599609375, + "logps/rejected": -212.08917236328125, + "loss": 0.6301, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.19210198521614075, + "rewards/margins": 0.4572628140449524, + "rewards/rejected": -0.26516082882881165, + "step": 3398 + }, + { + "epoch": 0.52565242605838, + "grad_norm": 4.139445781707764, + "learning_rate": 4.582139993126361e-06, + "logits/chosen": 11.815128326416016, + "logits/rejected": 7.3945841789245605, + "logps/chosen": -334.0032043457031, + "logps/rejected": -219.86627197265625, + "loss": 0.4836, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.44902172684669495, + "rewards/margins": 0.6782470941543579, + "rewards/rejected": -0.22922533750534058, + "step": 3399 + }, + { + "epoch": 0.5258070751981442, + "grad_norm": 4.127014636993408, + "learning_rate": 4.581853591476687e-06, + "logits/chosen": 11.822992324829102, + "logits/rejected": 4.1561737060546875, + "logps/chosen": -292.7979431152344, + "logps/rejected": -124.76052856445312, + "loss": 0.5117, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.1704309582710266, + "rewards/margins": 0.4945645034313202, + "rewards/rejected": -0.32413357496261597, + "step": 3400 + }, + { + "epoch": 0.5259617243379083, + "grad_norm": 6.6358256340026855, + "learning_rate": 4.581567189827014e-06, + "logits/chosen": 4.832671642303467, + "logits/rejected": 8.779804229736328, + "logps/chosen": -182.05307006835938, + "logps/rejected": -247.2474365234375, + "loss": 0.6533, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.42966824769973755, + "rewards/margins": 0.130997434258461, + "rewards/rejected": 0.29867082834243774, + "step": 3401 + }, + { + "epoch": 0.5261163734776725, + "grad_norm": 4.693929195404053, + "learning_rate": 4.581280788177341e-06, + "logits/chosen": 10.109063148498535, + "logits/rejected": 9.585689544677734, + "logps/chosen": -203.67630004882812, + "logps/rejected": -221.12509155273438, + "loss": 0.6105, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.36936986446380615, + "rewards/margins": 0.34170639514923096, + "rewards/rejected": 0.02766351029276848, + "step": 3402 + }, + { + "epoch": 0.5262710226174367, + "grad_norm": 4.989546298980713, + "learning_rate": 4.580994386527666e-06, + "logits/chosen": 12.533157348632812, + "logits/rejected": 4.344411849975586, + "logps/chosen": -309.76190185546875, + "logps/rejected": -215.86572265625, + "loss": 0.6672, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.24339275062084198, + "rewards/margins": 0.4155833125114441, + "rewards/rejected": -0.1721905767917633, + "step": 3403 + }, + { + "epoch": 0.5264256717572009, + "grad_norm": 6.5985026359558105, + "learning_rate": 4.580707984877993e-06, + "logits/chosen": 9.559341430664062, + "logits/rejected": 7.2042388916015625, + "logps/chosen": -429.82208251953125, + "logps/rejected": -311.39715576171875, + "loss": 0.6336, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.5456064343452454, + "rewards/margins": 0.6279048919677734, + "rewards/rejected": -0.08229847252368927, + "step": 3404 + }, + { + "epoch": 0.526580320896965, + "grad_norm": 5.497079849243164, + "learning_rate": 4.58042158322832e-06, + "logits/chosen": 7.990666389465332, + "logits/rejected": 13.559745788574219, + "logps/chosen": -259.53070068359375, + "logps/rejected": -371.3773193359375, + "loss": 0.7, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.18173199892044067, + "rewards/margins": 0.040539830923080444, + "rewards/rejected": 0.14119216799736023, + "step": 3405 + }, + { + "epoch": 0.5267349700367292, + "grad_norm": 6.048287391662598, + "learning_rate": 4.580135181578646e-06, + "logits/chosen": 12.805288314819336, + "logits/rejected": 5.341282844543457, + "logps/chosen": -263.8695068359375, + "logps/rejected": -167.55691528320312, + "loss": 0.6966, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.24285611510276794, + "rewards/margins": 0.027521885931491852, + "rewards/rejected": 0.2153342217206955, + "step": 3406 + }, + { + "epoch": 0.5268896191764934, + "grad_norm": 4.9026336669921875, + "learning_rate": 4.579848779928973e-06, + "logits/chosen": 17.73627471923828, + "logits/rejected": 9.94194507598877, + "logps/chosen": -448.31109619140625, + "logps/rejected": -341.8060302734375, + "loss": 0.4685, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.5143506526947021, + "rewards/margins": 0.5725738406181335, + "rewards/rejected": -0.05822315439581871, + "step": 3407 + }, + { + "epoch": 0.5270442683162575, + "grad_norm": 6.575826168060303, + "learning_rate": 4.579562378279299e-06, + "logits/chosen": 8.31064224243164, + "logits/rejected": 5.723313331604004, + "logps/chosen": -377.745361328125, + "logps/rejected": -327.5417175292969, + "loss": 0.5335, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.5305577516555786, + "rewards/margins": 0.4008696675300598, + "rewards/rejected": 0.12968809902668, + "step": 3408 + }, + { + "epoch": 0.5271989174560217, + "grad_norm": 8.528643608093262, + "learning_rate": 4.5792759766296255e-06, + "logits/chosen": 11.447997093200684, + "logits/rejected": 15.734691619873047, + "logps/chosen": -321.5065002441406, + "logps/rejected": -374.7230224609375, + "loss": 0.8825, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.09364481270313263, + "rewards/margins": -0.1999102532863617, + "rewards/rejected": 0.29355505108833313, + "step": 3409 + }, + { + "epoch": 0.5273535665957858, + "grad_norm": 6.614365100860596, + "learning_rate": 4.578989574979952e-06, + "logits/chosen": 0.38496047258377075, + "logits/rejected": 2.333106756210327, + "logps/chosen": -209.05752563476562, + "logps/rejected": -259.7377014160156, + "loss": 0.8604, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.050031304359436035, + "rewards/margins": 0.10323281586170197, + "rewards/rejected": -0.053201496601104736, + "step": 3410 + }, + { + "epoch": 0.52750821573555, + "grad_norm": 5.590544700622559, + "learning_rate": 4.578703173330279e-06, + "logits/chosen": 4.341080188751221, + "logits/rejected": 2.4473624229431152, + "logps/chosen": -257.56475830078125, + "logps/rejected": -172.60374450683594, + "loss": 0.7988, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.1192631721496582, + "rewards/margins": -0.10656829178333282, + "rewards/rejected": 0.22583146393299103, + "step": 3411 + }, + { + "epoch": 0.5276628648753141, + "grad_norm": 5.8873186111450195, + "learning_rate": 4.578416771680605e-06, + "logits/chosen": 9.85287094116211, + "logits/rejected": 6.649930953979492, + "logps/chosen": -464.3546142578125, + "logps/rejected": -346.43798828125, + "loss": 0.4718, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.6190974116325378, + "rewards/margins": 0.5632246136665344, + "rewards/rejected": 0.055872876197099686, + "step": 3412 + }, + { + "epoch": 0.5278175140150783, + "grad_norm": 5.588597297668457, + "learning_rate": 4.578130370030931e-06, + "logits/chosen": 9.938053131103516, + "logits/rejected": 0.3761177659034729, + "logps/chosen": -347.6468200683594, + "logps/rejected": -237.06393432617188, + "loss": 0.6549, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.40525197982788086, + "rewards/margins": 0.13223856687545776, + "rewards/rejected": 0.2730134129524231, + "step": 3413 + }, + { + "epoch": 0.5279721631548424, + "grad_norm": 12.130578994750977, + "learning_rate": 4.577843968381258e-06, + "logits/chosen": 10.577235221862793, + "logits/rejected": 11.355010986328125, + "logps/chosen": -255.88067626953125, + "logps/rejected": -193.0601806640625, + "loss": 0.6929, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.3199271559715271, + "rewards/margins": 0.22729697823524475, + "rewards/rejected": 0.09263019263744354, + "step": 3414 + }, + { + "epoch": 0.5281268122946066, + "grad_norm": 6.261444091796875, + "learning_rate": 4.5775575667315845e-06, + "logits/chosen": 8.687597274780273, + "logits/rejected": -0.03992784023284912, + "logps/chosen": -406.1783447265625, + "logps/rejected": -225.51840209960938, + "loss": 0.5584, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.6052923798561096, + "rewards/margins": 0.419197678565979, + "rewards/rejected": 0.18609467148780823, + "step": 3415 + }, + { + "epoch": 0.5282814614343708, + "grad_norm": 5.070169925689697, + "learning_rate": 4.577271165081911e-06, + "logits/chosen": 8.9213228225708, + "logits/rejected": 7.145798683166504, + "logps/chosen": -160.59693908691406, + "logps/rejected": -112.22850799560547, + "loss": 0.6627, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.095132976770401, + "rewards/margins": 0.11526663601398468, + "rewards/rejected": -0.02013365551829338, + "step": 3416 + }, + { + "epoch": 0.528436110574135, + "grad_norm": 5.485719203948975, + "learning_rate": 4.576984763432238e-06, + "logits/chosen": 11.692569732666016, + "logits/rejected": 9.728137016296387, + "logps/chosen": -276.0697937011719, + "logps/rejected": -222.04473876953125, + "loss": 0.6553, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.400642067193985, + "rewards/margins": 0.09185998141765594, + "rewards/rejected": 0.30878210067749023, + "step": 3417 + }, + { + "epoch": 0.5285907597138991, + "grad_norm": 5.471063613891602, + "learning_rate": 4.5766983617825645e-06, + "logits/chosen": 14.95722770690918, + "logits/rejected": 12.853141784667969, + "logps/chosen": -306.56756591796875, + "logps/rejected": -268.8677673339844, + "loss": 0.6376, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.19057464599609375, + "rewards/margins": 0.17712116241455078, + "rewards/rejected": 0.013453483581542969, + "step": 3418 + }, + { + "epoch": 0.5287454088536633, + "grad_norm": 5.295947551727295, + "learning_rate": 4.57641196013289e-06, + "logits/chosen": 8.691012382507324, + "logits/rejected": 2.7754409313201904, + "logps/chosen": -253.4430694580078, + "logps/rejected": -248.24066162109375, + "loss": 0.5742, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.41307583451271057, + "rewards/margins": 0.45983076095581055, + "rewards/rejected": -0.04675493389368057, + "step": 3419 + }, + { + "epoch": 0.5289000579934274, + "grad_norm": 5.374020099639893, + "learning_rate": 4.576125558483217e-06, + "logits/chosen": 7.413866996765137, + "logits/rejected": 5.250053405761719, + "logps/chosen": -220.97207641601562, + "logps/rejected": -230.5499267578125, + "loss": 0.5933, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.46011465787887573, + "rewards/margins": 0.24127589166164398, + "rewards/rejected": 0.21883878111839294, + "step": 3420 + }, + { + "epoch": 0.5290547071331916, + "grad_norm": 4.615992069244385, + "learning_rate": 4.5758391568335436e-06, + "logits/chosen": 9.28696346282959, + "logits/rejected": 7.412108421325684, + "logps/chosen": -220.29551696777344, + "logps/rejected": -184.72955322265625, + "loss": 0.6557, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.009767621755599976, + "rewards/margins": 0.21574127674102783, + "rewards/rejected": -0.20597365498542786, + "step": 3421 + }, + { + "epoch": 0.5292093562729557, + "grad_norm": 10.78402328491211, + "learning_rate": 4.57555275518387e-06, + "logits/chosen": 7.141679763793945, + "logits/rejected": 1.6123631000518799, + "logps/chosen": -270.43634033203125, + "logps/rejected": -275.4430236816406, + "loss": 0.7058, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.11122465878725052, + "rewards/margins": 0.05174332112073898, + "rewards/rejected": 0.05948131904006004, + "step": 3422 + }, + { + "epoch": 0.5293640054127199, + "grad_norm": 6.367552280426025, + "learning_rate": 4.575266353534197e-06, + "logits/chosen": 8.804319381713867, + "logits/rejected": 9.815238952636719, + "logps/chosen": -401.929443359375, + "logps/rejected": -480.4015197753906, + "loss": 0.5995, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.19442030787467957, + "rewards/margins": 0.22161945700645447, + "rewards/rejected": -0.027199167758226395, + "step": 3423 + }, + { + "epoch": 0.529518654552484, + "grad_norm": 5.090580463409424, + "learning_rate": 4.5749799518845235e-06, + "logits/chosen": 8.891847610473633, + "logits/rejected": 9.425918579101562, + "logps/chosen": -274.53045654296875, + "logps/rejected": -257.40093994140625, + "loss": 0.7402, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.2104538083076477, + "rewards/margins": -0.07750041037797928, + "rewards/rejected": 0.2879542410373688, + "step": 3424 + }, + { + "epoch": 0.5296733036922482, + "grad_norm": 4.47688102722168, + "learning_rate": 4.57469355023485e-06, + "logits/chosen": 4.2271246910095215, + "logits/rejected": 0.5890989303588867, + "logps/chosen": -290.80743408203125, + "logps/rejected": -184.19851684570312, + "loss": 0.7448, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.17444077134132385, + "rewards/margins": 0.025738459080457687, + "rewards/rejected": 0.14870232343673706, + "step": 3425 + }, + { + "epoch": 0.5298279528320123, + "grad_norm": 5.596575736999512, + "learning_rate": 4.574407148585176e-06, + "logits/chosen": 8.294352531433105, + "logits/rejected": 3.2890946865081787, + "logps/chosen": -211.54519653320312, + "logps/rejected": -205.9228515625, + "loss": 0.6152, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.11202490329742432, + "rewards/margins": 0.2089504450559616, + "rewards/rejected": -0.09692555665969849, + "step": 3426 + }, + { + "epoch": 0.5299826019717765, + "grad_norm": 6.116458892822266, + "learning_rate": 4.574120746935503e-06, + "logits/chosen": 7.7513203620910645, + "logits/rejected": 9.854928970336914, + "logps/chosen": -206.88775634765625, + "logps/rejected": -222.33245849609375, + "loss": 0.7963, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.3991096615791321, + "rewards/margins": -0.06701162457466125, + "rewards/rejected": 0.46612128615379333, + "step": 3427 + }, + { + "epoch": 0.5301372511115406, + "grad_norm": 4.590334892272949, + "learning_rate": 4.573834345285829e-06, + "logits/chosen": 12.637147903442383, + "logits/rejected": 8.044875144958496, + "logps/chosen": -279.09881591796875, + "logps/rejected": -281.741455078125, + "loss": 0.5024, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.3421364724636078, + "rewards/margins": 0.4927014112472534, + "rewards/rejected": -0.15056495368480682, + "step": 3428 + }, + { + "epoch": 0.5302919002513049, + "grad_norm": 5.477031707763672, + "learning_rate": 4.573547943636156e-06, + "logits/chosen": 8.205026626586914, + "logits/rejected": 5.203507423400879, + "logps/chosen": -258.4432678222656, + "logps/rejected": -204.49960327148438, + "loss": 0.8799, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.04098685085773468, + "rewards/margins": -0.08836564421653748, + "rewards/rejected": 0.04737880453467369, + "step": 3429 + }, + { + "epoch": 0.530446549391069, + "grad_norm": 5.692229270935059, + "learning_rate": 4.5732615419864826e-06, + "logits/chosen": 10.317362785339355, + "logits/rejected": 4.045015811920166, + "logps/chosen": -328.2392272949219, + "logps/rejected": -254.13131713867188, + "loss": 0.7093, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.4088035821914673, + "rewards/margins": 0.18261361122131348, + "rewards/rejected": 0.22618995606899261, + "step": 3430 + }, + { + "epoch": 0.5306011985308332, + "grad_norm": 4.959759712219238, + "learning_rate": 4.572975140336809e-06, + "logits/chosen": 8.069620132446289, + "logits/rejected": 8.604790687561035, + "logps/chosen": -246.03280639648438, + "logps/rejected": -237.962158203125, + "loss": 0.6758, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.2610960006713867, + "rewards/margins": 0.07367493957281113, + "rewards/rejected": 0.187421053647995, + "step": 3431 + }, + { + "epoch": 0.5307558476705974, + "grad_norm": 4.526830673217773, + "learning_rate": 4.572688738687136e-06, + "logits/chosen": 11.776190757751465, + "logits/rejected": 10.155450820922852, + "logps/chosen": -236.45068359375, + "logps/rejected": -173.79054260253906, + "loss": 0.6356, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.3019082546234131, + "rewards/margins": 0.15942107141017914, + "rewards/rejected": 0.14248719811439514, + "step": 3432 + }, + { + "epoch": 0.5309104968103615, + "grad_norm": 7.972452640533447, + "learning_rate": 4.572402337037462e-06, + "logits/chosen": 10.512863159179688, + "logits/rejected": 14.09266185760498, + "logps/chosen": -246.50445556640625, + "logps/rejected": -273.2292785644531, + "loss": 0.7721, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.1766868233680725, + "rewards/margins": -0.11256174743175507, + "rewards/rejected": 0.2892485558986664, + "step": 3433 + }, + { + "epoch": 0.5310651459501257, + "grad_norm": 3.793704032897949, + "learning_rate": 4.572115935387788e-06, + "logits/chosen": 10.908559799194336, + "logits/rejected": 11.356244087219238, + "logps/chosen": -166.76580810546875, + "logps/rejected": -197.5608673095703, + "loss": 0.7117, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.1815110296010971, + "rewards/margins": 0.019025102257728577, + "rewards/rejected": 0.16248592734336853, + "step": 3434 + }, + { + "epoch": 0.5312197950898898, + "grad_norm": 5.472989082336426, + "learning_rate": 4.571829533738115e-06, + "logits/chosen": 12.45029067993164, + "logits/rejected": 11.325197219848633, + "logps/chosen": -315.3360595703125, + "logps/rejected": -245.63270568847656, + "loss": 0.6477, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.4961656332015991, + "rewards/margins": 0.21825763583183289, + "rewards/rejected": 0.27790799736976624, + "step": 3435 + }, + { + "epoch": 0.531374444229654, + "grad_norm": 3.6238322257995605, + "learning_rate": 4.571543132088442e-06, + "logits/chosen": 10.989975929260254, + "logits/rejected": 10.514347076416016, + "logps/chosen": -161.6402587890625, + "logps/rejected": -210.82290649414062, + "loss": 0.5314, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.15777117013931274, + "rewards/margins": 0.3882730305194855, + "rewards/rejected": -0.23050186038017273, + "step": 3436 + }, + { + "epoch": 0.5315290933694181, + "grad_norm": 5.02354097366333, + "learning_rate": 4.5712567304387674e-06, + "logits/chosen": 13.740961074829102, + "logits/rejected": 5.96253776550293, + "logps/chosen": -337.1373596191406, + "logps/rejected": -210.3853759765625, + "loss": 0.5512, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.3758665919303894, + "rewards/margins": 0.5395940542221069, + "rewards/rejected": -0.16372743248939514, + "step": 3437 + }, + { + "epoch": 0.5316837425091823, + "grad_norm": 6.157371520996094, + "learning_rate": 4.570970328789094e-06, + "logits/chosen": 10.211882591247559, + "logits/rejected": 9.94230842590332, + "logps/chosen": -374.38336181640625, + "logps/rejected": -501.30780029296875, + "loss": 0.4766, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.7491763234138489, + "rewards/margins": 0.6523413062095642, + "rewards/rejected": 0.09683503210544586, + "step": 3438 + }, + { + "epoch": 0.5318383916489464, + "grad_norm": 5.146474838256836, + "learning_rate": 4.570683927139421e-06, + "logits/chosen": 9.268468856811523, + "logits/rejected": 2.533730983734131, + "logps/chosen": -216.64859008789062, + "logps/rejected": -178.5709991455078, + "loss": 0.5518, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.11346770077943802, + "rewards/margins": 0.35315102338790894, + "rewards/rejected": -0.23968330025672913, + "step": 3439 + }, + { + "epoch": 0.5319930407887106, + "grad_norm": 4.453693866729736, + "learning_rate": 4.570397525489747e-06, + "logits/chosen": 11.58981704711914, + "logits/rejected": 13.036358833312988, + "logps/chosen": -186.1175079345703, + "logps/rejected": -180.80081176757812, + "loss": 0.6395, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.01144113577902317, + "rewards/margins": 0.17896823585033417, + "rewards/rejected": -0.1904093623161316, + "step": 3440 + }, + { + "epoch": 0.5321476899284747, + "grad_norm": 4.810494422912598, + "learning_rate": 4.570111123840073e-06, + "logits/chosen": 8.736467361450195, + "logits/rejected": 6.427340030670166, + "logps/chosen": -297.27703857421875, + "logps/rejected": -246.56668090820312, + "loss": 0.5648, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.3800322115421295, + "rewards/margins": 0.4904134273529053, + "rewards/rejected": -0.11038121581077576, + "step": 3441 + }, + { + "epoch": 0.532302339068239, + "grad_norm": 5.815372943878174, + "learning_rate": 4.5698247221904e-06, + "logits/chosen": 8.633378982543945, + "logits/rejected": 14.381312370300293, + "logps/chosen": -230.50863647460938, + "logps/rejected": -365.73797607421875, + "loss": 0.5441, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.4519106149673462, + "rewards/margins": 0.5051904320716858, + "rewards/rejected": -0.05327984690666199, + "step": 3442 + }, + { + "epoch": 0.5324569882080031, + "grad_norm": 34.49391555786133, + "learning_rate": 4.5695383205407265e-06, + "logits/chosen": 8.620685577392578, + "logits/rejected": 7.740238666534424, + "logps/chosen": -245.320068359375, + "logps/rejected": -274.30926513671875, + "loss": 0.5606, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.39388230443000793, + "rewards/margins": 0.34456202387809753, + "rewards/rejected": 0.04932032525539398, + "step": 3443 + }, + { + "epoch": 0.5326116373477673, + "grad_norm": 5.060355186462402, + "learning_rate": 4.569251918891053e-06, + "logits/chosen": 18.39594268798828, + "logits/rejected": 10.890106201171875, + "logps/chosen": -284.83502197265625, + "logps/rejected": -254.40145874023438, + "loss": 0.6, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.8119302988052368, + "rewards/margins": 0.24286359548568726, + "rewards/rejected": 0.5690666437149048, + "step": 3444 + }, + { + "epoch": 0.5327662864875314, + "grad_norm": 6.999647617340088, + "learning_rate": 4.56896551724138e-06, + "logits/chosen": 8.147418022155762, + "logits/rejected": 5.280096054077148, + "logps/chosen": -318.1426696777344, + "logps/rejected": -175.38006591796875, + "loss": 0.6786, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.28220951557159424, + "rewards/margins": 0.06300004571676254, + "rewards/rejected": 0.2192094773054123, + "step": 3445 + }, + { + "epoch": 0.5329209356272956, + "grad_norm": 7.354351043701172, + "learning_rate": 4.568679115591706e-06, + "logits/chosen": 6.794188976287842, + "logits/rejected": 8.347644805908203, + "logps/chosen": -298.014404296875, + "logps/rejected": -269.04791259765625, + "loss": 0.8629, + "rewards/accuracies": 0.25, + "rewards/chosen": 0.33979547023773193, + "rewards/margins": -0.2117002308368683, + "rewards/rejected": 0.5514957308769226, + "step": 3446 + }, + { + "epoch": 0.5330755847670597, + "grad_norm": 5.101029872894287, + "learning_rate": 4.568392713942032e-06, + "logits/chosen": 9.201620101928711, + "logits/rejected": 3.855217456817627, + "logps/chosen": -300.31732177734375, + "logps/rejected": -262.1998291015625, + "loss": 0.4957, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.5049896240234375, + "rewards/margins": 0.45708370208740234, + "rewards/rejected": 0.047905925661325455, + "step": 3447 + }, + { + "epoch": 0.5332302339068239, + "grad_norm": 4.7378435134887695, + "learning_rate": 4.568106312292359e-06, + "logits/chosen": 11.092121124267578, + "logits/rejected": 6.4346137046813965, + "logps/chosen": -200.4429931640625, + "logps/rejected": -162.90960693359375, + "loss": 0.7474, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.19061753153800964, + "rewards/margins": 0.013008639216423035, + "rewards/rejected": 0.1776089072227478, + "step": 3448 + }, + { + "epoch": 0.533384883046588, + "grad_norm": 6.047914505004883, + "learning_rate": 4.5678199106426855e-06, + "logits/chosen": 4.059939861297607, + "logits/rejected": 6.663802623748779, + "logps/chosen": -211.39356994628906, + "logps/rejected": -272.5367431640625, + "loss": 0.6125, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.6574857234954834, + "rewards/margins": 0.22315870225429535, + "rewards/rejected": 0.43432706594467163, + "step": 3449 + }, + { + "epoch": 0.5335395321863522, + "grad_norm": 6.689841270446777, + "learning_rate": 4.567533508993012e-06, + "logits/chosen": 9.043456077575684, + "logits/rejected": 6.569450378417969, + "logps/chosen": -318.608154296875, + "logps/rejected": -347.499267578125, + "loss": 0.4927, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.38673263788223267, + "rewards/margins": 0.6726446151733398, + "rewards/rejected": -0.2859119474887848, + "step": 3450 + }, + { + "epoch": 0.5336941813261163, + "grad_norm": 4.430845260620117, + "learning_rate": 4.567247107343339e-06, + "logits/chosen": 17.865320205688477, + "logits/rejected": 3.8432884216308594, + "logps/chosen": -328.9352111816406, + "logps/rejected": -199.6381378173828, + "loss": 0.3972, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.6083768606185913, + "rewards/margins": 0.9470100402832031, + "rewards/rejected": -0.3386331796646118, + "step": 3451 + }, + { + "epoch": 0.5338488304658805, + "grad_norm": 3.575376272201538, + "learning_rate": 4.566960705693665e-06, + "logits/chosen": 9.262465476989746, + "logits/rejected": 6.304553031921387, + "logps/chosen": -197.23269653320312, + "logps/rejected": -126.91417694091797, + "loss": 0.5626, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.3719860911369324, + "rewards/margins": 0.3770997226238251, + "rewards/rejected": -0.005113624036312103, + "step": 3452 + }, + { + "epoch": 0.5340034796056446, + "grad_norm": 4.561135292053223, + "learning_rate": 4.566674304043991e-06, + "logits/chosen": 13.124151229858398, + "logits/rejected": 7.113224029541016, + "logps/chosen": -349.787841796875, + "logps/rejected": -271.4983825683594, + "loss": 0.4782, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.5332621932029724, + "rewards/margins": 0.7221713066101074, + "rewards/rejected": -0.1889091432094574, + "step": 3453 + }, + { + "epoch": 0.5341581287454088, + "grad_norm": 5.367588043212891, + "learning_rate": 4.566387902394318e-06, + "logits/chosen": 11.916638374328613, + "logits/rejected": 5.375942707061768, + "logps/chosen": -364.250244140625, + "logps/rejected": -239.3827667236328, + "loss": 0.7707, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.05872933566570282, + "rewards/margins": -0.05809955298900604, + "rewards/rejected": 0.11682890355587006, + "step": 3454 + }, + { + "epoch": 0.5343127778851731, + "grad_norm": 4.087586402893066, + "learning_rate": 4.566101500744645e-06, + "logits/chosen": 10.04790210723877, + "logits/rejected": 3.5160884857177734, + "logps/chosen": -282.19976806640625, + "logps/rejected": -177.74288940429688, + "loss": 0.5623, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.5670511722564697, + "rewards/margins": 0.32898226380348206, + "rewards/rejected": 0.23806887865066528, + "step": 3455 + }, + { + "epoch": 0.5344674270249372, + "grad_norm": 6.603880405426025, + "learning_rate": 4.565815099094971e-06, + "logits/chosen": 6.2787628173828125, + "logits/rejected": 7.528284072875977, + "logps/chosen": -241.634521484375, + "logps/rejected": -232.2600860595703, + "loss": 0.8805, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.09804821014404297, + "rewards/margins": -0.23695436120033264, + "rewards/rejected": 0.3350025713443756, + "step": 3456 + }, + { + "epoch": 0.5346220761647014, + "grad_norm": 5.394029140472412, + "learning_rate": 4.565528697445298e-06, + "logits/chosen": 9.487994194030762, + "logits/rejected": 3.9948291778564453, + "logps/chosen": -425.2696228027344, + "logps/rejected": -370.9013671875, + "loss": 0.4324, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.5432972311973572, + "rewards/margins": 0.8604394197463989, + "rewards/rejected": -0.31714215874671936, + "step": 3457 + }, + { + "epoch": 0.5347767253044655, + "grad_norm": 5.575886249542236, + "learning_rate": 4.5652422957956245e-06, + "logits/chosen": 10.354061126708984, + "logits/rejected": 9.322901725769043, + "logps/chosen": -323.078857421875, + "logps/rejected": -292.0771179199219, + "loss": 0.6504, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.558993935585022, + "rewards/margins": 0.23440010845661163, + "rewards/rejected": 0.32459378242492676, + "step": 3458 + }, + { + "epoch": 0.5349313744442297, + "grad_norm": 5.143550872802734, + "learning_rate": 4.56495589414595e-06, + "logits/chosen": 4.204448699951172, + "logits/rejected": 4.879000663757324, + "logps/chosen": -282.2350158691406, + "logps/rejected": -269.42864990234375, + "loss": 0.7038, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.5985470414161682, + "rewards/margins": 0.17674662172794342, + "rewards/rejected": 0.421800434589386, + "step": 3459 + }, + { + "epoch": 0.5350860235839938, + "grad_norm": 7.072441577911377, + "learning_rate": 4.564669492496277e-06, + "logits/chosen": 6.972856521606445, + "logits/rejected": 10.14965534210205, + "logps/chosen": -232.37599182128906, + "logps/rejected": -250.8720703125, + "loss": 1.0128, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.25090086460113525, + "rewards/margins": -0.4838963747024536, + "rewards/rejected": 0.7347972989082336, + "step": 3460 + }, + { + "epoch": 0.535240672723758, + "grad_norm": 5.6928486824035645, + "learning_rate": 4.564383090846604e-06, + "logits/chosen": 5.260196685791016, + "logits/rejected": 11.90127944946289, + "logps/chosen": -186.59771728515625, + "logps/rejected": -212.52285766601562, + "loss": 0.7685, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.06098297983407974, + "rewards/margins": -0.07548104226589203, + "rewards/rejected": 0.13646404445171356, + "step": 3461 + }, + { + "epoch": 0.5353953218635221, + "grad_norm": 9.665477752685547, + "learning_rate": 4.56409668919693e-06, + "logits/chosen": 7.550838947296143, + "logits/rejected": 5.062317848205566, + "logps/chosen": -312.83795166015625, + "logps/rejected": -275.38922119140625, + "loss": 0.5556, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.6526155471801758, + "rewards/margins": 0.3766680061817169, + "rewards/rejected": 0.27594754099845886, + "step": 3462 + }, + { + "epoch": 0.5355499710032863, + "grad_norm": 4.193955898284912, + "learning_rate": 4.563810287547257e-06, + "logits/chosen": 8.306144714355469, + "logits/rejected": 9.304515838623047, + "logps/chosen": -259.090087890625, + "logps/rejected": -236.70970153808594, + "loss": 0.488, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.41990602016448975, + "rewards/margins": 0.6112083792686462, + "rewards/rejected": -0.19130229949951172, + "step": 3463 + }, + { + "epoch": 0.5357046201430504, + "grad_norm": 4.4209184646606445, + "learning_rate": 4.563523885897584e-06, + "logits/chosen": 11.47598648071289, + "logits/rejected": 10.894248008728027, + "logps/chosen": -199.3096160888672, + "logps/rejected": -177.6346893310547, + "loss": 0.6981, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.20075112581253052, + "rewards/margins": 0.065799281001091, + "rewards/rejected": 0.13495182991027832, + "step": 3464 + }, + { + "epoch": 0.5358592692828146, + "grad_norm": 12.39144229888916, + "learning_rate": 4.56323748424791e-06, + "logits/chosen": 9.40106201171875, + "logits/rejected": 13.345853805541992, + "logps/chosen": -237.79360961914062, + "logps/rejected": -363.6505126953125, + "loss": 0.8116, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.06312908977270126, + "rewards/margins": -0.12420602142810822, + "rewards/rejected": 0.18733511865139008, + "step": 3465 + }, + { + "epoch": 0.5360139184225787, + "grad_norm": 6.134507179260254, + "learning_rate": 4.562951082598236e-06, + "logits/chosen": 3.9755682945251465, + "logits/rejected": 9.521167755126953, + "logps/chosen": -245.59780883789062, + "logps/rejected": -218.2001953125, + "loss": 0.9435, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.17030514776706696, + "rewards/margins": -0.36190271377563477, + "rewards/rejected": 0.5322078466415405, + "step": 3466 + }, + { + "epoch": 0.5361685675623429, + "grad_norm": 6.467967987060547, + "learning_rate": 4.562664680948563e-06, + "logits/chosen": 13.483763694763184, + "logits/rejected": 9.015125274658203, + "logps/chosen": -349.2707214355469, + "logps/rejected": -347.36566162109375, + "loss": 0.6221, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.8794007301330566, + "rewards/margins": 0.2796288728713989, + "rewards/rejected": 0.5997718572616577, + "step": 3467 + }, + { + "epoch": 0.5363232167021071, + "grad_norm": 4.400718688964844, + "learning_rate": 4.562378279298889e-06, + "logits/chosen": 11.813966751098633, + "logits/rejected": 10.170332908630371, + "logps/chosen": -340.02508544921875, + "logps/rejected": -283.0907897949219, + "loss": 0.5529, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.580676794052124, + "rewards/margins": 0.3901478350162506, + "rewards/rejected": 0.1905289590358734, + "step": 3468 + }, + { + "epoch": 0.5364778658418713, + "grad_norm": 4.829706192016602, + "learning_rate": 4.562091877649216e-06, + "logits/chosen": 11.952885627746582, + "logits/rejected": 5.693309783935547, + "logps/chosen": -429.1151123046875, + "logps/rejected": -335.4317932128906, + "loss": 0.4599, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.6229270696640015, + "rewards/margins": 0.6170374751091003, + "rewards/rejected": 0.005889661610126495, + "step": 3469 + }, + { + "epoch": 0.5366325149816354, + "grad_norm": 11.11780071258545, + "learning_rate": 4.561805475999543e-06, + "logits/chosen": 1.0482724905014038, + "logits/rejected": 4.3756303787231445, + "logps/chosen": -210.3115997314453, + "logps/rejected": -444.1589050292969, + "loss": 0.9004, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.23940812051296234, + "rewards/margins": 0.06473705172538757, + "rewards/rejected": 0.17467106878757477, + "step": 3470 + }, + { + "epoch": 0.5367871641213996, + "grad_norm": 5.7888946533203125, + "learning_rate": 4.5615190743498685e-06, + "logits/chosen": 6.986234664916992, + "logits/rejected": 2.7387969493865967, + "logps/chosen": -316.13995361328125, + "logps/rejected": -260.50909423828125, + "loss": 0.5771, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.6364717483520508, + "rewards/margins": 0.3485320210456848, + "rewards/rejected": 0.28793975710868835, + "step": 3471 + }, + { + "epoch": 0.5369418132611637, + "grad_norm": 4.900925159454346, + "learning_rate": 4.561232672700195e-06, + "logits/chosen": 7.6199846267700195, + "logits/rejected": 4.588237762451172, + "logps/chosen": -315.3199462890625, + "logps/rejected": -267.2480773925781, + "loss": 0.6199, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.4717210531234741, + "rewards/margins": 0.3848683834075928, + "rewards/rejected": 0.08685269951820374, + "step": 3472 + }, + { + "epoch": 0.5370964624009279, + "grad_norm": 5.508366584777832, + "learning_rate": 4.560946271050522e-06, + "logits/chosen": 9.255306243896484, + "logits/rejected": 7.784077167510986, + "logps/chosen": -371.81719970703125, + "logps/rejected": -295.12811279296875, + "loss": 0.685, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.5096185207366943, + "rewards/margins": 0.06269489973783493, + "rewards/rejected": 0.4469236433506012, + "step": 3473 + }, + { + "epoch": 0.537251111540692, + "grad_norm": 6.148151397705078, + "learning_rate": 4.560659869400848e-06, + "logits/chosen": 11.678831100463867, + "logits/rejected": 4.39997673034668, + "logps/chosen": -471.0237121582031, + "logps/rejected": -311.6003723144531, + "loss": 0.7037, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.4104408025741577, + "rewards/margins": 0.121530681848526, + "rewards/rejected": 0.2889100909233093, + "step": 3474 + }, + { + "epoch": 0.5374057606804562, + "grad_norm": 5.68243408203125, + "learning_rate": 4.560373467751174e-06, + "logits/chosen": 12.455754280090332, + "logits/rejected": 6.358894348144531, + "logps/chosen": -320.890625, + "logps/rejected": -246.291259765625, + "loss": 0.6719, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.40316808223724365, + "rewards/margins": 0.18727856874465942, + "rewards/rejected": 0.215889573097229, + "step": 3475 + }, + { + "epoch": 0.5375604098202204, + "grad_norm": 8.281245231628418, + "learning_rate": 4.560087066101501e-06, + "logits/chosen": 10.458235740661621, + "logits/rejected": 8.534299850463867, + "logps/chosen": -385.112060546875, + "logps/rejected": -315.11456298828125, + "loss": 0.8326, + "rewards/accuracies": 0.25, + "rewards/chosen": 0.313823401927948, + "rewards/margins": -0.14138521254062653, + "rewards/rejected": 0.4552086591720581, + "step": 3476 + }, + { + "epoch": 0.5377150589599845, + "grad_norm": 7.64885139465332, + "learning_rate": 4.5598006644518275e-06, + "logits/chosen": 10.93093490600586, + "logits/rejected": 9.699134826660156, + "logps/chosen": -151.10316467285156, + "logps/rejected": -153.74758911132812, + "loss": 0.6304, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.36934810876846313, + "rewards/margins": 0.32461434602737427, + "rewards/rejected": 0.04473382234573364, + "step": 3477 + }, + { + "epoch": 0.5378697080997487, + "grad_norm": 4.424860954284668, + "learning_rate": 4.559514262802154e-06, + "logits/chosen": 13.512920379638672, + "logits/rejected": 9.003018379211426, + "logps/chosen": -341.8384704589844, + "logps/rejected": -246.58056640625, + "loss": 0.5653, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.4532545208930969, + "rewards/margins": 0.3818611204624176, + "rewards/rejected": 0.07139340043067932, + "step": 3478 + }, + { + "epoch": 0.5380243572395128, + "grad_norm": 5.093382358551025, + "learning_rate": 4.55922786115248e-06, + "logits/chosen": 9.175430297851562, + "logits/rejected": 8.181477546691895, + "logps/chosen": -277.0560302734375, + "logps/rejected": -286.8317565917969, + "loss": 0.5759, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.3723825514316559, + "rewards/margins": 0.5575502514839172, + "rewards/rejected": -0.18516770005226135, + "step": 3479 + }, + { + "epoch": 0.5381790063792771, + "grad_norm": 5.199785232543945, + "learning_rate": 4.558941459502807e-06, + "logits/chosen": 4.412034034729004, + "logits/rejected": 2.59041690826416, + "logps/chosen": -261.973388671875, + "logps/rejected": -209.0919189453125, + "loss": 0.5153, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.1938885599374771, + "rewards/margins": 0.5277411341667175, + "rewards/rejected": -0.3338525891304016, + "step": 3480 + }, + { + "epoch": 0.5383336555190412, + "grad_norm": 4.076631546020508, + "learning_rate": 4.558655057853133e-06, + "logits/chosen": 11.479317665100098, + "logits/rejected": 3.094334840774536, + "logps/chosen": -283.7841796875, + "logps/rejected": -185.67349243164062, + "loss": 0.5665, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.2657521367073059, + "rewards/margins": 0.45385316014289856, + "rewards/rejected": -0.18810106813907623, + "step": 3481 + }, + { + "epoch": 0.5384883046588054, + "grad_norm": 5.191612243652344, + "learning_rate": 4.55836865620346e-06, + "logits/chosen": 8.222091674804688, + "logits/rejected": 6.054858684539795, + "logps/chosen": -247.23997497558594, + "logps/rejected": -219.9218292236328, + "loss": 0.7472, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.2999928891658783, + "rewards/margins": 0.07902643084526062, + "rewards/rejected": 0.22096644341945648, + "step": 3482 + }, + { + "epoch": 0.5386429537985695, + "grad_norm": 5.727681636810303, + "learning_rate": 4.5580822545537866e-06, + "logits/chosen": 9.748743057250977, + "logits/rejected": 12.55950927734375, + "logps/chosen": -257.0787048339844, + "logps/rejected": -303.491455078125, + "loss": 0.7742, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.00948730856180191, + "rewards/margins": -0.09779801964759827, + "rewards/rejected": 0.08831073343753815, + "step": 3483 + }, + { + "epoch": 0.5387976029383337, + "grad_norm": 4.163386344909668, + "learning_rate": 4.557795852904113e-06, + "logits/chosen": 11.34853458404541, + "logits/rejected": 6.750984191894531, + "logps/chosen": -278.1275939941406, + "logps/rejected": -209.5895538330078, + "loss": 0.5179, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.643409252166748, + "rewards/margins": 0.4344027042388916, + "rewards/rejected": 0.20900659263134003, + "step": 3484 + }, + { + "epoch": 0.5389522520780978, + "grad_norm": 7.152817726135254, + "learning_rate": 4.557509451254439e-06, + "logits/chosen": 9.748087882995605, + "logits/rejected": 9.264678001403809, + "logps/chosen": -263.6056213378906, + "logps/rejected": -268.05963134765625, + "loss": 0.7036, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.2854057550430298, + "rewards/margins": 0.1319640576839447, + "rewards/rejected": 0.1534416675567627, + "step": 3485 + }, + { + "epoch": 0.539106901217862, + "grad_norm": 5.708414554595947, + "learning_rate": 4.557223049604766e-06, + "logits/chosen": 7.172023296356201, + "logits/rejected": 6.871058940887451, + "logps/chosen": -222.507080078125, + "logps/rejected": -229.22044372558594, + "loss": 0.6955, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.02778664231300354, + "rewards/margins": 0.08936890959739685, + "rewards/rejected": -0.061582282185554504, + "step": 3486 + }, + { + "epoch": 0.5392615503576261, + "grad_norm": 4.342827796936035, + "learning_rate": 4.556936647955092e-06, + "logits/chosen": 11.272900581359863, + "logits/rejected": 6.238199234008789, + "logps/chosen": -276.654296875, + "logps/rejected": -161.07269287109375, + "loss": 0.459, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.3879266381263733, + "rewards/margins": 0.6644210815429688, + "rewards/rejected": -0.27649441361427307, + "step": 3487 + }, + { + "epoch": 0.5394161994973903, + "grad_norm": 8.439592361450195, + "learning_rate": 4.556650246305419e-06, + "logits/chosen": 6.385427474975586, + "logits/rejected": 14.341268539428711, + "logps/chosen": -251.87225341796875, + "logps/rejected": -321.5287170410156, + "loss": 0.9173, + "rewards/accuracies": 0.25, + "rewards/chosen": 0.21228301525115967, + "rewards/margins": -0.37563538551330566, + "rewards/rejected": 0.5879184007644653, + "step": 3488 + }, + { + "epoch": 0.5395708486371544, + "grad_norm": 7.2970075607299805, + "learning_rate": 4.556363844655746e-06, + "logits/chosen": 13.9177885055542, + "logits/rejected": 7.445833683013916, + "logps/chosen": -419.8946838378906, + "logps/rejected": -305.1208190917969, + "loss": 0.6625, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.3855099081993103, + "rewards/margins": 0.1586289405822754, + "rewards/rejected": 0.2268809676170349, + "step": 3489 + }, + { + "epoch": 0.5397254977769186, + "grad_norm": 3.7492423057556152, + "learning_rate": 4.556077443006072e-06, + "logits/chosen": 12.175384521484375, + "logits/rejected": 2.543306350708008, + "logps/chosen": -332.6507568359375, + "logps/rejected": -249.10455322265625, + "loss": 0.3563, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.5663467049598694, + "rewards/margins": 0.9684622287750244, + "rewards/rejected": -0.4021156132221222, + "step": 3490 + }, + { + "epoch": 0.5398801469166827, + "grad_norm": 4.765916347503662, + "learning_rate": 4.555791041356399e-06, + "logits/chosen": 14.103438377380371, + "logits/rejected": 7.959611892700195, + "logps/chosen": -399.951171875, + "logps/rejected": -341.9591369628906, + "loss": 0.4355, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.5194175243377686, + "rewards/margins": 0.7439227104187012, + "rewards/rejected": -0.22450514137744904, + "step": 3491 + }, + { + "epoch": 0.5400347960564469, + "grad_norm": 5.645360469818115, + "learning_rate": 4.555504639706725e-06, + "logits/chosen": 8.773210525512695, + "logits/rejected": 9.583088874816895, + "logps/chosen": -359.5025634765625, + "logps/rejected": -274.38800048828125, + "loss": 0.6962, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.5723686218261719, + "rewards/margins": 0.0845758393406868, + "rewards/rejected": 0.48779281973838806, + "step": 3492 + }, + { + "epoch": 0.5401894451962111, + "grad_norm": 4.530862331390381, + "learning_rate": 4.555218238057051e-06, + "logits/chosen": 10.9473295211792, + "logits/rejected": 9.631717681884766, + "logps/chosen": -229.36935424804688, + "logps/rejected": -207.13758850097656, + "loss": 0.6873, + "rewards/accuracies": 0.25, + "rewards/chosen": 0.13448746502399445, + "rewards/margins": 0.14714720845222473, + "rewards/rejected": -0.01265973225235939, + "step": 3493 + }, + { + "epoch": 0.5403440943359753, + "grad_norm": 4.913314342498779, + "learning_rate": 4.554931836407378e-06, + "logits/chosen": 11.646885871887207, + "logits/rejected": 1.310779333114624, + "logps/chosen": -235.92115783691406, + "logps/rejected": -109.67567443847656, + "loss": 0.6707, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.20431068539619446, + "rewards/margins": 0.17526467144489288, + "rewards/rejected": 0.029046017676591873, + "step": 3494 + }, + { + "epoch": 0.5404987434757395, + "grad_norm": 6.161779403686523, + "learning_rate": 4.554645434757705e-06, + "logits/chosen": 7.614030838012695, + "logits/rejected": 7.1966447830200195, + "logps/chosen": -326.32421875, + "logps/rejected": -297.5206298828125, + "loss": 0.6546, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.34260284900665283, + "rewards/margins": 0.27079612016677856, + "rewards/rejected": 0.07180673629045486, + "step": 3495 + }, + { + "epoch": 0.5406533926155036, + "grad_norm": 6.2343573570251465, + "learning_rate": 4.554359033108031e-06, + "logits/chosen": 8.52818489074707, + "logits/rejected": 9.596920013427734, + "logps/chosen": -281.6808166503906, + "logps/rejected": -291.0545959472656, + "loss": 0.7382, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.02100316435098648, + "rewards/margins": 0.05088706314563751, + "rewards/rejected": -0.02988389879465103, + "step": 3496 + }, + { + "epoch": 0.5408080417552678, + "grad_norm": 14.629050254821777, + "learning_rate": 4.554072631458358e-06, + "logits/chosen": 5.954355716705322, + "logits/rejected": 5.152848720550537, + "logps/chosen": -301.63629150390625, + "logps/rejected": -280.51593017578125, + "loss": 0.5686, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.40612396597862244, + "rewards/margins": 0.3150443434715271, + "rewards/rejected": 0.09107962250709534, + "step": 3497 + }, + { + "epoch": 0.5409626908950319, + "grad_norm": 4.812487602233887, + "learning_rate": 4.553786229808685e-06, + "logits/chosen": 6.442961692810059, + "logits/rejected": 7.085052013397217, + "logps/chosen": -296.0628662109375, + "logps/rejected": -298.15814208984375, + "loss": 0.5238, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.37529078125953674, + "rewards/margins": 0.4374125599861145, + "rewards/rejected": -0.06212177127599716, + "step": 3498 + }, + { + "epoch": 0.5411173400347961, + "grad_norm": 5.505073070526123, + "learning_rate": 4.5534998281590104e-06, + "logits/chosen": 12.14414119720459, + "logits/rejected": 11.873077392578125, + "logps/chosen": -372.3367004394531, + "logps/rejected": -424.401123046875, + "loss": 0.5587, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.36589258909225464, + "rewards/margins": 0.4328452944755554, + "rewards/rejected": -0.06695270538330078, + "step": 3499 + }, + { + "epoch": 0.5412719891745602, + "grad_norm": 4.925987243652344, + "learning_rate": 4.553213426509337e-06, + "logits/chosen": 11.93224048614502, + "logits/rejected": 9.289567947387695, + "logps/chosen": -230.70289611816406, + "logps/rejected": -221.42782592773438, + "loss": 0.6273, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.38956716656684875, + "rewards/margins": 0.2550494968891144, + "rewards/rejected": 0.13451766967773438, + "step": 3500 + }, + { + "epoch": 0.5414266383143244, + "grad_norm": 5.067691802978516, + "learning_rate": 4.552927024859664e-06, + "logits/chosen": 12.06839656829834, + "logits/rejected": 9.073575973510742, + "logps/chosen": -392.45635986328125, + "logps/rejected": -247.975830078125, + "loss": 0.6115, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.3974451422691345, + "rewards/margins": 0.29540586471557617, + "rewards/rejected": 0.10203930735588074, + "step": 3501 + }, + { + "epoch": 0.5415812874540885, + "grad_norm": 6.119301795959473, + "learning_rate": 4.55264062320999e-06, + "logits/chosen": 10.994044303894043, + "logits/rejected": 9.389493942260742, + "logps/chosen": -159.7493896484375, + "logps/rejected": -155.42787170410156, + "loss": 0.9328, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.2420315444469452, + "rewards/margins": -0.3132123351097107, + "rewards/rejected": 0.0711807906627655, + "step": 3502 + }, + { + "epoch": 0.5417359365938527, + "grad_norm": 7.779451370239258, + "learning_rate": 4.552354221560317e-06, + "logits/chosen": 7.071882724761963, + "logits/rejected": 8.056337356567383, + "logps/chosen": -311.45361328125, + "logps/rejected": -358.69610595703125, + "loss": 0.8465, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.13432732224464417, + "rewards/margins": -0.13533399999141693, + "rewards/rejected": 0.2696613371372223, + "step": 3503 + }, + { + "epoch": 0.5418905857336168, + "grad_norm": 5.105988025665283, + "learning_rate": 4.552067819910644e-06, + "logits/chosen": 3.39768123626709, + "logits/rejected": -2.911684513092041, + "logps/chosen": -244.96241760253906, + "logps/rejected": -200.06918334960938, + "loss": 0.6047, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.23985296487808228, + "rewards/margins": 0.32487720251083374, + "rewards/rejected": -0.08502425253391266, + "step": 3504 + }, + { + "epoch": 0.542045234873381, + "grad_norm": 5.381991386413574, + "learning_rate": 4.5517814182609695e-06, + "logits/chosen": 10.398744583129883, + "logits/rejected": 9.1279935836792, + "logps/chosen": -359.0007629394531, + "logps/rejected": -264.85064697265625, + "loss": 0.6982, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.15377554297447205, + "rewards/margins": 0.026801161468029022, + "rewards/rejected": 0.12697440385818481, + "step": 3505 + }, + { + "epoch": 0.5421998840131452, + "grad_norm": 5.803683757781982, + "learning_rate": 4.551495016611296e-06, + "logits/chosen": 6.138507843017578, + "logits/rejected": 9.383905410766602, + "logps/chosen": -330.3441162109375, + "logps/rejected": -371.12237548828125, + "loss": 0.5767, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.2503851652145386, + "rewards/margins": 0.27842631936073303, + "rewards/rejected": -0.028041183948516846, + "step": 3506 + }, + { + "epoch": 0.5423545331529094, + "grad_norm": 6.395082950592041, + "learning_rate": 4.551208614961623e-06, + "logits/chosen": 12.034193992614746, + "logits/rejected": 7.217423439025879, + "logps/chosen": -298.9798583984375, + "logps/rejected": -269.906494140625, + "loss": 0.7352, + "rewards/accuracies": 0.25, + "rewards/chosen": 0.01863950863480568, + "rewards/margins": -0.0606483593583107, + "rewards/rejected": 0.07928786426782608, + "step": 3507 + }, + { + "epoch": 0.5425091822926735, + "grad_norm": 5.099301338195801, + "learning_rate": 4.5509222133119494e-06, + "logits/chosen": 11.359386444091797, + "logits/rejected": 5.786442279815674, + "logps/chosen": -423.5165710449219, + "logps/rejected": -365.9211730957031, + "loss": 0.5684, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.8092166781425476, + "rewards/margins": 0.33447590470314026, + "rewards/rejected": 0.47474080324172974, + "step": 3508 + }, + { + "epoch": 0.5426638314324377, + "grad_norm": 7.289186954498291, + "learning_rate": 4.550635811662275e-06, + "logits/chosen": 12.071268081665039, + "logits/rejected": 17.04667854309082, + "logps/chosen": -255.82632446289062, + "logps/rejected": -277.09454345703125, + "loss": 0.7829, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.15634021162986755, + "rewards/margins": -0.11880205571651459, + "rewards/rejected": 0.27514228224754333, + "step": 3509 + }, + { + "epoch": 0.5428184805722018, + "grad_norm": 7.643459320068359, + "learning_rate": 4.550349410012602e-06, + "logits/chosen": 11.09959602355957, + "logits/rejected": 7.014785289764404, + "logps/chosen": -376.5906066894531, + "logps/rejected": -375.122314453125, + "loss": 0.5591, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.0003324747085571289, + "rewards/margins": 0.6094013452529907, + "rewards/rejected": -0.6090688705444336, + "step": 3510 + }, + { + "epoch": 0.542973129711966, + "grad_norm": 3.691310405731201, + "learning_rate": 4.5500630083629286e-06, + "logits/chosen": 7.974476337432861, + "logits/rejected": 9.409144401550293, + "logps/chosen": -145.7262420654297, + "logps/rejected": -188.11610412597656, + "loss": 0.5545, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.3247499465942383, + "rewards/margins": 0.3733232617378235, + "rewards/rejected": -0.048573315143585205, + "step": 3511 + }, + { + "epoch": 0.5431277788517301, + "grad_norm": 3.073251724243164, + "learning_rate": 4.549776606713255e-06, + "logits/chosen": 8.224628448486328, + "logits/rejected": 1.4071017503738403, + "logps/chosen": -141.3560791015625, + "logps/rejected": -102.92980194091797, + "loss": 0.5295, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.38744989037513733, + "rewards/margins": 0.4002656638622284, + "rewards/rejected": -0.012815780937671661, + "step": 3512 + }, + { + "epoch": 0.5432824279914943, + "grad_norm": 8.070582389831543, + "learning_rate": 4.549490205063581e-06, + "logits/chosen": 6.5147809982299805, + "logits/rejected": 6.872115612030029, + "logps/chosen": -246.87306213378906, + "logps/rejected": -185.63099670410156, + "loss": 0.6892, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.10099928081035614, + "rewards/margins": 0.10853702574968338, + "rewards/rejected": -0.007537752389907837, + "step": 3513 + }, + { + "epoch": 0.5434370771312584, + "grad_norm": 5.410460948944092, + "learning_rate": 4.549203803413908e-06, + "logits/chosen": 14.582372665405273, + "logits/rejected": 7.697572708129883, + "logps/chosen": -305.4378356933594, + "logps/rejected": -212.38340759277344, + "loss": 0.6185, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.3665239214897156, + "rewards/margins": 0.24178943037986755, + "rewards/rejected": 0.12473449110984802, + "step": 3514 + }, + { + "epoch": 0.5435917262710226, + "grad_norm": 5.217390060424805, + "learning_rate": 4.548917401764234e-06, + "logits/chosen": 15.824353218078613, + "logits/rejected": 5.598424911499023, + "logps/chosen": -286.70928955078125, + "logps/rejected": -209.32269287109375, + "loss": 0.6231, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.3197060823440552, + "rewards/margins": 0.21791549026966095, + "rewards/rejected": 0.10179056972265244, + "step": 3515 + }, + { + "epoch": 0.5437463754107867, + "grad_norm": 3.990569829940796, + "learning_rate": 4.548631000114561e-06, + "logits/chosen": 8.529427528381348, + "logits/rejected": 5.0457916259765625, + "logps/chosen": -187.49325561523438, + "logps/rejected": -179.27442932128906, + "loss": 0.4654, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.33572715520858765, + "rewards/margins": 0.691824734210968, + "rewards/rejected": -0.35609766840934753, + "step": 3516 + }, + { + "epoch": 0.5439010245505509, + "grad_norm": 6.907032012939453, + "learning_rate": 4.548344598464888e-06, + "logits/chosen": 3.0524420738220215, + "logits/rejected": 1.3002849817276, + "logps/chosen": -214.88133239746094, + "logps/rejected": -282.6168518066406, + "loss": 0.725, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.06216317042708397, + "rewards/margins": 0.07149878889322281, + "rewards/rejected": -0.13366200029850006, + "step": 3517 + }, + { + "epoch": 0.544055673690315, + "grad_norm": 4.797595977783203, + "learning_rate": 4.548058196815213e-06, + "logits/chosen": 13.109292030334473, + "logits/rejected": 6.928149223327637, + "logps/chosen": -227.20571899414062, + "logps/rejected": -182.78598022460938, + "loss": 0.578, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.29015055298805237, + "rewards/margins": 0.3859752416610718, + "rewards/rejected": -0.09582467377185822, + "step": 3518 + }, + { + "epoch": 0.5442103228300793, + "grad_norm": 4.863753318786621, + "learning_rate": 4.54777179516554e-06, + "logits/chosen": 8.01257610321045, + "logits/rejected": 7.205863952636719, + "logps/chosen": -240.56210327148438, + "logps/rejected": -277.7075500488281, + "loss": 0.5851, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.4072505831718445, + "rewards/margins": 0.41561296582221985, + "rewards/rejected": -0.008362431079149246, + "step": 3519 + }, + { + "epoch": 0.5443649719698435, + "grad_norm": 7.176610469818115, + "learning_rate": 4.547485393515867e-06, + "logits/chosen": 10.424981117248535, + "logits/rejected": 11.230606079101562, + "logps/chosen": -317.6285400390625, + "logps/rejected": -343.45458984375, + "loss": 0.8539, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.4626142382621765, + "rewards/margins": -0.2254081666469574, + "rewards/rejected": 0.6880223751068115, + "step": 3520 + }, + { + "epoch": 0.5445196211096076, + "grad_norm": 5.673208713531494, + "learning_rate": 4.547198991866193e-06, + "logits/chosen": 1.7676072120666504, + "logits/rejected": -0.022732943296432495, + "logps/chosen": -362.8194274902344, + "logps/rejected": -242.46539306640625, + "loss": 0.5426, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.5794628858566284, + "rewards/margins": 0.43827134370803833, + "rewards/rejected": 0.14119155704975128, + "step": 3521 + }, + { + "epoch": 0.5446742702493718, + "grad_norm": 7.997133255004883, + "learning_rate": 4.54691259021652e-06, + "logits/chosen": 14.438596725463867, + "logits/rejected": 10.949785232543945, + "logps/chosen": -309.0161437988281, + "logps/rejected": -285.4857177734375, + "loss": 0.8135, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.14933447539806366, + "rewards/margins": -0.05192427337169647, + "rewards/rejected": 0.20125874876976013, + "step": 3522 + }, + { + "epoch": 0.5448289193891359, + "grad_norm": 6.136359214782715, + "learning_rate": 4.546626188566847e-06, + "logits/chosen": 8.002218246459961, + "logits/rejected": 10.703384399414062, + "logps/chosen": -312.8731994628906, + "logps/rejected": -309.1609191894531, + "loss": 0.7467, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.04328130558133125, + "rewards/margins": -0.029910333454608917, + "rewards/rejected": 0.07319164276123047, + "step": 3523 + }, + { + "epoch": 0.5449835685289001, + "grad_norm": 4.540926933288574, + "learning_rate": 4.546339786917173e-06, + "logits/chosen": 13.019472122192383, + "logits/rejected": 12.201904296875, + "logps/chosen": -286.26007080078125, + "logps/rejected": -230.09072875976562, + "loss": 0.4987, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.4957641661167145, + "rewards/margins": 0.5642727613449097, + "rewards/rejected": -0.06850862503051758, + "step": 3524 + }, + { + "epoch": 0.5451382176686642, + "grad_norm": 6.054940700531006, + "learning_rate": 4.546053385267499e-06, + "logits/chosen": 8.482773780822754, + "logits/rejected": -2.757509708404541, + "logps/chosen": -411.403076171875, + "logps/rejected": -225.9167022705078, + "loss": 0.5334, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.2993467450141907, + "rewards/margins": 0.46557068824768066, + "rewards/rejected": -0.1662239134311676, + "step": 3525 + }, + { + "epoch": 0.5452928668084284, + "grad_norm": 5.451732158660889, + "learning_rate": 4.545766983617826e-06, + "logits/chosen": 4.632357120513916, + "logits/rejected": 4.565715312957764, + "logps/chosen": -461.7265625, + "logps/rejected": -378.4767150878906, + "loss": 0.5938, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.06513690948486328, + "rewards/margins": 0.4721967279911041, + "rewards/rejected": -0.40705978870391846, + "step": 3526 + }, + { + "epoch": 0.5454475159481925, + "grad_norm": 5.41460657119751, + "learning_rate": 4.545480581968152e-06, + "logits/chosen": 10.451261520385742, + "logits/rejected": 5.0063958168029785, + "logps/chosen": -195.10745239257812, + "logps/rejected": -173.7678985595703, + "loss": 0.7093, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.4223962724208832, + "rewards/margins": 0.025388240814208984, + "rewards/rejected": 0.3970080316066742, + "step": 3527 + }, + { + "epoch": 0.5456021650879567, + "grad_norm": 5.726624965667725, + "learning_rate": 4.545194180318479e-06, + "logits/chosen": 8.894241333007812, + "logits/rejected": 9.959104537963867, + "logps/chosen": -169.13803100585938, + "logps/rejected": -195.6226806640625, + "loss": 0.7416, + "rewards/accuracies": 0.25, + "rewards/chosen": -0.05895858630537987, + "rewards/margins": -0.035721540451049805, + "rewards/rejected": -0.023237038403749466, + "step": 3528 + }, + { + "epoch": 0.5457568142277208, + "grad_norm": 5.137823104858398, + "learning_rate": 4.544907778668806e-06, + "logits/chosen": 14.346549034118652, + "logits/rejected": 11.613884925842285, + "logps/chosen": -341.1233215332031, + "logps/rejected": -213.6710205078125, + "loss": 0.6498, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.48205041885375977, + "rewards/margins": 0.1846107542514801, + "rewards/rejected": 0.29743966460227966, + "step": 3529 + }, + { + "epoch": 0.545911463367485, + "grad_norm": 6.021636962890625, + "learning_rate": 4.544621377019132e-06, + "logits/chosen": 13.294836044311523, + "logits/rejected": 10.589038848876953, + "logps/chosen": -316.819580078125, + "logps/rejected": -299.698486328125, + "loss": 0.6054, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.12105311453342438, + "rewards/margins": 0.24802693724632263, + "rewards/rejected": -0.12697382271289825, + "step": 3530 + }, + { + "epoch": 0.5460661125072491, + "grad_norm": 5.4508891105651855, + "learning_rate": 4.544334975369458e-06, + "logits/chosen": 10.107401847839355, + "logits/rejected": 14.2821626663208, + "logps/chosen": -188.77883911132812, + "logps/rejected": -227.26724243164062, + "loss": 0.6922, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.32140201330184937, + "rewards/margins": 0.12523728609085083, + "rewards/rejected": 0.19616475701332092, + "step": 3531 + }, + { + "epoch": 0.5462207616470134, + "grad_norm": 4.356362819671631, + "learning_rate": 4.544048573719785e-06, + "logits/chosen": 17.036211013793945, + "logits/rejected": 9.378527641296387, + "logps/chosen": -468.59832763671875, + "logps/rejected": -339.0898742675781, + "loss": 0.4228, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.8874123096466064, + "rewards/margins": 0.6814284324645996, + "rewards/rejected": 0.20598383247852325, + "step": 3532 + }, + { + "epoch": 0.5463754107867775, + "grad_norm": 4.040818214416504, + "learning_rate": 4.5437621720701115e-06, + "logits/chosen": 10.119794845581055, + "logits/rejected": 2.0200061798095703, + "logps/chosen": -395.4491882324219, + "logps/rejected": -234.0541229248047, + "loss": 0.4514, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.42939889430999756, + "rewards/margins": 0.6351261734962463, + "rewards/rejected": -0.20572729408740997, + "step": 3533 + }, + { + "epoch": 0.5465300599265417, + "grad_norm": 5.61366605758667, + "learning_rate": 4.543475770420438e-06, + "logits/chosen": 13.310968399047852, + "logits/rejected": 16.62929344177246, + "logps/chosen": -257.2803649902344, + "logps/rejected": -280.9564208984375, + "loss": 0.814, + "rewards/accuracies": 0.25, + "rewards/chosen": 0.169350728392601, + "rewards/margins": -0.15942524373531342, + "rewards/rejected": 0.32877594232559204, + "step": 3534 + }, + { + "epoch": 0.5466847090663058, + "grad_norm": 4.956250190734863, + "learning_rate": 4.543189368770765e-06, + "logits/chosen": 10.099786758422852, + "logits/rejected": 7.958563327789307, + "logps/chosen": -211.59457397460938, + "logps/rejected": -204.10702514648438, + "loss": 0.5962, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.3334071636199951, + "rewards/margins": 0.3714045286178589, + "rewards/rejected": -0.03799736499786377, + "step": 3535 + }, + { + "epoch": 0.54683935820607, + "grad_norm": 4.367299556732178, + "learning_rate": 4.5429029671210914e-06, + "logits/chosen": 9.914778709411621, + "logits/rejected": 2.0478150844573975, + "logps/chosen": -205.86819458007812, + "logps/rejected": -156.32708740234375, + "loss": 0.6635, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.11699000000953674, + "rewards/margins": 0.11799439787864685, + "rewards/rejected": -0.23498442769050598, + "step": 3536 + }, + { + "epoch": 0.5469940073458341, + "grad_norm": 6.793894290924072, + "learning_rate": 4.542616565471418e-06, + "logits/chosen": 12.172344207763672, + "logits/rejected": 12.992731094360352, + "logps/chosen": -402.451904296875, + "logps/rejected": -482.27203369140625, + "loss": 0.7364, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.6419467926025391, + "rewards/margins": 0.10428544878959656, + "rewards/rejected": 0.5376613736152649, + "step": 3537 + }, + { + "epoch": 0.5471486564855983, + "grad_norm": 4.70053243637085, + "learning_rate": 4.542330163821744e-06, + "logits/chosen": 4.366672039031982, + "logits/rejected": 4.480984687805176, + "logps/chosen": -228.4396209716797, + "logps/rejected": -214.60765075683594, + "loss": 0.627, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.3523925244808197, + "rewards/margins": 0.24305859208106995, + "rewards/rejected": 0.10933391749858856, + "step": 3538 + }, + { + "epoch": 0.5473033056253624, + "grad_norm": 4.50254487991333, + "learning_rate": 4.5420437621720705e-06, + "logits/chosen": 10.565152168273926, + "logits/rejected": 11.014066696166992, + "logps/chosen": -222.46881103515625, + "logps/rejected": -147.71978759765625, + "loss": 0.6795, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.2052536904811859, + "rewards/margins": 0.08800911158323288, + "rewards/rejected": 0.11724459379911423, + "step": 3539 + }, + { + "epoch": 0.5474579547651266, + "grad_norm": 6.468800067901611, + "learning_rate": 4.541757360522397e-06, + "logits/chosen": 11.006531715393066, + "logits/rejected": 7.352038383483887, + "logps/chosen": -317.2406311035156, + "logps/rejected": -351.90911865234375, + "loss": 0.7672, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.12836430966854095, + "rewards/margins": 0.00013770908117294312, + "rewards/rejected": 0.1282266080379486, + "step": 3540 + }, + { + "epoch": 0.5476126039048907, + "grad_norm": 6.936290264129639, + "learning_rate": 4.541470958872724e-06, + "logits/chosen": 18.008426666259766, + "logits/rejected": 8.986394882202148, + "logps/chosen": -462.3390808105469, + "logps/rejected": -255.6761932373047, + "loss": 0.7355, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.09092272818088531, + "rewards/margins": 0.0928514152765274, + "rewards/rejected": -0.1837741732597351, + "step": 3541 + }, + { + "epoch": 0.5477672530446549, + "grad_norm": 5.683344841003418, + "learning_rate": 4.5411845572230505e-06, + "logits/chosen": 9.448622703552246, + "logits/rejected": 5.785962104797363, + "logps/chosen": -349.160400390625, + "logps/rejected": -245.28756713867188, + "loss": 0.5761, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.1705780029296875, + "rewards/margins": 0.36399269104003906, + "rewards/rejected": -0.19341468811035156, + "step": 3542 + }, + { + "epoch": 0.547921902184419, + "grad_norm": 5.563908100128174, + "learning_rate": 4.540898155573376e-06, + "logits/chosen": 11.009073257446289, + "logits/rejected": 8.724419593811035, + "logps/chosen": -205.60867309570312, + "logps/rejected": -256.0479736328125, + "loss": 0.6143, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.5023806095123291, + "rewards/margins": 0.2036203294992447, + "rewards/rejected": 0.298760324716568, + "step": 3543 + }, + { + "epoch": 0.5480765513241833, + "grad_norm": 4.960903644561768, + "learning_rate": 4.540611753923703e-06, + "logits/chosen": 5.844745635986328, + "logits/rejected": 9.609888076782227, + "logps/chosen": -207.8001251220703, + "logps/rejected": -215.28321838378906, + "loss": 0.6796, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.2908862233161926, + "rewards/margins": 0.07577388733625412, + "rewards/rejected": 0.2151123583316803, + "step": 3544 + }, + { + "epoch": 0.5482312004639475, + "grad_norm": 4.565761089324951, + "learning_rate": 4.54032535227403e-06, + "logits/chosen": 7.832420349121094, + "logits/rejected": 3.3200812339782715, + "logps/chosen": -198.38137817382812, + "logps/rejected": -114.19021606445312, + "loss": 0.6084, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.2435060739517212, + "rewards/margins": 0.22167813777923584, + "rewards/rejected": 0.021827932447195053, + "step": 3545 + }, + { + "epoch": 0.5483858496037116, + "grad_norm": 6.146039962768555, + "learning_rate": 4.540038950624356e-06, + "logits/chosen": 11.025949478149414, + "logits/rejected": 6.19476318359375, + "logps/chosen": -303.18719482421875, + "logps/rejected": -210.86190795898438, + "loss": 0.5568, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.4069153666496277, + "rewards/margins": 0.45468005537986755, + "rewards/rejected": -0.04776472598314285, + "step": 3546 + }, + { + "epoch": 0.5485404987434758, + "grad_norm": 5.0487589836120605, + "learning_rate": 4.539752548974682e-06, + "logits/chosen": 8.809881210327148, + "logits/rejected": 9.234543800354004, + "logps/chosen": -149.8020782470703, + "logps/rejected": -146.05694580078125, + "loss": 0.6717, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.13258127868175507, + "rewards/margins": 0.0861751139163971, + "rewards/rejected": 0.04640618711709976, + "step": 3547 + }, + { + "epoch": 0.5486951478832399, + "grad_norm": 5.862454414367676, + "learning_rate": 4.539466147325009e-06, + "logits/chosen": 5.978017330169678, + "logits/rejected": 10.074748992919922, + "logps/chosen": -264.7501220703125, + "logps/rejected": -322.0801696777344, + "loss": 0.5903, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.4706363379955292, + "rewards/margins": 0.24110819399356842, + "rewards/rejected": 0.22952814400196075, + "step": 3548 + }, + { + "epoch": 0.5488497970230041, + "grad_norm": 5.216935634613037, + "learning_rate": 4.539179745675335e-06, + "logits/chosen": 8.29134750366211, + "logits/rejected": 5.364598274230957, + "logps/chosen": -273.58294677734375, + "logps/rejected": -217.62774658203125, + "loss": 0.6012, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.22625470161437988, + "rewards/margins": 0.24434253573417664, + "rewards/rejected": -0.018087834119796753, + "step": 3549 + }, + { + "epoch": 0.5490044461627682, + "grad_norm": 5.841380596160889, + "learning_rate": 4.538893344025662e-06, + "logits/chosen": 8.870448112487793, + "logits/rejected": 4.638943672180176, + "logps/chosen": -340.1754150390625, + "logps/rejected": -250.580078125, + "loss": 0.7778, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.25906795263290405, + "rewards/margins": 0.3286592960357666, + "rewards/rejected": -0.06959133595228195, + "step": 3550 + }, + { + "epoch": 0.5491590953025324, + "grad_norm": 5.385190486907959, + "learning_rate": 4.538606942375988e-06, + "logits/chosen": 9.568578720092773, + "logits/rejected": 6.89943265914917, + "logps/chosen": -472.45697021484375, + "logps/rejected": -391.3062438964844, + "loss": 0.5963, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.13642922043800354, + "rewards/margins": 0.3578115403652191, + "rewards/rejected": -0.22138234972953796, + "step": 3551 + }, + { + "epoch": 0.5493137444422965, + "grad_norm": 6.736863613128662, + "learning_rate": 4.5383205407263144e-06, + "logits/chosen": 13.685302734375, + "logits/rejected": 8.468063354492188, + "logps/chosen": -301.55517578125, + "logps/rejected": -283.5740966796875, + "loss": 0.6054, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.44643211364746094, + "rewards/margins": 0.33363571763038635, + "rewards/rejected": 0.11279641091823578, + "step": 3552 + }, + { + "epoch": 0.5494683935820607, + "grad_norm": 5.982054710388184, + "learning_rate": 4.538034139076641e-06, + "logits/chosen": 12.448792457580566, + "logits/rejected": 5.300612449645996, + "logps/chosen": -274.3370056152344, + "logps/rejected": -214.73365783691406, + "loss": 0.7172, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.18710699677467346, + "rewards/margins": 0.20708554983139038, + "rewards/rejected": -0.019978567957878113, + "step": 3553 + }, + { + "epoch": 0.5496230427218248, + "grad_norm": 5.329343318939209, + "learning_rate": 4.537747737426968e-06, + "logits/chosen": 10.182079315185547, + "logits/rejected": 9.306894302368164, + "logps/chosen": -270.68115234375, + "logps/rejected": -267.1793518066406, + "loss": 0.7493, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.05081549286842346, + "rewards/margins": -0.07559870183467865, + "rewards/rejected": 0.1264142096042633, + "step": 3554 + }, + { + "epoch": 0.549777691861589, + "grad_norm": 5.044135093688965, + "learning_rate": 4.537461335777294e-06, + "logits/chosen": 10.9108304977417, + "logits/rejected": 5.656590461730957, + "logps/chosen": -337.8362731933594, + "logps/rejected": -237.78717041015625, + "loss": 0.5749, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.4850406348705292, + "rewards/margins": 0.43785780668258667, + "rewards/rejected": 0.047182850539684296, + "step": 3555 + }, + { + "epoch": 0.5499323410013531, + "grad_norm": 6.545263767242432, + "learning_rate": 4.537174934127621e-06, + "logits/chosen": 8.910379409790039, + "logits/rejected": 6.235647678375244, + "logps/chosen": -317.0166931152344, + "logps/rejected": -209.79644775390625, + "loss": 0.5564, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.3248693645000458, + "rewards/margins": 0.5088315606117249, + "rewards/rejected": -0.18396218121051788, + "step": 3556 + }, + { + "epoch": 0.5500869901411174, + "grad_norm": 5.0078511238098145, + "learning_rate": 4.536888532477948e-06, + "logits/chosen": 9.847518920898438, + "logits/rejected": 9.366765022277832, + "logps/chosen": -186.71817016601562, + "logps/rejected": -182.91854858398438, + "loss": 0.7238, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.2906917631626129, + "rewards/margins": 0.03638588637113571, + "rewards/rejected": 0.2543058395385742, + "step": 3557 + }, + { + "epoch": 0.5502416392808815, + "grad_norm": 4.022653579711914, + "learning_rate": 4.5366021308282735e-06, + "logits/chosen": 9.77560043334961, + "logits/rejected": 3.795381546020508, + "logps/chosen": -224.27218627929688, + "logps/rejected": -170.6032257080078, + "loss": 0.5359, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.23060496151447296, + "rewards/margins": 0.4511311948299408, + "rewards/rejected": -0.22052627801895142, + "step": 3558 + }, + { + "epoch": 0.5503962884206457, + "grad_norm": 4.373462677001953, + "learning_rate": 4.5363157291786e-06, + "logits/chosen": 5.43952751159668, + "logits/rejected": 10.963338851928711, + "logps/chosen": -177.14576721191406, + "logps/rejected": -186.11505126953125, + "loss": 0.6374, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.22932806611061096, + "rewards/margins": 0.22654861211776733, + "rewards/rejected": 0.002779439091682434, + "step": 3559 + }, + { + "epoch": 0.5505509375604098, + "grad_norm": 4.65736198425293, + "learning_rate": 4.536029327528927e-06, + "logits/chosen": 13.079739570617676, + "logits/rejected": 11.66657543182373, + "logps/chosen": -245.99560546875, + "logps/rejected": -264.7964172363281, + "loss": 0.7288, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.510442316532135, + "rewards/margins": -0.004677586257457733, + "rewards/rejected": 0.5151199102401733, + "step": 3560 + }, + { + "epoch": 0.550705586700174, + "grad_norm": 6.700300216674805, + "learning_rate": 4.5357429258792535e-06, + "logits/chosen": 15.049016952514648, + "logits/rejected": 5.899397373199463, + "logps/chosen": -390.96246337890625, + "logps/rejected": -267.9217529296875, + "loss": 0.7615, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.0939340591430664, + "rewards/margins": 0.03125503659248352, + "rewards/rejected": 0.06267901510000229, + "step": 3561 + }, + { + "epoch": 0.5508602358399382, + "grad_norm": 7.729654788970947, + "learning_rate": 4.53545652422958e-06, + "logits/chosen": 10.23967170715332, + "logits/rejected": 5.977878093719482, + "logps/chosen": -274.3505859375, + "logps/rejected": -233.908447265625, + "loss": 0.6385, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.33057501912117004, + "rewards/margins": 0.17289915680885315, + "rewards/rejected": 0.1576758474111557, + "step": 3562 + }, + { + "epoch": 0.5510148849797023, + "grad_norm": 4.259594917297363, + "learning_rate": 4.535170122579907e-06, + "logits/chosen": 10.6661958694458, + "logits/rejected": 6.657013893127441, + "logps/chosen": -286.9015197753906, + "logps/rejected": -219.30804443359375, + "loss": 0.5235, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.5922577977180481, + "rewards/margins": 0.5397415161132812, + "rewards/rejected": 0.05251626670360565, + "step": 3563 + }, + { + "epoch": 0.5511695341194665, + "grad_norm": 5.772739887237549, + "learning_rate": 4.5348837209302326e-06, + "logits/chosen": 7.150918006896973, + "logits/rejected": 7.458500862121582, + "logps/chosen": -305.84930419921875, + "logps/rejected": -265.9232177734375, + "loss": 0.82, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.4643403887748718, + "rewards/margins": -0.15803226828575134, + "rewards/rejected": 0.6223726272583008, + "step": 3564 + }, + { + "epoch": 0.5513241832592306, + "grad_norm": 7.2503533363342285, + "learning_rate": 4.534597319280559e-06, + "logits/chosen": 8.620634078979492, + "logits/rejected": 7.966855525970459, + "logps/chosen": -300.65728759765625, + "logps/rejected": -342.868896484375, + "loss": 0.8209, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.5630488991737366, + "rewards/margins": -0.13228988647460938, + "rewards/rejected": 0.6953388452529907, + "step": 3565 + }, + { + "epoch": 0.5514788323989948, + "grad_norm": 6.189802169799805, + "learning_rate": 4.534310917630886e-06, + "logits/chosen": 11.349376678466797, + "logits/rejected": 8.790705680847168, + "logps/chosen": -284.88287353515625, + "logps/rejected": -212.41690063476562, + "loss": 0.7765, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.09331922978162766, + "rewards/margins": -0.11698141694068909, + "rewards/rejected": 0.023662179708480835, + "step": 3566 + }, + { + "epoch": 0.5516334815387589, + "grad_norm": 4.301194667816162, + "learning_rate": 4.5340245159812125e-06, + "logits/chosen": 7.0621490478515625, + "logits/rejected": 3.6898441314697266, + "logps/chosen": -211.5169677734375, + "logps/rejected": -179.8331298828125, + "loss": 0.5851, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.3927188217639923, + "rewards/margins": 0.5453318357467651, + "rewards/rejected": -0.15261292457580566, + "step": 3567 + }, + { + "epoch": 0.5517881306785231, + "grad_norm": 5.55835485458374, + "learning_rate": 4.533738114331539e-06, + "logits/chosen": 11.471624374389648, + "logits/rejected": 6.332268714904785, + "logps/chosen": -344.49310302734375, + "logps/rejected": -234.843994140625, + "loss": 0.6997, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.3348548114299774, + "rewards/margins": 0.1306462287902832, + "rewards/rejected": 0.20420856773853302, + "step": 3568 + }, + { + "epoch": 0.5519427798182872, + "grad_norm": 6.9027934074401855, + "learning_rate": 4.533451712681866e-06, + "logits/chosen": 7.23335075378418, + "logits/rejected": 9.39720630645752, + "logps/chosen": -204.245849609375, + "logps/rejected": -231.79544067382812, + "loss": 0.9175, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.14404296875, + "rewards/margins": -0.33375269174575806, + "rewards/rejected": 0.47779566049575806, + "step": 3569 + }, + { + "epoch": 0.5520974289580515, + "grad_norm": 5.868442058563232, + "learning_rate": 4.5331653110321925e-06, + "logits/chosen": 9.503335952758789, + "logits/rejected": 7.132210731506348, + "logps/chosen": -198.42047119140625, + "logps/rejected": -197.94317626953125, + "loss": 0.743, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.20590871572494507, + "rewards/margins": -0.024768264964222908, + "rewards/rejected": 0.23067699372768402, + "step": 3570 + }, + { + "epoch": 0.5522520780978156, + "grad_norm": 5.358765602111816, + "learning_rate": 4.532878909382518e-06, + "logits/chosen": 12.876893043518066, + "logits/rejected": 3.510669708251953, + "logps/chosen": -281.7961730957031, + "logps/rejected": -158.5064239501953, + "loss": 0.8352, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.0024895519018173218, + "rewards/margins": -0.0842953622341156, + "rewards/rejected": 0.08678492903709412, + "step": 3571 + }, + { + "epoch": 0.5524067272375798, + "grad_norm": 3.619243621826172, + "learning_rate": 4.532592507732845e-06, + "logits/chosen": 10.332534790039062, + "logits/rejected": 3.8928916454315186, + "logps/chosen": -287.9915771484375, + "logps/rejected": -245.3173370361328, + "loss": 0.4249, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.5293671488761902, + "rewards/margins": 0.6992554664611816, + "rewards/rejected": -0.16988831758499146, + "step": 3572 + }, + { + "epoch": 0.5525613763773439, + "grad_norm": 6.228548049926758, + "learning_rate": 4.5323061060831716e-06, + "logits/chosen": 11.99790096282959, + "logits/rejected": 9.32993221282959, + "logps/chosen": -303.4930419921875, + "logps/rejected": -248.70272827148438, + "loss": 0.6333, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.3519423007965088, + "rewards/margins": 0.16439954936504364, + "rewards/rejected": 0.18754275143146515, + "step": 3573 + }, + { + "epoch": 0.5527160255171081, + "grad_norm": 9.161906242370605, + "learning_rate": 4.532019704433498e-06, + "logits/chosen": 7.7228217124938965, + "logits/rejected": 13.913342475891113, + "logps/chosen": -316.5376892089844, + "logps/rejected": -406.7513122558594, + "loss": 0.9234, + "rewards/accuracies": 0.25, + "rewards/chosen": 0.0004051625728607178, + "rewards/margins": -0.39028510451316833, + "rewards/rejected": 0.3906902074813843, + "step": 3574 + }, + { + "epoch": 0.5528706746568722, + "grad_norm": 6.7055206298828125, + "learning_rate": 4.531733302783825e-06, + "logits/chosen": 5.911735534667969, + "logits/rejected": 3.2133917808532715, + "logps/chosen": -342.8312683105469, + "logps/rejected": -359.0065612792969, + "loss": 0.6271, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.5394797325134277, + "rewards/margins": 0.29227784276008606, + "rewards/rejected": 0.24720191955566406, + "step": 3575 + }, + { + "epoch": 0.5530253237966364, + "grad_norm": 5.960477828979492, + "learning_rate": 4.531446901134151e-06, + "logits/chosen": 7.805113792419434, + "logits/rejected": 8.001245498657227, + "logps/chosen": -354.50689697265625, + "logps/rejected": -356.4140625, + "loss": 0.7935, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.3726765513420105, + "rewards/margins": 0.042413897812366486, + "rewards/rejected": 0.3302626311779022, + "step": 3576 + }, + { + "epoch": 0.5531799729364005, + "grad_norm": 4.549402236938477, + "learning_rate": 4.531160499484477e-06, + "logits/chosen": 10.263496398925781, + "logits/rejected": 4.837553024291992, + "logps/chosen": -230.8402099609375, + "logps/rejected": -279.2123718261719, + "loss": 0.5779, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.24746140837669373, + "rewards/margins": 0.4961482882499695, + "rewards/rejected": -0.24868685007095337, + "step": 3577 + }, + { + "epoch": 0.5533346220761647, + "grad_norm": 5.337644100189209, + "learning_rate": 4.530874097834804e-06, + "logits/chosen": 9.329675674438477, + "logits/rejected": 13.068574905395508, + "logps/chosen": -269.3693542480469, + "logps/rejected": -286.4462890625, + "loss": 0.7257, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.5886191129684448, + "rewards/margins": 0.03895539790391922, + "rewards/rejected": 0.5496636629104614, + "step": 3578 + }, + { + "epoch": 0.5534892712159288, + "grad_norm": 7.192526340484619, + "learning_rate": 4.530587696185131e-06, + "logits/chosen": 10.102540969848633, + "logits/rejected": 12.618480682373047, + "logps/chosen": -332.9478759765625, + "logps/rejected": -242.19119262695312, + "loss": 0.8486, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.028556670993566513, + "rewards/margins": -0.20887719094753265, + "rewards/rejected": 0.23743388056755066, + "step": 3579 + }, + { + "epoch": 0.553643920355693, + "grad_norm": 5.215231895446777, + "learning_rate": 4.530301294535457e-06, + "logits/chosen": 10.68503475189209, + "logits/rejected": 8.497568130493164, + "logps/chosen": -320.5820617675781, + "logps/rejected": -262.18023681640625, + "loss": 0.6074, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.4127378463745117, + "rewards/margins": 0.23450195789337158, + "rewards/rejected": 0.17823591828346252, + "step": 3580 + }, + { + "epoch": 0.5537985694954571, + "grad_norm": 3.838759183883667, + "learning_rate": 4.530014892885783e-06, + "logits/chosen": 10.553598403930664, + "logits/rejected": 11.064107894897461, + "logps/chosen": -194.6639862060547, + "logps/rejected": -200.62429809570312, + "loss": 0.547, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.4793577194213867, + "rewards/margins": 0.38866546750068665, + "rewards/rejected": 0.09069228172302246, + "step": 3581 + }, + { + "epoch": 0.5539532186352213, + "grad_norm": 6.029939651489258, + "learning_rate": 4.52972849123611e-06, + "logits/chosen": 14.453691482543945, + "logits/rejected": 5.123515605926514, + "logps/chosen": -468.627197265625, + "logps/rejected": -308.79254150390625, + "loss": 0.632, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.48506975173950195, + "rewards/margins": 0.23682114481925964, + "rewards/rejected": 0.2482486069202423, + "step": 3582 + }, + { + "epoch": 0.5541078677749856, + "grad_norm": 5.329648494720459, + "learning_rate": 4.529442089586436e-06, + "logits/chosen": 11.393535614013672, + "logits/rejected": 11.046777725219727, + "logps/chosen": -254.82723999023438, + "logps/rejected": -268.787841796875, + "loss": 0.7297, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.22061532735824585, + "rewards/margins": 0.025781691074371338, + "rewards/rejected": 0.1948336660861969, + "step": 3583 + }, + { + "epoch": 0.5542625169147497, + "grad_norm": 4.4506916999816895, + "learning_rate": 4.529155687936763e-06, + "logits/chosen": 6.265275001525879, + "logits/rejected": -0.013617873191833496, + "logps/chosen": -244.4647216796875, + "logps/rejected": -158.00363159179688, + "loss": 0.6085, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.3447190821170807, + "rewards/margins": 0.25595828890800476, + "rewards/rejected": 0.08876079320907593, + "step": 3584 + }, + { + "epoch": 0.5544171660545139, + "grad_norm": 10.360529899597168, + "learning_rate": 4.528869286287089e-06, + "logits/chosen": 8.019856452941895, + "logits/rejected": 7.219570159912109, + "logps/chosen": -395.52459716796875, + "logps/rejected": -199.68572998046875, + "loss": 0.8237, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.061166711151599884, + "rewards/margins": -0.0594443753361702, + "rewards/rejected": -0.0017223283648490906, + "step": 3585 + }, + { + "epoch": 0.554571815194278, + "grad_norm": 6.593823432922363, + "learning_rate": 4.5285828846374155e-06, + "logits/chosen": 7.215122222900391, + "logits/rejected": 6.8266401290893555, + "logps/chosen": -277.1084899902344, + "logps/rejected": -263.850830078125, + "loss": 0.7578, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.3335644602775574, + "rewards/margins": 0.0619109645485878, + "rewards/rejected": 0.27165350317955017, + "step": 3586 + }, + { + "epoch": 0.5547264643340422, + "grad_norm": 4.168130874633789, + "learning_rate": 4.528296482987742e-06, + "logits/chosen": 10.273550033569336, + "logits/rejected": 15.078262329101562, + "logps/chosen": -153.53765869140625, + "logps/rejected": -222.0395965576172, + "loss": 0.6152, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.37599116563796997, + "rewards/margins": 0.2333216369152069, + "rewards/rejected": 0.14266952872276306, + "step": 3587 + }, + { + "epoch": 0.5548811134738063, + "grad_norm": 4.672872066497803, + "learning_rate": 4.528010081338069e-06, + "logits/chosen": 7.191455841064453, + "logits/rejected": 8.818643569946289, + "logps/chosen": -228.7047576904297, + "logps/rejected": -263.59332275390625, + "loss": 0.6474, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.5539558529853821, + "rewards/margins": 0.22437722980976105, + "rewards/rejected": 0.3295786380767822, + "step": 3588 + }, + { + "epoch": 0.5550357626135705, + "grad_norm": 5.20742130279541, + "learning_rate": 4.5277236796883954e-06, + "logits/chosen": 7.266532897949219, + "logits/rejected": 9.920047760009766, + "logps/chosen": -283.619384765625, + "logps/rejected": -306.19854736328125, + "loss": 0.6844, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.6154531836509705, + "rewards/margins": 0.044618964195251465, + "rewards/rejected": 0.5708341598510742, + "step": 3589 + }, + { + "epoch": 0.5551904117533346, + "grad_norm": 9.111827850341797, + "learning_rate": 4.527437278038722e-06, + "logits/chosen": 4.200196743011475, + "logits/rejected": 2.7336552143096924, + "logps/chosen": -334.99664306640625, + "logps/rejected": -377.4158935546875, + "loss": 0.7059, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.2466956079006195, + "rewards/margins": -0.009928415529429913, + "rewards/rejected": 0.2566240429878235, + "step": 3590 + }, + { + "epoch": 0.5553450608930988, + "grad_norm": 7.970217227935791, + "learning_rate": 4.527150876389048e-06, + "logits/chosen": 7.0902862548828125, + "logits/rejected": 7.37230920791626, + "logps/chosen": -284.6959228515625, + "logps/rejected": -310.8411560058594, + "loss": 0.7812, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.5816724300384521, + "rewards/margins": -0.1375368982553482, + "rewards/rejected": 0.7192093133926392, + "step": 3591 + }, + { + "epoch": 0.5554997100328629, + "grad_norm": 6.084896087646484, + "learning_rate": 4.5268644747393745e-06, + "logits/chosen": 6.012429237365723, + "logits/rejected": 1.8172115087509155, + "logps/chosen": -296.15081787109375, + "logps/rejected": -187.25193786621094, + "loss": 0.6238, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.053958505392074585, + "rewards/margins": 0.22143949568271637, + "rewards/rejected": -0.16748099029064178, + "step": 3592 + }, + { + "epoch": 0.5556543591726271, + "grad_norm": 6.385085105895996, + "learning_rate": 4.526578073089701e-06, + "logits/chosen": 15.055275917053223, + "logits/rejected": 5.783827304840088, + "logps/chosen": -301.31109619140625, + "logps/rejected": -221.2132110595703, + "loss": 0.781, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.07564354687929153, + "rewards/margins": -0.13373184204101562, + "rewards/rejected": 0.058088287711143494, + "step": 3593 + }, + { + "epoch": 0.5558090083123912, + "grad_norm": 6.9723405838012695, + "learning_rate": 4.526291671440028e-06, + "logits/chosen": 13.371246337890625, + "logits/rejected": 8.344164848327637, + "logps/chosen": -263.8916015625, + "logps/rejected": -230.72943115234375, + "loss": 0.6329, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.40703660249710083, + "rewards/margins": 0.18481919169425964, + "rewards/rejected": 0.2222174108028412, + "step": 3594 + }, + { + "epoch": 0.5559636574521554, + "grad_norm": 11.827320098876953, + "learning_rate": 4.5260052697903545e-06, + "logits/chosen": 9.371259689331055, + "logits/rejected": 10.057371139526367, + "logps/chosen": -281.1583557128906, + "logps/rejected": -296.6534423828125, + "loss": 0.6414, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.16524186730384827, + "rewards/margins": 0.13216614723205566, + "rewards/rejected": 0.03307570889592171, + "step": 3595 + }, + { + "epoch": 0.5561183065919196, + "grad_norm": 7.136361122131348, + "learning_rate": 4.525718868140681e-06, + "logits/chosen": 6.639684677124023, + "logits/rejected": 6.538175582885742, + "logps/chosen": -306.7061462402344, + "logps/rejected": -168.71986389160156, + "loss": 0.7479, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.1392790824174881, + "rewards/margins": -0.004366010427474976, + "rewards/rejected": 0.14364507794380188, + "step": 3596 + }, + { + "epoch": 0.5562729557316838, + "grad_norm": 4.886064529418945, + "learning_rate": 4.525432466491007e-06, + "logits/chosen": 10.801265716552734, + "logits/rejected": 6.915102005004883, + "logps/chosen": -213.23863220214844, + "logps/rejected": -254.48770141601562, + "loss": 0.5227, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.2671562731266022, + "rewards/margins": 0.47924545407295227, + "rewards/rejected": -0.2120891511440277, + "step": 3597 + }, + { + "epoch": 0.5564276048714479, + "grad_norm": 4.799747943878174, + "learning_rate": 4.525146064841334e-06, + "logits/chosen": 10.222060203552246, + "logits/rejected": 9.786624908447266, + "logps/chosen": -212.83509826660156, + "logps/rejected": -195.61679077148438, + "loss": 0.6008, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.39520156383514404, + "rewards/margins": 0.260658860206604, + "rewards/rejected": 0.13454273343086243, + "step": 3598 + }, + { + "epoch": 0.5565822540112121, + "grad_norm": 5.969414710998535, + "learning_rate": 4.52485966319166e-06, + "logits/chosen": 9.955301284790039, + "logits/rejected": 7.979180335998535, + "logps/chosen": -231.13043212890625, + "logps/rejected": -270.25738525390625, + "loss": 0.6987, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.5927839875221252, + "rewards/margins": 0.16440770030021667, + "rewards/rejected": 0.42837631702423096, + "step": 3599 + }, + { + "epoch": 0.5567369031509762, + "grad_norm": 4.681183338165283, + "learning_rate": 4.524573261541987e-06, + "logits/chosen": 15.351505279541016, + "logits/rejected": 11.054553031921387, + "logps/chosen": -201.1328125, + "logps/rejected": -217.37356567382812, + "loss": 0.6229, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.29159650206565857, + "rewards/margins": 0.1804816722869873, + "rewards/rejected": 0.11111484467983246, + "step": 3600 + }, + { + "epoch": 0.5568915522907404, + "grad_norm": 4.315676212310791, + "learning_rate": 4.5242868598923135e-06, + "logits/chosen": 9.01484203338623, + "logits/rejected": -0.9540919065475464, + "logps/chosen": -343.8393249511719, + "logps/rejected": -234.0284423828125, + "loss": 0.4608, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.4085361659526825, + "rewards/margins": 0.7119013667106628, + "rewards/rejected": -0.30336523056030273, + "step": 3601 + }, + { + "epoch": 0.5570462014305045, + "grad_norm": 5.52816104888916, + "learning_rate": 4.52400045824264e-06, + "logits/chosen": 9.709121704101562, + "logits/rejected": 8.241652488708496, + "logps/chosen": -223.72882080078125, + "logps/rejected": -208.41815185546875, + "loss": 0.665, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.1758045107126236, + "rewards/margins": 0.1667385697364807, + "rewards/rejected": 0.009065959602594376, + "step": 3602 + }, + { + "epoch": 0.5572008505702687, + "grad_norm": 5.233506202697754, + "learning_rate": 4.523714056592967e-06, + "logits/chosen": 11.593863487243652, + "logits/rejected": 13.646553039550781, + "logps/chosen": -252.87530517578125, + "logps/rejected": -259.9027099609375, + "loss": 0.78, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.12750694155693054, + "rewards/margins": -0.04338666796684265, + "rewards/rejected": 0.1708935797214508, + "step": 3603 + }, + { + "epoch": 0.5573554997100328, + "grad_norm": 7.003399848937988, + "learning_rate": 4.523427654943293e-06, + "logits/chosen": 11.303274154663086, + "logits/rejected": 5.539395809173584, + "logps/chosen": -299.13519287109375, + "logps/rejected": -204.26319885253906, + "loss": 0.6439, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.5186868906021118, + "rewards/margins": 0.17752408981323242, + "rewards/rejected": 0.3411628007888794, + "step": 3604 + }, + { + "epoch": 0.557510148849797, + "grad_norm": 4.704915523529053, + "learning_rate": 4.523141253293619e-06, + "logits/chosen": 2.3328161239624023, + "logits/rejected": 4.889397621154785, + "logps/chosen": -146.39877319335938, + "logps/rejected": -522.343505859375, + "loss": 0.6458, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.10141687095165253, + "rewards/margins": 0.20595534145832062, + "rewards/rejected": -0.1045384630560875, + "step": 3605 + }, + { + "epoch": 0.5576647979895611, + "grad_norm": 4.940998077392578, + "learning_rate": 4.522854851643946e-06, + "logits/chosen": 7.311234951019287, + "logits/rejected": 4.039869785308838, + "logps/chosen": -196.97576904296875, + "logps/rejected": -171.3727569580078, + "loss": 0.6224, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.4244900345802307, + "rewards/margins": 0.2368585616350174, + "rewards/rejected": 0.18763147294521332, + "step": 3606 + }, + { + "epoch": 0.5578194471293253, + "grad_norm": 4.063277721405029, + "learning_rate": 4.522568449994273e-06, + "logits/chosen": 6.424857139587402, + "logits/rejected": 7.713414192199707, + "logps/chosen": -227.69915771484375, + "logps/rejected": -219.16546630859375, + "loss": 0.6333, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.4822864532470703, + "rewards/margins": 0.16694718599319458, + "rewards/rejected": 0.3153392970561981, + "step": 3607 + }, + { + "epoch": 0.5579740962690894, + "grad_norm": 5.726160049438477, + "learning_rate": 4.522282048344599e-06, + "logits/chosen": 10.90269947052002, + "logits/rejected": 11.126384735107422, + "logps/chosen": -262.9569396972656, + "logps/rejected": -233.80538940429688, + "loss": 0.7248, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.22630012035369873, + "rewards/margins": 0.009243473410606384, + "rewards/rejected": 0.21705663204193115, + "step": 3608 + }, + { + "epoch": 0.5581287454088537, + "grad_norm": 3.4471685886383057, + "learning_rate": 4.521995646694926e-06, + "logits/chosen": 8.60572624206543, + "logits/rejected": 7.920344352722168, + "logps/chosen": -218.6759490966797, + "logps/rejected": -242.67666625976562, + "loss": 0.4743, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.33369648456573486, + "rewards/margins": 0.5977742671966553, + "rewards/rejected": -0.26407772302627563, + "step": 3609 + }, + { + "epoch": 0.5582833945486179, + "grad_norm": 4.6529035568237305, + "learning_rate": 4.521709245045252e-06, + "logits/chosen": 6.490020751953125, + "logits/rejected": 7.3491363525390625, + "logps/chosen": -249.46868896484375, + "logps/rejected": -216.26907348632812, + "loss": 0.6209, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.38138917088508606, + "rewards/margins": 0.22886231541633606, + "rewards/rejected": 0.1525268852710724, + "step": 3610 + }, + { + "epoch": 0.558438043688382, + "grad_norm": 4.666914939880371, + "learning_rate": 4.521422843395578e-06, + "logits/chosen": 9.001693725585938, + "logits/rejected": 8.91424560546875, + "logps/chosen": -247.14523315429688, + "logps/rejected": -252.96237182617188, + "loss": 0.6423, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.5848187208175659, + "rewards/margins": 0.1357874721288681, + "rewards/rejected": 0.449031263589859, + "step": 3611 + }, + { + "epoch": 0.5585926928281462, + "grad_norm": 4.410085678100586, + "learning_rate": 4.521136441745905e-06, + "logits/chosen": 17.82365608215332, + "logits/rejected": 10.565075874328613, + "logps/chosen": -268.4102478027344, + "logps/rejected": -239.2488555908203, + "loss": 0.5114, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.558750331401825, + "rewards/margins": 0.5254983901977539, + "rewards/rejected": 0.03325194865465164, + "step": 3612 + }, + { + "epoch": 0.5587473419679103, + "grad_norm": 6.610159873962402, + "learning_rate": 4.520850040096232e-06, + "logits/chosen": 6.7402729988098145, + "logits/rejected": 9.701754570007324, + "logps/chosen": -284.0154724121094, + "logps/rejected": -301.650146484375, + "loss": 0.7617, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.6410777568817139, + "rewards/margins": -0.09951437264680862, + "rewards/rejected": 0.7405920624732971, + "step": 3613 + }, + { + "epoch": 0.5589019911076745, + "grad_norm": 5.578883647918701, + "learning_rate": 4.520563638446558e-06, + "logits/chosen": 10.940439224243164, + "logits/rejected": 8.70705509185791, + "logps/chosen": -386.8883972167969, + "logps/rejected": -329.6268005371094, + "loss": 0.6354, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.732818603515625, + "rewards/margins": 0.32858431339263916, + "rewards/rejected": 0.4042343497276306, + "step": 3614 + }, + { + "epoch": 0.5590566402474386, + "grad_norm": 6.58540153503418, + "learning_rate": 4.520277236796884e-06, + "logits/chosen": 6.423508644104004, + "logits/rejected": 6.327029228210449, + "logps/chosen": -315.8009338378906, + "logps/rejected": -228.36264038085938, + "loss": 0.6112, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.4469373822212219, + "rewards/margins": 0.24720098078250885, + "rewards/rejected": 0.19973641633987427, + "step": 3615 + }, + { + "epoch": 0.5592112893872028, + "grad_norm": 4.744726181030273, + "learning_rate": 4.519990835147211e-06, + "logits/chosen": 10.787200927734375, + "logits/rejected": 8.305334091186523, + "logps/chosen": -277.614990234375, + "logps/rejected": -241.25241088867188, + "loss": 0.5334, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.7808390855789185, + "rewards/margins": 0.43273472785949707, + "rewards/rejected": 0.34810441732406616, + "step": 3616 + }, + { + "epoch": 0.5593659385269669, + "grad_norm": 6.494253158569336, + "learning_rate": 4.519704433497537e-06, + "logits/chosen": 7.153563499450684, + "logits/rejected": 5.177081108093262, + "logps/chosen": -277.0997314453125, + "logps/rejected": -269.0425109863281, + "loss": 0.8098, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.148020938038826, + "rewards/margins": -0.07553607225418091, + "rewards/rejected": 0.2235570251941681, + "step": 3617 + }, + { + "epoch": 0.5595205876667311, + "grad_norm": 4.230822563171387, + "learning_rate": 4.519418031847864e-06, + "logits/chosen": 9.969825744628906, + "logits/rejected": 7.020700454711914, + "logps/chosen": -137.01666259765625, + "logps/rejected": -109.45060729980469, + "loss": 0.6858, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.21498818695545197, + "rewards/margins": 0.0501728281378746, + "rewards/rejected": 0.16481536626815796, + "step": 3618 + }, + { + "epoch": 0.5596752368064952, + "grad_norm": 5.804405689239502, + "learning_rate": 4.51913163019819e-06, + "logits/chosen": 8.10032844543457, + "logits/rejected": 4.302483558654785, + "logps/chosen": -312.70654296875, + "logps/rejected": -288.6571960449219, + "loss": 0.6034, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.34845641255378723, + "rewards/margins": 0.225398451089859, + "rewards/rejected": 0.1230580061674118, + "step": 3619 + }, + { + "epoch": 0.5598298859462594, + "grad_norm": 5.436731338500977, + "learning_rate": 4.5188452285485165e-06, + "logits/chosen": 10.971719741821289, + "logits/rejected": 13.33697509765625, + "logps/chosen": -262.65435791015625, + "logps/rejected": -269.7942810058594, + "loss": 0.6509, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.2128930389881134, + "rewards/margins": 0.13772663474082947, + "rewards/rejected": 0.07516643404960632, + "step": 3620 + }, + { + "epoch": 0.5599845350860236, + "grad_norm": 5.076542854309082, + "learning_rate": 4.518558826898843e-06, + "logits/chosen": 8.224015235900879, + "logits/rejected": -0.49716126918792725, + "logps/chosen": -274.8866271972656, + "logps/rejected": -173.148193359375, + "loss": 0.532, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.7412557601928711, + "rewards/margins": 0.4346083402633667, + "rewards/rejected": 0.306647390127182, + "step": 3621 + }, + { + "epoch": 0.5601391842257878, + "grad_norm": 4.226314067840576, + "learning_rate": 4.51827242524917e-06, + "logits/chosen": 10.973672866821289, + "logits/rejected": 6.423691749572754, + "logps/chosen": -265.5919494628906, + "logps/rejected": -201.13607788085938, + "loss": 0.6009, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.5679236650466919, + "rewards/margins": 0.2588083744049072, + "rewards/rejected": 0.30911532044410706, + "step": 3622 + }, + { + "epoch": 0.5602938333655519, + "grad_norm": 4.895264625549316, + "learning_rate": 4.5179860235994965e-06, + "logits/chosen": 7.835058212280273, + "logits/rejected": 7.824412822723389, + "logps/chosen": -212.9092559814453, + "logps/rejected": -162.3740234375, + "loss": 0.5371, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.39376819133758545, + "rewards/margins": 0.4903585910797119, + "rewards/rejected": -0.09659042954444885, + "step": 3623 + }, + { + "epoch": 0.5604484825053161, + "grad_norm": 4.777890682220459, + "learning_rate": 4.517699621949822e-06, + "logits/chosen": 7.037005424499512, + "logits/rejected": 7.80773401260376, + "logps/chosen": -224.48651123046875, + "logps/rejected": -231.4244384765625, + "loss": 0.6012, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.6564013957977295, + "rewards/margins": 0.272576242685318, + "rewards/rejected": 0.3838251829147339, + "step": 3624 + }, + { + "epoch": 0.5606031316450802, + "grad_norm": 5.514266014099121, + "learning_rate": 4.517413220300149e-06, + "logits/chosen": 12.807656288146973, + "logits/rejected": 8.926528930664062, + "logps/chosen": -251.1391143798828, + "logps/rejected": -348.05987548828125, + "loss": 0.7072, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.35980215668678284, + "rewards/margins": 0.11906873434782028, + "rewards/rejected": 0.24073341488838196, + "step": 3625 + }, + { + "epoch": 0.5607577807848444, + "grad_norm": 5.114682197570801, + "learning_rate": 4.5171268186504756e-06, + "logits/chosen": 8.046839714050293, + "logits/rejected": 10.96063232421875, + "logps/chosen": -160.58099365234375, + "logps/rejected": -195.5984344482422, + "loss": 0.7372, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.23748008906841278, + "rewards/margins": -0.05442379415035248, + "rewards/rejected": 0.29190391302108765, + "step": 3626 + }, + { + "epoch": 0.5609124299246085, + "grad_norm": 4.064693927764893, + "learning_rate": 4.516840417000802e-06, + "logits/chosen": 12.972407341003418, + "logits/rejected": 6.20378303527832, + "logps/chosen": -302.0226745605469, + "logps/rejected": -201.37860107421875, + "loss": 0.4759, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.6497853994369507, + "rewards/margins": 0.6856660842895508, + "rewards/rejected": -0.03588065505027771, + "step": 3627 + }, + { + "epoch": 0.5610670790643727, + "grad_norm": 7.956607818603516, + "learning_rate": 4.516554015351129e-06, + "logits/chosen": 10.036197662353516, + "logits/rejected": 13.230546951293945, + "logps/chosen": -274.1005554199219, + "logps/rejected": -272.1798095703125, + "loss": 1.0076, + "rewards/accuracies": 0.25, + "rewards/chosen": 0.15438240766525269, + "rewards/margins": -0.43600937724113464, + "rewards/rejected": 0.5903917551040649, + "step": 3628 + }, + { + "epoch": 0.5612217282041368, + "grad_norm": 5.724521160125732, + "learning_rate": 4.5162676137014555e-06, + "logits/chosen": 1.750023365020752, + "logits/rejected": 9.721931457519531, + "logps/chosen": -196.28253173828125, + "logps/rejected": -282.239990234375, + "loss": 0.7141, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.10155295580625534, + "rewards/margins": 0.0940236896276474, + "rewards/rejected": 0.007529273629188538, + "step": 3629 + }, + { + "epoch": 0.561376377343901, + "grad_norm": 3.527271032333374, + "learning_rate": 4.515981212051781e-06, + "logits/chosen": 8.080997467041016, + "logits/rejected": 2.065378427505493, + "logps/chosen": -159.24478149414062, + "logps/rejected": -119.29331970214844, + "loss": 0.4922, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.28644341230392456, + "rewards/margins": 0.5212591886520386, + "rewards/rejected": -0.23481574654579163, + "step": 3630 + }, + { + "epoch": 0.5615310264836652, + "grad_norm": 7.018500804901123, + "learning_rate": 4.515694810402108e-06, + "logits/chosen": 4.208637237548828, + "logits/rejected": 6.619284629821777, + "logps/chosen": -167.72523498535156, + "logps/rejected": -224.14328002929688, + "loss": 0.8243, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.2563505172729492, + "rewards/margins": -0.03866724669933319, + "rewards/rejected": 0.2950177490711212, + "step": 3631 + }, + { + "epoch": 0.5616856756234293, + "grad_norm": 6.5749921798706055, + "learning_rate": 4.515408408752435e-06, + "logits/chosen": 7.861728668212891, + "logits/rejected": 9.753066062927246, + "logps/chosen": -260.8916015625, + "logps/rejected": -310.81903076171875, + "loss": 0.7772, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.0019121654331684113, + "rewards/margins": -0.13817378878593445, + "rewards/rejected": 0.13626162707805634, + "step": 3632 + }, + { + "epoch": 0.5618403247631935, + "grad_norm": 6.300232887268066, + "learning_rate": 4.515122007102761e-06, + "logits/chosen": 8.915915489196777, + "logits/rejected": 6.9757256507873535, + "logps/chosen": -273.7812194824219, + "logps/rejected": -286.6151123046875, + "loss": 0.5806, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.6126001477241516, + "rewards/margins": 0.35391557216644287, + "rewards/rejected": 0.25868457555770874, + "step": 3633 + }, + { + "epoch": 0.5619949739029577, + "grad_norm": 4.627342224121094, + "learning_rate": 4.514835605453088e-06, + "logits/chosen": 7.098930835723877, + "logits/rejected": 4.131325721740723, + "logps/chosen": -203.69952392578125, + "logps/rejected": -164.17886352539062, + "loss": 0.6091, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.397869735956192, + "rewards/margins": 0.2247088998556137, + "rewards/rejected": 0.1731608510017395, + "step": 3634 + }, + { + "epoch": 0.5621496230427219, + "grad_norm": 5.479161262512207, + "learning_rate": 4.5145492038034146e-06, + "logits/chosen": 4.6706085205078125, + "logits/rejected": -0.017857074737548828, + "logps/chosen": -271.0173645019531, + "logps/rejected": -209.44134521484375, + "loss": 0.6623, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.07213973999023438, + "rewards/margins": 0.20365524291992188, + "rewards/rejected": -0.1315155327320099, + "step": 3635 + }, + { + "epoch": 0.562304272182486, + "grad_norm": 7.688310146331787, + "learning_rate": 4.514262802153741e-06, + "logits/chosen": 6.278905868530273, + "logits/rejected": 10.441362380981445, + "logps/chosen": -303.8002624511719, + "logps/rejected": -330.2528076171875, + "loss": 0.7186, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.2380618155002594, + "rewards/margins": 0.03998413681983948, + "rewards/rejected": 0.19807769358158112, + "step": 3636 + }, + { + "epoch": 0.5624589213222502, + "grad_norm": 4.849839687347412, + "learning_rate": 4.513976400504067e-06, + "logits/chosen": 9.330001831054688, + "logits/rejected": 13.519051551818848, + "logps/chosen": -224.05209350585938, + "logps/rejected": -274.3986511230469, + "loss": 0.6609, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.5663037300109863, + "rewards/margins": 0.10518389195203781, + "rewards/rejected": 0.46111980080604553, + "step": 3637 + }, + { + "epoch": 0.5626135704620143, + "grad_norm": 6.879826545715332, + "learning_rate": 4.513689998854394e-06, + "logits/chosen": 5.863480567932129, + "logits/rejected": -0.09810042381286621, + "logps/chosen": -297.7219543457031, + "logps/rejected": -219.74148559570312, + "loss": 0.7047, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.030804306268692017, + "rewards/margins": 0.16858971118927002, + "rewards/rejected": -0.19939398765563965, + "step": 3638 + }, + { + "epoch": 0.5627682196017785, + "grad_norm": 114.09884643554688, + "learning_rate": 4.51340359720472e-06, + "logits/chosen": 11.261488914489746, + "logits/rejected": 3.533715009689331, + "logps/chosen": -315.0606689453125, + "logps/rejected": -260.05902099609375, + "loss": 0.5768, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.4755340814590454, + "rewards/margins": 0.44022253155708313, + "rewards/rejected": 0.035311512649059296, + "step": 3639 + }, + { + "epoch": 0.5629228687415426, + "grad_norm": 5.230422496795654, + "learning_rate": 4.513117195555047e-06, + "logits/chosen": 6.309638977050781, + "logits/rejected": 5.827183723449707, + "logps/chosen": -210.89346313476562, + "logps/rejected": -244.02325439453125, + "loss": 0.7449, + "rewards/accuracies": 0.125, + "rewards/chosen": 0.16464069485664368, + "rewards/margins": -0.017087697982788086, + "rewards/rejected": 0.18172839283943176, + "step": 3640 + }, + { + "epoch": 0.5630775178813068, + "grad_norm": 5.134578704833984, + "learning_rate": 4.512830793905374e-06, + "logits/chosen": 9.236634254455566, + "logits/rejected": 5.6499176025390625, + "logps/chosen": -358.2993469238281, + "logps/rejected": -399.5333251953125, + "loss": 0.3958, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.8248772621154785, + "rewards/margins": 0.8987542390823364, + "rewards/rejected": -0.07387696206569672, + "step": 3641 + }, + { + "epoch": 0.5632321670210709, + "grad_norm": 4.882595062255859, + "learning_rate": 4.5125443922557e-06, + "logits/chosen": 9.226993560791016, + "logits/rejected": 6.979156494140625, + "logps/chosen": -232.09095764160156, + "logps/rejected": -288.50079345703125, + "loss": 0.5659, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.5040963292121887, + "rewards/margins": 0.32436180114746094, + "rewards/rejected": 0.17973452806472778, + "step": 3642 + }, + { + "epoch": 0.5633868161608351, + "grad_norm": 3.5573363304138184, + "learning_rate": 4.512257990606026e-06, + "logits/chosen": 10.841207504272461, + "logits/rejected": 15.784500122070312, + "logps/chosen": -153.48143005371094, + "logps/rejected": -255.61265563964844, + "loss": 0.5567, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.10322020202875137, + "rewards/margins": 0.3135751485824585, + "rewards/rejected": -0.21035495400428772, + "step": 3643 + }, + { + "epoch": 0.5635414653005992, + "grad_norm": 6.520923614501953, + "learning_rate": 4.511971588956353e-06, + "logits/chosen": 9.688759803771973, + "logits/rejected": 13.476829528808594, + "logps/chosen": -420.51678466796875, + "logps/rejected": -450.56268310546875, + "loss": 0.6223, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.3638840317726135, + "rewards/margins": 0.24585817754268646, + "rewards/rejected": 0.11802583932876587, + "step": 3644 + }, + { + "epoch": 0.5636961144403634, + "grad_norm": 5.898277282714844, + "learning_rate": 4.511685187306679e-06, + "logits/chosen": 9.18428897857666, + "logits/rejected": 9.48148250579834, + "logps/chosen": -250.00482177734375, + "logps/rejected": -328.3102111816406, + "loss": 0.5957, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.44468259811401367, + "rewards/margins": 0.25656577944755554, + "rewards/rejected": 0.18811683356761932, + "step": 3645 + }, + { + "epoch": 0.5638507635801275, + "grad_norm": 5.621179580688477, + "learning_rate": 4.511398785657006e-06, + "logits/chosen": 13.359064102172852, + "logits/rejected": 11.708257675170898, + "logps/chosen": -248.6935577392578, + "logps/rejected": -213.87918090820312, + "loss": 0.6613, + "rewards/accuracies": 0.5, + "rewards/chosen": 9.238719940185547e-07, + "rewards/margins": 0.10682410001754761, + "rewards/rejected": -0.10682317614555359, + "step": 3646 + }, + { + "epoch": 0.5640054127198918, + "grad_norm": 3.9496428966522217, + "learning_rate": 4.511112384007333e-06, + "logits/chosen": 9.742288589477539, + "logits/rejected": 5.747826099395752, + "logps/chosen": -325.0531005859375, + "logps/rejected": -275.94976806640625, + "loss": 0.4777, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.5693107843399048, + "rewards/margins": 0.6642358899116516, + "rewards/rejected": -0.09492513537406921, + "step": 3647 + }, + { + "epoch": 0.564160061859656, + "grad_norm": 77.00012969970703, + "learning_rate": 4.5108259823576585e-06, + "logits/chosen": 13.071802139282227, + "logits/rejected": 3.804473638534546, + "logps/chosen": -258.6274719238281, + "logps/rejected": -201.11111450195312, + "loss": 0.7043, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.25364238023757935, + "rewards/margins": 0.08735001087188721, + "rewards/rejected": 0.16629236936569214, + "step": 3648 + }, + { + "epoch": 0.5643147109994201, + "grad_norm": 4.047515869140625, + "learning_rate": 4.510539580707985e-06, + "logits/chosen": 10.43231201171875, + "logits/rejected": -1.2250852584838867, + "logps/chosen": -310.70562744140625, + "logps/rejected": -169.94357299804688, + "loss": 0.6077, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.36159229278564453, + "rewards/margins": 0.3272063136100769, + "rewards/rejected": 0.034385960549116135, + "step": 3649 + }, + { + "epoch": 0.5644693601391843, + "grad_norm": 4.396068572998047, + "learning_rate": 4.510253179058312e-06, + "logits/chosen": 10.8219633102417, + "logits/rejected": 3.9990551471710205, + "logps/chosen": -328.985595703125, + "logps/rejected": -262.834716796875, + "loss": 0.5517, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.12695759534835815, + "rewards/margins": 0.372251957654953, + "rewards/rejected": -0.24529439210891724, + "step": 3650 + }, + { + "epoch": 0.5646240092789484, + "grad_norm": 5.448184013366699, + "learning_rate": 4.5099667774086384e-06, + "logits/chosen": 13.655328750610352, + "logits/rejected": 9.747859954833984, + "logps/chosen": -267.9127197265625, + "logps/rejected": -266.034423828125, + "loss": 0.5706, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.2778496742248535, + "rewards/margins": 0.2799261212348938, + "rewards/rejected": -0.0020764395594596863, + "step": 3651 + }, + { + "epoch": 0.5647786584187126, + "grad_norm": 5.679709434509277, + "learning_rate": 4.509680375758965e-06, + "logits/chosen": 6.006450653076172, + "logits/rejected": 8.318628311157227, + "logps/chosen": -225.3775177001953, + "logps/rejected": -298.083251953125, + "loss": 0.73, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.18423233926296234, + "rewards/margins": 0.009407687932252884, + "rewards/rejected": 0.17482465505599976, + "step": 3652 + }, + { + "epoch": 0.5649333075584767, + "grad_norm": 7.878204345703125, + "learning_rate": 4.509393974109291e-06, + "logits/chosen": 7.103664875030518, + "logits/rejected": 10.92629337310791, + "logps/chosen": -316.171875, + "logps/rejected": -322.41424560546875, + "loss": 1.0135, + "rewards/accuracies": 0.25, + "rewards/chosen": -0.08354025334119797, + "rewards/margins": -0.4855995774269104, + "rewards/rejected": 0.40205928683280945, + "step": 3653 + }, + { + "epoch": 0.5650879566982409, + "grad_norm": 5.877285003662109, + "learning_rate": 4.5091075724596175e-06, + "logits/chosen": 10.732038497924805, + "logits/rejected": 11.95211124420166, + "logps/chosen": -181.72096252441406, + "logps/rejected": -182.55142211914062, + "loss": 0.7745, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.28420814871788025, + "rewards/margins": -0.08466291427612305, + "rewards/rejected": 0.3688710629940033, + "step": 3654 + }, + { + "epoch": 0.565242605838005, + "grad_norm": 7.787291526794434, + "learning_rate": 4.508821170809944e-06, + "logits/chosen": 6.113892555236816, + "logits/rejected": 12.226123809814453, + "logps/chosen": -300.123779296875, + "logps/rejected": -379.57769775390625, + "loss": 0.7193, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.0034510791301727295, + "rewards/margins": 0.10299862921237946, + "rewards/rejected": -0.09954756498336792, + "step": 3655 + }, + { + "epoch": 0.5653972549777692, + "grad_norm": 5.9066338539123535, + "learning_rate": 4.508534769160271e-06, + "logits/chosen": 9.238149642944336, + "logits/rejected": 10.223678588867188, + "logps/chosen": -236.20460510253906, + "logps/rejected": -213.0561065673828, + "loss": 0.7917, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.044362232089042664, + "rewards/margins": -0.10226662456989288, + "rewards/rejected": 0.05790438503026962, + "step": 3656 + }, + { + "epoch": 0.5655519041175333, + "grad_norm": 5.332758903503418, + "learning_rate": 4.508248367510597e-06, + "logits/chosen": 11.049283981323242, + "logits/rejected": 12.803634643554688, + "logps/chosen": -213.97698974609375, + "logps/rejected": -205.62265014648438, + "loss": 0.708, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.314159095287323, + "rewards/margins": 0.15524911880493164, + "rewards/rejected": -0.46940815448760986, + "step": 3657 + }, + { + "epoch": 0.5657065532572975, + "grad_norm": 6.386929988861084, + "learning_rate": 4.507961965860923e-06, + "logits/chosen": 10.231005668640137, + "logits/rejected": 6.274954319000244, + "logps/chosen": -313.561279296875, + "logps/rejected": -308.9860534667969, + "loss": 0.5213, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.38761091232299805, + "rewards/margins": 0.5744739770889282, + "rewards/rejected": -0.18686306476593018, + "step": 3658 + }, + { + "epoch": 0.5658612023970616, + "grad_norm": 3.730976104736328, + "learning_rate": 4.50767556421125e-06, + "logits/chosen": 10.138751029968262, + "logits/rejected": 3.912522315979004, + "logps/chosen": -263.2839050292969, + "logps/rejected": -158.0906982421875, + "loss": 0.511, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.5830817222595215, + "rewards/margins": 0.5625765323638916, + "rewards/rejected": 0.02050519734621048, + "step": 3659 + }, + { + "epoch": 0.5660158515368259, + "grad_norm": 4.402044296264648, + "learning_rate": 4.507389162561577e-06, + "logits/chosen": 12.287494659423828, + "logits/rejected": 5.61386251449585, + "logps/chosen": -301.85284423828125, + "logps/rejected": -204.0203399658203, + "loss": 0.498, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.529687762260437, + "rewards/margins": 0.47713372111320496, + "rewards/rejected": 0.05255403369665146, + "step": 3660 + }, + { + "epoch": 0.56617050067659, + "grad_norm": 4.895749092102051, + "learning_rate": 4.507102760911903e-06, + "logits/chosen": 11.684027671813965, + "logits/rejected": 8.538240432739258, + "logps/chosen": -303.5574645996094, + "logps/rejected": -269.60223388671875, + "loss": 0.6207, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.312182754278183, + "rewards/margins": 0.3468983471393585, + "rewards/rejected": -0.03471565619111061, + "step": 3661 + }, + { + "epoch": 0.5663251498163542, + "grad_norm": 3.913245916366577, + "learning_rate": 4.50681635926223e-06, + "logits/chosen": 15.79096794128418, + "logits/rejected": 11.60367488861084, + "logps/chosen": -249.99383544921875, + "logps/rejected": -209.7312774658203, + "loss": 0.5796, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.3750917613506317, + "rewards/margins": 0.2654780447483063, + "rewards/rejected": 0.10961371660232544, + "step": 3662 + }, + { + "epoch": 0.5664797989561183, + "grad_norm": 7.531073093414307, + "learning_rate": 4.506529957612556e-06, + "logits/chosen": 10.545195579528809, + "logits/rejected": 11.463794708251953, + "logps/chosen": -319.2952880859375, + "logps/rejected": -325.82080078125, + "loss": 0.6509, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.09842488169670105, + "rewards/margins": 0.31463348865509033, + "rewards/rejected": -0.21620863676071167, + "step": 3663 + }, + { + "epoch": 0.5666344480958825, + "grad_norm": 6.598790168762207, + "learning_rate": 4.506243555962882e-06, + "logits/chosen": 14.826682090759277, + "logits/rejected": 5.902122974395752, + "logps/chosen": -429.685302734375, + "logps/rejected": -260.0694274902344, + "loss": 0.7827, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.5122130513191223, + "rewards/margins": 0.07252617180347443, + "rewards/rejected": 0.4396868646144867, + "step": 3664 + }, + { + "epoch": 0.5667890972356466, + "grad_norm": 4.575944423675537, + "learning_rate": 4.505957154313209e-06, + "logits/chosen": 9.64094066619873, + "logits/rejected": 4.752739906311035, + "logps/chosen": -235.779052734375, + "logps/rejected": -134.34237670898438, + "loss": 0.6774, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.22035714983940125, + "rewards/margins": 0.13793662190437317, + "rewards/rejected": -0.358293741941452, + "step": 3665 + }, + { + "epoch": 0.5669437463754108, + "grad_norm": 6.269856929779053, + "learning_rate": 4.505670752663536e-06, + "logits/chosen": 12.617897033691406, + "logits/rejected": 7.922951698303223, + "logps/chosen": -353.73980712890625, + "logps/rejected": -256.1517639160156, + "loss": 0.6759, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.2883417010307312, + "rewards/margins": 0.1448616087436676, + "rewards/rejected": 0.1434801071882248, + "step": 3666 + }, + { + "epoch": 0.5670983955151749, + "grad_norm": 5.02537727355957, + "learning_rate": 4.505384351013862e-06, + "logits/chosen": 7.874388694763184, + "logits/rejected": 5.053269863128662, + "logps/chosen": -277.9246826171875, + "logps/rejected": -257.0106201171875, + "loss": 0.5354, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.017989888787269592, + "rewards/margins": 0.5474467277526855, + "rewards/rejected": -0.5294568538665771, + "step": 3667 + }, + { + "epoch": 0.5672530446549391, + "grad_norm": 5.3897600173950195, + "learning_rate": 4.505097949364189e-06, + "logits/chosen": 12.264787673950195, + "logits/rejected": 11.02160930633545, + "logps/chosen": -306.97894287109375, + "logps/rejected": -288.6179504394531, + "loss": 0.5599, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.2851245403289795, + "rewards/margins": 0.4912259876728058, + "rewards/rejected": -0.2061014324426651, + "step": 3668 + }, + { + "epoch": 0.5674076937947032, + "grad_norm": 4.734246253967285, + "learning_rate": 4.504811547714516e-06, + "logits/chosen": 9.199647903442383, + "logits/rejected": 10.080564498901367, + "logps/chosen": -251.60171508789062, + "logps/rejected": -185.6214141845703, + "loss": 0.5663, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.41670435667037964, + "rewards/margins": 0.30135297775268555, + "rewards/rejected": 0.11535138636827469, + "step": 3669 + }, + { + "epoch": 0.5675623429344674, + "grad_norm": 6.553821086883545, + "learning_rate": 4.504525146064841e-06, + "logits/chosen": 14.044720649719238, + "logits/rejected": 11.622954368591309, + "logps/chosen": -286.94451904296875, + "logps/rejected": -244.75901794433594, + "loss": 0.8628, + "rewards/accuracies": 0.25, + "rewards/chosen": -0.05636577308177948, + "rewards/margins": -0.28418415784835815, + "rewards/rejected": 0.22781839966773987, + "step": 3670 + }, + { + "epoch": 0.5677169920742315, + "grad_norm": 5.8462324142456055, + "learning_rate": 4.504238744415168e-06, + "logits/chosen": 9.776199340820312, + "logits/rejected": 10.036972045898438, + "logps/chosen": -334.365234375, + "logps/rejected": -349.728271484375, + "loss": 0.6829, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.0413224995136261, + "rewards/margins": 0.14678648114204407, + "rewards/rejected": -0.10546398162841797, + "step": 3671 + }, + { + "epoch": 0.5678716412139957, + "grad_norm": 10.066862106323242, + "learning_rate": 4.503952342765495e-06, + "logits/chosen": 7.677642345428467, + "logits/rejected": 11.483621597290039, + "logps/chosen": -226.54116821289062, + "logps/rejected": -322.24114990234375, + "loss": 0.6928, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.3108657896518707, + "rewards/margins": 0.03601866587996483, + "rewards/rejected": 0.274847149848938, + "step": 3672 + }, + { + "epoch": 0.56802629035376, + "grad_norm": 7.763453483581543, + "learning_rate": 4.503665941115821e-06, + "logits/chosen": 13.709482192993164, + "logits/rejected": 8.103269577026367, + "logps/chosen": -344.9154052734375, + "logps/rejected": -211.20916748046875, + "loss": 0.5759, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.1412796974182129, + "rewards/margins": 0.3192577362060547, + "rewards/rejected": -0.17797806859016418, + "step": 3673 + }, + { + "epoch": 0.5681809394935241, + "grad_norm": 3.7702691555023193, + "learning_rate": 4.503379539466148e-06, + "logits/chosen": 17.089126586914062, + "logits/rejected": 5.692880630493164, + "logps/chosen": -310.4942932128906, + "logps/rejected": -203.1164093017578, + "loss": 0.4442, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.14049378037452698, + "rewards/margins": 0.6498605608940125, + "rewards/rejected": -0.5093667507171631, + "step": 3674 + }, + { + "epoch": 0.5683355886332883, + "grad_norm": 5.43742561340332, + "learning_rate": 4.503093137816475e-06, + "logits/chosen": 8.659968376159668, + "logits/rejected": 5.5286173820495605, + "logps/chosen": -380.5871276855469, + "logps/rejected": -300.734130859375, + "loss": 0.5399, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.26291656494140625, + "rewards/margins": 0.40566912293434143, + "rewards/rejected": -0.14275255799293518, + "step": 3675 + }, + { + "epoch": 0.5684902377730524, + "grad_norm": 5.515951633453369, + "learning_rate": 4.5028067361668005e-06, + "logits/chosen": 7.380363941192627, + "logits/rejected": 3.4265217781066895, + "logps/chosen": -269.34716796875, + "logps/rejected": -227.3361053466797, + "loss": 0.4686, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.6138440370559692, + "rewards/margins": 0.5778607726097107, + "rewards/rejected": 0.03598332405090332, + "step": 3676 + }, + { + "epoch": 0.5686448869128166, + "grad_norm": 5.3107194900512695, + "learning_rate": 4.502520334517127e-06, + "logits/chosen": 9.512046813964844, + "logits/rejected": 8.318483352661133, + "logps/chosen": -272.41619873046875, + "logps/rejected": -260.7023620605469, + "loss": 0.5269, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.3114250898361206, + "rewards/margins": 0.5542429685592651, + "rewards/rejected": -0.24281789362430573, + "step": 3677 + }, + { + "epoch": 0.5687995360525807, + "grad_norm": 5.999351978302002, + "learning_rate": 4.502233932867454e-06, + "logits/chosen": 14.127909660339355, + "logits/rejected": 10.931772232055664, + "logps/chosen": -305.1776123046875, + "logps/rejected": -198.8520965576172, + "loss": 0.8066, + "rewards/accuracies": 0.25, + "rewards/chosen": 0.0014509186148643494, + "rewards/margins": -0.17532047629356384, + "rewards/rejected": 0.1767714023590088, + "step": 3678 + }, + { + "epoch": 0.5689541851923449, + "grad_norm": 6.649197578430176, + "learning_rate": 4.50194753121778e-06, + "logits/chosen": 12.58023738861084, + "logits/rejected": 9.021209716796875, + "logps/chosen": -320.255859375, + "logps/rejected": -254.71054077148438, + "loss": 0.7938, + "rewards/accuracies": 0.25, + "rewards/chosen": -0.031194686889648438, + "rewards/margins": -0.037896350026130676, + "rewards/rejected": 0.006701678037643433, + "step": 3679 + }, + { + "epoch": 0.569108834332109, + "grad_norm": 8.822691917419434, + "learning_rate": 4.501661129568107e-06, + "logits/chosen": 8.005599975585938, + "logits/rejected": 6.035144805908203, + "logps/chosen": -513.3710327148438, + "logps/rejected": -327.2975158691406, + "loss": 0.7097, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.27144578099250793, + "rewards/margins": 0.23865589499473572, + "rewards/rejected": 0.03278990089893341, + "step": 3680 + }, + { + "epoch": 0.5692634834718732, + "grad_norm": 10.953230857849121, + "learning_rate": 4.501374727918434e-06, + "logits/chosen": 7.964801788330078, + "logits/rejected": 3.9077465534210205, + "logps/chosen": -429.30487060546875, + "logps/rejected": -366.0997314453125, + "loss": 0.604, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.04000203311443329, + "rewards/margins": 0.43759608268737793, + "rewards/rejected": -0.39759403467178345, + "step": 3681 + }, + { + "epoch": 0.5694181326116373, + "grad_norm": 4.495512008666992, + "learning_rate": 4.5010883262687595e-06, + "logits/chosen": 3.5331974029541016, + "logits/rejected": 5.198853015899658, + "logps/chosen": -180.3953857421875, + "logps/rejected": -199.6255340576172, + "loss": 0.6876, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.03182120621204376, + "rewards/margins": 0.17631599307060242, + "rewards/rejected": -0.20813718438148499, + "step": 3682 + }, + { + "epoch": 0.5695727817514015, + "grad_norm": 4.0957417488098145, + "learning_rate": 4.500801924619086e-06, + "logits/chosen": 8.53170108795166, + "logits/rejected": 3.757833480834961, + "logps/chosen": -201.50238037109375, + "logps/rejected": -131.14205932617188, + "loss": 0.5042, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.22799883782863617, + "rewards/margins": 0.6147207021713257, + "rewards/rejected": -0.3867218494415283, + "step": 3683 + }, + { + "epoch": 0.5697274308911656, + "grad_norm": 4.614190578460693, + "learning_rate": 4.500515522969413e-06, + "logits/chosen": 7.984928131103516, + "logits/rejected": 5.929247856140137, + "logps/chosen": -220.29620361328125, + "logps/rejected": -210.05551147460938, + "loss": 0.6095, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.29516980051994324, + "rewards/margins": 0.19861462712287903, + "rewards/rejected": 0.0965551882982254, + "step": 3684 + }, + { + "epoch": 0.5698820800309299, + "grad_norm": 4.490009784698486, + "learning_rate": 4.5002291213197395e-06, + "logits/chosen": 8.425851821899414, + "logits/rejected": 6.92668342590332, + "logps/chosen": -161.73577880859375, + "logps/rejected": -160.24754333496094, + "loss": 0.6626, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.21719884872436523, + "rewards/margins": 0.11057616025209427, + "rewards/rejected": 0.10662268847227097, + "step": 3685 + }, + { + "epoch": 0.570036729170694, + "grad_norm": 34.64583206176758, + "learning_rate": 4.499942719670065e-06, + "logits/chosen": 9.462136268615723, + "logits/rejected": 7.680774688720703, + "logps/chosen": -155.32489013671875, + "logps/rejected": -124.76506805419922, + "loss": 0.5926, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.33925849199295044, + "rewards/margins": 0.24680644273757935, + "rewards/rejected": 0.0924520492553711, + "step": 3686 + }, + { + "epoch": 0.5701913783104582, + "grad_norm": 11.269735336303711, + "learning_rate": 4.499656318020392e-06, + "logits/chosen": 8.73222541809082, + "logits/rejected": 9.909281730651855, + "logps/chosen": -261.3561706542969, + "logps/rejected": -340.4133605957031, + "loss": 0.6472, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.06953859329223633, + "rewards/margins": 0.1912437528371811, + "rewards/rejected": -0.2607823312282562, + "step": 3687 + }, + { + "epoch": 0.5703460274502223, + "grad_norm": 7.320501327514648, + "learning_rate": 4.4993699163707186e-06, + "logits/chosen": 7.51926851272583, + "logits/rejected": 6.704490661621094, + "logps/chosen": -261.43597412109375, + "logps/rejected": -275.8111572265625, + "loss": 0.8212, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.05760951340198517, + "rewards/margins": -0.13593344390392303, + "rewards/rejected": 0.07832393050193787, + "step": 3688 + }, + { + "epoch": 0.5705006765899865, + "grad_norm": 5.138495445251465, + "learning_rate": 4.499083514721045e-06, + "logits/chosen": 9.874979019165039, + "logits/rejected": 12.54377555847168, + "logps/chosen": -217.4697723388672, + "logps/rejected": -224.77923583984375, + "loss": 0.7372, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.009426403790712357, + "rewards/margins": -0.03714337199926376, + "rewards/rejected": 0.027716970071196556, + "step": 3689 + }, + { + "epoch": 0.5706553257297506, + "grad_norm": 6.244549751281738, + "learning_rate": 4.498797113071372e-06, + "logits/chosen": 7.455750465393066, + "logits/rejected": 4.7432403564453125, + "logps/chosen": -353.31573486328125, + "logps/rejected": -292.26507568359375, + "loss": 0.4749, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.29922980070114136, + "rewards/margins": 0.6084624528884888, + "rewards/rejected": -0.309232622385025, + "step": 3690 + }, + { + "epoch": 0.5708099748695148, + "grad_norm": 4.084224224090576, + "learning_rate": 4.498510711421698e-06, + "logits/chosen": 12.917793273925781, + "logits/rejected": 3.246866226196289, + "logps/chosen": -279.272705078125, + "logps/rejected": -194.4368438720703, + "loss": 0.4774, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.6332556009292603, + "rewards/margins": 0.6148768663406372, + "rewards/rejected": 0.01837873086333275, + "step": 3691 + }, + { + "epoch": 0.5709646240092789, + "grad_norm": 6.143355846405029, + "learning_rate": 4.498224309772024e-06, + "logits/chosen": 9.517169952392578, + "logits/rejected": 6.6279826164245605, + "logps/chosen": -286.85748291015625, + "logps/rejected": -259.88275146484375, + "loss": 0.7693, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.07559224218130112, + "rewards/margins": -0.08706313371658325, + "rewards/rejected": 0.011470891535282135, + "step": 3692 + }, + { + "epoch": 0.5711192731490431, + "grad_norm": 6.04624080657959, + "learning_rate": 4.497937908122351e-06, + "logits/chosen": 14.013015747070312, + "logits/rejected": 3.865271806716919, + "logps/chosen": -316.0419921875, + "logps/rejected": -200.34170532226562, + "loss": 0.6086, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.07523509860038757, + "rewards/margins": 0.21238252520561218, + "rewards/rejected": -0.28761762380599976, + "step": 3693 + }, + { + "epoch": 0.5712739222888072, + "grad_norm": 5.281108379364014, + "learning_rate": 4.497651506472678e-06, + "logits/chosen": 12.506205558776855, + "logits/rejected": 5.24484920501709, + "logps/chosen": -384.62353515625, + "logps/rejected": -318.9117736816406, + "loss": 0.4616, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.5413500070571899, + "rewards/margins": 0.6123498678207397, + "rewards/rejected": -0.07099992036819458, + "step": 3694 + }, + { + "epoch": 0.5714285714285714, + "grad_norm": 3.5025908946990967, + "learning_rate": 4.497365104823004e-06, + "logits/chosen": 10.446781158447266, + "logits/rejected": 3.465008497238159, + "logps/chosen": -371.855712890625, + "logps/rejected": -193.8430938720703, + "loss": 0.4506, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.39129209518432617, + "rewards/margins": 0.7095879316329956, + "rewards/rejected": -0.31829580664634705, + "step": 3695 + }, + { + "epoch": 0.5715832205683355, + "grad_norm": 5.249233245849609, + "learning_rate": 4.49707870317333e-06, + "logits/chosen": 2.9382739067077637, + "logits/rejected": 1.4521952867507935, + "logps/chosen": -451.50823974609375, + "logps/rejected": -369.6345520019531, + "loss": 0.6066, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.07451462000608444, + "rewards/margins": 0.23689767718315125, + "rewards/rejected": -0.31141233444213867, + "step": 3696 + }, + { + "epoch": 0.5717378697080997, + "grad_norm": 4.745990753173828, + "learning_rate": 4.496792301523657e-06, + "logits/chosen": 14.343040466308594, + "logits/rejected": 7.235940933227539, + "logps/chosen": -381.2447509765625, + "logps/rejected": -349.32275390625, + "loss": 0.4675, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.3196329176425934, + "rewards/margins": 0.612358033657074, + "rewards/rejected": -0.2927250862121582, + "step": 3697 + }, + { + "epoch": 0.571892518847864, + "grad_norm": 7.157134056091309, + "learning_rate": 4.496505899873983e-06, + "logits/chosen": 6.340457916259766, + "logits/rejected": 4.8563642501831055, + "logps/chosen": -281.021484375, + "logps/rejected": -239.84185791015625, + "loss": 0.7218, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.34968167543411255, + "rewards/margins": 0.1062493622303009, + "rewards/rejected": -0.45593100786209106, + "step": 3698 + }, + { + "epoch": 0.5720471679876281, + "grad_norm": 5.116054534912109, + "learning_rate": 4.49621949822431e-06, + "logits/chosen": 9.402952194213867, + "logits/rejected": 11.37956428527832, + "logps/chosen": -236.03652954101562, + "logps/rejected": -253.1999053955078, + "loss": 0.5495, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.22523260116577148, + "rewards/margins": 0.5181406736373901, + "rewards/rejected": -0.29290807247161865, + "step": 3699 + }, + { + "epoch": 0.5722018171273923, + "grad_norm": 4.399021625518799, + "learning_rate": 4.495933096574637e-06, + "logits/chosen": 16.857341766357422, + "logits/rejected": 9.565850257873535, + "logps/chosen": -334.3639831542969, + "logps/rejected": -277.7099609375, + "loss": 0.4457, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.4729700982570648, + "rewards/margins": 0.6835820078849792, + "rewards/rejected": -0.21061192452907562, + "step": 3700 + }, + { + "epoch": 0.5723564662671564, + "grad_norm": 5.277714729309082, + "learning_rate": 4.495646694924963e-06, + "logits/chosen": 13.100316047668457, + "logits/rejected": 9.61971664428711, + "logps/chosen": -316.03271484375, + "logps/rejected": -217.42532348632812, + "loss": 0.6467, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.3280617296695709, + "rewards/margins": 0.3129432201385498, + "rewards/rejected": 0.015118509531021118, + "step": 3701 + }, + { + "epoch": 0.5725111154069206, + "grad_norm": 5.60197639465332, + "learning_rate": 4.49536029327529e-06, + "logits/chosen": 15.08932113647461, + "logits/rejected": 11.757494926452637, + "logps/chosen": -355.3668212890625, + "logps/rejected": -281.8846130371094, + "loss": 0.4995, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.0007755272090435028, + "rewards/margins": 0.4936409592628479, + "rewards/rejected": -0.4928654730319977, + "step": 3702 + }, + { + "epoch": 0.5726657645466847, + "grad_norm": 7.01350736618042, + "learning_rate": 4.495073891625616e-06, + "logits/chosen": 5.314853668212891, + "logits/rejected": 1.3873262405395508, + "logps/chosen": -392.3323974609375, + "logps/rejected": -270.7787780761719, + "loss": 0.7477, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.0016715079545974731, + "rewards/margins": 0.015269089490175247, + "rewards/rejected": -0.016940593719482422, + "step": 3703 + }, + { + "epoch": 0.5728204136864489, + "grad_norm": 7.185494422912598, + "learning_rate": 4.4947874899759424e-06, + "logits/chosen": 5.782631874084473, + "logits/rejected": -0.3535642623901367, + "logps/chosen": -387.6290283203125, + "logps/rejected": -327.96417236328125, + "loss": 0.4869, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.3660643994808197, + "rewards/margins": 0.526902973651886, + "rewards/rejected": -0.16083863377571106, + "step": 3704 + }, + { + "epoch": 0.572975062826213, + "grad_norm": 5.6459784507751465, + "learning_rate": 4.494501088326269e-06, + "logits/chosen": 13.248144149780273, + "logits/rejected": 7.682799816131592, + "logps/chosen": -317.3186950683594, + "logps/rejected": -252.63990783691406, + "loss": 0.704, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.3220228850841522, + "rewards/margins": 0.01689395308494568, + "rewards/rejected": 0.3051289916038513, + "step": 3705 + }, + { + "epoch": 0.5731297119659772, + "grad_norm": 6.846540927886963, + "learning_rate": 4.494214686676596e-06, + "logits/chosen": 8.376680374145508, + "logits/rejected": 6.260311126708984, + "logps/chosen": -334.760009765625, + "logps/rejected": -254.99815368652344, + "loss": 0.7095, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.1150674819946289, + "rewards/margins": -0.015519723296165466, + "rewards/rejected": 0.13058719038963318, + "step": 3706 + }, + { + "epoch": 0.5732843611057413, + "grad_norm": 5.605840682983398, + "learning_rate": 4.493928285026922e-06, + "logits/chosen": 5.8190016746521, + "logits/rejected": 4.096757888793945, + "logps/chosen": -274.852783203125, + "logps/rejected": -305.9874572753906, + "loss": 0.6153, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.42660802602767944, + "rewards/margins": 0.2664538025856018, + "rewards/rejected": 0.16015425324440002, + "step": 3707 + }, + { + "epoch": 0.5734390102455055, + "grad_norm": 4.321354389190674, + "learning_rate": 4.493641883377249e-06, + "logits/chosen": 9.291486740112305, + "logits/rejected": 7.191115379333496, + "logps/chosen": -206.15481567382812, + "logps/rejected": -164.13954162597656, + "loss": 0.7039, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.11247245222330093, + "rewards/margins": 0.2309592366218567, + "rewards/rejected": -0.11848677694797516, + "step": 3708 + }, + { + "epoch": 0.5735936593852696, + "grad_norm": 5.628379821777344, + "learning_rate": 4.493355481727575e-06, + "logits/chosen": 3.79827880859375, + "logits/rejected": 2.976776123046875, + "logps/chosen": -384.87994384765625, + "logps/rejected": -256.0940856933594, + "loss": 0.5315, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.39725008606910706, + "rewards/margins": 0.4684295952320099, + "rewards/rejected": -0.07117953151464462, + "step": 3709 + }, + { + "epoch": 0.5737483085250338, + "grad_norm": 7.524038791656494, + "learning_rate": 4.4930690800779015e-06, + "logits/chosen": 7.303856372833252, + "logits/rejected": 6.873887538909912, + "logps/chosen": -247.5548095703125, + "logps/rejected": -228.5362548828125, + "loss": 0.744, + "rewards/accuracies": 0.25, + "rewards/chosen": -0.14499793946743011, + "rewards/margins": 0.041953109204769135, + "rewards/rejected": -0.18695104122161865, + "step": 3710 + }, + { + "epoch": 0.573902957664798, + "grad_norm": 5.400707721710205, + "learning_rate": 4.492782678428228e-06, + "logits/chosen": 8.069753646850586, + "logits/rejected": 2.2130513191223145, + "logps/chosen": -360.14459228515625, + "logps/rejected": -180.12789916992188, + "loss": 0.6999, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.20198871195316315, + "rewards/margins": 0.34531301259994507, + "rewards/rejected": -0.14332430064678192, + "step": 3711 + }, + { + "epoch": 0.5740576068045622, + "grad_norm": 6.845179557800293, + "learning_rate": 4.492496276778555e-06, + "logits/chosen": 9.691807746887207, + "logits/rejected": 2.09788179397583, + "logps/chosen": -460.1844482421875, + "logps/rejected": -309.6032409667969, + "loss": 0.5093, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.779831051826477, + "rewards/margins": 0.4788621962070465, + "rewards/rejected": 0.3009689450263977, + "step": 3712 + }, + { + "epoch": 0.5742122559443263, + "grad_norm": 6.296635150909424, + "learning_rate": 4.4922098751288814e-06, + "logits/chosen": 12.056804656982422, + "logits/rejected": 7.376004219055176, + "logps/chosen": -336.9814147949219, + "logps/rejected": -240.58607482910156, + "loss": 0.6617, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.003088429570198059, + "rewards/margins": 0.14123979210853577, + "rewards/rejected": -0.1381513476371765, + "step": 3713 + }, + { + "epoch": 0.5743669050840905, + "grad_norm": 4.655721664428711, + "learning_rate": 4.491923473479208e-06, + "logits/chosen": 9.70938491821289, + "logits/rejected": -1.8758000135421753, + "logps/chosen": -408.7356262207031, + "logps/rejected": -248.5621337890625, + "loss": 0.5552, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.21725772321224213, + "rewards/margins": 0.7408715486526489, + "rewards/rejected": -0.5236138105392456, + "step": 3714 + }, + { + "epoch": 0.5745215542238546, + "grad_norm": 4.45045804977417, + "learning_rate": 4.491637071829535e-06, + "logits/chosen": 14.846986770629883, + "logits/rejected": 10.147533416748047, + "logps/chosen": -260.1597900390625, + "logps/rejected": -213.46693420410156, + "loss": 0.5569, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.24194619059562683, + "rewards/margins": 0.41860532760620117, + "rewards/rejected": -0.17665912210941315, + "step": 3715 + }, + { + "epoch": 0.5746762033636188, + "grad_norm": 6.709345817565918, + "learning_rate": 4.4913506701798606e-06, + "logits/chosen": 8.405929565429688, + "logits/rejected": 9.518918991088867, + "logps/chosen": -368.247802734375, + "logps/rejected": -314.67413330078125, + "loss": 0.7618, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.18052148818969727, + "rewards/margins": -0.07351011037826538, + "rewards/rejected": 0.25403159856796265, + "step": 3716 + }, + { + "epoch": 0.574830852503383, + "grad_norm": 3.5707855224609375, + "learning_rate": 4.491064268530187e-06, + "logits/chosen": 7.9638237953186035, + "logits/rejected": 7.77506160736084, + "logps/chosen": -241.71742248535156, + "logps/rejected": -174.21189880371094, + "loss": 0.5539, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.4991726279258728, + "rewards/margins": 0.35444897413253784, + "rewards/rejected": 0.14472365379333496, + "step": 3717 + }, + { + "epoch": 0.5749855016431471, + "grad_norm": 6.123157024383545, + "learning_rate": 4.490777866880514e-06, + "logits/chosen": 9.426716804504395, + "logits/rejected": -0.6902797222137451, + "logps/chosen": -340.7825012207031, + "logps/rejected": -246.65695190429688, + "loss": 0.5228, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.03857307881116867, + "rewards/margins": 0.42529231309890747, + "rewards/rejected": -0.46386539936065674, + "step": 3718 + }, + { + "epoch": 0.5751401507829113, + "grad_norm": 5.5533366203308105, + "learning_rate": 4.4904914652308405e-06, + "logits/chosen": 11.394113540649414, + "logits/rejected": 14.207951545715332, + "logps/chosen": -247.14279174804688, + "logps/rejected": -282.165771484375, + "loss": 0.5418, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.17103368043899536, + "rewards/margins": 0.5050920844078064, + "rewards/rejected": -0.33405840396881104, + "step": 3719 + }, + { + "epoch": 0.5752947999226754, + "grad_norm": 3.6601967811584473, + "learning_rate": 4.490205063581166e-06, + "logits/chosen": 8.682501792907715, + "logits/rejected": 7.988053798675537, + "logps/chosen": -402.9241638183594, + "logps/rejected": -351.97186279296875, + "loss": 0.4802, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.57405686378479, + "rewards/margins": 0.6669309139251709, + "rewards/rejected": -0.09287404268980026, + "step": 3720 + }, + { + "epoch": 0.5754494490624396, + "grad_norm": 3.6674857139587402, + "learning_rate": 4.489918661931493e-06, + "logits/chosen": 15.722311019897461, + "logits/rejected": 3.220036029815674, + "logps/chosen": -285.2763977050781, + "logps/rejected": -180.33151245117188, + "loss": 0.4644, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.16027727723121643, + "rewards/margins": 0.7090728282928467, + "rewards/rejected": -0.5487955212593079, + "step": 3721 + }, + { + "epoch": 0.5756040982022037, + "grad_norm": 6.872456073760986, + "learning_rate": 4.48963226028182e-06, + "logits/chosen": 13.623438835144043, + "logits/rejected": 9.795961380004883, + "logps/chosen": -243.22866821289062, + "logps/rejected": -176.3389434814453, + "loss": 0.681, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.16342535614967346, + "rewards/margins": 0.05227527767419815, + "rewards/rejected": 0.11115007102489471, + "step": 3722 + }, + { + "epoch": 0.5757587473419679, + "grad_norm": 6.244222164154053, + "learning_rate": 4.489345858632146e-06, + "logits/chosen": 8.075648307800293, + "logits/rejected": 9.600786209106445, + "logps/chosen": -282.38800048828125, + "logps/rejected": -272.2184753417969, + "loss": 0.8111, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.10409536957740784, + "rewards/margins": -0.1438450962305069, + "rewards/rejected": 0.24794045090675354, + "step": 3723 + }, + { + "epoch": 0.5759133964817321, + "grad_norm": 9.015183448791504, + "learning_rate": 4.489059456982472e-06, + "logits/chosen": 9.850502967834473, + "logits/rejected": 7.741997718811035, + "logps/chosen": -321.7296142578125, + "logps/rejected": -202.72525024414062, + "loss": 0.6638, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.16827788949012756, + "rewards/margins": 0.23673200607299805, + "rewards/rejected": -0.4050098657608032, + "step": 3724 + }, + { + "epoch": 0.5760680456214963, + "grad_norm": 5.416754245758057, + "learning_rate": 4.488773055332799e-06, + "logits/chosen": 13.911482810974121, + "logits/rejected": 8.012048721313477, + "logps/chosen": -278.4915466308594, + "logps/rejected": -213.6652374267578, + "loss": 0.7069, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.09226924180984497, + "rewards/margins": 0.08572528511285782, + "rewards/rejected": 0.006543941795825958, + "step": 3725 + }, + { + "epoch": 0.5762226947612604, + "grad_norm": 4.727205753326416, + "learning_rate": 4.488486653683125e-06, + "logits/chosen": 11.337738037109375, + "logits/rejected": 2.549610137939453, + "logps/chosen": -446.99798583984375, + "logps/rejected": -215.03564453125, + "loss": 0.4234, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.7282956838607788, + "rewards/margins": 0.8834214806556702, + "rewards/rejected": -0.15512579679489136, + "step": 3726 + }, + { + "epoch": 0.5763773439010246, + "grad_norm": 4.350319862365723, + "learning_rate": 4.488200252033452e-06, + "logits/chosen": 11.430807113647461, + "logits/rejected": 5.448462009429932, + "logps/chosen": -257.48455810546875, + "logps/rejected": -218.32249450683594, + "loss": 0.5524, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.06599608063697815, + "rewards/margins": 0.4484759569168091, + "rewards/rejected": -0.3824799060821533, + "step": 3727 + }, + { + "epoch": 0.5765319930407887, + "grad_norm": 4.239972114562988, + "learning_rate": 4.487913850383779e-06, + "logits/chosen": 12.200223922729492, + "logits/rejected": 8.198999404907227, + "logps/chosen": -281.66552734375, + "logps/rejected": -260.97723388671875, + "loss": 0.6026, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.13597224652767181, + "rewards/margins": 0.35886770486831665, + "rewards/rejected": -0.22289542853832245, + "step": 3728 + }, + { + "epoch": 0.5766866421805529, + "grad_norm": 4.722055435180664, + "learning_rate": 4.4876274487341045e-06, + "logits/chosen": 14.625142097473145, + "logits/rejected": 9.34661865234375, + "logps/chosen": -410.48968505859375, + "logps/rejected": -324.8926086425781, + "loss": 0.4963, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.551278293132782, + "rewards/margins": 0.4936273694038391, + "rewards/rejected": 0.057650938630104065, + "step": 3729 + }, + { + "epoch": 0.576841291320317, + "grad_norm": 5.584944725036621, + "learning_rate": 4.487341047084431e-06, + "logits/chosen": 11.08084487915039, + "logits/rejected": 8.703192710876465, + "logps/chosen": -342.2227783203125, + "logps/rejected": -335.8408203125, + "loss": 0.6348, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.30301690101623535, + "rewards/margins": 0.3336612284183502, + "rewards/rejected": -0.03064434602856636, + "step": 3730 + }, + { + "epoch": 0.5769959404600812, + "grad_norm": 4.056881427764893, + "learning_rate": 4.487054645434758e-06, + "logits/chosen": 11.559033393859863, + "logits/rejected": 5.392719268798828, + "logps/chosen": -325.2268371582031, + "logps/rejected": -213.72418212890625, + "loss": 0.5389, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.04463265836238861, + "rewards/margins": 0.41269466280937195, + "rewards/rejected": -0.36806201934814453, + "step": 3731 + }, + { + "epoch": 0.5771505895998453, + "grad_norm": 5.971040725708008, + "learning_rate": 4.486768243785084e-06, + "logits/chosen": 8.115890502929688, + "logits/rejected": 8.35416316986084, + "logps/chosen": -302.03509521484375, + "logps/rejected": -333.9668884277344, + "loss": 0.6932, + "rewards/accuracies": 0.25, + "rewards/chosen": -0.050124168395996094, + "rewards/margins": 0.0197637677192688, + "rewards/rejected": -0.06988793611526489, + "step": 3732 + }, + { + "epoch": 0.5773052387396095, + "grad_norm": 5.840587615966797, + "learning_rate": 4.486481842135411e-06, + "logits/chosen": 11.901101112365723, + "logits/rejected": 14.537363052368164, + "logps/chosen": -268.8872985839844, + "logps/rejected": -326.4714660644531, + "loss": 0.7482, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.04025077447295189, + "rewards/margins": 0.03637939691543579, + "rewards/rejected": -0.07663017511367798, + "step": 3733 + }, + { + "epoch": 0.5774598878793736, + "grad_norm": 5.733351230621338, + "learning_rate": 4.486195440485738e-06, + "logits/chosen": 9.068757057189941, + "logits/rejected": 4.97458028793335, + "logps/chosen": -232.80511474609375, + "logps/rejected": -195.374755859375, + "loss": 0.6016, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.12704233825206757, + "rewards/margins": 0.23217704892158508, + "rewards/rejected": -0.10513471812009811, + "step": 3734 + }, + { + "epoch": 0.5776145370191378, + "grad_norm": 5.579530715942383, + "learning_rate": 4.485909038836064e-06, + "logits/chosen": 10.892488479614258, + "logits/rejected": 6.22386360168457, + "logps/chosen": -290.3475341796875, + "logps/rejected": -221.51890563964844, + "loss": 0.7976, + "rewards/accuracies": 0.25, + "rewards/chosen": -0.3246510624885559, + "rewards/margins": -0.1318206787109375, + "rewards/rejected": -0.1928303837776184, + "step": 3735 + }, + { + "epoch": 0.5777691861589019, + "grad_norm": 5.380240440368652, + "learning_rate": 4.48562263718639e-06, + "logits/chosen": 10.766910552978516, + "logits/rejected": 7.608599662780762, + "logps/chosen": -323.02105712890625, + "logps/rejected": -266.8172607421875, + "loss": 0.6513, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.2053164541721344, + "rewards/margins": 0.22976070642471313, + "rewards/rejected": -0.024444222450256348, + "step": 3736 + }, + { + "epoch": 0.5779238352986662, + "grad_norm": 7.871776580810547, + "learning_rate": 4.485336235536717e-06, + "logits/chosen": 5.785160541534424, + "logits/rejected": 6.462968349456787, + "logps/chosen": -209.09873962402344, + "logps/rejected": -276.9587707519531, + "loss": 0.8479, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.05167999118566513, + "rewards/margins": -0.16479802131652832, + "rewards/rejected": 0.11311802268028259, + "step": 3737 + }, + { + "epoch": 0.5780784844384304, + "grad_norm": 5.114760398864746, + "learning_rate": 4.4850498338870435e-06, + "logits/chosen": 10.222497940063477, + "logits/rejected": 9.965426445007324, + "logps/chosen": -232.41343688964844, + "logps/rejected": -200.0277099609375, + "loss": 0.6488, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.09685802459716797, + "rewards/margins": 0.10919791460037231, + "rewards/rejected": -0.20605593919754028, + "step": 3738 + }, + { + "epoch": 0.5782331335781945, + "grad_norm": 3.8910679817199707, + "learning_rate": 4.48476343223737e-06, + "logits/chosen": 10.613103866577148, + "logits/rejected": 11.750024795532227, + "logps/chosen": -206.47462463378906, + "logps/rejected": -269.7124328613281, + "loss": 0.4296, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.36297935247421265, + "rewards/margins": 1.1942442655563354, + "rewards/rejected": -0.8312649130821228, + "step": 3739 + }, + { + "epoch": 0.5783877827179587, + "grad_norm": 4.667725563049316, + "learning_rate": 4.484477030587697e-06, + "logits/chosen": 12.33337116241455, + "logits/rejected": 5.116974830627441, + "logps/chosen": -253.1707763671875, + "logps/rejected": -221.9131317138672, + "loss": 0.5925, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.1445111334323883, + "rewards/margins": 0.2741219997406006, + "rewards/rejected": -0.12961086630821228, + "step": 3740 + }, + { + "epoch": 0.5785424318577228, + "grad_norm": 4.907107353210449, + "learning_rate": 4.4841906289380234e-06, + "logits/chosen": 11.9082670211792, + "logits/rejected": 11.544818878173828, + "logps/chosen": -309.4208679199219, + "logps/rejected": -302.5118408203125, + "loss": 0.5236, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.27777189016342163, + "rewards/margins": 0.47985750436782837, + "rewards/rejected": -0.20208558440208435, + "step": 3741 + }, + { + "epoch": 0.578697080997487, + "grad_norm": 4.337289333343506, + "learning_rate": 4.483904227288349e-06, + "logits/chosen": 10.786046981811523, + "logits/rejected": 3.8704121112823486, + "logps/chosen": -202.09494018554688, + "logps/rejected": -95.15509033203125, + "loss": 0.62, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.16829673945903778, + "rewards/margins": 0.32523277401924133, + "rewards/rejected": -0.4935295581817627, + "step": 3742 + }, + { + "epoch": 0.5788517301372511, + "grad_norm": 7.320075511932373, + "learning_rate": 4.483617825638676e-06, + "logits/chosen": 14.791908264160156, + "logits/rejected": 9.116199493408203, + "logps/chosen": -379.0672912597656, + "logps/rejected": -293.0577697753906, + "loss": 0.7252, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.2709035873413086, + "rewards/margins": 0.12911444902420044, + "rewards/rejected": -0.40001803636550903, + "step": 3743 + }, + { + "epoch": 0.5790063792770153, + "grad_norm": 4.902713775634766, + "learning_rate": 4.4833314239890025e-06, + "logits/chosen": 10.773898124694824, + "logits/rejected": 6.587568283081055, + "logps/chosen": -213.33270263671875, + "logps/rejected": -187.54014587402344, + "loss": 0.6313, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.18620261549949646, + "rewards/margins": 0.22033604979515076, + "rewards/rejected": -0.0341334342956543, + "step": 3744 + }, + { + "epoch": 0.5791610284167794, + "grad_norm": 9.9924898147583, + "learning_rate": 4.483045022339329e-06, + "logits/chosen": 8.371187210083008, + "logits/rejected": 5.008256435394287, + "logps/chosen": -263.6105651855469, + "logps/rejected": -189.58517456054688, + "loss": 0.8659, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.054033905267715454, + "rewards/margins": -0.22151866555213928, + "rewards/rejected": 0.16748476028442383, + "step": 3745 + }, + { + "epoch": 0.5793156775565436, + "grad_norm": 8.487750053405762, + "learning_rate": 4.482758620689656e-06, + "logits/chosen": 10.929519653320312, + "logits/rejected": 10.445944786071777, + "logps/chosen": -366.62847900390625, + "logps/rejected": -356.4198303222656, + "loss": 1.0023, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.17304527759552002, + "rewards/margins": -0.43341904878616333, + "rewards/rejected": 0.2603738009929657, + "step": 3746 + }, + { + "epoch": 0.5794703266963077, + "grad_norm": 5.879138469696045, + "learning_rate": 4.4824722190399825e-06, + "logits/chosen": 13.399274826049805, + "logits/rejected": 7.804872989654541, + "logps/chosen": -192.129638671875, + "logps/rejected": -158.424560546875, + "loss": 0.7957, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.07273131608963013, + "rewards/margins": -0.1360088288784027, + "rewards/rejected": 0.06327749788761139, + "step": 3747 + }, + { + "epoch": 0.5796249758360719, + "grad_norm": 6.219170093536377, + "learning_rate": 4.482185817390309e-06, + "logits/chosen": 8.170676231384277, + "logits/rejected": 7.780335426330566, + "logps/chosen": -233.9054718017578, + "logps/rejected": -189.16046142578125, + "loss": 0.8364, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.21995553374290466, + "rewards/margins": -0.21628113090991974, + "rewards/rejected": -0.0036744065582752228, + "step": 3748 + }, + { + "epoch": 0.579779624975836, + "grad_norm": 5.667336940765381, + "learning_rate": 4.481899415740635e-06, + "logits/chosen": 12.209220886230469, + "logits/rejected": 8.255184173583984, + "logps/chosen": -360.5721435546875, + "logps/rejected": -305.6612548828125, + "loss": 0.6736, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.27346983551979065, + "rewards/margins": 0.250363826751709, + "rewards/rejected": 0.023106008768081665, + "step": 3749 + }, + { + "epoch": 0.5799342741156003, + "grad_norm": 4.742862224578857, + "learning_rate": 4.481613014090962e-06, + "logits/chosen": 17.49440574645996, + "logits/rejected": 14.298995018005371, + "logps/chosen": -370.58538818359375, + "logps/rejected": -371.48577880859375, + "loss": 0.4384, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.32053032517433167, + "rewards/margins": 0.6644891500473022, + "rewards/rejected": -0.3439588248729706, + "step": 3750 + }, + { + "epoch": 0.5800889232553644, + "grad_norm": 4.997811794281006, + "learning_rate": 4.481326612441288e-06, + "logits/chosen": 10.554903030395508, + "logits/rejected": 7.099453449249268, + "logps/chosen": -226.81005859375, + "logps/rejected": -230.79368591308594, + "loss": 0.5953, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.10634604096412659, + "rewards/margins": 0.24701035022735596, + "rewards/rejected": -0.35335636138916016, + "step": 3751 + }, + { + "epoch": 0.5802435723951286, + "grad_norm": 9.690844535827637, + "learning_rate": 4.481040210791615e-06, + "logits/chosen": 10.978182792663574, + "logits/rejected": 10.606393814086914, + "logps/chosen": -432.098876953125, + "logps/rejected": -440.06005859375, + "loss": 0.8279, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.3529754877090454, + "rewards/margins": 0.04951745271682739, + "rewards/rejected": 0.303458034992218, + "step": 3752 + }, + { + "epoch": 0.5803982215348927, + "grad_norm": 10.848416328430176, + "learning_rate": 4.4807538091419415e-06, + "logits/chosen": 8.261761665344238, + "logits/rejected": 7.789950370788574, + "logps/chosen": -261.759765625, + "logps/rejected": -262.27972412109375, + "loss": 1.0525, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.07930061221122742, + "rewards/margins": -0.5074067711830139, + "rewards/rejected": 0.4281061887741089, + "step": 3753 + }, + { + "epoch": 0.5805528706746569, + "grad_norm": 7.323809623718262, + "learning_rate": 4.480467407492267e-06, + "logits/chosen": 7.141221523284912, + "logits/rejected": 4.714208126068115, + "logps/chosen": -362.9966125488281, + "logps/rejected": -224.17605590820312, + "loss": 0.6952, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.03145284950733185, + "rewards/margins": 0.23262004554271698, + "rewards/rejected": -0.20116716623306274, + "step": 3754 + }, + { + "epoch": 0.580707519814421, + "grad_norm": 4.843491077423096, + "learning_rate": 4.480181005842594e-06, + "logits/chosen": 9.270769119262695, + "logits/rejected": 9.21450424194336, + "logps/chosen": -218.399658203125, + "logps/rejected": -258.3753662109375, + "loss": 0.5349, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.3864266574382782, + "rewards/margins": 0.46174392104148865, + "rewards/rejected": -0.07531729340553284, + "step": 3755 + }, + { + "epoch": 0.5808621689541852, + "grad_norm": 7.27138090133667, + "learning_rate": 4.479894604192921e-06, + "logits/chosen": 12.108461380004883, + "logits/rejected": 8.761161804199219, + "logps/chosen": -421.70733642578125, + "logps/rejected": -365.26702880859375, + "loss": 0.6395, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.237372487783432, + "rewards/margins": 0.16200634837150574, + "rewards/rejected": 0.07536615431308746, + "step": 3756 + }, + { + "epoch": 0.5810168180939493, + "grad_norm": 4.791899681091309, + "learning_rate": 4.479608202543247e-06, + "logits/chosen": 9.259770393371582, + "logits/rejected": 4.559820652008057, + "logps/chosen": -258.1905822753906, + "logps/rejected": -188.4403076171875, + "loss": 0.6288, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.16299495100975037, + "rewards/margins": 0.2914098799228668, + "rewards/rejected": -0.12841489911079407, + "step": 3757 + }, + { + "epoch": 0.5811714672337135, + "grad_norm": 8.089067459106445, + "learning_rate": 4.479321800893573e-06, + "logits/chosen": 12.341484069824219, + "logits/rejected": 6.69716215133667, + "logps/chosen": -378.91436767578125, + "logps/rejected": -270.20867919921875, + "loss": 0.5691, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.38474541902542114, + "rewards/margins": 0.44884008169174194, + "rewards/rejected": -0.06409463286399841, + "step": 3758 + }, + { + "epoch": 0.5813261163734776, + "grad_norm": 6.568228244781494, + "learning_rate": 4.4790353992439e-06, + "logits/chosen": 7.807243347167969, + "logits/rejected": 7.417841911315918, + "logps/chosen": -304.3643798828125, + "logps/rejected": -383.8334655761719, + "loss": 0.7749, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.2984338402748108, + "rewards/margins": 0.15377725660800934, + "rewards/rejected": 0.14465655386447906, + "step": 3759 + }, + { + "epoch": 0.5814807655132418, + "grad_norm": 3.6757256984710693, + "learning_rate": 4.478748997594226e-06, + "logits/chosen": 12.442808151245117, + "logits/rejected": 6.566100120544434, + "logps/chosen": -337.8768005371094, + "logps/rejected": -298.24859619140625, + "loss": 0.4441, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.3336055874824524, + "rewards/margins": 0.7352687120437622, + "rewards/rejected": -0.4016631245613098, + "step": 3760 + }, + { + "epoch": 0.5816354146530059, + "grad_norm": 3.560396194458008, + "learning_rate": 4.478462595944553e-06, + "logits/chosen": 8.497380256652832, + "logits/rejected": 7.978403091430664, + "logps/chosen": -281.2266845703125, + "logps/rejected": -273.2942199707031, + "loss": 0.3832, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.4074232280254364, + "rewards/margins": 0.905182957649231, + "rewards/rejected": -0.4977598190307617, + "step": 3761 + }, + { + "epoch": 0.5817900637927702, + "grad_norm": 3.164163589477539, + "learning_rate": 4.478176194294879e-06, + "logits/chosen": 11.702451705932617, + "logits/rejected": 5.535411834716797, + "logps/chosen": -194.426513671875, + "logps/rejected": -101.06061553955078, + "loss": 0.5323, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.22941532731056213, + "rewards/margins": 0.46564823389053345, + "rewards/rejected": -0.2362329214811325, + "step": 3762 + }, + { + "epoch": 0.5819447129325344, + "grad_norm": 6.195248603820801, + "learning_rate": 4.4778897926452055e-06, + "logits/chosen": 10.390745162963867, + "logits/rejected": 12.758419036865234, + "logps/chosen": -242.35296630859375, + "logps/rejected": -385.4520568847656, + "loss": 0.7732, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.24415001273155212, + "rewards/margins": -0.10610131919384003, + "rewards/rejected": 0.35025131702423096, + "step": 3763 + }, + { + "epoch": 0.5820993620722985, + "grad_norm": 5.916524887084961, + "learning_rate": 4.477603390995532e-06, + "logits/chosen": 8.734375953674316, + "logits/rejected": 6.744244575500488, + "logps/chosen": -321.8634338378906, + "logps/rejected": -327.76751708984375, + "loss": 0.6345, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.3101542294025421, + "rewards/margins": 0.22043971717357635, + "rewards/rejected": 0.08971453458070755, + "step": 3764 + }, + { + "epoch": 0.5822540112120627, + "grad_norm": 6.159871578216553, + "learning_rate": 4.477316989345859e-06, + "logits/chosen": 9.64154052734375, + "logits/rejected": 2.321913719177246, + "logps/chosen": -248.2386016845703, + "logps/rejected": -207.50784301757812, + "loss": 0.6382, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.06591348350048065, + "rewards/margins": 0.24014215171337128, + "rewards/rejected": -0.17422866821289062, + "step": 3765 + }, + { + "epoch": 0.5824086603518268, + "grad_norm": 8.377087593078613, + "learning_rate": 4.4770305876961855e-06, + "logits/chosen": 5.4512715339660645, + "logits/rejected": 6.379087924957275, + "logps/chosen": -247.5706024169922, + "logps/rejected": -344.85137939453125, + "loss": 0.7427, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.440658837556839, + "rewards/margins": -0.02797914296388626, + "rewards/rejected": 0.46863800287246704, + "step": 3766 + }, + { + "epoch": 0.582563309491591, + "grad_norm": 4.889064311981201, + "learning_rate": 4.476744186046512e-06, + "logits/chosen": 7.883516311645508, + "logits/rejected": 7.559130668640137, + "logps/chosen": -213.40573120117188, + "logps/rejected": -211.27577209472656, + "loss": 0.6861, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.0546211302280426, + "rewards/margins": 0.021650459617376328, + "rewards/rejected": -0.07627158612012863, + "step": 3767 + }, + { + "epoch": 0.5827179586313551, + "grad_norm": 7.313686847686768, + "learning_rate": 4.476457784396839e-06, + "logits/chosen": 5.974917411804199, + "logits/rejected": 7.605887413024902, + "logps/chosen": -303.02294921875, + "logps/rejected": -319.54815673828125, + "loss": 0.6538, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.21846377849578857, + "rewards/margins": 0.20421268045902252, + "rewards/rejected": 0.014251090586185455, + "step": 3768 + }, + { + "epoch": 0.5828726077711193, + "grad_norm": 5.9766926765441895, + "learning_rate": 4.4761713827471646e-06, + "logits/chosen": 10.101509094238281, + "logits/rejected": 12.571516036987305, + "logps/chosen": -308.71331787109375, + "logps/rejected": -300.5577697753906, + "loss": 0.7918, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.27153289318084717, + "rewards/margins": -0.06378117203712463, + "rewards/rejected": 0.3353140950202942, + "step": 3769 + }, + { + "epoch": 0.5830272569108834, + "grad_norm": 4.380134105682373, + "learning_rate": 4.475884981097491e-06, + "logits/chosen": 5.582114219665527, + "logits/rejected": 2.132516860961914, + "logps/chosen": -322.6517333984375, + "logps/rejected": -232.29786682128906, + "loss": 0.6985, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.2889731526374817, + "rewards/margins": 0.10297083109617233, + "rewards/rejected": 0.18600231409072876, + "step": 3770 + }, + { + "epoch": 0.5831819060506476, + "grad_norm": 7.89937686920166, + "learning_rate": 4.475598579447818e-06, + "logits/chosen": 11.376214981079102, + "logits/rejected": 10.147675514221191, + "logps/chosen": -290.3760070800781, + "logps/rejected": -192.64199829101562, + "loss": 0.5719, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.3364810049533844, + "rewards/margins": 0.45987826585769653, + "rewards/rejected": -0.7963593006134033, + "step": 3771 + }, + { + "epoch": 0.5833365551904117, + "grad_norm": 4.555075168609619, + "learning_rate": 4.4753121777981445e-06, + "logits/chosen": 4.479366302490234, + "logits/rejected": 3.993962287902832, + "logps/chosen": -182.8438720703125, + "logps/rejected": -271.4689636230469, + "loss": 0.4553, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.06888549774885178, + "rewards/margins": 0.7116819620132446, + "rewards/rejected": -0.7805674076080322, + "step": 3772 + }, + { + "epoch": 0.5834912043301759, + "grad_norm": 5.773561000823975, + "learning_rate": 4.475025776148471e-06, + "logits/chosen": 13.352177619934082, + "logits/rejected": 6.237696647644043, + "logps/chosen": -187.14210510253906, + "logps/rejected": -167.95602416992188, + "loss": 0.4966, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.27232131361961365, + "rewards/margins": 0.48372483253479004, + "rewards/rejected": -0.2114035189151764, + "step": 3773 + }, + { + "epoch": 0.58364585346994, + "grad_norm": 4.6555681228637695, + "learning_rate": 4.474739374498798e-06, + "logits/chosen": 10.552513122558594, + "logits/rejected": 11.679862022399902, + "logps/chosen": -240.0597381591797, + "logps/rejected": -232.78564453125, + "loss": 0.6121, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.10497722774744034, + "rewards/margins": 0.2214737981557846, + "rewards/rejected": -0.11649655550718307, + "step": 3774 + }, + { + "epoch": 0.5838005026097043, + "grad_norm": 5.564403533935547, + "learning_rate": 4.474452972849124e-06, + "logits/chosen": 12.715398788452148, + "logits/rejected": 7.832818031311035, + "logps/chosen": -463.7010803222656, + "logps/rejected": -375.20025634765625, + "loss": 0.4966, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.4750397205352783, + "rewards/margins": 0.6359333395957947, + "rewards/rejected": -0.16089363396167755, + "step": 3775 + }, + { + "epoch": 0.5839551517494684, + "grad_norm": 4.329071044921875, + "learning_rate": 4.47416657119945e-06, + "logits/chosen": 9.388490676879883, + "logits/rejected": 10.300674438476562, + "logps/chosen": -183.52597045898438, + "logps/rejected": -174.80609130859375, + "loss": 0.6342, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.3162219524383545, + "rewards/margins": 0.1883973777294159, + "rewards/rejected": 0.1278245598077774, + "step": 3776 + }, + { + "epoch": 0.5841098008892326, + "grad_norm": 5.97024393081665, + "learning_rate": 4.473880169549777e-06, + "logits/chosen": 16.1877384185791, + "logits/rejected": 9.781013488769531, + "logps/chosen": -446.768798828125, + "logps/rejected": -406.63165283203125, + "loss": 0.5806, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.7192550897598267, + "rewards/margins": 0.36904335021972656, + "rewards/rejected": 0.3502117097377777, + "step": 3777 + }, + { + "epoch": 0.5842644500289967, + "grad_norm": 6.44588041305542, + "learning_rate": 4.4735937679001036e-06, + "logits/chosen": 13.135551452636719, + "logits/rejected": 10.18069839477539, + "logps/chosen": -365.21466064453125, + "logps/rejected": -322.7725524902344, + "loss": 0.6919, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.38954561948776245, + "rewards/margins": 0.11762365698814392, + "rewards/rejected": 0.2719219923019409, + "step": 3778 + }, + { + "epoch": 0.5844190991687609, + "grad_norm": 4.911144256591797, + "learning_rate": 4.47330736625043e-06, + "logits/chosen": 14.770395278930664, + "logits/rejected": 10.70228385925293, + "logps/chosen": -264.84332275390625, + "logps/rejected": -250.11279296875, + "loss": 0.6753, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.2633327841758728, + "rewards/margins": 0.1370125114917755, + "rewards/rejected": -0.4003453254699707, + "step": 3779 + }, + { + "epoch": 0.584573748308525, + "grad_norm": 6.50197696685791, + "learning_rate": 4.473020964600757e-06, + "logits/chosen": 13.485575675964355, + "logits/rejected": 8.607733726501465, + "logps/chosen": -273.1490478515625, + "logps/rejected": -188.75436401367188, + "loss": 0.727, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.0367947518825531, + "rewards/margins": -0.027381855994462967, + "rewards/rejected": 0.06417660415172577, + "step": 3780 + }, + { + "epoch": 0.5847283974482892, + "grad_norm": 10.383441925048828, + "learning_rate": 4.4727345629510835e-06, + "logits/chosen": 9.297343254089355, + "logits/rejected": 5.5316362380981445, + "logps/chosen": -531.0068359375, + "logps/rejected": -336.4123840332031, + "loss": 0.7295, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.18946370482444763, + "rewards/margins": 0.010195180773735046, + "rewards/rejected": 0.17926853895187378, + "step": 3781 + }, + { + "epoch": 0.5848830465880533, + "grad_norm": 5.7821760177612305, + "learning_rate": 4.472448161301409e-06, + "logits/chosen": 12.578073501586914, + "logits/rejected": 14.491165161132812, + "logps/chosen": -263.8685607910156, + "logps/rejected": -294.4859313964844, + "loss": 0.6188, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.5995829105377197, + "rewards/margins": 0.3486010432243347, + "rewards/rejected": 0.2509818971157074, + "step": 3782 + }, + { + "epoch": 0.5850376957278175, + "grad_norm": 4.529803276062012, + "learning_rate": 4.472161759651736e-06, + "logits/chosen": 12.318668365478516, + "logits/rejected": 1.136120319366455, + "logps/chosen": -329.66497802734375, + "logps/rejected": -239.211669921875, + "loss": 0.4283, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.2693607211112976, + "rewards/margins": 0.940729022026062, + "rewards/rejected": -0.6713683605194092, + "step": 3783 + }, + { + "epoch": 0.5851923448675816, + "grad_norm": 6.902698516845703, + "learning_rate": 4.471875358002063e-06, + "logits/chosen": 6.333864212036133, + "logits/rejected": 6.201364994049072, + "logps/chosen": -252.17929077148438, + "logps/rejected": -338.15081787109375, + "loss": 0.7896, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.407183438539505, + "rewards/margins": 0.04521150141954422, + "rewards/rejected": 0.361971914768219, + "step": 3784 + }, + { + "epoch": 0.5853469940073458, + "grad_norm": 6.2179646492004395, + "learning_rate": 4.471588956352389e-06, + "logits/chosen": 12.140695571899414, + "logits/rejected": 13.30762004852295, + "logps/chosen": -187.5406494140625, + "logps/rejected": -265.6317443847656, + "loss": 0.6663, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.10119519382715225, + "rewards/margins": 0.11078186333179474, + "rewards/rejected": -0.2119770646095276, + "step": 3785 + }, + { + "epoch": 0.58550164314711, + "grad_norm": 4.245568752288818, + "learning_rate": 4.471302554702716e-06, + "logits/chosen": 15.897817611694336, + "logits/rejected": 5.275870323181152, + "logps/chosen": -370.5501708984375, + "logps/rejected": -225.25595092773438, + "loss": 0.4457, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.5728532671928406, + "rewards/margins": 0.736522912979126, + "rewards/rejected": -0.16366958618164062, + "step": 3786 + }, + { + "epoch": 0.5856562922868741, + "grad_norm": 5.77073335647583, + "learning_rate": 4.4710161530530426e-06, + "logits/chosen": 12.041760444641113, + "logits/rejected": 10.53640079498291, + "logps/chosen": -279.2558288574219, + "logps/rejected": -321.2295227050781, + "loss": 0.5384, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.10754089057445526, + "rewards/margins": 0.43868744373321533, + "rewards/rejected": -0.546228289604187, + "step": 3787 + }, + { + "epoch": 0.5858109414266384, + "grad_norm": 3.7641026973724365, + "learning_rate": 4.470729751403368e-06, + "logits/chosen": 4.199711799621582, + "logits/rejected": 4.029529571533203, + "logps/chosen": -171.558349609375, + "logps/rejected": -180.0418701171875, + "loss": 0.6667, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.1341724991798401, + "rewards/margins": 0.1828451305627823, + "rewards/rejected": -0.048672646284103394, + "step": 3788 + }, + { + "epoch": 0.5859655905664025, + "grad_norm": 6.540365219116211, + "learning_rate": 4.470443349753695e-06, + "logits/chosen": 4.306085109710693, + "logits/rejected": 8.175307273864746, + "logps/chosen": -160.38243103027344, + "logps/rejected": -202.55935668945312, + "loss": 0.813, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.04820629209280014, + "rewards/margins": -0.1791524887084961, + "rewards/rejected": 0.13094620406627655, + "step": 3789 + }, + { + "epoch": 0.5861202397061667, + "grad_norm": 3.2858846187591553, + "learning_rate": 4.470156948104022e-06, + "logits/chosen": 11.635862350463867, + "logits/rejected": 2.032432794570923, + "logps/chosen": -256.81658935546875, + "logps/rejected": -178.1287078857422, + "loss": 0.4585, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.881072998046875, + "rewards/margins": 0.6806513071060181, + "rewards/rejected": 0.2004217654466629, + "step": 3790 + }, + { + "epoch": 0.5862748888459308, + "grad_norm": 6.301393985748291, + "learning_rate": 4.469870546454348e-06, + "logits/chosen": 9.518982887268066, + "logits/rejected": 5.72247314453125, + "logps/chosen": -404.8212585449219, + "logps/rejected": -392.3601989746094, + "loss": 0.5675, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.5476022958755493, + "rewards/margins": 0.3079988360404968, + "rewards/rejected": 0.2396034300327301, + "step": 3791 + }, + { + "epoch": 0.586429537985695, + "grad_norm": 4.827402591705322, + "learning_rate": 4.469584144804674e-06, + "logits/chosen": 8.841787338256836, + "logits/rejected": 5.73464298248291, + "logps/chosen": -217.51168823242188, + "logps/rejected": -181.70196533203125, + "loss": 0.6022, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.2334670126438141, + "rewards/margins": 0.2845512628555298, + "rewards/rejected": -0.051084235310554504, + "step": 3792 + }, + { + "epoch": 0.5865841871254591, + "grad_norm": 6.105525493621826, + "learning_rate": 4.469297743155001e-06, + "logits/chosen": 7.163834095001221, + "logits/rejected": 9.782182693481445, + "logps/chosen": -364.913818359375, + "logps/rejected": -337.16357421875, + "loss": 0.6958, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.33818262815475464, + "rewards/margins": 0.11036074161529541, + "rewards/rejected": 0.227821946144104, + "step": 3793 + }, + { + "epoch": 0.5867388362652233, + "grad_norm": 5.327899932861328, + "learning_rate": 4.4690113415053274e-06, + "logits/chosen": 15.233545303344727, + "logits/rejected": 9.916312217712402, + "logps/chosen": -309.1690368652344, + "logps/rejected": -214.64369201660156, + "loss": 0.6173, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.49004632234573364, + "rewards/margins": 0.28557318449020386, + "rewards/rejected": 0.2044731229543686, + "step": 3794 + }, + { + "epoch": 0.5868934854049874, + "grad_norm": 3.567430257797241, + "learning_rate": 4.468724939855654e-06, + "logits/chosen": 16.124679565429688, + "logits/rejected": 9.374916076660156, + "logps/chosen": -317.8848571777344, + "logps/rejected": -225.60467529296875, + "loss": 0.4853, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.45219936966896057, + "rewards/margins": 0.5519326329231262, + "rewards/rejected": -0.09973322600126266, + "step": 3795 + }, + { + "epoch": 0.5870481345447516, + "grad_norm": 4.013065338134766, + "learning_rate": 4.46843853820598e-06, + "logits/chosen": 11.811205863952637, + "logits/rejected": 5.234990119934082, + "logps/chosen": -328.9539489746094, + "logps/rejected": -247.79653930664062, + "loss": 0.4497, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.5740846395492554, + "rewards/margins": 0.704576849937439, + "rewards/rejected": -0.13049226999282837, + "step": 3796 + }, + { + "epoch": 0.5872027836845157, + "grad_norm": 4.756516933441162, + "learning_rate": 4.4681521365563065e-06, + "logits/chosen": 11.040724754333496, + "logits/rejected": 6.3991498947143555, + "logps/chosen": -275.6792297363281, + "logps/rejected": -298.7868347167969, + "loss": 0.4897, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.24100293219089508, + "rewards/margins": 0.5949441194534302, + "rewards/rejected": -0.3539411425590515, + "step": 3797 + }, + { + "epoch": 0.5873574328242799, + "grad_norm": 5.730184555053711, + "learning_rate": 4.467865734906633e-06, + "logits/chosen": 6.317093849182129, + "logits/rejected": 10.418078422546387, + "logps/chosen": -256.61669921875, + "logps/rejected": -244.69720458984375, + "loss": 0.738, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.5190300941467285, + "rewards/margins": 0.09673692286014557, + "rewards/rejected": 0.42229318618774414, + "step": 3798 + }, + { + "epoch": 0.587512081964044, + "grad_norm": 7.569241523742676, + "learning_rate": 4.46757933325696e-06, + "logits/chosen": 9.801020622253418, + "logits/rejected": 14.157557487487793, + "logps/chosen": -366.411865234375, + "logps/rejected": -429.7743835449219, + "loss": 0.762, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.0666530579328537, + "rewards/margins": -0.024210747331380844, + "rewards/rejected": -0.04244232177734375, + "step": 3799 + }, + { + "epoch": 0.5876667311038082, + "grad_norm": 4.484637260437012, + "learning_rate": 4.4672929316072865e-06, + "logits/chosen": 4.176270484924316, + "logits/rejected": 2.5174739360809326, + "logps/chosen": -150.77369689941406, + "logps/rejected": -133.33351135253906, + "loss": 0.7546, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.23026953637599945, + "rewards/margins": -0.06017979979515076, + "rewards/rejected": -0.1700897216796875, + "step": 3800 + }, + { + "epoch": 0.5878213802435724, + "grad_norm": 7.265353679656982, + "learning_rate": 4.467006529957613e-06, + "logits/chosen": 4.399864196777344, + "logits/rejected": 1.9547100067138672, + "logps/chosen": -269.5384521484375, + "logps/rejected": -210.87155151367188, + "loss": 0.7618, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.06810353696346283, + "rewards/margins": -0.034628890454769135, + "rewards/rejected": -0.0334746390581131, + "step": 3801 + }, + { + "epoch": 0.5879760293833366, + "grad_norm": 5.543549060821533, + "learning_rate": 4.466720128307939e-06, + "logits/chosen": 3.6068787574768066, + "logits/rejected": 11.251604080200195, + "logps/chosen": -118.15328979492188, + "logps/rejected": -180.07046508789062, + "loss": 0.8877, + "rewards/accuracies": 0.25, + "rewards/chosen": -0.14792117476463318, + "rewards/margins": -0.30304527282714844, + "rewards/rejected": 0.15512409806251526, + "step": 3802 + }, + { + "epoch": 0.5881306785231007, + "grad_norm": 9.596186637878418, + "learning_rate": 4.466433726658266e-06, + "logits/chosen": 7.785845756530762, + "logits/rejected": 6.174363613128662, + "logps/chosen": -303.5019836425781, + "logps/rejected": -303.1763610839844, + "loss": 0.6227, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.3537774085998535, + "rewards/margins": 0.32071545720100403, + "rewards/rejected": 0.033061981201171875, + "step": 3803 + }, + { + "epoch": 0.5882853276628649, + "grad_norm": 5.346992015838623, + "learning_rate": 4.466147325008592e-06, + "logits/chosen": 10.00451946258545, + "logits/rejected": 14.103031158447266, + "logps/chosen": -239.40061950683594, + "logps/rejected": -245.2680206298828, + "loss": 0.6558, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.3363875448703766, + "rewards/margins": 0.17690998315811157, + "rewards/rejected": 0.15947756171226501, + "step": 3804 + }, + { + "epoch": 0.588439976802629, + "grad_norm": 6.007246017456055, + "learning_rate": 4.465860923358919e-06, + "logits/chosen": 5.067300319671631, + "logits/rejected": 6.6585211753845215, + "logps/chosen": -214.13839721679688, + "logps/rejected": -256.30999755859375, + "loss": 0.7586, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.022285493090748787, + "rewards/margins": -0.047274086624383926, + "rewards/rejected": 0.06955958902835846, + "step": 3805 + }, + { + "epoch": 0.5885946259423932, + "grad_norm": 4.4729509353637695, + "learning_rate": 4.4655745217092455e-06, + "logits/chosen": 11.940178871154785, + "logits/rejected": 2.373706340789795, + "logps/chosen": -277.29638671875, + "logps/rejected": -191.27943420410156, + "loss": 0.5275, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.2847093641757965, + "rewards/margins": 0.4315881133079529, + "rewards/rejected": -0.14687877893447876, + "step": 3806 + }, + { + "epoch": 0.5887492750821574, + "grad_norm": 4.49417781829834, + "learning_rate": 4.465288120059572e-06, + "logits/chosen": 5.422382354736328, + "logits/rejected": 1.3937599658966064, + "logps/chosen": -218.1869354248047, + "logps/rejected": -146.658203125, + "loss": 0.6267, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.3639605641365051, + "rewards/margins": 0.15747399628162384, + "rewards/rejected": 0.2064865380525589, + "step": 3807 + }, + { + "epoch": 0.5889039242219215, + "grad_norm": 4.653871059417725, + "learning_rate": 4.465001718409898e-06, + "logits/chosen": 10.03309440612793, + "logits/rejected": 9.686051368713379, + "logps/chosen": -230.60037231445312, + "logps/rejected": -260.16497802734375, + "loss": 0.5235, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.5185046195983887, + "rewards/margins": 0.46372655034065247, + "rewards/rejected": 0.05477805435657501, + "step": 3808 + }, + { + "epoch": 0.5890585733616857, + "grad_norm": 7.4475178718566895, + "learning_rate": 4.464715316760225e-06, + "logits/chosen": 6.612174034118652, + "logits/rejected": 6.437985420227051, + "logps/chosen": -207.719482421875, + "logps/rejected": -233.73641967773438, + "loss": 0.9596, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.030739925801753998, + "rewards/margins": -0.34236496686935425, + "rewards/rejected": 0.31162506341934204, + "step": 3809 + }, + { + "epoch": 0.5892132225014498, + "grad_norm": 5.061812400817871, + "learning_rate": 4.464428915110551e-06, + "logits/chosen": 9.823419570922852, + "logits/rejected": 6.179940223693848, + "logps/chosen": -300.6733703613281, + "logps/rejected": -208.8863525390625, + "loss": 0.6371, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.26062458753585815, + "rewards/margins": 0.26005592942237854, + "rewards/rejected": 0.0005686730146408081, + "step": 3810 + }, + { + "epoch": 0.589367871641214, + "grad_norm": 4.545434951782227, + "learning_rate": 4.464142513460878e-06, + "logits/chosen": 11.751673698425293, + "logits/rejected": 9.986916542053223, + "logps/chosen": -236.00482177734375, + "logps/rejected": -223.0562744140625, + "loss": 0.5292, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.3402070999145508, + "rewards/margins": 0.4128991961479187, + "rewards/rejected": -0.07269209623336792, + "step": 3811 + }, + { + "epoch": 0.5895225207809781, + "grad_norm": 6.3638105392456055, + "learning_rate": 4.463856111811205e-06, + "logits/chosen": 10.048778533935547, + "logits/rejected": 1.469570279121399, + "logps/chosen": -224.25860595703125, + "logps/rejected": -227.16107177734375, + "loss": 0.7472, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.17586174607276917, + "rewards/margins": 0.03858184814453125, + "rewards/rejected": -0.21444359421730042, + "step": 3812 + }, + { + "epoch": 0.5896771699207423, + "grad_norm": 5.346762657165527, + "learning_rate": 4.463569710161531e-06, + "logits/chosen": 8.229119300842285, + "logits/rejected": 9.617132186889648, + "logps/chosen": -275.83599853515625, + "logps/rejected": -285.33917236328125, + "loss": 0.6828, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.498769074678421, + "rewards/margins": 0.14329960942268372, + "rewards/rejected": 0.3554695248603821, + "step": 3813 + }, + { + "epoch": 0.5898318190605065, + "grad_norm": 5.457743167877197, + "learning_rate": 4.463283308511858e-06, + "logits/chosen": 11.086227416992188, + "logits/rejected": 8.038616180419922, + "logps/chosen": -286.2315673828125, + "logps/rejected": -306.5230712890625, + "loss": 0.5223, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.6428254842758179, + "rewards/margins": 0.4168838858604431, + "rewards/rejected": 0.22594158351421356, + "step": 3814 + }, + { + "epoch": 0.5899864682002707, + "grad_norm": 6.677029609680176, + "learning_rate": 4.462996906862184e-06, + "logits/chosen": 8.740394592285156, + "logits/rejected": 9.674402236938477, + "logps/chosen": -288.3377685546875, + "logps/rejected": -251.75526428222656, + "loss": 0.7327, + "rewards/accuracies": 0.25, + "rewards/chosen": 0.08418559283018112, + "rewards/margins": 0.055370181798934937, + "rewards/rejected": 0.028815407305955887, + "step": 3815 + }, + { + "epoch": 0.5901411173400348, + "grad_norm": 6.691431999206543, + "learning_rate": 4.46271050521251e-06, + "logits/chosen": 7.850305557250977, + "logits/rejected": 12.168737411499023, + "logps/chosen": -242.0833740234375, + "logps/rejected": -325.6463928222656, + "loss": 0.977, + "rewards/accuracies": 0.25, + "rewards/chosen": -0.06883291900157928, + "rewards/margins": -0.4395500421524048, + "rewards/rejected": 0.3707171082496643, + "step": 3816 + }, + { + "epoch": 0.590295766479799, + "grad_norm": 5.518157005310059, + "learning_rate": 4.462424103562837e-06, + "logits/chosen": 12.00661849975586, + "logits/rejected": 10.266444206237793, + "logps/chosen": -229.302978515625, + "logps/rejected": -197.03114318847656, + "loss": 0.7136, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.07545642554759979, + "rewards/margins": 0.03733096644282341, + "rewards/rejected": -0.1127873957157135, + "step": 3817 + }, + { + "epoch": 0.5904504156195631, + "grad_norm": 6.423452377319336, + "learning_rate": 4.462137701913164e-06, + "logits/chosen": 9.997434616088867, + "logits/rejected": 7.990696430206299, + "logps/chosen": -303.96868896484375, + "logps/rejected": -210.50595092773438, + "loss": 0.7367, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.4308145046234131, + "rewards/margins": 0.1137131005525589, + "rewards/rejected": 0.317101389169693, + "step": 3818 + }, + { + "epoch": 0.5906050647593273, + "grad_norm": 6.38089656829834, + "learning_rate": 4.46185130026349e-06, + "logits/chosen": 12.150814056396484, + "logits/rejected": 5.259729862213135, + "logps/chosen": -415.9914855957031, + "logps/rejected": -306.4114074707031, + "loss": 0.5969, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.40292906761169434, + "rewards/margins": 0.4225561022758484, + "rewards/rejected": -0.01962708681821823, + "step": 3819 + }, + { + "epoch": 0.5907597138990914, + "grad_norm": 7.6464338302612305, + "learning_rate": 4.461564898613817e-06, + "logits/chosen": 3.529846429824829, + "logits/rejected": 6.337932586669922, + "logps/chosen": -252.41787719726562, + "logps/rejected": -370.6591491699219, + "loss": 0.7054, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.3005480468273163, + "rewards/margins": 0.1689886599779129, + "rewards/rejected": 0.13155938684940338, + "step": 3820 + }, + { + "epoch": 0.5909143630388556, + "grad_norm": 3.996021032333374, + "learning_rate": 4.461278496964143e-06, + "logits/chosen": 10.435494422912598, + "logits/rejected": 2.8606481552124023, + "logps/chosen": -241.61514282226562, + "logps/rejected": -169.81324768066406, + "loss": 0.4563, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.4743422269821167, + "rewards/margins": 0.6487396359443665, + "rewards/rejected": -0.17439743876457214, + "step": 3821 + }, + { + "epoch": 0.5910690121786197, + "grad_norm": 9.007889747619629, + "learning_rate": 4.460992095314469e-06, + "logits/chosen": 7.150142192840576, + "logits/rejected": 7.351118087768555, + "logps/chosen": -345.4235534667969, + "logps/rejected": -299.5750732421875, + "loss": 0.6375, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.19699811935424805, + "rewards/margins": 0.23684947192668915, + "rewards/rejected": -0.03985138610005379, + "step": 3822 + }, + { + "epoch": 0.5912236613183839, + "grad_norm": 5.481825828552246, + "learning_rate": 4.460705693664796e-06, + "logits/chosen": 5.335975170135498, + "logits/rejected": 6.277894973754883, + "logps/chosen": -231.02935791015625, + "logps/rejected": -288.1390380859375, + "loss": 0.5218, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.36069411039352417, + "rewards/margins": 0.4748423099517822, + "rewards/rejected": -0.11414818465709686, + "step": 3823 + }, + { + "epoch": 0.591378310458148, + "grad_norm": 3.6080944538116455, + "learning_rate": 4.460419292015123e-06, + "logits/chosen": 17.56140899658203, + "logits/rejected": 12.553186416625977, + "logps/chosen": -288.2450256347656, + "logps/rejected": -199.40164184570312, + "loss": 0.5734, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.3803260326385498, + "rewards/margins": 0.4496496915817261, + "rewards/rejected": -0.06932364404201508, + "step": 3824 + }, + { + "epoch": 0.5915329595979122, + "grad_norm": 7.105933666229248, + "learning_rate": 4.460132890365449e-06, + "logits/chosen": 7.943840980529785, + "logits/rejected": 7.007705211639404, + "logps/chosen": -235.16064453125, + "logps/rejected": -277.0152893066406, + "loss": 0.7742, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.34620848298072815, + "rewards/margins": 0.01781301200389862, + "rewards/rejected": 0.3283954858779907, + "step": 3825 + }, + { + "epoch": 0.5916876087376765, + "grad_norm": 7.853766918182373, + "learning_rate": 4.459846488715775e-06, + "logits/chosen": 10.202415466308594, + "logits/rejected": 6.962385654449463, + "logps/chosen": -456.65521240234375, + "logps/rejected": -348.7169189453125, + "loss": 0.7503, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.37237823009490967, + "rewards/margins": 0.0034618377685546875, + "rewards/rejected": 0.368916392326355, + "step": 3826 + }, + { + "epoch": 0.5918422578774406, + "grad_norm": 7.678735256195068, + "learning_rate": 4.459560087066102e-06, + "logits/chosen": 6.420163631439209, + "logits/rejected": 8.043878555297852, + "logps/chosen": -292.2291564941406, + "logps/rejected": -365.5219421386719, + "loss": 1.0816, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.12138272821903229, + "rewards/margins": -0.44529005885124207, + "rewards/rejected": 0.5666728019714355, + "step": 3827 + }, + { + "epoch": 0.5919969070172048, + "grad_norm": 4.5391411781311035, + "learning_rate": 4.4592736854164285e-06, + "logits/chosen": 15.583244323730469, + "logits/rejected": 8.068771362304688, + "logps/chosen": -335.2794494628906, + "logps/rejected": -264.66888427734375, + "loss": 0.469, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.3907284736633301, + "rewards/margins": 0.5339459180831909, + "rewards/rejected": -0.14321747422218323, + "step": 3828 + }, + { + "epoch": 0.5921515561569689, + "grad_norm": 6.282165050506592, + "learning_rate": 4.458987283766755e-06, + "logits/chosen": 11.59078311920166, + "logits/rejected": 2.815585136413574, + "logps/chosen": -378.8709716796875, + "logps/rejected": -231.02513122558594, + "loss": 0.6197, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.4551241993904114, + "rewards/margins": 0.3134673535823822, + "rewards/rejected": 0.14165687561035156, + "step": 3829 + }, + { + "epoch": 0.5923062052967331, + "grad_norm": 6.958285331726074, + "learning_rate": 4.458700882117081e-06, + "logits/chosen": 9.933951377868652, + "logits/rejected": 6.501849174499512, + "logps/chosen": -294.91937255859375, + "logps/rejected": -272.4971923828125, + "loss": 0.6409, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.036932624876499176, + "rewards/margins": 0.2048449069261551, + "rewards/rejected": -0.1679123044013977, + "step": 3830 + }, + { + "epoch": 0.5924608544364972, + "grad_norm": 6.097238540649414, + "learning_rate": 4.4584144804674076e-06, + "logits/chosen": 9.449438095092773, + "logits/rejected": 10.267274856567383, + "logps/chosen": -268.89508056640625, + "logps/rejected": -248.31124877929688, + "loss": 0.7914, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.2020013928413391, + "rewards/margins": -0.02382427453994751, + "rewards/rejected": 0.22582568228244781, + "step": 3831 + }, + { + "epoch": 0.5926155035762614, + "grad_norm": 4.1570539474487305, + "learning_rate": 4.458128078817734e-06, + "logits/chosen": 9.957818984985352, + "logits/rejected": -4.526979446411133, + "logps/chosen": -316.7931823730469, + "logps/rejected": -135.7413330078125, + "loss": 0.4693, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.19125862419605255, + "rewards/margins": 0.7389428019523621, + "rewards/rejected": -0.5476841926574707, + "step": 3832 + }, + { + "epoch": 0.5927701527160255, + "grad_norm": 14.386735916137695, + "learning_rate": 4.457841677168061e-06, + "logits/chosen": 11.602571487426758, + "logits/rejected": 6.634931564331055, + "logps/chosen": -256.8283386230469, + "logps/rejected": -205.56515502929688, + "loss": 0.8041, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.10648277401924133, + "rewards/margins": -0.10399642586708069, + "rewards/rejected": 0.21047921478748322, + "step": 3833 + }, + { + "epoch": 0.5929248018557897, + "grad_norm": 4.555590629577637, + "learning_rate": 4.4575552755183875e-06, + "logits/chosen": 9.226140022277832, + "logits/rejected": 5.296758651733398, + "logps/chosen": -347.5211486816406, + "logps/rejected": -244.83934020996094, + "loss": 0.7017, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.6001803874969482, + "rewards/margins": 0.2005918025970459, + "rewards/rejected": 0.39958858489990234, + "step": 3834 + }, + { + "epoch": 0.5930794509955538, + "grad_norm": 7.292612075805664, + "learning_rate": 4.457268873868713e-06, + "logits/chosen": 5.563287258148193, + "logits/rejected": 6.9433770179748535, + "logps/chosen": -325.90484619140625, + "logps/rejected": -262.2660217285156, + "loss": 0.6572, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.5089548230171204, + "rewards/margins": 0.22604858875274658, + "rewards/rejected": 0.28290626406669617, + "step": 3835 + }, + { + "epoch": 0.593234100135318, + "grad_norm": 4.737924098968506, + "learning_rate": 4.45698247221904e-06, + "logits/chosen": 6.556214809417725, + "logits/rejected": 8.457965850830078, + "logps/chosen": -291.62994384765625, + "logps/rejected": -344.5210266113281, + "loss": 0.4397, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.30360427498817444, + "rewards/margins": 0.8144075870513916, + "rewards/rejected": -0.5108033418655396, + "step": 3836 + }, + { + "epoch": 0.5933887492750821, + "grad_norm": 6.47429084777832, + "learning_rate": 4.456696070569367e-06, + "logits/chosen": 14.299200057983398, + "logits/rejected": 10.721549034118652, + "logps/chosen": -416.1689758300781, + "logps/rejected": -359.9927673339844, + "loss": 0.6117, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.7683303952217102, + "rewards/margins": 0.29366546869277954, + "rewards/rejected": 0.47466492652893066, + "step": 3837 + }, + { + "epoch": 0.5935433984148463, + "grad_norm": 4.734052658081055, + "learning_rate": 4.456409668919693e-06, + "logits/chosen": 13.956354141235352, + "logits/rejected": 12.158875465393066, + "logps/chosen": -390.4969482421875, + "logps/rejected": -265.8102722167969, + "loss": 0.782, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.20502310991287231, + "rewards/margins": -0.12449908256530762, + "rewards/rejected": 0.32952219247817993, + "step": 3838 + }, + { + "epoch": 0.5936980475546105, + "grad_norm": 3.339404821395874, + "learning_rate": 4.45612326727002e-06, + "logits/chosen": 10.890353202819824, + "logits/rejected": 3.5730478763580322, + "logps/chosen": -276.361572265625, + "logps/rejected": -129.41502380371094, + "loss": 0.5567, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.14239682257175446, + "rewards/margins": 0.3733072876930237, + "rewards/rejected": -0.23091045022010803, + "step": 3839 + }, + { + "epoch": 0.5938526966943747, + "grad_norm": 3.691633701324463, + "learning_rate": 4.4558368656203466e-06, + "logits/chosen": 14.728607177734375, + "logits/rejected": 5.328463554382324, + "logps/chosen": -233.8325958251953, + "logps/rejected": -135.4236602783203, + "loss": 0.5692, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.1695854365825653, + "rewards/margins": 0.3227081894874573, + "rewards/rejected": -0.15312275290489197, + "step": 3840 + }, + { + "epoch": 0.5940073458341388, + "grad_norm": 7.030613422393799, + "learning_rate": 4.455550463970672e-06, + "logits/chosen": 7.401883602142334, + "logits/rejected": 14.989568710327148, + "logps/chosen": -201.39202880859375, + "logps/rejected": -274.390625, + "loss": 0.7675, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.02464757114648819, + "rewards/margins": -0.042287155985832214, + "rewards/rejected": 0.01763959228992462, + "step": 3841 + }, + { + "epoch": 0.594161994973903, + "grad_norm": 4.3906569480896, + "learning_rate": 4.455264062320999e-06, + "logits/chosen": 11.358149528503418, + "logits/rejected": 9.51582145690918, + "logps/chosen": -259.2760314941406, + "logps/rejected": -230.26043701171875, + "loss": 0.6523, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.3054589331150055, + "rewards/margins": 0.15285935997962952, + "rewards/rejected": 0.15259957313537598, + "step": 3842 + }, + { + "epoch": 0.5943166441136671, + "grad_norm": 5.709754943847656, + "learning_rate": 4.454977660671326e-06, + "logits/chosen": 5.496818542480469, + "logits/rejected": 10.349492073059082, + "logps/chosen": -282.9944152832031, + "logps/rejected": -270.3442077636719, + "loss": 0.6976, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.13073718547821045, + "rewards/margins": 0.007633231580257416, + "rewards/rejected": 0.12310396134853363, + "step": 3843 + }, + { + "epoch": 0.5944712932534313, + "grad_norm": 4.264304161071777, + "learning_rate": 4.454691259021652e-06, + "logits/chosen": 10.246711730957031, + "logits/rejected": 4.339803695678711, + "logps/chosen": -272.0284118652344, + "logps/rejected": -175.11941528320312, + "loss": 0.6163, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.07712607085704803, + "rewards/margins": 0.1974329948425293, + "rewards/rejected": -0.12030691653490067, + "step": 3844 + }, + { + "epoch": 0.5946259423931954, + "grad_norm": 18.1430606842041, + "learning_rate": 4.454404857371979e-06, + "logits/chosen": 9.121091842651367, + "logits/rejected": 2.7126824855804443, + "logps/chosen": -368.8348388671875, + "logps/rejected": -270.771728515625, + "loss": 0.7438, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.30250370502471924, + "rewards/margins": 0.003220662474632263, + "rewards/rejected": 0.2992830276489258, + "step": 3845 + }, + { + "epoch": 0.5947805915329596, + "grad_norm": 3.467616558074951, + "learning_rate": 4.454118455722306e-06, + "logits/chosen": 6.432444095611572, + "logits/rejected": 4.621376037597656, + "logps/chosen": -161.54161071777344, + "logps/rejected": -147.1109161376953, + "loss": 0.5771, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.2612975835800171, + "rewards/margins": 0.45776522159576416, + "rewards/rejected": -0.19646763801574707, + "step": 3846 + }, + { + "epoch": 0.5949352406727237, + "grad_norm": 6.248945236206055, + "learning_rate": 4.453832054072632e-06, + "logits/chosen": 2.3398420810699463, + "logits/rejected": 7.798948287963867, + "logps/chosen": -254.56573486328125, + "logps/rejected": -282.75567626953125, + "loss": 0.704, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.05542530119419098, + "rewards/margins": 0.1525730937719345, + "rewards/rejected": -0.2079983949661255, + "step": 3847 + }, + { + "epoch": 0.5950898898124879, + "grad_norm": 4.023340702056885, + "learning_rate": 4.453545652422958e-06, + "logits/chosen": 9.701852798461914, + "logits/rejected": 2.5256001949310303, + "logps/chosen": -195.063232421875, + "logps/rejected": -162.69512939453125, + "loss": 0.5169, + "rewards/accuracies": 0.75, + "rewards/chosen": 1.2550214529037476, + "rewards/margins": 1.0442503690719604, + "rewards/rejected": 0.21077106893062592, + "step": 3848 + }, + { + "epoch": 0.595244538952252, + "grad_norm": 6.15097188949585, + "learning_rate": 4.453259250773285e-06, + "logits/chosen": 7.361240386962891, + "logits/rejected": 5.024587631225586, + "logps/chosen": -315.0986633300781, + "logps/rejected": -231.8363494873047, + "loss": 0.7076, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.35231637954711914, + "rewards/margins": 0.15271194279193878, + "rewards/rejected": 0.19960442185401917, + "step": 3849 + }, + { + "epoch": 0.5953991880920162, + "grad_norm": 4.188508033752441, + "learning_rate": 4.452972849123611e-06, + "logits/chosen": 7.146430015563965, + "logits/rejected": 8.702651023864746, + "logps/chosen": -279.047119140625, + "logps/rejected": -298.24932861328125, + "loss": 0.523, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.35923728346824646, + "rewards/margins": 0.5597008466720581, + "rewards/rejected": -0.20046359300613403, + "step": 3850 + }, + { + "epoch": 0.5955538372317803, + "grad_norm": 4.777875900268555, + "learning_rate": 4.452686447473938e-06, + "logits/chosen": 14.22570514678955, + "logits/rejected": 6.961708068847656, + "logps/chosen": -384.77789306640625, + "logps/rejected": -272.19500732421875, + "loss": 0.5584, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.7798794507980347, + "rewards/margins": 0.37557628750801086, + "rewards/rejected": 0.4043031930923462, + "step": 3851 + }, + { + "epoch": 0.5957084863715446, + "grad_norm": 21.18962287902832, + "learning_rate": 4.452400045824265e-06, + "logits/chosen": 8.901566505432129, + "logits/rejected": 11.886366844177246, + "logps/chosen": -213.25863647460938, + "logps/rejected": -315.91119384765625, + "loss": 0.7951, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.2784026265144348, + "rewards/margins": -0.04361967742443085, + "rewards/rejected": 0.32202231884002686, + "step": 3852 + }, + { + "epoch": 0.5958631355113088, + "grad_norm": 4.977728843688965, + "learning_rate": 4.452113644174591e-06, + "logits/chosen": 13.325644493103027, + "logits/rejected": 10.87017822265625, + "logps/chosen": -232.05148315429688, + "logps/rejected": -221.09571838378906, + "loss": 0.6102, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.15060526132583618, + "rewards/margins": 0.289823442697525, + "rewards/rejected": -0.13921819627285004, + "step": 3853 + }, + { + "epoch": 0.5960177846510729, + "grad_norm": 3.847041130065918, + "learning_rate": 4.451827242524917e-06, + "logits/chosen": 13.751547813415527, + "logits/rejected": 5.848457336425781, + "logps/chosen": -207.70892333984375, + "logps/rejected": -119.81642150878906, + "loss": 0.6032, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.16600199043750763, + "rewards/margins": 0.3585396409034729, + "rewards/rejected": -0.19253763556480408, + "step": 3854 + }, + { + "epoch": 0.5961724337908371, + "grad_norm": 8.761685371398926, + "learning_rate": 4.451540840875244e-06, + "logits/chosen": 12.882061004638672, + "logits/rejected": 3.717501163482666, + "logps/chosen": -441.8450927734375, + "logps/rejected": -333.88946533203125, + "loss": 0.7972, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.02096595987677574, + "rewards/margins": 0.3197178542613983, + "rewards/rejected": -0.34068381786346436, + "step": 3855 + }, + { + "epoch": 0.5963270829306012, + "grad_norm": 7.407461166381836, + "learning_rate": 4.4512544392255704e-06, + "logits/chosen": 8.958114624023438, + "logits/rejected": 10.856538772583008, + "logps/chosen": -282.1307373046875, + "logps/rejected": -315.6894226074219, + "loss": 0.8233, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.33628833293914795, + "rewards/margins": -0.17250213027000427, + "rewards/rejected": 0.5087904334068298, + "step": 3856 + }, + { + "epoch": 0.5964817320703654, + "grad_norm": 6.256063461303711, + "learning_rate": 4.450968037575897e-06, + "logits/chosen": 8.043414115905762, + "logits/rejected": 2.5432448387145996, + "logps/chosen": -280.5829162597656, + "logps/rejected": -207.14633178710938, + "loss": 0.7455, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.015946581959724426, + "rewards/margins": 0.127707839012146, + "rewards/rejected": -0.14365443587303162, + "step": 3857 + }, + { + "epoch": 0.5966363812101295, + "grad_norm": 5.679924011230469, + "learning_rate": 4.450681635926224e-06, + "logits/chosen": 9.277485847473145, + "logits/rejected": 9.283084869384766, + "logps/chosen": -209.117919921875, + "logps/rejected": -267.44671630859375, + "loss": 0.681, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.3057743012905121, + "rewards/margins": 0.16223332285881042, + "rewards/rejected": 0.14354099333286285, + "step": 3858 + }, + { + "epoch": 0.5967910303498937, + "grad_norm": 5.220340251922607, + "learning_rate": 4.4503952342765495e-06, + "logits/chosen": 7.739818572998047, + "logits/rejected": 4.137601375579834, + "logps/chosen": -256.84930419921875, + "logps/rejected": -240.44061279296875, + "loss": 0.674, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.3029986619949341, + "rewards/margins": 0.1641806960105896, + "rewards/rejected": 0.13881796598434448, + "step": 3859 + }, + { + "epoch": 0.5969456794896578, + "grad_norm": 4.8695268630981445, + "learning_rate": 4.450108832626876e-06, + "logits/chosen": 10.396373748779297, + "logits/rejected": 7.189081192016602, + "logps/chosen": -218.73585510253906, + "logps/rejected": -150.6077423095703, + "loss": 0.7079, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.3363502323627472, + "rewards/margins": 0.03585357964038849, + "rewards/rejected": 0.3004966974258423, + "step": 3860 + }, + { + "epoch": 0.597100328629422, + "grad_norm": 5.034574031829834, + "learning_rate": 4.449822430977203e-06, + "logits/chosen": 16.268657684326172, + "logits/rejected": 10.540192604064941, + "logps/chosen": -232.69375610351562, + "logps/rejected": -195.15472412109375, + "loss": 0.6918, + "rewards/accuracies": 0.25, + "rewards/chosen": 0.14549486339092255, + "rewards/margins": 0.08534926176071167, + "rewards/rejected": 0.06014557182788849, + "step": 3861 + }, + { + "epoch": 0.5972549777691861, + "grad_norm": 4.311427593231201, + "learning_rate": 4.4495360293275295e-06, + "logits/chosen": 7.273316383361816, + "logits/rejected": 3.2542858123779297, + "logps/chosen": -180.52978515625, + "logps/rejected": -131.00286865234375, + "loss": 0.689, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.07191963493824005, + "rewards/margins": 0.08312667906284332, + "rewards/rejected": -0.15504629909992218, + "step": 3862 + }, + { + "epoch": 0.5974096269089503, + "grad_norm": 7.141695976257324, + "learning_rate": 4.449249627677856e-06, + "logits/chosen": 14.52280330657959, + "logits/rejected": 11.800079345703125, + "logps/chosen": -304.2814025878906, + "logps/rejected": -331.8609619140625, + "loss": 0.563, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.5603678226470947, + "rewards/margins": 0.45084840059280396, + "rewards/rejected": 0.10951948165893555, + "step": 3863 + }, + { + "epoch": 0.5975642760487144, + "grad_norm": 5.070034503936768, + "learning_rate": 4.448963226028182e-06, + "logits/chosen": 8.325569152832031, + "logits/rejected": 10.92115306854248, + "logps/chosen": -250.042724609375, + "logps/rejected": -270.6550598144531, + "loss": 0.586, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.05914616584777832, + "rewards/margins": 0.245741605758667, + "rewards/rejected": -0.18659542500972748, + "step": 3864 + }, + { + "epoch": 0.5977189251884787, + "grad_norm": 9.244002342224121, + "learning_rate": 4.448676824378509e-06, + "logits/chosen": 8.563394546508789, + "logits/rejected": 8.198019981384277, + "logps/chosen": -293.6990966796875, + "logps/rejected": -294.89068603515625, + "loss": 0.8749, + "rewards/accuracies": 0.25, + "rewards/chosen": -0.05756746232509613, + "rewards/margins": -0.26582416892051697, + "rewards/rejected": 0.20825672149658203, + "step": 3865 + }, + { + "epoch": 0.5978735743282428, + "grad_norm": 5.550826549530029, + "learning_rate": 4.448390422728835e-06, + "logits/chosen": 9.64195728302002, + "logits/rejected": 5.02839469909668, + "logps/chosen": -213.12632751464844, + "logps/rejected": -170.35665893554688, + "loss": 0.603, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.28932976722717285, + "rewards/margins": 0.2146056592464447, + "rewards/rejected": 0.07472410798072815, + "step": 3866 + }, + { + "epoch": 0.598028223468007, + "grad_norm": 6.245521068572998, + "learning_rate": 4.448104021079162e-06, + "logits/chosen": 12.155706405639648, + "logits/rejected": 11.467378616333008, + "logps/chosen": -299.26348876953125, + "logps/rejected": -261.70550537109375, + "loss": 0.6703, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.017500489950180054, + "rewards/margins": 0.172099307179451, + "rewards/rejected": -0.15459880232810974, + "step": 3867 + }, + { + "epoch": 0.5981828726077711, + "grad_norm": 6.4365949630737305, + "learning_rate": 4.447817619429488e-06, + "logits/chosen": 14.18918228149414, + "logits/rejected": 8.934442520141602, + "logps/chosen": -401.605712890625, + "logps/rejected": -339.048828125, + "loss": 0.7455, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.2455999255180359, + "rewards/margins": 0.042389582842588425, + "rewards/rejected": 0.20321033895015717, + "step": 3868 + }, + { + "epoch": 0.5983375217475353, + "grad_norm": 7.192135334014893, + "learning_rate": 4.447531217779814e-06, + "logits/chosen": 7.545492649078369, + "logits/rejected": 3.3931689262390137, + "logps/chosen": -296.37615966796875, + "logps/rejected": -181.6265106201172, + "loss": 0.7363, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.19697798788547516, + "rewards/margins": 0.019417472183704376, + "rewards/rejected": 0.17756050825119019, + "step": 3869 + }, + { + "epoch": 0.5984921708872994, + "grad_norm": 6.12122917175293, + "learning_rate": 4.447244816130141e-06, + "logits/chosen": 10.559307098388672, + "logits/rejected": 11.585685729980469, + "logps/chosen": -291.38818359375, + "logps/rejected": -283.39215087890625, + "loss": 0.7217, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.31820911169052124, + "rewards/margins": 0.023026492446660995, + "rewards/rejected": 0.29518258571624756, + "step": 3870 + }, + { + "epoch": 0.5986468200270636, + "grad_norm": 6.119382858276367, + "learning_rate": 4.446958414480468e-06, + "logits/chosen": 14.64987850189209, + "logits/rejected": 12.435090065002441, + "logps/chosen": -290.3376770019531, + "logps/rejected": -269.3126220703125, + "loss": 0.7163, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.2654527723789215, + "rewards/margins": 0.09620637446641922, + "rewards/rejected": 0.1692463904619217, + "step": 3871 + }, + { + "epoch": 0.5988014691668277, + "grad_norm": 3.763533115386963, + "learning_rate": 4.446672012830794e-06, + "logits/chosen": 5.599374294281006, + "logits/rejected": 3.2607431411743164, + "logps/chosen": -261.7787170410156, + "logps/rejected": -208.43399047851562, + "loss": 0.604, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.07701735198497772, + "rewards/margins": 0.3463677167892456, + "rewards/rejected": -0.2693503201007843, + "step": 3872 + }, + { + "epoch": 0.5989561183065919, + "grad_norm": 4.655757427215576, + "learning_rate": 4.446385611181121e-06, + "logits/chosen": 9.932258605957031, + "logits/rejected": 8.173981666564941, + "logps/chosen": -266.74591064453125, + "logps/rejected": -242.66490173339844, + "loss": 0.6097, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.3883005380630493, + "rewards/margins": 0.21472401916980743, + "rewards/rejected": 0.17357653379440308, + "step": 3873 + }, + { + "epoch": 0.599110767446356, + "grad_norm": 4.346072196960449, + "learning_rate": 4.446099209531447e-06, + "logits/chosen": 5.958211898803711, + "logits/rejected": 5.569001197814941, + "logps/chosen": -175.85031127929688, + "logps/rejected": -187.32028198242188, + "loss": 0.8729, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.055561259388923645, + "rewards/margins": -0.1847131848335266, + "rewards/rejected": 0.24027448892593384, + "step": 3874 + }, + { + "epoch": 0.5992654165861202, + "grad_norm": 4.736356258392334, + "learning_rate": 4.445812807881773e-06, + "logits/chosen": 6.78809118270874, + "logits/rejected": 13.442954063415527, + "logps/chosen": -134.38375854492188, + "logps/rejected": -239.40689086914062, + "loss": 0.6526, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.24932880699634552, + "rewards/margins": 0.1478361338376999, + "rewards/rejected": 0.10149267315864563, + "step": 3875 + }, + { + "epoch": 0.5994200657258844, + "grad_norm": 6.878830432891846, + "learning_rate": 4.4455264062321e-06, + "logits/chosen": 8.970477104187012, + "logits/rejected": 2.57135009765625, + "logps/chosen": -255.8218994140625, + "logps/rejected": -268.3470458984375, + "loss": 0.6149, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.13815096020698547, + "rewards/margins": 0.4011203646659851, + "rewards/rejected": -0.26296940445899963, + "step": 3876 + }, + { + "epoch": 0.5995747148656485, + "grad_norm": 6.638808727264404, + "learning_rate": 4.445240004582427e-06, + "logits/chosen": 10.611102104187012, + "logits/rejected": 7.543502330780029, + "logps/chosen": -284.49700927734375, + "logps/rejected": -269.28411865234375, + "loss": 0.7401, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.20888566970825195, + "rewards/margins": -0.07874555140733719, + "rewards/rejected": 0.28763121366500854, + "step": 3877 + }, + { + "epoch": 0.5997293640054128, + "grad_norm": 7.689025402069092, + "learning_rate": 4.444953602932753e-06, + "logits/chosen": 10.126932144165039, + "logits/rejected": 10.684929847717285, + "logps/chosen": -434.2848815917969, + "logps/rejected": -358.90533447265625, + "loss": 0.7248, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.626960039138794, + "rewards/margins": 0.07752152532339096, + "rewards/rejected": 0.5494384765625, + "step": 3878 + }, + { + "epoch": 0.5998840131451769, + "grad_norm": 3.148813486099243, + "learning_rate": 4.44466720128308e-06, + "logits/chosen": 9.07924747467041, + "logits/rejected": -1.7599058151245117, + "logps/chosen": -269.93597412109375, + "logps/rejected": -132.77688598632812, + "loss": 0.4736, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.7984883189201355, + "rewards/margins": 0.8481366038322449, + "rewards/rejected": -0.04964829981327057, + "step": 3879 + }, + { + "epoch": 0.6000386622849411, + "grad_norm": 5.296482086181641, + "learning_rate": 4.444380799633407e-06, + "logits/chosen": 7.485034942626953, + "logits/rejected": 6.309716701507568, + "logps/chosen": -293.6840515136719, + "logps/rejected": -215.73464965820312, + "loss": 0.7029, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.3642003536224365, + "rewards/margins": 0.11009103059768677, + "rewards/rejected": 0.25410932302474976, + "step": 3880 + }, + { + "epoch": 0.6001933114247052, + "grad_norm": 4.511099338531494, + "learning_rate": 4.4440943979837325e-06, + "logits/chosen": 9.776927947998047, + "logits/rejected": 6.702047348022461, + "logps/chosen": -392.608154296875, + "logps/rejected": -285.5046691894531, + "loss": 0.5595, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.37085992097854614, + "rewards/margins": 0.3414820432662964, + "rewards/rejected": 0.029377862811088562, + "step": 3881 + }, + { + "epoch": 0.6003479605644694, + "grad_norm": 5.933185577392578, + "learning_rate": 4.443807996334059e-06, + "logits/chosen": 8.37652587890625, + "logits/rejected": 11.007089614868164, + "logps/chosen": -172.8363037109375, + "logps/rejected": -191.82162475585938, + "loss": 0.8593, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.0849263072013855, + "rewards/margins": -0.2628410756587982, + "rewards/rejected": 0.34776735305786133, + "step": 3882 + }, + { + "epoch": 0.6005026097042335, + "grad_norm": 5.383392810821533, + "learning_rate": 4.443521594684386e-06, + "logits/chosen": 10.5837984085083, + "logits/rejected": 9.564327239990234, + "logps/chosen": -246.11691284179688, + "logps/rejected": -241.84043884277344, + "loss": 0.642, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.33095866441726685, + "rewards/margins": 0.20551368594169617, + "rewards/rejected": 0.12544497847557068, + "step": 3883 + }, + { + "epoch": 0.6006572588439977, + "grad_norm": 5.434359073638916, + "learning_rate": 4.443235193034712e-06, + "logits/chosen": 14.77172565460205, + "logits/rejected": 8.474355697631836, + "logps/chosen": -376.6193542480469, + "logps/rejected": -229.8174285888672, + "loss": 0.5727, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.6839893460273743, + "rewards/margins": 0.3474277853965759, + "rewards/rejected": 0.3365615904331207, + "step": 3884 + }, + { + "epoch": 0.6008119079837618, + "grad_norm": 6.1834893226623535, + "learning_rate": 4.442948791385039e-06, + "logits/chosen": 7.505656719207764, + "logits/rejected": 9.484930992126465, + "logps/chosen": -347.8443603515625, + "logps/rejected": -343.0873718261719, + "loss": 0.6072, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.656563401222229, + "rewards/margins": 0.24430981278419495, + "rewards/rejected": 0.41225361824035645, + "step": 3885 + }, + { + "epoch": 0.600966557123526, + "grad_norm": 6.101221561431885, + "learning_rate": 4.442662389735366e-06, + "logits/chosen": 12.881450653076172, + "logits/rejected": 5.2534708976745605, + "logps/chosen": -412.58831787109375, + "logps/rejected": -260.46978759765625, + "loss": 0.5715, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.4104612171649933, + "rewards/margins": 0.38385143876075745, + "rewards/rejected": 0.026609785854816437, + "step": 3886 + }, + { + "epoch": 0.6011212062632901, + "grad_norm": 5.960506439208984, + "learning_rate": 4.4423759880856915e-06, + "logits/chosen": 10.128340721130371, + "logits/rejected": 11.28730297088623, + "logps/chosen": -248.02017211914062, + "logps/rejected": -274.02532958984375, + "loss": 0.8383, + "rewards/accuracies": 0.25, + "rewards/chosen": 0.5649007558822632, + "rewards/margins": -0.21767978370189667, + "rewards/rejected": 0.7825806140899658, + "step": 3887 + }, + { + "epoch": 0.6012758554030543, + "grad_norm": 16.900802612304688, + "learning_rate": 4.442089586436018e-06, + "logits/chosen": 11.56409740447998, + "logits/rejected": 7.57831335067749, + "logps/chosen": -393.04583740234375, + "logps/rejected": -164.96380615234375, + "loss": 0.7871, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.11604196578264236, + "rewards/margins": -0.03710642457008362, + "rewards/rejected": -0.07893553376197815, + "step": 3888 + }, + { + "epoch": 0.6014305045428184, + "grad_norm": 5.817917346954346, + "learning_rate": 4.441803184786345e-06, + "logits/chosen": 7.873283863067627, + "logits/rejected": 10.702044486999512, + "logps/chosen": -177.4079132080078, + "logps/rejected": -238.53720092773438, + "loss": 0.8187, + "rewards/accuracies": 0.25, + "rewards/chosen": -0.07048788666725159, + "rewards/margins": -0.18652144074440002, + "rewards/rejected": 0.11603355407714844, + "step": 3889 + }, + { + "epoch": 0.6015851536825826, + "grad_norm": 7.263823509216309, + "learning_rate": 4.4415167831366715e-06, + "logits/chosen": 13.044964790344238, + "logits/rejected": 10.323070526123047, + "logps/chosen": -305.1011962890625, + "logps/rejected": -262.8005065917969, + "loss": 0.9423, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.18749113380908966, + "rewards/margins": -0.21726539731025696, + "rewards/rejected": 0.4047565162181854, + "step": 3890 + }, + { + "epoch": 0.6017398028223468, + "grad_norm": 3.867288827896118, + "learning_rate": 4.441230381486998e-06, + "logits/chosen": 9.105090141296387, + "logits/rejected": -2.6467838287353516, + "logps/chosen": -274.07318115234375, + "logps/rejected": -142.89996337890625, + "loss": 0.5237, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.3479274809360504, + "rewards/margins": 0.5475428104400635, + "rewards/rejected": -0.19961532950401306, + "step": 3891 + }, + { + "epoch": 0.601894451962111, + "grad_norm": 5.0648322105407715, + "learning_rate": 4.440943979837325e-06, + "logits/chosen": 14.801642417907715, + "logits/rejected": 6.9151105880737305, + "logps/chosen": -290.8497009277344, + "logps/rejected": -180.7908172607422, + "loss": 0.661, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.2503805160522461, + "rewards/margins": 0.15373292565345764, + "rewards/rejected": 0.09664759039878845, + "step": 3892 + }, + { + "epoch": 0.6020491011018752, + "grad_norm": 12.034680366516113, + "learning_rate": 4.440657578187651e-06, + "logits/chosen": 12.5345458984375, + "logits/rejected": 9.806342124938965, + "logps/chosen": -524.4227294921875, + "logps/rejected": -496.8655700683594, + "loss": 0.6278, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.5607287883758545, + "rewards/margins": 0.21646201610565186, + "rewards/rejected": 0.34426674246788025, + "step": 3893 + }, + { + "epoch": 0.6022037502416393, + "grad_norm": 6.952269554138184, + "learning_rate": 4.440371176537977e-06, + "logits/chosen": 5.1526103019714355, + "logits/rejected": 6.769500732421875, + "logps/chosen": -247.334228515625, + "logps/rejected": -285.9931945800781, + "loss": 0.7882, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.29980432987213135, + "rewards/margins": -0.08810777962207794, + "rewards/rejected": -0.21169652044773102, + "step": 3894 + }, + { + "epoch": 0.6023583993814035, + "grad_norm": 5.4862446784973145, + "learning_rate": 4.440084774888304e-06, + "logits/chosen": 8.05799388885498, + "logits/rejected": 7.2748870849609375, + "logps/chosen": -240.3828125, + "logps/rejected": -192.3104248046875, + "loss": 0.7319, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.3096157908439636, + "rewards/margins": 0.17935967445373535, + "rewards/rejected": 0.13025611639022827, + "step": 3895 + }, + { + "epoch": 0.6025130485211676, + "grad_norm": 7.092531204223633, + "learning_rate": 4.4397983732386305e-06, + "logits/chosen": 11.093854904174805, + "logits/rejected": 6.925727367401123, + "logps/chosen": -302.0999755859375, + "logps/rejected": -260.83953857421875, + "loss": 0.8798, + "rewards/accuracies": 0.25, + "rewards/chosen": 0.5152764916419983, + "rewards/margins": -0.2856329679489136, + "rewards/rejected": 0.8009095191955566, + "step": 3896 + }, + { + "epoch": 0.6026676976609318, + "grad_norm": 34.546836853027344, + "learning_rate": 4.439511971588956e-06, + "logits/chosen": 9.177679061889648, + "logits/rejected": 3.0738775730133057, + "logps/chosen": -396.36578369140625, + "logps/rejected": -280.2530212402344, + "loss": 0.7116, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.5396541357040405, + "rewards/margins": 0.027874130755662918, + "rewards/rejected": 0.5117799639701843, + "step": 3897 + }, + { + "epoch": 0.6028223468006959, + "grad_norm": 5.122930526733398, + "learning_rate": 4.439225569939283e-06, + "logits/chosen": 4.455689907073975, + "logits/rejected": 8.010992050170898, + "logps/chosen": -286.8979797363281, + "logps/rejected": -381.677734375, + "loss": 0.643, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.5623769760131836, + "rewards/margins": 0.14786362648010254, + "rewards/rejected": 0.41451334953308105, + "step": 3898 + }, + { + "epoch": 0.6029769959404601, + "grad_norm": 6.4154372215271, + "learning_rate": 4.43893916828961e-06, + "logits/chosen": 13.473001480102539, + "logits/rejected": 8.6923828125, + "logps/chosen": -345.00665283203125, + "logps/rejected": -279.5361022949219, + "loss": 0.7225, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.4932249188423157, + "rewards/margins": 0.09298592060804367, + "rewards/rejected": 0.4002390205860138, + "step": 3899 + }, + { + "epoch": 0.6031316450802242, + "grad_norm": 4.310980796813965, + "learning_rate": 4.438652766639936e-06, + "logits/chosen": 9.137060165405273, + "logits/rejected": 6.654911518096924, + "logps/chosen": -194.30029296875, + "logps/rejected": -157.14157104492188, + "loss": 0.6856, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.23456516861915588, + "rewards/margins": 0.12567058205604553, + "rewards/rejected": 0.10889458656311035, + "step": 3900 + }, + { + "epoch": 0.6032862942199884, + "grad_norm": 5.997705459594727, + "learning_rate": 4.438366364990263e-06, + "logits/chosen": 8.250368118286133, + "logits/rejected": 7.210304260253906, + "logps/chosen": -305.0679931640625, + "logps/rejected": -318.2009582519531, + "loss": 0.6903, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.5197056531906128, + "rewards/margins": 0.07911047339439392, + "rewards/rejected": 0.4405951499938965, + "step": 3901 + }, + { + "epoch": 0.6034409433597525, + "grad_norm": 6.1376118659973145, + "learning_rate": 4.438079963340589e-06, + "logits/chosen": 16.63678550720215, + "logits/rejected": 9.755081176757812, + "logps/chosen": -324.0816345214844, + "logps/rejected": -280.4528503417969, + "loss": 0.6879, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.5481510162353516, + "rewards/margins": 0.07492218911647797, + "rewards/rejected": 0.4732288420200348, + "step": 3902 + }, + { + "epoch": 0.6035955924995168, + "grad_norm": 6.018391132354736, + "learning_rate": 4.437793561690915e-06, + "logits/chosen": 9.6676025390625, + "logits/rejected": 14.256958961486816, + "logps/chosen": -245.39459228515625, + "logps/rejected": -287.4531555175781, + "loss": 0.7739, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.5975342988967896, + "rewards/margins": -0.1116015687584877, + "rewards/rejected": 0.709135890007019, + "step": 3903 + }, + { + "epoch": 0.6037502416392809, + "grad_norm": 4.122574806213379, + "learning_rate": 4.437507160041242e-06, + "logits/chosen": 5.369261264801025, + "logits/rejected": 4.408687114715576, + "logps/chosen": -189.4131622314453, + "logps/rejected": -159.6771240234375, + "loss": 0.683, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.21424371004104614, + "rewards/margins": 0.1198115199804306, + "rewards/rejected": 0.09443218261003494, + "step": 3904 + }, + { + "epoch": 0.6039048907790451, + "grad_norm": 8.841023445129395, + "learning_rate": 4.437220758391569e-06, + "logits/chosen": 15.080881118774414, + "logits/rejected": 8.804738998413086, + "logps/chosen": -391.0445556640625, + "logps/rejected": -280.6028137207031, + "loss": 0.5485, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.4271688461303711, + "rewards/margins": 0.5063155293464661, + "rewards/rejected": -0.07914666831493378, + "step": 3905 + }, + { + "epoch": 0.6040595399188092, + "grad_norm": 19.14792251586914, + "learning_rate": 4.436934356741895e-06, + "logits/chosen": 14.502641677856445, + "logits/rejected": 9.034814834594727, + "logps/chosen": -298.1357421875, + "logps/rejected": -203.72283935546875, + "loss": 0.4943, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.6897628307342529, + "rewards/margins": 0.606182336807251, + "rewards/rejected": 0.08358044177293777, + "step": 3906 + }, + { + "epoch": 0.6042141890585734, + "grad_norm": 6.1135478019714355, + "learning_rate": 4.436647955092221e-06, + "logits/chosen": 12.373702049255371, + "logits/rejected": 8.945653915405273, + "logps/chosen": -284.17144775390625, + "logps/rejected": -290.6643981933594, + "loss": 0.6898, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.5265203714370728, + "rewards/margins": 0.24908261001110077, + "rewards/rejected": 0.2774377763271332, + "step": 3907 + }, + { + "epoch": 0.6043688381983375, + "grad_norm": 5.55449914932251, + "learning_rate": 4.436361553442548e-06, + "logits/chosen": 12.21667194366455, + "logits/rejected": 4.47822904586792, + "logps/chosen": -360.3435974121094, + "logps/rejected": -233.45675659179688, + "loss": 0.603, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.40338361263275146, + "rewards/margins": 0.3075413405895233, + "rewards/rejected": 0.09584227204322815, + "step": 3908 + }, + { + "epoch": 0.6045234873381017, + "grad_norm": 3.4196724891662598, + "learning_rate": 4.4360751517928744e-06, + "logits/chosen": 7.5966033935546875, + "logits/rejected": 6.637414932250977, + "logps/chosen": -205.63720703125, + "logps/rejected": -149.7642059326172, + "loss": 0.6023, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.5818638205528259, + "rewards/margins": 0.28300806879997253, + "rewards/rejected": 0.2988557815551758, + "step": 3909 + }, + { + "epoch": 0.6046781364778658, + "grad_norm": 5.212469100952148, + "learning_rate": 4.435788750143201e-06, + "logits/chosen": 6.841164588928223, + "logits/rejected": 9.152700424194336, + "logps/chosen": -285.6550598144531, + "logps/rejected": -394.9906005859375, + "loss": 0.6463, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.6218509078025818, + "rewards/margins": 0.28151923418045044, + "rewards/rejected": 0.34033164381980896, + "step": 3910 + }, + { + "epoch": 0.60483278561763, + "grad_norm": 5.6171488761901855, + "learning_rate": 4.435502348493528e-06, + "logits/chosen": 4.333796501159668, + "logits/rejected": 9.224414825439453, + "logps/chosen": -244.13455200195312, + "logps/rejected": -360.42962646484375, + "loss": 0.6907, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.5087676048278809, + "rewards/margins": 0.27077844738960266, + "rewards/rejected": 0.23798918724060059, + "step": 3911 + }, + { + "epoch": 0.6049874347573941, + "grad_norm": 7.398587226867676, + "learning_rate": 4.435215946843854e-06, + "logits/chosen": 8.538468360900879, + "logits/rejected": 7.4908246994018555, + "logps/chosen": -414.14251708984375, + "logps/rejected": -455.76507568359375, + "loss": 0.6647, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.648833155632019, + "rewards/margins": 0.1741776019334793, + "rewards/rejected": 0.4746555984020233, + "step": 3912 + }, + { + "epoch": 0.6051420838971583, + "grad_norm": 4.901476860046387, + "learning_rate": 4.434929545194181e-06, + "logits/chosen": 9.439541816711426, + "logits/rejected": 13.413737297058105, + "logps/chosen": -259.1940612792969, + "logps/rejected": -233.14608764648438, + "loss": 0.7191, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.40240737795829773, + "rewards/margins": 0.05190809816122055, + "rewards/rejected": 0.3504992723464966, + "step": 3913 + }, + { + "epoch": 0.6052967330369224, + "grad_norm": 5.9158549308776855, + "learning_rate": 4.434643143544507e-06, + "logits/chosen": 13.142208099365234, + "logits/rejected": 6.812465667724609, + "logps/chosen": -324.46209716796875, + "logps/rejected": -227.11952209472656, + "loss": 0.6264, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.6139547228813171, + "rewards/margins": 0.2808613181114197, + "rewards/rejected": 0.33309343457221985, + "step": 3914 + }, + { + "epoch": 0.6054513821766866, + "grad_norm": 4.910852432250977, + "learning_rate": 4.4343567418948335e-06, + "logits/chosen": 11.10114574432373, + "logits/rejected": 6.515744209289551, + "logps/chosen": -253.697021484375, + "logps/rejected": -200.24368286132812, + "loss": 0.4024, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.6055654287338257, + "rewards/margins": 1.026044249534607, + "rewards/rejected": -0.42047882080078125, + "step": 3915 + }, + { + "epoch": 0.6056060313164509, + "grad_norm": 7.040600299835205, + "learning_rate": 4.43407034024516e-06, + "logits/chosen": 8.591606140136719, + "logits/rejected": 10.577064514160156, + "logps/chosen": -318.95697021484375, + "logps/rejected": -353.3975830078125, + "loss": 0.6405, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.4841551184654236, + "rewards/margins": 0.2094191014766693, + "rewards/rejected": 0.2747359573841095, + "step": 3916 + }, + { + "epoch": 0.605760680456215, + "grad_norm": 4.258431911468506, + "learning_rate": 4.433783938595487e-06, + "logits/chosen": 5.574944496154785, + "logits/rejected": 5.272205352783203, + "logps/chosen": -185.17529296875, + "logps/rejected": -163.1497039794922, + "loss": 0.6985, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.21793369948863983, + "rewards/margins": 0.038532041013240814, + "rewards/rejected": 0.17940166592597961, + "step": 3917 + }, + { + "epoch": 0.6059153295959792, + "grad_norm": 6.661002159118652, + "learning_rate": 4.4334975369458135e-06, + "logits/chosen": 7.947875499725342, + "logits/rejected": -1.1607508659362793, + "logps/chosen": -285.3018798828125, + "logps/rejected": -155.67453002929688, + "loss": 0.672, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.025693196803331375, + "rewards/margins": 0.3243691921234131, + "rewards/rejected": -0.2986759841442108, + "step": 3918 + }, + { + "epoch": 0.6060699787357433, + "grad_norm": 5.284203052520752, + "learning_rate": 4.43321113529614e-06, + "logits/chosen": 4.179821968078613, + "logits/rejected": 3.223979949951172, + "logps/chosen": -223.80006408691406, + "logps/rejected": -257.2651672363281, + "loss": 0.6962, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.15460927784442902, + "rewards/margins": 0.0940190926194191, + "rewards/rejected": 0.060590144246816635, + "step": 3919 + }, + { + "epoch": 0.6062246278755075, + "grad_norm": 7.079946041107178, + "learning_rate": 4.432924733646466e-06, + "logits/chosen": 5.260807991027832, + "logits/rejected": 6.139631271362305, + "logps/chosen": -268.0679016113281, + "logps/rejected": -341.3180236816406, + "loss": 0.755, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.5222038626670837, + "rewards/margins": 0.04032406210899353, + "rewards/rejected": 0.4818797707557678, + "step": 3920 + }, + { + "epoch": 0.6063792770152716, + "grad_norm": 5.207577705383301, + "learning_rate": 4.4326383319967926e-06, + "logits/chosen": 6.2708964347839355, + "logits/rejected": 8.383788108825684, + "logps/chosen": -203.42987060546875, + "logps/rejected": -229.19252014160156, + "loss": 0.655, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.1914692521095276, + "rewards/margins": 0.1311703622341156, + "rewards/rejected": 0.06029890477657318, + "step": 3921 + }, + { + "epoch": 0.6065339261550358, + "grad_norm": 5.684727668762207, + "learning_rate": 4.432351930347119e-06, + "logits/chosen": 10.120211601257324, + "logits/rejected": 8.831645011901855, + "logps/chosen": -341.0797119140625, + "logps/rejected": -266.207763671875, + "loss": 0.8905, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.211262509226799, + "rewards/margins": -0.19925397634506226, + "rewards/rejected": 0.41051650047302246, + "step": 3922 + }, + { + "epoch": 0.6066885752947999, + "grad_norm": 8.976771354675293, + "learning_rate": 4.432065528697446e-06, + "logits/chosen": 8.938323020935059, + "logits/rejected": 9.13524341583252, + "logps/chosen": -361.41998291015625, + "logps/rejected": -339.8784484863281, + "loss": 0.6839, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.15553255379199982, + "rewards/margins": 0.11367003619670868, + "rewards/rejected": 0.04186253994703293, + "step": 3923 + }, + { + "epoch": 0.6068432244345641, + "grad_norm": 7.550969123840332, + "learning_rate": 4.4317791270477725e-06, + "logits/chosen": 4.504551410675049, + "logits/rejected": -1.2373692989349365, + "logps/chosen": -243.65744018554688, + "logps/rejected": -222.73126220703125, + "loss": 0.7045, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.2893436551094055, + "rewards/margins": 0.07997182011604309, + "rewards/rejected": 0.20937182009220123, + "step": 3924 + }, + { + "epoch": 0.6069978735743282, + "grad_norm": 4.785045623779297, + "learning_rate": 4.431492725398099e-06, + "logits/chosen": 7.851067066192627, + "logits/rejected": 7.215798377990723, + "logps/chosen": -259.1886291503906, + "logps/rejected": -277.53680419921875, + "loss": 0.6859, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.10007940232753754, + "rewards/margins": 0.09074074029922485, + "rewards/rejected": 0.009338676929473877, + "step": 3925 + }, + { + "epoch": 0.6071525227140924, + "grad_norm": 18.191181182861328, + "learning_rate": 4.431206323748426e-06, + "logits/chosen": 3.268805742263794, + "logits/rejected": 4.472982883453369, + "logps/chosen": -178.28390502929688, + "logps/rejected": -186.53646850585938, + "loss": 0.702, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.36703360080718994, + "rewards/margins": 0.03177686780691147, + "rewards/rejected": 0.33525675535202026, + "step": 3926 + }, + { + "epoch": 0.6073071718538565, + "grad_norm": 4.964666843414307, + "learning_rate": 4.430919922098752e-06, + "logits/chosen": 8.175495147705078, + "logits/rejected": 4.364938735961914, + "logps/chosen": -330.52374267578125, + "logps/rejected": -248.77520751953125, + "loss": 0.603, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.7188816666603088, + "rewards/margins": 0.4681444764137268, + "rewards/rejected": 0.25073719024658203, + "step": 3927 + }, + { + "epoch": 0.6074618209936207, + "grad_norm": 4.099869728088379, + "learning_rate": 4.430633520449078e-06, + "logits/chosen": 5.738530158996582, + "logits/rejected": 13.697628021240234, + "logps/chosen": -165.16888427734375, + "logps/rejected": -215.4642791748047, + "loss": 0.7018, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.287031888961792, + "rewards/margins": 0.2301531583070755, + "rewards/rejected": 0.056878771632909775, + "step": 3928 + }, + { + "epoch": 0.6076164701333849, + "grad_norm": 6.254826545715332, + "learning_rate": 4.430347118799405e-06, + "logits/chosen": 9.18960952758789, + "logits/rejected": 5.901559829711914, + "logps/chosen": -266.15423583984375, + "logps/rejected": -244.1021728515625, + "loss": 0.7581, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.4968794584274292, + "rewards/margins": -0.09724986553192139, + "rewards/rejected": 0.5941293239593506, + "step": 3929 + }, + { + "epoch": 0.6077711192731491, + "grad_norm": 6.2791643142700195, + "learning_rate": 4.4300607171497316e-06, + "logits/chosen": 10.656717300415039, + "logits/rejected": 12.716848373413086, + "logps/chosen": -231.64620971679688, + "logps/rejected": -269.5526123046875, + "loss": 0.7628, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.1951417475938797, + "rewards/margins": 0.0024026334285736084, + "rewards/rejected": 0.1927391141653061, + "step": 3930 + }, + { + "epoch": 0.6079257684129132, + "grad_norm": 4.102294921875, + "learning_rate": 4.429774315500057e-06, + "logits/chosen": 6.861971855163574, + "logits/rejected": 7.396608829498291, + "logps/chosen": -182.29127502441406, + "logps/rejected": -215.16659545898438, + "loss": 0.6351, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.11349441111087799, + "rewards/margins": 0.26495811343193054, + "rewards/rejected": -0.15146368741989136, + "step": 3931 + }, + { + "epoch": 0.6080804175526774, + "grad_norm": 6.228318214416504, + "learning_rate": 4.429487913850384e-06, + "logits/chosen": 11.26646614074707, + "logits/rejected": 9.691991806030273, + "logps/chosen": -391.5594177246094, + "logps/rejected": -313.89288330078125, + "loss": 0.6799, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.537706196308136, + "rewards/margins": 0.08447578549385071, + "rewards/rejected": 0.4532304108142853, + "step": 3932 + }, + { + "epoch": 0.6082350666924415, + "grad_norm": 5.476931095123291, + "learning_rate": 4.429201512200711e-06, + "logits/chosen": 3.3181259632110596, + "logits/rejected": 4.004659175872803, + "logps/chosen": -319.52484130859375, + "logps/rejected": -269.1839294433594, + "loss": 0.6531, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.41651710867881775, + "rewards/margins": 0.22092759609222412, + "rewards/rejected": 0.19558948278427124, + "step": 3933 + }, + { + "epoch": 0.6083897158322057, + "grad_norm": 6.260740756988525, + "learning_rate": 4.428915110551037e-06, + "logits/chosen": 8.654623985290527, + "logits/rejected": 5.765504360198975, + "logps/chosen": -345.4053955078125, + "logps/rejected": -248.0701141357422, + "loss": 0.6519, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.43752652406692505, + "rewards/margins": 0.2794763743877411, + "rewards/rejected": 0.15805016458034515, + "step": 3934 + }, + { + "epoch": 0.6085443649719698, + "grad_norm": 6.456355571746826, + "learning_rate": 4.428628708901363e-06, + "logits/chosen": 7.498924732208252, + "logits/rejected": 6.414624214172363, + "logps/chosen": -299.5986022949219, + "logps/rejected": -290.5528869628906, + "loss": 0.8254, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.030011199414730072, + "rewards/margins": -0.11222369968891144, + "rewards/rejected": 0.1422349065542221, + "step": 3935 + }, + { + "epoch": 0.608699014111734, + "grad_norm": 4.903226852416992, + "learning_rate": 4.42834230725169e-06, + "logits/chosen": 2.0587334632873535, + "logits/rejected": 5.142673969268799, + "logps/chosen": -212.19400024414062, + "logps/rejected": -155.07481384277344, + "loss": 0.7332, + "rewards/accuracies": 0.25, + "rewards/chosen": 0.18088267743587494, + "rewards/margins": -0.03783514350652695, + "rewards/rejected": 0.2187178134918213, + "step": 3936 + }, + { + "epoch": 0.6088536632514981, + "grad_norm": 4.980144023895264, + "learning_rate": 4.428055905602016e-06, + "logits/chosen": 13.498974800109863, + "logits/rejected": 12.166093826293945, + "logps/chosen": -306.012939453125, + "logps/rejected": -347.42620849609375, + "loss": 0.6097, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.2839524447917938, + "rewards/margins": 0.23469415307044983, + "rewards/rejected": 0.0492582842707634, + "step": 3937 + }, + { + "epoch": 0.6090083123912623, + "grad_norm": 4.113332271575928, + "learning_rate": 4.427769503952343e-06, + "logits/chosen": 5.48469877243042, + "logits/rejected": 0.37880033254623413, + "logps/chosen": -242.65028381347656, + "logps/rejected": -152.6143798828125, + "loss": 0.473, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.41076648235321045, + "rewards/margins": 0.5718305110931396, + "rewards/rejected": -0.1610640287399292, + "step": 3938 + }, + { + "epoch": 0.6091629615310264, + "grad_norm": 4.880515098571777, + "learning_rate": 4.42748310230267e-06, + "logits/chosen": 6.751075744628906, + "logits/rejected": 8.638193130493164, + "logps/chosen": -166.15328979492188, + "logps/rejected": -230.50582885742188, + "loss": 0.6765, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.33478519320487976, + "rewards/margins": 0.0900709256529808, + "rewards/rejected": -0.42485612630844116, + "step": 3939 + }, + { + "epoch": 0.6093176106707906, + "grad_norm": 4.448638916015625, + "learning_rate": 4.4271967006529955e-06, + "logits/chosen": 7.269002914428711, + "logits/rejected": 8.951229095458984, + "logps/chosen": -194.74359130859375, + "logps/rejected": -235.1671600341797, + "loss": 0.6945, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.15649251639842987, + "rewards/margins": 0.09792394191026688, + "rewards/rejected": 0.05856857821345329, + "step": 3940 + }, + { + "epoch": 0.6094722598105548, + "grad_norm": 6.018690586090088, + "learning_rate": 4.426910299003322e-06, + "logits/chosen": 13.833003044128418, + "logits/rejected": 8.66430950164795, + "logps/chosen": -237.7458038330078, + "logps/rejected": -161.79444885253906, + "loss": 0.7577, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.021770503371953964, + "rewards/margins": 0.15440839529037476, + "rewards/rejected": -0.1326378732919693, + "step": 3941 + }, + { + "epoch": 0.609626908950319, + "grad_norm": 9.304879188537598, + "learning_rate": 4.426623897353649e-06, + "logits/chosen": 6.3647918701171875, + "logits/rejected": 1.8037711381912231, + "logps/chosen": -353.09991455078125, + "logps/rejected": -228.76918029785156, + "loss": 0.6942, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.4195597767829895, + "rewards/margins": 0.32777053117752075, + "rewards/rejected": 0.09178925305604935, + "step": 3942 + }, + { + "epoch": 0.6097815580900832, + "grad_norm": 5.764718532562256, + "learning_rate": 4.4263374957039755e-06, + "logits/chosen": 9.352285385131836, + "logits/rejected": 8.429758071899414, + "logps/chosen": -254.9012908935547, + "logps/rejected": -251.05540466308594, + "loss": 0.6218, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.24047036468982697, + "rewards/margins": 0.21459397673606873, + "rewards/rejected": 0.025876376777887344, + "step": 3943 + }, + { + "epoch": 0.6099362072298473, + "grad_norm": 2.95068621635437, + "learning_rate": 4.426051094054302e-06, + "logits/chosen": 9.840718269348145, + "logits/rejected": 3.394331455230713, + "logps/chosen": -453.9388427734375, + "logps/rejected": -185.21783447265625, + "loss": 0.409, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.9666721820831299, + "rewards/margins": 0.9936515092849731, + "rewards/rejected": -0.02697935700416565, + "step": 3944 + }, + { + "epoch": 0.6100908563696115, + "grad_norm": 3.6018004417419434, + "learning_rate": 4.425764692404629e-06, + "logits/chosen": 11.385409355163574, + "logits/rejected": 2.0178732872009277, + "logps/chosen": -191.98577880859375, + "logps/rejected": -98.89240264892578, + "loss": 0.538, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.47789880633354187, + "rewards/margins": 0.44668370485305786, + "rewards/rejected": 0.031215094029903412, + "step": 3945 + }, + { + "epoch": 0.6102455055093756, + "grad_norm": 5.9750566482543945, + "learning_rate": 4.4254782907549554e-06, + "logits/chosen": 8.962934494018555, + "logits/rejected": 9.986520767211914, + "logps/chosen": -217.88967895507812, + "logps/rejected": -298.3156433105469, + "loss": 0.7551, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.1696626991033554, + "rewards/margins": -0.06377942115068436, + "rewards/rejected": 0.23344211280345917, + "step": 3946 + }, + { + "epoch": 0.6104001546491398, + "grad_norm": 4.730971336364746, + "learning_rate": 4.425191889105281e-06, + "logits/chosen": 10.342905044555664, + "logits/rejected": 12.241241455078125, + "logps/chosen": -247.43345642089844, + "logps/rejected": -272.8958740234375, + "loss": 0.6278, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.7114505171775818, + "rewards/margins": 0.34041827917099, + "rewards/rejected": 0.371032178401947, + "step": 3947 + }, + { + "epoch": 0.6105548037889039, + "grad_norm": 5.059885501861572, + "learning_rate": 4.424905487455608e-06, + "logits/chosen": 11.473493576049805, + "logits/rejected": 14.256406784057617, + "logps/chosen": -295.53167724609375, + "logps/rejected": -322.2238464355469, + "loss": 0.6668, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.09232574701309204, + "rewards/margins": 0.2279464304447174, + "rewards/rejected": -0.13562071323394775, + "step": 3948 + }, + { + "epoch": 0.6107094529286681, + "grad_norm": 4.253401756286621, + "learning_rate": 4.4246190858059345e-06, + "logits/chosen": 12.653377532958984, + "logits/rejected": 7.631697177886963, + "logps/chosen": -273.0514831542969, + "logps/rejected": -208.06675720214844, + "loss": 0.6663, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.3519936501979828, + "rewards/margins": 0.13607236742973328, + "rewards/rejected": 0.2159213125705719, + "step": 3949 + }, + { + "epoch": 0.6108641020684322, + "grad_norm": 5.537108421325684, + "learning_rate": 4.424332684156261e-06, + "logits/chosen": 10.17874526977539, + "logits/rejected": 11.40488052368164, + "logps/chosen": -213.402587890625, + "logps/rejected": -304.5474853515625, + "loss": 0.726, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.41043418645858765, + "rewards/margins": 0.011886127293109894, + "rewards/rejected": 0.39854809641838074, + "step": 3950 + }, + { + "epoch": 0.6110187512081964, + "grad_norm": 4.392712593078613, + "learning_rate": 4.424046282506588e-06, + "logits/chosen": 9.43829345703125, + "logits/rejected": 5.8715972900390625, + "logps/chosen": -247.12969970703125, + "logps/rejected": -268.78643798828125, + "loss": 0.478, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.4751456379890442, + "rewards/margins": 0.6031975746154785, + "rewards/rejected": -0.1280519962310791, + "step": 3951 + }, + { + "epoch": 0.6111734003479605, + "grad_norm": 5.71186637878418, + "learning_rate": 4.4237598808569145e-06, + "logits/chosen": 9.687740325927734, + "logits/rejected": 14.106892585754395, + "logps/chosen": -191.24928283691406, + "logps/rejected": -226.9354248046875, + "loss": 0.6278, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.3443165421485901, + "rewards/margins": 0.2833443284034729, + "rewards/rejected": 0.06097222864627838, + "step": 3952 + }, + { + "epoch": 0.6113280494877247, + "grad_norm": 4.307219505310059, + "learning_rate": 4.42347347920724e-06, + "logits/chosen": 8.427547454833984, + "logits/rejected": 5.390182018280029, + "logps/chosen": -411.5386657714844, + "logps/rejected": -280.1956787109375, + "loss": 0.4709, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.7386701107025146, + "rewards/margins": 0.7951598167419434, + "rewards/rejected": -0.056489646434783936, + "step": 3953 + }, + { + "epoch": 0.6114826986274888, + "grad_norm": 5.931158542633057, + "learning_rate": 4.423187077557567e-06, + "logits/chosen": 8.853277206420898, + "logits/rejected": 3.7948968410491943, + "logps/chosen": -252.56930541992188, + "logps/rejected": -229.06695556640625, + "loss": 0.8123, + "rewards/accuracies": 0.25, + "rewards/chosen": 0.15185098350048065, + "rewards/margins": -0.1505226045846939, + "rewards/rejected": 0.30237358808517456, + "step": 3954 + }, + { + "epoch": 0.6116373477672531, + "grad_norm": 5.277589321136475, + "learning_rate": 4.422900675907894e-06, + "logits/chosen": 10.294349670410156, + "logits/rejected": 7.776795864105225, + "logps/chosen": -240.8458251953125, + "logps/rejected": -204.66903686523438, + "loss": 0.6645, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.3079814314842224, + "rewards/margins": 0.18570742011070251, + "rewards/rejected": 0.1222740188241005, + "step": 3955 + }, + { + "epoch": 0.6117919969070172, + "grad_norm": 16.515262603759766, + "learning_rate": 4.42261427425822e-06, + "logits/chosen": 6.147159099578857, + "logits/rejected": 9.732583045959473, + "logps/chosen": -301.515869140625, + "logps/rejected": -360.2158203125, + "loss": 0.8199, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.07122736424207687, + "rewards/margins": -0.1844414919614792, + "rewards/rejected": 0.11321412771940231, + "step": 3956 + }, + { + "epoch": 0.6119466460467814, + "grad_norm": 5.09522819519043, + "learning_rate": 4.422327872608547e-06, + "logits/chosen": 6.902000427246094, + "logits/rejected": 6.220500946044922, + "logps/chosen": -207.66470336914062, + "logps/rejected": -157.48452758789062, + "loss": 0.7978, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.28255677223205566, + "rewards/margins": -0.09610196948051453, + "rewards/rejected": 0.3786587119102478, + "step": 3957 + }, + { + "epoch": 0.6121012951865455, + "grad_norm": 6.843777179718018, + "learning_rate": 4.4220414709588735e-06, + "logits/chosen": 8.429061889648438, + "logits/rejected": 11.85678482055664, + "logps/chosen": -159.63804626464844, + "logps/rejected": -244.06492614746094, + "loss": 0.7833, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.15043459832668304, + "rewards/margins": -0.03470522165298462, + "rewards/rejected": 0.18513980507850647, + "step": 3958 + }, + { + "epoch": 0.6122559443263097, + "grad_norm": 4.550302982330322, + "learning_rate": 4.4217550693092e-06, + "logits/chosen": 10.892744064331055, + "logits/rejected": 9.70377254486084, + "logps/chosen": -207.5938720703125, + "logps/rejected": -183.10531616210938, + "loss": 0.6456, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.10899969935417175, + "rewards/margins": 0.13289988040924072, + "rewards/rejected": -0.02390018105506897, + "step": 3959 + }, + { + "epoch": 0.6124105934660738, + "grad_norm": 9.154976844787598, + "learning_rate": 4.421468667659526e-06, + "logits/chosen": 11.393348693847656, + "logits/rejected": 11.878911972045898, + "logps/chosen": -281.6402282714844, + "logps/rejected": -290.2956237792969, + "loss": 0.5833, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.4247601628303528, + "rewards/margins": 0.3584325313568115, + "rewards/rejected": 0.06632763147354126, + "step": 3960 + }, + { + "epoch": 0.612565242605838, + "grad_norm": 3.601804494857788, + "learning_rate": 4.421182266009853e-06, + "logits/chosen": 12.146629333496094, + "logits/rejected": 10.923734664916992, + "logps/chosen": -258.5301513671875, + "logps/rejected": -193.32644653320312, + "loss": 0.5444, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.8748998641967773, + "rewards/margins": 0.47861260175704956, + "rewards/rejected": 0.396287202835083, + "step": 3961 + }, + { + "epoch": 0.6127198917456022, + "grad_norm": 6.186570644378662, + "learning_rate": 4.420895864360179e-06, + "logits/chosen": 9.150975227355957, + "logits/rejected": 6.546099662780762, + "logps/chosen": -353.0328063964844, + "logps/rejected": -217.9554443359375, + "loss": 0.7871, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.33110055327415466, + "rewards/margins": 0.06069934368133545, + "rewards/rejected": 0.2704011797904968, + "step": 3962 + }, + { + "epoch": 0.6128745408853663, + "grad_norm": 4.830526828765869, + "learning_rate": 4.420609462710506e-06, + "logits/chosen": 9.601766586303711, + "logits/rejected": 5.484531879425049, + "logps/chosen": -232.62954711914062, + "logps/rejected": -177.02243041992188, + "loss": 0.6411, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.4963589012622833, + "rewards/margins": 0.19993120431900024, + "rewards/rejected": 0.2964276671409607, + "step": 3963 + }, + { + "epoch": 0.6130291900251305, + "grad_norm": 3.7554080486297607, + "learning_rate": 4.420323061060833e-06, + "logits/chosen": 10.423748970031738, + "logits/rejected": 10.842555046081543, + "logps/chosen": -141.82188415527344, + "logps/rejected": -170.78945922851562, + "loss": 0.5048, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.26879173517227173, + "rewards/margins": 0.6932423710823059, + "rewards/rejected": -0.4244506359100342, + "step": 3964 + }, + { + "epoch": 0.6131838391648946, + "grad_norm": 5.461180210113525, + "learning_rate": 4.420036659411158e-06, + "logits/chosen": 7.354410171508789, + "logits/rejected": 12.26761245727539, + "logps/chosen": -189.40394592285156, + "logps/rejected": -246.69393920898438, + "loss": 0.7015, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.3383534550666809, + "rewards/margins": 0.03129100054502487, + "rewards/rejected": 0.30706244707107544, + "step": 3965 + }, + { + "epoch": 0.6133384883046588, + "grad_norm": 5.0663323402404785, + "learning_rate": 4.419750257761485e-06, + "logits/chosen": 12.438953399658203, + "logits/rejected": 7.917116165161133, + "logps/chosen": -244.00332641601562, + "logps/rejected": -163.62026977539062, + "loss": 0.8037, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.03278389573097229, + "rewards/margins": -0.03631211817264557, + "rewards/rejected": 0.003528214991092682, + "step": 3966 + }, + { + "epoch": 0.613493137444423, + "grad_norm": 9.220536231994629, + "learning_rate": 4.419463856111812e-06, + "logits/chosen": 10.85869312286377, + "logits/rejected": 8.307962417602539, + "logps/chosen": -281.2154541015625, + "logps/rejected": -268.24786376953125, + "loss": 0.5865, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.6907461881637573, + "rewards/margins": 0.2936241328716278, + "rewards/rejected": 0.3971221148967743, + "step": 3967 + }, + { + "epoch": 0.6136477865841872, + "grad_norm": 6.323410511016846, + "learning_rate": 4.419177454462138e-06, + "logits/chosen": 10.972429275512695, + "logits/rejected": 6.477114677429199, + "logps/chosen": -304.065185546875, + "logps/rejected": -249.53591918945312, + "loss": 0.7205, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.4116255044937134, + "rewards/margins": 0.025416608899831772, + "rewards/rejected": 0.3862088918685913, + "step": 3968 + }, + { + "epoch": 0.6138024357239513, + "grad_norm": 6.181182861328125, + "learning_rate": 4.418891052812464e-06, + "logits/chosen": 4.934808731079102, + "logits/rejected": 6.155580520629883, + "logps/chosen": -189.90966796875, + "logps/rejected": -306.04119873046875, + "loss": 0.6913, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.36994311213493347, + "rewards/margins": 0.058012351393699646, + "rewards/rejected": 0.31193074584007263, + "step": 3969 + }, + { + "epoch": 0.6139570848637155, + "grad_norm": 6.408806324005127, + "learning_rate": 4.418604651162791e-06, + "logits/chosen": 10.767641067504883, + "logits/rejected": 5.702798843383789, + "logps/chosen": -313.1101379394531, + "logps/rejected": -293.96978759765625, + "loss": 0.6186, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.5324552655220032, + "rewards/margins": 0.28967976570129395, + "rewards/rejected": 0.2427755445241928, + "step": 3970 + }, + { + "epoch": 0.6141117340034796, + "grad_norm": 6.188937664031982, + "learning_rate": 4.4183182495131175e-06, + "logits/chosen": 5.66517448425293, + "logits/rejected": 2.293285846710205, + "logps/chosen": -253.6072998046875, + "logps/rejected": -204.28594970703125, + "loss": 0.6067, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.5332667231559753, + "rewards/margins": 0.281089723110199, + "rewards/rejected": 0.25217705965042114, + "step": 3971 + }, + { + "epoch": 0.6142663831432438, + "grad_norm": 6.798548221588135, + "learning_rate": 4.418031847863444e-06, + "logits/chosen": 10.6843900680542, + "logits/rejected": 8.478568077087402, + "logps/chosen": -194.47265625, + "logps/rejected": -195.5419921875, + "loss": 0.8092, + "rewards/accuracies": 0.125, + "rewards/chosen": 0.19158917665481567, + "rewards/margins": -0.20718078315258026, + "rewards/rejected": 0.39876994490623474, + "step": 3972 + }, + { + "epoch": 0.6144210322830079, + "grad_norm": 5.045307636260986, + "learning_rate": 4.41774544621377e-06, + "logits/chosen": 10.354766845703125, + "logits/rejected": 5.917832851409912, + "logps/chosen": -427.99493408203125, + "logps/rejected": -338.61279296875, + "loss": 0.6111, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.8421720266342163, + "rewards/margins": 0.27371490001678467, + "rewards/rejected": 0.5684571266174316, + "step": 3973 + }, + { + "epoch": 0.6145756814227721, + "grad_norm": 4.441810607910156, + "learning_rate": 4.4174590445640966e-06, + "logits/chosen": 11.248181343078613, + "logits/rejected": 9.445834159851074, + "logps/chosen": -250.52337646484375, + "logps/rejected": -175.71202087402344, + "loss": 0.6, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.6608389616012573, + "rewards/margins": 0.2901197075843811, + "rewards/rejected": 0.3707192540168762, + "step": 3974 + }, + { + "epoch": 0.6147303305625362, + "grad_norm": 6.860800266265869, + "learning_rate": 4.417172642914423e-06, + "logits/chosen": 8.25340461730957, + "logits/rejected": 5.902225971221924, + "logps/chosen": -324.9285888671875, + "logps/rejected": -260.1461486816406, + "loss": 0.7593, + "rewards/accuracies": 0.125, + "rewards/chosen": 0.4724811613559723, + "rewards/margins": 0.021984606981277466, + "rewards/rejected": 0.4504966139793396, + "step": 3975 + }, + { + "epoch": 0.6148849797023004, + "grad_norm": 5.341482162475586, + "learning_rate": 4.41688624126475e-06, + "logits/chosen": 14.92526626586914, + "logits/rejected": 9.183149337768555, + "logps/chosen": -346.3665771484375, + "logps/rejected": -261.3407287597656, + "loss": 0.4572, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.8901488780975342, + "rewards/margins": 0.8458946347236633, + "rewards/rejected": 0.04425421357154846, + "step": 3976 + }, + { + "epoch": 0.6150396288420645, + "grad_norm": 7.1989665031433105, + "learning_rate": 4.4165998396150765e-06, + "logits/chosen": 12.811212539672852, + "logits/rejected": 10.94886302947998, + "logps/chosen": -499.0943298339844, + "logps/rejected": -391.61492919921875, + "loss": 0.8161, + "rewards/accuracies": 0.25, + "rewards/chosen": 0.40263786911964417, + "rewards/margins": -0.16286352276802063, + "rewards/rejected": 0.5655014514923096, + "step": 3977 + }, + { + "epoch": 0.6151942779818287, + "grad_norm": 6.798708915710449, + "learning_rate": 4.416313437965403e-06, + "logits/chosen": 8.418795585632324, + "logits/rejected": 7.710784912109375, + "logps/chosen": -363.533203125, + "logps/rejected": -294.5526123046875, + "loss": 0.6379, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.5380655527114868, + "rewards/margins": 0.17283202707767487, + "rewards/rejected": 0.36523354053497314, + "step": 3978 + }, + { + "epoch": 0.6153489271215928, + "grad_norm": 3.110316514968872, + "learning_rate": 4.416027036315729e-06, + "logits/chosen": 10.87832260131836, + "logits/rejected": 5.233730316162109, + "logps/chosen": -192.003662109375, + "logps/rejected": -155.4304656982422, + "loss": 0.5194, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.30637475848197937, + "rewards/margins": 0.49352115392684937, + "rewards/rejected": -0.18714639544487, + "step": 3979 + }, + { + "epoch": 0.6155035762613571, + "grad_norm": 12.283734321594238, + "learning_rate": 4.415740634666056e-06, + "logits/chosen": 16.186311721801758, + "logits/rejected": 6.734541416168213, + "logps/chosen": -378.1627197265625, + "logps/rejected": -228.70108032226562, + "loss": 0.5701, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.8108668327331543, + "rewards/margins": 0.45011216402053833, + "rewards/rejected": 0.36075466871261597, + "step": 3980 + }, + { + "epoch": 0.6156582254011213, + "grad_norm": 5.618738174438477, + "learning_rate": 4.415454233016382e-06, + "logits/chosen": 10.618489265441895, + "logits/rejected": 8.664163589477539, + "logps/chosen": -302.9654541015625, + "logps/rejected": -249.79965209960938, + "loss": 0.5684, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.6317172646522522, + "rewards/margins": 0.4660932421684265, + "rewards/rejected": 0.16562394797801971, + "step": 3981 + }, + { + "epoch": 0.6158128745408854, + "grad_norm": 6.898849010467529, + "learning_rate": 4.415167831366709e-06, + "logits/chosen": 15.606817245483398, + "logits/rejected": 5.793133735656738, + "logps/chosen": -518.538330078125, + "logps/rejected": -346.8183898925781, + "loss": 0.8555, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.20846271514892578, + "rewards/margins": -0.21831157803535461, + "rewards/rejected": 0.4267743229866028, + "step": 3982 + }, + { + "epoch": 0.6159675236806496, + "grad_norm": 5.192005634307861, + "learning_rate": 4.4148814297170356e-06, + "logits/chosen": 6.179653644561768, + "logits/rejected": 1.9027841091156006, + "logps/chosen": -191.0103759765625, + "logps/rejected": -96.84380340576172, + "loss": 0.6513, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.11486382782459259, + "rewards/margins": 0.19265887141227722, + "rewards/rejected": -0.07779502868652344, + "step": 3983 + }, + { + "epoch": 0.6161221728204137, + "grad_norm": 5.486961364746094, + "learning_rate": 4.414595028067362e-06, + "logits/chosen": 11.87032699584961, + "logits/rejected": 12.852231979370117, + "logps/chosen": -181.870849609375, + "logps/rejected": -230.91493225097656, + "loss": 0.7214, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.04374241083860397, + "rewards/margins": 0.017684705555438995, + "rewards/rejected": -0.06142710894346237, + "step": 3984 + }, + { + "epoch": 0.6162768219601779, + "grad_norm": 6.306859970092773, + "learning_rate": 4.414308626417689e-06, + "logits/chosen": 12.358658790588379, + "logits/rejected": 9.750887870788574, + "logps/chosen": -325.25506591796875, + "logps/rejected": -227.91432189941406, + "loss": 0.7051, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.4135579764842987, + "rewards/margins": 0.16071553528308868, + "rewards/rejected": 0.2528424561023712, + "step": 3985 + }, + { + "epoch": 0.616431471099942, + "grad_norm": 5.853692531585693, + "learning_rate": 4.414022224768015e-06, + "logits/chosen": 10.070363998413086, + "logits/rejected": 9.573240280151367, + "logps/chosen": -213.6751708984375, + "logps/rejected": -283.18560791015625, + "loss": 0.7138, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.5747483968734741, + "rewards/margins": 0.16866031289100647, + "rewards/rejected": 0.40608811378479004, + "step": 3986 + }, + { + "epoch": 0.6165861202397062, + "grad_norm": 5.2673163414001465, + "learning_rate": 4.413735823118341e-06, + "logits/chosen": 9.505758285522461, + "logits/rejected": 4.42695426940918, + "logps/chosen": -299.49078369140625, + "logps/rejected": -288.18914794921875, + "loss": 0.5287, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.6796090006828308, + "rewards/margins": 0.44216328859329224, + "rewards/rejected": 0.23744575679302216, + "step": 3987 + }, + { + "epoch": 0.6167407693794703, + "grad_norm": 5.0639567375183105, + "learning_rate": 4.413449421468668e-06, + "logits/chosen": 15.185131072998047, + "logits/rejected": 13.980964660644531, + "logps/chosen": -265.08172607421875, + "logps/rejected": -262.53045654296875, + "loss": 0.6675, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.1539907455444336, + "rewards/margins": 0.1677388846874237, + "rewards/rejected": -0.0137481689453125, + "step": 3988 + }, + { + "epoch": 0.6168954185192345, + "grad_norm": 5.025876522064209, + "learning_rate": 4.413163019818995e-06, + "logits/chosen": 9.889678955078125, + "logits/rejected": 10.165555000305176, + "logps/chosen": -230.9527130126953, + "logps/rejected": -238.69235229492188, + "loss": 0.7475, + "rewards/accuracies": 0.25, + "rewards/chosen": 0.08680114895105362, + "rewards/margins": -0.0775972381234169, + "rewards/rejected": 0.1643984019756317, + "step": 3989 + }, + { + "epoch": 0.6170500676589986, + "grad_norm": 3.8666720390319824, + "learning_rate": 4.412876618169321e-06, + "logits/chosen": 12.497709274291992, + "logits/rejected": 5.2078657150268555, + "logps/chosen": -226.739990234375, + "logps/rejected": -129.91058349609375, + "loss": 0.5501, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.4635973274707794, + "rewards/margins": 0.48007628321647644, + "rewards/rejected": -0.016479015350341797, + "step": 3990 + }, + { + "epoch": 0.6172047167987628, + "grad_norm": 4.434582233428955, + "learning_rate": 4.412590216519648e-06, + "logits/chosen": 8.910432815551758, + "logits/rejected": 6.856439590454102, + "logps/chosen": -252.7999267578125, + "logps/rejected": -184.7388458251953, + "loss": 0.5696, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.6763100624084473, + "rewards/margins": 0.4074946343898773, + "rewards/rejected": 0.26881542801856995, + "step": 3991 + }, + { + "epoch": 0.6173593659385269, + "grad_norm": 8.017807006835938, + "learning_rate": 4.4123038148699746e-06, + "logits/chosen": 4.168961524963379, + "logits/rejected": 6.715488433837891, + "logps/chosen": -360.7528991699219, + "logps/rejected": -328.45361328125, + "loss": 0.8931, + "rewards/accuracies": 0.25, + "rewards/chosen": 0.27519139647483826, + "rewards/margins": -0.24730825424194336, + "rewards/rejected": 0.522499680519104, + "step": 3992 + }, + { + "epoch": 0.6175140150782912, + "grad_norm": 9.119096755981445, + "learning_rate": 4.4120174132203e-06, + "logits/chosen": 7.943830490112305, + "logits/rejected": -0.4127311706542969, + "logps/chosen": -330.6951599121094, + "logps/rejected": -325.27099609375, + "loss": 0.7636, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.3674657940864563, + "rewards/margins": -0.062451254576444626, + "rewards/rejected": 0.4299170672893524, + "step": 3993 + }, + { + "epoch": 0.6176686642180553, + "grad_norm": 6.174286365509033, + "learning_rate": 4.411731011570627e-06, + "logits/chosen": 6.646059989929199, + "logits/rejected": 5.974409103393555, + "logps/chosen": -303.280517578125, + "logps/rejected": -227.81649780273438, + "loss": 0.7623, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.20779690146446228, + "rewards/margins": -0.007047683000564575, + "rewards/rejected": 0.21484455466270447, + "step": 3994 + }, + { + "epoch": 0.6178233133578195, + "grad_norm": 6.199397087097168, + "learning_rate": 4.411444609920954e-06, + "logits/chosen": 9.876285552978516, + "logits/rejected": 8.515664100646973, + "logps/chosen": -356.68951416015625, + "logps/rejected": -344.5235290527344, + "loss": 0.7737, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.5074409246444702, + "rewards/margins": -0.11482200771570206, + "rewards/rejected": 0.6222629547119141, + "step": 3995 + }, + { + "epoch": 0.6179779624975836, + "grad_norm": 8.015007972717285, + "learning_rate": 4.41115820827128e-06, + "logits/chosen": 6.025833606719971, + "logits/rejected": 7.431828498840332, + "logps/chosen": -236.2277069091797, + "logps/rejected": -310.51568603515625, + "loss": 0.7423, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.003694772720336914, + "rewards/margins": -0.039074115455150604, + "rewards/rejected": 0.04276890680193901, + "step": 3996 + }, + { + "epoch": 0.6181326116373478, + "grad_norm": 5.4741411209106445, + "learning_rate": 4.410871806621607e-06, + "logits/chosen": 12.832688331604004, + "logits/rejected": 7.965878486633301, + "logps/chosen": -247.70278930664062, + "logps/rejected": -225.88583374023438, + "loss": 0.6795, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.3956608772277832, + "rewards/margins": 0.06082305312156677, + "rewards/rejected": 0.33483782410621643, + "step": 3997 + }, + { + "epoch": 0.6182872607771119, + "grad_norm": 4.838098526000977, + "learning_rate": 4.410585404971934e-06, + "logits/chosen": 15.044391632080078, + "logits/rejected": 8.95964527130127, + "logps/chosen": -299.5126647949219, + "logps/rejected": -208.17086791992188, + "loss": 0.5528, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.8709566593170166, + "rewards/margins": 0.38361066579818726, + "rewards/rejected": 0.48734599351882935, + "step": 3998 + }, + { + "epoch": 0.6184419099168761, + "grad_norm": 6.5764641761779785, + "learning_rate": 4.4102990033222594e-06, + "logits/chosen": 4.940681457519531, + "logits/rejected": 6.823337554931641, + "logps/chosen": -319.206298828125, + "logps/rejected": -324.8540954589844, + "loss": 0.7444, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.334145188331604, + "rewards/margins": -0.05645487457513809, + "rewards/rejected": 0.3906000852584839, + "step": 3999 + }, + { + "epoch": 0.6185965590566402, + "grad_norm": 5.284141540527344, + "learning_rate": 4.410012601672586e-06, + "logits/chosen": 10.129792213439941, + "logits/rejected": 5.309225559234619, + "logps/chosen": -277.09814453125, + "logps/rejected": -245.0251922607422, + "loss": 0.6838, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.35479944944381714, + "rewards/margins": 0.048466116189956665, + "rewards/rejected": 0.3063333332538605, + "step": 4000 + }, + { + "epoch": 0.6187512081964044, + "grad_norm": 6.510834693908691, + "learning_rate": 4.409726200022913e-06, + "logits/chosen": 10.96594524383545, + "logits/rejected": 10.082406997680664, + "logps/chosen": -377.9990539550781, + "logps/rejected": -290.2900390625, + "loss": 0.7381, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.47586309909820557, + "rewards/margins": 0.02584478259086609, + "rewards/rejected": 0.4500183165073395, + "step": 4001 + }, + { + "epoch": 0.6189058573361685, + "grad_norm": 5.866822242736816, + "learning_rate": 4.409439798373239e-06, + "logits/chosen": 10.911684036254883, + "logits/rejected": 9.771059036254883, + "logps/chosen": -249.86415100097656, + "logps/rejected": -252.82192993164062, + "loss": 0.8159, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.22084388136863708, + "rewards/margins": -0.16515156626701355, + "rewards/rejected": 0.38599544763565063, + "step": 4002 + }, + { + "epoch": 0.6190605064759327, + "grad_norm": 5.162505149841309, + "learning_rate": 4.409153396723565e-06, + "logits/chosen": 8.09380054473877, + "logits/rejected": 10.095977783203125, + "logps/chosen": -236.61373901367188, + "logps/rejected": -283.0645751953125, + "loss": 0.7665, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.6227982044219971, + "rewards/margins": -0.07890131324529648, + "rewards/rejected": 0.7016994953155518, + "step": 4003 + }, + { + "epoch": 0.6192151556156968, + "grad_norm": 6.186079978942871, + "learning_rate": 4.408866995073892e-06, + "logits/chosen": 5.812247276306152, + "logits/rejected": 5.927313327789307, + "logps/chosen": -295.0551452636719, + "logps/rejected": -255.51995849609375, + "loss": 0.846, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.24706745147705078, + "rewards/margins": -0.15385542809963226, + "rewards/rejected": 0.40092286467552185, + "step": 4004 + }, + { + "epoch": 0.619369804755461, + "grad_norm": 6.510254859924316, + "learning_rate": 4.4085805934242185e-06, + "logits/chosen": 8.036344528198242, + "logits/rejected": 10.093371391296387, + "logps/chosen": -309.7167053222656, + "logps/rejected": -311.7046813964844, + "loss": 0.6862, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.6454997658729553, + "rewards/margins": 0.07213295996189117, + "rewards/rejected": 0.5733668208122253, + "step": 4005 + }, + { + "epoch": 0.6195244538952253, + "grad_norm": 4.414552688598633, + "learning_rate": 4.408294191774545e-06, + "logits/chosen": 12.78913688659668, + "logits/rejected": 12.12380599975586, + "logps/chosen": -182.97825622558594, + "logps/rejected": -152.61581420898438, + "loss": 0.6497, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.2689127027988434, + "rewards/margins": 0.17303729057312012, + "rewards/rejected": 0.09587539732456207, + "step": 4006 + }, + { + "epoch": 0.6196791030349894, + "grad_norm": 4.505795955657959, + "learning_rate": 4.408007790124871e-06, + "logits/chosen": 10.55947208404541, + "logits/rejected": 8.339699745178223, + "logps/chosen": -320.95721435546875, + "logps/rejected": -276.74407958984375, + "loss": 0.586, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.5985777974128723, + "rewards/margins": 0.3434251844882965, + "rewards/rejected": 0.2551526427268982, + "step": 4007 + }, + { + "epoch": 0.6198337521747536, + "grad_norm": 6.069342613220215, + "learning_rate": 4.407721388475198e-06, + "logits/chosen": 7.596423149108887, + "logits/rejected": 6.370104789733887, + "logps/chosen": -249.0166778564453, + "logps/rejected": -264.9747009277344, + "loss": 0.665, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.4278978407382965, + "rewards/margins": 0.09891906380653381, + "rewards/rejected": 0.3289787769317627, + "step": 4008 + }, + { + "epoch": 0.6199884013145177, + "grad_norm": 4.244874000549316, + "learning_rate": 4.407434986825524e-06, + "logits/chosen": 10.794459342956543, + "logits/rejected": 5.339234828948975, + "logps/chosen": -305.938720703125, + "logps/rejected": -218.4661865234375, + "loss": 0.5758, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.529402494430542, + "rewards/margins": 0.36998555064201355, + "rewards/rejected": 0.15941688418388367, + "step": 4009 + }, + { + "epoch": 0.6201430504542819, + "grad_norm": 8.21546459197998, + "learning_rate": 4.407148585175851e-06, + "logits/chosen": 7.781423091888428, + "logits/rejected": 8.992557525634766, + "logps/chosen": -264.19403076171875, + "logps/rejected": -339.052490234375, + "loss": 0.8488, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.09604862332344055, + "rewards/margins": -0.21417336165905, + "rewards/rejected": 0.3102220296859741, + "step": 4010 + }, + { + "epoch": 0.620297699594046, + "grad_norm": 4.776425361633301, + "learning_rate": 4.4068621835261775e-06, + "logits/chosen": 12.80551528930664, + "logits/rejected": 8.52711296081543, + "logps/chosen": -364.6460266113281, + "logps/rejected": -257.2583312988281, + "loss": 0.5171, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.8015909194946289, + "rewards/margins": 0.5088747143745422, + "rewards/rejected": 0.2927161455154419, + "step": 4011 + }, + { + "epoch": 0.6204523487338102, + "grad_norm": 5.902646541595459, + "learning_rate": 4.406575781876503e-06, + "logits/chosen": 9.784926414489746, + "logits/rejected": 9.356770515441895, + "logps/chosen": -259.9383239746094, + "logps/rejected": -343.2843017578125, + "loss": 0.6867, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.3118656575679779, + "rewards/margins": 0.14351238310337067, + "rewards/rejected": 0.16835325956344604, + "step": 4012 + }, + { + "epoch": 0.6206069978735743, + "grad_norm": 4.823896408081055, + "learning_rate": 4.40628938022683e-06, + "logits/chosen": 14.486096382141113, + "logits/rejected": 10.90367603302002, + "logps/chosen": -234.39022827148438, + "logps/rejected": -193.61355590820312, + "loss": 0.5757, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.6393522620201111, + "rewards/margins": 0.27943816781044006, + "rewards/rejected": 0.35991406440734863, + "step": 4013 + }, + { + "epoch": 0.6207616470133385, + "grad_norm": 3.4552700519561768, + "learning_rate": 4.406002978577157e-06, + "logits/chosen": 13.32492733001709, + "logits/rejected": 6.451300621032715, + "logps/chosen": -245.94692993164062, + "logps/rejected": -171.83734130859375, + "loss": 0.4626, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.5272878408432007, + "rewards/margins": 0.6320104598999023, + "rewards/rejected": -0.10472260415554047, + "step": 4014 + }, + { + "epoch": 0.6209162961531026, + "grad_norm": 5.932234287261963, + "learning_rate": 4.405716576927483e-06, + "logits/chosen": 11.590948104858398, + "logits/rejected": 9.118955612182617, + "logps/chosen": -390.6847839355469, + "logps/rejected": -273.2651062011719, + "loss": 0.6525, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.32982784509658813, + "rewards/margins": 0.170156329870224, + "rewards/rejected": 0.15967151522636414, + "step": 4015 + }, + { + "epoch": 0.6210709452928668, + "grad_norm": 5.118629455566406, + "learning_rate": 4.40543017527781e-06, + "logits/chosen": 8.611971855163574, + "logits/rejected": 1.606679081916809, + "logps/chosen": -311.0571594238281, + "logps/rejected": -192.13607788085938, + "loss": 0.6249, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.2589518427848816, + "rewards/margins": 0.23573365807533264, + "rewards/rejected": 0.023218199610710144, + "step": 4016 + }, + { + "epoch": 0.6212255944326309, + "grad_norm": 7.144024848937988, + "learning_rate": 4.405143773628137e-06, + "logits/chosen": 8.36507797241211, + "logits/rejected": 7.581270217895508, + "logps/chosen": -235.99813842773438, + "logps/rejected": -251.02427673339844, + "loss": 0.8928, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.06704816222190857, + "rewards/margins": -0.21711713075637817, + "rewards/rejected": 0.28416526317596436, + "step": 4017 + }, + { + "epoch": 0.6213802435723951, + "grad_norm": 8.759623527526855, + "learning_rate": 4.404857371978463e-06, + "logits/chosen": 7.732784271240234, + "logits/rejected": 5.147335052490234, + "logps/chosen": -300.4581298828125, + "logps/rejected": -253.42013549804688, + "loss": 0.5774, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.5996932983398438, + "rewards/margins": 0.46005499362945557, + "rewards/rejected": 0.13963833451271057, + "step": 4018 + }, + { + "epoch": 0.6215348927121593, + "grad_norm": 5.850327491760254, + "learning_rate": 4.404570970328789e-06, + "logits/chosen": 7.610745429992676, + "logits/rejected": 7.347476959228516, + "logps/chosen": -195.70159912109375, + "logps/rejected": -199.7276153564453, + "loss": 0.7276, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.21239474415779114, + "rewards/margins": -0.008057206869125366, + "rewards/rejected": 0.2204519361257553, + "step": 4019 + }, + { + "epoch": 0.6216895418519235, + "grad_norm": 5.558006763458252, + "learning_rate": 4.404284568679116e-06, + "logits/chosen": 12.289063453674316, + "logits/rejected": 12.644683837890625, + "logps/chosen": -252.84823608398438, + "logps/rejected": -280.3616943359375, + "loss": 0.7419, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.2592141330242157, + "rewards/margins": -0.06418189406394958, + "rewards/rejected": 0.3233960270881653, + "step": 4020 + }, + { + "epoch": 0.6218441909916876, + "grad_norm": 5.637094497680664, + "learning_rate": 4.403998167029442e-06, + "logits/chosen": 12.32789421081543, + "logits/rejected": 9.94038200378418, + "logps/chosen": -253.4571533203125, + "logps/rejected": -238.50840759277344, + "loss": 0.8034, + "rewards/accuracies": 0.25, + "rewards/chosen": 0.5833390355110168, + "rewards/margins": -0.07631255686283112, + "rewards/rejected": 0.6596515774726868, + "step": 4021 + }, + { + "epoch": 0.6219988401314518, + "grad_norm": 6.444872856140137, + "learning_rate": 4.403711765379769e-06, + "logits/chosen": 5.635624408721924, + "logits/rejected": 3.9463272094726562, + "logps/chosen": -175.76524353027344, + "logps/rejected": -252.73910522460938, + "loss": 0.6852, + "rewards/accuracies": 0.25, + "rewards/chosen": 0.32783323526382446, + "rewards/margins": 0.0339665561914444, + "rewards/rejected": 0.29386669397354126, + "step": 4022 + }, + { + "epoch": 0.6221534892712159, + "grad_norm": 7.920750617980957, + "learning_rate": 4.403425363730096e-06, + "logits/chosen": 9.29922866821289, + "logits/rejected": 10.711066246032715, + "logps/chosen": -282.9151916503906, + "logps/rejected": -272.07513427734375, + "loss": 0.8038, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.15416660904884338, + "rewards/margins": -0.1464461386203766, + "rewards/rejected": 0.30061274766921997, + "step": 4023 + }, + { + "epoch": 0.6223081384109801, + "grad_norm": 5.705741882324219, + "learning_rate": 4.403138962080422e-06, + "logits/chosen": 6.605433464050293, + "logits/rejected": 9.549003601074219, + "logps/chosen": -237.51446533203125, + "logps/rejected": -239.06573486328125, + "loss": 0.8658, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.04292944818735123, + "rewards/margins": -0.2676040232181549, + "rewards/rejected": 0.3105334937572479, + "step": 4024 + }, + { + "epoch": 0.6224627875507442, + "grad_norm": 4.184696674346924, + "learning_rate": 4.402852560430749e-06, + "logits/chosen": 7.7875776290893555, + "logits/rejected": 3.6724977493286133, + "logps/chosen": -357.46630859375, + "logps/rejected": -218.472412109375, + "loss": 0.6519, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.5142974257469177, + "rewards/margins": 0.2307938039302826, + "rewards/rejected": 0.2835036516189575, + "step": 4025 + }, + { + "epoch": 0.6226174366905084, + "grad_norm": 4.74005651473999, + "learning_rate": 4.402566158781075e-06, + "logits/chosen": 4.800574779510498, + "logits/rejected": 5.066928386688232, + "logps/chosen": -243.83627319335938, + "logps/rejected": -254.01856994628906, + "loss": 0.6003, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.027154691517353058, + "rewards/margins": 0.3163312077522278, + "rewards/rejected": -0.28917649388313293, + "step": 4026 + }, + { + "epoch": 0.6227720858302725, + "grad_norm": 27.043251037597656, + "learning_rate": 4.402279757131401e-06, + "logits/chosen": 12.06676197052002, + "logits/rejected": 6.024239540100098, + "logps/chosen": -257.285400390625, + "logps/rejected": -196.4248504638672, + "loss": 0.5654, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.29332152009010315, + "rewards/margins": 0.32829222083091736, + "rewards/rejected": -0.03497070074081421, + "step": 4027 + }, + { + "epoch": 0.6229267349700367, + "grad_norm": 7.00987434387207, + "learning_rate": 4.401993355481728e-06, + "logits/chosen": 12.373955726623535, + "logits/rejected": 6.466969966888428, + "logps/chosen": -415.71502685546875, + "logps/rejected": -338.3392639160156, + "loss": 0.5655, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.6827533841133118, + "rewards/margins": 0.43671202659606934, + "rewards/rejected": 0.24604134261608124, + "step": 4028 + }, + { + "epoch": 0.6230813841098009, + "grad_norm": 4.14984655380249, + "learning_rate": 4.401706953832055e-06, + "logits/chosen": 10.959396362304688, + "logits/rejected": 3.9779815673828125, + "logps/chosen": -193.0467529296875, + "logps/rejected": -148.32884216308594, + "loss": 0.6356, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.30270734429359436, + "rewards/margins": 0.19289106130599976, + "rewards/rejected": 0.10981626808643341, + "step": 4029 + }, + { + "epoch": 0.623236033249565, + "grad_norm": 7.222524166107178, + "learning_rate": 4.401420552182381e-06, + "logits/chosen": 8.91360092163086, + "logits/rejected": 9.961875915527344, + "logps/chosen": -352.41119384765625, + "logps/rejected": -377.68536376953125, + "loss": 0.8253, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.3565616011619568, + "rewards/margins": -0.15135295689105988, + "rewards/rejected": 0.5079146027565002, + "step": 4030 + }, + { + "epoch": 0.6233906823893293, + "grad_norm": 7.554623603820801, + "learning_rate": 4.401134150532708e-06, + "logits/chosen": 10.2402925491333, + "logits/rejected": 8.680259704589844, + "logps/chosen": -317.49041748046875, + "logps/rejected": -255.16287231445312, + "loss": 0.6731, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.29330506920814514, + "rewards/margins": 0.13002590835094452, + "rewards/rejected": 0.16327916085720062, + "step": 4031 + }, + { + "epoch": 0.6235453315290934, + "grad_norm": 5.088848114013672, + "learning_rate": 4.400847748883034e-06, + "logits/chosen": 9.234837532043457, + "logits/rejected": 9.693359375, + "logps/chosen": -234.90647888183594, + "logps/rejected": -227.0321044921875, + "loss": 0.7181, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.36205166578292847, + "rewards/margins": 0.065810427069664, + "rewards/rejected": 0.29624125361442566, + "step": 4032 + }, + { + "epoch": 0.6236999806688576, + "grad_norm": 5.123340606689453, + "learning_rate": 4.4005613472333605e-06, + "logits/chosen": 12.701391220092773, + "logits/rejected": 5.5076398849487305, + "logps/chosen": -293.3648376464844, + "logps/rejected": -204.4552764892578, + "loss": 0.5675, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.4183961749076843, + "rewards/margins": 0.3845493495464325, + "rewards/rejected": 0.03384685516357422, + "step": 4033 + }, + { + "epoch": 0.6238546298086217, + "grad_norm": 5.9557204246521, + "learning_rate": 4.400274945583687e-06, + "logits/chosen": 5.805088520050049, + "logits/rejected": 9.830362319946289, + "logps/chosen": -257.8200378417969, + "logps/rejected": -277.54986572265625, + "loss": 0.7459, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.33600106835365295, + "rewards/margins": -0.013096760958433151, + "rewards/rejected": 0.3490978181362152, + "step": 4034 + }, + { + "epoch": 0.6240092789483859, + "grad_norm": 4.270277500152588, + "learning_rate": 4.399988543934014e-06, + "logits/chosen": 5.860166072845459, + "logits/rejected": 3.655550479888916, + "logps/chosen": -286.904541015625, + "logps/rejected": -276.11834716796875, + "loss": 0.5973, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.2892366647720337, + "rewards/margins": 0.3018868863582611, + "rewards/rejected": -0.01265022624284029, + "step": 4035 + }, + { + "epoch": 0.62416392808815, + "grad_norm": 5.476607799530029, + "learning_rate": 4.39970214228434e-06, + "logits/chosen": 9.59937572479248, + "logits/rejected": 14.298587799072266, + "logps/chosen": -233.6822967529297, + "logps/rejected": -319.22747802734375, + "loss": 0.6277, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.2313644289970398, + "rewards/margins": 0.19144420325756073, + "rewards/rejected": 0.03992024064064026, + "step": 4036 + }, + { + "epoch": 0.6243185772279142, + "grad_norm": 5.029600620269775, + "learning_rate": 4.399415740634666e-06, + "logits/chosen": 12.351079940795898, + "logits/rejected": 1.7783212661743164, + "logps/chosen": -258.0171203613281, + "logps/rejected": -162.19139099121094, + "loss": 0.5499, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.2656036913394928, + "rewards/margins": 0.3623029589653015, + "rewards/rejected": -0.09669926762580872, + "step": 4037 + }, + { + "epoch": 0.6244732263676783, + "grad_norm": 5.408899307250977, + "learning_rate": 4.399129338984993e-06, + "logits/chosen": 9.417326927185059, + "logits/rejected": -1.510567307472229, + "logps/chosen": -308.684814453125, + "logps/rejected": -159.21141052246094, + "loss": 0.6245, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.29975539445877075, + "rewards/margins": 0.18107320368289948, + "rewards/rejected": 0.11868220567703247, + "step": 4038 + }, + { + "epoch": 0.6246278755074425, + "grad_norm": 4.076916217803955, + "learning_rate": 4.3988429373353195e-06, + "logits/chosen": 6.813101291656494, + "logits/rejected": 5.6339521408081055, + "logps/chosen": -283.181396484375, + "logps/rejected": -237.10426330566406, + "loss": 0.534, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.5305769443511963, + "rewards/margins": 0.4034223258495331, + "rewards/rejected": 0.1271546334028244, + "step": 4039 + }, + { + "epoch": 0.6247825246472066, + "grad_norm": 13.659684181213379, + "learning_rate": 4.398556535685646e-06, + "logits/chosen": 10.600590705871582, + "logits/rejected": 7.222653388977051, + "logps/chosen": -281.54058837890625, + "logps/rejected": -188.13075256347656, + "loss": 0.475, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.596403956413269, + "rewards/margins": 0.7324687242507935, + "rewards/rejected": -0.1360647827386856, + "step": 4040 + }, + { + "epoch": 0.6249371737869708, + "grad_norm": 5.383350849151611, + "learning_rate": 4.398270134035972e-06, + "logits/chosen": 10.697227478027344, + "logits/rejected": 7.29941463470459, + "logps/chosen": -343.995361328125, + "logps/rejected": -223.40553283691406, + "loss": 0.6618, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.6271936893463135, + "rewards/margins": 0.23002690076828003, + "rewards/rejected": 0.3971668481826782, + "step": 4041 + }, + { + "epoch": 0.6250918229267349, + "grad_norm": 7.511831283569336, + "learning_rate": 4.397983732386299e-06, + "logits/chosen": 13.679967880249023, + "logits/rejected": 3.9051451683044434, + "logps/chosen": -299.9984436035156, + "logps/rejected": -195.6130828857422, + "loss": 0.5494, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.4566035866737366, + "rewards/margins": 0.34514856338500977, + "rewards/rejected": 0.11145499348640442, + "step": 4042 + }, + { + "epoch": 0.6252464720664991, + "grad_norm": 6.712210178375244, + "learning_rate": 4.397697330736625e-06, + "logits/chosen": 9.03857707977295, + "logits/rejected": 6.271660327911377, + "logps/chosen": -365.9111328125, + "logps/rejected": -254.62643432617188, + "loss": 0.7326, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.346962571144104, + "rewards/margins": 0.14806309342384338, + "rewards/rejected": 0.19889947772026062, + "step": 4043 + }, + { + "epoch": 0.6254011212062633, + "grad_norm": 5.4127678871154785, + "learning_rate": 4.397410929086952e-06, + "logits/chosen": 10.976731300354004, + "logits/rejected": 9.926229476928711, + "logps/chosen": -308.1781005859375, + "logps/rejected": -348.260986328125, + "loss": 0.6871, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.36406442523002625, + "rewards/margins": 0.24290120601654053, + "rewards/rejected": 0.12116318196058273, + "step": 4044 + }, + { + "epoch": 0.6255557703460275, + "grad_norm": 5.855021953582764, + "learning_rate": 4.397124527437278e-06, + "logits/chosen": 10.623845100402832, + "logits/rejected": 11.452596664428711, + "logps/chosen": -240.625, + "logps/rejected": -219.32843017578125, + "loss": 0.742, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.2689090073108673, + "rewards/margins": -0.040515296161174774, + "rewards/rejected": 0.30942434072494507, + "step": 4045 + }, + { + "epoch": 0.6257104194857916, + "grad_norm": 4.983871936798096, + "learning_rate": 4.396838125787604e-06, + "logits/chosen": 9.657818794250488, + "logits/rejected": 4.532278060913086, + "logps/chosen": -319.77886962890625, + "logps/rejected": -217.75038146972656, + "loss": 0.6808, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.27850714325904846, + "rewards/margins": 0.08938390761613846, + "rewards/rejected": 0.1891232281923294, + "step": 4046 + }, + { + "epoch": 0.6258650686255558, + "grad_norm": 5.406404972076416, + "learning_rate": 4.396551724137931e-06, + "logits/chosen": 12.965319633483887, + "logits/rejected": 9.557802200317383, + "logps/chosen": -244.3644256591797, + "logps/rejected": -203.85362243652344, + "loss": 0.6886, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.2785509526729584, + "rewards/margins": 0.1845540851354599, + "rewards/rejected": 0.09399686753749847, + "step": 4047 + }, + { + "epoch": 0.62601971776532, + "grad_norm": 5.415280818939209, + "learning_rate": 4.396265322488258e-06, + "logits/chosen": 8.632823944091797, + "logits/rejected": 7.07003116607666, + "logps/chosen": -297.716552734375, + "logps/rejected": -301.20477294921875, + "loss": 0.7321, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.313380628824234, + "rewards/margins": 0.017455842345952988, + "rewards/rejected": 0.2959247827529907, + "step": 4048 + }, + { + "epoch": 0.6261743669050841, + "grad_norm": 5.993331432342529, + "learning_rate": 4.395978920838584e-06, + "logits/chosen": 13.16100788116455, + "logits/rejected": 4.71240758895874, + "logps/chosen": -408.00927734375, + "logps/rejected": -297.93756103515625, + "loss": 0.6833, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.3201228976249695, + "rewards/margins": 0.15773311257362366, + "rewards/rejected": 0.1623898148536682, + "step": 4049 + }, + { + "epoch": 0.6263290160448483, + "grad_norm": 6.0911664962768555, + "learning_rate": 4.395692519188911e-06, + "logits/chosen": 8.533544540405273, + "logits/rejected": 7.047549247741699, + "logps/chosen": -297.1532897949219, + "logps/rejected": -233.2585906982422, + "loss": 0.8069, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.2909713685512543, + "rewards/margins": -0.14614449441432953, + "rewards/rejected": -0.14482688903808594, + "step": 4050 + }, + { + "epoch": 0.6264836651846124, + "grad_norm": 32.98212432861328, + "learning_rate": 4.395406117539238e-06, + "logits/chosen": 10.070265769958496, + "logits/rejected": -1.7147183418273926, + "logps/chosen": -233.75804138183594, + "logps/rejected": -152.1121368408203, + "loss": 0.5733, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.29491937160491943, + "rewards/margins": 0.48594149947166443, + "rewards/rejected": -0.19102217257022858, + "step": 4051 + }, + { + "epoch": 0.6266383143243766, + "grad_norm": 4.405770778656006, + "learning_rate": 4.3951197158895634e-06, + "logits/chosen": 15.662874221801758, + "logits/rejected": 9.65223503112793, + "logps/chosen": -370.09814453125, + "logps/rejected": -266.93438720703125, + "loss": 0.4927, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.4853792190551758, + "rewards/margins": 0.5229854583740234, + "rewards/rejected": -0.037606243044137955, + "step": 4052 + }, + { + "epoch": 0.6267929634641407, + "grad_norm": 3.3254263401031494, + "learning_rate": 4.39483331423989e-06, + "logits/chosen": 9.10347843170166, + "logits/rejected": -0.4621303081512451, + "logps/chosen": -229.28807067871094, + "logps/rejected": -111.28083038330078, + "loss": 0.4834, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.2897731065750122, + "rewards/margins": 0.61546790599823, + "rewards/rejected": -0.32569482922554016, + "step": 4053 + }, + { + "epoch": 0.6269476126039049, + "grad_norm": 5.133017539978027, + "learning_rate": 4.394546912590217e-06, + "logits/chosen": 9.82995891571045, + "logits/rejected": 10.014596939086914, + "logps/chosen": -294.539794921875, + "logps/rejected": -291.3507080078125, + "loss": 0.6141, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.19719655811786652, + "rewards/margins": 0.31432151794433594, + "rewards/rejected": -0.11712498217821121, + "step": 4054 + }, + { + "epoch": 0.627102261743669, + "grad_norm": 4.660998821258545, + "learning_rate": 4.394260510940543e-06, + "logits/chosen": 11.159553527832031, + "logits/rejected": 12.038963317871094, + "logps/chosen": -291.8470153808594, + "logps/rejected": -262.78521728515625, + "loss": 0.5008, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.3734864592552185, + "rewards/margins": 0.49905601143836975, + "rewards/rejected": -0.12556958198547363, + "step": 4055 + }, + { + "epoch": 0.6272569108834332, + "grad_norm": 4.688870906829834, + "learning_rate": 4.39397410929087e-06, + "logits/chosen": 5.18271541595459, + "logits/rejected": 2.952927589416504, + "logps/chosen": -308.16754150390625, + "logps/rejected": -289.984619140625, + "loss": 0.607, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.3408759832382202, + "rewards/margins": 0.26149341464042664, + "rewards/rejected": 0.07938262820243835, + "step": 4056 + }, + { + "epoch": 0.6274115600231974, + "grad_norm": 3.7789859771728516, + "learning_rate": 4.393687707641197e-06, + "logits/chosen": 11.14282512664795, + "logits/rejected": 6.329801082611084, + "logps/chosen": -234.9351348876953, + "logps/rejected": -176.82781982421875, + "loss": 0.6154, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.08922234177589417, + "rewards/margins": 0.22315536439418793, + "rewards/rejected": -0.13393302261829376, + "step": 4057 + }, + { + "epoch": 0.6275662091629616, + "grad_norm": 6.498647689819336, + "learning_rate": 4.393401305991523e-06, + "logits/chosen": 13.571723937988281, + "logits/rejected": 11.705458641052246, + "logps/chosen": -344.78802490234375, + "logps/rejected": -301.2410888671875, + "loss": 0.567, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.26479947566986084, + "rewards/margins": 0.43605488538742065, + "rewards/rejected": -0.17125540971755981, + "step": 4058 + }, + { + "epoch": 0.6277208583027257, + "grad_norm": 5.898116111755371, + "learning_rate": 4.393114904341849e-06, + "logits/chosen": 12.315786361694336, + "logits/rejected": 13.710007667541504, + "logps/chosen": -333.256591796875, + "logps/rejected": -360.0888671875, + "loss": 0.5931, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.6876974105834961, + "rewards/margins": 0.3988623023033142, + "rewards/rejected": 0.2888351380825043, + "step": 4059 + }, + { + "epoch": 0.6278755074424899, + "grad_norm": 5.135991096496582, + "learning_rate": 4.392828502692176e-06, + "logits/chosen": 11.135647773742676, + "logits/rejected": 4.969450950622559, + "logps/chosen": -283.25238037109375, + "logps/rejected": -155.79037475585938, + "loss": 0.7085, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.06248144805431366, + "rewards/margins": -0.007429918274283409, + "rewards/rejected": 0.06991136819124222, + "step": 4060 + }, + { + "epoch": 0.628030156582254, + "grad_norm": 5.357438087463379, + "learning_rate": 4.3925421010425024e-06, + "logits/chosen": 13.879589080810547, + "logits/rejected": 10.412007331848145, + "logps/chosen": -270.187744140625, + "logps/rejected": -233.69711303710938, + "loss": 0.4302, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.4024237394332886, + "rewards/margins": 0.7195690870285034, + "rewards/rejected": -0.3171452581882477, + "step": 4061 + }, + { + "epoch": 0.6281848057220182, + "grad_norm": 7.012094497680664, + "learning_rate": 4.392255699392829e-06, + "logits/chosen": 8.41109848022461, + "logits/rejected": 1.721659779548645, + "logps/chosen": -243.05487060546875, + "logps/rejected": -321.6357421875, + "loss": 0.6353, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.2585796117782593, + "rewards/margins": 0.31641823053359985, + "rewards/rejected": -0.057838618755340576, + "step": 4062 + }, + { + "epoch": 0.6283394548617823, + "grad_norm": 5.991780757904053, + "learning_rate": 4.391969297743156e-06, + "logits/chosen": 16.45221710205078, + "logits/rejected": 10.865355491638184, + "logps/chosen": -296.0362548828125, + "logps/rejected": -326.06341552734375, + "loss": 0.6332, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.4141339063644409, + "rewards/margins": 0.16587454080581665, + "rewards/rejected": 0.24825933575630188, + "step": 4063 + }, + { + "epoch": 0.6284941040015465, + "grad_norm": 5.383486270904541, + "learning_rate": 4.391682896093482e-06, + "logits/chosen": 12.996604919433594, + "logits/rejected": 5.913437843322754, + "logps/chosen": -351.9942626953125, + "logps/rejected": -228.33493041992188, + "loss": 0.6369, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.46444129943847656, + "rewards/margins": 0.2363877296447754, + "rewards/rejected": 0.22805356979370117, + "step": 4064 + }, + { + "epoch": 0.6286487531413106, + "grad_norm": 5.335546970367432, + "learning_rate": 4.391396494443808e-06, + "logits/chosen": 11.050395011901855, + "logits/rejected": 10.575007438659668, + "logps/chosen": -387.0365295410156, + "logps/rejected": -293.0663146972656, + "loss": 0.5247, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.611580491065979, + "rewards/margins": 0.46571481227874756, + "rewards/rejected": 0.14586563408374786, + "step": 4065 + }, + { + "epoch": 0.6288034022810748, + "grad_norm": 4.146840572357178, + "learning_rate": 4.391110092794135e-06, + "logits/chosen": 10.987333297729492, + "logits/rejected": 8.577617645263672, + "logps/chosen": -311.9043273925781, + "logps/rejected": -271.08447265625, + "loss": 0.5077, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.6075385212898254, + "rewards/margins": 0.4484216570854187, + "rewards/rejected": 0.15911683440208435, + "step": 4066 + }, + { + "epoch": 0.6289580514208389, + "grad_norm": 5.356069564819336, + "learning_rate": 4.3908236911444615e-06, + "logits/chosen": 16.6180419921875, + "logits/rejected": 8.864065170288086, + "logps/chosen": -377.5796813964844, + "logps/rejected": -223.75234985351562, + "loss": 0.4691, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.7841887474060059, + "rewards/margins": 0.6051515340805054, + "rewards/rejected": 0.1790371835231781, + "step": 4067 + }, + { + "epoch": 0.6291127005606031, + "grad_norm": 4.767258644104004, + "learning_rate": 4.390537289494788e-06, + "logits/chosen": 8.948322296142578, + "logits/rejected": 5.09316873550415, + "logps/chosen": -317.455078125, + "logps/rejected": -229.35791015625, + "loss": 0.5538, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.5640243291854858, + "rewards/margins": 0.328153133392334, + "rewards/rejected": 0.23587122559547424, + "step": 4068 + }, + { + "epoch": 0.6292673497003672, + "grad_norm": 4.488232612609863, + "learning_rate": 4.390250887845115e-06, + "logits/chosen": 14.293383598327637, + "logits/rejected": 9.13192081451416, + "logps/chosen": -356.42156982421875, + "logps/rejected": -197.54701232910156, + "loss": 0.5663, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.6690505146980286, + "rewards/margins": 0.31999778747558594, + "rewards/rejected": 0.3490527272224426, + "step": 4069 + }, + { + "epoch": 0.6294219988401315, + "grad_norm": 6.079133987426758, + "learning_rate": 4.389964486195441e-06, + "logits/chosen": 14.880195617675781, + "logits/rejected": 9.341279983520508, + "logps/chosen": -359.162353515625, + "logps/rejected": -230.6212158203125, + "loss": 0.6127, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.33809083700180054, + "rewards/margins": 0.32280534505844116, + "rewards/rejected": 0.015285443514585495, + "step": 4070 + }, + { + "epoch": 0.6295766479798957, + "grad_norm": 4.713767051696777, + "learning_rate": 4.389678084545767e-06, + "logits/chosen": 9.947871208190918, + "logits/rejected": 4.430027008056641, + "logps/chosen": -338.67694091796875, + "logps/rejected": -253.3353729248047, + "loss": 0.4061, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.7556244730949402, + "rewards/margins": 0.7415270209312439, + "rewards/rejected": 0.014097407460212708, + "step": 4071 + }, + { + "epoch": 0.6297312971196598, + "grad_norm": 9.3604097366333, + "learning_rate": 4.389391682896094e-06, + "logits/chosen": 9.743115425109863, + "logits/rejected": 5.437849044799805, + "logps/chosen": -323.4690246582031, + "logps/rejected": -247.41807556152344, + "loss": 0.8407, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.014921359717845917, + "rewards/margins": -0.16871388256549835, + "rewards/rejected": 0.15379253029823303, + "step": 4072 + }, + { + "epoch": 0.629885946259424, + "grad_norm": 4.237896919250488, + "learning_rate": 4.3891052812464206e-06, + "logits/chosen": 2.015172004699707, + "logits/rejected": 7.489939212799072, + "logps/chosen": -98.97224426269531, + "logps/rejected": -151.1461944580078, + "loss": 0.7955, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.1738617718219757, + "rewards/margins": -0.07680667191743851, + "rewards/rejected": 0.2506684362888336, + "step": 4073 + }, + { + "epoch": 0.6300405953991881, + "grad_norm": 4.616840839385986, + "learning_rate": 4.388818879596747e-06, + "logits/chosen": 8.80966854095459, + "logits/rejected": 10.599469184875488, + "logps/chosen": -164.808349609375, + "logps/rejected": -170.6791534423828, + "loss": 0.6441, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.35915103554725647, + "rewards/margins": 0.13075952231884003, + "rewards/rejected": 0.22839152812957764, + "step": 4074 + }, + { + "epoch": 0.6301952445389523, + "grad_norm": 5.706430912017822, + "learning_rate": 4.388532477947073e-06, + "logits/chosen": 8.524097442626953, + "logits/rejected": 7.387247085571289, + "logps/chosen": -170.9727783203125, + "logps/rejected": -194.60433959960938, + "loss": 0.6791, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.30584239959716797, + "rewards/margins": 0.08205842971801758, + "rewards/rejected": 0.2237839698791504, + "step": 4075 + }, + { + "epoch": 0.6303498936787164, + "grad_norm": 4.895020484924316, + "learning_rate": 4.3882460762974e-06, + "logits/chosen": 16.760425567626953, + "logits/rejected": 12.785892486572266, + "logps/chosen": -373.61981201171875, + "logps/rejected": -346.08734130859375, + "loss": 0.5195, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.4349214732646942, + "rewards/margins": 0.44925785064697266, + "rewards/rejected": -0.014336392283439636, + "step": 4076 + }, + { + "epoch": 0.6305045428184806, + "grad_norm": 6.131520748138428, + "learning_rate": 4.387959674647726e-06, + "logits/chosen": 6.4105610847473145, + "logits/rejected": 9.859278678894043, + "logps/chosen": -160.2409210205078, + "logps/rejected": -226.63900756835938, + "loss": 0.675, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.2909572124481201, + "rewards/margins": 0.16879379749298096, + "rewards/rejected": 0.12216338515281677, + "step": 4077 + }, + { + "epoch": 0.6306591919582447, + "grad_norm": 4.614214897155762, + "learning_rate": 4.387673272998053e-06, + "logits/chosen": 12.445802688598633, + "logits/rejected": 7.001782417297363, + "logps/chosen": -231.46791076660156, + "logps/rejected": -178.32420349121094, + "loss": 0.6319, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.21844187378883362, + "rewards/margins": 0.21396130323410034, + "rewards/rejected": 0.004480551928281784, + "step": 4078 + }, + { + "epoch": 0.6308138410980089, + "grad_norm": 6.138950347900391, + "learning_rate": 4.387386871348379e-06, + "logits/chosen": 10.218317031860352, + "logits/rejected": 12.086372375488281, + "logps/chosen": -453.27276611328125, + "logps/rejected": -471.6017761230469, + "loss": 0.77, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.6691047549247742, + "rewards/margins": -0.07518120110034943, + "rewards/rejected": 0.7442859411239624, + "step": 4079 + }, + { + "epoch": 0.630968490237773, + "grad_norm": 7.922305107116699, + "learning_rate": 4.387100469698705e-06, + "logits/chosen": 11.186609268188477, + "logits/rejected": 4.4858479499816895, + "logps/chosen": -272.8786315917969, + "logps/rejected": -263.168701171875, + "loss": 0.7476, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.2294585257768631, + "rewards/margins": -0.003894984722137451, + "rewards/rejected": 0.23335354030132294, + "step": 4080 + }, + { + "epoch": 0.6311231393775372, + "grad_norm": 5.0894012451171875, + "learning_rate": 4.386814068049032e-06, + "logits/chosen": 8.3685302734375, + "logits/rejected": 9.309028625488281, + "logps/chosen": -193.14651489257812, + "logps/rejected": -212.30909729003906, + "loss": 0.8284, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.19910365343093872, + "rewards/margins": -0.1602429896593094, + "rewards/rejected": 0.3593466281890869, + "step": 4081 + }, + { + "epoch": 0.6312777885173013, + "grad_norm": 4.277697563171387, + "learning_rate": 4.386527666399359e-06, + "logits/chosen": 8.325204849243164, + "logits/rejected": 4.172941207885742, + "logps/chosen": -358.3304443359375, + "logps/rejected": -239.33416748046875, + "loss": 0.5362, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.6739838123321533, + "rewards/margins": 0.4819828271865845, + "rewards/rejected": 0.19200094044208527, + "step": 4082 + }, + { + "epoch": 0.6314324376570656, + "grad_norm": 9.58552360534668, + "learning_rate": 4.386241264749685e-06, + "logits/chosen": 10.95848274230957, + "logits/rejected": 6.122400283813477, + "logps/chosen": -503.1493225097656, + "logps/rejected": -393.48828125, + "loss": 0.7519, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.543312668800354, + "rewards/margins": 0.0017075389623641968, + "rewards/rejected": 0.5416051149368286, + "step": 4083 + }, + { + "epoch": 0.6315870867968297, + "grad_norm": 4.633702754974365, + "learning_rate": 4.385954863100012e-06, + "logits/chosen": 11.489587783813477, + "logits/rejected": 9.771651268005371, + "logps/chosen": -132.3289337158203, + "logps/rejected": -114.9928207397461, + "loss": 0.658, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.04910849779844284, + "rewards/margins": 0.12203539907932281, + "rewards/rejected": -0.07292690873146057, + "step": 4084 + }, + { + "epoch": 0.6317417359365939, + "grad_norm": 30.265993118286133, + "learning_rate": 4.385668461450338e-06, + "logits/chosen": 11.734833717346191, + "logits/rejected": 9.424222946166992, + "logps/chosen": -279.73077392578125, + "logps/rejected": -296.370849609375, + "loss": 0.6579, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.4311927855014801, + "rewards/margins": 0.274163156747818, + "rewards/rejected": 0.1570296287536621, + "step": 4085 + }, + { + "epoch": 0.631896385076358, + "grad_norm": 5.297602653503418, + "learning_rate": 4.3853820598006645e-06, + "logits/chosen": 5.70874547958374, + "logits/rejected": 4.449985504150391, + "logps/chosen": -257.4253234863281, + "logps/rejected": -226.8214111328125, + "loss": 0.7094, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.28552475571632385, + "rewards/margins": 0.12024030089378357, + "rewards/rejected": 0.1652844399213791, + "step": 4086 + }, + { + "epoch": 0.6320510342161222, + "grad_norm": 5.243790149688721, + "learning_rate": 4.385095658150991e-06, + "logits/chosen": 15.595524787902832, + "logits/rejected": 7.7643585205078125, + "logps/chosen": -273.47283935546875, + "logps/rejected": -200.00347900390625, + "loss": 0.4896, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.3304349482059479, + "rewards/margins": 0.5725619196891785, + "rewards/rejected": -0.2421269416809082, + "step": 4087 + }, + { + "epoch": 0.6322056833558863, + "grad_norm": 5.842273235321045, + "learning_rate": 4.384809256501318e-06, + "logits/chosen": 9.634300231933594, + "logits/rejected": 9.368896484375, + "logps/chosen": -230.87673950195312, + "logps/rejected": -249.20358276367188, + "loss": 0.7919, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.16327105462551117, + "rewards/margins": -0.15904149413108826, + "rewards/rejected": 0.32231253385543823, + "step": 4088 + }, + { + "epoch": 0.6323603324956505, + "grad_norm": 7.393531322479248, + "learning_rate": 4.384522854851644e-06, + "logits/chosen": 11.060192108154297, + "logits/rejected": 7.803918838500977, + "logps/chosen": -329.99383544921875, + "logps/rejected": -274.918212890625, + "loss": 0.9994, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.3079463243484497, + "rewards/margins": -0.21172676980495453, + "rewards/rejected": 0.5196730494499207, + "step": 4089 + }, + { + "epoch": 0.6325149816354146, + "grad_norm": 3.7282092571258545, + "learning_rate": 4.384236453201971e-06, + "logits/chosen": 10.73511791229248, + "logits/rejected": 9.05099105834961, + "logps/chosen": -287.5132141113281, + "logps/rejected": -176.15481567382812, + "loss": 0.5454, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.4524175524711609, + "rewards/margins": 0.3680635392665863, + "rewards/rejected": 0.08435405045747757, + "step": 4090 + }, + { + "epoch": 0.6326696307751788, + "grad_norm": 6.695165157318115, + "learning_rate": 4.383950051552297e-06, + "logits/chosen": 13.514520645141602, + "logits/rejected": 13.532526969909668, + "logps/chosen": -264.879638671875, + "logps/rejected": -300.1611633300781, + "loss": 0.6222, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.16812191903591156, + "rewards/margins": 0.19048023223876953, + "rewards/rejected": -0.022358322516083717, + "step": 4091 + }, + { + "epoch": 0.6328242799149429, + "grad_norm": 5.201694965362549, + "learning_rate": 4.3836636499026235e-06, + "logits/chosen": 9.252204895019531, + "logits/rejected": 10.041300773620605, + "logps/chosen": -239.11080932617188, + "logps/rejected": -212.36119079589844, + "loss": 0.7278, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.30925390124320984, + "rewards/margins": -0.027025975286960602, + "rewards/rejected": 0.33627986907958984, + "step": 4092 + }, + { + "epoch": 0.6329789290547071, + "grad_norm": 4.844962120056152, + "learning_rate": 4.38337724825295e-06, + "logits/chosen": 8.327411651611328, + "logits/rejected": 10.50060749053955, + "logps/chosen": -261.7782287597656, + "logps/rejected": -254.94830322265625, + "loss": 0.7709, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.5955994129180908, + "rewards/margins": -0.06333044916391373, + "rewards/rejected": 0.6589298844337463, + "step": 4093 + }, + { + "epoch": 0.6331335781944712, + "grad_norm": 6.417834281921387, + "learning_rate": 4.383090846603277e-06, + "logits/chosen": 6.415332317352295, + "logits/rejected": 7.511914253234863, + "logps/chosen": -192.75027465820312, + "logps/rejected": -206.2389373779297, + "loss": 0.7683, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.3792446553707123, + "rewards/margins": -0.10934244096279144, + "rewards/rejected": 0.4885871112346649, + "step": 4094 + }, + { + "epoch": 0.6332882273342354, + "grad_norm": 3.8428287506103516, + "learning_rate": 4.3828044449536035e-06, + "logits/chosen": 8.29948616027832, + "logits/rejected": 4.485544681549072, + "logps/chosen": -221.4833984375, + "logps/rejected": -184.5738525390625, + "loss": 0.6488, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.4823654294013977, + "rewards/margins": 0.12600107491016388, + "rewards/rejected": 0.356364369392395, + "step": 4095 + }, + { + "epoch": 0.6334428764739997, + "grad_norm": 4.322994232177734, + "learning_rate": 4.38251804330393e-06, + "logits/chosen": 13.05551528930664, + "logits/rejected": 13.500021934509277, + "logps/chosen": -177.59197998046875, + "logps/rejected": -201.31393432617188, + "loss": 0.5887, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.44487255811691284, + "rewards/margins": 0.3228353261947632, + "rewards/rejected": 0.12203723192214966, + "step": 4096 + }, + { + "epoch": 0.6335975256137638, + "grad_norm": 6.2394866943359375, + "learning_rate": 4.382231641654257e-06, + "logits/chosen": 7.5533905029296875, + "logits/rejected": 9.271539688110352, + "logps/chosen": -274.69775390625, + "logps/rejected": -303.3941345214844, + "loss": 0.7027, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.08952885121107101, + "rewards/margins": 0.3045582175254822, + "rewards/rejected": -0.3940870463848114, + "step": 4097 + }, + { + "epoch": 0.633752174753528, + "grad_norm": 3.795670509338379, + "learning_rate": 4.381945240004583e-06, + "logits/chosen": 6.060182571411133, + "logits/rejected": 9.485590934753418, + "logps/chosen": -146.77719116210938, + "logps/rejected": -228.85003662109375, + "loss": 0.6222, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.007003791630268097, + "rewards/margins": 0.25495392084121704, + "rewards/rejected": -0.24795013666152954, + "step": 4098 + }, + { + "epoch": 0.6339068238932921, + "grad_norm": 4.512800693511963, + "learning_rate": 4.381658838354909e-06, + "logits/chosen": 8.659418106079102, + "logits/rejected": 9.568912506103516, + "logps/chosen": -236.9254608154297, + "logps/rejected": -240.45501708984375, + "loss": 0.6456, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.1655310094356537, + "rewards/margins": 0.13852977752685547, + "rewards/rejected": 0.027001243084669113, + "step": 4099 + }, + { + "epoch": 0.6340614730330563, + "grad_norm": 5.2828497886657715, + "learning_rate": 4.381372436705236e-06, + "logits/chosen": 8.062158584594727, + "logits/rejected": 8.785603523254395, + "logps/chosen": -270.3441162109375, + "logps/rejected": -283.97796630859375, + "loss": 0.5701, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.43046730756759644, + "rewards/margins": 0.370481014251709, + "rewards/rejected": 0.059986360371112823, + "step": 4100 + }, + { + "epoch": 0.6342161221728204, + "grad_norm": 5.401562213897705, + "learning_rate": 4.3810860350555625e-06, + "logits/chosen": 7.222257614135742, + "logits/rejected": 8.367936134338379, + "logps/chosen": -259.8377685546875, + "logps/rejected": -279.1836242675781, + "loss": 0.5507, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.34457677602767944, + "rewards/margins": 0.39129161834716797, + "rewards/rejected": -0.046714894473552704, + "step": 4101 + }, + { + "epoch": 0.6343707713125846, + "grad_norm": 4.979738712310791, + "learning_rate": 4.380799633405889e-06, + "logits/chosen": 13.21034049987793, + "logits/rejected": 10.843297004699707, + "logps/chosen": -248.76193237304688, + "logps/rejected": -254.17855834960938, + "loss": 0.7634, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.4047614634037018, + "rewards/margins": -0.03327387571334839, + "rewards/rejected": 0.43803533911705017, + "step": 4102 + }, + { + "epoch": 0.6345254204523487, + "grad_norm": 5.004741668701172, + "learning_rate": 4.380513231756216e-06, + "logits/chosen": 14.181822776794434, + "logits/rejected": 6.330324172973633, + "logps/chosen": -342.7789001464844, + "logps/rejected": -332.1922912597656, + "loss": 0.5057, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.5983216762542725, + "rewards/margins": 0.584115743637085, + "rewards/rejected": 0.0142059326171875, + "step": 4103 + }, + { + "epoch": 0.6346800695921129, + "grad_norm": 5.335628986358643, + "learning_rate": 4.380226830106542e-06, + "logits/chosen": 7.961426258087158, + "logits/rejected": 5.05066442489624, + "logps/chosen": -321.98577880859375, + "logps/rejected": -229.30003356933594, + "loss": 0.7195, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.42834556102752686, + "rewards/margins": 0.02657165378332138, + "rewards/rejected": 0.40177392959594727, + "step": 4104 + }, + { + "epoch": 0.634834718731877, + "grad_norm": 5.181117057800293, + "learning_rate": 4.379940428456868e-06, + "logits/chosen": 14.14529800415039, + "logits/rejected": 10.406647682189941, + "logps/chosen": -346.9060974121094, + "logps/rejected": -245.6342010498047, + "loss": 0.614, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.47927939891815186, + "rewards/margins": 0.27702128887176514, + "rewards/rejected": 0.20225811004638672, + "step": 4105 + }, + { + "epoch": 0.6349893678716412, + "grad_norm": 6.674239158630371, + "learning_rate": 4.379654026807195e-06, + "logits/chosen": 4.582878589630127, + "logits/rejected": 7.766049861907959, + "logps/chosen": -372.852783203125, + "logps/rejected": -364.62176513671875, + "loss": 0.6792, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.3472209870815277, + "rewards/margins": 0.18419525027275085, + "rewards/rejected": 0.16302573680877686, + "step": 4106 + }, + { + "epoch": 0.6351440170114053, + "grad_norm": 4.618480682373047, + "learning_rate": 4.379367625157522e-06, + "logits/chosen": 9.992536544799805, + "logits/rejected": 10.62861156463623, + "logps/chosen": -274.1743469238281, + "logps/rejected": -179.41104125976562, + "loss": 0.675, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.20193880796432495, + "rewards/margins": 0.09115152806043625, + "rewards/rejected": 0.1107873022556305, + "step": 4107 + }, + { + "epoch": 0.6352986661511696, + "grad_norm": 5.447495937347412, + "learning_rate": 4.379081223507847e-06, + "logits/chosen": 7.995439052581787, + "logits/rejected": 1.4290084838867188, + "logps/chosen": -284.39617919921875, + "logps/rejected": -228.349609375, + "loss": 0.5617, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.22853010892868042, + "rewards/margins": 0.33122774958610535, + "rewards/rejected": -0.10269765555858612, + "step": 4108 + }, + { + "epoch": 0.6354533152909337, + "grad_norm": 3.807565927505493, + "learning_rate": 4.378794821858174e-06, + "logits/chosen": 7.251883506774902, + "logits/rejected": 4.4661054611206055, + "logps/chosen": -242.60794067382812, + "logps/rejected": -170.6915283203125, + "loss": 0.644, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.38709479570388794, + "rewards/margins": 0.17864611744880676, + "rewards/rejected": 0.20844869315624237, + "step": 4109 + }, + { + "epoch": 0.6356079644306979, + "grad_norm": 4.537283420562744, + "learning_rate": 4.378508420208501e-06, + "logits/chosen": 8.381706237792969, + "logits/rejected": 9.692851066589355, + "logps/chosen": -206.86929321289062, + "logps/rejected": -324.139404296875, + "loss": 0.5956, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.1472097933292389, + "rewards/margins": 0.27765166759490967, + "rewards/rejected": -0.1304418444633484, + "step": 4110 + }, + { + "epoch": 0.635762613570462, + "grad_norm": 5.434293746948242, + "learning_rate": 4.378222018558827e-06, + "logits/chosen": 10.989086151123047, + "logits/rejected": 10.933847427368164, + "logps/chosen": -297.72991943359375, + "logps/rejected": -222.47848510742188, + "loss": 0.6294, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.5885234475135803, + "rewards/margins": 0.24290457367897034, + "rewards/rejected": 0.3456188440322876, + "step": 4111 + }, + { + "epoch": 0.6359172627102262, + "grad_norm": 3.9414892196655273, + "learning_rate": 4.377935616909154e-06, + "logits/chosen": 10.343132019042969, + "logits/rejected": 11.0509614944458, + "logps/chosen": -221.460693359375, + "logps/rejected": -251.46202087402344, + "loss": 0.6261, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.41311532258987427, + "rewards/margins": 0.24716883897781372, + "rewards/rejected": 0.16594648361206055, + "step": 4112 + }, + { + "epoch": 0.6360719118499903, + "grad_norm": 5.2202630043029785, + "learning_rate": 4.37764921525948e-06, + "logits/chosen": 12.073678016662598, + "logits/rejected": 8.554848670959473, + "logps/chosen": -357.453369140625, + "logps/rejected": -381.09124755859375, + "loss": 0.6563, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.5166854858398438, + "rewards/margins": 0.30638766288757324, + "rewards/rejected": 0.21029780805110931, + "step": 4113 + }, + { + "epoch": 0.6362265609897545, + "grad_norm": 4.668791770935059, + "learning_rate": 4.3773628136098064e-06, + "logits/chosen": 12.560453414916992, + "logits/rejected": 9.434642791748047, + "logps/chosen": -221.6604461669922, + "logps/rejected": -155.72030639648438, + "loss": 0.5726, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.30154722929000854, + "rewards/margins": 0.32892292737960815, + "rewards/rejected": -0.027375690639019012, + "step": 4114 + }, + { + "epoch": 0.6363812101295186, + "grad_norm": 17.79600715637207, + "learning_rate": 4.377076411960133e-06, + "logits/chosen": 11.8092041015625, + "logits/rejected": 3.234706401824951, + "logps/chosen": -257.2894287109375, + "logps/rejected": -227.11520385742188, + "loss": 0.683, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.15519465506076813, + "rewards/margins": 0.06854070723056793, + "rewards/rejected": 0.0866539403796196, + "step": 4115 + }, + { + "epoch": 0.6365358592692828, + "grad_norm": 5.477811813354492, + "learning_rate": 4.37679001031046e-06, + "logits/chosen": 6.463884353637695, + "logits/rejected": 6.820122241973877, + "logps/chosen": -223.0013427734375, + "logps/rejected": -243.29583740234375, + "loss": 0.6779, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.3277086019515991, + "rewards/margins": 0.08995497226715088, + "rewards/rejected": 0.23775361478328705, + "step": 4116 + }, + { + "epoch": 0.636690508409047, + "grad_norm": 8.808624267578125, + "learning_rate": 4.376503608660786e-06, + "logits/chosen": 8.11403751373291, + "logits/rejected": 7.373326778411865, + "logps/chosen": -274.62457275390625, + "logps/rejected": -234.1453857421875, + "loss": 0.5498, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.14819729328155518, + "rewards/margins": 0.3993549346923828, + "rewards/rejected": -0.25115764141082764, + "step": 4117 + }, + { + "epoch": 0.6368451575488111, + "grad_norm": 5.85345458984375, + "learning_rate": 4.376217207011112e-06, + "logits/chosen": 3.6756136417388916, + "logits/rejected": 9.300027847290039, + "logps/chosen": -240.51461791992188, + "logps/rejected": -310.1883544921875, + "loss": 0.6567, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.3116907477378845, + "rewards/margins": 0.17528238892555237, + "rewards/rejected": 0.13640832901000977, + "step": 4118 + }, + { + "epoch": 0.6369998066885753, + "grad_norm": 6.3791632652282715, + "learning_rate": 4.375930805361439e-06, + "logits/chosen": 8.829212188720703, + "logits/rejected": 6.205772399902344, + "logps/chosen": -354.41888427734375, + "logps/rejected": -248.3518524169922, + "loss": 0.6486, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.26041820645332336, + "rewards/margins": 0.2785182595252991, + "rewards/rejected": -0.018100067973136902, + "step": 4119 + }, + { + "epoch": 0.6371544558283394, + "grad_norm": 17.831085205078125, + "learning_rate": 4.3756444037117655e-06, + "logits/chosen": 6.502841472625732, + "logits/rejected": 7.498453140258789, + "logps/chosen": -224.22491455078125, + "logps/rejected": -259.0049743652344, + "loss": 0.6452, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.4141100347042084, + "rewards/margins": 0.12821297347545624, + "rewards/rejected": 0.28589707612991333, + "step": 4120 + }, + { + "epoch": 0.6373091049681037, + "grad_norm": 7.046199321746826, + "learning_rate": 4.375358002062092e-06, + "logits/chosen": 14.548553466796875, + "logits/rejected": 7.386445999145508, + "logps/chosen": -270.88934326171875, + "logps/rejected": -172.00706481933594, + "loss": 0.4265, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.4424956440925598, + "rewards/margins": 0.6944729685783386, + "rewards/rejected": -0.2519773244857788, + "step": 4121 + }, + { + "epoch": 0.6374637541078678, + "grad_norm": 4.747237205505371, + "learning_rate": 4.375071600412419e-06, + "logits/chosen": 11.472861289978027, + "logits/rejected": 7.186437129974365, + "logps/chosen": -238.30471801757812, + "logps/rejected": -217.9029083251953, + "loss": 0.5835, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.4564840793609619, + "rewards/margins": 0.30192387104034424, + "rewards/rejected": 0.15456023812294006, + "step": 4122 + }, + { + "epoch": 0.637618403247632, + "grad_norm": 4.763288497924805, + "learning_rate": 4.3747851987627455e-06, + "logits/chosen": 10.446076393127441, + "logits/rejected": 3.2043774127960205, + "logps/chosen": -384.4615173339844, + "logps/rejected": -287.5572814941406, + "loss": 0.5123, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.501357913017273, + "rewards/margins": 0.5178449153900146, + "rewards/rejected": -0.01648702472448349, + "step": 4123 + }, + { + "epoch": 0.6377730523873961, + "grad_norm": 11.977688789367676, + "learning_rate": 4.374498797113071e-06, + "logits/chosen": 16.81658172607422, + "logits/rejected": 13.944828033447266, + "logps/chosen": -328.14776611328125, + "logps/rejected": -306.6591491699219, + "loss": 0.7031, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.39857369661331177, + "rewards/margins": 0.07378018647432327, + "rewards/rejected": 0.3247935175895691, + "step": 4124 + }, + { + "epoch": 0.6379277015271603, + "grad_norm": 6.331130027770996, + "learning_rate": 4.374212395463398e-06, + "logits/chosen": 13.306492805480957, + "logits/rejected": 7.61560583114624, + "logps/chosen": -358.43426513671875, + "logps/rejected": -205.383544921875, + "loss": 0.767, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.10752954334020615, + "rewards/margins": 0.04026623070240021, + "rewards/rejected": 0.06726332008838654, + "step": 4125 + }, + { + "epoch": 0.6380823506669244, + "grad_norm": 6.325943946838379, + "learning_rate": 4.3739259938137246e-06, + "logits/chosen": 7.5689239501953125, + "logits/rejected": 7.183380126953125, + "logps/chosen": -171.04945373535156, + "logps/rejected": -195.79745483398438, + "loss": 0.7774, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.3120458722114563, + "rewards/margins": -0.043240297585725784, + "rewards/rejected": -0.2688056230545044, + "step": 4126 + }, + { + "epoch": 0.6382369998066886, + "grad_norm": 3.946017265319824, + "learning_rate": 4.373639592164051e-06, + "logits/chosen": 10.310894012451172, + "logits/rejected": 11.078542709350586, + "logps/chosen": -218.87835693359375, + "logps/rejected": -229.44537353515625, + "loss": 0.5646, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.14756150543689728, + "rewards/margins": 0.3348393440246582, + "rewards/rejected": -0.18727785348892212, + "step": 4127 + }, + { + "epoch": 0.6383916489464527, + "grad_norm": 3.8895344734191895, + "learning_rate": 4.373353190514378e-06, + "logits/chosen": 9.92867374420166, + "logits/rejected": 3.5982136726379395, + "logps/chosen": -198.17742919921875, + "logps/rejected": -133.93963623046875, + "loss": 0.6661, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.00034789741039276123, + "rewards/margins": 0.06967587769031525, + "rewards/rejected": -0.0700237825512886, + "step": 4128 + }, + { + "epoch": 0.6385462980862169, + "grad_norm": 3.924588441848755, + "learning_rate": 4.3730667888647045e-06, + "logits/chosen": 10.009392738342285, + "logits/rejected": 6.078564643859863, + "logps/chosen": -195.566162109375, + "logps/rejected": -137.6949005126953, + "loss": 0.6163, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.20212063193321228, + "rewards/margins": 0.3275145888328552, + "rewards/rejected": -0.12539394199848175, + "step": 4129 + }, + { + "epoch": 0.638700947225981, + "grad_norm": 5.081210613250732, + "learning_rate": 4.372780387215031e-06, + "logits/chosen": 6.342856407165527, + "logits/rejected": 6.001924991607666, + "logps/chosen": -323.96417236328125, + "logps/rejected": -259.64434814453125, + "loss": 0.6038, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.4809591770172119, + "rewards/margins": 0.31086859107017517, + "rewards/rejected": 0.17009061574935913, + "step": 4130 + }, + { + "epoch": 0.6388555963657452, + "grad_norm": 6.727889060974121, + "learning_rate": 4.372493985565357e-06, + "logits/chosen": 7.062030792236328, + "logits/rejected": 11.667972564697266, + "logps/chosen": -161.51248168945312, + "logps/rejected": -234.14161682128906, + "loss": 0.9305, + "rewards/accuracies": 0.25, + "rewards/chosen": -0.10976529866456985, + "rewards/margins": -0.35832956433296204, + "rewards/rejected": 0.2485642433166504, + "step": 4131 + }, + { + "epoch": 0.6390102455055093, + "grad_norm": 3.9513015747070312, + "learning_rate": 4.372207583915684e-06, + "logits/chosen": 6.404477119445801, + "logits/rejected": 1.3548694849014282, + "logps/chosen": -203.42523193359375, + "logps/rejected": -146.09397888183594, + "loss": 0.5237, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.07050585746765137, + "rewards/margins": 0.4598422944545746, + "rewards/rejected": -0.3893364667892456, + "step": 4132 + }, + { + "epoch": 0.6391648946452735, + "grad_norm": 6.383387565612793, + "learning_rate": 4.37192118226601e-06, + "logits/chosen": 10.687776565551758, + "logits/rejected": 6.707417964935303, + "logps/chosen": -368.4574279785156, + "logps/rejected": -281.74859619140625, + "loss": 0.77, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.12452947348356247, + "rewards/margins": -0.053909800946712494, + "rewards/rejected": 0.17843925952911377, + "step": 4133 + }, + { + "epoch": 0.6393195437850377, + "grad_norm": 4.701065540313721, + "learning_rate": 4.371634780616337e-06, + "logits/chosen": 12.130321502685547, + "logits/rejected": 5.955464839935303, + "logps/chosen": -299.20086669921875, + "logps/rejected": -204.91860961914062, + "loss": 0.5892, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.06123046204447746, + "rewards/margins": 0.30267924070358276, + "rewards/rejected": -0.241448774933815, + "step": 4134 + }, + { + "epoch": 0.6394741929248019, + "grad_norm": 7.378091812133789, + "learning_rate": 4.3713483789666636e-06, + "logits/chosen": 7.9696478843688965, + "logits/rejected": 5.825591087341309, + "logps/chosen": -308.034423828125, + "logps/rejected": -290.70965576171875, + "loss": 0.9042, + "rewards/accuracies": 0.25, + "rewards/chosen": -0.09308815002441406, + "rewards/margins": -0.31664466857910156, + "rewards/rejected": 0.2235565185546875, + "step": 4135 + }, + { + "epoch": 0.639628842064566, + "grad_norm": 15.90924072265625, + "learning_rate": 4.37106197731699e-06, + "logits/chosen": 6.8234052658081055, + "logits/rejected": 6.639256954193115, + "logps/chosen": -332.3922119140625, + "logps/rejected": -335.5267639160156, + "loss": 0.8146, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.1977781355381012, + "rewards/margins": -0.1273650974035263, + "rewards/rejected": 0.3251432478427887, + "step": 4136 + }, + { + "epoch": 0.6397834912043302, + "grad_norm": 4.474390983581543, + "learning_rate": 4.370775575667317e-06, + "logits/chosen": 12.286456108093262, + "logits/rejected": 6.718454360961914, + "logps/chosen": -298.04364013671875, + "logps/rejected": -198.03721618652344, + "loss": 0.6089, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.33563196659088135, + "rewards/margins": 0.21411949396133423, + "rewards/rejected": 0.12151246517896652, + "step": 4137 + }, + { + "epoch": 0.6399381403440944, + "grad_norm": 4.8385491371154785, + "learning_rate": 4.370489174017643e-06, + "logits/chosen": 10.93194580078125, + "logits/rejected": 8.764164924621582, + "logps/chosen": -333.0178527832031, + "logps/rejected": -281.819091796875, + "loss": 0.5366, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.40943777561187744, + "rewards/margins": 0.3770712912082672, + "rewards/rejected": 0.03236646205186844, + "step": 4138 + }, + { + "epoch": 0.6400927894838585, + "grad_norm": 5.442009925842285, + "learning_rate": 4.370202772367969e-06, + "logits/chosen": 14.258310317993164, + "logits/rejected": 10.787344932556152, + "logps/chosen": -314.5521545410156, + "logps/rejected": -272.953369140625, + "loss": 0.5668, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.3717292547225952, + "rewards/margins": 0.31740519404411316, + "rewards/rejected": 0.05432405695319176, + "step": 4139 + }, + { + "epoch": 0.6402474386236227, + "grad_norm": 4.450315952301025, + "learning_rate": 4.369916370718296e-06, + "logits/chosen": 11.830985069274902, + "logits/rejected": 10.443124771118164, + "logps/chosen": -175.95892333984375, + "logps/rejected": -172.8510284423828, + "loss": 0.7064, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.17391079664230347, + "rewards/margins": -0.0089874267578125, + "rewards/rejected": 0.18289823830127716, + "step": 4140 + }, + { + "epoch": 0.6404020877633868, + "grad_norm": 5.178179740905762, + "learning_rate": 4.369629969068623e-06, + "logits/chosen": 11.477242469787598, + "logits/rejected": 6.701263427734375, + "logps/chosen": -367.078125, + "logps/rejected": -259.4224548339844, + "loss": 0.5708, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.3994961082935333, + "rewards/margins": 0.43383142352104187, + "rewards/rejected": -0.03433533012866974, + "step": 4141 + }, + { + "epoch": 0.640556736903151, + "grad_norm": 4.926456451416016, + "learning_rate": 4.369343567418948e-06, + "logits/chosen": 9.36685848236084, + "logits/rejected": 3.643974781036377, + "logps/chosen": -264.55255126953125, + "logps/rejected": -194.30076599121094, + "loss": 0.6731, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.20452560484409332, + "rewards/margins": 0.18341843783855438, + "rewards/rejected": 0.021107204258441925, + "step": 4142 + }, + { + "epoch": 0.6407113860429151, + "grad_norm": 3.3367748260498047, + "learning_rate": 4.369057165769275e-06, + "logits/chosen": 11.512624740600586, + "logits/rejected": 6.5777716636657715, + "logps/chosen": -255.5541229248047, + "logps/rejected": -189.23135375976562, + "loss": 0.4946, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.28150850534439087, + "rewards/margins": 0.5028558969497681, + "rewards/rejected": -0.2213473916053772, + "step": 4143 + }, + { + "epoch": 0.6408660351826793, + "grad_norm": 4.676826000213623, + "learning_rate": 4.368770764119602e-06, + "logits/chosen": 8.315370559692383, + "logits/rejected": 4.311043739318848, + "logps/chosen": -316.3711242675781, + "logps/rejected": -262.5533752441406, + "loss": 0.4354, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.48535990715026855, + "rewards/margins": 0.7464770674705505, + "rewards/rejected": -0.261117160320282, + "step": 4144 + }, + { + "epoch": 0.6410206843224434, + "grad_norm": 6.268926620483398, + "learning_rate": 4.368484362469928e-06, + "logits/chosen": 12.007072448730469, + "logits/rejected": 6.08164119720459, + "logps/chosen": -357.03472900390625, + "logps/rejected": -233.7212371826172, + "loss": 0.5375, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.2798842489719391, + "rewards/margins": 0.6042281985282898, + "rewards/rejected": -0.3243439197540283, + "step": 4145 + }, + { + "epoch": 0.6411753334622076, + "grad_norm": 6.8585615158081055, + "learning_rate": 4.368197960820255e-06, + "logits/chosen": 8.012149810791016, + "logits/rejected": 4.651320457458496, + "logps/chosen": -277.5685729980469, + "logps/rejected": -251.92120361328125, + "loss": 0.6911, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.022730447351932526, + "rewards/margins": 0.04365553334355354, + "rewards/rejected": -0.020925089716911316, + "step": 4146 + }, + { + "epoch": 0.6413299826019718, + "grad_norm": 6.885255336761475, + "learning_rate": 4.367911559170581e-06, + "logits/chosen": 9.24830150604248, + "logits/rejected": 11.41317367553711, + "logps/chosen": -317.18365478515625, + "logps/rejected": -357.7243347167969, + "loss": 0.7713, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.21779154241085052, + "rewards/margins": 0.1615956574678421, + "rewards/rejected": 0.056195855140686035, + "step": 4147 + }, + { + "epoch": 0.641484631741736, + "grad_norm": 6.580196857452393, + "learning_rate": 4.3676251575209075e-06, + "logits/chosen": 12.149304389953613, + "logits/rejected": 7.330109596252441, + "logps/chosen": -377.93499755859375, + "logps/rejected": -256.2394104003906, + "loss": 0.4987, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.4020899832248688, + "rewards/margins": 0.47229433059692383, + "rewards/rejected": -0.07020434737205505, + "step": 4148 + }, + { + "epoch": 0.6416392808815001, + "grad_norm": 6.997609615325928, + "learning_rate": 4.367338755871234e-06, + "logits/chosen": 13.607803344726562, + "logits/rejected": 7.595705032348633, + "logps/chosen": -258.41131591796875, + "logps/rejected": -247.26422119140625, + "loss": 0.611, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.3885740339756012, + "rewards/margins": 0.2115287482738495, + "rewards/rejected": 0.1770452857017517, + "step": 4149 + }, + { + "epoch": 0.6417939300212643, + "grad_norm": 14.422523498535156, + "learning_rate": 4.367052354221561e-06, + "logits/chosen": 9.782825469970703, + "logits/rejected": 6.187810897827148, + "logps/chosen": -267.31878662109375, + "logps/rejected": -216.63388061523438, + "loss": 0.5862, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.3532275855541229, + "rewards/margins": 0.31848013401031494, + "rewards/rejected": 0.03474743664264679, + "step": 4150 + }, + { + "epoch": 0.6419485791610284, + "grad_norm": 5.5749077796936035, + "learning_rate": 4.366765952571887e-06, + "logits/chosen": 6.825323104858398, + "logits/rejected": 9.243118286132812, + "logps/chosen": -306.1999816894531, + "logps/rejected": -338.7637939453125, + "loss": 0.72, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.07453508675098419, + "rewards/margins": 0.00998525321483612, + "rewards/rejected": 0.06454983353614807, + "step": 4151 + }, + { + "epoch": 0.6421032283007926, + "grad_norm": 4.406378746032715, + "learning_rate": 4.366479550922213e-06, + "logits/chosen": 18.790006637573242, + "logits/rejected": 10.795914649963379, + "logps/chosen": -246.80125427246094, + "logps/rejected": -188.6220703125, + "loss": 0.6544, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.16750024259090424, + "rewards/margins": 0.2535654604434967, + "rewards/rejected": -0.08606519550085068, + "step": 4152 + }, + { + "epoch": 0.6422578774405567, + "grad_norm": 4.765065670013428, + "learning_rate": 4.36619314927254e-06, + "logits/chosen": 6.78310489654541, + "logits/rejected": 4.059783458709717, + "logps/chosen": -220.06101989746094, + "logps/rejected": -161.9205322265625, + "loss": 0.6182, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.26454222202301025, + "rewards/margins": 0.19134798645973206, + "rewards/rejected": 0.073194220662117, + "step": 4153 + }, + { + "epoch": 0.6424125265803209, + "grad_norm": 5.132529258728027, + "learning_rate": 4.3659067476228665e-06, + "logits/chosen": 10.343584060668945, + "logits/rejected": 10.764266014099121, + "logps/chosen": -337.9837951660156, + "logps/rejected": -319.16815185546875, + "loss": 0.5137, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.6351209878921509, + "rewards/margins": 0.5357867479324341, + "rewards/rejected": 0.0993342399597168, + "step": 4154 + }, + { + "epoch": 0.642567175720085, + "grad_norm": 4.381292819976807, + "learning_rate": 4.365620345973193e-06, + "logits/chosen": 6.6890869140625, + "logits/rejected": 3.8603034019470215, + "logps/chosen": -221.19137573242188, + "logps/rejected": -180.6158905029297, + "loss": 0.5325, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.1360655277967453, + "rewards/margins": 0.44591212272644043, + "rewards/rejected": -0.30984658002853394, + "step": 4155 + }, + { + "epoch": 0.6427218248598492, + "grad_norm": 5.729649066925049, + "learning_rate": 4.36533394432352e-06, + "logits/chosen": 10.13838005065918, + "logits/rejected": 7.824774265289307, + "logps/chosen": -287.04559326171875, + "logps/rejected": -291.3411865234375, + "loss": 0.737, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.30130115151405334, + "rewards/margins": -0.0588393434882164, + "rewards/rejected": 0.36014050245285034, + "step": 4156 + }, + { + "epoch": 0.6428764739996133, + "grad_norm": 3.3064358234405518, + "learning_rate": 4.365047542673846e-06, + "logits/chosen": 10.453022003173828, + "logits/rejected": 4.2874555587768555, + "logps/chosen": -217.84133911132812, + "logps/rejected": -160.44595336914062, + "loss": 0.4465, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.4284857511520386, + "rewards/margins": 0.7460076212882996, + "rewards/rejected": -0.317521870136261, + "step": 4157 + }, + { + "epoch": 0.6430311231393775, + "grad_norm": 5.186545372009277, + "learning_rate": 4.364761141024172e-06, + "logits/chosen": 7.370980262756348, + "logits/rejected": 5.057917594909668, + "logps/chosen": -223.27980041503906, + "logps/rejected": -226.22323608398438, + "loss": 0.5646, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.1933669149875641, + "rewards/margins": 0.36912909150123596, + "rewards/rejected": -0.17576219141483307, + "step": 4158 + }, + { + "epoch": 0.6431857722791416, + "grad_norm": 7.477304458618164, + "learning_rate": 4.364474739374499e-06, + "logits/chosen": 10.044393539428711, + "logits/rejected": 12.840227127075195, + "logps/chosen": -259.44842529296875, + "logps/rejected": -343.2508544921875, + "loss": 0.7668, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.09966926276683807, + "rewards/margins": -0.07662869989871979, + "rewards/rejected": 0.17629796266555786, + "step": 4159 + }, + { + "epoch": 0.6433404214189059, + "grad_norm": 4.798598289489746, + "learning_rate": 4.364188337724826e-06, + "logits/chosen": 11.262516975402832, + "logits/rejected": 8.867025375366211, + "logps/chosen": -219.349609375, + "logps/rejected": -219.7161865234375, + "loss": 0.6082, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.4474555253982544, + "rewards/margins": 0.23363742232322693, + "rewards/rejected": 0.21381810307502747, + "step": 4160 + }, + { + "epoch": 0.6434950705586701, + "grad_norm": 8.234296798706055, + "learning_rate": 4.363901936075152e-06, + "logits/chosen": 6.797112464904785, + "logits/rejected": 4.032725811004639, + "logps/chosen": -317.3948974609375, + "logps/rejected": -278.6521301269531, + "loss": 0.8229, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.214555025100708, + "rewards/margins": -0.07091964781284332, + "rewards/rejected": 0.28547465801239014, + "step": 4161 + }, + { + "epoch": 0.6436497196984342, + "grad_norm": 7.59962272644043, + "learning_rate": 4.363615534425479e-06, + "logits/chosen": 9.691587448120117, + "logits/rejected": 6.5549702644348145, + "logps/chosen": -318.69891357421875, + "logps/rejected": -385.31280517578125, + "loss": 0.7351, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.22610989212989807, + "rewards/margins": -0.055990688502788544, + "rewards/rejected": 0.2821005582809448, + "step": 4162 + }, + { + "epoch": 0.6438043688381984, + "grad_norm": 7.549393653869629, + "learning_rate": 4.3633291327758055e-06, + "logits/chosen": 6.954405784606934, + "logits/rejected": 9.473011016845703, + "logps/chosen": -393.3813171386719, + "logps/rejected": -428.99505615234375, + "loss": 0.7889, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.0068602971732616425, + "rewards/margins": 0.016147777438163757, + "rewards/rejected": -0.023008093237876892, + "step": 4163 + }, + { + "epoch": 0.6439590179779625, + "grad_norm": 5.818005084991455, + "learning_rate": 4.363042731126131e-06, + "logits/chosen": 8.630800247192383, + "logits/rejected": 14.12895393371582, + "logps/chosen": -236.03073120117188, + "logps/rejected": -383.80047607421875, + "loss": 0.5303, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.24141044914722443, + "rewards/margins": 0.4820749759674072, + "rewards/rejected": -0.24066448211669922, + "step": 4164 + }, + { + "epoch": 0.6441136671177267, + "grad_norm": 4.954399108886719, + "learning_rate": 4.362756329476458e-06, + "logits/chosen": 9.562021255493164, + "logits/rejected": 5.233008861541748, + "logps/chosen": -284.25341796875, + "logps/rejected": -194.68319702148438, + "loss": 0.6082, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.3352839946746826, + "rewards/margins": 0.26421594619750977, + "rewards/rejected": 0.07106802612543106, + "step": 4165 + }, + { + "epoch": 0.6442683162574908, + "grad_norm": 2.904010534286499, + "learning_rate": 4.362469927826785e-06, + "logits/chosen": 12.85045337677002, + "logits/rejected": 11.012163162231445, + "logps/chosen": -220.31076049804688, + "logps/rejected": -179.41116333007812, + "loss": 0.5487, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.3166337013244629, + "rewards/margins": 0.4388699531555176, + "rewards/rejected": -0.12223625183105469, + "step": 4166 + }, + { + "epoch": 0.644422965397255, + "grad_norm": 4.373697757720947, + "learning_rate": 4.362183526177111e-06, + "logits/chosen": 11.535093307495117, + "logits/rejected": 6.671360015869141, + "logps/chosen": -173.48941040039062, + "logps/rejected": -137.11614990234375, + "loss": 0.7012, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.0783420279622078, + "rewards/margins": 0.010277032852172852, + "rewards/rejected": 0.06806500256061554, + "step": 4167 + }, + { + "epoch": 0.6445776145370191, + "grad_norm": 7.098147869110107, + "learning_rate": 4.361897124527438e-06, + "logits/chosen": 11.066957473754883, + "logits/rejected": 5.847835063934326, + "logps/chosen": -542.644287109375, + "logps/rejected": -420.784423828125, + "loss": 0.5313, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.5807441473007202, + "rewards/margins": 0.4839019775390625, + "rewards/rejected": 0.0968421921133995, + "step": 4168 + }, + { + "epoch": 0.6447322636767833, + "grad_norm": 5.421589374542236, + "learning_rate": 4.361610722877765e-06, + "logits/chosen": 9.260985374450684, + "logits/rejected": 7.635785102844238, + "logps/chosen": -272.85833740234375, + "logps/rejected": -278.8537902832031, + "loss": 0.7274, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.08183684945106506, + "rewards/margins": -0.041879698634147644, + "rewards/rejected": -0.039957139641046524, + "step": 4169 + }, + { + "epoch": 0.6448869128165474, + "grad_norm": 13.882923126220703, + "learning_rate": 4.361324321228091e-06, + "logits/chosen": 5.356777191162109, + "logits/rejected": 12.102420806884766, + "logps/chosen": -246.50875854492188, + "logps/rejected": -345.19415283203125, + "loss": 1.1131, + "rewards/accuracies": 0.125, + "rewards/chosen": -0.2506532669067383, + "rewards/margins": -0.6569260358810425, + "rewards/rejected": 0.4062727093696594, + "step": 4170 + }, + { + "epoch": 0.6450415619563116, + "grad_norm": 4.450241565704346, + "learning_rate": 4.361037919578417e-06, + "logits/chosen": 9.326611518859863, + "logits/rejected": 5.942061424255371, + "logps/chosen": -283.51800537109375, + "logps/rejected": -245.8580322265625, + "loss": 0.6022, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.2722225785255432, + "rewards/margins": 0.2200319617986679, + "rewards/rejected": 0.052190594375133514, + "step": 4171 + }, + { + "epoch": 0.6451962110960758, + "grad_norm": 5.066701889038086, + "learning_rate": 4.360751517928744e-06, + "logits/chosen": 10.758315086364746, + "logits/rejected": 10.636816024780273, + "logps/chosen": -234.97216796875, + "logps/rejected": -190.72586059570312, + "loss": 0.7421, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.2692779302597046, + "rewards/margins": -0.02789749950170517, + "rewards/rejected": -0.24138040840625763, + "step": 4172 + }, + { + "epoch": 0.64535086023584, + "grad_norm": 6.414374828338623, + "learning_rate": 4.36046511627907e-06, + "logits/chosen": 5.687942981719971, + "logits/rejected": 8.49539566040039, + "logps/chosen": -301.84619140625, + "logps/rejected": -376.5115966796875, + "loss": 0.6732, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.4581771492958069, + "rewards/margins": 0.07761573791503906, + "rewards/rejected": 0.3805614411830902, + "step": 4173 + }, + { + "epoch": 0.6455055093756041, + "grad_norm": 5.29481840133667, + "learning_rate": 4.360178714629397e-06, + "logits/chosen": 10.033263206481934, + "logits/rejected": 4.910677433013916, + "logps/chosen": -306.9974060058594, + "logps/rejected": -265.02911376953125, + "loss": 0.6484, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.13457468152046204, + "rewards/margins": 0.13367551565170288, + "rewards/rejected": 0.0008991807699203491, + "step": 4174 + }, + { + "epoch": 0.6456601585153683, + "grad_norm": 5.107746601104736, + "learning_rate": 4.359892312979724e-06, + "logits/chosen": 11.324872970581055, + "logits/rejected": 10.652301788330078, + "logps/chosen": -173.3695526123047, + "logps/rejected": -146.36082458496094, + "loss": 0.6692, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.185777485370636, + "rewards/margins": 0.07777142524719238, + "rewards/rejected": 0.10800604522228241, + "step": 4175 + }, + { + "epoch": 0.6458148076551324, + "grad_norm": 3.1758272647857666, + "learning_rate": 4.3596059113300495e-06, + "logits/chosen": 13.734806060791016, + "logits/rejected": 7.527505874633789, + "logps/chosen": -212.08401489257812, + "logps/rejected": -142.43222045898438, + "loss": 0.4843, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.14847631752490997, + "rewards/margins": 0.6263341903686523, + "rewards/rejected": -0.47785788774490356, + "step": 4176 + }, + { + "epoch": 0.6459694567948966, + "grad_norm": 6.027726650238037, + "learning_rate": 4.359319509680376e-06, + "logits/chosen": 10.803421020507812, + "logits/rejected": 7.702727317810059, + "logps/chosen": -344.2530517578125, + "logps/rejected": -239.4901885986328, + "loss": 0.7127, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.1887633353471756, + "rewards/margins": 0.030370034277439117, + "rewards/rejected": -0.2191333770751953, + "step": 4177 + }, + { + "epoch": 0.6461241059346607, + "grad_norm": 5.2316484451293945, + "learning_rate": 4.359033108030703e-06, + "logits/chosen": 7.239022254943848, + "logits/rejected": 5.777777671813965, + "logps/chosen": -302.7624816894531, + "logps/rejected": -222.5479278564453, + "loss": 0.5206, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.1200920045375824, + "rewards/margins": 0.4413255453109741, + "rewards/rejected": -0.3212335705757141, + "step": 4178 + }, + { + "epoch": 0.6462787550744249, + "grad_norm": 4.305361270904541, + "learning_rate": 4.358746706381029e-06, + "logits/chosen": 8.571762084960938, + "logits/rejected": 10.736604690551758, + "logps/chosen": -164.2189483642578, + "logps/rejected": -203.00970458984375, + "loss": 0.7198, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.17898684740066528, + "rewards/margins": -0.0001862645149230957, + "rewards/rejected": -0.1788005828857422, + "step": 4179 + }, + { + "epoch": 0.646433404214189, + "grad_norm": 5.800814628601074, + "learning_rate": 4.358460304731355e-06, + "logits/chosen": 5.612978935241699, + "logits/rejected": 0.43852460384368896, + "logps/chosen": -403.4559631347656, + "logps/rejected": -282.67730712890625, + "loss": 0.6122, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.30948859453201294, + "rewards/margins": 0.40233543515205383, + "rewards/rejected": -0.09284687042236328, + "step": 4180 + }, + { + "epoch": 0.6465880533539532, + "grad_norm": 4.620627403259277, + "learning_rate": 4.358173903081682e-06, + "logits/chosen": 12.765654563903809, + "logits/rejected": 7.236401081085205, + "logps/chosen": -258.3139343261719, + "logps/rejected": -238.3670654296875, + "loss": 0.5015, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.36743491888046265, + "rewards/margins": 0.5095010995864868, + "rewards/rejected": -0.14206616580486298, + "step": 4181 + }, + { + "epoch": 0.6467427024937173, + "grad_norm": 4.559831619262695, + "learning_rate": 4.3578875014320085e-06, + "logits/chosen": 12.864474296569824, + "logits/rejected": 3.5196242332458496, + "logps/chosen": -395.32879638671875, + "logps/rejected": -253.01942443847656, + "loss": 0.5143, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.5292133688926697, + "rewards/margins": 0.7272636890411377, + "rewards/rejected": -0.19805032014846802, + "step": 4182 + }, + { + "epoch": 0.6468973516334815, + "grad_norm": 7.63358211517334, + "learning_rate": 4.357601099782335e-06, + "logits/chosen": 13.847896575927734, + "logits/rejected": 11.406129837036133, + "logps/chosen": -362.0840148925781, + "logps/rejected": -313.28045654296875, + "loss": 0.6591, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.44009914994239807, + "rewards/margins": 0.24275389313697815, + "rewards/rejected": 0.19734525680541992, + "step": 4183 + }, + { + "epoch": 0.6470520007732457, + "grad_norm": 5.2918219566345215, + "learning_rate": 4.357314698132662e-06, + "logits/chosen": 10.877615928649902, + "logits/rejected": 8.881293296813965, + "logps/chosen": -247.1022491455078, + "logps/rejected": -227.22557067871094, + "loss": 0.6177, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.3296305537223816, + "rewards/margins": 0.2166401445865631, + "rewards/rejected": 0.1129903793334961, + "step": 4184 + }, + { + "epoch": 0.6472066499130099, + "grad_norm": 9.23476505279541, + "learning_rate": 4.357028296482988e-06, + "logits/chosen": 13.849346160888672, + "logits/rejected": 6.874018669128418, + "logps/chosen": -326.7024841308594, + "logps/rejected": -259.6429443359375, + "loss": 0.8702, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.20097070932388306, + "rewards/margins": -0.19209720194339752, + "rewards/rejected": -0.00887349247932434, + "step": 4185 + }, + { + "epoch": 0.6473612990527741, + "grad_norm": 6.325669288635254, + "learning_rate": 4.356741894833314e-06, + "logits/chosen": 7.294525146484375, + "logits/rejected": 8.089756965637207, + "logps/chosen": -322.8529052734375, + "logps/rejected": -325.2636413574219, + "loss": 0.664, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.3753849267959595, + "rewards/margins": 0.1250227987766266, + "rewards/rejected": 0.2503621280193329, + "step": 4186 + }, + { + "epoch": 0.6475159481925382, + "grad_norm": 4.741813659667969, + "learning_rate": 4.356455493183641e-06, + "logits/chosen": 13.535446166992188, + "logits/rejected": 4.636565208435059, + "logps/chosen": -367.2870788574219, + "logps/rejected": -239.84005737304688, + "loss": 0.5585, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.31642550230026245, + "rewards/margins": 0.44724729657173157, + "rewards/rejected": -0.1308218091726303, + "step": 4187 + }, + { + "epoch": 0.6476705973323024, + "grad_norm": 4.872653484344482, + "learning_rate": 4.3561690915339676e-06, + "logits/chosen": 9.763392448425293, + "logits/rejected": 14.847209930419922, + "logps/chosen": -214.76821899414062, + "logps/rejected": -308.6475524902344, + "loss": 0.5787, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.37203243374824524, + "rewards/margins": 0.3971463739871979, + "rewards/rejected": -0.02511395514011383, + "step": 4188 + }, + { + "epoch": 0.6478252464720665, + "grad_norm": 4.700504779815674, + "learning_rate": 4.355882689884294e-06, + "logits/chosen": 6.039549827575684, + "logits/rejected": 0.44818228483200073, + "logps/chosen": -300.58648681640625, + "logps/rejected": -232.79554748535156, + "loss": 0.4576, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.6200546026229858, + "rewards/margins": 0.6305831670761108, + "rewards/rejected": -0.010528564453125, + "step": 4189 + }, + { + "epoch": 0.6479798956118307, + "grad_norm": 4.852015018463135, + "learning_rate": 4.35559628823462e-06, + "logits/chosen": 8.171926498413086, + "logits/rejected": 5.848831653594971, + "logps/chosen": -205.46250915527344, + "logps/rejected": -173.2490997314453, + "loss": 0.6357, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.42423662543296814, + "rewards/margins": 0.21442672610282898, + "rewards/rejected": 0.20980989933013916, + "step": 4190 + }, + { + "epoch": 0.6481345447515948, + "grad_norm": 13.992420196533203, + "learning_rate": 4.355309886584947e-06, + "logits/chosen": 11.810670852661133, + "logits/rejected": 7.921099662780762, + "logps/chosen": -206.73741149902344, + "logps/rejected": -152.72364807128906, + "loss": 0.5261, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.2047392874956131, + "rewards/margins": 0.41251787543296814, + "rewards/rejected": -0.20777854323387146, + "step": 4191 + }, + { + "epoch": 0.648289193891359, + "grad_norm": 4.336026191711426, + "learning_rate": 4.355023484935273e-06, + "logits/chosen": 13.193326950073242, + "logits/rejected": 6.914109230041504, + "logps/chosen": -297.0978088378906, + "logps/rejected": -223.58868408203125, + "loss": 0.5482, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.5427843332290649, + "rewards/margins": 0.47084811329841614, + "rewards/rejected": 0.0719362199306488, + "step": 4192 + }, + { + "epoch": 0.6484438430311231, + "grad_norm": 5.0561981201171875, + "learning_rate": 4.3547370832856e-06, + "logits/chosen": 12.141061782836914, + "logits/rejected": 2.2735249996185303, + "logps/chosen": -280.3802490234375, + "logps/rejected": -220.25477600097656, + "loss": 0.4377, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.30007994174957275, + "rewards/margins": 0.8288781046867371, + "rewards/rejected": -0.5287981033325195, + "step": 4193 + }, + { + "epoch": 0.6485984921708873, + "grad_norm": 6.172568321228027, + "learning_rate": 4.354450681635927e-06, + "logits/chosen": 7.03871488571167, + "logits/rejected": 11.420415878295898, + "logps/chosen": -259.57501220703125, + "logps/rejected": -418.56500244140625, + "loss": 0.7309, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.15208502113819122, + "rewards/margins": 0.07502608001232147, + "rewards/rejected": 0.07705894112586975, + "step": 4194 + }, + { + "epoch": 0.6487531413106514, + "grad_norm": 5.374832630157471, + "learning_rate": 4.354164279986253e-06, + "logits/chosen": 12.77337646484375, + "logits/rejected": 8.836723327636719, + "logps/chosen": -365.0564880371094, + "logps/rejected": -301.3245849609375, + "loss": 0.5921, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.4177880883216858, + "rewards/margins": 0.3558177649974823, + "rewards/rejected": 0.06197032332420349, + "step": 4195 + }, + { + "epoch": 0.6489077904504156, + "grad_norm": 4.273466110229492, + "learning_rate": 4.35387787833658e-06, + "logits/chosen": 13.224781036376953, + "logits/rejected": 10.768608093261719, + "logps/chosen": -201.20626831054688, + "logps/rejected": -198.96792602539062, + "loss": 0.5346, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.015607690438628197, + "rewards/margins": 0.41390252113342285, + "rewards/rejected": -0.3982948362827301, + "step": 4196 + }, + { + "epoch": 0.6490624395901797, + "grad_norm": 5.9873809814453125, + "learning_rate": 4.353591476686906e-06, + "logits/chosen": 14.172565460205078, + "logits/rejected": 9.2845458984375, + "logps/chosen": -212.9275360107422, + "logps/rejected": -157.1071014404297, + "loss": 0.814, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.04813404753804207, + "rewards/margins": -0.1368042379617691, + "rewards/rejected": 0.08867020159959793, + "step": 4197 + }, + { + "epoch": 0.649217088729944, + "grad_norm": 3.5123026371002197, + "learning_rate": 4.353305075037232e-06, + "logits/chosen": 9.64143180847168, + "logits/rejected": 8.916725158691406, + "logps/chosen": -183.2423095703125, + "logps/rejected": -201.08419799804688, + "loss": 0.5009, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.24202242493629456, + "rewards/margins": 0.47435975074768066, + "rewards/rejected": -0.2323373556137085, + "step": 4198 + }, + { + "epoch": 0.6493717378697081, + "grad_norm": 6.294008731842041, + "learning_rate": 4.353018673387559e-06, + "logits/chosen": 8.272171974182129, + "logits/rejected": 7.6526103019714355, + "logps/chosen": -332.6680908203125, + "logps/rejected": -330.3778381347656, + "loss": 0.5758, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.21828046441078186, + "rewards/margins": 0.4001489281654358, + "rewards/rejected": -0.18186840415000916, + "step": 4199 + }, + { + "epoch": 0.6495263870094723, + "grad_norm": 4.988196849822998, + "learning_rate": 4.352732271737886e-06, + "logits/chosen": 3.080298662185669, + "logits/rejected": 14.015260696411133, + "logps/chosen": -206.3272705078125, + "logps/rejected": -262.3306579589844, + "loss": 0.7257, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.030588969588279724, + "rewards/margins": -0.026652857661247253, + "rewards/rejected": -0.003936097025871277, + "step": 4200 + }, + { + "epoch": 0.6496810361492364, + "grad_norm": 3.7875874042510986, + "learning_rate": 4.352445870088212e-06, + "logits/chosen": 8.165966033935547, + "logits/rejected": 3.1907248497009277, + "logps/chosen": -180.878173828125, + "logps/rejected": -134.9561767578125, + "loss": 0.559, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.038340337574481964, + "rewards/margins": 0.32512617111206055, + "rewards/rejected": -0.2867858409881592, + "step": 4201 + }, + { + "epoch": 0.6498356852890006, + "grad_norm": 5.892143249511719, + "learning_rate": 4.352159468438539e-06, + "logits/chosen": 12.405851364135742, + "logits/rejected": 11.449054718017578, + "logps/chosen": -295.89630126953125, + "logps/rejected": -314.49774169921875, + "loss": 0.5933, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.048905763775110245, + "rewards/margins": 0.2612883448600769, + "rewards/rejected": -0.31019413471221924, + "step": 4202 + }, + { + "epoch": 0.6499903344287647, + "grad_norm": 28.691387176513672, + "learning_rate": 4.351873066788865e-06, + "logits/chosen": 10.543848991394043, + "logits/rejected": 9.813764572143555, + "logps/chosen": -527.480224609375, + "logps/rejected": -435.54315185546875, + "loss": 0.8626, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.23375284671783447, + "rewards/margins": -0.08418767154216766, + "rewards/rejected": 0.31794053316116333, + "step": 4203 + }, + { + "epoch": 0.6501449835685289, + "grad_norm": 5.667877197265625, + "learning_rate": 4.3515866651391914e-06, + "logits/chosen": 10.827398300170898, + "logits/rejected": 4.473579406738281, + "logps/chosen": -403.90423583984375, + "logps/rejected": -273.38189697265625, + "loss": 0.5008, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.2575262188911438, + "rewards/margins": 0.7093676328659058, + "rewards/rejected": -0.45184147357940674, + "step": 4204 + }, + { + "epoch": 0.650299632708293, + "grad_norm": 5.46954345703125, + "learning_rate": 4.351300263489518e-06, + "logits/chosen": 13.069526672363281, + "logits/rejected": 14.894620895385742, + "logps/chosen": -300.318603515625, + "logps/rejected": -305.42242431640625, + "loss": 0.6536, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.09276733547449112, + "rewards/margins": 0.12498322874307632, + "rewards/rejected": -0.03221588581800461, + "step": 4205 + }, + { + "epoch": 0.6504542818480572, + "grad_norm": 5.898619651794434, + "learning_rate": 4.351013861839845e-06, + "logits/chosen": 5.566603660583496, + "logits/rejected": 7.7404680252075195, + "logps/chosen": -108.28042602539062, + "logps/rejected": -171.0457763671875, + "loss": 0.7786, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.1528187096118927, + "rewards/margins": -0.12764693796634674, + "rewards/rejected": -0.025171803310513496, + "step": 4206 + }, + { + "epoch": 0.6506089309878214, + "grad_norm": 5.101982116699219, + "learning_rate": 4.350727460190171e-06, + "logits/chosen": 9.90913200378418, + "logits/rejected": 5.166670799255371, + "logps/chosen": -283.5765380859375, + "logps/rejected": -195.9736328125, + "loss": 0.6274, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.42315778136253357, + "rewards/margins": 0.2216704785823822, + "rewards/rejected": 0.20148730278015137, + "step": 4207 + }, + { + "epoch": 0.6507635801275855, + "grad_norm": 5.125076770782471, + "learning_rate": 4.350441058540498e-06, + "logits/chosen": 8.263349533081055, + "logits/rejected": 7.413053512573242, + "logps/chosen": -385.040283203125, + "logps/rejected": -288.6653137207031, + "loss": 0.596, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.2358895242214203, + "rewards/margins": 0.3548605740070343, + "rewards/rejected": -0.11897105723619461, + "step": 4208 + }, + { + "epoch": 0.6509182292673497, + "grad_norm": 4.582486629486084, + "learning_rate": 4.350154656890825e-06, + "logits/chosen": 15.320573806762695, + "logits/rejected": 8.661050796508789, + "logps/chosen": -317.2214660644531, + "logps/rejected": -250.9655303955078, + "loss": 0.5543, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.5238339900970459, + "rewards/margins": 0.33945292234420776, + "rewards/rejected": 0.18438109755516052, + "step": 4209 + }, + { + "epoch": 0.6510728784071138, + "grad_norm": 5.058601379394531, + "learning_rate": 4.3498682552411505e-06, + "logits/chosen": 9.022960662841797, + "logits/rejected": 13.238070487976074, + "logps/chosen": -137.50807189941406, + "logps/rejected": -229.68106079101562, + "loss": 0.7082, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.42664414644241333, + "rewards/margins": 0.09030580520629883, + "rewards/rejected": -0.5169499516487122, + "step": 4210 + }, + { + "epoch": 0.6512275275468781, + "grad_norm": 5.154739856719971, + "learning_rate": 4.349581853591477e-06, + "logits/chosen": 12.505020141601562, + "logits/rejected": 11.892744064331055, + "logps/chosen": -305.08685302734375, + "logps/rejected": -294.178955078125, + "loss": 0.5936, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.09012632817029953, + "rewards/margins": 0.45098525285720825, + "rewards/rejected": -0.3608589172363281, + "step": 4211 + }, + { + "epoch": 0.6513821766866422, + "grad_norm": 6.85953426361084, + "learning_rate": 4.349295451941804e-06, + "logits/chosen": 11.5447416305542, + "logits/rejected": 4.32448673248291, + "logps/chosen": -315.2926940917969, + "logps/rejected": -252.00189208984375, + "loss": 0.6007, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.3751358985900879, + "rewards/margins": 0.44499102234840393, + "rewards/rejected": -0.06985510140657425, + "step": 4212 + }, + { + "epoch": 0.6515368258264064, + "grad_norm": 4.791493892669678, + "learning_rate": 4.3490090502921304e-06, + "logits/chosen": 12.159289360046387, + "logits/rejected": 8.763517379760742, + "logps/chosen": -220.3642578125, + "logps/rejected": -247.2126007080078, + "loss": 0.6161, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.23202410340309143, + "rewards/margins": 0.23403620719909668, + "rewards/rejected": -0.0020121149718761444, + "step": 4213 + }, + { + "epoch": 0.6516914749661705, + "grad_norm": 6.442799091339111, + "learning_rate": 4.348722648642456e-06, + "logits/chosen": 12.218147277832031, + "logits/rejected": 10.689464569091797, + "logps/chosen": -296.619140625, + "logps/rejected": -269.91754150390625, + "loss": 0.6472, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.14552584290504456, + "rewards/margins": 0.18558895587921143, + "rewards/rejected": -0.040063098073005676, + "step": 4214 + }, + { + "epoch": 0.6518461241059347, + "grad_norm": 4.191873073577881, + "learning_rate": 4.348436246992783e-06, + "logits/chosen": 5.560467720031738, + "logits/rejected": 4.5959038734436035, + "logps/chosen": -201.33773803710938, + "logps/rejected": -183.98382568359375, + "loss": 0.4289, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.20138554275035858, + "rewards/margins": 0.8380700349807739, + "rewards/rejected": -0.6366845369338989, + "step": 4215 + }, + { + "epoch": 0.6520007732456988, + "grad_norm": 4.2784247398376465, + "learning_rate": 4.3481498453431095e-06, + "logits/chosen": 15.070930480957031, + "logits/rejected": 3.3545186519622803, + "logps/chosen": -240.72650146484375, + "logps/rejected": -161.81704711914062, + "loss": 0.5057, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.15401974320411682, + "rewards/margins": 0.5519628524780273, + "rewards/rejected": -0.39794304966926575, + "step": 4216 + }, + { + "epoch": 0.652155422385463, + "grad_norm": 5.652374267578125, + "learning_rate": 4.347863443693436e-06, + "logits/chosen": 7.491048336029053, + "logits/rejected": 4.3013482093811035, + "logps/chosen": -324.8224182128906, + "logps/rejected": -240.87069702148438, + "loss": 0.6706, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.07760705798864365, + "rewards/margins": 0.20842379331588745, + "rewards/rejected": -0.1308167427778244, + "step": 4217 + }, + { + "epoch": 0.6523100715252271, + "grad_norm": 3.6855833530426025, + "learning_rate": 4.347577042043762e-06, + "logits/chosen": 5.974587440490723, + "logits/rejected": 4.610760688781738, + "logps/chosen": -259.886962890625, + "logps/rejected": -249.80288696289062, + "loss": 0.4882, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.23094511032104492, + "rewards/margins": 0.6611930727958679, + "rewards/rejected": -0.430247962474823, + "step": 4218 + }, + { + "epoch": 0.6524647206649913, + "grad_norm": 6.166820049285889, + "learning_rate": 4.347290640394089e-06, + "logits/chosen": 9.867472648620605, + "logits/rejected": 9.522201538085938, + "logps/chosen": -302.54522705078125, + "logps/rejected": -267.424560546875, + "loss": 0.7907, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.42901021242141724, + "rewards/margins": -0.06512265652418137, + "rewards/rejected": -0.36388757824897766, + "step": 4219 + }, + { + "epoch": 0.6526193698047554, + "grad_norm": 4.0333170890808105, + "learning_rate": 4.347004238744415e-06, + "logits/chosen": 14.675891876220703, + "logits/rejected": 4.208190441131592, + "logps/chosen": -303.3465576171875, + "logps/rejected": -200.75021362304688, + "loss": 0.5046, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.2253396064043045, + "rewards/margins": 0.4549718499183655, + "rewards/rejected": -0.22963227331638336, + "step": 4220 + }, + { + "epoch": 0.6527740189445196, + "grad_norm": 7.872147083282471, + "learning_rate": 4.346717837094742e-06, + "logits/chosen": 11.461825370788574, + "logits/rejected": 9.13442611694336, + "logps/chosen": -556.8522338867188, + "logps/rejected": -439.05279541015625, + "loss": 0.8367, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.0441434346139431, + "rewards/margins": -0.010390043258666992, + "rewards/rejected": -0.033753395080566406, + "step": 4221 + }, + { + "epoch": 0.6529286680842837, + "grad_norm": 4.7352495193481445, + "learning_rate": 4.346431435445069e-06, + "logits/chosen": 7.9646897315979, + "logits/rejected": 7.855943202972412, + "logps/chosen": -194.97967529296875, + "logps/rejected": -223.33575439453125, + "loss": 0.6245, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.043425656855106354, + "rewards/margins": 0.25277218222618103, + "rewards/rejected": -0.2961978316307068, + "step": 4222 + }, + { + "epoch": 0.6530833172240479, + "grad_norm": 6.273276329040527, + "learning_rate": 4.346145033795394e-06, + "logits/chosen": 8.105929374694824, + "logits/rejected": 0.20725619792938232, + "logps/chosen": -538.843505859375, + "logps/rejected": -276.67919921875, + "loss": 0.5755, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.009483430534601212, + "rewards/margins": 0.32109034061431885, + "rewards/rejected": -0.33057376742362976, + "step": 4223 + }, + { + "epoch": 0.6532379663638122, + "grad_norm": 5.0234694480896, + "learning_rate": 4.345858632145721e-06, + "logits/chosen": 8.780721664428711, + "logits/rejected": 11.549026489257812, + "logps/chosen": -175.511474609375, + "logps/rejected": -238.11605834960938, + "loss": 0.5695, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.0908842384815216, + "rewards/margins": 0.37255245447158813, + "rewards/rejected": -0.28166818618774414, + "step": 4224 + }, + { + "epoch": 0.6533926155035763, + "grad_norm": 5.550236225128174, + "learning_rate": 4.345572230496048e-06, + "logits/chosen": 1.433288335800171, + "logits/rejected": 8.394508361816406, + "logps/chosen": -238.88351440429688, + "logps/rejected": -269.38739013671875, + "loss": 0.7616, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.0143667533993721, + "rewards/margins": -0.013303359970450401, + "rewards/rejected": 0.027670137584209442, + "step": 4225 + }, + { + "epoch": 0.6535472646433405, + "grad_norm": 5.03551721572876, + "learning_rate": 4.345285828846374e-06, + "logits/chosen": 10.318571090698242, + "logits/rejected": 12.024208068847656, + "logps/chosen": -135.0189666748047, + "logps/rejected": -179.40573120117188, + "loss": 0.7662, + "rewards/accuracies": 0.25, + "rewards/chosen": -0.31906285881996155, + "rewards/margins": -0.13008837401866913, + "rewards/rejected": -0.18897446990013123, + "step": 4226 + }, + { + "epoch": 0.6537019137831046, + "grad_norm": 4.7589898109436035, + "learning_rate": 4.344999427196701e-06, + "logits/chosen": 8.128241539001465, + "logits/rejected": 4.892699241638184, + "logps/chosen": -244.4506072998047, + "logps/rejected": -226.9948272705078, + "loss": 0.6246, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.04702885448932648, + "rewards/margins": 0.21536409854888916, + "rewards/rejected": -0.1683352291584015, + "step": 4227 + }, + { + "epoch": 0.6538565629228688, + "grad_norm": 6.265934467315674, + "learning_rate": 4.344713025547028e-06, + "logits/chosen": 6.535407543182373, + "logits/rejected": 1.9626760482788086, + "logps/chosen": -251.66062927246094, + "logps/rejected": -258.51287841796875, + "loss": 0.7185, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.007463961839675903, + "rewards/margins": 0.2547076344490051, + "rewards/rejected": -0.26217156648635864, + "step": 4228 + }, + { + "epoch": 0.6540112120626329, + "grad_norm": 5.701231479644775, + "learning_rate": 4.344426623897354e-06, + "logits/chosen": 12.259727478027344, + "logits/rejected": 18.161714553833008, + "logps/chosen": -229.688720703125, + "logps/rejected": -247.94024658203125, + "loss": 0.6685, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.20073002576828003, + "rewards/margins": 0.15554234385490417, + "rewards/rejected": -0.356272429227829, + "step": 4229 + }, + { + "epoch": 0.6541658612023971, + "grad_norm": 5.357940196990967, + "learning_rate": 4.34414022224768e-06, + "logits/chosen": 13.876051902770996, + "logits/rejected": 9.536811828613281, + "logps/chosen": -285.1541748046875, + "logps/rejected": -298.2764892578125, + "loss": 0.4744, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.4411677122116089, + "rewards/margins": 0.707834005355835, + "rewards/rejected": -0.26666635274887085, + "step": 4230 + }, + { + "epoch": 0.6543205103421612, + "grad_norm": 6.058716297149658, + "learning_rate": 4.343853820598007e-06, + "logits/chosen": 7.067928314208984, + "logits/rejected": 12.358060836791992, + "logps/chosen": -209.51629638671875, + "logps/rejected": -309.0819396972656, + "loss": 0.915, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.11157140135765076, + "rewards/margins": -0.14984466135501862, + "rewards/rejected": 0.03827321529388428, + "step": 4231 + }, + { + "epoch": 0.6544751594819254, + "grad_norm": 5.347863674163818, + "learning_rate": 4.343567418948333e-06, + "logits/chosen": 11.38589096069336, + "logits/rejected": 11.748504638671875, + "logps/chosen": -246.36474609375, + "logps/rejected": -257.4831237792969, + "loss": 0.6571, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.08860114216804504, + "rewards/margins": 0.31712016463279724, + "rewards/rejected": -0.4057213068008423, + "step": 4232 + }, + { + "epoch": 0.6546298086216895, + "grad_norm": 6.214756965637207, + "learning_rate": 4.34328101729866e-06, + "logits/chosen": 9.984513282775879, + "logits/rejected": 8.25812816619873, + "logps/chosen": -230.4915771484375, + "logps/rejected": -182.3870391845703, + "loss": 0.8067, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.015525531023740768, + "rewards/margins": -0.15137168765068054, + "rewards/rejected": 0.16689720749855042, + "step": 4233 + }, + { + "epoch": 0.6547844577614537, + "grad_norm": 5.609480857849121, + "learning_rate": 4.342994615648987e-06, + "logits/chosen": 12.323747634887695, + "logits/rejected": 3.7065200805664062, + "logps/chosen": -360.9742126464844, + "logps/rejected": -235.20010375976562, + "loss": 0.6359, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.08385735750198364, + "rewards/margins": 0.39086347818374634, + "rewards/rejected": -0.3070061504840851, + "step": 4234 + }, + { + "epoch": 0.6549391069012178, + "grad_norm": 4.826611042022705, + "learning_rate": 4.342708213999313e-06, + "logits/chosen": 13.70637321472168, + "logits/rejected": 5.131006240844727, + "logps/chosen": -428.180419921875, + "logps/rejected": -234.23574829101562, + "loss": 0.5878, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.13843460381031036, + "rewards/margins": 0.4423544406890869, + "rewards/rejected": -0.30391985177993774, + "step": 4235 + }, + { + "epoch": 0.655093756040982, + "grad_norm": 7.239215850830078, + "learning_rate": 4.342421812349639e-06, + "logits/chosen": 11.26409912109375, + "logits/rejected": 4.930548667907715, + "logps/chosen": -316.2461242675781, + "logps/rejected": -234.1838836669922, + "loss": 0.6595, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.3526248037815094, + "rewards/margins": 0.32214125990867615, + "rewards/rejected": -0.6747660636901855, + "step": 4236 + }, + { + "epoch": 0.6552484051807462, + "grad_norm": 5.34835147857666, + "learning_rate": 4.342135410699966e-06, + "logits/chosen": 10.50351333618164, + "logits/rejected": 12.445616722106934, + "logps/chosen": -305.2685241699219, + "logps/rejected": -328.69464111328125, + "loss": 0.6206, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.06022585928440094, + "rewards/margins": 0.3220090866088867, + "rewards/rejected": -0.261783242225647, + "step": 4237 + }, + { + "epoch": 0.6554030543205104, + "grad_norm": 5.125571250915527, + "learning_rate": 4.3418490090502925e-06, + "logits/chosen": 10.250349044799805, + "logits/rejected": 5.923823356628418, + "logps/chosen": -332.0323791503906, + "logps/rejected": -289.42822265625, + "loss": 0.6431, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.3787103593349457, + "rewards/margins": 0.19022627174854279, + "rewards/rejected": 0.1884841024875641, + "step": 4238 + }, + { + "epoch": 0.6555577034602745, + "grad_norm": 4.990293979644775, + "learning_rate": 4.341562607400619e-06, + "logits/chosen": 13.771848678588867, + "logits/rejected": 3.848051071166992, + "logps/chosen": -479.2643127441406, + "logps/rejected": -284.7635498046875, + "loss": 0.5347, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.21116122603416443, + "rewards/margins": 0.4802495241165161, + "rewards/rejected": -0.2690882682800293, + "step": 4239 + }, + { + "epoch": 0.6557123526000387, + "grad_norm": 5.700728416442871, + "learning_rate": 4.341276205750946e-06, + "logits/chosen": 7.67987585067749, + "logits/rejected": 6.795275688171387, + "logps/chosen": -296.4989318847656, + "logps/rejected": -266.7330017089844, + "loss": 0.6694, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.16801680624485016, + "rewards/margins": 0.16278572380542755, + "rewards/rejected": -0.3308025598526001, + "step": 4240 + }, + { + "epoch": 0.6558670017398028, + "grad_norm": 4.20328426361084, + "learning_rate": 4.340989804101272e-06, + "logits/chosen": 10.852849960327148, + "logits/rejected": 9.256386756896973, + "logps/chosen": -290.8721008300781, + "logps/rejected": -245.87806701660156, + "loss": 0.6535, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.24068480730056763, + "rewards/margins": 0.20167070627212524, + "rewards/rejected": 0.03901408612728119, + "step": 4241 + }, + { + "epoch": 0.656021650879567, + "grad_norm": 4.048861503601074, + "learning_rate": 4.340703402451599e-06, + "logits/chosen": 8.340597152709961, + "logits/rejected": 8.36566162109375, + "logps/chosen": -163.4769744873047, + "logps/rejected": -135.12185668945312, + "loss": 0.5529, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.00452122837305069, + "rewards/margins": 0.41734784841537476, + "rewards/rejected": -0.4128265976905823, + "step": 4242 + }, + { + "epoch": 0.6561763000193311, + "grad_norm": 4.394460201263428, + "learning_rate": 4.340417000801925e-06, + "logits/chosen": 6.019317150115967, + "logits/rejected": 2.7620127201080322, + "logps/chosen": -182.89154052734375, + "logps/rejected": -104.48013305664062, + "loss": 0.7581, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.32511982321739197, + "rewards/margins": 0.013522684574127197, + "rewards/rejected": -0.33864250779151917, + "step": 4243 + }, + { + "epoch": 0.6563309491590953, + "grad_norm": 7.645695686340332, + "learning_rate": 4.3401305991522515e-06, + "logits/chosen": 12.893644332885742, + "logits/rejected": 12.326776504516602, + "logps/chosen": -307.6593017578125, + "logps/rejected": -297.2375183105469, + "loss": 0.7687, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.16537103056907654, + "rewards/margins": -0.042695946991443634, + "rewards/rejected": 0.20806699991226196, + "step": 4244 + }, + { + "epoch": 0.6564855982988594, + "grad_norm": 6.866543292999268, + "learning_rate": 4.339844197502578e-06, + "logits/chosen": 7.2180094718933105, + "logits/rejected": 12.493921279907227, + "logps/chosen": -238.82546997070312, + "logps/rejected": -335.80462646484375, + "loss": 0.8607, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.0891430526971817, + "rewards/margins": -0.26331374049186707, + "rewards/rejected": 0.17417068779468536, + "step": 4245 + }, + { + "epoch": 0.6566402474386236, + "grad_norm": 3.879746675491333, + "learning_rate": 4.339557795852905e-06, + "logits/chosen": 12.361034393310547, + "logits/rejected": 7.865085124969482, + "logps/chosen": -253.78721618652344, + "logps/rejected": -316.2119140625, + "loss": 0.3618, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.07004080712795258, + "rewards/margins": 1.027457594871521, + "rewards/rejected": -0.957416832447052, + "step": 4246 + }, + { + "epoch": 0.6567948965783877, + "grad_norm": 6.232325077056885, + "learning_rate": 4.3392713942032315e-06, + "logits/chosen": 12.41917896270752, + "logits/rejected": 4.786000728607178, + "logps/chosen": -389.54949951171875, + "logps/rejected": -281.8065490722656, + "loss": 0.5207, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.1898013949394226, + "rewards/margins": 0.5042092800140381, + "rewards/rejected": -0.3144078552722931, + "step": 4247 + }, + { + "epoch": 0.6569495457181519, + "grad_norm": 7.850480556488037, + "learning_rate": 4.338984992553557e-06, + "logits/chosen": 10.405895233154297, + "logits/rejected": 8.956665992736816, + "logps/chosen": -301.396484375, + "logps/rejected": -286.8551330566406, + "loss": 0.6124, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.29000720381736755, + "rewards/margins": 0.27204352617263794, + "rewards/rejected": -0.5620507597923279, + "step": 4248 + }, + { + "epoch": 0.6571041948579162, + "grad_norm": 4.464251518249512, + "learning_rate": 4.338698590903884e-06, + "logits/chosen": 6.897383689880371, + "logits/rejected": 8.609350204467773, + "logps/chosen": -228.88211059570312, + "logps/rejected": -246.05857849121094, + "loss": 0.6018, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.12348346412181854, + "rewards/margins": 0.2385975867509842, + "rewards/rejected": -0.11511412262916565, + "step": 4249 + }, + { + "epoch": 0.6572588439976803, + "grad_norm": 4.924075603485107, + "learning_rate": 4.338412189254211e-06, + "logits/chosen": 7.3606157302856445, + "logits/rejected": 7.459295272827148, + "logps/chosen": -337.3214111328125, + "logps/rejected": -300.5233154296875, + "loss": 0.6147, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.07398763298988342, + "rewards/margins": 0.49985432624816895, + "rewards/rejected": -0.4258667230606079, + "step": 4250 + }, + { + "epoch": 0.6574134931374445, + "grad_norm": 5.979330062866211, + "learning_rate": 4.338125787604537e-06, + "logits/chosen": 10.71957778930664, + "logits/rejected": 15.592201232910156, + "logps/chosen": -293.1734924316406, + "logps/rejected": -359.5371398925781, + "loss": 0.6698, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.027801796793937683, + "rewards/margins": 0.26209214329719543, + "rewards/rejected": -0.2898939251899719, + "step": 4251 + }, + { + "epoch": 0.6575681422772086, + "grad_norm": 3.9610395431518555, + "learning_rate": 4.337839385954863e-06, + "logits/chosen": 6.776621341705322, + "logits/rejected": -0.9367251396179199, + "logps/chosen": -337.8732604980469, + "logps/rejected": -216.66941833496094, + "loss": 0.4719, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.05533876270055771, + "rewards/margins": 0.752065896987915, + "rewards/rejected": -0.8074046969413757, + "step": 4252 + }, + { + "epoch": 0.6577227914169728, + "grad_norm": 6.113455772399902, + "learning_rate": 4.33755298430519e-06, + "logits/chosen": 8.62043285369873, + "logits/rejected": 5.310030937194824, + "logps/chosen": -290.6543884277344, + "logps/rejected": -278.75689697265625, + "loss": 0.6221, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.05785989761352539, + "rewards/margins": 0.4392286539077759, + "rewards/rejected": -0.3813687860965729, + "step": 4253 + }, + { + "epoch": 0.6578774405567369, + "grad_norm": 7.48187255859375, + "learning_rate": 4.337266582655516e-06, + "logits/chosen": 13.339792251586914, + "logits/rejected": 4.544426918029785, + "logps/chosen": -404.67340087890625, + "logps/rejected": -328.5487060546875, + "loss": 0.5925, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.22495344281196594, + "rewards/margins": 0.2778828740119934, + "rewards/rejected": -0.5028363466262817, + "step": 4254 + }, + { + "epoch": 0.6580320896965011, + "grad_norm": 6.2130045890808105, + "learning_rate": 4.336980181005843e-06, + "logits/chosen": 9.765238761901855, + "logits/rejected": 9.637968063354492, + "logps/chosen": -267.8067932128906, + "logps/rejected": -344.5045166015625, + "loss": 0.8128, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.20931066572666168, + "rewards/margins": -0.15147018432617188, + "rewards/rejected": -0.057840496301651, + "step": 4255 + }, + { + "epoch": 0.6581867388362652, + "grad_norm": 5.443620681762695, + "learning_rate": 4.336693779356169e-06, + "logits/chosen": 5.635722637176514, + "logits/rejected": 10.607624053955078, + "logps/chosen": -183.70310974121094, + "logps/rejected": -185.73695373535156, + "loss": 0.7388, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.34342384338378906, + "rewards/margins": -0.03273334726691246, + "rewards/rejected": -0.3106904923915863, + "step": 4256 + }, + { + "epoch": 0.6583413879760294, + "grad_norm": 5.197975158691406, + "learning_rate": 4.3364073777064954e-06, + "logits/chosen": 8.877005577087402, + "logits/rejected": 7.446663856506348, + "logps/chosen": -285.80609130859375, + "logps/rejected": -295.2646789550781, + "loss": 0.5743, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.24882307648658752, + "rewards/margins": 0.31479302048683167, + "rewards/rejected": -0.06596995890140533, + "step": 4257 + }, + { + "epoch": 0.6584960371157935, + "grad_norm": 8.003013610839844, + "learning_rate": 4.336120976056822e-06, + "logits/chosen": 6.098358154296875, + "logits/rejected": 7.598957061767578, + "logps/chosen": -235.44683837890625, + "logps/rejected": -255.38650512695312, + "loss": 0.8314, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.5272551774978638, + "rewards/margins": -0.17554105818271637, + "rewards/rejected": -0.3517141342163086, + "step": 4258 + }, + { + "epoch": 0.6586506862555577, + "grad_norm": 5.2458577156066895, + "learning_rate": 4.335834574407149e-06, + "logits/chosen": 7.47066068649292, + "logits/rejected": 8.930221557617188, + "logps/chosen": -235.64450073242188, + "logps/rejected": -229.45272827148438, + "loss": 0.6839, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.10166683793067932, + "rewards/margins": 0.04539089649915695, + "rewards/rejected": -0.14705774188041687, + "step": 4259 + }, + { + "epoch": 0.6588053353953218, + "grad_norm": 12.956064224243164, + "learning_rate": 4.335548172757475e-06, + "logits/chosen": 2.390686273574829, + "logits/rejected": 1.0085773468017578, + "logps/chosen": -276.1402282714844, + "logps/rejected": -279.8263854980469, + "loss": 0.4848, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.08727875351905823, + "rewards/margins": 0.7267470359802246, + "rewards/rejected": -0.6394683122634888, + "step": 4260 + }, + { + "epoch": 0.658959984535086, + "grad_norm": 5.258927345275879, + "learning_rate": 4.335261771107802e-06, + "logits/chosen": 9.239931106567383, + "logits/rejected": 7.764420032501221, + "logps/chosen": -343.63128662109375, + "logps/rejected": -236.1065216064453, + "loss": 0.6428, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.06139403581619263, + "rewards/margins": 0.2517656087875366, + "rewards/rejected": -0.190371572971344, + "step": 4261 + }, + { + "epoch": 0.6591146336748502, + "grad_norm": 4.327454090118408, + "learning_rate": 4.334975369458129e-06, + "logits/chosen": 7.469304084777832, + "logits/rejected": 2.7950024604797363, + "logps/chosen": -233.54017639160156, + "logps/rejected": -168.6626434326172, + "loss": 0.5332, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.21875320374965668, + "rewards/margins": 0.5161008238792419, + "rewards/rejected": -0.7348540425300598, + "step": 4262 + }, + { + "epoch": 0.6592692828146144, + "grad_norm": 7.129507064819336, + "learning_rate": 4.3346889678084545e-06, + "logits/chosen": 0.9923268556594849, + "logits/rejected": 3.7270820140838623, + "logps/chosen": -203.24136352539062, + "logps/rejected": -240.93637084960938, + "loss": 0.9093, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.5524977445602417, + "rewards/margins": -0.25231146812438965, + "rewards/rejected": -0.30018624663352966, + "step": 4263 + }, + { + "epoch": 0.6594239319543785, + "grad_norm": 5.462010860443115, + "learning_rate": 4.334402566158781e-06, + "logits/chosen": 9.787393569946289, + "logits/rejected": 12.91716194152832, + "logps/chosen": -208.02406311035156, + "logps/rejected": -236.16030883789062, + "loss": 0.8064, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.19643601775169373, + "rewards/margins": -0.15476183593273163, + "rewards/rejected": -0.041674189269542694, + "step": 4264 + }, + { + "epoch": 0.6595785810941427, + "grad_norm": 4.6637067794799805, + "learning_rate": 4.334116164509108e-06, + "logits/chosen": 7.521944999694824, + "logits/rejected": 5.622187614440918, + "logps/chosen": -312.7679443359375, + "logps/rejected": -226.11282348632812, + "loss": 0.5584, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.09454327076673508, + "rewards/margins": 0.4101813733577728, + "rewards/rejected": -0.31563809514045715, + "step": 4265 + }, + { + "epoch": 0.6597332302339068, + "grad_norm": 5.969651699066162, + "learning_rate": 4.3338297628594344e-06, + "logits/chosen": 12.349713325500488, + "logits/rejected": 16.498817443847656, + "logps/chosen": -245.55372619628906, + "logps/rejected": -292.7457275390625, + "loss": 0.8999, + "rewards/accuracies": 0.125, + "rewards/chosen": -0.3214834928512573, + "rewards/margins": -0.3493770360946655, + "rewards/rejected": 0.02789352834224701, + "step": 4266 + }, + { + "epoch": 0.659887879373671, + "grad_norm": 3.572495460510254, + "learning_rate": 4.333543361209761e-06, + "logits/chosen": 9.650606155395508, + "logits/rejected": 10.575093269348145, + "logps/chosen": -207.3527374267578, + "logps/rejected": -262.22705078125, + "loss": 0.4973, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.24042657017707825, + "rewards/margins": 0.6499561071395874, + "rewards/rejected": -0.40952956676483154, + "step": 4267 + }, + { + "epoch": 0.6600425285134351, + "grad_norm": 5.476774215698242, + "learning_rate": 4.333256959560088e-06, + "logits/chosen": 8.378408432006836, + "logits/rejected": 6.034361362457275, + "logps/chosen": -282.72552490234375, + "logps/rejected": -283.5556335449219, + "loss": 0.6397, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.05240161344408989, + "rewards/margins": 0.1941196620464325, + "rewards/rejected": -0.1417180597782135, + "step": 4268 + }, + { + "epoch": 0.6601971776531993, + "grad_norm": 4.192002773284912, + "learning_rate": 4.3329705579104135e-06, + "logits/chosen": 10.967140197753906, + "logits/rejected": -2.1587822437286377, + "logps/chosen": -217.4373779296875, + "logps/rejected": -103.82136535644531, + "loss": 0.5382, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.2202337235212326, + "rewards/margins": 0.514327347278595, + "rewards/rejected": -0.734561026096344, + "step": 4269 + }, + { + "epoch": 0.6603518267929634, + "grad_norm": 3.051832437515259, + "learning_rate": 4.33268415626074e-06, + "logits/chosen": 14.620075225830078, + "logits/rejected": 2.9050979614257812, + "logps/chosen": -289.6891174316406, + "logps/rejected": -166.1214599609375, + "loss": 0.374, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.273196816444397, + "rewards/margins": 1.0187900066375732, + "rewards/rejected": -0.745593249797821, + "step": 4270 + }, + { + "epoch": 0.6605064759327276, + "grad_norm": 4.67549991607666, + "learning_rate": 4.332397754611067e-06, + "logits/chosen": 10.286325454711914, + "logits/rejected": 9.392341613769531, + "logps/chosen": -225.73623657226562, + "logps/rejected": -194.42324829101562, + "loss": 0.6069, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.168339803814888, + "rewards/margins": 0.30439293384552, + "rewards/rejected": -0.4727327823638916, + "step": 4271 + }, + { + "epoch": 0.6606611250724918, + "grad_norm": 6.0189337730407715, + "learning_rate": 4.3321113529613935e-06, + "logits/chosen": 8.788934707641602, + "logits/rejected": 5.407052993774414, + "logps/chosen": -260.3956604003906, + "logps/rejected": -202.4595184326172, + "loss": 0.5524, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.10741932690143585, + "rewards/margins": 0.426333487033844, + "rewards/rejected": -0.5337527990341187, + "step": 4272 + }, + { + "epoch": 0.6608157742122559, + "grad_norm": 7.782709121704102, + "learning_rate": 4.33182495131172e-06, + "logits/chosen": 4.877094745635986, + "logits/rejected": 9.291874885559082, + "logps/chosen": -205.60812377929688, + "logps/rejected": -262.8767395019531, + "loss": 0.9725, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.5181944370269775, + "rewards/margins": -0.37633776664733887, + "rewards/rejected": -0.14185667037963867, + "step": 4273 + }, + { + "epoch": 0.66097042335202, + "grad_norm": 4.9661078453063965, + "learning_rate": 4.331538549662047e-06, + "logits/chosen": 10.442291259765625, + "logits/rejected": 7.284004211425781, + "logps/chosen": -304.2198486328125, + "logps/rejected": -188.07688903808594, + "loss": 0.5088, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.16642338037490845, + "rewards/margins": 0.5892209410667419, + "rewards/rejected": -0.4227975904941559, + "step": 4274 + }, + { + "epoch": 0.6611250724917843, + "grad_norm": 5.621748447418213, + "learning_rate": 4.3312521480123734e-06, + "logits/chosen": 1.8635404109954834, + "logits/rejected": 7.7932281494140625, + "logps/chosen": -124.76582336425781, + "logps/rejected": -146.65882873535156, + "loss": 0.7726, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.5092434883117676, + "rewards/margins": 0.07121413946151733, + "rewards/rejected": -0.5804575681686401, + "step": 4275 + }, + { + "epoch": 0.6612797216315485, + "grad_norm": 5.874105930328369, + "learning_rate": 4.330965746362699e-06, + "logits/chosen": 7.3795294761657715, + "logits/rejected": 7.9654083251953125, + "logps/chosen": -237.1683807373047, + "logps/rejected": -271.6481018066406, + "loss": 0.8578, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.4786272644996643, + "rewards/margins": -0.23775315284729004, + "rewards/rejected": -0.24087411165237427, + "step": 4276 + }, + { + "epoch": 0.6614343707713126, + "grad_norm": 6.900094985961914, + "learning_rate": 4.330679344713026e-06, + "logits/chosen": 9.951662063598633, + "logits/rejected": 8.348591804504395, + "logps/chosen": -312.5068359375, + "logps/rejected": -278.4826354980469, + "loss": 0.8105, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.34500110149383545, + "rewards/margins": -0.059378936886787415, + "rewards/rejected": -0.28562214970588684, + "step": 4277 + }, + { + "epoch": 0.6615890199110768, + "grad_norm": 6.97905158996582, + "learning_rate": 4.3303929430633526e-06, + "logits/chosen": 10.829272270202637, + "logits/rejected": 11.108304977416992, + "logps/chosen": -143.76087951660156, + "logps/rejected": -167.43203735351562, + "loss": 0.6705, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.2697776257991791, + "rewards/margins": 0.32433944940567017, + "rewards/rejected": -0.5941171050071716, + "step": 4278 + }, + { + "epoch": 0.6617436690508409, + "grad_norm": 5.425868034362793, + "learning_rate": 4.330106541413679e-06, + "logits/chosen": 9.530460357666016, + "logits/rejected": 6.96138858795166, + "logps/chosen": -223.8497772216797, + "logps/rejected": -244.656494140625, + "loss": 0.7347, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.27032825350761414, + "rewards/margins": -0.01611112430691719, + "rewards/rejected": -0.25421711802482605, + "step": 4279 + }, + { + "epoch": 0.6618983181906051, + "grad_norm": 4.473358631134033, + "learning_rate": 4.329820139764006e-06, + "logits/chosen": 11.572851181030273, + "logits/rejected": 6.405040740966797, + "logps/chosen": -260.8026428222656, + "logps/rejected": -260.5738525390625, + "loss": 0.5272, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.11440470814704895, + "rewards/margins": 0.5435508489608765, + "rewards/rejected": -0.6579555869102478, + "step": 4280 + }, + { + "epoch": 0.6620529673303692, + "grad_norm": 4.560245037078857, + "learning_rate": 4.3295337381143325e-06, + "logits/chosen": 8.806246757507324, + "logits/rejected": 11.50917911529541, + "logps/chosen": -256.7647705078125, + "logps/rejected": -319.9904479980469, + "loss": 0.5967, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.011428158730268478, + "rewards/margins": 0.3467073142528534, + "rewards/rejected": -0.3352791368961334, + "step": 4281 + }, + { + "epoch": 0.6622076164701334, + "grad_norm": 4.245309352874756, + "learning_rate": 4.329247336464658e-06, + "logits/chosen": 8.630610466003418, + "logits/rejected": -2.6741414070129395, + "logps/chosen": -241.89242553710938, + "logps/rejected": -156.9348907470703, + "loss": 0.6305, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.03767205774784088, + "rewards/margins": 0.18159788846969604, + "rewards/rejected": -0.21926994621753693, + "step": 4282 + }, + { + "epoch": 0.6623622656098975, + "grad_norm": 5.839372158050537, + "learning_rate": 4.328960934814985e-06, + "logits/chosen": 11.801851272583008, + "logits/rejected": 9.311161041259766, + "logps/chosen": -218.84344482421875, + "logps/rejected": -238.715087890625, + "loss": 0.7612, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.27069205045700073, + "rewards/margins": 0.1671137660741806, + "rewards/rejected": -0.43780583143234253, + "step": 4283 + }, + { + "epoch": 0.6625169147496617, + "grad_norm": 9.560470581054688, + "learning_rate": 4.328674533165312e-06, + "logits/chosen": 6.968115329742432, + "logits/rejected": 12.529481887817383, + "logps/chosen": -246.2449493408203, + "logps/rejected": -286.8275451660156, + "loss": 0.6173, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.24121533334255219, + "rewards/margins": 0.2748055160045624, + "rewards/rejected": -0.5160208344459534, + "step": 4284 + }, + { + "epoch": 0.6626715638894258, + "grad_norm": 5.678939342498779, + "learning_rate": 4.328388131515638e-06, + "logits/chosen": 9.270954132080078, + "logits/rejected": 2.9593586921691895, + "logps/chosen": -342.0398864746094, + "logps/rejected": -291.84246826171875, + "loss": 0.6204, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.25605183839797974, + "rewards/margins": 0.2629515826702118, + "rewards/rejected": -0.006899736821651459, + "step": 4285 + }, + { + "epoch": 0.66282621302919, + "grad_norm": 4.40380334854126, + "learning_rate": 4.328101729865964e-06, + "logits/chosen": 11.340071678161621, + "logits/rejected": 1.762393832206726, + "logps/chosen": -220.08724975585938, + "logps/rejected": -103.06749725341797, + "loss": 0.5901, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.2431265115737915, + "rewards/margins": 0.32804006338119507, + "rewards/rejected": -0.5711665749549866, + "step": 4286 + }, + { + "epoch": 0.6629808621689541, + "grad_norm": 4.827279090881348, + "learning_rate": 4.327815328216291e-06, + "logits/chosen": 6.844410419464111, + "logits/rejected": 10.561067581176758, + "logps/chosen": -229.61019897460938, + "logps/rejected": -248.7239990234375, + "loss": 0.6249, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.19440661370754242, + "rewards/margins": 0.1983400285243988, + "rewards/rejected": -0.39274662733078003, + "step": 4287 + }, + { + "epoch": 0.6631355113087184, + "grad_norm": 6.126718521118164, + "learning_rate": 4.327528926566617e-06, + "logits/chosen": 9.450445175170898, + "logits/rejected": 7.474595069885254, + "logps/chosen": -356.580078125, + "logps/rejected": -327.7660217285156, + "loss": 0.718, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.18197079002857208, + "rewards/margins": 0.013925556093454361, + "rewards/rejected": -0.19589634239673615, + "step": 4288 + }, + { + "epoch": 0.6632901604484825, + "grad_norm": 5.293323516845703, + "learning_rate": 4.327242524916944e-06, + "logits/chosen": 14.445408821105957, + "logits/rejected": 11.202006340026855, + "logps/chosen": -409.4034118652344, + "logps/rejected": -336.47174072265625, + "loss": 0.5409, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.13379192352294922, + "rewards/margins": 0.45122969150543213, + "rewards/rejected": -0.3174377381801605, + "step": 4289 + }, + { + "epoch": 0.6634448095882467, + "grad_norm": 4.291279315948486, + "learning_rate": 4.32695612326727e-06, + "logits/chosen": 15.586651802062988, + "logits/rejected": 7.508134841918945, + "logps/chosen": -445.3642272949219, + "logps/rejected": -310.2279357910156, + "loss": 0.449, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.1344788521528244, + "rewards/margins": 0.7428315281867981, + "rewards/rejected": -0.6083526611328125, + "step": 4290 + }, + { + "epoch": 0.6635994587280108, + "grad_norm": 22.539995193481445, + "learning_rate": 4.3266697216175965e-06, + "logits/chosen": 7.240866661071777, + "logits/rejected": 3.2018611431121826, + "logps/chosen": -428.97357177734375, + "logps/rejected": -424.547119140625, + "loss": 0.7442, + "rewards/accuracies": 0.25, + "rewards/chosen": 0.5236403346061707, + "rewards/margins": -1.790374517440796e-05, + "rewards/rejected": 0.523658275604248, + "step": 4291 + }, + { + "epoch": 0.663754107867775, + "grad_norm": 5.53656530380249, + "learning_rate": 4.326383319967923e-06, + "logits/chosen": 10.309662818908691, + "logits/rejected": -0.15265804529190063, + "logps/chosen": -282.2122802734375, + "logps/rejected": -171.11830139160156, + "loss": 0.5686, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.06645546108484268, + "rewards/margins": 0.4247966408729553, + "rewards/rejected": -0.49125218391418457, + "step": 4292 + }, + { + "epoch": 0.6639087570075392, + "grad_norm": 5.4069294929504395, + "learning_rate": 4.32609691831825e-06, + "logits/chosen": 13.59645938873291, + "logits/rejected": 9.281729698181152, + "logps/chosen": -222.18453979492188, + "logps/rejected": -152.1209259033203, + "loss": 0.742, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.32295939326286316, + "rewards/margins": -0.025439254939556122, + "rewards/rejected": -0.29752013087272644, + "step": 4293 + }, + { + "epoch": 0.6640634061473033, + "grad_norm": 5.998040676116943, + "learning_rate": 4.325810516668576e-06, + "logits/chosen": 8.16534423828125, + "logits/rejected": 7.85731315612793, + "logps/chosen": -404.62542724609375, + "logps/rejected": -334.340576171875, + "loss": 0.6027, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.05921955406665802, + "rewards/margins": 0.3341347575187683, + "rewards/rejected": -0.27491524815559387, + "step": 4294 + }, + { + "epoch": 0.6642180552870675, + "grad_norm": 6.4560418128967285, + "learning_rate": 4.325524115018903e-06, + "logits/chosen": 6.789648056030273, + "logits/rejected": 9.997292518615723, + "logps/chosen": -206.29624938964844, + "logps/rejected": -315.99078369140625, + "loss": 0.6727, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.011142492294311523, + "rewards/margins": 0.1732703447341919, + "rewards/rejected": -0.16212786734104156, + "step": 4295 + }, + { + "epoch": 0.6643727044268316, + "grad_norm": 6.536364555358887, + "learning_rate": 4.325237713369229e-06, + "logits/chosen": 10.937895774841309, + "logits/rejected": 10.581141471862793, + "logps/chosen": -346.8070068359375, + "logps/rejected": -271.35797119140625, + "loss": 0.682, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.2658529281616211, + "rewards/margins": 0.08295775204896927, + "rewards/rejected": -0.34881070256233215, + "step": 4296 + }, + { + "epoch": 0.6645273535665958, + "grad_norm": 5.573677062988281, + "learning_rate": 4.3249513117195555e-06, + "logits/chosen": 12.538296699523926, + "logits/rejected": 5.398101806640625, + "logps/chosen": -374.0251770019531, + "logps/rejected": -247.6549530029297, + "loss": 0.5247, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.1656876802444458, + "rewards/margins": 0.5455344915390015, + "rewards/rejected": -0.37984687089920044, + "step": 4297 + }, + { + "epoch": 0.6646820027063599, + "grad_norm": 5.437753677368164, + "learning_rate": 4.324664910069882e-06, + "logits/chosen": 12.648083686828613, + "logits/rejected": 12.451542854309082, + "logps/chosen": -216.94808959960938, + "logps/rejected": -206.02369689941406, + "loss": 0.7463, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.2890317440032959, + "rewards/margins": 0.10365095734596252, + "rewards/rejected": -0.3926827013492584, + "step": 4298 + }, + { + "epoch": 0.6648366518461241, + "grad_norm": 9.735902786254883, + "learning_rate": 4.324378508420209e-06, + "logits/chosen": 4.4970879554748535, + "logits/rejected": 9.336906433105469, + "logps/chosen": -297.6566162109375, + "logps/rejected": -491.55145263671875, + "loss": 0.9495, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.5635269284248352, + "rewards/margins": -0.16001738607883453, + "rewards/rejected": -0.4035094976425171, + "step": 4299 + }, + { + "epoch": 0.6649913009858882, + "grad_norm": 5.67490291595459, + "learning_rate": 4.3240921067705355e-06, + "logits/chosen": 17.460716247558594, + "logits/rejected": 5.8881378173828125, + "logps/chosen": -357.1310729980469, + "logps/rejected": -159.79966735839844, + "loss": 0.709, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.3228677809238434, + "rewards/margins": 0.021592095494270325, + "rewards/rejected": -0.3444598615169525, + "step": 4300 + }, + { + "epoch": 0.6651459501256525, + "grad_norm": 4.3194403648376465, + "learning_rate": 4.323805705120862e-06, + "logits/chosen": 7.022460460662842, + "logits/rejected": 7.644617080688477, + "logps/chosen": -172.9615020751953, + "logps/rejected": -145.02493286132812, + "loss": 0.6729, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.2406686246395111, + "rewards/margins": 0.08030257374048233, + "rewards/rejected": -0.32097119092941284, + "step": 4301 + }, + { + "epoch": 0.6653005992654166, + "grad_norm": 4.654993534088135, + "learning_rate": 4.323519303471188e-06, + "logits/chosen": 12.267364501953125, + "logits/rejected": 7.867785453796387, + "logps/chosen": -273.8438720703125, + "logps/rejected": -257.78265380859375, + "loss": 0.4982, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.07739315927028656, + "rewards/margins": 0.5061399340629578, + "rewards/rejected": -0.5835331082344055, + "step": 4302 + }, + { + "epoch": 0.6654552484051808, + "grad_norm": 4.152048110961914, + "learning_rate": 4.323232901821515e-06, + "logits/chosen": 8.7733736038208, + "logits/rejected": 2.4716219902038574, + "logps/chosen": -257.78582763671875, + "logps/rejected": -141.50247192382812, + "loss": 0.6444, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.24107535183429718, + "rewards/margins": 0.20370645821094513, + "rewards/rejected": -0.44478175044059753, + "step": 4303 + }, + { + "epoch": 0.6656098975449449, + "grad_norm": 7.598574638366699, + "learning_rate": 4.322946500171841e-06, + "logits/chosen": 12.207249641418457, + "logits/rejected": 2.4437835216522217, + "logps/chosen": -417.4272155761719, + "logps/rejected": -250.96444702148438, + "loss": 0.6869, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.053061626851558685, + "rewards/margins": 0.200920969247818, + "rewards/rejected": -0.2539826035499573, + "step": 4304 + }, + { + "epoch": 0.6657645466847091, + "grad_norm": 3.6164698600769043, + "learning_rate": 4.322660098522168e-06, + "logits/chosen": 8.91761589050293, + "logits/rejected": 1.9214706420898438, + "logps/chosen": -292.17584228515625, + "logps/rejected": -209.76954650878906, + "loss": 0.433, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.10923368483781815, + "rewards/margins": 0.700827419757843, + "rewards/rejected": -0.5915937423706055, + "step": 4305 + }, + { + "epoch": 0.6659191958244732, + "grad_norm": 4.041393280029297, + "learning_rate": 4.3223736968724945e-06, + "logits/chosen": 9.307053565979004, + "logits/rejected": 7.442022323608398, + "logps/chosen": -292.13287353515625, + "logps/rejected": -233.47157287597656, + "loss": 0.573, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.18764419853687286, + "rewards/margins": 0.4046606421470642, + "rewards/rejected": -0.21701645851135254, + "step": 4306 + }, + { + "epoch": 0.6660738449642374, + "grad_norm": 5.438999176025391, + "learning_rate": 4.322087295222821e-06, + "logits/chosen": 12.889373779296875, + "logits/rejected": 6.3512091636657715, + "logps/chosen": -265.8472900390625, + "logps/rejected": -161.15321350097656, + "loss": 0.656, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.21987418830394745, + "rewards/margins": 0.19599267840385437, + "rewards/rejected": -0.4158668518066406, + "step": 4307 + }, + { + "epoch": 0.6662284941040015, + "grad_norm": 5.292679786682129, + "learning_rate": 4.321800893573148e-06, + "logits/chosen": 6.2649359703063965, + "logits/rejected": 9.074081420898438, + "logps/chosen": -343.22161865234375, + "logps/rejected": -397.093505859375, + "loss": 0.5503, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.12882918119430542, + "rewards/margins": 0.5189768671989441, + "rewards/rejected": -0.39014768600463867, + "step": 4308 + }, + { + "epoch": 0.6663831432437657, + "grad_norm": 5.072567939758301, + "learning_rate": 4.321514491923474e-06, + "logits/chosen": 9.413214683532715, + "logits/rejected": 7.422295570373535, + "logps/chosen": -225.78944396972656, + "logps/rejected": -247.0975799560547, + "loss": 0.6059, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.3850398361682892, + "rewards/margins": 0.3058340549468994, + "rewards/rejected": -0.690873920917511, + "step": 4309 + }, + { + "epoch": 0.6665377923835298, + "grad_norm": 5.56306791305542, + "learning_rate": 4.3212280902738e-06, + "logits/chosen": 4.979337215423584, + "logits/rejected": 4.984701156616211, + "logps/chosen": -295.468994140625, + "logps/rejected": -372.0696105957031, + "loss": 0.5649, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.09733524918556213, + "rewards/margins": 0.3926045000553131, + "rewards/rejected": -0.2952692210674286, + "step": 4310 + }, + { + "epoch": 0.666692441523294, + "grad_norm": 4.59020471572876, + "learning_rate": 4.320941688624127e-06, + "logits/chosen": 4.973601341247559, + "logits/rejected": 3.915710926055908, + "logps/chosen": -208.26828002929688, + "logps/rejected": -206.94432067871094, + "loss": 0.5671, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.27557921409606934, + "rewards/margins": 0.39918094873428345, + "rewards/rejected": -0.12360171973705292, + "step": 4311 + }, + { + "epoch": 0.6668470906630581, + "grad_norm": 6.568686485290527, + "learning_rate": 4.320655286974454e-06, + "logits/chosen": 6.38916015625, + "logits/rejected": 10.807207107543945, + "logps/chosen": -345.784423828125, + "logps/rejected": -344.0394592285156, + "loss": 0.798, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.3488273620605469, + "rewards/margins": -0.14115303754806519, + "rewards/rejected": -0.2076743245124817, + "step": 4312 + }, + { + "epoch": 0.6670017398028224, + "grad_norm": 5.443709850311279, + "learning_rate": 4.32036888532478e-06, + "logits/chosen": 4.333506107330322, + "logits/rejected": 5.4422502517700195, + "logps/chosen": -226.25418090820312, + "logps/rejected": -231.79800415039062, + "loss": 0.7051, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.33224472403526306, + "rewards/margins": 0.1515967845916748, + "rewards/rejected": -0.48384150862693787, + "step": 4313 + }, + { + "epoch": 0.6671563889425866, + "grad_norm": 4.064946174621582, + "learning_rate": 4.320082483675107e-06, + "logits/chosen": 11.330009460449219, + "logits/rejected": 8.791619300842285, + "logps/chosen": -274.75244140625, + "logps/rejected": -220.40699768066406, + "loss": 0.6556, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.074774369597435, + "rewards/margins": 0.19703851640224457, + "rewards/rejected": -0.12226416170597076, + "step": 4314 + }, + { + "epoch": 0.6673110380823507, + "grad_norm": 3.7393038272857666, + "learning_rate": 4.319796082025433e-06, + "logits/chosen": 15.827827453613281, + "logits/rejected": 2.6591029167175293, + "logps/chosen": -356.34600830078125, + "logps/rejected": -202.3115997314453, + "loss": 0.382, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.0466192290186882, + "rewards/margins": 0.8728972673416138, + "rewards/rejected": -0.8262780904769897, + "step": 4315 + }, + { + "epoch": 0.6674656872221149, + "grad_norm": 5.586511611938477, + "learning_rate": 4.319509680375759e-06, + "logits/chosen": 10.831878662109375, + "logits/rejected": 5.411952018737793, + "logps/chosen": -277.65252685546875, + "logps/rejected": -193.48443603515625, + "loss": 0.7262, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.0832025408744812, + "rewards/margins": 0.01356622576713562, + "rewards/rejected": -0.09676878154277802, + "step": 4316 + }, + { + "epoch": 0.667620336361879, + "grad_norm": 5.125912189483643, + "learning_rate": 4.319223278726086e-06, + "logits/chosen": 11.209256172180176, + "logits/rejected": 6.126062393188477, + "logps/chosen": -355.3177795410156, + "logps/rejected": -279.807861328125, + "loss": 0.5187, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.029079005122184753, + "rewards/margins": 0.5901710987091064, + "rewards/rejected": -0.6192500591278076, + "step": 4317 + }, + { + "epoch": 0.6677749855016432, + "grad_norm": 4.7180609703063965, + "learning_rate": 4.318936877076413e-06, + "logits/chosen": 11.925938606262207, + "logits/rejected": 16.087594985961914, + "logps/chosen": -222.77183532714844, + "logps/rejected": -241.7539520263672, + "loss": 0.7657, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.14713388681411743, + "rewards/margins": -0.03299078345298767, + "rewards/rejected": -0.11414310336112976, + "step": 4318 + }, + { + "epoch": 0.6679296346414073, + "grad_norm": 5.248588562011719, + "learning_rate": 4.318650475426739e-06, + "logits/chosen": 5.701940536499023, + "logits/rejected": 2.073434829711914, + "logps/chosen": -338.78961181640625, + "logps/rejected": -226.2742462158203, + "loss": 0.4987, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.1251198798418045, + "rewards/margins": 0.6047331094741821, + "rewards/rejected": -0.4796132445335388, + "step": 4319 + }, + { + "epoch": 0.6680842837811715, + "grad_norm": 8.054244041442871, + "learning_rate": 4.318364073777065e-06, + "logits/chosen": 9.712955474853516, + "logits/rejected": 7.13339900970459, + "logps/chosen": -343.57763671875, + "logps/rejected": -331.231201171875, + "loss": 0.7035, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.4916762709617615, + "rewards/margins": 0.04455384612083435, + "rewards/rejected": -0.5362300872802734, + "step": 4320 + }, + { + "epoch": 0.6682389329209356, + "grad_norm": 5.975429534912109, + "learning_rate": 4.318077672127392e-06, + "logits/chosen": 8.411194801330566, + "logits/rejected": 4.47360897064209, + "logps/chosen": -240.09014892578125, + "logps/rejected": -204.2136688232422, + "loss": 0.7309, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.31958699226379395, + "rewards/margins": 0.06880658864974976, + "rewards/rejected": -0.3883935809135437, + "step": 4321 + }, + { + "epoch": 0.6683935820606998, + "grad_norm": 7.254704475402832, + "learning_rate": 4.317791270477718e-06, + "logits/chosen": 8.755146980285645, + "logits/rejected": 2.7581326961517334, + "logps/chosen": -409.25262451171875, + "logps/rejected": -284.9300842285156, + "loss": 0.8369, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.43356791138648987, + "rewards/margins": -0.1361943781375885, + "rewards/rejected": -0.29737353324890137, + "step": 4322 + }, + { + "epoch": 0.6685482312004639, + "grad_norm": 7.898281097412109, + "learning_rate": 4.317504868828045e-06, + "logits/chosen": 8.513710021972656, + "logits/rejected": 11.34129524230957, + "logps/chosen": -278.28265380859375, + "logps/rejected": -290.51800537109375, + "loss": 0.6963, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.19352112710475922, + "rewards/margins": 0.0626763105392456, + "rewards/rejected": -0.256197452545166, + "step": 4323 + }, + { + "epoch": 0.6687028803402281, + "grad_norm": 5.849246025085449, + "learning_rate": 4.317218467178371e-06, + "logits/chosen": 12.781208992004395, + "logits/rejected": 8.761346817016602, + "logps/chosen": -222.95230102539062, + "logps/rejected": -221.46905517578125, + "loss": 0.6919, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.4186118245124817, + "rewards/margins": 0.0954810231924057, + "rewards/rejected": -0.5140928030014038, + "step": 4324 + }, + { + "epoch": 0.6688575294799922, + "grad_norm": 5.6869587898254395, + "learning_rate": 4.3169320655286975e-06, + "logits/chosen": 11.598677635192871, + "logits/rejected": 8.498250007629395, + "logps/chosen": -250.36099243164062, + "logps/rejected": -244.9032745361328, + "loss": 0.7611, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.17815251648426056, + "rewards/margins": -0.07841001451015472, + "rewards/rejected": -0.09974252432584763, + "step": 4325 + }, + { + "epoch": 0.6690121786197565, + "grad_norm": 5.009400844573975, + "learning_rate": 4.316645663879024e-06, + "logits/chosen": 12.952972412109375, + "logits/rejected": 4.941444396972656, + "logps/chosen": -316.83526611328125, + "logps/rejected": -236.72406005859375, + "loss": 0.645, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.2974834442138672, + "rewards/margins": 0.21905556321144104, + "rewards/rejected": -0.5165389776229858, + "step": 4326 + }, + { + "epoch": 0.6691668277595206, + "grad_norm": 5.019449710845947, + "learning_rate": 4.316359262229351e-06, + "logits/chosen": 9.394136428833008, + "logits/rejected": 9.109855651855469, + "logps/chosen": -247.48443603515625, + "logps/rejected": -299.04351806640625, + "loss": 0.5213, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.10212001204490662, + "rewards/margins": 0.6044638752937317, + "rewards/rejected": -0.5023438930511475, + "step": 4327 + }, + { + "epoch": 0.6693214768992848, + "grad_norm": 3.848778486251831, + "learning_rate": 4.3160728605796775e-06, + "logits/chosen": 12.363107681274414, + "logits/rejected": 12.56724739074707, + "logps/chosen": -160.00241088867188, + "logps/rejected": -191.36505126953125, + "loss": 0.5652, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.08544743061065674, + "rewards/margins": 0.46466854214668274, + "rewards/rejected": -0.5501160025596619, + "step": 4328 + }, + { + "epoch": 0.6694761260390489, + "grad_norm": 8.397557258605957, + "learning_rate": 4.315786458930003e-06, + "logits/chosen": 5.244521141052246, + "logits/rejected": 1.1233984231948853, + "logps/chosen": -292.2578430175781, + "logps/rejected": -207.8626708984375, + "loss": 0.9983, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.7188864946365356, + "rewards/margins": -0.4574933350086212, + "rewards/rejected": -0.26139315962791443, + "step": 4329 + }, + { + "epoch": 0.6696307751788131, + "grad_norm": 23.381149291992188, + "learning_rate": 4.31550005728033e-06, + "logits/chosen": 11.058280944824219, + "logits/rejected": 4.830230236053467, + "logps/chosen": -375.2982177734375, + "logps/rejected": -342.3121032714844, + "loss": 0.6342, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.1727220118045807, + "rewards/margins": 0.39396876096725464, + "rewards/rejected": -0.22124677896499634, + "step": 4330 + }, + { + "epoch": 0.6697854243185772, + "grad_norm": 8.040107727050781, + "learning_rate": 4.3152136556306566e-06, + "logits/chosen": 7.47574520111084, + "logits/rejected": 3.1202948093414307, + "logps/chosen": -261.68310546875, + "logps/rejected": -232.18063354492188, + "loss": 0.8969, + "rewards/accuracies": 0.25, + "rewards/chosen": -0.4198048412799835, + "rewards/margins": -0.22327278554439545, + "rewards/rejected": -0.19653202593326569, + "step": 4331 + }, + { + "epoch": 0.6699400734583414, + "grad_norm": 4.902009010314941, + "learning_rate": 4.314927253980983e-06, + "logits/chosen": 8.587291717529297, + "logits/rejected": 5.095376491546631, + "logps/chosen": -260.6650695800781, + "logps/rejected": -259.49822998046875, + "loss": 0.4508, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.2044256031513214, + "rewards/margins": 0.6125047206878662, + "rewards/rejected": -0.81693035364151, + "step": 4332 + }, + { + "epoch": 0.6700947225981055, + "grad_norm": 7.022604465484619, + "learning_rate": 4.31464085233131e-06, + "logits/chosen": 6.559743881225586, + "logits/rejected": 0.9831100702285767, + "logps/chosen": -205.47491455078125, + "logps/rejected": -126.33174896240234, + "loss": 0.7315, + "rewards/accuracies": 0.25, + "rewards/chosen": -0.6412868499755859, + "rewards/margins": -0.048475492745637894, + "rewards/rejected": -0.5928113460540771, + "step": 4333 + }, + { + "epoch": 0.6702493717378697, + "grad_norm": 6.118412017822266, + "learning_rate": 4.3143544506816365e-06, + "logits/chosen": 16.033222198486328, + "logits/rejected": 7.6022539138793945, + "logps/chosen": -440.2850341796875, + "logps/rejected": -395.99530029296875, + "loss": 0.588, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.18403244018554688, + "rewards/margins": 0.5059230923652649, + "rewards/rejected": -0.3218906819820404, + "step": 4334 + }, + { + "epoch": 0.6704040208776338, + "grad_norm": 5.674912929534912, + "learning_rate": 4.314068049031962e-06, + "logits/chosen": 12.06112003326416, + "logits/rejected": -0.8558475971221924, + "logps/chosen": -401.1207275390625, + "logps/rejected": -296.3409423828125, + "loss": 0.5808, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.3929820954799652, + "rewards/margins": 0.8912266492843628, + "rewards/rejected": -1.2842087745666504, + "step": 4335 + }, + { + "epoch": 0.670558670017398, + "grad_norm": 4.7469305992126465, + "learning_rate": 4.313781647382289e-06, + "logits/chosen": 4.4329328536987305, + "logits/rejected": 3.6825971603393555, + "logps/chosen": -181.5720672607422, + "logps/rejected": -220.92105102539062, + "loss": 0.5705, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.3711830973625183, + "rewards/margins": 0.3734095096588135, + "rewards/rejected": -0.7445926666259766, + "step": 4336 + }, + { + "epoch": 0.6707133191571621, + "grad_norm": 4.592440128326416, + "learning_rate": 4.313495245732616e-06, + "logits/chosen": 3.863528251647949, + "logits/rejected": 7.95263671875, + "logps/chosen": -168.6116485595703, + "logps/rejected": -206.87734985351562, + "loss": 0.6461, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.012716487050056458, + "rewards/margins": 0.20782911777496338, + "rewards/rejected": -0.22054558992385864, + "step": 4337 + }, + { + "epoch": 0.6708679682969263, + "grad_norm": 5.196115493774414, + "learning_rate": 4.313208844082942e-06, + "logits/chosen": 5.945255756378174, + "logits/rejected": 2.1601343154907227, + "logps/chosen": -439.0337829589844, + "logps/rejected": -377.760986328125, + "loss": 0.4033, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.30622929334640503, + "rewards/margins": 0.9300085306167603, + "rewards/rejected": -0.6237791776657104, + "step": 4338 + }, + { + "epoch": 0.6710226174366906, + "grad_norm": 8.12320613861084, + "learning_rate": 4.312922442433269e-06, + "logits/chosen": 11.290288925170898, + "logits/rejected": 6.534239768981934, + "logps/chosen": -225.21377563476562, + "logps/rejected": -166.76809692382812, + "loss": 0.7357, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.3592991828918457, + "rewards/margins": -0.04937276244163513, + "rewards/rejected": -0.30992642045021057, + "step": 4339 + }, + { + "epoch": 0.6711772665764547, + "grad_norm": 5.785003185272217, + "learning_rate": 4.3126360407835956e-06, + "logits/chosen": 10.92134952545166, + "logits/rejected": 9.446928977966309, + "logps/chosen": -324.3086853027344, + "logps/rejected": -291.07965087890625, + "loss": 0.6462, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.11384683102369308, + "rewards/margins": 0.19888365268707275, + "rewards/rejected": -0.31273049116134644, + "step": 4340 + }, + { + "epoch": 0.6713319157162189, + "grad_norm": 5.306368827819824, + "learning_rate": 4.312349639133922e-06, + "logits/chosen": 9.784195899963379, + "logits/rejected": 5.088803768157959, + "logps/chosen": -294.5977478027344, + "logps/rejected": -251.1013641357422, + "loss": 0.675, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.169917032122612, + "rewards/margins": 0.2355196177959442, + "rewards/rejected": -0.405436635017395, + "step": 4341 + }, + { + "epoch": 0.671486564855983, + "grad_norm": 5.676792621612549, + "learning_rate": 4.312063237484248e-06, + "logits/chosen": 8.502631187438965, + "logits/rejected": 3.6801257133483887, + "logps/chosen": -326.35186767578125, + "logps/rejected": -221.66177368164062, + "loss": 0.7225, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.36491233110427856, + "rewards/margins": 0.19939759373664856, + "rewards/rejected": -0.5643098950386047, + "step": 4342 + }, + { + "epoch": 0.6716412139957472, + "grad_norm": 6.94557523727417, + "learning_rate": 4.311776835834575e-06, + "logits/chosen": 11.946405410766602, + "logits/rejected": 11.808531761169434, + "logps/chosen": -373.92987060546875, + "logps/rejected": -304.0584411621094, + "loss": 0.8125, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.1062505692243576, + "rewards/margins": -0.066680908203125, + "rewards/rejected": -0.039569661021232605, + "step": 4343 + }, + { + "epoch": 0.6717958631355113, + "grad_norm": 4.628807067871094, + "learning_rate": 4.311490434184901e-06, + "logits/chosen": 7.7528228759765625, + "logits/rejected": 6.748226165771484, + "logps/chosen": -374.65594482421875, + "logps/rejected": -311.2047424316406, + "loss": 0.5592, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.017075102776288986, + "rewards/margins": 0.404699444770813, + "rewards/rejected": -0.42177462577819824, + "step": 4344 + }, + { + "epoch": 0.6719505122752755, + "grad_norm": 4.371573448181152, + "learning_rate": 4.311204032535228e-06, + "logits/chosen": 7.5436110496521, + "logits/rejected": 6.836152076721191, + "logps/chosen": -149.7579345703125, + "logps/rejected": -152.1444854736328, + "loss": 0.5987, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.08918696641921997, + "rewards/margins": 0.3966192305088043, + "rewards/rejected": -0.30743223428726196, + "step": 4345 + }, + { + "epoch": 0.6721051614150396, + "grad_norm": 5.504759311676025, + "learning_rate": 4.310917630885555e-06, + "logits/chosen": 12.810900688171387, + "logits/rejected": 12.121976852416992, + "logps/chosen": -285.6212158203125, + "logps/rejected": -326.0976867675781, + "loss": 0.7801, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.24133431911468506, + "rewards/margins": -0.037230681627988815, + "rewards/rejected": -0.20410367846488953, + "step": 4346 + }, + { + "epoch": 0.6722598105548038, + "grad_norm": 6.4040021896362305, + "learning_rate": 4.310631229235881e-06, + "logits/chosen": 5.288246154785156, + "logits/rejected": 4.323495388031006, + "logps/chosen": -283.30572509765625, + "logps/rejected": -271.1005859375, + "loss": 0.7062, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.21802043914794922, + "rewards/margins": 0.14184197783470154, + "rewards/rejected": -0.35986241698265076, + "step": 4347 + }, + { + "epoch": 0.6724144596945679, + "grad_norm": 6.111469268798828, + "learning_rate": 4.310344827586207e-06, + "logits/chosen": 5.314854145050049, + "logits/rejected": 4.785194396972656, + "logps/chosen": -314.21710205078125, + "logps/rejected": -242.79913330078125, + "loss": 0.7799, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.2315208613872528, + "rewards/margins": -0.022492021322250366, + "rewards/rejected": -0.20902882516384125, + "step": 4348 + }, + { + "epoch": 0.6725691088343321, + "grad_norm": 15.672969818115234, + "learning_rate": 4.310058425936534e-06, + "logits/chosen": 4.177640914916992, + "logits/rejected": 8.71719741821289, + "logps/chosen": -197.6545867919922, + "logps/rejected": -265.61590576171875, + "loss": 0.742, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.36047232151031494, + "rewards/margins": -0.0384647399187088, + "rewards/rejected": -0.32200756669044495, + "step": 4349 + }, + { + "epoch": 0.6727237579740962, + "grad_norm": 6.3508381843566895, + "learning_rate": 4.30977202428686e-06, + "logits/chosen": 7.652085304260254, + "logits/rejected": 4.731571674346924, + "logps/chosen": -242.31455993652344, + "logps/rejected": -211.475830078125, + "loss": 0.6908, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.12293510884046555, + "rewards/margins": 0.025870423763990402, + "rewards/rejected": -0.14880552887916565, + "step": 4350 + }, + { + "epoch": 0.6728784071138604, + "grad_norm": 7.533824920654297, + "learning_rate": 4.309485622637187e-06, + "logits/chosen": 7.912522792816162, + "logits/rejected": 9.668782234191895, + "logps/chosen": -229.46963500976562, + "logps/rejected": -225.49020385742188, + "loss": 0.7688, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.48833540081977844, + "rewards/margins": -0.060272425413131714, + "rewards/rejected": -0.4280630052089691, + "step": 4351 + }, + { + "epoch": 0.6730330562536246, + "grad_norm": 5.6148271560668945, + "learning_rate": 4.309199220987514e-06, + "logits/chosen": 9.664350509643555, + "logits/rejected": 8.80941390991211, + "logps/chosen": -293.586669921875, + "logps/rejected": -265.803955078125, + "loss": 0.626, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.09197207540273666, + "rewards/margins": 0.2634292244911194, + "rewards/rejected": -0.17145714163780212, + "step": 4352 + }, + { + "epoch": 0.6731877053933888, + "grad_norm": 4.790165901184082, + "learning_rate": 4.3089128193378395e-06, + "logits/chosen": 10.210770606994629, + "logits/rejected": 7.330760955810547, + "logps/chosen": -369.55419921875, + "logps/rejected": -253.150146484375, + "loss": 0.584, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.1554819941520691, + "rewards/margins": 0.39861422777175903, + "rewards/rejected": -0.5540962219238281, + "step": 4353 + }, + { + "epoch": 0.6733423545331529, + "grad_norm": 6.02699613571167, + "learning_rate": 4.308626417688166e-06, + "logits/chosen": 5.232302188873291, + "logits/rejected": 4.87021541595459, + "logps/chosen": -303.505615234375, + "logps/rejected": -210.87477111816406, + "loss": 0.7252, + "rewards/accuracies": 0.375, + "rewards/chosen": -2.489238977432251e-05, + "rewards/margins": 0.21937112510204315, + "rewards/rejected": -0.21939602494239807, + "step": 4354 + }, + { + "epoch": 0.6734970036729171, + "grad_norm": 5.76442289352417, + "learning_rate": 4.308340016038493e-06, + "logits/chosen": 3.7333836555480957, + "logits/rejected": 2.573406219482422, + "logps/chosen": -224.85467529296875, + "logps/rejected": -259.0929870605469, + "loss": 0.6197, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.017428487539291382, + "rewards/margins": 0.3432026505470276, + "rewards/rejected": -0.3257741928100586, + "step": 4355 + }, + { + "epoch": 0.6736516528126812, + "grad_norm": 9.911994934082031, + "learning_rate": 4.3080536143888194e-06, + "logits/chosen": 4.855419158935547, + "logits/rejected": 2.613642930984497, + "logps/chosen": -386.39501953125, + "logps/rejected": -298.31524658203125, + "loss": 0.8119, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.24843397736549377, + "rewards/margins": -0.13192063570022583, + "rewards/rejected": -0.11651334166526794, + "step": 4356 + }, + { + "epoch": 0.6738063019524454, + "grad_norm": 7.51059627532959, + "learning_rate": 4.307767212739146e-06, + "logits/chosen": 13.94286060333252, + "logits/rejected": 10.514548301696777, + "logps/chosen": -498.1987609863281, + "logps/rejected": -400.3186950683594, + "loss": 0.6186, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.10071755200624466, + "rewards/margins": 0.19627895951271057, + "rewards/rejected": -0.09556140005588531, + "step": 4357 + }, + { + "epoch": 0.6739609510922095, + "grad_norm": 3.9615440368652344, + "learning_rate": 4.307480811089472e-06, + "logits/chosen": 10.846609115600586, + "logits/rejected": 6.004369735717773, + "logps/chosen": -285.78265380859375, + "logps/rejected": -217.84197998046875, + "loss": 0.5559, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.06391362845897675, + "rewards/margins": 0.3909119665622711, + "rewards/rejected": -0.45482558012008667, + "step": 4358 + }, + { + "epoch": 0.6741156002319737, + "grad_norm": 5.588890552520752, + "learning_rate": 4.3071944094397985e-06, + "logits/chosen": 7.391685962677002, + "logits/rejected": 7.563086986541748, + "logps/chosen": -258.9117736816406, + "logps/rejected": -248.08059692382812, + "loss": 0.8213, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.2330484241247177, + "rewards/margins": -0.08544944226741791, + "rewards/rejected": -0.147598996758461, + "step": 4359 + }, + { + "epoch": 0.6742702493717379, + "grad_norm": 13.001784324645996, + "learning_rate": 4.306908007790125e-06, + "logits/chosen": 5.21492862701416, + "logits/rejected": 4.764323711395264, + "logps/chosen": -274.4930419921875, + "logps/rejected": -335.0644836425781, + "loss": 0.8763, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.4456023573875427, + "rewards/margins": -0.12490373849868774, + "rewards/rejected": -0.3206985592842102, + "step": 4360 + }, + { + "epoch": 0.674424898511502, + "grad_norm": 8.679266929626465, + "learning_rate": 4.306621606140452e-06, + "logits/chosen": 12.838769912719727, + "logits/rejected": 3.4350359439849854, + "logps/chosen": -287.9986572265625, + "logps/rejected": -241.20518493652344, + "loss": 0.6003, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.02727479487657547, + "rewards/margins": 0.32795992493629456, + "rewards/rejected": -0.3552347421646118, + "step": 4361 + }, + { + "epoch": 0.6745795476512662, + "grad_norm": 3.8145883083343506, + "learning_rate": 4.306335204490778e-06, + "logits/chosen": 15.41836166381836, + "logits/rejected": 6.221778869628906, + "logps/chosen": -383.5130615234375, + "logps/rejected": -244.79513549804688, + "loss": 0.3953, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.03227100148797035, + "rewards/margins": 0.8833968639373779, + "rewards/rejected": -0.8511258363723755, + "step": 4362 + }, + { + "epoch": 0.6747341967910303, + "grad_norm": 5.615697383880615, + "learning_rate": 4.306048802841104e-06, + "logits/chosen": 11.220874786376953, + "logits/rejected": 3.1929445266723633, + "logps/chosen": -278.5677795410156, + "logps/rejected": -189.5736846923828, + "loss": 0.5682, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.2512831687927246, + "rewards/margins": 0.7028713822364807, + "rewards/rejected": -0.9541544914245605, + "step": 4363 + }, + { + "epoch": 0.6748888459307945, + "grad_norm": 6.706066131591797, + "learning_rate": 4.305762401191431e-06, + "logits/chosen": 11.945959091186523, + "logits/rejected": 8.568140029907227, + "logps/chosen": -273.2823486328125, + "logps/rejected": -248.9542999267578, + "loss": 0.6291, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.08710212260484695, + "rewards/margins": 0.17725151777267456, + "rewards/rejected": -0.0901493951678276, + "step": 4364 + }, + { + "epoch": 0.6750434950705587, + "grad_norm": 9.580127716064453, + "learning_rate": 4.305475999541758e-06, + "logits/chosen": 8.329070091247559, + "logits/rejected": 5.84481143951416, + "logps/chosen": -228.21717834472656, + "logps/rejected": -219.42247009277344, + "loss": 0.7962, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.3564581274986267, + "rewards/margins": 0.24120935797691345, + "rewards/rejected": -0.5976675152778625, + "step": 4365 + }, + { + "epoch": 0.6751981442103229, + "grad_norm": 5.860879421234131, + "learning_rate": 4.305189597892084e-06, + "logits/chosen": 7.781728744506836, + "logits/rejected": 10.093608856201172, + "logps/chosen": -242.03427124023438, + "logps/rejected": -249.22378540039062, + "loss": 0.7788, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.2477954924106598, + "rewards/margins": -0.13580816984176636, + "rewards/rejected": -0.11198730021715164, + "step": 4366 + }, + { + "epoch": 0.675352793350087, + "grad_norm": 4.436400890350342, + "learning_rate": 4.304903196242411e-06, + "logits/chosen": 10.72449016571045, + "logits/rejected": 7.377851486206055, + "logps/chosen": -267.89324951171875, + "logps/rejected": -241.6751251220703, + "loss": 0.5809, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.27599436044692993, + "rewards/margins": 0.36522021889686584, + "rewards/rejected": -0.6412145495414734, + "step": 4367 + }, + { + "epoch": 0.6755074424898512, + "grad_norm": 7.7967634201049805, + "learning_rate": 4.304616794592737e-06, + "logits/chosen": 7.131446361541748, + "logits/rejected": 2.8030855655670166, + "logps/chosen": -369.4346923828125, + "logps/rejected": -317.81085205078125, + "loss": 0.6587, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.18229426443576813, + "rewards/margins": 0.169584721326828, + "rewards/rejected": -0.3518790006637573, + "step": 4368 + }, + { + "epoch": 0.6756620916296153, + "grad_norm": 7.428574562072754, + "learning_rate": 4.304330392943063e-06, + "logits/chosen": 1.770752191543579, + "logits/rejected": 8.010498046875, + "logps/chosen": -322.97369384765625, + "logps/rejected": -380.0075988769531, + "loss": 0.8691, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.5723614692687988, + "rewards/margins": -0.19962920248508453, + "rewards/rejected": -0.3727322518825531, + "step": 4369 + }, + { + "epoch": 0.6758167407693795, + "grad_norm": 18.018117904663086, + "learning_rate": 4.30404399129339e-06, + "logits/chosen": 14.680455207824707, + "logits/rejected": 6.410256385803223, + "logps/chosen": -420.0663146972656, + "logps/rejected": -229.6256561279297, + "loss": 0.4546, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.3136487305164337, + "rewards/margins": 0.8922341465950012, + "rewards/rejected": -0.5785854458808899, + "step": 4370 + }, + { + "epoch": 0.6759713899091436, + "grad_norm": 5.81412935256958, + "learning_rate": 4.303757589643717e-06, + "logits/chosen": 9.64257526397705, + "logits/rejected": 6.972983360290527, + "logps/chosen": -270.0439147949219, + "logps/rejected": -192.17625427246094, + "loss": 0.6464, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.24827784299850464, + "rewards/margins": 0.1747988760471344, + "rewards/rejected": -0.4230767488479614, + "step": 4371 + }, + { + "epoch": 0.6761260390489078, + "grad_norm": 5.1410932540893555, + "learning_rate": 4.303471187994043e-06, + "logits/chosen": 11.954044342041016, + "logits/rejected": 10.99544906616211, + "logps/chosen": -274.9610595703125, + "logps/rejected": -262.96014404296875, + "loss": 0.6378, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.4133809208869934, + "rewards/margins": 0.28973305225372314, + "rewards/rejected": -0.7031139731407166, + "step": 4372 + }, + { + "epoch": 0.6762806881886719, + "grad_norm": 4.030061721801758, + "learning_rate": 4.30318478634437e-06, + "logits/chosen": 7.366610527038574, + "logits/rejected": 2.4514148235321045, + "logps/chosen": -207.94317626953125, + "logps/rejected": -172.33828735351562, + "loss": 0.5104, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.24070709943771362, + "rewards/margins": 0.5449966192245483, + "rewards/rejected": -0.3042895495891571, + "step": 4373 + }, + { + "epoch": 0.6764353373284361, + "grad_norm": 4.2293782234191895, + "learning_rate": 4.302898384694697e-06, + "logits/chosen": 14.109688758850098, + "logits/rejected": 12.98661994934082, + "logps/chosen": -342.3206787109375, + "logps/rejected": -275.4928283691406, + "loss": 0.5416, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.3115765452384949, + "rewards/margins": 0.5092012286186218, + "rewards/rejected": -0.19762465357780457, + "step": 4374 + }, + { + "epoch": 0.6765899864682002, + "grad_norm": 5.203789710998535, + "learning_rate": 4.302611983045022e-06, + "logits/chosen": 9.19174575805664, + "logits/rejected": 10.340836524963379, + "logps/chosen": -239.3130645751953, + "logps/rejected": -267.2409973144531, + "loss": 0.6258, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.1554592251777649, + "rewards/margins": 0.17030923068523407, + "rewards/rejected": -0.32576844096183777, + "step": 4375 + }, + { + "epoch": 0.6767446356079644, + "grad_norm": 4.06829309463501, + "learning_rate": 4.302325581395349e-06, + "logits/chosen": 11.189308166503906, + "logits/rejected": 2.3020639419555664, + "logps/chosen": -194.34716796875, + "logps/rejected": -171.3843231201172, + "loss": 0.6629, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.11470726132392883, + "rewards/margins": 0.12832361459732056, + "rewards/rejected": -0.013616353273391724, + "step": 4376 + }, + { + "epoch": 0.6768992847477285, + "grad_norm": 94.50373840332031, + "learning_rate": 4.302039179745676e-06, + "logits/chosen": 8.068161964416504, + "logits/rejected": 11.06473159790039, + "logps/chosen": -314.3012390136719, + "logps/rejected": -436.3773193359375, + "loss": 0.5777, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.21490192413330078, + "rewards/margins": 0.34530109167099, + "rewards/rejected": -0.13039913773536682, + "step": 4377 + }, + { + "epoch": 0.6770539338874928, + "grad_norm": 5.796728134155273, + "learning_rate": 4.301752778096002e-06, + "logits/chosen": 14.143423080444336, + "logits/rejected": 9.135540008544922, + "logps/chosen": -281.4481506347656, + "logps/rejected": -257.16900634765625, + "loss": 0.7034, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.3008747100830078, + "rewards/margins": 0.027569115161895752, + "rewards/rejected": -0.32844382524490356, + "step": 4378 + }, + { + "epoch": 0.677208583027257, + "grad_norm": 5.3141303062438965, + "learning_rate": 4.301466376446329e-06, + "logits/chosen": 3.780085802078247, + "logits/rejected": 6.952800750732422, + "logps/chosen": -178.5964813232422, + "logps/rejected": -208.16461181640625, + "loss": 0.7543, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.08171045780181885, + "rewards/margins": -0.07805463671684265, + "rewards/rejected": -0.00365583598613739, + "step": 4379 + }, + { + "epoch": 0.6773632321670211, + "grad_norm": 6.719045639038086, + "learning_rate": 4.301179974796656e-06, + "logits/chosen": 6.18869686126709, + "logits/rejected": 7.674605846405029, + "logps/chosen": -274.08355712890625, + "logps/rejected": -256.7762451171875, + "loss": 0.6847, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.4902491569519043, + "rewards/margins": 0.09670928120613098, + "rewards/rejected": -0.5869584083557129, + "step": 4380 + }, + { + "epoch": 0.6775178813067853, + "grad_norm": 5.575119495391846, + "learning_rate": 4.3008935731469815e-06, + "logits/chosen": 8.194402694702148, + "logits/rejected": 9.91681957244873, + "logps/chosen": -274.889404296875, + "logps/rejected": -282.0581970214844, + "loss": 0.7807, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.28673964738845825, + "rewards/margins": -0.11491774767637253, + "rewards/rejected": -0.17182184755802155, + "step": 4381 + }, + { + "epoch": 0.6776725304465494, + "grad_norm": 5.479812145233154, + "learning_rate": 4.300607171497308e-06, + "logits/chosen": 14.414141654968262, + "logits/rejected": 13.416187286376953, + "logps/chosen": -325.0621032714844, + "logps/rejected": -312.00958251953125, + "loss": 0.6873, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.17287693917751312, + "rewards/margins": 0.01997271180152893, + "rewards/rejected": -0.19284962117671967, + "step": 4382 + }, + { + "epoch": 0.6778271795863136, + "grad_norm": 5.18855094909668, + "learning_rate": 4.300320769847635e-06, + "logits/chosen": 11.120960235595703, + "logits/rejected": 7.968018531799316, + "logps/chosen": -376.40374755859375, + "logps/rejected": -296.7601318359375, + "loss": 0.6192, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.49569687247276306, + "rewards/margins": 0.2680850923061371, + "rewards/rejected": 0.2276117503643036, + "step": 4383 + }, + { + "epoch": 0.6779818287260777, + "grad_norm": 5.750880241394043, + "learning_rate": 4.300034368197961e-06, + "logits/chosen": 8.859659194946289, + "logits/rejected": 3.8299648761749268, + "logps/chosen": -240.78466796875, + "logps/rejected": -204.93621826171875, + "loss": 0.7089, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.08001694083213806, + "rewards/margins": 0.10315060615539551, + "rewards/rejected": -0.18316754698753357, + "step": 4384 + }, + { + "epoch": 0.6781364778658419, + "grad_norm": 7.315476894378662, + "learning_rate": 4.299747966548288e-06, + "logits/chosen": 9.85761833190918, + "logits/rejected": 10.541047096252441, + "logps/chosen": -297.17315673828125, + "logps/rejected": -275.2012023925781, + "loss": 0.8568, + "rewards/accuracies": 0.25, + "rewards/chosen": -0.16940820217132568, + "rewards/margins": -0.22387093305587769, + "rewards/rejected": 0.0544627383351326, + "step": 4385 + }, + { + "epoch": 0.678291127005606, + "grad_norm": 4.195440769195557, + "learning_rate": 4.299461564898615e-06, + "logits/chosen": 7.4741082191467285, + "logits/rejected": 5.230938911437988, + "logps/chosen": -244.97683715820312, + "logps/rejected": -198.05111694335938, + "loss": 0.4738, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.15811572968959808, + "rewards/margins": 0.5458548665046692, + "rewards/rejected": -0.3877390921115875, + "step": 4386 + }, + { + "epoch": 0.6784457761453702, + "grad_norm": 6.165177822113037, + "learning_rate": 4.2991751632489405e-06, + "logits/chosen": 7.2447099685668945, + "logits/rejected": 5.319435119628906, + "logps/chosen": -305.50177001953125, + "logps/rejected": -280.21044921875, + "loss": 0.8067, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.07919955253601074, + "rewards/margins": 0.008561987429857254, + "rewards/rejected": 0.07063756138086319, + "step": 4387 + }, + { + "epoch": 0.6786004252851343, + "grad_norm": 3.8643977642059326, + "learning_rate": 4.298888761599267e-06, + "logits/chosen": 6.140326499938965, + "logits/rejected": 3.7158539295196533, + "logps/chosen": -178.57180786132812, + "logps/rejected": -144.42684936523438, + "loss": 0.6194, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.142907053232193, + "rewards/margins": 0.19747425615787506, + "rewards/rejected": -0.34038129448890686, + "step": 4388 + }, + { + "epoch": 0.6787550744248985, + "grad_norm": 4.747513294219971, + "learning_rate": 4.298602359949594e-06, + "logits/chosen": 4.594847679138184, + "logits/rejected": 6.825936794281006, + "logps/chosen": -285.394287109375, + "logps/rejected": -273.9123840332031, + "loss": 0.5593, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.24942153692245483, + "rewards/margins": 0.47802218794822693, + "rewards/rejected": -0.7274438142776489, + "step": 4389 + }, + { + "epoch": 0.6789097235646627, + "grad_norm": 8.338217735290527, + "learning_rate": 4.2983159582999205e-06, + "logits/chosen": 9.88192367553711, + "logits/rejected": 12.31782054901123, + "logps/chosen": -386.43475341796875, + "logps/rejected": -344.5802001953125, + "loss": 0.9233, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.15460683405399323, + "rewards/margins": -0.3263300061225891, + "rewards/rejected": 0.48093682527542114, + "step": 4390 + }, + { + "epoch": 0.6790643727044269, + "grad_norm": 6.41644287109375, + "learning_rate": 4.298029556650246e-06, + "logits/chosen": 4.8790507316589355, + "logits/rejected": 6.105672836303711, + "logps/chosen": -258.84197998046875, + "logps/rejected": -250.13319396972656, + "loss": 0.787, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.020575448870658875, + "rewards/margins": 0.12163665890693665, + "rewards/rejected": -0.1422121226787567, + "step": 4391 + }, + { + "epoch": 0.679219021844191, + "grad_norm": 7.0461273193359375, + "learning_rate": 4.297743155000573e-06, + "logits/chosen": 12.709023475646973, + "logits/rejected": 7.563149452209473, + "logps/chosen": -283.2870788574219, + "logps/rejected": -267.26080322265625, + "loss": 0.6535, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.39243602752685547, + "rewards/margins": 0.3009548485279083, + "rewards/rejected": 0.09148116409778595, + "step": 4392 + }, + { + "epoch": 0.6793736709839552, + "grad_norm": 4.431209564208984, + "learning_rate": 4.2974567533508996e-06, + "logits/chosen": 11.554564476013184, + "logits/rejected": 8.213187217712402, + "logps/chosen": -231.82083129882812, + "logps/rejected": -216.91754150390625, + "loss": 0.5243, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.18199843168258667, + "rewards/margins": 0.5338979959487915, + "rewards/rejected": -0.7158964276313782, + "step": 4393 + }, + { + "epoch": 0.6795283201237193, + "grad_norm": 5.199768543243408, + "learning_rate": 4.297170351701226e-06, + "logits/chosen": 8.7282133102417, + "logits/rejected": 7.4432172775268555, + "logps/chosen": -211.1273651123047, + "logps/rejected": -268.0179138183594, + "loss": 0.6277, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.21734784543514252, + "rewards/margins": 0.19855497777462006, + "rewards/rejected": -0.4159027934074402, + "step": 4394 + }, + { + "epoch": 0.6796829692634835, + "grad_norm": 6.761626243591309, + "learning_rate": 4.296883950051553e-06, + "logits/chosen": 7.202631950378418, + "logits/rejected": 9.472408294677734, + "logps/chosen": -301.03759765625, + "logps/rejected": -423.3046569824219, + "loss": 0.6538, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.02093452960252762, + "rewards/margins": 0.2627944052219391, + "rewards/rejected": -0.2837289869785309, + "step": 4395 + }, + { + "epoch": 0.6798376184032476, + "grad_norm": 5.459228515625, + "learning_rate": 4.296597548401879e-06, + "logits/chosen": 12.33895492553711, + "logits/rejected": 9.177013397216797, + "logps/chosen": -377.6455078125, + "logps/rejected": -290.0875549316406, + "loss": 0.6824, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.047038257122039795, + "rewards/margins": 0.1342509239912033, + "rewards/rejected": -0.1812891960144043, + "step": 4396 + }, + { + "epoch": 0.6799922675430118, + "grad_norm": 4.5779852867126465, + "learning_rate": 4.296311146752205e-06, + "logits/chosen": 14.320616722106934, + "logits/rejected": 6.98124885559082, + "logps/chosen": -268.4227294921875, + "logps/rejected": -244.79818725585938, + "loss": 0.633, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.15360823273658752, + "rewards/margins": 0.6148350238800049, + "rewards/rejected": -0.46122676134109497, + "step": 4397 + }, + { + "epoch": 0.6801469166827759, + "grad_norm": 3.5609493255615234, + "learning_rate": 4.296024745102532e-06, + "logits/chosen": 14.616093635559082, + "logits/rejected": 7.603805065155029, + "logps/chosen": -328.3641662597656, + "logps/rejected": -206.73141479492188, + "loss": 0.4058, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.04530716314911842, + "rewards/margins": 0.7912201881408691, + "rewards/rejected": -0.745913028717041, + "step": 4398 + }, + { + "epoch": 0.6803015658225401, + "grad_norm": 4.843770503997803, + "learning_rate": 4.295738343452859e-06, + "logits/chosen": 8.000632286071777, + "logits/rejected": 7.5493364334106445, + "logps/chosen": -281.0600891113281, + "logps/rejected": -243.34136962890625, + "loss": 0.6764, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.2166353464126587, + "rewards/margins": 0.11579858511686325, + "rewards/rejected": 0.10083675384521484, + "step": 4399 + }, + { + "epoch": 0.6804562149623042, + "grad_norm": 5.3286237716674805, + "learning_rate": 4.295451941803185e-06, + "logits/chosen": 16.993864059448242, + "logits/rejected": 11.162191390991211, + "logps/chosen": -320.25, + "logps/rejected": -208.5397491455078, + "loss": 0.6867, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.1269436776638031, + "rewards/margins": 0.10278471559286118, + "rewards/rejected": -0.22972841560840607, + "step": 4400 + }, + { + "epoch": 0.6806108641020684, + "grad_norm": 5.563046932220459, + "learning_rate": 4.295165540153511e-06, + "logits/chosen": 11.928476333618164, + "logits/rejected": 5.195486068725586, + "logps/chosen": -244.13023376464844, + "logps/rejected": -165.9436492919922, + "loss": 0.4953, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.361529141664505, + "rewards/margins": 0.5486453771591187, + "rewards/rejected": -0.18711623549461365, + "step": 4401 + }, + { + "epoch": 0.6807655132418325, + "grad_norm": 6.144144535064697, + "learning_rate": 4.294879138503838e-06, + "logits/chosen": 14.431041717529297, + "logits/rejected": 7.683199882507324, + "logps/chosen": -314.7454528808594, + "logps/rejected": -218.9053955078125, + "loss": 0.7225, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.1462804675102234, + "rewards/margins": 0.009972203522920609, + "rewards/rejected": -0.1562526822090149, + "step": 4402 + }, + { + "epoch": 0.6809201623815968, + "grad_norm": 4.993712425231934, + "learning_rate": 4.294592736854164e-06, + "logits/chosen": 14.189153671264648, + "logits/rejected": 12.22339153289795, + "logps/chosen": -332.0534362792969, + "logps/rejected": -317.44464111328125, + "loss": 0.5557, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.07819414883852005, + "rewards/margins": 0.40176600217819214, + "rewards/rejected": -0.3235718607902527, + "step": 4403 + }, + { + "epoch": 0.681074811521361, + "grad_norm": 5.071861743927002, + "learning_rate": 4.294306335204491e-06, + "logits/chosen": 17.42917251586914, + "logits/rejected": 8.24921703338623, + "logps/chosen": -359.1090087890625, + "logps/rejected": -306.5159912109375, + "loss": 0.5549, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.2623753547668457, + "rewards/margins": 0.3640523850917816, + "rewards/rejected": -0.10167703032493591, + "step": 4404 + }, + { + "epoch": 0.6812294606611251, + "grad_norm": 4.940032958984375, + "learning_rate": 4.294019933554818e-06, + "logits/chosen": 10.026111602783203, + "logits/rejected": 9.074549674987793, + "logps/chosen": -206.15184020996094, + "logps/rejected": -166.01602172851562, + "loss": 0.708, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.22703701257705688, + "rewards/margins": 0.013266075402498245, + "rewards/rejected": 0.21377091109752655, + "step": 4405 + }, + { + "epoch": 0.6813841098008893, + "grad_norm": 4.179180145263672, + "learning_rate": 4.293733531905144e-06, + "logits/chosen": 11.007497787475586, + "logits/rejected": 3.752166509628296, + "logps/chosen": -300.860595703125, + "logps/rejected": -151.47885131835938, + "loss": 0.54, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.02710948884487152, + "rewards/margins": 0.3848724961280823, + "rewards/rejected": -0.35776302218437195, + "step": 4406 + }, + { + "epoch": 0.6815387589406534, + "grad_norm": 5.589725494384766, + "learning_rate": 4.293447130255471e-06, + "logits/chosen": 7.565913677215576, + "logits/rejected": 6.852021217346191, + "logps/chosen": -312.3780517578125, + "logps/rejected": -295.39202880859375, + "loss": 0.6693, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.07289613038301468, + "rewards/margins": 0.2600077688694, + "rewards/rejected": -0.3329039216041565, + "step": 4407 + }, + { + "epoch": 0.6816934080804176, + "grad_norm": 5.711284637451172, + "learning_rate": 4.293160728605797e-06, + "logits/chosen": 13.750738143920898, + "logits/rejected": 9.577877044677734, + "logps/chosen": -294.51904296875, + "logps/rejected": -234.39547729492188, + "loss": 0.734, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.1550583392381668, + "rewards/margins": 0.0804409310221672, + "rewards/rejected": -0.2354992926120758, + "step": 4408 + }, + { + "epoch": 0.6818480572201817, + "grad_norm": 7.570682525634766, + "learning_rate": 4.2928743269561234e-06, + "logits/chosen": 12.190214157104492, + "logits/rejected": 10.80136489868164, + "logps/chosen": -347.2962341308594, + "logps/rejected": -315.48724365234375, + "loss": 0.9836, + "rewards/accuracies": 0.25, + "rewards/chosen": -0.4769257605075836, + "rewards/margins": -0.2722587585449219, + "rewards/rejected": -0.20466700196266174, + "step": 4409 + }, + { + "epoch": 0.6820027063599459, + "grad_norm": 5.898629665374756, + "learning_rate": 4.29258792530645e-06, + "logits/chosen": 6.665280342102051, + "logits/rejected": 14.026094436645508, + "logps/chosen": -205.9501953125, + "logps/rejected": -351.3404235839844, + "loss": 0.6919, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.2070237100124359, + "rewards/margins": 0.20302194356918335, + "rewards/rejected": -0.41004565358161926, + "step": 4410 + }, + { + "epoch": 0.68215735549971, + "grad_norm": 7.967639446258545, + "learning_rate": 4.292301523656777e-06, + "logits/chosen": 6.183863639831543, + "logits/rejected": 5.97257137298584, + "logps/chosen": -256.78741455078125, + "logps/rejected": -309.83544921875, + "loss": 0.6602, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.3181249499320984, + "rewards/margins": 0.19149930775165558, + "rewards/rejected": -0.509624183177948, + "step": 4411 + }, + { + "epoch": 0.6823120046394742, + "grad_norm": 5.440560340881348, + "learning_rate": 4.292015122007103e-06, + "logits/chosen": 6.339186668395996, + "logits/rejected": 6.893258094787598, + "logps/chosen": -176.7098388671875, + "logps/rejected": -170.19900512695312, + "loss": 0.7526, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.18195104598999023, + "rewards/margins": -0.0760570615530014, + "rewards/rejected": 0.25800809264183044, + "step": 4412 + }, + { + "epoch": 0.6824666537792383, + "grad_norm": 4.359745502471924, + "learning_rate": 4.29172872035743e-06, + "logits/chosen": 14.661651611328125, + "logits/rejected": 8.421096801757812, + "logps/chosen": -322.3902587890625, + "logps/rejected": -249.64199829101562, + "loss": 0.4913, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.26964256167411804, + "rewards/margins": 0.714688777923584, + "rewards/rejected": -0.4450462758541107, + "step": 4413 + }, + { + "epoch": 0.6826213029190025, + "grad_norm": 4.002414703369141, + "learning_rate": 4.291442318707756e-06, + "logits/chosen": 10.898488998413086, + "logits/rejected": 3.8738439083099365, + "logps/chosen": -347.56207275390625, + "logps/rejected": -221.91427612304688, + "loss": 0.552, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.2621403932571411, + "rewards/margins": 0.5308627486228943, + "rewards/rejected": -0.2687223255634308, + "step": 4414 + }, + { + "epoch": 0.6827759520587666, + "grad_norm": 4.870889186859131, + "learning_rate": 4.2911559170580825e-06, + "logits/chosen": 9.475064277648926, + "logits/rejected": 4.724165916442871, + "logps/chosen": -274.3704528808594, + "logps/rejected": -216.2876434326172, + "loss": 0.5926, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.11690820753574371, + "rewards/margins": 0.4779534339904785, + "rewards/rejected": -0.594861626625061, + "step": 4415 + }, + { + "epoch": 0.6829306011985309, + "grad_norm": 6.941696643829346, + "learning_rate": 4.290869515408409e-06, + "logits/chosen": 14.138030052185059, + "logits/rejected": 11.562395095825195, + "logps/chosen": -292.364990234375, + "logps/rejected": -236.08767700195312, + "loss": 0.8252, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.023112758994102478, + "rewards/margins": -0.15580081939697266, + "rewards/rejected": 0.13268806040287018, + "step": 4416 + }, + { + "epoch": 0.683085250338295, + "grad_norm": 4.98858642578125, + "learning_rate": 4.290583113758736e-06, + "logits/chosen": 6.187417984008789, + "logits/rejected": 7.744145393371582, + "logps/chosen": -261.29656982421875, + "logps/rejected": -330.3377685546875, + "loss": 0.7879, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.139823779463768, + "rewards/margins": -0.07168054580688477, + "rewards/rejected": -0.06814321875572205, + "step": 4417 + }, + { + "epoch": 0.6832398994780592, + "grad_norm": 5.17976188659668, + "learning_rate": 4.2902967121090624e-06, + "logits/chosen": 12.808450698852539, + "logits/rejected": 9.790185928344727, + "logps/chosen": -235.89523315429688, + "logps/rejected": -200.4786376953125, + "loss": 0.7562, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.06488892436027527, + "rewards/margins": 0.04878142476081848, + "rewards/rejected": 0.016107499599456787, + "step": 4418 + }, + { + "epoch": 0.6833945486178233, + "grad_norm": 7.002490043640137, + "learning_rate": 4.290010310459389e-06, + "logits/chosen": 9.382917404174805, + "logits/rejected": 8.479498863220215, + "logps/chosen": -290.6116943359375, + "logps/rejected": -265.64361572265625, + "loss": 0.7379, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.21291837096214294, + "rewards/margins": 0.1116248369216919, + "rewards/rejected": -0.32454317808151245, + "step": 4419 + }, + { + "epoch": 0.6835491977575875, + "grad_norm": 4.761524677276611, + "learning_rate": 4.289723908809716e-06, + "logits/chosen": 10.212902069091797, + "logits/rejected": 7.367902755737305, + "logps/chosen": -253.96875, + "logps/rejected": -220.33331298828125, + "loss": 0.5865, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.10504819452762604, + "rewards/margins": 0.3236563205718994, + "rewards/rejected": -0.21860815584659576, + "step": 4420 + }, + { + "epoch": 0.6837038468973516, + "grad_norm": 6.993953704833984, + "learning_rate": 4.2894375071600415e-06, + "logits/chosen": 10.621439933776855, + "logits/rejected": 6.5814056396484375, + "logps/chosen": -222.0738525390625, + "logps/rejected": -284.02703857421875, + "loss": 0.5379, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.2643647789955139, + "rewards/margins": 0.5481058955192566, + "rewards/rejected": -0.2837411165237427, + "step": 4421 + }, + { + "epoch": 0.6838584960371158, + "grad_norm": 5.571416854858398, + "learning_rate": 4.289151105510368e-06, + "logits/chosen": 5.620655059814453, + "logits/rejected": 7.134237766265869, + "logps/chosen": -260.7199401855469, + "logps/rejected": -263.5833435058594, + "loss": 0.6576, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.2956846356391907, + "rewards/margins": 0.33304858207702637, + "rewards/rejected": -0.03736397251486778, + "step": 4422 + }, + { + "epoch": 0.6840131451768799, + "grad_norm": 5.099806308746338, + "learning_rate": 4.288864703860695e-06, + "logits/chosen": 6.836983680725098, + "logits/rejected": -2.238745927810669, + "logps/chosen": -316.50054931640625, + "logps/rejected": -205.2968292236328, + "loss": 0.5332, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.2413993924856186, + "rewards/margins": 0.5468438863754272, + "rewards/rejected": -0.30544453859329224, + "step": 4423 + }, + { + "epoch": 0.6841677943166441, + "grad_norm": 8.00323486328125, + "learning_rate": 4.2885783022110215e-06, + "logits/chosen": 11.033995628356934, + "logits/rejected": 9.126618385314941, + "logps/chosen": -422.2153625488281, + "logps/rejected": -392.935791015625, + "loss": 0.7761, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.015468217432498932, + "rewards/margins": -0.04781923443078995, + "rewards/rejected": 0.06328745186328888, + "step": 4424 + }, + { + "epoch": 0.6843224434564082, + "grad_norm": 4.680246353149414, + "learning_rate": 4.288291900561347e-06, + "logits/chosen": 11.454498291015625, + "logits/rejected": 5.082576751708984, + "logps/chosen": -283.6060485839844, + "logps/rejected": -178.2688751220703, + "loss": 0.6548, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.08155956119298935, + "rewards/margins": 0.22759394347667694, + "rewards/rejected": -0.3091534972190857, + "step": 4425 + }, + { + "epoch": 0.6844770925961724, + "grad_norm": 4.370017051696777, + "learning_rate": 4.288005498911674e-06, + "logits/chosen": 11.999460220336914, + "logits/rejected": 8.618998527526855, + "logps/chosen": -218.4921417236328, + "logps/rejected": -193.80384826660156, + "loss": 0.5025, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.30304670333862305, + "rewards/margins": 0.7256160974502563, + "rewards/rejected": -0.4225694537162781, + "step": 4426 + }, + { + "epoch": 0.6846317417359365, + "grad_norm": 3.3437304496765137, + "learning_rate": 4.287719097262001e-06, + "logits/chosen": 15.351802825927734, + "logits/rejected": 8.893656730651855, + "logps/chosen": -237.2900390625, + "logps/rejected": -188.38681030273438, + "loss": 0.3938, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.5936269760131836, + "rewards/margins": 0.8723742365837097, + "rewards/rejected": -0.27874720096588135, + "step": 4427 + }, + { + "epoch": 0.6847863908757007, + "grad_norm": 6.830431938171387, + "learning_rate": 4.287432695612327e-06, + "logits/chosen": 6.708425045013428, + "logits/rejected": 5.250683784484863, + "logps/chosen": -221.34515380859375, + "logps/rejected": -232.97952270507812, + "loss": 0.7828, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.323251336812973, + "rewards/margins": -0.041125714778900146, + "rewards/rejected": -0.28212565183639526, + "step": 4428 + }, + { + "epoch": 0.684941040015465, + "grad_norm": 3.639462471008301, + "learning_rate": 4.287146293962653e-06, + "logits/chosen": 9.48283576965332, + "logits/rejected": 2.6893997192382812, + "logps/chosen": -297.8948974609375, + "logps/rejected": -135.90972900390625, + "loss": 0.5344, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.2145722359418869, + "rewards/margins": 0.47692328691482544, + "rewards/rejected": -0.26235103607177734, + "step": 4429 + }, + { + "epoch": 0.6850956891552291, + "grad_norm": 7.06432580947876, + "learning_rate": 4.28685989231298e-06, + "logits/chosen": 10.755661964416504, + "logits/rejected": 11.947458267211914, + "logps/chosen": -287.0971374511719, + "logps/rejected": -265.18096923828125, + "loss": 0.8944, + "rewards/accuracies": 0.25, + "rewards/chosen": 0.08469399809837341, + "rewards/margins": -0.3074387311935425, + "rewards/rejected": 0.3921326994895935, + "step": 4430 + }, + { + "epoch": 0.6852503382949933, + "grad_norm": 5.600857734680176, + "learning_rate": 4.286573490663306e-06, + "logits/chosen": 9.202146530151367, + "logits/rejected": -0.888355553150177, + "logps/chosen": -481.6180419921875, + "logps/rejected": -187.26589965820312, + "loss": 0.7872, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.10225935280323029, + "rewards/margins": 0.029102392494678497, + "rewards/rejected": -0.13136178255081177, + "step": 4431 + }, + { + "epoch": 0.6854049874347574, + "grad_norm": 6.241583824157715, + "learning_rate": 4.286287089013633e-06, + "logits/chosen": 10.553300857543945, + "logits/rejected": 3.8909707069396973, + "logps/chosen": -351.89794921875, + "logps/rejected": -247.68299865722656, + "loss": 0.792, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.15536661446094513, + "rewards/margins": -0.10992631316184998, + "rewards/rejected": 0.2652929425239563, + "step": 4432 + }, + { + "epoch": 0.6855596365745216, + "grad_norm": 6.158325672149658, + "learning_rate": 4.28600068736396e-06, + "logits/chosen": 10.460204124450684, + "logits/rejected": 7.212018013000488, + "logps/chosen": -250.78732299804688, + "logps/rejected": -222.10556030273438, + "loss": 0.6019, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.1877039074897766, + "rewards/margins": 0.23720739781856537, + "rewards/rejected": -0.049503520131111145, + "step": 4433 + }, + { + "epoch": 0.6857142857142857, + "grad_norm": 3.5610086917877197, + "learning_rate": 4.2857142857142855e-06, + "logits/chosen": 9.52037525177002, + "logits/rejected": 6.160451412200928, + "logps/chosen": -260.72686767578125, + "logps/rejected": -179.30490112304688, + "loss": 0.479, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.18300504982471466, + "rewards/margins": 0.7173242568969727, + "rewards/rejected": -0.5343192219734192, + "step": 4434 + }, + { + "epoch": 0.6858689348540499, + "grad_norm": 4.747145652770996, + "learning_rate": 4.285427884064612e-06, + "logits/chosen": 16.444198608398438, + "logits/rejected": 6.017789840698242, + "logps/chosen": -362.38812255859375, + "logps/rejected": -233.85443115234375, + "loss": 0.5094, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.3864232897758484, + "rewards/margins": 0.47597113251686096, + "rewards/rejected": -0.08954782783985138, + "step": 4435 + }, + { + "epoch": 0.686023583993814, + "grad_norm": 5.268344402313232, + "learning_rate": 4.285141482414939e-06, + "logits/chosen": 4.472221374511719, + "logits/rejected": 10.768880844116211, + "logps/chosen": -242.9765167236328, + "logps/rejected": -352.6317138671875, + "loss": 0.6258, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.19073772430419922, + "rewards/margins": 0.6525869369506836, + "rewards/rejected": -0.4618492126464844, + "step": 4436 + }, + { + "epoch": 0.6861782331335782, + "grad_norm": 6.1478352546691895, + "learning_rate": 4.284855080765265e-06, + "logits/chosen": 10.997254371643066, + "logits/rejected": 7.679874420166016, + "logps/chosen": -301.6500549316406, + "logps/rejected": -271.2589111328125, + "loss": 0.7048, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.1435386687517166, + "rewards/margins": 0.09836465120315552, + "rewards/rejected": -0.24190330505371094, + "step": 4437 + }, + { + "epoch": 0.6863328822733423, + "grad_norm": 6.086324214935303, + "learning_rate": 4.284568679115592e-06, + "logits/chosen": 11.764872550964355, + "logits/rejected": 11.78868579864502, + "logps/chosen": -252.14654541015625, + "logps/rejected": -254.66934204101562, + "loss": 0.7777, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.23934301733970642, + "rewards/margins": 0.22395600378513336, + "rewards/rejected": -0.4632989764213562, + "step": 4438 + }, + { + "epoch": 0.6864875314131065, + "grad_norm": 6.6365861892700195, + "learning_rate": 4.284282277465919e-06, + "logits/chosen": 5.342966079711914, + "logits/rejected": 6.390328407287598, + "logps/chosen": -190.566162109375, + "logps/rejected": -236.94354248046875, + "loss": 0.9509, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.2988263964653015, + "rewards/margins": -0.24544697999954224, + "rewards/rejected": -0.05337939411401749, + "step": 4439 + }, + { + "epoch": 0.6866421805528706, + "grad_norm": 3.7090659141540527, + "learning_rate": 4.283995875816245e-06, + "logits/chosen": 12.999027252197266, + "logits/rejected": 6.8252997398376465, + "logps/chosen": -150.48828125, + "logps/rejected": -133.07794189453125, + "loss": 0.619, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.18166394531726837, + "rewards/margins": 0.28471317887306213, + "rewards/rejected": -0.4663771390914917, + "step": 4440 + }, + { + "epoch": 0.6867968296926348, + "grad_norm": 3.3590099811553955, + "learning_rate": 4.283709474166571e-06, + "logits/chosen": 9.365459442138672, + "logits/rejected": 8.170974731445312, + "logps/chosen": -225.04531860351562, + "logps/rejected": -187.85997009277344, + "loss": 0.5743, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.06690728664398193, + "rewards/margins": 0.5250653624534607, + "rewards/rejected": -0.5919726490974426, + "step": 4441 + }, + { + "epoch": 0.686951478832399, + "grad_norm": 12.276659965515137, + "learning_rate": 4.283423072516898e-06, + "logits/chosen": 11.259847640991211, + "logits/rejected": 3.8195507526397705, + "logps/chosen": -296.73492431640625, + "logps/rejected": -193.91964721679688, + "loss": 0.4528, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.16350306570529938, + "rewards/margins": 0.7431578636169434, + "rewards/rejected": -0.5796548128128052, + "step": 4442 + }, + { + "epoch": 0.6871061279721632, + "grad_norm": 9.124277114868164, + "learning_rate": 4.2831366708672245e-06, + "logits/chosen": 12.566473960876465, + "logits/rejected": 6.26070499420166, + "logps/chosen": -391.4123840332031, + "logps/rejected": -286.1532287597656, + "loss": 0.7108, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.1961219757795334, + "rewards/margins": 0.22890543937683105, + "rewards/rejected": -0.03278341144323349, + "step": 4443 + }, + { + "epoch": 0.6872607771119273, + "grad_norm": 7.293119430541992, + "learning_rate": 4.282850269217551e-06, + "logits/chosen": 11.97053050994873, + "logits/rejected": 10.036218643188477, + "logps/chosen": -310.6858825683594, + "logps/rejected": -325.09039306640625, + "loss": 0.8062, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.24150556325912476, + "rewards/margins": -0.12163146585226059, + "rewards/rejected": 0.36313700675964355, + "step": 4444 + }, + { + "epoch": 0.6874154262516915, + "grad_norm": 5.329139232635498, + "learning_rate": 4.282563867567878e-06, + "logits/chosen": 7.069852828979492, + "logits/rejected": 13.204593658447266, + "logps/chosen": -260.12432861328125, + "logps/rejected": -290.69818115234375, + "loss": 0.6738, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.2295803725719452, + "rewards/margins": 0.1481151580810547, + "rewards/rejected": 0.08146519958972931, + "step": 4445 + }, + { + "epoch": 0.6875700753914556, + "grad_norm": 5.738361835479736, + "learning_rate": 4.282277465918204e-06, + "logits/chosen": 11.869058609008789, + "logits/rejected": 3.2904181480407715, + "logps/chosen": -269.22430419921875, + "logps/rejected": -232.68667602539062, + "loss": 0.679, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.06167583167552948, + "rewards/margins": 0.1215328648686409, + "rewards/rejected": -0.05985704064369202, + "step": 4446 + }, + { + "epoch": 0.6877247245312198, + "grad_norm": 6.327104568481445, + "learning_rate": 4.28199106426853e-06, + "logits/chosen": 11.759507179260254, + "logits/rejected": 8.659193992614746, + "logps/chosen": -330.90045166015625, + "logps/rejected": -312.1015625, + "loss": 0.6748, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.029327020049095154, + "rewards/margins": 0.07938119024038315, + "rewards/rejected": -0.1087082028388977, + "step": 4447 + }, + { + "epoch": 0.687879373670984, + "grad_norm": 6.324320316314697, + "learning_rate": 4.281704662618857e-06, + "logits/chosen": 6.9144415855407715, + "logits/rejected": 6.886053085327148, + "logps/chosen": -243.34698486328125, + "logps/rejected": -234.27764892578125, + "loss": 0.6951, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.057009320706129074, + "rewards/margins": 0.036147456616163254, + "rewards/rejected": 0.020861878991127014, + "step": 4448 + }, + { + "epoch": 0.6880340228107481, + "grad_norm": 6.772681713104248, + "learning_rate": 4.2814182609691835e-06, + "logits/chosen": 9.581363677978516, + "logits/rejected": 9.872903823852539, + "logps/chosen": -346.2281494140625, + "logps/rejected": -332.0546875, + "loss": 0.6641, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.1287352740764618, + "rewards/margins": 0.15438660979270935, + "rewards/rejected": -0.025651350617408752, + "step": 4449 + }, + { + "epoch": 0.6881886719505123, + "grad_norm": 5.153995037078857, + "learning_rate": 4.28113185931951e-06, + "logits/chosen": 6.044857978820801, + "logits/rejected": -0.5891554355621338, + "logps/chosen": -210.30276489257812, + "logps/rejected": -151.76376342773438, + "loss": 0.6379, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.22512489557266235, + "rewards/margins": 0.2831946015357971, + "rewards/rejected": -0.058069705963134766, + "step": 4450 + }, + { + "epoch": 0.6883433210902764, + "grad_norm": 3.6788439750671387, + "learning_rate": 4.280845457669837e-06, + "logits/chosen": 16.677631378173828, + "logits/rejected": 10.06670093536377, + "logps/chosen": -215.3868865966797, + "logps/rejected": -158.34532165527344, + "loss": 0.5268, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.26501116156578064, + "rewards/margins": 0.5089883208274841, + "rewards/rejected": -0.24397718906402588, + "step": 4451 + }, + { + "epoch": 0.6884979702300406, + "grad_norm": 7.015074729919434, + "learning_rate": 4.2805590560201635e-06, + "logits/chosen": 9.428455352783203, + "logits/rejected": 11.390767097473145, + "logps/chosen": -304.6409606933594, + "logps/rejected": -299.46624755859375, + "loss": 0.6543, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.45304667949676514, + "rewards/margins": 0.22036659717559814, + "rewards/rejected": 0.232680082321167, + "step": 4452 + }, + { + "epoch": 0.6886526193698047, + "grad_norm": 5.854787826538086, + "learning_rate": 4.28027265437049e-06, + "logits/chosen": 15.361802101135254, + "logits/rejected": 9.837018013000488, + "logps/chosen": -243.88218688964844, + "logps/rejected": -166.57855224609375, + "loss": 0.6084, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.18128342926502228, + "rewards/margins": 0.23061849176883698, + "rewards/rejected": -0.0493350476026535, + "step": 4453 + }, + { + "epoch": 0.688807268509569, + "grad_norm": 5.793326377868652, + "learning_rate": 4.279986252720816e-06, + "logits/chosen": 6.582826614379883, + "logits/rejected": 6.552923202514648, + "logps/chosen": -222.7197265625, + "logps/rejected": -258.70953369140625, + "loss": 0.5229, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.3430105149745941, + "rewards/margins": 0.5741848349571228, + "rewards/rejected": -0.2311742752790451, + "step": 4454 + }, + { + "epoch": 0.6889619176493331, + "grad_norm": 5.287814140319824, + "learning_rate": 4.279699851071143e-06, + "logits/chosen": 10.842239379882812, + "logits/rejected": 4.468028545379639, + "logps/chosen": -346.85540771484375, + "logps/rejected": -279.16961669921875, + "loss": 0.6007, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.3884417414665222, + "rewards/margins": 0.32020464539527893, + "rewards/rejected": 0.06823711097240448, + "step": 4455 + }, + { + "epoch": 0.6891165667890973, + "grad_norm": 3.6705517768859863, + "learning_rate": 4.279413449421469e-06, + "logits/chosen": 11.041379928588867, + "logits/rejected": 5.593526840209961, + "logps/chosen": -283.5278625488281, + "logps/rejected": -227.07577514648438, + "loss": 0.465, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.21017169952392578, + "rewards/margins": 0.7060978412628174, + "rewards/rejected": -0.4959261417388916, + "step": 4456 + }, + { + "epoch": 0.6892712159288614, + "grad_norm": 4.125463485717773, + "learning_rate": 4.279127047771796e-06, + "logits/chosen": 12.102180480957031, + "logits/rejected": 6.286378383636475, + "logps/chosen": -289.98516845703125, + "logps/rejected": -212.45899963378906, + "loss": 0.5931, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.48257529735565186, + "rewards/margins": 0.3367319107055664, + "rewards/rejected": 0.14584335684776306, + "step": 4457 + }, + { + "epoch": 0.6894258650686256, + "grad_norm": 4.032677173614502, + "learning_rate": 4.2788406461221225e-06, + "logits/chosen": 8.035992622375488, + "logits/rejected": 3.0893349647521973, + "logps/chosen": -298.1551818847656, + "logps/rejected": -204.30711364746094, + "loss": 0.5505, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.5656207203865051, + "rewards/margins": 0.3655545115470886, + "rewards/rejected": 0.2000662088394165, + "step": 4458 + }, + { + "epoch": 0.6895805142083897, + "grad_norm": 4.442121505737305, + "learning_rate": 4.278554244472448e-06, + "logits/chosen": 10.545802116394043, + "logits/rejected": 8.706689834594727, + "logps/chosen": -173.07183837890625, + "logps/rejected": -166.60781860351562, + "loss": 0.645, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.2074240744113922, + "rewards/margins": 0.12810516357421875, + "rewards/rejected": 0.07931890338659286, + "step": 4459 + }, + { + "epoch": 0.6897351633481539, + "grad_norm": 8.730128288269043, + "learning_rate": 4.278267842822775e-06, + "logits/chosen": 11.596196174621582, + "logits/rejected": 6.471994876861572, + "logps/chosen": -425.67608642578125, + "logps/rejected": -317.4320983886719, + "loss": 0.7719, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.0530029833316803, + "rewards/margins": -0.06280592083930969, + "rewards/rejected": 0.11580891162157059, + "step": 4460 + }, + { + "epoch": 0.689889812487918, + "grad_norm": 13.567523956298828, + "learning_rate": 4.277981441173102e-06, + "logits/chosen": 9.756856918334961, + "logits/rejected": 6.920470237731934, + "logps/chosen": -253.2501220703125, + "logps/rejected": -212.18373107910156, + "loss": 0.6299, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.4337814450263977, + "rewards/margins": 0.19981250166893005, + "rewards/rejected": 0.23396892845630646, + "step": 4461 + }, + { + "epoch": 0.6900444616276822, + "grad_norm": 11.470845222473145, + "learning_rate": 4.277695039523428e-06, + "logits/chosen": 5.811100006103516, + "logits/rejected": 9.205923080444336, + "logps/chosen": -353.174560546875, + "logps/rejected": -362.16778564453125, + "loss": 0.5478, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.5000346899032593, + "rewards/margins": 0.4216889441013336, + "rewards/rejected": 0.07834573835134506, + "step": 4462 + }, + { + "epoch": 0.6901991107674463, + "grad_norm": 3.72464919090271, + "learning_rate": 4.277408637873754e-06, + "logits/chosen": 16.846162796020508, + "logits/rejected": 5.138298034667969, + "logps/chosen": -342.4397888183594, + "logps/rejected": -146.86964416503906, + "loss": 0.4916, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.09391441941261292, + "rewards/margins": 0.794842541217804, + "rewards/rejected": -0.7009280920028687, + "step": 4463 + }, + { + "epoch": 0.6903537599072105, + "grad_norm": 5.917123317718506, + "learning_rate": 4.277122236224081e-06, + "logits/chosen": 8.068735122680664, + "logits/rejected": 8.310803413391113, + "logps/chosen": -179.04954528808594, + "logps/rejected": -166.29823303222656, + "loss": 0.6464, + "rewards/accuracies": 0.375, + "rewards/chosen": 6.006285548210144e-05, + "rewards/margins": 0.1988905966281891, + "rewards/rejected": -0.19883054494857788, + "step": 4464 + }, + { + "epoch": 0.6905084090469746, + "grad_norm": 4.440203666687012, + "learning_rate": 4.276835834574407e-06, + "logits/chosen": 11.373984336853027, + "logits/rejected": 0.045694708824157715, + "logps/chosen": -251.7533416748047, + "logps/rejected": -210.32533264160156, + "loss": 0.4293, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.00803305208683014, + "rewards/margins": 0.9182306528091431, + "rewards/rejected": -0.9262637495994568, + "step": 4465 + }, + { + "epoch": 0.6906630581867388, + "grad_norm": 6.657516956329346, + "learning_rate": 4.276549432924734e-06, + "logits/chosen": 9.383832931518555, + "logits/rejected": 7.182356834411621, + "logps/chosen": -292.13641357421875, + "logps/rejected": -170.91891479492188, + "loss": 0.8599, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.3271772265434265, + "rewards/margins": -0.16560105979442596, + "rewards/rejected": -0.16157618165016174, + "step": 4466 + }, + { + "epoch": 0.690817707326503, + "grad_norm": 5.258181571960449, + "learning_rate": 4.27626303127506e-06, + "logits/chosen": 10.15674114227295, + "logits/rejected": 6.404476642608643, + "logps/chosen": -233.07278442382812, + "logps/rejected": -255.77577209472656, + "loss": 0.5231, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.1105189397931099, + "rewards/margins": 0.45449209213256836, + "rewards/rejected": -0.34397315979003906, + "step": 4467 + }, + { + "epoch": 0.6909723564662672, + "grad_norm": 5.729298114776611, + "learning_rate": 4.2759766296253865e-06, + "logits/chosen": 16.626935958862305, + "logits/rejected": 12.685599327087402, + "logps/chosen": -341.749267578125, + "logps/rejected": -254.06082153320312, + "loss": 0.6758, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.12199517339468002, + "rewards/margins": 0.13312430679798126, + "rewards/rejected": -0.2551194727420807, + "step": 4468 + }, + { + "epoch": 0.6911270056060314, + "grad_norm": 3.8000991344451904, + "learning_rate": 4.275690227975713e-06, + "logits/chosen": 10.480460166931152, + "logits/rejected": 4.779432773590088, + "logps/chosen": -187.3704376220703, + "logps/rejected": -131.68356323242188, + "loss": 0.542, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.16134941577911377, + "rewards/margins": 0.3910316228866577, + "rewards/rejected": -0.22968220710754395, + "step": 4469 + }, + { + "epoch": 0.6912816547457955, + "grad_norm": 4.3030500411987305, + "learning_rate": 4.27540382632604e-06, + "logits/chosen": 8.15811824798584, + "logits/rejected": 7.014081954956055, + "logps/chosen": -230.70645141601562, + "logps/rejected": -187.0926513671875, + "loss": 0.5585, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.24688367545604706, + "rewards/margins": 0.33801594376564026, + "rewards/rejected": -0.091132253408432, + "step": 4470 + }, + { + "epoch": 0.6914363038855597, + "grad_norm": 5.325830459594727, + "learning_rate": 4.2751174246763664e-06, + "logits/chosen": 11.083613395690918, + "logits/rejected": 4.8147172927856445, + "logps/chosen": -370.3558044433594, + "logps/rejected": -254.15090942382812, + "loss": 0.5852, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.4289541244506836, + "rewards/margins": 0.2925434708595276, + "rewards/rejected": 0.136410653591156, + "step": 4471 + }, + { + "epoch": 0.6915909530253238, + "grad_norm": 5.956872940063477, + "learning_rate": 4.274831023026693e-06, + "logits/chosen": 9.191061019897461, + "logits/rejected": 8.84345817565918, + "logps/chosen": -521.2496337890625, + "logps/rejected": -432.1521301269531, + "loss": 0.5759, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.8666969537734985, + "rewards/margins": 0.32008469104766846, + "rewards/rejected": 0.5466122627258301, + "step": 4472 + }, + { + "epoch": 0.691745602165088, + "grad_norm": 5.084835529327393, + "learning_rate": 4.27454462137702e-06, + "logits/chosen": 14.329225540161133, + "logits/rejected": 16.042705535888672, + "logps/chosen": -234.87545776367188, + "logps/rejected": -271.8675842285156, + "loss": 0.6391, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.2103392779827118, + "rewards/margins": 0.3674933612346649, + "rewards/rejected": -0.15715409815311432, + "step": 4473 + }, + { + "epoch": 0.6919002513048521, + "grad_norm": 5.046330451965332, + "learning_rate": 4.2742582197273455e-06, + "logits/chosen": 9.555638313293457, + "logits/rejected": 6.8802809715271, + "logps/chosen": -372.1163330078125, + "logps/rejected": -311.79962158203125, + "loss": 0.5371, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.6100231409072876, + "rewards/margins": 0.39819854497909546, + "rewards/rejected": 0.21182462573051453, + "step": 4474 + }, + { + "epoch": 0.6920549004446163, + "grad_norm": 4.712859630584717, + "learning_rate": 4.273971818077672e-06, + "logits/chosen": 5.801811695098877, + "logits/rejected": 9.827659606933594, + "logps/chosen": -122.18645477294922, + "logps/rejected": -168.16775512695312, + "loss": 0.857, + "rewards/accuracies": 0.125, + "rewards/chosen": -0.10234571993350983, + "rewards/margins": -0.26679888367652893, + "rewards/rejected": 0.1644531786441803, + "step": 4475 + }, + { + "epoch": 0.6922095495843804, + "grad_norm": 4.788932800292969, + "learning_rate": 4.273685416427999e-06, + "logits/chosen": 6.0025129318237305, + "logits/rejected": 6.455308437347412, + "logps/chosen": -168.61260986328125, + "logps/rejected": -158.684814453125, + "loss": 0.6885, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.12361730635166168, + "rewards/margins": 0.22754812240600586, + "rewards/rejected": -0.10393082350492477, + "step": 4476 + }, + { + "epoch": 0.6923641987241446, + "grad_norm": 10.367257118225098, + "learning_rate": 4.2733990147783255e-06, + "logits/chosen": 6.191953182220459, + "logits/rejected": 2.2458343505859375, + "logps/chosen": -314.0201721191406, + "logps/rejected": -335.0791015625, + "loss": 0.6075, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.3049100637435913, + "rewards/margins": 0.3066188395023346, + "rewards/rejected": -0.0017087459564208984, + "step": 4477 + }, + { + "epoch": 0.6925188478639087, + "grad_norm": 5.876046657562256, + "learning_rate": 4.273112613128652e-06, + "logits/chosen": 7.488229751586914, + "logits/rejected": 5.924365997314453, + "logps/chosen": -311.04901123046875, + "logps/rejected": -267.79742431640625, + "loss": 0.7319, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.31675636768341064, + "rewards/margins": -0.012081734836101532, + "rewards/rejected": 0.3288381099700928, + "step": 4478 + }, + { + "epoch": 0.6926734970036729, + "grad_norm": 6.2538533210754395, + "learning_rate": 4.272826211478979e-06, + "logits/chosen": 8.13729190826416, + "logits/rejected": 9.317706108093262, + "logps/chosen": -293.9158020019531, + "logps/rejected": -305.23419189453125, + "loss": 0.6176, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.32759976387023926, + "rewards/margins": 0.29027268290519714, + "rewards/rejected": 0.03732709586620331, + "step": 4479 + }, + { + "epoch": 0.6928281461434371, + "grad_norm": 5.943842887878418, + "learning_rate": 4.272539809829305e-06, + "logits/chosen": 8.336824417114258, + "logits/rejected": 7.882235527038574, + "logps/chosen": -238.84783935546875, + "logps/rejected": -273.76824951171875, + "loss": 0.6015, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.0010039806365966797, + "rewards/margins": 0.2987978756427765, + "rewards/rejected": -0.29980188608169556, + "step": 4480 + }, + { + "epoch": 0.6929827952832013, + "grad_norm": 5.471213340759277, + "learning_rate": 4.272253408179631e-06, + "logits/chosen": 11.127744674682617, + "logits/rejected": 5.473817825317383, + "logps/chosen": -279.04754638671875, + "logps/rejected": -292.2149658203125, + "loss": 0.5747, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.398656964302063, + "rewards/margins": 0.31592652201652527, + "rewards/rejected": 0.0827304795384407, + "step": 4481 + }, + { + "epoch": 0.6931374444229654, + "grad_norm": 4.88809871673584, + "learning_rate": 4.271967006529958e-06, + "logits/chosen": 3.76755428314209, + "logits/rejected": 2.7136266231536865, + "logps/chosen": -179.02857971191406, + "logps/rejected": -233.4971466064453, + "loss": 0.556, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.5539631843566895, + "rewards/margins": 0.3540407419204712, + "rewards/rejected": 0.19992247223854065, + "step": 4482 + }, + { + "epoch": 0.6932920935627296, + "grad_norm": 5.8448591232299805, + "learning_rate": 4.2716806048802846e-06, + "logits/chosen": 13.463512420654297, + "logits/rejected": 10.485298156738281, + "logps/chosen": -413.3531494140625, + "logps/rejected": -336.475341796875, + "loss": 0.6576, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.48680782318115234, + "rewards/margins": 0.15102580189704895, + "rewards/rejected": 0.3357820510864258, + "step": 4483 + }, + { + "epoch": 0.6934467427024937, + "grad_norm": 4.641660213470459, + "learning_rate": 4.271394203230611e-06, + "logits/chosen": 11.000204086303711, + "logits/rejected": 10.270837783813477, + "logps/chosen": -292.2807922363281, + "logps/rejected": -249.6904296875, + "loss": 0.5762, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.3724250793457031, + "rewards/margins": 0.3019684851169586, + "rewards/rejected": 0.0704566091299057, + "step": 4484 + }, + { + "epoch": 0.6936013918422579, + "grad_norm": 4.448966979980469, + "learning_rate": 4.271107801580938e-06, + "logits/chosen": 10.849218368530273, + "logits/rejected": 11.506805419921875, + "logps/chosen": -271.62860107421875, + "logps/rejected": -260.63311767578125, + "loss": 0.5912, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.29378825426101685, + "rewards/margins": 0.4633154571056366, + "rewards/rejected": -0.16952726244926453, + "step": 4485 + }, + { + "epoch": 0.693756040982022, + "grad_norm": 5.159413814544678, + "learning_rate": 4.2708213999312645e-06, + "logits/chosen": -0.9197348356246948, + "logits/rejected": 3.000253200531006, + "logps/chosen": -192.28976440429688, + "logps/rejected": -220.23855590820312, + "loss": 0.769, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.26910415291786194, + "rewards/margins": 0.04591580480337143, + "rewards/rejected": -0.31501996517181396, + "step": 4486 + }, + { + "epoch": 0.6939106901217862, + "grad_norm": 6.626440525054932, + "learning_rate": 4.27053499828159e-06, + "logits/chosen": 6.265573501586914, + "logits/rejected": 8.110580444335938, + "logps/chosen": -260.4603271484375, + "logps/rejected": -301.66131591796875, + "loss": 0.884, + "rewards/accuracies": 0.125, + "rewards/chosen": 0.009654007852077484, + "rewards/margins": -0.33339405059814453, + "rewards/rejected": 0.3430480659008026, + "step": 4487 + }, + { + "epoch": 0.6940653392615503, + "grad_norm": 4.198341369628906, + "learning_rate": 4.270248596631917e-06, + "logits/chosen": 12.62332534790039, + "logits/rejected": 7.45077657699585, + "logps/chosen": -217.82925415039062, + "logps/rejected": -155.8576202392578, + "loss": 0.6712, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.432839035987854, + "rewards/margins": 0.26763468980789185, + "rewards/rejected": 0.16520433127880096, + "step": 4488 + }, + { + "epoch": 0.6942199884013145, + "grad_norm": 4.832276821136475, + "learning_rate": 4.269962194982244e-06, + "logits/chosen": 10.574823379516602, + "logits/rejected": 7.657534599304199, + "logps/chosen": -199.66836547851562, + "logps/rejected": -145.17208862304688, + "loss": 0.6694, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.13170365989208221, + "rewards/margins": 0.12167917191982269, + "rewards/rejected": 0.010024495422840118, + "step": 4489 + }, + { + "epoch": 0.6943746375410786, + "grad_norm": 4.152619361877441, + "learning_rate": 4.26967579333257e-06, + "logits/chosen": 8.35746955871582, + "logits/rejected": 5.2645721435546875, + "logps/chosen": -326.18804931640625, + "logps/rejected": -267.47222900390625, + "loss": 0.5617, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.5245493650436401, + "rewards/margins": 0.3482816517353058, + "rewards/rejected": 0.17626763880252838, + "step": 4490 + }, + { + "epoch": 0.6945292866808428, + "grad_norm": 6.027318954467773, + "learning_rate": 4.269389391682897e-06, + "logits/chosen": 6.32731294631958, + "logits/rejected": 7.404786109924316, + "logps/chosen": -227.11331176757812, + "logps/rejected": -283.74835205078125, + "loss": 0.7601, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.4080234467983246, + "rewards/margins": -0.027156665921211243, + "rewards/rejected": 0.4351801574230194, + "step": 4491 + }, + { + "epoch": 0.694683935820607, + "grad_norm": 6.324120044708252, + "learning_rate": 4.2691029900332236e-06, + "logits/chosen": 8.313152313232422, + "logits/rejected": 8.947222709655762, + "logps/chosen": -255.2548828125, + "logps/rejected": -286.03387451171875, + "loss": 0.6289, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.09388574957847595, + "rewards/margins": 0.21506360173225403, + "rewards/rejected": -0.12117785215377808, + "step": 4492 + }, + { + "epoch": 0.6948385849603712, + "grad_norm": 4.870259761810303, + "learning_rate": 4.268816588383549e-06, + "logits/chosen": 7.434970378875732, + "logits/rejected": 2.9332218170166016, + "logps/chosen": -305.2400207519531, + "logps/rejected": -239.05392456054688, + "loss": 0.5754, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.18469566106796265, + "rewards/margins": 0.3774864673614502, + "rewards/rejected": -0.19279080629348755, + "step": 4493 + }, + { + "epoch": 0.6949932341001354, + "grad_norm": 6.050650596618652, + "learning_rate": 4.268530186733876e-06, + "logits/chosen": 8.938138008117676, + "logits/rejected": 7.8709540367126465, + "logps/chosen": -270.3529052734375, + "logps/rejected": -231.974365234375, + "loss": 0.6593, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.2741074562072754, + "rewards/margins": 0.08563751727342606, + "rewards/rejected": 0.18846994638442993, + "step": 4494 + }, + { + "epoch": 0.6951478832398995, + "grad_norm": 6.4017229080200195, + "learning_rate": 4.268243785084203e-06, + "logits/chosen": 14.082723617553711, + "logits/rejected": 9.132933616638184, + "logps/chosen": -396.4774169921875, + "logps/rejected": -386.0800476074219, + "loss": 0.6833, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.35264596343040466, + "rewards/margins": 0.24040357768535614, + "rewards/rejected": 0.11224241554737091, + "step": 4495 + }, + { + "epoch": 0.6953025323796637, + "grad_norm": 5.366425037384033, + "learning_rate": 4.267957383434529e-06, + "logits/chosen": 6.996615409851074, + "logits/rejected": 4.5047478675842285, + "logps/chosen": -385.19329833984375, + "logps/rejected": -153.43960571289062, + "loss": 0.4978, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.7135995030403137, + "rewards/margins": 0.548667848110199, + "rewards/rejected": 0.16493159532546997, + "step": 4496 + }, + { + "epoch": 0.6954571815194278, + "grad_norm": 5.529112815856934, + "learning_rate": 4.267670981784855e-06, + "logits/chosen": 8.390474319458008, + "logits/rejected": 9.9286527633667, + "logps/chosen": -238.40325927734375, + "logps/rejected": -242.73394775390625, + "loss": 0.6086, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.18575145304203033, + "rewards/margins": 0.2877410352230072, + "rewards/rejected": -0.10198955237865448, + "step": 4497 + }, + { + "epoch": 0.695611830659192, + "grad_norm": 5.971905708312988, + "learning_rate": 4.267384580135182e-06, + "logits/chosen": 13.26982307434082, + "logits/rejected": 10.678499221801758, + "logps/chosen": -267.1767578125, + "logps/rejected": -237.23423767089844, + "loss": 0.8925, + "rewards/accuracies": 0.25, + "rewards/chosen": 0.26273947954177856, + "rewards/margins": -0.15909171104431152, + "rewards/rejected": 0.4218311905860901, + "step": 4498 + }, + { + "epoch": 0.6957664797989561, + "grad_norm": 4.168835163116455, + "learning_rate": 4.267098178485508e-06, + "logits/chosen": 9.130517959594727, + "logits/rejected": 3.54677677154541, + "logps/chosen": -227.73214721679688, + "logps/rejected": -138.7923583984375, + "loss": 0.4899, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.4243078827857971, + "rewards/margins": 0.5874795317649841, + "rewards/rejected": -0.1631716787815094, + "step": 4499 + }, + { + "epoch": 0.6959211289387203, + "grad_norm": 6.117710590362549, + "learning_rate": 4.266811776835835e-06, + "logits/chosen": 12.560050964355469, + "logits/rejected": 11.859695434570312, + "logps/chosen": -182.50404357910156, + "logps/rejected": -286.2406311035156, + "loss": 0.6277, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.17282512784004211, + "rewards/margins": 0.18937042355537415, + "rewards/rejected": -0.01654529571533203, + "step": 4500 + }, + { + "epoch": 0.6960757780784844, + "grad_norm": 5.799213409423828, + "learning_rate": 4.266525375186161e-06, + "logits/chosen": 11.16567611694336, + "logits/rejected": 7.803576469421387, + "logps/chosen": -298.4626159667969, + "logps/rejected": -217.97315979003906, + "loss": 0.6286, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.3227972090244293, + "rewards/margins": 0.20572544634342194, + "rewards/rejected": 0.11707176268100739, + "step": 4501 + }, + { + "epoch": 0.6962304272182486, + "grad_norm": 7.603994369506836, + "learning_rate": 4.2662389735364875e-06, + "logits/chosen": 6.379083633422852, + "logits/rejected": 3.5388684272766113, + "logps/chosen": -316.0343933105469, + "logps/rejected": -299.93084716796875, + "loss": 0.7439, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.15197044610977173, + "rewards/margins": 0.04690173268318176, + "rewards/rejected": 0.10506869852542877, + "step": 4502 + }, + { + "epoch": 0.6963850763580127, + "grad_norm": 6.077985763549805, + "learning_rate": 4.265952571886814e-06, + "logits/chosen": 6.600677490234375, + "logits/rejected": 15.022510528564453, + "logps/chosen": -209.51133728027344, + "logps/rejected": -285.92681884765625, + "loss": 0.8151, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.0385466068983078, + "rewards/margins": -0.17701105773448944, + "rewards/rejected": 0.21555766463279724, + "step": 4503 + }, + { + "epoch": 0.6965397254977769, + "grad_norm": 4.445544719696045, + "learning_rate": 4.265666170237141e-06, + "logits/chosen": 6.029338836669922, + "logits/rejected": 7.2780375480651855, + "logps/chosen": -214.21994018554688, + "logps/rejected": -277.39996337890625, + "loss": 0.5956, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.15856796503067017, + "rewards/margins": 0.385037362575531, + "rewards/rejected": -0.22646939754486084, + "step": 4504 + }, + { + "epoch": 0.696694374637541, + "grad_norm": 6.936688423156738, + "learning_rate": 4.2653797685874675e-06, + "logits/chosen": 4.115750789642334, + "logits/rejected": 7.8610334396362305, + "logps/chosen": -236.58029174804688, + "logps/rejected": -278.53924560546875, + "loss": 0.7073, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.24960748851299286, + "rewards/margins": 0.03195124864578247, + "rewards/rejected": 0.2176562249660492, + "step": 4505 + }, + { + "epoch": 0.6968490237773053, + "grad_norm": 4.63679313659668, + "learning_rate": 4.265093366937793e-06, + "logits/chosen": 14.686271667480469, + "logits/rejected": 15.974813461303711, + "logps/chosen": -236.45315551757812, + "logps/rejected": -291.4210510253906, + "loss": 0.5889, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.4108043909072876, + "rewards/margins": 0.3886268734931946, + "rewards/rejected": 0.02217748761177063, + "step": 4506 + }, + { + "epoch": 0.6970036729170694, + "grad_norm": 4.628255367279053, + "learning_rate": 4.26480696528812e-06, + "logits/chosen": 9.792828559875488, + "logits/rejected": 6.230591297149658, + "logps/chosen": -218.79173278808594, + "logps/rejected": -169.16140747070312, + "loss": 0.5344, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.30802667140960693, + "rewards/margins": 0.5733361840248108, + "rewards/rejected": -0.26530954241752625, + "step": 4507 + }, + { + "epoch": 0.6971583220568336, + "grad_norm": 3.8382160663604736, + "learning_rate": 4.264520563638447e-06, + "logits/chosen": 12.786280632019043, + "logits/rejected": 3.418998956680298, + "logps/chosen": -283.6490173339844, + "logps/rejected": -180.78756713867188, + "loss": 0.5065, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.210466668009758, + "rewards/margins": 0.4462009072303772, + "rewards/rejected": -0.235734224319458, + "step": 4508 + }, + { + "epoch": 0.6973129711965977, + "grad_norm": 5.651285171508789, + "learning_rate": 4.264234161988773e-06, + "logits/chosen": 14.744607925415039, + "logits/rejected": 10.569001197814941, + "logps/chosen": -315.6968688964844, + "logps/rejected": -268.34527587890625, + "loss": 0.7555, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.14891844987869263, + "rewards/margins": 0.03636118769645691, + "rewards/rejected": 0.11255726218223572, + "step": 4509 + }, + { + "epoch": 0.6974676203363619, + "grad_norm": 3.8856687545776367, + "learning_rate": 4.2639477603391e-06, + "logits/chosen": 13.814919471740723, + "logits/rejected": 7.502386093139648, + "logps/chosen": -260.0505065917969, + "logps/rejected": -218.2507781982422, + "loss": 0.4109, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.7865480780601501, + "rewards/margins": 0.7240950465202332, + "rewards/rejected": 0.06245298683643341, + "step": 4510 + }, + { + "epoch": 0.697622269476126, + "grad_norm": 7.40568733215332, + "learning_rate": 4.2636613586894265e-06, + "logits/chosen": 12.888557434082031, + "logits/rejected": 8.701112747192383, + "logps/chosen": -299.17144775390625, + "logps/rejected": -253.7444305419922, + "loss": 0.8301, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.061250388622283936, + "rewards/margins": -0.17605458199977875, + "rewards/rejected": 0.2373049557209015, + "step": 4511 + }, + { + "epoch": 0.6977769186158902, + "grad_norm": 4.889257907867432, + "learning_rate": 4.263374957039753e-06, + "logits/chosen": 7.409084796905518, + "logits/rejected": 4.115428924560547, + "logps/chosen": -305.44793701171875, + "logps/rejected": -267.0001220703125, + "loss": 0.5515, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.3631269931793213, + "rewards/margins": 0.39390960335731506, + "rewards/rejected": -0.030782606452703476, + "step": 4512 + }, + { + "epoch": 0.6979315677556543, + "grad_norm": 5.9372053146362305, + "learning_rate": 4.263088555390079e-06, + "logits/chosen": 14.701896667480469, + "logits/rejected": 10.373376846313477, + "logps/chosen": -351.3394775390625, + "logps/rejected": -372.42669677734375, + "loss": 0.729, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.4987856149673462, + "rewards/margins": 0.013319596648216248, + "rewards/rejected": 0.48546600341796875, + "step": 4513 + }, + { + "epoch": 0.6980862168954185, + "grad_norm": 16.87580680847168, + "learning_rate": 4.262802153740406e-06, + "logits/chosen": 7.63148832321167, + "logits/rejected": 11.962331771850586, + "logps/chosen": -214.32266235351562, + "logps/rejected": -246.63441467285156, + "loss": 0.5951, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.1629994958639145, + "rewards/margins": 0.31202423572540283, + "rewards/rejected": -0.14902472496032715, + "step": 4514 + }, + { + "epoch": 0.6982408660351827, + "grad_norm": 10.542787551879883, + "learning_rate": 4.262515752090732e-06, + "logits/chosen": 11.416427612304688, + "logits/rejected": 10.310189247131348, + "logps/chosen": -337.01336669921875, + "logps/rejected": -369.6172790527344, + "loss": 0.857, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.20953340828418732, + "rewards/margins": -0.14943121373653412, + "rewards/rejected": 0.35896462202072144, + "step": 4515 + }, + { + "epoch": 0.6983955151749468, + "grad_norm": 4.263949871063232, + "learning_rate": 4.262229350441059e-06, + "logits/chosen": 7.161815643310547, + "logits/rejected": 5.741499900817871, + "logps/chosen": -192.8360137939453, + "logps/rejected": -168.21011352539062, + "loss": 0.6509, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.5079464912414551, + "rewards/margins": 0.29513001441955566, + "rewards/rejected": 0.21281644701957703, + "step": 4516 + }, + { + "epoch": 0.698550164314711, + "grad_norm": 4.787829399108887, + "learning_rate": 4.261942948791386e-06, + "logits/chosen": 13.464323043823242, + "logits/rejected": 7.8164381980896, + "logps/chosen": -182.98135375976562, + "logps/rejected": -145.70376586914062, + "loss": 0.6729, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.16753850877285004, + "rewards/margins": 0.1046389490365982, + "rewards/rejected": 0.06289957463741302, + "step": 4517 + }, + { + "epoch": 0.6987048134544751, + "grad_norm": 5.972153663635254, + "learning_rate": 4.261656547141712e-06, + "logits/chosen": 7.715356826782227, + "logits/rejected": 11.312923431396484, + "logps/chosen": -270.59625244140625, + "logps/rejected": -294.0712890625, + "loss": 0.7713, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.25535401701927185, + "rewards/margins": -0.048941247165203094, + "rewards/rejected": 0.30429530143737793, + "step": 4518 + }, + { + "epoch": 0.6988594625942394, + "grad_norm": 5.7866363525390625, + "learning_rate": 4.261370145492039e-06, + "logits/chosen": 10.942037582397461, + "logits/rejected": 16.701414108276367, + "logps/chosen": -205.97909545898438, + "logps/rejected": -247.9434356689453, + "loss": 0.7537, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.05205276608467102, + "rewards/margins": 0.06624981015920639, + "rewards/rejected": -0.1183025985956192, + "step": 4519 + }, + { + "epoch": 0.6990141117340035, + "grad_norm": 5.484710216522217, + "learning_rate": 4.261083743842365e-06, + "logits/chosen": 8.549579620361328, + "logits/rejected": 7.959667205810547, + "logps/chosen": -248.42132568359375, + "logps/rejected": -236.71444702148438, + "loss": 0.7051, + "rewards/accuracies": 0.25, + "rewards/chosen": 0.25760984420776367, + "rewards/margins": 0.07971373200416565, + "rewards/rejected": 0.17789611220359802, + "step": 4520 + }, + { + "epoch": 0.6991687608737677, + "grad_norm": 5.9600300788879395, + "learning_rate": 4.260797342192691e-06, + "logits/chosen": 13.78126049041748, + "logits/rejected": 3.591838836669922, + "logps/chosen": -281.55450439453125, + "logps/rejected": -188.72952270507812, + "loss": 0.5198, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.16513997316360474, + "rewards/margins": 0.6938029527664185, + "rewards/rejected": -0.5286629796028137, + "step": 4521 + }, + { + "epoch": 0.6993234100135318, + "grad_norm": 4.384075164794922, + "learning_rate": 4.260510940543018e-06, + "logits/chosen": 7.3213210105896, + "logits/rejected": 0.5768899917602539, + "logps/chosen": -242.21971130371094, + "logps/rejected": -195.26075744628906, + "loss": 0.6667, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.41109585762023926, + "rewards/margins": 0.12469722330570221, + "rewards/rejected": 0.28639861941337585, + "step": 4522 + }, + { + "epoch": 0.699478059153296, + "grad_norm": 5.961554050445557, + "learning_rate": 4.260224538893345e-06, + "logits/chosen": 10.137752532958984, + "logits/rejected": 11.230949401855469, + "logps/chosen": -345.00958251953125, + "logps/rejected": -326.57818603515625, + "loss": 0.715, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.24702347815036774, + "rewards/margins": 0.18671254813671112, + "rewards/rejected": 0.060310930013656616, + "step": 4523 + }, + { + "epoch": 0.6996327082930601, + "grad_norm": 5.912920951843262, + "learning_rate": 4.259938137243671e-06, + "logits/chosen": 9.817709922790527, + "logits/rejected": 2.472628593444824, + "logps/chosen": -267.9261779785156, + "logps/rejected": -233.56130981445312, + "loss": 0.7176, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.2913530468940735, + "rewards/margins": 0.18615540862083435, + "rewards/rejected": 0.10519766807556152, + "step": 4524 + }, + { + "epoch": 0.6997873574328243, + "grad_norm": 8.732048988342285, + "learning_rate": 4.259651735593998e-06, + "logits/chosen": 5.436592102050781, + "logits/rejected": 8.967561721801758, + "logps/chosen": -207.32400512695312, + "logps/rejected": -276.990966796875, + "loss": 1.112, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.25100183486938477, + "rewards/margins": -0.5311140418052673, + "rewards/rejected": 0.28011220693588257, + "step": 4525 + }, + { + "epoch": 0.6999420065725884, + "grad_norm": 7.210418701171875, + "learning_rate": 4.259365333944324e-06, + "logits/chosen": 11.290397644042969, + "logits/rejected": 6.048158168792725, + "logps/chosen": -329.5198974609375, + "logps/rejected": -248.43145751953125, + "loss": 0.699, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.6017400622367859, + "rewards/margins": 0.13417525589466095, + "rewards/rejected": 0.4675648510456085, + "step": 4526 + }, + { + "epoch": 0.7000966557123526, + "grad_norm": 4.823277950286865, + "learning_rate": 4.25907893229465e-06, + "logits/chosen": 14.772256851196289, + "logits/rejected": 4.461668968200684, + "logps/chosen": -449.2216796875, + "logps/rejected": -301.8907165527344, + "loss": 0.407, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.6074482202529907, + "rewards/margins": 0.7500501871109009, + "rewards/rejected": -0.14260196685791016, + "step": 4527 + }, + { + "epoch": 0.7002513048521167, + "grad_norm": 10.850847244262695, + "learning_rate": 4.258792530644977e-06, + "logits/chosen": 9.387843132019043, + "logits/rejected": 15.987394332885742, + "logps/chosen": -233.4283447265625, + "logps/rejected": -317.0649719238281, + "loss": 0.9578, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.3869436979293823, + "rewards/margins": -0.13617311418056488, + "rewards/rejected": -0.25077056884765625, + "step": 4528 + }, + { + "epoch": 0.7004059539918809, + "grad_norm": 6.553378582000732, + "learning_rate": 4.258506128995304e-06, + "logits/chosen": 9.833511352539062, + "logits/rejected": 9.896178245544434, + "logps/chosen": -260.8777770996094, + "logps/rejected": -245.5105743408203, + "loss": 0.8856, + "rewards/accuracies": 0.25, + "rewards/chosen": 0.5739567875862122, + "rewards/margins": -0.2990604043006897, + "rewards/rejected": 0.8730171918869019, + "step": 4529 + }, + { + "epoch": 0.700560603131645, + "grad_norm": 5.9011054039001465, + "learning_rate": 4.25821972734563e-06, + "logits/chosen": 10.517602920532227, + "logits/rejected": 11.96645450592041, + "logps/chosen": -261.49993896484375, + "logps/rejected": -333.6334533691406, + "loss": 0.6194, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.6023586392402649, + "rewards/margins": 0.22985060513019562, + "rewards/rejected": 0.37250804901123047, + "step": 4530 + }, + { + "epoch": 0.7007152522714093, + "grad_norm": 8.298968315124512, + "learning_rate": 4.257933325695956e-06, + "logits/chosen": 5.801590919494629, + "logits/rejected": 4.023548603057861, + "logps/chosen": -252.943603515625, + "logps/rejected": -258.43560791015625, + "loss": 0.6664, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.34438368678092957, + "rewards/margins": 0.33953070640563965, + "rewards/rejected": 0.004852950572967529, + "step": 4531 + }, + { + "epoch": 0.7008699014111734, + "grad_norm": 5.263658046722412, + "learning_rate": 4.257646924046283e-06, + "logits/chosen": 11.740427017211914, + "logits/rejected": 3.867795944213867, + "logps/chosen": -329.8431701660156, + "logps/rejected": -238.7526092529297, + "loss": 0.4305, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.477817565202713, + "rewards/margins": 0.7964357137680054, + "rewards/rejected": -0.31861817836761475, + "step": 4532 + }, + { + "epoch": 0.7010245505509376, + "grad_norm": 4.682222366333008, + "learning_rate": 4.2573605223966095e-06, + "logits/chosen": 12.433629989624023, + "logits/rejected": 4.082302570343018, + "logps/chosen": -290.85711669921875, + "logps/rejected": -162.19224548339844, + "loss": 0.6679, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.2377374768257141, + "rewards/margins": 0.061184413731098175, + "rewards/rejected": 0.17655307054519653, + "step": 4533 + }, + { + "epoch": 0.7011791996907017, + "grad_norm": 5.300076961517334, + "learning_rate": 4.257074120746936e-06, + "logits/chosen": 5.017838954925537, + "logits/rejected": 6.487076759338379, + "logps/chosen": -194.14794921875, + "logps/rejected": -207.20458984375, + "loss": 0.7196, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.4330114424228668, + "rewards/margins": 0.04724682867527008, + "rewards/rejected": 0.38576459884643555, + "step": 4534 + }, + { + "epoch": 0.7013338488304659, + "grad_norm": 3.788860321044922, + "learning_rate": 4.256787719097262e-06, + "logits/chosen": 12.992966651916504, + "logits/rejected": 11.000411033630371, + "logps/chosen": -218.46665954589844, + "logps/rejected": -216.50485229492188, + "loss": 0.5025, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.5065352916717529, + "rewards/margins": 0.5044621229171753, + "rewards/rejected": 0.0020730923861265182, + "step": 4535 + }, + { + "epoch": 0.70148849797023, + "grad_norm": 6.75756311416626, + "learning_rate": 4.2565013174475886e-06, + "logits/chosen": 7.504397869110107, + "logits/rejected": 8.28226375579834, + "logps/chosen": -173.804443359375, + "logps/rejected": -207.50404357910156, + "loss": 0.7555, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.4218415319919586, + "rewards/margins": -0.07979655265808105, + "rewards/rejected": 0.5016380548477173, + "step": 4536 + }, + { + "epoch": 0.7016431471099942, + "grad_norm": 26.790225982666016, + "learning_rate": 4.256214915797915e-06, + "logits/chosen": 9.217204093933105, + "logits/rejected": 14.140243530273438, + "logps/chosen": -297.89923095703125, + "logps/rejected": -343.83428955078125, + "loss": 0.6641, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.45408692955970764, + "rewards/margins": 0.1876131147146225, + "rewards/rejected": 0.26647382974624634, + "step": 4537 + }, + { + "epoch": 0.7017977962497584, + "grad_norm": 6.070600509643555, + "learning_rate": 4.255928514148242e-06, + "logits/chosen": 12.289417266845703, + "logits/rejected": 8.979168891906738, + "logps/chosen": -249.48367309570312, + "logps/rejected": -179.6365966796875, + "loss": 0.7263, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.29100480675697327, + "rewards/margins": 0.11057285964488983, + "rewards/rejected": 0.18043194711208344, + "step": 4538 + }, + { + "epoch": 0.7019524453895225, + "grad_norm": 7.463078022003174, + "learning_rate": 4.255642112498568e-06, + "logits/chosen": 12.173538208007812, + "logits/rejected": 12.051815032958984, + "logps/chosen": -382.2598876953125, + "logps/rejected": -344.8181457519531, + "loss": 0.6769, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.2353130429983139, + "rewards/margins": 0.3194710910320282, + "rewards/rejected": -0.0841580480337143, + "step": 4539 + }, + { + "epoch": 0.7021070945292867, + "grad_norm": 4.894156455993652, + "learning_rate": 4.255355710848894e-06, + "logits/chosen": 8.359127044677734, + "logits/rejected": 7.2022600173950195, + "logps/chosen": -262.2000427246094, + "logps/rejected": -276.2204284667969, + "loss": 0.5581, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.7289108037948608, + "rewards/margins": 0.3931763768196106, + "rewards/rejected": 0.33573436737060547, + "step": 4540 + }, + { + "epoch": 0.7022617436690508, + "grad_norm": 9.222636222839355, + "learning_rate": 4.255069309199221e-06, + "logits/chosen": 8.670086860656738, + "logits/rejected": 12.20486831665039, + "logps/chosen": -280.91339111328125, + "logps/rejected": -349.40313720703125, + "loss": 0.9708, + "rewards/accuracies": 0.25, + "rewards/chosen": -0.18760338425636292, + "rewards/margins": -0.36386236548423767, + "rewards/rejected": 0.17625896632671356, + "step": 4541 + }, + { + "epoch": 0.702416392808815, + "grad_norm": 4.30269193649292, + "learning_rate": 4.254782907549548e-06, + "logits/chosen": 12.84939956665039, + "logits/rejected": 13.319472312927246, + "logps/chosen": -176.6269989013672, + "logps/rejected": -174.7431182861328, + "loss": 0.6821, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.3207028806209564, + "rewards/margins": 0.09814807772636414, + "rewards/rejected": 0.2225547879934311, + "step": 4542 + }, + { + "epoch": 0.7025710419485791, + "grad_norm": 4.3327555656433105, + "learning_rate": 4.254496505899874e-06, + "logits/chosen": 11.346357345581055, + "logits/rejected": 8.260499954223633, + "logps/chosen": -236.2779083251953, + "logps/rejected": -185.6102294921875, + "loss": 0.4655, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.32453033328056335, + "rewards/margins": 0.6208864450454712, + "rewards/rejected": -0.29635608196258545, + "step": 4543 + }, + { + "epoch": 0.7027256910883434, + "grad_norm": 4.27791690826416, + "learning_rate": 4.254210104250201e-06, + "logits/chosen": 11.70185375213623, + "logits/rejected": 5.190775394439697, + "logps/chosen": -261.27142333984375, + "logps/rejected": -179.78782653808594, + "loss": 0.5447, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.45501822233200073, + "rewards/margins": 0.3926829397678375, + "rewards/rejected": 0.0623352974653244, + "step": 4544 + }, + { + "epoch": 0.7028803402281075, + "grad_norm": 6.794124126434326, + "learning_rate": 4.2539237026005276e-06, + "logits/chosen": 7.268523693084717, + "logits/rejected": 3.076155424118042, + "logps/chosen": -360.5343322753906, + "logps/rejected": -230.9642791748047, + "loss": 0.5687, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.7111493349075317, + "rewards/margins": 0.31361597776412964, + "rewards/rejected": 0.3975334167480469, + "step": 4545 + }, + { + "epoch": 0.7030349893678717, + "grad_norm": 3.671812057495117, + "learning_rate": 4.253637300950853e-06, + "logits/chosen": 11.911089897155762, + "logits/rejected": 10.885863304138184, + "logps/chosen": -102.51800537109375, + "logps/rejected": -132.78990173339844, + "loss": 0.6537, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.05699014663696289, + "rewards/margins": 0.1282472014427185, + "rewards/rejected": -0.1852373331785202, + "step": 4546 + }, + { + "epoch": 0.7031896385076358, + "grad_norm": 5.970956325531006, + "learning_rate": 4.25335089930118e-06, + "logits/chosen": 13.8966064453125, + "logits/rejected": 10.703203201293945, + "logps/chosen": -326.9024658203125, + "logps/rejected": -328.2403564453125, + "loss": 0.5502, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.3544917106628418, + "rewards/margins": 0.4477842152118683, + "rewards/rejected": -0.09329252690076828, + "step": 4547 + }, + { + "epoch": 0.7033442876474, + "grad_norm": 5.157553195953369, + "learning_rate": 4.253064497651507e-06, + "logits/chosen": 6.360335350036621, + "logits/rejected": 6.4123311042785645, + "logps/chosen": -350.803955078125, + "logps/rejected": -315.10003662109375, + "loss": 0.6344, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.4565875232219696, + "rewards/margins": 0.1502254605293274, + "rewards/rejected": 0.3063620626926422, + "step": 4548 + }, + { + "epoch": 0.7034989367871641, + "grad_norm": 7.361348628997803, + "learning_rate": 4.252778096001833e-06, + "logits/chosen": 9.062346458435059, + "logits/rejected": 7.906216621398926, + "logps/chosen": -260.3416748046875, + "logps/rejected": -255.54330444335938, + "loss": 0.7359, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.2632283866405487, + "rewards/margins": -0.004096284508705139, + "rewards/rejected": 0.26732465624809265, + "step": 4549 + }, + { + "epoch": 0.7036535859269283, + "grad_norm": 4.224150657653809, + "learning_rate": 4.25249169435216e-06, + "logits/chosen": 8.379314422607422, + "logits/rejected": 9.338112831115723, + "logps/chosen": -161.53048706054688, + "logps/rejected": -202.90817260742188, + "loss": 0.6719, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.30146777629852295, + "rewards/margins": 0.0955372303724289, + "rewards/rejected": 0.20593053102493286, + "step": 4550 + }, + { + "epoch": 0.7038082350666924, + "grad_norm": 5.291877269744873, + "learning_rate": 4.252205292702487e-06, + "logits/chosen": 14.643348693847656, + "logits/rejected": 12.800247192382812, + "logps/chosen": -372.36785888671875, + "logps/rejected": -353.1993408203125, + "loss": 0.6063, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.5627739429473877, + "rewards/margins": 0.23615151643753052, + "rewards/rejected": 0.3266223669052124, + "step": 4551 + }, + { + "epoch": 0.7039628842064566, + "grad_norm": 5.587003231048584, + "learning_rate": 4.251918891052813e-06, + "logits/chosen": 6.755594253540039, + "logits/rejected": 9.65949535369873, + "logps/chosen": -193.87896728515625, + "logps/rejected": -224.65139770507812, + "loss": 0.7177, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.28870445489883423, + "rewards/margins": -0.018207117915153503, + "rewards/rejected": 0.3069115877151489, + "step": 4552 + }, + { + "epoch": 0.7041175333462207, + "grad_norm": 3.8572256565093994, + "learning_rate": 4.251632489403139e-06, + "logits/chosen": 4.550288200378418, + "logits/rejected": 5.042516708374023, + "logps/chosen": -166.97640991210938, + "logps/rejected": -166.8123321533203, + "loss": 0.6115, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.15429887175559998, + "rewards/margins": 0.5061265230178833, + "rewards/rejected": -0.3518276810646057, + "step": 4553 + }, + { + "epoch": 0.7042721824859849, + "grad_norm": 5.494017124176025, + "learning_rate": 4.251346087753466e-06, + "logits/chosen": 10.74178409576416, + "logits/rejected": 10.703782081604004, + "logps/chosen": -271.8233642578125, + "logps/rejected": -284.4439697265625, + "loss": 0.6403, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.10947389155626297, + "rewards/margins": 0.2876623272895813, + "rewards/rejected": -0.17818844318389893, + "step": 4554 + }, + { + "epoch": 0.704426831625749, + "grad_norm": 3.6626198291778564, + "learning_rate": 4.251059686103792e-06, + "logits/chosen": 10.443584442138672, + "logits/rejected": 9.946053504943848, + "logps/chosen": -169.6202392578125, + "logps/rejected": -165.19149780273438, + "loss": 0.5786, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.1825583428144455, + "rewards/margins": 0.416953444480896, + "rewards/rejected": -0.23439514636993408, + "step": 4555 + }, + { + "epoch": 0.7045814807655132, + "grad_norm": 4.254640579223633, + "learning_rate": 4.250773284454119e-06, + "logits/chosen": 11.594375610351562, + "logits/rejected": 2.9958701133728027, + "logps/chosen": -288.03485107421875, + "logps/rejected": -138.9364471435547, + "loss": 0.5742, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.16672661900520325, + "rewards/margins": 0.3213419020175934, + "rewards/rejected": -0.15461528301239014, + "step": 4556 + }, + { + "epoch": 0.7047361299052775, + "grad_norm": 6.709202289581299, + "learning_rate": 4.250486882804446e-06, + "logits/chosen": 10.963682174682617, + "logits/rejected": 13.264007568359375, + "logps/chosen": -294.2309265136719, + "logps/rejected": -347.676025390625, + "loss": 0.8937, + "rewards/accuracies": 0.125, + "rewards/chosen": 0.11398278176784515, + "rewards/margins": -0.3024635314941406, + "rewards/rejected": 0.4164462983608246, + "step": 4557 + }, + { + "epoch": 0.7048907790450416, + "grad_norm": 6.53480339050293, + "learning_rate": 4.250200481154772e-06, + "logits/chosen": 9.597980499267578, + "logits/rejected": 2.7294387817382812, + "logps/chosen": -363.687255859375, + "logps/rejected": -285.1516418457031, + "loss": 0.6744, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.15482865273952484, + "rewards/margins": 0.08394481241703033, + "rewards/rejected": 0.0708838552236557, + "step": 4558 + }, + { + "epoch": 0.7050454281848058, + "grad_norm": 5.466675758361816, + "learning_rate": 4.249914079505098e-06, + "logits/chosen": 4.729236602783203, + "logits/rejected": 5.2614569664001465, + "logps/chosen": -213.840576171875, + "logps/rejected": -232.79869079589844, + "loss": 0.7956, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.06657245010137558, + "rewards/margins": -0.09675539284944534, + "rewards/rejected": 0.16332782804965973, + "step": 4559 + }, + { + "epoch": 0.7052000773245699, + "grad_norm": 6.556961536407471, + "learning_rate": 4.249627677855425e-06, + "logits/chosen": 8.17802619934082, + "logits/rejected": 5.780283451080322, + "logps/chosen": -362.2308654785156, + "logps/rejected": -322.38104248046875, + "loss": 0.667, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.36667710542678833, + "rewards/margins": 0.12676569819450378, + "rewards/rejected": 0.23991142213344574, + "step": 4560 + }, + { + "epoch": 0.7053547264643341, + "grad_norm": 14.223852157592773, + "learning_rate": 4.2493412762057514e-06, + "logits/chosen": 9.246179580688477, + "logits/rejected": 7.893303871154785, + "logps/chosen": -296.88665771484375, + "logps/rejected": -242.22293090820312, + "loss": 0.7423, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.22583410143852234, + "rewards/margins": 0.11306314170360565, + "rewards/rejected": -0.3388972282409668, + "step": 4561 + }, + { + "epoch": 0.7055093756040982, + "grad_norm": 6.777234077453613, + "learning_rate": 4.249054874556078e-06, + "logits/chosen": 10.873583793640137, + "logits/rejected": 5.482688903808594, + "logps/chosen": -371.5252990722656, + "logps/rejected": -287.93756103515625, + "loss": 0.5071, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.38150864839553833, + "rewards/margins": 0.5027434825897217, + "rewards/rejected": -0.12123481929302216, + "step": 4562 + }, + { + "epoch": 0.7056640247438624, + "grad_norm": 6.712673187255859, + "learning_rate": 4.248768472906405e-06, + "logits/chosen": 14.129240989685059, + "logits/rejected": 8.121871948242188, + "logps/chosen": -416.73236083984375, + "logps/rejected": -246.70632934570312, + "loss": 0.8318, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.06838780641555786, + "rewards/margins": -0.18072742223739624, + "rewards/rejected": 0.11233963072299957, + "step": 4563 + }, + { + "epoch": 0.7058186738836265, + "grad_norm": 5.995640754699707, + "learning_rate": 4.2484820712567305e-06, + "logits/chosen": 11.535335540771484, + "logits/rejected": 8.821805953979492, + "logps/chosen": -216.32337951660156, + "logps/rejected": -208.96617126464844, + "loss": 0.5594, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.16813993453979492, + "rewards/margins": 0.6013048887252808, + "rewards/rejected": -0.4331649839878082, + "step": 4564 + }, + { + "epoch": 0.7059733230233907, + "grad_norm": 7.763742923736572, + "learning_rate": 4.248195669607057e-06, + "logits/chosen": 9.432860374450684, + "logits/rejected": 10.351784706115723, + "logps/chosen": -287.97015380859375, + "logps/rejected": -296.40155029296875, + "loss": 0.766, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.3054032325744629, + "rewards/margins": -0.06390313804149628, + "rewards/rejected": 0.36930638551712036, + "step": 4565 + }, + { + "epoch": 0.7061279721631548, + "grad_norm": 5.013143539428711, + "learning_rate": 4.247909267957384e-06, + "logits/chosen": 8.991588592529297, + "logits/rejected": 5.45574426651001, + "logps/chosen": -192.02626037597656, + "logps/rejected": -190.56459045410156, + "loss": 0.766, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.11150780320167542, + "rewards/margins": -0.07187265902757645, + "rewards/rejected": 0.18338046967983246, + "step": 4566 + }, + { + "epoch": 0.706282621302919, + "grad_norm": 14.881312370300293, + "learning_rate": 4.2476228663077105e-06, + "logits/chosen": 13.406326293945312, + "logits/rejected": 6.856800556182861, + "logps/chosen": -361.27410888671875, + "logps/rejected": -320.4654541015625, + "loss": 0.5216, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.5811535716056824, + "rewards/margins": 0.4521726965904236, + "rewards/rejected": 0.12898090481758118, + "step": 4567 + }, + { + "epoch": 0.7064372704426831, + "grad_norm": 4.174317359924316, + "learning_rate": 4.247336464658037e-06, + "logits/chosen": 10.269937515258789, + "logits/rejected": 2.4715161323547363, + "logps/chosen": -368.53521728515625, + "logps/rejected": -279.296630859375, + "loss": 0.4159, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.7417339086532593, + "rewards/margins": 0.7443634271621704, + "rewards/rejected": -0.0026294589042663574, + "step": 4568 + }, + { + "epoch": 0.7065919195824473, + "grad_norm": 5.458750247955322, + "learning_rate": 4.247050063008363e-06, + "logits/chosen": 9.789627075195312, + "logits/rejected": 9.200784683227539, + "logps/chosen": -288.3679504394531, + "logps/rejected": -316.68804931640625, + "loss": 0.5609, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.314578115940094, + "rewards/margins": 0.6339272260665894, + "rewards/rejected": -0.319349080324173, + "step": 4569 + }, + { + "epoch": 0.7067465687222115, + "grad_norm": 4.499334335327148, + "learning_rate": 4.24676366135869e-06, + "logits/chosen": 11.671292304992676, + "logits/rejected": 11.885964393615723, + "logps/chosen": -248.450439453125, + "logps/rejected": -294.1128234863281, + "loss": 0.5969, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.2155788391828537, + "rewards/margins": 0.39140212535858154, + "rewards/rejected": -0.17582333087921143, + "step": 4570 + }, + { + "epoch": 0.7069012178619757, + "grad_norm": 6.113247394561768, + "learning_rate": 4.246477259709016e-06, + "logits/chosen": 8.994807243347168, + "logits/rejected": 7.239126205444336, + "logps/chosen": -296.262451171875, + "logps/rejected": -285.0254821777344, + "loss": 0.7345, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.2146557867527008, + "rewards/margins": -0.03775966912508011, + "rewards/rejected": 0.2524154484272003, + "step": 4571 + }, + { + "epoch": 0.7070558670017398, + "grad_norm": 4.778618335723877, + "learning_rate": 4.246190858059343e-06, + "logits/chosen": 13.52690315246582, + "logits/rejected": 9.544361114501953, + "logps/chosen": -233.23886108398438, + "logps/rejected": -230.8426971435547, + "loss": 0.5206, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.4327920079231262, + "rewards/margins": 0.5751252174377441, + "rewards/rejected": -0.1423332244157791, + "step": 4572 + }, + { + "epoch": 0.707210516141504, + "grad_norm": 4.7092719078063965, + "learning_rate": 4.245904456409669e-06, + "logits/chosen": 10.643328666687012, + "logits/rejected": 4.016560077667236, + "logps/chosen": -327.07720947265625, + "logps/rejected": -218.01927185058594, + "loss": 0.5235, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.44711053371429443, + "rewards/margins": 0.5179625749588013, + "rewards/rejected": -0.07085199654102325, + "step": 4573 + }, + { + "epoch": 0.7073651652812681, + "grad_norm": 5.633538722991943, + "learning_rate": 4.245618054759995e-06, + "logits/chosen": 5.417980670928955, + "logits/rejected": 8.159159660339355, + "logps/chosen": -250.2317352294922, + "logps/rejected": -192.50909423828125, + "loss": 0.7402, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.0016814544796943665, + "rewards/margins": -0.03897682577371597, + "rewards/rejected": 0.03729536756873131, + "step": 4574 + }, + { + "epoch": 0.7075198144210323, + "grad_norm": 6.199712753295898, + "learning_rate": 4.245331653110322e-06, + "logits/chosen": 16.160507202148438, + "logits/rejected": 17.126243591308594, + "logps/chosen": -296.52191162109375, + "logps/rejected": -364.4744567871094, + "loss": 0.8055, + "rewards/accuracies": 0.25, + "rewards/chosen": 0.1539459228515625, + "rewards/margins": -0.0059756748378276825, + "rewards/rejected": 0.15992160141468048, + "step": 4575 + }, + { + "epoch": 0.7076744635607964, + "grad_norm": 7.218504905700684, + "learning_rate": 4.245045251460649e-06, + "logits/chosen": 6.133431434631348, + "logits/rejected": 7.881924152374268, + "logps/chosen": -270.05755615234375, + "logps/rejected": -246.61935424804688, + "loss": 0.9605, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.08289370685815811, + "rewards/margins": -0.18835283815860748, + "rewards/rejected": 0.10545916855335236, + "step": 4576 + }, + { + "epoch": 0.7078291127005606, + "grad_norm": 4.109829902648926, + "learning_rate": 4.244758849810975e-06, + "logits/chosen": 8.774371147155762, + "logits/rejected": 12.36960220336914, + "logps/chosen": -149.04518127441406, + "logps/rejected": -179.76773071289062, + "loss": 0.643, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.0977703109383583, + "rewards/margins": 0.1250896006822586, + "rewards/rejected": -0.2228599190711975, + "step": 4577 + }, + { + "epoch": 0.7079837618403247, + "grad_norm": 4.731785774230957, + "learning_rate": 4.244472448161302e-06, + "logits/chosen": 9.13316535949707, + "logits/rejected": 7.804808139801025, + "logps/chosen": -190.30599975585938, + "logps/rejected": -174.75729370117188, + "loss": 0.6876, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.3220677375793457, + "rewards/margins": 0.10329968482255936, + "rewards/rejected": 0.21876806020736694, + "step": 4578 + }, + { + "epoch": 0.7081384109800889, + "grad_norm": 6.957160472869873, + "learning_rate": 4.244186046511628e-06, + "logits/chosen": 10.29033088684082, + "logits/rejected": 13.34824275970459, + "logps/chosen": -278.59149169921875, + "logps/rejected": -298.6543884277344, + "loss": 0.8957, + "rewards/accuracies": 0.25, + "rewards/chosen": -0.2270442098379135, + "rewards/margins": -0.28402191400527954, + "rewards/rejected": 0.05697770416736603, + "step": 4579 + }, + { + "epoch": 0.708293060119853, + "grad_norm": 5.32158088684082, + "learning_rate": 4.243899644861954e-06, + "logits/chosen": 6.668206691741943, + "logits/rejected": 12.51567554473877, + "logps/chosen": -356.54107666015625, + "logps/rejected": -335.79022216796875, + "loss": 0.7461, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.16742274165153503, + "rewards/margins": 0.03726506978273392, + "rewards/rejected": 0.13015766441822052, + "step": 4580 + }, + { + "epoch": 0.7084477092596172, + "grad_norm": 4.515707015991211, + "learning_rate": 4.243613243212281e-06, + "logits/chosen": 6.752403259277344, + "logits/rejected": 5.891953945159912, + "logps/chosen": -285.4992370605469, + "logps/rejected": -256.10455322265625, + "loss": 0.5511, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.48147740960121155, + "rewards/margins": 0.47435086965560913, + "rewards/rejected": 0.007126554846763611, + "step": 4581 + }, + { + "epoch": 0.7086023583993813, + "grad_norm": 4.8283514976501465, + "learning_rate": 4.243326841562608e-06, + "logits/chosen": 8.763612747192383, + "logits/rejected": 7.175118446350098, + "logps/chosen": -274.317626953125, + "logps/rejected": -355.62518310546875, + "loss": 0.5318, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.2706715166568756, + "rewards/margins": 0.47458314895629883, + "rewards/rejected": -0.2039116472005844, + "step": 4582 + }, + { + "epoch": 0.7087570075391456, + "grad_norm": 3.562120199203491, + "learning_rate": 4.243040439912934e-06, + "logits/chosen": 11.488204956054688, + "logits/rejected": -0.03138244152069092, + "logps/chosen": -390.5075988769531, + "logps/rejected": -213.0691375732422, + "loss": 0.4098, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.7288947105407715, + "rewards/margins": 1.2655043601989746, + "rewards/rejected": -0.5366096496582031, + "step": 4583 + }, + { + "epoch": 0.7089116566789098, + "grad_norm": 4.430549621582031, + "learning_rate": 4.242754038263261e-06, + "logits/chosen": 7.564137935638428, + "logits/rejected": 4.704463005065918, + "logps/chosen": -288.6220703125, + "logps/rejected": -213.3743896484375, + "loss": 0.5586, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.33583182096481323, + "rewards/margins": 0.37405115365982056, + "rewards/rejected": -0.038219302892684937, + "step": 4584 + }, + { + "epoch": 0.7090663058186739, + "grad_norm": 5.988737106323242, + "learning_rate": 4.242467636613588e-06, + "logits/chosen": 10.479631423950195, + "logits/rejected": 7.0845818519592285, + "logps/chosen": -262.98291015625, + "logps/rejected": -323.0245361328125, + "loss": 0.4956, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.3370293974876404, + "rewards/margins": 0.536069393157959, + "rewards/rejected": -0.19904004037380219, + "step": 4585 + }, + { + "epoch": 0.7092209549584381, + "grad_norm": 4.775357723236084, + "learning_rate": 4.2421812349639135e-06, + "logits/chosen": 8.966789245605469, + "logits/rejected": 6.047226905822754, + "logps/chosen": -346.08551025390625, + "logps/rejected": -287.6668395996094, + "loss": 0.5307, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.42627131938934326, + "rewards/margins": 0.3979650139808655, + "rewards/rejected": 0.02830635756254196, + "step": 4586 + }, + { + "epoch": 0.7093756040982022, + "grad_norm": 5.712526321411133, + "learning_rate": 4.24189483331424e-06, + "logits/chosen": 8.538501739501953, + "logits/rejected": 7.301070690155029, + "logps/chosen": -293.10028076171875, + "logps/rejected": -237.252197265625, + "loss": 0.6693, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.3256973326206207, + "rewards/margins": 0.23136553168296814, + "rewards/rejected": 0.0943317785859108, + "step": 4587 + }, + { + "epoch": 0.7095302532379664, + "grad_norm": 7.103586673736572, + "learning_rate": 4.241608431664567e-06, + "logits/chosen": 13.467236518859863, + "logits/rejected": 11.850404739379883, + "logps/chosen": -375.1611633300781, + "logps/rejected": -340.20477294921875, + "loss": 0.7296, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.34818315505981445, + "rewards/margins": -0.039399243891239166, + "rewards/rejected": 0.3875824213027954, + "step": 4588 + }, + { + "epoch": 0.7096849023777305, + "grad_norm": 5.8927903175354, + "learning_rate": 4.241322030014893e-06, + "logits/chosen": 12.597597122192383, + "logits/rejected": 10.961711883544922, + "logps/chosen": -469.7466125488281, + "logps/rejected": -342.7825927734375, + "loss": 0.5975, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.6095735430717468, + "rewards/margins": 0.23591360449790955, + "rewards/rejected": 0.3736599385738373, + "step": 4589 + }, + { + "epoch": 0.7098395515174947, + "grad_norm": 5.878236770629883, + "learning_rate": 4.24103562836522e-06, + "logits/chosen": 14.659733772277832, + "logits/rejected": 9.472429275512695, + "logps/chosen": -356.6368408203125, + "logps/rejected": -354.6163635253906, + "loss": 0.6378, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.46231406927108765, + "rewards/margins": 0.3263733983039856, + "rewards/rejected": 0.13594065606594086, + "step": 4590 + }, + { + "epoch": 0.7099942006572588, + "grad_norm": 4.429261207580566, + "learning_rate": 4.240749226715547e-06, + "logits/chosen": 9.442913055419922, + "logits/rejected": 13.063085556030273, + "logps/chosen": -218.58944702148438, + "logps/rejected": -260.20867919921875, + "loss": 0.5776, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.28602471947669983, + "rewards/margins": 0.3126041293144226, + "rewards/rejected": -0.026579387485980988, + "step": 4591 + }, + { + "epoch": 0.710148849797023, + "grad_norm": 5.877252578735352, + "learning_rate": 4.2404628250658725e-06, + "logits/chosen": 8.935186386108398, + "logits/rejected": 5.961345672607422, + "logps/chosen": -301.419189453125, + "logps/rejected": -236.7633056640625, + "loss": 0.717, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.08373585343360901, + "rewards/margins": 0.06022048741579056, + "rewards/rejected": 0.02351538836956024, + "step": 4592 + }, + { + "epoch": 0.7103034989367871, + "grad_norm": 6.092513084411621, + "learning_rate": 4.240176423416199e-06, + "logits/chosen": 8.973538398742676, + "logits/rejected": 10.432695388793945, + "logps/chosen": -208.08888244628906, + "logps/rejected": -238.11029052734375, + "loss": 0.7768, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.1211184561252594, + "rewards/margins": -0.09161757677793503, + "rewards/rejected": 0.21273604035377502, + "step": 4593 + }, + { + "epoch": 0.7104581480765513, + "grad_norm": 3.9608352184295654, + "learning_rate": 4.239890021766526e-06, + "logits/chosen": 7.756915092468262, + "logits/rejected": 7.772074222564697, + "logps/chosen": -299.4737243652344, + "logps/rejected": -256.7357177734375, + "loss": 0.5155, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.5204596519470215, + "rewards/margins": 0.48946577310562134, + "rewards/rejected": 0.030993878841400146, + "step": 4594 + }, + { + "epoch": 0.7106127972163155, + "grad_norm": 5.682147026062012, + "learning_rate": 4.2396036201168525e-06, + "logits/chosen": 3.9371743202209473, + "logits/rejected": 4.048566818237305, + "logps/chosen": -167.6571807861328, + "logps/rejected": -160.6783905029297, + "loss": 0.7465, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.12488966435194016, + "rewards/margins": -0.04817222058773041, + "rewards/rejected": 0.17306189239025116, + "step": 4595 + }, + { + "epoch": 0.7107674463560797, + "grad_norm": 3.626577138900757, + "learning_rate": 4.239317218467179e-06, + "logits/chosen": 6.447562217712402, + "logits/rejected": 5.775452613830566, + "logps/chosen": -174.85342407226562, + "logps/rejected": -174.7040252685547, + "loss": 0.5041, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.38952410221099854, + "rewards/margins": 0.6370323896408081, + "rewards/rejected": -0.24750833213329315, + "step": 4596 + }, + { + "epoch": 0.7109220954958438, + "grad_norm": 7.288050651550293, + "learning_rate": 4.239030816817506e-06, + "logits/chosen": -1.1223838329315186, + "logits/rejected": 5.900338649749756, + "logps/chosen": -234.98805236816406, + "logps/rejected": -334.6891784667969, + "loss": 0.9103, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.2509725093841553, + "rewards/margins": -0.13586796820163727, + "rewards/rejected": -0.1151045560836792, + "step": 4597 + }, + { + "epoch": 0.711076744635608, + "grad_norm": 7.594099521636963, + "learning_rate": 4.2387444151678316e-06, + "logits/chosen": 9.304906845092773, + "logits/rejected": 10.392935752868652, + "logps/chosen": -296.36346435546875, + "logps/rejected": -285.77001953125, + "loss": 0.8777, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.037966907024383545, + "rewards/margins": -0.24119937419891357, + "rewards/rejected": 0.20323246717453003, + "step": 4598 + }, + { + "epoch": 0.7112313937753721, + "grad_norm": 7.77683687210083, + "learning_rate": 4.238458013518158e-06, + "logits/chosen": 14.195968627929688, + "logits/rejected": 9.00336742401123, + "logps/chosen": -478.3730163574219, + "logps/rejected": -306.6780700683594, + "loss": 0.8273, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.003941148519515991, + "rewards/margins": -0.007843591272830963, + "rewards/rejected": 0.003902435302734375, + "step": 4599 + }, + { + "epoch": 0.7113860429151363, + "grad_norm": 5.745849609375, + "learning_rate": 4.238171611868485e-06, + "logits/chosen": 13.648274421691895, + "logits/rejected": 9.800228118896484, + "logps/chosen": -389.63946533203125, + "logps/rejected": -339.66668701171875, + "loss": 0.5396, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.3609994947910309, + "rewards/margins": 0.46152400970458984, + "rewards/rejected": -0.10052451491355896, + "step": 4600 + }, + { + "epoch": 0.7115406920549004, + "grad_norm": 4.448723793029785, + "learning_rate": 4.2378852102188115e-06, + "logits/chosen": 8.844828605651855, + "logits/rejected": 10.707019805908203, + "logps/chosen": -193.85595703125, + "logps/rejected": -253.77130126953125, + "loss": 0.6463, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.14980114996433258, + "rewards/margins": 0.1919388622045517, + "rewards/rejected": -0.042137712240219116, + "step": 4601 + }, + { + "epoch": 0.7116953411946646, + "grad_norm": 4.077887058258057, + "learning_rate": 4.237598808569137e-06, + "logits/chosen": 10.013148307800293, + "logits/rejected": 9.555876731872559, + "logps/chosen": -211.83291625976562, + "logps/rejected": -139.9344024658203, + "loss": 0.6275, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.06236010789871216, + "rewards/margins": 0.22210922837257385, + "rewards/rejected": -0.1597491353750229, + "step": 4602 + }, + { + "epoch": 0.7118499903344288, + "grad_norm": 4.701518535614014, + "learning_rate": 4.237312406919464e-06, + "logits/chosen": 12.729211807250977, + "logits/rejected": 7.968535900115967, + "logps/chosen": -407.1088562011719, + "logps/rejected": -274.4529113769531, + "loss": 0.4895, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.4684843122959137, + "rewards/margins": 0.5429566502571106, + "rewards/rejected": -0.0744723379611969, + "step": 4603 + }, + { + "epoch": 0.7120046394741929, + "grad_norm": 4.916139125823975, + "learning_rate": 4.237026005269791e-06, + "logits/chosen": 13.155076026916504, + "logits/rejected": 16.855268478393555, + "logps/chosen": -256.2487487792969, + "logps/rejected": -299.71038818359375, + "loss": 0.5594, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.4148678779602051, + "rewards/margins": 0.35789576172828674, + "rewards/rejected": 0.056972116231918335, + "step": 4604 + }, + { + "epoch": 0.712159288613957, + "grad_norm": 8.865031242370605, + "learning_rate": 4.236739603620117e-06, + "logits/chosen": 12.798454284667969, + "logits/rejected": 12.440105438232422, + "logps/chosen": -420.20477294921875, + "logps/rejected": -419.4029541015625, + "loss": 0.7462, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.11067429184913635, + "rewards/margins": 0.044924937188625336, + "rewards/rejected": 0.06574936211109161, + "step": 4605 + }, + { + "epoch": 0.7123139377537212, + "grad_norm": 4.790214538574219, + "learning_rate": 4.236453201970444e-06, + "logits/chosen": 12.263076782226562, + "logits/rejected": 10.843096733093262, + "logps/chosen": -198.3968963623047, + "logps/rejected": -203.72879028320312, + "loss": 0.6445, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.3282395601272583, + "rewards/margins": 0.2692464292049408, + "rewards/rejected": 0.058993108570575714, + "step": 4606 + }, + { + "epoch": 0.7124685868934854, + "grad_norm": 4.335485935211182, + "learning_rate": 4.23616680032077e-06, + "logits/chosen": 11.603706359863281, + "logits/rejected": 3.3118488788604736, + "logps/chosen": -300.590087890625, + "logps/rejected": -239.49508666992188, + "loss": 0.5544, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.313501238822937, + "rewards/margins": 0.3790134787559509, + "rewards/rejected": -0.06551218032836914, + "step": 4607 + }, + { + "epoch": 0.7126232360332496, + "grad_norm": 6.42044734954834, + "learning_rate": 4.235880398671096e-06, + "logits/chosen": 10.737825393676758, + "logits/rejected": 4.036214828491211, + "logps/chosen": -308.0980529785156, + "logps/rejected": -223.8427276611328, + "loss": 0.7902, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.005572028458118439, + "rewards/margins": -0.05523768067359924, + "rewards/rejected": 0.04966564476490021, + "step": 4608 + }, + { + "epoch": 0.7127778851730138, + "grad_norm": 5.577699184417725, + "learning_rate": 4.235593997021423e-06, + "logits/chosen": 8.560925483703613, + "logits/rejected": 10.230661392211914, + "logps/chosen": -253.0076446533203, + "logps/rejected": -341.5283508300781, + "loss": 0.6643, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.3910121023654938, + "rewards/margins": 0.1334420144557953, + "rewards/rejected": 0.2575700879096985, + "step": 4609 + }, + { + "epoch": 0.7129325343127779, + "grad_norm": 3.967543840408325, + "learning_rate": 4.23530759537175e-06, + "logits/chosen": 10.803560256958008, + "logits/rejected": 7.799156188964844, + "logps/chosen": -146.0970001220703, + "logps/rejected": -107.52326202392578, + "loss": 0.6864, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.05748012661933899, + "rewards/margins": 0.10552510619163513, + "rewards/rejected": -0.16300520300865173, + "step": 4610 + }, + { + "epoch": 0.7130871834525421, + "grad_norm": 9.559494972229004, + "learning_rate": 4.235021193722076e-06, + "logits/chosen": 9.288444519042969, + "logits/rejected": 4.61497163772583, + "logps/chosen": -457.3798828125, + "logps/rejected": -427.8670349121094, + "loss": 0.7114, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.21303023397922516, + "rewards/margins": 0.1577335000038147, + "rewards/rejected": 0.05529669672250748, + "step": 4611 + }, + { + "epoch": 0.7132418325923062, + "grad_norm": 6.40587854385376, + "learning_rate": 4.234734792072402e-06, + "logits/chosen": 4.82703971862793, + "logits/rejected": 9.51150131225586, + "logps/chosen": -222.35855102539062, + "logps/rejected": -232.75439453125, + "loss": 0.7053, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.06283684074878693, + "rewards/margins": -0.000910535454750061, + "rewards/rejected": -0.06192629784345627, + "step": 4612 + }, + { + "epoch": 0.7133964817320704, + "grad_norm": 4.430289268493652, + "learning_rate": 4.234448390422729e-06, + "logits/chosen": 13.957395553588867, + "logits/rejected": 7.47810173034668, + "logps/chosen": -318.04803466796875, + "logps/rejected": -252.24575805664062, + "loss": 0.6093, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.3107309341430664, + "rewards/margins": 0.2963135838508606, + "rewards/rejected": 0.014417361468076706, + "step": 4613 + }, + { + "epoch": 0.7135511308718345, + "grad_norm": 7.0096116065979, + "learning_rate": 4.2341619887730554e-06, + "logits/chosen": 7.8340630531311035, + "logits/rejected": 5.389813423156738, + "logps/chosen": -309.28106689453125, + "logps/rejected": -244.09130859375, + "loss": 0.7142, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.26365795731544495, + "rewards/margins": 0.06978225708007812, + "rewards/rejected": 0.19387570023536682, + "step": 4614 + }, + { + "epoch": 0.7137057800115987, + "grad_norm": 4.990397930145264, + "learning_rate": 4.233875587123382e-06, + "logits/chosen": 5.352141857147217, + "logits/rejected": 7.53994083404541, + "logps/chosen": -228.5999755859375, + "logps/rejected": -245.09378051757812, + "loss": 0.6676, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.11973323673009872, + "rewards/margins": 0.23036159574985504, + "rewards/rejected": -0.3500947952270508, + "step": 4615 + }, + { + "epoch": 0.7138604291513628, + "grad_norm": 4.495199203491211, + "learning_rate": 4.233589185473709e-06, + "logits/chosen": 7.831066131591797, + "logits/rejected": 7.557775020599365, + "logps/chosen": -197.90415954589844, + "logps/rejected": -194.01568603515625, + "loss": 0.7217, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.16256499290466309, + "rewards/margins": 0.013051211833953857, + "rewards/rejected": 0.14951378107070923, + "step": 4616 + }, + { + "epoch": 0.714015078291127, + "grad_norm": 9.58165168762207, + "learning_rate": 4.233302783824035e-06, + "logits/chosen": 6.459289073944092, + "logits/rejected": 10.197653770446777, + "logps/chosen": -232.92166137695312, + "logps/rejected": -413.5774841308594, + "loss": 0.8116, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.212818905711174, + "rewards/margins": -0.16318702697753906, + "rewards/rejected": -0.04963187873363495, + "step": 4617 + }, + { + "epoch": 0.7141697274308911, + "grad_norm": 3.74113130569458, + "learning_rate": 4.233016382174361e-06, + "logits/chosen": 13.833148956298828, + "logits/rejected": 13.798471450805664, + "logps/chosen": -209.2418212890625, + "logps/rejected": -219.6009521484375, + "loss": 0.4913, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.28905540704727173, + "rewards/margins": 0.5732909440994263, + "rewards/rejected": -0.28423553705215454, + "step": 4618 + }, + { + "epoch": 0.7143243765706553, + "grad_norm": 6.600011825561523, + "learning_rate": 4.232729980524688e-06, + "logits/chosen": 10.8261079788208, + "logits/rejected": 3.906608819961548, + "logps/chosen": -304.7872314453125, + "logps/rejected": -288.09423828125, + "loss": 0.5966, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.19084981083869934, + "rewards/margins": 0.286954790353775, + "rewards/rejected": -0.09610500931739807, + "step": 4619 + }, + { + "epoch": 0.7144790257104194, + "grad_norm": 4.34542179107666, + "learning_rate": 4.2324435788750145e-06, + "logits/chosen": 9.040791511535645, + "logits/rejected": 4.181394577026367, + "logps/chosen": -263.59197998046875, + "logps/rejected": -264.7336120605469, + "loss": 0.5591, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.07334579527378082, + "rewards/margins": 0.3570653796195984, + "rewards/rejected": -0.28371959924697876, + "step": 4620 + }, + { + "epoch": 0.7146336748501837, + "grad_norm": 4.266055583953857, + "learning_rate": 4.232157177225341e-06, + "logits/chosen": 5.790135383605957, + "logits/rejected": 9.477067947387695, + "logps/chosen": -190.33055114746094, + "logps/rejected": -220.47218322753906, + "loss": 0.6193, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.28691577911376953, + "rewards/margins": 0.18353229761123657, + "rewards/rejected": 0.10338349640369415, + "step": 4621 + }, + { + "epoch": 0.7147883239899478, + "grad_norm": 5.435953140258789, + "learning_rate": 4.231870775575668e-06, + "logits/chosen": 8.715919494628906, + "logits/rejected": 2.5127782821655273, + "logps/chosen": -201.9743194580078, + "logps/rejected": -159.32742309570312, + "loss": 0.6293, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.08574648201465607, + "rewards/margins": 0.1784057766199112, + "rewards/rejected": -0.09265930950641632, + "step": 4622 + }, + { + "epoch": 0.714942973129712, + "grad_norm": 4.416068077087402, + "learning_rate": 4.2315843739259944e-06, + "logits/chosen": 12.662212371826172, + "logits/rejected": 6.926520824432373, + "logps/chosen": -344.5833435058594, + "logps/rejected": -218.7684783935547, + "loss": 0.5106, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.5622336864471436, + "rewards/margins": 0.43748739361763, + "rewards/rejected": 0.12474632263183594, + "step": 4623 + }, + { + "epoch": 0.7150976222694762, + "grad_norm": 5.635395050048828, + "learning_rate": 4.231297972276321e-06, + "logits/chosen": 12.82259464263916, + "logits/rejected": 8.568951606750488, + "logps/chosen": -359.47039794921875, + "logps/rejected": -295.04132080078125, + "loss": 0.5342, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.5028526782989502, + "rewards/margins": 0.5173433423042297, + "rewards/rejected": -0.014490664005279541, + "step": 4624 + }, + { + "epoch": 0.7152522714092403, + "grad_norm": 4.45168399810791, + "learning_rate": 4.231011570626647e-06, + "logits/chosen": 12.053682327270508, + "logits/rejected": 10.460845947265625, + "logps/chosen": -231.22801208496094, + "logps/rejected": -206.77816772460938, + "loss": 0.6051, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.5290884971618652, + "rewards/margins": 0.293659508228302, + "rewards/rejected": 0.23542898893356323, + "step": 4625 + }, + { + "epoch": 0.7154069205490045, + "grad_norm": 7.048733234405518, + "learning_rate": 4.2307251689769735e-06, + "logits/chosen": 9.767511367797852, + "logits/rejected": 8.867424011230469, + "logps/chosen": -269.475341796875, + "logps/rejected": -320.96697998046875, + "loss": 0.7632, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.14557959139347076, + "rewards/margins": -0.036857739090919495, + "rewards/rejected": 0.18243733048439026, + "step": 4626 + }, + { + "epoch": 0.7155615696887686, + "grad_norm": 5.3465399742126465, + "learning_rate": 4.2304387673273e-06, + "logits/chosen": 14.575580596923828, + "logits/rejected": 9.753442764282227, + "logps/chosen": -366.173583984375, + "logps/rejected": -227.4510498046875, + "loss": 0.7096, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.48749876022338867, + "rewards/margins": 0.12785844504833221, + "rewards/rejected": 0.35964030027389526, + "step": 4627 + }, + { + "epoch": 0.7157162188285328, + "grad_norm": 5.9558892250061035, + "learning_rate": 4.230152365677627e-06, + "logits/chosen": 11.542832374572754, + "logits/rejected": 5.083554267883301, + "logps/chosen": -437.64544677734375, + "logps/rejected": -360.44879150390625, + "loss": 0.5831, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.230820432305336, + "rewards/margins": 0.2564258575439453, + "rewards/rejected": -0.025605440139770508, + "step": 4628 + }, + { + "epoch": 0.7158708679682969, + "grad_norm": 5.45145320892334, + "learning_rate": 4.2298659640279535e-06, + "logits/chosen": 12.42699146270752, + "logits/rejected": 16.88634490966797, + "logps/chosen": -278.68701171875, + "logps/rejected": -246.53636169433594, + "loss": 0.7133, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.11747746914625168, + "rewards/margins": 0.06037537008523941, + "rewards/rejected": 0.057102106511592865, + "step": 4629 + }, + { + "epoch": 0.7160255171080611, + "grad_norm": 18.71869468688965, + "learning_rate": 4.22957956237828e-06, + "logits/chosen": 4.967696666717529, + "logits/rejected": 3.1390843391418457, + "logps/chosen": -243.71731567382812, + "logps/rejected": -166.47952270507812, + "loss": 0.7619, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.12153066694736481, + "rewards/margins": 0.003656625747680664, + "rewards/rejected": 0.11787405610084534, + "step": 4630 + }, + { + "epoch": 0.7161801662478252, + "grad_norm": 6.964846134185791, + "learning_rate": 4.229293160728607e-06, + "logits/chosen": 10.464961051940918, + "logits/rejected": 10.341142654418945, + "logps/chosen": -324.0904541015625, + "logps/rejected": -323.0029296875, + "loss": 0.5813, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.30943137407302856, + "rewards/margins": 0.511340856552124, + "rewards/rejected": -0.20190943777561188, + "step": 4631 + }, + { + "epoch": 0.7163348153875894, + "grad_norm": 14.7042818069458, + "learning_rate": 4.229006759078933e-06, + "logits/chosen": 16.729562759399414, + "logits/rejected": 10.402524948120117, + "logps/chosen": -522.3245849609375, + "logps/rejected": -402.0496520996094, + "loss": 0.5883, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.5634576082229614, + "rewards/margins": 0.3435225486755371, + "rewards/rejected": 0.21993504464626312, + "step": 4632 + }, + { + "epoch": 0.7164894645273535, + "grad_norm": 4.383731842041016, + "learning_rate": 4.228720357429259e-06, + "logits/chosen": 14.411029815673828, + "logits/rejected": 12.076712608337402, + "logps/chosen": -314.4481506347656, + "logps/rejected": -322.5937805175781, + "loss": 0.5238, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.3533988893032074, + "rewards/margins": 0.6665071845054626, + "rewards/rejected": -0.31310826539993286, + "step": 4633 + }, + { + "epoch": 0.7166441136671178, + "grad_norm": 3.7176642417907715, + "learning_rate": 4.228433955779586e-06, + "logits/chosen": 7.941164016723633, + "logits/rejected": 4.0201849937438965, + "logps/chosen": -217.4586944580078, + "logps/rejected": -147.8682861328125, + "loss": 0.5419, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.25656378269195557, + "rewards/margins": 0.4995496869087219, + "rewards/rejected": -0.24298591911792755, + "step": 4634 + }, + { + "epoch": 0.7167987628068819, + "grad_norm": 4.803718566894531, + "learning_rate": 4.2281475541299126e-06, + "logits/chosen": 12.475752830505371, + "logits/rejected": 10.77780532836914, + "logps/chosen": -246.08956909179688, + "logps/rejected": -254.8748321533203, + "loss": 0.7456, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.4601503908634186, + "rewards/margins": 0.028110364452004433, + "rewards/rejected": 0.4320400357246399, + "step": 4635 + }, + { + "epoch": 0.7169534119466461, + "grad_norm": 5.811635494232178, + "learning_rate": 4.227861152480238e-06, + "logits/chosen": 10.892448425292969, + "logits/rejected": 9.648085594177246, + "logps/chosen": -397.5242004394531, + "logps/rejected": -386.5086669921875, + "loss": 0.5113, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.4311075210571289, + "rewards/margins": 0.46603813767433167, + "rewards/rejected": -0.03493061661720276, + "step": 4636 + }, + { + "epoch": 0.7171080610864102, + "grad_norm": 4.786439418792725, + "learning_rate": 4.227574750830565e-06, + "logits/chosen": 8.258153915405273, + "logits/rejected": 5.33173942565918, + "logps/chosen": -197.9464874267578, + "logps/rejected": -166.2730712890625, + "loss": 0.6797, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.03668379783630371, + "rewards/margins": 0.18249371647834778, + "rewards/rejected": -0.14580993354320526, + "step": 4637 + }, + { + "epoch": 0.7172627102261744, + "grad_norm": 7.147068500518799, + "learning_rate": 4.227288349180892e-06, + "logits/chosen": 5.039703845977783, + "logits/rejected": 7.041079044342041, + "logps/chosen": -270.3241882324219, + "logps/rejected": -277.89080810546875, + "loss": 0.844, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.49101442098617554, + "rewards/margins": -0.22120852768421173, + "rewards/rejected": -0.269805908203125, + "step": 4638 + }, + { + "epoch": 0.7174173593659385, + "grad_norm": 5.5372185707092285, + "learning_rate": 4.227001947531218e-06, + "logits/chosen": 9.179319381713867, + "logits/rejected": 7.193024635314941, + "logps/chosen": -204.68038940429688, + "logps/rejected": -262.3078918457031, + "loss": 0.6036, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.05145713686943054, + "rewards/margins": 0.2607852518558502, + "rewards/rejected": -0.31224238872528076, + "step": 4639 + }, + { + "epoch": 0.7175720085057027, + "grad_norm": 5.4153828620910645, + "learning_rate": 4.226715545881544e-06, + "logits/chosen": 9.343610763549805, + "logits/rejected": 6.311092376708984, + "logps/chosen": -247.0800018310547, + "logps/rejected": -206.5257110595703, + "loss": 0.5378, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.29292821884155273, + "rewards/margins": 0.46754083037376404, + "rewards/rejected": -0.1746126115322113, + "step": 4640 + }, + { + "epoch": 0.7177266576454668, + "grad_norm": 7.227595329284668, + "learning_rate": 4.226429144231871e-06, + "logits/chosen": 10.881799697875977, + "logits/rejected": 3.8781089782714844, + "logps/chosen": -375.35711669921875, + "logps/rejected": -315.3063659667969, + "loss": 0.7193, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.07003575563430786, + "rewards/margins": 0.26165515184402466, + "rewards/rejected": -0.1916193962097168, + "step": 4641 + }, + { + "epoch": 0.717881306785231, + "grad_norm": 6.14045524597168, + "learning_rate": 4.226142742582197e-06, + "logits/chosen": 10.574788093566895, + "logits/rejected": 7.586615562438965, + "logps/chosen": -318.93682861328125, + "logps/rejected": -244.3237762451172, + "loss": 0.6617, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.20660200715065002, + "rewards/margins": 0.17695912718772888, + "rewards/rejected": 0.02964286133646965, + "step": 4642 + }, + { + "epoch": 0.7180359559249951, + "grad_norm": 6.854588031768799, + "learning_rate": 4.225856340932524e-06, + "logits/chosen": 8.998845100402832, + "logits/rejected": 3.6664860248565674, + "logps/chosen": -356.71990966796875, + "logps/rejected": -257.1462707519531, + "loss": 0.5208, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.35594692826271057, + "rewards/margins": 0.6809406876564026, + "rewards/rejected": -0.32499369978904724, + "step": 4643 + }, + { + "epoch": 0.7181906050647593, + "grad_norm": 7.132308483123779, + "learning_rate": 4.225569939282851e-06, + "logits/chosen": 14.064380645751953, + "logits/rejected": 11.397700309753418, + "logps/chosen": -268.5389709472656, + "logps/rejected": -252.5906982421875, + "loss": 0.7136, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.20234090089797974, + "rewards/margins": 0.04124408960342407, + "rewards/rejected": 0.16109681129455566, + "step": 4644 + }, + { + "epoch": 0.7183452542045234, + "grad_norm": 6.435413837432861, + "learning_rate": 4.2252835376331765e-06, + "logits/chosen": 12.554622650146484, + "logits/rejected": 10.075553894042969, + "logps/chosen": -355.69091796875, + "logps/rejected": -313.96343994140625, + "loss": 0.8989, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.1648985594511032, + "rewards/margins": -0.282002329826355, + "rewards/rejected": 0.446900874376297, + "step": 4645 + }, + { + "epoch": 0.7184999033442876, + "grad_norm": 8.379067420959473, + "learning_rate": 4.224997135983503e-06, + "logits/chosen": 10.873297691345215, + "logits/rejected": 12.469922065734863, + "logps/chosen": -329.8974304199219, + "logps/rejected": -233.2132568359375, + "loss": 0.8804, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.07426971942186356, + "rewards/margins": -0.17683526873588562, + "rewards/rejected": 0.2511049509048462, + "step": 4646 + }, + { + "epoch": 0.7186545524840519, + "grad_norm": 3.860957384109497, + "learning_rate": 4.22471073433383e-06, + "logits/chosen": 10.99325942993164, + "logits/rejected": 9.064567565917969, + "logps/chosen": -160.0193634033203, + "logps/rejected": -152.13067626953125, + "loss": 0.7023, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.2436378449201584, + "rewards/margins": 0.03730266913771629, + "rewards/rejected": 0.2063351571559906, + "step": 4647 + }, + { + "epoch": 0.718809201623816, + "grad_norm": 4.60730504989624, + "learning_rate": 4.2244243326841565e-06, + "logits/chosen": 8.819602012634277, + "logits/rejected": 7.903059959411621, + "logps/chosen": -260.6100158691406, + "logps/rejected": -294.0946044921875, + "loss": 0.5749, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.36508047580718994, + "rewards/margins": 0.49078208208084106, + "rewards/rejected": -0.12570160627365112, + "step": 4648 + }, + { + "epoch": 0.7189638507635802, + "grad_norm": 4.122908115386963, + "learning_rate": 4.224137931034483e-06, + "logits/chosen": 13.057272911071777, + "logits/rejected": 9.321096420288086, + "logps/chosen": -256.8499450683594, + "logps/rejected": -233.92526245117188, + "loss": 0.4515, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.49239271879196167, + "rewards/margins": 0.6381007432937622, + "rewards/rejected": -0.14570806920528412, + "step": 4649 + }, + { + "epoch": 0.7191184999033443, + "grad_norm": 5.219837188720703, + "learning_rate": 4.22385152938481e-06, + "logits/chosen": 9.81836986541748, + "logits/rejected": 12.089348793029785, + "logps/chosen": -241.3326416015625, + "logps/rejected": -275.6637268066406, + "loss": 0.8797, + "rewards/accuracies": 0.25, + "rewards/chosen": 0.24137966334819794, + "rewards/margins": -0.23598501086235046, + "rewards/rejected": 0.4773646593093872, + "step": 4650 + }, + { + "epoch": 0.7192731490431085, + "grad_norm": 4.851894855499268, + "learning_rate": 4.2235651277351356e-06, + "logits/chosen": 3.2410387992858887, + "logits/rejected": 6.405688762664795, + "logps/chosen": -320.13201904296875, + "logps/rejected": -431.6505432128906, + "loss": 0.5887, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.3114035725593567, + "rewards/margins": 0.4934540092945099, + "rewards/rejected": -0.182050421833992, + "step": 4651 + }, + { + "epoch": 0.7194277981828726, + "grad_norm": 5.242563724517822, + "learning_rate": 4.223278726085462e-06, + "logits/chosen": 9.595367431640625, + "logits/rejected": 4.99884557723999, + "logps/chosen": -393.42254638671875, + "logps/rejected": -262.14764404296875, + "loss": 0.6548, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.34950342774391174, + "rewards/margins": 0.2511463761329651, + "rewards/rejected": 0.09835705906152725, + "step": 4652 + }, + { + "epoch": 0.7195824473226368, + "grad_norm": 5.531010627746582, + "learning_rate": 4.222992324435789e-06, + "logits/chosen": 9.550742149353027, + "logits/rejected": 9.265010833740234, + "logps/chosen": -223.27291870117188, + "logps/rejected": -259.66156005859375, + "loss": 0.5514, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.4141909182071686, + "rewards/margins": 0.48686161637306213, + "rewards/rejected": -0.07267066836357117, + "step": 4653 + }, + { + "epoch": 0.7197370964624009, + "grad_norm": 3.8310351371765137, + "learning_rate": 4.2227059227861155e-06, + "logits/chosen": 14.591660499572754, + "logits/rejected": 12.950172424316406, + "logps/chosen": -251.24322509765625, + "logps/rejected": -225.15335083007812, + "loss": 0.5734, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.20353688299655914, + "rewards/margins": 0.3828543424606323, + "rewards/rejected": -0.179317444562912, + "step": 4654 + }, + { + "epoch": 0.7198917456021651, + "grad_norm": 4.616705894470215, + "learning_rate": 4.222419521136442e-06, + "logits/chosen": 11.38825798034668, + "logits/rejected": 6.519626140594482, + "logps/chosen": -178.34844970703125, + "logps/rejected": -152.76821899414062, + "loss": 0.5532, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.04259766265749931, + "rewards/margins": 0.45947548747062683, + "rewards/rejected": -0.416877806186676, + "step": 4655 + }, + { + "epoch": 0.7200463947419292, + "grad_norm": 4.2925028800964355, + "learning_rate": 4.222133119486769e-06, + "logits/chosen": 2.5514652729034424, + "logits/rejected": 6.725386619567871, + "logps/chosen": -186.39608764648438, + "logps/rejected": -179.72972106933594, + "loss": 0.6288, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.32997971773147583, + "rewards/margins": 0.1427549123764038, + "rewards/rejected": 0.18722479045391083, + "step": 4656 + }, + { + "epoch": 0.7202010438816934, + "grad_norm": 9.864386558532715, + "learning_rate": 4.2218467178370955e-06, + "logits/chosen": 12.211067199707031, + "logits/rejected": 6.216061115264893, + "logps/chosen": -313.1650390625, + "logps/rejected": -233.2894744873047, + "loss": 0.561, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.7663730382919312, + "rewards/margins": 0.3916390538215637, + "rewards/rejected": 0.37473392486572266, + "step": 4657 + }, + { + "epoch": 0.7203556930214575, + "grad_norm": 6.886110305786133, + "learning_rate": 4.221560316187421e-06, + "logits/chosen": 10.027338027954102, + "logits/rejected": 7.370658874511719, + "logps/chosen": -346.3270263671875, + "logps/rejected": -281.984619140625, + "loss": 0.8335, + "rewards/accuracies": 0.125, + "rewards/chosen": 0.22282001376152039, + "rewards/margins": 0.032382868230342865, + "rewards/rejected": 0.19043713808059692, + "step": 4658 + }, + { + "epoch": 0.7205103421612217, + "grad_norm": 7.485498905181885, + "learning_rate": 4.221273914537748e-06, + "logits/chosen": 8.284996032714844, + "logits/rejected": 10.953027725219727, + "logps/chosen": -166.54698181152344, + "logps/rejected": -210.5655975341797, + "loss": 0.8508, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.060010701417922974, + "rewards/margins": -0.2089838683605194, + "rewards/rejected": 0.2689945697784424, + "step": 4659 + }, + { + "epoch": 0.7206649913009859, + "grad_norm": 3.434138298034668, + "learning_rate": 4.220987512888075e-06, + "logits/chosen": 13.868374824523926, + "logits/rejected": 5.818326473236084, + "logps/chosen": -275.8595275878906, + "logps/rejected": -181.86875915527344, + "loss": 0.4357, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.7116816639900208, + "rewards/margins": 0.7430300116539001, + "rewards/rejected": -0.031348370015621185, + "step": 4660 + }, + { + "epoch": 0.7208196404407501, + "grad_norm": 6.918261528015137, + "learning_rate": 4.220701111238401e-06, + "logits/chosen": 10.387393951416016, + "logits/rejected": 7.188056468963623, + "logps/chosen": -411.4892578125, + "logps/rejected": -403.06341552734375, + "loss": 0.6397, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.44164544343948364, + "rewards/margins": 0.20920506119728088, + "rewards/rejected": 0.23244038224220276, + "step": 4661 + }, + { + "epoch": 0.7209742895805142, + "grad_norm": 5.1262054443359375, + "learning_rate": 4.220414709588728e-06, + "logits/chosen": 10.633928298950195, + "logits/rejected": 10.21422004699707, + "logps/chosen": -194.33642578125, + "logps/rejected": -295.7193908691406, + "loss": 0.5587, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.28601887822151184, + "rewards/margins": 0.4282449781894684, + "rewards/rejected": -0.14222612977027893, + "step": 4662 + }, + { + "epoch": 0.7211289387202784, + "grad_norm": 5.970149993896484, + "learning_rate": 4.2201283079390545e-06, + "logits/chosen": 7.638833522796631, + "logits/rejected": 7.223271369934082, + "logps/chosen": -344.19964599609375, + "logps/rejected": -227.15353393554688, + "loss": 0.7451, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.4394124150276184, + "rewards/margins": 0.0007085800170898438, + "rewards/rejected": 0.43870383501052856, + "step": 4663 + }, + { + "epoch": 0.7212835878600425, + "grad_norm": 7.072237491607666, + "learning_rate": 4.219841906289381e-06, + "logits/chosen": 12.829007148742676, + "logits/rejected": 7.617791652679443, + "logps/chosen": -472.68359375, + "logps/rejected": -373.8777160644531, + "loss": 0.7759, + "rewards/accuracies": 0.25, + "rewards/chosen": 0.2337905764579773, + "rewards/margins": -0.07853439450263977, + "rewards/rejected": 0.31232497096061707, + "step": 4664 + }, + { + "epoch": 0.7214382369998067, + "grad_norm": 5.687023162841797, + "learning_rate": 4.219555504639707e-06, + "logits/chosen": 12.600521087646484, + "logits/rejected": 7.528505325317383, + "logps/chosen": -310.04071044921875, + "logps/rejected": -344.87689208984375, + "loss": 0.5636, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.65812087059021, + "rewards/margins": 0.33446481823921204, + "rewards/rejected": 0.32365599274635315, + "step": 4665 + }, + { + "epoch": 0.7215928861395708, + "grad_norm": 5.265872955322266, + "learning_rate": 4.219269102990034e-06, + "logits/chosen": 12.636493682861328, + "logits/rejected": 13.619648933410645, + "logps/chosen": -290.1864013671875, + "logps/rejected": -267.1124267578125, + "loss": 0.7236, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.18583893775939941, + "rewards/margins": 0.06740047037601471, + "rewards/rejected": 0.1184384748339653, + "step": 4666 + }, + { + "epoch": 0.721747535279335, + "grad_norm": 5.027142524719238, + "learning_rate": 4.21898270134036e-06, + "logits/chosen": 7.26239538192749, + "logits/rejected": 1.9124733209609985, + "logps/chosen": -240.86940002441406, + "logps/rejected": -187.12486267089844, + "loss": 0.6469, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.1723259836435318, + "rewards/margins": 0.5006253123283386, + "rewards/rejected": -0.6729512810707092, + "step": 4667 + }, + { + "epoch": 0.7219021844190991, + "grad_norm": 6.47714376449585, + "learning_rate": 4.218696299690687e-06, + "logits/chosen": 11.352617263793945, + "logits/rejected": 10.03576374053955, + "logps/chosen": -303.8058776855469, + "logps/rejected": -187.26290893554688, + "loss": 0.8221, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.18106481432914734, + "rewards/margins": -0.1451394408941269, + "rewards/rejected": 0.32620424032211304, + "step": 4668 + }, + { + "epoch": 0.7220568335588633, + "grad_norm": 7.10595178604126, + "learning_rate": 4.218409898041014e-06, + "logits/chosen": 9.532785415649414, + "logits/rejected": 5.463510513305664, + "logps/chosen": -305.3661193847656, + "logps/rejected": -405.561279296875, + "loss": 0.6468, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.4265161454677582, + "rewards/margins": 0.1635202318429947, + "rewards/rejected": 0.2629959285259247, + "step": 4669 + }, + { + "epoch": 0.7222114826986274, + "grad_norm": 5.666043758392334, + "learning_rate": 4.218123496391339e-06, + "logits/chosen": 14.798239707946777, + "logits/rejected": 12.213676452636719, + "logps/chosen": -252.88157653808594, + "logps/rejected": -224.15078735351562, + "loss": 0.7198, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.144762322306633, + "rewards/margins": 0.16181355714797974, + "rewards/rejected": -0.01705123484134674, + "step": 4670 + }, + { + "epoch": 0.7223661318383916, + "grad_norm": 4.627646446228027, + "learning_rate": 4.217837094741666e-06, + "logits/chosen": 8.296591758728027, + "logits/rejected": 7.6592817306518555, + "logps/chosen": -243.8062744140625, + "logps/rejected": -255.21615600585938, + "loss": 0.6379, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.14714735746383667, + "rewards/margins": 0.24118289351463318, + "rewards/rejected": -0.0940355509519577, + "step": 4671 + }, + { + "epoch": 0.7225207809781559, + "grad_norm": 5.429712295532227, + "learning_rate": 4.217550693091993e-06, + "logits/chosen": 11.345115661621094, + "logits/rejected": 10.367274284362793, + "logps/chosen": -258.7862854003906, + "logps/rejected": -271.5745544433594, + "loss": 0.6911, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.23161429166793823, + "rewards/margins": 0.35463401675224304, + "rewards/rejected": -0.1230197325348854, + "step": 4672 + }, + { + "epoch": 0.72267543011792, + "grad_norm": 6.383058547973633, + "learning_rate": 4.217264291442319e-06, + "logits/chosen": 11.830735206604004, + "logits/rejected": 5.300699234008789, + "logps/chosen": -394.55120849609375, + "logps/rejected": -401.2049255371094, + "loss": 0.5976, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.6347014307975769, + "rewards/margins": 0.24926090240478516, + "rewards/rejected": 0.38544055819511414, + "step": 4673 + }, + { + "epoch": 0.7228300792576842, + "grad_norm": 5.6670451164245605, + "learning_rate": 4.216977889792645e-06, + "logits/chosen": 10.207159042358398, + "logits/rejected": 10.706491470336914, + "logps/chosen": -358.0547180175781, + "logps/rejected": -348.438232421875, + "loss": 0.51, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.6313265562057495, + "rewards/margins": 0.44541940093040466, + "rewards/rejected": 0.18590718507766724, + "step": 4674 + }, + { + "epoch": 0.7229847283974483, + "grad_norm": 4.458642959594727, + "learning_rate": 4.216691488142972e-06, + "logits/chosen": 13.730224609375, + "logits/rejected": 7.8176188468933105, + "logps/chosen": -215.33334350585938, + "logps/rejected": -151.25917053222656, + "loss": 0.5262, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.22707036137580872, + "rewards/margins": 0.5649893879890442, + "rewards/rejected": -0.3379189968109131, + "step": 4675 + }, + { + "epoch": 0.7231393775372125, + "grad_norm": 5.969623565673828, + "learning_rate": 4.2164050864932984e-06, + "logits/chosen": 4.943775177001953, + "logits/rejected": 6.249081611633301, + "logps/chosen": -267.60107421875, + "logps/rejected": -259.83892822265625, + "loss": 0.6768, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.24906936287879944, + "rewards/margins": 0.10447268933057785, + "rewards/rejected": 0.144596666097641, + "step": 4676 + }, + { + "epoch": 0.7232940266769766, + "grad_norm": 7.427109718322754, + "learning_rate": 4.216118684843625e-06, + "logits/chosen": 3.4262521266937256, + "logits/rejected": 4.808072090148926, + "logps/chosen": -288.4163513183594, + "logps/rejected": -293.7523193359375, + "loss": 0.8102, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.14433889091014862, + "rewards/margins": -0.06348054111003876, + "rewards/rejected": -0.08085831999778748, + "step": 4677 + }, + { + "epoch": 0.7234486758167408, + "grad_norm": 5.3566083908081055, + "learning_rate": 4.215832283193952e-06, + "logits/chosen": 13.013839721679688, + "logits/rejected": 11.353792190551758, + "logps/chosen": -306.7126159667969, + "logps/rejected": -241.7992706298828, + "loss": 0.5584, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.34898167848587036, + "rewards/margins": 0.4810735285282135, + "rewards/rejected": -0.13209183514118195, + "step": 4678 + }, + { + "epoch": 0.7236033249565049, + "grad_norm": 4.50926399230957, + "learning_rate": 4.2155458815442775e-06, + "logits/chosen": 11.905725479125977, + "logits/rejected": 6.3731560707092285, + "logps/chosen": -228.26748657226562, + "logps/rejected": -194.92642211914062, + "loss": 0.5013, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.46943017840385437, + "rewards/margins": 0.5827988386154175, + "rewards/rejected": -0.1133686751127243, + "step": 4679 + }, + { + "epoch": 0.7237579740962691, + "grad_norm": 4.547010898590088, + "learning_rate": 4.215259479894604e-06, + "logits/chosen": 14.651841163635254, + "logits/rejected": 13.051101684570312, + "logps/chosen": -285.75732421875, + "logps/rejected": -295.51983642578125, + "loss": 0.6404, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.3800147473812103, + "rewards/margins": 0.20444633066654205, + "rewards/rejected": 0.17556840181350708, + "step": 4680 + }, + { + "epoch": 0.7239126232360332, + "grad_norm": 3.6450984477996826, + "learning_rate": 4.214973078244931e-06, + "logits/chosen": 6.817581653594971, + "logits/rejected": 3.956143379211426, + "logps/chosen": -161.10848999023438, + "logps/rejected": -129.74124145507812, + "loss": 0.6635, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.15393391251564026, + "rewards/margins": 0.06820394098758698, + "rewards/rejected": 0.08572997897863388, + "step": 4681 + }, + { + "epoch": 0.7240672723757974, + "grad_norm": 5.038077354431152, + "learning_rate": 4.2146866765952575e-06, + "logits/chosen": 6.799264907836914, + "logits/rejected": 10.929970741271973, + "logps/chosen": -167.8234100341797, + "logps/rejected": -192.6121826171875, + "loss": 0.7377, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.043410152196884155, + "rewards/margins": 0.10863767564296722, + "rewards/rejected": -0.06522750109434128, + "step": 4682 + }, + { + "epoch": 0.7242219215155615, + "grad_norm": 7.810519695281982, + "learning_rate": 4.214400274945584e-06, + "logits/chosen": 6.762552738189697, + "logits/rejected": 7.165021896362305, + "logps/chosen": -168.40550231933594, + "logps/rejected": -168.51998901367188, + "loss": 0.9501, + "rewards/accuracies": 0.25, + "rewards/chosen": -0.6140388250350952, + "rewards/margins": -0.37804698944091797, + "rewards/rejected": -0.23599186539649963, + "step": 4683 + }, + { + "epoch": 0.7243765706553257, + "grad_norm": 6.186726093292236, + "learning_rate": 4.21411387329591e-06, + "logits/chosen": 6.37883996963501, + "logits/rejected": 3.2008724212646484, + "logps/chosen": -265.9684753417969, + "logps/rejected": -208.27178955078125, + "loss": 0.4794, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.25573229789733887, + "rewards/margins": 0.6330490112304688, + "rewards/rejected": -0.3773166835308075, + "step": 4684 + }, + { + "epoch": 0.7245312197950899, + "grad_norm": 7.938426494598389, + "learning_rate": 4.213827471646237e-06, + "logits/chosen": 1.4458963871002197, + "logits/rejected": 4.18729305267334, + "logps/chosen": -433.803955078125, + "logps/rejected": -410.27362060546875, + "loss": 0.6554, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.33891814947128296, + "rewards/margins": 0.16704228520393372, + "rewards/rejected": 0.17187586426734924, + "step": 4685 + }, + { + "epoch": 0.7246858689348541, + "grad_norm": 15.807042121887207, + "learning_rate": 4.213541069996563e-06, + "logits/chosen": 14.012039184570312, + "logits/rejected": 8.866901397705078, + "logps/chosen": -315.65911865234375, + "logps/rejected": -259.985107421875, + "loss": 0.7432, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.3463870882987976, + "rewards/margins": 0.02259824424982071, + "rewards/rejected": 0.3237888514995575, + "step": 4686 + }, + { + "epoch": 0.7248405180746182, + "grad_norm": 4.948505878448486, + "learning_rate": 4.21325466834689e-06, + "logits/chosen": 17.208398818969727, + "logits/rejected": 8.785602569580078, + "logps/chosen": -292.0123291015625, + "logps/rejected": -170.625732421875, + "loss": 0.69, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.3512876331806183, + "rewards/margins": 0.16835862398147583, + "rewards/rejected": 0.18292903900146484, + "step": 4687 + }, + { + "epoch": 0.7249951672143824, + "grad_norm": 4.025766849517822, + "learning_rate": 4.2129682666972166e-06, + "logits/chosen": 13.391169548034668, + "logits/rejected": 10.078253746032715, + "logps/chosen": -210.8770751953125, + "logps/rejected": -198.12782287597656, + "loss": 0.7102, + "rewards/accuracies": 0.25, + "rewards/chosen": 0.010708902031183243, + "rewards/margins": 0.06500528752803802, + "rewards/rejected": -0.05429638922214508, + "step": 4688 + }, + { + "epoch": 0.7251498163541465, + "grad_norm": 6.953975200653076, + "learning_rate": 4.212681865047543e-06, + "logits/chosen": 12.4249906539917, + "logits/rejected": 7.346518516540527, + "logps/chosen": -282.6623229980469, + "logps/rejected": -266.74237060546875, + "loss": 0.6153, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.018584638833999634, + "rewards/margins": 0.38569384813308716, + "rewards/rejected": -0.4042784869670868, + "step": 4689 + }, + { + "epoch": 0.7253044654939107, + "grad_norm": 7.109914779663086, + "learning_rate": 4.21239546339787e-06, + "logits/chosen": 15.69974136352539, + "logits/rejected": 11.655282974243164, + "logps/chosen": -400.8044738769531, + "logps/rejected": -367.01409912109375, + "loss": 0.6945, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.18244630098342896, + "rewards/margins": 0.11077319830656052, + "rewards/rejected": 0.07167311012744904, + "step": 4690 + }, + { + "epoch": 0.7254591146336749, + "grad_norm": 7.34127950668335, + "learning_rate": 4.212109061748196e-06, + "logits/chosen": 8.716079711914062, + "logits/rejected": 8.624029159545898, + "logps/chosen": -246.34829711914062, + "logps/rejected": -177.95440673828125, + "loss": 0.5315, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.2629864811897278, + "rewards/margins": 0.4446890652179718, + "rewards/rejected": -0.1817026138305664, + "step": 4691 + }, + { + "epoch": 0.725613763773439, + "grad_norm": 5.421318531036377, + "learning_rate": 4.211822660098522e-06, + "logits/chosen": 11.135812759399414, + "logits/rejected": 12.228429794311523, + "logps/chosen": -169.8853759765625, + "logps/rejected": -197.93788146972656, + "loss": 0.6319, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.26685449481010437, + "rewards/margins": 0.20001932978630066, + "rewards/rejected": -0.46687382459640503, + "step": 4692 + }, + { + "epoch": 0.7257684129132032, + "grad_norm": 6.548890590667725, + "learning_rate": 4.211536258448849e-06, + "logits/chosen": 6.484709739685059, + "logits/rejected": 6.935871124267578, + "logps/chosen": -194.16201782226562, + "logps/rejected": -214.46810913085938, + "loss": 0.712, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.3927028477191925, + "rewards/margins": 0.12043533474206924, + "rewards/rejected": -0.5131381750106812, + "step": 4693 + }, + { + "epoch": 0.7259230620529673, + "grad_norm": 4.851171016693115, + "learning_rate": 4.211249856799176e-06, + "logits/chosen": 9.42538070678711, + "logits/rejected": 8.364532470703125, + "logps/chosen": -263.5284118652344, + "logps/rejected": -320.7128601074219, + "loss": 0.4806, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.2540663778781891, + "rewards/margins": 0.527794361114502, + "rewards/rejected": -0.27372801303863525, + "step": 4694 + }, + { + "epoch": 0.7260777111927315, + "grad_norm": 3.683863878250122, + "learning_rate": 4.210963455149502e-06, + "logits/chosen": 10.690038681030273, + "logits/rejected": 7.809244632720947, + "logps/chosen": -220.42422485351562, + "logps/rejected": -141.40753173828125, + "loss": 0.4945, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.37099236249923706, + "rewards/margins": 0.5335652828216553, + "rewards/rejected": -0.1625729501247406, + "step": 4695 + }, + { + "epoch": 0.7262323603324956, + "grad_norm": 9.078512191772461, + "learning_rate": 4.210677053499829e-06, + "logits/chosen": 3.4368064403533936, + "logits/rejected": 1.8862788677215576, + "logps/chosen": -289.85150146484375, + "logps/rejected": -457.8827209472656, + "loss": 0.6034, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.2011779397726059, + "rewards/margins": 0.2352014183998108, + "rewards/rejected": -0.034023478627204895, + "step": 4696 + }, + { + "epoch": 0.7263870094722598, + "grad_norm": 4.544132709503174, + "learning_rate": 4.2103906518501556e-06, + "logits/chosen": 9.603703498840332, + "logits/rejected": 4.0646233558654785, + "logps/chosen": -269.1265869140625, + "logps/rejected": -249.3544921875, + "loss": 0.5998, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.2045774757862091, + "rewards/margins": 0.2614637017250061, + "rewards/rejected": -0.05688624083995819, + "step": 4697 + }, + { + "epoch": 0.726541658612024, + "grad_norm": 4.941222190856934, + "learning_rate": 4.210104250200481e-06, + "logits/chosen": 8.435171127319336, + "logits/rejected": 10.146812438964844, + "logps/chosen": -295.2537536621094, + "logps/rejected": -275.2083740234375, + "loss": 0.7124, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.31353017687797546, + "rewards/margins": -0.003013603389263153, + "rewards/rejected": 0.3165437579154968, + "step": 4698 + }, + { + "epoch": 0.7266963077517882, + "grad_norm": 5.756451606750488, + "learning_rate": 4.209817848550808e-06, + "logits/chosen": 14.595026969909668, + "logits/rejected": 8.859549522399902, + "logps/chosen": -290.5458984375, + "logps/rejected": -230.3878173828125, + "loss": 0.6936, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.151070237159729, + "rewards/margins": 0.1971987932920456, + "rewards/rejected": -0.04612858593463898, + "step": 4699 + }, + { + "epoch": 0.7268509568915523, + "grad_norm": 6.423048496246338, + "learning_rate": 4.209531446901135e-06, + "logits/chosen": 7.840253829956055, + "logits/rejected": 8.613065719604492, + "logps/chosen": -315.06787109375, + "logps/rejected": -353.32757568359375, + "loss": 0.6718, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.17087221145629883, + "rewards/margins": 0.2379167228937149, + "rewards/rejected": -0.06704450398683548, + "step": 4700 + }, + { + "epoch": 0.7270056060313165, + "grad_norm": 6.030930995941162, + "learning_rate": 4.209245045251461e-06, + "logits/chosen": 11.919048309326172, + "logits/rejected": 16.987382888793945, + "logps/chosen": -240.172607421875, + "logps/rejected": -270.4597473144531, + "loss": 0.7472, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.25187310576438904, + "rewards/margins": -0.061295412480831146, + "rewards/rejected": 0.3131685256958008, + "step": 4701 + }, + { + "epoch": 0.7271602551710806, + "grad_norm": 4.687925815582275, + "learning_rate": 4.208958643601788e-06, + "logits/chosen": 13.742670059204102, + "logits/rejected": 12.651939392089844, + "logps/chosen": -253.6357421875, + "logps/rejected": -240.97171020507812, + "loss": 0.6148, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.2723259925842285, + "rewards/margins": 0.20202729105949402, + "rewards/rejected": 0.0702986791729927, + "step": 4702 + }, + { + "epoch": 0.7273149043108448, + "grad_norm": 5.41274881362915, + "learning_rate": 4.208672241952115e-06, + "logits/chosen": 12.720199584960938, + "logits/rejected": 3.3738980293273926, + "logps/chosen": -396.3179626464844, + "logps/rejected": -265.2395324707031, + "loss": 0.4783, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.6759431958198547, + "rewards/margins": 0.7797024846076965, + "rewards/rejected": -0.1037592887878418, + "step": 4703 + }, + { + "epoch": 0.7274695534506089, + "grad_norm": 5.377590179443359, + "learning_rate": 4.20838584030244e-06, + "logits/chosen": 12.636724472045898, + "logits/rejected": 12.154258728027344, + "logps/chosen": -269.744873046875, + "logps/rejected": -206.98141479492188, + "loss": 0.7226, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.09787529706954956, + "rewards/margins": -0.03338862210512161, + "rewards/rejected": 0.13126391172409058, + "step": 4704 + }, + { + "epoch": 0.7276242025903731, + "grad_norm": 4.086818218231201, + "learning_rate": 4.208099438652767e-06, + "logits/chosen": 5.638157367706299, + "logits/rejected": 6.079058647155762, + "logps/chosen": -269.8873596191406, + "logps/rejected": -220.60565185546875, + "loss": 0.4946, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.4810035824775696, + "rewards/margins": 0.5613747835159302, + "rewards/rejected": -0.08037119358778, + "step": 4705 + }, + { + "epoch": 0.7277788517301372, + "grad_norm": 7.9334893226623535, + "learning_rate": 4.207813037003094e-06, + "logits/chosen": 10.022344589233398, + "logits/rejected": 3.5141170024871826, + "logps/chosen": -441.4126281738281, + "logps/rejected": -255.1077117919922, + "loss": 0.6307, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.2591535747051239, + "rewards/margins": 0.3867121934890747, + "rewards/rejected": -0.1275586187839508, + "step": 4706 + }, + { + "epoch": 0.7279335008699014, + "grad_norm": 5.438113689422607, + "learning_rate": 4.20752663535342e-06, + "logits/chosen": 14.031675338745117, + "logits/rejected": 6.951600074768066, + "logps/chosen": -468.0054016113281, + "logps/rejected": -335.5232849121094, + "loss": 0.5249, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.1453966200351715, + "rewards/margins": 0.5273954272270203, + "rewards/rejected": -0.38199883699417114, + "step": 4707 + }, + { + "epoch": 0.7280881500096655, + "grad_norm": 3.688727617263794, + "learning_rate": 4.207240233703746e-06, + "logits/chosen": 14.206121444702148, + "logits/rejected": 5.133615493774414, + "logps/chosen": -288.57904052734375, + "logps/rejected": -191.79388427734375, + "loss": 0.5379, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.17728619277477264, + "rewards/margins": 0.4602702856063843, + "rewards/rejected": -0.2829841077327728, + "step": 4708 + }, + { + "epoch": 0.7282427991494297, + "grad_norm": 8.024949073791504, + "learning_rate": 4.206953832054073e-06, + "logits/chosen": 14.46672248840332, + "logits/rejected": 7.849008560180664, + "logps/chosen": -473.2665100097656, + "logps/rejected": -273.0191650390625, + "loss": 0.6551, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.28687095642089844, + "rewards/margins": 0.13161106407642365, + "rewards/rejected": 0.155259907245636, + "step": 4709 + }, + { + "epoch": 0.7283974482891938, + "grad_norm": 4.506289482116699, + "learning_rate": 4.2066674304043995e-06, + "logits/chosen": 5.828250885009766, + "logits/rejected": 6.773597717285156, + "logps/chosen": -179.40469360351562, + "logps/rejected": -224.75875854492188, + "loss": 0.6985, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.0276484452188015, + "rewards/margins": 0.016211219131946564, + "rewards/rejected": -0.04385966807603836, + "step": 4710 + }, + { + "epoch": 0.7285520974289581, + "grad_norm": 4.69533109664917, + "learning_rate": 4.206381028754726e-06, + "logits/chosen": 10.374015808105469, + "logits/rejected": 9.881458282470703, + "logps/chosen": -278.9754943847656, + "logps/rejected": -288.93609619140625, + "loss": 0.6551, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.49047279357910156, + "rewards/margins": 0.2534168064594269, + "rewards/rejected": 0.23705598711967468, + "step": 4711 + }, + { + "epoch": 0.7287067465687223, + "grad_norm": 8.948022842407227, + "learning_rate": 4.206094627105052e-06, + "logits/chosen": 11.090749740600586, + "logits/rejected": 11.137782096862793, + "logps/chosen": -235.52304077148438, + "logps/rejected": -216.55130004882812, + "loss": 0.6269, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.055186688899993896, + "rewards/margins": 0.3071196675300598, + "rewards/rejected": -0.2519330084323883, + "step": 4712 + }, + { + "epoch": 0.7288613957084864, + "grad_norm": 4.812723159790039, + "learning_rate": 4.205808225455379e-06, + "logits/chosen": 9.250722885131836, + "logits/rejected": 6.440821647644043, + "logps/chosen": -253.46392822265625, + "logps/rejected": -308.44012451171875, + "loss": 0.5141, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.4054829478263855, + "rewards/margins": 0.5554682612419128, + "rewards/rejected": -0.14998534321784973, + "step": 4713 + }, + { + "epoch": 0.7290160448482506, + "grad_norm": 7.435058116912842, + "learning_rate": 4.205521823805705e-06, + "logits/chosen": 6.702761173248291, + "logits/rejected": 9.35500431060791, + "logps/chosen": -401.7355651855469, + "logps/rejected": -339.3660888671875, + "loss": 0.6215, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.12997551262378693, + "rewards/margins": 0.24712444841861725, + "rewards/rejected": -0.3770999312400818, + "step": 4714 + }, + { + "epoch": 0.7291706939880147, + "grad_norm": 6.192553997039795, + "learning_rate": 4.205235422156032e-06, + "logits/chosen": 6.972842216491699, + "logits/rejected": 10.829671859741211, + "logps/chosen": -224.13345336914062, + "logps/rejected": -254.299072265625, + "loss": 0.7744, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.15659284591674805, + "rewards/margins": -0.07944852858781815, + "rewards/rejected": -0.07714433968067169, + "step": 4715 + }, + { + "epoch": 0.7293253431277789, + "grad_norm": 8.186197280883789, + "learning_rate": 4.2049490205063585e-06, + "logits/chosen": 13.802979469299316, + "logits/rejected": 7.220629692077637, + "logps/chosen": -434.03558349609375, + "logps/rejected": -249.51254272460938, + "loss": 0.7413, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.09063348919153214, + "rewards/margins": -0.06410132348537445, + "rewards/rejected": -0.02653217688202858, + "step": 4716 + }, + { + "epoch": 0.729479992267543, + "grad_norm": 7.190593719482422, + "learning_rate": 4.204662618856684e-06, + "logits/chosen": -1.2313786745071411, + "logits/rejected": 8.329784393310547, + "logps/chosen": -188.1888427734375, + "logps/rejected": -320.2513732910156, + "loss": 0.8256, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.2141110897064209, + "rewards/margins": -0.10777591168880463, + "rewards/rejected": 0.3218870162963867, + "step": 4717 + }, + { + "epoch": 0.7296346414073072, + "grad_norm": 3.9558451175689697, + "learning_rate": 4.204376217207011e-06, + "logits/chosen": 13.355031967163086, + "logits/rejected": 14.219476699829102, + "logps/chosen": -238.9228515625, + "logps/rejected": -235.6582489013672, + "loss": 0.4603, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.39660704135894775, + "rewards/margins": 0.5685549974441528, + "rewards/rejected": -0.17194795608520508, + "step": 4718 + }, + { + "epoch": 0.7297892905470713, + "grad_norm": 3.863690137863159, + "learning_rate": 4.204089815557338e-06, + "logits/chosen": 15.465452194213867, + "logits/rejected": 3.9272823333740234, + "logps/chosen": -221.74464416503906, + "logps/rejected": -112.0164794921875, + "loss": 0.5725, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.20435886085033417, + "rewards/margins": 0.30952098965644836, + "rewards/rejected": -0.10516209900379181, + "step": 4719 + }, + { + "epoch": 0.7299439396868355, + "grad_norm": 5.704815864562988, + "learning_rate": 4.203803413907664e-06, + "logits/chosen": 7.788920879364014, + "logits/rejected": 9.360577583312988, + "logps/chosen": -278.876220703125, + "logps/rejected": -325.6363525390625, + "loss": 0.4767, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.30553552508354187, + "rewards/margins": 0.5258297324180603, + "rewards/rejected": -0.22029419243335724, + "step": 4720 + }, + { + "epoch": 0.7300985888265996, + "grad_norm": 5.160268783569336, + "learning_rate": 4.203517012257991e-06, + "logits/chosen": 11.97552490234375, + "logits/rejected": 4.361750602722168, + "logps/chosen": -324.85284423828125, + "logps/rejected": -205.0785675048828, + "loss": 0.4747, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.15700645744800568, + "rewards/margins": 0.6302173733711243, + "rewards/rejected": -0.4732109308242798, + "step": 4721 + }, + { + "epoch": 0.7302532379663638, + "grad_norm": 9.462347030639648, + "learning_rate": 4.203230610608318e-06, + "logits/chosen": 9.682936668395996, + "logits/rejected": 5.709912300109863, + "logps/chosen": -255.21612548828125, + "logps/rejected": -246.0841522216797, + "loss": 0.7568, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.016873463988304138, + "rewards/margins": -0.029597178101539612, + "rewards/rejected": 0.04647064581513405, + "step": 4722 + }, + { + "epoch": 0.7304078871061279, + "grad_norm": 5.32291316986084, + "learning_rate": 4.202944208958644e-06, + "logits/chosen": 9.940293312072754, + "logits/rejected": 6.945707321166992, + "logps/chosen": -270.47711181640625, + "logps/rejected": -200.5116424560547, + "loss": 0.5554, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.004439353942871094, + "rewards/margins": 0.5420545339584351, + "rewards/rejected": -0.5376152396202087, + "step": 4723 + }, + { + "epoch": 0.7305625362458922, + "grad_norm": 5.322466850280762, + "learning_rate": 4.20265780730897e-06, + "logits/chosen": 11.245365142822266, + "logits/rejected": 6.75719690322876, + "logps/chosen": -281.06927490234375, + "logps/rejected": -216.71633911132812, + "loss": 0.6863, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.44138044118881226, + "rewards/margins": 0.4201873540878296, + "rewards/rejected": 0.021193087100982666, + "step": 4724 + }, + { + "epoch": 0.7307171853856563, + "grad_norm": 9.266231536865234, + "learning_rate": 4.202371405659297e-06, + "logits/chosen": 8.465092658996582, + "logits/rejected": 13.018942832946777, + "logps/chosen": -375.2256774902344, + "logps/rejected": -444.3348693847656, + "loss": 0.7855, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.4741671085357666, + "rewards/margins": -0.051219433546066284, + "rewards/rejected": 0.5253865718841553, + "step": 4725 + }, + { + "epoch": 0.7308718345254205, + "grad_norm": 7.655884265899658, + "learning_rate": 4.202085004009623e-06, + "logits/chosen": 11.626346588134766, + "logits/rejected": 8.894448280334473, + "logps/chosen": -267.4024658203125, + "logps/rejected": -333.1418762207031, + "loss": 0.7425, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.20657792687416077, + "rewards/margins": 0.060829345136880875, + "rewards/rejected": 0.14574861526489258, + "step": 4726 + }, + { + "epoch": 0.7310264836651846, + "grad_norm": 6.579062461853027, + "learning_rate": 4.20179860235995e-06, + "logits/chosen": 11.06484603881836, + "logits/rejected": 11.69767951965332, + "logps/chosen": -353.1047058105469, + "logps/rejected": -390.7818908691406, + "loss": 0.6783, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.3880038857460022, + "rewards/margins": 0.30388742685317993, + "rewards/rejected": 0.08411645889282227, + "step": 4727 + }, + { + "epoch": 0.7311811328049488, + "grad_norm": 5.405943393707275, + "learning_rate": 4.201512200710277e-06, + "logits/chosen": 5.7828521728515625, + "logits/rejected": 0.461294025182724, + "logps/chosen": -272.8395690917969, + "logps/rejected": -191.7827606201172, + "loss": 0.6459, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.18535934388637543, + "rewards/margins": 0.39121270179748535, + "rewards/rejected": -0.20585334300994873, + "step": 4728 + }, + { + "epoch": 0.7313357819447129, + "grad_norm": 5.422591209411621, + "learning_rate": 4.201225799060603e-06, + "logits/chosen": 9.699492454528809, + "logits/rejected": 4.114933013916016, + "logps/chosen": -381.24365234375, + "logps/rejected": -239.6695556640625, + "loss": 0.5684, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.4555453062057495, + "rewards/margins": 0.459337055683136, + "rewards/rejected": -0.0037917643785476685, + "step": 4729 + }, + { + "epoch": 0.7314904310844771, + "grad_norm": 4.382835865020752, + "learning_rate": 4.200939397410929e-06, + "logits/chosen": 11.869800567626953, + "logits/rejected": 0.9620145559310913, + "logps/chosen": -172.9580078125, + "logps/rejected": -122.19593811035156, + "loss": 0.6222, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.09751349687576294, + "rewards/margins": 0.22684121131896973, + "rewards/rejected": -0.32435470819473267, + "step": 4730 + }, + { + "epoch": 0.7316450802242412, + "grad_norm": 4.710762977600098, + "learning_rate": 4.200652995761256e-06, + "logits/chosen": 8.49771785736084, + "logits/rejected": 6.4628214836120605, + "logps/chosen": -198.4684600830078, + "logps/rejected": -180.46646118164062, + "loss": 0.5961, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.13437537848949432, + "rewards/margins": 0.4670109450817108, + "rewards/rejected": -0.3326355516910553, + "step": 4731 + }, + { + "epoch": 0.7317997293640054, + "grad_norm": 4.254397392272949, + "learning_rate": 4.200366594111582e-06, + "logits/chosen": 7.064027786254883, + "logits/rejected": 5.760613441467285, + "logps/chosen": -139.57577514648438, + "logps/rejected": -183.42982482910156, + "loss": 0.4471, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.03424603492021561, + "rewards/margins": 1.1380047798156738, + "rewards/rejected": -1.1037589311599731, + "step": 4732 + }, + { + "epoch": 0.7319543785037695, + "grad_norm": 4.80014181137085, + "learning_rate": 4.200080192461909e-06, + "logits/chosen": 10.552350044250488, + "logits/rejected": 9.416030883789062, + "logps/chosen": -274.9895324707031, + "logps/rejected": -238.43426513671875, + "loss": 0.7119, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.2599753737449646, + "rewards/margins": 0.06376844644546509, + "rewards/rejected": 0.1962069272994995, + "step": 4733 + }, + { + "epoch": 0.7321090276435337, + "grad_norm": 5.721470832824707, + "learning_rate": 4.199793790812236e-06, + "logits/chosen": 13.150030136108398, + "logits/rejected": 11.381097793579102, + "logps/chosen": -338.6885681152344, + "logps/rejected": -347.7142028808594, + "loss": 0.6256, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.5123617649078369, + "rewards/margins": 0.2433176338672638, + "rewards/rejected": 0.26904410123825073, + "step": 4734 + }, + { + "epoch": 0.7322636767832978, + "grad_norm": 8.358860969543457, + "learning_rate": 4.199507389162562e-06, + "logits/chosen": 9.067138671875, + "logits/rejected": 9.469714164733887, + "logps/chosen": -354.45556640625, + "logps/rejected": -316.4116516113281, + "loss": 0.8371, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.11283457279205322, + "rewards/margins": -0.10998454689979553, + "rewards/rejected": 0.22281914949417114, + "step": 4735 + }, + { + "epoch": 0.7324183259230621, + "grad_norm": 4.5861616134643555, + "learning_rate": 4.199220987512889e-06, + "logits/chosen": 13.042624473571777, + "logits/rejected": 4.53838586807251, + "logps/chosen": -462.93255615234375, + "logps/rejected": -259.7115478515625, + "loss": 0.438, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.29311391711235046, + "rewards/margins": 0.6601704359054565, + "rewards/rejected": -0.3670564889907837, + "step": 4736 + }, + { + "epoch": 0.7325729750628263, + "grad_norm": 6.264298439025879, + "learning_rate": 4.198934585863215e-06, + "logits/chosen": 6.475538730621338, + "logits/rejected": 5.763686180114746, + "logps/chosen": -136.54244995117188, + "logps/rejected": -277.5910949707031, + "loss": 0.6085, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.07031980156898499, + "rewards/margins": 0.27153682708740234, + "rewards/rejected": -0.20121705532073975, + "step": 4737 + }, + { + "epoch": 0.7327276242025904, + "grad_norm": 5.928725242614746, + "learning_rate": 4.1986481842135415e-06, + "logits/chosen": 7.112873554229736, + "logits/rejected": 6.089903831481934, + "logps/chosen": -181.59841918945312, + "logps/rejected": -168.23574829101562, + "loss": 0.7359, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.16707392036914825, + "rewards/margins": -0.03283804655075073, + "rewards/rejected": -0.13423585891723633, + "step": 4738 + }, + { + "epoch": 0.7328822733423546, + "grad_norm": 5.5603413581848145, + "learning_rate": 4.198361782563868e-06, + "logits/chosen": 10.272453308105469, + "logits/rejected": 3.9385972023010254, + "logps/chosen": -201.46035766601562, + "logps/rejected": -181.38662719726562, + "loss": 0.7557, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.07115526497364044, + "rewards/margins": -0.0715007334947586, + "rewards/rejected": 0.00034546852111816406, + "step": 4739 + }, + { + "epoch": 0.7330369224821187, + "grad_norm": 5.641921520233154, + "learning_rate": 4.198075380914195e-06, + "logits/chosen": 10.48651123046875, + "logits/rejected": 8.48303508758545, + "logps/chosen": -241.899658203125, + "logps/rejected": -215.71957397460938, + "loss": 0.6765, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.1212267279624939, + "rewards/margins": 0.1581042855978012, + "rewards/rejected": -0.03687753528356552, + "step": 4740 + }, + { + "epoch": 0.7331915716218829, + "grad_norm": 4.05303955078125, + "learning_rate": 4.197788979264521e-06, + "logits/chosen": 11.159069061279297, + "logits/rejected": 7.462393760681152, + "logps/chosen": -410.6734313964844, + "logps/rejected": -311.5502014160156, + "loss": 0.4606, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.5157081484794617, + "rewards/margins": 0.7549344301223755, + "rewards/rejected": -0.239226296544075, + "step": 4741 + }, + { + "epoch": 0.733346220761647, + "grad_norm": 5.8020339012146, + "learning_rate": 4.197502577614847e-06, + "logits/chosen": 12.310346603393555, + "logits/rejected": 10.4533052444458, + "logps/chosen": -382.34552001953125, + "logps/rejected": -236.3104248046875, + "loss": 0.6069, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.0025434568524360657, + "rewards/margins": 0.24380816519260406, + "rewards/rejected": -0.24635162949562073, + "step": 4742 + }, + { + "epoch": 0.7335008699014112, + "grad_norm": 6.338212966918945, + "learning_rate": 4.197216175965174e-06, + "logits/chosen": 9.718679428100586, + "logits/rejected": 8.147027015686035, + "logps/chosen": -296.2594299316406, + "logps/rejected": -247.516845703125, + "loss": 0.7865, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.11442165821790695, + "rewards/margins": 0.0996694266796112, + "rewards/rejected": -0.21409109234809875, + "step": 4743 + }, + { + "epoch": 0.7336555190411753, + "grad_norm": 5.800508499145508, + "learning_rate": 4.1969297743155005e-06, + "logits/chosen": 10.19094467163086, + "logits/rejected": 10.416265487670898, + "logps/chosen": -242.73402404785156, + "logps/rejected": -208.94903564453125, + "loss": 0.6988, + "rewards/accuracies": 0.25, + "rewards/chosen": 0.23169006407260895, + "rewards/margins": 0.007804155349731445, + "rewards/rejected": 0.2238859087228775, + "step": 4744 + }, + { + "epoch": 0.7338101681809395, + "grad_norm": 5.29016637802124, + "learning_rate": 4.196643372665827e-06, + "logits/chosen": 13.938228607177734, + "logits/rejected": 10.117415428161621, + "logps/chosen": -378.0729675292969, + "logps/rejected": -338.91815185546875, + "loss": 0.5675, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.39828574657440186, + "rewards/margins": 0.3874906897544861, + "rewards/rejected": 0.010795064270496368, + "step": 4745 + }, + { + "epoch": 0.7339648173207036, + "grad_norm": 5.569178104400635, + "learning_rate": 4.196356971016153e-06, + "logits/chosen": 7.9367523193359375, + "logits/rejected": 6.671178817749023, + "logps/chosen": -352.7064208984375, + "logps/rejected": -321.2003479003906, + "loss": 0.6386, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.08021535724401474, + "rewards/margins": 0.21329127252101898, + "rewards/rejected": -0.13307590782642365, + "step": 4746 + }, + { + "epoch": 0.7341194664604678, + "grad_norm": 8.94774055480957, + "learning_rate": 4.19607056936648e-06, + "logits/chosen": 10.536354064941406, + "logits/rejected": 4.0362935066223145, + "logps/chosen": -517.2574462890625, + "logps/rejected": -228.9273223876953, + "loss": 0.6895, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.14852410554885864, + "rewards/margins": 0.19637462496757507, + "rewards/rejected": -0.04785052686929703, + "step": 4747 + }, + { + "epoch": 0.7342741156002319, + "grad_norm": 4.9800214767456055, + "learning_rate": 4.195784167716806e-06, + "logits/chosen": 13.256465911865234, + "logits/rejected": 13.477039337158203, + "logps/chosen": -220.45750427246094, + "logps/rejected": -221.9154052734375, + "loss": 0.6814, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.09138321876525879, + "rewards/margins": 0.09960539638996124, + "rewards/rejected": -0.00822218507528305, + "step": 4748 + }, + { + "epoch": 0.7344287647399962, + "grad_norm": 4.7813286781311035, + "learning_rate": 4.195497766067133e-06, + "logits/chosen": 11.675992965698242, + "logits/rejected": 8.03116512298584, + "logps/chosen": -236.76043701171875, + "logps/rejected": -161.45748901367188, + "loss": 0.6709, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.004031464457511902, + "rewards/margins": 0.19753924012184143, + "rewards/rejected": -0.19350777566432953, + "step": 4749 + }, + { + "epoch": 0.7345834138797603, + "grad_norm": 3.82041072845459, + "learning_rate": 4.195211364417459e-06, + "logits/chosen": 11.645238876342773, + "logits/rejected": 8.816662788391113, + "logps/chosen": -240.3917236328125, + "logps/rejected": -207.9389190673828, + "loss": 0.5737, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.21743574738502502, + "rewards/margins": 0.31267639994621277, + "rewards/rejected": -0.09524065256118774, + "step": 4750 + }, + { + "epoch": 0.7347380630195245, + "grad_norm": 5.4435505867004395, + "learning_rate": 4.194924962767785e-06, + "logits/chosen": 11.81915283203125, + "logits/rejected": 16.080331802368164, + "logps/chosen": -252.87496948242188, + "logps/rejected": -351.0946960449219, + "loss": 0.6349, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.11683613061904907, + "rewards/margins": 0.2595912516117096, + "rewards/rejected": -0.37642738223075867, + "step": 4751 + }, + { + "epoch": 0.7348927121592886, + "grad_norm": 6.407220363616943, + "learning_rate": 4.194638561118112e-06, + "logits/chosen": 11.030671119689941, + "logits/rejected": 10.792478561401367, + "logps/chosen": -316.3838195800781, + "logps/rejected": -304.4842529296875, + "loss": 0.7744, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.02871304750442505, + "rewards/margins": -0.08277063071727753, + "rewards/rejected": 0.05405759811401367, + "step": 4752 + }, + { + "epoch": 0.7350473612990528, + "grad_norm": 6.015807151794434, + "learning_rate": 4.194352159468439e-06, + "logits/chosen": 12.402322769165039, + "logits/rejected": 11.337557792663574, + "logps/chosen": -375.75665283203125, + "logps/rejected": -358.983154296875, + "loss": 0.5339, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.5309127569198608, + "rewards/margins": 0.4515155553817749, + "rewards/rejected": 0.07939721643924713, + "step": 4753 + }, + { + "epoch": 0.7352020104388169, + "grad_norm": 6.017291069030762, + "learning_rate": 4.194065757818765e-06, + "logits/chosen": 8.792664527893066, + "logits/rejected": 6.598265171051025, + "logps/chosen": -336.721923828125, + "logps/rejected": -236.42062377929688, + "loss": 0.6941, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.33871591091156006, + "rewards/margins": 0.2760820686817169, + "rewards/rejected": 0.06263384222984314, + "step": 4754 + }, + { + "epoch": 0.7353566595785811, + "grad_norm": 11.23551082611084, + "learning_rate": 4.193779356169092e-06, + "logits/chosen": 13.905418395996094, + "logits/rejected": 12.497049331665039, + "logps/chosen": -300.74884033203125, + "logps/rejected": -259.16766357421875, + "loss": 1.0682, + "rewards/accuracies": 0.125, + "rewards/chosen": 0.03809165954589844, + "rewards/margins": -0.5893455743789673, + "rewards/rejected": 0.6274372339248657, + "step": 4755 + }, + { + "epoch": 0.7355113087183452, + "grad_norm": 5.389484405517578, + "learning_rate": 4.193492954519419e-06, + "logits/chosen": 13.51899528503418, + "logits/rejected": 12.981062889099121, + "logps/chosen": -326.459228515625, + "logps/rejected": -315.577392578125, + "loss": 0.6013, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.4855082035064697, + "rewards/margins": 0.24323587119579315, + "rewards/rejected": 0.242272287607193, + "step": 4756 + }, + { + "epoch": 0.7356659578581094, + "grad_norm": 6.513172626495361, + "learning_rate": 4.1932065528697444e-06, + "logits/chosen": 9.911660194396973, + "logits/rejected": 8.243659973144531, + "logps/chosen": -306.8182373046875, + "logps/rejected": -256.8122863769531, + "loss": 0.7903, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.008855774998664856, + "rewards/margins": -0.12659047544002533, + "rewards/rejected": 0.13544625043869019, + "step": 4757 + }, + { + "epoch": 0.7358206069978735, + "grad_norm": 60.23966598510742, + "learning_rate": 4.192920151220071e-06, + "logits/chosen": 8.791994094848633, + "logits/rejected": 8.53953742980957, + "logps/chosen": -418.1119384765625, + "logps/rejected": -295.19207763671875, + "loss": 0.8701, + "rewards/accuracies": 0.25, + "rewards/chosen": -0.4879385828971863, + "rewards/margins": -0.28741228580474854, + "rewards/rejected": -0.20052632689476013, + "step": 4758 + }, + { + "epoch": 0.7359752561376377, + "grad_norm": 6.296421051025391, + "learning_rate": 4.192633749570398e-06, + "logits/chosen": 14.486000061035156, + "logits/rejected": 10.41128158569336, + "logps/chosen": -353.5166931152344, + "logps/rejected": -281.64422607421875, + "loss": 0.7174, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.29859182238578796, + "rewards/margins": 0.08769933879375458, + "rewards/rejected": 0.2108924835920334, + "step": 4759 + }, + { + "epoch": 0.7361299052774019, + "grad_norm": 5.47121000289917, + "learning_rate": 4.192347347920724e-06, + "logits/chosen": 9.593307495117188, + "logits/rejected": 10.236595153808594, + "logps/chosen": -160.1764373779297, + "logps/rejected": -201.8399658203125, + "loss": 0.7197, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.28673186898231506, + "rewards/margins": 0.010794736444950104, + "rewards/rejected": -0.29752659797668457, + "step": 4760 + }, + { + "epoch": 0.736284554417166, + "grad_norm": 4.923763275146484, + "learning_rate": 4.192060946271051e-06, + "logits/chosen": 11.913820266723633, + "logits/rejected": 7.56839656829834, + "logps/chosen": -241.36947631835938, + "logps/rejected": -205.43011474609375, + "loss": 0.7003, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.09711894392967224, + "rewards/margins": 0.06295973062515259, + "rewards/rejected": -0.16007867455482483, + "step": 4761 + }, + { + "epoch": 0.7364392035569303, + "grad_norm": 5.6501688957214355, + "learning_rate": 4.191774544621378e-06, + "logits/chosen": 7.237883567810059, + "logits/rejected": 10.479668617248535, + "logps/chosen": -323.89849853515625, + "logps/rejected": -312.208984375, + "loss": 0.8207, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.18529434502124786, + "rewards/margins": -0.1707799881696701, + "rewards/rejected": 0.35607433319091797, + "step": 4762 + }, + { + "epoch": 0.7365938526966944, + "grad_norm": 6.281984329223633, + "learning_rate": 4.1914881429717035e-06, + "logits/chosen": 13.688703536987305, + "logits/rejected": 8.88680362701416, + "logps/chosen": -439.0293273925781, + "logps/rejected": -350.8081359863281, + "loss": 0.7373, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.26570481061935425, + "rewards/margins": 0.019310586154460907, + "rewards/rejected": 0.24639426171779633, + "step": 4763 + }, + { + "epoch": 0.7367485018364586, + "grad_norm": 6.75985860824585, + "learning_rate": 4.19120174132203e-06, + "logits/chosen": 11.57260513305664, + "logits/rejected": 12.854288101196289, + "logps/chosen": -281.9943542480469, + "logps/rejected": -294.39874267578125, + "loss": 0.7733, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.21525385975837708, + "rewards/margins": 0.18043500185012817, + "rewards/rejected": 0.0348188579082489, + "step": 4764 + }, + { + "epoch": 0.7369031509762227, + "grad_norm": 6.307370662689209, + "learning_rate": 4.190915339672357e-06, + "logits/chosen": 11.254785537719727, + "logits/rejected": 6.444768905639648, + "logps/chosen": -269.56109619140625, + "logps/rejected": -239.7490234375, + "loss": 0.776, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.33809754252433777, + "rewards/margins": -0.055138155817985535, + "rewards/rejected": 0.3932356834411621, + "step": 4765 + }, + { + "epoch": 0.7370578001159869, + "grad_norm": 4.317245960235596, + "learning_rate": 4.1906289380226834e-06, + "logits/chosen": 7.762613773345947, + "logits/rejected": 4.897261619567871, + "logps/chosen": -210.5369110107422, + "logps/rejected": -209.80514526367188, + "loss": 0.5353, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.011759907007217407, + "rewards/margins": 0.45884081721305847, + "rewards/rejected": -0.44708094000816345, + "step": 4766 + }, + { + "epoch": 0.737212449255751, + "grad_norm": 7.767302989959717, + "learning_rate": 4.19034253637301e-06, + "logits/chosen": 12.339812278747559, + "logits/rejected": 10.903360366821289, + "logps/chosen": -319.3306884765625, + "logps/rejected": -311.0950622558594, + "loss": 0.5433, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.4628310203552246, + "rewards/margins": 0.5159217119216919, + "rewards/rejected": -0.053090650588274, + "step": 4767 + }, + { + "epoch": 0.7373670983955152, + "grad_norm": 5.3224406242370605, + "learning_rate": 4.190056134723337e-06, + "logits/chosen": 14.429060935974121, + "logits/rejected": 14.173766136169434, + "logps/chosen": -252.89291381835938, + "logps/rejected": -274.78765869140625, + "loss": 0.6876, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.26096177101135254, + "rewards/margins": 0.19047455489635468, + "rewards/rejected": -0.4514363408088684, + "step": 4768 + }, + { + "epoch": 0.7375217475352793, + "grad_norm": 5.439070701599121, + "learning_rate": 4.189769733073663e-06, + "logits/chosen": 11.795591354370117, + "logits/rejected": 8.82680892944336, + "logps/chosen": -266.1576232910156, + "logps/rejected": -217.710205078125, + "loss": 0.5542, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.3184768855571747, + "rewards/margins": 0.4833378195762634, + "rewards/rejected": -0.16486096382141113, + "step": 4769 + }, + { + "epoch": 0.7376763966750435, + "grad_norm": 4.673933029174805, + "learning_rate": 4.189483331423989e-06, + "logits/chosen": 12.298160552978516, + "logits/rejected": 4.383668422698975, + "logps/chosen": -269.30419921875, + "logps/rejected": -130.8526153564453, + "loss": 0.5515, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.3152461647987366, + "rewards/margins": 0.5219026803970337, + "rewards/rejected": -0.20665651559829712, + "step": 4770 + }, + { + "epoch": 0.7378310458148076, + "grad_norm": 4.872597694396973, + "learning_rate": 4.189196929774316e-06, + "logits/chosen": 9.257883071899414, + "logits/rejected": 5.580127239227295, + "logps/chosen": -308.6827392578125, + "logps/rejected": -271.2395935058594, + "loss": 0.5661, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.05088486894965172, + "rewards/margins": 0.5715640783309937, + "rewards/rejected": -0.6224489808082581, + "step": 4771 + }, + { + "epoch": 0.7379856949545718, + "grad_norm": 4.062547206878662, + "learning_rate": 4.1889105281246425e-06, + "logits/chosen": 13.11014461517334, + "logits/rejected": 10.769596099853516, + "logps/chosen": -329.4433898925781, + "logps/rejected": -322.385986328125, + "loss": 0.5676, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.6099538207054138, + "rewards/margins": 0.4774438142776489, + "rewards/rejected": 0.1325100064277649, + "step": 4772 + }, + { + "epoch": 0.7381403440943359, + "grad_norm": 7.330388069152832, + "learning_rate": 4.188624126474969e-06, + "logits/chosen": 11.050324440002441, + "logits/rejected": 9.256900787353516, + "logps/chosen": -226.64767456054688, + "logps/rejected": -219.11105346679688, + "loss": 0.847, + "rewards/accuracies": 0.25, + "rewards/chosen": 0.08482885360717773, + "rewards/margins": -0.24830521643161774, + "rewards/rejected": 0.3331340551376343, + "step": 4773 + }, + { + "epoch": 0.7382949932341001, + "grad_norm": 8.037586212158203, + "learning_rate": 4.188337724825296e-06, + "logits/chosen": 16.929351806640625, + "logits/rejected": 12.234892845153809, + "logps/chosen": -501.61651611328125, + "logps/rejected": -454.21514892578125, + "loss": 0.7294, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.52154541015625, + "rewards/margins": 0.15335141122341156, + "rewards/rejected": 0.368194043636322, + "step": 4774 + }, + { + "epoch": 0.7384496423738643, + "grad_norm": 4.544396877288818, + "learning_rate": 4.1880513231756224e-06, + "logits/chosen": 8.279939651489258, + "logits/rejected": 3.458927631378174, + "logps/chosen": -303.1224060058594, + "logps/rejected": -239.17803955078125, + "loss": 0.5887, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.03961247205734253, + "rewards/margins": 0.48178744316101074, + "rewards/rejected": -0.4421750009059906, + "step": 4775 + }, + { + "epoch": 0.7386042915136285, + "grad_norm": 8.042491912841797, + "learning_rate": 4.187764921525948e-06, + "logits/chosen": 11.185622215270996, + "logits/rejected": 9.732324600219727, + "logps/chosen": -329.8443298339844, + "logps/rejected": -279.06817626953125, + "loss": 0.7629, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.3160988688468933, + "rewards/margins": -0.024421006441116333, + "rewards/rejected": -0.29167789220809937, + "step": 4776 + }, + { + "epoch": 0.7387589406533926, + "grad_norm": 4.627344608306885, + "learning_rate": 4.187478519876275e-06, + "logits/chosen": 8.658655166625977, + "logits/rejected": 11.560956001281738, + "logps/chosen": -211.19076538085938, + "logps/rejected": -190.307373046875, + "loss": 0.6018, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.0053098164498806, + "rewards/margins": 0.2644790709018707, + "rewards/rejected": -0.25916922092437744, + "step": 4777 + }, + { + "epoch": 0.7389135897931568, + "grad_norm": 4.292059898376465, + "learning_rate": 4.1871921182266015e-06, + "logits/chosen": 8.275537490844727, + "logits/rejected": 3.691534996032715, + "logps/chosen": -227.5767059326172, + "logps/rejected": -151.46847534179688, + "loss": 0.579, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.25836095213890076, + "rewards/margins": 0.37570351362228394, + "rewards/rejected": -0.11734256148338318, + "step": 4778 + }, + { + "epoch": 0.739068238932921, + "grad_norm": 5.199387550354004, + "learning_rate": 4.186905716576928e-06, + "logits/chosen": 12.26113510131836, + "logits/rejected": 10.05549144744873, + "logps/chosen": -325.6607971191406, + "logps/rejected": -268.3236083984375, + "loss": 0.5816, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.3688335120677948, + "rewards/margins": 0.30262085795402527, + "rewards/rejected": 0.06621265411376953, + "step": 4779 + }, + { + "epoch": 0.7392228880726851, + "grad_norm": 9.9437255859375, + "learning_rate": 4.186619314927254e-06, + "logits/chosen": 17.158878326416016, + "logits/rejected": 16.86864471435547, + "logps/chosen": -383.1300048828125, + "logps/rejected": -381.4898681640625, + "loss": 0.7604, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.0322418250143528, + "rewards/margins": -0.07112255692481995, + "rewards/rejected": 0.10336437821388245, + "step": 4780 + }, + { + "epoch": 0.7393775372124493, + "grad_norm": 4.513852596282959, + "learning_rate": 4.186332913277581e-06, + "logits/chosen": 10.36776065826416, + "logits/rejected": 7.879003524780273, + "logps/chosen": -179.33912658691406, + "logps/rejected": -158.83901977539062, + "loss": 0.7182, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.020983316004276276, + "rewards/margins": 0.04471863806247711, + "rewards/rejected": -0.02373531460762024, + "step": 4781 + }, + { + "epoch": 0.7395321863522134, + "grad_norm": 4.013879776000977, + "learning_rate": 4.186046511627907e-06, + "logits/chosen": 13.298583984375, + "logits/rejected": 7.70428991317749, + "logps/chosen": -355.2281494140625, + "logps/rejected": -263.7358703613281, + "loss": 0.4415, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.35276365280151367, + "rewards/margins": 0.7590234279632568, + "rewards/rejected": -0.40625983476638794, + "step": 4782 + }, + { + "epoch": 0.7396868354919776, + "grad_norm": 5.810451507568359, + "learning_rate": 4.185760109978234e-06, + "logits/chosen": 10.22127914428711, + "logits/rejected": 12.51368522644043, + "logps/chosen": -261.9689025878906, + "logps/rejected": -249.46876525878906, + "loss": 0.6827, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.3779284656047821, + "rewards/margins": 0.03404693305492401, + "rewards/rejected": 0.3438815474510193, + "step": 4783 + }, + { + "epoch": 0.7398414846317417, + "grad_norm": 5.822102069854736, + "learning_rate": 4.18547370832856e-06, + "logits/chosen": 8.113950729370117, + "logits/rejected": 7.097465515136719, + "logps/chosen": -358.13531494140625, + "logps/rejected": -295.42572021484375, + "loss": 0.6369, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.02648501843214035, + "rewards/margins": 0.7722447514533997, + "rewards/rejected": -0.7987297177314758, + "step": 4784 + }, + { + "epoch": 0.7399961337715059, + "grad_norm": 5.558611869812012, + "learning_rate": 4.185187306678886e-06, + "logits/chosen": 10.657334327697754, + "logits/rejected": 9.81453800201416, + "logps/chosen": -336.66241455078125, + "logps/rejected": -271.5, + "loss": 0.6709, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.0750436782836914, + "rewards/margins": 0.139068603515625, + "rewards/rejected": -0.2141122817993164, + "step": 4785 + }, + { + "epoch": 0.74015078291127, + "grad_norm": 3.9321529865264893, + "learning_rate": 4.184900905029213e-06, + "logits/chosen": 7.636336803436279, + "logits/rejected": 9.096763610839844, + "logps/chosen": -207.8610382080078, + "logps/rejected": -231.0572509765625, + "loss": 0.4856, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.13540057837963104, + "rewards/margins": 0.6203001141548157, + "rewards/rejected": -0.4848995804786682, + "step": 4786 + }, + { + "epoch": 0.7403054320510342, + "grad_norm": 5.173822402954102, + "learning_rate": 4.18461450337954e-06, + "logits/chosen": 7.526389122009277, + "logits/rejected": 1.7991468906402588, + "logps/chosen": -375.69525146484375, + "logps/rejected": -292.9147644042969, + "loss": 0.5748, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.32168281078338623, + "rewards/margins": 0.29248085618019104, + "rewards/rejected": 0.0292019285261631, + "step": 4787 + }, + { + "epoch": 0.7404600811907984, + "grad_norm": 10.130017280578613, + "learning_rate": 4.184328101729866e-06, + "logits/chosen": 5.063675880432129, + "logits/rejected": 9.111383438110352, + "logps/chosen": -440.70758056640625, + "logps/rejected": -425.63262939453125, + "loss": 0.8439, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.05717173591256142, + "rewards/margins": -0.10336827486753464, + "rewards/rejected": 0.16054001450538635, + "step": 4788 + }, + { + "epoch": 0.7406147303305626, + "grad_norm": 6.925905227661133, + "learning_rate": 4.184041700080193e-06, + "logits/chosen": 15.306264877319336, + "logits/rejected": 8.550630569458008, + "logps/chosen": -447.69024658203125, + "logps/rejected": -315.3685302734375, + "loss": 0.6885, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.3664945662021637, + "rewards/margins": 0.13312828540802002, + "rewards/rejected": 0.23336631059646606, + "step": 4789 + }, + { + "epoch": 0.7407693794703267, + "grad_norm": 19.495683670043945, + "learning_rate": 4.183755298430519e-06, + "logits/chosen": 13.757787704467773, + "logits/rejected": 2.7380151748657227, + "logps/chosen": -394.5092468261719, + "logps/rejected": -162.16622924804688, + "loss": 0.7054, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.35929790139198303, + "rewards/margins": 0.0788784995675087, + "rewards/rejected": 0.28041940927505493, + "step": 4790 + }, + { + "epoch": 0.7409240286100909, + "grad_norm": 3.8974475860595703, + "learning_rate": 4.1834688967808455e-06, + "logits/chosen": 12.83820629119873, + "logits/rejected": 6.434313774108887, + "logps/chosen": -326.9476318359375, + "logps/rejected": -289.3776550292969, + "loss": 0.5284, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.583159327507019, + "rewards/margins": 0.5215884447097778, + "rewards/rejected": 0.06157088279724121, + "step": 4791 + }, + { + "epoch": 0.741078677749855, + "grad_norm": 4.9556193351745605, + "learning_rate": 4.183182495131172e-06, + "logits/chosen": 10.742406845092773, + "logits/rejected": 8.162736892700195, + "logps/chosen": -273.11474609375, + "logps/rejected": -232.29864501953125, + "loss": 0.5548, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.23834289610385895, + "rewards/margins": 0.4092642664909363, + "rewards/rejected": -0.17092138528823853, + "step": 4792 + }, + { + "epoch": 0.7412333268896192, + "grad_norm": 5.485816478729248, + "learning_rate": 4.182896093481499e-06, + "logits/chosen": 5.3336076736450195, + "logits/rejected": 11.637184143066406, + "logps/chosen": -161.272705078125, + "logps/rejected": -269.3109130859375, + "loss": 0.6561, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.06546381115913391, + "rewards/margins": 0.2819777727127075, + "rewards/rejected": -0.2165139615535736, + "step": 4793 + }, + { + "epoch": 0.7413879760293833, + "grad_norm": 4.629825115203857, + "learning_rate": 4.182609691831825e-06, + "logits/chosen": 13.952739715576172, + "logits/rejected": 9.817672729492188, + "logps/chosen": -304.21795654296875, + "logps/rejected": -265.7994384765625, + "loss": 0.5055, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.8277340531349182, + "rewards/margins": 0.5605777502059937, + "rewards/rejected": 0.2671562135219574, + "step": 4794 + }, + { + "epoch": 0.7415426251691475, + "grad_norm": 3.8976757526397705, + "learning_rate": 4.182323290182152e-06, + "logits/chosen": 9.156255722045898, + "logits/rejected": 2.176840305328369, + "logps/chosen": -236.00279235839844, + "logps/rejected": -153.2358856201172, + "loss": 0.6101, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.3663691282272339, + "rewards/margins": 0.2540814280509949, + "rewards/rejected": 0.11228771507740021, + "step": 4795 + }, + { + "epoch": 0.7416972743089116, + "grad_norm": 3.306509017944336, + "learning_rate": 4.182036888532478e-06, + "logits/chosen": 6.861904621124268, + "logits/rejected": 6.354181289672852, + "logps/chosen": -117.49771881103516, + "logps/rejected": -133.2139434814453, + "loss": 0.6124, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.14508453011512756, + "rewards/margins": 0.18305107951164246, + "rewards/rejected": -0.037966538220644, + "step": 4796 + }, + { + "epoch": 0.7418519234486758, + "grad_norm": 6.028225421905518, + "learning_rate": 4.1817504868828045e-06, + "logits/chosen": 13.706073760986328, + "logits/rejected": 12.922555923461914, + "logps/chosen": -371.95849609375, + "logps/rejected": -356.96209716796875, + "loss": 0.6777, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.14072629809379578, + "rewards/margins": 0.14261507987976074, + "rewards/rejected": -0.28334131836891174, + "step": 4797 + }, + { + "epoch": 0.7420065725884399, + "grad_norm": 6.3322343826293945, + "learning_rate": 4.181464085233131e-06, + "logits/chosen": 11.921670913696289, + "logits/rejected": 6.583761215209961, + "logps/chosen": -435.3843078613281, + "logps/rejected": -408.2420349121094, + "loss": 0.6996, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.6460952162742615, + "rewards/margins": 0.06955307722091675, + "rewards/rejected": 0.5765421986579895, + "step": 4798 + }, + { + "epoch": 0.7421612217282041, + "grad_norm": 5.877707481384277, + "learning_rate": 4.181177683583458e-06, + "logits/chosen": 8.08895492553711, + "logits/rejected": 6.9810791015625, + "logps/chosen": -216.67037963867188, + "logps/rejected": -233.91574096679688, + "loss": 0.7302, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.2697771191596985, + "rewards/margins": -0.04122409597039223, + "rewards/rejected": 0.3110012114048004, + "step": 4799 + }, + { + "epoch": 0.7423158708679682, + "grad_norm": 3.527134656906128, + "learning_rate": 4.1808912819337845e-06, + "logits/chosen": 8.34594440460205, + "logits/rejected": 5.2310309410095215, + "logps/chosen": -162.30355834960938, + "logps/rejected": -170.98753356933594, + "loss": 0.5852, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.6474559903144836, + "rewards/margins": 0.2936580777168274, + "rewards/rejected": 0.353797972202301, + "step": 4800 + }, + { + "epoch": 0.7424705200077325, + "grad_norm": 4.963498592376709, + "learning_rate": 4.180604880284111e-06, + "logits/chosen": 11.898832321166992, + "logits/rejected": 10.114301681518555, + "logps/chosen": -314.96209716796875, + "logps/rejected": -280.0485534667969, + "loss": 0.5025, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.5252709984779358, + "rewards/margins": 0.5472241044044495, + "rewards/rejected": -0.021953091025352478, + "step": 4801 + }, + { + "epoch": 0.7426251691474967, + "grad_norm": 5.899979114532471, + "learning_rate": 4.180318478634438e-06, + "logits/chosen": 13.386738777160645, + "logits/rejected": 12.98686695098877, + "logps/chosen": -277.8770751953125, + "logps/rejected": -260.10968017578125, + "loss": 0.7403, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.2905813455581665, + "rewards/margins": -0.00035734474658966064, + "rewards/rejected": 0.29093867540359497, + "step": 4802 + }, + { + "epoch": 0.7427798182872608, + "grad_norm": 4.862941265106201, + "learning_rate": 4.1800320769847636e-06, + "logits/chosen": 13.305197715759277, + "logits/rejected": 10.171258926391602, + "logps/chosen": -203.8580780029297, + "logps/rejected": -215.6347198486328, + "loss": 0.5774, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.10327376425266266, + "rewards/margins": 0.3759494125843048, + "rewards/rejected": -0.27267563343048096, + "step": 4803 + }, + { + "epoch": 0.742934467427025, + "grad_norm": 4.671733856201172, + "learning_rate": 4.17974567533509e-06, + "logits/chosen": 10.614520072937012, + "logits/rejected": 12.669438362121582, + "logps/chosen": -211.83184814453125, + "logps/rejected": -282.8984375, + "loss": 0.6863, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.23597446084022522, + "rewards/margins": 0.07573352754116058, + "rewards/rejected": 0.16024094820022583, + "step": 4804 + }, + { + "epoch": 0.7430891165667891, + "grad_norm": 5.944483757019043, + "learning_rate": 4.179459273685417e-06, + "logits/chosen": 9.159265518188477, + "logits/rejected": 10.275708198547363, + "logps/chosen": -225.89813232421875, + "logps/rejected": -230.29693603515625, + "loss": 0.7234, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.0193081796169281, + "rewards/margins": 0.05501945689320564, + "rewards/rejected": -0.035711269825696945, + "step": 4805 + }, + { + "epoch": 0.7432437657065533, + "grad_norm": 6.932787895202637, + "learning_rate": 4.1791728720357435e-06, + "logits/chosen": 11.603013038635254, + "logits/rejected": 5.67227029800415, + "logps/chosen": -236.5516815185547, + "logps/rejected": -243.91864013671875, + "loss": 0.7817, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.384068101644516, + "rewards/margins": 0.005043424665927887, + "rewards/rejected": -0.3891115188598633, + "step": 4806 + }, + { + "epoch": 0.7433984148463174, + "grad_norm": 5.3307061195373535, + "learning_rate": 4.17888647038607e-06, + "logits/chosen": 9.669853210449219, + "logits/rejected": 6.428501605987549, + "logps/chosen": -289.4296569824219, + "logps/rejected": -262.89837646484375, + "loss": 0.5536, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.5136330723762512, + "rewards/margins": 0.3940231204032898, + "rewards/rejected": 0.11960999667644501, + "step": 4807 + }, + { + "epoch": 0.7435530639860816, + "grad_norm": 6.641125202178955, + "learning_rate": 4.178600068736397e-06, + "logits/chosen": 8.668749809265137, + "logits/rejected": 6.366659164428711, + "logps/chosen": -221.5707244873047, + "logps/rejected": -266.85565185546875, + "loss": 0.8163, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.02597782015800476, + "rewards/margins": -0.13528549671173096, + "rewards/rejected": 0.1093076765537262, + "step": 4808 + }, + { + "epoch": 0.7437077131258457, + "grad_norm": 6.800802230834961, + "learning_rate": 4.178313667086723e-06, + "logits/chosen": 15.106536865234375, + "logits/rejected": 7.903476715087891, + "logps/chosen": -354.17694091796875, + "logps/rejected": -264.623046875, + "loss": 0.6969, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.343596875667572, + "rewards/margins": 0.26012516021728516, + "rewards/rejected": 0.08347168564796448, + "step": 4809 + }, + { + "epoch": 0.7438623622656099, + "grad_norm": 4.17566442489624, + "learning_rate": 4.178027265437049e-06, + "logits/chosen": 12.727313995361328, + "logits/rejected": 3.4485666751861572, + "logps/chosen": -314.00048828125, + "logps/rejected": -192.40771484375, + "loss": 0.5045, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.3906874656677246, + "rewards/margins": 0.4727931320667267, + "rewards/rejected": -0.08210568130016327, + "step": 4810 + }, + { + "epoch": 0.744017011405374, + "grad_norm": 5.741244792938232, + "learning_rate": 4.177740863787376e-06, + "logits/chosen": 8.71609115600586, + "logits/rejected": 10.898783683776855, + "logps/chosen": -184.07550048828125, + "logps/rejected": -218.75607299804688, + "loss": 0.6761, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.16181382536888123, + "rewards/margins": 0.14146538078784943, + "rewards/rejected": -0.30327919125556946, + "step": 4811 + }, + { + "epoch": 0.7441716605451382, + "grad_norm": 4.6770710945129395, + "learning_rate": 4.177454462137703e-06, + "logits/chosen": 13.488677024841309, + "logits/rejected": 12.424948692321777, + "logps/chosen": -214.08004760742188, + "logps/rejected": -195.35145568847656, + "loss": 0.6578, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.06640143692493439, + "rewards/margins": 0.23079198598861694, + "rewards/rejected": -0.2971934676170349, + "step": 4812 + }, + { + "epoch": 0.7443263096849024, + "grad_norm": 10.468841552734375, + "learning_rate": 4.177168060488029e-06, + "logits/chosen": 9.1223726272583, + "logits/rejected": 7.271476745605469, + "logps/chosen": -333.78521728515625, + "logps/rejected": -372.5546875, + "loss": 0.8147, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.023865804076194763, + "rewards/margins": -0.09401731938123703, + "rewards/rejected": 0.1178831234574318, + "step": 4813 + }, + { + "epoch": 0.7444809588246666, + "grad_norm": 5.845936298370361, + "learning_rate": 4.176881658838355e-06, + "logits/chosen": 1.451494812965393, + "logits/rejected": 2.8326592445373535, + "logps/chosen": -233.25576782226562, + "logps/rejected": -339.4666442871094, + "loss": 0.7356, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.10075896978378296, + "rewards/margins": -0.02923470363020897, + "rewards/rejected": 0.12999367713928223, + "step": 4814 + }, + { + "epoch": 0.7446356079644307, + "grad_norm": 4.696289539337158, + "learning_rate": 4.176595257188682e-06, + "logits/chosen": 7.498291015625, + "logits/rejected": 4.349799156188965, + "logps/chosen": -220.40553283691406, + "logps/rejected": -196.92861938476562, + "loss": 0.6683, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.22897286713123322, + "rewards/margins": 0.07385443896055222, + "rewards/rejected": 0.1551184207201004, + "step": 4815 + }, + { + "epoch": 0.7447902571041949, + "grad_norm": 8.36641788482666, + "learning_rate": 4.176308855539008e-06, + "logits/chosen": 7.235175132751465, + "logits/rejected": 0.3660110831260681, + "logps/chosen": -352.1417541503906, + "logps/rejected": -302.70416259765625, + "loss": 0.5522, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.1253022700548172, + "rewards/margins": 0.3599855303764343, + "rewards/rejected": -0.2346833050251007, + "step": 4816 + }, + { + "epoch": 0.744944906243959, + "grad_norm": 5.706331253051758, + "learning_rate": 4.176022453889335e-06, + "logits/chosen": 10.604809761047363, + "logits/rejected": 6.052546977996826, + "logps/chosen": -284.2577209472656, + "logps/rejected": -254.03941345214844, + "loss": 0.5085, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.559387743473053, + "rewards/margins": 0.5088394284248352, + "rewards/rejected": 0.050548337399959564, + "step": 4817 + }, + { + "epoch": 0.7450995553837232, + "grad_norm": 6.503905296325684, + "learning_rate": 4.175736052239661e-06, + "logits/chosen": 13.16185188293457, + "logits/rejected": -0.5226092338562012, + "logps/chosen": -340.4718017578125, + "logps/rejected": -172.41815185546875, + "loss": 0.4691, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.34280359745025635, + "rewards/margins": 0.6189937591552734, + "rewards/rejected": -0.2761901915073395, + "step": 4818 + }, + { + "epoch": 0.7452542045234873, + "grad_norm": 4.531585693359375, + "learning_rate": 4.1754496505899874e-06, + "logits/chosen": 10.873838424682617, + "logits/rejected": 3.611543655395508, + "logps/chosen": -407.49169921875, + "logps/rejected": -338.61065673828125, + "loss": 0.5227, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.6937202215194702, + "rewards/margins": 0.5156177878379822, + "rewards/rejected": 0.178102508187294, + "step": 4819 + }, + { + "epoch": 0.7454088536632515, + "grad_norm": 7.156266689300537, + "learning_rate": 4.175163248940314e-06, + "logits/chosen": 6.630589485168457, + "logits/rejected": 8.299389839172363, + "logps/chosen": -245.29534912109375, + "logps/rejected": -301.42254638671875, + "loss": 0.7384, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.2609957456588745, + "rewards/margins": -0.009239532053470612, + "rewards/rejected": 0.2702353000640869, + "step": 4820 + }, + { + "epoch": 0.7455635028030156, + "grad_norm": 7.928884983062744, + "learning_rate": 4.174876847290641e-06, + "logits/chosen": 9.81922721862793, + "logits/rejected": 11.316617965698242, + "logps/chosen": -264.1991882324219, + "logps/rejected": -226.83143615722656, + "loss": 0.7047, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.19673889875411987, + "rewards/margins": 0.1209615170955658, + "rewards/rejected": 0.07577738165855408, + "step": 4821 + }, + { + "epoch": 0.7457181519427798, + "grad_norm": 3.063037395477295, + "learning_rate": 4.174590445640967e-06, + "logits/chosen": 7.547990798950195, + "logits/rejected": 0.8254170417785645, + "logps/chosen": -208.6787567138672, + "logps/rejected": -159.78138732910156, + "loss": 0.5017, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.24274849891662598, + "rewards/margins": 0.5459417700767517, + "rewards/rejected": -0.30319327116012573, + "step": 4822 + }, + { + "epoch": 0.745872801082544, + "grad_norm": 7.068705081939697, + "learning_rate": 4.174304043991293e-06, + "logits/chosen": 8.822977066040039, + "logits/rejected": 7.628363609313965, + "logps/chosen": -352.21978759765625, + "logps/rejected": -293.8230895996094, + "loss": 0.7331, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.11144157499074936, + "rewards/margins": -0.049187783151865005, + "rewards/rejected": 0.16062936186790466, + "step": 4823 + }, + { + "epoch": 0.7460274502223081, + "grad_norm": 4.903248310089111, + "learning_rate": 4.17401764234162e-06, + "logits/chosen": 13.733642578125, + "logits/rejected": 8.761543273925781, + "logps/chosen": -401.84228515625, + "logps/rejected": -241.805908203125, + "loss": 0.5121, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.31087398529052734, + "rewards/margins": 0.5189108848571777, + "rewards/rejected": -0.208036869764328, + "step": 4824 + }, + { + "epoch": 0.7461820993620722, + "grad_norm": 7.397091865539551, + "learning_rate": 4.1737312406919465e-06, + "logits/chosen": 6.712996959686279, + "logits/rejected": 7.006630897521973, + "logps/chosen": -339.0171203613281, + "logps/rejected": -324.1947021484375, + "loss": 0.9088, + "rewards/accuracies": 0.125, + "rewards/chosen": 0.20211324095726013, + "rewards/margins": -0.37746018171310425, + "rewards/rejected": 0.5795734524726868, + "step": 4825 + }, + { + "epoch": 0.7463367485018365, + "grad_norm": 6.357495307922363, + "learning_rate": 4.173444839042273e-06, + "logits/chosen": 11.699857711791992, + "logits/rejected": 6.020086288452148, + "logps/chosen": -350.0740966796875, + "logps/rejected": -228.97189331054688, + "loss": 0.7088, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.17709054052829742, + "rewards/margins": 0.11982870101928711, + "rewards/rejected": 0.05726185441017151, + "step": 4826 + }, + { + "epoch": 0.7464913976416007, + "grad_norm": 5.133193016052246, + "learning_rate": 4.1731584373926e-06, + "logits/chosen": 5.878551959991455, + "logits/rejected": 5.414133071899414, + "logps/chosen": -207.69473266601562, + "logps/rejected": -205.99545288085938, + "loss": 0.5645, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.13004469871520996, + "rewards/margins": 0.6078673005104065, + "rewards/rejected": -0.47782260179519653, + "step": 4827 + }, + { + "epoch": 0.7466460467813648, + "grad_norm": 4.098268508911133, + "learning_rate": 4.1728720357429264e-06, + "logits/chosen": 15.177474975585938, + "logits/rejected": 9.74886417388916, + "logps/chosen": -346.4871520996094, + "logps/rejected": -283.2125244140625, + "loss": 0.4848, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.8915232419967651, + "rewards/margins": 0.6109054684638977, + "rewards/rejected": 0.28061771392822266, + "step": 4828 + }, + { + "epoch": 0.746800695921129, + "grad_norm": 3.595144510269165, + "learning_rate": 4.172585634093252e-06, + "logits/chosen": 15.834580421447754, + "logits/rejected": 10.500226020812988, + "logps/chosen": -216.09938049316406, + "logps/rejected": -152.27227783203125, + "loss": 0.535, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.27926358580589294, + "rewards/margins": 0.600127637386322, + "rewards/rejected": -0.3208640515804291, + "step": 4829 + }, + { + "epoch": 0.7469553450608931, + "grad_norm": 6.200868129730225, + "learning_rate": 4.172299232443579e-06, + "logits/chosen": 11.274935722351074, + "logits/rejected": 13.317134857177734, + "logps/chosen": -343.1924133300781, + "logps/rejected": -367.15350341796875, + "loss": 0.6022, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.5902072191238403, + "rewards/margins": 0.24841104447841644, + "rewards/rejected": 0.3417961597442627, + "step": 4830 + }, + { + "epoch": 0.7471099942006573, + "grad_norm": 4.361725807189941, + "learning_rate": 4.1720128307939055e-06, + "logits/chosen": 9.544358253479004, + "logits/rejected": 3.976625442504883, + "logps/chosen": -247.9324493408203, + "logps/rejected": -150.41969299316406, + "loss": 0.6071, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.020336657762527466, + "rewards/margins": 0.2519673705101013, + "rewards/rejected": -0.2723039984703064, + "step": 4831 + }, + { + "epoch": 0.7472646433404214, + "grad_norm": 6.279647350311279, + "learning_rate": 4.171726429144232e-06, + "logits/chosen": 9.54776382446289, + "logits/rejected": 8.914756774902344, + "logps/chosen": -292.3633117675781, + "logps/rejected": -472.9407958984375, + "loss": 0.5509, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.3753019869327545, + "rewards/margins": 0.43533727526664734, + "rewards/rejected": -0.060035280883312225, + "step": 4832 + }, + { + "epoch": 0.7474192924801856, + "grad_norm": 4.035058975219727, + "learning_rate": 4.171440027494559e-06, + "logits/chosen": 8.587701797485352, + "logits/rejected": 6.294947624206543, + "logps/chosen": -273.48870849609375, + "logps/rejected": -292.9427795410156, + "loss": 0.4349, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.12033711373806, + "rewards/margins": 0.6289465427398682, + "rewards/rejected": -0.508609414100647, + "step": 4833 + }, + { + "epoch": 0.7475739416199497, + "grad_norm": 5.26654577255249, + "learning_rate": 4.1711536258448855e-06, + "logits/chosen": 9.489533424377441, + "logits/rejected": 4.669025897979736, + "logps/chosen": -272.92901611328125, + "logps/rejected": -265.3211669921875, + "loss": 0.6321, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.6449675559997559, + "rewards/margins": 0.20125295221805573, + "rewards/rejected": 0.44371461868286133, + "step": 4834 + }, + { + "epoch": 0.7477285907597139, + "grad_norm": 3.6931159496307373, + "learning_rate": 4.170867224195212e-06, + "logits/chosen": 10.135820388793945, + "logits/rejected": 6.047074317932129, + "logps/chosen": -194.03077697753906, + "logps/rejected": -157.4449462890625, + "loss": 0.6093, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.5226764678955078, + "rewards/margins": 0.2899424731731415, + "rewards/rejected": 0.23273403942584991, + "step": 4835 + }, + { + "epoch": 0.747883239899478, + "grad_norm": 5.036673545837402, + "learning_rate": 4.170580822545538e-06, + "logits/chosen": 9.969597816467285, + "logits/rejected": 7.037627696990967, + "logps/chosen": -236.26939392089844, + "logps/rejected": -289.58880615234375, + "loss": 0.5056, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.2306259274482727, + "rewards/margins": 0.6020475029945374, + "rewards/rejected": -0.37142154574394226, + "step": 4836 + }, + { + "epoch": 0.7480378890392422, + "grad_norm": 6.106447219848633, + "learning_rate": 4.170294420895865e-06, + "logits/chosen": 16.301727294921875, + "logits/rejected": 11.901161193847656, + "logps/chosen": -326.8699645996094, + "logps/rejected": -294.5316162109375, + "loss": 0.7339, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.6186148524284363, + "rewards/margins": -0.006930194795131683, + "rewards/rejected": 0.6255450248718262, + "step": 4837 + }, + { + "epoch": 0.7481925381790063, + "grad_norm": 5.405377388000488, + "learning_rate": 4.170008019246191e-06, + "logits/chosen": 12.78651237487793, + "logits/rejected": 10.472010612487793, + "logps/chosen": -296.6016845703125, + "logps/rejected": -242.8929443359375, + "loss": 0.7105, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.42188936471939087, + "rewards/margins": 0.01715688221156597, + "rewards/rejected": 0.40473246574401855, + "step": 4838 + }, + { + "epoch": 0.7483471873187706, + "grad_norm": 6.251347541809082, + "learning_rate": 4.169721617596518e-06, + "logits/chosen": 12.433619499206543, + "logits/rejected": 6.631613731384277, + "logps/chosen": -364.5036315917969, + "logps/rejected": -280.1646423339844, + "loss": 0.743, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.13661594688892365, + "rewards/margins": 0.024664364755153656, + "rewards/rejected": 0.11195158958435059, + "step": 4839 + }, + { + "epoch": 0.7485018364585347, + "grad_norm": 4.574646949768066, + "learning_rate": 4.1694352159468446e-06, + "logits/chosen": 11.220207214355469, + "logits/rejected": 3.9194297790527344, + "logps/chosen": -220.86715698242188, + "logps/rejected": -218.00772094726562, + "loss": 0.6247, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.20093420147895813, + "rewards/margins": 0.4526696801185608, + "rewards/rejected": -0.25173550844192505, + "step": 4840 + }, + { + "epoch": 0.7486564855982989, + "grad_norm": 4.667858123779297, + "learning_rate": 4.169148814297171e-06, + "logits/chosen": 11.73930549621582, + "logits/rejected": 13.379814147949219, + "logps/chosen": -159.39662170410156, + "logps/rejected": -231.64088439941406, + "loss": 0.6245, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.36498862504959106, + "rewards/margins": 0.2093839943408966, + "rewards/rejected": 0.15560461580753326, + "step": 4841 + }, + { + "epoch": 0.748811134738063, + "grad_norm": 5.942371368408203, + "learning_rate": 4.168862412647497e-06, + "logits/chosen": 12.18507194519043, + "logits/rejected": 9.20965576171875, + "logps/chosen": -223.5433349609375, + "logps/rejected": -231.08551025390625, + "loss": 0.7216, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.23112405836582184, + "rewards/margins": 0.02223964035511017, + "rewards/rejected": -0.253363698720932, + "step": 4842 + }, + { + "epoch": 0.7489657838778272, + "grad_norm": 5.485408306121826, + "learning_rate": 4.168576010997824e-06, + "logits/chosen": 10.809326171875, + "logits/rejected": 9.459688186645508, + "logps/chosen": -314.85882568359375, + "logps/rejected": -269.78179931640625, + "loss": 0.5429, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.2031661570072174, + "rewards/margins": 0.38123080134391785, + "rewards/rejected": -0.17806468904018402, + "step": 4843 + }, + { + "epoch": 0.7491204330175913, + "grad_norm": 5.794851779937744, + "learning_rate": 4.16828960934815e-06, + "logits/chosen": 9.650164604187012, + "logits/rejected": 8.985267639160156, + "logps/chosen": -226.5543975830078, + "logps/rejected": -186.87440490722656, + "loss": 0.7557, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.12609292566776276, + "rewards/margins": -0.03848074749112129, + "rewards/rejected": 0.16457365453243256, + "step": 4844 + }, + { + "epoch": 0.7492750821573555, + "grad_norm": 3.718604564666748, + "learning_rate": 4.168003207698477e-06, + "logits/chosen": 5.795703411102295, + "logits/rejected": 4.269153118133545, + "logps/chosen": -262.18975830078125, + "logps/rejected": -191.9886016845703, + "loss": 0.5186, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.16473212838172913, + "rewards/margins": 0.9451456665992737, + "rewards/rejected": -0.7804135084152222, + "step": 4845 + }, + { + "epoch": 0.7494297312971197, + "grad_norm": 16.071382522583008, + "learning_rate": 4.167716806048804e-06, + "logits/chosen": 10.279264450073242, + "logits/rejected": 7.207713603973389, + "logps/chosen": -295.3358154296875, + "logps/rejected": -282.45159912109375, + "loss": 0.7848, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.02912130206823349, + "rewards/margins": 0.013496320694684982, + "rewards/rejected": -0.04261759668588638, + "step": 4846 + }, + { + "epoch": 0.7495843804368838, + "grad_norm": 5.455828666687012, + "learning_rate": 4.167430404399129e-06, + "logits/chosen": 10.817137718200684, + "logits/rejected": 6.942222595214844, + "logps/chosen": -298.0824890136719, + "logps/rejected": -174.29147338867188, + "loss": 0.4863, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.3604160249233246, + "rewards/margins": 0.6177363395690918, + "rewards/rejected": -0.2573203146457672, + "step": 4847 + }, + { + "epoch": 0.749739029576648, + "grad_norm": 4.1171488761901855, + "learning_rate": 4.167144002749456e-06, + "logits/chosen": 11.83259391784668, + "logits/rejected": 10.357523918151855, + "logps/chosen": -162.07606506347656, + "logps/rejected": -166.74497985839844, + "loss": 0.519, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.39864879846572876, + "rewards/margins": 0.4506915807723999, + "rewards/rejected": -0.052042774856090546, + "step": 4848 + }, + { + "epoch": 0.7498936787164121, + "grad_norm": 5.217754364013672, + "learning_rate": 4.166857601099783e-06, + "logits/chosen": 10.635445594787598, + "logits/rejected": 9.232568740844727, + "logps/chosen": -395.853759765625, + "logps/rejected": -313.3909912109375, + "loss": 0.4958, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.6965374946594238, + "rewards/margins": 0.5588996410369873, + "rewards/rejected": 0.1376378983259201, + "step": 4849 + }, + { + "epoch": 0.7500483278561763, + "grad_norm": 3.989359140396118, + "learning_rate": 4.166571199450109e-06, + "logits/chosen": 8.043920516967773, + "logits/rejected": 7.287498474121094, + "logps/chosen": -249.3497314453125, + "logps/rejected": -227.15982055664062, + "loss": 0.4563, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.04019976034760475, + "rewards/margins": 0.7730265855789185, + "rewards/rejected": -0.732826828956604, + "step": 4850 + }, + { + "epoch": 0.7502029769959404, + "grad_norm": 6.043792724609375, + "learning_rate": 4.166284797800436e-06, + "logits/chosen": 11.858987808227539, + "logits/rejected": 8.757853507995605, + "logps/chosen": -270.83612060546875, + "logps/rejected": -207.2410125732422, + "loss": 0.7772, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.0764111578464508, + "rewards/margins": 0.02477322518825531, + "rewards/rejected": 0.05163794755935669, + "step": 4851 + }, + { + "epoch": 0.7503576261357047, + "grad_norm": 5.700096130371094, + "learning_rate": 4.165998396150762e-06, + "logits/chosen": 7.036066055297852, + "logits/rejected": 4.6242828369140625, + "logps/chosen": -213.81365966796875, + "logps/rejected": -245.6555633544922, + "loss": 0.7766, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.23110955953598022, + "rewards/margins": 0.07529954612255096, + "rewards/rejected": -0.30640915036201477, + "step": 4852 + }, + { + "epoch": 0.7505122752754688, + "grad_norm": 4.637636661529541, + "learning_rate": 4.1657119945010885e-06, + "logits/chosen": 13.166892051696777, + "logits/rejected": 10.624645233154297, + "logps/chosen": -322.92352294921875, + "logps/rejected": -270.46881103515625, + "loss": 0.4958, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.34987935423851013, + "rewards/margins": 0.6453158259391785, + "rewards/rejected": -0.2954365015029907, + "step": 4853 + }, + { + "epoch": 0.750666924415233, + "grad_norm": 9.836350440979004, + "learning_rate": 4.165425592851415e-06, + "logits/chosen": 9.596490859985352, + "logits/rejected": 10.918330192565918, + "logps/chosen": -286.5840759277344, + "logps/rejected": -240.1240692138672, + "loss": 0.9357, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.301012247800827, + "rewards/margins": -0.24442550539970398, + "rewards/rejected": 0.5454376935958862, + "step": 4854 + }, + { + "epoch": 0.7508215735549971, + "grad_norm": 7.767127513885498, + "learning_rate": 4.165139191201742e-06, + "logits/chosen": 11.761005401611328, + "logits/rejected": 6.494842529296875, + "logps/chosen": -437.18896484375, + "logps/rejected": -340.0881652832031, + "loss": 0.8958, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.29401540756225586, + "rewards/margins": -0.11122598499059677, + "rewards/rejected": 0.4052414298057556, + "step": 4855 + }, + { + "epoch": 0.7509762226947613, + "grad_norm": 11.616569519042969, + "learning_rate": 4.1648527895520676e-06, + "logits/chosen": 10.883437156677246, + "logits/rejected": 7.339916229248047, + "logps/chosen": -434.20819091796875, + "logps/rejected": -354.2981262207031, + "loss": 0.9145, + "rewards/accuracies": 0.25, + "rewards/chosen": -0.17182910442352295, + "rewards/margins": -0.322274774312973, + "rewards/rejected": 0.15044564008712769, + "step": 4856 + }, + { + "epoch": 0.7511308718345254, + "grad_norm": 4.164758682250977, + "learning_rate": 4.164566387902394e-06, + "logits/chosen": 13.085493087768555, + "logits/rejected": 8.32006549835205, + "logps/chosen": -255.8421630859375, + "logps/rejected": -229.529296875, + "loss": 0.5219, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.3584100306034088, + "rewards/margins": 0.478179931640625, + "rewards/rejected": -0.11976990103721619, + "step": 4857 + }, + { + "epoch": 0.7512855209742896, + "grad_norm": 5.33148717880249, + "learning_rate": 4.164279986252721e-06, + "logits/chosen": 11.501684188842773, + "logits/rejected": 11.381439208984375, + "logps/chosen": -183.98374938964844, + "logps/rejected": -188.72727966308594, + "loss": 0.8294, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.09933124482631683, + "rewards/margins": -0.2000461220741272, + "rewards/rejected": 0.10071487724781036, + "step": 4858 + }, + { + "epoch": 0.7514401701140537, + "grad_norm": 5.681978225708008, + "learning_rate": 4.1639935846030475e-06, + "logits/chosen": 8.126919746398926, + "logits/rejected": 8.684080123901367, + "logps/chosen": -308.3127136230469, + "logps/rejected": -320.227294921875, + "loss": 0.6212, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.34130972623825073, + "rewards/margins": 0.402667373418808, + "rewards/rejected": -0.06135760247707367, + "step": 4859 + }, + { + "epoch": 0.7515948192538179, + "grad_norm": 7.054001808166504, + "learning_rate": 4.163707182953374e-06, + "logits/chosen": 3.9391956329345703, + "logits/rejected": 6.212490558624268, + "logps/chosen": -265.00872802734375, + "logps/rejected": -275.0775146484375, + "loss": 0.8545, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.037232063710689545, + "rewards/margins": -0.17208650708198547, + "rewards/rejected": 0.20931857824325562, + "step": 4860 + }, + { + "epoch": 0.751749468393582, + "grad_norm": 5.3153862953186035, + "learning_rate": 4.163420781303701e-06, + "logits/chosen": 4.634160041809082, + "logits/rejected": 5.175274848937988, + "logps/chosen": -303.40240478515625, + "logps/rejected": -213.849365234375, + "loss": 0.6406, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.4911089539527893, + "rewards/margins": 0.17803442478179932, + "rewards/rejected": 0.31307452917099, + "step": 4861 + }, + { + "epoch": 0.7519041175333462, + "grad_norm": 5.403745651245117, + "learning_rate": 4.163134379654027e-06, + "logits/chosen": 11.666973114013672, + "logits/rejected": 9.547372817993164, + "logps/chosen": -263.4168395996094, + "logps/rejected": -278.2130432128906, + "loss": 0.5922, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.6372042894363403, + "rewards/margins": 0.48984840512275696, + "rewards/rejected": 0.14735586941242218, + "step": 4862 + }, + { + "epoch": 0.7520587666731103, + "grad_norm": 3.2964301109313965, + "learning_rate": 4.162847978004353e-06, + "logits/chosen": 10.638582229614258, + "logits/rejected": -2.3800859451293945, + "logps/chosen": -269.4823913574219, + "logps/rejected": -146.95364379882812, + "loss": 0.4316, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.4664738178253174, + "rewards/margins": 0.8757360577583313, + "rewards/rejected": -0.4092622697353363, + "step": 4863 + }, + { + "epoch": 0.7522134158128745, + "grad_norm": 94.45955657958984, + "learning_rate": 4.16256157635468e-06, + "logits/chosen": 8.801929473876953, + "logits/rejected": 8.142520904541016, + "logps/chosen": -333.4564208984375, + "logps/rejected": -262.99566650390625, + "loss": 0.8018, + "rewards/accuracies": 0.25, + "rewards/chosen": 0.27472934126853943, + "rewards/margins": -0.17398786544799805, + "rewards/rejected": 0.4487171769142151, + "step": 4864 + }, + { + "epoch": 0.7523680649526387, + "grad_norm": 4.626436233520508, + "learning_rate": 4.162275174705007e-06, + "logits/chosen": 13.201923370361328, + "logits/rejected": 9.123678207397461, + "logps/chosen": -180.09149169921875, + "logps/rejected": -172.35079956054688, + "loss": 0.6479, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.08253325521945953, + "rewards/margins": 0.2728095054626465, + "rewards/rejected": -0.19027625024318695, + "step": 4865 + }, + { + "epoch": 0.7525227140924029, + "grad_norm": 4.145317077636719, + "learning_rate": 4.161988773055333e-06, + "logits/chosen": 10.445066452026367, + "logits/rejected": 12.700387001037598, + "logps/chosen": -182.2593994140625, + "logps/rejected": -271.501220703125, + "loss": 0.5114, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.3474692106246948, + "rewards/margins": 0.45338213443756104, + "rewards/rejected": -0.10591289401054382, + "step": 4866 + }, + { + "epoch": 0.752677363232167, + "grad_norm": 5.330689430236816, + "learning_rate": 4.16170237140566e-06, + "logits/chosen": 11.336567878723145, + "logits/rejected": 7.443289279937744, + "logps/chosen": -295.2197265625, + "logps/rejected": -226.88449096679688, + "loss": 0.6524, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.4521089792251587, + "rewards/margins": 0.15858136117458344, + "rewards/rejected": 0.29352760314941406, + "step": 4867 + }, + { + "epoch": 0.7528320123719312, + "grad_norm": 4.9363112449646, + "learning_rate": 4.1614159697559865e-06, + "logits/chosen": 5.8075032234191895, + "logits/rejected": 2.988018751144409, + "logps/chosen": -180.41346740722656, + "logps/rejected": -247.77944946289062, + "loss": 0.5332, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.7195602059364319, + "rewards/margins": 1.1486563682556152, + "rewards/rejected": -0.4290962219238281, + "step": 4868 + }, + { + "epoch": 0.7529866615116954, + "grad_norm": 4.186629772186279, + "learning_rate": 4.161129568106312e-06, + "logits/chosen": 9.417548179626465, + "logits/rejected": 7.2234697341918945, + "logps/chosen": -448.9929504394531, + "logps/rejected": -399.49212646484375, + "loss": 0.5829, + "rewards/accuracies": 0.375, + "rewards/chosen": 3.9507672786712646, + "rewards/margins": 9.619233131408691, + "rewards/rejected": -5.668464660644531, + "step": 4869 + }, + { + "epoch": 0.7531413106514595, + "grad_norm": 6.474702835083008, + "learning_rate": 4.160843166456639e-06, + "logits/chosen": 8.779424667358398, + "logits/rejected": 11.26167106628418, + "logps/chosen": -249.2095947265625, + "logps/rejected": -236.08193969726562, + "loss": 0.8036, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.14109763503074646, + "rewards/margins": -0.14996615052223206, + "rewards/rejected": 0.2910637855529785, + "step": 4870 + }, + { + "epoch": 0.7532959597912237, + "grad_norm": 5.033507823944092, + "learning_rate": 4.160556764806966e-06, + "logits/chosen": 11.104290008544922, + "logits/rejected": 12.713186264038086, + "logps/chosen": -235.8220672607422, + "logps/rejected": -264.89544677734375, + "loss": 0.6415, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.5147908926010132, + "rewards/margins": 0.1764892041683197, + "rewards/rejected": 0.33830171823501587, + "step": 4871 + }, + { + "epoch": 0.7534506089309878, + "grad_norm": 4.59453010559082, + "learning_rate": 4.160270363157292e-06, + "logits/chosen": 8.122228622436523, + "logits/rejected": 11.204286575317383, + "logps/chosen": -242.2407684326172, + "logps/rejected": -155.99050903320312, + "loss": 0.685, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.1730659306049347, + "rewards/margins": 0.0686965212225914, + "rewards/rejected": 0.10436940938234329, + "step": 4872 + }, + { + "epoch": 0.753605258070752, + "grad_norm": 4.717090129852295, + "learning_rate": 4.159983961507619e-06, + "logits/chosen": 7.51284122467041, + "logits/rejected": 6.261141300201416, + "logps/chosen": -250.036376953125, + "logps/rejected": -254.99609375, + "loss": 0.5665, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.42540812492370605, + "rewards/margins": 0.387265145778656, + "rewards/rejected": 0.03814297914505005, + "step": 4873 + }, + { + "epoch": 0.7537599072105161, + "grad_norm": 4.785600662231445, + "learning_rate": 4.159697559857946e-06, + "logits/chosen": 13.60636043548584, + "logits/rejected": 7.181684494018555, + "logps/chosen": -343.3254699707031, + "logps/rejected": -249.38449096679688, + "loss": 0.53, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.5707874298095703, + "rewards/margins": 0.7022783756256104, + "rewards/rejected": -0.13149090111255646, + "step": 4874 + }, + { + "epoch": 0.7539145563502803, + "grad_norm": 4.169939994812012, + "learning_rate": 4.159411158208271e-06, + "logits/chosen": 11.354193687438965, + "logits/rejected": 4.276512622833252, + "logps/chosen": -234.9937744140625, + "logps/rejected": -173.69639587402344, + "loss": 0.5464, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.23608791828155518, + "rewards/margins": 0.7283867597579956, + "rewards/rejected": -0.49229881167411804, + "step": 4875 + }, + { + "epoch": 0.7540692054900444, + "grad_norm": 7.825748920440674, + "learning_rate": 4.159124756558598e-06, + "logits/chosen": 10.39487361907959, + "logits/rejected": 2.73970890045166, + "logps/chosen": -262.90191650390625, + "logps/rejected": -191.63388061523438, + "loss": 1.0615, + "rewards/accuracies": 0.25, + "rewards/chosen": 0.14341247081756592, + "rewards/margins": -0.47041571140289307, + "rewards/rejected": 0.613828182220459, + "step": 4876 + }, + { + "epoch": 0.7542238546298087, + "grad_norm": 6.113379001617432, + "learning_rate": 4.158838354908925e-06, + "logits/chosen": 10.946430206298828, + "logits/rejected": 8.744179725646973, + "logps/chosen": -334.80487060546875, + "logps/rejected": -305.72503662109375, + "loss": 0.7291, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.5106526017189026, + "rewards/margins": -0.039550863206386566, + "rewards/rejected": 0.5502034425735474, + "step": 4877 + }, + { + "epoch": 0.7543785037695728, + "grad_norm": 7.409026145935059, + "learning_rate": 4.158551953259251e-06, + "logits/chosen": 6.310328006744385, + "logits/rejected": 6.384535789489746, + "logps/chosen": -255.12356567382812, + "logps/rejected": -273.47314453125, + "loss": 0.6123, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.11168798804283142, + "rewards/margins": 0.41121432185173035, + "rewards/rejected": -0.2995263636112213, + "step": 4878 + }, + { + "epoch": 0.754533152909337, + "grad_norm": 4.742378234863281, + "learning_rate": 4.158265551609578e-06, + "logits/chosen": 8.79690933227539, + "logits/rejected": 8.849470138549805, + "logps/chosen": -203.0412139892578, + "logps/rejected": -232.00360107421875, + "loss": 0.6605, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.32467830181121826, + "rewards/margins": 0.10746662318706512, + "rewards/rejected": 0.21721170842647552, + "step": 4879 + }, + { + "epoch": 0.7546878020491011, + "grad_norm": 5.249179363250732, + "learning_rate": 4.157979149959905e-06, + "logits/chosen": 14.662715911865234, + "logits/rejected": 4.638828754425049, + "logps/chosen": -282.33544921875, + "logps/rejected": -217.3389892578125, + "loss": 0.4491, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.1111377701163292, + "rewards/margins": 0.9911292195320129, + "rewards/rejected": -0.8799915313720703, + "step": 4880 + }, + { + "epoch": 0.7548424511888653, + "grad_norm": 6.396276950836182, + "learning_rate": 4.1576927483102304e-06, + "logits/chosen": 14.171765327453613, + "logits/rejected": 8.38521957397461, + "logps/chosen": -312.46807861328125, + "logps/rejected": -293.8635559082031, + "loss": 0.5882, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.8231029510498047, + "rewards/margins": 0.3185581564903259, + "rewards/rejected": 0.5045448541641235, + "step": 4881 + }, + { + "epoch": 0.7549971003286294, + "grad_norm": 9.577472686767578, + "learning_rate": 4.157406346660557e-06, + "logits/chosen": 13.956840515136719, + "logits/rejected": 4.463374137878418, + "logps/chosen": -371.04583740234375, + "logps/rejected": -228.9930419921875, + "loss": 0.68, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.14048179984092712, + "rewards/margins": 0.14574843645095825, + "rewards/rejected": -0.005266614258289337, + "step": 4882 + }, + { + "epoch": 0.7551517494683936, + "grad_norm": 4.399951934814453, + "learning_rate": 4.157119945010884e-06, + "logits/chosen": 12.402183532714844, + "logits/rejected": 8.673035621643066, + "logps/chosen": -256.32330322265625, + "logps/rejected": -179.59190368652344, + "loss": 0.6318, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.44452208280563354, + "rewards/margins": 0.21307125687599182, + "rewards/rejected": 0.2314508557319641, + "step": 4883 + }, + { + "epoch": 0.7553063986081577, + "grad_norm": 7.600841522216797, + "learning_rate": 4.15683354336121e-06, + "logits/chosen": 5.742386341094971, + "logits/rejected": 0.9494252800941467, + "logps/chosen": -208.89401245117188, + "logps/rejected": -207.08868408203125, + "loss": 0.7911, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.04910773038864136, + "rewards/margins": -0.043230995535850525, + "rewards/rejected": 0.09233871102333069, + "step": 4884 + }, + { + "epoch": 0.7554610477479219, + "grad_norm": 5.3923726081848145, + "learning_rate": 4.156547141711536e-06, + "logits/chosen": 6.722903251647949, + "logits/rejected": 4.033544540405273, + "logps/chosen": -243.59324645996094, + "logps/rejected": -210.42825317382812, + "loss": 0.792, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.19359853863716125, + "rewards/margins": 0.09448637068271637, + "rewards/rejected": 0.09911217540502548, + "step": 4885 + }, + { + "epoch": 0.755615696887686, + "grad_norm": 10.00224781036377, + "learning_rate": 4.156260740061863e-06, + "logits/chosen": 6.39294958114624, + "logits/rejected": 11.800689697265625, + "logps/chosen": -212.2391815185547, + "logps/rejected": -241.23724365234375, + "loss": 0.806, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.04922452196478844, + "rewards/margins": -0.1879214346408844, + "rewards/rejected": 0.13869690895080566, + "step": 4886 + }, + { + "epoch": 0.7557703460274502, + "grad_norm": 3.972947835922241, + "learning_rate": 4.1559743384121895e-06, + "logits/chosen": 6.580738067626953, + "logits/rejected": -0.7640472054481506, + "logps/chosen": -221.8458251953125, + "logps/rejected": -157.6392364501953, + "loss": 0.478, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.2947428822517395, + "rewards/margins": 0.6070636510848999, + "rewards/rejected": -0.3123207092285156, + "step": 4887 + }, + { + "epoch": 0.7559249951672143, + "grad_norm": 5.774746417999268, + "learning_rate": 4.155687936762516e-06, + "logits/chosen": 2.8305511474609375, + "logits/rejected": 2.8302910327911377, + "logps/chosen": -232.02593994140625, + "logps/rejected": -163.4444580078125, + "loss": 0.8295, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.06917095929384232, + "rewards/margins": -0.12995997071266174, + "rewards/rejected": 0.06078901141881943, + "step": 4888 + }, + { + "epoch": 0.7560796443069785, + "grad_norm": 6.0303120613098145, + "learning_rate": 4.155401535112843e-06, + "logits/chosen": 6.560315132141113, + "logits/rejected": 2.1548962593078613, + "logps/chosen": -331.7688293457031, + "logps/rejected": -228.27127075195312, + "loss": 0.6343, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.38874709606170654, + "rewards/margins": 0.2311120480298996, + "rewards/rejected": 0.15763501822948456, + "step": 4889 + }, + { + "epoch": 0.7562342934467428, + "grad_norm": 4.204606056213379, + "learning_rate": 4.155115133463169e-06, + "logits/chosen": 12.035921096801758, + "logits/rejected": 5.750936508178711, + "logps/chosen": -287.957763671875, + "logps/rejected": -219.12298583984375, + "loss": 0.6494, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.2778784930706024, + "rewards/margins": 0.3323845863342285, + "rewards/rejected": -0.05450611561536789, + "step": 4890 + }, + { + "epoch": 0.7563889425865069, + "grad_norm": 3.987344980239868, + "learning_rate": 4.154828731813495e-06, + "logits/chosen": 9.979681015014648, + "logits/rejected": 5.062463283538818, + "logps/chosen": -354.886474609375, + "logps/rejected": -304.448486328125, + "loss": 0.4847, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.45751842856407166, + "rewards/margins": 0.5771106481552124, + "rewards/rejected": -0.11959227919578552, + "step": 4891 + }, + { + "epoch": 0.7565435917262711, + "grad_norm": 7.338598728179932, + "learning_rate": 4.154542330163822e-06, + "logits/chosen": 18.9298152923584, + "logits/rejected": 14.901196479797363, + "logps/chosen": -301.9948425292969, + "logps/rejected": -228.91709899902344, + "loss": 0.6626, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.6963154077529907, + "rewards/margins": 0.15749777853488922, + "rewards/rejected": 0.5388177037239075, + "step": 4892 + }, + { + "epoch": 0.7566982408660352, + "grad_norm": 4.613230228424072, + "learning_rate": 4.1542559285141486e-06, + "logits/chosen": 9.807443618774414, + "logits/rejected": 9.253124237060547, + "logps/chosen": -233.16943359375, + "logps/rejected": -243.26144409179688, + "loss": 0.6306, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.042470961809158325, + "rewards/margins": 0.2960153818130493, + "rewards/rejected": -0.253544420003891, + "step": 4893 + }, + { + "epoch": 0.7568528900057994, + "grad_norm": 5.299074649810791, + "learning_rate": 4.153969526864475e-06, + "logits/chosen": 5.765031814575195, + "logits/rejected": 4.580773830413818, + "logps/chosen": -357.42559814453125, + "logps/rejected": -328.19256591796875, + "loss": 0.507, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.42679429054260254, + "rewards/margins": 0.5977314710617065, + "rewards/rejected": -0.17093724012374878, + "step": 4894 + }, + { + "epoch": 0.7570075391455635, + "grad_norm": 4.942300319671631, + "learning_rate": 4.153683125214801e-06, + "logits/chosen": 13.360185623168945, + "logits/rejected": 13.061016082763672, + "logps/chosen": -289.798583984375, + "logps/rejected": -281.2418212890625, + "loss": 0.7466, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.4073914885520935, + "rewards/margins": -0.07652539759874344, + "rewards/rejected": 0.48391687870025635, + "step": 4895 + }, + { + "epoch": 0.7571621882853277, + "grad_norm": 6.595679759979248, + "learning_rate": 4.153396723565128e-06, + "logits/chosen": 11.112854957580566, + "logits/rejected": 13.922195434570312, + "logps/chosen": -244.72744750976562, + "logps/rejected": -300.64288330078125, + "loss": 0.759, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.14549341797828674, + "rewards/margins": -0.07439480721950531, + "rewards/rejected": 0.21988821029663086, + "step": 4896 + }, + { + "epoch": 0.7573168374250918, + "grad_norm": 7.344395637512207, + "learning_rate": 4.153110321915454e-06, + "logits/chosen": 13.290486335754395, + "logits/rejected": 12.03635311126709, + "logps/chosen": -289.7325134277344, + "logps/rejected": -271.93499755859375, + "loss": 0.7853, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.49479347467422485, + "rewards/margins": -0.0040227919816970825, + "rewards/rejected": 0.49881622195243835, + "step": 4897 + }, + { + "epoch": 0.757471486564856, + "grad_norm": 8.114124298095703, + "learning_rate": 4.152823920265781e-06, + "logits/chosen": 4.685720920562744, + "logits/rejected": 9.170136451721191, + "logps/chosen": -252.373779296875, + "logps/rejected": -266.15460205078125, + "loss": 0.7666, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.3568749725818634, + "rewards/margins": 0.19227677583694458, + "rewards/rejected": 0.16459818184375763, + "step": 4898 + }, + { + "epoch": 0.7576261357046201, + "grad_norm": 5.119345188140869, + "learning_rate": 4.152537518616108e-06, + "logits/chosen": 9.796183586120605, + "logits/rejected": 9.1947603225708, + "logps/chosen": -284.006591796875, + "logps/rejected": -208.79620361328125, + "loss": 0.7262, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.3368726968765259, + "rewards/margins": -0.016489751636981964, + "rewards/rejected": 0.35336247086524963, + "step": 4899 + }, + { + "epoch": 0.7577807848443843, + "grad_norm": 7.347695827484131, + "learning_rate": 4.152251116966434e-06, + "logits/chosen": 13.330623626708984, + "logits/rejected": 5.243101119995117, + "logps/chosen": -312.1974182128906, + "logps/rejected": -243.76007080078125, + "loss": 0.7052, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.12130396068096161, + "rewards/margins": 0.14944599568843842, + "rewards/rejected": -0.028142064809799194, + "step": 4900 + }, + { + "epoch": 0.7579354339841484, + "grad_norm": 6.108462333679199, + "learning_rate": 4.151964715316761e-06, + "logits/chosen": 14.061036109924316, + "logits/rejected": 5.555707931518555, + "logps/chosen": -408.7244873046875, + "logps/rejected": -292.62860107421875, + "loss": 0.5616, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.662522554397583, + "rewards/margins": 0.5321530103683472, + "rewards/rejected": 0.13036946952342987, + "step": 4901 + }, + { + "epoch": 0.7580900831239126, + "grad_norm": 5.217168807983398, + "learning_rate": 4.151678313667087e-06, + "logits/chosen": 14.124031066894531, + "logits/rejected": 6.348136901855469, + "logps/chosen": -321.36614990234375, + "logps/rejected": -320.43267822265625, + "loss": 0.5652, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.5383907556533813, + "rewards/margins": 0.3861134946346283, + "rewards/rejected": 0.15227729082107544, + "step": 4902 + }, + { + "epoch": 0.7582447322636768, + "grad_norm": 4.985553741455078, + "learning_rate": 4.151391912017413e-06, + "logits/chosen": 9.301740646362305, + "logits/rejected": 8.294329643249512, + "logps/chosen": -249.47061157226562, + "logps/rejected": -267.64862060546875, + "loss": 0.6142, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.6381760835647583, + "rewards/margins": 0.23665611445903778, + "rewards/rejected": 0.4015199542045593, + "step": 4903 + }, + { + "epoch": 0.758399381403441, + "grad_norm": 5.5900726318359375, + "learning_rate": 4.15110551036774e-06, + "logits/chosen": 11.056108474731445, + "logits/rejected": 9.100286483764648, + "logps/chosen": -239.73574829101562, + "logps/rejected": -198.02235412597656, + "loss": 0.8239, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.04010138288140297, + "rewards/margins": -0.15031249821186066, + "rewards/rejected": 0.19041383266448975, + "step": 4904 + }, + { + "epoch": 0.7585540305432051, + "grad_norm": 6.506530284881592, + "learning_rate": 4.150819108718067e-06, + "logits/chosen": 10.831025123596191, + "logits/rejected": 8.439030647277832, + "logps/chosen": -418.8076477050781, + "logps/rejected": -347.3546447753906, + "loss": 0.5912, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.6486164331436157, + "rewards/margins": 0.33601999282836914, + "rewards/rejected": 0.3125964105129242, + "step": 4905 + }, + { + "epoch": 0.7587086796829693, + "grad_norm": 6.190465927124023, + "learning_rate": 4.150532707068393e-06, + "logits/chosen": 8.84827995300293, + "logits/rejected": 10.607156753540039, + "logps/chosen": -227.6044921875, + "logps/rejected": -306.0880126953125, + "loss": 0.6276, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.1654651165008545, + "rewards/margins": 0.1953257918357849, + "rewards/rejected": -0.029860682785511017, + "step": 4906 + }, + { + "epoch": 0.7588633288227334, + "grad_norm": 4.181465148925781, + "learning_rate": 4.15024630541872e-06, + "logits/chosen": 5.151773929595947, + "logits/rejected": -0.595993161201477, + "logps/chosen": -272.97430419921875, + "logps/rejected": -232.081787109375, + "loss": 0.6154, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.45322006940841675, + "rewards/margins": 0.47670498490333557, + "rewards/rejected": -0.023484911769628525, + "step": 4907 + }, + { + "epoch": 0.7590179779624976, + "grad_norm": 6.769428253173828, + "learning_rate": 4.149959903769046e-06, + "logits/chosen": 12.372825622558594, + "logits/rejected": 8.445409774780273, + "logps/chosen": -211.14593505859375, + "logps/rejected": -201.21739196777344, + "loss": 0.6324, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.4704332947731018, + "rewards/margins": 0.27743685245513916, + "rewards/rejected": 0.19299644231796265, + "step": 4908 + }, + { + "epoch": 0.7591726271022617, + "grad_norm": 5.690778732299805, + "learning_rate": 4.1496735021193724e-06, + "logits/chosen": 4.834717273712158, + "logits/rejected": 4.586916923522949, + "logps/chosen": -334.7077941894531, + "logps/rejected": -335.7353515625, + "loss": 0.5469, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.40061330795288086, + "rewards/margins": 0.5135318636894226, + "rewards/rejected": -0.11291855573654175, + "step": 4909 + }, + { + "epoch": 0.7593272762420259, + "grad_norm": 5.367953300476074, + "learning_rate": 4.149387100469699e-06, + "logits/chosen": 5.731888294219971, + "logits/rejected": 5.723881244659424, + "logps/chosen": -225.95489501953125, + "logps/rejected": -199.14968872070312, + "loss": 0.6806, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.10567726939916611, + "rewards/margins": 0.08735520392656326, + "rewards/rejected": 0.018322043120861053, + "step": 4910 + }, + { + "epoch": 0.75948192538179, + "grad_norm": 5.577286720275879, + "learning_rate": 4.149100698820026e-06, + "logits/chosen": 7.287262916564941, + "logits/rejected": 5.463646411895752, + "logps/chosen": -240.44281005859375, + "logps/rejected": -217.14999389648438, + "loss": 0.6415, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.14128904044628143, + "rewards/margins": 0.1739535927772522, + "rewards/rejected": -0.03266454115509987, + "step": 4911 + }, + { + "epoch": 0.7596365745215542, + "grad_norm": 8.50859546661377, + "learning_rate": 4.148814297170352e-06, + "logits/chosen": 11.617916107177734, + "logits/rejected": 7.3536906242370605, + "logps/chosen": -236.2200164794922, + "logps/rejected": -234.49818420410156, + "loss": 0.7667, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.3820585310459137, + "rewards/margins": -0.055295251309871674, + "rewards/rejected": 0.43735381960868835, + "step": 4912 + }, + { + "epoch": 0.7597912236613183, + "grad_norm": 6.611783027648926, + "learning_rate": 4.148527895520679e-06, + "logits/chosen": 10.420655250549316, + "logits/rejected": 12.067861557006836, + "logps/chosen": -221.20645141601562, + "logps/rejected": -231.5289306640625, + "loss": 0.7919, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.08360320329666138, + "rewards/margins": -0.04212473705410957, + "rewards/rejected": 0.12572795152664185, + "step": 4913 + }, + { + "epoch": 0.7599458728010825, + "grad_norm": 4.780416965484619, + "learning_rate": 4.148241493871006e-06, + "logits/chosen": 16.03663444519043, + "logits/rejected": 9.475438117980957, + "logps/chosen": -422.89898681640625, + "logps/rejected": -320.68621826171875, + "loss": 0.3936, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.5728496313095093, + "rewards/margins": 0.8516335487365723, + "rewards/rejected": -0.27878400683403015, + "step": 4914 + }, + { + "epoch": 0.7601005219408467, + "grad_norm": 4.702272415161133, + "learning_rate": 4.1479550922213315e-06, + "logits/chosen": 11.05862808227539, + "logits/rejected": 6.111822128295898, + "logps/chosen": -435.7619323730469, + "logps/rejected": -346.0679931640625, + "loss": 0.4264, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.9095495939254761, + "rewards/margins": 0.7445962429046631, + "rewards/rejected": 0.16495341062545776, + "step": 4915 + }, + { + "epoch": 0.7602551710806109, + "grad_norm": 5.261887550354004, + "learning_rate": 4.147668690571658e-06, + "logits/chosen": 11.336010932922363, + "logits/rejected": 7.728522300720215, + "logps/chosen": -321.0860290527344, + "logps/rejected": -234.15423583984375, + "loss": 0.6268, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.3022048771381378, + "rewards/margins": 0.20865270495414734, + "rewards/rejected": 0.09355220943689346, + "step": 4916 + }, + { + "epoch": 0.7604098202203751, + "grad_norm": 5.102192401885986, + "learning_rate": 4.147382288921985e-06, + "logits/chosen": 14.532776832580566, + "logits/rejected": 8.315821647644043, + "logps/chosen": -181.035400390625, + "logps/rejected": -107.02165222167969, + "loss": 0.7974, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.2955061197280884, + "rewards/margins": 0.03571687638759613, + "rewards/rejected": -0.3312229514122009, + "step": 4917 + }, + { + "epoch": 0.7605644693601392, + "grad_norm": 5.660706043243408, + "learning_rate": 4.1470958872723114e-06, + "logits/chosen": 7.079586505889893, + "logits/rejected": 9.5824556350708, + "logps/chosen": -245.2150421142578, + "logps/rejected": -275.98974609375, + "loss": 0.6828, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.2076207995414734, + "rewards/margins": 0.12682899832725525, + "rewards/rejected": 0.08079180866479874, + "step": 4918 + }, + { + "epoch": 0.7607191184999034, + "grad_norm": 6.165192604064941, + "learning_rate": 4.146809485622637e-06, + "logits/chosen": 9.167647361755371, + "logits/rejected": 5.921746253967285, + "logps/chosen": -331.52447509765625, + "logps/rejected": -273.09051513671875, + "loss": 0.5835, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.532878577709198, + "rewards/margins": 0.3147548735141754, + "rewards/rejected": 0.21812373399734497, + "step": 4919 + }, + { + "epoch": 0.7608737676396675, + "grad_norm": 6.272213459014893, + "learning_rate": 4.146523083972964e-06, + "logits/chosen": 7.812296390533447, + "logits/rejected": 7.431384086608887, + "logps/chosen": -363.254638671875, + "logps/rejected": -287.0889587402344, + "loss": 0.6778, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.539750874042511, + "rewards/margins": 0.38153934478759766, + "rewards/rejected": 0.15821152925491333, + "step": 4920 + }, + { + "epoch": 0.7610284167794317, + "grad_norm": 5.577705383300781, + "learning_rate": 4.1462366823232905e-06, + "logits/chosen": 8.577412605285645, + "logits/rejected": 13.420360565185547, + "logps/chosen": -178.31939697265625, + "logps/rejected": -206.78103637695312, + "loss": 0.8226, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.0735921561717987, + "rewards/margins": -0.20445388555526733, + "rewards/rejected": 0.13086172938346863, + "step": 4921 + }, + { + "epoch": 0.7611830659191958, + "grad_norm": 4.6788835525512695, + "learning_rate": 4.145950280673617e-06, + "logits/chosen": 8.623282432556152, + "logits/rejected": 3.681525468826294, + "logps/chosen": -333.3215026855469, + "logps/rejected": -256.15643310546875, + "loss": 0.5208, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.5315885543823242, + "rewards/margins": 0.4524400532245636, + "rewards/rejected": 0.07914847135543823, + "step": 4922 + }, + { + "epoch": 0.76133771505896, + "grad_norm": 27.451623916625977, + "learning_rate": 4.145663879023943e-06, + "logits/chosen": 11.408885955810547, + "logits/rejected": 9.009562492370605, + "logps/chosen": -230.19955444335938, + "logps/rejected": -240.6341552734375, + "loss": 0.5536, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.010083436965942383, + "rewards/margins": 0.3672701418399811, + "rewards/rejected": -0.3571867048740387, + "step": 4923 + }, + { + "epoch": 0.7614923641987241, + "grad_norm": 5.923617362976074, + "learning_rate": 4.14537747737427e-06, + "logits/chosen": 7.339748859405518, + "logits/rejected": 12.718791007995605, + "logps/chosen": -188.9129638671875, + "logps/rejected": -276.8684997558594, + "loss": 0.7762, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.2744036316871643, + "rewards/margins": -0.07441519945859909, + "rewards/rejected": -0.1999884396791458, + "step": 4924 + }, + { + "epoch": 0.7616470133384883, + "grad_norm": 5.329258918762207, + "learning_rate": 4.145091075724596e-06, + "logits/chosen": 6.880941390991211, + "logits/rejected": 6.928739547729492, + "logps/chosen": -229.04409790039062, + "logps/rejected": -280.45587158203125, + "loss": 0.7642, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.12328854203224182, + "rewards/margins": -0.035579435527324677, + "rewards/rejected": 0.1588679850101471, + "step": 4925 + }, + { + "epoch": 0.7618016624782524, + "grad_norm": 16.788772583007812, + "learning_rate": 4.144804674074923e-06, + "logits/chosen": 11.350167274475098, + "logits/rejected": 11.149557113647461, + "logps/chosen": -355.59490966796875, + "logps/rejected": -518.6079711914062, + "loss": 0.4683, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.29341644048690796, + "rewards/margins": 0.756689190864563, + "rewards/rejected": -0.4632726311683655, + "step": 4926 + }, + { + "epoch": 0.7619563116180166, + "grad_norm": 4.877502918243408, + "learning_rate": 4.14451827242525e-06, + "logits/chosen": 13.342842102050781, + "logits/rejected": 11.937655448913574, + "logps/chosen": -229.94586181640625, + "logps/rejected": -241.38990783691406, + "loss": 0.682, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.057029012590646744, + "rewards/margins": 0.0905604436993599, + "rewards/rejected": -0.03353142738342285, + "step": 4927 + }, + { + "epoch": 0.7621109607577807, + "grad_norm": 8.58271312713623, + "learning_rate": 4.144231870775575e-06, + "logits/chosen": 7.603719711303711, + "logits/rejected": 7.8524370193481445, + "logps/chosen": -161.15945434570312, + "logps/rejected": -162.17681884765625, + "loss": 1.0879, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.05612868070602417, + "rewards/margins": -0.42669743299484253, + "rewards/rejected": 0.37056878209114075, + "step": 4928 + }, + { + "epoch": 0.762265609897545, + "grad_norm": 4.385808944702148, + "learning_rate": 4.143945469125902e-06, + "logits/chosen": 11.460906982421875, + "logits/rejected": 9.257423400878906, + "logps/chosen": -238.97616577148438, + "logps/rejected": -269.02203369140625, + "loss": 0.5928, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.3834801912307739, + "rewards/margins": 0.4303295314311981, + "rewards/rejected": -0.046849325299263, + "step": 4929 + }, + { + "epoch": 0.7624202590373091, + "grad_norm": 3.991694211959839, + "learning_rate": 4.143659067476229e-06, + "logits/chosen": 7.582732200622559, + "logits/rejected": 6.187265872955322, + "logps/chosen": -147.570068359375, + "logps/rejected": -144.92042541503906, + "loss": 0.651, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.1417761743068695, + "rewards/margins": 0.13091130554676056, + "rewards/rejected": -0.27268749475479126, + "step": 4930 + }, + { + "epoch": 0.7625749081770733, + "grad_norm": 3.6845335960388184, + "learning_rate": 4.143372665826555e-06, + "logits/chosen": 7.688179016113281, + "logits/rejected": 8.560306549072266, + "logps/chosen": -236.12631225585938, + "logps/rejected": -233.07296752929688, + "loss": 0.6004, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.3454280495643616, + "rewards/margins": 0.23559586703777313, + "rewards/rejected": 0.10983218997716904, + "step": 4931 + }, + { + "epoch": 0.7627295573168374, + "grad_norm": 5.549973011016846, + "learning_rate": 4.143086264176882e-06, + "logits/chosen": 5.1850738525390625, + "logits/rejected": 9.952247619628906, + "logps/chosen": -300.2906494140625, + "logps/rejected": -282.5285339355469, + "loss": 0.6156, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.10462726652622223, + "rewards/margins": 0.1874665915966034, + "rewards/rejected": -0.08283932507038116, + "step": 4932 + }, + { + "epoch": 0.7628842064566016, + "grad_norm": 5.1061272621154785, + "learning_rate": 4.142799862527209e-06, + "logits/chosen": 8.262295722961426, + "logits/rejected": 4.53387975692749, + "logps/chosen": -290.8871154785156, + "logps/rejected": -196.09738159179688, + "loss": 0.6445, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.3276713490486145, + "rewards/margins": 0.24858100712299347, + "rewards/rejected": 0.07909035682678223, + "step": 4933 + }, + { + "epoch": 0.7630388555963658, + "grad_norm": 4.63276481628418, + "learning_rate": 4.142513460877535e-06, + "logits/chosen": 7.329858303070068, + "logits/rejected": 5.278559684753418, + "logps/chosen": -259.938720703125, + "logps/rejected": -244.70745849609375, + "loss": 0.5135, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.6174301505088806, + "rewards/margins": 0.5074489712715149, + "rewards/rejected": 0.10998116433620453, + "step": 4934 + }, + { + "epoch": 0.7631935047361299, + "grad_norm": 8.939595222473145, + "learning_rate": 4.142227059227861e-06, + "logits/chosen": 9.228285789489746, + "logits/rejected": 12.601855278015137, + "logps/chosen": -330.9233093261719, + "logps/rejected": -389.1082763671875, + "loss": 0.9763, + "rewards/accuracies": 0.25, + "rewards/chosen": 0.21591608226299286, + "rewards/margins": -0.45536142587661743, + "rewards/rejected": 0.6712775230407715, + "step": 4935 + }, + { + "epoch": 0.763348153875894, + "grad_norm": 4.449682235717773, + "learning_rate": 4.141940657578188e-06, + "logits/chosen": 9.54892349243164, + "logits/rejected": 9.508878707885742, + "logps/chosen": -290.98773193359375, + "logps/rejected": -252.12350463867188, + "loss": 0.5775, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.42847490310668945, + "rewards/margins": 0.3792840242385864, + "rewards/rejected": 0.04919089749455452, + "step": 4936 + }, + { + "epoch": 0.7635028030156582, + "grad_norm": 5.2768025398254395, + "learning_rate": 4.141654255928514e-06, + "logits/chosen": 8.646503448486328, + "logits/rejected": 11.98926067352295, + "logps/chosen": -238.91754150390625, + "logps/rejected": -274.1827392578125, + "loss": 0.7279, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.02121984213590622, + "rewards/margins": -0.003342166543006897, + "rewards/rejected": 0.024561986327171326, + "step": 4937 + }, + { + "epoch": 0.7636574521554224, + "grad_norm": 6.944789409637451, + "learning_rate": 4.141367854278841e-06, + "logits/chosen": 7.973559379577637, + "logits/rejected": 8.873360633850098, + "logps/chosen": -279.97442626953125, + "logps/rejected": -290.258056640625, + "loss": 0.5769, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.5695616006851196, + "rewards/margins": 0.32906368374824524, + "rewards/rejected": 0.24049793183803558, + "step": 4938 + }, + { + "epoch": 0.7638121012951865, + "grad_norm": 4.392387390136719, + "learning_rate": 4.141081452629168e-06, + "logits/chosen": 15.18172550201416, + "logits/rejected": 10.412322998046875, + "logps/chosen": -241.785400390625, + "logps/rejected": -210.549072265625, + "loss": 0.4549, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.5665448904037476, + "rewards/margins": 0.665616512298584, + "rewards/rejected": -0.09907159209251404, + "step": 4939 + }, + { + "epoch": 0.7639667504349507, + "grad_norm": 4.9878644943237305, + "learning_rate": 4.140795050979494e-06, + "logits/chosen": 7.041317939758301, + "logits/rejected": 4.476365566253662, + "logps/chosen": -183.692138671875, + "logps/rejected": -146.60140991210938, + "loss": 0.6065, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.14306746423244476, + "rewards/margins": 0.34993869066238403, + "rewards/rejected": -0.20687125623226166, + "step": 4940 + }, + { + "epoch": 0.7641213995747148, + "grad_norm": 4.474164962768555, + "learning_rate": 4.14050864932982e-06, + "logits/chosen": 10.443655967712402, + "logits/rejected": 9.39891529083252, + "logps/chosen": -217.26190185546875, + "logps/rejected": -156.72608947753906, + "loss": 0.6492, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.4619622528553009, + "rewards/margins": 0.14141054451465607, + "rewards/rejected": 0.32055172324180603, + "step": 4941 + }, + { + "epoch": 0.7642760487144791, + "grad_norm": 7.124011039733887, + "learning_rate": 4.140222247680147e-06, + "logits/chosen": 14.211482048034668, + "logits/rejected": 15.702484130859375, + "logps/chosen": -313.53753662109375, + "logps/rejected": -309.4259338378906, + "loss": 0.725, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.22411996126174927, + "rewards/margins": 0.044881612062454224, + "rewards/rejected": 0.17923830449581146, + "step": 4942 + }, + { + "epoch": 0.7644306978542432, + "grad_norm": 14.382828712463379, + "learning_rate": 4.1399358460304735e-06, + "logits/chosen": 9.890050888061523, + "logits/rejected": 6.561791896820068, + "logps/chosen": -351.99639892578125, + "logps/rejected": -202.32537841796875, + "loss": 0.6888, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.20855769515037537, + "rewards/margins": 0.08406265079975128, + "rewards/rejected": 0.12449502944946289, + "step": 4943 + }, + { + "epoch": 0.7645853469940074, + "grad_norm": 6.3000993728637695, + "learning_rate": 4.1396494443808e-06, + "logits/chosen": 13.678356170654297, + "logits/rejected": 5.869448184967041, + "logps/chosen": -316.6748046875, + "logps/rejected": -177.1873779296875, + "loss": 0.5366, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.3079023063182831, + "rewards/margins": 0.7318978309631348, + "rewards/rejected": -0.4239955246448517, + "step": 4944 + }, + { + "epoch": 0.7647399961337715, + "grad_norm": 7.358652591705322, + "learning_rate": 4.139363042731127e-06, + "logits/chosen": 12.029150009155273, + "logits/rejected": 7.779742240905762, + "logps/chosen": -492.4407653808594, + "logps/rejected": -350.1304931640625, + "loss": 0.3968, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.46798601746559143, + "rewards/margins": 1.2406266927719116, + "rewards/rejected": -0.7726407051086426, + "step": 4945 + }, + { + "epoch": 0.7648946452735357, + "grad_norm": 6.211153984069824, + "learning_rate": 4.139076641081453e-06, + "logits/chosen": 11.553871154785156, + "logits/rejected": 4.746696472167969, + "logps/chosen": -402.80206298828125, + "logps/rejected": -255.36207580566406, + "loss": 0.5038, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.8193070888519287, + "rewards/margins": 0.6366795897483826, + "rewards/rejected": 0.18262745440006256, + "step": 4946 + }, + { + "epoch": 0.7650492944132998, + "grad_norm": 6.916305065155029, + "learning_rate": 4.13879023943178e-06, + "logits/chosen": 7.095345497131348, + "logits/rejected": 4.810436248779297, + "logps/chosen": -298.9525451660156, + "logps/rejected": -263.8277587890625, + "loss": 0.6291, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.4684111475944519, + "rewards/margins": 0.20465654134750366, + "rewards/rejected": 0.26375460624694824, + "step": 4947 + }, + { + "epoch": 0.765203943553064, + "grad_norm": 5.330521583557129, + "learning_rate": 4.138503837782106e-06, + "logits/chosen": 16.100194931030273, + "logits/rejected": 10.801311492919922, + "logps/chosen": -389.47113037109375, + "logps/rejected": -285.7940979003906, + "loss": 0.5101, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.4962612986564636, + "rewards/margins": 0.5578832626342773, + "rewards/rejected": -0.06162194162607193, + "step": 4948 + }, + { + "epoch": 0.7653585926928281, + "grad_norm": 6.446768283843994, + "learning_rate": 4.1382174361324325e-06, + "logits/chosen": 11.464221954345703, + "logits/rejected": 12.036646842956543, + "logps/chosen": -350.800537109375, + "logps/rejected": -340.69647216796875, + "loss": 0.6202, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.2223339080810547, + "rewards/margins": 0.40763378143310547, + "rewards/rejected": -0.18529987335205078, + "step": 4949 + }, + { + "epoch": 0.7655132418325923, + "grad_norm": 5.683520793914795, + "learning_rate": 4.137931034482759e-06, + "logits/chosen": 9.901934623718262, + "logits/rejected": 10.866971969604492, + "logps/chosen": -188.46136474609375, + "logps/rejected": -220.008544921875, + "loss": 0.8576, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.2402886301279068, + "rewards/margins": -0.23111507296562195, + "rewards/rejected": 0.47140371799468994, + "step": 4950 + }, + { + "epoch": 0.7656678909723564, + "grad_norm": 6.309943675994873, + "learning_rate": 4.137644632833086e-06, + "logits/chosen": 8.952722549438477, + "logits/rejected": 3.296704053878784, + "logps/chosen": -278.5149841308594, + "logps/rejected": -235.2010955810547, + "loss": 0.6499, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.03517545759677887, + "rewards/margins": 0.11505352705717087, + "rewards/rejected": -0.15022897720336914, + "step": 4951 + }, + { + "epoch": 0.7658225401121206, + "grad_norm": 6.074580669403076, + "learning_rate": 4.1373582311834125e-06, + "logits/chosen": 15.289661407470703, + "logits/rejected": 13.359203338623047, + "logps/chosen": -366.68023681640625, + "logps/rejected": -341.90728759765625, + "loss": 0.7817, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.09628450125455856, + "rewards/margins": -0.037759676575660706, + "rewards/rejected": 0.13404417037963867, + "step": 4952 + }, + { + "epoch": 0.7659771892518847, + "grad_norm": 4.777254581451416, + "learning_rate": 4.137071829533738e-06, + "logits/chosen": 8.840413093566895, + "logits/rejected": 3.5354385375976562, + "logps/chosen": -158.81382751464844, + "logps/rejected": -146.3869171142578, + "loss": 0.645, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.1140359491109848, + "rewards/margins": 0.21172145009040833, + "rewards/rejected": -0.09768549352884293, + "step": 4953 + }, + { + "epoch": 0.766131838391649, + "grad_norm": 6.983048915863037, + "learning_rate": 4.136785427884065e-06, + "logits/chosen": 6.146685600280762, + "logits/rejected": 8.76120376586914, + "logps/chosen": -278.0133056640625, + "logps/rejected": -344.5935363769531, + "loss": 0.9043, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.20453988015651703, + "rewards/margins": -0.2946438193321228, + "rewards/rejected": 0.09010394662618637, + "step": 4954 + }, + { + "epoch": 0.7662864875314132, + "grad_norm": 5.467168807983398, + "learning_rate": 4.1364990262343916e-06, + "logits/chosen": 10.983089447021484, + "logits/rejected": 8.220662117004395, + "logps/chosen": -386.868408203125, + "logps/rejected": -316.7525634765625, + "loss": 0.5906, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.44696810841560364, + "rewards/margins": 0.2936115562915802, + "rewards/rejected": 0.15335655212402344, + "step": 4955 + }, + { + "epoch": 0.7664411366711773, + "grad_norm": 6.071159362792969, + "learning_rate": 4.136212624584718e-06, + "logits/chosen": 9.73252010345459, + "logits/rejected": 10.7449951171875, + "logps/chosen": -280.9246520996094, + "logps/rejected": -238.2398223876953, + "loss": 0.7582, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.10747911036014557, + "rewards/margins": -0.057358723133802414, + "rewards/rejected": 0.16483783721923828, + "step": 4956 + }, + { + "epoch": 0.7665957858109415, + "grad_norm": 4.621884346008301, + "learning_rate": 4.135926222935044e-06, + "logits/chosen": 5.683119773864746, + "logits/rejected": 11.96535873413086, + "logps/chosen": -147.86508178710938, + "logps/rejected": -208.42898559570312, + "loss": 0.7092, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.21912966668605804, + "rewards/margins": 0.04856341332197189, + "rewards/rejected": -0.26769304275512695, + "step": 4957 + }, + { + "epoch": 0.7667504349507056, + "grad_norm": 4.188169479370117, + "learning_rate": 4.135639821285371e-06, + "logits/chosen": 14.546852111816406, + "logits/rejected": 8.762707710266113, + "logps/chosen": -266.2210693359375, + "logps/rejected": -157.12014770507812, + "loss": 0.7144, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.0008868449367582798, + "rewards/margins": 0.0835040882229805, + "rewards/rejected": -0.08439092338085175, + "step": 4958 + }, + { + "epoch": 0.7669050840904698, + "grad_norm": 5.067059516906738, + "learning_rate": 4.135353419635697e-06, + "logits/chosen": 14.334366798400879, + "logits/rejected": 7.460714340209961, + "logps/chosen": -253.41856384277344, + "logps/rejected": -176.9868927001953, + "loss": 0.6364, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.1870420277118683, + "rewards/margins": 0.16429150104522705, + "rewards/rejected": -0.35133352875709534, + "step": 4959 + }, + { + "epoch": 0.7670597332302339, + "grad_norm": 4.723127365112305, + "learning_rate": 4.135067017986024e-06, + "logits/chosen": 9.781179428100586, + "logits/rejected": 7.438054084777832, + "logps/chosen": -200.795654296875, + "logps/rejected": -223.2666778564453, + "loss": 0.6219, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.1913708746433258, + "rewards/margins": 0.24018707871437073, + "rewards/rejected": -0.04881620779633522, + "step": 4960 + }, + { + "epoch": 0.7672143823699981, + "grad_norm": 4.799639701843262, + "learning_rate": 4.13478061633635e-06, + "logits/chosen": 9.256240844726562, + "logits/rejected": 4.518589973449707, + "logps/chosen": -257.2502136230469, + "logps/rejected": -213.87136840820312, + "loss": 0.6418, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.017122391611337662, + "rewards/margins": 0.18628929555416107, + "rewards/rejected": -0.1691669076681137, + "step": 4961 + }, + { + "epoch": 0.7673690315097622, + "grad_norm": 9.123564720153809, + "learning_rate": 4.1344942146866764e-06, + "logits/chosen": 4.829556465148926, + "logits/rejected": 6.611625671386719, + "logps/chosen": -330.2411193847656, + "logps/rejected": -341.55194091796875, + "loss": 0.8158, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.19843865931034088, + "rewards/margins": -0.1875607669353485, + "rewards/rejected": -0.010877883061766624, + "step": 4962 + }, + { + "epoch": 0.7675236806495264, + "grad_norm": 4.474172115325928, + "learning_rate": 4.134207813037003e-06, + "logits/chosen": 8.630349159240723, + "logits/rejected": 1.6360594034194946, + "logps/chosen": -315.9197998046875, + "logps/rejected": -197.06532287597656, + "loss": 0.5598, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.31645143032073975, + "rewards/margins": 0.400278240442276, + "rewards/rejected": -0.08382683992385864, + "step": 4963 + }, + { + "epoch": 0.7676783297892905, + "grad_norm": 5.443153381347656, + "learning_rate": 4.13392141138733e-06, + "logits/chosen": 7.913963794708252, + "logits/rejected": 8.156455993652344, + "logps/chosen": -246.21231079101562, + "logps/rejected": -281.65289306640625, + "loss": 0.5524, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.2347273826599121, + "rewards/margins": 0.4020341634750366, + "rewards/rejected": -0.1673067808151245, + "step": 4964 + }, + { + "epoch": 0.7678329789290547, + "grad_norm": 4.753653526306152, + "learning_rate": 4.133635009737656e-06, + "logits/chosen": 10.860950469970703, + "logits/rejected": 4.748505592346191, + "logps/chosen": -373.42120361328125, + "logps/rejected": -267.4067077636719, + "loss": 0.5052, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.2632021903991699, + "rewards/margins": 0.5404651761054993, + "rewards/rejected": -0.27726298570632935, + "step": 4965 + }, + { + "epoch": 0.7679876280688188, + "grad_norm": 5.557091236114502, + "learning_rate": 4.133348608087983e-06, + "logits/chosen": 13.716362953186035, + "logits/rejected": 9.751205444335938, + "logps/chosen": -358.31390380859375, + "logps/rejected": -269.47705078125, + "loss": 0.5535, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.5474659204483032, + "rewards/margins": 0.36240923404693604, + "rewards/rejected": 0.1850566864013672, + "step": 4966 + }, + { + "epoch": 0.7681422772085831, + "grad_norm": 5.823578834533691, + "learning_rate": 4.13306220643831e-06, + "logits/chosen": 8.433234214782715, + "logits/rejected": 4.309297561645508, + "logps/chosen": -269.5269775390625, + "logps/rejected": -251.9385986328125, + "loss": 0.6165, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.014511585235595703, + "rewards/margins": 0.20919501781463623, + "rewards/rejected": -0.19468346238136292, + "step": 4967 + }, + { + "epoch": 0.7682969263483472, + "grad_norm": 3.7419159412384033, + "learning_rate": 4.1327758047886355e-06, + "logits/chosen": 8.293190002441406, + "logits/rejected": 5.601429462432861, + "logps/chosen": -213.33518981933594, + "logps/rejected": -152.77780151367188, + "loss": 0.5051, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.006918720901012421, + "rewards/margins": 0.6592031121253967, + "rewards/rejected": -0.6661218404769897, + "step": 4968 + }, + { + "epoch": 0.7684515754881114, + "grad_norm": 5.663111686706543, + "learning_rate": 4.132489403138962e-06, + "logits/chosen": 13.131458282470703, + "logits/rejected": 11.536008834838867, + "logps/chosen": -243.63339233398438, + "logps/rejected": -233.37100219726562, + "loss": 0.7241, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.3481830954551697, + "rewards/margins": 0.0019211731851100922, + "rewards/rejected": 0.3462618887424469, + "step": 4969 + }, + { + "epoch": 0.7686062246278755, + "grad_norm": 5.256002902984619, + "learning_rate": 4.132203001489289e-06, + "logits/chosen": 8.378101348876953, + "logits/rejected": 12.492430686950684, + "logps/chosen": -143.73837280273438, + "logps/rejected": -193.728271484375, + "loss": 0.7182, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.2743394374847412, + "rewards/margins": -0.015911810100078583, + "rewards/rejected": 0.290251225233078, + "step": 4970 + }, + { + "epoch": 0.7687608737676397, + "grad_norm": 4.80961275100708, + "learning_rate": 4.1319165998396154e-06, + "logits/chosen": 3.9095687866210938, + "logits/rejected": 5.7025322914123535, + "logps/chosen": -201.0200958251953, + "logps/rejected": -230.4980926513672, + "loss": 0.6261, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.23607198894023895, + "rewards/margins": 0.3503708839416504, + "rewards/rejected": -0.11429891735315323, + "step": 4971 + }, + { + "epoch": 0.7689155229074038, + "grad_norm": 7.354063034057617, + "learning_rate": 4.131630198189942e-06, + "logits/chosen": 9.680054664611816, + "logits/rejected": 10.74063777923584, + "logps/chosen": -301.8260803222656, + "logps/rejected": -282.119140625, + "loss": 0.7921, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.08449564129114151, + "rewards/margins": -0.02164582349359989, + "rewards/rejected": 0.10614146292209625, + "step": 4972 + }, + { + "epoch": 0.769070172047168, + "grad_norm": 5.525781631469727, + "learning_rate": 4.131343796540269e-06, + "logits/chosen": 7.839278221130371, + "logits/rejected": 9.007648468017578, + "logps/chosen": -162.90118408203125, + "logps/rejected": -163.5792694091797, + "loss": 0.7822, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.127905935049057, + "rewards/margins": -0.11784529685974121, + "rewards/rejected": 0.24575123190879822, + "step": 4973 + }, + { + "epoch": 0.7692248211869321, + "grad_norm": 5.639936923980713, + "learning_rate": 4.1310573948905945e-06, + "logits/chosen": 11.192437171936035, + "logits/rejected": 5.996338844299316, + "logps/chosen": -308.0993957519531, + "logps/rejected": -177.96994018554688, + "loss": 0.7569, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.15776333212852478, + "rewards/margins": -0.01452586054801941, + "rewards/rejected": 0.1722891926765442, + "step": 4974 + }, + { + "epoch": 0.7693794703266963, + "grad_norm": 4.952838897705078, + "learning_rate": 4.130770993240921e-06, + "logits/chosen": 12.394794464111328, + "logits/rejected": 8.547128677368164, + "logps/chosen": -340.3673095703125, + "logps/rejected": -217.74176025390625, + "loss": 0.6889, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.20627279579639435, + "rewards/margins": 0.035782910883426666, + "rewards/rejected": 0.17048987746238708, + "step": 4975 + }, + { + "epoch": 0.7695341194664604, + "grad_norm": 6.5808820724487305, + "learning_rate": 4.130484591591248e-06, + "logits/chosen": 12.566085815429688, + "logits/rejected": 8.219300270080566, + "logps/chosen": -344.2880859375, + "logps/rejected": -275.83807373046875, + "loss": 0.5452, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.2173694670200348, + "rewards/margins": 0.44629353284835815, + "rewards/rejected": -0.22892408072948456, + "step": 4976 + }, + { + "epoch": 0.7696887686062246, + "grad_norm": 8.133243560791016, + "learning_rate": 4.1301981899415745e-06, + "logits/chosen": 9.09133529663086, + "logits/rejected": 8.781269073486328, + "logps/chosen": -545.9765014648438, + "logps/rejected": -423.1166687011719, + "loss": 0.4537, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.380987286567688, + "rewards/margins": 0.6742621660232544, + "rewards/rejected": -0.2932748794555664, + "step": 4977 + }, + { + "epoch": 0.7698434177459887, + "grad_norm": 5.225375175476074, + "learning_rate": 4.129911788291901e-06, + "logits/chosen": 9.772926330566406, + "logits/rejected": 4.344759941101074, + "logps/chosen": -332.7449951171875, + "logps/rejected": -240.3415985107422, + "loss": 0.5543, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.5672166347503662, + "rewards/margins": 0.4154619872570038, + "rewards/rejected": 0.15175466239452362, + "step": 4978 + }, + { + "epoch": 0.7699980668857529, + "grad_norm": 6.085546016693115, + "learning_rate": 4.129625386642228e-06, + "logits/chosen": 9.420472145080566, + "logits/rejected": 14.559541702270508, + "logps/chosen": -210.8945770263672, + "logps/rejected": -284.9475402832031, + "loss": 0.7385, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.27289485931396484, + "rewards/margins": 0.004035197198390961, + "rewards/rejected": -0.276930034160614, + "step": 4979 + }, + { + "epoch": 0.7701527160255172, + "grad_norm": 5.08929967880249, + "learning_rate": 4.1293389849925544e-06, + "logits/chosen": 13.113638877868652, + "logits/rejected": 5.629592418670654, + "logps/chosen": -288.2409362792969, + "logps/rejected": -213.57568359375, + "loss": 0.5785, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.036273956298828125, + "rewards/margins": 0.4562690258026123, + "rewards/rejected": -0.49254298210144043, + "step": 4980 + }, + { + "epoch": 0.7703073651652813, + "grad_norm": 7.583585262298584, + "learning_rate": 4.12905258334288e-06, + "logits/chosen": 8.815882682800293, + "logits/rejected": 7.21228551864624, + "logps/chosen": -397.4370422363281, + "logps/rejected": -377.3230895996094, + "loss": 0.7379, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.3927827775478363, + "rewards/margins": 0.03347659483551979, + "rewards/rejected": 0.3593061566352844, + "step": 4981 + }, + { + "epoch": 0.7704620143050455, + "grad_norm": 7.3094377517700195, + "learning_rate": 4.128766181693207e-06, + "logits/chosen": 7.136903285980225, + "logits/rejected": 7.766732215881348, + "logps/chosen": -341.82818603515625, + "logps/rejected": -320.99542236328125, + "loss": 0.7132, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.1656390130519867, + "rewards/margins": 0.06980957090854645, + "rewards/rejected": 0.09582944214344025, + "step": 4982 + }, + { + "epoch": 0.7706166634448096, + "grad_norm": 4.940464973449707, + "learning_rate": 4.1284797800435335e-06, + "logits/chosen": 11.49021053314209, + "logits/rejected": 8.019957542419434, + "logps/chosen": -340.2213134765625, + "logps/rejected": -349.8654479980469, + "loss": 0.4907, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.28009265661239624, + "rewards/margins": 0.5583404302597046, + "rewards/rejected": -0.2782478630542755, + "step": 4983 + }, + { + "epoch": 0.7707713125845738, + "grad_norm": 3.208820104598999, + "learning_rate": 4.12819337839386e-06, + "logits/chosen": 9.834213256835938, + "logits/rejected": 6.487322807312012, + "logps/chosen": -180.23861694335938, + "logps/rejected": -119.73690032958984, + "loss": 0.5539, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.3990585207939148, + "rewards/margins": 0.389373242855072, + "rewards/rejected": 0.009685254655778408, + "step": 4984 + }, + { + "epoch": 0.7709259617243379, + "grad_norm": 5.376028060913086, + "learning_rate": 4.127906976744187e-06, + "logits/chosen": 6.716535568237305, + "logits/rejected": 11.201910018920898, + "logps/chosen": -239.73699951171875, + "logps/rejected": -263.1300048828125, + "loss": 0.778, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.36066412925720215, + "rewards/margins": -0.0027289018034934998, + "rewards/rejected": 0.36339300870895386, + "step": 4985 + }, + { + "epoch": 0.7710806108641021, + "grad_norm": 4.978393077850342, + "learning_rate": 4.1276205750945135e-06, + "logits/chosen": 9.114242553710938, + "logits/rejected": 10.812358856201172, + "logps/chosen": -188.74911499023438, + "logps/rejected": -207.45289611816406, + "loss": 0.5866, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.26826804876327515, + "rewards/margins": 0.4088376462459564, + "rewards/rejected": -0.6771056652069092, + "step": 4986 + }, + { + "epoch": 0.7712352600038662, + "grad_norm": 5.60716438293457, + "learning_rate": 4.127334173444839e-06, + "logits/chosen": 14.299121856689453, + "logits/rejected": 9.205286026000977, + "logps/chosen": -429.9416198730469, + "logps/rejected": -403.2704772949219, + "loss": 0.5664, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.5752893686294556, + "rewards/margins": 0.33691641688346863, + "rewards/rejected": 0.23837293684482574, + "step": 4987 + }, + { + "epoch": 0.7713899091436304, + "grad_norm": 5.608139991760254, + "learning_rate": 4.127047771795166e-06, + "logits/chosen": 9.980561256408691, + "logits/rejected": 7.705402374267578, + "logps/chosen": -340.0922546386719, + "logps/rejected": -294.58392333984375, + "loss": 0.5385, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.027302458882331848, + "rewards/margins": 0.45669716596603394, + "rewards/rejected": -0.4839996099472046, + "step": 4988 + }, + { + "epoch": 0.7715445582833945, + "grad_norm": 7.99901819229126, + "learning_rate": 4.126761370145493e-06, + "logits/chosen": 4.902597427368164, + "logits/rejected": -0.06300020217895508, + "logps/chosen": -292.90606689453125, + "logps/rejected": -405.0431823730469, + "loss": 0.7373, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.08763332664966583, + "rewards/margins": 0.19842568039894104, + "rewards/rejected": -0.11079235374927521, + "step": 4989 + }, + { + "epoch": 0.7716992074231587, + "grad_norm": 52.14344024658203, + "learning_rate": 4.126474968495819e-06, + "logits/chosen": 5.340859413146973, + "logits/rejected": 5.4607038497924805, + "logps/chosen": -248.3724365234375, + "logps/rejected": -188.8089141845703, + "loss": 0.5625, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.04004386439919472, + "rewards/margins": 0.39794838428497314, + "rewards/rejected": -0.35790449380874634, + "step": 4990 + }, + { + "epoch": 0.7718538565629228, + "grad_norm": 5.102663993835449, + "learning_rate": 4.126188566846145e-06, + "logits/chosen": 12.467475891113281, + "logits/rejected": 8.76297664642334, + "logps/chosen": -270.7680358886719, + "logps/rejected": -276.18695068359375, + "loss": 0.6774, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.039369676262140274, + "rewards/margins": 0.06542877852916718, + "rewards/rejected": -0.0260591059923172, + "step": 4991 + }, + { + "epoch": 0.772008505702687, + "grad_norm": 4.296579837799072, + "learning_rate": 4.125902165196472e-06, + "logits/chosen": 11.526104927062988, + "logits/rejected": 11.35513687133789, + "logps/chosen": -204.5261688232422, + "logps/rejected": -185.63218688964844, + "loss": 0.5754, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.11636167764663696, + "rewards/margins": 0.43150269985198975, + "rewards/rejected": -0.3151410222053528, + "step": 4992 + }, + { + "epoch": 0.7721631548424512, + "grad_norm": 6.3434739112854, + "learning_rate": 4.125615763546798e-06, + "logits/chosen": 13.317995071411133, + "logits/rejected": 2.7605810165405273, + "logps/chosen": -307.75518798828125, + "logps/rejected": -178.917236328125, + "loss": 0.7527, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.10010546445846558, + "rewards/margins": 0.09486135840415955, + "rewards/rejected": -0.19496682286262512, + "step": 4993 + }, + { + "epoch": 0.7723178039822154, + "grad_norm": 4.062440395355225, + "learning_rate": 4.125329361897125e-06, + "logits/chosen": 10.902128219604492, + "logits/rejected": 5.421245098114014, + "logps/chosen": -213.04019165039062, + "logps/rejected": -215.32192993164062, + "loss": 0.6008, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.18839725852012634, + "rewards/margins": 0.2639303207397461, + "rewards/rejected": -0.07553304731845856, + "step": 4994 + }, + { + "epoch": 0.7724724531219795, + "grad_norm": 6.132646560668945, + "learning_rate": 4.125042960247451e-06, + "logits/chosen": 17.396726608276367, + "logits/rejected": 11.30938720703125, + "logps/chosen": -468.0548095703125, + "logps/rejected": -302.32281494140625, + "loss": 0.5418, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.07890496402978897, + "rewards/margins": 0.4038117229938507, + "rewards/rejected": -0.32490673661231995, + "step": 4995 + }, + { + "epoch": 0.7726271022617437, + "grad_norm": 5.574467658996582, + "learning_rate": 4.1247565585977775e-06, + "logits/chosen": 11.290813446044922, + "logits/rejected": 5.554466724395752, + "logps/chosen": -268.62811279296875, + "logps/rejected": -196.56036376953125, + "loss": 0.7009, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.06991252303123474, + "rewards/margins": 0.1462816745042801, + "rewards/rejected": -0.21619418263435364, + "step": 4996 + }, + { + "epoch": 0.7727817514015078, + "grad_norm": 5.451486110687256, + "learning_rate": 4.124470156948104e-06, + "logits/chosen": 11.673201560974121, + "logits/rejected": 6.237618446350098, + "logps/chosen": -254.86947631835938, + "logps/rejected": -244.02609252929688, + "loss": 0.6469, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.0310808178037405, + "rewards/margins": 0.23386698961257935, + "rewards/rejected": -0.2649478316307068, + "step": 4997 + }, + { + "epoch": 0.772936400541272, + "grad_norm": 5.163326263427734, + "learning_rate": 4.124183755298431e-06, + "logits/chosen": 13.895020484924316, + "logits/rejected": 7.872200012207031, + "logps/chosen": -313.21148681640625, + "logps/rejected": -273.2898864746094, + "loss": 0.6221, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.2681724727153778, + "rewards/margins": 0.34868061542510986, + "rewards/rejected": -0.08050814270973206, + "step": 4998 + }, + { + "epoch": 0.7730910496810361, + "grad_norm": 4.0748419761657715, + "learning_rate": 4.123897353648757e-06, + "logits/chosen": 9.035528182983398, + "logits/rejected": 3.976304054260254, + "logps/chosen": -314.4695739746094, + "logps/rejected": -270.3165588378906, + "loss": 0.4937, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.05133618414402008, + "rewards/margins": 0.5926632285118103, + "rewards/rejected": -0.643999457359314, + "step": 4999 + }, + { + "epoch": 0.7732456988208003, + "grad_norm": 4.605780124664307, + "learning_rate": 4.123610951999084e-06, + "logits/chosen": 10.038984298706055, + "logits/rejected": 10.29881763458252, + "logps/chosen": -179.95416259765625, + "logps/rejected": -219.22116088867188, + "loss": 0.641, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.023074060678482056, + "rewards/margins": 0.1538427174091339, + "rewards/rejected": -0.13076864182949066, + "step": 5000 + }, + { + "epoch": 0.7734003479605644, + "grad_norm": 4.89199686050415, + "learning_rate": 4.12332455034941e-06, + "logits/chosen": 7.259953022003174, + "logits/rejected": 8.65902328491211, + "logps/chosen": -395.1523132324219, + "logps/rejected": -487.6247253417969, + "loss": 0.7161, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.12864947319030762, + "rewards/margins": 0.10570831596851349, + "rewards/rejected": 0.022941168397665024, + "step": 5001 + }, + { + "epoch": 0.7735549971003286, + "grad_norm": 7.77675724029541, + "learning_rate": 4.1230381486997365e-06, + "logits/chosen": 7.0220723152160645, + "logits/rejected": 9.41964340209961, + "logps/chosen": -211.3035430908203, + "logps/rejected": -282.9604797363281, + "loss": 0.7156, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.04720047116279602, + "rewards/margins": 0.0022997967898845673, + "rewards/rejected": -0.049500271677970886, + "step": 5002 + }, + { + "epoch": 0.7737096462400928, + "grad_norm": 5.483887672424316, + "learning_rate": 4.122751747050063e-06, + "logits/chosen": 7.901454925537109, + "logits/rejected": 9.114794731140137, + "logps/chosen": -306.2235412597656, + "logps/rejected": -273.46429443359375, + "loss": 0.7368, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.01944485306739807, + "rewards/margins": 0.17546787858009338, + "rewards/rejected": -0.1560230404138565, + "step": 5003 + }, + { + "epoch": 0.7738642953798569, + "grad_norm": 4.512119770050049, + "learning_rate": 4.12246534540039e-06, + "logits/chosen": 6.18450403213501, + "logits/rejected": 5.381974697113037, + "logps/chosen": -284.0148620605469, + "logps/rejected": -230.7216796875, + "loss": 0.5546, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.31669148802757263, + "rewards/margins": 0.45358434319496155, + "rewards/rejected": -0.13689284026622772, + "step": 5004 + }, + { + "epoch": 0.774018944519621, + "grad_norm": 5.064146041870117, + "learning_rate": 4.1221789437507165e-06, + "logits/chosen": 9.378414154052734, + "logits/rejected": 8.250425338745117, + "logps/chosen": -268.95367431640625, + "logps/rejected": -256.632080078125, + "loss": 0.7122, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.2516672909259796, + "rewards/margins": 0.044621050357818604, + "rewards/rejected": 0.207046240568161, + "step": 5005 + }, + { + "epoch": 0.7741735936593853, + "grad_norm": 5.158390045166016, + "learning_rate": 4.121892542101043e-06, + "logits/chosen": 6.5220489501953125, + "logits/rejected": 7.144718647003174, + "logps/chosen": -272.63958740234375, + "logps/rejected": -232.56692504882812, + "loss": 0.5436, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.4590062201023102, + "rewards/margins": 0.4421843886375427, + "rewards/rejected": 0.01682186871767044, + "step": 5006 + }, + { + "epoch": 0.7743282427991495, + "grad_norm": 4.248188495635986, + "learning_rate": 4.121606140451369e-06, + "logits/chosen": 10.51870346069336, + "logits/rejected": 8.302328109741211, + "logps/chosen": -318.8675537109375, + "logps/rejected": -234.2373809814453, + "loss": 0.5413, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.2951411008834839, + "rewards/margins": 0.3687536418437958, + "rewards/rejected": -0.07361254096031189, + "step": 5007 + }, + { + "epoch": 0.7744828919389136, + "grad_norm": 4.663799285888672, + "learning_rate": 4.1213197388016956e-06, + "logits/chosen": 7.878110885620117, + "logits/rejected": 2.713817596435547, + "logps/chosen": -159.81491088867188, + "logps/rejected": -154.60916137695312, + "loss": 0.6746, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.191911518573761, + "rewards/margins": 0.0977015495300293, + "rewards/rejected": 0.09420996904373169, + "step": 5008 + }, + { + "epoch": 0.7746375410786778, + "grad_norm": 4.316108226776123, + "learning_rate": 4.121033337152022e-06, + "logits/chosen": 13.941579818725586, + "logits/rejected": 6.019278526306152, + "logps/chosen": -273.7798767089844, + "logps/rejected": -162.07249450683594, + "loss": 0.5554, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.3104780316352844, + "rewards/margins": 0.3796347677707672, + "rewards/rejected": -0.06915673613548279, + "step": 5009 + }, + { + "epoch": 0.7747921902184419, + "grad_norm": 7.142570495605469, + "learning_rate": 4.120746935502349e-06, + "logits/chosen": 6.283424377441406, + "logits/rejected": 9.530901908874512, + "logps/chosen": -266.60418701171875, + "logps/rejected": -279.9219055175781, + "loss": 0.8331, + "rewards/accuracies": 0.25, + "rewards/chosen": -0.2871186435222626, + "rewards/margins": -0.09328995645046234, + "rewards/rejected": -0.19382868707180023, + "step": 5010 + }, + { + "epoch": 0.7749468393582061, + "grad_norm": 6.1782050132751465, + "learning_rate": 4.1204605338526755e-06, + "logits/chosen": 14.791120529174805, + "logits/rejected": 10.545404434204102, + "logps/chosen": -489.9462890625, + "logps/rejected": -317.0007629394531, + "loss": 0.7888, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.02662043832242489, + "rewards/margins": 0.09979776293039322, + "rewards/rejected": -0.12641818821430206, + "step": 5011 + }, + { + "epoch": 0.7751014884979702, + "grad_norm": 6.908194065093994, + "learning_rate": 4.120174132203002e-06, + "logits/chosen": 8.393936157226562, + "logits/rejected": 13.2125244140625, + "logps/chosen": -309.270263671875, + "logps/rejected": -380.57861328125, + "loss": 0.773, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.01415882259607315, + "rewards/margins": -0.011274144053459167, + "rewards/rejected": 0.025432981550693512, + "step": 5012 + }, + { + "epoch": 0.7752561376377344, + "grad_norm": 712.8502197265625, + "learning_rate": 4.119887730553329e-06, + "logits/chosen": 2.06146240234375, + "logits/rejected": 5.159744739532471, + "logps/chosen": -181.00794982910156, + "logps/rejected": -722.385986328125, + "loss": 0.663, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.15017251670360565, + "rewards/margins": 0.28974276781082153, + "rewards/rejected": -0.4399152994155884, + "step": 5013 + }, + { + "epoch": 0.7754107867774985, + "grad_norm": 6.284900665283203, + "learning_rate": 4.119601328903655e-06, + "logits/chosen": 10.280906677246094, + "logits/rejected": 13.962762832641602, + "logps/chosen": -317.97283935546875, + "logps/rejected": -346.02960205078125, + "loss": 0.7059, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.24447432160377502, + "rewards/margins": 0.0720866397023201, + "rewards/rejected": 0.17238768935203552, + "step": 5014 + }, + { + "epoch": 0.7755654359172627, + "grad_norm": 4.670611381530762, + "learning_rate": 4.119314927253981e-06, + "logits/chosen": 8.168377876281738, + "logits/rejected": 4.10474157333374, + "logps/chosen": -415.0027160644531, + "logps/rejected": -208.18939208984375, + "loss": 0.4912, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.02573414519429207, + "rewards/margins": 0.544352650642395, + "rewards/rejected": -0.5186185240745544, + "step": 5015 + }, + { + "epoch": 0.7757200850570268, + "grad_norm": 8.4817476272583, + "learning_rate": 4.119028525604308e-06, + "logits/chosen": 10.971458435058594, + "logits/rejected": 5.737685680389404, + "logps/chosen": -208.27609252929688, + "logps/rejected": -202.10458374023438, + "loss": 0.7532, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.27731525897979736, + "rewards/margins": 0.05599093437194824, + "rewards/rejected": -0.3333061933517456, + "step": 5016 + }, + { + "epoch": 0.775874734196791, + "grad_norm": 6.134255886077881, + "learning_rate": 4.118742123954635e-06, + "logits/chosen": 5.332054615020752, + "logits/rejected": 6.37807559967041, + "logps/chosen": -237.30072021484375, + "logps/rejected": -304.5001525878906, + "loss": 0.755, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.0508432611823082, + "rewards/margins": 0.05668017268180847, + "rewards/rejected": -0.10752344131469727, + "step": 5017 + }, + { + "epoch": 0.7760293833365552, + "grad_norm": 3.983471632003784, + "learning_rate": 4.118455722304961e-06, + "logits/chosen": 10.750752449035645, + "logits/rejected": 18.0624942779541, + "logps/chosen": -201.6811065673828, + "logps/rejected": -286.57427978515625, + "loss": 0.6429, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.19274616241455078, + "rewards/margins": 0.41438019275665283, + "rewards/rejected": -0.22163397073745728, + "step": 5018 + }, + { + "epoch": 0.7761840324763194, + "grad_norm": 5.100841045379639, + "learning_rate": 4.118169320655288e-06, + "logits/chosen": 5.152981758117676, + "logits/rejected": 6.58890438079834, + "logps/chosen": -255.8887481689453, + "logps/rejected": -262.0158386230469, + "loss": 0.7641, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.09908465296030045, + "rewards/margins": -0.06049796938896179, + "rewards/rejected": 0.15958262979984283, + "step": 5019 + }, + { + "epoch": 0.7763386816160835, + "grad_norm": 5.116940021514893, + "learning_rate": 4.117882919005614e-06, + "logits/chosen": 10.215417861938477, + "logits/rejected": 12.812397956848145, + "logps/chosen": -168.50875854492188, + "logps/rejected": -220.5176544189453, + "loss": 0.7592, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.09314832836389542, + "rewards/margins": 0.09575970470905304, + "rewards/rejected": -0.0026113614439964294, + "step": 5020 + }, + { + "epoch": 0.7764933307558477, + "grad_norm": 3.7714180946350098, + "learning_rate": 4.11759651735594e-06, + "logits/chosen": 10.354387283325195, + "logits/rejected": 7.624279975891113, + "logps/chosen": -207.99615478515625, + "logps/rejected": -215.74276733398438, + "loss": 0.4377, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.2874392569065094, + "rewards/margins": 0.6683759689331055, + "rewards/rejected": -0.38093677163124084, + "step": 5021 + }, + { + "epoch": 0.7766479798956119, + "grad_norm": 4.5336713790893555, + "learning_rate": 4.117310115706267e-06, + "logits/chosen": 11.726278305053711, + "logits/rejected": 8.242114067077637, + "logps/chosen": -283.8611145019531, + "logps/rejected": -244.39337158203125, + "loss": 0.5896, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.17777138948440552, + "rewards/margins": 0.3340108394622803, + "rewards/rejected": -0.15623946487903595, + "step": 5022 + }, + { + "epoch": 0.776802629035376, + "grad_norm": 6.319726467132568, + "learning_rate": 4.117023714056594e-06, + "logits/chosen": 9.771453857421875, + "logits/rejected": 6.453716278076172, + "logps/chosen": -332.0880126953125, + "logps/rejected": -237.7469940185547, + "loss": 0.696, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.2392318844795227, + "rewards/margins": 0.017061710357666016, + "rewards/rejected": 0.2221701592206955, + "step": 5023 + }, + { + "epoch": 0.7769572781751402, + "grad_norm": 5.381030082702637, + "learning_rate": 4.11673731240692e-06, + "logits/chosen": 8.992779731750488, + "logits/rejected": 8.476510047912598, + "logps/chosen": -145.55551147460938, + "logps/rejected": -119.64647674560547, + "loss": 0.7134, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.10848407447338104, + "rewards/margins": -0.013696298003196716, + "rewards/rejected": 0.12218037247657776, + "step": 5024 + }, + { + "epoch": 0.7771119273149043, + "grad_norm": 6.5061187744140625, + "learning_rate": 4.116450910757246e-06, + "logits/chosen": 9.034185409545898, + "logits/rejected": 7.38123893737793, + "logps/chosen": -318.8076477050781, + "logps/rejected": -249.76145935058594, + "loss": 0.7415, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.12011967599391937, + "rewards/margins": 0.07943274080753326, + "rewards/rejected": 0.04068693518638611, + "step": 5025 + }, + { + "epoch": 0.7772665764546685, + "grad_norm": 5.886760234832764, + "learning_rate": 4.116164509107573e-06, + "logits/chosen": 13.575149536132812, + "logits/rejected": 9.485139846801758, + "logps/chosen": -245.511474609375, + "logps/rejected": -216.506103515625, + "loss": 0.6335, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.20794911682605743, + "rewards/margins": 0.1715211421251297, + "rewards/rejected": -0.3794702887535095, + "step": 5026 + }, + { + "epoch": 0.7774212255944326, + "grad_norm": 5.398969650268555, + "learning_rate": 4.115878107457899e-06, + "logits/chosen": 9.968337059020996, + "logits/rejected": 6.6889166831970215, + "logps/chosen": -240.43557739257812, + "logps/rejected": -259.589111328125, + "loss": 0.6504, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.09291433542966843, + "rewards/margins": 0.2952841520309448, + "rewards/rejected": -0.38819852471351624, + "step": 5027 + }, + { + "epoch": 0.7775758747341968, + "grad_norm": 4.397029876708984, + "learning_rate": 4.115591705808226e-06, + "logits/chosen": 5.869007587432861, + "logits/rejected": 3.8941993713378906, + "logps/chosen": -176.54193115234375, + "logps/rejected": -143.39376831054688, + "loss": 0.6702, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.0621388703584671, + "rewards/margins": 0.08066572993993759, + "rewards/rejected": -0.14280462265014648, + "step": 5028 + }, + { + "epoch": 0.7777305238739609, + "grad_norm": 3.883955955505371, + "learning_rate": 4.115305304158552e-06, + "logits/chosen": 13.221551895141602, + "logits/rejected": 6.875918388366699, + "logps/chosen": -257.21484375, + "logps/rejected": -227.89378356933594, + "loss": 0.5043, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.2855057716369629, + "rewards/margins": 0.44030851125717163, + "rewards/rejected": -0.15480273962020874, + "step": 5029 + }, + { + "epoch": 0.7778851730137251, + "grad_norm": 7.649988174438477, + "learning_rate": 4.1150189025088785e-06, + "logits/chosen": 10.078450202941895, + "logits/rejected": 10.438804626464844, + "logps/chosen": -258.0928955078125, + "logps/rejected": -238.04818725585938, + "loss": 0.6456, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.05472786724567413, + "rewards/margins": 0.3163161277770996, + "rewards/rejected": -0.26158827543258667, + "step": 5030 + }, + { + "epoch": 0.7780398221534893, + "grad_norm": 5.158097267150879, + "learning_rate": 4.114732500859205e-06, + "logits/chosen": 10.367234230041504, + "logits/rejected": 11.947220802307129, + "logps/chosen": -205.2589111328125, + "logps/rejected": -246.32557678222656, + "loss": 0.7401, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.22857901453971863, + "rewards/margins": -0.0002719275653362274, + "rewards/rejected": -0.2283070683479309, + "step": 5031 + }, + { + "epoch": 0.7781944712932535, + "grad_norm": 13.855603218078613, + "learning_rate": 4.114446099209532e-06, + "logits/chosen": 4.8544087409973145, + "logits/rejected": 1.3936400413513184, + "logps/chosen": -322.1920166015625, + "logps/rejected": -280.5462341308594, + "loss": 0.6983, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.19767610728740692, + "rewards/margins": 0.03756190091371536, + "rewards/rejected": 0.16011419892311096, + "step": 5032 + }, + { + "epoch": 0.7783491204330176, + "grad_norm": 4.3317670822143555, + "learning_rate": 4.1141596975598584e-06, + "logits/chosen": 10.71391773223877, + "logits/rejected": 13.785128593444824, + "logps/chosen": -265.25970458984375, + "logps/rejected": -237.08724975585938, + "loss": 0.582, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.13994871079921722, + "rewards/margins": 0.397346556186676, + "rewards/rejected": -0.5372952222824097, + "step": 5033 + }, + { + "epoch": 0.7785037695727818, + "grad_norm": 4.697053909301758, + "learning_rate": 4.113873295910184e-06, + "logits/chosen": 12.264410972595215, + "logits/rejected": 11.431119918823242, + "logps/chosen": -206.22607421875, + "logps/rejected": -189.2880859375, + "loss": 0.6978, + "rewards/accuracies": 0.25, + "rewards/chosen": 0.10574106872081757, + "rewards/margins": 0.11151537299156189, + "rewards/rejected": -0.005774319171905518, + "step": 5034 + }, + { + "epoch": 0.7786584187125459, + "grad_norm": 8.809879302978516, + "learning_rate": 4.113586894260511e-06, + "logits/chosen": 6.489687919616699, + "logits/rejected": 14.318650245666504, + "logps/chosen": -238.0576171875, + "logps/rejected": -327.53082275390625, + "loss": 0.916, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.6989277601242065, + "rewards/margins": -0.2902526259422302, + "rewards/rejected": -0.40867507457733154, + "step": 5035 + }, + { + "epoch": 0.7788130678523101, + "grad_norm": 5.562313079833984, + "learning_rate": 4.1133004926108375e-06, + "logits/chosen": 6.82518196105957, + "logits/rejected": 8.63303279876709, + "logps/chosen": -273.5740051269531, + "logps/rejected": -287.4134521484375, + "loss": 0.7571, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.10549216717481613, + "rewards/margins": -0.03956685587763786, + "rewards/rejected": -0.06592531502246857, + "step": 5036 + }, + { + "epoch": 0.7789677169920742, + "grad_norm": 6.7447733879089355, + "learning_rate": 4.113014090961164e-06, + "logits/chosen": 9.13759708404541, + "logits/rejected": 7.33263635635376, + "logps/chosen": -317.3335876464844, + "logps/rejected": -226.57843017578125, + "loss": 0.6594, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.23781397938728333, + "rewards/margins": 0.34381064772605896, + "rewards/rejected": -0.10599665343761444, + "step": 5037 + }, + { + "epoch": 0.7791223661318384, + "grad_norm": 4.94521951675415, + "learning_rate": 4.112727689311491e-06, + "logits/chosen": 9.293581008911133, + "logits/rejected": 7.0353474617004395, + "logps/chosen": -171.9024658203125, + "logps/rejected": -169.03271484375, + "loss": 0.7697, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.2942078411579132, + "rewards/margins": -0.08927765488624573, + "rewards/rejected": -0.20493021607398987, + "step": 5038 + }, + { + "epoch": 0.7792770152716025, + "grad_norm": 4.07720422744751, + "learning_rate": 4.1124412876618175e-06, + "logits/chosen": 11.395575523376465, + "logits/rejected": 5.534848690032959, + "logps/chosen": -431.73687744140625, + "logps/rejected": -300.9872741699219, + "loss": 0.493, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.45613712072372437, + "rewards/margins": 0.5238387584686279, + "rewards/rejected": -0.06770157814025879, + "step": 5039 + }, + { + "epoch": 0.7794316644113667, + "grad_norm": 6.103538513183594, + "learning_rate": 4.112154886012143e-06, + "logits/chosen": 12.989797592163086, + "logits/rejected": 8.313202857971191, + "logps/chosen": -272.17584228515625, + "logps/rejected": -225.73150634765625, + "loss": 0.6352, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.21702295541763306, + "rewards/margins": 0.21988089382648468, + "rewards/rejected": -0.0028579290956258774, + "step": 5040 + }, + { + "epoch": 0.7795863135511308, + "grad_norm": 5.951514720916748, + "learning_rate": 4.11186848436247e-06, + "logits/chosen": 16.44672966003418, + "logits/rejected": 13.903814315795898, + "logps/chosen": -333.8041687011719, + "logps/rejected": -317.7332763671875, + "loss": 0.6947, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.04458990693092346, + "rewards/margins": 0.17886611819267273, + "rewards/rejected": -0.13427621126174927, + "step": 5041 + }, + { + "epoch": 0.779740962690895, + "grad_norm": 4.669124603271484, + "learning_rate": 4.111582082712797e-06, + "logits/chosen": 10.363603591918945, + "logits/rejected": 4.938765525817871, + "logps/chosen": -409.20855712890625, + "logps/rejected": -240.4298553466797, + "loss": 0.5331, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.10699347406625748, + "rewards/margins": 0.4335766136646271, + "rewards/rejected": -0.3265831470489502, + "step": 5042 + }, + { + "epoch": 0.7798956118306591, + "grad_norm": 5.747920513153076, + "learning_rate": 4.111295681063123e-06, + "logits/chosen": 16.73120880126953, + "logits/rejected": 13.110562324523926, + "logps/chosen": -293.6449279785156, + "logps/rejected": -222.78025817871094, + "loss": 0.7022, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.27120092511177063, + "rewards/margins": 0.1408054679632187, + "rewards/rejected": 0.13039547204971313, + "step": 5043 + }, + { + "epoch": 0.7800502609704234, + "grad_norm": 5.237953186035156, + "learning_rate": 4.11100927941345e-06, + "logits/chosen": 15.620123863220215, + "logits/rejected": 10.183158874511719, + "logps/chosen": -330.70416259765625, + "logps/rejected": -286.80572509765625, + "loss": 0.6073, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.312949538230896, + "rewards/margins": 0.25471144914627075, + "rewards/rejected": 0.058238133788108826, + "step": 5044 + }, + { + "epoch": 0.7802049101101876, + "grad_norm": 6.788024425506592, + "learning_rate": 4.1107228777637766e-06, + "logits/chosen": 13.35718059539795, + "logits/rejected": 6.806447982788086, + "logps/chosen": -387.2668762207031, + "logps/rejected": -296.7503356933594, + "loss": 0.5601, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.24102340638637543, + "rewards/margins": 0.4665447175502777, + "rewards/rejected": -0.22552131116390228, + "step": 5045 + }, + { + "epoch": 0.7803595592499517, + "grad_norm": 4.676558494567871, + "learning_rate": 4.110436476114103e-06, + "logits/chosen": 12.208867073059082, + "logits/rejected": 8.325477600097656, + "logps/chosen": -392.06427001953125, + "logps/rejected": -307.0189514160156, + "loss": 0.4081, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.42378321290016174, + "rewards/margins": 1.0710327625274658, + "rewards/rejected": -0.6472495198249817, + "step": 5046 + }, + { + "epoch": 0.7805142083897159, + "grad_norm": 5.9772443771362305, + "learning_rate": 4.110150074464429e-06, + "logits/chosen": 2.1557722091674805, + "logits/rejected": 5.11979866027832, + "logps/chosen": -220.51300048828125, + "logps/rejected": -247.05026245117188, + "loss": 0.6078, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.13328826427459717, + "rewards/margins": 0.20535486936569214, + "rewards/rejected": -0.07206659018993378, + "step": 5047 + }, + { + "epoch": 0.78066885752948, + "grad_norm": 7.3887200355529785, + "learning_rate": 4.109863672814756e-06, + "logits/chosen": 10.722755432128906, + "logits/rejected": 8.231695175170898, + "logps/chosen": -267.7508544921875, + "logps/rejected": -228.03750610351562, + "loss": 0.7786, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.30812105536460876, + "rewards/margins": -0.04417410120368004, + "rewards/rejected": -0.2639469504356384, + "step": 5048 + }, + { + "epoch": 0.7808235066692442, + "grad_norm": 4.58232307434082, + "learning_rate": 4.109577271165082e-06, + "logits/chosen": 11.72867202758789, + "logits/rejected": -2.089128255844116, + "logps/chosen": -307.6416015625, + "logps/rejected": -166.57406616210938, + "loss": 0.5575, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.1341150403022766, + "rewards/margins": 0.4138139486312866, + "rewards/rejected": -0.2796989679336548, + "step": 5049 + }, + { + "epoch": 0.7809781558090083, + "grad_norm": 7.005514621734619, + "learning_rate": 4.109290869515409e-06, + "logits/chosen": 15.507721900939941, + "logits/rejected": 9.484916687011719, + "logps/chosen": -319.23626708984375, + "logps/rejected": -262.30194091796875, + "loss": 0.6408, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.11864430457353592, + "rewards/margins": 0.15886190533638, + "rewards/rejected": -0.04021759331226349, + "step": 5050 + }, + { + "epoch": 0.7811328049487725, + "grad_norm": 6.510275840759277, + "learning_rate": 4.109004467865736e-06, + "logits/chosen": 8.56306266784668, + "logits/rejected": 6.801578521728516, + "logps/chosen": -410.47222900390625, + "logps/rejected": -324.2342834472656, + "loss": 0.5009, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.605204164981842, + "rewards/margins": 0.5338374376296997, + "rewards/rejected": 0.07136678695678711, + "step": 5051 + }, + { + "epoch": 0.7812874540885366, + "grad_norm": 3.914299488067627, + "learning_rate": 4.108718066216062e-06, + "logits/chosen": 10.392654418945312, + "logits/rejected": 5.892790794372559, + "logps/chosen": -228.93362426757812, + "logps/rejected": -176.40707397460938, + "loss": 0.6022, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.47347593307495117, + "rewards/margins": 0.2508716285228729, + "rewards/rejected": 0.22260427474975586, + "step": 5052 + }, + { + "epoch": 0.7814421032283008, + "grad_norm": 5.765299320220947, + "learning_rate": 4.108431664566388e-06, + "logits/chosen": 5.827978134155273, + "logits/rejected": 8.501371383666992, + "logps/chosen": -219.8353271484375, + "logps/rejected": -256.9482727050781, + "loss": 0.6881, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.07321976870298386, + "rewards/margins": 0.068837970495224, + "rewards/rejected": 0.004381801933050156, + "step": 5053 + }, + { + "epoch": 0.7815967523680649, + "grad_norm": 5.2162675857543945, + "learning_rate": 4.108145262916715e-06, + "logits/chosen": 9.194456100463867, + "logits/rejected": 3.763007164001465, + "logps/chosen": -386.1903381347656, + "logps/rejected": -235.15545654296875, + "loss": 0.608, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.37361860275268555, + "rewards/margins": 0.30699610710144043, + "rewards/rejected": 0.06662251055240631, + "step": 5054 + }, + { + "epoch": 0.7817514015078291, + "grad_norm": 5.34047794342041, + "learning_rate": 4.107858861267041e-06, + "logits/chosen": 6.982151985168457, + "logits/rejected": 10.637344360351562, + "logps/chosen": -189.2821502685547, + "logps/rejected": -211.65943908691406, + "loss": 0.7051, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.06354260444641113, + "rewards/margins": 0.2144639641046524, + "rewards/rejected": -0.2780066132545471, + "step": 5055 + }, + { + "epoch": 0.7819060506475932, + "grad_norm": 4.682369709014893, + "learning_rate": 4.107572459617368e-06, + "logits/chosen": 14.726280212402344, + "logits/rejected": 8.44631290435791, + "logps/chosen": -397.2965393066406, + "logps/rejected": -244.98489379882812, + "loss": 0.4395, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.4837670624256134, + "rewards/margins": 0.6795926094055176, + "rewards/rejected": -0.19582557678222656, + "step": 5056 + }, + { + "epoch": 0.7820606997873575, + "grad_norm": 3.8550431728363037, + "learning_rate": 4.107286057967695e-06, + "logits/chosen": 8.757742881774902, + "logits/rejected": 1.8678923845291138, + "logps/chosen": -263.2114562988281, + "logps/rejected": -202.33299255371094, + "loss": 0.5168, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.23992905020713806, + "rewards/margins": 0.5675321817398071, + "rewards/rejected": -0.32760316133499146, + "step": 5057 + }, + { + "epoch": 0.7822153489271216, + "grad_norm": 5.130674362182617, + "learning_rate": 4.1069996563180205e-06, + "logits/chosen": 14.660371780395508, + "logits/rejected": 7.732184410095215, + "logps/chosen": -340.6009826660156, + "logps/rejected": -209.63592529296875, + "loss": 0.735, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.04329577833414078, + "rewards/margins": 0.06805667281150818, + "rewards/rejected": -0.024760913103818893, + "step": 5058 + }, + { + "epoch": 0.7823699980668858, + "grad_norm": 5.810223579406738, + "learning_rate": 4.106713254668347e-06, + "logits/chosen": 12.289148330688477, + "logits/rejected": 10.763877868652344, + "logps/chosen": -275.8639831542969, + "logps/rejected": -186.7761993408203, + "loss": 0.6325, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.09041323512792587, + "rewards/margins": 0.33413732051849365, + "rewards/rejected": -0.42455050349235535, + "step": 5059 + }, + { + "epoch": 0.7825246472066499, + "grad_norm": 5.3058977127075195, + "learning_rate": 4.106426853018674e-06, + "logits/chosen": 8.109975814819336, + "logits/rejected": 15.741321563720703, + "logps/chosen": -267.0071716308594, + "logps/rejected": -313.7628479003906, + "loss": 0.7171, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.08863870799541473, + "rewards/margins": 0.036476440727710724, + "rewards/rejected": -0.12511512637138367, + "step": 5060 + }, + { + "epoch": 0.7826792963464141, + "grad_norm": 6.758449077606201, + "learning_rate": 4.106140451369e-06, + "logits/chosen": 10.126361846923828, + "logits/rejected": 10.134641647338867, + "logps/chosen": -277.5089416503906, + "logps/rejected": -410.3080749511719, + "loss": 0.8158, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.1090768575668335, + "rewards/margins": -0.08879843354225159, + "rewards/rejected": 0.19787532091140747, + "step": 5061 + }, + { + "epoch": 0.7828339454861782, + "grad_norm": 4.2877020835876465, + "learning_rate": 4.105854049719327e-06, + "logits/chosen": 9.756173133850098, + "logits/rejected": 7.455391883850098, + "logps/chosen": -276.5301513671875, + "logps/rejected": -298.6921081542969, + "loss": 0.4046, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.09428445249795914, + "rewards/margins": 0.8458055257797241, + "rewards/rejected": -0.751521110534668, + "step": 5062 + }, + { + "epoch": 0.7829885946259424, + "grad_norm": 6.585441589355469, + "learning_rate": 4.105567648069653e-06, + "logits/chosen": 9.223259925842285, + "logits/rejected": 2.761556625366211, + "logps/chosen": -285.99322509765625, + "logps/rejected": -200.97621154785156, + "loss": 0.6225, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.24313923716545105, + "rewards/margins": 0.35347050428390503, + "rewards/rejected": -0.11033125221729279, + "step": 5063 + }, + { + "epoch": 0.7831432437657065, + "grad_norm": 4.626454830169678, + "learning_rate": 4.1052812464199795e-06, + "logits/chosen": 16.85995864868164, + "logits/rejected": 13.843454360961914, + "logps/chosen": -341.9322509765625, + "logps/rejected": -238.6962890625, + "loss": 0.5362, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.08032150566577911, + "rewards/margins": 0.44775381684303284, + "rewards/rejected": -0.5280753374099731, + "step": 5064 + }, + { + "epoch": 0.7832978929054707, + "grad_norm": 6.047361850738525, + "learning_rate": 4.104994844770306e-06, + "logits/chosen": 11.13350772857666, + "logits/rejected": 10.96049976348877, + "logps/chosen": -196.30206298828125, + "logps/rejected": -162.70474243164062, + "loss": 0.7814, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.15369367599487305, + "rewards/margins": -0.06952162086963654, + "rewards/rejected": -0.08417205512523651, + "step": 5065 + }, + { + "epoch": 0.7834525420452348, + "grad_norm": 4.253897190093994, + "learning_rate": 4.104708443120633e-06, + "logits/chosen": 10.387052536010742, + "logits/rejected": 5.422173500061035, + "logps/chosen": -204.311279296875, + "logps/rejected": -147.24813842773438, + "loss": 0.6118, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.10736402869224548, + "rewards/margins": 0.255039781332016, + "rewards/rejected": -0.3624038100242615, + "step": 5066 + }, + { + "epoch": 0.783607191184999, + "grad_norm": 6.538941860198975, + "learning_rate": 4.104422041470959e-06, + "logits/chosen": 13.011324882507324, + "logits/rejected": 7.646974563598633, + "logps/chosen": -260.91448974609375, + "logps/rejected": -188.34507751464844, + "loss": 0.8237, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.31144633889198303, + "rewards/margins": -0.0360245555639267, + "rewards/rejected": 0.3474709093570709, + "step": 5067 + }, + { + "epoch": 0.7837618403247631, + "grad_norm": 5.236286163330078, + "learning_rate": 4.104135639821285e-06, + "logits/chosen": 6.552850723266602, + "logits/rejected": 10.216680526733398, + "logps/chosen": -174.12399291992188, + "logps/rejected": -217.28062438964844, + "loss": 0.6509, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.3650899827480316, + "rewards/margins": 0.11579195410013199, + "rewards/rejected": 0.24929803609848022, + "step": 5068 + }, + { + "epoch": 0.7839164894645273, + "grad_norm": 8.082655906677246, + "learning_rate": 4.103849238171612e-06, + "logits/chosen": 1.183900237083435, + "logits/rejected": 10.036355018615723, + "logps/chosen": -139.90013122558594, + "logps/rejected": -330.72607421875, + "loss": 0.6743, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.2600708603858948, + "rewards/margins": 0.3249843716621399, + "rewards/rejected": -0.06491345912218094, + "step": 5069 + }, + { + "epoch": 0.7840711386042916, + "grad_norm": 3.6693685054779053, + "learning_rate": 4.103562836521939e-06, + "logits/chosen": 6.519768238067627, + "logits/rejected": 8.200125694274902, + "logps/chosen": -176.1629638671875, + "logps/rejected": -200.93399047851562, + "loss": 0.5644, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.42160022258758545, + "rewards/margins": 0.35430848598480225, + "rewards/rejected": 0.06729169934988022, + "step": 5070 + }, + { + "epoch": 0.7842257877440557, + "grad_norm": 5.75965690612793, + "learning_rate": 4.103276434872265e-06, + "logits/chosen": 7.22062873840332, + "logits/rejected": 11.40235710144043, + "logps/chosen": -214.06446838378906, + "logps/rejected": -284.55316162109375, + "loss": 0.7208, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.1937389373779297, + "rewards/margins": -0.023232292383909225, + "rewards/rejected": 0.21697121858596802, + "step": 5071 + }, + { + "epoch": 0.7843804368838199, + "grad_norm": 5.923933506011963, + "learning_rate": 4.102990033222592e-06, + "logits/chosen": 11.293485641479492, + "logits/rejected": 11.829269409179688, + "logps/chosen": -268.4728698730469, + "logps/rejected": -235.5037384033203, + "loss": 0.7037, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.030789926648139954, + "rewards/margins": 0.024275191128253937, + "rewards/rejected": 0.006514742970466614, + "step": 5072 + }, + { + "epoch": 0.784535086023584, + "grad_norm": 5.9903764724731445, + "learning_rate": 4.102703631572918e-06, + "logits/chosen": 12.125068664550781, + "logits/rejected": 4.556142807006836, + "logps/chosen": -306.804931640625, + "logps/rejected": -213.5801239013672, + "loss": 0.6845, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.08387833833694458, + "rewards/margins": 0.22682486474514008, + "rewards/rejected": -0.1429465264081955, + "step": 5073 + }, + { + "epoch": 0.7846897351633482, + "grad_norm": 8.062975883483887, + "learning_rate": 4.102417229923244e-06, + "logits/chosen": 7.6582231521606445, + "logits/rejected": 10.406158447265625, + "logps/chosen": -362.16339111328125, + "logps/rejected": -389.4421081542969, + "loss": 0.8861, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.22911453247070312, + "rewards/margins": -0.24018210172653198, + "rewards/rejected": 0.011067569255828857, + "step": 5074 + }, + { + "epoch": 0.7848443843031123, + "grad_norm": 4.826886177062988, + "learning_rate": 4.102130828273571e-06, + "logits/chosen": 13.163078308105469, + "logits/rejected": 6.655034065246582, + "logps/chosen": -291.2995910644531, + "logps/rejected": -205.60719299316406, + "loss": 0.513, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.07948438078165054, + "rewards/margins": 0.5354306697845459, + "rewards/rejected": -0.6149150133132935, + "step": 5075 + }, + { + "epoch": 0.7849990334428765, + "grad_norm": 5.940065383911133, + "learning_rate": 4.101844426623898e-06, + "logits/chosen": 6.081146240234375, + "logits/rejected": 8.027899742126465, + "logps/chosen": -243.90274047851562, + "logps/rejected": -253.07562255859375, + "loss": 0.6405, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.2440074384212494, + "rewards/margins": 0.1856367290019989, + "rewards/rejected": -0.42964422702789307, + "step": 5076 + }, + { + "epoch": 0.7851536825826406, + "grad_norm": 4.854711532592773, + "learning_rate": 4.101558024974224e-06, + "logits/chosen": 10.162113189697266, + "logits/rejected": 7.421090126037598, + "logps/chosen": -240.82421875, + "logps/rejected": -232.93780517578125, + "loss": 0.5164, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.08275684714317322, + "rewards/margins": 0.4421387314796448, + "rewards/rejected": -0.5248955488204956, + "step": 5077 + }, + { + "epoch": 0.7853083317224048, + "grad_norm": 5.007492542266846, + "learning_rate": 4.101271623324551e-06, + "logits/chosen": 11.49973201751709, + "logits/rejected": 12.11457633972168, + "logps/chosen": -232.97732543945312, + "logps/rejected": -226.9874267578125, + "loss": 0.5829, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.1904737502336502, + "rewards/margins": 0.27222391963005066, + "rewards/rejected": -0.08175017684698105, + "step": 5078 + }, + { + "epoch": 0.7854629808621689, + "grad_norm": 4.608596324920654, + "learning_rate": 4.100985221674878e-06, + "logits/chosen": 10.496490478515625, + "logits/rejected": 9.294227600097656, + "logps/chosen": -182.33740234375, + "logps/rejected": -103.50914001464844, + "loss": 0.762, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.2902719974517822, + "rewards/margins": 0.05885876715183258, + "rewards/rejected": -0.3491307497024536, + "step": 5079 + }, + { + "epoch": 0.7856176300019331, + "grad_norm": 5.733582496643066, + "learning_rate": 4.100698820025203e-06, + "logits/chosen": 10.298238754272461, + "logits/rejected": 4.564059734344482, + "logps/chosen": -342.3865051269531, + "logps/rejected": -224.36880493164062, + "loss": 0.5779, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.42482495307922363, + "rewards/margins": 0.5499775409698486, + "rewards/rejected": -0.125152587890625, + "step": 5080 + }, + { + "epoch": 0.7857722791416972, + "grad_norm": 5.319206714630127, + "learning_rate": 4.10041241837553e-06, + "logits/chosen": 7.999185085296631, + "logits/rejected": 3.193298101425171, + "logps/chosen": -235.6840057373047, + "logps/rejected": -216.12484741210938, + "loss": 0.7459, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.21852310001850128, + "rewards/margins": 0.06859800219535828, + "rewards/rejected": -0.28712111711502075, + "step": 5081 + }, + { + "epoch": 0.7859269282814615, + "grad_norm": 4.088825225830078, + "learning_rate": 4.100126016725857e-06, + "logits/chosen": 12.149977684020996, + "logits/rejected": 11.96504020690918, + "logps/chosen": -165.03765869140625, + "logps/rejected": -161.04461669921875, + "loss": 0.6042, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.09357132017612457, + "rewards/margins": 0.3030185401439667, + "rewards/rejected": -0.20944717526435852, + "step": 5082 + }, + { + "epoch": 0.7860815774212256, + "grad_norm": 4.019356727600098, + "learning_rate": 4.099839615076183e-06, + "logits/chosen": 14.932348251342773, + "logits/rejected": 7.120345115661621, + "logps/chosen": -353.8023376464844, + "logps/rejected": -201.31927490234375, + "loss": 0.4418, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.24206753075122833, + "rewards/margins": 0.7625024318695068, + "rewards/rejected": -0.5204348564147949, + "step": 5083 + }, + { + "epoch": 0.7862362265609898, + "grad_norm": 5.034538269042969, + "learning_rate": 4.09955321342651e-06, + "logits/chosen": 10.297540664672852, + "logits/rejected": 6.178125381469727, + "logps/chosen": -334.7886962890625, + "logps/rejected": -218.8343048095703, + "loss": 0.5267, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.44965869188308716, + "rewards/margins": 0.44327741861343384, + "rewards/rejected": 0.006381265819072723, + "step": 5084 + }, + { + "epoch": 0.7863908757007539, + "grad_norm": 4.640108108520508, + "learning_rate": 4.099266811776837e-06, + "logits/chosen": 9.456334114074707, + "logits/rejected": 5.47445821762085, + "logps/chosen": -231.6034393310547, + "logps/rejected": -152.19415283203125, + "loss": 0.6264, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.06949248909950256, + "rewards/margins": 0.34694284200668335, + "rewards/rejected": -0.2774503827095032, + "step": 5085 + }, + { + "epoch": 0.7865455248405181, + "grad_norm": 4.491507053375244, + "learning_rate": 4.0989804101271624e-06, + "logits/chosen": 6.749477386474609, + "logits/rejected": 1.9619042873382568, + "logps/chosen": -241.92921447753906, + "logps/rejected": -213.1754608154297, + "loss": 0.5349, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.2999480068683624, + "rewards/margins": 0.6797693967819214, + "rewards/rejected": -0.37982141971588135, + "step": 5086 + }, + { + "epoch": 0.7867001739802822, + "grad_norm": 6.806582450866699, + "learning_rate": 4.098694008477489e-06, + "logits/chosen": 15.392090797424316, + "logits/rejected": 5.7103705406188965, + "logps/chosen": -442.79656982421875, + "logps/rejected": -319.1934814453125, + "loss": 0.752, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.0019940361380577087, + "rewards/margins": 0.27102822065353394, + "rewards/rejected": -0.26903417706489563, + "step": 5087 + }, + { + "epoch": 0.7868548231200464, + "grad_norm": 4.586902141571045, + "learning_rate": 4.098407606827816e-06, + "logits/chosen": 12.856761932373047, + "logits/rejected": 10.159149169921875, + "logps/chosen": -318.08782958984375, + "logps/rejected": -234.09378051757812, + "loss": 0.5427, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.035257622599601746, + "rewards/margins": 0.37344610691070557, + "rewards/rejected": -0.338188499212265, + "step": 5088 + }, + { + "epoch": 0.7870094722598105, + "grad_norm": 7.811407089233398, + "learning_rate": 4.098121205178142e-06, + "logits/chosen": 4.290340900421143, + "logits/rejected": 5.923408031463623, + "logps/chosen": -285.6490478515625, + "logps/rejected": -298.68511962890625, + "loss": 0.792, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.13546524941921234, + "rewards/margins": -0.04209838807582855, + "rewards/rejected": -0.0933668464422226, + "step": 5089 + }, + { + "epoch": 0.7871641213995747, + "grad_norm": 4.338775634765625, + "learning_rate": 4.097834803528469e-06, + "logits/chosen": 16.456586837768555, + "logits/rejected": 6.50148868560791, + "logps/chosen": -213.60464477539062, + "logps/rejected": -145.39422607421875, + "loss": 0.588, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.018607236444950104, + "rewards/margins": 0.30714306235313416, + "rewards/rejected": -0.28853580355644226, + "step": 5090 + }, + { + "epoch": 0.7873187705393389, + "grad_norm": 5.082007884979248, + "learning_rate": 4.097548401878796e-06, + "logits/chosen": 7.140385627746582, + "logits/rejected": 4.2235307693481445, + "logps/chosen": -301.2499084472656, + "logps/rejected": -160.086181640625, + "loss": 0.5724, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.11972412467002869, + "rewards/margins": 0.43268856406211853, + "rewards/rejected": -0.31296437978744507, + "step": 5091 + }, + { + "epoch": 0.787473419679103, + "grad_norm": 4.049871444702148, + "learning_rate": 4.0972620002291215e-06, + "logits/chosen": 9.526273727416992, + "logits/rejected": 6.412437438964844, + "logps/chosen": -380.4332275390625, + "logps/rejected": -270.4364013671875, + "loss": 0.5365, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.4613839089870453, + "rewards/margins": 0.615372896194458, + "rewards/rejected": -0.15398894250392914, + "step": 5092 + }, + { + "epoch": 0.7876280688188672, + "grad_norm": 9.423508644104004, + "learning_rate": 4.096975598579448e-06, + "logits/chosen": 7.798299789428711, + "logits/rejected": 10.19260025024414, + "logps/chosen": -607.5639038085938, + "logps/rejected": -751.56982421875, + "loss": 1.0192, + "rewards/accuracies": 0.125, + "rewards/chosen": -0.07918519526720047, + "rewards/margins": -0.45033693313598633, + "rewards/rejected": 0.37115171551704407, + "step": 5093 + }, + { + "epoch": 0.7877827179586313, + "grad_norm": 3.54980206489563, + "learning_rate": 4.096689196929775e-06, + "logits/chosen": 10.68482780456543, + "logits/rejected": 2.815458059310913, + "logps/chosen": -336.3044128417969, + "logps/rejected": -210.94725036621094, + "loss": 0.3275, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.9575498104095459, + "rewards/margins": 1.014569878578186, + "rewards/rejected": -0.057019997388124466, + "step": 5094 + }, + { + "epoch": 0.7879373670983956, + "grad_norm": 9.095545768737793, + "learning_rate": 4.0964027952801015e-06, + "logits/chosen": 12.557228088378906, + "logits/rejected": 3.9230995178222656, + "logps/chosen": -305.9664001464844, + "logps/rejected": -194.7684326171875, + "loss": 0.793, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.0077245235443115234, + "rewards/margins": 0.0496039092540741, + "rewards/rejected": -0.04187939688563347, + "step": 5095 + }, + { + "epoch": 0.7880920162381597, + "grad_norm": 4.574399948120117, + "learning_rate": 4.096116393630427e-06, + "logits/chosen": 11.70435905456543, + "logits/rejected": 10.439863204956055, + "logps/chosen": -288.5580139160156, + "logps/rejected": -218.48220825195312, + "loss": 0.5238, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.6886293888092041, + "rewards/margins": 0.6195163726806641, + "rewards/rejected": 0.06911302357912064, + "step": 5096 + }, + { + "epoch": 0.7882466653779239, + "grad_norm": 4.24513053894043, + "learning_rate": 4.095829991980754e-06, + "logits/chosen": 12.63845157623291, + "logits/rejected": 10.504179000854492, + "logps/chosen": -288.8387756347656, + "logps/rejected": -195.49526977539062, + "loss": 0.5844, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.003933288156986237, + "rewards/margins": 0.3645409941673279, + "rewards/rejected": -0.36060771346092224, + "step": 5097 + }, + { + "epoch": 0.788401314517688, + "grad_norm": 4.105783462524414, + "learning_rate": 4.0955435903310806e-06, + "logits/chosen": 9.760673522949219, + "logits/rejected": 7.60066032409668, + "logps/chosen": -224.1059112548828, + "logps/rejected": -147.4083251953125, + "loss": 0.6594, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.1093551516532898, + "rewards/margins": 0.08854524791240692, + "rewards/rejected": 0.020809918642044067, + "step": 5098 + }, + { + "epoch": 0.7885559636574522, + "grad_norm": 5.452014923095703, + "learning_rate": 4.095257188681407e-06, + "logits/chosen": 5.918210029602051, + "logits/rejected": 10.329328536987305, + "logps/chosen": -227.80001831054688, + "logps/rejected": -229.72103881835938, + "loss": 0.5554, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.37150678038597107, + "rewards/margins": 0.3152625560760498, + "rewards/rejected": 0.05624423176050186, + "step": 5099 + }, + { + "epoch": 0.7887106127972163, + "grad_norm": 7.25813102722168, + "learning_rate": 4.094970787031734e-06, + "logits/chosen": 7.437568664550781, + "logits/rejected": 8.877620697021484, + "logps/chosen": -190.92784118652344, + "logps/rejected": -252.36814880371094, + "loss": 0.8061, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.05465591326355934, + "rewards/margins": -0.14600211381912231, + "rewards/rejected": 0.09134618192911148, + "step": 5100 + }, + { + "epoch": 0.7888652619369805, + "grad_norm": 3.238044261932373, + "learning_rate": 4.09468438538206e-06, + "logits/chosen": -0.017635047435760498, + "logits/rejected": 9.20185661315918, + "logps/chosen": -136.63595581054688, + "logps/rejected": -190.55198669433594, + "loss": 0.446, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.3917364478111267, + "rewards/margins": 0.644311249256134, + "rewards/rejected": -0.2525748014450073, + "step": 5101 + }, + { + "epoch": 0.7890199110767446, + "grad_norm": 3.8613359928131104, + "learning_rate": 4.094397983732386e-06, + "logits/chosen": 8.534120559692383, + "logits/rejected": 1.7898999452590942, + "logps/chosen": -237.05540466308594, + "logps/rejected": -130.94361877441406, + "loss": 0.5233, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.6398761868476868, + "rewards/margins": 0.3905426561832428, + "rewards/rejected": 0.24933350086212158, + "step": 5102 + }, + { + "epoch": 0.7891745602165088, + "grad_norm": 4.5816497802734375, + "learning_rate": 4.094111582082713e-06, + "logits/chosen": 10.0445556640625, + "logits/rejected": 8.66766357421875, + "logps/chosen": -223.98605346679688, + "logps/rejected": -252.81219482421875, + "loss": 0.643, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.4634278118610382, + "rewards/margins": 0.22173044085502625, + "rewards/rejected": 0.24169737100601196, + "step": 5103 + }, + { + "epoch": 0.7893292093562729, + "grad_norm": 4.215822219848633, + "learning_rate": 4.09382518043304e-06, + "logits/chosen": 14.475205421447754, + "logits/rejected": 4.325428009033203, + "logps/chosen": -317.0892333984375, + "logps/rejected": -163.09774780273438, + "loss": 0.5252, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.6705270409584045, + "rewards/margins": 0.9162802696228027, + "rewards/rejected": -0.24575325846672058, + "step": 5104 + }, + { + "epoch": 0.7894838584960371, + "grad_norm": 5.75499963760376, + "learning_rate": 4.093538778783366e-06, + "logits/chosen": 9.2464017868042, + "logits/rejected": 14.420654296875, + "logps/chosen": -211.19117736816406, + "logps/rejected": -287.4402770996094, + "loss": 0.7857, + "rewards/accuracies": 0.25, + "rewards/chosen": 0.037461668252944946, + "rewards/margins": -0.08721613138914108, + "rewards/rejected": 0.12467780709266663, + "step": 5105 + }, + { + "epoch": 0.7896385076358012, + "grad_norm": 4.349009037017822, + "learning_rate": 4.093252377133692e-06, + "logits/chosen": 5.422112941741943, + "logits/rejected": 6.471564292907715, + "logps/chosen": -168.31227111816406, + "logps/rejected": -185.10122680664062, + "loss": 0.7265, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.2771903872489929, + "rewards/margins": -0.017520040273666382, + "rewards/rejected": 0.2947104275226593, + "step": 5106 + }, + { + "epoch": 0.7897931567755654, + "grad_norm": 5.1660332679748535, + "learning_rate": 4.092965975484019e-06, + "logits/chosen": 11.746912002563477, + "logits/rejected": 5.832705974578857, + "logps/chosen": -399.9925842285156, + "logps/rejected": -297.90032958984375, + "loss": 0.6077, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.7811201214790344, + "rewards/margins": 0.24734467267990112, + "rewards/rejected": 0.5337754487991333, + "step": 5107 + }, + { + "epoch": 0.7899478059153296, + "grad_norm": 4.787499904632568, + "learning_rate": 4.092679573834345e-06, + "logits/chosen": 10.443513870239258, + "logits/rejected": 9.022887229919434, + "logps/chosen": -380.3749084472656, + "logps/rejected": -300.4557800292969, + "loss": 0.5348, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.7033333778381348, + "rewards/margins": 0.40314429998397827, + "rewards/rejected": 0.3001891076564789, + "step": 5108 + }, + { + "epoch": 0.7901024550550938, + "grad_norm": 5.681580066680908, + "learning_rate": 4.092393172184672e-06, + "logits/chosen": 13.270523071289062, + "logits/rejected": 14.695281982421875, + "logps/chosen": -329.2462158203125, + "logps/rejected": -335.0748291015625, + "loss": 0.6725, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.2909242510795593, + "rewards/margins": 0.21132926642894745, + "rewards/rejected": 0.07959498465061188, + "step": 5109 + }, + { + "epoch": 0.790257104194858, + "grad_norm": 5.15778923034668, + "learning_rate": 4.092106770534999e-06, + "logits/chosen": 10.040604591369629, + "logits/rejected": 10.57836627960205, + "logps/chosen": -244.24546813964844, + "logps/rejected": -229.80064392089844, + "loss": 0.6493, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.048752300441265106, + "rewards/margins": 0.11303119361400604, + "rewards/rejected": -0.06427889317274094, + "step": 5110 + }, + { + "epoch": 0.7904117533346221, + "grad_norm": 7.556445121765137, + "learning_rate": 4.091820368885325e-06, + "logits/chosen": 11.767011642456055, + "logits/rejected": 12.161941528320312, + "logps/chosen": -199.0382080078125, + "logps/rejected": -182.70046997070312, + "loss": 0.8979, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.29699739813804626, + "rewards/margins": -0.30969393253326416, + "rewards/rejected": 0.012696534395217896, + "step": 5111 + }, + { + "epoch": 0.7905664024743863, + "grad_norm": 5.7087249755859375, + "learning_rate": 4.091533967235652e-06, + "logits/chosen": 8.876701354980469, + "logits/rejected": 9.305632591247559, + "logps/chosen": -332.8764953613281, + "logps/rejected": -360.33740234375, + "loss": 0.5545, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.47623586654663086, + "rewards/margins": 0.3553975224494934, + "rewards/rejected": 0.12083835154771805, + "step": 5112 + }, + { + "epoch": 0.7907210516141504, + "grad_norm": 5.002257823944092, + "learning_rate": 4.091247565585978e-06, + "logits/chosen": 12.8779878616333, + "logits/rejected": 10.324104309082031, + "logps/chosen": -277.0475769042969, + "logps/rejected": -255.50823974609375, + "loss": 0.6383, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.33567050099372864, + "rewards/margins": 0.17392273247241974, + "rewards/rejected": 0.1617477387189865, + "step": 5113 + }, + { + "epoch": 0.7908757007539146, + "grad_norm": 4.451407432556152, + "learning_rate": 4.0909611639363044e-06, + "logits/chosen": 11.414949417114258, + "logits/rejected": 12.021656036376953, + "logps/chosen": -256.81536865234375, + "logps/rejected": -249.47708129882812, + "loss": 0.7281, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.03548100218176842, + "rewards/margins": 0.14254681766033173, + "rewards/rejected": -0.1070658415555954, + "step": 5114 + }, + { + "epoch": 0.7910303498936787, + "grad_norm": 7.36456298828125, + "learning_rate": 4.090674762286631e-06, + "logits/chosen": 12.94161319732666, + "logits/rejected": 6.897037506103516, + "logps/chosen": -369.4624938964844, + "logps/rejected": -324.5177001953125, + "loss": 0.6475, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.4120739996433258, + "rewards/margins": 0.18249830603599548, + "rewards/rejected": 0.22957567870616913, + "step": 5115 + }, + { + "epoch": 0.7911849990334429, + "grad_norm": 5.778082370758057, + "learning_rate": 4.090388360636958e-06, + "logits/chosen": 7.975973129272461, + "logits/rejected": 6.598692893981934, + "logps/chosen": -174.73782348632812, + "logps/rejected": -143.19839477539062, + "loss": 0.929, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.0850757583975792, + "rewards/margins": -0.247012197971344, + "rewards/rejected": 0.1619364321231842, + "step": 5116 + }, + { + "epoch": 0.791339648173207, + "grad_norm": 4.2440643310546875, + "learning_rate": 4.090101958987284e-06, + "logits/chosen": 12.873610496520996, + "logits/rejected": 11.144428253173828, + "logps/chosen": -254.34454345703125, + "logps/rejected": -219.94285583496094, + "loss": 0.6213, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.5841102600097656, + "rewards/margins": 0.20584255456924438, + "rewards/rejected": 0.37826770544052124, + "step": 5117 + }, + { + "epoch": 0.7914942973129712, + "grad_norm": 6.00151252746582, + "learning_rate": 4.089815557337611e-06, + "logits/chosen": 11.584171295166016, + "logits/rejected": 11.935012817382812, + "logps/chosen": -537.4677124023438, + "logps/rejected": -427.73199462890625, + "loss": 0.587, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.5674301385879517, + "rewards/margins": 0.32733821868896484, + "rewards/rejected": 0.24009190499782562, + "step": 5118 + }, + { + "epoch": 0.7916489464527353, + "grad_norm": 14.695266723632812, + "learning_rate": 4.089529155687937e-06, + "logits/chosen": 10.891056060791016, + "logits/rejected": 8.556668281555176, + "logps/chosen": -186.0068359375, + "logps/rejected": -155.95144653320312, + "loss": 0.7171, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.019355714321136475, + "rewards/margins": 0.12493747472763062, + "rewards/rejected": -0.10558176040649414, + "step": 5119 + }, + { + "epoch": 0.7918035955924995, + "grad_norm": 10.410287857055664, + "learning_rate": 4.0892427540382635e-06, + "logits/chosen": 4.316266059875488, + "logits/rejected": 2.9128799438476562, + "logps/chosen": -330.39459228515625, + "logps/rejected": -263.1448974609375, + "loss": 0.6007, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.3537082076072693, + "rewards/margins": 0.2715470790863037, + "rewards/rejected": 0.08216113597154617, + "step": 5120 + }, + { + "epoch": 0.7919582447322637, + "grad_norm": 3.5756843090057373, + "learning_rate": 4.08895635238859e-06, + "logits/chosen": 10.93596076965332, + "logits/rejected": 6.558873176574707, + "logps/chosen": -249.20327758789062, + "logps/rejected": -189.0355224609375, + "loss": 0.6086, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.23424673080444336, + "rewards/margins": 0.43993523716926575, + "rewards/rejected": -0.2056885063648224, + "step": 5121 + }, + { + "epoch": 0.7921128938720279, + "grad_norm": 4.506368637084961, + "learning_rate": 4.088669950738917e-06, + "logits/chosen": 11.031534194946289, + "logits/rejected": 5.076327323913574, + "logps/chosen": -262.301513671875, + "logps/rejected": -210.02330017089844, + "loss": 0.6183, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.1448369175195694, + "rewards/margins": 0.4317597448825836, + "rewards/rejected": -0.5765966773033142, + "step": 5122 + }, + { + "epoch": 0.792267543011792, + "grad_norm": 4.79826545715332, + "learning_rate": 4.0883835490892434e-06, + "logits/chosen": 8.827932357788086, + "logits/rejected": 4.778672218322754, + "logps/chosen": -157.8374786376953, + "logps/rejected": -113.94066619873047, + "loss": 0.7055, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.3114785850048065, + "rewards/margins": 0.009424515999853611, + "rewards/rejected": 0.3020540475845337, + "step": 5123 + }, + { + "epoch": 0.7924221921515562, + "grad_norm": 4.824007034301758, + "learning_rate": 4.08809714743957e-06, + "logits/chosen": 12.169127464294434, + "logits/rejected": 6.964886665344238, + "logps/chosen": -341.558837890625, + "logps/rejected": -306.52142333984375, + "loss": 0.4289, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.5295025110244751, + "rewards/margins": 0.7243529558181763, + "rewards/rejected": -0.19485044479370117, + "step": 5124 + }, + { + "epoch": 0.7925768412913203, + "grad_norm": 8.1026611328125, + "learning_rate": 4.087810745789897e-06, + "logits/chosen": 6.729506492614746, + "logits/rejected": 7.2052717208862305, + "logps/chosen": -253.596923828125, + "logps/rejected": -289.608154296875, + "loss": 0.7858, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.4378250241279602, + "rewards/margins": -0.023422781378030777, + "rewards/rejected": 0.4612478017807007, + "step": 5125 + }, + { + "epoch": 0.7927314904310845, + "grad_norm": 5.579225540161133, + "learning_rate": 4.0875243441402225e-06, + "logits/chosen": 8.467966079711914, + "logits/rejected": 8.919305801391602, + "logps/chosen": -246.54071044921875, + "logps/rejected": -256.5277404785156, + "loss": 0.5381, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.41607218980789185, + "rewards/margins": 0.6485758423805237, + "rewards/rejected": -0.23250368237495422, + "step": 5126 + }, + { + "epoch": 0.7928861395708486, + "grad_norm": 4.772952556610107, + "learning_rate": 4.087237942490549e-06, + "logits/chosen": 15.768258094787598, + "logits/rejected": 10.663614273071289, + "logps/chosen": -383.0423278808594, + "logps/rejected": -291.88128662109375, + "loss": 0.581, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.5903576612472534, + "rewards/margins": 0.30960386991500854, + "rewards/rejected": 0.2807537317276001, + "step": 5127 + }, + { + "epoch": 0.7930407887106128, + "grad_norm": 4.501585483551025, + "learning_rate": 4.086951540840876e-06, + "logits/chosen": 8.701677322387695, + "logits/rejected": 7.465191841125488, + "logps/chosen": -406.61651611328125, + "logps/rejected": -273.605224609375, + "loss": 0.4708, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.2780885398387909, + "rewards/margins": 0.6573121547698975, + "rewards/rejected": -0.37922361493110657, + "step": 5128 + }, + { + "epoch": 0.7931954378503769, + "grad_norm": 4.429296016693115, + "learning_rate": 4.0866651391912025e-06, + "logits/chosen": 9.797746658325195, + "logits/rejected": 10.280259132385254, + "logps/chosen": -216.98837280273438, + "logps/rejected": -240.43994140625, + "loss": 0.529, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.47766926884651184, + "rewards/margins": 0.5691008567810059, + "rewards/rejected": -0.09143156558275223, + "step": 5129 + }, + { + "epoch": 0.7933500869901411, + "grad_norm": 4.896242618560791, + "learning_rate": 4.086378737541528e-06, + "logits/chosen": 5.520730495452881, + "logits/rejected": 5.328020095825195, + "logps/chosen": -193.28045654296875, + "logps/rejected": -157.02879333496094, + "loss": 0.7188, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.24938923120498657, + "rewards/margins": 0.11459577083587646, + "rewards/rejected": 0.1347934603691101, + "step": 5130 + }, + { + "epoch": 0.7935047361299052, + "grad_norm": 3.339292049407959, + "learning_rate": 4.086092335891855e-06, + "logits/chosen": 10.50387191772461, + "logits/rejected": 11.079645156860352, + "logps/chosen": -208.0048370361328, + "logps/rejected": -234.5660400390625, + "loss": 0.5039, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.34642529487609863, + "rewards/margins": 0.591510534286499, + "rewards/rejected": -0.24508532881736755, + "step": 5131 + }, + { + "epoch": 0.7936593852696694, + "grad_norm": 8.280366897583008, + "learning_rate": 4.085805934242182e-06, + "logits/chosen": 0.9430966377258301, + "logits/rejected": 3.7082700729370117, + "logps/chosen": -307.4939270019531, + "logps/rejected": -318.5357971191406, + "loss": 0.9763, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.2641964852809906, + "rewards/margins": -0.19423817098140717, + "rewards/rejected": 0.45843470096588135, + "step": 5132 + }, + { + "epoch": 0.7938140344094335, + "grad_norm": 18.444961547851562, + "learning_rate": 4.085519532592508e-06, + "logits/chosen": 7.743846893310547, + "logits/rejected": 5.6279168128967285, + "logps/chosen": -255.98977661132812, + "logps/rejected": -207.76181030273438, + "loss": 0.6992, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.13744987547397614, + "rewards/margins": 0.15704721212387085, + "rewards/rejected": -0.019597336649894714, + "step": 5133 + }, + { + "epoch": 0.7939686835491978, + "grad_norm": 5.020827770233154, + "learning_rate": 4.085233130942834e-06, + "logits/chosen": 9.89692211151123, + "logits/rejected": 9.981078147888184, + "logps/chosen": -234.182373046875, + "logps/rejected": -193.4803466796875, + "loss": 0.8592, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.12400683760643005, + "rewards/margins": -0.1671830117702484, + "rewards/rejected": 0.043176181614398956, + "step": 5134 + }, + { + "epoch": 0.794123332688962, + "grad_norm": 4.169794082641602, + "learning_rate": 4.084946729293161e-06, + "logits/chosen": 17.274953842163086, + "logits/rejected": 12.514240264892578, + "logps/chosen": -296.9609375, + "logps/rejected": -228.280029296875, + "loss": 0.5501, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.5431222915649414, + "rewards/margins": 0.3721500337123871, + "rewards/rejected": 0.17097224295139313, + "step": 5135 + }, + { + "epoch": 0.7942779818287261, + "grad_norm": 5.927578926086426, + "learning_rate": 4.084660327643487e-06, + "logits/chosen": 14.158815383911133, + "logits/rejected": 12.72148609161377, + "logps/chosen": -350.7445983886719, + "logps/rejected": -361.63128662109375, + "loss": 0.6859, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.5648687481880188, + "rewards/margins": 0.13774964213371277, + "rewards/rejected": 0.42711907625198364, + "step": 5136 + }, + { + "epoch": 0.7944326309684903, + "grad_norm": 7.3381195068359375, + "learning_rate": 4.084373925993814e-06, + "logits/chosen": 8.293642044067383, + "logits/rejected": 14.127386093139648, + "logps/chosen": -194.8069305419922, + "logps/rejected": -332.56597900390625, + "loss": 0.8826, + "rewards/accuracies": 0.125, + "rewards/chosen": -0.020099062472581863, + "rewards/margins": -0.22488906979560852, + "rewards/rejected": 0.20479002594947815, + "step": 5137 + }, + { + "epoch": 0.7945872801082544, + "grad_norm": 5.287918567657471, + "learning_rate": 4.084087524344141e-06, + "logits/chosen": 6.866658687591553, + "logits/rejected": 8.704523086547852, + "logps/chosen": -183.92649841308594, + "logps/rejected": -299.900146484375, + "loss": 0.8506, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.05032868683338165, + "rewards/margins": -0.1025393009185791, + "rewards/rejected": 0.05221062898635864, + "step": 5138 + }, + { + "epoch": 0.7947419292480186, + "grad_norm": 4.71909236907959, + "learning_rate": 4.0838011226944665e-06, + "logits/chosen": 12.919755935668945, + "logits/rejected": 9.056034088134766, + "logps/chosen": -211.45774841308594, + "logps/rejected": -192.09457397460938, + "loss": 0.5835, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.15688000619411469, + "rewards/margins": 0.3297611474990845, + "rewards/rejected": -0.48664113879203796, + "step": 5139 + }, + { + "epoch": 0.7948965783877827, + "grad_norm": 5.384582042694092, + "learning_rate": 4.083514721044793e-06, + "logits/chosen": 11.911674499511719, + "logits/rejected": 11.493669509887695, + "logps/chosen": -175.20628356933594, + "logps/rejected": -214.06173706054688, + "loss": 0.8418, + "rewards/accuracies": 0.25, + "rewards/chosen": -0.22032225131988525, + "rewards/margins": -0.1977851390838623, + "rewards/rejected": -0.022537142038345337, + "step": 5140 + }, + { + "epoch": 0.7950512275275469, + "grad_norm": 7.07741641998291, + "learning_rate": 4.08322831939512e-06, + "logits/chosen": 8.259150505065918, + "logits/rejected": 12.604560852050781, + "logps/chosen": -256.9228515625, + "logps/rejected": -376.0733642578125, + "loss": 0.9283, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.2508412301540375, + "rewards/margins": -0.310799777507782, + "rewards/rejected": 0.5616410374641418, + "step": 5141 + }, + { + "epoch": 0.795205876667311, + "grad_norm": 6.959361553192139, + "learning_rate": 4.082941917745446e-06, + "logits/chosen": 6.577674865722656, + "logits/rejected": 6.3845601081848145, + "logps/chosen": -282.6300354003906, + "logps/rejected": -269.59259033203125, + "loss": 0.8414, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.5009900331497192, + "rewards/margins": -0.14512671530246735, + "rewards/rejected": 0.6461167335510254, + "step": 5142 + }, + { + "epoch": 0.7953605258070752, + "grad_norm": 4.713140964508057, + "learning_rate": 4.082655516095773e-06, + "logits/chosen": 9.76624584197998, + "logits/rejected": 7.391097545623779, + "logps/chosen": -266.0717468261719, + "logps/rejected": -271.6428527832031, + "loss": 0.5893, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.14975817501544952, + "rewards/margins": 0.2693978250026703, + "rewards/rejected": -0.11963966488838196, + "step": 5143 + }, + { + "epoch": 0.7955151749468393, + "grad_norm": 5.34617805480957, + "learning_rate": 4.0823691144461e-06, + "logits/chosen": 12.2053861618042, + "logits/rejected": 12.456269264221191, + "logps/chosen": -198.55780029296875, + "logps/rejected": -211.1479949951172, + "loss": 0.765, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.17930379509925842, + "rewards/margins": -0.00016549229621887207, + "rewards/rejected": -0.17913833260536194, + "step": 5144 + }, + { + "epoch": 0.7956698240866035, + "grad_norm": 5.029722213745117, + "learning_rate": 4.082082712796426e-06, + "logits/chosen": 11.243890762329102, + "logits/rejected": 9.897314071655273, + "logps/chosen": -294.2127685546875, + "logps/rejected": -262.984375, + "loss": 0.6215, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.2950325012207031, + "rewards/margins": 0.19400739669799805, + "rewards/rejected": 0.10102510452270508, + "step": 5145 + }, + { + "epoch": 0.7958244732263676, + "grad_norm": 6.0550856590271, + "learning_rate": 4.081796311146752e-06, + "logits/chosen": 11.434615135192871, + "logits/rejected": 5.453403949737549, + "logps/chosen": -260.6808776855469, + "logps/rejected": -228.1754913330078, + "loss": 0.6661, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.13278117775917053, + "rewards/margins": 0.10964999347925186, + "rewards/rejected": 0.023131176829338074, + "step": 5146 + }, + { + "epoch": 0.7959791223661319, + "grad_norm": 4.228728294372559, + "learning_rate": 4.081509909497079e-06, + "logits/chosen": 11.719847679138184, + "logits/rejected": 9.542858123779297, + "logps/chosen": -258.9104309082031, + "logps/rejected": -271.6875915527344, + "loss": 0.4688, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.3374248743057251, + "rewards/margins": 0.5757278800010681, + "rewards/rejected": -0.2383030354976654, + "step": 5147 + }, + { + "epoch": 0.796133771505896, + "grad_norm": 5.485743045806885, + "learning_rate": 4.0812235078474055e-06, + "logits/chosen": 2.7935211658477783, + "logits/rejected": 2.319593906402588, + "logps/chosen": -298.6899719238281, + "logps/rejected": -220.15921020507812, + "loss": 0.7439, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.12295404076576233, + "rewards/margins": -0.014685973525047302, + "rewards/rejected": 0.13763999938964844, + "step": 5148 + }, + { + "epoch": 0.7962884206456602, + "grad_norm": 6.5264506340026855, + "learning_rate": 4.080937106197732e-06, + "logits/chosen": 9.92123794555664, + "logits/rejected": 10.865812301635742, + "logps/chosen": -254.81570434570312, + "logps/rejected": -238.35000610351562, + "loss": 0.7695, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.34687894582748413, + "rewards/margins": -0.08180607110261917, + "rewards/rejected": 0.4286850094795227, + "step": 5149 + }, + { + "epoch": 0.7964430697854243, + "grad_norm": 4.936883449554443, + "learning_rate": 4.080650704548059e-06, + "logits/chosen": 11.194367408752441, + "logits/rejected": 11.334209442138672, + "logps/chosen": -346.46539306640625, + "logps/rejected": -296.0880432128906, + "loss": 0.4961, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.00530165433883667, + "rewards/margins": 0.5895830392837524, + "rewards/rejected": -0.5948846936225891, + "step": 5150 + }, + { + "epoch": 0.7965977189251885, + "grad_norm": 5.078946590423584, + "learning_rate": 4.080364302898385e-06, + "logits/chosen": 6.028511047363281, + "logits/rejected": 11.33392333984375, + "logps/chosen": -187.18377685546875, + "logps/rejected": -288.6493225097656, + "loss": 0.7016, + "rewards/accuracies": 0.25, + "rewards/chosen": -0.08418698608875275, + "rewards/margins": -0.006801655516028404, + "rewards/rejected": -0.07738533616065979, + "step": 5151 + }, + { + "epoch": 0.7967523680649526, + "grad_norm": 5.873189926147461, + "learning_rate": 4.080077901248711e-06, + "logits/chosen": 15.880168914794922, + "logits/rejected": 10.889388084411621, + "logps/chosen": -375.6282653808594, + "logps/rejected": -247.41439819335938, + "loss": 0.7363, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.055830977857112885, + "rewards/margins": 0.16678863763809204, + "rewards/rejected": -0.11095762252807617, + "step": 5152 + }, + { + "epoch": 0.7969070172047168, + "grad_norm": 5.783189296722412, + "learning_rate": 4.079791499599038e-06, + "logits/chosen": 12.504559516906738, + "logits/rejected": 7.759441375732422, + "logps/chosen": -251.83560180664062, + "logps/rejected": -265.9884033203125, + "loss": 0.6184, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.09201879799365997, + "rewards/margins": 0.30969229340553284, + "rewards/rejected": -0.21767349541187286, + "step": 5153 + }, + { + "epoch": 0.797061666344481, + "grad_norm": 5.160167217254639, + "learning_rate": 4.0795050979493645e-06, + "logits/chosen": 9.942333221435547, + "logits/rejected": 5.539155006408691, + "logps/chosen": -227.46876525878906, + "logps/rejected": -260.2804260253906, + "loss": 0.4565, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.1997339278459549, + "rewards/margins": 0.7215489149093628, + "rewards/rejected": -0.5218150615692139, + "step": 5154 + }, + { + "epoch": 0.7972163154842451, + "grad_norm": 4.398248195648193, + "learning_rate": 4.079218696299691e-06, + "logits/chosen": 11.980247497558594, + "logits/rejected": 5.4819416999816895, + "logps/chosen": -341.745361328125, + "logps/rejected": -247.32220458984375, + "loss": 0.5341, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.6152309775352478, + "rewards/margins": 0.5775164365768433, + "rewards/rejected": 0.037714630365371704, + "step": 5155 + }, + { + "epoch": 0.7973709646240092, + "grad_norm": 4.17824125289917, + "learning_rate": 4.078932294650018e-06, + "logits/chosen": 15.08259105682373, + "logits/rejected": 12.27125072479248, + "logps/chosen": -300.5875549316406, + "logps/rejected": -242.35751342773438, + "loss": 0.5091, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.4908035695552826, + "rewards/margins": 0.4971941113471985, + "rewards/rejected": -0.006390571594238281, + "step": 5156 + }, + { + "epoch": 0.7975256137637734, + "grad_norm": 4.464823246002197, + "learning_rate": 4.0786458930003445e-06, + "logits/chosen": 11.842748641967773, + "logits/rejected": 6.18039083480835, + "logps/chosen": -341.8172302246094, + "logps/rejected": -256.6737976074219, + "loss": 0.5122, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.25513607263565063, + "rewards/margins": 0.44512036442756653, + "rewards/rejected": -0.18998433649539948, + "step": 5157 + }, + { + "epoch": 0.7976802629035376, + "grad_norm": 3.3774492740631104, + "learning_rate": 4.078359491350671e-06, + "logits/chosen": 10.157923698425293, + "logits/rejected": 9.001262664794922, + "logps/chosen": -164.86631774902344, + "logps/rejected": -212.5494384765625, + "loss": 0.5367, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.19584083557128906, + "rewards/margins": 0.4724709391593933, + "rewards/rejected": -0.27663013339042664, + "step": 5158 + }, + { + "epoch": 0.7978349120433018, + "grad_norm": 4.394164085388184, + "learning_rate": 4.078073089700997e-06, + "logits/chosen": 7.729325294494629, + "logits/rejected": 7.729151725769043, + "logps/chosen": -284.0955810546875, + "logps/rejected": -264.1944274902344, + "loss": 0.5913, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.5126651525497437, + "rewards/margins": 0.28072088956832886, + "rewards/rejected": 0.23194430768489838, + "step": 5159 + }, + { + "epoch": 0.797989561183066, + "grad_norm": 4.99221658706665, + "learning_rate": 4.0777866880513236e-06, + "logits/chosen": 2.469189405441284, + "logits/rejected": -1.3221299648284912, + "logps/chosen": -259.8636169433594, + "logps/rejected": -167.8727569580078, + "loss": 0.588, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.1812053620815277, + "rewards/margins": 0.43950381875038147, + "rewards/rejected": -0.25829845666885376, + "step": 5160 + }, + { + "epoch": 0.7981442103228301, + "grad_norm": 4.671767234802246, + "learning_rate": 4.07750028640165e-06, + "logits/chosen": 9.269762992858887, + "logits/rejected": 2.148052215576172, + "logps/chosen": -201.21466064453125, + "logps/rejected": -128.78848266601562, + "loss": 0.5709, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.11846806108951569, + "rewards/margins": 0.3675863742828369, + "rewards/rejected": -0.24911832809448242, + "step": 5161 + }, + { + "epoch": 0.7982988594625943, + "grad_norm": 4.324918270111084, + "learning_rate": 4.077213884751977e-06, + "logits/chosen": 11.156030654907227, + "logits/rejected": 7.358633041381836, + "logps/chosen": -251.2011260986328, + "logps/rejected": -219.09864807128906, + "loss": 0.5234, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.2468796819448471, + "rewards/margins": 0.493645578622818, + "rewards/rejected": -0.2467658817768097, + "step": 5162 + }, + { + "epoch": 0.7984535086023584, + "grad_norm": 4.701037883758545, + "learning_rate": 4.0769274831023035e-06, + "logits/chosen": 7.528332710266113, + "logits/rejected": 2.950017213821411, + "logps/chosen": -323.69158935546875, + "logps/rejected": -219.5825958251953, + "loss": 0.4739, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.10166644304990768, + "rewards/margins": 0.587394654750824, + "rewards/rejected": -0.4857281744480133, + "step": 5163 + }, + { + "epoch": 0.7986081577421226, + "grad_norm": 10.261622428894043, + "learning_rate": 4.076641081452629e-06, + "logits/chosen": 13.693697929382324, + "logits/rejected": 0.9454028606414795, + "logps/chosen": -288.722900390625, + "logps/rejected": -178.91441345214844, + "loss": 0.7346, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.1125628650188446, + "rewards/margins": 0.14303874969482422, + "rewards/rejected": -0.030475907027721405, + "step": 5164 + }, + { + "epoch": 0.7987628068818867, + "grad_norm": 5.021134853363037, + "learning_rate": 4.076354679802956e-06, + "logits/chosen": 14.34484577178955, + "logits/rejected": 11.020774841308594, + "logps/chosen": -276.3098449707031, + "logps/rejected": -214.42710876464844, + "loss": 0.7184, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.07126487791538239, + "rewards/margins": 0.03213825076818466, + "rewards/rejected": 0.039126645773649216, + "step": 5165 + }, + { + "epoch": 0.7989174560216509, + "grad_norm": 5.3498215675354, + "learning_rate": 4.076068278153283e-06, + "logits/chosen": 8.067974090576172, + "logits/rejected": 2.415658473968506, + "logps/chosen": -351.56158447265625, + "logps/rejected": -266.89569091796875, + "loss": 0.5354, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.2089136838912964, + "rewards/margins": 0.5643680691719055, + "rewards/rejected": -0.35545435547828674, + "step": 5166 + }, + { + "epoch": 0.799072105161415, + "grad_norm": 4.583428382873535, + "learning_rate": 4.075781876503609e-06, + "logits/chosen": 8.18742847442627, + "logits/rejected": 11.306123733520508, + "logps/chosen": -252.96444702148438, + "logps/rejected": -219.61077880859375, + "loss": 0.6739, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.09037430584430695, + "rewards/margins": 0.24785694479942322, + "rewards/rejected": -0.15748265385627747, + "step": 5167 + }, + { + "epoch": 0.7992267543011792, + "grad_norm": 6.829429626464844, + "learning_rate": 4.075495474853935e-06, + "logits/chosen": 5.8625688552856445, + "logits/rejected": 11.135774612426758, + "logps/chosen": -226.4049835205078, + "logps/rejected": -315.36285400390625, + "loss": 0.8188, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.18695230782032013, + "rewards/margins": -0.19627505540847778, + "rewards/rejected": 0.3832273483276367, + "step": 5168 + }, + { + "epoch": 0.7993814034409433, + "grad_norm": 5.711610794067383, + "learning_rate": 4.075209073204262e-06, + "logits/chosen": 6.895674705505371, + "logits/rejected": 2.1807947158813477, + "logps/chosen": -200.90115356445312, + "logps/rejected": -177.5023956298828, + "loss": 0.7065, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.1053914874792099, + "rewards/margins": 0.018756121397018433, + "rewards/rejected": 0.08663541078567505, + "step": 5169 + }, + { + "epoch": 0.7995360525807075, + "grad_norm": 8.075103759765625, + "learning_rate": 4.074922671554588e-06, + "logits/chosen": 12.314367294311523, + "logits/rejected": 8.375946044921875, + "logps/chosen": -521.67529296875, + "logps/rejected": -387.8454284667969, + "loss": 0.7433, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.09303244203329086, + "rewards/margins": -0.03562374413013458, + "rewards/rejected": 0.12865619361400604, + "step": 5170 + }, + { + "epoch": 0.7996907017204716, + "grad_norm": 6.888089656829834, + "learning_rate": 4.074636269904915e-06, + "logits/chosen": 11.074056625366211, + "logits/rejected": 11.427193641662598, + "logps/chosen": -266.93377685546875, + "logps/rejected": -346.3176574707031, + "loss": 0.7831, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.06091861426830292, + "rewards/margins": -0.06390589475631714, + "rewards/rejected": 0.12482450902462006, + "step": 5171 + }, + { + "epoch": 0.7998453508602359, + "grad_norm": 4.3380560874938965, + "learning_rate": 4.074349868255242e-06, + "logits/chosen": 5.799299240112305, + "logits/rejected": 6.195982933044434, + "logps/chosen": -266.902099609375, + "logps/rejected": -223.0730743408203, + "loss": 0.6685, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.16649918258190155, + "rewards/margins": 0.17663513123989105, + "rewards/rejected": -0.010135941207408905, + "step": 5172 + }, + { + "epoch": 0.8, + "grad_norm": 6.414584636688232, + "learning_rate": 4.0740634666055675e-06, + "logits/chosen": 8.644225120544434, + "logits/rejected": 7.706579208374023, + "logps/chosen": -278.543701171875, + "logps/rejected": -291.5439453125, + "loss": 0.6804, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.5179382562637329, + "rewards/margins": 0.40932753682136536, + "rewards/rejected": 0.10861071944236755, + "step": 5173 + }, + { + "epoch": 0.8001546491397642, + "grad_norm": 5.999967098236084, + "learning_rate": 4.073777064955894e-06, + "logits/chosen": 11.229823112487793, + "logits/rejected": 12.47011947631836, + "logps/chosen": -307.052978515625, + "logps/rejected": -309.1876220703125, + "loss": 0.7303, + "rewards/accuracies": 0.25, + "rewards/chosen": 0.2495204359292984, + "rewards/margins": 0.010356783866882324, + "rewards/rejected": 0.23916365206241608, + "step": 5174 + }, + { + "epoch": 0.8003092982795283, + "grad_norm": 6.367274761199951, + "learning_rate": 4.073490663306221e-06, + "logits/chosen": 10.578140258789062, + "logits/rejected": 3.7615132331848145, + "logps/chosen": -334.2843017578125, + "logps/rejected": -194.01795959472656, + "loss": 0.6853, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.15161211788654327, + "rewards/margins": 0.1886766105890274, + "rewards/rejected": -0.3402886986732483, + "step": 5175 + }, + { + "epoch": 0.8004639474192925, + "grad_norm": 3.986971855163574, + "learning_rate": 4.0732042616565474e-06, + "logits/chosen": 15.800336837768555, + "logits/rejected": 9.159343719482422, + "logps/chosen": -207.65200805664062, + "logps/rejected": -136.0389862060547, + "loss": 0.601, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.12586450576782227, + "rewards/margins": 0.3264733552932739, + "rewards/rejected": -0.20060878992080688, + "step": 5176 + }, + { + "epoch": 0.8006185965590567, + "grad_norm": 7.346532821655273, + "learning_rate": 4.072917860006874e-06, + "logits/chosen": 13.264067649841309, + "logits/rejected": 13.319330215454102, + "logps/chosen": -360.99481201171875, + "logps/rejected": -331.4978942871094, + "loss": 0.9524, + "rewards/accuracies": 0.25, + "rewards/chosen": 0.03138361871242523, + "rewards/margins": -0.38953450322151184, + "rewards/rejected": 0.42091816663742065, + "step": 5177 + }, + { + "epoch": 0.8007732456988208, + "grad_norm": 5.181656837463379, + "learning_rate": 4.0726314583572e-06, + "logits/chosen": 14.349178314208984, + "logits/rejected": 10.960301399230957, + "logps/chosen": -283.50250244140625, + "logps/rejected": -264.8973083496094, + "loss": 0.7506, + "rewards/accuracies": 0.25, + "rewards/chosen": -0.07823530584573746, + "rewards/margins": 0.05874181538820267, + "rewards/rejected": -0.13697710633277893, + "step": 5178 + }, + { + "epoch": 0.800927894838585, + "grad_norm": 6.109396457672119, + "learning_rate": 4.0723450567075265e-06, + "logits/chosen": 9.910364151000977, + "logits/rejected": 7.063610553741455, + "logps/chosen": -280.49462890625, + "logps/rejected": -298.12835693359375, + "loss": 0.6456, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.14363422989845276, + "rewards/margins": 0.1463633030653, + "rewards/rejected": -0.28999754786491394, + "step": 5179 + }, + { + "epoch": 0.8010825439783491, + "grad_norm": 5.099960803985596, + "learning_rate": 4.072058655057853e-06, + "logits/chosen": 9.606693267822266, + "logits/rejected": 4.290958881378174, + "logps/chosen": -226.71255493164062, + "logps/rejected": -218.27362060546875, + "loss": 0.6723, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.12202448397874832, + "rewards/margins": 0.11381072551012039, + "rewards/rejected": 0.008213765919208527, + "step": 5180 + }, + { + "epoch": 0.8012371931181133, + "grad_norm": 5.887612819671631, + "learning_rate": 4.07177225340818e-06, + "logits/chosen": 12.942381858825684, + "logits/rejected": 11.690461158752441, + "logps/chosen": -327.50445556640625, + "logps/rejected": -358.5029296875, + "loss": 0.5994, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.09523968398571014, + "rewards/margins": 0.32843297719955444, + "rewards/rejected": -0.2331932932138443, + "step": 5181 + }, + { + "epoch": 0.8013918422578774, + "grad_norm": 6.461954116821289, + "learning_rate": 4.0714858517585065e-06, + "logits/chosen": 12.46915054321289, + "logits/rejected": 12.396735191345215, + "logps/chosen": -220.73936462402344, + "logps/rejected": -184.2044677734375, + "loss": 0.883, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.20647715032100677, + "rewards/margins": -0.2857552170753479, + "rewards/rejected": 0.07927803695201874, + "step": 5182 + }, + { + "epoch": 0.8015464913976416, + "grad_norm": 5.9712018966674805, + "learning_rate": 4.071199450108833e-06, + "logits/chosen": 8.546089172363281, + "logits/rejected": 7.132925033569336, + "logps/chosen": -339.44329833984375, + "logps/rejected": -394.236328125, + "loss": 0.5822, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.3559885025024414, + "rewards/margins": 0.3481229543685913, + "rewards/rejected": 0.007865525782108307, + "step": 5183 + }, + { + "epoch": 0.8017011405374057, + "grad_norm": 5.764330863952637, + "learning_rate": 4.07091304845916e-06, + "logits/chosen": 6.275364398956299, + "logits/rejected": 7.319153308868408, + "logps/chosen": -281.5815124511719, + "logps/rejected": -280.7547607421875, + "loss": 0.7068, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.09090570360422134, + "rewards/margins": 0.3019055128097534, + "rewards/rejected": -0.39281123876571655, + "step": 5184 + }, + { + "epoch": 0.80185578967717, + "grad_norm": 5.568892478942871, + "learning_rate": 4.070626646809486e-06, + "logits/chosen": 8.021758079528809, + "logits/rejected": 7.616845607757568, + "logps/chosen": -361.82623291015625, + "logps/rejected": -381.05108642578125, + "loss": 0.5372, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.5105154514312744, + "rewards/margins": 0.5455015897750854, + "rewards/rejected": -0.03498610854148865, + "step": 5185 + }, + { + "epoch": 0.8020104388169341, + "grad_norm": 4.404518127441406, + "learning_rate": 4.070340245159812e-06, + "logits/chosen": 10.226493835449219, + "logits/rejected": 6.699295520782471, + "logps/chosen": -327.3751220703125, + "logps/rejected": -282.7770690917969, + "loss": 0.4766, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.42840346693992615, + "rewards/margins": 0.6867592930793762, + "rewards/rejected": -0.2583558261394501, + "step": 5186 + }, + { + "epoch": 0.8021650879566983, + "grad_norm": 4.824042797088623, + "learning_rate": 4.070053843510139e-06, + "logits/chosen": 8.825841903686523, + "logits/rejected": 3.003796339035034, + "logps/chosen": -361.38232421875, + "logps/rejected": -277.3885192871094, + "loss": 0.516, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.17106600105762482, + "rewards/margins": 0.45207738876342773, + "rewards/rejected": -0.2810113728046417, + "step": 5187 + }, + { + "epoch": 0.8023197370964624, + "grad_norm": 5.579853057861328, + "learning_rate": 4.0697674418604655e-06, + "logits/chosen": 5.236525058746338, + "logits/rejected": 12.648462295532227, + "logps/chosen": -198.24014282226562, + "logps/rejected": -352.86212158203125, + "loss": 0.7394, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.0928381085395813, + "rewards/margins": 0.029683902859687805, + "rewards/rejected": -0.12252199649810791, + "step": 5188 + }, + { + "epoch": 0.8024743862362266, + "grad_norm": 4.990927696228027, + "learning_rate": 4.069481040210792e-06, + "logits/chosen": 11.756256103515625, + "logits/rejected": 5.746004104614258, + "logps/chosen": -273.04345703125, + "logps/rejected": -235.57907104492188, + "loss": 0.6993, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.17345009744167328, + "rewards/margins": 0.11696793138980865, + "rewards/rejected": -0.29041802883148193, + "step": 5189 + }, + { + "epoch": 0.8026290353759907, + "grad_norm": 4.492560863494873, + "learning_rate": 4.069194638561119e-06, + "logits/chosen": 15.694756507873535, + "logits/rejected": 9.705060958862305, + "logps/chosen": -294.7191467285156, + "logps/rejected": -239.3253173828125, + "loss": 0.4475, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.6990019083023071, + "rewards/margins": 0.6343756914138794, + "rewards/rejected": 0.06462621688842773, + "step": 5190 + }, + { + "epoch": 0.8027836845157549, + "grad_norm": 5.97263765335083, + "learning_rate": 4.0689082369114455e-06, + "logits/chosen": 7.277877330780029, + "logits/rejected": -0.9505197405815125, + "logps/chosen": -315.8487548828125, + "logps/rejected": -203.42868041992188, + "loss": 0.5871, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.4197215735912323, + "rewards/margins": 0.539340615272522, + "rewards/rejected": -0.11961911618709564, + "step": 5191 + }, + { + "epoch": 0.802938333655519, + "grad_norm": 6.718442440032959, + "learning_rate": 4.068621835261771e-06, + "logits/chosen": 8.153020858764648, + "logits/rejected": 6.387395858764648, + "logps/chosen": -273.1218566894531, + "logps/rejected": -268.9293518066406, + "loss": 0.6694, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.02637871913611889, + "rewards/margins": 0.12925881147384644, + "rewards/rejected": -0.10288010537624359, + "step": 5192 + }, + { + "epoch": 0.8030929827952832, + "grad_norm": 4.039098262786865, + "learning_rate": 4.068335433612098e-06, + "logits/chosen": 6.684462547302246, + "logits/rejected": 8.002167701721191, + "logps/chosen": -171.2302703857422, + "logps/rejected": -238.5595703125, + "loss": 0.5871, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.5200514793395996, + "rewards/margins": 0.3217238187789917, + "rewards/rejected": 0.1983276903629303, + "step": 5193 + }, + { + "epoch": 0.8032476319350473, + "grad_norm": 4.782079696655273, + "learning_rate": 4.068049031962425e-06, + "logits/chosen": 7.347402572631836, + "logits/rejected": 4.715786457061768, + "logps/chosen": -186.6715087890625, + "logps/rejected": -177.94876098632812, + "loss": 0.584, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.09450964629650116, + "rewards/margins": 0.3566397428512573, + "rewards/rejected": -0.4511494040489197, + "step": 5194 + }, + { + "epoch": 0.8034022810748115, + "grad_norm": 8.323575973510742, + "learning_rate": 4.067762630312751e-06, + "logits/chosen": 8.670562744140625, + "logits/rejected": 11.893360137939453, + "logps/chosen": -425.72332763671875, + "logps/rejected": -712.8861083984375, + "loss": 0.736, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.008545875549316406, + "rewards/margins": -0.024723917245864868, + "rewards/rejected": 0.016178037971258163, + "step": 5195 + }, + { + "epoch": 0.8035569302145756, + "grad_norm": 5.844688415527344, + "learning_rate": 4.067476228663078e-06, + "logits/chosen": 6.96752405166626, + "logits/rejected": 10.892728805541992, + "logps/chosen": -225.2916717529297, + "logps/rejected": -256.3316650390625, + "loss": 0.8014, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.19091683626174927, + "rewards/margins": -0.16553544998168945, + "rewards/rejected": 0.3564522862434387, + "step": 5196 + }, + { + "epoch": 0.8037115793543398, + "grad_norm": 5.034675121307373, + "learning_rate": 4.0671898270134046e-06, + "logits/chosen": 8.807684898376465, + "logits/rejected": 7.643858432769775, + "logps/chosen": -209.61215209960938, + "logps/rejected": -233.79409790039062, + "loss": 0.6496, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.12015505135059357, + "rewards/margins": 0.2830027639865875, + "rewards/rejected": -0.16284771263599396, + "step": 5197 + }, + { + "epoch": 0.803866228494104, + "grad_norm": 8.784025192260742, + "learning_rate": 4.06690342536373e-06, + "logits/chosen": 14.334604263305664, + "logits/rejected": 10.856712341308594, + "logps/chosen": -381.6451416015625, + "logps/rejected": -294.3848876953125, + "loss": 0.7972, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.4531240165233612, + "rewards/margins": 0.15361306071281433, + "rewards/rejected": -0.6067371368408203, + "step": 5198 + }, + { + "epoch": 0.8040208776338682, + "grad_norm": 4.886275768280029, + "learning_rate": 4.066617023714057e-06, + "logits/chosen": 8.171455383300781, + "logits/rejected": 6.467146873474121, + "logps/chosen": -201.5238037109375, + "logps/rejected": -208.74974060058594, + "loss": 0.5851, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.11793877929449081, + "rewards/margins": 0.2462824583053589, + "rewards/rejected": -0.12834367156028748, + "step": 5199 + }, + { + "epoch": 0.8041755267736324, + "grad_norm": 15.686994552612305, + "learning_rate": 4.066330622064384e-06, + "logits/chosen": 8.719632148742676, + "logits/rejected": 4.046745777130127, + "logps/chosen": -185.4056396484375, + "logps/rejected": -144.71392822265625, + "loss": 0.6332, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.002722442150115967, + "rewards/margins": 0.20654967427253723, + "rewards/rejected": -0.2092721164226532, + "step": 5200 + }, + { + "epoch": 0.8043301759133965, + "grad_norm": 7.648826599121094, + "learning_rate": 4.06604422041471e-06, + "logits/chosen": 11.504674911499023, + "logits/rejected": 11.624543190002441, + "logps/chosen": -258.1611633300781, + "logps/rejected": -226.80657958984375, + "loss": 0.6269, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.30315646529197693, + "rewards/margins": 0.18724174797534943, + "rewards/rejected": 0.1159147173166275, + "step": 5201 + }, + { + "epoch": 0.8044848250531607, + "grad_norm": 8.368888854980469, + "learning_rate": 4.065757818765036e-06, + "logits/chosen": 11.93128776550293, + "logits/rejected": 9.404498100280762, + "logps/chosen": -400.0206298828125, + "logps/rejected": -376.31103515625, + "loss": 0.7915, + "rewards/accuracies": 0.25, + "rewards/chosen": 0.03614950552582741, + "rewards/margins": -0.14478681981563568, + "rewards/rejected": 0.18093635141849518, + "step": 5202 + }, + { + "epoch": 0.8046394741929248, + "grad_norm": 4.903611183166504, + "learning_rate": 4.065471417115363e-06, + "logits/chosen": 9.074488639831543, + "logits/rejected": 2.2496421337127686, + "logps/chosen": -298.34027099609375, + "logps/rejected": -234.4497528076172, + "loss": 0.7267, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.21946890652179718, + "rewards/margins": 0.04027573764324188, + "rewards/rejected": 0.1791931688785553, + "step": 5203 + }, + { + "epoch": 0.804794123332689, + "grad_norm": 4.993732929229736, + "learning_rate": 4.065185015465689e-06, + "logits/chosen": 7.348865509033203, + "logits/rejected": 8.633148193359375, + "logps/chosen": -189.44631958007812, + "logps/rejected": -194.25660705566406, + "loss": 0.6801, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.07526562362909317, + "rewards/margins": 0.07058967649936676, + "rewards/rejected": 0.004675954580307007, + "step": 5204 + }, + { + "epoch": 0.8049487724724531, + "grad_norm": 4.015836238861084, + "learning_rate": 4.064898613816016e-06, + "logits/chosen": 15.522954940795898, + "logits/rejected": 11.1107759475708, + "logps/chosen": -212.26478576660156, + "logps/rejected": -184.9337158203125, + "loss": 0.6158, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.28311431407928467, + "rewards/margins": 0.28049972653388977, + "rewards/rejected": 0.002614624798297882, + "step": 5205 + }, + { + "epoch": 0.8051034216122173, + "grad_norm": 6.178126811981201, + "learning_rate": 4.064612212166342e-06, + "logits/chosen": 17.10051155090332, + "logits/rejected": 9.199691772460938, + "logps/chosen": -404.9537048339844, + "logps/rejected": -358.705810546875, + "loss": 0.7296, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.2712125778198242, + "rewards/margins": 0.037758439779281616, + "rewards/rejected": 0.2334541529417038, + "step": 5206 + }, + { + "epoch": 0.8052580707519814, + "grad_norm": 5.445850372314453, + "learning_rate": 4.0643258105166685e-06, + "logits/chosen": 15.465826034545898, + "logits/rejected": 11.676907539367676, + "logps/chosen": -260.40869140625, + "logps/rejected": -220.15194702148438, + "loss": 0.7879, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.1517251878976822, + "rewards/margins": -0.15694619715213776, + "rewards/rejected": 0.005220990628004074, + "step": 5207 + }, + { + "epoch": 0.8054127198917456, + "grad_norm": 10.264176368713379, + "learning_rate": 4.064039408866995e-06, + "logits/chosen": 10.345460891723633, + "logits/rejected": 10.223146438598633, + "logps/chosen": -510.39801025390625, + "logps/rejected": -551.498291015625, + "loss": 0.7476, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.11689186096191406, + "rewards/margins": -0.03275033086538315, + "rewards/rejected": 0.1496421843767166, + "step": 5208 + }, + { + "epoch": 0.8055673690315097, + "grad_norm": 5.232631206512451, + "learning_rate": 4.063753007217322e-06, + "logits/chosen": 12.069369316101074, + "logits/rejected": 9.627613067626953, + "logps/chosen": -326.07733154296875, + "logps/rejected": -250.61163330078125, + "loss": 0.5366, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.26227816939353943, + "rewards/margins": 0.47139719128608704, + "rewards/rejected": -0.2091190218925476, + "step": 5209 + }, + { + "epoch": 0.8057220181712739, + "grad_norm": 4.837009429931641, + "learning_rate": 4.0634666055676485e-06, + "logits/chosen": 9.050187110900879, + "logits/rejected": 8.561442375183105, + "logps/chosen": -376.6332092285156, + "logps/rejected": -324.16778564453125, + "loss": 0.4906, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.6458479166030884, + "rewards/margins": 0.6174514889717102, + "rewards/rejected": 0.02839641273021698, + "step": 5210 + }, + { + "epoch": 0.8058766673110381, + "grad_norm": 8.03756332397461, + "learning_rate": 4.063180203917974e-06, + "logits/chosen": 16.023622512817383, + "logits/rejected": 9.541019439697266, + "logps/chosen": -312.97705078125, + "logps/rejected": -287.52032470703125, + "loss": 0.5909, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.0995698869228363, + "rewards/margins": 0.44314491748809814, + "rewards/rejected": -0.34357503056526184, + "step": 5211 + }, + { + "epoch": 0.8060313164508023, + "grad_norm": 11.008404731750488, + "learning_rate": 4.062893802268301e-06, + "logits/chosen": 5.884028434753418, + "logits/rejected": 8.004257202148438, + "logps/chosen": -428.6747131347656, + "logps/rejected": -402.5483093261719, + "loss": 0.7213, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.22617530822753906, + "rewards/margins": 0.05461283028125763, + "rewards/rejected": -0.2807881534099579, + "step": 5212 + }, + { + "epoch": 0.8061859655905664, + "grad_norm": 5.181330680847168, + "learning_rate": 4.0626074006186276e-06, + "logits/chosen": 5.363044738769531, + "logits/rejected": 10.80592155456543, + "logps/chosen": -169.23484802246094, + "logps/rejected": -206.7460479736328, + "loss": 0.6371, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.06880469620227814, + "rewards/margins": 0.16563266515731812, + "rewards/rejected": -0.09682796150445938, + "step": 5213 + }, + { + "epoch": 0.8063406147303306, + "grad_norm": 6.119858264923096, + "learning_rate": 4.062320998968954e-06, + "logits/chosen": 9.56867790222168, + "logits/rejected": 6.6196770668029785, + "logps/chosen": -289.2655029296875, + "logps/rejected": -191.4665069580078, + "loss": 0.6223, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.06160430610179901, + "rewards/margins": 0.3181297481060028, + "rewards/rejected": -0.379734069108963, + "step": 5214 + }, + { + "epoch": 0.8064952638700947, + "grad_norm": 5.101420879364014, + "learning_rate": 4.062034597319281e-06, + "logits/chosen": 7.706490516662598, + "logits/rejected": 7.361589431762695, + "logps/chosen": -195.20339965820312, + "logps/rejected": -166.02003479003906, + "loss": 0.8156, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.1350744217634201, + "rewards/margins": -0.071823850274086, + "rewards/rejected": 0.2068982720375061, + "step": 5215 + }, + { + "epoch": 0.8066499130098589, + "grad_norm": 5.387447834014893, + "learning_rate": 4.0617481956696075e-06, + "logits/chosen": 13.040346145629883, + "logits/rejected": 8.966412544250488, + "logps/chosen": -241.52308654785156, + "logps/rejected": -196.57492065429688, + "loss": 0.6967, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.006958387792110443, + "rewards/margins": 0.011606879532337189, + "rewards/rejected": -0.01856527104973793, + "step": 5216 + }, + { + "epoch": 0.806804562149623, + "grad_norm": 5.955066204071045, + "learning_rate": 4.061461794019934e-06, + "logits/chosen": 10.688066482543945, + "logits/rejected": 6.326800346374512, + "logps/chosen": -287.8813171386719, + "logps/rejected": -230.87631225585938, + "loss": 0.6571, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.34742462635040283, + "rewards/margins": 0.12873634696006775, + "rewards/rejected": -0.47616100311279297, + "step": 5217 + }, + { + "epoch": 0.8069592112893872, + "grad_norm": 5.2139506340026855, + "learning_rate": 4.06117539237026e-06, + "logits/chosen": 12.241575241088867, + "logits/rejected": 10.76771354675293, + "logps/chosen": -346.0780334472656, + "logps/rejected": -317.2270202636719, + "loss": 0.6177, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.056915730237960815, + "rewards/margins": 0.30003562569618225, + "rewards/rejected": -0.24311991035938263, + "step": 5218 + }, + { + "epoch": 0.8071138604291513, + "grad_norm": 3.31199049949646, + "learning_rate": 4.060888990720587e-06, + "logits/chosen": 8.206533432006836, + "logits/rejected": 9.445413589477539, + "logps/chosen": -111.2761459350586, + "logps/rejected": -130.78717041015625, + "loss": 0.68, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.4808102250099182, + "rewards/margins": 0.10177576541900635, + "rewards/rejected": -0.5825859904289246, + "step": 5219 + }, + { + "epoch": 0.8072685095689155, + "grad_norm": 6.173686981201172, + "learning_rate": 4.060602589070913e-06, + "logits/chosen": 6.404994010925293, + "logits/rejected": 4.3115410804748535, + "logps/chosen": -341.13824462890625, + "logps/rejected": -216.08740234375, + "loss": 0.6169, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.1019790843129158, + "rewards/margins": 0.29108184576034546, + "rewards/rejected": -0.18910276889801025, + "step": 5220 + }, + { + "epoch": 0.8074231587086796, + "grad_norm": 6.065658092498779, + "learning_rate": 4.06031618742124e-06, + "logits/chosen": 10.73746395111084, + "logits/rejected": 4.924856185913086, + "logps/chosen": -339.2300720214844, + "logps/rejected": -298.42193603515625, + "loss": 0.4699, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.23669365048408508, + "rewards/margins": 0.6594338417053223, + "rewards/rejected": -0.42274025082588196, + "step": 5221 + }, + { + "epoch": 0.8075778078484438, + "grad_norm": 5.646334648132324, + "learning_rate": 4.060029785771567e-06, + "logits/chosen": 15.446451187133789, + "logits/rejected": 10.548297882080078, + "logps/chosen": -310.388916015625, + "logps/rejected": -259.560302734375, + "loss": 0.5496, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.3516327738761902, + "rewards/margins": 0.5002945065498352, + "rewards/rejected": -0.14866170287132263, + "step": 5222 + }, + { + "epoch": 0.8077324569882081, + "grad_norm": 4.586674213409424, + "learning_rate": 4.059743384121893e-06, + "logits/chosen": 15.566143035888672, + "logits/rejected": 6.519749164581299, + "logps/chosen": -305.5724182128906, + "logps/rejected": -173.30209350585938, + "loss": 0.4968, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.22559680044651031, + "rewards/margins": 0.49867165088653564, + "rewards/rejected": -0.2730748653411865, + "step": 5223 + }, + { + "epoch": 0.8078871061279722, + "grad_norm": 5.123507499694824, + "learning_rate": 4.05945698247222e-06, + "logits/chosen": 10.788810729980469, + "logits/rejected": 12.295494079589844, + "logps/chosen": -246.04440307617188, + "logps/rejected": -208.91676330566406, + "loss": 0.6507, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.06893173605203629, + "rewards/margins": 0.13411402702331543, + "rewards/rejected": -0.06518229842185974, + "step": 5224 + }, + { + "epoch": 0.8080417552677364, + "grad_norm": 5.3833465576171875, + "learning_rate": 4.059170580822546e-06, + "logits/chosen": 15.8935546875, + "logits/rejected": 8.98311996459961, + "logps/chosen": -345.6173095703125, + "logps/rejected": -268.0266418457031, + "loss": 0.6135, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.2668483853340149, + "rewards/margins": 0.32655489444732666, + "rewards/rejected": -0.059706494212150574, + "step": 5225 + }, + { + "epoch": 0.8081964044075005, + "grad_norm": 6.116227149963379, + "learning_rate": 4.058884179172872e-06, + "logits/chosen": 9.894168853759766, + "logits/rejected": 6.375942707061768, + "logps/chosen": -312.7122802734375, + "logps/rejected": -237.5018310546875, + "loss": 0.5712, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.057060711085796356, + "rewards/margins": 0.3607668876647949, + "rewards/rejected": -0.30370616912841797, + "step": 5226 + }, + { + "epoch": 0.8083510535472647, + "grad_norm": 4.162163257598877, + "learning_rate": 4.058597777523199e-06, + "logits/chosen": 14.81265640258789, + "logits/rejected": 0.16877001523971558, + "logps/chosen": -325.4465637207031, + "logps/rejected": -185.43394470214844, + "loss": 0.4337, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.08990420401096344, + "rewards/margins": 0.8772229552268982, + "rewards/rejected": -0.9671271443367004, + "step": 5227 + }, + { + "epoch": 0.8085057026870288, + "grad_norm": 7.506683349609375, + "learning_rate": 4.058311375873526e-06, + "logits/chosen": 8.362614631652832, + "logits/rejected": 11.137846946716309, + "logps/chosen": -246.98936462402344, + "logps/rejected": -203.04798889160156, + "loss": 0.6791, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.015744924545288086, + "rewards/margins": 0.12110400199890137, + "rewards/rejected": -0.10535907745361328, + "step": 5228 + }, + { + "epoch": 0.808660351826793, + "grad_norm": 4.190384864807129, + "learning_rate": 4.058024974223852e-06, + "logits/chosen": 15.405750274658203, + "logits/rejected": 14.223522186279297, + "logps/chosen": -324.0042724609375, + "logps/rejected": -241.855224609375, + "loss": 0.4907, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.07716482877731323, + "rewards/margins": 0.6256026029586792, + "rewards/rejected": -0.548437774181366, + "step": 5229 + }, + { + "epoch": 0.8088150009665571, + "grad_norm": 13.006925582885742, + "learning_rate": 4.057738572574179e-06, + "logits/chosen": 10.146551132202148, + "logits/rejected": 7.837651252746582, + "logps/chosen": -316.58941650390625, + "logps/rejected": -239.23733520507812, + "loss": 1.0397, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.233009472489357, + "rewards/margins": -0.43496549129486084, + "rewards/rejected": 0.20195600390434265, + "step": 5230 + }, + { + "epoch": 0.8089696501063213, + "grad_norm": 6.152152061462402, + "learning_rate": 4.057452170924505e-06, + "logits/chosen": 8.831681251525879, + "logits/rejected": 2.48677396774292, + "logps/chosen": -428.4947814941406, + "logps/rejected": -440.7967224121094, + "loss": 0.5935, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.35941529273986816, + "rewards/margins": 0.5527947545051575, + "rewards/rejected": -0.1933795064687729, + "step": 5231 + }, + { + "epoch": 0.8091242992460854, + "grad_norm": 5.133934497833252, + "learning_rate": 4.057165769274831e-06, + "logits/chosen": 10.606398582458496, + "logits/rejected": 4.552722930908203, + "logps/chosen": -274.0418701171875, + "logps/rejected": -207.39620971679688, + "loss": 0.5806, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.29895949363708496, + "rewards/margins": 0.3206798732280731, + "rewards/rejected": -0.021720364689826965, + "step": 5232 + }, + { + "epoch": 0.8092789483858496, + "grad_norm": 5.302076816558838, + "learning_rate": 4.056879367625158e-06, + "logits/chosen": 13.161714553833008, + "logits/rejected": 3.288963794708252, + "logps/chosen": -606.9154052734375, + "logps/rejected": -284.2491760253906, + "loss": 0.4936, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.5333611369132996, + "rewards/margins": 0.7942339181900024, + "rewards/rejected": -0.2608727812767029, + "step": 5233 + }, + { + "epoch": 0.8094335975256137, + "grad_norm": 4.196181297302246, + "learning_rate": 4.056592965975485e-06, + "logits/chosen": 8.693201065063477, + "logits/rejected": 4.886995315551758, + "logps/chosen": -333.12176513671875, + "logps/rejected": -240.75408935546875, + "loss": 0.4369, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.4680570960044861, + "rewards/margins": 0.6776407957077026, + "rewards/rejected": -0.20958368480205536, + "step": 5234 + }, + { + "epoch": 0.8095882466653779, + "grad_norm": 5.188414573669434, + "learning_rate": 4.056306564325811e-06, + "logits/chosen": 7.497532844543457, + "logits/rejected": 9.022340774536133, + "logps/chosen": -290.15472412109375, + "logps/rejected": -330.5630187988281, + "loss": 0.6783, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.2860964834690094, + "rewards/margins": 0.06554519385099411, + "rewards/rejected": 0.22055131196975708, + "step": 5235 + }, + { + "epoch": 0.8097428958051421, + "grad_norm": 19.002063751220703, + "learning_rate": 4.056020162676137e-06, + "logits/chosen": 3.7412750720977783, + "logits/rejected": 8.126347541809082, + "logps/chosen": -236.96688842773438, + "logps/rejected": -321.1217041015625, + "loss": 0.5616, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.11160522699356079, + "rewards/margins": 0.35723477602005005, + "rewards/rejected": -0.24562956392765045, + "step": 5236 + }, + { + "epoch": 0.8098975449449063, + "grad_norm": 9.314631462097168, + "learning_rate": 4.055733761026464e-06, + "logits/chosen": 11.819740295410156, + "logits/rejected": 9.621925354003906, + "logps/chosen": -459.09344482421875, + "logps/rejected": -346.83538818359375, + "loss": 0.6899, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.2914165258407593, + "rewards/margins": 0.2655884623527527, + "rewards/rejected": 0.025828078389167786, + "step": 5237 + }, + { + "epoch": 0.8100521940846704, + "grad_norm": 5.602231025695801, + "learning_rate": 4.0554473593767904e-06, + "logits/chosen": 2.766555070877075, + "logits/rejected": 4.2995147705078125, + "logps/chosen": -227.91781616210938, + "logps/rejected": -281.0916748046875, + "loss": 0.875, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.061980172991752625, + "rewards/margins": 0.04555127024650574, + "rewards/rejected": -0.10753144323825836, + "step": 5238 + }, + { + "epoch": 0.8102068432244346, + "grad_norm": 5.000399589538574, + "learning_rate": 4.055160957727117e-06, + "logits/chosen": 7.695864200592041, + "logits/rejected": 6.486391067504883, + "logps/chosen": -149.65419006347656, + "logps/rejected": -163.77008056640625, + "loss": 0.625, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.35791176557540894, + "rewards/margins": 0.18745939433574677, + "rewards/rejected": -0.5453711748123169, + "step": 5239 + }, + { + "epoch": 0.8103614923641987, + "grad_norm": 4.8953423500061035, + "learning_rate": 4.054874556077443e-06, + "logits/chosen": 9.88211727142334, + "logits/rejected": 3.8782706260681152, + "logps/chosen": -243.38905334472656, + "logps/rejected": -215.76162719726562, + "loss": 0.6898, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.2254931628704071, + "rewards/margins": 0.2197830080986023, + "rewards/rejected": 0.0057101771235466, + "step": 5240 + }, + { + "epoch": 0.8105161415039629, + "grad_norm": 4.812070369720459, + "learning_rate": 4.0545881544277696e-06, + "logits/chosen": 12.790386199951172, + "logits/rejected": 8.769579887390137, + "logps/chosen": -267.0860595703125, + "logps/rejected": -227.5906982421875, + "loss": 0.6181, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.02703189104795456, + "rewards/margins": 0.3609905242919922, + "rewards/rejected": -0.38802242279052734, + "step": 5241 + }, + { + "epoch": 0.810670790643727, + "grad_norm": 8.099353790283203, + "learning_rate": 4.054301752778096e-06, + "logits/chosen": 7.10730504989624, + "logits/rejected": 4.798792839050293, + "logps/chosen": -346.2955627441406, + "logps/rejected": -336.1260070800781, + "loss": 0.6816, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.21830779314041138, + "rewards/margins": 0.14087443053722382, + "rewards/rejected": 0.07743337005376816, + "step": 5242 + }, + { + "epoch": 0.8108254397834912, + "grad_norm": 5.594520092010498, + "learning_rate": 4.054015351128423e-06, + "logits/chosen": 9.29182243347168, + "logits/rejected": 4.79741096496582, + "logps/chosen": -268.1068115234375, + "logps/rejected": -225.3123321533203, + "loss": 0.6274, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.11372890323400497, + "rewards/margins": 0.37134867906570435, + "rewards/rejected": -0.2576197385787964, + "step": 5243 + }, + { + "epoch": 0.8109800889232553, + "grad_norm": 6.085484981536865, + "learning_rate": 4.053728949478749e-06, + "logits/chosen": 8.205678939819336, + "logits/rejected": 8.612539291381836, + "logps/chosen": -322.44305419921875, + "logps/rejected": -254.89060974121094, + "loss": 0.6224, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.40665608644485474, + "rewards/margins": 0.30481481552124023, + "rewards/rejected": -0.711470901966095, + "step": 5244 + }, + { + "epoch": 0.8111347380630195, + "grad_norm": 12.219535827636719, + "learning_rate": 4.053442547829075e-06, + "logits/chosen": 8.517681121826172, + "logits/rejected": 8.036446571350098, + "logps/chosen": -318.8990173339844, + "logps/rejected": -342.68695068359375, + "loss": 0.4727, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.030153095722198486, + "rewards/margins": 0.7724595069885254, + "rewards/rejected": -0.7423064708709717, + "step": 5245 + }, + { + "epoch": 0.8112893872027837, + "grad_norm": 6.658076286315918, + "learning_rate": 4.053156146179402e-06, + "logits/chosen": 11.386802673339844, + "logits/rejected": 2.051666021347046, + "logps/chosen": -341.3597106933594, + "logps/rejected": -257.64520263671875, + "loss": 0.5587, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.3172401487827301, + "rewards/margins": 0.3857433497905731, + "rewards/rejected": -0.06850320100784302, + "step": 5246 + }, + { + "epoch": 0.8114440363425478, + "grad_norm": 3.927082061767578, + "learning_rate": 4.052869744529729e-06, + "logits/chosen": 11.17611312866211, + "logits/rejected": 7.619402885437012, + "logps/chosen": -243.8050079345703, + "logps/rejected": -237.6566925048828, + "loss": 0.5277, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.3991936147212982, + "rewards/margins": 0.4035755395889282, + "rewards/rejected": -0.004381915554404259, + "step": 5247 + }, + { + "epoch": 0.811598685482312, + "grad_norm": 4.353519916534424, + "learning_rate": 4.052583342880055e-06, + "logits/chosen": 8.749837875366211, + "logits/rejected": 9.650714874267578, + "logps/chosen": -299.412109375, + "logps/rejected": -283.8265380859375, + "loss": 0.5378, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.2868899703025818, + "rewards/margins": 0.6394704580307007, + "rewards/rejected": -0.3525804579257965, + "step": 5248 + }, + { + "epoch": 0.8117533346220762, + "grad_norm": 5.908530235290527, + "learning_rate": 4.052296941230382e-06, + "logits/chosen": 0.9579715728759766, + "logits/rejected": 5.862154960632324, + "logps/chosen": -170.54190063476562, + "logps/rejected": -192.50701904296875, + "loss": 0.7956, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.3323361277580261, + "rewards/margins": -0.0383341908454895, + "rewards/rejected": -0.29400190711021423, + "step": 5249 + }, + { + "epoch": 0.8119079837618404, + "grad_norm": 6.58998441696167, + "learning_rate": 4.0520105395807086e-06, + "logits/chosen": 10.976187705993652, + "logits/rejected": 9.866473197937012, + "logps/chosen": -395.2896423339844, + "logps/rejected": -298.64959716796875, + "loss": 0.7724, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.15527746081352234, + "rewards/margins": 0.16969317197799683, + "rewards/rejected": -0.014415740966796875, + "step": 5250 + }, + { + "epoch": 0.8120626329016045, + "grad_norm": 3.6378870010375977, + "learning_rate": 4.051724137931034e-06, + "logits/chosen": 9.187033653259277, + "logits/rejected": 0.9402756094932556, + "logps/chosen": -189.7830810546875, + "logps/rejected": -110.6497802734375, + "loss": 0.4986, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.04851336032152176, + "rewards/margins": 0.49621251225471497, + "rewards/rejected": -0.4476991593837738, + "step": 5251 + }, + { + "epoch": 0.8122172820413687, + "grad_norm": 3.0564892292022705, + "learning_rate": 4.051437736281361e-06, + "logits/chosen": 11.147770881652832, + "logits/rejected": 5.059230327606201, + "logps/chosen": -132.44859313964844, + "logps/rejected": -95.91191101074219, + "loss": 0.5914, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.3794665038585663, + "rewards/margins": 0.28059878945350647, + "rewards/rejected": 0.09886772930622101, + "step": 5252 + }, + { + "epoch": 0.8123719311811328, + "grad_norm": 4.47088098526001, + "learning_rate": 4.051151334631688e-06, + "logits/chosen": 11.804327011108398, + "logits/rejected": 5.642199516296387, + "logps/chosen": -290.63037109375, + "logps/rejected": -205.98760986328125, + "loss": 0.6439, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.30821478366851807, + "rewards/margins": 0.27451837062835693, + "rewards/rejected": 0.03369641304016113, + "step": 5253 + }, + { + "epoch": 0.812526580320897, + "grad_norm": 5.9306840896606445, + "learning_rate": 4.050864932982014e-06, + "logits/chosen": 6.84375, + "logits/rejected": 9.897790908813477, + "logps/chosen": -157.9860382080078, + "logps/rejected": -241.5548858642578, + "loss": 0.753, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.2498202621936798, + "rewards/margins": 0.013972923159599304, + "rewards/rejected": -0.26379314064979553, + "step": 5254 + }, + { + "epoch": 0.8126812294606611, + "grad_norm": 5.209493637084961, + "learning_rate": 4.050578531332341e-06, + "logits/chosen": 11.861288070678711, + "logits/rejected": 10.334795951843262, + "logps/chosen": -292.6259765625, + "logps/rejected": -265.9131164550781, + "loss": 0.67, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.009715085849165916, + "rewards/margins": 0.07515254616737366, + "rewards/rejected": -0.06543746590614319, + "step": 5255 + }, + { + "epoch": 0.8128358786004253, + "grad_norm": 3.3782143592834473, + "learning_rate": 4.050292129682668e-06, + "logits/chosen": 10.3798828125, + "logits/rejected": 11.115757942199707, + "logps/chosen": -127.02919006347656, + "logps/rejected": -146.3394317626953, + "loss": 0.6686, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.02231866866350174, + "rewards/margins": 0.11710226535797119, + "rewards/rejected": -0.09478359669446945, + "step": 5256 + }, + { + "epoch": 0.8129905277401894, + "grad_norm": 5.4266252517700195, + "learning_rate": 4.050005728032994e-06, + "logits/chosen": 10.638373374938965, + "logits/rejected": 5.4431352615356445, + "logps/chosen": -286.39208984375, + "logps/rejected": -243.83013916015625, + "loss": 0.4835, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.24860036373138428, + "rewards/margins": 0.7253341674804688, + "rewards/rejected": -0.4767337739467621, + "step": 5257 + }, + { + "epoch": 0.8131451768799536, + "grad_norm": 18.151256561279297, + "learning_rate": 4.04971932638332e-06, + "logits/chosen": 6.482385635375977, + "logits/rejected": 11.082147598266602, + "logps/chosen": -245.0753173828125, + "logps/rejected": -222.33807373046875, + "loss": 0.7444, + "rewards/accuracies": 0.25, + "rewards/chosen": 0.28495582938194275, + "rewards/margins": 0.008654996752738953, + "rewards/rejected": 0.2763008177280426, + "step": 5258 + }, + { + "epoch": 0.8132998260197177, + "grad_norm": 5.113682746887207, + "learning_rate": 4.049432924733647e-06, + "logits/chosen": 10.28695011138916, + "logits/rejected": 9.846305847167969, + "logps/chosen": -250.01727294921875, + "logps/rejected": -239.14910888671875, + "loss": 0.6061, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.29099470376968384, + "rewards/margins": 0.31740647554397583, + "rewards/rejected": -0.02641179971396923, + "step": 5259 + }, + { + "epoch": 0.8134544751594819, + "grad_norm": 4.1112775802612305, + "learning_rate": 4.049146523083973e-06, + "logits/chosen": 12.012083053588867, + "logits/rejected": 7.357929229736328, + "logps/chosen": -233.00326538085938, + "logps/rejected": -156.93759155273438, + "loss": 0.4742, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.2811274528503418, + "rewards/margins": 0.5422151684761047, + "rewards/rejected": -0.26108771562576294, + "step": 5260 + }, + { + "epoch": 0.813609124299246, + "grad_norm": 7.474087715148926, + "learning_rate": 4.0488601214343e-06, + "logits/chosen": 13.350547790527344, + "logits/rejected": 9.982833862304688, + "logps/chosen": -459.61865234375, + "logps/rejected": -401.1523742675781, + "loss": 0.7776, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.3623935580253601, + "rewards/margins": 0.008119970560073853, + "rewards/rejected": 0.35427361726760864, + "step": 5261 + }, + { + "epoch": 0.8137637734390103, + "grad_norm": 5.812631130218506, + "learning_rate": 4.048573719784627e-06, + "logits/chosen": 9.639986991882324, + "logits/rejected": 7.043438911437988, + "logps/chosen": -343.6244812011719, + "logps/rejected": -254.70755004882812, + "loss": 0.7194, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.12544479966163635, + "rewards/margins": 0.026358749717473984, + "rewards/rejected": 0.09908603876829147, + "step": 5262 + }, + { + "epoch": 0.8139184225787744, + "grad_norm": 3.132899045944214, + "learning_rate": 4.048287318134953e-06, + "logits/chosen": 10.487598419189453, + "logits/rejected": 5.5532546043396, + "logps/chosen": -271.3434753417969, + "logps/rejected": -232.08139038085938, + "loss": 0.3977, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.4800993800163269, + "rewards/margins": 0.8519445061683655, + "rewards/rejected": -0.37184518575668335, + "step": 5263 + }, + { + "epoch": 0.8140730717185386, + "grad_norm": 3.75488018989563, + "learning_rate": 4.048000916485279e-06, + "logits/chosen": 14.997414588928223, + "logits/rejected": 6.509361267089844, + "logps/chosen": -348.81103515625, + "logps/rejected": -240.2603759765625, + "loss": 0.4684, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.32040759921073914, + "rewards/margins": 0.6518306732177734, + "rewards/rejected": -0.3314230740070343, + "step": 5264 + }, + { + "epoch": 0.8142277208583028, + "grad_norm": 4.084811687469482, + "learning_rate": 4.047714514835606e-06, + "logits/chosen": 7.143255710601807, + "logits/rejected": 2.140012741088867, + "logps/chosen": -155.64756774902344, + "logps/rejected": -101.34384155273438, + "loss": 0.6055, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.033007651567459106, + "rewards/margins": 0.2969239354133606, + "rewards/rejected": -0.3299315869808197, + "step": 5265 + }, + { + "epoch": 0.8143823699980669, + "grad_norm": 5.536152362823486, + "learning_rate": 4.0474281131859324e-06, + "logits/chosen": 16.83504295349121, + "logits/rejected": 8.692803382873535, + "logps/chosen": -390.17431640625, + "logps/rejected": -261.44927978515625, + "loss": 0.6412, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.13782596588134766, + "rewards/margins": 0.1725536435842514, + "rewards/rejected": -0.03472766652703285, + "step": 5266 + }, + { + "epoch": 0.814537019137831, + "grad_norm": 7.197746276855469, + "learning_rate": 4.047141711536259e-06, + "logits/chosen": 11.130281448364258, + "logits/rejected": 5.818378925323486, + "logps/chosen": -465.3574523925781, + "logps/rejected": -483.74114990234375, + "loss": 0.5644, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.20304261147975922, + "rewards/margins": 0.5159529447555542, + "rewards/rejected": -0.31291037797927856, + "step": 5267 + }, + { + "epoch": 0.8146916682775952, + "grad_norm": 5.068341255187988, + "learning_rate": 4.046855309886586e-06, + "logits/chosen": 7.059698581695557, + "logits/rejected": 8.794771194458008, + "logps/chosen": -202.42514038085938, + "logps/rejected": -188.94395446777344, + "loss": 0.6312, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.06027105450630188, + "rewards/margins": 0.18109092116355896, + "rewards/rejected": -0.24136200547218323, + "step": 5268 + }, + { + "epoch": 0.8148463174173594, + "grad_norm": 17.070634841918945, + "learning_rate": 4.046568908236912e-06, + "logits/chosen": 9.194201469421387, + "logits/rejected": 10.488310813903809, + "logps/chosen": -214.42385864257812, + "logps/rejected": -300.6953125, + "loss": 0.6785, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.13614827394485474, + "rewards/margins": 0.292192280292511, + "rewards/rejected": -0.42834052443504333, + "step": 5269 + }, + { + "epoch": 0.8150009665571235, + "grad_norm": 4.295851230621338, + "learning_rate": 4.046282506587238e-06, + "logits/chosen": 12.289249420166016, + "logits/rejected": 9.12799072265625, + "logps/chosen": -284.0588073730469, + "logps/rejected": -160.14846801757812, + "loss": 0.6293, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.36966472864151, + "rewards/margins": 0.31363534927368164, + "rewards/rejected": 0.056029416620731354, + "step": 5270 + }, + { + "epoch": 0.8151556156968877, + "grad_norm": 6.943835258483887, + "learning_rate": 4.045996104937565e-06, + "logits/chosen": 8.417034149169922, + "logits/rejected": 8.741055488586426, + "logps/chosen": -347.30682373046875, + "logps/rejected": -366.083740234375, + "loss": 0.7137, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.3768126368522644, + "rewards/margins": 0.10149691253900528, + "rewards/rejected": 0.27531570196151733, + "step": 5271 + }, + { + "epoch": 0.8153102648366518, + "grad_norm": 6.641474723815918, + "learning_rate": 4.0457097032878915e-06, + "logits/chosen": 12.882699966430664, + "logits/rejected": 11.63312816619873, + "logps/chosen": -232.55972290039062, + "logps/rejected": -255.19406127929688, + "loss": 0.8741, + "rewards/accuracies": 0.25, + "rewards/chosen": -0.10579453408718109, + "rewards/margins": -0.3159026801586151, + "rewards/rejected": 0.21010813117027283, + "step": 5272 + }, + { + "epoch": 0.815464913976416, + "grad_norm": 6.892787456512451, + "learning_rate": 4.045423301638218e-06, + "logits/chosen": 12.944417953491211, + "logits/rejected": 11.377227783203125, + "logps/chosen": -389.84710693359375, + "logps/rejected": -344.00738525390625, + "loss": 0.6537, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.08367764949798584, + "rewards/margins": 0.3446391820907593, + "rewards/rejected": -0.26096153259277344, + "step": 5273 + }, + { + "epoch": 0.8156195631161801, + "grad_norm": 7.29603910446167, + "learning_rate": 4.045136899988544e-06, + "logits/chosen": 7.714539527893066, + "logits/rejected": 13.643836975097656, + "logps/chosen": -235.70118713378906, + "logps/rejected": -356.7230224609375, + "loss": 1.0158, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.09182576835155487, + "rewards/margins": -0.4734768271446228, + "rewards/rejected": 0.5653025507926941, + "step": 5274 + }, + { + "epoch": 0.8157742122559444, + "grad_norm": 4.280092239379883, + "learning_rate": 4.044850498338871e-06, + "logits/chosen": 13.5042724609375, + "logits/rejected": 10.427685737609863, + "logps/chosen": -294.9332580566406, + "logps/rejected": -253.32485961914062, + "loss": 0.5618, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.13593292236328125, + "rewards/margins": 0.4069046974182129, + "rewards/rejected": -0.27097177505493164, + "step": 5275 + }, + { + "epoch": 0.8159288613957085, + "grad_norm": 4.629084587097168, + "learning_rate": 4.044564096689197e-06, + "logits/chosen": 9.6305570602417, + "logits/rejected": 9.9197359085083, + "logps/chosen": -396.084716796875, + "logps/rejected": -289.13482666015625, + "loss": 0.5654, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.4238523244857788, + "rewards/margins": 0.39534714818000793, + "rewards/rejected": 0.028505191206932068, + "step": 5276 + }, + { + "epoch": 0.8160835105354727, + "grad_norm": 6.249995708465576, + "learning_rate": 4.044277695039524e-06, + "logits/chosen": 7.059247970581055, + "logits/rejected": 7.480415344238281, + "logps/chosen": -309.423583984375, + "logps/rejected": -180.4270477294922, + "loss": 0.9654, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.32799047231674194, + "rewards/margins": -0.11884063482284546, + "rewards/rejected": -0.2091498225927353, + "step": 5277 + }, + { + "epoch": 0.8162381596752368, + "grad_norm": 3.9495480060577393, + "learning_rate": 4.04399129338985e-06, + "logits/chosen": 14.422968864440918, + "logits/rejected": 5.2372660636901855, + "logps/chosen": -439.24542236328125, + "logps/rejected": -214.88162231445312, + "loss": 0.3928, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.4407007098197937, + "rewards/margins": 1.2521064281463623, + "rewards/rejected": -0.8114056587219238, + "step": 5278 + }, + { + "epoch": 0.816392808815001, + "grad_norm": 6.598100662231445, + "learning_rate": 4.043704891740176e-06, + "logits/chosen": 9.808004379272461, + "logits/rejected": 8.550536155700684, + "logps/chosen": -219.38681030273438, + "logps/rejected": -171.9385986328125, + "loss": 0.934, + "rewards/accuracies": 0.25, + "rewards/chosen": -0.7619840502738953, + "rewards/margins": -0.3532869815826416, + "rewards/rejected": -0.40869706869125366, + "step": 5279 + }, + { + "epoch": 0.8165474579547651, + "grad_norm": 4.516329765319824, + "learning_rate": 4.043418490090503e-06, + "logits/chosen": 9.034750938415527, + "logits/rejected": 8.807994842529297, + "logps/chosen": -255.7656707763672, + "logps/rejected": -249.89437866210938, + "loss": 0.4976, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.14227935671806335, + "rewards/margins": 0.6736327409744263, + "rewards/rejected": -0.5313533544540405, + "step": 5280 + }, + { + "epoch": 0.8167021070945293, + "grad_norm": 3.921006202697754, + "learning_rate": 4.04313208844083e-06, + "logits/chosen": 11.465631484985352, + "logits/rejected": 7.1311869621276855, + "logps/chosen": -256.7264709472656, + "logps/rejected": -197.33151245117188, + "loss": 0.4946, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.35361552238464355, + "rewards/margins": 0.5205506086349487, + "rewards/rejected": -0.1669350564479828, + "step": 5281 + }, + { + "epoch": 0.8168567562342934, + "grad_norm": 4.306658744812012, + "learning_rate": 4.042845686791156e-06, + "logits/chosen": 14.168390274047852, + "logits/rejected": 13.578243255615234, + "logps/chosen": -274.8170166015625, + "logps/rejected": -331.6337585449219, + "loss": 0.5469, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.3678053617477417, + "rewards/margins": 0.3964787423610687, + "rewards/rejected": -0.02867337316274643, + "step": 5282 + }, + { + "epoch": 0.8170114053740576, + "grad_norm": 6.740323066711426, + "learning_rate": 4.042559285141483e-06, + "logits/chosen": 18.647960662841797, + "logits/rejected": 11.877840995788574, + "logps/chosen": -392.2648010253906, + "logps/rejected": -259.07025146484375, + "loss": 0.7373, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.08635292947292328, + "rewards/margins": 0.15699368715286255, + "rewards/rejected": -0.24334661662578583, + "step": 5283 + }, + { + "epoch": 0.8171660545138217, + "grad_norm": 4.946192264556885, + "learning_rate": 4.042272883491809e-06, + "logits/chosen": 10.802740097045898, + "logits/rejected": 8.902528762817383, + "logps/chosen": -293.80413818359375, + "logps/rejected": -310.8073425292969, + "loss": 0.5727, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.23894403874874115, + "rewards/margins": 0.35257089138031006, + "rewards/rejected": -0.1136268675327301, + "step": 5284 + }, + { + "epoch": 0.8173207036535859, + "grad_norm": 5.884267807006836, + "learning_rate": 4.041986481842135e-06, + "logits/chosen": 14.318381309509277, + "logits/rejected": 8.124183654785156, + "logps/chosen": -250.32835388183594, + "logps/rejected": -191.47222900390625, + "loss": 0.6961, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.015008017420768738, + "rewards/margins": 0.2517485022544861, + "rewards/rejected": -0.23674052953720093, + "step": 5285 + }, + { + "epoch": 0.81747535279335, + "grad_norm": 5.399677753448486, + "learning_rate": 4.041700080192462e-06, + "logits/chosen": 11.85676097869873, + "logits/rejected": 11.586560249328613, + "logps/chosen": -219.27130126953125, + "logps/rejected": -271.974609375, + "loss": 0.6984, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.17573201656341553, + "rewards/margins": 0.2112746238708496, + "rewards/rejected": -0.03554264083504677, + "step": 5286 + }, + { + "epoch": 0.8176300019331142, + "grad_norm": 4.5546464920043945, + "learning_rate": 4.041413678542789e-06, + "logits/chosen": 4.765160083770752, + "logits/rejected": 4.182767868041992, + "logps/chosen": -184.91705322265625, + "logps/rejected": -154.1429443359375, + "loss": 0.8115, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.21078996360301971, + "rewards/margins": 0.0438825786113739, + "rewards/rejected": -0.2546725571155548, + "step": 5287 + }, + { + "epoch": 0.8177846510728785, + "grad_norm": 5.755075454711914, + "learning_rate": 4.041127276893115e-06, + "logits/chosen": 6.5628228187561035, + "logits/rejected": 1.8516666889190674, + "logps/chosen": -266.8765563964844, + "logps/rejected": -285.1120910644531, + "loss": 0.6808, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.1825660765171051, + "rewards/margins": 0.2668920159339905, + "rewards/rejected": -0.44945812225341797, + "step": 5288 + }, + { + "epoch": 0.8179393002126426, + "grad_norm": 4.75536584854126, + "learning_rate": 4.040840875243442e-06, + "logits/chosen": 14.11911678314209, + "logits/rejected": 4.839972496032715, + "logps/chosen": -297.9878845214844, + "logps/rejected": -213.57167053222656, + "loss": 0.6424, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.11523404717445374, + "rewards/margins": 0.14415527880191803, + "rewards/rejected": -0.028921235352754593, + "step": 5289 + }, + { + "epoch": 0.8180939493524068, + "grad_norm": 5.634769916534424, + "learning_rate": 4.040554473593768e-06, + "logits/chosen": 12.931995391845703, + "logits/rejected": 8.381753921508789, + "logps/chosen": -280.044189453125, + "logps/rejected": -226.84364318847656, + "loss": 0.692, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.11437497287988663, + "rewards/margins": 0.10387403517961502, + "rewards/rejected": -0.21824902296066284, + "step": 5290 + }, + { + "epoch": 0.8182485984921709, + "grad_norm": 7.441853046417236, + "learning_rate": 4.0402680719440945e-06, + "logits/chosen": 6.6387619972229, + "logits/rejected": 7.478278160095215, + "logps/chosen": -279.8251953125, + "logps/rejected": -294.8862609863281, + "loss": 0.9915, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.013854900375008583, + "rewards/margins": -0.3584097623825073, + "rewards/rejected": 0.37226468324661255, + "step": 5291 + }, + { + "epoch": 0.8184032476319351, + "grad_norm": 5.081742286682129, + "learning_rate": 4.039981670294421e-06, + "logits/chosen": 9.369760513305664, + "logits/rejected": 6.442256927490234, + "logps/chosen": -217.87554931640625, + "logps/rejected": -192.88534545898438, + "loss": 0.596, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.08706074953079224, + "rewards/margins": 0.4786318838596344, + "rewards/rejected": -0.565692663192749, + "step": 5292 + }, + { + "epoch": 0.8185578967716992, + "grad_norm": 7.251420021057129, + "learning_rate": 4.039695268644748e-06, + "logits/chosen": 10.684796333312988, + "logits/rejected": 5.992951393127441, + "logps/chosen": -263.9416198730469, + "logps/rejected": -199.82528686523438, + "loss": 0.931, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.32957836985588074, + "rewards/margins": -0.3344048857688904, + "rewards/rejected": 0.004826489835977554, + "step": 5293 + }, + { + "epoch": 0.8187125459114634, + "grad_norm": 4.681700706481934, + "learning_rate": 4.039408866995074e-06, + "logits/chosen": 6.999462127685547, + "logits/rejected": 3.8563921451568604, + "logps/chosen": -220.25628662109375, + "logps/rejected": -216.92608642578125, + "loss": 0.5645, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.43038418889045715, + "rewards/margins": 0.35112860798835754, + "rewards/rejected": 0.07925557345151901, + "step": 5294 + }, + { + "epoch": 0.8188671950512275, + "grad_norm": 3.659858226776123, + "learning_rate": 4.039122465345401e-06, + "logits/chosen": 10.38975715637207, + "logits/rejected": 10.337723731994629, + "logps/chosen": -140.5755157470703, + "logps/rejected": -112.56028747558594, + "loss": 0.5598, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.024062322452664375, + "rewards/margins": 0.34360113739967346, + "rewards/rejected": -0.3676634430885315, + "step": 5295 + }, + { + "epoch": 0.8190218441909917, + "grad_norm": 7.283785343170166, + "learning_rate": 4.038836063695728e-06, + "logits/chosen": 10.32509994506836, + "logits/rejected": 4.493505954742432, + "logps/chosen": -315.95123291015625, + "logps/rejected": -268.0956726074219, + "loss": 0.5022, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.26954537630081177, + "rewards/margins": 0.47916051745414734, + "rewards/rejected": -0.20961514115333557, + "step": 5296 + }, + { + "epoch": 0.8191764933307558, + "grad_norm": 4.99456262588501, + "learning_rate": 4.0385496620460535e-06, + "logits/chosen": 7.853828430175781, + "logits/rejected": 6.3915252685546875, + "logps/chosen": -231.78533935546875, + "logps/rejected": -265.7840576171875, + "loss": 0.5818, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.29572269320487976, + "rewards/margins": 0.33043473958969116, + "rewards/rejected": -0.034712053835392, + "step": 5297 + }, + { + "epoch": 0.81933114247052, + "grad_norm": 5.044380187988281, + "learning_rate": 4.03826326039638e-06, + "logits/chosen": 10.707877159118652, + "logits/rejected": 9.763927459716797, + "logps/chosen": -169.720458984375, + "logps/rejected": -169.6317901611328, + "loss": 0.6595, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.31982043385505676, + "rewards/margins": 0.34559178352355957, + "rewards/rejected": -0.6654122471809387, + "step": 5298 + }, + { + "epoch": 0.8194857916102841, + "grad_norm": 7.007102012634277, + "learning_rate": 4.037976858746707e-06, + "logits/chosen": 11.934805870056152, + "logits/rejected": 7.741524696350098, + "logps/chosen": -231.00210571289062, + "logps/rejected": -144.3037109375, + "loss": 0.626, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.08430337905883789, + "rewards/margins": 0.2393733561038971, + "rewards/rejected": -0.1550699770450592, + "step": 5299 + }, + { + "epoch": 0.8196404407500484, + "grad_norm": 4.550992012023926, + "learning_rate": 4.0376904570970335e-06, + "logits/chosen": 12.091131210327148, + "logits/rejected": 10.928199768066406, + "logps/chosen": -252.32498168945312, + "logps/rejected": -225.9612274169922, + "loss": 0.4719, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.03519707918167114, + "rewards/margins": 0.6677951216697693, + "rewards/rejected": -0.6325980424880981, + "step": 5300 + }, + { + "epoch": 0.8197950898898125, + "grad_norm": 5.415155410766602, + "learning_rate": 4.03740405544736e-06, + "logits/chosen": 15.42573070526123, + "logits/rejected": 14.31202220916748, + "logps/chosen": -442.5439758300781, + "logps/rejected": -331.3834228515625, + "loss": 0.5775, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.2566392421722412, + "rewards/margins": 0.26325926184654236, + "rewards/rejected": -0.006620034575462341, + "step": 5301 + }, + { + "epoch": 0.8199497390295767, + "grad_norm": 4.5388922691345215, + "learning_rate": 4.037117653797687e-06, + "logits/chosen": 4.377840995788574, + "logits/rejected": 8.438913345336914, + "logps/chosen": -297.1492919921875, + "logps/rejected": -307.8794860839844, + "loss": 0.5323, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.3450619876384735, + "rewards/margins": 0.5069870948791504, + "rewards/rejected": -0.16192513704299927, + "step": 5302 + }, + { + "epoch": 0.8201043881693408, + "grad_norm": 4.9085307121276855, + "learning_rate": 4.0368312521480126e-06, + "logits/chosen": 4.341270446777344, + "logits/rejected": 3.215613603591919, + "logps/chosen": -213.53590393066406, + "logps/rejected": -195.8070068359375, + "loss": 0.7238, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.09362641721963882, + "rewards/margins": 0.07092452049255371, + "rewards/rejected": -0.16455093026161194, + "step": 5303 + }, + { + "epoch": 0.820259037309105, + "grad_norm": 5.224401473999023, + "learning_rate": 4.036544850498339e-06, + "logits/chosen": 11.544118881225586, + "logits/rejected": 12.571065902709961, + "logps/chosen": -266.6258850097656, + "logps/rejected": -251.29742431640625, + "loss": 0.6853, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.2527645230293274, + "rewards/margins": 0.13452434539794922, + "rewards/rejected": 0.11824017763137817, + "step": 5304 + }, + { + "epoch": 0.8204136864488691, + "grad_norm": 5.437252521514893, + "learning_rate": 4.036258448848666e-06, + "logits/chosen": 9.601806640625, + "logits/rejected": 10.409278869628906, + "logps/chosen": -300.93115234375, + "logps/rejected": -269.85565185546875, + "loss": 0.6094, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.006354421377182007, + "rewards/margins": 0.2257767617702484, + "rewards/rejected": -0.2194223701953888, + "step": 5305 + }, + { + "epoch": 0.8205683355886333, + "grad_norm": 4.728679656982422, + "learning_rate": 4.0359720471989925e-06, + "logits/chosen": 10.311020851135254, + "logits/rejected": 4.833508014678955, + "logps/chosen": -271.4335632324219, + "logps/rejected": -202.4356689453125, + "loss": 0.5953, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.04705129191279411, + "rewards/margins": 0.2946092188358307, + "rewards/rejected": -0.3416604995727539, + "step": 5306 + }, + { + "epoch": 0.8207229847283974, + "grad_norm": 6.367340087890625, + "learning_rate": 4.035685645549319e-06, + "logits/chosen": 8.099177360534668, + "logits/rejected": 5.124929428100586, + "logps/chosen": -316.1790771484375, + "logps/rejected": -246.45358276367188, + "loss": 0.7259, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.006880074739456177, + "rewards/margins": 0.03051258623600006, + "rewards/rejected": -0.02363251894712448, + "step": 5307 + }, + { + "epoch": 0.8208776338681616, + "grad_norm": 6.976737022399902, + "learning_rate": 4.035399243899645e-06, + "logits/chosen": 9.827499389648438, + "logits/rejected": 4.327454566955566, + "logps/chosen": -378.16302490234375, + "logps/rejected": -315.242431640625, + "loss": 0.5744, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.05373586714267731, + "rewards/margins": 0.5628617405891418, + "rewards/rejected": -0.509125828742981, + "step": 5308 + }, + { + "epoch": 0.8210322830079257, + "grad_norm": 7.578852653503418, + "learning_rate": 4.035112842249972e-06, + "logits/chosen": 10.64267635345459, + "logits/rejected": 8.26196002960205, + "logps/chosen": -471.53509521484375, + "logps/rejected": -434.52349853515625, + "loss": 0.6787, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.047881707549095154, + "rewards/margins": 0.13492760062217712, + "rewards/rejected": -0.18280930817127228, + "step": 5309 + }, + { + "epoch": 0.8211869321476899, + "grad_norm": 7.028905391693115, + "learning_rate": 4.034826440600298e-06, + "logits/chosen": 10.830615997314453, + "logits/rejected": 9.741681098937988, + "logps/chosen": -254.65875244140625, + "logps/rejected": -199.8251190185547, + "loss": 0.859, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.15158559381961823, + "rewards/margins": -0.1550678312778473, + "rewards/rejected": 0.0034822598099708557, + "step": 5310 + }, + { + "epoch": 0.821341581287454, + "grad_norm": 7.716348171234131, + "learning_rate": 4.034540038950625e-06, + "logits/chosen": 8.212759017944336, + "logits/rejected": 11.404217720031738, + "logps/chosen": -245.264404296875, + "logps/rejected": -337.47528076171875, + "loss": 0.8757, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.39241209626197815, + "rewards/margins": -0.19383685290813446, + "rewards/rejected": -0.1985752135515213, + "step": 5311 + }, + { + "epoch": 0.8214962304272182, + "grad_norm": 6.5120530128479, + "learning_rate": 4.034253637300951e-06, + "logits/chosen": 10.246106147766113, + "logits/rejected": 13.368188858032227, + "logps/chosen": -291.8525390625, + "logps/rejected": -256.69110107421875, + "loss": 0.774, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.17371977865695953, + "rewards/margins": -0.06294122338294983, + "rewards/rejected": 0.23666101694107056, + "step": 5312 + }, + { + "epoch": 0.8216508795669825, + "grad_norm": 4.0749030113220215, + "learning_rate": 4.033967235651277e-06, + "logits/chosen": 11.509986877441406, + "logits/rejected": -0.17578458786010742, + "logps/chosen": -229.82313537597656, + "logps/rejected": -96.24790954589844, + "loss": 0.6169, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.0189799964427948, + "rewards/margins": 0.38654884696006775, + "rewards/rejected": -0.40552881360054016, + "step": 5313 + }, + { + "epoch": 0.8218055287067466, + "grad_norm": 7.076247215270996, + "learning_rate": 4.033680834001604e-06, + "logits/chosen": 4.171049118041992, + "logits/rejected": 6.825776100158691, + "logps/chosen": -265.8440856933594, + "logps/rejected": -373.115966796875, + "loss": 0.6174, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.3364883363246918, + "rewards/margins": 0.3720911145210266, + "rewards/rejected": -0.7085794806480408, + "step": 5314 + }, + { + "epoch": 0.8219601778465108, + "grad_norm": 5.649542331695557, + "learning_rate": 4.033394432351931e-06, + "logits/chosen": 4.286419868469238, + "logits/rejected": 1.6393518447875977, + "logps/chosen": -210.84759521484375, + "logps/rejected": -150.1285400390625, + "loss": 0.8277, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.3767644762992859, + "rewards/margins": -0.07719491422176361, + "rewards/rejected": -0.2995695471763611, + "step": 5315 + }, + { + "epoch": 0.8221148269862749, + "grad_norm": 5.2629313468933105, + "learning_rate": 4.033108030702257e-06, + "logits/chosen": 8.696675300598145, + "logits/rejected": 1.9972820281982422, + "logps/chosen": -290.17559814453125, + "logps/rejected": -229.56788635253906, + "loss": 0.5815, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.17074839770793915, + "rewards/margins": 0.30529817938804626, + "rewards/rejected": -0.1345497965812683, + "step": 5316 + }, + { + "epoch": 0.8222694761260391, + "grad_norm": 5.135610103607178, + "learning_rate": 4.032821629052583e-06, + "logits/chosen": 4.589008331298828, + "logits/rejected": 2.625314712524414, + "logps/chosen": -247.08274841308594, + "logps/rejected": -252.1951141357422, + "loss": 0.6366, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.1581580936908722, + "rewards/margins": 0.2367747724056244, + "rewards/rejected": -0.39493289589881897, + "step": 5317 + }, + { + "epoch": 0.8224241252658032, + "grad_norm": 5.244497299194336, + "learning_rate": 4.03253522740291e-06, + "logits/chosen": 10.05750560760498, + "logits/rejected": 9.302133560180664, + "logps/chosen": -330.64202880859375, + "logps/rejected": -356.7399597167969, + "loss": 0.6178, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.13964241743087769, + "rewards/margins": 0.3444845676422119, + "rewards/rejected": -0.2048421949148178, + "step": 5318 + }, + { + "epoch": 0.8225787744055674, + "grad_norm": 4.925677299499512, + "learning_rate": 4.0322488257532364e-06, + "logits/chosen": 8.956275939941406, + "logits/rejected": 9.288116455078125, + "logps/chosen": -236.534912109375, + "logps/rejected": -240.97195434570312, + "loss": 0.5538, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.32586202025413513, + "rewards/margins": 0.4305882453918457, + "rewards/rejected": -0.10472622513771057, + "step": 5319 + }, + { + "epoch": 0.8227334235453315, + "grad_norm": 6.542516231536865, + "learning_rate": 4.031962424103563e-06, + "logits/chosen": 10.820230484008789, + "logits/rejected": 6.561944007873535, + "logps/chosen": -329.16485595703125, + "logps/rejected": -360.517578125, + "loss": 0.7851, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.2560485303401947, + "rewards/margins": -0.034902364015579224, + "rewards/rejected": 0.2909509241580963, + "step": 5320 + }, + { + "epoch": 0.8228880726850957, + "grad_norm": 5.495143413543701, + "learning_rate": 4.03167602245389e-06, + "logits/chosen": 5.645079135894775, + "logits/rejected": 3.5093014240264893, + "logps/chosen": -162.95184326171875, + "logps/rejected": -218.01318359375, + "loss": 0.7335, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.09918948262929916, + "rewards/margins": 0.13423582911491394, + "rewards/rejected": -0.2334252893924713, + "step": 5321 + }, + { + "epoch": 0.8230427218248598, + "grad_norm": 4.503170013427734, + "learning_rate": 4.031389620804216e-06, + "logits/chosen": 12.930352210998535, + "logits/rejected": 11.06631088256836, + "logps/chosen": -252.0088348388672, + "logps/rejected": -265.4640197753906, + "loss": 0.4894, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.4509038031101227, + "rewards/margins": 0.6731268763542175, + "rewards/rejected": -0.22222310304641724, + "step": 5322 + }, + { + "epoch": 0.823197370964624, + "grad_norm": 5.0754618644714355, + "learning_rate": 4.031103219154542e-06, + "logits/chosen": 12.904048919677734, + "logits/rejected": 13.595926284790039, + "logps/chosen": -297.0815734863281, + "logps/rejected": -336.881591796875, + "loss": 0.5675, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.2576300799846649, + "rewards/margins": 0.3779858946800232, + "rewards/rejected": -0.12035579979419708, + "step": 5323 + }, + { + "epoch": 0.8233520201043881, + "grad_norm": 4.792301654815674, + "learning_rate": 4.030816817504869e-06, + "logits/chosen": 6.062042713165283, + "logits/rejected": 6.047024726867676, + "logps/chosen": -216.16600036621094, + "logps/rejected": -217.94049072265625, + "loss": 0.5538, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.0012787804007530212, + "rewards/margins": 0.3623438775539398, + "rewards/rejected": -0.36362266540527344, + "step": 5324 + }, + { + "epoch": 0.8235066692441523, + "grad_norm": 6.105286121368408, + "learning_rate": 4.0305304158551955e-06, + "logits/chosen": 8.504137992858887, + "logits/rejected": 4.4132513999938965, + "logps/chosen": -269.54205322265625, + "logps/rejected": -230.5826873779297, + "loss": 0.7152, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.17474594712257385, + "rewards/margins": 0.09901612997055054, + "rewards/rejected": -0.2737621068954468, + "step": 5325 + }, + { + "epoch": 0.8236613183839165, + "grad_norm": 6.389162540435791, + "learning_rate": 4.030244014205522e-06, + "logits/chosen": 12.092460632324219, + "logits/rejected": 4.347359657287598, + "logps/chosen": -302.00079345703125, + "logps/rejected": -264.7982177734375, + "loss": 0.7441, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.28832319378852844, + "rewards/margins": 0.2440381646156311, + "rewards/rejected": -0.5323613882064819, + "step": 5326 + }, + { + "epoch": 0.8238159675236807, + "grad_norm": 4.724972724914551, + "learning_rate": 4.029957612555849e-06, + "logits/chosen": 14.11539363861084, + "logits/rejected": 8.483016967773438, + "logps/chosen": -310.4341125488281, + "logps/rejected": -225.14439392089844, + "loss": 0.5358, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.2828636169433594, + "rewards/margins": 0.46324360370635986, + "rewards/rejected": -0.1803799867630005, + "step": 5327 + }, + { + "epoch": 0.8239706166634448, + "grad_norm": 4.871413230895996, + "learning_rate": 4.0296712109061754e-06, + "logits/chosen": 9.946161270141602, + "logits/rejected": 8.035512924194336, + "logps/chosen": -214.70883178710938, + "logps/rejected": -246.97708129882812, + "loss": 0.5714, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.09428431838750839, + "rewards/margins": 0.4720229208469391, + "rewards/rejected": -0.5663072466850281, + "step": 5328 + }, + { + "epoch": 0.824125265803209, + "grad_norm": 6.055809020996094, + "learning_rate": 4.029384809256502e-06, + "logits/chosen": 8.911832809448242, + "logits/rejected": -0.19004011154174805, + "logps/chosen": -309.13616943359375, + "logps/rejected": -156.112548828125, + "loss": 0.5901, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.1844894140958786, + "rewards/margins": 0.6713892817497253, + "rewards/rejected": -0.8558787703514099, + "step": 5329 + }, + { + "epoch": 0.8242799149429731, + "grad_norm": 6.95323371887207, + "learning_rate": 4.029098407606828e-06, + "logits/chosen": 5.750208854675293, + "logits/rejected": 6.927001476287842, + "logps/chosen": -283.3548583984375, + "logps/rejected": -279.2781066894531, + "loss": 0.7321, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.10793409496545792, + "rewards/margins": -0.02041912078857422, + "rewards/rejected": -0.0875149667263031, + "step": 5330 + }, + { + "epoch": 0.8244345640827373, + "grad_norm": 11.965617179870605, + "learning_rate": 4.0288120059571545e-06, + "logits/chosen": 9.658398628234863, + "logits/rejected": 6.472871780395508, + "logps/chosen": -323.000732421875, + "logps/rejected": -251.18576049804688, + "loss": 0.8512, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.13315190374851227, + "rewards/margins": -0.04851962625980377, + "rewards/rejected": -0.0846322551369667, + "step": 5331 + }, + { + "epoch": 0.8245892132225014, + "grad_norm": 5.235379219055176, + "learning_rate": 4.028525604307481e-06, + "logits/chosen": 14.443598747253418, + "logits/rejected": 6.930086135864258, + "logps/chosen": -325.97821044921875, + "logps/rejected": -247.26937866210938, + "loss": 0.5519, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.12148480862379074, + "rewards/margins": 0.40008780360221863, + "rewards/rejected": -0.2786029875278473, + "step": 5332 + }, + { + "epoch": 0.8247438623622656, + "grad_norm": 6.801800727844238, + "learning_rate": 4.028239202657808e-06, + "logits/chosen": 8.464888572692871, + "logits/rejected": 8.957808494567871, + "logps/chosen": -174.44998168945312, + "logps/rejected": -134.204833984375, + "loss": 0.923, + "rewards/accuracies": 0.25, + "rewards/chosen": -0.6006500720977783, + "rewards/margins": -0.3253081440925598, + "rewards/rejected": -0.2753419578075409, + "step": 5333 + }, + { + "epoch": 0.8248985115020298, + "grad_norm": 21.104555130004883, + "learning_rate": 4.0279528010081345e-06, + "logits/chosen": 8.508782386779785, + "logits/rejected": 6.074336051940918, + "logps/chosen": -350.965576171875, + "logps/rejected": -335.1566162109375, + "loss": 0.7483, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.05120614171028137, + "rewards/margins": 0.011261433362960815, + "rewards/rejected": -0.062467582523822784, + "step": 5334 + }, + { + "epoch": 0.8250531606417939, + "grad_norm": 4.703070640563965, + "learning_rate": 4.027666399358461e-06, + "logits/chosen": 9.430997848510742, + "logits/rejected": 6.76487922668457, + "logps/chosen": -211.81646728515625, + "logps/rejected": -172.1246337890625, + "loss": 0.5257, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.021838165819644928, + "rewards/margins": 0.5281886458396912, + "rewards/rejected": -0.5500267744064331, + "step": 5335 + }, + { + "epoch": 0.825207809781558, + "grad_norm": 4.348498344421387, + "learning_rate": 4.027379997708788e-06, + "logits/chosen": 10.154484748840332, + "logits/rejected": 8.764572143554688, + "logps/chosen": -380.19146728515625, + "logps/rejected": -300.4892272949219, + "loss": 0.4293, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.0502408891916275, + "rewards/margins": 0.743482232093811, + "rewards/rejected": -0.6932413578033447, + "step": 5336 + }, + { + "epoch": 0.8253624589213222, + "grad_norm": 7.4166765213012695, + "learning_rate": 4.027093596059114e-06, + "logits/chosen": 5.941498756408691, + "logits/rejected": 7.464786529541016, + "logps/chosen": -380.4335632324219, + "logps/rejected": -355.916748046875, + "loss": 0.4548, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.3901115953922272, + "rewards/margins": 0.611159086227417, + "rewards/rejected": -1.0012707710266113, + "step": 5337 + }, + { + "epoch": 0.8255171080610864, + "grad_norm": 12.903608322143555, + "learning_rate": 4.02680719440944e-06, + "logits/chosen": 5.729025840759277, + "logits/rejected": 6.0861430168151855, + "logps/chosen": -304.4761962890625, + "logps/rejected": -260.3463134765625, + "loss": 0.7369, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.26506900787353516, + "rewards/margins": -0.010061264038085938, + "rewards/rejected": 0.2751302719116211, + "step": 5338 + }, + { + "epoch": 0.8256717572008506, + "grad_norm": 12.802170753479004, + "learning_rate": 4.026520792759767e-06, + "logits/chosen": 11.008221626281738, + "logits/rejected": 8.043703079223633, + "logps/chosen": -286.43231201171875, + "logps/rejected": -238.50338745117188, + "loss": 0.7821, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.017602648586034775, + "rewards/margins": -0.053995564579963684, + "rewards/rejected": 0.07159822434186935, + "step": 5339 + }, + { + "epoch": 0.8258264063406148, + "grad_norm": 7.474262714385986, + "learning_rate": 4.0262343911100935e-06, + "logits/chosen": 8.915708541870117, + "logits/rejected": 5.9769439697265625, + "logps/chosen": -217.97325134277344, + "logps/rejected": -189.8278350830078, + "loss": 0.5692, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.22961267828941345, + "rewards/margins": 0.4130922257900238, + "rewards/rejected": -0.18347957730293274, + "step": 5340 + }, + { + "epoch": 0.8259810554803789, + "grad_norm": 5.888516902923584, + "learning_rate": 4.025947989460419e-06, + "logits/chosen": 4.281300067901611, + "logits/rejected": 6.6664276123046875, + "logps/chosen": -192.68429565429688, + "logps/rejected": -207.56834411621094, + "loss": 0.7238, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.1519346684217453, + "rewards/margins": -0.01962270587682724, + "rewards/rejected": -0.13231196999549866, + "step": 5341 + }, + { + "epoch": 0.8261357046201431, + "grad_norm": 5.999971389770508, + "learning_rate": 4.025661587810746e-06, + "logits/chosen": 9.47360610961914, + "logits/rejected": 6.125233173370361, + "logps/chosen": -370.8904724121094, + "logps/rejected": -328.52227783203125, + "loss": 0.6236, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.2609485685825348, + "rewards/margins": 0.29614725708961487, + "rewards/rejected": -0.03519868105649948, + "step": 5342 + }, + { + "epoch": 0.8262903537599072, + "grad_norm": 5.769198894500732, + "learning_rate": 4.025375186161073e-06, + "logits/chosen": 8.71756362915039, + "logits/rejected": 8.844842910766602, + "logps/chosen": -381.8267822265625, + "logps/rejected": -336.4114074707031, + "loss": 0.5582, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.37201958894729614, + "rewards/margins": 0.45419007539749146, + "rewards/rejected": -0.08217048645019531, + "step": 5343 + }, + { + "epoch": 0.8264450028996714, + "grad_norm": 7.81317663192749, + "learning_rate": 4.025088784511399e-06, + "logits/chosen": 11.422042846679688, + "logits/rejected": 9.726343154907227, + "logps/chosen": -239.04881286621094, + "logps/rejected": -180.1737060546875, + "loss": 0.7536, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.7132056951522827, + "rewards/margins": 0.01957044005393982, + "rewards/rejected": -0.7327761054039001, + "step": 5344 + }, + { + "epoch": 0.8265996520394355, + "grad_norm": 8.058053016662598, + "learning_rate": 4.024802382861726e-06, + "logits/chosen": 7.095990180969238, + "logits/rejected": 3.0499749183654785, + "logps/chosen": -354.446533203125, + "logps/rejected": -227.48739624023438, + "loss": 0.7898, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.023910678923130035, + "rewards/margins": -0.1181541159749031, + "rewards/rejected": 0.09424343705177307, + "step": 5345 + }, + { + "epoch": 0.8267543011791997, + "grad_norm": 4.05272102355957, + "learning_rate": 4.024515981212052e-06, + "logits/chosen": 13.564950942993164, + "logits/rejected": 14.703371047973633, + "logps/chosen": -223.377197265625, + "logps/rejected": -237.2510986328125, + "loss": 0.5241, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.15794432163238525, + "rewards/margins": 0.4584812521934509, + "rewards/rejected": -0.6164255738258362, + "step": 5346 + }, + { + "epoch": 0.8269089503189638, + "grad_norm": 5.342638969421387, + "learning_rate": 4.024229579562378e-06, + "logits/chosen": 6.897091865539551, + "logits/rejected": 5.012201309204102, + "logps/chosen": -244.1290740966797, + "logps/rejected": -230.30862426757812, + "loss": 0.712, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.21836552023887634, + "rewards/margins": 0.21496419608592987, + "rewards/rejected": -0.43332967162132263, + "step": 5347 + }, + { + "epoch": 0.827063599458728, + "grad_norm": 9.905418395996094, + "learning_rate": 4.023943177912705e-06, + "logits/chosen": 10.802815437316895, + "logits/rejected": 12.32828140258789, + "logps/chosen": -237.32496643066406, + "logps/rejected": -235.06253051757812, + "loss": 0.9861, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.2220773696899414, + "rewards/margins": -0.436323344707489, + "rewards/rejected": 0.2142459899187088, + "step": 5348 + }, + { + "epoch": 0.8272182485984921, + "grad_norm": 9.778711318969727, + "learning_rate": 4.023656776263032e-06, + "logits/chosen": 5.947680950164795, + "logits/rejected": 7.65714693069458, + "logps/chosen": -183.94166564941406, + "logps/rejected": -246.54600524902344, + "loss": 0.8198, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.18549171090126038, + "rewards/margins": -0.13598228991031647, + "rewards/rejected": -0.04950941354036331, + "step": 5349 + }, + { + "epoch": 0.8273728977382563, + "grad_norm": 3.8925695419311523, + "learning_rate": 4.0233703746133575e-06, + "logits/chosen": 13.280454635620117, + "logits/rejected": 14.779666900634766, + "logps/chosen": -124.14584350585938, + "logps/rejected": -169.04104614257812, + "loss": 0.6426, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.08064673095941544, + "rewards/margins": 0.13690821826457977, + "rewards/rejected": -0.2175549417734146, + "step": 5350 + }, + { + "epoch": 0.8275275468780204, + "grad_norm": 5.325223445892334, + "learning_rate": 4.023083972963684e-06, + "logits/chosen": 11.94888687133789, + "logits/rejected": 7.522892951965332, + "logps/chosen": -271.1258239746094, + "logps/rejected": -204.5643310546875, + "loss": 0.6348, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.15165206789970398, + "rewards/margins": 0.35170578956604004, + "rewards/rejected": -0.5033578872680664, + "step": 5351 + }, + { + "epoch": 0.8276821960177847, + "grad_norm": 7.117510795593262, + "learning_rate": 4.022797571314011e-06, + "logits/chosen": 12.60094165802002, + "logits/rejected": 8.933693885803223, + "logps/chosen": -420.9251708984375, + "logps/rejected": -347.2699279785156, + "loss": 0.7923, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.1477445811033249, + "rewards/margins": -0.07621574401855469, + "rewards/rejected": 0.2239602953195572, + "step": 5352 + }, + { + "epoch": 0.8278368451575489, + "grad_norm": 3.30772066116333, + "learning_rate": 4.0225111696643375e-06, + "logits/chosen": 14.168424606323242, + "logits/rejected": 6.840731143951416, + "logps/chosen": -199.61041259765625, + "logps/rejected": -145.72467041015625, + "loss": 0.5171, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.015547472983598709, + "rewards/margins": 0.6037575602531433, + "rewards/rejected": -0.6193050146102905, + "step": 5353 + }, + { + "epoch": 0.827991494297313, + "grad_norm": 5.570367336273193, + "learning_rate": 4.022224768014664e-06, + "logits/chosen": 3.3675670623779297, + "logits/rejected": 3.5403053760528564, + "logps/chosen": -184.73187255859375, + "logps/rejected": -248.59005737304688, + "loss": 0.5706, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.10427740216255188, + "rewards/margins": 0.5916788578033447, + "rewards/rejected": -0.695956289768219, + "step": 5354 + }, + { + "epoch": 0.8281461434370772, + "grad_norm": 5.730072021484375, + "learning_rate": 4.021938366364991e-06, + "logits/chosen": 13.051298141479492, + "logits/rejected": 9.186807632446289, + "logps/chosen": -340.6981201171875, + "logps/rejected": -257.09368896484375, + "loss": 0.5716, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.2891034483909607, + "rewards/margins": 0.394697368144989, + "rewards/rejected": -0.10559390485286713, + "step": 5355 + }, + { + "epoch": 0.8283007925768413, + "grad_norm": 9.620980262756348, + "learning_rate": 4.0216519647153166e-06, + "logits/chosen": 13.854562759399414, + "logits/rejected": 9.437166213989258, + "logps/chosen": -301.5858154296875, + "logps/rejected": -225.3343048095703, + "loss": 0.7168, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.16419801115989685, + "rewards/margins": 0.04755708575248718, + "rewards/rejected": -0.21175506711006165, + "step": 5356 + }, + { + "epoch": 0.8284554417166055, + "grad_norm": 4.582464694976807, + "learning_rate": 4.021365563065643e-06, + "logits/chosen": 14.405271530151367, + "logits/rejected": 12.518171310424805, + "logps/chosen": -343.9298095703125, + "logps/rejected": -275.76611328125, + "loss": 0.6287, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.006426818668842316, + "rewards/margins": 0.25624704360961914, + "rewards/rejected": -0.2626737952232361, + "step": 5357 + }, + { + "epoch": 0.8286100908563696, + "grad_norm": 6.05879545211792, + "learning_rate": 4.02107916141597e-06, + "logits/chosen": 11.324037551879883, + "logits/rejected": 12.489995956420898, + "logps/chosen": -319.57025146484375, + "logps/rejected": -272.546630859375, + "loss": 0.8199, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.08118820190429688, + "rewards/margins": -0.05728644132614136, + "rewards/rejected": 0.13847464323043823, + "step": 5358 + }, + { + "epoch": 0.8287647399961338, + "grad_norm": 9.0266752243042, + "learning_rate": 4.0207927597662965e-06, + "logits/chosen": 6.157256126403809, + "logits/rejected": 2.361009120941162, + "logps/chosen": -302.79736328125, + "logps/rejected": -203.93955993652344, + "loss": 0.5531, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.2322167456150055, + "rewards/margins": 0.4372229278087616, + "rewards/rejected": -0.2050061970949173, + "step": 5359 + }, + { + "epoch": 0.8289193891358979, + "grad_norm": 5.840587615966797, + "learning_rate": 4.020506358116623e-06, + "logits/chosen": 10.24179458618164, + "logits/rejected": 11.3818941116333, + "logps/chosen": -300.0357360839844, + "logps/rejected": -330.2537841796875, + "loss": 0.7296, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.1039886549115181, + "rewards/margins": -0.024369336664676666, + "rewards/rejected": -0.07961931079626083, + "step": 5360 + }, + { + "epoch": 0.8290740382756621, + "grad_norm": 8.630515098571777, + "learning_rate": 4.02021995646695e-06, + "logits/chosen": 11.478658676147461, + "logits/rejected": 12.549637794494629, + "logps/chosen": -277.0334167480469, + "logps/rejected": -265.60076904296875, + "loss": 0.8354, + "rewards/accuracies": 0.25, + "rewards/chosen": -0.5861489772796631, + "rewards/margins": -0.1759597361087799, + "rewards/rejected": -0.41018927097320557, + "step": 5361 + }, + { + "epoch": 0.8292286874154262, + "grad_norm": 4.0248589515686035, + "learning_rate": 4.0199335548172765e-06, + "logits/chosen": 9.13327693939209, + "logits/rejected": 0.4751328229904175, + "logps/chosen": -304.70947265625, + "logps/rejected": -188.1190185546875, + "loss": 0.5116, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.184296116232872, + "rewards/margins": 0.5303095579147339, + "rewards/rejected": -0.3460134267807007, + "step": 5362 + }, + { + "epoch": 0.8293833365551904, + "grad_norm": 7.228922367095947, + "learning_rate": 4.019647153167602e-06, + "logits/chosen": 8.904441833496094, + "logits/rejected": 16.654111862182617, + "logps/chosen": -289.78485107421875, + "logps/rejected": -345.4756164550781, + "loss": 0.7966, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.034785330295562744, + "rewards/margins": -0.10227788984775543, + "rewards/rejected": 0.13706320524215698, + "step": 5363 + }, + { + "epoch": 0.8295379856949546, + "grad_norm": 5.103830337524414, + "learning_rate": 4.019360751517929e-06, + "logits/chosen": 5.899242401123047, + "logits/rejected": 9.79134750366211, + "logps/chosen": -197.1291046142578, + "logps/rejected": -245.04428100585938, + "loss": 0.8407, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.3621762692928314, + "rewards/margins": -0.22488480806350708, + "rewards/rejected": -0.13729147613048553, + "step": 5364 + }, + { + "epoch": 0.8296926348347188, + "grad_norm": 5.657057285308838, + "learning_rate": 4.0190743498682556e-06, + "logits/chosen": 11.96318244934082, + "logits/rejected": 14.269253730773926, + "logps/chosen": -199.26263427734375, + "logps/rejected": -203.2296905517578, + "loss": 0.7917, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.07637586444616318, + "rewards/margins": -0.15539111196994781, + "rewards/rejected": 0.07901525497436523, + "step": 5365 + }, + { + "epoch": 0.8298472839744829, + "grad_norm": 6.404512405395508, + "learning_rate": 4.018787948218582e-06, + "logits/chosen": 8.980405807495117, + "logits/rejected": 3.2507221698760986, + "logps/chosen": -355.8680725097656, + "logps/rejected": -227.14480590820312, + "loss": 0.7279, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.10106442123651505, + "rewards/margins": 0.04736147075891495, + "rewards/rejected": -0.14842589199543, + "step": 5366 + }, + { + "epoch": 0.8300019331142471, + "grad_norm": 5.0833001136779785, + "learning_rate": 4.018501546568909e-06, + "logits/chosen": 10.452532768249512, + "logits/rejected": 2.6104063987731934, + "logps/chosen": -414.9434814453125, + "logps/rejected": -260.2200622558594, + "loss": 0.6069, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.14669114351272583, + "rewards/margins": 0.2586354911327362, + "rewards/rejected": -0.11194434762001038, + "step": 5367 + }, + { + "epoch": 0.8301565822540112, + "grad_norm": 10.146333694458008, + "learning_rate": 4.0182151449192355e-06, + "logits/chosen": 15.662110328674316, + "logits/rejected": 4.235750198364258, + "logps/chosen": -502.3944091796875, + "logps/rejected": -241.30320739746094, + "loss": 0.4973, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.027603913098573685, + "rewards/margins": 0.6622288227081299, + "rewards/rejected": -0.6346248984336853, + "step": 5368 + }, + { + "epoch": 0.8303112313937754, + "grad_norm": 5.116820335388184, + "learning_rate": 4.017928743269562e-06, + "logits/chosen": 10.789506912231445, + "logits/rejected": 10.022411346435547, + "logps/chosen": -319.52545166015625, + "logps/rejected": -274.0902099609375, + "loss": 0.682, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.006325904279947281, + "rewards/margins": 0.36327606439590454, + "rewards/rejected": -0.3696020245552063, + "step": 5369 + }, + { + "epoch": 0.8304658805335395, + "grad_norm": 4.79144811630249, + "learning_rate": 4.017642341619888e-06, + "logits/chosen": 11.066890716552734, + "logits/rejected": 5.095754146575928, + "logps/chosen": -225.28695678710938, + "logps/rejected": -174.3505859375, + "loss": 0.7012, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.0682850331068039, + "rewards/margins": 0.03783566504716873, + "rewards/rejected": -0.10612067580223083, + "step": 5370 + }, + { + "epoch": 0.8306205296733037, + "grad_norm": 5.753739356994629, + "learning_rate": 4.017355939970215e-06, + "logits/chosen": 12.702654838562012, + "logits/rejected": 7.077354907989502, + "logps/chosen": -318.09503173828125, + "logps/rejected": -266.88018798828125, + "loss": 0.6557, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.016322467476129532, + "rewards/margins": 0.15788981318473816, + "rewards/rejected": -0.14156733453273773, + "step": 5371 + }, + { + "epoch": 0.8307751788130678, + "grad_norm": 5.359127044677734, + "learning_rate": 4.017069538320541e-06, + "logits/chosen": 13.164688110351562, + "logits/rejected": 9.30786418914795, + "logps/chosen": -276.2366027832031, + "logps/rejected": -177.9744110107422, + "loss": 0.5139, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.2154313623905182, + "rewards/margins": 0.5429648756980896, + "rewards/rejected": -0.3275335133075714, + "step": 5372 + }, + { + "epoch": 0.830929827952832, + "grad_norm": 6.666069030761719, + "learning_rate": 4.016783136670868e-06, + "logits/chosen": 17.07319450378418, + "logits/rejected": 13.02253246307373, + "logps/chosen": -318.0179443359375, + "logps/rejected": -253.59043884277344, + "loss": 0.6691, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.1107303574681282, + "rewards/margins": 0.16250231862068176, + "rewards/rejected": -0.27323266863822937, + "step": 5373 + }, + { + "epoch": 0.8310844770925961, + "grad_norm": 6.0921630859375, + "learning_rate": 4.016496735021195e-06, + "logits/chosen": 6.870823860168457, + "logits/rejected": 5.898431777954102, + "logps/chosen": -375.41259765625, + "logps/rejected": -329.1944885253906, + "loss": 0.604, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.1251259744167328, + "rewards/margins": 0.2159103900194168, + "rewards/rejected": -0.09078440815210342, + "step": 5374 + }, + { + "epoch": 0.8312391262323603, + "grad_norm": 4.845489978790283, + "learning_rate": 4.01621033337152e-06, + "logits/chosen": 9.506854057312012, + "logits/rejected": 4.547054290771484, + "logps/chosen": -334.96435546875, + "logps/rejected": -283.1689453125, + "loss": 0.5931, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.0023044124245643616, + "rewards/margins": 0.37399858236312866, + "rewards/rejected": -0.3763029873371124, + "step": 5375 + }, + { + "epoch": 0.8313937753721244, + "grad_norm": 5.396146297454834, + "learning_rate": 4.015923931721847e-06, + "logits/chosen": 10.54942798614502, + "logits/rejected": 9.277009010314941, + "logps/chosen": -229.40316772460938, + "logps/rejected": -194.76673889160156, + "loss": 0.7013, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.02864570915699005, + "rewards/margins": 0.07281656563282013, + "rewards/rejected": -0.04417085647583008, + "step": 5376 + }, + { + "epoch": 0.8315484245118887, + "grad_norm": 4.1248955726623535, + "learning_rate": 4.015637530072174e-06, + "logits/chosen": 3.780885934829712, + "logits/rejected": 5.9319682121276855, + "logps/chosen": -319.5355529785156, + "logps/rejected": -314.78515625, + "loss": 0.4975, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.18458938598632812, + "rewards/margins": 0.6036069393157959, + "rewards/rejected": -0.41901758313179016, + "step": 5377 + }, + { + "epoch": 0.8317030736516529, + "grad_norm": 4.499195575714111, + "learning_rate": 4.0153511284225e-06, + "logits/chosen": 13.451874732971191, + "logits/rejected": 1.7122514247894287, + "logps/chosen": -389.09869384765625, + "logps/rejected": -259.02337646484375, + "loss": 0.401, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.008623795583844185, + "rewards/margins": 0.7962130308151245, + "rewards/rejected": -0.7875891923904419, + "step": 5378 + }, + { + "epoch": 0.831857722791417, + "grad_norm": 5.6987433433532715, + "learning_rate": 4.015064726772826e-06, + "logits/chosen": 9.704020500183105, + "logits/rejected": 5.749418258666992, + "logps/chosen": -228.26461791992188, + "logps/rejected": -190.92189025878906, + "loss": 0.7193, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.13351833820343018, + "rewards/margins": 0.033403925597667694, + "rewards/rejected": -0.16692225635051727, + "step": 5379 + }, + { + "epoch": 0.8320123719311812, + "grad_norm": 4.734670639038086, + "learning_rate": 4.014778325123153e-06, + "logits/chosen": 8.425107955932617, + "logits/rejected": 5.822018623352051, + "logps/chosen": -211.37890625, + "logps/rejected": -259.53314208984375, + "loss": 0.6281, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.005975663661956787, + "rewards/margins": 0.2646450698375702, + "rewards/rejected": -0.270620733499527, + "step": 5380 + }, + { + "epoch": 0.8321670210709453, + "grad_norm": 4.815316677093506, + "learning_rate": 4.0144919234734794e-06, + "logits/chosen": 7.501192092895508, + "logits/rejected": 13.593565940856934, + "logps/chosen": -135.0107879638672, + "logps/rejected": -269.2069396972656, + "loss": 0.7324, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.36686083674430847, + "rewards/margins": 0.268845796585083, + "rewards/rejected": -0.6357066631317139, + "step": 5381 + }, + { + "epoch": 0.8323216702107095, + "grad_norm": 3.9795141220092773, + "learning_rate": 4.014205521823806e-06, + "logits/chosen": 14.000324249267578, + "logits/rejected": 10.978670120239258, + "logps/chosen": -293.13153076171875, + "logps/rejected": -239.74111938476562, + "loss": 0.5501, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.1538265347480774, + "rewards/margins": 0.5825468301773071, + "rewards/rejected": -0.7363733053207397, + "step": 5382 + }, + { + "epoch": 0.8324763193504736, + "grad_norm": 5.09022855758667, + "learning_rate": 4.013919120174133e-06, + "logits/chosen": 13.133593559265137, + "logits/rejected": 14.01787281036377, + "logps/chosen": -226.3394012451172, + "logps/rejected": -252.2895965576172, + "loss": 0.6428, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.2822531759738922, + "rewards/margins": 0.1283479630947113, + "rewards/rejected": -0.4106011390686035, + "step": 5383 + }, + { + "epoch": 0.8326309684902378, + "grad_norm": 4.223243236541748, + "learning_rate": 4.0136327185244585e-06, + "logits/chosen": 17.141403198242188, + "logits/rejected": 8.355865478515625, + "logps/chosen": -236.6707305908203, + "logps/rejected": -210.37130737304688, + "loss": 0.5413, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.28904491662979126, + "rewards/margins": 0.44514426589012146, + "rewards/rejected": -0.1560993492603302, + "step": 5384 + }, + { + "epoch": 0.8327856176300019, + "grad_norm": 6.058457851409912, + "learning_rate": 4.013346316874785e-06, + "logits/chosen": 13.686336517333984, + "logits/rejected": 4.966727256774902, + "logps/chosen": -283.00848388671875, + "logps/rejected": -223.71339416503906, + "loss": 0.7023, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.08856715261936188, + "rewards/margins": 0.2917318046092987, + "rewards/rejected": -0.38029900193214417, + "step": 5385 + }, + { + "epoch": 0.8329402667697661, + "grad_norm": 12.94128704071045, + "learning_rate": 4.013059915225112e-06, + "logits/chosen": 14.132623672485352, + "logits/rejected": 7.437209129333496, + "logps/chosen": -323.65167236328125, + "logps/rejected": -270.63580322265625, + "loss": 0.5334, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.3487892150878906, + "rewards/margins": 0.625971257686615, + "rewards/rejected": -0.2771819531917572, + "step": 5386 + }, + { + "epoch": 0.8330949159095302, + "grad_norm": 6.800536632537842, + "learning_rate": 4.0127735135754385e-06, + "logits/chosen": 16.286441802978516, + "logits/rejected": 9.720698356628418, + "logps/chosen": -514.7172241210938, + "logps/rejected": -367.98443603515625, + "loss": 0.6303, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.4346073269844055, + "rewards/margins": 0.33411428332328796, + "rewards/rejected": 0.10049304366111755, + "step": 5387 + }, + { + "epoch": 0.8332495650492944, + "grad_norm": 4.872771739959717, + "learning_rate": 4.012487111925765e-06, + "logits/chosen": 7.491901397705078, + "logits/rejected": 9.20926570892334, + "logps/chosen": -218.38690185546875, + "logps/rejected": -275.0525817871094, + "loss": 0.7046, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.17015504837036133, + "rewards/margins": 0.08373256772756577, + "rewards/rejected": -0.2538875937461853, + "step": 5388 + }, + { + "epoch": 0.8334042141890585, + "grad_norm": 4.226747989654541, + "learning_rate": 4.012200710276091e-06, + "logits/chosen": 6.772472381591797, + "logits/rejected": 3.7654976844787598, + "logps/chosen": -163.06546020507812, + "logps/rejected": -147.28819274902344, + "loss": 0.6079, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.006968967616558075, + "rewards/margins": 0.3486133813858032, + "rewards/rejected": -0.3555823564529419, + "step": 5389 + }, + { + "epoch": 0.8335588633288228, + "grad_norm": 4.889275550842285, + "learning_rate": 4.011914308626418e-06, + "logits/chosen": 5.655824661254883, + "logits/rejected": 4.966555595397949, + "logps/chosen": -276.1043701171875, + "logps/rejected": -224.635986328125, + "loss": 0.5581, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.005195764824748039, + "rewards/margins": 0.41703474521636963, + "rewards/rejected": -0.41183900833129883, + "step": 5390 + }, + { + "epoch": 0.8337135124685869, + "grad_norm": 5.367610931396484, + "learning_rate": 4.011627906976744e-06, + "logits/chosen": 13.729022979736328, + "logits/rejected": 10.7450590133667, + "logps/chosen": -464.8741455078125, + "logps/rejected": -347.12689208984375, + "loss": 0.6865, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.24824295938014984, + "rewards/margins": 0.08843203634023666, + "rewards/rejected": 0.15981093049049377, + "step": 5391 + }, + { + "epoch": 0.8338681616083511, + "grad_norm": 5.969167232513428, + "learning_rate": 4.011341505327071e-06, + "logits/chosen": 8.183238983154297, + "logits/rejected": 5.0609002113342285, + "logps/chosen": -211.6408233642578, + "logps/rejected": -207.94390869140625, + "loss": 0.7592, + "rewards/accuracies": 0.25, + "rewards/chosen": -0.08726739883422852, + "rewards/margins": -0.047663405537605286, + "rewards/rejected": -0.039604008197784424, + "step": 5392 + }, + { + "epoch": 0.8340228107481152, + "grad_norm": 5.681665897369385, + "learning_rate": 4.0110551036773975e-06, + "logits/chosen": 8.574899673461914, + "logits/rejected": 2.3997669219970703, + "logps/chosen": -315.51763916015625, + "logps/rejected": -242.56893920898438, + "loss": 0.5515, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.1526292860507965, + "rewards/margins": 0.7872132658958435, + "rewards/rejected": -0.6345839500427246, + "step": 5393 + }, + { + "epoch": 0.8341774598878794, + "grad_norm": 6.395241737365723, + "learning_rate": 4.010768702027724e-06, + "logits/chosen": 5.79974365234375, + "logits/rejected": 8.506085395812988, + "logps/chosen": -266.611328125, + "logps/rejected": -286.57086181640625, + "loss": 0.6994, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.2074846625328064, + "rewards/margins": 0.04301976412534714, + "rewards/rejected": 0.16446489095687866, + "step": 5394 + }, + { + "epoch": 0.8343321090276435, + "grad_norm": 4.886826038360596, + "learning_rate": 4.010482300378051e-06, + "logits/chosen": 11.948812484741211, + "logits/rejected": 7.996578216552734, + "logps/chosen": -289.35333251953125, + "logps/rejected": -158.23025512695312, + "loss": 0.6282, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.33129453659057617, + "rewards/margins": 0.30022209882736206, + "rewards/rejected": 0.03107243776321411, + "step": 5395 + }, + { + "epoch": 0.8344867581674077, + "grad_norm": 6.295848369598389, + "learning_rate": 4.010195898728377e-06, + "logits/chosen": 7.4329376220703125, + "logits/rejected": 4.973328590393066, + "logps/chosen": -355.7255554199219, + "logps/rejected": -365.41748046875, + "loss": 0.5476, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.09848098456859589, + "rewards/margins": 0.4621601998806, + "rewards/rejected": -0.3636792302131653, + "step": 5396 + }, + { + "epoch": 0.8346414073071718, + "grad_norm": 4.316558837890625, + "learning_rate": 4.009909497078703e-06, + "logits/chosen": 9.350947380065918, + "logits/rejected": 3.718388080596924, + "logps/chosen": -276.31097412109375, + "logps/rejected": -163.05532836914062, + "loss": 0.5391, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.2558572292327881, + "rewards/margins": 0.43711453676223755, + "rewards/rejected": -0.18125732243061066, + "step": 5397 + }, + { + "epoch": 0.834796056446936, + "grad_norm": 6.025699138641357, + "learning_rate": 4.00962309542903e-06, + "logits/chosen": 7.262233734130859, + "logits/rejected": 3.3563954830169678, + "logps/chosen": -263.9393310546875, + "logps/rejected": -208.1671600341797, + "loss": 0.642, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.46710091829299927, + "rewards/margins": 0.2902814745903015, + "rewards/rejected": 0.17681945860385895, + "step": 5398 + }, + { + "epoch": 0.8349507055867001, + "grad_norm": 5.9390082359313965, + "learning_rate": 4.009336693779357e-06, + "logits/chosen": 7.669254779815674, + "logits/rejected": 4.8430280685424805, + "logps/chosen": -201.2021026611328, + "logps/rejected": -206.9154815673828, + "loss": 0.7802, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.13391587138175964, + "rewards/margins": 0.0535183846950531, + "rewards/rejected": -0.18743430078029633, + "step": 5399 + }, + { + "epoch": 0.8351053547264643, + "grad_norm": 5.8911237716674805, + "learning_rate": 4.009050292129683e-06, + "logits/chosen": 13.248455047607422, + "logits/rejected": 8.486472129821777, + "logps/chosen": -409.1990966796875, + "logps/rejected": -302.76611328125, + "loss": 0.6375, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.2739750146865845, + "rewards/margins": 0.14663706719875336, + "rewards/rejected": 0.12733793258666992, + "step": 5400 + }, + { + "epoch": 0.8352600038662285, + "grad_norm": 5.067920207977295, + "learning_rate": 4.00876389048001e-06, + "logits/chosen": 14.423843383789062, + "logits/rejected": 11.532532691955566, + "logps/chosen": -367.6778869628906, + "logps/rejected": -350.1405334472656, + "loss": 0.5488, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.38997527956962585, + "rewards/margins": 0.5512980222702026, + "rewards/rejected": -0.161322683095932, + "step": 5401 + }, + { + "epoch": 0.8354146530059926, + "grad_norm": 3.7607522010803223, + "learning_rate": 4.008477488830336e-06, + "logits/chosen": 10.857397079467773, + "logits/rejected": 7.8524980545043945, + "logps/chosen": -348.26129150390625, + "logps/rejected": -294.834716796875, + "loss": 0.5272, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.7445254325866699, + "rewards/margins": 0.5306577682495117, + "rewards/rejected": 0.2138676792383194, + "step": 5402 + }, + { + "epoch": 0.8355693021457569, + "grad_norm": 5.832326889038086, + "learning_rate": 4.008191087180662e-06, + "logits/chosen": 14.55161190032959, + "logits/rejected": 7.82150411605835, + "logps/chosen": -408.7925720214844, + "logps/rejected": -263.0992431640625, + "loss": 0.6181, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.41906851530075073, + "rewards/margins": 0.30957135558128357, + "rewards/rejected": 0.10949714481830597, + "step": 5403 + }, + { + "epoch": 0.835723951285521, + "grad_norm": 5.645401954650879, + "learning_rate": 4.007904685530989e-06, + "logits/chosen": 11.063433647155762, + "logits/rejected": 10.394119262695312, + "logps/chosen": -271.5616455078125, + "logps/rejected": -244.5500946044922, + "loss": 0.7557, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.03717225790023804, + "rewards/margins": -0.03263280168175697, + "rewards/rejected": -0.004539459943771362, + "step": 5404 + }, + { + "epoch": 0.8358786004252852, + "grad_norm": 5.294131278991699, + "learning_rate": 4.007618283881316e-06, + "logits/chosen": 6.1546502113342285, + "logits/rejected": 5.169168949127197, + "logps/chosen": -269.3834533691406, + "logps/rejected": -223.49252319335938, + "loss": 0.8347, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.08259715884923935, + "rewards/margins": -0.02604052983224392, + "rewards/rejected": -0.05655666068196297, + "step": 5405 + }, + { + "epoch": 0.8360332495650493, + "grad_norm": 5.916450023651123, + "learning_rate": 4.007331882231642e-06, + "logits/chosen": 15.58582878112793, + "logits/rejected": 15.447723388671875, + "logps/chosen": -310.55780029296875, + "logps/rejected": -316.4345397949219, + "loss": 0.7655, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.4645574390888214, + "rewards/margins": -0.09004535526037216, + "rewards/rejected": 0.5546028017997742, + "step": 5406 + }, + { + "epoch": 0.8361878987048135, + "grad_norm": 2.8976948261260986, + "learning_rate": 4.007045480581969e-06, + "logits/chosen": 15.364934921264648, + "logits/rejected": 13.665740966796875, + "logps/chosen": -126.39093017578125, + "logps/rejected": -133.12171936035156, + "loss": 0.5433, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.10509534180164337, + "rewards/margins": 0.45479148626327515, + "rewards/rejected": -0.34969615936279297, + "step": 5407 + }, + { + "epoch": 0.8363425478445776, + "grad_norm": 6.552395820617676, + "learning_rate": 4.006759078932296e-06, + "logits/chosen": 8.795272827148438, + "logits/rejected": 5.817706108093262, + "logps/chosen": -299.75738525390625, + "logps/rejected": -261.15863037109375, + "loss": 0.6783, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.17785143852233887, + "rewards/margins": 0.30028846859931946, + "rewards/rejected": -0.12243704497814178, + "step": 5408 + }, + { + "epoch": 0.8364971969843418, + "grad_norm": 6.476000785827637, + "learning_rate": 4.006472677282621e-06, + "logits/chosen": 11.645604133605957, + "logits/rejected": 8.61089038848877, + "logps/chosen": -259.6112060546875, + "logps/rejected": -225.9443359375, + "loss": 0.8523, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.005492404103279114, + "rewards/margins": -0.264350563287735, + "rewards/rejected": 0.2588581442832947, + "step": 5409 + }, + { + "epoch": 0.8366518461241059, + "grad_norm": 3.7469305992126465, + "learning_rate": 4.006186275632948e-06, + "logits/chosen": 12.261303901672363, + "logits/rejected": 5.355901718139648, + "logps/chosen": -231.98089599609375, + "logps/rejected": -210.46971130371094, + "loss": 0.4566, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.39939552545547485, + "rewards/margins": 0.6560695171356201, + "rewards/rejected": -0.25667402148246765, + "step": 5410 + }, + { + "epoch": 0.8368064952638701, + "grad_norm": 11.923209190368652, + "learning_rate": 4.005899873983275e-06, + "logits/chosen": 15.726322174072266, + "logits/rejected": 10.33785629272461, + "logps/chosen": -393.6388854980469, + "logps/rejected": -321.0372619628906, + "loss": 0.6675, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.38613855838775635, + "rewards/margins": 0.17060332000255585, + "rewards/rejected": 0.21553519368171692, + "step": 5411 + }, + { + "epoch": 0.8369611444036342, + "grad_norm": 5.933987617492676, + "learning_rate": 4.005613472333601e-06, + "logits/chosen": 8.458877563476562, + "logits/rejected": 8.396862030029297, + "logps/chosen": -319.454833984375, + "logps/rejected": -336.85076904296875, + "loss": 0.6865, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.1493656188249588, + "rewards/margins": 0.13950181007385254, + "rewards/rejected": 0.009863808751106262, + "step": 5412 + }, + { + "epoch": 0.8371157935433984, + "grad_norm": 4.926679611206055, + "learning_rate": 4.005327070683927e-06, + "logits/chosen": 9.339546203613281, + "logits/rejected": -0.37227004766464233, + "logps/chosen": -329.81329345703125, + "logps/rejected": -216.01385498046875, + "loss": 0.5934, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.24635376036167145, + "rewards/margins": 0.3219704031944275, + "rewards/rejected": -0.07561665028333664, + "step": 5413 + }, + { + "epoch": 0.8372704426831625, + "grad_norm": 5.30418062210083, + "learning_rate": 4.005040669034254e-06, + "logits/chosen": 12.046310424804688, + "logits/rejected": 10.75256633758545, + "logps/chosen": -251.45831298828125, + "logps/rejected": -233.2830047607422, + "loss": 0.7095, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.39050063490867615, + "rewards/margins": 0.01399308443069458, + "rewards/rejected": 0.37650758028030396, + "step": 5414 + }, + { + "epoch": 0.8374250918229267, + "grad_norm": 5.1942219734191895, + "learning_rate": 4.0047542673845805e-06, + "logits/chosen": 16.091569900512695, + "logits/rejected": 10.159652709960938, + "logps/chosen": -324.17486572265625, + "logps/rejected": -255.77749633789062, + "loss": 0.4045, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.2942097783088684, + "rewards/margins": 0.8292987942695618, + "rewards/rejected": -0.5350890159606934, + "step": 5415 + }, + { + "epoch": 0.8375797409626909, + "grad_norm": 4.26779317855835, + "learning_rate": 4.004467865734907e-06, + "logits/chosen": 16.833147048950195, + "logits/rejected": 12.745699882507324, + "logps/chosen": -216.31436157226562, + "logps/rejected": -178.08505249023438, + "loss": 0.5441, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.18688784539699554, + "rewards/margins": 0.37858593463897705, + "rewards/rejected": -0.1916980892419815, + "step": 5416 + }, + { + "epoch": 0.8377343901024551, + "grad_norm": 6.008831977844238, + "learning_rate": 4.004181464085233e-06, + "logits/chosen": 10.574746131896973, + "logits/rejected": 8.50964069366455, + "logps/chosen": -200.4350128173828, + "logps/rejected": -203.24606323242188, + "loss": 0.5834, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.32726478576660156, + "rewards/margins": 0.3187827169895172, + "rewards/rejected": 0.008482083678245544, + "step": 5417 + }, + { + "epoch": 0.8378890392422192, + "grad_norm": 5.158947944641113, + "learning_rate": 4.0038950624355596e-06, + "logits/chosen": 9.645227432250977, + "logits/rejected": 6.985991954803467, + "logps/chosen": -328.030517578125, + "logps/rejected": -256.05377197265625, + "loss": 0.597, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.6289145946502686, + "rewards/margins": 0.3069944381713867, + "rewards/rejected": 0.32192009687423706, + "step": 5418 + }, + { + "epoch": 0.8380436883819834, + "grad_norm": 4.650661945343018, + "learning_rate": 4.003608660785886e-06, + "logits/chosen": 11.20956802368164, + "logits/rejected": 2.6708712577819824, + "logps/chosen": -333.18011474609375, + "logps/rejected": -222.7096405029297, + "loss": 0.4484, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.40139809250831604, + "rewards/margins": 0.7944390773773193, + "rewards/rejected": -0.39304110407829285, + "step": 5419 + }, + { + "epoch": 0.8381983375217475, + "grad_norm": 5.28973913192749, + "learning_rate": 4.003322259136213e-06, + "logits/chosen": 11.617319107055664, + "logits/rejected": 11.456250190734863, + "logps/chosen": -273.3245849609375, + "logps/rejected": -217.79920959472656, + "loss": 0.6176, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.13470740616321564, + "rewards/margins": 0.23251557350158691, + "rewards/rejected": -0.09780816733837128, + "step": 5420 + }, + { + "epoch": 0.8383529866615117, + "grad_norm": 5.416359901428223, + "learning_rate": 4.0030358574865395e-06, + "logits/chosen": 3.3947293758392334, + "logits/rejected": 5.646587371826172, + "logps/chosen": -227.69610595703125, + "logps/rejected": -242.85885620117188, + "loss": 0.6724, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.14110738039016724, + "rewards/margins": 0.07445431500673294, + "rewards/rejected": 0.0666530653834343, + "step": 5421 + }, + { + "epoch": 0.8385076358012759, + "grad_norm": 6.574779033660889, + "learning_rate": 4.002749455836865e-06, + "logits/chosen": 3.8974621295928955, + "logits/rejected": 5.640439033508301, + "logps/chosen": -249.04949951171875, + "logps/rejected": -253.0032958984375, + "loss": 0.6468, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.3739694356918335, + "rewards/margins": 0.20426106452941895, + "rewards/rejected": 0.16970840096473694, + "step": 5422 + }, + { + "epoch": 0.83866228494104, + "grad_norm": 4.975757122039795, + "learning_rate": 4.002463054187192e-06, + "logits/chosen": 11.546712875366211, + "logits/rejected": 9.134991645812988, + "logps/chosen": -287.65704345703125, + "logps/rejected": -263.9325866699219, + "loss": 0.7327, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.26884832978248596, + "rewards/margins": 0.012437999248504639, + "rewards/rejected": 0.25641030073165894, + "step": 5423 + }, + { + "epoch": 0.8388169340808042, + "grad_norm": 7.482711315155029, + "learning_rate": 4.002176652537519e-06, + "logits/chosen": 12.253576278686523, + "logits/rejected": 10.279074668884277, + "logps/chosen": -317.8530578613281, + "logps/rejected": -239.5455322265625, + "loss": 0.7639, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.2441038191318512, + "rewards/margins": 0.05510842800140381, + "rewards/rejected": 0.1889953762292862, + "step": 5424 + }, + { + "epoch": 0.8389715832205683, + "grad_norm": 5.61442232131958, + "learning_rate": 4.001890250887845e-06, + "logits/chosen": 2.2847537994384766, + "logits/rejected": 3.24780535697937, + "logps/chosen": -164.4940185546875, + "logps/rejected": -216.36029052734375, + "loss": 0.8194, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.3095889985561371, + "rewards/margins": -0.2146223783493042, + "rewards/rejected": -0.09496665000915527, + "step": 5425 + }, + { + "epoch": 0.8391262323603325, + "grad_norm": 4.330936431884766, + "learning_rate": 4.001603849238172e-06, + "logits/chosen": 8.27112102508545, + "logits/rejected": 5.523753643035889, + "logps/chosen": -232.79727172851562, + "logps/rejected": -219.52406311035156, + "loss": 0.5741, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.11994314193725586, + "rewards/margins": 0.5259772539138794, + "rewards/rejected": -0.4060341417789459, + "step": 5426 + }, + { + "epoch": 0.8392808815000966, + "grad_norm": 5.4964213371276855, + "learning_rate": 4.001317447588499e-06, + "logits/chosen": 7.427450180053711, + "logits/rejected": 7.085107803344727, + "logps/chosen": -263.59686279296875, + "logps/rejected": -222.1402587890625, + "loss": 0.6939, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.256794810295105, + "rewards/margins": 0.06467723101377487, + "rewards/rejected": 0.1921176016330719, + "step": 5427 + }, + { + "epoch": 0.8394355306398608, + "grad_norm": 6.141524314880371, + "learning_rate": 4.001031045938825e-06, + "logits/chosen": 9.463638305664062, + "logits/rejected": 12.04014778137207, + "logps/chosen": -197.3345947265625, + "logps/rejected": -293.56268310546875, + "loss": 0.7188, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.08512932062149048, + "rewards/margins": 0.06572775542736053, + "rewards/rejected": -0.150857076048851, + "step": 5428 + }, + { + "epoch": 0.839590179779625, + "grad_norm": 4.164482593536377, + "learning_rate": 4.000744644289151e-06, + "logits/chosen": 15.505621910095215, + "logits/rejected": 6.917988300323486, + "logps/chosen": -352.1197814941406, + "logps/rejected": -238.9019012451172, + "loss": 0.4952, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.4008140563964844, + "rewards/margins": 0.66022127866745, + "rewards/rejected": -0.2594072222709656, + "step": 5429 + }, + { + "epoch": 0.8397448289193892, + "grad_norm": 5.01873254776001, + "learning_rate": 4.000458242639478e-06, + "logits/chosen": 11.75703239440918, + "logits/rejected": 9.634759902954102, + "logps/chosen": -288.7507629394531, + "logps/rejected": -269.29498291015625, + "loss": 0.601, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.22041340172290802, + "rewards/margins": 0.3064883351325989, + "rewards/rejected": -0.08607494086027145, + "step": 5430 + }, + { + "epoch": 0.8398994780591533, + "grad_norm": 5.715124607086182, + "learning_rate": 4.000171840989804e-06, + "logits/chosen": 5.409604072570801, + "logits/rejected": 10.110594749450684, + "logps/chosen": -177.2064208984375, + "logps/rejected": -220.87513732910156, + "loss": 0.8621, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.014394789934158325, + "rewards/margins": -0.12481828778982162, + "rewards/rejected": 0.1104234978556633, + "step": 5431 + }, + { + "epoch": 0.8400541271989175, + "grad_norm": 5.102543354034424, + "learning_rate": 3.999885439340131e-06, + "logits/chosen": 12.225706100463867, + "logits/rejected": 6.9984917640686035, + "logps/chosen": -182.45681762695312, + "logps/rejected": -111.1597671508789, + "loss": 0.5693, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.24482709169387817, + "rewards/margins": 0.36885514855384827, + "rewards/rejected": -0.12402810156345367, + "step": 5432 + }, + { + "epoch": 0.8402087763386816, + "grad_norm": 4.921548843383789, + "learning_rate": 3.999599037690458e-06, + "logits/chosen": 9.975991249084473, + "logits/rejected": 10.47380256652832, + "logps/chosen": -187.17385864257812, + "logps/rejected": -190.00796508789062, + "loss": 0.643, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.04586517810821533, + "rewards/margins": 0.1925581395626068, + "rewards/rejected": -0.23842331767082214, + "step": 5433 + }, + { + "epoch": 0.8403634254784458, + "grad_norm": 3.8383398056030273, + "learning_rate": 3.999312636040784e-06, + "logits/chosen": 9.934160232543945, + "logits/rejected": 6.6337385177612305, + "logps/chosen": -200.88095092773438, + "logps/rejected": -111.7099838256836, + "loss": 0.5699, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.018119312822818756, + "rewards/margins": 0.45537564158439636, + "rewards/rejected": -0.4372563362121582, + "step": 5434 + }, + { + "epoch": 0.8405180746182099, + "grad_norm": 5.574609756469727, + "learning_rate": 3.99902623439111e-06, + "logits/chosen": 9.462666511535645, + "logits/rejected": 3.1613376140594482, + "logps/chosen": -348.7305908203125, + "logps/rejected": -304.126708984375, + "loss": 0.4777, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.15324078500270844, + "rewards/margins": 0.718546986579895, + "rewards/rejected": -0.5653061866760254, + "step": 5435 + }, + { + "epoch": 0.8406727237579741, + "grad_norm": 5.148168087005615, + "learning_rate": 3.998739832741437e-06, + "logits/chosen": 8.252252578735352, + "logits/rejected": 6.443573474884033, + "logps/chosen": -268.34515380859375, + "logps/rejected": -227.71771240234375, + "loss": 0.6833, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.2763122618198395, + "rewards/margins": 0.2445923089981079, + "rewards/rejected": 0.031719960272312164, + "step": 5436 + }, + { + "epoch": 0.8408273728977382, + "grad_norm": 4.207274436950684, + "learning_rate": 3.998453431091763e-06, + "logits/chosen": 11.124573707580566, + "logits/rejected": 7.32269287109375, + "logps/chosen": -285.9581604003906, + "logps/rejected": -257.38043212890625, + "loss": 0.4608, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.49729377031326294, + "rewards/margins": 0.576913595199585, + "rewards/rejected": -0.07961972057819366, + "step": 5437 + }, + { + "epoch": 0.8409820220375024, + "grad_norm": 5.801570892333984, + "learning_rate": 3.99816702944209e-06, + "logits/chosen": 7.315394401550293, + "logits/rejected": 3.6569457054138184, + "logps/chosen": -551.8583984375, + "logps/rejected": -265.0906066894531, + "loss": 0.6491, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.08991736173629761, + "rewards/margins": 0.22030098736286163, + "rewards/rejected": -0.13038358092308044, + "step": 5438 + }, + { + "epoch": 0.8411366711772665, + "grad_norm": 7.415975570678711, + "learning_rate": 3.997880627792417e-06, + "logits/chosen": 10.483821868896484, + "logits/rejected": 5.198424339294434, + "logps/chosen": -201.1514892578125, + "logps/rejected": -182.17062377929688, + "loss": 0.8435, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.054277002811431885, + "rewards/margins": -0.07612176239490509, + "rewards/rejected": 0.021844759583473206, + "step": 5439 + }, + { + "epoch": 0.8412913203170307, + "grad_norm": 13.179608345031738, + "learning_rate": 3.997594226142743e-06, + "logits/chosen": 12.54238510131836, + "logits/rejected": 15.689638137817383, + "logps/chosen": -337.2723083496094, + "logps/rejected": -327.63751220703125, + "loss": 0.571, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.11863747239112854, + "rewards/margins": 0.4537114202976227, + "rewards/rejected": -0.572348952293396, + "step": 5440 + }, + { + "epoch": 0.841445969456795, + "grad_norm": 7.264350414276123, + "learning_rate": 3.99730782449307e-06, + "logits/chosen": 11.289186477661133, + "logits/rejected": 8.527827262878418, + "logps/chosen": -338.5001220703125, + "logps/rejected": -252.5107421875, + "loss": 0.8086, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.03689804673194885, + "rewards/margins": -0.08825166523456573, + "rewards/rejected": 0.12514972686767578, + "step": 5441 + }, + { + "epoch": 0.8416006185965591, + "grad_norm": 5.569958686828613, + "learning_rate": 3.997021422843396e-06, + "logits/chosen": 8.517095565795898, + "logits/rejected": 8.322416305541992, + "logps/chosen": -301.5049133300781, + "logps/rejected": -237.5975799560547, + "loss": 0.6905, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.19004316627979279, + "rewards/margins": 0.10138611495494843, + "rewards/rejected": 0.08865704387426376, + "step": 5442 + }, + { + "epoch": 0.8417552677363233, + "grad_norm": 4.2139997482299805, + "learning_rate": 3.9967350211937224e-06, + "logits/chosen": 11.163293838500977, + "logits/rejected": 5.496822357177734, + "logps/chosen": -294.4971008300781, + "logps/rejected": -254.0455322265625, + "loss": 0.4135, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.518298327922821, + "rewards/margins": 0.8536103963851929, + "rewards/rejected": -0.3353120684623718, + "step": 5443 + }, + { + "epoch": 0.8419099168760874, + "grad_norm": 4.374655246734619, + "learning_rate": 3.996448619544049e-06, + "logits/chosen": 8.741077423095703, + "logits/rejected": 7.998415946960449, + "logps/chosen": -240.798828125, + "logps/rejected": -243.75619506835938, + "loss": 0.6448, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.28527846932411194, + "rewards/margins": 0.21384401619434357, + "rewards/rejected": 0.07143445312976837, + "step": 5444 + }, + { + "epoch": 0.8420645660158516, + "grad_norm": 5.06480073928833, + "learning_rate": 3.996162217894376e-06, + "logits/chosen": 9.56277847290039, + "logits/rejected": 10.73475170135498, + "logps/chosen": -332.46563720703125, + "logps/rejected": -287.79486083984375, + "loss": 0.6308, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.19607073068618774, + "rewards/margins": 0.18030638992786407, + "rewards/rejected": 0.015764333307743073, + "step": 5445 + }, + { + "epoch": 0.8422192151556157, + "grad_norm": 5.909814834594727, + "learning_rate": 3.995875816244702e-06, + "logits/chosen": 8.587616920471191, + "logits/rejected": 6.354789733886719, + "logps/chosen": -203.06478881835938, + "logps/rejected": -216.36228942871094, + "loss": 0.7618, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.333962619304657, + "rewards/margins": 0.006642401218414307, + "rewards/rejected": -0.3406050205230713, + "step": 5446 + }, + { + "epoch": 0.8423738642953799, + "grad_norm": 4.2088727951049805, + "learning_rate": 3.995589414595028e-06, + "logits/chosen": 9.15581226348877, + "logits/rejected": 5.306465148925781, + "logps/chosen": -240.82017517089844, + "logps/rejected": -240.62744140625, + "loss": 0.5462, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.09779989719390869, + "rewards/margins": 0.4776156544685364, + "rewards/rejected": -0.3798157572746277, + "step": 5447 + }, + { + "epoch": 0.842528513435144, + "grad_norm": 8.272249221801758, + "learning_rate": 3.995303012945355e-06, + "logits/chosen": 12.955611228942871, + "logits/rejected": 7.64576530456543, + "logps/chosen": -371.28948974609375, + "logps/rejected": -270.0327453613281, + "loss": 0.7689, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.22182704508304596, + "rewards/margins": 0.3040510416030884, + "rewards/rejected": -0.5258780717849731, + "step": 5448 + }, + { + "epoch": 0.8426831625749082, + "grad_norm": 5.398093223571777, + "learning_rate": 3.9950166112956815e-06, + "logits/chosen": 5.926149845123291, + "logits/rejected": 6.374869346618652, + "logps/chosen": -248.61924743652344, + "logps/rejected": -289.9886779785156, + "loss": 0.5821, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.014869850128889084, + "rewards/margins": 0.4561840891838074, + "rewards/rejected": -0.4413142204284668, + "step": 5449 + }, + { + "epoch": 0.8428378117146723, + "grad_norm": 3.6684465408325195, + "learning_rate": 3.994730209646008e-06, + "logits/chosen": 13.245660781860352, + "logits/rejected": 5.497580528259277, + "logps/chosen": -191.15628051757812, + "logps/rejected": -127.91675567626953, + "loss": 0.5407, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.03709081560373306, + "rewards/margins": 0.47077396512031555, + "rewards/rejected": -0.4336831271648407, + "step": 5450 + }, + { + "epoch": 0.8429924608544365, + "grad_norm": 124.80889129638672, + "learning_rate": 3.994443807996334e-06, + "logits/chosen": 7.019758224487305, + "logits/rejected": 8.855713844299316, + "logps/chosen": -274.4535217285156, + "logps/rejected": -306.98382568359375, + "loss": 0.6544, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.17454136908054352, + "rewards/margins": 0.4354098439216614, + "rewards/rejected": -0.2608683705329895, + "step": 5451 + }, + { + "epoch": 0.8431471099942006, + "grad_norm": 6.874704837799072, + "learning_rate": 3.994157406346661e-06, + "logits/chosen": 6.674954414367676, + "logits/rejected": 2.596411943435669, + "logps/chosen": -221.28607177734375, + "logps/rejected": -212.7696075439453, + "loss": 0.981, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.24677175283432007, + "rewards/margins": -0.3635064363479614, + "rewards/rejected": 0.11673471331596375, + "step": 5452 + }, + { + "epoch": 0.8433017591339648, + "grad_norm": 5.543606758117676, + "learning_rate": 3.993871004696987e-06, + "logits/chosen": 8.839470863342285, + "logits/rejected": 9.854912757873535, + "logps/chosen": -211.8182373046875, + "logps/rejected": -238.90908813476562, + "loss": 0.7424, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.3842387795448303, + "rewards/margins": -0.038873158395290375, + "rewards/rejected": -0.34536558389663696, + "step": 5453 + }, + { + "epoch": 0.843456408273729, + "grad_norm": 6.784116744995117, + "learning_rate": 3.993584603047314e-06, + "logits/chosen": 14.78835391998291, + "logits/rejected": 13.23796558380127, + "logps/chosen": -310.6944885253906, + "logps/rejected": -251.83795166015625, + "loss": 0.8677, + "rewards/accuracies": 0.25, + "rewards/chosen": -0.014471471309661865, + "rewards/margins": -0.24712349474430084, + "rewards/rejected": 0.2326519936323166, + "step": 5454 + }, + { + "epoch": 0.8436110574134932, + "grad_norm": 4.349673271179199, + "learning_rate": 3.99329820139764e-06, + "logits/chosen": 10.144844055175781, + "logits/rejected": 14.238162994384766, + "logps/chosen": -157.75543212890625, + "logps/rejected": -169.78427124023438, + "loss": 0.6516, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.18741624057292938, + "rewards/margins": 0.2006918489933014, + "rewards/rejected": -0.013275638222694397, + "step": 5455 + }, + { + "epoch": 0.8437657065532573, + "grad_norm": 5.367730617523193, + "learning_rate": 3.993011799747966e-06, + "logits/chosen": 12.548636436462402, + "logits/rejected": 10.133837699890137, + "logps/chosen": -282.38360595703125, + "logps/rejected": -311.2052307128906, + "loss": 0.5331, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.4988144040107727, + "rewards/margins": 0.3848533034324646, + "rewards/rejected": 0.11396108567714691, + "step": 5456 + }, + { + "epoch": 0.8439203556930215, + "grad_norm": 6.205030918121338, + "learning_rate": 3.992725398098293e-06, + "logits/chosen": 12.612508773803711, + "logits/rejected": 5.023171901702881, + "logps/chosen": -454.2065734863281, + "logps/rejected": -228.8165740966797, + "loss": 0.662, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.3579191267490387, + "rewards/margins": 0.28423964977264404, + "rewards/rejected": 0.07367949932813644, + "step": 5457 + }, + { + "epoch": 0.8440750048327856, + "grad_norm": 4.7500901222229, + "learning_rate": 3.99243899644862e-06, + "logits/chosen": 8.069990158081055, + "logits/rejected": 10.263045310974121, + "logps/chosen": -149.6179962158203, + "logps/rejected": -191.2418212890625, + "loss": 0.7158, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.16837653517723083, + "rewards/margins": 0.043401919305324554, + "rewards/rejected": -0.2117784470319748, + "step": 5458 + }, + { + "epoch": 0.8442296539725498, + "grad_norm": 4.306687355041504, + "learning_rate": 3.992152594798946e-06, + "logits/chosen": 11.442225456237793, + "logits/rejected": 1.4943844079971313, + "logps/chosen": -242.65017700195312, + "logps/rejected": -108.43907928466797, + "loss": 0.5709, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.12707114219665527, + "rewards/margins": 0.47909146547317505, + "rewards/rejected": -0.3520203232765198, + "step": 5459 + }, + { + "epoch": 0.8443843031123139, + "grad_norm": 5.47991943359375, + "learning_rate": 3.991866193149273e-06, + "logits/chosen": 9.255265235900879, + "logits/rejected": 6.019491195678711, + "logps/chosen": -253.19265747070312, + "logps/rejected": -192.83509826660156, + "loss": 0.6754, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.06651601195335388, + "rewards/margins": 0.19208915531635284, + "rewards/rejected": -0.12557317316532135, + "step": 5460 + }, + { + "epoch": 0.8445389522520781, + "grad_norm": 3.825132369995117, + "learning_rate": 3.9915797914996e-06, + "logits/chosen": 8.347002983093262, + "logits/rejected": 9.163528442382812, + "logps/chosen": -165.01242065429688, + "logps/rejected": -166.1040802001953, + "loss": 0.6086, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.13648171722888947, + "rewards/margins": 0.3249170184135437, + "rewards/rejected": -0.18843528628349304, + "step": 5461 + }, + { + "epoch": 0.8446936013918422, + "grad_norm": 5.549593448638916, + "learning_rate": 3.991293389849925e-06, + "logits/chosen": 13.56130599975586, + "logits/rejected": 12.508157730102539, + "logps/chosen": -263.97509765625, + "logps/rejected": -273.3756103515625, + "loss": 0.6705, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.050782106816768646, + "rewards/margins": 0.08747228980064392, + "rewards/rejected": -0.13825440406799316, + "step": 5462 + }, + { + "epoch": 0.8448482505316064, + "grad_norm": 5.221488952636719, + "learning_rate": 3.991006988200252e-06, + "logits/chosen": 7.283550262451172, + "logits/rejected": 4.725826263427734, + "logps/chosen": -362.39312744140625, + "logps/rejected": -299.189208984375, + "loss": 0.5214, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.6565374732017517, + "rewards/margins": 0.79338538646698, + "rewards/rejected": -0.13684791326522827, + "step": 5463 + }, + { + "epoch": 0.8450028996713705, + "grad_norm": 3.4726195335388184, + "learning_rate": 3.990720586550579e-06, + "logits/chosen": 11.575824737548828, + "logits/rejected": -0.2912936210632324, + "logps/chosen": -329.842041015625, + "logps/rejected": -176.5313262939453, + "loss": 0.4885, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.06805653870105743, + "rewards/margins": 0.6268208026885986, + "rewards/rejected": -0.5587642788887024, + "step": 5464 + }, + { + "epoch": 0.8451575488111347, + "grad_norm": 5.900938034057617, + "learning_rate": 3.990434184900905e-06, + "logits/chosen": 13.171445846557617, + "logits/rejected": 6.941216945648193, + "logps/chosen": -353.9685363769531, + "logps/rejected": -183.34910583496094, + "loss": 0.4463, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.4265161156654358, + "rewards/margins": 0.8473890423774719, + "rewards/rejected": -0.4208729565143585, + "step": 5465 + }, + { + "epoch": 0.8453121979508988, + "grad_norm": 7.092343330383301, + "learning_rate": 3.990147783251232e-06, + "logits/chosen": 9.414999961853027, + "logits/rejected": 9.273770332336426, + "logps/chosen": -354.06866455078125, + "logps/rejected": -328.0182189941406, + "loss": 0.6045, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.14364787936210632, + "rewards/margins": 0.26097074151039124, + "rewards/rejected": -0.11732286214828491, + "step": 5466 + }, + { + "epoch": 0.8454668470906631, + "grad_norm": 6.088544845581055, + "learning_rate": 3.989861381601559e-06, + "logits/chosen": 8.224116325378418, + "logits/rejected": 10.840134620666504, + "logps/chosen": -418.8666687011719, + "logps/rejected": -407.1133117675781, + "loss": 0.5853, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.14702008664608002, + "rewards/margins": 0.394416481256485, + "rewards/rejected": -0.24739637970924377, + "step": 5467 + }, + { + "epoch": 0.8456214962304273, + "grad_norm": 5.374605178833008, + "learning_rate": 3.9895749799518845e-06, + "logits/chosen": 12.732425689697266, + "logits/rejected": 14.39950180053711, + "logps/chosen": -236.18177795410156, + "logps/rejected": -282.9896240234375, + "loss": 0.8467, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.2567649781703949, + "rewards/margins": -0.23266582190990448, + "rewards/rejected": -0.024099163711071014, + "step": 5468 + }, + { + "epoch": 0.8457761453701914, + "grad_norm": 5.545962333679199, + "learning_rate": 3.989288578302211e-06, + "logits/chosen": 12.432683944702148, + "logits/rejected": 3.2952661514282227, + "logps/chosen": -269.54656982421875, + "logps/rejected": -229.50872802734375, + "loss": 0.5971, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.25189971923828125, + "rewards/margins": 0.23972085118293762, + "rewards/rejected": 0.012178882956504822, + "step": 5469 + }, + { + "epoch": 0.8459307945099556, + "grad_norm": 3.770609140396118, + "learning_rate": 3.989002176652538e-06, + "logits/chosen": 12.660408973693848, + "logits/rejected": 7.900226593017578, + "logps/chosen": -268.8815612792969, + "logps/rejected": -205.54701232910156, + "loss": 0.5838, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.3496103286743164, + "rewards/margins": 0.25646916031837463, + "rewards/rejected": 0.09314117580652237, + "step": 5470 + }, + { + "epoch": 0.8460854436497197, + "grad_norm": 6.674996852874756, + "learning_rate": 3.9887157750028644e-06, + "logits/chosen": 9.373661041259766, + "logits/rejected": 6.587751865386963, + "logps/chosen": -425.7975158691406, + "logps/rejected": -419.66815185546875, + "loss": 0.5703, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.4166502356529236, + "rewards/margins": 0.4247686564922333, + "rewards/rejected": -0.008118439465761185, + "step": 5471 + }, + { + "epoch": 0.8462400927894839, + "grad_norm": 4.417994499206543, + "learning_rate": 3.988429373353191e-06, + "logits/chosen": 12.378002166748047, + "logits/rejected": 14.436321258544922, + "logps/chosen": -135.5165557861328, + "logps/rejected": -164.73138427734375, + "loss": 0.6432, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.2739633619785309, + "rewards/margins": 0.13503243029117584, + "rewards/rejected": -0.40899577736854553, + "step": 5472 + }, + { + "epoch": 0.846394741929248, + "grad_norm": 5.065564155578613, + "learning_rate": 3.988142971703518e-06, + "logits/chosen": 10.653124809265137, + "logits/rejected": 6.704805374145508, + "logps/chosen": -267.66009521484375, + "logps/rejected": -214.47833251953125, + "loss": 0.6208, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.3979877829551697, + "rewards/margins": 0.42701444029808044, + "rewards/rejected": -0.029026679694652557, + "step": 5473 + }, + { + "epoch": 0.8465493910690122, + "grad_norm": 7.963303089141846, + "learning_rate": 3.987856570053844e-06, + "logits/chosen": 8.149672508239746, + "logits/rejected": 10.126824378967285, + "logps/chosen": -187.68344116210938, + "logps/rejected": -277.41009521484375, + "loss": 0.8712, + "rewards/accuracies": 0.25, + "rewards/chosen": 0.0975956916809082, + "rewards/margins": -0.25921422243118286, + "rewards/rejected": 0.35680994391441345, + "step": 5474 + }, + { + "epoch": 0.8467040402087763, + "grad_norm": 5.427379131317139, + "learning_rate": 3.98757016840417e-06, + "logits/chosen": 15.40723991394043, + "logits/rejected": 12.819866180419922, + "logps/chosen": -403.01910400390625, + "logps/rejected": -333.90325927734375, + "loss": 0.4324, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.6557334661483765, + "rewards/margins": 0.815754771232605, + "rewards/rejected": -0.16002121567726135, + "step": 5475 + }, + { + "epoch": 0.8468586893485405, + "grad_norm": 5.994505405426025, + "learning_rate": 3.987283766754497e-06, + "logits/chosen": 7.1236467361450195, + "logits/rejected": 2.8771605491638184, + "logps/chosen": -229.33168029785156, + "logps/rejected": -191.91180419921875, + "loss": 0.626, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.03733201324939728, + "rewards/margins": 0.17790646851062775, + "rewards/rejected": -0.14057445526123047, + "step": 5476 + }, + { + "epoch": 0.8470133384883046, + "grad_norm": 4.784754753112793, + "learning_rate": 3.9869973651048235e-06, + "logits/chosen": 12.940276145935059, + "logits/rejected": 6.782338619232178, + "logps/chosen": -316.0126647949219, + "logps/rejected": -254.24484252929688, + "loss": 0.5747, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.018910042941570282, + "rewards/margins": 0.3407229781150818, + "rewards/rejected": -0.3596329689025879, + "step": 5477 + }, + { + "epoch": 0.8471679876280688, + "grad_norm": 5.256471157073975, + "learning_rate": 3.98671096345515e-06, + "logits/chosen": 7.633310794830322, + "logits/rejected": 4.674440383911133, + "logps/chosen": -331.40875244140625, + "logps/rejected": -204.9730682373047, + "loss": 0.7333, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.16876892745494843, + "rewards/margins": 0.11264100670814514, + "rewards/rejected": 0.05612790584564209, + "step": 5478 + }, + { + "epoch": 0.8473226367678329, + "grad_norm": 5.068542957305908, + "learning_rate": 3.986424561805477e-06, + "logits/chosen": 9.508679389953613, + "logits/rejected": 5.23380708694458, + "logps/chosen": -208.66061401367188, + "logps/rejected": -164.4246368408203, + "loss": 0.7484, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.20797139406204224, + "rewards/margins": -0.022533897310495377, + "rewards/rejected": -0.18543748557567596, + "step": 5479 + }, + { + "epoch": 0.8474772859075972, + "grad_norm": 6.490504264831543, + "learning_rate": 3.9861381601558034e-06, + "logits/chosen": 8.677637100219727, + "logits/rejected": 7.825351715087891, + "logps/chosen": -232.0186004638672, + "logps/rejected": -198.38323974609375, + "loss": 0.6872, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.35770681500434875, + "rewards/margins": 0.12265169620513916, + "rewards/rejected": -0.4803585112094879, + "step": 5480 + }, + { + "epoch": 0.8476319350473613, + "grad_norm": 5.286553859710693, + "learning_rate": 3.985851758506129e-06, + "logits/chosen": 5.221107482910156, + "logits/rejected": 6.806898593902588, + "logps/chosen": -187.85260009765625, + "logps/rejected": -245.42233276367188, + "loss": 0.7391, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.2765347361564636, + "rewards/margins": -0.029204845428466797, + "rewards/rejected": -0.24732989072799683, + "step": 5481 + }, + { + "epoch": 0.8477865841871255, + "grad_norm": 7.118972301483154, + "learning_rate": 3.985565356856456e-06, + "logits/chosen": 14.64684772491455, + "logits/rejected": 12.665660858154297, + "logps/chosen": -263.484130859375, + "logps/rejected": -300.7195739746094, + "loss": 0.6834, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.37521693110466003, + "rewards/margins": 0.20982113480567932, + "rewards/rejected": 0.16539576649665833, + "step": 5482 + }, + { + "epoch": 0.8479412333268896, + "grad_norm": 6.252742767333984, + "learning_rate": 3.9852789552067825e-06, + "logits/chosen": 9.27850341796875, + "logits/rejected": 8.052875518798828, + "logps/chosen": -285.70111083984375, + "logps/rejected": -285.9509582519531, + "loss": 0.76, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.04352791607379913, + "rewards/margins": 0.22563757002353668, + "rewards/rejected": -0.2691655158996582, + "step": 5483 + }, + { + "epoch": 0.8480958824666538, + "grad_norm": 5.236358642578125, + "learning_rate": 3.984992553557109e-06, + "logits/chosen": 7.729684352874756, + "logits/rejected": -1.0859134197235107, + "logps/chosen": -260.7658996582031, + "logps/rejected": -158.29518127441406, + "loss": 0.5808, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.14956846833229065, + "rewards/margins": 0.349578857421875, + "rewards/rejected": -0.20001041889190674, + "step": 5484 + }, + { + "epoch": 0.848250531606418, + "grad_norm": 3.3107223510742188, + "learning_rate": 3.984706151907435e-06, + "logits/chosen": 5.789141654968262, + "logits/rejected": 3.0061471462249756, + "logps/chosen": -209.1846466064453, + "logps/rejected": -131.57101440429688, + "loss": 0.5561, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.0344119668006897, + "rewards/margins": 0.46683138608932495, + "rewards/rejected": -0.5012433528900146, + "step": 5485 + }, + { + "epoch": 0.8484051807461821, + "grad_norm": 3.3643791675567627, + "learning_rate": 3.984419750257762e-06, + "logits/chosen": 12.296547889709473, + "logits/rejected": 5.911550045013428, + "logps/chosen": -202.30465698242188, + "logps/rejected": -137.86553955078125, + "loss": 0.4844, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.20803004503250122, + "rewards/margins": 0.6443226337432861, + "rewards/rejected": -0.4362925589084625, + "step": 5486 + }, + { + "epoch": 0.8485598298859462, + "grad_norm": 22.514795303344727, + "learning_rate": 3.984133348608088e-06, + "logits/chosen": 9.195449829101562, + "logits/rejected": 9.772502899169922, + "logps/chosen": -239.89364624023438, + "logps/rejected": -248.40615844726562, + "loss": 0.796, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.6489839553833008, + "rewards/margins": -0.02284925803542137, + "rewards/rejected": -0.6261346936225891, + "step": 5487 + }, + { + "epoch": 0.8487144790257104, + "grad_norm": 5.163241863250732, + "learning_rate": 3.983846946958415e-06, + "logits/chosen": 9.574090957641602, + "logits/rejected": 13.16737174987793, + "logps/chosen": -165.08685302734375, + "logps/rejected": -243.1929931640625, + "loss": 0.5893, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.10586822032928467, + "rewards/margins": 0.3137997090816498, + "rewards/rejected": -0.41966789960861206, + "step": 5488 + }, + { + "epoch": 0.8488691281654746, + "grad_norm": 4.582966327667236, + "learning_rate": 3.983560545308741e-06, + "logits/chosen": 8.251498222351074, + "logits/rejected": 1.7257134914398193, + "logps/chosen": -363.6442565917969, + "logps/rejected": -225.69203186035156, + "loss": 0.4887, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.14303795993328094, + "rewards/margins": 0.595531702041626, + "rewards/rejected": -0.4524937868118286, + "step": 5489 + }, + { + "epoch": 0.8490237773052387, + "grad_norm": 5.71950626373291, + "learning_rate": 3.983274143659067e-06, + "logits/chosen": 9.648967742919922, + "logits/rejected": 4.315744876861572, + "logps/chosen": -304.9479675292969, + "logps/rejected": -292.1197509765625, + "loss": 0.5288, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.0482361763715744, + "rewards/margins": 0.7476270198822021, + "rewards/rejected": -0.7958632707595825, + "step": 5490 + }, + { + "epoch": 0.8491784264450029, + "grad_norm": 6.292933464050293, + "learning_rate": 3.982987742009394e-06, + "logits/chosen": 4.617978096008301, + "logits/rejected": 6.700188636779785, + "logps/chosen": -208.8612060546875, + "logps/rejected": -267.6300048828125, + "loss": 0.9422, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.0003688819706439972, + "rewards/margins": -0.3538159430027008, + "rewards/rejected": 0.3534470796585083, + "step": 5491 + }, + { + "epoch": 0.849333075584767, + "grad_norm": 3.921041250228882, + "learning_rate": 3.982701340359721e-06, + "logits/chosen": 9.975062370300293, + "logits/rejected": 5.595370769500732, + "logps/chosen": -248.07568359375, + "logps/rejected": -264.8523254394531, + "loss": 0.5387, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.049535274505615234, + "rewards/margins": 0.4183952808380127, + "rewards/rejected": -0.46793055534362793, + "step": 5492 + }, + { + "epoch": 0.8494877247245313, + "grad_norm": 4.795502185821533, + "learning_rate": 3.982414938710047e-06, + "logits/chosen": 12.421577453613281, + "logits/rejected": 8.837335586547852, + "logps/chosen": -251.47259521484375, + "logps/rejected": -209.2999267578125, + "loss": 0.6389, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.08791498094797134, + "rewards/margins": 0.2378964126110077, + "rewards/rejected": -0.14998143911361694, + "step": 5493 + }, + { + "epoch": 0.8496423738642954, + "grad_norm": 3.839031219482422, + "learning_rate": 3.982128537060374e-06, + "logits/chosen": 7.401708602905273, + "logits/rejected": 5.659162998199463, + "logps/chosen": -192.65206909179688, + "logps/rejected": -117.4052734375, + "loss": 0.5601, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.03605546057224274, + "rewards/margins": 0.369170606136322, + "rewards/rejected": -0.3331151604652405, + "step": 5494 + }, + { + "epoch": 0.8497970230040596, + "grad_norm": 6.970352649688721, + "learning_rate": 3.9818421354107e-06, + "logits/chosen": 8.382070541381836, + "logits/rejected": 14.198983192443848, + "logps/chosen": -212.7875213623047, + "logps/rejected": -306.4270935058594, + "loss": 0.9613, + "rewards/accuracies": 0.25, + "rewards/chosen": -0.38489991426467896, + "rewards/margins": -0.437112033367157, + "rewards/rejected": 0.05221214145421982, + "step": 5495 + }, + { + "epoch": 0.8499516721438237, + "grad_norm": 3.9105520248413086, + "learning_rate": 3.9815557337610265e-06, + "logits/chosen": 10.03400993347168, + "logits/rejected": 10.690797805786133, + "logps/chosen": -231.88458251953125, + "logps/rejected": -230.76101684570312, + "loss": 0.6025, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.06226252019405365, + "rewards/margins": 0.3598465323448181, + "rewards/rejected": -0.2975839674472809, + "step": 5496 + }, + { + "epoch": 0.8501063212835879, + "grad_norm": 8.119980812072754, + "learning_rate": 3.981269332111353e-06, + "logits/chosen": 7.285583972930908, + "logits/rejected": 3.6322736740112305, + "logps/chosen": -426.3985290527344, + "logps/rejected": -408.44232177734375, + "loss": 0.7449, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.1334976851940155, + "rewards/margins": 0.354358971118927, + "rewards/rejected": -0.22086124122142792, + "step": 5497 + }, + { + "epoch": 0.850260970423352, + "grad_norm": 3.3217272758483887, + "learning_rate": 3.98098293046168e-06, + "logits/chosen": 12.459260940551758, + "logits/rejected": 4.302353382110596, + "logps/chosen": -217.92222595214844, + "logps/rejected": -184.69979858398438, + "loss": 0.4998, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.09964251518249512, + "rewards/margins": 0.46920478343963623, + "rewards/rejected": -0.3695622682571411, + "step": 5498 + }, + { + "epoch": 0.8504156195631162, + "grad_norm": 9.07116985321045, + "learning_rate": 3.980696528812006e-06, + "logits/chosen": 6.277608394622803, + "logits/rejected": 9.21856689453125, + "logps/chosen": -296.05181884765625, + "logps/rejected": -331.8778991699219, + "loss": 0.8306, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.15516506135463715, + "rewards/margins": -0.1964268982410431, + "rewards/rejected": 0.04126181825995445, + "step": 5499 + }, + { + "epoch": 0.8505702687028803, + "grad_norm": 6.438467025756836, + "learning_rate": 3.980410127162333e-06, + "logits/chosen": 6.978841304779053, + "logits/rejected": 11.333194732666016, + "logps/chosen": -189.72842407226562, + "logps/rejected": -252.83270263671875, + "loss": 0.9321, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.20909695327281952, + "rewards/margins": -0.3070370554924011, + "rewards/rejected": 0.0979401096701622, + "step": 5500 + }, + { + "epoch": 0.8507249178426445, + "grad_norm": 5.399377822875977, + "learning_rate": 3.980123725512659e-06, + "logits/chosen": 13.276514053344727, + "logits/rejected": 18.137290954589844, + "logps/chosen": -143.25082397460938, + "logps/rejected": -185.884521484375, + "loss": 0.9047, + "rewards/accuracies": 0.125, + "rewards/chosen": -0.2445690631866455, + "rewards/margins": -0.3527756333351135, + "rewards/rejected": 0.10820655524730682, + "step": 5501 + }, + { + "epoch": 0.8508795669824086, + "grad_norm": 6.511845588684082, + "learning_rate": 3.9798373238629855e-06, + "logits/chosen": 8.543060302734375, + "logits/rejected": 5.446573734283447, + "logps/chosen": -250.5183563232422, + "logps/rejected": -240.40802001953125, + "loss": 0.7824, + "rewards/accuracies": 0.25, + "rewards/chosen": -0.08172449469566345, + "rewards/margins": -0.10007509589195251, + "rewards/rejected": 0.018350616097450256, + "step": 5502 + }, + { + "epoch": 0.8510342161221728, + "grad_norm": 6.365176200866699, + "learning_rate": 3.979550922213312e-06, + "logits/chosen": 11.111099243164062, + "logits/rejected": 5.582236289978027, + "logps/chosen": -383.2252502441406, + "logps/rejected": -310.8315124511719, + "loss": 0.7168, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.16108235716819763, + "rewards/margins": 0.22597134113311768, + "rewards/rejected": -0.3870536983013153, + "step": 5503 + }, + { + "epoch": 0.8511888652619369, + "grad_norm": 9.928916931152344, + "learning_rate": 3.979264520563639e-06, + "logits/chosen": 8.950950622558594, + "logits/rejected": 6.616206169128418, + "logps/chosen": -383.1068420410156, + "logps/rejected": -409.5474548339844, + "loss": 0.5915, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.3428735136985779, + "rewards/margins": 0.25947338342666626, + "rewards/rejected": 0.08340016007423401, + "step": 5504 + }, + { + "epoch": 0.8513435144017012, + "grad_norm": 3.189671277999878, + "learning_rate": 3.9789781189139655e-06, + "logits/chosen": 14.187501907348633, + "logits/rejected": 8.953895568847656, + "logps/chosen": -227.4154815673828, + "logps/rejected": -184.2188262939453, + "loss": 0.4707, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.46153968572616577, + "rewards/margins": 0.5768137574195862, + "rewards/rejected": -0.1152740940451622, + "step": 5505 + }, + { + "epoch": 0.8514981635414653, + "grad_norm": 5.075366973876953, + "learning_rate": 3.978691717264292e-06, + "logits/chosen": 14.241186141967773, + "logits/rejected": 10.220351219177246, + "logps/chosen": -198.8037109375, + "logps/rejected": -168.76805114746094, + "loss": 0.6364, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.06963552534580231, + "rewards/margins": 0.26270973682403564, + "rewards/rejected": -0.33234524726867676, + "step": 5506 + }, + { + "epoch": 0.8516528126812295, + "grad_norm": 5.984999656677246, + "learning_rate": 3.978405315614619e-06, + "logits/chosen": 12.944738388061523, + "logits/rejected": 11.130903244018555, + "logps/chosen": -344.90496826171875, + "logps/rejected": -307.46441650390625, + "loss": 0.6923, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.39944949746131897, + "rewards/margins": 0.28340253233909607, + "rewards/rejected": 0.11604700237512589, + "step": 5507 + }, + { + "epoch": 0.8518074618209936, + "grad_norm": 5.058184623718262, + "learning_rate": 3.9781189139649446e-06, + "logits/chosen": 14.496612548828125, + "logits/rejected": 3.7888152599334717, + "logps/chosen": -352.7911376953125, + "logps/rejected": -240.57614135742188, + "loss": 0.5169, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.22777345776557922, + "rewards/margins": 0.5655710101127625, + "rewards/rejected": -0.33779752254486084, + "step": 5508 + }, + { + "epoch": 0.8519621109607578, + "grad_norm": 6.677819728851318, + "learning_rate": 3.977832512315271e-06, + "logits/chosen": 9.146625518798828, + "logits/rejected": 4.415740966796875, + "logps/chosen": -337.3669128417969, + "logps/rejected": -276.8423767089844, + "loss": 0.6121, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.3911222219467163, + "rewards/margins": 0.36787116527557373, + "rewards/rejected": 0.023251086473464966, + "step": 5509 + }, + { + "epoch": 0.852116760100522, + "grad_norm": 5.219935894012451, + "learning_rate": 3.977546110665598e-06, + "logits/chosen": 11.380074501037598, + "logits/rejected": 4.655664920806885, + "logps/chosen": -284.1026916503906, + "logps/rejected": -216.97323608398438, + "loss": 0.5087, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.21835654973983765, + "rewards/margins": 0.5672281980514526, + "rewards/rejected": -0.3488716781139374, + "step": 5510 + }, + { + "epoch": 0.8522714092402861, + "grad_norm": 5.133790493011475, + "learning_rate": 3.9772597090159245e-06, + "logits/chosen": 12.784741401672363, + "logits/rejected": 9.135361671447754, + "logps/chosen": -365.32843017578125, + "logps/rejected": -298.96746826171875, + "loss": 0.6463, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.10990868508815765, + "rewards/margins": 0.13372841477394104, + "rewards/rejected": -0.2436370998620987, + "step": 5511 + }, + { + "epoch": 0.8524260583800503, + "grad_norm": 4.976015567779541, + "learning_rate": 3.976973307366251e-06, + "logits/chosen": 8.681876182556152, + "logits/rejected": 7.761447429656982, + "logps/chosen": -191.5873565673828, + "logps/rejected": -204.71768188476562, + "loss": 0.6727, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.3618316054344177, + "rewards/margins": 0.12575922906398773, + "rewards/rejected": 0.2360723614692688, + "step": 5512 + }, + { + "epoch": 0.8525807075198144, + "grad_norm": 4.589407920837402, + "learning_rate": 3.976686905716578e-06, + "logits/chosen": 16.618200302124023, + "logits/rejected": 9.696112632751465, + "logps/chosen": -324.1329040527344, + "logps/rejected": -209.81588745117188, + "loss": 0.4338, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.02694815769791603, + "rewards/margins": 0.8358587026596069, + "rewards/rejected": -0.8628067970275879, + "step": 5513 + }, + { + "epoch": 0.8527353566595786, + "grad_norm": 5.786118030548096, + "learning_rate": 3.976400504066904e-06, + "logits/chosen": 11.021668434143066, + "logits/rejected": 10.574874877929688, + "logps/chosen": -247.92266845703125, + "logps/rejected": -264.9029541015625, + "loss": 0.6241, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.09197451174259186, + "rewards/margins": 0.3161090612411499, + "rewards/rejected": -0.22413453459739685, + "step": 5514 + }, + { + "epoch": 0.8528900057993427, + "grad_norm": 5.676928520202637, + "learning_rate": 3.97611410241723e-06, + "logits/chosen": 12.500770568847656, + "logits/rejected": 11.121211051940918, + "logps/chosen": -450.8160095214844, + "logps/rejected": -376.83343505859375, + "loss": 0.5175, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.6583291292190552, + "rewards/margins": 0.5020024180412292, + "rewards/rejected": 0.15632666647434235, + "step": 5515 + }, + { + "epoch": 0.8530446549391069, + "grad_norm": 4.453126907348633, + "learning_rate": 3.975827700767557e-06, + "logits/chosen": 10.449207305908203, + "logits/rejected": 12.167027473449707, + "logps/chosen": -183.93203735351562, + "logps/rejected": -222.5181884765625, + "loss": 0.6508, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.00059538334608078, + "rewards/margins": 0.28539708256721497, + "rewards/rejected": -0.28599247336387634, + "step": 5516 + }, + { + "epoch": 0.853199304078871, + "grad_norm": 9.975476264953613, + "learning_rate": 3.9755412991178836e-06, + "logits/chosen": 8.739119529724121, + "logits/rejected": 9.14430046081543, + "logps/chosen": -360.94146728515625, + "logps/rejected": -317.3536376953125, + "loss": 1.188, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.4631873369216919, + "rewards/margins": -0.5738186240196228, + "rewards/rejected": 0.1106313019990921, + "step": 5517 + }, + { + "epoch": 0.8533539532186353, + "grad_norm": 3.940706253051758, + "learning_rate": 3.97525489746821e-06, + "logits/chosen": 12.042640686035156, + "logits/rejected": 13.598318099975586, + "logps/chosen": -179.56272888183594, + "logps/rejected": -218.2500762939453, + "loss": 0.5224, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.07911482453346252, + "rewards/margins": 0.39709216356277466, + "rewards/rejected": -0.31797733902931213, + "step": 5518 + }, + { + "epoch": 0.8535086023583994, + "grad_norm": 7.980309963226318, + "learning_rate": 3.974968495818536e-06, + "logits/chosen": 11.663081169128418, + "logits/rejected": 10.500115394592285, + "logps/chosen": -397.35565185546875, + "logps/rejected": -308.91632080078125, + "loss": 0.6316, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.2902355194091797, + "rewards/margins": 0.2300301343202591, + "rewards/rejected": 0.060205407440662384, + "step": 5519 + }, + { + "epoch": 0.8536632514981636, + "grad_norm": 4.050093650817871, + "learning_rate": 3.974682094168863e-06, + "logits/chosen": 11.713811874389648, + "logits/rejected": 7.592785835266113, + "logps/chosen": -199.0465087890625, + "logps/rejected": -182.48623657226562, + "loss": 0.5093, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.004756644368171692, + "rewards/margins": 0.5292977094650269, + "rewards/rejected": -0.5340542793273926, + "step": 5520 + }, + { + "epoch": 0.8538179006379277, + "grad_norm": 5.195960521697998, + "learning_rate": 3.974395692519189e-06, + "logits/chosen": 5.48490571975708, + "logits/rejected": 7.278876304626465, + "logps/chosen": -100.11968231201172, + "logps/rejected": -161.72340393066406, + "loss": 0.8406, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.07327794283628464, + "rewards/margins": -0.2435295432806015, + "rewards/rejected": 0.17025160789489746, + "step": 5521 + }, + { + "epoch": 0.8539725497776919, + "grad_norm": 4.676542282104492, + "learning_rate": 3.974109290869516e-06, + "logits/chosen": 2.9857027530670166, + "logits/rejected": 7.063844680786133, + "logps/chosen": -223.51419067382812, + "logps/rejected": -311.634521484375, + "loss": 0.4021, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.03967433422803879, + "rewards/margins": 0.9175422191619873, + "rewards/rejected": -0.9572165012359619, + "step": 5522 + }, + { + "epoch": 0.854127198917456, + "grad_norm": 7.71112060546875, + "learning_rate": 3.973822889219842e-06, + "logits/chosen": 12.697504043579102, + "logits/rejected": 8.772088050842285, + "logps/chosen": -165.67213439941406, + "logps/rejected": -203.76319885253906, + "loss": 0.632, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.30290815234184265, + "rewards/margins": 0.159666508436203, + "rewards/rejected": 0.14324164390563965, + "step": 5523 + }, + { + "epoch": 0.8542818480572202, + "grad_norm": 3.8125832080841064, + "learning_rate": 3.9735364875701684e-06, + "logits/chosen": 10.960256576538086, + "logits/rejected": 3.113831043243408, + "logps/chosen": -309.34332275390625, + "logps/rejected": -163.04061889648438, + "loss": 0.5579, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.2807078957557678, + "rewards/margins": 0.43664461374282837, + "rewards/rejected": -0.15593671798706055, + "step": 5524 + }, + { + "epoch": 0.8544364971969843, + "grad_norm": 4.3831329345703125, + "learning_rate": 3.973250085920495e-06, + "logits/chosen": 14.845224380493164, + "logits/rejected": 5.573022842407227, + "logps/chosen": -321.0080871582031, + "logps/rejected": -181.04718017578125, + "loss": 0.5544, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.13004538416862488, + "rewards/margins": 0.4373641014099121, + "rewards/rejected": -0.30731871724128723, + "step": 5525 + }, + { + "epoch": 0.8545911463367485, + "grad_norm": 4.656836032867432, + "learning_rate": 3.972963684270822e-06, + "logits/chosen": 7.565646648406982, + "logits/rejected": 3.1633729934692383, + "logps/chosen": -341.6214599609375, + "logps/rejected": -219.31301879882812, + "loss": 0.5371, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.3157382011413574, + "rewards/margins": 0.41551926732063293, + "rewards/rejected": -0.0997810810804367, + "step": 5526 + }, + { + "epoch": 0.8547457954765126, + "grad_norm": 5.701992511749268, + "learning_rate": 3.972677282621148e-06, + "logits/chosen": 5.455742359161377, + "logits/rejected": 5.645936965942383, + "logps/chosen": -277.1202392578125, + "logps/rejected": -262.00091552734375, + "loss": 0.5569, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.06556187570095062, + "rewards/margins": 0.34338244795799255, + "rewards/rejected": -0.408944308757782, + "step": 5527 + }, + { + "epoch": 0.8549004446162768, + "grad_norm": 13.386775016784668, + "learning_rate": 3.972390880971474e-06, + "logits/chosen": 12.9055814743042, + "logits/rejected": 10.631471633911133, + "logps/chosen": -345.20465087890625, + "logps/rejected": -305.3077697753906, + "loss": 0.8612, + "rewards/accuracies": 0.25, + "rewards/chosen": -0.14449329674243927, + "rewards/margins": -0.14074286818504333, + "rewards/rejected": -0.003750436007976532, + "step": 5528 + }, + { + "epoch": 0.8550550937560409, + "grad_norm": 5.1428375244140625, + "learning_rate": 3.972104479321801e-06, + "logits/chosen": 12.146710395812988, + "logits/rejected": 8.29996395111084, + "logps/chosen": -635.9072875976562, + "logps/rejected": -470.0387878417969, + "loss": 0.4645, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.6608940362930298, + "rewards/margins": 0.6956630349159241, + "rewards/rejected": -0.034769050776958466, + "step": 5529 + }, + { + "epoch": 0.8552097428958051, + "grad_norm": 5.171927452087402, + "learning_rate": 3.9718180776721275e-06, + "logits/chosen": 13.20880126953125, + "logits/rejected": 8.684325218200684, + "logps/chosen": -362.83544921875, + "logps/rejected": -350.0487976074219, + "loss": 0.6053, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.11922894418239594, + "rewards/margins": 0.4723970293998718, + "rewards/rejected": -0.3531681299209595, + "step": 5530 + }, + { + "epoch": 0.8553643920355694, + "grad_norm": 6.222297191619873, + "learning_rate": 3.971531676022454e-06, + "logits/chosen": 13.412002563476562, + "logits/rejected": 9.023429870605469, + "logps/chosen": -277.14251708984375, + "logps/rejected": -238.841064453125, + "loss": 0.5226, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.2784581184387207, + "rewards/margins": 0.5299692153930664, + "rewards/rejected": -0.2515110969543457, + "step": 5531 + }, + { + "epoch": 0.8555190411753335, + "grad_norm": 4.826961517333984, + "learning_rate": 3.971245274372781e-06, + "logits/chosen": 10.715517044067383, + "logits/rejected": 6.7913498878479, + "logps/chosen": -196.56402587890625, + "logps/rejected": -125.03688049316406, + "loss": 0.5586, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.05702914670109749, + "rewards/margins": 0.40519261360168457, + "rewards/rejected": -0.3481634855270386, + "step": 5532 + }, + { + "epoch": 0.8556736903150977, + "grad_norm": 6.214565753936768, + "learning_rate": 3.9709588727231074e-06, + "logits/chosen": 12.208765983581543, + "logits/rejected": 9.875855445861816, + "logps/chosen": -317.4407958984375, + "logps/rejected": -296.9223937988281, + "loss": 0.8089, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.08436611294746399, + "rewards/margins": -0.17837321758270264, + "rewards/rejected": 0.09400712698698044, + "step": 5533 + }, + { + "epoch": 0.8558283394548618, + "grad_norm": 5.382976531982422, + "learning_rate": 3.970672471073433e-06, + "logits/chosen": 5.199872970581055, + "logits/rejected": 9.805152893066406, + "logps/chosen": -213.81668090820312, + "logps/rejected": -259.80584716796875, + "loss": 0.8172, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.03748548403382301, + "rewards/margins": -0.16475293040275574, + "rewards/rejected": 0.12726745009422302, + "step": 5534 + }, + { + "epoch": 0.855982988594626, + "grad_norm": 5.463367938995361, + "learning_rate": 3.97038606942376e-06, + "logits/chosen": 12.242330551147461, + "logits/rejected": 11.819622993469238, + "logps/chosen": -224.96087646484375, + "logps/rejected": -259.28826904296875, + "loss": 0.7398, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.03340083360671997, + "rewards/margins": 0.008428975939750671, + "rewards/rejected": 0.024971872568130493, + "step": 5535 + }, + { + "epoch": 0.8561376377343901, + "grad_norm": 4.299546241760254, + "learning_rate": 3.9700996677740865e-06, + "logits/chosen": 15.45047378540039, + "logits/rejected": 8.400930404663086, + "logps/chosen": -197.11111450195312, + "logps/rejected": -145.38729858398438, + "loss": 0.5645, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.3550064265727997, + "rewards/margins": 0.3712439239025116, + "rewards/rejected": -0.7262503504753113, + "step": 5536 + }, + { + "epoch": 0.8562922868741543, + "grad_norm": 6.327221393585205, + "learning_rate": 3.969813266124413e-06, + "logits/chosen": 9.867925643920898, + "logits/rejected": 8.902490615844727, + "logps/chosen": -389.11212158203125, + "logps/rejected": -294.26513671875, + "loss": 0.6609, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.1277143359184265, + "rewards/margins": 0.196881502866745, + "rewards/rejected": -0.06916715949773788, + "step": 5537 + }, + { + "epoch": 0.8564469360139184, + "grad_norm": 5.8160834312438965, + "learning_rate": 3.96952686447474e-06, + "logits/chosen": 15.131919860839844, + "logits/rejected": 8.25251579284668, + "logps/chosen": -322.2184753417969, + "logps/rejected": -238.88633728027344, + "loss": 0.6919, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.12332974374294281, + "rewards/margins": 0.16262176632881165, + "rewards/rejected": -0.03929205983877182, + "step": 5538 + }, + { + "epoch": 0.8566015851536826, + "grad_norm": 5.321799278259277, + "learning_rate": 3.9692404628250665e-06, + "logits/chosen": 8.743563652038574, + "logits/rejected": 5.112298488616943, + "logps/chosen": -353.24847412109375, + "logps/rejected": -263.61798095703125, + "loss": 0.7137, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.19939926266670227, + "rewards/margins": 0.09951724112033844, + "rewards/rejected": 0.09988205134868622, + "step": 5539 + }, + { + "epoch": 0.8567562342934467, + "grad_norm": 10.051793098449707, + "learning_rate": 3.968954061175393e-06, + "logits/chosen": 9.454984664916992, + "logits/rejected": 3.158181667327881, + "logps/chosen": -438.4866943359375, + "logps/rejected": -303.512939453125, + "loss": 0.4547, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.29820510745048523, + "rewards/margins": 0.7262248992919922, + "rewards/rejected": -0.42801979184150696, + "step": 5540 + }, + { + "epoch": 0.8569108834332109, + "grad_norm": 17.073593139648438, + "learning_rate": 3.968667659525719e-06, + "logits/chosen": 8.447195053100586, + "logits/rejected": 8.953034400939941, + "logps/chosen": -340.5556640625, + "logps/rejected": -355.7740478515625, + "loss": 0.7351, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.30880415439605713, + "rewards/margins": 0.05526772141456604, + "rewards/rejected": 0.2535364031791687, + "step": 5541 + }, + { + "epoch": 0.857065532572975, + "grad_norm": 4.253938674926758, + "learning_rate": 3.968381257876046e-06, + "logits/chosen": 6.562461853027344, + "logits/rejected": -2.93577241897583, + "logps/chosen": -265.87109375, + "logps/rejected": -153.58255004882812, + "loss": 0.5355, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.09016941487789154, + "rewards/margins": 0.4087334871292114, + "rewards/rejected": -0.49890291690826416, + "step": 5542 + }, + { + "epoch": 0.8572201817127392, + "grad_norm": 6.389232158660889, + "learning_rate": 3.968094856226372e-06, + "logits/chosen": 7.3160247802734375, + "logits/rejected": 6.972973823547363, + "logps/chosen": -222.66339111328125, + "logps/rejected": -282.0250244140625, + "loss": 0.6935, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.0887586697936058, + "rewards/margins": 0.025575678795576096, + "rewards/rejected": -0.1143343448638916, + "step": 5543 + }, + { + "epoch": 0.8573748308525034, + "grad_norm": 4.663178443908691, + "learning_rate": 3.967808454576699e-06, + "logits/chosen": 11.413351058959961, + "logits/rejected": 7.847943305969238, + "logps/chosen": -230.12515258789062, + "logps/rejected": -210.52798461914062, + "loss": 0.5126, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.31653425097465515, + "rewards/margins": 0.5554164052009583, + "rewards/rejected": -0.2388821840286255, + "step": 5544 + }, + { + "epoch": 0.8575294799922676, + "grad_norm": 7.371320724487305, + "learning_rate": 3.9675220529270255e-06, + "logits/chosen": 9.153878211975098, + "logits/rejected": 4.771054267883301, + "logps/chosen": -431.10205078125, + "logps/rejected": -367.1356201171875, + "loss": 0.6264, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.6836245059967041, + "rewards/margins": 0.19900822639465332, + "rewards/rejected": 0.48461630940437317, + "step": 5545 + }, + { + "epoch": 0.8576841291320317, + "grad_norm": 8.207365036010742, + "learning_rate": 3.967235651277352e-06, + "logits/chosen": 7.745014667510986, + "logits/rejected": 5.892283916473389, + "logps/chosen": -226.06863403320312, + "logps/rejected": -262.7561340332031, + "loss": 0.9527, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.24970795214176178, + "rewards/margins": -0.36114442348480225, + "rewards/rejected": 0.11143648624420166, + "step": 5546 + }, + { + "epoch": 0.8578387782717959, + "grad_norm": 6.068310737609863, + "learning_rate": 3.966949249627678e-06, + "logits/chosen": 2.7964437007904053, + "logits/rejected": 4.375521183013916, + "logps/chosen": -208.13751220703125, + "logps/rejected": -257.7275695800781, + "loss": 0.8231, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.14024151861667633, + "rewards/margins": 0.09700223803520203, + "rewards/rejected": 0.043239206075668335, + "step": 5547 + }, + { + "epoch": 0.85799342741156, + "grad_norm": 5.413476467132568, + "learning_rate": 3.966662847978005e-06, + "logits/chosen": 9.429909706115723, + "logits/rejected": 11.490013122558594, + "logps/chosen": -258.3641052246094, + "logps/rejected": -279.36212158203125, + "loss": 0.6794, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.25293999910354614, + "rewards/margins": 0.045966241508722305, + "rewards/rejected": 0.20697373151779175, + "step": 5548 + }, + { + "epoch": 0.8581480765513242, + "grad_norm": 4.993340492248535, + "learning_rate": 3.966376446328331e-06, + "logits/chosen": 12.423839569091797, + "logits/rejected": 12.750974655151367, + "logps/chosen": -280.7548828125, + "logps/rejected": -346.6853332519531, + "loss": 0.4456, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.35837188363075256, + "rewards/margins": 0.6557400226593018, + "rewards/rejected": -0.2973681390285492, + "step": 5549 + }, + { + "epoch": 0.8583027256910883, + "grad_norm": 5.79370641708374, + "learning_rate": 3.966090044678658e-06, + "logits/chosen": 9.242609024047852, + "logits/rejected": 10.357810974121094, + "logps/chosen": -323.60565185546875, + "logps/rejected": -347.3231506347656, + "loss": 0.637, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.15418893098831177, + "rewards/margins": 0.4306488335132599, + "rewards/rejected": -0.2764599919319153, + "step": 5550 + }, + { + "epoch": 0.8584573748308525, + "grad_norm": 7.5426201820373535, + "learning_rate": 3.965803643028985e-06, + "logits/chosen": 9.522414207458496, + "logits/rejected": 12.636218070983887, + "logps/chosen": -266.0033264160156, + "logps/rejected": -283.789794921875, + "loss": 0.8981, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.19298070669174194, + "rewards/margins": -0.1364603340625763, + "rewards/rejected": -0.05652038753032684, + "step": 5551 + }, + { + "epoch": 0.8586120239706166, + "grad_norm": 4.220952033996582, + "learning_rate": 3.96551724137931e-06, + "logits/chosen": 8.761380195617676, + "logits/rejected": 3.7925281524658203, + "logps/chosen": -250.84353637695312, + "logps/rejected": -210.23773193359375, + "loss": 0.4722, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.050732336938381195, + "rewards/margins": 0.6292870044708252, + "rewards/rejected": -0.5785546898841858, + "step": 5552 + }, + { + "epoch": 0.8587666731103808, + "grad_norm": 5.072403430938721, + "learning_rate": 3.965230839729637e-06, + "logits/chosen": 8.99654483795166, + "logits/rejected": 14.076211929321289, + "logps/chosen": -164.7877655029297, + "logps/rejected": -205.15432739257812, + "loss": 0.6751, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.23361565172672272, + "rewards/margins": 0.12223657965660095, + "rewards/rejected": -0.3558522164821625, + "step": 5553 + }, + { + "epoch": 0.858921322250145, + "grad_norm": 4.6037774085998535, + "learning_rate": 3.964944438079964e-06, + "logits/chosen": 12.717657089233398, + "logits/rejected": 10.634328842163086, + "logps/chosen": -290.24505615234375, + "logps/rejected": -316.93035888671875, + "loss": 0.5442, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.1433984786272049, + "rewards/margins": 0.42021113634109497, + "rewards/rejected": -0.2768126428127289, + "step": 5554 + }, + { + "epoch": 0.8590759713899091, + "grad_norm": 6.171657562255859, + "learning_rate": 3.96465803643029e-06, + "logits/chosen": 6.50289249420166, + "logits/rejected": 8.421377182006836, + "logps/chosen": -162.61032104492188, + "logps/rejected": -209.62774658203125, + "loss": 0.7573, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.1106221154332161, + "rewards/margins": -0.07369647175073624, + "rewards/rejected": 0.18431857228279114, + "step": 5555 + }, + { + "epoch": 0.8592306205296733, + "grad_norm": 7.367908000946045, + "learning_rate": 3.964371634780617e-06, + "logits/chosen": 12.182393074035645, + "logits/rejected": 5.837953567504883, + "logps/chosen": -336.4083251953125, + "logps/rejected": -213.71844482421875, + "loss": 0.684, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.1285104751586914, + "rewards/margins": 0.3372896611690521, + "rewards/rejected": -0.20877915620803833, + "step": 5556 + }, + { + "epoch": 0.8593852696694375, + "grad_norm": 5.022037982940674, + "learning_rate": 3.964085233130943e-06, + "logits/chosen": 8.136197090148926, + "logits/rejected": 7.256975173950195, + "logps/chosen": -219.56837463378906, + "logps/rejected": -241.2669219970703, + "loss": 0.5404, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.22837886214256287, + "rewards/margins": 0.4187639057636261, + "rewards/rejected": -0.19038507342338562, + "step": 5557 + }, + { + "epoch": 0.8595399188092017, + "grad_norm": 6.981671333312988, + "learning_rate": 3.9637988314812695e-06, + "logits/chosen": 11.872611999511719, + "logits/rejected": 12.234930038452148, + "logps/chosen": -341.8452453613281, + "logps/rejected": -340.52899169921875, + "loss": 0.9542, + "rewards/accuracies": 0.125, + "rewards/chosen": -0.12858542799949646, + "rewards/margins": -0.380593478679657, + "rewards/rejected": 0.2520080506801605, + "step": 5558 + }, + { + "epoch": 0.8596945679489658, + "grad_norm": 5.184417724609375, + "learning_rate": 3.963512429831596e-06, + "logits/chosen": 6.285898208618164, + "logits/rejected": 6.663991928100586, + "logps/chosen": -205.7534637451172, + "logps/rejected": -235.4987030029297, + "loss": 0.7474, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.21914659440517426, + "rewards/margins": 0.003766484558582306, + "rewards/rejected": 0.21538008749485016, + "step": 5559 + }, + { + "epoch": 0.85984921708873, + "grad_norm": 5.698225498199463, + "learning_rate": 3.963226028181923e-06, + "logits/chosen": 11.049238204956055, + "logits/rejected": 9.75180435180664, + "logps/chosen": -255.01080322265625, + "logps/rejected": -263.9441223144531, + "loss": 0.7398, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.17442665994167328, + "rewards/margins": 0.054497167468070984, + "rewards/rejected": 0.11992951482534409, + "step": 5560 + }, + { + "epoch": 0.8600038662284941, + "grad_norm": 7.597950458526611, + "learning_rate": 3.9629396265322486e-06, + "logits/chosen": 6.745762825012207, + "logits/rejected": 5.756608963012695, + "logps/chosen": -251.48046875, + "logps/rejected": -241.15567016601562, + "loss": 0.8848, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.17406368255615234, + "rewards/margins": 0.1296660602092743, + "rewards/rejected": -0.30372974276542664, + "step": 5561 + }, + { + "epoch": 0.8601585153682583, + "grad_norm": 3.4851796627044678, + "learning_rate": 3.962653224882575e-06, + "logits/chosen": 11.194986343383789, + "logits/rejected": 8.162862777709961, + "logps/chosen": -114.60517120361328, + "logps/rejected": -111.16508483886719, + "loss": 0.576, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.027560904622077942, + "rewards/margins": 0.317198246717453, + "rewards/rejected": -0.34475910663604736, + "step": 5562 + }, + { + "epoch": 0.8603131645080224, + "grad_norm": 7.40164852142334, + "learning_rate": 3.962366823232902e-06, + "logits/chosen": 12.411735534667969, + "logits/rejected": 8.850687026977539, + "logps/chosen": -337.29547119140625, + "logps/rejected": -277.8260498046875, + "loss": 0.6801, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.36499306559562683, + "rewards/margins": 0.21913452446460724, + "rewards/rejected": 0.14585858583450317, + "step": 5563 + }, + { + "epoch": 0.8604678136477866, + "grad_norm": 6.116249084472656, + "learning_rate": 3.9620804215832285e-06, + "logits/chosen": 12.2783203125, + "logits/rejected": 10.857599258422852, + "logps/chosen": -281.6092224121094, + "logps/rejected": -212.8323974609375, + "loss": 0.6946, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.11243562400341034, + "rewards/margins": 0.053007036447525024, + "rewards/rejected": -0.16544266045093536, + "step": 5564 + }, + { + "epoch": 0.8606224627875507, + "grad_norm": 5.1273603439331055, + "learning_rate": 3.961794019933555e-06, + "logits/chosen": 8.237610816955566, + "logits/rejected": 7.821701526641846, + "logps/chosen": -234.32177734375, + "logps/rejected": -317.19769287109375, + "loss": 0.5606, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.16736702620983124, + "rewards/margins": 0.33171504735946655, + "rewards/rejected": -0.1643480360507965, + "step": 5565 + }, + { + "epoch": 0.8607771119273149, + "grad_norm": 4.789910316467285, + "learning_rate": 3.961507618283882e-06, + "logits/chosen": 11.612488746643066, + "logits/rejected": 10.5182523727417, + "logps/chosen": -265.38177490234375, + "logps/rejected": -242.57489013671875, + "loss": 0.6635, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.013011261820793152, + "rewards/margins": 0.13539275527000427, + "rewards/rejected": -0.12238150089979172, + "step": 5566 + }, + { + "epoch": 0.860931761067079, + "grad_norm": 5.014251708984375, + "learning_rate": 3.961221216634208e-06, + "logits/chosen": 16.432933807373047, + "logits/rejected": 13.618185043334961, + "logps/chosen": -384.0850524902344, + "logps/rejected": -331.5423583984375, + "loss": 0.5402, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.3585062026977539, + "rewards/margins": 0.6304289102554321, + "rewards/rejected": -0.27192267775535583, + "step": 5567 + }, + { + "epoch": 0.8610864102068432, + "grad_norm": 6.250051975250244, + "learning_rate": 3.960934814984534e-06, + "logits/chosen": 12.310720443725586, + "logits/rejected": 13.226346015930176, + "logps/chosen": -320.4024658203125, + "logps/rejected": -314.7881774902344, + "loss": 0.8323, + "rewards/accuracies": 0.25, + "rewards/chosen": -0.0634918212890625, + "rewards/margins": -0.19194220006465912, + "rewards/rejected": 0.12845037877559662, + "step": 5568 + }, + { + "epoch": 0.8612410593466073, + "grad_norm": 5.12235164642334, + "learning_rate": 3.960648413334861e-06, + "logits/chosen": 6.9342122077941895, + "logits/rejected": 7.804890155792236, + "logps/chosen": -284.4322814941406, + "logps/rejected": -261.19305419921875, + "loss": 0.6083, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.039760202169418335, + "rewards/margins": 0.21222585439682007, + "rewards/rejected": -0.2519860863685608, + "step": 5569 + }, + { + "epoch": 0.8613957084863716, + "grad_norm": 22.274707794189453, + "learning_rate": 3.9603620116851876e-06, + "logits/chosen": 10.584257125854492, + "logits/rejected": 13.976146697998047, + "logps/chosen": -151.39248657226562, + "logps/rejected": -187.773681640625, + "loss": 0.785, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.3866022527217865, + "rewards/margins": -0.1313094198703766, + "rewards/rejected": -0.2552928328514099, + "step": 5570 + }, + { + "epoch": 0.8615503576261357, + "grad_norm": 6.046761512756348, + "learning_rate": 3.960075610035514e-06, + "logits/chosen": 13.19694709777832, + "logits/rejected": 9.149131774902344, + "logps/chosen": -230.0457305908203, + "logps/rejected": -177.5976104736328, + "loss": 0.781, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.19006797671318054, + "rewards/margins": -0.06863032281398773, + "rewards/rejected": -0.12143763899803162, + "step": 5571 + }, + { + "epoch": 0.8617050067658999, + "grad_norm": 5.000414848327637, + "learning_rate": 3.959789208385841e-06, + "logits/chosen": 11.03077507019043, + "logits/rejected": 13.187887191772461, + "logps/chosen": -260.22027587890625, + "logps/rejected": -252.41555786132812, + "loss": 0.7057, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.16556119918823242, + "rewards/margins": 0.00532994419336319, + "rewards/rejected": -0.1708911508321762, + "step": 5572 + }, + { + "epoch": 0.861859655905664, + "grad_norm": 9.176785469055176, + "learning_rate": 3.9595028067361675e-06, + "logits/chosen": 8.673894882202148, + "logits/rejected": 8.565500259399414, + "logps/chosen": -279.72021484375, + "logps/rejected": -264.1474609375, + "loss": 1.0208, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.4105193614959717, + "rewards/margins": -0.3400534391403198, + "rewards/rejected": -0.07046595215797424, + "step": 5573 + }, + { + "epoch": 0.8620143050454282, + "grad_norm": 6.136605739593506, + "learning_rate": 3.959216405086493e-06, + "logits/chosen": 11.717804908752441, + "logits/rejected": 11.446734428405762, + "logps/chosen": -269.6417236328125, + "logps/rejected": -265.83331298828125, + "loss": 0.7581, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.019072622060775757, + "rewards/margins": -0.07733035087585449, + "rewards/rejected": 0.09640297293663025, + "step": 5574 + }, + { + "epoch": 0.8621689541851923, + "grad_norm": 5.139334678649902, + "learning_rate": 3.95893000343682e-06, + "logits/chosen": 14.313995361328125, + "logits/rejected": 11.819242477416992, + "logps/chosen": -344.8808898925781, + "logps/rejected": -280.67645263671875, + "loss": 0.6363, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.488066703081131, + "rewards/margins": 0.472256064414978, + "rewards/rejected": 0.015810586512088776, + "step": 5575 + }, + { + "epoch": 0.8623236033249565, + "grad_norm": 6.363095283508301, + "learning_rate": 3.958643601787147e-06, + "logits/chosen": 13.745997428894043, + "logits/rejected": 3.4037387371063232, + "logps/chosen": -333.07745361328125, + "logps/rejected": -218.85836791992188, + "loss": 0.4101, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.12555581331253052, + "rewards/margins": 0.7948639392852783, + "rewards/rejected": -0.6693081259727478, + "step": 5576 + }, + { + "epoch": 0.8624782524647207, + "grad_norm": 6.558548927307129, + "learning_rate": 3.958357200137473e-06, + "logits/chosen": 1.0127133131027222, + "logits/rejected": 5.681978225708008, + "logps/chosen": -249.9398651123047, + "logps/rejected": -289.20001220703125, + "loss": 0.6633, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.16100487112998962, + "rewards/margins": 0.11872687190771103, + "rewards/rejected": 0.042277999222278595, + "step": 5577 + }, + { + "epoch": 0.8626329016044848, + "grad_norm": 4.4048871994018555, + "learning_rate": 3.9580707984878e-06, + "logits/chosen": 10.924642562866211, + "logits/rejected": 4.229212760925293, + "logps/chosen": -335.284912109375, + "logps/rejected": -203.130126953125, + "loss": 0.655, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.08222848176956177, + "rewards/margins": 0.22194786369800568, + "rewards/rejected": -0.30417633056640625, + "step": 5578 + }, + { + "epoch": 0.862787550744249, + "grad_norm": 5.373351573944092, + "learning_rate": 3.957784396838127e-06, + "logits/chosen": 6.795207977294922, + "logits/rejected": 12.02531623840332, + "logps/chosen": -209.53326416015625, + "logps/rejected": -254.7356414794922, + "loss": 0.7737, + "rewards/accuracies": 0.25, + "rewards/chosen": -0.04380173981189728, + "rewards/margins": -0.11935292929410934, + "rewards/rejected": 0.07555118203163147, + "step": 5579 + }, + { + "epoch": 0.8629421998840131, + "grad_norm": 6.247703552246094, + "learning_rate": 3.957497995188452e-06, + "logits/chosen": 7.563823699951172, + "logits/rejected": 5.765118598937988, + "logps/chosen": -443.6858825683594, + "logps/rejected": -397.2836608886719, + "loss": 0.5705, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.12699198722839355, + "rewards/margins": 0.49848002195358276, + "rewards/rejected": -0.3714880347251892, + "step": 5580 + }, + { + "epoch": 0.8630968490237773, + "grad_norm": 8.227096557617188, + "learning_rate": 3.957211593538779e-06, + "logits/chosen": 9.86913013458252, + "logits/rejected": 7.2057623863220215, + "logps/chosen": -324.13116455078125, + "logps/rejected": -367.2266845703125, + "loss": 0.8363, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.022597193717956543, + "rewards/margins": 0.06384515762329102, + "rewards/rejected": -0.08644236624240875, + "step": 5581 + }, + { + "epoch": 0.8632514981635415, + "grad_norm": 4.9398393630981445, + "learning_rate": 3.956925191889106e-06, + "logits/chosen": 3.505120038986206, + "logits/rejected": 8.423637390136719, + "logps/chosen": -156.98138427734375, + "logps/rejected": -226.23301696777344, + "loss": 0.6654, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.10237536579370499, + "rewards/margins": 0.09464340656995773, + "rewards/rejected": -0.1970188021659851, + "step": 5582 + }, + { + "epoch": 0.8634061473033057, + "grad_norm": 11.51438045501709, + "learning_rate": 3.956638790239432e-06, + "logits/chosen": 9.482044219970703, + "logits/rejected": 14.614742279052734, + "logps/chosen": -244.4484100341797, + "logps/rejected": -456.899658203125, + "loss": 0.6966, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.08227504789829254, + "rewards/margins": 0.058049432933330536, + "rewards/rejected": -0.14032451808452606, + "step": 5583 + }, + { + "epoch": 0.8635607964430698, + "grad_norm": 5.607166767120361, + "learning_rate": 3.956352388589759e-06, + "logits/chosen": 10.76272964477539, + "logits/rejected": 1.8757203817367554, + "logps/chosen": -357.39892578125, + "logps/rejected": -284.69171142578125, + "loss": 0.4959, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.13323983550071716, + "rewards/margins": 0.5907565355300903, + "rewards/rejected": -0.45751672983169556, + "step": 5584 + }, + { + "epoch": 0.863715445582834, + "grad_norm": 7.2526984214782715, + "learning_rate": 3.956065986940086e-06, + "logits/chosen": 11.311847686767578, + "logits/rejected": 12.392753601074219, + "logps/chosen": -331.0661315917969, + "logps/rejected": -373.85162353515625, + "loss": 0.8958, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.025860585272312164, + "rewards/margins": -0.1390220820903778, + "rewards/rejected": 0.16488268971443176, + "step": 5585 + }, + { + "epoch": 0.8638700947225981, + "grad_norm": 3.3673830032348633, + "learning_rate": 3.9557795852904114e-06, + "logits/chosen": 7.557724952697754, + "logits/rejected": 4.005731582641602, + "logps/chosen": -113.14189147949219, + "logps/rejected": -146.0245819091797, + "loss": 0.5504, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.28499364852905273, + "rewards/margins": 0.4255397319793701, + "rewards/rejected": -0.7105333805084229, + "step": 5586 + }, + { + "epoch": 0.8640247438623623, + "grad_norm": 7.194338321685791, + "learning_rate": 3.955493183640738e-06, + "logits/chosen": 9.403700828552246, + "logits/rejected": 7.309061527252197, + "logps/chosen": -275.8131103515625, + "logps/rejected": -313.4444885253906, + "loss": 0.683, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.42857396602630615, + "rewards/margins": 0.0807749330997467, + "rewards/rejected": 0.34779900312423706, + "step": 5587 + }, + { + "epoch": 0.8641793930021264, + "grad_norm": 3.8534152507781982, + "learning_rate": 3.955206781991065e-06, + "logits/chosen": 17.8963565826416, + "logits/rejected": 8.14344596862793, + "logps/chosen": -427.930908203125, + "logps/rejected": -356.53076171875, + "loss": 0.435, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.5800073742866516, + "rewards/margins": 0.8146600723266602, + "rewards/rejected": -0.23465272784233093, + "step": 5588 + }, + { + "epoch": 0.8643340421418906, + "grad_norm": 6.4218525886535645, + "learning_rate": 3.954920380341391e-06, + "logits/chosen": 6.947747707366943, + "logits/rejected": 6.141756534576416, + "logps/chosen": -360.1333923339844, + "logps/rejected": -388.59979248046875, + "loss": 0.6759, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.24942654371261597, + "rewards/margins": 0.14833489060401917, + "rewards/rejected": 0.10109168291091919, + "step": 5589 + }, + { + "epoch": 0.8644886912816547, + "grad_norm": 4.075565338134766, + "learning_rate": 3.954633978691717e-06, + "logits/chosen": 12.835847854614258, + "logits/rejected": 5.379994869232178, + "logps/chosen": -265.06915283203125, + "logps/rejected": -136.9203338623047, + "loss": 0.5726, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.05200263857841492, + "rewards/margins": 0.5894217491149902, + "rewards/rejected": -0.6414244174957275, + "step": 5590 + }, + { + "epoch": 0.8646433404214189, + "grad_norm": 4.9312920570373535, + "learning_rate": 3.954347577042044e-06, + "logits/chosen": 7.500208854675293, + "logits/rejected": 8.466938972473145, + "logps/chosen": -114.65245056152344, + "logps/rejected": -227.3726806640625, + "loss": 0.5527, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.3188348114490509, + "rewards/margins": 0.5198426842689514, + "rewards/rejected": -0.8386775255203247, + "step": 5591 + }, + { + "epoch": 0.864797989561183, + "grad_norm": 6.288552284240723, + "learning_rate": 3.9540611753923705e-06, + "logits/chosen": 12.835363388061523, + "logits/rejected": 5.822674751281738, + "logps/chosen": -318.30364990234375, + "logps/rejected": -362.590576171875, + "loss": 0.6417, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.07603712379932404, + "rewards/margins": 0.2552597224712372, + "rewards/rejected": -0.3312968611717224, + "step": 5592 + }, + { + "epoch": 0.8649526387009472, + "grad_norm": 7.344549179077148, + "learning_rate": 3.953774773742697e-06, + "logits/chosen": 16.82768440246582, + "logits/rejected": 6.916017532348633, + "logps/chosen": -210.23316955566406, + "logps/rejected": -137.35305786132812, + "loss": 0.8574, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.8811869025230408, + "rewards/margins": -0.18916422128677368, + "rewards/rejected": -0.6920227408409119, + "step": 5593 + }, + { + "epoch": 0.8651072878407113, + "grad_norm": 5.030068397521973, + "learning_rate": 3.953488372093024e-06, + "logits/chosen": 8.664188385009766, + "logits/rejected": 10.605673789978027, + "logps/chosen": -349.632080078125, + "logps/rejected": -383.5105285644531, + "loss": 0.5603, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.24142371118068695, + "rewards/margins": 0.6926233172416687, + "rewards/rejected": -0.45119965076446533, + "step": 5594 + }, + { + "epoch": 0.8652619369804756, + "grad_norm": 5.0279669761657715, + "learning_rate": 3.95320197044335e-06, + "logits/chosen": 10.762035369873047, + "logits/rejected": 6.500424861907959, + "logps/chosen": -365.1126403808594, + "logps/rejected": -279.74798583984375, + "loss": 0.5475, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.6005088090896606, + "rewards/margins": 0.4703788757324219, + "rewards/rejected": 0.13012991845607758, + "step": 5595 + }, + { + "epoch": 0.8654165861202398, + "grad_norm": 5.001711368560791, + "learning_rate": 3.952915568793676e-06, + "logits/chosen": 9.891135215759277, + "logits/rejected": 4.358508586883545, + "logps/chosen": -412.11480712890625, + "logps/rejected": -239.86184692382812, + "loss": 0.5274, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.2786291241645813, + "rewards/margins": 0.6756082773208618, + "rewards/rejected": -0.3969791531562805, + "step": 5596 + }, + { + "epoch": 0.8655712352600039, + "grad_norm": 5.172476291656494, + "learning_rate": 3.952629167144003e-06, + "logits/chosen": 9.6688232421875, + "logits/rejected": 9.194499969482422, + "logps/chosen": -373.3966369628906, + "logps/rejected": -409.0218200683594, + "loss": 0.5085, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.4827442765235901, + "rewards/margins": 0.6946344375610352, + "rewards/rejected": -0.21189017593860626, + "step": 5597 + }, + { + "epoch": 0.865725884399768, + "grad_norm": 4.925397872924805, + "learning_rate": 3.9523427654943296e-06, + "logits/chosen": 12.273463249206543, + "logits/rejected": 8.57210922241211, + "logps/chosen": -258.656982421875, + "logps/rejected": -196.41998291015625, + "loss": 0.6451, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.2659473419189453, + "rewards/margins": 0.1428723931312561, + "rewards/rejected": 0.12307494878768921, + "step": 5598 + }, + { + "epoch": 0.8658805335395322, + "grad_norm": 42.34637451171875, + "learning_rate": 3.952056363844656e-06, + "logits/chosen": 8.770404815673828, + "logits/rejected": 4.005621910095215, + "logps/chosen": -176.989990234375, + "logps/rejected": -140.04248046875, + "loss": 0.5476, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.2061765044927597, + "rewards/margins": 0.5282948017120361, + "rewards/rejected": -0.32211834192276, + "step": 5599 + }, + { + "epoch": 0.8660351826792964, + "grad_norm": 4.553174018859863, + "learning_rate": 3.951769962194982e-06, + "logits/chosen": 5.2121663093566895, + "logits/rejected": 7.479856967926025, + "logps/chosen": -164.1041259765625, + "logps/rejected": -164.49722290039062, + "loss": 0.5913, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.08756618201732635, + "rewards/margins": 0.3333725929260254, + "rewards/rejected": -0.24580645561218262, + "step": 5600 + }, + { + "epoch": 0.8661898318190605, + "grad_norm": 4.731479644775391, + "learning_rate": 3.951483560545309e-06, + "logits/chosen": 15.040358543395996, + "logits/rejected": 14.302785873413086, + "logps/chosen": -190.0380401611328, + "logps/rejected": -220.01815795898438, + "loss": 0.5428, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.10195541381835938, + "rewards/margins": 0.42290830612182617, + "rewards/rejected": -0.5248637199401855, + "step": 5601 + }, + { + "epoch": 0.8663444809588247, + "grad_norm": 5.872387886047363, + "learning_rate": 3.951197158895635e-06, + "logits/chosen": 6.359010696411133, + "logits/rejected": 11.898884773254395, + "logps/chosen": -141.8868408203125, + "logps/rejected": -228.91729736328125, + "loss": 0.694, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.050789739936590195, + "rewards/margins": 0.01831177994608879, + "rewards/rejected": -0.06910151988267899, + "step": 5602 + }, + { + "epoch": 0.8664991300985888, + "grad_norm": 6.231602668762207, + "learning_rate": 3.950910757245962e-06, + "logits/chosen": 7.6184306144714355, + "logits/rejected": 8.309837341308594, + "logps/chosen": -298.68377685546875, + "logps/rejected": -300.8892517089844, + "loss": 0.6997, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.20585650205612183, + "rewards/margins": 0.12166690826416016, + "rewards/rejected": 0.08418960869312286, + "step": 5603 + }, + { + "epoch": 0.866653779238353, + "grad_norm": 5.718890190124512, + "learning_rate": 3.950624355596289e-06, + "logits/chosen": 1.0394971370697021, + "logits/rejected": 13.917007446289062, + "logps/chosen": -162.19444274902344, + "logps/rejected": -358.68890380859375, + "loss": 0.7912, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.1972808688879013, + "rewards/margins": -0.05365484952926636, + "rewards/rejected": -0.14362603425979614, + "step": 5604 + }, + { + "epoch": 0.8668084283781171, + "grad_norm": 5.780733108520508, + "learning_rate": 3.950337953946615e-06, + "logits/chosen": 11.858189582824707, + "logits/rejected": 9.82012939453125, + "logps/chosen": -419.78668212890625, + "logps/rejected": -427.4334716796875, + "loss": 0.5878, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.4508832097053528, + "rewards/margins": 0.28657639026641846, + "rewards/rejected": 0.16430678963661194, + "step": 5605 + }, + { + "epoch": 0.8669630775178813, + "grad_norm": 6.165585041046143, + "learning_rate": 3.950051552296942e-06, + "logits/chosen": 7.159930229187012, + "logits/rejected": 4.129848480224609, + "logps/chosen": -241.71958923339844, + "logps/rejected": -197.82447814941406, + "loss": 0.6946, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.11882977187633514, + "rewards/margins": 0.1422731578350067, + "rewards/rejected": -0.023443374782800674, + "step": 5606 + }, + { + "epoch": 0.8671177266576454, + "grad_norm": 5.085695743560791, + "learning_rate": 3.949765150647268e-06, + "logits/chosen": 5.326354503631592, + "logits/rejected": 8.356669425964355, + "logps/chosen": -175.46498107910156, + "logps/rejected": -190.8360595703125, + "loss": 0.58, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.08924822509288788, + "rewards/margins": 0.3447166681289673, + "rewards/rejected": -0.2554684281349182, + "step": 5607 + }, + { + "epoch": 0.8672723757974097, + "grad_norm": 6.473757266998291, + "learning_rate": 3.949478748997594e-06, + "logits/chosen": 11.082825660705566, + "logits/rejected": 7.568131446838379, + "logps/chosen": -333.39996337890625, + "logps/rejected": -304.8360595703125, + "loss": 0.5413, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.3528112769126892, + "rewards/margins": 0.3843561112880707, + "rewards/rejected": -0.03154488652944565, + "step": 5608 + }, + { + "epoch": 0.8674270249371738, + "grad_norm": 6.504826545715332, + "learning_rate": 3.949192347347921e-06, + "logits/chosen": 13.074699401855469, + "logits/rejected": 12.940622329711914, + "logps/chosen": -283.00299072265625, + "logps/rejected": -252.4912567138672, + "loss": 0.6719, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.13514652848243713, + "rewards/margins": 0.18675857782363892, + "rewards/rejected": -0.05161203444004059, + "step": 5609 + }, + { + "epoch": 0.867581674076938, + "grad_norm": 4.671186923980713, + "learning_rate": 3.948905945698248e-06, + "logits/chosen": 13.153820037841797, + "logits/rejected": 9.608548164367676, + "logps/chosen": -301.005126953125, + "logps/rejected": -267.2201232910156, + "loss": 0.5344, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.4216276705265045, + "rewards/margins": 0.5195762515068054, + "rewards/rejected": -0.09794855862855911, + "step": 5610 + }, + { + "epoch": 0.8677363232167021, + "grad_norm": 4.416604042053223, + "learning_rate": 3.948619544048574e-06, + "logits/chosen": 9.490065574645996, + "logits/rejected": 10.032252311706543, + "logps/chosen": -159.22503662109375, + "logps/rejected": -164.47659301757812, + "loss": 0.7279, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.08413050323724747, + "rewards/margins": -0.009891770780086517, + "rewards/rejected": 0.09402228146791458, + "step": 5611 + }, + { + "epoch": 0.8678909723564663, + "grad_norm": 7.097733020782471, + "learning_rate": 3.948333142398901e-06, + "logits/chosen": 7.061870098114014, + "logits/rejected": 3.3953373432159424, + "logps/chosen": -273.488037109375, + "logps/rejected": -203.53004455566406, + "loss": 0.6395, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.10962973535060883, + "rewards/margins": 0.29717832803726196, + "rewards/rejected": -0.406808078289032, + "step": 5612 + }, + { + "epoch": 0.8680456214962304, + "grad_norm": 5.786071300506592, + "learning_rate": 3.948046740749227e-06, + "logits/chosen": 8.510398864746094, + "logits/rejected": 7.187309265136719, + "logps/chosen": -231.5532989501953, + "logps/rejected": -181.15170288085938, + "loss": 0.7664, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.16049346327781677, + "rewards/margins": -0.06298433244228363, + "rewards/rejected": 0.2234778106212616, + "step": 5613 + }, + { + "epoch": 0.8682002706359946, + "grad_norm": 5.439120292663574, + "learning_rate": 3.947760339099553e-06, + "logits/chosen": 8.747971534729004, + "logits/rejected": 3.8913488388061523, + "logps/chosen": -351.14007568359375, + "logps/rejected": -262.868896484375, + "loss": 0.644, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.00694364495575428, + "rewards/margins": 0.29663002490997314, + "rewards/rejected": -0.3035736680030823, + "step": 5614 + }, + { + "epoch": 0.8683549197757587, + "grad_norm": 3.9751415252685547, + "learning_rate": 3.94747393744988e-06, + "logits/chosen": 13.17003345489502, + "logits/rejected": 12.629056930541992, + "logps/chosen": -141.6834716796875, + "logps/rejected": -136.32272338867188, + "loss": 0.5992, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.049347929656505585, + "rewards/margins": 0.3328120708465576, + "rewards/rejected": -0.3821600079536438, + "step": 5615 + }, + { + "epoch": 0.8685095689155229, + "grad_norm": 5.343812942504883, + "learning_rate": 3.947187535800207e-06, + "logits/chosen": 11.60512924194336, + "logits/rejected": 9.868955612182617, + "logps/chosen": -314.24542236328125, + "logps/rejected": -249.23345947265625, + "loss": 0.7507, + "rewards/accuracies": 0.25, + "rewards/chosen": 0.0857694149017334, + "rewards/margins": -0.0817815363407135, + "rewards/rejected": 0.1675509512424469, + "step": 5616 + }, + { + "epoch": 0.868664218055287, + "grad_norm": 5.257479190826416, + "learning_rate": 3.946901134150533e-06, + "logits/chosen": 11.600604057312012, + "logits/rejected": 4.496735095977783, + "logps/chosen": -415.73004150390625, + "logps/rejected": -289.48992919921875, + "loss": 0.5595, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.43422386050224304, + "rewards/margins": 0.5791343450546265, + "rewards/rejected": -0.1449105441570282, + "step": 5617 + }, + { + "epoch": 0.8688188671950512, + "grad_norm": 4.118588447570801, + "learning_rate": 3.94661473250086e-06, + "logits/chosen": 11.091497421264648, + "logits/rejected": 9.375388145446777, + "logps/chosen": -259.69384765625, + "logps/rejected": -225.29159545898438, + "loss": 0.6482, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.3469921946525574, + "rewards/margins": 0.124681755900383, + "rewards/rejected": 0.22231043875217438, + "step": 5618 + }, + { + "epoch": 0.8689735163348153, + "grad_norm": 5.650290012359619, + "learning_rate": 3.946328330851187e-06, + "logits/chosen": 8.009944915771484, + "logits/rejected": 8.996103286743164, + "logps/chosen": -313.7467956542969, + "logps/rejected": -293.6455078125, + "loss": 0.6783, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.4786514341831207, + "rewards/margins": 0.3097921311855316, + "rewards/rejected": 0.1688593029975891, + "step": 5619 + }, + { + "epoch": 0.8691281654745795, + "grad_norm": 7.998353481292725, + "learning_rate": 3.9460419292015125e-06, + "logits/chosen": 1.233825922012329, + "logits/rejected": 3.0593180656433105, + "logps/chosen": -317.2923583984375, + "logps/rejected": -242.02273559570312, + "loss": 0.743, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.030298329889774323, + "rewards/margins": 0.052943140268325806, + "rewards/rejected": -0.022644802927970886, + "step": 5620 + }, + { + "epoch": 0.8692828146143438, + "grad_norm": 5.124409198760986, + "learning_rate": 3.945755527551839e-06, + "logits/chosen": 15.041266441345215, + "logits/rejected": 14.017416000366211, + "logps/chosen": -180.35031127929688, + "logps/rejected": -206.74505615234375, + "loss": 0.6208, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.2330726683139801, + "rewards/margins": 0.26515519618988037, + "rewards/rejected": -0.4982278645038605, + "step": 5621 + }, + { + "epoch": 0.8694374637541079, + "grad_norm": 7.9604034423828125, + "learning_rate": 3.945469125902166e-06, + "logits/chosen": 16.699312210083008, + "logits/rejected": 14.102435111999512, + "logps/chosen": -297.41998291015625, + "logps/rejected": -306.90478515625, + "loss": 0.7426, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.16882115602493286, + "rewards/margins": -0.031309328973293304, + "rewards/rejected": -0.13751183450222015, + "step": 5622 + }, + { + "epoch": 0.8695921128938721, + "grad_norm": 6.771982192993164, + "learning_rate": 3.9451827242524924e-06, + "logits/chosen": 9.043725967407227, + "logits/rejected": 6.445558071136475, + "logps/chosen": -343.36566162109375, + "logps/rejected": -282.7401428222656, + "loss": 0.5892, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.08827246725559235, + "rewards/margins": 0.3778069317340851, + "rewards/rejected": -0.2895345091819763, + "step": 5623 + }, + { + "epoch": 0.8697467620336362, + "grad_norm": 8.483500480651855, + "learning_rate": 3.944896322602818e-06, + "logits/chosen": 17.842782974243164, + "logits/rejected": 13.857070922851562, + "logps/chosen": -245.7689971923828, + "logps/rejected": -201.0692138671875, + "loss": 0.7439, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.33216142654418945, + "rewards/margins": -0.051038503646850586, + "rewards/rejected": -0.28112292289733887, + "step": 5624 + }, + { + "epoch": 0.8699014111734004, + "grad_norm": 5.0938873291015625, + "learning_rate": 3.944609920953145e-06, + "logits/chosen": 15.27180290222168, + "logits/rejected": 9.57136344909668, + "logps/chosen": -295.0552673339844, + "logps/rejected": -199.18890380859375, + "loss": 0.7411, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.0026002079248428345, + "rewards/margins": -0.04800724238157272, + "rewards/rejected": 0.050607435405254364, + "step": 5625 + }, + { + "epoch": 0.8700560603131645, + "grad_norm": 7.9792890548706055, + "learning_rate": 3.9443235193034715e-06, + "logits/chosen": 8.136433601379395, + "logits/rejected": 7.820018768310547, + "logps/chosen": -403.07952880859375, + "logps/rejected": -342.0752868652344, + "loss": 0.6737, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.12141789495944977, + "rewards/margins": 0.1150195375084877, + "rewards/rejected": -0.23643741011619568, + "step": 5626 + }, + { + "epoch": 0.8702107094529287, + "grad_norm": 4.639564514160156, + "learning_rate": 3.944037117653798e-06, + "logits/chosen": 11.050708770751953, + "logits/rejected": 16.36686134338379, + "logps/chosen": -254.51023864746094, + "logps/rejected": -277.595703125, + "loss": 0.5577, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.10722842812538147, + "rewards/margins": 0.4551927149295807, + "rewards/rejected": -0.3479642868041992, + "step": 5627 + }, + { + "epoch": 0.8703653585926928, + "grad_norm": 8.111231803894043, + "learning_rate": 3.943750716004124e-06, + "logits/chosen": 12.284795761108398, + "logits/rejected": 10.267173767089844, + "logps/chosen": -269.4294738769531, + "logps/rejected": -257.981201171875, + "loss": 0.7234, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.0955488383769989, + "rewards/margins": 0.1421506106853485, + "rewards/rejected": -0.04660177230834961, + "step": 5628 + }, + { + "epoch": 0.870520007732457, + "grad_norm": 7.362261772155762, + "learning_rate": 3.943464314354451e-06, + "logits/chosen": 11.454507827758789, + "logits/rejected": 10.433797836303711, + "logps/chosen": -286.57373046875, + "logps/rejected": -350.7206115722656, + "loss": 0.6955, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.5244772434234619, + "rewards/margins": 0.109484001994133, + "rewards/rejected": 0.4149932265281677, + "step": 5629 + }, + { + "epoch": 0.8706746568722211, + "grad_norm": 6.0964436531066895, + "learning_rate": 3.943177912704777e-06, + "logits/chosen": 10.769762992858887, + "logits/rejected": 9.031015396118164, + "logps/chosen": -622.0875854492188, + "logps/rejected": -565.76416015625, + "loss": 0.6787, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.5956319570541382, + "rewards/margins": 0.06504993140697479, + "rewards/rejected": 0.530582070350647, + "step": 5630 + }, + { + "epoch": 0.8708293060119853, + "grad_norm": 5.355895519256592, + "learning_rate": 3.942891511055104e-06, + "logits/chosen": 9.512357711791992, + "logits/rejected": 10.41882038116455, + "logps/chosen": -236.83792114257812, + "logps/rejected": -218.12615966796875, + "loss": 0.7215, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.22888503968715668, + "rewards/margins": 0.095341257750988, + "rewards/rejected": 0.13354375958442688, + "step": 5631 + }, + { + "epoch": 0.8709839551517494, + "grad_norm": 5.857627868652344, + "learning_rate": 3.942605109405431e-06, + "logits/chosen": 8.529337882995605, + "logits/rejected": 7.599100589752197, + "logps/chosen": -244.71400451660156, + "logps/rejected": -272.49542236328125, + "loss": 0.6863, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.046622369438409805, + "rewards/margins": 0.08016528189182281, + "rewards/rejected": -0.033542923629283905, + "step": 5632 + }, + { + "epoch": 0.8711386042915136, + "grad_norm": 5.319549560546875, + "learning_rate": 3.942318707755756e-06, + "logits/chosen": 14.629606246948242, + "logits/rejected": 9.831968307495117, + "logps/chosen": -271.996337890625, + "logps/rejected": -241.22518920898438, + "loss": 0.5885, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.11663269251585007, + "rewards/margins": 0.32205379009246826, + "rewards/rejected": -0.2054210901260376, + "step": 5633 + }, + { + "epoch": 0.8712932534312778, + "grad_norm": 4.89212703704834, + "learning_rate": 3.942032306106083e-06, + "logits/chosen": 7.2878217697143555, + "logits/rejected": 11.789358139038086, + "logps/chosen": -153.80325317382812, + "logps/rejected": -209.60549926757812, + "loss": 0.7105, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.1948097050189972, + "rewards/margins": 0.10725729912519455, + "rewards/rejected": -0.30206698179244995, + "step": 5634 + }, + { + "epoch": 0.871447902571042, + "grad_norm": 4.66943883895874, + "learning_rate": 3.94174590445641e-06, + "logits/chosen": 5.970928192138672, + "logits/rejected": 6.47063684463501, + "logps/chosen": -300.7245788574219, + "logps/rejected": -234.2465057373047, + "loss": 0.5905, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.06971701979637146, + "rewards/margins": 0.25740116834640503, + "rewards/rejected": -0.18768411874771118, + "step": 5635 + }, + { + "epoch": 0.8716025517108061, + "grad_norm": 4.462173938751221, + "learning_rate": 3.941459502806736e-06, + "logits/chosen": 15.418695449829102, + "logits/rejected": 11.841038703918457, + "logps/chosen": -269.9836730957031, + "logps/rejected": -214.45849609375, + "loss": 0.6207, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.203358456492424, + "rewards/margins": 0.3614543080329895, + "rewards/rejected": -0.15809586644172668, + "step": 5636 + }, + { + "epoch": 0.8717572008505703, + "grad_norm": 5.354763031005859, + "learning_rate": 3.941173101157063e-06, + "logits/chosen": 14.487762451171875, + "logits/rejected": 12.11870002746582, + "logps/chosen": -256.5377197265625, + "logps/rejected": -271.5584411621094, + "loss": 0.5547, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.023633763194084167, + "rewards/margins": 0.42893093824386597, + "rewards/rejected": -0.4052972197532654, + "step": 5637 + }, + { + "epoch": 0.8719118499903344, + "grad_norm": 3.8580222129821777, + "learning_rate": 3.94088669950739e-06, + "logits/chosen": 14.341715812683105, + "logits/rejected": 9.587904930114746, + "logps/chosen": -213.06634521484375, + "logps/rejected": -190.93557739257812, + "loss": 0.4934, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.2575652301311493, + "rewards/margins": 0.4646955728530884, + "rewards/rejected": -0.2071303427219391, + "step": 5638 + }, + { + "epoch": 0.8720664991300986, + "grad_norm": 5.44061279296875, + "learning_rate": 3.940600297857716e-06, + "logits/chosen": 15.466172218322754, + "logits/rejected": 11.639422416687012, + "logps/chosen": -345.5414733886719, + "logps/rejected": -268.7454528808594, + "loss": 0.5077, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.2716158926486969, + "rewards/margins": 0.5531377792358398, + "rewards/rejected": -0.28152191638946533, + "step": 5639 + }, + { + "epoch": 0.8722211482698627, + "grad_norm": 13.47822380065918, + "learning_rate": 3.940313896208042e-06, + "logits/chosen": 11.579065322875977, + "logits/rejected": 6.654143810272217, + "logps/chosen": -470.0531005859375, + "logps/rejected": -327.0259704589844, + "loss": 0.5623, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.3596084713935852, + "rewards/margins": 0.4161161184310913, + "rewards/rejected": -0.05650767683982849, + "step": 5640 + }, + { + "epoch": 0.8723757974096269, + "grad_norm": 9.235669136047363, + "learning_rate": 3.940027494558369e-06, + "logits/chosen": 10.109981536865234, + "logits/rejected": 8.653867721557617, + "logps/chosen": -327.7939758300781, + "logps/rejected": -338.1885986328125, + "loss": 0.6744, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.31724536418914795, + "rewards/margins": 0.29116860032081604, + "rewards/rejected": -0.6084139943122864, + "step": 5641 + }, + { + "epoch": 0.872530446549391, + "grad_norm": 3.829383134841919, + "learning_rate": 3.939741092908695e-06, + "logits/chosen": 8.609882354736328, + "logits/rejected": 4.51651668548584, + "logps/chosen": -444.54119873046875, + "logps/rejected": -325.2269287109375, + "loss": 0.4444, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.5772438645362854, + "rewards/margins": 0.7134607434272766, + "rewards/rejected": -0.1362168937921524, + "step": 5642 + }, + { + "epoch": 0.8726850956891552, + "grad_norm": 5.789526462554932, + "learning_rate": 3.939454691259022e-06, + "logits/chosen": 4.465651035308838, + "logits/rejected": 1.058051347732544, + "logps/chosen": -271.0077209472656, + "logps/rejected": -176.4681854248047, + "loss": 0.7499, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.15213823318481445, + "rewards/margins": 0.029533851891756058, + "rewards/rejected": -0.1816720813512802, + "step": 5643 + }, + { + "epoch": 0.8728397448289194, + "grad_norm": 4.762213230133057, + "learning_rate": 3.939168289609349e-06, + "logits/chosen": 5.643044471740723, + "logits/rejected": 5.361799716949463, + "logps/chosen": -182.87486267089844, + "logps/rejected": -193.81788635253906, + "loss": 0.6077, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.016397807747125626, + "rewards/margins": 0.20616284012794495, + "rewards/rejected": -0.1897650510072708, + "step": 5644 + }, + { + "epoch": 0.8729943939686835, + "grad_norm": 5.213296890258789, + "learning_rate": 3.938881887959675e-06, + "logits/chosen": 9.429723739624023, + "logits/rejected": 9.652655601501465, + "logps/chosen": -248.0252685546875, + "logps/rejected": -296.86798095703125, + "loss": 0.6675, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.06651425361633301, + "rewards/margins": 0.1570776402950287, + "rewards/rejected": -0.2235919088125229, + "step": 5645 + }, + { + "epoch": 0.8731490431084478, + "grad_norm": 5.00590705871582, + "learning_rate": 3.938595486310001e-06, + "logits/chosen": 11.406298637390137, + "logits/rejected": 4.312463760375977, + "logps/chosen": -290.09100341796875, + "logps/rejected": -232.20513916015625, + "loss": 0.4868, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.032554298639297485, + "rewards/margins": 0.666236400604248, + "rewards/rejected": -0.6336820721626282, + "step": 5646 + }, + { + "epoch": 0.8733036922482119, + "grad_norm": 5.848501682281494, + "learning_rate": 3.938309084660328e-06, + "logits/chosen": 8.37657642364502, + "logits/rejected": 0.4146420955657959, + "logps/chosen": -219.15805053710938, + "logps/rejected": -121.03724670410156, + "loss": 0.699, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.04975175857543945, + "rewards/margins": 0.10176274180412292, + "rewards/rejected": -0.15151448547840118, + "step": 5647 + }, + { + "epoch": 0.8734583413879761, + "grad_norm": 8.237857818603516, + "learning_rate": 3.9380226830106544e-06, + "logits/chosen": 9.774773597717285, + "logits/rejected": 4.954183578491211, + "logps/chosen": -381.2142333984375, + "logps/rejected": -292.44952392578125, + "loss": 0.9024, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.3442268371582031, + "rewards/margins": -0.25781023502349854, + "rewards/rejected": -0.08641661703586578, + "step": 5648 + }, + { + "epoch": 0.8736129905277402, + "grad_norm": 6.017556190490723, + "learning_rate": 3.937736281360981e-06, + "logits/chosen": 7.605977535247803, + "logits/rejected": 7.767765045166016, + "logps/chosen": -300.46478271484375, + "logps/rejected": -304.66912841796875, + "loss": 0.8732, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.3434692323207855, + "rewards/margins": -0.21501006186008453, + "rewards/rejected": 0.5584793090820312, + "step": 5649 + }, + { + "epoch": 0.8737676396675044, + "grad_norm": 4.724082946777344, + "learning_rate": 3.937449879711308e-06, + "logits/chosen": 12.63094711303711, + "logits/rejected": 15.25442886352539, + "logps/chosen": -277.34796142578125, + "logps/rejected": -263.77099609375, + "loss": 0.641, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.17564235627651215, + "rewards/margins": 0.15760484337806702, + "rewards/rejected": 0.01803751103579998, + "step": 5650 + }, + { + "epoch": 0.8739222888072685, + "grad_norm": 4.252694129943848, + "learning_rate": 3.937163478061634e-06, + "logits/chosen": 9.061813354492188, + "logits/rejected": 8.071301460266113, + "logps/chosen": -234.41494750976562, + "logps/rejected": -245.01809692382812, + "loss": 0.6188, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.0609808973968029, + "rewards/margins": 0.26599031686782837, + "rewards/rejected": -0.20500941574573517, + "step": 5651 + }, + { + "epoch": 0.8740769379470327, + "grad_norm": 5.34287166595459, + "learning_rate": 3.936877076411961e-06, + "logits/chosen": 13.25833797454834, + "logits/rejected": 10.553860664367676, + "logps/chosen": -249.4315643310547, + "logps/rejected": -243.89340209960938, + "loss": 0.5934, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.182373508810997, + "rewards/margins": 0.5319761633872986, + "rewards/rejected": -0.3496026396751404, + "step": 5652 + }, + { + "epoch": 0.8742315870867968, + "grad_norm": 5.252841472625732, + "learning_rate": 3.936590674762287e-06, + "logits/chosen": 17.859966278076172, + "logits/rejected": 2.896955966949463, + "logps/chosen": -349.1850280761719, + "logps/rejected": -167.3238525390625, + "loss": 0.5405, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.13819772005081177, + "rewards/margins": 0.7922736406326294, + "rewards/rejected": -0.6540759801864624, + "step": 5653 + }, + { + "epoch": 0.874386236226561, + "grad_norm": 4.111155033111572, + "learning_rate": 3.9363042731126135e-06, + "logits/chosen": 12.214452743530273, + "logits/rejected": 11.434420585632324, + "logps/chosen": -239.00567626953125, + "logps/rejected": -260.8659973144531, + "loss": 0.6608, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.2804913818836212, + "rewards/margins": 0.15111541748046875, + "rewards/rejected": 0.12937593460083008, + "step": 5654 + }, + { + "epoch": 0.8745408853663251, + "grad_norm": 6.461802005767822, + "learning_rate": 3.93601787146294e-06, + "logits/chosen": 9.54474925994873, + "logits/rejected": 10.665502548217773, + "logps/chosen": -245.351318359375, + "logps/rejected": -277.05181884765625, + "loss": 0.6939, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.009486563503742218, + "rewards/margins": 0.1848127543926239, + "rewards/rejected": -0.19429929554462433, + "step": 5655 + }, + { + "epoch": 0.8746955345060893, + "grad_norm": 3.8820958137512207, + "learning_rate": 3.935731469813267e-06, + "logits/chosen": 4.143945217132568, + "logits/rejected": 3.234565019607544, + "logps/chosen": -203.73922729492188, + "logps/rejected": -270.5245361328125, + "loss": 0.486, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.16650941967964172, + "rewards/margins": 0.6335764527320862, + "rewards/rejected": -0.8000859022140503, + "step": 5656 + }, + { + "epoch": 0.8748501836458534, + "grad_norm": 4.7538347244262695, + "learning_rate": 3.9354450681635935e-06, + "logits/chosen": 11.000550270080566, + "logits/rejected": 6.898792266845703, + "logps/chosen": -361.6807556152344, + "logps/rejected": -219.75794982910156, + "loss": 0.6001, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.009472116827964783, + "rewards/margins": 0.2839500308036804, + "rewards/rejected": -0.293422132730484, + "step": 5657 + }, + { + "epoch": 0.8750048327856176, + "grad_norm": 4.112393856048584, + "learning_rate": 3.935158666513919e-06, + "logits/chosen": 11.376110076904297, + "logits/rejected": 4.066917896270752, + "logps/chosen": -268.251708984375, + "logps/rejected": -192.46917724609375, + "loss": 0.5249, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.07191857695579529, + "rewards/margins": 0.5339853167533875, + "rewards/rejected": -0.6059039235115051, + "step": 5658 + }, + { + "epoch": 0.8751594819253818, + "grad_norm": 4.6746745109558105, + "learning_rate": 3.934872264864246e-06, + "logits/chosen": 7.891869068145752, + "logits/rejected": 6.6777143478393555, + "logps/chosen": -162.88125610351562, + "logps/rejected": -173.12864685058594, + "loss": 0.7765, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.14955027401447296, + "rewards/margins": 0.012018300592899323, + "rewards/rejected": -0.16156858205795288, + "step": 5659 + }, + { + "epoch": 0.875314131065146, + "grad_norm": 6.297798156738281, + "learning_rate": 3.9345858632145726e-06, + "logits/chosen": 6.34410285949707, + "logits/rejected": 8.712772369384766, + "logps/chosen": -265.5455322265625, + "logps/rejected": -333.2878112792969, + "loss": 0.6323, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.0799713209271431, + "rewards/margins": 0.1753229796886444, + "rewards/rejected": -0.09535164386034012, + "step": 5660 + }, + { + "epoch": 0.8754687802049101, + "grad_norm": 5.930227756500244, + "learning_rate": 3.934299461564899e-06, + "logits/chosen": 7.926520347595215, + "logits/rejected": 3.0510663986206055, + "logps/chosen": -369.01934814453125, + "logps/rejected": -275.99578857421875, + "loss": 0.6328, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.07866773009300232, + "rewards/margins": 0.19924335181713104, + "rewards/rejected": -0.12057562172412872, + "step": 5661 + }, + { + "epoch": 0.8756234293446743, + "grad_norm": 8.665170669555664, + "learning_rate": 3.934013059915225e-06, + "logits/chosen": 13.285758018493652, + "logits/rejected": 13.363130569458008, + "logps/chosen": -504.457275390625, + "logps/rejected": -454.870849609375, + "loss": 0.6126, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.28258201479911804, + "rewards/margins": 0.3246647119522095, + "rewards/rejected": -0.04208267107605934, + "step": 5662 + }, + { + "epoch": 0.8757780784844384, + "grad_norm": 3.694821834564209, + "learning_rate": 3.933726658265552e-06, + "logits/chosen": 8.232464790344238, + "logits/rejected": 6.61293888092041, + "logps/chosen": -234.60890197753906, + "logps/rejected": -176.3289337158203, + "loss": 0.555, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.12071295827627182, + "rewards/margins": 0.33618611097335815, + "rewards/rejected": -0.4568990468978882, + "step": 5663 + }, + { + "epoch": 0.8759327276242026, + "grad_norm": 6.1637139320373535, + "learning_rate": 3.933440256615878e-06, + "logits/chosen": 10.9147367477417, + "logits/rejected": 8.75466537475586, + "logps/chosen": -434.5308837890625, + "logps/rejected": -262.6238098144531, + "loss": 0.7895, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.04773131012916565, + "rewards/margins": 0.059706032276153564, + "rewards/rejected": -0.10743732750415802, + "step": 5664 + }, + { + "epoch": 0.8760873767639668, + "grad_norm": 4.539864540100098, + "learning_rate": 3.933153854966205e-06, + "logits/chosen": 13.912259101867676, + "logits/rejected": 2.1627326011657715, + "logps/chosen": -329.2405700683594, + "logps/rejected": -151.69346618652344, + "loss": 0.5578, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.14639337360858917, + "rewards/margins": 0.3467675447463989, + "rewards/rejected": -0.4931609034538269, + "step": 5665 + }, + { + "epoch": 0.8762420259037309, + "grad_norm": 4.697590351104736, + "learning_rate": 3.932867453316531e-06, + "logits/chosen": 9.148768424987793, + "logits/rejected": 0.09035599231719971, + "logps/chosen": -268.40838623046875, + "logps/rejected": -214.3895721435547, + "loss": 0.6741, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.139594167470932, + "rewards/margins": 0.378447562456131, + "rewards/rejected": -0.23885338008403778, + "step": 5666 + }, + { + "epoch": 0.876396675043495, + "grad_norm": 5.498101711273193, + "learning_rate": 3.932581051666857e-06, + "logits/chosen": 16.134925842285156, + "logits/rejected": 12.661490440368652, + "logps/chosen": -350.7439270019531, + "logps/rejected": -291.5743713378906, + "loss": 0.5646, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.213811457157135, + "rewards/margins": 0.4235631227493286, + "rewards/rejected": -0.20975171029567719, + "step": 5667 + }, + { + "epoch": 0.8765513241832592, + "grad_norm": 6.963108062744141, + "learning_rate": 3.932294650017184e-06, + "logits/chosen": 9.436728477478027, + "logits/rejected": 11.135045051574707, + "logps/chosen": -338.39715576171875, + "logps/rejected": -404.2248840332031, + "loss": 0.8649, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.0227874293923378, + "rewards/margins": -0.1967308521270752, + "rewards/rejected": 0.21951822936534882, + "step": 5668 + }, + { + "epoch": 0.8767059733230234, + "grad_norm": 5.140109539031982, + "learning_rate": 3.932008248367511e-06, + "logits/chosen": 11.411271095275879, + "logits/rejected": 8.633234977722168, + "logps/chosen": -493.7334289550781, + "logps/rejected": -347.44873046875, + "loss": 0.4756, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.48280030488967896, + "rewards/margins": 0.7372726202011108, + "rewards/rejected": -0.2544722259044647, + "step": 5669 + }, + { + "epoch": 0.8768606224627875, + "grad_norm": 7.385674953460693, + "learning_rate": 3.931721846717837e-06, + "logits/chosen": 10.652819633483887, + "logits/rejected": 4.669861793518066, + "logps/chosen": -313.36444091796875, + "logps/rejected": -223.39328002929688, + "loss": 0.7038, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.143855482339859, + "rewards/margins": 0.1621609777212143, + "rewards/rejected": -0.01830551028251648, + "step": 5670 + }, + { + "epoch": 0.8770152716025517, + "grad_norm": 7.921140193939209, + "learning_rate": 3.931435445068164e-06, + "logits/chosen": 10.085105895996094, + "logits/rejected": 10.221999168395996, + "logps/chosen": -346.9088439941406, + "logps/rejected": -318.07415771484375, + "loss": 0.7316, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.12066832184791565, + "rewards/margins": -0.0223359614610672, + "rewards/rejected": 0.14300426840782166, + "step": 5671 + }, + { + "epoch": 0.8771699207423159, + "grad_norm": 5.412511348724365, + "learning_rate": 3.931149043418491e-06, + "logits/chosen": 13.754417419433594, + "logits/rejected": 11.952595710754395, + "logps/chosen": -257.49273681640625, + "logps/rejected": -247.9320068359375, + "loss": 0.7382, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.23533602058887482, + "rewards/margins": 0.04562690854072571, + "rewards/rejected": 0.18970909714698792, + "step": 5672 + }, + { + "epoch": 0.8773245698820801, + "grad_norm": 6.484980583190918, + "learning_rate": 3.9308626417688165e-06, + "logits/chosen": -0.3907392919063568, + "logits/rejected": 6.294769763946533, + "logps/chosen": -194.39942932128906, + "logps/rejected": -307.7702331542969, + "loss": 0.8776, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.15546885132789612, + "rewards/margins": -0.11805954575538635, + "rewards/rejected": -0.03740927577018738, + "step": 5673 + }, + { + "epoch": 0.8774792190218442, + "grad_norm": 5.181357383728027, + "learning_rate": 3.930576240119143e-06, + "logits/chosen": 13.858302116394043, + "logits/rejected": 10.649045944213867, + "logps/chosen": -180.3077392578125, + "logps/rejected": -191.67022705078125, + "loss": 0.6879, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.22265706956386566, + "rewards/margins": 0.16413095593452454, + "rewards/rejected": -0.3867880403995514, + "step": 5674 + }, + { + "epoch": 0.8776338681616084, + "grad_norm": 4.592799186706543, + "learning_rate": 3.93028983846947e-06, + "logits/chosen": 16.543363571166992, + "logits/rejected": 12.259946823120117, + "logps/chosen": -267.28802490234375, + "logps/rejected": -229.10971069335938, + "loss": 0.5494, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.025543205440044403, + "rewards/margins": 0.3987596333026886, + "rewards/rejected": -0.4243028163909912, + "step": 5675 + }, + { + "epoch": 0.8777885173013725, + "grad_norm": 6.9234938621521, + "learning_rate": 3.9300034368197964e-06, + "logits/chosen": 9.289552688598633, + "logits/rejected": 7.248505115509033, + "logps/chosen": -296.2676086425781, + "logps/rejected": -239.59759521484375, + "loss": 0.7121, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.08736447989940643, + "rewards/margins": 0.1253564953804016, + "rewards/rejected": -0.21272096037864685, + "step": 5676 + }, + { + "epoch": 0.8779431664411367, + "grad_norm": 4.872354030609131, + "learning_rate": 3.929717035170123e-06, + "logits/chosen": 7.813759803771973, + "logits/rejected": 2.984287738800049, + "logps/chosen": -214.008056640625, + "logps/rejected": -204.40634155273438, + "loss": 0.7225, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.041722774505615234, + "rewards/margins": 0.04574348032474518, + "rewards/rejected": -0.004020705819129944, + "step": 5677 + }, + { + "epoch": 0.8780978155809008, + "grad_norm": 5.934500694274902, + "learning_rate": 3.92943063352045e-06, + "logits/chosen": 16.50546646118164, + "logits/rejected": 10.927936553955078, + "logps/chosen": -327.83599853515625, + "logps/rejected": -323.0665588378906, + "loss": 0.6838, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.18454760313034058, + "rewards/margins": 0.1856900453567505, + "rewards/rejected": -0.0011423975229263306, + "step": 5678 + }, + { + "epoch": 0.878252464720665, + "grad_norm": 5.392816543579102, + "learning_rate": 3.9291442318707755e-06, + "logits/chosen": 7.035706520080566, + "logits/rejected": 7.192243576049805, + "logps/chosen": -134.77679443359375, + "logps/rejected": -176.96095275878906, + "loss": 0.7881, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.09383855015039444, + "rewards/margins": 0.017092328518629074, + "rewards/rejected": 0.07674622535705566, + "step": 5679 + }, + { + "epoch": 0.8784071138604291, + "grad_norm": 6.367462635040283, + "learning_rate": 3.928857830221102e-06, + "logits/chosen": 8.42098331451416, + "logits/rejected": 9.098801612854004, + "logps/chosen": -298.13629150390625, + "logps/rejected": -310.239013671875, + "loss": 0.7174, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.26808756589889526, + "rewards/margins": 0.11679258942604065, + "rewards/rejected": 0.1512949913740158, + "step": 5680 + }, + { + "epoch": 0.8785617630001933, + "grad_norm": 3.8521969318389893, + "learning_rate": 3.928571428571429e-06, + "logits/chosen": 14.077341079711914, + "logits/rejected": 8.14858627319336, + "logps/chosen": -282.34661865234375, + "logps/rejected": -224.75416564941406, + "loss": 0.397, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.0699823871254921, + "rewards/margins": 0.8541668653488159, + "rewards/rejected": -0.784184455871582, + "step": 5681 + }, + { + "epoch": 0.8787164121399574, + "grad_norm": 4.985754013061523, + "learning_rate": 3.9282850269217555e-06, + "logits/chosen": 11.141313552856445, + "logits/rejected": 4.546165943145752, + "logps/chosen": -293.9993896484375, + "logps/rejected": -202.7480926513672, + "loss": 0.5562, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.6147188544273376, + "rewards/margins": 0.6592667102813721, + "rewards/rejected": -0.044547900557518005, + "step": 5682 + }, + { + "epoch": 0.8788710612797216, + "grad_norm": 6.047011375427246, + "learning_rate": 3.927998625272082e-06, + "logits/chosen": 9.215534210205078, + "logits/rejected": 7.423741340637207, + "logps/chosen": -348.14825439453125, + "logps/rejected": -331.0448913574219, + "loss": 0.5222, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.03747911751270294, + "rewards/margins": 0.4158361256122589, + "rewards/rejected": -0.45331525802612305, + "step": 5683 + }, + { + "epoch": 0.8790257104194857, + "grad_norm": 5.067002296447754, + "learning_rate": 3.927712223622409e-06, + "logits/chosen": 7.173776149749756, + "logits/rejected": 5.573089122772217, + "logps/chosen": -223.5181884765625, + "logps/rejected": -255.6622314453125, + "loss": 0.5584, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.060227587819099426, + "rewards/margins": 0.3956969082355499, + "rewards/rejected": -0.3354693651199341, + "step": 5684 + }, + { + "epoch": 0.87918035955925, + "grad_norm": 6.530332565307617, + "learning_rate": 3.9274258219727354e-06, + "logits/chosen": 12.846552848815918, + "logits/rejected": 9.800727844238281, + "logps/chosen": -264.124755859375, + "logps/rejected": -253.51968383789062, + "loss": 0.686, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.1744282841682434, + "rewards/margins": 0.0831167995929718, + "rewards/rejected": 0.09131143987178802, + "step": 5685 + }, + { + "epoch": 0.8793350086990142, + "grad_norm": 4.751879692077637, + "learning_rate": 3.927139420323061e-06, + "logits/chosen": 14.078407287597656, + "logits/rejected": 9.540026664733887, + "logps/chosen": -317.36529541015625, + "logps/rejected": -232.094970703125, + "loss": 0.5829, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.26495128870010376, + "rewards/margins": 0.38752806186676025, + "rewards/rejected": -0.12257680296897888, + "step": 5686 + }, + { + "epoch": 0.8794896578387783, + "grad_norm": 5.118988513946533, + "learning_rate": 3.926853018673388e-06, + "logits/chosen": 6.926477909088135, + "logits/rejected": 6.143775939941406, + "logps/chosen": -294.31646728515625, + "logps/rejected": -249.96615600585938, + "loss": 0.6908, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.36341917514801025, + "rewards/margins": 0.05482121556997299, + "rewards/rejected": 0.30859795212745667, + "step": 5687 + }, + { + "epoch": 0.8796443069785425, + "grad_norm": 4.586796283721924, + "learning_rate": 3.9265666170237145e-06, + "logits/chosen": 4.956618785858154, + "logits/rejected": 6.146323204040527, + "logps/chosen": -208.92919921875, + "logps/rejected": -222.78135681152344, + "loss": 0.7469, + "rewards/accuracies": 0.125, + "rewards/chosen": -0.05154542997479439, + "rewards/margins": -0.0685061439871788, + "rewards/rejected": 0.016960717737674713, + "step": 5688 + }, + { + "epoch": 0.8797989561183066, + "grad_norm": 5.894932270050049, + "learning_rate": 3.926280215374041e-06, + "logits/chosen": 4.68338680267334, + "logits/rejected": 8.852193832397461, + "logps/chosen": -275.29559326171875, + "logps/rejected": -287.2887268066406, + "loss": 0.6729, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.12761251628398895, + "rewards/margins": 0.17982949316501617, + "rewards/rejected": -0.3074420094490051, + "step": 5689 + }, + { + "epoch": 0.8799536052580708, + "grad_norm": 4.680460453033447, + "learning_rate": 3.925993813724368e-06, + "logits/chosen": 16.064149856567383, + "logits/rejected": 12.807035446166992, + "logps/chosen": -341.8258056640625, + "logps/rejected": -276.86669921875, + "loss": 0.6504, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.20412616431713104, + "rewards/margins": 0.14597684144973755, + "rewards/rejected": 0.05814933776855469, + "step": 5690 + }, + { + "epoch": 0.8801082543978349, + "grad_norm": 5.986044406890869, + "learning_rate": 3.9257074120746945e-06, + "logits/chosen": 11.95112419128418, + "logits/rejected": 10.669514656066895, + "logps/chosen": -356.47271728515625, + "logps/rejected": -364.63519287109375, + "loss": 0.7306, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.45785582065582275, + "rewards/margins": 0.03142721951007843, + "rewards/rejected": 0.4264286160469055, + "step": 5691 + }, + { + "epoch": 0.8802629035375991, + "grad_norm": 5.993568420410156, + "learning_rate": 3.92542101042502e-06, + "logits/chosen": 9.345330238342285, + "logits/rejected": 4.395379066467285, + "logps/chosen": -404.22662353515625, + "logps/rejected": -324.53973388671875, + "loss": 0.5773, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.33714932203292847, + "rewards/margins": 0.46598827838897705, + "rewards/rejected": -0.1288389265537262, + "step": 5692 + }, + { + "epoch": 0.8804175526773632, + "grad_norm": 4.783054351806641, + "learning_rate": 3.925134608775347e-06, + "logits/chosen": 8.3233060836792, + "logits/rejected": 4.597684860229492, + "logps/chosen": -290.25286865234375, + "logps/rejected": -197.84854125976562, + "loss": 0.728, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.08984370529651642, + "rewards/margins": 0.0006747245788574219, + "rewards/rejected": -0.09051842987537384, + "step": 5693 + }, + { + "epoch": 0.8805722018171274, + "grad_norm": 3.8297502994537354, + "learning_rate": 3.924848207125674e-06, + "logits/chosen": 10.921207427978516, + "logits/rejected": 9.71729850769043, + "logps/chosen": -211.61270141601562, + "logps/rejected": -220.1630859375, + "loss": 0.5926, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.42336779832839966, + "rewards/margins": 0.35031557083129883, + "rewards/rejected": 0.07305224239826202, + "step": 5694 + }, + { + "epoch": 0.8807268509568915, + "grad_norm": 4.451205730438232, + "learning_rate": 3.924561805476e-06, + "logits/chosen": 17.75222396850586, + "logits/rejected": 13.98066520690918, + "logps/chosen": -236.87969970703125, + "logps/rejected": -178.76641845703125, + "loss": 0.4782, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.2390918731689453, + "rewards/margins": 0.609368085861206, + "rewards/rejected": -0.37027621269226074, + "step": 5695 + }, + { + "epoch": 0.8808815000966557, + "grad_norm": 3.727084159851074, + "learning_rate": 3.924275403826326e-06, + "logits/chosen": 8.494039535522461, + "logits/rejected": 6.410273551940918, + "logps/chosen": -199.05967712402344, + "logps/rejected": -192.14984130859375, + "loss": 0.5722, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.2859953045845032, + "rewards/margins": 0.3050665259361267, + "rewards/rejected": -0.01907125487923622, + "step": 5696 + }, + { + "epoch": 0.8810361492364198, + "grad_norm": 4.612589359283447, + "learning_rate": 3.923989002176653e-06, + "logits/chosen": 12.652214050292969, + "logits/rejected": 4.704769134521484, + "logps/chosen": -366.7347717285156, + "logps/rejected": -176.88043212890625, + "loss": 0.548, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.318561851978302, + "rewards/margins": 0.4347693920135498, + "rewards/rejected": -0.116207554936409, + "step": 5697 + }, + { + "epoch": 0.8811907983761841, + "grad_norm": 5.3449201583862305, + "learning_rate": 3.923702600526979e-06, + "logits/chosen": 11.487663269042969, + "logits/rejected": 7.070026874542236, + "logps/chosen": -287.8294677734375, + "logps/rejected": -233.54437255859375, + "loss": 0.642, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.25714826583862305, + "rewards/margins": 0.23449267446994781, + "rewards/rejected": 0.022655636072158813, + "step": 5698 + }, + { + "epoch": 0.8813454475159482, + "grad_norm": 5.4208149909973145, + "learning_rate": 3.923416198877306e-06, + "logits/chosen": 7.438830375671387, + "logits/rejected": 5.745214462280273, + "logps/chosen": -216.25875854492188, + "logps/rejected": -319.15142822265625, + "loss": 0.662, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.1105307787656784, + "rewards/margins": 0.19088268280029297, + "rewards/rejected": -0.30141347646713257, + "step": 5699 + }, + { + "epoch": 0.8815000966557124, + "grad_norm": 4.36551570892334, + "learning_rate": 3.923129797227632e-06, + "logits/chosen": 15.135406494140625, + "logits/rejected": 13.00969123840332, + "logps/chosen": -311.62030029296875, + "logps/rejected": -293.10028076171875, + "loss": 0.5552, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.2520088851451874, + "rewards/margins": 0.3791600465774536, + "rewards/rejected": -0.12715116143226624, + "step": 5700 + }, + { + "epoch": 0.8816547457954765, + "grad_norm": 4.1058807373046875, + "learning_rate": 3.9228433955779585e-06, + "logits/chosen": 10.676985740661621, + "logits/rejected": 8.554781913757324, + "logps/chosen": -196.91278076171875, + "logps/rejected": -148.56385803222656, + "loss": 0.5721, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.23484423756599426, + "rewards/margins": 0.2858145534992218, + "rewards/rejected": -0.050970323383808136, + "step": 5701 + }, + { + "epoch": 0.8818093949352407, + "grad_norm": 2.8860790729522705, + "learning_rate": 3.922556993928285e-06, + "logits/chosen": 7.610790729522705, + "logits/rejected": -1.929937481880188, + "logps/chosen": -182.20364379882812, + "logps/rejected": -91.02428436279297, + "loss": 0.5321, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.08037164062261581, + "rewards/margins": 0.3965491056442261, + "rewards/rejected": -0.31617751717567444, + "step": 5702 + }, + { + "epoch": 0.8819640440750048, + "grad_norm": 5.045615196228027, + "learning_rate": 3.922270592278612e-06, + "logits/chosen": 3.3387739658355713, + "logits/rejected": 6.531632900238037, + "logps/chosen": -164.33871459960938, + "logps/rejected": -179.22242736816406, + "loss": 0.7349, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.17465294897556305, + "rewards/margins": 0.003287695348262787, + "rewards/rejected": -0.17794065177440643, + "step": 5703 + }, + { + "epoch": 0.882118693214769, + "grad_norm": 5.3651604652404785, + "learning_rate": 3.921984190628938e-06, + "logits/chosen": 14.905363082885742, + "logits/rejected": 1.7832754850387573, + "logps/chosen": -386.7444152832031, + "logps/rejected": -242.4635772705078, + "loss": 0.5232, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.21647655963897705, + "rewards/margins": 0.44515755772590637, + "rewards/rejected": -0.22868099808692932, + "step": 5704 + }, + { + "epoch": 0.8822733423545331, + "grad_norm": 37.49298095703125, + "learning_rate": 3.921697788979264e-06, + "logits/chosen": 13.31360912322998, + "logits/rejected": 8.313511848449707, + "logps/chosen": -226.02801513671875, + "logps/rejected": -132.7248992919922, + "loss": 0.5479, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.14264078438282013, + "rewards/margins": 0.3591480851173401, + "rewards/rejected": -0.21650728583335876, + "step": 5705 + }, + { + "epoch": 0.8824279914942973, + "grad_norm": 4.866557598114014, + "learning_rate": 3.921411387329591e-06, + "logits/chosen": 11.690808296203613, + "logits/rejected": 7.529850959777832, + "logps/chosen": -358.360595703125, + "logps/rejected": -228.60592651367188, + "loss": 0.5891, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.049704648554325104, + "rewards/margins": 0.3491722345352173, + "rewards/rejected": -0.2994675636291504, + "step": 5706 + }, + { + "epoch": 0.8825826406340614, + "grad_norm": 5.29364538192749, + "learning_rate": 3.9211249856799175e-06, + "logits/chosen": 17.279762268066406, + "logits/rejected": 10.129562377929688, + "logps/chosen": -462.92352294921875, + "logps/rejected": -326.7575988769531, + "loss": 0.4071, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.6093364953994751, + "rewards/margins": 1.1377660036087036, + "rewards/rejected": -0.5284295678138733, + "step": 5707 + }, + { + "epoch": 0.8827372897738256, + "grad_norm": 5.2459611892700195, + "learning_rate": 3.920838584030244e-06, + "logits/chosen": 8.722107887268066, + "logits/rejected": 6.616268157958984, + "logps/chosen": -188.29290771484375, + "logps/rejected": -202.31494140625, + "loss": 0.9364, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.32921215891838074, + "rewards/margins": -0.29155609011650085, + "rewards/rejected": -0.037656065076589584, + "step": 5708 + }, + { + "epoch": 0.8828919389135897, + "grad_norm": 5.211967945098877, + "learning_rate": 3.920552182380571e-06, + "logits/chosen": 11.724498748779297, + "logits/rejected": 10.188246726989746, + "logps/chosen": -298.84246826171875, + "logps/rejected": -312.59564208984375, + "loss": 0.491, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.3389541506767273, + "rewards/margins": 0.5990217924118042, + "rewards/rejected": -0.2600676417350769, + "step": 5709 + }, + { + "epoch": 0.8830465880533539, + "grad_norm": 5.4087233543396, + "learning_rate": 3.9202657807308975e-06, + "logits/chosen": 16.124073028564453, + "logits/rejected": 10.48360538482666, + "logps/chosen": -246.78402709960938, + "logps/rejected": -162.92636108398438, + "loss": 0.7441, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.2183852195739746, + "rewards/margins": -0.05012936517596245, + "rewards/rejected": -0.16825586557388306, + "step": 5710 + }, + { + "epoch": 0.8832012371931182, + "grad_norm": 4.968470573425293, + "learning_rate": 3.919979379081224e-06, + "logits/chosen": 11.219609260559082, + "logits/rejected": 8.278921127319336, + "logps/chosen": -244.74037170410156, + "logps/rejected": -215.74349975585938, + "loss": 0.7217, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.19195738434791565, + "rewards/margins": 0.07371539622545242, + "rewards/rejected": -0.2656727731227875, + "step": 5711 + }, + { + "epoch": 0.8833558863328823, + "grad_norm": 7.693360805511475, + "learning_rate": 3.91969297743155e-06, + "logits/chosen": 9.648031234741211, + "logits/rejected": 10.574610710144043, + "logps/chosen": -344.81488037109375, + "logps/rejected": -352.55194091796875, + "loss": 0.845, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.23703914880752563, + "rewards/margins": -0.1912335455417633, + "rewards/rejected": 0.42827269434928894, + "step": 5712 + }, + { + "epoch": 0.8835105354726465, + "grad_norm": 7.614741802215576, + "learning_rate": 3.9194065757818766e-06, + "logits/chosen": 16.03765869140625, + "logits/rejected": 12.23193359375, + "logps/chosen": -316.5546875, + "logps/rejected": -289.1417236328125, + "loss": 0.8436, + "rewards/accuracies": 0.25, + "rewards/chosen": -0.3965412974357605, + "rewards/margins": -0.14649152755737305, + "rewards/rejected": -0.25004979968070984, + "step": 5713 + }, + { + "epoch": 0.8836651846124106, + "grad_norm": 7.736001491546631, + "learning_rate": 3.919120174132203e-06, + "logits/chosen": 10.7076416015625, + "logits/rejected": 10.2206392288208, + "logps/chosen": -207.13267517089844, + "logps/rejected": -234.1348419189453, + "loss": 0.7957, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.3026137351989746, + "rewards/margins": 0.07845886051654816, + "rewards/rejected": -0.3810725808143616, + "step": 5714 + }, + { + "epoch": 0.8838198337521748, + "grad_norm": 11.667096138000488, + "learning_rate": 3.91883377248253e-06, + "logits/chosen": 12.753061294555664, + "logits/rejected": 7.801212310791016, + "logps/chosen": -373.0035400390625, + "logps/rejected": -300.8188171386719, + "loss": 0.6794, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.614719033241272, + "rewards/margins": 0.26208117604255676, + "rewards/rejected": 0.3526378870010376, + "step": 5715 + }, + { + "epoch": 0.8839744828919389, + "grad_norm": 8.16006088256836, + "learning_rate": 3.9185473708328565e-06, + "logits/chosen": 10.51950740814209, + "logits/rejected": -3.701554298400879, + "logps/chosen": -326.212890625, + "logps/rejected": -200.83639526367188, + "loss": 0.64, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.3709743320941925, + "rewards/margins": 0.5845832228660583, + "rewards/rejected": -0.21360892057418823, + "step": 5716 + }, + { + "epoch": 0.8841291320317031, + "grad_norm": 4.286136150360107, + "learning_rate": 3.918260969183183e-06, + "logits/chosen": 4.369946479797363, + "logits/rejected": 7.638784885406494, + "logps/chosen": -140.1514434814453, + "logps/rejected": -136.88760375976562, + "loss": 0.6431, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.1485108882188797, + "rewards/margins": 0.15217891335487366, + "rewards/rejected": -0.30068981647491455, + "step": 5717 + }, + { + "epoch": 0.8842837811714672, + "grad_norm": 7.765321731567383, + "learning_rate": 3.91797456753351e-06, + "logits/chosen": 13.634510040283203, + "logits/rejected": 9.603250503540039, + "logps/chosen": -371.98248291015625, + "logps/rejected": -296.15875244140625, + "loss": 0.7404, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.1119348481297493, + "rewards/margins": 0.08819341659545898, + "rewards/rejected": -0.20012828707695007, + "step": 5718 + }, + { + "epoch": 0.8844384303112314, + "grad_norm": 5.180447101593018, + "learning_rate": 3.917688165883836e-06, + "logits/chosen": 8.412239074707031, + "logits/rejected": 5.719845771789551, + "logps/chosen": -300.8465881347656, + "logps/rejected": -276.9864807128906, + "loss": 0.5265, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.5363882780075073, + "rewards/margins": 0.6356222629547119, + "rewards/rejected": -0.0992339700460434, + "step": 5719 + }, + { + "epoch": 0.8845930794509955, + "grad_norm": 4.950469017028809, + "learning_rate": 3.917401764234162e-06, + "logits/chosen": 11.615145683288574, + "logits/rejected": 12.291160583496094, + "logps/chosen": -340.7977600097656, + "logps/rejected": -322.55218505859375, + "loss": 0.5434, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.5119456052780151, + "rewards/margins": 0.4473438858985901, + "rewards/rejected": 0.06460171937942505, + "step": 5720 + }, + { + "epoch": 0.8847477285907597, + "grad_norm": 5.036688327789307, + "learning_rate": 3.917115362584489e-06, + "logits/chosen": 14.608207702636719, + "logits/rejected": 7.88964319229126, + "logps/chosen": -471.6127014160156, + "logps/rejected": -324.5352783203125, + "loss": 0.4216, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.5948439836502075, + "rewards/margins": 0.9432607293128967, + "rewards/rejected": -0.3484167456626892, + "step": 5721 + }, + { + "epoch": 0.8849023777305238, + "grad_norm": 4.146422863006592, + "learning_rate": 3.9168289609348156e-06, + "logits/chosen": 9.466646194458008, + "logits/rejected": 12.533526420593262, + "logps/chosen": -202.91134643554688, + "logps/rejected": -194.45562744140625, + "loss": 0.6314, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.11016635596752167, + "rewards/margins": 0.28474920988082886, + "rewards/rejected": -0.17458288371562958, + "step": 5722 + }, + { + "epoch": 0.8850570268702881, + "grad_norm": 5.789351940155029, + "learning_rate": 3.916542559285142e-06, + "logits/chosen": 7.4438910484313965, + "logits/rejected": 11.713520050048828, + "logps/chosen": -217.23736572265625, + "logps/rejected": -242.3692626953125, + "loss": 0.7695, + "rewards/accuracies": 0.25, + "rewards/chosen": -0.07172223925590515, + "rewards/margins": 0.01283930242061615, + "rewards/rejected": -0.0845615416765213, + "step": 5723 + }, + { + "epoch": 0.8852116760100522, + "grad_norm": 5.6492390632629395, + "learning_rate": 3.916256157635469e-06, + "logits/chosen": 3.7442309856414795, + "logits/rejected": 8.91292953491211, + "logps/chosen": -223.8847198486328, + "logps/rejected": -335.2643737792969, + "loss": 0.7253, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.10682099312543869, + "rewards/margins": 0.18941956758499146, + "rewards/rejected": -0.29624056816101074, + "step": 5724 + }, + { + "epoch": 0.8853663251498164, + "grad_norm": 4.877742290496826, + "learning_rate": 3.915969755985795e-06, + "logits/chosen": 5.995549201965332, + "logits/rejected": 3.7759718894958496, + "logps/chosen": -186.341796875, + "logps/rejected": -207.14224243164062, + "loss": 0.6035, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.009882714599370956, + "rewards/margins": 0.33444270491600037, + "rewards/rejected": -0.3245599865913391, + "step": 5725 + }, + { + "epoch": 0.8855209742895805, + "grad_norm": 4.757693767547607, + "learning_rate": 3.915683354336121e-06, + "logits/chosen": 12.983833312988281, + "logits/rejected": 4.540615081787109, + "logps/chosen": -430.4539794921875, + "logps/rejected": -351.2604675292969, + "loss": 0.4529, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.7119522094726562, + "rewards/margins": 0.949677586555481, + "rewards/rejected": -0.2377253770828247, + "step": 5726 + }, + { + "epoch": 0.8856756234293447, + "grad_norm": 5.418318748474121, + "learning_rate": 3.915396952686448e-06, + "logits/chosen": 10.195610046386719, + "logits/rejected": 6.444699764251709, + "logps/chosen": -363.6402893066406, + "logps/rejected": -317.5598449707031, + "loss": 0.6229, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.38743603229522705, + "rewards/margins": 0.34168070554733276, + "rewards/rejected": 0.0457552969455719, + "step": 5727 + }, + { + "epoch": 0.8858302725691088, + "grad_norm": 4.7921953201293945, + "learning_rate": 3.915110551036775e-06, + "logits/chosen": 8.705930709838867, + "logits/rejected": 1.3356387615203857, + "logps/chosen": -271.5472412109375, + "logps/rejected": -156.489990234375, + "loss": 0.5629, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.3196978271007538, + "rewards/margins": 0.37005728483200073, + "rewards/rejected": -0.050359442830085754, + "step": 5728 + }, + { + "epoch": 0.885984921708873, + "grad_norm": 6.967679023742676, + "learning_rate": 3.914824149387101e-06, + "logits/chosen": 15.88611888885498, + "logits/rejected": 11.138936996459961, + "logps/chosen": -433.5938720703125, + "logps/rejected": -398.22882080078125, + "loss": 0.7217, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.3259948492050171, + "rewards/margins": -0.030917182564735413, + "rewards/rejected": 0.3569120466709137, + "step": 5729 + }, + { + "epoch": 0.8861395708486371, + "grad_norm": 7.154730796813965, + "learning_rate": 3.914537747737427e-06, + "logits/chosen": 14.754705429077148, + "logits/rejected": 11.588245391845703, + "logps/chosen": -321.4205322265625, + "logps/rejected": -268.4865417480469, + "loss": 0.7381, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.08195889741182327, + "rewards/margins": -0.04528498277068138, + "rewards/rejected": 0.12724387645721436, + "step": 5730 + }, + { + "epoch": 0.8862942199884013, + "grad_norm": 6.6600165367126465, + "learning_rate": 3.914251346087754e-06, + "logits/chosen": 9.972471237182617, + "logits/rejected": 13.445955276489258, + "logps/chosen": -244.17820739746094, + "logps/rejected": -263.0948791503906, + "loss": 0.777, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.05811205506324768, + "rewards/margins": -0.11703595519065857, + "rewards/rejected": 0.17514801025390625, + "step": 5731 + }, + { + "epoch": 0.8864488691281655, + "grad_norm": 8.647895812988281, + "learning_rate": 3.91396494443808e-06, + "logits/chosen": 16.506153106689453, + "logits/rejected": 12.974069595336914, + "logps/chosen": -459.3907775878906, + "logps/rejected": -287.63702392578125, + "loss": 0.7383, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.1404932141304016, + "rewards/margins": 0.07582530379295349, + "rewards/rejected": -0.21631845831871033, + "step": 5732 + }, + { + "epoch": 0.8866035182679296, + "grad_norm": 4.1455979347229, + "learning_rate": 3.913678542788407e-06, + "logits/chosen": 12.931025505065918, + "logits/rejected": 13.822722434997559, + "logps/chosen": -224.34390258789062, + "logps/rejected": -162.61351013183594, + "loss": 0.7132, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.08715471625328064, + "rewards/margins": -0.007643356919288635, + "rewards/rejected": -0.079511359333992, + "step": 5733 + }, + { + "epoch": 0.8867581674076938, + "grad_norm": 5.531655311584473, + "learning_rate": 3.913392141138733e-06, + "logits/chosen": 10.088375091552734, + "logits/rejected": 5.220111846923828, + "logps/chosen": -190.84645080566406, + "logps/rejected": -187.39700317382812, + "loss": 0.5899, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.250689834356308, + "rewards/margins": 0.32124096155166626, + "rewards/rejected": -0.5719308257102966, + "step": 5734 + }, + { + "epoch": 0.8869128165474579, + "grad_norm": 4.449574947357178, + "learning_rate": 3.9131057394890595e-06, + "logits/chosen": 6.834588050842285, + "logits/rejected": 3.3841464519500732, + "logps/chosen": -339.1780090332031, + "logps/rejected": -362.3229675292969, + "loss": 0.4309, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.22625428438186646, + "rewards/margins": 0.7454162240028381, + "rewards/rejected": -0.5191619992256165, + "step": 5735 + }, + { + "epoch": 0.8870674656872222, + "grad_norm": 8.110933303833008, + "learning_rate": 3.912819337839386e-06, + "logits/chosen": 12.218389511108398, + "logits/rejected": 11.65676212310791, + "logps/chosen": -240.7242431640625, + "logps/rejected": -225.2874298095703, + "loss": 0.8025, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.0749661922454834, + "rewards/margins": -0.06264910846948624, + "rewards/rejected": -0.012317083775997162, + "step": 5736 + }, + { + "epoch": 0.8872221148269863, + "grad_norm": 4.405845642089844, + "learning_rate": 3.912532936189713e-06, + "logits/chosen": 11.866279602050781, + "logits/rejected": 9.245824813842773, + "logps/chosen": -177.52191162109375, + "logps/rejected": -194.79713439941406, + "loss": 0.6054, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.01137109100818634, + "rewards/margins": 0.23183490335941315, + "rewards/rejected": -0.2432059943675995, + "step": 5737 + }, + { + "epoch": 0.8873767639667505, + "grad_norm": 5.082192420959473, + "learning_rate": 3.912246534540039e-06, + "logits/chosen": 10.702702522277832, + "logits/rejected": 7.364471912384033, + "logps/chosen": -271.68499755859375, + "logps/rejected": -224.69876098632812, + "loss": 0.589, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.03307408094406128, + "rewards/margins": 0.3397447466850281, + "rewards/rejected": -0.37281882762908936, + "step": 5738 + }, + { + "epoch": 0.8875314131065146, + "grad_norm": 4.297824382781982, + "learning_rate": 3.911960132890365e-06, + "logits/chosen": 7.88686466217041, + "logits/rejected": 8.273067474365234, + "logps/chosen": -246.26055908203125, + "logps/rejected": -325.49468994140625, + "loss": 0.5338, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.057716239243745804, + "rewards/margins": 0.4108290672302246, + "rewards/rejected": -0.4685452878475189, + "step": 5739 + }, + { + "epoch": 0.8876860622462788, + "grad_norm": 5.887892246246338, + "learning_rate": 3.911673731240692e-06, + "logits/chosen": 13.80660629272461, + "logits/rejected": 10.44577693939209, + "logps/chosen": -252.92649841308594, + "logps/rejected": -194.51593017578125, + "loss": 0.7662, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.02310333400964737, + "rewards/margins": -0.09983141720294952, + "rewards/rejected": 0.1229347512125969, + "step": 5740 + }, + { + "epoch": 0.8878407113860429, + "grad_norm": 6.024567127227783, + "learning_rate": 3.9113873295910185e-06, + "logits/chosen": 13.409719467163086, + "logits/rejected": 5.196177005767822, + "logps/chosen": -345.5506896972656, + "logps/rejected": -216.5962371826172, + "loss": 0.6392, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.1940809190273285, + "rewards/margins": 0.18006497621536255, + "rewards/rejected": -0.37414589524269104, + "step": 5741 + }, + { + "epoch": 0.8879953605258071, + "grad_norm": 7.745041847229004, + "learning_rate": 3.911100927941345e-06, + "logits/chosen": 8.966459274291992, + "logits/rejected": 12.393768310546875, + "logps/chosen": -259.3885498046875, + "logps/rejected": -249.9786834716797, + "loss": 1.1557, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.6753278970718384, + "rewards/margins": -0.6296943426132202, + "rewards/rejected": -0.04563350975513458, + "step": 5742 + }, + { + "epoch": 0.8881500096655712, + "grad_norm": 4.841660976409912, + "learning_rate": 3.910814526291672e-06, + "logits/chosen": 11.803377151489258, + "logits/rejected": 6.779069900512695, + "logps/chosen": -313.079345703125, + "logps/rejected": -284.3074951171875, + "loss": 0.4704, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.1785726547241211, + "rewards/margins": 0.9311029314994812, + "rewards/rejected": -0.7525302767753601, + "step": 5743 + }, + { + "epoch": 0.8883046588053354, + "grad_norm": 23.435745239257812, + "learning_rate": 3.9105281246419985e-06, + "logits/chosen": 14.145280838012695, + "logits/rejected": 8.06881332397461, + "logps/chosen": -397.1736755371094, + "logps/rejected": -345.9590759277344, + "loss": 0.5141, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.23405838012695312, + "rewards/margins": 0.568458080291748, + "rewards/rejected": -0.8025164604187012, + "step": 5744 + }, + { + "epoch": 0.8884593079450995, + "grad_norm": 6.681275367736816, + "learning_rate": 3.910241722992324e-06, + "logits/chosen": 9.403277397155762, + "logits/rejected": 8.474343299865723, + "logps/chosen": -235.55935668945312, + "logps/rejected": -205.58026123046875, + "loss": 0.6553, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.03649824857711792, + "rewards/margins": 0.1222074031829834, + "rewards/rejected": -0.08570914715528488, + "step": 5745 + }, + { + "epoch": 0.8886139570848637, + "grad_norm": 5.971920013427734, + "learning_rate": 3.909955321342651e-06, + "logits/chosen": 8.480854034423828, + "logits/rejected": 11.802057266235352, + "logps/chosen": -224.52545166015625, + "logps/rejected": -267.1866760253906, + "loss": 0.7041, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.15403667092323303, + "rewards/margins": 0.03144526481628418, + "rewards/rejected": -0.1854819357395172, + "step": 5746 + }, + { + "epoch": 0.8887686062246278, + "grad_norm": 5.7973551750183105, + "learning_rate": 3.909668919692978e-06, + "logits/chosen": 9.143149375915527, + "logits/rejected": 5.627012252807617, + "logps/chosen": -232.7069091796875, + "logps/rejected": -234.0638885498047, + "loss": 0.5269, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.021738380193710327, + "rewards/margins": 0.5361195206642151, + "rewards/rejected": -0.5578579306602478, + "step": 5747 + }, + { + "epoch": 0.888923255364392, + "grad_norm": 4.7776713371276855, + "learning_rate": 3.909382518043304e-06, + "logits/chosen": 14.661382675170898, + "logits/rejected": 9.264200210571289, + "logps/chosen": -228.35877990722656, + "logps/rejected": -161.07286071777344, + "loss": 0.5439, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.030674919486045837, + "rewards/margins": 0.5089076161384583, + "rewards/rejected": -0.5395825505256653, + "step": 5748 + }, + { + "epoch": 0.8890779045041562, + "grad_norm": 5.406339168548584, + "learning_rate": 3.909096116393631e-06, + "logits/chosen": 11.576033592224121, + "logits/rejected": 7.081048011779785, + "logps/chosen": -290.57452392578125, + "logps/rejected": -216.6158447265625, + "loss": 0.6072, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.18026159703731537, + "rewards/margins": 0.21082177758216858, + "rewards/rejected": -0.030560161918401718, + "step": 5749 + }, + { + "epoch": 0.8892325536439204, + "grad_norm": 4.738779544830322, + "learning_rate": 3.9088097147439575e-06, + "logits/chosen": 10.026796340942383, + "logits/rejected": 7.0431084632873535, + "logps/chosen": -335.5976867675781, + "logps/rejected": -239.91140747070312, + "loss": 0.5771, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.20376934111118317, + "rewards/margins": 0.35572391748428345, + "rewards/rejected": -0.1519545614719391, + "step": 5750 + }, + { + "epoch": 0.8893872027836845, + "grad_norm": 12.458568572998047, + "learning_rate": 3.908523313094284e-06, + "logits/chosen": 9.851884841918945, + "logits/rejected": 9.982420921325684, + "logps/chosen": -349.2794189453125, + "logps/rejected": -337.798095703125, + "loss": 1.131, + "rewards/accuracies": 0.25, + "rewards/chosen": -0.013603691011667252, + "rewards/margins": -0.6422320604324341, + "rewards/rejected": 0.6286283731460571, + "step": 5751 + }, + { + "epoch": 0.8895418519234487, + "grad_norm": 5.155664443969727, + "learning_rate": 3.90823691144461e-06, + "logits/chosen": 10.588802337646484, + "logits/rejected": 4.128095626831055, + "logps/chosen": -350.32098388671875, + "logps/rejected": -300.9133605957031, + "loss": 0.4947, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.27958473563194275, + "rewards/margins": 0.4854336380958557, + "rewards/rejected": -0.20584887266159058, + "step": 5752 + }, + { + "epoch": 0.8896965010632129, + "grad_norm": 5.581695079803467, + "learning_rate": 3.907950509794937e-06, + "logits/chosen": 11.199490547180176, + "logits/rejected": 7.459531307220459, + "logps/chosen": -308.09283447265625, + "logps/rejected": -214.06842041015625, + "loss": 0.7749, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.0329490564763546, + "rewards/margins": -0.07803630828857422, + "rewards/rejected": 0.04508723318576813, + "step": 5753 + }, + { + "epoch": 0.889851150202977, + "grad_norm": 4.957764625549316, + "learning_rate": 3.907664108145263e-06, + "logits/chosen": 11.860799789428711, + "logits/rejected": 8.487844467163086, + "logps/chosen": -267.4550476074219, + "logps/rejected": -239.7709197998047, + "loss": 0.6991, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.19502225518226624, + "rewards/margins": 0.0836663544178009, + "rewards/rejected": 0.11135593801736832, + "step": 5754 + }, + { + "epoch": 0.8900057993427412, + "grad_norm": 5.874998092651367, + "learning_rate": 3.90737770649559e-06, + "logits/chosen": 8.549209594726562, + "logits/rejected": 5.060385704040527, + "logps/chosen": -258.3175048828125, + "logps/rejected": -243.79141235351562, + "loss": 0.7516, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.1748601794242859, + "rewards/margins": 0.006876006722450256, + "rewards/rejected": -0.18173615634441376, + "step": 5755 + }, + { + "epoch": 0.8901604484825053, + "grad_norm": 5.554418563842773, + "learning_rate": 3.907091304845917e-06, + "logits/chosen": 9.065581321716309, + "logits/rejected": 7.799722671508789, + "logps/chosen": -354.9833984375, + "logps/rejected": -343.6627197265625, + "loss": 0.6716, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.40501782298088074, + "rewards/margins": 0.25420254468917847, + "rewards/rejected": 0.1508152335882187, + "step": 5756 + }, + { + "epoch": 0.8903150976222695, + "grad_norm": 7.55493688583374, + "learning_rate": 3.906804903196243e-06, + "logits/chosen": 8.936965942382812, + "logits/rejected": 5.624577522277832, + "logps/chosen": -358.93316650390625, + "logps/rejected": -289.907470703125, + "loss": 0.8718, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.06171342730522156, + "rewards/margins": -0.2696279287338257, + "rewards/rejected": 0.20791450142860413, + "step": 5757 + }, + { + "epoch": 0.8904697467620336, + "grad_norm": 5.616508483886719, + "learning_rate": 3.906518501546569e-06, + "logits/chosen": 13.807989120483398, + "logits/rejected": 3.765812397003174, + "logps/chosen": -280.65093994140625, + "logps/rejected": -157.28773498535156, + "loss": 0.6601, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.03565177693963051, + "rewards/margins": 0.21808186173439026, + "rewards/rejected": -0.18243007361888885, + "step": 5758 + }, + { + "epoch": 0.8906243959017978, + "grad_norm": 4.631516933441162, + "learning_rate": 3.906232099896896e-06, + "logits/chosen": 11.538749694824219, + "logits/rejected": 4.234883785247803, + "logps/chosen": -353.98907470703125, + "logps/rejected": -145.6070556640625, + "loss": 0.4823, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.3437071740627289, + "rewards/margins": 0.6516937017440796, + "rewards/rejected": -0.3079864978790283, + "step": 5759 + }, + { + "epoch": 0.8907790450415619, + "grad_norm": 4.433241844177246, + "learning_rate": 3.905945698247222e-06, + "logits/chosen": 11.608865737915039, + "logits/rejected": 9.918044090270996, + "logps/chosen": -279.2489013671875, + "logps/rejected": -190.66151428222656, + "loss": 0.4964, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.14460617303848267, + "rewards/margins": 0.5654255151748657, + "rewards/rejected": -0.42081934213638306, + "step": 5760 + }, + { + "epoch": 0.8909336941813261, + "grad_norm": 5.150976657867432, + "learning_rate": 3.905659296597549e-06, + "logits/chosen": 12.157896041870117, + "logits/rejected": 8.949045181274414, + "logps/chosen": -330.6917724609375, + "logps/rejected": -210.3186492919922, + "loss": 0.5486, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.04945487901568413, + "rewards/margins": 0.4112991690635681, + "rewards/rejected": -0.4607540965080261, + "step": 5761 + }, + { + "epoch": 0.8910883433210903, + "grad_norm": 8.179576873779297, + "learning_rate": 3.905372894947876e-06, + "logits/chosen": 1.7145663499832153, + "logits/rejected": 4.757074356079102, + "logps/chosen": -290.9599609375, + "logps/rejected": -332.01544189453125, + "loss": 0.9075, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.11309175193309784, + "rewards/margins": -0.2051432579755783, + "rewards/rejected": 0.31823503971099854, + "step": 5762 + }, + { + "epoch": 0.8912429924608545, + "grad_norm": 5.04812479019165, + "learning_rate": 3.9050864932982015e-06, + "logits/chosen": 9.727849960327148, + "logits/rejected": 4.04802942276001, + "logps/chosen": -345.1971130371094, + "logps/rejected": -179.4881134033203, + "loss": 0.6163, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.1700761318206787, + "rewards/margins": 0.2689133286476135, + "rewards/rejected": -0.09883718937635422, + "step": 5763 + }, + { + "epoch": 0.8913976416006186, + "grad_norm": 4.066697120666504, + "learning_rate": 3.904800091648528e-06, + "logits/chosen": 12.286554336547852, + "logits/rejected": 9.58411979675293, + "logps/chosen": -320.670166015625, + "logps/rejected": -276.7817077636719, + "loss": 0.5486, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.3345256745815277, + "rewards/margins": 0.34571850299835205, + "rewards/rejected": -0.011192798614501953, + "step": 5764 + }, + { + "epoch": 0.8915522907403828, + "grad_norm": 5.4556427001953125, + "learning_rate": 3.904513689998855e-06, + "logits/chosen": 8.078201293945312, + "logits/rejected": 4.984035491943359, + "logps/chosen": -221.33428955078125, + "logps/rejected": -187.317626953125, + "loss": 0.6514, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.03998975828289986, + "rewards/margins": 0.2150421440601349, + "rewards/rejected": -0.17505237460136414, + "step": 5765 + }, + { + "epoch": 0.8917069398801469, + "grad_norm": 5.378304958343506, + "learning_rate": 3.904227288349181e-06, + "logits/chosen": 9.374171257019043, + "logits/rejected": 10.058323860168457, + "logps/chosen": -180.64031982421875, + "logps/rejected": -172.59909057617188, + "loss": 0.8512, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.6234707832336426, + "rewards/margins": -0.24978072941303253, + "rewards/rejected": -0.37369006872177124, + "step": 5766 + }, + { + "epoch": 0.8918615890199111, + "grad_norm": 6.537511348724365, + "learning_rate": 3.903940886699508e-06, + "logits/chosen": 6.561960220336914, + "logits/rejected": 5.24894905090332, + "logps/chosen": -378.9321594238281, + "logps/rejected": -304.2655029296875, + "loss": 0.7731, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.023377999663352966, + "rewards/margins": -0.09391117095947266, + "rewards/rejected": 0.07053318619728088, + "step": 5767 + }, + { + "epoch": 0.8920162381596752, + "grad_norm": 6.960522174835205, + "learning_rate": 3.903654485049834e-06, + "logits/chosen": 11.047513008117676, + "logits/rejected": 6.493597984313965, + "logps/chosen": -433.89202880859375, + "logps/rejected": -383.3857421875, + "loss": 0.7824, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.3854230046272278, + "rewards/margins": 0.029317021369934082, + "rewards/rejected": 0.3561059534549713, + "step": 5768 + }, + { + "epoch": 0.8921708872994394, + "grad_norm": 3.61096453666687, + "learning_rate": 3.9033680834001605e-06, + "logits/chosen": 16.009328842163086, + "logits/rejected": 10.315284729003906, + "logps/chosen": -265.6707458496094, + "logps/rejected": -232.4981689453125, + "loss": 0.4998, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.15180665254592896, + "rewards/margins": 0.5562466382980347, + "rewards/rejected": -0.4044400453567505, + "step": 5769 + }, + { + "epoch": 0.8923255364392035, + "grad_norm": 5.762599468231201, + "learning_rate": 3.903081681750487e-06, + "logits/chosen": 11.761422157287598, + "logits/rejected": 6.324789524078369, + "logps/chosen": -334.6190185546875, + "logps/rejected": -259.453125, + "loss": 0.5703, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.10923900455236435, + "rewards/margins": 0.4044661819934845, + "rewards/rejected": -0.29522716999053955, + "step": 5770 + }, + { + "epoch": 0.8924801855789677, + "grad_norm": 5.068279266357422, + "learning_rate": 3.902795280100814e-06, + "logits/chosen": 8.493657112121582, + "logits/rejected": 8.671353340148926, + "logps/chosen": -228.67816162109375, + "logps/rejected": -275.82794189453125, + "loss": 0.7354, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.20717626810073853, + "rewards/margins": 0.13601268827915192, + "rewards/rejected": 0.07116355746984482, + "step": 5771 + }, + { + "epoch": 0.8926348347187318, + "grad_norm": 47.15262985229492, + "learning_rate": 3.90250887845114e-06, + "logits/chosen": 7.927428245544434, + "logits/rejected": 10.981467247009277, + "logps/chosen": -226.99713134765625, + "logps/rejected": -213.72265625, + "loss": 0.8563, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.08466038107872009, + "rewards/margins": 0.02425430715084076, + "rewards/rejected": -0.10891470313072205, + "step": 5772 + }, + { + "epoch": 0.892789483858496, + "grad_norm": 6.1558685302734375, + "learning_rate": 3.902222476801466e-06, + "logits/chosen": 8.256296157836914, + "logits/rejected": 7.800989627838135, + "logps/chosen": -254.19973754882812, + "logps/rejected": -296.0955810546875, + "loss": 0.637, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.18366794288158417, + "rewards/margins": 0.22860854864120483, + "rewards/rejected": -0.044940609484910965, + "step": 5773 + }, + { + "epoch": 0.8929441329982601, + "grad_norm": 4.605040073394775, + "learning_rate": 3.901936075151793e-06, + "logits/chosen": 9.867841720581055, + "logits/rejected": 10.04358196258545, + "logps/chosen": -263.62872314453125, + "logps/rejected": -207.45558166503906, + "loss": 0.7324, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.24722671508789062, + "rewards/margins": -0.013812467455863953, + "rewards/rejected": -0.23341424763202667, + "step": 5774 + }, + { + "epoch": 0.8930987821380244, + "grad_norm": 10.984442710876465, + "learning_rate": 3.9016496735021196e-06, + "logits/chosen": 2.1906890869140625, + "logits/rejected": 8.064613342285156, + "logps/chosen": -209.891357421875, + "logps/rejected": -261.04620361328125, + "loss": 1.0817, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.08369524776935577, + "rewards/margins": -0.5420249700546265, + "rewards/rejected": 0.4583296477794647, + "step": 5775 + }, + { + "epoch": 0.8932534312777886, + "grad_norm": 5.522519111633301, + "learning_rate": 3.901363271852446e-06, + "logits/chosen": 7.920083999633789, + "logits/rejected": 5.469483375549316, + "logps/chosen": -245.44558715820312, + "logps/rejected": -223.60574340820312, + "loss": 0.5873, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.1464330554008484, + "rewards/margins": 0.3499521017074585, + "rewards/rejected": -0.2035190612077713, + "step": 5776 + }, + { + "epoch": 0.8934080804175527, + "grad_norm": 4.455384731292725, + "learning_rate": 3.901076870202773e-06, + "logits/chosen": 9.459773063659668, + "logits/rejected": 8.106334686279297, + "logps/chosen": -174.80857849121094, + "logps/rejected": -115.38701629638672, + "loss": 0.6269, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.02055474743247032, + "rewards/margins": 0.16783945262432098, + "rewards/rejected": -0.14728470146656036, + "step": 5777 + }, + { + "epoch": 0.8935627295573169, + "grad_norm": 4.525018692016602, + "learning_rate": 3.900790468553099e-06, + "logits/chosen": 11.904264450073242, + "logits/rejected": 8.026816368103027, + "logps/chosen": -293.42333984375, + "logps/rejected": -207.0321502685547, + "loss": 0.5365, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.35818254947662354, + "rewards/margins": 0.4342919588088989, + "rewards/rejected": -0.07610940933227539, + "step": 5778 + }, + { + "epoch": 0.893717378697081, + "grad_norm": 3.6988890171051025, + "learning_rate": 3.900504066903425e-06, + "logits/chosen": 12.633952140808105, + "logits/rejected": 7.129212379455566, + "logps/chosen": -356.6917419433594, + "logps/rejected": -252.245849609375, + "loss": 0.3564, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.42897891998291016, + "rewards/margins": 0.9579540491104126, + "rewards/rejected": -0.5289751291275024, + "step": 5779 + }, + { + "epoch": 0.8938720278368452, + "grad_norm": 6.133105754852295, + "learning_rate": 3.900217665253752e-06, + "logits/chosen": 5.24315881729126, + "logits/rejected": 1.84628164768219, + "logps/chosen": -300.00250244140625, + "logps/rejected": -213.4097900390625, + "loss": 0.748, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.02803216129541397, + "rewards/margins": -0.03245110064744949, + "rewards/rejected": 0.06048326566815376, + "step": 5780 + }, + { + "epoch": 0.8940266769766093, + "grad_norm": 6.531915664672852, + "learning_rate": 3.899931263604079e-06, + "logits/chosen": 9.105352401733398, + "logits/rejected": 11.369983673095703, + "logps/chosen": -336.5491638183594, + "logps/rejected": -376.9744567871094, + "loss": 0.7547, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.08375931531190872, + "rewards/margins": -0.029708579182624817, + "rewards/rejected": 0.11346788704395294, + "step": 5781 + }, + { + "epoch": 0.8941813261163735, + "grad_norm": 5.715112686157227, + "learning_rate": 3.899644861954405e-06, + "logits/chosen": 13.731454849243164, + "logits/rejected": 9.523012161254883, + "logps/chosen": -313.4877624511719, + "logps/rejected": -247.133056640625, + "loss": 0.6316, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.2693215608596802, + "rewards/margins": 0.21356190741062164, + "rewards/rejected": 0.05575963854789734, + "step": 5782 + }, + { + "epoch": 0.8943359752561376, + "grad_norm": 4.2412261962890625, + "learning_rate": 3.899358460304732e-06, + "logits/chosen": 9.01489543914795, + "logits/rejected": 12.498173713684082, + "logps/chosen": -167.2577667236328, + "logps/rejected": -204.22789001464844, + "loss": 0.6329, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.13652697205543518, + "rewards/margins": 0.1878114640712738, + "rewards/rejected": -0.05128450319170952, + "step": 5783 + }, + { + "epoch": 0.8944906243959018, + "grad_norm": 5.601959228515625, + "learning_rate": 3.899072058655059e-06, + "logits/chosen": 7.988924026489258, + "logits/rejected": 8.908060073852539, + "logps/chosen": -235.0262451171875, + "logps/rejected": -220.17633056640625, + "loss": 0.6915, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.22519434988498688, + "rewards/margins": 0.06586436927318573, + "rewards/rejected": 0.15933001041412354, + "step": 5784 + }, + { + "epoch": 0.8946452735356659, + "grad_norm": 4.819338798522949, + "learning_rate": 3.898785657005384e-06, + "logits/chosen": 4.816971778869629, + "logits/rejected": 0.19012141227722168, + "logps/chosen": -347.17156982421875, + "logps/rejected": -243.0165557861328, + "loss": 0.5642, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.12997885048389435, + "rewards/margins": 0.5381138920783997, + "rewards/rejected": -0.4081350862979889, + "step": 5785 + }, + { + "epoch": 0.8947999226754301, + "grad_norm": 4.035464286804199, + "learning_rate": 3.898499255355711e-06, + "logits/chosen": 9.308698654174805, + "logits/rejected": 10.726720809936523, + "logps/chosen": -282.947021484375, + "logps/rejected": -245.0333709716797, + "loss": 0.4728, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.5799012780189514, + "rewards/margins": 0.6192495822906494, + "rewards/rejected": -0.03934831917285919, + "step": 5786 + }, + { + "epoch": 0.8949545718151943, + "grad_norm": 6.591187953948975, + "learning_rate": 3.898212853706038e-06, + "logits/chosen": 14.6987943649292, + "logits/rejected": 11.750274658203125, + "logps/chosen": -337.1551513671875, + "logps/rejected": -299.08544921875, + "loss": 0.6348, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.5019106864929199, + "rewards/margins": 0.2939349412918091, + "rewards/rejected": 0.20797570049762726, + "step": 5787 + }, + { + "epoch": 0.8951092209549585, + "grad_norm": 5.288928508758545, + "learning_rate": 3.897926452056364e-06, + "logits/chosen": 10.313579559326172, + "logits/rejected": 7.051687717437744, + "logps/chosen": -322.61590576171875, + "logps/rejected": -221.70352172851562, + "loss": 0.6925, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.046183258295059204, + "rewards/margins": 0.06858716905117035, + "rewards/rejected": -0.022403907030820847, + "step": 5788 + }, + { + "epoch": 0.8952638700947226, + "grad_norm": 5.243613243103027, + "learning_rate": 3.897640050406691e-06, + "logits/chosen": 9.921998023986816, + "logits/rejected": 5.1040825843811035, + "logps/chosen": -206.79026794433594, + "logps/rejected": -135.08091735839844, + "loss": 0.7962, + "rewards/accuracies": 0.125, + "rewards/chosen": -0.22975994646549225, + "rewards/margins": -0.17974111437797546, + "rewards/rejected": -0.05001883953809738, + "step": 5789 + }, + { + "epoch": 0.8954185192344868, + "grad_norm": 5.887124538421631, + "learning_rate": 3.897353648757018e-06, + "logits/chosen": 10.311300277709961, + "logits/rejected": 8.953064918518066, + "logps/chosen": -312.102294921875, + "logps/rejected": -292.21954345703125, + "loss": 0.7181, + "rewards/accuracies": 0.25, + "rewards/chosen": 0.3187662363052368, + "rewards/margins": 0.08487636595964432, + "rewards/rejected": 0.2338898479938507, + "step": 5790 + }, + { + "epoch": 0.8955731683742509, + "grad_norm": 6.83258581161499, + "learning_rate": 3.8970672471073434e-06, + "logits/chosen": 10.823326110839844, + "logits/rejected": 8.36181354522705, + "logps/chosen": -184.07888793945312, + "logps/rejected": -190.6837158203125, + "loss": 0.665, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.15010690689086914, + "rewards/margins": 0.10632428526878357, + "rewards/rejected": 0.043782614171504974, + "step": 5791 + }, + { + "epoch": 0.8957278175140151, + "grad_norm": 6.705944061279297, + "learning_rate": 3.89678084545767e-06, + "logits/chosen": 8.686849594116211, + "logits/rejected": 9.22749137878418, + "logps/chosen": -331.6884460449219, + "logps/rejected": -321.5303649902344, + "loss": 0.6811, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.16520243883132935, + "rewards/margins": 0.06841883063316345, + "rewards/rejected": 0.0967836007475853, + "step": 5792 + }, + { + "epoch": 0.8958824666537792, + "grad_norm": 3.462826728820801, + "learning_rate": 3.896494443807997e-06, + "logits/chosen": 7.829308032989502, + "logits/rejected": 4.918103218078613, + "logps/chosen": -273.01983642578125, + "logps/rejected": -204.1581268310547, + "loss": 0.4569, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.057319071143865585, + "rewards/margins": 0.6962177157402039, + "rewards/rejected": -0.6388986706733704, + "step": 5793 + }, + { + "epoch": 0.8960371157935434, + "grad_norm": 6.72977352142334, + "learning_rate": 3.896208042158323e-06, + "logits/chosen": 10.112924575805664, + "logits/rejected": 10.463385581970215, + "logps/chosen": -262.98236083984375, + "logps/rejected": -287.0904541015625, + "loss": 0.8473, + "rewards/accuracies": 0.25, + "rewards/chosen": -0.1544014811515808, + "rewards/margins": -0.26306837797164917, + "rewards/rejected": 0.10866688936948776, + "step": 5794 + }, + { + "epoch": 0.8961917649333075, + "grad_norm": 5.962997913360596, + "learning_rate": 3.89592164050865e-06, + "logits/chosen": 16.793039321899414, + "logits/rejected": 10.594056129455566, + "logps/chosen": -273.9061279296875, + "logps/rejected": -188.57601928710938, + "loss": 0.7511, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.05273417755961418, + "rewards/margins": -0.07121656835079193, + "rewards/rejected": 0.018482400104403496, + "step": 5795 + }, + { + "epoch": 0.8963464140730717, + "grad_norm": 8.835264205932617, + "learning_rate": 3.895635238858977e-06, + "logits/chosen": 6.181654453277588, + "logits/rejected": 8.423907279968262, + "logps/chosen": -205.4691925048828, + "logps/rejected": -234.89208984375, + "loss": 0.8139, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.1505850851535797, + "rewards/margins": -0.1840323507785797, + "rewards/rejected": 0.033447265625, + "step": 5796 + }, + { + "epoch": 0.8965010632128358, + "grad_norm": 29.765905380249023, + "learning_rate": 3.8953488372093025e-06, + "logits/chosen": 12.476085662841797, + "logits/rejected": 11.049947738647461, + "logps/chosen": -304.8553466796875, + "logps/rejected": -280.56365966796875, + "loss": 0.854, + "rewards/accuracies": 0.25, + "rewards/chosen": 0.13271498680114746, + "rewards/margins": 0.13122093677520752, + "rewards/rejected": 0.0014940500259399414, + "step": 5797 + }, + { + "epoch": 0.8966557123526, + "grad_norm": 6.187057018280029, + "learning_rate": 3.895062435559629e-06, + "logits/chosen": 12.39517879486084, + "logits/rejected": 14.127275466918945, + "logps/chosen": -351.138671875, + "logps/rejected": -495.7914733886719, + "loss": 0.4518, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.30968108773231506, + "rewards/margins": 0.6748337745666504, + "rewards/rejected": -0.36515265703201294, + "step": 5798 + }, + { + "epoch": 0.8968103614923641, + "grad_norm": 4.8687639236450195, + "learning_rate": 3.894776033909956e-06, + "logits/chosen": 15.382537841796875, + "logits/rejected": 12.328469276428223, + "logps/chosen": -268.01007080078125, + "logps/rejected": -227.9391326904297, + "loss": 0.6959, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.13318169116973877, + "rewards/margins": 0.2268151044845581, + "rewards/rejected": -0.3599967360496521, + "step": 5799 + }, + { + "epoch": 0.8969650106321284, + "grad_norm": 5.825780391693115, + "learning_rate": 3.8944896322602824e-06, + "logits/chosen": 13.089054107666016, + "logits/rejected": 10.024188041687012, + "logps/chosen": -313.15887451171875, + "logps/rejected": -254.19517517089844, + "loss": 0.8508, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.09526987373828888, + "rewards/margins": -0.2415209859609604, + "rewards/rejected": 0.3367908298969269, + "step": 5800 + }, + { + "epoch": 0.8971196597718926, + "grad_norm": 4.953996181488037, + "learning_rate": 3.894203230610609e-06, + "logits/chosen": 11.274621963500977, + "logits/rejected": 11.288661003112793, + "logps/chosen": -317.3144226074219, + "logps/rejected": -294.0598449707031, + "loss": 0.6051, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.5526087284088135, + "rewards/margins": 0.27658313512802124, + "rewards/rejected": 0.27602559328079224, + "step": 5801 + }, + { + "epoch": 0.8972743089116567, + "grad_norm": 8.640522003173828, + "learning_rate": 3.893916828960935e-06, + "logits/chosen": 10.786994934082031, + "logits/rejected": 9.769344329833984, + "logps/chosen": -152.9961395263672, + "logps/rejected": -166.99082946777344, + "loss": 0.56, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.025615647435188293, + "rewards/margins": 0.329357773065567, + "rewards/rejected": -0.30374211072921753, + "step": 5802 + }, + { + "epoch": 0.8974289580514209, + "grad_norm": 31.095870971679688, + "learning_rate": 3.8936304273112616e-06, + "logits/chosen": 8.2977933883667, + "logits/rejected": 6.608639717102051, + "logps/chosen": -332.3779296875, + "logps/rejected": -331.2200622558594, + "loss": 0.91, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.22739753127098083, + "rewards/margins": -0.25158873200416565, + "rewards/rejected": 0.024191195145249367, + "step": 5803 + }, + { + "epoch": 0.897583607191185, + "grad_norm": 4.276152610778809, + "learning_rate": 3.893344025661588e-06, + "logits/chosen": 11.328424453735352, + "logits/rejected": 8.60206127166748, + "logps/chosen": -260.8816833496094, + "logps/rejected": -265.866455078125, + "loss": 0.5412, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.5953232049942017, + "rewards/margins": 0.5436924695968628, + "rewards/rejected": 0.051630690693855286, + "step": 5804 + }, + { + "epoch": 0.8977382563309492, + "grad_norm": 6.091341495513916, + "learning_rate": 3.893057624011915e-06, + "logits/chosen": 6.324405670166016, + "logits/rejected": 5.587436199188232, + "logps/chosen": -358.77520751953125, + "logps/rejected": -389.6812744140625, + "loss": 0.6475, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.4374438226222992, + "rewards/margins": 0.21011047065258026, + "rewards/rejected": 0.22733336687088013, + "step": 5805 + }, + { + "epoch": 0.8978929054707133, + "grad_norm": 5.2883124351501465, + "learning_rate": 3.892771222362241e-06, + "logits/chosen": 13.2877197265625, + "logits/rejected": 10.083975791931152, + "logps/chosen": -251.19290161132812, + "logps/rejected": -229.6358642578125, + "loss": 0.5871, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.06448793411254883, + "rewards/margins": 0.31439948081970215, + "rewards/rejected": -0.378887414932251, + "step": 5806 + }, + { + "epoch": 0.8980475546104775, + "grad_norm": 8.017000198364258, + "learning_rate": 3.892484820712567e-06, + "logits/chosen": 6.96412992477417, + "logits/rejected": 9.324800491333008, + "logps/chosen": -255.52098083496094, + "logps/rejected": -245.35235595703125, + "loss": 0.6797, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.1656692624092102, + "rewards/margins": 0.05077696591615677, + "rewards/rejected": 0.11489230394363403, + "step": 5807 + }, + { + "epoch": 0.8982022037502416, + "grad_norm": 5.281316757202148, + "learning_rate": 3.892198419062894e-06, + "logits/chosen": 8.622862815856934, + "logits/rejected": 5.965861797332764, + "logps/chosen": -274.3324279785156, + "logps/rejected": -249.67575073242188, + "loss": 0.6743, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.601416826248169, + "rewards/margins": 0.10115213692188263, + "rewards/rejected": 0.5002647042274475, + "step": 5808 + }, + { + "epoch": 0.8983568528900058, + "grad_norm": 5.0237908363342285, + "learning_rate": 3.891912017413221e-06, + "logits/chosen": 9.30994987487793, + "logits/rejected": 7.526909828186035, + "logps/chosen": -326.1006164550781, + "logps/rejected": -221.7826385498047, + "loss": 0.6668, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.1453217715024948, + "rewards/margins": 0.3931927978992462, + "rewards/rejected": -0.5385145545005798, + "step": 5809 + }, + { + "epoch": 0.8985115020297699, + "grad_norm": 4.522352695465088, + "learning_rate": 3.891625615763547e-06, + "logits/chosen": 7.3874053955078125, + "logits/rejected": 5.093265533447266, + "logps/chosen": -389.14251708984375, + "logps/rejected": -225.38217163085938, + "loss": 0.4979, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.35242289304733276, + "rewards/margins": 0.5075194239616394, + "rewards/rejected": -0.15509653091430664, + "step": 5810 + }, + { + "epoch": 0.8986661511695341, + "grad_norm": 9.168158531188965, + "learning_rate": 3.891339214113873e-06, + "logits/chosen": 11.820075035095215, + "logits/rejected": 7.127833366394043, + "logps/chosen": -189.8061981201172, + "logps/rejected": -175.92935180664062, + "loss": 0.7752, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.08858819305896759, + "rewards/margins": -0.014167554676532745, + "rewards/rejected": -0.07442066073417664, + "step": 5811 + }, + { + "epoch": 0.8988208003092982, + "grad_norm": 2.488330125808716, + "learning_rate": 3.8910528124642e-06, + "logits/chosen": 8.111259460449219, + "logits/rejected": 6.724429130554199, + "logps/chosen": -159.6066436767578, + "logps/rejected": -91.45460510253906, + "loss": 0.5147, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.31005510687828064, + "rewards/margins": 0.46373051404953003, + "rewards/rejected": -0.15367542207241058, + "step": 5812 + }, + { + "epoch": 0.8989754494490625, + "grad_norm": 6.011414527893066, + "learning_rate": 3.890766410814526e-06, + "logits/chosen": 13.762782096862793, + "logits/rejected": 9.7354736328125, + "logps/chosen": -298.726806640625, + "logps/rejected": -300.033203125, + "loss": 0.7355, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.02225351333618164, + "rewards/margins": 0.25399595499038696, + "rewards/rejected": -0.23174244165420532, + "step": 5813 + }, + { + "epoch": 0.8991300985888266, + "grad_norm": 4.9309916496276855, + "learning_rate": 3.890480009164853e-06, + "logits/chosen": 9.000768661499023, + "logits/rejected": 3.162428379058838, + "logps/chosen": -305.7404479980469, + "logps/rejected": -220.20472717285156, + "loss": 0.6613, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.14583033323287964, + "rewards/margins": 0.1452416628599167, + "rewards/rejected": -0.2910720109939575, + "step": 5814 + }, + { + "epoch": 0.8992847477285908, + "grad_norm": 5.011508464813232, + "learning_rate": 3.89019360751518e-06, + "logits/chosen": 8.8680419921875, + "logits/rejected": 3.670546531677246, + "logps/chosen": -301.1461181640625, + "logps/rejected": -215.93003845214844, + "loss": 0.6633, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.23907470703125, + "rewards/margins": 0.1076871007680893, + "rewards/rejected": 0.1313876211643219, + "step": 5815 + }, + { + "epoch": 0.899439396868355, + "grad_norm": 5.040525913238525, + "learning_rate": 3.889907205865506e-06, + "logits/chosen": 9.04183578491211, + "logits/rejected": 5.215781211853027, + "logps/chosen": -350.3896789550781, + "logps/rejected": -216.79725646972656, + "loss": 0.593, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.42782342433929443, + "rewards/margins": 0.27642130851745605, + "rewards/rejected": 0.151402086019516, + "step": 5816 + }, + { + "epoch": 0.8995940460081191, + "grad_norm": 4.033872127532959, + "learning_rate": 3.889620804215832e-06, + "logits/chosen": 6.798552989959717, + "logits/rejected": 8.213788986206055, + "logps/chosen": -226.74070739746094, + "logps/rejected": -302.1864013671875, + "loss": 0.4446, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.2932452857494354, + "rewards/margins": 0.6816126108169556, + "rewards/rejected": -0.38836729526519775, + "step": 5817 + }, + { + "epoch": 0.8997486951478832, + "grad_norm": 4.856017589569092, + "learning_rate": 3.889334402566159e-06, + "logits/chosen": 5.278564453125, + "logits/rejected": 0.8399578332901001, + "logps/chosen": -245.49124145507812, + "logps/rejected": -242.11532592773438, + "loss": 0.548, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.2413884997367859, + "rewards/margins": 0.5936083793640137, + "rewards/rejected": -0.352219820022583, + "step": 5818 + }, + { + "epoch": 0.8999033442876474, + "grad_norm": 4.567702770233154, + "learning_rate": 3.889048000916485e-06, + "logits/chosen": 11.040934562683105, + "logits/rejected": 11.924667358398438, + "logps/chosen": -220.0098876953125, + "logps/rejected": -237.9245147705078, + "loss": 0.6992, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.12206556648015976, + "rewards/margins": 0.167752206325531, + "rewards/rejected": -0.045686691999435425, + "step": 5819 + }, + { + "epoch": 0.9000579934274116, + "grad_norm": 5.837713718414307, + "learning_rate": 3.888761599266812e-06, + "logits/chosen": 5.77709436416626, + "logits/rejected": 9.078289031982422, + "logps/chosen": -188.1656494140625, + "logps/rejected": -232.5315704345703, + "loss": 0.6351, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.18178953230381012, + "rewards/margins": 0.17593033611774445, + "rewards/rejected": 0.0058591775596141815, + "step": 5820 + }, + { + "epoch": 0.9002126425671757, + "grad_norm": 5.338743686676025, + "learning_rate": 3.888475197617139e-06, + "logits/chosen": 8.580313682556152, + "logits/rejected": 3.0343730449676514, + "logps/chosen": -292.9908142089844, + "logps/rejected": -265.82525634765625, + "loss": 0.7346, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.029426276683807373, + "rewards/margins": 0.10101652145385742, + "rewards/rejected": -0.07159022241830826, + "step": 5821 + }, + { + "epoch": 0.9003672917069399, + "grad_norm": 5.040890216827393, + "learning_rate": 3.888188795967465e-06, + "logits/chosen": 8.20743179321289, + "logits/rejected": 3.6761863231658936, + "logps/chosen": -258.45263671875, + "logps/rejected": -293.16754150390625, + "loss": 0.5834, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.4921242594718933, + "rewards/margins": 0.4046463370323181, + "rewards/rejected": 0.08747787773609161, + "step": 5822 + }, + { + "epoch": 0.900521940846704, + "grad_norm": 8.992168426513672, + "learning_rate": 3.887902394317792e-06, + "logits/chosen": 7.730225086212158, + "logits/rejected": 7.373764991760254, + "logps/chosen": -226.39979553222656, + "logps/rejected": -205.3111572265625, + "loss": 0.8276, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.023194849491119385, + "rewards/margins": -0.12436148524284363, + "rewards/rejected": 0.10116663575172424, + "step": 5823 + }, + { + "epoch": 0.9006765899864682, + "grad_norm": 5.892737865447998, + "learning_rate": 3.887615992668118e-06, + "logits/chosen": 9.87824821472168, + "logits/rejected": 10.255525588989258, + "logps/chosen": -288.4353942871094, + "logps/rejected": -303.8541259765625, + "loss": 0.7292, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.40973442792892456, + "rewards/margins": -0.03363390266895294, + "rewards/rejected": 0.4433683156967163, + "step": 5824 + }, + { + "epoch": 0.9008312391262323, + "grad_norm": 6.744109153747559, + "learning_rate": 3.8873295910184445e-06, + "logits/chosen": 4.24338436126709, + "logits/rejected": 5.201857089996338, + "logps/chosen": -239.29898071289062, + "logps/rejected": -419.9927978515625, + "loss": 0.5876, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.19418013095855713, + "rewards/margins": 0.2867938280105591, + "rewards/rejected": -0.09261371195316315, + "step": 5825 + }, + { + "epoch": 0.9009858882659966, + "grad_norm": 6.569641590118408, + "learning_rate": 3.887043189368771e-06, + "logits/chosen": 13.829446792602539, + "logits/rejected": 6.065988540649414, + "logps/chosen": -368.6732177734375, + "logps/rejected": -219.236083984375, + "loss": 0.7739, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.09992039203643799, + "rewards/margins": -0.09991222620010376, + "rewards/rejected": -8.158385753631592e-06, + "step": 5826 + }, + { + "epoch": 0.9011405374057607, + "grad_norm": 5.553526878356934, + "learning_rate": 3.886756787719098e-06, + "logits/chosen": 16.383296966552734, + "logits/rejected": 12.941035270690918, + "logps/chosen": -240.0606689453125, + "logps/rejected": -224.10797119140625, + "loss": 0.7538, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.0914207398891449, + "rewards/margins": -0.08539792150259018, + "rewards/rejected": -0.006022820249199867, + "step": 5827 + }, + { + "epoch": 0.9012951865455249, + "grad_norm": 4.780274391174316, + "learning_rate": 3.8864703860694244e-06, + "logits/chosen": 10.414852142333984, + "logits/rejected": 11.135489463806152, + "logps/chosen": -263.7305603027344, + "logps/rejected": -256.6584777832031, + "loss": 0.4665, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.5535441637039185, + "rewards/margins": 0.56716388463974, + "rewards/rejected": -0.013619698584079742, + "step": 5828 + }, + { + "epoch": 0.901449835685289, + "grad_norm": 6.247519016265869, + "learning_rate": 3.886183984419751e-06, + "logits/chosen": 9.281834602355957, + "logits/rejected": 7.712191581726074, + "logps/chosen": -241.29293823242188, + "logps/rejected": -276.02960205078125, + "loss": 0.6365, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.0068801045417785645, + "rewards/margins": 0.23522864282131195, + "rewards/rejected": -0.24210873246192932, + "step": 5829 + }, + { + "epoch": 0.9016044848250532, + "grad_norm": 5.402223587036133, + "learning_rate": 3.885897582770078e-06, + "logits/chosen": 7.422797203063965, + "logits/rejected": 5.952421188354492, + "logps/chosen": -231.46493530273438, + "logps/rejected": -184.68133544921875, + "loss": 0.6886, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.10784335434436798, + "rewards/margins": 0.022812752053141594, + "rewards/rejected": -0.13065610826015472, + "step": 5830 + }, + { + "epoch": 0.9017591339648173, + "grad_norm": 4.328973293304443, + "learning_rate": 3.8856111811204035e-06, + "logits/chosen": 6.162834167480469, + "logits/rejected": 6.610774993896484, + "logps/chosen": -272.64263916015625, + "logps/rejected": -234.24659729003906, + "loss": 0.5011, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.5779678821563721, + "rewards/margins": 0.5311887264251709, + "rewards/rejected": 0.04677905887365341, + "step": 5831 + }, + { + "epoch": 0.9019137831045815, + "grad_norm": 6.595884799957275, + "learning_rate": 3.88532477947073e-06, + "logits/chosen": 9.275840759277344, + "logits/rejected": 9.328700065612793, + "logps/chosen": -373.49810791015625, + "logps/rejected": -321.27667236328125, + "loss": 0.8494, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.19307613372802734, + "rewards/margins": -0.23838338255882263, + "rewards/rejected": 0.43145954608917236, + "step": 5832 + }, + { + "epoch": 0.9020684322443456, + "grad_norm": 5.678833961486816, + "learning_rate": 3.885038377821057e-06, + "logits/chosen": 5.063775539398193, + "logits/rejected": -0.3743937611579895, + "logps/chosen": -266.460205078125, + "logps/rejected": -161.34249877929688, + "loss": 0.5961, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.0388980358839035, + "rewards/margins": 0.3764452338218689, + "rewards/rejected": -0.3375471830368042, + "step": 5833 + }, + { + "epoch": 0.9022230813841098, + "grad_norm": 7.250502586364746, + "learning_rate": 3.8847519761713835e-06, + "logits/chosen": 5.888790130615234, + "logits/rejected": 15.33763313293457, + "logps/chosen": -192.1502227783203, + "logps/rejected": -295.3578796386719, + "loss": 0.9184, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.28777143359184265, + "rewards/margins": -0.33882731199264526, + "rewards/rejected": 0.0510559044778347, + "step": 5834 + }, + { + "epoch": 0.9023777305238739, + "grad_norm": 4.430229663848877, + "learning_rate": 3.884465574521709e-06, + "logits/chosen": 10.761627197265625, + "logits/rejected": 9.028371810913086, + "logps/chosen": -318.77203369140625, + "logps/rejected": -245.9497833251953, + "loss": 0.4878, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.41721418499946594, + "rewards/margins": 0.5311620831489563, + "rewards/rejected": -0.11394791305065155, + "step": 5835 + }, + { + "epoch": 0.9025323796636381, + "grad_norm": 5.535238265991211, + "learning_rate": 3.884179172872036e-06, + "logits/chosen": 9.515908241271973, + "logits/rejected": 8.492105484008789, + "logps/chosen": -295.0663146972656, + "logps/rejected": -241.23912048339844, + "loss": 0.6869, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.07690748572349548, + "rewards/margins": 0.11782458424568176, + "rewards/rejected": -0.04091709852218628, + "step": 5836 + }, + { + "epoch": 0.9026870288034022, + "grad_norm": 6.513454437255859, + "learning_rate": 3.883892771222363e-06, + "logits/chosen": 5.844347953796387, + "logits/rejected": 9.307866096496582, + "logps/chosen": -309.8131103515625, + "logps/rejected": -323.50836181640625, + "loss": 0.8676, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.19861078262329102, + "rewards/margins": -0.1942564845085144, + "rewards/rejected": -0.0043542832136154175, + "step": 5837 + }, + { + "epoch": 0.9028416779431664, + "grad_norm": 3.3172013759613037, + "learning_rate": 3.883606369572689e-06, + "logits/chosen": 6.828248023986816, + "logits/rejected": 5.912540435791016, + "logps/chosen": -203.55136108398438, + "logps/rejected": -220.48294067382812, + "loss": 0.6378, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.16925562918186188, + "rewards/margins": 0.2474106401205063, + "rewards/rejected": -0.078155018389225, + "step": 5838 + }, + { + "epoch": 0.9029963270829306, + "grad_norm": 4.391114234924316, + "learning_rate": 3.883319967923016e-06, + "logits/chosen": 15.383584976196289, + "logits/rejected": 10.069034576416016, + "logps/chosen": -320.87017822265625, + "logps/rejected": -264.7410888671875, + "loss": 0.5722, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.18732596933841705, + "rewards/margins": 0.34394752979278564, + "rewards/rejected": -0.15662159025669098, + "step": 5839 + }, + { + "epoch": 0.9031509762226948, + "grad_norm": 5.451707363128662, + "learning_rate": 3.883033566273342e-06, + "logits/chosen": 13.697030067443848, + "logits/rejected": 13.811174392700195, + "logps/chosen": -322.304931640625, + "logps/rejected": -282.4361877441406, + "loss": 0.578, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.28093957901000977, + "rewards/margins": 0.3700934648513794, + "rewards/rejected": -0.08915385603904724, + "step": 5840 + }, + { + "epoch": 0.903305625362459, + "grad_norm": 6.3685221672058105, + "learning_rate": 3.882747164623668e-06, + "logits/chosen": 8.442115783691406, + "logits/rejected": 4.736579895019531, + "logps/chosen": -252.9369659423828, + "logps/rejected": -242.43760681152344, + "loss": 0.5933, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.2573397755622864, + "rewards/margins": 0.3026355803012848, + "rewards/rejected": -0.0452958345413208, + "step": 5841 + }, + { + "epoch": 0.9034602745022231, + "grad_norm": 4.195534706115723, + "learning_rate": 3.882460762973995e-06, + "logits/chosen": 5.978053092956543, + "logits/rejected": 6.568967819213867, + "logps/chosen": -168.0675811767578, + "logps/rejected": -186.30543518066406, + "loss": 0.7227, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.06088666617870331, + "rewards/margins": -0.024186521768569946, + "rewards/rejected": 0.08507319539785385, + "step": 5842 + }, + { + "epoch": 0.9036149236419873, + "grad_norm": 3.672985076904297, + "learning_rate": 3.882174361324322e-06, + "logits/chosen": 9.157450675964355, + "logits/rejected": 9.113726615905762, + "logps/chosen": -236.30455017089844, + "logps/rejected": -247.42835998535156, + "loss": 0.5047, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.00868912786245346, + "rewards/margins": 0.6104499101638794, + "rewards/rejected": -0.601760745048523, + "step": 5843 + }, + { + "epoch": 0.9037695727817514, + "grad_norm": 5.554665565490723, + "learning_rate": 3.8818879596746474e-06, + "logits/chosen": 8.053369522094727, + "logits/rejected": 5.807934284210205, + "logps/chosen": -224.80792236328125, + "logps/rejected": -194.58795166015625, + "loss": 0.735, + "rewards/accuracies": 0.25, + "rewards/chosen": 0.18455220758914948, + "rewards/margins": 0.12116733938455582, + "rewards/rejected": 0.06338487565517426, + "step": 5844 + }, + { + "epoch": 0.9039242219215156, + "grad_norm": 4.206684589385986, + "learning_rate": 3.881601558024974e-06, + "logits/chosen": 9.244462966918945, + "logits/rejected": 5.271476745605469, + "logps/chosen": -325.0238037109375, + "logps/rejected": -303.20501708984375, + "loss": 0.4118, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.37349826097488403, + "rewards/margins": 0.764920175075531, + "rewards/rejected": -0.3914218842983246, + "step": 5845 + }, + { + "epoch": 0.9040788710612797, + "grad_norm": 5.245149612426758, + "learning_rate": 3.881315156375301e-06, + "logits/chosen": 11.899967193603516, + "logits/rejected": 1.9415113925933838, + "logps/chosen": -196.98587036132812, + "logps/rejected": -125.8809585571289, + "loss": 0.6777, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.04752540588378906, + "rewards/margins": 0.27655839920043945, + "rewards/rejected": -0.2290329486131668, + "step": 5846 + }, + { + "epoch": 0.9042335202010439, + "grad_norm": 4.382696151733398, + "learning_rate": 3.881028754725627e-06, + "logits/chosen": 12.819002151489258, + "logits/rejected": 5.692922592163086, + "logps/chosen": -309.71807861328125, + "logps/rejected": -232.36790466308594, + "loss": 0.5486, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.03396128863096237, + "rewards/margins": 0.487964004278183, + "rewards/rejected": -0.4540027379989624, + "step": 5847 + }, + { + "epoch": 0.904388169340808, + "grad_norm": 5.878704071044922, + "learning_rate": 3.880742353075954e-06, + "logits/chosen": 13.20284652709961, + "logits/rejected": 4.059720516204834, + "logps/chosen": -360.00994873046875, + "logps/rejected": -260.7804870605469, + "loss": 0.6414, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.18025809526443481, + "rewards/margins": 0.1990213394165039, + "rewards/rejected": -0.018763268366456032, + "step": 5848 + }, + { + "epoch": 0.9045428184805722, + "grad_norm": 5.185843467712402, + "learning_rate": 3.880455951426281e-06, + "logits/chosen": 13.477865219116211, + "logits/rejected": 11.918599128723145, + "logps/chosen": -309.1504821777344, + "logps/rejected": -299.9570617675781, + "loss": 0.535, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.16140177845954895, + "rewards/margins": 0.5257657170295715, + "rewards/rejected": -0.6871674656867981, + "step": 5849 + }, + { + "epoch": 0.9046974676203363, + "grad_norm": 5.3445281982421875, + "learning_rate": 3.8801695497766065e-06, + "logits/chosen": 6.33530855178833, + "logits/rejected": 12.506031036376953, + "logps/chosen": -176.27145385742188, + "logps/rejected": -191.63589477539062, + "loss": 0.6987, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.11392469704151154, + "rewards/margins": 0.0710691586136818, + "rewards/rejected": 0.04285553842782974, + "step": 5850 + }, + { + "epoch": 0.9048521167601005, + "grad_norm": 6.441342830657959, + "learning_rate": 3.879883148126933e-06, + "logits/chosen": 7.998721599578857, + "logits/rejected": 11.967599868774414, + "logps/chosen": -260.7688903808594, + "logps/rejected": -303.0032958984375, + "loss": 0.73, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.389772891998291, + "rewards/margins": 0.03484584391117096, + "rewards/rejected": 0.35492706298828125, + "step": 5851 + }, + { + "epoch": 0.9050067658998647, + "grad_norm": 3.743983745574951, + "learning_rate": 3.87959674647726e-06, + "logits/chosen": 13.090093612670898, + "logits/rejected": 6.495992660522461, + "logps/chosen": -415.82757568359375, + "logps/rejected": -282.7840576171875, + "loss": 0.4181, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.42210546135902405, + "rewards/margins": 0.7916517853736877, + "rewards/rejected": -0.3695463538169861, + "step": 5852 + }, + { + "epoch": 0.9051614150396289, + "grad_norm": 9.290580749511719, + "learning_rate": 3.8793103448275865e-06, + "logits/chosen": 4.171314716339111, + "logits/rejected": 7.288641929626465, + "logps/chosen": -276.3918151855469, + "logps/rejected": -342.0833740234375, + "loss": 0.7821, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.2479313164949417, + "rewards/margins": -0.11495855450630188, + "rewards/rejected": 0.3628898859024048, + "step": 5853 + }, + { + "epoch": 0.905316064179393, + "grad_norm": 3.8928864002227783, + "learning_rate": 3.879023943177913e-06, + "logits/chosen": 10.509458541870117, + "logits/rejected": 9.40723991394043, + "logps/chosen": -231.02806091308594, + "logps/rejected": -235.6477813720703, + "loss": 0.496, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.5517292618751526, + "rewards/margins": 0.5025173425674438, + "rewards/rejected": 0.04921187460422516, + "step": 5854 + }, + { + "epoch": 0.9054707133191572, + "grad_norm": 4.945900917053223, + "learning_rate": 3.87873754152824e-06, + "logits/chosen": 10.313854217529297, + "logits/rejected": 8.259241104125977, + "logps/chosen": -382.12152099609375, + "logps/rejected": -315.88916015625, + "loss": 0.5726, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.04192980006337166, + "rewards/margins": 0.41825219988822937, + "rewards/rejected": -0.46018195152282715, + "step": 5855 + }, + { + "epoch": 0.9056253624589213, + "grad_norm": 3.7689552307128906, + "learning_rate": 3.878451139878566e-06, + "logits/chosen": 16.6031551361084, + "logits/rejected": 9.461174964904785, + "logps/chosen": -236.73219299316406, + "logps/rejected": -127.3540267944336, + "loss": 0.5949, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.21787521243095398, + "rewards/margins": 0.31660696864128113, + "rewards/rejected": -0.09873175621032715, + "step": 5856 + }, + { + "epoch": 0.9057800115986855, + "grad_norm": 5.5852837562561035, + "learning_rate": 3.878164738228892e-06, + "logits/chosen": 14.486068725585938, + "logits/rejected": 9.360355377197266, + "logps/chosen": -342.8150939941406, + "logps/rejected": -335.119140625, + "loss": 0.5652, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.3054373562335968, + "rewards/margins": 0.2915671467781067, + "rewards/rejected": 0.013870231807231903, + "step": 5857 + }, + { + "epoch": 0.9059346607384496, + "grad_norm": 3.898383140563965, + "learning_rate": 3.877878336579219e-06, + "logits/chosen": 4.096169471740723, + "logits/rejected": 6.356062412261963, + "logps/chosen": -142.49746704101562, + "logps/rejected": -168.0295867919922, + "loss": 0.572, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.19904474914073944, + "rewards/margins": 0.2963623106479645, + "rewards/rejected": -0.4954070448875427, + "step": 5858 + }, + { + "epoch": 0.9060893098782138, + "grad_norm": 5.5752763748168945, + "learning_rate": 3.8775919349295455e-06, + "logits/chosen": 10.031557083129883, + "logits/rejected": 7.413594722747803, + "logps/chosen": -392.9423828125, + "logps/rejected": -298.0181884765625, + "loss": 0.6128, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.3351958990097046, + "rewards/margins": 0.42352375388145447, + "rewards/rejected": -0.08832788467407227, + "step": 5859 + }, + { + "epoch": 0.9062439590179779, + "grad_norm": 4.158720970153809, + "learning_rate": 3.877305533279872e-06, + "logits/chosen": 14.963878631591797, + "logits/rejected": 8.858963012695312, + "logps/chosen": -257.9062194824219, + "logps/rejected": -264.0928955078125, + "loss": 0.5223, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.037215426564216614, + "rewards/margins": 0.46270790696144104, + "rewards/rejected": -0.4254924952983856, + "step": 5860 + }, + { + "epoch": 0.9063986081577421, + "grad_norm": 5.162186145782471, + "learning_rate": 3.877019131630199e-06, + "logits/chosen": 7.1378889083862305, + "logits/rejected": 10.812788009643555, + "logps/chosen": -273.01812744140625, + "logps/rejected": -304.15380859375, + "loss": 0.6644, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.12640608847141266, + "rewards/margins": 0.11050014197826385, + "rewards/rejected": 0.015905946493148804, + "step": 5861 + }, + { + "epoch": 0.9065532572975062, + "grad_norm": 4.256577968597412, + "learning_rate": 3.8767327299805255e-06, + "logits/chosen": 13.968976974487305, + "logits/rejected": 7.623793601989746, + "logps/chosen": -260.9290771484375, + "logps/rejected": -183.25228881835938, + "loss": 0.5722, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.2127974033355713, + "rewards/margins": 0.6174132823944092, + "rewards/rejected": -0.4046158790588379, + "step": 5862 + }, + { + "epoch": 0.9067079064372704, + "grad_norm": 7.413290023803711, + "learning_rate": 3.876446328330852e-06, + "logits/chosen": 10.745857238769531, + "logits/rejected": 8.528212547302246, + "logps/chosen": -255.7159423828125, + "logps/rejected": -263.3093566894531, + "loss": 0.7381, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.4664587676525116, + "rewards/margins": 0.037835799157619476, + "rewards/rejected": 0.4286229610443115, + "step": 5863 + }, + { + "epoch": 0.9068625555770347, + "grad_norm": 6.548617362976074, + "learning_rate": 3.876159926681178e-06, + "logits/chosen": 9.338762283325195, + "logits/rejected": 9.32027530670166, + "logps/chosen": -258.281494140625, + "logps/rejected": -240.11874389648438, + "loss": 0.686, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.19616562128067017, + "rewards/margins": 0.14268246293067932, + "rewards/rejected": 0.05348314344882965, + "step": 5864 + }, + { + "epoch": 0.9070172047167988, + "grad_norm": 5.089291572570801, + "learning_rate": 3.8758735250315046e-06, + "logits/chosen": 12.281780242919922, + "logits/rejected": 4.380681037902832, + "logps/chosen": -362.73504638671875, + "logps/rejected": -196.89303588867188, + "loss": 0.6017, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.02687111310660839, + "rewards/margins": 0.3171910047531128, + "rewards/rejected": -0.29031985998153687, + "step": 5865 + }, + { + "epoch": 0.907171853856563, + "grad_norm": 6.04975700378418, + "learning_rate": 3.875587123381831e-06, + "logits/chosen": 8.875252723693848, + "logits/rejected": 7.564862251281738, + "logps/chosen": -418.327392578125, + "logps/rejected": -330.35064697265625, + "loss": 0.6601, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.367825984954834, + "rewards/margins": 0.1807415783405304, + "rewards/rejected": 0.1870843768119812, + "step": 5866 + }, + { + "epoch": 0.9073265029963271, + "grad_norm": 6.029545307159424, + "learning_rate": 3.875300721732158e-06, + "logits/chosen": 11.076995849609375, + "logits/rejected": 12.436990737915039, + "logps/chosen": -222.04310607910156, + "logps/rejected": -245.8939208984375, + "loss": 0.7228, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.08270444720983505, + "rewards/margins": 0.06755058467388153, + "rewards/rejected": 0.015153884887695312, + "step": 5867 + }, + { + "epoch": 0.9074811521360913, + "grad_norm": 5.62288236618042, + "learning_rate": 3.8750143200824845e-06, + "logits/chosen": 3.1694464683532715, + "logits/rejected": 4.831650733947754, + "logps/chosen": -183.48605346679688, + "logps/rejected": -235.12632751464844, + "loss": 0.8222, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.017087791115045547, + "rewards/margins": -0.1821080446243286, + "rewards/rejected": 0.19919581711292267, + "step": 5868 + }, + { + "epoch": 0.9076358012758554, + "grad_norm": 6.17611837387085, + "learning_rate": 3.87472791843281e-06, + "logits/chosen": 9.089207649230957, + "logits/rejected": 13.079728126525879, + "logps/chosen": -273.21484375, + "logps/rejected": -363.84228515625, + "loss": 0.7125, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.07337423413991928, + "rewards/margins": 0.09915758669376373, + "rewards/rejected": -0.025783345103263855, + "step": 5869 + }, + { + "epoch": 0.9077904504156196, + "grad_norm": 5.846744537353516, + "learning_rate": 3.874441516783137e-06, + "logits/chosen": 10.480364799499512, + "logits/rejected": 9.511070251464844, + "logps/chosen": -344.05462646484375, + "logps/rejected": -262.875, + "loss": 0.6223, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.04386405646800995, + "rewards/margins": 0.2818721532821655, + "rewards/rejected": -0.23800812661647797, + "step": 5870 + }, + { + "epoch": 0.9079450995553837, + "grad_norm": 6.28924036026001, + "learning_rate": 3.874155115133464e-06, + "logits/chosen": 10.907773971557617, + "logits/rejected": 11.629669189453125, + "logps/chosen": -269.60296630859375, + "logps/rejected": -312.3037109375, + "loss": 0.8594, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.22646427154541016, + "rewards/margins": -0.22511149942874908, + "rewards/rejected": 0.45157578587532043, + "step": 5871 + }, + { + "epoch": 0.9080997486951479, + "grad_norm": 6.138974666595459, + "learning_rate": 3.87386871348379e-06, + "logits/chosen": 8.09500503540039, + "logits/rejected": 4.935572147369385, + "logps/chosen": -317.02227783203125, + "logps/rejected": -300.6366882324219, + "loss": 0.7205, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.3320663571357727, + "rewards/margins": 0.04267704486846924, + "rewards/rejected": 0.28938931226730347, + "step": 5872 + }, + { + "epoch": 0.908254397834912, + "grad_norm": 5.642059326171875, + "learning_rate": 3.873582311834116e-06, + "logits/chosen": 12.37524127960205, + "logits/rejected": 10.193174362182617, + "logps/chosen": -277.6837463378906, + "logps/rejected": -266.68084716796875, + "loss": 0.6788, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.08328705281019211, + "rewards/margins": 0.1363345831632614, + "rewards/rejected": -0.05304756388068199, + "step": 5873 + }, + { + "epoch": 0.9084090469746762, + "grad_norm": 6.166357517242432, + "learning_rate": 3.873295910184443e-06, + "logits/chosen": 6.426780700683594, + "logits/rejected": 9.414957046508789, + "logps/chosen": -209.12893676757812, + "logps/rejected": -238.20860290527344, + "loss": 0.8466, + "rewards/accuracies": 0.25, + "rewards/chosen": -0.14154444634914398, + "rewards/margins": -0.1155434399843216, + "rewards/rejected": -0.026000961661338806, + "step": 5874 + }, + { + "epoch": 0.9085636961144403, + "grad_norm": 6.2518510818481445, + "learning_rate": 3.873009508534769e-06, + "logits/chosen": 14.87118148803711, + "logits/rejected": 8.895995140075684, + "logps/chosen": -314.4089660644531, + "logps/rejected": -231.42095947265625, + "loss": 0.6886, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.13724622130393982, + "rewards/margins": 0.10886593163013458, + "rewards/rejected": 0.028380297124385834, + "step": 5875 + }, + { + "epoch": 0.9087183452542045, + "grad_norm": 6.266149044036865, + "learning_rate": 3.872723106885096e-06, + "logits/chosen": 12.845640182495117, + "logits/rejected": 10.37359619140625, + "logps/chosen": -322.1435546875, + "logps/rejected": -274.1820068359375, + "loss": 0.739, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.2960757613182068, + "rewards/margins": 0.018304161727428436, + "rewards/rejected": -0.3143799304962158, + "step": 5876 + }, + { + "epoch": 0.9088729943939687, + "grad_norm": 5.608229160308838, + "learning_rate": 3.872436705235423e-06, + "logits/chosen": 15.693495750427246, + "logits/rejected": 10.589967727661133, + "logps/chosen": -354.5541076660156, + "logps/rejected": -338.3266906738281, + "loss": 0.6245, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.5042687654495239, + "rewards/margins": 0.1902521252632141, + "rewards/rejected": 0.3140166401863098, + "step": 5877 + }, + { + "epoch": 0.9090276435337329, + "grad_norm": 5.700530052185059, + "learning_rate": 3.8721503035857485e-06, + "logits/chosen": 9.475207328796387, + "logits/rejected": 9.510040283203125, + "logps/chosen": -368.9979553222656, + "logps/rejected": -413.1342468261719, + "loss": 0.7633, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.24829746782779694, + "rewards/margins": 0.0990840345621109, + "rewards/rejected": 0.14921341836452484, + "step": 5878 + }, + { + "epoch": 0.909182292673497, + "grad_norm": 5.58975076675415, + "learning_rate": 3.871863901936075e-06, + "logits/chosen": 6.354435920715332, + "logits/rejected": 6.9265336990356445, + "logps/chosen": -191.07354736328125, + "logps/rejected": -258.0149841308594, + "loss": 0.5965, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.16558726131916046, + "rewards/margins": 0.3360985517501831, + "rewards/rejected": -0.17051129043102264, + "step": 5879 + }, + { + "epoch": 0.9093369418132612, + "grad_norm": 8.578877449035645, + "learning_rate": 3.871577500286402e-06, + "logits/chosen": 13.14071273803711, + "logits/rejected": 9.55809211730957, + "logps/chosen": -295.3739013671875, + "logps/rejected": -231.53436279296875, + "loss": 0.7492, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.4801799952983856, + "rewards/margins": 0.06567084789276123, + "rewards/rejected": 0.4145091474056244, + "step": 5880 + }, + { + "epoch": 0.9094915909530253, + "grad_norm": 5.274852275848389, + "learning_rate": 3.8712910986367284e-06, + "logits/chosen": 10.39631462097168, + "logits/rejected": 9.390652656555176, + "logps/chosen": -281.65899658203125, + "logps/rejected": -277.39471435546875, + "loss": 0.6186, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.27159056067466736, + "rewards/margins": 0.44433337450027466, + "rewards/rejected": -0.1727428287267685, + "step": 5881 + }, + { + "epoch": 0.9096462400927895, + "grad_norm": 4.362977504730225, + "learning_rate": 3.871004696987055e-06, + "logits/chosen": 8.973688125610352, + "logits/rejected": 12.19929027557373, + "logps/chosen": -153.68785095214844, + "logps/rejected": -198.5580291748047, + "loss": 0.7528, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.24013786017894745, + "rewards/margins": -0.07577338814735413, + "rewards/rejected": 0.31591126322746277, + "step": 5882 + }, + { + "epoch": 0.9098008892325536, + "grad_norm": 4.9892096519470215, + "learning_rate": 3.870718295337381e-06, + "logits/chosen": 10.454109191894531, + "logits/rejected": 9.95296859741211, + "logps/chosen": -255.88088989257812, + "logps/rejected": -205.77029418945312, + "loss": 0.5946, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.33802539110183716, + "rewards/margins": 0.385031521320343, + "rewards/rejected": -0.04700613021850586, + "step": 5883 + }, + { + "epoch": 0.9099555383723178, + "grad_norm": 5.3680644035339355, + "learning_rate": 3.8704318936877075e-06, + "logits/chosen": 9.20842456817627, + "logits/rejected": 3.709805488586426, + "logps/chosen": -236.03872680664062, + "logps/rejected": -221.62564086914062, + "loss": 0.697, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.03382416069507599, + "rewards/margins": 0.3379474878311157, + "rewards/rejected": -0.30412331223487854, + "step": 5884 + }, + { + "epoch": 0.910110187512082, + "grad_norm": 5.74845552444458, + "learning_rate": 3.870145492038034e-06, + "logits/chosen": 12.581165313720703, + "logits/rejected": 5.730310916900635, + "logps/chosen": -263.79754638671875, + "logps/rejected": -205.05313110351562, + "loss": 0.5877, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.15203207731246948, + "rewards/margins": 0.2605578601360321, + "rewards/rejected": -0.10852579772472382, + "step": 5885 + }, + { + "epoch": 0.9102648366518461, + "grad_norm": 3.768998384475708, + "learning_rate": 3.869859090388361e-06, + "logits/chosen": 12.344903945922852, + "logits/rejected": 14.888874053955078, + "logps/chosen": -270.77239990234375, + "logps/rejected": -257.6913146972656, + "loss": 0.4446, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.4103223979473114, + "rewards/margins": 0.7548952102661133, + "rewards/rejected": -0.3445727229118347, + "step": 5886 + }, + { + "epoch": 0.9104194857916103, + "grad_norm": 4.497020721435547, + "learning_rate": 3.8695726887386875e-06, + "logits/chosen": 10.86084270477295, + "logits/rejected": 14.221529960632324, + "logps/chosen": -197.7698516845703, + "logps/rejected": -290.35101318359375, + "loss": 0.6059, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.017513558268547058, + "rewards/margins": 0.29999542236328125, + "rewards/rejected": -0.3175089955329895, + "step": 5887 + }, + { + "epoch": 0.9105741349313744, + "grad_norm": 4.394694805145264, + "learning_rate": 3.869286287089014e-06, + "logits/chosen": 8.712041854858398, + "logits/rejected": 6.049680709838867, + "logps/chosen": -336.4422607421875, + "logps/rejected": -312.62054443359375, + "loss": 0.4668, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.33791497349739075, + "rewards/margins": 0.6791126728057861, + "rewards/rejected": -0.3411977291107178, + "step": 5888 + }, + { + "epoch": 0.9107287840711386, + "grad_norm": 4.4446563720703125, + "learning_rate": 3.868999885439341e-06, + "logits/chosen": 15.691184997558594, + "logits/rejected": 8.593006134033203, + "logps/chosen": -347.0106506347656, + "logps/rejected": -222.3578643798828, + "loss": 0.4786, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.34931623935699463, + "rewards/margins": 0.5407199859619141, + "rewards/rejected": -0.19140376150608063, + "step": 5889 + }, + { + "epoch": 0.9108834332109028, + "grad_norm": 5.76023530960083, + "learning_rate": 3.868713483789667e-06, + "logits/chosen": 11.842556953430176, + "logits/rejected": 13.940601348876953, + "logps/chosen": -237.77659606933594, + "logps/rejected": -233.53759765625, + "loss": 0.7998, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.1934671401977539, + "rewards/margins": -0.1268077939748764, + "rewards/rejected": 0.3202749490737915, + "step": 5890 + }, + { + "epoch": 0.911038082350667, + "grad_norm": 6.6792311668396, + "learning_rate": 3.868427082139993e-06, + "logits/chosen": 8.31523323059082, + "logits/rejected": 9.795251846313477, + "logps/chosen": -261.92724609375, + "logps/rejected": -340.0951232910156, + "loss": 0.8456, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.06994685530662537, + "rewards/margins": -0.193409726023674, + "rewards/rejected": 0.12346287071704865, + "step": 5891 + }, + { + "epoch": 0.9111927314904311, + "grad_norm": 5.544681549072266, + "learning_rate": 3.86814068049032e-06, + "logits/chosen": 11.096994400024414, + "logits/rejected": 8.1788330078125, + "logps/chosen": -404.7717590332031, + "logps/rejected": -305.47772216796875, + "loss": 0.5715, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.307533860206604, + "rewards/margins": 0.3805437386035919, + "rewards/rejected": -0.07300987839698792, + "step": 5892 + }, + { + "epoch": 0.9113473806301953, + "grad_norm": 5.960513114929199, + "learning_rate": 3.8678542788406465e-06, + "logits/chosen": 5.858979225158691, + "logits/rejected": 7.962894439697266, + "logps/chosen": -254.59347534179688, + "logps/rejected": -332.9234313964844, + "loss": 0.4901, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.24142858386039734, + "rewards/margins": 0.6056240797042847, + "rewards/rejected": -0.3641955554485321, + "step": 5893 + }, + { + "epoch": 0.9115020297699594, + "grad_norm": 4.539766311645508, + "learning_rate": 3.867567877190973e-06, + "logits/chosen": 10.890776634216309, + "logits/rejected": 6.910697937011719, + "logps/chosen": -329.326904296875, + "logps/rejected": -285.2637939453125, + "loss": 0.4738, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.6203383207321167, + "rewards/margins": 0.5936251878738403, + "rewards/rejected": 0.026713073253631592, + "step": 5894 + }, + { + "epoch": 0.9116566789097236, + "grad_norm": 4.375959873199463, + "learning_rate": 3.8672814755413e-06, + "logits/chosen": 5.006954193115234, + "logits/rejected": 5.969168186187744, + "logps/chosen": -217.16885375976562, + "logps/rejected": -249.6975555419922, + "loss": 0.532, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.15714392066001892, + "rewards/margins": 0.4601404666900635, + "rewards/rejected": -0.30299657583236694, + "step": 5895 + }, + { + "epoch": 0.9118113280494877, + "grad_norm": 6.452846527099609, + "learning_rate": 3.8669950738916265e-06, + "logits/chosen": 10.5885591506958, + "logits/rejected": 14.107398986816406, + "logps/chosen": -332.09906005859375, + "logps/rejected": -306.437255859375, + "loss": 0.742, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.19397488236427307, + "rewards/margins": -0.062388621270656586, + "rewards/rejected": 0.25636351108551025, + "step": 5896 + }, + { + "epoch": 0.9119659771892519, + "grad_norm": 4.224374771118164, + "learning_rate": 3.866708672241952e-06, + "logits/chosen": 8.025949478149414, + "logits/rejected": 0.833151638507843, + "logps/chosen": -243.8278350830078, + "logps/rejected": -207.10089111328125, + "loss": 0.5728, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.21796277165412903, + "rewards/margins": 0.8589696884155273, + "rewards/rejected": -1.076932430267334, + "step": 5897 + }, + { + "epoch": 0.912120626329016, + "grad_norm": 4.2984819412231445, + "learning_rate": 3.866422270592279e-06, + "logits/chosen": 9.800004005432129, + "logits/rejected": 6.534969806671143, + "logps/chosen": -131.1072540283203, + "logps/rejected": -105.47775268554688, + "loss": 0.7154, + "rewards/accuracies": 0.25, + "rewards/chosen": 0.0369708277285099, + "rewards/margins": 0.01552751287817955, + "rewards/rejected": 0.02144332230091095, + "step": 5898 + }, + { + "epoch": 0.9122752754687802, + "grad_norm": 4.24132776260376, + "learning_rate": 3.866135868942606e-06, + "logits/chosen": 11.291203498840332, + "logits/rejected": 4.246890068054199, + "logps/chosen": -347.5997314453125, + "logps/rejected": -274.2054443359375, + "loss": 0.4576, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.272003173828125, + "rewards/margins": 0.7611892223358154, + "rewards/rejected": -0.48918601870536804, + "step": 5899 + }, + { + "epoch": 0.9124299246085443, + "grad_norm": 5.092874526977539, + "learning_rate": 3.865849467292932e-06, + "logits/chosen": 9.607381820678711, + "logits/rejected": 1.9911748170852661, + "logps/chosen": -386.2210693359375, + "logps/rejected": -442.4115295410156, + "loss": 0.4891, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.49228185415267944, + "rewards/margins": 0.9390621185302734, + "rewards/rejected": -0.4467802047729492, + "step": 5900 + }, + { + "epoch": 0.9125845737483085, + "grad_norm": 4.684938907623291, + "learning_rate": 3.865563065643259e-06, + "logits/chosen": 12.4434814453125, + "logits/rejected": 11.634963989257812, + "logps/chosen": -195.53292846679688, + "logps/rejected": -226.75164794921875, + "loss": 0.6766, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.07983823120594025, + "rewards/margins": 0.26623111963272095, + "rewards/rejected": -0.3460693359375, + "step": 5901 + }, + { + "epoch": 0.9127392228880726, + "grad_norm": 5.0986857414245605, + "learning_rate": 3.8652766639935855e-06, + "logits/chosen": 11.001699447631836, + "logits/rejected": 2.162646532058716, + "logps/chosen": -364.4613342285156, + "logps/rejected": -277.11102294921875, + "loss": 0.5448, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.3906462788581848, + "rewards/margins": 0.4935569763183594, + "rewards/rejected": -0.10291068255901337, + "step": 5902 + }, + { + "epoch": 0.9128938720278369, + "grad_norm": 6.443974018096924, + "learning_rate": 3.864990262343911e-06, + "logits/chosen": 15.372228622436523, + "logits/rejected": 9.23705005645752, + "logps/chosen": -422.90985107421875, + "logps/rejected": -329.5019836425781, + "loss": 0.6644, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.29591548442840576, + "rewards/margins": 0.1617671698331833, + "rewards/rejected": 0.13414831459522247, + "step": 5903 + }, + { + "epoch": 0.913048521167601, + "grad_norm": 6.59765625, + "learning_rate": 3.864703860694238e-06, + "logits/chosen": 4.117753982543945, + "logits/rejected": 6.603634357452393, + "logps/chosen": -264.6317138671875, + "logps/rejected": -238.9684600830078, + "loss": 0.8735, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.20861637592315674, + "rewards/margins": -0.1493082046508789, + "rewards/rejected": -0.059308186173439026, + "step": 5904 + }, + { + "epoch": 0.9132031703073652, + "grad_norm": 4.567427158355713, + "learning_rate": 3.864417459044565e-06, + "logits/chosen": 12.141523361206055, + "logits/rejected": 5.422127723693848, + "logps/chosen": -270.4252624511719, + "logps/rejected": -132.70730590820312, + "loss": 0.5819, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.030228987336158752, + "rewards/margins": 0.2846803665161133, + "rewards/rejected": -0.2544513940811157, + "step": 5905 + }, + { + "epoch": 0.9133578194471293, + "grad_norm": 6.439109802246094, + "learning_rate": 3.864131057394891e-06, + "logits/chosen": 6.0170111656188965, + "logits/rejected": 9.03499984741211, + "logps/chosen": -294.7020263671875, + "logps/rejected": -388.0574645996094, + "loss": 0.8228, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.10948237776756287, + "rewards/margins": -0.07519664615392685, + "rewards/rejected": -0.03428572416305542, + "step": 5906 + }, + { + "epoch": 0.9135124685868935, + "grad_norm": 5.855745315551758, + "learning_rate": 3.863844655745217e-06, + "logits/chosen": 17.104736328125, + "logits/rejected": 8.900671005249023, + "logps/chosen": -527.9058837890625, + "logps/rejected": -407.5986328125, + "loss": 0.4497, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.4642106890678406, + "rewards/margins": 0.6380539536476135, + "rewards/rejected": -0.17384320497512817, + "step": 5907 + }, + { + "epoch": 0.9136671177266577, + "grad_norm": 4.556571960449219, + "learning_rate": 3.863558254095544e-06, + "logits/chosen": 13.010801315307617, + "logits/rejected": 10.369705200195312, + "logps/chosen": -308.9446716308594, + "logps/rejected": -252.33395385742188, + "loss": 0.6248, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.04721641540527344, + "rewards/margins": 0.19226133823394775, + "rewards/rejected": -0.23947773873806, + "step": 5908 + }, + { + "epoch": 0.9138217668664218, + "grad_norm": 4.743622779846191, + "learning_rate": 3.86327185244587e-06, + "logits/chosen": 10.117725372314453, + "logits/rejected": 9.849259376525879, + "logps/chosen": -194.17526245117188, + "logps/rejected": -159.54159545898438, + "loss": 0.7004, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.03017682209610939, + "rewards/margins": 0.05196913704276085, + "rewards/rejected": -0.02179231494665146, + "step": 5909 + }, + { + "epoch": 0.913976416006186, + "grad_norm": 6.324093818664551, + "learning_rate": 3.862985450796197e-06, + "logits/chosen": 9.268342971801758, + "logits/rejected": 8.057497024536133, + "logps/chosen": -260.33953857421875, + "logps/rejected": -235.00332641601562, + "loss": 0.6996, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.15262578427791595, + "rewards/margins": 0.05134010314941406, + "rewards/rejected": -0.20396587252616882, + "step": 5910 + }, + { + "epoch": 0.9141310651459501, + "grad_norm": 4.399489879608154, + "learning_rate": 3.862699049146523e-06, + "logits/chosen": 10.362421035766602, + "logits/rejected": 10.820565223693848, + "logps/chosen": -272.98870849609375, + "logps/rejected": -253.4434356689453, + "loss": 0.5946, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.1616211086511612, + "rewards/margins": 0.3922400176525116, + "rewards/rejected": -0.553861141204834, + "step": 5911 + }, + { + "epoch": 0.9142857142857143, + "grad_norm": 9.468118667602539, + "learning_rate": 3.8624126474968495e-06, + "logits/chosen": 12.71480941772461, + "logits/rejected": 8.210349082946777, + "logps/chosen": -309.13043212890625, + "logps/rejected": -287.5068359375, + "loss": 0.7842, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.1380024552345276, + "rewards/margins": -0.030953746289014816, + "rewards/rejected": 0.16895617544651031, + "step": 5912 + }, + { + "epoch": 0.9144403634254784, + "grad_norm": 5.364417552947998, + "learning_rate": 3.862126245847176e-06, + "logits/chosen": 14.358708381652832, + "logits/rejected": 6.679542064666748, + "logps/chosen": -288.8837890625, + "logps/rejected": -221.76528930664062, + "loss": 0.5227, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.10908961296081543, + "rewards/margins": 0.518195629119873, + "rewards/rejected": -0.40910604596138, + "step": 5913 + }, + { + "epoch": 0.9145950125652426, + "grad_norm": 6.501421928405762, + "learning_rate": 3.861839844197503e-06, + "logits/chosen": 11.75555419921875, + "logits/rejected": 7.3360724449157715, + "logps/chosen": -246.612548828125, + "logps/rejected": -224.14794921875, + "loss": 0.617, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.19812843203544617, + "rewards/margins": 0.30944639444351196, + "rewards/rejected": -0.11131791770458221, + "step": 5914 + }, + { + "epoch": 0.9147496617050067, + "grad_norm": 4.851212024688721, + "learning_rate": 3.8615534425478295e-06, + "logits/chosen": 15.381839752197266, + "logits/rejected": 9.44843864440918, + "logps/chosen": -325.95123291015625, + "logps/rejected": -271.8412780761719, + "loss": 0.5835, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.2488618940114975, + "rewards/margins": 0.48829320073127747, + "rewards/rejected": -0.23943127691745758, + "step": 5915 + }, + { + "epoch": 0.914904310844771, + "grad_norm": 5.3454484939575195, + "learning_rate": 3.861267040898155e-06, + "logits/chosen": 11.534419059753418, + "logits/rejected": 7.670881271362305, + "logps/chosen": -373.47320556640625, + "logps/rejected": -303.9528503417969, + "loss": 0.6034, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.04454078525304794, + "rewards/margins": 0.23381319642066956, + "rewards/rejected": -0.2783539891242981, + "step": 5916 + }, + { + "epoch": 0.9150589599845351, + "grad_norm": 5.922972202301025, + "learning_rate": 3.860980639248482e-06, + "logits/chosen": 8.902524948120117, + "logits/rejected": 9.386190414428711, + "logps/chosen": -277.7146301269531, + "logps/rejected": -339.0614929199219, + "loss": 0.6622, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.05004728212952614, + "rewards/margins": 0.17682011425495148, + "rewards/rejected": -0.12677285075187683, + "step": 5917 + }, + { + "epoch": 0.9152136091242993, + "grad_norm": 5.290755748748779, + "learning_rate": 3.8606942375988086e-06, + "logits/chosen": 5.35184383392334, + "logits/rejected": 9.185018539428711, + "logps/chosen": -261.62103271484375, + "logps/rejected": -321.3396301269531, + "loss": 0.5852, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.05494198948144913, + "rewards/margins": 0.5391534566879272, + "rewards/rejected": -0.5940955281257629, + "step": 5918 + }, + { + "epoch": 0.9153682582640634, + "grad_norm": 5.035971164703369, + "learning_rate": 3.860407835949135e-06, + "logits/chosen": 15.949684143066406, + "logits/rejected": 11.24844741821289, + "logps/chosen": -246.62939453125, + "logps/rejected": -151.89537048339844, + "loss": 0.5858, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.5066784024238586, + "rewards/margins": 0.3790939450263977, + "rewards/rejected": 0.1275845170021057, + "step": 5919 + }, + { + "epoch": 0.9155229074038276, + "grad_norm": 5.3687825202941895, + "learning_rate": 3.860121434299462e-06, + "logits/chosen": 13.22360897064209, + "logits/rejected": 14.694915771484375, + "logps/chosen": -269.3085021972656, + "logps/rejected": -269.97723388671875, + "loss": 0.5464, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.17001762986183167, + "rewards/margins": 0.401035338640213, + "rewards/rejected": -0.23101767897605896, + "step": 5920 + }, + { + "epoch": 0.9156775565435917, + "grad_norm": 4.238692760467529, + "learning_rate": 3.8598350326497885e-06, + "logits/chosen": 11.878721237182617, + "logits/rejected": 6.834425449371338, + "logps/chosen": -354.71038818359375, + "logps/rejected": -227.5897979736328, + "loss": 0.5358, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.29697972536087036, + "rewards/margins": 0.5982266664505005, + "rewards/rejected": -0.3012469410896301, + "step": 5921 + }, + { + "epoch": 0.9158322056833559, + "grad_norm": 8.2029390335083, + "learning_rate": 3.859548631000115e-06, + "logits/chosen": 7.202447891235352, + "logits/rejected": 2.3025941848754883, + "logps/chosen": -375.6297607421875, + "logps/rejected": -220.82632446289062, + "loss": 0.7654, + "rewards/accuracies": 0.25, + "rewards/chosen": 0.05733557045459747, + "rewards/margins": -0.06681393086910248, + "rewards/rejected": 0.12414951622486115, + "step": 5922 + }, + { + "epoch": 0.91598685482312, + "grad_norm": 7.002408504486084, + "learning_rate": 3.859262229350441e-06, + "logits/chosen": 15.865678787231445, + "logits/rejected": 15.669532775878906, + "logps/chosen": -254.5007781982422, + "logps/rejected": -230.11817932128906, + "loss": 0.7696, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.12614861130714417, + "rewards/margins": -0.035456568002700806, + "rewards/rejected": 0.16160517930984497, + "step": 5923 + }, + { + "epoch": 0.9161415039628842, + "grad_norm": 3.7600300312042236, + "learning_rate": 3.858975827700768e-06, + "logits/chosen": 6.476556777954102, + "logits/rejected": 5.155506134033203, + "logps/chosen": -163.3370361328125, + "logps/rejected": -174.42233276367188, + "loss": 0.6208, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.23719027638435364, + "rewards/margins": 0.20608875155448914, + "rewards/rejected": 0.031101537868380547, + "step": 5924 + }, + { + "epoch": 0.9162961531026483, + "grad_norm": 3.0437915325164795, + "learning_rate": 3.858689426051094e-06, + "logits/chosen": 7.21820068359375, + "logits/rejected": 5.366580963134766, + "logps/chosen": -165.77870178222656, + "logps/rejected": -122.99116516113281, + "loss": 0.5162, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.12377265840768814, + "rewards/margins": 0.4593256711959839, + "rewards/rejected": -0.33555299043655396, + "step": 5925 + }, + { + "epoch": 0.9164508022424125, + "grad_norm": 5.067497253417969, + "learning_rate": 3.858403024401421e-06, + "logits/chosen": 12.549297332763672, + "logits/rejected": 11.871038436889648, + "logps/chosen": -286.08685302734375, + "logps/rejected": -315.29034423828125, + "loss": 0.7401, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.2791282534599304, + "rewards/margins": 0.01311815157532692, + "rewards/rejected": 0.2660101056098938, + "step": 5926 + }, + { + "epoch": 0.9166054513821766, + "grad_norm": 4.377481460571289, + "learning_rate": 3.8581166227517476e-06, + "logits/chosen": 5.702720642089844, + "logits/rejected": 3.9241890907287598, + "logps/chosen": -137.35012817382812, + "logps/rejected": -145.62823486328125, + "loss": 0.6581, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.04117536544799805, + "rewards/margins": 0.11607442796230316, + "rewards/rejected": -0.1572497934103012, + "step": 5927 + }, + { + "epoch": 0.9167601005219409, + "grad_norm": 4.919307708740234, + "learning_rate": 3.857830221102074e-06, + "logits/chosen": 6.303287029266357, + "logits/rejected": 13.36454963684082, + "logps/chosen": -148.81932067871094, + "logps/rejected": -218.31182861328125, + "loss": 0.6832, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.14241819083690643, + "rewards/margins": 0.10381338000297546, + "rewards/rejected": -0.2462315559387207, + "step": 5928 + }, + { + "epoch": 0.916914749661705, + "grad_norm": 5.774654865264893, + "learning_rate": 3.8575438194524e-06, + "logits/chosen": 15.903022766113281, + "logits/rejected": 9.915826797485352, + "logps/chosen": -309.3786926269531, + "logps/rejected": -261.5390625, + "loss": 0.6905, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.043460264801979065, + "rewards/margins": 0.12520970404148102, + "rewards/rejected": -0.08174943923950195, + "step": 5929 + }, + { + "epoch": 0.9170693988014692, + "grad_norm": 5.834843635559082, + "learning_rate": 3.857257417802727e-06, + "logits/chosen": 6.180651664733887, + "logits/rejected": 7.186861991882324, + "logps/chosen": -256.467529296875, + "logps/rejected": -198.36305236816406, + "loss": 0.714, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.016269681975245476, + "rewards/margins": 0.0899326503276825, + "rewards/rejected": -0.07366294413805008, + "step": 5930 + }, + { + "epoch": 0.9172240479412334, + "grad_norm": 6.222607612609863, + "learning_rate": 3.856971016153053e-06, + "logits/chosen": 8.585904121398926, + "logits/rejected": 5.8289570808410645, + "logps/chosen": -186.5392608642578, + "logps/rejected": -183.44021606445312, + "loss": 0.7348, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.10585668683052063, + "rewards/margins": 0.008877936750650406, + "rewards/rejected": 0.09697875380516052, + "step": 5931 + }, + { + "epoch": 0.9173786970809975, + "grad_norm": 5.199121952056885, + "learning_rate": 3.85668461450338e-06, + "logits/chosen": 15.467013359069824, + "logits/rejected": 5.116355895996094, + "logps/chosen": -318.99383544921875, + "logps/rejected": -254.9661102294922, + "loss": 0.5685, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.36116892099380493, + "rewards/margins": 0.5343852043151855, + "rewards/rejected": -0.17321628332138062, + "step": 5932 + }, + { + "epoch": 0.9175333462207617, + "grad_norm": 4.401479244232178, + "learning_rate": 3.856398212853707e-06, + "logits/chosen": 9.046539306640625, + "logits/rejected": 2.07232403755188, + "logps/chosen": -253.52285766601562, + "logps/rejected": -140.69761657714844, + "loss": 0.5358, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.02295185625553131, + "rewards/margins": 0.4381467401981354, + "rewards/rejected": -0.41519486904144287, + "step": 5933 + }, + { + "epoch": 0.9176879953605258, + "grad_norm": 9.792787551879883, + "learning_rate": 3.856111811204033e-06, + "logits/chosen": 10.339614868164062, + "logits/rejected": 9.148236274719238, + "logps/chosen": -342.1416931152344, + "logps/rejected": -340.17510986328125, + "loss": 0.8286, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.08393508195877075, + "rewards/margins": -0.0599772185087204, + "rewards/rejected": 0.14391233026981354, + "step": 5934 + }, + { + "epoch": 0.91784264450029, + "grad_norm": 5.708592414855957, + "learning_rate": 3.85582540955436e-06, + "logits/chosen": 10.877514839172363, + "logits/rejected": 8.588447570800781, + "logps/chosen": -283.3267822265625, + "logps/rejected": -257.55224609375, + "loss": 0.5222, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.2284901738166809, + "rewards/margins": 0.8872479200363159, + "rewards/rejected": -0.6587576866149902, + "step": 5935 + }, + { + "epoch": 0.9179972936400541, + "grad_norm": 4.462575912475586, + "learning_rate": 3.855539007904686e-06, + "logits/chosen": 10.294260025024414, + "logits/rejected": 5.6089019775390625, + "logps/chosen": -266.9420166015625, + "logps/rejected": -242.7266082763672, + "loss": 0.5873, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.40647435188293457, + "rewards/margins": 0.2888174057006836, + "rewards/rejected": 0.11765693873167038, + "step": 5936 + }, + { + "epoch": 0.9181519427798183, + "grad_norm": 6.262191295623779, + "learning_rate": 3.855252606255012e-06, + "logits/chosen": 9.659331321716309, + "logits/rejected": 7.160659313201904, + "logps/chosen": -371.291259765625, + "logps/rejected": -301.55078125, + "loss": 0.6773, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.009488284587860107, + "rewards/margins": 0.218671977519989, + "rewards/rejected": -0.2091836929321289, + "step": 5937 + }, + { + "epoch": 0.9183065919195824, + "grad_norm": 13.311737060546875, + "learning_rate": 3.854966204605339e-06, + "logits/chosen": 11.907447814941406, + "logits/rejected": 8.653641700744629, + "logps/chosen": -319.3544006347656, + "logps/rejected": -288.51898193359375, + "loss": 0.9044, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.2930832803249359, + "rewards/margins": -0.2596186697483063, + "rewards/rejected": -0.03346461430191994, + "step": 5938 + }, + { + "epoch": 0.9184612410593466, + "grad_norm": 6.035009384155273, + "learning_rate": 3.854679802955666e-06, + "logits/chosen": 5.040353775024414, + "logits/rejected": 5.486931324005127, + "logps/chosen": -365.8705139160156, + "logps/rejected": -309.94818115234375, + "loss": 0.6754, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.0919039398431778, + "rewards/margins": 0.09457303583621979, + "rewards/rejected": -0.002669095993041992, + "step": 5939 + }, + { + "epoch": 0.9186158901991107, + "grad_norm": 5.269845962524414, + "learning_rate": 3.854393401305992e-06, + "logits/chosen": 7.8071208000183105, + "logits/rejected": 9.440631866455078, + "logps/chosen": -239.56668090820312, + "logps/rejected": -271.0215148925781, + "loss": 0.6969, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.15914659202098846, + "rewards/margins": 0.028732866048812866, + "rewards/rejected": 0.1304137408733368, + "step": 5940 + }, + { + "epoch": 0.918770539338875, + "grad_norm": 5.278595924377441, + "learning_rate": 3.854106999656318e-06, + "logits/chosen": 11.516739845275879, + "logits/rejected": 13.211091995239258, + "logps/chosen": -325.21099853515625, + "logps/rejected": -339.77764892578125, + "loss": 0.4958, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.31322020292282104, + "rewards/margins": 0.6078562140464783, + "rewards/rejected": -0.29463598132133484, + "step": 5941 + }, + { + "epoch": 0.9189251884786391, + "grad_norm": 6.398715019226074, + "learning_rate": 3.853820598006645e-06, + "logits/chosen": 11.936965942382812, + "logits/rejected": 10.497695922851562, + "logps/chosen": -364.0721435546875, + "logps/rejected": -253.64527893066406, + "loss": 0.7408, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.3417280912399292, + "rewards/margins": 0.04365827143192291, + "rewards/rejected": -0.3853863775730133, + "step": 5942 + }, + { + "epoch": 0.9190798376184033, + "grad_norm": 4.045505046844482, + "learning_rate": 3.8535341963569714e-06, + "logits/chosen": 14.519649505615234, + "logits/rejected": 15.048075675964355, + "logps/chosen": -236.2891845703125, + "logps/rejected": -254.0170440673828, + "loss": 0.554, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.19047680497169495, + "rewards/margins": 0.47173410654067993, + "rewards/rejected": -0.281257301568985, + "step": 5943 + }, + { + "epoch": 0.9192344867581674, + "grad_norm": 5.424352645874023, + "learning_rate": 3.853247794707298e-06, + "logits/chosen": 7.910563945770264, + "logits/rejected": 9.230586051940918, + "logps/chosen": -217.4111328125, + "logps/rejected": -269.77642822265625, + "loss": 0.7234, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.17150217294692993, + "rewards/margins": 0.04834473133087158, + "rewards/rejected": 0.12315745651721954, + "step": 5944 + }, + { + "epoch": 0.9193891358979316, + "grad_norm": 4.2203168869018555, + "learning_rate": 3.852961393057624e-06, + "logits/chosen": 9.357605934143066, + "logits/rejected": 8.5084228515625, + "logps/chosen": -207.44784545898438, + "logps/rejected": -242.84083557128906, + "loss": 0.6284, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.16369342803955078, + "rewards/margins": 0.37120023369789124, + "rewards/rejected": -0.5348936915397644, + "step": 5945 + }, + { + "epoch": 0.9195437850376957, + "grad_norm": 4.9508185386657715, + "learning_rate": 3.8526749914079505e-06, + "logits/chosen": 14.602723121643066, + "logits/rejected": 7.486633777618408, + "logps/chosen": -271.6171875, + "logps/rejected": -204.06320190429688, + "loss": 0.5517, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.1853051632642746, + "rewards/margins": 0.44909024238586426, + "rewards/rejected": -0.26378506422042847, + "step": 5946 + }, + { + "epoch": 0.9196984341774599, + "grad_norm": 8.3158597946167, + "learning_rate": 3.852388589758277e-06, + "logits/chosen": 8.789648056030273, + "logits/rejected": 5.727154731750488, + "logps/chosen": -338.49835205078125, + "logps/rejected": -257.8027038574219, + "loss": 0.688, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.31732234358787537, + "rewards/margins": 0.16338233649730682, + "rewards/rejected": 0.15394000709056854, + "step": 5947 + }, + { + "epoch": 0.919853083317224, + "grad_norm": 6.242115497589111, + "learning_rate": 3.852102188108604e-06, + "logits/chosen": 13.601849555969238, + "logits/rejected": 12.952322959899902, + "logps/chosen": -258.85833740234375, + "logps/rejected": -302.5117492675781, + "loss": 0.5806, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.3359399437904358, + "rewards/margins": 0.3445708155632019, + "rewards/rejected": -0.008630847558379173, + "step": 5948 + }, + { + "epoch": 0.9200077324569882, + "grad_norm": 3.889660596847534, + "learning_rate": 3.85181578645893e-06, + "logits/chosen": 8.088129997253418, + "logits/rejected": 10.209095001220703, + "logps/chosen": -214.66986083984375, + "logps/rejected": -237.7310028076172, + "loss": 0.5545, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.30456647276878357, + "rewards/margins": 0.37158316373825073, + "rewards/rejected": -0.06701669096946716, + "step": 5949 + }, + { + "epoch": 0.9201623815967523, + "grad_norm": 5.8114333152771, + "learning_rate": 3.851529384809256e-06, + "logits/chosen": 8.929306030273438, + "logits/rejected": 7.519437789916992, + "logps/chosen": -251.84629821777344, + "logps/rejected": -217.41412353515625, + "loss": 0.6704, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.004251718521118164, + "rewards/margins": 0.0848056823015213, + "rewards/rejected": -0.08055394887924194, + "step": 5950 + }, + { + "epoch": 0.9203170307365165, + "grad_norm": 8.590255737304688, + "learning_rate": 3.851242983159583e-06, + "logits/chosen": 12.934130668640137, + "logits/rejected": 6.063616752624512, + "logps/chosen": -369.6166687011719, + "logps/rejected": -229.67901611328125, + "loss": 0.6688, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.11090479046106339, + "rewards/margins": 0.3181837201118469, + "rewards/rejected": -0.20727893710136414, + "step": 5951 + }, + { + "epoch": 0.9204716798762806, + "grad_norm": 4.726585865020752, + "learning_rate": 3.85095658150991e-06, + "logits/chosen": 9.413644790649414, + "logits/rejected": 11.746776580810547, + "logps/chosen": -241.02403259277344, + "logps/rejected": -286.5113830566406, + "loss": 0.6274, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.08068778365850449, + "rewards/margins": 0.2278514951467514, + "rewards/rejected": -0.3085392713546753, + "step": 5952 + }, + { + "epoch": 0.9206263290160448, + "grad_norm": 4.609578609466553, + "learning_rate": 3.850670179860236e-06, + "logits/chosen": 17.141637802124023, + "logits/rejected": 10.48354721069336, + "logps/chosen": -382.8043212890625, + "logps/rejected": -249.50013732910156, + "loss": 0.5196, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.40035080909729004, + "rewards/margins": 0.6527730226516724, + "rewards/rejected": -0.25242215394973755, + "step": 5953 + }, + { + "epoch": 0.9207809781558091, + "grad_norm": 5.546847343444824, + "learning_rate": 3.850383778210563e-06, + "logits/chosen": 9.014184951782227, + "logits/rejected": 7.124309539794922, + "logps/chosen": -359.6578674316406, + "logps/rejected": -309.4601745605469, + "loss": 0.6796, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.27597880363464355, + "rewards/margins": 0.14819855988025665, + "rewards/rejected": 0.1277802288532257, + "step": 5954 + }, + { + "epoch": 0.9209356272955732, + "grad_norm": 4.403045177459717, + "learning_rate": 3.8500973765608896e-06, + "logits/chosen": 5.863503932952881, + "logits/rejected": 4.2348856925964355, + "logps/chosen": -212.52920532226562, + "logps/rejected": -211.0526123046875, + "loss": 0.7037, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.03576654940843582, + "rewards/margins": 0.21357953548431396, + "rewards/rejected": -0.17781296372413635, + "step": 5955 + }, + { + "epoch": 0.9210902764353374, + "grad_norm": 6.843137741088867, + "learning_rate": 3.849810974911215e-06, + "logits/chosen": 13.903402328491211, + "logits/rejected": 9.994194030761719, + "logps/chosen": -333.7403259277344, + "logps/rejected": -288.69317626953125, + "loss": 0.829, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.10667981952428818, + "rewards/margins": 0.06349324434995651, + "rewards/rejected": -0.1701730489730835, + "step": 5956 + }, + { + "epoch": 0.9212449255751015, + "grad_norm": 5.5774946212768555, + "learning_rate": 3.849524573261542e-06, + "logits/chosen": 15.12047004699707, + "logits/rejected": 12.097946166992188, + "logps/chosen": -265.2837829589844, + "logps/rejected": -175.3000946044922, + "loss": 0.7397, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.2028406262397766, + "rewards/margins": 0.029902443289756775, + "rewards/rejected": 0.17293816804885864, + "step": 5957 + }, + { + "epoch": 0.9213995747148657, + "grad_norm": 5.4411115646362305, + "learning_rate": 3.849238171611869e-06, + "logits/chosen": 14.200237274169922, + "logits/rejected": 12.430276870727539, + "logps/chosen": -250.39157104492188, + "logps/rejected": -207.62936401367188, + "loss": 0.7891, + "rewards/accuracies": 0.25, + "rewards/chosen": 0.21793919801712036, + "rewards/margins": -0.14933212101459503, + "rewards/rejected": 0.3672713041305542, + "step": 5958 + }, + { + "epoch": 0.9215542238546298, + "grad_norm": 4.988310813903809, + "learning_rate": 3.848951769962195e-06, + "logits/chosen": 13.829568862915039, + "logits/rejected": 12.126517295837402, + "logps/chosen": -286.1348571777344, + "logps/rejected": -296.5434875488281, + "loss": 0.6016, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.1906820386648178, + "rewards/margins": 0.2569863498210907, + "rewards/rejected": -0.4476683735847473, + "step": 5959 + }, + { + "epoch": 0.921708872994394, + "grad_norm": 5.089449882507324, + "learning_rate": 3.848665368312522e-06, + "logits/chosen": 13.649700164794922, + "logits/rejected": 9.107711791992188, + "logps/chosen": -356.34674072265625, + "logps/rejected": -247.37521362304688, + "loss": 0.6604, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.29121553897857666, + "rewards/margins": 0.229379802942276, + "rewards/rejected": 0.061835743486881256, + "step": 5960 + }, + { + "epoch": 0.9218635221341581, + "grad_norm": 3.6978254318237305, + "learning_rate": 3.848378966662849e-06, + "logits/chosen": 6.951534748077393, + "logits/rejected": 6.844333171844482, + "logps/chosen": -208.95864868164062, + "logps/rejected": -170.73056030273438, + "loss": 0.5061, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.29118815064430237, + "rewards/margins": 0.5795993804931641, + "rewards/rejected": -0.2884112596511841, + "step": 5961 + }, + { + "epoch": 0.9220181712739223, + "grad_norm": 16.355939865112305, + "learning_rate": 3.848092565013174e-06, + "logits/chosen": 9.058082580566406, + "logits/rejected": 12.13011360168457, + "logps/chosen": -275.96343994140625, + "logps/rejected": -326.87603759765625, + "loss": 0.749, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.15806210041046143, + "rewards/margins": 0.3714718818664551, + "rewards/rejected": -0.5295339226722717, + "step": 5962 + }, + { + "epoch": 0.9221728204136864, + "grad_norm": 5.769424915313721, + "learning_rate": 3.847806163363501e-06, + "logits/chosen": 8.52417278289795, + "logits/rejected": 6.214466094970703, + "logps/chosen": -401.5508117675781, + "logps/rejected": -279.8078918457031, + "loss": 0.4864, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.13646875321865082, + "rewards/margins": 0.5443332195281982, + "rewards/rejected": -0.407864511013031, + "step": 5963 + }, + { + "epoch": 0.9223274695534506, + "grad_norm": 7.583956241607666, + "learning_rate": 3.847519761713828e-06, + "logits/chosen": 9.999289512634277, + "logits/rejected": 10.591829299926758, + "logps/chosen": -365.614990234375, + "logps/rejected": -267.7789611816406, + "loss": 0.5999, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.1878223419189453, + "rewards/margins": 0.3166043162345886, + "rewards/rejected": -0.1287819892168045, + "step": 5964 + }, + { + "epoch": 0.9224821186932147, + "grad_norm": 3.909832715988159, + "learning_rate": 3.847233360064154e-06, + "logits/chosen": 13.395157814025879, + "logits/rejected": 13.810171127319336, + "logps/chosen": -283.162109375, + "logps/rejected": -237.293701171875, + "loss": 0.5571, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.36247557401657104, + "rewards/margins": 0.371268630027771, + "rewards/rejected": -0.008793100714683533, + "step": 5965 + }, + { + "epoch": 0.9226367678329789, + "grad_norm": 4.247903347015381, + "learning_rate": 3.846946958414481e-06, + "logits/chosen": 11.090770721435547, + "logits/rejected": 11.543726921081543, + "logps/chosen": -356.1608581542969, + "logps/rejected": -360.84185791015625, + "loss": 0.4669, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.35332566499710083, + "rewards/margins": 0.6636320352554321, + "rewards/rejected": -0.3103064000606537, + "step": 5966 + }, + { + "epoch": 0.9227914169727431, + "grad_norm": 5.168519973754883, + "learning_rate": 3.846660556764808e-06, + "logits/chosen": 8.626436233520508, + "logits/rejected": 7.0696187019348145, + "logps/chosen": -189.49253845214844, + "logps/rejected": -165.54818725585938, + "loss": 0.6043, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.39061564207077026, + "rewards/margins": 0.35810378193855286, + "rewards/rejected": -0.7487194538116455, + "step": 5967 + }, + { + "epoch": 0.9229460661125073, + "grad_norm": 5.606865406036377, + "learning_rate": 3.846374155115134e-06, + "logits/chosen": 8.776806831359863, + "logits/rejected": 8.227149963378906, + "logps/chosen": -234.8128662109375, + "logps/rejected": -283.83160400390625, + "loss": 0.7573, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.34023475646972656, + "rewards/margins": 0.0626526027917862, + "rewards/rejected": 0.27758219838142395, + "step": 5968 + }, + { + "epoch": 0.9231007152522714, + "grad_norm": 3.8609025478363037, + "learning_rate": 3.84608775346546e-06, + "logits/chosen": 9.259864807128906, + "logits/rejected": 6.362096786499023, + "logps/chosen": -265.30224609375, + "logps/rejected": -256.0496520996094, + "loss": 0.4642, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.2841939926147461, + "rewards/margins": 0.7123348712921143, + "rewards/rejected": -0.42814087867736816, + "step": 5969 + }, + { + "epoch": 0.9232553643920356, + "grad_norm": 5.903758525848389, + "learning_rate": 3.845801351815787e-06, + "logits/chosen": 6.796444416046143, + "logits/rejected": 13.478812217712402, + "logps/chosen": -170.37298583984375, + "logps/rejected": -305.61248779296875, + "loss": 0.805, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.29077470302581787, + "rewards/margins": -0.08672347664833069, + "rewards/rejected": -0.204051211476326, + "step": 5970 + }, + { + "epoch": 0.9234100135317997, + "grad_norm": 4.833823204040527, + "learning_rate": 3.845514950166113e-06, + "logits/chosen": 5.403933048248291, + "logits/rejected": 1.7210121154785156, + "logps/chosen": -154.13180541992188, + "logps/rejected": -133.4947509765625, + "loss": 0.6119, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.23658888041973114, + "rewards/margins": 0.20584726333618164, + "rewards/rejected": -0.4424360990524292, + "step": 5971 + }, + { + "epoch": 0.9235646626715639, + "grad_norm": 5.080250263214111, + "learning_rate": 3.84522854851644e-06, + "logits/chosen": 8.055213928222656, + "logits/rejected": 8.364986419677734, + "logps/chosen": -205.92010498046875, + "logps/rejected": -193.29132080078125, + "loss": 0.7308, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.06329239159822464, + "rewards/margins": -0.020917341113090515, + "rewards/rejected": 0.08420972526073456, + "step": 5972 + }, + { + "epoch": 0.923719311811328, + "grad_norm": 5.192558288574219, + "learning_rate": 3.844942146866767e-06, + "logits/chosen": 13.305277824401855, + "logits/rejected": 12.780332565307617, + "logps/chosen": -311.9449462890625, + "logps/rejected": -338.5521240234375, + "loss": 0.5333, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.3909967541694641, + "rewards/margins": 0.5475971698760986, + "rewards/rejected": -0.15660040080547333, + "step": 5973 + }, + { + "epoch": 0.9238739609510922, + "grad_norm": 3.564321994781494, + "learning_rate": 3.844655745217093e-06, + "logits/chosen": 18.885189056396484, + "logits/rejected": 14.983673095703125, + "logps/chosen": -187.3491973876953, + "logps/rejected": -158.0996551513672, + "loss": 0.6028, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.033954039216041565, + "rewards/margins": 0.22972875833511353, + "rewards/rejected": -0.2636827826499939, + "step": 5974 + }, + { + "epoch": 0.9240286100908564, + "grad_norm": 8.08517074584961, + "learning_rate": 3.844369343567419e-06, + "logits/chosen": 5.152029037475586, + "logits/rejected": 7.747014045715332, + "logps/chosen": -220.47872924804688, + "logps/rejected": -245.01150512695312, + "loss": 0.5951, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.2685032784938812, + "rewards/margins": 0.4616629481315613, + "rewards/rejected": -0.7301662564277649, + "step": 5975 + }, + { + "epoch": 0.9241832592306205, + "grad_norm": 4.957554340362549, + "learning_rate": 3.844082941917746e-06, + "logits/chosen": 9.247166633605957, + "logits/rejected": 10.735408782958984, + "logps/chosen": -235.8817901611328, + "logps/rejected": -248.5756072998047, + "loss": 0.7305, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.06319206953048706, + "rewards/margins": 0.09371539950370789, + "rewards/rejected": -0.15690746903419495, + "step": 5976 + }, + { + "epoch": 0.9243379083703847, + "grad_norm": 8.51071834564209, + "learning_rate": 3.8437965402680725e-06, + "logits/chosen": 9.490609169006348, + "logits/rejected": 12.374410629272461, + "logps/chosen": -270.05328369140625, + "logps/rejected": -283.78564453125, + "loss": 0.7867, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.013409137725830078, + "rewards/margins": -0.048030681908130646, + "rewards/rejected": 0.06143981218338013, + "step": 5977 + }, + { + "epoch": 0.9244925575101488, + "grad_norm": 4.739641189575195, + "learning_rate": 3.843510138618399e-06, + "logits/chosen": 14.059650421142578, + "logits/rejected": -0.6608867645263672, + "logps/chosen": -497.4606018066406, + "logps/rejected": -231.14060974121094, + "loss": 0.5057, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.17075282335281372, + "rewards/margins": 0.8179172873497009, + "rewards/rejected": -0.6471644639968872, + "step": 5978 + }, + { + "epoch": 0.924647206649913, + "grad_norm": 9.917501449584961, + "learning_rate": 3.843223736968725e-06, + "logits/chosen": 14.094247817993164, + "logits/rejected": 13.558454513549805, + "logps/chosen": -285.1767272949219, + "logps/rejected": -284.9686584472656, + "loss": 0.7405, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.1509305238723755, + "rewards/margins": 0.0009683221578598022, + "rewards/rejected": 0.14996221661567688, + "step": 5979 + }, + { + "epoch": 0.9248018557896772, + "grad_norm": 4.1342997550964355, + "learning_rate": 3.8429373353190516e-06, + "logits/chosen": 16.070533752441406, + "logits/rejected": 15.911742210388184, + "logps/chosen": -157.46766662597656, + "logps/rejected": -163.3729248046875, + "loss": 0.5542, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.13045649230480194, + "rewards/margins": 0.38117191195487976, + "rewards/rejected": -0.5116283297538757, + "step": 5980 + }, + { + "epoch": 0.9249565049294414, + "grad_norm": 5.723352909088135, + "learning_rate": 3.842650933669378e-06, + "logits/chosen": 16.820104598999023, + "logits/rejected": 7.088027477264404, + "logps/chosen": -330.1671447753906, + "logps/rejected": -253.65802001953125, + "loss": 0.578, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.3902343809604645, + "rewards/margins": 0.4019431471824646, + "rewards/rejected": -0.011708717793226242, + "step": 5981 + }, + { + "epoch": 0.9251111540692055, + "grad_norm": 5.285297393798828, + "learning_rate": 3.842364532019705e-06, + "logits/chosen": 11.568995475769043, + "logits/rejected": 8.384541511535645, + "logps/chosen": -205.7856903076172, + "logps/rejected": -201.41253662109375, + "loss": 0.6246, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.05940427631139755, + "rewards/margins": 0.35890018939971924, + "rewards/rejected": -0.4183045029640198, + "step": 5982 + }, + { + "epoch": 0.9252658032089697, + "grad_norm": 6.304208755493164, + "learning_rate": 3.842078130370031e-06, + "logits/chosen": 14.559545516967773, + "logits/rejected": 12.454434394836426, + "logps/chosen": -480.0831604003906, + "logps/rejected": -322.0045166015625, + "loss": 0.4961, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.09261521697044373, + "rewards/margins": 0.5517861843109131, + "rewards/rejected": -0.4591709077358246, + "step": 5983 + }, + { + "epoch": 0.9254204523487338, + "grad_norm": 6.11890172958374, + "learning_rate": 3.841791728720357e-06, + "logits/chosen": 5.396515846252441, + "logits/rejected": 9.605737686157227, + "logps/chosen": -216.80996704101562, + "logps/rejected": -212.20599365234375, + "loss": 0.7952, + "rewards/accuracies": 0.125, + "rewards/chosen": -0.09389863163232803, + "rewards/margins": -0.15854188799858093, + "rewards/rejected": 0.0646432489156723, + "step": 5984 + }, + { + "epoch": 0.925575101488498, + "grad_norm": 5.103549003601074, + "learning_rate": 3.841505327070684e-06, + "logits/chosen": 12.546974182128906, + "logits/rejected": 9.806419372558594, + "logps/chosen": -250.4193572998047, + "logps/rejected": -211.29010009765625, + "loss": 0.6849, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.07964477688074112, + "rewards/margins": 0.041126541793346405, + "rewards/rejected": -0.12077131122350693, + "step": 5985 + }, + { + "epoch": 0.9257297506282621, + "grad_norm": 4.3281378746032715, + "learning_rate": 3.841218925421011e-06, + "logits/chosen": 9.79928207397461, + "logits/rejected": 9.200427055358887, + "logps/chosen": -307.8592529296875, + "logps/rejected": -291.7019958496094, + "loss": 0.4705, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.35281121730804443, + "rewards/margins": 0.8667663335800171, + "rewards/rejected": -0.5139550566673279, + "step": 5986 + }, + { + "epoch": 0.9258843997680263, + "grad_norm": 4.6955647468566895, + "learning_rate": 3.840932523771337e-06, + "logits/chosen": 7.133037090301514, + "logits/rejected": 7.298615455627441, + "logps/chosen": -288.6138000488281, + "logps/rejected": -287.91217041015625, + "loss": 0.6542, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.30169397592544556, + "rewards/margins": 0.14161929488182068, + "rewards/rejected": 0.16007468104362488, + "step": 5987 + }, + { + "epoch": 0.9260390489077904, + "grad_norm": 4.4902191162109375, + "learning_rate": 3.840646122121664e-06, + "logits/chosen": 10.88308048248291, + "logits/rejected": 5.6794047355651855, + "logps/chosen": -302.8494873046875, + "logps/rejected": -225.64649963378906, + "loss": 0.6366, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.040824323892593384, + "rewards/margins": 0.3924453854560852, + "rewards/rejected": -0.35162103176116943, + "step": 5988 + }, + { + "epoch": 0.9261936980475546, + "grad_norm": 5.543754577636719, + "learning_rate": 3.84035972047199e-06, + "logits/chosen": 9.354480743408203, + "logits/rejected": 10.920613288879395, + "logps/chosen": -291.64715576171875, + "logps/rejected": -277.1886291503906, + "loss": 0.6078, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.01307515799999237, + "rewards/margins": 0.37407273054122925, + "rewards/rejected": -0.3871479034423828, + "step": 5989 + }, + { + "epoch": 0.9263483471873187, + "grad_norm": 5.527048110961914, + "learning_rate": 3.840073318822316e-06, + "logits/chosen": 11.379961013793945, + "logits/rejected": 12.641226768493652, + "logps/chosen": -258.747802734375, + "logps/rejected": -245.22509765625, + "loss": 0.853, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.2707340717315674, + "rewards/margins": -0.22588051855564117, + "rewards/rejected": -0.04485354945063591, + "step": 5990 + }, + { + "epoch": 0.9265029963270829, + "grad_norm": 6.397703170776367, + "learning_rate": 3.839786917172643e-06, + "logits/chosen": 14.814423561096191, + "logits/rejected": 7.18342399597168, + "logps/chosen": -282.2193603515625, + "logps/rejected": -206.79129028320312, + "loss": 0.6437, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.4099922180175781, + "rewards/margins": 0.16729727387428284, + "rewards/rejected": 0.2426949441432953, + "step": 5991 + }, + { + "epoch": 0.926657645466847, + "grad_norm": 4.245517730712891, + "learning_rate": 3.83950051552297e-06, + "logits/chosen": 10.150803565979004, + "logits/rejected": 9.948104858398438, + "logps/chosen": -192.3197784423828, + "logps/rejected": -233.9940185546875, + "loss": 0.6158, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.003999426960945129, + "rewards/margins": 0.23117296397686005, + "rewards/rejected": -0.2351723611354828, + "step": 5992 + }, + { + "epoch": 0.9268122946066113, + "grad_norm": 6.12477970123291, + "learning_rate": 3.839214113873296e-06, + "logits/chosen": 11.006741523742676, + "logits/rejected": 13.550213813781738, + "logps/chosen": -315.9485168457031, + "logps/rejected": -282.2869567871094, + "loss": 0.7184, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.22746402025222778, + "rewards/margins": -0.004456795752048492, + "rewards/rejected": -0.2230072170495987, + "step": 5993 + }, + { + "epoch": 0.9269669437463754, + "grad_norm": 4.8840413093566895, + "learning_rate": 3.838927712223623e-06, + "logits/chosen": 10.316544532775879, + "logits/rejected": 6.839597702026367, + "logps/chosen": -316.9295654296875, + "logps/rejected": -254.8017578125, + "loss": 0.5098, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.11888179183006287, + "rewards/margins": 0.5479996800422668, + "rewards/rejected": -0.4291178584098816, + "step": 5994 + }, + { + "epoch": 0.9271215928861396, + "grad_norm": 4.272189617156982, + "learning_rate": 3.838641310573949e-06, + "logits/chosen": 5.318933486938477, + "logits/rejected": 4.6727800369262695, + "logps/chosen": -140.0816650390625, + "logps/rejected": -173.6703338623047, + "loss": 0.5935, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.2964593470096588, + "rewards/margins": 0.2374880313873291, + "rewards/rejected": -0.5339474081993103, + "step": 5995 + }, + { + "epoch": 0.9272762420259038, + "grad_norm": 4.730823040008545, + "learning_rate": 3.8383549089242754e-06, + "logits/chosen": 9.433661460876465, + "logits/rejected": 7.357000350952148, + "logps/chosen": -299.11700439453125, + "logps/rejected": -236.88186645507812, + "loss": 0.6115, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.2665528953075409, + "rewards/margins": 0.2866978645324707, + "rewards/rejected": -0.0201449915766716, + "step": 5996 + }, + { + "epoch": 0.9274308911656679, + "grad_norm": 4.293432235717773, + "learning_rate": 3.838068507274602e-06, + "logits/chosen": 9.619799613952637, + "logits/rejected": 9.015604972839355, + "logps/chosen": -232.1720733642578, + "logps/rejected": -190.91671752929688, + "loss": 0.5996, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.10083504766225815, + "rewards/margins": 0.2952927052974701, + "rewards/rejected": -0.39612776041030884, + "step": 5997 + }, + { + "epoch": 0.927585540305432, + "grad_norm": 4.396122455596924, + "learning_rate": 3.837782105624929e-06, + "logits/chosen": 10.244634628295898, + "logits/rejected": 7.831020832061768, + "logps/chosen": -182.12704467773438, + "logps/rejected": -192.9267578125, + "loss": 0.5556, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.05804350972175598, + "rewards/margins": 0.33055567741394043, + "rewards/rejected": -0.27251213788986206, + "step": 5998 + }, + { + "epoch": 0.9277401894451962, + "grad_norm": 5.335066795349121, + "learning_rate": 3.837495703975255e-06, + "logits/chosen": 14.32138729095459, + "logits/rejected": 11.036077499389648, + "logps/chosen": -366.6612854003906, + "logps/rejected": -260.768310546875, + "loss": 0.6549, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.0031122565269470215, + "rewards/margins": 0.13330931961536407, + "rewards/rejected": -0.13019704818725586, + "step": 5999 + }, + { + "epoch": 0.9278948385849604, + "grad_norm": 4.370915412902832, + "learning_rate": 3.837209302325582e-06, + "logits/chosen": 12.220867156982422, + "logits/rejected": 6.463516712188721, + "logps/chosen": -325.37176513671875, + "logps/rejected": -208.3942108154297, + "loss": 0.5755, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.07932300865650177, + "rewards/margins": 0.38254308700561523, + "rewards/rejected": -0.30322009325027466, + "step": 6000 + }, + { + "epoch": 0.9280494877247245, + "grad_norm": 3.832820415496826, + "learning_rate": 3.836922900675909e-06, + "logits/chosen": 10.285869598388672, + "logits/rejected": 6.5575151443481445, + "logps/chosen": -257.0646667480469, + "logps/rejected": -216.92608642578125, + "loss": 0.4541, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.07314136624336243, + "rewards/margins": 0.8702197074890137, + "rewards/rejected": -0.9433611631393433, + "step": 6001 + }, + { + "epoch": 0.9282041368644887, + "grad_norm": 7.224663734436035, + "learning_rate": 3.8366364990262345e-06, + "logits/chosen": 7.140823841094971, + "logits/rejected": 6.941685676574707, + "logps/chosen": -377.1507263183594, + "logps/rejected": -332.791748046875, + "loss": 0.6105, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.23440855741500854, + "rewards/margins": 0.3616694509983063, + "rewards/rejected": -0.5960780382156372, + "step": 6002 + }, + { + "epoch": 0.9283587860042528, + "grad_norm": 8.36377239227295, + "learning_rate": 3.836350097376561e-06, + "logits/chosen": 6.744920253753662, + "logits/rejected": 5.525350093841553, + "logps/chosen": -200.383056640625, + "logps/rejected": -213.25692749023438, + "loss": 0.787, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.14215537905693054, + "rewards/margins": 0.03371515870094299, + "rewards/rejected": -0.17587052285671234, + "step": 6003 + }, + { + "epoch": 0.928513435144017, + "grad_norm": 5.147937297821045, + "learning_rate": 3.836063695726888e-06, + "logits/chosen": 6.726301193237305, + "logits/rejected": 6.76882791519165, + "logps/chosen": -270.8572998046875, + "logps/rejected": -261.0835876464844, + "loss": 0.7157, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.08769246190786362, + "rewards/margins": 0.031356245279312134, + "rewards/rejected": 0.05633620172739029, + "step": 6004 + }, + { + "epoch": 0.9286680842837812, + "grad_norm": 4.630476951599121, + "learning_rate": 3.8357772940772144e-06, + "logits/chosen": 5.05290412902832, + "logits/rejected": 3.440218687057495, + "logps/chosen": -194.867919921875, + "logps/rejected": -182.01553344726562, + "loss": 0.7222, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.12103088945150375, + "rewards/margins": -0.00909436121582985, + "rewards/rejected": 0.1301252692937851, + "step": 6005 + }, + { + "epoch": 0.9288227334235454, + "grad_norm": 4.764076232910156, + "learning_rate": 3.835490892427541e-06, + "logits/chosen": 11.23635482788086, + "logits/rejected": 6.176011562347412, + "logps/chosen": -233.3962860107422, + "logps/rejected": -148.70950317382812, + "loss": 0.6845, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.06881700456142426, + "rewards/margins": 0.32831257581710815, + "rewards/rejected": -0.2594955563545227, + "step": 6006 + }, + { + "epoch": 0.9289773825633095, + "grad_norm": 6.077795028686523, + "learning_rate": 3.835204490777868e-06, + "logits/chosen": 8.089153289794922, + "logits/rejected": 4.023036479949951, + "logps/chosen": -477.6796569824219, + "logps/rejected": -347.79107666015625, + "loss": 0.5066, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.3078598976135254, + "rewards/margins": 0.5196900367736816, + "rewards/rejected": -0.21183015406131744, + "step": 6007 + }, + { + "epoch": 0.9291320317030737, + "grad_norm": 4.766157150268555, + "learning_rate": 3.8349180891281936e-06, + "logits/chosen": 5.765253067016602, + "logits/rejected": 5.693391799926758, + "logps/chosen": -184.94967651367188, + "logps/rejected": -208.50437927246094, + "loss": 0.778, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.13897529244422913, + "rewards/margins": -0.08087249100208282, + "rewards/rejected": 0.21984776854515076, + "step": 6008 + }, + { + "epoch": 0.9292866808428378, + "grad_norm": 6.012340068817139, + "learning_rate": 3.83463168747852e-06, + "logits/chosen": 9.131631851196289, + "logits/rejected": 8.53108024597168, + "logps/chosen": -264.8885498046875, + "logps/rejected": -322.9024353027344, + "loss": 0.6786, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.20992150902748108, + "rewards/margins": 0.4160435199737549, + "rewards/rejected": -0.6259649991989136, + "step": 6009 + }, + { + "epoch": 0.929441329982602, + "grad_norm": 17.134231567382812, + "learning_rate": 3.834345285828847e-06, + "logits/chosen": 12.58877182006836, + "logits/rejected": 7.450962543487549, + "logps/chosen": -321.4927673339844, + "logps/rejected": -252.15084838867188, + "loss": 0.6573, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.044093236327171326, + "rewards/margins": 0.4173487722873688, + "rewards/rejected": -0.4614419937133789, + "step": 6010 + }, + { + "epoch": 0.9295959791223661, + "grad_norm": 6.025030136108398, + "learning_rate": 3.8340588841791735e-06, + "logits/chosen": 13.786745071411133, + "logits/rejected": 15.04443645477295, + "logps/chosen": -261.65142822265625, + "logps/rejected": -323.10760498046875, + "loss": 0.6626, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.2919447720050812, + "rewards/margins": 0.08822473138570786, + "rewards/rejected": 0.2037200629711151, + "step": 6011 + }, + { + "epoch": 0.9297506282621303, + "grad_norm": 5.432513236999512, + "learning_rate": 3.8337724825295e-06, + "logits/chosen": 6.158177852630615, + "logits/rejected": 7.188523292541504, + "logps/chosen": -307.4617919921875, + "logps/rejected": -262.46612548828125, + "loss": 0.6134, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.0014916956424713135, + "rewards/margins": 0.4702872335910797, + "rewards/rejected": -0.4717789888381958, + "step": 6012 + }, + { + "epoch": 0.9299052774018944, + "grad_norm": 14.94957160949707, + "learning_rate": 3.833486080879826e-06, + "logits/chosen": 5.800658702850342, + "logits/rejected": 4.967113494873047, + "logps/chosen": -284.8963928222656, + "logps/rejected": -262.14190673828125, + "loss": 0.5591, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.32444095611572266, + "rewards/margins": 0.47640955448150635, + "rewards/rejected": -0.15196862816810608, + "step": 6013 + }, + { + "epoch": 0.9300599265416586, + "grad_norm": 5.186890602111816, + "learning_rate": 3.833199679230153e-06, + "logits/chosen": 11.036134719848633, + "logits/rejected": 10.278372764587402, + "logps/chosen": -253.8583984375, + "logps/rejected": -208.4478759765625, + "loss": 0.6822, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.3863145709037781, + "rewards/margins": 0.2542487382888794, + "rewards/rejected": 0.1320658177137375, + "step": 6014 + }, + { + "epoch": 0.9302145756814227, + "grad_norm": 4.433958053588867, + "learning_rate": 3.832913277580479e-06, + "logits/chosen": 13.264066696166992, + "logits/rejected": 8.640970230102539, + "logps/chosen": -235.05982971191406, + "logps/rejected": -168.89462280273438, + "loss": 0.6085, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.23281946778297424, + "rewards/margins": 0.4022805690765381, + "rewards/rejected": -0.6351000666618347, + "step": 6015 + }, + { + "epoch": 0.9303692248211869, + "grad_norm": 8.066131591796875, + "learning_rate": 3.832626875930806e-06, + "logits/chosen": 8.911860466003418, + "logits/rejected": 3.7579660415649414, + "logps/chosen": -238.1811065673828, + "logps/rejected": -202.03439331054688, + "loss": 0.776, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.10820291936397552, + "rewards/margins": 0.14072877168655396, + "rewards/rejected": -0.24893172085285187, + "step": 6016 + }, + { + "epoch": 0.930523873960951, + "grad_norm": 4.749118328094482, + "learning_rate": 3.832340474281132e-06, + "logits/chosen": 5.73783016204834, + "logits/rejected": 2.6617510318756104, + "logps/chosen": -285.53875732421875, + "logps/rejected": -221.54373168945312, + "loss": 0.6347, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.47561997175216675, + "rewards/margins": 0.3651784360408783, + "rewards/rejected": 0.11044152081012726, + "step": 6017 + }, + { + "epoch": 0.9306785231007153, + "grad_norm": 4.501606464385986, + "learning_rate": 3.832054072631458e-06, + "logits/chosen": 12.063766479492188, + "logits/rejected": 3.245598554611206, + "logps/chosen": -200.2242431640625, + "logps/rejected": -154.25167846679688, + "loss": 0.6233, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.3054177165031433, + "rewards/margins": 0.3008345663547516, + "rewards/rejected": 0.004583191126585007, + "step": 6018 + }, + { + "epoch": 0.9308331722404795, + "grad_norm": 4.452945232391357, + "learning_rate": 3.831767670981785e-06, + "logits/chosen": 10.99527359008789, + "logits/rejected": 7.252123832702637, + "logps/chosen": -374.6976013183594, + "logps/rejected": -229.2428741455078, + "loss": 0.4409, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.4849901795387268, + "rewards/margins": 0.7554462552070618, + "rewards/rejected": -0.2704560160636902, + "step": 6019 + }, + { + "epoch": 0.9309878213802436, + "grad_norm": 5.7374396324157715, + "learning_rate": 3.831481269332112e-06, + "logits/chosen": 8.0463228225708, + "logits/rejected": 9.23757553100586, + "logps/chosen": -277.8264465332031, + "logps/rejected": -231.28671264648438, + "loss": 0.9093, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.10356885194778442, + "rewards/margins": -0.1309281885623932, + "rewards/rejected": 0.23449702560901642, + "step": 6020 + }, + { + "epoch": 0.9311424705200078, + "grad_norm": 4.379790782928467, + "learning_rate": 3.831194867682438e-06, + "logits/chosen": 12.132953643798828, + "logits/rejected": 10.656715393066406, + "logps/chosen": -326.63165283203125, + "logps/rejected": -277.4964599609375, + "loss": 0.4916, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.2004929482936859, + "rewards/margins": 0.5663213133811951, + "rewards/rejected": -0.36582839488983154, + "step": 6021 + }, + { + "epoch": 0.9312971196597719, + "grad_norm": 3.0467631816864014, + "learning_rate": 3.830908466032764e-06, + "logits/chosen": 11.022439956665039, + "logits/rejected": 10.373337745666504, + "logps/chosen": -272.3958740234375, + "logps/rejected": -215.5037384033203, + "loss": 0.4691, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.5104539394378662, + "rewards/margins": 0.6950963735580444, + "rewards/rejected": -0.18464237451553345, + "step": 6022 + }, + { + "epoch": 0.9314517687995361, + "grad_norm": 6.030575275421143, + "learning_rate": 3.830622064383091e-06, + "logits/chosen": 14.003541946411133, + "logits/rejected": 8.356953620910645, + "logps/chosen": -453.41717529296875, + "logps/rejected": -328.6996154785156, + "loss": 0.5302, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.3017389178276062, + "rewards/margins": 0.447161465883255, + "rewards/rejected": -0.1454225480556488, + "step": 6023 + }, + { + "epoch": 0.9316064179393002, + "grad_norm": 5.45608377456665, + "learning_rate": 3.830335662733417e-06, + "logits/chosen": 12.766246795654297, + "logits/rejected": 10.134567260742188, + "logps/chosen": -348.9058837890625, + "logps/rejected": -261.0465393066406, + "loss": 0.626, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.33152657747268677, + "rewards/margins": 0.39144694805145264, + "rewards/rejected": -0.059920430183410645, + "step": 6024 + }, + { + "epoch": 0.9317610670790644, + "grad_norm": 5.8412909507751465, + "learning_rate": 3.830049261083744e-06, + "logits/chosen": 9.497193336486816, + "logits/rejected": 14.345690727233887, + "logps/chosen": -365.80523681640625, + "logps/rejected": -380.3077392578125, + "loss": 0.7554, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.1632295846939087, + "rewards/margins": 0.27112072706222534, + "rewards/rejected": -0.10789114236831665, + "step": 6025 + }, + { + "epoch": 0.9319157162188285, + "grad_norm": 4.8470258712768555, + "learning_rate": 3.829762859434071e-06, + "logits/chosen": 9.572096824645996, + "logits/rejected": 11.61083984375, + "logps/chosen": -195.73455810546875, + "logps/rejected": -183.56591796875, + "loss": 0.6392, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.24269239604473114, + "rewards/margins": 0.12330552190542221, + "rewards/rejected": -0.36599791049957275, + "step": 6026 + }, + { + "epoch": 0.9320703653585927, + "grad_norm": 4.497620582580566, + "learning_rate": 3.829476457784397e-06, + "logits/chosen": 11.514942169189453, + "logits/rejected": 11.367023468017578, + "logps/chosen": -249.30511474609375, + "logps/rejected": -213.90652465820312, + "loss": 0.5604, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.057696253061294556, + "rewards/margins": 0.4380154609680176, + "rewards/rejected": -0.49571171402931213, + "step": 6027 + }, + { + "epoch": 0.9322250144983568, + "grad_norm": 4.109735012054443, + "learning_rate": 3.829190056134723e-06, + "logits/chosen": 16.47292709350586, + "logits/rejected": 9.034542083740234, + "logps/chosen": -342.5322570800781, + "logps/rejected": -204.951171875, + "loss": 0.4944, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.35756605863571167, + "rewards/margins": 0.5707290172576904, + "rewards/rejected": -0.21316289901733398, + "step": 6028 + }, + { + "epoch": 0.932379663638121, + "grad_norm": 5.297967433929443, + "learning_rate": 3.82890365448505e-06, + "logits/chosen": 9.922212600708008, + "logits/rejected": 8.710733413696289, + "logps/chosen": -213.2489013671875, + "logps/rejected": -236.77659606933594, + "loss": 0.5968, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.11869316548109055, + "rewards/margins": 0.36253565549850464, + "rewards/rejected": -0.4812288284301758, + "step": 6029 + }, + { + "epoch": 0.9325343127778851, + "grad_norm": 4.6720290184021, + "learning_rate": 3.8286172528353765e-06, + "logits/chosen": 11.735729217529297, + "logits/rejected": 9.072715759277344, + "logps/chosen": -384.2177429199219, + "logps/rejected": -239.77377319335938, + "loss": 0.5031, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.4991775155067444, + "rewards/margins": 0.5800157785415649, + "rewards/rejected": -0.08083821088075638, + "step": 6030 + }, + { + "epoch": 0.9326889619176494, + "grad_norm": 10.191884994506836, + "learning_rate": 3.828330851185703e-06, + "logits/chosen": 9.411463737487793, + "logits/rejected": 3.467341899871826, + "logps/chosen": -330.3621520996094, + "logps/rejected": -248.29336547851562, + "loss": 0.8695, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.010299697518348694, + "rewards/margins": -0.19282718002796173, + "rewards/rejected": 0.20312686264514923, + "step": 6031 + }, + { + "epoch": 0.9328436110574135, + "grad_norm": 4.5380072593688965, + "learning_rate": 3.82804444953603e-06, + "logits/chosen": 9.85280704498291, + "logits/rejected": 6.435073375701904, + "logps/chosen": -326.41607666015625, + "logps/rejected": -208.88665771484375, + "loss": 0.6914, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.28879299759864807, + "rewards/margins": 0.2380891740322113, + "rewards/rejected": 0.050703808665275574, + "step": 6032 + }, + { + "epoch": 0.9329982601971777, + "grad_norm": 5.611965179443359, + "learning_rate": 3.8277580478863564e-06, + "logits/chosen": 11.191861152648926, + "logits/rejected": 2.6972930431365967, + "logps/chosen": -359.6614990234375, + "logps/rejected": -290.51641845703125, + "loss": 0.5934, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.3615747392177582, + "rewards/margins": 0.376809298992157, + "rewards/rejected": -0.015234556049108505, + "step": 6033 + }, + { + "epoch": 0.9331529093369418, + "grad_norm": 4.315613269805908, + "learning_rate": 3.827471646236683e-06, + "logits/chosen": 10.832747459411621, + "logits/rejected": 12.062925338745117, + "logps/chosen": -167.652587890625, + "logps/rejected": -157.68569946289062, + "loss": 0.7155, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.0033126845955848694, + "rewards/margins": -0.0034381896257400513, + "rewards/rejected": 0.006750866770744324, + "step": 6034 + }, + { + "epoch": 0.933307558476706, + "grad_norm": 6.593498706817627, + "learning_rate": 3.827185244587009e-06, + "logits/chosen": 9.622892379760742, + "logits/rejected": 9.4996976852417, + "logps/chosen": -308.12884521484375, + "logps/rejected": -322.00311279296875, + "loss": 0.6496, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.3257409930229187, + "rewards/margins": 0.250812828540802, + "rewards/rejected": 0.07492820918560028, + "step": 6035 + }, + { + "epoch": 0.9334622076164701, + "grad_norm": 5.7150654792785645, + "learning_rate": 3.8268988429373355e-06, + "logits/chosen": 8.950979232788086, + "logits/rejected": 10.60782527923584, + "logps/chosen": -384.05035400390625, + "logps/rejected": -281.7973327636719, + "loss": 0.6524, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.3711523115634918, + "rewards/margins": 0.24722784757614136, + "rewards/rejected": 0.12392445653676987, + "step": 6036 + }, + { + "epoch": 0.9336168567562343, + "grad_norm": 6.262704849243164, + "learning_rate": 3.826612441287662e-06, + "logits/chosen": 12.922733306884766, + "logits/rejected": 12.037979125976562, + "logps/chosen": -238.59129333496094, + "logps/rejected": -261.8547058105469, + "loss": 0.8978, + "rewards/accuracies": 0.25, + "rewards/chosen": -0.15271368622779846, + "rewards/margins": -0.30616745352745056, + "rewards/rejected": 0.1534537672996521, + "step": 6037 + }, + { + "epoch": 0.9337715058959984, + "grad_norm": 4.9652180671691895, + "learning_rate": 3.826326039637989e-06, + "logits/chosen": 13.282856941223145, + "logits/rejected": 12.026641845703125, + "logps/chosen": -240.10231018066406, + "logps/rejected": -215.92483520507812, + "loss": 0.7116, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.1337796151638031, + "rewards/margins": -0.016256965696811676, + "rewards/rejected": 0.15003658831119537, + "step": 6038 + }, + { + "epoch": 0.9339261550357626, + "grad_norm": 4.477086067199707, + "learning_rate": 3.8260396379883155e-06, + "logits/chosen": 11.503787994384766, + "logits/rejected": 9.353680610656738, + "logps/chosen": -265.6096496582031, + "logps/rejected": -266.1465148925781, + "loss": 0.5894, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.43273651599884033, + "rewards/margins": 0.5105540752410889, + "rewards/rejected": -0.07781754434108734, + "step": 6039 + }, + { + "epoch": 0.9340808041755267, + "grad_norm": 4.011292457580566, + "learning_rate": 3.825753236338642e-06, + "logits/chosen": 9.134407043457031, + "logits/rejected": 11.923297882080078, + "logps/chosen": -215.97825622558594, + "logps/rejected": -263.9239807128906, + "loss": 0.5358, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.12924326956272125, + "rewards/margins": 0.5040192604064941, + "rewards/rejected": -0.3747760057449341, + "step": 6040 + }, + { + "epoch": 0.9342354533152909, + "grad_norm": 4.179714202880859, + "learning_rate": 3.825466834688968e-06, + "logits/chosen": 11.143575668334961, + "logits/rejected": 5.20336389541626, + "logps/chosen": -250.85903930664062, + "logps/rejected": -153.57920837402344, + "loss": 0.5451, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.06176067516207695, + "rewards/margins": 0.5256879925727844, + "rewards/rejected": -0.463927298784256, + "step": 6041 + }, + { + "epoch": 0.934390102455055, + "grad_norm": 7.819858074188232, + "learning_rate": 3.825180433039295e-06, + "logits/chosen": 8.802535057067871, + "logits/rejected": 7.068612575531006, + "logps/chosen": -299.29998779296875, + "logps/rejected": -242.37384033203125, + "loss": 0.6275, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.23822559416294098, + "rewards/margins": 0.22409909963607788, + "rewards/rejected": 0.014126542955636978, + "step": 6042 + }, + { + "epoch": 0.9345447515948192, + "grad_norm": 4.5175700187683105, + "learning_rate": 3.824894031389621e-06, + "logits/chosen": 8.470870018005371, + "logits/rejected": 2.412821054458618, + "logps/chosen": -333.9119873046875, + "logps/rejected": -278.8467712402344, + "loss": 0.4997, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.6869453191757202, + "rewards/margins": 0.5555365085601807, + "rewards/rejected": 0.13140885531902313, + "step": 6043 + }, + { + "epoch": 0.9346994007345835, + "grad_norm": 3.8975155353546143, + "learning_rate": 3.824607629739948e-06, + "logits/chosen": 10.772109031677246, + "logits/rejected": 4.530871391296387, + "logps/chosen": -269.3810729980469, + "logps/rejected": -169.63943481445312, + "loss": 0.4885, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.2432047724723816, + "rewards/margins": 0.5404318571090698, + "rewards/rejected": -0.29722708463668823, + "step": 6044 + }, + { + "epoch": 0.9348540498743476, + "grad_norm": 4.505472183227539, + "learning_rate": 3.8243212280902745e-06, + "logits/chosen": 10.762072563171387, + "logits/rejected": 6.81305456161499, + "logps/chosen": -285.3082580566406, + "logps/rejected": -218.459716796875, + "loss": 0.549, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.16315041482448578, + "rewards/margins": 0.3377542495727539, + "rewards/rejected": -0.17460383474826813, + "step": 6045 + }, + { + "epoch": 0.9350086990141118, + "grad_norm": 6.288514614105225, + "learning_rate": 3.8240348264406e-06, + "logits/chosen": 14.718301773071289, + "logits/rejected": 10.2052583694458, + "logps/chosen": -398.0087585449219, + "logps/rejected": -323.6663513183594, + "loss": 0.652, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.16363297402858734, + "rewards/margins": 0.11178320646286011, + "rewards/rejected": 0.05184975266456604, + "step": 6046 + }, + { + "epoch": 0.9351633481538759, + "grad_norm": 3.1541645526885986, + "learning_rate": 3.823748424790927e-06, + "logits/chosen": 8.760577201843262, + "logits/rejected": 0.8657584190368652, + "logps/chosen": -368.43115234375, + "logps/rejected": -197.75477600097656, + "loss": 0.4479, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.5673789978027344, + "rewards/margins": 0.6440527439117432, + "rewards/rejected": -0.07667368650436401, + "step": 6047 + }, + { + "epoch": 0.9353179972936401, + "grad_norm": 3.8411777019500732, + "learning_rate": 3.823462023141254e-06, + "logits/chosen": 12.408902168273926, + "logits/rejected": 13.599173545837402, + "logps/chosen": -140.28466796875, + "logps/rejected": -187.16244506835938, + "loss": 0.6748, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.22700080275535583, + "rewards/margins": 0.06837469339370728, + "rewards/rejected": 0.15862610936164856, + "step": 6048 + }, + { + "epoch": 0.9354726464334042, + "grad_norm": 8.164158821105957, + "learning_rate": 3.82317562149158e-06, + "logits/chosen": 11.915801048278809, + "logits/rejected": 8.787885665893555, + "logps/chosen": -402.47857666015625, + "logps/rejected": -297.17352294921875, + "loss": 0.8091, + "rewards/accuracies": 0.25, + "rewards/chosen": 0.07263347506523132, + "rewards/margins": -0.1701229214668274, + "rewards/rejected": 0.24275636672973633, + "step": 6049 + }, + { + "epoch": 0.9356272955731684, + "grad_norm": 5.679912567138672, + "learning_rate": 3.822889219841907e-06, + "logits/chosen": 6.447786331176758, + "logits/rejected": 9.144700050354004, + "logps/chosen": -264.0130615234375, + "logps/rejected": -363.205810546875, + "loss": 0.5765, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.38308852910995483, + "rewards/margins": 0.42537885904312134, + "rewards/rejected": -0.042290303856134415, + "step": 6050 + }, + { + "epoch": 0.9357819447129325, + "grad_norm": 5.169233798980713, + "learning_rate": 3.822602818192233e-06, + "logits/chosen": 13.977466583251953, + "logits/rejected": 9.791933059692383, + "logps/chosen": -239.18670654296875, + "logps/rejected": -220.9244384765625, + "loss": 0.5799, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.18723654747009277, + "rewards/margins": 0.26936832070350647, + "rewards/rejected": -0.0821317583322525, + "step": 6051 + }, + { + "epoch": 0.9359365938526967, + "grad_norm": 4.727330207824707, + "learning_rate": 3.822316416542559e-06, + "logits/chosen": 13.885272979736328, + "logits/rejected": 16.182043075561523, + "logps/chosen": -169.0514678955078, + "logps/rejected": -222.49009704589844, + "loss": 0.7323, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.1628856211900711, + "rewards/margins": 0.0253386739641428, + "rewards/rejected": -0.18822431564331055, + "step": 6052 + }, + { + "epoch": 0.9360912429924608, + "grad_norm": 4.583122730255127, + "learning_rate": 3.822030014892886e-06, + "logits/chosen": 11.220190048217773, + "logits/rejected": 7.90618896484375, + "logps/chosen": -248.93777465820312, + "logps/rejected": -276.22998046875, + "loss": 0.5869, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.07014027237892151, + "rewards/margins": 0.3012443780899048, + "rewards/rejected": -0.23110409080982208, + "step": 6053 + }, + { + "epoch": 0.936245892132225, + "grad_norm": 4.106428146362305, + "learning_rate": 3.821743613243213e-06, + "logits/chosen": 11.332945823669434, + "logits/rejected": 4.969393253326416, + "logps/chosen": -176.0836181640625, + "logps/rejected": -134.5875244140625, + "loss": 0.6316, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.10211662948131561, + "rewards/margins": 0.2919144034385681, + "rewards/rejected": -0.39403101801872253, + "step": 6054 + }, + { + "epoch": 0.9364005412719891, + "grad_norm": 5.967907905578613, + "learning_rate": 3.8214572115935385e-06, + "logits/chosen": 12.463626861572266, + "logits/rejected": 11.256778717041016, + "logps/chosen": -327.74005126953125, + "logps/rejected": -361.98406982421875, + "loss": 0.6383, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.38031715154647827, + "rewards/margins": 0.2521805167198181, + "rewards/rejected": 0.12813663482666016, + "step": 6055 + }, + { + "epoch": 0.9365551904117533, + "grad_norm": 6.8066277503967285, + "learning_rate": 3.821170809943865e-06, + "logits/chosen": 12.9143705368042, + "logits/rejected": 12.729548454284668, + "logps/chosen": -337.443603515625, + "logps/rejected": -350.5340270996094, + "loss": 0.7835, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.135735422372818, + "rewards/margins": 0.06516237556934357, + "rewards/rejected": 0.07057303935289383, + "step": 6056 + }, + { + "epoch": 0.9367098395515175, + "grad_norm": 3.171182155609131, + "learning_rate": 3.820884408294192e-06, + "logits/chosen": 10.704608917236328, + "logits/rejected": 7.606919288635254, + "logps/chosen": -251.07205200195312, + "logps/rejected": -167.4860382080078, + "loss": 0.521, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.28730398416519165, + "rewards/margins": 0.5352724194526672, + "rewards/rejected": -0.24796846508979797, + "step": 6057 + }, + { + "epoch": 0.9368644886912817, + "grad_norm": 3.9153659343719482, + "learning_rate": 3.8205980066445185e-06, + "logits/chosen": 13.292579650878906, + "logits/rejected": 4.033205032348633, + "logps/chosen": -332.9898986816406, + "logps/rejected": -177.48500061035156, + "loss": 0.5533, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.03870735317468643, + "rewards/margins": 0.5156199336051941, + "rewards/rejected": -0.47691261768341064, + "step": 6058 + }, + { + "epoch": 0.9370191378310458, + "grad_norm": 8.662125587463379, + "learning_rate": 3.820311604994845e-06, + "logits/chosen": 9.569156646728516, + "logits/rejected": 7.637282371520996, + "logps/chosen": -230.04583740234375, + "logps/rejected": -171.99783325195312, + "loss": 0.8238, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.3394660949707031, + "rewards/margins": -0.02765965461730957, + "rewards/rejected": -0.31180644035339355, + "step": 6059 + }, + { + "epoch": 0.93717378697081, + "grad_norm": 7.341385364532471, + "learning_rate": 3.820025203345172e-06, + "logits/chosen": 13.23616886138916, + "logits/rejected": 9.569148063659668, + "logps/chosen": -266.386962890625, + "logps/rejected": -241.69935607910156, + "loss": 0.9053, + "rewards/accuracies": 0.25, + "rewards/chosen": 0.1514882594347, + "rewards/margins": -0.14549335837364197, + "rewards/rejected": 0.2969816327095032, + "step": 6060 + }, + { + "epoch": 0.9373284361105741, + "grad_norm": 5.778408527374268, + "learning_rate": 3.8197388016954976e-06, + "logits/chosen": 14.685229301452637, + "logits/rejected": 12.554548263549805, + "logps/chosen": -272.43682861328125, + "logps/rejected": -260.2255859375, + "loss": 0.7574, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.26109209656715393, + "rewards/margins": -0.09590382128953934, + "rewards/rejected": 0.35699591040611267, + "step": 6061 + }, + { + "epoch": 0.9374830852503383, + "grad_norm": 5.86389684677124, + "learning_rate": 3.819452400045824e-06, + "logits/chosen": 8.837570190429688, + "logits/rejected": 9.727023124694824, + "logps/chosen": -297.515869140625, + "logps/rejected": -223.49392700195312, + "loss": 0.6368, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.43487799167633057, + "rewards/margins": 0.1827138513326645, + "rewards/rejected": 0.25216415524482727, + "step": 6062 + }, + { + "epoch": 0.9376377343901025, + "grad_norm": 3.4504144191741943, + "learning_rate": 3.819165998396151e-06, + "logits/chosen": 14.197935104370117, + "logits/rejected": 7.041372299194336, + "logps/chosen": -151.49386596679688, + "logps/rejected": -108.96153259277344, + "loss": 0.5901, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.013797953724861145, + "rewards/margins": 0.34054338932037354, + "rewards/rejected": -0.3543413579463959, + "step": 6063 + }, + { + "epoch": 0.9377923835298666, + "grad_norm": 6.328835964202881, + "learning_rate": 3.8188795967464775e-06, + "logits/chosen": 9.975637435913086, + "logits/rejected": 9.145719528198242, + "logps/chosen": -313.7372741699219, + "logps/rejected": -304.69183349609375, + "loss": 0.9328, + "rewards/accuracies": 0.25, + "rewards/chosen": -0.023795247077941895, + "rewards/margins": -0.28483957052230835, + "rewards/rejected": 0.26104432344436646, + "step": 6064 + }, + { + "epoch": 0.9379470326696308, + "grad_norm": 4.681914806365967, + "learning_rate": 3.818593195096804e-06, + "logits/chosen": 9.851226806640625, + "logits/rejected": 11.388118743896484, + "logps/chosen": -260.6351623535156, + "logps/rejected": -293.7640380859375, + "loss": 0.5494, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.32276809215545654, + "rewards/margins": 0.41228392720222473, + "rewards/rejected": -0.08951583504676819, + "step": 6065 + }, + { + "epoch": 0.9381016818093949, + "grad_norm": 3.847282886505127, + "learning_rate": 3.818306793447131e-06, + "logits/chosen": 10.958084106445312, + "logits/rejected": 11.48143482208252, + "logps/chosen": -191.4639892578125, + "logps/rejected": -200.89566040039062, + "loss": 0.6415, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.13334208726882935, + "rewards/margins": 0.14758911728858948, + "rewards/rejected": -0.014247044920921326, + "step": 6066 + }, + { + "epoch": 0.9382563309491591, + "grad_norm": 4.838980197906494, + "learning_rate": 3.8180203917974575e-06, + "logits/chosen": 17.168556213378906, + "logits/rejected": 5.975695610046387, + "logps/chosen": -384.2066650390625, + "logps/rejected": -271.62554931640625, + "loss": 0.5127, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.6038050055503845, + "rewards/margins": 0.6593232750892639, + "rewards/rejected": -0.055518247187137604, + "step": 6067 + }, + { + "epoch": 0.9384109800889232, + "grad_norm": 4.785552501678467, + "learning_rate": 3.817733990147783e-06, + "logits/chosen": 11.991065979003906, + "logits/rejected": 7.702507972717285, + "logps/chosen": -329.9935302734375, + "logps/rejected": -191.23634338378906, + "loss": 0.4522, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.6737090349197388, + "rewards/margins": 0.5773491263389587, + "rewards/rejected": 0.09635983407497406, + "step": 6068 + }, + { + "epoch": 0.9385656292286875, + "grad_norm": 7.1877899169921875, + "learning_rate": 3.81744758849811e-06, + "logits/chosen": 8.11004638671875, + "logits/rejected": 9.369216918945312, + "logps/chosen": -236.0681610107422, + "logps/rejected": -290.5652160644531, + "loss": 0.7017, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.4204835295677185, + "rewards/margins": 0.12054291367530823, + "rewards/rejected": -0.5410264730453491, + "step": 6069 + }, + { + "epoch": 0.9387202783684516, + "grad_norm": 5.705470085144043, + "learning_rate": 3.8171611868484366e-06, + "logits/chosen": 16.26763343811035, + "logits/rejected": 13.144468307495117, + "logps/chosen": -245.2537078857422, + "logps/rejected": -232.14541625976562, + "loss": 0.5803, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.4970560371875763, + "rewards/margins": 0.4018312990665436, + "rewards/rejected": 0.0952247753739357, + "step": 6070 + }, + { + "epoch": 0.9388749275082158, + "grad_norm": 4.4256415367126465, + "learning_rate": 3.816874785198763e-06, + "logits/chosen": 8.794835090637207, + "logits/rejected": 7.541773319244385, + "logps/chosen": -191.7821044921875, + "logps/rejected": -188.01748657226562, + "loss": 0.6287, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.05795135349035263, + "rewards/margins": 0.21488073468208313, + "rewards/rejected": -0.27283209562301636, + "step": 6071 + }, + { + "epoch": 0.9390295766479799, + "grad_norm": 4.758356094360352, + "learning_rate": 3.81658838354909e-06, + "logits/chosen": 11.676168441772461, + "logits/rejected": 7.672636985778809, + "logps/chosen": -316.0868835449219, + "logps/rejected": -287.555908203125, + "loss": 0.5652, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.4077572822570801, + "rewards/margins": 0.376626580953598, + "rewards/rejected": 0.031130697578191757, + "step": 6072 + }, + { + "epoch": 0.9391842257877441, + "grad_norm": 5.94786262512207, + "learning_rate": 3.8163019818994165e-06, + "logits/chosen": 13.635997772216797, + "logits/rejected": 7.333440780639648, + "logps/chosen": -344.91094970703125, + "logps/rejected": -302.4306335449219, + "loss": 0.7249, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.34345197677612305, + "rewards/margins": 0.05055246502161026, + "rewards/rejected": 0.2928995192050934, + "step": 6073 + }, + { + "epoch": 0.9393388749275082, + "grad_norm": 6.516737461090088, + "learning_rate": 3.816015580249742e-06, + "logits/chosen": 13.521687507629395, + "logits/rejected": 13.969501495361328, + "logps/chosen": -278.9033203125, + "logps/rejected": -287.1125183105469, + "loss": 0.7128, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.44379115104675293, + "rewards/margins": 0.16020014882087708, + "rewards/rejected": 0.28359100222587585, + "step": 6074 + }, + { + "epoch": 0.9394935240672724, + "grad_norm": 5.933638095855713, + "learning_rate": 3.815729178600069e-06, + "logits/chosen": 12.309395790100098, + "logits/rejected": 11.905566215515137, + "logps/chosen": -193.1526641845703, + "logps/rejected": -209.6673583984375, + "loss": 0.7389, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.11351699382066727, + "rewards/margins": 0.03330119699239731, + "rewards/rejected": 0.08021579682826996, + "step": 6075 + }, + { + "epoch": 0.9396481732070365, + "grad_norm": 6.3787994384765625, + "learning_rate": 3.815442776950396e-06, + "logits/chosen": 10.8314208984375, + "logits/rejected": 11.874008178710938, + "logps/chosen": -286.8882751464844, + "logps/rejected": -338.9561462402344, + "loss": 0.8067, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.27103862166404724, + "rewards/margins": -0.13938254117965698, + "rewards/rejected": 0.4104211926460266, + "step": 6076 + }, + { + "epoch": 0.9398028223468007, + "grad_norm": 3.216580390930176, + "learning_rate": 3.815156375300722e-06, + "logits/chosen": 10.877608299255371, + "logits/rejected": 5.905982494354248, + "logps/chosen": -182.88221740722656, + "logps/rejected": -146.63233947753906, + "loss": 0.516, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.14734011888504028, + "rewards/margins": 0.4875781238079071, + "rewards/rejected": -0.34023797512054443, + "step": 6077 + }, + { + "epoch": 0.9399574714865648, + "grad_norm": 5.839911460876465, + "learning_rate": 3.814869973651049e-06, + "logits/chosen": 1.2418125867843628, + "logits/rejected": 7.115954875946045, + "logps/chosen": -325.61602783203125, + "logps/rejected": -316.3528747558594, + "loss": 0.623, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.10578656941652298, + "rewards/margins": 0.3688156008720398, + "rewards/rejected": -0.2630290687084198, + "step": 6078 + }, + { + "epoch": 0.940112120626329, + "grad_norm": 4.351027965545654, + "learning_rate": 3.814583572001375e-06, + "logits/chosen": 5.610586643218994, + "logits/rejected": 4.684720516204834, + "logps/chosen": -177.2313690185547, + "logps/rejected": -200.40921020507812, + "loss": 0.5593, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.174510195851326, + "rewards/margins": 0.36411920189857483, + "rewards/rejected": -0.18960900604724884, + "step": 6079 + }, + { + "epoch": 0.9402667697660931, + "grad_norm": 11.34358024597168, + "learning_rate": 3.814297170351702e-06, + "logits/chosen": 6.915814399719238, + "logits/rejected": 7.635409355163574, + "logps/chosen": -285.6097106933594, + "logps/rejected": -237.17234802246094, + "loss": 0.5961, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.12571869790554047, + "rewards/margins": 0.337399423122406, + "rewards/rejected": -0.21168071031570435, + "step": 6080 + }, + { + "epoch": 0.9404214189058573, + "grad_norm": 4.962376117706299, + "learning_rate": 3.8140107687020276e-06, + "logits/chosen": 10.566926956176758, + "logits/rejected": 6.878274440765381, + "logps/chosen": -437.80218505859375, + "logps/rejected": -356.24957275390625, + "loss": 0.445, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.8575985431671143, + "rewards/margins": 0.6839499473571777, + "rewards/rejected": 0.17364856600761414, + "step": 6081 + }, + { + "epoch": 0.9405760680456215, + "grad_norm": 4.927545070648193, + "learning_rate": 3.8137243670523543e-06, + "logits/chosen": 4.8792195320129395, + "logits/rejected": 2.1346592903137207, + "logps/chosen": -267.8224792480469, + "logps/rejected": -223.33038330078125, + "loss": 0.613, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.7079000473022461, + "rewards/margins": 0.3419094383716583, + "rewards/rejected": 0.36599063873291016, + "step": 6082 + }, + { + "epoch": 0.9407307171853857, + "grad_norm": 4.992504596710205, + "learning_rate": 3.813437965402681e-06, + "logits/chosen": 11.298521041870117, + "logits/rejected": 4.287998199462891, + "logps/chosen": -350.1596984863281, + "logps/rejected": -250.62960815429688, + "loss": 0.514, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.6133615970611572, + "rewards/margins": 0.43809372186660767, + "rewards/rejected": 0.17526786029338837, + "step": 6083 + }, + { + "epoch": 0.9408853663251499, + "grad_norm": 6.099196434020996, + "learning_rate": 3.8131515637530076e-06, + "logits/chosen": 9.103509902954102, + "logits/rejected": 11.175631523132324, + "logps/chosen": -253.2087860107422, + "logps/rejected": -263.1287536621094, + "loss": 0.8713, + "rewards/accuracies": 0.25, + "rewards/chosen": 0.239552304148674, + "rewards/margins": -0.28746098279953003, + "rewards/rejected": 0.5270133018493652, + "step": 6084 + }, + { + "epoch": 0.941040015464914, + "grad_norm": 5.9227399826049805, + "learning_rate": 3.812865162103334e-06, + "logits/chosen": 6.880556583404541, + "logits/rejected": 7.891871929168701, + "logps/chosen": -301.21710205078125, + "logps/rejected": -268.6335754394531, + "loss": 0.8614, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.22740042209625244, + "rewards/margins": -0.18442678451538086, + "rewards/rejected": 0.4118272066116333, + "step": 6085 + }, + { + "epoch": 0.9411946646046782, + "grad_norm": 5.366952419281006, + "learning_rate": 3.812578760453661e-06, + "logits/chosen": 13.178825378417969, + "logits/rejected": 10.543661117553711, + "logps/chosen": -234.63943481445312, + "logps/rejected": -252.57705688476562, + "loss": 0.6222, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.3527628183364868, + "rewards/margins": 0.2799367904663086, + "rewards/rejected": 0.07282600551843643, + "step": 6086 + }, + { + "epoch": 0.9413493137444423, + "grad_norm": 5.538469314575195, + "learning_rate": 3.812292358803987e-06, + "logits/chosen": 1.831620454788208, + "logits/rejected": 4.772876262664795, + "logps/chosen": -230.24127197265625, + "logps/rejected": -295.0673828125, + "loss": 0.6083, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.005966514348983765, + "rewards/margins": 0.242758110165596, + "rewards/rejected": -0.23679161071777344, + "step": 6087 + }, + { + "epoch": 0.9415039628842065, + "grad_norm": 3.7193217277526855, + "learning_rate": 3.8120059571543133e-06, + "logits/chosen": 7.188810348510742, + "logits/rejected": 8.100234985351562, + "logps/chosen": -219.13656616210938, + "logps/rejected": -254.40017700195312, + "loss": 0.4915, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.2356186956167221, + "rewards/margins": 0.5997982621192932, + "rewards/rejected": -0.3641796112060547, + "step": 6088 + }, + { + "epoch": 0.9416586120239706, + "grad_norm": 5.5721025466918945, + "learning_rate": 3.81171955550464e-06, + "logits/chosen": 7.171139717102051, + "logits/rejected": 6.843477725982666, + "logps/chosen": -338.8473815917969, + "logps/rejected": -287.5997619628906, + "loss": 0.6182, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.6582000851631165, + "rewards/margins": 0.37820136547088623, + "rewards/rejected": 0.27999868988990784, + "step": 6089 + }, + { + "epoch": 0.9418132611637348, + "grad_norm": 3.9159910678863525, + "learning_rate": 3.8114331538549666e-06, + "logits/chosen": 11.181341171264648, + "logits/rejected": 4.574030876159668, + "logps/chosen": -191.98849487304688, + "logps/rejected": -146.96788024902344, + "loss": 0.6727, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.30301570892333984, + "rewards/margins": 0.1540481299161911, + "rewards/rejected": 0.14896757900714874, + "step": 6090 + }, + { + "epoch": 0.9419679103034989, + "grad_norm": 7.600465774536133, + "learning_rate": 3.8111467522052933e-06, + "logits/chosen": 14.485508918762207, + "logits/rejected": 12.474799156188965, + "logps/chosen": -342.322998046875, + "logps/rejected": -291.435791015625, + "loss": 0.901, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.10375480353832245, + "rewards/margins": -0.2649831771850586, + "rewards/rejected": 0.16122838854789734, + "step": 6091 + }, + { + "epoch": 0.9421225594432631, + "grad_norm": 8.614843368530273, + "learning_rate": 3.8108603505556195e-06, + "logits/chosen": 7.101761817932129, + "logits/rejected": 6.240938663482666, + "logps/chosen": -339.32220458984375, + "logps/rejected": -212.43060302734375, + "loss": 0.8266, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.04470086097717285, + "rewards/margins": -0.16797193884849548, + "rewards/rejected": 0.12327107787132263, + "step": 6092 + }, + { + "epoch": 0.9422772085830272, + "grad_norm": 5.7017316818237305, + "learning_rate": 3.810573948905946e-06, + "logits/chosen": 4.3470892906188965, + "logits/rejected": 7.0994720458984375, + "logps/chosen": -192.86744689941406, + "logps/rejected": -246.5413055419922, + "loss": 0.8215, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.24562174081802368, + "rewards/margins": -0.1625586450099945, + "rewards/rejected": 0.4081803560256958, + "step": 6093 + }, + { + "epoch": 0.9424318577227914, + "grad_norm": 6.6136369705200195, + "learning_rate": 3.8102875472562724e-06, + "logits/chosen": 6.410834312438965, + "logits/rejected": 10.590200424194336, + "logps/chosen": -231.900390625, + "logps/rejected": -253.52920532226562, + "loss": 0.8541, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.06740179657936096, + "rewards/margins": -0.18243303894996643, + "rewards/rejected": 0.2498348355293274, + "step": 6094 + }, + { + "epoch": 0.9425865068625556, + "grad_norm": 4.242220878601074, + "learning_rate": 3.810001145606599e-06, + "logits/chosen": 9.677803039550781, + "logits/rejected": 8.787566184997559, + "logps/chosen": -178.03857421875, + "logps/rejected": -151.91616821289062, + "loss": 0.7275, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.06072451174259186, + "rewards/margins": 0.04228389263153076, + "rewards/rejected": 0.0184406116604805, + "step": 6095 + }, + { + "epoch": 0.9427411560023198, + "grad_norm": 4.677660942077637, + "learning_rate": 3.8097147439569252e-06, + "logits/chosen": 12.098503112792969, + "logits/rejected": 9.779885292053223, + "logps/chosen": -262.5589294433594, + "logps/rejected": -266.28314208984375, + "loss": 0.5837, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.44803160429000854, + "rewards/margins": 0.3180495500564575, + "rewards/rejected": 0.1299821138381958, + "step": 6096 + }, + { + "epoch": 0.9428958051420839, + "grad_norm": 5.403985500335693, + "learning_rate": 3.809428342307252e-06, + "logits/chosen": 7.742856979370117, + "logits/rejected": 10.280571937561035, + "logps/chosen": -233.95733642578125, + "logps/rejected": -210.7534637451172, + "loss": 0.6166, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.4494001567363739, + "rewards/margins": 0.24035492539405823, + "rewards/rejected": 0.20904523134231567, + "step": 6097 + }, + { + "epoch": 0.9430504542818481, + "grad_norm": 6.647812843322754, + "learning_rate": 3.8091419406575785e-06, + "logits/chosen": 9.193872451782227, + "logits/rejected": 7.398804187774658, + "logps/chosen": -319.1348876953125, + "logps/rejected": -316.2547607421875, + "loss": 0.5713, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.35655656456947327, + "rewards/margins": 0.5459267497062683, + "rewards/rejected": -0.18937018513679504, + "step": 6098 + }, + { + "epoch": 0.9432051034216122, + "grad_norm": 4.627180099487305, + "learning_rate": 3.808855539007905e-06, + "logits/chosen": 10.844181060791016, + "logits/rejected": 9.718484878540039, + "logps/chosen": -343.73284912109375, + "logps/rejected": -208.0472412109375, + "loss": 0.5383, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.6060517430305481, + "rewards/margins": 0.5087236166000366, + "rewards/rejected": 0.0973280817270279, + "step": 6099 + }, + { + "epoch": 0.9433597525613764, + "grad_norm": 4.272436141967773, + "learning_rate": 3.808569137358232e-06, + "logits/chosen": 15.333287239074707, + "logits/rejected": 8.599126815795898, + "logps/chosen": -409.5078125, + "logps/rejected": -248.9840850830078, + "loss": 0.5638, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.5476647615432739, + "rewards/margins": 0.40758851170539856, + "rewards/rejected": 0.14007622003555298, + "step": 6100 + }, + { + "epoch": 0.9435144017011405, + "grad_norm": 5.972644329071045, + "learning_rate": 3.8082827357085576e-06, + "logits/chosen": 12.754161834716797, + "logits/rejected": 15.414602279663086, + "logps/chosen": -317.50360107421875, + "logps/rejected": -427.9605407714844, + "loss": 0.682, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.13516905903816223, + "rewards/margins": 0.07610650360584259, + "rewards/rejected": 0.059062570333480835, + "step": 6101 + }, + { + "epoch": 0.9436690508409047, + "grad_norm": 5.319756031036377, + "learning_rate": 3.8079963340588843e-06, + "logits/chosen": 15.805214881896973, + "logits/rejected": 11.866049766540527, + "logps/chosen": -309.2769775390625, + "logps/rejected": -225.6322021484375, + "loss": 0.5775, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.057625673711299896, + "rewards/margins": 0.42817145586013794, + "rewards/rejected": -0.37054574489593506, + "step": 6102 + }, + { + "epoch": 0.9438236999806688, + "grad_norm": 6.35297966003418, + "learning_rate": 3.807709932409211e-06, + "logits/chosen": 10.88467788696289, + "logits/rejected": 10.55798053741455, + "logps/chosen": -336.02789306640625, + "logps/rejected": -345.5227355957031, + "loss": 0.5511, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.5578911900520325, + "rewards/margins": 0.3338162302970886, + "rewards/rejected": 0.22407494485378265, + "step": 6103 + }, + { + "epoch": 0.943978349120433, + "grad_norm": 5.715827465057373, + "learning_rate": 3.8074235307595376e-06, + "logits/chosen": 11.487932205200195, + "logits/rejected": 8.79894733428955, + "logps/chosen": -313.4056396484375, + "logps/rejected": -209.98672485351562, + "loss": 0.6531, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.3565572500228882, + "rewards/margins": 0.2715810239315033, + "rewards/rejected": 0.08497625589370728, + "step": 6104 + }, + { + "epoch": 0.9441329982601971, + "grad_norm": 5.52899169921875, + "learning_rate": 3.8071371291098642e-06, + "logits/chosen": 5.1163787841796875, + "logits/rejected": 11.167240142822266, + "logps/chosen": -175.9130859375, + "logps/rejected": -261.94427490234375, + "loss": 0.8449, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.2729036211967468, + "rewards/margins": -0.11569945514202118, + "rewards/rejected": 0.3886030912399292, + "step": 6105 + }, + { + "epoch": 0.9442876473999613, + "grad_norm": 4.608291149139404, + "learning_rate": 3.806850727460191e-06, + "logits/chosen": 12.11892318725586, + "logits/rejected": 9.762636184692383, + "logps/chosen": -295.04425048828125, + "logps/rejected": -226.96890258789062, + "loss": 0.5679, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.2966315746307373, + "rewards/margins": 0.4821425974369049, + "rewards/rejected": -0.1855110228061676, + "step": 6106 + }, + { + "epoch": 0.9444422965397254, + "grad_norm": 6.040639400482178, + "learning_rate": 3.8065643258105167e-06, + "logits/chosen": 13.903783798217773, + "logits/rejected": 10.05005168914795, + "logps/chosen": -280.8214111328125, + "logps/rejected": -167.11900329589844, + "loss": 0.6999, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.19398802518844604, + "rewards/margins": 0.08902202546596527, + "rewards/rejected": 0.10496602952480316, + "step": 6107 + }, + { + "epoch": 0.9445969456794897, + "grad_norm": 4.522506237030029, + "learning_rate": 3.8062779241608434e-06, + "logits/chosen": 6.287240505218506, + "logits/rejected": 2.296971559524536, + "logps/chosen": -287.56463623046875, + "logps/rejected": -189.1895294189453, + "loss": 0.5238, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.4894564151763916, + "rewards/margins": 0.45365530252456665, + "rewards/rejected": 0.035801127552986145, + "step": 6108 + }, + { + "epoch": 0.9447515948192539, + "grad_norm": 8.307732582092285, + "learning_rate": 3.80599152251117e-06, + "logits/chosen": 9.483098983764648, + "logits/rejected": 9.91816520690918, + "logps/chosen": -280.9747009277344, + "logps/rejected": -264.1375732421875, + "loss": 0.6628, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.4822605848312378, + "rewards/margins": 0.41404256224632263, + "rewards/rejected": 0.06821804493665695, + "step": 6109 + }, + { + "epoch": 0.944906243959018, + "grad_norm": 676.383056640625, + "learning_rate": 3.8057051208614967e-06, + "logits/chosen": 9.731789588928223, + "logits/rejected": 9.675384521484375, + "logps/chosen": -244.6284942626953, + "logps/rejected": -293.3633117675781, + "loss": 1.1458, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.3688059449195862, + "rewards/margins": 0.02286475896835327, + "rewards/rejected": 0.3459411859512329, + "step": 6110 + }, + { + "epoch": 0.9450608930987822, + "grad_norm": 5.440345764160156, + "learning_rate": 3.805418719211823e-06, + "logits/chosen": 6.476787567138672, + "logits/rejected": 6.469910144805908, + "logps/chosen": -195.49354553222656, + "logps/rejected": -263.1617431640625, + "loss": 0.6224, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.3218001127243042, + "rewards/margins": 0.48463016748428345, + "rewards/rejected": -0.16283002495765686, + "step": 6111 + }, + { + "epoch": 0.9452155422385463, + "grad_norm": 5.384084224700928, + "learning_rate": 3.8051323175621495e-06, + "logits/chosen": 10.614958763122559, + "logits/rejected": 17.449613571166992, + "logps/chosen": -210.1707763671875, + "logps/rejected": -241.51287841796875, + "loss": 0.7528, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.14073190093040466, + "rewards/margins": -0.02024676650762558, + "rewards/rejected": -0.12048512697219849, + "step": 6112 + }, + { + "epoch": 0.9453701913783105, + "grad_norm": 3.032682418823242, + "learning_rate": 3.804845915912476e-06, + "logits/chosen": 16.2205810546875, + "logits/rejected": 10.663674354553223, + "logps/chosen": -183.4769287109375, + "logps/rejected": -158.5052032470703, + "loss": 0.4465, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.44304996728897095, + "rewards/margins": 0.6766156554222107, + "rewards/rejected": -0.23356571793556213, + "step": 6113 + }, + { + "epoch": 0.9455248405180746, + "grad_norm": 3.896304130554199, + "learning_rate": 3.8045595142628024e-06, + "logits/chosen": 9.357583999633789, + "logits/rejected": 5.256113052368164, + "logps/chosen": -227.8314971923828, + "logps/rejected": -113.90750122070312, + "loss": 0.5927, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.39485710859298706, + "rewards/margins": 0.32727789878845215, + "rewards/rejected": 0.0675792321562767, + "step": 6114 + }, + { + "epoch": 0.9456794896578388, + "grad_norm": 5.4467878341674805, + "learning_rate": 3.8042731126131286e-06, + "logits/chosen": 13.243876457214355, + "logits/rejected": 14.673941612243652, + "logps/chosen": -239.44859313964844, + "logps/rejected": -331.18841552734375, + "loss": 0.8322, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.19260179996490479, + "rewards/margins": -0.10337914526462555, + "rewards/rejected": 0.29598093032836914, + "step": 6115 + }, + { + "epoch": 0.9458341387976029, + "grad_norm": 5.094240665435791, + "learning_rate": 3.8039867109634553e-06, + "logits/chosen": 7.527679443359375, + "logits/rejected": 9.080537796020508, + "logps/chosen": -299.16094970703125, + "logps/rejected": -308.7109375, + "loss": 0.6061, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.5531708002090454, + "rewards/margins": 0.29709121584892273, + "rewards/rejected": 0.2560795843601227, + "step": 6116 + }, + { + "epoch": 0.9459887879373671, + "grad_norm": 6.777413368225098, + "learning_rate": 3.803700309313782e-06, + "logits/chosen": 11.91340446472168, + "logits/rejected": 9.278749465942383, + "logps/chosen": -274.78533935546875, + "logps/rejected": -275.62799072265625, + "loss": 0.6441, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.5145151019096375, + "rewards/margins": 0.42700961232185364, + "rewards/rejected": 0.0875055193901062, + "step": 6117 + }, + { + "epoch": 0.9461434370771312, + "grad_norm": 4.660823345184326, + "learning_rate": 3.8034139076641086e-06, + "logits/chosen": 11.751721382141113, + "logits/rejected": 9.275279998779297, + "logps/chosen": -344.2777404785156, + "logps/rejected": -276.0328063964844, + "loss": 0.5992, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.5876486897468567, + "rewards/margins": 0.23932303488254547, + "rewards/rejected": 0.34832563996315, + "step": 6118 + }, + { + "epoch": 0.9462980862168954, + "grad_norm": 4.778783321380615, + "learning_rate": 3.8031275060144352e-06, + "logits/chosen": 4.699098587036133, + "logits/rejected": 3.591595411300659, + "logps/chosen": -241.07110595703125, + "logps/rejected": -215.99533081054688, + "loss": 0.5314, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.42230841517448425, + "rewards/margins": 0.5223271250724792, + "rewards/rejected": -0.1000187024474144, + "step": 6119 + }, + { + "epoch": 0.9464527353566595, + "grad_norm": 14.398313522338867, + "learning_rate": 3.802841104364762e-06, + "logits/chosen": 9.755953788757324, + "logits/rejected": 3.4724786281585693, + "logps/chosen": -384.8707275390625, + "logps/rejected": -315.34637451171875, + "loss": 0.489, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.6170616149902344, + "rewards/margins": 0.7081850171089172, + "rewards/rejected": -0.09112338721752167, + "step": 6120 + }, + { + "epoch": 0.9466073844964238, + "grad_norm": 6.513869762420654, + "learning_rate": 3.8025547027150877e-06, + "logits/chosen": 11.69550895690918, + "logits/rejected": 9.572566032409668, + "logps/chosen": -411.56231689453125, + "logps/rejected": -394.0745544433594, + "loss": 0.6479, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.7219116687774658, + "rewards/margins": 0.21830222010612488, + "rewards/rejected": 0.5036094188690186, + "step": 6121 + }, + { + "epoch": 0.9467620336361879, + "grad_norm": 9.223554611206055, + "learning_rate": 3.8022683010654143e-06, + "logits/chosen": 8.417282104492188, + "logits/rejected": 1.5517919063568115, + "logps/chosen": -401.50653076171875, + "logps/rejected": -220.24920654296875, + "loss": 0.7144, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.5609298348426819, + "rewards/margins": -0.005590818822383881, + "rewards/rejected": 0.5665207505226135, + "step": 6122 + }, + { + "epoch": 0.9469166827759521, + "grad_norm": 4.91739559173584, + "learning_rate": 3.801981899415741e-06, + "logits/chosen": 12.335090637207031, + "logits/rejected": 8.075563430786133, + "logps/chosen": -376.4961853027344, + "logps/rejected": -298.4585876464844, + "loss": 0.4626, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.4171767830848694, + "rewards/margins": 0.680549681186676, + "rewards/rejected": -0.26337286829948425, + "step": 6123 + }, + { + "epoch": 0.9470713319157162, + "grad_norm": 5.677758693695068, + "learning_rate": 3.8016954977660676e-06, + "logits/chosen": 7.166735649108887, + "logits/rejected": 7.543004989624023, + "logps/chosen": -225.05581665039062, + "logps/rejected": -182.28384399414062, + "loss": 0.6979, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.07324963063001633, + "rewards/margins": 0.08322267234325409, + "rewards/rejected": -0.009973041713237762, + "step": 6124 + }, + { + "epoch": 0.9472259810554804, + "grad_norm": 4.433850288391113, + "learning_rate": 3.8014090961163943e-06, + "logits/chosen": 11.126750946044922, + "logits/rejected": 5.822546005249023, + "logps/chosen": -173.7269744873047, + "logps/rejected": -157.27572631835938, + "loss": 0.5867, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.15981268882751465, + "rewards/margins": 0.26645272970199585, + "rewards/rejected": -0.1066400557756424, + "step": 6125 + }, + { + "epoch": 0.9473806301952445, + "grad_norm": 10.051560401916504, + "learning_rate": 3.8011226944667205e-06, + "logits/chosen": 11.969038963317871, + "logits/rejected": 8.733071327209473, + "logps/chosen": -423.1612548828125, + "logps/rejected": -361.1314697265625, + "loss": 0.8901, + "rewards/accuracies": 0.25, + "rewards/chosen": 0.006579972803592682, + "rewards/margins": -0.19934165477752686, + "rewards/rejected": 0.20592162013053894, + "step": 6126 + }, + { + "epoch": 0.9475352793350087, + "grad_norm": 3.7855629920959473, + "learning_rate": 3.8008362928170467e-06, + "logits/chosen": 14.899744987487793, + "logits/rejected": 5.31304931640625, + "logps/chosen": -254.46495056152344, + "logps/rejected": -168.96116638183594, + "loss": 0.4594, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.58144211769104, + "rewards/margins": 0.6862486600875854, + "rewards/rejected": -0.10480650514364243, + "step": 6127 + }, + { + "epoch": 0.9476899284747728, + "grad_norm": 3.807384490966797, + "learning_rate": 3.8005498911673734e-06, + "logits/chosen": 12.657123565673828, + "logits/rejected": 8.67054557800293, + "logps/chosen": -226.19183349609375, + "logps/rejected": -198.3279266357422, + "loss": 0.5347, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.42361658811569214, + "rewards/margins": 0.40676647424697876, + "rewards/rejected": 0.016850091516971588, + "step": 6128 + }, + { + "epoch": 0.947844577614537, + "grad_norm": 5.2223005294799805, + "learning_rate": 3.8002634895177e-06, + "logits/chosen": 10.917251586914062, + "logits/rejected": 5.476696968078613, + "logps/chosen": -456.75872802734375, + "logps/rejected": -325.453369140625, + "loss": 0.4865, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.7777464389801025, + "rewards/margins": 0.8205711841583252, + "rewards/rejected": -0.042824745178222656, + "step": 6129 + }, + { + "epoch": 0.9479992267543011, + "grad_norm": 4.111788272857666, + "learning_rate": 3.7999770878680263e-06, + "logits/chosen": 9.898801803588867, + "logits/rejected": 9.476448059082031, + "logps/chosen": -158.30126953125, + "logps/rejected": -193.73118591308594, + "loss": 0.6639, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.146633580327034, + "rewards/margins": 0.10084643959999084, + "rewards/rejected": 0.04578714072704315, + "step": 6130 + }, + { + "epoch": 0.9481538758940653, + "grad_norm": 4.172186374664307, + "learning_rate": 3.799690686218353e-06, + "logits/chosen": 6.938317775726318, + "logits/rejected": 3.4027481079101562, + "logps/chosen": -217.08563232421875, + "logps/rejected": -181.32907104492188, + "loss": 0.4901, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.3057669401168823, + "rewards/margins": 0.6542397737503052, + "rewards/rejected": -0.34847280383110046, + "step": 6131 + }, + { + "epoch": 0.9483085250338295, + "grad_norm": 4.117425918579102, + "learning_rate": 3.7994042845686796e-06, + "logits/chosen": 10.764884948730469, + "logits/rejected": 6.007167816162109, + "logps/chosen": -184.376220703125, + "logps/rejected": -184.09844970703125, + "loss": 0.635, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.32048383355140686, + "rewards/margins": 0.19168972969055176, + "rewards/rejected": 0.1287941038608551, + "step": 6132 + }, + { + "epoch": 0.9484631741735936, + "grad_norm": 4.116074562072754, + "learning_rate": 3.7991178829190062e-06, + "logits/chosen": 12.23703384399414, + "logits/rejected": 8.825287818908691, + "logps/chosen": -320.6624450683594, + "logps/rejected": -288.0609436035156, + "loss": 0.446, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.4983493685722351, + "rewards/margins": 0.6485175490379333, + "rewards/rejected": -0.15016812086105347, + "step": 6133 + }, + { + "epoch": 0.9486178233133579, + "grad_norm": 4.125493049621582, + "learning_rate": 3.798831481269332e-06, + "logits/chosen": 18.443208694458008, + "logits/rejected": 12.027984619140625, + "logps/chosen": -235.64654541015625, + "logps/rejected": -183.31814575195312, + "loss": 0.6387, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.3844262957572937, + "rewards/margins": 0.22453317046165466, + "rewards/rejected": 0.15989314019680023, + "step": 6134 + }, + { + "epoch": 0.948772472453122, + "grad_norm": 7.958446502685547, + "learning_rate": 3.7985450796196587e-06, + "logits/chosen": 8.696544647216797, + "logits/rejected": 9.067924499511719, + "logps/chosen": -302.22381591796875, + "logps/rejected": -315.93084716796875, + "loss": 0.8832, + "rewards/accuracies": 0.25, + "rewards/chosen": 0.1506291925907135, + "rewards/margins": -0.26111355423927307, + "rewards/rejected": 0.4117427468299866, + "step": 6135 + }, + { + "epoch": 0.9489271215928862, + "grad_norm": 10.748108863830566, + "learning_rate": 3.7982586779699853e-06, + "logits/chosen": 10.577842712402344, + "logits/rejected": 6.776095390319824, + "logps/chosen": -488.1216735839844, + "logps/rejected": -299.84271240234375, + "loss": 0.7171, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.23031625151634216, + "rewards/margins": 0.13431869447231293, + "rewards/rejected": 0.09599751234054565, + "step": 6136 + }, + { + "epoch": 0.9490817707326503, + "grad_norm": 4.6449503898620605, + "learning_rate": 3.797972276320312e-06, + "logits/chosen": 9.781865119934082, + "logits/rejected": 13.294943809509277, + "logps/chosen": -247.5777130126953, + "logps/rejected": -308.8779602050781, + "loss": 0.605, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.3307187557220459, + "rewards/margins": 0.2628345191478729, + "rewards/rejected": 0.06788422167301178, + "step": 6137 + }, + { + "epoch": 0.9492364198724145, + "grad_norm": 6.828583240509033, + "learning_rate": 3.7976858746706386e-06, + "logits/chosen": 16.520654678344727, + "logits/rejected": 7.7576212882995605, + "logps/chosen": -311.0717468261719, + "logps/rejected": -252.43626403808594, + "loss": 0.714, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.4419318735599518, + "rewards/margins": 0.2562493681907654, + "rewards/rejected": 0.18568256497383118, + "step": 6138 + }, + { + "epoch": 0.9493910690121786, + "grad_norm": 7.355157852172852, + "learning_rate": 3.7973994730209653e-06, + "logits/chosen": 7.037096977233887, + "logits/rejected": 5.259735107421875, + "logps/chosen": -355.94232177734375, + "logps/rejected": -266.8430480957031, + "loss": 0.5736, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.23144692182540894, + "rewards/margins": 0.4259180426597595, + "rewards/rejected": -0.19447113573551178, + "step": 6139 + }, + { + "epoch": 0.9495457181519428, + "grad_norm": 3.692997455596924, + "learning_rate": 3.797113071371291e-06, + "logits/chosen": 12.380500793457031, + "logits/rejected": 6.629700660705566, + "logps/chosen": -140.75379943847656, + "logps/rejected": -127.47859191894531, + "loss": 0.6221, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.06310685724020004, + "rewards/margins": 0.17889685928821564, + "rewards/rejected": -0.24200373888015747, + "step": 6140 + }, + { + "epoch": 0.9497003672917069, + "grad_norm": 4.56638240814209, + "learning_rate": 3.7968266697216177e-06, + "logits/chosen": 8.284974098205566, + "logits/rejected": 8.240362167358398, + "logps/chosen": -230.05026245117188, + "logps/rejected": -264.61456298828125, + "loss": 0.6489, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.2506457567214966, + "rewards/margins": 0.37028414011001587, + "rewards/rejected": -0.11963840574026108, + "step": 6141 + }, + { + "epoch": 0.9498550164314711, + "grad_norm": 4.7954607009887695, + "learning_rate": 3.7965402680719444e-06, + "logits/chosen": 12.753697395324707, + "logits/rejected": 9.30424690246582, + "logps/chosen": -320.78839111328125, + "logps/rejected": -298.1253662109375, + "loss": 0.4109, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.5831942558288574, + "rewards/margins": 0.7153184413909912, + "rewards/rejected": -0.1321241855621338, + "step": 6142 + }, + { + "epoch": 0.9500096655712352, + "grad_norm": 5.0751447677612305, + "learning_rate": 3.796253866422271e-06, + "logits/chosen": 12.804666519165039, + "logits/rejected": 11.85648250579834, + "logps/chosen": -305.0457763671875, + "logps/rejected": -298.6139831542969, + "loss": 0.6151, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.5067696571350098, + "rewards/margins": 0.4775269329547882, + "rewards/rejected": 0.02924273908138275, + "step": 6143 + }, + { + "epoch": 0.9501643147109994, + "grad_norm": 5.078519344329834, + "learning_rate": 3.7959674647725977e-06, + "logits/chosen": 10.292719841003418, + "logits/rejected": 3.6246705055236816, + "logps/chosen": -258.091064453125, + "logps/rejected": -162.05177307128906, + "loss": 0.6891, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.010790683329105377, + "rewards/margins": 0.0842796117067337, + "rewards/rejected": -0.07348892092704773, + "step": 6144 + }, + { + "epoch": 0.9503189638507635, + "grad_norm": 4.760468482971191, + "learning_rate": 3.795681063122924e-06, + "logits/chosen": 8.143221855163574, + "logits/rejected": 7.942566394805908, + "logps/chosen": -381.5123596191406, + "logps/rejected": -391.5032043457031, + "loss": 0.4413, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.4306606948375702, + "rewards/margins": 0.6841031312942505, + "rewards/rejected": -0.25344234704971313, + "step": 6145 + }, + { + "epoch": 0.9504736129905278, + "grad_norm": 4.661147117614746, + "learning_rate": 3.7953946614732506e-06, + "logits/chosen": 13.569191932678223, + "logits/rejected": 10.540250778198242, + "logps/chosen": -283.15802001953125, + "logps/rejected": -212.72402954101562, + "loss": 0.6584, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.4299682676792145, + "rewards/margins": 0.12703828513622284, + "rewards/rejected": 0.30292996764183044, + "step": 6146 + }, + { + "epoch": 0.950628262130292, + "grad_norm": 6.309093952178955, + "learning_rate": 3.795108259823577e-06, + "logits/chosen": 5.467181205749512, + "logits/rejected": 7.628287315368652, + "logps/chosen": -189.5636444091797, + "logps/rejected": -207.5373992919922, + "loss": 0.7883, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.017991885542869568, + "rewards/margins": -0.1218787431716919, + "rewards/rejected": 0.10388685762882233, + "step": 6147 + }, + { + "epoch": 0.9507829112700561, + "grad_norm": 8.071401596069336, + "learning_rate": 3.7948218581739034e-06, + "logits/chosen": 16.6971435546875, + "logits/rejected": 5.629417419433594, + "logps/chosen": -382.31268310546875, + "logps/rejected": -348.284423828125, + "loss": 0.6747, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.20832709968090057, + "rewards/margins": 0.4912237823009491, + "rewards/rejected": -0.2828966975212097, + "step": 6148 + }, + { + "epoch": 0.9509375604098202, + "grad_norm": 3.390028715133667, + "learning_rate": 3.7945354565242297e-06, + "logits/chosen": 8.839740753173828, + "logits/rejected": 9.218704223632812, + "logps/chosen": -233.2584991455078, + "logps/rejected": -196.21310424804688, + "loss": 0.4673, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.18338054418563843, + "rewards/margins": 0.6705355644226074, + "rewards/rejected": -0.4871550500392914, + "step": 6149 + }, + { + "epoch": 0.9510922095495844, + "grad_norm": 6.178202152252197, + "learning_rate": 3.7942490548745563e-06, + "logits/chosen": 13.983511924743652, + "logits/rejected": 0.584989070892334, + "logps/chosen": -532.3519287109375, + "logps/rejected": -201.24252319335938, + "loss": 0.4103, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.8076561093330383, + "rewards/margins": 0.8006258606910706, + "rewards/rejected": 0.0070302411913871765, + "step": 6150 + }, + { + "epoch": 0.9512468586893486, + "grad_norm": 8.772147178649902, + "learning_rate": 3.793962653224883e-06, + "logits/chosen": 9.899749755859375, + "logits/rejected": 10.163934707641602, + "logps/chosen": -356.1944580078125, + "logps/rejected": -356.65728759765625, + "loss": 0.6872, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.6404454708099365, + "rewards/margins": 0.06865201890468597, + "rewards/rejected": 0.5717933773994446, + "step": 6151 + }, + { + "epoch": 0.9514015078291127, + "grad_norm": 5.855581283569336, + "learning_rate": 3.7936762515752096e-06, + "logits/chosen": 9.701614379882812, + "logits/rejected": 3.668771266937256, + "logps/chosen": -521.6398315429688, + "logps/rejected": -287.03717041015625, + "loss": 0.5234, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.7385131120681763, + "rewards/margins": 0.5150148272514343, + "rewards/rejected": 0.22349828481674194, + "step": 6152 + }, + { + "epoch": 0.9515561569688769, + "grad_norm": 5.712414264678955, + "learning_rate": 3.7933898499255354e-06, + "logits/chosen": 11.184454917907715, + "logits/rejected": 10.683685302734375, + "logps/chosen": -225.55450439453125, + "logps/rejected": -198.64552307128906, + "loss": 0.5853, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.03424198180437088, + "rewards/margins": 0.3246713876724243, + "rewards/rejected": -0.29042938351631165, + "step": 6153 + }, + { + "epoch": 0.951710806108641, + "grad_norm": 4.311692714691162, + "learning_rate": 3.793103448275862e-06, + "logits/chosen": 8.396364212036133, + "logits/rejected": 4.56004524230957, + "logps/chosen": -221.5716552734375, + "logps/rejected": -204.5906524658203, + "loss": 0.6164, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.3391536772251129, + "rewards/margins": 0.42422419786453247, + "rewards/rejected": -0.08507047593593597, + "step": 6154 + }, + { + "epoch": 0.9518654552484052, + "grad_norm": 5.062068462371826, + "learning_rate": 3.7928170466261887e-06, + "logits/chosen": 16.395055770874023, + "logits/rejected": 11.11180305480957, + "logps/chosen": -361.14288330078125, + "logps/rejected": -298.46856689453125, + "loss": 0.6231, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.32938405871391296, + "rewards/margins": 0.5155172944068909, + "rewards/rejected": -0.1861332654953003, + "step": 6155 + }, + { + "epoch": 0.9520201043881693, + "grad_norm": 4.368488788604736, + "learning_rate": 3.7925306449765154e-06, + "logits/chosen": 17.074222564697266, + "logits/rejected": 16.759662628173828, + "logps/chosen": -274.2134094238281, + "logps/rejected": -220.26956176757812, + "loss": 0.6121, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.4574798345565796, + "rewards/margins": 0.2169291377067566, + "rewards/rejected": 0.2405507117509842, + "step": 6156 + }, + { + "epoch": 0.9521747535279335, + "grad_norm": 5.707125663757324, + "learning_rate": 3.792244243326842e-06, + "logits/chosen": 10.741851806640625, + "logits/rejected": 7.960721015930176, + "logps/chosen": -227.63397216796875, + "logps/rejected": -239.238037109375, + "loss": 0.7078, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.4618026912212372, + "rewards/margins": 0.06386394798755646, + "rewards/rejected": 0.39793872833251953, + "step": 6157 + }, + { + "epoch": 0.9523294026676976, + "grad_norm": 5.398306369781494, + "learning_rate": 3.7919578416771687e-06, + "logits/chosen": 11.080409049987793, + "logits/rejected": 11.525304794311523, + "logps/chosen": -238.9967041015625, + "logps/rejected": -221.96751403808594, + "loss": 0.756, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.04042952135205269, + "rewards/margins": 0.07515348494052887, + "rewards/rejected": -0.03472394496202469, + "step": 6158 + }, + { + "epoch": 0.9524840518074619, + "grad_norm": 5.277289867401123, + "learning_rate": 3.791671440027495e-06, + "logits/chosen": 10.193986892700195, + "logits/rejected": 5.847362041473389, + "logps/chosen": -267.63067626953125, + "logps/rejected": -198.78640747070312, + "loss": 0.5537, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.33028534054756165, + "rewards/margins": 0.4747997522354126, + "rewards/rejected": -0.14451441168785095, + "step": 6159 + }, + { + "epoch": 0.952638700947226, + "grad_norm": 4.350001335144043, + "learning_rate": 3.791385038377821e-06, + "logits/chosen": 12.001304626464844, + "logits/rejected": 4.2634968757629395, + "logps/chosen": -276.31109619140625, + "logps/rejected": -235.59461975097656, + "loss": 0.4967, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.12909665703773499, + "rewards/margins": 0.5830200910568237, + "rewards/rejected": -0.45392337441444397, + "step": 6160 + }, + { + "epoch": 0.9527933500869902, + "grad_norm": 4.104081630706787, + "learning_rate": 3.7910986367281478e-06, + "logits/chosen": 12.183959007263184, + "logits/rejected": 8.322110176086426, + "logps/chosen": -262.2836608886719, + "logps/rejected": -221.3241729736328, + "loss": 0.549, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.3626824617385864, + "rewards/margins": 0.41054120659828186, + "rewards/rejected": -0.04785875231027603, + "step": 6161 + }, + { + "epoch": 0.9529479992267543, + "grad_norm": 5.812829971313477, + "learning_rate": 3.7908122350784744e-06, + "logits/chosen": 12.200241088867188, + "logits/rejected": 1.9696413278579712, + "logps/chosen": -390.1821594238281, + "logps/rejected": -200.35595703125, + "loss": 0.5202, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.5052471160888672, + "rewards/margins": 0.7562521696090698, + "rewards/rejected": -0.2510051131248474, + "step": 6162 + }, + { + "epoch": 0.9531026483665185, + "grad_norm": 6.0550537109375, + "learning_rate": 3.790525833428801e-06, + "logits/chosen": 4.950176239013672, + "logits/rejected": 5.137077331542969, + "logps/chosen": -167.5026092529297, + "logps/rejected": -226.3450927734375, + "loss": 0.6834, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.12146544456481934, + "rewards/margins": 0.12269788235425949, + "rewards/rejected": -0.0012324228882789612, + "step": 6163 + }, + { + "epoch": 0.9532572975062826, + "grad_norm": 4.608234882354736, + "learning_rate": 3.7902394317791273e-06, + "logits/chosen": 10.137441635131836, + "logits/rejected": 12.505643844604492, + "logps/chosen": -195.520263671875, + "logps/rejected": -243.80435180664062, + "loss": 0.6525, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.20495612919330597, + "rewards/margins": 0.1427948772907257, + "rewards/rejected": 0.06216125190258026, + "step": 6164 + }, + { + "epoch": 0.9534119466460468, + "grad_norm": 14.737418174743652, + "learning_rate": 3.789953030129454e-06, + "logits/chosen": 9.711441993713379, + "logits/rejected": 13.04696273803711, + "logps/chosen": -274.9349060058594, + "logps/rejected": -430.790771484375, + "loss": 0.7203, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.6060918569564819, + "rewards/margins": 0.20980969071388245, + "rewards/rejected": 0.3962821960449219, + "step": 6165 + }, + { + "epoch": 0.9535665957858109, + "grad_norm": 5.359762668609619, + "learning_rate": 3.7896666284797806e-06, + "logits/chosen": 12.074807167053223, + "logits/rejected": 4.143256187438965, + "logps/chosen": -400.29571533203125, + "logps/rejected": -267.4970397949219, + "loss": 0.5314, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.6141083836555481, + "rewards/margins": 0.5339053869247437, + "rewards/rejected": 0.08020296692848206, + "step": 6166 + }, + { + "epoch": 0.9537212449255751, + "grad_norm": 3.6389474868774414, + "learning_rate": 3.789380226830107e-06, + "logits/chosen": 9.88829517364502, + "logits/rejected": 8.14225959777832, + "logps/chosen": -225.5856475830078, + "logps/rejected": -222.03958129882812, + "loss": 0.5914, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.012787438929080963, + "rewards/margins": 0.5036100149154663, + "rewards/rejected": -0.5163974761962891, + "step": 6167 + }, + { + "epoch": 0.9538758940653392, + "grad_norm": 5.345178127288818, + "learning_rate": 3.789093825180433e-06, + "logits/chosen": 5.077747344970703, + "logits/rejected": 10.080281257629395, + "logps/chosen": -220.88548278808594, + "logps/rejected": -289.1187744140625, + "loss": 0.7962, + "rewards/accuracies": 0.25, + "rewards/chosen": -0.07675837725400925, + "rewards/margins": -0.1315731555223465, + "rewards/rejected": 0.05481477081775665, + "step": 6168 + }, + { + "epoch": 0.9540305432051034, + "grad_norm": 5.9572038650512695, + "learning_rate": 3.7888074235307597e-06, + "logits/chosen": 8.516749382019043, + "logits/rejected": 5.46872091293335, + "logps/chosen": -235.18028259277344, + "logps/rejected": -214.20684814453125, + "loss": 0.6963, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.038121797144412994, + "rewards/margins": 0.22291134297847748, + "rewards/rejected": -0.26103317737579346, + "step": 6169 + }, + { + "epoch": 0.9541851923448675, + "grad_norm": 5.583144187927246, + "learning_rate": 3.7885210218810864e-06, + "logits/chosen": 8.929736137390137, + "logits/rejected": 6.0205488204956055, + "logps/chosen": -301.0149230957031, + "logps/rejected": -250.39556884765625, + "loss": 0.7552, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.10005579888820648, + "rewards/margins": 0.1535303145647049, + "rewards/rejected": -0.053474511951208115, + "step": 6170 + }, + { + "epoch": 0.9543398414846317, + "grad_norm": 5.039641380310059, + "learning_rate": 3.788234620231413e-06, + "logits/chosen": 8.63708209991455, + "logits/rejected": 6.5884857177734375, + "logps/chosen": -256.0694885253906, + "logps/rejected": -198.71670532226562, + "loss": 0.5418, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.13345059752464294, + "rewards/margins": 0.4428868591785431, + "rewards/rejected": -0.30943626165390015, + "step": 6171 + }, + { + "epoch": 0.954494490624396, + "grad_norm": 8.050763130187988, + "learning_rate": 3.7879482185817397e-06, + "logits/chosen": 10.67708683013916, + "logits/rejected": 2.730160713195801, + "logps/chosen": -350.8143005371094, + "logps/rejected": -204.71275329589844, + "loss": 0.7992, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.09011248499155045, + "rewards/margins": 0.10813485085964203, + "rewards/rejected": -0.19824734330177307, + "step": 6172 + }, + { + "epoch": 0.9546491397641601, + "grad_norm": 7.7803473472595215, + "learning_rate": 3.7876618169320655e-06, + "logits/chosen": 10.17917251586914, + "logits/rejected": 8.727293014526367, + "logps/chosen": -297.3963317871094, + "logps/rejected": -324.045654296875, + "loss": 0.6007, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.650406539440155, + "rewards/margins": 0.38630932569503784, + "rewards/rejected": 0.2640972137451172, + "step": 6173 + }, + { + "epoch": 0.9548037889039243, + "grad_norm": 5.165964126586914, + "learning_rate": 3.787375415282392e-06, + "logits/chosen": 10.358448028564453, + "logits/rejected": 6.400697231292725, + "logps/chosen": -265.35589599609375, + "logps/rejected": -244.11416625976562, + "loss": 0.6577, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.040180496871471405, + "rewards/margins": 0.28275009989738464, + "rewards/rejected": -0.24256959557533264, + "step": 6174 + }, + { + "epoch": 0.9549584380436884, + "grad_norm": 6.588400840759277, + "learning_rate": 3.7870890136327188e-06, + "logits/chosen": 5.818270206451416, + "logits/rejected": 7.853864669799805, + "logps/chosen": -174.64486694335938, + "logps/rejected": -222.53135681152344, + "loss": 0.5996, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.05821056663990021, + "rewards/margins": 0.34612321853637695, + "rewards/rejected": -0.40433377027511597, + "step": 6175 + }, + { + "epoch": 0.9551130871834526, + "grad_norm": 5.107873439788818, + "learning_rate": 3.7868026119830454e-06, + "logits/chosen": 8.837618827819824, + "logits/rejected": 6.268115043640137, + "logps/chosen": -227.54653930664062, + "logps/rejected": -236.0309295654297, + "loss": 0.6414, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.15765684843063354, + "rewards/margins": 0.2781776189804077, + "rewards/rejected": -0.12052074819803238, + "step": 6176 + }, + { + "epoch": 0.9552677363232167, + "grad_norm": 4.786239147186279, + "learning_rate": 3.786516210333372e-06, + "logits/chosen": 1.8756210803985596, + "logits/rejected": -2.148780107498169, + "logps/chosen": -235.91082763671875, + "logps/rejected": -128.50123596191406, + "loss": 0.7035, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.1964249610900879, + "rewards/margins": 0.03667372837662697, + "rewards/rejected": -0.23309871554374695, + "step": 6177 + }, + { + "epoch": 0.9554223854629809, + "grad_norm": 10.871891021728516, + "learning_rate": 3.7862298086836983e-06, + "logits/chosen": 7.380008697509766, + "logits/rejected": 2.810032606124878, + "logps/chosen": -306.8412780761719, + "logps/rejected": -235.4676513671875, + "loss": 0.5255, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.22980064153671265, + "rewards/margins": 0.43018412590026855, + "rewards/rejected": -0.20038345456123352, + "step": 6178 + }, + { + "epoch": 0.955577034602745, + "grad_norm": 4.260034561157227, + "learning_rate": 3.785943407034025e-06, + "logits/chosen": 8.592061042785645, + "logits/rejected": 8.915990829467773, + "logps/chosen": -356.1918029785156, + "logps/rejected": -364.6982421875, + "loss": 0.4934, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.6894409656524658, + "rewards/margins": 0.5635682344436646, + "rewards/rejected": 0.12587273120880127, + "step": 6179 + }, + { + "epoch": 0.9557316837425092, + "grad_norm": 6.339696884155273, + "learning_rate": 3.785657005384351e-06, + "logits/chosen": 9.19437026977539, + "logits/rejected": 3.1364874839782715, + "logps/chosen": -212.06027221679688, + "logps/rejected": -192.9576416015625, + "loss": 0.7342, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.2538849413394928, + "rewards/margins": 0.009813845157623291, + "rewards/rejected": 0.2440710961818695, + "step": 6180 + }, + { + "epoch": 0.9558863328822733, + "grad_norm": 5.144402027130127, + "learning_rate": 3.785370603734678e-06, + "logits/chosen": 9.853436470031738, + "logits/rejected": 10.003225326538086, + "logps/chosen": -89.36921691894531, + "logps/rejected": -147.92208862304688, + "loss": 0.8273, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.24263063073158264, + "rewards/margins": -0.17359329760074615, + "rewards/rejected": -0.06903731822967529, + "step": 6181 + }, + { + "epoch": 0.9560409820220375, + "grad_norm": 5.819518089294434, + "learning_rate": 3.7850842020850045e-06, + "logits/chosen": 7.742002487182617, + "logits/rejected": 9.171527862548828, + "logps/chosen": -275.7296142578125, + "logps/rejected": -245.9471435546875, + "loss": 0.8142, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.04904394969344139, + "rewards/margins": -0.1914726197719574, + "rewards/rejected": 0.2405165433883667, + "step": 6182 + }, + { + "epoch": 0.9561956311618016, + "grad_norm": 5.64265251159668, + "learning_rate": 3.7847978004353307e-06, + "logits/chosen": 7.825226306915283, + "logits/rejected": 6.729127883911133, + "logps/chosen": -238.1453399658203, + "logps/rejected": -211.0230712890625, + "loss": 0.6599, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.34128665924072266, + "rewards/margins": 0.2665756344795227, + "rewards/rejected": 0.07471100986003876, + "step": 6183 + }, + { + "epoch": 0.9563502803015658, + "grad_norm": 5.056849002838135, + "learning_rate": 3.7845113987856574e-06, + "logits/chosen": 12.205657005310059, + "logits/rejected": 6.569552421569824, + "logps/chosen": -350.52886962890625, + "logps/rejected": -301.33282470703125, + "loss": 0.7055, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.31120777130126953, + "rewards/margins": 0.4939754903316498, + "rewards/rejected": -0.18276770412921906, + "step": 6184 + }, + { + "epoch": 0.95650492944133, + "grad_norm": 3.610272169113159, + "learning_rate": 3.784224997135984e-06, + "logits/chosen": 8.418853759765625, + "logits/rejected": 3.3167402744293213, + "logps/chosen": -216.83267211914062, + "logps/rejected": -166.833984375, + "loss": 0.4795, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.015908867120742798, + "rewards/margins": 0.7096530795097351, + "rewards/rejected": -0.6937441825866699, + "step": 6185 + }, + { + "epoch": 0.9566595785810942, + "grad_norm": 6.288115978240967, + "learning_rate": 3.7839385954863102e-06, + "logits/chosen": 8.40555477142334, + "logits/rejected": 11.845731735229492, + "logps/chosen": -384.36956787109375, + "logps/rejected": -371.1394958496094, + "loss": 0.795, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.3408474028110504, + "rewards/margins": -0.05401533842086792, + "rewards/rejected": 0.3948627710342407, + "step": 6186 + }, + { + "epoch": 0.9568142277208583, + "grad_norm": 4.930776596069336, + "learning_rate": 3.7836521938366365e-06, + "logits/chosen": 12.268728256225586, + "logits/rejected": 13.093002319335938, + "logps/chosen": -280.7565612792969, + "logps/rejected": -270.92138671875, + "loss": 0.5424, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.4865604043006897, + "rewards/margins": 0.3804599940776825, + "rewards/rejected": 0.10610037297010422, + "step": 6187 + }, + { + "epoch": 0.9569688768606225, + "grad_norm": 4.21857213973999, + "learning_rate": 3.783365792186963e-06, + "logits/chosen": 11.802602767944336, + "logits/rejected": 8.7615327835083, + "logps/chosen": -397.9958801269531, + "logps/rejected": -360.99920654296875, + "loss": 0.548, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.4543735980987549, + "rewards/margins": 0.47703033685684204, + "rewards/rejected": -0.022656723856925964, + "step": 6188 + }, + { + "epoch": 0.9571235260003866, + "grad_norm": 3.6183483600616455, + "learning_rate": 3.7830793905372898e-06, + "logits/chosen": 13.680797576904297, + "logits/rejected": 6.239417552947998, + "logps/chosen": -246.63763427734375, + "logps/rejected": -165.08819580078125, + "loss": 0.5639, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.004466649144887924, + "rewards/margins": 0.5195215940475464, + "rewards/rejected": -0.5150550007820129, + "step": 6189 + }, + { + "epoch": 0.9572781751401508, + "grad_norm": 4.1929497718811035, + "learning_rate": 3.7827929888876164e-06, + "logits/chosen": 15.130483627319336, + "logits/rejected": 6.626381874084473, + "logps/chosen": -229.71731567382812, + "logps/rejected": -158.8823699951172, + "loss": 0.4868, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.3587179183959961, + "rewards/margins": 0.6306701898574829, + "rewards/rejected": -0.27195224165916443, + "step": 6190 + }, + { + "epoch": 0.9574328242799149, + "grad_norm": 2.6293959617614746, + "learning_rate": 3.782506587237943e-06, + "logits/chosen": 9.32863712310791, + "logits/rejected": 6.863626003265381, + "logps/chosen": -132.62428283691406, + "logps/rejected": -149.93482971191406, + "loss": 0.4542, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.05582243204116821, + "rewards/margins": 0.7524368166923523, + "rewards/rejected": -0.8082591891288757, + "step": 6191 + }, + { + "epoch": 0.9575874734196791, + "grad_norm": 6.161720275878906, + "learning_rate": 3.7822201855882697e-06, + "logits/chosen": 9.685081481933594, + "logits/rejected": 8.40473461151123, + "logps/chosen": -318.6549072265625, + "logps/rejected": -264.1510009765625, + "loss": 0.672, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.1546720564365387, + "rewards/margins": 0.23635368049144745, + "rewards/rejected": -0.08168165385723114, + "step": 6192 + }, + { + "epoch": 0.9577421225594432, + "grad_norm": 5.166300296783447, + "learning_rate": 3.7819337839385955e-06, + "logits/chosen": 9.75143814086914, + "logits/rejected": 14.300973892211914, + "logps/chosen": -264.825927734375, + "logps/rejected": -337.2331848144531, + "loss": 0.6335, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.7513271570205688, + "rewards/margins": 0.26507991552352905, + "rewards/rejected": 0.4862472116947174, + "step": 6193 + }, + { + "epoch": 0.9578967716992074, + "grad_norm": 4.775003910064697, + "learning_rate": 3.781647382288922e-06, + "logits/chosen": 15.092397689819336, + "logits/rejected": 7.727599143981934, + "logps/chosen": -329.2802734375, + "logps/rejected": -236.48779296875, + "loss": 0.4964, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.3986269235610962, + "rewards/margins": 0.5121880173683167, + "rewards/rejected": -0.11356109380722046, + "step": 6194 + }, + { + "epoch": 0.9580514208389715, + "grad_norm": 5.808637619018555, + "learning_rate": 3.781360980639249e-06, + "logits/chosen": 11.492547035217285, + "logits/rejected": 10.122631072998047, + "logps/chosen": -434.10394287109375, + "logps/rejected": -356.5558166503906, + "loss": 0.599, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.2299526184797287, + "rewards/margins": 0.31451255083084106, + "rewards/rejected": -0.08455993235111237, + "step": 6195 + }, + { + "epoch": 0.9582060699787357, + "grad_norm": 5.995190143585205, + "learning_rate": 3.7810745789895755e-06, + "logits/chosen": 16.781414031982422, + "logits/rejected": 18.06316375732422, + "logps/chosen": -212.47698974609375, + "logps/rejected": -326.875, + "loss": 0.7496, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.13011418282985687, + "rewards/margins": -0.05014096572995186, + "rewards/rejected": 0.18025514483451843, + "step": 6196 + }, + { + "epoch": 0.9583607191184998, + "grad_norm": 3.9708070755004883, + "learning_rate": 3.7807881773399017e-06, + "logits/chosen": 12.563114166259766, + "logits/rejected": 10.903181076049805, + "logps/chosen": -291.62823486328125, + "logps/rejected": -236.60528564453125, + "loss": 0.3895, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.45499327778816223, + "rewards/margins": 0.7704517841339111, + "rewards/rejected": -0.31545859575271606, + "step": 6197 + }, + { + "epoch": 0.9585153682582641, + "grad_norm": 3.8150579929351807, + "learning_rate": 3.7805017756902283e-06, + "logits/chosen": 9.56097412109375, + "logits/rejected": 5.480674743652344, + "logps/chosen": -245.415283203125, + "logps/rejected": -165.5003662109375, + "loss": 0.5566, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.10580796748399734, + "rewards/margins": 0.3801415264606476, + "rewards/rejected": -0.27433356642723083, + "step": 6198 + }, + { + "epoch": 0.9586700173980283, + "grad_norm": 5.646664619445801, + "learning_rate": 3.780215374040555e-06, + "logits/chosen": 13.170984268188477, + "logits/rejected": 10.892875671386719, + "logps/chosen": -308.5303039550781, + "logps/rejected": -326.3078308105469, + "loss": 0.4202, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.28127679228782654, + "rewards/margins": 1.1327598094940186, + "rewards/rejected": -0.8514830470085144, + "step": 6199 + }, + { + "epoch": 0.9588246665377924, + "grad_norm": 4.061943531036377, + "learning_rate": 3.7799289723908812e-06, + "logits/chosen": 11.620902061462402, + "logits/rejected": 0.8329916596412659, + "logps/chosen": -353.20745849609375, + "logps/rejected": -243.18264770507812, + "loss": 0.5421, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.4509489834308624, + "rewards/margins": 0.6272865533828735, + "rewards/rejected": -0.17633754014968872, + "step": 6200 + }, + { + "epoch": 0.9589793156775566, + "grad_norm": 4.095134735107422, + "learning_rate": 3.779642570741208e-06, + "logits/chosen": 14.328485488891602, + "logits/rejected": 9.460273742675781, + "logps/chosen": -271.25164794921875, + "logps/rejected": -169.11851501464844, + "loss": 0.6445, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.13385413587093353, + "rewards/margins": 0.18290254473686218, + "rewards/rejected": -0.3167566657066345, + "step": 6201 + }, + { + "epoch": 0.9591339648173207, + "grad_norm": 6.392009735107422, + "learning_rate": 3.779356169091534e-06, + "logits/chosen": 9.518019676208496, + "logits/rejected": 11.726667404174805, + "logps/chosen": -303.9400634765625, + "logps/rejected": -253.27401733398438, + "loss": 0.8503, + "rewards/accuracies": 0.25, + "rewards/chosen": 0.15070396661758423, + "rewards/margins": -0.2587589621543884, + "rewards/rejected": 0.40946295857429504, + "step": 6202 + }, + { + "epoch": 0.9592886139570849, + "grad_norm": 5.753974437713623, + "learning_rate": 3.7790697674418607e-06, + "logits/chosen": 14.135196685791016, + "logits/rejected": 12.58681869506836, + "logps/chosen": -390.4419860839844, + "logps/rejected": -381.3349609375, + "loss": 0.663, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.3868066966533661, + "rewards/margins": 0.22663907706737518, + "rewards/rejected": 0.1601676046848297, + "step": 6203 + }, + { + "epoch": 0.959443263096849, + "grad_norm": 4.343027591705322, + "learning_rate": 3.7787833657921874e-06, + "logits/chosen": 8.071842193603516, + "logits/rejected": 1.7248845100402832, + "logps/chosen": -190.04464721679688, + "logps/rejected": -120.87312316894531, + "loss": 0.6401, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.06427059322595596, + "rewards/margins": 0.20327618718147278, + "rewards/rejected": -0.13900557160377502, + "step": 6204 + }, + { + "epoch": 0.9595979122366132, + "grad_norm": 5.777004241943359, + "learning_rate": 3.778496964142514e-06, + "logits/chosen": 14.482356071472168, + "logits/rejected": 7.85591459274292, + "logps/chosen": -297.9575500488281, + "logps/rejected": -229.93800354003906, + "loss": 0.5911, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.13670288026332855, + "rewards/margins": 0.46076586842536926, + "rewards/rejected": -0.3240630030632019, + "step": 6205 + }, + { + "epoch": 0.9597525613763773, + "grad_norm": 4.93242073059082, + "learning_rate": 3.77821056249284e-06, + "logits/chosen": 9.400463104248047, + "logits/rejected": 7.245129108428955, + "logps/chosen": -291.78363037109375, + "logps/rejected": -218.31008911132812, + "loss": 0.5752, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.08678723871707916, + "rewards/margins": 0.2959372401237488, + "rewards/rejected": -0.2091500163078308, + "step": 6206 + }, + { + "epoch": 0.9599072105161415, + "grad_norm": 4.211946487426758, + "learning_rate": 3.7779241608431665e-06, + "logits/chosen": 10.71906852722168, + "logits/rejected": 3.749788284301758, + "logps/chosen": -268.0580749511719, + "logps/rejected": -261.02862548828125, + "loss": 0.5619, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.1457514762878418, + "rewards/margins": 0.45661646127700806, + "rewards/rejected": -0.31086498498916626, + "step": 6207 + }, + { + "epoch": 0.9600618596559056, + "grad_norm": 5.2096076011657715, + "learning_rate": 3.777637759193493e-06, + "logits/chosen": 12.332616806030273, + "logits/rejected": 11.804502487182617, + "logps/chosen": -292.0572814941406, + "logps/rejected": -254.89215087890625, + "loss": 0.5512, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.3568549156188965, + "rewards/margins": 0.5626353621482849, + "rewards/rejected": -0.20578041672706604, + "step": 6208 + }, + { + "epoch": 0.9602165087956698, + "grad_norm": 7.805737018585205, + "learning_rate": 3.77735135754382e-06, + "logits/chosen": 13.04304313659668, + "logits/rejected": 15.592020988464355, + "logps/chosen": -224.40077209472656, + "logps/rejected": -244.50022888183594, + "loss": 0.9041, + "rewards/accuracies": 0.25, + "rewards/chosen": -0.36959657073020935, + "rewards/margins": -0.30037039518356323, + "rewards/rejected": -0.06922616809606552, + "step": 6209 + }, + { + "epoch": 0.960371157935434, + "grad_norm": 4.945275783538818, + "learning_rate": 3.7770649558941465e-06, + "logits/chosen": 8.065134048461914, + "logits/rejected": 8.350366592407227, + "logps/chosen": -288.42657470703125, + "logps/rejected": -210.79345703125, + "loss": 0.5046, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.38806694746017456, + "rewards/margins": 0.5291779637336731, + "rewards/rejected": -0.14111098647117615, + "step": 6210 + }, + { + "epoch": 0.9605258070751982, + "grad_norm": 7.0027008056640625, + "learning_rate": 3.776778554244473e-06, + "logits/chosen": 6.472238540649414, + "logits/rejected": 3.75234055519104, + "logps/chosen": -376.69647216796875, + "logps/rejected": -263.144287109375, + "loss": 0.805, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.06863565742969513, + "rewards/margins": -0.14633022248744965, + "rewards/rejected": 0.07769455015659332, + "step": 6211 + }, + { + "epoch": 0.9606804562149623, + "grad_norm": 5.786815166473389, + "learning_rate": 3.7764921525947993e-06, + "logits/chosen": 9.991025924682617, + "logits/rejected": 6.760941982269287, + "logps/chosen": -304.273681640625, + "logps/rejected": -298.0568542480469, + "loss": 0.5337, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.6838828325271606, + "rewards/margins": 0.43535536527633667, + "rewards/rejected": 0.2485274374485016, + "step": 6212 + }, + { + "epoch": 0.9608351053547265, + "grad_norm": 3.9415769577026367, + "learning_rate": 3.7762057509451256e-06, + "logits/chosen": 14.743974685668945, + "logits/rejected": 10.124166488647461, + "logps/chosen": -247.69381713867188, + "logps/rejected": -194.5726318359375, + "loss": 0.486, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.2417970597743988, + "rewards/margins": 0.7244962453842163, + "rewards/rejected": -0.48269912600517273, + "step": 6213 + }, + { + "epoch": 0.9609897544944906, + "grad_norm": 4.849062442779541, + "learning_rate": 3.775919349295452e-06, + "logits/chosen": 11.756913185119629, + "logits/rejected": 13.900941848754883, + "logps/chosen": -237.27456665039062, + "logps/rejected": -282.3472595214844, + "loss": 0.5531, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.3385724127292633, + "rewards/margins": 0.43315306305885315, + "rewards/rejected": -0.09458065032958984, + "step": 6214 + }, + { + "epoch": 0.9611444036342548, + "grad_norm": 3.8358962535858154, + "learning_rate": 3.775632947645779e-06, + "logits/chosen": 12.525490760803223, + "logits/rejected": 7.20210075378418, + "logps/chosen": -240.63632202148438, + "logps/rejected": -164.78302001953125, + "loss": 0.5478, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.10352945327758789, + "rewards/margins": 0.39274662733078003, + "rewards/rejected": -0.4962760806083679, + "step": 6215 + }, + { + "epoch": 0.961299052774019, + "grad_norm": 4.672080039978027, + "learning_rate": 3.775346545996105e-06, + "logits/chosen": 11.450679779052734, + "logits/rejected": 12.03573989868164, + "logps/chosen": -345.100830078125, + "logps/rejected": -326.0154724121094, + "loss": 0.5363, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.06310184299945831, + "rewards/margins": 0.542306661605835, + "rewards/rejected": -0.6054084897041321, + "step": 6216 + }, + { + "epoch": 0.9614537019137831, + "grad_norm": 4.900850296020508, + "learning_rate": 3.7750601443464317e-06, + "logits/chosen": 10.704215049743652, + "logits/rejected": 5.836019992828369, + "logps/chosen": -280.88079833984375, + "logps/rejected": -286.5028991699219, + "loss": 0.5755, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.3014638423919678, + "rewards/margins": 0.29908487200737, + "rewards/rejected": 0.0023789890110492706, + "step": 6217 + }, + { + "epoch": 0.9616083510535473, + "grad_norm": 4.7244133949279785, + "learning_rate": 3.7747737426967584e-06, + "logits/chosen": 10.71621322631836, + "logits/rejected": 9.23169994354248, + "logps/chosen": -262.21112060546875, + "logps/rejected": -267.5535888671875, + "loss": 0.5859, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.37306928634643555, + "rewards/margins": 0.35109177231788635, + "rewards/rejected": 0.021977512165904045, + "step": 6218 + }, + { + "epoch": 0.9617630001933114, + "grad_norm": 6.141534328460693, + "learning_rate": 3.7744873410470846e-06, + "logits/chosen": 7.8809614181518555, + "logits/rejected": 6.794828414916992, + "logps/chosen": -287.1963806152344, + "logps/rejected": -373.8373718261719, + "loss": 0.5956, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.08312735706567764, + "rewards/margins": 0.26163122057914734, + "rewards/rejected": -0.1785038709640503, + "step": 6219 + }, + { + "epoch": 0.9619176493330756, + "grad_norm": 3.6746222972869873, + "learning_rate": 3.7742009393974113e-06, + "logits/chosen": 10.175399780273438, + "logits/rejected": 7.9590654373168945, + "logps/chosen": -261.11688232421875, + "logps/rejected": -242.82693481445312, + "loss": 0.4584, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.5537927150726318, + "rewards/margins": 0.6324170231819153, + "rewards/rejected": -0.07862435281276703, + "step": 6220 + }, + { + "epoch": 0.9620722984728397, + "grad_norm": 5.872509956359863, + "learning_rate": 3.7739145377477375e-06, + "logits/chosen": 12.021138191223145, + "logits/rejected": 10.698677062988281, + "logps/chosen": -432.6414489746094, + "logps/rejected": -367.2095947265625, + "loss": 0.5876, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.21761779487133026, + "rewards/margins": 0.47728434205055237, + "rewards/rejected": -0.2596665620803833, + "step": 6221 + }, + { + "epoch": 0.9622269476126039, + "grad_norm": 5.82351541519165, + "learning_rate": 3.773628136098064e-06, + "logits/chosen": 10.622352600097656, + "logits/rejected": 3.973573684692383, + "logps/chosen": -402.18402099609375, + "logps/rejected": -312.0247497558594, + "loss": 0.5743, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.5350500345230103, + "rewards/margins": 0.4204881489276886, + "rewards/rejected": 0.11456190049648285, + "step": 6222 + }, + { + "epoch": 0.9623815967523681, + "grad_norm": 8.601079940795898, + "learning_rate": 3.773341734448391e-06, + "logits/chosen": 7.874594688415527, + "logits/rejected": 11.27859878540039, + "logps/chosen": -254.940185546875, + "logps/rejected": -291.0589294433594, + "loss": 0.796, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.15610548853874207, + "rewards/margins": -0.10389894992113113, + "rewards/rejected": -0.052206531167030334, + "step": 6223 + }, + { + "epoch": 0.9625362458921323, + "grad_norm": 5.773099422454834, + "learning_rate": 3.7730553327987174e-06, + "logits/chosen": 9.543684959411621, + "logits/rejected": 8.454202651977539, + "logps/chosen": -267.7629699707031, + "logps/rejected": -300.6573181152344, + "loss": 0.5254, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.4540160298347473, + "rewards/margins": 0.45007288455963135, + "rewards/rejected": 0.003943160176277161, + "step": 6224 + }, + { + "epoch": 0.9626908950318964, + "grad_norm": 3.9194095134735107, + "learning_rate": 3.772768931149044e-06, + "logits/chosen": 9.256590843200684, + "logits/rejected": 5.746793746948242, + "logps/chosen": -217.80056762695312, + "logps/rejected": -167.87939453125, + "loss": 0.6648, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.1267896294593811, + "rewards/margins": 0.2920009195804596, + "rewards/rejected": -0.1652112901210785, + "step": 6225 + }, + { + "epoch": 0.9628455441716606, + "grad_norm": 5.422349452972412, + "learning_rate": 3.77248252949937e-06, + "logits/chosen": 12.005634307861328, + "logits/rejected": 6.595765590667725, + "logps/chosen": -276.6656494140625, + "logps/rejected": -280.4333190917969, + "loss": 0.5374, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.03590288385748863, + "rewards/margins": 0.42755985260009766, + "rewards/rejected": -0.3916569948196411, + "step": 6226 + }, + { + "epoch": 0.9630001933114247, + "grad_norm": 4.353426456451416, + "learning_rate": 3.7721961278496965e-06, + "logits/chosen": 9.1546630859375, + "logits/rejected": 8.671625137329102, + "logps/chosen": -241.7147216796875, + "logps/rejected": -221.46524047851562, + "loss": 0.5177, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.1428285539150238, + "rewards/margins": 0.7597874402999878, + "rewards/rejected": -0.6169588565826416, + "step": 6227 + }, + { + "epoch": 0.9631548424511889, + "grad_norm": 5.999122142791748, + "learning_rate": 3.771909726200023e-06, + "logits/chosen": 6.420291900634766, + "logits/rejected": 5.207700729370117, + "logps/chosen": -376.8318176269531, + "logps/rejected": -318.32037353515625, + "loss": 0.7101, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.2463313192129135, + "rewards/margins": 0.1491008996963501, + "rewards/rejected": 0.09723042696714401, + "step": 6228 + }, + { + "epoch": 0.963309491590953, + "grad_norm": 7.111025810241699, + "learning_rate": 3.77162332455035e-06, + "logits/chosen": 11.377235412597656, + "logits/rejected": 10.18140697479248, + "logps/chosen": -328.5623474121094, + "logps/rejected": -288.66314697265625, + "loss": 0.8211, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.15537777543067932, + "rewards/margins": -0.17616042494773865, + "rewards/rejected": 0.020782656967639923, + "step": 6229 + }, + { + "epoch": 0.9634641407307172, + "grad_norm": 4.652571678161621, + "learning_rate": 3.7713369229006765e-06, + "logits/chosen": 6.284856796264648, + "logits/rejected": 5.213893890380859, + "logps/chosen": -258.7974548339844, + "logps/rejected": -237.41958618164062, + "loss": 0.6938, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.11941994726657867, + "rewards/margins": 0.07685817033052444, + "rewards/rejected": 0.04256176948547363, + "step": 6230 + }, + { + "epoch": 0.9636187898704813, + "grad_norm": 7.093327522277832, + "learning_rate": 3.7710505212510027e-06, + "logits/chosen": 15.114252090454102, + "logits/rejected": 8.233548164367676, + "logps/chosen": -311.77203369140625, + "logps/rejected": -221.2399139404297, + "loss": 0.8362, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.0834583267569542, + "rewards/margins": -0.0853218138217926, + "rewards/rejected": 0.0018634870648384094, + "step": 6231 + }, + { + "epoch": 0.9637734390102455, + "grad_norm": 5.479827880859375, + "learning_rate": 3.7707641196013294e-06, + "logits/chosen": 5.144112586975098, + "logits/rejected": 11.315292358398438, + "logps/chosen": -233.96780395507812, + "logps/rejected": -245.29150390625, + "loss": 0.822, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.4076550006866455, + "rewards/margins": -0.08142456412315369, + "rewards/rejected": -0.32623040676116943, + "step": 6232 + }, + { + "epoch": 0.9639280881500096, + "grad_norm": 4.323971748352051, + "learning_rate": 3.7704777179516556e-06, + "logits/chosen": 5.676105499267578, + "logits/rejected": 6.429553031921387, + "logps/chosen": -205.0814208984375, + "logps/rejected": -209.31210327148438, + "loss": 0.7604, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.15172801911830902, + "rewards/margins": -0.003660343587398529, + "rewards/rejected": -0.1480676829814911, + "step": 6233 + }, + { + "epoch": 0.9640827372897738, + "grad_norm": 5.070718288421631, + "learning_rate": 3.7701913163019822e-06, + "logits/chosen": 7.9707136154174805, + "logits/rejected": 10.688069343566895, + "logps/chosen": -198.2361602783203, + "logps/rejected": -239.82376098632812, + "loss": 0.6574, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.04830784350633621, + "rewards/margins": 0.2938075065612793, + "rewards/rejected": -0.3421153724193573, + "step": 6234 + }, + { + "epoch": 0.9642373864295379, + "grad_norm": 16.31546401977539, + "learning_rate": 3.7699049146523085e-06, + "logits/chosen": 10.313432693481445, + "logits/rejected": 8.43807315826416, + "logps/chosen": -279.4935607910156, + "logps/rejected": -275.16363525390625, + "loss": 0.6778, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.2216387689113617, + "rewards/margins": 0.13679268956184387, + "rewards/rejected": -0.3584314286708832, + "step": 6235 + }, + { + "epoch": 0.9643920355693022, + "grad_norm": 6.370844841003418, + "learning_rate": 3.769618513002635e-06, + "logits/chosen": 16.11417579650879, + "logits/rejected": 12.088629722595215, + "logps/chosen": -417.5743408203125, + "logps/rejected": -374.2684020996094, + "loss": 0.5429, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.503851592540741, + "rewards/margins": 0.448061466217041, + "rewards/rejected": 0.05579013377428055, + "step": 6236 + }, + { + "epoch": 0.9645466847090663, + "grad_norm": 6.4553680419921875, + "learning_rate": 3.7693321113529618e-06, + "logits/chosen": 13.063579559326172, + "logits/rejected": 12.430404663085938, + "logps/chosen": -345.1760559082031, + "logps/rejected": -258.789794921875, + "loss": 0.6749, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.1946934163570404, + "rewards/margins": 0.17828252911567688, + "rewards/rejected": -0.3729759454727173, + "step": 6237 + }, + { + "epoch": 0.9647013338488305, + "grad_norm": 5.131139278411865, + "learning_rate": 3.7690457097032884e-06, + "logits/chosen": 10.330784797668457, + "logits/rejected": 4.305326461791992, + "logps/chosen": -324.40020751953125, + "logps/rejected": -192.52452087402344, + "loss": 0.5927, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.10897751152515411, + "rewards/margins": 0.44921454787254333, + "rewards/rejected": -0.340237021446228, + "step": 6238 + }, + { + "epoch": 0.9648559829885947, + "grad_norm": 4.922372817993164, + "learning_rate": 3.7687593080536147e-06, + "logits/chosen": 10.259038925170898, + "logits/rejected": 8.114009857177734, + "logps/chosen": -297.686767578125, + "logps/rejected": -347.5851135253906, + "loss": 0.5775, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.2805238664150238, + "rewards/margins": 0.39249420166015625, + "rewards/rejected": -0.11197035014629364, + "step": 6239 + }, + { + "epoch": 0.9650106321283588, + "grad_norm": 3.8152194023132324, + "learning_rate": 3.768472906403941e-06, + "logits/chosen": 10.1238374710083, + "logits/rejected": 6.693074703216553, + "logps/chosen": -341.34783935546875, + "logps/rejected": -176.46409606933594, + "loss": 0.4395, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.5498703718185425, + "rewards/margins": 0.79653400182724, + "rewards/rejected": -0.24666360020637512, + "step": 6240 + }, + { + "epoch": 0.965165281268123, + "grad_norm": 4.972837924957275, + "learning_rate": 3.7681865047542675e-06, + "logits/chosen": 12.047258377075195, + "logits/rejected": 9.121379852294922, + "logps/chosen": -197.76177978515625, + "logps/rejected": -180.068603515625, + "loss": 0.7013, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.14627385139465332, + "rewards/margins": 0.03559955954551697, + "rewards/rejected": 0.11067428439855576, + "step": 6241 + }, + { + "epoch": 0.9653199304078871, + "grad_norm": 4.085970878601074, + "learning_rate": 3.767900103104594e-06, + "logits/chosen": 17.535829544067383, + "logits/rejected": 7.910150527954102, + "logps/chosen": -266.6650390625, + "logps/rejected": -184.14718627929688, + "loss": 0.5101, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.3464258015155792, + "rewards/margins": 0.6349284052848816, + "rewards/rejected": -0.2885025441646576, + "step": 6242 + }, + { + "epoch": 0.9654745795476513, + "grad_norm": 5.301179885864258, + "learning_rate": 3.767613701454921e-06, + "logits/chosen": 8.30402660369873, + "logits/rejected": 5.971802234649658, + "logps/chosen": -406.5301818847656, + "logps/rejected": -317.0602722167969, + "loss": 0.572, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.19353963434696198, + "rewards/margins": 0.4953781068325043, + "rewards/rejected": -0.3018384873867035, + "step": 6243 + }, + { + "epoch": 0.9656292286874154, + "grad_norm": 7.259011745452881, + "learning_rate": 3.7673272998052475e-06, + "logits/chosen": 6.027864933013916, + "logits/rejected": 9.42466926574707, + "logps/chosen": -259.1260681152344, + "logps/rejected": -274.513427734375, + "loss": 0.8888, + "rewards/accuracies": 0.25, + "rewards/chosen": -0.03978786617517471, + "rewards/margins": -0.30730140209198, + "rewards/rejected": 0.2675135135650635, + "step": 6244 + }, + { + "epoch": 0.9657838778271796, + "grad_norm": 5.502675533294678, + "learning_rate": 3.767040898155574e-06, + "logits/chosen": 8.866201400756836, + "logits/rejected": 6.491358757019043, + "logps/chosen": -362.2781982421875, + "logps/rejected": -276.3226623535156, + "loss": 0.6315, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.652718186378479, + "rewards/margins": 0.38082462549209595, + "rewards/rejected": 0.27189356088638306, + "step": 6245 + }, + { + "epoch": 0.9659385269669437, + "grad_norm": 7.042604923248291, + "learning_rate": 3.7667544965059e-06, + "logits/chosen": 9.896074295043945, + "logits/rejected": 9.05870246887207, + "logps/chosen": -289.7452392578125, + "logps/rejected": -254.51736450195312, + "loss": 0.8295, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.4651522636413574, + "rewards/margins": -0.10872002691030502, + "rewards/rejected": -0.3564322590827942, + "step": 6246 + }, + { + "epoch": 0.9660931761067079, + "grad_norm": 5.6107025146484375, + "learning_rate": 3.7664680948562266e-06, + "logits/chosen": 9.470053672790527, + "logits/rejected": 9.284296035766602, + "logps/chosen": -297.4190673828125, + "logps/rejected": -284.5372009277344, + "loss": 0.6573, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.12754374742507935, + "rewards/margins": 0.10900935530662537, + "rewards/rejected": 0.01853439211845398, + "step": 6247 + }, + { + "epoch": 0.966247825246472, + "grad_norm": 3.7968590259552, + "learning_rate": 3.7661816932065532e-06, + "logits/chosen": 10.05697250366211, + "logits/rejected": 8.074006080627441, + "logps/chosen": -277.1243896484375, + "logps/rejected": -253.9668731689453, + "loss": 0.4872, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.47266513109207153, + "rewards/margins": 0.6206865906715393, + "rewards/rejected": -0.14802150428295135, + "step": 6248 + }, + { + "epoch": 0.9664024743862363, + "grad_norm": 2.798936128616333, + "learning_rate": 3.76589529155688e-06, + "logits/chosen": 8.21577262878418, + "logits/rejected": 1.5273282527923584, + "logps/chosen": -186.09304809570312, + "logps/rejected": -128.33729553222656, + "loss": 0.4976, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.29868149757385254, + "rewards/margins": 0.6515520811080933, + "rewards/rejected": -0.3528705835342407, + "step": 6249 + }, + { + "epoch": 0.9665571235260004, + "grad_norm": 6.064682483673096, + "learning_rate": 3.765608889907206e-06, + "logits/chosen": 7.74521541595459, + "logits/rejected": 11.561120986938477, + "logps/chosen": -292.43035888671875, + "logps/rejected": -317.5081787109375, + "loss": 0.7125, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.37806808948516846, + "rewards/margins": 0.13698609173297882, + "rewards/rejected": 0.24108201265335083, + "step": 6250 + }, + { + "epoch": 0.9667117726657646, + "grad_norm": 4.284787654876709, + "learning_rate": 3.7653224882575328e-06, + "logits/chosen": 14.056763648986816, + "logits/rejected": 3.4579153060913086, + "logps/chosen": -230.45591735839844, + "logps/rejected": -126.14958953857422, + "loss": 0.5697, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.17201842367649078, + "rewards/margins": 0.5434473156929016, + "rewards/rejected": -0.371428906917572, + "step": 6251 + }, + { + "epoch": 0.9668664218055287, + "grad_norm": 4.843050956726074, + "learning_rate": 3.765036086607859e-06, + "logits/chosen": 16.6568660736084, + "logits/rejected": 7.945697784423828, + "logps/chosen": -305.6340637207031, + "logps/rejected": -204.84315490722656, + "loss": 0.6595, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.34388524293899536, + "rewards/margins": 0.1610361933708191, + "rewards/rejected": 0.18284901976585388, + "step": 6252 + }, + { + "epoch": 0.9670210709452929, + "grad_norm": 8.145442008972168, + "learning_rate": 3.7647496849581856e-06, + "logits/chosen": 8.537132263183594, + "logits/rejected": 11.795557022094727, + "logps/chosen": -303.1133117675781, + "logps/rejected": -279.73284912109375, + "loss": 0.9007, + "rewards/accuracies": 0.25, + "rewards/chosen": -0.2750663161277771, + "rewards/margins": -0.2852938771247864, + "rewards/rejected": 0.01022757962346077, + "step": 6253 + }, + { + "epoch": 0.967175720085057, + "grad_norm": 10.77951717376709, + "learning_rate": 3.764463283308512e-06, + "logits/chosen": 10.265498161315918, + "logits/rejected": 10.32868480682373, + "logps/chosen": -298.82305908203125, + "logps/rejected": -321.20587158203125, + "loss": 0.9603, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.20694218575954437, + "rewards/margins": -0.32499951124191284, + "rewards/rejected": 0.5319416522979736, + "step": 6254 + }, + { + "epoch": 0.9673303692248212, + "grad_norm": 5.997251510620117, + "learning_rate": 3.7641768816588385e-06, + "logits/chosen": 5.200666904449463, + "logits/rejected": 4.120314598083496, + "logps/chosen": -254.244873046875, + "logps/rejected": -196.6265869140625, + "loss": 0.7239, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.35333359241485596, + "rewards/margins": 0.07666229456663132, + "rewards/rejected": -0.42999589443206787, + "step": 6255 + }, + { + "epoch": 0.9674850183645853, + "grad_norm": 5.547662734985352, + "learning_rate": 3.763890480009165e-06, + "logits/chosen": 9.776832580566406, + "logits/rejected": 9.78276538848877, + "logps/chosen": -156.112060546875, + "logps/rejected": -188.59878540039062, + "loss": 0.6949, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.12008310854434967, + "rewards/margins": 0.10368173569440842, + "rewards/rejected": -0.2237648367881775, + "step": 6256 + }, + { + "epoch": 0.9676396675043495, + "grad_norm": 6.655076026916504, + "learning_rate": 3.763604078359492e-06, + "logits/chosen": 16.87818145751953, + "logits/rejected": 8.104897499084473, + "logps/chosen": -452.58721923828125, + "logps/rejected": -302.3865966796875, + "loss": 0.6135, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.6017551422119141, + "rewards/margins": 0.4058828353881836, + "rewards/rejected": 0.19587230682373047, + "step": 6257 + }, + { + "epoch": 0.9677943166441136, + "grad_norm": 4.455080509185791, + "learning_rate": 3.7633176767098185e-06, + "logits/chosen": 12.935771942138672, + "logits/rejected": 8.161739349365234, + "logps/chosen": -287.3878173828125, + "logps/rejected": -207.23658752441406, + "loss": 0.6114, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.28862935304641724, + "rewards/margins": 0.36560189723968506, + "rewards/rejected": -0.07697255909442902, + "step": 6258 + }, + { + "epoch": 0.9679489657838778, + "grad_norm": 7.170156955718994, + "learning_rate": 3.7630312750601443e-06, + "logits/chosen": 8.677175521850586, + "logits/rejected": 11.884601593017578, + "logps/chosen": -215.62002563476562, + "logps/rejected": -243.57623291015625, + "loss": 0.9372, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.3639761209487915, + "rewards/margins": -0.3351106643676758, + "rewards/rejected": -0.028865426778793335, + "step": 6259 + }, + { + "epoch": 0.9681036149236419, + "grad_norm": 8.850513458251953, + "learning_rate": 3.762744873410471e-06, + "logits/chosen": 9.936614036560059, + "logits/rejected": 10.412424087524414, + "logps/chosen": -217.98822021484375, + "logps/rejected": -176.16763305664062, + "loss": 0.808, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.19781073927879333, + "rewards/margins": -0.11795802414417267, + "rewards/rejected": -0.07985273003578186, + "step": 6260 + }, + { + "epoch": 0.9682582640634061, + "grad_norm": 6.450249195098877, + "learning_rate": 3.7624584717607976e-06, + "logits/chosen": 7.904399871826172, + "logits/rejected": 9.053790092468262, + "logps/chosen": -238.248779296875, + "logps/rejected": -361.2785339355469, + "loss": 0.7736, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.4528582990169525, + "rewards/margins": 0.10984626412391663, + "rewards/rejected": 0.3430120646953583, + "step": 6261 + }, + { + "epoch": 0.9684129132031704, + "grad_norm": 5.9091010093688965, + "learning_rate": 3.7621720701111242e-06, + "logits/chosen": 5.1943535804748535, + "logits/rejected": 4.121196746826172, + "logps/chosen": -189.3777313232422, + "logps/rejected": -268.4761657714844, + "loss": 0.5993, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.20213675498962402, + "rewards/margins": 0.31495893001556396, + "rewards/rejected": -0.11282214522361755, + "step": 6262 + }, + { + "epoch": 0.9685675623429345, + "grad_norm": 6.606688976287842, + "learning_rate": 3.761885668461451e-06, + "logits/chosen": 16.05550765991211, + "logits/rejected": 12.101274490356445, + "logps/chosen": -400.423583984375, + "logps/rejected": -331.2559814453125, + "loss": 0.7084, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.37084275484085083, + "rewards/margins": 0.27033746242523193, + "rewards/rejected": 0.1005052849650383, + "step": 6263 + }, + { + "epoch": 0.9687222114826987, + "grad_norm": 5.956660747528076, + "learning_rate": 3.7615992668117775e-06, + "logits/chosen": 10.192949295043945, + "logits/rejected": 8.260550498962402, + "logps/chosen": -317.9083557128906, + "logps/rejected": -233.95382690429688, + "loss": 0.9477, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.41795188188552856, + "rewards/margins": -0.08529482781887054, + "rewards/rejected": -0.33265700936317444, + "step": 6264 + }, + { + "epoch": 0.9688768606224628, + "grad_norm": 4.748476982116699, + "learning_rate": 3.7613128651621033e-06, + "logits/chosen": 6.996815204620361, + "logits/rejected": 6.674928665161133, + "logps/chosen": -235.6258087158203, + "logps/rejected": -249.1686553955078, + "loss": 0.6297, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.043549057096242905, + "rewards/margins": 0.21402449905872345, + "rewards/rejected": -0.17047543823719025, + "step": 6265 + }, + { + "epoch": 0.969031509762227, + "grad_norm": 6.7631330490112305, + "learning_rate": 3.76102646351243e-06, + "logits/chosen": 14.424544334411621, + "logits/rejected": 17.116493225097656, + "logps/chosen": -285.1363220214844, + "logps/rejected": -267.27081298828125, + "loss": 0.9648, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.5097063779830933, + "rewards/margins": -0.3229352533817291, + "rewards/rejected": -0.18677110970020294, + "step": 6266 + }, + { + "epoch": 0.9691861589019911, + "grad_norm": 5.223404407501221, + "learning_rate": 3.7607400618627566e-06, + "logits/chosen": 10.831860542297363, + "logits/rejected": 10.007755279541016, + "logps/chosen": -157.50570678710938, + "logps/rejected": -178.83848571777344, + "loss": 0.5841, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.235112726688385, + "rewards/margins": 0.3109947144985199, + "rewards/rejected": -0.5461074709892273, + "step": 6267 + }, + { + "epoch": 0.9693408080417553, + "grad_norm": 4.666409015655518, + "learning_rate": 3.7604536602130833e-06, + "logits/chosen": 10.217757225036621, + "logits/rejected": 5.7212629318237305, + "logps/chosen": -191.2835693359375, + "logps/rejected": -166.89266967773438, + "loss": 0.7029, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.3665688633918762, + "rewards/margins": 0.2122378647327423, + "rewards/rejected": -0.5788067579269409, + "step": 6268 + }, + { + "epoch": 0.9694954571815194, + "grad_norm": 4.502893447875977, + "learning_rate": 3.7601672585634095e-06, + "logits/chosen": 10.026115417480469, + "logits/rejected": 6.265719413757324, + "logps/chosen": -190.56192016601562, + "logps/rejected": -157.34432983398438, + "loss": 0.6764, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.021848969161510468, + "rewards/margins": 0.127274751663208, + "rewards/rejected": -0.10542578250169754, + "step": 6269 + }, + { + "epoch": 0.9696501063212836, + "grad_norm": 4.055923938751221, + "learning_rate": 3.759880856913736e-06, + "logits/chosen": 10.522934913635254, + "logits/rejected": 5.213428497314453, + "logps/chosen": -296.31005859375, + "logps/rejected": -256.1463928222656, + "loss": 0.4316, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.5916823148727417, + "rewards/margins": 0.8471230268478394, + "rewards/rejected": -0.25544074177742004, + "step": 6270 + }, + { + "epoch": 0.9698047554610477, + "grad_norm": 4.781545162200928, + "learning_rate": 3.759594455264063e-06, + "logits/chosen": 13.486408233642578, + "logits/rejected": 12.499540328979492, + "logps/chosen": -281.8381042480469, + "logps/rejected": -267.7893981933594, + "loss": 0.5145, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.59675532579422, + "rewards/margins": 0.4308159053325653, + "rewards/rejected": 0.16593945026397705, + "step": 6271 + }, + { + "epoch": 0.9699594046008119, + "grad_norm": 4.990830421447754, + "learning_rate": 3.759308053614389e-06, + "logits/chosen": 11.217247009277344, + "logits/rejected": 9.477635383605957, + "logps/chosen": -221.22340393066406, + "logps/rejected": -221.83580017089844, + "loss": 0.6782, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.022788338363170624, + "rewards/margins": 0.10545320063829422, + "rewards/rejected": -0.08266487717628479, + "step": 6272 + }, + { + "epoch": 0.970114053740576, + "grad_norm": 4.796380519866943, + "learning_rate": 3.7590216519647153e-06, + "logits/chosen": 7.505277156829834, + "logits/rejected": 6.669300079345703, + "logps/chosen": -220.4984130859375, + "logps/rejected": -211.16049194335938, + "loss": 0.6993, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.11744508892297745, + "rewards/margins": 0.10261349380016327, + "rewards/rejected": 0.014831595122814178, + "step": 6273 + }, + { + "epoch": 0.9702687028803403, + "grad_norm": 12.667013168334961, + "learning_rate": 3.758735250315042e-06, + "logits/chosen": 12.283042907714844, + "logits/rejected": 9.919509887695312, + "logps/chosen": -240.1666259765625, + "logps/rejected": -269.742431640625, + "loss": 0.7003, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.04436378926038742, + "rewards/margins": 0.08059270679950714, + "rewards/rejected": -0.12495648115873337, + "step": 6274 + }, + { + "epoch": 0.9704233520201044, + "grad_norm": 5.496799468994141, + "learning_rate": 3.7584488486653686e-06, + "logits/chosen": 10.627809524536133, + "logits/rejected": 8.267110824584961, + "logps/chosen": -335.3692321777344, + "logps/rejected": -222.99029541015625, + "loss": 0.6714, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.46367499232292175, + "rewards/margins": 0.13004744052886963, + "rewards/rejected": 0.33362752199172974, + "step": 6275 + }, + { + "epoch": 0.9705780011598686, + "grad_norm": 5.949850082397461, + "learning_rate": 3.7581624470156952e-06, + "logits/chosen": 13.366804122924805, + "logits/rejected": 7.2980146408081055, + "logps/chosen": -316.492431640625, + "logps/rejected": -297.769287109375, + "loss": 0.5956, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.12766170501708984, + "rewards/margins": 0.5112359523773193, + "rewards/rejected": -0.3835742175579071, + "step": 6276 + }, + { + "epoch": 0.9707326502996327, + "grad_norm": 4.195515155792236, + "learning_rate": 3.757876045366022e-06, + "logits/chosen": 12.662261962890625, + "logits/rejected": 10.821195602416992, + "logps/chosen": -239.55706787109375, + "logps/rejected": -261.9835205078125, + "loss": 0.4488, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.04185633733868599, + "rewards/margins": 0.6447039842605591, + "rewards/rejected": -0.6865603923797607, + "step": 6277 + }, + { + "epoch": 0.9708872994393969, + "grad_norm": 6.776169300079346, + "learning_rate": 3.7575896437163485e-06, + "logits/chosen": 6.963130950927734, + "logits/rejected": 7.097776412963867, + "logps/chosen": -207.04531860351562, + "logps/rejected": -228.71939086914062, + "loss": 0.7167, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.04304493963718414, + "rewards/margins": 0.06501109898090363, + "rewards/rejected": -0.10805603116750717, + "step": 6278 + }, + { + "epoch": 0.971041948579161, + "grad_norm": 5.149867057800293, + "learning_rate": 3.7573032420666743e-06, + "logits/chosen": 9.821857452392578, + "logits/rejected": 8.230743408203125, + "logps/chosen": -224.55416870117188, + "logps/rejected": -295.2489929199219, + "loss": 0.5014, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.39226651191711426, + "rewards/margins": 0.7520332336425781, + "rewards/rejected": -0.35976678133010864, + "step": 6279 + }, + { + "epoch": 0.9711965977189252, + "grad_norm": 6.54766321182251, + "learning_rate": 3.757016840417001e-06, + "logits/chosen": 9.347705841064453, + "logits/rejected": 4.830849647521973, + "logps/chosen": -319.0818176269531, + "logps/rejected": -241.9410400390625, + "loss": 0.6269, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.25212106108665466, + "rewards/margins": 0.28301262855529785, + "rewards/rejected": -0.030891556292772293, + "step": 6280 + }, + { + "epoch": 0.9713512468586893, + "grad_norm": 5.549529552459717, + "learning_rate": 3.7567304387673276e-06, + "logits/chosen": 12.870904922485352, + "logits/rejected": 9.637121200561523, + "logps/chosen": -336.7303771972656, + "logps/rejected": -274.7347717285156, + "loss": 0.689, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.20548829436302185, + "rewards/margins": 0.1625308394432068, + "rewards/rejected": 0.04295744746923447, + "step": 6281 + }, + { + "epoch": 0.9715058959984535, + "grad_norm": 9.188798904418945, + "learning_rate": 3.7564440371176543e-06, + "logits/chosen": 11.511979103088379, + "logits/rejected": 7.0975189208984375, + "logps/chosen": -355.4588623046875, + "logps/rejected": -344.9566650390625, + "loss": 0.8716, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.11412334442138672, + "rewards/margins": -0.12623050808906555, + "rewards/rejected": 0.012107163667678833, + "step": 6282 + }, + { + "epoch": 0.9716605451382176, + "grad_norm": 3.688019037246704, + "learning_rate": 3.756157635467981e-06, + "logits/chosen": 13.157388687133789, + "logits/rejected": 4.678380966186523, + "logps/chosen": -258.9842834472656, + "logps/rejected": -191.83261108398438, + "loss": 0.5084, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.529140830039978, + "rewards/margins": 0.5969865918159485, + "rewards/rejected": -0.06784577667713165, + "step": 6283 + }, + { + "epoch": 0.9718151942779818, + "grad_norm": 4.0534281730651855, + "learning_rate": 3.755871233818307e-06, + "logits/chosen": 11.310357093811035, + "logits/rejected": 12.62339973449707, + "logps/chosen": -237.59552001953125, + "logps/rejected": -293.6082763671875, + "loss": 0.4773, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.4341612756252289, + "rewards/margins": 0.6125265955924988, + "rewards/rejected": -0.1783653348684311, + "step": 6284 + }, + { + "epoch": 0.971969843417746, + "grad_norm": 5.446765899658203, + "learning_rate": 3.7555848321686334e-06, + "logits/chosen": 10.807339668273926, + "logits/rejected": 5.760083198547363, + "logps/chosen": -329.4555969238281, + "logps/rejected": -267.57373046875, + "loss": 0.6303, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.2157566100358963, + "rewards/margins": 0.2430887371301651, + "rewards/rejected": -0.027332112193107605, + "step": 6285 + }, + { + "epoch": 0.9721244925575101, + "grad_norm": 6.091634750366211, + "learning_rate": 3.75529843051896e-06, + "logits/chosen": 11.049680709838867, + "logits/rejected": 11.144487380981445, + "logps/chosen": -259.19287109375, + "logps/rejected": -274.60821533203125, + "loss": 0.8043, + "rewards/accuracies": 0.125, + "rewards/chosen": 0.28198039531707764, + "rewards/margins": -0.1819252073764801, + "rewards/rejected": 0.46390560269355774, + "step": 6286 + }, + { + "epoch": 0.9722791416972744, + "grad_norm": 4.244925022125244, + "learning_rate": 3.7550120288692867e-06, + "logits/chosen": 7.915404319763184, + "logits/rejected": 7.382715702056885, + "logps/chosen": -170.4591522216797, + "logps/rejected": -210.62298583984375, + "loss": 0.621, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.01643562689423561, + "rewards/margins": 0.2778860032558441, + "rewards/rejected": -0.2614504098892212, + "step": 6287 + }, + { + "epoch": 0.9724337908370385, + "grad_norm": 7.048520088195801, + "learning_rate": 3.754725627219613e-06, + "logits/chosen": 12.771485328674316, + "logits/rejected": 5.459840774536133, + "logps/chosen": -341.1960754394531, + "logps/rejected": -299.1044921875, + "loss": 0.3689, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.391074001789093, + "rewards/margins": 0.8669734001159668, + "rewards/rejected": -0.47589945793151855, + "step": 6288 + }, + { + "epoch": 0.9725884399768027, + "grad_norm": 6.387578964233398, + "learning_rate": 3.7544392255699396e-06, + "logits/chosen": 11.089256286621094, + "logits/rejected": 7.219018936157227, + "logps/chosen": -340.5226745605469, + "logps/rejected": -197.89361572265625, + "loss": 0.7196, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.11683791875839233, + "rewards/margins": 0.037959348410367966, + "rewards/rejected": -0.1547972708940506, + "step": 6289 + }, + { + "epoch": 0.9727430891165668, + "grad_norm": 4.7493696212768555, + "learning_rate": 3.754152823920266e-06, + "logits/chosen": 12.237720489501953, + "logits/rejected": 3.726443290710449, + "logps/chosen": -321.08154296875, + "logps/rejected": -214.3508758544922, + "loss": 0.5117, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.2756778597831726, + "rewards/margins": 0.4415963292121887, + "rewards/rejected": -0.16591843962669373, + "step": 6290 + }, + { + "epoch": 0.972897738256331, + "grad_norm": 9.014803886413574, + "learning_rate": 3.753866422270593e-06, + "logits/chosen": 6.828606605529785, + "logits/rejected": 5.5821967124938965, + "logps/chosen": -307.0285339355469, + "logps/rejected": -301.3689880371094, + "loss": 0.5959, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.2567369341850281, + "rewards/margins": 0.38547590374946594, + "rewards/rejected": -0.12873896956443787, + "step": 6291 + }, + { + "epoch": 0.9730523873960951, + "grad_norm": 8.48596477508545, + "learning_rate": 3.7535800206209187e-06, + "logits/chosen": 4.945058822631836, + "logits/rejected": 4.95685338973999, + "logps/chosen": -204.084228515625, + "logps/rejected": -258.9480285644531, + "loss": 0.8064, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.3339933454990387, + "rewards/margins": 0.06666679680347443, + "rewards/rejected": 0.2673265337944031, + "step": 6292 + }, + { + "epoch": 0.9732070365358593, + "grad_norm": 3.631401538848877, + "learning_rate": 3.7532936189712453e-06, + "logits/chosen": 13.18494987487793, + "logits/rejected": 9.509491920471191, + "logps/chosen": -179.29742431640625, + "logps/rejected": -132.405029296875, + "loss": 0.6282, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.004099570214748383, + "rewards/margins": 0.46705377101898193, + "rewards/rejected": -0.46295422315597534, + "step": 6293 + }, + { + "epoch": 0.9733616856756234, + "grad_norm": 4.574029922485352, + "learning_rate": 3.753007217321572e-06, + "logits/chosen": 10.251144409179688, + "logits/rejected": 12.60718822479248, + "logps/chosen": -375.3944091796875, + "logps/rejected": -291.18341064453125, + "loss": 0.5058, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.44762903451919556, + "rewards/margins": 0.7367630004882812, + "rewards/rejected": -0.2891339957714081, + "step": 6294 + }, + { + "epoch": 0.9735163348153876, + "grad_norm": 4.10559606552124, + "learning_rate": 3.7527208156718986e-06, + "logits/chosen": 8.01666259765625, + "logits/rejected": 8.446287155151367, + "logps/chosen": -166.8321075439453, + "logps/rejected": -226.93145751953125, + "loss": 0.589, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.0058164894580841064, + "rewards/margins": 0.5392737984657288, + "rewards/rejected": -0.5450902581214905, + "step": 6295 + }, + { + "epoch": 0.9736709839551517, + "grad_norm": 6.397380828857422, + "learning_rate": 3.7524344140222253e-06, + "logits/chosen": 8.823354721069336, + "logits/rejected": 14.64541244506836, + "logps/chosen": -217.61114501953125, + "logps/rejected": -306.6552429199219, + "loss": 0.8856, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.17376859486103058, + "rewards/margins": -0.03956770896911621, + "rewards/rejected": -0.13420085608959198, + "step": 6296 + }, + { + "epoch": 0.9738256330949159, + "grad_norm": 8.119359970092773, + "learning_rate": 3.752148012372552e-06, + "logits/chosen": 16.643131256103516, + "logits/rejected": 14.099477767944336, + "logps/chosen": -293.25189208984375, + "logps/rejected": -257.774169921875, + "loss": 0.8966, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.1111118495464325, + "rewards/margins": -0.11099110543727875, + "rewards/rejected": -0.00012074783444404602, + "step": 6297 + }, + { + "epoch": 0.97398028223468, + "grad_norm": 5.610093593597412, + "learning_rate": 3.7518616107228777e-06, + "logits/chosen": 12.070931434631348, + "logits/rejected": 9.110671043395996, + "logps/chosen": -239.3953857421875, + "logps/rejected": -190.05569458007812, + "loss": 0.6075, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.1395576298236847, + "rewards/margins": 0.3216792643070221, + "rewards/rejected": -0.1821216642856598, + "step": 6298 + }, + { + "epoch": 0.9741349313744442, + "grad_norm": 4.453665733337402, + "learning_rate": 3.7515752090732044e-06, + "logits/chosen": 12.264678955078125, + "logits/rejected": 7.147149085998535, + "logps/chosen": -319.1675109863281, + "logps/rejected": -209.34268188476562, + "loss": 0.5107, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.392706036567688, + "rewards/margins": 0.45684176683425903, + "rewards/rejected": -0.06413574516773224, + "step": 6299 + }, + { + "epoch": 0.9742895805142084, + "grad_norm": 5.393481254577637, + "learning_rate": 3.751288807423531e-06, + "logits/chosen": 12.595806121826172, + "logits/rejected": 9.223859786987305, + "logps/chosen": -296.7220458984375, + "logps/rejected": -285.6002197265625, + "loss": 0.6413, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.3037174642086029, + "rewards/margins": 0.1938306838274002, + "rewards/rejected": 0.10988680273294449, + "step": 6300 + }, + { + "epoch": 0.9744442296539726, + "grad_norm": 4.3994526863098145, + "learning_rate": 3.7510024057738577e-06, + "logits/chosen": 12.864940643310547, + "logits/rejected": 8.482186317443848, + "logps/chosen": -239.47463989257812, + "logps/rejected": -251.9224853515625, + "loss": 0.4344, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.05874108523130417, + "rewards/margins": 0.6514422297477722, + "rewards/rejected": -0.710183322429657, + "step": 6301 + }, + { + "epoch": 0.9745988787937367, + "grad_norm": 5.376049041748047, + "learning_rate": 3.7507160041241843e-06, + "logits/chosen": 13.081180572509766, + "logits/rejected": 12.240509033203125, + "logps/chosen": -333.3944091796875, + "logps/rejected": -307.56640625, + "loss": 0.6369, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.5191308259963989, + "rewards/margins": 0.15374480187892914, + "rewards/rejected": 0.3653860092163086, + "step": 6302 + }, + { + "epoch": 0.9747535279335009, + "grad_norm": 4.008645534515381, + "learning_rate": 3.7504296024745105e-06, + "logits/chosen": 7.70807409286499, + "logits/rejected": 3.812920093536377, + "logps/chosen": -269.4783935546875, + "logps/rejected": -174.51019287109375, + "loss": 0.5031, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.01090259850025177, + "rewards/margins": 0.6251951456069946, + "rewards/rejected": -0.63609778881073, + "step": 6303 + }, + { + "epoch": 0.974908177073265, + "grad_norm": 5.73533296585083, + "learning_rate": 3.750143200824837e-06, + "logits/chosen": 11.198104858398438, + "logits/rejected": 15.782443046569824, + "logps/chosen": -220.4842529296875, + "logps/rejected": -290.2976379394531, + "loss": 0.7058, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.2851814031600952, + "rewards/margins": 0.029813483357429504, + "rewards/rejected": -0.3149949014186859, + "step": 6304 + }, + { + "epoch": 0.9750628262130292, + "grad_norm": 5.344424724578857, + "learning_rate": 3.7498567991751634e-06, + "logits/chosen": 10.123014450073242, + "logits/rejected": 5.762684345245361, + "logps/chosen": -327.5176696777344, + "logps/rejected": -204.54148864746094, + "loss": 0.7542, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.12442417442798615, + "rewards/margins": -0.027842365205287933, + "rewards/rejected": 0.15226653218269348, + "step": 6305 + }, + { + "epoch": 0.9752174753527934, + "grad_norm": 7.004614353179932, + "learning_rate": 3.74957039752549e-06, + "logits/chosen": 4.719689846038818, + "logits/rejected": 5.728633403778076, + "logps/chosen": -403.8475036621094, + "logps/rejected": -365.57611083984375, + "loss": 0.6849, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.08424302935600281, + "rewards/margins": 0.18592345714569092, + "rewards/rejected": -0.10168042778968811, + "step": 6306 + }, + { + "epoch": 0.9753721244925575, + "grad_norm": 4.442594051361084, + "learning_rate": 3.7492839958758163e-06, + "logits/chosen": 16.242652893066406, + "logits/rejected": 10.898727416992188, + "logps/chosen": -348.8134460449219, + "logps/rejected": -268.0030517578125, + "loss": 0.5538, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.27345019578933716, + "rewards/margins": 0.46818414330482483, + "rewards/rejected": -0.19473400712013245, + "step": 6307 + }, + { + "epoch": 0.9755267736323217, + "grad_norm": 4.056453704833984, + "learning_rate": 3.748997594226143e-06, + "logits/chosen": 9.053031921386719, + "logits/rejected": 9.082283020019531, + "logps/chosen": -296.16461181640625, + "logps/rejected": -258.0118408203125, + "loss": 0.5295, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.3165741562843323, + "rewards/margins": 0.49262741208076477, + "rewards/rejected": -0.1760532259941101, + "step": 6308 + }, + { + "epoch": 0.9756814227720858, + "grad_norm": 4.034596920013428, + "learning_rate": 3.7487111925764696e-06, + "logits/chosen": 12.421431541442871, + "logits/rejected": 5.468685150146484, + "logps/chosen": -323.6148986816406, + "logps/rejected": -247.73236083984375, + "loss": 0.4609, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.7340185642242432, + "rewards/margins": 0.7449280619621277, + "rewards/rejected": -0.010909520089626312, + "step": 6309 + }, + { + "epoch": 0.97583607191185, + "grad_norm": 7.380002498626709, + "learning_rate": 3.7484247909267962e-06, + "logits/chosen": 7.1760993003845215, + "logits/rejected": 7.825562000274658, + "logps/chosen": -402.88702392578125, + "logps/rejected": -314.72381591796875, + "loss": 0.6748, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.5270090103149414, + "rewards/margins": 0.19681420922279358, + "rewards/rejected": 0.3301948308944702, + "step": 6310 + }, + { + "epoch": 0.9759907210516141, + "grad_norm": 3.603492021560669, + "learning_rate": 3.748138389277123e-06, + "logits/chosen": 10.86450481414795, + "logits/rejected": 3.8220930099487305, + "logps/chosen": -231.05758666992188, + "logps/rejected": -205.30532836914062, + "loss": 0.5533, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.21636362373828888, + "rewards/margins": 0.5296019911766052, + "rewards/rejected": -0.31323838233947754, + "step": 6311 + }, + { + "epoch": 0.9761453701913783, + "grad_norm": 5.500557899475098, + "learning_rate": 3.7478519876274487e-06, + "logits/chosen": 5.7570977210998535, + "logits/rejected": 3.496809244155884, + "logps/chosen": -190.95701599121094, + "logps/rejected": -257.35772705078125, + "loss": 0.6752, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.3326965570449829, + "rewards/margins": 0.16002917289733887, + "rewards/rejected": -0.492725670337677, + "step": 6312 + }, + { + "epoch": 0.9763000193311425, + "grad_norm": 3.8797390460968018, + "learning_rate": 3.7475655859777754e-06, + "logits/chosen": 12.644182205200195, + "logits/rejected": 7.577823638916016, + "logps/chosen": -379.1197509765625, + "logps/rejected": -282.9969482421875, + "loss": 0.4467, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.1979825794696808, + "rewards/margins": 0.6604365706443787, + "rewards/rejected": -0.46245402097702026, + "step": 6313 + }, + { + "epoch": 0.9764546684709067, + "grad_norm": 3.9358632564544678, + "learning_rate": 3.747279184328102e-06, + "logits/chosen": 10.954132080078125, + "logits/rejected": 11.247160911560059, + "logps/chosen": -290.91741943359375, + "logps/rejected": -241.4364471435547, + "loss": 0.5067, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.37206947803497314, + "rewards/margins": 0.552484393119812, + "rewards/rejected": -0.18041495978832245, + "step": 6314 + }, + { + "epoch": 0.9766093176106708, + "grad_norm": 5.466884613037109, + "learning_rate": 3.7469927826784287e-06, + "logits/chosen": 4.846158981323242, + "logits/rejected": 6.689931392669678, + "logps/chosen": -251.2412567138672, + "logps/rejected": -266.4517517089844, + "loss": 0.7856, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.06358089298009872, + "rewards/margins": -0.060526855289936066, + "rewards/rejected": 0.12410774827003479, + "step": 6315 + }, + { + "epoch": 0.976763966750435, + "grad_norm": 6.830657005310059, + "learning_rate": 3.7467063810287553e-06, + "logits/chosen": 13.615756034851074, + "logits/rejected": 8.434019088745117, + "logps/chosen": -330.7833557128906, + "logps/rejected": -315.7149658203125, + "loss": 0.6189, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.38659992814064026, + "rewards/margins": 0.360819935798645, + "rewards/rejected": 0.025780007243156433, + "step": 6316 + }, + { + "epoch": 0.9769186158901991, + "grad_norm": 5.3813910484313965, + "learning_rate": 3.746419979379082e-06, + "logits/chosen": 10.657546997070312, + "logits/rejected": 10.771195411682129, + "logps/chosen": -222.75119018554688, + "logps/rejected": -309.4671630859375, + "loss": 0.5296, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.2782250642776489, + "rewards/margins": 0.45868441462516785, + "rewards/rejected": -0.1804593950510025, + "step": 6317 + }, + { + "epoch": 0.9770732650299633, + "grad_norm": 5.064642906188965, + "learning_rate": 3.7461335777294078e-06, + "logits/chosen": 11.744096755981445, + "logits/rejected": 11.42077922821045, + "logps/chosen": -246.2164306640625, + "logps/rejected": -269.1891784667969, + "loss": 0.6044, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.31924593448638916, + "rewards/margins": 0.44632774591445923, + "rewards/rejected": -0.12708181142807007, + "step": 6318 + }, + { + "epoch": 0.9772279141697274, + "grad_norm": 6.542880535125732, + "learning_rate": 3.7458471760797344e-06, + "logits/chosen": 8.96570873260498, + "logits/rejected": 2.1790828704833984, + "logps/chosen": -452.2339172363281, + "logps/rejected": -301.28253173828125, + "loss": 0.5202, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.7694894075393677, + "rewards/margins": 0.588054895401001, + "rewards/rejected": 0.18143445253372192, + "step": 6319 + }, + { + "epoch": 0.9773825633094916, + "grad_norm": 5.794060230255127, + "learning_rate": 3.745560774430061e-06, + "logits/chosen": 11.890701293945312, + "logits/rejected": 11.94587516784668, + "logps/chosen": -277.8301696777344, + "logps/rejected": -370.354248046875, + "loss": 0.7621, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.055869489908218384, + "rewards/margins": -0.0626932829618454, + "rewards/rejected": 0.11856281757354736, + "step": 6320 + }, + { + "epoch": 0.9775372124492557, + "grad_norm": 4.301764011383057, + "learning_rate": 3.7452743727803877e-06, + "logits/chosen": 11.885927200317383, + "logits/rejected": 11.155097961425781, + "logps/chosen": -203.3512725830078, + "logps/rejected": -293.7401123046875, + "loss": 0.5359, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.15808498859405518, + "rewards/margins": 0.5096316337585449, + "rewards/rejected": -0.6677165627479553, + "step": 6321 + }, + { + "epoch": 0.9776918615890199, + "grad_norm": 5.338297367095947, + "learning_rate": 3.744987971130714e-06, + "logits/chosen": 10.313863754272461, + "logits/rejected": 5.111762046813965, + "logps/chosen": -382.3363342285156, + "logps/rejected": -243.29466247558594, + "loss": 0.6486, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.037276655435562134, + "rewards/margins": 0.24384687840938568, + "rewards/rejected": -0.20657017827033997, + "step": 6322 + }, + { + "epoch": 0.977846510728784, + "grad_norm": 3.9889814853668213, + "learning_rate": 3.7447015694810406e-06, + "logits/chosen": 6.860918045043945, + "logits/rejected": 10.217616081237793, + "logps/chosen": -210.58311462402344, + "logps/rejected": -235.1737518310547, + "loss": 0.4993, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.10423905402421951, + "rewards/margins": 0.521422803401947, + "rewards/rejected": -0.4171837866306305, + "step": 6323 + }, + { + "epoch": 0.9780011598685482, + "grad_norm": 4.534695148468018, + "learning_rate": 3.7444151678313672e-06, + "logits/chosen": 5.336467742919922, + "logits/rejected": 5.143927574157715, + "logps/chosen": -235.08445739746094, + "logps/rejected": -248.03250122070312, + "loss": 0.6127, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.38200390338897705, + "rewards/margins": 0.3354136347770691, + "rewards/rejected": 0.04659028351306915, + "step": 6324 + }, + { + "epoch": 0.9781558090083123, + "grad_norm": 6.216179370880127, + "learning_rate": 3.7441287661816935e-06, + "logits/chosen": 7.222482681274414, + "logits/rejected": 10.387896537780762, + "logps/chosen": -293.2083740234375, + "logps/rejected": -327.56201171875, + "loss": 0.4955, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.07214432209730148, + "rewards/margins": 0.5247325897216797, + "rewards/rejected": -0.4525882601737976, + "step": 6325 + }, + { + "epoch": 0.9783104581480766, + "grad_norm": 5.459686279296875, + "learning_rate": 3.7438423645320197e-06, + "logits/chosen": 16.51071548461914, + "logits/rejected": 8.343562126159668, + "logps/chosen": -266.8075866699219, + "logps/rejected": -244.42324829101562, + "loss": 0.5517, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.17756357789039612, + "rewards/margins": 0.5416630506515503, + "rewards/rejected": -0.36409950256347656, + "step": 6326 + }, + { + "epoch": 0.9784651072878408, + "grad_norm": 5.218517780303955, + "learning_rate": 3.7435559628823463e-06, + "logits/chosen": 13.929750442504883, + "logits/rejected": 8.82845687866211, + "logps/chosen": -417.3029479980469, + "logps/rejected": -352.20751953125, + "loss": 0.5191, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.3881669342517853, + "rewards/margins": 0.5378721952438354, + "rewards/rejected": -0.14970529079437256, + "step": 6327 + }, + { + "epoch": 0.9786197564276049, + "grad_norm": 4.251748085021973, + "learning_rate": 3.743269561232673e-06, + "logits/chosen": 11.406187057495117, + "logits/rejected": 6.826017379760742, + "logps/chosen": -208.38363647460938, + "logps/rejected": -186.21038818359375, + "loss": 0.4853, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.1649625301361084, + "rewards/margins": 0.5692854523658752, + "rewards/rejected": -0.40432295203208923, + "step": 6328 + }, + { + "epoch": 0.978774405567369, + "grad_norm": 5.656533241271973, + "learning_rate": 3.7429831595829996e-06, + "logits/chosen": 11.392833709716797, + "logits/rejected": 8.460349082946777, + "logps/chosen": -315.70086669921875, + "logps/rejected": -250.5478973388672, + "loss": 0.6484, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.03816051781177521, + "rewards/margins": 0.4078497886657715, + "rewards/rejected": -0.4460102915763855, + "step": 6329 + }, + { + "epoch": 0.9789290547071332, + "grad_norm": 6.576191425323486, + "learning_rate": 3.7426967579333263e-06, + "logits/chosen": 14.664199829101562, + "logits/rejected": 14.87417221069336, + "logps/chosen": -287.00469970703125, + "logps/rejected": -283.00994873046875, + "loss": 0.8008, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.8382795453071594, + "rewards/margins": 0.001187305897474289, + "rewards/rejected": -0.8394668102264404, + "step": 6330 + }, + { + "epoch": 0.9790837038468974, + "grad_norm": 4.4208984375, + "learning_rate": 3.742410356283652e-06, + "logits/chosen": 8.74218463897705, + "logits/rejected": 9.795685768127441, + "logps/chosen": -262.88226318359375, + "logps/rejected": -216.15997314453125, + "loss": 0.682, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.3701019287109375, + "rewards/margins": 0.08830776065587997, + "rewards/rejected": 0.28179416060447693, + "step": 6331 + }, + { + "epoch": 0.9792383529866615, + "grad_norm": 4.583682537078857, + "learning_rate": 3.7421239546339787e-06, + "logits/chosen": 11.940658569335938, + "logits/rejected": 5.56840181350708, + "logps/chosen": -402.09283447265625, + "logps/rejected": -268.4326477050781, + "loss": 0.4906, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.505084753036499, + "rewards/margins": 0.9302171468734741, + "rewards/rejected": -0.4251323640346527, + "step": 6332 + }, + { + "epoch": 0.9793930021264257, + "grad_norm": 4.248706817626953, + "learning_rate": 3.7418375529843054e-06, + "logits/chosen": 11.651357650756836, + "logits/rejected": 5.169594764709473, + "logps/chosen": -423.9834899902344, + "logps/rejected": -283.9046936035156, + "loss": 0.4152, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.4368492066860199, + "rewards/margins": 0.8613738417625427, + "rewards/rejected": -0.42452460527420044, + "step": 6333 + }, + { + "epoch": 0.9795476512661898, + "grad_norm": 3.337502956390381, + "learning_rate": 3.741551151334632e-06, + "logits/chosen": 10.139825820922852, + "logits/rejected": 7.991117477416992, + "logps/chosen": -232.88397216796875, + "logps/rejected": -200.31248474121094, + "loss": 0.4523, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.46951746940612793, + "rewards/margins": 0.6828499436378479, + "rewards/rejected": -1.152367353439331, + "step": 6334 + }, + { + "epoch": 0.979702300405954, + "grad_norm": 8.208161354064941, + "learning_rate": 3.7412647496849587e-06, + "logits/chosen": 8.249188423156738, + "logits/rejected": 8.633781433105469, + "logps/chosen": -392.4921875, + "logps/rejected": -305.61383056640625, + "loss": 0.6339, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.2545856535434723, + "rewards/margins": 0.16935941576957703, + "rewards/rejected": 0.08522625267505646, + "step": 6335 + }, + { + "epoch": 0.9798569495457181, + "grad_norm": 5.370692253112793, + "learning_rate": 3.7409783480352853e-06, + "logits/chosen": 10.284990310668945, + "logits/rejected": 8.454547882080078, + "logps/chosen": -375.0708312988281, + "logps/rejected": -283.7245788574219, + "loss": 0.6497, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.23219729959964752, + "rewards/margins": 0.33492040634155273, + "rewards/rejected": -0.1027231216430664, + "step": 6336 + }, + { + "epoch": 0.9800115986854823, + "grad_norm": 7.273519039154053, + "learning_rate": 3.7406919463856116e-06, + "logits/chosen": 14.320666313171387, + "logits/rejected": 12.81432056427002, + "logps/chosen": -365.43475341796875, + "logps/rejected": -246.53311157226562, + "loss": 0.8923, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.32235288619995117, + "rewards/margins": -0.14710262417793274, + "rewards/rejected": -0.17525026202201843, + "step": 6337 + }, + { + "epoch": 0.9801662478252464, + "grad_norm": 4.570174694061279, + "learning_rate": 3.740405544735938e-06, + "logits/chosen": 13.6356840133667, + "logits/rejected": 8.28172779083252, + "logps/chosen": -341.417724609375, + "logps/rejected": -264.3694763183594, + "loss": 0.523, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.3188232183456421, + "rewards/margins": 0.8184434175491333, + "rewards/rejected": -0.499620258808136, + "step": 6338 + }, + { + "epoch": 0.9803208969650107, + "grad_norm": 9.591302871704102, + "learning_rate": 3.7401191430862645e-06, + "logits/chosen": 4.8775482177734375, + "logits/rejected": 6.110818386077881, + "logps/chosen": -141.5069122314453, + "logps/rejected": -298.1834411621094, + "loss": 0.7242, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.3294106125831604, + "rewards/margins": 0.20954623818397522, + "rewards/rejected": -0.5389568209648132, + "step": 6339 + }, + { + "epoch": 0.9804755461047748, + "grad_norm": 6.365485668182373, + "learning_rate": 3.739832741436591e-06, + "logits/chosen": 7.626447677612305, + "logits/rejected": 9.57275390625, + "logps/chosen": -249.103271484375, + "logps/rejected": -248.5445556640625, + "loss": 0.7672, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.010637475177645683, + "rewards/margins": -0.06676015257835388, + "rewards/rejected": 0.05612267926335335, + "step": 6340 + }, + { + "epoch": 0.980630195244539, + "grad_norm": 6.013866901397705, + "learning_rate": 3.7395463397869173e-06, + "logits/chosen": 15.039027214050293, + "logits/rejected": 10.024212837219238, + "logps/chosen": -308.22174072265625, + "logps/rejected": -277.9765319824219, + "loss": 0.6094, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.10356522351503372, + "rewards/margins": 0.4523034393787384, + "rewards/rejected": -0.5558686256408691, + "step": 6341 + }, + { + "epoch": 0.9807848443843031, + "grad_norm": 8.10254192352295, + "learning_rate": 3.739259938137244e-06, + "logits/chosen": 11.551238059997559, + "logits/rejected": 1.3056763410568237, + "logps/chosen": -341.2266845703125, + "logps/rejected": -220.62278747558594, + "loss": 0.7973, + "rewards/accuracies": 0.25, + "rewards/chosen": -0.03634757548570633, + "rewards/margins": -0.10511816293001175, + "rewards/rejected": 0.06877059489488602, + "step": 6342 + }, + { + "epoch": 0.9809394935240673, + "grad_norm": 6.16043758392334, + "learning_rate": 3.7389735364875706e-06, + "logits/chosen": 5.587350845336914, + "logits/rejected": 3.4804418087005615, + "logps/chosen": -188.5323486328125, + "logps/rejected": -241.19775390625, + "loss": 0.579, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.3693351149559021, + "rewards/margins": 0.5227667093276978, + "rewards/rejected": -0.15343162417411804, + "step": 6343 + }, + { + "epoch": 0.9810941426638314, + "grad_norm": 5.35908317565918, + "learning_rate": 3.7386871348378973e-06, + "logits/chosen": 7.207553386688232, + "logits/rejected": 5.906684398651123, + "logps/chosen": -210.07369995117188, + "logps/rejected": -238.39297485351562, + "loss": 0.7538, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.03599214926362038, + "rewards/margins": 0.04232324659824371, + "rewards/rejected": -0.006331082433462143, + "step": 6344 + }, + { + "epoch": 0.9812487918035956, + "grad_norm": 4.146737575531006, + "learning_rate": 3.738400733188223e-06, + "logits/chosen": 7.1606035232543945, + "logits/rejected": 8.163259506225586, + "logps/chosen": -143.42739868164062, + "logps/rejected": -185.353271484375, + "loss": 0.6346, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.09329091012477875, + "rewards/margins": 0.22248223423957825, + "rewards/rejected": -0.1291913092136383, + "step": 6345 + }, + { + "epoch": 0.9814034409433597, + "grad_norm": 4.873830795288086, + "learning_rate": 3.7381143315385497e-06, + "logits/chosen": 15.064471244812012, + "logits/rejected": 10.58346176147461, + "logps/chosen": -322.13592529296875, + "logps/rejected": -242.6052703857422, + "loss": 0.6514, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.15427017211914062, + "rewards/margins": 0.19796046614646912, + "rewards/rejected": -0.04369029402732849, + "step": 6346 + }, + { + "epoch": 0.9815580900831239, + "grad_norm": 4.298923492431641, + "learning_rate": 3.7378279298888764e-06, + "logits/chosen": 9.079001426696777, + "logits/rejected": 4.134699821472168, + "logps/chosen": -292.2060546875, + "logps/rejected": -202.46417236328125, + "loss": 0.4833, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.32802486419677734, + "rewards/margins": 0.5648854970932007, + "rewards/rejected": -0.23686060309410095, + "step": 6347 + }, + { + "epoch": 0.981712739222888, + "grad_norm": 9.15503978729248, + "learning_rate": 3.737541528239203e-06, + "logits/chosen": 14.616005897521973, + "logits/rejected": 7.228925704956055, + "logps/chosen": -310.8160400390625, + "logps/rejected": -213.77389526367188, + "loss": 0.7195, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.07918335497379303, + "rewards/margins": 0.22846432030200958, + "rewards/rejected": -0.14928096532821655, + "step": 6348 + }, + { + "epoch": 0.9818673883626522, + "grad_norm": 7.727329730987549, + "learning_rate": 3.7372551265895297e-06, + "logits/chosen": 10.335702896118164, + "logits/rejected": 8.534158706665039, + "logps/chosen": -396.2249755859375, + "logps/rejected": -378.1708679199219, + "loss": 0.7248, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.49631375074386597, + "rewards/margins": 0.06901147961616516, + "rewards/rejected": 0.4273022711277008, + "step": 6349 + }, + { + "epoch": 0.9820220375024163, + "grad_norm": 5.398975849151611, + "learning_rate": 3.7369687249398563e-06, + "logits/chosen": 8.378582000732422, + "logits/rejected": 12.507745742797852, + "logps/chosen": -305.9085388183594, + "logps/rejected": -333.411865234375, + "loss": 0.5532, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.17840319871902466, + "rewards/margins": 0.5202721357345581, + "rewards/rejected": -0.3418689966201782, + "step": 6350 + }, + { + "epoch": 0.9821766866421806, + "grad_norm": 5.5790019035339355, + "learning_rate": 3.736682323290182e-06, + "logits/chosen": 13.69083023071289, + "logits/rejected": 6.97307825088501, + "logps/chosen": -358.3127746582031, + "logps/rejected": -206.38015747070312, + "loss": 0.5706, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.2394128292798996, + "rewards/margins": 0.3754867911338806, + "rewards/rejected": -0.1360739767551422, + "step": 6351 + }, + { + "epoch": 0.9823313357819448, + "grad_norm": 5.558178424835205, + "learning_rate": 3.736395921640509e-06, + "logits/chosen": 10.22050666809082, + "logits/rejected": 8.757943153381348, + "logps/chosen": -345.7509460449219, + "logps/rejected": -301.1663513183594, + "loss": 0.6601, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.28731414675712585, + "rewards/margins": 0.13952180743217468, + "rewards/rejected": 0.14779233932495117, + "step": 6352 + }, + { + "epoch": 0.9824859849217089, + "grad_norm": 4.343262672424316, + "learning_rate": 3.7361095199908354e-06, + "logits/chosen": 8.431159019470215, + "logits/rejected": 5.432050704956055, + "logps/chosen": -168.1685333251953, + "logps/rejected": -182.77088928222656, + "loss": 0.6361, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.10684751719236374, + "rewards/margins": 0.2605520486831665, + "rewards/rejected": -0.3673996031284332, + "step": 6353 + }, + { + "epoch": 0.9826406340614731, + "grad_norm": 6.614505290985107, + "learning_rate": 3.735823118341162e-06, + "logits/chosen": 4.245138168334961, + "logits/rejected": 4.8325276374816895, + "logps/chosen": -252.76461791992188, + "logps/rejected": -242.34823608398438, + "loss": 0.6661, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.20296013355255127, + "rewards/margins": 0.23311752080917358, + "rewards/rejected": -0.43607765436172485, + "step": 6354 + }, + { + "epoch": 0.9827952832012372, + "grad_norm": 7.216494560241699, + "learning_rate": 3.7355367166914887e-06, + "logits/chosen": 8.744220733642578, + "logits/rejected": 9.35415267944336, + "logps/chosen": -209.74905395507812, + "logps/rejected": -250.81924438476562, + "loss": 0.7311, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.19596849381923676, + "rewards/margins": 0.05969156324863434, + "rewards/rejected": -0.2556600570678711, + "step": 6355 + }, + { + "epoch": 0.9829499323410014, + "grad_norm": 5.547492027282715, + "learning_rate": 3.735250315041815e-06, + "logits/chosen": 10.118589401245117, + "logits/rejected": 5.989643573760986, + "logps/chosen": -265.45489501953125, + "logps/rejected": -189.0480194091797, + "loss": 0.5531, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.530346155166626, + "rewards/margins": 0.6523032784461975, + "rewards/rejected": -0.12195707857608795, + "step": 6356 + }, + { + "epoch": 0.9831045814807655, + "grad_norm": 3.405036449432373, + "learning_rate": 3.7349639133921416e-06, + "logits/chosen": 6.900462627410889, + "logits/rejected": 8.3464994430542, + "logps/chosen": -157.25973510742188, + "logps/rejected": -232.02215576171875, + "loss": 0.435, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.3262181580066681, + "rewards/margins": 1.0053631067276, + "rewards/rejected": -0.6791449189186096, + "step": 6357 + }, + { + "epoch": 0.9832592306205297, + "grad_norm": 6.441837310791016, + "learning_rate": 3.734677511742468e-06, + "logits/chosen": 11.748625755310059, + "logits/rejected": 13.682984352111816, + "logps/chosen": -368.30633544921875, + "logps/rejected": -377.2191162109375, + "loss": 0.649, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.09854501485824585, + "rewards/margins": 0.3158285915851593, + "rewards/rejected": -0.21728357672691345, + "step": 6358 + }, + { + "epoch": 0.9834138797602938, + "grad_norm": 6.330887317657471, + "learning_rate": 3.7343911100927945e-06, + "logits/chosen": 9.120230674743652, + "logits/rejected": 10.975332260131836, + "logps/chosen": -350.6260681152344, + "logps/rejected": -413.9373779296875, + "loss": 0.6913, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.02962927520275116, + "rewards/margins": 0.3403489589691162, + "rewards/rejected": -0.36997824907302856, + "step": 6359 + }, + { + "epoch": 0.983568528900058, + "grad_norm": 5.733031272888184, + "learning_rate": 3.7341047084431207e-06, + "logits/chosen": 8.469575881958008, + "logits/rejected": 7.300873279571533, + "logps/chosen": -290.756591796875, + "logps/rejected": -237.67581176757812, + "loss": 0.6744, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.1746126264333725, + "rewards/margins": 0.17614364624023438, + "rewards/rejected": -0.0015310049057006836, + "step": 6360 + }, + { + "epoch": 0.9837231780398221, + "grad_norm": 6.685068607330322, + "learning_rate": 3.7338183067934474e-06, + "logits/chosen": 5.318485260009766, + "logits/rejected": 10.301240921020508, + "logps/chosen": -225.68707275390625, + "logps/rejected": -325.6181640625, + "loss": 0.8425, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.1301746368408203, + "rewards/margins": -0.15783292055130005, + "rewards/rejected": 0.027658268809318542, + "step": 6361 + }, + { + "epoch": 0.9838778271795863, + "grad_norm": 5.311933994293213, + "learning_rate": 3.733531905143774e-06, + "logits/chosen": 15.729347229003906, + "logits/rejected": 11.071685791015625, + "logps/chosen": -256.0406188964844, + "logps/rejected": -199.35235595703125, + "loss": 0.6575, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.060159068554639816, + "rewards/margins": 0.16870784759521484, + "rewards/rejected": -0.22886691987514496, + "step": 6362 + }, + { + "epoch": 0.9840324763193504, + "grad_norm": 3.684779644012451, + "learning_rate": 3.7332455034941007e-06, + "logits/chosen": 9.606433868408203, + "logits/rejected": 0.7240857481956482, + "logps/chosen": -279.3525390625, + "logps/rejected": -215.30224609375, + "loss": 0.4508, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.24538739025592804, + "rewards/margins": 0.7554906010627747, + "rewards/rejected": -1.0008779764175415, + "step": 6363 + }, + { + "epoch": 0.9841871254591147, + "grad_norm": 4.594018459320068, + "learning_rate": 3.7329591018444265e-06, + "logits/chosen": 9.479605674743652, + "logits/rejected": 11.16738224029541, + "logps/chosen": -212.99578857421875, + "logps/rejected": -239.20811462402344, + "loss": 0.5402, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.37108105421066284, + "rewards/margins": 0.531692624092102, + "rewards/rejected": -0.16061154007911682, + "step": 6364 + }, + { + "epoch": 0.9843417745988788, + "grad_norm": 5.41662073135376, + "learning_rate": 3.732672700194753e-06, + "logits/chosen": 9.314374923706055, + "logits/rejected": 8.568046569824219, + "logps/chosen": -228.67742919921875, + "logps/rejected": -239.1463623046875, + "loss": 0.6967, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.025880619883537292, + "rewards/margins": 0.07058162987232208, + "rewards/rejected": -0.044700995087623596, + "step": 6365 + }, + { + "epoch": 0.984496423738643, + "grad_norm": 4.842507839202881, + "learning_rate": 3.7323862985450798e-06, + "logits/chosen": 16.975248336791992, + "logits/rejected": 8.563737869262695, + "logps/chosen": -381.3269958496094, + "logps/rejected": -256.0245361328125, + "loss": 0.4658, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.4928404688835144, + "rewards/margins": 0.8768168687820435, + "rewards/rejected": -0.38397639989852905, + "step": 6366 + }, + { + "epoch": 0.9846510728784071, + "grad_norm": 5.3084211349487305, + "learning_rate": 3.7320998968954064e-06, + "logits/chosen": 15.943305969238281, + "logits/rejected": 8.33459186553955, + "logps/chosen": -354.7040100097656, + "logps/rejected": -254.4704132080078, + "loss": 0.597, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.08817901462316513, + "rewards/margins": 0.3635047972202301, + "rewards/rejected": -0.45168381929397583, + "step": 6367 + }, + { + "epoch": 0.9848057220181713, + "grad_norm": 4.902894020080566, + "learning_rate": 3.731813495245733e-06, + "logits/chosen": 6.656890392303467, + "logits/rejected": 11.914179801940918, + "logps/chosen": -222.10264587402344, + "logps/rejected": -264.1852111816406, + "loss": 0.6316, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.05056753009557724, + "rewards/margins": 0.2905178964138031, + "rewards/rejected": -0.23995037376880646, + "step": 6368 + }, + { + "epoch": 0.9849603711579354, + "grad_norm": 5.321906566619873, + "learning_rate": 3.7315270935960597e-06, + "logits/chosen": 6.409091949462891, + "logits/rejected": 8.225178718566895, + "logps/chosen": -210.30987548828125, + "logps/rejected": -222.08755493164062, + "loss": 0.7868, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.16246500611305237, + "rewards/margins": 0.0187017060816288, + "rewards/rejected": 0.14376330375671387, + "step": 6369 + }, + { + "epoch": 0.9851150202976996, + "grad_norm": 4.566802501678467, + "learning_rate": 3.731240691946386e-06, + "logits/chosen": 13.245766639709473, + "logits/rejected": 9.584945678710938, + "logps/chosen": -310.4659729003906, + "logps/rejected": -279.9970703125, + "loss": 0.6395, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.3515969216823578, + "rewards/margins": 0.15906305611133575, + "rewards/rejected": 0.19253388047218323, + "step": 6370 + }, + { + "epoch": 0.9852696694374637, + "grad_norm": 5.151280879974365, + "learning_rate": 3.730954290296712e-06, + "logits/chosen": 11.174251556396484, + "logits/rejected": 5.913450241088867, + "logps/chosen": -372.3224792480469, + "logps/rejected": -259.20526123046875, + "loss": 0.4793, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.3799913227558136, + "rewards/margins": 0.5719354152679443, + "rewards/rejected": -0.19194403290748596, + "step": 6371 + }, + { + "epoch": 0.9854243185772279, + "grad_norm": 5.12819242477417, + "learning_rate": 3.730667888647039e-06, + "logits/chosen": 14.985021591186523, + "logits/rejected": 10.003158569335938, + "logps/chosen": -271.2546081542969, + "logps/rejected": -266.3619384765625, + "loss": 0.6395, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.08971020579338074, + "rewards/margins": 0.15500064194202423, + "rewards/rejected": -0.24471083283424377, + "step": 6372 + }, + { + "epoch": 0.985578967716992, + "grad_norm": 7.795731067657471, + "learning_rate": 3.7303814869973655e-06, + "logits/chosen": 9.79568099975586, + "logits/rejected": 10.994924545288086, + "logps/chosen": -419.13677978515625, + "logps/rejected": -419.67138671875, + "loss": 0.8165, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.22752055525779724, + "rewards/margins": 0.06996190547943115, + "rewards/rejected": 0.1575586497783661, + "step": 6373 + }, + { + "epoch": 0.9857336168567562, + "grad_norm": 5.015719890594482, + "learning_rate": 3.730095085347692e-06, + "logits/chosen": 15.121077537536621, + "logits/rejected": 4.451775550842285, + "logps/chosen": -273.3724060058594, + "logps/rejected": -159.63136291503906, + "loss": 0.6138, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.22033709287643433, + "rewards/margins": 0.2666897177696228, + "rewards/rejected": -0.0463525764644146, + "step": 6374 + }, + { + "epoch": 0.9858882659965204, + "grad_norm": 7.012768745422363, + "learning_rate": 3.7298086836980184e-06, + "logits/chosen": 7.010719299316406, + "logits/rejected": 9.252592086791992, + "logps/chosen": -277.5211486816406, + "logps/rejected": -393.36322021484375, + "loss": 0.9625, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.3574404716491699, + "rewards/margins": -0.12064673006534576, + "rewards/rejected": -0.23679374158382416, + "step": 6375 + }, + { + "epoch": 0.9860429151362845, + "grad_norm": 4.6598639488220215, + "learning_rate": 3.729522282048345e-06, + "logits/chosen": 11.543252944946289, + "logits/rejected": 6.669504165649414, + "logps/chosen": -148.09378051757812, + "logps/rejected": -122.16343688964844, + "loss": 0.6663, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.09077329933643341, + "rewards/margins": 0.14782200753688812, + "rewards/rejected": -0.23859530687332153, + "step": 6376 + }, + { + "epoch": 0.9861975642760488, + "grad_norm": 5.08552885055542, + "learning_rate": 3.7292358803986712e-06, + "logits/chosen": 10.157306671142578, + "logits/rejected": 10.05604076385498, + "logps/chosen": -278.8956298828125, + "logps/rejected": -225.61953735351562, + "loss": 0.668, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.013445764780044556, + "rewards/margins": 0.14905597269535065, + "rewards/rejected": -0.1356101930141449, + "step": 6377 + }, + { + "epoch": 0.9863522134158129, + "grad_norm": 6.076529502868652, + "learning_rate": 3.728949478748998e-06, + "logits/chosen": 7.508108615875244, + "logits/rejected": 4.79211950302124, + "logps/chosen": -382.4896240234375, + "logps/rejected": -307.8619384765625, + "loss": 0.8144, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.1903144121170044, + "rewards/margins": -0.1058650016784668, + "rewards/rejected": 0.2961793839931488, + "step": 6378 + }, + { + "epoch": 0.9865068625555771, + "grad_norm": 6.188473224639893, + "learning_rate": 3.728663077099324e-06, + "logits/chosen": 13.722238540649414, + "logits/rejected": 11.189976692199707, + "logps/chosen": -229.97079467773438, + "logps/rejected": -174.31863403320312, + "loss": 0.7357, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.3522096276283264, + "rewards/margins": -0.037701189517974854, + "rewards/rejected": -0.31450843811035156, + "step": 6379 + }, + { + "epoch": 0.9866615116953412, + "grad_norm": 3.196591854095459, + "learning_rate": 3.7283766754496508e-06, + "logits/chosen": 18.061851501464844, + "logits/rejected": 10.70038890838623, + "logps/chosen": -296.8410339355469, + "logps/rejected": -143.9546356201172, + "loss": 0.5163, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.33662357926368713, + "rewards/margins": 0.871543288230896, + "rewards/rejected": -0.5349197387695312, + "step": 6380 + }, + { + "epoch": 0.9868161608351054, + "grad_norm": 5.040510654449463, + "learning_rate": 3.7280902737999774e-06, + "logits/chosen": 10.511635780334473, + "logits/rejected": 7.7900285720825195, + "logps/chosen": -256.0610656738281, + "logps/rejected": -270.65484619140625, + "loss": 0.5722, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.6627236008644104, + "rewards/margins": 0.4329145550727844, + "rewards/rejected": 0.2298090159893036, + "step": 6381 + }, + { + "epoch": 0.9869708099748695, + "grad_norm": 4.601862907409668, + "learning_rate": 3.727803872150304e-06, + "logits/chosen": 7.746264934539795, + "logits/rejected": 5.125948429107666, + "logps/chosen": -207.99002075195312, + "logps/rejected": -240.99896240234375, + "loss": 0.651, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.47464340925216675, + "rewards/margins": 0.2699340879917145, + "rewards/rejected": -0.7445774674415588, + "step": 6382 + }, + { + "epoch": 0.9871254591146337, + "grad_norm": 5.484795093536377, + "learning_rate": 3.7275174705006307e-06, + "logits/chosen": 13.179880142211914, + "logits/rejected": 12.174570083618164, + "logps/chosen": -267.025634765625, + "logps/rejected": -188.67352294921875, + "loss": 0.7008, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.12739549577236176, + "rewards/margins": 0.06455676257610321, + "rewards/rejected": 0.06283873319625854, + "step": 6383 + }, + { + "epoch": 0.9872801082543978, + "grad_norm": 4.959286689758301, + "learning_rate": 3.7272310688509565e-06, + "logits/chosen": 12.184050559997559, + "logits/rejected": 8.341327667236328, + "logps/chosen": -327.19879150390625, + "logps/rejected": -264.320556640625, + "loss": 0.5498, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.3330177366733551, + "rewards/margins": 0.5039314031600952, + "rewards/rejected": -0.1709136962890625, + "step": 6384 + }, + { + "epoch": 0.987434757394162, + "grad_norm": 3.609959363937378, + "learning_rate": 3.726944667201283e-06, + "logits/chosen": 8.2381591796875, + "logits/rejected": 9.955843925476074, + "logps/chosen": -185.87957763671875, + "logps/rejected": -232.37850952148438, + "loss": 0.5972, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.1679568886756897, + "rewards/margins": 0.257804274559021, + "rewards/rejected": -0.08984740823507309, + "step": 6385 + }, + { + "epoch": 0.9875894065339261, + "grad_norm": 4.931064128875732, + "learning_rate": 3.72665826555161e-06, + "logits/chosen": 13.199957847595215, + "logits/rejected": 7.943856239318848, + "logps/chosen": -175.22314453125, + "logps/rejected": -154.5943145751953, + "loss": 0.656, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.04112076759338379, + "rewards/margins": 0.0937330350279808, + "rewards/rejected": -0.05261225998401642, + "step": 6386 + }, + { + "epoch": 0.9877440556736903, + "grad_norm": 3.6509485244750977, + "learning_rate": 3.7263718639019365e-06, + "logits/chosen": 12.731666564941406, + "logits/rejected": 6.374953269958496, + "logps/chosen": -232.49203491210938, + "logps/rejected": -191.49087524414062, + "loss": 0.4798, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.10531330108642578, + "rewards/margins": 0.6045680642127991, + "rewards/rejected": -0.4992547035217285, + "step": 6387 + }, + { + "epoch": 0.9878987048134544, + "grad_norm": 5.918891906738281, + "learning_rate": 3.726085462252263e-06, + "logits/chosen": 14.738401412963867, + "logits/rejected": 7.014528751373291, + "logps/chosen": -442.99920654296875, + "logps/rejected": -271.3382568359375, + "loss": 0.6167, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.3283092677593231, + "rewards/margins": 0.2893158197402954, + "rewards/rejected": 0.038993436843156815, + "step": 6388 + }, + { + "epoch": 0.9880533539532186, + "grad_norm": 5.170039653778076, + "learning_rate": 3.7257990606025894e-06, + "logits/chosen": 9.440051078796387, + "logits/rejected": 12.597487449645996, + "logps/chosen": -180.85267639160156, + "logps/rejected": -208.2803192138672, + "loss": 0.6388, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.09109164774417877, + "rewards/margins": 0.2528671622276306, + "rewards/rejected": -0.16177549958229065, + "step": 6389 + }, + { + "epoch": 0.9882080030929828, + "grad_norm": 5.9500932693481445, + "learning_rate": 3.725512658952916e-06, + "logits/chosen": 10.363550186157227, + "logits/rejected": 9.956448554992676, + "logps/chosen": -300.74749755859375, + "logps/rejected": -325.7415466308594, + "loss": 0.7747, + "rewards/accuracies": 0.25, + "rewards/chosen": -0.15665781497955322, + "rewards/margins": -0.08261716365814209, + "rewards/rejected": -0.07404066622257233, + "step": 6390 + }, + { + "epoch": 0.988362652232747, + "grad_norm": 5.4790449142456055, + "learning_rate": 3.7252262573032422e-06, + "logits/chosen": 11.44119644165039, + "logits/rejected": 9.993799209594727, + "logps/chosen": -244.72235107421875, + "logps/rejected": -254.03421020507812, + "loss": 0.6352, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.26246488094329834, + "rewards/margins": 0.17048987746238708, + "rewards/rejected": 0.09197502583265305, + "step": 6391 + }, + { + "epoch": 0.9885173013725111, + "grad_norm": 6.128749370574951, + "learning_rate": 3.724939855653569e-06, + "logits/chosen": 8.635459899902344, + "logits/rejected": 3.4009971618652344, + "logps/chosen": -204.38153076171875, + "logps/rejected": -212.85047912597656, + "loss": 0.7206, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.28203368186950684, + "rewards/margins": 0.08916183561086655, + "rewards/rejected": -0.371195524930954, + "step": 6392 + }, + { + "epoch": 0.9886719505122753, + "grad_norm": 5.082852363586426, + "learning_rate": 3.7246534540038955e-06, + "logits/chosen": 6.198980331420898, + "logits/rejected": 9.080013275146484, + "logps/chosen": -210.5640411376953, + "logps/rejected": -230.71670532226562, + "loss": 0.741, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.1753840446472168, + "rewards/margins": -0.05014643445611, + "rewards/rejected": 0.2255304902791977, + "step": 6393 + }, + { + "epoch": 0.9888265996520395, + "grad_norm": 7.022514343261719, + "learning_rate": 3.7243670523542218e-06, + "logits/chosen": 15.562702178955078, + "logits/rejected": 8.763711929321289, + "logps/chosen": -299.00506591796875, + "logps/rejected": -256.720458984375, + "loss": 0.6607, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.03995572030544281, + "rewards/margins": 0.23928508162498474, + "rewards/rejected": -0.19932936131954193, + "step": 6394 + }, + { + "epoch": 0.9889812487918036, + "grad_norm": 5.80563497543335, + "learning_rate": 3.7240806507045484e-06, + "logits/chosen": 9.469968795776367, + "logits/rejected": 2.30733060836792, + "logps/chosen": -257.93316650390625, + "logps/rejected": -222.91473388671875, + "loss": 0.6314, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.33021676540374756, + "rewards/margins": 0.43539756536483765, + "rewards/rejected": -0.1051807701587677, + "step": 6395 + }, + { + "epoch": 0.9891358979315678, + "grad_norm": 6.67752742767334, + "learning_rate": 3.723794249054875e-06, + "logits/chosen": 7.952746391296387, + "logits/rejected": 9.51457691192627, + "logps/chosen": -260.056884765625, + "logps/rejected": -264.37542724609375, + "loss": 0.9161, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.05107274651527405, + "rewards/margins": -0.2795703113079071, + "rewards/rejected": 0.22849754989147186, + "step": 6396 + }, + { + "epoch": 0.9892905470713319, + "grad_norm": 3.7259817123413086, + "learning_rate": 3.7235078474052013e-06, + "logits/chosen": 15.1904935836792, + "logits/rejected": 10.23059368133545, + "logps/chosen": -257.2881774902344, + "logps/rejected": -251.77818298339844, + "loss": 0.4748, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.39571958780288696, + "rewards/margins": 0.5983229279518127, + "rewards/rejected": -0.2026033103466034, + "step": 6397 + }, + { + "epoch": 0.9894451962110961, + "grad_norm": 7.705357551574707, + "learning_rate": 3.7232214457555275e-06, + "logits/chosen": 5.060511589050293, + "logits/rejected": 5.846954345703125, + "logps/chosen": -366.26324462890625, + "logps/rejected": -292.3350830078125, + "loss": 0.8663, + "rewards/accuracies": 0.25, + "rewards/chosen": -0.05128952115774155, + "rewards/margins": -0.2110341489315033, + "rewards/rejected": 0.15974465012550354, + "step": 6398 + }, + { + "epoch": 0.9895998453508602, + "grad_norm": 5.061436653137207, + "learning_rate": 3.722935044105854e-06, + "logits/chosen": 7.7066569328308105, + "logits/rejected": 6.9301862716674805, + "logps/chosen": -261.54443359375, + "logps/rejected": -212.9409942626953, + "loss": 0.68, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.3508901000022888, + "rewards/margins": 0.15891200304031372, + "rewards/rejected": 0.1919780671596527, + "step": 6399 + }, + { + "epoch": 0.9897544944906244, + "grad_norm": 6.117293357849121, + "learning_rate": 3.722648642456181e-06, + "logits/chosen": 12.491506576538086, + "logits/rejected": 10.824533462524414, + "logps/chosen": -463.5404052734375, + "logps/rejected": -453.656982421875, + "loss": 0.6627, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.36115768551826477, + "rewards/margins": 0.27406203746795654, + "rewards/rejected": 0.08709569275379181, + "step": 6400 + }, + { + "epoch": 0.9899091436303885, + "grad_norm": 8.071745872497559, + "learning_rate": 3.7223622408065075e-06, + "logits/chosen": 9.31758975982666, + "logits/rejected": 6.549283981323242, + "logps/chosen": -209.12582397460938, + "logps/rejected": -175.58926391601562, + "loss": 0.5469, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.17268727719783783, + "rewards/margins": 0.4429420828819275, + "rewards/rejected": -0.2702547609806061, + "step": 6401 + }, + { + "epoch": 0.9900637927701527, + "grad_norm": 6.005964279174805, + "learning_rate": 3.722075839156834e-06, + "logits/chosen": 7.940710544586182, + "logits/rejected": 7.991883277893066, + "logps/chosen": -260.8432922363281, + "logps/rejected": -264.8493957519531, + "loss": 0.7184, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.28549882769584656, + "rewards/margins": 0.10374833643436432, + "rewards/rejected": 0.18175050616264343, + "step": 6402 + }, + { + "epoch": 0.9902184419099169, + "grad_norm": 4.348447799682617, + "learning_rate": 3.7217894375071608e-06, + "logits/chosen": 10.062132835388184, + "logits/rejected": 9.699281692504883, + "logps/chosen": -239.81065368652344, + "logps/rejected": -228.9882354736328, + "loss": 0.6238, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.2647506892681122, + "rewards/margins": 0.3034078776836395, + "rewards/rejected": -0.03865720331668854, + "step": 6403 + }, + { + "epoch": 0.9903730910496811, + "grad_norm": 7.457139492034912, + "learning_rate": 3.7215030358574866e-06, + "logits/chosen": 10.277219772338867, + "logits/rejected": 12.910514831542969, + "logps/chosen": -314.7123718261719, + "logps/rejected": -374.315185546875, + "loss": 0.8161, + "rewards/accuracies": 0.125, + "rewards/chosen": 0.15039664506912231, + "rewards/margins": -0.1926751285791397, + "rewards/rejected": 0.34307175874710083, + "step": 6404 + }, + { + "epoch": 0.9905277401894452, + "grad_norm": 5.699006080627441, + "learning_rate": 3.7212166342078132e-06, + "logits/chosen": 6.843673229217529, + "logits/rejected": 6.50965690612793, + "logps/chosen": -220.083740234375, + "logps/rejected": -224.9158477783203, + "loss": 0.6835, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.15800824761390686, + "rewards/margins": 0.042635876685380936, + "rewards/rejected": 0.11537237465381622, + "step": 6405 + }, + { + "epoch": 0.9906823893292094, + "grad_norm": 6.266790866851807, + "learning_rate": 3.72093023255814e-06, + "logits/chosen": 7.589700698852539, + "logits/rejected": 13.214097023010254, + "logps/chosen": -244.7571258544922, + "logps/rejected": -375.617431640625, + "loss": 0.8402, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.18168388307094574, + "rewards/margins": -0.1142912358045578, + "rewards/rejected": 0.29597511887550354, + "step": 6406 + }, + { + "epoch": 0.9908370384689735, + "grad_norm": 3.9138331413269043, + "learning_rate": 3.7206438309084665e-06, + "logits/chosen": 2.3229284286499023, + "logits/rejected": 2.293281078338623, + "logps/chosen": -172.07394409179688, + "logps/rejected": -131.80897521972656, + "loss": 0.6227, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.40869075059890747, + "rewards/margins": 0.2160681039094925, + "rewards/rejected": 0.19262266159057617, + "step": 6407 + }, + { + "epoch": 0.9909916876087377, + "grad_norm": 8.796308517456055, + "learning_rate": 3.7203574292587927e-06, + "logits/chosen": 14.957677841186523, + "logits/rejected": 15.079998016357422, + "logps/chosen": -272.4525146484375, + "logps/rejected": -240.1488494873047, + "loss": 0.8273, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.04069849103689194, + "rewards/margins": 0.1002734899520874, + "rewards/rejected": -0.05957496166229248, + "step": 6408 + }, + { + "epoch": 0.9911463367485018, + "grad_norm": 7.710762977600098, + "learning_rate": 3.7200710276091194e-06, + "logits/chosen": 17.11355972290039, + "logits/rejected": 8.132133483886719, + "logps/chosen": -466.312744140625, + "logps/rejected": -349.2236328125, + "loss": 0.5881, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.6155927777290344, + "rewards/margins": 0.3932848572731018, + "rewards/rejected": 0.22230792045593262, + "step": 6409 + }, + { + "epoch": 0.991300985888266, + "grad_norm": 8.482393264770508, + "learning_rate": 3.7197846259594456e-06, + "logits/chosen": 11.441797256469727, + "logits/rejected": 14.680935859680176, + "logps/chosen": -317.5072937011719, + "logps/rejected": -345.8556823730469, + "loss": 0.9578, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.35917553305625916, + "rewards/margins": -0.274425745010376, + "rewards/rejected": -0.08474978804588318, + "step": 6410 + }, + { + "epoch": 0.9914556350280301, + "grad_norm": 4.098133563995361, + "learning_rate": 3.7194982243097723e-06, + "logits/chosen": 10.93844223022461, + "logits/rejected": 8.813115119934082, + "logps/chosen": -180.91848754882812, + "logps/rejected": -167.08865356445312, + "loss": 0.7275, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.06862376630306244, + "rewards/margins": 0.0667669028043747, + "rewards/rejected": -0.13539066910743713, + "step": 6411 + }, + { + "epoch": 0.9916102841677943, + "grad_norm": 6.639828205108643, + "learning_rate": 3.719211822660099e-06, + "logits/chosen": 10.014822006225586, + "logits/rejected": 9.669681549072266, + "logps/chosen": -249.07492065429688, + "logps/rejected": -293.29071044921875, + "loss": 0.655, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.04954013228416443, + "rewards/margins": 0.3228171467781067, + "rewards/rejected": -0.2732769846916199, + "step": 6412 + }, + { + "epoch": 0.9917649333075584, + "grad_norm": 6.708457946777344, + "learning_rate": 3.718925421010425e-06, + "logits/chosen": 6.201162815093994, + "logits/rejected": 2.6435225009918213, + "logps/chosen": -197.08804321289062, + "logps/rejected": -141.50880432128906, + "loss": 0.6185, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.48373717069625854, + "rewards/margins": 0.31828323006629944, + "rewards/rejected": -0.8020203709602356, + "step": 6413 + }, + { + "epoch": 0.9919195824473226, + "grad_norm": 5.775219917297363, + "learning_rate": 3.718639019360752e-06, + "logits/chosen": 10.714132308959961, + "logits/rejected": 9.037699699401855, + "logps/chosen": -355.2578125, + "logps/rejected": -285.55352783203125, + "loss": 0.6524, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.48623791337013245, + "rewards/margins": 0.14448612928390503, + "rewards/rejected": 0.34175175428390503, + "step": 6414 + }, + { + "epoch": 0.9920742315870869, + "grad_norm": 3.9897866249084473, + "learning_rate": 3.7183526177110785e-06, + "logits/chosen": 7.48845100402832, + "logits/rejected": 7.3719987869262695, + "logps/chosen": -206.3500213623047, + "logps/rejected": -230.9735107421875, + "loss": 0.6759, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.04097408801317215, + "rewards/margins": 0.06924095749855042, + "rewards/rejected": -0.11021503806114197, + "step": 6415 + }, + { + "epoch": 0.992228880726851, + "grad_norm": 4.605623722076416, + "learning_rate": 3.718066216061405e-06, + "logits/chosen": 10.593321800231934, + "logits/rejected": 8.16114616394043, + "logps/chosen": -251.10311889648438, + "logps/rejected": -222.08673095703125, + "loss": 0.6673, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.2484852820634842, + "rewards/margins": 0.12412064522504807, + "rewards/rejected": -0.37260594964027405, + "step": 6416 + }, + { + "epoch": 0.9923835298666152, + "grad_norm": 12.407342910766602, + "learning_rate": 3.717779814411731e-06, + "logits/chosen": 10.985397338867188, + "logits/rejected": 0.05024600028991699, + "logps/chosen": -426.27069091796875, + "logps/rejected": -237.6470947265625, + "loss": 0.5892, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.5563673377037048, + "rewards/margins": 0.2939397990703583, + "rewards/rejected": 0.26242750883102417, + "step": 6417 + }, + { + "epoch": 0.9925381790063793, + "grad_norm": 5.3696160316467285, + "learning_rate": 3.7174934127620576e-06, + "logits/chosen": 10.513065338134766, + "logits/rejected": 10.9337158203125, + "logps/chosen": -373.86004638671875, + "logps/rejected": -255.8009490966797, + "loss": 0.5089, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.532660186290741, + "rewards/margins": 0.6481893062591553, + "rewards/rejected": -0.11552904546260834, + "step": 6418 + }, + { + "epoch": 0.9926928281461435, + "grad_norm": 8.462489128112793, + "learning_rate": 3.717207011112384e-06, + "logits/chosen": 5.9502129554748535, + "logits/rejected": 4.107422351837158, + "logps/chosen": -295.1842041015625, + "logps/rejected": -249.59805297851562, + "loss": 0.9288, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.04321698099374771, + "rewards/margins": -0.24786341190338135, + "rewards/rejected": 0.20464642345905304, + "step": 6419 + }, + { + "epoch": 0.9928474772859076, + "grad_norm": 5.031231880187988, + "learning_rate": 3.716920609462711e-06, + "logits/chosen": 11.654215812683105, + "logits/rejected": 2.93538761138916, + "logps/chosen": -221.47122192382812, + "logps/rejected": -166.4352569580078, + "loss": 0.7836, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.14523737132549286, + "rewards/margins": -0.05829368531703949, + "rewards/rejected": -0.08694367110729218, + "step": 6420 + }, + { + "epoch": 0.9930021264256718, + "grad_norm": 3.2796337604522705, + "learning_rate": 3.7166342078130375e-06, + "logits/chosen": 14.392969131469727, + "logits/rejected": 10.050641059875488, + "logps/chosen": -253.76919555664062, + "logps/rejected": -219.97955322265625, + "loss": 0.4514, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.25053876638412476, + "rewards/margins": 0.6669167280197144, + "rewards/rejected": -0.416377991437912, + "step": 6421 + }, + { + "epoch": 0.9931567755654359, + "grad_norm": 8.043817520141602, + "learning_rate": 3.716347806163364e-06, + "logits/chosen": 8.410589218139648, + "logits/rejected": 6.314308166503906, + "logps/chosen": -350.14617919921875, + "logps/rejected": -325.44989013671875, + "loss": 0.8105, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.3739131689071655, + "rewards/margins": -0.14903900027275085, + "rewards/rejected": 0.5229521989822388, + "step": 6422 + }, + { + "epoch": 0.9933114247052001, + "grad_norm": 9.700401306152344, + "learning_rate": 3.7160614045136904e-06, + "logits/chosen": 9.112405776977539, + "logits/rejected": 13.738035202026367, + "logps/chosen": -228.12466430664062, + "logps/rejected": -249.49899291992188, + "loss": 0.6306, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.055996183305978775, + "rewards/margins": 0.4456881880760193, + "rewards/rejected": -0.3896920382976532, + "step": 6423 + }, + { + "epoch": 0.9934660738449642, + "grad_norm": 5.776155948638916, + "learning_rate": 3.7157750028640166e-06, + "logits/chosen": 4.318787574768066, + "logits/rejected": 5.837235450744629, + "logps/chosen": -330.2237548828125, + "logps/rejected": -355.15399169921875, + "loss": 0.5104, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.6251097917556763, + "rewards/margins": 0.5988390445709229, + "rewards/rejected": 0.026270776987075806, + "step": 6424 + }, + { + "epoch": 0.9936207229847284, + "grad_norm": 5.2671918869018555, + "learning_rate": 3.7154886012143433e-06, + "logits/chosen": 9.176443099975586, + "logits/rejected": 6.544194221496582, + "logps/chosen": -290.18756103515625, + "logps/rejected": -340.17877197265625, + "loss": 0.7401, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.05302000045776367, + "rewards/margins": 0.08814285695552826, + "rewards/rejected": -0.14116287231445312, + "step": 6425 + }, + { + "epoch": 0.9937753721244925, + "grad_norm": 4.734760284423828, + "learning_rate": 3.71520219956467e-06, + "logits/chosen": 10.48302173614502, + "logits/rejected": 5.032235622406006, + "logps/chosen": -413.1797790527344, + "logps/rejected": -419.94580078125, + "loss": 0.4113, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.3933395445346832, + "rewards/margins": 0.9376549124717712, + "rewards/rejected": -0.5443153381347656, + "step": 6426 + }, + { + "epoch": 0.9939300212642567, + "grad_norm": 4.734888076782227, + "learning_rate": 3.714915797914996e-06, + "logits/chosen": 5.717303276062012, + "logits/rejected": 7.294363021850586, + "logps/chosen": -229.66470336914062, + "logps/rejected": -305.2263488769531, + "loss": 0.5005, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.16094055771827698, + "rewards/margins": 0.473614364862442, + "rewards/rejected": -0.31267380714416504, + "step": 6427 + }, + { + "epoch": 0.9940846704040209, + "grad_norm": 5.623244762420654, + "learning_rate": 3.714629396265323e-06, + "logits/chosen": 10.129432678222656, + "logits/rejected": 9.351290702819824, + "logps/chosen": -254.37786865234375, + "logps/rejected": -250.22079467773438, + "loss": 0.6417, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.2449115812778473, + "rewards/margins": 0.2890045642852783, + "rewards/rejected": -0.04409298300743103, + "step": 6428 + }, + { + "epoch": 0.9942393195437851, + "grad_norm": 4.211686611175537, + "learning_rate": 3.7143429946156494e-06, + "logits/chosen": 13.407751083374023, + "logits/rejected": 5.79897403717041, + "logps/chosen": -330.6243896484375, + "logps/rejected": -232.6272735595703, + "loss": 0.5101, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.46750152111053467, + "rewards/margins": 0.5535997152328491, + "rewards/rejected": -0.08609818667173386, + "step": 6429 + }, + { + "epoch": 0.9943939686835492, + "grad_norm": 4.994174480438232, + "learning_rate": 3.7140565929659757e-06, + "logits/chosen": 13.413518905639648, + "logits/rejected": 3.5449743270874023, + "logps/chosen": -363.74908447265625, + "logps/rejected": -267.62506103515625, + "loss": 0.5234, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.6236370205879211, + "rewards/margins": 0.6098806262016296, + "rewards/rejected": 0.01375637948513031, + "step": 6430 + }, + { + "epoch": 0.9945486178233134, + "grad_norm": 6.460704326629639, + "learning_rate": 3.7137701913163023e-06, + "logits/chosen": 5.026034355163574, + "logits/rejected": 2.93806529045105, + "logps/chosen": -335.23638916015625, + "logps/rejected": -322.83624267578125, + "loss": 0.4922, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.8510804772377014, + "rewards/margins": 0.7431129217147827, + "rewards/rejected": 0.1079675704240799, + "step": 6431 + }, + { + "epoch": 0.9947032669630775, + "grad_norm": 3.4955148696899414, + "learning_rate": 3.7134837896666285e-06, + "logits/chosen": 17.210689544677734, + "logits/rejected": 10.415397644042969, + "logps/chosen": -398.5893859863281, + "logps/rejected": -245.32540893554688, + "loss": 0.4282, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.7730404138565063, + "rewards/margins": 0.7427493333816528, + "rewards/rejected": 0.030291080474853516, + "step": 6432 + }, + { + "epoch": 0.9948579161028417, + "grad_norm": 4.404941558837891, + "learning_rate": 3.713197388016955e-06, + "logits/chosen": 6.819891452789307, + "logits/rejected": 8.653105735778809, + "logps/chosen": -166.37310791015625, + "logps/rejected": -177.69534301757812, + "loss": 0.6709, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.11002130061388016, + "rewards/margins": 0.10400070250034332, + "rewards/rejected": 0.006020592525601387, + "step": 6433 + }, + { + "epoch": 0.9950125652426058, + "grad_norm": 4.815541744232178, + "learning_rate": 3.712910986367282e-06, + "logits/chosen": 11.207900047302246, + "logits/rejected": 11.828226089477539, + "logps/chosen": -241.86944580078125, + "logps/rejected": -332.5943908691406, + "loss": 0.5174, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.1279597282409668, + "rewards/margins": 0.5327659845352173, + "rewards/rejected": -0.4048061966896057, + "step": 6434 + }, + { + "epoch": 0.99516721438237, + "grad_norm": 4.53037691116333, + "learning_rate": 3.7126245847176085e-06, + "logits/chosen": 14.962818145751953, + "logits/rejected": 6.390637397766113, + "logps/chosen": -401.25030517578125, + "logps/rejected": -224.285888671875, + "loss": 0.5032, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.48805028200149536, + "rewards/margins": 0.5336525440216064, + "rewards/rejected": -0.0456022247672081, + "step": 6435 + }, + { + "epoch": 0.9953218635221341, + "grad_norm": 3.8327882289886475, + "learning_rate": 3.712338183067935e-06, + "logits/chosen": 12.019838333129883, + "logits/rejected": 8.912884712219238, + "logps/chosen": -171.60142517089844, + "logps/rejected": -163.62643432617188, + "loss": 0.6241, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.1824900507926941, + "rewards/margins": 0.2052711546421051, + "rewards/rejected": -0.022781088948249817, + "step": 6436 + }, + { + "epoch": 0.9954765126618983, + "grad_norm": 5.008584499359131, + "learning_rate": 3.712051781418261e-06, + "logits/chosen": 7.2660417556762695, + "logits/rejected": 9.870080947875977, + "logps/chosen": -280.6383056640625, + "logps/rejected": -295.5782165527344, + "loss": 0.504, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.40721285343170166, + "rewards/margins": 0.5243091583251953, + "rewards/rejected": -0.11709632724523544, + "step": 6437 + }, + { + "epoch": 0.9956311618016624, + "grad_norm": 4.102077484130859, + "learning_rate": 3.7117653797685876e-06, + "logits/chosen": 12.455120086669922, + "logits/rejected": 10.793878555297852, + "logps/chosen": -253.3858184814453, + "logps/rejected": -227.75735473632812, + "loss": 0.5701, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.19202375411987305, + "rewards/margins": 0.38915860652923584, + "rewards/rejected": -0.19713488221168518, + "step": 6438 + }, + { + "epoch": 0.9957858109414266, + "grad_norm": 6.331699371337891, + "learning_rate": 3.7114789781189143e-06, + "logits/chosen": 10.446334838867188, + "logits/rejected": 9.925003051757812, + "logps/chosen": -340.41937255859375, + "logps/rejected": -330.2668762207031, + "loss": 0.7328, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.3427719175815582, + "rewards/margins": -0.032489921897649765, + "rewards/rejected": 0.3752618134021759, + "step": 6439 + }, + { + "epoch": 0.9959404600811907, + "grad_norm": 10.979613304138184, + "learning_rate": 3.711192576469241e-06, + "logits/chosen": 6.557989120483398, + "logits/rejected": 13.282819747924805, + "logps/chosen": -223.29698181152344, + "logps/rejected": -329.47576904296875, + "loss": 0.9979, + "rewards/accuracies": 0.25, + "rewards/chosen": 0.015458628535270691, + "rewards/margins": -0.5006383061408997, + "rewards/rejected": 0.5160968899726868, + "step": 6440 + }, + { + "epoch": 0.996095109220955, + "grad_norm": 4.591195583343506, + "learning_rate": 3.7109061748195676e-06, + "logits/chosen": 9.61746883392334, + "logits/rejected": 6.124847412109375, + "logps/chosen": -323.44793701171875, + "logps/rejected": -257.9283447265625, + "loss": 0.6567, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.32818177342414856, + "rewards/margins": 0.32907921075820923, + "rewards/rejected": -0.0008974462980404496, + "step": 6441 + }, + { + "epoch": 0.9962497583607192, + "grad_norm": 5.878073692321777, + "learning_rate": 3.7106197731698938e-06, + "logits/chosen": 14.631660461425781, + "logits/rejected": 7.4148945808410645, + "logps/chosen": -274.1687316894531, + "logps/rejected": -194.74220275878906, + "loss": 0.6431, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.37658941745758057, + "rewards/margins": 0.3262397050857544, + "rewards/rejected": 0.050349727272987366, + "step": 6442 + }, + { + "epoch": 0.9964044075004833, + "grad_norm": 6.323184967041016, + "learning_rate": 3.71033337152022e-06, + "logits/chosen": 10.861124038696289, + "logits/rejected": 8.038949012756348, + "logps/chosen": -304.2377624511719, + "logps/rejected": -333.73284912109375, + "loss": 0.668, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.40864118933677673, + "rewards/margins": 0.26527321338653564, + "rewards/rejected": 0.1433679461479187, + "step": 6443 + }, + { + "epoch": 0.9965590566402475, + "grad_norm": 4.15866756439209, + "learning_rate": 3.7100469698705467e-06, + "logits/chosen": 8.23521614074707, + "logits/rejected": 8.099372863769531, + "logps/chosen": -296.7837829589844, + "logps/rejected": -335.0313720703125, + "loss": 0.4556, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.5893731713294983, + "rewards/margins": 0.7047889828681946, + "rewards/rejected": -0.11541580408811569, + "step": 6444 + }, + { + "epoch": 0.9967137057800116, + "grad_norm": 5.668601036071777, + "learning_rate": 3.7097605682208733e-06, + "logits/chosen": 11.344316482543945, + "logits/rejected": 10.135232925415039, + "logps/chosen": -314.1343994140625, + "logps/rejected": -243.55874633789062, + "loss": 0.8067, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.1590518057346344, + "rewards/margins": -0.08058004081249237, + "rewards/rejected": 0.23963184654712677, + "step": 6445 + }, + { + "epoch": 0.9968683549197758, + "grad_norm": 3.9897096157073975, + "learning_rate": 3.7094741665711995e-06, + "logits/chosen": 8.347829818725586, + "logits/rejected": 2.55678391456604, + "logps/chosen": -323.57257080078125, + "logps/rejected": -188.37205505371094, + "loss": 0.5162, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.32635608315467834, + "rewards/margins": 0.5472850799560547, + "rewards/rejected": -0.22092899680137634, + "step": 6446 + }, + { + "epoch": 0.9970230040595399, + "grad_norm": 5.015946388244629, + "learning_rate": 3.709187764921526e-06, + "logits/chosen": 12.791887283325195, + "logits/rejected": 11.704107284545898, + "logps/chosen": -274.552001953125, + "logps/rejected": -276.5045166015625, + "loss": 0.6409, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.04246644675731659, + "rewards/margins": 0.2217501848936081, + "rewards/rejected": -0.17928370833396912, + "step": 6447 + }, + { + "epoch": 0.9971776531993041, + "grad_norm": 4.478327751159668, + "learning_rate": 3.708901363271853e-06, + "logits/chosen": 9.696907997131348, + "logits/rejected": 9.220049858093262, + "logps/chosen": -258.6571350097656, + "logps/rejected": -249.5697784423828, + "loss": 0.5514, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.526727557182312, + "rewards/margins": 0.5402780175209045, + "rewards/rejected": -0.01355036348104477, + "step": 6448 + }, + { + "epoch": 0.9973323023390682, + "grad_norm": 4.411247730255127, + "learning_rate": 3.7086149616221795e-06, + "logits/chosen": 8.781852722167969, + "logits/rejected": 1.7044378519058228, + "logps/chosen": -294.9023742675781, + "logps/rejected": -150.32318115234375, + "loss": 0.5621, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.2328437864780426, + "rewards/margins": 0.5416602492332458, + "rewards/rejected": -0.30881643295288086, + "step": 6449 + }, + { + "epoch": 0.9974869514788324, + "grad_norm": 5.800382137298584, + "learning_rate": 3.7083285599725057e-06, + "logits/chosen": 6.159079074859619, + "logits/rejected": 6.400328636169434, + "logps/chosen": -259.21734619140625, + "logps/rejected": -194.0346221923828, + "loss": 0.6565, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.392267107963562, + "rewards/margins": 0.1629045605659485, + "rewards/rejected": 0.22936254739761353, + "step": 6450 + }, + { + "epoch": 0.9976416006185965, + "grad_norm": 5.962613582611084, + "learning_rate": 3.708042158322832e-06, + "logits/chosen": 7.412317276000977, + "logits/rejected": 8.949893951416016, + "logps/chosen": -263.3511047363281, + "logps/rejected": -292.3740234375, + "loss": 0.7336, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.3386686444282532, + "rewards/margins": 0.12247180938720703, + "rewards/rejected": 0.21619683504104614, + "step": 6451 + }, + { + "epoch": 0.9977962497583607, + "grad_norm": 4.726262092590332, + "learning_rate": 3.7077557566731586e-06, + "logits/chosen": 16.00439453125, + "logits/rejected": 9.305023193359375, + "logps/chosen": -332.0745849609375, + "logps/rejected": -252.96035766601562, + "loss": 0.5894, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.39330828189849854, + "rewards/margins": 0.32374680042266846, + "rewards/rejected": 0.06956146657466888, + "step": 6452 + }, + { + "epoch": 0.9979508988981248, + "grad_norm": 5.195104122161865, + "learning_rate": 3.7074693550234852e-06, + "logits/chosen": 6.627902984619141, + "logits/rejected": 5.248289108276367, + "logps/chosen": -255.76513671875, + "logps/rejected": -230.42604064941406, + "loss": 0.5402, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.20039893686771393, + "rewards/margins": 0.45336925983428955, + "rewards/rejected": -0.2529703676700592, + "step": 6453 + }, + { + "epoch": 0.9981055480378891, + "grad_norm": 6.634561061859131, + "learning_rate": 3.707182953373812e-06, + "logits/chosen": 10.792223930358887, + "logits/rejected": 13.415390014648438, + "logps/chosen": -272.1614990234375, + "logps/rejected": -303.35064697265625, + "loss": 0.714, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.37170207500457764, + "rewards/margins": 0.03073558583855629, + "rewards/rejected": 0.34096652269363403, + "step": 6454 + }, + { + "epoch": 0.9982601971776532, + "grad_norm": 7.098182201385498, + "learning_rate": 3.7068965517241385e-06, + "logits/chosen": 16.01780128479004, + "logits/rejected": 5.407187461853027, + "logps/chosen": -504.06689453125, + "logps/rejected": -242.10687255859375, + "loss": 0.5645, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.5858308672904968, + "rewards/margins": 0.41286909580230713, + "rewards/rejected": 0.17296181619167328, + "step": 6455 + }, + { + "epoch": 0.9984148463174174, + "grad_norm": 3.831732988357544, + "learning_rate": 3.706610150074465e-06, + "logits/chosen": 11.377111434936523, + "logits/rejected": 6.26080322265625, + "logps/chosen": -276.1357727050781, + "logps/rejected": -206.07383728027344, + "loss": 0.4207, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.5700348615646362, + "rewards/margins": 1.223379373550415, + "rewards/rejected": -0.6533443331718445, + "step": 6456 + }, + { + "epoch": 0.9985694954571815, + "grad_norm": 4.787290573120117, + "learning_rate": 3.706323748424791e-06, + "logits/chosen": 16.607322692871094, + "logits/rejected": 7.273774147033691, + "logps/chosen": -397.1927490234375, + "logps/rejected": -220.79031372070312, + "loss": 0.5305, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.7793780565261841, + "rewards/margins": 0.6378126740455627, + "rewards/rejected": 0.14156541228294373, + "step": 6457 + }, + { + "epoch": 0.9987241445969457, + "grad_norm": 5.1946282386779785, + "learning_rate": 3.7060373467751176e-06, + "logits/chosen": 11.702022552490234, + "logits/rejected": 10.066984176635742, + "logps/chosen": -258.2348937988281, + "logps/rejected": -200.14053344726562, + "loss": 0.6547, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.04380466043949127, + "rewards/margins": 0.16082774102687836, + "rewards/rejected": -0.20463237166404724, + "step": 6458 + }, + { + "epoch": 0.9988787937367098, + "grad_norm": 5.404623508453369, + "learning_rate": 3.7057509451254443e-06, + "logits/chosen": 13.3839693069458, + "logits/rejected": 5.842410087585449, + "logps/chosen": -310.8735046386719, + "logps/rejected": -230.002685546875, + "loss": 0.6934, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.07032537460327148, + "rewards/margins": 0.07508254051208496, + "rewards/rejected": -0.14540790021419525, + "step": 6459 + }, + { + "epoch": 0.999033442876474, + "grad_norm": 6.139119625091553, + "learning_rate": 3.705464543475771e-06, + "logits/chosen": 13.555074691772461, + "logits/rejected": 5.720902919769287, + "logps/chosen": -255.6298065185547, + "logps/rejected": -132.44500732421875, + "loss": 0.6642, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.1491905152797699, + "rewards/margins": 0.14715290069580078, + "rewards/rejected": 0.002037620171904564, + "step": 6460 + }, + { + "epoch": 0.9991880920162381, + "grad_norm": 8.389126777648926, + "learning_rate": 3.705178141826097e-06, + "logits/chosen": 11.02554702758789, + "logits/rejected": 14.100635528564453, + "logps/chosen": -392.8874816894531, + "logps/rejected": -400.7939453125, + "loss": 0.7983, + "rewards/accuracies": 0.25, + "rewards/chosen": 0.5057023763656616, + "rewards/margins": -0.11814691126346588, + "rewards/rejected": 0.6238492727279663, + "step": 6461 + }, + { + "epoch": 0.9993427411560023, + "grad_norm": 6.983454704284668, + "learning_rate": 3.704891740176424e-06, + "logits/chosen": 14.922819137573242, + "logits/rejected": 10.502524375915527, + "logps/chosen": -318.099365234375, + "logps/rejected": -246.74215698242188, + "loss": 0.8399, + "rewards/accuracies": 0.25, + "rewards/chosen": 0.03999558091163635, + "rewards/margins": -0.18530333042144775, + "rewards/rejected": 0.2252988964319229, + "step": 6462 + }, + { + "epoch": 0.9994973902957665, + "grad_norm": 7.0249810218811035, + "learning_rate": 3.70460533852675e-06, + "logits/chosen": 6.691718578338623, + "logits/rejected": 8.100693702697754, + "logps/chosen": -226.1967010498047, + "logps/rejected": -215.714599609375, + "loss": 0.9998, + "rewards/accuracies": 0.25, + "rewards/chosen": -0.4663422405719757, + "rewards/margins": -0.38877493143081665, + "rewards/rejected": -0.07756730169057846, + "step": 6463 + }, + { + "epoch": 0.9996520394355306, + "grad_norm": 3.797253131866455, + "learning_rate": 3.7043189368770767e-06, + "logits/chosen": 7.48823356628418, + "logits/rejected": 7.208713531494141, + "logps/chosen": -194.60357666015625, + "logps/rejected": -145.7886962890625, + "loss": 0.7152, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.04407081753015518, + "rewards/margins": 0.002133004367351532, + "rewards/rejected": 0.04193783923983574, + "step": 6464 + }, + { + "epoch": 0.9998066885752948, + "grad_norm": 4.286912441253662, + "learning_rate": 3.704032535227403e-06, + "logits/chosen": 8.498443603515625, + "logits/rejected": 3.455369710922241, + "logps/chosen": -286.6949462890625, + "logps/rejected": -241.1104736328125, + "loss": 0.507, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.5331889986991882, + "rewards/margins": 0.5734596252441406, + "rewards/rejected": -0.04027062654495239, + "step": 6465 + }, + { + "epoch": 0.9999613377150589, + "grad_norm": 5.675995826721191, + "learning_rate": 3.7037461335777296e-06, + "logits/chosen": -1.0441932678222656, + "logits/rejected": 8.65644359588623, + "logps/chosen": -132.24937438964844, + "logps/rejected": -179.43557739257812, + "loss": 0.7638, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.33726200461387634, + "rewards/margins": 0.05548731982707977, + "rewards/rejected": -0.3927493393421173, + "step": 6466 + }, + { + "epoch": 1.000115986854823, + "grad_norm": 4.397193431854248, + "learning_rate": 3.7034597319280562e-06, + "logits/chosen": 9.970890045166016, + "logits/rejected": 11.026260375976562, + "logps/chosen": -324.40606689453125, + "logps/rejected": -262.9850158691406, + "loss": 0.4699, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.6350075602531433, + "rewards/margins": 0.6038157343864441, + "rewards/rejected": 0.031191788613796234, + "step": 6467 + }, + { + "epoch": 1.0002706359945872, + "grad_norm": 6.071857929229736, + "learning_rate": 3.703173330278383e-06, + "logits/chosen": 8.23035717010498, + "logits/rejected": 14.393836975097656, + "logps/chosen": -337.9178466796875, + "logps/rejected": -361.6004638671875, + "loss": 0.8434, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.37275755405426025, + "rewards/margins": 0.04437033832073212, + "rewards/rejected": 0.32838720083236694, + "step": 6468 + }, + { + "epoch": 1.0004252851343514, + "grad_norm": 4.508437633514404, + "learning_rate": 3.7028869286287095e-06, + "logits/chosen": 17.907562255859375, + "logits/rejected": 15.179469108581543, + "logps/chosen": -322.15289306640625, + "logps/rejected": -252.47393798828125, + "loss": 0.5076, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.22953186929225922, + "rewards/margins": 0.56075519323349, + "rewards/rejected": -0.33122333884239197, + "step": 6469 + }, + { + "epoch": 1.0005799342741155, + "grad_norm": 4.127502918243408, + "learning_rate": 3.7026005269790353e-06, + "logits/chosen": 13.710186004638672, + "logits/rejected": 13.976655960083008, + "logps/chosen": -283.73956298828125, + "logps/rejected": -212.97613525390625, + "loss": 0.5532, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.4209112226963043, + "rewards/margins": 0.35665363073349, + "rewards/rejected": 0.06425757706165314, + "step": 6470 + }, + { + "epoch": 1.0007345834138797, + "grad_norm": 4.5542893409729, + "learning_rate": 3.702314125329362e-06, + "logits/chosen": 4.959611892700195, + "logits/rejected": 0.7296081781387329, + "logps/chosen": -257.67987060546875, + "logps/rejected": -291.7732238769531, + "loss": 0.5917, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.06830978393554688, + "rewards/margins": 0.38829559087753296, + "rewards/rejected": -0.3199857771396637, + "step": 6471 + }, + { + "epoch": 1.0008892325536438, + "grad_norm": 4.960212707519531, + "learning_rate": 3.7020277236796886e-06, + "logits/chosen": 8.817639350891113, + "logits/rejected": 11.630827903747559, + "logps/chosen": -372.274169921875, + "logps/rejected": -452.58489990234375, + "loss": 0.5445, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.4633787274360657, + "rewards/margins": 0.37530338764190674, + "rewards/rejected": 0.08807535469532013, + "step": 6472 + }, + { + "epoch": 1.0010438816934082, + "grad_norm": 4.946148872375488, + "learning_rate": 3.7017413220300153e-06, + "logits/chosen": 10.475448608398438, + "logits/rejected": 10.050158500671387, + "logps/chosen": -321.27252197265625, + "logps/rejected": -258.4363708496094, + "loss": 0.5804, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.5828481912612915, + "rewards/margins": 0.3572867512702942, + "rewards/rejected": 0.22556143999099731, + "step": 6473 + }, + { + "epoch": 1.0011985308331723, + "grad_norm": 6.328547954559326, + "learning_rate": 3.701454920380342e-06, + "logits/chosen": 11.512105941772461, + "logits/rejected": 11.806161880493164, + "logps/chosen": -430.54644775390625, + "logps/rejected": -394.33184814453125, + "loss": 0.6867, + "rewards/accuracies": 0.25, + "rewards/chosen": 0.549159824848175, + "rewards/margins": 0.20954646170139313, + "rewards/rejected": 0.3396133482456207, + "step": 6474 + }, + { + "epoch": 1.0013531799729365, + "grad_norm": 4.63679313659668, + "learning_rate": 3.7011685187306686e-06, + "logits/chosen": 13.474045753479004, + "logits/rejected": 7.575530529022217, + "logps/chosen": -397.6506042480469, + "logps/rejected": -262.03228759765625, + "loss": 0.5233, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.7780610918998718, + "rewards/margins": 0.461338073015213, + "rewards/rejected": 0.3167230486869812, + "step": 6475 + }, + { + "epoch": 1.0015078291127006, + "grad_norm": 4.678894519805908, + "learning_rate": 3.7008821170809944e-06, + "logits/chosen": 12.240632057189941, + "logits/rejected": 10.878813743591309, + "logps/chosen": -238.29595947265625, + "logps/rejected": -245.44129943847656, + "loss": 0.5433, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.2810564935207367, + "rewards/margins": 0.4207093119621277, + "rewards/rejected": -0.139652818441391, + "step": 6476 + }, + { + "epoch": 1.0016624782524648, + "grad_norm": 6.588015556335449, + "learning_rate": 3.700595715431321e-06, + "logits/chosen": 13.789307594299316, + "logits/rejected": 11.070528030395508, + "logps/chosen": -438.00311279296875, + "logps/rejected": -324.98626708984375, + "loss": 0.8311, + "rewards/accuracies": 0.25, + "rewards/chosen": 0.2357904613018036, + "rewards/margins": -0.24124449491500854, + "rewards/rejected": 0.47703495621681213, + "step": 6477 + }, + { + "epoch": 1.001817127392229, + "grad_norm": 6.812483310699463, + "learning_rate": 3.7003093137816477e-06, + "logits/chosen": 7.506626605987549, + "logits/rejected": 12.467564582824707, + "logps/chosen": -218.64212036132812, + "logps/rejected": -279.65966796875, + "loss": 0.6789, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.14739733934402466, + "rewards/margins": 0.07972268760204315, + "rewards/rejected": 0.06767462939023972, + "step": 6478 + }, + { + "epoch": 1.001971776531993, + "grad_norm": 5.825155735015869, + "learning_rate": 3.7000229121319743e-06, + "logits/chosen": 6.795217037200928, + "logits/rejected": 6.0769453048706055, + "logps/chosen": -345.43841552734375, + "logps/rejected": -275.205322265625, + "loss": 0.6548, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.15224283933639526, + "rewards/margins": 0.16189280152320862, + "rewards/rejected": -0.00964994728565216, + "step": 6479 + }, + { + "epoch": 1.0021264256717572, + "grad_norm": 7.262514114379883, + "learning_rate": 3.6997365104823006e-06, + "logits/chosen": 8.445148468017578, + "logits/rejected": 8.914729118347168, + "logps/chosen": -329.12689208984375, + "logps/rejected": -333.1185302734375, + "loss": 0.8121, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.598840594291687, + "rewards/margins": 0.02515360713005066, + "rewards/rejected": 0.5736870169639587, + "step": 6480 + }, + { + "epoch": 1.0022810748115214, + "grad_norm": 5.612082481384277, + "learning_rate": 3.6994501088326272e-06, + "logits/chosen": 7.604832172393799, + "logits/rejected": 9.593935012817383, + "logps/chosen": -380.4143981933594, + "logps/rejected": -401.48046875, + "loss": 0.58, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.8835973739624023, + "rewards/margins": 0.3783763647079468, + "rewards/rejected": 0.505220890045166, + "step": 6481 + }, + { + "epoch": 1.0024357239512856, + "grad_norm": 4.28853178024292, + "learning_rate": 3.699163707182954e-06, + "logits/chosen": 14.142362594604492, + "logits/rejected": 6.589823246002197, + "logps/chosen": -319.54144287109375, + "logps/rejected": -227.5568084716797, + "loss": 0.5133, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.35826224088668823, + "rewards/margins": 0.6096003651618958, + "rewards/rejected": -0.25133809447288513, + "step": 6482 + }, + { + "epoch": 1.0025903730910497, + "grad_norm": 3.6368038654327393, + "learning_rate": 3.69887730553328e-06, + "logits/chosen": 5.254798889160156, + "logits/rejected": 9.603643417358398, + "logps/chosen": -226.14010620117188, + "logps/rejected": -282.77825927734375, + "loss": 0.478, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.31996646523475647, + "rewards/margins": 0.5870798826217651, + "rewards/rejected": -0.2671133875846863, + "step": 6483 + }, + { + "epoch": 1.0027450222308139, + "grad_norm": 5.984035968780518, + "learning_rate": 3.6985909038836063e-06, + "logits/chosen": 8.93764591217041, + "logits/rejected": 5.029354095458984, + "logps/chosen": -266.23577880859375, + "logps/rejected": -251.21987915039062, + "loss": 0.6095, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.26544857025146484, + "rewards/margins": 0.28799566626548767, + "rewards/rejected": -0.02254711091518402, + "step": 6484 + }, + { + "epoch": 1.002899671370578, + "grad_norm": 4.384690284729004, + "learning_rate": 3.698304502233933e-06, + "logits/chosen": 10.72787094116211, + "logits/rejected": 14.027435302734375, + "logps/chosen": -176.72085571289062, + "logps/rejected": -270.55035400390625, + "loss": 0.5231, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.0743076354265213, + "rewards/margins": 0.468065470457077, + "rewards/rejected": -0.5423730611801147, + "step": 6485 + }, + { + "epoch": 1.0030543205103422, + "grad_norm": 3.422541379928589, + "learning_rate": 3.6980181005842596e-06, + "logits/chosen": 11.922115325927734, + "logits/rejected": 8.016584396362305, + "logps/chosen": -183.350830078125, + "logps/rejected": -150.6525115966797, + "loss": 0.4901, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.3783113956451416, + "rewards/margins": 0.588878870010376, + "rewards/rejected": -0.21056750416755676, + "step": 6486 + }, + { + "epoch": 1.0032089696501063, + "grad_norm": 4.68432092666626, + "learning_rate": 3.6977316989345863e-06, + "logits/chosen": 10.02033805847168, + "logits/rejected": 5.156567573547363, + "logps/chosen": -203.0462188720703, + "logps/rejected": -165.84323120117188, + "loss": 0.5739, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.10937246680259705, + "rewards/margins": 0.40428417921066284, + "rewards/rejected": -0.2949116826057434, + "step": 6487 + }, + { + "epoch": 1.0033636187898705, + "grad_norm": 5.173445701599121, + "learning_rate": 3.697445297284913e-06, + "logits/chosen": 11.687442779541016, + "logits/rejected": 5.338320732116699, + "logps/chosen": -287.1285400390625, + "logps/rejected": -190.30963134765625, + "loss": 0.6449, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.3509214520454407, + "rewards/margins": 0.16770482063293457, + "rewards/rejected": 0.1832166314125061, + "step": 6488 + }, + { + "epoch": 1.0035182679296346, + "grad_norm": 5.891007900238037, + "learning_rate": 3.6971588956352387e-06, + "logits/chosen": 6.7640204429626465, + "logits/rejected": 3.843388080596924, + "logps/chosen": -215.14576721191406, + "logps/rejected": -199.46826171875, + "loss": 0.6882, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.33893540501594543, + "rewards/margins": 0.14164628088474274, + "rewards/rejected": 0.1972891390323639, + "step": 6489 + }, + { + "epoch": 1.0036729170693988, + "grad_norm": 5.027500629425049, + "learning_rate": 3.6968724939855654e-06, + "logits/chosen": 10.829145431518555, + "logits/rejected": 11.652154922485352, + "logps/chosen": -218.57725524902344, + "logps/rejected": -252.7308349609375, + "loss": 0.6982, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.1749027669429779, + "rewards/margins": 0.012367707677185535, + "rewards/rejected": 0.16253504157066345, + "step": 6490 + }, + { + "epoch": 1.003827566209163, + "grad_norm": 7.334293365478516, + "learning_rate": 3.696586092335892e-06, + "logits/chosen": 12.625438690185547, + "logits/rejected": 7.769203186035156, + "logps/chosen": -351.1131591796875, + "logps/rejected": -291.02716064453125, + "loss": 0.8311, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.007216166704893112, + "rewards/margins": -0.17327935993671417, + "rewards/rejected": 0.16606321930885315, + "step": 6491 + }, + { + "epoch": 1.003982215348927, + "grad_norm": 4.9731974601745605, + "learning_rate": 3.6962996906862187e-06, + "logits/chosen": 7.9905500411987305, + "logits/rejected": 8.718603134155273, + "logps/chosen": -169.21807861328125, + "logps/rejected": -189.5457763671875, + "loss": 0.5797, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.3706709146499634, + "rewards/margins": 0.425445556640625, + "rewards/rejected": -0.05477464199066162, + "step": 6492 + }, + { + "epoch": 1.0041368644886912, + "grad_norm": 6.196478366851807, + "learning_rate": 3.6960132890365453e-06, + "logits/chosen": 10.227004051208496, + "logits/rejected": 3.1333799362182617, + "logps/chosen": -393.52923583984375, + "logps/rejected": -251.47586059570312, + "loss": 0.6181, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.34924450516700745, + "rewards/margins": 0.3547116219997406, + "rewards/rejected": -0.005467124283313751, + "step": 6493 + }, + { + "epoch": 1.0042915136284554, + "grad_norm": 4.1911773681640625, + "learning_rate": 3.695726887386872e-06, + "logits/chosen": 11.043564796447754, + "logits/rejected": 9.156267166137695, + "logps/chosen": -256.17724609375, + "logps/rejected": -248.79600524902344, + "loss": 0.623, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.3548438549041748, + "rewards/margins": 0.2840927541255951, + "rewards/rejected": 0.0707511305809021, + "step": 6494 + }, + { + "epoch": 1.0044461627682195, + "grad_norm": 3.3736093044281006, + "learning_rate": 3.695440485737198e-06, + "logits/chosen": 15.362344741821289, + "logits/rejected": 13.852359771728516, + "logps/chosen": -192.2646942138672, + "logps/rejected": -168.83152770996094, + "loss": 0.5499, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.18855062127113342, + "rewards/margins": 0.3880029320716858, + "rewards/rejected": -0.19945232570171356, + "step": 6495 + }, + { + "epoch": 1.0046008119079837, + "grad_norm": 6.9795241355896, + "learning_rate": 3.6951540840875244e-06, + "logits/chosen": 3.5728375911712646, + "logits/rejected": 7.5879950523376465, + "logps/chosen": -208.23135375976562, + "logps/rejected": -328.0044250488281, + "loss": 1.0147, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.3019470274448395, + "rewards/margins": -0.3014880120754242, + "rewards/rejected": 0.6034350395202637, + "step": 6496 + }, + { + "epoch": 1.0047554610477478, + "grad_norm": 3.7983055114746094, + "learning_rate": 3.694867682437851e-06, + "logits/chosen": 10.055536270141602, + "logits/rejected": 11.805242538452148, + "logps/chosen": -255.73080444335938, + "logps/rejected": -238.3896942138672, + "loss": 0.5697, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.2134089171886444, + "rewards/margins": 0.39890730381011963, + "rewards/rejected": -0.18549838662147522, + "step": 6497 + }, + { + "epoch": 1.004910110187512, + "grad_norm": 4.407406330108643, + "learning_rate": 3.6945812807881777e-06, + "logits/chosen": 14.96390438079834, + "logits/rejected": 10.439229965209961, + "logps/chosen": -223.6092529296875, + "logps/rejected": -179.12969970703125, + "loss": 0.6297, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.13989132642745972, + "rewards/margins": 0.32183021306991577, + "rewards/rejected": -0.4617215692996979, + "step": 6498 + }, + { + "epoch": 1.0050647593272763, + "grad_norm": 4.386175155639648, + "learning_rate": 3.694294879138504e-06, + "logits/chosen": 5.19125509262085, + "logits/rejected": 7.769558429718018, + "logps/chosen": -168.78756713867188, + "logps/rejected": -208.15203857421875, + "loss": 0.716, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.22360999882221222, + "rewards/margins": 0.043396957218647, + "rewards/rejected": -0.2670069634914398, + "step": 6499 + }, + { + "epoch": 1.0052194084670405, + "grad_norm": 3.7246615886688232, + "learning_rate": 3.6940084774888306e-06, + "logits/chosen": 12.95360279083252, + "logits/rejected": 11.05101203918457, + "logps/chosen": -186.3341064453125, + "logps/rejected": -143.59999084472656, + "loss": 0.5254, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.4003264605998993, + "rewards/margins": 0.4193662703037262, + "rewards/rejected": -0.0190398208796978, + "step": 6500 + } + ], + "logging_steps": 1, + "max_steps": 19398, + "num_input_tokens_seen": 0, + "num_train_epochs": 3, + "save_steps": 500, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 0.0, + "train_batch_size": 2, + "trial_name": null, + "trial_params": null +}