diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,8323 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 1.0, + "eval_steps": 500, + "global_step": 593, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0, + "learning_rate": 4.201680672268907e-09, + "logits/chosen": -0.6788080930709839, + "logits/rejected": -1.1750900745391846, + "logps/chosen": -702.8984985351562, + "logps/rejected": -239.67630004882812, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 1 + }, + { + "epoch": 0.0, + "learning_rate": 8.403361344537815e-09, + "logits/chosen": -1.6158480644226074, + "logits/rejected": -1.2959809303283691, + "logps/chosen": -112.90769958496094, + "logps/rejected": -81.65785217285156, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 2 + }, + { + "epoch": 0.01, + "learning_rate": 1.2605042016806723e-08, + "logits/chosen": -2.375753879547119, + "logits/rejected": -2.5303637981414795, + "logps/chosen": -105.81280517578125, + "logps/rejected": -131.5235595703125, + "loss": 0.6943, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.023235511034727097, + "rewards/margins": -0.002191734267398715, + "rewards/rejected": 0.02542724646627903, + "step": 3 + }, + { + "epoch": 0.01, + "learning_rate": 1.680672268907563e-08, + "logits/chosen": -1.907819151878357, + "logits/rejected": -1.9828282594680786, + "logps/chosen": -243.6266326904297, + "logps/rejected": -293.4872741699219, + "loss": 0.6958, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.013934326358139515, + "rewards/margins": -0.03516464680433273, + "rewards/rejected": 0.049098968505859375, + "step": 4 + }, + { + "epoch": 0.01, + "learning_rate": 2.1008403361344538e-08, + "logits/chosen": -1.5391994714736938, + "logits/rejected": -1.6013704538345337, + "logps/chosen": -514.83447265625, + "logps/rejected": -273.52606201171875, + "loss": 0.6919, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.04896850883960724, + "rewards/margins": 0.09175796806812286, + "rewards/rejected": -0.042789459228515625, + "step": 5 + }, + { + "epoch": 0.01, + "learning_rate": 2.5210084033613446e-08, + "logits/chosen": -2.251502513885498, + "logits/rejected": -1.4788130521774292, + "logps/chosen": -194.65187072753906, + "logps/rejected": -230.2232666015625, + "loss": 0.6948, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.06929359585046768, + "rewards/margins": 0.0685802549123764, + "rewards/rejected": 0.000713348388671875, + "step": 6 + }, + { + "epoch": 0.01, + "learning_rate": 2.941176470588235e-08, + "logits/chosen": -1.6795597076416016, + "logits/rejected": -1.6621124744415283, + "logps/chosen": -188.00582885742188, + "logps/rejected": -178.40765380859375, + "loss": 0.6955, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.004383087158203125, + "rewards/margins": -0.015031430870294571, + "rewards/rejected": 0.019414519891142845, + "step": 7 + }, + { + "epoch": 0.01, + "learning_rate": 3.361344537815126e-08, + "logits/chosen": -1.2061922550201416, + "logits/rejected": -1.4656660556793213, + "logps/chosen": -493.43206787109375, + "logps/rejected": -74.92171478271484, + "loss": 0.6919, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.07655182480812073, + "rewards/margins": 0.10538730025291443, + "rewards/rejected": -0.028835486620664597, + "step": 8 + }, + { + "epoch": 0.02, + "learning_rate": 3.7815126050420164e-08, + "logits/chosen": -1.5676227807998657, + "logits/rejected": -1.5455267429351807, + "logps/chosen": -228.5581817626953, + "logps/rejected": -194.3417510986328, + "loss": 0.6953, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.03210144117474556, + "rewards/margins": -0.005328751169145107, + "rewards/rejected": 0.03743019327521324, + "step": 9 + }, + { + "epoch": 0.02, + "learning_rate": 4.2016806722689076e-08, + "logits/chosen": -1.2673882246017456, + "logits/rejected": -1.175107717514038, + "logps/chosen": -226.69273376464844, + "logps/rejected": -170.93002319335938, + "loss": 0.6932, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.05511780083179474, + "rewards/margins": 0.0763774886727333, + "rewards/rejected": -0.021259689703583717, + "step": 10 + }, + { + "epoch": 0.02, + "learning_rate": 4.621848739495798e-08, + "logits/chosen": -1.7369565963745117, + "logits/rejected": -2.0291335582733154, + "logps/chosen": -134.85565185546875, + "logps/rejected": -61.743980407714844, + "loss": 0.6925, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.0022448543459177017, + "rewards/margins": 0.011675357818603516, + "rewards/rejected": -0.013920212164521217, + "step": 11 + }, + { + "epoch": 0.02, + "learning_rate": 5.042016806722689e-08, + "logits/chosen": -1.3196473121643066, + "logits/rejected": -1.325734257698059, + "logps/chosen": -80.28683471679688, + "logps/rejected": -79.56066131591797, + "loss": 0.6918, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.006443023681640625, + "rewards/margins": 0.021741105243563652, + "rewards/rejected": -0.015298080630600452, + "step": 12 + }, + { + "epoch": 0.02, + "learning_rate": 5.46218487394958e-08, + "logits/chosen": -1.5383967161178589, + "logits/rejected": -1.4319273233413696, + "logps/chosen": -71.45745086669922, + "logps/rejected": -93.32796478271484, + "loss": 0.6971, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.006036281585693359, + "rewards/margins": 0.015392017550766468, + "rewards/rejected": -0.021428298205137253, + "step": 13 + }, + { + "epoch": 0.02, + "learning_rate": 5.88235294117647e-08, + "logits/chosen": -1.968656301498413, + "logits/rejected": -1.845158338546753, + "logps/chosen": -168.4257049560547, + "logps/rejected": -300.03240966796875, + "loss": 0.6904, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.027264786884188652, + "rewards/margins": -0.05615234375, + "rewards/rejected": 0.028887558728456497, + "step": 14 + }, + { + "epoch": 0.03, + "learning_rate": 6.302521008403361e-08, + "logits/chosen": -1.1591368913650513, + "logits/rejected": -1.4170737266540527, + "logps/chosen": -538.101806640625, + "logps/rejected": -236.76358032226562, + "loss": 0.6928, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.03155364841222763, + "rewards/margins": 0.02361450158059597, + "rewards/rejected": 0.00793914869427681, + "step": 15 + }, + { + "epoch": 0.03, + "learning_rate": 6.722689075630252e-08, + "logits/chosen": -1.7213101387023926, + "logits/rejected": -1.8231241703033447, + "logps/chosen": -196.15289306640625, + "logps/rejected": -119.35342407226562, + "loss": 0.6931, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.029254913330078125, + "rewards/margins": 0.023168563842773438, + "rewards/rejected": 0.0060863494873046875, + "step": 16 + }, + { + "epoch": 0.03, + "learning_rate": 7.142857142857142e-08, + "logits/chosen": -1.7846002578735352, + "logits/rejected": -2.3181114196777344, + "logps/chosen": -273.34564208984375, + "logps/rejected": -146.905029296875, + "loss": 0.6954, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.03240509331226349, + "rewards/margins": 0.0660804733633995, + "rewards/rejected": -0.033675383776426315, + "step": 17 + }, + { + "epoch": 0.03, + "learning_rate": 7.563025210084033e-08, + "logits/chosen": -2.3839895725250244, + "logits/rejected": -1.8420289754867554, + "logps/chosen": -47.616455078125, + "logps/rejected": -177.2080841064453, + "loss": 0.6935, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.0052467347122728825, + "rewards/margins": 0.012156296521425247, + "rewards/rejected": -0.006909562274813652, + "step": 18 + }, + { + "epoch": 0.03, + "learning_rate": 7.983193277310923e-08, + "logits/chosen": -2.3161439895629883, + "logits/rejected": -1.8462892770767212, + "logps/chosen": -96.58424377441406, + "logps/rejected": -209.37664794921875, + "loss": 0.6883, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.0225248821079731, + "rewards/margins": 0.016490697860717773, + "rewards/rejected": -0.03901557996869087, + "step": 19 + }, + { + "epoch": 0.03, + "learning_rate": 8.403361344537815e-08, + "logits/chosen": -2.2219033241271973, + "logits/rejected": -2.0519139766693115, + "logps/chosen": -346.8481750488281, + "logps/rejected": -1364.489990234375, + "loss": 0.6862, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.08504104614257812, + "rewards/margins": 0.16014480590820312, + "rewards/rejected": -0.075103759765625, + "step": 20 + }, + { + "epoch": 0.04, + "learning_rate": 8.823529411764706e-08, + "logits/chosen": -1.4293802976608276, + "logits/rejected": -1.661201000213623, + "logps/chosen": -307.6474609375, + "logps/rejected": -215.94967651367188, + "loss": 0.6908, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.004551697056740522, + "rewards/margins": 0.012935257516801357, + "rewards/rejected": -0.008383559994399548, + "step": 21 + }, + { + "epoch": 0.04, + "learning_rate": 9.243697478991596e-08, + "logits/chosen": -1.8076047897338867, + "logits/rejected": -1.5782675743103027, + "logps/chosen": -179.2224884033203, + "logps/rejected": -232.96527099609375, + "loss": 0.6916, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.04394521564245224, + "rewards/margins": 0.08951330184936523, + "rewards/rejected": -0.045568086206912994, + "step": 22 + }, + { + "epoch": 0.04, + "learning_rate": 9.663865546218488e-08, + "logits/chosen": -1.7942882776260376, + "logits/rejected": -1.0943225622177124, + "logps/chosen": -55.99930191040039, + "logps/rejected": -140.6543426513672, + "loss": 0.6901, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.023340702056884766, + "rewards/margins": 0.0005490314215421677, + "rewards/rejected": 0.022791672497987747, + "step": 23 + }, + { + "epoch": 0.04, + "learning_rate": 1.0084033613445378e-07, + "logits/chosen": -0.837689995765686, + "logits/rejected": -1.8798249959945679, + "logps/chosen": -213.58486938476562, + "logps/rejected": -28.56793785095215, + "loss": 0.6852, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.026500703766942024, + "rewards/margins": 0.03877449035644531, + "rewards/rejected": -0.012273788452148438, + "step": 24 + }, + { + "epoch": 0.04, + "learning_rate": 1.0504201680672269e-07, + "logits/chosen": -2.5890495777130127, + "logits/rejected": -1.7141728401184082, + "logps/chosen": -12.43747329711914, + "logps/rejected": -147.9033203125, + "loss": 0.6846, + "rewards/accuracies": 0.0, + "rewards/chosen": -0.0024075033143162727, + "rewards/margins": -0.05119595676660538, + "rewards/rejected": 0.04878845438361168, + "step": 25 + }, + { + "epoch": 0.04, + "learning_rate": 1.092436974789916e-07, + "logits/chosen": -2.9379727840423584, + "logits/rejected": -1.3671715259552002, + "logps/chosen": -203.9562530517578, + "logps/rejected": -130.590576171875, + "loss": 0.6829, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.021196747198700905, + "rewards/margins": 0.04621582105755806, + "rewards/rejected": -0.025019073858857155, + "step": 26 + }, + { + "epoch": 0.05, + "learning_rate": 1.134453781512605e-07, + "logits/chosen": -1.2850056886672974, + "logits/rejected": -1.527043104171753, + "logps/chosen": -293.0238037109375, + "logps/rejected": -110.18681335449219, + "loss": 0.6859, + "rewards/accuracies": 0.0, + "rewards/chosen": -0.069427490234375, + "rewards/margins": -0.04693755879998207, + "rewards/rejected": -0.02248992957174778, + "step": 27 + }, + { + "epoch": 0.05, + "learning_rate": 1.176470588235294e-07, + "logits/chosen": -2.0202457904815674, + "logits/rejected": -2.382385730743408, + "logps/chosen": -323.6606750488281, + "logps/rejected": -179.65538024902344, + "loss": 0.6833, + "rewards/accuracies": 0.0, + "rewards/chosen": -0.028132058680057526, + "rewards/margins": -0.018391229212284088, + "rewards/rejected": -0.009740829467773438, + "step": 28 + }, + { + "epoch": 0.05, + "learning_rate": 1.2184873949579832e-07, + "logits/chosen": -1.9959008693695068, + "logits/rejected": -1.408521294593811, + "logps/chosen": -221.8274383544922, + "logps/rejected": -225.3356475830078, + "loss": 0.6799, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.023763515055179596, + "rewards/margins": 0.14066720008850098, + "rewards/rejected": -0.11690368503332138, + "step": 29 + }, + { + "epoch": 0.05, + "learning_rate": 1.2605042016806723e-07, + "logits/chosen": -1.8002355098724365, + "logits/rejected": -1.521448016166687, + "logps/chosen": -94.72344970703125, + "logps/rejected": -110.32486724853516, + "loss": 0.6821, + "rewards/accuracies": 0.0, + "rewards/chosen": -0.054589081555604935, + "rewards/margins": -0.024533655494451523, + "rewards/rejected": -0.03005542792379856, + "step": 30 + }, + { + "epoch": 0.05, + "learning_rate": 1.3025210084033613e-07, + "logits/chosen": -1.8377197980880737, + "logits/rejected": -2.063385248184204, + "logps/chosen": -62.164398193359375, + "logps/rejected": -104.72893524169922, + "loss": 0.679, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.019611358642578125, + "rewards/margins": -0.0029705059714615345, + "rewards/rejected": 0.022581864148378372, + "step": 31 + }, + { + "epoch": 0.05, + "learning_rate": 1.3445378151260504e-07, + "logits/chosen": -2.392535924911499, + "logits/rejected": -2.1506621837615967, + "logps/chosen": -11.346028327941895, + "logps/rejected": -71.765625, + "loss": 0.6785, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.015737399458885193, + "rewards/margins": -0.004548127297312021, + "rewards/rejected": -0.01118927076458931, + "step": 32 + }, + { + "epoch": 0.06, + "learning_rate": 1.3865546218487394e-07, + "logits/chosen": -2.0362868309020996, + "logits/rejected": -2.1367034912109375, + "logps/chosen": -266.005859375, + "logps/rejected": -214.60191345214844, + "loss": 0.6788, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.052748873829841614, + "rewards/margins": 0.09725818783044815, + "rewards/rejected": -0.04450931400060654, + "step": 33 + }, + { + "epoch": 0.06, + "learning_rate": 1.4285714285714285e-07, + "logits/chosen": -1.3824855089187622, + "logits/rejected": -1.5640826225280762, + "logps/chosen": -111.83187866210938, + "logps/rejected": -36.405189514160156, + "loss": 0.6693, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.008962631225585938, + "rewards/margins": 0.011541889980435371, + "rewards/rejected": -0.02050452120602131, + "step": 34 + }, + { + "epoch": 0.06, + "learning_rate": 1.4705882352941175e-07, + "logits/chosen": -2.272282361984253, + "logits/rejected": -2.159532308578491, + "logps/chosen": -43.7902717590332, + "logps/rejected": -74.43631744384766, + "loss": 0.6652, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.01943950727581978, + "rewards/margins": 0.03280620649456978, + "rewards/rejected": -0.01336669921875, + "step": 35 + }, + { + "epoch": 0.06, + "learning_rate": 1.5126050420168066e-07, + "logits/chosen": -2.0161397457122803, + "logits/rejected": -1.3697400093078613, + "logps/chosen": -67.14362335205078, + "logps/rejected": -123.745361328125, + "loss": 0.6641, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.0367613323032856, + "rewards/margins": 0.04867387190461159, + "rewards/rejected": -0.01191253773868084, + "step": 36 + }, + { + "epoch": 0.06, + "learning_rate": 1.554621848739496e-07, + "logits/chosen": -2.1913275718688965, + "logits/rejected": -1.7024658918380737, + "logps/chosen": -10.184264183044434, + "logps/rejected": -107.8653793334961, + "loss": 0.6593, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.004525709431618452, + "rewards/margins": -0.01936373859643936, + "rewards/rejected": 0.014838028699159622, + "step": 37 + }, + { + "epoch": 0.06, + "learning_rate": 1.5966386554621847e-07, + "logits/chosen": -1.081247091293335, + "logits/rejected": -2.124126434326172, + "logps/chosen": -789.7781982421875, + "logps/rejected": -147.115966796875, + "loss": 0.659, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.05167846754193306, + "rewards/margins": 0.20409394800662994, + "rewards/rejected": -0.152415469288826, + "step": 38 + }, + { + "epoch": 0.07, + "learning_rate": 1.638655462184874e-07, + "logits/chosen": -2.1789205074310303, + "logits/rejected": -1.1509499549865723, + "logps/chosen": -295.66265869140625, + "logps/rejected": -394.5150451660156, + "loss": 0.6517, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.12732239067554474, + "rewards/margins": 0.1430404633283615, + "rewards/rejected": -0.27036285400390625, + "step": 39 + }, + { + "epoch": 0.07, + "learning_rate": 1.680672268907563e-07, + "logits/chosen": -2.1448452472686768, + "logits/rejected": -2.1956920623779297, + "logps/chosen": -62.93687438964844, + "logps/rejected": -84.88615417480469, + "loss": 0.6534, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.008682060055434704, + "rewards/margins": 0.0031255725771188736, + "rewards/rejected": -0.011807632632553577, + "step": 40 + }, + { + "epoch": 0.07, + "learning_rate": 1.722689075630252e-07, + "logits/chosen": -1.4040545225143433, + "logits/rejected": -0.7300827503204346, + "logps/chosen": -326.945068359375, + "logps/rejected": -324.3778076171875, + "loss": 0.6435, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.008136749267578125, + "rewards/margins": 0.2589103579521179, + "rewards/rejected": -0.26704710721969604, + "step": 41 + }, + { + "epoch": 0.07, + "learning_rate": 1.764705882352941e-07, + "logits/chosen": -1.4858357906341553, + "logits/rejected": -2.0196330547332764, + "logps/chosen": -459.7484130859375, + "logps/rejected": -210.59243774414062, + "loss": 0.6423, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.13091735541820526, + "rewards/margins": -0.0402679406106472, + "rewards/rejected": -0.09064941853284836, + "step": 42 + }, + { + "epoch": 0.07, + "learning_rate": 1.8067226890756302e-07, + "logits/chosen": -1.4100688695907593, + "logits/rejected": -2.2512903213500977, + "logps/chosen": -263.8143615722656, + "logps/rejected": -82.95572662353516, + "loss": 0.6408, + "rewards/accuracies": 0.0, + "rewards/chosen": -0.052451327443122864, + "rewards/margins": -0.04398571699857712, + "rewards/rejected": -0.008465608581900597, + "step": 43 + }, + { + "epoch": 0.07, + "learning_rate": 1.8487394957983192e-07, + "logits/chosen": -1.654313325881958, + "logits/rejected": -1.3700717687606812, + "logps/chosen": -167.07334899902344, + "logps/rejected": -133.7058868408203, + "loss": 0.646, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.036783602088689804, + "rewards/margins": 0.10394057631492615, + "rewards/rejected": -0.14072418212890625, + "step": 44 + }, + { + "epoch": 0.08, + "learning_rate": 1.8907563025210083e-07, + "logits/chosen": -2.3346948623657227, + "logits/rejected": -1.4270800352096558, + "logps/chosen": -363.33868408203125, + "logps/rejected": -202.8612060546875, + "loss": 0.6329, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.11517754197120667, + "rewards/margins": -0.01095886155962944, + "rewards/rejected": -0.10421867668628693, + "step": 45 + }, + { + "epoch": 0.08, + "learning_rate": 1.9327731092436976e-07, + "logits/chosen": -1.5132286548614502, + "logits/rejected": -0.9802812933921814, + "logps/chosen": -463.6932067871094, + "logps/rejected": -284.0732421875, + "loss": 0.6186, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.11024780571460724, + "rewards/margins": 0.5847091674804688, + "rewards/rejected": -0.6949569582939148, + "step": 46 + }, + { + "epoch": 0.08, + "learning_rate": 1.9747899159663864e-07, + "logits/chosen": -1.640755295753479, + "logits/rejected": -2.064528465270996, + "logps/chosen": -141.10398864746094, + "logps/rejected": -74.3631362915039, + "loss": 0.6133, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.003640557639300823, + "rewards/margins": 0.031121447682380676, + "rewards/rejected": -0.02748088911175728, + "step": 47 + }, + { + "epoch": 0.08, + "learning_rate": 2.0168067226890757e-07, + "logits/chosen": -1.3043317794799805, + "logits/rejected": -0.7944495677947998, + "logps/chosen": -236.87294006347656, + "logps/rejected": -140.55502319335938, + "loss": 0.6065, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.0874466672539711, + "rewards/margins": 0.6055868864059448, + "rewards/rejected": -0.5181402564048767, + "step": 48 + }, + { + "epoch": 0.08, + "learning_rate": 2.0588235294117645e-07, + "logits/chosen": -2.2910568714141846, + "logits/rejected": -1.5469049215316772, + "logps/chosen": -50.14934539794922, + "logps/rejected": -230.92745971679688, + "loss": 0.6062, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.012336349114775658, + "rewards/margins": 0.3008907437324524, + "rewards/rejected": -0.3132270872592926, + "step": 49 + }, + { + "epoch": 0.08, + "learning_rate": 2.1008403361344538e-07, + "logits/chosen": -1.6005315780639648, + "logits/rejected": -2.0069663524627686, + "logps/chosen": -300.32855224609375, + "logps/rejected": -70.41799926757812, + "loss": 0.5983, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.16448670625686646, + "rewards/margins": -0.07991065829992294, + "rewards/rejected": -0.08457604050636292, + "step": 50 + }, + { + "epoch": 0.09, + "learning_rate": 2.1428571428571426e-07, + "logits/chosen": -1.6963545083999634, + "logits/rejected": -1.8104299306869507, + "logps/chosen": -224.7620391845703, + "logps/rejected": -106.64592742919922, + "loss": 0.5888, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.040007784962654114, + "rewards/margins": 0.2745014429092407, + "rewards/rejected": -0.31450921297073364, + "step": 51 + }, + { + "epoch": 0.09, + "learning_rate": 2.184873949579832e-07, + "logits/chosen": -1.519837737083435, + "logits/rejected": -1.6336404085159302, + "logps/chosen": -302.5843505859375, + "logps/rejected": -260.89599609375, + "loss": 0.5738, + "rewards/accuracies": 0.0, + "rewards/chosen": -0.00217361468821764, + "rewards/margins": -0.08065643161535263, + "rewards/rejected": 0.07848282158374786, + "step": 52 + }, + { + "epoch": 0.09, + "learning_rate": 2.226890756302521e-07, + "logits/chosen": -0.7233390212059021, + "logits/rejected": -0.5498945116996765, + "logps/chosen": -314.1108093261719, + "logps/rejected": -156.276611328125, + "loss": 0.5757, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.03915100172162056, + "rewards/margins": 0.5292686223983765, + "rewards/rejected": -0.5684196352958679, + "step": 53 + }, + { + "epoch": 0.09, + "learning_rate": 2.26890756302521e-07, + "logits/chosen": -2.079944610595703, + "logits/rejected": -2.1164326667785645, + "logps/chosen": -363.44049072265625, + "logps/rejected": -204.23228454589844, + "loss": 0.5763, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.03126373142004013, + "rewards/margins": 0.17057648301124573, + "rewards/rejected": -0.139312744140625, + "step": 54 + }, + { + "epoch": 0.09, + "learning_rate": 2.3109243697478993e-07, + "logits/chosen": -0.8852956891059875, + "logits/rejected": -1.17733633518219, + "logps/chosen": -229.51646423339844, + "logps/rejected": -123.25852966308594, + "loss": 0.5658, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.012698173522949219, + "rewards/margins": 0.3284967541694641, + "rewards/rejected": -0.3157985806465149, + "step": 55 + }, + { + "epoch": 0.09, + "learning_rate": 2.352941176470588e-07, + "logits/chosen": -1.2436057329177856, + "logits/rejected": -1.3107479810714722, + "logps/chosen": -507.35784912109375, + "logps/rejected": -224.55007934570312, + "loss": 0.5549, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.12623444199562073, + "rewards/margins": 0.4832092523574829, + "rewards/rejected": -0.6094436645507812, + "step": 56 + }, + { + "epoch": 0.1, + "learning_rate": 2.394957983193277e-07, + "logits/chosen": -1.765979290008545, + "logits/rejected": -2.5764899253845215, + "logps/chosen": -305.29486083984375, + "logps/rejected": -71.30033874511719, + "loss": 0.5634, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.14705120027065277, + "rewards/margins": 0.1404399424791336, + "rewards/rejected": 0.00661125173792243, + "step": 57 + }, + { + "epoch": 0.1, + "learning_rate": 2.4369747899159664e-07, + "logits/chosen": -0.8349874019622803, + "logits/rejected": -0.4467710256576538, + "logps/chosen": -396.75067138671875, + "logps/rejected": -238.7788848876953, + "loss": 0.5588, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.2361343502998352, + "rewards/margins": 0.7512519955635071, + "rewards/rejected": -0.9873863458633423, + "step": 58 + }, + { + "epoch": 0.1, + "learning_rate": 2.478991596638655e-07, + "logits/chosen": -1.1839135885238647, + "logits/rejected": -1.342013955116272, + "logps/chosen": -506.8759460449219, + "logps/rejected": -188.46533203125, + "loss": 0.5478, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.19339600205421448, + "rewards/margins": 0.9577789306640625, + "rewards/rejected": -1.1511750221252441, + "step": 59 + }, + { + "epoch": 0.1, + "learning_rate": 2.5210084033613445e-07, + "logits/chosen": -1.238139271736145, + "logits/rejected": -1.305624008178711, + "logps/chosen": -354.024169921875, + "logps/rejected": -150.709228515625, + "loss": 0.5474, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.03719634935259819, + "rewards/margins": 0.4611190855503082, + "rewards/rejected": -0.4983154535293579, + "step": 60 + }, + { + "epoch": 0.1, + "learning_rate": 2.5630252100840333e-07, + "logits/chosen": -1.0698193311691284, + "logits/rejected": -1.1871693134307861, + "logps/chosen": -533.863525390625, + "logps/rejected": -207.14418029785156, + "loss": 0.5121, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.06107788532972336, + "rewards/margins": 1.0981537103652954, + "rewards/rejected": -1.1592315435409546, + "step": 61 + }, + { + "epoch": 0.1, + "learning_rate": 2.6050420168067226e-07, + "logits/chosen": -1.856410026550293, + "logits/rejected": -1.372816801071167, + "logps/chosen": -205.7808074951172, + "logps/rejected": -189.2294464111328, + "loss": 0.5077, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.07348620891571045, + "rewards/margins": 0.7908002138137817, + "rewards/rejected": -0.8642864227294922, + "step": 62 + }, + { + "epoch": 0.11, + "learning_rate": 2.6470588235294114e-07, + "logits/chosen": -1.718478798866272, + "logits/rejected": -1.663999080657959, + "logps/chosen": -604.8311767578125, + "logps/rejected": -742.4992065429688, + "loss": 0.518, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.17543946206569672, + "rewards/margins": -0.012347415089607239, + "rewards/rejected": -0.16309204697608948, + "step": 63 + }, + { + "epoch": 0.11, + "learning_rate": 2.689075630252101e-07, + "logits/chosen": -2.534972906112671, + "logits/rejected": -2.4803988933563232, + "logps/chosen": -27.763654708862305, + "logps/rejected": -65.91168975830078, + "loss": 0.4941, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.041327860206365585, + "rewards/margins": 0.3402096629142761, + "rewards/rejected": -0.2988818287849426, + "step": 64 + }, + { + "epoch": 0.11, + "learning_rate": 2.7310924369747895e-07, + "logits/chosen": -1.866020679473877, + "logits/rejected": -1.5979124307632446, + "logps/chosen": -348.8237609863281, + "logps/rejected": -415.2561950683594, + "loss": 0.4814, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.09458465874195099, + "rewards/margins": 0.18892823159694672, + "rewards/rejected": -0.2835128903388977, + "step": 65 + }, + { + "epoch": 0.11, + "learning_rate": 2.773109243697479e-07, + "logits/chosen": -1.7812227010726929, + "logits/rejected": -1.2362346649169922, + "logps/chosen": -210.58091735839844, + "logps/rejected": -221.3896026611328, + "loss": 0.4545, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.13596276938915253, + "rewards/margins": 1.656264305114746, + "rewards/rejected": -1.7922271490097046, + "step": 66 + }, + { + "epoch": 0.11, + "learning_rate": 2.815126050420168e-07, + "logits/chosen": -1.5426255464553833, + "logits/rejected": -1.5356191396713257, + "logps/chosen": -36.993186950683594, + "logps/rejected": -77.3095474243164, + "loss": 0.4601, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.021850014105439186, + "rewards/margins": 0.5636359453201294, + "rewards/rejected": -0.5417859554290771, + "step": 67 + }, + { + "epoch": 0.11, + "learning_rate": 2.857142857142857e-07, + "logits/chosen": -1.814134955406189, + "logits/rejected": -1.493807077407837, + "logps/chosen": -214.3492431640625, + "logps/rejected": -331.2470397949219, + "loss": 0.466, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.041826628148555756, + "rewards/margins": 0.37629854679107666, + "rewards/rejected": -0.3344719111919403, + "step": 68 + }, + { + "epoch": 0.12, + "learning_rate": 2.899159663865546e-07, + "logits/chosen": -1.104387640953064, + "logits/rejected": -1.4887744188308716, + "logps/chosen": -790.798583984375, + "logps/rejected": -575.5951538085938, + "loss": 0.4556, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.17354126274585724, + "rewards/margins": 0.449990838766098, + "rewards/rejected": -0.276449590921402, + "step": 69 + }, + { + "epoch": 0.12, + "learning_rate": 2.941176470588235e-07, + "logits/chosen": -1.1406750679016113, + "logits/rejected": -1.7379848957061768, + "logps/chosen": -677.4329833984375, + "logps/rejected": -125.09095001220703, + "loss": 0.4216, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.29751741886138916, + "rewards/margins": 1.025307536125183, + "rewards/rejected": -0.727790117263794, + "step": 70 + }, + { + "epoch": 0.12, + "learning_rate": 2.9831932773109244e-07, + "logits/chosen": -1.548018455505371, + "logits/rejected": -1.9434715509414673, + "logps/chosen": -60.39828109741211, + "logps/rejected": -38.36094665527344, + "loss": 0.435, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.07226741313934326, + "rewards/margins": 0.6180351972579956, + "rewards/rejected": -0.6903026103973389, + "step": 71 + }, + { + "epoch": 0.12, + "learning_rate": 3.025210084033613e-07, + "logits/chosen": -1.4888752698898315, + "logits/rejected": -1.3269966840744019, + "logps/chosen": -333.23468017578125, + "logps/rejected": -181.92431640625, + "loss": 0.4167, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.046831514686346054, + "rewards/margins": 2.105104923248291, + "rewards/rejected": -2.1519362926483154, + "step": 72 + }, + { + "epoch": 0.12, + "learning_rate": 3.0672268907563024e-07, + "logits/chosen": -1.5784661769866943, + "logits/rejected": -1.5319206714630127, + "logps/chosen": -37.48617172241211, + "logps/rejected": -38.057193756103516, + "loss": 0.4306, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.07858496159315109, + "rewards/margins": 0.3954930901527405, + "rewards/rejected": -0.47407805919647217, + "step": 73 + }, + { + "epoch": 0.12, + "learning_rate": 3.109243697478992e-07, + "logits/chosen": -2.385610580444336, + "logits/rejected": -2.365718364715576, + "logps/chosen": -31.291919708251953, + "logps/rejected": -60.941036224365234, + "loss": 0.3708, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.0072297099977731705, + "rewards/margins": 0.6486601829528809, + "rewards/rejected": -0.6414304971694946, + "step": 74 + }, + { + "epoch": 0.13, + "learning_rate": 3.1512605042016805e-07, + "logits/chosen": -1.8049649000167847, + "logits/rejected": -1.474593162536621, + "logps/chosen": -145.8429412841797, + "logps/rejected": -114.68021392822266, + "loss": 0.3998, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.15717864036560059, + "rewards/margins": 0.8213388919830322, + "rewards/rejected": -0.9785175919532776, + "step": 75 + }, + { + "epoch": 0.13, + "learning_rate": 3.1932773109243693e-07, + "logits/chosen": -1.4387171268463135, + "logits/rejected": -1.3081284761428833, + "logps/chosen": -72.28972625732422, + "logps/rejected": -99.7930679321289, + "loss": 0.4351, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.03587455675005913, + "rewards/margins": 0.7939237356185913, + "rewards/rejected": -0.7580491900444031, + "step": 76 + }, + { + "epoch": 0.13, + "learning_rate": 3.2352941176470586e-07, + "logits/chosen": -1.4761199951171875, + "logits/rejected": -2.109046459197998, + "logps/chosen": -155.40235900878906, + "logps/rejected": -155.92733764648438, + "loss": 0.395, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.06405086815357208, + "rewards/margins": 0.329689621925354, + "rewards/rejected": -0.3937404751777649, + "step": 77 + }, + { + "epoch": 0.13, + "learning_rate": 3.277310924369748e-07, + "logits/chosen": -2.038198232650757, + "logits/rejected": -2.2060189247131348, + "logps/chosen": -148.52001953125, + "logps/rejected": -190.42556762695312, + "loss": 0.3962, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.5522751212120056, + "rewards/margins": 0.17331847548484802, + "rewards/rejected": -0.725593626499176, + "step": 78 + }, + { + "epoch": 0.13, + "learning_rate": 3.319327731092437e-07, + "logits/chosen": -1.1491724252700806, + "logits/rejected": -0.9036651849746704, + "logps/chosen": -343.11529541015625, + "logps/rejected": -174.84776306152344, + "loss": 0.4158, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.14253844320774078, + "rewards/margins": 2.43456768989563, + "rewards/rejected": -2.577106237411499, + "step": 79 + }, + { + "epoch": 0.13, + "learning_rate": 3.361344537815126e-07, + "logits/chosen": -1.5111039876937866, + "logits/rejected": -1.9596521854400635, + "logps/chosen": -572.5203857421875, + "logps/rejected": -325.9815368652344, + "loss": 0.373, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.3365722894668579, + "rewards/margins": 2.104917287826538, + "rewards/rejected": -1.7683449983596802, + "step": 80 + }, + { + "epoch": 0.14, + "learning_rate": 3.403361344537815e-07, + "logits/chosen": -1.8409173488616943, + "logits/rejected": -1.9764267206192017, + "logps/chosen": -352.3498840332031, + "logps/rejected": -321.4953308105469, + "loss": 0.3865, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.22799532115459442, + "rewards/margins": 0.7164055109024048, + "rewards/rejected": -0.48841017484664917, + "step": 81 + }, + { + "epoch": 0.14, + "learning_rate": 3.445378151260504e-07, + "logits/chosen": -1.8073253631591797, + "logits/rejected": -2.74298357963562, + "logps/chosen": -222.5423583984375, + "logps/rejected": -52.44717788696289, + "loss": 0.3857, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.19195251166820526, + "rewards/margins": 0.3153046667575836, + "rewards/rejected": -0.5072571635246277, + "step": 82 + }, + { + "epoch": 0.14, + "learning_rate": 3.487394957983193e-07, + "logits/chosen": -2.154622793197632, + "logits/rejected": -1.6847541332244873, + "logps/chosen": -208.72132873535156, + "logps/rejected": -175.53054809570312, + "loss": 0.3637, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.23619385063648224, + "rewards/margins": 1.846364140510559, + "rewards/rejected": -2.0825579166412354, + "step": 83 + }, + { + "epoch": 0.14, + "learning_rate": 3.529411764705882e-07, + "logits/chosen": -1.876042127609253, + "logits/rejected": -1.3837803602218628, + "logps/chosen": -53.90886306762695, + "logps/rejected": -144.5306396484375, + "loss": 0.364, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.01596364937722683, + "rewards/margins": 0.7440950274467468, + "rewards/rejected": -0.7600586414337158, + "step": 84 + }, + { + "epoch": 0.14, + "learning_rate": 3.5714285714285716e-07, + "logits/chosen": -1.4703798294067383, + "logits/rejected": -1.7445605993270874, + "logps/chosen": -269.6485900878906, + "logps/rejected": -158.86940002441406, + "loss": 0.3672, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.10937881469726562, + "rewards/margins": 2.3677597045898438, + "rewards/rejected": -2.258380889892578, + "step": 85 + }, + { + "epoch": 0.15, + "learning_rate": 3.6134453781512604e-07, + "logits/chosen": -1.6020888090133667, + "logits/rejected": -1.6218175888061523, + "logps/chosen": -152.8944091796875, + "logps/rejected": -168.00990295410156, + "loss": 0.3729, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.5375404357910156, + "rewards/margins": 0.09828647971153259, + "rewards/rejected": -0.6358269453048706, + "step": 86 + }, + { + "epoch": 0.15, + "learning_rate": 3.655462184873949e-07, + "logits/chosen": -1.5672448873519897, + "logits/rejected": -1.6510668992996216, + "logps/chosen": -447.45806884765625, + "logps/rejected": -239.46371459960938, + "loss": 0.3704, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.2388494461774826, + "rewards/margins": 3.1040165424346924, + "rewards/rejected": -3.3428659439086914, + "step": 87 + }, + { + "epoch": 0.15, + "learning_rate": 3.6974789915966385e-07, + "logits/chosen": -0.6570608615875244, + "logits/rejected": -0.8279274702072144, + "logps/chosen": -395.8023376464844, + "logps/rejected": -155.96295166015625, + "loss": 0.3316, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.0513305589556694, + "rewards/margins": 3.0571579933166504, + "rewards/rejected": -3.0058274269104004, + "step": 88 + }, + { + "epoch": 0.15, + "learning_rate": 3.739495798319328e-07, + "logits/chosen": -1.7863593101501465, + "logits/rejected": -2.064410924911499, + "logps/chosen": -333.698486328125, + "logps/rejected": -122.79313659667969, + "loss": 0.3579, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.06812648475170135, + "rewards/margins": 0.5757344365119934, + "rewards/rejected": -0.643860936164856, + "step": 89 + }, + { + "epoch": 0.15, + "learning_rate": 3.7815126050420166e-07, + "logits/chosen": -1.7475690841674805, + "logits/rejected": -2.228104591369629, + "logps/chosen": -287.8011169433594, + "logps/rejected": -31.779052734375, + "loss": 0.3574, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.38655588030815125, + "rewards/margins": 0.9471727609634399, + "rewards/rejected": -0.5606168508529663, + "step": 90 + }, + { + "epoch": 0.15, + "learning_rate": 3.8235294117647053e-07, + "logits/chosen": -1.7612462043762207, + "logits/rejected": -1.4105805158615112, + "logps/chosen": -122.49490356445312, + "logps/rejected": -187.72357177734375, + "loss": 0.3251, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.22412091493606567, + "rewards/margins": 2.9434642791748047, + "rewards/rejected": -3.1675851345062256, + "step": 91 + }, + { + "epoch": 0.16, + "learning_rate": 3.865546218487395e-07, + "logits/chosen": -1.4474364519119263, + "logits/rejected": -1.5902643203735352, + "logps/chosen": -239.24310302734375, + "logps/rejected": -135.27609252929688, + "loss": 0.3662, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.21128883957862854, + "rewards/margins": 1.6379708051681519, + "rewards/rejected": -1.849259614944458, + "step": 92 + }, + { + "epoch": 0.16, + "learning_rate": 3.907563025210084e-07, + "logits/chosen": -1.1318254470825195, + "logits/rejected": -1.4229687452316284, + "logps/chosen": -260.1365966796875, + "logps/rejected": -74.00960540771484, + "loss": 0.3516, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.2649814486503601, + "rewards/margins": 1.8917981386184692, + "rewards/rejected": -1.6268166303634644, + "step": 93 + }, + { + "epoch": 0.16, + "learning_rate": 3.949579831932773e-07, + "logits/chosen": -1.8826991319656372, + "logits/rejected": -2.357274055480957, + "logps/chosen": -311.9720458984375, + "logps/rejected": -158.02745056152344, + "loss": 0.3471, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.05377950519323349, + "rewards/margins": 1.7474021911621094, + "rewards/rejected": -1.6936227083206177, + "step": 94 + }, + { + "epoch": 0.16, + "learning_rate": 3.991596638655462e-07, + "logits/chosen": -1.2974653244018555, + "logits/rejected": -1.5636136531829834, + "logps/chosen": -478.4207763671875, + "logps/rejected": -169.5061492919922, + "loss": 0.2943, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.32736513018608093, + "rewards/margins": 3.4351882934570312, + "rewards/rejected": -3.107823133468628, + "step": 95 + }, + { + "epoch": 0.16, + "learning_rate": 4.0336134453781514e-07, + "logits/chosen": -1.6629210710525513, + "logits/rejected": -1.3563766479492188, + "logps/chosen": -302.283447265625, + "logps/rejected": -185.8943328857422, + "loss": 0.3216, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.08565587550401688, + "rewards/margins": 2.939385414123535, + "rewards/rejected": -2.853729486465454, + "step": 96 + }, + { + "epoch": 0.16, + "learning_rate": 4.07563025210084e-07, + "logits/chosen": -1.579493761062622, + "logits/rejected": -1.9858088493347168, + "logps/chosen": -166.11209106445312, + "logps/rejected": -93.21321868896484, + "loss": 0.2999, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.39952126145362854, + "rewards/margins": 0.800835371017456, + "rewards/rejected": -1.2003566026687622, + "step": 97 + }, + { + "epoch": 0.17, + "learning_rate": 4.117647058823529e-07, + "logits/chosen": -1.9741665124893188, + "logits/rejected": -1.3600385189056396, + "logps/chosen": -258.8650817871094, + "logps/rejected": -185.20352172851562, + "loss": 0.3346, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.09575500339269638, + "rewards/margins": 2.4661808013916016, + "rewards/rejected": -2.5619359016418457, + "step": 98 + }, + { + "epoch": 0.17, + "learning_rate": 4.159663865546218e-07, + "logits/chosen": -1.7311229705810547, + "logits/rejected": -2.162808418273926, + "logps/chosen": -423.9176940917969, + "logps/rejected": -175.74667358398438, + "loss": 0.2905, + "rewards/accuracies": 0.0, + "rewards/chosen": -1.4093338251113892, + "rewards/margins": -0.9930892586708069, + "rewards/rejected": -0.4162445068359375, + "step": 99 + }, + { + "epoch": 0.17, + "learning_rate": 4.2016806722689076e-07, + "logits/chosen": -1.7871568202972412, + "logits/rejected": -1.6674413681030273, + "logps/chosen": -133.9112548828125, + "logps/rejected": -180.4711456298828, + "loss": 0.2967, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.17781352996826172, + "rewards/margins": 1.3768540620803833, + "rewards/rejected": -1.554667592048645, + "step": 100 + }, + { + "epoch": 0.17, + "learning_rate": 4.2436974789915964e-07, + "logits/chosen": -1.9930833578109741, + "logits/rejected": -2.3848719596862793, + "logps/chosen": -248.69406127929688, + "logps/rejected": -160.950927734375, + "loss": 0.2864, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.7147164344787598, + "rewards/margins": 1.7294585704803467, + "rewards/rejected": -2.4441750049591064, + "step": 101 + }, + { + "epoch": 0.17, + "learning_rate": 4.285714285714285e-07, + "logits/chosen": -1.812693476676941, + "logits/rejected": -1.6050926446914673, + "logps/chosen": -277.7933654785156, + "logps/rejected": -483.4625549316406, + "loss": 0.3317, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.6971309781074524, + "rewards/margins": -0.4464415907859802, + "rewards/rejected": -0.2506893575191498, + "step": 102 + }, + { + "epoch": 0.17, + "learning_rate": 4.327731092436975e-07, + "logits/chosen": -1.461750864982605, + "logits/rejected": -1.7464590072631836, + "logps/chosen": -173.95265197753906, + "logps/rejected": -135.8446807861328, + "loss": 0.2714, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.12809354066848755, + "rewards/margins": 3.298431396484375, + "rewards/rejected": -3.4265246391296387, + "step": 103 + }, + { + "epoch": 0.18, + "learning_rate": 4.369747899159664e-07, + "logits/chosen": -1.5056589841842651, + "logits/rejected": -1.9312864542007446, + "logps/chosen": -201.46270751953125, + "logps/rejected": -267.52276611328125, + "loss": 0.3103, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.1706157773733139, + "rewards/margins": 0.6676197052001953, + "rewards/rejected": -0.4970039427280426, + "step": 104 + }, + { + "epoch": 0.18, + "learning_rate": 4.4117647058823526e-07, + "logits/chosen": -2.0907585620880127, + "logits/rejected": -1.849104642868042, + "logps/chosen": -21.733829498291016, + "logps/rejected": -76.71643829345703, + "loss": 0.2871, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.0892881453037262, + "rewards/margins": 1.409407138824463, + "rewards/rejected": -1.4986952543258667, + "step": 105 + }, + { + "epoch": 0.18, + "learning_rate": 4.453781512605042e-07, + "logits/chosen": -0.9986115097999573, + "logits/rejected": -0.6594001054763794, + "logps/chosen": -368.1170654296875, + "logps/rejected": -226.82830810546875, + "loss": 0.3077, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.4677414000034332, + "rewards/margins": 4.347979545593262, + "rewards/rejected": -4.815721035003662, + "step": 106 + }, + { + "epoch": 0.18, + "learning_rate": 4.495798319327731e-07, + "logits/chosen": -1.8972766399383545, + "logits/rejected": -2.1591522693634033, + "logps/chosen": -341.3930358886719, + "logps/rejected": -195.02679443359375, + "loss": 0.28, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.6874268054962158, + "rewards/margins": 2.0455260276794434, + "rewards/rejected": -2.732952833175659, + "step": 107 + }, + { + "epoch": 0.18, + "learning_rate": 4.53781512605042e-07, + "logits/chosen": -1.1454424858093262, + "logits/rejected": -1.2674638032913208, + "logps/chosen": -141.8154754638672, + "logps/rejected": -19.07546043395996, + "loss": 0.3023, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.4041498303413391, + "rewards/margins": 1.1306045055389404, + "rewards/rejected": -0.7264547348022461, + "step": 108 + }, + { + "epoch": 0.18, + "learning_rate": 4.579831932773109e-07, + "logits/chosen": -2.07033109664917, + "logits/rejected": -2.380950450897217, + "logps/chosen": -80.41409301757812, + "logps/rejected": -98.79011535644531, + "loss": 0.2933, + "rewards/accuracies": 0.0, + "rewards/chosen": -0.43403300642967224, + "rewards/margins": -0.3926330506801605, + "rewards/rejected": -0.04139995574951172, + "step": 109 + }, + { + "epoch": 0.19, + "learning_rate": 4.6218487394957986e-07, + "logits/chosen": -2.5270440578460693, + "logits/rejected": -2.435595750808716, + "logps/chosen": -24.47152328491211, + "logps/rejected": -148.6935577392578, + "loss": 0.3021, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.11162052303552628, + "rewards/margins": 3.680636405944824, + "rewards/rejected": -3.7922568321228027, + "step": 110 + }, + { + "epoch": 0.19, + "learning_rate": 4.6638655462184874e-07, + "logits/chosen": -1.4159996509552002, + "logits/rejected": -1.1079641580581665, + "logps/chosen": -213.67677307128906, + "logps/rejected": -146.0835418701172, + "loss": 0.2737, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.6399146914482117, + "rewards/margins": 2.671748161315918, + "rewards/rejected": -3.3116626739501953, + "step": 111 + }, + { + "epoch": 0.19, + "learning_rate": 4.705882352941176e-07, + "logits/chosen": -1.859910488128662, + "logits/rejected": -2.269141435623169, + "logps/chosen": -270.6617126464844, + "logps/rejected": -149.7200164794922, + "loss": 0.2958, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.3810228407382965, + "rewards/margins": 2.3572652339935303, + "rewards/rejected": -2.738288164138794, + "step": 112 + }, + { + "epoch": 0.19, + "learning_rate": 4.747899159663865e-07, + "logits/chosen": -1.615531325340271, + "logits/rejected": -2.3673205375671387, + "logps/chosen": -264.86578369140625, + "logps/rejected": -242.48330688476562, + "loss": 0.292, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.4372093081474304, + "rewards/margins": 2.0082688331604004, + "rewards/rejected": -2.4454782009124756, + "step": 113 + }, + { + "epoch": 0.19, + "learning_rate": 4.789915966386554e-07, + "logits/chosen": -1.3283716440200806, + "logits/rejected": -1.4000985622406006, + "logps/chosen": -751.4788818359375, + "logps/rejected": -504.08892822265625, + "loss": 0.2725, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.6655944585800171, + "rewards/margins": 4.143014907836914, + "rewards/rejected": -3.4774200916290283, + "step": 114 + }, + { + "epoch": 0.19, + "learning_rate": 4.831932773109244e-07, + "logits/chosen": -1.942575216293335, + "logits/rejected": -1.5764302015304565, + "logps/chosen": -49.1660270690918, + "logps/rejected": -56.93123245239258, + "loss": 0.2784, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.0690576359629631, + "rewards/margins": 1.2642771005630493, + "rewards/rejected": -1.3333348035812378, + "step": 115 + }, + { + "epoch": 0.2, + "learning_rate": 4.873949579831933e-07, + "logits/chosen": -2.2477641105651855, + "logits/rejected": -2.076430559158325, + "logps/chosen": -29.411571502685547, + "logps/rejected": -121.75186920166016, + "loss": 0.3125, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.06398458778858185, + "rewards/margins": 3.124544143676758, + "rewards/rejected": -3.1885287761688232, + "step": 116 + }, + { + "epoch": 0.2, + "learning_rate": 4.915966386554621e-07, + "logits/chosen": -2.0187594890594482, + "logits/rejected": -1.1486027240753174, + "logps/chosen": -329.5198974609375, + "logps/rejected": -279.2951354980469, + "loss": 0.2825, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.12247312068939209, + "rewards/margins": 6.082241058349609, + "rewards/rejected": -5.959768295288086, + "step": 117 + }, + { + "epoch": 0.2, + "learning_rate": 4.95798319327731e-07, + "logits/chosen": -1.4514484405517578, + "logits/rejected": -2.005096435546875, + "logps/chosen": -317.6128845214844, + "logps/rejected": -207.64822387695312, + "loss": 0.2737, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.06410064548254013, + "rewards/margins": 2.704306125640869, + "rewards/rejected": -2.6402053833007812, + "step": 118 + }, + { + "epoch": 0.2, + "learning_rate": 5e-07, + "logits/chosen": -0.627937376499176, + "logits/rejected": -0.6839653253555298, + "logps/chosen": -91.22163391113281, + "logps/rejected": -55.82908630371094, + "loss": 0.2844, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.14227086305618286, + "rewards/margins": 2.2958028316497803, + "rewards/rejected": -2.153531789779663, + "step": 119 + }, + { + "epoch": 0.2, + "learning_rate": 5.042016806722689e-07, + "logits/chosen": -2.05309796333313, + "logits/rejected": -1.3187203407287598, + "logps/chosen": -375.8322448730469, + "logps/rejected": -427.10931396484375, + "loss": 0.2755, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.019181065261363983, + "rewards/margins": 2.3469948768615723, + "rewards/rejected": -2.3278136253356934, + "step": 120 + }, + { + "epoch": 0.2, + "learning_rate": 5.084033613445377e-07, + "logits/chosen": -1.6020848751068115, + "logits/rejected": -1.890777826309204, + "logps/chosen": -389.67840576171875, + "logps/rejected": -156.53250122070312, + "loss": 0.2691, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.7378628253936768, + "rewards/margins": 2.929511308670044, + "rewards/rejected": -3.6673741340637207, + "step": 121 + }, + { + "epoch": 0.21, + "learning_rate": 5.126050420168067e-07, + "logits/chosen": -1.5457667112350464, + "logits/rejected": -0.9591537714004517, + "logps/chosen": -232.89480590820312, + "logps/rejected": -236.02783203125, + "loss": 0.2301, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.7060562372207642, + "rewards/margins": 4.284882545471191, + "rewards/rejected": -4.990938663482666, + "step": 122 + }, + { + "epoch": 0.21, + "learning_rate": 5.168067226890757e-07, + "logits/chosen": -1.6671092510223389, + "logits/rejected": -1.6428896188735962, + "logps/chosen": -63.35133361816406, + "logps/rejected": -109.79218292236328, + "loss": 0.2499, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.2081298828125, + "rewards/margins": 2.169139862060547, + "rewards/rejected": -2.3772695064544678, + "step": 123 + }, + { + "epoch": 0.21, + "learning_rate": 5.210084033613445e-07, + "logits/chosen": -1.5807249546051025, + "logits/rejected": -1.7926443815231323, + "logps/chosen": -74.63159942626953, + "logps/rejected": -134.3237762451172, + "loss": 0.2469, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.030628204345703125, + "rewards/margins": 1.1621196269989014, + "rewards/rejected": -1.1927478313446045, + "step": 124 + }, + { + "epoch": 0.21, + "learning_rate": 5.252100840336135e-07, + "logits/chosen": -1.318616509437561, + "logits/rejected": -1.6264910697937012, + "logps/chosen": -527.2734375, + "logps/rejected": -174.54412841796875, + "loss": 0.2468, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.34924012422561646, + "rewards/margins": 3.5974647998809814, + "rewards/rejected": -3.946704864501953, + "step": 125 + }, + { + "epoch": 0.21, + "learning_rate": 5.294117647058823e-07, + "logits/chosen": -1.1573264598846436, + "logits/rejected": -0.8850076198577881, + "logps/chosen": -82.6595458984375, + "logps/rejected": -150.1305389404297, + "loss": 0.2477, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.006247520446777344, + "rewards/margins": 2.6717464923858643, + "rewards/rejected": -2.6779940128326416, + "step": 126 + }, + { + "epoch": 0.21, + "learning_rate": 5.336134453781512e-07, + "logits/chosen": -1.5155869722366333, + "logits/rejected": -1.3886268138885498, + "logps/chosen": -13.588849067687988, + "logps/rejected": -60.8436279296875, + "loss": 0.2575, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.14050476253032684, + "rewards/margins": 1.282322883605957, + "rewards/rejected": -1.4228277206420898, + "step": 127 + }, + { + "epoch": 0.22, + "learning_rate": 5.378151260504201e-07, + "logits/chosen": -2.142261505126953, + "logits/rejected": -2.022604465484619, + "logps/chosen": -36.83582305908203, + "logps/rejected": -84.54644775390625, + "loss": 0.2476, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.06873762607574463, + "rewards/margins": 1.3124231100082397, + "rewards/rejected": -1.3811607360839844, + "step": 128 + }, + { + "epoch": 0.22, + "learning_rate": 5.42016806722689e-07, + "logits/chosen": -2.0003206729888916, + "logits/rejected": -2.7086338996887207, + "logps/chosen": -297.49810791015625, + "logps/rejected": -121.93182373046875, + "loss": 0.2778, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.6414843797683716, + "rewards/margins": 2.057905673980713, + "rewards/rejected": -1.4164212942123413, + "step": 129 + }, + { + "epoch": 0.22, + "learning_rate": 5.462184873949579e-07, + "logits/chosen": -2.3803634643554688, + "logits/rejected": -1.4992303848266602, + "logps/chosen": -57.217220306396484, + "logps/rejected": -203.0693817138672, + "loss": 0.2675, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.25870370864868164, + "rewards/margins": 2.8911209106445312, + "rewards/rejected": -3.149824619293213, + "step": 130 + }, + { + "epoch": 0.22, + "learning_rate": 5.504201680672269e-07, + "logits/chosen": -1.2806719541549683, + "logits/rejected": -2.390531063079834, + "logps/chosen": -454.8437194824219, + "logps/rejected": -94.0916748046875, + "loss": 0.2616, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.13427734375, + "rewards/margins": 3.2612316608428955, + "rewards/rejected": -3.1269543170928955, + "step": 131 + }, + { + "epoch": 0.22, + "learning_rate": 5.546218487394958e-07, + "logits/chosen": -1.9892646074295044, + "logits/rejected": -1.4233769178390503, + "logps/chosen": -108.95271301269531, + "logps/rejected": -160.64715576171875, + "loss": 0.2424, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.15306320786476135, + "rewards/margins": 3.4583911895751953, + "rewards/rejected": -3.611454486846924, + "step": 132 + }, + { + "epoch": 0.22, + "learning_rate": 5.588235294117647e-07, + "logits/chosen": -1.2769445180892944, + "logits/rejected": -1.524395227432251, + "logps/chosen": -52.44013214111328, + "logps/rejected": -91.19084167480469, + "loss": 0.2597, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.10794153809547424, + "rewards/margins": 0.9474404454231262, + "rewards/rejected": -1.0553820133209229, + "step": 133 + }, + { + "epoch": 0.23, + "learning_rate": 5.630252100840336e-07, + "logits/chosen": -2.26499080657959, + "logits/rejected": -1.1651140451431274, + "logps/chosen": -140.05178833007812, + "logps/rejected": -229.3851318359375, + "loss": 0.2595, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.13012734055519104, + "rewards/margins": 3.768279790878296, + "rewards/rejected": -3.898406982421875, + "step": 134 + }, + { + "epoch": 0.23, + "learning_rate": 5.672268907563025e-07, + "logits/chosen": -1.6903553009033203, + "logits/rejected": -1.663693904876709, + "logps/chosen": -70.8027572631836, + "logps/rejected": -179.33489990234375, + "loss": 0.2193, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.4184946119785309, + "rewards/margins": 0.8598314523696899, + "rewards/rejected": -0.44133681058883667, + "step": 135 + }, + { + "epoch": 0.23, + "learning_rate": 5.714285714285714e-07, + "logits/chosen": -1.894440770149231, + "logits/rejected": -2.020301342010498, + "logps/chosen": -215.9483642578125, + "logps/rejected": -198.383544921875, + "loss": 0.2796, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.6141928434371948, + "rewards/margins": 0.986687183380127, + "rewards/rejected": -1.6008800268173218, + "step": 136 + }, + { + "epoch": 0.23, + "learning_rate": 5.756302521008402e-07, + "logits/chosen": -2.016982316970825, + "logits/rejected": -1.272426962852478, + "logps/chosen": -51.863426208496094, + "logps/rejected": -149.52340698242188, + "loss": 0.252, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.017465591430664062, + "rewards/margins": 2.352057695388794, + "rewards/rejected": -2.369523286819458, + "step": 137 + }, + { + "epoch": 0.23, + "learning_rate": 5.798319327731093e-07, + "logits/chosen": -0.8563526272773743, + "logits/rejected": -0.9021680355072021, + "logps/chosen": -510.8074951171875, + "logps/rejected": -266.77423095703125, + "loss": 0.2882, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.9366821050643921, + "rewards/margins": 5.248723030090332, + "rewards/rejected": -6.1854047775268555, + "step": 138 + }, + { + "epoch": 0.23, + "learning_rate": 5.840336134453782e-07, + "logits/chosen": -1.8311724662780762, + "logits/rejected": -1.8810319900512695, + "logps/chosen": -106.62870788574219, + "logps/rejected": -123.30711364746094, + "loss": 0.2667, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.28059542179107666, + "rewards/margins": 3.491508960723877, + "rewards/rejected": -3.772104263305664, + "step": 139 + }, + { + "epoch": 0.24, + "learning_rate": 5.88235294117647e-07, + "logits/chosen": -2.032766103744507, + "logits/rejected": -1.9613983631134033, + "logps/chosen": -239.09063720703125, + "logps/rejected": -421.301513671875, + "loss": 0.267, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.27995407581329346, + "rewards/margins": 5.965301990509033, + "rewards/rejected": -6.245256423950195, + "step": 140 + }, + { + "epoch": 0.24, + "learning_rate": 5.924369747899159e-07, + "logits/chosen": -0.8632844090461731, + "logits/rejected": -1.520354151725769, + "logps/chosen": -345.11065673828125, + "logps/rejected": -139.1497039794922, + "loss": 0.2711, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.5095192193984985, + "rewards/margins": 2.5539865493774414, + "rewards/rejected": -2.0444672107696533, + "step": 141 + }, + { + "epoch": 0.24, + "learning_rate": 5.966386554621849e-07, + "logits/chosen": -1.3275768756866455, + "logits/rejected": -2.037048816680908, + "logps/chosen": -461.4246826171875, + "logps/rejected": -711.2941284179688, + "loss": 0.2599, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.077467441558838, + "rewards/margins": 0.06723290681838989, + "rewards/rejected": -1.144700288772583, + "step": 142 + }, + { + "epoch": 0.24, + "learning_rate": 6.008403361344537e-07, + "logits/chosen": -2.1924614906311035, + "logits/rejected": -2.12215256690979, + "logps/chosen": -35.62915802001953, + "logps/rejected": -112.1571273803711, + "loss": 0.2408, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.3445836305618286, + "rewards/margins": 3.033930540084839, + "rewards/rejected": -3.378514289855957, + "step": 143 + }, + { + "epoch": 0.24, + "learning_rate": 6.050420168067226e-07, + "logits/chosen": -1.556571125984192, + "logits/rejected": -1.026197910308838, + "logps/chosen": -352.4817810058594, + "logps/rejected": -479.5821838378906, + "loss": 0.2467, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.581763505935669, + "rewards/margins": 3.661806583404541, + "rewards/rejected": -4.243570327758789, + "step": 144 + }, + { + "epoch": 0.24, + "learning_rate": 6.092436974789916e-07, + "logits/chosen": -1.3160314559936523, + "logits/rejected": -1.45841646194458, + "logps/chosen": -36.064300537109375, + "logps/rejected": -15.627889633178711, + "loss": 0.2421, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.07876625657081604, + "rewards/margins": 0.7412266731262207, + "rewards/rejected": -0.8199928998947144, + "step": 145 + }, + { + "epoch": 0.25, + "learning_rate": 6.134453781512605e-07, + "logits/chosen": -1.2557013034820557, + "logits/rejected": -1.1368392705917358, + "logps/chosen": -427.95294189453125, + "logps/rejected": -396.66290283203125, + "loss": 0.2784, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.8088486194610596, + "rewards/margins": 1.1350128650665283, + "rewards/rejected": -2.943861484527588, + "step": 146 + }, + { + "epoch": 0.25, + "learning_rate": 6.176470588235294e-07, + "logits/chosen": -2.3703060150146484, + "logits/rejected": -1.596415638923645, + "logps/chosen": -37.992149353027344, + "logps/rejected": -99.8422622680664, + "loss": 0.2201, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.14648981392383575, + "rewards/margins": 0.39361295104026794, + "rewards/rejected": -0.5401027798652649, + "step": 147 + }, + { + "epoch": 0.25, + "learning_rate": 6.218487394957984e-07, + "logits/chosen": -2.017554759979248, + "logits/rejected": -1.885864019393921, + "logps/chosen": -31.028438568115234, + "logps/rejected": -165.63868713378906, + "loss": 0.2205, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.28271618485450745, + "rewards/margins": 5.170090675354004, + "rewards/rejected": -5.4528069496154785, + "step": 148 + }, + { + "epoch": 0.25, + "learning_rate": 6.260504201680672e-07, + "logits/chosen": -1.8670669794082642, + "logits/rejected": -1.4380172491073608, + "logps/chosen": -174.096435546875, + "logps/rejected": -223.5670166015625, + "loss": 0.2268, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.1435142755508423, + "rewards/margins": 5.261943817138672, + "rewards/rejected": -6.405458450317383, + "step": 149 + }, + { + "epoch": 0.25, + "learning_rate": 6.302521008403361e-07, + "logits/chosen": -2.312112808227539, + "logits/rejected": -1.3749383687973022, + "logps/chosen": -48.91535949707031, + "logps/rejected": -204.19570922851562, + "loss": 0.205, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.1308881789445877, + "rewards/margins": 1.4819360971450806, + "rewards/rejected": -1.6128243207931519, + "step": 150 + }, + { + "epoch": 0.25, + "learning_rate": 6.344537815126049e-07, + "logits/chosen": -2.0252039432525635, + "logits/rejected": -2.22589111328125, + "logps/chosen": -223.1410369873047, + "logps/rejected": -64.30872344970703, + "loss": 0.2547, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.0106048583984375, + "rewards/margins": 0.5256061553955078, + "rewards/rejected": -0.5150012969970703, + "step": 151 + }, + { + "epoch": 0.26, + "learning_rate": 6.386554621848739e-07, + "logits/chosen": -1.5374224185943604, + "logits/rejected": -1.8345128297805786, + "logps/chosen": -300.93792724609375, + "logps/rejected": -145.31222534179688, + "loss": 0.2391, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.8454994559288025, + "rewards/margins": 2.8806023597717285, + "rewards/rejected": -3.726101875305176, + "step": 152 + }, + { + "epoch": 0.26, + "learning_rate": 6.428571428571429e-07, + "logits/chosen": -1.6726059913635254, + "logits/rejected": -1.3613877296447754, + "logps/chosen": -693.0863037109375, + "logps/rejected": -1098.9571533203125, + "loss": 0.2422, + "rewards/accuracies": 0.0, + "rewards/chosen": -0.5227920413017273, + "rewards/margins": -0.413046270608902, + "rewards/rejected": -0.10974578559398651, + "step": 153 + }, + { + "epoch": 0.26, + "learning_rate": 6.470588235294117e-07, + "logits/chosen": -2.3774003982543945, + "logits/rejected": -1.5442438125610352, + "logps/chosen": -41.62322998046875, + "logps/rejected": -237.86154174804688, + "loss": 0.237, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.06168098375201225, + "rewards/margins": 5.083809852600098, + "rewards/rejected": -5.145491123199463, + "step": 154 + }, + { + "epoch": 0.26, + "learning_rate": 6.512605042016807e-07, + "logits/chosen": -2.0312511920928955, + "logits/rejected": -1.431885004043579, + "logps/chosen": -149.83917236328125, + "logps/rejected": -129.43667602539062, + "loss": 0.2291, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.30399322509765625, + "rewards/margins": 2.70412540435791, + "rewards/rejected": -3.0081186294555664, + "step": 155 + }, + { + "epoch": 0.26, + "learning_rate": 6.554621848739496e-07, + "logits/chosen": -0.6298438310623169, + "logits/rejected": -0.9561706781387329, + "logps/chosen": -303.7615051269531, + "logps/rejected": -206.84622192382812, + "loss": 0.215, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.0302143096923828, + "rewards/margins": 2.9986958503723145, + "rewards/rejected": -4.028910160064697, + "step": 156 + }, + { + "epoch": 0.26, + "learning_rate": 6.596638655462184e-07, + "logits/chosen": -1.0504183769226074, + "logits/rejected": -2.596862316131592, + "logps/chosen": -604.466552734375, + "logps/rejected": -95.1014633178711, + "loss": 0.2044, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.6258544921875, + "rewards/margins": 3.6576082706451416, + "rewards/rejected": -3.0317537784576416, + "step": 157 + }, + { + "epoch": 0.27, + "learning_rate": 6.638655462184873e-07, + "logits/chosen": -1.4746941328048706, + "logits/rejected": -2.3198976516723633, + "logps/chosen": -510.818115234375, + "logps/rejected": -121.24671936035156, + "loss": 0.2315, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.42745667695999146, + "rewards/margins": 2.9992709159851074, + "rewards/rejected": -2.5718140602111816, + "step": 158 + }, + { + "epoch": 0.27, + "learning_rate": 6.680672268907563e-07, + "logits/chosen": -1.1884299516677856, + "logits/rejected": -0.7777690291404724, + "logps/chosen": -96.32203674316406, + "logps/rejected": -142.47738647460938, + "loss": 0.2078, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.33403682708740234, + "rewards/margins": 1.1259114742279053, + "rewards/rejected": -1.4599483013153076, + "step": 159 + }, + { + "epoch": 0.27, + "learning_rate": 6.722689075630252e-07, + "logits/chosen": -1.337453007698059, + "logits/rejected": -1.9868876934051514, + "logps/chosen": -307.2751159667969, + "logps/rejected": -128.35006713867188, + "loss": 0.2637, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.154842808842659, + "rewards/margins": 0.5152485370635986, + "rewards/rejected": -0.36040574312210083, + "step": 160 + }, + { + "epoch": 0.27, + "learning_rate": 6.764705882352941e-07, + "logits/chosen": -1.0279209613800049, + "logits/rejected": -2.354536533355713, + "logps/chosen": -310.1678466796875, + "logps/rejected": -82.68232727050781, + "loss": 0.2187, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.6246879696846008, + "rewards/margins": 4.095552921295166, + "rewards/rejected": -3.470865249633789, + "step": 161 + }, + { + "epoch": 0.27, + "learning_rate": 6.80672268907563e-07, + "logits/chosen": -2.222411870956421, + "logits/rejected": -2.0266168117523193, + "logps/chosen": -31.795406341552734, + "logps/rejected": -155.34320068359375, + "loss": 0.1976, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.33064308762550354, + "rewards/margins": 5.187854766845703, + "rewards/rejected": -5.518497943878174, + "step": 162 + }, + { + "epoch": 0.27, + "learning_rate": 6.848739495798319e-07, + "logits/chosen": -1.2088196277618408, + "logits/rejected": -2.6081087589263916, + "logps/chosen": -340.25689697265625, + "logps/rejected": -129.95571899414062, + "loss": 0.2151, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.44170382618904114, + "rewards/margins": 1.4245266914367676, + "rewards/rejected": -0.982822835445404, + "step": 163 + }, + { + "epoch": 0.28, + "learning_rate": 6.890756302521008e-07, + "logits/chosen": -1.2510286569595337, + "logits/rejected": -1.489122748374939, + "logps/chosen": -358.14013671875, + "logps/rejected": -482.1717529296875, + "loss": 0.2434, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.6463592648506165, + "rewards/margins": -0.21972429752349854, + "rewards/rejected": -0.4266350269317627, + "step": 164 + }, + { + "epoch": 0.28, + "learning_rate": 6.932773109243697e-07, + "logits/chosen": -1.6255576610565186, + "logits/rejected": -2.694222927093506, + "logps/chosen": -328.84320068359375, + "logps/rejected": -102.6138687133789, + "loss": 0.2171, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.14793242514133453, + "rewards/margins": 2.839181661605835, + "rewards/rejected": -2.691249132156372, + "step": 165 + }, + { + "epoch": 0.28, + "learning_rate": 6.974789915966386e-07, + "logits/chosen": -1.9384331703186035, + "logits/rejected": -1.580994725227356, + "logps/chosen": -276.12921142578125, + "logps/rejected": -316.6767578125, + "loss": 0.2836, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.9446266889572144, + "rewards/margins": 9.762857437133789, + "rewards/rejected": -10.707484245300293, + "step": 166 + }, + { + "epoch": 0.28, + "learning_rate": 7.016806722689075e-07, + "logits/chosen": -1.6407663822174072, + "logits/rejected": -1.5907646417617798, + "logps/chosen": -514.5374145507812, + "logps/rejected": -433.6203308105469, + "loss": 0.2494, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.24427911639213562, + "rewards/margins": 2.523643970489502, + "rewards/rejected": -2.279364824295044, + "step": 167 + }, + { + "epoch": 0.28, + "learning_rate": 7.058823529411765e-07, + "logits/chosen": -1.674712061882019, + "logits/rejected": -1.6826207637786865, + "logps/chosen": -47.03229522705078, + "logps/rejected": -77.42449951171875, + "loss": 0.2308, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.0016428008675575256, + "rewards/margins": 1.911266565322876, + "rewards/rejected": -1.9096237421035767, + "step": 168 + }, + { + "epoch": 0.28, + "learning_rate": 7.100840336134454e-07, + "logits/chosen": -1.5853824615478516, + "logits/rejected": -1.8151202201843262, + "logps/chosen": -343.2923889160156, + "logps/rejected": -209.4898681640625, + "loss": 0.2194, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.006474226713180542, + "rewards/margins": 7.342705726623535, + "rewards/rejected": -7.349180221557617, + "step": 169 + }, + { + "epoch": 0.29, + "learning_rate": 7.142857142857143e-07, + "logits/chosen": -1.7506383657455444, + "logits/rejected": -1.3358906507492065, + "logps/chosen": -263.41241455078125, + "logps/rejected": -275.55706787109375, + "loss": 0.2147, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.8642104864120483, + "rewards/margins": 5.5388641357421875, + "rewards/rejected": -6.403074741363525, + "step": 170 + }, + { + "epoch": 0.29, + "learning_rate": 7.184873949579831e-07, + "logits/chosen": -1.4059754610061646, + "logits/rejected": -1.2317900657653809, + "logps/chosen": -191.6216278076172, + "logps/rejected": -363.2991027832031, + "loss": 0.2269, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.5392516851425171, + "rewards/margins": 1.2023521661758423, + "rewards/rejected": -1.7416038513183594, + "step": 171 + }, + { + "epoch": 0.29, + "learning_rate": 7.226890756302521e-07, + "logits/chosen": -1.632811188697815, + "logits/rejected": -1.3805952072143555, + "logps/chosen": -133.94142150878906, + "logps/rejected": -269.5859680175781, + "loss": 0.2391, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.4022440016269684, + "rewards/margins": 4.070891857147217, + "rewards/rejected": -4.473135948181152, + "step": 172 + }, + { + "epoch": 0.29, + "learning_rate": 7.268907563025209e-07, + "logits/chosen": -1.6576902866363525, + "logits/rejected": -1.4377576112747192, + "logps/chosen": -43.066829681396484, + "logps/rejected": -67.72735595703125, + "loss": 0.2507, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.6717521548271179, + "rewards/margins": 0.4190084934234619, + "rewards/rejected": -1.0907607078552246, + "step": 173 + }, + { + "epoch": 0.29, + "learning_rate": 7.310924369747898e-07, + "logits/chosen": -1.5274075269699097, + "logits/rejected": -2.076913833618164, + "logps/chosen": -260.2308349609375, + "logps/rejected": -114.42774963378906, + "loss": 0.1975, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.2202541828155518, + "rewards/margins": 3.1372241973876953, + "rewards/rejected": -1.916969895362854, + "step": 174 + }, + { + "epoch": 0.3, + "learning_rate": 7.352941176470589e-07, + "logits/chosen": -1.3671880960464478, + "logits/rejected": -1.7833813428878784, + "logps/chosen": -409.7170715332031, + "logps/rejected": -275.7892150878906, + "loss": 0.2064, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.522778332233429, + "rewards/margins": 5.268821716308594, + "rewards/rejected": -5.791600227355957, + "step": 175 + }, + { + "epoch": 0.3, + "learning_rate": 7.394957983193277e-07, + "logits/chosen": -2.0044264793395996, + "logits/rejected": -2.814589023590088, + "logps/chosen": -286.0587158203125, + "logps/rejected": -112.9316177368164, + "loss": 0.2077, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.33617284893989563, + "rewards/margins": 1.9382390975952148, + "rewards/rejected": -1.6020662784576416, + "step": 176 + }, + { + "epoch": 0.3, + "learning_rate": 7.436974789915966e-07, + "logits/chosen": -2.030181646347046, + "logits/rejected": -1.3561756610870361, + "logps/chosen": -192.53549194335938, + "logps/rejected": -259.68597412109375, + "loss": 0.2328, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.3000236749649048, + "rewards/margins": 1.729806900024414, + "rewards/rejected": -2.0298304557800293, + "step": 177 + }, + { + "epoch": 0.3, + "learning_rate": 7.478991596638656e-07, + "logits/chosen": -1.314343810081482, + "logits/rejected": -0.948784351348877, + "logps/chosen": -419.88946533203125, + "logps/rejected": -249.33352661132812, + "loss": 0.2275, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.001434326171875, + "rewards/margins": 8.745747566223145, + "rewards/rejected": -8.74718189239502, + "step": 178 + }, + { + "epoch": 0.3, + "learning_rate": 7.521008403361344e-07, + "logits/chosen": -1.3746141195297241, + "logits/rejected": -1.2344566583633423, + "logps/chosen": -33.07182693481445, + "logps/rejected": -67.85247802734375, + "loss": 0.2025, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.7276212573051453, + "rewards/margins": 2.376377820968628, + "rewards/rejected": -3.103999137878418, + "step": 179 + }, + { + "epoch": 0.3, + "learning_rate": 7.563025210084033e-07, + "logits/chosen": -1.0103384256362915, + "logits/rejected": -2.038599967956543, + "logps/chosen": -425.00360107421875, + "logps/rejected": -136.74205017089844, + "loss": 0.2085, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.21104279160499573, + "rewards/margins": 5.775498390197754, + "rewards/rejected": -5.564455986022949, + "step": 180 + }, + { + "epoch": 0.31, + "learning_rate": 7.605042016806722e-07, + "logits/chosen": -1.7550321817398071, + "logits/rejected": -2.278670310974121, + "logps/chosen": -90.49088287353516, + "logps/rejected": -64.57603454589844, + "loss": 0.2221, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.6065499782562256, + "rewards/margins": 2.4836509227752686, + "rewards/rejected": -3.090200901031494, + "step": 181 + }, + { + "epoch": 0.31, + "learning_rate": 7.647058823529411e-07, + "logits/chosen": -1.554534912109375, + "logits/rejected": -2.033510684967041, + "logps/chosen": -275.0732116699219, + "logps/rejected": -270.5262145996094, + "loss": 0.2019, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.631757378578186, + "rewards/margins": 5.426800727844238, + "rewards/rejected": -4.795043468475342, + "step": 182 + }, + { + "epoch": 0.31, + "learning_rate": 7.689075630252101e-07, + "logits/chosen": -1.0876420736312866, + "logits/rejected": -1.9512230157852173, + "logps/chosen": -462.33001708984375, + "logps/rejected": -397.541259765625, + "loss": 0.2168, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.3109893798828125, + "rewards/margins": 2.645329475402832, + "rewards/rejected": -2.3343400955200195, + "step": 183 + }, + { + "epoch": 0.31, + "learning_rate": 7.73109243697479e-07, + "logits/chosen": -2.234156847000122, + "logits/rejected": -2.153066873550415, + "logps/chosen": -59.26795196533203, + "logps/rejected": -123.72718811035156, + "loss": 0.2057, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.2919696867465973, + "rewards/margins": 0.5463926196098328, + "rewards/rejected": -0.8383622765541077, + "step": 184 + }, + { + "epoch": 0.31, + "learning_rate": 7.773109243697479e-07, + "logits/chosen": -1.892674446105957, + "logits/rejected": -0.7566059827804565, + "logps/chosen": -128.3382110595703, + "logps/rejected": -352.1267395019531, + "loss": 0.2409, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.1412032842636108, + "rewards/margins": 3.8913497924804688, + "rewards/rejected": -5.032553195953369, + "step": 185 + }, + { + "epoch": 0.31, + "learning_rate": 7.815126050420168e-07, + "logits/chosen": -1.3167263269424438, + "logits/rejected": -1.3082215785980225, + "logps/chosen": -188.31219482421875, + "logps/rejected": -194.53448486328125, + "loss": 0.2037, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.14720916748046875, + "rewards/margins": 4.053531169891357, + "rewards/rejected": -4.200739860534668, + "step": 186 + }, + { + "epoch": 0.32, + "learning_rate": 7.857142857142856e-07, + "logits/chosen": -1.758721113204956, + "logits/rejected": -1.5897212028503418, + "logps/chosen": -490.67822265625, + "logps/rejected": -506.51971435546875, + "loss": 0.1959, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.8040527701377869, + "rewards/margins": 1.7363312244415283, + "rewards/rejected": -2.540383815765381, + "step": 187 + }, + { + "epoch": 0.32, + "learning_rate": 7.899159663865545e-07, + "logits/chosen": -1.2402632236480713, + "logits/rejected": -2.1518731117248535, + "logps/chosen": -440.912353515625, + "logps/rejected": -190.54396057128906, + "loss": 0.201, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.4817703366279602, + "rewards/margins": 2.1288094520568848, + "rewards/rejected": -1.6470390558242798, + "step": 188 + }, + { + "epoch": 0.32, + "learning_rate": 7.941176470588235e-07, + "logits/chosen": -1.8251270055770874, + "logits/rejected": -1.9880740642547607, + "logps/chosen": -525.2092895507812, + "logps/rejected": -373.4764404296875, + "loss": 0.2224, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.25654488801956177, + "rewards/margins": 3.9607901573181152, + "rewards/rejected": -3.704245090484619, + "step": 189 + }, + { + "epoch": 0.32, + "learning_rate": 7.983193277310924e-07, + "logits/chosen": -0.7393491864204407, + "logits/rejected": -1.706693172454834, + "logps/chosen": -540.9542236328125, + "logps/rejected": -206.2239990234375, + "loss": 0.1797, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.6728149652481079, + "rewards/margins": 7.774383544921875, + "rewards/rejected": -7.101568222045898, + "step": 190 + }, + { + "epoch": 0.32, + "learning_rate": 8.025210084033613e-07, + "logits/chosen": -1.3957598209381104, + "logits/rejected": -1.6105530261993408, + "logps/chosen": -451.7230224609375, + "logps/rejected": -323.118408203125, + "loss": 0.1768, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.5888229608535767, + "rewards/margins": 6.918619155883789, + "rewards/rejected": -7.507441997528076, + "step": 191 + }, + { + "epoch": 0.32, + "learning_rate": 8.067226890756303e-07, + "logits/chosen": -2.012829065322876, + "logits/rejected": -1.7003165483474731, + "logps/chosen": -231.9615936279297, + "logps/rejected": -416.9687805175781, + "loss": 0.2369, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.45652008056640625, + "rewards/margins": 4.130596160888672, + "rewards/rejected": -4.587116241455078, + "step": 192 + }, + { + "epoch": 0.33, + "learning_rate": 8.109243697478991e-07, + "logits/chosen": -1.829034447669983, + "logits/rejected": -1.623055338859558, + "logps/chosen": -405.52508544921875, + "logps/rejected": -334.392578125, + "loss": 0.2274, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.20874272286891937, + "rewards/margins": 1.027557611465454, + "rewards/rejected": -0.8188148736953735, + "step": 193 + }, + { + "epoch": 0.33, + "learning_rate": 8.15126050420168e-07, + "logits/chosen": -0.5686680674552917, + "logits/rejected": -0.6403495669364929, + "logps/chosen": -654.5108642578125, + "logps/rejected": -387.836669921875, + "loss": 0.2104, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.738049328327179, + "rewards/margins": 5.746264457702637, + "rewards/rejected": -6.48431396484375, + "step": 194 + }, + { + "epoch": 0.33, + "learning_rate": 8.19327731092437e-07, + "logits/chosen": -1.5858798027038574, + "logits/rejected": -1.6420996189117432, + "logps/chosen": -116.9679946899414, + "logps/rejected": -202.39669799804688, + "loss": 0.1847, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.6328601837158203, + "rewards/margins": 3.9840362071990967, + "rewards/rejected": -4.616896629333496, + "step": 195 + }, + { + "epoch": 0.33, + "learning_rate": 8.235294117647058e-07, + "logits/chosen": -1.6367160081863403, + "logits/rejected": -1.6892379522323608, + "logps/chosen": -175.56951904296875, + "logps/rejected": -154.72865295410156, + "loss": 0.1826, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.052147675305604935, + "rewards/margins": 6.1080498695373535, + "rewards/rejected": -6.1601972579956055, + "step": 196 + }, + { + "epoch": 0.33, + "learning_rate": 8.277310924369747e-07, + "logits/chosen": -1.7774677276611328, + "logits/rejected": -2.791229248046875, + "logps/chosen": -339.4785461425781, + "logps/rejected": -127.16014099121094, + "loss": 0.2728, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.5950691103935242, + "rewards/margins": 5.04873514175415, + "rewards/rejected": -5.64380407333374, + "step": 197 + }, + { + "epoch": 0.33, + "learning_rate": 8.319327731092437e-07, + "logits/chosen": -1.5082173347473145, + "logits/rejected": -1.6745432615280151, + "logps/chosen": -78.62190246582031, + "logps/rejected": -35.42887496948242, + "loss": 0.2192, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.4798523187637329, + "rewards/margins": 1.3703386783599854, + "rewards/rejected": -1.8501909971237183, + "step": 198 + }, + { + "epoch": 0.34, + "learning_rate": 8.361344537815126e-07, + "logits/chosen": -1.651850938796997, + "logits/rejected": -2.2229156494140625, + "logps/chosen": -417.2205810546875, + "logps/rejected": -179.9000244140625, + "loss": 0.1873, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.32609206438064575, + "rewards/margins": 2.4329936504364014, + "rewards/rejected": -2.7590856552124023, + "step": 199 + }, + { + "epoch": 0.34, + "learning_rate": 8.403361344537815e-07, + "logits/chosen": -1.3835643529891968, + "logits/rejected": -1.4247071743011475, + "logps/chosen": -68.0342025756836, + "logps/rejected": -78.13129425048828, + "loss": 0.2554, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.7295857667922974, + "rewards/margins": 2.134474277496338, + "rewards/rejected": -2.8640599250793457, + "step": 200 + }, + { + "epoch": 0.34, + "learning_rate": 8.445378151260503e-07, + "logits/chosen": -1.367997169494629, + "logits/rejected": -1.0557571649551392, + "logps/chosen": -353.5372314453125, + "logps/rejected": -270.9771728515625, + "loss": 0.1961, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.3901122808456421, + "rewards/margins": 8.15034294128418, + "rewards/rejected": -8.540454864501953, + "step": 201 + }, + { + "epoch": 0.34, + "learning_rate": 8.487394957983193e-07, + "logits/chosen": -1.7191728353500366, + "logits/rejected": -1.4392447471618652, + "logps/chosen": -22.586790084838867, + "logps/rejected": -131.69461059570312, + "loss": 0.1911, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.18322506546974182, + "rewards/margins": 4.684540748596191, + "rewards/rejected": -4.867766380310059, + "step": 202 + }, + { + "epoch": 0.34, + "learning_rate": 8.529411764705882e-07, + "logits/chosen": -1.5832895040512085, + "logits/rejected": -1.645804524421692, + "logps/chosen": -67.92455291748047, + "logps/rejected": -73.83755493164062, + "loss": 0.2242, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.22629156708717346, + "rewards/margins": 1.8930432796478271, + "rewards/rejected": -2.1193346977233887, + "step": 203 + }, + { + "epoch": 0.34, + "learning_rate": 8.57142857142857e-07, + "logits/chosen": -1.3173104524612427, + "logits/rejected": -1.7601145505905151, + "logps/chosen": -541.1436767578125, + "logps/rejected": -157.87823486328125, + "loss": 0.2101, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.8486511707305908, + "rewards/margins": 1.4881985187530518, + "rewards/rejected": -3.3368496894836426, + "step": 204 + }, + { + "epoch": 0.35, + "learning_rate": 8.613445378151261e-07, + "logits/chosen": -0.26975634694099426, + "logits/rejected": -0.21657763421535492, + "logps/chosen": -448.43878173828125, + "logps/rejected": -328.8587646484375, + "loss": 0.1599, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.2071120738983154, + "rewards/margins": 9.672096252441406, + "rewards/rejected": -11.879209518432617, + "step": 205 + }, + { + "epoch": 0.35, + "learning_rate": 8.65546218487395e-07, + "logits/chosen": -1.3258733749389648, + "logits/rejected": -1.7010501623153687, + "logps/chosen": -280.8769836425781, + "logps/rejected": -82.33454132080078, + "loss": 0.2117, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.4332069754600525, + "rewards/margins": 1.0438206195831299, + "rewards/rejected": -1.4770275354385376, + "step": 206 + }, + { + "epoch": 0.35, + "learning_rate": 8.697478991596638e-07, + "logits/chosen": -1.833431601524353, + "logits/rejected": -1.4919263124465942, + "logps/chosen": -550.7933959960938, + "logps/rejected": -366.83819580078125, + "loss": 0.2718, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.9451126456260681, + "rewards/margins": 9.382735252380371, + "rewards/rejected": -10.327848434448242, + "step": 207 + }, + { + "epoch": 0.35, + "learning_rate": 8.739495798319328e-07, + "logits/chosen": -1.6695424318313599, + "logits/rejected": -2.533496618270874, + "logps/chosen": -193.40089416503906, + "logps/rejected": -144.17640686035156, + "loss": 0.1894, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.4194018244743347, + "rewards/margins": 3.9117088317871094, + "rewards/rejected": -4.33111047744751, + "step": 208 + }, + { + "epoch": 0.35, + "learning_rate": 8.781512605042016e-07, + "logits/chosen": -1.0884276628494263, + "logits/rejected": -1.7498663663864136, + "logps/chosen": -236.72494506835938, + "logps/rejected": -124.72262573242188, + "loss": 0.1856, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.06562767922878265, + "rewards/margins": 4.05698823928833, + "rewards/rejected": -4.122615814208984, + "step": 209 + }, + { + "epoch": 0.35, + "learning_rate": 8.823529411764705e-07, + "logits/chosen": -1.2553131580352783, + "logits/rejected": -1.654006838798523, + "logps/chosen": -549.10400390625, + "logps/rejected": -261.51226806640625, + "loss": 0.1827, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.3053421378135681, + "rewards/margins": 7.806386947631836, + "rewards/rejected": -8.11172866821289, + "step": 210 + }, + { + "epoch": 0.36, + "learning_rate": 8.865546218487394e-07, + "logits/chosen": -1.1624391078948975, + "logits/rejected": -1.2052842378616333, + "logps/chosen": -47.10140609741211, + "logps/rejected": -77.38719177246094, + "loss": 0.1972, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.18456250429153442, + "rewards/margins": 4.566642761230469, + "rewards/rejected": -4.7512054443359375, + "step": 211 + }, + { + "epoch": 0.36, + "learning_rate": 8.907563025210084e-07, + "logits/chosen": -2.755701780319214, + "logits/rejected": -1.7558363676071167, + "logps/chosen": -249.60147094726562, + "logps/rejected": -118.43754577636719, + "loss": 0.1763, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.39346620440483093, + "rewards/margins": 4.152596473693848, + "rewards/rejected": -4.546062469482422, + "step": 212 + }, + { + "epoch": 0.36, + "learning_rate": 8.949579831932773e-07, + "logits/chosen": -1.4844564199447632, + "logits/rejected": -2.2958929538726807, + "logps/chosen": -372.4450988769531, + "logps/rejected": -301.98016357421875, + "loss": 0.215, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.4042709469795227, + "rewards/margins": 3.7409682273864746, + "rewards/rejected": -4.145238876342773, + "step": 213 + }, + { + "epoch": 0.36, + "learning_rate": 8.991596638655462e-07, + "logits/chosen": -1.6278800964355469, + "logits/rejected": -2.122774600982666, + "logps/chosen": -185.77044677734375, + "logps/rejected": -173.4629364013672, + "loss": 0.2481, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.15187864005565643, + "rewards/margins": 5.104752540588379, + "rewards/rejected": -4.952874183654785, + "step": 214 + }, + { + "epoch": 0.36, + "learning_rate": 9.033613445378151e-07, + "logits/chosen": -1.2420963048934937, + "logits/rejected": -1.2279390096664429, + "logps/chosen": -56.64626693725586, + "logps/rejected": -95.80750274658203, + "loss": 0.2378, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.10336628556251526, + "rewards/margins": 4.724967956542969, + "rewards/rejected": -4.6216020584106445, + "step": 215 + }, + { + "epoch": 0.36, + "learning_rate": 9.07563025210084e-07, + "logits/chosen": -0.6830325126647949, + "logits/rejected": -0.7307128310203552, + "logps/chosen": -509.64990234375, + "logps/rejected": -261.376220703125, + "loss": 0.2494, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.0829544067382812, + "rewards/margins": 7.50957727432251, + "rewards/rejected": -8.592531204223633, + "step": 216 + }, + { + "epoch": 0.37, + "learning_rate": 9.117647058823529e-07, + "logits/chosen": -1.6173702478408813, + "logits/rejected": -1.7928811311721802, + "logps/chosen": -288.9893798828125, + "logps/rejected": -148.32977294921875, + "loss": 0.2141, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.27213987708091736, + "rewards/margins": 6.079885482788086, + "rewards/rejected": -6.352025032043457, + "step": 217 + }, + { + "epoch": 0.37, + "learning_rate": 9.159663865546218e-07, + "logits/chosen": -1.3861663341522217, + "logits/rejected": -1.4028596878051758, + "logps/chosen": -72.13826751708984, + "logps/rejected": -156.20257568359375, + "loss": 0.2067, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.7177965044975281, + "rewards/margins": 5.9151411056518555, + "rewards/rejected": -6.632937908172607, + "step": 218 + }, + { + "epoch": 0.37, + "learning_rate": 9.201680672268907e-07, + "logits/chosen": -1.943166971206665, + "logits/rejected": -1.703479290008545, + "logps/chosen": -254.5187530517578, + "logps/rejected": -391.06610107421875, + "loss": 0.198, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.8562338948249817, + "rewards/margins": 6.539711952209473, + "rewards/rejected": -7.395946025848389, + "step": 219 + }, + { + "epoch": 0.37, + "learning_rate": 9.243697478991597e-07, + "logits/chosen": -1.8535621166229248, + "logits/rejected": -2.3986668586730957, + "logps/chosen": -221.98687744140625, + "logps/rejected": -164.90716552734375, + "loss": 0.2149, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.19046229124069214, + "rewards/margins": 3.5672414302825928, + "rewards/rejected": -3.7577037811279297, + "step": 220 + }, + { + "epoch": 0.37, + "learning_rate": 9.285714285714285e-07, + "logits/chosen": -1.5400798320770264, + "logits/rejected": -0.977436900138855, + "logps/chosen": -38.834285736083984, + "logps/rejected": -214.7268524169922, + "loss": 0.2612, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.6614927649497986, + "rewards/margins": 6.089022636413574, + "rewards/rejected": -6.750515460968018, + "step": 221 + }, + { + "epoch": 0.37, + "learning_rate": 9.327731092436975e-07, + "logits/chosen": -1.2816520929336548, + "logits/rejected": -1.7735843658447266, + "logps/chosen": -653.9671630859375, + "logps/rejected": -604.3455810546875, + "loss": 0.2203, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.04913024976849556, + "rewards/margins": 0.8042449355125427, + "rewards/rejected": -0.8533751964569092, + "step": 222 + }, + { + "epoch": 0.38, + "learning_rate": 9.369747899159663e-07, + "logits/chosen": -0.8977038264274597, + "logits/rejected": -1.0373615026474, + "logps/chosen": -559.4537963867188, + "logps/rejected": -400.3348388671875, + "loss": 0.2109, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.4869690239429474, + "rewards/margins": 4.854780673980713, + "rewards/rejected": -5.341749668121338, + "step": 223 + }, + { + "epoch": 0.38, + "learning_rate": 9.411764705882352e-07, + "logits/chosen": -1.5731453895568848, + "logits/rejected": -1.3540092706680298, + "logps/chosen": -362.89971923828125, + "logps/rejected": -213.01864624023438, + "loss": 0.2251, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.5867255330085754, + "rewards/margins": 7.263546466827393, + "rewards/rejected": -7.850271701812744, + "step": 224 + }, + { + "epoch": 0.38, + "learning_rate": 9.453781512605042e-07, + "logits/chosen": -0.8027037978172302, + "logits/rejected": -1.3039056062698364, + "logps/chosen": -191.79257202148438, + "logps/rejected": -130.3656768798828, + "loss": 0.2174, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.5622925162315369, + "rewards/margins": 2.7290549278259277, + "rewards/rejected": -2.166762590408325, + "step": 225 + }, + { + "epoch": 0.38, + "learning_rate": 9.49579831932773e-07, + "logits/chosen": -1.7100781202316284, + "logits/rejected": -1.5751913785934448, + "logps/chosen": -51.673133850097656, + "logps/rejected": -87.66356658935547, + "loss": 0.1912, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.03124428540468216, + "rewards/margins": 3.016190528869629, + "rewards/rejected": -3.0474348068237305, + "step": 226 + }, + { + "epoch": 0.38, + "learning_rate": 9.53781512605042e-07, + "logits/chosen": -1.7360178232192993, + "logits/rejected": -1.5387942790985107, + "logps/chosen": -623.2257690429688, + "logps/rejected": -446.02789306640625, + "loss": 0.2341, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.8850250244140625, + "rewards/margins": 0.6927383542060852, + "rewards/rejected": -1.577763319015503, + "step": 227 + }, + { + "epoch": 0.38, + "learning_rate": 9.579831932773109e-07, + "logits/chosen": -1.3128571510314941, + "logits/rejected": -1.2455374002456665, + "logps/chosen": -491.56341552734375, + "logps/rejected": -306.54974365234375, + "loss": 0.2017, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.8349395990371704, + "rewards/margins": 7.316784381866455, + "rewards/rejected": -8.151723861694336, + "step": 228 + }, + { + "epoch": 0.39, + "learning_rate": 9.621848739495798e-07, + "logits/chosen": -0.9198746681213379, + "logits/rejected": -1.0064659118652344, + "logps/chosen": -170.07423400878906, + "logps/rejected": -151.16380310058594, + "loss": 0.2215, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.42428553104400635, + "rewards/margins": 4.414795398712158, + "rewards/rejected": -4.839080810546875, + "step": 229 + }, + { + "epoch": 0.39, + "learning_rate": 9.663865546218487e-07, + "logits/chosen": -0.19439470767974854, + "logits/rejected": -0.1546761840581894, + "logps/chosen": -443.8742980957031, + "logps/rejected": -319.8472595214844, + "loss": 0.2317, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.7987457513809204, + "rewards/margins": 9.688015937805176, + "rewards/rejected": -11.486761093139648, + "step": 230 + }, + { + "epoch": 0.39, + "learning_rate": 9.705882352941176e-07, + "logits/chosen": -0.9490076303482056, + "logits/rejected": -0.48910650610923767, + "logps/chosen": -513.2789306640625, + "logps/rejected": -340.21929931640625, + "loss": 0.2053, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.3714752197265625, + "rewards/margins": 9.171676635742188, + "rewards/rejected": -10.54315185546875, + "step": 231 + }, + { + "epoch": 0.39, + "learning_rate": 9.747899159663866e-07, + "logits/chosen": -1.1433497667312622, + "logits/rejected": -1.3219791650772095, + "logps/chosen": -505.0430908203125, + "logps/rejected": -255.84214782714844, + "loss": 0.2021, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.42404481768608093, + "rewards/margins": 8.78639030456543, + "rewards/rejected": -9.21043586730957, + "step": 232 + }, + { + "epoch": 0.39, + "learning_rate": 9.789915966386553e-07, + "logits/chosen": -1.0191929340362549, + "logits/rejected": -1.4857556819915771, + "logps/chosen": -178.77206420898438, + "logps/rejected": -129.4377899169922, + "loss": 0.2355, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.5842880606651306, + "rewards/margins": 3.4605538845062256, + "rewards/rejected": -4.044841766357422, + "step": 233 + }, + { + "epoch": 0.39, + "learning_rate": 9.831932773109242e-07, + "logits/chosen": -1.518091082572937, + "logits/rejected": -1.1380579471588135, + "logps/chosen": -144.8524627685547, + "logps/rejected": -124.58405303955078, + "loss": 0.1858, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.12543268501758575, + "rewards/margins": 5.242918014526367, + "rewards/rejected": -5.368350982666016, + "step": 234 + }, + { + "epoch": 0.4, + "learning_rate": 9.873949579831934e-07, + "logits/chosen": -1.8797619342803955, + "logits/rejected": -1.4644118547439575, + "logps/chosen": -70.80252838134766, + "logps/rejected": -90.79109191894531, + "loss": 0.2319, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.3334128856658936, + "rewards/margins": 1.8264957666397095, + "rewards/rejected": -3.1599087715148926, + "step": 235 + }, + { + "epoch": 0.4, + "learning_rate": 9.91596638655462e-07, + "logits/chosen": -0.9166591763496399, + "logits/rejected": -0.93157559633255, + "logps/chosen": -12.770793914794922, + "logps/rejected": -55.15324783325195, + "loss": 0.1993, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.37832537293434143, + "rewards/margins": 3.1305532455444336, + "rewards/rejected": -3.508878707885742, + "step": 236 + }, + { + "epoch": 0.4, + "learning_rate": 9.95798319327731e-07, + "logits/chosen": -1.6806684732437134, + "logits/rejected": -2.278653144836426, + "logps/chosen": -293.66644287109375, + "logps/rejected": -160.80221557617188, + "loss": 0.1972, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.09649372100830078, + "rewards/margins": 8.160371780395508, + "rewards/rejected": -8.063878059387207, + "step": 237 + }, + { + "epoch": 0.4, + "learning_rate": 1e-06, + "logits/chosen": -0.28713756799697876, + "logits/rejected": -0.2611769437789917, + "logps/chosen": -21.57666778564453, + "logps/rejected": -82.406982421875, + "loss": 0.2085, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.08337266743183136, + "rewards/margins": 4.99355936050415, + "rewards/rejected": -5.076931953430176, + "step": 238 + }, + { + "epoch": 0.4, + "learning_rate": 9.99999458185223e-07, + "logits/chosen": -2.185724973678589, + "logits/rejected": -2.134838104248047, + "logps/chosen": -37.73735427856445, + "logps/rejected": -141.9013671875, + "loss": 0.1863, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.4596128463745117, + "rewards/margins": 5.266613960266113, + "rewards/rejected": -6.726226806640625, + "step": 239 + }, + { + "epoch": 0.4, + "learning_rate": 9.999978327420662e-07, + "logits/chosen": -1.203190565109253, + "logits/rejected": -2.1939120292663574, + "logps/chosen": -514.1233520507812, + "logps/rejected": -75.5627670288086, + "loss": 0.2014, + "rewards/accuracies": 0.5, + "rewards/chosen": 2.2211365699768066, + "rewards/margins": 3.9060745239257812, + "rewards/rejected": -1.684937834739685, + "step": 240 + }, + { + "epoch": 0.41, + "learning_rate": 9.999951236740525e-07, + "logits/chosen": -1.3303395509719849, + "logits/rejected": -1.934448480606079, + "logps/chosen": -135.79515075683594, + "logps/rejected": -107.17416381835938, + "loss": 0.1887, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.06375274807214737, + "rewards/margins": 2.7002902030944824, + "rewards/rejected": -2.636537551879883, + "step": 241 + }, + { + "epoch": 0.41, + "learning_rate": 9.999913309870528e-07, + "logits/chosen": -1.44986891746521, + "logits/rejected": -1.9651787281036377, + "logps/chosen": -200.12448120117188, + "logps/rejected": -163.63800048828125, + "loss": 0.1916, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.016937255859375, + "rewards/margins": 2.1956772804260254, + "rewards/rejected": -2.1787400245666504, + "step": 242 + }, + { + "epoch": 0.41, + "learning_rate": 9.999864546892874e-07, + "logits/chosen": -1.2548986673355103, + "logits/rejected": -1.1517094373703003, + "logps/chosen": -21.825084686279297, + "logps/rejected": -100.38935852050781, + "loss": 0.2183, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.235842227935791, + "rewards/margins": 4.932180404663086, + "rewards/rejected": -6.168022155761719, + "step": 243 + }, + { + "epoch": 0.41, + "learning_rate": 9.99980494791324e-07, + "logits/chosen": -1.3461451530456543, + "logits/rejected": -2.2621426582336426, + "logps/chosen": -529.697998046875, + "logps/rejected": -53.855751037597656, + "loss": 0.2415, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.11737975478172302, + "rewards/margins": 2.4394328594207764, + "rewards/rejected": -2.5568125247955322, + "step": 244 + }, + { + "epoch": 0.41, + "learning_rate": 9.999734513060793e-07, + "logits/chosen": -1.6569421291351318, + "logits/rejected": -1.4833012819290161, + "logps/chosen": -21.329483032226562, + "logps/rejected": -213.5611572265625, + "loss": 0.1858, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.2380591332912445, + "rewards/margins": 10.061112403869629, + "rewards/rejected": -10.299171447753906, + "step": 245 + }, + { + "epoch": 0.41, + "learning_rate": 9.999653242488186e-07, + "logits/chosen": -1.3467878103256226, + "logits/rejected": -1.1173731088638306, + "logps/chosen": -388.6378173828125, + "logps/rejected": -294.2225341796875, + "loss": 0.1943, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.8519317507743835, + "rewards/margins": 9.176179885864258, + "rewards/rejected": -10.028111457824707, + "step": 246 + }, + { + "epoch": 0.42, + "learning_rate": 9.999561136371554e-07, + "logits/chosen": -1.8872058391571045, + "logits/rejected": -1.6369519233703613, + "logps/chosen": -214.09727478027344, + "logps/rejected": -376.61334228515625, + "loss": 0.1797, + "rewards/accuracies": 0.0, + "rewards/chosen": -0.4823310971260071, + "rewards/margins": -0.6333345770835876, + "rewards/rejected": 0.15100345015525818, + "step": 247 + }, + { + "epoch": 0.42, + "learning_rate": 9.99945819491051e-07, + "logits/chosen": -1.6549506187438965, + "logits/rejected": -1.558489441871643, + "logps/chosen": -263.10980224609375, + "logps/rejected": -266.67388916015625, + "loss": 0.1845, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.2903716564178467, + "rewards/margins": 5.641432285308838, + "rewards/rejected": -6.931804180145264, + "step": 248 + }, + { + "epoch": 0.42, + "learning_rate": 9.99934441832816e-07, + "logits/chosen": -1.7296139001846313, + "logits/rejected": -1.958460807800293, + "logps/chosen": -219.89266967773438, + "logps/rejected": -296.84906005859375, + "loss": 0.1885, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.706303060054779, + "rewards/margins": 5.47901725769043, + "rewards/rejected": -6.1853203773498535, + "step": 249 + }, + { + "epoch": 0.42, + "learning_rate": 9.999219806871085e-07, + "logits/chosen": -1.1240818500518799, + "logits/rejected": -1.4093561172485352, + "logps/chosen": -481.0801086425781, + "logps/rejected": -516.7035522460938, + "loss": 0.2172, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.33460843563079834, + "rewards/margins": 6.402099609375, + "rewards/rejected": -6.067491054534912, + "step": 250 + }, + { + "epoch": 0.42, + "learning_rate": 9.99908436080935e-07, + "logits/chosen": -0.8841766119003296, + "logits/rejected": -1.7438700199127197, + "logps/chosen": -250.23077392578125, + "logps/rejected": -161.5799560546875, + "loss": 0.2042, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.4004615843296051, + "rewards/margins": 6.907259941101074, + "rewards/rejected": -6.50679874420166, + "step": 251 + }, + { + "epoch": 0.42, + "learning_rate": 9.998938080436503e-07, + "logits/chosen": -1.61567223072052, + "logits/rejected": -2.236949920654297, + "logps/chosen": -63.57852554321289, + "logps/rejected": -129.1857452392578, + "loss": 0.2339, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.2066521644592285, + "rewards/margins": 4.170039176940918, + "rewards/rejected": -5.3766913414001465, + "step": 252 + }, + { + "epoch": 0.43, + "learning_rate": 9.998780966069568e-07, + "logits/chosen": -1.5731325149536133, + "logits/rejected": -1.5780061483383179, + "logps/chosen": -24.21834373474121, + "logps/rejected": -83.41648864746094, + "loss": 0.1516, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.8808462023735046, + "rewards/margins": 3.9870688915252686, + "rewards/rejected": -4.867915153503418, + "step": 253 + }, + { + "epoch": 0.43, + "learning_rate": 9.998613018049058e-07, + "logits/chosen": -1.2458865642547607, + "logits/rejected": -1.5161921977996826, + "logps/chosen": -137.61996459960938, + "logps/rejected": -198.40377807617188, + "loss": 0.1805, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.1367048025131226, + "rewards/margins": 5.306751251220703, + "rewards/rejected": -6.443456172943115, + "step": 254 + }, + { + "epoch": 0.43, + "learning_rate": 9.998434236738956e-07, + "logits/chosen": -1.3364038467407227, + "logits/rejected": -1.633857011795044, + "logps/chosen": -427.1192626953125, + "logps/rejected": -423.90826416015625, + "loss": 0.1867, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.897076427936554, + "rewards/margins": 5.636436462402344, + "rewards/rejected": -6.533513069152832, + "step": 255 + }, + { + "epoch": 0.43, + "learning_rate": 9.998244622526728e-07, + "logits/chosen": -1.3373851776123047, + "logits/rejected": -1.0651671886444092, + "logps/chosen": -123.55149841308594, + "logps/rejected": -188.85494995117188, + "loss": 0.1508, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.2550186216831207, + "rewards/margins": 4.371315002441406, + "rewards/rejected": -4.626333713531494, + "step": 256 + }, + { + "epoch": 0.43, + "learning_rate": 9.99804417582332e-07, + "logits/chosen": -1.8455286026000977, + "logits/rejected": -1.6597044467926025, + "logps/chosen": -37.02387619018555, + "logps/rejected": -157.0921630859375, + "loss": 0.1598, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.6415446400642395, + "rewards/margins": 7.187655448913574, + "rewards/rejected": -7.829200267791748, + "step": 257 + }, + { + "epoch": 0.44, + "learning_rate": 9.997832897063147e-07, + "logits/chosen": -1.3792277574539185, + "logits/rejected": -1.3170826435089111, + "logps/chosen": -268.0335388183594, + "logps/rejected": -193.27032470703125, + "loss": 0.2239, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.12644805014133453, + "rewards/margins": 7.42259407043457, + "rewards/rejected": -7.549041748046875, + "step": 258 + }, + { + "epoch": 0.44, + "learning_rate": 9.99761078670411e-07, + "logits/chosen": -2.141418218612671, + "logits/rejected": -2.0980770587921143, + "logps/chosen": -36.05842590332031, + "logps/rejected": -130.27944946289062, + "loss": 0.1816, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.2582319378852844, + "rewards/margins": 4.223054885864258, + "rewards/rejected": -4.481287002563477, + "step": 259 + }, + { + "epoch": 0.44, + "learning_rate": 9.997377845227574e-07, + "logits/chosen": -1.0465216636657715, + "logits/rejected": -0.9531089663505554, + "logps/chosen": -123.76052856445312, + "logps/rejected": -127.10284423828125, + "loss": 0.1821, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.5870941281318665, + "rewards/margins": 1.3501648902893066, + "rewards/rejected": -1.9372591972351074, + "step": 260 + }, + { + "epoch": 0.44, + "learning_rate": 9.997134073138388e-07, + "logits/chosen": -2.0905027389526367, + "logits/rejected": -2.080312490463257, + "logps/chosen": -161.46063232421875, + "logps/rejected": -82.06962585449219, + "loss": 0.1953, + "rewards/accuracies": 0.0, + "rewards/chosen": -2.4451663494110107, + "rewards/margins": -1.771939754486084, + "rewards/rejected": -0.6732265949249268, + "step": 261 + }, + { + "epoch": 0.44, + "learning_rate": 9.996879470964867e-07, + "logits/chosen": -1.8112472295761108, + "logits/rejected": -2.4045462608337402, + "logps/chosen": -167.5275421142578, + "logps/rejected": -174.94483947753906, + "loss": 0.2188, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.22896921634674072, + "rewards/margins": 5.413414478302002, + "rewards/rejected": -5.642383575439453, + "step": 262 + }, + { + "epoch": 0.44, + "learning_rate": 9.996614039258803e-07, + "logits/chosen": -0.665377676486969, + "logits/rejected": -0.4620656967163086, + "logps/chosen": -148.4291229248047, + "logps/rejected": -186.41522216796875, + "loss": 0.2179, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.5123512744903564, + "rewards/margins": 7.561125755310059, + "rewards/rejected": -8.073476791381836, + "step": 263 + }, + { + "epoch": 0.45, + "learning_rate": 9.996337778595453e-07, + "logits/chosen": -1.246445894241333, + "logits/rejected": -1.9943310022354126, + "logps/chosen": -311.7137451171875, + "logps/rejected": -163.29208374023438, + "loss": 0.1784, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.3149963617324829, + "rewards/margins": 7.973663330078125, + "rewards/rejected": -7.658667087554932, + "step": 264 + }, + { + "epoch": 0.45, + "learning_rate": 9.996050689573542e-07, + "logits/chosen": -0.8231534361839294, + "logits/rejected": -1.4796454906463623, + "logps/chosen": -392.57366943359375, + "logps/rejected": -215.98666381835938, + "loss": 0.1828, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.50887531042099, + "rewards/margins": 4.215329647064209, + "rewards/rejected": -4.724205017089844, + "step": 265 + }, + { + "epoch": 0.45, + "learning_rate": 9.995752772815274e-07, + "logits/chosen": -1.0359983444213867, + "logits/rejected": -1.4785257577896118, + "logps/chosen": -98.88191223144531, + "logps/rejected": -250.07452392578125, + "loss": 0.1721, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.7754969000816345, + "rewards/margins": 5.748414516448975, + "rewards/rejected": -6.523911476135254, + "step": 266 + }, + { + "epoch": 0.45, + "learning_rate": 9.995444028966306e-07, + "logits/chosen": -1.2062805891036987, + "logits/rejected": -0.6490817666053772, + "logps/chosen": -66.56135559082031, + "logps/rejected": -173.89833068847656, + "loss": 0.2002, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.7681030631065369, + "rewards/margins": 3.30619478225708, + "rewards/rejected": -4.074297904968262, + "step": 267 + }, + { + "epoch": 0.45, + "learning_rate": 9.995124458695768e-07, + "logits/chosen": -1.8022315502166748, + "logits/rejected": -1.7330073118209839, + "logps/chosen": -119.78292846679688, + "logps/rejected": -185.3957061767578, + "loss": 0.1853, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.318962961435318, + "rewards/margins": 3.6459078788757324, + "rewards/rejected": -3.9648709297180176, + "step": 268 + }, + { + "epoch": 0.45, + "learning_rate": 9.99479406269625e-07, + "logits/chosen": -0.64534592628479, + "logits/rejected": -1.2948215007781982, + "logps/chosen": -386.3092956542969, + "logps/rejected": -148.838134765625, + "loss": 0.2189, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.32144472002983093, + "rewards/margins": 4.571913242340088, + "rewards/rejected": -4.89335823059082, + "step": 269 + }, + { + "epoch": 0.46, + "learning_rate": 9.994452841683807e-07, + "logits/chosen": -1.8895020484924316, + "logits/rejected": -1.8781219720840454, + "logps/chosen": -109.0746841430664, + "logps/rejected": -128.44271850585938, + "loss": 0.1739, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.8171417713165283, + "rewards/margins": -0.010423451662063599, + "rewards/rejected": -0.8067182898521423, + "step": 270 + }, + { + "epoch": 0.46, + "learning_rate": 9.994100796397953e-07, + "logits/chosen": -2.2890195846557617, + "logits/rejected": -2.143787145614624, + "logps/chosen": -67.44159698486328, + "logps/rejected": -293.24310302734375, + "loss": 0.2057, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.6767939329147339, + "rewards/margins": 12.486234664916992, + "rewards/rejected": -13.163028717041016, + "step": 271 + }, + { + "epoch": 0.46, + "learning_rate": 9.993737927601663e-07, + "logits/chosen": -1.2766467332839966, + "logits/rejected": -0.8370835781097412, + "logps/chosen": -126.51022338867188, + "logps/rejected": -241.4452362060547, + "loss": 0.1861, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.14905127882957458, + "rewards/margins": 8.128352165222168, + "rewards/rejected": -8.277403831481934, + "step": 272 + }, + { + "epoch": 0.46, + "learning_rate": 9.993364236081366e-07, + "logits/chosen": -2.052964925765991, + "logits/rejected": -1.607141375541687, + "logps/chosen": -103.60296630859375, + "logps/rejected": -158.26458740234375, + "loss": 0.154, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.312964916229248, + "rewards/margins": 1.991333246231079, + "rewards/rejected": -3.304298162460327, + "step": 273 + }, + { + "epoch": 0.46, + "learning_rate": 9.992979722646948e-07, + "logits/chosen": -2.079749584197998, + "logits/rejected": -2.0677154064178467, + "logps/chosen": -41.83177947998047, + "logps/rejected": -55.48749542236328, + "loss": 0.1965, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.5781872272491455, + "rewards/margins": 1.497841477394104, + "rewards/rejected": -2.076028823852539, + "step": 274 + }, + { + "epoch": 0.46, + "learning_rate": 9.992584388131748e-07, + "logits/chosen": -2.3774337768554688, + "logits/rejected": -1.4596751928329468, + "logps/chosen": -47.4880256652832, + "logps/rejected": -253.26637268066406, + "loss": 0.1938, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.3620060980319977, + "rewards/margins": 8.875134468078613, + "rewards/rejected": -9.237140655517578, + "step": 275 + }, + { + "epoch": 0.47, + "learning_rate": 9.992178233392562e-07, + "logits/chosen": -1.6642234325408936, + "logits/rejected": -1.6687958240509033, + "logps/chosen": -307.078369140625, + "logps/rejected": -324.7652587890625, + "loss": 0.2655, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.5620359182357788, + "rewards/margins": 5.345144748687744, + "rewards/rejected": -4.783108711242676, + "step": 276 + }, + { + "epoch": 0.47, + "learning_rate": 9.991761259309633e-07, + "logits/chosen": -1.3904876708984375, + "logits/rejected": -1.6669962406158447, + "logps/chosen": -400.28887939453125, + "logps/rejected": -245.69497680664062, + "loss": 0.181, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.673797070980072, + "rewards/margins": 7.228074550628662, + "rewards/rejected": -7.901871681213379, + "step": 277 + }, + { + "epoch": 0.47, + "learning_rate": 9.991333466786648e-07, + "logits/chosen": -1.216492772102356, + "logits/rejected": -1.1512091159820557, + "logps/chosen": -16.977386474609375, + "logps/rejected": -128.8122100830078, + "loss": 0.1701, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.0717000961303711, + "rewards/margins": 7.391216278076172, + "rewards/rejected": -7.462916374206543, + "step": 278 + }, + { + "epoch": 0.47, + "learning_rate": 9.990894856750744e-07, + "logits/chosen": -0.9838640689849854, + "logits/rejected": -2.3961727619171143, + "logps/chosen": -650.3472900390625, + "logps/rejected": -48.605430603027344, + "loss": 0.1742, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.338510125875473, + "rewards/margins": 1.6768007278442383, + "rewards/rejected": -1.3382906913757324, + "step": 279 + }, + { + "epoch": 0.47, + "learning_rate": 9.990445430152506e-07, + "logits/chosen": -0.4278978407382965, + "logits/rejected": -0.6988143920898438, + "logps/chosen": -533.7452392578125, + "logps/rejected": -292.7690734863281, + "loss": 0.2038, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.9284515380859375, + "rewards/margins": 10.271871566772461, + "rewards/rejected": -11.200323104858398, + "step": 280 + }, + { + "epoch": 0.47, + "learning_rate": 9.989985187965955e-07, + "logits/chosen": -1.6768834590911865, + "logits/rejected": -1.5711687803268433, + "logps/chosen": -439.8897705078125, + "logps/rejected": -265.1025695800781, + "loss": 0.2195, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.3489418029785156, + "rewards/margins": 7.014950752258301, + "rewards/rejected": -8.363892555236816, + "step": 281 + }, + { + "epoch": 0.48, + "learning_rate": 9.989514131188558e-07, + "logits/chosen": -1.4727314710617065, + "logits/rejected": -1.679413914680481, + "logps/chosen": -47.99615478515625, + "logps/rejected": -121.50358581542969, + "loss": 0.1682, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.8484709858894348, + "rewards/margins": 4.306775093078613, + "rewards/rejected": -5.155246257781982, + "step": 282 + }, + { + "epoch": 0.48, + "learning_rate": 9.989032260841215e-07, + "logits/chosen": -1.3292348384857178, + "logits/rejected": -1.4934347867965698, + "logps/chosen": -443.295654296875, + "logps/rejected": -279.7898254394531, + "loss": 0.176, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.13922274112701416, + "rewards/margins": 7.711603164672852, + "rewards/rejected": -7.850825786590576, + "step": 283 + }, + { + "epoch": 0.48, + "learning_rate": 9.988539577968264e-07, + "logits/chosen": -1.694291591644287, + "logits/rejected": -1.0821882486343384, + "logps/chosen": -46.26044845581055, + "logps/rejected": -100.78984069824219, + "loss": 0.199, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.683262825012207, + "rewards/margins": 1.5582889318466187, + "rewards/rejected": -2.2415518760681152, + "step": 284 + }, + { + "epoch": 0.48, + "learning_rate": 9.988036083637477e-07, + "logits/chosen": -1.4071030616760254, + "logits/rejected": -1.7171040773391724, + "logps/chosen": -215.1892852783203, + "logps/rejected": -162.13401794433594, + "loss": 0.2222, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.22356048226356506, + "rewards/margins": 8.27348518371582, + "rewards/rejected": -8.497045516967773, + "step": 285 + }, + { + "epoch": 0.48, + "learning_rate": 9.987521778940057e-07, + "logits/chosen": -1.219684362411499, + "logits/rejected": -1.615993618965149, + "logps/chosen": -529.4940185546875, + "logps/rejected": -260.98486328125, + "loss": 0.187, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.5555427670478821, + "rewards/margins": 7.262081623077393, + "rewards/rejected": -7.817624092102051, + "step": 286 + }, + { + "epoch": 0.48, + "learning_rate": 9.986996664990635e-07, + "logits/chosen": -1.1856704950332642, + "logits/rejected": -1.439422607421875, + "logps/chosen": -128.1553192138672, + "logps/rejected": -44.11671447753906, + "loss": 0.1936, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.0148143768310547, + "rewards/margins": -0.19398105144500732, + "rewards/rejected": -0.8208333253860474, + "step": 287 + }, + { + "epoch": 0.49, + "learning_rate": 9.986460742927269e-07, + "logits/chosen": -0.8404701948165894, + "logits/rejected": -0.4110315442085266, + "logps/chosen": -421.8138122558594, + "logps/rejected": -330.5446472167969, + "loss": 0.225, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.2713227272033691, + "rewards/margins": 11.016674041748047, + "rewards/rejected": -12.287996292114258, + "step": 288 + }, + { + "epoch": 0.49, + "learning_rate": 9.985914013911442e-07, + "logits/chosen": -1.3006629943847656, + "logits/rejected": -1.1097196340560913, + "logps/chosen": -624.8807983398438, + "logps/rejected": -328.3757019042969, + "loss": 0.1815, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.4635086059570312, + "rewards/margins": 3.9749832153320312, + "rewards/rejected": -5.4384918212890625, + "step": 289 + }, + { + "epoch": 0.49, + "learning_rate": 9.985356479128056e-07, + "logits/chosen": -0.30210334062576294, + "logits/rejected": -0.34102773666381836, + "logps/chosen": -203.22467041015625, + "logps/rejected": -202.3996124267578, + "loss": 0.2037, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.25408071279525757, + "rewards/margins": 9.476703643798828, + "rewards/rejected": -9.73078441619873, + "step": 290 + }, + { + "epoch": 0.49, + "learning_rate": 9.984788139785432e-07, + "logits/chosen": -1.5046318769454956, + "logits/rejected": -1.4954370260238647, + "logps/chosen": -128.044189453125, + "logps/rejected": -212.79881286621094, + "loss": 0.1563, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.5831112265586853, + "rewards/margins": 4.677426338195801, + "rewards/rejected": -4.094315052032471, + "step": 291 + }, + { + "epoch": 0.49, + "learning_rate": 9.984208997115311e-07, + "logits/chosen": -1.3097823858261108, + "logits/rejected": -0.7972382307052612, + "logps/chosen": -61.33431625366211, + "logps/rejected": -224.305908203125, + "loss": 0.1748, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.20676542818546295, + "rewards/margins": 7.611514091491699, + "rewards/rejected": -7.81827974319458, + "step": 292 + }, + { + "epoch": 0.49, + "learning_rate": 9.983619052372847e-07, + "logits/chosen": -1.4994029998779297, + "logits/rejected": -0.9353764653205872, + "logps/chosen": -454.6905517578125, + "logps/rejected": -354.83721923828125, + "loss": 0.1724, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.6514847278594971, + "rewards/margins": 12.14840030670166, + "rewards/rejected": -12.799884796142578, + "step": 293 + }, + { + "epoch": 0.5, + "learning_rate": 9.983018306836599e-07, + "logits/chosen": -1.3215970993041992, + "logits/rejected": -1.484683632850647, + "logps/chosen": -519.1898803710938, + "logps/rejected": -226.5242919921875, + "loss": 0.1852, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.4192039668560028, + "rewards/margins": 8.631196975708008, + "rewards/rejected": -8.211993217468262, + "step": 294 + }, + { + "epoch": 0.5, + "learning_rate": 9.98240676180854e-07, + "logits/chosen": -2.4572155475616455, + "logits/rejected": -2.3966965675354004, + "logps/chosen": -40.4495849609375, + "logps/rejected": -193.80703735351562, + "loss": 0.1768, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.9751855134963989, + "rewards/margins": 9.661815643310547, + "rewards/rejected": -10.637001037597656, + "step": 295 + }, + { + "epoch": 0.5, + "learning_rate": 9.981784418614046e-07, + "logits/chosen": -1.1194053888320923, + "logits/rejected": -1.3947285413742065, + "logps/chosen": -282.65728759765625, + "logps/rejected": -197.75747680664062, + "loss": 0.1737, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.5431690216064453, + "rewards/margins": 4.192901134490967, + "rewards/rejected": -4.736070156097412, + "step": 296 + }, + { + "epoch": 0.5, + "learning_rate": 9.981151278601899e-07, + "logits/chosen": -1.8385300636291504, + "logits/rejected": -0.9504812359809875, + "logps/chosen": -119.73028564453125, + "logps/rejected": -262.2828674316406, + "loss": 0.1592, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.2197556495666504, + "rewards/margins": 6.548429489135742, + "rewards/rejected": -7.768185615539551, + "step": 297 + }, + { + "epoch": 0.5, + "learning_rate": 9.980507343144271e-07, + "logits/chosen": -0.959502637386322, + "logits/rejected": -0.6492790579795837, + "logps/chosen": -598.463623046875, + "logps/rejected": -408.7165222167969, + "loss": 0.2004, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.5735183954238892, + "rewards/margins": 13.148619651794434, + "rewards/rejected": -14.722137451171875, + "step": 298 + }, + { + "epoch": 0.5, + "learning_rate": 9.979852613636743e-07, + "logits/chosen": -1.758123755455017, + "logits/rejected": -2.5972981452941895, + "logps/chosen": -210.6790771484375, + "logps/rejected": -183.95455932617188, + "loss": 0.2032, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.4298916161060333, + "rewards/margins": 4.640817642211914, + "rewards/rejected": -5.070709228515625, + "step": 299 + }, + { + "epoch": 0.51, + "learning_rate": 9.979187091498283e-07, + "logits/chosen": -1.9287923574447632, + "logits/rejected": -2.3071420192718506, + "logps/chosen": -224.43728637695312, + "logps/rejected": -252.1556396484375, + "loss": 0.1592, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.8604675531387329, + "rewards/margins": 4.382769584655762, + "rewards/rejected": -5.243237018585205, + "step": 300 + }, + { + "epoch": 0.51, + "learning_rate": 9.978510778171245e-07, + "logits/chosen": -1.4522650241851807, + "logits/rejected": -0.9249738454818726, + "logps/chosen": -410.66632080078125, + "logps/rejected": -477.5390625, + "loss": 0.1934, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.16993102431297302, + "rewards/margins": 6.357221603393555, + "rewards/rejected": -6.5271525382995605, + "step": 301 + }, + { + "epoch": 0.51, + "learning_rate": 9.977823675121382e-07, + "logits/chosen": -1.6546478271484375, + "logits/rejected": -2.3277413845062256, + "logps/chosen": -207.47552490234375, + "logps/rejected": -181.27479553222656, + "loss": 0.1846, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.2778858244419098, + "rewards/margins": 6.3818769454956055, + "rewards/rejected": -6.659762382507324, + "step": 302 + }, + { + "epoch": 0.51, + "learning_rate": 9.977125783837818e-07, + "logits/chosen": -1.469759464263916, + "logits/rejected": -2.041001319885254, + "logps/chosen": -286.0638427734375, + "logps/rejected": -141.27078247070312, + "loss": 0.1763, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.4046391546726227, + "rewards/margins": 7.348243713378906, + "rewards/rejected": -6.943604469299316, + "step": 303 + }, + { + "epoch": 0.51, + "learning_rate": 9.97641710583307e-07, + "logits/chosen": -1.4234567880630493, + "logits/rejected": -1.6683655977249146, + "logps/chosen": -362.9468078613281, + "logps/rejected": -260.8007507324219, + "loss": 0.1771, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.304555892944336, + "rewards/margins": 6.251507759094238, + "rewards/rejected": -7.556063652038574, + "step": 304 + }, + { + "epoch": 0.51, + "learning_rate": 9.975697642643022e-07, + "logits/chosen": -1.9733335971832275, + "logits/rejected": -1.0282042026519775, + "logps/chosen": -142.15017700195312, + "logps/rejected": -305.75054931640625, + "loss": 0.1982, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.6309067010879517, + "rewards/margins": 5.164991855621338, + "rewards/rejected": -5.7958984375, + "step": 305 + }, + { + "epoch": 0.52, + "learning_rate": 9.97496739582694e-07, + "logits/chosen": -0.8187223076820374, + "logits/rejected": -1.733450174331665, + "logps/chosen": -683.83056640625, + "logps/rejected": -257.8089599609375, + "loss": 0.2229, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.4965972900390625, + "rewards/margins": 11.01584243774414, + "rewards/rejected": -11.512439727783203, + "step": 306 + }, + { + "epoch": 0.52, + "learning_rate": 9.974226366967457e-07, + "logits/chosen": -1.2671740055084229, + "logits/rejected": -1.1369811296463013, + "logps/chosen": -495.198974609375, + "logps/rejected": -387.591064453125, + "loss": 0.2071, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.4078201353549957, + "rewards/margins": 7.06597900390625, + "rewards/rejected": -7.473799228668213, + "step": 307 + }, + { + "epoch": 0.52, + "learning_rate": 9.973474557670574e-07, + "logits/chosen": -1.4290441274642944, + "logits/rejected": -2.188762903213501, + "logps/chosen": -75.417724609375, + "logps/rejected": -70.07405853271484, + "loss": 0.176, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.36977919936180115, + "rewards/margins": 4.8193769454956055, + "rewards/rejected": -5.1891560554504395, + "step": 308 + }, + { + "epoch": 0.52, + "learning_rate": 9.972711969565658e-07, + "logits/chosen": -0.476540207862854, + "logits/rejected": -1.4027069807052612, + "logps/chosen": -1183.3697509765625, + "logps/rejected": -414.2918701171875, + "loss": 0.1684, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.6394989490509033, + "rewards/margins": 6.497945308685303, + "rewards/rejected": -5.8584465980529785, + "step": 309 + }, + { + "epoch": 0.52, + "learning_rate": 9.971938604305434e-07, + "logits/chosen": -1.002270221710205, + "logits/rejected": -0.6929762363433838, + "logps/chosen": -175.58682250976562, + "logps/rejected": -347.9329833984375, + "loss": 0.1936, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.08384094387292862, + "rewards/margins": 1.912644863128662, + "rewards/rejected": -1.8288038969039917, + "step": 310 + }, + { + "epoch": 0.52, + "learning_rate": 9.971154463565984e-07, + "logits/chosen": -0.7919758558273315, + "logits/rejected": -1.6887415647506714, + "logps/chosen": -293.739990234375, + "logps/rejected": -134.64825439453125, + "loss": 0.163, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.24404069781303406, + "rewards/margins": 7.300686836242676, + "rewards/rejected": -7.544727325439453, + "step": 311 + }, + { + "epoch": 0.53, + "learning_rate": 9.97035954904675e-07, + "logits/chosen": -1.3530901670455933, + "logits/rejected": -1.727453589439392, + "logps/chosen": -433.71282958984375, + "logps/rejected": -299.36175537109375, + "loss": 0.1831, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.0365535020828247, + "rewards/margins": 6.750691890716553, + "rewards/rejected": -7.787245273590088, + "step": 312 + }, + { + "epoch": 0.53, + "learning_rate": 9.969553862470508e-07, + "logits/chosen": -0.9584515690803528, + "logits/rejected": -0.9624962210655212, + "logps/chosen": -32.61267852783203, + "logps/rejected": -126.69062042236328, + "loss": 0.1925, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.5699091553688049, + "rewards/margins": 6.840811729431152, + "rewards/rejected": -7.4107208251953125, + "step": 313 + }, + { + "epoch": 0.53, + "learning_rate": 9.968737405583395e-07, + "logits/chosen": -2.1836318969726562, + "logits/rejected": -1.6158103942871094, + "logps/chosen": -131.1636962890625, + "logps/rejected": -316.4326171875, + "loss": 0.1549, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.3803081512451172, + "rewards/margins": 7.161952972412109, + "rewards/rejected": -7.542261600494385, + "step": 314 + }, + { + "epoch": 0.53, + "learning_rate": 9.967910180154888e-07, + "logits/chosen": -0.774591326713562, + "logits/rejected": -1.062556505203247, + "logps/chosen": -555.9920043945312, + "logps/rejected": -338.88055419921875, + "loss": 0.2006, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.0430755615234375, + "rewards/margins": 7.494574069976807, + "rewards/rejected": -7.537649631500244, + "step": 315 + }, + { + "epoch": 0.53, + "learning_rate": 9.967072187977793e-07, + "logits/chosen": -1.3581414222717285, + "logits/rejected": -1.6791399717330933, + "logps/chosen": -354.8208312988281, + "logps/rejected": -287.4419860839844, + "loss": 0.1415, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.2476089596748352, + "rewards/margins": 4.651924133300781, + "rewards/rejected": -4.404314994812012, + "step": 316 + }, + { + "epoch": 0.53, + "learning_rate": 9.96622343086826e-07, + "logits/chosen": -1.4797134399414062, + "logits/rejected": -1.6615569591522217, + "logps/chosen": -264.7172546386719, + "logps/rejected": -219.28221130371094, + "loss": 0.1911, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.6093296408653259, + "rewards/margins": 10.906174659729004, + "rewards/rejected": -10.296845436096191, + "step": 317 + }, + { + "epoch": 0.54, + "learning_rate": 9.96536391066576e-07, + "logits/chosen": -1.7635356187820435, + "logits/rejected": -1.4284019470214844, + "logps/chosen": -262.9644775390625, + "logps/rejected": -328.27642822265625, + "loss": 0.1773, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.7439472675323486, + "rewards/margins": 5.8976640701293945, + "rewards/rejected": -5.153717041015625, + "step": 318 + }, + { + "epoch": 0.54, + "learning_rate": 9.964493629233104e-07, + "logits/chosen": -1.0009286403656006, + "logits/rejected": -0.9932736158370972, + "logps/chosen": -289.8218994140625, + "logps/rejected": -227.1639404296875, + "loss": 0.144, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.4354541301727295, + "rewards/margins": 8.911469459533691, + "rewards/rejected": -10.346923828125, + "step": 319 + }, + { + "epoch": 0.54, + "learning_rate": 9.963612588456412e-07, + "logits/chosen": -2.250784397125244, + "logits/rejected": -1.9884320497512817, + "logps/chosen": -46.93716049194336, + "logps/rejected": -284.6944274902344, + "loss": 0.1901, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.18621844053268433, + "rewards/margins": 10.04560661315918, + "rewards/rejected": -10.231825828552246, + "step": 320 + }, + { + "epoch": 0.54, + "learning_rate": 9.962720790245126e-07, + "logits/chosen": -1.6217896938323975, + "logits/rejected": -1.0232822895050049, + "logps/chosen": -193.26486206054688, + "logps/rejected": -346.9532470703125, + "loss": 0.1927, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.14425279200077057, + "rewards/margins": 2.2471230030059814, + "rewards/rejected": -2.391375780105591, + "step": 321 + }, + { + "epoch": 0.54, + "learning_rate": 9.96181823653201e-07, + "logits/chosen": -1.9973036050796509, + "logits/rejected": -2.215181827545166, + "logps/chosen": -267.81097412109375, + "logps/rejected": -295.62298583984375, + "loss": 0.1495, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.1725495457649231, + "rewards/margins": 7.604098320007324, + "rewards/rejected": -7.776648044586182, + "step": 322 + }, + { + "epoch": 0.54, + "learning_rate": 9.96090492927313e-07, + "logits/chosen": -1.6204432249069214, + "logits/rejected": -1.2365188598632812, + "logps/chosen": -46.48271942138672, + "logps/rejected": -140.6979217529297, + "loss": 0.1883, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.3501051366329193, + "rewards/margins": 7.698967456817627, + "rewards/rejected": -8.049072265625, + "step": 323 + }, + { + "epoch": 0.55, + "learning_rate": 9.959980870447852e-07, + "logits/chosen": -0.840675950050354, + "logits/rejected": -0.4675593972206116, + "logps/chosen": -480.7996826171875, + "logps/rejected": -337.56854248046875, + "loss": 0.1888, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.6151520013809204, + "rewards/margins": 12.177379608154297, + "rewards/rejected": -13.792531967163086, + "step": 324 + }, + { + "epoch": 0.55, + "learning_rate": 9.959046062058862e-07, + "logits/chosen": -1.4950480461120605, + "logits/rejected": -1.5916494131088257, + "logps/chosen": -64.40080261230469, + "logps/rejected": -71.2299575805664, + "loss": 0.2506, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.16438627243041992, + "rewards/margins": 4.567226886749268, + "rewards/rejected": -4.7316131591796875, + "step": 325 + }, + { + "epoch": 0.55, + "learning_rate": 9.958100506132126e-07, + "logits/chosen": -0.7526825666427612, + "logits/rejected": -1.9301047325134277, + "logps/chosen": -560.9881591796875, + "logps/rejected": -72.82947540283203, + "loss": 0.1853, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.4727003276348114, + "rewards/margins": 5.001248359680176, + "rewards/rejected": -4.528548240661621, + "step": 326 + }, + { + "epoch": 0.55, + "learning_rate": 9.957144204716907e-07, + "logits/chosen": -0.9634856581687927, + "logits/rejected": -1.3579816818237305, + "logps/chosen": -301.60650634765625, + "logps/rejected": -186.40699768066406, + "loss": 0.1644, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.7037521600723267, + "rewards/margins": 7.330875873565674, + "rewards/rejected": -8.034627914428711, + "step": 327 + }, + { + "epoch": 0.55, + "learning_rate": 9.956177159885764e-07, + "logits/chosen": -1.8041788339614868, + "logits/rejected": -2.096233367919922, + "logps/chosen": -149.9138946533203, + "logps/rejected": -199.65846252441406, + "loss": 0.1742, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.1912795603275299, + "rewards/margins": 6.335195541381836, + "rewards/rejected": -6.143916130065918, + "step": 328 + }, + { + "epoch": 0.55, + "learning_rate": 9.955199373734528e-07, + "logits/chosen": -2.210163116455078, + "logits/rejected": -1.5051549673080444, + "logps/chosen": -43.63890838623047, + "logps/rejected": -231.96681213378906, + "loss": 0.1834, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.6588637828826904, + "rewards/margins": 4.09450101852417, + "rewards/rejected": -4.753364562988281, + "step": 329 + }, + { + "epoch": 0.56, + "learning_rate": 9.954210848382317e-07, + "logits/chosen": -1.917798399925232, + "logits/rejected": -1.8297691345214844, + "logps/chosen": -28.441646575927734, + "logps/rejected": -159.19583129882812, + "loss": 0.1769, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.4995257258415222, + "rewards/margins": 7.739633560180664, + "rewards/rejected": -8.23915958404541, + "step": 330 + }, + { + "epoch": 0.56, + "learning_rate": 9.953211585971522e-07, + "logits/chosen": -1.6009342670440674, + "logits/rejected": -1.1614232063293457, + "logps/chosen": -143.366455078125, + "logps/rejected": -187.1925048828125, + "loss": 0.1742, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.8375290036201477, + "rewards/margins": 0.36481326818466187, + "rewards/rejected": -1.2023422718048096, + "step": 331 + }, + { + "epoch": 0.56, + "learning_rate": 9.952201588667803e-07, + "logits/chosen": -1.2768915891647339, + "logits/rejected": -2.192704916000366, + "logps/chosen": -347.0742492675781, + "logps/rejected": -107.85566711425781, + "loss": 0.2237, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.9857234954833984, + "rewards/margins": 4.846242904663086, + "rewards/rejected": -5.831965923309326, + "step": 332 + }, + { + "epoch": 0.56, + "learning_rate": 9.951180858660089e-07, + "logits/chosen": -1.8204140663146973, + "logits/rejected": -1.5513927936553955, + "logps/chosen": -409.3429260253906, + "logps/rejected": -370.9957275390625, + "loss": 0.2061, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.23614689707756042, + "rewards/margins": 0.9891689419746399, + "rewards/rejected": -1.225315809249878, + "step": 333 + }, + { + "epoch": 0.56, + "learning_rate": 9.95014939816056e-07, + "logits/chosen": -0.7975092530250549, + "logits/rejected": -1.1003104448318481, + "logps/chosen": -433.71661376953125, + "logps/rejected": -278.63995361328125, + "loss": 0.1544, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.7607848644256592, + "rewards/margins": 9.997198104858398, + "rewards/rejected": -11.75798225402832, + "step": 334 + }, + { + "epoch": 0.56, + "learning_rate": 9.949107209404663e-07, + "logits/chosen": -2.2883524894714355, + "logits/rejected": -2.550262212753296, + "logps/chosen": -122.60182189941406, + "logps/rejected": -206.03915405273438, + "loss": 0.2065, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.2694628238677979, + "rewards/margins": 6.3892903327941895, + "rewards/rejected": -7.658753395080566, + "step": 335 + }, + { + "epoch": 0.57, + "learning_rate": 9.948054294651088e-07, + "logits/chosen": -1.41136634349823, + "logits/rejected": -2.039381265640259, + "logps/chosen": -242.87060546875, + "logps/rejected": -118.76246643066406, + "loss": 0.1629, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.8472713232040405, + "rewards/margins": 0.7751063704490662, + "rewards/rejected": -1.6223777532577515, + "step": 336 + }, + { + "epoch": 0.57, + "learning_rate": 9.946990656181779e-07, + "logits/chosen": -1.0350347757339478, + "logits/rejected": -0.8846843838691711, + "logps/chosen": -52.651214599609375, + "logps/rejected": -150.87210083007812, + "loss": 0.2258, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.9379146099090576, + "rewards/margins": 7.0258870124816895, + "rewards/rejected": -8.963801383972168, + "step": 337 + }, + { + "epoch": 0.57, + "learning_rate": 9.945916296301912e-07, + "logits/chosen": -2.120410442352295, + "logits/rejected": -1.4654765129089355, + "logps/chosen": -81.31796264648438, + "logps/rejected": -198.11753845214844, + "loss": 0.1973, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.5237701535224915, + "rewards/margins": 3.80403470993042, + "rewards/rejected": -4.3278045654296875, + "step": 338 + }, + { + "epoch": 0.57, + "learning_rate": 9.944831217339903e-07, + "logits/chosen": -1.9489809274673462, + "logits/rejected": -1.5405217409133911, + "logps/chosen": -335.79010009765625, + "logps/rejected": -635.9193115234375, + "loss": 0.1915, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.2987374067306519, + "rewards/margins": 1.2891318798065186, + "rewards/rejected": -2.587869167327881, + "step": 339 + }, + { + "epoch": 0.57, + "learning_rate": 9.943735421647404e-07, + "logits/chosen": -1.1065800189971924, + "logits/rejected": -1.074052095413208, + "logps/chosen": -33.51839828491211, + "logps/rejected": -160.16390991210938, + "loss": 0.2031, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.1291143894195557, + "rewards/margins": 8.39620304107666, + "rewards/rejected": -9.525317192077637, + "step": 340 + }, + { + "epoch": 0.58, + "learning_rate": 9.94262891159928e-07, + "logits/chosen": -1.4195408821105957, + "logits/rejected": -1.2427754402160645, + "logps/chosen": -230.10208129882812, + "logps/rejected": -262.679931640625, + "loss": 0.2108, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.9072086811065674, + "rewards/margins": 10.54747200012207, + "rewards/rejected": -12.454681396484375, + "step": 341 + }, + { + "epoch": 0.58, + "learning_rate": 9.941511689593633e-07, + "logits/chosen": -1.1351226568222046, + "logits/rejected": -1.9858900308609009, + "logps/chosen": -298.932861328125, + "logps/rejected": -142.6973876953125, + "loss": 0.1527, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.044623613357544, + "rewards/margins": 9.147809982299805, + "rewards/rejected": -8.10318660736084, + "step": 342 + }, + { + "epoch": 0.58, + "learning_rate": 9.940383758051767e-07, + "logits/chosen": -1.8453896045684814, + "logits/rejected": -1.5388239622116089, + "logps/chosen": -197.114990234375, + "logps/rejected": -312.2078552246094, + "loss": 0.2152, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.8707555532455444, + "rewards/margins": 12.464103698730469, + "rewards/rejected": -13.334858894348145, + "step": 343 + }, + { + "epoch": 0.58, + "learning_rate": 9.939245119418206e-07, + "logits/chosen": -1.4721529483795166, + "logits/rejected": -1.445346713066101, + "logps/chosen": -178.68853759765625, + "logps/rejected": -172.24676513671875, + "loss": 0.1945, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.9977798461914062, + "rewards/margins": 2.5249760150909424, + "rewards/rejected": -4.522756099700928, + "step": 344 + }, + { + "epoch": 0.58, + "learning_rate": 9.938095776160674e-07, + "logits/chosen": -0.6039644479751587, + "logits/rejected": -0.8809584975242615, + "logps/chosen": -337.0422058105469, + "logps/rejected": -204.4918212890625, + "loss": 0.1941, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.2001136839389801, + "rewards/margins": 9.865687370300293, + "rewards/rejected": -10.065800666809082, + "step": 345 + }, + { + "epoch": 0.58, + "learning_rate": 9.936935730770093e-07, + "logits/chosen": -1.719544768333435, + "logits/rejected": -1.678938865661621, + "logps/chosen": -548.88916015625, + "logps/rejected": -351.3948669433594, + "loss": 0.166, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.9289765357971191, + "rewards/margins": 1.596639633178711, + "rewards/rejected": -2.52561616897583, + "step": 346 + }, + { + "epoch": 0.59, + "learning_rate": 9.935764985760582e-07, + "logits/chosen": -1.567973256111145, + "logits/rejected": -1.7810003757476807, + "logps/chosen": -64.80793762207031, + "logps/rejected": -145.93380737304688, + "loss": 0.1616, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.15498466789722443, + "rewards/margins": 4.86065149307251, + "rewards/rejected": -5.015635967254639, + "step": 347 + }, + { + "epoch": 0.59, + "learning_rate": 9.934583543669453e-07, + "logits/chosen": -2.2390925884246826, + "logits/rejected": -1.3963329792022705, + "logps/chosen": -86.66363525390625, + "logps/rejected": -205.68148803710938, + "loss": 0.1828, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.016691789031028748, + "rewards/margins": 5.741617202758789, + "rewards/rejected": -5.7583088874816895, + "step": 348 + }, + { + "epoch": 0.59, + "learning_rate": 9.933391407057195e-07, + "logits/chosen": -1.3134796619415283, + "logits/rejected": -1.861419439315796, + "logps/chosen": -224.83663940429688, + "logps/rejected": -265.0384521484375, + "loss": 0.1408, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.34677210450172424, + "rewards/margins": 13.738977432250977, + "rewards/rejected": -13.392204284667969, + "step": 349 + }, + { + "epoch": 0.59, + "learning_rate": 9.932188578507474e-07, + "logits/chosen": -1.9301025867462158, + "logits/rejected": -1.5889010429382324, + "logps/chosen": -50.467098236083984, + "logps/rejected": -331.0249938964844, + "loss": 0.1679, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.6283357739448547, + "rewards/margins": 14.762405395507812, + "rewards/rejected": -15.390741348266602, + "step": 350 + }, + { + "epoch": 0.59, + "learning_rate": 9.930975060627136e-07, + "logits/chosen": -2.602487325668335, + "logits/rejected": -1.6359155178070068, + "logps/chosen": -654.8057861328125, + "logps/rejected": -385.56781005859375, + "loss": 0.2003, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.9707611203193665, + "rewards/margins": 4.78594970703125, + "rewards/rejected": -5.756711006164551, + "step": 351 + }, + { + "epoch": 0.59, + "learning_rate": 9.929750856046187e-07, + "logits/chosen": -0.7864140868186951, + "logits/rejected": -1.5045098066329956, + "logps/chosen": -400.8957214355469, + "logps/rejected": -302.7469787597656, + "loss": 0.1987, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.44732972979545593, + "rewards/margins": 1.3191795349121094, + "rewards/rejected": -0.8718498945236206, + "step": 352 + }, + { + "epoch": 0.6, + "learning_rate": 9.928515967417792e-07, + "logits/chosen": -0.948026716709137, + "logits/rejected": -1.3471962213516235, + "logps/chosen": -238.0382843017578, + "logps/rejected": -111.223876953125, + "loss": 0.1799, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.48494261503219604, + "rewards/margins": 7.656557083129883, + "rewards/rejected": -7.171614170074463, + "step": 353 + }, + { + "epoch": 0.6, + "learning_rate": 9.927270397418278e-07, + "logits/chosen": -1.5953532457351685, + "logits/rejected": -1.2294915914535522, + "logps/chosen": -99.09436798095703, + "logps/rejected": -213.32522583007812, + "loss": 0.1767, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.6239818930625916, + "rewards/margins": 4.957241058349609, + "rewards/rejected": -5.581223011016846, + "step": 354 + }, + { + "epoch": 0.6, + "learning_rate": 9.92601414874712e-07, + "logits/chosen": -0.9315654635429382, + "logits/rejected": -1.4781970977783203, + "logps/chosen": -360.9663391113281, + "logps/rejected": -193.1742401123047, + "loss": 0.1714, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.39375075697898865, + "rewards/margins": 6.650259971618652, + "rewards/rejected": -7.044010639190674, + "step": 355 + }, + { + "epoch": 0.6, + "learning_rate": 9.924747224126931e-07, + "logits/chosen": -1.1415091753005981, + "logits/rejected": -1.5295865535736084, + "logps/chosen": -136.76544189453125, + "logps/rejected": -136.62179565429688, + "loss": 0.1877, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.1295880377292633, + "rewards/margins": 4.632694244384766, + "rewards/rejected": -4.762282371520996, + "step": 356 + }, + { + "epoch": 0.6, + "learning_rate": 9.923469626303464e-07, + "logits/chosen": -2.146008014678955, + "logits/rejected": -1.7241287231445312, + "logps/chosen": -200.16453552246094, + "logps/rejected": -284.6964416503906, + "loss": 0.1901, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.12157487869262695, + "rewards/margins": 12.299015045166016, + "rewards/rejected": -12.420589447021484, + "step": 357 + }, + { + "epoch": 0.6, + "learning_rate": 9.922181358045606e-07, + "logits/chosen": -2.4690566062927246, + "logits/rejected": -2.171621084213257, + "logps/chosen": -12.845178604125977, + "logps/rejected": -186.6353302001953, + "loss": 0.1612, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.3692449927330017, + "rewards/margins": 6.611391544342041, + "rewards/rejected": -6.980636119842529, + "step": 358 + }, + { + "epoch": 0.61, + "learning_rate": 9.92088242214537e-07, + "logits/chosen": -1.0186376571655273, + "logits/rejected": -1.3315547704696655, + "logps/chosen": -394.67236328125, + "logps/rejected": -266.37738037109375, + "loss": 0.1621, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.5413925647735596, + "rewards/margins": 7.737855434417725, + "rewards/rejected": -7.196463108062744, + "step": 359 + }, + { + "epoch": 0.61, + "learning_rate": 9.919572821417885e-07, + "logits/chosen": -0.9612762928009033, + "logits/rejected": -1.0460320711135864, + "logps/chosen": -69.64960479736328, + "logps/rejected": -178.71676635742188, + "loss": 0.1798, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.11741314828395844, + "rewards/margins": 11.80695629119873, + "rewards/rejected": -11.924369812011719, + "step": 360 + }, + { + "epoch": 0.61, + "learning_rate": 9.918252558701396e-07, + "logits/chosen": -1.4163517951965332, + "logits/rejected": -1.2571159601211548, + "logps/chosen": -48.93925857543945, + "logps/rejected": -165.11422729492188, + "loss": 0.1627, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.40845078229904175, + "rewards/margins": 8.386886596679688, + "rewards/rejected": -8.795337677001953, + "step": 361 + }, + { + "epoch": 0.61, + "learning_rate": 9.91692163685725e-07, + "logits/chosen": -1.6064996719360352, + "logits/rejected": -1.1707127094268799, + "logps/chosen": -13.273357391357422, + "logps/rejected": -70.61612701416016, + "loss": 0.1802, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.01451602578163147, + "rewards/margins": 3.4827561378479004, + "rewards/rejected": -3.468240261077881, + "step": 362 + }, + { + "epoch": 0.61, + "learning_rate": 9.915580058769908e-07, + "logits/chosen": -1.8302160501480103, + "logits/rejected": -1.5408596992492676, + "logps/chosen": -49.54063034057617, + "logps/rejected": -165.20664978027344, + "loss": 0.1484, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.45899465680122375, + "rewards/margins": 0.5188831686973572, + "rewards/rejected": -0.9778778553009033, + "step": 363 + }, + { + "epoch": 0.61, + "learning_rate": 9.914227827346908e-07, + "logits/chosen": -1.4065834283828735, + "logits/rejected": -1.661447525024414, + "logps/chosen": -73.02536010742188, + "logps/rejected": -106.48794555664062, + "loss": 0.1977, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.30404195189476013, + "rewards/margins": 2.008908987045288, + "rewards/rejected": -2.31295108795166, + "step": 364 + }, + { + "epoch": 0.62, + "learning_rate": 9.912864945518893e-07, + "logits/chosen": -1.741304636001587, + "logits/rejected": -2.134251356124878, + "logps/chosen": -274.7901916503906, + "logps/rejected": -197.7181396484375, + "loss": 0.2159, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.4863889813423157, + "rewards/margins": 1.1557424068450928, + "rewards/rejected": -1.6421314477920532, + "step": 365 + }, + { + "epoch": 0.62, + "learning_rate": 9.911491416239577e-07, + "logits/chosen": -0.297260582447052, + "logits/rejected": -0.43519172072410583, + "logps/chosen": -465.78167724609375, + "logps/rejected": -288.87237548828125, + "loss": 0.1746, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.9665650129318237, + "rewards/margins": 12.498331069946289, + "rewards/rejected": -14.464896202087402, + "step": 366 + }, + { + "epoch": 0.62, + "learning_rate": 9.910107242485756e-07, + "logits/chosen": -2.1462013721466064, + "logits/rejected": -1.7071665525436401, + "logps/chosen": -158.352783203125, + "logps/rejected": -259.8519287109375, + "loss": 0.1644, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.8491896390914917, + "rewards/margins": 12.058333396911621, + "rewards/rejected": -12.907523155212402, + "step": 367 + }, + { + "epoch": 0.62, + "learning_rate": 9.908712427257291e-07, + "logits/chosen": -1.6921484470367432, + "logits/rejected": -1.673211932182312, + "logps/chosen": -38.84759521484375, + "logps/rejected": -308.70318603515625, + "loss": 0.1658, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.669171929359436, + "rewards/margins": 10.507699966430664, + "rewards/rejected": -11.176872253417969, + "step": 368 + }, + { + "epoch": 0.62, + "learning_rate": 9.907306973577109e-07, + "logits/chosen": -1.1072365045547485, + "logits/rejected": -1.6484124660491943, + "logps/chosen": -140.98245239257812, + "logps/rejected": -191.68093872070312, + "loss": 0.1682, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.5839079022407532, + "rewards/margins": 9.572250366210938, + "rewards/rejected": -10.156158447265625, + "step": 369 + }, + { + "epoch": 0.62, + "learning_rate": 9.905890884491194e-07, + "logits/chosen": -2.0668201446533203, + "logits/rejected": -1.8850473165512085, + "logps/chosen": -19.376293182373047, + "logps/rejected": -101.66358184814453, + "loss": 0.1931, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.12364569306373596, + "rewards/margins": 4.169041633605957, + "rewards/rejected": -4.29268741607666, + "step": 370 + }, + { + "epoch": 0.63, + "learning_rate": 9.904464163068577e-07, + "logits/chosen": -1.7173949480056763, + "logits/rejected": -2.278022289276123, + "logps/chosen": -288.80584716796875, + "logps/rejected": -257.0757141113281, + "loss": 0.2352, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.6069202423095703, + "rewards/margins": 8.09630298614502, + "rewards/rejected": -8.70322322845459, + "step": 371 + }, + { + "epoch": 0.63, + "learning_rate": 9.903026812401332e-07, + "logits/chosen": -1.8909093141555786, + "logits/rejected": -1.652140498161316, + "logps/chosen": -170.0011444091797, + "logps/rejected": -244.4969482421875, + "loss": 0.172, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.49996358156204224, + "rewards/margins": 10.814790725708008, + "rewards/rejected": -11.314754486083984, + "step": 372 + }, + { + "epoch": 0.63, + "learning_rate": 9.90157883560457e-07, + "logits/chosen": -0.7032025456428528, + "logits/rejected": -0.7244059443473816, + "logps/chosen": -17.019268035888672, + "logps/rejected": -98.73320770263672, + "loss": 0.2018, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.16112473607063293, + "rewards/margins": 7.4662017822265625, + "rewards/rejected": -7.305077075958252, + "step": 373 + }, + { + "epoch": 0.63, + "learning_rate": 9.900120235816433e-07, + "logits/chosen": -1.4089502096176147, + "logits/rejected": -1.385013461112976, + "logps/chosen": -298.27459716796875, + "logps/rejected": -263.1510314941406, + "loss": 0.1913, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.2739990949630737, + "rewards/margins": 6.35389518737793, + "rewards/rejected": -7.627894401550293, + "step": 374 + }, + { + "epoch": 0.63, + "learning_rate": 9.898651016198085e-07, + "logits/chosen": -1.6036759614944458, + "logits/rejected": -2.3823535442352295, + "logps/chosen": -85.63179016113281, + "logps/rejected": -93.74755859375, + "loss": 0.1946, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.04974517598748207, + "rewards/margins": 4.797137260437012, + "rewards/rejected": -4.747392177581787, + "step": 375 + }, + { + "epoch": 0.63, + "learning_rate": 9.897171179933706e-07, + "logits/chosen": -1.0535942316055298, + "logits/rejected": -2.1571121215820312, + "logps/chosen": -717.8565673828125, + "logps/rejected": -162.77194213867188, + "loss": 0.2117, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.7600006461143494, + "rewards/margins": 0.8016586899757385, + "rewards/rejected": -1.561659336090088, + "step": 376 + }, + { + "epoch": 0.64, + "learning_rate": 9.895680730230483e-07, + "logits/chosen": -1.6556205749511719, + "logits/rejected": -1.703450083732605, + "logps/chosen": -159.5177764892578, + "logps/rejected": -124.55549621582031, + "loss": 0.2291, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.6513794660568237, + "rewards/margins": 5.128384113311768, + "rewards/rejected": -5.779763221740723, + "step": 377 + }, + { + "epoch": 0.64, + "learning_rate": 9.894179670318606e-07, + "logits/chosen": -1.3000423908233643, + "logits/rejected": -2.175767183303833, + "logps/chosen": -206.19947814941406, + "logps/rejected": -79.72200012207031, + "loss": 0.1892, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.14822006225585938, + "rewards/margins": 1.46049165725708, + "rewards/rejected": -1.608711838722229, + "step": 378 + }, + { + "epoch": 0.64, + "learning_rate": 9.892668003451264e-07, + "logits/chosen": -1.3806241750717163, + "logits/rejected": -2.067390203475952, + "logps/chosen": -281.26068115234375, + "logps/rejected": -130.72952270507812, + "loss": 0.1605, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.3341960906982422, + "rewards/margins": 7.966523170471191, + "rewards/rejected": -7.632327079772949, + "step": 379 + }, + { + "epoch": 0.64, + "learning_rate": 9.891145732904626e-07, + "logits/chosen": -2.4080684185028076, + "logits/rejected": -2.4052534103393555, + "logps/chosen": -23.60405731201172, + "logps/rejected": -156.3341522216797, + "loss": 0.1842, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.3208608627319336, + "rewards/margins": 9.061665534973145, + "rewards/rejected": -9.382526397705078, + "step": 380 + }, + { + "epoch": 0.64, + "learning_rate": 9.889612861977853e-07, + "logits/chosen": -1.899614691734314, + "logits/rejected": -1.3512424230575562, + "logps/chosen": -52.9254035949707, + "logps/rejected": -142.5703125, + "loss": 0.195, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.25636160373687744, + "rewards/margins": 4.048177242279053, + "rewards/rejected": -3.791815757751465, + "step": 381 + }, + { + "epoch": 0.64, + "learning_rate": 9.888069393993068e-07, + "logits/chosen": -1.6391324996948242, + "logits/rejected": -2.6511809825897217, + "logps/chosen": -357.7758483886719, + "logps/rejected": -136.27767944335938, + "loss": 0.1648, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.7596569061279297, + "rewards/margins": 3.358733892440796, + "rewards/rejected": -2.599076986312866, + "step": 382 + }, + { + "epoch": 0.65, + "learning_rate": 9.886515332295368e-07, + "logits/chosen": -1.600348711013794, + "logits/rejected": -2.2982192039489746, + "logps/chosen": -228.30441284179688, + "logps/rejected": -234.28158569335938, + "loss": 0.1545, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.35866472125053406, + "rewards/margins": 5.956897735595703, + "rewards/rejected": -5.598233222961426, + "step": 383 + }, + { + "epoch": 0.65, + "learning_rate": 9.88495068025281e-07, + "logits/chosen": -1.3065029382705688, + "logits/rejected": -1.0549356937408447, + "logps/chosen": -226.62106323242188, + "logps/rejected": -404.517822265625, + "loss": 0.2052, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.2328369617462158, + "rewards/margins": 17.27766227722168, + "rewards/rejected": -18.510498046875, + "step": 384 + }, + { + "epoch": 0.65, + "learning_rate": 9.883375441256397e-07, + "logits/chosen": -2.497364044189453, + "logits/rejected": -2.2807140350341797, + "logps/chosen": -79.43330383300781, + "logps/rejected": -1763.1298828125, + "loss": 0.2064, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.6368292570114136, + "rewards/margins": 4.40083646774292, + "rewards/rejected": -5.037665843963623, + "step": 385 + }, + { + "epoch": 0.65, + "learning_rate": 9.88178961872008e-07, + "logits/chosen": -2.6438419818878174, + "logits/rejected": -1.569690227508545, + "logps/chosen": -492.58660888671875, + "logps/rejected": -267.66607666015625, + "loss": 0.1912, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.091853380203247, + "rewards/margins": 10.429098129272461, + "rewards/rejected": -11.520952224731445, + "step": 386 + }, + { + "epoch": 0.65, + "learning_rate": 9.880193216080748e-07, + "logits/chosen": -1.5667064189910889, + "logits/rejected": -0.8056033253669739, + "logps/chosen": -272.8934020996094, + "logps/rejected": -404.6851806640625, + "loss": 0.1471, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.8850148916244507, + "rewards/margins": 8.984339714050293, + "rewards/rejected": -9.869355201721191, + "step": 387 + }, + { + "epoch": 0.65, + "learning_rate": 9.878586236798221e-07, + "logits/chosen": -1.8617156744003296, + "logits/rejected": -1.3088810443878174, + "logps/chosen": -240.18605041503906, + "logps/rejected": -271.2889099121094, + "loss": 0.1661, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.44331783056259155, + "rewards/margins": 11.63255500793457, + "rewards/rejected": -12.075872421264648, + "step": 388 + }, + { + "epoch": 0.66, + "learning_rate": 9.876968684355238e-07, + "logits/chosen": -0.60749751329422, + "logits/rejected": -0.9327036738395691, + "logps/chosen": -248.83444213867188, + "logps/rejected": -199.20504760742188, + "loss": 0.179, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.9339897632598877, + "rewards/margins": 9.368557929992676, + "rewards/rejected": -10.302547454833984, + "step": 389 + }, + { + "epoch": 0.66, + "learning_rate": 9.875340562257452e-07, + "logits/chosen": -1.6634929180145264, + "logits/rejected": -1.8427734375, + "logps/chosen": -225.20516967773438, + "logps/rejected": -224.0525665283203, + "loss": 0.159, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.1540244221687317, + "rewards/margins": 4.2535014152526855, + "rewards/rejected": -4.099477291107178, + "step": 390 + }, + { + "epoch": 0.66, + "learning_rate": 9.87370187403343e-07, + "logits/chosen": -1.5186893939971924, + "logits/rejected": -1.5003349781036377, + "logps/chosen": -355.7366638183594, + "logps/rejected": -268.9139404296875, + "loss": 0.1805, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.4280953407287598, + "rewards/margins": 10.353462219238281, + "rewards/rejected": -11.781557083129883, + "step": 391 + }, + { + "epoch": 0.66, + "learning_rate": 9.872052623234631e-07, + "logits/chosen": -1.2197469472885132, + "logits/rejected": -2.3034727573394775, + "logps/chosen": -305.0716552734375, + "logps/rejected": -119.30422973632812, + "loss": 0.1617, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.9030243158340454, + "rewards/margins": 6.021801471710205, + "rewards/rejected": -6.924825668334961, + "step": 392 + }, + { + "epoch": 0.66, + "learning_rate": 9.870392813435408e-07, + "logits/chosen": -1.6853370666503906, + "logits/rejected": -1.8906601667404175, + "logps/chosen": -54.065086364746094, + "logps/rejected": -109.49345397949219, + "loss": 0.183, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.0523476600646973, + "rewards/margins": 4.095767021179199, + "rewards/rejected": -5.1481146812438965, + "step": 393 + }, + { + "epoch": 0.66, + "learning_rate": 9.868722448233003e-07, + "logits/chosen": -1.3388574123382568, + "logits/rejected": -1.6647964715957642, + "logps/chosen": -239.39515686035156, + "logps/rejected": -119.59913635253906, + "loss": 0.1716, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.1848201751708984, + "rewards/margins": 4.421016693115234, + "rewards/rejected": -5.605837345123291, + "step": 394 + }, + { + "epoch": 0.67, + "learning_rate": 9.867041531247524e-07, + "logits/chosen": -1.7558951377868652, + "logits/rejected": -1.7084178924560547, + "logps/chosen": -49.69265365600586, + "logps/rejected": -83.90230560302734, + "loss": 0.2227, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.1897386610507965, + "rewards/margins": 3.7224905490875244, + "rewards/rejected": -3.912229061126709, + "step": 395 + }, + { + "epoch": 0.67, + "learning_rate": 9.86535006612196e-07, + "logits/chosen": -1.0474004745483398, + "logits/rejected": -0.7698359489440918, + "logps/chosen": -255.73220825195312, + "logps/rejected": -224.53810119628906, + "loss": 0.129, + "rewards/accuracies": 0.5, + "rewards/chosen": -3.990817070007324, + "rewards/margins": 3.028542995452881, + "rewards/rejected": -7.019360065460205, + "step": 396 + }, + { + "epoch": 0.67, + "learning_rate": 9.86364805652215e-07, + "logits/chosen": -1.4001511335372925, + "logits/rejected": -0.2056565284729004, + "logps/chosen": -338.36224365234375, + "logps/rejected": -943.743408203125, + "loss": 0.1818, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.8232328295707703, + "rewards/margins": 8.953399658203125, + "rewards/rejected": -9.776632308959961, + "step": 397 + }, + { + "epoch": 0.67, + "learning_rate": 9.861935506136793e-07, + "logits/chosen": -2.1997809410095215, + "logits/rejected": -2.1212539672851562, + "logps/chosen": -115.73619079589844, + "logps/rejected": -155.65628051757812, + "loss": 0.1635, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.3197250366210938, + "rewards/margins": 5.0713791847229, + "rewards/rejected": -6.391103744506836, + "step": 398 + }, + { + "epoch": 0.67, + "learning_rate": 9.860212418677425e-07, + "logits/chosen": -1.3245618343353271, + "logits/rejected": -1.5838465690612793, + "logps/chosen": -192.42564392089844, + "logps/rejected": -121.29484558105469, + "loss": 0.1951, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.7826123237609863, + "rewards/margins": 6.154439926147461, + "rewards/rejected": -5.371828079223633, + "step": 399 + }, + { + "epoch": 0.67, + "learning_rate": 9.858478797878428e-07, + "logits/chosen": -1.1259046792984009, + "logits/rejected": -1.549862265586853, + "logps/chosen": -599.236328125, + "logps/rejected": -300.1288757324219, + "loss": 0.1937, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.30740663409233093, + "rewards/margins": 7.251522064208984, + "rewards/rejected": -6.94411563873291, + "step": 400 + }, + { + "epoch": 0.68, + "learning_rate": 9.856734647497004e-07, + "logits/chosen": -1.8423173427581787, + "logits/rejected": -1.7820255756378174, + "logps/chosen": -338.8928527832031, + "logps/rejected": -427.98590087890625, + "loss": 0.1951, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.7935646176338196, + "rewards/margins": 1.3270835876464844, + "rewards/rejected": -2.120648145675659, + "step": 401 + }, + { + "epoch": 0.68, + "learning_rate": 9.854979971313182e-07, + "logits/chosen": -1.5276292562484741, + "logits/rejected": -1.362746000289917, + "logps/chosen": -153.15682983398438, + "logps/rejected": -178.7244873046875, + "loss": 0.1921, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.658955693244934, + "rewards/margins": 6.521320343017578, + "rewards/rejected": -8.180275917053223, + "step": 402 + }, + { + "epoch": 0.68, + "learning_rate": 9.853214773129795e-07, + "logits/chosen": -1.574330449104309, + "logits/rejected": -1.6447815895080566, + "logps/chosen": -203.70518493652344, + "logps/rejected": -186.30174255371094, + "loss": 0.1799, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.5951843857765198, + "rewards/margins": 10.695902824401855, + "rewards/rejected": -11.29108715057373, + "step": 403 + }, + { + "epoch": 0.68, + "learning_rate": 9.851439056772488e-07, + "logits/chosen": -2.2148728370666504, + "logits/rejected": -0.5473410487174988, + "logps/chosen": -114.89361572265625, + "logps/rejected": -270.749755859375, + "loss": 0.1814, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.34365734457969666, + "rewards/margins": 10.512922286987305, + "rewards/rejected": -10.856579780578613, + "step": 404 + }, + { + "epoch": 0.68, + "learning_rate": 9.8496528260897e-07, + "logits/chosen": -1.523105502128601, + "logits/rejected": -0.9805685877799988, + "logps/chosen": -263.91705322265625, + "logps/rejected": -362.019287109375, + "loss": 0.1232, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.9856665134429932, + "rewards/margins": 13.921882629394531, + "rewards/rejected": -15.907548904418945, + "step": 405 + }, + { + "epoch": 0.68, + "learning_rate": 9.847856084952652e-07, + "logits/chosen": -1.4170777797698975, + "logits/rejected": -1.4437755346298218, + "logps/chosen": -71.64923858642578, + "logps/rejected": -111.52011108398438, + "loss": 0.1337, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.24141913652420044, + "rewards/margins": 1.8372013568878174, + "rewards/rejected": -2.078620433807373, + "step": 406 + }, + { + "epoch": 0.69, + "learning_rate": 9.846048837255353e-07, + "logits/chosen": -1.8761231899261475, + "logits/rejected": -1.4860190153121948, + "logps/chosen": -321.606689453125, + "logps/rejected": -248.3543701171875, + "loss": 0.1918, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.1618309020996094, + "rewards/margins": 8.921380996704102, + "rewards/rejected": -10.083211898803711, + "step": 407 + }, + { + "epoch": 0.69, + "learning_rate": 9.84423108691457e-07, + "logits/chosen": -1.91605544090271, + "logits/rejected": -1.6970082521438599, + "logps/chosen": -59.132293701171875, + "logps/rejected": -167.10081481933594, + "loss": 0.2158, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.300628423690796, + "rewards/margins": 9.486973762512207, + "rewards/rejected": -10.787602424621582, + "step": 408 + }, + { + "epoch": 0.69, + "learning_rate": 9.842402837869842e-07, + "logits/chosen": -0.6720188856124878, + "logits/rejected": -1.0757673978805542, + "logps/chosen": -532.5537109375, + "logps/rejected": -276.9455261230469, + "loss": 0.1982, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.42199867963790894, + "rewards/margins": 12.428057670593262, + "rewards/rejected": -12.850056648254395, + "step": 409 + }, + { + "epoch": 0.69, + "learning_rate": 9.84056409408346e-07, + "logits/chosen": -1.7832905054092407, + "logits/rejected": -2.1222946643829346, + "logps/chosen": -311.7086181640625, + "logps/rejected": -320.260009765625, + "loss": 0.2174, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.5798829793930054, + "rewards/margins": 5.759591102600098, + "rewards/rejected": -7.339474201202393, + "step": 410 + }, + { + "epoch": 0.69, + "learning_rate": 9.838714859540458e-07, + "logits/chosen": -1.6252554655075073, + "logits/rejected": -2.4704833030700684, + "logps/chosen": -273.0162048339844, + "logps/rejected": -206.291015625, + "loss": 0.1782, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.6414878964424133, + "rewards/margins": 7.542596817016602, + "rewards/rejected": -8.18408489227295, + "step": 411 + }, + { + "epoch": 0.69, + "learning_rate": 9.836855138248602e-07, + "logits/chosen": -1.5945271253585815, + "logits/rejected": -1.039263129234314, + "logps/chosen": -213.08566284179688, + "logps/rejected": -276.2701416015625, + "loss": 0.2015, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.5801552534103394, + "rewards/margins": 4.361068248748779, + "rewards/rejected": -5.941223621368408, + "step": 412 + }, + { + "epoch": 0.7, + "learning_rate": 9.834984934238397e-07, + "logits/chosen": -2.0274336338043213, + "logits/rejected": -1.1639341115951538, + "logps/chosen": -224.14974975585938, + "logps/rejected": -492.0933532714844, + "loss": 0.1541, + "rewards/accuracies": 0.0, + "rewards/chosen": -1.5109009742736816, + "rewards/margins": -0.4450409412384033, + "rewards/rejected": -1.0658600330352783, + "step": 413 + }, + { + "epoch": 0.7, + "learning_rate": 9.833104251563055e-07, + "logits/chosen": -1.1577857732772827, + "logits/rejected": -1.7103009223937988, + "logps/chosen": -294.07470703125, + "logps/rejected": -189.33216857910156, + "loss": 0.1796, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.6231157779693604, + "rewards/margins": 8.794317245483398, + "rewards/rejected": -10.41743278503418, + "step": 414 + }, + { + "epoch": 0.7, + "learning_rate": 9.831213094298504e-07, + "logits/chosen": -2.4059667587280273, + "logits/rejected": -2.134012222290039, + "logps/chosen": -89.21797180175781, + "logps/rejected": -205.1034393310547, + "loss": 0.2001, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.15212784707546234, + "rewards/margins": 1.837868571281433, + "rewards/rejected": -1.9899964332580566, + "step": 415 + }, + { + "epoch": 0.7, + "learning_rate": 9.829311466543372e-07, + "logits/chosen": -0.7545611262321472, + "logits/rejected": -2.312044382095337, + "logps/chosen": -298.3918151855469, + "logps/rejected": -100.84245300292969, + "loss": 0.1934, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.2649837732315063, + "rewards/margins": 4.35391902923584, + "rewards/rejected": -5.618902683258057, + "step": 416 + }, + { + "epoch": 0.7, + "learning_rate": 9.827399372418978e-07, + "logits/chosen": -1.689001202583313, + "logits/rejected": -1.7924022674560547, + "logps/chosen": -490.21630859375, + "logps/rejected": -572.5187377929688, + "loss": 0.1589, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.4802818298339844, + "rewards/margins": 1.4142119884490967, + "rewards/rejected": -1.894493818283081, + "step": 417 + }, + { + "epoch": 0.7, + "learning_rate": 9.825476816069325e-07, + "logits/chosen": -1.687720537185669, + "logits/rejected": -2.245692014694214, + "logps/chosen": -252.95947265625, + "logps/rejected": -272.66729736328125, + "loss": 0.1749, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.0029993057250977, + "rewards/margins": 6.877737522125244, + "rewards/rejected": -7.880736827850342, + "step": 418 + }, + { + "epoch": 0.71, + "learning_rate": 9.823543801661093e-07, + "logits/chosen": -1.327947735786438, + "logits/rejected": -2.4989848136901855, + "logps/chosen": -171.38914489746094, + "logps/rejected": -114.95437622070312, + "loss": 0.1847, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.1186927556991577, + "rewards/margins": 3.9151296615600586, + "rewards/rejected": -5.033822536468506, + "step": 419 + }, + { + "epoch": 0.71, + "learning_rate": 9.821600333383624e-07, + "logits/chosen": -1.2999866008758545, + "logits/rejected": -1.940646767616272, + "logps/chosen": -404.37384033203125, + "logps/rejected": -225.85601806640625, + "loss": 0.18, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.6930325031280518, + "rewards/margins": 9.95132827758789, + "rewards/rejected": -11.64436149597168, + "step": 420 + }, + { + "epoch": 0.71, + "learning_rate": 9.819646415448917e-07, + "logits/chosen": -0.8139424324035645, + "logits/rejected": -1.0725926160812378, + "logps/chosen": -449.7269287109375, + "logps/rejected": -254.24598693847656, + "loss": 0.1662, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.6415162086486816, + "rewards/margins": 9.845947265625, + "rewards/rejected": -11.487462997436523, + "step": 421 + }, + { + "epoch": 0.71, + "learning_rate": 9.817682052091617e-07, + "logits/chosen": -1.257497787475586, + "logits/rejected": -1.384131908416748, + "logps/chosen": -306.87060546875, + "logps/rejected": -237.9322509765625, + "loss": 0.165, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.43299102783203125, + "rewards/margins": 6.2627716064453125, + "rewards/rejected": -5.829780578613281, + "step": 422 + }, + { + "epoch": 0.71, + "learning_rate": 9.815707247569012e-07, + "logits/chosen": -1.3054028749465942, + "logits/rejected": -1.9904706478118896, + "logps/chosen": -199.9625244140625, + "logps/rejected": -174.94766235351562, + "loss": 0.2305, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.0188324451446533, + "rewards/margins": 9.714458465576172, + "rewards/rejected": -10.733290672302246, + "step": 423 + }, + { + "epoch": 0.72, + "learning_rate": 9.81372200616101e-07, + "logits/chosen": -1.711554765701294, + "logits/rejected": -1.9116215705871582, + "logps/chosen": -244.9205780029297, + "logps/rejected": -235.5703125, + "loss": 0.1702, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.49955570697784424, + "rewards/margins": 6.994556427001953, + "rewards/rejected": -7.494112014770508, + "step": 424 + }, + { + "epoch": 0.72, + "learning_rate": 9.81172633217015e-07, + "logits/chosen": -1.4319877624511719, + "logits/rejected": -1.4931282997131348, + "logps/chosen": -227.3060760498047, + "logps/rejected": -265.25616455078125, + "loss": 0.2099, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.4414575099945068, + "rewards/margins": 11.510025024414062, + "rewards/rejected": -12.951482772827148, + "step": 425 + }, + { + "epoch": 0.72, + "learning_rate": 9.809720229921572e-07, + "logits/chosen": -1.3125286102294922, + "logits/rejected": -1.061862587928772, + "logps/chosen": -175.32794189453125, + "logps/rejected": -224.84396362304688, + "loss": 0.1759, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.008734322153031826, + "rewards/margins": 4.605374336242676, + "rewards/rejected": -4.614108562469482, + "step": 426 + }, + { + "epoch": 0.72, + "learning_rate": 9.807703703763015e-07, + "logits/chosen": -1.8269639015197754, + "logits/rejected": -1.7482253313064575, + "logps/chosen": -33.005889892578125, + "logps/rejected": -88.10383605957031, + "loss": 0.1665, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.19627046585083008, + "rewards/margins": 3.915902614593506, + "rewards/rejected": -4.112173080444336, + "step": 427 + }, + { + "epoch": 0.72, + "learning_rate": 9.80567675806482e-07, + "logits/chosen": -1.4262571334838867, + "logits/rejected": -1.5540329217910767, + "logps/chosen": -73.27861785888672, + "logps/rejected": -102.44043731689453, + "loss": 0.165, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.6107984781265259, + "rewards/margins": 0.42136502265930176, + "rewards/rejected": -2.032163619995117, + "step": 428 + }, + { + "epoch": 0.72, + "learning_rate": 9.8036393972199e-07, + "logits/chosen": -2.002448320388794, + "logits/rejected": -2.299851655960083, + "logps/chosen": -87.68385314941406, + "logps/rejected": -85.51641845703125, + "loss": 0.1533, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.3282948732376099, + "rewards/margins": 2.7130818367004395, + "rewards/rejected": -4.04137659072876, + "step": 429 + }, + { + "epoch": 0.73, + "learning_rate": 9.801591625643743e-07, + "logits/chosen": -0.7014894485473633, + "logits/rejected": -1.4072990417480469, + "logps/chosen": -695.2222900390625, + "logps/rejected": -328.1853942871094, + "loss": 0.1957, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.7655518054962158, + "rewards/margins": 12.59501838684082, + "rewards/rejected": -14.360569953918457, + "step": 430 + }, + { + "epoch": 0.73, + "learning_rate": 9.799533447774404e-07, + "logits/chosen": -1.8441977500915527, + "logits/rejected": -2.370758533477783, + "logps/chosen": -488.61370849609375, + "logps/rejected": -225.34573364257812, + "loss": 0.1664, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.5413681268692017, + "rewards/margins": 6.072797775268555, + "rewards/rejected": -6.614165782928467, + "step": 431 + }, + { + "epoch": 0.73, + "learning_rate": 9.797464868072486e-07, + "logits/chosen": -1.492543339729309, + "logits/rejected": -1.383135437965393, + "logps/chosen": -128.7510223388672, + "logps/rejected": -252.252685546875, + "loss": 0.1783, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.5648261904716492, + "rewards/margins": 2.220522880554199, + "rewards/rejected": -2.7853493690490723, + "step": 432 + }, + { + "epoch": 0.73, + "learning_rate": 9.795385891021136e-07, + "logits/chosen": -1.3558893203735352, + "logits/rejected": -2.4246795177459717, + "logps/chosen": -88.99250793457031, + "logps/rejected": -69.09440612792969, + "loss": 0.137, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.3191506564617157, + "rewards/margins": 4.714672565460205, + "rewards/rejected": -5.033823013305664, + "step": 433 + }, + { + "epoch": 0.73, + "learning_rate": 9.79329652112604e-07, + "logits/chosen": -2.187133312225342, + "logits/rejected": -2.0523133277893066, + "logps/chosen": -237.15127563476562, + "logps/rejected": -257.36614990234375, + "loss": 0.2559, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.08025474846363068, + "rewards/margins": 6.414705276489258, + "rewards/rejected": -6.494959831237793, + "step": 434 + }, + { + "epoch": 0.73, + "learning_rate": 9.7911967629154e-07, + "logits/chosen": -0.5673585534095764, + "logits/rejected": -0.7198299169540405, + "logps/chosen": -272.7919921875, + "logps/rejected": -180.43826293945312, + "loss": 0.1932, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.3921341001987457, + "rewards/margins": 10.290606498718262, + "rewards/rejected": -9.89847183227539, + "step": 435 + }, + { + "epoch": 0.74, + "learning_rate": 9.789086620939935e-07, + "logits/chosen": -1.5646958351135254, + "logits/rejected": -1.3017683029174805, + "logps/chosen": -246.85928344726562, + "logps/rejected": -270.7761535644531, + "loss": 0.2036, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.7477285265922546, + "rewards/margins": 11.448813438415527, + "rewards/rejected": -12.196542739868164, + "step": 436 + }, + { + "epoch": 0.74, + "learning_rate": 9.786966099772873e-07, + "logits/chosen": -1.1668096780776978, + "logits/rejected": -1.150396466255188, + "logps/chosen": -214.652099609375, + "logps/rejected": -179.91836547851562, + "loss": 0.1923, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.1188222169876099, + "rewards/margins": 8.251716613769531, + "rewards/rejected": -9.370538711547852, + "step": 437 + }, + { + "epoch": 0.74, + "learning_rate": 9.784835204009932e-07, + "logits/chosen": -0.9191622734069824, + "logits/rejected": -0.8720898032188416, + "logps/chosen": -184.92555236816406, + "logps/rejected": -170.7421112060547, + "loss": 0.1958, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.486758828163147, + "rewards/margins": 5.754380702972412, + "rewards/rejected": -7.2411394119262695, + "step": 438 + }, + { + "epoch": 0.74, + "learning_rate": 9.782693938269312e-07, + "logits/chosen": -1.8319122791290283, + "logits/rejected": -1.2807798385620117, + "logps/chosen": -209.416259765625, + "logps/rejected": -269.36041259765625, + "loss": 0.1844, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.37221360206604, + "rewards/margins": 13.217639923095703, + "rewards/rejected": -14.589853286743164, + "step": 439 + }, + { + "epoch": 0.74, + "learning_rate": 9.780542307191697e-07, + "logits/chosen": -1.5643036365509033, + "logits/rejected": -1.2528153657913208, + "logps/chosen": -608.1365356445312, + "logps/rejected": -491.44134521484375, + "loss": 0.1582, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.6126511096954346, + "rewards/margins": 10.805754661560059, + "rewards/rejected": -11.418405532836914, + "step": 440 + }, + { + "epoch": 0.74, + "learning_rate": 9.778380315440223e-07, + "logits/chosen": -1.5918906927108765, + "logits/rejected": -1.0028258562088013, + "logps/chosen": -284.98736572265625, + "logps/rejected": -302.61846923828125, + "loss": 0.187, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.8316802978515625, + "rewards/margins": 8.248760223388672, + "rewards/rejected": -9.080440521240234, + "step": 441 + }, + { + "epoch": 0.75, + "learning_rate": 9.776207967700489e-07, + "logits/chosen": -2.523466110229492, + "logits/rejected": -1.5323551893234253, + "logps/chosen": -88.10340881347656, + "logps/rejected": -281.467041015625, + "loss": 0.1714, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.7970980405807495, + "rewards/margins": 10.560657501220703, + "rewards/rejected": -11.357755661010742, + "step": 442 + }, + { + "epoch": 0.75, + "learning_rate": 9.774025268680538e-07, + "logits/chosen": -1.2854011058807373, + "logits/rejected": -1.61070716381073, + "logps/chosen": -567.8692016601562, + "logps/rejected": -314.9006042480469, + "loss": 0.1934, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.4001374244689941, + "rewards/margins": 9.382172584533691, + "rewards/rejected": -10.782309532165527, + "step": 443 + }, + { + "epoch": 0.75, + "learning_rate": 9.77183222311084e-07, + "logits/chosen": -1.8872003555297852, + "logits/rejected": -1.3685364723205566, + "logps/chosen": -155.8271942138672, + "logps/rejected": -398.8939208984375, + "loss": 0.1818, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.7372512817382812, + "rewards/margins": 0.28054046630859375, + "rewards/rejected": -1.017791748046875, + "step": 444 + }, + { + "epoch": 0.75, + "learning_rate": 9.769628835744292e-07, + "logits/chosen": -2.153257131576538, + "logits/rejected": -2.538587808609009, + "logps/chosen": -219.20547485351562, + "logps/rejected": -285.2276611328125, + "loss": 0.1725, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.1647541522979736, + "rewards/margins": 6.780641555786133, + "rewards/rejected": -7.945395469665527, + "step": 445 + }, + { + "epoch": 0.75, + "learning_rate": 9.767415111356208e-07, + "logits/chosen": -1.6822693347930908, + "logits/rejected": -1.125065565109253, + "logps/chosen": -213.1407928466797, + "logps/rejected": -248.02450561523438, + "loss": 0.1815, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.2097599506378174, + "rewards/margins": 7.717904090881348, + "rewards/rejected": -8.927663803100586, + "step": 446 + }, + { + "epoch": 0.75, + "learning_rate": 9.765191054744304e-07, + "logits/chosen": -0.9882490038871765, + "logits/rejected": -1.6791247129440308, + "logps/chosen": -272.322998046875, + "logps/rejected": -146.1826171875, + "loss": 0.1619, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.7433132529258728, + "rewards/margins": 5.633927345275879, + "rewards/rejected": -6.377241134643555, + "step": 447 + }, + { + "epoch": 0.76, + "learning_rate": 9.762956670728683e-07, + "logits/chosen": -1.6359366178512573, + "logits/rejected": -1.28961181640625, + "logps/chosen": -62.76860046386719, + "logps/rejected": -229.94815063476562, + "loss": 0.1326, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.22195053100585938, + "rewards/margins": 12.753116607666016, + "rewards/rejected": -12.975067138671875, + "step": 448 + }, + { + "epoch": 0.76, + "learning_rate": 9.76071196415184e-07, + "logits/chosen": -1.9846500158309937, + "logits/rejected": -1.8391332626342773, + "logps/chosen": -223.3437042236328, + "logps/rejected": -206.13235473632812, + "loss": 0.1572, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.34529823064804077, + "rewards/margins": 8.844955444335938, + "rewards/rejected": -9.190253257751465, + "step": 449 + }, + { + "epoch": 0.76, + "learning_rate": 9.758456939878629e-07, + "logits/chosen": -0.6676144003868103, + "logits/rejected": -0.5327748656272888, + "logps/chosen": -334.7253723144531, + "logps/rejected": -288.2399597167969, + "loss": 0.1739, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.5323821902275085, + "rewards/margins": 7.189955234527588, + "rewards/rejected": -7.72233772277832, + "step": 450 + }, + { + "epoch": 0.76, + "learning_rate": 9.756191602796275e-07, + "logits/chosen": -0.7316077947616577, + "logits/rejected": -1.6688975095748901, + "logps/chosen": -526.944580078125, + "logps/rejected": -288.75018310546875, + "loss": 0.1576, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.8874114751815796, + "rewards/margins": 8.355914115905762, + "rewards/rejected": -9.243326187133789, + "step": 451 + }, + { + "epoch": 0.76, + "learning_rate": 9.753915957814352e-07, + "logits/chosen": -1.136466145515442, + "logits/rejected": -1.8164993524551392, + "logps/chosen": -503.8302307128906, + "logps/rejected": -425.62176513671875, + "loss": 0.1423, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.10046082735061646, + "rewards/margins": 1.3869491815567017, + "rewards/rejected": -1.2864882946014404, + "step": 452 + }, + { + "epoch": 0.76, + "learning_rate": 9.751630009864768e-07, + "logits/chosen": -0.9611995816230774, + "logits/rejected": -1.4471514225006104, + "logps/chosen": -530.7092895507812, + "logps/rejected": -166.03952026367188, + "loss": 0.2416, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.7246857285499573, + "rewards/margins": 6.8713698387146, + "rewards/rejected": -7.596055507659912, + "step": 453 + }, + { + "epoch": 0.77, + "learning_rate": 9.74933376390177e-07, + "logits/chosen": -1.8776335716247559, + "logits/rejected": -2.2231943607330322, + "logps/chosen": -299.9205627441406, + "logps/rejected": -328.975341796875, + "loss": 0.1723, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.1313549131155014, + "rewards/margins": 5.5653581619262695, + "rewards/rejected": -5.696713447570801, + "step": 454 + }, + { + "epoch": 0.77, + "learning_rate": 9.747027224901912e-07, + "logits/chosen": -1.4517310857772827, + "logits/rejected": -1.5876500606536865, + "logps/chosen": -20.67554473876953, + "logps/rejected": -94.16972351074219, + "loss": 0.2211, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.09321515262126923, + "rewards/margins": 4.643195629119873, + "rewards/rejected": -4.736410617828369, + "step": 455 + }, + { + "epoch": 0.77, + "learning_rate": 9.744710397864066e-07, + "logits/chosen": -2.7332839965820312, + "logits/rejected": -1.3261853456497192, + "logps/chosen": -23.798072814941406, + "logps/rejected": -210.81732177734375, + "loss": 0.1521, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.3627491295337677, + "rewards/margins": 9.340432167053223, + "rewards/rejected": -9.703181266784668, + "step": 456 + }, + { + "epoch": 0.77, + "learning_rate": 9.742383287809396e-07, + "logits/chosen": -1.964377999305725, + "logits/rejected": -1.4948861598968506, + "logps/chosen": -281.6620788574219, + "logps/rejected": -280.5074157714844, + "loss": 0.2045, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.4957534670829773, + "rewards/margins": 11.540742874145508, + "rewards/rejected": -12.036495208740234, + "step": 457 + }, + { + "epoch": 0.77, + "learning_rate": 9.740045899781352e-07, + "logits/chosen": -0.5110257863998413, + "logits/rejected": -0.6007272601127625, + "logps/chosen": -226.26109313964844, + "logps/rejected": -171.11524963378906, + "loss": 0.168, + "rewards/accuracies": 0.0, + "rewards/chosen": -2.9013428688049316, + "rewards/margins": -1.4838972091674805, + "rewards/rejected": -1.4174456596374512, + "step": 458 + }, + { + "epoch": 0.77, + "learning_rate": 9.737698238845658e-07, + "logits/chosen": -1.7447842359542847, + "logits/rejected": -2.5760953426361084, + "logps/chosen": -257.5279846191406, + "logps/rejected": -195.7313995361328, + "loss": 0.1909, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.032274626195430756, + "rewards/margins": 7.554751396179199, + "rewards/rejected": -7.5870256423950195, + "step": 459 + }, + { + "epoch": 0.78, + "learning_rate": 9.735340310090306e-07, + "logits/chosen": -1.7694307565689087, + "logits/rejected": -2.6311557292938232, + "logps/chosen": -307.8857116699219, + "logps/rejected": -171.5406494140625, + "loss": 0.1577, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.29933395981788635, + "rewards/margins": 8.812219619750977, + "rewards/rejected": -9.111554145812988, + "step": 460 + }, + { + "epoch": 0.78, + "learning_rate": 9.732972118625536e-07, + "logits/chosen": -1.408591866493225, + "logits/rejected": -1.5779300928115845, + "logps/chosen": -169.06919860839844, + "logps/rejected": -177.49819946289062, + "loss": 0.1833, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.2286117523908615, + "rewards/margins": 0.6607635617256165, + "rewards/rejected": -0.8893753290176392, + "step": 461 + }, + { + "epoch": 0.78, + "learning_rate": 9.730593669583835e-07, + "logits/chosen": -1.7068212032318115, + "logits/rejected": -1.9184765815734863, + "logps/chosen": -70.21788024902344, + "logps/rejected": -134.25051879882812, + "loss": 0.1484, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.1888481080532074, + "rewards/margins": 5.6896843910217285, + "rewards/rejected": -5.500836372375488, + "step": 462 + }, + { + "epoch": 0.78, + "learning_rate": 9.728204968119915e-07, + "logits/chosen": -1.1536794900894165, + "logits/rejected": -1.827487826347351, + "logps/chosen": -354.2060546875, + "logps/rejected": -361.4664001464844, + "loss": 0.1828, + "rewards/accuracies": 0.5, + "rewards/chosen": -2.5781145095825195, + "rewards/margins": 4.080938339233398, + "rewards/rejected": -6.659052848815918, + "step": 463 + }, + { + "epoch": 0.78, + "learning_rate": 9.725806019410717e-07, + "logits/chosen": -0.7136868834495544, + "logits/rejected": -0.650567352771759, + "logps/chosen": -158.14785766601562, + "logps/rejected": -164.43289184570312, + "loss": 0.1672, + "rewards/accuracies": 0.0, + "rewards/chosen": -2.082798719406128, + "rewards/margins": -0.6656165719032288, + "rewards/rejected": -1.417182207107544, + "step": 464 + }, + { + "epoch": 0.78, + "learning_rate": 9.723396828655376e-07, + "logits/chosen": -1.1137374639511108, + "logits/rejected": -0.8390330076217651, + "logps/chosen": -389.09088134765625, + "logps/rejected": -321.860107421875, + "loss": 0.1468, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.09591063857078552, + "rewards/margins": 12.648015022277832, + "rewards/rejected": -12.743925094604492, + "step": 465 + }, + { + "epoch": 0.79, + "learning_rate": 9.72097740107524e-07, + "logits/chosen": -1.2167657613754272, + "logits/rejected": -1.5186375379562378, + "logps/chosen": -69.07684326171875, + "logps/rejected": -136.32656860351562, + "loss": 0.1958, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.024441910907626152, + "rewards/margins": 7.298024654388428, + "rewards/rejected": -7.273582458496094, + "step": 466 + }, + { + "epoch": 0.79, + "learning_rate": 9.718547741913833e-07, + "logits/chosen": -1.3299391269683838, + "logits/rejected": -1.1457806825637817, + "logps/chosen": -497.37103271484375, + "logps/rejected": -386.53143310546875, + "loss": 0.186, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.3952667713165283, + "rewards/margins": 8.335859298706055, + "rewards/rejected": -6.940592288970947, + "step": 467 + }, + { + "epoch": 0.79, + "learning_rate": 9.716107856436855e-07, + "logits/chosen": -1.1360148191452026, + "logits/rejected": -0.6608507037162781, + "logps/chosen": -247.17117309570312, + "logps/rejected": -258.4505920410156, + "loss": 0.2208, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.1492900848388672, + "rewards/margins": 6.689465045928955, + "rewards/rejected": -7.838755130767822, + "step": 468 + }, + { + "epoch": 0.79, + "learning_rate": 9.713657749932171e-07, + "logits/chosen": -0.8903838396072388, + "logits/rejected": -1.1991342306137085, + "logps/chosen": -475.362548828125, + "logps/rejected": -285.0642395019531, + "loss": 0.1696, + "rewards/accuracies": 0.5, + "rewards/chosen": -2.755786180496216, + "rewards/margins": 7.545994281768799, + "rewards/rejected": -10.301780700683594, + "step": 469 + }, + { + "epoch": 0.79, + "learning_rate": 9.711197427709795e-07, + "logits/chosen": -0.6181639432907104, + "logits/rejected": -0.9941724538803101, + "logps/chosen": -130.269775390625, + "logps/rejected": -202.236083984375, + "loss": 0.1921, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.045879364013671875, + "rewards/margins": 6.135862827301025, + "rewards/rejected": -6.181741714477539, + "step": 470 + }, + { + "epoch": 0.79, + "learning_rate": 9.708726895101885e-07, + "logits/chosen": -0.6759887337684631, + "logits/rejected": -1.0811114311218262, + "logps/chosen": -349.7430725097656, + "logps/rejected": -202.4884490966797, + "loss": 0.1463, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.6895374059677124, + "rewards/margins": 10.514432907104492, + "rewards/rejected": -11.203970909118652, + "step": 471 + }, + { + "epoch": 0.8, + "learning_rate": 9.706246157462726e-07, + "logits/chosen": -1.6232417821884155, + "logits/rejected": -1.6543527841567993, + "logps/chosen": -154.53765869140625, + "logps/rejected": -209.14459228515625, + "loss": 0.2433, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.1862388849258423, + "rewards/margins": 2.6417124271392822, + "rewards/rejected": -3.827951431274414, + "step": 472 + }, + { + "epoch": 0.8, + "learning_rate": 9.703755220168714e-07, + "logits/chosen": -2.7903904914855957, + "logits/rejected": -1.3351569175720215, + "logps/chosen": -257.2842712402344, + "logps/rejected": -224.76751708984375, + "loss": 0.1928, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.061550140380859375, + "rewards/margins": 2.485539197921753, + "rewards/rejected": -2.4239890575408936, + "step": 473 + }, + { + "epoch": 0.8, + "learning_rate": 9.701254088618362e-07, + "logits/chosen": -0.9130764007568359, + "logits/rejected": -1.3383204936981201, + "logps/chosen": -381.96124267578125, + "logps/rejected": -193.50051879882812, + "loss": 0.1515, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.1674940139055252, + "rewards/margins": 8.00719928741455, + "rewards/rejected": -8.17469310760498, + "step": 474 + }, + { + "epoch": 0.8, + "learning_rate": 9.698742768232265e-07, + "logits/chosen": -0.8350385427474976, + "logits/rejected": -0.8087922930717468, + "logps/chosen": -260.8985900878906, + "logps/rejected": -207.9868927001953, + "loss": 0.1985, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.3379669189453125, + "rewards/margins": 6.639181137084961, + "rewards/rejected": -8.977148056030273, + "step": 475 + }, + { + "epoch": 0.8, + "learning_rate": 9.696221264453108e-07, + "logits/chosen": -1.8277158737182617, + "logits/rejected": -1.8809436559677124, + "logps/chosen": -102.64655303955078, + "logps/rejected": -81.0972671508789, + "loss": 0.1846, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.0536823272705078, + "rewards/margins": -0.04416823387145996, + "rewards/rejected": -1.0095140933990479, + "step": 476 + }, + { + "epoch": 0.8, + "learning_rate": 9.693689582745643e-07, + "logits/chosen": -2.200521230697632, + "logits/rejected": -2.2031776905059814, + "logps/chosen": -78.0069808959961, + "logps/rejected": -148.51446533203125, + "loss": 0.213, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.6362529397010803, + "rewards/margins": 3.200962781906128, + "rewards/rejected": -3.8372156620025635, + "step": 477 + }, + { + "epoch": 0.81, + "learning_rate": 9.691147728596681e-07, + "logits/chosen": -0.9372101426124573, + "logits/rejected": -0.8971385359764099, + "logps/chosen": -158.72555541992188, + "logps/rejected": -178.3800506591797, + "loss": 0.1383, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.3784370422363281, + "rewards/margins": 5.4618988037109375, + "rewards/rejected": -5.840336322784424, + "step": 478 + }, + { + "epoch": 0.81, + "learning_rate": 9.688595707515076e-07, + "logits/chosen": -2.139923572540283, + "logits/rejected": -1.6465723514556885, + "logps/chosen": -119.42794799804688, + "logps/rejected": -210.23606872558594, + "loss": 0.2018, + "rewards/accuracies": 0.5, + "rewards/chosen": -2.1089653968811035, + "rewards/margins": 4.937000274658203, + "rewards/rejected": -7.045965671539307, + "step": 479 + }, + { + "epoch": 0.81, + "learning_rate": 9.686033525031719e-07, + "logits/chosen": -1.4923748970031738, + "logits/rejected": -1.2859959602355957, + "logps/chosen": -226.31365966796875, + "logps/rejected": -233.12567138671875, + "loss": 0.1772, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.537055492401123, + "rewards/margins": 6.823518753051758, + "rewards/rejected": -8.360574722290039, + "step": 480 + }, + { + "epoch": 0.81, + "learning_rate": 9.683461186699524e-07, + "logits/chosen": -0.9652807116508484, + "logits/rejected": -1.830547571182251, + "logps/chosen": -774.6458129882812, + "logps/rejected": -302.51702880859375, + "loss": 0.185, + "rewards/accuracies": 0.5, + "rewards/chosen": -2.865536689758301, + "rewards/margins": 6.375579833984375, + "rewards/rejected": -9.241116523742676, + "step": 481 + }, + { + "epoch": 0.81, + "learning_rate": 9.680878698093415e-07, + "logits/chosen": -0.6591046452522278, + "logits/rejected": -1.157971739768982, + "logps/chosen": -304.5384216308594, + "logps/rejected": -185.4188232421875, + "loss": 0.1882, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.8371291756629944, + "rewards/margins": 6.621988296508789, + "rewards/rejected": -7.4591169357299805, + "step": 482 + }, + { + "epoch": 0.81, + "learning_rate": 9.678286064810316e-07, + "logits/chosen": -1.630854845046997, + "logits/rejected": -1.1944377422332764, + "logps/chosen": -63.372398376464844, + "logps/rejected": -118.29740905761719, + "loss": 0.2208, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.9332462549209595, + "rewards/margins": -0.14584654569625854, + "rewards/rejected": -0.7873997092247009, + "step": 483 + }, + { + "epoch": 0.82, + "learning_rate": 9.67568329246913e-07, + "logits/chosen": -1.5651147365570068, + "logits/rejected": -2.2136974334716797, + "logps/chosen": -356.625244140625, + "logps/rejected": -257.2489013671875, + "loss": 0.1678, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.220522880554199, + "rewards/margins": 11.928407669067383, + "rewards/rejected": -14.148930549621582, + "step": 484 + }, + { + "epoch": 0.82, + "learning_rate": 9.673070386710745e-07, + "logits/chosen": -0.5516372323036194, + "logits/rejected": -0.5296367406845093, + "logps/chosen": -12.047179222106934, + "logps/rejected": -111.52848052978516, + "loss": 0.1797, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.2791111469268799, + "rewards/margins": 8.235075950622559, + "rewards/rejected": -8.51418685913086, + "step": 485 + }, + { + "epoch": 0.82, + "learning_rate": 9.670447353197998e-07, + "logits/chosen": -1.5218621492385864, + "logits/rejected": -1.7300869226455688, + "logps/chosen": -215.33021545410156, + "logps/rejected": -353.5644226074219, + "loss": 0.1917, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.7259873151779175, + "rewards/margins": 16.542556762695312, + "rewards/rejected": -18.268545150756836, + "step": 486 + }, + { + "epoch": 0.82, + "learning_rate": 9.66781419761569e-07, + "logits/chosen": -1.3317276239395142, + "logits/rejected": -1.3344420194625854, + "logps/chosen": -116.13327026367188, + "logps/rejected": -97.93529510498047, + "loss": 0.1949, + "rewards/accuracies": 0.5, + "rewards/chosen": -3.2147207260131836, + "rewards/margins": -0.9069676399230957, + "rewards/rejected": -2.307753086090088, + "step": 487 + }, + { + "epoch": 0.82, + "learning_rate": 9.665170925670546e-07, + "logits/chosen": -1.387438416481018, + "logits/rejected": -1.5614135265350342, + "logps/chosen": -156.07591247558594, + "logps/rejected": -155.57327270507812, + "loss": 0.1721, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.0742095708847046, + "rewards/margins": 3.273390293121338, + "rewards/rejected": -4.347599983215332, + "step": 488 + }, + { + "epoch": 0.82, + "learning_rate": 9.662517543091224e-07, + "logits/chosen": -0.29915913939476013, + "logits/rejected": -1.622786521911621, + "logps/chosen": -332.6976013183594, + "logps/rejected": -172.04876708984375, + "loss": 0.1802, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.1141693592071533, + "rewards/margins": 10.140617370605469, + "rewards/rejected": -8.026447296142578, + "step": 489 + }, + { + "epoch": 0.83, + "learning_rate": 9.659854055628289e-07, + "logits/chosen": -2.113335371017456, + "logits/rejected": -1.7780506610870361, + "logps/chosen": -116.96385192871094, + "logps/rejected": -280.6581115722656, + "loss": 0.1313, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.7822120785713196, + "rewards/margins": 15.220739364624023, + "rewards/rejected": -16.00295066833496, + "step": 490 + }, + { + "epoch": 0.83, + "learning_rate": 9.657180469054212e-07, + "logits/chosen": -1.7113397121429443, + "logits/rejected": -1.1981744766235352, + "logps/chosen": -214.93051147460938, + "logps/rejected": -303.4566650390625, + "loss": 0.1392, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.7917399406433105, + "rewards/margins": 11.92519760131836, + "rewards/rejected": -13.716938018798828, + "step": 491 + }, + { + "epoch": 0.83, + "learning_rate": 9.654496789163343e-07, + "logits/chosen": -1.5774062871932983, + "logits/rejected": -1.3951590061187744, + "logps/chosen": -136.52076721191406, + "logps/rejected": -191.61000061035156, + "loss": 0.1411, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.8900161981582642, + "rewards/margins": 10.306234359741211, + "rewards/rejected": -11.196249961853027, + "step": 492 + }, + { + "epoch": 0.83, + "learning_rate": 9.651803021771917e-07, + "logits/chosen": -2.1784653663635254, + "logits/rejected": -1.2229230403900146, + "logps/chosen": -72.89522552490234, + "logps/rejected": -309.3623046875, + "loss": 0.1811, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.6526200771331787, + "rewards/margins": 7.078956604003906, + "rewards/rejected": -7.731576919555664, + "step": 493 + }, + { + "epoch": 0.83, + "learning_rate": 9.64909917271802e-07, + "logits/chosen": -0.9013615250587463, + "logits/rejected": -1.0959047079086304, + "logps/chosen": -426.6611022949219, + "logps/rejected": -216.3023681640625, + "loss": 0.1746, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.8864956498146057, + "rewards/margins": 9.015869140625, + "rewards/rejected": -9.902364730834961, + "step": 494 + }, + { + "epoch": 0.83, + "learning_rate": 9.6463852478616e-07, + "logits/chosen": -1.6150933504104614, + "logits/rejected": -2.258601427078247, + "logps/chosen": -96.07403564453125, + "logps/rejected": -62.08420181274414, + "loss": 0.1906, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.1480413675308228, + "rewards/margins": 1.0792427062988281, + "rewards/rejected": -2.2272841930389404, + "step": 495 + }, + { + "epoch": 0.84, + "learning_rate": 9.643661253084429e-07, + "logits/chosen": -0.35942748188972473, + "logits/rejected": -1.3206900358200073, + "logps/chosen": -472.96929931640625, + "logps/rejected": -234.10690307617188, + "loss": 0.1597, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.2673812806606293, + "rewards/margins": 13.789386749267578, + "rewards/rejected": -13.522006034851074, + "step": 496 + }, + { + "epoch": 0.84, + "learning_rate": 9.640927194290116e-07, + "logits/chosen": -1.594843864440918, + "logits/rejected": -1.8921819925308228, + "logps/chosen": -260.1965026855469, + "logps/rejected": -257.98199462890625, + "loss": 0.1985, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.7323309183120728, + "rewards/margins": 6.5982561111450195, + "rewards/rejected": -7.330586910247803, + "step": 497 + }, + { + "epoch": 0.84, + "learning_rate": 9.638183077404068e-07, + "logits/chosen": -2.052978754043579, + "logits/rejected": -1.9758281707763672, + "logps/chosen": -19.00727081298828, + "logps/rejected": -73.09623718261719, + "loss": 0.1491, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.739374041557312, + "rewards/margins": 2.3671977519989014, + "rewards/rejected": -3.106571674346924, + "step": 498 + }, + { + "epoch": 0.84, + "learning_rate": 9.635428908373502e-07, + "logits/chosen": -1.1580097675323486, + "logits/rejected": -1.896628975868225, + "logps/chosen": -570.9942626953125, + "logps/rejected": -265.4372863769531, + "loss": 0.1812, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.3030930757522583, + "rewards/margins": 11.787357330322266, + "rewards/rejected": -13.090450286865234, + "step": 499 + }, + { + "epoch": 0.84, + "learning_rate": 9.632664693167416e-07, + "logits/chosen": -1.720935583114624, + "logits/rejected": -1.7201050519943237, + "logps/chosen": -355.47833251953125, + "logps/rejected": -418.269775390625, + "loss": 0.1801, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.297267198562622, + "rewards/margins": 9.594414710998535, + "rewards/rejected": -10.891682624816895, + "step": 500 + }, + { + "epoch": 0.84, + "learning_rate": 9.629890437776579e-07, + "logits/chosen": -1.9035245180130005, + "logits/rejected": -1.9480910301208496, + "logps/chosen": -131.1505889892578, + "logps/rejected": -121.52857208251953, + "loss": 0.1662, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.5765511989593506, + "rewards/margins": 5.626952171325684, + "rewards/rejected": -5.050400733947754, + "step": 501 + }, + { + "epoch": 0.85, + "learning_rate": 9.62710614821352e-07, + "logits/chosen": -0.8639505505561829, + "logits/rejected": -0.9645960927009583, + "logps/chosen": -76.16290283203125, + "logps/rejected": -115.89288330078125, + "loss": 0.2082, + "rewards/accuracies": 0.5, + "rewards/chosen": -2.1743850708007812, + "rewards/margins": 3.6736721992492676, + "rewards/rejected": -5.848057270050049, + "step": 502 + }, + { + "epoch": 0.85, + "learning_rate": 9.624311830512519e-07, + "logits/chosen": -1.9524139165878296, + "logits/rejected": -2.4563539028167725, + "logps/chosen": -120.10502624511719, + "logps/rejected": -73.08750915527344, + "loss": 0.194, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.409701943397522, + "rewards/margins": 1.4130560159683228, + "rewards/rejected": -1.8227579593658447, + "step": 503 + }, + { + "epoch": 0.85, + "learning_rate": 9.621507490729584e-07, + "logits/chosen": -1.4694656133651733, + "logits/rejected": -1.635840892791748, + "logps/chosen": -73.20475769042969, + "logps/rejected": -231.72525024414062, + "loss": 0.1675, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.7220495343208313, + "rewards/margins": 7.296611309051514, + "rewards/rejected": -8.018661499023438, + "step": 504 + }, + { + "epoch": 0.85, + "learning_rate": 9.618693134942448e-07, + "logits/chosen": -0.5826085805892944, + "logits/rejected": -1.8275094032287598, + "logps/chosen": -112.27894592285156, + "logps/rejected": -76.41281127929688, + "loss": 0.197, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.1780475676059723, + "rewards/margins": 6.233481407165527, + "rewards/rejected": -6.411529064178467, + "step": 505 + }, + { + "epoch": 0.85, + "learning_rate": 9.615868769250545e-07, + "logits/chosen": -1.1828463077545166, + "logits/rejected": -1.4431803226470947, + "logps/chosen": -299.8600158691406, + "logps/rejected": -209.68878173828125, + "loss": 0.1821, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.3483529090881348, + "rewards/margins": 10.797613143920898, + "rewards/rejected": -12.145965576171875, + "step": 506 + }, + { + "epoch": 0.85, + "learning_rate": 9.613034399775013e-07, + "logits/chosen": -1.7455992698669434, + "logits/rejected": -1.877217173576355, + "logps/chosen": -80.41563415527344, + "logps/rejected": -180.38148498535156, + "loss": 0.1738, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.0337793827056885, + "rewards/margins": 5.948246955871582, + "rewards/rejected": -6.982026100158691, + "step": 507 + }, + { + "epoch": 0.86, + "learning_rate": 9.610190032658663e-07, + "logits/chosen": -1.9526875019073486, + "logits/rejected": -1.7264868021011353, + "logps/chosen": -117.76008605957031, + "logps/rejected": -175.2601318359375, + "loss": 0.1698, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.3269020318984985, + "rewards/margins": 6.980993747711182, + "rewards/rejected": -8.30789566040039, + "step": 508 + }, + { + "epoch": 0.86, + "learning_rate": 9.607335674065975e-07, + "logits/chosen": -0.6679888963699341, + "logits/rejected": -0.14870330691337585, + "logps/chosen": -339.99151611328125, + "logps/rejected": -369.12860107421875, + "loss": 0.1847, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.8299164772033691, + "rewards/margins": 15.75944709777832, + "rewards/rejected": -17.58936309814453, + "step": 509 + }, + { + "epoch": 0.86, + "learning_rate": 9.604471330183081e-07, + "logits/chosen": -1.387742042541504, + "logits/rejected": -1.4050238132476807, + "logps/chosen": -387.7100524902344, + "logps/rejected": -354.6739807128906, + "loss": 0.1736, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.4351615905761719, + "rewards/margins": 15.635486602783203, + "rewards/rejected": -17.070648193359375, + "step": 510 + }, + { + "epoch": 0.86, + "learning_rate": 9.601597007217761e-07, + "logits/chosen": -1.01943039894104, + "logits/rejected": -1.701319932937622, + "logps/chosen": -526.007568359375, + "logps/rejected": -369.4110107421875, + "loss": 0.2055, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.7693207263946533, + "rewards/margins": 2.1190032958984375, + "rewards/rejected": -2.888324022293091, + "step": 511 + }, + { + "epoch": 0.86, + "learning_rate": 9.598712711399415e-07, + "logits/chosen": -1.298659086227417, + "logits/rejected": -1.1038153171539307, + "logps/chosen": -530.2131958007812, + "logps/rejected": -275.9097595214844, + "loss": 0.2014, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.26557284593582153, + "rewards/margins": 12.243782997131348, + "rewards/rejected": -11.97821044921875, + "step": 512 + }, + { + "epoch": 0.87, + "learning_rate": 9.59581844897906e-07, + "logits/chosen": -1.3148828744888306, + "logits/rejected": -1.9683585166931152, + "logps/chosen": -369.1336364746094, + "logps/rejected": -188.69871520996094, + "loss": 0.1784, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.694780945777893, + "rewards/margins": 6.680124759674072, + "rewards/rejected": -8.374905586242676, + "step": 513 + }, + { + "epoch": 0.87, + "learning_rate": 9.592914226229314e-07, + "logits/chosen": -1.4996843338012695, + "logits/rejected": -1.4723166227340698, + "logps/chosen": -149.41529846191406, + "logps/rejected": -236.1160888671875, + "loss": 0.1857, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.2980178892612457, + "rewards/margins": 12.551896095275879, + "rewards/rejected": -12.849913597106934, + "step": 514 + }, + { + "epoch": 0.87, + "learning_rate": 9.590000049444376e-07, + "logits/chosen": -1.6375008821487427, + "logits/rejected": -2.431309461593628, + "logps/chosen": -514.4171752929688, + "logps/rejected": -307.718017578125, + "loss": 0.1961, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.7196776866912842, + "rewards/margins": 6.7644124031066895, + "rewards/rejected": -8.484090805053711, + "step": 515 + }, + { + "epoch": 0.87, + "learning_rate": 9.587075924940028e-07, + "logits/chosen": -1.0333702564239502, + "logits/rejected": -1.2659507989883423, + "logps/chosen": -454.99566650390625, + "logps/rejected": -293.19573974609375, + "loss": 0.1689, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.113037109375, + "rewards/margins": 9.598712921142578, + "rewards/rejected": -9.711750030517578, + "step": 516 + }, + { + "epoch": 0.87, + "learning_rate": 9.5841418590536e-07, + "logits/chosen": -1.6277176141738892, + "logits/rejected": -2.497152090072632, + "logps/chosen": -325.90850830078125, + "logps/rejected": -145.48583984375, + "loss": 0.2187, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.105952501296997, + "rewards/margins": 0.280051052570343, + "rewards/rejected": -1.3860034942626953, + "step": 517 + }, + { + "epoch": 0.87, + "learning_rate": 9.581197858143977e-07, + "logits/chosen": -2.2153377532958984, + "logits/rejected": -1.4255385398864746, + "logps/chosen": -80.20429229736328, + "logps/rejected": -332.54937744140625, + "loss": 0.1985, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.3072952032089233, + "rewards/margins": 13.6825532913208, + "rewards/rejected": -14.989848136901855, + "step": 518 + }, + { + "epoch": 0.88, + "learning_rate": 9.578243928591569e-07, + "logits/chosen": -1.0782465934753418, + "logits/rejected": -1.0369610786437988, + "logps/chosen": -295.9336853027344, + "logps/rejected": -218.00802612304688, + "loss": 0.1549, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.03710126876831055, + "rewards/margins": 10.719480514526367, + "rewards/rejected": -10.682379722595215, + "step": 519 + }, + { + "epoch": 0.88, + "learning_rate": 9.57528007679831e-07, + "logits/chosen": -1.1398615837097168, + "logits/rejected": -1.8708375692367554, + "logps/chosen": -540.9119873046875, + "logps/rejected": -327.2794189453125, + "loss": 0.2091, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.22789306938648224, + "rewards/margins": 5.93986701965332, + "rewards/rejected": -5.711973667144775, + "step": 520 + }, + { + "epoch": 0.88, + "learning_rate": 9.57230630918763e-07, + "logits/chosen": -2.1814510822296143, + "logits/rejected": -2.2879159450531006, + "logps/chosen": -105.28890991210938, + "logps/rejected": -185.7814178466797, + "loss": 0.1897, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.7397704124450684, + "rewards/margins": 6.833057403564453, + "rewards/rejected": -8.572827339172363, + "step": 521 + }, + { + "epoch": 0.88, + "learning_rate": 9.569322632204458e-07, + "logits/chosen": -2.073073148727417, + "logits/rejected": -1.4473570585250854, + "logps/chosen": -314.480712890625, + "logps/rejected": -325.2982482910156, + "loss": 0.21, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.3166259825229645, + "rewards/margins": 14.919228553771973, + "rewards/rejected": -15.235854148864746, + "step": 522 + }, + { + "epoch": 0.88, + "learning_rate": 9.566329052315194e-07, + "logits/chosen": -1.8998284339904785, + "logits/rejected": -1.8589746952056885, + "logps/chosen": -63.7332763671875, + "logps/rejected": -274.74609375, + "loss": 0.1681, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.378976821899414, + "rewards/margins": 12.973987579345703, + "rewards/rejected": -14.352964401245117, + "step": 523 + }, + { + "epoch": 0.88, + "learning_rate": 9.5633255760077e-07, + "logits/chosen": -2.28558611869812, + "logits/rejected": -2.275768280029297, + "logps/chosen": -40.30331039428711, + "logps/rejected": -180.61302185058594, + "loss": 0.1904, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.8475544452667236, + "rewards/margins": 9.063840866088867, + "rewards/rejected": -9.911395072937012, + "step": 524 + }, + { + "epoch": 0.89, + "learning_rate": 9.56031220979129e-07, + "logits/chosen": -0.7669763565063477, + "logits/rejected": -1.784839391708374, + "logps/chosen": -198.1832275390625, + "logps/rejected": -123.54266357421875, + "loss": 0.1687, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.18603363633155823, + "rewards/margins": 5.508733749389648, + "rewards/rejected": -5.694766998291016, + "step": 525 + }, + { + "epoch": 0.89, + "learning_rate": 9.557288960196707e-07, + "logits/chosen": -1.1866172552108765, + "logits/rejected": -1.2076904773712158, + "logps/chosen": -36.310211181640625, + "logps/rejected": -181.77130126953125, + "loss": 0.1558, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.9400992393493652, + "rewards/margins": 10.733392715454102, + "rewards/rejected": -11.673492431640625, + "step": 526 + }, + { + "epoch": 0.89, + "learning_rate": 9.554255833776117e-07, + "logits/chosen": -2.55730938911438, + "logits/rejected": -1.6672946214675903, + "logps/chosen": -80.0410385131836, + "logps/rejected": -193.53387451171875, + "loss": 0.2318, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.1338022947311401, + "rewards/margins": 2.3030526638031006, + "rewards/rejected": -3.436854839324951, + "step": 527 + }, + { + "epoch": 0.89, + "learning_rate": 9.551212837103091e-07, + "logits/chosen": -2.760472059249878, + "logits/rejected": -2.06662917137146, + "logps/chosen": -160.0579071044922, + "logps/rejected": -215.00347900390625, + "loss": 0.1786, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.42213478684425354, + "rewards/margins": 5.234758377075195, + "rewards/rejected": -5.656893253326416, + "step": 528 + }, + { + "epoch": 0.89, + "learning_rate": 9.548159976772592e-07, + "logits/chosen": -2.156421661376953, + "logits/rejected": -1.3467743396759033, + "logps/chosen": -69.24724578857422, + "logps/rejected": -327.8988037109375, + "loss": 0.192, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.53495192527771, + "rewards/margins": 7.171606540679932, + "rewards/rejected": -8.706559181213379, + "step": 529 + }, + { + "epoch": 0.89, + "learning_rate": 9.545097259400958e-07, + "logits/chosen": -1.4128961563110352, + "logits/rejected": -0.9324668645858765, + "logps/chosen": -280.76806640625, + "logps/rejected": -290.4639892578125, + "loss": 0.1677, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.6329319477081299, + "rewards/margins": 7.9638519287109375, + "rewards/rejected": -9.596783638000488, + "step": 530 + }, + { + "epoch": 0.9, + "learning_rate": 9.54202469162589e-07, + "logits/chosen": -2.057682991027832, + "logits/rejected": -1.9787952899932861, + "logps/chosen": -197.595703125, + "logps/rejected": -351.2636413574219, + "loss": 0.2056, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.0103564262390137, + "rewards/margins": 4.732174396514893, + "rewards/rejected": -6.742530822753906, + "step": 531 + }, + { + "epoch": 0.9, + "learning_rate": 9.538942280106441e-07, + "logits/chosen": -0.3838121294975281, + "logits/rejected": -0.7845942378044128, + "logps/chosen": -481.9730224609375, + "logps/rejected": -301.693115234375, + "loss": 0.1884, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.7501007318496704, + "rewards/margins": 12.653074264526367, + "rewards/rejected": -14.403175354003906, + "step": 532 + }, + { + "epoch": 0.9, + "learning_rate": 9.535850031522996e-07, + "logits/chosen": -2.2089288234710693, + "logits/rejected": -2.154285430908203, + "logps/chosen": -34.082950592041016, + "logps/rejected": -219.37469482421875, + "loss": 0.1833, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.9477675557136536, + "rewards/margins": 11.680444717407227, + "rewards/rejected": -12.628212928771973, + "step": 533 + }, + { + "epoch": 0.9, + "learning_rate": 9.532747952577259e-07, + "logits/chosen": -1.166057825088501, + "logits/rejected": -1.6293365955352783, + "logps/chosen": -674.6851806640625, + "logps/rejected": -405.37255859375, + "loss": 0.1572, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.8522399663925171, + "rewards/margins": 10.60324478149414, + "rewards/rejected": -11.455485343933105, + "step": 534 + }, + { + "epoch": 0.9, + "learning_rate": 9.529636049992233e-07, + "logits/chosen": -2.216061592102051, + "logits/rejected": -1.3257899284362793, + "logps/chosen": -267.9684143066406, + "logps/rejected": -269.8423156738281, + "loss": 0.1965, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.39237213134765625, + "rewards/margins": 8.582952499389648, + "rewards/rejected": -8.975324630737305, + "step": 535 + }, + { + "epoch": 0.9, + "learning_rate": 9.526514330512224e-07, + "logits/chosen": -1.805991291999817, + "logits/rejected": -2.062978744506836, + "logps/chosen": -129.35073852539062, + "logps/rejected": -154.8126220703125, + "loss": 0.2185, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.0079419612884521, + "rewards/margins": 2.9576416015625, + "rewards/rejected": -3.965583562850952, + "step": 536 + }, + { + "epoch": 0.91, + "learning_rate": 9.523382800902804e-07, + "logits/chosen": -0.9184517860412598, + "logits/rejected": -1.1293007135391235, + "logps/chosen": -497.22930908203125, + "logps/rejected": -318.4179382324219, + "loss": 0.1635, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.249415636062622, + "rewards/margins": 13.987029075622559, + "rewards/rejected": -16.2364444732666, + "step": 537 + }, + { + "epoch": 0.91, + "learning_rate": 9.52024146795081e-07, + "logits/chosen": -1.5992757081985474, + "logits/rejected": -1.759086012840271, + "logps/chosen": -143.24261474609375, + "logps/rejected": -166.47059631347656, + "loss": 0.1576, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.6290268898010254, + "rewards/margins": 6.777127742767334, + "rewards/rejected": -7.406154632568359, + "step": 538 + }, + { + "epoch": 0.91, + "learning_rate": 9.517090338464324e-07, + "logits/chosen": -1.8976322412490845, + "logits/rejected": -1.2034521102905273, + "logps/chosen": -247.3936004638672, + "logps/rejected": -549.493408203125, + "loss": 0.1994, + "rewards/accuracies": 0.0, + "rewards/chosen": -0.5727836489677429, + "rewards/margins": -0.548413872718811, + "rewards/rejected": -0.024369820952415466, + "step": 539 + }, + { + "epoch": 0.91, + "learning_rate": 9.51392941927266e-07, + "logits/chosen": -1.9126521348953247, + "logits/rejected": -1.6490702629089355, + "logps/chosen": -350.37139892578125, + "logps/rejected": -361.5531005859375, + "loss": 0.2141, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.25605812668800354, + "rewards/margins": 1.3470127582550049, + "rewards/rejected": -1.603070855140686, + "step": 540 + }, + { + "epoch": 0.91, + "learning_rate": 9.510758717226351e-07, + "logits/chosen": -1.297040581703186, + "logits/rejected": -2.394585132598877, + "logps/chosen": -318.63446044921875, + "logps/rejected": -72.8540267944336, + "loss": 0.1844, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.1780809909105301, + "rewards/margins": 1.2357361316680908, + "rewards/rejected": -1.413817048072815, + "step": 541 + }, + { + "epoch": 0.91, + "learning_rate": 9.507578239197125e-07, + "logits/chosen": -1.683724284172058, + "logits/rejected": -2.4566798210144043, + "logps/chosen": -198.73486328125, + "logps/rejected": -102.812255859375, + "loss": 0.1964, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.182813048362732, + "rewards/margins": 3.7211146354675293, + "rewards/rejected": -4.903927803039551, + "step": 542 + }, + { + "epoch": 0.92, + "learning_rate": 9.504387992077906e-07, + "logits/chosen": -1.7871900796890259, + "logits/rejected": -2.684126377105713, + "logps/chosen": -830.5599365234375, + "logps/rejected": -110.56852722167969, + "loss": 0.1876, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.6030117273330688, + "rewards/margins": 1.5069835186004639, + "rewards/rejected": -2.1099953651428223, + "step": 543 + }, + { + "epoch": 0.92, + "learning_rate": 9.501187982782784e-07, + "logits/chosen": -1.393845558166504, + "logits/rejected": -2.000667095184326, + "logps/chosen": -319.4396667480469, + "logps/rejected": -176.52003479003906, + "loss": 0.1713, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.090196892619133, + "rewards/margins": 11.154555320739746, + "rewards/rejected": -11.244751930236816, + "step": 544 + }, + { + "epoch": 0.92, + "learning_rate": 9.497978218247012e-07, + "logits/chosen": -1.0957725048065186, + "logits/rejected": -1.6756318807601929, + "logps/chosen": -336.61602783203125, + "logps/rejected": -199.86419677734375, + "loss": 0.1575, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.11384735256433487, + "rewards/margins": 7.248149871826172, + "rewards/rejected": -7.361997127532959, + "step": 545 + }, + { + "epoch": 0.92, + "learning_rate": 9.494758705426976e-07, + "logits/chosen": -1.4453366994857788, + "logits/rejected": -2.0633018016815186, + "logps/chosen": -223.99911499023438, + "logps/rejected": -254.41709899902344, + "loss": 0.2214, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.6794521808624268, + "rewards/margins": 8.287046432495117, + "rewards/rejected": -8.966497421264648, + "step": 546 + }, + { + "epoch": 0.92, + "learning_rate": 9.491529451300199e-07, + "logits/chosen": -0.6172839999198914, + "logits/rejected": -0.6064785718917847, + "logps/chosen": -255.4965362548828, + "logps/rejected": -213.30917358398438, + "loss": 0.2023, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.845526099205017, + "rewards/margins": 9.538248062133789, + "rewards/rejected": -11.383773803710938, + "step": 547 + }, + { + "epoch": 0.92, + "learning_rate": 9.48829046286531e-07, + "logits/chosen": -1.9111276865005493, + "logits/rejected": -2.0904080867767334, + "logps/chosen": -132.1181640625, + "logps/rejected": -127.7751235961914, + "loss": 0.148, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.23484620451927185, + "rewards/margins": 7.176847457885742, + "rewards/rejected": -6.9420013427734375, + "step": 548 + }, + { + "epoch": 0.93, + "learning_rate": 9.485041747142033e-07, + "logits/chosen": -1.5261189937591553, + "logits/rejected": -1.6879130601882935, + "logps/chosen": -272.7811279296875, + "logps/rejected": -296.8751525878906, + "loss": 0.1803, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.28870850801467896, + "rewards/margins": 5.03480339050293, + "rewards/rejected": -4.746094703674316, + "step": 549 + }, + { + "epoch": 0.93, + "learning_rate": 9.481783311171182e-07, + "logits/chosen": -0.9267941117286682, + "logits/rejected": -0.5881129503250122, + "logps/chosen": -127.84021759033203, + "logps/rejected": -182.4854736328125, + "loss": 0.2082, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.14871902763843536, + "rewards/margins": 10.788543701171875, + "rewards/rejected": -10.639824867248535, + "step": 550 + }, + { + "epoch": 0.93, + "learning_rate": 9.478515162014628e-07, + "logits/chosen": -0.44782042503356934, + "logits/rejected": -0.7973653078079224, + "logps/chosen": -449.32598876953125, + "logps/rejected": -296.8812255859375, + "loss": 0.2314, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.5391403436660767, + "rewards/margins": 8.781317710876465, + "rewards/rejected": -9.32045841217041, + "step": 551 + }, + { + "epoch": 0.93, + "learning_rate": 9.475237306755302e-07, + "logits/chosen": -2.074190855026245, + "logits/rejected": -1.8858578205108643, + "logps/chosen": -194.01026916503906, + "logps/rejected": -407.46392822265625, + "loss": 0.1754, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.1868438720703125, + "rewards/margins": 1.9931914806365967, + "rewards/rejected": -2.180035352706909, + "step": 552 + }, + { + "epoch": 0.93, + "learning_rate": 9.471949752497159e-07, + "logits/chosen": -1.0973830223083496, + "logits/rejected": -1.0407088994979858, + "logps/chosen": -387.7185363769531, + "logps/rejected": -300.52764892578125, + "loss": 0.2544, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.558343529701233, + "rewards/margins": 11.005720138549805, + "rewards/rejected": -12.564064025878906, + "step": 553 + }, + { + "epoch": 0.93, + "learning_rate": 9.468652506365186e-07, + "logits/chosen": -0.8757210373878479, + "logits/rejected": -0.9368937611579895, + "logps/chosen": -13.649900436401367, + "logps/rejected": -88.50205993652344, + "loss": 0.1295, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.2053590714931488, + "rewards/margins": 5.989328384399414, + "rewards/rejected": -6.1946868896484375, + "step": 554 + }, + { + "epoch": 0.94, + "learning_rate": 9.465345575505365e-07, + "logits/chosen": -1.1155973672866821, + "logits/rejected": -1.2391748428344727, + "logps/chosen": -156.2330322265625, + "logps/rejected": -168.20909118652344, + "loss": 0.1828, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.7171291708946228, + "rewards/margins": 8.67363452911377, + "rewards/rejected": -9.390764236450195, + "step": 555 + }, + { + "epoch": 0.94, + "learning_rate": 9.462028967084678e-07, + "logits/chosen": -2.5009000301361084, + "logits/rejected": -1.359316110610962, + "logps/chosen": -88.84703063964844, + "logps/rejected": -606.462158203125, + "loss": 0.178, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.0266014337539673, + "rewards/margins": 5.369706153869629, + "rewards/rejected": -6.396307468414307, + "step": 556 + }, + { + "epoch": 0.94, + "learning_rate": 9.458702688291071e-07, + "logits/chosen": -1.570953369140625, + "logits/rejected": -1.3593248128890991, + "logps/chosen": -759.3133544921875, + "logps/rejected": -677.512939453125, + "loss": 0.217, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.607739269733429, + "rewards/margins": 1.023413062095642, + "rewards/rejected": -1.6311523914337158, + "step": 557 + }, + { + "epoch": 0.94, + "learning_rate": 9.455366746333453e-07, + "logits/chosen": -1.4975221157073975, + "logits/rejected": -1.9799575805664062, + "logps/chosen": -238.45115661621094, + "logps/rejected": -126.26002502441406, + "loss": 0.1842, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.838726818561554, + "rewards/margins": 0.9599689841270447, + "rewards/rejected": -1.7986958026885986, + "step": 558 + }, + { + "epoch": 0.94, + "learning_rate": 9.452021148441674e-07, + "logits/chosen": -2.3967480659484863, + "logits/rejected": -1.919357180595398, + "logps/chosen": -183.41903686523438, + "logps/rejected": -302.5267028808594, + "loss": 0.1945, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.8161871433258057, + "rewards/margins": 1.833760142326355, + "rewards/rejected": -2.649947166442871, + "step": 559 + }, + { + "epoch": 0.94, + "learning_rate": 9.448665901866513e-07, + "logits/chosen": -1.308738112449646, + "logits/rejected": -1.4025413990020752, + "logps/chosen": -420.19940185546875, + "logps/rejected": -352.2750244140625, + "loss": 0.2113, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.1311897039413452, + "rewards/margins": 6.021875858306885, + "rewards/rejected": -7.1530656814575195, + "step": 560 + }, + { + "epoch": 0.95, + "learning_rate": 9.445301013879656e-07, + "logits/chosen": -1.9166899919509888, + "logits/rejected": -1.5457202196121216, + "logps/chosen": -236.6947479248047, + "logps/rejected": -277.1458435058594, + "loss": 0.1837, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.5052580833435059, + "rewards/margins": 12.255561828613281, + "rewards/rejected": -13.760820388793945, + "step": 561 + }, + { + "epoch": 0.95, + "learning_rate": 9.441926491773689e-07, + "logits/chosen": -1.066957712173462, + "logits/rejected": -1.658841848373413, + "logps/chosen": -255.17095947265625, + "logps/rejected": -266.31304931640625, + "loss": 0.188, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.5748581290245056, + "rewards/margins": 0.31342390179634094, + "rewards/rejected": -0.888282060623169, + "step": 562 + }, + { + "epoch": 0.95, + "learning_rate": 9.438542342862075e-07, + "logits/chosen": -1.350300908088684, + "logits/rejected": -2.001142740249634, + "logps/chosen": -323.86114501953125, + "logps/rejected": -88.59988403320312, + "loss": 0.1661, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.39666903018951416, + "rewards/margins": 5.692585468292236, + "rewards/rejected": -6.089254856109619, + "step": 563 + }, + { + "epoch": 0.95, + "learning_rate": 9.435148574479144e-07, + "logits/chosen": -1.2990124225616455, + "logits/rejected": -1.5610456466674805, + "logps/chosen": -318.2091979980469, + "logps/rejected": -176.14430236816406, + "loss": 0.1185, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.6469208002090454, + "rewards/margins": 9.246925354003906, + "rewards/rejected": -9.893845558166504, + "step": 564 + }, + { + "epoch": 0.95, + "learning_rate": 9.431745193980068e-07, + "logits/chosen": -1.8961067199707031, + "logits/rejected": -1.6770416498184204, + "logps/chosen": -308.45989990234375, + "logps/rejected": -212.64981079101562, + "loss": 0.1747, + "rewards/accuracies": 0.5, + "rewards/chosen": -2.435619831085205, + "rewards/margins": 1.2442288398742676, + "rewards/rejected": -3.6798486709594727, + "step": 565 + }, + { + "epoch": 0.95, + "learning_rate": 9.428332208740857e-07, + "logits/chosen": -0.562650740146637, + "logits/rejected": -1.6526992321014404, + "logps/chosen": -225.20419311523438, + "logps/rejected": -123.10130310058594, + "loss": 0.1821, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.8313095569610596, + "rewards/margins": 5.300149917602539, + "rewards/rejected": -6.1314592361450195, + "step": 566 + }, + { + "epoch": 0.96, + "learning_rate": 9.424909626158332e-07, + "logits/chosen": -1.3460386991500854, + "logits/rejected": -1.8868428468704224, + "logps/chosen": -541.9500122070312, + "logps/rejected": -282.341552734375, + "loss": 0.1658, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.592463970184326, + "rewards/margins": 13.076055526733398, + "rewards/rejected": -15.668519973754883, + "step": 567 + }, + { + "epoch": 0.96, + "learning_rate": 9.421477453650117e-07, + "logits/chosen": -1.3395308256149292, + "logits/rejected": -2.100599527359009, + "logps/chosen": -186.7269287109375, + "logps/rejected": -117.01820373535156, + "loss": 0.1475, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.069182276725769, + "rewards/margins": 6.430233955383301, + "rewards/rejected": -7.499416351318359, + "step": 568 + }, + { + "epoch": 0.96, + "learning_rate": 9.41803569865462e-07, + "logits/chosen": -0.560535192489624, + "logits/rejected": -0.5541735291481018, + "logps/chosen": -407.7998962402344, + "logps/rejected": -330.052001953125, + "loss": 0.1814, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.46812903881073, + "rewards/margins": 9.190093994140625, + "rewards/rejected": -10.658222198486328, + "step": 569 + }, + { + "epoch": 0.96, + "learning_rate": 9.414584368631018e-07, + "logits/chosen": -0.2689858376979828, + "logits/rejected": -0.6253905892372131, + "logps/chosen": -302.06964111328125, + "logps/rejected": -183.74813842773438, + "loss": 0.1558, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.3590884506702423, + "rewards/margins": 10.315533638000488, + "rewards/rejected": -10.67462158203125, + "step": 570 + }, + { + "epoch": 0.96, + "learning_rate": 9.411123471059232e-07, + "logits/chosen": -1.1581571102142334, + "logits/rejected": -1.2208583354949951, + "logps/chosen": -386.98779296875, + "logps/rejected": -328.5216064453125, + "loss": 0.1739, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.2733596861362457, + "rewards/margins": 0.4076034724712372, + "rewards/rejected": -0.6809631586074829, + "step": 571 + }, + { + "epoch": 0.96, + "learning_rate": 9.407653013439927e-07, + "logits/chosen": -1.914874792098999, + "logits/rejected": -1.633007526397705, + "logps/chosen": -196.557373046875, + "logps/rejected": -189.84039306640625, + "loss": 0.1601, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.1366727352142334, + "rewards/margins": 0.7186151742935181, + "rewards/rejected": -1.8552879095077515, + "step": 572 + }, + { + "epoch": 0.97, + "learning_rate": 9.404173003294485e-07, + "logits/chosen": -1.0885298252105713, + "logits/rejected": -1.620314359664917, + "logps/chosen": -390.4488525390625, + "logps/rejected": -219.35308837890625, + "loss": 0.1959, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.2608696222305298, + "rewards/margins": 8.13377857208252, + "rewards/rejected": -8.394648551940918, + "step": 573 + }, + { + "epoch": 0.97, + "learning_rate": 9.400683448164986e-07, + "logits/chosen": -1.303099274635315, + "logits/rejected": -1.4152745008468628, + "logps/chosen": -225.566650390625, + "logps/rejected": -119.57087707519531, + "loss": 0.1869, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.179052710533142, + "rewards/margins": 2.7054755687713623, + "rewards/rejected": -3.884528160095215, + "step": 574 + }, + { + "epoch": 0.97, + "learning_rate": 9.397184355614205e-07, + "logits/chosen": -1.2585117816925049, + "logits/rejected": -1.849805474281311, + "logps/chosen": -247.82049560546875, + "logps/rejected": -223.6149139404297, + "loss": 0.1884, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.9630714654922485, + "rewards/margins": -0.37028050422668457, + "rewards/rejected": -0.592790961265564, + "step": 575 + }, + { + "epoch": 0.97, + "learning_rate": 9.393675733225576e-07, + "logits/chosen": -2.14628005027771, + "logits/rejected": -1.4466516971588135, + "logps/chosen": -36.28599548339844, + "logps/rejected": -156.9398651123047, + "loss": 0.2136, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.0615413188934326, + "rewards/margins": 4.366576194763184, + "rewards/rejected": -5.428117752075195, + "step": 576 + }, + { + "epoch": 0.97, + "learning_rate": 9.390157588603201e-07, + "logits/chosen": -1.2252075672149658, + "logits/rejected": -1.802139401435852, + "logps/chosen": -250.9557647705078, + "logps/rejected": -223.22103881835938, + "loss": 0.1422, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.9214485287666321, + "rewards/margins": 1.491241455078125, + "rewards/rejected": -2.4126901626586914, + "step": 577 + }, + { + "epoch": 0.97, + "learning_rate": 9.386629929371804e-07, + "logits/chosen": -1.6203854084014893, + "logits/rejected": -1.964041829109192, + "logps/chosen": -233.42408752441406, + "logps/rejected": -223.70074462890625, + "loss": 0.1747, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.3521421551704407, + "rewards/margins": 12.760381698608398, + "rewards/rejected": -13.112524032592773, + "step": 578 + }, + { + "epoch": 0.98, + "learning_rate": 9.383092763176738e-07, + "logits/chosen": -1.3921282291412354, + "logits/rejected": -1.1160563230514526, + "logps/chosen": -242.66835021972656, + "logps/rejected": -384.66790771484375, + "loss": 0.196, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.593625783920288, + "rewards/margins": 17.484588623046875, + "rewards/rejected": -19.078216552734375, + "step": 579 + }, + { + "epoch": 0.98, + "learning_rate": 9.379546097683962e-07, + "logits/chosen": -1.1282007694244385, + "logits/rejected": -1.4131447076797485, + "logps/chosen": -137.69473266601562, + "logps/rejected": -102.01011657714844, + "loss": 0.1704, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.27938154339790344, + "rewards/margins": 0.8095016479492188, + "rewards/rejected": -1.0888831615447998, + "step": 580 + }, + { + "epoch": 0.98, + "learning_rate": 9.375989940580014e-07, + "logits/chosen": -1.014125943183899, + "logits/rejected": -0.8712400197982788, + "logps/chosen": -333.4993896484375, + "logps/rejected": -282.827880859375, + "loss": 0.1759, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.6404533386230469, + "rewards/margins": 12.329509735107422, + "rewards/rejected": -12.969963073730469, + "step": 581 + }, + { + "epoch": 0.98, + "learning_rate": 9.372424299572013e-07, + "logits/chosen": -1.137288212776184, + "logits/rejected": -1.7328457832336426, + "logps/chosen": -630.9047241210938, + "logps/rejected": -265.96551513671875, + "loss": 0.181, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.6024795770645142, + "rewards/margins": 13.53583812713623, + "rewards/rejected": -14.138317108154297, + "step": 582 + }, + { + "epoch": 0.98, + "learning_rate": 9.368849182387624e-07, + "logits/chosen": -1.1567295789718628, + "logits/rejected": -1.5108009576797485, + "logps/chosen": -423.7227478027344, + "logps/rejected": -177.82542419433594, + "loss": 0.1898, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.013415813446045, + "rewards/margins": 5.7657599449157715, + "rewards/rejected": -6.779175758361816, + "step": 583 + }, + { + "epoch": 0.98, + "learning_rate": 9.365264596775051e-07, + "logits/chosen": -1.427332878112793, + "logits/rejected": -1.6145710945129395, + "logps/chosen": -719.35400390625, + "logps/rejected": -739.6787109375, + "loss": 0.1735, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.9353516101837158, + "rewards/margins": 8.291926383972168, + "rewards/rejected": -9.227277755737305, + "step": 584 + }, + { + "epoch": 0.99, + "learning_rate": 9.361670550503024e-07, + "logits/chosen": -0.7215293049812317, + "logits/rejected": -1.509445071220398, + "logps/chosen": -111.42332458496094, + "logps/rejected": -59.27985763549805, + "loss": 0.1395, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.639510452747345, + "rewards/margins": 3.702336549758911, + "rewards/rejected": -4.341846942901611, + "step": 585 + }, + { + "epoch": 0.99, + "learning_rate": 9.35806705136077e-07, + "logits/chosen": -1.5403887033462524, + "logits/rejected": -1.7206377983093262, + "logps/chosen": -47.813472747802734, + "logps/rejected": -116.00591278076172, + "loss": 0.1867, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.1005914211273193, + "rewards/margins": 2.5221846103668213, + "rewards/rejected": -3.6227760314941406, + "step": 586 + }, + { + "epoch": 0.99, + "learning_rate": 9.354454107158003e-07, + "logits/chosen": -1.4039901494979858, + "logits/rejected": -1.2692376375198364, + "logps/chosen": -35.25469207763672, + "logps/rejected": -236.85321044921875, + "loss": 0.1741, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.9473680853843689, + "rewards/margins": 11.905098915100098, + "rewards/rejected": -12.852466583251953, + "step": 587 + }, + { + "epoch": 0.99, + "learning_rate": 9.350831725724915e-07, + "logits/chosen": -1.3343538045883179, + "logits/rejected": -1.4938243627548218, + "logps/chosen": -328.1713562011719, + "logps/rejected": -222.6204376220703, + "loss": 0.2049, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.934698462486267, + "rewards/margins": 7.677361011505127, + "rewards/rejected": -9.612059593200684, + "step": 588 + }, + { + "epoch": 0.99, + "learning_rate": 9.347199914912139e-07, + "logits/chosen": -1.4835619926452637, + "logits/rejected": -1.2078652381896973, + "logps/chosen": -425.0323791503906, + "logps/rejected": -336.0191955566406, + "loss": 0.1452, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.6457017660140991, + "rewards/margins": 12.339007377624512, + "rewards/rejected": -13.984708786010742, + "step": 589 + }, + { + "epoch": 0.99, + "learning_rate": 9.343558682590755e-07, + "logits/chosen": -1.440277338027954, + "logits/rejected": -1.2249367237091064, + "logps/chosen": -213.87991333007812, + "logps/rejected": -248.79725646972656, + "loss": 0.1673, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.332812786102295, + "rewards/margins": 9.270461082458496, + "rewards/rejected": -10.60327434539795, + "step": 590 + }, + { + "epoch": 1.0, + "learning_rate": 9.339908036652254e-07, + "logits/chosen": -1.4295828342437744, + "logits/rejected": -0.9476127624511719, + "logps/chosen": -167.48570251464844, + "logps/rejected": -194.9608154296875, + "loss": 0.1506, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.5273482799530029, + "rewards/margins": 11.93600845336914, + "rewards/rejected": -11.408660888671875, + "step": 591 + }, + { + "epoch": 1.0, + "learning_rate": 9.336247985008533e-07, + "logits/chosen": -1.3463596105575562, + "logits/rejected": -1.6185994148254395, + "logps/chosen": -462.23626708984375, + "logps/rejected": -235.42453002929688, + "loss": 0.1343, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.8855430483818054, + "rewards/margins": 5.946411609649658, + "rewards/rejected": -6.831954479217529, + "step": 592 + }, + { + "epoch": 1.0, + "learning_rate": 9.33257853559187e-07, + "logits/chosen": -1.8037328720092773, + "logits/rejected": -1.4643278121948242, + "logps/chosen": -556.474609375, + "logps/rejected": -393.30126953125, + "loss": 0.2315, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.759026527404785, + "rewards/margins": 9.786253929138184, + "rewards/rejected": -7.02722692489624, + "step": 593 + } + ], + "logging_steps": 1.0, + "max_steps": 2372, + "num_input_tokens_seen": 0, + "num_train_epochs": 4, + "save_steps": 500, + "total_flos": 0.0, + "train_batch_size": 2, + "trial_name": null, + "trial_params": null +}