diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,4086 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 1.0, + "eval_steps": 100, + "global_step": 1563, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0, + "grad_norm": 42.15456008911133, + "kl": 0.017818570137023926, + "learning_rate": 1.592356687898089e-08, + "logps/chosen": -285.75128173828125, + "logps/rejected": -254.7062530517578, + "loss": 0.4999, + "rewards/chosen": 0.004669209010899067, + "rewards/margins": 0.0025329389609396458, + "rewards/rejected": 0.0021362705156207085, + "step": 5 + }, + { + "epoch": 0.01, + "grad_norm": 44.65619659423828, + "kl": 0.05461766570806503, + "learning_rate": 3.184713375796178e-08, + "logps/chosen": -286.11944580078125, + "logps/rejected": -276.1832580566406, + "loss": 0.496, + "rewards/chosen": 0.03117586299777031, + "rewards/margins": 0.033454541116952896, + "rewards/rejected": -0.00227867579087615, + "step": 10 + }, + { + "epoch": 0.01, + "grad_norm": 39.61723327636719, + "kl": 0.44255906343460083, + "learning_rate": 4.777070063694268e-08, + "logps/chosen": -269.203125, + "logps/rejected": -260.71966552734375, + "loss": 0.4849, + "rewards/chosen": 0.13460782170295715, + "rewards/margins": 0.11511580646038055, + "rewards/rejected": 0.0194920115172863, + "step": 15 + }, + { + "epoch": 0.01, + "grad_norm": 36.42387771606445, + "kl": 0.871076226234436, + "learning_rate": 6.369426751592356e-08, + "logps/chosen": -244.55203247070312, + "logps/rejected": -259.15496826171875, + "loss": 0.4573, + "rewards/chosen": 0.3324822187423706, + "rewards/margins": 0.3418889045715332, + "rewards/rejected": -0.00940666627138853, + "step": 20 + }, + { + "epoch": 0.02, + "grad_norm": 35.794044494628906, + "kl": 0.3753414750099182, + "learning_rate": 7.961783439490445e-08, + "logps/chosen": -261.01800537109375, + "logps/rejected": -271.74786376953125, + "loss": 0.4164, + "rewards/chosen": 0.48356980085372925, + "rewards/margins": 0.7098164558410645, + "rewards/rejected": -0.22624659538269043, + "step": 25 + }, + { + "epoch": 0.02, + "grad_norm": 34.50333786010742, + "kl": 0.0, + "learning_rate": 9.554140127388536e-08, + "logps/chosen": -306.0127868652344, + "logps/rejected": -244.1356658935547, + "loss": 0.3649, + "rewards/chosen": 0.746610164642334, + "rewards/margins": 1.2027219533920288, + "rewards/rejected": -0.4561118483543396, + "step": 30 + }, + { + "epoch": 0.02, + "grad_norm": 30.50051498413086, + "kl": 0.0, + "learning_rate": 1.1146496815286624e-07, + "logps/chosen": -280.65692138671875, + "logps/rejected": -262.2502136230469, + "loss": 0.3171, + "rewards/chosen": 0.774597704410553, + "rewards/margins": 1.728833794593811, + "rewards/rejected": -0.9542360305786133, + "step": 35 + }, + { + "epoch": 0.03, + "grad_norm": 29.806577682495117, + "kl": 0.0, + "learning_rate": 1.2738853503184713e-07, + "logps/chosen": -276.9497375488281, + "logps/rejected": -259.6419982910156, + "loss": 0.2884, + "rewards/chosen": 0.6780191659927368, + "rewards/margins": 2.165379047393799, + "rewards/rejected": -1.4873597621917725, + "step": 40 + }, + { + "epoch": 0.03, + "grad_norm": 22.508939743041992, + "kl": 0.0, + "learning_rate": 1.43312101910828e-07, + "logps/chosen": -253.7576141357422, + "logps/rejected": -268.8052062988281, + "loss": 0.2596, + "rewards/chosen": 1.205094575881958, + "rewards/margins": 2.685385227203369, + "rewards/rejected": -1.4802907705307007, + "step": 45 + }, + { + "epoch": 0.03, + "grad_norm": 29.29058074951172, + "kl": 0.0, + "learning_rate": 1.592356687898089e-07, + "logps/chosen": -284.4584045410156, + "logps/rejected": -254.226806640625, + "loss": 0.2614, + "rewards/chosen": 1.0118796825408936, + "rewards/margins": 2.8269615173339844, + "rewards/rejected": -1.8150818347930908, + "step": 50 + }, + { + "epoch": 0.04, + "grad_norm": 29.25771141052246, + "kl": 0.0, + "learning_rate": 1.7515923566878978e-07, + "logps/chosen": -266.92254638671875, + "logps/rejected": -260.49261474609375, + "loss": 0.233, + "rewards/chosen": 1.16305673122406, + "rewards/margins": 3.4210219383239746, + "rewards/rejected": -2.257965326309204, + "step": 55 + }, + { + "epoch": 0.04, + "grad_norm": 20.642614364624023, + "kl": 0.0, + "learning_rate": 1.9108280254777072e-07, + "logps/chosen": -231.07168579101562, + "logps/rejected": -258.5835876464844, + "loss": 0.241, + "rewards/chosen": 1.3277983665466309, + "rewards/margins": 3.532343626022339, + "rewards/rejected": -2.204545497894287, + "step": 60 + }, + { + "epoch": 0.04, + "grad_norm": 23.786853790283203, + "kl": 0.0, + "learning_rate": 2.070063694267516e-07, + "logps/chosen": -251.5561981201172, + "logps/rejected": -267.85986328125, + "loss": 0.2376, + "rewards/chosen": 1.2313438653945923, + "rewards/margins": 3.837125062942505, + "rewards/rejected": -2.605781316757202, + "step": 65 + }, + { + "epoch": 0.04, + "grad_norm": 24.48412322998047, + "kl": 0.0, + "learning_rate": 2.2292993630573247e-07, + "logps/chosen": -244.3961639404297, + "logps/rejected": -248.7190704345703, + "loss": 0.2304, + "rewards/chosen": 1.338683843612671, + "rewards/margins": 3.6262855529785156, + "rewards/rejected": -2.287601947784424, + "step": 70 + }, + { + "epoch": 0.05, + "grad_norm": 16.654287338256836, + "kl": 0.0, + "learning_rate": 2.388535031847134e-07, + "logps/chosen": -261.95819091796875, + "logps/rejected": -291.40020751953125, + "loss": 0.2003, + "rewards/chosen": 1.3882195949554443, + "rewards/margins": 4.626662254333496, + "rewards/rejected": -3.238443374633789, + "step": 75 + }, + { + "epoch": 0.05, + "grad_norm": 30.89764976501465, + "kl": 0.0, + "learning_rate": 2.5477707006369425e-07, + "logps/chosen": -267.4027099609375, + "logps/rejected": -271.3486328125, + "loss": 0.1985, + "rewards/chosen": 1.4253151416778564, + "rewards/margins": 4.816943645477295, + "rewards/rejected": -3.3916287422180176, + "step": 80 + }, + { + "epoch": 0.05, + "grad_norm": 55.57185363769531, + "kl": 0.0, + "learning_rate": 2.7070063694267513e-07, + "logps/chosen": -289.416259765625, + "logps/rejected": -282.303955078125, + "loss": 0.2075, + "rewards/chosen": 1.308215856552124, + "rewards/margins": 5.0450119972229, + "rewards/rejected": -3.7367959022521973, + "step": 85 + }, + { + "epoch": 0.06, + "grad_norm": 34.31930923461914, + "kl": 0.0, + "learning_rate": 2.86624203821656e-07, + "logps/chosen": -255.4053192138672, + "logps/rejected": -293.8216247558594, + "loss": 0.1877, + "rewards/chosen": 1.560572862625122, + "rewards/margins": 5.238338470458984, + "rewards/rejected": -3.6777656078338623, + "step": 90 + }, + { + "epoch": 0.06, + "grad_norm": 22.21489143371582, + "kl": 0.0, + "learning_rate": 3.0254777070063694e-07, + "logps/chosen": -286.2869873046875, + "logps/rejected": -271.2530212402344, + "loss": 0.1985, + "rewards/chosen": 1.478058099746704, + "rewards/margins": 5.051484107971191, + "rewards/rejected": -3.5734260082244873, + "step": 95 + }, + { + "epoch": 0.06, + "grad_norm": 29.9346981048584, + "kl": 0.0, + "learning_rate": 3.184713375796178e-07, + "logps/chosen": -245.080810546875, + "logps/rejected": -287.15435791015625, + "loss": 0.1656, + "rewards/chosen": 1.4543142318725586, + "rewards/margins": 5.874560356140137, + "rewards/rejected": -4.420246124267578, + "step": 100 + }, + { + "epoch": 0.07, + "grad_norm": 22.431930541992188, + "kl": 0.0, + "learning_rate": 3.343949044585987e-07, + "logps/chosen": -259.4337463378906, + "logps/rejected": -309.25164794921875, + "loss": 0.1767, + "rewards/chosen": 1.4657680988311768, + "rewards/margins": 5.859877109527588, + "rewards/rejected": -4.39410924911499, + "step": 105 + }, + { + "epoch": 0.07, + "grad_norm": 36.517799377441406, + "kl": 0.0, + "learning_rate": 3.5031847133757957e-07, + "logps/chosen": -264.08306884765625, + "logps/rejected": -295.64788818359375, + "loss": 0.1599, + "rewards/chosen": 1.291338562965393, + "rewards/margins": 5.8188042640686035, + "rewards/rejected": -4.5274658203125, + "step": 110 + }, + { + "epoch": 0.07, + "grad_norm": 25.18623924255371, + "kl": 0.0, + "learning_rate": 3.6624203821656045e-07, + "logps/chosen": -268.96527099609375, + "logps/rejected": -278.4827575683594, + "loss": 0.1735, + "rewards/chosen": 1.4270073175430298, + "rewards/margins": 5.516595363616943, + "rewards/rejected": -4.0895891189575195, + "step": 115 + }, + { + "epoch": 0.08, + "grad_norm": 24.899450302124023, + "kl": 0.0, + "learning_rate": 3.8216560509554143e-07, + "logps/chosen": -257.46807861328125, + "logps/rejected": -306.6875305175781, + "loss": 0.1675, + "rewards/chosen": 1.363029956817627, + "rewards/margins": 5.9437384605407715, + "rewards/rejected": -4.580708980560303, + "step": 120 + }, + { + "epoch": 0.08, + "grad_norm": 18.1636962890625, + "kl": 0.0, + "learning_rate": 3.980891719745223e-07, + "logps/chosen": -274.8599548339844, + "logps/rejected": -316.78057861328125, + "loss": 0.1509, + "rewards/chosen": 1.4044160842895508, + "rewards/margins": 6.668715000152588, + "rewards/rejected": -5.264299392700195, + "step": 125 + }, + { + "epoch": 0.08, + "grad_norm": 53.83689498901367, + "kl": 0.0, + "learning_rate": 4.140127388535032e-07, + "logps/chosen": -316.65460205078125, + "logps/rejected": -318.2517395019531, + "loss": 0.1667, + "rewards/chosen": 1.331627368927002, + "rewards/margins": 6.6092848777771, + "rewards/rejected": -5.277657508850098, + "step": 130 + }, + { + "epoch": 0.09, + "grad_norm": 16.44365692138672, + "kl": 0.0, + "learning_rate": 4.2993630573248406e-07, + "logps/chosen": -242.02651977539062, + "logps/rejected": -283.90106201171875, + "loss": 0.1762, + "rewards/chosen": 1.6127967834472656, + "rewards/margins": 6.091832160949707, + "rewards/rejected": -4.479035377502441, + "step": 135 + }, + { + "epoch": 0.09, + "grad_norm": 19.764270782470703, + "kl": 0.0, + "learning_rate": 4.4585987261146494e-07, + "logps/chosen": -252.82180786132812, + "logps/rejected": -319.73199462890625, + "loss": 0.1468, + "rewards/chosen": 1.5330384969711304, + "rewards/margins": 6.702307224273682, + "rewards/rejected": -5.169268608093262, + "step": 140 + }, + { + "epoch": 0.09, + "grad_norm": 21.560876846313477, + "kl": 0.0, + "learning_rate": 4.6178343949044587e-07, + "logps/chosen": -247.28427124023438, + "logps/rejected": -282.38848876953125, + "loss": 0.1868, + "rewards/chosen": 1.350294828414917, + "rewards/margins": 5.6418867111206055, + "rewards/rejected": -4.291592597961426, + "step": 145 + }, + { + "epoch": 0.1, + "grad_norm": 21.553871154785156, + "kl": 0.0, + "learning_rate": 4.777070063694267e-07, + "logps/chosen": -286.14312744140625, + "logps/rejected": -281.7044372558594, + "loss": 0.168, + "rewards/chosen": 1.5479779243469238, + "rewards/margins": 6.192603588104248, + "rewards/rejected": -4.644625663757324, + "step": 150 + }, + { + "epoch": 0.1, + "grad_norm": 24.488561630249023, + "kl": 0.0, + "learning_rate": 4.936305732484076e-07, + "logps/chosen": -272.4622497558594, + "logps/rejected": -323.91082763671875, + "loss": 0.1648, + "rewards/chosen": 1.3805065155029297, + "rewards/margins": 6.456129550933838, + "rewards/rejected": -5.07562255859375, + "step": 155 + }, + { + "epoch": 0.1, + "grad_norm": 21.614229202270508, + "kl": 0.0, + "learning_rate": 4.989331436699858e-07, + "logps/chosen": -207.8739776611328, + "logps/rejected": -324.73565673828125, + "loss": 0.1435, + "rewards/chosen": 1.465820074081421, + "rewards/margins": 6.953179836273193, + "rewards/rejected": -5.487359046936035, + "step": 160 + }, + { + "epoch": 0.11, + "grad_norm": 20.425729751586914, + "kl": 0.0, + "learning_rate": 4.971550497866287e-07, + "logps/chosen": -287.088623046875, + "logps/rejected": -310.1844787597656, + "loss": 0.1568, + "rewards/chosen": 1.5799640417099, + "rewards/margins": 6.89548397064209, + "rewards/rejected": -5.315520286560059, + "step": 165 + }, + { + "epoch": 0.11, + "grad_norm": 21.86062240600586, + "kl": 0.0, + "learning_rate": 4.953769559032717e-07, + "logps/chosen": -282.91741943359375, + "logps/rejected": -315.76470947265625, + "loss": 0.1516, + "rewards/chosen": 1.442077875137329, + "rewards/margins": 7.074441432952881, + "rewards/rejected": -5.632363796234131, + "step": 170 + }, + { + "epoch": 0.11, + "grad_norm": 20.31084632873535, + "kl": 0.0, + "learning_rate": 4.935988620199146e-07, + "logps/chosen": -274.0014953613281, + "logps/rejected": -310.8323059082031, + "loss": 0.1477, + "rewards/chosen": 1.4551314115524292, + "rewards/margins": 6.99865198135376, + "rewards/rejected": -5.543520450592041, + "step": 175 + }, + { + "epoch": 0.12, + "grad_norm": 20.067346572875977, + "kl": 0.0, + "learning_rate": 4.918207681365576e-07, + "logps/chosen": -274.91705322265625, + "logps/rejected": -330.29315185546875, + "loss": 0.1491, + "rewards/chosen": 1.5545661449432373, + "rewards/margins": 7.674098014831543, + "rewards/rejected": -6.119531631469727, + "step": 180 + }, + { + "epoch": 0.12, + "grad_norm": 26.703292846679688, + "kl": 0.0, + "learning_rate": 4.900426742532006e-07, + "logps/chosen": -267.2247619628906, + "logps/rejected": -308.6265563964844, + "loss": 0.142, + "rewards/chosen": 1.4495770931243896, + "rewards/margins": 7.4614386558532715, + "rewards/rejected": -6.011861324310303, + "step": 185 + }, + { + "epoch": 0.12, + "grad_norm": 22.81442642211914, + "kl": 0.0, + "learning_rate": 4.882645803698435e-07, + "logps/chosen": -243.8881072998047, + "logps/rejected": -318.4960632324219, + "loss": 0.1679, + "rewards/chosen": 1.2335823774337769, + "rewards/margins": 7.564291954040527, + "rewards/rejected": -6.330709934234619, + "step": 190 + }, + { + "epoch": 0.12, + "grad_norm": 17.875883102416992, + "kl": 0.0, + "learning_rate": 4.864864864864865e-07, + "logps/chosen": -276.0986328125, + "logps/rejected": -317.36163330078125, + "loss": 0.1442, + "rewards/chosen": 1.5666391849517822, + "rewards/margins": 7.757128715515137, + "rewards/rejected": -6.190489768981934, + "step": 195 + }, + { + "epoch": 0.13, + "grad_norm": 22.31620216369629, + "kl": 0.0, + "learning_rate": 4.847083926031294e-07, + "logps/chosen": -273.843994140625, + "logps/rejected": -329.43475341796875, + "loss": 0.154, + "rewards/chosen": 1.6012461185455322, + "rewards/margins": 7.519808292388916, + "rewards/rejected": -5.918562412261963, + "step": 200 + }, + { + "epoch": 0.13, + "grad_norm": 19.976451873779297, + "kl": 0.0, + "learning_rate": 4.829302987197724e-07, + "logps/chosen": -284.374755859375, + "logps/rejected": -326.24371337890625, + "loss": 0.1378, + "rewards/chosen": 1.5799314975738525, + "rewards/margins": 7.78595495223999, + "rewards/rejected": -6.206023216247559, + "step": 205 + }, + { + "epoch": 0.13, + "grad_norm": 23.919170379638672, + "kl": 0.0, + "learning_rate": 4.811522048364154e-07, + "logps/chosen": -236.8705291748047, + "logps/rejected": -316.30413818359375, + "loss": 0.1523, + "rewards/chosen": 1.5169861316680908, + "rewards/margins": 7.513286590576172, + "rewards/rejected": -5.99630069732666, + "step": 210 + }, + { + "epoch": 0.14, + "grad_norm": 25.041824340820312, + "kl": 0.0, + "learning_rate": 4.793741109530583e-07, + "logps/chosen": -219.86117553710938, + "logps/rejected": -331.60418701171875, + "loss": 0.1516, + "rewards/chosen": 1.5909864902496338, + "rewards/margins": 7.9639410972595215, + "rewards/rejected": -6.372954845428467, + "step": 215 + }, + { + "epoch": 0.14, + "grad_norm": 25.45568084716797, + "kl": 0.0, + "learning_rate": 4.775960170697012e-07, + "logps/chosen": -271.017333984375, + "logps/rejected": -333.2148132324219, + "loss": 0.1628, + "rewards/chosen": 1.5948388576507568, + "rewards/margins": 7.8658246994018555, + "rewards/rejected": -6.2709856033325195, + "step": 220 + }, + { + "epoch": 0.14, + "grad_norm": 22.259382247924805, + "kl": 0.0, + "learning_rate": 4.7581792318634425e-07, + "logps/chosen": -253.55789184570312, + "logps/rejected": -298.8450622558594, + "loss": 0.1452, + "rewards/chosen": 1.640355110168457, + "rewards/margins": 8.043792724609375, + "rewards/rejected": -6.403438568115234, + "step": 225 + }, + { + "epoch": 0.15, + "grad_norm": 26.135997772216797, + "kl": 0.0, + "learning_rate": 4.7403982930298717e-07, + "logps/chosen": -235.11349487304688, + "logps/rejected": -344.72369384765625, + "loss": 0.1472, + "rewards/chosen": 1.559660792350769, + "rewards/margins": 7.8217010498046875, + "rewards/rejected": -6.262040615081787, + "step": 230 + }, + { + "epoch": 0.15, + "grad_norm": 20.143089294433594, + "kl": 0.0, + "learning_rate": 4.7226173541963014e-07, + "logps/chosen": -249.42196655273438, + "logps/rejected": -277.12554931640625, + "loss": 0.1529, + "rewards/chosen": 1.6460959911346436, + "rewards/margins": 7.348568916320801, + "rewards/rejected": -5.702473163604736, + "step": 235 + }, + { + "epoch": 0.15, + "grad_norm": 23.44059181213379, + "kl": 0.0, + "learning_rate": 4.7048364153627306e-07, + "logps/chosen": -281.3484802246094, + "logps/rejected": -307.25457763671875, + "loss": 0.1387, + "rewards/chosen": 1.7489140033721924, + "rewards/margins": 7.860198974609375, + "rewards/rejected": -6.111284255981445, + "step": 240 + }, + { + "epoch": 0.16, + "grad_norm": 23.201915740966797, + "kl": 0.0, + "learning_rate": 4.6870554765291604e-07, + "logps/chosen": -272.64031982421875, + "logps/rejected": -310.50909423828125, + "loss": 0.1356, + "rewards/chosen": 1.6725397109985352, + "rewards/margins": 8.012718200683594, + "rewards/rejected": -6.340178489685059, + "step": 245 + }, + { + "epoch": 0.16, + "grad_norm": 19.45538902282715, + "kl": 0.0, + "learning_rate": 4.66927453769559e-07, + "logps/chosen": -272.3523254394531, + "logps/rejected": -338.20867919921875, + "loss": 0.1246, + "rewards/chosen": 1.727900743484497, + "rewards/margins": 8.646397590637207, + "rewards/rejected": -6.918497562408447, + "step": 250 + }, + { + "epoch": 0.16, + "grad_norm": 24.896251678466797, + "kl": 0.0, + "learning_rate": 4.65149359886202e-07, + "logps/chosen": -269.20794677734375, + "logps/rejected": -328.73486328125, + "loss": 0.1281, + "rewards/chosen": 1.6262487173080444, + "rewards/margins": 9.223333358764648, + "rewards/rejected": -7.597084999084473, + "step": 255 + }, + { + "epoch": 0.17, + "grad_norm": 20.800025939941406, + "kl": 0.0, + "learning_rate": 4.633712660028449e-07, + "logps/chosen": -270.3649597167969, + "logps/rejected": -330.872314453125, + "loss": 0.1279, + "rewards/chosen": 1.5803369283676147, + "rewards/margins": 9.708114624023438, + "rewards/rejected": -8.127778053283691, + "step": 260 + }, + { + "epoch": 0.17, + "grad_norm": 20.315540313720703, + "kl": 0.0, + "learning_rate": 4.615931721194879e-07, + "logps/chosen": -277.21807861328125, + "logps/rejected": -313.4653625488281, + "loss": 0.1461, + "rewards/chosen": 1.6411361694335938, + "rewards/margins": 7.905195713043213, + "rewards/rejected": -6.264059543609619, + "step": 265 + }, + { + "epoch": 0.17, + "grad_norm": 19.68859100341797, + "kl": 0.0, + "learning_rate": 4.5981507823613085e-07, + "logps/chosen": -275.69696044921875, + "logps/rejected": -295.79400634765625, + "loss": 0.1547, + "rewards/chosen": 1.6596572399139404, + "rewards/margins": 7.9982404708862305, + "rewards/rejected": -6.338583469390869, + "step": 270 + }, + { + "epoch": 0.18, + "grad_norm": 27.070371627807617, + "kl": 0.0, + "learning_rate": 4.580369843527738e-07, + "logps/chosen": -272.72735595703125, + "logps/rejected": -310.302734375, + "loss": 0.1673, + "rewards/chosen": 1.2246049642562866, + "rewards/margins": 8.44649600982666, + "rewards/rejected": -7.221890449523926, + "step": 275 + }, + { + "epoch": 0.18, + "grad_norm": 15.166617393493652, + "kl": 0.0, + "learning_rate": 4.562588904694168e-07, + "logps/chosen": -267.9317932128906, + "logps/rejected": -316.60479736328125, + "loss": 0.1304, + "rewards/chosen": 1.7944562435150146, + "rewards/margins": 8.752721786499023, + "rewards/rejected": -6.958265781402588, + "step": 280 + }, + { + "epoch": 0.18, + "grad_norm": 27.143291473388672, + "kl": 0.0, + "learning_rate": 4.544807965860597e-07, + "logps/chosen": -257.7388916015625, + "logps/rejected": -308.5174560546875, + "loss": 0.1421, + "rewards/chosen": 1.6366589069366455, + "rewards/margins": 8.678224563598633, + "rewards/rejected": -7.04156494140625, + "step": 285 + }, + { + "epoch": 0.19, + "grad_norm": 22.1202392578125, + "kl": 0.0, + "learning_rate": 4.5270270270270264e-07, + "logps/chosen": -266.8901062011719, + "logps/rejected": -322.6683654785156, + "loss": 0.1447, + "rewards/chosen": 1.787255883216858, + "rewards/margins": 9.206243515014648, + "rewards/rejected": -7.418987274169922, + "step": 290 + }, + { + "epoch": 0.19, + "grad_norm": 24.7782039642334, + "kl": 0.0, + "learning_rate": 4.509246088193456e-07, + "logps/chosen": -274.30438232421875, + "logps/rejected": -342.98541259765625, + "loss": 0.1421, + "rewards/chosen": 1.776602029800415, + "rewards/margins": 9.053030967712402, + "rewards/rejected": -7.276429176330566, + "step": 295 + }, + { + "epoch": 0.19, + "grad_norm": 23.54313087463379, + "kl": 0.0, + "learning_rate": 4.491465149359886e-07, + "logps/chosen": -215.9270477294922, + "logps/rejected": -303.5273742675781, + "loss": 0.1438, + "rewards/chosen": 1.868506669998169, + "rewards/margins": 9.129117965698242, + "rewards/rejected": -7.260611534118652, + "step": 300 + }, + { + "epoch": 0.2, + "grad_norm": 17.70969009399414, + "kl": 0.0, + "learning_rate": 4.4736842105263156e-07, + "logps/chosen": -240.4123992919922, + "logps/rejected": -336.1444396972656, + "loss": 0.1403, + "rewards/chosen": 1.7039588689804077, + "rewards/margins": 9.00100326538086, + "rewards/rejected": -7.297044277191162, + "step": 305 + }, + { + "epoch": 0.2, + "grad_norm": 19.908315658569336, + "kl": 0.0, + "learning_rate": 4.4559032716927454e-07, + "logps/chosen": -212.1704864501953, + "logps/rejected": -340.3092041015625, + "loss": 0.1338, + "rewards/chosen": 1.6462271213531494, + "rewards/margins": 8.726274490356445, + "rewards/rejected": -7.0800461769104, + "step": 310 + }, + { + "epoch": 0.2, + "grad_norm": 22.928499221801758, + "kl": 0.0, + "learning_rate": 4.438122332859175e-07, + "logps/chosen": -243.7815704345703, + "logps/rejected": -324.60772705078125, + "loss": 0.1323, + "rewards/chosen": 1.6998401880264282, + "rewards/margins": 8.809240341186523, + "rewards/rejected": -7.109400749206543, + "step": 315 + }, + { + "epoch": 0.2, + "grad_norm": 20.08190155029297, + "kl": 0.0, + "learning_rate": 4.420341394025605e-07, + "logps/chosen": -263.37725830078125, + "logps/rejected": -317.4496154785156, + "loss": 0.1443, + "rewards/chosen": 1.706053376197815, + "rewards/margins": 8.708757400512695, + "rewards/rejected": -7.002703666687012, + "step": 320 + }, + { + "epoch": 0.21, + "grad_norm": 25.728233337402344, + "kl": 0.0, + "learning_rate": 4.4025604551920335e-07, + "logps/chosen": -282.44989013671875, + "logps/rejected": -325.9697265625, + "loss": 0.1351, + "rewards/chosen": 1.6976518630981445, + "rewards/margins": 9.016552925109863, + "rewards/rejected": -7.318901062011719, + "step": 325 + }, + { + "epoch": 0.21, + "grad_norm": 22.311450958251953, + "kl": 0.0, + "learning_rate": 4.384779516358463e-07, + "logps/chosen": -260.39447021484375, + "logps/rejected": -309.620849609375, + "loss": 0.1311, + "rewards/chosen": 1.8077850341796875, + "rewards/margins": 9.406209945678711, + "rewards/rejected": -7.598425388336182, + "step": 330 + }, + { + "epoch": 0.21, + "grad_norm": 23.20633316040039, + "kl": 0.0, + "learning_rate": 4.366998577524893e-07, + "logps/chosen": -302.47509765625, + "logps/rejected": -318.767578125, + "loss": 0.128, + "rewards/chosen": 1.7027689218521118, + "rewards/margins": 10.0870943069458, + "rewards/rejected": -8.38432502746582, + "step": 335 + }, + { + "epoch": 0.22, + "grad_norm": 25.405933380126953, + "kl": 0.0, + "learning_rate": 4.3492176386913227e-07, + "logps/chosen": -238.94827270507812, + "logps/rejected": -326.12786865234375, + "loss": 0.1505, + "rewards/chosen": 1.7603483200073242, + "rewards/margins": 9.71554183959961, + "rewards/rejected": -7.955193519592285, + "step": 340 + }, + { + "epoch": 0.22, + "grad_norm": 22.41493034362793, + "kl": 0.0, + "learning_rate": 4.3314366998577524e-07, + "logps/chosen": -253.826416015625, + "logps/rejected": -330.7270812988281, + "loss": 0.1413, + "rewards/chosen": 1.8580402135849, + "rewards/margins": 9.722744941711426, + "rewards/rejected": -7.8647050857543945, + "step": 345 + }, + { + "epoch": 0.22, + "grad_norm": 26.315628051757812, + "kl": 0.0, + "learning_rate": 4.313655761024182e-07, + "logps/chosen": -286.4155578613281, + "logps/rejected": -342.62652587890625, + "loss": 0.1422, + "rewards/chosen": 1.7754218578338623, + "rewards/margins": 10.7272367477417, + "rewards/rejected": -8.951814651489258, + "step": 350 + }, + { + "epoch": 0.23, + "grad_norm": 22.07290267944336, + "kl": 0.0, + "learning_rate": 4.2958748221906114e-07, + "logps/chosen": -239.4450225830078, + "logps/rejected": -310.7589111328125, + "loss": 0.1108, + "rewards/chosen": 1.9980005025863647, + "rewards/margins": 10.0792818069458, + "rewards/rejected": -8.081281661987305, + "step": 355 + }, + { + "epoch": 0.23, + "grad_norm": 22.56456756591797, + "kl": 0.0, + "learning_rate": 4.278093883357041e-07, + "logps/chosen": -269.43560791015625, + "logps/rejected": -328.5326843261719, + "loss": 0.1524, + "rewards/chosen": 1.747078537940979, + "rewards/margins": 9.907144546508789, + "rewards/rejected": -8.160065650939941, + "step": 360 + }, + { + "epoch": 0.23, + "grad_norm": 24.159225463867188, + "kl": 0.0, + "learning_rate": 4.260312944523471e-07, + "logps/chosen": -241.72286987304688, + "logps/rejected": -353.99493408203125, + "loss": 0.1185, + "rewards/chosen": 2.0929551124572754, + "rewards/margins": 10.861230850219727, + "rewards/rejected": -8.768275260925293, + "step": 365 + }, + { + "epoch": 0.24, + "grad_norm": 21.119632720947266, + "kl": 0.0, + "learning_rate": 4.2425320056899e-07, + "logps/chosen": -263.73370361328125, + "logps/rejected": -343.69866943359375, + "loss": 0.1164, + "rewards/chosen": 1.9714298248291016, + "rewards/margins": 10.647806167602539, + "rewards/rejected": -8.676377296447754, + "step": 370 + }, + { + "epoch": 0.24, + "grad_norm": 23.59329605102539, + "kl": 0.0, + "learning_rate": 4.22475106685633e-07, + "logps/chosen": -244.0771026611328, + "logps/rejected": -348.35693359375, + "loss": 0.1218, + "rewards/chosen": 1.8124616146087646, + "rewards/margins": 10.569868087768555, + "rewards/rejected": -8.757406234741211, + "step": 375 + }, + { + "epoch": 0.24, + "grad_norm": 23.84940528869629, + "kl": 0.0, + "learning_rate": 4.2069701280227595e-07, + "logps/chosen": -241.4117889404297, + "logps/rejected": -336.4566345214844, + "loss": 0.1294, + "rewards/chosen": 1.8767503499984741, + "rewards/margins": 10.489054679870605, + "rewards/rejected": -8.612303733825684, + "step": 380 + }, + { + "epoch": 0.25, + "grad_norm": 16.005657196044922, + "kl": 0.0, + "learning_rate": 4.189189189189189e-07, + "logps/chosen": -307.21600341796875, + "logps/rejected": -341.0569152832031, + "loss": 0.1154, + "rewards/chosen": 1.901063323020935, + "rewards/margins": 10.400434494018555, + "rewards/rejected": -8.499369621276855, + "step": 385 + }, + { + "epoch": 0.25, + "grad_norm": 16.274534225463867, + "kl": 0.0, + "learning_rate": 4.1714082503556185e-07, + "logps/chosen": -253.8114471435547, + "logps/rejected": -333.0570983886719, + "loss": 0.1327, + "rewards/chosen": 1.955413818359375, + "rewards/margins": 10.895428657531738, + "rewards/rejected": -8.940014839172363, + "step": 390 + }, + { + "epoch": 0.25, + "grad_norm": 20.72968101501465, + "kl": 0.0, + "learning_rate": 4.153627311522048e-07, + "logps/chosen": -261.703857421875, + "logps/rejected": -339.4162902832031, + "loss": 0.1308, + "rewards/chosen": 1.7927815914154053, + "rewards/margins": 10.807384490966797, + "rewards/rejected": -9.014602661132812, + "step": 395 + }, + { + "epoch": 0.26, + "grad_norm": 21.173845291137695, + "kl": 0.0, + "learning_rate": 4.135846372688478e-07, + "logps/chosen": -260.5010681152344, + "logps/rejected": -359.41851806640625, + "loss": 0.1131, + "rewards/chosen": 1.8654544353485107, + "rewards/margins": 11.833639144897461, + "rewards/rejected": -9.968184471130371, + "step": 400 + }, + { + "epoch": 0.26, + "grad_norm": 17.008028030395508, + "kl": 0.0, + "learning_rate": 4.1180654338549077e-07, + "logps/chosen": -234.20315551757812, + "logps/rejected": -342.5408020019531, + "loss": 0.1224, + "rewards/chosen": 2.086550235748291, + "rewards/margins": 10.627596855163574, + "rewards/rejected": -8.541048049926758, + "step": 405 + }, + { + "epoch": 0.26, + "grad_norm": 19.24855613708496, + "kl": 0.0, + "learning_rate": 4.100284495021337e-07, + "logps/chosen": -282.03106689453125, + "logps/rejected": -347.0184631347656, + "loss": 0.1212, + "rewards/chosen": 1.9099693298339844, + "rewards/margins": 10.274763107299805, + "rewards/rejected": -8.36479377746582, + "step": 410 + }, + { + "epoch": 0.27, + "grad_norm": 19.914691925048828, + "kl": 0.0, + "learning_rate": 4.082503556187766e-07, + "logps/chosen": -278.811767578125, + "logps/rejected": -348.43096923828125, + "loss": 0.1209, + "rewards/chosen": 1.7562357187271118, + "rewards/margins": 11.304061889648438, + "rewards/rejected": -9.547826766967773, + "step": 415 + }, + { + "epoch": 0.27, + "grad_norm": 32.35190963745117, + "kl": 0.0, + "learning_rate": 4.064722617354196e-07, + "logps/chosen": -272.9192199707031, + "logps/rejected": -339.4073181152344, + "loss": 0.1005, + "rewards/chosen": 2.1069161891937256, + "rewards/margins": 11.689797401428223, + "rewards/rejected": -9.582880973815918, + "step": 420 + }, + { + "epoch": 0.27, + "grad_norm": 16.27385902404785, + "kl": 0.0, + "learning_rate": 4.0469416785206256e-07, + "logps/chosen": -233.26083374023438, + "logps/rejected": -329.8385314941406, + "loss": 0.1135, + "rewards/chosen": 2.1503446102142334, + "rewards/margins": 10.886509895324707, + "rewards/rejected": -8.736165046691895, + "step": 425 + }, + { + "epoch": 0.28, + "grad_norm": 22.10724449157715, + "kl": 0.0, + "learning_rate": 4.0291607396870553e-07, + "logps/chosen": -267.9533996582031, + "logps/rejected": -359.1152038574219, + "loss": 0.1197, + "rewards/chosen": 1.8767788410186768, + "rewards/margins": 11.441519737243652, + "rewards/rejected": -9.564741134643555, + "step": 430 + }, + { + "epoch": 0.28, + "grad_norm": 29.721330642700195, + "kl": 0.0, + "learning_rate": 4.011379800853485e-07, + "logps/chosen": -268.9942932128906, + "logps/rejected": -377.97308349609375, + "loss": 0.1061, + "rewards/chosen": 2.016845703125, + "rewards/margins": 11.474628448486328, + "rewards/rejected": -9.457781791687012, + "step": 435 + }, + { + "epoch": 0.28, + "grad_norm": 24.95069122314453, + "kl": 0.0, + "learning_rate": 3.993598862019915e-07, + "logps/chosen": -279.73944091796875, + "logps/rejected": -370.55474853515625, + "loss": 0.1045, + "rewards/chosen": 2.340444803237915, + "rewards/margins": 11.733491897583008, + "rewards/rejected": -9.393046379089355, + "step": 440 + }, + { + "epoch": 0.28, + "grad_norm": 26.99854278564453, + "kl": 0.0, + "learning_rate": 3.975817923186344e-07, + "logps/chosen": -245.24417114257812, + "logps/rejected": -336.7696533203125, + "loss": 0.1234, + "rewards/chosen": 2.2276217937469482, + "rewards/margins": 10.63569164276123, + "rewards/rejected": -8.40807056427002, + "step": 445 + }, + { + "epoch": 0.29, + "grad_norm": 22.343507766723633, + "kl": 0.0, + "learning_rate": 3.9580369843527737e-07, + "logps/chosen": -250.68197631835938, + "logps/rejected": -364.1970520019531, + "loss": 0.1342, + "rewards/chosen": 2.1794769763946533, + "rewards/margins": 10.827537536621094, + "rewards/rejected": -8.64806079864502, + "step": 450 + }, + { + "epoch": 0.29, + "grad_norm": 16.532583236694336, + "kl": 0.0, + "learning_rate": 3.940256045519203e-07, + "logps/chosen": -246.8504638671875, + "logps/rejected": -336.2493896484375, + "loss": 0.1151, + "rewards/chosen": 1.9471546411514282, + "rewards/margins": 11.059728622436523, + "rewards/rejected": -9.112573623657227, + "step": 455 + }, + { + "epoch": 0.29, + "grad_norm": 19.48154067993164, + "kl": 0.0, + "learning_rate": 3.9224751066856327e-07, + "logps/chosen": -246.47671508789062, + "logps/rejected": -328.21038818359375, + "loss": 0.1156, + "rewards/chosen": 2.022249221801758, + "rewards/margins": 11.125974655151367, + "rewards/rejected": -9.103724479675293, + "step": 460 + }, + { + "epoch": 0.3, + "grad_norm": 17.954296112060547, + "kl": 0.0, + "learning_rate": 3.9046941678520624e-07, + "logps/chosen": -254.69808959960938, + "logps/rejected": -347.45758056640625, + "loss": 0.1254, + "rewards/chosen": 2.0911641120910645, + "rewards/margins": 11.680787086486816, + "rewards/rejected": -9.58962345123291, + "step": 465 + }, + { + "epoch": 0.3, + "grad_norm": 21.16987419128418, + "kl": 0.0, + "learning_rate": 3.886913229018492e-07, + "logps/chosen": -226.58358764648438, + "logps/rejected": -334.48138427734375, + "loss": 0.1437, + "rewards/chosen": 1.8157398700714111, + "rewards/margins": 10.399962425231934, + "rewards/rejected": -8.584221839904785, + "step": 470 + }, + { + "epoch": 0.3, + "grad_norm": 18.404508590698242, + "kl": 0.0, + "learning_rate": 3.8691322901849213e-07, + "logps/chosen": -263.36395263671875, + "logps/rejected": -337.2552795410156, + "loss": 0.1078, + "rewards/chosen": 2.1132290363311768, + "rewards/margins": 11.090356826782227, + "rewards/rejected": -8.977127075195312, + "step": 475 + }, + { + "epoch": 0.31, + "grad_norm": 18.355857849121094, + "kl": 0.0, + "learning_rate": 3.851351351351351e-07, + "logps/chosen": -256.04412841796875, + "logps/rejected": -356.82073974609375, + "loss": 0.1145, + "rewards/chosen": 1.9654195308685303, + "rewards/margins": 11.159137725830078, + "rewards/rejected": -9.193717956542969, + "step": 480 + }, + { + "epoch": 0.31, + "grad_norm": 16.297401428222656, + "kl": 0.0, + "learning_rate": 3.833570412517781e-07, + "logps/chosen": -330.457275390625, + "logps/rejected": -367.7776794433594, + "loss": 0.113, + "rewards/chosen": 1.8794314861297607, + "rewards/margins": 11.851155281066895, + "rewards/rejected": -9.971722602844238, + "step": 485 + }, + { + "epoch": 0.31, + "grad_norm": 22.856266021728516, + "kl": 0.0, + "learning_rate": 3.8157894736842105e-07, + "logps/chosen": -254.3659210205078, + "logps/rejected": -343.2192687988281, + "loss": 0.1138, + "rewards/chosen": 2.034006118774414, + "rewards/margins": 11.903759002685547, + "rewards/rejected": -9.869752883911133, + "step": 490 + }, + { + "epoch": 0.32, + "grad_norm": 21.443723678588867, + "kl": 0.0, + "learning_rate": 3.7980085348506403e-07, + "logps/chosen": -278.9993591308594, + "logps/rejected": -333.3291320800781, + "loss": 0.1287, + "rewards/chosen": 1.906224012374878, + "rewards/margins": 10.766765594482422, + "rewards/rejected": -8.860541343688965, + "step": 495 + }, + { + "epoch": 0.32, + "grad_norm": 24.97509002685547, + "kl": 0.0, + "learning_rate": 3.7802275960170695e-07, + "logps/chosen": -238.44290161132812, + "logps/rejected": -338.4499206542969, + "loss": 0.1142, + "rewards/chosen": 2.103523015975952, + "rewards/margins": 11.162189483642578, + "rewards/rejected": -9.058666229248047, + "step": 500 + }, + { + "epoch": 0.32, + "grad_norm": 22.563114166259766, + "kl": 0.0, + "learning_rate": 3.7624466571834987e-07, + "logps/chosen": -238.7085723876953, + "logps/rejected": -352.23663330078125, + "loss": 0.1396, + "rewards/chosen": 1.8689464330673218, + "rewards/margins": 10.945247650146484, + "rewards/rejected": -9.076301574707031, + "step": 505 + }, + { + "epoch": 0.33, + "grad_norm": 17.017030715942383, + "kl": 0.0, + "learning_rate": 3.7446657183499284e-07, + "logps/chosen": -241.0718231201172, + "logps/rejected": -340.98968505859375, + "loss": 0.1197, + "rewards/chosen": 1.9388542175292969, + "rewards/margins": 11.241914749145508, + "rewards/rejected": -9.303060531616211, + "step": 510 + }, + { + "epoch": 0.33, + "grad_norm": 16.066030502319336, + "kl": 0.0, + "learning_rate": 3.726884779516358e-07, + "logps/chosen": -224.85086059570312, + "logps/rejected": -349.51800537109375, + "loss": 0.1094, + "rewards/chosen": 2.1027920246124268, + "rewards/margins": 11.039840698242188, + "rewards/rejected": -8.937047958374023, + "step": 515 + }, + { + "epoch": 0.33, + "grad_norm": 17.983535766601562, + "kl": 0.0, + "learning_rate": 3.709103840682788e-07, + "logps/chosen": -269.41693115234375, + "logps/rejected": -347.45477294921875, + "loss": 0.1234, + "rewards/chosen": 2.2909629344940186, + "rewards/margins": 11.061802864074707, + "rewards/rejected": -8.770838737487793, + "step": 520 + }, + { + "epoch": 0.34, + "grad_norm": 32.42300796508789, + "kl": 0.0, + "learning_rate": 3.6913229018492176e-07, + "logps/chosen": -274.3470764160156, + "logps/rejected": -335.10345458984375, + "loss": 0.1186, + "rewards/chosen": 2.1239845752716064, + "rewards/margins": 11.3440523147583, + "rewards/rejected": -9.220067977905273, + "step": 525 + }, + { + "epoch": 0.34, + "grad_norm": 16.601402282714844, + "kl": 0.0, + "learning_rate": 3.6735419630156474e-07, + "logps/chosen": -232.43392944335938, + "logps/rejected": -354.2206726074219, + "loss": 0.1218, + "rewards/chosen": 2.117833137512207, + "rewards/margins": 11.192573547363281, + "rewards/rejected": -9.07474136352539, + "step": 530 + }, + { + "epoch": 0.34, + "grad_norm": 24.04266357421875, + "kl": 0.0, + "learning_rate": 3.655761024182077e-07, + "logps/chosen": -277.41241455078125, + "logps/rejected": -334.04815673828125, + "loss": 0.1025, + "rewards/chosen": 2.3604273796081543, + "rewards/margins": 11.987722396850586, + "rewards/rejected": -9.627294540405273, + "step": 535 + }, + { + "epoch": 0.35, + "grad_norm": 15.607158660888672, + "kl": 0.0, + "learning_rate": 3.637980085348506e-07, + "logps/chosen": -286.7027282714844, + "logps/rejected": -319.68896484375, + "loss": 0.1124, + "rewards/chosen": 2.4556756019592285, + "rewards/margins": 11.582249641418457, + "rewards/rejected": -9.126574516296387, + "step": 540 + }, + { + "epoch": 0.35, + "grad_norm": 22.26024055480957, + "kl": 0.0, + "learning_rate": 3.6201991465149355e-07, + "logps/chosen": -239.4503173828125, + "logps/rejected": -345.2344970703125, + "loss": 0.112, + "rewards/chosen": 1.9915698766708374, + "rewards/margins": 11.853796005249023, + "rewards/rejected": -9.862226486206055, + "step": 545 + }, + { + "epoch": 0.35, + "grad_norm": 29.981088638305664, + "kl": 0.0, + "learning_rate": 3.602418207681365e-07, + "logps/chosen": -241.8583984375, + "logps/rejected": -365.6163024902344, + "loss": 0.1207, + "rewards/chosen": 1.9029747247695923, + "rewards/margins": 11.538789749145508, + "rewards/rejected": -9.635814666748047, + "step": 550 + }, + { + "epoch": 0.36, + "grad_norm": 17.248044967651367, + "kl": 0.0, + "learning_rate": 3.584637268847795e-07, + "logps/chosen": -255.35635375976562, + "logps/rejected": -348.6636962890625, + "loss": 0.1267, + "rewards/chosen": 2.0199761390686035, + "rewards/margins": 11.458740234375, + "rewards/rejected": -9.438763618469238, + "step": 555 + }, + { + "epoch": 0.36, + "grad_norm": 20.46364974975586, + "kl": 0.0, + "learning_rate": 3.5668563300142247e-07, + "logps/chosen": -271.98846435546875, + "logps/rejected": -319.4902038574219, + "loss": 0.1209, + "rewards/chosen": 2.140854597091675, + "rewards/margins": 11.506881713867188, + "rewards/rejected": -9.36602783203125, + "step": 560 + }, + { + "epoch": 0.36, + "grad_norm": 13.751622200012207, + "kl": 0.0, + "learning_rate": 3.5490753911806545e-07, + "logps/chosen": -249.31838989257812, + "logps/rejected": -363.65313720703125, + "loss": 0.1066, + "rewards/chosen": 2.302562713623047, + "rewards/margins": 12.324169158935547, + "rewards/rejected": -10.0216064453125, + "step": 565 + }, + { + "epoch": 0.36, + "grad_norm": 23.35749053955078, + "kl": 0.0, + "learning_rate": 3.5312944523470837e-07, + "logps/chosen": -242.62533569335938, + "logps/rejected": -326.2989807128906, + "loss": 0.1213, + "rewards/chosen": 2.164386749267578, + "rewards/margins": 11.125585556030273, + "rewards/rejected": -8.961198806762695, + "step": 570 + }, + { + "epoch": 0.37, + "grad_norm": 22.92085075378418, + "kl": 0.0, + "learning_rate": 3.5135135135135134e-07, + "logps/chosen": -215.1080322265625, + "logps/rejected": -343.92315673828125, + "loss": 0.1096, + "rewards/chosen": 2.084261417388916, + "rewards/margins": 12.25261402130127, + "rewards/rejected": -10.168352127075195, + "step": 575 + }, + { + "epoch": 0.37, + "grad_norm": 19.450267791748047, + "kl": 0.0, + "learning_rate": 3.495732574679943e-07, + "logps/chosen": -217.7822723388672, + "logps/rejected": -329.91375732421875, + "loss": 0.1294, + "rewards/chosen": 2.3254265785217285, + "rewards/margins": 11.483414649963379, + "rewards/rejected": -9.157987594604492, + "step": 580 + }, + { + "epoch": 0.37, + "grad_norm": 18.078920364379883, + "kl": 0.0, + "learning_rate": 3.4779516358463724e-07, + "logps/chosen": -253.4072723388672, + "logps/rejected": -344.491455078125, + "loss": 0.0927, + "rewards/chosen": 2.3925743103027344, + "rewards/margins": 12.241676330566406, + "rewards/rejected": -9.849101066589355, + "step": 585 + }, + { + "epoch": 0.38, + "grad_norm": 17.867704391479492, + "kl": 0.0, + "learning_rate": 3.460170697012802e-07, + "logps/chosen": -253.86178588867188, + "logps/rejected": -367.8619689941406, + "loss": 0.1152, + "rewards/chosen": 2.281512975692749, + "rewards/margins": 12.078498840332031, + "rewards/rejected": -9.79698657989502, + "step": 590 + }, + { + "epoch": 0.38, + "grad_norm": 19.863170623779297, + "kl": 0.0, + "learning_rate": 3.442389758179232e-07, + "logps/chosen": -289.83990478515625, + "logps/rejected": -346.80987548828125, + "loss": 0.1098, + "rewards/chosen": 2.1612110137939453, + "rewards/margins": 12.505953788757324, + "rewards/rejected": -10.344742774963379, + "step": 595 + }, + { + "epoch": 0.38, + "grad_norm": 14.594950675964355, + "kl": 0.0, + "learning_rate": 3.424608819345661e-07, + "logps/chosen": -199.16708374023438, + "logps/rejected": -352.14447021484375, + "loss": 0.0929, + "rewards/chosen": 1.9940685033798218, + "rewards/margins": 12.916508674621582, + "rewards/rejected": -10.922439575195312, + "step": 600 + }, + { + "epoch": 0.39, + "grad_norm": 20.205915451049805, + "kl": 0.0, + "learning_rate": 3.406827880512091e-07, + "logps/chosen": -251.85546875, + "logps/rejected": -347.35113525390625, + "loss": 0.1137, + "rewards/chosen": 2.3019704818725586, + "rewards/margins": 12.349814414978027, + "rewards/rejected": -10.047843933105469, + "step": 605 + }, + { + "epoch": 0.39, + "grad_norm": 21.62274742126465, + "kl": 0.0, + "learning_rate": 3.3890469416785205e-07, + "logps/chosen": -241.4209442138672, + "logps/rejected": -366.311767578125, + "loss": 0.104, + "rewards/chosen": 2.3162879943847656, + "rewards/margins": 11.884176254272461, + "rewards/rejected": -9.567889213562012, + "step": 610 + }, + { + "epoch": 0.39, + "grad_norm": 18.622377395629883, + "kl": 0.0, + "learning_rate": 3.37126600284495e-07, + "logps/chosen": -270.2699279785156, + "logps/rejected": -338.7982482910156, + "loss": 0.1127, + "rewards/chosen": 2.2165682315826416, + "rewards/margins": 11.826861381530762, + "rewards/rejected": -9.6102933883667, + "step": 615 + }, + { + "epoch": 0.4, + "grad_norm": 19.930374145507812, + "kl": 0.0, + "learning_rate": 3.35348506401138e-07, + "logps/chosen": -265.74432373046875, + "logps/rejected": -366.4137268066406, + "loss": 0.1235, + "rewards/chosen": 2.140533924102783, + "rewards/margins": 12.077839851379395, + "rewards/rejected": -9.937305450439453, + "step": 620 + }, + { + "epoch": 0.4, + "grad_norm": 20.78235626220703, + "kl": 0.0, + "learning_rate": 3.335704125177809e-07, + "logps/chosen": -251.783935546875, + "logps/rejected": -321.7081298828125, + "loss": 0.1251, + "rewards/chosen": 2.4168410301208496, + "rewards/margins": 11.567142486572266, + "rewards/rejected": -9.150300979614258, + "step": 625 + }, + { + "epoch": 0.4, + "grad_norm": 18.487886428833008, + "kl": 0.0, + "learning_rate": 3.3179231863442384e-07, + "logps/chosen": -244.60250854492188, + "logps/rejected": -386.168212890625, + "loss": 0.0922, + "rewards/chosen": 2.371992588043213, + "rewards/margins": 13.357122421264648, + "rewards/rejected": -10.985128402709961, + "step": 630 + }, + { + "epoch": 0.41, + "grad_norm": 16.354122161865234, + "kl": 0.0, + "learning_rate": 3.300142247510668e-07, + "logps/chosen": -252.8701171875, + "logps/rejected": -334.31744384765625, + "loss": 0.1003, + "rewards/chosen": 2.335937023162842, + "rewards/margins": 12.5576753616333, + "rewards/rejected": -10.2217378616333, + "step": 635 + }, + { + "epoch": 0.41, + "grad_norm": 16.126724243164062, + "kl": 0.0, + "learning_rate": 3.282361308677098e-07, + "logps/chosen": -240.2421417236328, + "logps/rejected": -346.31414794921875, + "loss": 0.1069, + "rewards/chosen": 2.2422432899475098, + "rewards/margins": 11.99687671661377, + "rewards/rejected": -9.754633903503418, + "step": 640 + }, + { + "epoch": 0.41, + "grad_norm": 21.066267013549805, + "kl": 0.0, + "learning_rate": 3.2645803698435276e-07, + "logps/chosen": -264.81964111328125, + "logps/rejected": -338.95379638671875, + "loss": 0.1126, + "rewards/chosen": 2.3289685249328613, + "rewards/margins": 11.437704086303711, + "rewards/rejected": -9.108736038208008, + "step": 645 + }, + { + "epoch": 0.42, + "grad_norm": 14.892672538757324, + "kl": 0.0, + "learning_rate": 3.2467994310099573e-07, + "logps/chosen": -240.17236328125, + "logps/rejected": -355.543212890625, + "loss": 0.0884, + "rewards/chosen": 2.1047472953796387, + "rewards/margins": 12.319875717163086, + "rewards/rejected": -10.215127944946289, + "step": 650 + }, + { + "epoch": 0.42, + "grad_norm": 22.722187042236328, + "kl": 0.0, + "learning_rate": 3.229018492176387e-07, + "logps/chosen": -235.84341430664062, + "logps/rejected": -363.74041748046875, + "loss": 0.0958, + "rewards/chosen": 2.262721300125122, + "rewards/margins": 12.947868347167969, + "rewards/rejected": -10.685147285461426, + "step": 655 + }, + { + "epoch": 0.42, + "grad_norm": 16.061914443969727, + "kl": 0.0, + "learning_rate": 3.211237553342817e-07, + "logps/chosen": -232.480224609375, + "logps/rejected": -363.93792724609375, + "loss": 0.0947, + "rewards/chosen": 2.245720863342285, + "rewards/margins": 13.019895553588867, + "rewards/rejected": -10.774174690246582, + "step": 660 + }, + { + "epoch": 0.43, + "grad_norm": 17.95708656311035, + "kl": 0.0, + "learning_rate": 3.193456614509246e-07, + "logps/chosen": -266.2756652832031, + "logps/rejected": -349.5681457519531, + "loss": 0.0981, + "rewards/chosen": 2.0552451610565186, + "rewards/margins": 12.47476863861084, + "rewards/rejected": -10.419523239135742, + "step": 665 + }, + { + "epoch": 0.43, + "grad_norm": 22.32909393310547, + "kl": 0.0, + "learning_rate": 3.175675675675675e-07, + "logps/chosen": -262.5782165527344, + "logps/rejected": -350.5921936035156, + "loss": 0.1155, + "rewards/chosen": 2.324476718902588, + "rewards/margins": 12.327436447143555, + "rewards/rejected": -10.002958297729492, + "step": 670 + }, + { + "epoch": 0.43, + "grad_norm": 14.884044647216797, + "kl": 0.0, + "learning_rate": 3.157894736842105e-07, + "logps/chosen": -257.66094970703125, + "logps/rejected": -344.09869384765625, + "loss": 0.096, + "rewards/chosen": 2.2759206295013428, + "rewards/margins": 13.264117240905762, + "rewards/rejected": -10.988197326660156, + "step": 675 + }, + { + "epoch": 0.44, + "grad_norm": 18.041053771972656, + "kl": 0.0, + "learning_rate": 3.1401137980085347e-07, + "logps/chosen": -270.93646240234375, + "logps/rejected": -360.5249938964844, + "loss": 0.0958, + "rewards/chosen": 2.366556167602539, + "rewards/margins": 13.1105318069458, + "rewards/rejected": -10.743974685668945, + "step": 680 + }, + { + "epoch": 0.44, + "grad_norm": 17.0706729888916, + "kl": 0.0, + "learning_rate": 3.1223328591749644e-07, + "logps/chosen": -214.73690795898438, + "logps/rejected": -342.7330017089844, + "loss": 0.1107, + "rewards/chosen": 2.152832508087158, + "rewards/margins": 12.126806259155273, + "rewards/rejected": -9.973973274230957, + "step": 685 + }, + { + "epoch": 0.44, + "grad_norm": 25.26626968383789, + "kl": 0.0, + "learning_rate": 3.104551920341394e-07, + "logps/chosen": -293.3183898925781, + "logps/rejected": -356.9164123535156, + "loss": 0.0973, + "rewards/chosen": 2.3354926109313965, + "rewards/margins": 13.199078559875488, + "rewards/rejected": -10.863585472106934, + "step": 690 + }, + { + "epoch": 0.44, + "grad_norm": 28.42453384399414, + "kl": 0.0, + "learning_rate": 3.0867709815078234e-07, + "logps/chosen": -285.2272644042969, + "logps/rejected": -367.3414001464844, + "loss": 0.1091, + "rewards/chosen": 2.12477707862854, + "rewards/margins": 13.724832534790039, + "rewards/rejected": -11.600054740905762, + "step": 695 + }, + { + "epoch": 0.45, + "grad_norm": 17.899433135986328, + "kl": 0.0, + "learning_rate": 3.068990042674253e-07, + "logps/chosen": -262.5505065917969, + "logps/rejected": -371.45025634765625, + "loss": 0.1083, + "rewards/chosen": 2.060147762298584, + "rewards/margins": 12.760570526123047, + "rewards/rejected": -10.700422286987305, + "step": 700 + }, + { + "epoch": 0.45, + "grad_norm": 19.646583557128906, + "kl": 0.0, + "learning_rate": 3.051209103840683e-07, + "logps/chosen": -274.9856262207031, + "logps/rejected": -360.3489685058594, + "loss": 0.1105, + "rewards/chosen": 2.4847919940948486, + "rewards/margins": 13.168169975280762, + "rewards/rejected": -10.683378219604492, + "step": 705 + }, + { + "epoch": 0.45, + "grad_norm": 14.362700462341309, + "kl": 0.0, + "learning_rate": 3.033428165007112e-07, + "logps/chosen": -253.5523681640625, + "logps/rejected": -336.803466796875, + "loss": 0.1024, + "rewards/chosen": 2.3810598850250244, + "rewards/margins": 12.508206367492676, + "rewards/rejected": -10.127145767211914, + "step": 710 + }, + { + "epoch": 0.46, + "grad_norm": 22.93763542175293, + "kl": 0.0, + "learning_rate": 3.015647226173542e-07, + "logps/chosen": -229.1265869140625, + "logps/rejected": -354.0566101074219, + "loss": 0.1135, + "rewards/chosen": 2.1574866771698, + "rewards/margins": 12.193730354309082, + "rewards/rejected": -10.036243438720703, + "step": 715 + }, + { + "epoch": 0.46, + "grad_norm": 22.314695358276367, + "kl": 0.0, + "learning_rate": 2.9978662873399715e-07, + "logps/chosen": -274.2774353027344, + "logps/rejected": -405.2040100097656, + "loss": 0.0953, + "rewards/chosen": 2.191157817840576, + "rewards/margins": 13.628583908081055, + "rewards/rejected": -11.43742561340332, + "step": 720 + }, + { + "epoch": 0.46, + "grad_norm": 20.03668975830078, + "kl": 0.0, + "learning_rate": 2.9800853485064007e-07, + "logps/chosen": -250.0018768310547, + "logps/rejected": -340.919677734375, + "loss": 0.1022, + "rewards/chosen": 2.4375367164611816, + "rewards/margins": 12.8548583984375, + "rewards/rejected": -10.417322158813477, + "step": 725 + }, + { + "epoch": 0.47, + "grad_norm": 20.21332359313965, + "kl": 0.0, + "learning_rate": 2.9623044096728305e-07, + "logps/chosen": -262.5707092285156, + "logps/rejected": -341.51556396484375, + "loss": 0.113, + "rewards/chosen": 2.2380969524383545, + "rewards/margins": 12.677546501159668, + "rewards/rejected": -10.439449310302734, + "step": 730 + }, + { + "epoch": 0.47, + "grad_norm": 18.686241149902344, + "kl": 0.0, + "learning_rate": 2.94452347083926e-07, + "logps/chosen": -259.5292053222656, + "logps/rejected": -340.3607482910156, + "loss": 0.1188, + "rewards/chosen": 2.1912286281585693, + "rewards/margins": 12.15953540802002, + "rewards/rejected": -9.968307495117188, + "step": 735 + }, + { + "epoch": 0.47, + "grad_norm": 19.728899002075195, + "kl": 0.0, + "learning_rate": 2.92674253200569e-07, + "logps/chosen": -249.8451385498047, + "logps/rejected": -381.3360290527344, + "loss": 0.0967, + "rewards/chosen": 2.2508890628814697, + "rewards/margins": 13.840009689331055, + "rewards/rejected": -11.589120864868164, + "step": 740 + }, + { + "epoch": 0.48, + "grad_norm": 17.338115692138672, + "kl": 0.0, + "learning_rate": 2.9089615931721197e-07, + "logps/chosen": -268.9515380859375, + "logps/rejected": -368.5078125, + "loss": 0.1015, + "rewards/chosen": 2.28879976272583, + "rewards/margins": 13.396432876586914, + "rewards/rejected": -11.107633590698242, + "step": 745 + }, + { + "epoch": 0.48, + "grad_norm": 19.87563133239746, + "kl": 0.0, + "learning_rate": 2.8911806543385494e-07, + "logps/chosen": -293.7141418457031, + "logps/rejected": -355.55535888671875, + "loss": 0.1071, + "rewards/chosen": 2.4517719745635986, + "rewards/margins": 13.449313163757324, + "rewards/rejected": -10.997541427612305, + "step": 750 + }, + { + "epoch": 0.48, + "grad_norm": 19.807565689086914, + "kl": 0.0, + "learning_rate": 2.873399715504978e-07, + "logps/chosen": -248.6322784423828, + "logps/rejected": -353.8228759765625, + "loss": 0.1088, + "rewards/chosen": 2.2985501289367676, + "rewards/margins": 12.722993850708008, + "rewards/rejected": -10.424444198608398, + "step": 755 + }, + { + "epoch": 0.49, + "grad_norm": 14.098715782165527, + "kl": 0.0, + "learning_rate": 2.855618776671408e-07, + "logps/chosen": -240.1453857421875, + "logps/rejected": -330.6163635253906, + "loss": 0.0891, + "rewards/chosen": 2.550762414932251, + "rewards/margins": 12.813929557800293, + "rewards/rejected": -10.263166427612305, + "step": 760 + }, + { + "epoch": 0.49, + "grad_norm": 16.266538619995117, + "kl": 0.0, + "learning_rate": 2.8378378378378376e-07, + "logps/chosen": -236.1558380126953, + "logps/rejected": -360.686767578125, + "loss": 0.0973, + "rewards/chosen": 2.2286760807037354, + "rewards/margins": 13.117490768432617, + "rewards/rejected": -10.888814926147461, + "step": 765 + }, + { + "epoch": 0.49, + "grad_norm": 16.414714813232422, + "kl": 0.0, + "learning_rate": 2.8200568990042673e-07, + "logps/chosen": -283.36944580078125, + "logps/rejected": -368.7194519042969, + "loss": 0.0967, + "rewards/chosen": 2.4076812267303467, + "rewards/margins": 12.956690788269043, + "rewards/rejected": -10.549009323120117, + "step": 770 + }, + { + "epoch": 0.5, + "grad_norm": 18.088735580444336, + "kl": 0.0, + "learning_rate": 2.802275960170697e-07, + "logps/chosen": -302.5070495605469, + "logps/rejected": -348.53289794921875, + "loss": 0.1047, + "rewards/chosen": 2.4432990550994873, + "rewards/margins": 12.935566902160645, + "rewards/rejected": -10.492268562316895, + "step": 775 + }, + { + "epoch": 0.5, + "grad_norm": 16.778833389282227, + "kl": 0.0, + "learning_rate": 2.784495021337127e-07, + "logps/chosen": -243.6858367919922, + "logps/rejected": -366.0279235839844, + "loss": 0.0852, + "rewards/chosen": 2.5511088371276855, + "rewards/margins": 13.161443710327148, + "rewards/rejected": -10.610334396362305, + "step": 780 + }, + { + "epoch": 0.5, + "grad_norm": 17.379335403442383, + "kl": 0.0, + "learning_rate": 2.766714082503556e-07, + "logps/chosen": -279.8948669433594, + "logps/rejected": -352.0287170410156, + "loss": 0.0957, + "rewards/chosen": 2.464888334274292, + "rewards/margins": 13.103485107421875, + "rewards/rejected": -10.638595581054688, + "step": 785 + }, + { + "epoch": 0.51, + "grad_norm": 21.737380981445312, + "kl": 0.0, + "learning_rate": 2.7489331436699857e-07, + "logps/chosen": -258.93865966796875, + "logps/rejected": -345.5055236816406, + "loss": 0.1177, + "rewards/chosen": 2.079983711242676, + "rewards/margins": 12.90088176727295, + "rewards/rejected": -10.820898056030273, + "step": 790 + }, + { + "epoch": 0.51, + "grad_norm": 15.227828025817871, + "kl": 0.0, + "learning_rate": 2.7311522048364154e-07, + "logps/chosen": -233.96194458007812, + "logps/rejected": -370.0096130371094, + "loss": 0.1054, + "rewards/chosen": 2.696063280105591, + "rewards/margins": 13.426877975463867, + "rewards/rejected": -10.730813980102539, + "step": 795 + }, + { + "epoch": 0.51, + "grad_norm": 13.938480377197266, + "kl": 0.0, + "learning_rate": 2.7133712660028446e-07, + "logps/chosen": -243.730224609375, + "logps/rejected": -350.44183349609375, + "loss": 0.0976, + "rewards/chosen": 2.3282103538513184, + "rewards/margins": 13.082319259643555, + "rewards/rejected": -10.754108428955078, + "step": 800 + }, + { + "epoch": 0.52, + "grad_norm": 29.015178680419922, + "kl": 0.0, + "learning_rate": 2.6955903271692744e-07, + "logps/chosen": -216.66488647460938, + "logps/rejected": -368.9279479980469, + "loss": 0.1045, + "rewards/chosen": 2.2219786643981934, + "rewards/margins": 13.255975723266602, + "rewards/rejected": -11.033994674682617, + "step": 805 + }, + { + "epoch": 0.52, + "grad_norm": 12.759780883789062, + "kl": 0.0, + "learning_rate": 2.677809388335704e-07, + "logps/chosen": -232.0436248779297, + "logps/rejected": -358.2416076660156, + "loss": 0.0943, + "rewards/chosen": 2.235215425491333, + "rewards/margins": 12.901985168457031, + "rewards/rejected": -10.666769027709961, + "step": 810 + }, + { + "epoch": 0.52, + "grad_norm": 19.0715389251709, + "kl": 0.0, + "learning_rate": 2.6600284495021333e-07, + "logps/chosen": -245.0729217529297, + "logps/rejected": -379.9324645996094, + "loss": 0.1173, + "rewards/chosen": 2.423140287399292, + "rewards/margins": 13.03075885772705, + "rewards/rejected": -10.60761833190918, + "step": 815 + }, + { + "epoch": 0.52, + "grad_norm": 15.53042984008789, + "kl": 0.0, + "learning_rate": 2.642247510668563e-07, + "logps/chosen": -261.9339294433594, + "logps/rejected": -327.780517578125, + "loss": 0.0903, + "rewards/chosen": 2.4564433097839355, + "rewards/margins": 13.401847839355469, + "rewards/rejected": -10.945404052734375, + "step": 820 + }, + { + "epoch": 0.53, + "grad_norm": 26.67053985595703, + "kl": 0.0, + "learning_rate": 2.624466571834993e-07, + "logps/chosen": -248.34487915039062, + "logps/rejected": -345.79083251953125, + "loss": 0.1071, + "rewards/chosen": 2.359476089477539, + "rewards/margins": 13.113537788391113, + "rewards/rejected": -10.754061698913574, + "step": 825 + }, + { + "epoch": 0.53, + "grad_norm": 22.24759292602539, + "kl": 0.0, + "learning_rate": 2.6066856330014225e-07, + "logps/chosen": -262.4079895019531, + "logps/rejected": -364.2686462402344, + "loss": 0.1076, + "rewards/chosen": 2.37235689163208, + "rewards/margins": 13.127764701843262, + "rewards/rejected": -10.75540828704834, + "step": 830 + }, + { + "epoch": 0.53, + "grad_norm": 17.638994216918945, + "kl": 0.0, + "learning_rate": 2.5889046941678523e-07, + "logps/chosen": -273.74432373046875, + "logps/rejected": -383.0593566894531, + "loss": 0.0984, + "rewards/chosen": 2.275177478790283, + "rewards/margins": 14.033803939819336, + "rewards/rejected": -11.758626937866211, + "step": 835 + }, + { + "epoch": 0.54, + "grad_norm": 15.510226249694824, + "kl": 0.0, + "learning_rate": 2.5711237553342815e-07, + "logps/chosen": -268.3088073730469, + "logps/rejected": -372.16876220703125, + "loss": 0.075, + "rewards/chosen": 2.7448737621307373, + "rewards/margins": 14.005487442016602, + "rewards/rejected": -11.260614395141602, + "step": 840 + }, + { + "epoch": 0.54, + "grad_norm": 18.002506256103516, + "kl": 0.0, + "learning_rate": 2.5533428165007107e-07, + "logps/chosen": -214.6236572265625, + "logps/rejected": -379.697265625, + "loss": 0.1156, + "rewards/chosen": 2.090531349182129, + "rewards/margins": 14.037800788879395, + "rewards/rejected": -11.947270393371582, + "step": 845 + }, + { + "epoch": 0.54, + "grad_norm": 18.8648681640625, + "kl": 0.0, + "learning_rate": 2.5355618776671404e-07, + "logps/chosen": -230.46810913085938, + "logps/rejected": -345.8827209472656, + "loss": 0.1028, + "rewards/chosen": 2.3623595237731934, + "rewards/margins": 13.34724235534668, + "rewards/rejected": -10.984883308410645, + "step": 850 + }, + { + "epoch": 0.55, + "grad_norm": 15.514137268066406, + "kl": 0.0, + "learning_rate": 2.51778093883357e-07, + "logps/chosen": -224.1614227294922, + "logps/rejected": -365.08477783203125, + "loss": 0.0985, + "rewards/chosen": 2.2946102619171143, + "rewards/margins": 14.019018173217773, + "rewards/rejected": -11.724408149719238, + "step": 855 + }, + { + "epoch": 0.55, + "grad_norm": 17.679288864135742, + "kl": 0.0, + "learning_rate": 2.5e-07, + "logps/chosen": -231.95565795898438, + "logps/rejected": -349.056640625, + "loss": 0.0966, + "rewards/chosen": 2.3342556953430176, + "rewards/margins": 13.233372688293457, + "rewards/rejected": -10.899115562438965, + "step": 860 + }, + { + "epoch": 0.55, + "grad_norm": 19.04916000366211, + "kl": 0.0, + "learning_rate": 2.4822190611664296e-07, + "logps/chosen": -233.5281982421875, + "logps/rejected": -374.4831848144531, + "loss": 0.0922, + "rewards/chosen": 2.58280873298645, + "rewards/margins": 13.661170959472656, + "rewards/rejected": -11.078360557556152, + "step": 865 + }, + { + "epoch": 0.56, + "grad_norm": 16.834497451782227, + "kl": 0.0, + "learning_rate": 2.4644381223328594e-07, + "logps/chosen": -236.75106811523438, + "logps/rejected": -353.3365478515625, + "loss": 0.096, + "rewards/chosen": 2.7401375770568848, + "rewards/margins": 13.776086807250977, + "rewards/rejected": -11.035948753356934, + "step": 870 + }, + { + "epoch": 0.56, + "grad_norm": 24.789772033691406, + "kl": 0.0, + "learning_rate": 2.4466571834992886e-07, + "logps/chosen": -267.3653869628906, + "logps/rejected": -370.1556091308594, + "loss": 0.0949, + "rewards/chosen": 2.2669379711151123, + "rewards/margins": 13.626495361328125, + "rewards/rejected": -11.35955810546875, + "step": 875 + }, + { + "epoch": 0.56, + "grad_norm": 12.911144256591797, + "kl": 0.0, + "learning_rate": 2.4288762446657183e-07, + "logps/chosen": -250.23556518554688, + "logps/rejected": -363.0490417480469, + "loss": 0.0717, + "rewards/chosen": 2.6297707557678223, + "rewards/margins": 15.319549560546875, + "rewards/rejected": -12.689778327941895, + "step": 880 + }, + { + "epoch": 0.57, + "grad_norm": 18.159881591796875, + "kl": 0.0, + "learning_rate": 2.411095305832148e-07, + "logps/chosen": -276.918701171875, + "logps/rejected": -351.54132080078125, + "loss": 0.1062, + "rewards/chosen": 2.5637714862823486, + "rewards/margins": 13.2720365524292, + "rewards/rejected": -10.708267211914062, + "step": 885 + }, + { + "epoch": 0.57, + "grad_norm": 16.24278450012207, + "kl": 0.0, + "learning_rate": 2.393314366998578e-07, + "logps/chosen": -264.356689453125, + "logps/rejected": -345.72064208984375, + "loss": 0.1061, + "rewards/chosen": 2.57415509223938, + "rewards/margins": 13.096124649047852, + "rewards/rejected": -10.521968841552734, + "step": 890 + }, + { + "epoch": 0.57, + "grad_norm": 15.353395462036133, + "kl": 0.0, + "learning_rate": 2.375533428165007e-07, + "logps/chosen": -245.558349609375, + "logps/rejected": -346.11151123046875, + "loss": 0.1098, + "rewards/chosen": 2.0979220867156982, + "rewards/margins": 13.061151504516602, + "rewards/rejected": -10.963228225708008, + "step": 895 + }, + { + "epoch": 0.58, + "grad_norm": 15.291166305541992, + "kl": 0.0, + "learning_rate": 2.3577524893314365e-07, + "logps/chosen": -268.0149841308594, + "logps/rejected": -359.99676513671875, + "loss": 0.0935, + "rewards/chosen": 2.660017251968384, + "rewards/margins": 13.785112380981445, + "rewards/rejected": -11.125094413757324, + "step": 900 + }, + { + "epoch": 0.58, + "grad_norm": 27.338308334350586, + "kl": 0.0, + "learning_rate": 2.3399715504978662e-07, + "logps/chosen": -223.98062133789062, + "logps/rejected": -382.53924560546875, + "loss": 0.0733, + "rewards/chosen": 2.573464870452881, + "rewards/margins": 14.070528030395508, + "rewards/rejected": -11.497062683105469, + "step": 905 + }, + { + "epoch": 0.58, + "grad_norm": 28.126638412475586, + "kl": 0.0, + "learning_rate": 2.322190611664296e-07, + "logps/chosen": -225.90432739257812, + "logps/rejected": -369.765380859375, + "loss": 0.1068, + "rewards/chosen": 2.4907584190368652, + "rewards/margins": 13.410183906555176, + "rewards/rejected": -10.919425964355469, + "step": 910 + }, + { + "epoch": 0.59, + "grad_norm": 23.488555908203125, + "kl": 0.0, + "learning_rate": 2.304409672830725e-07, + "logps/chosen": -264.053466796875, + "logps/rejected": -347.4461364746094, + "loss": 0.0978, + "rewards/chosen": 2.3625271320343018, + "rewards/margins": 13.296697616577148, + "rewards/rejected": -10.934170722961426, + "step": 915 + }, + { + "epoch": 0.59, + "grad_norm": 16.538496017456055, + "kl": 0.0, + "learning_rate": 2.2866287339971549e-07, + "logps/chosen": -305.2149353027344, + "logps/rejected": -387.3183898925781, + "loss": 0.0895, + "rewards/chosen": 2.51884126663208, + "rewards/margins": 14.519546508789062, + "rewards/rejected": -12.000704765319824, + "step": 920 + }, + { + "epoch": 0.59, + "grad_norm": 14.570239067077637, + "kl": 0.0, + "learning_rate": 2.2688477951635846e-07, + "logps/chosen": -288.55859375, + "logps/rejected": -360.0328063964844, + "loss": 0.0891, + "rewards/chosen": 2.6060962677001953, + "rewards/margins": 13.63697338104248, + "rewards/rejected": -11.030878067016602, + "step": 925 + }, + { + "epoch": 0.6, + "grad_norm": 14.252922058105469, + "kl": 0.0, + "learning_rate": 2.251066856330014e-07, + "logps/chosen": -237.12985229492188, + "logps/rejected": -375.7847900390625, + "loss": 0.0968, + "rewards/chosen": 2.277179718017578, + "rewards/margins": 13.618906021118164, + "rewards/rejected": -11.341727256774902, + "step": 930 + }, + { + "epoch": 0.6, + "grad_norm": 14.224340438842773, + "kl": 0.0, + "learning_rate": 2.2332859174964438e-07, + "logps/chosen": -247.994384765625, + "logps/rejected": -384.1309509277344, + "loss": 0.0921, + "rewards/chosen": 2.5065815448760986, + "rewards/margins": 14.67822551727295, + "rewards/rejected": -12.17164421081543, + "step": 935 + }, + { + "epoch": 0.6, + "grad_norm": 15.533319473266602, + "kl": 0.0, + "learning_rate": 2.2155049786628733e-07, + "logps/chosen": -217.72982788085938, + "logps/rejected": -364.0068054199219, + "loss": 0.1044, + "rewards/chosen": 2.4342358112335205, + "rewards/margins": 13.825854301452637, + "rewards/rejected": -11.391618728637695, + "step": 940 + }, + { + "epoch": 0.6, + "grad_norm": 12.951379776000977, + "kl": 0.0, + "learning_rate": 2.1977240398293027e-07, + "logps/chosen": -216.9298858642578, + "logps/rejected": -391.88916015625, + "loss": 0.0959, + "rewards/chosen": 2.2248167991638184, + "rewards/margins": 14.502099990844727, + "rewards/rejected": -12.27728271484375, + "step": 945 + }, + { + "epoch": 0.61, + "grad_norm": 12.726017951965332, + "kl": 0.0, + "learning_rate": 2.1799431009957325e-07, + "logps/chosen": -250.4014129638672, + "logps/rejected": -367.2666931152344, + "loss": 0.099, + "rewards/chosen": 2.6326889991760254, + "rewards/margins": 14.203544616699219, + "rewards/rejected": -11.570856094360352, + "step": 950 + }, + { + "epoch": 0.61, + "grad_norm": 24.116321563720703, + "kl": 0.0, + "learning_rate": 2.1621621621621622e-07, + "logps/chosen": -256.0219421386719, + "logps/rejected": -359.89410400390625, + "loss": 0.1114, + "rewards/chosen": 2.291504383087158, + "rewards/margins": 13.204824447631836, + "rewards/rejected": -10.913320541381836, + "step": 955 + }, + { + "epoch": 0.61, + "grad_norm": 21.19695472717285, + "kl": 0.0, + "learning_rate": 2.1443812233285914e-07, + "logps/chosen": -253.9010009765625, + "logps/rejected": -348.592041015625, + "loss": 0.0887, + "rewards/chosen": 2.41105580329895, + "rewards/margins": 13.58563232421875, + "rewards/rejected": -11.174577713012695, + "step": 960 + }, + { + "epoch": 0.62, + "grad_norm": 19.317626953125, + "kl": 0.0, + "learning_rate": 2.1266002844950212e-07, + "logps/chosen": -245.3134765625, + "logps/rejected": -380.17327880859375, + "loss": 0.0998, + "rewards/chosen": 2.4610061645507812, + "rewards/margins": 13.953335762023926, + "rewards/rejected": -11.492330551147461, + "step": 965 + }, + { + "epoch": 0.62, + "grad_norm": 22.688295364379883, + "kl": 0.0, + "learning_rate": 2.108819345661451e-07, + "logps/chosen": -278.2052917480469, + "logps/rejected": -357.1881408691406, + "loss": 0.1063, + "rewards/chosen": 2.404897928237915, + "rewards/margins": 13.51880931854248, + "rewards/rejected": -11.113912582397461, + "step": 970 + }, + { + "epoch": 0.62, + "grad_norm": 14.956331253051758, + "kl": 0.0, + "learning_rate": 2.0910384068278806e-07, + "logps/chosen": -274.7865295410156, + "logps/rejected": -348.69024658203125, + "loss": 0.1178, + "rewards/chosen": 2.560181140899658, + "rewards/margins": 13.09942626953125, + "rewards/rejected": -10.539244651794434, + "step": 975 + }, + { + "epoch": 0.63, + "grad_norm": 14.130335807800293, + "kl": 0.0, + "learning_rate": 2.0732574679943098e-07, + "logps/chosen": -241.42257690429688, + "logps/rejected": -374.1187438964844, + "loss": 0.0728, + "rewards/chosen": 2.970536470413208, + "rewards/margins": 14.709304809570312, + "rewards/rejected": -11.738768577575684, + "step": 980 + }, + { + "epoch": 0.63, + "grad_norm": 20.794334411621094, + "kl": 0.0, + "learning_rate": 2.0554765291607396e-07, + "logps/chosen": -230.2430419921875, + "logps/rejected": -380.54608154296875, + "loss": 0.09, + "rewards/chosen": 2.673881769180298, + "rewards/margins": 14.4403076171875, + "rewards/rejected": -11.766425132751465, + "step": 985 + }, + { + "epoch": 0.63, + "grad_norm": 16.297340393066406, + "kl": 0.0, + "learning_rate": 2.0376955903271693e-07, + "logps/chosen": -264.10137939453125, + "logps/rejected": -372.11346435546875, + "loss": 0.0896, + "rewards/chosen": 2.7595813274383545, + "rewards/margins": 13.790555953979492, + "rewards/rejected": -11.030974388122559, + "step": 990 + }, + { + "epoch": 0.64, + "grad_norm": 20.304162979125977, + "kl": 0.0, + "learning_rate": 2.0199146514935988e-07, + "logps/chosen": -243.2149200439453, + "logps/rejected": -342.080322265625, + "loss": 0.1091, + "rewards/chosen": 2.3006443977355957, + "rewards/margins": 13.069559097290039, + "rewards/rejected": -10.768914222717285, + "step": 995 + }, + { + "epoch": 0.64, + "grad_norm": 19.80646514892578, + "kl": 0.0, + "learning_rate": 2.0021337126600283e-07, + "logps/chosen": -251.78524780273438, + "logps/rejected": -360.4002380371094, + "loss": 0.088, + "rewards/chosen": 2.7110652923583984, + "rewards/margins": 13.96354866027832, + "rewards/rejected": -11.252483367919922, + "step": 1000 + }, + { + "epoch": 0.64, + "grad_norm": 19.04472541809082, + "kl": 0.0, + "learning_rate": 1.984352773826458e-07, + "logps/chosen": -227.35873413085938, + "logps/rejected": -363.24713134765625, + "loss": 0.0952, + "rewards/chosen": 2.7294869422912598, + "rewards/margins": 14.287317276000977, + "rewards/rejected": -11.557830810546875, + "step": 1005 + }, + { + "epoch": 0.65, + "grad_norm": 14.0511474609375, + "kl": 0.0, + "learning_rate": 1.9665718349928875e-07, + "logps/chosen": -248.2740478515625, + "logps/rejected": -359.29534912109375, + "loss": 0.0783, + "rewards/chosen": 2.7295162677764893, + "rewards/margins": 14.418438911437988, + "rewards/rejected": -11.688921928405762, + "step": 1010 + }, + { + "epoch": 0.65, + "grad_norm": 21.88315773010254, + "kl": 0.0, + "learning_rate": 1.9487908961593172e-07, + "logps/chosen": -228.99447631835938, + "logps/rejected": -387.5201721191406, + "loss": 0.0771, + "rewards/chosen": 2.3790504932403564, + "rewards/margins": 13.84051513671875, + "rewards/rejected": -11.461464881896973, + "step": 1015 + }, + { + "epoch": 0.65, + "grad_norm": 15.932575225830078, + "kl": 0.0, + "learning_rate": 1.931009957325747e-07, + "logps/chosen": -278.6766357421875, + "logps/rejected": -372.94793701171875, + "loss": 0.0826, + "rewards/chosen": 2.363306760787964, + "rewards/margins": 14.281936645507812, + "rewards/rejected": -11.918628692626953, + "step": 1020 + }, + { + "epoch": 0.66, + "grad_norm": 11.954800605773926, + "kl": 0.0, + "learning_rate": 1.9132290184921761e-07, + "logps/chosen": -223.6442108154297, + "logps/rejected": -371.5616149902344, + "loss": 0.0961, + "rewards/chosen": 2.332761526107788, + "rewards/margins": 13.71965217590332, + "rewards/rejected": -11.386890411376953, + "step": 1025 + }, + { + "epoch": 0.66, + "grad_norm": 14.154836654663086, + "kl": 0.0, + "learning_rate": 1.895448079658606e-07, + "logps/chosen": -223.6517791748047, + "logps/rejected": -386.1749267578125, + "loss": 0.0908, + "rewards/chosen": 2.7226128578186035, + "rewards/margins": 14.438260078430176, + "rewards/rejected": -11.71564769744873, + "step": 1030 + }, + { + "epoch": 0.66, + "grad_norm": 18.134370803833008, + "kl": 0.0, + "learning_rate": 1.8776671408250356e-07, + "logps/chosen": -260.0550842285156, + "logps/rejected": -346.24127197265625, + "loss": 0.1007, + "rewards/chosen": 2.5339343547821045, + "rewards/margins": 13.723528861999512, + "rewards/rejected": -11.189595222473145, + "step": 1035 + }, + { + "epoch": 0.67, + "grad_norm": 15.416353225708008, + "kl": 0.0, + "learning_rate": 1.859886201991465e-07, + "logps/chosen": -247.7535400390625, + "logps/rejected": -371.42034912109375, + "loss": 0.0912, + "rewards/chosen": 2.3352103233337402, + "rewards/margins": 13.56823444366455, + "rewards/rejected": -11.233022689819336, + "step": 1040 + }, + { + "epoch": 0.67, + "grad_norm": 18.23054313659668, + "kl": 0.0, + "learning_rate": 1.8421052631578946e-07, + "logps/chosen": -213.8344268798828, + "logps/rejected": -369.88702392578125, + "loss": 0.0758, + "rewards/chosen": 2.6615304946899414, + "rewards/margins": 14.028742790222168, + "rewards/rejected": -11.367212295532227, + "step": 1045 + }, + { + "epoch": 0.67, + "grad_norm": 18.42432975769043, + "kl": 0.0, + "learning_rate": 1.8243243243243243e-07, + "logps/chosen": -238.5988006591797, + "logps/rejected": -392.305419921875, + "loss": 0.0965, + "rewards/chosen": 2.5309016704559326, + "rewards/margins": 14.56842041015625, + "rewards/rejected": -12.037518501281738, + "step": 1050 + }, + { + "epoch": 0.67, + "grad_norm": 15.395295143127441, + "kl": 0.0, + "learning_rate": 1.8065433854907538e-07, + "logps/chosen": -242.9210968017578, + "logps/rejected": -375.423828125, + "loss": 0.0985, + "rewards/chosen": 2.644166946411133, + "rewards/margins": 13.905471801757812, + "rewards/rejected": -11.26130485534668, + "step": 1055 + }, + { + "epoch": 0.68, + "grad_norm": 16.343006134033203, + "kl": 0.0, + "learning_rate": 1.7887624466571835e-07, + "logps/chosen": -198.0528106689453, + "logps/rejected": -354.1863098144531, + "loss": 0.092, + "rewards/chosen": 2.4538371562957764, + "rewards/margins": 12.678072929382324, + "rewards/rejected": -10.224235534667969, + "step": 1060 + }, + { + "epoch": 0.68, + "grad_norm": 19.673669815063477, + "kl": 0.0, + "learning_rate": 1.770981507823613e-07, + "logps/chosen": -241.1757049560547, + "logps/rejected": -339.1280822753906, + "loss": 0.088, + "rewards/chosen": 2.550891876220703, + "rewards/margins": 13.831698417663574, + "rewards/rejected": -11.280807495117188, + "step": 1065 + }, + { + "epoch": 0.68, + "grad_norm": 15.624777793884277, + "kl": 0.0, + "learning_rate": 1.7532005689900424e-07, + "logps/chosen": -247.8914031982422, + "logps/rejected": -364.5996398925781, + "loss": 0.1038, + "rewards/chosen": 2.5451152324676514, + "rewards/margins": 13.162847518920898, + "rewards/rejected": -10.617732048034668, + "step": 1070 + }, + { + "epoch": 0.69, + "grad_norm": 16.0270938873291, + "kl": 0.0, + "learning_rate": 1.7354196301564722e-07, + "logps/chosen": -271.2415466308594, + "logps/rejected": -348.6362609863281, + "loss": 0.0832, + "rewards/chosen": 2.6901183128356934, + "rewards/margins": 13.692281723022461, + "rewards/rejected": -11.002164840698242, + "step": 1075 + }, + { + "epoch": 0.69, + "grad_norm": 16.198070526123047, + "kl": 0.0, + "learning_rate": 1.717638691322902e-07, + "logps/chosen": -228.4770965576172, + "logps/rejected": -371.384521484375, + "loss": 0.0879, + "rewards/chosen": 2.317864179611206, + "rewards/margins": 14.147501945495605, + "rewards/rejected": -11.82963752746582, + "step": 1080 + }, + { + "epoch": 0.69, + "grad_norm": 24.463367462158203, + "kl": 0.0, + "learning_rate": 1.6998577524893314e-07, + "logps/chosen": -271.0051574707031, + "logps/rejected": -403.8445129394531, + "loss": 0.0817, + "rewards/chosen": 2.82243275642395, + "rewards/margins": 14.68195629119873, + "rewards/rejected": -11.859524726867676, + "step": 1085 + }, + { + "epoch": 0.7, + "grad_norm": 21.81173324584961, + "kl": 0.0, + "learning_rate": 1.6820768136557609e-07, + "logps/chosen": -209.64572143554688, + "logps/rejected": -360.5612487792969, + "loss": 0.0849, + "rewards/chosen": 2.4915404319763184, + "rewards/margins": 13.439886093139648, + "rewards/rejected": -10.948348045349121, + "step": 1090 + }, + { + "epoch": 0.7, + "grad_norm": 20.325061798095703, + "kl": 0.0, + "learning_rate": 1.6642958748221906e-07, + "logps/chosen": -244.2580108642578, + "logps/rejected": -348.1650390625, + "loss": 0.0804, + "rewards/chosen": 2.744297742843628, + "rewards/margins": 13.572134017944336, + "rewards/rejected": -10.827836990356445, + "step": 1095 + }, + { + "epoch": 0.7, + "grad_norm": 24.380229949951172, + "kl": 0.0, + "learning_rate": 1.64651493598862e-07, + "logps/chosen": -253.1858673095703, + "logps/rejected": -348.04144287109375, + "loss": 0.0846, + "rewards/chosen": 2.656752824783325, + "rewards/margins": 13.392297744750977, + "rewards/rejected": -10.735544204711914, + "step": 1100 + }, + { + "epoch": 0.71, + "grad_norm": 12.965998649597168, + "kl": 0.0, + "learning_rate": 1.6287339971550498e-07, + "logps/chosen": -262.8606262207031, + "logps/rejected": -356.92242431640625, + "loss": 0.0783, + "rewards/chosen": 2.8926401138305664, + "rewards/margins": 14.1354341506958, + "rewards/rejected": -11.242793083190918, + "step": 1105 + }, + { + "epoch": 0.71, + "grad_norm": 16.383338928222656, + "kl": 0.0, + "learning_rate": 1.6109530583214793e-07, + "logps/chosen": -300.94384765625, + "logps/rejected": -358.71588134765625, + "loss": 0.0951, + "rewards/chosen": 2.539355993270874, + "rewards/margins": 13.336338996887207, + "rewards/rejected": -10.79698371887207, + "step": 1110 + }, + { + "epoch": 0.71, + "grad_norm": 15.756513595581055, + "kl": 0.0, + "learning_rate": 1.5931721194879087e-07, + "logps/chosen": -261.84637451171875, + "logps/rejected": -374.3198547363281, + "loss": 0.0791, + "rewards/chosen": 2.755915880203247, + "rewards/margins": 14.211552619934082, + "rewards/rejected": -11.455635070800781, + "step": 1115 + }, + { + "epoch": 0.72, + "grad_norm": 18.087120056152344, + "kl": 0.0, + "learning_rate": 1.5753911806543385e-07, + "logps/chosen": -274.73370361328125, + "logps/rejected": -326.76123046875, + "loss": 0.0935, + "rewards/chosen": 2.786874294281006, + "rewards/margins": 12.87488079071045, + "rewards/rejected": -10.088006973266602, + "step": 1120 + }, + { + "epoch": 0.72, + "grad_norm": 15.65069580078125, + "kl": 0.0, + "learning_rate": 1.5576102418207682e-07, + "logps/chosen": -235.80062866210938, + "logps/rejected": -340.6771545410156, + "loss": 0.1005, + "rewards/chosen": 2.691729784011841, + "rewards/margins": 13.200857162475586, + "rewards/rejected": -10.509127616882324, + "step": 1125 + }, + { + "epoch": 0.72, + "grad_norm": 25.325843811035156, + "kl": 0.0, + "learning_rate": 1.5398293029871974e-07, + "logps/chosen": -244.3973388671875, + "logps/rejected": -343.04412841796875, + "loss": 0.0883, + "rewards/chosen": 2.485719680786133, + "rewards/margins": 13.161664962768555, + "rewards/rejected": -10.675946235656738, + "step": 1130 + }, + { + "epoch": 0.73, + "grad_norm": 17.044322967529297, + "kl": 0.0, + "learning_rate": 1.5220483641536272e-07, + "logps/chosen": -244.70559692382812, + "logps/rejected": -358.86920166015625, + "loss": 0.0987, + "rewards/chosen": 2.5949511528015137, + "rewards/margins": 13.485272407531738, + "rewards/rejected": -10.890320777893066, + "step": 1135 + }, + { + "epoch": 0.73, + "grad_norm": 27.13024139404297, + "kl": 0.0, + "learning_rate": 1.504267425320057e-07, + "logps/chosen": -247.91641235351562, + "logps/rejected": -343.4930419921875, + "loss": 0.0899, + "rewards/chosen": 2.4175989627838135, + "rewards/margins": 13.832735061645508, + "rewards/rejected": -11.415135383605957, + "step": 1140 + }, + { + "epoch": 0.73, + "grad_norm": 14.320551872253418, + "kl": 0.0, + "learning_rate": 1.4864864864864866e-07, + "logps/chosen": -246.96670532226562, + "logps/rejected": -370.0877990722656, + "loss": 0.0758, + "rewards/chosen": 2.691598653793335, + "rewards/margins": 13.812780380249023, + "rewards/rejected": -11.121182441711426, + "step": 1145 + }, + { + "epoch": 0.74, + "grad_norm": 16.582712173461914, + "kl": 0.0, + "learning_rate": 1.4687055476529158e-07, + "logps/chosen": -224.07785034179688, + "logps/rejected": -349.31927490234375, + "loss": 0.082, + "rewards/chosen": 2.6934258937835693, + "rewards/margins": 13.236696243286133, + "rewards/rejected": -10.543269157409668, + "step": 1150 + }, + { + "epoch": 0.74, + "grad_norm": 17.989919662475586, + "kl": 0.0, + "learning_rate": 1.4509246088193456e-07, + "logps/chosen": -243.8923797607422, + "logps/rejected": -367.2593688964844, + "loss": 0.0846, + "rewards/chosen": 2.7338271141052246, + "rewards/margins": 14.203822135925293, + "rewards/rejected": -11.469995498657227, + "step": 1155 + }, + { + "epoch": 0.74, + "grad_norm": 17.582975387573242, + "kl": 0.0, + "learning_rate": 1.4331436699857753e-07, + "logps/chosen": -227.1583709716797, + "logps/rejected": -378.5196228027344, + "loss": 0.1011, + "rewards/chosen": 2.531038284301758, + "rewards/margins": 13.578967094421387, + "rewards/rejected": -11.047929763793945, + "step": 1160 + }, + { + "epoch": 0.75, + "grad_norm": 8.821680068969727, + "kl": 0.0, + "learning_rate": 1.4153627311522048e-07, + "logps/chosen": -277.82684326171875, + "logps/rejected": -364.6512145996094, + "loss": 0.0742, + "rewards/chosen": 2.9555764198303223, + "rewards/margins": 14.14686393737793, + "rewards/rejected": -11.19128704071045, + "step": 1165 + }, + { + "epoch": 0.75, + "grad_norm": 15.817394256591797, + "kl": 0.0, + "learning_rate": 1.3975817923186345e-07, + "logps/chosen": -271.718017578125, + "logps/rejected": -352.4165954589844, + "loss": 0.0848, + "rewards/chosen": 2.5184245109558105, + "rewards/margins": 13.12096881866455, + "rewards/rejected": -10.602544784545898, + "step": 1170 + }, + { + "epoch": 0.75, + "grad_norm": 18.2441463470459, + "kl": 0.0, + "learning_rate": 1.379800853485064e-07, + "logps/chosen": -232.19467163085938, + "logps/rejected": -379.3485412597656, + "loss": 0.0909, + "rewards/chosen": 2.5335030555725098, + "rewards/margins": 14.489652633666992, + "rewards/rejected": -11.956149101257324, + "step": 1175 + }, + { + "epoch": 0.75, + "grad_norm": 30.594867706298828, + "kl": 0.0, + "learning_rate": 1.3620199146514935e-07, + "logps/chosen": -265.6886291503906, + "logps/rejected": -361.285400390625, + "loss": 0.0964, + "rewards/chosen": 2.450854778289795, + "rewards/margins": 13.442533493041992, + "rewards/rejected": -10.991681098937988, + "step": 1180 + }, + { + "epoch": 0.76, + "grad_norm": 12.528332710266113, + "kl": 0.0, + "learning_rate": 1.3442389758179232e-07, + "logps/chosen": -288.8167724609375, + "logps/rejected": -352.4346618652344, + "loss": 0.0741, + "rewards/chosen": 2.7889742851257324, + "rewards/margins": 14.034700393676758, + "rewards/rejected": -11.245725631713867, + "step": 1185 + }, + { + "epoch": 0.76, + "grad_norm": 13.565973281860352, + "kl": 0.0, + "learning_rate": 1.326458036984353e-07, + "logps/chosen": -263.8364562988281, + "logps/rejected": -346.26019287109375, + "loss": 0.0843, + "rewards/chosen": 2.638777256011963, + "rewards/margins": 13.351663589477539, + "rewards/rejected": -10.712886810302734, + "step": 1190 + }, + { + "epoch": 0.76, + "grad_norm": 21.124759674072266, + "kl": 0.0, + "learning_rate": 1.3086770981507821e-07, + "logps/chosen": -245.15576171875, + "logps/rejected": -372.0450134277344, + "loss": 0.0898, + "rewards/chosen": 2.657459259033203, + "rewards/margins": 14.276753425598145, + "rewards/rejected": -11.619293212890625, + "step": 1195 + }, + { + "epoch": 0.77, + "grad_norm": 15.396940231323242, + "kl": 0.0, + "learning_rate": 1.290896159317212e-07, + "logps/chosen": -247.9556121826172, + "logps/rejected": -371.3470764160156, + "loss": 0.0828, + "rewards/chosen": 2.4249672889709473, + "rewards/margins": 14.25316333770752, + "rewards/rejected": -11.828195571899414, + "step": 1200 + }, + { + "epoch": 0.77, + "grad_norm": 14.409557342529297, + "kl": 0.0, + "learning_rate": 1.2731152204836416e-07, + "logps/chosen": -252.07241821289062, + "logps/rejected": -361.1800842285156, + "loss": 0.085, + "rewards/chosen": 2.680985689163208, + "rewards/margins": 14.30175495147705, + "rewards/rejected": -11.620769500732422, + "step": 1205 + }, + { + "epoch": 0.77, + "grad_norm": 13.202485084533691, + "kl": 0.0, + "learning_rate": 1.255334281650071e-07, + "logps/chosen": -237.88565063476562, + "logps/rejected": -350.7021179199219, + "loss": 0.0874, + "rewards/chosen": 2.6545567512512207, + "rewards/margins": 13.95142936706543, + "rewards/rejected": -11.29687213897705, + "step": 1210 + }, + { + "epoch": 0.78, + "grad_norm": 28.25654411315918, + "kl": 0.0, + "learning_rate": 1.2375533428165005e-07, + "logps/chosen": -282.26031494140625, + "logps/rejected": -377.5342712402344, + "loss": 0.0771, + "rewards/chosen": 2.6503818035125732, + "rewards/margins": 14.911274909973145, + "rewards/rejected": -12.260892868041992, + "step": 1215 + }, + { + "epoch": 0.78, + "grad_norm": 20.115049362182617, + "kl": 0.0, + "learning_rate": 1.2197724039829303e-07, + "logps/chosen": -234.886962890625, + "logps/rejected": -347.6160583496094, + "loss": 0.0879, + "rewards/chosen": 2.644916296005249, + "rewards/margins": 14.264989852905273, + "rewards/rejected": -11.620074272155762, + "step": 1220 + }, + { + "epoch": 0.78, + "grad_norm": 17.797800064086914, + "kl": 0.0, + "learning_rate": 1.2019914651493598e-07, + "logps/chosen": -246.7455291748047, + "logps/rejected": -392.910400390625, + "loss": 0.0791, + "rewards/chosen": 2.706545352935791, + "rewards/margins": 14.843210220336914, + "rewards/rejected": -12.136663436889648, + "step": 1225 + }, + { + "epoch": 0.79, + "grad_norm": 12.662943840026855, + "kl": 0.0, + "learning_rate": 1.1842105263157894e-07, + "logps/chosen": -225.7235870361328, + "logps/rejected": -402.0716247558594, + "loss": 0.0925, + "rewards/chosen": 2.4518020153045654, + "rewards/margins": 14.207118034362793, + "rewards/rejected": -11.755315780639648, + "step": 1230 + }, + { + "epoch": 0.79, + "grad_norm": 16.056779861450195, + "kl": 0.0, + "learning_rate": 1.166429587482219e-07, + "logps/chosen": -215.66708374023438, + "logps/rejected": -375.41717529296875, + "loss": 0.0844, + "rewards/chosen": 2.730950355529785, + "rewards/margins": 14.23332691192627, + "rewards/rejected": -11.502375602722168, + "step": 1235 + }, + { + "epoch": 0.79, + "grad_norm": 16.644628524780273, + "kl": 0.0, + "learning_rate": 1.1486486486486487e-07, + "logps/chosen": -266.43878173828125, + "logps/rejected": -358.8226318359375, + "loss": 0.0835, + "rewards/chosen": 2.870779514312744, + "rewards/margins": 14.435707092285156, + "rewards/rejected": -11.56492805480957, + "step": 1240 + }, + { + "epoch": 0.8, + "grad_norm": 8.560157775878906, + "kl": 0.0, + "learning_rate": 1.1308677098150782e-07, + "logps/chosen": -238.55319213867188, + "logps/rejected": -390.98358154296875, + "loss": 0.0804, + "rewards/chosen": 2.759446620941162, + "rewards/margins": 14.843562126159668, + "rewards/rejected": -12.084115028381348, + "step": 1245 + }, + { + "epoch": 0.8, + "grad_norm": 16.91299819946289, + "kl": 0.0, + "learning_rate": 1.1130867709815078e-07, + "logps/chosen": -234.5216064453125, + "logps/rejected": -338.272705078125, + "loss": 0.093, + "rewards/chosen": 2.591742992401123, + "rewards/margins": 14.04884147644043, + "rewards/rejected": -11.457098007202148, + "step": 1250 + }, + { + "epoch": 0.8, + "grad_norm": 13.1578369140625, + "kl": 0.0, + "learning_rate": 1.0953058321479374e-07, + "logps/chosen": -275.4759826660156, + "logps/rejected": -379.9882507324219, + "loss": 0.0692, + "rewards/chosen": 2.984293222427368, + "rewards/margins": 15.574743270874023, + "rewards/rejected": -12.59045124053955, + "step": 1255 + }, + { + "epoch": 0.81, + "grad_norm": 17.815027236938477, + "kl": 0.0, + "learning_rate": 1.077524893314367e-07, + "logps/chosen": -267.3926696777344, + "logps/rejected": -353.8580017089844, + "loss": 0.0725, + "rewards/chosen": 2.5867929458618164, + "rewards/margins": 14.201835632324219, + "rewards/rejected": -11.615041732788086, + "step": 1260 + }, + { + "epoch": 0.81, + "grad_norm": 11.758225440979004, + "kl": 0.0, + "learning_rate": 1.0597439544807964e-07, + "logps/chosen": -249.6155242919922, + "logps/rejected": -364.9371337890625, + "loss": 0.0769, + "rewards/chosen": 2.7065796852111816, + "rewards/margins": 14.05157470703125, + "rewards/rejected": -11.344995498657227, + "step": 1265 + }, + { + "epoch": 0.81, + "grad_norm": 16.8466739654541, + "kl": 0.0, + "learning_rate": 1.0419630156472262e-07, + "logps/chosen": -238.05673217773438, + "logps/rejected": -373.03546142578125, + "loss": 0.0748, + "rewards/chosen": 2.9012789726257324, + "rewards/margins": 14.46104621887207, + "rewards/rejected": -11.55976676940918, + "step": 1270 + }, + { + "epoch": 0.82, + "grad_norm": 14.22836685180664, + "kl": 0.0, + "learning_rate": 1.0241820768136557e-07, + "logps/chosen": -224.74887084960938, + "logps/rejected": -332.44415283203125, + "loss": 0.084, + "rewards/chosen": 2.65814208984375, + "rewards/margins": 13.293182373046875, + "rewards/rejected": -10.635040283203125, + "step": 1275 + }, + { + "epoch": 0.82, + "grad_norm": 11.591891288757324, + "kl": 0.0, + "learning_rate": 1.0064011379800854e-07, + "logps/chosen": -280.44427490234375, + "logps/rejected": -365.83392333984375, + "loss": 0.0801, + "rewards/chosen": 2.5924744606018066, + "rewards/margins": 14.31025505065918, + "rewards/rejected": -11.717779159545898, + "step": 1280 + }, + { + "epoch": 0.82, + "grad_norm": 14.504942893981934, + "kl": 0.0, + "learning_rate": 9.886201991465149e-08, + "logps/chosen": -277.6451416015625, + "logps/rejected": -350.4718322753906, + "loss": 0.0661, + "rewards/chosen": 2.9364330768585205, + "rewards/margins": 14.681096076965332, + "rewards/rejected": -11.744662284851074, + "step": 1285 + }, + { + "epoch": 0.83, + "grad_norm": 16.065649032592773, + "kl": 0.0, + "learning_rate": 9.708392603129445e-08, + "logps/chosen": -262.8613586425781, + "logps/rejected": -391.71649169921875, + "loss": 0.0723, + "rewards/chosen": 2.7925260066986084, + "rewards/margins": 15.433810234069824, + "rewards/rejected": -12.641283988952637, + "step": 1290 + }, + { + "epoch": 0.83, + "grad_norm": 21.038543701171875, + "kl": 0.0, + "learning_rate": 9.530583214793741e-08, + "logps/chosen": -262.80584716796875, + "logps/rejected": -380.09625244140625, + "loss": 0.0801, + "rewards/chosen": 2.761460781097412, + "rewards/margins": 15.412538528442383, + "rewards/rejected": -12.651077270507812, + "step": 1295 + }, + { + "epoch": 0.83, + "grad_norm": 18.872102737426758, + "kl": 0.0, + "learning_rate": 9.352773826458037e-08, + "logps/chosen": -285.86712646484375, + "logps/rejected": -342.8421325683594, + "loss": 0.0819, + "rewards/chosen": 2.6993701457977295, + "rewards/margins": 14.52336311340332, + "rewards/rejected": -11.823991775512695, + "step": 1300 + }, + { + "epoch": 0.83, + "grad_norm": 15.029848098754883, + "kl": 0.0, + "learning_rate": 9.174964438122331e-08, + "logps/chosen": -245.3412322998047, + "logps/rejected": -356.58172607421875, + "loss": 0.094, + "rewards/chosen": 2.818472385406494, + "rewards/margins": 14.51873779296875, + "rewards/rejected": -11.700265884399414, + "step": 1305 + }, + { + "epoch": 0.84, + "grad_norm": 17.082958221435547, + "kl": 0.0, + "learning_rate": 8.997155049786629e-08, + "logps/chosen": -286.61383056640625, + "logps/rejected": -395.6304626464844, + "loss": 0.0716, + "rewards/chosen": 2.8008289337158203, + "rewards/margins": 15.597787857055664, + "rewards/rejected": -12.796960830688477, + "step": 1310 + }, + { + "epoch": 0.84, + "grad_norm": 22.768911361694336, + "kl": 0.0, + "learning_rate": 8.819345661450925e-08, + "logps/chosen": -250.6887664794922, + "logps/rejected": -358.8474426269531, + "loss": 0.0929, + "rewards/chosen": 2.7452361583709717, + "rewards/margins": 14.326945304870605, + "rewards/rejected": -11.581708908081055, + "step": 1315 + }, + { + "epoch": 0.84, + "grad_norm": 21.478580474853516, + "kl": 0.0, + "learning_rate": 8.64153627311522e-08, + "logps/chosen": -267.79974365234375, + "logps/rejected": -369.04815673828125, + "loss": 0.0838, + "rewards/chosen": 2.744828224182129, + "rewards/margins": 14.584070205688477, + "rewards/rejected": -11.839241027832031, + "step": 1320 + }, + { + "epoch": 0.85, + "grad_norm": 12.477672576904297, + "kl": 0.0, + "learning_rate": 8.463726884779517e-08, + "logps/chosen": -239.75399780273438, + "logps/rejected": -383.63812255859375, + "loss": 0.0833, + "rewards/chosen": 2.6762642860412598, + "rewards/margins": 14.272786140441895, + "rewards/rejected": -11.596521377563477, + "step": 1325 + }, + { + "epoch": 0.85, + "grad_norm": 19.415245056152344, + "kl": 0.0, + "learning_rate": 8.285917496443812e-08, + "logps/chosen": -258.97406005859375, + "logps/rejected": -372.96173095703125, + "loss": 0.0874, + "rewards/chosen": 2.7613649368286133, + "rewards/margins": 14.08574390411377, + "rewards/rejected": -11.324378967285156, + "step": 1330 + }, + { + "epoch": 0.85, + "grad_norm": 13.463936805725098, + "kl": 0.0, + "learning_rate": 8.108108108108108e-08, + "logps/chosen": -228.6225128173828, + "logps/rejected": -344.24688720703125, + "loss": 0.0777, + "rewards/chosen": 2.942438840866089, + "rewards/margins": 14.587237358093262, + "rewards/rejected": -11.644798278808594, + "step": 1335 + }, + { + "epoch": 0.86, + "grad_norm": 13.836956977844238, + "kl": 0.0, + "learning_rate": 7.930298719772404e-08, + "logps/chosen": -247.45742797851562, + "logps/rejected": -371.2969055175781, + "loss": 0.0721, + "rewards/chosen": 3.017321825027466, + "rewards/margins": 14.967196464538574, + "rewards/rejected": -11.949874877929688, + "step": 1340 + }, + { + "epoch": 0.86, + "grad_norm": 16.551321029663086, + "kl": 0.0, + "learning_rate": 7.7524893314367e-08, + "logps/chosen": -230.99227905273438, + "logps/rejected": -384.6443786621094, + "loss": 0.0871, + "rewards/chosen": 2.6670827865600586, + "rewards/margins": 14.841870307922363, + "rewards/rejected": -12.174787521362305, + "step": 1345 + }, + { + "epoch": 0.86, + "grad_norm": 15.87748908996582, + "kl": 0.0, + "learning_rate": 7.574679943100994e-08, + "logps/chosen": -266.7972717285156, + "logps/rejected": -335.5400390625, + "loss": 0.0976, + "rewards/chosen": 2.702658176422119, + "rewards/margins": 13.561399459838867, + "rewards/rejected": -10.858741760253906, + "step": 1350 + }, + { + "epoch": 0.87, + "grad_norm": 17.662395477294922, + "kl": 0.0, + "learning_rate": 7.396870554765292e-08, + "logps/chosen": -269.71026611328125, + "logps/rejected": -383.00628662109375, + "loss": 0.0921, + "rewards/chosen": 2.5814826488494873, + "rewards/margins": 14.34886646270752, + "rewards/rejected": -11.76738452911377, + "step": 1355 + }, + { + "epoch": 0.87, + "grad_norm": 16.09382438659668, + "kl": 0.0, + "learning_rate": 7.219061166429587e-08, + "logps/chosen": -269.3471984863281, + "logps/rejected": -375.32208251953125, + "loss": 0.0886, + "rewards/chosen": 2.8077282905578613, + "rewards/margins": 14.510396003723145, + "rewards/rejected": -11.702667236328125, + "step": 1360 + }, + { + "epoch": 0.87, + "grad_norm": 13.27104377746582, + "kl": 0.0, + "learning_rate": 7.041251778093883e-08, + "logps/chosen": -273.42950439453125, + "logps/rejected": -371.14837646484375, + "loss": 0.0685, + "rewards/chosen": 2.944091558456421, + "rewards/margins": 15.448992729187012, + "rewards/rejected": -12.504900932312012, + "step": 1365 + }, + { + "epoch": 0.88, + "grad_norm": 12.18476390838623, + "kl": 0.0, + "learning_rate": 6.863442389758179e-08, + "logps/chosen": -256.47161865234375, + "logps/rejected": -384.1866760253906, + "loss": 0.0742, + "rewards/chosen": 2.7861623764038086, + "rewards/margins": 15.34550952911377, + "rewards/rejected": -12.559347152709961, + "step": 1370 + }, + { + "epoch": 0.88, + "grad_norm": 23.00941276550293, + "kl": 0.0, + "learning_rate": 6.685633001422475e-08, + "logps/chosen": -242.3636474609375, + "logps/rejected": -404.00848388671875, + "loss": 0.0747, + "rewards/chosen": 2.5074126720428467, + "rewards/margins": 14.919418334960938, + "rewards/rejected": -12.412006378173828, + "step": 1375 + }, + { + "epoch": 0.88, + "grad_norm": 10.992572784423828, + "kl": 0.0, + "learning_rate": 6.507823613086771e-08, + "logps/chosen": -248.75967407226562, + "logps/rejected": -368.59063720703125, + "loss": 0.0867, + "rewards/chosen": 2.401717185974121, + "rewards/margins": 14.726252555847168, + "rewards/rejected": -12.32453441619873, + "step": 1380 + }, + { + "epoch": 0.89, + "grad_norm": 11.035951614379883, + "kl": 0.0, + "learning_rate": 6.330014224751067e-08, + "logps/chosen": -240.3590850830078, + "logps/rejected": -347.6131896972656, + "loss": 0.0921, + "rewards/chosen": 2.680974245071411, + "rewards/margins": 14.051698684692383, + "rewards/rejected": -11.370722770690918, + "step": 1385 + }, + { + "epoch": 0.89, + "grad_norm": 21.619415283203125, + "kl": 0.0, + "learning_rate": 6.152204836415363e-08, + "logps/chosen": -240.7654266357422, + "logps/rejected": -381.09307861328125, + "loss": 0.087, + "rewards/chosen": 2.646531105041504, + "rewards/margins": 14.633687019348145, + "rewards/rejected": -11.987154960632324, + "step": 1390 + }, + { + "epoch": 0.89, + "grad_norm": 20.996952056884766, + "kl": 0.0, + "learning_rate": 5.974395448079659e-08, + "logps/chosen": -228.12826538085938, + "logps/rejected": -373.8050231933594, + "loss": 0.0903, + "rewards/chosen": 2.5932085514068604, + "rewards/margins": 14.422918319702148, + "rewards/rejected": -11.829710960388184, + "step": 1395 + }, + { + "epoch": 0.9, + "grad_norm": 12.941699981689453, + "kl": 0.0, + "learning_rate": 5.796586059743954e-08, + "logps/chosen": -266.34759521484375, + "logps/rejected": -377.67572021484375, + "loss": 0.0733, + "rewards/chosen": 2.6969380378723145, + "rewards/margins": 14.922724723815918, + "rewards/rejected": -12.225786209106445, + "step": 1400 + }, + { + "epoch": 0.9, + "grad_norm": 18.563539505004883, + "kl": 0.0, + "learning_rate": 5.61877667140825e-08, + "logps/chosen": -233.05337524414062, + "logps/rejected": -351.65478515625, + "loss": 0.0819, + "rewards/chosen": 2.4380977153778076, + "rewards/margins": 14.38970947265625, + "rewards/rejected": -11.951611518859863, + "step": 1405 + }, + { + "epoch": 0.9, + "grad_norm": 14.77774715423584, + "kl": 0.0, + "learning_rate": 5.4409672830725456e-08, + "logps/chosen": -272.6517028808594, + "logps/rejected": -359.89013671875, + "loss": 0.0762, + "rewards/chosen": 2.888676404953003, + "rewards/margins": 14.460576057434082, + "rewards/rejected": -11.571900367736816, + "step": 1410 + }, + { + "epoch": 0.91, + "grad_norm": 15.590229988098145, + "kl": 0.0, + "learning_rate": 5.2631578947368416e-08, + "logps/chosen": -256.3451843261719, + "logps/rejected": -375.3144226074219, + "loss": 0.0762, + "rewards/chosen": 2.667102336883545, + "rewards/margins": 15.10230827331543, + "rewards/rejected": -12.435206413269043, + "step": 1415 + }, + { + "epoch": 0.91, + "grad_norm": 10.8274507522583, + "kl": 0.0, + "learning_rate": 5.0853485064011376e-08, + "logps/chosen": -268.83636474609375, + "logps/rejected": -372.27838134765625, + "loss": 0.0648, + "rewards/chosen": 2.9176650047302246, + "rewards/margins": 14.840046882629395, + "rewards/rejected": -11.922381401062012, + "step": 1420 + }, + { + "epoch": 0.91, + "grad_norm": 15.774340629577637, + "kl": 0.0, + "learning_rate": 4.9075391180654337e-08, + "logps/chosen": -260.6099853515625, + "logps/rejected": -358.04241943359375, + "loss": 0.0585, + "rewards/chosen": 2.9233345985412598, + "rewards/margins": 14.653867721557617, + "rewards/rejected": -11.730535507202148, + "step": 1425 + }, + { + "epoch": 0.91, + "grad_norm": 15.554561614990234, + "kl": 0.0, + "learning_rate": 4.72972972972973e-08, + "logps/chosen": -238.23739624023438, + "logps/rejected": -386.2373962402344, + "loss": 0.0833, + "rewards/chosen": 2.658268451690674, + "rewards/margins": 14.15376091003418, + "rewards/rejected": -11.49549388885498, + "step": 1430 + }, + { + "epoch": 0.92, + "grad_norm": 17.29914093017578, + "kl": 0.0, + "learning_rate": 4.551920341394026e-08, + "logps/chosen": -248.63204956054688, + "logps/rejected": -371.215087890625, + "loss": 0.0872, + "rewards/chosen": 2.9267373085021973, + "rewards/margins": 14.192481994628906, + "rewards/rejected": -11.265745162963867, + "step": 1435 + }, + { + "epoch": 0.92, + "grad_norm": 14.301885604858398, + "kl": 0.0, + "learning_rate": 4.374110953058322e-08, + "logps/chosen": -240.0107421875, + "logps/rejected": -355.2042541503906, + "loss": 0.0846, + "rewards/chosen": 2.4140007495880127, + "rewards/margins": 13.864062309265137, + "rewards/rejected": -11.450060844421387, + "step": 1440 + }, + { + "epoch": 0.92, + "grad_norm": 13.32541275024414, + "kl": 0.0, + "learning_rate": 4.196301564722617e-08, + "logps/chosen": -247.22152709960938, + "logps/rejected": -360.07110595703125, + "loss": 0.0659, + "rewards/chosen": 2.7749335765838623, + "rewards/margins": 14.867749214172363, + "rewards/rejected": -12.092815399169922, + "step": 1445 + }, + { + "epoch": 0.93, + "grad_norm": 12.356550216674805, + "kl": 0.0, + "learning_rate": 4.018492176386913e-08, + "logps/chosen": -256.69683837890625, + "logps/rejected": -363.08489990234375, + "loss": 0.0799, + "rewards/chosen": 2.8861594200134277, + "rewards/margins": 14.505203247070312, + "rewards/rejected": -11.619044303894043, + "step": 1450 + }, + { + "epoch": 0.93, + "grad_norm": 14.969185829162598, + "kl": 0.0, + "learning_rate": 3.840682788051209e-08, + "logps/chosen": -254.7665252685547, + "logps/rejected": -354.30841064453125, + "loss": 0.0757, + "rewards/chosen": 2.8421378135681152, + "rewards/margins": 14.373845100402832, + "rewards/rejected": -11.531707763671875, + "step": 1455 + }, + { + "epoch": 0.93, + "grad_norm": 17.030691146850586, + "kl": 0.0, + "learning_rate": 3.6628733997155046e-08, + "logps/chosen": -263.8084411621094, + "logps/rejected": -395.4825439453125, + "loss": 0.0759, + "rewards/chosen": 2.9121432304382324, + "rewards/margins": 15.505640983581543, + "rewards/rejected": -12.593496322631836, + "step": 1460 + }, + { + "epoch": 0.94, + "grad_norm": 12.211559295654297, + "kl": 0.0, + "learning_rate": 3.4850640113798006e-08, + "logps/chosen": -236.26431274414062, + "logps/rejected": -357.1573791503906, + "loss": 0.0755, + "rewards/chosen": 2.843881130218506, + "rewards/margins": 14.507527351379395, + "rewards/rejected": -11.66364574432373, + "step": 1465 + }, + { + "epoch": 0.94, + "grad_norm": 17.574806213378906, + "kl": 0.0, + "learning_rate": 3.3072546230440967e-08, + "logps/chosen": -285.59942626953125, + "logps/rejected": -368.08245849609375, + "loss": 0.0676, + "rewards/chosen": 2.877531051635742, + "rewards/margins": 14.671765327453613, + "rewards/rejected": -11.794233322143555, + "step": 1470 + }, + { + "epoch": 0.94, + "grad_norm": 13.167614936828613, + "kl": 0.0, + "learning_rate": 3.129445234708392e-08, + "logps/chosen": -240.7215576171875, + "logps/rejected": -382.0995788574219, + "loss": 0.0651, + "rewards/chosen": 2.778978109359741, + "rewards/margins": 14.833264350891113, + "rewards/rejected": -12.054286003112793, + "step": 1475 + }, + { + "epoch": 0.95, + "grad_norm": 19.462112426757812, + "kl": 0.0, + "learning_rate": 2.9516358463726884e-08, + "logps/chosen": -243.0684356689453, + "logps/rejected": -366.4331359863281, + "loss": 0.0802, + "rewards/chosen": 2.8256452083587646, + "rewards/margins": 15.236946105957031, + "rewards/rejected": -12.411300659179688, + "step": 1480 + }, + { + "epoch": 0.95, + "grad_norm": 20.093833923339844, + "kl": 0.0, + "learning_rate": 2.7738264580369844e-08, + "logps/chosen": -209.17770385742188, + "logps/rejected": -359.9656677246094, + "loss": 0.0839, + "rewards/chosen": 2.901711940765381, + "rewards/margins": 14.679159164428711, + "rewards/rejected": -11.777449607849121, + "step": 1485 + }, + { + "epoch": 0.95, + "grad_norm": 13.270203590393066, + "kl": 0.0, + "learning_rate": 2.59601706970128e-08, + "logps/chosen": -272.44610595703125, + "logps/rejected": -357.05975341796875, + "loss": 0.0868, + "rewards/chosen": 2.6972391605377197, + "rewards/margins": 15.144004821777344, + "rewards/rejected": -12.446764945983887, + "step": 1490 + }, + { + "epoch": 0.96, + "grad_norm": 14.922286987304688, + "kl": 0.0, + "learning_rate": 2.418207681365576e-08, + "logps/chosen": -238.18191528320312, + "logps/rejected": -367.4979553222656, + "loss": 0.0824, + "rewards/chosen": 2.5710983276367188, + "rewards/margins": 14.130340576171875, + "rewards/rejected": -11.559242248535156, + "step": 1495 + }, + { + "epoch": 0.96, + "grad_norm": 21.186229705810547, + "kl": 0.0, + "learning_rate": 2.240398293029872e-08, + "logps/chosen": -240.6565399169922, + "logps/rejected": -387.65936279296875, + "loss": 0.0783, + "rewards/chosen": 2.6365768909454346, + "rewards/margins": 15.131543159484863, + "rewards/rejected": -12.494966506958008, + "step": 1500 + }, + { + "epoch": 0.96, + "grad_norm": 14.05397891998291, + "kl": 0.0, + "learning_rate": 2.0625889046941676e-08, + "logps/chosen": -307.04583740234375, + "logps/rejected": -361.1058654785156, + "loss": 0.0903, + "rewards/chosen": 2.804368257522583, + "rewards/margins": 14.683168411254883, + "rewards/rejected": -11.878799438476562, + "step": 1505 + }, + { + "epoch": 0.97, + "grad_norm": 10.290091514587402, + "kl": 0.0, + "learning_rate": 1.8847795163584636e-08, + "logps/chosen": -280.72406005859375, + "logps/rejected": -393.7200622558594, + "loss": 0.0643, + "rewards/chosen": 2.76891827583313, + "rewards/margins": 15.010998725891113, + "rewards/rejected": -12.24207878112793, + "step": 1510 + }, + { + "epoch": 0.97, + "grad_norm": 13.187542915344238, + "kl": 0.0, + "learning_rate": 1.7069701280227596e-08, + "logps/chosen": -250.7726287841797, + "logps/rejected": -378.99993896484375, + "loss": 0.0697, + "rewards/chosen": 2.8748562335968018, + "rewards/margins": 15.209932327270508, + "rewards/rejected": -12.335077285766602, + "step": 1515 + }, + { + "epoch": 0.97, + "grad_norm": 13.64420223236084, + "kl": 0.0, + "learning_rate": 1.5291607396870554e-08, + "logps/chosen": -253.7127227783203, + "logps/rejected": -367.41558837890625, + "loss": 0.0824, + "rewards/chosen": 3.031802177429199, + "rewards/margins": 14.762832641601562, + "rewards/rejected": -11.73102855682373, + "step": 1520 + }, + { + "epoch": 0.98, + "grad_norm": 10.866116523742676, + "kl": 0.0, + "learning_rate": 1.3513513513513514e-08, + "logps/chosen": -250.94400024414062, + "logps/rejected": -381.5174865722656, + "loss": 0.0793, + "rewards/chosen": 2.8337435722351074, + "rewards/margins": 15.124834060668945, + "rewards/rejected": -12.29109001159668, + "step": 1525 + }, + { + "epoch": 0.98, + "grad_norm": 11.604872703552246, + "kl": 0.0, + "learning_rate": 1.1735419630156473e-08, + "logps/chosen": -233.5682830810547, + "logps/rejected": -377.261962890625, + "loss": 0.0687, + "rewards/chosen": 3.119915008544922, + "rewards/margins": 15.134057998657227, + "rewards/rejected": -12.014142990112305, + "step": 1530 + }, + { + "epoch": 0.98, + "grad_norm": 20.006731033325195, + "kl": 0.0, + "learning_rate": 9.95732574679943e-09, + "logps/chosen": -257.07525634765625, + "logps/rejected": -346.5194396972656, + "loss": 0.0907, + "rewards/chosen": 2.676340103149414, + "rewards/margins": 14.137725830078125, + "rewards/rejected": -11.461384773254395, + "step": 1535 + }, + { + "epoch": 0.99, + "grad_norm": 17.203874588012695, + "kl": 0.0, + "learning_rate": 8.179231863442388e-09, + "logps/chosen": -226.61843872070312, + "logps/rejected": -381.11029052734375, + "loss": 0.0683, + "rewards/chosen": 2.940342903137207, + "rewards/margins": 15.23707103729248, + "rewards/rejected": -12.296728134155273, + "step": 1540 + }, + { + "epoch": 0.99, + "grad_norm": 14.477093696594238, + "kl": 0.0, + "learning_rate": 6.401137980085348e-09, + "logps/chosen": -240.14157104492188, + "logps/rejected": -388.9137268066406, + "loss": 0.0658, + "rewards/chosen": 2.875831127166748, + "rewards/margins": 15.294588088989258, + "rewards/rejected": -12.418756484985352, + "step": 1545 + }, + { + "epoch": 0.99, + "grad_norm": 10.444808959960938, + "kl": 0.0, + "learning_rate": 4.623044096728307e-09, + "logps/chosen": -273.6938781738281, + "logps/rejected": -361.58782958984375, + "loss": 0.0592, + "rewards/chosen": 3.0903258323669434, + "rewards/margins": 15.089597702026367, + "rewards/rejected": -11.999273300170898, + "step": 1550 + }, + { + "epoch": 0.99, + "grad_norm": 13.01125717163086, + "kl": 0.0, + "learning_rate": 2.844950213371266e-09, + "logps/chosen": -263.2748107910156, + "logps/rejected": -382.49481201171875, + "loss": 0.0724, + "rewards/chosen": 2.801457166671753, + "rewards/margins": 14.89887809753418, + "rewards/rejected": -12.097421646118164, + "step": 1555 + }, + { + "epoch": 1.0, + "grad_norm": 13.397442817687988, + "kl": 0.0, + "learning_rate": 1.0668563300142248e-09, + "logps/chosen": -225.07894897460938, + "logps/rejected": -353.08856201171875, + "loss": 0.0824, + "rewards/chosen": 2.47590970993042, + "rewards/margins": 14.8174409866333, + "rewards/rejected": -12.341531753540039, + "step": 1560 + }, + { + "epoch": 1.0, + "step": 1563, + "total_flos": 0.0, + "train_loss": 0.11715243643320149, + "train_runtime": 11089.6574, + "train_samples_per_second": 9.017, + "train_steps_per_second": 0.141 + } + ], + "logging_steps": 5, + "max_steps": 1563, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 100, + "total_flos": 0.0, + "train_batch_size": 8, + "trial_name": null, + "trial_params": null +}