{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.9998741980123286, "eval_steps": 100, "global_step": 3974, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0, "grad_norm": 0.3828125, "learning_rate": 5.025125628140703e-09, "logits/chosen": 0.2628047466278076, "logits/rejected": 0.7914568185806274, "logps/chosen": -183.46725463867188, "logps/rejected": -164.62379455566406, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/margins_max": 0.0, "rewards/margins_min": 0.0, "rewards/margins_std": 0.0, "rewards/rejected": 0.0, "step": 1 }, { "epoch": 0.0, "grad_norm": 0.341796875, "learning_rate": 5.0251256281407036e-08, "logits/chosen": 0.22027336061000824, "logits/rejected": 0.3840646743774414, "logps/chosen": -209.14871215820312, "logps/rejected": -223.64410400390625, "loss": 0.6933, "rewards/accuracies": 0.5, "rewards/chosen": -0.0007058419287204742, "rewards/margins": 0.00020709568343590945, "rewards/margins_max": 0.002087921602651477, "rewards/margins_min": -0.0016737302066758275, "rewards/margins_std": 0.0026598896365612745, "rewards/rejected": -0.0009129376267082989, "step": 10 }, { "epoch": 0.01, "grad_norm": 0.380859375, "learning_rate": 1.0050251256281407e-07, "logits/chosen": 0.1058058962225914, "logits/rejected": 0.4912484288215637, "logps/chosen": -212.02420043945312, "logps/rejected": -206.0525360107422, "loss": 0.6929, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": -0.00032332129194401205, "rewards/margins": 0.0009101700270548463, "rewards/margins_max": 0.003948894329369068, "rewards/margins_min": -0.002128554042428732, "rewards/margins_std": 0.004297405481338501, "rewards/rejected": -0.0012334914645180106, "step": 20 }, { "epoch": 0.01, "grad_norm": 0.458984375, "learning_rate": 1.507537688442211e-07, "logits/chosen": 0.18870362639427185, "logits/rejected": 0.577911376953125, "logps/chosen": -234.39236450195312, "logps/rejected": -218.83242797851562, "loss": 0.6928, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.00037711235927417874, "rewards/margins": 0.0012008370831608772, "rewards/margins_max": 0.003616205183789134, "rewards/margins_min": -0.0012145310174673796, "rewards/margins_std": 0.0034158460330218077, "rewards/rejected": -0.0015779495006427169, "step": 30 }, { "epoch": 0.01, "grad_norm": 0.40234375, "learning_rate": 2.0100502512562815e-07, "logits/chosen": 0.06429781764745712, "logits/rejected": 0.31291159987449646, "logps/chosen": -229.8105926513672, "logps/rejected": -213.0727996826172, "loss": 0.6928, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.0004698133561760187, "rewards/margins": 0.0012018651468679309, "rewards/margins_max": 0.004088181536644697, "rewards/margins_min": -0.00168445089366287, "rewards/margins_std": 0.004081867169588804, "rewards/rejected": -0.0016716786194592714, "step": 40 }, { "epoch": 0.01, "grad_norm": 0.38671875, "learning_rate": 2.5125628140703517e-07, "logits/chosen": 0.2478822022676468, "logits/rejected": 0.3307963013648987, "logps/chosen": -208.3394317626953, "logps/rejected": -244.5113067626953, "loss": 0.6924, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": 0.0009044799953699112, "rewards/margins": 0.0017498359084129333, "rewards/margins_max": 0.003947221674025059, "rewards/margins_min": -0.00044754979899153113, "rewards/margins_std": 0.003107572440057993, "rewards/rejected": -0.0008453559130430222, "step": 50 }, { "epoch": 0.02, "grad_norm": 0.435546875, "learning_rate": 3.015075376884422e-07, "logits/chosen": 0.17191682755947113, "logits/rejected": 0.508013129234314, "logps/chosen": -227.90115356445312, "logps/rejected": -224.430908203125, "loss": 0.6922, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.00039130254299379885, "rewards/margins": 0.0019925818778574467, "rewards/margins_max": 0.0045981681905686855, "rewards/margins_min": -0.0006130046676844358, "rewards/margins_std": 0.0036848559975624084, "rewards/rejected": -0.0023838842753320932, "step": 60 }, { "epoch": 0.02, "grad_norm": 0.42578125, "learning_rate": 3.5175879396984927e-07, "logits/chosen": 0.17003652453422546, "logits/rejected": 0.3985624313354492, "logps/chosen": -211.16152954101562, "logps/rejected": -210.9799041748047, "loss": 0.6922, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.00022076326422393322, "rewards/margins": 0.0016923131188377738, "rewards/margins_max": 0.004520035348832607, "rewards/margins_min": -0.0011354093439877033, "rewards/margins_std": 0.003999003209173679, "rewards/rejected": -0.0019130764994770288, "step": 70 }, { "epoch": 0.02, "grad_norm": 0.427734375, "learning_rate": 4.020100502512563e-07, "logits/chosen": 0.1190398707985878, "logits/rejected": 0.36623337864875793, "logps/chosen": -212.3631591796875, "logps/rejected": -220.9187469482422, "loss": 0.6916, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 0.00043441675370559096, "rewards/margins": 0.0035046630073338747, "rewards/margins_max": 0.00651139859110117, "rewards/margins_min": 0.0004979277146048844, "rewards/margins_std": 0.004252166021615267, "rewards/rejected": -0.003070246195420623, "step": 80 }, { "epoch": 0.02, "grad_norm": 0.361328125, "learning_rate": 4.522613065326633e-07, "logits/chosen": 0.06567513197660446, "logits/rejected": 0.43274015188217163, "logps/chosen": -222.13961791992188, "logps/rejected": -201.4839630126953, "loss": 0.6913, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -2.6206960683339275e-05, "rewards/margins": 0.0037318530958145857, "rewards/margins_max": 0.00678494805470109, "rewards/margins_min": 0.0006787586025893688, "rewards/margins_std": 0.004317727871239185, "rewards/rejected": -0.0037580605130642653, "step": 90 }, { "epoch": 0.03, "grad_norm": 0.384765625, "learning_rate": 5.025125628140703e-07, "logits/chosen": 0.1317283809185028, "logits/rejected": 0.39888468384742737, "logps/chosen": -195.3096923828125, "logps/rejected": -211.8949432373047, "loss": 0.6907, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": 0.0017639435827732086, "rewards/margins": 0.00431477464735508, "rewards/margins_max": 0.008146543055772781, "rewards/margins_min": 0.00048300548223778605, "rewards/margins_std": 0.005418939981609583, "rewards/rejected": -0.0025508308317512274, "step": 100 }, { "epoch": 0.03, "grad_norm": 0.400390625, "learning_rate": 5.527638190954773e-07, "logits/chosen": 0.10737421363592148, "logits/rejected": 0.32433614134788513, "logps/chosen": -205.3096160888672, "logps/rejected": -220.96994018554688, "loss": 0.6898, "rewards/accuracies": 0.862500011920929, "rewards/chosen": 0.0030672824941575527, "rewards/margins": 0.007319621741771698, "rewards/margins_max": 0.01079073641449213, "rewards/margins_min": 0.0038485073018819094, "rewards/margins_std": 0.004908897448331118, "rewards/rejected": -0.004252338781952858, "step": 110 }, { "epoch": 0.03, "grad_norm": 0.400390625, "learning_rate": 6.030150753768844e-07, "logits/chosen": 0.15490484237670898, "logits/rejected": 0.6465431451797485, "logps/chosen": -217.82894897460938, "logps/rejected": -197.4770050048828, "loss": 0.6896, "rewards/accuracies": 0.887499988079071, "rewards/chosen": 0.0026336044538766146, "rewards/margins": 0.007411560509353876, "rewards/margins_max": 0.011540110222995281, "rewards/margins_min": 0.0032830112613737583, "rewards/margins_std": 0.005838650278747082, "rewards/rejected": -0.004777955822646618, "step": 120 }, { "epoch": 0.03, "grad_norm": 0.412109375, "learning_rate": 6.532663316582915e-07, "logits/chosen": 0.05787094682455063, "logits/rejected": 0.5067285299301147, "logps/chosen": -230.8343963623047, "logps/rejected": -220.9256591796875, "loss": 0.6881, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": 0.002234609331935644, "rewards/margins": 0.008958352729678154, "rewards/margins_max": 0.014376277104020119, "rewards/margins_min": 0.003540429752320051, "rewards/margins_std": 0.007662100251764059, "rewards/rejected": -0.006723743863403797, "step": 130 }, { "epoch": 0.04, "grad_norm": 0.330078125, "learning_rate": 7.035175879396985e-07, "logits/chosen": 0.13236010074615479, "logits/rejected": 0.47717732191085815, "logps/chosen": -219.61264038085938, "logps/rejected": -228.51260375976562, "loss": 0.6868, "rewards/accuracies": 0.887499988079071, "rewards/chosen": 0.005320298485457897, "rewards/margins": 0.013128049671649933, "rewards/margins_max": 0.020256798714399338, "rewards/margins_min": 0.005999299697577953, "rewards/margins_std": 0.010081576183438301, "rewards/rejected": -0.007807752583175898, "step": 140 }, { "epoch": 0.04, "grad_norm": 0.361328125, "learning_rate": 7.537688442211055e-07, "logits/chosen": 0.21956713497638702, "logits/rejected": 0.5885453820228577, "logps/chosen": -224.57754516601562, "logps/rejected": -218.06106567382812, "loss": 0.6853, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 0.007667540106922388, "rewards/margins": 0.01637618988752365, "rewards/margins_max": 0.022643666714429855, "rewards/margins_min": 0.010108711197972298, "rewards/margins_std": 0.008863553404808044, "rewards/rejected": -0.008708649314939976, "step": 150 }, { "epoch": 0.04, "grad_norm": 0.37890625, "learning_rate": 8.040201005025126e-07, "logits/chosen": 0.00294627551920712, "logits/rejected": 0.3304385542869568, "logps/chosen": -224.15292358398438, "logps/rejected": -223.5465087890625, "loss": 0.6845, "rewards/accuracies": 0.9375, "rewards/chosen": 0.00907582975924015, "rewards/margins": 0.018309107050299644, "rewards/margins_max": 0.025850627571344376, "rewards/margins_min": 0.010767589323222637, "rewards/margins_std": 0.010665318928658962, "rewards/rejected": -0.009233278222382069, "step": 160 }, { "epoch": 0.04, "grad_norm": 0.439453125, "learning_rate": 8.542713567839196e-07, "logits/chosen": 0.1823168247938156, "logits/rejected": 0.43500009179115295, "logps/chosen": -210.53060913085938, "logps/rejected": -216.46182250976562, "loss": 0.6835, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 0.009239943698048592, "rewards/margins": 0.01894932985305786, "rewards/margins_max": 0.027668584138154984, "rewards/margins_min": 0.01023007184267044, "rewards/margins_std": 0.012330890633165836, "rewards/rejected": -0.009709383361041546, "step": 170 }, { "epoch": 0.05, "grad_norm": 0.353515625, "learning_rate": 9.045226130653266e-07, "logits/chosen": 0.12103636562824249, "logits/rejected": 0.3777307868003845, "logps/chosen": -195.93931579589844, "logps/rejected": -200.99417114257812, "loss": 0.6822, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 0.010400387458503246, "rewards/margins": 0.02166915312409401, "rewards/margins_max": 0.03126353397965431, "rewards/margins_min": 0.012074774131178856, "rewards/margins_std": 0.013568502850830555, "rewards/rejected": -0.011268765665590763, "step": 180 }, { "epoch": 0.05, "grad_norm": 0.35546875, "learning_rate": 9.547738693467337e-07, "logits/chosen": 0.07193199545145035, "logits/rejected": 0.3750324845314026, "logps/chosen": -228.74118041992188, "logps/rejected": -230.8755340576172, "loss": 0.6797, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 0.008367463946342468, "rewards/margins": 0.028456291183829308, "rewards/margins_max": 0.038965143263339996, "rewards/margins_min": 0.017947440966963768, "rewards/margins_std": 0.014861756935715675, "rewards/rejected": -0.02008882723748684, "step": 190 }, { "epoch": 0.05, "grad_norm": 0.390625, "learning_rate": 1.0050251256281407e-06, "logits/chosen": 0.02257654443383217, "logits/rejected": 0.5656744241714478, "logps/chosen": -222.1704559326172, "logps/rejected": -204.72787475585938, "loss": 0.6782, "rewards/accuracies": 0.9375, "rewards/chosen": 0.011104286648333073, "rewards/margins": 0.031201040372252464, "rewards/margins_max": 0.044583261013031006, "rewards/margins_min": 0.017818817868828773, "rewards/margins_std": 0.018925320357084274, "rewards/rejected": -0.020096752792596817, "step": 200 }, { "epoch": 0.05, "grad_norm": 0.388671875, "learning_rate": 1.0552763819095476e-06, "logits/chosen": 0.21097414195537567, "logits/rejected": 0.4384271204471588, "logps/chosen": -186.71658325195312, "logps/rejected": -218.23806762695312, "loss": 0.6771, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 0.011779891327023506, "rewards/margins": 0.0341356061398983, "rewards/margins_max": 0.04913010075688362, "rewards/margins_min": 0.019141118973493576, "rewards/margins_std": 0.021205410361289978, "rewards/rejected": -0.022355718538165092, "step": 210 }, { "epoch": 0.06, "grad_norm": 0.3984375, "learning_rate": 1.1055276381909546e-06, "logits/chosen": 0.12355975806713104, "logits/rejected": 0.5098804235458374, "logps/chosen": -224.91552734375, "logps/rejected": -234.9082489013672, "loss": 0.6752, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 0.015334665775299072, "rewards/margins": 0.03818144276738167, "rewards/margins_max": 0.05416835471987724, "rewards/margins_min": 0.022194528952240944, "rewards/margins_std": 0.022608909755945206, "rewards/rejected": -0.022846775129437447, "step": 220 }, { "epoch": 0.06, "grad_norm": 0.439453125, "learning_rate": 1.1557788944723616e-06, "logits/chosen": 0.19827620685100555, "logits/rejected": 0.44844430685043335, "logps/chosen": -176.3722381591797, "logps/rejected": -183.7699432373047, "loss": 0.6742, "rewards/accuracies": 1.0, "rewards/chosen": 0.011357043869793415, "rewards/margins": 0.037941962480545044, "rewards/margins_max": 0.053594231605529785, "rewards/margins_min": 0.022289691492915154, "rewards/margins_std": 0.022135648876428604, "rewards/rejected": -0.026584917679429054, "step": 230 }, { "epoch": 0.06, "grad_norm": 0.390625, "learning_rate": 1.2060301507537688e-06, "logits/chosen": 0.13188159465789795, "logits/rejected": 0.5466545820236206, "logps/chosen": -225.99484252929688, "logps/rejected": -218.8096160888672, "loss": 0.67, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 0.013194072060286999, "rewards/margins": 0.044676605612039566, "rewards/margins_max": 0.05990206450223923, "rewards/margins_min": 0.02945113554596901, "rewards/margins_std": 0.02153206057846546, "rewards/rejected": -0.031482525169849396, "step": 240 }, { "epoch": 0.06, "grad_norm": 0.404296875, "learning_rate": 1.256281407035176e-06, "logits/chosen": 0.14512896537780762, "logits/rejected": 0.5733065605163574, "logps/chosen": -217.5274658203125, "logps/rejected": -214.5115203857422, "loss": 0.6711, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 0.011558527126908302, "rewards/margins": 0.047826338559389114, "rewards/margins_max": 0.06687624752521515, "rewards/margins_min": 0.02877642773091793, "rewards/margins_std": 0.026940640062093735, "rewards/rejected": -0.03626781329512596, "step": 250 }, { "epoch": 0.07, "grad_norm": 0.3828125, "learning_rate": 1.306532663316583e-06, "logits/chosen": 0.1416536569595337, "logits/rejected": 0.4681627154350281, "logps/chosen": -217.2357940673828, "logps/rejected": -215.43777465820312, "loss": 0.6677, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 0.01843900792300701, "rewards/margins": 0.058879125863313675, "rewards/margins_max": 0.08180561661720276, "rewards/margins_min": 0.03595263510942459, "rewards/margins_std": 0.032422952353954315, "rewards/rejected": -0.04044011980295181, "step": 260 }, { "epoch": 0.07, "grad_norm": 0.421875, "learning_rate": 1.3567839195979899e-06, "logits/chosen": 0.22732439637184143, "logits/rejected": 0.4276302456855774, "logps/chosen": -198.55441284179688, "logps/rejected": -224.4716796875, "loss": 0.6628, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.017024602741003036, "rewards/margins": 0.06379345059394836, "rewards/margins_max": 0.09190671890974045, "rewards/margins_min": 0.03568018227815628, "rewards/margins_std": 0.039758164435625076, "rewards/rejected": -0.04676884785294533, "step": 270 }, { "epoch": 0.07, "grad_norm": 0.37109375, "learning_rate": 1.407035175879397e-06, "logits/chosen": 0.03006916679441929, "logits/rejected": 0.2829376757144928, "logps/chosen": -197.93682861328125, "logps/rejected": -201.83853149414062, "loss": 0.6595, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 0.018216144293546677, "rewards/margins": 0.06814040243625641, "rewards/margins_max": 0.09602198749780655, "rewards/margins_min": 0.04025881737470627, "rewards/margins_std": 0.03943051025271416, "rewards/rejected": -0.049924250692129135, "step": 280 }, { "epoch": 0.07, "grad_norm": 0.447265625, "learning_rate": 1.457286432160804e-06, "logits/chosen": 0.03733636066317558, "logits/rejected": 0.49974188208580017, "logps/chosen": -225.4219207763672, "logps/rejected": -195.27247619628906, "loss": 0.6537, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": 0.020579595118761063, "rewards/margins": 0.0742858499288559, "rewards/margins_max": 0.108786940574646, "rewards/margins_min": 0.03978477045893669, "rewards/margins_std": 0.04879189655184746, "rewards/rejected": -0.05370625853538513, "step": 290 }, { "epoch": 0.08, "grad_norm": 0.40234375, "learning_rate": 1.507537688442211e-06, "logits/chosen": 0.13124307990074158, "logits/rejected": 0.43372398614883423, "logps/chosen": -188.13446044921875, "logps/rejected": -202.31063842773438, "loss": 0.6521, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 0.02380152978003025, "rewards/margins": 0.0889906957745552, "rewards/margins_max": 0.11839659512042999, "rewards/margins_min": 0.059584807604551315, "rewards/margins_std": 0.041586220264434814, "rewards/rejected": -0.0651891678571701, "step": 300 }, { "epoch": 0.08, "grad_norm": 0.400390625, "learning_rate": 1.5577889447236182e-06, "logits/chosen": 0.16254135966300964, "logits/rejected": 0.4572983682155609, "logps/chosen": -217.7134552001953, "logps/rejected": -235.12612915039062, "loss": 0.6464, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.02339380793273449, "rewards/margins": 0.09931546449661255, "rewards/margins_max": 0.13573993742465973, "rewards/margins_min": 0.06289096921682358, "rewards/margins_std": 0.05151200294494629, "rewards/rejected": -0.07592164725065231, "step": 310 }, { "epoch": 0.08, "grad_norm": 0.4296875, "learning_rate": 1.6080402010050252e-06, "logits/chosen": 0.21073463559150696, "logits/rejected": 0.5910454988479614, "logps/chosen": -217.4332733154297, "logps/rejected": -214.1886444091797, "loss": 0.6436, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 0.02802230790257454, "rewards/margins": 0.11192785203456879, "rewards/margins_max": 0.1627815067768097, "rewards/margins_min": 0.061074189841747284, "rewards/margins_std": 0.07191795110702515, "rewards/rejected": -0.08390556275844574, "step": 320 }, { "epoch": 0.08, "grad_norm": 0.435546875, "learning_rate": 1.6582914572864321e-06, "logits/chosen": 0.14464020729064941, "logits/rejected": 0.46793508529663086, "logps/chosen": -230.2141876220703, "logps/rejected": -231.2537078857422, "loss": 0.6431, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 0.031488679349422455, "rewards/margins": 0.1129346638917923, "rewards/margins_max": 0.15848883986473083, "rewards/margins_min": 0.06738051772117615, "rewards/margins_std": 0.06442330777645111, "rewards/rejected": -0.08144598454236984, "step": 330 }, { "epoch": 0.09, "grad_norm": 0.431640625, "learning_rate": 1.708542713567839e-06, "logits/chosen": 0.3066442608833313, "logits/rejected": 0.6389753818511963, "logps/chosen": -194.7659912109375, "logps/rejected": -201.19326782226562, "loss": 0.64, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": 0.01703776977956295, "rewards/margins": 0.09856927394866943, "rewards/margins_max": 0.13916271924972534, "rewards/margins_min": 0.057975828647613525, "rewards/margins_std": 0.05740780755877495, "rewards/rejected": -0.08153150975704193, "step": 340 }, { "epoch": 0.09, "grad_norm": 0.431640625, "learning_rate": 1.7587939698492463e-06, "logits/chosen": 0.11799661815166473, "logits/rejected": 0.49029532074928284, "logps/chosen": -191.6995086669922, "logps/rejected": -209.32369995117188, "loss": 0.6303, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 0.018647244200110435, "rewards/margins": 0.13601182401180267, "rewards/margins_max": 0.19829824566841125, "rewards/margins_min": 0.0737253949046135, "rewards/margins_std": 0.08808630704879761, "rewards/rejected": -0.11736458539962769, "step": 350 }, { "epoch": 0.09, "grad_norm": 0.390625, "learning_rate": 1.8090452261306533e-06, "logits/chosen": 0.13119210302829742, "logits/rejected": 0.2840971350669861, "logps/chosen": -199.85696411132812, "logps/rejected": -258.749755859375, "loss": 0.6292, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 0.012986330315470695, "rewards/margins": 0.14114083349704742, "rewards/margins_max": 0.19774450361728668, "rewards/margins_min": 0.08453711867332458, "rewards/margins_std": 0.08004971593618393, "rewards/rejected": -0.12815448641777039, "step": 360 }, { "epoch": 0.09, "grad_norm": 0.40625, "learning_rate": 1.8592964824120602e-06, "logits/chosen": 0.2754780650138855, "logits/rejected": 0.5169572830200195, "logps/chosen": -207.1035919189453, "logps/rejected": -230.28738403320312, "loss": 0.6258, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 0.014916675165295601, "rewards/margins": 0.16153986752033234, "rewards/margins_max": 0.23447349667549133, "rewards/margins_min": 0.08860625326633453, "rewards/margins_std": 0.10314369201660156, "rewards/rejected": -0.14662319421768188, "step": 370 }, { "epoch": 0.1, "grad_norm": 0.498046875, "learning_rate": 1.9095477386934674e-06, "logits/chosen": 0.062197744846343994, "logits/rejected": 0.3439430892467499, "logps/chosen": -222.99612426757812, "logps/rejected": -234.5655517578125, "loss": 0.62, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": -0.0016535300528630614, "rewards/margins": 0.15255677700042725, "rewards/margins_max": 0.23381371796131134, "rewards/margins_min": 0.07129983603954315, "rewards/margins_std": 0.1149146556854248, "rewards/rejected": -0.15421029925346375, "step": 380 }, { "epoch": 0.1, "grad_norm": 0.51171875, "learning_rate": 1.959798994974874e-06, "logits/chosen": 0.3776804804801941, "logits/rejected": 0.6220484972000122, "logps/chosen": -225.0041961669922, "logps/rejected": -237.2688446044922, "loss": 0.6203, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -0.009254536591470242, "rewards/margins": 0.14196929335594177, "rewards/margins_max": 0.20804783701896667, "rewards/margins_min": 0.07589074224233627, "rewards/margins_std": 0.0934491753578186, "rewards/rejected": -0.151223823428154, "step": 390 }, { "epoch": 0.1, "grad_norm": 0.515625, "learning_rate": 1.9999984564005714e-06, "logits/chosen": 0.17335475981235504, "logits/rejected": 0.6286818385124207, "logps/chosen": -251.3433380126953, "logps/rejected": -244.0115509033203, "loss": 0.6029, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -0.004750807769596577, "rewards/margins": 0.19618940353393555, "rewards/margins_max": 0.28234678506851196, "rewards/margins_min": 0.11003203690052032, "rewards/margins_std": 0.12184491008520126, "rewards/rejected": -0.20094020664691925, "step": 400 }, { "epoch": 0.1, "grad_norm": 0.5078125, "learning_rate": 1.999944430920943e-06, "logits/chosen": 0.2944129705429077, "logits/rejected": 0.6106816530227661, "logps/chosen": -209.9373321533203, "logps/rejected": -256.11431884765625, "loss": 0.5943, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -0.014288604259490967, "rewards/margins": 0.23369868099689484, "rewards/margins_max": 0.3245174288749695, "rewards/margins_min": 0.1428799331188202, "rewards/margins_std": 0.1284371018409729, "rewards/rejected": -0.2479872703552246, "step": 410 }, { "epoch": 0.11, "grad_norm": 0.439453125, "learning_rate": 1.9998132302352276e-06, "logits/chosen": 0.10406245291233063, "logits/rejected": 0.4271799921989441, "logps/chosen": -219.8295135498047, "logps/rejected": -235.6389617919922, "loss": 0.5968, "rewards/accuracies": 1.0, "rewards/chosen": -0.027161872014403343, "rewards/margins": 0.20511429011821747, "rewards/margins_max": 0.27854466438293457, "rewards/margins_min": 0.13168397545814514, "rewards/margins_std": 0.10384617000818253, "rewards/rejected": -0.23227617144584656, "step": 420 }, { "epoch": 0.11, "grad_norm": 0.48046875, "learning_rate": 1.999604864469428e-06, "logits/chosen": 0.22821100056171417, "logits/rejected": 0.5613245964050293, "logps/chosen": -220.06796264648438, "logps/rejected": -239.36196899414062, "loss": 0.5837, "rewards/accuracies": 0.9375, "rewards/chosen": -0.05030583217740059, "rewards/margins": 0.23767979443073273, "rewards/margins_max": 0.35419517755508423, "rewards/margins_min": 0.12116440385580063, "rewards/margins_std": 0.16477763652801514, "rewards/rejected": -0.287985622882843, "step": 430 }, { "epoch": 0.11, "grad_norm": 0.423828125, "learning_rate": 1.999319349705108e-06, "logits/chosen": 0.2373732626438141, "logits/rejected": 0.5678123831748962, "logps/chosen": -253.2532196044922, "logps/rejected": -260.73516845703125, "loss": 0.5869, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -0.06434480845928192, "rewards/margins": 0.2248760461807251, "rewards/margins_max": 0.3302004635334015, "rewards/margins_min": 0.1195516362786293, "rewards/margins_std": 0.1489512026309967, "rewards/rejected": -0.2892208695411682, "step": 440 }, { "epoch": 0.11, "grad_norm": 0.423828125, "learning_rate": 1.9989567079781537e-06, "logits/chosen": 0.2335653007030487, "logits/rejected": 0.5320082902908325, "logps/chosen": -208.51205444335938, "logps/rejected": -247.15762329101562, "loss": 0.5566, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -0.060688622295856476, "rewards/margins": 0.31464827060699463, "rewards/margins_max": 0.43832287192344666, "rewards/margins_min": 0.1909736841917038, "rewards/margins_std": 0.1749022752046585, "rewards/rejected": -0.3753369152545929, "step": 450 }, { "epoch": 0.12, "grad_norm": 0.462890625, "learning_rate": 1.9985169672770702e-06, "logits/chosen": -0.06091824918985367, "logits/rejected": 0.25912588834762573, "logps/chosen": -213.4940185546875, "logps/rejected": -248.5473175048828, "loss": 0.5665, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -0.10004855692386627, "rewards/margins": 0.2793845236301422, "rewards/margins_max": 0.3991774916648865, "rewards/margins_min": 0.15959155559539795, "rewards/margins_std": 0.16941285133361816, "rewards/rejected": -0.3794330954551697, "step": 460 }, { "epoch": 0.12, "grad_norm": 0.5078125, "learning_rate": 1.9980001615408227e-06, "logits/chosen": 0.12755416333675385, "logits/rejected": 0.4592605233192444, "logps/chosen": -226.99948120117188, "logps/rejected": -252.36849975585938, "loss": 0.5626, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -0.12666812539100647, "rewards/margins": 0.2744174599647522, "rewards/margins_max": 0.4089486598968506, "rewards/margins_min": 0.1398862898349762, "rewards/margins_std": 0.19025583565235138, "rewards/rejected": -0.4010855555534363, "step": 470 }, { "epoch": 0.12, "grad_norm": 0.58984375, "learning_rate": 1.9974063306562163e-06, "logits/chosen": 0.04675767198204994, "logits/rejected": 0.2735728919506073, "logps/chosen": -219.89529418945312, "logps/rejected": -272.9166564941406, "loss": 0.5521, "rewards/accuracies": 0.9375, "rewards/chosen": -0.1341877430677414, "rewards/margins": 0.32467249035835266, "rewards/margins_max": 0.4808884263038635, "rewards/margins_min": 0.1684565544128418, "rewards/margins_std": 0.22092270851135254, "rewards/rejected": -0.45886021852493286, "step": 480 }, { "epoch": 0.12, "grad_norm": 0.54296875, "learning_rate": 1.99673552045482e-06, "logits/chosen": 0.021597793325781822, "logits/rejected": 0.5157625675201416, "logps/chosen": -227.2969512939453, "logps/rejected": -255.79684448242188, "loss": 0.5397, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -0.13789792358875275, "rewards/margins": 0.34923315048217773, "rewards/margins_max": 0.5173725485801697, "rewards/margins_min": 0.18109369277954102, "rewards/margins_std": 0.23778510093688965, "rewards/rejected": -0.4871310293674469, "step": 490 }, { "epoch": 0.13, "grad_norm": 0.625, "learning_rate": 1.995987782709425e-06, "logits/chosen": 0.35428065061569214, "logits/rejected": 0.7805494070053101, "logps/chosen": -254.2764129638672, "logps/rejected": -262.0751953125, "loss": 0.5326, "rewards/accuracies": 0.9375, "rewards/chosen": -0.14267601072788239, "rewards/margins": 0.37824535369873047, "rewards/margins_max": 0.5719924569129944, "rewards/margins_min": 0.18449831008911133, "rewards/margins_std": 0.2739996910095215, "rewards/rejected": -0.5209213495254517, "step": 500 }, { "epoch": 0.13, "grad_norm": 0.47265625, "learning_rate": 1.995163175130053e-06, "logits/chosen": 0.13442710041999817, "logits/rejected": 0.5977517366409302, "logps/chosen": -269.36590576171875, "logps/rejected": -279.8051452636719, "loss": 0.5318, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -0.20593896508216858, "rewards/margins": 0.36720719933509827, "rewards/margins_max": 0.5583890676498413, "rewards/margins_min": 0.1760253608226776, "rewards/margins_std": 0.2703719735145569, "rewards/rejected": -0.5731461644172668, "step": 510 }, { "epoch": 0.13, "grad_norm": 0.5546875, "learning_rate": 1.994261761359501e-06, "logits/chosen": 0.10652659833431244, "logits/rejected": 0.6973064541816711, "logps/chosen": -262.9113464355469, "logps/rejected": -267.61590576171875, "loss": 0.5143, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -0.19917282462120056, "rewards/margins": 0.3482286036014557, "rewards/margins_max": 0.5081428289413452, "rewards/margins_min": 0.18831434845924377, "rewards/margins_std": 0.2261529266834259, "rewards/rejected": -0.5474014282226562, "step": 520 }, { "epoch": 0.13, "grad_norm": 0.5625, "learning_rate": 1.9932836109684285e-06, "logits/chosen": 0.023062556982040405, "logits/rejected": 0.35867422819137573, "logps/chosen": -217.8906707763672, "logps/rejected": -279.52618408203125, "loss": 0.5197, "rewards/accuracies": 0.887499988079071, "rewards/chosen": -0.2201465368270874, "rewards/margins": 0.4889785647392273, "rewards/margins_max": 0.7714502215385437, "rewards/margins_min": 0.20650680363178253, "rewards/margins_std": 0.3994753360748291, "rewards/rejected": -0.7091250419616699, "step": 530 }, { "epoch": 0.14, "grad_norm": 0.5390625, "learning_rate": 1.9922287994499877e-06, "logits/chosen": 0.2635014057159424, "logits/rejected": 0.6844016313552856, "logps/chosen": -242.1995849609375, "logps/rejected": -261.4162902832031, "loss": 0.5346, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -0.2828103303909302, "rewards/margins": 0.3755984306335449, "rewards/margins_max": 0.5684477686882019, "rewards/margins_min": 0.18274910748004913, "rewards/margins_std": 0.27273014187812805, "rewards/rejected": -0.6584087610244751, "step": 540 }, { "epoch": 0.14, "grad_norm": 0.62109375, "learning_rate": 1.991097408214e-06, "logits/chosen": 0.07120836526155472, "logits/rejected": 0.4711441099643707, "logps/chosen": -283.8448791503906, "logps/rejected": -323.3854064941406, "loss": 0.4654, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -0.2944994866847992, "rewards/margins": 0.6885162591934204, "rewards/margins_max": 0.9488954544067383, "rewards/margins_min": 0.4281369745731354, "rewards/margins_std": 0.3682318329811096, "rewards/rejected": -0.9830157160758972, "step": 550 }, { "epoch": 0.14, "grad_norm": 0.578125, "learning_rate": 1.989889524580669e-06, "logits/chosen": 0.2516458034515381, "logits/rejected": 0.6511009335517883, "logps/chosen": -238.0609588623047, "logps/rejected": -280.14019775390625, "loss": 0.486, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -0.3175020217895508, "rewards/margins": 0.5095471739768982, "rewards/margins_max": 0.7277418375015259, "rewards/margins_min": 0.29135242104530334, "rewards/margins_std": 0.30857396125793457, "rewards/rejected": -0.8270492553710938, "step": 560 }, { "epoch": 0.14, "grad_norm": 0.5546875, "learning_rate": 1.988605241773843e-06, "logits/chosen": 0.23482546210289001, "logits/rejected": 0.39776262640953064, "logps/chosen": -211.8990020751953, "logps/rejected": -277.9550476074219, "loss": 0.4832, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -0.3305775821208954, "rewards/margins": 0.5754931569099426, "rewards/margins_max": 0.837389349937439, "rewards/margins_min": 0.31359678506851196, "rewards/margins_std": 0.3703773319721222, "rewards/rejected": -0.9060707092285156, "step": 570 }, { "epoch": 0.15, "grad_norm": 0.84375, "learning_rate": 1.987244658913821e-06, "logits/chosen": 0.2136719673871994, "logits/rejected": 0.5631103515625, "logps/chosen": -263.46173095703125, "logps/rejected": -335.8443908691406, "loss": 0.4707, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": -0.3896404504776001, "rewards/margins": 0.6542876958847046, "rewards/margins_max": 1.0539064407348633, "rewards/margins_min": 0.2546689510345459, "rewards/margins_std": 0.5651463270187378, "rewards/rejected": -1.0439281463623047, "step": 580 }, { "epoch": 0.15, "grad_norm": 0.609375, "learning_rate": 1.9858078810097e-06, "logits/chosen": 0.2974611520767212, "logits/rejected": 0.5850492715835571, "logps/chosen": -250.642578125, "logps/rejected": -302.98162841796875, "loss": 0.4955, "rewards/accuracies": 0.9375, "rewards/chosen": -0.4073031544685364, "rewards/margins": 0.4717481732368469, "rewards/margins_max": 0.761227548122406, "rewards/margins_min": 0.18226870894432068, "rewards/margins_std": 0.40938568115234375, "rewards/rejected": -0.8790512084960938, "step": 590 }, { "epoch": 0.15, "grad_norm": 0.66796875, "learning_rate": 1.984295018951274e-06, "logits/chosen": 0.09430913627147675, "logits/rejected": 0.49069744348526, "logps/chosen": -251.55856323242188, "logps/rejected": -317.350341796875, "loss": 0.4458, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -0.41190361976623535, "rewards/margins": 0.6539624333381653, "rewards/margins_max": 0.9886572957038879, "rewards/margins_min": 0.3192675709724426, "rewards/margins_std": 0.47332993149757385, "rewards/rejected": -1.0658659934997559, "step": 600 }, { "epoch": 0.15, "grad_norm": 0.59765625, "learning_rate": 1.9827061895004715e-06, "logits/chosen": 0.17028877139091492, "logits/rejected": 0.4926506578922272, "logps/chosen": -252.2837371826172, "logps/rejected": -298.1224670410156, "loss": 0.4782, "rewards/accuracies": 0.9375, "rewards/chosen": -0.438131719827652, "rewards/margins": 0.5629655122756958, "rewards/margins_max": 0.9025141596794128, "rewards/margins_min": 0.22341683506965637, "rewards/margins_std": 0.4801942706108093, "rewards/rejected": -1.0010972023010254, "step": 610 }, { "epoch": 0.16, "grad_norm": 0.60546875, "learning_rate": 1.9810415152823475e-06, "logits/chosen": 0.10140929371118546, "logits/rejected": 0.21094012260437012, "logps/chosen": -253.6886749267578, "logps/rejected": -349.69720458984375, "loss": 0.4399, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -0.5137056112289429, "rewards/margins": 0.7384149432182312, "rewards/margins_max": 1.147780179977417, "rewards/margins_min": 0.3290497958660126, "rewards/margins_std": 0.5789297819137573, "rewards/rejected": -1.2521207332611084, "step": 620 }, { "epoch": 0.16, "grad_norm": 0.498046875, "learning_rate": 1.979301124775617e-06, "logits/chosen": 0.21277904510498047, "logits/rejected": 0.5488343834877014, "logps/chosen": -275.15899658203125, "logps/rejected": -342.88812255859375, "loss": 0.4532, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -0.4999760687351227, "rewards/margins": 0.7143586874008179, "rewards/margins_max": 1.0617420673370361, "rewards/margins_min": 0.3669753670692444, "rewards/margins_std": 0.4912742078304291, "rewards/rejected": -1.2143347263336182, "step": 630 }, { "epoch": 0.16, "grad_norm": 0.6171875, "learning_rate": 1.977485152302741e-06, "logits/chosen": 0.20225989818572998, "logits/rejected": 0.380338191986084, "logps/chosen": -240.4453887939453, "logps/rejected": -322.8797912597656, "loss": 0.4514, "rewards/accuracies": 0.9375, "rewards/chosen": -0.5157987475395203, "rewards/margins": 0.7272204756736755, "rewards/margins_max": 1.0614551305770874, "rewards/margins_min": 0.39298567175865173, "rewards/margins_std": 0.47267937660217285, "rewards/rejected": -1.2430192232131958, "step": 640 }, { "epoch": 0.16, "grad_norm": 0.55859375, "learning_rate": 1.9755937380195564e-06, "logits/chosen": -0.05190020799636841, "logits/rejected": 0.5600059628486633, "logps/chosen": -293.57666015625, "logps/rejected": -305.3736267089844, "loss": 0.4481, "rewards/accuracies": 0.887499988079071, "rewards/chosen": -0.5161153674125671, "rewards/margins": 0.6342784762382507, "rewards/margins_max": 1.0201352834701538, "rewards/margins_min": 0.2484218180179596, "rewards/margins_std": 0.5456838011741638, "rewards/rejected": -1.1503938436508179, "step": 650 }, { "epoch": 0.17, "grad_norm": 0.703125, "learning_rate": 1.9736270279044634e-06, "logits/chosen": 0.014571094885468483, "logits/rejected": 0.4248642027378082, "logps/chosen": -266.79010009765625, "logps/rejected": -353.534912109375, "loss": 0.4127, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -0.5665841102600098, "rewards/margins": 0.798658013343811, "rewards/margins_max": 1.0735851526260376, "rewards/margins_min": 0.5237309336662292, "rewards/margins_std": 0.3888055682182312, "rewards/rejected": -1.3652422428131104, "step": 660 }, { "epoch": 0.17, "grad_norm": 0.58984375, "learning_rate": 1.9715851737471544e-06, "logits/chosen": 0.051493000239133835, "logits/rejected": 0.347175657749176, "logps/chosen": -256.3532409667969, "logps/rejected": -362.8037414550781, "loss": 0.4129, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -0.5985296368598938, "rewards/margins": 0.8330098986625671, "rewards/margins_max": 1.1987718343734741, "rewards/margins_min": 0.467247873544693, "rewards/margins_std": 0.5172656178474426, "rewards/rejected": -1.4315392971038818, "step": 670 }, { "epoch": 0.17, "grad_norm": 0.640625, "learning_rate": 1.969468333136902e-06, "logits/chosen": 0.10662545263767242, "logits/rejected": 0.5305906534194946, "logps/chosen": -277.83624267578125, "logps/rejected": -321.1910095214844, "loss": 0.4247, "rewards/accuracies": 0.862500011920929, "rewards/chosen": -0.6851547956466675, "rewards/margins": 0.7354794144630432, "rewards/margins_max": 1.1970973014831543, "rewards/margins_min": 0.2738614082336426, "rewards/margins_std": 0.6528264284133911, "rewards/rejected": -1.4206342697143555, "step": 680 }, { "epoch": 0.17, "grad_norm": 0.66015625, "learning_rate": 1.9672766694503955e-06, "logits/chosen": 0.130225270986557, "logits/rejected": 0.47270625829696655, "logps/chosen": -272.90985107421875, "logps/rejected": -354.52911376953125, "loss": 0.3923, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": -0.6151808500289917, "rewards/margins": 0.8642423748970032, "rewards/margins_max": 1.2572708129882812, "rewards/margins_min": 0.4712139964103699, "rewards/margins_std": 0.5558260679244995, "rewards/rejected": -1.4794232845306396, "step": 690 }, { "epoch": 0.18, "grad_norm": 0.640625, "learning_rate": 1.9650103518391316e-06, "logits/chosen": -0.07168503105640411, "logits/rejected": 0.35873326659202576, "logps/chosen": -279.2594909667969, "logps/rejected": -358.3661193847656, "loss": 0.3894, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -0.6471285820007324, "rewards/margins": 0.8690595626831055, "rewards/margins_max": 1.3681957721710205, "rewards/margins_min": 0.36992329359054565, "rewards/margins_std": 0.7058852910995483, "rewards/rejected": -1.5161882638931274, "step": 700 }, { "epoch": 0.18, "grad_norm": 0.59375, "learning_rate": 1.9626695552163577e-06, "logits/chosen": 0.1328928917646408, "logits/rejected": 0.5320017337799072, "logps/chosen": -294.1138610839844, "logps/rejected": -380.0039367675781, "loss": 0.407, "rewards/accuracies": 0.875, "rewards/chosen": -0.8441578149795532, "rewards/margins": 0.9135202169418335, "rewards/margins_max": 1.4895973205566406, "rewards/margins_min": 0.33744320273399353, "rewards/margins_std": 0.8146958351135254, "rewards/rejected": -1.7576780319213867, "step": 710 }, { "epoch": 0.18, "grad_norm": 1.015625, "learning_rate": 1.9602544602435754e-06, "logits/chosen": 0.0703146755695343, "logits/rejected": 0.5812051892280579, "logps/chosen": -351.3577575683594, "logps/rejected": -401.6908874511719, "loss": 0.4347, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -0.7561784386634827, "rewards/margins": 1.0206066370010376, "rewards/margins_max": 1.6420679092407227, "rewards/margins_min": 0.3991455137729645, "rewards/margins_std": 0.8788787722587585, "rewards/rejected": -1.776785135269165, "step": 720 }, { "epoch": 0.18, "grad_norm": 0.76953125, "learning_rate": 1.957765253316595e-06, "logits/chosen": -0.03158079460263252, "logits/rejected": 0.36648237705230713, "logps/chosen": -288.8924865722656, "logps/rejected": -408.21453857421875, "loss": 0.3707, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -0.7478488683700562, "rewards/margins": 1.2099758386611938, "rewards/margins_max": 1.8198902606964111, "rewards/margins_min": 0.6000615358352661, "rewards/margins_std": 0.8625491857528687, "rewards/rejected": -1.95782470703125, "step": 730 }, { "epoch": 0.19, "grad_norm": 0.609375, "learning_rate": 1.955202126551149e-06, "logits/chosen": 0.01123755145817995, "logits/rejected": 0.3031242787837982, "logps/chosen": -283.08087158203125, "logps/rejected": -442.47216796875, "loss": 0.3474, "rewards/accuracies": 0.9375, "rewards/chosen": -0.7709259986877441, "rewards/margins": 1.405474305152893, "rewards/margins_max": 2.1374449729919434, "rewards/margins_min": 0.6735036969184875, "rewards/margins_std": 1.0351628065109253, "rewards/rejected": -2.1764004230499268, "step": 740 }, { "epoch": 0.19, "grad_norm": 0.69921875, "learning_rate": 1.9525652777680673e-06, "logits/chosen": 0.17332817614078522, "logits/rejected": 0.511985182762146, "logps/chosen": -313.7992248535156, "logps/rejected": -420.5000915527344, "loss": 0.4075, "rewards/accuracies": 0.887499988079071, "rewards/chosen": -0.8556423187255859, "rewards/margins": 1.1546481847763062, "rewards/margins_max": 1.9178001880645752, "rewards/margins_min": 0.3914966285228729, "rewards/margins_std": 1.0792595148086548, "rewards/rejected": -2.0102906227111816, "step": 750 }, { "epoch": 0.19, "grad_norm": 0.58984375, "learning_rate": 1.949854910478007e-06, "logits/chosen": 0.16492195427417755, "logits/rejected": 0.468805730342865, "logps/chosen": -279.9993591308594, "logps/rejected": -441.51409912109375, "loss": 0.3282, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -0.8717137575149536, "rewards/margins": 1.3921509981155396, "rewards/margins_max": 2.1553454399108887, "rewards/margins_min": 0.6289564967155457, "rewards/margins_std": 1.079319953918457, "rewards/rejected": -2.263864517211914, "step": 760 }, { "epoch": 0.19, "grad_norm": 0.546875, "learning_rate": 1.9470712338657457e-06, "logits/chosen": -0.0090141287073493, "logits/rejected": 0.4108152985572815, "logps/chosen": -303.95452880859375, "logps/rejected": -422.049072265625, "loss": 0.3649, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": -0.8306927680969238, "rewards/margins": 1.1475027799606323, "rewards/margins_max": 1.8583225011825562, "rewards/margins_min": 0.4366832375526428, "rewards/margins_std": 1.0052506923675537, "rewards/rejected": -1.9781955480575562, "step": 770 }, { "epoch": 0.2, "grad_norm": 0.8515625, "learning_rate": 1.9442144627740387e-06, "logits/chosen": 0.2017272412776947, "logits/rejected": 0.3989468812942505, "logps/chosen": -296.7005310058594, "logps/rejected": -446.35650634765625, "loss": 0.3255, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -0.8615614175796509, "rewards/margins": 1.4004117250442505, "rewards/margins_max": 2.0196797847747803, "rewards/margins_min": 0.7811434864997864, "rewards/margins_std": 0.8757774233818054, "rewards/rejected": -2.2619731426239014, "step": 780 }, { "epoch": 0.2, "grad_norm": 0.54296875, "learning_rate": 1.9412848176870363e-06, "logits/chosen": 0.06361217796802521, "logits/rejected": 0.45090895891189575, "logps/chosen": -299.48175048828125, "logps/rejected": -419.0980529785156, "loss": 0.3474, "rewards/accuracies": 0.887499988079071, "rewards/chosen": -0.9132553339004517, "rewards/margins": 1.1059119701385498, "rewards/margins_max": 1.6222988367080688, "rewards/margins_min": 0.5895251035690308, "rewards/margins_std": 0.7302813529968262, "rewards/rejected": -2.019167423248291, "step": 790 }, { "epoch": 0.2, "grad_norm": 0.63671875, "learning_rate": 1.938282524713266e-06, "logits/chosen": 0.14790871739387512, "logits/rejected": 0.6091148257255554, "logps/chosen": -302.5926818847656, "logps/rejected": -411.4151916503906, "loss": 0.3637, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -0.8289756774902344, "rewards/margins": 1.2672948837280273, "rewards/margins_max": 1.9323132038116455, "rewards/margins_min": 0.6022766828536987, "rewards/margins_std": 0.9404776692390442, "rewards/rejected": -2.0962705612182617, "step": 800 }, { "epoch": 0.2, "grad_norm": 0.83203125, "learning_rate": 1.935207815568183e-06, "logits/chosen": 0.12243340164422989, "logits/rejected": 0.3599459230899811, "logps/chosen": -336.83197021484375, "logps/rejected": -477.29736328125, "loss": 0.3501, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": -1.0968286991119385, "rewards/margins": 1.5700525045394897, "rewards/margins_max": 2.5577869415283203, "rewards/margins_min": 0.582318127155304, "rewards/margins_std": 1.3968675136566162, "rewards/rejected": -2.6668813228607178, "step": 810 }, { "epoch": 0.21, "grad_norm": 0.6640625, "learning_rate": 1.9320609275562863e-06, "logits/chosen": -0.0032353117130696774, "logits/rejected": 0.4075491428375244, "logps/chosen": -307.46099853515625, "logps/rejected": -445.69635009765625, "loss": 0.3292, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -1.015716791152954, "rewards/margins": 1.3720118999481201, "rewards/margins_max": 2.1683993339538574, "rewards/margins_min": 0.575624406337738, "rewards/margins_std": 1.1262620687484741, "rewards/rejected": -2.387728691101074, "step": 820 }, { "epoch": 0.21, "grad_norm": 1.421875, "learning_rate": 1.9288421035528025e-06, "logits/chosen": 0.007567564491182566, "logits/rejected": 0.45127448439598083, "logps/chosen": -362.3955078125, "logps/rejected": -487.38250732421875, "loss": 0.371, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": -1.1685550212860107, "rewards/margins": 1.307808518409729, "rewards/margins_max": 2.092421531677246, "rewards/margins_min": 0.5231954455375671, "rewards/margins_std": 1.1096104383468628, "rewards/rejected": -2.47636342048645, "step": 830 }, { "epoch": 0.21, "grad_norm": 0.734375, "learning_rate": 1.925551591984943e-06, "logits/chosen": 0.11853794753551483, "logits/rejected": 0.392129123210907, "logps/chosen": -341.79779052734375, "logps/rejected": -501.59930419921875, "loss": 0.3212, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -1.0101326704025269, "rewards/margins": 1.5283323526382446, "rewards/margins_max": 2.2559328079223633, "rewards/margins_min": 0.8007319569587708, "rewards/margins_std": 1.028982400894165, "rewards/rejected": -2.5384650230407715, "step": 840 }, { "epoch": 0.21, "grad_norm": 0.83984375, "learning_rate": 1.9221896468127285e-06, "logits/chosen": 0.03412569314241409, "logits/rejected": 0.4624078869819641, "logps/chosen": -316.32684326171875, "logps/rejected": -468.22705078125, "loss": 0.334, "rewards/accuracies": 0.9375, "rewards/chosen": -1.012904405593872, "rewards/margins": 1.5719475746154785, "rewards/margins_max": 2.157794713973999, "rewards/margins_min": 0.9861001968383789, "rewards/margins_std": 0.8285131454467773, "rewards/rejected": -2.5848519802093506, "step": 850 }, { "epoch": 0.22, "grad_norm": 0.7734375, "learning_rate": 1.918756527509389e-06, "logits/chosen": -0.004495727829635143, "logits/rejected": 0.5306761860847473, "logps/chosen": -349.19427490234375, "logps/rejected": -435.3349609375, "loss": 0.329, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -1.1305290460586548, "rewards/margins": 1.2037010192871094, "rewards/margins_max": 1.8855326175689697, "rewards/margins_min": 0.5218694806098938, "rewards/margins_std": 0.9642555117607117, "rewards/rejected": -2.3342299461364746, "step": 860 }, { "epoch": 0.22, "grad_norm": 0.84375, "learning_rate": 1.9152524990413376e-06, "logits/chosen": 0.07604047656059265, "logits/rejected": 0.3435381054878235, "logps/chosen": -312.3734436035156, "logps/rejected": -463.84552001953125, "loss": 0.3341, "rewards/accuracies": 0.9375, "rewards/chosen": -1.0629851818084717, "rewards/margins": 1.5512994527816772, "rewards/margins_max": 2.33793306350708, "rewards/margins_min": 0.7646657824516296, "rewards/margins_std": 1.1124681234359741, "rewards/rejected": -2.6142849922180176, "step": 870 }, { "epoch": 0.22, "grad_norm": 1.1328125, "learning_rate": 1.9116778318477224e-06, "logits/chosen": 0.017501067370176315, "logits/rejected": 0.3349132537841797, "logps/chosen": -367.47442626953125, "logps/rejected": -500.0546875, "loss": 0.371, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -1.3893836736679077, "rewards/margins": 1.4116103649139404, "rewards/margins_max": 2.3770699501037598, "rewards/margins_min": 0.44615092873573303, "rewards/margins_std": 1.3653658628463745, "rewards/rejected": -2.8009941577911377, "step": 880 }, { "epoch": 0.22, "grad_norm": 0.671875, "learning_rate": 1.908032801819551e-06, "logits/chosen": 0.09761302173137665, "logits/rejected": 0.6039578318595886, "logps/chosen": -371.1180725097656, "logps/rejected": -442.41339111328125, "loss": 0.3455, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -1.2109535932540894, "rewards/margins": 1.2490062713623047, "rewards/margins_max": 1.8362785577774048, "rewards/margins_min": 0.6617340445518494, "rewards/margins_std": 0.8305282592773438, "rewards/rejected": -2.4599597454071045, "step": 890 }, { "epoch": 0.23, "grad_norm": 1.2578125, "learning_rate": 1.9043176902784006e-06, "logits/chosen": 0.029796432703733444, "logits/rejected": 0.5161929726600647, "logps/chosen": -374.39520263671875, "logps/rejected": -520.0931396484375, "loss": 0.3237, "rewards/accuracies": 0.9375, "rewards/chosen": -1.319215178489685, "rewards/margins": 1.6767199039459229, "rewards/margins_max": 2.5083844661712646, "rewards/margins_min": 0.8450548052787781, "rewards/margins_std": 1.1761517524719238, "rewards/rejected": -2.9959349632263184, "step": 900 }, { "epoch": 0.23, "grad_norm": 0.734375, "learning_rate": 1.900532783954703e-06, "logits/chosen": -0.1830468475818634, "logits/rejected": 0.16268977522850037, "logps/chosen": -327.41705322265625, "logps/rejected": -516.1043090820312, "loss": 0.3022, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": -1.2103191614151, "rewards/margins": 1.746766448020935, "rewards/margins_max": 2.5591721534729004, "rewards/margins_min": 0.934360682964325, "rewards/margins_std": 1.1489155292510986, "rewards/rejected": -2.9570858478546143, "step": 910 }, { "epoch": 0.23, "grad_norm": 0.8203125, "learning_rate": 1.8966783749656162e-06, "logits/chosen": 0.15995833277702332, "logits/rejected": 0.3903830647468567, "logps/chosen": -336.82269287109375, "logps/rejected": -550.3900146484375, "loss": 0.305, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -1.3295611143112183, "rewards/margins": 1.906531572341919, "rewards/margins_max": 3.0635104179382324, "rewards/margins_min": 0.7495523691177368, "rewards/margins_std": 1.6362155675888062, "rewards/rejected": -3.2360923290252686, "step": 920 }, { "epoch": 0.23, "grad_norm": 0.9296875, "learning_rate": 1.8927547607924793e-06, "logits/chosen": 0.11276821792125702, "logits/rejected": 0.4435056149959564, "logps/chosen": -350.63641357421875, "logps/rejected": -520.3906860351562, "loss": 0.2768, "rewards/accuracies": 0.887499988079071, "rewards/chosen": -1.1965409517288208, "rewards/margins": 1.8161017894744873, "rewards/margins_max": 2.594125986099243, "rewards/margins_min": 1.0380772352218628, "rewards/margins_std": 1.1002928018569946, "rewards/rejected": -3.0126426219940186, "step": 930 }, { "epoch": 0.24, "grad_norm": 1.0078125, "learning_rate": 1.8887622442578524e-06, "logits/chosen": 0.11966486275196075, "logits/rejected": 0.5965573191642761, "logps/chosen": -324.5442810058594, "logps/rejected": -518.1434936523438, "loss": 0.3189, "rewards/accuracies": 0.887499988079071, "rewards/chosen": -1.1375452280044556, "rewards/margins": 1.926458716392517, "rewards/margins_max": 3.091047763824463, "rewards/margins_min": 0.7618700861930847, "rewards/margins_std": 1.646977186203003, "rewards/rejected": -3.0640041828155518, "step": 940 }, { "epoch": 0.24, "grad_norm": 0.6953125, "learning_rate": 1.8847011335021445e-06, "logits/chosen": 0.18524505198001862, "logits/rejected": 0.6330695152282715, "logps/chosen": -354.59686279296875, "logps/rejected": -523.9287719726562, "loss": 0.2714, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -1.3136844635009766, "rewards/margins": 1.8904693126678467, "rewards/margins_max": 2.8665313720703125, "rewards/margins_min": 0.9144073724746704, "rewards/margins_std": 1.3803602457046509, "rewards/rejected": -3.2041537761688232, "step": 950 }, { "epoch": 0.24, "grad_norm": 1.03125, "learning_rate": 1.8805717419598329e-06, "logits/chosen": 0.10084180533885956, "logits/rejected": 0.5015174746513367, "logps/chosen": -342.0030822753906, "logps/rejected": -544.0067749023438, "loss": 0.2753, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -1.1676990985870361, "rewards/margins": 2.014329195022583, "rewards/margins_max": 3.015362501144409, "rewards/margins_min": 1.0132955312728882, "rewards/margins_std": 1.415675163269043, "rewards/rejected": -3.182028293609619, "step": 960 }, { "epoch": 0.24, "grad_norm": 1.0078125, "learning_rate": 1.8763743883352707e-06, "logits/chosen": 0.1762905865907669, "logits/rejected": 0.6730665564537048, "logps/chosen": -350.9500427246094, "logps/rejected": -550.13916015625, "loss": 0.3047, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -1.2952146530151367, "rewards/margins": 2.0972981452941895, "rewards/margins_max": 3.471599578857422, "rewards/margins_min": 0.7229966521263123, "rewards/margins_std": 1.9435558319091797, "rewards/rejected": -3.392512798309326, "step": 970 }, { "epoch": 0.25, "grad_norm": 0.78125, "learning_rate": 1.8721093965780905e-06, "logits/chosen": 0.21470198035240173, "logits/rejected": 0.5289596319198608, "logps/chosen": -344.1557922363281, "logps/rejected": -550.0116577148438, "loss": 0.2839, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -1.366784930229187, "rewards/margins": 2.0145416259765625, "rewards/margins_max": 3.2122111320495605, "rewards/margins_min": 0.816872239112854, "rewards/margins_std": 1.693760633468628, "rewards/rejected": -3.3813271522521973, "step": 980 }, { "epoch": 0.25, "grad_norm": 0.7109375, "learning_rate": 1.8677770958582019e-06, "logits/chosen": 0.17914500832557678, "logits/rejected": 0.4978371262550354, "logps/chosen": -343.85107421875, "logps/rejected": -522.5852661132812, "loss": 0.296, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -1.3435138463974, "rewards/margins": 1.8682317733764648, "rewards/margins_max": 2.7188496589660645, "rewards/margins_min": 1.0176142454147339, "rewards/margins_std": 1.202954888343811, "rewards/rejected": -3.2117457389831543, "step": 990 }, { "epoch": 0.25, "grad_norm": 1.171875, "learning_rate": 1.863377820540386e-06, "logits/chosen": 0.09994121640920639, "logits/rejected": 0.48022064566612244, "logps/chosen": -373.71710205078125, "logps/rejected": -529.862548828125, "loss": 0.29, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -1.4669697284698486, "rewards/margins": 1.7658771276474, "rewards/margins_max": 2.542285680770874, "rewards/margins_min": 0.9894682765007019, "rewards/margins_std": 1.0980077981948853, "rewards/rejected": -3.232846736907959, "step": 1000 }, { "epoch": 0.25, "grad_norm": 1.0078125, "learning_rate": 1.8589119101584897e-06, "logits/chosen": 0.08443330228328705, "logits/rejected": 0.3289525806903839, "logps/chosen": -340.7434997558594, "logps/rejected": -567.8606567382812, "loss": 0.267, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -1.3939064741134644, "rewards/margins": 2.0518834590911865, "rewards/margins_max": 3.2109062671661377, "rewards/margins_min": 0.8928610682487488, "rewards/margins_std": 1.6391054391860962, "rewards/rejected": -3.4457900524139404, "step": 1010 }, { "epoch": 0.26, "grad_norm": 0.73046875, "learning_rate": 1.854379709389221e-06, "logits/chosen": -0.020468706265091896, "logits/rejected": 0.5041080713272095, "logps/chosen": -358.7152404785156, "logps/rejected": -573.8004150390625, "loss": 0.3059, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -1.3497676849365234, "rewards/margins": 2.207468032836914, "rewards/margins_max": 3.2299110889434814, "rewards/margins_min": 1.185024619102478, "rewards/margins_std": 1.445953130722046, "rewards/rejected": -3.5572357177734375, "step": 1020 }, { "epoch": 0.26, "grad_norm": 0.74609375, "learning_rate": 1.849781568025545e-06, "logits/chosen": 0.17804110050201416, "logits/rejected": 0.613066554069519, "logps/chosen": -373.60882568359375, "logps/rejected": -549.121337890625, "loss": 0.2861, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": -1.433725118637085, "rewards/margins": 2.0137925148010254, "rewards/margins_max": 3.150330066680908, "rewards/margins_min": 0.8772546648979187, "rewards/margins_std": 1.6073071956634521, "rewards/rejected": -3.4475178718566895, "step": 1030 }, { "epoch": 0.26, "grad_norm": 1.015625, "learning_rate": 1.84511784094969e-06, "logits/chosen": -0.03766552731394768, "logits/rejected": 0.4089323580265045, "logps/chosen": -367.4641418457031, "logps/rejected": -541.2486572265625, "loss": 0.2689, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -1.2735382318496704, "rewards/margins": 1.9268970489501953, "rewards/margins_max": 2.815758228302002, "rewards/margins_min": 1.0380356311798096, "rewards/margins_std": 1.2570399045944214, "rewards/rejected": -3.200435161590576, "step": 1040 }, { "epoch": 0.26, "grad_norm": 1.359375, "learning_rate": 1.8403888881057558e-06, "logits/chosen": 0.13449151813983917, "logits/rejected": 0.6226879954338074, "logps/chosen": -366.2633056640625, "logps/rejected": -512.7859497070312, "loss": 0.2869, "rewards/accuracies": 0.875, "rewards/chosen": -1.3161754608154297, "rewards/margins": 1.7274010181427002, "rewards/margins_max": 2.7379233837127686, "rewards/margins_min": 0.7168782353401184, "rewards/margins_std": 1.429094672203064, "rewards/rejected": -3.043576240539551, "step": 1050 }, { "epoch": 0.27, "grad_norm": 0.97265625, "learning_rate": 1.8355950744719345e-06, "logits/chosen": 0.23932485282421112, "logits/rejected": 0.5507219433784485, "logps/chosen": -357.52130126953125, "logps/rejected": -589.6912841796875, "loss": 0.2619, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -1.4837455749511719, "rewards/margins": 2.199106216430664, "rewards/margins_max": 3.2102742195129395, "rewards/margins_min": 1.1879384517669678, "rewards/margins_std": 1.430006980895996, "rewards/rejected": -3.682851791381836, "step": 1060 }, { "epoch": 0.27, "grad_norm": 1.1953125, "learning_rate": 1.830736770032341e-06, "logits/chosen": 0.2617644965648651, "logits/rejected": 0.5150817632675171, "logps/chosen": -347.5115661621094, "logps/rejected": -615.1715087890625, "loss": 0.2594, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -1.4821890592575073, "rewards/margins": 2.4075777530670166, "rewards/margins_max": 3.7134640216827393, "rewards/margins_min": 1.1016911268234253, "rewards/margins_std": 1.8468024730682373, "rewards/rejected": -3.8897671699523926, "step": 1070 }, { "epoch": 0.27, "grad_norm": 0.87109375, "learning_rate": 1.8258143497484578e-06, "logits/chosen": 0.00525292893871665, "logits/rejected": 0.4925769865512848, "logps/chosen": -371.81378173828125, "logps/rejected": -569.6729736328125, "loss": 0.2117, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -1.4877418279647827, "rewards/margins": 2.2526438236236572, "rewards/margins_max": 3.327693462371826, "rewards/margins_min": 1.1775938272476196, "rewards/margins_std": 1.5203502178192139, "rewards/rejected": -3.7403857707977295, "step": 1080 }, { "epoch": 0.27, "grad_norm": 0.9921875, "learning_rate": 1.8208281935301955e-06, "logits/chosen": 0.2466718703508377, "logits/rejected": 0.6609460711479187, "logps/chosen": -388.27032470703125, "logps/rejected": -604.4417724609375, "loss": 0.2834, "rewards/accuracies": 0.887499988079071, "rewards/chosen": -1.688057541847229, "rewards/margins": 2.2050793170928955, "rewards/margins_max": 3.456709384918213, "rewards/margins_min": 0.9534494280815125, "rewards/margins_std": 1.7700719833374023, "rewards/rejected": -3.893136501312256, "step": 1090 }, { "epoch": 0.28, "grad_norm": 1.0703125, "learning_rate": 1.8157786862065731e-06, "logits/chosen": 0.21708440780639648, "logits/rejected": 0.6412609219551086, "logps/chosen": -420.9542541503906, "logps/rejected": -639.0484008789062, "loss": 0.3048, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": -1.730591058731079, "rewards/margins": 2.4535133838653564, "rewards/margins_max": 3.991995334625244, "rewards/margins_min": 0.9150320291519165, "rewards/margins_std": 2.175741672515869, "rewards/rejected": -4.1841044425964355, "step": 1100 }, { "epoch": 0.28, "grad_norm": 1.5234375, "learning_rate": 1.810666217496015e-06, "logits/chosen": 0.30106106400489807, "logits/rejected": 0.7213363647460938, "logps/chosen": -380.4069519042969, "logps/rejected": -642.5003051757812, "loss": 0.2797, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -1.6323025226593018, "rewards/margins": 2.6721444129943848, "rewards/margins_max": 3.9874916076660156, "rewards/margins_min": 1.3567968606948853, "rewards/margins_std": 1.8601821660995483, "rewards/rejected": -4.304447174072266, "step": 1110 }, { "epoch": 0.28, "grad_norm": 1.1015625, "learning_rate": 1.8054911819762739e-06, "logits/chosen": 0.11988552659749985, "logits/rejected": 0.576012372970581, "logps/chosen": -327.7829284667969, "logps/rejected": -495.76678466796875, "loss": 0.2594, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -1.3872520923614502, "rewards/margins": 1.7396234273910522, "rewards/margins_max": 2.6414241790771484, "rewards/margins_min": 0.8378230929374695, "rewards/margins_std": 1.2753384113311768, "rewards/rejected": -3.126875877380371, "step": 1120 }, { "epoch": 0.28, "grad_norm": 0.87890625, "learning_rate": 1.800253979053977e-06, "logits/chosen": 0.15926051139831543, "logits/rejected": 0.5235914587974548, "logps/chosen": -384.0852355957031, "logps/rejected": -643.6329956054688, "loss": 0.2519, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -1.570711612701416, "rewards/margins": 2.6680593490600586, "rewards/margins_max": 3.8719935417175293, "rewards/margins_min": 1.4641246795654297, "rewards/margins_std": 1.702620506286621, "rewards/rejected": -4.238770961761475, "step": 1130 }, { "epoch": 0.29, "grad_norm": 0.9375, "learning_rate": 1.7949550129338005e-06, "logits/chosen": 0.06529082357883453, "logits/rejected": 0.5438031554222107, "logps/chosen": -414.1963806152344, "logps/rejected": -667.8323364257812, "loss": 0.2537, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -1.781734824180603, "rewards/margins": 2.5534157752990723, "rewards/margins_max": 3.9235243797302246, "rewards/margins_min": 1.1833075284957886, "rewards/margins_std": 1.9376258850097656, "rewards/rejected": -4.335150718688965, "step": 1140 }, { "epoch": 0.29, "grad_norm": 0.94140625, "learning_rate": 1.7895946925872731e-06, "logits/chosen": 0.261190265417099, "logits/rejected": 0.5999152660369873, "logps/chosen": -390.030029296875, "logps/rejected": -711.2191162109375, "loss": 0.2192, "rewards/accuracies": 0.9375, "rewards/chosen": -1.877661108970642, "rewards/margins": 3.1061835289001465, "rewards/margins_max": 4.556717872619629, "rewards/margins_min": 1.6556494235992432, "rewards/margins_std": 2.0513651371002197, "rewards/rejected": -4.98384428024292, "step": 1150 }, { "epoch": 0.29, "grad_norm": 1.2734375, "learning_rate": 1.7841734317212116e-06, "logits/chosen": 0.1313336342573166, "logits/rejected": 0.593550443649292, "logps/chosen": -413.43701171875, "logps/rejected": -669.3980102539062, "loss": 0.2464, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -1.9083925485610962, "rewards/margins": 2.6244053840637207, "rewards/margins_max": 4.089666843414307, "rewards/margins_min": 1.1591440439224243, "rewards/margins_std": 2.072192668914795, "rewards/rejected": -4.5327982902526855, "step": 1160 }, { "epoch": 0.29, "grad_norm": 0.859375, "learning_rate": 1.7786916487457911e-06, "logits/chosen": 0.10810734331607819, "logits/rejected": 0.658301055431366, "logps/chosen": -426.03692626953125, "logps/rejected": -653.3524169921875, "loss": 0.2693, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -2.0395500659942627, "rewards/margins": 2.416887044906616, "rewards/margins_max": 3.8193812370300293, "rewards/margins_min": 1.0143930912017822, "rewards/margins_std": 1.9834257364273071, "rewards/rejected": -4.456437110900879, "step": 1170 }, { "epoch": 0.3, "grad_norm": 1.1171875, "learning_rate": 1.7731497667422526e-06, "logits/chosen": 0.18602201342582703, "logits/rejected": 0.5325266718864441, "logps/chosen": -397.35009765625, "logps/rejected": -679.3014526367188, "loss": 0.2432, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -1.8916324377059937, "rewards/margins": 2.8336009979248047, "rewards/margins_max": 4.271034240722656, "rewards/margins_min": 1.3961678743362427, "rewards/margins_std": 2.032837390899658, "rewards/rejected": -4.725234031677246, "step": 1180 }, { "epoch": 0.3, "grad_norm": 0.9765625, "learning_rate": 1.7675482134302499e-06, "logits/chosen": 0.25429344177246094, "logits/rejected": 0.5315398573875427, "logps/chosen": -383.9230651855469, "logps/rejected": -631.6602172851562, "loss": 0.223, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -1.8132168054580688, "rewards/margins": 2.535676956176758, "rewards/margins_max": 3.828932523727417, "rewards/margins_min": 1.2424218654632568, "rewards/margins_std": 1.828939437866211, "rewards/rejected": -4.3488945960998535, "step": 1190 }, { "epoch": 0.3, "grad_norm": 0.73046875, "learning_rate": 1.7618874211348381e-06, "logits/chosen": 0.23039917647838593, "logits/rejected": 0.6885030269622803, "logps/chosen": -433.42205810546875, "logps/rejected": -697.4190063476562, "loss": 0.2455, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -2.1791675090789795, "rewards/margins": 2.756437301635742, "rewards/margins_max": 4.101160049438477, "rewards/margins_min": 1.4117141962051392, "rewards/margins_std": 1.9017255306243896, "rewards/rejected": -4.935604572296143, "step": 1200 }, { "epoch": 0.3, "grad_norm": 0.69140625, "learning_rate": 1.7561678267531078e-06, "logits/chosen": 0.25268083810806274, "logits/rejected": 0.638781726360321, "logps/chosen": -411.7583923339844, "logps/rejected": -672.2666625976562, "loss": 0.2394, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -1.9081943035125732, "rewards/margins": 2.7916979789733887, "rewards/margins_max": 4.282321929931641, "rewards/margins_min": 1.3010739088058472, "rewards/margins_std": 2.108060598373413, "rewards/rejected": -4.699892520904541, "step": 1210 }, { "epoch": 0.31, "grad_norm": 0.828125, "learning_rate": 1.7503898717204631e-06, "logits/chosen": 0.1927916258573532, "logits/rejected": 0.6581898927688599, "logps/chosen": -395.4380798339844, "logps/rejected": -690.7493896484375, "loss": 0.2031, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -1.8340580463409424, "rewards/margins": 3.096101760864258, "rewards/margins_max": 4.713414192199707, "rewards/margins_min": 1.4787895679473877, "rewards/margins_std": 2.287224531173706, "rewards/rejected": -4.930159568786621, "step": 1220 }, { "epoch": 0.31, "grad_norm": 1.109375, "learning_rate": 1.7445540019765558e-06, "logits/chosen": 0.1801643818616867, "logits/rejected": 0.595844030380249, "logps/chosen": -403.8412170410156, "logps/rejected": -678.00146484375, "loss": 0.2743, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -2.0799314975738525, "rewards/margins": 2.5960934162139893, "rewards/margins_max": 4.042551517486572, "rewards/margins_min": 1.1496355533599854, "rewards/margins_std": 2.045600414276123, "rewards/rejected": -4.676024436950684, "step": 1230 }, { "epoch": 0.31, "grad_norm": 1.171875, "learning_rate": 1.7386606679308648e-06, "logits/chosen": 0.27586087584495544, "logits/rejected": 0.6709119081497192, "logps/chosen": -427.78265380859375, "logps/rejected": -727.4019775390625, "loss": 0.2212, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -1.961285948753357, "rewards/margins": 2.957775592803955, "rewards/margins_max": 4.439484119415283, "rewards/margins_min": 1.4760667085647583, "rewards/margins_std": 2.0954525470733643, "rewards/rejected": -4.919060707092285, "step": 1240 }, { "epoch": 0.31, "grad_norm": 1.234375, "learning_rate": 1.7327103244279347e-06, "logits/chosen": 0.21624751389026642, "logits/rejected": 0.5476531386375427, "logps/chosen": -414.1004943847656, "logps/rejected": -790.1434326171875, "loss": 0.209, "rewards/accuracies": 0.9375, "rewards/chosen": -1.9725745916366577, "rewards/margins": 3.613555431365967, "rewards/margins_max": 5.268401145935059, "rewards/margins_min": 1.9587090015411377, "rewards/margins_std": 2.340306520462036, "rewards/rejected": -5.586129665374756, "step": 1250 }, { "epoch": 0.32, "grad_norm": 1.5078125, "learning_rate": 1.7267034307122716e-06, "logits/chosen": 0.21748849749565125, "logits/rejected": 0.5897720456123352, "logps/chosen": -445.33135986328125, "logps/rejected": -714.2196044921875, "loss": 0.2039, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -2.2084898948669434, "rewards/margins": 2.672968864440918, "rewards/margins_max": 4.1215972900390625, "rewards/margins_min": 1.2243406772613525, "rewards/margins_std": 2.0486698150634766, "rewards/rejected": -4.881458759307861, "step": 1260 }, { "epoch": 0.32, "grad_norm": 0.55078125, "learning_rate": 1.720640450392898e-06, "logits/chosen": 0.3318621516227722, "logits/rejected": 0.7322698831558228, "logps/chosen": -412.41571044921875, "logps/rejected": -811.5214233398438, "loss": 0.2346, "rewards/accuracies": 0.9375, "rewards/chosen": -1.9910354614257812, "rewards/margins": 4.049851417541504, "rewards/margins_max": 6.252103805541992, "rewards/margins_min": 1.8475990295410156, "rewards/margins_std": 3.114454984664917, "rewards/rejected": -6.040886402130127, "step": 1270 }, { "epoch": 0.32, "grad_norm": 0.5625, "learning_rate": 1.7145218514075728e-06, "logits/chosen": 0.07924878597259521, "logits/rejected": 0.4982023239135742, "logps/chosen": -447.25238037109375, "logps/rejected": -724.427490234375, "loss": 0.237, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": -2.447425365447998, "rewards/margins": 2.800419330596924, "rewards/margins_max": 4.151153087615967, "rewards/margins_min": 1.4496856927871704, "rewards/margins_std": 1.9102258682250977, "rewards/rejected": -5.247844219207764, "step": 1280 }, { "epoch": 0.32, "grad_norm": 0.984375, "learning_rate": 1.7083481059866747e-06, "logits/chosen": 0.213484525680542, "logits/rejected": 0.7719516158103943, "logps/chosen": -416.8954162597656, "logps/rejected": -753.9082641601562, "loss": 0.195, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -2.0533287525177, "rewards/margins": 3.40852689743042, "rewards/margins_max": 5.488340377807617, "rewards/margins_min": 1.3287138938903809, "rewards/margins_std": 2.9412999153137207, "rewards/rejected": -5.461855888366699, "step": 1290 }, { "epoch": 0.33, "grad_norm": 5.21875, "learning_rate": 1.7021196906167571e-06, "logits/chosen": 0.24803981184959412, "logits/rejected": 0.8145266771316528, "logps/chosen": -478.02178955078125, "logps/rejected": -837.2579345703125, "loss": 0.2184, "rewards/accuracies": 0.9375, "rewards/chosen": -2.494419813156128, "rewards/margins": 3.7387890815734863, "rewards/margins_max": 5.558200836181641, "rewards/margins_min": 1.9193763732910156, "rewards/margins_std": 2.573037624359131, "rewards/rejected": -6.233208656311035, "step": 1300 }, { "epoch": 0.33, "grad_norm": 2.90625, "learning_rate": 1.6958370860037716e-06, "logits/chosen": 0.11850683391094208, "logits/rejected": 0.5389954447746277, "logps/chosen": -446.10235595703125, "logps/rejected": -709.673828125, "loss": 0.2605, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": -2.3752357959747314, "rewards/margins": 2.7870852947235107, "rewards/margins_max": 4.720024108886719, "rewards/margins_min": 0.8541472554206848, "rewards/margins_std": 2.7335875034332275, "rewards/rejected": -5.162322044372559, "step": 1310 }, { "epoch": 0.33, "grad_norm": 2.21875, "learning_rate": 1.6895007770359697e-06, "logits/chosen": 0.3192082941532135, "logits/rejected": 0.6527734994888306, "logps/chosen": -487.09027099609375, "logps/rejected": -853.08154296875, "loss": 0.2319, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -2.527470111846924, "rewards/margins": 3.651360273361206, "rewards/margins_max": 5.692448616027832, "rewards/margins_min": 1.6102720499038696, "rewards/margins_std": 2.8865349292755127, "rewards/rejected": -6.178830146789551, "step": 1320 }, { "epoch": 0.33, "grad_norm": 1.5390625, "learning_rate": 1.6831112527464763e-06, "logits/chosen": 0.322293221950531, "logits/rejected": 0.581436276435852, "logps/chosen": -464.16619873046875, "logps/rejected": -799.5835571289062, "loss": 0.1916, "rewards/accuracies": 0.9375, "rewards/chosen": -2.4754815101623535, "rewards/margins": 3.4213860034942627, "rewards/margins_max": 5.288193225860596, "rewards/margins_min": 1.5545791387557983, "rewards/margins_std": 2.640063762664795, "rewards/rejected": -5.8968682289123535, "step": 1330 }, { "epoch": 0.34, "grad_norm": 4.03125, "learning_rate": 1.6766690062755487e-06, "logits/chosen": 0.253692090511322, "logits/rejected": 0.5565173029899597, "logps/chosen": -449.30072021484375, "logps/rejected": -752.14794921875, "loss": 0.2402, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -2.4055395126342773, "rewards/margins": 3.2085556983947754, "rewards/margins_max": 5.205798625946045, "rewards/margins_min": 1.2113126516342163, "rewards/margins_std": 2.824528455734253, "rewards/rejected": -5.614095211029053, "step": 1340 }, { "epoch": 0.34, "grad_norm": 1.390625, "learning_rate": 1.6701745348325153e-06, "logits/chosen": 0.3277135491371155, "logits/rejected": 0.6626953482627869, "logps/chosen": -425.80291748046875, "logps/rejected": -829.0848388671875, "loss": 0.2112, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -2.336625337600708, "rewards/margins": 3.792543888092041, "rewards/margins_max": 5.489853858947754, "rewards/margins_min": 2.09523344039917, "rewards/margins_std": 2.4003586769104004, "rewards/rejected": -6.129168510437012, "step": 1350 }, { "epoch": 0.34, "grad_norm": 3.53125, "learning_rate": 1.6636283396574018e-06, "logits/chosen": 0.19394713640213013, "logits/rejected": 0.7184884548187256, "logps/chosen": -458.70538330078125, "logps/rejected": -763.5852661132812, "loss": 0.2317, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -2.3784217834472656, "rewards/margins": 3.29382061958313, "rewards/margins_max": 4.978985786437988, "rewards/margins_min": 1.6086561679840088, "rewards/margins_std": 2.3831827640533447, "rewards/rejected": -5.672242164611816, "step": 1360 }, { "epoch": 0.34, "grad_norm": 0.8515625, "learning_rate": 1.6570309259822453e-06, "logits/chosen": 0.2924334406852722, "logits/rejected": 0.6883147358894348, "logps/chosen": -427.87127685546875, "logps/rejected": -780.0853881835938, "loss": 0.1848, "rewards/accuracies": 0.9375, "rewards/chosen": -2.2181785106658936, "rewards/margins": 3.3977344036102295, "rewards/margins_max": 5.132598876953125, "rewards/margins_min": 1.662870168685913, "rewards/margins_std": 2.4534687995910645, "rewards/rejected": -5.615913391113281, "step": 1370 }, { "epoch": 0.35, "grad_norm": 0.921875, "learning_rate": 1.6503828029921002e-06, "logits/chosen": 0.5088449716567993, "logits/rejected": 0.8754922151565552, "logps/chosen": -491.39111328125, "logps/rejected": -809.8668212890625, "loss": 0.1959, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -2.6552605628967285, "rewards/margins": 3.4456450939178467, "rewards/margins_max": 5.258717060089111, "rewards/margins_min": 1.632573127746582, "rewards/margins_std": 2.5640709400177, "rewards/rejected": -6.100905418395996, "step": 1380 }, { "epoch": 0.35, "grad_norm": 1.4375, "learning_rate": 1.6436844837857416e-06, "logits/chosen": 0.2816401422023773, "logits/rejected": 0.5672039985656738, "logps/chosen": -441.32769775390625, "logps/rejected": -788.40576171875, "loss": 0.1852, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -2.3083629608154297, "rewards/margins": 3.4626381397247314, "rewards/margins_max": 5.100627899169922, "rewards/margins_min": 1.8246475458145142, "rewards/margins_std": 2.316467761993408, "rewards/rejected": -5.771000862121582, "step": 1390 }, { "epoch": 0.35, "grad_norm": 1.1484375, "learning_rate": 1.6369364853360619e-06, "logits/chosen": 0.39103689789772034, "logits/rejected": 0.6895217299461365, "logps/chosen": -496.9537658691406, "logps/rejected": -941.9112548828125, "loss": 0.2644, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -2.8370614051818848, "rewards/margins": 4.051840782165527, "rewards/margins_max": 6.340351104736328, "rewards/margins_min": 1.7633311748504639, "rewards/margins_std": 3.2364420890808105, "rewards/rejected": -6.8889031410217285, "step": 1400 }, { "epoch": 0.35, "grad_norm": 1.78125, "learning_rate": 1.630139328450173e-06, "logits/chosen": 0.29026108980178833, "logits/rejected": 0.6609446406364441, "logps/chosen": -475.28985595703125, "logps/rejected": -914.2362060546875, "loss": 0.2142, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -2.75921368598938, "rewards/margins": 4.294146537780762, "rewards/margins_max": 6.507538795471191, "rewards/margins_min": 2.0807533264160156, "rewards/margins_std": 3.1302103996276855, "rewards/rejected": -7.0533599853515625, "step": 1410 }, { "epoch": 0.36, "grad_norm": 2.5625, "learning_rate": 1.6232935377292098e-06, "logits/chosen": 0.09786330163478851, "logits/rejected": 0.5836361646652222, "logps/chosen": -473.8296813964844, "logps/rejected": -825.7180786132812, "loss": 0.2472, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -2.630255699157715, "rewards/margins": 3.6065337657928467, "rewards/margins_max": 5.6653971672058105, "rewards/margins_min": 1.547670602798462, "rewards/margins_std": 2.911672830581665, "rewards/rejected": -6.236789703369141, "step": 1420 }, { "epoch": 0.36, "grad_norm": 1.5625, "learning_rate": 1.6163996415278423e-06, "logits/chosen": 0.42069101333618164, "logits/rejected": 0.6874132752418518, "logps/chosen": -422.8811950683594, "logps/rejected": -810.5554809570312, "loss": 0.1972, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -2.2699074745178223, "rewards/margins": 3.803889036178589, "rewards/margins_max": 5.551349639892578, "rewards/margins_min": 2.0564279556274414, "rewards/margins_std": 2.471282482147217, "rewards/rejected": -6.073796272277832, "step": 1430 }, { "epoch": 0.36, "grad_norm": 1.9140625, "learning_rate": 1.6094581719134973e-06, "logits/chosen": 0.23529568314552307, "logits/rejected": 0.7506182789802551, "logps/chosen": -488.1771545410156, "logps/rejected": -945.7060546875, "loss": 0.2158, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -2.3911712169647217, "rewards/margins": 4.839472770690918, "rewards/margins_max": 7.3261566162109375, "rewards/margins_min": 2.3527889251708984, "rewards/margins_std": 3.516702175140381, "rewards/rejected": -7.230644226074219, "step": 1440 }, { "epoch": 0.36, "grad_norm": 0.859375, "learning_rate": 1.602469664625293e-06, "logits/chosen": 0.31949982047080994, "logits/rejected": 0.5621960759162903, "logps/chosen": -475.3267517089844, "logps/rejected": -1032.7867431640625, "loss": 0.1546, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -2.7003226280212402, "rewards/margins": 5.414786338806152, "rewards/margins_max": 8.179932594299316, "rewards/margins_min": 2.6496407985687256, "rewards/margins_std": 3.9105067253112793, "rewards/rejected": -8.115108489990234, "step": 1450 }, { "epoch": 0.37, "grad_norm": 0.9609375, "learning_rate": 1.5954346590326923e-06, "logits/chosen": 0.22190162539482117, "logits/rejected": 0.5015226602554321, "logps/chosen": -465.91943359375, "logps/rejected": -911.0699462890625, "loss": 0.184, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -2.6864209175109863, "rewards/margins": 4.381344318389893, "rewards/margins_max": 6.3844804763793945, "rewards/margins_min": 2.3782083988189697, "rewards/margins_std": 2.832862377166748, "rewards/rejected": -7.067765712738037, "step": 1460 }, { "epoch": 0.37, "grad_norm": 2.140625, "learning_rate": 1.5883536980938731e-06, "logits/chosen": 0.37031736969947815, "logits/rejected": 0.7043700218200684, "logps/chosen": -489.9381408691406, "logps/rejected": -984.1287841796875, "loss": 0.1951, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -2.8816380500793457, "rewards/margins": 4.811006546020508, "rewards/margins_max": 7.4277215003967285, "rewards/margins_min": 2.1942927837371826, "rewards/margins_std": 3.7005927562713623, "rewards/rejected": -7.6926445960998535, "step": 1470 }, { "epoch": 0.37, "grad_norm": 2.0, "learning_rate": 1.5812273283138238e-06, "logits/chosen": 0.5258148908615112, "logits/rejected": 0.7043691873550415, "logps/chosen": -522.13134765625, "logps/rejected": -1048.264892578125, "loss": 0.2019, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -3.000220775604248, "rewards/margins": 4.934444427490234, "rewards/margins_max": 7.410369873046875, "rewards/margins_min": 2.458519458770752, "rewards/margins_std": 3.5014865398406982, "rewards/rejected": -7.934664726257324, "step": 1480 }, { "epoch": 0.37, "grad_norm": 0.859375, "learning_rate": 1.5740560997021647e-06, "logits/chosen": 0.4362607002258301, "logits/rejected": 0.8238092660903931, "logps/chosen": -533.056640625, "logps/rejected": -982.03564453125, "loss": 0.2047, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -3.158689022064209, "rewards/margins": 4.55390739440918, "rewards/margins_max": 6.570149898529053, "rewards/margins_min": 2.5376639366149902, "rewards/margins_std": 2.851398229598999, "rewards/rejected": -7.7125959396362305, "step": 1490 }, { "epoch": 0.38, "grad_norm": 0.79296875, "learning_rate": 1.5668405657306973e-06, "logits/chosen": 0.5168190598487854, "logits/rejected": 0.8230735659599304, "logps/chosen": -525.788818359375, "logps/rejected": -1038.3076171875, "loss": 0.1925, "rewards/accuracies": 0.9375, "rewards/chosen": -3.3270175457000732, "rewards/margins": 4.954278469085693, "rewards/margins_max": 7.399777889251709, "rewards/margins_min": 2.5087785720825195, "rewards/margins_std": 3.4584591388702393, "rewards/rejected": -8.281296730041504, "step": 1500 }, { "epoch": 0.38, "grad_norm": 0.90625, "learning_rate": 1.559581283290689e-06, "logits/chosen": 0.3661649823188782, "logits/rejected": 0.813243567943573, "logps/chosen": -504.9390563964844, "logps/rejected": -1073.157470703125, "loss": 0.2528, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -2.903505325317383, "rewards/margins": 5.695023536682129, "rewards/margins_max": 8.787598609924316, "rewards/margins_min": 2.6024482250213623, "rewards/margins_std": 4.373561859130859, "rewards/rejected": -8.598528861999512, "step": 1510 }, { "epoch": 0.38, "grad_norm": 1.171875, "learning_rate": 1.5522788126498915e-06, "logits/chosen": 0.28599125146865845, "logits/rejected": 0.6875888109207153, "logps/chosen": -599.96484375, "logps/rejected": -924.0895385742188, "loss": 0.3888, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -3.4606220722198486, "rewards/margins": 3.549129009246826, "rewards/margins_max": 6.038578033447266, "rewards/margins_min": 1.0596802234649658, "rewards/margins_std": 3.5206127166748047, "rewards/rejected": -7.0097503662109375, "step": 1520 }, { "epoch": 0.38, "grad_norm": 2.21875, "learning_rate": 1.544933717409301e-06, "logits/chosen": 0.3157169818878174, "logits/rejected": 0.866075336933136, "logps/chosen": -495.86260986328125, "logps/rejected": -990.1038208007812, "loss": 0.1955, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -2.8637332916259766, "rewards/margins": 4.898533344268799, "rewards/margins_max": 7.39690637588501, "rewards/margins_min": 2.4001593589782715, "rewards/margins_std": 3.533233642578125, "rewards/rejected": -7.762265682220459, "step": 1530 }, { "epoch": 0.39, "grad_norm": 1.9296875, "learning_rate": 1.537546564459657e-06, "logits/chosen": 0.3015773594379425, "logits/rejected": 0.8017538785934448, "logps/chosen": -490.8206481933594, "logps/rejected": -873.3521728515625, "loss": 0.2619, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": -2.775757074356079, "rewards/margins": 4.02579402923584, "rewards/margins_max": 6.456332206726074, "rewards/margins_min": 1.5952569246292114, "rewards/margins_std": 3.4372992515563965, "rewards/rejected": -6.80155086517334, "step": 1540 }, { "epoch": 0.39, "grad_norm": 1.984375, "learning_rate": 1.5301179239376935e-06, "logits/chosen": 0.19896200299263, "logits/rejected": 0.48105502128601074, "logps/chosen": -472.67767333984375, "logps/rejected": -867.4581909179688, "loss": 0.2304, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -2.7534565925598145, "rewards/margins": 3.8812403678894043, "rewards/margins_max": 5.89428186416626, "rewards/margins_min": 1.8681997060775757, "rewards/margins_std": 2.846869707107544, "rewards/rejected": -6.634696960449219, "step": 1550 }, { "epoch": 0.39, "grad_norm": 2.234375, "learning_rate": 1.5226483691821335e-06, "logits/chosen": 0.43792515993118286, "logits/rejected": 0.809437096118927, "logps/chosen": -493.06121826171875, "logps/rejected": -883.27392578125, "loss": 0.2847, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -2.7731716632843018, "rewards/margins": 3.9591903686523438, "rewards/margins_max": 5.8009843826293945, "rewards/margins_min": 2.117396116256714, "rewards/margins_std": 2.6046900749206543, "rewards/rejected": -6.732362270355225, "step": 1560 }, { "epoch": 0.4, "grad_norm": 0.859375, "learning_rate": 1.5151384766894394e-06, "logits/chosen": 0.25252875685691833, "logits/rejected": 0.705254852771759, "logps/chosen": -461.9195861816406, "logps/rejected": -954.2716674804688, "loss": 0.1947, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -2.399007797241211, "rewards/margins": 4.97990608215332, "rewards/margins_max": 7.502171516418457, "rewards/margins_min": 2.4576408863067627, "rewards/margins_std": 3.567021608352661, "rewards/rejected": -7.378913879394531, "step": 1570 }, { "epoch": 0.4, "grad_norm": 2.984375, "learning_rate": 1.5075888260693213e-06, "logits/chosen": 0.20744235813617706, "logits/rejected": 0.490752637386322, "logps/chosen": -477.570068359375, "logps/rejected": -938.7899169921875, "loss": 0.1527, "rewards/accuracies": 0.9375, "rewards/chosen": -2.9296581745147705, "rewards/margins": 4.446514129638672, "rewards/margins_max": 6.75622034072876, "rewards/margins_min": 2.136807441711426, "rewards/margins_std": 3.2664177417755127, "rewards/rejected": -7.376172065734863, "step": 1580 }, { "epoch": 0.4, "grad_norm": 0.66015625, "learning_rate": 1.5e-06, "logits/chosen": 0.25578054785728455, "logits/rejected": 0.7139034867286682, "logps/chosen": -460.7802734375, "logps/rejected": -875.15283203125, "loss": 0.1833, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -2.5101754665374756, "rewards/margins": 4.372230052947998, "rewards/margins_max": 6.4557671546936035, "rewards/margins_min": 2.2886929512023926, "rewards/margins_std": 2.9465668201446533, "rewards/rejected": -6.882405757904053, "step": 1590 }, { "epoch": 0.4, "grad_norm": 0.9609375, "learning_rate": 1.4923725841832382e-06, "logits/chosen": 0.2641240358352661, "logits/rejected": 0.716410756111145, "logps/chosen": -516.456298828125, "logps/rejected": -952.1373901367188, "loss": 0.2241, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -2.865016460418701, "rewards/margins": 4.353453636169434, "rewards/margins_max": 6.641819953918457, "rewards/margins_min": 2.0650863647460938, "rewards/margins_std": 3.236238956451416, "rewards/rejected": -7.218469142913818, "step": 1600 }, { "epoch": 0.41, "grad_norm": 1.7265625, "learning_rate": 1.4847071672991365e-06, "logits/chosen": 0.38563448190689087, "logits/rejected": 0.6463780999183655, "logps/chosen": -488.828125, "logps/rejected": -1118.4166259765625, "loss": 0.1518, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -2.9519083499908447, "rewards/margins": 5.86095666885376, "rewards/margins_max": 7.7536725997924805, "rewards/margins_min": 3.9682400226593018, "rewards/margins_std": 2.6767053604125977, "rewards/rejected": -8.812864303588867, "step": 1610 }, { "epoch": 0.41, "grad_norm": 1.4765625, "learning_rate": 1.4770043409606979e-06, "logits/chosen": 0.47096341848373413, "logits/rejected": 0.6747244596481323, "logps/chosen": -484.9869689941406, "logps/rejected": -1008.4423828125, "loss": 0.1929, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -2.9664974212646484, "rewards/margins": 5.106780529022217, "rewards/margins_max": 7.504878997802734, "rewards/margins_min": 2.7086825370788574, "rewards/margins_std": 3.391422748565674, "rewards/rejected": -8.073277473449707, "step": 1620 }, { "epoch": 0.41, "grad_norm": 0.98828125, "learning_rate": 1.4692646996681678e-06, "logits/chosen": 0.47422710061073303, "logits/rejected": 0.685745358467102, "logps/chosen": -458.32696533203125, "logps/rejected": -1029.868408203125, "loss": 0.1466, "rewards/accuracies": 1.0, "rewards/chosen": -2.6799213886260986, "rewards/margins": 5.579705715179443, "rewards/margins_max": 8.32711410522461, "rewards/margins_min": 2.8322973251342773, "rewards/margins_std": 3.885422945022583, "rewards/rejected": -8.259626388549805, "step": 1630 }, { "epoch": 0.41, "grad_norm": 1.3359375, "learning_rate": 1.4614888407631518e-06, "logits/chosen": 0.27054479718208313, "logits/rejected": 0.8626736402511597, "logps/chosen": -516.4133911132812, "logps/rejected": -979.1773681640625, "loss": 0.1787, "rewards/accuracies": 0.9375, "rewards/chosen": -2.8786580562591553, "rewards/margins": 4.838742733001709, "rewards/margins_max": 7.334200382232666, "rewards/margins_min": 2.3432841300964355, "rewards/margins_std": 3.5291106700897217, "rewards/rejected": -7.717400550842285, "step": 1640 }, { "epoch": 0.42, "grad_norm": 1.265625, "learning_rate": 1.4536773643825129e-06, "logits/chosen": 0.35027459263801575, "logits/rejected": 0.6745755076408386, "logps/chosen": -515.3684692382812, "logps/rejected": -903.1793823242188, "loss": 0.1713, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": -2.938141107559204, "rewards/margins": 3.8568992614746094, "rewards/margins_max": 5.576117038726807, "rewards/margins_min": 2.137681484222412, "rewards/margins_std": 2.4313409328460693, "rewards/rejected": -6.795041084289551, "step": 1650 }, { "epoch": 0.42, "grad_norm": 0.90234375, "learning_rate": 1.4458308734120524e-06, "logits/chosen": 0.308353990316391, "logits/rejected": 0.7953172922134399, "logps/chosen": -468.75897216796875, "logps/rejected": -853.1901245117188, "loss": 0.2044, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -2.7174980640411377, "rewards/margins": 3.790794849395752, "rewards/margins_max": 5.873719215393066, "rewards/margins_min": 1.7078701257705688, "rewards/margins_std": 2.9457004070281982, "rewards/rejected": -6.508293151855469, "step": 1660 }, { "epoch": 0.42, "grad_norm": 0.58203125, "learning_rate": 1.4379499734399796e-06, "logits/chosen": 0.3180529773235321, "logits/rejected": 0.6785213351249695, "logps/chosen": -446.6173400878906, "logps/rejected": -992.4054565429688, "loss": 0.1215, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -2.490750789642334, "rewards/margins": 5.268213748931885, "rewards/margins_max": 7.856361389160156, "rewards/margins_min": 2.680065631866455, "rewards/margins_std": 3.660193681716919, "rewards/rejected": -7.758963584899902, "step": 1670 }, { "epoch": 0.42, "grad_norm": 3.234375, "learning_rate": 1.4300352727101737e-06, "logits/chosen": 0.39259445667266846, "logits/rejected": 0.7314284443855286, "logps/chosen": -519.9503173828125, "logps/rejected": -1025.9241943359375, "loss": 0.1885, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -3.1284713745117188, "rewards/margins": 4.959225654602051, "rewards/margins_max": 7.500026702880859, "rewards/margins_min": 2.418423891067505, "rewards/margins_std": 3.593236207962036, "rewards/rejected": -8.08769702911377, "step": 1680 }, { "epoch": 0.43, "grad_norm": 3.90625, "learning_rate": 1.4220873820752395e-06, "logits/chosen": 0.3535314202308655, "logits/rejected": 0.8503448367118835, "logps/chosen": -514.1683349609375, "logps/rejected": -1090.859619140625, "loss": 0.233, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -2.791045665740967, "rewards/margins": 5.699390411376953, "rewards/margins_max": 9.158174514770508, "rewards/margins_min": 2.240605115890503, "rewards/margins_std": 4.891460418701172, "rewards/rejected": -8.490435600280762, "step": 1690 }, { "epoch": 0.43, "grad_norm": 1.25, "learning_rate": 1.414106914949361e-06, "logits/chosen": 0.2840239107608795, "logits/rejected": 0.7125069499015808, "logps/chosen": -540.17822265625, "logps/rejected": -1097.022216796875, "loss": 0.228, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -3.084819793701172, "rewards/margins": 5.610352516174316, "rewards/margins_max": 9.078147888183594, "rewards/margins_min": 2.1425588130950928, "rewards/margins_std": 4.904201984405518, "rewards/rejected": -8.695172309875488, "step": 1700 }, { "epoch": 0.43, "grad_norm": 5.59375, "learning_rate": 1.4060944872609605e-06, "logits/chosen": 0.32603517174720764, "logits/rejected": 0.8470407724380493, "logps/chosen": -518.9697875976562, "logps/rejected": -984.0426635742188, "loss": 0.2196, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -2.9778990745544434, "rewards/margins": 4.964657783508301, "rewards/margins_max": 7.731484889984131, "rewards/margins_min": 2.1978302001953125, "rewards/margins_std": 3.912884473800659, "rewards/rejected": -7.942556858062744, "step": 1710 }, { "epoch": 0.43, "grad_norm": 0.88671875, "learning_rate": 1.3980507174051592e-06, "logits/chosen": 0.2727965712547302, "logits/rejected": 0.8068740963935852, "logps/chosen": -499.65667724609375, "logps/rejected": -935.8679809570312, "loss": 0.1572, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -2.829961061477661, "rewards/margins": 4.54538631439209, "rewards/margins_max": 6.558957099914551, "rewards/margins_min": 2.531816005706787, "rewards/margins_std": 2.847618579864502, "rewards/rejected": -7.375347137451172, "step": 1720 }, { "epoch": 0.44, "grad_norm": 3.578125, "learning_rate": 1.3899762261960517e-06, "logits/chosen": 0.456474244594574, "logits/rejected": 0.7277069091796875, "logps/chosen": -548.7547607421875, "logps/rejected": -1049.939208984375, "loss": 0.1818, "rewards/accuracies": 0.9375, "rewards/chosen": -3.2771522998809814, "rewards/margins": 4.879497051239014, "rewards/margins_max": 7.429345607757568, "rewards/margins_min": 2.329648017883301, "rewards/margins_std": 3.6060307025909424, "rewards/rejected": -8.156648635864258, "step": 1730 }, { "epoch": 0.44, "grad_norm": 0.71484375, "learning_rate": 1.381871636818791e-06, "logits/chosen": 0.24610686302185059, "logits/rejected": 0.7779833078384399, "logps/chosen": -458.67041015625, "logps/rejected": -814.8113403320312, "loss": 0.234, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -2.4150824546813965, "rewards/margins": 3.6235358715057373, "rewards/margins_max": 5.524462699890137, "rewards/margins_min": 1.7226091623306274, "rewards/margins_std": 2.6883163452148438, "rewards/rejected": -6.0386176109313965, "step": 1740 }, { "epoch": 0.44, "grad_norm": 1.0234375, "learning_rate": 1.3737375747814914e-06, "logits/chosen": 0.33012324571609497, "logits/rejected": 0.7673249244689941, "logps/chosen": -503.94842529296875, "logps/rejected": -943.15380859375, "loss": 0.2264, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": -3.0407915115356445, "rewards/margins": 4.353100776672363, "rewards/margins_max": 6.474888801574707, "rewards/margins_min": 2.2313132286071777, "rewards/margins_std": 3.0006611347198486, "rewards/rejected": -7.39389181137085, "step": 1750 }, { "epoch": 0.44, "grad_norm": 0.6328125, "learning_rate": 1.3655746678669524e-06, "logits/chosen": 0.44528093934059143, "logits/rejected": 0.9088476300239563, "logps/chosen": -535.5958251953125, "logps/rejected": -997.3358154296875, "loss": 0.1982, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -3.2844367027282715, "rewards/margins": 4.700140476226807, "rewards/margins_max": 6.8690080642700195, "rewards/margins_min": 2.53127384185791, "rewards/margins_std": 3.0672411918640137, "rewards/rejected": -7.984577178955078, "step": 1760 }, { "epoch": 0.45, "grad_norm": 3.5, "learning_rate": 1.3573835460842062e-06, "logits/chosen": 0.30346041917800903, "logits/rejected": 0.7271562814712524, "logps/chosen": -472.663330078125, "logps/rejected": -1005.3018798828125, "loss": 0.1877, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -2.7229371070861816, "rewards/margins": 5.189479351043701, "rewards/margins_max": 8.325895309448242, "rewards/margins_min": 2.0530643463134766, "rewards/margins_std": 4.435561180114746, "rewards/rejected": -7.912416934967041, "step": 1770 }, { "epoch": 0.45, "grad_norm": 13.8125, "learning_rate": 1.3491648416198947e-06, "logits/chosen": 0.3526113033294678, "logits/rejected": 0.6005972027778625, "logps/chosen": -493.38714599609375, "logps/rejected": -1039.467041015625, "loss": 0.1729, "rewards/accuracies": 1.0, "rewards/chosen": -2.9820990562438965, "rewards/margins": 5.418988227844238, "rewards/margins_max": 8.14158821105957, "rewards/margins_min": 2.6963882446289062, "rewards/margins_std": 3.850337505340576, "rewards/rejected": -8.401086807250977, "step": 1780 }, { "epoch": 0.45, "grad_norm": 3.421875, "learning_rate": 1.340919188789477e-06, "logits/chosen": 0.4165642261505127, "logits/rejected": 0.8380780220031738, "logps/chosen": -529.6197509765625, "logps/rejected": -929.248046875, "loss": 0.1725, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -3.2881054878234863, "rewards/margins": 4.095848083496094, "rewards/margins_max": 6.220312118530273, "rewards/margins_min": 1.971383810043335, "rewards/margins_std": 3.004446029663086, "rewards/rejected": -7.383954048156738, "step": 1790 }, { "epoch": 0.45, "grad_norm": 1.34375, "learning_rate": 1.3326472239882734e-06, "logits/chosen": 0.43543314933776855, "logits/rejected": 0.9925807118415833, "logps/chosen": -525.8448486328125, "logps/rejected": -1062.7294921875, "loss": 0.191, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -3.225259780883789, "rewards/margins": 5.37452507019043, "rewards/margins_max": 8.14264965057373, "rewards/margins_min": 2.6064014434814453, "rewards/margins_std": 3.9147191047668457, "rewards/rejected": -8.599784851074219, "step": 1800 }, { "epoch": 0.46, "grad_norm": 0.75390625, "learning_rate": 1.3243495856423489e-06, "logits/chosen": 0.36167892813682556, "logits/rejected": 0.8087556958198547, "logps/chosen": -553.3516845703125, "logps/rejected": -1192.406005859375, "loss": 0.1608, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -3.324446439743042, "rewards/margins": 6.299948692321777, "rewards/margins_max": 9.06318187713623, "rewards/margins_min": 3.5367157459259033, "rewards/margins_std": 3.907802104949951, "rewards/rejected": -9.624395370483398, "step": 1810 }, { "epoch": 0.46, "grad_norm": 2.03125, "learning_rate": 1.3160269141592396e-06, "logits/chosen": 0.39735549688339233, "logits/rejected": 0.7091315388679504, "logps/chosen": -510.31158447265625, "logps/rejected": -1065.9410400390625, "loss": 0.1815, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -2.9605770111083984, "rewards/margins": 5.331042289733887, "rewards/margins_max": 8.254611015319824, "rewards/margins_min": 2.407473087310791, "rewards/margins_std": 4.134551525115967, "rewards/rejected": -8.291619300842285, "step": 1820 }, { "epoch": 0.46, "grad_norm": 1.1328125, "learning_rate": 1.3076798518785272e-06, "logits/chosen": 0.4008331298828125, "logits/rejected": 0.8075596690177917, "logps/chosen": -503.36688232421875, "logps/rejected": -1024.9261474609375, "loss": 0.1532, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -3.090235948562622, "rewards/margins": 5.229609966278076, "rewards/margins_max": 8.532186508178711, "rewards/margins_min": 1.9270336627960205, "rewards/margins_std": 4.670548439025879, "rewards/rejected": -8.319845199584961, "step": 1830 }, { "epoch": 0.46, "grad_norm": 0.84375, "learning_rate": 1.2993090430222618e-06, "logits/chosen": 0.4138672351837158, "logits/rejected": 0.7416144013404846, "logps/chosen": -577.0935668945312, "logps/rejected": -1252.086181640625, "loss": 0.2346, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -3.5649490356445312, "rewards/margins": 6.544081211090088, "rewards/margins_max": 10.0579833984375, "rewards/margins_min": 3.030177593231201, "rewards/margins_std": 4.969409465789795, "rewards/rejected": -10.109029769897461, "step": 1840 }, { "epoch": 0.47, "grad_norm": 0.90625, "learning_rate": 1.2909151336452427e-06, "logits/chosen": 0.3605644702911377, "logits/rejected": 0.9392998814582825, "logps/chosen": -585.3292236328125, "logps/rejected": -1179.452392578125, "loss": 0.1969, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -3.3506827354431152, "rewards/margins": 6.266888618469238, "rewards/margins_max": 10.083221435546875, "rewards/margins_min": 2.4505550861358643, "rewards/margins_std": 5.3971099853515625, "rewards/rejected": -9.617570877075195, "step": 1850 }, { "epoch": 0.47, "grad_norm": 3.59375, "learning_rate": 1.2824987715851559e-06, "logits/chosen": 0.371305912733078, "logits/rejected": 0.8649128675460815, "logps/chosen": -520.8292846679688, "logps/rejected": -995.4801025390625, "loss": 0.1632, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -3.0674500465393066, "rewards/margins": 4.9427289962768555, "rewards/margins_max": 7.798059940338135, "rewards/margins_min": 2.0873985290527344, "rewards/margins_std": 4.038046836853027, "rewards/rejected": -8.01017951965332, "step": 1860 }, { "epoch": 0.47, "grad_norm": 3.03125, "learning_rate": 1.2740606064125737e-06, "logits/chosen": 0.24925783276557922, "logits/rejected": 0.7453621029853821, "logps/chosen": -577.8242797851562, "logps/rejected": -1344.7510986328125, "loss": 0.1198, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -3.645860195159912, "rewards/margins": 7.597962856292725, "rewards/margins_max": 11.720430374145508, "rewards/margins_min": 3.475494384765625, "rewards/margins_std": 5.830049991607666, "rewards/rejected": -11.243823051452637, "step": 1870 }, { "epoch": 0.47, "grad_norm": 0.81640625, "learning_rate": 1.265601289380822e-06, "logits/chosen": 0.44502177834510803, "logits/rejected": 0.7797173261642456, "logps/chosen": -554.5269775390625, "logps/rejected": -1116.17919921875, "loss": 0.1529, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -3.5217156410217285, "rewards/margins": 5.7031121253967285, "rewards/margins_max": 8.348360061645508, "rewards/margins_min": 3.057863712310791, "rewards/margins_std": 3.7409462928771973, "rewards/rejected": -9.224828720092773, "step": 1880 }, { "epoch": 0.48, "grad_norm": 1.21875, "learning_rate": 1.257121473375716e-06, "logits/chosen": 0.41753944754600525, "logits/rejected": 0.9086526036262512, "logps/chosen": -539.7106323242188, "logps/rejected": -1138.9730224609375, "loss": 0.2151, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -3.3240904808044434, "rewards/margins": 5.920177459716797, "rewards/margins_max": 9.132664680480957, "rewards/margins_min": 2.7076900005340576, "rewards/margins_std": 4.543143272399902, "rewards/rejected": -9.244268417358398, "step": 1890 }, { "epoch": 0.48, "grad_norm": 0.82421875, "learning_rate": 1.248621812865172e-06, "logits/chosen": 0.5087807774543762, "logits/rejected": 0.8646427989006042, "logps/chosen": -657.2662353515625, "logps/rejected": -1365.0345458984375, "loss": 0.2192, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -4.301126480102539, "rewards/margins": 7.023177146911621, "rewards/margins_max": 10.254752159118652, "rewards/margins_min": 3.7916018962860107, "rewards/margins_std": 4.570137977600098, "rewards/rejected": -11.324304580688477, "step": 1900 }, { "epoch": 0.48, "grad_norm": 0.75390625, "learning_rate": 1.240102963848695e-06, "logits/chosen": 0.4806975722312927, "logits/rejected": 0.7998193502426147, "logps/chosen": -539.7196044921875, "logps/rejected": -1049.60205078125, "loss": 0.2559, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -3.533198595046997, "rewards/margins": 5.0815324783325195, "rewards/margins_max": 8.007471084594727, "rewards/margins_min": 2.1555933952331543, "rewards/margins_std": 4.137903213500977, "rewards/rejected": -8.614730834960938, "step": 1910 }, { "epoch": 0.48, "grad_norm": 3.328125, "learning_rate": 1.2315655838067487e-06, "logits/chosen": 0.4073428511619568, "logits/rejected": 0.8953601121902466, "logps/chosen": -563.6649169921875, "logps/rejected": -1113.478515625, "loss": 0.2925, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -3.5817649364471436, "rewards/margins": 5.543329238891602, "rewards/margins_max": 8.386140823364258, "rewards/margins_min": 2.700516700744629, "rewards/margins_std": 4.02034330368042, "rewards/rejected": -9.125093460083008, "step": 1920 }, { "epoch": 0.49, "grad_norm": 1.078125, "learning_rate": 1.2230103316500127e-06, "logits/chosen": 0.4126254916191101, "logits/rejected": 0.8263294100761414, "logps/chosen": -555.5484008789062, "logps/rejected": -1175.513671875, "loss": 0.1711, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -3.3831818103790283, "rewards/margins": 6.290456295013428, "rewards/margins_max": 9.095232009887695, "rewards/margins_min": 3.4856808185577393, "rewards/margins_std": 3.9665520191192627, "rewards/rejected": -9.673639297485352, "step": 1930 }, { "epoch": 0.49, "grad_norm": 18.625, "learning_rate": 1.2144378676685263e-06, "logits/chosen": 0.40424099564552307, "logits/rejected": 0.7649755477905273, "logps/chosen": -560.0888061523438, "logps/rejected": -1269.2265625, "loss": 0.2382, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -3.489391326904297, "rewards/margins": 7.084428310394287, "rewards/margins_max": 10.971991539001465, "rewards/margins_min": 3.196864604949951, "rewards/margins_std": 5.497844696044922, "rewards/rejected": -10.573820114135742, "step": 1940 }, { "epoch": 0.49, "grad_norm": 2.25, "learning_rate": 1.2058488534807302e-06, "logits/chosen": 0.4380221366882324, "logits/rejected": 0.858269989490509, "logps/chosen": -622.8287353515625, "logps/rejected": -1164.4425048828125, "loss": 0.1878, "rewards/accuracies": 0.9375, "rewards/chosen": -3.897209882736206, "rewards/margins": 5.580199241638184, "rewards/margins_max": 8.42861270904541, "rewards/margins_min": 2.731786012649536, "rewards/margins_std": 4.028264045715332, "rewards/rejected": -9.477409362792969, "step": 1950 }, { "epoch": 0.49, "grad_norm": 1.3359375, "learning_rate": 1.197243951982401e-06, "logits/chosen": 0.3885877728462219, "logits/rejected": 0.9749298095703125, "logps/chosen": -560.0235595703125, "logps/rejected": -1132.405029296875, "loss": 0.1304, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -3.391322374343872, "rewards/margins": 5.737778186798096, "rewards/margins_max": 8.883737564086914, "rewards/margins_min": 2.591817617416382, "rewards/margins_std": 4.449059009552002, "rewards/rejected": -9.12909984588623, "step": 1960 }, { "epoch": 0.5, "grad_norm": 2.234375, "learning_rate": 1.1886238272954896e-06, "logits/chosen": 0.45476874709129333, "logits/rejected": 0.8959047198295593, "logps/chosen": -620.4302368164062, "logps/rejected": -1258.3804931640625, "loss": 0.1997, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -3.8833556175231934, "rewards/margins": 6.6274213790893555, "rewards/margins_max": 10.6688232421875, "rewards/margins_min": 2.58601975440979, "rewards/margins_std": 5.7154059410095215, "rewards/rejected": -10.510777473449707, "step": 1970 }, { "epoch": 0.5, "grad_norm": 0.79296875, "learning_rate": 1.1799891447168647e-06, "logits/chosen": 0.5257728695869446, "logits/rejected": 0.8648616671562195, "logps/chosen": -681.9531860351562, "logps/rejected": -1429.496826171875, "loss": 0.1605, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -4.491623878479004, "rewards/margins": 7.457159519195557, "rewards/margins_max": 10.727521896362305, "rewards/margins_min": 4.186797142028809, "rewards/margins_std": 4.624989986419678, "rewards/rejected": -11.948783874511719, "step": 1980 }, { "epoch": 0.5, "grad_norm": 1.3125, "learning_rate": 1.1713405706669666e-06, "logits/chosen": 0.39060765504837036, "logits/rejected": 0.8693227767944336, "logps/chosen": -638.9912109375, "logps/rejected": -1159.7806396484375, "loss": 0.3408, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": -3.9858803749084473, "rewards/margins": 5.464795112609863, "rewards/margins_max": 9.650136947631836, "rewards/margins_min": 1.2794535160064697, "rewards/margins_std": 5.918967247009277, "rewards/rejected": -9.450675964355469, "step": 1990 }, { "epoch": 0.5, "grad_norm": 2.625, "learning_rate": 1.162678772638372e-06, "logits/chosen": 0.3979375958442688, "logits/rejected": 0.8895372152328491, "logps/chosen": -626.7593994140625, "logps/rejected": -1275.314697265625, "loss": 0.2618, "rewards/accuracies": 0.9375, "rewards/chosen": -3.9609382152557373, "rewards/margins": 6.693270683288574, "rewards/margins_max": 10.132143020629883, "rewards/margins_min": 3.2543983459472656, "rewards/margins_std": 4.86329984664917, "rewards/rejected": -10.654208183288574, "step": 2000 }, { "epoch": 0.51, "grad_norm": 1.125, "learning_rate": 1.1540044191442776e-06, "logits/chosen": 0.43077486753463745, "logits/rejected": 0.9984035491943359, "logps/chosen": -551.9407958984375, "logps/rejected": -1064.71728515625, "loss": 0.1814, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -3.184940814971924, "rewards/margins": 5.308284759521484, "rewards/margins_max": 8.101531982421875, "rewards/margins_min": 2.5150370597839355, "rewards/margins_std": 3.9502487182617188, "rewards/rejected": -8.493226051330566, "step": 2010 }, { "epoch": 0.51, "grad_norm": 3.375, "learning_rate": 1.145318179666904e-06, "logits/chosen": 0.3742697238922119, "logits/rejected": 0.9613549113273621, "logps/chosen": -551.4102172851562, "logps/rejected": -1192.2840576171875, "loss": 0.1466, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -3.417285203933716, "rewards/margins": 6.326827049255371, "rewards/margins_max": 9.474775314331055, "rewards/margins_min": 3.1788787841796875, "rewards/margins_std": 4.451870918273926, "rewards/rejected": -9.744112014770508, "step": 2020 }, { "epoch": 0.51, "grad_norm": 1.1953125, "learning_rate": 1.1366207246058268e-06, "logits/chosen": 0.580926775932312, "logits/rejected": 1.0238367319107056, "logps/chosen": -597.9246826171875, "logps/rejected": -1226.0614013671875, "loss": 0.1785, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -3.605300188064575, "rewards/margins": 6.226903915405273, "rewards/margins_max": 9.348957061767578, "rewards/margins_min": 3.1048502922058105, "rewards/margins_std": 4.415249824523926, "rewards/rejected": -9.832204818725586, "step": 2030 }, { "epoch": 0.51, "grad_norm": 5.21875, "learning_rate": 1.1279127252262344e-06, "logits/chosen": 0.36743634939193726, "logits/rejected": 0.7529619336128235, "logps/chosen": -611.38037109375, "logps/rejected": -1260.2279052734375, "loss": 0.1646, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -3.913753032684326, "rewards/margins": 6.523721218109131, "rewards/margins_max": 10.163423538208008, "rewards/margins_min": 2.884019136428833, "rewards/margins_std": 5.147315502166748, "rewards/rejected": -10.43747329711914, "step": 2040 }, { "epoch": 0.52, "grad_norm": 1.34375, "learning_rate": 1.11919485360712e-06, "logits/chosen": 0.4193040728569031, "logits/rejected": 0.7446034550666809, "logps/chosen": -641.4575805664062, "logps/rejected": -1331.7291259765625, "loss": 0.1542, "rewards/accuracies": 0.9375, "rewards/chosen": -4.2941741943359375, "rewards/margins": 6.686350345611572, "rewards/margins_max": 10.751882553100586, "rewards/margins_min": 2.620816707611084, "rewards/margins_std": 5.7495317459106445, "rewards/rejected": -10.980524063110352, "step": 2050 }, { "epoch": 0.52, "grad_norm": 1.6171875, "learning_rate": 1.110467782589412e-06, "logits/chosen": 0.37651658058166504, "logits/rejected": 0.9150172472000122, "logps/chosen": -641.582275390625, "logps/rejected": -1308.950927734375, "loss": 0.2451, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": -3.919959306716919, "rewards/margins": 6.766678810119629, "rewards/margins_max": 10.914981842041016, "rewards/margins_min": 2.618375778198242, "rewards/margins_std": 5.866586208343506, "rewards/rejected": -10.686636924743652, "step": 2060 }, { "epoch": 0.52, "grad_norm": 0.69140625, "learning_rate": 1.101732185724043e-06, "logits/chosen": 0.602503776550293, "logits/rejected": 0.9572169184684753, "logps/chosen": -564.55126953125, "logps/rejected": -1175.539794921875, "loss": 0.161, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -3.7203216552734375, "rewards/margins": 5.9916558265686035, "rewards/margins_max": 9.613600730895996, "rewards/margins_min": 2.3697094917297363, "rewards/margins_std": 5.122204780578613, "rewards/rejected": -9.711977005004883, "step": 2070 }, { "epoch": 0.52, "grad_norm": 0.6953125, "learning_rate": 1.0929887372199673e-06, "logits/chosen": 0.4709581434726715, "logits/rejected": 0.9506624937057495, "logps/chosen": -559.4562377929688, "logps/rejected": -1188.647216796875, "loss": 0.1344, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -3.6829051971435547, "rewards/margins": 6.341403007507324, "rewards/margins_max": 9.901277542114258, "rewards/margins_min": 2.7815279960632324, "rewards/margins_std": 5.034422874450684, "rewards/rejected": -10.024307250976562, "step": 2080 }, { "epoch": 0.53, "grad_norm": 2.265625, "learning_rate": 1.084238111892123e-06, "logits/chosen": 0.5924087762832642, "logits/rejected": 0.9477392435073853, "logps/chosen": -567.0867309570312, "logps/rejected": -1217.984130859375, "loss": 0.1593, "rewards/accuracies": 0.9375, "rewards/chosen": -3.6363062858581543, "rewards/margins": 6.4515509605407715, "rewards/margins_max": 9.455270767211914, "rewards/margins_min": 3.44783091545105, "rewards/margins_std": 4.24790096282959, "rewards/rejected": -10.087857246398926, "step": 2090 }, { "epoch": 0.53, "grad_norm": 1.015625, "learning_rate": 1.075480985109353e-06, "logits/chosen": 0.4340541958808899, "logits/rejected": 0.8853395581245422, "logps/chosen": -648.7272338867188, "logps/rejected": -1269.769775390625, "loss": 0.1406, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -4.123702049255371, "rewards/margins": 6.3718366622924805, "rewards/margins_max": 9.116823196411133, "rewards/margins_min": 3.6268508434295654, "rewards/margins_std": 3.8819961547851562, "rewards/rejected": -10.495538711547852, "step": 2100 }, { "epoch": 0.53, "grad_norm": 0.74609375, "learning_rate": 1.0667180327422796e-06, "logits/chosen": 0.4427351951599121, "logits/rejected": 0.8773029446601868, "logps/chosen": -652.7124633789062, "logps/rejected": -1093.98046875, "loss": 0.2191, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -4.100010871887207, "rewards/margins": 4.69572639465332, "rewards/margins_max": 7.104989528656006, "rewards/margins_min": 2.2864630222320557, "rewards/margins_std": 3.4072136878967285, "rewards/rejected": -8.795738220214844, "step": 2110 }, { "epoch": 0.53, "grad_norm": 3.796875, "learning_rate": 1.0579499311111394e-06, "logits/chosen": 0.4106171727180481, "logits/rejected": 0.8539530038833618, "logps/chosen": -598.7738037109375, "logps/rejected": -1391.320068359375, "loss": 0.1609, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -3.6164703369140625, "rewards/margins": 8.027566909790039, "rewards/margins_max": 12.555456161499023, "rewards/margins_min": 3.4996769428253174, "rewards/margins_std": 6.403402805328369, "rewards/rejected": -11.644036293029785, "step": 2120 }, { "epoch": 0.54, "grad_norm": 0.77734375, "learning_rate": 1.0491773569335877e-06, "logits/chosen": 0.4420396685600281, "logits/rejected": 0.9396398663520813, "logps/chosen": -626.4326782226562, "logps/rejected": -1140.4166259765625, "loss": 0.2575, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -4.183171272277832, "rewards/margins": 5.311387062072754, "rewards/margins_max": 8.279863357543945, "rewards/margins_min": 2.3429112434387207, "rewards/margins_std": 4.19805908203125, "rewards/rejected": -9.494558334350586, "step": 2130 }, { "epoch": 0.54, "grad_norm": 1.703125, "learning_rate": 1.0404009872724686e-06, "logits/chosen": 0.3594892621040344, "logits/rejected": 0.8964468240737915, "logps/chosen": -567.0457763671875, "logps/rejected": -1121.90234375, "loss": 0.1394, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -3.354614734649658, "rewards/margins": 5.570419788360596, "rewards/margins_max": 8.419346809387207, "rewards/margins_min": 2.7214925289154053, "rewards/margins_std": 4.028990745544434, "rewards/rejected": -8.925034523010254, "step": 2140 }, { "epoch": 0.54, "grad_norm": 0.91796875, "learning_rate": 1.0316214994835588e-06, "logits/chosen": 0.355679452419281, "logits/rejected": 0.9933696985244751, "logps/chosen": -608.57177734375, "logps/rejected": -1124.74609375, "loss": 0.1533, "rewards/accuracies": 0.9375, "rewards/chosen": -3.8642611503601074, "rewards/margins": 5.441197872161865, "rewards/margins_max": 8.085689544677734, "rewards/margins_min": 2.7967066764831543, "rewards/margins_std": 3.739875316619873, "rewards/rejected": -9.305459976196289, "step": 2150 }, { "epoch": 0.54, "grad_norm": 0.6484375, "learning_rate": 1.0228395711632915e-06, "logits/chosen": 0.3872026205062866, "logits/rejected": 0.8455197215080261, "logps/chosen": -627.2762451171875, "logps/rejected": -1316.978759765625, "loss": 0.2178, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -4.086087703704834, "rewards/margins": 6.8035407066345215, "rewards/margins_max": 9.993437767028809, "rewards/margins_min": 3.613642930984497, "rewards/margins_std": 4.511196613311768, "rewards/rejected": -10.889628410339355, "step": 2160 }, { "epoch": 0.55, "grad_norm": 4.75, "learning_rate": 1.0140558800964588e-06, "logits/chosen": 0.3922487199306488, "logits/rejected": 0.8369787335395813, "logps/chosen": -600.4080810546875, "logps/rejected": -1210.494384765625, "loss": 0.2214, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -3.5966057777404785, "rewards/margins": 6.296887397766113, "rewards/margins_max": 9.430109024047852, "rewards/margins_min": 3.1636674404144287, "rewards/margins_std": 4.431042671203613, "rewards/rejected": -9.89349365234375, "step": 2170 }, { "epoch": 0.55, "grad_norm": 4.71875, "learning_rate": 1.0052711042039e-06, "logits/chosen": 0.510870635509491, "logits/rejected": 0.8489816784858704, "logps/chosen": -562.1312866210938, "logps/rejected": -1294.7515869140625, "loss": 0.1907, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": -3.6715149879455566, "rewards/margins": 7.00562047958374, "rewards/margins_max": 11.010260581970215, "rewards/margins_min": 3.0009806156158447, "rewards/margins_std": 5.663416385650635, "rewards/rejected": -10.677135467529297, "step": 2180 }, { "epoch": 0.55, "grad_norm": 3.546875, "learning_rate": 9.964859214901813e-07, "logits/chosen": 0.3070334494113922, "logits/rejected": 0.711986243724823, "logps/chosen": -667.531005859375, "logps/rejected": -1298.389892578125, "loss": 0.2149, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -4.415340900421143, "rewards/margins": 6.254446983337402, "rewards/margins_max": 9.225793838500977, "rewards/margins_min": 3.283099412918091, "rewards/margins_std": 4.202120304107666, "rewards/rejected": -10.66978931427002, "step": 2190 }, { "epoch": 0.55, "grad_norm": 1.25, "learning_rate": 9.87701009991267e-07, "logits/chosen": 0.6090846657752991, "logits/rejected": 1.093515396118164, "logps/chosen": -604.7011108398438, "logps/rejected": -1219.076416015625, "loss": 0.1831, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -3.6504428386688232, "rewards/margins": 6.507830619812012, "rewards/margins_max": 9.95383358001709, "rewards/margins_min": 3.0618269443511963, "rewards/margins_std": 4.873384952545166, "rewards/rejected": -10.158273696899414, "step": 2200 }, { "epoch": 0.56, "grad_norm": 1.8203125, "learning_rate": 9.789170477221891e-07, "logits/chosen": 0.49116769433021545, "logits/rejected": 0.964964747428894, "logps/chosen": -525.8590087890625, "logps/rejected": -1271.779296875, "loss": 0.1133, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -3.2402710914611816, "rewards/margins": 7.343132019042969, "rewards/margins_max": 11.660395622253418, "rewards/margins_min": 3.0258681774139404, "rewards/margins_std": 6.105532646179199, "rewards/rejected": -10.583402633666992, "step": 2210 }, { "epoch": 0.56, "grad_norm": 1.5703125, "learning_rate": 9.701347126247183e-07, "logits/chosen": 0.3676094114780426, "logits/rejected": 0.7860090732574463, "logps/chosen": -562.3323364257812, "logps/rejected": -1244.939453125, "loss": 0.1245, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -3.605191469192505, "rewards/margins": 6.7962799072265625, "rewards/margins_max": 10.232267379760742, "rewards/margins_min": 3.3602943420410156, "rewards/margins_std": 4.859219074249268, "rewards/rejected": -10.401471138000488, "step": 2220 }, { "epoch": 0.56, "grad_norm": 1.4609375, "learning_rate": 9.61354682515042e-07, "logits/chosen": 0.5297726392745972, "logits/rejected": 0.9975612759590149, "logps/chosen": -554.0660400390625, "logps/rejected": -1282.62109375, "loss": 0.2119, "rewards/accuracies": 0.887499988079071, "rewards/chosen": -3.6336886882781982, "rewards/margins": 7.203893184661865, "rewards/margins_max": 11.056930541992188, "rewards/margins_min": 3.3508553504943848, "rewards/margins_std": 5.4490180015563965, "rewards/rejected": -10.837581634521484, "step": 2230 }, { "epoch": 0.56, "grad_norm": 13.9375, "learning_rate": 9.525776350314484e-07, "logits/chosen": 0.3922134041786194, "logits/rejected": 0.9736678004264832, "logps/chosen": -554.39990234375, "logps/rejected": -1209.232177734375, "loss": 0.1955, "rewards/accuracies": 0.9375, "rewards/chosen": -3.258437395095825, "rewards/margins": 6.5767412185668945, "rewards/margins_max": 10.349874496459961, "rewards/margins_min": 2.8036084175109863, "rewards/margins_std": 5.336016654968262, "rewards/rejected": -9.835180282592773, "step": 2240 }, { "epoch": 0.57, "grad_norm": 2.765625, "learning_rate": 9.438042475820292e-07, "logits/chosen": 0.403189480304718, "logits/rejected": 0.7983392477035522, "logps/chosen": -584.6541748046875, "logps/rejected": -1261.7821044921875, "loss": 0.1611, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -3.42396879196167, "rewards/margins": 6.889365196228027, "rewards/margins_max": 10.340558052062988, "rewards/margins_min": 3.43817138671875, "rewards/margins_std": 4.880724906921387, "rewards/rejected": -10.313333511352539, "step": 2250 }, { "epoch": 0.57, "grad_norm": 0.5390625, "learning_rate": 9.350351972923963e-07, "logits/chosen": 0.3648914396762848, "logits/rejected": 0.8595021963119507, "logps/chosen": -579.9949340820312, "logps/rejected": -1241.930419921875, "loss": 0.0986, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -3.5924155712127686, "rewards/margins": 6.509829521179199, "rewards/margins_max": 9.569954872131348, "rewards/margins_min": 3.4497056007385254, "rewards/margins_std": 4.327669620513916, "rewards/rejected": -10.102245330810547, "step": 2260 }, { "epoch": 0.57, "grad_norm": 6.28125, "learning_rate": 9.262711609534209e-07, "logits/chosen": 0.5114152431488037, "logits/rejected": 0.7859119772911072, "logps/chosen": -552.0835571289062, "logps/rejected": -1168.512451171875, "loss": 0.2194, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -3.4986891746520996, "rewards/margins": 5.916913032531738, "rewards/margins_max": 9.174135208129883, "rewards/margins_min": 2.659688711166382, "rewards/margins_std": 4.606410026550293, "rewards/rejected": -9.41560173034668, "step": 2270 }, { "epoch": 0.57, "grad_norm": 1.71875, "learning_rate": 9.175128149690018e-07, "logits/chosen": 0.42313352227211, "logits/rejected": 0.761069655418396, "logps/chosen": -566.6101684570312, "logps/rejected": -1010.4449462890625, "loss": 0.2312, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -3.5931007862091064, "rewards/margins": 4.439265727996826, "rewards/margins_max": 6.598448753356934, "rewards/margins_min": 2.2800817489624023, "rewards/margins_std": 3.05354642868042, "rewards/rejected": -8.032365798950195, "step": 2280 }, { "epoch": 0.58, "grad_norm": 0.490234375, "learning_rate": 9.087608353038571e-07, "logits/chosen": 0.5663483738899231, "logits/rejected": 0.9419177770614624, "logps/chosen": -617.2533569335938, "logps/rejected": -1253.386474609375, "loss": 0.1776, "rewards/accuracies": 0.9375, "rewards/chosen": -3.911703586578369, "rewards/margins": 6.384354591369629, "rewards/margins_max": 10.122902870178223, "rewards/margins_min": 2.6458072662353516, "rewards/margins_std": 5.287104606628418, "rewards/rejected": -10.29605770111084, "step": 2290 }, { "epoch": 0.58, "grad_norm": 3.265625, "learning_rate": 9.00015897431357e-07, "logits/chosen": 0.44118762016296387, "logits/rejected": 0.962271511554718, "logps/chosen": -645.8919677734375, "logps/rejected": -1220.1890869140625, "loss": 0.2006, "rewards/accuracies": 0.9375, "rewards/chosen": -4.0967254638671875, "rewards/margins": 6.049219131469727, "rewards/margins_max": 8.595663070678711, "rewards/margins_min": 3.5027756690979004, "rewards/margins_std": 3.60121488571167, "rewards/rejected": -10.145944595336914, "step": 2300 }, { "epoch": 0.58, "grad_norm": 1.625, "learning_rate": 8.912786762813893e-07, "logits/chosen": 0.5058658123016357, "logits/rejected": 0.9005535244941711, "logps/chosen": -589.7290649414062, "logps/rejected": -1193.9273681640625, "loss": 0.1621, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -3.5936293601989746, "rewards/margins": 6.243061065673828, "rewards/margins_max": 9.953969955444336, "rewards/margins_min": 2.5321524143218994, "rewards/margins_std": 5.24801778793335, "rewards/rejected": -9.836690902709961, "step": 2310 }, { "epoch": 0.58, "grad_norm": 0.703125, "learning_rate": 8.82549846188269e-07, "logits/chosen": 0.5765672922134399, "logits/rejected": 0.8515122532844543, "logps/chosen": -576.6160278320312, "logps/rejected": -1192.1400146484375, "loss": 0.1866, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -3.822211742401123, "rewards/margins": 5.900345802307129, "rewards/margins_max": 8.774417877197266, "rewards/margins_min": 3.0262744426727295, "rewards/margins_std": 4.064550876617432, "rewards/rejected": -9.72255802154541, "step": 2320 }, { "epoch": 0.59, "grad_norm": 2.28125, "learning_rate": 8.738300808386933e-07, "logits/chosen": 0.4344192445278168, "logits/rejected": 0.8709548115730286, "logps/chosen": -618.6167602539062, "logps/rejected": -1314.2210693359375, "loss": 0.1313, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -4.075563907623291, "rewards/margins": 7.01934289932251, "rewards/margins_max": 10.089629173278809, "rewards/margins_min": 3.9490573406219482, "rewards/margins_std": 4.342040061950684, "rewards/rejected": -11.0949068069458, "step": 2330 }, { "epoch": 0.59, "grad_norm": 1.015625, "learning_rate": 8.65120053219748e-07, "logits/chosen": 0.4593490958213806, "logits/rejected": 0.8710781335830688, "logps/chosen": -554.8482666015625, "logps/rejected": -1096.6068115234375, "loss": 0.1465, "rewards/accuracies": 0.9375, "rewards/chosen": -3.5408222675323486, "rewards/margins": 5.38980770111084, "rewards/margins_max": 8.383840560913086, "rewards/margins_min": 2.3957760334014893, "rewards/margins_std": 4.234200477600098, "rewards/rejected": -8.930630683898926, "step": 2340 }, { "epoch": 0.59, "grad_norm": 1.75, "learning_rate": 8.564204355669643e-07, "logits/chosen": 0.4738622307777405, "logits/rejected": 0.8612324595451355, "logps/chosen": -653.3856201171875, "logps/rejected": -1402.7314453125, "loss": 0.2242, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -4.03350305557251, "rewards/margins": 7.781327724456787, "rewards/margins_max": 12.15905475616455, "rewards/margins_min": 3.403602123260498, "rewards/margins_std": 6.191039085388184, "rewards/rejected": -11.814830780029297, "step": 2350 }, { "epoch": 0.59, "grad_norm": 1.078125, "learning_rate": 8.477318993124392e-07, "logits/chosen": 0.44268113374710083, "logits/rejected": 0.979813277721405, "logps/chosen": -556.304443359375, "logps/rejected": -1236.2825927734375, "loss": 0.2425, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -3.656306505203247, "rewards/margins": 6.749668121337891, "rewards/margins_max": 10.653985977172852, "rewards/margins_min": 2.845351457595825, "rewards/margins_std": 5.521537780761719, "rewards/rejected": -10.405974388122559, "step": 2360 }, { "epoch": 0.6, "grad_norm": 2.859375, "learning_rate": 8.390551150330113e-07, "logits/chosen": 0.3767511248588562, "logits/rejected": 0.7756798267364502, "logps/chosen": -626.3853759765625, "logps/rejected": -1276.080810546875, "loss": 0.3063, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -4.063645362854004, "rewards/margins": 6.4792351722717285, "rewards/margins_max": 9.45138931274414, "rewards/margins_min": 3.507080078125, "rewards/margins_std": 4.203261375427246, "rewards/rejected": -10.542880058288574, "step": 2370 }, { "epoch": 0.6, "grad_norm": 7.1875, "learning_rate": 8.303907523985085e-07, "logits/chosen": 0.41792359948158264, "logits/rejected": 0.9234131574630737, "logps/chosen": -583.3489379882812, "logps/rejected": -1221.631591796875, "loss": 0.1307, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -3.7345657348632812, "rewards/margins": 6.419626712799072, "rewards/margins_max": 9.607501983642578, "rewards/margins_min": 3.2317516803741455, "rewards/margins_std": 4.508336067199707, "rewards/rejected": -10.154191970825195, "step": 2380 }, { "epoch": 0.6, "grad_norm": 1.6875, "learning_rate": 8.217394801200631e-07, "logits/chosen": 0.5521947741508484, "logits/rejected": 0.8723956942558289, "logps/chosen": -594.2333984375, "logps/rejected": -1298.763427734375, "loss": 0.148, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -3.983570098876953, "rewards/margins": 6.878546237945557, "rewards/margins_max": 10.753538131713867, "rewards/margins_min": 3.0035533905029297, "rewards/margins_std": 5.480066776275635, "rewards/rejected": -10.862115859985352, "step": 2390 }, { "epoch": 0.6, "grad_norm": 1.3515625, "learning_rate": 8.131019658984988e-07, "logits/chosen": 0.421779066324234, "logits/rejected": 0.9385878443717957, "logps/chosen": -586.00244140625, "logps/rejected": -1196.017822265625, "loss": 0.1597, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -3.60229754447937, "rewards/margins": 6.160573482513428, "rewards/margins_max": 8.99445629119873, "rewards/margins_min": 3.3266918659210205, "rewards/margins_std": 4.007714748382568, "rewards/rejected": -9.762872695922852, "step": 2400 }, { "epoch": 0.61, "grad_norm": 1.6328125, "learning_rate": 8.04478876372801e-07, "logits/chosen": 0.3881237208843231, "logits/rejected": 1.046502709388733, "logps/chosen": -631.3989868164062, "logps/rejected": -1197.179443359375, "loss": 0.2175, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -3.762930393218994, "rewards/margins": 6.060236930847168, "rewards/margins_max": 9.582808494567871, "rewards/margins_min": 2.5376646518707275, "rewards/margins_std": 4.9816694259643555, "rewards/rejected": -9.82316780090332, "step": 2410 }, { "epoch": 0.61, "grad_norm": 0.96484375, "learning_rate": 7.958708770686628e-07, "logits/chosen": 0.3488084673881531, "logits/rejected": 0.9390872716903687, "logps/chosen": -602.5117797851562, "logps/rejected": -1286.147705078125, "loss": 0.1196, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -3.809040069580078, "rewards/margins": 6.788311958312988, "rewards/margins_max": 9.610162734985352, "rewards/margins_min": 3.9664599895477295, "rewards/margins_std": 3.990701198577881, "rewards/rejected": -10.59735107421875, "step": 2420 }, { "epoch": 0.61, "grad_norm": 5.21875, "learning_rate": 7.872786323471231e-07, "logits/chosen": 0.4111207127571106, "logits/rejected": 0.8087761998176575, "logps/chosen": -582.8560180664062, "logps/rejected": -1202.2833251953125, "loss": 0.1622, "rewards/accuracies": 0.9375, "rewards/chosen": -3.470329761505127, "rewards/margins": 6.254929065704346, "rewards/margins_max": 9.86207389831543, "rewards/margins_min": 2.6477839946746826, "rewards/margins_std": 5.101273536682129, "rewards/rejected": -9.725258827209473, "step": 2430 }, { "epoch": 0.61, "grad_norm": 2.359375, "learning_rate": 7.787028053532894e-07, "logits/chosen": 0.42040500044822693, "logits/rejected": 0.9697147607803345, "logps/chosen": -607.8787231445312, "logps/rejected": -1137.3770751953125, "loss": 0.2404, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -3.6307804584503174, "rewards/margins": 5.513142108917236, "rewards/margins_max": 8.46685791015625, "rewards/margins_min": 2.5594258308410645, "rewards/margins_std": 4.177186012268066, "rewards/rejected": -9.143922805786133, "step": 2440 }, { "epoch": 0.62, "grad_norm": 0.5078125, "learning_rate": 7.701440579651564e-07, "logits/chosen": 0.39534759521484375, "logits/rejected": 0.827987015247345, "logps/chosen": -666.9830322265625, "logps/rejected": -1299.258544921875, "loss": 0.1684, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -4.383899688720703, "rewards/margins": 6.384281635284424, "rewards/margins_max": 9.895980834960938, "rewards/margins_min": 2.8725833892822266, "rewards/margins_std": 4.9662909507751465, "rewards/rejected": -10.768181800842285, "step": 2450 }, { "epoch": 0.62, "grad_norm": 3.4375, "learning_rate": 7.616030507425251e-07, "logits/chosen": 0.5693954229354858, "logits/rejected": 0.8889672160148621, "logps/chosen": -617.1419677734375, "logps/rejected": -1388.5489501953125, "loss": 0.137, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -3.9340217113494873, "rewards/margins": 7.426980018615723, "rewards/margins_max": 11.167816162109375, "rewards/margins_min": 3.686145067214966, "rewards/margins_std": 5.290339946746826, "rewards/rejected": -11.361001968383789, "step": 2460 }, { "epoch": 0.62, "grad_norm": 0.50390625, "learning_rate": 7.530804428760189e-07, "logits/chosen": 0.4677404463291168, "logits/rejected": 0.9375128746032715, "logps/chosen": -596.3168334960938, "logps/rejected": -1248.475830078125, "loss": 0.1194, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -3.896852970123291, "rewards/margins": 6.540476322174072, "rewards/margins_max": 9.212288856506348, "rewards/margins_min": 3.8686630725860596, "rewards/margins_std": 3.7785136699676514, "rewards/rejected": -10.437329292297363, "step": 2470 }, { "epoch": 0.62, "grad_norm": 2.671875, "learning_rate": 7.445768921362075e-07, "logits/chosen": 0.40075913071632385, "logits/rejected": 0.7473156452178955, "logps/chosen": -569.2059326171875, "logps/rejected": -1103.2969970703125, "loss": 0.2328, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -3.6370136737823486, "rewards/margins": 5.360899448394775, "rewards/margins_max": 8.596506118774414, "rewards/margins_min": 2.1252918243408203, "rewards/margins_std": 4.575839996337891, "rewards/rejected": -8.997913360595703, "step": 2480 }, { "epoch": 0.63, "grad_norm": 0.76171875, "learning_rate": 7.360930548228421e-07, "logits/chosen": 0.5869132280349731, "logits/rejected": 0.8721901774406433, "logps/chosen": -587.7561645507812, "logps/rejected": -1406.796630859375, "loss": 0.1456, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -3.811744213104248, "rewards/margins": 8.148642539978027, "rewards/margins_max": 12.553075790405273, "rewards/margins_min": 3.744208812713623, "rewards/margins_std": 6.2288103103637695, "rewards/rejected": -11.960387229919434, "step": 2490 }, { "epoch": 0.63, "grad_norm": 1.078125, "learning_rate": 7.276295857142004e-07, "logits/chosen": 0.32453638315200806, "logits/rejected": 0.8772487640380859, "logps/chosen": -585.3636474609375, "logps/rejected": -1191.1376953125, "loss": 0.1195, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -3.542001247406006, "rewards/margins": 6.292872428894043, "rewards/margins_max": 9.072725296020508, "rewards/margins_min": 3.5130207538604736, "rewards/margins_std": 3.931304931640625, "rewards/rejected": -9.834874153137207, "step": 2500 }, { "epoch": 0.63, "grad_norm": 2.0, "learning_rate": 7.191871380165537e-07, "logits/chosen": 0.5947480797767639, "logits/rejected": 0.9666692018508911, "logps/chosen": -587.4915161132812, "logps/rejected": -1285.529541015625, "loss": 0.2129, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -3.7693405151367188, "rewards/margins": 6.991917610168457, "rewards/margins_max": 10.897686004638672, "rewards/margins_min": 3.0861494541168213, "rewards/margins_std": 5.523590564727783, "rewards/rejected": -10.76125717163086, "step": 2510 }, { "epoch": 0.63, "grad_norm": 5.15625, "learning_rate": 7.107663633137513e-07, "logits/chosen": 0.5616310834884644, "logits/rejected": 0.9230579137802124, "logps/chosen": -596.8382568359375, "logps/rejected": -1349.6082763671875, "loss": 0.2132, "rewards/accuracies": 0.9375, "rewards/chosen": -3.841231107711792, "rewards/margins": 7.509341239929199, "rewards/margins_max": 12.250666618347168, "rewards/margins_min": 2.7680153846740723, "rewards/margins_std": 6.705247402191162, "rewards/rejected": -11.35057258605957, "step": 2520 }, { "epoch": 0.64, "grad_norm": 1.0625, "learning_rate": 7.023679115169304e-07, "logits/chosen": 0.3933202028274536, "logits/rejected": 0.8223272562026978, "logps/chosen": -606.7198486328125, "logps/rejected": -1352.9849853515625, "loss": 0.1704, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -3.759533643722534, "rewards/margins": 7.4397478103637695, "rewards/margins_max": 11.29192066192627, "rewards/margins_min": 3.5875747203826904, "rewards/margins_std": 5.4477949142456055, "rewards/rejected": -11.199281692504883, "step": 2530 }, { "epoch": 0.64, "grad_norm": 1.25, "learning_rate": 6.93992430814359e-07, "logits/chosen": 0.42247194051742554, "logits/rejected": 0.9332104921340942, "logps/chosen": -637.812255859375, "logps/rejected": -1319.972900390625, "loss": 0.1834, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": -4.083704471588135, "rewards/margins": 6.8782501220703125, "rewards/margins_max": 11.13465690612793, "rewards/margins_min": 2.6218440532684326, "rewards/margins_std": 6.019468307495117, "rewards/rejected": -10.961955070495605, "step": 2540 }, { "epoch": 0.64, "grad_norm": 2.375, "learning_rate": 6.856405676214072e-07, "logits/chosen": 0.4980488717556, "logits/rejected": 0.9414850473403931, "logps/chosen": -626.1489868164062, "logps/rejected": -1227.9940185546875, "loss": 0.1541, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": -3.9557862281799316, "rewards/margins": 5.900100231170654, "rewards/margins_max": 8.798306465148926, "rewards/margins_min": 3.00189471244812, "rewards/margins_std": 4.098681926727295, "rewards/rejected": -9.855887413024902, "step": 2550 }, { "epoch": 0.64, "grad_norm": 9.3125, "learning_rate": 6.773129665306569e-07, "logits/chosen": 0.35069847106933594, "logits/rejected": 0.8379716873168945, "logps/chosen": -567.0838623046875, "logps/rejected": -1157.524658203125, "loss": 0.1994, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -3.548142910003662, "rewards/margins": 5.7133989334106445, "rewards/margins_max": 8.906744956970215, "rewards/margins_min": 2.520052433013916, "rewards/margins_std": 4.516073703765869, "rewards/rejected": -9.261542320251465, "step": 2560 }, { "epoch": 0.65, "grad_norm": 0.890625, "learning_rate": 6.690102702621547e-07, "logits/chosen": 0.3375098407268524, "logits/rejected": 0.8294457197189331, "logps/chosen": -558.5492553710938, "logps/rejected": -1136.806396484375, "loss": 0.1465, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -3.5482871532440186, "rewards/margins": 5.674252033233643, "rewards/margins_max": 8.784135818481445, "rewards/margins_min": 2.5643677711486816, "rewards/margins_std": 4.398039817810059, "rewards/rejected": -9.222538948059082, "step": 2570 }, { "epoch": 0.65, "grad_norm": 0.7734375, "learning_rate": 6.60733119613804e-07, "logits/chosen": 0.5296992063522339, "logits/rejected": 0.8904238939285278, "logps/chosen": -576.5020751953125, "logps/rejected": -1276.6494140625, "loss": 0.1902, "rewards/accuracies": 0.9375, "rewards/chosen": -3.6155083179473877, "rewards/margins": 6.869490623474121, "rewards/margins_max": 10.674718856811523, "rewards/margins_min": 3.064263105392456, "rewards/margins_std": 5.381404399871826, "rewards/rejected": -10.48499870300293, "step": 2580 }, { "epoch": 0.65, "grad_norm": 0.5, "learning_rate": 6.524821534119112e-07, "logits/chosen": 0.5071766972541809, "logits/rejected": 0.9823764562606812, "logps/chosen": -582.9547729492188, "logps/rejected": -1479.55126953125, "loss": 0.1345, "rewards/accuracies": 0.9375, "rewards/chosen": -3.5877151489257812, "rewards/margins": 8.957578659057617, "rewards/margins_max": 15.099912643432617, "rewards/margins_min": 2.815244197845459, "rewards/margins_std": 8.686573028564453, "rewards/rejected": -12.545293807983398, "step": 2590 }, { "epoch": 0.65, "grad_norm": 1.1171875, "learning_rate": 6.442580084618804e-07, "logits/chosen": 0.48143234848976135, "logits/rejected": 1.0688936710357666, "logps/chosen": -597.7962646484375, "logps/rejected": -1267.8271484375, "loss": 0.1344, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -3.772063732147217, "rewards/margins": 6.709539890289307, "rewards/margins_max": 9.977819442749023, "rewards/margins_min": 3.441260576248169, "rewards/margins_std": 4.622044086456299, "rewards/rejected": -10.481603622436523, "step": 2600 }, { "epoch": 0.66, "grad_norm": 1.9765625, "learning_rate": 6.360613194990638e-07, "logits/chosen": 0.41432422399520874, "logits/rejected": 0.8854449987411499, "logps/chosen": -622.7572021484375, "logps/rejected": -1250.2237548828125, "loss": 0.2274, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -4.131538391113281, "rewards/margins": 6.169598579406738, "rewards/margins_max": 9.522704124450684, "rewards/margins_min": 2.816493511199951, "rewards/margins_std": 4.742007255554199, "rewards/rejected": -10.301137924194336, "step": 2610 }, { "epoch": 0.66, "grad_norm": 1.6484375, "learning_rate": 6.278927191397762e-07, "logits/chosen": 0.3944636583328247, "logits/rejected": 0.9111081957817078, "logps/chosen": -612.4472045898438, "logps/rejected": -1219.752685546875, "loss": 0.1986, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": -3.7693982124328613, "rewards/margins": 6.301706314086914, "rewards/margins_max": 9.534825325012207, "rewards/margins_min": 3.0685877799987793, "rewards/margins_std": 4.572320461273193, "rewards/rejected": -10.071104049682617, "step": 2620 }, { "epoch": 0.66, "grad_norm": 0.9140625, "learning_rate": 6.197528378324663e-07, "logits/chosen": 0.5100525617599487, "logits/rejected": 0.9623018503189087, "logps/chosen": -584.0420532226562, "logps/rejected": -1203.010498046875, "loss": 0.1742, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -3.7736029624938965, "rewards/margins": 6.160708427429199, "rewards/margins_max": 9.585714340209961, "rewards/margins_min": 2.7357051372528076, "rewards/margins_std": 4.843687534332275, "rewards/rejected": -9.93431282043457, "step": 2630 }, { "epoch": 0.66, "grad_norm": 3.734375, "learning_rate": 6.116423038090623e-07, "logits/chosen": 0.5766229629516602, "logits/rejected": 0.9825431108474731, "logps/chosen": -547.1226196289062, "logps/rejected": -1281.45703125, "loss": 0.3216, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -3.65391206741333, "rewards/margins": 7.179081916809082, "rewards/margins_max": 9.718598365783691, "rewards/margins_min": 4.639565467834473, "rewards/margins_std": 3.591418743133545, "rewards/rejected": -10.83299446105957, "step": 2640 }, { "epoch": 0.67, "grad_norm": 0.97265625, "learning_rate": 6.035617430364839e-07, "logits/chosen": 0.4997124671936035, "logits/rejected": 0.9522945284843445, "logps/chosen": -583.899169921875, "logps/rejected": -1155.713623046875, "loss": 0.1477, "rewards/accuracies": 0.9375, "rewards/chosen": -3.7351505756378174, "rewards/margins": 5.709227561950684, "rewards/margins_max": 8.30264949798584, "rewards/margins_min": 3.1158056259155273, "rewards/margins_std": 3.6676526069641113, "rewards/rejected": -9.444378852844238, "step": 2650 }, { "epoch": 0.67, "grad_norm": 1.1640625, "learning_rate": 5.955117791683289e-07, "logits/chosen": 0.5455132722854614, "logits/rejected": 0.7467927932739258, "logps/chosen": -618.4952392578125, "logps/rejected": -1362.67529296875, "loss": 0.1119, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -4.014021873474121, "rewards/margins": 7.122605323791504, "rewards/margins_max": 10.956873893737793, "rewards/margins_min": 3.2883358001708984, "rewards/margins_std": 5.422475337982178, "rewards/rejected": -11.136625289916992, "step": 2660 }, { "epoch": 0.67, "grad_norm": 1.484375, "learning_rate": 5.874930334967425e-07, "logits/chosen": 0.3480473756790161, "logits/rejected": 0.8517535924911499, "logps/chosen": -577.0358276367188, "logps/rejected": -1364.579833984375, "loss": 0.17, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -3.2995376586914062, "rewards/margins": 8.075822830200195, "rewards/margins_max": 12.583778381347656, "rewards/margins_min": 3.5678658485412598, "rewards/margins_std": 6.375213623046875, "rewards/rejected": -11.375359535217285, "step": 2670 }, { "epoch": 0.67, "grad_norm": 0.8515625, "learning_rate": 5.795061249044657e-07, "logits/chosen": 0.36974793672561646, "logits/rejected": 0.9354592561721802, "logps/chosen": -607.3040771484375, "logps/rejected": -1114.209228515625, "loss": 0.2163, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -3.8398520946502686, "rewards/margins": 5.254799842834473, "rewards/margins_max": 8.094499588012695, "rewards/margins_min": 2.4150993824005127, "rewards/margins_std": 4.015942573547363, "rewards/rejected": -9.09465217590332, "step": 2680 }, { "epoch": 0.68, "grad_norm": 0.94921875, "learning_rate": 5.715516698170694e-07, "logits/chosen": 0.4757654070854187, "logits/rejected": 0.9228278994560242, "logps/chosen": -593.3265380859375, "logps/rejected": -1246.779296875, "loss": 0.1711, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -3.8630759716033936, "rewards/margins": 6.61892032623291, "rewards/margins_max": 10.43076229095459, "rewards/margins_min": 2.8070778846740723, "rewards/margins_std": 5.390759468078613, "rewards/rejected": -10.481996536254883, "step": 2690 }, { "epoch": 0.68, "grad_norm": 2.265625, "learning_rate": 5.636302821553791e-07, "logits/chosen": 0.5951135754585266, "logits/rejected": 0.9289643168449402, "logps/chosen": -600.3472900390625, "logps/rejected": -1256.505126953125, "loss": 0.1846, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -4.007782459259033, "rewards/margins": 6.590015411376953, "rewards/margins_max": 9.676294326782227, "rewards/margins_min": 3.503735065460205, "rewards/margins_std": 4.364659309387207, "rewards/rejected": -10.597796440124512, "step": 2700 }, { "epoch": 0.68, "grad_norm": 1.078125, "learning_rate": 5.557425732880927e-07, "logits/chosen": 0.45299792289733887, "logits/rejected": 0.9710724949836731, "logps/chosen": -576.3006591796875, "logps/rejected": -1305.9984130859375, "loss": 0.2169, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -3.6723008155822754, "rewards/margins": 7.247259616851807, "rewards/margins_max": 10.579780578613281, "rewards/margins_min": 3.914738893508911, "rewards/margins_std": 4.712896347045898, "rewards/rejected": -10.919560432434082, "step": 2710 }, { "epoch": 0.68, "grad_norm": 4.25, "learning_rate": 5.478891519845969e-07, "logits/chosen": 0.4582904279232025, "logits/rejected": 1.0101871490478516, "logps/chosen": -571.7638549804688, "logps/rejected": -1202.6436767578125, "loss": 0.223, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": -3.6778640747070312, "rewards/margins": 6.366325378417969, "rewards/margins_max": 9.646313667297363, "rewards/margins_min": 3.086336851119995, "rewards/margins_std": 4.638604164123535, "rewards/rejected": -10.044189453125, "step": 2720 }, { "epoch": 0.69, "grad_norm": 6.5, "learning_rate": 5.400706243679814e-07, "logits/chosen": 0.39346835017204285, "logits/rejected": 0.9161213040351868, "logps/chosen": -564.0874633789062, "logps/rejected": -1229.169677734375, "loss": 0.1437, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -3.5225460529327393, "rewards/margins": 6.72415828704834, "rewards/margins_max": 10.41191291809082, "rewards/margins_min": 3.036404848098755, "rewards/margins_std": 5.215271472930908, "rewards/rejected": -10.246706008911133, "step": 2730 }, { "epoch": 0.69, "grad_norm": 9.625, "learning_rate": 5.322875938682574e-07, "logits/chosen": 0.4170478284358978, "logits/rejected": 0.855624794960022, "logps/chosen": -595.4231567382812, "logps/rejected": -1336.313720703125, "loss": 0.1761, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -3.6344857215881348, "rewards/margins": 7.587996482849121, "rewards/margins_max": 11.985794067382812, "rewards/margins_min": 3.190199375152588, "rewards/margins_std": 6.219425201416016, "rewards/rejected": -11.222482681274414, "step": 2740 }, { "epoch": 0.69, "grad_norm": 0.921875, "learning_rate": 5.245406611757881e-07, "logits/chosen": 0.45017296075820923, "logits/rejected": 0.7095610499382019, "logps/chosen": -599.9002685546875, "logps/rejected": -1215.00634765625, "loss": 0.2039, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": -4.224530220031738, "rewards/margins": 5.976971626281738, "rewards/margins_max": 9.308272361755371, "rewards/margins_min": 2.645669460296631, "rewards/margins_std": 4.711172580718994, "rewards/rejected": -10.201501846313477, "step": 2750 }, { "epoch": 0.69, "grad_norm": 1.140625, "learning_rate": 5.168304241949258e-07, "logits/chosen": 0.5480870008468628, "logits/rejected": 0.9951409101486206, "logps/chosen": -630.0256958007812, "logps/rejected": -1283.5689697265625, "loss": 0.3047, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -4.216586589813232, "rewards/margins": 6.3858795166015625, "rewards/margins_max": 9.220312118530273, "rewards/margins_min": 3.5514473915100098, "rewards/margins_std": 4.0084919929504395, "rewards/rejected": -10.602466583251953, "step": 2760 }, { "epoch": 0.7, "grad_norm": 1.71875, "learning_rate": 5.091574779978654e-07, "logits/chosen": 0.5319818258285522, "logits/rejected": 0.923554539680481, "logps/chosen": -576.5374755859375, "logps/rejected": -1277.248046875, "loss": 0.2105, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -3.666407823562622, "rewards/margins": 6.9405670166015625, "rewards/margins_max": 10.391286849975586, "rewards/margins_min": 3.4898483753204346, "rewards/margins_std": 4.880053997039795, "rewards/rejected": -10.606975555419922, "step": 2770 }, { "epoch": 0.7, "grad_norm": 2.09375, "learning_rate": 5.015224147787195e-07, "logits/chosen": 0.4306615889072418, "logits/rejected": 0.8923788070678711, "logps/chosen": -581.0130004882812, "logps/rejected": -1277.997314453125, "loss": 0.1599, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -3.6794135570526123, "rewards/margins": 7.105888366699219, "rewards/margins_max": 11.283435821533203, "rewards/margins_min": 2.928340435028076, "rewards/margins_std": 5.907945156097412, "rewards/rejected": -10.785301208496094, "step": 2780 }, { "epoch": 0.7, "grad_norm": 1.453125, "learning_rate": 4.939258238078098e-07, "logits/chosen": 0.3736962378025055, "logits/rejected": 0.9539716839790344, "logps/chosen": -574.2659301757812, "logps/rejected": -1150.515869140625, "loss": 0.0895, "rewards/accuracies": 1.0, "rewards/chosen": -3.42749285697937, "rewards/margins": 5.820822715759277, "rewards/margins_max": 8.33712100982666, "rewards/margins_min": 3.3045241832733154, "rewards/margins_std": 3.5585830211639404, "rewards/rejected": -9.248315811157227, "step": 2790 }, { "epoch": 0.7, "grad_norm": 2.296875, "learning_rate": 4.863682913861911e-07, "logits/chosen": 0.39504092931747437, "logits/rejected": 0.6548932790756226, "logps/chosen": -580.8141479492188, "logps/rejected": -1209.3515625, "loss": 0.2155, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -3.753568172454834, "rewards/margins": 5.986493110656738, "rewards/margins_max": 9.503013610839844, "rewards/margins_min": 2.469972610473633, "rewards/margins_std": 4.973111152648926, "rewards/rejected": -9.74006175994873, "step": 2800 }, { "epoch": 0.71, "grad_norm": 0.51953125, "learning_rate": 4.788504008003977e-07, "logits/chosen": 0.36534491181373596, "logits/rejected": 0.7744854092597961, "logps/chosen": -587.0809326171875, "logps/rejected": -1210.5406494140625, "loss": 0.2413, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -3.5366668701171875, "rewards/margins": 6.2145538330078125, "rewards/margins_max": 10.104393005371094, "rewards/margins_min": 2.324714183807373, "rewards/margins_std": 5.501064300537109, "rewards/rejected": -9.751221656799316, "step": 2810 }, { "epoch": 0.71, "grad_norm": 0.703125, "learning_rate": 4.7137273227742746e-07, "logits/chosen": 0.3758518695831299, "logits/rejected": 0.9578613042831421, "logps/chosen": -546.7774658203125, "logps/rejected": -1061.381591796875, "loss": 0.2159, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": -3.3317322731018066, "rewards/margins": 5.217543601989746, "rewards/margins_max": 9.113043785095215, "rewards/margins_min": 1.3220431804656982, "rewards/margins_std": 5.509068965911865, "rewards/rejected": -8.549276351928711, "step": 2820 }, { "epoch": 0.71, "grad_norm": 1.9921875, "learning_rate": 4.639358629399601e-07, "logits/chosen": 0.384821355342865, "logits/rejected": 0.8197442293167114, "logps/chosen": -592.8553466796875, "logps/rejected": -1149.2913818359375, "loss": 0.1985, "rewards/accuracies": 0.9375, "rewards/chosen": -3.7938003540039062, "rewards/margins": 5.607391357421875, "rewards/margins_max": 8.316540718078613, "rewards/margins_min": 2.8982410430908203, "rewards/margins_std": 3.8313167095184326, "rewards/rejected": -9.401190757751465, "step": 2830 }, { "epoch": 0.71, "grad_norm": 9.625, "learning_rate": 4.5654036676181496e-07, "logits/chosen": 0.44163426756858826, "logits/rejected": 0.8041768074035645, "logps/chosen": -654.1993408203125, "logps/rejected": -1390.2359619140625, "loss": 0.2291, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": -4.165137767791748, "rewards/margins": 7.4157538414001465, "rewards/margins_max": 11.67861557006836, "rewards/margins_min": 3.152892589569092, "rewards/margins_std": 6.0285964012146, "rewards/rejected": -11.580891609191895, "step": 2840 }, { "epoch": 0.72, "grad_norm": 4.0625, "learning_rate": 4.491868145236508e-07, "logits/chosen": 0.3212242126464844, "logits/rejected": 0.8466861844062805, "logps/chosen": -621.2687377929688, "logps/rejected": -1346.79296875, "loss": 0.169, "rewards/accuracies": 0.9375, "rewards/chosen": -3.9158835411071777, "rewards/margins": 7.350478172302246, "rewards/margins_max": 11.066621780395508, "rewards/margins_min": 3.634335994720459, "rewards/margins_std": 5.255418300628662, "rewards/rejected": -11.266361236572266, "step": 2850 }, { "epoch": 0.72, "grad_norm": 1.3359375, "learning_rate": 4.418757737689156e-07, "logits/chosen": 0.31801286339759827, "logits/rejected": 0.8061238527297974, "logps/chosen": -559.0628662109375, "logps/rejected": -1199.774169921875, "loss": 0.1446, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -3.384986162185669, "rewards/margins": 6.031187057495117, "rewards/margins_max": 8.800252914428711, "rewards/margins_min": 3.2621231079101562, "rewards/margins_std": 3.916048765182495, "rewards/rejected": -9.416173934936523, "step": 2860 }, { "epoch": 0.72, "grad_norm": 1.953125, "learning_rate": 4.346078087600411e-07, "logits/chosen": 0.4582739472389221, "logits/rejected": 0.9584504961967468, "logps/chosen": -622.5162963867188, "logps/rejected": -1202.921142578125, "loss": 0.2192, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -3.9695796966552734, "rewards/margins": 5.917517185211182, "rewards/margins_max": 9.363494873046875, "rewards/margins_min": 2.4715399742126465, "rewards/margins_std": 4.873347282409668, "rewards/rejected": -9.887097358703613, "step": 2870 }, { "epoch": 0.72, "grad_norm": 1.7421875, "learning_rate": 4.273834804348959e-07, "logits/chosen": 0.47292360663414, "logits/rejected": 0.8965142369270325, "logps/chosen": -550.5333862304688, "logps/rejected": -1067.5582275390625, "loss": 0.2409, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -3.4259536266326904, "rewards/margins": 5.33632755279541, "rewards/margins_max": 8.64592170715332, "rewards/margins_min": 2.0267326831817627, "rewards/margins_std": 4.680473327636719, "rewards/rejected": -8.76228141784668, "step": 2880 }, { "epoch": 0.73, "grad_norm": 0.92578125, "learning_rate": 4.202033463634913e-07, "logits/chosen": 0.24783340096473694, "logits/rejected": 0.7742137312889099, "logps/chosen": -621.947998046875, "logps/rejected": -1266.604248046875, "loss": 0.1774, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -3.8389954566955566, "rewards/margins": 6.534533500671387, "rewards/margins_max": 9.25249195098877, "rewards/margins_min": 3.8165740966796875, "rewards/margins_std": 3.8437747955322266, "rewards/rejected": -10.373528480529785, "step": 2890 }, { "epoch": 0.73, "grad_norm": 1.2578125, "learning_rate": 4.1306796070494755e-07, "logits/chosen": 0.5090914368629456, "logits/rejected": 0.9704787135124207, "logps/chosen": -566.8453369140625, "logps/rejected": -1316.5975341796875, "loss": 0.2067, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -3.5487873554229736, "rewards/margins": 7.52276086807251, "rewards/margins_max": 11.503535270690918, "rewards/margins_min": 3.541985034942627, "rewards/margins_std": 5.629666328430176, "rewards/rejected": -11.071548461914062, "step": 2900 }, { "epoch": 0.73, "grad_norm": 0.73828125, "learning_rate": 4.0597787416472605e-07, "logits/chosen": 0.42445096373558044, "logits/rejected": 1.0089080333709717, "logps/chosen": -590.0271606445312, "logps/rejected": -1290.34912109375, "loss": 0.1194, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -3.6672592163085938, "rewards/margins": 7.150638580322266, "rewards/margins_max": 10.635174751281738, "rewards/margins_min": 3.6661014556884766, "rewards/margins_std": 4.9278788566589355, "rewards/rejected": -10.817896842956543, "step": 2910 }, { "epoch": 0.73, "grad_norm": 2.46875, "learning_rate": 3.989336339521244e-07, "logits/chosen": 0.4603755474090576, "logits/rejected": 1.016980767250061, "logps/chosen": -556.4505615234375, "logps/rejected": -1168.83642578125, "loss": 0.2162, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -3.3615431785583496, "rewards/margins": 6.154811859130859, "rewards/margins_max": 9.212437629699707, "rewards/margins_min": 3.097187042236328, "rewards/margins_std": 4.324134349822998, "rewards/rejected": -9.516355514526367, "step": 2920 }, { "epoch": 0.74, "grad_norm": 1.3359375, "learning_rate": 3.919357837380436e-07, "logits/chosen": 0.5008795857429504, "logits/rejected": 0.9082363843917847, "logps/chosen": -596.6324462890625, "logps/rejected": -1242.9361572265625, "loss": 0.1661, "rewards/accuracies": 0.9375, "rewards/chosen": -3.753378391265869, "rewards/margins": 6.4934234619140625, "rewards/margins_max": 9.598888397216797, "rewards/margins_min": 3.3879590034484863, "rewards/margins_std": 4.39178991317749, "rewards/rejected": -10.246801376342773, "step": 2930 }, { "epoch": 0.74, "grad_norm": 0.62890625, "learning_rate": 3.849848636130293e-07, "logits/chosen": 0.37968841195106506, "logits/rejected": 0.7749906778335571, "logps/chosen": -589.7718505859375, "logps/rejected": -1237.0853271484375, "loss": 0.1493, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -3.7084765434265137, "rewards/margins": 6.354104042053223, "rewards/margins_max": 9.316937446594238, "rewards/margins_min": 3.3912723064422607, "rewards/margins_std": 4.190077304840088, "rewards/rejected": -10.062582015991211, "step": 2940 }, { "epoch": 0.74, "grad_norm": 0.90625, "learning_rate": 3.780814100455848e-07, "logits/chosen": 0.4370139539241791, "logits/rejected": 0.7921696901321411, "logps/chosen": -594.7927856445312, "logps/rejected": -1272.9208984375, "loss": 0.1801, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -3.9955291748046875, "rewards/margins": 6.536546230316162, "rewards/margins_max": 10.13880443572998, "rewards/margins_min": 2.9342868328094482, "rewards/margins_std": 5.094363689422607, "rewards/rejected": -10.532075881958008, "step": 2950 }, { "epoch": 0.74, "grad_norm": 1.8671875, "learning_rate": 3.712259558407698e-07, "logits/chosen": 0.5578526258468628, "logits/rejected": 1.0129783153533936, "logps/chosen": -612.83154296875, "logps/rejected": -1290.3111572265625, "loss": 0.1758, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -3.8081657886505127, "rewards/margins": 6.890603542327881, "rewards/margins_max": 10.6620512008667, "rewards/margins_min": 3.119157314300537, "rewards/margins_std": 5.3336310386657715, "rewards/rejected": -10.698770523071289, "step": 2960 }, { "epoch": 0.75, "grad_norm": 3.5, "learning_rate": 3.644190300990774e-07, "logits/chosen": 0.5283955931663513, "logits/rejected": 0.9858170747756958, "logps/chosen": -581.1080932617188, "logps/rejected": -1192.7562255859375, "loss": 0.1819, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -3.572531223297119, "rewards/margins": 6.240419387817383, "rewards/margins_max": 10.008376121520996, "rewards/margins_min": 2.4724607467651367, "rewards/margins_std": 5.328697204589844, "rewards/rejected": -9.812950134277344, "step": 2970 }, { "epoch": 0.75, "grad_norm": 0.96875, "learning_rate": 3.576611581755972e-07, "logits/chosen": 0.4499734044075012, "logits/rejected": 0.7068200707435608, "logps/chosen": -523.8099975585938, "logps/rejected": -1294.130615234375, "loss": 0.1334, "rewards/accuracies": 1.0, "rewards/chosen": -3.308804988861084, "rewards/margins": 7.3770246505737305, "rewards/margins_max": 10.823869705200195, "rewards/margins_min": 3.930180311203003, "rewards/margins_std": 4.874573707580566, "rewards/rejected": -10.685829162597656, "step": 2980 }, { "epoch": 0.75, "grad_norm": 1.0546875, "learning_rate": 3.5095286163947155e-07, "logits/chosen": 0.48973578214645386, "logits/rejected": 0.9675741195678711, "logps/chosen": -530.2913208007812, "logps/rejected": -1184.1497802734375, "loss": 0.1522, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -3.161573886871338, "rewards/margins": 6.543496608734131, "rewards/margins_max": 9.359611511230469, "rewards/margins_min": 3.7273802757263184, "rewards/margins_std": 3.9825892448425293, "rewards/rejected": -9.705069541931152, "step": 2990 }, { "epoch": 0.75, "grad_norm": 1.0078125, "learning_rate": 3.442946582336379e-07, "logits/chosen": 0.4471007287502289, "logits/rejected": 0.9367235898971558, "logps/chosen": -580.6430053710938, "logps/rejected": -1280.119140625, "loss": 0.1246, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -3.6090996265411377, "rewards/margins": 7.072485446929932, "rewards/margins_max": 11.0000638961792, "rewards/margins_min": 3.1449074745178223, "rewards/margins_std": 5.554434776306152, "rewards/rejected": -10.681586265563965, "step": 3000 }, { "epoch": 0.76, "grad_norm": 1.5078125, "learning_rate": 3.376870618348722e-07, "logits/chosen": 0.46739286184310913, "logits/rejected": 0.8226820230484009, "logps/chosen": -575.8204956054688, "logps/rejected": -1147.048583984375, "loss": 0.0994, "rewards/accuracies": 1.0, "rewards/chosen": -3.88541841506958, "rewards/margins": 5.527801513671875, "rewards/margins_max": 7.994576930999756, "rewards/margins_min": 3.0610268115997314, "rewards/margins_std": 3.488546371459961, "rewards/rejected": -9.41322135925293, "step": 3010 }, { "epoch": 0.76, "grad_norm": 7.4375, "learning_rate": 3.311305824141273e-07, "logits/chosen": 0.4271882176399231, "logits/rejected": 0.8448736071586609, "logps/chosen": -613.0482177734375, "logps/rejected": -1061.115966796875, "loss": 0.3039, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -4.161238670349121, "rewards/margins": 4.51308536529541, "rewards/margins_max": 7.211556434631348, "rewards/margins_min": 1.81461501121521, "rewards/margins_std": 3.816213607788086, "rewards/rejected": -8.674324989318848, "step": 3020 }, { "epoch": 0.76, "grad_norm": 1.34375, "learning_rate": 3.2462572599717263e-07, "logits/chosen": 0.6139329671859741, "logits/rejected": 0.8676943778991699, "logps/chosen": -577.7686767578125, "logps/rejected": -1452.658203125, "loss": 0.158, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -3.870023250579834, "rewards/margins": 8.4967679977417, "rewards/margins_max": 13.280682563781738, "rewards/margins_min": 3.7128536701202393, "rewards/margins_std": 6.765477180480957, "rewards/rejected": -12.366792678833008, "step": 3030 }, { "epoch": 0.76, "grad_norm": 0.765625, "learning_rate": 3.181729946255406e-07, "logits/chosen": 0.4582037031650543, "logits/rejected": 0.9174816012382507, "logps/chosen": -629.7871704101562, "logps/rejected": -1234.114990234375, "loss": 0.1879, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -3.7807984352111816, "rewards/margins": 6.365638732910156, "rewards/margins_max": 9.294533729553223, "rewards/margins_min": 3.4367434978485107, "rewards/margins_std": 4.142083168029785, "rewards/rejected": -10.14643669128418, "step": 3040 }, { "epoch": 0.77, "grad_norm": 10.9375, "learning_rate": 3.1177288631777953e-07, "logits/chosen": 0.5115953683853149, "logits/rejected": 0.9640370607376099, "logps/chosen": -588.091552734375, "logps/rejected": -1170.9622802734375, "loss": 0.2467, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -3.9625473022460938, "rewards/margins": 5.80161190032959, "rewards/margins_max": 8.664453506469727, "rewards/margins_min": 2.9387693405151367, "rewards/margins_std": 4.048670291900635, "rewards/rejected": -9.764158248901367, "step": 3050 }, { "epoch": 0.77, "grad_norm": 2.828125, "learning_rate": 3.054258950310152e-07, "logits/chosen": 0.43586626648902893, "logits/rejected": 0.8257268667221069, "logps/chosen": -565.931640625, "logps/rejected": -1140.6973876953125, "loss": 0.2435, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -3.604163408279419, "rewards/margins": 5.731095314025879, "rewards/margins_max": 9.021711349487305, "rewards/margins_min": 2.4404799938201904, "rewards/margins_std": 4.653633117675781, "rewards/rejected": -9.335259437561035, "step": 3060 }, { "epoch": 0.77, "grad_norm": 1.1484375, "learning_rate": 2.9913251062282984e-07, "logits/chosen": 0.5903941988945007, "logits/rejected": 0.9113849401473999, "logps/chosen": -563.6361083984375, "logps/rejected": -1265.0657958984375, "loss": 0.1899, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -3.6609840393066406, "rewards/margins": 6.926054954528809, "rewards/margins_max": 10.484045028686523, "rewards/margins_min": 3.3680667877197266, "rewards/margins_std": 5.031756401062012, "rewards/rejected": -10.587040901184082, "step": 3070 }, { "epoch": 0.77, "grad_norm": 6.78125, "learning_rate": 2.9289321881345254e-07, "logits/chosen": 0.5719391703605652, "logits/rejected": 0.9248722791671753, "logps/chosen": -599.0647583007812, "logps/rejected": -1356.6328125, "loss": 0.1565, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -3.8286640644073486, "rewards/margins": 7.712100028991699, "rewards/margins_max": 11.743194580078125, "rewards/margins_min": 3.681006908416748, "rewards/margins_std": 5.700827598571777, "rewards/rejected": -11.540764808654785, "step": 3080 }, { "epoch": 0.78, "grad_norm": 0.60546875, "learning_rate": 2.867085011482737e-07, "logits/chosen": 0.48627376556396484, "logits/rejected": 0.8899961709976196, "logps/chosen": -660.3402709960938, "logps/rejected": -1349.8409423828125, "loss": 0.1774, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -4.254647254943848, "rewards/margins": 7.1945695877075195, "rewards/margins_max": 10.741331100463867, "rewards/margins_min": 3.6478075981140137, "rewards/margins_std": 5.015878200531006, "rewards/rejected": -11.449216842651367, "step": 3090 }, { "epoch": 0.78, "grad_norm": 0.890625, "learning_rate": 2.8057883496067925e-07, "logits/chosen": 0.5544101595878601, "logits/rejected": 0.8789188265800476, "logps/chosen": -529.1682739257812, "logps/rejected": -1224.767822265625, "loss": 0.1593, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -3.319157838821411, "rewards/margins": 6.78830099105835, "rewards/margins_max": 10.131688117980957, "rewards/margins_min": 3.4449145793914795, "rewards/margins_std": 4.728262901306152, "rewards/rejected": -10.107458114624023, "step": 3100 }, { "epoch": 0.78, "grad_norm": 1.7890625, "learning_rate": 2.7450469333520853e-07, "logits/chosen": 0.39449039101600647, "logits/rejected": 0.6853546500205994, "logps/chosen": -568.887451171875, "logps/rejected": -1200.629638671875, "loss": 0.1714, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -3.762941837310791, "rewards/margins": 6.143300533294678, "rewards/margins_max": 9.352704048156738, "rewards/margins_min": 2.9338972568511963, "rewards/margins_std": 4.538782119750977, "rewards/rejected": -9.906242370605469, "step": 3110 }, { "epoch": 0.79, "grad_norm": 1.140625, "learning_rate": 2.6848654507104463e-07, "logits/chosen": 0.3403048515319824, "logits/rejected": 0.807928740978241, "logps/chosen": -615.323974609375, "logps/rejected": -1190.897705078125, "loss": 0.161, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -3.839613676071167, "rewards/margins": 5.719782829284668, "rewards/margins_max": 8.89350700378418, "rewards/margins_min": 2.546060085296631, "rewards/margins_std": 4.4883222579956055, "rewards/rejected": -9.559396743774414, "step": 3120 }, { "epoch": 0.79, "grad_norm": 1.921875, "learning_rate": 2.625248546458303e-07, "logits/chosen": 0.4214434027671814, "logits/rejected": 0.8966633677482605, "logps/chosen": -620.8934936523438, "logps/rejected": -1347.8140869140625, "loss": 0.1593, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -4.028387546539307, "rewards/margins": 7.290696144104004, "rewards/margins_max": 11.02523422241211, "rewards/margins_min": 3.556157350540161, "rewards/margins_std": 5.281435489654541, "rewards/rejected": -11.319084167480469, "step": 3130 }, { "epoch": 0.79, "grad_norm": 2.046875, "learning_rate": 2.5662008217982156e-07, "logits/chosen": 0.47852668166160583, "logits/rejected": 0.9050714373588562, "logps/chosen": -579.2857666015625, "logps/rejected": -1326.245849609375, "loss": 0.1588, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -3.5103771686553955, "rewards/margins": 7.595736026763916, "rewards/margins_max": 11.297245025634766, "rewards/margins_min": 3.894225597381592, "rewards/margins_std": 5.2347259521484375, "rewards/rejected": -11.10611343383789, "step": 3140 }, { "epoch": 0.79, "grad_norm": 2.3125, "learning_rate": 2.507726834003745e-07, "logits/chosen": 0.5341039299964905, "logits/rejected": 0.9968475103378296, "logps/chosen": -572.641845703125, "logps/rejected": -1260.7801513671875, "loss": 0.1122, "rewards/accuracies": 0.9375, "rewards/chosen": -3.6314964294433594, "rewards/margins": 6.958949089050293, "rewards/margins_max": 11.029642105102539, "rewards/margins_min": 2.888258457183838, "rewards/margins_std": 5.756827354431152, "rewards/rejected": -10.590445518493652, "step": 3150 }, { "epoch": 0.8, "grad_norm": 2.71875, "learning_rate": 2.44983109606773e-07, "logits/chosen": 0.44414272904396057, "logits/rejected": 0.72679603099823, "logps/chosen": -596.27880859375, "logps/rejected": -1328.358154296875, "loss": 0.177, "rewards/accuracies": 0.9375, "rewards/chosen": -3.8258304595947266, "rewards/margins": 7.107968330383301, "rewards/margins_max": 10.948715209960938, "rewards/margins_min": 3.2672207355499268, "rewards/margins_std": 5.431636810302734, "rewards/rejected": -10.933798789978027, "step": 3160 }, { "epoch": 0.8, "grad_norm": 0.8203125, "learning_rate": 2.3925180763539845e-07, "logits/chosen": 0.4964269697666168, "logits/rejected": 0.9164209365844727, "logps/chosen": -562.5709228515625, "logps/rejected": -1107.593017578125, "loss": 0.1703, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -3.371009349822998, "rewards/margins": 5.362520217895508, "rewards/margins_max": 8.385972023010254, "rewards/margins_min": 2.3390681743621826, "rewards/margins_std": 4.275806427001953, "rewards/rejected": -8.733530044555664, "step": 3170 }, { "epoch": 0.8, "grad_norm": 4.125, "learning_rate": 2.3357921982524197e-07, "logits/chosen": 0.5338067412376404, "logits/rejected": 0.9268990755081177, "logps/chosen": -571.2572021484375, "logps/rejected": -1339.582763671875, "loss": 0.1291, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -3.572880506515503, "rewards/margins": 7.33321475982666, "rewards/margins_max": 10.481898307800293, "rewards/margins_min": 4.184528827667236, "rewards/margins_std": 4.452913284301758, "rewards/rejected": -10.906094551086426, "step": 3180 }, { "epoch": 0.8, "grad_norm": 1.640625, "learning_rate": 2.279657839837652e-07, "logits/chosen": 0.4593687951564789, "logits/rejected": 0.8522500991821289, "logps/chosen": -554.2239379882812, "logps/rejected": -1212.277587890625, "loss": 0.2414, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": -3.5025417804718018, "rewards/margins": 6.630636692047119, "rewards/margins_max": 10.59939956665039, "rewards/margins_min": 2.661872386932373, "rewards/margins_std": 5.612679958343506, "rewards/rejected": -10.1331787109375, "step": 3190 }, { "epoch": 0.81, "grad_norm": 0.859375, "learning_rate": 2.2241193335311127e-07, "logits/chosen": 0.4334026277065277, "logits/rejected": 0.8511263728141785, "logps/chosen": -522.2242431640625, "logps/rejected": -1108.195556640625, "loss": 0.1553, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -3.257991313934326, "rewards/margins": 5.921751976013184, "rewards/margins_max": 8.671719551086426, "rewards/margins_min": 3.1717848777770996, "rewards/margins_std": 3.889040470123291, "rewards/rejected": -9.179742813110352, "step": 3200 }, { "epoch": 0.81, "grad_norm": 1.21875, "learning_rate": 2.1691809657666592e-07, "logits/chosen": 0.4394384026527405, "logits/rejected": 0.9547786712646484, "logps/chosen": -563.9006958007812, "logps/rejected": -984.5545654296875, "loss": 0.2146, "rewards/accuracies": 0.9375, "rewards/chosen": -3.6484360694885254, "rewards/margins": 4.430947780609131, "rewards/margins_max": 6.850518226623535, "rewards/margins_min": 2.011378765106201, "rewards/margins_std": 3.421788454055786, "rewards/rejected": -8.079385757446289, "step": 3210 }, { "epoch": 0.81, "grad_norm": 0.94140625, "learning_rate": 2.1148469766597698e-07, "logits/chosen": 0.5856447219848633, "logits/rejected": 0.9771261215209961, "logps/chosen": -587.283935546875, "logps/rejected": -1270.7833251953125, "loss": 0.1871, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -3.7281603813171387, "rewards/margins": 6.7978515625, "rewards/margins_max": 10.585257530212402, "rewards/margins_min": 3.0104446411132812, "rewards/margins_std": 5.356202125549316, "rewards/rejected": -10.526012420654297, "step": 3220 }, { "epoch": 0.81, "grad_norm": 1.625, "learning_rate": 2.06112155968028e-07, "logits/chosen": 0.34765639901161194, "logits/rejected": 0.7540073990821838, "logps/chosen": -610.2107543945312, "logps/rejected": -1326.593017578125, "loss": 0.1437, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -4.039941787719727, "rewards/margins": 7.0522661209106445, "rewards/margins_max": 10.800148010253906, "rewards/margins_min": 3.3043816089630127, "rewards/margins_std": 5.300307750701904, "rewards/rejected": -11.092206954956055, "step": 3230 }, { "epoch": 0.82, "grad_norm": 4.375, "learning_rate": 2.0080088613287293e-07, "logits/chosen": 0.4891189932823181, "logits/rejected": 0.9726032018661499, "logps/chosen": -548.3533935546875, "logps/rejected": -1111.445068359375, "loss": 0.1692, "rewards/accuracies": 1.0, "rewards/chosen": -3.530595302581787, "rewards/margins": 5.695003509521484, "rewards/margins_max": 8.761409759521484, "rewards/margins_min": 2.628596305847168, "rewards/margins_std": 4.336554527282715, "rewards/rejected": -9.22559928894043, "step": 3240 }, { "epoch": 0.82, "grad_norm": 1.453125, "learning_rate": 1.955512980816354e-07, "logits/chosen": 0.5204964876174927, "logits/rejected": 0.8765512704849243, "logps/chosen": -591.1302490234375, "logps/rejected": -1324.301513671875, "loss": 0.2054, "rewards/accuracies": 0.9375, "rewards/chosen": -3.970345973968506, "rewards/margins": 7.200788974761963, "rewards/margins_max": 10.806253433227539, "rewards/margins_min": 3.595324754714966, "rewards/margins_std": 5.098896026611328, "rewards/rejected": -11.171134948730469, "step": 3250 }, { "epoch": 0.82, "grad_norm": 1.2890625, "learning_rate": 1.9036379697486927e-07, "logits/chosen": 0.47821909189224243, "logits/rejected": 0.921379566192627, "logps/chosen": -550.5033569335938, "logps/rejected": -1350.917724609375, "loss": 0.1216, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -3.5862479209899902, "rewards/margins": 7.927116394042969, "rewards/margins_max": 12.039579391479492, "rewards/margins_min": 3.81465220451355, "rewards/margins_std": 5.815901756286621, "rewards/rejected": -11.513364791870117, "step": 3260 }, { "epoch": 0.82, "grad_norm": 0.9375, "learning_rate": 1.8523878318128926e-07, "logits/chosen": 0.5904892683029175, "logits/rejected": 1.0056906938552856, "logps/chosen": -576.0701904296875, "logps/rejected": -1295.9527587890625, "loss": 0.1443, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -3.6342978477478027, "rewards/margins": 7.424314022064209, "rewards/margins_max": 11.07739543914795, "rewards/margins_min": 3.771233320236206, "rewards/margins_std": 5.166236400604248, "rewards/rejected": -11.058611869812012, "step": 3270 }, { "epoch": 0.83, "grad_norm": 2.5, "learning_rate": 1.8017665224687185e-07, "logits/chosen": 0.4087589383125305, "logits/rejected": 0.9955110549926758, "logps/chosen": -640.23828125, "logps/rejected": -1283.7027587890625, "loss": 0.188, "rewards/accuracies": 0.9375, "rewards/chosen": -3.8483219146728516, "rewards/margins": 6.573834419250488, "rewards/margins_max": 9.99498176574707, "rewards/margins_min": 3.15268611907959, "rewards/margins_std": 4.8382344245910645, "rewards/rejected": -10.422155380249023, "step": 3280 }, { "epoch": 0.83, "grad_norm": 1.015625, "learning_rate": 1.7517779486432494e-07, "logits/chosen": 0.5131040811538696, "logits/rejected": 0.9352075457572937, "logps/chosen": -604.0338745117188, "logps/rejected": -1292.89111328125, "loss": 0.1687, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -3.8847403526306152, "rewards/margins": 6.89129638671875, "rewards/margins_max": 10.85944938659668, "rewards/margins_min": 2.9231438636779785, "rewards/margins_std": 5.611815452575684, "rewards/rejected": -10.77603816986084, "step": 3290 }, { "epoch": 0.83, "grad_norm": 1.703125, "learning_rate": 1.7024259684293674e-07, "logits/chosen": 0.4551068842411041, "logits/rejected": 0.9105457067489624, "logps/chosen": -587.896240234375, "logps/rejected": -1109.6060791015625, "loss": 0.2436, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -3.670138120651245, "rewards/margins": 5.2051897048950195, "rewards/margins_max": 8.09550666809082, "rewards/margins_min": 2.314873218536377, "rewards/margins_std": 4.0875244140625, "rewards/rejected": -8.875328063964844, "step": 3300 }, { "epoch": 0.83, "grad_norm": 0.95703125, "learning_rate": 1.6537143907879792e-07, "logits/chosen": 0.4002392292022705, "logits/rejected": 0.8851076364517212, "logps/chosen": -600.163818359375, "logps/rejected": -1274.1259765625, "loss": 0.1875, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -3.855924606323242, "rewards/margins": 6.683934688568115, "rewards/margins_max": 10.393302917480469, "rewards/margins_min": 2.97456693649292, "rewards/margins_std": 5.2458391189575195, "rewards/rejected": -10.539859771728516, "step": 3310 }, { "epoch": 0.84, "grad_norm": 8.1875, "learning_rate": 1.6056469752540347e-07, "logits/chosen": 0.5070708394050598, "logits/rejected": 1.0886653661727905, "logps/chosen": -579.6941528320312, "logps/rejected": -1257.633544921875, "loss": 0.1963, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -3.607515335083008, "rewards/margins": 6.9815239906311035, "rewards/margins_max": 10.53345012664795, "rewards/margins_min": 3.4295973777770996, "rewards/margins_std": 5.023181915283203, "rewards/rejected": -10.58903980255127, "step": 3320 }, { "epoch": 0.84, "grad_norm": 0.87890625, "learning_rate": 1.5582274316463928e-07, "logits/chosen": 0.42002058029174805, "logits/rejected": 0.8670506477355957, "logps/chosen": -607.319580078125, "logps/rejected": -1416.8555908203125, "loss": 0.1503, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -3.934675693511963, "rewards/margins": 8.056138038635254, "rewards/margins_max": 11.901208877563477, "rewards/margins_min": 4.211067199707031, "rewards/margins_std": 5.437750816345215, "rewards/rejected": -11.990813255310059, "step": 3330 }, { "epoch": 0.84, "grad_norm": 0.58203125, "learning_rate": 1.511459419781469e-07, "logits/chosen": 0.473996639251709, "logits/rejected": 0.9316139221191406, "logps/chosen": -638.8199462890625, "logps/rejected": -1332.9974365234375, "loss": 0.1523, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -4.206120491027832, "rewards/margins": 7.044719696044922, "rewards/margins_max": 10.616361618041992, "rewards/margins_min": 3.4730796813964844, "rewards/margins_std": 5.05106258392334, "rewards/rejected": -11.250840187072754, "step": 3340 }, { "epoch": 0.84, "grad_norm": 0.875, "learning_rate": 1.4653465491908e-07, "logits/chosen": 0.4140965938568115, "logits/rejected": 0.8925831913948059, "logps/chosen": -568.9302978515625, "logps/rejected": -1233.1417236328125, "loss": 0.1886, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -3.6400482654571533, "rewards/margins": 6.646452903747559, "rewards/margins_max": 10.187222480773926, "rewards/margins_min": 3.105684518814087, "rewards/margins_std": 5.007403373718262, "rewards/rejected": -10.28650188446045, "step": 3350 }, { "epoch": 0.85, "grad_norm": 2.21875, "learning_rate": 1.4198923788424477e-07, "logits/chosen": 0.47543078660964966, "logits/rejected": 0.9080629348754883, "logps/chosen": -633.4432373046875, "logps/rejected": -1273.630615234375, "loss": 0.14, "rewards/accuracies": 0.9375, "rewards/chosen": -4.122483730316162, "rewards/margins": 6.322574615478516, "rewards/margins_max": 9.358713150024414, "rewards/margins_min": 3.2864346504211426, "rewards/margins_std": 4.293749809265137, "rewards/rejected": -10.445058822631836, "step": 3360 }, { "epoch": 0.85, "grad_norm": 1.015625, "learning_rate": 1.375100416866316e-07, "logits/chosen": 0.5130153298377991, "logits/rejected": 0.9054125547409058, "logps/chosen": -539.0538940429688, "logps/rejected": -1185.374267578125, "loss": 0.1416, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -3.414109468460083, "rewards/margins": 6.318561553955078, "rewards/margins_max": 9.108491897583008, "rewards/margins_min": 3.5286312103271484, "rewards/margins_std": 3.9455573558807373, "rewards/rejected": -9.732671737670898, "step": 3370 }, { "epoch": 0.85, "grad_norm": 1.03125, "learning_rate": 1.3309741202834045e-07, "logits/chosen": 0.4064570367336273, "logits/rejected": 0.9202351570129395, "logps/chosen": -582.8175659179688, "logps/rejected": -1286.4290771484375, "loss": 0.1051, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -3.6537833213806152, "rewards/margins": 6.9002556800842285, "rewards/margins_max": 9.804253578186035, "rewards/margins_min": 3.9962570667266846, "rewards/margins_std": 4.106873512268066, "rewards/rejected": -10.554038047790527, "step": 3380 }, { "epoch": 0.85, "grad_norm": 2.890625, "learning_rate": 1.2875168947389982e-07, "logits/chosen": 0.4888080656528473, "logits/rejected": 0.8091050386428833, "logps/chosen": -651.0318603515625, "logps/rejected": -1255.9091796875, "loss": 0.1691, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -4.149880409240723, "rewards/margins": 6.239043712615967, "rewards/margins_max": 9.588689804077148, "rewards/margins_min": 2.889397144317627, "rewards/margins_std": 4.73711633682251, "rewards/rejected": -10.388925552368164, "step": 3390 }, { "epoch": 0.86, "grad_norm": 0.95703125, "learning_rate": 1.2447320942398075e-07, "logits/chosen": 0.4371975362300873, "logits/rejected": 1.0079147815704346, "logps/chosen": -620.552978515625, "logps/rejected": -1189.5225830078125, "loss": 0.294, "rewards/accuracies": 0.887499988079071, "rewards/chosen": -3.896024227142334, "rewards/margins": 6.076613903045654, "rewards/margins_max": 9.558730125427246, "rewards/margins_min": 2.5944974422454834, "rewards/margins_std": 4.924456596374512, "rewards/rejected": -9.972637176513672, "step": 3400 }, { "epoch": 0.86, "grad_norm": 1.34375, "learning_rate": 1.2026230208951304e-07, "logits/chosen": 0.472128301858902, "logits/rejected": 0.9814669489860535, "logps/chosen": -617.8382568359375, "logps/rejected": -1211.0347900390625, "loss": 0.1947, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -3.850552797317505, "rewards/margins": 6.239529132843018, "rewards/margins_max": 9.678964614868164, "rewards/margins_min": 2.8000922203063965, "rewards/margins_std": 4.864098072052002, "rewards/rejected": -10.090081214904785, "step": 3410 }, { "epoch": 0.86, "grad_norm": 8.4375, "learning_rate": 1.1611929246619723e-07, "logits/chosen": 0.45898929238319397, "logits/rejected": 0.8415622711181641, "logps/chosen": -584.2407836914062, "logps/rejected": -1248.3134765625, "loss": 0.267, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -3.8289635181427, "rewards/margins": 6.5659613609313965, "rewards/margins_max": 9.749704360961914, "rewards/margins_min": 3.3822174072265625, "rewards/margins_std": 4.502493858337402, "rewards/rejected": -10.394925117492676, "step": 3420 }, { "epoch": 0.86, "grad_norm": 10.8125, "learning_rate": 1.1204450030942347e-07, "logits/chosen": 0.5145548582077026, "logits/rejected": 0.8463503122329712, "logps/chosen": -588.1199951171875, "logps/rejected": -1244.9676513671875, "loss": 0.2621, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -3.942927122116089, "rewards/margins": 6.304174423217773, "rewards/margins_max": 9.600616455078125, "rewards/margins_min": 3.007732629776001, "rewards/margins_std": 4.661872386932373, "rewards/rejected": -10.247102737426758, "step": 3430 }, { "epoch": 0.87, "grad_norm": 0.71875, "learning_rate": 1.080382401095925e-07, "logits/chosen": 0.5430434942245483, "logits/rejected": 1.0276672840118408, "logps/chosen": -612.2022094726562, "logps/rejected": -1238.375, "loss": 0.177, "rewards/accuracies": 0.9375, "rewards/chosen": -3.7015655040740967, "rewards/margins": 6.311164379119873, "rewards/margins_max": 9.718725204467773, "rewards/margins_min": 2.903604745864868, "rewards/margins_std": 4.81901741027832, "rewards/rejected": -10.012730598449707, "step": 3440 }, { "epoch": 0.87, "grad_norm": 0.5234375, "learning_rate": 1.0410082106784235e-07, "logits/chosen": 0.4352169632911682, "logits/rejected": 1.0200514793395996, "logps/chosen": -686.00732421875, "logps/rejected": -1239.3880615234375, "loss": 0.3034, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": -4.280230522155762, "rewards/margins": 6.025425910949707, "rewards/margins_max": 9.804727554321289, "rewards/margins_min": 2.246123790740967, "rewards/margins_std": 5.34473991394043, "rewards/rejected": -10.305655479431152, "step": 3450 }, { "epoch": 0.87, "grad_norm": 3.125, "learning_rate": 1.0023254707218609e-07, "logits/chosen": 0.4326336979866028, "logits/rejected": 0.8475500345230103, "logps/chosen": -625.3204345703125, "logps/rejected": -1264.6829833984375, "loss": 0.2183, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -3.916844606399536, "rewards/margins": 6.507603645324707, "rewards/margins_max": 10.618246078491211, "rewards/margins_min": 2.3969624042510986, "rewards/margins_std": 5.813324928283691, "rewards/rejected": -10.42444896697998, "step": 3460 }, { "epoch": 0.87, "grad_norm": 2.09375, "learning_rate": 9.643371667405698e-08, "logits/chosen": 0.4223089814186096, "logits/rejected": 0.9621411561965942, "logps/chosen": -599.6414794921875, "logps/rejected": -1107.6328125, "loss": 0.1659, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -3.7435672283172607, "rewards/margins": 5.297955513000488, "rewards/margins_max": 7.878331184387207, "rewards/margins_min": 2.717580795288086, "rewards/margins_std": 3.6492016315460205, "rewards/rejected": -9.041522979736328, "step": 3470 }, { "epoch": 0.88, "grad_norm": 9.875, "learning_rate": 9.270462306526594e-08, "logits/chosen": 0.540179431438446, "logits/rejected": 0.956885039806366, "logps/chosen": -563.6200561523438, "logps/rejected": -1207.8319091796875, "loss": 0.2462, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -3.532160520553589, "rewards/margins": 6.614119052886963, "rewards/margins_max": 10.017059326171875, "rewards/margins_min": 3.21117901802063, "rewards/margins_std": 4.812485218048096, "rewards/rejected": -10.146280288696289, "step": 3480 }, { "epoch": 0.88, "grad_norm": 1.2890625, "learning_rate": 8.904555405537406e-08, "logits/chosen": 0.4101219177246094, "logits/rejected": 0.9202925562858582, "logps/chosen": -566.47998046875, "logps/rejected": -1209.119873046875, "loss": 0.1687, "rewards/accuracies": 0.9375, "rewards/chosen": -3.4704158306121826, "rewards/margins": 6.5966796875, "rewards/margins_max": 9.444000244140625, "rewards/margins_min": 3.749358654022217, "rewards/margins_std": 4.026719570159912, "rewards/rejected": -10.067094802856445, "step": 3490 }, { "epoch": 0.88, "grad_norm": 1.5, "learning_rate": 8.545679204947953e-08, "logits/chosen": 0.5104061365127563, "logits/rejected": 0.9191001653671265, "logps/chosen": -560.3649291992188, "logps/rejected": -1123.328857421875, "loss": 0.1221, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -3.688387632369995, "rewards/margins": 5.621038913726807, "rewards/margins_max": 7.867220401763916, "rewards/margins_min": 3.3748581409454346, "rewards/margins_std": 3.176579713821411, "rewards/rejected": -9.309426307678223, "step": 3500 }, { "epoch": 0.88, "grad_norm": 13.1875, "learning_rate": 8.193861402642088e-08, "logits/chosen": 0.3396713137626648, "logits/rejected": 0.8896854519844055, "logps/chosen": -639.7154541015625, "logps/rejected": -1173.7528076171875, "loss": 0.2153, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -3.856093168258667, "rewards/margins": 5.626918315887451, "rewards/margins_max": 8.778867721557617, "rewards/margins_min": 2.4749696254730225, "rewards/margins_std": 4.457529067993164, "rewards/rejected": -9.483012199401855, "step": 3510 }, { "epoch": 0.89, "grad_norm": 0.83203125, "learning_rate": 7.849129151740119e-08, "logits/chosen": 0.49893778562545776, "logits/rejected": 0.9670238494873047, "logps/chosen": -577.2950439453125, "logps/rejected": -1137.2681884765625, "loss": 0.1803, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -3.5992584228515625, "rewards/margins": 5.753302097320557, "rewards/margins_max": 8.932449340820312, "rewards/margins_min": 2.574155807495117, "rewards/margins_std": 4.495992660522461, "rewards/rejected": -9.352560043334961, "step": 3520 }, { "epoch": 0.89, "grad_norm": 1.703125, "learning_rate": 7.511509058502996e-08, "logits/chosen": 0.4532325863838196, "logits/rejected": 0.9574426412582397, "logps/chosen": -568.4635620117188, "logps/rejected": -1101.7496337890625, "loss": 0.2359, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -3.4725348949432373, "rewards/margins": 5.306643486022949, "rewards/margins_max": 8.2151460647583, "rewards/margins_min": 2.398141384124756, "rewards/margins_std": 4.113243579864502, "rewards/rejected": -8.779179573059082, "step": 3530 }, { "epoch": 0.89, "grad_norm": 11.8125, "learning_rate": 7.18102718027901e-08, "logits/chosen": 0.5021312236785889, "logits/rejected": 0.9430710673332214, "logps/chosen": -587.4429321289062, "logps/rejected": -1174.86181640625, "loss": 0.2749, "rewards/accuracies": 0.887499988079071, "rewards/chosen": -3.7951760292053223, "rewards/margins": 5.84264612197876, "rewards/margins_max": 9.605822563171387, "rewards/margins_min": 2.0794689655303955, "rewards/margins_std": 5.321936130523682, "rewards/rejected": -9.637822151184082, "step": 3540 }, { "epoch": 0.89, "grad_norm": 1.25, "learning_rate": 6.857709023492586e-08, "logits/chosen": 0.39984625577926636, "logits/rejected": 0.8395845293998718, "logps/chosen": -564.1094970703125, "logps/rejected": -1282.72802734375, "loss": 0.2096, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -3.5986900329589844, "rewards/margins": 6.7760329246521, "rewards/margins_max": 10.53276252746582, "rewards/margins_min": 3.0193045139312744, "rewards/margins_std": 5.312817573547363, "rewards/rejected": -10.374723434448242, "step": 3550 }, { "epoch": 0.9, "grad_norm": 0.74609375, "learning_rate": 6.541579541675734e-08, "logits/chosen": 0.4497915208339691, "logits/rejected": 0.8971832394599915, "logps/chosen": -563.6310424804688, "logps/rejected": -1266.3800048828125, "loss": 0.1246, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -3.5002026557922363, "rewards/margins": 7.074613094329834, "rewards/margins_max": 10.105807304382324, "rewards/margins_min": 4.043417930603027, "rewards/margins_std": 4.28675651550293, "rewards/rejected": -10.57481575012207, "step": 3560 }, { "epoch": 0.9, "grad_norm": 2.71875, "learning_rate": 6.232663133542204e-08, "logits/chosen": 0.32878604531288147, "logits/rejected": 0.9640012979507446, "logps/chosen": -651.9713134765625, "logps/rejected": -1202.863525390625, "loss": 0.1796, "rewards/accuracies": 0.9375, "rewards/chosen": -4.158196449279785, "rewards/margins": 5.708923816680908, "rewards/margins_max": 8.873598098754883, "rewards/margins_min": 2.544250011444092, "rewards/margins_std": 4.475523948669434, "rewards/rejected": -9.867119789123535, "step": 3570 }, { "epoch": 0.9, "grad_norm": 0.458984375, "learning_rate": 5.9309836411043034e-08, "logits/chosen": 0.4480930268764496, "logits/rejected": 0.9676550030708313, "logps/chosen": -615.3204956054688, "logps/rejected": -1192.294677734375, "loss": 0.1424, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -3.8424465656280518, "rewards/margins": 6.077481269836426, "rewards/margins_max": 9.163492202758789, "rewards/margins_min": 2.9914684295654297, "rewards/margins_std": 4.364280700683594, "rewards/rejected": -9.919927597045898, "step": 3580 }, { "epoch": 0.9, "grad_norm": 1.0390625, "learning_rate": 5.636564347832906e-08, "logits/chosen": 0.5807913541793823, "logits/rejected": 1.0163103342056274, "logps/chosen": -546.7759399414062, "logps/rejected": -1098.3787841796875, "loss": 0.1258, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -3.498042583465576, "rewards/margins": 5.564633369445801, "rewards/margins_max": 8.194429397583008, "rewards/margins_min": 2.9348368644714355, "rewards/margins_std": 3.7190933227539062, "rewards/rejected": -9.062675476074219, "step": 3590 }, { "epoch": 0.91, "grad_norm": 2.734375, "learning_rate": 5.349427976860321e-08, "logits/chosen": 0.38955169916152954, "logits/rejected": 0.9389937520027161, "logps/chosen": -605.4104614257812, "logps/rejected": -1252.658935546875, "loss": 0.1961, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -3.7875728607177734, "rewards/margins": 6.668432712554932, "rewards/margins_max": 9.599607467651367, "rewards/margins_min": 3.737257480621338, "rewards/margins_std": 4.1453070640563965, "rewards/rejected": -10.456005096435547, "step": 3600 }, { "epoch": 0.91, "grad_norm": 2.015625, "learning_rate": 5.069596689226652e-08, "logits/chosen": 0.44946521520614624, "logits/rejected": 0.9365663528442383, "logps/chosen": -622.5480346679688, "logps/rejected": -1250.881103515625, "loss": 0.1408, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -3.8964874744415283, "rewards/margins": 6.451874732971191, "rewards/margins_max": 10.348373413085938, "rewards/margins_min": 2.5553746223449707, "rewards/margins_std": 5.510483264923096, "rewards/rejected": -10.348361015319824, "step": 3610 }, { "epoch": 0.91, "grad_norm": 4.90625, "learning_rate": 4.797092082169307e-08, "logits/chosen": 0.5568719506263733, "logits/rejected": 1.070988655090332, "logps/chosen": -644.2371826171875, "logps/rejected": -1172.1806640625, "loss": 0.2374, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": -4.062577724456787, "rewards/margins": 5.627659797668457, "rewards/margins_max": 8.762027740478516, "rewards/margins_min": 2.4932923316955566, "rewards/margins_std": 4.432665824890137, "rewards/rejected": -9.690237998962402, "step": 3620 }, { "epoch": 0.91, "grad_norm": 4.0, "learning_rate": 4.531935187456215e-08, "logits/chosen": 0.562368631362915, "logits/rejected": 1.0721943378448486, "logps/chosen": -617.67333984375, "logps/rejected": -1335.0179443359375, "loss": 0.1633, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -3.883314609527588, "rewards/margins": 7.272046089172363, "rewards/margins_max": 10.699112892150879, "rewards/margins_min": 3.844979763031006, "rewards/margins_std": 4.846603870391846, "rewards/rejected": -11.15536117553711, "step": 3630 }, { "epoch": 0.92, "grad_norm": 2.40625, "learning_rate": 4.274146469762563e-08, "logits/chosen": 0.5142907500267029, "logits/rejected": 0.8736904859542847, "logps/chosen": -538.7899169921875, "logps/rejected": -1283.476318359375, "loss": 0.1636, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -3.5881550312042236, "rewards/margins": 7.207592010498047, "rewards/margins_max": 10.598726272583008, "rewards/margins_min": 3.8164570331573486, "rewards/margins_std": 4.795788288116455, "rewards/rejected": -10.795746803283691, "step": 3640 }, { "epoch": 0.92, "grad_norm": 0.9609375, "learning_rate": 4.023745825091407e-08, "logits/chosen": 0.4232380986213684, "logits/rejected": 0.8965535163879395, "logps/chosen": -613.1757202148438, "logps/rejected": -1340.668212890625, "loss": 0.1607, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -3.898705005645752, "rewards/margins": 7.219210624694824, "rewards/margins_max": 10.523519515991211, "rewards/margins_min": 3.914902925491333, "rewards/margins_std": 4.672996997833252, "rewards/rejected": -11.117916107177734, "step": 3650 }, { "epoch": 0.92, "grad_norm": 6.1875, "learning_rate": 3.780752579237978e-08, "logits/chosen": 0.4038727283477783, "logits/rejected": 0.8197474479675293, "logps/chosen": -599.9093017578125, "logps/rejected": -1350.9764404296875, "loss": 0.2599, "rewards/accuracies": 0.9375, "rewards/chosen": -3.743206739425659, "rewards/margins": 7.354147434234619, "rewards/margins_max": 11.198575019836426, "rewards/margins_min": 3.5097198486328125, "rewards/margins_std": 5.4368414878845215, "rewards/rejected": -11.0973539352417, "step": 3660 }, { "epoch": 0.92, "grad_norm": 1.1640625, "learning_rate": 3.545185486298274e-08, "logits/chosen": 0.5607768893241882, "logits/rejected": 0.8514927625656128, "logps/chosen": -599.4677734375, "logps/rejected": -1308.130859375, "loss": 0.1914, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -3.958008289337158, "rewards/margins": 6.930544376373291, "rewards/margins_max": 10.306341171264648, "rewards/margins_min": 3.5547471046447754, "rewards/margins_std": 4.774097442626953, "rewards/rejected": -10.888551712036133, "step": 3670 }, { "epoch": 0.93, "grad_norm": 1.4375, "learning_rate": 3.317062727221542e-08, "logits/chosen": 0.6026689410209656, "logits/rejected": 0.9835416674613953, "logps/chosen": -614.91943359375, "logps/rejected": -1405.431640625, "loss": 0.1626, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -3.7954108715057373, "rewards/margins": 7.760351657867432, "rewards/margins_max": 12.802927017211914, "rewards/margins_min": 2.7177751064300537, "rewards/margins_std": 7.131278991699219, "rewards/rejected": -11.555761337280273, "step": 3680 }, { "epoch": 0.93, "grad_norm": 0.6484375, "learning_rate": 3.096401908407076e-08, "logits/chosen": 0.39605578780174255, "logits/rejected": 0.9517404437065125, "logps/chosen": -632.5601806640625, "logps/rejected": -1478.791015625, "loss": 0.1858, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -3.7204372882843018, "rewards/margins": 8.738731384277344, "rewards/margins_max": 13.405255317687988, "rewards/margins_min": 4.072208404541016, "rewards/margins_std": 6.599459648132324, "rewards/rejected": -12.459168434143066, "step": 3690 }, { "epoch": 0.93, "grad_norm": 4.09375, "learning_rate": 2.883220060345437e-08, "logits/chosen": 0.4364239275455475, "logits/rejected": 0.8532499074935913, "logps/chosen": -554.8377075195312, "logps/rejected": -1244.270263671875, "loss": 0.1934, "rewards/accuracies": 0.9375, "rewards/chosen": -3.2251949310302734, "rewards/margins": 7.302639007568359, "rewards/margins_max": 11.826835632324219, "rewards/margins_min": 2.778442859649658, "rewards/margins_std": 6.398179531097412, "rewards/rejected": -10.52783489227295, "step": 3700 }, { "epoch": 0.93, "grad_norm": 4.0625, "learning_rate": 2.6775336363039636e-08, "logits/chosen": 0.294972687959671, "logits/rejected": 0.7404045462608337, "logps/chosen": -603.755126953125, "logps/rejected": -1179.319580078125, "loss": 0.207, "rewards/accuracies": 0.9375, "rewards/chosen": -4.001626014709473, "rewards/margins": 5.692513465881348, "rewards/margins_max": 8.635394096374512, "rewards/margins_min": 2.7496330738067627, "rewards/margins_std": 4.161861896514893, "rewards/rejected": -9.69413948059082, "step": 3710 }, { "epoch": 0.94, "grad_norm": 2.25, "learning_rate": 2.4793585110569726e-08, "logits/chosen": 0.4034551680088043, "logits/rejected": 0.7582255601882935, "logps/chosen": -613.5555419921875, "logps/rejected": -1203.7205810546875, "loss": 0.1721, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -4.042942523956299, "rewards/margins": 5.850724220275879, "rewards/margins_max": 8.855157852172852, "rewards/margins_min": 2.8462884426116943, "rewards/margins_std": 4.248912811279297, "rewards/rejected": -9.89366626739502, "step": 3720 }, { "epoch": 0.94, "grad_norm": 0.5546875, "learning_rate": 2.2887099796605192e-08, "logits/chosen": 0.48683229088783264, "logits/rejected": 0.9286754727363586, "logps/chosen": -577.3275146484375, "logps/rejected": -1279.540771484375, "loss": 0.1469, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -3.7833991050720215, "rewards/margins": 6.769036293029785, "rewards/margins_max": 10.353792190551758, "rewards/margins_min": 3.184278964996338, "rewards/margins_std": 5.069611549377441, "rewards/rejected": -10.552433967590332, "step": 3730 }, { "epoch": 0.94, "grad_norm": 0.921875, "learning_rate": 2.1056027562719515e-08, "logits/chosen": 0.4771907925605774, "logits/rejected": 0.9152711629867554, "logps/chosen": -630.6005859375, "logps/rejected": -1180.2467041015625, "loss": 0.191, "rewards/accuracies": 0.9375, "rewards/chosen": -4.1157917976379395, "rewards/margins": 5.6571149826049805, "rewards/margins_max": 8.83124828338623, "rewards/margins_min": 2.482980728149414, "rewards/margins_std": 4.488903045654297, "rewards/rejected": -9.772905349731445, "step": 3740 }, { "epoch": 0.94, "grad_norm": 1.7734375, "learning_rate": 1.9300509730142855e-08, "logits/chosen": 0.43386468291282654, "logits/rejected": 0.904864490032196, "logps/chosen": -554.5198974609375, "logps/rejected": -1208.7537841796875, "loss": 0.1863, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -3.451647996902466, "rewards/margins": 6.490715026855469, "rewards/margins_max": 9.633821487426758, "rewards/margins_min": 3.3476085662841797, "rewards/margins_std": 4.445023536682129, "rewards/rejected": -9.942361831665039, "step": 3750 }, { "epoch": 0.95, "grad_norm": 0.94140625, "learning_rate": 1.762068178885501e-08, "logits/chosen": 0.41329479217529297, "logits/rejected": 0.8515909910202026, "logps/chosen": -620.33935546875, "logps/rejected": -1288.8076171875, "loss": 0.1975, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": -3.9792284965515137, "rewards/margins": 6.714905738830566, "rewards/margins_max": 10.155709266662598, "rewards/margins_min": 3.2741000652313232, "rewards/margins_std": 4.86603307723999, "rewards/rejected": -10.694132804870605, "step": 3760 }, { "epoch": 0.95, "grad_norm": 7.59375, "learning_rate": 1.6016673387127642e-08, "logits/chosen": 0.41189831495285034, "logits/rejected": 1.1138523817062378, "logps/chosen": -601.9688720703125, "logps/rejected": -1127.280029296875, "loss": 0.265, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -3.6903653144836426, "rewards/margins": 5.70098876953125, "rewards/margins_max": 8.490083694458008, "rewards/margins_min": 2.9118943214416504, "rewards/margins_std": 3.9443747997283936, "rewards/rejected": -9.39135456085205, "step": 3770 }, { "epoch": 0.95, "grad_norm": 1.671875, "learning_rate": 1.4488608321519214e-08, "logits/chosen": 0.310377836227417, "logits/rejected": 0.877922534942627, "logps/chosen": -592.072021484375, "logps/rejected": -1218.7236328125, "loss": 0.1276, "rewards/accuracies": 1.0, "rewards/chosen": -3.8421072959899902, "rewards/margins": 6.450199127197266, "rewards/margins_max": 9.784720420837402, "rewards/margins_min": 3.115678310394287, "rewards/margins_std": 4.715724945068359, "rewards/rejected": -10.292306900024414, "step": 3780 }, { "epoch": 0.95, "grad_norm": 0.58203125, "learning_rate": 1.3036604527319472e-08, "logits/chosen": 0.5283810496330261, "logits/rejected": 0.9584972262382507, "logps/chosen": -610.931884765625, "logps/rejected": -1124.7642822265625, "loss": 0.1821, "rewards/accuracies": 0.9375, "rewards/chosen": -3.874329090118408, "rewards/margins": 5.139523506164551, "rewards/margins_max": 7.766670227050781, "rewards/margins_min": 2.512375831604004, "rewards/margins_std": 3.7153477668762207, "rewards/rejected": -9.013853073120117, "step": 3790 }, { "epoch": 0.96, "grad_norm": 1.6875, "learning_rate": 1.1660774069447876e-08, "logits/chosen": 0.5633661150932312, "logits/rejected": 0.9613991975784302, "logps/chosen": -556.5394287109375, "logps/rejected": -1291.819091796875, "loss": 0.1353, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -3.2998318672180176, "rewards/margins": 7.387022495269775, "rewards/margins_max": 10.791549682617188, "rewards/margins_min": 3.982494831085205, "rewards/margins_std": 4.814728736877441, "rewards/rejected": -10.686854362487793, "step": 3800 }, { "epoch": 0.96, "grad_norm": 1.484375, "learning_rate": 1.0361223133804386e-08, "logits/chosen": 0.5381686091423035, "logits/rejected": 0.9398612976074219, "logps/chosen": -635.9791259765625, "logps/rejected": -1432.7926025390625, "loss": 0.1523, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -4.109006881713867, "rewards/margins": 7.819764137268066, "rewards/margins_max": 11.635190963745117, "rewards/margins_min": 4.004334926605225, "rewards/margins_std": 5.3958306312561035, "rewards/rejected": -11.928770065307617, "step": 3810 }, { "epoch": 0.96, "grad_norm": 37.75, "learning_rate": 9.138052019073472e-09, "logits/chosen": 0.45118942856788635, "logits/rejected": 0.796768844127655, "logps/chosen": -662.9164428710938, "logps/rejected": -1193.200439453125, "loss": 0.4235, "rewards/accuracies": 0.887499988079071, "rewards/chosen": -4.19881534576416, "rewards/margins": 5.3286051750183105, "rewards/margins_max": 8.492276191711426, "rewards/margins_min": 2.1649346351623535, "rewards/margins_std": 4.474106311798096, "rewards/rejected": -9.527420043945312, "step": 3820 }, { "epoch": 0.96, "grad_norm": 1.5234375, "learning_rate": 7.991355128984079e-09, "logits/chosen": 0.49201154708862305, "logits/rejected": 0.9295538067817688, "logps/chosen": -532.0567626953125, "logps/rejected": -1102.3900146484375, "loss": 0.1717, "rewards/accuracies": 0.9375, "rewards/chosen": -3.469942092895508, "rewards/margins": 5.525341033935547, "rewards/margins_max": 8.777814865112305, "rewards/margins_min": 2.2728657722473145, "rewards/margins_std": 4.59969425201416, "rewards/rejected": -8.995283126831055, "step": 3830 }, { "epoch": 0.97, "grad_norm": 1.3984375, "learning_rate": 6.921220965023012e-09, "logits/chosen": 0.388469398021698, "logits/rejected": 0.9527280926704407, "logps/chosen": -636.8690185546875, "logps/rejected": -1200.610595703125, "loss": 0.1942, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": -4.05787467956543, "rewards/margins": 5.918933391571045, "rewards/margins_max": 8.800240516662598, "rewards/margins_min": 3.037627696990967, "rewards/margins_std": 4.074782848358154, "rewards/rejected": -9.976808547973633, "step": 3840 }, { "epoch": 0.97, "grad_norm": 0.9375, "learning_rate": 5.9277321196044006e-09, "logits/chosen": 0.4024096429347992, "logits/rejected": 0.9591943025588989, "logps/chosen": -618.9902954101562, "logps/rejected": -1130.0238037109375, "loss": 0.1986, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -3.7532222270965576, "rewards/margins": 5.360965251922607, "rewards/margins_max": 8.214117050170898, "rewards/margins_min": 2.5078141689300537, "rewards/margins_std": 4.0349650382995605, "rewards/rejected": -9.114187240600586, "step": 3850 }, { "epoch": 0.97, "grad_norm": 1.5, "learning_rate": 5.010965269695577e-09, "logits/chosen": 0.3706130385398865, "logits/rejected": 0.9580795168876648, "logps/chosen": -593.1583251953125, "logps/rejected": -1240.548095703125, "loss": 0.1502, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -3.345301389694214, "rewards/margins": 6.722414970397949, "rewards/margins_max": 9.771614074707031, "rewards/margins_min": 3.67321515083313, "rewards/margins_std": 4.312219619750977, "rewards/rejected": -10.067716598510742, "step": 3860 }, { "epoch": 0.97, "grad_norm": 3.421875, "learning_rate": 4.170991170898808e-09, "logits/chosen": 0.5548506379127502, "logits/rejected": 0.9052824974060059, "logps/chosen": -576.05126953125, "logps/rejected": -1191.8560791015625, "loss": 0.1446, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -3.7464206218719482, "rewards/margins": 6.168720722198486, "rewards/margins_max": 8.987236976623535, "rewards/margins_min": 3.3502049446105957, "rewards/margins_std": 3.9859836101531982, "rewards/rejected": -9.915140151977539, "step": 3870 }, { "epoch": 0.98, "grad_norm": 1.6640625, "learning_rate": 3.407874651990883e-09, "logits/chosen": 0.4875260293483734, "logits/rejected": 0.9021614193916321, "logps/chosen": -564.7864379882812, "logps/rejected": -1175.726318359375, "loss": 0.2231, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": -3.7371113300323486, "rewards/margins": 6.037837982177734, "rewards/margins_max": 9.710600852966309, "rewards/margins_min": 2.3650765419006348, "rewards/margins_std": 5.194069862365723, "rewards/rejected": -9.77495002746582, "step": 3880 }, { "epoch": 0.98, "grad_norm": 0.9296875, "learning_rate": 2.7216746099193443e-09, "logits/chosen": 0.537278950214386, "logits/rejected": 0.9852391481399536, "logps/chosen": -634.4955444335938, "logps/rejected": -1307.7852783203125, "loss": 0.1988, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -4.1310648918151855, "rewards/margins": 6.686505317687988, "rewards/margins_max": 10.347613334655762, "rewards/margins_min": 3.025397539138794, "rewards/margins_std": 5.177587985992432, "rewards/rejected": -10.817570686340332, "step": 3890 }, { "epoch": 0.98, "grad_norm": 1.2265625, "learning_rate": 2.112444005256564e-09, "logits/chosen": 0.4740668833255768, "logits/rejected": 0.7506653070449829, "logps/chosen": -627.5335693359375, "logps/rejected": -1310.2884521484375, "loss": 0.185, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -4.233953475952148, "rewards/margins": 6.597451210021973, "rewards/margins_max": 10.050703048706055, "rewards/margins_min": 3.1441988945007324, "rewards/margins_std": 4.883635520935059, "rewards/rejected": -10.831403732299805, "step": 3900 }, { "epoch": 0.98, "grad_norm": 2.03125, "learning_rate": 1.5802298581132356e-09, "logits/chosen": 0.4343351721763611, "logits/rejected": 0.8734685182571411, "logps/chosen": -593.9387817382812, "logps/rejected": -1331.359130859375, "loss": 0.1502, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -3.890392780303955, "rewards/margins": 7.371206760406494, "rewards/margins_max": 11.046722412109375, "rewards/margins_min": 3.695690631866455, "rewards/margins_std": 5.197963714599609, "rewards/rejected": -11.261598587036133, "step": 3910 }, { "epoch": 0.99, "grad_norm": 3.359375, "learning_rate": 1.1250732445080569e-09, "logits/chosen": 0.48562726378440857, "logits/rejected": 0.9255334734916687, "logps/chosen": -666.9027709960938, "logps/rejected": -1247.671630859375, "loss": 0.1468, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": -4.539218425750732, "rewards/margins": 5.876649379730225, "rewards/margins_max": 8.69355583190918, "rewards/margins_min": 3.0597426891326904, "rewards/margins_std": 3.9837074279785156, "rewards/rejected": -10.415867805480957, "step": 3920 }, { "epoch": 0.99, "grad_norm": 1.140625, "learning_rate": 7.470092931987082e-10, "logits/chosen": 0.41472572088241577, "logits/rejected": 0.7566056847572327, "logps/chosen": -558.6325073242188, "logps/rejected": -1361.521240234375, "loss": 0.1731, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": -3.543172836303711, "rewards/margins": 7.946673393249512, "rewards/margins_max": 12.362479209899902, "rewards/margins_min": 3.5308678150177, "rewards/margins_std": 6.244892597198486, "rewards/rejected": -11.489847183227539, "step": 3930 }, { "epoch": 0.99, "grad_norm": 0.9296875, "learning_rate": 4.4606718296991143e-10, "logits/chosen": 0.45776480436325073, "logits/rejected": 0.9777078628540039, "logps/chosen": -553.6112060546875, "logps/rejected": -1242.527587890625, "loss": 0.2555, "rewards/accuracies": 0.9375, "rewards/chosen": -3.5952773094177246, "rewards/margins": 6.822895050048828, "rewards/margins_max": 10.043266296386719, "rewards/margins_min": 3.6025233268737793, "rewards/margins_std": 4.554293632507324, "rewards/rejected": -10.418172836303711, "step": 3940 }, { "epoch": 0.99, "grad_norm": 1.9453125, "learning_rate": 2.2227014038189717e-10, "logits/chosen": 0.47588786482810974, "logits/rejected": 0.8221324682235718, "logps/chosen": -567.9865112304688, "logps/rejected": -1433.75048828125, "loss": 0.1907, "rewards/accuracies": 0.9375, "rewards/chosen": -3.7355358600616455, "rewards/margins": 8.41942024230957, "rewards/margins_max": 12.90966796875, "rewards/margins_min": 3.9291725158691406, "rewards/margins_std": 6.3501691818237305, "rewards/rejected": -12.15495491027832, "step": 3950 }, { "epoch": 1.0, "grad_norm": 1.21875, "learning_rate": 7.563543797717287e-11, "logits/chosen": 0.5077834129333496, "logits/rejected": 1.067440390586853, "logps/chosen": -619.56005859375, "logps/rejected": -1299.0670166015625, "loss": 0.1767, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -4.131485939025879, "rewards/margins": 6.871898651123047, "rewards/margins_max": 9.76762580871582, "rewards/margins_min": 3.9761710166931152, "rewards/margins_std": 4.09517765045166, "rewards/rejected": -11.003384590148926, "step": 3960 }, { "epoch": 1.0, "grad_norm": 0.490234375, "learning_rate": 6.174392948143925e-12, "logits/chosen": 0.5018728971481323, "logits/rejected": 1.0147392749786377, "logps/chosen": -592.7308349609375, "logps/rejected": -1142.571044921875, "loss": 0.198, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -3.673344373703003, "rewards/margins": 5.651946067810059, "rewards/margins_max": 9.167214393615723, "rewards/margins_min": 2.1366782188415527, "rewards/margins_std": 4.971339702606201, "rewards/rejected": -9.325291633605957, "step": 3970 }, { "epoch": 1.0, "eval_logits/chosen": 0.8492512106895447, "eval_logits/rejected": 1.0155344009399414, "eval_logps/chosen": -616.4664916992188, "eval_logps/rejected": -637.1886596679688, "eval_loss": 0.7862498164176941, "eval_rewards/accuracies": 0.5755000114440918, "eval_rewards/chosen": -2.7944495677948, "eval_rewards/margins": 0.39339083433151245, "eval_rewards/margins_max": 3.4648597240448, "eval_rewards/margins_min": -1.876849889755249, "eval_rewards/margins_std": 1.728664517402649, "eval_rewards/rejected": -3.187840461730957, "eval_runtime": 2500.5294, "eval_samples_per_second": 4.799, "eval_steps_per_second": 0.3, "step": 3974 }, { "epoch": 1.0, "step": 3974, "total_flos": 0.0, "train_loss": 0.27614202823210554, "train_runtime": 32543.5484, "train_samples_per_second": 1.954, "train_steps_per_second": 0.122 } ], "logging_steps": 10, "max_steps": 3974, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 100, "total_flos": 0.0, "train_batch_size": 2, "trial_name": null, "trial_params": null }