diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,56325 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 20.0, + "eval_steps": 500, + "global_step": 80200, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0024937655860349127, + "grad_norm": NaN, + "learning_rate": 1.9998753117206985e-05, + "loss": 7.3204, + "step": 10 + }, + { + "epoch": 0.004987531172069825, + "grad_norm": 30.45345687866211, + "learning_rate": 1.999625935162095e-05, + "loss": 6.3759, + "step": 20 + }, + { + "epoch": 0.007481296758104738, + "grad_norm": 52.34556579589844, + "learning_rate": 1.9993765586034916e-05, + "loss": 5.2081, + "step": 30 + }, + { + "epoch": 0.00997506234413965, + "grad_norm": 33.23439025878906, + "learning_rate": 1.999127182044888e-05, + "loss": 3.7536, + "step": 40 + }, + { + "epoch": 0.012468827930174564, + "grad_norm": 19.338409423828125, + "learning_rate": 1.9988778054862846e-05, + "loss": 2.4798, + "step": 50 + }, + { + "epoch": 0.014962593516209476, + "grad_norm": 5.575465679168701, + "learning_rate": 1.998628428927681e-05, + "loss": 1.4935, + "step": 60 + }, + { + "epoch": 0.017456359102244388, + "grad_norm": 1.394232153892517, + "learning_rate": 1.9983790523690774e-05, + "loss": 1.0447, + "step": 70 + }, + { + "epoch": 0.0199501246882793, + "grad_norm": 2.6528663635253906, + "learning_rate": 1.998129675810474e-05, + "loss": 1.1943, + "step": 80 + }, + { + "epoch": 0.022443890274314215, + "grad_norm": 2.9787819385528564, + "learning_rate": 1.9978802992518704e-05, + "loss": 1.2081, + "step": 90 + }, + { + "epoch": 0.02493765586034913, + "grad_norm": 1.774791955947876, + "learning_rate": 1.997630922693267e-05, + "loss": 0.9248, + "step": 100 + }, + { + "epoch": 0.02743142144638404, + "grad_norm": 1.5466320514678955, + "learning_rate": 1.9973815461346635e-05, + "loss": 1.0454, + "step": 110 + }, + { + "epoch": 0.029925187032418952, + "grad_norm": 2.4883105754852295, + "learning_rate": 1.99713216957606e-05, + "loss": 1.1251, + "step": 120 + }, + { + "epoch": 0.032418952618453865, + "grad_norm": 2.3588509559631348, + "learning_rate": 1.9968827930174565e-05, + "loss": 1.178, + "step": 130 + }, + { + "epoch": 0.034912718204488775, + "grad_norm": 2.097666025161743, + "learning_rate": 1.996633416458853e-05, + "loss": 0.937, + "step": 140 + }, + { + "epoch": 0.03740648379052369, + "grad_norm": 2.220046043395996, + "learning_rate": 1.9963840399002496e-05, + "loss": 0.9336, + "step": 150 + }, + { + "epoch": 0.0399002493765586, + "grad_norm": 1.8168127536773682, + "learning_rate": 1.996134663341646e-05, + "loss": 0.8292, + "step": 160 + }, + { + "epoch": 0.04239401496259352, + "grad_norm": 2.687063217163086, + "learning_rate": 1.9958852867830423e-05, + "loss": 0.9569, + "step": 170 + }, + { + "epoch": 0.04488778054862843, + "grad_norm": 2.103942632675171, + "learning_rate": 1.995635910224439e-05, + "loss": 0.7538, + "step": 180 + }, + { + "epoch": 0.04738154613466334, + "grad_norm": 2.371488571166992, + "learning_rate": 1.9953865336658357e-05, + "loss": 0.8276, + "step": 190 + }, + { + "epoch": 0.04987531172069826, + "grad_norm": 3.574516534805298, + "learning_rate": 1.995137157107232e-05, + "loss": 1.021, + "step": 200 + }, + { + "epoch": 0.05236907730673317, + "grad_norm": 3.9193525314331055, + "learning_rate": 1.9948877805486288e-05, + "loss": 0.8487, + "step": 210 + }, + { + "epoch": 0.05486284289276808, + "grad_norm": 2.934713363647461, + "learning_rate": 1.994638403990025e-05, + "loss": 0.7093, + "step": 220 + }, + { + "epoch": 0.057356608478802994, + "grad_norm": 2.5535857677459717, + "learning_rate": 1.994389027431422e-05, + "loss": 0.8764, + "step": 230 + }, + { + "epoch": 0.059850374064837904, + "grad_norm": 2.949345827102661, + "learning_rate": 1.9941396508728182e-05, + "loss": 0.8242, + "step": 240 + }, + { + "epoch": 0.06234413965087282, + "grad_norm": 2.376368522644043, + "learning_rate": 1.9938902743142146e-05, + "loss": 0.8442, + "step": 250 + }, + { + "epoch": 0.06483790523690773, + "grad_norm": 5.437434673309326, + "learning_rate": 1.9936408977556113e-05, + "loss": 1.0203, + "step": 260 + }, + { + "epoch": 0.06733167082294264, + "grad_norm": 2.630307674407959, + "learning_rate": 1.9933915211970076e-05, + "loss": 0.6734, + "step": 270 + }, + { + "epoch": 0.06982543640897755, + "grad_norm": 4.224475860595703, + "learning_rate": 1.993142144638404e-05, + "loss": 0.9501, + "step": 280 + }, + { + "epoch": 0.07231920199501247, + "grad_norm": 4.720646858215332, + "learning_rate": 1.9928927680798007e-05, + "loss": 0.6527, + "step": 290 + }, + { + "epoch": 0.07481296758104738, + "grad_norm": 4.588720321655273, + "learning_rate": 1.992643391521197e-05, + "loss": 0.7645, + "step": 300 + }, + { + "epoch": 0.0773067331670823, + "grad_norm": 2.956965923309326, + "learning_rate": 1.9923940149625938e-05, + "loss": 0.8471, + "step": 310 + }, + { + "epoch": 0.0798004987531172, + "grad_norm": 2.415419816970825, + "learning_rate": 1.99214463840399e-05, + "loss": 0.8739, + "step": 320 + }, + { + "epoch": 0.08229426433915212, + "grad_norm": 4.1891188621521, + "learning_rate": 1.9918952618453865e-05, + "loss": 0.8344, + "step": 330 + }, + { + "epoch": 0.08478802992518704, + "grad_norm": 4.513364315032959, + "learning_rate": 1.9916458852867832e-05, + "loss": 0.7266, + "step": 340 + }, + { + "epoch": 0.08728179551122195, + "grad_norm": 3.10957670211792, + "learning_rate": 1.9913965087281796e-05, + "loss": 0.8771, + "step": 350 + }, + { + "epoch": 0.08977556109725686, + "grad_norm": 2.839132070541382, + "learning_rate": 1.9911471321695763e-05, + "loss": 0.5522, + "step": 360 + }, + { + "epoch": 0.09226932668329177, + "grad_norm": 7.818932056427002, + "learning_rate": 1.9908977556109726e-05, + "loss": 0.7509, + "step": 370 + }, + { + "epoch": 0.09476309226932668, + "grad_norm": 3.391204357147217, + "learning_rate": 1.9906483790523693e-05, + "loss": 0.8433, + "step": 380 + }, + { + "epoch": 0.09725685785536159, + "grad_norm": 5.5061726570129395, + "learning_rate": 1.9903990024937657e-05, + "loss": 1.0054, + "step": 390 + }, + { + "epoch": 0.09975062344139651, + "grad_norm": 3.288151979446411, + "learning_rate": 1.9901496259351624e-05, + "loss": 0.563, + "step": 400 + }, + { + "epoch": 0.10224438902743142, + "grad_norm": 2.599865674972534, + "learning_rate": 1.9899002493765587e-05, + "loss": 0.5838, + "step": 410 + }, + { + "epoch": 0.10473815461346633, + "grad_norm": 10.21823787689209, + "learning_rate": 1.9896508728179554e-05, + "loss": 0.8896, + "step": 420 + }, + { + "epoch": 0.10723192019950124, + "grad_norm": 3.7210872173309326, + "learning_rate": 1.9894014962593518e-05, + "loss": 0.7539, + "step": 430 + }, + { + "epoch": 0.10972568578553615, + "grad_norm": 3.3652985095977783, + "learning_rate": 1.9891521197007485e-05, + "loss": 0.6975, + "step": 440 + }, + { + "epoch": 0.11221945137157108, + "grad_norm": 4.095602512359619, + "learning_rate": 1.988902743142145e-05, + "loss": 0.7403, + "step": 450 + }, + { + "epoch": 0.11471321695760599, + "grad_norm": 4.6530561447143555, + "learning_rate": 1.9886533665835412e-05, + "loss": 0.6642, + "step": 460 + }, + { + "epoch": 0.1172069825436409, + "grad_norm": 3.8766672611236572, + "learning_rate": 1.988403990024938e-05, + "loss": 0.6265, + "step": 470 + }, + { + "epoch": 0.11970074812967581, + "grad_norm": 6.619718551635742, + "learning_rate": 1.9881546134663343e-05, + "loss": 0.6444, + "step": 480 + }, + { + "epoch": 0.12219451371571072, + "grad_norm": 2.7176246643066406, + "learning_rate": 1.9879052369077306e-05, + "loss": 0.6325, + "step": 490 + }, + { + "epoch": 0.12468827930174564, + "grad_norm": 4.887167930603027, + "learning_rate": 1.9876558603491273e-05, + "loss": 0.5515, + "step": 500 + }, + { + "epoch": 0.12718204488778054, + "grad_norm": 3.827092170715332, + "learning_rate": 1.9874064837905237e-05, + "loss": 0.8736, + "step": 510 + }, + { + "epoch": 0.12967581047381546, + "grad_norm": 3.9637207984924316, + "learning_rate": 1.9871571072319204e-05, + "loss": 0.6267, + "step": 520 + }, + { + "epoch": 0.13216957605985039, + "grad_norm": 4.160803318023682, + "learning_rate": 1.9869077306733168e-05, + "loss": 0.6462, + "step": 530 + }, + { + "epoch": 0.13466334164588528, + "grad_norm": 4.465733528137207, + "learning_rate": 1.9866583541147135e-05, + "loss": 0.488, + "step": 540 + }, + { + "epoch": 0.1371571072319202, + "grad_norm": 3.197251081466675, + "learning_rate": 1.98640897755611e-05, + "loss": 0.7185, + "step": 550 + }, + { + "epoch": 0.1396508728179551, + "grad_norm": 3.279279947280884, + "learning_rate": 1.9861596009975065e-05, + "loss": 0.599, + "step": 560 + }, + { + "epoch": 0.14214463840399003, + "grad_norm": 7.450759410858154, + "learning_rate": 1.985910224438903e-05, + "loss": 0.7233, + "step": 570 + }, + { + "epoch": 0.14463840399002495, + "grad_norm": 4.170207500457764, + "learning_rate": 1.9856608478802996e-05, + "loss": 0.5774, + "step": 580 + }, + { + "epoch": 0.14713216957605985, + "grad_norm": 9.313362121582031, + "learning_rate": 1.985411471321696e-05, + "loss": 0.8636, + "step": 590 + }, + { + "epoch": 0.14962593516209477, + "grad_norm": 4.21543550491333, + "learning_rate": 1.9851620947630927e-05, + "loss": 0.648, + "step": 600 + }, + { + "epoch": 0.15211970074812967, + "grad_norm": 7.928882122039795, + "learning_rate": 1.984912718204489e-05, + "loss": 0.6985, + "step": 610 + }, + { + "epoch": 0.1546134663341646, + "grad_norm": 3.2070388793945312, + "learning_rate": 1.9846633416458854e-05, + "loss": 0.5127, + "step": 620 + }, + { + "epoch": 0.1571072319201995, + "grad_norm": 4.8116278648376465, + "learning_rate": 1.984413965087282e-05, + "loss": 0.6521, + "step": 630 + }, + { + "epoch": 0.1596009975062344, + "grad_norm": 6.319545269012451, + "learning_rate": 1.9841645885286784e-05, + "loss": 0.5693, + "step": 640 + }, + { + "epoch": 0.16209476309226933, + "grad_norm": 3.7397208213806152, + "learning_rate": 1.983915211970075e-05, + "loss": 0.6335, + "step": 650 + }, + { + "epoch": 0.16458852867830423, + "grad_norm": 4.853140830993652, + "learning_rate": 1.9836658354114715e-05, + "loss": 0.7663, + "step": 660 + }, + { + "epoch": 0.16708229426433915, + "grad_norm": 3.868441104888916, + "learning_rate": 1.983416458852868e-05, + "loss": 0.6261, + "step": 670 + }, + { + "epoch": 0.16957605985037408, + "grad_norm": 5.102773189544678, + "learning_rate": 1.9831670822942646e-05, + "loss": 0.7153, + "step": 680 + }, + { + "epoch": 0.17206982543640897, + "grad_norm": 4.242384433746338, + "learning_rate": 1.982917705735661e-05, + "loss": 0.6275, + "step": 690 + }, + { + "epoch": 0.1745635910224439, + "grad_norm": 3.3436532020568848, + "learning_rate": 1.9826683291770573e-05, + "loss": 0.5709, + "step": 700 + }, + { + "epoch": 0.1770573566084788, + "grad_norm": 5.803040027618408, + "learning_rate": 1.982418952618454e-05, + "loss": 0.5584, + "step": 710 + }, + { + "epoch": 0.17955112219451372, + "grad_norm": 5.347147464752197, + "learning_rate": 1.9821695760598503e-05, + "loss": 0.5785, + "step": 720 + }, + { + "epoch": 0.18204488778054864, + "grad_norm": 4.12513542175293, + "learning_rate": 1.981920199501247e-05, + "loss": 0.6152, + "step": 730 + }, + { + "epoch": 0.18453865336658354, + "grad_norm": 5.8045220375061035, + "learning_rate": 1.9816708229426434e-05, + "loss": 0.6319, + "step": 740 + }, + { + "epoch": 0.18703241895261846, + "grad_norm": 8.319589614868164, + "learning_rate": 1.98142144638404e-05, + "loss": 0.5286, + "step": 750 + }, + { + "epoch": 0.18952618453865336, + "grad_norm": 3.427260637283325, + "learning_rate": 1.9811720698254365e-05, + "loss": 0.6325, + "step": 760 + }, + { + "epoch": 0.19201995012468828, + "grad_norm": 12.969819068908691, + "learning_rate": 1.9809226932668332e-05, + "loss": 0.6051, + "step": 770 + }, + { + "epoch": 0.19451371571072318, + "grad_norm": 4.117243766784668, + "learning_rate": 1.9806733167082295e-05, + "loss": 0.7746, + "step": 780 + }, + { + "epoch": 0.1970074812967581, + "grad_norm": 8.8555326461792, + "learning_rate": 1.9804239401496262e-05, + "loss": 0.6756, + "step": 790 + }, + { + "epoch": 0.19950124688279303, + "grad_norm": 6.0134596824646, + "learning_rate": 1.9801745635910226e-05, + "loss": 0.479, + "step": 800 + }, + { + "epoch": 0.20199501246882792, + "grad_norm": 3.967231273651123, + "learning_rate": 1.9799251870324193e-05, + "loss": 0.5358, + "step": 810 + }, + { + "epoch": 0.20448877805486285, + "grad_norm": 3.7455625534057617, + "learning_rate": 1.9796758104738157e-05, + "loss": 0.5306, + "step": 820 + }, + { + "epoch": 0.20698254364089774, + "grad_norm": 4.493707656860352, + "learning_rate": 1.979426433915212e-05, + "loss": 0.6243, + "step": 830 + }, + { + "epoch": 0.20947630922693267, + "grad_norm": 4.1963653564453125, + "learning_rate": 1.9791770573566087e-05, + "loss": 0.6031, + "step": 840 + }, + { + "epoch": 0.2119700748129676, + "grad_norm": 3.4451446533203125, + "learning_rate": 1.978927680798005e-05, + "loss": 0.5625, + "step": 850 + }, + { + "epoch": 0.2144638403990025, + "grad_norm": 10.554472923278809, + "learning_rate": 1.9786783042394014e-05, + "loss": 0.7107, + "step": 860 + }, + { + "epoch": 0.2169576059850374, + "grad_norm": 6.821500778198242, + "learning_rate": 1.978428927680798e-05, + "loss": 0.5901, + "step": 870 + }, + { + "epoch": 0.2194513715710723, + "grad_norm": 8.539198875427246, + "learning_rate": 1.9781795511221945e-05, + "loss": 0.6157, + "step": 880 + }, + { + "epoch": 0.22194513715710723, + "grad_norm": 5.176815032958984, + "learning_rate": 1.9779301745635912e-05, + "loss": 0.5484, + "step": 890 + }, + { + "epoch": 0.22443890274314215, + "grad_norm": 7.525144577026367, + "learning_rate": 1.9776807980049876e-05, + "loss": 0.6059, + "step": 900 + }, + { + "epoch": 0.22693266832917705, + "grad_norm": 6.711145401000977, + "learning_rate": 1.9774314214463843e-05, + "loss": 0.6239, + "step": 910 + }, + { + "epoch": 0.22942643391521197, + "grad_norm": 4.520840644836426, + "learning_rate": 1.9771820448877806e-05, + "loss": 0.5336, + "step": 920 + }, + { + "epoch": 0.23192019950124687, + "grad_norm": 11.031719207763672, + "learning_rate": 1.9769326683291773e-05, + "loss": 0.5649, + "step": 930 + }, + { + "epoch": 0.2344139650872818, + "grad_norm": 7.578179836273193, + "learning_rate": 1.9766832917705737e-05, + "loss": 0.6233, + "step": 940 + }, + { + "epoch": 0.23690773067331672, + "grad_norm": 4.446279048919678, + "learning_rate": 1.9764339152119704e-05, + "loss": 0.6529, + "step": 950 + }, + { + "epoch": 0.23940149625935161, + "grad_norm": 5.39076042175293, + "learning_rate": 1.9761845386533668e-05, + "loss": 0.6355, + "step": 960 + }, + { + "epoch": 0.24189526184538654, + "grad_norm": 5.869710445404053, + "learning_rate": 1.9759351620947635e-05, + "loss": 0.6045, + "step": 970 + }, + { + "epoch": 0.24438902743142144, + "grad_norm": 4.218006610870361, + "learning_rate": 1.9756857855361598e-05, + "loss": 0.5569, + "step": 980 + }, + { + "epoch": 0.24688279301745636, + "grad_norm": 4.749696731567383, + "learning_rate": 1.9754364089775562e-05, + "loss": 0.5298, + "step": 990 + }, + { + "epoch": 0.24937655860349128, + "grad_norm": 7.573767185211182, + "learning_rate": 1.975187032418953e-05, + "loss": 0.5561, + "step": 1000 + }, + { + "epoch": 0.2518703241895262, + "grad_norm": 5.48563289642334, + "learning_rate": 1.9749376558603492e-05, + "loss": 0.5115, + "step": 1010 + }, + { + "epoch": 0.2543640897755611, + "grad_norm": 6.635676383972168, + "learning_rate": 1.974688279301746e-05, + "loss": 0.6161, + "step": 1020 + }, + { + "epoch": 0.256857855361596, + "grad_norm": 4.294478893280029, + "learning_rate": 1.9744389027431423e-05, + "loss": 0.5136, + "step": 1030 + }, + { + "epoch": 0.2593516209476309, + "grad_norm": 2.8301312923431396, + "learning_rate": 1.9741895261845387e-05, + "loss": 0.451, + "step": 1040 + }, + { + "epoch": 0.26184538653366585, + "grad_norm": 4.692950248718262, + "learning_rate": 1.9739401496259354e-05, + "loss": 0.5172, + "step": 1050 + }, + { + "epoch": 0.26433915211970077, + "grad_norm": 9.1524076461792, + "learning_rate": 1.9736907730673317e-05, + "loss": 0.7668, + "step": 1060 + }, + { + "epoch": 0.26683291770573564, + "grad_norm": 3.0480797290802, + "learning_rate": 1.973441396508728e-05, + "loss": 0.5275, + "step": 1070 + }, + { + "epoch": 0.26932668329177056, + "grad_norm": 3.7727952003479004, + "learning_rate": 1.9731920199501248e-05, + "loss": 0.6045, + "step": 1080 + }, + { + "epoch": 0.2718204488778055, + "grad_norm": 3.7919719219207764, + "learning_rate": 1.972942643391521e-05, + "loss": 0.568, + "step": 1090 + }, + { + "epoch": 0.2743142144638404, + "grad_norm": 4.75301456451416, + "learning_rate": 1.972693266832918e-05, + "loss": 0.567, + "step": 1100 + }, + { + "epoch": 0.27680798004987534, + "grad_norm": 3.6362855434417725, + "learning_rate": 1.9724438902743142e-05, + "loss": 0.5015, + "step": 1110 + }, + { + "epoch": 0.2793017456359102, + "grad_norm": 13.187034606933594, + "learning_rate": 1.972194513715711e-05, + "loss": 0.5433, + "step": 1120 + }, + { + "epoch": 0.2817955112219451, + "grad_norm": 4.809338092803955, + "learning_rate": 1.9719451371571076e-05, + "loss": 0.5669, + "step": 1130 + }, + { + "epoch": 0.28428927680798005, + "grad_norm": 3.7646372318267822, + "learning_rate": 1.971695760598504e-05, + "loss": 0.5286, + "step": 1140 + }, + { + "epoch": 0.286783042394015, + "grad_norm": 3.752352237701416, + "learning_rate": 1.9714463840399007e-05, + "loss": 0.5286, + "step": 1150 + }, + { + "epoch": 0.2892768079800499, + "grad_norm": 4.207645416259766, + "learning_rate": 1.971197007481297e-05, + "loss": 0.5052, + "step": 1160 + }, + { + "epoch": 0.29177057356608477, + "grad_norm": 3.126208782196045, + "learning_rate": 1.9709476309226934e-05, + "loss": 0.5188, + "step": 1170 + }, + { + "epoch": 0.2942643391521197, + "grad_norm": 4.613558769226074, + "learning_rate": 1.97069825436409e-05, + "loss": 0.5405, + "step": 1180 + }, + { + "epoch": 0.2967581047381546, + "grad_norm": 4.35943603515625, + "learning_rate": 1.9704488778054865e-05, + "loss": 0.4617, + "step": 1190 + }, + { + "epoch": 0.29925187032418954, + "grad_norm": 3.4332399368286133, + "learning_rate": 1.9701995012468828e-05, + "loss": 0.5354, + "step": 1200 + }, + { + "epoch": 0.30174563591022446, + "grad_norm": 3.738420009613037, + "learning_rate": 1.9699501246882795e-05, + "loss": 0.4887, + "step": 1210 + }, + { + "epoch": 0.30423940149625933, + "grad_norm": 6.625039577484131, + "learning_rate": 1.969700748129676e-05, + "loss": 0.5395, + "step": 1220 + }, + { + "epoch": 0.30673316708229426, + "grad_norm": 3.6015474796295166, + "learning_rate": 1.9694513715710726e-05, + "loss": 0.4949, + "step": 1230 + }, + { + "epoch": 0.3092269326683292, + "grad_norm": 9.316793441772461, + "learning_rate": 1.969201995012469e-05, + "loss": 0.6079, + "step": 1240 + }, + { + "epoch": 0.3117206982543641, + "grad_norm": 10.953588485717773, + "learning_rate": 1.9689526184538653e-05, + "loss": 0.524, + "step": 1250 + }, + { + "epoch": 0.314214463840399, + "grad_norm": 5.9117512702941895, + "learning_rate": 1.968703241895262e-05, + "loss": 0.6395, + "step": 1260 + }, + { + "epoch": 0.3167082294264339, + "grad_norm": 3.799406051635742, + "learning_rate": 1.9684538653366584e-05, + "loss": 0.5604, + "step": 1270 + }, + { + "epoch": 0.3192019950124688, + "grad_norm": 5.208715915679932, + "learning_rate": 1.968204488778055e-05, + "loss": 0.5423, + "step": 1280 + }, + { + "epoch": 0.32169576059850374, + "grad_norm": 8.043681144714355, + "learning_rate": 1.9679551122194514e-05, + "loss": 0.5482, + "step": 1290 + }, + { + "epoch": 0.32418952618453867, + "grad_norm": 4.668685436248779, + "learning_rate": 1.967705735660848e-05, + "loss": 0.5868, + "step": 1300 + }, + { + "epoch": 0.3266832917705736, + "grad_norm": 5.136122703552246, + "learning_rate": 1.9674563591022445e-05, + "loss": 0.5217, + "step": 1310 + }, + { + "epoch": 0.32917705735660846, + "grad_norm": 3.8749747276306152, + "learning_rate": 1.9672069825436412e-05, + "loss": 0.48, + "step": 1320 + }, + { + "epoch": 0.3316708229426434, + "grad_norm": 2.9240078926086426, + "learning_rate": 1.9669576059850376e-05, + "loss": 0.6384, + "step": 1330 + }, + { + "epoch": 0.3341645885286783, + "grad_norm": 3.375347852706909, + "learning_rate": 1.9667082294264343e-05, + "loss": 0.5041, + "step": 1340 + }, + { + "epoch": 0.33665835411471323, + "grad_norm": 3.96232008934021, + "learning_rate": 1.9664588528678306e-05, + "loss": 0.4538, + "step": 1350 + }, + { + "epoch": 0.33915211970074816, + "grad_norm": 16.26675796508789, + "learning_rate": 1.966209476309227e-05, + "loss": 0.4686, + "step": 1360 + }, + { + "epoch": 0.341645885286783, + "grad_norm": 6.03434944152832, + "learning_rate": 1.9659600997506237e-05, + "loss": 0.5372, + "step": 1370 + }, + { + "epoch": 0.34413965087281795, + "grad_norm": 4.742402076721191, + "learning_rate": 1.96571072319202e-05, + "loss": 0.4793, + "step": 1380 + }, + { + "epoch": 0.34663341645885287, + "grad_norm": 4.454680442810059, + "learning_rate": 1.9654613466334167e-05, + "loss": 0.4901, + "step": 1390 + }, + { + "epoch": 0.3491271820448878, + "grad_norm": 4.6647796630859375, + "learning_rate": 1.965211970074813e-05, + "loss": 0.5023, + "step": 1400 + }, + { + "epoch": 0.3516209476309227, + "grad_norm": 5.384413719177246, + "learning_rate": 1.9649625935162095e-05, + "loss": 0.5912, + "step": 1410 + }, + { + "epoch": 0.3541147132169576, + "grad_norm": 6.326955795288086, + "learning_rate": 1.964713216957606e-05, + "loss": 0.5661, + "step": 1420 + }, + { + "epoch": 0.3566084788029925, + "grad_norm": 3.840012550354004, + "learning_rate": 1.9644638403990025e-05, + "loss": 0.5107, + "step": 1430 + }, + { + "epoch": 0.35910224438902744, + "grad_norm": 5.383571147918701, + "learning_rate": 1.9642144638403992e-05, + "loss": 0.4882, + "step": 1440 + }, + { + "epoch": 0.36159600997506236, + "grad_norm": 2.377634048461914, + "learning_rate": 1.9639650872817956e-05, + "loss": 0.5656, + "step": 1450 + }, + { + "epoch": 0.3640897755610973, + "grad_norm": 3.8716111183166504, + "learning_rate": 1.963715710723192e-05, + "loss": 0.5097, + "step": 1460 + }, + { + "epoch": 0.36658354114713215, + "grad_norm": 5.310560703277588, + "learning_rate": 1.9634663341645886e-05, + "loss": 0.4505, + "step": 1470 + }, + { + "epoch": 0.3690773067331671, + "grad_norm": 7.158385753631592, + "learning_rate": 1.9632169576059853e-05, + "loss": 0.4722, + "step": 1480 + }, + { + "epoch": 0.371571072319202, + "grad_norm": 3.6055357456207275, + "learning_rate": 1.9629675810473817e-05, + "loss": 0.4542, + "step": 1490 + }, + { + "epoch": 0.3740648379052369, + "grad_norm": 3.826148748397827, + "learning_rate": 1.9627182044887784e-05, + "loss": 0.5559, + "step": 1500 + }, + { + "epoch": 0.3765586034912718, + "grad_norm": 5.25114631652832, + "learning_rate": 1.9624688279301748e-05, + "loss": 0.5079, + "step": 1510 + }, + { + "epoch": 0.3790523690773067, + "grad_norm": 5.912806034088135, + "learning_rate": 1.9622194513715715e-05, + "loss": 0.5651, + "step": 1520 + }, + { + "epoch": 0.38154613466334164, + "grad_norm": 5.626669883728027, + "learning_rate": 1.961970074812968e-05, + "loss": 0.483, + "step": 1530 + }, + { + "epoch": 0.38403990024937656, + "grad_norm": 5.233978271484375, + "learning_rate": 1.9617206982543642e-05, + "loss": 0.472, + "step": 1540 + }, + { + "epoch": 0.3865336658354115, + "grad_norm": 4.9565606117248535, + "learning_rate": 1.961471321695761e-05, + "loss": 0.5137, + "step": 1550 + }, + { + "epoch": 0.38902743142144636, + "grad_norm": 5.3332414627075195, + "learning_rate": 1.9612219451371573e-05, + "loss": 0.5708, + "step": 1560 + }, + { + "epoch": 0.3915211970074813, + "grad_norm": 4.999671459197998, + "learning_rate": 1.9609725685785536e-05, + "loss": 0.5666, + "step": 1570 + }, + { + "epoch": 0.3940149625935162, + "grad_norm": 10.932353019714355, + "learning_rate": 1.9607231920199503e-05, + "loss": 0.5769, + "step": 1580 + }, + { + "epoch": 0.39650872817955113, + "grad_norm": 7.1936211585998535, + "learning_rate": 1.9604738154613467e-05, + "loss": 0.5631, + "step": 1590 + }, + { + "epoch": 0.39900249376558605, + "grad_norm": 6.666138172149658, + "learning_rate": 1.9602244389027434e-05, + "loss": 0.5239, + "step": 1600 + }, + { + "epoch": 0.4014962593516209, + "grad_norm": 3.0620758533477783, + "learning_rate": 1.9599750623441397e-05, + "loss": 0.5028, + "step": 1610 + }, + { + "epoch": 0.40399002493765584, + "grad_norm": 4.261651515960693, + "learning_rate": 1.959725685785536e-05, + "loss": 0.4711, + "step": 1620 + }, + { + "epoch": 0.40648379052369077, + "grad_norm": 3.4371354579925537, + "learning_rate": 1.9594763092269328e-05, + "loss": 0.4729, + "step": 1630 + }, + { + "epoch": 0.4089775561097257, + "grad_norm": 3.691063642501831, + "learning_rate": 1.959226932668329e-05, + "loss": 0.4844, + "step": 1640 + }, + { + "epoch": 0.4114713216957606, + "grad_norm": 4.673091411590576, + "learning_rate": 1.958977556109726e-05, + "loss": 0.5131, + "step": 1650 + }, + { + "epoch": 0.4139650872817955, + "grad_norm": 2.6619250774383545, + "learning_rate": 1.9587281795511222e-05, + "loss": 0.5196, + "step": 1660 + }, + { + "epoch": 0.4164588528678304, + "grad_norm": 6.241903781890869, + "learning_rate": 1.958478802992519e-05, + "loss": 0.471, + "step": 1670 + }, + { + "epoch": 0.41895261845386533, + "grad_norm": 3.603950023651123, + "learning_rate": 1.9582294264339153e-05, + "loss": 0.4476, + "step": 1680 + }, + { + "epoch": 0.42144638403990026, + "grad_norm": 3.6078081130981445, + "learning_rate": 1.957980049875312e-05, + "loss": 0.5308, + "step": 1690 + }, + { + "epoch": 0.4239401496259352, + "grad_norm": 4.073026657104492, + "learning_rate": 1.9577306733167084e-05, + "loss": 0.4973, + "step": 1700 + }, + { + "epoch": 0.42643391521197005, + "grad_norm": 4.738262176513672, + "learning_rate": 1.957481296758105e-05, + "loss": 0.4709, + "step": 1710 + }, + { + "epoch": 0.428927680798005, + "grad_norm": 16.924922943115234, + "learning_rate": 1.9572319201995014e-05, + "loss": 0.5383, + "step": 1720 + }, + { + "epoch": 0.4314214463840399, + "grad_norm": 8.632024765014648, + "learning_rate": 1.956982543640898e-05, + "loss": 0.4519, + "step": 1730 + }, + { + "epoch": 0.4339152119700748, + "grad_norm": 8.484312057495117, + "learning_rate": 1.9567331670822945e-05, + "loss": 0.5642, + "step": 1740 + }, + { + "epoch": 0.43640897755610975, + "grad_norm": 4.789985179901123, + "learning_rate": 1.956483790523691e-05, + "loss": 0.4486, + "step": 1750 + }, + { + "epoch": 0.4389027431421446, + "grad_norm": 4.4060587882995605, + "learning_rate": 1.9562344139650875e-05, + "loss": 0.5041, + "step": 1760 + }, + { + "epoch": 0.44139650872817954, + "grad_norm": 8.840462684631348, + "learning_rate": 1.955985037406484e-05, + "loss": 0.5466, + "step": 1770 + }, + { + "epoch": 0.44389027431421446, + "grad_norm": 3.662015438079834, + "learning_rate": 1.9557356608478803e-05, + "loss": 0.4922, + "step": 1780 + }, + { + "epoch": 0.4463840399002494, + "grad_norm": 5.66322135925293, + "learning_rate": 1.955486284289277e-05, + "loss": 0.4851, + "step": 1790 + }, + { + "epoch": 0.4488778054862843, + "grad_norm": 3.9291810989379883, + "learning_rate": 1.9552369077306733e-05, + "loss": 0.5185, + "step": 1800 + }, + { + "epoch": 0.4513715710723192, + "grad_norm": 4.53109073638916, + "learning_rate": 1.95498753117207e-05, + "loss": 0.5114, + "step": 1810 + }, + { + "epoch": 0.4538653366583541, + "grad_norm": 3.999675750732422, + "learning_rate": 1.9547381546134664e-05, + "loss": 0.4159, + "step": 1820 + }, + { + "epoch": 0.456359102244389, + "grad_norm": 8.522893905639648, + "learning_rate": 1.954488778054863e-05, + "loss": 0.74, + "step": 1830 + }, + { + "epoch": 0.45885286783042395, + "grad_norm": 3.3957653045654297, + "learning_rate": 1.9542394014962594e-05, + "loss": 0.5078, + "step": 1840 + }, + { + "epoch": 0.4613466334164589, + "grad_norm": 2.780721664428711, + "learning_rate": 1.953990024937656e-05, + "loss": 0.5148, + "step": 1850 + }, + { + "epoch": 0.46384039900249374, + "grad_norm": 3.9767260551452637, + "learning_rate": 1.9537406483790525e-05, + "loss": 0.5263, + "step": 1860 + }, + { + "epoch": 0.46633416458852867, + "grad_norm": 15.764490127563477, + "learning_rate": 1.9534912718204492e-05, + "loss": 0.4691, + "step": 1870 + }, + { + "epoch": 0.4688279301745636, + "grad_norm": 7.174173831939697, + "learning_rate": 1.9532418952618456e-05, + "loss": 0.5482, + "step": 1880 + }, + { + "epoch": 0.4713216957605985, + "grad_norm": 3.9777376651763916, + "learning_rate": 1.9529925187032423e-05, + "loss": 0.4499, + "step": 1890 + }, + { + "epoch": 0.47381546134663344, + "grad_norm": 4.089155197143555, + "learning_rate": 1.9527431421446386e-05, + "loss": 0.4241, + "step": 1900 + }, + { + "epoch": 0.4763092269326683, + "grad_norm": 3.909698009490967, + "learning_rate": 1.952493765586035e-05, + "loss": 0.5389, + "step": 1910 + }, + { + "epoch": 0.47880299251870323, + "grad_norm": 4.601511478424072, + "learning_rate": 1.9522443890274317e-05, + "loss": 0.5299, + "step": 1920 + }, + { + "epoch": 0.48129675810473815, + "grad_norm": 3.4147753715515137, + "learning_rate": 1.951995012468828e-05, + "loss": 0.4071, + "step": 1930 + }, + { + "epoch": 0.4837905236907731, + "grad_norm": 10.128273963928223, + "learning_rate": 1.9517456359102248e-05, + "loss": 0.5228, + "step": 1940 + }, + { + "epoch": 0.486284289276808, + "grad_norm": 6.3098464012146, + "learning_rate": 1.951496259351621e-05, + "loss": 0.5648, + "step": 1950 + }, + { + "epoch": 0.48877805486284287, + "grad_norm": 9.323540687561035, + "learning_rate": 1.9512468827930175e-05, + "loss": 0.5566, + "step": 1960 + }, + { + "epoch": 0.4912718204488778, + "grad_norm": 2.8909482955932617, + "learning_rate": 1.9509975062344142e-05, + "loss": 0.4574, + "step": 1970 + }, + { + "epoch": 0.4937655860349127, + "grad_norm": 8.425788879394531, + "learning_rate": 1.9507481296758105e-05, + "loss": 0.594, + "step": 1980 + }, + { + "epoch": 0.49625935162094764, + "grad_norm": 5.776350498199463, + "learning_rate": 1.950498753117207e-05, + "loss": 0.4679, + "step": 1990 + }, + { + "epoch": 0.49875311720698257, + "grad_norm": 5.987747669219971, + "learning_rate": 1.9502493765586036e-05, + "loss": 0.5371, + "step": 2000 + }, + { + "epoch": 0.5012468827930174, + "grad_norm": 3.423344373703003, + "learning_rate": 1.95e-05, + "loss": 0.4901, + "step": 2010 + }, + { + "epoch": 0.5037406483790524, + "grad_norm": 3.8344333171844482, + "learning_rate": 1.9497506234413967e-05, + "loss": 0.4937, + "step": 2020 + }, + { + "epoch": 0.5062344139650873, + "grad_norm": 4.415685653686523, + "learning_rate": 1.949501246882793e-05, + "loss": 0.4658, + "step": 2030 + }, + { + "epoch": 0.5087281795511222, + "grad_norm": 4.0992631912231445, + "learning_rate": 1.9492518703241897e-05, + "loss": 0.4264, + "step": 2040 + }, + { + "epoch": 0.5112219451371571, + "grad_norm": 6.2230305671691895, + "learning_rate": 1.949002493765586e-05, + "loss": 0.5018, + "step": 2050 + }, + { + "epoch": 0.513715710723192, + "grad_norm": 7.8946661949157715, + "learning_rate": 1.9487531172069828e-05, + "loss": 0.477, + "step": 2060 + }, + { + "epoch": 0.516209476309227, + "grad_norm": 6.977099418640137, + "learning_rate": 1.948503740648379e-05, + "loss": 0.4574, + "step": 2070 + }, + { + "epoch": 0.5187032418952618, + "grad_norm": 5.954398155212402, + "learning_rate": 1.948254364089776e-05, + "loss": 0.5273, + "step": 2080 + }, + { + "epoch": 0.5211970074812967, + "grad_norm": 5.636336326599121, + "learning_rate": 1.9480049875311722e-05, + "loss": 0.4623, + "step": 2090 + }, + { + "epoch": 0.5236907730673317, + "grad_norm": 7.925063610076904, + "learning_rate": 1.947755610972569e-05, + "loss": 0.4776, + "step": 2100 + }, + { + "epoch": 0.5261845386533666, + "grad_norm": 6.0404276847839355, + "learning_rate": 1.9475062344139653e-05, + "loss": 0.5091, + "step": 2110 + }, + { + "epoch": 0.5286783042394015, + "grad_norm": 5.379130840301514, + "learning_rate": 1.9472568578553616e-05, + "loss": 0.4522, + "step": 2120 + }, + { + "epoch": 0.5311720698254364, + "grad_norm": 5.63524055480957, + "learning_rate": 1.9470074812967583e-05, + "loss": 0.5296, + "step": 2130 + }, + { + "epoch": 0.5336658354114713, + "grad_norm": 3.867295026779175, + "learning_rate": 1.9467581047381547e-05, + "loss": 0.4351, + "step": 2140 + }, + { + "epoch": 0.5361596009975063, + "grad_norm": 10.726486206054688, + "learning_rate": 1.9465087281795514e-05, + "loss": 0.4772, + "step": 2150 + }, + { + "epoch": 0.5386533665835411, + "grad_norm": 5.092080593109131, + "learning_rate": 1.9462593516209478e-05, + "loss": 0.4877, + "step": 2160 + }, + { + "epoch": 0.5411471321695761, + "grad_norm": 4.903257846832275, + "learning_rate": 1.946009975062344e-05, + "loss": 0.5226, + "step": 2170 + }, + { + "epoch": 0.543640897755611, + "grad_norm": 12.123847007751465, + "learning_rate": 1.9457605985037408e-05, + "loss": 0.4826, + "step": 2180 + }, + { + "epoch": 0.5461346633416458, + "grad_norm": 7.251633644104004, + "learning_rate": 1.9455112219451372e-05, + "loss": 0.5337, + "step": 2190 + }, + { + "epoch": 0.5486284289276808, + "grad_norm": 5.906688213348389, + "learning_rate": 1.945261845386534e-05, + "loss": 0.4971, + "step": 2200 + }, + { + "epoch": 0.5511221945137157, + "grad_norm": 16.242473602294922, + "learning_rate": 1.9450124688279302e-05, + "loss": 0.5198, + "step": 2210 + }, + { + "epoch": 0.5536159600997507, + "grad_norm": 5.047711372375488, + "learning_rate": 1.944763092269327e-05, + "loss": 0.5753, + "step": 2220 + }, + { + "epoch": 0.5561097256857855, + "grad_norm": 12.766270637512207, + "learning_rate": 1.9445137157107233e-05, + "loss": 0.4736, + "step": 2230 + }, + { + "epoch": 0.5586034912718204, + "grad_norm": 9.127399444580078, + "learning_rate": 1.94426433915212e-05, + "loss": 0.4879, + "step": 2240 + }, + { + "epoch": 0.5610972568578554, + "grad_norm": 5.662039756774902, + "learning_rate": 1.9440149625935164e-05, + "loss": 0.4147, + "step": 2250 + }, + { + "epoch": 0.5635910224438903, + "grad_norm": 5.060816764831543, + "learning_rate": 1.943765586034913e-05, + "loss": 0.4406, + "step": 2260 + }, + { + "epoch": 0.5660847880299252, + "grad_norm": 7.149324893951416, + "learning_rate": 1.9435162094763094e-05, + "loss": 0.5203, + "step": 2270 + }, + { + "epoch": 0.5685785536159601, + "grad_norm": 3.712177038192749, + "learning_rate": 1.9432668329177058e-05, + "loss": 0.4658, + "step": 2280 + }, + { + "epoch": 0.571072319201995, + "grad_norm": 3.989894151687622, + "learning_rate": 1.9430174563591025e-05, + "loss": 0.4428, + "step": 2290 + }, + { + "epoch": 0.57356608478803, + "grad_norm": 4.171450614929199, + "learning_rate": 1.942768079800499e-05, + "loss": 0.5034, + "step": 2300 + }, + { + "epoch": 0.5760598503740648, + "grad_norm": 5.396224021911621, + "learning_rate": 1.9425187032418956e-05, + "loss": 0.4929, + "step": 2310 + }, + { + "epoch": 0.5785536159600998, + "grad_norm": 5.547377586364746, + "learning_rate": 1.942269326683292e-05, + "loss": 0.4344, + "step": 2320 + }, + { + "epoch": 0.5810473815461347, + "grad_norm": 9.413858413696289, + "learning_rate": 1.9420199501246883e-05, + "loss": 0.4321, + "step": 2330 + }, + { + "epoch": 0.5835411471321695, + "grad_norm": 6.898621082305908, + "learning_rate": 1.941770573566085e-05, + "loss": 0.5428, + "step": 2340 + }, + { + "epoch": 0.5860349127182045, + "grad_norm": 4.387313365936279, + "learning_rate": 1.9415211970074813e-05, + "loss": 0.4786, + "step": 2350 + }, + { + "epoch": 0.5885286783042394, + "grad_norm": 8.502609252929688, + "learning_rate": 1.941271820448878e-05, + "loss": 0.4587, + "step": 2360 + }, + { + "epoch": 0.5910224438902744, + "grad_norm": 5.494476795196533, + "learning_rate": 1.9410224438902744e-05, + "loss": 0.5174, + "step": 2370 + }, + { + "epoch": 0.5935162094763092, + "grad_norm": 4.911831378936768, + "learning_rate": 1.9407730673316708e-05, + "loss": 0.5318, + "step": 2380 + }, + { + "epoch": 0.5960099750623441, + "grad_norm": 3.805800676345825, + "learning_rate": 1.9405236907730675e-05, + "loss": 0.5523, + "step": 2390 + }, + { + "epoch": 0.5985037406483791, + "grad_norm": 7.420222759246826, + "learning_rate": 1.9402743142144638e-05, + "loss": 0.492, + "step": 2400 + }, + { + "epoch": 0.600997506234414, + "grad_norm": 5.9345245361328125, + "learning_rate": 1.9400249376558605e-05, + "loss": 0.4199, + "step": 2410 + }, + { + "epoch": 0.6034912718204489, + "grad_norm": 4.605949878692627, + "learning_rate": 1.9397755610972572e-05, + "loss": 0.5301, + "step": 2420 + }, + { + "epoch": 0.6059850374064838, + "grad_norm": 11.447821617126465, + "learning_rate": 1.9395261845386536e-05, + "loss": 0.5429, + "step": 2430 + }, + { + "epoch": 0.6084788029925187, + "grad_norm": 4.791404724121094, + "learning_rate": 1.9392768079800503e-05, + "loss": 0.5576, + "step": 2440 + }, + { + "epoch": 0.6109725685785536, + "grad_norm": 4.776665210723877, + "learning_rate": 1.9390274314214466e-05, + "loss": 0.4874, + "step": 2450 + }, + { + "epoch": 0.6134663341645885, + "grad_norm": 8.155165672302246, + "learning_rate": 1.938778054862843e-05, + "loss": 0.5492, + "step": 2460 + }, + { + "epoch": 0.6159600997506235, + "grad_norm": 15.007184028625488, + "learning_rate": 1.9385286783042397e-05, + "loss": 0.4631, + "step": 2470 + }, + { + "epoch": 0.6184538653366584, + "grad_norm": 3.208120107650757, + "learning_rate": 1.938279301745636e-05, + "loss": 0.4789, + "step": 2480 + }, + { + "epoch": 0.6209476309226932, + "grad_norm": 5.233438014984131, + "learning_rate": 1.9380299251870324e-05, + "loss": 0.5402, + "step": 2490 + }, + { + "epoch": 0.6234413965087282, + "grad_norm": 8.981165885925293, + "learning_rate": 1.937780548628429e-05, + "loss": 0.491, + "step": 2500 + }, + { + "epoch": 0.6259351620947631, + "grad_norm": 8.05521011352539, + "learning_rate": 1.9375311720698255e-05, + "loss": 0.4621, + "step": 2510 + }, + { + "epoch": 0.628428927680798, + "grad_norm": 3.094724178314209, + "learning_rate": 1.9372817955112222e-05, + "loss": 0.4244, + "step": 2520 + }, + { + "epoch": 0.6309226932668329, + "grad_norm": 5.453985691070557, + "learning_rate": 1.9370324189526186e-05, + "loss": 0.5035, + "step": 2530 + }, + { + "epoch": 0.6334164588528678, + "grad_norm": 4.8853559494018555, + "learning_rate": 1.936783042394015e-05, + "loss": 0.4758, + "step": 2540 + }, + { + "epoch": 0.6359102244389028, + "grad_norm": 4.26847505569458, + "learning_rate": 1.9365336658354116e-05, + "loss": 0.493, + "step": 2550 + }, + { + "epoch": 0.6384039900249376, + "grad_norm": 7.400485038757324, + "learning_rate": 1.936284289276808e-05, + "loss": 0.465, + "step": 2560 + }, + { + "epoch": 0.6408977556109726, + "grad_norm": 4.4760260581970215, + "learning_rate": 1.9360349127182047e-05, + "loss": 0.4429, + "step": 2570 + }, + { + "epoch": 0.6433915211970075, + "grad_norm": 4.798295974731445, + "learning_rate": 1.935785536159601e-05, + "loss": 0.4837, + "step": 2580 + }, + { + "epoch": 0.6458852867830424, + "grad_norm": 11.160234451293945, + "learning_rate": 1.9355361596009977e-05, + "loss": 0.4985, + "step": 2590 + }, + { + "epoch": 0.6483790523690773, + "grad_norm": 6.34352970123291, + "learning_rate": 1.935286783042394e-05, + "loss": 0.5137, + "step": 2600 + }, + { + "epoch": 0.6508728179551122, + "grad_norm": 4.095518589019775, + "learning_rate": 1.9350374064837908e-05, + "loss": 0.4254, + "step": 2610 + }, + { + "epoch": 0.6533665835411472, + "grad_norm": 8.039348602294922, + "learning_rate": 1.934788029925187e-05, + "loss": 0.4655, + "step": 2620 + }, + { + "epoch": 0.655860349127182, + "grad_norm": 4.135050296783447, + "learning_rate": 1.934538653366584e-05, + "loss": 0.5008, + "step": 2630 + }, + { + "epoch": 0.6583541147132169, + "grad_norm": 7.032421112060547, + "learning_rate": 1.9342892768079802e-05, + "loss": 0.4439, + "step": 2640 + }, + { + "epoch": 0.6608478802992519, + "grad_norm": 4.22358512878418, + "learning_rate": 1.934039900249377e-05, + "loss": 0.5356, + "step": 2650 + }, + { + "epoch": 0.6633416458852868, + "grad_norm": 6.675922393798828, + "learning_rate": 1.9337905236907733e-05, + "loss": 0.5038, + "step": 2660 + }, + { + "epoch": 0.6658354114713217, + "grad_norm": 4.241270542144775, + "learning_rate": 1.9335411471321697e-05, + "loss": 0.4468, + "step": 2670 + }, + { + "epoch": 0.6683291770573566, + "grad_norm": 4.54661226272583, + "learning_rate": 1.9332917705735664e-05, + "loss": 0.4758, + "step": 2680 + }, + { + "epoch": 0.6708229426433915, + "grad_norm": 3.0999062061309814, + "learning_rate": 1.9330423940149627e-05, + "loss": 0.566, + "step": 2690 + }, + { + "epoch": 0.6733167082294265, + "grad_norm": 5.668433666229248, + "learning_rate": 1.932793017456359e-05, + "loss": 0.4979, + "step": 2700 + }, + { + "epoch": 0.6758104738154613, + "grad_norm": 7.616546154022217, + "learning_rate": 1.9325436408977558e-05, + "loss": 0.3973, + "step": 2710 + }, + { + "epoch": 0.6783042394014963, + "grad_norm": 4.385749816894531, + "learning_rate": 1.932294264339152e-05, + "loss": 0.5626, + "step": 2720 + }, + { + "epoch": 0.6807980049875312, + "grad_norm": 12.706242561340332, + "learning_rate": 1.932044887780549e-05, + "loss": 0.5379, + "step": 2730 + }, + { + "epoch": 0.683291770573566, + "grad_norm": 4.375461101531982, + "learning_rate": 1.9317955112219452e-05, + "loss": 0.4326, + "step": 2740 + }, + { + "epoch": 0.685785536159601, + "grad_norm": 4.31388521194458, + "learning_rate": 1.931546134663342e-05, + "loss": 0.5309, + "step": 2750 + }, + { + "epoch": 0.6882793017456359, + "grad_norm": 4.78411865234375, + "learning_rate": 1.9312967581047383e-05, + "loss": 0.4068, + "step": 2760 + }, + { + "epoch": 0.6907730673316709, + "grad_norm": 10.348709106445312, + "learning_rate": 1.931047381546135e-05, + "loss": 0.4246, + "step": 2770 + }, + { + "epoch": 0.6932668329177057, + "grad_norm": 3.510700225830078, + "learning_rate": 1.9307980049875313e-05, + "loss": 0.4845, + "step": 2780 + }, + { + "epoch": 0.6957605985037406, + "grad_norm": 4.097596168518066, + "learning_rate": 1.930548628428928e-05, + "loss": 0.5605, + "step": 2790 + }, + { + "epoch": 0.6982543640897756, + "grad_norm": 4.968656539916992, + "learning_rate": 1.9302992518703244e-05, + "loss": 0.467, + "step": 2800 + }, + { + "epoch": 0.7007481296758105, + "grad_norm": 7.420864105224609, + "learning_rate": 1.930049875311721e-05, + "loss": 0.6162, + "step": 2810 + }, + { + "epoch": 0.7032418952618454, + "grad_norm": 7.818281173706055, + "learning_rate": 1.9298004987531174e-05, + "loss": 0.4224, + "step": 2820 + }, + { + "epoch": 0.7057356608478803, + "grad_norm": 6.321879863739014, + "learning_rate": 1.9295511221945138e-05, + "loss": 0.5507, + "step": 2830 + }, + { + "epoch": 0.7082294264339152, + "grad_norm": 4.185654640197754, + "learning_rate": 1.9293017456359105e-05, + "loss": 0.4075, + "step": 2840 + }, + { + "epoch": 0.7107231920199502, + "grad_norm": 4.573608875274658, + "learning_rate": 1.929052369077307e-05, + "loss": 0.5097, + "step": 2850 + }, + { + "epoch": 0.713216957605985, + "grad_norm": 6.115342617034912, + "learning_rate": 1.9288029925187036e-05, + "loss": 0.4963, + "step": 2860 + }, + { + "epoch": 0.71571072319202, + "grad_norm": 4.295200824737549, + "learning_rate": 1.9285536159601e-05, + "loss": 0.4322, + "step": 2870 + }, + { + "epoch": 0.7182044887780549, + "grad_norm": 2.7827277183532715, + "learning_rate": 1.9283042394014963e-05, + "loss": 0.4749, + "step": 2880 + }, + { + "epoch": 0.7206982543640897, + "grad_norm": 4.560075283050537, + "learning_rate": 1.928054862842893e-05, + "loss": 0.4603, + "step": 2890 + }, + { + "epoch": 0.7231920199501247, + "grad_norm": 6.625837326049805, + "learning_rate": 1.9278054862842894e-05, + "loss": 0.4779, + "step": 2900 + }, + { + "epoch": 0.7256857855361596, + "grad_norm": 6.110123634338379, + "learning_rate": 1.9275561097256857e-05, + "loss": 0.5923, + "step": 2910 + }, + { + "epoch": 0.7281795511221946, + "grad_norm": 5.3894429206848145, + "learning_rate": 1.9273067331670824e-05, + "loss": 0.4797, + "step": 2920 + }, + { + "epoch": 0.7306733167082294, + "grad_norm": 6.485326290130615, + "learning_rate": 1.9270573566084788e-05, + "loss": 0.5242, + "step": 2930 + }, + { + "epoch": 0.7331670822942643, + "grad_norm": 8.976933479309082, + "learning_rate": 1.9268079800498755e-05, + "loss": 0.44, + "step": 2940 + }, + { + "epoch": 0.7356608478802993, + "grad_norm": 8.325933456420898, + "learning_rate": 1.926558603491272e-05, + "loss": 0.5028, + "step": 2950 + }, + { + "epoch": 0.7381546134663342, + "grad_norm": 4.603588581085205, + "learning_rate": 1.9263092269326685e-05, + "loss": 0.4457, + "step": 2960 + }, + { + "epoch": 0.7406483790523691, + "grad_norm": 4.730616569519043, + "learning_rate": 1.926059850374065e-05, + "loss": 0.4515, + "step": 2970 + }, + { + "epoch": 0.743142144638404, + "grad_norm": 4.1613264083862305, + "learning_rate": 1.9258104738154616e-05, + "loss": 0.4357, + "step": 2980 + }, + { + "epoch": 0.7456359102244389, + "grad_norm": 4.212600231170654, + "learning_rate": 1.925561097256858e-05, + "loss": 0.4426, + "step": 2990 + }, + { + "epoch": 0.7481296758104738, + "grad_norm": 5.657212257385254, + "learning_rate": 1.9253117206982547e-05, + "loss": 0.4656, + "step": 3000 + }, + { + "epoch": 0.7506234413965087, + "grad_norm": 6.499466419219971, + "learning_rate": 1.925062344139651e-05, + "loss": 0.421, + "step": 3010 + }, + { + "epoch": 0.7531172069825436, + "grad_norm": 4.284633159637451, + "learning_rate": 1.9248129675810477e-05, + "loss": 0.5375, + "step": 3020 + }, + { + "epoch": 0.7556109725685786, + "grad_norm": 4.252184867858887, + "learning_rate": 1.924563591022444e-05, + "loss": 0.4443, + "step": 3030 + }, + { + "epoch": 0.7581047381546134, + "grad_norm": 4.567617416381836, + "learning_rate": 1.9243142144638405e-05, + "loss": 0.4453, + "step": 3040 + }, + { + "epoch": 0.7605985037406484, + "grad_norm": 3.3405144214630127, + "learning_rate": 1.924064837905237e-05, + "loss": 0.432, + "step": 3050 + }, + { + "epoch": 0.7630922693266833, + "grad_norm": 7.207590579986572, + "learning_rate": 1.9238154613466335e-05, + "loss": 0.446, + "step": 3060 + }, + { + "epoch": 0.7655860349127181, + "grad_norm": 7.11346960067749, + "learning_rate": 1.92356608478803e-05, + "loss": 0.4085, + "step": 3070 + }, + { + "epoch": 0.7680798004987531, + "grad_norm": 3.7571468353271484, + "learning_rate": 1.9233167082294266e-05, + "loss": 0.4873, + "step": 3080 + }, + { + "epoch": 0.770573566084788, + "grad_norm": 6.159874439239502, + "learning_rate": 1.923067331670823e-05, + "loss": 0.5184, + "step": 3090 + }, + { + "epoch": 0.773067331670823, + "grad_norm": 6.5550689697265625, + "learning_rate": 1.9228179551122196e-05, + "loss": 0.4748, + "step": 3100 + }, + { + "epoch": 0.7755610972568578, + "grad_norm": 6.619629383087158, + "learning_rate": 1.922568578553616e-05, + "loss": 0.4853, + "step": 3110 + }, + { + "epoch": 0.7780548628428927, + "grad_norm": 5.71950626373291, + "learning_rate": 1.9223192019950127e-05, + "loss": 0.4853, + "step": 3120 + }, + { + "epoch": 0.7805486284289277, + "grad_norm": 4.437851905822754, + "learning_rate": 1.922069825436409e-05, + "loss": 0.4058, + "step": 3130 + }, + { + "epoch": 0.7830423940149626, + "grad_norm": 13.215971946716309, + "learning_rate": 1.9218204488778058e-05, + "loss": 0.51, + "step": 3140 + }, + { + "epoch": 0.7855361596009975, + "grad_norm": 4.856098175048828, + "learning_rate": 1.921571072319202e-05, + "loss": 0.4832, + "step": 3150 + }, + { + "epoch": 0.7880299251870324, + "grad_norm": 6.450615882873535, + "learning_rate": 1.9213216957605988e-05, + "loss": 0.4538, + "step": 3160 + }, + { + "epoch": 0.7905236907730673, + "grad_norm": 7.96874475479126, + "learning_rate": 1.9210723192019952e-05, + "loss": 0.4463, + "step": 3170 + }, + { + "epoch": 0.7930174563591023, + "grad_norm": 5.789658069610596, + "learning_rate": 1.920822942643392e-05, + "loss": 0.3977, + "step": 3180 + }, + { + "epoch": 0.7955112219451371, + "grad_norm": 4.556354999542236, + "learning_rate": 1.9205735660847882e-05, + "loss": 0.5665, + "step": 3190 + }, + { + "epoch": 0.7980049875311721, + "grad_norm": 7.1619181632995605, + "learning_rate": 1.9203241895261846e-05, + "loss": 0.4884, + "step": 3200 + }, + { + "epoch": 0.800498753117207, + "grad_norm": 5.486998081207275, + "learning_rate": 1.9200748129675813e-05, + "loss": 0.5249, + "step": 3210 + }, + { + "epoch": 0.8029925187032418, + "grad_norm": 5.061128616333008, + "learning_rate": 1.9198254364089777e-05, + "loss": 0.5166, + "step": 3220 + }, + { + "epoch": 0.8054862842892768, + "grad_norm": 5.740884780883789, + "learning_rate": 1.9195760598503744e-05, + "loss": 0.4312, + "step": 3230 + }, + { + "epoch": 0.8079800498753117, + "grad_norm": 6.897968769073486, + "learning_rate": 1.9193266832917707e-05, + "loss": 0.4602, + "step": 3240 + }, + { + "epoch": 0.8104738154613467, + "grad_norm": 11.832649230957031, + "learning_rate": 1.919077306733167e-05, + "loss": 0.4456, + "step": 3250 + }, + { + "epoch": 0.8129675810473815, + "grad_norm": 6.845297336578369, + "learning_rate": 1.9188279301745638e-05, + "loss": 0.5935, + "step": 3260 + }, + { + "epoch": 0.8154613466334164, + "grad_norm": 7.5892157554626465, + "learning_rate": 1.91857855361596e-05, + "loss": 0.5575, + "step": 3270 + }, + { + "epoch": 0.8179551122194514, + "grad_norm": 5.861032009124756, + "learning_rate": 1.9183291770573565e-05, + "loss": 0.4618, + "step": 3280 + }, + { + "epoch": 0.8204488778054863, + "grad_norm": 7.565516471862793, + "learning_rate": 1.9180798004987532e-05, + "loss": 0.5081, + "step": 3290 + }, + { + "epoch": 0.8229426433915212, + "grad_norm": 3.7106189727783203, + "learning_rate": 1.9178304239401496e-05, + "loss": 0.4926, + "step": 3300 + }, + { + "epoch": 0.8254364089775561, + "grad_norm": 4.886735916137695, + "learning_rate": 1.9175810473815463e-05, + "loss": 0.491, + "step": 3310 + }, + { + "epoch": 0.827930174563591, + "grad_norm": 4.354279041290283, + "learning_rate": 1.9173316708229426e-05, + "loss": 0.3565, + "step": 3320 + }, + { + "epoch": 0.830423940149626, + "grad_norm": 6.275864601135254, + "learning_rate": 1.9170822942643393e-05, + "loss": 0.4157, + "step": 3330 + }, + { + "epoch": 0.8329177057356608, + "grad_norm": 5.567061901092529, + "learning_rate": 1.916832917705736e-05, + "loss": 0.5054, + "step": 3340 + }, + { + "epoch": 0.8354114713216958, + "grad_norm": 5.522775650024414, + "learning_rate": 1.9165835411471324e-05, + "loss": 0.504, + "step": 3350 + }, + { + "epoch": 0.8379052369077307, + "grad_norm": 4.974456310272217, + "learning_rate": 1.916334164588529e-05, + "loss": 0.4118, + "step": 3360 + }, + { + "epoch": 0.8403990024937655, + "grad_norm": 3.5197205543518066, + "learning_rate": 1.9160847880299255e-05, + "loss": 0.4819, + "step": 3370 + }, + { + "epoch": 0.8428927680798005, + "grad_norm": 4.410624027252197, + "learning_rate": 1.9158354114713218e-05, + "loss": 0.583, + "step": 3380 + }, + { + "epoch": 0.8453865336658354, + "grad_norm": 6.812956809997559, + "learning_rate": 1.9155860349127185e-05, + "loss": 0.463, + "step": 3390 + }, + { + "epoch": 0.8478802992518704, + "grad_norm": 17.613037109375, + "learning_rate": 1.915336658354115e-05, + "loss": 0.4853, + "step": 3400 + }, + { + "epoch": 0.8503740648379052, + "grad_norm": 6.056275844573975, + "learning_rate": 1.9150872817955113e-05, + "loss": 0.5459, + "step": 3410 + }, + { + "epoch": 0.8528678304239401, + "grad_norm": 4.816941261291504, + "learning_rate": 1.914837905236908e-05, + "loss": 0.4013, + "step": 3420 + }, + { + "epoch": 0.8553615960099751, + "grad_norm": 5.128777027130127, + "learning_rate": 1.9145885286783043e-05, + "loss": 0.47, + "step": 3430 + }, + { + "epoch": 0.85785536159601, + "grad_norm": 5.4511566162109375, + "learning_rate": 1.914339152119701e-05, + "loss": 0.4487, + "step": 3440 + }, + { + "epoch": 0.8603491271820449, + "grad_norm": 7.571670055389404, + "learning_rate": 1.9140897755610974e-05, + "loss": 0.3869, + "step": 3450 + }, + { + "epoch": 0.8628428927680798, + "grad_norm": 5.735408782958984, + "learning_rate": 1.9138403990024937e-05, + "loss": 0.4186, + "step": 3460 + }, + { + "epoch": 0.8653366583541147, + "grad_norm": 5.8194475173950195, + "learning_rate": 1.9135910224438904e-05, + "loss": 0.5033, + "step": 3470 + }, + { + "epoch": 0.8678304239401496, + "grad_norm": 6.551326274871826, + "learning_rate": 1.9133416458852868e-05, + "loss": 0.4501, + "step": 3480 + }, + { + "epoch": 0.8703241895261845, + "grad_norm": 7.928218841552734, + "learning_rate": 1.9130922693266835e-05, + "loss": 0.492, + "step": 3490 + }, + { + "epoch": 0.8728179551122195, + "grad_norm": 4.229366302490234, + "learning_rate": 1.91284289276808e-05, + "loss": 0.375, + "step": 3500 + }, + { + "epoch": 0.8753117206982544, + "grad_norm": 7.293509006500244, + "learning_rate": 1.9125935162094766e-05, + "loss": 0.4393, + "step": 3510 + }, + { + "epoch": 0.8778054862842892, + "grad_norm": 6.139492511749268, + "learning_rate": 1.912344139650873e-05, + "loss": 0.3927, + "step": 3520 + }, + { + "epoch": 0.8802992518703242, + "grad_norm": 7.048768520355225, + "learning_rate": 1.9120947630922696e-05, + "loss": 0.4707, + "step": 3530 + }, + { + "epoch": 0.8827930174563591, + "grad_norm": 4.429152488708496, + "learning_rate": 1.911845386533666e-05, + "loss": 0.4569, + "step": 3540 + }, + { + "epoch": 0.885286783042394, + "grad_norm": 7.218087196350098, + "learning_rate": 1.9115960099750627e-05, + "loss": 0.4156, + "step": 3550 + }, + { + "epoch": 0.8877805486284289, + "grad_norm": 5.999967098236084, + "learning_rate": 1.911346633416459e-05, + "loss": 0.4679, + "step": 3560 + }, + { + "epoch": 0.8902743142144638, + "grad_norm": 5.682243824005127, + "learning_rate": 1.9110972568578554e-05, + "loss": 0.4587, + "step": 3570 + }, + { + "epoch": 0.8927680798004988, + "grad_norm": 4.374421119689941, + "learning_rate": 1.910847880299252e-05, + "loss": 0.4885, + "step": 3580 + }, + { + "epoch": 0.8952618453865336, + "grad_norm": 3.372025966644287, + "learning_rate": 1.9105985037406485e-05, + "loss": 0.4402, + "step": 3590 + }, + { + "epoch": 0.8977556109725686, + "grad_norm": 8.967430114746094, + "learning_rate": 1.9103491271820452e-05, + "loss": 0.4719, + "step": 3600 + }, + { + "epoch": 0.9002493765586035, + "grad_norm": 7.0953850746154785, + "learning_rate": 1.9100997506234415e-05, + "loss": 0.4882, + "step": 3610 + }, + { + "epoch": 0.9027431421446384, + "grad_norm": 7.988110542297363, + "learning_rate": 1.909850374064838e-05, + "loss": 0.3987, + "step": 3620 + }, + { + "epoch": 0.9052369077306733, + "grad_norm": 5.712813377380371, + "learning_rate": 1.9096009975062346e-05, + "loss": 0.4625, + "step": 3630 + }, + { + "epoch": 0.9077306733167082, + "grad_norm": 5.640379905700684, + "learning_rate": 1.909351620947631e-05, + "loss": 0.4713, + "step": 3640 + }, + { + "epoch": 0.9102244389027432, + "grad_norm": 6.33805513381958, + "learning_rate": 1.9091022443890277e-05, + "loss": 0.499, + "step": 3650 + }, + { + "epoch": 0.912718204488778, + "grad_norm": 4.548992156982422, + "learning_rate": 1.908852867830424e-05, + "loss": 0.5038, + "step": 3660 + }, + { + "epoch": 0.9152119700748129, + "grad_norm": 8.613363265991211, + "learning_rate": 1.9086034912718204e-05, + "loss": 0.484, + "step": 3670 + }, + { + "epoch": 0.9177057356608479, + "grad_norm": 4.469968318939209, + "learning_rate": 1.908354114713217e-05, + "loss": 0.4473, + "step": 3680 + }, + { + "epoch": 0.9201995012468828, + "grad_norm": 4.103412628173828, + "learning_rate": 1.9081047381546138e-05, + "loss": 0.4483, + "step": 3690 + }, + { + "epoch": 0.9226932668329177, + "grad_norm": 4.446389675140381, + "learning_rate": 1.90785536159601e-05, + "loss": 0.4354, + "step": 3700 + }, + { + "epoch": 0.9251870324189526, + "grad_norm": 3.4375393390655518, + "learning_rate": 1.907605985037407e-05, + "loss": 0.5089, + "step": 3710 + }, + { + "epoch": 0.9276807980049875, + "grad_norm": 4.128108024597168, + "learning_rate": 1.9073566084788032e-05, + "loss": 0.4881, + "step": 3720 + }, + { + "epoch": 0.9301745635910225, + "grad_norm": 6.5702104568481445, + "learning_rate": 1.9071072319202e-05, + "loss": 0.4994, + "step": 3730 + }, + { + "epoch": 0.9326683291770573, + "grad_norm": 4.294257164001465, + "learning_rate": 1.9068578553615963e-05, + "loss": 0.5258, + "step": 3740 + }, + { + "epoch": 0.9351620947630923, + "grad_norm": 10.037517547607422, + "learning_rate": 1.9066084788029926e-05, + "loss": 0.5119, + "step": 3750 + }, + { + "epoch": 0.9376558603491272, + "grad_norm": 12.020997047424316, + "learning_rate": 1.9063591022443893e-05, + "loss": 0.5925, + "step": 3760 + }, + { + "epoch": 0.940149625935162, + "grad_norm": 6.246609210968018, + "learning_rate": 1.9061097256857857e-05, + "loss": 0.4407, + "step": 3770 + }, + { + "epoch": 0.942643391521197, + "grad_norm": 6.553652286529541, + "learning_rate": 1.905860349127182e-05, + "loss": 0.4833, + "step": 3780 + }, + { + "epoch": 0.9451371571072319, + "grad_norm": 5.495224952697754, + "learning_rate": 1.9056109725685788e-05, + "loss": 0.4579, + "step": 3790 + }, + { + "epoch": 0.9476309226932669, + "grad_norm": 5.287566184997559, + "learning_rate": 1.905361596009975e-05, + "loss": 0.5213, + "step": 3800 + }, + { + "epoch": 0.9501246882793017, + "grad_norm": 5.307123184204102, + "learning_rate": 1.9051122194513718e-05, + "loss": 0.4347, + "step": 3810 + }, + { + "epoch": 0.9526184538653366, + "grad_norm": 6.510472774505615, + "learning_rate": 1.9048628428927682e-05, + "loss": 0.5203, + "step": 3820 + }, + { + "epoch": 0.9551122194513716, + "grad_norm": 4.413476467132568, + "learning_rate": 1.9046134663341645e-05, + "loss": 0.4359, + "step": 3830 + }, + { + "epoch": 0.9576059850374065, + "grad_norm": 4.353250503540039, + "learning_rate": 1.9043640897755612e-05, + "loss": 0.4169, + "step": 3840 + }, + { + "epoch": 0.9600997506234414, + "grad_norm": 6.984455108642578, + "learning_rate": 1.9041147132169576e-05, + "loss": 0.5156, + "step": 3850 + }, + { + "epoch": 0.9625935162094763, + "grad_norm": 4.519622325897217, + "learning_rate": 1.9038653366583543e-05, + "loss": 0.4545, + "step": 3860 + }, + { + "epoch": 0.9650872817955112, + "grad_norm": 4.484631061553955, + "learning_rate": 1.9036159600997507e-05, + "loss": 0.5115, + "step": 3870 + }, + { + "epoch": 0.9675810473815462, + "grad_norm": 6.899198532104492, + "learning_rate": 1.9033665835411474e-05, + "loss": 0.4979, + "step": 3880 + }, + { + "epoch": 0.970074812967581, + "grad_norm": 3.723076581954956, + "learning_rate": 1.9031172069825437e-05, + "loss": 0.464, + "step": 3890 + }, + { + "epoch": 0.972568578553616, + "grad_norm": 6.418664932250977, + "learning_rate": 1.9028678304239404e-05, + "loss": 0.4556, + "step": 3900 + }, + { + "epoch": 0.9750623441396509, + "grad_norm": 3.5927555561065674, + "learning_rate": 1.9026184538653368e-05, + "loss": 0.3993, + "step": 3910 + }, + { + "epoch": 0.9775561097256857, + "grad_norm": 4.519811153411865, + "learning_rate": 1.9023690773067335e-05, + "loss": 0.5431, + "step": 3920 + }, + { + "epoch": 0.9800498753117207, + "grad_norm": 11.50717830657959, + "learning_rate": 1.90211970074813e-05, + "loss": 0.4571, + "step": 3930 + }, + { + "epoch": 0.9825436408977556, + "grad_norm": 4.3499531745910645, + "learning_rate": 1.9018703241895265e-05, + "loss": 0.4586, + "step": 3940 + }, + { + "epoch": 0.9850374064837906, + "grad_norm": 6.324216365814209, + "learning_rate": 1.901620947630923e-05, + "loss": 0.4397, + "step": 3950 + }, + { + "epoch": 0.9875311720698254, + "grad_norm": 6.713108062744141, + "learning_rate": 1.9013715710723193e-05, + "loss": 0.4178, + "step": 3960 + }, + { + "epoch": 0.9900249376558603, + "grad_norm": 4.340822219848633, + "learning_rate": 1.901122194513716e-05, + "loss": 0.5256, + "step": 3970 + }, + { + "epoch": 0.9925187032418953, + "grad_norm": 15.54155445098877, + "learning_rate": 1.9008728179551123e-05, + "loss": 0.4169, + "step": 3980 + }, + { + "epoch": 0.9950124688279302, + "grad_norm": 3.4416344165802, + "learning_rate": 1.9006234413965087e-05, + "loss": 0.4568, + "step": 3990 + }, + { + "epoch": 0.9975062344139651, + "grad_norm": 4.241981506347656, + "learning_rate": 1.9003740648379054e-05, + "loss": 0.4386, + "step": 4000 + }, + { + "epoch": 1.0, + "grad_norm": 2.7881031036376953, + "learning_rate": 1.9001246882793018e-05, + "loss": 0.4039, + "step": 4010 + }, + { + "epoch": 1.0, + "eval_loss": 0.46533510088920593, + "eval_runtime": 60.0033, + "eval_samples_per_second": 16.716, + "eval_steps_per_second": 16.716, + "step": 4010 + }, + { + "epoch": 1.0024937655860349, + "grad_norm": 5.6270012855529785, + "learning_rate": 1.8998753117206985e-05, + "loss": 0.4246, + "step": 4020 + }, + { + "epoch": 1.0049875311720697, + "grad_norm": 9.492588996887207, + "learning_rate": 1.8996259351620948e-05, + "loss": 0.5515, + "step": 4030 + }, + { + "epoch": 1.0074812967581048, + "grad_norm": 4.804108142852783, + "learning_rate": 1.8993765586034915e-05, + "loss": 0.4456, + "step": 4040 + }, + { + "epoch": 1.0099750623441397, + "grad_norm": 4.964878082275391, + "learning_rate": 1.899127182044888e-05, + "loss": 0.418, + "step": 4050 + }, + { + "epoch": 1.0124688279301746, + "grad_norm": 4.545513153076172, + "learning_rate": 1.8988778054862846e-05, + "loss": 0.4251, + "step": 4060 + }, + { + "epoch": 1.0149625935162094, + "grad_norm": 4.988970756530762, + "learning_rate": 1.898628428927681e-05, + "loss": 0.4635, + "step": 4070 + }, + { + "epoch": 1.0174563591022443, + "grad_norm": 3.7804431915283203, + "learning_rate": 1.8983790523690776e-05, + "loss": 0.4948, + "step": 4080 + }, + { + "epoch": 1.0199501246882794, + "grad_norm": 6.862154960632324, + "learning_rate": 1.898129675810474e-05, + "loss": 0.471, + "step": 4090 + }, + { + "epoch": 1.0224438902743143, + "grad_norm": 4.912306308746338, + "learning_rate": 1.8978802992518707e-05, + "loss": 0.4769, + "step": 4100 + }, + { + "epoch": 1.0249376558603491, + "grad_norm": 6.113458633422852, + "learning_rate": 1.897630922693267e-05, + "loss": 0.4927, + "step": 4110 + }, + { + "epoch": 1.027431421446384, + "grad_norm": 7.236551761627197, + "learning_rate": 1.8973815461346634e-05, + "loss": 0.4274, + "step": 4120 + }, + { + "epoch": 1.0299251870324189, + "grad_norm": 5.630270481109619, + "learning_rate": 1.89713216957606e-05, + "loss": 0.4913, + "step": 4130 + }, + { + "epoch": 1.032418952618454, + "grad_norm": 7.250363349914551, + "learning_rate": 1.8968827930174565e-05, + "loss": 0.3752, + "step": 4140 + }, + { + "epoch": 1.0349127182044888, + "grad_norm": 4.857931137084961, + "learning_rate": 1.8966334164588532e-05, + "loss": 0.4775, + "step": 4150 + }, + { + "epoch": 1.0374064837905237, + "grad_norm": 4.707160949707031, + "learning_rate": 1.8963840399002495e-05, + "loss": 0.5688, + "step": 4160 + }, + { + "epoch": 1.0399002493765586, + "grad_norm": 6.663334846496582, + "learning_rate": 1.896134663341646e-05, + "loss": 0.4619, + "step": 4170 + }, + { + "epoch": 1.0423940149625934, + "grad_norm": 8.353853225708008, + "learning_rate": 1.8958852867830426e-05, + "loss": 0.4525, + "step": 4180 + }, + { + "epoch": 1.0448877805486285, + "grad_norm": 5.910696506500244, + "learning_rate": 1.895635910224439e-05, + "loss": 0.4344, + "step": 4190 + }, + { + "epoch": 1.0473815461346634, + "grad_norm": 5.698975086212158, + "learning_rate": 1.8953865336658353e-05, + "loss": 0.4816, + "step": 4200 + }, + { + "epoch": 1.0498753117206983, + "grad_norm": 9.77688217163086, + "learning_rate": 1.895137157107232e-05, + "loss": 0.417, + "step": 4210 + }, + { + "epoch": 1.0523690773067331, + "grad_norm": 4.210202693939209, + "learning_rate": 1.8948877805486284e-05, + "loss": 0.5169, + "step": 4220 + }, + { + "epoch": 1.054862842892768, + "grad_norm": 4.911707401275635, + "learning_rate": 1.894638403990025e-05, + "loss": 0.4293, + "step": 4230 + }, + { + "epoch": 1.057356608478803, + "grad_norm": 5.228448867797852, + "learning_rate": 1.8943890274314215e-05, + "loss": 0.429, + "step": 4240 + }, + { + "epoch": 1.059850374064838, + "grad_norm": 5.143721103668213, + "learning_rate": 1.894139650872818e-05, + "loss": 0.4066, + "step": 4250 + }, + { + "epoch": 1.0623441396508728, + "grad_norm": 7.817266464233398, + "learning_rate": 1.8938902743142145e-05, + "loss": 0.6464, + "step": 4260 + }, + { + "epoch": 1.0648379052369077, + "grad_norm": 5.188505172729492, + "learning_rate": 1.8936408977556112e-05, + "loss": 0.4417, + "step": 4270 + }, + { + "epoch": 1.0673316708229426, + "grad_norm": 6.084823131561279, + "learning_rate": 1.8933915211970076e-05, + "loss": 0.47, + "step": 4280 + }, + { + "epoch": 1.0698254364089776, + "grad_norm": 4.737970352172852, + "learning_rate": 1.8931421446384043e-05, + "loss": 0.5196, + "step": 4290 + }, + { + "epoch": 1.0723192019950125, + "grad_norm": 6.13934326171875, + "learning_rate": 1.8928927680798006e-05, + "loss": 0.3904, + "step": 4300 + }, + { + "epoch": 1.0748129675810474, + "grad_norm": 7.485795497894287, + "learning_rate": 1.8926433915211973e-05, + "loss": 0.4556, + "step": 4310 + }, + { + "epoch": 1.0773067331670823, + "grad_norm": 6.431264877319336, + "learning_rate": 1.8923940149625937e-05, + "loss": 0.4713, + "step": 4320 + }, + { + "epoch": 1.0798004987531171, + "grad_norm": 3.5695624351501465, + "learning_rate": 1.89214463840399e-05, + "loss": 0.4109, + "step": 4330 + }, + { + "epoch": 1.0822942643391522, + "grad_norm": 4.416280746459961, + "learning_rate": 1.8918952618453868e-05, + "loss": 0.4734, + "step": 4340 + }, + { + "epoch": 1.084788029925187, + "grad_norm": 4.557003021240234, + "learning_rate": 1.891645885286783e-05, + "loss": 0.4291, + "step": 4350 + }, + { + "epoch": 1.087281795511222, + "grad_norm": 7.2662272453308105, + "learning_rate": 1.8913965087281798e-05, + "loss": 0.4176, + "step": 4360 + }, + { + "epoch": 1.0897755610972568, + "grad_norm": 7.033773422241211, + "learning_rate": 1.8911471321695762e-05, + "loss": 0.4534, + "step": 4370 + }, + { + "epoch": 1.0922693266832917, + "grad_norm": 4.475677490234375, + "learning_rate": 1.8908977556109726e-05, + "loss": 0.4394, + "step": 4380 + }, + { + "epoch": 1.0947630922693268, + "grad_norm": 4.8636698722839355, + "learning_rate": 1.8906483790523693e-05, + "loss": 0.4371, + "step": 4390 + }, + { + "epoch": 1.0972568578553616, + "grad_norm": 5.832028865814209, + "learning_rate": 1.8903990024937656e-05, + "loss": 0.444, + "step": 4400 + }, + { + "epoch": 1.0997506234413965, + "grad_norm": 7.826235771179199, + "learning_rate": 1.8901496259351623e-05, + "loss": 0.5365, + "step": 4410 + }, + { + "epoch": 1.1022443890274314, + "grad_norm": 6.200789928436279, + "learning_rate": 1.8899002493765587e-05, + "loss": 0.4081, + "step": 4420 + }, + { + "epoch": 1.1047381546134662, + "grad_norm": 5.708020210266113, + "learning_rate": 1.8896508728179554e-05, + "loss": 0.5594, + "step": 4430 + }, + { + "epoch": 1.1072319201995013, + "grad_norm": 5.138545513153076, + "learning_rate": 1.8894014962593517e-05, + "loss": 0.4128, + "step": 4440 + }, + { + "epoch": 1.1097256857855362, + "grad_norm": 5.702857971191406, + "learning_rate": 1.8891521197007484e-05, + "loss": 0.4568, + "step": 4450 + }, + { + "epoch": 1.112219451371571, + "grad_norm": 5.061423301696777, + "learning_rate": 1.8889027431421448e-05, + "loss": 0.4298, + "step": 4460 + }, + { + "epoch": 1.114713216957606, + "grad_norm": 4.579878330230713, + "learning_rate": 1.8886533665835415e-05, + "loss": 0.3446, + "step": 4470 + }, + { + "epoch": 1.1172069825436408, + "grad_norm": 4.493147373199463, + "learning_rate": 1.888403990024938e-05, + "loss": 0.4652, + "step": 4480 + }, + { + "epoch": 1.119700748129676, + "grad_norm": 3.8753387928009033, + "learning_rate": 1.8881546134663342e-05, + "loss": 0.4559, + "step": 4490 + }, + { + "epoch": 1.1221945137157108, + "grad_norm": 5.9976325035095215, + "learning_rate": 1.887905236907731e-05, + "loss": 0.4102, + "step": 4500 + }, + { + "epoch": 1.1246882793017456, + "grad_norm": 7.784470081329346, + "learning_rate": 1.8876558603491273e-05, + "loss": 0.4989, + "step": 4510 + }, + { + "epoch": 1.1271820448877805, + "grad_norm": 5.6073737144470215, + "learning_rate": 1.887406483790524e-05, + "loss": 0.3927, + "step": 4520 + }, + { + "epoch": 1.1296758104738154, + "grad_norm": 11.797880172729492, + "learning_rate": 1.8871571072319203e-05, + "loss": 0.5698, + "step": 4530 + }, + { + "epoch": 1.1321695760598505, + "grad_norm": 10.158886909484863, + "learning_rate": 1.8869077306733167e-05, + "loss": 0.3766, + "step": 4540 + }, + { + "epoch": 1.1346633416458853, + "grad_norm": 6.6217875480651855, + "learning_rate": 1.8866583541147134e-05, + "loss": 0.46, + "step": 4550 + }, + { + "epoch": 1.1371571072319202, + "grad_norm": 5.993943691253662, + "learning_rate": 1.8864089775561098e-05, + "loss": 0.512, + "step": 4560 + }, + { + "epoch": 1.139650872817955, + "grad_norm": 5.103926181793213, + "learning_rate": 1.886159600997506e-05, + "loss": 0.4591, + "step": 4570 + }, + { + "epoch": 1.14214463840399, + "grad_norm": 6.188269138336182, + "learning_rate": 1.885910224438903e-05, + "loss": 0.4811, + "step": 4580 + }, + { + "epoch": 1.144638403990025, + "grad_norm": 3.5611369609832764, + "learning_rate": 1.8856608478802992e-05, + "loss": 0.4138, + "step": 4590 + }, + { + "epoch": 1.14713216957606, + "grad_norm": 6.32553768157959, + "learning_rate": 1.885411471321696e-05, + "loss": 0.4718, + "step": 4600 + }, + { + "epoch": 1.1496259351620948, + "grad_norm": 5.74522590637207, + "learning_rate": 1.8851620947630923e-05, + "loss": 0.4678, + "step": 4610 + }, + { + "epoch": 1.1521197007481296, + "grad_norm": 3.8611743450164795, + "learning_rate": 1.884912718204489e-05, + "loss": 0.361, + "step": 4620 + }, + { + "epoch": 1.1546134663341645, + "grad_norm": 5.439576625823975, + "learning_rate": 1.8846633416458857e-05, + "loss": 0.4243, + "step": 4630 + }, + { + "epoch": 1.1571072319201996, + "grad_norm": 6.206503391265869, + "learning_rate": 1.884413965087282e-05, + "loss": 0.4304, + "step": 4640 + }, + { + "epoch": 1.1596009975062345, + "grad_norm": 10.82448959350586, + "learning_rate": 1.8841645885286787e-05, + "loss": 0.5127, + "step": 4650 + }, + { + "epoch": 1.1620947630922693, + "grad_norm": 5.439650535583496, + "learning_rate": 1.883915211970075e-05, + "loss": 0.4592, + "step": 4660 + }, + { + "epoch": 1.1645885286783042, + "grad_norm": 4.041094779968262, + "learning_rate": 1.8836658354114714e-05, + "loss": 0.4481, + "step": 4670 + }, + { + "epoch": 1.167082294264339, + "grad_norm": 4.277478218078613, + "learning_rate": 1.883416458852868e-05, + "loss": 0.4371, + "step": 4680 + }, + { + "epoch": 1.1695760598503742, + "grad_norm": 9.352392196655273, + "learning_rate": 1.8831670822942645e-05, + "loss": 0.4469, + "step": 4690 + }, + { + "epoch": 1.172069825436409, + "grad_norm": 6.215358257293701, + "learning_rate": 1.882917705735661e-05, + "loss": 0.4068, + "step": 4700 + }, + { + "epoch": 1.174563591022444, + "grad_norm": 8.063887596130371, + "learning_rate": 1.8826683291770576e-05, + "loss": 0.4997, + "step": 4710 + }, + { + "epoch": 1.1770573566084788, + "grad_norm": 9.79735279083252, + "learning_rate": 1.882418952618454e-05, + "loss": 0.4281, + "step": 4720 + }, + { + "epoch": 1.1795511221945136, + "grad_norm": 8.45605182647705, + "learning_rate": 1.8821695760598506e-05, + "loss": 0.4207, + "step": 4730 + }, + { + "epoch": 1.1820448877805487, + "grad_norm": 5.719277858734131, + "learning_rate": 1.881920199501247e-05, + "loss": 0.4669, + "step": 4740 + }, + { + "epoch": 1.1845386533665836, + "grad_norm": 5.201423645019531, + "learning_rate": 1.8816708229426434e-05, + "loss": 0.4345, + "step": 4750 + }, + { + "epoch": 1.1870324189526185, + "grad_norm": 4.76845121383667, + "learning_rate": 1.88142144638404e-05, + "loss": 0.4489, + "step": 4760 + }, + { + "epoch": 1.1895261845386533, + "grad_norm": 7.2256622314453125, + "learning_rate": 1.8811720698254364e-05, + "loss": 0.4855, + "step": 4770 + }, + { + "epoch": 1.1920199501246882, + "grad_norm": 6.201477527618408, + "learning_rate": 1.880922693266833e-05, + "loss": 0.4176, + "step": 4780 + }, + { + "epoch": 1.1945137157107233, + "grad_norm": 7.298521995544434, + "learning_rate": 1.8806733167082295e-05, + "loss": 0.4582, + "step": 4790 + }, + { + "epoch": 1.1970074812967582, + "grad_norm": 6.724678993225098, + "learning_rate": 1.8804239401496262e-05, + "loss": 0.502, + "step": 4800 + }, + { + "epoch": 1.199501246882793, + "grad_norm": 4.230656623840332, + "learning_rate": 1.8801745635910225e-05, + "loss": 0.4097, + "step": 4810 + }, + { + "epoch": 1.201995012468828, + "grad_norm": 4.771544456481934, + "learning_rate": 1.8799251870324192e-05, + "loss": 0.4729, + "step": 4820 + }, + { + "epoch": 1.2044887780548628, + "grad_norm": 5.5048651695251465, + "learning_rate": 1.8796758104738156e-05, + "loss": 0.4848, + "step": 4830 + }, + { + "epoch": 1.2069825436408976, + "grad_norm": 10.239465713500977, + "learning_rate": 1.8794264339152123e-05, + "loss": 0.3863, + "step": 4840 + }, + { + "epoch": 1.2094763092269327, + "grad_norm": 6.2510986328125, + "learning_rate": 1.8791770573566087e-05, + "loss": 0.4414, + "step": 4850 + }, + { + "epoch": 1.2119700748129676, + "grad_norm": 4.370057106018066, + "learning_rate": 1.8789276807980054e-05, + "loss": 0.4307, + "step": 4860 + }, + { + "epoch": 1.2144638403990025, + "grad_norm": 6.856778621673584, + "learning_rate": 1.8786783042394017e-05, + "loss": 0.5257, + "step": 4870 + }, + { + "epoch": 1.2169576059850373, + "grad_norm": 3.6856114864349365, + "learning_rate": 1.878428927680798e-05, + "loss": 0.433, + "step": 4880 + }, + { + "epoch": 1.2194513715710724, + "grad_norm": 4.310407638549805, + "learning_rate": 1.8781795511221948e-05, + "loss": 0.4211, + "step": 4890 + }, + { + "epoch": 1.2219451371571073, + "grad_norm": 4.885754585266113, + "learning_rate": 1.877930174563591e-05, + "loss": 0.3955, + "step": 4900 + }, + { + "epoch": 1.2244389027431422, + "grad_norm": 5.621823787689209, + "learning_rate": 1.8776807980049875e-05, + "loss": 0.4374, + "step": 4910 + }, + { + "epoch": 1.226932668329177, + "grad_norm": 8.302716255187988, + "learning_rate": 1.8774314214463842e-05, + "loss": 0.4245, + "step": 4920 + }, + { + "epoch": 1.229426433915212, + "grad_norm": 5.974333763122559, + "learning_rate": 1.8771820448877806e-05, + "loss": 0.547, + "step": 4930 + }, + { + "epoch": 1.2319201995012468, + "grad_norm": 6.3131303787231445, + "learning_rate": 1.8769326683291773e-05, + "loss": 0.4662, + "step": 4940 + }, + { + "epoch": 1.2344139650872819, + "grad_norm": 5.7204389572143555, + "learning_rate": 1.8766832917705736e-05, + "loss": 0.5053, + "step": 4950 + }, + { + "epoch": 1.2369077306733167, + "grad_norm": 4.9055047035217285, + "learning_rate": 1.87643391521197e-05, + "loss": 0.5201, + "step": 4960 + }, + { + "epoch": 1.2394014962593516, + "grad_norm": 3.9648923873901367, + "learning_rate": 1.8761845386533667e-05, + "loss": 0.4802, + "step": 4970 + }, + { + "epoch": 1.2418952618453865, + "grad_norm": 7.1016364097595215, + "learning_rate": 1.8759351620947634e-05, + "loss": 0.4384, + "step": 4980 + }, + { + "epoch": 1.2443890274314215, + "grad_norm": 4.486512184143066, + "learning_rate": 1.8756857855361598e-05, + "loss": 0.4456, + "step": 4990 + }, + { + "epoch": 1.2468827930174564, + "grad_norm": 5.411483287811279, + "learning_rate": 1.8754364089775565e-05, + "loss": 0.4317, + "step": 5000 + }, + { + "epoch": 1.2493765586034913, + "grad_norm": 6.580456733703613, + "learning_rate": 1.8751870324189528e-05, + "loss": 0.4559, + "step": 5010 + }, + { + "epoch": 1.2518703241895262, + "grad_norm": 3.885417938232422, + "learning_rate": 1.8749376558603495e-05, + "loss": 0.4265, + "step": 5020 + }, + { + "epoch": 1.254364089775561, + "grad_norm": 5.22758150100708, + "learning_rate": 1.874688279301746e-05, + "loss": 0.4173, + "step": 5030 + }, + { + "epoch": 1.2568578553615959, + "grad_norm": 6.1364827156066895, + "learning_rate": 1.8744389027431422e-05, + "loss": 0.4624, + "step": 5040 + }, + { + "epoch": 1.259351620947631, + "grad_norm": 6.770162105560303, + "learning_rate": 1.874189526184539e-05, + "loss": 0.4087, + "step": 5050 + }, + { + "epoch": 1.2618453865336658, + "grad_norm": 6.774170875549316, + "learning_rate": 1.8739401496259353e-05, + "loss": 0.4163, + "step": 5060 + }, + { + "epoch": 1.2643391521197007, + "grad_norm": 5.055146217346191, + "learning_rate": 1.8736907730673317e-05, + "loss": 0.4277, + "step": 5070 + }, + { + "epoch": 1.2668329177057356, + "grad_norm": 4.696500301361084, + "learning_rate": 1.8734413965087284e-05, + "loss": 0.3918, + "step": 5080 + }, + { + "epoch": 1.2693266832917707, + "grad_norm": 5.685618877410889, + "learning_rate": 1.8731920199501247e-05, + "loss": 0.5113, + "step": 5090 + }, + { + "epoch": 1.2718204488778055, + "grad_norm": 4.808736324310303, + "learning_rate": 1.8729426433915214e-05, + "loss": 0.4891, + "step": 5100 + }, + { + "epoch": 1.2743142144638404, + "grad_norm": 8.052289009094238, + "learning_rate": 1.8726932668329178e-05, + "loss": 0.4433, + "step": 5110 + }, + { + "epoch": 1.2768079800498753, + "grad_norm": 6.9519853591918945, + "learning_rate": 1.872443890274314e-05, + "loss": 0.4916, + "step": 5120 + }, + { + "epoch": 1.2793017456359101, + "grad_norm": 4.836465358734131, + "learning_rate": 1.872194513715711e-05, + "loss": 0.4411, + "step": 5130 + }, + { + "epoch": 1.281795511221945, + "grad_norm": 4.297245025634766, + "learning_rate": 1.8719451371571072e-05, + "loss": 0.4697, + "step": 5140 + }, + { + "epoch": 1.28428927680798, + "grad_norm": 5.085397720336914, + "learning_rate": 1.871695760598504e-05, + "loss": 0.4445, + "step": 5150 + }, + { + "epoch": 1.286783042394015, + "grad_norm": 4.748317241668701, + "learning_rate": 1.8714463840399003e-05, + "loss": 0.4488, + "step": 5160 + }, + { + "epoch": 1.2892768079800498, + "grad_norm": 5.959413528442383, + "learning_rate": 1.871197007481297e-05, + "loss": 0.4648, + "step": 5170 + }, + { + "epoch": 1.2917705735660847, + "grad_norm": 11.551535606384277, + "learning_rate": 1.8709476309226933e-05, + "loss": 0.445, + "step": 5180 + }, + { + "epoch": 1.2942643391521198, + "grad_norm": 6.094724655151367, + "learning_rate": 1.87069825436409e-05, + "loss": 0.4175, + "step": 5190 + }, + { + "epoch": 1.2967581047381547, + "grad_norm": 9.543522834777832, + "learning_rate": 1.8704488778054864e-05, + "loss": 0.5068, + "step": 5200 + }, + { + "epoch": 1.2992518703241895, + "grad_norm": 6.417718410491943, + "learning_rate": 1.870199501246883e-05, + "loss": 0.4341, + "step": 5210 + }, + { + "epoch": 1.3017456359102244, + "grad_norm": 4.876067638397217, + "learning_rate": 1.8699501246882795e-05, + "loss": 0.5283, + "step": 5220 + }, + { + "epoch": 1.3042394014962593, + "grad_norm": 6.962366580963135, + "learning_rate": 1.869700748129676e-05, + "loss": 0.5235, + "step": 5230 + }, + { + "epoch": 1.3067331670822941, + "grad_norm": 6.60381555557251, + "learning_rate": 1.8694513715710725e-05, + "loss": 0.5055, + "step": 5240 + }, + { + "epoch": 1.3092269326683292, + "grad_norm": 4.449953079223633, + "learning_rate": 1.869201995012469e-05, + "loss": 0.5745, + "step": 5250 + }, + { + "epoch": 1.311720698254364, + "grad_norm": 6.488658905029297, + "learning_rate": 1.8689526184538656e-05, + "loss": 0.4261, + "step": 5260 + }, + { + "epoch": 1.314214463840399, + "grad_norm": 4.1007795333862305, + "learning_rate": 1.868703241895262e-05, + "loss": 0.3836, + "step": 5270 + }, + { + "epoch": 1.3167082294264338, + "grad_norm": 7.238208293914795, + "learning_rate": 1.8684538653366583e-05, + "loss": 0.5217, + "step": 5280 + }, + { + "epoch": 1.319201995012469, + "grad_norm": 5.4013590812683105, + "learning_rate": 1.868204488778055e-05, + "loss": 0.4387, + "step": 5290 + }, + { + "epoch": 1.3216957605985038, + "grad_norm": 8.315073013305664, + "learning_rate": 1.8679551122194514e-05, + "loss": 0.4616, + "step": 5300 + }, + { + "epoch": 1.3241895261845387, + "grad_norm": 3.4126434326171875, + "learning_rate": 1.867705735660848e-05, + "loss": 0.4431, + "step": 5310 + }, + { + "epoch": 1.3266832917705735, + "grad_norm": 7.5681681632995605, + "learning_rate": 1.8674563591022444e-05, + "loss": 0.5109, + "step": 5320 + }, + { + "epoch": 1.3291770573566084, + "grad_norm": 5.397674560546875, + "learning_rate": 1.867206982543641e-05, + "loss": 0.4732, + "step": 5330 + }, + { + "epoch": 1.3316708229426433, + "grad_norm": 5.570555686950684, + "learning_rate": 1.8669576059850375e-05, + "loss": 0.4385, + "step": 5340 + }, + { + "epoch": 1.3341645885286784, + "grad_norm": 6.722954273223877, + "learning_rate": 1.8667082294264342e-05, + "loss": 0.4784, + "step": 5350 + }, + { + "epoch": 1.3366583541147132, + "grad_norm": 4.577895164489746, + "learning_rate": 1.8664588528678306e-05, + "loss": 0.4526, + "step": 5360 + }, + { + "epoch": 1.339152119700748, + "grad_norm": 5.092601776123047, + "learning_rate": 1.8662094763092273e-05, + "loss": 0.37, + "step": 5370 + }, + { + "epoch": 1.341645885286783, + "grad_norm": 6.236178398132324, + "learning_rate": 1.8659600997506236e-05, + "loss": 0.4303, + "step": 5380 + }, + { + "epoch": 1.344139650872818, + "grad_norm": 4.475374698638916, + "learning_rate": 1.8657107231920203e-05, + "loss": 0.4483, + "step": 5390 + }, + { + "epoch": 1.346633416458853, + "grad_norm": 4.651284217834473, + "learning_rate": 1.8654613466334167e-05, + "loss": 0.431, + "step": 5400 + }, + { + "epoch": 1.3491271820448878, + "grad_norm": 4.801163673400879, + "learning_rate": 1.865211970074813e-05, + "loss": 0.4817, + "step": 5410 + }, + { + "epoch": 1.3516209476309227, + "grad_norm": 6.051980972290039, + "learning_rate": 1.8649625935162097e-05, + "loss": 0.4447, + "step": 5420 + }, + { + "epoch": 1.3541147132169575, + "grad_norm": 6.429530143737793, + "learning_rate": 1.864713216957606e-05, + "loss": 0.5124, + "step": 5430 + }, + { + "epoch": 1.3566084788029924, + "grad_norm": 9.302543640136719, + "learning_rate": 1.8644638403990028e-05, + "loss": 0.4148, + "step": 5440 + }, + { + "epoch": 1.3591022443890275, + "grad_norm": 6.575656414031982, + "learning_rate": 1.864214463840399e-05, + "loss": 0.4759, + "step": 5450 + }, + { + "epoch": 1.3615960099750624, + "grad_norm": 6.7204084396362305, + "learning_rate": 1.8639650872817955e-05, + "loss": 0.4483, + "step": 5460 + }, + { + "epoch": 1.3640897755610972, + "grad_norm": 9.02580451965332, + "learning_rate": 1.8637157107231922e-05, + "loss": 0.4601, + "step": 5470 + }, + { + "epoch": 1.366583541147132, + "grad_norm": 4.408763885498047, + "learning_rate": 1.8634663341645886e-05, + "loss": 0.4205, + "step": 5480 + }, + { + "epoch": 1.3690773067331672, + "grad_norm": 4.800736427307129, + "learning_rate": 1.863216957605985e-05, + "loss": 0.4998, + "step": 5490 + }, + { + "epoch": 1.371571072319202, + "grad_norm": 7.011260509490967, + "learning_rate": 1.8629675810473817e-05, + "loss": 0.5127, + "step": 5500 + }, + { + "epoch": 1.374064837905237, + "grad_norm": 4.573014259338379, + "learning_rate": 1.862718204488778e-05, + "loss": 0.4685, + "step": 5510 + }, + { + "epoch": 1.3765586034912718, + "grad_norm": 6.397534370422363, + "learning_rate": 1.8624688279301747e-05, + "loss": 0.464, + "step": 5520 + }, + { + "epoch": 1.3790523690773067, + "grad_norm": 4.248035907745361, + "learning_rate": 1.862219451371571e-05, + "loss": 0.4275, + "step": 5530 + }, + { + "epoch": 1.3815461346633415, + "grad_norm": 4.859241008758545, + "learning_rate": 1.8619700748129678e-05, + "loss": 0.3563, + "step": 5540 + }, + { + "epoch": 1.3840399002493766, + "grad_norm": 4.7854905128479, + "learning_rate": 1.861720698254364e-05, + "loss": 0.4193, + "step": 5550 + }, + { + "epoch": 1.3865336658354115, + "grad_norm": 17.388259887695312, + "learning_rate": 1.861471321695761e-05, + "loss": 0.5405, + "step": 5560 + }, + { + "epoch": 1.3890274314214464, + "grad_norm": 25.828964233398438, + "learning_rate": 1.8612219451371572e-05, + "loss": 0.4526, + "step": 5570 + }, + { + "epoch": 1.3915211970074812, + "grad_norm": 12.78157901763916, + "learning_rate": 1.860972568578554e-05, + "loss": 0.45, + "step": 5580 + }, + { + "epoch": 1.3940149625935163, + "grad_norm": 5.025924205780029, + "learning_rate": 1.8607231920199503e-05, + "loss": 0.4668, + "step": 5590 + }, + { + "epoch": 1.3965087281795512, + "grad_norm": 3.784602403640747, + "learning_rate": 1.860473815461347e-05, + "loss": 0.4034, + "step": 5600 + }, + { + "epoch": 1.399002493765586, + "grad_norm": 5.713232517242432, + "learning_rate": 1.8602244389027433e-05, + "loss": 0.4374, + "step": 5610 + }, + { + "epoch": 1.401496259351621, + "grad_norm": 5.523266792297363, + "learning_rate": 1.8599750623441397e-05, + "loss": 0.4581, + "step": 5620 + }, + { + "epoch": 1.4039900249376558, + "grad_norm": 12.597986221313477, + "learning_rate": 1.8597256857855364e-05, + "loss": 0.4511, + "step": 5630 + }, + { + "epoch": 1.4064837905236907, + "grad_norm": 3.8616204261779785, + "learning_rate": 1.8594763092269327e-05, + "loss": 0.4322, + "step": 5640 + }, + { + "epoch": 1.4089775561097257, + "grad_norm": 6.135002613067627, + "learning_rate": 1.8592269326683294e-05, + "loss": 0.4043, + "step": 5650 + }, + { + "epoch": 1.4114713216957606, + "grad_norm": 4.191798210144043, + "learning_rate": 1.8589775561097258e-05, + "loss": 0.4508, + "step": 5660 + }, + { + "epoch": 1.4139650872817955, + "grad_norm": 5.872029781341553, + "learning_rate": 1.858728179551122e-05, + "loss": 0.4515, + "step": 5670 + }, + { + "epoch": 1.4164588528678304, + "grad_norm": 7.221251010894775, + "learning_rate": 1.858478802992519e-05, + "loss": 0.4471, + "step": 5680 + }, + { + "epoch": 1.4189526184538654, + "grad_norm": 3.4195024967193604, + "learning_rate": 1.8582294264339152e-05, + "loss": 0.3757, + "step": 5690 + }, + { + "epoch": 1.4214463840399003, + "grad_norm": 4.992902755737305, + "learning_rate": 1.857980049875312e-05, + "loss": 0.4098, + "step": 5700 + }, + { + "epoch": 1.4239401496259352, + "grad_norm": 7.851417064666748, + "learning_rate": 1.8577306733167083e-05, + "loss": 0.54, + "step": 5710 + }, + { + "epoch": 1.42643391521197, + "grad_norm": 5.539144992828369, + "learning_rate": 1.857481296758105e-05, + "loss": 0.4191, + "step": 5720 + }, + { + "epoch": 1.428927680798005, + "grad_norm": 9.11573314666748, + "learning_rate": 1.8572319201995014e-05, + "loss": 0.4237, + "step": 5730 + }, + { + "epoch": 1.4314214463840398, + "grad_norm": 5.815394401550293, + "learning_rate": 1.856982543640898e-05, + "loss": 0.4564, + "step": 5740 + }, + { + "epoch": 1.4339152119700749, + "grad_norm": 4.174201011657715, + "learning_rate": 1.8567331670822944e-05, + "loss": 0.4493, + "step": 5750 + }, + { + "epoch": 1.4364089775561097, + "grad_norm": 7.393326759338379, + "learning_rate": 1.856483790523691e-05, + "loss": 0.4579, + "step": 5760 + }, + { + "epoch": 1.4389027431421446, + "grad_norm": 4.805573463439941, + "learning_rate": 1.8562344139650875e-05, + "loss": 0.4525, + "step": 5770 + }, + { + "epoch": 1.4413965087281795, + "grad_norm": 4.3430495262146, + "learning_rate": 1.855985037406484e-05, + "loss": 0.4323, + "step": 5780 + }, + { + "epoch": 1.4438902743142146, + "grad_norm": 4.935792922973633, + "learning_rate": 1.8557356608478805e-05, + "loss": 0.4649, + "step": 5790 + }, + { + "epoch": 1.4463840399002494, + "grad_norm": 3.4280688762664795, + "learning_rate": 1.855486284289277e-05, + "loss": 0.4597, + "step": 5800 + }, + { + "epoch": 1.4488778054862843, + "grad_norm": 5.937455654144287, + "learning_rate": 1.8552369077306736e-05, + "loss": 0.4628, + "step": 5810 + }, + { + "epoch": 1.4513715710723192, + "grad_norm": 4.315957069396973, + "learning_rate": 1.85498753117207e-05, + "loss": 0.3815, + "step": 5820 + }, + { + "epoch": 1.453865336658354, + "grad_norm": 6.386058330535889, + "learning_rate": 1.8547381546134663e-05, + "loss": 0.5149, + "step": 5830 + }, + { + "epoch": 1.456359102244389, + "grad_norm": 7.211171627044678, + "learning_rate": 1.854488778054863e-05, + "loss": 0.399, + "step": 5840 + }, + { + "epoch": 1.458852867830424, + "grad_norm": 6.84633207321167, + "learning_rate": 1.8542394014962594e-05, + "loss": 0.5866, + "step": 5850 + }, + { + "epoch": 1.4613466334164589, + "grad_norm": 5.431811332702637, + "learning_rate": 1.853990024937656e-05, + "loss": 0.4732, + "step": 5860 + }, + { + "epoch": 1.4638403990024937, + "grad_norm": 4.845829486846924, + "learning_rate": 1.8537406483790524e-05, + "loss": 0.4998, + "step": 5870 + }, + { + "epoch": 1.4663341645885286, + "grad_norm": 5.635951042175293, + "learning_rate": 1.8534912718204488e-05, + "loss": 0.3866, + "step": 5880 + }, + { + "epoch": 1.4688279301745637, + "grad_norm": 8.562129974365234, + "learning_rate": 1.8532418952618455e-05, + "loss": 0.452, + "step": 5890 + }, + { + "epoch": 1.4713216957605986, + "grad_norm": 5.379266738891602, + "learning_rate": 1.852992518703242e-05, + "loss": 0.4434, + "step": 5900 + }, + { + "epoch": 1.4738154613466334, + "grad_norm": 3.9054388999938965, + "learning_rate": 1.8527431421446386e-05, + "loss": 0.4882, + "step": 5910 + }, + { + "epoch": 1.4763092269326683, + "grad_norm": 7.7161383628845215, + "learning_rate": 1.8524937655860353e-05, + "loss": 0.4105, + "step": 5920 + }, + { + "epoch": 1.4788029925187032, + "grad_norm": 5.12331485748291, + "learning_rate": 1.8522443890274316e-05, + "loss": 0.4601, + "step": 5930 + }, + { + "epoch": 1.481296758104738, + "grad_norm": 7.548189640045166, + "learning_rate": 1.8519950124688283e-05, + "loss": 0.4507, + "step": 5940 + }, + { + "epoch": 1.4837905236907731, + "grad_norm": 3.955315589904785, + "learning_rate": 1.8517456359102247e-05, + "loss": 0.4234, + "step": 5950 + }, + { + "epoch": 1.486284289276808, + "grad_norm": 6.11161470413208, + "learning_rate": 1.851496259351621e-05, + "loss": 0.4254, + "step": 5960 + }, + { + "epoch": 1.4887780548628429, + "grad_norm": 8.937747955322266, + "learning_rate": 1.8512468827930178e-05, + "loss": 0.4604, + "step": 5970 + }, + { + "epoch": 1.4912718204488777, + "grad_norm": 13.255558967590332, + "learning_rate": 1.850997506234414e-05, + "loss": 0.4284, + "step": 5980 + }, + { + "epoch": 1.4937655860349128, + "grad_norm": 6.2703776359558105, + "learning_rate": 1.8507481296758105e-05, + "loss": 0.3886, + "step": 5990 + }, + { + "epoch": 1.4962593516209477, + "grad_norm": 6.531227111816406, + "learning_rate": 1.8504987531172072e-05, + "loss": 0.4601, + "step": 6000 + }, + { + "epoch": 1.4987531172069826, + "grad_norm": 5.363514423370361, + "learning_rate": 1.8502493765586035e-05, + "loss": 0.5012, + "step": 6010 + }, + { + "epoch": 1.5012468827930174, + "grad_norm": 3.8353939056396484, + "learning_rate": 1.8500000000000002e-05, + "loss": 0.4911, + "step": 6020 + }, + { + "epoch": 1.5037406483790523, + "grad_norm": 5.923316955566406, + "learning_rate": 1.8497506234413966e-05, + "loss": 0.3982, + "step": 6030 + }, + { + "epoch": 1.5062344139650872, + "grad_norm": 4.840806484222412, + "learning_rate": 1.849501246882793e-05, + "loss": 0.4035, + "step": 6040 + }, + { + "epoch": 1.508728179551122, + "grad_norm": 4.171060085296631, + "learning_rate": 1.8492518703241897e-05, + "loss": 0.515, + "step": 6050 + }, + { + "epoch": 1.5112219451371571, + "grad_norm": 7.2029876708984375, + "learning_rate": 1.849002493765586e-05, + "loss": 0.4485, + "step": 6060 + }, + { + "epoch": 1.513715710723192, + "grad_norm": 4.763010025024414, + "learning_rate": 1.8487531172069827e-05, + "loss": 0.3859, + "step": 6070 + }, + { + "epoch": 1.516209476309227, + "grad_norm": 4.51248836517334, + "learning_rate": 1.848503740648379e-05, + "loss": 0.4476, + "step": 6080 + }, + { + "epoch": 1.518703241895262, + "grad_norm": 5.466489791870117, + "learning_rate": 1.8482543640897758e-05, + "loss": 0.4742, + "step": 6090 + }, + { + "epoch": 1.5211970074812968, + "grad_norm": 6.138864994049072, + "learning_rate": 1.848004987531172e-05, + "loss": 0.4634, + "step": 6100 + }, + { + "epoch": 1.5236907730673317, + "grad_norm": 4.4593892097473145, + "learning_rate": 1.847755610972569e-05, + "loss": 0.4097, + "step": 6110 + }, + { + "epoch": 1.5261845386533666, + "grad_norm": 5.734845161437988, + "learning_rate": 1.8475062344139652e-05, + "loss": 0.4536, + "step": 6120 + }, + { + "epoch": 1.5286783042394014, + "grad_norm": 5.039837837219238, + "learning_rate": 1.847256857855362e-05, + "loss": 0.4227, + "step": 6130 + }, + { + "epoch": 1.5311720698254363, + "grad_norm": 5.665532112121582, + "learning_rate": 1.8470074812967583e-05, + "loss": 0.3996, + "step": 6140 + }, + { + "epoch": 1.5336658354114712, + "grad_norm": 8.553827285766602, + "learning_rate": 1.846758104738155e-05, + "loss": 0.5689, + "step": 6150 + }, + { + "epoch": 1.5361596009975063, + "grad_norm": 10.087984085083008, + "learning_rate": 1.8465087281795513e-05, + "loss": 0.3943, + "step": 6160 + }, + { + "epoch": 1.5386533665835411, + "grad_norm": 4.186636924743652, + "learning_rate": 1.8462593516209477e-05, + "loss": 0.3991, + "step": 6170 + }, + { + "epoch": 1.5411471321695762, + "grad_norm": 6.2279839515686035, + "learning_rate": 1.8460099750623444e-05, + "loss": 0.4065, + "step": 6180 + }, + { + "epoch": 1.543640897755611, + "grad_norm": 4.494476795196533, + "learning_rate": 1.8457605985037408e-05, + "loss": 0.3763, + "step": 6190 + }, + { + "epoch": 1.546134663341646, + "grad_norm": 4.94999885559082, + "learning_rate": 1.845511221945137e-05, + "loss": 0.4988, + "step": 6200 + }, + { + "epoch": 1.5486284289276808, + "grad_norm": 5.871788024902344, + "learning_rate": 1.8452618453865338e-05, + "loss": 0.4231, + "step": 6210 + }, + { + "epoch": 1.5511221945137157, + "grad_norm": 5.616420269012451, + "learning_rate": 1.8450124688279302e-05, + "loss": 0.372, + "step": 6220 + }, + { + "epoch": 1.5536159600997506, + "grad_norm": 4.9915971755981445, + "learning_rate": 1.844763092269327e-05, + "loss": 0.3971, + "step": 6230 + }, + { + "epoch": 1.5561097256857854, + "grad_norm": 4.821728229522705, + "learning_rate": 1.8445137157107232e-05, + "loss": 0.4182, + "step": 6240 + }, + { + "epoch": 1.5586034912718203, + "grad_norm": 8.948670387268066, + "learning_rate": 1.8442643391521196e-05, + "loss": 0.4545, + "step": 6250 + }, + { + "epoch": 1.5610972568578554, + "grad_norm": 6.1959228515625, + "learning_rate": 1.8440149625935163e-05, + "loss": 0.397, + "step": 6260 + }, + { + "epoch": 1.5635910224438903, + "grad_norm": 4.38476037979126, + "learning_rate": 1.843765586034913e-05, + "loss": 0.3934, + "step": 6270 + }, + { + "epoch": 1.5660847880299253, + "grad_norm": 6.135573387145996, + "learning_rate": 1.8435162094763094e-05, + "loss": 0.4019, + "step": 6280 + }, + { + "epoch": 1.5685785536159602, + "grad_norm": 3.752264976501465, + "learning_rate": 1.843266832917706e-05, + "loss": 0.4041, + "step": 6290 + }, + { + "epoch": 1.571072319201995, + "grad_norm": 6.69175386428833, + "learning_rate": 1.8430174563591024e-05, + "loss": 0.3357, + "step": 6300 + }, + { + "epoch": 1.57356608478803, + "grad_norm": 3.5837090015411377, + "learning_rate": 1.842768079800499e-05, + "loss": 0.4172, + "step": 6310 + }, + { + "epoch": 1.5760598503740648, + "grad_norm": 4.377597332000732, + "learning_rate": 1.8425187032418955e-05, + "loss": 0.48, + "step": 6320 + }, + { + "epoch": 1.5785536159600997, + "grad_norm": 6.266040325164795, + "learning_rate": 1.842269326683292e-05, + "loss": 0.5284, + "step": 6330 + }, + { + "epoch": 1.5810473815461346, + "grad_norm": 6.752136707305908, + "learning_rate": 1.8420199501246886e-05, + "loss": 0.4682, + "step": 6340 + }, + { + "epoch": 1.5835411471321694, + "grad_norm": 4.733704090118408, + "learning_rate": 1.841770573566085e-05, + "loss": 0.345, + "step": 6350 + }, + { + "epoch": 1.5860349127182045, + "grad_norm": 6.0275397300720215, + "learning_rate": 1.8415211970074816e-05, + "loss": 0.5216, + "step": 6360 + }, + { + "epoch": 1.5885286783042394, + "grad_norm": 4.750929832458496, + "learning_rate": 1.841271820448878e-05, + "loss": 0.4312, + "step": 6370 + }, + { + "epoch": 1.5910224438902745, + "grad_norm": 3.941704511642456, + "learning_rate": 1.8410224438902743e-05, + "loss": 0.4652, + "step": 6380 + }, + { + "epoch": 1.5935162094763093, + "grad_norm": 6.078151226043701, + "learning_rate": 1.840773067331671e-05, + "loss": 0.4478, + "step": 6390 + }, + { + "epoch": 1.5960099750623442, + "grad_norm": 5.187417030334473, + "learning_rate": 1.8405236907730674e-05, + "loss": 0.4094, + "step": 6400 + }, + { + "epoch": 1.598503740648379, + "grad_norm": 6.688471794128418, + "learning_rate": 1.8402743142144638e-05, + "loss": 0.4707, + "step": 6410 + }, + { + "epoch": 1.600997506234414, + "grad_norm": 4.863025188446045, + "learning_rate": 1.8400249376558605e-05, + "loss": 0.4326, + "step": 6420 + }, + { + "epoch": 1.6034912718204488, + "grad_norm": 6.477893352508545, + "learning_rate": 1.8397755610972568e-05, + "loss": 0.4071, + "step": 6430 + }, + { + "epoch": 1.6059850374064837, + "grad_norm": 3.6920061111450195, + "learning_rate": 1.8395261845386535e-05, + "loss": 0.4258, + "step": 6440 + }, + { + "epoch": 1.6084788029925186, + "grad_norm": 5.195054531097412, + "learning_rate": 1.83927680798005e-05, + "loss": 0.4388, + "step": 6450 + }, + { + "epoch": 1.6109725685785536, + "grad_norm": 6.680331707000732, + "learning_rate": 1.8390274314214466e-05, + "loss": 0.4053, + "step": 6460 + }, + { + "epoch": 1.6134663341645885, + "grad_norm": 4.663398742675781, + "learning_rate": 1.838778054862843e-05, + "loss": 0.457, + "step": 6470 + }, + { + "epoch": 1.6159600997506236, + "grad_norm": 4.989374160766602, + "learning_rate": 1.8385286783042397e-05, + "loss": 0.451, + "step": 6480 + }, + { + "epoch": 1.6184538653366585, + "grad_norm": 7.583066940307617, + "learning_rate": 1.838279301745636e-05, + "loss": 0.3729, + "step": 6490 + }, + { + "epoch": 1.6209476309226933, + "grad_norm": 4.013853073120117, + "learning_rate": 1.8380299251870327e-05, + "loss": 0.4288, + "step": 6500 + }, + { + "epoch": 1.6234413965087282, + "grad_norm": 4.080069065093994, + "learning_rate": 1.837780548628429e-05, + "loss": 0.3728, + "step": 6510 + }, + { + "epoch": 1.625935162094763, + "grad_norm": 6.781628131866455, + "learning_rate": 1.8375311720698258e-05, + "loss": 0.4198, + "step": 6520 + }, + { + "epoch": 1.628428927680798, + "grad_norm": 5.727022647857666, + "learning_rate": 1.837281795511222e-05, + "loss": 0.5096, + "step": 6530 + }, + { + "epoch": 1.6309226932668328, + "grad_norm": 4.232419967651367, + "learning_rate": 1.8370324189526185e-05, + "loss": 0.4087, + "step": 6540 + }, + { + "epoch": 1.6334164588528677, + "grad_norm": 6.57977294921875, + "learning_rate": 1.8367830423940152e-05, + "loss": 0.5512, + "step": 6550 + }, + { + "epoch": 1.6359102244389028, + "grad_norm": 5.6456427574157715, + "learning_rate": 1.8365336658354116e-05, + "loss": 0.448, + "step": 6560 + }, + { + "epoch": 1.6384039900249376, + "grad_norm": 5.072854518890381, + "learning_rate": 1.8362842892768083e-05, + "loss": 0.4374, + "step": 6570 + }, + { + "epoch": 1.6408977556109727, + "grad_norm": 6.125659465789795, + "learning_rate": 1.8360349127182046e-05, + "loss": 0.4616, + "step": 6580 + }, + { + "epoch": 1.6433915211970076, + "grad_norm": 5.4351301193237305, + "learning_rate": 1.835785536159601e-05, + "loss": 0.561, + "step": 6590 + }, + { + "epoch": 1.6458852867830425, + "grad_norm": 5.115979194641113, + "learning_rate": 1.8355361596009977e-05, + "loss": 0.4454, + "step": 6600 + }, + { + "epoch": 1.6483790523690773, + "grad_norm": 5.563187599182129, + "learning_rate": 1.835286783042394e-05, + "loss": 0.4437, + "step": 6610 + }, + { + "epoch": 1.6508728179551122, + "grad_norm": 8.810982704162598, + "learning_rate": 1.8350374064837907e-05, + "loss": 0.472, + "step": 6620 + }, + { + "epoch": 1.653366583541147, + "grad_norm": 6.700620174407959, + "learning_rate": 1.834788029925187e-05, + "loss": 0.4538, + "step": 6630 + }, + { + "epoch": 1.655860349127182, + "grad_norm": 5.371676921844482, + "learning_rate": 1.8345386533665838e-05, + "loss": 0.4628, + "step": 6640 + }, + { + "epoch": 1.6583541147132168, + "grad_norm": 6.457888126373291, + "learning_rate": 1.8342892768079802e-05, + "loss": 0.3982, + "step": 6650 + }, + { + "epoch": 1.660847880299252, + "grad_norm": 3.3349645137786865, + "learning_rate": 1.834039900249377e-05, + "loss": 0.5806, + "step": 6660 + }, + { + "epoch": 1.6633416458852868, + "grad_norm": 4.4449334144592285, + "learning_rate": 1.8337905236907732e-05, + "loss": 0.3955, + "step": 6670 + }, + { + "epoch": 1.6658354114713219, + "grad_norm": 6.8672776222229, + "learning_rate": 1.83354114713217e-05, + "loss": 0.4466, + "step": 6680 + }, + { + "epoch": 1.6683291770573567, + "grad_norm": 8.786575317382812, + "learning_rate": 1.8332917705735663e-05, + "loss": 0.4113, + "step": 6690 + }, + { + "epoch": 1.6708229426433916, + "grad_norm": 5.177858352661133, + "learning_rate": 1.8330423940149627e-05, + "loss": 0.4509, + "step": 6700 + }, + { + "epoch": 1.6733167082294265, + "grad_norm": 8.495447158813477, + "learning_rate": 1.8327930174563594e-05, + "loss": 0.4024, + "step": 6710 + }, + { + "epoch": 1.6758104738154613, + "grad_norm": 3.8897151947021484, + "learning_rate": 1.8325436408977557e-05, + "loss": 0.4081, + "step": 6720 + }, + { + "epoch": 1.6783042394014962, + "grad_norm": 7.075688362121582, + "learning_rate": 1.8322942643391524e-05, + "loss": 0.4058, + "step": 6730 + }, + { + "epoch": 1.680798004987531, + "grad_norm": 5.916785717010498, + "learning_rate": 1.8320448877805488e-05, + "loss": 0.4553, + "step": 6740 + }, + { + "epoch": 1.683291770573566, + "grad_norm": 6.220627307891846, + "learning_rate": 1.831795511221945e-05, + "loss": 0.3365, + "step": 6750 + }, + { + "epoch": 1.685785536159601, + "grad_norm": 5.613668441772461, + "learning_rate": 1.831546134663342e-05, + "loss": 0.4703, + "step": 6760 + }, + { + "epoch": 1.688279301745636, + "grad_norm": 5.4366841316223145, + "learning_rate": 1.8312967581047382e-05, + "loss": 0.4233, + "step": 6770 + }, + { + "epoch": 1.690773067331671, + "grad_norm": 7.235218524932861, + "learning_rate": 1.8310473815461346e-05, + "loss": 0.4584, + "step": 6780 + }, + { + "epoch": 1.6932668329177059, + "grad_norm": 5.918466567993164, + "learning_rate": 1.8307980049875313e-05, + "loss": 0.4717, + "step": 6790 + }, + { + "epoch": 1.6957605985037407, + "grad_norm": 7.333720684051514, + "learning_rate": 1.8305486284289276e-05, + "loss": 0.4562, + "step": 6800 + }, + { + "epoch": 1.6982543640897756, + "grad_norm": 4.598586082458496, + "learning_rate": 1.8302992518703243e-05, + "loss": 0.3797, + "step": 6810 + }, + { + "epoch": 1.7007481296758105, + "grad_norm": 6.30976676940918, + "learning_rate": 1.8300498753117207e-05, + "loss": 0.5182, + "step": 6820 + }, + { + "epoch": 1.7032418952618453, + "grad_norm": 5.384555339813232, + "learning_rate": 1.8298004987531174e-05, + "loss": 0.3699, + "step": 6830 + }, + { + "epoch": 1.7057356608478802, + "grad_norm": 6.011791229248047, + "learning_rate": 1.8295511221945138e-05, + "loss": 0.4675, + "step": 6840 + }, + { + "epoch": 1.708229426433915, + "grad_norm": 5.3825883865356445, + "learning_rate": 1.8293017456359105e-05, + "loss": 0.4217, + "step": 6850 + }, + { + "epoch": 1.7107231920199502, + "grad_norm": 5.053082466125488, + "learning_rate": 1.829052369077307e-05, + "loss": 0.5013, + "step": 6860 + }, + { + "epoch": 1.713216957605985, + "grad_norm": 3.9529480934143066, + "learning_rate": 1.8288029925187035e-05, + "loss": 0.5152, + "step": 6870 + }, + { + "epoch": 1.7157107231920201, + "grad_norm": 7.375166416168213, + "learning_rate": 1.8285536159601e-05, + "loss": 0.3735, + "step": 6880 + }, + { + "epoch": 1.718204488778055, + "grad_norm": 6.015711307525635, + "learning_rate": 1.8283042394014966e-05, + "loss": 0.3918, + "step": 6890 + }, + { + "epoch": 1.7206982543640899, + "grad_norm": 4.770452499389648, + "learning_rate": 1.828054862842893e-05, + "loss": 0.4266, + "step": 6900 + }, + { + "epoch": 1.7231920199501247, + "grad_norm": 4.618214130401611, + "learning_rate": 1.8278054862842893e-05, + "loss": 0.4717, + "step": 6910 + }, + { + "epoch": 1.7256857855361596, + "grad_norm": 4.1790924072265625, + "learning_rate": 1.827556109725686e-05, + "loss": 0.4128, + "step": 6920 + }, + { + "epoch": 1.7281795511221945, + "grad_norm": 6.608766078948975, + "learning_rate": 1.8273067331670824e-05, + "loss": 0.4591, + "step": 6930 + }, + { + "epoch": 1.7306733167082293, + "grad_norm": 8.431743621826172, + "learning_rate": 1.827057356608479e-05, + "loss": 0.5575, + "step": 6940 + }, + { + "epoch": 1.7331670822942642, + "grad_norm": 5.013129711151123, + "learning_rate": 1.8268079800498754e-05, + "loss": 0.3743, + "step": 6950 + }, + { + "epoch": 1.7356608478802993, + "grad_norm": 4.251693248748779, + "learning_rate": 1.8265586034912718e-05, + "loss": 0.4133, + "step": 6960 + }, + { + "epoch": 1.7381546134663342, + "grad_norm": 4.549345970153809, + "learning_rate": 1.8263092269326685e-05, + "loss": 0.5037, + "step": 6970 + }, + { + "epoch": 1.7406483790523692, + "grad_norm": 6.431280612945557, + "learning_rate": 1.826059850374065e-05, + "loss": 0.3989, + "step": 6980 + }, + { + "epoch": 1.7431421446384041, + "grad_norm": 5.792124271392822, + "learning_rate": 1.8258104738154615e-05, + "loss": 0.4512, + "step": 6990 + }, + { + "epoch": 1.745635910224439, + "grad_norm": 8.254396438598633, + "learning_rate": 1.825561097256858e-05, + "loss": 0.4982, + "step": 7000 + }, + { + "epoch": 1.7481296758104738, + "grad_norm": 4.77825927734375, + "learning_rate": 1.8253117206982546e-05, + "loss": 0.4053, + "step": 7010 + }, + { + "epoch": 1.7506234413965087, + "grad_norm": 6.53203821182251, + "learning_rate": 1.825062344139651e-05, + "loss": 0.4921, + "step": 7020 + }, + { + "epoch": 1.7531172069825436, + "grad_norm": 4.8709940910339355, + "learning_rate": 1.8248129675810477e-05, + "loss": 0.4883, + "step": 7030 + }, + { + "epoch": 1.7556109725685785, + "grad_norm": 4.446110725402832, + "learning_rate": 1.824563591022444e-05, + "loss": 0.4667, + "step": 7040 + }, + { + "epoch": 1.7581047381546133, + "grad_norm": 6.364370346069336, + "learning_rate": 1.8243142144638407e-05, + "loss": 0.5304, + "step": 7050 + }, + { + "epoch": 1.7605985037406484, + "grad_norm": 5.594391822814941, + "learning_rate": 1.824064837905237e-05, + "loss": 0.4105, + "step": 7060 + }, + { + "epoch": 1.7630922693266833, + "grad_norm": 4.667792320251465, + "learning_rate": 1.8238154613466338e-05, + "loss": 0.445, + "step": 7070 + }, + { + "epoch": 1.7655860349127181, + "grad_norm": 30.49921417236328, + "learning_rate": 1.82356608478803e-05, + "loss": 0.5001, + "step": 7080 + }, + { + "epoch": 1.7680798004987532, + "grad_norm": 4.282893657684326, + "learning_rate": 1.8233167082294265e-05, + "loss": 0.428, + "step": 7090 + }, + { + "epoch": 1.770573566084788, + "grad_norm": 6.280821323394775, + "learning_rate": 1.8230673316708232e-05, + "loss": 0.4246, + "step": 7100 + }, + { + "epoch": 1.773067331670823, + "grad_norm": 5.7564005851745605, + "learning_rate": 1.8228179551122196e-05, + "loss": 0.5076, + "step": 7110 + }, + { + "epoch": 1.7755610972568578, + "grad_norm": 5.439352035522461, + "learning_rate": 1.822568578553616e-05, + "loss": 0.448, + "step": 7120 + }, + { + "epoch": 1.7780548628428927, + "grad_norm": 4.840354919433594, + "learning_rate": 1.8223192019950126e-05, + "loss": 0.4518, + "step": 7130 + }, + { + "epoch": 1.7805486284289276, + "grad_norm": 4.0967631340026855, + "learning_rate": 1.822069825436409e-05, + "loss": 0.441, + "step": 7140 + }, + { + "epoch": 1.7830423940149625, + "grad_norm": 5.563769340515137, + "learning_rate": 1.8218204488778057e-05, + "loss": 0.5182, + "step": 7150 + }, + { + "epoch": 1.7855361596009975, + "grad_norm": 6.383294582366943, + "learning_rate": 1.821571072319202e-05, + "loss": 0.4211, + "step": 7160 + }, + { + "epoch": 1.7880299251870324, + "grad_norm": 4.494612216949463, + "learning_rate": 1.8213216957605984e-05, + "loss": 0.426, + "step": 7170 + }, + { + "epoch": 1.7905236907730673, + "grad_norm": 4.303411483764648, + "learning_rate": 1.821072319201995e-05, + "loss": 0.3868, + "step": 7180 + }, + { + "epoch": 1.7930174563591024, + "grad_norm": 7.266903400421143, + "learning_rate": 1.8208229426433915e-05, + "loss": 0.4369, + "step": 7190 + }, + { + "epoch": 1.7955112219451372, + "grad_norm": 4.56803560256958, + "learning_rate": 1.8205735660847882e-05, + "loss": 0.4737, + "step": 7200 + }, + { + "epoch": 1.798004987531172, + "grad_norm": 5.030318260192871, + "learning_rate": 1.820324189526185e-05, + "loss": 0.5418, + "step": 7210 + }, + { + "epoch": 1.800498753117207, + "grad_norm": 5.952413558959961, + "learning_rate": 1.8200748129675813e-05, + "loss": 0.3784, + "step": 7220 + }, + { + "epoch": 1.8029925187032418, + "grad_norm": 13.113987922668457, + "learning_rate": 1.819825436408978e-05, + "loss": 0.4306, + "step": 7230 + }, + { + "epoch": 1.8054862842892767, + "grad_norm": 4.897136688232422, + "learning_rate": 1.8195760598503743e-05, + "loss": 0.4035, + "step": 7240 + }, + { + "epoch": 1.8079800498753116, + "grad_norm": 4.879326343536377, + "learning_rate": 1.8193266832917707e-05, + "loss": 0.4381, + "step": 7250 + }, + { + "epoch": 1.8104738154613467, + "grad_norm": 4.985284328460693, + "learning_rate": 1.8190773067331674e-05, + "loss": 0.4622, + "step": 7260 + }, + { + "epoch": 1.8129675810473815, + "grad_norm": 6.7815775871276855, + "learning_rate": 1.8188279301745637e-05, + "loss": 0.5456, + "step": 7270 + }, + { + "epoch": 1.8154613466334164, + "grad_norm": 7.435234069824219, + "learning_rate": 1.81857855361596e-05, + "loss": 0.4235, + "step": 7280 + }, + { + "epoch": 1.8179551122194515, + "grad_norm": 4.0493483543396, + "learning_rate": 1.8183291770573568e-05, + "loss": 0.4922, + "step": 7290 + }, + { + "epoch": 1.8204488778054864, + "grad_norm": 7.476971626281738, + "learning_rate": 1.818079800498753e-05, + "loss": 0.3972, + "step": 7300 + }, + { + "epoch": 1.8229426433915212, + "grad_norm": 5.938323497772217, + "learning_rate": 1.81783042394015e-05, + "loss": 0.4129, + "step": 7310 + }, + { + "epoch": 1.825436408977556, + "grad_norm": 6.04447078704834, + "learning_rate": 1.8175810473815462e-05, + "loss": 0.4718, + "step": 7320 + }, + { + "epoch": 1.827930174563591, + "grad_norm": 11.774287223815918, + "learning_rate": 1.8173316708229426e-05, + "loss": 0.4558, + "step": 7330 + }, + { + "epoch": 1.8304239401496258, + "grad_norm": 3.780583381652832, + "learning_rate": 1.8170822942643393e-05, + "loss": 0.4411, + "step": 7340 + }, + { + "epoch": 1.8329177057356607, + "grad_norm": 3.6571693420410156, + "learning_rate": 1.8168329177057356e-05, + "loss": 0.4096, + "step": 7350 + }, + { + "epoch": 1.8354114713216958, + "grad_norm": 7.564289093017578, + "learning_rate": 1.8165835411471323e-05, + "loss": 0.3931, + "step": 7360 + }, + { + "epoch": 1.8379052369077307, + "grad_norm": 7.691397666931152, + "learning_rate": 1.8163341645885287e-05, + "loss": 0.4236, + "step": 7370 + }, + { + "epoch": 1.8403990024937655, + "grad_norm": 5.345376968383789, + "learning_rate": 1.8160847880299254e-05, + "loss": 0.4377, + "step": 7380 + }, + { + "epoch": 1.8428927680798006, + "grad_norm": 11.432061195373535, + "learning_rate": 1.8158354114713218e-05, + "loss": 0.4218, + "step": 7390 + }, + { + "epoch": 1.8453865336658355, + "grad_norm": 4.74954080581665, + "learning_rate": 1.8155860349127185e-05, + "loss": 0.4077, + "step": 7400 + }, + { + "epoch": 1.8478802992518704, + "grad_norm": 4.802658557891846, + "learning_rate": 1.815336658354115e-05, + "loss": 0.4862, + "step": 7410 + }, + { + "epoch": 1.8503740648379052, + "grad_norm": 5.43306303024292, + "learning_rate": 1.8150872817955115e-05, + "loss": 0.4187, + "step": 7420 + }, + { + "epoch": 1.85286783042394, + "grad_norm": 3.525193929672241, + "learning_rate": 1.814837905236908e-05, + "loss": 0.3819, + "step": 7430 + }, + { + "epoch": 1.855361596009975, + "grad_norm": 6.635593891143799, + "learning_rate": 1.8145885286783046e-05, + "loss": 0.4183, + "step": 7440 + }, + { + "epoch": 1.8578553615960098, + "grad_norm": 6.29930305480957, + "learning_rate": 1.814339152119701e-05, + "loss": 0.5308, + "step": 7450 + }, + { + "epoch": 1.860349127182045, + "grad_norm": 5.674611568450928, + "learning_rate": 1.8140897755610973e-05, + "loss": 0.4003, + "step": 7460 + }, + { + "epoch": 1.8628428927680798, + "grad_norm": 9.636392593383789, + "learning_rate": 1.813840399002494e-05, + "loss": 0.4318, + "step": 7470 + }, + { + "epoch": 1.8653366583541147, + "grad_norm": 4.685451030731201, + "learning_rate": 1.8135910224438904e-05, + "loss": 0.4437, + "step": 7480 + }, + { + "epoch": 1.8678304239401498, + "grad_norm": 8.09288501739502, + "learning_rate": 1.8133416458852867e-05, + "loss": 0.5241, + "step": 7490 + }, + { + "epoch": 1.8703241895261846, + "grad_norm": 4.083009243011475, + "learning_rate": 1.8130922693266834e-05, + "loss": 0.3948, + "step": 7500 + }, + { + "epoch": 1.8728179551122195, + "grad_norm": 6.581864833831787, + "learning_rate": 1.8128428927680798e-05, + "loss": 0.5274, + "step": 7510 + }, + { + "epoch": 1.8753117206982544, + "grad_norm": 5.297054767608643, + "learning_rate": 1.8125935162094765e-05, + "loss": 0.4802, + "step": 7520 + }, + { + "epoch": 1.8778054862842892, + "grad_norm": 5.629668235778809, + "learning_rate": 1.812344139650873e-05, + "loss": 0.4183, + "step": 7530 + }, + { + "epoch": 1.880299251870324, + "grad_norm": 5.578089237213135, + "learning_rate": 1.8120947630922692e-05, + "loss": 0.4061, + "step": 7540 + }, + { + "epoch": 1.882793017456359, + "grad_norm": 6.019644260406494, + "learning_rate": 1.811845386533666e-05, + "loss": 0.4339, + "step": 7550 + }, + { + "epoch": 1.885286783042394, + "grad_norm": 6.0486016273498535, + "learning_rate": 1.8115960099750626e-05, + "loss": 0.3962, + "step": 7560 + }, + { + "epoch": 1.887780548628429, + "grad_norm": 12.667963027954102, + "learning_rate": 1.811346633416459e-05, + "loss": 0.4148, + "step": 7570 + }, + { + "epoch": 1.8902743142144638, + "grad_norm": 5.244442462921143, + "learning_rate": 1.8110972568578557e-05, + "loss": 0.4431, + "step": 7580 + }, + { + "epoch": 1.8927680798004989, + "grad_norm": 5.28915548324585, + "learning_rate": 1.810847880299252e-05, + "loss": 0.4288, + "step": 7590 + }, + { + "epoch": 1.8952618453865338, + "grad_norm": 5.8130717277526855, + "learning_rate": 1.8105985037406487e-05, + "loss": 0.4385, + "step": 7600 + }, + { + "epoch": 1.8977556109725686, + "grad_norm": 9.284071922302246, + "learning_rate": 1.810349127182045e-05, + "loss": 0.4476, + "step": 7610 + }, + { + "epoch": 1.9002493765586035, + "grad_norm": 4.337800979614258, + "learning_rate": 1.8100997506234415e-05, + "loss": 0.4499, + "step": 7620 + }, + { + "epoch": 1.9027431421446384, + "grad_norm": 5.436014652252197, + "learning_rate": 1.8098503740648382e-05, + "loss": 0.4046, + "step": 7630 + }, + { + "epoch": 1.9052369077306732, + "grad_norm": 5.804083347320557, + "learning_rate": 1.8096009975062345e-05, + "loss": 0.4042, + "step": 7640 + }, + { + "epoch": 1.907730673316708, + "grad_norm": 5.294909477233887, + "learning_rate": 1.8093516209476312e-05, + "loss": 0.4618, + "step": 7650 + }, + { + "epoch": 1.9102244389027432, + "grad_norm": 5.705154895782471, + "learning_rate": 1.8091022443890276e-05, + "loss": 0.3835, + "step": 7660 + }, + { + "epoch": 1.912718204488778, + "grad_norm": 7.614785194396973, + "learning_rate": 1.808852867830424e-05, + "loss": 0.4388, + "step": 7670 + }, + { + "epoch": 1.915211970074813, + "grad_norm": 9.722792625427246, + "learning_rate": 1.8086034912718207e-05, + "loss": 0.4568, + "step": 7680 + }, + { + "epoch": 1.917705735660848, + "grad_norm": 5.578179836273193, + "learning_rate": 1.808354114713217e-05, + "loss": 0.3962, + "step": 7690 + }, + { + "epoch": 1.9201995012468829, + "grad_norm": 5.2010178565979, + "learning_rate": 1.8081047381546134e-05, + "loss": 0.4123, + "step": 7700 + }, + { + "epoch": 1.9226932668329177, + "grad_norm": 4.397924423217773, + "learning_rate": 1.80785536159601e-05, + "loss": 0.4321, + "step": 7710 + }, + { + "epoch": 1.9251870324189526, + "grad_norm": 4.920018196105957, + "learning_rate": 1.8076059850374064e-05, + "loss": 0.4054, + "step": 7720 + }, + { + "epoch": 1.9276807980049875, + "grad_norm": 3.409426212310791, + "learning_rate": 1.807356608478803e-05, + "loss": 0.4464, + "step": 7730 + }, + { + "epoch": 1.9301745635910224, + "grad_norm": 7.432455062866211, + "learning_rate": 1.8071072319201995e-05, + "loss": 0.4521, + "step": 7740 + }, + { + "epoch": 1.9326683291770572, + "grad_norm": 4.5080485343933105, + "learning_rate": 1.8068578553615962e-05, + "loss": 0.4328, + "step": 7750 + }, + { + "epoch": 1.9351620947630923, + "grad_norm": 5.043903827667236, + "learning_rate": 1.8066084788029926e-05, + "loss": 0.4508, + "step": 7760 + }, + { + "epoch": 1.9376558603491272, + "grad_norm": 6.6283369064331055, + "learning_rate": 1.8063591022443893e-05, + "loss": 0.464, + "step": 7770 + }, + { + "epoch": 1.940149625935162, + "grad_norm": 4.977077007293701, + "learning_rate": 1.8061097256857856e-05, + "loss": 0.5318, + "step": 7780 + }, + { + "epoch": 1.9426433915211971, + "grad_norm": 4.694863796234131, + "learning_rate": 1.8058603491271823e-05, + "loss": 0.3441, + "step": 7790 + }, + { + "epoch": 1.945137157107232, + "grad_norm": 5.31052303314209, + "learning_rate": 1.8056109725685787e-05, + "loss": 0.4264, + "step": 7800 + }, + { + "epoch": 1.9476309226932669, + "grad_norm": 5.178094863891602, + "learning_rate": 1.8053615960099754e-05, + "loss": 0.3537, + "step": 7810 + }, + { + "epoch": 1.9501246882793017, + "grad_norm": 11.527572631835938, + "learning_rate": 1.8051122194513718e-05, + "loss": 0.5116, + "step": 7820 + }, + { + "epoch": 1.9526184538653366, + "grad_norm": 5.841403961181641, + "learning_rate": 1.804862842892768e-05, + "loss": 0.4658, + "step": 7830 + }, + { + "epoch": 1.9551122194513715, + "grad_norm": 8.75449275970459, + "learning_rate": 1.8046134663341648e-05, + "loss": 0.449, + "step": 7840 + }, + { + "epoch": 1.9576059850374063, + "grad_norm": 7.627124309539795, + "learning_rate": 1.8043640897755612e-05, + "loss": 0.4512, + "step": 7850 + }, + { + "epoch": 1.9600997506234414, + "grad_norm": 7.299831867218018, + "learning_rate": 1.804114713216958e-05, + "loss": 0.4311, + "step": 7860 + }, + { + "epoch": 1.9625935162094763, + "grad_norm": 5.300114154815674, + "learning_rate": 1.8038653366583542e-05, + "loss": 0.3713, + "step": 7870 + }, + { + "epoch": 1.9650872817955112, + "grad_norm": 5.638365745544434, + "learning_rate": 1.8036159600997506e-05, + "loss": 0.5088, + "step": 7880 + }, + { + "epoch": 1.9675810473815463, + "grad_norm": 6.054098606109619, + "learning_rate": 1.8033665835411473e-05, + "loss": 0.4084, + "step": 7890 + }, + { + "epoch": 1.9700748129675811, + "grad_norm": 7.920881271362305, + "learning_rate": 1.8031172069825437e-05, + "loss": 0.4036, + "step": 7900 + }, + { + "epoch": 1.972568578553616, + "grad_norm": 5.523858070373535, + "learning_rate": 1.8028678304239404e-05, + "loss": 0.4152, + "step": 7910 + }, + { + "epoch": 1.9750623441396509, + "grad_norm": 7.041183948516846, + "learning_rate": 1.8026184538653367e-05, + "loss": 0.314, + "step": 7920 + }, + { + "epoch": 1.9775561097256857, + "grad_norm": 5.018404960632324, + "learning_rate": 1.8023690773067334e-05, + "loss": 0.4854, + "step": 7930 + }, + { + "epoch": 1.9800498753117206, + "grad_norm": 6.31093168258667, + "learning_rate": 1.8021446384039903e-05, + "loss": 0.4654, + "step": 7940 + }, + { + "epoch": 1.9825436408977555, + "grad_norm": 6.725535869598389, + "learning_rate": 1.8018952618453866e-05, + "loss": 0.3975, + "step": 7950 + }, + { + "epoch": 1.9850374064837906, + "grad_norm": 7.007317066192627, + "learning_rate": 1.801645885286783e-05, + "loss": 0.4361, + "step": 7960 + }, + { + "epoch": 1.9875311720698254, + "grad_norm": 4.911192893981934, + "learning_rate": 1.8013965087281797e-05, + "loss": 0.4742, + "step": 7970 + }, + { + "epoch": 1.9900249376558603, + "grad_norm": 6.7747955322265625, + "learning_rate": 1.801147132169576e-05, + "loss": 0.4091, + "step": 7980 + }, + { + "epoch": 1.9925187032418954, + "grad_norm": 5.228945732116699, + "learning_rate": 1.8008977556109727e-05, + "loss": 0.4404, + "step": 7990 + }, + { + "epoch": 1.9950124688279303, + "grad_norm": 6.581811428070068, + "learning_rate": 1.800648379052369e-05, + "loss": 0.4312, + "step": 8000 + }, + { + "epoch": 1.9975062344139651, + "grad_norm": 5.50934362411499, + "learning_rate": 1.8003990024937658e-05, + "loss": 0.3873, + "step": 8010 + }, + { + "epoch": 2.0, + "grad_norm": 6.261854648590088, + "learning_rate": 1.800149625935162e-05, + "loss": 0.4136, + "step": 8020 + }, + { + "epoch": 2.0, + "eval_loss": 0.4465249180793762, + "eval_runtime": 59.866, + "eval_samples_per_second": 16.754, + "eval_steps_per_second": 16.754, + "step": 8020 + }, + { + "epoch": 2.002493765586035, + "grad_norm": 3.6723527908325195, + "learning_rate": 1.799900249376559e-05, + "loss": 0.3571, + "step": 8030 + }, + { + "epoch": 2.0049875311720697, + "grad_norm": 5.929795265197754, + "learning_rate": 1.7996508728179552e-05, + "loss": 0.5315, + "step": 8040 + }, + { + "epoch": 2.0074812967581046, + "grad_norm": 6.144455909729004, + "learning_rate": 1.799401496259352e-05, + "loss": 0.4404, + "step": 8050 + }, + { + "epoch": 2.0099750623441395, + "grad_norm": 6.373076438903809, + "learning_rate": 1.7991521197007483e-05, + "loss": 0.374, + "step": 8060 + }, + { + "epoch": 2.0124688279301743, + "grad_norm": 3.5335922241210938, + "learning_rate": 1.798902743142145e-05, + "loss": 0.3972, + "step": 8070 + }, + { + "epoch": 2.0149625935162097, + "grad_norm": 6.1185479164123535, + "learning_rate": 1.7986533665835413e-05, + "loss": 0.4131, + "step": 8080 + }, + { + "epoch": 2.0174563591022445, + "grad_norm": 5.0511345863342285, + "learning_rate": 1.7984039900249377e-05, + "loss": 0.4324, + "step": 8090 + }, + { + "epoch": 2.0199501246882794, + "grad_norm": 7.270425796508789, + "learning_rate": 1.7981546134663344e-05, + "loss": 0.3787, + "step": 8100 + }, + { + "epoch": 2.0224438902743143, + "grad_norm": 4.606263637542725, + "learning_rate": 1.7979052369077308e-05, + "loss": 0.3524, + "step": 8110 + }, + { + "epoch": 2.024937655860349, + "grad_norm": 5.3458452224731445, + "learning_rate": 1.7976558603491275e-05, + "loss": 0.4371, + "step": 8120 + }, + { + "epoch": 2.027431421446384, + "grad_norm": 4.736631870269775, + "learning_rate": 1.797406483790524e-05, + "loss": 0.3898, + "step": 8130 + }, + { + "epoch": 2.029925187032419, + "grad_norm": 5.234148025512695, + "learning_rate": 1.7971571072319202e-05, + "loss": 0.4307, + "step": 8140 + }, + { + "epoch": 2.0324189526184537, + "grad_norm": 6.201371669769287, + "learning_rate": 1.796907730673317e-05, + "loss": 0.4022, + "step": 8150 + }, + { + "epoch": 2.0349127182044886, + "grad_norm": 5.895164489746094, + "learning_rate": 1.7966583541147133e-05, + "loss": 0.4254, + "step": 8160 + }, + { + "epoch": 2.037406483790524, + "grad_norm": 4.677857398986816, + "learning_rate": 1.7964089775561096e-05, + "loss": 0.5048, + "step": 8170 + }, + { + "epoch": 2.039900249376559, + "grad_norm": 4.634955406188965, + "learning_rate": 1.7961596009975063e-05, + "loss": 0.4376, + "step": 8180 + }, + { + "epoch": 2.0423940149625937, + "grad_norm": 5.4965667724609375, + "learning_rate": 1.7959102244389027e-05, + "loss": 0.3972, + "step": 8190 + }, + { + "epoch": 2.0448877805486285, + "grad_norm": 5.892706394195557, + "learning_rate": 1.7956608478802994e-05, + "loss": 0.3795, + "step": 8200 + }, + { + "epoch": 2.0473815461346634, + "grad_norm": 6.747429370880127, + "learning_rate": 1.7954114713216957e-05, + "loss": 0.394, + "step": 8210 + }, + { + "epoch": 2.0498753117206983, + "grad_norm": 6.291106224060059, + "learning_rate": 1.7951620947630924e-05, + "loss": 0.4976, + "step": 8220 + }, + { + "epoch": 2.052369077306733, + "grad_norm": 5.840002536773682, + "learning_rate": 1.794912718204489e-05, + "loss": 0.4118, + "step": 8230 + }, + { + "epoch": 2.054862842892768, + "grad_norm": 9.487309455871582, + "learning_rate": 1.7946633416458855e-05, + "loss": 0.3606, + "step": 8240 + }, + { + "epoch": 2.057356608478803, + "grad_norm": 5.123715877532959, + "learning_rate": 1.7944139650872822e-05, + "loss": 0.4149, + "step": 8250 + }, + { + "epoch": 2.0598503740648377, + "grad_norm": 5.808225154876709, + "learning_rate": 1.7941645885286786e-05, + "loss": 0.4553, + "step": 8260 + }, + { + "epoch": 2.0623441396508726, + "grad_norm": 9.865265846252441, + "learning_rate": 1.793915211970075e-05, + "loss": 0.4352, + "step": 8270 + }, + { + "epoch": 2.064837905236908, + "grad_norm": 4.490272045135498, + "learning_rate": 1.7936658354114716e-05, + "loss": 0.4435, + "step": 8280 + }, + { + "epoch": 2.067331670822943, + "grad_norm": 4.552779197692871, + "learning_rate": 1.793416458852868e-05, + "loss": 0.3933, + "step": 8290 + }, + { + "epoch": 2.0698254364089776, + "grad_norm": 5.0049004554748535, + "learning_rate": 1.7931670822942644e-05, + "loss": 0.4585, + "step": 8300 + }, + { + "epoch": 2.0723192019950125, + "grad_norm": 5.8257904052734375, + "learning_rate": 1.792917705735661e-05, + "loss": 0.4453, + "step": 8310 + }, + { + "epoch": 2.0748129675810474, + "grad_norm": 5.575977802276611, + "learning_rate": 1.7926683291770574e-05, + "loss": 0.4536, + "step": 8320 + }, + { + "epoch": 2.0773067331670823, + "grad_norm": 4.55002498626709, + "learning_rate": 1.792418952618454e-05, + "loss": 0.4842, + "step": 8330 + }, + { + "epoch": 2.079800498753117, + "grad_norm": 6.223612308502197, + "learning_rate": 1.7921695760598505e-05, + "loss": 0.4209, + "step": 8340 + }, + { + "epoch": 2.082294264339152, + "grad_norm": 5.589142322540283, + "learning_rate": 1.791920199501247e-05, + "loss": 0.3957, + "step": 8350 + }, + { + "epoch": 2.084788029925187, + "grad_norm": 5.206849575042725, + "learning_rate": 1.7916708229426435e-05, + "loss": 0.4253, + "step": 8360 + }, + { + "epoch": 2.087281795511222, + "grad_norm": 7.368830680847168, + "learning_rate": 1.79142144638404e-05, + "loss": 0.391, + "step": 8370 + }, + { + "epoch": 2.089775561097257, + "grad_norm": 4.8193817138671875, + "learning_rate": 1.7911720698254366e-05, + "loss": 0.397, + "step": 8380 + }, + { + "epoch": 2.092269326683292, + "grad_norm": 5.361727237701416, + "learning_rate": 1.790922693266833e-05, + "loss": 0.4468, + "step": 8390 + }, + { + "epoch": 2.0947630922693268, + "grad_norm": 5.065185546875, + "learning_rate": 1.7906733167082297e-05, + "loss": 0.4142, + "step": 8400 + }, + { + "epoch": 2.0972568578553616, + "grad_norm": 4.190534591674805, + "learning_rate": 1.790423940149626e-05, + "loss": 0.3448, + "step": 8410 + }, + { + "epoch": 2.0997506234413965, + "grad_norm": 7.922725677490234, + "learning_rate": 1.7901745635910227e-05, + "loss": 0.3987, + "step": 8420 + }, + { + "epoch": 2.1022443890274314, + "grad_norm": 4.495791912078857, + "learning_rate": 1.789925187032419e-05, + "loss": 0.4028, + "step": 8430 + }, + { + "epoch": 2.1047381546134662, + "grad_norm": 5.718870639801025, + "learning_rate": 1.7896758104738158e-05, + "loss": 0.3954, + "step": 8440 + }, + { + "epoch": 2.107231920199501, + "grad_norm": 5.126749038696289, + "learning_rate": 1.789426433915212e-05, + "loss": 0.387, + "step": 8450 + }, + { + "epoch": 2.109725685785536, + "grad_norm": 6.903149604797363, + "learning_rate": 1.7891770573566085e-05, + "loss": 0.3494, + "step": 8460 + }, + { + "epoch": 2.112219451371571, + "grad_norm": 6.055934429168701, + "learning_rate": 1.7889276807980052e-05, + "loss": 0.4843, + "step": 8470 + }, + { + "epoch": 2.114713216957606, + "grad_norm": 5.180309295654297, + "learning_rate": 1.7886783042394016e-05, + "loss": 0.3912, + "step": 8480 + }, + { + "epoch": 2.117206982543641, + "grad_norm": 5.292367458343506, + "learning_rate": 1.7884289276807983e-05, + "loss": 0.4334, + "step": 8490 + }, + { + "epoch": 2.119700748129676, + "grad_norm": 6.745512008666992, + "learning_rate": 1.7881795511221946e-05, + "loss": 0.3954, + "step": 8500 + }, + { + "epoch": 2.1221945137157108, + "grad_norm": 6.295596122741699, + "learning_rate": 1.787930174563591e-05, + "loss": 0.3938, + "step": 8510 + }, + { + "epoch": 2.1246882793017456, + "grad_norm": 3.398513078689575, + "learning_rate": 1.7876807980049877e-05, + "loss": 0.3826, + "step": 8520 + }, + { + "epoch": 2.1271820448877805, + "grad_norm": 7.377808570861816, + "learning_rate": 1.787431421446384e-05, + "loss": 0.4062, + "step": 8530 + }, + { + "epoch": 2.1296758104738154, + "grad_norm": 5.2859272956848145, + "learning_rate": 1.7871820448877808e-05, + "loss": 0.3489, + "step": 8540 + }, + { + "epoch": 2.1321695760598502, + "grad_norm": 5.597499370574951, + "learning_rate": 1.786932668329177e-05, + "loss": 0.3924, + "step": 8550 + }, + { + "epoch": 2.134663341645885, + "grad_norm": 4.033814430236816, + "learning_rate": 1.7866832917705735e-05, + "loss": 0.401, + "step": 8560 + }, + { + "epoch": 2.1371571072319204, + "grad_norm": 10.128519058227539, + "learning_rate": 1.7864339152119702e-05, + "loss": 0.4635, + "step": 8570 + }, + { + "epoch": 2.1396508728179553, + "grad_norm": 3.9514145851135254, + "learning_rate": 1.786184538653367e-05, + "loss": 0.4217, + "step": 8580 + }, + { + "epoch": 2.14214463840399, + "grad_norm": 4.673630714416504, + "learning_rate": 1.7859351620947632e-05, + "loss": 0.4318, + "step": 8590 + }, + { + "epoch": 2.144638403990025, + "grad_norm": 5.571914196014404, + "learning_rate": 1.78568578553616e-05, + "loss": 0.414, + "step": 8600 + }, + { + "epoch": 2.14713216957606, + "grad_norm": 4.7045063972473145, + "learning_rate": 1.7854364089775563e-05, + "loss": 0.4471, + "step": 8610 + }, + { + "epoch": 2.1496259351620948, + "grad_norm": 4.17690372467041, + "learning_rate": 1.785187032418953e-05, + "loss": 0.4632, + "step": 8620 + }, + { + "epoch": 2.1521197007481296, + "grad_norm": 5.189255237579346, + "learning_rate": 1.7849376558603494e-05, + "loss": 0.5894, + "step": 8630 + }, + { + "epoch": 2.1546134663341645, + "grad_norm": 5.136035442352295, + "learning_rate": 1.7846882793017457e-05, + "loss": 0.4954, + "step": 8640 + }, + { + "epoch": 2.1571072319201994, + "grad_norm": 6.0098443031311035, + "learning_rate": 1.7844389027431424e-05, + "loss": 0.3998, + "step": 8650 + }, + { + "epoch": 2.1596009975062342, + "grad_norm": 6.5170979499816895, + "learning_rate": 1.7841895261845388e-05, + "loss": 0.4394, + "step": 8660 + }, + { + "epoch": 2.162094763092269, + "grad_norm": 7.620209217071533, + "learning_rate": 1.783940149625935e-05, + "loss": 0.4352, + "step": 8670 + }, + { + "epoch": 2.1645885286783044, + "grad_norm": 5.3294548988342285, + "learning_rate": 1.783690773067332e-05, + "loss": 0.4285, + "step": 8680 + }, + { + "epoch": 2.1670822942643393, + "grad_norm": 4.24174165725708, + "learning_rate": 1.7834413965087282e-05, + "loss": 0.4855, + "step": 8690 + }, + { + "epoch": 2.169576059850374, + "grad_norm": 4.900123119354248, + "learning_rate": 1.783192019950125e-05, + "loss": 0.5264, + "step": 8700 + }, + { + "epoch": 2.172069825436409, + "grad_norm": 5.917686939239502, + "learning_rate": 1.7829426433915213e-05, + "loss": 0.4571, + "step": 8710 + }, + { + "epoch": 2.174563591022444, + "grad_norm": 5.674629211425781, + "learning_rate": 1.7826932668329176e-05, + "loss": 0.44, + "step": 8720 + }, + { + "epoch": 2.1770573566084788, + "grad_norm": 6.6443986892700195, + "learning_rate": 1.7824438902743143e-05, + "loss": 0.3972, + "step": 8730 + }, + { + "epoch": 2.1795511221945136, + "grad_norm": 4.557736873626709, + "learning_rate": 1.7821945137157107e-05, + "loss": 0.4578, + "step": 8740 + }, + { + "epoch": 2.1820448877805485, + "grad_norm": 5.653411865234375, + "learning_rate": 1.7819451371571074e-05, + "loss": 0.4691, + "step": 8750 + }, + { + "epoch": 2.1845386533665834, + "grad_norm": 5.747959613800049, + "learning_rate": 1.7816957605985038e-05, + "loss": 0.4597, + "step": 8760 + }, + { + "epoch": 2.1870324189526187, + "grad_norm": 4.90444803237915, + "learning_rate": 1.7814463840399005e-05, + "loss": 0.4057, + "step": 8770 + }, + { + "epoch": 2.1895261845386536, + "grad_norm": 5.881532192230225, + "learning_rate": 1.7811970074812968e-05, + "loss": 0.4028, + "step": 8780 + }, + { + "epoch": 2.1920199501246884, + "grad_norm": 5.233574390411377, + "learning_rate": 1.7809476309226935e-05, + "loss": 0.3608, + "step": 8790 + }, + { + "epoch": 2.1945137157107233, + "grad_norm": 6.65619421005249, + "learning_rate": 1.78069825436409e-05, + "loss": 0.3862, + "step": 8800 + }, + { + "epoch": 2.197007481296758, + "grad_norm": 4.836643695831299, + "learning_rate": 1.7804488778054866e-05, + "loss": 0.4005, + "step": 8810 + }, + { + "epoch": 2.199501246882793, + "grad_norm": 5.234933376312256, + "learning_rate": 1.780199501246883e-05, + "loss": 0.4855, + "step": 8820 + }, + { + "epoch": 2.201995012468828, + "grad_norm": 4.512629985809326, + "learning_rate": 1.7799501246882796e-05, + "loss": 0.4217, + "step": 8830 + }, + { + "epoch": 2.2044887780548628, + "grad_norm": 7.3246541023254395, + "learning_rate": 1.779700748129676e-05, + "loss": 0.4968, + "step": 8840 + }, + { + "epoch": 2.2069825436408976, + "grad_norm": 5.89198112487793, + "learning_rate": 1.7794513715710724e-05, + "loss": 0.3842, + "step": 8850 + }, + { + "epoch": 2.2094763092269325, + "grad_norm": 5.893228530883789, + "learning_rate": 1.779201995012469e-05, + "loss": 0.3892, + "step": 8860 + }, + { + "epoch": 2.2119700748129674, + "grad_norm": 11.834216117858887, + "learning_rate": 1.7789526184538654e-05, + "loss": 0.4357, + "step": 8870 + }, + { + "epoch": 2.2144638403990027, + "grad_norm": 6.141737461090088, + "learning_rate": 1.7787032418952618e-05, + "loss": 0.3841, + "step": 8880 + }, + { + "epoch": 2.2169576059850375, + "grad_norm": 10.668390274047852, + "learning_rate": 1.7784538653366585e-05, + "loss": 0.4482, + "step": 8890 + }, + { + "epoch": 2.2194513715710724, + "grad_norm": 5.236949443817139, + "learning_rate": 1.778204488778055e-05, + "loss": 0.4517, + "step": 8900 + }, + { + "epoch": 2.2219451371571073, + "grad_norm": 6.584339618682861, + "learning_rate": 1.7779551122194516e-05, + "loss": 0.4163, + "step": 8910 + }, + { + "epoch": 2.224438902743142, + "grad_norm": 6.040124893188477, + "learning_rate": 1.777705735660848e-05, + "loss": 0.4095, + "step": 8920 + }, + { + "epoch": 2.226932668329177, + "grad_norm": 7.1335062980651855, + "learning_rate": 1.7774563591022446e-05, + "loss": 0.5042, + "step": 8930 + }, + { + "epoch": 2.229426433915212, + "grad_norm": 8.298208236694336, + "learning_rate": 1.777206982543641e-05, + "loss": 0.5001, + "step": 8940 + }, + { + "epoch": 2.2319201995012468, + "grad_norm": 4.262420177459717, + "learning_rate": 1.7769576059850377e-05, + "loss": 0.4444, + "step": 8950 + }, + { + "epoch": 2.2344139650872816, + "grad_norm": 3.8085227012634277, + "learning_rate": 1.776708229426434e-05, + "loss": 0.4736, + "step": 8960 + }, + { + "epoch": 2.236907730673317, + "grad_norm": 8.276735305786133, + "learning_rate": 1.7764588528678307e-05, + "loss": 0.4522, + "step": 8970 + }, + { + "epoch": 2.239401496259352, + "grad_norm": 6.127620220184326, + "learning_rate": 1.776209476309227e-05, + "loss": 0.3601, + "step": 8980 + }, + { + "epoch": 2.2418952618453867, + "grad_norm": 5.782129287719727, + "learning_rate": 1.7759600997506238e-05, + "loss": 0.4117, + "step": 8990 + }, + { + "epoch": 2.2443890274314215, + "grad_norm": 6.870619773864746, + "learning_rate": 1.77571072319202e-05, + "loss": 0.4816, + "step": 9000 + }, + { + "epoch": 2.2468827930174564, + "grad_norm": 3.876192331314087, + "learning_rate": 1.7754613466334165e-05, + "loss": 0.41, + "step": 9010 + }, + { + "epoch": 2.2493765586034913, + "grad_norm": 7.108855724334717, + "learning_rate": 1.7752119700748132e-05, + "loss": 0.4026, + "step": 9020 + }, + { + "epoch": 2.251870324189526, + "grad_norm": 5.940926551818848, + "learning_rate": 1.7749625935162096e-05, + "loss": 0.4307, + "step": 9030 + }, + { + "epoch": 2.254364089775561, + "grad_norm": 7.2069926261901855, + "learning_rate": 1.7747132169576063e-05, + "loss": 0.4925, + "step": 9040 + }, + { + "epoch": 2.256857855361596, + "grad_norm": 5.103030681610107, + "learning_rate": 1.7744638403990026e-05, + "loss": 0.4119, + "step": 9050 + }, + { + "epoch": 2.2593516209476308, + "grad_norm": 6.820674419403076, + "learning_rate": 1.774214463840399e-05, + "loss": 0.4423, + "step": 9060 + }, + { + "epoch": 2.2618453865336656, + "grad_norm": 5.859368801116943, + "learning_rate": 1.7739650872817957e-05, + "loss": 0.416, + "step": 9070 + }, + { + "epoch": 2.264339152119701, + "grad_norm": 3.5914974212646484, + "learning_rate": 1.773715710723192e-05, + "loss": 0.3593, + "step": 9080 + }, + { + "epoch": 2.266832917705736, + "grad_norm": 5.854361057281494, + "learning_rate": 1.7734663341645884e-05, + "loss": 0.3665, + "step": 9090 + }, + { + "epoch": 2.2693266832917707, + "grad_norm": 6.185330390930176, + "learning_rate": 1.773216957605985e-05, + "loss": 0.3968, + "step": 9100 + }, + { + "epoch": 2.2718204488778055, + "grad_norm": 9.060643196105957, + "learning_rate": 1.7729675810473815e-05, + "loss": 0.444, + "step": 9110 + }, + { + "epoch": 2.2743142144638404, + "grad_norm": 6.3430633544921875, + "learning_rate": 1.7727182044887782e-05, + "loss": 0.4807, + "step": 9120 + }, + { + "epoch": 2.2768079800498753, + "grad_norm": 6.883560657501221, + "learning_rate": 1.7724688279301746e-05, + "loss": 0.4042, + "step": 9130 + }, + { + "epoch": 2.27930174563591, + "grad_norm": 5.706015110015869, + "learning_rate": 1.7722194513715713e-05, + "loss": 0.4687, + "step": 9140 + }, + { + "epoch": 2.281795511221945, + "grad_norm": 6.107770919799805, + "learning_rate": 1.7719700748129676e-05, + "loss": 0.6099, + "step": 9150 + }, + { + "epoch": 2.28428927680798, + "grad_norm": 4.1198930740356445, + "learning_rate": 1.7717206982543643e-05, + "loss": 0.3808, + "step": 9160 + }, + { + "epoch": 2.286783042394015, + "grad_norm": 5.448758602142334, + "learning_rate": 1.7714713216957607e-05, + "loss": 0.3805, + "step": 9170 + }, + { + "epoch": 2.28927680798005, + "grad_norm": 4.109248638153076, + "learning_rate": 1.7712219451371574e-05, + "loss": 0.3963, + "step": 9180 + }, + { + "epoch": 2.291770573566085, + "grad_norm": 4.992825508117676, + "learning_rate": 1.7709725685785537e-05, + "loss": 0.3987, + "step": 9190 + }, + { + "epoch": 2.29426433915212, + "grad_norm": 4.800196647644043, + "learning_rate": 1.7707231920199504e-05, + "loss": 0.3984, + "step": 9200 + }, + { + "epoch": 2.2967581047381547, + "grad_norm": 7.4912896156311035, + "learning_rate": 1.7704738154613468e-05, + "loss": 0.462, + "step": 9210 + }, + { + "epoch": 2.2992518703241895, + "grad_norm": 7.3595991134643555, + "learning_rate": 1.770224438902743e-05, + "loss": 0.4066, + "step": 9220 + }, + { + "epoch": 2.3017456359102244, + "grad_norm": 8.947257995605469, + "learning_rate": 1.76997506234414e-05, + "loss": 0.4297, + "step": 9230 + }, + { + "epoch": 2.3042394014962593, + "grad_norm": 5.556568622589111, + "learning_rate": 1.7697256857855362e-05, + "loss": 0.4001, + "step": 9240 + }, + { + "epoch": 2.306733167082294, + "grad_norm": 5.276797294616699, + "learning_rate": 1.769476309226933e-05, + "loss": 0.4318, + "step": 9250 + }, + { + "epoch": 2.309226932668329, + "grad_norm": 7.36549711227417, + "learning_rate": 1.7692269326683293e-05, + "loss": 0.4791, + "step": 9260 + }, + { + "epoch": 2.311720698254364, + "grad_norm": 6.3212761878967285, + "learning_rate": 1.7689775561097257e-05, + "loss": 0.4505, + "step": 9270 + }, + { + "epoch": 2.314214463840399, + "grad_norm": 8.907316207885742, + "learning_rate": 1.7687281795511224e-05, + "loss": 0.5421, + "step": 9280 + }, + { + "epoch": 2.316708229426434, + "grad_norm": 5.004027843475342, + "learning_rate": 1.7684788029925187e-05, + "loss": 0.4232, + "step": 9290 + }, + { + "epoch": 2.319201995012469, + "grad_norm": 5.411210060119629, + "learning_rate": 1.7682294264339154e-05, + "loss": 0.374, + "step": 9300 + }, + { + "epoch": 2.321695760598504, + "grad_norm": 5.966613292694092, + "learning_rate": 1.7679800498753118e-05, + "loss": 0.4109, + "step": 9310 + }, + { + "epoch": 2.3241895261845387, + "grad_norm": 5.003303050994873, + "learning_rate": 1.7677306733167085e-05, + "loss": 0.4241, + "step": 9320 + }, + { + "epoch": 2.3266832917705735, + "grad_norm": 5.208148002624512, + "learning_rate": 1.767481296758105e-05, + "loss": 0.4793, + "step": 9330 + }, + { + "epoch": 2.3291770573566084, + "grad_norm": 8.17491626739502, + "learning_rate": 1.7672319201995015e-05, + "loss": 0.4423, + "step": 9340 + }, + { + "epoch": 2.3316708229426433, + "grad_norm": 8.450447082519531, + "learning_rate": 1.766982543640898e-05, + "loss": 0.4155, + "step": 9350 + }, + { + "epoch": 2.334164588528678, + "grad_norm": 6.473998546600342, + "learning_rate": 1.7667331670822946e-05, + "loss": 0.425, + "step": 9360 + }, + { + "epoch": 2.3366583541147135, + "grad_norm": 8.660719871520996, + "learning_rate": 1.766483790523691e-05, + "loss": 0.4596, + "step": 9370 + }, + { + "epoch": 2.3391521197007483, + "grad_norm": 5.478455543518066, + "learning_rate": 1.7662344139650873e-05, + "loss": 0.3841, + "step": 9380 + }, + { + "epoch": 2.341645885286783, + "grad_norm": 5.475340843200684, + "learning_rate": 1.765985037406484e-05, + "loss": 0.4179, + "step": 9390 + }, + { + "epoch": 2.344139650872818, + "grad_norm": 4.660368919372559, + "learning_rate": 1.7657356608478804e-05, + "loss": 0.4493, + "step": 9400 + }, + { + "epoch": 2.346633416458853, + "grad_norm": 3.002950668334961, + "learning_rate": 1.765486284289277e-05, + "loss": 0.3333, + "step": 9410 + }, + { + "epoch": 2.349127182044888, + "grad_norm": 7.417391777038574, + "learning_rate": 1.7652369077306734e-05, + "loss": 0.3882, + "step": 9420 + }, + { + "epoch": 2.3516209476309227, + "grad_norm": 5.637429714202881, + "learning_rate": 1.7649875311720698e-05, + "loss": 0.3217, + "step": 9430 + }, + { + "epoch": 2.3541147132169575, + "grad_norm": 9.37643814086914, + "learning_rate": 1.7647381546134665e-05, + "loss": 0.423, + "step": 9440 + }, + { + "epoch": 2.3566084788029924, + "grad_norm": 4.772799968719482, + "learning_rate": 1.764488778054863e-05, + "loss": 0.4789, + "step": 9450 + }, + { + "epoch": 2.3591022443890273, + "grad_norm": 6.806458950042725, + "learning_rate": 1.7642394014962592e-05, + "loss": 0.4245, + "step": 9460 + }, + { + "epoch": 2.361596009975062, + "grad_norm": 6.0944647789001465, + "learning_rate": 1.763990024937656e-05, + "loss": 0.4306, + "step": 9470 + }, + { + "epoch": 2.3640897755610975, + "grad_norm": 4.526205062866211, + "learning_rate": 1.7637406483790523e-05, + "loss": 0.3647, + "step": 9480 + }, + { + "epoch": 2.3665835411471323, + "grad_norm": 10.576577186584473, + "learning_rate": 1.763491271820449e-05, + "loss": 0.4645, + "step": 9490 + }, + { + "epoch": 2.369077306733167, + "grad_norm": 8.364253044128418, + "learning_rate": 1.7632418952618457e-05, + "loss": 0.4524, + "step": 9500 + }, + { + "epoch": 2.371571072319202, + "grad_norm": 6.188162326812744, + "learning_rate": 1.762992518703242e-05, + "loss": 0.446, + "step": 9510 + }, + { + "epoch": 2.374064837905237, + "grad_norm": 5.653889179229736, + "learning_rate": 1.7627431421446388e-05, + "loss": 0.3859, + "step": 9520 + }, + { + "epoch": 2.376558603491272, + "grad_norm": 7.959500789642334, + "learning_rate": 1.762493765586035e-05, + "loss": 0.4085, + "step": 9530 + }, + { + "epoch": 2.3790523690773067, + "grad_norm": 7.066885471343994, + "learning_rate": 1.7622443890274318e-05, + "loss": 0.4195, + "step": 9540 + }, + { + "epoch": 2.3815461346633415, + "grad_norm": 4.418666839599609, + "learning_rate": 1.7619950124688282e-05, + "loss": 0.3748, + "step": 9550 + }, + { + "epoch": 2.3840399002493764, + "grad_norm": 7.192332744598389, + "learning_rate": 1.7617456359102245e-05, + "loss": 0.4545, + "step": 9560 + }, + { + "epoch": 2.3865336658354117, + "grad_norm": 6.895399570465088, + "learning_rate": 1.7614962593516212e-05, + "loss": 0.4023, + "step": 9570 + }, + { + "epoch": 2.3890274314214466, + "grad_norm": 5.375655174255371, + "learning_rate": 1.7612468827930176e-05, + "loss": 0.4108, + "step": 9580 + }, + { + "epoch": 2.3915211970074814, + "grad_norm": 5.821678161621094, + "learning_rate": 1.760997506234414e-05, + "loss": 0.4258, + "step": 9590 + }, + { + "epoch": 2.3940149625935163, + "grad_norm": 7.174010276794434, + "learning_rate": 1.7607481296758107e-05, + "loss": 0.4199, + "step": 9600 + }, + { + "epoch": 2.396508728179551, + "grad_norm": 4.9364519119262695, + "learning_rate": 1.760498753117207e-05, + "loss": 0.4044, + "step": 9610 + }, + { + "epoch": 2.399002493765586, + "grad_norm": 6.391279220581055, + "learning_rate": 1.7602493765586037e-05, + "loss": 0.4368, + "step": 9620 + }, + { + "epoch": 2.401496259351621, + "grad_norm": 4.990833282470703, + "learning_rate": 1.76e-05, + "loss": 0.3174, + "step": 9630 + }, + { + "epoch": 2.403990024937656, + "grad_norm": 4.146638870239258, + "learning_rate": 1.7597506234413965e-05, + "loss": 0.3995, + "step": 9640 + }, + { + "epoch": 2.4064837905236907, + "grad_norm": 4.520384311676025, + "learning_rate": 1.759501246882793e-05, + "loss": 0.3627, + "step": 9650 + }, + { + "epoch": 2.4089775561097255, + "grad_norm": 5.92678165435791, + "learning_rate": 1.7592518703241895e-05, + "loss": 0.3737, + "step": 9660 + }, + { + "epoch": 2.4114713216957604, + "grad_norm": 8.433207511901855, + "learning_rate": 1.7590024937655862e-05, + "loss": 0.5038, + "step": 9670 + }, + { + "epoch": 2.4139650872817953, + "grad_norm": 6.0681986808776855, + "learning_rate": 1.7587531172069826e-05, + "loss": 0.4544, + "step": 9680 + }, + { + "epoch": 2.4164588528678306, + "grad_norm": 6.954349040985107, + "learning_rate": 1.7585037406483793e-05, + "loss": 0.4765, + "step": 9690 + }, + { + "epoch": 2.4189526184538654, + "grad_norm": 5.360884666442871, + "learning_rate": 1.7582543640897756e-05, + "loss": 0.4173, + "step": 9700 + }, + { + "epoch": 2.4214463840399003, + "grad_norm": 5.455905437469482, + "learning_rate": 1.7580049875311723e-05, + "loss": 0.3981, + "step": 9710 + }, + { + "epoch": 2.423940149625935, + "grad_norm": 7.694284439086914, + "learning_rate": 1.7577556109725687e-05, + "loss": 0.4159, + "step": 9720 + }, + { + "epoch": 2.42643391521197, + "grad_norm": 2.6241650581359863, + "learning_rate": 1.7575062344139654e-05, + "loss": 0.4415, + "step": 9730 + }, + { + "epoch": 2.428927680798005, + "grad_norm": 5.173274040222168, + "learning_rate": 1.7572568578553618e-05, + "loss": 0.3576, + "step": 9740 + }, + { + "epoch": 2.43142144638404, + "grad_norm": 5.52868127822876, + "learning_rate": 1.7570074812967585e-05, + "loss": 0.4224, + "step": 9750 + }, + { + "epoch": 2.4339152119700747, + "grad_norm": 6.820600509643555, + "learning_rate": 1.7567581047381548e-05, + "loss": 0.4329, + "step": 9760 + }, + { + "epoch": 2.43640897755611, + "grad_norm": 5.07393741607666, + "learning_rate": 1.7565087281795512e-05, + "loss": 0.4344, + "step": 9770 + }, + { + "epoch": 2.438902743142145, + "grad_norm": 6.3466572761535645, + "learning_rate": 1.756259351620948e-05, + "loss": 0.3851, + "step": 9780 + }, + { + "epoch": 2.4413965087281797, + "grad_norm": 4.636044025421143, + "learning_rate": 1.7560099750623442e-05, + "loss": 0.4259, + "step": 9790 + }, + { + "epoch": 2.4438902743142146, + "grad_norm": 4.414524078369141, + "learning_rate": 1.7557605985037406e-05, + "loss": 0.4189, + "step": 9800 + }, + { + "epoch": 2.4463840399002494, + "grad_norm": 5.910994529724121, + "learning_rate": 1.7555112219451373e-05, + "loss": 0.3701, + "step": 9810 + }, + { + "epoch": 2.4488778054862843, + "grad_norm": 5.326263427734375, + "learning_rate": 1.7552618453865337e-05, + "loss": 0.3784, + "step": 9820 + }, + { + "epoch": 2.451371571072319, + "grad_norm": 4.930222511291504, + "learning_rate": 1.7550124688279304e-05, + "loss": 0.4245, + "step": 9830 + }, + { + "epoch": 2.453865336658354, + "grad_norm": 14.357271194458008, + "learning_rate": 1.7547630922693267e-05, + "loss": 0.5032, + "step": 9840 + }, + { + "epoch": 2.456359102244389, + "grad_norm": 6.140951633453369, + "learning_rate": 1.7545137157107234e-05, + "loss": 0.3132, + "step": 9850 + }, + { + "epoch": 2.458852867830424, + "grad_norm": 6.1243743896484375, + "learning_rate": 1.7542643391521198e-05, + "loss": 0.4094, + "step": 9860 + }, + { + "epoch": 2.4613466334164587, + "grad_norm": 5.133067607879639, + "learning_rate": 1.7540149625935165e-05, + "loss": 0.4155, + "step": 9870 + }, + { + "epoch": 2.4638403990024935, + "grad_norm": 5.039419174194336, + "learning_rate": 1.753765586034913e-05, + "loss": 0.3812, + "step": 9880 + }, + { + "epoch": 2.466334164588529, + "grad_norm": 4.443388938903809, + "learning_rate": 1.7535162094763096e-05, + "loss": 0.3958, + "step": 9890 + }, + { + "epoch": 2.4688279301745637, + "grad_norm": 4.9401140213012695, + "learning_rate": 1.753266832917706e-05, + "loss": 0.3768, + "step": 9900 + }, + { + "epoch": 2.4713216957605986, + "grad_norm": 4.5980916023254395, + "learning_rate": 1.7530174563591026e-05, + "loss": 0.3657, + "step": 9910 + }, + { + "epoch": 2.4738154613466334, + "grad_norm": 4.5691423416137695, + "learning_rate": 1.752768079800499e-05, + "loss": 0.5407, + "step": 9920 + }, + { + "epoch": 2.4763092269326683, + "grad_norm": 10.96796989440918, + "learning_rate": 1.7525187032418953e-05, + "loss": 0.3909, + "step": 9930 + }, + { + "epoch": 2.478802992518703, + "grad_norm": 8.20064640045166, + "learning_rate": 1.752269326683292e-05, + "loss": 0.3977, + "step": 9940 + }, + { + "epoch": 2.481296758104738, + "grad_norm": 6.133816242218018, + "learning_rate": 1.7520199501246884e-05, + "loss": 0.4741, + "step": 9950 + }, + { + "epoch": 2.483790523690773, + "grad_norm": 3.481630325317383, + "learning_rate": 1.751770573566085e-05, + "loss": 0.359, + "step": 9960 + }, + { + "epoch": 2.4862842892768082, + "grad_norm": 6.158251762390137, + "learning_rate": 1.7515211970074815e-05, + "loss": 0.43, + "step": 9970 + }, + { + "epoch": 2.488778054862843, + "grad_norm": 6.90919828414917, + "learning_rate": 1.7512718204488778e-05, + "loss": 0.4154, + "step": 9980 + }, + { + "epoch": 2.491271820448878, + "grad_norm": 6.205567836761475, + "learning_rate": 1.7510224438902745e-05, + "loss": 0.4585, + "step": 9990 + }, + { + "epoch": 2.493765586034913, + "grad_norm": 8.581686973571777, + "learning_rate": 1.750773067331671e-05, + "loss": 0.4748, + "step": 10000 + }, + { + "epoch": 2.4962593516209477, + "grad_norm": 6.189614295959473, + "learning_rate": 1.7505236907730673e-05, + "loss": 0.4728, + "step": 10010 + }, + { + "epoch": 2.4987531172069826, + "grad_norm": 6.721256732940674, + "learning_rate": 1.750274314214464e-05, + "loss": 0.4508, + "step": 10020 + }, + { + "epoch": 2.5012468827930174, + "grad_norm": 7.564716815948486, + "learning_rate": 1.7500249376558603e-05, + "loss": 0.4478, + "step": 10030 + }, + { + "epoch": 2.5037406483790523, + "grad_norm": 6.12339973449707, + "learning_rate": 1.749775561097257e-05, + "loss": 0.3435, + "step": 10040 + }, + { + "epoch": 2.506234413965087, + "grad_norm": 6.196720123291016, + "learning_rate": 1.7495261845386534e-05, + "loss": 0.4308, + "step": 10050 + }, + { + "epoch": 2.508728179551122, + "grad_norm": 4.724903106689453, + "learning_rate": 1.74927680798005e-05, + "loss": 0.4072, + "step": 10060 + }, + { + "epoch": 2.511221945137157, + "grad_norm": 5.753084659576416, + "learning_rate": 1.7490274314214464e-05, + "loss": 0.3926, + "step": 10070 + }, + { + "epoch": 2.5137157107231918, + "grad_norm": 4.251994609832764, + "learning_rate": 1.748778054862843e-05, + "loss": 0.4337, + "step": 10080 + }, + { + "epoch": 2.516209476309227, + "grad_norm": 5.593048095703125, + "learning_rate": 1.7485286783042395e-05, + "loss": 0.3766, + "step": 10090 + }, + { + "epoch": 2.518703241895262, + "grad_norm": 3.456326723098755, + "learning_rate": 1.7482793017456362e-05, + "loss": 0.416, + "step": 10100 + }, + { + "epoch": 2.521197007481297, + "grad_norm": 6.803287982940674, + "learning_rate": 1.7480299251870326e-05, + "loss": 0.4372, + "step": 10110 + }, + { + "epoch": 2.5236907730673317, + "grad_norm": 6.90035343170166, + "learning_rate": 1.7477805486284293e-05, + "loss": 0.3842, + "step": 10120 + }, + { + "epoch": 2.5261845386533666, + "grad_norm": 7.092517375946045, + "learning_rate": 1.7475311720698256e-05, + "loss": 0.4817, + "step": 10130 + }, + { + "epoch": 2.5286783042394014, + "grad_norm": 7.831302642822266, + "learning_rate": 1.747281795511222e-05, + "loss": 0.3565, + "step": 10140 + }, + { + "epoch": 2.5311720698254363, + "grad_norm": 5.484219551086426, + "learning_rate": 1.7470324189526187e-05, + "loss": 0.4546, + "step": 10150 + }, + { + "epoch": 2.533665835411471, + "grad_norm": 9.50973129272461, + "learning_rate": 1.746783042394015e-05, + "loss": 0.3583, + "step": 10160 + }, + { + "epoch": 2.5361596009975065, + "grad_norm": 6.531426429748535, + "learning_rate": 1.7465336658354114e-05, + "loss": 0.3899, + "step": 10170 + }, + { + "epoch": 2.5386533665835413, + "grad_norm": 6.3029632568359375, + "learning_rate": 1.746284289276808e-05, + "loss": 0.4156, + "step": 10180 + }, + { + "epoch": 2.541147132169576, + "grad_norm": 6.3402252197265625, + "learning_rate": 1.7460349127182045e-05, + "loss": 0.5982, + "step": 10190 + }, + { + "epoch": 2.543640897755611, + "grad_norm": 6.966554164886475, + "learning_rate": 1.745785536159601e-05, + "loss": 0.4151, + "step": 10200 + }, + { + "epoch": 2.546134663341646, + "grad_norm": 4.213450908660889, + "learning_rate": 1.7455361596009975e-05, + "loss": 0.4014, + "step": 10210 + }, + { + "epoch": 2.548628428927681, + "grad_norm": 6.965415000915527, + "learning_rate": 1.7452867830423942e-05, + "loss": 0.5053, + "step": 10220 + }, + { + "epoch": 2.5511221945137157, + "grad_norm": 7.399175643920898, + "learning_rate": 1.7450374064837906e-05, + "loss": 0.4794, + "step": 10230 + }, + { + "epoch": 2.5536159600997506, + "grad_norm": 7.18726921081543, + "learning_rate": 1.7447880299251873e-05, + "loss": 0.4237, + "step": 10240 + }, + { + "epoch": 2.5561097256857854, + "grad_norm": 5.636864185333252, + "learning_rate": 1.7445386533665837e-05, + "loss": 0.4104, + "step": 10250 + }, + { + "epoch": 2.5586034912718203, + "grad_norm": 6.8820977210998535, + "learning_rate": 1.7442892768079804e-05, + "loss": 0.4435, + "step": 10260 + }, + { + "epoch": 2.561097256857855, + "grad_norm": 9.02462100982666, + "learning_rate": 1.7440399002493767e-05, + "loss": 0.4672, + "step": 10270 + }, + { + "epoch": 2.56359102244389, + "grad_norm": 5.171300411224365, + "learning_rate": 1.7437905236907734e-05, + "loss": 0.4258, + "step": 10280 + }, + { + "epoch": 2.5660847880299253, + "grad_norm": 11.12934684753418, + "learning_rate": 1.7435411471321698e-05, + "loss": 0.3967, + "step": 10290 + }, + { + "epoch": 2.56857855361596, + "grad_norm": 6.156098365783691, + "learning_rate": 1.743291770573566e-05, + "loss": 0.3973, + "step": 10300 + }, + { + "epoch": 2.571072319201995, + "grad_norm": 4.736686706542969, + "learning_rate": 1.743042394014963e-05, + "loss": 0.462, + "step": 10310 + }, + { + "epoch": 2.57356608478803, + "grad_norm": 8.570772171020508, + "learning_rate": 1.7427930174563592e-05, + "loss": 0.4494, + "step": 10320 + }, + { + "epoch": 2.576059850374065, + "grad_norm": 7.194964408874512, + "learning_rate": 1.742543640897756e-05, + "loss": 0.4351, + "step": 10330 + }, + { + "epoch": 2.5785536159600997, + "grad_norm": 4.6874918937683105, + "learning_rate": 1.7422942643391523e-05, + "loss": 0.4341, + "step": 10340 + }, + { + "epoch": 2.5810473815461346, + "grad_norm": 6.765628337860107, + "learning_rate": 1.7420448877805486e-05, + "loss": 0.4364, + "step": 10350 + }, + { + "epoch": 2.5835411471321694, + "grad_norm": 6.419005870819092, + "learning_rate": 1.7417955112219453e-05, + "loss": 0.4402, + "step": 10360 + }, + { + "epoch": 2.5860349127182047, + "grad_norm": 6.456396102905273, + "learning_rate": 1.7415461346633417e-05, + "loss": 0.4289, + "step": 10370 + }, + { + "epoch": 2.5885286783042396, + "grad_norm": 6.123867034912109, + "learning_rate": 1.741296758104738e-05, + "loss": 0.4403, + "step": 10380 + }, + { + "epoch": 2.5910224438902745, + "grad_norm": 5.706097602844238, + "learning_rate": 1.7410473815461347e-05, + "loss": 0.4294, + "step": 10390 + }, + { + "epoch": 2.5935162094763093, + "grad_norm": 6.1752543449401855, + "learning_rate": 1.740798004987531e-05, + "loss": 0.3477, + "step": 10400 + }, + { + "epoch": 2.596009975062344, + "grad_norm": 5.515731334686279, + "learning_rate": 1.7405486284289278e-05, + "loss": 0.4414, + "step": 10410 + }, + { + "epoch": 2.598503740648379, + "grad_norm": 6.281205654144287, + "learning_rate": 1.7402992518703242e-05, + "loss": 0.4038, + "step": 10420 + }, + { + "epoch": 2.600997506234414, + "grad_norm": 5.598005294799805, + "learning_rate": 1.740049875311721e-05, + "loss": 0.3969, + "step": 10430 + }, + { + "epoch": 2.603491271820449, + "grad_norm": 4.307186126708984, + "learning_rate": 1.7398004987531176e-05, + "loss": 0.4112, + "step": 10440 + }, + { + "epoch": 2.6059850374064837, + "grad_norm": 5.855963230133057, + "learning_rate": 1.739551122194514e-05, + "loss": 0.4137, + "step": 10450 + }, + { + "epoch": 2.6084788029925186, + "grad_norm": 6.094501972198486, + "learning_rate": 1.7393017456359106e-05, + "loss": 0.4384, + "step": 10460 + }, + { + "epoch": 2.6109725685785534, + "grad_norm": 7.285081386566162, + "learning_rate": 1.739052369077307e-05, + "loss": 0.4306, + "step": 10470 + }, + { + "epoch": 2.6134663341645883, + "grad_norm": 7.074899196624756, + "learning_rate": 1.7388029925187034e-05, + "loss": 0.4767, + "step": 10480 + }, + { + "epoch": 2.6159600997506236, + "grad_norm": 6.743325710296631, + "learning_rate": 1.7385536159601e-05, + "loss": 0.4486, + "step": 10490 + }, + { + "epoch": 2.6184538653366585, + "grad_norm": 5.235808372497559, + "learning_rate": 1.7383042394014964e-05, + "loss": 0.4621, + "step": 10500 + }, + { + "epoch": 2.6209476309226933, + "grad_norm": 5.081497669219971, + "learning_rate": 1.7380548628428928e-05, + "loss": 0.4273, + "step": 10510 + }, + { + "epoch": 2.623441396508728, + "grad_norm": 5.6099443435668945, + "learning_rate": 1.7378054862842895e-05, + "loss": 0.4259, + "step": 10520 + }, + { + "epoch": 2.625935162094763, + "grad_norm": 6.103354454040527, + "learning_rate": 1.737556109725686e-05, + "loss": 0.4568, + "step": 10530 + }, + { + "epoch": 2.628428927680798, + "grad_norm": 4.791126728057861, + "learning_rate": 1.7373067331670825e-05, + "loss": 0.4052, + "step": 10540 + }, + { + "epoch": 2.630922693266833, + "grad_norm": 4.903756618499756, + "learning_rate": 1.737057356608479e-05, + "loss": 0.4314, + "step": 10550 + }, + { + "epoch": 2.6334164588528677, + "grad_norm": 5.318016052246094, + "learning_rate": 1.7368079800498753e-05, + "loss": 0.4311, + "step": 10560 + }, + { + "epoch": 2.635910224438903, + "grad_norm": 10.588895797729492, + "learning_rate": 1.736558603491272e-05, + "loss": 0.4499, + "step": 10570 + }, + { + "epoch": 2.638403990024938, + "grad_norm": 4.9873833656311035, + "learning_rate": 1.7363092269326683e-05, + "loss": 0.4632, + "step": 10580 + }, + { + "epoch": 2.6408977556109727, + "grad_norm": 7.042355537414551, + "learning_rate": 1.736059850374065e-05, + "loss": 0.3997, + "step": 10590 + }, + { + "epoch": 2.6433915211970076, + "grad_norm": 6.466985702514648, + "learning_rate": 1.7358104738154614e-05, + "loss": 0.4844, + "step": 10600 + }, + { + "epoch": 2.6458852867830425, + "grad_norm": 4.656626224517822, + "learning_rate": 1.735561097256858e-05, + "loss": 0.3745, + "step": 10610 + }, + { + "epoch": 2.6483790523690773, + "grad_norm": 9.50407886505127, + "learning_rate": 1.7353117206982545e-05, + "loss": 0.5028, + "step": 10620 + }, + { + "epoch": 2.650872817955112, + "grad_norm": 5.126711845397949, + "learning_rate": 1.735062344139651e-05, + "loss": 0.4899, + "step": 10630 + }, + { + "epoch": 2.653366583541147, + "grad_norm": 10.905644416809082, + "learning_rate": 1.7348129675810475e-05, + "loss": 0.4768, + "step": 10640 + }, + { + "epoch": 2.655860349127182, + "grad_norm": 5.238419532775879, + "learning_rate": 1.7345635910224442e-05, + "loss": 0.3905, + "step": 10650 + }, + { + "epoch": 2.658354114713217, + "grad_norm": 5.836606979370117, + "learning_rate": 1.7343142144638406e-05, + "loss": 0.4586, + "step": 10660 + }, + { + "epoch": 2.6608478802992517, + "grad_norm": 6.579746723175049, + "learning_rate": 1.734064837905237e-05, + "loss": 0.5069, + "step": 10670 + }, + { + "epoch": 2.6633416458852865, + "grad_norm": 4.4686279296875, + "learning_rate": 1.7338154613466336e-05, + "loss": 0.3852, + "step": 10680 + }, + { + "epoch": 2.665835411471322, + "grad_norm": 6.548900604248047, + "learning_rate": 1.73356608478803e-05, + "loss": 0.3776, + "step": 10690 + }, + { + "epoch": 2.6683291770573567, + "grad_norm": 5.394494533538818, + "learning_rate": 1.7333167082294267e-05, + "loss": 0.3714, + "step": 10700 + }, + { + "epoch": 2.6708229426433916, + "grad_norm": 4.6077141761779785, + "learning_rate": 1.733067331670823e-05, + "loss": 0.3991, + "step": 10710 + }, + { + "epoch": 2.6733167082294265, + "grad_norm": 9.343939781188965, + "learning_rate": 1.7328179551122194e-05, + "loss": 0.3998, + "step": 10720 + }, + { + "epoch": 2.6758104738154613, + "grad_norm": 5.4641032218933105, + "learning_rate": 1.732568578553616e-05, + "loss": 0.3978, + "step": 10730 + }, + { + "epoch": 2.678304239401496, + "grad_norm": 6.154629230499268, + "learning_rate": 1.7323192019950125e-05, + "loss": 0.3814, + "step": 10740 + }, + { + "epoch": 2.680798004987531, + "grad_norm": 8.188713073730469, + "learning_rate": 1.7320698254364092e-05, + "loss": 0.454, + "step": 10750 + }, + { + "epoch": 2.683291770573566, + "grad_norm": 6.351066589355469, + "learning_rate": 1.7318204488778055e-05, + "loss": 0.4385, + "step": 10760 + }, + { + "epoch": 2.6857855361596013, + "grad_norm": 5.4914069175720215, + "learning_rate": 1.731571072319202e-05, + "loss": 0.4687, + "step": 10770 + }, + { + "epoch": 2.688279301745636, + "grad_norm": 10.653285026550293, + "learning_rate": 1.7313216957605986e-05, + "loss": 0.3617, + "step": 10780 + }, + { + "epoch": 2.690773067331671, + "grad_norm": 5.647527694702148, + "learning_rate": 1.7310723192019953e-05, + "loss": 0.4058, + "step": 10790 + }, + { + "epoch": 2.693266832917706, + "grad_norm": 8.179327964782715, + "learning_rate": 1.7308229426433917e-05, + "loss": 0.4148, + "step": 10800 + }, + { + "epoch": 2.6957605985037407, + "grad_norm": 4.55966854095459, + "learning_rate": 1.7305735660847884e-05, + "loss": 0.3592, + "step": 10810 + }, + { + "epoch": 2.6982543640897756, + "grad_norm": 7.562872409820557, + "learning_rate": 1.7303241895261847e-05, + "loss": 0.3956, + "step": 10820 + }, + { + "epoch": 2.7007481296758105, + "grad_norm": 12.162755012512207, + "learning_rate": 1.7300748129675814e-05, + "loss": 0.3981, + "step": 10830 + }, + { + "epoch": 2.7032418952618453, + "grad_norm": 9.855378150939941, + "learning_rate": 1.7298254364089778e-05, + "loss": 0.4338, + "step": 10840 + }, + { + "epoch": 2.70573566084788, + "grad_norm": 4.713982582092285, + "learning_rate": 1.729576059850374e-05, + "loss": 0.3565, + "step": 10850 + }, + { + "epoch": 2.708229426433915, + "grad_norm": 6.571569919586182, + "learning_rate": 1.729326683291771e-05, + "loss": 0.3863, + "step": 10860 + }, + { + "epoch": 2.71072319201995, + "grad_norm": 4.679405212402344, + "learning_rate": 1.7290773067331672e-05, + "loss": 0.3811, + "step": 10870 + }, + { + "epoch": 2.713216957605985, + "grad_norm": 6.7202982902526855, + "learning_rate": 1.7288279301745636e-05, + "loss": 0.3924, + "step": 10880 + }, + { + "epoch": 2.71571072319202, + "grad_norm": 6.362400531768799, + "learning_rate": 1.7285785536159603e-05, + "loss": 0.3774, + "step": 10890 + }, + { + "epoch": 2.718204488778055, + "grad_norm": 5.451074600219727, + "learning_rate": 1.7283291770573566e-05, + "loss": 0.391, + "step": 10900 + }, + { + "epoch": 2.72069825436409, + "grad_norm": 4.991129398345947, + "learning_rate": 1.7280798004987533e-05, + "loss": 0.4007, + "step": 10910 + }, + { + "epoch": 2.7231920199501247, + "grad_norm": 6.635371208190918, + "learning_rate": 1.7278304239401497e-05, + "loss": 0.4801, + "step": 10920 + }, + { + "epoch": 2.7256857855361596, + "grad_norm": 7.7395501136779785, + "learning_rate": 1.727581047381546e-05, + "loss": 0.4322, + "step": 10930 + }, + { + "epoch": 2.7281795511221945, + "grad_norm": 13.381205558776855, + "learning_rate": 1.7273316708229428e-05, + "loss": 0.468, + "step": 10940 + }, + { + "epoch": 2.7306733167082293, + "grad_norm": 7.249978542327881, + "learning_rate": 1.727082294264339e-05, + "loss": 0.4083, + "step": 10950 + }, + { + "epoch": 2.733167082294264, + "grad_norm": 7.9330573081970215, + "learning_rate": 1.7268329177057358e-05, + "loss": 0.4786, + "step": 10960 + }, + { + "epoch": 2.7356608478802995, + "grad_norm": 4.235700607299805, + "learning_rate": 1.7265835411471322e-05, + "loss": 0.4408, + "step": 10970 + }, + { + "epoch": 2.7381546134663344, + "grad_norm": 10.445588111877441, + "learning_rate": 1.726334164588529e-05, + "loss": 0.4088, + "step": 10980 + }, + { + "epoch": 2.7406483790523692, + "grad_norm": 4.6287031173706055, + "learning_rate": 1.7260847880299253e-05, + "loss": 0.4159, + "step": 10990 + }, + { + "epoch": 2.743142144638404, + "grad_norm": 6.074006080627441, + "learning_rate": 1.725835411471322e-05, + "loss": 0.4362, + "step": 11000 + }, + { + "epoch": 2.745635910224439, + "grad_norm": 6.392845153808594, + "learning_rate": 1.7255860349127183e-05, + "loss": 0.4406, + "step": 11010 + }, + { + "epoch": 2.748129675810474, + "grad_norm": 4.722134590148926, + "learning_rate": 1.725336658354115e-05, + "loss": 0.4392, + "step": 11020 + }, + { + "epoch": 2.7506234413965087, + "grad_norm": 5.2035698890686035, + "learning_rate": 1.7250872817955114e-05, + "loss": 0.4408, + "step": 11030 + }, + { + "epoch": 2.7531172069825436, + "grad_norm": 5.009521007537842, + "learning_rate": 1.724837905236908e-05, + "loss": 0.3727, + "step": 11040 + }, + { + "epoch": 2.7556109725685785, + "grad_norm": 8.132833480834961, + "learning_rate": 1.7245885286783044e-05, + "loss": 0.4183, + "step": 11050 + }, + { + "epoch": 2.7581047381546133, + "grad_norm": 4.388547897338867, + "learning_rate": 1.7243391521197008e-05, + "loss": 0.3871, + "step": 11060 + }, + { + "epoch": 2.760598503740648, + "grad_norm": 5.839278221130371, + "learning_rate": 1.7240897755610975e-05, + "loss": 0.3665, + "step": 11070 + }, + { + "epoch": 2.763092269326683, + "grad_norm": 6.53806734085083, + "learning_rate": 1.723840399002494e-05, + "loss": 0.4331, + "step": 11080 + }, + { + "epoch": 2.765586034912718, + "grad_norm": 9.082895278930664, + "learning_rate": 1.7235910224438902e-05, + "loss": 0.379, + "step": 11090 + }, + { + "epoch": 2.7680798004987532, + "grad_norm": 4.956233978271484, + "learning_rate": 1.723341645885287e-05, + "loss": 0.4789, + "step": 11100 + }, + { + "epoch": 2.770573566084788, + "grad_norm": 4.803463935852051, + "learning_rate": 1.7230922693266833e-05, + "loss": 0.4266, + "step": 11110 + }, + { + "epoch": 2.773067331670823, + "grad_norm": 5.308382511138916, + "learning_rate": 1.72284289276808e-05, + "loss": 0.4347, + "step": 11120 + }, + { + "epoch": 2.775561097256858, + "grad_norm": 4.9697418212890625, + "learning_rate": 1.7225935162094763e-05, + "loss": 0.4041, + "step": 11130 + }, + { + "epoch": 2.7780548628428927, + "grad_norm": 5.6570281982421875, + "learning_rate": 1.722344139650873e-05, + "loss": 0.4769, + "step": 11140 + }, + { + "epoch": 2.7805486284289276, + "grad_norm": 5.522329330444336, + "learning_rate": 1.7220947630922694e-05, + "loss": 0.4414, + "step": 11150 + }, + { + "epoch": 2.7830423940149625, + "grad_norm": 6.803591728210449, + "learning_rate": 1.721845386533666e-05, + "loss": 0.4085, + "step": 11160 + }, + { + "epoch": 2.7855361596009978, + "grad_norm": 13.538893699645996, + "learning_rate": 1.7215960099750625e-05, + "loss": 0.4206, + "step": 11170 + }, + { + "epoch": 2.7880299251870326, + "grad_norm": 5.024692535400391, + "learning_rate": 1.7213466334164592e-05, + "loss": 0.4413, + "step": 11180 + }, + { + "epoch": 2.7905236907730675, + "grad_norm": 6.203422546386719, + "learning_rate": 1.7210972568578555e-05, + "loss": 0.3727, + "step": 11190 + }, + { + "epoch": 2.7930174563591024, + "grad_norm": 4.57298469543457, + "learning_rate": 1.7208478802992522e-05, + "loss": 0.393, + "step": 11200 + }, + { + "epoch": 2.7955112219451372, + "grad_norm": 5.918140888214111, + "learning_rate": 1.7205985037406486e-05, + "loss": 0.3846, + "step": 11210 + }, + { + "epoch": 2.798004987531172, + "grad_norm": 3.546912670135498, + "learning_rate": 1.720349127182045e-05, + "loss": 0.4229, + "step": 11220 + }, + { + "epoch": 2.800498753117207, + "grad_norm": 4.094278335571289, + "learning_rate": 1.7200997506234417e-05, + "loss": 0.3498, + "step": 11230 + }, + { + "epoch": 2.802992518703242, + "grad_norm": 5.546658039093018, + "learning_rate": 1.719850374064838e-05, + "loss": 0.4516, + "step": 11240 + }, + { + "epoch": 2.8054862842892767, + "grad_norm": 3.7501320838928223, + "learning_rate": 1.7196009975062347e-05, + "loss": 0.366, + "step": 11250 + }, + { + "epoch": 2.8079800498753116, + "grad_norm": 7.085123062133789, + "learning_rate": 1.719351620947631e-05, + "loss": 0.3878, + "step": 11260 + }, + { + "epoch": 2.8104738154613464, + "grad_norm": 5.683539390563965, + "learning_rate": 1.7191022443890274e-05, + "loss": 0.4439, + "step": 11270 + }, + { + "epoch": 2.8129675810473813, + "grad_norm": 7.982929229736328, + "learning_rate": 1.718852867830424e-05, + "loss": 0.426, + "step": 11280 + }, + { + "epoch": 2.815461346633416, + "grad_norm": 5.907116889953613, + "learning_rate": 1.7186034912718205e-05, + "loss": 0.4181, + "step": 11290 + }, + { + "epoch": 2.8179551122194515, + "grad_norm": 6.01471471786499, + "learning_rate": 1.718354114713217e-05, + "loss": 0.4148, + "step": 11300 + }, + { + "epoch": 2.8204488778054864, + "grad_norm": 4.76801872253418, + "learning_rate": 1.7181047381546136e-05, + "loss": 0.3851, + "step": 11310 + }, + { + "epoch": 2.8229426433915212, + "grad_norm": 5.2211785316467285, + "learning_rate": 1.71785536159601e-05, + "loss": 0.4307, + "step": 11320 + }, + { + "epoch": 2.825436408977556, + "grad_norm": 3.697915554046631, + "learning_rate": 1.7176059850374066e-05, + "loss": 0.4286, + "step": 11330 + }, + { + "epoch": 2.827930174563591, + "grad_norm": 4.701713562011719, + "learning_rate": 1.717356608478803e-05, + "loss": 0.4104, + "step": 11340 + }, + { + "epoch": 2.830423940149626, + "grad_norm": 7.1458964347839355, + "learning_rate": 1.7171072319201997e-05, + "loss": 0.4007, + "step": 11350 + }, + { + "epoch": 2.8329177057356607, + "grad_norm": 4.9811272621154785, + "learning_rate": 1.716857855361596e-05, + "loss": 0.3228, + "step": 11360 + }, + { + "epoch": 2.835411471321696, + "grad_norm": 7.584052085876465, + "learning_rate": 1.7166084788029928e-05, + "loss": 0.4144, + "step": 11370 + }, + { + "epoch": 2.837905236907731, + "grad_norm": 6.256438732147217, + "learning_rate": 1.716359102244389e-05, + "loss": 0.459, + "step": 11380 + }, + { + "epoch": 2.8403990024937658, + "grad_norm": 6.128487586975098, + "learning_rate": 1.7161097256857858e-05, + "loss": 0.4185, + "step": 11390 + }, + { + "epoch": 2.8428927680798006, + "grad_norm": 5.535672664642334, + "learning_rate": 1.7158603491271822e-05, + "loss": 0.451, + "step": 11400 + }, + { + "epoch": 2.8453865336658355, + "grad_norm": 5.653360843658447, + "learning_rate": 1.715610972568579e-05, + "loss": 0.5224, + "step": 11410 + }, + { + "epoch": 2.8478802992518704, + "grad_norm": 8.358339309692383, + "learning_rate": 1.7153615960099752e-05, + "loss": 0.431, + "step": 11420 + }, + { + "epoch": 2.8503740648379052, + "grad_norm": 5.042673587799072, + "learning_rate": 1.7151122194513716e-05, + "loss": 0.4588, + "step": 11430 + }, + { + "epoch": 2.85286783042394, + "grad_norm": 5.333296298980713, + "learning_rate": 1.7148628428927683e-05, + "loss": 0.3947, + "step": 11440 + }, + { + "epoch": 2.855361596009975, + "grad_norm": 4.792226791381836, + "learning_rate": 1.7146134663341647e-05, + "loss": 0.3985, + "step": 11450 + }, + { + "epoch": 2.85785536159601, + "grad_norm": 6.703670501708984, + "learning_rate": 1.7143640897755614e-05, + "loss": 0.4351, + "step": 11460 + }, + { + "epoch": 2.8603491271820447, + "grad_norm": 6.484237194061279, + "learning_rate": 1.7141147132169577e-05, + "loss": 0.4915, + "step": 11470 + }, + { + "epoch": 2.8628428927680796, + "grad_norm": 7.9989447593688965, + "learning_rate": 1.713865336658354e-05, + "loss": 0.3769, + "step": 11480 + }, + { + "epoch": 2.8653366583541144, + "grad_norm": 5.715572834014893, + "learning_rate": 1.7136159600997508e-05, + "loss": 0.5566, + "step": 11490 + }, + { + "epoch": 2.8678304239401498, + "grad_norm": 5.8361945152282715, + "learning_rate": 1.713366583541147e-05, + "loss": 0.3843, + "step": 11500 + }, + { + "epoch": 2.8703241895261846, + "grad_norm": 7.280886650085449, + "learning_rate": 1.713117206982544e-05, + "loss": 0.4158, + "step": 11510 + }, + { + "epoch": 2.8728179551122195, + "grad_norm": 6.559328556060791, + "learning_rate": 1.7128678304239402e-05, + "loss": 0.4373, + "step": 11520 + }, + { + "epoch": 2.8753117206982544, + "grad_norm": 8.497318267822266, + "learning_rate": 1.712618453865337e-05, + "loss": 0.4206, + "step": 11530 + }, + { + "epoch": 2.8778054862842892, + "grad_norm": 4.384303092956543, + "learning_rate": 1.7123690773067333e-05, + "loss": 0.3942, + "step": 11540 + }, + { + "epoch": 2.880299251870324, + "grad_norm": 6.503232002258301, + "learning_rate": 1.71211970074813e-05, + "loss": 0.5279, + "step": 11550 + }, + { + "epoch": 2.882793017456359, + "grad_norm": 6.880457878112793, + "learning_rate": 1.7118703241895263e-05, + "loss": 0.4544, + "step": 11560 + }, + { + "epoch": 2.8852867830423943, + "grad_norm": 5.289419174194336, + "learning_rate": 1.711620947630923e-05, + "loss": 0.414, + "step": 11570 + }, + { + "epoch": 2.887780548628429, + "grad_norm": 7.233415126800537, + "learning_rate": 1.7113715710723194e-05, + "loss": 0.4409, + "step": 11580 + }, + { + "epoch": 2.890274314214464, + "grad_norm": 5.534732818603516, + "learning_rate": 1.7111221945137158e-05, + "loss": 0.4652, + "step": 11590 + }, + { + "epoch": 2.892768079800499, + "grad_norm": 6.039757251739502, + "learning_rate": 1.7108728179551125e-05, + "loss": 0.4492, + "step": 11600 + }, + { + "epoch": 2.8952618453865338, + "grad_norm": 8.882451057434082, + "learning_rate": 1.7106234413965088e-05, + "loss": 0.3906, + "step": 11610 + }, + { + "epoch": 2.8977556109725686, + "grad_norm": 6.894459247589111, + "learning_rate": 1.7103740648379055e-05, + "loss": 0.4368, + "step": 11620 + }, + { + "epoch": 2.9002493765586035, + "grad_norm": 4.845560073852539, + "learning_rate": 1.710124688279302e-05, + "loss": 0.4496, + "step": 11630 + }, + { + "epoch": 2.9027431421446384, + "grad_norm": 4.972646713256836, + "learning_rate": 1.7098753117206982e-05, + "loss": 0.4763, + "step": 11640 + }, + { + "epoch": 2.9052369077306732, + "grad_norm": 5.399471759796143, + "learning_rate": 1.709625935162095e-05, + "loss": 0.4365, + "step": 11650 + }, + { + "epoch": 2.907730673316708, + "grad_norm": 5.46916389465332, + "learning_rate": 1.7093765586034913e-05, + "loss": 0.4315, + "step": 11660 + }, + { + "epoch": 2.910224438902743, + "grad_norm": 9.235302925109863, + "learning_rate": 1.7091271820448877e-05, + "loss": 0.4335, + "step": 11670 + }, + { + "epoch": 2.912718204488778, + "grad_norm": 3.6637065410614014, + "learning_rate": 1.7088778054862844e-05, + "loss": 0.4421, + "step": 11680 + }, + { + "epoch": 2.9152119700748127, + "grad_norm": 4.475642681121826, + "learning_rate": 1.7086284289276807e-05, + "loss": 0.3812, + "step": 11690 + }, + { + "epoch": 2.917705735660848, + "grad_norm": 4.71533203125, + "learning_rate": 1.7083790523690774e-05, + "loss": 0.3723, + "step": 11700 + }, + { + "epoch": 2.920199501246883, + "grad_norm": 9.815446853637695, + "learning_rate": 1.7081296758104738e-05, + "loss": 0.4587, + "step": 11710 + }, + { + "epoch": 2.9226932668329177, + "grad_norm": 5.071043968200684, + "learning_rate": 1.7078802992518705e-05, + "loss": 0.4285, + "step": 11720 + }, + { + "epoch": 2.9251870324189526, + "grad_norm": 4.660521507263184, + "learning_rate": 1.7076309226932672e-05, + "loss": 0.4849, + "step": 11730 + }, + { + "epoch": 2.9276807980049875, + "grad_norm": 4.910328388214111, + "learning_rate": 1.7073815461346636e-05, + "loss": 0.4813, + "step": 11740 + }, + { + "epoch": 2.9301745635910224, + "grad_norm": 5.175450801849365, + "learning_rate": 1.7071321695760603e-05, + "loss": 0.4067, + "step": 11750 + }, + { + "epoch": 2.932668329177057, + "grad_norm": 5.369016170501709, + "learning_rate": 1.7068827930174566e-05, + "loss": 0.4048, + "step": 11760 + }, + { + "epoch": 2.9351620947630925, + "grad_norm": 4.3544745445251465, + "learning_rate": 1.706633416458853e-05, + "loss": 0.4286, + "step": 11770 + }, + { + "epoch": 2.9376558603491274, + "grad_norm": 7.268121242523193, + "learning_rate": 1.7063840399002497e-05, + "loss": 0.3988, + "step": 11780 + }, + { + "epoch": 2.9401496259351623, + "grad_norm": 5.836695194244385, + "learning_rate": 1.706134663341646e-05, + "loss": 0.4187, + "step": 11790 + }, + { + "epoch": 2.942643391521197, + "grad_norm": 4.910001754760742, + "learning_rate": 1.7058852867830424e-05, + "loss": 0.4232, + "step": 11800 + }, + { + "epoch": 2.945137157107232, + "grad_norm": 3.5782833099365234, + "learning_rate": 1.705635910224439e-05, + "loss": 0.5034, + "step": 11810 + }, + { + "epoch": 2.947630922693267, + "grad_norm": 4.067619323730469, + "learning_rate": 1.7053865336658355e-05, + "loss": 0.3851, + "step": 11820 + }, + { + "epoch": 2.9501246882793017, + "grad_norm": 9.091952323913574, + "learning_rate": 1.705137157107232e-05, + "loss": 0.5536, + "step": 11830 + }, + { + "epoch": 2.9526184538653366, + "grad_norm": 7.382187366485596, + "learning_rate": 1.7048877805486285e-05, + "loss": 0.3732, + "step": 11840 + }, + { + "epoch": 2.9551122194513715, + "grad_norm": 5.961577415466309, + "learning_rate": 1.704638403990025e-05, + "loss": 0.3849, + "step": 11850 + }, + { + "epoch": 2.9576059850374063, + "grad_norm": 7.97136116027832, + "learning_rate": 1.7043890274314216e-05, + "loss": 0.4156, + "step": 11860 + }, + { + "epoch": 2.960099750623441, + "grad_norm": 14.47574234008789, + "learning_rate": 1.704139650872818e-05, + "loss": 0.4306, + "step": 11870 + }, + { + "epoch": 2.962593516209476, + "grad_norm": 6.207291603088379, + "learning_rate": 1.7038902743142146e-05, + "loss": 0.3749, + "step": 11880 + }, + { + "epoch": 2.965087281795511, + "grad_norm": 6.804027080535889, + "learning_rate": 1.703640897755611e-05, + "loss": 0.4808, + "step": 11890 + }, + { + "epoch": 2.9675810473815463, + "grad_norm": 7.510690689086914, + "learning_rate": 1.7033915211970077e-05, + "loss": 0.5506, + "step": 11900 + }, + { + "epoch": 2.970074812967581, + "grad_norm": 10.168417930603027, + "learning_rate": 1.703142144638404e-05, + "loss": 0.5405, + "step": 11910 + }, + { + "epoch": 2.972568578553616, + "grad_norm": 5.433992385864258, + "learning_rate": 1.7028927680798008e-05, + "loss": 0.4205, + "step": 11920 + }, + { + "epoch": 2.975062344139651, + "grad_norm": 5.19559907913208, + "learning_rate": 1.702643391521197e-05, + "loss": 0.3868, + "step": 11930 + }, + { + "epoch": 2.9775561097256857, + "grad_norm": 6.645895004272461, + "learning_rate": 1.702394014962594e-05, + "loss": 0.4322, + "step": 11940 + }, + { + "epoch": 2.9800498753117206, + "grad_norm": 5.916768550872803, + "learning_rate": 1.7021446384039902e-05, + "loss": 0.3847, + "step": 11950 + }, + { + "epoch": 2.9825436408977555, + "grad_norm": 4.953275203704834, + "learning_rate": 1.701895261845387e-05, + "loss": 0.4585, + "step": 11960 + }, + { + "epoch": 2.985037406483791, + "grad_norm": 3.9088947772979736, + "learning_rate": 1.7016458852867833e-05, + "loss": 0.3925, + "step": 11970 + }, + { + "epoch": 2.9875311720698257, + "grad_norm": 12.187833786010742, + "learning_rate": 1.7013965087281796e-05, + "loss": 0.5033, + "step": 11980 + }, + { + "epoch": 2.9900249376558605, + "grad_norm": 6.370995998382568, + "learning_rate": 1.7011471321695763e-05, + "loss": 0.3995, + "step": 11990 + }, + { + "epoch": 2.9925187032418954, + "grad_norm": 5.4293365478515625, + "learning_rate": 1.7008977556109727e-05, + "loss": 0.4607, + "step": 12000 + }, + { + "epoch": 2.9950124688279303, + "grad_norm": 10.32349967956543, + "learning_rate": 1.700648379052369e-05, + "loss": 0.4499, + "step": 12010 + }, + { + "epoch": 2.997506234413965, + "grad_norm": 4.519608497619629, + "learning_rate": 1.7003990024937657e-05, + "loss": 0.4193, + "step": 12020 + }, + { + "epoch": 3.0, + "grad_norm": 4.505052089691162, + "learning_rate": 1.700149625935162e-05, + "loss": 0.3652, + "step": 12030 + }, + { + "epoch": 3.0, + "eval_loss": 0.43368199467658997, + "eval_runtime": 67.1343, + "eval_samples_per_second": 14.94, + "eval_steps_per_second": 14.94, + "step": 12030 + }, + { + "epoch": 3.002493765586035, + "grad_norm": 6.454890727996826, + "learning_rate": 1.6999002493765588e-05, + "loss": 0.4374, + "step": 12040 + }, + { + "epoch": 3.0049875311720697, + "grad_norm": 7.071195125579834, + "learning_rate": 1.699650872817955e-05, + "loss": 0.4221, + "step": 12050 + }, + { + "epoch": 3.0074812967581046, + "grad_norm": 5.097235202789307, + "learning_rate": 1.6994014962593515e-05, + "loss": 0.4258, + "step": 12060 + }, + { + "epoch": 3.0099750623441395, + "grad_norm": 4.156216144561768, + "learning_rate": 1.6991521197007482e-05, + "loss": 0.3844, + "step": 12070 + }, + { + "epoch": 3.0124688279301743, + "grad_norm": 4.2227325439453125, + "learning_rate": 1.698902743142145e-05, + "loss": 0.3943, + "step": 12080 + }, + { + "epoch": 3.0149625935162097, + "grad_norm": 5.322434425354004, + "learning_rate": 1.6986533665835413e-05, + "loss": 0.4466, + "step": 12090 + }, + { + "epoch": 3.0174563591022445, + "grad_norm": 7.377932548522949, + "learning_rate": 1.698403990024938e-05, + "loss": 0.4465, + "step": 12100 + }, + { + "epoch": 3.0199501246882794, + "grad_norm": 4.9225263595581055, + "learning_rate": 1.6981546134663343e-05, + "loss": 0.389, + "step": 12110 + }, + { + "epoch": 3.0224438902743143, + "grad_norm": 4.286352157592773, + "learning_rate": 1.697905236907731e-05, + "loss": 0.3605, + "step": 12120 + }, + { + "epoch": 3.024937655860349, + "grad_norm": 7.271639347076416, + "learning_rate": 1.6976558603491274e-05, + "loss": 0.4093, + "step": 12130 + }, + { + "epoch": 3.027431421446384, + "grad_norm": 9.17022705078125, + "learning_rate": 1.6974064837905238e-05, + "loss": 0.4649, + "step": 12140 + }, + { + "epoch": 3.029925187032419, + "grad_norm": 11.9248046875, + "learning_rate": 1.6971571072319205e-05, + "loss": 0.4102, + "step": 12150 + }, + { + "epoch": 3.0324189526184537, + "grad_norm": 5.974640369415283, + "learning_rate": 1.696907730673317e-05, + "loss": 0.4345, + "step": 12160 + }, + { + "epoch": 3.0349127182044886, + "grad_norm": 7.954996109008789, + "learning_rate": 1.6966583541147132e-05, + "loss": 0.3358, + "step": 12170 + }, + { + "epoch": 3.037406483790524, + "grad_norm": 7.6409220695495605, + "learning_rate": 1.6964339152119704e-05, + "loss": 0.4094, + "step": 12180 + }, + { + "epoch": 3.039900249376559, + "grad_norm": 6.415146827697754, + "learning_rate": 1.6961845386533667e-05, + "loss": 0.408, + "step": 12190 + }, + { + "epoch": 3.0423940149625937, + "grad_norm": 6.253342151641846, + "learning_rate": 1.6959351620947634e-05, + "loss": 0.385, + "step": 12200 + }, + { + "epoch": 3.0448877805486285, + "grad_norm": 6.785040378570557, + "learning_rate": 1.6956857855361598e-05, + "loss": 0.3796, + "step": 12210 + }, + { + "epoch": 3.0473815461346634, + "grad_norm": 5.031459331512451, + "learning_rate": 1.6954364089775565e-05, + "loss": 0.3762, + "step": 12220 + }, + { + "epoch": 3.0498753117206983, + "grad_norm": 5.665307521820068, + "learning_rate": 1.695187032418953e-05, + "loss": 0.4444, + "step": 12230 + }, + { + "epoch": 3.052369077306733, + "grad_norm": 9.208979606628418, + "learning_rate": 1.6949376558603492e-05, + "loss": 0.41, + "step": 12240 + }, + { + "epoch": 3.054862842892768, + "grad_norm": 6.723043441772461, + "learning_rate": 1.694688279301746e-05, + "loss": 0.378, + "step": 12250 + }, + { + "epoch": 3.057356608478803, + "grad_norm": 9.06165599822998, + "learning_rate": 1.6944389027431423e-05, + "loss": 0.3957, + "step": 12260 + }, + { + "epoch": 3.0598503740648377, + "grad_norm": 6.359433650970459, + "learning_rate": 1.6941895261845386e-05, + "loss": 0.3632, + "step": 12270 + }, + { + "epoch": 3.0623441396508726, + "grad_norm": 4.539393901824951, + "learning_rate": 1.6939401496259353e-05, + "loss": 0.4039, + "step": 12280 + }, + { + "epoch": 3.064837905236908, + "grad_norm": 7.049586772918701, + "learning_rate": 1.6936907730673317e-05, + "loss": 0.4395, + "step": 12290 + }, + { + "epoch": 3.067331670822943, + "grad_norm": 7.9074835777282715, + "learning_rate": 1.6934413965087284e-05, + "loss": 0.4092, + "step": 12300 + }, + { + "epoch": 3.0698254364089776, + "grad_norm": 6.730893611907959, + "learning_rate": 1.6931920199501248e-05, + "loss": 0.4422, + "step": 12310 + }, + { + "epoch": 3.0723192019950125, + "grad_norm": 4.9922194480896, + "learning_rate": 1.692942643391521e-05, + "loss": 0.3308, + "step": 12320 + }, + { + "epoch": 3.0748129675810474, + "grad_norm": 7.608273506164551, + "learning_rate": 1.6926932668329178e-05, + "loss": 0.363, + "step": 12330 + }, + { + "epoch": 3.0773067331670823, + "grad_norm": 7.005294322967529, + "learning_rate": 1.6924438902743142e-05, + "loss": 0.4175, + "step": 12340 + }, + { + "epoch": 3.079800498753117, + "grad_norm": 7.616174221038818, + "learning_rate": 1.692194513715711e-05, + "loss": 0.3848, + "step": 12350 + }, + { + "epoch": 3.082294264339152, + "grad_norm": 5.1623921394348145, + "learning_rate": 1.6919451371571072e-05, + "loss": 0.4422, + "step": 12360 + }, + { + "epoch": 3.084788029925187, + "grad_norm": 7.74609899520874, + "learning_rate": 1.691695760598504e-05, + "loss": 0.3759, + "step": 12370 + }, + { + "epoch": 3.087281795511222, + "grad_norm": 5.895921230316162, + "learning_rate": 1.6914463840399003e-05, + "loss": 0.4285, + "step": 12380 + }, + { + "epoch": 3.089775561097257, + "grad_norm": 10.510361671447754, + "learning_rate": 1.691197007481297e-05, + "loss": 0.3517, + "step": 12390 + }, + { + "epoch": 3.092269326683292, + "grad_norm": 6.013721466064453, + "learning_rate": 1.6909476309226934e-05, + "loss": 0.3976, + "step": 12400 + }, + { + "epoch": 3.0947630922693268, + "grad_norm": 7.516636371612549, + "learning_rate": 1.69069825436409e-05, + "loss": 0.4454, + "step": 12410 + }, + { + "epoch": 3.0972568578553616, + "grad_norm": 4.560348033905029, + "learning_rate": 1.6904488778054864e-05, + "loss": 0.4975, + "step": 12420 + }, + { + "epoch": 3.0997506234413965, + "grad_norm": 5.868087291717529, + "learning_rate": 1.690199501246883e-05, + "loss": 0.4223, + "step": 12430 + }, + { + "epoch": 3.1022443890274314, + "grad_norm": 5.38186502456665, + "learning_rate": 1.6899501246882795e-05, + "loss": 0.3995, + "step": 12440 + }, + { + "epoch": 3.1047381546134662, + "grad_norm": 6.7590436935424805, + "learning_rate": 1.689700748129676e-05, + "loss": 0.3922, + "step": 12450 + }, + { + "epoch": 3.107231920199501, + "grad_norm": 6.269115447998047, + "learning_rate": 1.6894513715710726e-05, + "loss": 0.4457, + "step": 12460 + }, + { + "epoch": 3.109725685785536, + "grad_norm": 17.736539840698242, + "learning_rate": 1.689201995012469e-05, + "loss": 0.4161, + "step": 12470 + }, + { + "epoch": 3.112219451371571, + "grad_norm": 6.468410491943359, + "learning_rate": 1.6889526184538653e-05, + "loss": 0.4411, + "step": 12480 + }, + { + "epoch": 3.114713216957606, + "grad_norm": 11.51278305053711, + "learning_rate": 1.688703241895262e-05, + "loss": 0.4001, + "step": 12490 + }, + { + "epoch": 3.117206982543641, + "grad_norm": 7.40097188949585, + "learning_rate": 1.6884538653366583e-05, + "loss": 0.3631, + "step": 12500 + }, + { + "epoch": 3.119700748129676, + "grad_norm": 4.9506683349609375, + "learning_rate": 1.688204488778055e-05, + "loss": 0.4302, + "step": 12510 + }, + { + "epoch": 3.1221945137157108, + "grad_norm": 4.265248775482178, + "learning_rate": 1.6879551122194514e-05, + "loss": 0.3701, + "step": 12520 + }, + { + "epoch": 3.1246882793017456, + "grad_norm": 13.634464263916016, + "learning_rate": 1.687705735660848e-05, + "loss": 0.3986, + "step": 12530 + }, + { + "epoch": 3.1271820448877805, + "grad_norm": 6.742231369018555, + "learning_rate": 1.6874563591022445e-05, + "loss": 0.4203, + "step": 12540 + }, + { + "epoch": 3.1296758104738154, + "grad_norm": 5.28926944732666, + "learning_rate": 1.687206982543641e-05, + "loss": 0.4416, + "step": 12550 + }, + { + "epoch": 3.1321695760598502, + "grad_norm": 5.512608051300049, + "learning_rate": 1.6869576059850375e-05, + "loss": 0.4417, + "step": 12560 + }, + { + "epoch": 3.134663341645885, + "grad_norm": 6.725086212158203, + "learning_rate": 1.6867082294264342e-05, + "loss": 0.3786, + "step": 12570 + }, + { + "epoch": 3.1371571072319204, + "grad_norm": 6.855806827545166, + "learning_rate": 1.6864588528678306e-05, + "loss": 0.4306, + "step": 12580 + }, + { + "epoch": 3.1396508728179553, + "grad_norm": 8.445453643798828, + "learning_rate": 1.6862094763092273e-05, + "loss": 0.4489, + "step": 12590 + }, + { + "epoch": 3.14214463840399, + "grad_norm": 4.596224784851074, + "learning_rate": 1.6859600997506236e-05, + "loss": 0.3834, + "step": 12600 + }, + { + "epoch": 3.144638403990025, + "grad_norm": 6.923195838928223, + "learning_rate": 1.68571072319202e-05, + "loss": 0.4399, + "step": 12610 + }, + { + "epoch": 3.14713216957606, + "grad_norm": 3.491319417953491, + "learning_rate": 1.6854613466334167e-05, + "loss": 0.3944, + "step": 12620 + }, + { + "epoch": 3.1496259351620948, + "grad_norm": 6.328956127166748, + "learning_rate": 1.685211970074813e-05, + "loss": 0.3583, + "step": 12630 + }, + { + "epoch": 3.1521197007481296, + "grad_norm": 6.093251705169678, + "learning_rate": 1.6849625935162098e-05, + "loss": 0.3517, + "step": 12640 + }, + { + "epoch": 3.1546134663341645, + "grad_norm": 6.187097549438477, + "learning_rate": 1.684713216957606e-05, + "loss": 0.4233, + "step": 12650 + }, + { + "epoch": 3.1571072319201994, + "grad_norm": 7.890842437744141, + "learning_rate": 1.6844638403990025e-05, + "loss": 0.3321, + "step": 12660 + }, + { + "epoch": 3.1596009975062342, + "grad_norm": 6.837558746337891, + "learning_rate": 1.6842144638403992e-05, + "loss": 0.4426, + "step": 12670 + }, + { + "epoch": 3.162094763092269, + "grad_norm": 5.910398960113525, + "learning_rate": 1.6839650872817956e-05, + "loss": 0.4121, + "step": 12680 + }, + { + "epoch": 3.1645885286783044, + "grad_norm": 6.540918827056885, + "learning_rate": 1.683715710723192e-05, + "loss": 0.4617, + "step": 12690 + }, + { + "epoch": 3.1670822942643393, + "grad_norm": 7.381392955780029, + "learning_rate": 1.6834663341645886e-05, + "loss": 0.4303, + "step": 12700 + }, + { + "epoch": 3.169576059850374, + "grad_norm": 7.120436668395996, + "learning_rate": 1.683216957605985e-05, + "loss": 0.4569, + "step": 12710 + }, + { + "epoch": 3.172069825436409, + "grad_norm": 4.337961196899414, + "learning_rate": 1.6829675810473817e-05, + "loss": 0.3563, + "step": 12720 + }, + { + "epoch": 3.174563591022444, + "grad_norm": 5.855359077453613, + "learning_rate": 1.682718204488778e-05, + "loss": 0.5174, + "step": 12730 + }, + { + "epoch": 3.1770573566084788, + "grad_norm": 5.234665393829346, + "learning_rate": 1.6824688279301747e-05, + "loss": 0.3649, + "step": 12740 + }, + { + "epoch": 3.1795511221945136, + "grad_norm": 7.155577182769775, + "learning_rate": 1.6822194513715714e-05, + "loss": 0.4493, + "step": 12750 + }, + { + "epoch": 3.1820448877805485, + "grad_norm": 6.0398406982421875, + "learning_rate": 1.6819700748129678e-05, + "loss": 0.337, + "step": 12760 + }, + { + "epoch": 3.1845386533665834, + "grad_norm": 6.6076250076293945, + "learning_rate": 1.681720698254364e-05, + "loss": 0.3503, + "step": 12770 + }, + { + "epoch": 3.1870324189526187, + "grad_norm": 6.856815814971924, + "learning_rate": 1.681471321695761e-05, + "loss": 0.4076, + "step": 12780 + }, + { + "epoch": 3.1895261845386536, + "grad_norm": 9.046415328979492, + "learning_rate": 1.6812219451371572e-05, + "loss": 0.4621, + "step": 12790 + }, + { + "epoch": 3.1920199501246884, + "grad_norm": 6.603618144989014, + "learning_rate": 1.680972568578554e-05, + "loss": 0.4491, + "step": 12800 + }, + { + "epoch": 3.1945137157107233, + "grad_norm": 8.234539985656738, + "learning_rate": 1.6807231920199503e-05, + "loss": 0.355, + "step": 12810 + }, + { + "epoch": 3.197007481296758, + "grad_norm": 6.2086005210876465, + "learning_rate": 1.6804738154613467e-05, + "loss": 0.3679, + "step": 12820 + }, + { + "epoch": 3.199501246882793, + "grad_norm": 5.592929363250732, + "learning_rate": 1.6802244389027434e-05, + "loss": 0.3855, + "step": 12830 + }, + { + "epoch": 3.201995012468828, + "grad_norm": 5.385773658752441, + "learning_rate": 1.6799750623441397e-05, + "loss": 0.4592, + "step": 12840 + }, + { + "epoch": 3.2044887780548628, + "grad_norm": 4.771883010864258, + "learning_rate": 1.679725685785536e-05, + "loss": 0.3648, + "step": 12850 + }, + { + "epoch": 3.2069825436408976, + "grad_norm": 7.125466823577881, + "learning_rate": 1.6794763092269328e-05, + "loss": 0.4146, + "step": 12860 + }, + { + "epoch": 3.2094763092269325, + "grad_norm": 5.639003753662109, + "learning_rate": 1.679226932668329e-05, + "loss": 0.4068, + "step": 12870 + }, + { + "epoch": 3.2119700748129674, + "grad_norm": 4.13544225692749, + "learning_rate": 1.678977556109726e-05, + "loss": 0.3519, + "step": 12880 + }, + { + "epoch": 3.2144638403990027, + "grad_norm": 5.362157344818115, + "learning_rate": 1.6787281795511222e-05, + "loss": 0.4477, + "step": 12890 + }, + { + "epoch": 3.2169576059850375, + "grad_norm": 5.619980335235596, + "learning_rate": 1.678478802992519e-05, + "loss": 0.416, + "step": 12900 + }, + { + "epoch": 3.2194513715710724, + "grad_norm": 6.641584396362305, + "learning_rate": 1.6782294264339153e-05, + "loss": 0.3819, + "step": 12910 + }, + { + "epoch": 3.2219451371571073, + "grad_norm": 6.697845935821533, + "learning_rate": 1.677980049875312e-05, + "loss": 0.3809, + "step": 12920 + }, + { + "epoch": 3.224438902743142, + "grad_norm": 9.737817764282227, + "learning_rate": 1.6777306733167083e-05, + "loss": 0.4195, + "step": 12930 + }, + { + "epoch": 3.226932668329177, + "grad_norm": 3.7950658798217773, + "learning_rate": 1.677481296758105e-05, + "loss": 0.4657, + "step": 12940 + }, + { + "epoch": 3.229426433915212, + "grad_norm": 6.923771858215332, + "learning_rate": 1.6772319201995014e-05, + "loss": 0.3954, + "step": 12950 + }, + { + "epoch": 3.2319201995012468, + "grad_norm": 8.065861701965332, + "learning_rate": 1.676982543640898e-05, + "loss": 0.4534, + "step": 12960 + }, + { + "epoch": 3.2344139650872816, + "grad_norm": 7.0419769287109375, + "learning_rate": 1.6767331670822944e-05, + "loss": 0.3881, + "step": 12970 + }, + { + "epoch": 3.236907730673317, + "grad_norm": 5.294747352600098, + "learning_rate": 1.6764837905236908e-05, + "loss": 0.4064, + "step": 12980 + }, + { + "epoch": 3.239401496259352, + "grad_norm": 4.490688800811768, + "learning_rate": 1.6762344139650875e-05, + "loss": 0.4715, + "step": 12990 + }, + { + "epoch": 3.2418952618453867, + "grad_norm": 6.766694068908691, + "learning_rate": 1.675985037406484e-05, + "loss": 0.3785, + "step": 13000 + }, + { + "epoch": 3.2443890274314215, + "grad_norm": 4.411302089691162, + "learning_rate": 1.6757356608478806e-05, + "loss": 0.4342, + "step": 13010 + }, + { + "epoch": 3.2468827930174564, + "grad_norm": 5.128880500793457, + "learning_rate": 1.675486284289277e-05, + "loss": 0.4306, + "step": 13020 + }, + { + "epoch": 3.2493765586034913, + "grad_norm": 8.443227767944336, + "learning_rate": 1.6752369077306733e-05, + "loss": 0.3925, + "step": 13030 + }, + { + "epoch": 3.251870324189526, + "grad_norm": 5.713132858276367, + "learning_rate": 1.67498753117207e-05, + "loss": 0.377, + "step": 13040 + }, + { + "epoch": 3.254364089775561, + "grad_norm": 6.96317720413208, + "learning_rate": 1.6747381546134664e-05, + "loss": 0.3748, + "step": 13050 + }, + { + "epoch": 3.256857855361596, + "grad_norm": 5.586208343505859, + "learning_rate": 1.6744887780548627e-05, + "loss": 0.3898, + "step": 13060 + }, + { + "epoch": 3.2593516209476308, + "grad_norm": 8.379228591918945, + "learning_rate": 1.6742394014962594e-05, + "loss": 0.4784, + "step": 13070 + }, + { + "epoch": 3.2618453865336656, + "grad_norm": 8.025406837463379, + "learning_rate": 1.6739900249376558e-05, + "loss": 0.3903, + "step": 13080 + }, + { + "epoch": 3.264339152119701, + "grad_norm": 8.072983741760254, + "learning_rate": 1.6737406483790525e-05, + "loss": 0.4254, + "step": 13090 + }, + { + "epoch": 3.266832917705736, + "grad_norm": 5.819389343261719, + "learning_rate": 1.6734912718204492e-05, + "loss": 0.4922, + "step": 13100 + }, + { + "epoch": 3.2693266832917707, + "grad_norm": 6.674468040466309, + "learning_rate": 1.6732418952618455e-05, + "loss": 0.438, + "step": 13110 + }, + { + "epoch": 3.2718204488778055, + "grad_norm": 5.349725246429443, + "learning_rate": 1.6729925187032422e-05, + "loss": 0.3625, + "step": 13120 + }, + { + "epoch": 3.2743142144638404, + "grad_norm": 5.366141319274902, + "learning_rate": 1.6727431421446386e-05, + "loss": 0.3778, + "step": 13130 + }, + { + "epoch": 3.2768079800498753, + "grad_norm": 4.81095552444458, + "learning_rate": 1.6724937655860353e-05, + "loss": 0.3809, + "step": 13140 + }, + { + "epoch": 3.27930174563591, + "grad_norm": 5.11724328994751, + "learning_rate": 1.6722443890274317e-05, + "loss": 0.3866, + "step": 13150 + }, + { + "epoch": 3.281795511221945, + "grad_norm": 4.886108875274658, + "learning_rate": 1.671995012468828e-05, + "loss": 0.3952, + "step": 13160 + }, + { + "epoch": 3.28428927680798, + "grad_norm": 6.775887966156006, + "learning_rate": 1.6717456359102247e-05, + "loss": 0.4382, + "step": 13170 + }, + { + "epoch": 3.286783042394015, + "grad_norm": 4.1350579261779785, + "learning_rate": 1.671496259351621e-05, + "loss": 0.3817, + "step": 13180 + }, + { + "epoch": 3.28927680798005, + "grad_norm": 10.852806091308594, + "learning_rate": 1.6712468827930174e-05, + "loss": 0.4278, + "step": 13190 + }, + { + "epoch": 3.291770573566085, + "grad_norm": 4.8792877197265625, + "learning_rate": 1.670997506234414e-05, + "loss": 0.3502, + "step": 13200 + }, + { + "epoch": 3.29426433915212, + "grad_norm": 6.0365400314331055, + "learning_rate": 1.6707481296758105e-05, + "loss": 0.3906, + "step": 13210 + }, + { + "epoch": 3.2967581047381547, + "grad_norm": 8.025360107421875, + "learning_rate": 1.6704987531172072e-05, + "loss": 0.4406, + "step": 13220 + }, + { + "epoch": 3.2992518703241895, + "grad_norm": 7.672976970672607, + "learning_rate": 1.6702493765586036e-05, + "loss": 0.4107, + "step": 13230 + }, + { + "epoch": 3.3017456359102244, + "grad_norm": 4.216901779174805, + "learning_rate": 1.67e-05, + "loss": 0.3429, + "step": 13240 + }, + { + "epoch": 3.3042394014962593, + "grad_norm": 5.8930583000183105, + "learning_rate": 1.6697506234413966e-05, + "loss": 0.4131, + "step": 13250 + }, + { + "epoch": 3.306733167082294, + "grad_norm": 6.364641189575195, + "learning_rate": 1.669501246882793e-05, + "loss": 0.4126, + "step": 13260 + }, + { + "epoch": 3.309226932668329, + "grad_norm": 4.867658615112305, + "learning_rate": 1.6692518703241897e-05, + "loss": 0.3936, + "step": 13270 + }, + { + "epoch": 3.311720698254364, + "grad_norm": 4.648925304412842, + "learning_rate": 1.669002493765586e-05, + "loss": 0.356, + "step": 13280 + }, + { + "epoch": 3.314214463840399, + "grad_norm": 5.959649562835693, + "learning_rate": 1.6687531172069828e-05, + "loss": 0.4527, + "step": 13290 + }, + { + "epoch": 3.316708229426434, + "grad_norm": 5.857553005218506, + "learning_rate": 1.668503740648379e-05, + "loss": 0.3464, + "step": 13300 + }, + { + "epoch": 3.319201995012469, + "grad_norm": 7.903529167175293, + "learning_rate": 1.6682543640897758e-05, + "loss": 0.3983, + "step": 13310 + }, + { + "epoch": 3.321695760598504, + "grad_norm": 7.403318881988525, + "learning_rate": 1.6680049875311722e-05, + "loss": 0.39, + "step": 13320 + }, + { + "epoch": 3.3241895261845387, + "grad_norm": 4.094770431518555, + "learning_rate": 1.667755610972569e-05, + "loss": 0.3534, + "step": 13330 + }, + { + "epoch": 3.3266832917705735, + "grad_norm": 6.073254585266113, + "learning_rate": 1.6675062344139652e-05, + "loss": 0.4348, + "step": 13340 + }, + { + "epoch": 3.3291770573566084, + "grad_norm": 5.4333271980285645, + "learning_rate": 1.667256857855362e-05, + "loss": 0.4438, + "step": 13350 + }, + { + "epoch": 3.3316708229426433, + "grad_norm": 8.86145305633545, + "learning_rate": 1.6670074812967583e-05, + "loss": 0.4223, + "step": 13360 + }, + { + "epoch": 3.334164588528678, + "grad_norm": 4.68287467956543, + "learning_rate": 1.6667581047381547e-05, + "loss": 0.419, + "step": 13370 + }, + { + "epoch": 3.3366583541147135, + "grad_norm": 5.4317827224731445, + "learning_rate": 1.6665087281795514e-05, + "loss": 0.4438, + "step": 13380 + }, + { + "epoch": 3.3391521197007483, + "grad_norm": 6.64384651184082, + "learning_rate": 1.6662842892768082e-05, + "loss": 0.4448, + "step": 13390 + }, + { + "epoch": 3.341645885286783, + "grad_norm": 6.038212776184082, + "learning_rate": 1.6660349127182046e-05, + "loss": 0.4357, + "step": 13400 + }, + { + "epoch": 3.344139650872818, + "grad_norm": 7.981716156005859, + "learning_rate": 1.6657855361596013e-05, + "loss": 0.4141, + "step": 13410 + }, + { + "epoch": 3.346633416458853, + "grad_norm": 6.937891006469727, + "learning_rate": 1.6655361596009976e-05, + "loss": 0.426, + "step": 13420 + }, + { + "epoch": 3.349127182044888, + "grad_norm": 8.279930114746094, + "learning_rate": 1.6652867830423943e-05, + "loss": 0.3797, + "step": 13430 + }, + { + "epoch": 3.3516209476309227, + "grad_norm": 6.905846118927002, + "learning_rate": 1.6650374064837907e-05, + "loss": 0.3517, + "step": 13440 + }, + { + "epoch": 3.3541147132169575, + "grad_norm": 4.052714824676514, + "learning_rate": 1.664788029925187e-05, + "loss": 0.3177, + "step": 13450 + }, + { + "epoch": 3.3566084788029924, + "grad_norm": 6.537426948547363, + "learning_rate": 1.6645386533665837e-05, + "loss": 0.4482, + "step": 13460 + }, + { + "epoch": 3.3591022443890273, + "grad_norm": 5.7491455078125, + "learning_rate": 1.66428927680798e-05, + "loss": 0.385, + "step": 13470 + }, + { + "epoch": 3.361596009975062, + "grad_norm": 7.766640663146973, + "learning_rate": 1.6640399002493768e-05, + "loss": 0.3945, + "step": 13480 + }, + { + "epoch": 3.3640897755610975, + "grad_norm": 5.237179279327393, + "learning_rate": 1.663790523690773e-05, + "loss": 0.4368, + "step": 13490 + }, + { + "epoch": 3.3665835411471323, + "grad_norm": 7.344296932220459, + "learning_rate": 1.6635411471321695e-05, + "loss": 0.3749, + "step": 13500 + }, + { + "epoch": 3.369077306733167, + "grad_norm": 5.677723407745361, + "learning_rate": 1.6632917705735662e-05, + "loss": 0.41, + "step": 13510 + }, + { + "epoch": 3.371571072319202, + "grad_norm": 6.3155412673950195, + "learning_rate": 1.6630423940149626e-05, + "loss": 0.4143, + "step": 13520 + }, + { + "epoch": 3.374064837905237, + "grad_norm": 6.052978038787842, + "learning_rate": 1.6627930174563593e-05, + "loss": 0.4444, + "step": 13530 + }, + { + "epoch": 3.376558603491272, + "grad_norm": 5.203304767608643, + "learning_rate": 1.6625436408977557e-05, + "loss": 0.3819, + "step": 13540 + }, + { + "epoch": 3.3790523690773067, + "grad_norm": 7.154673099517822, + "learning_rate": 1.6622942643391524e-05, + "loss": 0.421, + "step": 13550 + }, + { + "epoch": 3.3815461346633415, + "grad_norm": 6.324401378631592, + "learning_rate": 1.6620448877805487e-05, + "loss": 0.4142, + "step": 13560 + }, + { + "epoch": 3.3840399002493764, + "grad_norm": 5.293837070465088, + "learning_rate": 1.6617955112219454e-05, + "loss": 0.341, + "step": 13570 + }, + { + "epoch": 3.3865336658354117, + "grad_norm": 6.022460460662842, + "learning_rate": 1.6615461346633418e-05, + "loss": 0.4016, + "step": 13580 + }, + { + "epoch": 3.3890274314214466, + "grad_norm": 3.799043655395508, + "learning_rate": 1.6612967581047385e-05, + "loss": 0.4037, + "step": 13590 + }, + { + "epoch": 3.3915211970074814, + "grad_norm": 5.069158554077148, + "learning_rate": 1.661047381546135e-05, + "loss": 0.3429, + "step": 13600 + }, + { + "epoch": 3.3940149625935163, + "grad_norm": 6.114314079284668, + "learning_rate": 1.6607980049875315e-05, + "loss": 0.4867, + "step": 13610 + }, + { + "epoch": 3.396508728179551, + "grad_norm": 5.830687046051025, + "learning_rate": 1.660548628428928e-05, + "loss": 0.3991, + "step": 13620 + }, + { + "epoch": 3.399002493765586, + "grad_norm": 10.9564847946167, + "learning_rate": 1.6602992518703243e-05, + "loss": 0.398, + "step": 13630 + }, + { + "epoch": 3.401496259351621, + "grad_norm": 6.169339179992676, + "learning_rate": 1.660049875311721e-05, + "loss": 0.3501, + "step": 13640 + }, + { + "epoch": 3.403990024937656, + "grad_norm": 4.284657001495361, + "learning_rate": 1.6598004987531173e-05, + "loss": 0.5303, + "step": 13650 + }, + { + "epoch": 3.4064837905236907, + "grad_norm": 5.175209999084473, + "learning_rate": 1.6595511221945137e-05, + "loss": 0.4677, + "step": 13660 + }, + { + "epoch": 3.4089775561097255, + "grad_norm": 4.939456462860107, + "learning_rate": 1.6593017456359104e-05, + "loss": 0.5759, + "step": 13670 + }, + { + "epoch": 3.4114713216957604, + "grad_norm": 6.589729309082031, + "learning_rate": 1.6590523690773067e-05, + "loss": 0.374, + "step": 13680 + }, + { + "epoch": 3.4139650872817953, + "grad_norm": 5.515322685241699, + "learning_rate": 1.6588029925187034e-05, + "loss": 0.3992, + "step": 13690 + }, + { + "epoch": 3.4164588528678306, + "grad_norm": 7.204263687133789, + "learning_rate": 1.6585536159600998e-05, + "loss": 0.443, + "step": 13700 + }, + { + "epoch": 3.4189526184538654, + "grad_norm": 5.2953877449035645, + "learning_rate": 1.6583042394014962e-05, + "loss": 0.3856, + "step": 13710 + }, + { + "epoch": 3.4214463840399003, + "grad_norm": 5.203874111175537, + "learning_rate": 1.658054862842893e-05, + "loss": 0.45, + "step": 13720 + }, + { + "epoch": 3.423940149625935, + "grad_norm": 5.980413436889648, + "learning_rate": 1.6578054862842892e-05, + "loss": 0.4115, + "step": 13730 + }, + { + "epoch": 3.42643391521197, + "grad_norm": 4.752535343170166, + "learning_rate": 1.657556109725686e-05, + "loss": 0.4392, + "step": 13740 + }, + { + "epoch": 3.428927680798005, + "grad_norm": 5.2310919761657715, + "learning_rate": 1.6573067331670823e-05, + "loss": 0.361, + "step": 13750 + }, + { + "epoch": 3.43142144638404, + "grad_norm": 9.963449478149414, + "learning_rate": 1.657057356608479e-05, + "loss": 0.5536, + "step": 13760 + }, + { + "epoch": 3.4339152119700747, + "grad_norm": 5.837104320526123, + "learning_rate": 1.6568079800498757e-05, + "loss": 0.3722, + "step": 13770 + }, + { + "epoch": 3.43640897755611, + "grad_norm": 7.110990524291992, + "learning_rate": 1.656558603491272e-05, + "loss": 0.4229, + "step": 13780 + }, + { + "epoch": 3.438902743142145, + "grad_norm": 5.253129005432129, + "learning_rate": 1.6563092269326684e-05, + "loss": 0.3619, + "step": 13790 + }, + { + "epoch": 3.4413965087281797, + "grad_norm": 7.7098517417907715, + "learning_rate": 1.656059850374065e-05, + "loss": 0.452, + "step": 13800 + }, + { + "epoch": 3.4438902743142146, + "grad_norm": 6.151096820831299, + "learning_rate": 1.6558104738154615e-05, + "loss": 0.4055, + "step": 13810 + }, + { + "epoch": 3.4463840399002494, + "grad_norm": 6.894789218902588, + "learning_rate": 1.6555610972568582e-05, + "loss": 0.3736, + "step": 13820 + }, + { + "epoch": 3.4488778054862843, + "grad_norm": 5.148143291473389, + "learning_rate": 1.6553117206982545e-05, + "loss": 0.462, + "step": 13830 + }, + { + "epoch": 3.451371571072319, + "grad_norm": 5.808004856109619, + "learning_rate": 1.655062344139651e-05, + "loss": 0.3789, + "step": 13840 + }, + { + "epoch": 3.453865336658354, + "grad_norm": 4.1240339279174805, + "learning_rate": 1.6548129675810476e-05, + "loss": 0.4021, + "step": 13850 + }, + { + "epoch": 3.456359102244389, + "grad_norm": 4.346762180328369, + "learning_rate": 1.654563591022444e-05, + "loss": 0.3446, + "step": 13860 + }, + { + "epoch": 3.458852867830424, + "grad_norm": 7.146320819854736, + "learning_rate": 1.6543142144638403e-05, + "loss": 0.395, + "step": 13870 + }, + { + "epoch": 3.4613466334164587, + "grad_norm": 5.127925395965576, + "learning_rate": 1.654064837905237e-05, + "loss": 0.3873, + "step": 13880 + }, + { + "epoch": 3.4638403990024935, + "grad_norm": 6.493956565856934, + "learning_rate": 1.6538154613466334e-05, + "loss": 0.4049, + "step": 13890 + }, + { + "epoch": 3.466334164588529, + "grad_norm": 7.458160400390625, + "learning_rate": 1.65356608478803e-05, + "loss": 0.3721, + "step": 13900 + }, + { + "epoch": 3.4688279301745637, + "grad_norm": 6.495090007781982, + "learning_rate": 1.6533167082294265e-05, + "loss": 0.4702, + "step": 13910 + }, + { + "epoch": 3.4713216957605986, + "grad_norm": 6.47518253326416, + "learning_rate": 1.653067331670823e-05, + "loss": 0.4566, + "step": 13920 + }, + { + "epoch": 3.4738154613466334, + "grad_norm": 5.553424835205078, + "learning_rate": 1.6528179551122195e-05, + "loss": 0.3799, + "step": 13930 + }, + { + "epoch": 3.4763092269326683, + "grad_norm": 5.419945240020752, + "learning_rate": 1.6525685785536162e-05, + "loss": 0.4483, + "step": 13940 + }, + { + "epoch": 3.478802992518703, + "grad_norm": 5.477248191833496, + "learning_rate": 1.6523192019950126e-05, + "loss": 0.326, + "step": 13950 + }, + { + "epoch": 3.481296758104738, + "grad_norm": 7.121800899505615, + "learning_rate": 1.6520698254364093e-05, + "loss": 0.3673, + "step": 13960 + }, + { + "epoch": 3.483790523690773, + "grad_norm": 6.306591510772705, + "learning_rate": 1.6518204488778056e-05, + "loss": 0.4262, + "step": 13970 + }, + { + "epoch": 3.4862842892768082, + "grad_norm": 8.00887393951416, + "learning_rate": 1.6515710723192023e-05, + "loss": 0.4345, + "step": 13980 + }, + { + "epoch": 3.488778054862843, + "grad_norm": 5.474499702453613, + "learning_rate": 1.6513216957605987e-05, + "loss": 0.4994, + "step": 13990 + }, + { + "epoch": 3.491271820448878, + "grad_norm": 6.708861827850342, + "learning_rate": 1.651072319201995e-05, + "loss": 0.3884, + "step": 14000 + }, + { + "epoch": 3.493765586034913, + "grad_norm": 4.315401077270508, + "learning_rate": 1.6508229426433918e-05, + "loss": 0.3686, + "step": 14010 + }, + { + "epoch": 3.4962593516209477, + "grad_norm": 5.418401718139648, + "learning_rate": 1.650573566084788e-05, + "loss": 0.41, + "step": 14020 + }, + { + "epoch": 3.4987531172069826, + "grad_norm": 5.228391647338867, + "learning_rate": 1.6503241895261848e-05, + "loss": 0.4117, + "step": 14030 + }, + { + "epoch": 3.5012468827930174, + "grad_norm": 5.129116535186768, + "learning_rate": 1.6500748129675812e-05, + "loss": 0.3595, + "step": 14040 + }, + { + "epoch": 3.5037406483790523, + "grad_norm": 3.8561089038848877, + "learning_rate": 1.6498254364089775e-05, + "loss": 0.3515, + "step": 14050 + }, + { + "epoch": 3.506234413965087, + "grad_norm": 7.8990278244018555, + "learning_rate": 1.6495760598503742e-05, + "loss": 0.3853, + "step": 14060 + }, + { + "epoch": 3.508728179551122, + "grad_norm": 8.074807167053223, + "learning_rate": 1.6493266832917706e-05, + "loss": 0.4222, + "step": 14070 + }, + { + "epoch": 3.511221945137157, + "grad_norm": 7.334726333618164, + "learning_rate": 1.649077306733167e-05, + "loss": 0.4673, + "step": 14080 + }, + { + "epoch": 3.5137157107231918, + "grad_norm": 5.140750885009766, + "learning_rate": 1.6488279301745637e-05, + "loss": 0.3758, + "step": 14090 + }, + { + "epoch": 3.516209476309227, + "grad_norm": 5.326815605163574, + "learning_rate": 1.64857855361596e-05, + "loss": 0.3841, + "step": 14100 + }, + { + "epoch": 3.518703241895262, + "grad_norm": 4.542361259460449, + "learning_rate": 1.6483291770573567e-05, + "loss": 0.4115, + "step": 14110 + }, + { + "epoch": 3.521197007481297, + "grad_norm": 9.180794715881348, + "learning_rate": 1.6480798004987534e-05, + "loss": 0.3703, + "step": 14120 + }, + { + "epoch": 3.5236907730673317, + "grad_norm": 6.60845422744751, + "learning_rate": 1.6478304239401498e-05, + "loss": 0.396, + "step": 14130 + }, + { + "epoch": 3.5261845386533666, + "grad_norm": 13.184927940368652, + "learning_rate": 1.6475810473815465e-05, + "loss": 0.444, + "step": 14140 + }, + { + "epoch": 3.5286783042394014, + "grad_norm": 8.303811073303223, + "learning_rate": 1.647331670822943e-05, + "loss": 0.3708, + "step": 14150 + }, + { + "epoch": 3.5311720698254363, + "grad_norm": 7.675179958343506, + "learning_rate": 1.6470822942643392e-05, + "loss": 0.4134, + "step": 14160 + }, + { + "epoch": 3.533665835411471, + "grad_norm": 7.668274879455566, + "learning_rate": 1.646832917705736e-05, + "loss": 0.3957, + "step": 14170 + }, + { + "epoch": 3.5361596009975065, + "grad_norm": 19.888633728027344, + "learning_rate": 1.6465835411471323e-05, + "loss": 0.5228, + "step": 14180 + }, + { + "epoch": 3.5386533665835413, + "grad_norm": 6.9068474769592285, + "learning_rate": 1.646334164588529e-05, + "loss": 0.381, + "step": 14190 + }, + { + "epoch": 3.541147132169576, + "grad_norm": 9.011292457580566, + "learning_rate": 1.6460847880299253e-05, + "loss": 0.4341, + "step": 14200 + }, + { + "epoch": 3.543640897755611, + "grad_norm": 7.827251434326172, + "learning_rate": 1.6458354114713217e-05, + "loss": 0.4266, + "step": 14210 + }, + { + "epoch": 3.546134663341646, + "grad_norm": 7.010087966918945, + "learning_rate": 1.6455860349127184e-05, + "loss": 0.4038, + "step": 14220 + }, + { + "epoch": 3.548628428927681, + "grad_norm": 7.390041828155518, + "learning_rate": 1.6453366583541148e-05, + "loss": 0.4247, + "step": 14230 + }, + { + "epoch": 3.5511221945137157, + "grad_norm": 7.997394561767578, + "learning_rate": 1.645087281795511e-05, + "loss": 0.4505, + "step": 14240 + }, + { + "epoch": 3.5536159600997506, + "grad_norm": 7.814496040344238, + "learning_rate": 1.6448379052369078e-05, + "loss": 0.4635, + "step": 14250 + }, + { + "epoch": 3.5561097256857854, + "grad_norm": 4.237307548522949, + "learning_rate": 1.6445885286783042e-05, + "loss": 0.3517, + "step": 14260 + }, + { + "epoch": 3.5586034912718203, + "grad_norm": 5.595398426055908, + "learning_rate": 1.644339152119701e-05, + "loss": 0.3995, + "step": 14270 + }, + { + "epoch": 3.561097256857855, + "grad_norm": 6.063884258270264, + "learning_rate": 1.6440897755610972e-05, + "loss": 0.3919, + "step": 14280 + }, + { + "epoch": 3.56359102244389, + "grad_norm": 16.324724197387695, + "learning_rate": 1.643840399002494e-05, + "loss": 0.4988, + "step": 14290 + }, + { + "epoch": 3.5660847880299253, + "grad_norm": 5.327268600463867, + "learning_rate": 1.6435910224438903e-05, + "loss": 0.4688, + "step": 14300 + }, + { + "epoch": 3.56857855361596, + "grad_norm": 9.353160858154297, + "learning_rate": 1.643341645885287e-05, + "loss": 0.56, + "step": 14310 + }, + { + "epoch": 3.571072319201995, + "grad_norm": 4.720142364501953, + "learning_rate": 1.6430922693266834e-05, + "loss": 0.4159, + "step": 14320 + }, + { + "epoch": 3.57356608478803, + "grad_norm": 4.005600452423096, + "learning_rate": 1.64284289276808e-05, + "loss": 0.3677, + "step": 14330 + }, + { + "epoch": 3.576059850374065, + "grad_norm": 9.215933799743652, + "learning_rate": 1.6425935162094764e-05, + "loss": 0.432, + "step": 14340 + }, + { + "epoch": 3.5785536159600997, + "grad_norm": 4.696425437927246, + "learning_rate": 1.642344139650873e-05, + "loss": 0.3727, + "step": 14350 + }, + { + "epoch": 3.5810473815461346, + "grad_norm": 4.839138984680176, + "learning_rate": 1.6420947630922695e-05, + "loss": 0.4013, + "step": 14360 + }, + { + "epoch": 3.5835411471321694, + "grad_norm": 7.069340705871582, + "learning_rate": 1.641845386533666e-05, + "loss": 0.3633, + "step": 14370 + }, + { + "epoch": 3.5860349127182047, + "grad_norm": 5.56005334854126, + "learning_rate": 1.6415960099750626e-05, + "loss": 0.3376, + "step": 14380 + }, + { + "epoch": 3.5885286783042396, + "grad_norm": 5.112822532653809, + "learning_rate": 1.641346633416459e-05, + "loss": 0.4179, + "step": 14390 + }, + { + "epoch": 3.5910224438902745, + "grad_norm": 6.113282203674316, + "learning_rate": 1.6410972568578556e-05, + "loss": 0.4436, + "step": 14400 + }, + { + "epoch": 3.5935162094763093, + "grad_norm": 4.90158748626709, + "learning_rate": 1.640847880299252e-05, + "loss": 0.4283, + "step": 14410 + }, + { + "epoch": 3.596009975062344, + "grad_norm": 6.325465679168701, + "learning_rate": 1.6405985037406483e-05, + "loss": 0.4857, + "step": 14420 + }, + { + "epoch": 3.598503740648379, + "grad_norm": 8.354521751403809, + "learning_rate": 1.640349127182045e-05, + "loss": 0.4173, + "step": 14430 + }, + { + "epoch": 3.600997506234414, + "grad_norm": 5.772945880889893, + "learning_rate": 1.6400997506234414e-05, + "loss": 0.4367, + "step": 14440 + }, + { + "epoch": 3.603491271820449, + "grad_norm": 5.8793864250183105, + "learning_rate": 1.6398503740648378e-05, + "loss": 0.351, + "step": 14450 + }, + { + "epoch": 3.6059850374064837, + "grad_norm": 4.758648872375488, + "learning_rate": 1.6396009975062345e-05, + "loss": 0.4236, + "step": 14460 + }, + { + "epoch": 3.6084788029925186, + "grad_norm": 9.100720405578613, + "learning_rate": 1.639351620947631e-05, + "loss": 0.3892, + "step": 14470 + }, + { + "epoch": 3.6109725685785534, + "grad_norm": 5.36121129989624, + "learning_rate": 1.6391022443890275e-05, + "loss": 0.4256, + "step": 14480 + }, + { + "epoch": 3.6134663341645883, + "grad_norm": 6.1626505851745605, + "learning_rate": 1.6388528678304242e-05, + "loss": 0.4083, + "step": 14490 + }, + { + "epoch": 3.6159600997506236, + "grad_norm": 4.670628070831299, + "learning_rate": 1.6386034912718206e-05, + "loss": 0.4056, + "step": 14500 + }, + { + "epoch": 3.6184538653366585, + "grad_norm": 5.522136211395264, + "learning_rate": 1.6383541147132173e-05, + "loss": 0.4228, + "step": 14510 + }, + { + "epoch": 3.6209476309226933, + "grad_norm": 4.898807048797607, + "learning_rate": 1.6381047381546137e-05, + "loss": 0.3984, + "step": 14520 + }, + { + "epoch": 3.623441396508728, + "grad_norm": 6.272655963897705, + "learning_rate": 1.6378553615960104e-05, + "loss": 0.4146, + "step": 14530 + }, + { + "epoch": 3.625935162094763, + "grad_norm": 5.485193729400635, + "learning_rate": 1.6376059850374067e-05, + "loss": 0.3955, + "step": 14540 + }, + { + "epoch": 3.628428927680798, + "grad_norm": 5.901163578033447, + "learning_rate": 1.637356608478803e-05, + "loss": 0.4093, + "step": 14550 + }, + { + "epoch": 3.630922693266833, + "grad_norm": 7.169445991516113, + "learning_rate": 1.6371072319201998e-05, + "loss": 0.4289, + "step": 14560 + }, + { + "epoch": 3.6334164588528677, + "grad_norm": 7.727000713348389, + "learning_rate": 1.636857855361596e-05, + "loss": 0.4101, + "step": 14570 + }, + { + "epoch": 3.635910224438903, + "grad_norm": 5.395565509796143, + "learning_rate": 1.6366084788029925e-05, + "loss": 0.4015, + "step": 14580 + }, + { + "epoch": 3.638403990024938, + "grad_norm": 8.575839042663574, + "learning_rate": 1.6363591022443892e-05, + "loss": 0.4276, + "step": 14590 + }, + { + "epoch": 3.6408977556109727, + "grad_norm": 6.496227264404297, + "learning_rate": 1.6361097256857856e-05, + "loss": 0.3814, + "step": 14600 + }, + { + "epoch": 3.6433915211970076, + "grad_norm": 5.557234764099121, + "learning_rate": 1.6358603491271823e-05, + "loss": 0.3926, + "step": 14610 + }, + { + "epoch": 3.6458852867830425, + "grad_norm": 9.716835021972656, + "learning_rate": 1.6356109725685786e-05, + "loss": 0.3903, + "step": 14620 + }, + { + "epoch": 3.6483790523690773, + "grad_norm": 4.228976726531982, + "learning_rate": 1.635361596009975e-05, + "loss": 0.4135, + "step": 14630 + }, + { + "epoch": 3.650872817955112, + "grad_norm": 9.507262229919434, + "learning_rate": 1.6351122194513717e-05, + "loss": 0.4225, + "step": 14640 + }, + { + "epoch": 3.653366583541147, + "grad_norm": 8.070616722106934, + "learning_rate": 1.634862842892768e-05, + "loss": 0.3617, + "step": 14650 + }, + { + "epoch": 3.655860349127182, + "grad_norm": 7.822412490844727, + "learning_rate": 1.6346134663341647e-05, + "loss": 0.4306, + "step": 14660 + }, + { + "epoch": 3.658354114713217, + "grad_norm": 6.324841022491455, + "learning_rate": 1.634364089775561e-05, + "loss": 0.3662, + "step": 14670 + }, + { + "epoch": 3.6608478802992517, + "grad_norm": 4.228864669799805, + "learning_rate": 1.6341147132169578e-05, + "loss": 0.3822, + "step": 14680 + }, + { + "epoch": 3.6633416458852865, + "grad_norm": 6.306748867034912, + "learning_rate": 1.6338653366583542e-05, + "loss": 0.4123, + "step": 14690 + }, + { + "epoch": 3.665835411471322, + "grad_norm": 6.977389812469482, + "learning_rate": 1.633615960099751e-05, + "loss": 0.4305, + "step": 14700 + }, + { + "epoch": 3.6683291770573567, + "grad_norm": 5.803625583648682, + "learning_rate": 1.6333665835411472e-05, + "loss": 0.4697, + "step": 14710 + }, + { + "epoch": 3.6708229426433916, + "grad_norm": 5.12143087387085, + "learning_rate": 1.633117206982544e-05, + "loss": 0.3995, + "step": 14720 + }, + { + "epoch": 3.6733167082294265, + "grad_norm": 4.334492206573486, + "learning_rate": 1.6328678304239403e-05, + "loss": 0.4086, + "step": 14730 + }, + { + "epoch": 3.6758104738154613, + "grad_norm": 4.964009761810303, + "learning_rate": 1.6326184538653367e-05, + "loss": 0.3887, + "step": 14740 + }, + { + "epoch": 3.678304239401496, + "grad_norm": 5.872185707092285, + "learning_rate": 1.6323690773067334e-05, + "loss": 0.4069, + "step": 14750 + }, + { + "epoch": 3.680798004987531, + "grad_norm": 5.125484466552734, + "learning_rate": 1.6321197007481297e-05, + "loss": 0.4077, + "step": 14760 + }, + { + "epoch": 3.683291770573566, + "grad_norm": 6.067224502563477, + "learning_rate": 1.6318703241895264e-05, + "loss": 0.4064, + "step": 14770 + }, + { + "epoch": 3.6857855361596013, + "grad_norm": 5.0722455978393555, + "learning_rate": 1.6316209476309228e-05, + "loss": 0.3666, + "step": 14780 + }, + { + "epoch": 3.688279301745636, + "grad_norm": 8.505644798278809, + "learning_rate": 1.631371571072319e-05, + "loss": 0.4493, + "step": 14790 + }, + { + "epoch": 3.690773067331671, + "grad_norm": 4.582368850708008, + "learning_rate": 1.631122194513716e-05, + "loss": 0.4366, + "step": 14800 + }, + { + "epoch": 3.693266832917706, + "grad_norm": 8.571370124816895, + "learning_rate": 1.6308728179551122e-05, + "loss": 0.3407, + "step": 14810 + }, + { + "epoch": 3.6957605985037407, + "grad_norm": 4.467719554901123, + "learning_rate": 1.630623441396509e-05, + "loss": 0.4311, + "step": 14820 + }, + { + "epoch": 3.6982543640897756, + "grad_norm": 5.480850696563721, + "learning_rate": 1.6303740648379053e-05, + "loss": 0.3438, + "step": 14830 + }, + { + "epoch": 3.7007481296758105, + "grad_norm": 4.664899826049805, + "learning_rate": 1.630124688279302e-05, + "loss": 0.3888, + "step": 14840 + }, + { + "epoch": 3.7032418952618453, + "grad_norm": 5.578117370605469, + "learning_rate": 1.6298753117206983e-05, + "loss": 0.3614, + "step": 14850 + }, + { + "epoch": 3.70573566084788, + "grad_norm": 6.581024646759033, + "learning_rate": 1.629625935162095e-05, + "loss": 0.5117, + "step": 14860 + }, + { + "epoch": 3.708229426433915, + "grad_norm": 7.885854721069336, + "learning_rate": 1.6293765586034914e-05, + "loss": 0.3758, + "step": 14870 + }, + { + "epoch": 3.71072319201995, + "grad_norm": 6.261788368225098, + "learning_rate": 1.629127182044888e-05, + "loss": 0.4068, + "step": 14880 + }, + { + "epoch": 3.713216957605985, + "grad_norm": 5.979361057281494, + "learning_rate": 1.6288778054862845e-05, + "loss": 0.3442, + "step": 14890 + }, + { + "epoch": 3.71571072319202, + "grad_norm": 6.158860683441162, + "learning_rate": 1.628628428927681e-05, + "loss": 0.4162, + "step": 14900 + }, + { + "epoch": 3.718204488778055, + "grad_norm": 6.998167991638184, + "learning_rate": 1.6283790523690775e-05, + "loss": 0.4444, + "step": 14910 + }, + { + "epoch": 3.72069825436409, + "grad_norm": 4.483408451080322, + "learning_rate": 1.628129675810474e-05, + "loss": 0.4146, + "step": 14920 + }, + { + "epoch": 3.7231920199501247, + "grad_norm": 7.948692321777344, + "learning_rate": 1.6278802992518706e-05, + "loss": 0.4934, + "step": 14930 + }, + { + "epoch": 3.7256857855361596, + "grad_norm": 9.365506172180176, + "learning_rate": 1.627630922693267e-05, + "loss": 0.3799, + "step": 14940 + }, + { + "epoch": 3.7281795511221945, + "grad_norm": 4.377537727355957, + "learning_rate": 1.6273815461346633e-05, + "loss": 0.3687, + "step": 14950 + }, + { + "epoch": 3.7306733167082293, + "grad_norm": 14.354135513305664, + "learning_rate": 1.62713216957606e-05, + "loss": 0.4541, + "step": 14960 + }, + { + "epoch": 3.733167082294264, + "grad_norm": 5.008823871612549, + "learning_rate": 1.6268827930174564e-05, + "loss": 0.46, + "step": 14970 + }, + { + "epoch": 3.7356608478802995, + "grad_norm": 6.697189807891846, + "learning_rate": 1.626633416458853e-05, + "loss": 0.3777, + "step": 14980 + }, + { + "epoch": 3.7381546134663344, + "grad_norm": 6.032017707824707, + "learning_rate": 1.6263840399002494e-05, + "loss": 0.3748, + "step": 14990 + }, + { + "epoch": 3.7406483790523692, + "grad_norm": 5.817931652069092, + "learning_rate": 1.6261346633416458e-05, + "loss": 0.4145, + "step": 15000 + }, + { + "epoch": 3.743142144638404, + "grad_norm": 5.026968479156494, + "learning_rate": 1.6258852867830425e-05, + "loss": 0.3745, + "step": 15010 + }, + { + "epoch": 3.745635910224439, + "grad_norm": 4.957605361938477, + "learning_rate": 1.625635910224439e-05, + "loss": 0.427, + "step": 15020 + }, + { + "epoch": 3.748129675810474, + "grad_norm": 6.634207248687744, + "learning_rate": 1.6253865336658355e-05, + "loss": 0.3864, + "step": 15030 + }, + { + "epoch": 3.7506234413965087, + "grad_norm": 5.489914894104004, + "learning_rate": 1.625137157107232e-05, + "loss": 0.4067, + "step": 15040 + }, + { + "epoch": 3.7531172069825436, + "grad_norm": 4.863682270050049, + "learning_rate": 1.6248877805486286e-05, + "loss": 0.3701, + "step": 15050 + }, + { + "epoch": 3.7556109725685785, + "grad_norm": 5.958093643188477, + "learning_rate": 1.6246384039900253e-05, + "loss": 0.5015, + "step": 15060 + }, + { + "epoch": 3.7581047381546133, + "grad_norm": 6.1467084884643555, + "learning_rate": 1.6243890274314217e-05, + "loss": 0.3989, + "step": 15070 + }, + { + "epoch": 3.760598503740648, + "grad_norm": 5.685709476470947, + "learning_rate": 1.624139650872818e-05, + "loss": 0.4924, + "step": 15080 + }, + { + "epoch": 3.763092269326683, + "grad_norm": 5.435773849487305, + "learning_rate": 1.6238902743142147e-05, + "loss": 0.3388, + "step": 15090 + }, + { + "epoch": 3.765586034912718, + "grad_norm": 4.414902687072754, + "learning_rate": 1.623640897755611e-05, + "loss": 0.4236, + "step": 15100 + }, + { + "epoch": 3.7680798004987532, + "grad_norm": 4.539722919464111, + "learning_rate": 1.6233915211970078e-05, + "loss": 0.457, + "step": 15110 + }, + { + "epoch": 3.770573566084788, + "grad_norm": 7.175182342529297, + "learning_rate": 1.623142144638404e-05, + "loss": 0.3434, + "step": 15120 + }, + { + "epoch": 3.773067331670823, + "grad_norm": 7.918752670288086, + "learning_rate": 1.6228927680798005e-05, + "loss": 0.4374, + "step": 15130 + }, + { + "epoch": 3.775561097256858, + "grad_norm": 5.573683738708496, + "learning_rate": 1.6226433915211972e-05, + "loss": 0.4069, + "step": 15140 + }, + { + "epoch": 3.7780548628428927, + "grad_norm": 6.013986110687256, + "learning_rate": 1.6223940149625936e-05, + "loss": 0.372, + "step": 15150 + }, + { + "epoch": 3.7805486284289276, + "grad_norm": 7.387099742889404, + "learning_rate": 1.62214463840399e-05, + "loss": 0.4348, + "step": 15160 + }, + { + "epoch": 3.7830423940149625, + "grad_norm": 8.931721687316895, + "learning_rate": 1.6218952618453866e-05, + "loss": 0.4304, + "step": 15170 + }, + { + "epoch": 3.7855361596009978, + "grad_norm": 6.92478084564209, + "learning_rate": 1.621645885286783e-05, + "loss": 0.4486, + "step": 15180 + }, + { + "epoch": 3.7880299251870326, + "grad_norm": 5.480514049530029, + "learning_rate": 1.6213965087281797e-05, + "loss": 0.4168, + "step": 15190 + }, + { + "epoch": 3.7905236907730675, + "grad_norm": 6.616566181182861, + "learning_rate": 1.621147132169576e-05, + "loss": 0.3913, + "step": 15200 + }, + { + "epoch": 3.7930174563591024, + "grad_norm": 6.839771270751953, + "learning_rate": 1.6208977556109728e-05, + "loss": 0.4484, + "step": 15210 + }, + { + "epoch": 3.7955112219451372, + "grad_norm": 17.69823455810547, + "learning_rate": 1.620648379052369e-05, + "loss": 0.4469, + "step": 15220 + }, + { + "epoch": 3.798004987531172, + "grad_norm": 5.237950325012207, + "learning_rate": 1.6203990024937658e-05, + "loss": 0.4347, + "step": 15230 + }, + { + "epoch": 3.800498753117207, + "grad_norm": 11.648773193359375, + "learning_rate": 1.6201496259351622e-05, + "loss": 0.4735, + "step": 15240 + }, + { + "epoch": 3.802992518703242, + "grad_norm": 4.579110622406006, + "learning_rate": 1.619900249376559e-05, + "loss": 0.3873, + "step": 15250 + }, + { + "epoch": 3.8054862842892767, + "grad_norm": 5.944640636444092, + "learning_rate": 1.6196508728179553e-05, + "loss": 0.4153, + "step": 15260 + }, + { + "epoch": 3.8079800498753116, + "grad_norm": 6.9358696937561035, + "learning_rate": 1.619401496259352e-05, + "loss": 0.3617, + "step": 15270 + }, + { + "epoch": 3.8104738154613464, + "grad_norm": 6.954071521759033, + "learning_rate": 1.6191521197007483e-05, + "loss": 0.4271, + "step": 15280 + }, + { + "epoch": 3.8129675810473813, + "grad_norm": 5.045011520385742, + "learning_rate": 1.6189027431421447e-05, + "loss": 0.4202, + "step": 15290 + }, + { + "epoch": 3.815461346633416, + "grad_norm": 5.211480140686035, + "learning_rate": 1.6186533665835414e-05, + "loss": 0.4072, + "step": 15300 + }, + { + "epoch": 3.8179551122194515, + "grad_norm": 5.901054382324219, + "learning_rate": 1.6184039900249377e-05, + "loss": 0.3808, + "step": 15310 + }, + { + "epoch": 3.8204488778054864, + "grad_norm": 7.312894821166992, + "learning_rate": 1.6181546134663344e-05, + "loss": 0.4005, + "step": 15320 + }, + { + "epoch": 3.8229426433915212, + "grad_norm": 4.741137981414795, + "learning_rate": 1.6179052369077308e-05, + "loss": 0.4235, + "step": 15330 + }, + { + "epoch": 3.825436408977556, + "grad_norm": 14.889175415039062, + "learning_rate": 1.617655860349127e-05, + "loss": 0.5169, + "step": 15340 + }, + { + "epoch": 3.827930174563591, + "grad_norm": 6.416481018066406, + "learning_rate": 1.617406483790524e-05, + "loss": 0.4472, + "step": 15350 + }, + { + "epoch": 3.830423940149626, + "grad_norm": 4.6261701583862305, + "learning_rate": 1.6171571072319202e-05, + "loss": 0.4069, + "step": 15360 + }, + { + "epoch": 3.8329177057356607, + "grad_norm": 4.661010265350342, + "learning_rate": 1.6169077306733166e-05, + "loss": 0.4312, + "step": 15370 + }, + { + "epoch": 3.835411471321696, + "grad_norm": 5.187774658203125, + "learning_rate": 1.6166583541147133e-05, + "loss": 0.3947, + "step": 15380 + }, + { + "epoch": 3.837905236907731, + "grad_norm": 4.887078762054443, + "learning_rate": 1.6164089775561096e-05, + "loss": 0.3661, + "step": 15390 + }, + { + "epoch": 3.8403990024937658, + "grad_norm": 4.326541423797607, + "learning_rate": 1.6161596009975063e-05, + "loss": 0.4481, + "step": 15400 + }, + { + "epoch": 3.8428927680798006, + "grad_norm": 6.179752826690674, + "learning_rate": 1.615910224438903e-05, + "loss": 0.424, + "step": 15410 + }, + { + "epoch": 3.8453865336658355, + "grad_norm": 5.251521110534668, + "learning_rate": 1.6156608478802994e-05, + "loss": 0.4454, + "step": 15420 + }, + { + "epoch": 3.8478802992518704, + "grad_norm": 9.553689956665039, + "learning_rate": 1.615411471321696e-05, + "loss": 0.4171, + "step": 15430 + }, + { + "epoch": 3.8503740648379052, + "grad_norm": 5.729796409606934, + "learning_rate": 1.6151620947630925e-05, + "loss": 0.4229, + "step": 15440 + }, + { + "epoch": 3.85286783042394, + "grad_norm": 4.4921345710754395, + "learning_rate": 1.614912718204489e-05, + "loss": 0.3616, + "step": 15450 + }, + { + "epoch": 3.855361596009975, + "grad_norm": 5.6195502281188965, + "learning_rate": 1.6146633416458855e-05, + "loss": 0.3688, + "step": 15460 + }, + { + "epoch": 3.85785536159601, + "grad_norm": 6.036675930023193, + "learning_rate": 1.614413965087282e-05, + "loss": 0.379, + "step": 15470 + }, + { + "epoch": 3.8603491271820447, + "grad_norm": 6.581831455230713, + "learning_rate": 1.6141645885286786e-05, + "loss": 0.4892, + "step": 15480 + }, + { + "epoch": 3.8628428927680796, + "grad_norm": 9.268636703491211, + "learning_rate": 1.613915211970075e-05, + "loss": 0.4021, + "step": 15490 + }, + { + "epoch": 3.8653366583541144, + "grad_norm": 6.355315685272217, + "learning_rate": 1.6136658354114713e-05, + "loss": 0.4159, + "step": 15500 + }, + { + "epoch": 3.8678304239401498, + "grad_norm": 6.00802755355835, + "learning_rate": 1.613416458852868e-05, + "loss": 0.4285, + "step": 15510 + }, + { + "epoch": 3.8703241895261846, + "grad_norm": 5.517050743103027, + "learning_rate": 1.6131670822942644e-05, + "loss": 0.3094, + "step": 15520 + }, + { + "epoch": 3.8728179551122195, + "grad_norm": 5.958836078643799, + "learning_rate": 1.612917705735661e-05, + "loss": 0.5116, + "step": 15530 + }, + { + "epoch": 3.8753117206982544, + "grad_norm": 4.79234504699707, + "learning_rate": 1.6126683291770574e-05, + "loss": 0.3931, + "step": 15540 + }, + { + "epoch": 3.8778054862842892, + "grad_norm": 7.781925201416016, + "learning_rate": 1.6124189526184538e-05, + "loss": 0.4291, + "step": 15550 + }, + { + "epoch": 3.880299251870324, + "grad_norm": 6.349798679351807, + "learning_rate": 1.6121695760598505e-05, + "loss": 0.5231, + "step": 15560 + }, + { + "epoch": 3.882793017456359, + "grad_norm": 6.636193752288818, + "learning_rate": 1.611920199501247e-05, + "loss": 0.369, + "step": 15570 + }, + { + "epoch": 3.8852867830423943, + "grad_norm": 5.7847580909729, + "learning_rate": 1.6116708229426436e-05, + "loss": 0.4435, + "step": 15580 + }, + { + "epoch": 3.887780548628429, + "grad_norm": 6.807628154754639, + "learning_rate": 1.61142144638404e-05, + "loss": 0.4008, + "step": 15590 + }, + { + "epoch": 3.890274314214464, + "grad_norm": 8.10521125793457, + "learning_rate": 1.6111720698254366e-05, + "loss": 0.4364, + "step": 15600 + }, + { + "epoch": 3.892768079800499, + "grad_norm": 5.665589332580566, + "learning_rate": 1.610922693266833e-05, + "loss": 0.4302, + "step": 15610 + }, + { + "epoch": 3.8952618453865338, + "grad_norm": 6.3911943435668945, + "learning_rate": 1.6106733167082297e-05, + "loss": 0.4188, + "step": 15620 + }, + { + "epoch": 3.8977556109725686, + "grad_norm": 7.833658695220947, + "learning_rate": 1.610423940149626e-05, + "loss": 0.3424, + "step": 15630 + }, + { + "epoch": 3.9002493765586035, + "grad_norm": 5.939789772033691, + "learning_rate": 1.6101745635910228e-05, + "loss": 0.4221, + "step": 15640 + }, + { + "epoch": 3.9027431421446384, + "grad_norm": 6.181157112121582, + "learning_rate": 1.609925187032419e-05, + "loss": 0.4927, + "step": 15650 + }, + { + "epoch": 3.9052369077306732, + "grad_norm": 10.083945274353027, + "learning_rate": 1.6096758104738155e-05, + "loss": 0.4323, + "step": 15660 + }, + { + "epoch": 3.907730673316708, + "grad_norm": 7.5239129066467285, + "learning_rate": 1.6094264339152122e-05, + "loss": 0.4576, + "step": 15670 + }, + { + "epoch": 3.910224438902743, + "grad_norm": 7.4439311027526855, + "learning_rate": 1.6091770573566085e-05, + "loss": 0.373, + "step": 15680 + }, + { + "epoch": 3.912718204488778, + "grad_norm": 8.566959381103516, + "learning_rate": 1.6089276807980052e-05, + "loss": 0.4426, + "step": 15690 + }, + { + "epoch": 3.9152119700748127, + "grad_norm": 6.725310802459717, + "learning_rate": 1.6086783042394016e-05, + "loss": 0.4016, + "step": 15700 + }, + { + "epoch": 3.917705735660848, + "grad_norm": 7.9485392570495605, + "learning_rate": 1.608428927680798e-05, + "loss": 0.416, + "step": 15710 + }, + { + "epoch": 3.920199501246883, + "grad_norm": 7.6618170738220215, + "learning_rate": 1.6081795511221947e-05, + "loss": 0.3751, + "step": 15720 + }, + { + "epoch": 3.9226932668329177, + "grad_norm": 5.6373291015625, + "learning_rate": 1.607930174563591e-05, + "loss": 0.3605, + "step": 15730 + }, + { + "epoch": 3.9251870324189526, + "grad_norm": 4.732673168182373, + "learning_rate": 1.6076807980049874e-05, + "loss": 0.3484, + "step": 15740 + }, + { + "epoch": 3.9276807980049875, + "grad_norm": 6.598507404327393, + "learning_rate": 1.607431421446384e-05, + "loss": 0.381, + "step": 15750 + }, + { + "epoch": 3.9301745635910224, + "grad_norm": 4.610682010650635, + "learning_rate": 1.6071820448877808e-05, + "loss": 0.4001, + "step": 15760 + }, + { + "epoch": 3.932668329177057, + "grad_norm": 4.403365135192871, + "learning_rate": 1.606932668329177e-05, + "loss": 0.4298, + "step": 15770 + }, + { + "epoch": 3.9351620947630925, + "grad_norm": 5.305014133453369, + "learning_rate": 1.606683291770574e-05, + "loss": 0.361, + "step": 15780 + }, + { + "epoch": 3.9376558603491274, + "grad_norm": 7.3379998207092285, + "learning_rate": 1.6064339152119702e-05, + "loss": 0.4318, + "step": 15790 + }, + { + "epoch": 3.9401496259351623, + "grad_norm": 4.637630939483643, + "learning_rate": 1.606184538653367e-05, + "loss": 0.3809, + "step": 15800 + }, + { + "epoch": 3.942643391521197, + "grad_norm": 7.531750202178955, + "learning_rate": 1.6059351620947633e-05, + "loss": 0.3713, + "step": 15810 + }, + { + "epoch": 3.945137157107232, + "grad_norm": 5.172116756439209, + "learning_rate": 1.60568578553616e-05, + "loss": 0.5203, + "step": 15820 + }, + { + "epoch": 3.947630922693267, + "grad_norm": 4.275954723358154, + "learning_rate": 1.6054364089775563e-05, + "loss": 0.3671, + "step": 15830 + }, + { + "epoch": 3.9501246882793017, + "grad_norm": 9.31353759765625, + "learning_rate": 1.6051870324189527e-05, + "loss": 0.376, + "step": 15840 + }, + { + "epoch": 3.9526184538653366, + "grad_norm": 7.531866550445557, + "learning_rate": 1.6049376558603494e-05, + "loss": 0.3427, + "step": 15850 + }, + { + "epoch": 3.9551122194513715, + "grad_norm": 24.911481857299805, + "learning_rate": 1.6046882793017458e-05, + "loss": 0.3652, + "step": 15860 + }, + { + "epoch": 3.9576059850374063, + "grad_norm": 8.664227485656738, + "learning_rate": 1.604438902743142e-05, + "loss": 0.4131, + "step": 15870 + }, + { + "epoch": 3.960099750623441, + "grad_norm": 32.71113586425781, + "learning_rate": 1.6041895261845388e-05, + "loss": 0.3416, + "step": 15880 + }, + { + "epoch": 3.962593516209476, + "grad_norm": 5.102753162384033, + "learning_rate": 1.6039401496259352e-05, + "loss": 0.359, + "step": 15890 + }, + { + "epoch": 3.965087281795511, + "grad_norm": 4.086287021636963, + "learning_rate": 1.603690773067332e-05, + "loss": 0.3796, + "step": 15900 + }, + { + "epoch": 3.9675810473815463, + "grad_norm": 6.271583080291748, + "learning_rate": 1.6034413965087282e-05, + "loss": 0.4116, + "step": 15910 + }, + { + "epoch": 3.970074812967581, + "grad_norm": 6.0941877365112305, + "learning_rate": 1.6031920199501246e-05, + "loss": 0.4144, + "step": 15920 + }, + { + "epoch": 3.972568578553616, + "grad_norm": 6.220065593719482, + "learning_rate": 1.6029426433915213e-05, + "loss": 0.3997, + "step": 15930 + }, + { + "epoch": 3.975062344139651, + "grad_norm": 6.358421802520752, + "learning_rate": 1.6026932668329177e-05, + "loss": 0.415, + "step": 15940 + }, + { + "epoch": 3.9775561097256857, + "grad_norm": 5.868401050567627, + "learning_rate": 1.6024438902743144e-05, + "loss": 0.4425, + "step": 15950 + }, + { + "epoch": 3.9800498753117206, + "grad_norm": 4.294557571411133, + "learning_rate": 1.6021945137157107e-05, + "loss": 0.3949, + "step": 15960 + }, + { + "epoch": 3.9825436408977555, + "grad_norm": 8.493629455566406, + "learning_rate": 1.6019451371571074e-05, + "loss": 0.4398, + "step": 15970 + }, + { + "epoch": 3.985037406483791, + "grad_norm": 7.112602233886719, + "learning_rate": 1.6016957605985038e-05, + "loss": 0.4643, + "step": 15980 + }, + { + "epoch": 3.9875311720698257, + "grad_norm": 11.970419883728027, + "learning_rate": 1.6014463840399005e-05, + "loss": 0.3528, + "step": 15990 + }, + { + "epoch": 3.9900249376558605, + "grad_norm": 6.464308738708496, + "learning_rate": 1.601197007481297e-05, + "loss": 0.393, + "step": 16000 + }, + { + "epoch": 3.9925187032418954, + "grad_norm": 11.118416786193848, + "learning_rate": 1.6009476309226936e-05, + "loss": 0.3788, + "step": 16010 + }, + { + "epoch": 3.9950124688279303, + "grad_norm": 9.291110038757324, + "learning_rate": 1.60069825436409e-05, + "loss": 0.4067, + "step": 16020 + }, + { + "epoch": 3.997506234413965, + "grad_norm": 2.8626253604888916, + "learning_rate": 1.6004488778054866e-05, + "loss": 0.3647, + "step": 16030 + }, + { + "epoch": 4.0, + "grad_norm": 4.152261734008789, + "learning_rate": 1.600199501246883e-05, + "loss": 0.3658, + "step": 16040 + }, + { + "epoch": 4.0, + "eval_loss": 0.4268016815185547, + "eval_runtime": 59.9163, + "eval_samples_per_second": 16.74, + "eval_steps_per_second": 16.74, + "step": 16040 + }, + { + "epoch": 4.002493765586035, + "grad_norm": 7.153280735015869, + "learning_rate": 1.5999501246882793e-05, + "loss": 0.417, + "step": 16050 + }, + { + "epoch": 4.00498753117207, + "grad_norm": 6.5900163650512695, + "learning_rate": 1.599700748129676e-05, + "loss": 0.3892, + "step": 16060 + }, + { + "epoch": 4.007481296758105, + "grad_norm": 4.667524337768555, + "learning_rate": 1.5994513715710724e-05, + "loss": 0.4389, + "step": 16070 + }, + { + "epoch": 4.0099750623441395, + "grad_norm": 4.793924808502197, + "learning_rate": 1.5992019950124688e-05, + "loss": 0.4087, + "step": 16080 + }, + { + "epoch": 4.012468827930174, + "grad_norm": 4.576432228088379, + "learning_rate": 1.5989526184538655e-05, + "loss": 0.4334, + "step": 16090 + }, + { + "epoch": 4.014962593516209, + "grad_norm": 6.454722881317139, + "learning_rate": 1.5987032418952618e-05, + "loss": 0.3957, + "step": 16100 + }, + { + "epoch": 4.017456359102244, + "grad_norm": 5.114348411560059, + "learning_rate": 1.5984538653366585e-05, + "loss": 0.3999, + "step": 16110 + }, + { + "epoch": 4.019950124688279, + "grad_norm": 3.897921085357666, + "learning_rate": 1.598204488778055e-05, + "loss": 0.3734, + "step": 16120 + }, + { + "epoch": 4.022443890274314, + "grad_norm": 6.09066915512085, + "learning_rate": 1.5979551122194516e-05, + "loss": 0.339, + "step": 16130 + }, + { + "epoch": 4.024937655860349, + "grad_norm": 7.583807945251465, + "learning_rate": 1.597705735660848e-05, + "loss": 0.3507, + "step": 16140 + }, + { + "epoch": 4.027431421446384, + "grad_norm": 10.415247917175293, + "learning_rate": 1.5974563591022446e-05, + "loss": 0.4775, + "step": 16150 + }, + { + "epoch": 4.029925187032419, + "grad_norm": 8.367255210876465, + "learning_rate": 1.597206982543641e-05, + "loss": 0.4167, + "step": 16160 + }, + { + "epoch": 4.032418952618454, + "grad_norm": 4.611159801483154, + "learning_rate": 1.5969576059850377e-05, + "loss": 0.4007, + "step": 16170 + }, + { + "epoch": 4.034912718204489, + "grad_norm": 4.2518205642700195, + "learning_rate": 1.596708229426434e-05, + "loss": 0.3998, + "step": 16180 + }, + { + "epoch": 4.037406483790524, + "grad_norm": 5.522350788116455, + "learning_rate": 1.5964588528678308e-05, + "loss": 0.4691, + "step": 16190 + }, + { + "epoch": 4.039900249376559, + "grad_norm": 4.40548849105835, + "learning_rate": 1.596209476309227e-05, + "loss": 0.4, + "step": 16200 + }, + { + "epoch": 4.042394014962594, + "grad_norm": 6.4272141456604, + "learning_rate": 1.5959600997506235e-05, + "loss": 0.4412, + "step": 16210 + }, + { + "epoch": 4.0448877805486285, + "grad_norm": 5.885697364807129, + "learning_rate": 1.5957107231920202e-05, + "loss": 0.3978, + "step": 16220 + }, + { + "epoch": 4.047381546134663, + "grad_norm": 6.327160835266113, + "learning_rate": 1.5954613466334166e-05, + "loss": 0.4464, + "step": 16230 + }, + { + "epoch": 4.049875311720698, + "grad_norm": 6.883283615112305, + "learning_rate": 1.595211970074813e-05, + "loss": 0.4137, + "step": 16240 + }, + { + "epoch": 4.052369077306733, + "grad_norm": 6.365976810455322, + "learning_rate": 1.5949625935162096e-05, + "loss": 0.4346, + "step": 16250 + }, + { + "epoch": 4.054862842892768, + "grad_norm": 5.095077037811279, + "learning_rate": 1.594713216957606e-05, + "loss": 0.3615, + "step": 16260 + }, + { + "epoch": 4.057356608478803, + "grad_norm": 5.573093414306641, + "learning_rate": 1.5944638403990027e-05, + "loss": 0.3894, + "step": 16270 + }, + { + "epoch": 4.059850374064838, + "grad_norm": 7.349832057952881, + "learning_rate": 1.594214463840399e-05, + "loss": 0.405, + "step": 16280 + }, + { + "epoch": 4.062344139650873, + "grad_norm": 7.18329381942749, + "learning_rate": 1.5939650872817954e-05, + "loss": 0.3667, + "step": 16290 + }, + { + "epoch": 4.0648379052369075, + "grad_norm": 5.264151096343994, + "learning_rate": 1.593715710723192e-05, + "loss": 0.4239, + "step": 16300 + }, + { + "epoch": 4.067331670822942, + "grad_norm": 5.923305511474609, + "learning_rate": 1.5934663341645885e-05, + "loss": 0.4125, + "step": 16310 + }, + { + "epoch": 4.069825436408977, + "grad_norm": 7.834733486175537, + "learning_rate": 1.593216957605985e-05, + "loss": 0.4075, + "step": 16320 + }, + { + "epoch": 4.072319201995012, + "grad_norm": 6.473409175872803, + "learning_rate": 1.592967581047382e-05, + "loss": 0.3263, + "step": 16330 + }, + { + "epoch": 4.074812967581048, + "grad_norm": 4.875690460205078, + "learning_rate": 1.5927182044887782e-05, + "loss": 0.3769, + "step": 16340 + }, + { + "epoch": 4.077306733167083, + "grad_norm": 6.263331890106201, + "learning_rate": 1.592468827930175e-05, + "loss": 0.3858, + "step": 16350 + }, + { + "epoch": 4.079800498753118, + "grad_norm": 6.207515716552734, + "learning_rate": 1.5922194513715713e-05, + "loss": 0.5084, + "step": 16360 + }, + { + "epoch": 4.082294264339152, + "grad_norm": 3.502013683319092, + "learning_rate": 1.5919700748129676e-05, + "loss": 0.3862, + "step": 16370 + }, + { + "epoch": 4.084788029925187, + "grad_norm": 7.68639612197876, + "learning_rate": 1.5917206982543643e-05, + "loss": 0.4036, + "step": 16380 + }, + { + "epoch": 4.087281795511222, + "grad_norm": 7.196399688720703, + "learning_rate": 1.5914713216957607e-05, + "loss": 0.3566, + "step": 16390 + }, + { + "epoch": 4.089775561097257, + "grad_norm": 6.002182960510254, + "learning_rate": 1.5912219451371574e-05, + "loss": 0.3955, + "step": 16400 + }, + { + "epoch": 4.092269326683292, + "grad_norm": 8.02131462097168, + "learning_rate": 1.5909725685785538e-05, + "loss": 0.3704, + "step": 16410 + }, + { + "epoch": 4.094763092269327, + "grad_norm": 5.517106533050537, + "learning_rate": 1.59072319201995e-05, + "loss": 0.4051, + "step": 16420 + }, + { + "epoch": 4.097256857855362, + "grad_norm": 9.180191040039062, + "learning_rate": 1.590473815461347e-05, + "loss": 0.3857, + "step": 16430 + }, + { + "epoch": 4.0997506234413965, + "grad_norm": 6.0054030418396, + "learning_rate": 1.5902244389027432e-05, + "loss": 0.3396, + "step": 16440 + }, + { + "epoch": 4.102244389027431, + "grad_norm": 5.941941261291504, + "learning_rate": 1.5899750623441396e-05, + "loss": 0.373, + "step": 16450 + }, + { + "epoch": 4.104738154613466, + "grad_norm": 6.833188056945801, + "learning_rate": 1.5897256857855363e-05, + "loss": 0.3765, + "step": 16460 + }, + { + "epoch": 4.107231920199501, + "grad_norm": 5.7840986251831055, + "learning_rate": 1.5894763092269326e-05, + "loss": 0.396, + "step": 16470 + }, + { + "epoch": 4.109725685785536, + "grad_norm": 7.854373931884766, + "learning_rate": 1.5892269326683293e-05, + "loss": 0.3982, + "step": 16480 + }, + { + "epoch": 4.112219451371571, + "grad_norm": 3.8397293090820312, + "learning_rate": 1.5889775561097257e-05, + "loss": 0.3874, + "step": 16490 + }, + { + "epoch": 4.114713216957606, + "grad_norm": 5.715033054351807, + "learning_rate": 1.5887281795511224e-05, + "loss": 0.3209, + "step": 16500 + }, + { + "epoch": 4.117206982543641, + "grad_norm": 7.231847286224365, + "learning_rate": 1.5884788029925187e-05, + "loss": 0.4834, + "step": 16510 + }, + { + "epoch": 4.1197007481296755, + "grad_norm": 4.733031272888184, + "learning_rate": 1.5882294264339154e-05, + "loss": 0.2822, + "step": 16520 + }, + { + "epoch": 4.12219451371571, + "grad_norm": 5.9622273445129395, + "learning_rate": 1.5879800498753118e-05, + "loss": 0.413, + "step": 16530 + }, + { + "epoch": 4.124688279301745, + "grad_norm": 5.854028701782227, + "learning_rate": 1.5877306733167085e-05, + "loss": 0.3331, + "step": 16540 + }, + { + "epoch": 4.127182044887781, + "grad_norm": 8.097307205200195, + "learning_rate": 1.587481296758105e-05, + "loss": 0.5102, + "step": 16550 + }, + { + "epoch": 4.129675810473816, + "grad_norm": 4.774763584136963, + "learning_rate": 1.5872319201995016e-05, + "loss": 0.3981, + "step": 16560 + }, + { + "epoch": 4.132169576059851, + "grad_norm": 6.483943939208984, + "learning_rate": 1.586982543640898e-05, + "loss": 0.4533, + "step": 16570 + }, + { + "epoch": 4.134663341645886, + "grad_norm": 10.268522262573242, + "learning_rate": 1.5867331670822943e-05, + "loss": 0.3954, + "step": 16580 + }, + { + "epoch": 4.13715710723192, + "grad_norm": 4.830404281616211, + "learning_rate": 1.586483790523691e-05, + "loss": 0.3578, + "step": 16590 + }, + { + "epoch": 4.139650872817955, + "grad_norm": 3.815784215927124, + "learning_rate": 1.5862344139650874e-05, + "loss": 0.3744, + "step": 16600 + }, + { + "epoch": 4.14214463840399, + "grad_norm": 5.319235324859619, + "learning_rate": 1.585985037406484e-05, + "loss": 0.3813, + "step": 16610 + }, + { + "epoch": 4.144638403990025, + "grad_norm": 7.98597526550293, + "learning_rate": 1.5857356608478804e-05, + "loss": 0.3337, + "step": 16620 + }, + { + "epoch": 4.14713216957606, + "grad_norm": 5.068573951721191, + "learning_rate": 1.5854862842892768e-05, + "loss": 0.4009, + "step": 16630 + }, + { + "epoch": 4.149625935162095, + "grad_norm": 4.63468599319458, + "learning_rate": 1.5852369077306735e-05, + "loss": 0.3888, + "step": 16640 + }, + { + "epoch": 4.15211970074813, + "grad_norm": 4.852334976196289, + "learning_rate": 1.58498753117207e-05, + "loss": 0.3634, + "step": 16650 + }, + { + "epoch": 4.1546134663341645, + "grad_norm": 10.55074691772461, + "learning_rate": 1.5847381546134662e-05, + "loss": 0.4119, + "step": 16660 + }, + { + "epoch": 4.157107231920199, + "grad_norm": 5.653322219848633, + "learning_rate": 1.584488778054863e-05, + "loss": 0.3409, + "step": 16670 + }, + { + "epoch": 4.159600997506234, + "grad_norm": 7.006217002868652, + "learning_rate": 1.5842394014962596e-05, + "loss": 0.3994, + "step": 16680 + }, + { + "epoch": 4.162094763092269, + "grad_norm": 6.214487552642822, + "learning_rate": 1.583990024937656e-05, + "loss": 0.4093, + "step": 16690 + }, + { + "epoch": 4.164588528678304, + "grad_norm": 9.91429615020752, + "learning_rate": 1.5837406483790527e-05, + "loss": 0.4333, + "step": 16700 + }, + { + "epoch": 4.167082294264339, + "grad_norm": 5.225426197052002, + "learning_rate": 1.583491271820449e-05, + "loss": 0.418, + "step": 16710 + }, + { + "epoch": 4.169576059850374, + "grad_norm": 4.0191969871521, + "learning_rate": 1.5832418952618457e-05, + "loss": 0.3455, + "step": 16720 + }, + { + "epoch": 4.172069825436409, + "grad_norm": 4.602033615112305, + "learning_rate": 1.582992518703242e-05, + "loss": 0.3552, + "step": 16730 + }, + { + "epoch": 4.174563591022444, + "grad_norm": 4.991748332977295, + "learning_rate": 1.5827431421446384e-05, + "loss": 0.4175, + "step": 16740 + }, + { + "epoch": 4.177057356608479, + "grad_norm": 6.951719284057617, + "learning_rate": 1.582493765586035e-05, + "loss": 0.4059, + "step": 16750 + }, + { + "epoch": 4.179551122194514, + "grad_norm": 5.848910808563232, + "learning_rate": 1.5822443890274315e-05, + "loss": 0.3944, + "step": 16760 + }, + { + "epoch": 4.182044887780549, + "grad_norm": 29.50406837463379, + "learning_rate": 1.5819950124688282e-05, + "loss": 0.4481, + "step": 16770 + }, + { + "epoch": 4.184538653366584, + "grad_norm": 8.905052185058594, + "learning_rate": 1.5817456359102246e-05, + "loss": 0.3706, + "step": 16780 + }, + { + "epoch": 4.187032418952619, + "grad_norm": 4.365262031555176, + "learning_rate": 1.581496259351621e-05, + "loss": 0.4487, + "step": 16790 + }, + { + "epoch": 4.1895261845386536, + "grad_norm": 6.916237831115723, + "learning_rate": 1.5812468827930176e-05, + "loss": 0.4587, + "step": 16800 + }, + { + "epoch": 4.192019950124688, + "grad_norm": 5.828021049499512, + "learning_rate": 1.580997506234414e-05, + "loss": 0.3274, + "step": 16810 + }, + { + "epoch": 4.194513715710723, + "grad_norm": 5.9600138664245605, + "learning_rate": 1.5807481296758107e-05, + "loss": 0.4037, + "step": 16820 + }, + { + "epoch": 4.197007481296758, + "grad_norm": 6.082637310028076, + "learning_rate": 1.580498753117207e-05, + "loss": 0.4237, + "step": 16830 + }, + { + "epoch": 4.199501246882793, + "grad_norm": 4.901458740234375, + "learning_rate": 1.5802493765586034e-05, + "loss": 0.4596, + "step": 16840 + }, + { + "epoch": 4.201995012468828, + "grad_norm": 6.101640701293945, + "learning_rate": 1.58e-05, + "loss": 0.3973, + "step": 16850 + }, + { + "epoch": 4.204488778054863, + "grad_norm": 6.996503829956055, + "learning_rate": 1.5797506234413965e-05, + "loss": 0.3572, + "step": 16860 + }, + { + "epoch": 4.206982543640898, + "grad_norm": 7.564293384552002, + "learning_rate": 1.5795012468827932e-05, + "loss": 0.3673, + "step": 16870 + }, + { + "epoch": 4.2094763092269325, + "grad_norm": 5.015920162200928, + "learning_rate": 1.5792518703241895e-05, + "loss": 0.3798, + "step": 16880 + }, + { + "epoch": 4.211970074812967, + "grad_norm": 9.611790657043457, + "learning_rate": 1.5790024937655862e-05, + "loss": 0.3374, + "step": 16890 + }, + { + "epoch": 4.214463840399002, + "grad_norm": 8.28926944732666, + "learning_rate": 1.5787531172069826e-05, + "loss": 0.3995, + "step": 16900 + }, + { + "epoch": 4.216957605985037, + "grad_norm": 7.903942108154297, + "learning_rate": 1.5785037406483793e-05, + "loss": 0.4166, + "step": 16910 + }, + { + "epoch": 4.219451371571072, + "grad_norm": 6.265456199645996, + "learning_rate": 1.5782543640897757e-05, + "loss": 0.4312, + "step": 16920 + }, + { + "epoch": 4.221945137157107, + "grad_norm": 7.296082019805908, + "learning_rate": 1.5780049875311724e-05, + "loss": 0.3976, + "step": 16930 + }, + { + "epoch": 4.224438902743142, + "grad_norm": 5.318020820617676, + "learning_rate": 1.5777556109725687e-05, + "loss": 0.3702, + "step": 16940 + }, + { + "epoch": 4.2269326683291775, + "grad_norm": 7.501054763793945, + "learning_rate": 1.577506234413965e-05, + "loss": 0.4675, + "step": 16950 + }, + { + "epoch": 4.229426433915212, + "grad_norm": 6.426706790924072, + "learning_rate": 1.5772568578553618e-05, + "loss": 0.3556, + "step": 16960 + }, + { + "epoch": 4.231920199501247, + "grad_norm": 8.110666275024414, + "learning_rate": 1.577007481296758e-05, + "loss": 0.4167, + "step": 16970 + }, + { + "epoch": 4.234413965087282, + "grad_norm": 6.1675238609313965, + "learning_rate": 1.576758104738155e-05, + "loss": 0.3816, + "step": 16980 + }, + { + "epoch": 4.236907730673317, + "grad_norm": 5.011690616607666, + "learning_rate": 1.5765087281795512e-05, + "loss": 0.3866, + "step": 16990 + }, + { + "epoch": 4.239401496259352, + "grad_norm": 6.697452545166016, + "learning_rate": 1.5762593516209476e-05, + "loss": 0.5945, + "step": 17000 + }, + { + "epoch": 4.241895261845387, + "grad_norm": 11.016718864440918, + "learning_rate": 1.5760099750623443e-05, + "loss": 0.3831, + "step": 17010 + }, + { + "epoch": 4.2443890274314215, + "grad_norm": 7.073511123657227, + "learning_rate": 1.5757605985037406e-05, + "loss": 0.4155, + "step": 17020 + }, + { + "epoch": 4.246882793017456, + "grad_norm": 6.227389812469482, + "learning_rate": 1.5755112219451373e-05, + "loss": 0.3701, + "step": 17030 + }, + { + "epoch": 4.249376558603491, + "grad_norm": 5.904865264892578, + "learning_rate": 1.5752618453865337e-05, + "loss": 0.4464, + "step": 17040 + }, + { + "epoch": 4.251870324189526, + "grad_norm": 6.812474250793457, + "learning_rate": 1.5750124688279304e-05, + "loss": 0.399, + "step": 17050 + }, + { + "epoch": 4.254364089775561, + "grad_norm": 6.779056072235107, + "learning_rate": 1.5747630922693268e-05, + "loss": 0.3984, + "step": 17060 + }, + { + "epoch": 4.256857855361596, + "grad_norm": 5.1520891189575195, + "learning_rate": 1.5745137157107235e-05, + "loss": 0.4212, + "step": 17070 + }, + { + "epoch": 4.259351620947631, + "grad_norm": 10.970466613769531, + "learning_rate": 1.5742643391521198e-05, + "loss": 0.3745, + "step": 17080 + }, + { + "epoch": 4.261845386533666, + "grad_norm": 5.555710792541504, + "learning_rate": 1.5740149625935165e-05, + "loss": 0.4095, + "step": 17090 + }, + { + "epoch": 4.2643391521197005, + "grad_norm": 5.996181011199951, + "learning_rate": 1.573765586034913e-05, + "loss": 0.3816, + "step": 17100 + }, + { + "epoch": 4.266832917705735, + "grad_norm": 5.2007646560668945, + "learning_rate": 1.5735162094763096e-05, + "loss": 0.4572, + "step": 17110 + }, + { + "epoch": 4.26932668329177, + "grad_norm": 6.208393096923828, + "learning_rate": 1.573266832917706e-05, + "loss": 0.3948, + "step": 17120 + }, + { + "epoch": 4.271820448877805, + "grad_norm": 6.411615371704102, + "learning_rate": 1.5730174563591023e-05, + "loss": 0.4109, + "step": 17130 + }, + { + "epoch": 4.274314214463841, + "grad_norm": 7.031231880187988, + "learning_rate": 1.572768079800499e-05, + "loss": 0.3633, + "step": 17140 + }, + { + "epoch": 4.276807980049876, + "grad_norm": 7.267471790313721, + "learning_rate": 1.5725187032418954e-05, + "loss": 0.3945, + "step": 17150 + }, + { + "epoch": 4.279301745635911, + "grad_norm": 6.009880065917969, + "learning_rate": 1.5722693266832917e-05, + "loss": 0.3755, + "step": 17160 + }, + { + "epoch": 4.2817955112219455, + "grad_norm": 6.955780029296875, + "learning_rate": 1.5720199501246884e-05, + "loss": 0.4382, + "step": 17170 + }, + { + "epoch": 4.28428927680798, + "grad_norm": 4.450560569763184, + "learning_rate": 1.5717705735660848e-05, + "loss": 0.4039, + "step": 17180 + }, + { + "epoch": 4.286783042394015, + "grad_norm": 7.145969390869141, + "learning_rate": 1.5715211970074815e-05, + "loss": 0.4875, + "step": 17190 + }, + { + "epoch": 4.28927680798005, + "grad_norm": 5.937014102935791, + "learning_rate": 1.571271820448878e-05, + "loss": 0.3465, + "step": 17200 + }, + { + "epoch": 4.291770573566085, + "grad_norm": 6.380857944488525, + "learning_rate": 1.5710224438902742e-05, + "loss": 0.4347, + "step": 17210 + }, + { + "epoch": 4.29426433915212, + "grad_norm": 7.357115268707275, + "learning_rate": 1.570773067331671e-05, + "loss": 0.3523, + "step": 17220 + }, + { + "epoch": 4.296758104738155, + "grad_norm": 6.485621929168701, + "learning_rate": 1.5705236907730673e-05, + "loss": 0.3737, + "step": 17230 + }, + { + "epoch": 4.2992518703241895, + "grad_norm": 5.544325828552246, + "learning_rate": 1.570274314214464e-05, + "loss": 0.4713, + "step": 17240 + }, + { + "epoch": 4.301745635910224, + "grad_norm": 4.941150188446045, + "learning_rate": 1.5700249376558603e-05, + "loss": 0.4139, + "step": 17250 + }, + { + "epoch": 4.304239401496259, + "grad_norm": 5.890329837799072, + "learning_rate": 1.569775561097257e-05, + "loss": 0.4586, + "step": 17260 + }, + { + "epoch": 4.306733167082294, + "grad_norm": 4.7308669090271, + "learning_rate": 1.5695261845386537e-05, + "loss": 0.3211, + "step": 17270 + }, + { + "epoch": 4.309226932668329, + "grad_norm": 6.511252403259277, + "learning_rate": 1.56927680798005e-05, + "loss": 0.3643, + "step": 17280 + }, + { + "epoch": 4.311720698254364, + "grad_norm": 6.328287124633789, + "learning_rate": 1.5690274314214465e-05, + "loss": 0.377, + "step": 17290 + }, + { + "epoch": 4.314214463840399, + "grad_norm": 12.119094848632812, + "learning_rate": 1.568778054862843e-05, + "loss": 0.3672, + "step": 17300 + }, + { + "epoch": 4.316708229426434, + "grad_norm": 5.892948627471924, + "learning_rate": 1.5685286783042395e-05, + "loss": 0.3174, + "step": 17310 + }, + { + "epoch": 4.3192019950124685, + "grad_norm": 6.068002700805664, + "learning_rate": 1.5682793017456362e-05, + "loss": 0.4032, + "step": 17320 + }, + { + "epoch": 4.321695760598503, + "grad_norm": 4.890083312988281, + "learning_rate": 1.5680299251870326e-05, + "loss": 0.422, + "step": 17330 + }, + { + "epoch": 4.324189526184538, + "grad_norm": 4.682448863983154, + "learning_rate": 1.567780548628429e-05, + "loss": 0.3798, + "step": 17340 + }, + { + "epoch": 4.326683291770574, + "grad_norm": 9.495527267456055, + "learning_rate": 1.5675311720698257e-05, + "loss": 0.4184, + "step": 17350 + }, + { + "epoch": 4.329177057356609, + "grad_norm": 6.011453151702881, + "learning_rate": 1.567281795511222e-05, + "loss": 0.3809, + "step": 17360 + }, + { + "epoch": 4.331670822942644, + "grad_norm": 8.991615295410156, + "learning_rate": 1.5670324189526184e-05, + "loss": 0.3961, + "step": 17370 + }, + { + "epoch": 4.334164588528679, + "grad_norm": 7.84186315536499, + "learning_rate": 1.566783042394015e-05, + "loss": 0.4104, + "step": 17380 + }, + { + "epoch": 4.3366583541147135, + "grad_norm": 4.632195949554443, + "learning_rate": 1.5665336658354114e-05, + "loss": 0.3265, + "step": 17390 + }, + { + "epoch": 4.339152119700748, + "grad_norm": 7.049432754516602, + "learning_rate": 1.566284289276808e-05, + "loss": 0.3176, + "step": 17400 + }, + { + "epoch": 4.341645885286783, + "grad_norm": 5.684892177581787, + "learning_rate": 1.5660349127182045e-05, + "loss": 0.3997, + "step": 17410 + }, + { + "epoch": 4.344139650872818, + "grad_norm": 7.792070388793945, + "learning_rate": 1.5657855361596012e-05, + "loss": 0.4101, + "step": 17420 + }, + { + "epoch": 4.346633416458853, + "grad_norm": 6.015437126159668, + "learning_rate": 1.5655361596009976e-05, + "loss": 0.4351, + "step": 17430 + }, + { + "epoch": 4.349127182044888, + "grad_norm": 5.80635404586792, + "learning_rate": 1.5652867830423943e-05, + "loss": 0.3938, + "step": 17440 + }, + { + "epoch": 4.351620947630923, + "grad_norm": 6.462253570556641, + "learning_rate": 1.5650374064837906e-05, + "loss": 0.4272, + "step": 17450 + }, + { + "epoch": 4.3541147132169575, + "grad_norm": 5.280763149261475, + "learning_rate": 1.5647880299251873e-05, + "loss": 0.3625, + "step": 17460 + }, + { + "epoch": 4.356608478802992, + "grad_norm": 6.560273170471191, + "learning_rate": 1.5645386533665837e-05, + "loss": 0.403, + "step": 17470 + }, + { + "epoch": 4.359102244389027, + "grad_norm": 6.191250324249268, + "learning_rate": 1.5642892768079804e-05, + "loss": 0.3826, + "step": 17480 + }, + { + "epoch": 4.361596009975062, + "grad_norm": 7.668184280395508, + "learning_rate": 1.5640399002493767e-05, + "loss": 0.3455, + "step": 17490 + }, + { + "epoch": 4.364089775561097, + "grad_norm": 6.343785285949707, + "learning_rate": 1.563790523690773e-05, + "loss": 0.3462, + "step": 17500 + }, + { + "epoch": 4.366583541147132, + "grad_norm": 7.69766092300415, + "learning_rate": 1.5635411471321698e-05, + "loss": 0.4203, + "step": 17510 + }, + { + "epoch": 4.369077306733167, + "grad_norm": 5.314762115478516, + "learning_rate": 1.5632917705735662e-05, + "loss": 0.4044, + "step": 17520 + }, + { + "epoch": 4.371571072319202, + "grad_norm": 4.914552688598633, + "learning_rate": 1.563042394014963e-05, + "loss": 0.3755, + "step": 17530 + }, + { + "epoch": 4.374064837905237, + "grad_norm": 5.039554119110107, + "learning_rate": 1.5627930174563592e-05, + "loss": 0.3815, + "step": 17540 + }, + { + "epoch": 4.376558603491272, + "grad_norm": 7.485365390777588, + "learning_rate": 1.5625436408977556e-05, + "loss": 0.418, + "step": 17550 + }, + { + "epoch": 4.379052369077307, + "grad_norm": 5.805276393890381, + "learning_rate": 1.5622942643391523e-05, + "loss": 0.4342, + "step": 17560 + }, + { + "epoch": 4.381546134663342, + "grad_norm": 5.376327037811279, + "learning_rate": 1.562069825436409e-05, + "loss": 0.4123, + "step": 17570 + }, + { + "epoch": 4.384039900249377, + "grad_norm": 7.425755500793457, + "learning_rate": 1.5618204488778058e-05, + "loss": 0.3918, + "step": 17580 + }, + { + "epoch": 4.386533665835412, + "grad_norm": 7.2214813232421875, + "learning_rate": 1.5615710723192022e-05, + "loss": 0.405, + "step": 17590 + }, + { + "epoch": 4.389027431421447, + "grad_norm": 7.342413902282715, + "learning_rate": 1.5613216957605985e-05, + "loss": 0.4112, + "step": 17600 + }, + { + "epoch": 4.3915211970074814, + "grad_norm": 6.210078716278076, + "learning_rate": 1.5610723192019952e-05, + "loss": 0.4194, + "step": 17610 + }, + { + "epoch": 4.394014962593516, + "grad_norm": 8.023131370544434, + "learning_rate": 1.5608229426433916e-05, + "loss": 0.3781, + "step": 17620 + }, + { + "epoch": 4.396508728179551, + "grad_norm": 6.007671356201172, + "learning_rate": 1.560573566084788e-05, + "loss": 0.4539, + "step": 17630 + }, + { + "epoch": 4.399002493765586, + "grad_norm": 6.679064750671387, + "learning_rate": 1.5603241895261847e-05, + "loss": 0.4137, + "step": 17640 + }, + { + "epoch": 4.401496259351621, + "grad_norm": 3.1536433696746826, + "learning_rate": 1.560074812967581e-05, + "loss": 0.4058, + "step": 17650 + }, + { + "epoch": 4.403990024937656, + "grad_norm": 6.0214009284973145, + "learning_rate": 1.5598254364089777e-05, + "loss": 0.4396, + "step": 17660 + }, + { + "epoch": 4.406483790523691, + "grad_norm": 4.901978015899658, + "learning_rate": 1.559576059850374e-05, + "loss": 0.3561, + "step": 17670 + }, + { + "epoch": 4.4089775561097255, + "grad_norm": 7.766887664794922, + "learning_rate": 1.5593266832917705e-05, + "loss": 0.4944, + "step": 17680 + }, + { + "epoch": 4.41147132169576, + "grad_norm": 5.148433685302734, + "learning_rate": 1.559077306733167e-05, + "loss": 0.4223, + "step": 17690 + }, + { + "epoch": 4.413965087281795, + "grad_norm": 4.767563343048096, + "learning_rate": 1.558827930174564e-05, + "loss": 0.3936, + "step": 17700 + }, + { + "epoch": 4.41645885286783, + "grad_norm": 7.088444232940674, + "learning_rate": 1.5585785536159602e-05, + "loss": 0.4696, + "step": 17710 + }, + { + "epoch": 4.418952618453865, + "grad_norm": 7.782017707824707, + "learning_rate": 1.558329177057357e-05, + "loss": 0.3238, + "step": 17720 + }, + { + "epoch": 4.4214463840399, + "grad_norm": 8.426640510559082, + "learning_rate": 1.5580798004987533e-05, + "loss": 0.422, + "step": 17730 + }, + { + "epoch": 4.423940149625935, + "grad_norm": 6.744356155395508, + "learning_rate": 1.55783042394015e-05, + "loss": 0.3593, + "step": 17740 + }, + { + "epoch": 4.42643391521197, + "grad_norm": 4.268016338348389, + "learning_rate": 1.5575810473815463e-05, + "loss": 0.4009, + "step": 17750 + }, + { + "epoch": 4.428927680798005, + "grad_norm": 6.248506546020508, + "learning_rate": 1.5573316708229427e-05, + "loss": 0.3938, + "step": 17760 + }, + { + "epoch": 4.43142144638404, + "grad_norm": 5.938436031341553, + "learning_rate": 1.5570822942643394e-05, + "loss": 0.4448, + "step": 17770 + }, + { + "epoch": 4.433915211970075, + "grad_norm": 6.432234764099121, + "learning_rate": 1.5568329177057358e-05, + "loss": 0.3664, + "step": 17780 + }, + { + "epoch": 4.43640897755611, + "grad_norm": 5.563558101654053, + "learning_rate": 1.5565835411471325e-05, + "loss": 0.5319, + "step": 17790 + }, + { + "epoch": 4.438902743142145, + "grad_norm": 5.881470680236816, + "learning_rate": 1.5563341645885288e-05, + "loss": 0.435, + "step": 17800 + }, + { + "epoch": 4.44139650872818, + "grad_norm": 6.403785705566406, + "learning_rate": 1.5560847880299252e-05, + "loss": 0.3419, + "step": 17810 + }, + { + "epoch": 4.443890274314215, + "grad_norm": 6.167340278625488, + "learning_rate": 1.555835411471322e-05, + "loss": 0.4303, + "step": 17820 + }, + { + "epoch": 4.446384039900249, + "grad_norm": 8.766891479492188, + "learning_rate": 1.5555860349127182e-05, + "loss": 0.3443, + "step": 17830 + }, + { + "epoch": 4.448877805486284, + "grad_norm": 6.43876314163208, + "learning_rate": 1.5553366583541146e-05, + "loss": 0.3824, + "step": 17840 + }, + { + "epoch": 4.451371571072319, + "grad_norm": 14.504803657531738, + "learning_rate": 1.5550872817955113e-05, + "loss": 0.3503, + "step": 17850 + }, + { + "epoch": 4.453865336658354, + "grad_norm": 8.448104858398438, + "learning_rate": 1.5548379052369077e-05, + "loss": 0.4532, + "step": 17860 + }, + { + "epoch": 4.456359102244389, + "grad_norm": 7.163640022277832, + "learning_rate": 1.5545885286783044e-05, + "loss": 0.4343, + "step": 17870 + }, + { + "epoch": 4.458852867830424, + "grad_norm": 5.441543102264404, + "learning_rate": 1.5543391521197007e-05, + "loss": 0.3523, + "step": 17880 + }, + { + "epoch": 4.461346633416459, + "grad_norm": 4.966891765594482, + "learning_rate": 1.5540897755610974e-05, + "loss": 0.3701, + "step": 17890 + }, + { + "epoch": 4.4638403990024935, + "grad_norm": 5.674979209899902, + "learning_rate": 1.5538403990024938e-05, + "loss": 0.3415, + "step": 17900 + }, + { + "epoch": 4.466334164588528, + "grad_norm": 6.1280646324157715, + "learning_rate": 1.5535910224438905e-05, + "loss": 0.4511, + "step": 17910 + }, + { + "epoch": 4.468827930174563, + "grad_norm": 6.596588611602783, + "learning_rate": 1.553341645885287e-05, + "loss": 0.3627, + "step": 17920 + }, + { + "epoch": 4.471321695760598, + "grad_norm": 7.014939785003662, + "learning_rate": 1.5530922693266836e-05, + "loss": 0.3572, + "step": 17930 + }, + { + "epoch": 4.473815461346634, + "grad_norm": 5.796361923217773, + "learning_rate": 1.55284289276808e-05, + "loss": 0.448, + "step": 17940 + }, + { + "epoch": 4.476309226932669, + "grad_norm": 7.400503635406494, + "learning_rate": 1.5525935162094766e-05, + "loss": 0.3636, + "step": 17950 + }, + { + "epoch": 4.478802992518704, + "grad_norm": 6.007717132568359, + "learning_rate": 1.552344139650873e-05, + "loss": 0.3922, + "step": 17960 + }, + { + "epoch": 4.4812967581047385, + "grad_norm": 11.898872375488281, + "learning_rate": 1.5520947630922693e-05, + "loss": 0.4342, + "step": 17970 + }, + { + "epoch": 4.483790523690773, + "grad_norm": 6.466373443603516, + "learning_rate": 1.551845386533666e-05, + "loss": 0.4021, + "step": 17980 + }, + { + "epoch": 4.486284289276808, + "grad_norm": 5.270114898681641, + "learning_rate": 1.5515960099750624e-05, + "loss": 0.384, + "step": 17990 + }, + { + "epoch": 4.488778054862843, + "grad_norm": 5.372076034545898, + "learning_rate": 1.551346633416459e-05, + "loss": 0.4017, + "step": 18000 + }, + { + "epoch": 4.491271820448878, + "grad_norm": 7.527553558349609, + "learning_rate": 1.5510972568578555e-05, + "loss": 0.3786, + "step": 18010 + }, + { + "epoch": 4.493765586034913, + "grad_norm": 7.008097171783447, + "learning_rate": 1.5508478802992518e-05, + "loss": 0.4263, + "step": 18020 + }, + { + "epoch": 4.496259351620948, + "grad_norm": 5.688605308532715, + "learning_rate": 1.5505985037406485e-05, + "loss": 0.3271, + "step": 18030 + }, + { + "epoch": 4.498753117206983, + "grad_norm": 3.6691343784332275, + "learning_rate": 1.550349127182045e-05, + "loss": 0.3862, + "step": 18040 + }, + { + "epoch": 4.501246882793017, + "grad_norm": 5.9994635581970215, + "learning_rate": 1.5500997506234416e-05, + "loss": 0.4175, + "step": 18050 + }, + { + "epoch": 4.503740648379052, + "grad_norm": 5.447284698486328, + "learning_rate": 1.549850374064838e-05, + "loss": 0.4004, + "step": 18060 + }, + { + "epoch": 4.506234413965087, + "grad_norm": 6.669367790222168, + "learning_rate": 1.5496009975062347e-05, + "loss": 0.4212, + "step": 18070 + }, + { + "epoch": 4.508728179551122, + "grad_norm": 5.361328601837158, + "learning_rate": 1.549351620947631e-05, + "loss": 0.4352, + "step": 18080 + }, + { + "epoch": 4.511221945137157, + "grad_norm": 7.188773155212402, + "learning_rate": 1.5491022443890277e-05, + "loss": 0.3761, + "step": 18090 + }, + { + "epoch": 4.513715710723192, + "grad_norm": 4.965447902679443, + "learning_rate": 1.548852867830424e-05, + "loss": 0.3335, + "step": 18100 + }, + { + "epoch": 4.516209476309227, + "grad_norm": 8.553069114685059, + "learning_rate": 1.5486034912718208e-05, + "loss": 0.4081, + "step": 18110 + }, + { + "epoch": 4.5187032418952615, + "grad_norm": 6.796214580535889, + "learning_rate": 1.548354114713217e-05, + "loss": 0.3981, + "step": 18120 + }, + { + "epoch": 4.521197007481296, + "grad_norm": 5.6893815994262695, + "learning_rate": 1.5481047381546135e-05, + "loss": 0.5065, + "step": 18130 + }, + { + "epoch": 4.523690773067331, + "grad_norm": 6.233173370361328, + "learning_rate": 1.5478553615960102e-05, + "loss": 0.3679, + "step": 18140 + }, + { + "epoch": 4.526184538653366, + "grad_norm": 7.2343597412109375, + "learning_rate": 1.5476059850374066e-05, + "loss": 0.4395, + "step": 18150 + }, + { + "epoch": 4.528678304239402, + "grad_norm": 7.412199020385742, + "learning_rate": 1.5473566084788033e-05, + "loss": 0.4316, + "step": 18160 + }, + { + "epoch": 4.531172069825437, + "grad_norm": 4.937939643859863, + "learning_rate": 1.5471072319201996e-05, + "loss": 0.3744, + "step": 18170 + }, + { + "epoch": 4.533665835411472, + "grad_norm": 6.558340072631836, + "learning_rate": 1.546857855361596e-05, + "loss": 0.418, + "step": 18180 + }, + { + "epoch": 4.5361596009975065, + "grad_norm": 6.0610551834106445, + "learning_rate": 1.5466084788029927e-05, + "loss": 0.4406, + "step": 18190 + }, + { + "epoch": 4.538653366583541, + "grad_norm": 4.485851764678955, + "learning_rate": 1.546359102244389e-05, + "loss": 0.4511, + "step": 18200 + }, + { + "epoch": 4.541147132169576, + "grad_norm": 5.121599197387695, + "learning_rate": 1.5461097256857857e-05, + "loss": 0.3711, + "step": 18210 + }, + { + "epoch": 4.543640897755611, + "grad_norm": 4.987266540527344, + "learning_rate": 1.545860349127182e-05, + "loss": 0.3687, + "step": 18220 + }, + { + "epoch": 4.546134663341646, + "grad_norm": 6.394310474395752, + "learning_rate": 1.5456109725685785e-05, + "loss": 0.3217, + "step": 18230 + }, + { + "epoch": 4.548628428927681, + "grad_norm": 5.757366180419922, + "learning_rate": 1.5453615960099752e-05, + "loss": 0.3671, + "step": 18240 + }, + { + "epoch": 4.551122194513716, + "grad_norm": 6.678833961486816, + "learning_rate": 1.5451122194513715e-05, + "loss": 0.3765, + "step": 18250 + }, + { + "epoch": 4.553615960099751, + "grad_norm": 6.454516410827637, + "learning_rate": 1.5448628428927682e-05, + "loss": 0.4051, + "step": 18260 + }, + { + "epoch": 4.556109725685785, + "grad_norm": 5.4813337326049805, + "learning_rate": 1.5446134663341646e-05, + "loss": 0.3641, + "step": 18270 + }, + { + "epoch": 4.55860349127182, + "grad_norm": 5.699728488922119, + "learning_rate": 1.5443640897755613e-05, + "loss": 0.3532, + "step": 18280 + }, + { + "epoch": 4.561097256857855, + "grad_norm": 24.547840118408203, + "learning_rate": 1.544114713216958e-05, + "loss": 0.5092, + "step": 18290 + }, + { + "epoch": 4.56359102244389, + "grad_norm": 3.5861904621124268, + "learning_rate": 1.5438653366583544e-05, + "loss": 0.3255, + "step": 18300 + }, + { + "epoch": 4.566084788029925, + "grad_norm": 5.126884937286377, + "learning_rate": 1.5436159600997507e-05, + "loss": 0.3716, + "step": 18310 + }, + { + "epoch": 4.56857855361596, + "grad_norm": 6.409011363983154, + "learning_rate": 1.5433665835411474e-05, + "loss": 0.3852, + "step": 18320 + }, + { + "epoch": 4.571072319201995, + "grad_norm": 5.01663875579834, + "learning_rate": 1.5431172069825438e-05, + "loss": 0.4487, + "step": 18330 + }, + { + "epoch": 4.57356608478803, + "grad_norm": 5.1992506980896, + "learning_rate": 1.54286783042394e-05, + "loss": 0.384, + "step": 18340 + }, + { + "epoch": 4.576059850374065, + "grad_norm": 9.998324394226074, + "learning_rate": 1.542618453865337e-05, + "loss": 0.4018, + "step": 18350 + }, + { + "epoch": 4.5785536159601, + "grad_norm": 5.419407844543457, + "learning_rate": 1.5423690773067332e-05, + "loss": 0.3933, + "step": 18360 + }, + { + "epoch": 4.581047381546135, + "grad_norm": 3.5099966526031494, + "learning_rate": 1.54211970074813e-05, + "loss": 0.3373, + "step": 18370 + }, + { + "epoch": 4.58354114713217, + "grad_norm": 6.4149627685546875, + "learning_rate": 1.5418703241895263e-05, + "loss": 0.3542, + "step": 18380 + }, + { + "epoch": 4.586034912718205, + "grad_norm": 7.924310684204102, + "learning_rate": 1.5416209476309226e-05, + "loss": 0.4282, + "step": 18390 + }, + { + "epoch": 4.58852867830424, + "grad_norm": 19.269880294799805, + "learning_rate": 1.5413715710723193e-05, + "loss": 0.4397, + "step": 18400 + }, + { + "epoch": 4.5910224438902745, + "grad_norm": 8.860027313232422, + "learning_rate": 1.5411221945137157e-05, + "loss": 0.3427, + "step": 18410 + }, + { + "epoch": 4.593516209476309, + "grad_norm": 7.146231174468994, + "learning_rate": 1.5408728179551124e-05, + "loss": 0.4281, + "step": 18420 + }, + { + "epoch": 4.596009975062344, + "grad_norm": 5.25446081161499, + "learning_rate": 1.5406234413965088e-05, + "loss": 0.4066, + "step": 18430 + }, + { + "epoch": 4.598503740648379, + "grad_norm": 4.486170291900635, + "learning_rate": 1.5403740648379055e-05, + "loss": 0.3729, + "step": 18440 + }, + { + "epoch": 4.600997506234414, + "grad_norm": 5.795819282531738, + "learning_rate": 1.5401496259351623e-05, + "loss": 0.3982, + "step": 18450 + }, + { + "epoch": 4.603491271820449, + "grad_norm": 5.524128437042236, + "learning_rate": 1.5399002493765586e-05, + "loss": 0.3987, + "step": 18460 + }, + { + "epoch": 4.605985037406484, + "grad_norm": 9.092752456665039, + "learning_rate": 1.5396508728179553e-05, + "loss": 0.3542, + "step": 18470 + }, + { + "epoch": 4.6084788029925186, + "grad_norm": 4.337942123413086, + "learning_rate": 1.5394014962593517e-05, + "loss": 0.3433, + "step": 18480 + }, + { + "epoch": 4.610972568578553, + "grad_norm": 7.767664909362793, + "learning_rate": 1.539152119700748e-05, + "loss": 0.412, + "step": 18490 + }, + { + "epoch": 4.613466334164588, + "grad_norm": 7.6244964599609375, + "learning_rate": 1.5389027431421448e-05, + "loss": 0.4377, + "step": 18500 + }, + { + "epoch": 4.615960099750623, + "grad_norm": 5.876872539520264, + "learning_rate": 1.538653366583541e-05, + "loss": 0.4679, + "step": 18510 + }, + { + "epoch": 4.618453865336658, + "grad_norm": 7.280019283294678, + "learning_rate": 1.5384039900249378e-05, + "loss": 0.4518, + "step": 18520 + }, + { + "epoch": 4.620947630922693, + "grad_norm": 6.68482780456543, + "learning_rate": 1.5381546134663342e-05, + "loss": 0.351, + "step": 18530 + }, + { + "epoch": 4.623441396508728, + "grad_norm": 6.15846061706543, + "learning_rate": 1.537905236907731e-05, + "loss": 0.4252, + "step": 18540 + }, + { + "epoch": 4.625935162094763, + "grad_norm": 13.20067024230957, + "learning_rate": 1.5376558603491272e-05, + "loss": 0.3845, + "step": 18550 + }, + { + "epoch": 4.628428927680798, + "grad_norm": 5.230330467224121, + "learning_rate": 1.537406483790524e-05, + "loss": 0.4242, + "step": 18560 + }, + { + "epoch": 4.630922693266833, + "grad_norm": 6.149280071258545, + "learning_rate": 1.5371571072319203e-05, + "loss": 0.4017, + "step": 18570 + }, + { + "epoch": 4.633416458852868, + "grad_norm": 5.865719318389893, + "learning_rate": 1.536907730673317e-05, + "loss": 0.3697, + "step": 18580 + }, + { + "epoch": 4.635910224438903, + "grad_norm": 7.069497585296631, + "learning_rate": 1.5366583541147134e-05, + "loss": 0.3835, + "step": 18590 + }, + { + "epoch": 4.638403990024938, + "grad_norm": 10.992486000061035, + "learning_rate": 1.53640897755611e-05, + "loss": 0.4504, + "step": 18600 + }, + { + "epoch": 4.640897755610973, + "grad_norm": 9.323265075683594, + "learning_rate": 1.5361596009975064e-05, + "loss": 0.5272, + "step": 18610 + }, + { + "epoch": 4.643391521197008, + "grad_norm": 7.848715305328369, + "learning_rate": 1.5359102244389028e-05, + "loss": 0.4452, + "step": 18620 + }, + { + "epoch": 4.6458852867830425, + "grad_norm": 7.649922847747803, + "learning_rate": 1.5356608478802995e-05, + "loss": 0.3857, + "step": 18630 + }, + { + "epoch": 4.648379052369077, + "grad_norm": 4.88482141494751, + "learning_rate": 1.535411471321696e-05, + "loss": 0.3648, + "step": 18640 + }, + { + "epoch": 4.650872817955112, + "grad_norm": 8.439806938171387, + "learning_rate": 1.5351620947630922e-05, + "loss": 0.3763, + "step": 18650 + }, + { + "epoch": 4.653366583541147, + "grad_norm": 7.73594856262207, + "learning_rate": 1.534912718204489e-05, + "loss": 0.354, + "step": 18660 + }, + { + "epoch": 4.655860349127182, + "grad_norm": 7.768985271453857, + "learning_rate": 1.5346633416458853e-05, + "loss": 0.3676, + "step": 18670 + }, + { + "epoch": 4.658354114713217, + "grad_norm": 5.735325336456299, + "learning_rate": 1.534413965087282e-05, + "loss": 0.3333, + "step": 18680 + }, + { + "epoch": 4.660847880299252, + "grad_norm": 4.903052806854248, + "learning_rate": 1.5341645885286783e-05, + "loss": 0.467, + "step": 18690 + }, + { + "epoch": 4.6633416458852865, + "grad_norm": 6.769343376159668, + "learning_rate": 1.5339152119700747e-05, + "loss": 0.4424, + "step": 18700 + }, + { + "epoch": 4.665835411471321, + "grad_norm": 22.33620262145996, + "learning_rate": 1.5336658354114714e-05, + "loss": 0.4389, + "step": 18710 + }, + { + "epoch": 4.668329177057356, + "grad_norm": 5.3549909591674805, + "learning_rate": 1.533416458852868e-05, + "loss": 0.362, + "step": 18720 + }, + { + "epoch": 4.670822942643391, + "grad_norm": 4.449096202850342, + "learning_rate": 1.5331670822942645e-05, + "loss": 0.3292, + "step": 18730 + }, + { + "epoch": 4.673316708229427, + "grad_norm": 7.292250156402588, + "learning_rate": 1.532917705735661e-05, + "loss": 0.3926, + "step": 18740 + }, + { + "epoch": 4.675810473815462, + "grad_norm": 6.387401103973389, + "learning_rate": 1.5326683291770575e-05, + "loss": 0.4135, + "step": 18750 + }, + { + "epoch": 4.678304239401497, + "grad_norm": 5.179088115692139, + "learning_rate": 1.5324189526184542e-05, + "loss": 0.3263, + "step": 18760 + }, + { + "epoch": 4.6807980049875315, + "grad_norm": 6.1036529541015625, + "learning_rate": 1.5321695760598506e-05, + "loss": 0.3773, + "step": 18770 + }, + { + "epoch": 4.683291770573566, + "grad_norm": 6.317544937133789, + "learning_rate": 1.531920199501247e-05, + "loss": 0.3549, + "step": 18780 + }, + { + "epoch": 4.685785536159601, + "grad_norm": 4.250217437744141, + "learning_rate": 1.5316708229426437e-05, + "loss": 0.3675, + "step": 18790 + }, + { + "epoch": 4.688279301745636, + "grad_norm": 4.993629455566406, + "learning_rate": 1.53142144638404e-05, + "loss": 0.3732, + "step": 18800 + }, + { + "epoch": 4.690773067331671, + "grad_norm": 6.042954444885254, + "learning_rate": 1.5311720698254364e-05, + "loss": 0.4064, + "step": 18810 + }, + { + "epoch": 4.693266832917706, + "grad_norm": 7.933563709259033, + "learning_rate": 1.530922693266833e-05, + "loss": 0.4322, + "step": 18820 + }, + { + "epoch": 4.695760598503741, + "grad_norm": 5.294432163238525, + "learning_rate": 1.5306733167082294e-05, + "loss": 0.3844, + "step": 18830 + }, + { + "epoch": 4.698254364089776, + "grad_norm": 4.980920791625977, + "learning_rate": 1.530423940149626e-05, + "loss": 0.3597, + "step": 18840 + }, + { + "epoch": 4.7007481296758105, + "grad_norm": 5.460011005401611, + "learning_rate": 1.5301745635910225e-05, + "loss": 0.3212, + "step": 18850 + }, + { + "epoch": 4.703241895261845, + "grad_norm": 6.553736686706543, + "learning_rate": 1.529925187032419e-05, + "loss": 0.4003, + "step": 18860 + }, + { + "epoch": 4.70573566084788, + "grad_norm": 6.58586311340332, + "learning_rate": 1.5296758104738156e-05, + "loss": 0.4062, + "step": 18870 + }, + { + "epoch": 4.708229426433915, + "grad_norm": 6.710026741027832, + "learning_rate": 1.529426433915212e-05, + "loss": 0.4248, + "step": 18880 + }, + { + "epoch": 4.71072319201995, + "grad_norm": 6.377664089202881, + "learning_rate": 1.5291770573566086e-05, + "loss": 0.467, + "step": 18890 + }, + { + "epoch": 4.713216957605985, + "grad_norm": 8.970911979675293, + "learning_rate": 1.528927680798005e-05, + "loss": 0.4081, + "step": 18900 + }, + { + "epoch": 4.71571072319202, + "grad_norm": 5.463969707489014, + "learning_rate": 1.5286783042394017e-05, + "loss": 0.3779, + "step": 18910 + }, + { + "epoch": 4.7182044887780545, + "grad_norm": 5.64981746673584, + "learning_rate": 1.528428927680798e-05, + "loss": 0.4542, + "step": 18920 + }, + { + "epoch": 4.720698254364089, + "grad_norm": 7.127771377563477, + "learning_rate": 1.5281795511221947e-05, + "loss": 0.3373, + "step": 18930 + }, + { + "epoch": 4.723192019950124, + "grad_norm": 6.836137771606445, + "learning_rate": 1.527930174563591e-05, + "loss": 0.3651, + "step": 18940 + }, + { + "epoch": 4.725685785536159, + "grad_norm": 7.067853927612305, + "learning_rate": 1.5276807980049878e-05, + "loss": 0.3785, + "step": 18950 + }, + { + "epoch": 4.728179551122195, + "grad_norm": 8.48742389678955, + "learning_rate": 1.5274314214463842e-05, + "loss": 0.3731, + "step": 18960 + }, + { + "epoch": 4.73067331670823, + "grad_norm": 13.028044700622559, + "learning_rate": 1.527182044887781e-05, + "loss": 0.3846, + "step": 18970 + }, + { + "epoch": 4.733167082294265, + "grad_norm": 8.332368850708008, + "learning_rate": 1.5269326683291772e-05, + "loss": 0.4137, + "step": 18980 + }, + { + "epoch": 4.7356608478802995, + "grad_norm": 7.124736785888672, + "learning_rate": 1.5266832917705736e-05, + "loss": 0.386, + "step": 18990 + }, + { + "epoch": 4.738154613466334, + "grad_norm": 6.675353527069092, + "learning_rate": 1.5264339152119703e-05, + "loss": 0.4329, + "step": 19000 + }, + { + "epoch": 4.740648379052369, + "grad_norm": 5.873729705810547, + "learning_rate": 1.5261845386533667e-05, + "loss": 0.4582, + "step": 19010 + }, + { + "epoch": 4.743142144638404, + "grad_norm": 7.303366184234619, + "learning_rate": 1.525935162094763e-05, + "loss": 0.416, + "step": 19020 + }, + { + "epoch": 4.745635910224439, + "grad_norm": 4.469297885894775, + "learning_rate": 1.5256857855361597e-05, + "loss": 0.3637, + "step": 19030 + }, + { + "epoch": 4.748129675810474, + "grad_norm": 4.885001182556152, + "learning_rate": 1.5254364089775563e-05, + "loss": 0.3829, + "step": 19040 + }, + { + "epoch": 4.750623441396509, + "grad_norm": 12.459643363952637, + "learning_rate": 1.5251870324189528e-05, + "loss": 0.3955, + "step": 19050 + }, + { + "epoch": 4.753117206982544, + "grad_norm": 4.591975688934326, + "learning_rate": 1.5249376558603493e-05, + "loss": 0.3381, + "step": 19060 + }, + { + "epoch": 4.7556109725685785, + "grad_norm": 5.559258937835693, + "learning_rate": 1.5246882793017457e-05, + "loss": 0.4173, + "step": 19070 + }, + { + "epoch": 4.758104738154613, + "grad_norm": 8.134532928466797, + "learning_rate": 1.5244389027431424e-05, + "loss": 0.4132, + "step": 19080 + }, + { + "epoch": 4.760598503740648, + "grad_norm": 8.197823524475098, + "learning_rate": 1.5241895261845387e-05, + "loss": 0.4422, + "step": 19090 + }, + { + "epoch": 4.763092269326683, + "grad_norm": 6.592520713806152, + "learning_rate": 1.5239401496259354e-05, + "loss": 0.3964, + "step": 19100 + }, + { + "epoch": 4.765586034912718, + "grad_norm": 9.510995864868164, + "learning_rate": 1.5236907730673318e-05, + "loss": 0.4011, + "step": 19110 + }, + { + "epoch": 4.768079800498753, + "grad_norm": 8.02636432647705, + "learning_rate": 1.5234413965087282e-05, + "loss": 0.3814, + "step": 19120 + }, + { + "epoch": 4.770573566084788, + "grad_norm": 8.324048042297363, + "learning_rate": 1.5231920199501249e-05, + "loss": 0.4601, + "step": 19130 + }, + { + "epoch": 4.773067331670823, + "grad_norm": 4.599636554718018, + "learning_rate": 1.5229426433915214e-05, + "loss": 0.3584, + "step": 19140 + }, + { + "epoch": 4.775561097256858, + "grad_norm": 7.513518333435059, + "learning_rate": 1.5226932668329178e-05, + "loss": 0.4006, + "step": 19150 + }, + { + "epoch": 4.778054862842893, + "grad_norm": 6.876750946044922, + "learning_rate": 1.5224438902743145e-05, + "loss": 0.3725, + "step": 19160 + }, + { + "epoch": 4.780548628428928, + "grad_norm": 5.191885471343994, + "learning_rate": 1.5221945137157108e-05, + "loss": 0.3534, + "step": 19170 + }, + { + "epoch": 4.783042394014963, + "grad_norm": 7.336450099945068, + "learning_rate": 1.5219451371571075e-05, + "loss": 0.408, + "step": 19180 + }, + { + "epoch": 4.785536159600998, + "grad_norm": 4.654347896575928, + "learning_rate": 1.5216957605985039e-05, + "loss": 0.3971, + "step": 19190 + }, + { + "epoch": 4.788029925187033, + "grad_norm": 8.277083396911621, + "learning_rate": 1.5214463840399002e-05, + "loss": 0.4695, + "step": 19200 + }, + { + "epoch": 4.7905236907730675, + "grad_norm": 5.439416885375977, + "learning_rate": 1.521197007481297e-05, + "loss": 0.4059, + "step": 19210 + }, + { + "epoch": 4.793017456359102, + "grad_norm": 7.094707489013672, + "learning_rate": 1.5209476309226933e-05, + "loss": 0.4118, + "step": 19220 + }, + { + "epoch": 4.795511221945137, + "grad_norm": 4.1312575340271, + "learning_rate": 1.5206982543640898e-05, + "loss": 0.3257, + "step": 19230 + }, + { + "epoch": 4.798004987531172, + "grad_norm": 14.407641410827637, + "learning_rate": 1.5204488778054864e-05, + "loss": 0.3796, + "step": 19240 + }, + { + "epoch": 4.800498753117207, + "grad_norm": 5.280395984649658, + "learning_rate": 1.5201995012468829e-05, + "loss": 0.3183, + "step": 19250 + }, + { + "epoch": 4.802992518703242, + "grad_norm": 5.899741172790527, + "learning_rate": 1.5199501246882796e-05, + "loss": 0.3432, + "step": 19260 + }, + { + "epoch": 4.805486284289277, + "grad_norm": 6.163057804107666, + "learning_rate": 1.519700748129676e-05, + "loss": 0.3459, + "step": 19270 + }, + { + "epoch": 4.807980049875312, + "grad_norm": 6.30502986907959, + "learning_rate": 1.5194513715710723e-05, + "loss": 0.4026, + "step": 19280 + }, + { + "epoch": 4.8104738154613464, + "grad_norm": 8.228434562683105, + "learning_rate": 1.519201995012469e-05, + "loss": 0.4527, + "step": 19290 + }, + { + "epoch": 4.812967581047381, + "grad_norm": 7.36271333694458, + "learning_rate": 1.5189526184538654e-05, + "loss": 0.4226, + "step": 19300 + }, + { + "epoch": 4.815461346633416, + "grad_norm": 6.21150016784668, + "learning_rate": 1.5187032418952619e-05, + "loss": 0.3855, + "step": 19310 + }, + { + "epoch": 4.817955112219451, + "grad_norm": 5.929027557373047, + "learning_rate": 1.5184538653366584e-05, + "loss": 0.3232, + "step": 19320 + }, + { + "epoch": 4.820448877805486, + "grad_norm": 12.694462776184082, + "learning_rate": 1.518204488778055e-05, + "loss": 0.3749, + "step": 19330 + }, + { + "epoch": 4.822942643391521, + "grad_norm": 4.2973456382751465, + "learning_rate": 1.5179551122194515e-05, + "loss": 0.3592, + "step": 19340 + }, + { + "epoch": 4.825436408977556, + "grad_norm": 6.927724838256836, + "learning_rate": 1.517705735660848e-05, + "loss": 0.3819, + "step": 19350 + }, + { + "epoch": 4.8279301745635905, + "grad_norm": 8.140523910522461, + "learning_rate": 1.5174563591022444e-05, + "loss": 0.4302, + "step": 19360 + }, + { + "epoch": 4.830423940149626, + "grad_norm": 9.106082916259766, + "learning_rate": 1.5172069825436411e-05, + "loss": 0.4181, + "step": 19370 + }, + { + "epoch": 4.832917705735661, + "grad_norm": 5.513749599456787, + "learning_rate": 1.5169576059850375e-05, + "loss": 0.3663, + "step": 19380 + }, + { + "epoch": 4.835411471321696, + "grad_norm": 4.832230567932129, + "learning_rate": 1.5167082294264342e-05, + "loss": 0.3402, + "step": 19390 + }, + { + "epoch": 4.837905236907731, + "grad_norm": 5.5058183670043945, + "learning_rate": 1.5164588528678305e-05, + "loss": 0.4934, + "step": 19400 + }, + { + "epoch": 4.840399002493766, + "grad_norm": 6.715292930603027, + "learning_rate": 1.516209476309227e-05, + "loss": 0.4119, + "step": 19410 + }, + { + "epoch": 4.842892768079801, + "grad_norm": 7.778273105621338, + "learning_rate": 1.5159600997506236e-05, + "loss": 0.3779, + "step": 19420 + }, + { + "epoch": 4.8453865336658355, + "grad_norm": 5.197988510131836, + "learning_rate": 1.5157107231920201e-05, + "loss": 0.3754, + "step": 19430 + }, + { + "epoch": 4.84788029925187, + "grad_norm": 4.799652099609375, + "learning_rate": 1.5154613466334165e-05, + "loss": 0.3453, + "step": 19440 + }, + { + "epoch": 4.850374064837905, + "grad_norm": 7.939327716827393, + "learning_rate": 1.5152119700748132e-05, + "loss": 0.3741, + "step": 19450 + }, + { + "epoch": 4.85286783042394, + "grad_norm": 7.11137056350708, + "learning_rate": 1.5149625935162095e-05, + "loss": 0.3905, + "step": 19460 + }, + { + "epoch": 4.855361596009975, + "grad_norm": 6.2819061279296875, + "learning_rate": 1.5147132169576062e-05, + "loss": 0.3733, + "step": 19470 + }, + { + "epoch": 4.85785536159601, + "grad_norm": 5.845408916473389, + "learning_rate": 1.5144638403990026e-05, + "loss": 0.4008, + "step": 19480 + }, + { + "epoch": 4.860349127182045, + "grad_norm": 7.455866813659668, + "learning_rate": 1.5142144638403991e-05, + "loss": 0.418, + "step": 19490 + }, + { + "epoch": 4.86284289276808, + "grad_norm": 5.083822250366211, + "learning_rate": 1.5139650872817957e-05, + "loss": 0.3625, + "step": 19500 + }, + { + "epoch": 4.865336658354114, + "grad_norm": 6.3122968673706055, + "learning_rate": 1.5137157107231922e-05, + "loss": 0.396, + "step": 19510 + }, + { + "epoch": 4.867830423940149, + "grad_norm": 6.627301216125488, + "learning_rate": 1.5134663341645886e-05, + "loss": 0.3841, + "step": 19520 + }, + { + "epoch": 4.870324189526184, + "grad_norm": 7.174892425537109, + "learning_rate": 1.5132169576059853e-05, + "loss": 0.3484, + "step": 19530 + }, + { + "epoch": 4.87281795511222, + "grad_norm": 6.404763698577881, + "learning_rate": 1.5129675810473816e-05, + "loss": 0.4297, + "step": 19540 + }, + { + "epoch": 4.875311720698255, + "grad_norm": 9.780095100402832, + "learning_rate": 1.5127182044887783e-05, + "loss": 0.4377, + "step": 19550 + }, + { + "epoch": 4.87780548628429, + "grad_norm": 6.8640546798706055, + "learning_rate": 1.5124688279301747e-05, + "loss": 0.3958, + "step": 19560 + }, + { + "epoch": 4.8802992518703245, + "grad_norm": 4.705758094787598, + "learning_rate": 1.512219451371571e-05, + "loss": 0.4039, + "step": 19570 + }, + { + "epoch": 4.882793017456359, + "grad_norm": 4.98261833190918, + "learning_rate": 1.5119700748129677e-05, + "loss": 0.4121, + "step": 19580 + }, + { + "epoch": 4.885286783042394, + "grad_norm": 9.222187042236328, + "learning_rate": 1.5117206982543641e-05, + "loss": 0.412, + "step": 19590 + }, + { + "epoch": 4.887780548628429, + "grad_norm": 8.537120819091797, + "learning_rate": 1.5114713216957608e-05, + "loss": 0.4807, + "step": 19600 + }, + { + "epoch": 4.890274314214464, + "grad_norm": 9.445647239685059, + "learning_rate": 1.5112219451371573e-05, + "loss": 0.4003, + "step": 19610 + }, + { + "epoch": 4.892768079800499, + "grad_norm": 7.455517292022705, + "learning_rate": 1.5109725685785537e-05, + "loss": 0.3835, + "step": 19620 + }, + { + "epoch": 4.895261845386534, + "grad_norm": 7.285569667816162, + "learning_rate": 1.5107231920199504e-05, + "loss": 0.3211, + "step": 19630 + }, + { + "epoch": 4.897755610972569, + "grad_norm": 8.98646068572998, + "learning_rate": 1.5104738154613468e-05, + "loss": 0.4019, + "step": 19640 + }, + { + "epoch": 4.9002493765586035, + "grad_norm": 8.273704528808594, + "learning_rate": 1.5102244389027431e-05, + "loss": 0.4262, + "step": 19650 + }, + { + "epoch": 4.902743142144638, + "grad_norm": 3.785250663757324, + "learning_rate": 1.5099750623441398e-05, + "loss": 0.3724, + "step": 19660 + }, + { + "epoch": 4.905236907730673, + "grad_norm": 6.882282257080078, + "learning_rate": 1.5097256857855362e-05, + "loss": 0.4621, + "step": 19670 + }, + { + "epoch": 4.907730673316708, + "grad_norm": 9.958419799804688, + "learning_rate": 1.5094763092269329e-05, + "loss": 0.3655, + "step": 19680 + }, + { + "epoch": 4.910224438902743, + "grad_norm": 4.580639839172363, + "learning_rate": 1.5092269326683292e-05, + "loss": 0.3733, + "step": 19690 + }, + { + "epoch": 4.912718204488778, + "grad_norm": 6.3402276039123535, + "learning_rate": 1.5089775561097258e-05, + "loss": 0.3853, + "step": 19700 + }, + { + "epoch": 4.915211970074813, + "grad_norm": 6.479229927062988, + "learning_rate": 1.5087281795511223e-05, + "loss": 0.4365, + "step": 19710 + }, + { + "epoch": 4.917705735660848, + "grad_norm": 5.332212924957275, + "learning_rate": 1.5084788029925188e-05, + "loss": 0.4072, + "step": 19720 + }, + { + "epoch": 4.920199501246882, + "grad_norm": 5.466404438018799, + "learning_rate": 1.5082294264339152e-05, + "loss": 0.4019, + "step": 19730 + }, + { + "epoch": 4.922693266832917, + "grad_norm": 4.750015735626221, + "learning_rate": 1.5079800498753119e-05, + "loss": 0.3635, + "step": 19740 + }, + { + "epoch": 4.925187032418952, + "grad_norm": 5.45477294921875, + "learning_rate": 1.5077306733167083e-05, + "loss": 0.3239, + "step": 19750 + }, + { + "epoch": 4.927680798004987, + "grad_norm": 7.6153564453125, + "learning_rate": 1.507481296758105e-05, + "loss": 0.3935, + "step": 19760 + }, + { + "epoch": 4.930174563591023, + "grad_norm": 5.9966816902160645, + "learning_rate": 1.5072319201995013e-05, + "loss": 0.3912, + "step": 19770 + }, + { + "epoch": 4.932668329177058, + "grad_norm": 4.490943908691406, + "learning_rate": 1.5069825436408978e-05, + "loss": 0.3943, + "step": 19780 + }, + { + "epoch": 4.9351620947630925, + "grad_norm": 4.777584075927734, + "learning_rate": 1.5067331670822944e-05, + "loss": 0.3863, + "step": 19790 + }, + { + "epoch": 4.937655860349127, + "grad_norm": 6.837095737457275, + "learning_rate": 1.5064837905236909e-05, + "loss": 0.3762, + "step": 19800 + }, + { + "epoch": 4.940149625935162, + "grad_norm": 6.350588321685791, + "learning_rate": 1.5062344139650873e-05, + "loss": 0.4286, + "step": 19810 + }, + { + "epoch": 4.942643391521197, + "grad_norm": 4.62924337387085, + "learning_rate": 1.505985037406484e-05, + "loss": 0.3772, + "step": 19820 + }, + { + "epoch": 4.945137157107232, + "grad_norm": 6.749570369720459, + "learning_rate": 1.5057356608478803e-05, + "loss": 0.4314, + "step": 19830 + }, + { + "epoch": 4.947630922693267, + "grad_norm": 5.663025856018066, + "learning_rate": 1.505486284289277e-05, + "loss": 0.4405, + "step": 19840 + }, + { + "epoch": 4.950124688279302, + "grad_norm": 4.510965347290039, + "learning_rate": 1.5052369077306734e-05, + "loss": 0.351, + "step": 19850 + }, + { + "epoch": 4.952618453865337, + "grad_norm": 4.851863384246826, + "learning_rate": 1.50498753117207e-05, + "loss": 0.3867, + "step": 19860 + }, + { + "epoch": 4.9551122194513715, + "grad_norm": 5.330405235290527, + "learning_rate": 1.5047381546134665e-05, + "loss": 0.4789, + "step": 19870 + }, + { + "epoch": 4.957605985037406, + "grad_norm": 13.125707626342773, + "learning_rate": 1.504488778054863e-05, + "loss": 0.4699, + "step": 19880 + }, + { + "epoch": 4.960099750623441, + "grad_norm": 6.033151149749756, + "learning_rate": 1.5042394014962595e-05, + "loss": 0.381, + "step": 19890 + }, + { + "epoch": 4.962593516209476, + "grad_norm": 5.597021102905273, + "learning_rate": 1.503990024937656e-05, + "loss": 0.3887, + "step": 19900 + }, + { + "epoch": 4.965087281795511, + "grad_norm": 5.585844039916992, + "learning_rate": 1.5037406483790524e-05, + "loss": 0.4441, + "step": 19910 + }, + { + "epoch": 4.967581047381546, + "grad_norm": 6.1360602378845215, + "learning_rate": 1.5034912718204491e-05, + "loss": 0.4344, + "step": 19920 + }, + { + "epoch": 4.970074812967581, + "grad_norm": 13.40194320678711, + "learning_rate": 1.5032418952618455e-05, + "loss": 0.4552, + "step": 19930 + }, + { + "epoch": 4.9725685785536164, + "grad_norm": 7.084679126739502, + "learning_rate": 1.502992518703242e-05, + "loss": 0.3554, + "step": 19940 + }, + { + "epoch": 4.975062344139651, + "grad_norm": 7.618978977203369, + "learning_rate": 1.5027431421446385e-05, + "loss": 0.3493, + "step": 19950 + }, + { + "epoch": 4.977556109725686, + "grad_norm": 6.25748348236084, + "learning_rate": 1.502493765586035e-05, + "loss": 0.4517, + "step": 19960 + }, + { + "epoch": 4.980049875311721, + "grad_norm": 5.518866062164307, + "learning_rate": 1.5022443890274316e-05, + "loss": 0.3737, + "step": 19970 + }, + { + "epoch": 4.982543640897756, + "grad_norm": 6.079436779022217, + "learning_rate": 1.5019950124688281e-05, + "loss": 0.4093, + "step": 19980 + }, + { + "epoch": 4.985037406483791, + "grad_norm": 6.771421432495117, + "learning_rate": 1.5017456359102245e-05, + "loss": 0.3948, + "step": 19990 + }, + { + "epoch": 4.987531172069826, + "grad_norm": 5.409707069396973, + "learning_rate": 1.5014962593516212e-05, + "loss": 0.4326, + "step": 20000 + }, + { + "epoch": 4.9900249376558605, + "grad_norm": 4.681028842926025, + "learning_rate": 1.5012468827930176e-05, + "loss": 0.3327, + "step": 20010 + }, + { + "epoch": 4.992518703241895, + "grad_norm": 7.876126766204834, + "learning_rate": 1.5009975062344139e-05, + "loss": 0.3613, + "step": 20020 + }, + { + "epoch": 4.99501246882793, + "grad_norm": 5.726997375488281, + "learning_rate": 1.5007481296758106e-05, + "loss": 0.4506, + "step": 20030 + }, + { + "epoch": 4.997506234413965, + "grad_norm": 6.533457279205322, + "learning_rate": 1.500498753117207e-05, + "loss": 0.4232, + "step": 20040 + }, + { + "epoch": 5.0, + "grad_norm": 21.448368072509766, + "learning_rate": 1.5002493765586037e-05, + "loss": 0.4523, + "step": 20050 + }, + { + "epoch": 5.0, + "eval_loss": 0.42168691754341125, + "eval_runtime": 60.2711, + "eval_samples_per_second": 16.641, + "eval_steps_per_second": 16.641, + "step": 20050 + }, + { + "epoch": 5.002493765586035, + "grad_norm": 6.531921863555908, + "learning_rate": 1.5000000000000002e-05, + "loss": 0.406, + "step": 20060 + }, + { + "epoch": 5.00498753117207, + "grad_norm": 5.814652442932129, + "learning_rate": 1.4997506234413966e-05, + "loss": 0.4352, + "step": 20070 + }, + { + "epoch": 5.007481296758105, + "grad_norm": 3.371870279312134, + "learning_rate": 1.4995012468827933e-05, + "loss": 0.3459, + "step": 20080 + }, + { + "epoch": 5.0099750623441395, + "grad_norm": 4.824948787689209, + "learning_rate": 1.4992518703241896e-05, + "loss": 0.3522, + "step": 20090 + }, + { + "epoch": 5.012468827930174, + "grad_norm": 10.411859512329102, + "learning_rate": 1.4990024937655863e-05, + "loss": 0.4159, + "step": 20100 + }, + { + "epoch": 5.014962593516209, + "grad_norm": 7.6682562828063965, + "learning_rate": 1.4987531172069827e-05, + "loss": 0.3963, + "step": 20110 + }, + { + "epoch": 5.017456359102244, + "grad_norm": 7.620563983917236, + "learning_rate": 1.498503740648379e-05, + "loss": 0.3902, + "step": 20120 + }, + { + "epoch": 5.019950124688279, + "grad_norm": 7.573136806488037, + "learning_rate": 1.4982543640897758e-05, + "loss": 0.5429, + "step": 20130 + }, + { + "epoch": 5.022443890274314, + "grad_norm": 5.5496697425842285, + "learning_rate": 1.4980049875311721e-05, + "loss": 0.3934, + "step": 20140 + }, + { + "epoch": 5.024937655860349, + "grad_norm": 4.304521560668945, + "learning_rate": 1.4977556109725686e-05, + "loss": 0.3509, + "step": 20150 + }, + { + "epoch": 5.027431421446384, + "grad_norm": 9.141407012939453, + "learning_rate": 1.4975062344139652e-05, + "loss": 0.433, + "step": 20160 + }, + { + "epoch": 5.029925187032419, + "grad_norm": 5.6662678718566895, + "learning_rate": 1.4972568578553617e-05, + "loss": 0.3718, + "step": 20170 + }, + { + "epoch": 5.032418952618454, + "grad_norm": 7.040060997009277, + "learning_rate": 1.4970074812967582e-05, + "loss": 0.3879, + "step": 20180 + }, + { + "epoch": 5.034912718204489, + "grad_norm": 5.835836410522461, + "learning_rate": 1.4967581047381548e-05, + "loss": 0.3697, + "step": 20190 + }, + { + "epoch": 5.037406483790524, + "grad_norm": 3.57354998588562, + "learning_rate": 1.4965087281795511e-05, + "loss": 0.3932, + "step": 20200 + }, + { + "epoch": 5.039900249376559, + "grad_norm": 7.62668514251709, + "learning_rate": 1.4962593516209478e-05, + "loss": 0.3542, + "step": 20210 + }, + { + "epoch": 5.042394014962594, + "grad_norm": 5.238462924957275, + "learning_rate": 1.4960099750623442e-05, + "loss": 0.3125, + "step": 20220 + }, + { + "epoch": 5.0448877805486285, + "grad_norm": 6.3393235206604, + "learning_rate": 1.4957605985037407e-05, + "loss": 0.4034, + "step": 20230 + }, + { + "epoch": 5.047381546134663, + "grad_norm": 7.231442451477051, + "learning_rate": 1.4955112219451373e-05, + "loss": 0.3211, + "step": 20240 + }, + { + "epoch": 5.049875311720698, + "grad_norm": 6.194590091705322, + "learning_rate": 1.4952618453865338e-05, + "loss": 0.4005, + "step": 20250 + }, + { + "epoch": 5.052369077306733, + "grad_norm": 5.869102478027344, + "learning_rate": 1.4950124688279303e-05, + "loss": 0.4237, + "step": 20260 + }, + { + "epoch": 5.054862842892768, + "grad_norm": 6.967468738555908, + "learning_rate": 1.4947630922693268e-05, + "loss": 0.4117, + "step": 20270 + }, + { + "epoch": 5.057356608478803, + "grad_norm": 4.7718658447265625, + "learning_rate": 1.4945137157107232e-05, + "loss": 0.3834, + "step": 20280 + }, + { + "epoch": 5.059850374064838, + "grad_norm": 6.20999813079834, + "learning_rate": 1.4942643391521199e-05, + "loss": 0.3369, + "step": 20290 + }, + { + "epoch": 5.062344139650873, + "grad_norm": 5.620708465576172, + "learning_rate": 1.4940149625935163e-05, + "loss": 0.3699, + "step": 20300 + }, + { + "epoch": 5.0648379052369075, + "grad_norm": 9.380097389221191, + "learning_rate": 1.493765586034913e-05, + "loss": 0.3487, + "step": 20310 + }, + { + "epoch": 5.067331670822942, + "grad_norm": 5.500340938568115, + "learning_rate": 1.4935162094763093e-05, + "loss": 0.3864, + "step": 20320 + }, + { + "epoch": 5.069825436408977, + "grad_norm": 6.254300117492676, + "learning_rate": 1.4932668329177059e-05, + "loss": 0.3982, + "step": 20330 + }, + { + "epoch": 5.072319201995012, + "grad_norm": 7.905352592468262, + "learning_rate": 1.4930174563591024e-05, + "loss": 0.4213, + "step": 20340 + }, + { + "epoch": 5.074812967581048, + "grad_norm": 5.519759654998779, + "learning_rate": 1.492768079800499e-05, + "loss": 0.3563, + "step": 20350 + }, + { + "epoch": 5.077306733167083, + "grad_norm": 7.680830955505371, + "learning_rate": 1.4925187032418953e-05, + "loss": 0.3744, + "step": 20360 + }, + { + "epoch": 5.079800498753118, + "grad_norm": 5.350339889526367, + "learning_rate": 1.492269326683292e-05, + "loss": 0.361, + "step": 20370 + }, + { + "epoch": 5.082294264339152, + "grad_norm": 6.773752212524414, + "learning_rate": 1.4920199501246884e-05, + "loss": 0.4192, + "step": 20380 + }, + { + "epoch": 5.084788029925187, + "grad_norm": 7.636809825897217, + "learning_rate": 1.491770573566085e-05, + "loss": 0.3706, + "step": 20390 + }, + { + "epoch": 5.087281795511222, + "grad_norm": 8.180363655090332, + "learning_rate": 1.4915211970074814e-05, + "loss": 0.3809, + "step": 20400 + }, + { + "epoch": 5.089775561097257, + "grad_norm": 6.33188533782959, + "learning_rate": 1.491271820448878e-05, + "loss": 0.3553, + "step": 20410 + }, + { + "epoch": 5.092269326683292, + "grad_norm": 9.772364616394043, + "learning_rate": 1.4910224438902745e-05, + "loss": 0.4478, + "step": 20420 + }, + { + "epoch": 5.094763092269327, + "grad_norm": 8.910597801208496, + "learning_rate": 1.490773067331671e-05, + "loss": 0.3909, + "step": 20430 + }, + { + "epoch": 5.097256857855362, + "grad_norm": 5.329483509063721, + "learning_rate": 1.4905236907730674e-05, + "loss": 0.3552, + "step": 20440 + }, + { + "epoch": 5.0997506234413965, + "grad_norm": 8.715051651000977, + "learning_rate": 1.490274314214464e-05, + "loss": 0.3806, + "step": 20450 + }, + { + "epoch": 5.102244389027431, + "grad_norm": 4.4914422035217285, + "learning_rate": 1.4900249376558604e-05, + "loss": 0.3125, + "step": 20460 + }, + { + "epoch": 5.104738154613466, + "grad_norm": 8.738079071044922, + "learning_rate": 1.4897755610972571e-05, + "loss": 0.3728, + "step": 20470 + }, + { + "epoch": 5.107231920199501, + "grad_norm": 6.785305023193359, + "learning_rate": 1.4895261845386535e-05, + "loss": 0.387, + "step": 20480 + }, + { + "epoch": 5.109725685785536, + "grad_norm": 6.179448127746582, + "learning_rate": 1.4892768079800499e-05, + "loss": 0.3828, + "step": 20490 + }, + { + "epoch": 5.112219451371571, + "grad_norm": 8.303686141967773, + "learning_rate": 1.4890274314214466e-05, + "loss": 0.4654, + "step": 20500 + }, + { + "epoch": 5.114713216957606, + "grad_norm": 5.6898956298828125, + "learning_rate": 1.4887780548628429e-05, + "loss": 0.3792, + "step": 20510 + }, + { + "epoch": 5.117206982543641, + "grad_norm": 7.065920829772949, + "learning_rate": 1.4885286783042394e-05, + "loss": 0.4403, + "step": 20520 + }, + { + "epoch": 5.1197007481296755, + "grad_norm": 7.726041316986084, + "learning_rate": 1.4882793017456361e-05, + "loss": 0.4045, + "step": 20530 + }, + { + "epoch": 5.12219451371571, + "grad_norm": 7.321938991546631, + "learning_rate": 1.4880299251870325e-05, + "loss": 0.365, + "step": 20540 + }, + { + "epoch": 5.124688279301745, + "grad_norm": 5.387010097503662, + "learning_rate": 1.4877805486284292e-05, + "loss": 0.4085, + "step": 20550 + }, + { + "epoch": 5.127182044887781, + "grad_norm": 5.158405303955078, + "learning_rate": 1.4875311720698256e-05, + "loss": 0.3001, + "step": 20560 + }, + { + "epoch": 5.129675810473816, + "grad_norm": 6.263799667358398, + "learning_rate": 1.487281795511222e-05, + "loss": 0.3872, + "step": 20570 + }, + { + "epoch": 5.132169576059851, + "grad_norm": 6.450216293334961, + "learning_rate": 1.4870324189526186e-05, + "loss": 0.3789, + "step": 20580 + }, + { + "epoch": 5.134663341645886, + "grad_norm": 7.4370198249816895, + "learning_rate": 1.486783042394015e-05, + "loss": 0.3508, + "step": 20590 + }, + { + "epoch": 5.13715710723192, + "grad_norm": 7.416337013244629, + "learning_rate": 1.4865336658354117e-05, + "loss": 0.3011, + "step": 20600 + }, + { + "epoch": 5.139650872817955, + "grad_norm": 7.354401588439941, + "learning_rate": 1.486284289276808e-05, + "loss": 0.3835, + "step": 20610 + }, + { + "epoch": 5.14214463840399, + "grad_norm": 8.334373474121094, + "learning_rate": 1.4860349127182046e-05, + "loss": 0.3471, + "step": 20620 + }, + { + "epoch": 5.144638403990025, + "grad_norm": 6.268391132354736, + "learning_rate": 1.4857855361596011e-05, + "loss": 0.4272, + "step": 20630 + }, + { + "epoch": 5.14713216957606, + "grad_norm": 4.680897235870361, + "learning_rate": 1.4855361596009976e-05, + "loss": 0.4065, + "step": 20640 + }, + { + "epoch": 5.149625935162095, + "grad_norm": 4.887267112731934, + "learning_rate": 1.485286783042394e-05, + "loss": 0.4085, + "step": 20650 + }, + { + "epoch": 5.15211970074813, + "grad_norm": 6.771895408630371, + "learning_rate": 1.4850374064837907e-05, + "loss": 0.3727, + "step": 20660 + }, + { + "epoch": 5.1546134663341645, + "grad_norm": 7.750133037567139, + "learning_rate": 1.484788029925187e-05, + "loss": 0.321, + "step": 20670 + }, + { + "epoch": 5.157107231920199, + "grad_norm": 4.861640930175781, + "learning_rate": 1.4845386533665838e-05, + "loss": 0.4132, + "step": 20680 + }, + { + "epoch": 5.159600997506234, + "grad_norm": 4.334446907043457, + "learning_rate": 1.4842892768079801e-05, + "loss": 0.3547, + "step": 20690 + }, + { + "epoch": 5.162094763092269, + "grad_norm": 5.8802618980407715, + "learning_rate": 1.4840399002493767e-05, + "loss": 0.4131, + "step": 20700 + }, + { + "epoch": 5.164588528678304, + "grad_norm": 7.876018524169922, + "learning_rate": 1.4837905236907732e-05, + "loss": 0.3721, + "step": 20710 + }, + { + "epoch": 5.167082294264339, + "grad_norm": 6.277840614318848, + "learning_rate": 1.4835411471321697e-05, + "loss": 0.4001, + "step": 20720 + }, + { + "epoch": 5.169576059850374, + "grad_norm": 6.03588342666626, + "learning_rate": 1.4832917705735661e-05, + "loss": 0.3861, + "step": 20730 + }, + { + "epoch": 5.172069825436409, + "grad_norm": 17.586366653442383, + "learning_rate": 1.4830423940149628e-05, + "loss": 0.3589, + "step": 20740 + }, + { + "epoch": 5.174563591022444, + "grad_norm": 5.083337783813477, + "learning_rate": 1.4827930174563592e-05, + "loss": 0.3565, + "step": 20750 + }, + { + "epoch": 5.177057356608479, + "grad_norm": 7.520929336547852, + "learning_rate": 1.4825436408977559e-05, + "loss": 0.3924, + "step": 20760 + }, + { + "epoch": 5.179551122194514, + "grad_norm": 10.510026931762695, + "learning_rate": 1.4822942643391522e-05, + "loss": 0.3726, + "step": 20770 + }, + { + "epoch": 5.182044887780549, + "grad_norm": 7.425801753997803, + "learning_rate": 1.4820448877805487e-05, + "loss": 0.3746, + "step": 20780 + }, + { + "epoch": 5.184538653366584, + "grad_norm": 5.459587574005127, + "learning_rate": 1.4817955112219453e-05, + "loss": 0.3441, + "step": 20790 + }, + { + "epoch": 5.187032418952619, + "grad_norm": 5.989594459533691, + "learning_rate": 1.4815461346633418e-05, + "loss": 0.43, + "step": 20800 + }, + { + "epoch": 5.1895261845386536, + "grad_norm": 4.831539154052734, + "learning_rate": 1.4812967581047383e-05, + "loss": 0.4618, + "step": 20810 + }, + { + "epoch": 5.192019950124688, + "grad_norm": 5.510180473327637, + "learning_rate": 1.4810473815461349e-05, + "loss": 0.3506, + "step": 20820 + }, + { + "epoch": 5.194513715710723, + "grad_norm": 6.887777805328369, + "learning_rate": 1.4807980049875312e-05, + "loss": 0.3266, + "step": 20830 + }, + { + "epoch": 5.197007481296758, + "grad_norm": 6.549706935882568, + "learning_rate": 1.480548628428928e-05, + "loss": 0.4234, + "step": 20840 + }, + { + "epoch": 5.199501246882793, + "grad_norm": 4.920103073120117, + "learning_rate": 1.4802992518703243e-05, + "loss": 0.3687, + "step": 20850 + }, + { + "epoch": 5.201995012468828, + "grad_norm": 6.742281436920166, + "learning_rate": 1.4800498753117207e-05, + "loss": 0.4335, + "step": 20860 + }, + { + "epoch": 5.204488778054863, + "grad_norm": 11.688139915466309, + "learning_rate": 1.4798004987531174e-05, + "loss": 0.3504, + "step": 20870 + }, + { + "epoch": 5.206982543640898, + "grad_norm": 7.675383567810059, + "learning_rate": 1.4795511221945139e-05, + "loss": 0.3989, + "step": 20880 + }, + { + "epoch": 5.2094763092269325, + "grad_norm": 5.377547740936279, + "learning_rate": 1.4793017456359104e-05, + "loss": 0.4201, + "step": 20890 + }, + { + "epoch": 5.211970074812967, + "grad_norm": 6.458754539489746, + "learning_rate": 1.479052369077307e-05, + "loss": 0.4486, + "step": 20900 + }, + { + "epoch": 5.214463840399002, + "grad_norm": 6.1860551834106445, + "learning_rate": 1.4788029925187033e-05, + "loss": 0.3515, + "step": 20910 + }, + { + "epoch": 5.216957605985037, + "grad_norm": 5.372089862823486, + "learning_rate": 1.4785536159601e-05, + "loss": 0.3294, + "step": 20920 + }, + { + "epoch": 5.219451371571072, + "grad_norm": 7.561584949493408, + "learning_rate": 1.4783042394014964e-05, + "loss": 0.3811, + "step": 20930 + }, + { + "epoch": 5.221945137157107, + "grad_norm": 7.178483963012695, + "learning_rate": 1.4780548628428927e-05, + "loss": 0.4044, + "step": 20940 + }, + { + "epoch": 5.224438902743142, + "grad_norm": 4.487502098083496, + "learning_rate": 1.4778054862842894e-05, + "loss": 0.334, + "step": 20950 + }, + { + "epoch": 5.2269326683291775, + "grad_norm": 6.48040246963501, + "learning_rate": 1.4775561097256858e-05, + "loss": 0.3088, + "step": 20960 + }, + { + "epoch": 5.229426433915212, + "grad_norm": 6.952301979064941, + "learning_rate": 1.4773067331670825e-05, + "loss": 0.4753, + "step": 20970 + }, + { + "epoch": 5.231920199501247, + "grad_norm": 5.854089260101318, + "learning_rate": 1.4770573566084789e-05, + "loss": 0.3515, + "step": 20980 + }, + { + "epoch": 5.234413965087282, + "grad_norm": 5.807803630828857, + "learning_rate": 1.4768079800498754e-05, + "loss": 0.4071, + "step": 20990 + }, + { + "epoch": 5.236907730673317, + "grad_norm": 8.918316841125488, + "learning_rate": 1.4765586034912721e-05, + "loss": 0.3717, + "step": 21000 + }, + { + "epoch": 5.239401496259352, + "grad_norm": 5.254445552825928, + "learning_rate": 1.4763092269326684e-05, + "loss": 0.3259, + "step": 21010 + }, + { + "epoch": 5.241895261845387, + "grad_norm": 5.781581401824951, + "learning_rate": 1.4760598503740648e-05, + "loss": 0.3459, + "step": 21020 + }, + { + "epoch": 5.2443890274314215, + "grad_norm": 5.134096622467041, + "learning_rate": 1.4758104738154615e-05, + "loss": 0.377, + "step": 21030 + }, + { + "epoch": 5.246882793017456, + "grad_norm": 3.860678195953369, + "learning_rate": 1.4755610972568579e-05, + "loss": 0.3596, + "step": 21040 + }, + { + "epoch": 5.249376558603491, + "grad_norm": 5.5030083656311035, + "learning_rate": 1.4753117206982546e-05, + "loss": 0.3723, + "step": 21050 + }, + { + "epoch": 5.251870324189526, + "grad_norm": 8.171494483947754, + "learning_rate": 1.475062344139651e-05, + "loss": 0.3947, + "step": 21060 + }, + { + "epoch": 5.254364089775561, + "grad_norm": 5.602912425994873, + "learning_rate": 1.4748129675810475e-05, + "loss": 0.4279, + "step": 21070 + }, + { + "epoch": 5.256857855361596, + "grad_norm": 6.1152472496032715, + "learning_rate": 1.474563591022444e-05, + "loss": 0.3779, + "step": 21080 + }, + { + "epoch": 5.259351620947631, + "grad_norm": 5.26813268661499, + "learning_rate": 1.4743142144638405e-05, + "loss": 0.3819, + "step": 21090 + }, + { + "epoch": 5.261845386533666, + "grad_norm": 4.3624491691589355, + "learning_rate": 1.474064837905237e-05, + "loss": 0.3442, + "step": 21100 + }, + { + "epoch": 5.2643391521197005, + "grad_norm": 5.86679220199585, + "learning_rate": 1.4738154613466336e-05, + "loss": 0.3749, + "step": 21110 + }, + { + "epoch": 5.266832917705735, + "grad_norm": 7.071092128753662, + "learning_rate": 1.47356608478803e-05, + "loss": 0.3797, + "step": 21120 + }, + { + "epoch": 5.26932668329177, + "grad_norm": 5.458619117736816, + "learning_rate": 1.4733167082294266e-05, + "loss": 0.4415, + "step": 21130 + }, + { + "epoch": 5.271820448877805, + "grad_norm": 5.8463544845581055, + "learning_rate": 1.473067331670823e-05, + "loss": 0.3957, + "step": 21140 + }, + { + "epoch": 5.274314214463841, + "grad_norm": 9.386983871459961, + "learning_rate": 1.4728179551122195e-05, + "loss": 0.3172, + "step": 21150 + }, + { + "epoch": 5.276807980049876, + "grad_norm": 5.963527679443359, + "learning_rate": 1.472568578553616e-05, + "loss": 0.3782, + "step": 21160 + }, + { + "epoch": 5.279301745635911, + "grad_norm": 8.052948951721191, + "learning_rate": 1.4723192019950126e-05, + "loss": 0.4264, + "step": 21170 + }, + { + "epoch": 5.2817955112219455, + "grad_norm": 9.773006439208984, + "learning_rate": 1.4720698254364091e-05, + "loss": 0.3145, + "step": 21180 + }, + { + "epoch": 5.28428927680798, + "grad_norm": 6.8744330406188965, + "learning_rate": 1.4718204488778057e-05, + "loss": 0.3495, + "step": 21190 + }, + { + "epoch": 5.286783042394015, + "grad_norm": 4.006411075592041, + "learning_rate": 1.471571072319202e-05, + "loss": 0.3099, + "step": 21200 + }, + { + "epoch": 5.28927680798005, + "grad_norm": 5.908966541290283, + "learning_rate": 1.4713216957605987e-05, + "loss": 0.4145, + "step": 21210 + }, + { + "epoch": 5.291770573566085, + "grad_norm": 5.07480001449585, + "learning_rate": 1.4710723192019951e-05, + "loss": 0.3299, + "step": 21220 + }, + { + "epoch": 5.29426433915212, + "grad_norm": 5.2527995109558105, + "learning_rate": 1.4708229426433916e-05, + "loss": 0.4364, + "step": 21230 + }, + { + "epoch": 5.296758104738155, + "grad_norm": 4.9071526527404785, + "learning_rate": 1.4705735660847882e-05, + "loss": 0.4228, + "step": 21240 + }, + { + "epoch": 5.2992518703241895, + "grad_norm": 7.707085609436035, + "learning_rate": 1.4703241895261847e-05, + "loss": 0.3715, + "step": 21250 + }, + { + "epoch": 5.301745635910224, + "grad_norm": 10.137189865112305, + "learning_rate": 1.4700748129675812e-05, + "loss": 0.3773, + "step": 21260 + }, + { + "epoch": 5.304239401496259, + "grad_norm": 6.27385950088501, + "learning_rate": 1.4698254364089777e-05, + "loss": 0.4286, + "step": 21270 + }, + { + "epoch": 5.306733167082294, + "grad_norm": 6.91831636428833, + "learning_rate": 1.4695760598503741e-05, + "loss": 0.3541, + "step": 21280 + }, + { + "epoch": 5.309226932668329, + "grad_norm": 4.474343299865723, + "learning_rate": 1.4693266832917708e-05, + "loss": 0.341, + "step": 21290 + }, + { + "epoch": 5.311720698254364, + "grad_norm": 6.206080913543701, + "learning_rate": 1.4690773067331672e-05, + "loss": 0.386, + "step": 21300 + }, + { + "epoch": 5.314214463840399, + "grad_norm": 5.673043251037598, + "learning_rate": 1.4688279301745639e-05, + "loss": 0.3751, + "step": 21310 + }, + { + "epoch": 5.316708229426434, + "grad_norm": 6.9373955726623535, + "learning_rate": 1.4685785536159602e-05, + "loss": 0.4205, + "step": 21320 + }, + { + "epoch": 5.3192019950124685, + "grad_norm": 5.714535713195801, + "learning_rate": 1.4683291770573566e-05, + "loss": 0.4316, + "step": 21330 + }, + { + "epoch": 5.321695760598503, + "grad_norm": 5.814162254333496, + "learning_rate": 1.4680798004987533e-05, + "loss": 0.4013, + "step": 21340 + }, + { + "epoch": 5.324189526184538, + "grad_norm": 5.710855484008789, + "learning_rate": 1.4678304239401498e-05, + "loss": 0.381, + "step": 21350 + }, + { + "epoch": 5.326683291770574, + "grad_norm": 4.715007781982422, + "learning_rate": 1.4675810473815462e-05, + "loss": 0.3644, + "step": 21360 + }, + { + "epoch": 5.329177057356609, + "grad_norm": 6.894988536834717, + "learning_rate": 1.4673316708229429e-05, + "loss": 0.3702, + "step": 21370 + }, + { + "epoch": 5.331670822942644, + "grad_norm": 5.765067100524902, + "learning_rate": 1.4670822942643392e-05, + "loss": 0.3833, + "step": 21380 + }, + { + "epoch": 5.334164588528679, + "grad_norm": 5.794302940368652, + "learning_rate": 1.466832917705736e-05, + "loss": 0.3816, + "step": 21390 + }, + { + "epoch": 5.3366583541147135, + "grad_norm": 7.873318195343018, + "learning_rate": 1.4665835411471323e-05, + "loss": 0.4156, + "step": 21400 + }, + { + "epoch": 5.339152119700748, + "grad_norm": 6.067409038543701, + "learning_rate": 1.4663341645885287e-05, + "loss": 0.3925, + "step": 21410 + }, + { + "epoch": 5.341645885286783, + "grad_norm": 4.952661514282227, + "learning_rate": 1.4660847880299254e-05, + "loss": 0.3794, + "step": 21420 + }, + { + "epoch": 5.344139650872818, + "grad_norm": 8.906375885009766, + "learning_rate": 1.4658354114713217e-05, + "loss": 0.4743, + "step": 21430 + }, + { + "epoch": 5.346633416458853, + "grad_norm": 5.5464630126953125, + "learning_rate": 1.4655860349127183e-05, + "loss": 0.3503, + "step": 21440 + }, + { + "epoch": 5.349127182044888, + "grad_norm": 4.894265651702881, + "learning_rate": 1.4653366583541148e-05, + "loss": 0.3889, + "step": 21450 + }, + { + "epoch": 5.351620947630923, + "grad_norm": 5.675085544586182, + "learning_rate": 1.4650872817955113e-05, + "loss": 0.4552, + "step": 21460 + }, + { + "epoch": 5.3541147132169575, + "grad_norm": 5.673407077789307, + "learning_rate": 1.464837905236908e-05, + "loss": 0.4368, + "step": 21470 + }, + { + "epoch": 5.356608478802992, + "grad_norm": 6.846253395080566, + "learning_rate": 1.4645885286783044e-05, + "loss": 0.3714, + "step": 21480 + }, + { + "epoch": 5.359102244389027, + "grad_norm": 7.071938991546631, + "learning_rate": 1.4643391521197007e-05, + "loss": 0.453, + "step": 21490 + }, + { + "epoch": 5.361596009975062, + "grad_norm": 5.694882392883301, + "learning_rate": 1.4640897755610974e-05, + "loss": 0.3199, + "step": 21500 + }, + { + "epoch": 5.364089775561097, + "grad_norm": 6.255917072296143, + "learning_rate": 1.4638403990024938e-05, + "loss": 0.3442, + "step": 21510 + }, + { + "epoch": 5.366583541147132, + "grad_norm": 5.694520950317383, + "learning_rate": 1.4635910224438903e-05, + "loss": 0.366, + "step": 21520 + }, + { + "epoch": 5.369077306733167, + "grad_norm": 4.848544597625732, + "learning_rate": 1.4633416458852869e-05, + "loss": 0.3746, + "step": 21530 + }, + { + "epoch": 5.371571072319202, + "grad_norm": 7.683907508850098, + "learning_rate": 1.4630922693266834e-05, + "loss": 0.401, + "step": 21540 + }, + { + "epoch": 5.374064837905237, + "grad_norm": 7.421435832977295, + "learning_rate": 1.46284289276808e-05, + "loss": 0.4114, + "step": 21550 + }, + { + "epoch": 5.376558603491272, + "grad_norm": 5.23777437210083, + "learning_rate": 1.4625935162094765e-05, + "loss": 0.3773, + "step": 21560 + }, + { + "epoch": 5.379052369077307, + "grad_norm": 10.150872230529785, + "learning_rate": 1.4623441396508728e-05, + "loss": 0.4342, + "step": 21570 + }, + { + "epoch": 5.381546134663342, + "grad_norm": 6.791860103607178, + "learning_rate": 1.4621197007481298e-05, + "loss": 0.3954, + "step": 21580 + }, + { + "epoch": 5.384039900249377, + "grad_norm": 7.470459461212158, + "learning_rate": 1.4618703241895262e-05, + "loss": 0.3082, + "step": 21590 + }, + { + "epoch": 5.386533665835412, + "grad_norm": 7.525590419769287, + "learning_rate": 1.4616209476309229e-05, + "loss": 0.4149, + "step": 21600 + }, + { + "epoch": 5.389027431421447, + "grad_norm": 8.42238998413086, + "learning_rate": 1.4613715710723192e-05, + "loss": 0.3672, + "step": 21610 + }, + { + "epoch": 5.3915211970074814, + "grad_norm": 5.296651363372803, + "learning_rate": 1.4611221945137158e-05, + "loss": 0.4263, + "step": 21620 + }, + { + "epoch": 5.394014962593516, + "grad_norm": 4.646917343139648, + "learning_rate": 1.4608728179551123e-05, + "loss": 0.357, + "step": 21630 + }, + { + "epoch": 5.396508728179551, + "grad_norm": 7.128079414367676, + "learning_rate": 1.4606234413965088e-05, + "loss": 0.3383, + "step": 21640 + }, + { + "epoch": 5.399002493765586, + "grad_norm": 6.30106782913208, + "learning_rate": 1.4603740648379054e-05, + "loss": 0.4503, + "step": 21650 + }, + { + "epoch": 5.401496259351621, + "grad_norm": 3.676443576812744, + "learning_rate": 1.4601246882793019e-05, + "loss": 0.3028, + "step": 21660 + }, + { + "epoch": 5.403990024937656, + "grad_norm": 6.699869155883789, + "learning_rate": 1.4598753117206983e-05, + "loss": 0.3205, + "step": 21670 + }, + { + "epoch": 5.406483790523691, + "grad_norm": 7.279172897338867, + "learning_rate": 1.459625935162095e-05, + "loss": 0.3449, + "step": 21680 + }, + { + "epoch": 5.4089775561097255, + "grad_norm": 5.297791004180908, + "learning_rate": 1.4593765586034913e-05, + "loss": 0.3866, + "step": 21690 + }, + { + "epoch": 5.41147132169576, + "grad_norm": 4.082034587860107, + "learning_rate": 1.4591271820448879e-05, + "loss": 0.3785, + "step": 21700 + }, + { + "epoch": 5.413965087281795, + "grad_norm": 4.643633842468262, + "learning_rate": 1.4588778054862844e-05, + "loss": 0.3734, + "step": 21710 + }, + { + "epoch": 5.41645885286783, + "grad_norm": 5.9492597579956055, + "learning_rate": 1.458628428927681e-05, + "loss": 0.4107, + "step": 21720 + }, + { + "epoch": 5.418952618453865, + "grad_norm": 6.517092704772949, + "learning_rate": 1.4583790523690774e-05, + "loss": 0.4124, + "step": 21730 + }, + { + "epoch": 5.4214463840399, + "grad_norm": 5.166103363037109, + "learning_rate": 1.458129675810474e-05, + "loss": 0.3856, + "step": 21740 + }, + { + "epoch": 5.423940149625935, + "grad_norm": 7.804215431213379, + "learning_rate": 1.4578802992518703e-05, + "loss": 0.3939, + "step": 21750 + }, + { + "epoch": 5.42643391521197, + "grad_norm": 9.893004417419434, + "learning_rate": 1.457630922693267e-05, + "loss": 0.3276, + "step": 21760 + }, + { + "epoch": 5.428927680798005, + "grad_norm": 6.463005065917969, + "learning_rate": 1.4573815461346634e-05, + "loss": 0.3526, + "step": 21770 + }, + { + "epoch": 5.43142144638404, + "grad_norm": 6.165153980255127, + "learning_rate": 1.4571321695760601e-05, + "loss": 0.4449, + "step": 21780 + }, + { + "epoch": 5.433915211970075, + "grad_norm": 4.93517541885376, + "learning_rate": 1.4568827930174565e-05, + "loss": 0.328, + "step": 21790 + }, + { + "epoch": 5.43640897755611, + "grad_norm": 5.643067359924316, + "learning_rate": 1.456633416458853e-05, + "loss": 0.4141, + "step": 21800 + }, + { + "epoch": 5.438902743142145, + "grad_norm": 7.786581039428711, + "learning_rate": 1.4563840399002495e-05, + "loss": 0.4098, + "step": 21810 + }, + { + "epoch": 5.44139650872818, + "grad_norm": 4.224101543426514, + "learning_rate": 1.456134663341646e-05, + "loss": 0.4034, + "step": 21820 + }, + { + "epoch": 5.443890274314215, + "grad_norm": 8.033632278442383, + "learning_rate": 1.4558852867830424e-05, + "loss": 0.3504, + "step": 21830 + }, + { + "epoch": 5.446384039900249, + "grad_norm": 6.644708156585693, + "learning_rate": 1.4556359102244391e-05, + "loss": 0.335, + "step": 21840 + }, + { + "epoch": 5.448877805486284, + "grad_norm": 8.7880859375, + "learning_rate": 1.4553865336658355e-05, + "loss": 0.3907, + "step": 21850 + }, + { + "epoch": 5.451371571072319, + "grad_norm": 8.938055038452148, + "learning_rate": 1.4551371571072322e-05, + "loss": 0.3681, + "step": 21860 + }, + { + "epoch": 5.453865336658354, + "grad_norm": 5.602493762969971, + "learning_rate": 1.4548877805486285e-05, + "loss": 0.4231, + "step": 21870 + }, + { + "epoch": 5.456359102244389, + "grad_norm": 6.839218616485596, + "learning_rate": 1.4546384039900249e-05, + "loss": 0.3974, + "step": 21880 + }, + { + "epoch": 5.458852867830424, + "grad_norm": 6.831507682800293, + "learning_rate": 1.4543890274314216e-05, + "loss": 0.4212, + "step": 21890 + }, + { + "epoch": 5.461346633416459, + "grad_norm": 6.885910987854004, + "learning_rate": 1.4541396508728181e-05, + "loss": 0.3996, + "step": 21900 + }, + { + "epoch": 5.4638403990024935, + "grad_norm": 9.154892921447754, + "learning_rate": 1.4538902743142145e-05, + "loss": 0.4248, + "step": 21910 + }, + { + "epoch": 5.466334164588528, + "grad_norm": 5.8164591789245605, + "learning_rate": 1.4536408977556112e-05, + "loss": 0.3866, + "step": 21920 + }, + { + "epoch": 5.468827930174563, + "grad_norm": 7.783123016357422, + "learning_rate": 1.4533915211970076e-05, + "loss": 0.3988, + "step": 21930 + }, + { + "epoch": 5.471321695760598, + "grad_norm": 8.292793273925781, + "learning_rate": 1.4531421446384043e-05, + "loss": 0.5072, + "step": 21940 + }, + { + "epoch": 5.473815461346634, + "grad_norm": 8.550527572631836, + "learning_rate": 1.4528927680798006e-05, + "loss": 0.4445, + "step": 21950 + }, + { + "epoch": 5.476309226932669, + "grad_norm": 7.110447883605957, + "learning_rate": 1.452643391521197e-05, + "loss": 0.3796, + "step": 21960 + }, + { + "epoch": 5.478802992518704, + "grad_norm": 5.753549575805664, + "learning_rate": 1.4523940149625937e-05, + "loss": 0.3745, + "step": 21970 + }, + { + "epoch": 5.4812967581047385, + "grad_norm": 12.874857902526855, + "learning_rate": 1.45214463840399e-05, + "loss": 0.3587, + "step": 21980 + }, + { + "epoch": 5.483790523690773, + "grad_norm": 7.264901161193848, + "learning_rate": 1.4518952618453867e-05, + "loss": 0.3727, + "step": 21990 + }, + { + "epoch": 5.486284289276808, + "grad_norm": 6.658206939697266, + "learning_rate": 1.4516458852867831e-05, + "loss": 0.4147, + "step": 22000 + }, + { + "epoch": 5.488778054862843, + "grad_norm": 8.003405570983887, + "learning_rate": 1.4513965087281796e-05, + "loss": 0.4492, + "step": 22010 + }, + { + "epoch": 5.491271820448878, + "grad_norm": 8.046963691711426, + "learning_rate": 1.4511471321695763e-05, + "loss": 0.3884, + "step": 22020 + }, + { + "epoch": 5.493765586034913, + "grad_norm": 4.5463385581970215, + "learning_rate": 1.4508977556109727e-05, + "loss": 0.3515, + "step": 22030 + }, + { + "epoch": 5.496259351620948, + "grad_norm": 7.268316268920898, + "learning_rate": 1.450648379052369e-05, + "loss": 0.3548, + "step": 22040 + }, + { + "epoch": 5.498753117206983, + "grad_norm": 6.258470058441162, + "learning_rate": 1.4503990024937658e-05, + "loss": 0.3813, + "step": 22050 + }, + { + "epoch": 5.501246882793017, + "grad_norm": 5.818022727966309, + "learning_rate": 1.4501496259351621e-05, + "loss": 0.4218, + "step": 22060 + }, + { + "epoch": 5.503740648379052, + "grad_norm": 5.728906631469727, + "learning_rate": 1.4499002493765588e-05, + "loss": 0.3664, + "step": 22070 + }, + { + "epoch": 5.506234413965087, + "grad_norm": 6.165427207946777, + "learning_rate": 1.4496508728179552e-05, + "loss": 0.3949, + "step": 22080 + }, + { + "epoch": 5.508728179551122, + "grad_norm": 6.8010783195495605, + "learning_rate": 1.4494014962593517e-05, + "loss": 0.3695, + "step": 22090 + }, + { + "epoch": 5.511221945137157, + "grad_norm": 5.022993087768555, + "learning_rate": 1.4491521197007482e-05, + "loss": 0.2954, + "step": 22100 + }, + { + "epoch": 5.513715710723192, + "grad_norm": 8.099823951721191, + "learning_rate": 1.4489027431421448e-05, + "loss": 0.3272, + "step": 22110 + }, + { + "epoch": 5.516209476309227, + "grad_norm": 7.62611198425293, + "learning_rate": 1.4486533665835411e-05, + "loss": 0.4664, + "step": 22120 + }, + { + "epoch": 5.5187032418952615, + "grad_norm": 9.035438537597656, + "learning_rate": 1.4484039900249378e-05, + "loss": 0.4082, + "step": 22130 + }, + { + "epoch": 5.521197007481296, + "grad_norm": 5.160449028015137, + "learning_rate": 1.4481546134663342e-05, + "loss": 0.4821, + "step": 22140 + }, + { + "epoch": 5.523690773067331, + "grad_norm": 5.530974388122559, + "learning_rate": 1.4479052369077309e-05, + "loss": 0.4418, + "step": 22150 + }, + { + "epoch": 5.526184538653366, + "grad_norm": 7.125683784484863, + "learning_rate": 1.4476558603491273e-05, + "loss": 0.358, + "step": 22160 + }, + { + "epoch": 5.528678304239402, + "grad_norm": 5.76719331741333, + "learning_rate": 1.4474064837905238e-05, + "loss": 0.3801, + "step": 22170 + }, + { + "epoch": 5.531172069825437, + "grad_norm": 5.448780536651611, + "learning_rate": 1.4471571072319203e-05, + "loss": 0.3539, + "step": 22180 + }, + { + "epoch": 5.533665835411472, + "grad_norm": 5.663511753082275, + "learning_rate": 1.4469077306733169e-05, + "loss": 0.451, + "step": 22190 + }, + { + "epoch": 5.5361596009975065, + "grad_norm": 5.449460983276367, + "learning_rate": 1.4466583541147132e-05, + "loss": 0.3879, + "step": 22200 + }, + { + "epoch": 5.538653366583541, + "grad_norm": 6.016726493835449, + "learning_rate": 1.44640897755611e-05, + "loss": 0.4061, + "step": 22210 + }, + { + "epoch": 5.541147132169576, + "grad_norm": 5.542261123657227, + "learning_rate": 1.4461596009975063e-05, + "loss": 0.3877, + "step": 22220 + }, + { + "epoch": 5.543640897755611, + "grad_norm": 8.105876922607422, + "learning_rate": 1.445910224438903e-05, + "loss": 0.3574, + "step": 22230 + }, + { + "epoch": 5.546134663341646, + "grad_norm": 6.1067914962768555, + "learning_rate": 1.4456608478802993e-05, + "loss": 0.4374, + "step": 22240 + }, + { + "epoch": 5.548628428927681, + "grad_norm": 7.3289265632629395, + "learning_rate": 1.4454114713216959e-05, + "loss": 0.3598, + "step": 22250 + }, + { + "epoch": 5.551122194513716, + "grad_norm": 6.727115154266357, + "learning_rate": 1.4451620947630924e-05, + "loss": 0.4083, + "step": 22260 + }, + { + "epoch": 5.553615960099751, + "grad_norm": 9.422988891601562, + "learning_rate": 1.444912718204489e-05, + "loss": 0.4603, + "step": 22270 + }, + { + "epoch": 5.556109725685785, + "grad_norm": 8.968878746032715, + "learning_rate": 1.4446633416458855e-05, + "loss": 0.4231, + "step": 22280 + }, + { + "epoch": 5.55860349127182, + "grad_norm": 5.541744232177734, + "learning_rate": 1.444413965087282e-05, + "loss": 0.3105, + "step": 22290 + }, + { + "epoch": 5.561097256857855, + "grad_norm": 5.500674724578857, + "learning_rate": 1.4441645885286784e-05, + "loss": 0.3357, + "step": 22300 + }, + { + "epoch": 5.56359102244389, + "grad_norm": 9.031492233276367, + "learning_rate": 1.443915211970075e-05, + "loss": 0.582, + "step": 22310 + }, + { + "epoch": 5.566084788029925, + "grad_norm": 9.218156814575195, + "learning_rate": 1.4436658354114714e-05, + "loss": 0.3842, + "step": 22320 + }, + { + "epoch": 5.56857855361596, + "grad_norm": 7.594570636749268, + "learning_rate": 1.4434164588528678e-05, + "loss": 0.4222, + "step": 22330 + }, + { + "epoch": 5.571072319201995, + "grad_norm": 5.0406270027160645, + "learning_rate": 1.4431670822942645e-05, + "loss": 0.4213, + "step": 22340 + }, + { + "epoch": 5.57356608478803, + "grad_norm": 5.456195831298828, + "learning_rate": 1.4429177057356608e-05, + "loss": 0.3552, + "step": 22350 + }, + { + "epoch": 5.576059850374065, + "grad_norm": 6.551260948181152, + "learning_rate": 1.4426683291770575e-05, + "loss": 0.5219, + "step": 22360 + }, + { + "epoch": 5.5785536159601, + "grad_norm": 6.80320405960083, + "learning_rate": 1.442418952618454e-05, + "loss": 0.3416, + "step": 22370 + }, + { + "epoch": 5.581047381546135, + "grad_norm": 5.928180694580078, + "learning_rate": 1.4421695760598504e-05, + "loss": 0.3299, + "step": 22380 + }, + { + "epoch": 5.58354114713217, + "grad_norm": 4.945070266723633, + "learning_rate": 1.4419201995012471e-05, + "loss": 0.3556, + "step": 22390 + }, + { + "epoch": 5.586034912718205, + "grad_norm": 6.342123031616211, + "learning_rate": 1.4416708229426435e-05, + "loss": 0.427, + "step": 22400 + }, + { + "epoch": 5.58852867830424, + "grad_norm": 6.5954060554504395, + "learning_rate": 1.4414214463840399e-05, + "loss": 0.4065, + "step": 22410 + }, + { + "epoch": 5.5910224438902745, + "grad_norm": 5.893113136291504, + "learning_rate": 1.4411720698254366e-05, + "loss": 0.4155, + "step": 22420 + }, + { + "epoch": 5.593516209476309, + "grad_norm": 6.298617839813232, + "learning_rate": 1.440922693266833e-05, + "loss": 0.3652, + "step": 22430 + }, + { + "epoch": 5.596009975062344, + "grad_norm": 5.96094274520874, + "learning_rate": 1.4406733167082296e-05, + "loss": 0.4139, + "step": 22440 + }, + { + "epoch": 5.598503740648379, + "grad_norm": 5.680459022521973, + "learning_rate": 1.440423940149626e-05, + "loss": 0.3219, + "step": 22450 + }, + { + "epoch": 5.600997506234414, + "grad_norm": 6.752685546875, + "learning_rate": 1.4401745635910225e-05, + "loss": 0.3695, + "step": 22460 + }, + { + "epoch": 5.603491271820449, + "grad_norm": 6.382385730743408, + "learning_rate": 1.439925187032419e-05, + "loss": 0.3797, + "step": 22470 + }, + { + "epoch": 5.605985037406484, + "grad_norm": 7.606115341186523, + "learning_rate": 1.4396758104738156e-05, + "loss": 0.3709, + "step": 22480 + }, + { + "epoch": 5.6084788029925186, + "grad_norm": 9.14214038848877, + "learning_rate": 1.4394264339152123e-05, + "loss": 0.3548, + "step": 22490 + }, + { + "epoch": 5.610972568578553, + "grad_norm": 4.893841743469238, + "learning_rate": 1.4391770573566086e-05, + "loss": 0.389, + "step": 22500 + }, + { + "epoch": 5.613466334164588, + "grad_norm": 7.275263786315918, + "learning_rate": 1.438927680798005e-05, + "loss": 0.3461, + "step": 22510 + }, + { + "epoch": 5.615960099750623, + "grad_norm": 5.975040435791016, + "learning_rate": 1.4386783042394017e-05, + "loss": 0.4767, + "step": 22520 + }, + { + "epoch": 5.618453865336658, + "grad_norm": 6.58006477355957, + "learning_rate": 1.438428927680798e-05, + "loss": 0.3466, + "step": 22530 + }, + { + "epoch": 5.620947630922693, + "grad_norm": 5.844588756561279, + "learning_rate": 1.4381795511221946e-05, + "loss": 0.3944, + "step": 22540 + }, + { + "epoch": 5.623441396508728, + "grad_norm": 12.84826374053955, + "learning_rate": 1.4379301745635911e-05, + "loss": 0.3679, + "step": 22550 + }, + { + "epoch": 5.625935162094763, + "grad_norm": 5.966850757598877, + "learning_rate": 1.4376807980049877e-05, + "loss": 0.3632, + "step": 22560 + }, + { + "epoch": 5.628428927680798, + "grad_norm": 5.235140323638916, + "learning_rate": 1.4374314214463842e-05, + "loss": 0.3889, + "step": 22570 + }, + { + "epoch": 5.630922693266833, + "grad_norm": 5.83263635635376, + "learning_rate": 1.4371820448877807e-05, + "loss": 0.3315, + "step": 22580 + }, + { + "epoch": 5.633416458852868, + "grad_norm": 4.937515735626221, + "learning_rate": 1.436932668329177e-05, + "loss": 0.3964, + "step": 22590 + }, + { + "epoch": 5.635910224438903, + "grad_norm": 6.0770392417907715, + "learning_rate": 1.4366832917705738e-05, + "loss": 0.3959, + "step": 22600 + }, + { + "epoch": 5.638403990024938, + "grad_norm": 7.637709140777588, + "learning_rate": 1.4364339152119701e-05, + "loss": 0.402, + "step": 22610 + }, + { + "epoch": 5.640897755610973, + "grad_norm": 5.632735252380371, + "learning_rate": 1.4361845386533667e-05, + "loss": 0.3721, + "step": 22620 + }, + { + "epoch": 5.643391521197008, + "grad_norm": 7.393962860107422, + "learning_rate": 1.4359351620947632e-05, + "loss": 0.3827, + "step": 22630 + }, + { + "epoch": 5.6458852867830425, + "grad_norm": 7.217513561248779, + "learning_rate": 1.4356857855361597e-05, + "loss": 0.3942, + "step": 22640 + }, + { + "epoch": 5.648379052369077, + "grad_norm": 8.804829597473145, + "learning_rate": 1.4354364089775563e-05, + "loss": 0.4314, + "step": 22650 + }, + { + "epoch": 5.650872817955112, + "grad_norm": 9.382487297058105, + "learning_rate": 1.4351870324189528e-05, + "loss": 0.552, + "step": 22660 + }, + { + "epoch": 5.653366583541147, + "grad_norm": 4.75145959854126, + "learning_rate": 1.4349376558603492e-05, + "loss": 0.3351, + "step": 22670 + }, + { + "epoch": 5.655860349127182, + "grad_norm": 5.727991104125977, + "learning_rate": 1.4346882793017459e-05, + "loss": 0.3271, + "step": 22680 + }, + { + "epoch": 5.658354114713217, + "grad_norm": 6.325864791870117, + "learning_rate": 1.4344389027431422e-05, + "loss": 0.3485, + "step": 22690 + }, + { + "epoch": 5.660847880299252, + "grad_norm": 6.61782169342041, + "learning_rate": 1.4341895261845386e-05, + "loss": 0.3918, + "step": 22700 + }, + { + "epoch": 5.6633416458852865, + "grad_norm": 8.966527938842773, + "learning_rate": 1.4339401496259353e-05, + "loss": 0.4352, + "step": 22710 + }, + { + "epoch": 5.665835411471321, + "grad_norm": 3.9728758335113525, + "learning_rate": 1.4336907730673318e-05, + "loss": 0.3993, + "step": 22720 + }, + { + "epoch": 5.668329177057356, + "grad_norm": 4.767123222351074, + "learning_rate": 1.4334413965087283e-05, + "loss": 0.3053, + "step": 22730 + }, + { + "epoch": 5.670822942643391, + "grad_norm": 5.514127254486084, + "learning_rate": 1.4331920199501249e-05, + "loss": 0.3162, + "step": 22740 + }, + { + "epoch": 5.673316708229427, + "grad_norm": 5.728281497955322, + "learning_rate": 1.4329426433915212e-05, + "loss": 0.3697, + "step": 22750 + }, + { + "epoch": 5.675810473815462, + "grad_norm": 5.388779640197754, + "learning_rate": 1.432693266832918e-05, + "loss": 0.3806, + "step": 22760 + }, + { + "epoch": 5.678304239401497, + "grad_norm": 5.20200252532959, + "learning_rate": 1.4324438902743143e-05, + "loss": 0.3255, + "step": 22770 + }, + { + "epoch": 5.6807980049875315, + "grad_norm": 6.948160648345947, + "learning_rate": 1.432194513715711e-05, + "loss": 0.3598, + "step": 22780 + }, + { + "epoch": 5.683291770573566, + "grad_norm": 5.765408515930176, + "learning_rate": 1.4319451371571074e-05, + "loss": 0.4121, + "step": 22790 + }, + { + "epoch": 5.685785536159601, + "grad_norm": 11.699581146240234, + "learning_rate": 1.4316957605985037e-05, + "loss": 0.4094, + "step": 22800 + }, + { + "epoch": 5.688279301745636, + "grad_norm": 5.797543048858643, + "learning_rate": 1.4314463840399004e-05, + "loss": 0.3325, + "step": 22810 + }, + { + "epoch": 5.690773067331671, + "grad_norm": 5.791079044342041, + "learning_rate": 1.4311970074812968e-05, + "loss": 0.3589, + "step": 22820 + }, + { + "epoch": 5.693266832917706, + "grad_norm": 5.654359817504883, + "learning_rate": 1.4309476309226933e-05, + "loss": 0.362, + "step": 22830 + }, + { + "epoch": 5.695760598503741, + "grad_norm": 5.24976110458374, + "learning_rate": 1.43069825436409e-05, + "loss": 0.3756, + "step": 22840 + }, + { + "epoch": 5.698254364089776, + "grad_norm": 6.083651542663574, + "learning_rate": 1.4304488778054864e-05, + "loss": 0.3565, + "step": 22850 + }, + { + "epoch": 5.7007481296758105, + "grad_norm": 6.845718860626221, + "learning_rate": 1.430199501246883e-05, + "loss": 0.3706, + "step": 22860 + }, + { + "epoch": 5.703241895261845, + "grad_norm": 4.934425354003906, + "learning_rate": 1.4299501246882794e-05, + "loss": 0.3646, + "step": 22870 + }, + { + "epoch": 5.70573566084788, + "grad_norm": 9.184797286987305, + "learning_rate": 1.4297007481296758e-05, + "loss": 0.4083, + "step": 22880 + }, + { + "epoch": 5.708229426433915, + "grad_norm": 9.234002113342285, + "learning_rate": 1.4294513715710725e-05, + "loss": 0.3188, + "step": 22890 + }, + { + "epoch": 5.71072319201995, + "grad_norm": 7.473569393157959, + "learning_rate": 1.4292019950124689e-05, + "loss": 0.4069, + "step": 22900 + }, + { + "epoch": 5.713216957605985, + "grad_norm": 5.984492301940918, + "learning_rate": 1.4289526184538654e-05, + "loss": 0.3863, + "step": 22910 + }, + { + "epoch": 5.71571072319202, + "grad_norm": 8.733806610107422, + "learning_rate": 1.428703241895262e-05, + "loss": 0.4223, + "step": 22920 + }, + { + "epoch": 5.7182044887780545, + "grad_norm": 7.522699356079102, + "learning_rate": 1.4284538653366585e-05, + "loss": 0.402, + "step": 22930 + }, + { + "epoch": 5.720698254364089, + "grad_norm": 6.460349082946777, + "learning_rate": 1.428204488778055e-05, + "loss": 0.3046, + "step": 22940 + }, + { + "epoch": 5.723192019950124, + "grad_norm": 6.2640204429626465, + "learning_rate": 1.4279551122194515e-05, + "loss": 0.3377, + "step": 22950 + }, + { + "epoch": 5.725685785536159, + "grad_norm": 3.072329521179199, + "learning_rate": 1.4277057356608479e-05, + "loss": 0.3212, + "step": 22960 + }, + { + "epoch": 5.728179551122195, + "grad_norm": 5.633548259735107, + "learning_rate": 1.4274563591022446e-05, + "loss": 0.3203, + "step": 22970 + }, + { + "epoch": 5.73067331670823, + "grad_norm": 8.383296012878418, + "learning_rate": 1.427206982543641e-05, + "loss": 0.4422, + "step": 22980 + }, + { + "epoch": 5.733167082294265, + "grad_norm": 7.479531764984131, + "learning_rate": 1.4269576059850376e-05, + "loss": 0.3443, + "step": 22990 + }, + { + "epoch": 5.7356608478802995, + "grad_norm": 8.796760559082031, + "learning_rate": 1.426708229426434e-05, + "loss": 0.4097, + "step": 23000 + }, + { + "epoch": 5.738154613466334, + "grad_norm": 8.372130393981934, + "learning_rate": 1.4264588528678305e-05, + "loss": 0.4402, + "step": 23010 + }, + { + "epoch": 5.740648379052369, + "grad_norm": 9.008794784545898, + "learning_rate": 1.426209476309227e-05, + "loss": 0.4109, + "step": 23020 + }, + { + "epoch": 5.743142144638404, + "grad_norm": 8.82776927947998, + "learning_rate": 1.4259600997506236e-05, + "loss": 0.3458, + "step": 23030 + }, + { + "epoch": 5.745635910224439, + "grad_norm": 6.604305267333984, + "learning_rate": 1.42571072319202e-05, + "loss": 0.3969, + "step": 23040 + }, + { + "epoch": 5.748129675810474, + "grad_norm": 6.345452308654785, + "learning_rate": 1.4254613466334167e-05, + "loss": 0.3854, + "step": 23050 + }, + { + "epoch": 5.750623441396509, + "grad_norm": 8.280367851257324, + "learning_rate": 1.425211970074813e-05, + "loss": 0.395, + "step": 23060 + }, + { + "epoch": 5.753117206982544, + "grad_norm": 4.906089782714844, + "learning_rate": 1.4249625935162097e-05, + "loss": 0.3929, + "step": 23070 + }, + { + "epoch": 5.7556109725685785, + "grad_norm": 8.1076021194458, + "learning_rate": 1.424713216957606e-05, + "loss": 0.3963, + "step": 23080 + }, + { + "epoch": 5.758104738154613, + "grad_norm": 5.047501087188721, + "learning_rate": 1.4244638403990026e-05, + "loss": 0.354, + "step": 23090 + }, + { + "epoch": 5.760598503740648, + "grad_norm": 5.581216812133789, + "learning_rate": 1.4242144638403991e-05, + "loss": 0.3966, + "step": 23100 + }, + { + "epoch": 5.763092269326683, + "grad_norm": 5.1039347648620605, + "learning_rate": 1.4239650872817957e-05, + "loss": 0.3249, + "step": 23110 + }, + { + "epoch": 5.765586034912718, + "grad_norm": 7.597653388977051, + "learning_rate": 1.423715710723192e-05, + "loss": 0.369, + "step": 23120 + }, + { + "epoch": 5.768079800498753, + "grad_norm": 4.918848037719727, + "learning_rate": 1.4234663341645887e-05, + "loss": 0.394, + "step": 23130 + }, + { + "epoch": 5.770573566084788, + "grad_norm": 7.739204406738281, + "learning_rate": 1.4232169576059851e-05, + "loss": 0.3303, + "step": 23140 + }, + { + "epoch": 5.773067331670823, + "grad_norm": 6.075505256652832, + "learning_rate": 1.4229675810473818e-05, + "loss": 0.3693, + "step": 23150 + }, + { + "epoch": 5.775561097256858, + "grad_norm": 10.600066184997559, + "learning_rate": 1.4227182044887782e-05, + "loss": 0.3751, + "step": 23160 + }, + { + "epoch": 5.778054862842893, + "grad_norm": 7.901242733001709, + "learning_rate": 1.4224688279301745e-05, + "loss": 0.3912, + "step": 23170 + }, + { + "epoch": 5.780548628428928, + "grad_norm": 7.340981483459473, + "learning_rate": 1.4222194513715712e-05, + "loss": 0.437, + "step": 23180 + }, + { + "epoch": 5.783042394014963, + "grad_norm": 6.0289626121521, + "learning_rate": 1.4219700748129678e-05, + "loss": 0.4226, + "step": 23190 + }, + { + "epoch": 5.785536159600998, + "grad_norm": 6.679872989654541, + "learning_rate": 1.4217206982543641e-05, + "loss": 0.4471, + "step": 23200 + }, + { + "epoch": 5.788029925187033, + "grad_norm": 7.326256275177002, + "learning_rate": 1.4214713216957608e-05, + "loss": 0.39, + "step": 23210 + }, + { + "epoch": 5.7905236907730675, + "grad_norm": 5.042798042297363, + "learning_rate": 1.4212219451371572e-05, + "loss": 0.3997, + "step": 23220 + }, + { + "epoch": 5.793017456359102, + "grad_norm": 6.334157943725586, + "learning_rate": 1.4209725685785539e-05, + "loss": 0.3387, + "step": 23230 + }, + { + "epoch": 5.795511221945137, + "grad_norm": 4.981834888458252, + "learning_rate": 1.4207231920199502e-05, + "loss": 0.5182, + "step": 23240 + }, + { + "epoch": 5.798004987531172, + "grad_norm": 4.364705562591553, + "learning_rate": 1.4204738154613466e-05, + "loss": 0.3756, + "step": 23250 + }, + { + "epoch": 5.800498753117207, + "grad_norm": 5.2104268074035645, + "learning_rate": 1.4202244389027433e-05, + "loss": 0.4008, + "step": 23260 + }, + { + "epoch": 5.802992518703242, + "grad_norm": 6.920286178588867, + "learning_rate": 1.4199750623441397e-05, + "loss": 0.4209, + "step": 23270 + }, + { + "epoch": 5.805486284289277, + "grad_norm": 7.996755599975586, + "learning_rate": 1.4197256857855364e-05, + "loss": 0.4087, + "step": 23280 + }, + { + "epoch": 5.807980049875312, + "grad_norm": 5.335063934326172, + "learning_rate": 1.4194763092269327e-05, + "loss": 0.3644, + "step": 23290 + }, + { + "epoch": 5.8104738154613464, + "grad_norm": 7.114324569702148, + "learning_rate": 1.4192269326683293e-05, + "loss": 0.3954, + "step": 23300 + }, + { + "epoch": 5.812967581047381, + "grad_norm": 4.345088005065918, + "learning_rate": 1.418977556109726e-05, + "loss": 0.4041, + "step": 23310 + }, + { + "epoch": 5.815461346633416, + "grad_norm": 6.6690287590026855, + "learning_rate": 1.4187281795511223e-05, + "loss": 0.3639, + "step": 23320 + }, + { + "epoch": 5.817955112219451, + "grad_norm": 7.393259048461914, + "learning_rate": 1.4184788029925187e-05, + "loss": 0.449, + "step": 23330 + }, + { + "epoch": 5.820448877805486, + "grad_norm": 6.419971466064453, + "learning_rate": 1.4182294264339154e-05, + "loss": 0.4075, + "step": 23340 + }, + { + "epoch": 5.822942643391521, + "grad_norm": 5.764027118682861, + "learning_rate": 1.4179800498753117e-05, + "loss": 0.3845, + "step": 23350 + }, + { + "epoch": 5.825436408977556, + "grad_norm": 7.242391109466553, + "learning_rate": 1.4177306733167084e-05, + "loss": 0.3426, + "step": 23360 + }, + { + "epoch": 5.8279301745635905, + "grad_norm": 7.269883155822754, + "learning_rate": 1.4174812967581048e-05, + "loss": 0.3911, + "step": 23370 + }, + { + "epoch": 5.830423940149626, + "grad_norm": 4.136662006378174, + "learning_rate": 1.4172319201995013e-05, + "loss": 0.5394, + "step": 23380 + }, + { + "epoch": 5.832917705735661, + "grad_norm": 7.696037769317627, + "learning_rate": 1.4169825436408979e-05, + "loss": 0.3823, + "step": 23390 + }, + { + "epoch": 5.835411471321696, + "grad_norm": 8.662434577941895, + "learning_rate": 1.4167331670822944e-05, + "loss": 0.448, + "step": 23400 + }, + { + "epoch": 5.837905236907731, + "grad_norm": 7.484658718109131, + "learning_rate": 1.4164837905236908e-05, + "loss": 0.4121, + "step": 23410 + }, + { + "epoch": 5.840399002493766, + "grad_norm": 4.754652976989746, + "learning_rate": 1.4162344139650875e-05, + "loss": 0.3833, + "step": 23420 + }, + { + "epoch": 5.842892768079801, + "grad_norm": 7.560851573944092, + "learning_rate": 1.4159850374064838e-05, + "loss": 0.4049, + "step": 23430 + }, + { + "epoch": 5.8453865336658355, + "grad_norm": 7.429394721984863, + "learning_rate": 1.4157356608478805e-05, + "loss": 0.409, + "step": 23440 + }, + { + "epoch": 5.84788029925187, + "grad_norm": 7.763380527496338, + "learning_rate": 1.4154862842892769e-05, + "loss": 0.3994, + "step": 23450 + }, + { + "epoch": 5.850374064837905, + "grad_norm": 6.119085311889648, + "learning_rate": 1.4152369077306734e-05, + "loss": 0.4111, + "step": 23460 + }, + { + "epoch": 5.85286783042394, + "grad_norm": 7.172895431518555, + "learning_rate": 1.41498753117207e-05, + "loss": 0.4037, + "step": 23470 + }, + { + "epoch": 5.855361596009975, + "grad_norm": 4.939626693725586, + "learning_rate": 1.4147381546134665e-05, + "loss": 0.4928, + "step": 23480 + }, + { + "epoch": 5.85785536159601, + "grad_norm": 5.627965450286865, + "learning_rate": 1.414488778054863e-05, + "loss": 0.4169, + "step": 23490 + }, + { + "epoch": 5.860349127182045, + "grad_norm": 7.699008464813232, + "learning_rate": 1.4142394014962595e-05, + "loss": 0.3669, + "step": 23500 + }, + { + "epoch": 5.86284289276808, + "grad_norm": 8.648443222045898, + "learning_rate": 1.4139900249376559e-05, + "loss": 0.3886, + "step": 23510 + }, + { + "epoch": 5.865336658354114, + "grad_norm": 6.288234710693359, + "learning_rate": 1.4137406483790526e-05, + "loss": 0.4166, + "step": 23520 + }, + { + "epoch": 5.867830423940149, + "grad_norm": 7.797109603881836, + "learning_rate": 1.413491271820449e-05, + "loss": 0.3971, + "step": 23530 + }, + { + "epoch": 5.870324189526184, + "grad_norm": 5.520691394805908, + "learning_rate": 1.4132418952618455e-05, + "loss": 0.4536, + "step": 23540 + }, + { + "epoch": 5.87281795511222, + "grad_norm": 4.677943229675293, + "learning_rate": 1.412992518703242e-05, + "loss": 0.3958, + "step": 23550 + }, + { + "epoch": 5.875311720698255, + "grad_norm": 5.682249069213867, + "learning_rate": 1.4127431421446386e-05, + "loss": 0.3522, + "step": 23560 + }, + { + "epoch": 5.87780548628429, + "grad_norm": 6.83863639831543, + "learning_rate": 1.412493765586035e-05, + "loss": 0.3656, + "step": 23570 + }, + { + "epoch": 5.8802992518703245, + "grad_norm": 6.313279628753662, + "learning_rate": 1.4122443890274316e-05, + "loss": 0.3988, + "step": 23580 + }, + { + "epoch": 5.882793017456359, + "grad_norm": 7.157898426055908, + "learning_rate": 1.411995012468828e-05, + "loss": 0.401, + "step": 23590 + }, + { + "epoch": 5.885286783042394, + "grad_norm": 6.3576531410217285, + "learning_rate": 1.4117456359102247e-05, + "loss": 0.4291, + "step": 23600 + }, + { + "epoch": 5.887780548628429, + "grad_norm": 6.434720993041992, + "learning_rate": 1.411496259351621e-05, + "loss": 0.3876, + "step": 23610 + }, + { + "epoch": 5.890274314214464, + "grad_norm": 6.252552509307861, + "learning_rate": 1.4112468827930174e-05, + "loss": 0.3471, + "step": 23620 + }, + { + "epoch": 5.892768079800499, + "grad_norm": 3.91813588142395, + "learning_rate": 1.4109975062344141e-05, + "loss": 0.3224, + "step": 23630 + }, + { + "epoch": 5.895261845386534, + "grad_norm": 6.144435882568359, + "learning_rate": 1.4107481296758105e-05, + "loss": 0.4012, + "step": 23640 + }, + { + "epoch": 5.897755610972569, + "grad_norm": 6.0497565269470215, + "learning_rate": 1.4104987531172072e-05, + "loss": 0.4003, + "step": 23650 + }, + { + "epoch": 5.9002493765586035, + "grad_norm": 4.732110500335693, + "learning_rate": 1.4102493765586037e-05, + "loss": 0.419, + "step": 23660 + }, + { + "epoch": 5.902743142144638, + "grad_norm": 13.11367416381836, + "learning_rate": 1.41e-05, + "loss": 0.3991, + "step": 23670 + }, + { + "epoch": 5.905236907730673, + "grad_norm": 5.816923141479492, + "learning_rate": 1.4097506234413968e-05, + "loss": 0.34, + "step": 23680 + }, + { + "epoch": 5.907730673316708, + "grad_norm": 7.775258541107178, + "learning_rate": 1.4095012468827931e-05, + "loss": 0.3784, + "step": 23690 + }, + { + "epoch": 5.910224438902743, + "grad_norm": 5.4291486740112305, + "learning_rate": 1.4092518703241898e-05, + "loss": 0.404, + "step": 23700 + }, + { + "epoch": 5.912718204488778, + "grad_norm": 7.315398216247559, + "learning_rate": 1.4090024937655862e-05, + "loss": 0.4278, + "step": 23710 + }, + { + "epoch": 5.915211970074813, + "grad_norm": 5.724961757659912, + "learning_rate": 1.4087531172069825e-05, + "loss": 0.4336, + "step": 23720 + }, + { + "epoch": 5.917705735660848, + "grad_norm": 8.703821182250977, + "learning_rate": 1.4085037406483792e-05, + "loss": 0.4091, + "step": 23730 + }, + { + "epoch": 5.920199501246882, + "grad_norm": 6.634580612182617, + "learning_rate": 1.4082543640897756e-05, + "loss": 0.3738, + "step": 23740 + }, + { + "epoch": 5.922693266832917, + "grad_norm": 7.790684223175049, + "learning_rate": 1.4080049875311721e-05, + "loss": 0.4328, + "step": 23750 + }, + { + "epoch": 5.925187032418952, + "grad_norm": 6.3984808921813965, + "learning_rate": 1.4077556109725687e-05, + "loss": 0.3641, + "step": 23760 + }, + { + "epoch": 5.927680798004987, + "grad_norm": 5.913004398345947, + "learning_rate": 1.4075062344139652e-05, + "loss": 0.3595, + "step": 23770 + }, + { + "epoch": 5.930174563591023, + "grad_norm": 9.154977798461914, + "learning_rate": 1.4072568578553619e-05, + "loss": 0.3679, + "step": 23780 + }, + { + "epoch": 5.932668329177058, + "grad_norm": 6.98581075668335, + "learning_rate": 1.4070074812967583e-05, + "loss": 0.3907, + "step": 23790 + }, + { + "epoch": 5.9351620947630925, + "grad_norm": 6.550581932067871, + "learning_rate": 1.4067581047381546e-05, + "loss": 0.4143, + "step": 23800 + }, + { + "epoch": 5.937655860349127, + "grad_norm": 7.637538909912109, + "learning_rate": 1.4065087281795513e-05, + "loss": 0.3854, + "step": 23810 + }, + { + "epoch": 5.940149625935162, + "grad_norm": 8.336382865905762, + "learning_rate": 1.4062593516209477e-05, + "loss": 0.3903, + "step": 23820 + }, + { + "epoch": 5.942643391521197, + "grad_norm": 5.107188701629639, + "learning_rate": 1.4060099750623442e-05, + "loss": 0.3879, + "step": 23830 + }, + { + "epoch": 5.945137157107232, + "grad_norm": 5.9631571769714355, + "learning_rate": 1.4057605985037407e-05, + "loss": 0.4691, + "step": 23840 + }, + { + "epoch": 5.947630922693267, + "grad_norm": 6.080106258392334, + "learning_rate": 1.4055112219451373e-05, + "loss": 0.4799, + "step": 23850 + }, + { + "epoch": 5.950124688279302, + "grad_norm": 9.913834571838379, + "learning_rate": 1.4052618453865338e-05, + "loss": 0.3768, + "step": 23860 + }, + { + "epoch": 5.952618453865337, + "grad_norm": 6.052908420562744, + "learning_rate": 1.4050124688279303e-05, + "loss": 0.3323, + "step": 23870 + }, + { + "epoch": 5.9551122194513715, + "grad_norm": 8.074835777282715, + "learning_rate": 1.4047630922693267e-05, + "loss": 0.4003, + "step": 23880 + }, + { + "epoch": 5.957605985037406, + "grad_norm": 4.824693202972412, + "learning_rate": 1.4045137157107234e-05, + "loss": 0.3868, + "step": 23890 + }, + { + "epoch": 5.960099750623441, + "grad_norm": 8.275466918945312, + "learning_rate": 1.4042643391521198e-05, + "loss": 0.3944, + "step": 23900 + }, + { + "epoch": 5.962593516209476, + "grad_norm": 7.059325695037842, + "learning_rate": 1.4040149625935163e-05, + "loss": 0.4608, + "step": 23910 + }, + { + "epoch": 5.965087281795511, + "grad_norm": 6.65533447265625, + "learning_rate": 1.4037655860349128e-05, + "loss": 0.3445, + "step": 23920 + }, + { + "epoch": 5.967581047381546, + "grad_norm": 6.2728590965271, + "learning_rate": 1.4035162094763093e-05, + "loss": 0.3764, + "step": 23930 + }, + { + "epoch": 5.970074812967581, + "grad_norm": 8.21418571472168, + "learning_rate": 1.4032668329177059e-05, + "loss": 0.3982, + "step": 23940 + }, + { + "epoch": 5.9725685785536164, + "grad_norm": 6.642632007598877, + "learning_rate": 1.4030423940149627e-05, + "loss": 0.3619, + "step": 23950 + }, + { + "epoch": 5.975062344139651, + "grad_norm": 7.222472667694092, + "learning_rate": 1.4027930174563592e-05, + "loss": 0.3871, + "step": 23960 + }, + { + "epoch": 5.977556109725686, + "grad_norm": 8.01220417022705, + "learning_rate": 1.4025436408977558e-05, + "loss": 0.3979, + "step": 23970 + }, + { + "epoch": 5.980049875311721, + "grad_norm": 5.2847981452941895, + "learning_rate": 1.4022942643391521e-05, + "loss": 0.4304, + "step": 23980 + }, + { + "epoch": 5.982543640897756, + "grad_norm": 10.303918838500977, + "learning_rate": 1.4020448877805488e-05, + "loss": 0.377, + "step": 23990 + }, + { + "epoch": 5.985037406483791, + "grad_norm": 10.450936317443848, + "learning_rate": 1.4017955112219452e-05, + "loss": 0.4502, + "step": 24000 + }, + { + "epoch": 5.987531172069826, + "grad_norm": 6.296121120452881, + "learning_rate": 1.4015461346633417e-05, + "loss": 0.4307, + "step": 24010 + }, + { + "epoch": 5.9900249376558605, + "grad_norm": 5.6589202880859375, + "learning_rate": 1.4012967581047383e-05, + "loss": 0.4116, + "step": 24020 + }, + { + "epoch": 5.992518703241895, + "grad_norm": 6.098679065704346, + "learning_rate": 1.4010473815461348e-05, + "loss": 0.336, + "step": 24030 + }, + { + "epoch": 5.99501246882793, + "grad_norm": 6.028469562530518, + "learning_rate": 1.4007980049875313e-05, + "loss": 0.3988, + "step": 24040 + }, + { + "epoch": 5.997506234413965, + "grad_norm": 7.266406536102295, + "learning_rate": 1.4005486284289278e-05, + "loss": 0.435, + "step": 24050 + }, + { + "epoch": 6.0, + "grad_norm": 4.367632865905762, + "learning_rate": 1.4002992518703242e-05, + "loss": 0.3838, + "step": 24060 + }, + { + "epoch": 6.0, + "eval_loss": 0.41945403814315796, + "eval_runtime": 59.8425, + "eval_samples_per_second": 16.761, + "eval_steps_per_second": 16.761, + "step": 24060 + }, + { + "epoch": 6.002493765586035, + "grad_norm": 5.5019001960754395, + "learning_rate": 1.4000498753117209e-05, + "loss": 0.3192, + "step": 24070 + }, + { + "epoch": 6.00498753117207, + "grad_norm": 6.230374813079834, + "learning_rate": 1.3998004987531173e-05, + "loss": 0.3463, + "step": 24080 + }, + { + "epoch": 6.007481296758105, + "grad_norm": 5.844031810760498, + "learning_rate": 1.3995511221945138e-05, + "loss": 0.4043, + "step": 24090 + }, + { + "epoch": 6.0099750623441395, + "grad_norm": 6.5853166580200195, + "learning_rate": 1.3993017456359103e-05, + "loss": 0.4222, + "step": 24100 + }, + { + "epoch": 6.012468827930174, + "grad_norm": 7.990192890167236, + "learning_rate": 1.3990523690773069e-05, + "loss": 0.3578, + "step": 24110 + }, + { + "epoch": 6.014962593516209, + "grad_norm": 5.267573356628418, + "learning_rate": 1.3988029925187034e-05, + "loss": 0.3111, + "step": 24120 + }, + { + "epoch": 6.017456359102244, + "grad_norm": 6.116258144378662, + "learning_rate": 1.3985536159601e-05, + "loss": 0.3556, + "step": 24130 + }, + { + "epoch": 6.019950124688279, + "grad_norm": 7.899449825286865, + "learning_rate": 1.3983042394014963e-05, + "loss": 0.3691, + "step": 24140 + }, + { + "epoch": 6.022443890274314, + "grad_norm": 8.339388847351074, + "learning_rate": 1.398054862842893e-05, + "loss": 0.3704, + "step": 24150 + }, + { + "epoch": 6.024937655860349, + "grad_norm": 5.403593063354492, + "learning_rate": 1.3978054862842893e-05, + "loss": 0.3668, + "step": 24160 + }, + { + "epoch": 6.027431421446384, + "grad_norm": 7.102303981781006, + "learning_rate": 1.397556109725686e-05, + "loss": 0.3266, + "step": 24170 + }, + { + "epoch": 6.029925187032419, + "grad_norm": 5.142783164978027, + "learning_rate": 1.3973067331670824e-05, + "loss": 0.3544, + "step": 24180 + }, + { + "epoch": 6.032418952618454, + "grad_norm": 7.026705265045166, + "learning_rate": 1.3970573566084788e-05, + "loss": 0.3973, + "step": 24190 + }, + { + "epoch": 6.034912718204489, + "grad_norm": 5.733627796173096, + "learning_rate": 1.3968079800498755e-05, + "loss": 0.3553, + "step": 24200 + }, + { + "epoch": 6.037406483790524, + "grad_norm": 6.362379550933838, + "learning_rate": 1.396558603491272e-05, + "loss": 0.3822, + "step": 24210 + }, + { + "epoch": 6.039900249376559, + "grad_norm": 7.283205509185791, + "learning_rate": 1.3963092269326684e-05, + "loss": 0.4559, + "step": 24220 + }, + { + "epoch": 6.042394014962594, + "grad_norm": 7.929378509521484, + "learning_rate": 1.396059850374065e-05, + "loss": 0.3449, + "step": 24230 + }, + { + "epoch": 6.0448877805486285, + "grad_norm": 7.110724449157715, + "learning_rate": 1.3958104738154614e-05, + "loss": 0.3746, + "step": 24240 + }, + { + "epoch": 6.047381546134663, + "grad_norm": 5.7137227058410645, + "learning_rate": 1.3955610972568581e-05, + "loss": 0.4771, + "step": 24250 + }, + { + "epoch": 6.049875311720698, + "grad_norm": 6.553785800933838, + "learning_rate": 1.3953117206982545e-05, + "loss": 0.3555, + "step": 24260 + }, + { + "epoch": 6.052369077306733, + "grad_norm": 5.964046955108643, + "learning_rate": 1.3950623441396509e-05, + "loss": 0.3619, + "step": 24270 + }, + { + "epoch": 6.054862842892768, + "grad_norm": 7.368553638458252, + "learning_rate": 1.3948129675810476e-05, + "loss": 0.3385, + "step": 24280 + }, + { + "epoch": 6.057356608478803, + "grad_norm": 9.049263000488281, + "learning_rate": 1.3945635910224439e-05, + "loss": 0.3605, + "step": 24290 + }, + { + "epoch": 6.059850374064838, + "grad_norm": 7.191588401794434, + "learning_rate": 1.3943142144638404e-05, + "loss": 0.4145, + "step": 24300 + }, + { + "epoch": 6.062344139650873, + "grad_norm": 10.2169828414917, + "learning_rate": 1.394064837905237e-05, + "loss": 0.3857, + "step": 24310 + }, + { + "epoch": 6.0648379052369075, + "grad_norm": 7.191344738006592, + "learning_rate": 1.3938154613466335e-05, + "loss": 0.3727, + "step": 24320 + }, + { + "epoch": 6.067331670822942, + "grad_norm": 6.484828472137451, + "learning_rate": 1.3935660847880302e-05, + "loss": 0.4169, + "step": 24330 + }, + { + "epoch": 6.069825436408977, + "grad_norm": 5.972647190093994, + "learning_rate": 1.3933167082294266e-05, + "loss": 0.2752, + "step": 24340 + }, + { + "epoch": 6.072319201995012, + "grad_norm": 7.331430912017822, + "learning_rate": 1.393067331670823e-05, + "loss": 0.3688, + "step": 24350 + }, + { + "epoch": 6.074812967581048, + "grad_norm": 4.472461223602295, + "learning_rate": 1.3928179551122196e-05, + "loss": 0.3835, + "step": 24360 + }, + { + "epoch": 6.077306733167083, + "grad_norm": 6.023663520812988, + "learning_rate": 1.392568578553616e-05, + "loss": 0.3425, + "step": 24370 + }, + { + "epoch": 6.079800498753118, + "grad_norm": 6.141242980957031, + "learning_rate": 1.3923192019950127e-05, + "loss": 0.3483, + "step": 24380 + }, + { + "epoch": 6.082294264339152, + "grad_norm": 6.454586029052734, + "learning_rate": 1.392069825436409e-05, + "loss": 0.3239, + "step": 24390 + }, + { + "epoch": 6.084788029925187, + "grad_norm": 5.868727684020996, + "learning_rate": 1.3918204488778056e-05, + "loss": 0.2889, + "step": 24400 + }, + { + "epoch": 6.087281795511222, + "grad_norm": 5.985288619995117, + "learning_rate": 1.3915710723192021e-05, + "loss": 0.3998, + "step": 24410 + }, + { + "epoch": 6.089775561097257, + "grad_norm": 8.732137680053711, + "learning_rate": 1.3913216957605986e-05, + "loss": 0.4191, + "step": 24420 + }, + { + "epoch": 6.092269326683292, + "grad_norm": 3.857243061065674, + "learning_rate": 1.391072319201995e-05, + "loss": 0.3174, + "step": 24430 + }, + { + "epoch": 6.094763092269327, + "grad_norm": 6.74662446975708, + "learning_rate": 1.3908229426433917e-05, + "loss": 0.4585, + "step": 24440 + }, + { + "epoch": 6.097256857855362, + "grad_norm": 8.382293701171875, + "learning_rate": 1.390573566084788e-05, + "loss": 0.4119, + "step": 24450 + }, + { + "epoch": 6.0997506234413965, + "grad_norm": 5.25961446762085, + "learning_rate": 1.3903241895261848e-05, + "loss": 0.355, + "step": 24460 + }, + { + "epoch": 6.102244389027431, + "grad_norm": 10.296069145202637, + "learning_rate": 1.3900748129675811e-05, + "loss": 0.4015, + "step": 24470 + }, + { + "epoch": 6.104738154613466, + "grad_norm": 6.3385009765625, + "learning_rate": 1.3898254364089777e-05, + "loss": 0.4538, + "step": 24480 + }, + { + "epoch": 6.107231920199501, + "grad_norm": 5.728212356567383, + "learning_rate": 1.3895760598503742e-05, + "loss": 0.3313, + "step": 24490 + }, + { + "epoch": 6.109725685785536, + "grad_norm": 7.13115930557251, + "learning_rate": 1.3893266832917707e-05, + "loss": 0.3547, + "step": 24500 + }, + { + "epoch": 6.112219451371571, + "grad_norm": 7.432462692260742, + "learning_rate": 1.3890773067331671e-05, + "loss": 0.3595, + "step": 24510 + }, + { + "epoch": 6.114713216957606, + "grad_norm": 7.006857872009277, + "learning_rate": 1.3888279301745638e-05, + "loss": 0.3668, + "step": 24520 + }, + { + "epoch": 6.117206982543641, + "grad_norm": 6.406911373138428, + "learning_rate": 1.3885785536159601e-05, + "loss": 0.3775, + "step": 24530 + }, + { + "epoch": 6.1197007481296755, + "grad_norm": 4.907394886016846, + "learning_rate": 1.3883291770573568e-05, + "loss": 0.3529, + "step": 24540 + }, + { + "epoch": 6.12219451371571, + "grad_norm": 5.443017482757568, + "learning_rate": 1.3880798004987532e-05, + "loss": 0.3293, + "step": 24550 + }, + { + "epoch": 6.124688279301745, + "grad_norm": 6.361752510070801, + "learning_rate": 1.3878304239401497e-05, + "loss": 0.3988, + "step": 24560 + }, + { + "epoch": 6.127182044887781, + "grad_norm": 7.4896464347839355, + "learning_rate": 1.3875810473815463e-05, + "loss": 0.3992, + "step": 24570 + }, + { + "epoch": 6.129675810473816, + "grad_norm": 5.954341888427734, + "learning_rate": 1.3873316708229428e-05, + "loss": 0.3524, + "step": 24580 + }, + { + "epoch": 6.132169576059851, + "grad_norm": 7.877580642700195, + "learning_rate": 1.3870822942643392e-05, + "loss": 0.3688, + "step": 24590 + }, + { + "epoch": 6.134663341645886, + "grad_norm": 11.095531463623047, + "learning_rate": 1.3868329177057359e-05, + "loss": 0.3773, + "step": 24600 + }, + { + "epoch": 6.13715710723192, + "grad_norm": 4.941054344177246, + "learning_rate": 1.3865835411471322e-05, + "loss": 0.337, + "step": 24610 + }, + { + "epoch": 6.139650872817955, + "grad_norm": 7.861473083496094, + "learning_rate": 1.386334164588529e-05, + "loss": 0.3896, + "step": 24620 + }, + { + "epoch": 6.14214463840399, + "grad_norm": 7.050281524658203, + "learning_rate": 1.3860847880299253e-05, + "loss": 0.3908, + "step": 24630 + }, + { + "epoch": 6.144638403990025, + "grad_norm": 8.201066970825195, + "learning_rate": 1.3858354114713217e-05, + "loss": 0.4176, + "step": 24640 + }, + { + "epoch": 6.14713216957606, + "grad_norm": 6.268393516540527, + "learning_rate": 1.3855860349127184e-05, + "loss": 0.3486, + "step": 24650 + }, + { + "epoch": 6.149625935162095, + "grad_norm": 7.479492664337158, + "learning_rate": 1.3853366583541147e-05, + "loss": 0.3568, + "step": 24660 + }, + { + "epoch": 6.15211970074813, + "grad_norm": 9.700624465942383, + "learning_rate": 1.3850872817955114e-05, + "loss": 0.5033, + "step": 24670 + }, + { + "epoch": 6.1546134663341645, + "grad_norm": 7.117500305175781, + "learning_rate": 1.384837905236908e-05, + "loss": 0.3423, + "step": 24680 + }, + { + "epoch": 6.157107231920199, + "grad_norm": 6.819540023803711, + "learning_rate": 1.3845885286783043e-05, + "loss": 0.4107, + "step": 24690 + }, + { + "epoch": 6.159600997506234, + "grad_norm": 7.1624755859375, + "learning_rate": 1.384339152119701e-05, + "loss": 0.3808, + "step": 24700 + }, + { + "epoch": 6.162094763092269, + "grad_norm": 5.549429416656494, + "learning_rate": 1.3840897755610974e-05, + "loss": 0.4034, + "step": 24710 + }, + { + "epoch": 6.164588528678304, + "grad_norm": 6.838397026062012, + "learning_rate": 1.3838403990024937e-05, + "loss": 0.4178, + "step": 24720 + }, + { + "epoch": 6.167082294264339, + "grad_norm": 6.023339748382568, + "learning_rate": 1.3835910224438904e-05, + "loss": 0.3682, + "step": 24730 + }, + { + "epoch": 6.169576059850374, + "grad_norm": 6.62280797958374, + "learning_rate": 1.3833416458852868e-05, + "loss": 0.3843, + "step": 24740 + }, + { + "epoch": 6.172069825436409, + "grad_norm": 6.568355083465576, + "learning_rate": 1.3830922693266835e-05, + "loss": 0.4233, + "step": 24750 + }, + { + "epoch": 6.174563591022444, + "grad_norm": 9.093072891235352, + "learning_rate": 1.3828428927680799e-05, + "loss": 0.3631, + "step": 24760 + }, + { + "epoch": 6.177057356608479, + "grad_norm": 8.72046184539795, + "learning_rate": 1.3825935162094764e-05, + "loss": 0.3435, + "step": 24770 + }, + { + "epoch": 6.179551122194514, + "grad_norm": 9.039979934692383, + "learning_rate": 1.3823441396508729e-05, + "loss": 0.3879, + "step": 24780 + }, + { + "epoch": 6.182044887780549, + "grad_norm": 8.933988571166992, + "learning_rate": 1.3820947630922694e-05, + "loss": 0.3646, + "step": 24790 + }, + { + "epoch": 6.184538653366584, + "grad_norm": 7.515913963317871, + "learning_rate": 1.3818453865336658e-05, + "loss": 0.3503, + "step": 24800 + }, + { + "epoch": 6.187032418952619, + "grad_norm": 6.617506504058838, + "learning_rate": 1.3815960099750625e-05, + "loss": 0.4107, + "step": 24810 + }, + { + "epoch": 6.1895261845386536, + "grad_norm": 5.950352668762207, + "learning_rate": 1.3813466334164589e-05, + "loss": 0.3318, + "step": 24820 + }, + { + "epoch": 6.192019950124688, + "grad_norm": 6.522132873535156, + "learning_rate": 1.3810972568578556e-05, + "loss": 0.3876, + "step": 24830 + }, + { + "epoch": 6.194513715710723, + "grad_norm": 5.656475067138672, + "learning_rate": 1.380847880299252e-05, + "loss": 0.3361, + "step": 24840 + }, + { + "epoch": 6.197007481296758, + "grad_norm": 9.664381980895996, + "learning_rate": 1.3805985037406485e-05, + "loss": 0.4696, + "step": 24850 + }, + { + "epoch": 6.199501246882793, + "grad_norm": 5.316076755523682, + "learning_rate": 1.380349127182045e-05, + "loss": 0.3133, + "step": 24860 + }, + { + "epoch": 6.201995012468828, + "grad_norm": 5.598194599151611, + "learning_rate": 1.3800997506234415e-05, + "loss": 0.4303, + "step": 24870 + }, + { + "epoch": 6.204488778054863, + "grad_norm": 9.467336654663086, + "learning_rate": 1.379850374064838e-05, + "loss": 0.3493, + "step": 24880 + }, + { + "epoch": 6.206982543640898, + "grad_norm": 5.858520030975342, + "learning_rate": 1.3796009975062346e-05, + "loss": 0.4155, + "step": 24890 + }, + { + "epoch": 6.2094763092269325, + "grad_norm": 6.202467441558838, + "learning_rate": 1.379351620947631e-05, + "loss": 0.3583, + "step": 24900 + }, + { + "epoch": 6.211970074812967, + "grad_norm": 10.094200134277344, + "learning_rate": 1.3791022443890276e-05, + "loss": 0.3416, + "step": 24910 + }, + { + "epoch": 6.214463840399002, + "grad_norm": 6.892770290374756, + "learning_rate": 1.378852867830424e-05, + "loss": 0.3961, + "step": 24920 + }, + { + "epoch": 6.216957605985037, + "grad_norm": 6.055887699127197, + "learning_rate": 1.3786034912718205e-05, + "loss": 0.3684, + "step": 24930 + }, + { + "epoch": 6.219451371571072, + "grad_norm": 7.503059387207031, + "learning_rate": 1.378354114713217e-05, + "loss": 0.3396, + "step": 24940 + }, + { + "epoch": 6.221945137157107, + "grad_norm": 6.7039642333984375, + "learning_rate": 1.3781047381546136e-05, + "loss": 0.3621, + "step": 24950 + }, + { + "epoch": 6.224438902743142, + "grad_norm": 6.32158088684082, + "learning_rate": 1.3778553615960101e-05, + "loss": 0.3421, + "step": 24960 + }, + { + "epoch": 6.2269326683291775, + "grad_norm": 7.217518329620361, + "learning_rate": 1.3776059850374067e-05, + "loss": 0.3611, + "step": 24970 + }, + { + "epoch": 6.229426433915212, + "grad_norm": 6.535889148712158, + "learning_rate": 1.377356608478803e-05, + "loss": 0.3765, + "step": 24980 + }, + { + "epoch": 6.231920199501247, + "grad_norm": 5.55719518661499, + "learning_rate": 1.3771072319201997e-05, + "loss": 0.3583, + "step": 24990 + }, + { + "epoch": 6.234413965087282, + "grad_norm": 8.752567291259766, + "learning_rate": 1.3768578553615961e-05, + "loss": 0.3583, + "step": 25000 + }, + { + "epoch": 6.236907730673317, + "grad_norm": 6.028888702392578, + "learning_rate": 1.3766084788029924e-05, + "loss": 0.3537, + "step": 25010 + }, + { + "epoch": 6.239401496259352, + "grad_norm": 8.308853149414062, + "learning_rate": 1.3763591022443891e-05, + "loss": 0.4142, + "step": 25020 + }, + { + "epoch": 6.241895261845387, + "grad_norm": 7.740601539611816, + "learning_rate": 1.3761097256857857e-05, + "loss": 0.3631, + "step": 25030 + }, + { + "epoch": 6.2443890274314215, + "grad_norm": 6.1072611808776855, + "learning_rate": 1.3758603491271822e-05, + "loss": 0.3523, + "step": 25040 + }, + { + "epoch": 6.246882793017456, + "grad_norm": 5.326240062713623, + "learning_rate": 1.3756109725685787e-05, + "loss": 0.3606, + "step": 25050 + }, + { + "epoch": 6.249376558603491, + "grad_norm": 7.631753921508789, + "learning_rate": 1.3753615960099751e-05, + "loss": 0.3653, + "step": 25060 + }, + { + "epoch": 6.251870324189526, + "grad_norm": 4.055186748504639, + "learning_rate": 1.3751122194513718e-05, + "loss": 0.3317, + "step": 25070 + }, + { + "epoch": 6.254364089775561, + "grad_norm": 6.139715671539307, + "learning_rate": 1.3748628428927682e-05, + "loss": 0.3835, + "step": 25080 + }, + { + "epoch": 6.256857855361596, + "grad_norm": 6.292752265930176, + "learning_rate": 1.3746134663341645e-05, + "loss": 0.2972, + "step": 25090 + }, + { + "epoch": 6.259351620947631, + "grad_norm": 7.624993324279785, + "learning_rate": 1.3743640897755612e-05, + "loss": 0.4233, + "step": 25100 + }, + { + "epoch": 6.261845386533666, + "grad_norm": 5.454896450042725, + "learning_rate": 1.3741147132169576e-05, + "loss": 0.3935, + "step": 25110 + }, + { + "epoch": 6.2643391521197005, + "grad_norm": 7.689428329467773, + "learning_rate": 1.3738653366583543e-05, + "loss": 0.3478, + "step": 25120 + }, + { + "epoch": 6.266832917705735, + "grad_norm": 5.5055365562438965, + "learning_rate": 1.3736159600997507e-05, + "loss": 0.3336, + "step": 25130 + }, + { + "epoch": 6.26932668329177, + "grad_norm": 4.818638324737549, + "learning_rate": 1.3733665835411472e-05, + "loss": 0.3446, + "step": 25140 + }, + { + "epoch": 6.271820448877805, + "grad_norm": 6.679276943206787, + "learning_rate": 1.3731172069825439e-05, + "loss": 0.3915, + "step": 25150 + }, + { + "epoch": 6.274314214463841, + "grad_norm": 7.1155009269714355, + "learning_rate": 1.3728678304239402e-05, + "loss": 0.3504, + "step": 25160 + }, + { + "epoch": 6.276807980049876, + "grad_norm": 7.2714009284973145, + "learning_rate": 1.372618453865337e-05, + "loss": 0.5088, + "step": 25170 + }, + { + "epoch": 6.279301745635911, + "grad_norm": 8.970192909240723, + "learning_rate": 1.3723690773067333e-05, + "loss": 0.4345, + "step": 25180 + }, + { + "epoch": 6.2817955112219455, + "grad_norm": 6.6837334632873535, + "learning_rate": 1.3721197007481297e-05, + "loss": 0.3889, + "step": 25190 + }, + { + "epoch": 6.28428927680798, + "grad_norm": 5.488166809082031, + "learning_rate": 1.3718703241895264e-05, + "loss": 0.3952, + "step": 25200 + }, + { + "epoch": 6.286783042394015, + "grad_norm": 7.671344757080078, + "learning_rate": 1.3716209476309227e-05, + "loss": 0.3589, + "step": 25210 + }, + { + "epoch": 6.28927680798005, + "grad_norm": 6.706274032592773, + "learning_rate": 1.3713715710723193e-05, + "loss": 0.4448, + "step": 25220 + }, + { + "epoch": 6.291770573566085, + "grad_norm": 8.258255004882812, + "learning_rate": 1.3711221945137158e-05, + "loss": 0.3784, + "step": 25230 + }, + { + "epoch": 6.29426433915212, + "grad_norm": 8.191130638122559, + "learning_rate": 1.3708728179551123e-05, + "loss": 0.4016, + "step": 25240 + }, + { + "epoch": 6.296758104738155, + "grad_norm": 6.618471622467041, + "learning_rate": 1.3706234413965089e-05, + "loss": 0.319, + "step": 25250 + }, + { + "epoch": 6.2992518703241895, + "grad_norm": 4.492340564727783, + "learning_rate": 1.3703740648379054e-05, + "loss": 0.3433, + "step": 25260 + }, + { + "epoch": 6.301745635910224, + "grad_norm": 7.213396072387695, + "learning_rate": 1.3701246882793017e-05, + "loss": 0.336, + "step": 25270 + }, + { + "epoch": 6.304239401496259, + "grad_norm": 6.88414192199707, + "learning_rate": 1.3698753117206984e-05, + "loss": 0.3996, + "step": 25280 + }, + { + "epoch": 6.306733167082294, + "grad_norm": 5.107346057891846, + "learning_rate": 1.3696259351620948e-05, + "loss": 0.3807, + "step": 25290 + }, + { + "epoch": 6.309226932668329, + "grad_norm": 6.0163445472717285, + "learning_rate": 1.3693765586034913e-05, + "loss": 0.3609, + "step": 25300 + }, + { + "epoch": 6.311720698254364, + "grad_norm": 6.904167175292969, + "learning_rate": 1.3691271820448879e-05, + "loss": 0.3868, + "step": 25310 + }, + { + "epoch": 6.314214463840399, + "grad_norm": 7.417514801025391, + "learning_rate": 1.3688778054862844e-05, + "loss": 0.384, + "step": 25320 + }, + { + "epoch": 6.316708229426434, + "grad_norm": 7.786649227142334, + "learning_rate": 1.368628428927681e-05, + "loss": 0.4802, + "step": 25330 + }, + { + "epoch": 6.3192019950124685, + "grad_norm": 7.041213512420654, + "learning_rate": 1.3683790523690775e-05, + "loss": 0.4231, + "step": 25340 + }, + { + "epoch": 6.321695760598503, + "grad_norm": 4.815100193023682, + "learning_rate": 1.3681296758104738e-05, + "loss": 0.3803, + "step": 25350 + }, + { + "epoch": 6.324189526184538, + "grad_norm": 5.091302871704102, + "learning_rate": 1.3678802992518705e-05, + "loss": 0.3602, + "step": 25360 + }, + { + "epoch": 6.326683291770574, + "grad_norm": 6.59751558303833, + "learning_rate": 1.3676309226932669e-05, + "loss": 0.3558, + "step": 25370 + }, + { + "epoch": 6.329177057356609, + "grad_norm": 6.500771522521973, + "learning_rate": 1.3673815461346636e-05, + "loss": 0.3743, + "step": 25380 + }, + { + "epoch": 6.331670822942644, + "grad_norm": 5.946366786956787, + "learning_rate": 1.36713216957606e-05, + "loss": 0.3757, + "step": 25390 + }, + { + "epoch": 6.334164588528679, + "grad_norm": 6.4445672035217285, + "learning_rate": 1.3668827930174565e-05, + "loss": 0.3488, + "step": 25400 + }, + { + "epoch": 6.3366583541147135, + "grad_norm": 8.107527732849121, + "learning_rate": 1.366633416458853e-05, + "loss": 0.3637, + "step": 25410 + }, + { + "epoch": 6.339152119700748, + "grad_norm": 7.021248817443848, + "learning_rate": 1.3663840399002495e-05, + "loss": 0.4392, + "step": 25420 + }, + { + "epoch": 6.341645885286783, + "grad_norm": 7.19385290145874, + "learning_rate": 1.3661346633416459e-05, + "loss": 0.3443, + "step": 25430 + }, + { + "epoch": 6.344139650872818, + "grad_norm": 8.480057716369629, + "learning_rate": 1.3658852867830426e-05, + "loss": 0.3249, + "step": 25440 + }, + { + "epoch": 6.346633416458853, + "grad_norm": 6.283059120178223, + "learning_rate": 1.365635910224439e-05, + "loss": 0.4301, + "step": 25450 + }, + { + "epoch": 6.349127182044888, + "grad_norm": 9.004962921142578, + "learning_rate": 1.3653865336658357e-05, + "loss": 0.3856, + "step": 25460 + }, + { + "epoch": 6.351620947630923, + "grad_norm": 6.921197891235352, + "learning_rate": 1.365137157107232e-05, + "loss": 0.3504, + "step": 25470 + }, + { + "epoch": 6.3541147132169575, + "grad_norm": 7.9730634689331055, + "learning_rate": 1.3648877805486284e-05, + "loss": 0.4598, + "step": 25480 + }, + { + "epoch": 6.356608478802992, + "grad_norm": 5.783198356628418, + "learning_rate": 1.3646384039900251e-05, + "loss": 0.388, + "step": 25490 + }, + { + "epoch": 6.359102244389027, + "grad_norm": 5.059525489807129, + "learning_rate": 1.3643890274314216e-05, + "loss": 0.327, + "step": 25500 + }, + { + "epoch": 6.361596009975062, + "grad_norm": 8.875046730041504, + "learning_rate": 1.364139650872818e-05, + "loss": 0.3387, + "step": 25510 + }, + { + "epoch": 6.364089775561097, + "grad_norm": 8.62806224822998, + "learning_rate": 1.3638902743142147e-05, + "loss": 0.3109, + "step": 25520 + }, + { + "epoch": 6.366583541147132, + "grad_norm": 6.7610344886779785, + "learning_rate": 1.363640897755611e-05, + "loss": 0.3855, + "step": 25530 + }, + { + "epoch": 6.369077306733167, + "grad_norm": 10.090450286865234, + "learning_rate": 1.3633915211970077e-05, + "loss": 0.3404, + "step": 25540 + }, + { + "epoch": 6.371571072319202, + "grad_norm": 7.940425872802734, + "learning_rate": 1.3631421446384041e-05, + "loss": 0.3752, + "step": 25550 + }, + { + "epoch": 6.374064837905237, + "grad_norm": 6.398936748504639, + "learning_rate": 1.3628927680798005e-05, + "loss": 0.3769, + "step": 25560 + }, + { + "epoch": 6.376558603491272, + "grad_norm": 4.926769733428955, + "learning_rate": 1.3626433915211972e-05, + "loss": 0.4024, + "step": 25570 + }, + { + "epoch": 6.379052369077307, + "grad_norm": 7.145235538482666, + "learning_rate": 1.3623940149625935e-05, + "loss": 0.3512, + "step": 25580 + }, + { + "epoch": 6.381546134663342, + "grad_norm": 8.193603515625, + "learning_rate": 1.36214463840399e-05, + "loss": 0.3556, + "step": 25590 + }, + { + "epoch": 6.384039900249377, + "grad_norm": 6.601048469543457, + "learning_rate": 1.3618952618453866e-05, + "loss": 0.442, + "step": 25600 + }, + { + "epoch": 6.386533665835412, + "grad_norm": 6.299713134765625, + "learning_rate": 1.3616458852867831e-05, + "loss": 0.3942, + "step": 25610 + }, + { + "epoch": 6.389027431421447, + "grad_norm": 5.013650417327881, + "learning_rate": 1.3613965087281798e-05, + "loss": 0.3576, + "step": 25620 + }, + { + "epoch": 6.3915211970074814, + "grad_norm": 6.1320905685424805, + "learning_rate": 1.3611471321695762e-05, + "loss": 0.426, + "step": 25630 + }, + { + "epoch": 6.394014962593516, + "grad_norm": 7.493283748626709, + "learning_rate": 1.3608977556109725e-05, + "loss": 0.4207, + "step": 25640 + }, + { + "epoch": 6.396508728179551, + "grad_norm": 7.163254261016846, + "learning_rate": 1.3606483790523692e-05, + "loss": 0.4041, + "step": 25650 + }, + { + "epoch": 6.399002493765586, + "grad_norm": 4.770473957061768, + "learning_rate": 1.3603990024937656e-05, + "loss": 0.3493, + "step": 25660 + }, + { + "epoch": 6.401496259351621, + "grad_norm": 6.536076545715332, + "learning_rate": 1.3601496259351623e-05, + "loss": 0.3204, + "step": 25670 + }, + { + "epoch": 6.403990024937656, + "grad_norm": 8.567519187927246, + "learning_rate": 1.3599002493765587e-05, + "loss": 0.4084, + "step": 25680 + }, + { + "epoch": 6.406483790523691, + "grad_norm": 10.568842887878418, + "learning_rate": 1.3596508728179552e-05, + "loss": 0.3572, + "step": 25690 + }, + { + "epoch": 6.4089775561097255, + "grad_norm": 8.251084327697754, + "learning_rate": 1.3594014962593517e-05, + "loss": 0.4384, + "step": 25700 + }, + { + "epoch": 6.41147132169576, + "grad_norm": 5.1175408363342285, + "learning_rate": 1.3591521197007483e-05, + "loss": 0.3512, + "step": 25710 + }, + { + "epoch": 6.413965087281795, + "grad_norm": 5.352699279785156, + "learning_rate": 1.3589027431421446e-05, + "loss": 0.3152, + "step": 25720 + }, + { + "epoch": 6.41645885286783, + "grad_norm": 11.443229675292969, + "learning_rate": 1.3586533665835413e-05, + "loss": 0.3842, + "step": 25730 + }, + { + "epoch": 6.418952618453865, + "grad_norm": 4.957744598388672, + "learning_rate": 1.3584039900249377e-05, + "loss": 0.353, + "step": 25740 + }, + { + "epoch": 6.4214463840399, + "grad_norm": 5.221795082092285, + "learning_rate": 1.3581546134663344e-05, + "loss": 0.3797, + "step": 25750 + }, + { + "epoch": 6.423940149625935, + "grad_norm": 5.243051052093506, + "learning_rate": 1.3579052369077307e-05, + "loss": 0.3356, + "step": 25760 + }, + { + "epoch": 6.42643391521197, + "grad_norm": 6.537060260772705, + "learning_rate": 1.3576558603491273e-05, + "loss": 0.4025, + "step": 25770 + }, + { + "epoch": 6.428927680798005, + "grad_norm": 7.9792094230651855, + "learning_rate": 1.3574064837905238e-05, + "loss": 0.4992, + "step": 25780 + }, + { + "epoch": 6.43142144638404, + "grad_norm": 6.797430992126465, + "learning_rate": 1.3571571072319203e-05, + "loss": 0.3187, + "step": 25790 + }, + { + "epoch": 6.433915211970075, + "grad_norm": 5.758587837219238, + "learning_rate": 1.3569077306733167e-05, + "loss": 0.3202, + "step": 25800 + }, + { + "epoch": 6.43640897755611, + "grad_norm": 6.541738986968994, + "learning_rate": 1.3566583541147134e-05, + "loss": 0.4289, + "step": 25810 + }, + { + "epoch": 6.438902743142145, + "grad_norm": 7.5989227294921875, + "learning_rate": 1.3564089775561098e-05, + "loss": 0.4379, + "step": 25820 + }, + { + "epoch": 6.44139650872818, + "grad_norm": 10.161792755126953, + "learning_rate": 1.3561596009975065e-05, + "loss": 0.344, + "step": 25830 + }, + { + "epoch": 6.443890274314215, + "grad_norm": 6.123249053955078, + "learning_rate": 1.3559102244389028e-05, + "loss": 0.3364, + "step": 25840 + }, + { + "epoch": 6.446384039900249, + "grad_norm": 4.593570709228516, + "learning_rate": 1.3556608478802994e-05, + "loss": 0.4108, + "step": 25850 + }, + { + "epoch": 6.448877805486284, + "grad_norm": 14.478041648864746, + "learning_rate": 1.3554114713216959e-05, + "loss": 0.2916, + "step": 25860 + }, + { + "epoch": 6.451371571072319, + "grad_norm": 9.636341094970703, + "learning_rate": 1.3551620947630924e-05, + "loss": 0.4033, + "step": 25870 + }, + { + "epoch": 6.453865336658354, + "grad_norm": 6.884788513183594, + "learning_rate": 1.354912718204489e-05, + "loss": 0.3478, + "step": 25880 + }, + { + "epoch": 6.456359102244389, + "grad_norm": 6.347228527069092, + "learning_rate": 1.3546633416458855e-05, + "loss": 0.3867, + "step": 25890 + }, + { + "epoch": 6.458852867830424, + "grad_norm": 6.127768516540527, + "learning_rate": 1.3544139650872818e-05, + "loss": 0.3709, + "step": 25900 + }, + { + "epoch": 6.461346633416459, + "grad_norm": 5.719086647033691, + "learning_rate": 1.3541645885286785e-05, + "loss": 0.3649, + "step": 25910 + }, + { + "epoch": 6.4638403990024935, + "grad_norm": 7.747276306152344, + "learning_rate": 1.3539152119700749e-05, + "loss": 0.3425, + "step": 25920 + }, + { + "epoch": 6.466334164588528, + "grad_norm": 7.665780544281006, + "learning_rate": 1.3536658354114713e-05, + "loss": 0.427, + "step": 25930 + }, + { + "epoch": 6.468827930174563, + "grad_norm": 4.468764305114746, + "learning_rate": 1.353416458852868e-05, + "loss": 0.3758, + "step": 25940 + }, + { + "epoch": 6.471321695760598, + "grad_norm": 8.336400985717773, + "learning_rate": 1.3531670822942643e-05, + "loss": 0.3451, + "step": 25950 + }, + { + "epoch": 6.473815461346634, + "grad_norm": 9.712798118591309, + "learning_rate": 1.352917705735661e-05, + "loss": 0.3795, + "step": 25960 + }, + { + "epoch": 6.476309226932669, + "grad_norm": 7.729135990142822, + "learning_rate": 1.3526683291770576e-05, + "loss": 0.3696, + "step": 25970 + }, + { + "epoch": 6.478802992518704, + "grad_norm": 6.578273296356201, + "learning_rate": 1.352418952618454e-05, + "loss": 0.4044, + "step": 25980 + }, + { + "epoch": 6.4812967581047385, + "grad_norm": 11.453405380249023, + "learning_rate": 1.3521695760598506e-05, + "loss": 0.3201, + "step": 25990 + }, + { + "epoch": 6.483790523690773, + "grad_norm": 6.865024566650391, + "learning_rate": 1.351920199501247e-05, + "loss": 0.3353, + "step": 26000 + }, + { + "epoch": 6.486284289276808, + "grad_norm": 6.444002151489258, + "learning_rate": 1.3516708229426433e-05, + "loss": 0.3806, + "step": 26010 + }, + { + "epoch": 6.488778054862843, + "grad_norm": 7.583583831787109, + "learning_rate": 1.35142144638404e-05, + "loss": 0.3622, + "step": 26020 + }, + { + "epoch": 6.491271820448878, + "grad_norm": 7.034835338592529, + "learning_rate": 1.3511720698254364e-05, + "loss": 0.4238, + "step": 26030 + }, + { + "epoch": 6.493765586034913, + "grad_norm": 7.059061050415039, + "learning_rate": 1.3509226932668331e-05, + "loss": 0.3675, + "step": 26040 + }, + { + "epoch": 6.496259351620948, + "grad_norm": 5.760257720947266, + "learning_rate": 1.3506733167082295e-05, + "loss": 0.3814, + "step": 26050 + }, + { + "epoch": 6.498753117206983, + "grad_norm": 6.137911319732666, + "learning_rate": 1.350423940149626e-05, + "loss": 0.3576, + "step": 26060 + }, + { + "epoch": 6.501246882793017, + "grad_norm": 7.457276821136475, + "learning_rate": 1.3501745635910225e-05, + "loss": 0.3941, + "step": 26070 + }, + { + "epoch": 6.503740648379052, + "grad_norm": 7.268845081329346, + "learning_rate": 1.349925187032419e-05, + "loss": 0.3924, + "step": 26080 + }, + { + "epoch": 6.506234413965087, + "grad_norm": 7.861654758453369, + "learning_rate": 1.3496758104738154e-05, + "loss": 0.3578, + "step": 26090 + }, + { + "epoch": 6.508728179551122, + "grad_norm": 8.676166534423828, + "learning_rate": 1.3494264339152121e-05, + "loss": 0.4239, + "step": 26100 + }, + { + "epoch": 6.511221945137157, + "grad_norm": 6.739658355712891, + "learning_rate": 1.3491770573566085e-05, + "loss": 0.4131, + "step": 26110 + }, + { + "epoch": 6.513715710723192, + "grad_norm": 6.20055627822876, + "learning_rate": 1.3489276807980052e-05, + "loss": 0.4278, + "step": 26120 + }, + { + "epoch": 6.516209476309227, + "grad_norm": 6.212075710296631, + "learning_rate": 1.3486783042394015e-05, + "loss": 0.3607, + "step": 26130 + }, + { + "epoch": 6.5187032418952615, + "grad_norm": 10.847241401672363, + "learning_rate": 1.348428927680798e-05, + "loss": 0.3624, + "step": 26140 + }, + { + "epoch": 6.521197007481296, + "grad_norm": 6.094544887542725, + "learning_rate": 1.3481795511221946e-05, + "loss": 0.4353, + "step": 26150 + }, + { + "epoch": 6.523690773067331, + "grad_norm": 7.994797706604004, + "learning_rate": 1.3479301745635911e-05, + "loss": 0.3961, + "step": 26160 + }, + { + "epoch": 6.526184538653366, + "grad_norm": 8.158559799194336, + "learning_rate": 1.3476807980049877e-05, + "loss": 0.3696, + "step": 26170 + }, + { + "epoch": 6.528678304239402, + "grad_norm": 7.66077995300293, + "learning_rate": 1.3474314214463842e-05, + "loss": 0.3302, + "step": 26180 + }, + { + "epoch": 6.531172069825437, + "grad_norm": 6.937994480133057, + "learning_rate": 1.3471820448877806e-05, + "loss": 0.4102, + "step": 26190 + }, + { + "epoch": 6.533665835411472, + "grad_norm": 7.02347993850708, + "learning_rate": 1.3469326683291773e-05, + "loss": 0.4596, + "step": 26200 + }, + { + "epoch": 6.5361596009975065, + "grad_norm": 7.245387077331543, + "learning_rate": 1.3466832917705736e-05, + "loss": 0.3513, + "step": 26210 + }, + { + "epoch": 6.538653366583541, + "grad_norm": 7.9023237228393555, + "learning_rate": 1.3464339152119702e-05, + "loss": 0.3554, + "step": 26220 + }, + { + "epoch": 6.541147132169576, + "grad_norm": 5.199001789093018, + "learning_rate": 1.3461845386533667e-05, + "loss": 0.4321, + "step": 26230 + }, + { + "epoch": 6.543640897755611, + "grad_norm": 5.115858554840088, + "learning_rate": 1.3459351620947632e-05, + "loss": 0.3597, + "step": 26240 + }, + { + "epoch": 6.546134663341646, + "grad_norm": 7.653583526611328, + "learning_rate": 1.3456857855361597e-05, + "loss": 0.4044, + "step": 26250 + }, + { + "epoch": 6.548628428927681, + "grad_norm": 5.315465450286865, + "learning_rate": 1.3454364089775563e-05, + "loss": 0.3737, + "step": 26260 + }, + { + "epoch": 6.551122194513716, + "grad_norm": 6.182226657867432, + "learning_rate": 1.3451870324189526e-05, + "loss": 0.3521, + "step": 26270 + }, + { + "epoch": 6.553615960099751, + "grad_norm": 5.8310546875, + "learning_rate": 1.3449376558603493e-05, + "loss": 0.3277, + "step": 26280 + }, + { + "epoch": 6.556109725685785, + "grad_norm": 6.5603437423706055, + "learning_rate": 1.3446882793017457e-05, + "loss": 0.3724, + "step": 26290 + }, + { + "epoch": 6.55860349127182, + "grad_norm": 7.382344722747803, + "learning_rate": 1.344438902743142e-05, + "loss": 0.3898, + "step": 26300 + }, + { + "epoch": 6.561097256857855, + "grad_norm": 5.281245708465576, + "learning_rate": 1.3441895261845388e-05, + "loss": 0.3697, + "step": 26310 + }, + { + "epoch": 6.56359102244389, + "grad_norm": 6.038678169250488, + "learning_rate": 1.3439401496259353e-05, + "loss": 0.3634, + "step": 26320 + }, + { + "epoch": 6.566084788029925, + "grad_norm": 8.70938491821289, + "learning_rate": 1.3436907730673318e-05, + "loss": 0.3867, + "step": 26330 + }, + { + "epoch": 6.56857855361596, + "grad_norm": 7.3120598793029785, + "learning_rate": 1.3434413965087284e-05, + "loss": 0.3791, + "step": 26340 + }, + { + "epoch": 6.571072319201995, + "grad_norm": 6.2208428382873535, + "learning_rate": 1.3431920199501247e-05, + "loss": 0.3998, + "step": 26350 + }, + { + "epoch": 6.57356608478803, + "grad_norm": 6.549139499664307, + "learning_rate": 1.3429426433915214e-05, + "loss": 0.3935, + "step": 26360 + }, + { + "epoch": 6.576059850374065, + "grad_norm": 4.998687267303467, + "learning_rate": 1.3426932668329178e-05, + "loss": 0.41, + "step": 26370 + }, + { + "epoch": 6.5785536159601, + "grad_norm": 5.387908458709717, + "learning_rate": 1.3424438902743145e-05, + "loss": 0.3421, + "step": 26380 + }, + { + "epoch": 6.581047381546135, + "grad_norm": 7.001126766204834, + "learning_rate": 1.3421945137157108e-05, + "loss": 0.3603, + "step": 26390 + }, + { + "epoch": 6.58354114713217, + "grad_norm": 10.226015090942383, + "learning_rate": 1.3419451371571072e-05, + "loss": 0.3955, + "step": 26400 + }, + { + "epoch": 6.586034912718205, + "grad_norm": 6.176059722900391, + "learning_rate": 1.3416957605985039e-05, + "loss": 0.302, + "step": 26410 + }, + { + "epoch": 6.58852867830424, + "grad_norm": 5.293604373931885, + "learning_rate": 1.3414463840399003e-05, + "loss": 0.3701, + "step": 26420 + }, + { + "epoch": 6.5910224438902745, + "grad_norm": 8.43692398071289, + "learning_rate": 1.3411970074812968e-05, + "loss": 0.3852, + "step": 26430 + }, + { + "epoch": 6.593516209476309, + "grad_norm": 7.703579425811768, + "learning_rate": 1.3409476309226935e-05, + "loss": 0.4242, + "step": 26440 + }, + { + "epoch": 6.596009975062344, + "grad_norm": 9.26103687286377, + "learning_rate": 1.3406982543640899e-05, + "loss": 0.3328, + "step": 26450 + }, + { + "epoch": 6.598503740648379, + "grad_norm": 7.32110071182251, + "learning_rate": 1.3404488778054866e-05, + "loss": 0.3679, + "step": 26460 + }, + { + "epoch": 6.600997506234414, + "grad_norm": 6.620180130004883, + "learning_rate": 1.340199501246883e-05, + "loss": 0.4228, + "step": 26470 + }, + { + "epoch": 6.603491271820449, + "grad_norm": 7.809380531311035, + "learning_rate": 1.3399501246882793e-05, + "loss": 0.4333, + "step": 26480 + }, + { + "epoch": 6.605985037406484, + "grad_norm": 5.4282331466674805, + "learning_rate": 1.339700748129676e-05, + "loss": 0.4154, + "step": 26490 + }, + { + "epoch": 6.6084788029925186, + "grad_norm": 6.508066177368164, + "learning_rate": 1.3394513715710723e-05, + "loss": 0.3998, + "step": 26500 + }, + { + "epoch": 6.610972568578553, + "grad_norm": 9.248714447021484, + "learning_rate": 1.3392019950124689e-05, + "loss": 0.3763, + "step": 26510 + }, + { + "epoch": 6.613466334164588, + "grad_norm": 7.227468967437744, + "learning_rate": 1.3389526184538654e-05, + "loss": 0.3659, + "step": 26520 + }, + { + "epoch": 6.615960099750623, + "grad_norm": 7.72911262512207, + "learning_rate": 1.338703241895262e-05, + "loss": 0.4427, + "step": 26530 + }, + { + "epoch": 6.618453865336658, + "grad_norm": 5.077463626861572, + "learning_rate": 1.3384538653366585e-05, + "loss": 0.3655, + "step": 26540 + }, + { + "epoch": 6.620947630922693, + "grad_norm": 5.688047885894775, + "learning_rate": 1.338204488778055e-05, + "loss": 0.3803, + "step": 26550 + }, + { + "epoch": 6.623441396508728, + "grad_norm": 5.621146202087402, + "learning_rate": 1.3379551122194514e-05, + "loss": 0.3938, + "step": 26560 + }, + { + "epoch": 6.625935162094763, + "grad_norm": 7.967491626739502, + "learning_rate": 1.337705735660848e-05, + "loss": 0.3353, + "step": 26570 + }, + { + "epoch": 6.628428927680798, + "grad_norm": 10.237777709960938, + "learning_rate": 1.3374563591022444e-05, + "loss": 0.3249, + "step": 26580 + }, + { + "epoch": 6.630922693266833, + "grad_norm": 13.385808944702148, + "learning_rate": 1.337206982543641e-05, + "loss": 0.4134, + "step": 26590 + }, + { + "epoch": 6.633416458852868, + "grad_norm": 4.2393341064453125, + "learning_rate": 1.3369576059850375e-05, + "loss": 0.4, + "step": 26600 + }, + { + "epoch": 6.635910224438903, + "grad_norm": 4.3933281898498535, + "learning_rate": 1.336708229426434e-05, + "loss": 0.3216, + "step": 26610 + }, + { + "epoch": 6.638403990024938, + "grad_norm": 4.819437503814697, + "learning_rate": 1.3364588528678305e-05, + "loss": 0.3578, + "step": 26620 + }, + { + "epoch": 6.640897755610973, + "grad_norm": 4.126856803894043, + "learning_rate": 1.336209476309227e-05, + "loss": 0.3625, + "step": 26630 + }, + { + "epoch": 6.643391521197008, + "grad_norm": 5.08698034286499, + "learning_rate": 1.3359600997506234e-05, + "loss": 0.417, + "step": 26640 + }, + { + "epoch": 6.6458852867830425, + "grad_norm": 5.38383674621582, + "learning_rate": 1.3357107231920201e-05, + "loss": 0.356, + "step": 26650 + }, + { + "epoch": 6.648379052369077, + "grad_norm": 8.648747444152832, + "learning_rate": 1.3354613466334165e-05, + "loss": 0.3886, + "step": 26660 + }, + { + "epoch": 6.650872817955112, + "grad_norm": 4.306643486022949, + "learning_rate": 1.3352119700748132e-05, + "loss": 0.3493, + "step": 26670 + }, + { + "epoch": 6.653366583541147, + "grad_norm": 6.597197532653809, + "learning_rate": 1.3349625935162096e-05, + "loss": 0.372, + "step": 26680 + }, + { + "epoch": 6.655860349127182, + "grad_norm": 6.69064474105835, + "learning_rate": 1.3347132169576061e-05, + "loss": 0.3965, + "step": 26690 + }, + { + "epoch": 6.658354114713217, + "grad_norm": 7.140894412994385, + "learning_rate": 1.3344638403990026e-05, + "loss": 0.3822, + "step": 26700 + }, + { + "epoch": 6.660847880299252, + "grad_norm": 5.859182357788086, + "learning_rate": 1.3342144638403992e-05, + "loss": 0.3458, + "step": 26710 + }, + { + "epoch": 6.6633416458852865, + "grad_norm": 6.949796199798584, + "learning_rate": 1.3339650872817955e-05, + "loss": 0.4071, + "step": 26720 + }, + { + "epoch": 6.665835411471321, + "grad_norm": 8.996414184570312, + "learning_rate": 1.3337157107231922e-05, + "loss": 0.3914, + "step": 26730 + }, + { + "epoch": 6.668329177057356, + "grad_norm": 9.598322868347168, + "learning_rate": 1.3334663341645886e-05, + "loss": 0.3846, + "step": 26740 + }, + { + "epoch": 6.670822942643391, + "grad_norm": 5.436387062072754, + "learning_rate": 1.3332169576059853e-05, + "loss": 0.3324, + "step": 26750 + }, + { + "epoch": 6.673316708229427, + "grad_norm": 8.324487686157227, + "learning_rate": 1.3329675810473816e-05, + "loss": 0.3996, + "step": 26760 + }, + { + "epoch": 6.675810473815462, + "grad_norm": 9.037124633789062, + "learning_rate": 1.3327182044887782e-05, + "loss": 0.4561, + "step": 26770 + }, + { + "epoch": 6.678304239401497, + "grad_norm": 7.689178466796875, + "learning_rate": 1.3324688279301747e-05, + "loss": 0.4511, + "step": 26780 + }, + { + "epoch": 6.6807980049875315, + "grad_norm": 6.272636890411377, + "learning_rate": 1.3322194513715712e-05, + "loss": 0.4475, + "step": 26790 + }, + { + "epoch": 6.683291770573566, + "grad_norm": 4.632606029510498, + "learning_rate": 1.3319700748129676e-05, + "loss": 0.4122, + "step": 26800 + }, + { + "epoch": 6.685785536159601, + "grad_norm": 7.478541851043701, + "learning_rate": 1.3317206982543643e-05, + "loss": 0.4107, + "step": 26810 + }, + { + "epoch": 6.688279301745636, + "grad_norm": 10.429938316345215, + "learning_rate": 1.3314713216957607e-05, + "loss": 0.4623, + "step": 26820 + }, + { + "epoch": 6.690773067331671, + "grad_norm": 7.3426995277404785, + "learning_rate": 1.3312219451371574e-05, + "loss": 0.4568, + "step": 26830 + }, + { + "epoch": 6.693266832917706, + "grad_norm": 7.244011402130127, + "learning_rate": 1.3309725685785537e-05, + "loss": 0.3298, + "step": 26840 + }, + { + "epoch": 6.695760598503741, + "grad_norm": 9.129273414611816, + "learning_rate": 1.33072319201995e-05, + "loss": 0.3867, + "step": 26850 + }, + { + "epoch": 6.698254364089776, + "grad_norm": 7.908605575561523, + "learning_rate": 1.3304738154613468e-05, + "loss": 0.394, + "step": 26860 + }, + { + "epoch": 6.7007481296758105, + "grad_norm": 5.406508445739746, + "learning_rate": 1.3302244389027431e-05, + "loss": 0.3463, + "step": 26870 + }, + { + "epoch": 6.703241895261845, + "grad_norm": 7.902821063995361, + "learning_rate": 1.3299750623441398e-05, + "loss": 0.4256, + "step": 26880 + }, + { + "epoch": 6.70573566084788, + "grad_norm": 5.451107978820801, + "learning_rate": 1.3297256857855362e-05, + "loss": 0.3999, + "step": 26890 + }, + { + "epoch": 6.708229426433915, + "grad_norm": 4.476869106292725, + "learning_rate": 1.3294763092269327e-05, + "loss": 0.3836, + "step": 26900 + }, + { + "epoch": 6.71072319201995, + "grad_norm": 6.899260520935059, + "learning_rate": 1.3292269326683294e-05, + "loss": 0.4067, + "step": 26910 + }, + { + "epoch": 6.713216957605985, + "grad_norm": 8.896599769592285, + "learning_rate": 1.3289775561097258e-05, + "loss": 0.4425, + "step": 26920 + }, + { + "epoch": 6.71571072319202, + "grad_norm": 7.202389717102051, + "learning_rate": 1.3287281795511222e-05, + "loss": 0.3334, + "step": 26930 + }, + { + "epoch": 6.7182044887780545, + "grad_norm": 5.247492790222168, + "learning_rate": 1.3284788029925189e-05, + "loss": 0.4, + "step": 26940 + }, + { + "epoch": 6.720698254364089, + "grad_norm": 10.521095275878906, + "learning_rate": 1.3282294264339152e-05, + "loss": 0.3652, + "step": 26950 + }, + { + "epoch": 6.723192019950124, + "grad_norm": 5.5208611488342285, + "learning_rate": 1.327980049875312e-05, + "loss": 0.4477, + "step": 26960 + }, + { + "epoch": 6.725685785536159, + "grad_norm": 8.8683500289917, + "learning_rate": 1.3277306733167083e-05, + "loss": 0.3449, + "step": 26970 + }, + { + "epoch": 6.728179551122195, + "grad_norm": 5.689904689788818, + "learning_rate": 1.3274812967581048e-05, + "loss": 0.4239, + "step": 26980 + }, + { + "epoch": 6.73067331670823, + "grad_norm": 7.4817328453063965, + "learning_rate": 1.3272319201995013e-05, + "loss": 0.4688, + "step": 26990 + }, + { + "epoch": 6.733167082294265, + "grad_norm": 8.645492553710938, + "learning_rate": 1.3269825436408979e-05, + "loss": 0.415, + "step": 27000 + }, + { + "epoch": 6.7356608478802995, + "grad_norm": 4.103111743927002, + "learning_rate": 1.3267331670822942e-05, + "loss": 0.3628, + "step": 27010 + }, + { + "epoch": 6.738154613466334, + "grad_norm": 6.457451820373535, + "learning_rate": 1.326483790523691e-05, + "loss": 0.4813, + "step": 27020 + }, + { + "epoch": 6.740648379052369, + "grad_norm": 11.222722053527832, + "learning_rate": 1.3262344139650873e-05, + "loss": 0.3365, + "step": 27030 + }, + { + "epoch": 6.743142144638404, + "grad_norm": 5.118739128112793, + "learning_rate": 1.325985037406484e-05, + "loss": 0.35, + "step": 27040 + }, + { + "epoch": 6.745635910224439, + "grad_norm": 8.133965492248535, + "learning_rate": 1.3257356608478804e-05, + "loss": 0.3907, + "step": 27050 + }, + { + "epoch": 6.748129675810474, + "grad_norm": 7.104856491088867, + "learning_rate": 1.3254862842892769e-05, + "loss": 0.4105, + "step": 27060 + }, + { + "epoch": 6.750623441396509, + "grad_norm": 5.757190227508545, + "learning_rate": 1.3252369077306734e-05, + "loss": 0.4462, + "step": 27070 + }, + { + "epoch": 6.753117206982544, + "grad_norm": 8.72276782989502, + "learning_rate": 1.32498753117207e-05, + "loss": 0.344, + "step": 27080 + }, + { + "epoch": 6.7556109725685785, + "grad_norm": 3.8614084720611572, + "learning_rate": 1.3247381546134663e-05, + "loss": 0.2905, + "step": 27090 + }, + { + "epoch": 6.758104738154613, + "grad_norm": 6.985355377197266, + "learning_rate": 1.324488778054863e-05, + "loss": 0.3524, + "step": 27100 + }, + { + "epoch": 6.760598503740648, + "grad_norm": 5.594140529632568, + "learning_rate": 1.3242394014962594e-05, + "loss": 0.4011, + "step": 27110 + }, + { + "epoch": 6.763092269326683, + "grad_norm": 7.396782875061035, + "learning_rate": 1.323990024937656e-05, + "loss": 0.378, + "step": 27120 + }, + { + "epoch": 6.765586034912718, + "grad_norm": 7.146440505981445, + "learning_rate": 1.3237406483790524e-05, + "loss": 0.3565, + "step": 27130 + }, + { + "epoch": 6.768079800498753, + "grad_norm": 4.529737949371338, + "learning_rate": 1.323491271820449e-05, + "loss": 0.3357, + "step": 27140 + }, + { + "epoch": 6.770573566084788, + "grad_norm": 5.635150909423828, + "learning_rate": 1.3232418952618455e-05, + "loss": 0.4166, + "step": 27150 + }, + { + "epoch": 6.773067331670823, + "grad_norm": 7.09439754486084, + "learning_rate": 1.322992518703242e-05, + "loss": 0.434, + "step": 27160 + }, + { + "epoch": 6.775561097256858, + "grad_norm": 7.409408092498779, + "learning_rate": 1.3227431421446386e-05, + "loss": 0.3876, + "step": 27170 + }, + { + "epoch": 6.778054862842893, + "grad_norm": 7.326381683349609, + "learning_rate": 1.3224937655860351e-05, + "loss": 0.3933, + "step": 27180 + }, + { + "epoch": 6.780548628428928, + "grad_norm": 6.994472980499268, + "learning_rate": 1.3222443890274315e-05, + "loss": 0.3826, + "step": 27190 + }, + { + "epoch": 6.783042394014963, + "grad_norm": 6.760201454162598, + "learning_rate": 1.3219950124688282e-05, + "loss": 0.4412, + "step": 27200 + }, + { + "epoch": 6.785536159600998, + "grad_norm": 5.957584381103516, + "learning_rate": 1.3217456359102245e-05, + "loss": 0.3916, + "step": 27210 + }, + { + "epoch": 6.788029925187033, + "grad_norm": 8.132131576538086, + "learning_rate": 1.3214962593516209e-05, + "loss": 0.4124, + "step": 27220 + }, + { + "epoch": 6.7905236907730675, + "grad_norm": 7.923279762268066, + "learning_rate": 1.3212468827930176e-05, + "loss": 0.3725, + "step": 27230 + }, + { + "epoch": 6.793017456359102, + "grad_norm": 8.802374839782715, + "learning_rate": 1.3209975062344141e-05, + "loss": 0.332, + "step": 27240 + }, + { + "epoch": 6.795511221945137, + "grad_norm": 8.764188766479492, + "learning_rate": 1.3207481296758106e-05, + "loss": 0.4053, + "step": 27250 + }, + { + "epoch": 6.798004987531172, + "grad_norm": 7.860824108123779, + "learning_rate": 1.3204987531172072e-05, + "loss": 0.4145, + "step": 27260 + }, + { + "epoch": 6.800498753117207, + "grad_norm": 5.447219371795654, + "learning_rate": 1.3202493765586035e-05, + "loss": 0.3885, + "step": 27270 + }, + { + "epoch": 6.802992518703242, + "grad_norm": 4.570130348205566, + "learning_rate": 1.3200000000000002e-05, + "loss": 0.3414, + "step": 27280 + }, + { + "epoch": 6.805486284289277, + "grad_norm": 5.973016262054443, + "learning_rate": 1.3197506234413966e-05, + "loss": 0.3436, + "step": 27290 + }, + { + "epoch": 6.807980049875312, + "grad_norm": 6.451689720153809, + "learning_rate": 1.319501246882793e-05, + "loss": 0.3842, + "step": 27300 + }, + { + "epoch": 6.8104738154613464, + "grad_norm": 6.162374496459961, + "learning_rate": 1.3192518703241897e-05, + "loss": 0.386, + "step": 27310 + }, + { + "epoch": 6.812967581047381, + "grad_norm": 7.707452774047852, + "learning_rate": 1.319002493765586e-05, + "loss": 0.4001, + "step": 27320 + }, + { + "epoch": 6.815461346633416, + "grad_norm": 5.568814754486084, + "learning_rate": 1.3187531172069827e-05, + "loss": 0.4183, + "step": 27330 + }, + { + "epoch": 6.817955112219451, + "grad_norm": 6.596290588378906, + "learning_rate": 1.318503740648379e-05, + "loss": 0.4037, + "step": 27340 + }, + { + "epoch": 6.820448877805486, + "grad_norm": 6.735906600952148, + "learning_rate": 1.3182543640897756e-05, + "loss": 0.3821, + "step": 27350 + }, + { + "epoch": 6.822942643391521, + "grad_norm": 5.793524265289307, + "learning_rate": 1.3180049875311723e-05, + "loss": 0.3668, + "step": 27360 + }, + { + "epoch": 6.825436408977556, + "grad_norm": 5.449307918548584, + "learning_rate": 1.3177556109725687e-05, + "loss": 0.3029, + "step": 27370 + }, + { + "epoch": 6.8279301745635905, + "grad_norm": 7.30305814743042, + "learning_rate": 1.3175062344139654e-05, + "loss": 0.3419, + "step": 27380 + }, + { + "epoch": 6.830423940149626, + "grad_norm": 6.304130554199219, + "learning_rate": 1.3172568578553617e-05, + "loss": 0.3157, + "step": 27390 + }, + { + "epoch": 6.832917705735661, + "grad_norm": 4.693641185760498, + "learning_rate": 1.3170074812967581e-05, + "loss": 0.3667, + "step": 27400 + }, + { + "epoch": 6.835411471321696, + "grad_norm": 13.741803169250488, + "learning_rate": 1.3167581047381548e-05, + "loss": 0.3824, + "step": 27410 + }, + { + "epoch": 6.837905236907731, + "grad_norm": 5.390978813171387, + "learning_rate": 1.3165087281795512e-05, + "loss": 0.3571, + "step": 27420 + }, + { + "epoch": 6.840399002493766, + "grad_norm": 8.878130912780762, + "learning_rate": 1.3162593516209477e-05, + "loss": 0.3623, + "step": 27430 + }, + { + "epoch": 6.842892768079801, + "grad_norm": 9.977058410644531, + "learning_rate": 1.3160099750623442e-05, + "loss": 0.3624, + "step": 27440 + }, + { + "epoch": 6.8453865336658355, + "grad_norm": 4.6882452964782715, + "learning_rate": 1.3157605985037408e-05, + "loss": 0.3141, + "step": 27450 + }, + { + "epoch": 6.84788029925187, + "grad_norm": 14.156923294067383, + "learning_rate": 1.3155112219451373e-05, + "loss": 0.4383, + "step": 27460 + }, + { + "epoch": 6.850374064837905, + "grad_norm": 5.467523097991943, + "learning_rate": 1.3152618453865338e-05, + "loss": 0.367, + "step": 27470 + }, + { + "epoch": 6.85286783042394, + "grad_norm": 5.007584571838379, + "learning_rate": 1.3150124688279302e-05, + "loss": 0.3777, + "step": 27480 + }, + { + "epoch": 6.855361596009975, + "grad_norm": 9.654122352600098, + "learning_rate": 1.3147630922693269e-05, + "loss": 0.3868, + "step": 27490 + }, + { + "epoch": 6.85785536159601, + "grad_norm": 8.467918395996094, + "learning_rate": 1.3145137157107232e-05, + "loss": 0.3602, + "step": 27500 + }, + { + "epoch": 6.860349127182045, + "grad_norm": 8.106925010681152, + "learning_rate": 1.3142643391521198e-05, + "loss": 0.3881, + "step": 27510 + }, + { + "epoch": 6.86284289276808, + "grad_norm": 4.681951999664307, + "learning_rate": 1.3140149625935163e-05, + "loss": 0.4089, + "step": 27520 + }, + { + "epoch": 6.865336658354114, + "grad_norm": 10.276632308959961, + "learning_rate": 1.3137655860349128e-05, + "loss": 0.4051, + "step": 27530 + }, + { + "epoch": 6.867830423940149, + "grad_norm": 6.851044178009033, + "learning_rate": 1.3135162094763094e-05, + "loss": 0.3344, + "step": 27540 + }, + { + "epoch": 6.870324189526184, + "grad_norm": 8.850982666015625, + "learning_rate": 1.3132668329177059e-05, + "loss": 0.4303, + "step": 27550 + }, + { + "epoch": 6.87281795511222, + "grad_norm": 6.095160007476807, + "learning_rate": 1.3130174563591023e-05, + "loss": 0.4008, + "step": 27560 + }, + { + "epoch": 6.875311720698255, + "grad_norm": 4.623822212219238, + "learning_rate": 1.312768079800499e-05, + "loss": 0.4192, + "step": 27570 + }, + { + "epoch": 6.87780548628429, + "grad_norm": 4.292810440063477, + "learning_rate": 1.3125187032418953e-05, + "loss": 0.3421, + "step": 27580 + }, + { + "epoch": 6.8802992518703245, + "grad_norm": 5.254152774810791, + "learning_rate": 1.312269326683292e-05, + "loss": 0.4478, + "step": 27590 + }, + { + "epoch": 6.882793017456359, + "grad_norm": 10.666980743408203, + "learning_rate": 1.3120199501246884e-05, + "loss": 0.347, + "step": 27600 + }, + { + "epoch": 6.885286783042394, + "grad_norm": 5.066367149353027, + "learning_rate": 1.3117705735660849e-05, + "loss": 0.3701, + "step": 27610 + }, + { + "epoch": 6.887780548628429, + "grad_norm": 7.213781356811523, + "learning_rate": 1.3115211970074814e-05, + "loss": 0.4525, + "step": 27620 + }, + { + "epoch": 6.890274314214464, + "grad_norm": 8.0278959274292, + "learning_rate": 1.311271820448878e-05, + "loss": 0.4072, + "step": 27630 + }, + { + "epoch": 6.892768079800499, + "grad_norm": 4.485868453979492, + "learning_rate": 1.3110224438902743e-05, + "loss": 0.3232, + "step": 27640 + }, + { + "epoch": 6.895261845386534, + "grad_norm": 7.174385070800781, + "learning_rate": 1.310773067331671e-05, + "loss": 0.3655, + "step": 27650 + }, + { + "epoch": 6.897755610972569, + "grad_norm": 3.9336626529693604, + "learning_rate": 1.3105236907730674e-05, + "loss": 0.3454, + "step": 27660 + }, + { + "epoch": 6.9002493765586035, + "grad_norm": 6.965961933135986, + "learning_rate": 1.3102743142144641e-05, + "loss": 0.37, + "step": 27670 + }, + { + "epoch": 6.902743142144638, + "grad_norm": 6.3173675537109375, + "learning_rate": 1.3100249376558605e-05, + "loss": 0.3578, + "step": 27680 + }, + { + "epoch": 6.905236907730673, + "grad_norm": 5.358870983123779, + "learning_rate": 1.3097755610972568e-05, + "loss": 0.399, + "step": 27690 + }, + { + "epoch": 6.907730673316708, + "grad_norm": 7.68316125869751, + "learning_rate": 1.3095261845386535e-05, + "loss": 0.409, + "step": 27700 + }, + { + "epoch": 6.910224438902743, + "grad_norm": 7.902895927429199, + "learning_rate": 1.30927680798005e-05, + "loss": 0.3588, + "step": 27710 + }, + { + "epoch": 6.912718204488778, + "grad_norm": 5.715063571929932, + "learning_rate": 1.3090274314214464e-05, + "loss": 0.3029, + "step": 27720 + }, + { + "epoch": 6.915211970074813, + "grad_norm": 6.375357627868652, + "learning_rate": 1.3087780548628431e-05, + "loss": 0.3668, + "step": 27730 + }, + { + "epoch": 6.917705735660848, + "grad_norm": 8.482056617736816, + "learning_rate": 1.3085286783042395e-05, + "loss": 0.3974, + "step": 27740 + }, + { + "epoch": 6.920199501246882, + "grad_norm": 7.30125617980957, + "learning_rate": 1.3082793017456362e-05, + "loss": 0.368, + "step": 27750 + }, + { + "epoch": 6.922693266832917, + "grad_norm": 8.676459312438965, + "learning_rate": 1.3080299251870325e-05, + "loss": 0.4905, + "step": 27760 + }, + { + "epoch": 6.925187032418952, + "grad_norm": 6.748859405517578, + "learning_rate": 1.3077805486284289e-05, + "loss": 0.4028, + "step": 27770 + }, + { + "epoch": 6.927680798004987, + "grad_norm": 5.195030212402344, + "learning_rate": 1.3075311720698256e-05, + "loss": 0.3717, + "step": 27780 + }, + { + "epoch": 6.930174563591023, + "grad_norm": 7.407238960266113, + "learning_rate": 1.307281795511222e-05, + "loss": 0.3701, + "step": 27790 + }, + { + "epoch": 6.932668329177058, + "grad_norm": 9.330326080322266, + "learning_rate": 1.3070324189526185e-05, + "loss": 0.3282, + "step": 27800 + }, + { + "epoch": 6.9351620947630925, + "grad_norm": 6.410877704620361, + "learning_rate": 1.306783042394015e-05, + "loss": 0.4224, + "step": 27810 + }, + { + "epoch": 6.937655860349127, + "grad_norm": 8.255925178527832, + "learning_rate": 1.3065336658354116e-05, + "loss": 0.4419, + "step": 27820 + }, + { + "epoch": 6.940149625935162, + "grad_norm": 5.41157341003418, + "learning_rate": 1.3062842892768083e-05, + "loss": 0.3181, + "step": 27830 + }, + { + "epoch": 6.942643391521197, + "grad_norm": 6.762349605560303, + "learning_rate": 1.3060349127182046e-05, + "loss": 0.2942, + "step": 27840 + }, + { + "epoch": 6.945137157107232, + "grad_norm": 6.1585588455200195, + "learning_rate": 1.305785536159601e-05, + "loss": 0.3983, + "step": 27850 + }, + { + "epoch": 6.947630922693267, + "grad_norm": 6.742948532104492, + "learning_rate": 1.3055361596009977e-05, + "loss": 0.3911, + "step": 27860 + }, + { + "epoch": 6.950124688279302, + "grad_norm": 9.091787338256836, + "learning_rate": 1.305286783042394e-05, + "loss": 0.3564, + "step": 27870 + }, + { + "epoch": 6.952618453865337, + "grad_norm": 8.21529769897461, + "learning_rate": 1.3050374064837907e-05, + "loss": 0.3723, + "step": 27880 + }, + { + "epoch": 6.9551122194513715, + "grad_norm": 5.098814487457275, + "learning_rate": 1.3047880299251871e-05, + "loss": 0.2938, + "step": 27890 + }, + { + "epoch": 6.957605985037406, + "grad_norm": 7.9842000007629395, + "learning_rate": 1.3045386533665836e-05, + "loss": 0.3766, + "step": 27900 + }, + { + "epoch": 6.960099750623441, + "grad_norm": 6.199515342712402, + "learning_rate": 1.3042892768079802e-05, + "loss": 0.4068, + "step": 27910 + }, + { + "epoch": 6.962593516209476, + "grad_norm": 5.990458011627197, + "learning_rate": 1.3040399002493767e-05, + "loss": 0.3514, + "step": 27920 + }, + { + "epoch": 6.965087281795511, + "grad_norm": 6.094311714172363, + "learning_rate": 1.303790523690773e-05, + "loss": 0.3625, + "step": 27930 + }, + { + "epoch": 6.967581047381546, + "grad_norm": 6.9624481201171875, + "learning_rate": 1.3035411471321698e-05, + "loss": 0.4484, + "step": 27940 + }, + { + "epoch": 6.970074812967581, + "grad_norm": 6.599002838134766, + "learning_rate": 1.3032917705735661e-05, + "loss": 0.4132, + "step": 27950 + }, + { + "epoch": 6.9725685785536164, + "grad_norm": 5.155543327331543, + "learning_rate": 1.3030423940149628e-05, + "loss": 0.3469, + "step": 27960 + }, + { + "epoch": 6.975062344139651, + "grad_norm": 6.06123685836792, + "learning_rate": 1.3027930174563592e-05, + "loss": 0.4548, + "step": 27970 + }, + { + "epoch": 6.977556109725686, + "grad_norm": 8.714743614196777, + "learning_rate": 1.3025436408977557e-05, + "loss": 0.4578, + "step": 27980 + }, + { + "epoch": 6.980049875311721, + "grad_norm": 5.861020565032959, + "learning_rate": 1.3023192019950125e-05, + "loss": 0.337, + "step": 27990 + }, + { + "epoch": 6.982543640897756, + "grad_norm": 6.951167106628418, + "learning_rate": 1.302069825436409e-05, + "loss": 0.3824, + "step": 28000 + }, + { + "epoch": 6.985037406483791, + "grad_norm": 3.6698153018951416, + "learning_rate": 1.3018204488778056e-05, + "loss": 0.3376, + "step": 28010 + }, + { + "epoch": 6.987531172069826, + "grad_norm": 3.0680627822875977, + "learning_rate": 1.3015710723192021e-05, + "loss": 0.3163, + "step": 28020 + }, + { + "epoch": 6.9900249376558605, + "grad_norm": 11.675896644592285, + "learning_rate": 1.3013216957605985e-05, + "loss": 0.3303, + "step": 28030 + }, + { + "epoch": 6.992518703241895, + "grad_norm": 4.275930404663086, + "learning_rate": 1.3010723192019952e-05, + "loss": 0.3455, + "step": 28040 + }, + { + "epoch": 6.99501246882793, + "grad_norm": 7.872333526611328, + "learning_rate": 1.3008229426433916e-05, + "loss": 0.3802, + "step": 28050 + }, + { + "epoch": 6.997506234413965, + "grad_norm": 8.095891952514648, + "learning_rate": 1.3005735660847883e-05, + "loss": 0.4079, + "step": 28060 + }, + { + "epoch": 7.0, + "grad_norm": 8.841901779174805, + "learning_rate": 1.3003241895261846e-05, + "loss": 0.4457, + "step": 28070 + }, + { + "epoch": 7.0, + "eval_loss": 0.4166116714477539, + "eval_runtime": 59.9481, + "eval_samples_per_second": 16.731, + "eval_steps_per_second": 16.731, + "step": 28070 + }, + { + "epoch": 7.002493765586035, + "grad_norm": 10.384051322937012, + "learning_rate": 1.3000748129675811e-05, + "loss": 0.3682, + "step": 28080 + }, + { + "epoch": 7.00498753117207, + "grad_norm": 5.374267578125, + "learning_rate": 1.2998254364089777e-05, + "loss": 0.4221, + "step": 28090 + }, + { + "epoch": 7.007481296758105, + "grad_norm": 11.389906883239746, + "learning_rate": 1.2995760598503742e-05, + "loss": 0.3828, + "step": 28100 + }, + { + "epoch": 7.0099750623441395, + "grad_norm": 7.218733787536621, + "learning_rate": 1.2993266832917706e-05, + "loss": 0.3267, + "step": 28110 + }, + { + "epoch": 7.012468827930174, + "grad_norm": 5.1964263916015625, + "learning_rate": 1.2990773067331673e-05, + "loss": 0.3566, + "step": 28120 + }, + { + "epoch": 7.014962593516209, + "grad_norm": 9.020042419433594, + "learning_rate": 1.2988279301745636e-05, + "loss": 0.346, + "step": 28130 + }, + { + "epoch": 7.017456359102244, + "grad_norm": 13.949241638183594, + "learning_rate": 1.2985785536159603e-05, + "loss": 0.31, + "step": 28140 + }, + { + "epoch": 7.019950124688279, + "grad_norm": 6.440070152282715, + "learning_rate": 1.2983291770573567e-05, + "loss": 0.3707, + "step": 28150 + }, + { + "epoch": 7.022443890274314, + "grad_norm": 6.926358222961426, + "learning_rate": 1.2980798004987532e-05, + "loss": 0.3829, + "step": 28160 + }, + { + "epoch": 7.024937655860349, + "grad_norm": 6.67525053024292, + "learning_rate": 1.2978304239401498e-05, + "loss": 0.3468, + "step": 28170 + }, + { + "epoch": 7.027431421446384, + "grad_norm": 6.060240745544434, + "learning_rate": 1.2976059850374066e-05, + "loss": 0.3461, + "step": 28180 + }, + { + "epoch": 7.029925187032419, + "grad_norm": 9.439708709716797, + "learning_rate": 1.2973566084788031e-05, + "loss": 0.4009, + "step": 28190 + }, + { + "epoch": 7.032418952618454, + "grad_norm": 3.421870708465576, + "learning_rate": 1.2971072319201996e-05, + "loss": 0.3896, + "step": 28200 + }, + { + "epoch": 7.034912718204489, + "grad_norm": 5.836117744445801, + "learning_rate": 1.296857855361596e-05, + "loss": 0.3461, + "step": 28210 + }, + { + "epoch": 7.037406483790524, + "grad_norm": 7.276211738586426, + "learning_rate": 1.2966084788029927e-05, + "loss": 0.3559, + "step": 28220 + }, + { + "epoch": 7.039900249376559, + "grad_norm": 5.882770538330078, + "learning_rate": 1.296359102244389e-05, + "loss": 0.294, + "step": 28230 + }, + { + "epoch": 7.042394014962594, + "grad_norm": 4.949868679046631, + "learning_rate": 1.2961097256857858e-05, + "loss": 0.3668, + "step": 28240 + }, + { + "epoch": 7.0448877805486285, + "grad_norm": 5.131392955780029, + "learning_rate": 1.2958603491271821e-05, + "loss": 0.3404, + "step": 28250 + }, + { + "epoch": 7.047381546134663, + "grad_norm": 6.9679856300354, + "learning_rate": 1.2956109725685787e-05, + "loss": 0.4193, + "step": 28260 + }, + { + "epoch": 7.049875311720698, + "grad_norm": 6.4561767578125, + "learning_rate": 1.2953615960099752e-05, + "loss": 0.3525, + "step": 28270 + }, + { + "epoch": 7.052369077306733, + "grad_norm": 7.776483058929443, + "learning_rate": 1.2951122194513717e-05, + "loss": 0.3326, + "step": 28280 + }, + { + "epoch": 7.054862842892768, + "grad_norm": 5.461517810821533, + "learning_rate": 1.2948628428927681e-05, + "loss": 0.3554, + "step": 28290 + }, + { + "epoch": 7.057356608478803, + "grad_norm": 7.675981521606445, + "learning_rate": 1.2946134663341648e-05, + "loss": 0.4105, + "step": 28300 + }, + { + "epoch": 7.059850374064838, + "grad_norm": 3.51208233833313, + "learning_rate": 1.2943640897755611e-05, + "loss": 0.3293, + "step": 28310 + }, + { + "epoch": 7.062344139650873, + "grad_norm": 5.394303321838379, + "learning_rate": 1.2941147132169578e-05, + "loss": 0.4108, + "step": 28320 + }, + { + "epoch": 7.0648379052369075, + "grad_norm": 5.80884313583374, + "learning_rate": 1.2938653366583542e-05, + "loss": 0.2895, + "step": 28330 + }, + { + "epoch": 7.067331670822942, + "grad_norm": 11.093313217163086, + "learning_rate": 1.2936159600997507e-05, + "loss": 0.3812, + "step": 28340 + }, + { + "epoch": 7.069825436408977, + "grad_norm": 8.297402381896973, + "learning_rate": 1.2933665835411473e-05, + "loss": 0.4159, + "step": 28350 + }, + { + "epoch": 7.072319201995012, + "grad_norm": 8.410477638244629, + "learning_rate": 1.2931172069825438e-05, + "loss": 0.3703, + "step": 28360 + }, + { + "epoch": 7.074812967581048, + "grad_norm": 8.238855361938477, + "learning_rate": 1.2928678304239402e-05, + "loss": 0.4046, + "step": 28370 + }, + { + "epoch": 7.077306733167083, + "grad_norm": 9.850193977355957, + "learning_rate": 1.2926184538653369e-05, + "loss": 0.4449, + "step": 28380 + }, + { + "epoch": 7.079800498753118, + "grad_norm": 4.337007999420166, + "learning_rate": 1.2923690773067332e-05, + "loss": 0.3567, + "step": 28390 + }, + { + "epoch": 7.082294264339152, + "grad_norm": 7.723695278167725, + "learning_rate": 1.29211970074813e-05, + "loss": 0.3221, + "step": 28400 + }, + { + "epoch": 7.084788029925187, + "grad_norm": 3.753911256790161, + "learning_rate": 1.2918703241895263e-05, + "loss": 0.3524, + "step": 28410 + }, + { + "epoch": 7.087281795511222, + "grad_norm": 6.445272922515869, + "learning_rate": 1.2916209476309226e-05, + "loss": 0.3343, + "step": 28420 + }, + { + "epoch": 7.089775561097257, + "grad_norm": 7.809503078460693, + "learning_rate": 1.2913715710723193e-05, + "loss": 0.3879, + "step": 28430 + }, + { + "epoch": 7.092269326683292, + "grad_norm": 8.968701362609863, + "learning_rate": 1.2911221945137157e-05, + "loss": 0.4012, + "step": 28440 + }, + { + "epoch": 7.094763092269327, + "grad_norm": 8.712312698364258, + "learning_rate": 1.2908728179551124e-05, + "loss": 0.514, + "step": 28450 + }, + { + "epoch": 7.097256857855362, + "grad_norm": 7.140172481536865, + "learning_rate": 1.290623441396509e-05, + "loss": 0.3044, + "step": 28460 + }, + { + "epoch": 7.0997506234413965, + "grad_norm": 11.479339599609375, + "learning_rate": 1.2903740648379053e-05, + "loss": 0.3965, + "step": 28470 + }, + { + "epoch": 7.102244389027431, + "grad_norm": 7.283442974090576, + "learning_rate": 1.290124688279302e-05, + "loss": 0.3227, + "step": 28480 + }, + { + "epoch": 7.104738154613466, + "grad_norm": 8.901700019836426, + "learning_rate": 1.2898753117206984e-05, + "loss": 0.3847, + "step": 28490 + }, + { + "epoch": 7.107231920199501, + "grad_norm": 5.670577049255371, + "learning_rate": 1.2896259351620947e-05, + "loss": 0.4015, + "step": 28500 + }, + { + "epoch": 7.109725685785536, + "grad_norm": 5.779824733734131, + "learning_rate": 1.2893765586034914e-05, + "loss": 0.2875, + "step": 28510 + }, + { + "epoch": 7.112219451371571, + "grad_norm": 7.593159198760986, + "learning_rate": 1.2891271820448878e-05, + "loss": 0.3917, + "step": 28520 + }, + { + "epoch": 7.114713216957606, + "grad_norm": 5.073284149169922, + "learning_rate": 1.2888778054862845e-05, + "loss": 0.2831, + "step": 28530 + }, + { + "epoch": 7.117206982543641, + "grad_norm": 7.407405376434326, + "learning_rate": 1.2886284289276809e-05, + "loss": 0.3676, + "step": 28540 + }, + { + "epoch": 7.1197007481296755, + "grad_norm": 6.4434638023376465, + "learning_rate": 1.2883790523690774e-05, + "loss": 0.363, + "step": 28550 + }, + { + "epoch": 7.12219451371571, + "grad_norm": 4.871036052703857, + "learning_rate": 1.2881296758104739e-05, + "loss": 0.3373, + "step": 28560 + }, + { + "epoch": 7.124688279301745, + "grad_norm": 5.883360862731934, + "learning_rate": 1.2878802992518704e-05, + "loss": 0.3403, + "step": 28570 + }, + { + "epoch": 7.127182044887781, + "grad_norm": 7.858293533325195, + "learning_rate": 1.2876309226932668e-05, + "loss": 0.3905, + "step": 28580 + }, + { + "epoch": 7.129675810473816, + "grad_norm": 6.678670883178711, + "learning_rate": 1.2873815461346635e-05, + "loss": 0.3882, + "step": 28590 + }, + { + "epoch": 7.132169576059851, + "grad_norm": 8.354759216308594, + "learning_rate": 1.2871321695760599e-05, + "loss": 0.389, + "step": 28600 + }, + { + "epoch": 7.134663341645886, + "grad_norm": 8.125460624694824, + "learning_rate": 1.2868827930174566e-05, + "loss": 0.3412, + "step": 28610 + }, + { + "epoch": 7.13715710723192, + "grad_norm": 6.524233818054199, + "learning_rate": 1.286633416458853e-05, + "loss": 0.4105, + "step": 28620 + }, + { + "epoch": 7.139650872817955, + "grad_norm": 9.085260391235352, + "learning_rate": 1.2863840399002495e-05, + "loss": 0.3852, + "step": 28630 + }, + { + "epoch": 7.14214463840399, + "grad_norm": 6.906386852264404, + "learning_rate": 1.286134663341646e-05, + "loss": 0.3602, + "step": 28640 + }, + { + "epoch": 7.144638403990025, + "grad_norm": 11.312614440917969, + "learning_rate": 1.2858852867830425e-05, + "loss": 0.4129, + "step": 28650 + }, + { + "epoch": 7.14713216957606, + "grad_norm": 5.9242353439331055, + "learning_rate": 1.2856359102244389e-05, + "loss": 0.3578, + "step": 28660 + }, + { + "epoch": 7.149625935162095, + "grad_norm": 8.224637031555176, + "learning_rate": 1.2853865336658356e-05, + "loss": 0.3501, + "step": 28670 + }, + { + "epoch": 7.15211970074813, + "grad_norm": 5.648616313934326, + "learning_rate": 1.285137157107232e-05, + "loss": 0.385, + "step": 28680 + }, + { + "epoch": 7.1546134663341645, + "grad_norm": 5.693350791931152, + "learning_rate": 1.2848877805486286e-05, + "loss": 0.4725, + "step": 28690 + }, + { + "epoch": 7.157107231920199, + "grad_norm": 5.901285171508789, + "learning_rate": 1.284638403990025e-05, + "loss": 0.3856, + "step": 28700 + }, + { + "epoch": 7.159600997506234, + "grad_norm": 8.282814979553223, + "learning_rate": 1.2843890274314215e-05, + "loss": 0.3343, + "step": 28710 + }, + { + "epoch": 7.162094763092269, + "grad_norm": 6.745294094085693, + "learning_rate": 1.284139650872818e-05, + "loss": 0.3845, + "step": 28720 + }, + { + "epoch": 7.164588528678304, + "grad_norm": 5.898577690124512, + "learning_rate": 1.2838902743142146e-05, + "loss": 0.3966, + "step": 28730 + }, + { + "epoch": 7.167082294264339, + "grad_norm": 5.124606132507324, + "learning_rate": 1.2836408977556111e-05, + "loss": 0.4228, + "step": 28740 + }, + { + "epoch": 7.169576059850374, + "grad_norm": 5.206420421600342, + "learning_rate": 1.2833915211970077e-05, + "loss": 0.3557, + "step": 28750 + }, + { + "epoch": 7.172069825436409, + "grad_norm": 8.561698913574219, + "learning_rate": 1.283142144638404e-05, + "loss": 0.3579, + "step": 28760 + }, + { + "epoch": 7.174563591022444, + "grad_norm": 6.212121486663818, + "learning_rate": 1.2828927680798007e-05, + "loss": 0.3384, + "step": 28770 + }, + { + "epoch": 7.177057356608479, + "grad_norm": 7.147701263427734, + "learning_rate": 1.2826433915211971e-05, + "loss": 0.3883, + "step": 28780 + }, + { + "epoch": 7.179551122194514, + "grad_norm": 6.2454023361206055, + "learning_rate": 1.2823940149625934e-05, + "loss": 0.4135, + "step": 28790 + }, + { + "epoch": 7.182044887780549, + "grad_norm": 9.466367721557617, + "learning_rate": 1.2821446384039901e-05, + "loss": 0.4502, + "step": 28800 + }, + { + "epoch": 7.184538653366584, + "grad_norm": 8.00047779083252, + "learning_rate": 1.2818952618453867e-05, + "loss": 0.4377, + "step": 28810 + }, + { + "epoch": 7.187032418952619, + "grad_norm": 7.6417951583862305, + "learning_rate": 1.2816458852867832e-05, + "loss": 0.363, + "step": 28820 + }, + { + "epoch": 7.1895261845386536, + "grad_norm": 4.634132385253906, + "learning_rate": 1.2813965087281797e-05, + "loss": 0.3622, + "step": 28830 + }, + { + "epoch": 7.192019950124688, + "grad_norm": 7.929607391357422, + "learning_rate": 1.2811471321695761e-05, + "loss": 0.3553, + "step": 28840 + }, + { + "epoch": 7.194513715710723, + "grad_norm": 6.655643463134766, + "learning_rate": 1.2808977556109728e-05, + "loss": 0.3342, + "step": 28850 + }, + { + "epoch": 7.197007481296758, + "grad_norm": 7.349606037139893, + "learning_rate": 1.2806483790523692e-05, + "loss": 0.3685, + "step": 28860 + }, + { + "epoch": 7.199501246882793, + "grad_norm": 8.308873176574707, + "learning_rate": 1.2803990024937655e-05, + "loss": 0.3837, + "step": 28870 + }, + { + "epoch": 7.201995012468828, + "grad_norm": 9.069894790649414, + "learning_rate": 1.2801496259351622e-05, + "loss": 0.3609, + "step": 28880 + }, + { + "epoch": 7.204488778054863, + "grad_norm": 10.108824729919434, + "learning_rate": 1.2799002493765586e-05, + "loss": 0.4051, + "step": 28890 + }, + { + "epoch": 7.206982543640898, + "grad_norm": 9.224723815917969, + "learning_rate": 1.2796508728179553e-05, + "loss": 0.4147, + "step": 28900 + }, + { + "epoch": 7.2094763092269325, + "grad_norm": 5.563706398010254, + "learning_rate": 1.2794014962593517e-05, + "loss": 0.3391, + "step": 28910 + }, + { + "epoch": 7.211970074812967, + "grad_norm": 6.96822452545166, + "learning_rate": 1.2791521197007482e-05, + "loss": 0.3146, + "step": 28920 + }, + { + "epoch": 7.214463840399002, + "grad_norm": 8.976058006286621, + "learning_rate": 1.2789027431421449e-05, + "loss": 0.3729, + "step": 28930 + }, + { + "epoch": 7.216957605985037, + "grad_norm": 6.567900657653809, + "learning_rate": 1.2786533665835412e-05, + "loss": 0.3472, + "step": 28940 + }, + { + "epoch": 7.219451371571072, + "grad_norm": 6.142775535583496, + "learning_rate": 1.278403990024938e-05, + "loss": 0.3296, + "step": 28950 + }, + { + "epoch": 7.221945137157107, + "grad_norm": 6.999728202819824, + "learning_rate": 1.2781546134663343e-05, + "loss": 0.3493, + "step": 28960 + }, + { + "epoch": 7.224438902743142, + "grad_norm": 6.844429969787598, + "learning_rate": 1.2779052369077307e-05, + "loss": 0.3389, + "step": 28970 + }, + { + "epoch": 7.2269326683291775, + "grad_norm": 8.233739852905273, + "learning_rate": 1.2776558603491274e-05, + "loss": 0.3728, + "step": 28980 + }, + { + "epoch": 7.229426433915212, + "grad_norm": 6.4842848777771, + "learning_rate": 1.2774064837905237e-05, + "loss": 0.4287, + "step": 28990 + }, + { + "epoch": 7.231920199501247, + "grad_norm": 7.557778835296631, + "learning_rate": 1.2771571072319203e-05, + "loss": 0.3734, + "step": 29000 + }, + { + "epoch": 7.234413965087282, + "grad_norm": 6.296581268310547, + "learning_rate": 1.2769077306733168e-05, + "loss": 0.3677, + "step": 29010 + }, + { + "epoch": 7.236907730673317, + "grad_norm": 7.28203821182251, + "learning_rate": 1.2766583541147133e-05, + "loss": 0.3798, + "step": 29020 + }, + { + "epoch": 7.239401496259352, + "grad_norm": 5.902282238006592, + "learning_rate": 1.2764089775561099e-05, + "loss": 0.3399, + "step": 29030 + }, + { + "epoch": 7.241895261845387, + "grad_norm": 6.5836310386657715, + "learning_rate": 1.2761596009975064e-05, + "loss": 0.406, + "step": 29040 + }, + { + "epoch": 7.2443890274314215, + "grad_norm": 8.319708824157715, + "learning_rate": 1.2759102244389027e-05, + "loss": 0.3888, + "step": 29050 + }, + { + "epoch": 7.246882793017456, + "grad_norm": 8.096579551696777, + "learning_rate": 1.2756608478802994e-05, + "loss": 0.3393, + "step": 29060 + }, + { + "epoch": 7.249376558603491, + "grad_norm": 6.496395587921143, + "learning_rate": 1.2754114713216958e-05, + "loss": 0.3642, + "step": 29070 + }, + { + "epoch": 7.251870324189526, + "grad_norm": 7.759038925170898, + "learning_rate": 1.2751620947630923e-05, + "loss": 0.3334, + "step": 29080 + }, + { + "epoch": 7.254364089775561, + "grad_norm": 10.256757736206055, + "learning_rate": 1.2749127182044889e-05, + "loss": 0.3924, + "step": 29090 + }, + { + "epoch": 7.256857855361596, + "grad_norm": 10.726069450378418, + "learning_rate": 1.2746633416458854e-05, + "loss": 0.4015, + "step": 29100 + }, + { + "epoch": 7.259351620947631, + "grad_norm": 8.847235679626465, + "learning_rate": 1.274413965087282e-05, + "loss": 0.4174, + "step": 29110 + }, + { + "epoch": 7.261845386533666, + "grad_norm": 7.851325035095215, + "learning_rate": 1.2741645885286785e-05, + "loss": 0.3122, + "step": 29120 + }, + { + "epoch": 7.2643391521197005, + "grad_norm": 6.145222187042236, + "learning_rate": 1.2739152119700748e-05, + "loss": 0.3972, + "step": 29130 + }, + { + "epoch": 7.266832917705735, + "grad_norm": 7.113053798675537, + "learning_rate": 1.2736658354114715e-05, + "loss": 0.4018, + "step": 29140 + }, + { + "epoch": 7.26932668329177, + "grad_norm": 10.120803833007812, + "learning_rate": 1.2734164588528679e-05, + "loss": 0.3977, + "step": 29150 + }, + { + "epoch": 7.271820448877805, + "grad_norm": 5.162845134735107, + "learning_rate": 1.2731670822942644e-05, + "loss": 0.3468, + "step": 29160 + }, + { + "epoch": 7.274314214463841, + "grad_norm": 9.55453109741211, + "learning_rate": 1.272917705735661e-05, + "loss": 0.3615, + "step": 29170 + }, + { + "epoch": 7.276807980049876, + "grad_norm": 7.122071266174316, + "learning_rate": 1.2726683291770575e-05, + "loss": 0.3974, + "step": 29180 + }, + { + "epoch": 7.279301745635911, + "grad_norm": 6.449893474578857, + "learning_rate": 1.272418952618454e-05, + "loss": 0.3763, + "step": 29190 + }, + { + "epoch": 7.2817955112219455, + "grad_norm": 12.209590911865234, + "learning_rate": 1.2721695760598505e-05, + "loss": 0.32, + "step": 29200 + }, + { + "epoch": 7.28428927680798, + "grad_norm": 7.169990062713623, + "learning_rate": 1.2719201995012469e-05, + "loss": 0.3413, + "step": 29210 + }, + { + "epoch": 7.286783042394015, + "grad_norm": 8.662860870361328, + "learning_rate": 1.2716708229426436e-05, + "loss": 0.3932, + "step": 29220 + }, + { + "epoch": 7.28927680798005, + "grad_norm": 6.312235355377197, + "learning_rate": 1.27142144638404e-05, + "loss": 0.3372, + "step": 29230 + }, + { + "epoch": 7.291770573566085, + "grad_norm": 5.396263599395752, + "learning_rate": 1.2711720698254367e-05, + "loss": 0.2894, + "step": 29240 + }, + { + "epoch": 7.29426433915212, + "grad_norm": 7.881542205810547, + "learning_rate": 1.270922693266833e-05, + "loss": 0.4439, + "step": 29250 + }, + { + "epoch": 7.296758104738155, + "grad_norm": 6.539719104766846, + "learning_rate": 1.2706733167082294e-05, + "loss": 0.3688, + "step": 29260 + }, + { + "epoch": 7.2992518703241895, + "grad_norm": 9.538723945617676, + "learning_rate": 1.2704239401496261e-05, + "loss": 0.3785, + "step": 29270 + }, + { + "epoch": 7.301745635910224, + "grad_norm": 8.049639701843262, + "learning_rate": 1.2701745635910226e-05, + "loss": 0.3698, + "step": 29280 + }, + { + "epoch": 7.304239401496259, + "grad_norm": 11.048667907714844, + "learning_rate": 1.269925187032419e-05, + "loss": 0.477, + "step": 29290 + }, + { + "epoch": 7.306733167082294, + "grad_norm": 4.723964214324951, + "learning_rate": 1.2696758104738157e-05, + "loss": 0.278, + "step": 29300 + }, + { + "epoch": 7.309226932668329, + "grad_norm": 8.363397598266602, + "learning_rate": 1.269426433915212e-05, + "loss": 0.3963, + "step": 29310 + }, + { + "epoch": 7.311720698254364, + "grad_norm": 5.296679973602295, + "learning_rate": 1.2691770573566087e-05, + "loss": 0.3419, + "step": 29320 + }, + { + "epoch": 7.314214463840399, + "grad_norm": 4.19307804107666, + "learning_rate": 1.2689276807980051e-05, + "loss": 0.3923, + "step": 29330 + }, + { + "epoch": 7.316708229426434, + "grad_norm": 4.72712516784668, + "learning_rate": 1.2686783042394015e-05, + "loss": 0.3854, + "step": 29340 + }, + { + "epoch": 7.3192019950124685, + "grad_norm": 7.943241119384766, + "learning_rate": 1.2684289276807982e-05, + "loss": 0.4142, + "step": 29350 + }, + { + "epoch": 7.321695760598503, + "grad_norm": 7.628240585327148, + "learning_rate": 1.2681795511221945e-05, + "loss": 0.3044, + "step": 29360 + }, + { + "epoch": 7.324189526184538, + "grad_norm": 6.917629718780518, + "learning_rate": 1.267930174563591e-05, + "loss": 0.3286, + "step": 29370 + }, + { + "epoch": 7.326683291770574, + "grad_norm": 5.416843891143799, + "learning_rate": 1.2676807980049876e-05, + "loss": 0.442, + "step": 29380 + }, + { + "epoch": 7.329177057356609, + "grad_norm": 4.867097854614258, + "learning_rate": 1.2674314214463841e-05, + "loss": 0.3525, + "step": 29390 + }, + { + "epoch": 7.331670822942644, + "grad_norm": 8.583399772644043, + "learning_rate": 1.2671820448877808e-05, + "loss": 0.3817, + "step": 29400 + }, + { + "epoch": 7.334164588528679, + "grad_norm": 7.867173194885254, + "learning_rate": 1.2669326683291772e-05, + "loss": 0.3231, + "step": 29410 + }, + { + "epoch": 7.3366583541147135, + "grad_norm": 6.9553022384643555, + "learning_rate": 1.2666832917705735e-05, + "loss": 0.321, + "step": 29420 + }, + { + "epoch": 7.339152119700748, + "grad_norm": 6.242718696594238, + "learning_rate": 1.2664339152119702e-05, + "loss": 0.365, + "step": 29430 + }, + { + "epoch": 7.341645885286783, + "grad_norm": 8.64468765258789, + "learning_rate": 1.2661845386533666e-05, + "loss": 0.3114, + "step": 29440 + }, + { + "epoch": 7.344139650872818, + "grad_norm": 6.854546546936035, + "learning_rate": 1.2659351620947633e-05, + "loss": 0.3563, + "step": 29450 + }, + { + "epoch": 7.346633416458853, + "grad_norm": 10.145390510559082, + "learning_rate": 1.2656857855361597e-05, + "loss": 0.3541, + "step": 29460 + }, + { + "epoch": 7.349127182044888, + "grad_norm": 5.386590480804443, + "learning_rate": 1.2654364089775562e-05, + "loss": 0.3673, + "step": 29470 + }, + { + "epoch": 7.351620947630923, + "grad_norm": 5.273983955383301, + "learning_rate": 1.2651870324189527e-05, + "loss": 0.2825, + "step": 29480 + }, + { + "epoch": 7.3541147132169575, + "grad_norm": 4.6638689041137695, + "learning_rate": 1.2649376558603493e-05, + "loss": 0.4178, + "step": 29490 + }, + { + "epoch": 7.356608478802992, + "grad_norm": 5.493322372436523, + "learning_rate": 1.2646882793017456e-05, + "loss": 0.3265, + "step": 29500 + }, + { + "epoch": 7.359102244389027, + "grad_norm": 6.716048240661621, + "learning_rate": 1.2644389027431423e-05, + "loss": 0.3692, + "step": 29510 + }, + { + "epoch": 7.361596009975062, + "grad_norm": 6.258030891418457, + "learning_rate": 1.2641895261845387e-05, + "loss": 0.426, + "step": 29520 + }, + { + "epoch": 7.364089775561097, + "grad_norm": 5.933165550231934, + "learning_rate": 1.2639401496259354e-05, + "loss": 0.3618, + "step": 29530 + }, + { + "epoch": 7.366583541147132, + "grad_norm": 5.566700458526611, + "learning_rate": 1.2636907730673317e-05, + "loss": 0.4001, + "step": 29540 + }, + { + "epoch": 7.369077306733167, + "grad_norm": 7.390594959259033, + "learning_rate": 1.2634413965087283e-05, + "loss": 0.3826, + "step": 29550 + }, + { + "epoch": 7.371571072319202, + "grad_norm": 4.834929466247559, + "learning_rate": 1.2631920199501248e-05, + "loss": 0.3512, + "step": 29560 + }, + { + "epoch": 7.374064837905237, + "grad_norm": 8.851217269897461, + "learning_rate": 1.2629426433915213e-05, + "loss": 0.387, + "step": 29570 + }, + { + "epoch": 7.376558603491272, + "grad_norm": 14.366303443908691, + "learning_rate": 1.2626932668329177e-05, + "loss": 0.3875, + "step": 29580 + }, + { + "epoch": 7.379052369077307, + "grad_norm": 7.262747287750244, + "learning_rate": 1.2624438902743144e-05, + "loss": 0.3718, + "step": 29590 + }, + { + "epoch": 7.381546134663342, + "grad_norm": 5.579659938812256, + "learning_rate": 1.2621945137157108e-05, + "loss": 0.3919, + "step": 29600 + }, + { + "epoch": 7.384039900249377, + "grad_norm": 5.182689189910889, + "learning_rate": 1.2619451371571075e-05, + "loss": 0.3414, + "step": 29610 + }, + { + "epoch": 7.386533665835412, + "grad_norm": 4.612008094787598, + "learning_rate": 1.2616957605985038e-05, + "loss": 0.3101, + "step": 29620 + }, + { + "epoch": 7.389027431421447, + "grad_norm": 11.192936897277832, + "learning_rate": 1.2614463840399004e-05, + "loss": 0.3038, + "step": 29630 + }, + { + "epoch": 7.3915211970074814, + "grad_norm": 8.110973358154297, + "learning_rate": 1.2611970074812969e-05, + "loss": 0.451, + "step": 29640 + }, + { + "epoch": 7.394014962593516, + "grad_norm": 9.395048141479492, + "learning_rate": 1.2609476309226934e-05, + "loss": 0.3936, + "step": 29650 + }, + { + "epoch": 7.396508728179551, + "grad_norm": 6.499456882476807, + "learning_rate": 1.2606982543640898e-05, + "loss": 0.3217, + "step": 29660 + }, + { + "epoch": 7.399002493765586, + "grad_norm": 6.436877250671387, + "learning_rate": 1.2604488778054865e-05, + "loss": 0.4203, + "step": 29670 + }, + { + "epoch": 7.401496259351621, + "grad_norm": 4.4001688957214355, + "learning_rate": 1.2601995012468828e-05, + "loss": 0.4034, + "step": 29680 + }, + { + "epoch": 7.403990024937656, + "grad_norm": 7.0502800941467285, + "learning_rate": 1.2599501246882795e-05, + "loss": 0.4124, + "step": 29690 + }, + { + "epoch": 7.406483790523691, + "grad_norm": 10.673737525939941, + "learning_rate": 1.2597007481296759e-05, + "loss": 0.4948, + "step": 29700 + }, + { + "epoch": 7.4089775561097255, + "grad_norm": 6.177337646484375, + "learning_rate": 1.2594513715710723e-05, + "loss": 0.4471, + "step": 29710 + }, + { + "epoch": 7.41147132169576, + "grad_norm": 5.368765830993652, + "learning_rate": 1.259201995012469e-05, + "loss": 0.3887, + "step": 29720 + }, + { + "epoch": 7.413965087281795, + "grad_norm": 6.879129409790039, + "learning_rate": 1.2589526184538653e-05, + "loss": 0.3773, + "step": 29730 + }, + { + "epoch": 7.41645885286783, + "grad_norm": 8.17301082611084, + "learning_rate": 1.258703241895262e-05, + "loss": 0.3722, + "step": 29740 + }, + { + "epoch": 7.418952618453865, + "grad_norm": 8.106537818908691, + "learning_rate": 1.2584538653366586e-05, + "loss": 0.41, + "step": 29750 + }, + { + "epoch": 7.4214463840399, + "grad_norm": 6.589071273803711, + "learning_rate": 1.258204488778055e-05, + "loss": 0.4002, + "step": 29760 + }, + { + "epoch": 7.423940149625935, + "grad_norm": 8.137639999389648, + "learning_rate": 1.2579551122194516e-05, + "loss": 0.3859, + "step": 29770 + }, + { + "epoch": 7.42643391521197, + "grad_norm": 6.957583427429199, + "learning_rate": 1.257705735660848e-05, + "loss": 0.4258, + "step": 29780 + }, + { + "epoch": 7.428927680798005, + "grad_norm": 8.225882530212402, + "learning_rate": 1.2574563591022443e-05, + "loss": 0.3842, + "step": 29790 + }, + { + "epoch": 7.43142144638404, + "grad_norm": 6.6841888427734375, + "learning_rate": 1.257206982543641e-05, + "loss": 0.3717, + "step": 29800 + }, + { + "epoch": 7.433915211970075, + "grad_norm": 6.028921127319336, + "learning_rate": 1.2569576059850374e-05, + "loss": 0.3616, + "step": 29810 + }, + { + "epoch": 7.43640897755611, + "grad_norm": 5.823741436004639, + "learning_rate": 1.2567082294264341e-05, + "loss": 0.3755, + "step": 29820 + }, + { + "epoch": 7.438902743142145, + "grad_norm": 11.203161239624023, + "learning_rate": 1.2564588528678305e-05, + "loss": 0.3806, + "step": 29830 + }, + { + "epoch": 7.44139650872818, + "grad_norm": 5.957489967346191, + "learning_rate": 1.256209476309227e-05, + "loss": 0.3821, + "step": 29840 + }, + { + "epoch": 7.443890274314215, + "grad_norm": 4.782578468322754, + "learning_rate": 1.2559600997506235e-05, + "loss": 0.3689, + "step": 29850 + }, + { + "epoch": 7.446384039900249, + "grad_norm": 4.5286784172058105, + "learning_rate": 1.25571072319202e-05, + "loss": 0.3414, + "step": 29860 + }, + { + "epoch": 7.448877805486284, + "grad_norm": 11.462789535522461, + "learning_rate": 1.2554613466334164e-05, + "loss": 0.373, + "step": 29870 + }, + { + "epoch": 7.451371571072319, + "grad_norm": 8.337203979492188, + "learning_rate": 1.2552119700748131e-05, + "loss": 0.3389, + "step": 29880 + }, + { + "epoch": 7.453865336658354, + "grad_norm": 13.295976638793945, + "learning_rate": 1.2549625935162095e-05, + "loss": 0.3165, + "step": 29890 + }, + { + "epoch": 7.456359102244389, + "grad_norm": 6.52192497253418, + "learning_rate": 1.2547132169576062e-05, + "loss": 0.3477, + "step": 29900 + }, + { + "epoch": 7.458852867830424, + "grad_norm": 7.2272491455078125, + "learning_rate": 1.2544638403990025e-05, + "loss": 0.3611, + "step": 29910 + }, + { + "epoch": 7.461346633416459, + "grad_norm": 3.4752140045166016, + "learning_rate": 1.254214463840399e-05, + "loss": 0.3382, + "step": 29920 + }, + { + "epoch": 7.4638403990024935, + "grad_norm": 9.237256050109863, + "learning_rate": 1.2539650872817956e-05, + "loss": 0.3683, + "step": 29930 + }, + { + "epoch": 7.466334164588528, + "grad_norm": 9.527120590209961, + "learning_rate": 1.2537157107231921e-05, + "loss": 0.3784, + "step": 29940 + }, + { + "epoch": 7.468827930174563, + "grad_norm": 5.257556915283203, + "learning_rate": 1.2534663341645887e-05, + "loss": 0.3919, + "step": 29950 + }, + { + "epoch": 7.471321695760598, + "grad_norm": 5.737821102142334, + "learning_rate": 1.2532169576059852e-05, + "loss": 0.3766, + "step": 29960 + }, + { + "epoch": 7.473815461346634, + "grad_norm": 8.01321792602539, + "learning_rate": 1.2529675810473816e-05, + "loss": 0.3299, + "step": 29970 + }, + { + "epoch": 7.476309226932669, + "grad_norm": 5.301641464233398, + "learning_rate": 1.2527182044887783e-05, + "loss": 0.3851, + "step": 29980 + }, + { + "epoch": 7.478802992518704, + "grad_norm": 7.687638759613037, + "learning_rate": 1.2524688279301746e-05, + "loss": 0.3364, + "step": 29990 + }, + { + "epoch": 7.4812967581047385, + "grad_norm": 6.31342077255249, + "learning_rate": 1.2522194513715712e-05, + "loss": 0.3607, + "step": 30000 + }, + { + "epoch": 7.483790523690773, + "grad_norm": 5.950140953063965, + "learning_rate": 1.2519700748129677e-05, + "loss": 0.3616, + "step": 30010 + }, + { + "epoch": 7.486284289276808, + "grad_norm": 6.606958389282227, + "learning_rate": 1.2517206982543642e-05, + "loss": 0.3231, + "step": 30020 + }, + { + "epoch": 7.488778054862843, + "grad_norm": 8.758807182312012, + "learning_rate": 1.2514713216957607e-05, + "loss": 0.3508, + "step": 30030 + }, + { + "epoch": 7.491271820448878, + "grad_norm": 3.6969501972198486, + "learning_rate": 1.2512219451371573e-05, + "loss": 0.3668, + "step": 30040 + }, + { + "epoch": 7.493765586034913, + "grad_norm": 7.893050670623779, + "learning_rate": 1.2509725685785536e-05, + "loss": 0.3001, + "step": 30050 + }, + { + "epoch": 7.496259351620948, + "grad_norm": 8.5418119430542, + "learning_rate": 1.2507231920199503e-05, + "loss": 0.3428, + "step": 30060 + }, + { + "epoch": 7.498753117206983, + "grad_norm": 7.37095308303833, + "learning_rate": 1.2504738154613467e-05, + "loss": 0.36, + "step": 30070 + }, + { + "epoch": 7.501246882793017, + "grad_norm": 5.3542704582214355, + "learning_rate": 1.250224438902743e-05, + "loss": 0.354, + "step": 30080 + }, + { + "epoch": 7.503740648379052, + "grad_norm": 7.024536609649658, + "learning_rate": 1.2499750623441398e-05, + "loss": 0.3936, + "step": 30090 + }, + { + "epoch": 7.506234413965087, + "grad_norm": 8.495894432067871, + "learning_rate": 1.2497256857855363e-05, + "loss": 0.3381, + "step": 30100 + }, + { + "epoch": 7.508728179551122, + "grad_norm": 6.884313106536865, + "learning_rate": 1.2494763092269328e-05, + "loss": 0.381, + "step": 30110 + }, + { + "epoch": 7.511221945137157, + "grad_norm": 7.271829128265381, + "learning_rate": 1.2492269326683294e-05, + "loss": 0.4289, + "step": 30120 + }, + { + "epoch": 7.513715710723192, + "grad_norm": 14.023894309997559, + "learning_rate": 1.2489775561097257e-05, + "loss": 0.4139, + "step": 30130 + }, + { + "epoch": 7.516209476309227, + "grad_norm": 7.130033493041992, + "learning_rate": 1.2487281795511224e-05, + "loss": 0.3319, + "step": 30140 + }, + { + "epoch": 7.5187032418952615, + "grad_norm": 6.766006946563721, + "learning_rate": 1.2484788029925188e-05, + "loss": 0.3546, + "step": 30150 + }, + { + "epoch": 7.521197007481296, + "grad_norm": 10.241814613342285, + "learning_rate": 1.2482294264339151e-05, + "loss": 0.3268, + "step": 30160 + }, + { + "epoch": 7.523690773067331, + "grad_norm": 8.794346809387207, + "learning_rate": 1.2479800498753118e-05, + "loss": 0.3394, + "step": 30170 + }, + { + "epoch": 7.526184538653366, + "grad_norm": 8.435714721679688, + "learning_rate": 1.2477306733167082e-05, + "loss": 0.3174, + "step": 30180 + }, + { + "epoch": 7.528678304239402, + "grad_norm": 9.198299407958984, + "learning_rate": 1.2474812967581049e-05, + "loss": 0.36, + "step": 30190 + }, + { + "epoch": 7.531172069825437, + "grad_norm": 8.56997013092041, + "learning_rate": 1.2472319201995013e-05, + "loss": 0.3682, + "step": 30200 + }, + { + "epoch": 7.533665835411472, + "grad_norm": 9.016803741455078, + "learning_rate": 1.2469825436408978e-05, + "loss": 0.3727, + "step": 30210 + }, + { + "epoch": 7.5361596009975065, + "grad_norm": 7.877820014953613, + "learning_rate": 1.2467331670822945e-05, + "loss": 0.4129, + "step": 30220 + }, + { + "epoch": 7.538653366583541, + "grad_norm": 5.137962818145752, + "learning_rate": 1.2464837905236909e-05, + "loss": 0.3079, + "step": 30230 + }, + { + "epoch": 7.541147132169576, + "grad_norm": 6.477614879608154, + "learning_rate": 1.2462344139650876e-05, + "loss": 0.3845, + "step": 30240 + }, + { + "epoch": 7.543640897755611, + "grad_norm": 7.037804126739502, + "learning_rate": 1.245985037406484e-05, + "loss": 0.4319, + "step": 30250 + }, + { + "epoch": 7.546134663341646, + "grad_norm": 6.353123664855957, + "learning_rate": 1.2457356608478803e-05, + "loss": 0.3744, + "step": 30260 + }, + { + "epoch": 7.548628428927681, + "grad_norm": 18.240354537963867, + "learning_rate": 1.245486284289277e-05, + "loss": 0.3774, + "step": 30270 + }, + { + "epoch": 7.551122194513716, + "grad_norm": 9.531089782714844, + "learning_rate": 1.2452369077306733e-05, + "loss": 0.3673, + "step": 30280 + }, + { + "epoch": 7.553615960099751, + "grad_norm": 6.397697925567627, + "learning_rate": 1.2449875311720699e-05, + "loss": 0.4171, + "step": 30290 + }, + { + "epoch": 7.556109725685785, + "grad_norm": 5.065841197967529, + "learning_rate": 1.2447381546134664e-05, + "loss": 0.3296, + "step": 30300 + }, + { + "epoch": 7.55860349127182, + "grad_norm": 6.741978645324707, + "learning_rate": 1.244488778054863e-05, + "loss": 0.3183, + "step": 30310 + }, + { + "epoch": 7.561097256857855, + "grad_norm": 6.0341620445251465, + "learning_rate": 1.2442394014962595e-05, + "loss": 0.399, + "step": 30320 + }, + { + "epoch": 7.56359102244389, + "grad_norm": 5.0857977867126465, + "learning_rate": 1.243990024937656e-05, + "loss": 0.3163, + "step": 30330 + }, + { + "epoch": 7.566084788029925, + "grad_norm": 7.0515456199646, + "learning_rate": 1.2437406483790524e-05, + "loss": 0.3199, + "step": 30340 + }, + { + "epoch": 7.56857855361596, + "grad_norm": 8.328603744506836, + "learning_rate": 1.243491271820449e-05, + "loss": 0.3836, + "step": 30350 + }, + { + "epoch": 7.571072319201995, + "grad_norm": 7.779388427734375, + "learning_rate": 1.2432418952618454e-05, + "loss": 0.3254, + "step": 30360 + }, + { + "epoch": 7.57356608478803, + "grad_norm": 10.46296215057373, + "learning_rate": 1.242992518703242e-05, + "loss": 0.4414, + "step": 30370 + }, + { + "epoch": 7.576059850374065, + "grad_norm": 6.196642875671387, + "learning_rate": 1.2427431421446385e-05, + "loss": 0.3855, + "step": 30380 + }, + { + "epoch": 7.5785536159601, + "grad_norm": 9.153206825256348, + "learning_rate": 1.242493765586035e-05, + "loss": 0.2903, + "step": 30390 + }, + { + "epoch": 7.581047381546135, + "grad_norm": 6.691465377807617, + "learning_rate": 1.2422443890274315e-05, + "loss": 0.3715, + "step": 30400 + }, + { + "epoch": 7.58354114713217, + "grad_norm": 10.62185287475586, + "learning_rate": 1.241995012468828e-05, + "loss": 0.4213, + "step": 30410 + }, + { + "epoch": 7.586034912718205, + "grad_norm": 5.947132587432861, + "learning_rate": 1.2417456359102244e-05, + "loss": 0.3693, + "step": 30420 + }, + { + "epoch": 7.58852867830424, + "grad_norm": 7.0988993644714355, + "learning_rate": 1.2414962593516211e-05, + "loss": 0.3849, + "step": 30430 + }, + { + "epoch": 7.5910224438902745, + "grad_norm": 6.948726177215576, + "learning_rate": 1.2412468827930175e-05, + "loss": 0.3642, + "step": 30440 + }, + { + "epoch": 7.593516209476309, + "grad_norm": 5.161392688751221, + "learning_rate": 1.2409975062344142e-05, + "loss": 0.4056, + "step": 30450 + }, + { + "epoch": 7.596009975062344, + "grad_norm": 8.350500106811523, + "learning_rate": 1.2407481296758106e-05, + "loss": 0.4004, + "step": 30460 + }, + { + "epoch": 7.598503740648379, + "grad_norm": 7.8617753982543945, + "learning_rate": 1.2404987531172071e-05, + "loss": 0.3857, + "step": 30470 + }, + { + "epoch": 7.600997506234414, + "grad_norm": 6.594773292541504, + "learning_rate": 1.2402493765586036e-05, + "loss": 0.3984, + "step": 30480 + }, + { + "epoch": 7.603491271820449, + "grad_norm": 5.078848361968994, + "learning_rate": 1.2400000000000002e-05, + "loss": 0.3225, + "step": 30490 + }, + { + "epoch": 7.605985037406484, + "grad_norm": 6.860848903656006, + "learning_rate": 1.2397506234413965e-05, + "loss": 0.3953, + "step": 30500 + }, + { + "epoch": 7.6084788029925186, + "grad_norm": 6.241046905517578, + "learning_rate": 1.2395012468827932e-05, + "loss": 0.3999, + "step": 30510 + }, + { + "epoch": 7.610972568578553, + "grad_norm": 4.877734661102295, + "learning_rate": 1.2392518703241896e-05, + "loss": 0.3535, + "step": 30520 + }, + { + "epoch": 7.613466334164588, + "grad_norm": 6.138383388519287, + "learning_rate": 1.2390024937655863e-05, + "loss": 0.4553, + "step": 30530 + }, + { + "epoch": 7.615960099750623, + "grad_norm": 6.23154354095459, + "learning_rate": 1.2387531172069826e-05, + "loss": 0.3939, + "step": 30540 + }, + { + "epoch": 7.618453865336658, + "grad_norm": 5.0998945236206055, + "learning_rate": 1.238503740648379e-05, + "loss": 0.3799, + "step": 30550 + }, + { + "epoch": 7.620947630922693, + "grad_norm": 5.4622392654418945, + "learning_rate": 1.2382543640897757e-05, + "loss": 0.4081, + "step": 30560 + }, + { + "epoch": 7.623441396508728, + "grad_norm": 4.260573863983154, + "learning_rate": 1.2380049875311722e-05, + "loss": 0.3986, + "step": 30570 + }, + { + "epoch": 7.625935162094763, + "grad_norm": 8.23079776763916, + "learning_rate": 1.2377556109725686e-05, + "loss": 0.3604, + "step": 30580 + }, + { + "epoch": 7.628428927680798, + "grad_norm": 8.109442710876465, + "learning_rate": 1.2375062344139653e-05, + "loss": 0.4082, + "step": 30590 + }, + { + "epoch": 7.630922693266833, + "grad_norm": 4.392514705657959, + "learning_rate": 1.2372568578553617e-05, + "loss": 0.3406, + "step": 30600 + }, + { + "epoch": 7.633416458852868, + "grad_norm": 6.927611827850342, + "learning_rate": 1.2370074812967584e-05, + "loss": 0.3685, + "step": 30610 + }, + { + "epoch": 7.635910224438903, + "grad_norm": 8.26883602142334, + "learning_rate": 1.2367581047381547e-05, + "loss": 0.4477, + "step": 30620 + }, + { + "epoch": 7.638403990024938, + "grad_norm": 6.3622517585754395, + "learning_rate": 1.236508728179551e-05, + "loss": 0.337, + "step": 30630 + }, + { + "epoch": 7.640897755610973, + "grad_norm": 9.278800010681152, + "learning_rate": 1.2362593516209478e-05, + "loss": 0.3987, + "step": 30640 + }, + { + "epoch": 7.643391521197008, + "grad_norm": 4.636536598205566, + "learning_rate": 1.2360099750623441e-05, + "loss": 0.3779, + "step": 30650 + }, + { + "epoch": 7.6458852867830425, + "grad_norm": 7.597597122192383, + "learning_rate": 1.2357605985037408e-05, + "loss": 0.3261, + "step": 30660 + }, + { + "epoch": 7.648379052369077, + "grad_norm": 6.5270771980285645, + "learning_rate": 1.2355112219451372e-05, + "loss": 0.3113, + "step": 30670 + }, + { + "epoch": 7.650872817955112, + "grad_norm": 8.069908142089844, + "learning_rate": 1.2352618453865337e-05, + "loss": 0.3637, + "step": 30680 + }, + { + "epoch": 7.653366583541147, + "grad_norm": 7.320400238037109, + "learning_rate": 1.2350124688279304e-05, + "loss": 0.3699, + "step": 30690 + }, + { + "epoch": 7.655860349127182, + "grad_norm": 15.375004768371582, + "learning_rate": 1.2347630922693268e-05, + "loss": 0.367, + "step": 30700 + }, + { + "epoch": 7.658354114713217, + "grad_norm": 7.086206436157227, + "learning_rate": 1.2345137157107232e-05, + "loss": 0.3479, + "step": 30710 + }, + { + "epoch": 7.660847880299252, + "grad_norm": 7.063462257385254, + "learning_rate": 1.2342643391521199e-05, + "loss": 0.3503, + "step": 30720 + }, + { + "epoch": 7.6633416458852865, + "grad_norm": 6.879456043243408, + "learning_rate": 1.2340149625935162e-05, + "loss": 0.3552, + "step": 30730 + }, + { + "epoch": 7.665835411471321, + "grad_norm": 7.064803600311279, + "learning_rate": 1.233765586034913e-05, + "loss": 0.3794, + "step": 30740 + }, + { + "epoch": 7.668329177057356, + "grad_norm": 6.3318023681640625, + "learning_rate": 1.2335162094763093e-05, + "loss": 0.3434, + "step": 30750 + }, + { + "epoch": 7.670822942643391, + "grad_norm": 6.186346054077148, + "learning_rate": 1.2332668329177058e-05, + "loss": 0.3624, + "step": 30760 + }, + { + "epoch": 7.673316708229427, + "grad_norm": 6.247686386108398, + "learning_rate": 1.2330174563591023e-05, + "loss": 0.3922, + "step": 30770 + }, + { + "epoch": 7.675810473815462, + "grad_norm": 5.863278865814209, + "learning_rate": 1.2327680798004989e-05, + "loss": 0.3569, + "step": 30780 + }, + { + "epoch": 7.678304239401497, + "grad_norm": 13.845823287963867, + "learning_rate": 1.2325187032418952e-05, + "loss": 0.3568, + "step": 30790 + }, + { + "epoch": 7.6807980049875315, + "grad_norm": 7.736221790313721, + "learning_rate": 1.232269326683292e-05, + "loss": 0.3471, + "step": 30800 + }, + { + "epoch": 7.683291770573566, + "grad_norm": 6.512248992919922, + "learning_rate": 1.2320199501246883e-05, + "loss": 0.3172, + "step": 30810 + }, + { + "epoch": 7.685785536159601, + "grad_norm": 6.663067817687988, + "learning_rate": 1.231770573566085e-05, + "loss": 0.3857, + "step": 30820 + }, + { + "epoch": 7.688279301745636, + "grad_norm": 8.590274810791016, + "learning_rate": 1.2315211970074814e-05, + "loss": 0.3427, + "step": 30830 + }, + { + "epoch": 7.690773067331671, + "grad_norm": 7.1273722648620605, + "learning_rate": 1.2312718204488779e-05, + "loss": 0.4224, + "step": 30840 + }, + { + "epoch": 7.693266832917706, + "grad_norm": 6.8802924156188965, + "learning_rate": 1.2310224438902744e-05, + "loss": 0.4097, + "step": 30850 + }, + { + "epoch": 7.695760598503741, + "grad_norm": 9.271326065063477, + "learning_rate": 1.230773067331671e-05, + "loss": 0.3757, + "step": 30860 + }, + { + "epoch": 7.698254364089776, + "grad_norm": 6.1901421546936035, + "learning_rate": 1.2305236907730673e-05, + "loss": 0.3571, + "step": 30870 + }, + { + "epoch": 7.7007481296758105, + "grad_norm": 8.706985473632812, + "learning_rate": 1.230274314214464e-05, + "loss": 0.335, + "step": 30880 + }, + { + "epoch": 7.703241895261845, + "grad_norm": 5.975154399871826, + "learning_rate": 1.2300249376558604e-05, + "loss": 0.3703, + "step": 30890 + }, + { + "epoch": 7.70573566084788, + "grad_norm": 8.364744186401367, + "learning_rate": 1.229775561097257e-05, + "loss": 0.3532, + "step": 30900 + }, + { + "epoch": 7.708229426433915, + "grad_norm": 6.633474826812744, + "learning_rate": 1.2295261845386534e-05, + "loss": 0.3447, + "step": 30910 + }, + { + "epoch": 7.71072319201995, + "grad_norm": 7.192882061004639, + "learning_rate": 1.22927680798005e-05, + "loss": 0.365, + "step": 30920 + }, + { + "epoch": 7.713216957605985, + "grad_norm": 4.311435222625732, + "learning_rate": 1.2290274314214465e-05, + "loss": 0.3817, + "step": 30930 + }, + { + "epoch": 7.71571072319202, + "grad_norm": 9.537071228027344, + "learning_rate": 1.228778054862843e-05, + "loss": 0.4124, + "step": 30940 + }, + { + "epoch": 7.7182044887780545, + "grad_norm": 6.788525581359863, + "learning_rate": 1.2285286783042396e-05, + "loss": 0.4284, + "step": 30950 + }, + { + "epoch": 7.720698254364089, + "grad_norm": 4.7663798332214355, + "learning_rate": 1.2282793017456361e-05, + "loss": 0.3825, + "step": 30960 + }, + { + "epoch": 7.723192019950124, + "grad_norm": 6.757378578186035, + "learning_rate": 1.2280299251870325e-05, + "loss": 0.3331, + "step": 30970 + }, + { + "epoch": 7.725685785536159, + "grad_norm": 6.984048366546631, + "learning_rate": 1.2277805486284292e-05, + "loss": 0.354, + "step": 30980 + }, + { + "epoch": 7.728179551122195, + "grad_norm": 6.14350700378418, + "learning_rate": 1.2275311720698255e-05, + "loss": 0.3543, + "step": 30990 + }, + { + "epoch": 7.73067331670823, + "grad_norm": 5.969043254852295, + "learning_rate": 1.2272817955112219e-05, + "loss": 0.3788, + "step": 31000 + }, + { + "epoch": 7.733167082294265, + "grad_norm": 6.313365936279297, + "learning_rate": 1.2270324189526186e-05, + "loss": 0.3608, + "step": 31010 + }, + { + "epoch": 7.7356608478802995, + "grad_norm": 5.8176703453063965, + "learning_rate": 1.226783042394015e-05, + "loss": 0.3599, + "step": 31020 + }, + { + "epoch": 7.738154613466334, + "grad_norm": 5.260715484619141, + "learning_rate": 1.2265336658354116e-05, + "loss": 0.3885, + "step": 31030 + }, + { + "epoch": 7.740648379052369, + "grad_norm": 9.272298812866211, + "learning_rate": 1.2262842892768082e-05, + "loss": 0.3606, + "step": 31040 + }, + { + "epoch": 7.743142144638404, + "grad_norm": 6.716904163360596, + "learning_rate": 1.2260349127182045e-05, + "loss": 0.3364, + "step": 31050 + }, + { + "epoch": 7.745635910224439, + "grad_norm": 7.895336151123047, + "learning_rate": 1.2257855361596012e-05, + "loss": 0.3685, + "step": 31060 + }, + { + "epoch": 7.748129675810474, + "grad_norm": 6.1272969245910645, + "learning_rate": 1.2255361596009976e-05, + "loss": 0.3745, + "step": 31070 + }, + { + "epoch": 7.750623441396509, + "grad_norm": 6.436134338378906, + "learning_rate": 1.225286783042394e-05, + "loss": 0.363, + "step": 31080 + }, + { + "epoch": 7.753117206982544, + "grad_norm": 7.552138328552246, + "learning_rate": 1.2250374064837907e-05, + "loss": 0.4172, + "step": 31090 + }, + { + "epoch": 7.7556109725685785, + "grad_norm": 4.2307658195495605, + "learning_rate": 1.224788029925187e-05, + "loss": 0.4331, + "step": 31100 + }, + { + "epoch": 7.758104738154613, + "grad_norm": 6.373022556304932, + "learning_rate": 1.2245386533665837e-05, + "loss": 0.3571, + "step": 31110 + }, + { + "epoch": 7.760598503740648, + "grad_norm": 6.442211151123047, + "learning_rate": 1.22428927680798e-05, + "loss": 0.3335, + "step": 31120 + }, + { + "epoch": 7.763092269326683, + "grad_norm": 5.978911399841309, + "learning_rate": 1.2240399002493766e-05, + "loss": 0.3363, + "step": 31130 + }, + { + "epoch": 7.765586034912718, + "grad_norm": 7.628538131713867, + "learning_rate": 1.2237905236907731e-05, + "loss": 0.389, + "step": 31140 + }, + { + "epoch": 7.768079800498753, + "grad_norm": 5.834782600402832, + "learning_rate": 1.2235411471321697e-05, + "loss": 0.3835, + "step": 31150 + }, + { + "epoch": 7.770573566084788, + "grad_norm": 9.812804222106934, + "learning_rate": 1.2232917705735664e-05, + "loss": 0.3542, + "step": 31160 + }, + { + "epoch": 7.773067331670823, + "grad_norm": 5.178422451019287, + "learning_rate": 1.2230423940149627e-05, + "loss": 0.3612, + "step": 31170 + }, + { + "epoch": 7.775561097256858, + "grad_norm": 5.95904016494751, + "learning_rate": 1.2227930174563591e-05, + "loss": 0.3285, + "step": 31180 + }, + { + "epoch": 7.778054862842893, + "grad_norm": 7.034958362579346, + "learning_rate": 1.2225436408977558e-05, + "loss": 0.3777, + "step": 31190 + }, + { + "epoch": 7.780548628428928, + "grad_norm": 13.055363655090332, + "learning_rate": 1.2222942643391522e-05, + "loss": 0.4765, + "step": 31200 + }, + { + "epoch": 7.783042394014963, + "grad_norm": 8.228713989257812, + "learning_rate": 1.2220448877805487e-05, + "loss": 0.393, + "step": 31210 + }, + { + "epoch": 7.785536159600998, + "grad_norm": 8.480477333068848, + "learning_rate": 1.2217955112219452e-05, + "loss": 0.352, + "step": 31220 + }, + { + "epoch": 7.788029925187033, + "grad_norm": 7.811897277832031, + "learning_rate": 1.2215461346633418e-05, + "loss": 0.4076, + "step": 31230 + }, + { + "epoch": 7.7905236907730675, + "grad_norm": 6.502315521240234, + "learning_rate": 1.2212967581047383e-05, + "loss": 0.3238, + "step": 31240 + }, + { + "epoch": 7.793017456359102, + "grad_norm": 6.6692657470703125, + "learning_rate": 1.2210473815461348e-05, + "loss": 0.3445, + "step": 31250 + }, + { + "epoch": 7.795511221945137, + "grad_norm": 5.5514655113220215, + "learning_rate": 1.2207980049875312e-05, + "loss": 0.3603, + "step": 31260 + }, + { + "epoch": 7.798004987531172, + "grad_norm": 5.353459358215332, + "learning_rate": 1.2205486284289279e-05, + "loss": 0.3117, + "step": 31270 + }, + { + "epoch": 7.800498753117207, + "grad_norm": 6.266322135925293, + "learning_rate": 1.2202992518703242e-05, + "loss": 0.3608, + "step": 31280 + }, + { + "epoch": 7.802992518703242, + "grad_norm": 5.57569694519043, + "learning_rate": 1.2200498753117208e-05, + "loss": 0.3953, + "step": 31290 + }, + { + "epoch": 7.805486284289277, + "grad_norm": 5.770155429840088, + "learning_rate": 1.2198004987531173e-05, + "loss": 0.4057, + "step": 31300 + }, + { + "epoch": 7.807980049875312, + "grad_norm": 8.527885437011719, + "learning_rate": 1.2195511221945138e-05, + "loss": 0.4245, + "step": 31310 + }, + { + "epoch": 7.8104738154613464, + "grad_norm": 8.572240829467773, + "learning_rate": 1.2193017456359104e-05, + "loss": 0.3845, + "step": 31320 + }, + { + "epoch": 7.812967581047381, + "grad_norm": 8.089426040649414, + "learning_rate": 1.2190523690773069e-05, + "loss": 0.3858, + "step": 31330 + }, + { + "epoch": 7.815461346633416, + "grad_norm": 5.3781657218933105, + "learning_rate": 1.2188029925187033e-05, + "loss": 0.4435, + "step": 31340 + }, + { + "epoch": 7.817955112219451, + "grad_norm": 7.371554374694824, + "learning_rate": 1.2185536159601e-05, + "loss": 0.3908, + "step": 31350 + }, + { + "epoch": 7.820448877805486, + "grad_norm": 6.417791366577148, + "learning_rate": 1.2183042394014963e-05, + "loss": 0.3638, + "step": 31360 + }, + { + "epoch": 7.822942643391521, + "grad_norm": 8.140090942382812, + "learning_rate": 1.2180548628428927e-05, + "loss": 0.4004, + "step": 31370 + }, + { + "epoch": 7.825436408977556, + "grad_norm": 28.9359073638916, + "learning_rate": 1.2178054862842894e-05, + "loss": 0.3789, + "step": 31380 + }, + { + "epoch": 7.8279301745635905, + "grad_norm": 5.224822044372559, + "learning_rate": 1.2175561097256859e-05, + "loss": 0.3685, + "step": 31390 + }, + { + "epoch": 7.830423940149626, + "grad_norm": 5.957163333892822, + "learning_rate": 1.2173067331670824e-05, + "loss": 0.2989, + "step": 31400 + }, + { + "epoch": 7.832917705735661, + "grad_norm": 7.902908802032471, + "learning_rate": 1.217057356608479e-05, + "loss": 0.3216, + "step": 31410 + }, + { + "epoch": 7.835411471321696, + "grad_norm": 9.915661811828613, + "learning_rate": 1.2168079800498753e-05, + "loss": 0.4022, + "step": 31420 + }, + { + "epoch": 7.837905236907731, + "grad_norm": 8.479707717895508, + "learning_rate": 1.216558603491272e-05, + "loss": 0.3614, + "step": 31430 + }, + { + "epoch": 7.840399002493766, + "grad_norm": 5.247765064239502, + "learning_rate": 1.2163092269326684e-05, + "loss": 0.354, + "step": 31440 + }, + { + "epoch": 7.842892768079801, + "grad_norm": 9.220551490783691, + "learning_rate": 1.2160598503740651e-05, + "loss": 0.4473, + "step": 31450 + }, + { + "epoch": 7.8453865336658355, + "grad_norm": 5.292842864990234, + "learning_rate": 1.2158104738154615e-05, + "loss": 0.3996, + "step": 31460 + }, + { + "epoch": 7.84788029925187, + "grad_norm": 9.123703956604004, + "learning_rate": 1.2155610972568578e-05, + "loss": 0.4442, + "step": 31470 + }, + { + "epoch": 7.850374064837905, + "grad_norm": 5.475888252258301, + "learning_rate": 1.2153117206982545e-05, + "loss": 0.3169, + "step": 31480 + }, + { + "epoch": 7.85286783042394, + "grad_norm": 7.926106929779053, + "learning_rate": 1.2150623441396509e-05, + "loss": 0.3636, + "step": 31490 + }, + { + "epoch": 7.855361596009975, + "grad_norm": 6.426756858825684, + "learning_rate": 1.2148129675810474e-05, + "loss": 0.3884, + "step": 31500 + }, + { + "epoch": 7.85785536159601, + "grad_norm": 7.212559700012207, + "learning_rate": 1.2145635910224441e-05, + "loss": 0.3312, + "step": 31510 + }, + { + "epoch": 7.860349127182045, + "grad_norm": 5.426643371582031, + "learning_rate": 1.2143142144638405e-05, + "loss": 0.3075, + "step": 31520 + }, + { + "epoch": 7.86284289276808, + "grad_norm": 6.562136650085449, + "learning_rate": 1.2140648379052372e-05, + "loss": 0.4217, + "step": 31530 + }, + { + "epoch": 7.865336658354114, + "grad_norm": 7.82149076461792, + "learning_rate": 1.2138154613466335e-05, + "loss": 0.347, + "step": 31540 + }, + { + "epoch": 7.867830423940149, + "grad_norm": 6.736440181732178, + "learning_rate": 1.2135660847880299e-05, + "loss": 0.3143, + "step": 31550 + }, + { + "epoch": 7.870324189526184, + "grad_norm": 7.551044940948486, + "learning_rate": 1.2133167082294266e-05, + "loss": 0.4345, + "step": 31560 + }, + { + "epoch": 7.87281795511222, + "grad_norm": 11.6002836227417, + "learning_rate": 1.213067331670823e-05, + "loss": 0.3961, + "step": 31570 + }, + { + "epoch": 7.875311720698255, + "grad_norm": 4.926860809326172, + "learning_rate": 1.2128179551122195e-05, + "loss": 0.3755, + "step": 31580 + }, + { + "epoch": 7.87780548628429, + "grad_norm": 13.205385208129883, + "learning_rate": 1.212568578553616e-05, + "loss": 0.3714, + "step": 31590 + }, + { + "epoch": 7.8802992518703245, + "grad_norm": 4.073211193084717, + "learning_rate": 1.2123192019950126e-05, + "loss": 0.2892, + "step": 31600 + }, + { + "epoch": 7.882793017456359, + "grad_norm": 11.72350788116455, + "learning_rate": 1.212069825436409e-05, + "loss": 0.3547, + "step": 31610 + }, + { + "epoch": 7.885286783042394, + "grad_norm": 6.860603332519531, + "learning_rate": 1.2118204488778056e-05, + "loss": 0.3574, + "step": 31620 + }, + { + "epoch": 7.887780548628429, + "grad_norm": 5.04127836227417, + "learning_rate": 1.211571072319202e-05, + "loss": 0.3101, + "step": 31630 + }, + { + "epoch": 7.890274314214464, + "grad_norm": 5.518341064453125, + "learning_rate": 1.2113216957605987e-05, + "loss": 0.4506, + "step": 31640 + }, + { + "epoch": 7.892768079800499, + "grad_norm": 6.156492710113525, + "learning_rate": 1.211072319201995e-05, + "loss": 0.3326, + "step": 31650 + }, + { + "epoch": 7.895261845386534, + "grad_norm": 7.646767616271973, + "learning_rate": 1.2108229426433917e-05, + "loss": 0.3646, + "step": 31660 + }, + { + "epoch": 7.897755610972569, + "grad_norm": 5.843283176422119, + "learning_rate": 1.2105735660847881e-05, + "loss": 0.418, + "step": 31670 + }, + { + "epoch": 7.9002493765586035, + "grad_norm": 7.041114330291748, + "learning_rate": 1.2103241895261846e-05, + "loss": 0.3759, + "step": 31680 + }, + { + "epoch": 7.902743142144638, + "grad_norm": 8.82856273651123, + "learning_rate": 1.2100748129675812e-05, + "loss": 0.478, + "step": 31690 + }, + { + "epoch": 7.905236907730673, + "grad_norm": 6.0212273597717285, + "learning_rate": 1.2098254364089777e-05, + "loss": 0.3257, + "step": 31700 + }, + { + "epoch": 7.907730673316708, + "grad_norm": 8.000700950622559, + "learning_rate": 1.209576059850374e-05, + "loss": 0.3212, + "step": 31710 + }, + { + "epoch": 7.910224438902743, + "grad_norm": 9.041226387023926, + "learning_rate": 1.2093266832917708e-05, + "loss": 0.3234, + "step": 31720 + }, + { + "epoch": 7.912718204488778, + "grad_norm": 10.657549858093262, + "learning_rate": 1.2090773067331671e-05, + "loss": 0.4932, + "step": 31730 + }, + { + "epoch": 7.915211970074813, + "grad_norm": 8.23529052734375, + "learning_rate": 1.2088279301745638e-05, + "loss": 0.3676, + "step": 31740 + }, + { + "epoch": 7.917705735660848, + "grad_norm": 5.820125102996826, + "learning_rate": 1.2085785536159602e-05, + "loss": 0.38, + "step": 31750 + }, + { + "epoch": 7.920199501246882, + "grad_norm": 7.681821346282959, + "learning_rate": 1.2083291770573567e-05, + "loss": 0.3368, + "step": 31760 + }, + { + "epoch": 7.922693266832917, + "grad_norm": 6.52007532119751, + "learning_rate": 1.2080798004987532e-05, + "loss": 0.3466, + "step": 31770 + }, + { + "epoch": 7.925187032418952, + "grad_norm": 8.937047958374023, + "learning_rate": 1.2078304239401498e-05, + "loss": 0.3921, + "step": 31780 + }, + { + "epoch": 7.927680798004987, + "grad_norm": 7.19578742980957, + "learning_rate": 1.2075810473815461e-05, + "loss": 0.3695, + "step": 31790 + }, + { + "epoch": 7.930174563591023, + "grad_norm": 9.430617332458496, + "learning_rate": 1.2073316708229428e-05, + "loss": 0.3753, + "step": 31800 + }, + { + "epoch": 7.932668329177058, + "grad_norm": 5.713561058044434, + "learning_rate": 1.2070822942643392e-05, + "loss": 0.4164, + "step": 31810 + }, + { + "epoch": 7.9351620947630925, + "grad_norm": 6.72670316696167, + "learning_rate": 1.2068329177057359e-05, + "loss": 0.4523, + "step": 31820 + }, + { + "epoch": 7.937655860349127, + "grad_norm": 6.967277526855469, + "learning_rate": 1.2065835411471323e-05, + "loss": 0.4086, + "step": 31830 + }, + { + "epoch": 7.940149625935162, + "grad_norm": 6.393032073974609, + "learning_rate": 1.2063341645885286e-05, + "loss": 0.3532, + "step": 31840 + }, + { + "epoch": 7.942643391521197, + "grad_norm": 8.138836860656738, + "learning_rate": 1.2060847880299253e-05, + "loss": 0.3699, + "step": 31850 + }, + { + "epoch": 7.945137157107232, + "grad_norm": 7.282432556152344, + "learning_rate": 1.2058354114713218e-05, + "loss": 0.3925, + "step": 31860 + }, + { + "epoch": 7.947630922693267, + "grad_norm": 5.286504745483398, + "learning_rate": 1.2055860349127182e-05, + "loss": 0.2947, + "step": 31870 + }, + { + "epoch": 7.950124688279302, + "grad_norm": 7.868513584136963, + "learning_rate": 1.2053366583541149e-05, + "loss": 0.501, + "step": 31880 + }, + { + "epoch": 7.952618453865337, + "grad_norm": 8.221051216125488, + "learning_rate": 1.2050872817955113e-05, + "loss": 0.4184, + "step": 31890 + }, + { + "epoch": 7.9551122194513715, + "grad_norm": 8.704934120178223, + "learning_rate": 1.204837905236908e-05, + "loss": 0.3372, + "step": 31900 + }, + { + "epoch": 7.957605985037406, + "grad_norm": 8.388035774230957, + "learning_rate": 1.2045885286783043e-05, + "loss": 0.2941, + "step": 31910 + }, + { + "epoch": 7.960099750623441, + "grad_norm": 6.82948637008667, + "learning_rate": 1.2043391521197007e-05, + "loss": 0.3935, + "step": 31920 + }, + { + "epoch": 7.962593516209476, + "grad_norm": 18.161638259887695, + "learning_rate": 1.2040897755610974e-05, + "loss": 0.4707, + "step": 31930 + }, + { + "epoch": 7.965087281795511, + "grad_norm": 7.017074108123779, + "learning_rate": 1.2038403990024938e-05, + "loss": 0.3647, + "step": 31940 + }, + { + "epoch": 7.967581047381546, + "grad_norm": 7.731515884399414, + "learning_rate": 1.2035910224438905e-05, + "loss": 0.3442, + "step": 31950 + }, + { + "epoch": 7.970074812967581, + "grad_norm": 5.6125898361206055, + "learning_rate": 1.2033416458852868e-05, + "loss": 0.4005, + "step": 31960 + }, + { + "epoch": 7.9725685785536164, + "grad_norm": 8.200444221496582, + "learning_rate": 1.2030922693266834e-05, + "loss": 0.4055, + "step": 31970 + }, + { + "epoch": 7.975062344139651, + "grad_norm": 8.561641693115234, + "learning_rate": 1.20284289276808e-05, + "loss": 0.3479, + "step": 31980 + }, + { + "epoch": 7.977556109725686, + "grad_norm": 6.790326118469238, + "learning_rate": 1.2025935162094764e-05, + "loss": 0.4134, + "step": 31990 + }, + { + "epoch": 7.980049875311721, + "grad_norm": 6.383502006530762, + "learning_rate": 1.2023441396508728e-05, + "loss": 0.3949, + "step": 32000 + }, + { + "epoch": 7.982543640897756, + "grad_norm": 5.95525598526001, + "learning_rate": 1.2020947630922695e-05, + "loss": 0.4402, + "step": 32010 + }, + { + "epoch": 7.985037406483791, + "grad_norm": 7.909287929534912, + "learning_rate": 1.2018453865336658e-05, + "loss": 0.349, + "step": 32020 + }, + { + "epoch": 7.987531172069826, + "grad_norm": 5.539024829864502, + "learning_rate": 1.2015960099750625e-05, + "loss": 0.3202, + "step": 32030 + }, + { + "epoch": 7.9900249376558605, + "grad_norm": 7.293623447418213, + "learning_rate": 1.2013466334164589e-05, + "loss": 0.3595, + "step": 32040 + }, + { + "epoch": 7.992518703241895, + "grad_norm": 6.55578088760376, + "learning_rate": 1.2010972568578554e-05, + "loss": 0.3922, + "step": 32050 + }, + { + "epoch": 7.99501246882793, + "grad_norm": 5.113552093505859, + "learning_rate": 1.200847880299252e-05, + "loss": 0.4029, + "step": 32060 + }, + { + "epoch": 7.997506234413965, + "grad_norm": 6.873347282409668, + "learning_rate": 1.2005985037406485e-05, + "loss": 0.3852, + "step": 32070 + }, + { + "epoch": 8.0, + "grad_norm": 10.353362083435059, + "learning_rate": 1.2003491271820449e-05, + "loss": 0.3054, + "step": 32080 + }, + { + "epoch": 8.0, + "eval_loss": 0.4143351912498474, + "eval_runtime": 59.8965, + "eval_samples_per_second": 16.746, + "eval_steps_per_second": 16.746, + "step": 32080 + }, + { + "epoch": 8.002493765586035, + "grad_norm": 6.222545623779297, + "learning_rate": 1.2000997506234416e-05, + "loss": 0.3814, + "step": 32090 + }, + { + "epoch": 8.00498753117207, + "grad_norm": 6.579477310180664, + "learning_rate": 1.1998503740648379e-05, + "loss": 0.3755, + "step": 32100 + }, + { + "epoch": 8.007481296758105, + "grad_norm": 4.621208667755127, + "learning_rate": 1.1996009975062346e-05, + "loss": 0.2821, + "step": 32110 + }, + { + "epoch": 8.00997506234414, + "grad_norm": 5.9014081954956055, + "learning_rate": 1.199351620947631e-05, + "loss": 0.3424, + "step": 32120 + }, + { + "epoch": 8.012468827930174, + "grad_norm": 8.920719146728516, + "learning_rate": 1.1991022443890275e-05, + "loss": 0.4108, + "step": 32130 + }, + { + "epoch": 8.01496259351621, + "grad_norm": 7.701635360717773, + "learning_rate": 1.198852867830424e-05, + "loss": 0.3543, + "step": 32140 + }, + { + "epoch": 8.017456359102244, + "grad_norm": 6.4233856201171875, + "learning_rate": 1.1986034912718206e-05, + "loss": 0.3799, + "step": 32150 + }, + { + "epoch": 8.019950124688279, + "grad_norm": 9.75025463104248, + "learning_rate": 1.1983541147132171e-05, + "loss": 0.4066, + "step": 32160 + }, + { + "epoch": 8.022443890274314, + "grad_norm": 7.719704627990723, + "learning_rate": 1.1981047381546136e-05, + "loss": 0.3938, + "step": 32170 + }, + { + "epoch": 8.024937655860349, + "grad_norm": 4.815377712249756, + "learning_rate": 1.19785536159601e-05, + "loss": 0.3635, + "step": 32180 + }, + { + "epoch": 8.027431421446384, + "grad_norm": 7.045759201049805, + "learning_rate": 1.197630922693267e-05, + "loss": 0.3618, + "step": 32190 + }, + { + "epoch": 8.029925187032418, + "grad_norm": 5.257409572601318, + "learning_rate": 1.1973815461346634e-05, + "loss": 0.3325, + "step": 32200 + }, + { + "epoch": 8.032418952618453, + "grad_norm": 12.353095054626465, + "learning_rate": 1.19713216957606e-05, + "loss": 0.356, + "step": 32210 + }, + { + "epoch": 8.034912718204488, + "grad_norm": 5.638147830963135, + "learning_rate": 1.1968827930174564e-05, + "loss": 0.3877, + "step": 32220 + }, + { + "epoch": 8.037406483790523, + "grad_norm": 4.99983024597168, + "learning_rate": 1.196633416458853e-05, + "loss": 0.357, + "step": 32230 + }, + { + "epoch": 8.039900249376558, + "grad_norm": 5.99961519241333, + "learning_rate": 1.1963840399002495e-05, + "loss": 0.3408, + "step": 32240 + }, + { + "epoch": 8.042394014962593, + "grad_norm": 8.61434555053711, + "learning_rate": 1.196134663341646e-05, + "loss": 0.3429, + "step": 32250 + }, + { + "epoch": 8.044887780548628, + "grad_norm": 6.741703033447266, + "learning_rate": 1.1958852867830424e-05, + "loss": 0.3562, + "step": 32260 + }, + { + "epoch": 8.047381546134662, + "grad_norm": 4.7175703048706055, + "learning_rate": 1.195635910224439e-05, + "loss": 0.3447, + "step": 32270 + }, + { + "epoch": 8.049875311720697, + "grad_norm": 6.885346412658691, + "learning_rate": 1.1953865336658354e-05, + "loss": 0.3637, + "step": 32280 + }, + { + "epoch": 8.052369077306734, + "grad_norm": 7.284684181213379, + "learning_rate": 1.1951371571072321e-05, + "loss": 0.3799, + "step": 32290 + }, + { + "epoch": 8.054862842892769, + "grad_norm": 8.595766067504883, + "learning_rate": 1.1948877805486285e-05, + "loss": 0.3967, + "step": 32300 + }, + { + "epoch": 8.057356608478804, + "grad_norm": 6.696188926696777, + "learning_rate": 1.194638403990025e-05, + "loss": 0.3993, + "step": 32310 + }, + { + "epoch": 8.059850374064839, + "grad_norm": 5.798731327056885, + "learning_rate": 1.1943890274314216e-05, + "loss": 0.3159, + "step": 32320 + }, + { + "epoch": 8.062344139650873, + "grad_norm": 6.5133891105651855, + "learning_rate": 1.1941396508728181e-05, + "loss": 0.3556, + "step": 32330 + }, + { + "epoch": 8.064837905236908, + "grad_norm": 8.15516471862793, + "learning_rate": 1.1938902743142146e-05, + "loss": 0.3533, + "step": 32340 + }, + { + "epoch": 8.067331670822943, + "grad_norm": 9.06305980682373, + "learning_rate": 1.1936408977556111e-05, + "loss": 0.419, + "step": 32350 + }, + { + "epoch": 8.069825436408978, + "grad_norm": 5.8989644050598145, + "learning_rate": 1.1933915211970075e-05, + "loss": 0.3469, + "step": 32360 + }, + { + "epoch": 8.072319201995013, + "grad_norm": 4.38079833984375, + "learning_rate": 1.1931421446384042e-05, + "loss": 0.3461, + "step": 32370 + }, + { + "epoch": 8.074812967581048, + "grad_norm": 7.34318208694458, + "learning_rate": 1.1928927680798006e-05, + "loss": 0.3793, + "step": 32380 + }, + { + "epoch": 8.077306733167083, + "grad_norm": 6.742260932922363, + "learning_rate": 1.192643391521197e-05, + "loss": 0.3649, + "step": 32390 + }, + { + "epoch": 8.079800498753118, + "grad_norm": 6.300445079803467, + "learning_rate": 1.1923940149625936e-05, + "loss": 0.3527, + "step": 32400 + }, + { + "epoch": 8.082294264339152, + "grad_norm": 12.198246002197266, + "learning_rate": 1.1921446384039902e-05, + "loss": 0.3832, + "step": 32410 + }, + { + "epoch": 8.084788029925187, + "grad_norm": 6.432736873626709, + "learning_rate": 1.1918952618453867e-05, + "loss": 0.3424, + "step": 32420 + }, + { + "epoch": 8.087281795511222, + "grad_norm": 7.7809271812438965, + "learning_rate": 1.1916458852867832e-05, + "loss": 0.3318, + "step": 32430 + }, + { + "epoch": 8.089775561097257, + "grad_norm": 8.33100700378418, + "learning_rate": 1.1913965087281796e-05, + "loss": 0.3166, + "step": 32440 + }, + { + "epoch": 8.092269326683292, + "grad_norm": 7.229568958282471, + "learning_rate": 1.1911471321695763e-05, + "loss": 0.4009, + "step": 32450 + }, + { + "epoch": 8.094763092269327, + "grad_norm": 6.71392822265625, + "learning_rate": 1.1908977556109726e-05, + "loss": 0.3876, + "step": 32460 + }, + { + "epoch": 8.097256857855362, + "grad_norm": 7.794308185577393, + "learning_rate": 1.190648379052369e-05, + "loss": 0.406, + "step": 32470 + }, + { + "epoch": 8.099750623441397, + "grad_norm": 6.342446327209473, + "learning_rate": 1.1903990024937657e-05, + "loss": 0.3472, + "step": 32480 + }, + { + "epoch": 8.102244389027431, + "grad_norm": 7.7976393699646, + "learning_rate": 1.190149625935162e-05, + "loss": 0.3918, + "step": 32490 + }, + { + "epoch": 8.104738154613466, + "grad_norm": 5.704341411590576, + "learning_rate": 1.1899002493765588e-05, + "loss": 0.3502, + "step": 32500 + }, + { + "epoch": 8.107231920199501, + "grad_norm": 6.822027206420898, + "learning_rate": 1.1896508728179551e-05, + "loss": 0.3749, + "step": 32510 + }, + { + "epoch": 8.109725685785536, + "grad_norm": 6.865316867828369, + "learning_rate": 1.1894014962593517e-05, + "loss": 0.3452, + "step": 32520 + }, + { + "epoch": 8.11221945137157, + "grad_norm": 9.309464454650879, + "learning_rate": 1.1891521197007484e-05, + "loss": 0.3609, + "step": 32530 + }, + { + "epoch": 8.114713216957606, + "grad_norm": 6.361128807067871, + "learning_rate": 1.1889027431421447e-05, + "loss": 0.3868, + "step": 32540 + }, + { + "epoch": 8.11720698254364, + "grad_norm": 6.661703586578369, + "learning_rate": 1.1886533665835411e-05, + "loss": 0.3508, + "step": 32550 + }, + { + "epoch": 8.119700748129675, + "grad_norm": 5.073139667510986, + "learning_rate": 1.1884039900249378e-05, + "loss": 0.4003, + "step": 32560 + }, + { + "epoch": 8.12219451371571, + "grad_norm": 5.183114051818848, + "learning_rate": 1.1881546134663342e-05, + "loss": 0.3226, + "step": 32570 + }, + { + "epoch": 8.124688279301745, + "grad_norm": 6.288895130157471, + "learning_rate": 1.1879052369077309e-05, + "loss": 0.4334, + "step": 32580 + }, + { + "epoch": 8.12718204488778, + "grad_norm": 6.090054988861084, + "learning_rate": 1.1876558603491272e-05, + "loss": 0.3291, + "step": 32590 + }, + { + "epoch": 8.129675810473815, + "grad_norm": 7.359978675842285, + "learning_rate": 1.1874064837905237e-05, + "loss": 0.4043, + "step": 32600 + }, + { + "epoch": 8.13216957605985, + "grad_norm": 9.150673866271973, + "learning_rate": 1.1871571072319203e-05, + "loss": 0.3534, + "step": 32610 + }, + { + "epoch": 8.134663341645885, + "grad_norm": 10.117431640625, + "learning_rate": 1.1869077306733168e-05, + "loss": 0.3602, + "step": 32620 + }, + { + "epoch": 8.13715710723192, + "grad_norm": 4.644679546356201, + "learning_rate": 1.1866583541147133e-05, + "loss": 0.3498, + "step": 32630 + }, + { + "epoch": 8.139650872817954, + "grad_norm": 6.087995529174805, + "learning_rate": 1.1864089775561099e-05, + "loss": 0.3325, + "step": 32640 + }, + { + "epoch": 8.14214463840399, + "grad_norm": 6.6635050773620605, + "learning_rate": 1.1861596009975062e-05, + "loss": 0.3532, + "step": 32650 + }, + { + "epoch": 8.144638403990024, + "grad_norm": 7.93565559387207, + "learning_rate": 1.185910224438903e-05, + "loss": 0.3327, + "step": 32660 + }, + { + "epoch": 8.147132169576059, + "grad_norm": 5.548285961151123, + "learning_rate": 1.1856608478802993e-05, + "loss": 0.409, + "step": 32670 + }, + { + "epoch": 8.149625935162096, + "grad_norm": 6.113910675048828, + "learning_rate": 1.1854114713216958e-05, + "loss": 0.3934, + "step": 32680 + }, + { + "epoch": 8.15211970074813, + "grad_norm": 8.297721862792969, + "learning_rate": 1.1851620947630924e-05, + "loss": 0.4034, + "step": 32690 + }, + { + "epoch": 8.154613466334165, + "grad_norm": 5.073780536651611, + "learning_rate": 1.1849127182044889e-05, + "loss": 0.326, + "step": 32700 + }, + { + "epoch": 8.1571072319202, + "grad_norm": 6.652329921722412, + "learning_rate": 1.1846633416458854e-05, + "loss": 0.3051, + "step": 32710 + }, + { + "epoch": 8.159600997506235, + "grad_norm": 8.228754043579102, + "learning_rate": 1.184413965087282e-05, + "loss": 0.346, + "step": 32720 + }, + { + "epoch": 8.16209476309227, + "grad_norm": 5.232443809509277, + "learning_rate": 1.1841645885286783e-05, + "loss": 0.3189, + "step": 32730 + }, + { + "epoch": 8.164588528678305, + "grad_norm": 6.608932018280029, + "learning_rate": 1.183915211970075e-05, + "loss": 0.291, + "step": 32740 + }, + { + "epoch": 8.16708229426434, + "grad_norm": 6.14049768447876, + "learning_rate": 1.1836658354114714e-05, + "loss": 0.3777, + "step": 32750 + }, + { + "epoch": 8.169576059850375, + "grad_norm": 6.535306453704834, + "learning_rate": 1.1834164588528679e-05, + "loss": 0.3807, + "step": 32760 + }, + { + "epoch": 8.17206982543641, + "grad_norm": 8.01197624206543, + "learning_rate": 1.1831670822942644e-05, + "loss": 0.3618, + "step": 32770 + }, + { + "epoch": 8.174563591022444, + "grad_norm": 6.140148639678955, + "learning_rate": 1.182917705735661e-05, + "loss": 0.396, + "step": 32780 + }, + { + "epoch": 8.17705735660848, + "grad_norm": 9.92581844329834, + "learning_rate": 1.1826683291770575e-05, + "loss": 0.4123, + "step": 32790 + }, + { + "epoch": 8.179551122194514, + "grad_norm": 5.655374526977539, + "learning_rate": 1.182418952618454e-05, + "loss": 0.3478, + "step": 32800 + }, + { + "epoch": 8.182044887780549, + "grad_norm": 6.091732501983643, + "learning_rate": 1.1821695760598504e-05, + "loss": 0.34, + "step": 32810 + }, + { + "epoch": 8.184538653366584, + "grad_norm": 6.486967086791992, + "learning_rate": 1.1819201995012471e-05, + "loss": 0.3117, + "step": 32820 + }, + { + "epoch": 8.187032418952619, + "grad_norm": 6.588410377502441, + "learning_rate": 1.1816708229426434e-05, + "loss": 0.3123, + "step": 32830 + }, + { + "epoch": 8.189526184538654, + "grad_norm": 5.499619007110596, + "learning_rate": 1.1814214463840401e-05, + "loss": 0.3842, + "step": 32840 + }, + { + "epoch": 8.192019950124688, + "grad_norm": 11.53024673461914, + "learning_rate": 1.1811720698254365e-05, + "loss": 0.437, + "step": 32850 + }, + { + "epoch": 8.194513715710723, + "grad_norm": 9.12915325164795, + "learning_rate": 1.1809226932668329e-05, + "loss": 0.3859, + "step": 32860 + }, + { + "epoch": 8.197007481296758, + "grad_norm": 6.353451728820801, + "learning_rate": 1.1806733167082296e-05, + "loss": 0.3672, + "step": 32870 + }, + { + "epoch": 8.199501246882793, + "grad_norm": 8.243021965026855, + "learning_rate": 1.1804239401496261e-05, + "loss": 0.351, + "step": 32880 + }, + { + "epoch": 8.201995012468828, + "grad_norm": 10.141286849975586, + "learning_rate": 1.1801745635910225e-05, + "loss": 0.3614, + "step": 32890 + }, + { + "epoch": 8.204488778054863, + "grad_norm": 6.647899627685547, + "learning_rate": 1.1799251870324192e-05, + "loss": 0.3553, + "step": 32900 + }, + { + "epoch": 8.206982543640898, + "grad_norm": 8.836320877075195, + "learning_rate": 1.1796758104738155e-05, + "loss": 0.3578, + "step": 32910 + }, + { + "epoch": 8.209476309226932, + "grad_norm": 5.279130458831787, + "learning_rate": 1.1794264339152122e-05, + "loss": 0.3746, + "step": 32920 + }, + { + "epoch": 8.211970074812967, + "grad_norm": 8.758357048034668, + "learning_rate": 1.1791770573566086e-05, + "loss": 0.3627, + "step": 32930 + }, + { + "epoch": 8.214463840399002, + "grad_norm": 7.450492858886719, + "learning_rate": 1.178927680798005e-05, + "loss": 0.3138, + "step": 32940 + }, + { + "epoch": 8.216957605985037, + "grad_norm": 7.956082820892334, + "learning_rate": 1.1786783042394016e-05, + "loss": 0.3486, + "step": 32950 + }, + { + "epoch": 8.219451371571072, + "grad_norm": 6.743360996246338, + "learning_rate": 1.178428927680798e-05, + "loss": 0.3642, + "step": 32960 + }, + { + "epoch": 8.221945137157107, + "grad_norm": 8.854829788208008, + "learning_rate": 1.1781795511221945e-05, + "loss": 0.3638, + "step": 32970 + }, + { + "epoch": 8.224438902743142, + "grad_norm": 8.033832550048828, + "learning_rate": 1.177930174563591e-05, + "loss": 0.3792, + "step": 32980 + }, + { + "epoch": 8.226932668329177, + "grad_norm": 3.877129554748535, + "learning_rate": 1.1776807980049876e-05, + "loss": 0.3361, + "step": 32990 + }, + { + "epoch": 8.229426433915211, + "grad_norm": 7.193696975708008, + "learning_rate": 1.1774314214463843e-05, + "loss": 0.3768, + "step": 33000 + }, + { + "epoch": 8.231920199501246, + "grad_norm": 6.511844635009766, + "learning_rate": 1.1771820448877807e-05, + "loss": 0.3266, + "step": 33010 + }, + { + "epoch": 8.234413965087281, + "grad_norm": 7.392646789550781, + "learning_rate": 1.176932668329177e-05, + "loss": 0.3634, + "step": 33020 + }, + { + "epoch": 8.236907730673316, + "grad_norm": 7.599452495574951, + "learning_rate": 1.1766832917705737e-05, + "loss": 0.4454, + "step": 33030 + }, + { + "epoch": 8.239401496259351, + "grad_norm": 8.29662036895752, + "learning_rate": 1.1764339152119701e-05, + "loss": 0.3168, + "step": 33040 + }, + { + "epoch": 8.241895261845386, + "grad_norm": 9.465046882629395, + "learning_rate": 1.1761845386533666e-05, + "loss": 0.4269, + "step": 33050 + }, + { + "epoch": 8.24438902743142, + "grad_norm": 5.553979396820068, + "learning_rate": 1.1759351620947632e-05, + "loss": 0.3853, + "step": 33060 + }, + { + "epoch": 8.246882793017456, + "grad_norm": 10.552316665649414, + "learning_rate": 1.1756857855361597e-05, + "loss": 0.3968, + "step": 33070 + }, + { + "epoch": 8.24937655860349, + "grad_norm": 6.383450984954834, + "learning_rate": 1.1754364089775562e-05, + "loss": 0.3565, + "step": 33080 + }, + { + "epoch": 8.251870324189527, + "grad_norm": 6.994113922119141, + "learning_rate": 1.1751870324189527e-05, + "loss": 0.3601, + "step": 33090 + }, + { + "epoch": 8.254364089775562, + "grad_norm": 5.046513080596924, + "learning_rate": 1.1749376558603491e-05, + "loss": 0.3747, + "step": 33100 + }, + { + "epoch": 8.256857855361597, + "grad_norm": 10.145176887512207, + "learning_rate": 1.1746882793017458e-05, + "loss": 0.4286, + "step": 33110 + }, + { + "epoch": 8.259351620947632, + "grad_norm": 8.742334365844727, + "learning_rate": 1.1744389027431422e-05, + "loss": 0.3592, + "step": 33120 + }, + { + "epoch": 8.261845386533667, + "grad_norm": 7.104617595672607, + "learning_rate": 1.1741895261845389e-05, + "loss": 0.3678, + "step": 33130 + }, + { + "epoch": 8.264339152119701, + "grad_norm": 7.787363529205322, + "learning_rate": 1.1739401496259352e-05, + "loss": 0.3511, + "step": 33140 + }, + { + "epoch": 8.266832917705736, + "grad_norm": 5.79642915725708, + "learning_rate": 1.1736907730673318e-05, + "loss": 0.3319, + "step": 33150 + }, + { + "epoch": 8.269326683291771, + "grad_norm": 7.905724048614502, + "learning_rate": 1.1734413965087283e-05, + "loss": 0.3393, + "step": 33160 + }, + { + "epoch": 8.271820448877806, + "grad_norm": 9.876928329467773, + "learning_rate": 1.1731920199501248e-05, + "loss": 0.3568, + "step": 33170 + }, + { + "epoch": 8.27431421446384, + "grad_norm": 9.139424324035645, + "learning_rate": 1.1729426433915212e-05, + "loss": 0.4312, + "step": 33180 + }, + { + "epoch": 8.276807980049876, + "grad_norm": 4.746652126312256, + "learning_rate": 1.1726932668329179e-05, + "loss": 0.3464, + "step": 33190 + }, + { + "epoch": 8.27930174563591, + "grad_norm": 7.954774379730225, + "learning_rate": 1.1724438902743142e-05, + "loss": 0.328, + "step": 33200 + }, + { + "epoch": 8.281795511221945, + "grad_norm": 8.069437980651855, + "learning_rate": 1.172194513715711e-05, + "loss": 0.4912, + "step": 33210 + }, + { + "epoch": 8.28428927680798, + "grad_norm": 5.376629829406738, + "learning_rate": 1.1719451371571073e-05, + "loss": 0.339, + "step": 33220 + }, + { + "epoch": 8.286783042394015, + "grad_norm": 5.5280632972717285, + "learning_rate": 1.1716957605985038e-05, + "loss": 0.3914, + "step": 33230 + }, + { + "epoch": 8.28927680798005, + "grad_norm": 6.899628162384033, + "learning_rate": 1.1714463840399004e-05, + "loss": 0.362, + "step": 33240 + }, + { + "epoch": 8.291770573566085, + "grad_norm": 5.153944969177246, + "learning_rate": 1.1711970074812969e-05, + "loss": 0.3784, + "step": 33250 + }, + { + "epoch": 8.29426433915212, + "grad_norm": 5.902993679046631, + "learning_rate": 1.1709476309226933e-05, + "loss": 0.3422, + "step": 33260 + }, + { + "epoch": 8.296758104738155, + "grad_norm": 5.776882648468018, + "learning_rate": 1.17069825436409e-05, + "loss": 0.3242, + "step": 33270 + }, + { + "epoch": 8.29925187032419, + "grad_norm": 6.157176494598389, + "learning_rate": 1.1704488778054863e-05, + "loss": 0.3903, + "step": 33280 + }, + { + "epoch": 8.301745635910224, + "grad_norm": 9.823055267333984, + "learning_rate": 1.170199501246883e-05, + "loss": 0.3959, + "step": 33290 + }, + { + "epoch": 8.30423940149626, + "grad_norm": 8.448189735412598, + "learning_rate": 1.1699501246882794e-05, + "loss": 0.3757, + "step": 33300 + }, + { + "epoch": 8.306733167082294, + "grad_norm": 8.766897201538086, + "learning_rate": 1.1697007481296757e-05, + "loss": 0.3782, + "step": 33310 + }, + { + "epoch": 8.309226932668329, + "grad_norm": 5.299570560455322, + "learning_rate": 1.1694513715710724e-05, + "loss": 0.3758, + "step": 33320 + }, + { + "epoch": 8.311720698254364, + "grad_norm": 5.184985637664795, + "learning_rate": 1.1692019950124688e-05, + "loss": 0.3417, + "step": 33330 + }, + { + "epoch": 8.314214463840399, + "grad_norm": 6.362977981567383, + "learning_rate": 1.1689526184538655e-05, + "loss": 0.341, + "step": 33340 + }, + { + "epoch": 8.316708229426434, + "grad_norm": 7.140589714050293, + "learning_rate": 1.168703241895262e-05, + "loss": 0.3829, + "step": 33350 + }, + { + "epoch": 8.319201995012468, + "grad_norm": 7.842562675476074, + "learning_rate": 1.1684538653366584e-05, + "loss": 0.293, + "step": 33360 + }, + { + "epoch": 8.321695760598503, + "grad_norm": 7.240875244140625, + "learning_rate": 1.1682044887780551e-05, + "loss": 0.4359, + "step": 33370 + }, + { + "epoch": 8.324189526184538, + "grad_norm": 10.326569557189941, + "learning_rate": 1.1679551122194515e-05, + "loss": 0.342, + "step": 33380 + }, + { + "epoch": 8.326683291770573, + "grad_norm": 8.610823631286621, + "learning_rate": 1.1677057356608478e-05, + "loss": 0.3688, + "step": 33390 + }, + { + "epoch": 8.329177057356608, + "grad_norm": 7.092180252075195, + "learning_rate": 1.1674563591022445e-05, + "loss": 0.3506, + "step": 33400 + }, + { + "epoch": 8.331670822942643, + "grad_norm": 6.74815559387207, + "learning_rate": 1.1672069825436409e-05, + "loss": 0.3515, + "step": 33410 + }, + { + "epoch": 8.334164588528678, + "grad_norm": 6.766687393188477, + "learning_rate": 1.1669576059850376e-05, + "loss": 0.3856, + "step": 33420 + }, + { + "epoch": 8.336658354114713, + "grad_norm": 7.77371883392334, + "learning_rate": 1.166708229426434e-05, + "loss": 0.3606, + "step": 33430 + }, + { + "epoch": 8.339152119700747, + "grad_norm": 8.904953956604004, + "learning_rate": 1.1664588528678305e-05, + "loss": 0.4668, + "step": 33440 + }, + { + "epoch": 8.341645885286782, + "grad_norm": 4.797825336456299, + "learning_rate": 1.166209476309227e-05, + "loss": 0.363, + "step": 33450 + }, + { + "epoch": 8.344139650872817, + "grad_norm": 9.200943946838379, + "learning_rate": 1.1659600997506235e-05, + "loss": 0.3227, + "step": 33460 + }, + { + "epoch": 8.346633416458852, + "grad_norm": 6.8330888748168945, + "learning_rate": 1.1657107231920199e-05, + "loss": 0.3053, + "step": 33470 + }, + { + "epoch": 8.349127182044889, + "grad_norm": 8.612521171569824, + "learning_rate": 1.1654613466334166e-05, + "loss": 0.3805, + "step": 33480 + }, + { + "epoch": 8.351620947630924, + "grad_norm": 6.210586071014404, + "learning_rate": 1.165211970074813e-05, + "loss": 0.396, + "step": 33490 + }, + { + "epoch": 8.354114713216958, + "grad_norm": 6.151739597320557, + "learning_rate": 1.1649625935162097e-05, + "loss": 0.4046, + "step": 33500 + }, + { + "epoch": 8.356608478802993, + "grad_norm": 7.85900354385376, + "learning_rate": 1.164713216957606e-05, + "loss": 0.3673, + "step": 33510 + }, + { + "epoch": 8.359102244389028, + "grad_norm": 6.287712097167969, + "learning_rate": 1.1644638403990026e-05, + "loss": 0.3073, + "step": 33520 + }, + { + "epoch": 8.361596009975063, + "grad_norm": 6.707120418548584, + "learning_rate": 1.1642144638403991e-05, + "loss": 0.3577, + "step": 33530 + }, + { + "epoch": 8.364089775561098, + "grad_norm": 6.422928810119629, + "learning_rate": 1.1639650872817956e-05, + "loss": 0.3452, + "step": 33540 + }, + { + "epoch": 8.366583541147133, + "grad_norm": 7.4128031730651855, + "learning_rate": 1.163715710723192e-05, + "loss": 0.4418, + "step": 33550 + }, + { + "epoch": 8.369077306733168, + "grad_norm": 7.202417850494385, + "learning_rate": 1.1634663341645887e-05, + "loss": 0.4487, + "step": 33560 + }, + { + "epoch": 8.371571072319203, + "grad_norm": 6.886559963226318, + "learning_rate": 1.163216957605985e-05, + "loss": 0.399, + "step": 33570 + }, + { + "epoch": 8.374064837905237, + "grad_norm": 6.747541427612305, + "learning_rate": 1.1629675810473817e-05, + "loss": 0.345, + "step": 33580 + }, + { + "epoch": 8.376558603491272, + "grad_norm": 5.691372871398926, + "learning_rate": 1.1627182044887781e-05, + "loss": 0.3636, + "step": 33590 + }, + { + "epoch": 8.379052369077307, + "grad_norm": 7.294488906860352, + "learning_rate": 1.1624688279301746e-05, + "loss": 0.3643, + "step": 33600 + }, + { + "epoch": 8.381546134663342, + "grad_norm": 6.717970371246338, + "learning_rate": 1.1622194513715712e-05, + "loss": 0.3814, + "step": 33610 + }, + { + "epoch": 8.384039900249377, + "grad_norm": 4.82089376449585, + "learning_rate": 1.1619700748129677e-05, + "loss": 0.3283, + "step": 33620 + }, + { + "epoch": 8.386533665835412, + "grad_norm": 10.288949966430664, + "learning_rate": 1.1617206982543642e-05, + "loss": 0.3937, + "step": 33630 + }, + { + "epoch": 8.389027431421447, + "grad_norm": 8.169456481933594, + "learning_rate": 1.1614713216957608e-05, + "loss": 0.3911, + "step": 33640 + }, + { + "epoch": 8.391521197007481, + "grad_norm": 4.798164367675781, + "learning_rate": 1.1612219451371571e-05, + "loss": 0.333, + "step": 33650 + }, + { + "epoch": 8.394014962593516, + "grad_norm": 8.796186447143555, + "learning_rate": 1.1609725685785538e-05, + "loss": 0.3268, + "step": 33660 + }, + { + "epoch": 8.396508728179551, + "grad_norm": 6.837301731109619, + "learning_rate": 1.1607231920199502e-05, + "loss": 0.3797, + "step": 33670 + }, + { + "epoch": 8.399002493765586, + "grad_norm": 4.7810282707214355, + "learning_rate": 1.1604738154613465e-05, + "loss": 0.3541, + "step": 33680 + }, + { + "epoch": 8.401496259351621, + "grad_norm": 7.039614200592041, + "learning_rate": 1.1602244389027432e-05, + "loss": 0.4047, + "step": 33690 + }, + { + "epoch": 8.403990024937656, + "grad_norm": 7.48544454574585, + "learning_rate": 1.1599750623441398e-05, + "loss": 0.3616, + "step": 33700 + }, + { + "epoch": 8.40648379052369, + "grad_norm": 12.1786470413208, + "learning_rate": 1.1597256857855363e-05, + "loss": 0.4326, + "step": 33710 + }, + { + "epoch": 8.408977556109726, + "grad_norm": 6.31995964050293, + "learning_rate": 1.1594763092269328e-05, + "loss": 0.3818, + "step": 33720 + }, + { + "epoch": 8.41147132169576, + "grad_norm": 9.140199661254883, + "learning_rate": 1.1592269326683292e-05, + "loss": 0.3978, + "step": 33730 + }, + { + "epoch": 8.413965087281795, + "grad_norm": 5.953067779541016, + "learning_rate": 1.1589775561097259e-05, + "loss": 0.3253, + "step": 33740 + }, + { + "epoch": 8.41645885286783, + "grad_norm": 6.9121246337890625, + "learning_rate": 1.1587281795511223e-05, + "loss": 0.3492, + "step": 33750 + }, + { + "epoch": 8.418952618453865, + "grad_norm": 6.274780750274658, + "learning_rate": 1.1584788029925186e-05, + "loss": 0.4546, + "step": 33760 + }, + { + "epoch": 8.4214463840399, + "grad_norm": 7.781808376312256, + "learning_rate": 1.1582294264339153e-05, + "loss": 0.3125, + "step": 33770 + }, + { + "epoch": 8.423940149625935, + "grad_norm": 9.689964294433594, + "learning_rate": 1.1579800498753117e-05, + "loss": 0.3515, + "step": 33780 + }, + { + "epoch": 8.42643391521197, + "grad_norm": 5.320454120635986, + "learning_rate": 1.1577306733167084e-05, + "loss": 0.2771, + "step": 33790 + }, + { + "epoch": 8.428927680798004, + "grad_norm": 7.566685199737549, + "learning_rate": 1.1574812967581047e-05, + "loss": 0.3667, + "step": 33800 + }, + { + "epoch": 8.43142144638404, + "grad_norm": 8.28944206237793, + "learning_rate": 1.1572319201995013e-05, + "loss": 0.3956, + "step": 33810 + }, + { + "epoch": 8.433915211970074, + "grad_norm": 7.253947734832764, + "learning_rate": 1.156982543640898e-05, + "loss": 0.3837, + "step": 33820 + }, + { + "epoch": 8.436408977556109, + "grad_norm": 9.780487060546875, + "learning_rate": 1.1567331670822943e-05, + "loss": 0.4626, + "step": 33830 + }, + { + "epoch": 8.438902743142144, + "grad_norm": 6.923801898956299, + "learning_rate": 1.156483790523691e-05, + "loss": 0.3675, + "step": 33840 + }, + { + "epoch": 8.441396508728179, + "grad_norm": 7.305504322052002, + "learning_rate": 1.1562344139650874e-05, + "loss": 0.3247, + "step": 33850 + }, + { + "epoch": 8.443890274314214, + "grad_norm": 5.790426731109619, + "learning_rate": 1.1559850374064838e-05, + "loss": 0.3559, + "step": 33860 + }, + { + "epoch": 8.446384039900249, + "grad_norm": 5.2143731117248535, + "learning_rate": 1.1557356608478805e-05, + "loss": 0.3547, + "step": 33870 + }, + { + "epoch": 8.448877805486283, + "grad_norm": 7.238703727722168, + "learning_rate": 1.1554862842892768e-05, + "loss": 0.3329, + "step": 33880 + }, + { + "epoch": 8.451371571072318, + "grad_norm": 5.136308193206787, + "learning_rate": 1.1552369077306734e-05, + "loss": 0.3662, + "step": 33890 + }, + { + "epoch": 8.453865336658355, + "grad_norm": 7.3576836585998535, + "learning_rate": 1.1549875311720699e-05, + "loss": 0.3306, + "step": 33900 + }, + { + "epoch": 8.45635910224439, + "grad_norm": 8.722187042236328, + "learning_rate": 1.1547381546134664e-05, + "loss": 0.3519, + "step": 33910 + }, + { + "epoch": 8.458852867830425, + "grad_norm": 8.820263862609863, + "learning_rate": 1.154488778054863e-05, + "loss": 0.4284, + "step": 33920 + }, + { + "epoch": 8.46134663341646, + "grad_norm": 8.29401683807373, + "learning_rate": 1.1542394014962595e-05, + "loss": 0.3871, + "step": 33930 + }, + { + "epoch": 8.463840399002494, + "grad_norm": 6.295302867889404, + "learning_rate": 1.1539900249376558e-05, + "loss": 0.3196, + "step": 33940 + }, + { + "epoch": 8.46633416458853, + "grad_norm": 7.778040885925293, + "learning_rate": 1.1537406483790525e-05, + "loss": 0.348, + "step": 33950 + }, + { + "epoch": 8.468827930174564, + "grad_norm": 9.188948631286621, + "learning_rate": 1.1534912718204489e-05, + "loss": 0.3504, + "step": 33960 + }, + { + "epoch": 8.471321695760599, + "grad_norm": 4.633731842041016, + "learning_rate": 1.1532418952618454e-05, + "loss": 0.3604, + "step": 33970 + }, + { + "epoch": 8.473815461346634, + "grad_norm": 7.2224884033203125, + "learning_rate": 1.152992518703242e-05, + "loss": 0.3979, + "step": 33980 + }, + { + "epoch": 8.476309226932669, + "grad_norm": 7.150679588317871, + "learning_rate": 1.1527431421446385e-05, + "loss": 0.3612, + "step": 33990 + }, + { + "epoch": 8.478802992518704, + "grad_norm": 6.690948486328125, + "learning_rate": 1.152493765586035e-05, + "loss": 0.3037, + "step": 34000 + }, + { + "epoch": 8.481296758104738, + "grad_norm": 5.986937046051025, + "learning_rate": 1.1522443890274316e-05, + "loss": 0.4867, + "step": 34010 + }, + { + "epoch": 8.483790523690773, + "grad_norm": 6.088142395019531, + "learning_rate": 1.151995012468828e-05, + "loss": 0.3606, + "step": 34020 + }, + { + "epoch": 8.486284289276808, + "grad_norm": 6.499013423919678, + "learning_rate": 1.1517456359102246e-05, + "loss": 0.3891, + "step": 34030 + }, + { + "epoch": 8.488778054862843, + "grad_norm": 9.472525596618652, + "learning_rate": 1.151496259351621e-05, + "loss": 0.3548, + "step": 34040 + }, + { + "epoch": 8.491271820448878, + "grad_norm": 7.18366813659668, + "learning_rate": 1.1512468827930175e-05, + "loss": 0.3939, + "step": 34050 + }, + { + "epoch": 8.493765586034913, + "grad_norm": 6.115909099578857, + "learning_rate": 1.150997506234414e-05, + "loss": 0.4296, + "step": 34060 + }, + { + "epoch": 8.496259351620948, + "grad_norm": 7.923805236816406, + "learning_rate": 1.1507481296758106e-05, + "loss": 0.389, + "step": 34070 + }, + { + "epoch": 8.498753117206983, + "grad_norm": 6.619093418121338, + "learning_rate": 1.1504987531172071e-05, + "loss": 0.3364, + "step": 34080 + }, + { + "epoch": 8.501246882793017, + "grad_norm": 4.969188213348389, + "learning_rate": 1.1502493765586036e-05, + "loss": 0.3985, + "step": 34090 + }, + { + "epoch": 8.503740648379052, + "grad_norm": 5.744391441345215, + "learning_rate": 1.15e-05, + "loss": 0.4074, + "step": 34100 + }, + { + "epoch": 8.506234413965087, + "grad_norm": 6.1294755935668945, + "learning_rate": 1.1497506234413967e-05, + "loss": 0.3872, + "step": 34110 + }, + { + "epoch": 8.508728179551122, + "grad_norm": 7.260706901550293, + "learning_rate": 1.149501246882793e-05, + "loss": 0.3354, + "step": 34120 + }, + { + "epoch": 8.511221945137157, + "grad_norm": 5.858883380889893, + "learning_rate": 1.1492518703241898e-05, + "loss": 0.3727, + "step": 34130 + }, + { + "epoch": 8.513715710723192, + "grad_norm": 5.901925086975098, + "learning_rate": 1.1490024937655861e-05, + "loss": 0.3703, + "step": 34140 + }, + { + "epoch": 8.516209476309227, + "grad_norm": 6.566878318786621, + "learning_rate": 1.1487531172069825e-05, + "loss": 0.3385, + "step": 34150 + }, + { + "epoch": 8.518703241895262, + "grad_norm": 6.557463645935059, + "learning_rate": 1.1485037406483792e-05, + "loss": 0.2862, + "step": 34160 + }, + { + "epoch": 8.521197007481296, + "grad_norm": 8.497570037841797, + "learning_rate": 1.1482543640897757e-05, + "loss": 0.3043, + "step": 34170 + }, + { + "epoch": 8.523690773067331, + "grad_norm": 6.737511157989502, + "learning_rate": 1.148004987531172e-05, + "loss": 0.3957, + "step": 34180 + }, + { + "epoch": 8.526184538653366, + "grad_norm": 7.414823055267334, + "learning_rate": 1.1477556109725688e-05, + "loss": 0.3243, + "step": 34190 + }, + { + "epoch": 8.528678304239401, + "grad_norm": 6.70835542678833, + "learning_rate": 1.1475311720698254e-05, + "loss": 0.2703, + "step": 34200 + }, + { + "epoch": 8.531172069825436, + "grad_norm": 6.923582077026367, + "learning_rate": 1.1472817955112221e-05, + "loss": 0.3621, + "step": 34210 + }, + { + "epoch": 8.53366583541147, + "grad_norm": 3.6266000270843506, + "learning_rate": 1.1470324189526185e-05, + "loss": 0.2875, + "step": 34220 + }, + { + "epoch": 8.536159600997506, + "grad_norm": 7.4206156730651855, + "learning_rate": 1.1467830423940152e-05, + "loss": 0.3646, + "step": 34230 + }, + { + "epoch": 8.53865336658354, + "grad_norm": 5.679503440856934, + "learning_rate": 1.1465336658354116e-05, + "loss": 0.355, + "step": 34240 + }, + { + "epoch": 8.541147132169575, + "grad_norm": 8.173449516296387, + "learning_rate": 1.1462842892768081e-05, + "loss": 0.3537, + "step": 34250 + }, + { + "epoch": 8.54364089775561, + "grad_norm": 7.642554759979248, + "learning_rate": 1.1460349127182046e-05, + "loss": 0.3652, + "step": 34260 + }, + { + "epoch": 8.546134663341645, + "grad_norm": 10.153780937194824, + "learning_rate": 1.1457855361596012e-05, + "loss": 0.4083, + "step": 34270 + }, + { + "epoch": 8.548628428927682, + "grad_norm": 7.6003289222717285, + "learning_rate": 1.1455361596009975e-05, + "loss": 0.4111, + "step": 34280 + }, + { + "epoch": 8.551122194513717, + "grad_norm": 8.038273811340332, + "learning_rate": 1.1452867830423942e-05, + "loss": 0.3452, + "step": 34290 + }, + { + "epoch": 8.553615960099751, + "grad_norm": 7.191328048706055, + "learning_rate": 1.1450374064837906e-05, + "loss": 0.3575, + "step": 34300 + }, + { + "epoch": 8.556109725685786, + "grad_norm": 8.192909240722656, + "learning_rate": 1.1447880299251873e-05, + "loss": 0.3853, + "step": 34310 + }, + { + "epoch": 8.558603491271821, + "grad_norm": 7.0611677169799805, + "learning_rate": 1.1445386533665836e-05, + "loss": 0.3626, + "step": 34320 + }, + { + "epoch": 8.561097256857856, + "grad_norm": 6.423704624176025, + "learning_rate": 1.14428927680798e-05, + "loss": 0.3413, + "step": 34330 + }, + { + "epoch": 8.563591022443891, + "grad_norm": 7.521974086761475, + "learning_rate": 1.1440399002493767e-05, + "loss": 0.3469, + "step": 34340 + }, + { + "epoch": 8.566084788029926, + "grad_norm": 4.919178485870361, + "learning_rate": 1.143790523690773e-05, + "loss": 0.3544, + "step": 34350 + }, + { + "epoch": 8.56857855361596, + "grad_norm": 6.528305530548096, + "learning_rate": 1.1435411471321696e-05, + "loss": 0.3007, + "step": 34360 + }, + { + "epoch": 8.571072319201996, + "grad_norm": 6.000070571899414, + "learning_rate": 1.1432917705735663e-05, + "loss": 0.3419, + "step": 34370 + }, + { + "epoch": 8.57356608478803, + "grad_norm": 6.5107245445251465, + "learning_rate": 1.1430423940149627e-05, + "loss": 0.3416, + "step": 34380 + }, + { + "epoch": 8.576059850374065, + "grad_norm": 8.171274185180664, + "learning_rate": 1.1427930174563594e-05, + "loss": 0.4224, + "step": 34390 + }, + { + "epoch": 8.5785536159601, + "grad_norm": 7.173635959625244, + "learning_rate": 1.1425436408977557e-05, + "loss": 0.3584, + "step": 34400 + }, + { + "epoch": 8.581047381546135, + "grad_norm": 6.059718608856201, + "learning_rate": 1.142294264339152e-05, + "loss": 0.3048, + "step": 34410 + }, + { + "epoch": 8.58354114713217, + "grad_norm": 5.417482376098633, + "learning_rate": 1.1420448877805488e-05, + "loss": 0.3443, + "step": 34420 + }, + { + "epoch": 8.586034912718205, + "grad_norm": 8.786724090576172, + "learning_rate": 1.1417955112219451e-05, + "loss": 0.4161, + "step": 34430 + }, + { + "epoch": 8.58852867830424, + "grad_norm": 8.032851219177246, + "learning_rate": 1.1415461346633417e-05, + "loss": 0.4395, + "step": 34440 + }, + { + "epoch": 8.591022443890274, + "grad_norm": 8.335599899291992, + "learning_rate": 1.1412967581047382e-05, + "loss": 0.3912, + "step": 34450 + }, + { + "epoch": 8.59351620947631, + "grad_norm": 8.087762832641602, + "learning_rate": 1.1410473815461347e-05, + "loss": 0.3549, + "step": 34460 + }, + { + "epoch": 8.596009975062344, + "grad_norm": 5.728715896606445, + "learning_rate": 1.1407980049875313e-05, + "loss": 0.3888, + "step": 34470 + }, + { + "epoch": 8.598503740648379, + "grad_norm": 8.289750099182129, + "learning_rate": 1.1405486284289278e-05, + "loss": 0.3327, + "step": 34480 + }, + { + "epoch": 8.600997506234414, + "grad_norm": 11.838362693786621, + "learning_rate": 1.1402992518703242e-05, + "loss": 0.3676, + "step": 34490 + }, + { + "epoch": 8.603491271820449, + "grad_norm": 7.733099937438965, + "learning_rate": 1.1400498753117209e-05, + "loss": 0.4015, + "step": 34500 + }, + { + "epoch": 8.605985037406484, + "grad_norm": 6.227921009063721, + "learning_rate": 1.1398004987531172e-05, + "loss": 0.3205, + "step": 34510 + }, + { + "epoch": 8.608478802992519, + "grad_norm": 10.058944702148438, + "learning_rate": 1.139551122194514e-05, + "loss": 0.3644, + "step": 34520 + }, + { + "epoch": 8.610972568578553, + "grad_norm": 5.577748775482178, + "learning_rate": 1.1393017456359103e-05, + "loss": 0.3545, + "step": 34530 + }, + { + "epoch": 8.613466334164588, + "grad_norm": 6.9346923828125, + "learning_rate": 1.1390523690773068e-05, + "loss": 0.3203, + "step": 34540 + }, + { + "epoch": 8.615960099750623, + "grad_norm": 7.139277935028076, + "learning_rate": 1.1388029925187033e-05, + "loss": 0.296, + "step": 34550 + }, + { + "epoch": 8.618453865336658, + "grad_norm": 6.940938949584961, + "learning_rate": 1.1385536159600999e-05, + "loss": 0.339, + "step": 34560 + }, + { + "epoch": 8.620947630922693, + "grad_norm": 7.233773708343506, + "learning_rate": 1.1383042394014962e-05, + "loss": 0.3677, + "step": 34570 + }, + { + "epoch": 8.623441396508728, + "grad_norm": 11.383943557739258, + "learning_rate": 1.138054862842893e-05, + "loss": 0.3711, + "step": 34580 + }, + { + "epoch": 8.625935162094763, + "grad_norm": 5.546501636505127, + "learning_rate": 1.1378054862842893e-05, + "loss": 0.4015, + "step": 34590 + }, + { + "epoch": 8.628428927680797, + "grad_norm": 8.203598022460938, + "learning_rate": 1.137556109725686e-05, + "loss": 0.32, + "step": 34600 + }, + { + "epoch": 8.630922693266832, + "grad_norm": 8.763114929199219, + "learning_rate": 1.1373067331670824e-05, + "loss": 0.2881, + "step": 34610 + }, + { + "epoch": 8.633416458852867, + "grad_norm": 9.64941120147705, + "learning_rate": 1.1370573566084789e-05, + "loss": 0.4229, + "step": 34620 + }, + { + "epoch": 8.635910224438902, + "grad_norm": 8.64614486694336, + "learning_rate": 1.1368079800498754e-05, + "loss": 0.3512, + "step": 34630 + }, + { + "epoch": 8.638403990024937, + "grad_norm": 9.003388404846191, + "learning_rate": 1.136558603491272e-05, + "loss": 0.3894, + "step": 34640 + }, + { + "epoch": 8.640897755610972, + "grad_norm": 6.2537336349487305, + "learning_rate": 1.1363092269326683e-05, + "loss": 0.3445, + "step": 34650 + }, + { + "epoch": 8.643391521197007, + "grad_norm": 7.811208248138428, + "learning_rate": 1.136059850374065e-05, + "loss": 0.4709, + "step": 34660 + }, + { + "epoch": 8.645885286783042, + "grad_norm": 7.517146587371826, + "learning_rate": 1.1358104738154614e-05, + "loss": 0.307, + "step": 34670 + }, + { + "epoch": 8.648379052369076, + "grad_norm": 7.594816207885742, + "learning_rate": 1.135561097256858e-05, + "loss": 0.3967, + "step": 34680 + }, + { + "epoch": 8.650872817955111, + "grad_norm": 6.5755510330200195, + "learning_rate": 1.1353117206982544e-05, + "loss": 0.3044, + "step": 34690 + }, + { + "epoch": 8.653366583541148, + "grad_norm": 10.061249732971191, + "learning_rate": 1.1350623441396508e-05, + "loss": 0.4416, + "step": 34700 + }, + { + "epoch": 8.655860349127183, + "grad_norm": 7.96301794052124, + "learning_rate": 1.1348129675810475e-05, + "loss": 0.4184, + "step": 34710 + }, + { + "epoch": 8.658354114713218, + "grad_norm": 7.919782638549805, + "learning_rate": 1.134563591022444e-05, + "loss": 0.3322, + "step": 34720 + }, + { + "epoch": 8.660847880299253, + "grad_norm": 8.549415588378906, + "learning_rate": 1.1343142144638406e-05, + "loss": 0.3588, + "step": 34730 + }, + { + "epoch": 8.663341645885287, + "grad_norm": 5.7566070556640625, + "learning_rate": 1.1340648379052371e-05, + "loss": 0.3391, + "step": 34740 + }, + { + "epoch": 8.665835411471322, + "grad_norm": 8.828804016113281, + "learning_rate": 1.1338154613466335e-05, + "loss": 0.3558, + "step": 34750 + }, + { + "epoch": 8.668329177057357, + "grad_norm": 7.40009069442749, + "learning_rate": 1.1335660847880302e-05, + "loss": 0.3556, + "step": 34760 + }, + { + "epoch": 8.670822942643392, + "grad_norm": 7.1716814041137695, + "learning_rate": 1.1333167082294265e-05, + "loss": 0.3363, + "step": 34770 + }, + { + "epoch": 8.673316708229427, + "grad_norm": 8.354803085327148, + "learning_rate": 1.1330673316708229e-05, + "loss": 0.3615, + "step": 34780 + }, + { + "epoch": 8.675810473815462, + "grad_norm": 5.095800399780273, + "learning_rate": 1.1328179551122196e-05, + "loss": 0.3083, + "step": 34790 + }, + { + "epoch": 8.678304239401497, + "grad_norm": 8.354507446289062, + "learning_rate": 1.132568578553616e-05, + "loss": 0.3613, + "step": 34800 + }, + { + "epoch": 8.680798004987532, + "grad_norm": 6.8850555419921875, + "learning_rate": 1.1323192019950126e-05, + "loss": 0.3627, + "step": 34810 + }, + { + "epoch": 8.683291770573566, + "grad_norm": 7.482059001922607, + "learning_rate": 1.132069825436409e-05, + "loss": 0.3885, + "step": 34820 + }, + { + "epoch": 8.685785536159601, + "grad_norm": 8.189881324768066, + "learning_rate": 1.1318204488778055e-05, + "loss": 0.3067, + "step": 34830 + }, + { + "epoch": 8.688279301745636, + "grad_norm": 6.003298759460449, + "learning_rate": 1.1315710723192022e-05, + "loss": 0.3853, + "step": 34840 + }, + { + "epoch": 8.690773067331671, + "grad_norm": 9.855851173400879, + "learning_rate": 1.1313216957605986e-05, + "loss": 0.426, + "step": 34850 + }, + { + "epoch": 8.693266832917706, + "grad_norm": 7.669548988342285, + "learning_rate": 1.131072319201995e-05, + "loss": 0.3384, + "step": 34860 + }, + { + "epoch": 8.69576059850374, + "grad_norm": 7.204586029052734, + "learning_rate": 1.1308229426433917e-05, + "loss": 0.3972, + "step": 34870 + }, + { + "epoch": 8.698254364089776, + "grad_norm": 5.998164176940918, + "learning_rate": 1.130573566084788e-05, + "loss": 0.3572, + "step": 34880 + }, + { + "epoch": 8.70074812967581, + "grad_norm": 7.780731201171875, + "learning_rate": 1.1303241895261847e-05, + "loss": 0.3469, + "step": 34890 + }, + { + "epoch": 8.703241895261845, + "grad_norm": 8.073392868041992, + "learning_rate": 1.130074812967581e-05, + "loss": 0.3723, + "step": 34900 + }, + { + "epoch": 8.70573566084788, + "grad_norm": 5.444243907928467, + "learning_rate": 1.1298254364089776e-05, + "loss": 0.3219, + "step": 34910 + }, + { + "epoch": 8.708229426433915, + "grad_norm": 10.334959030151367, + "learning_rate": 1.1295760598503741e-05, + "loss": 0.3386, + "step": 34920 + }, + { + "epoch": 8.71072319201995, + "grad_norm": 8.497328758239746, + "learning_rate": 1.1293266832917707e-05, + "loss": 0.3967, + "step": 34930 + }, + { + "epoch": 8.713216957605985, + "grad_norm": 6.893999099731445, + "learning_rate": 1.129077306733167e-05, + "loss": 0.2833, + "step": 34940 + }, + { + "epoch": 8.71571072319202, + "grad_norm": 6.12678337097168, + "learning_rate": 1.1288279301745637e-05, + "loss": 0.3801, + "step": 34950 + }, + { + "epoch": 8.718204488778055, + "grad_norm": 4.06450080871582, + "learning_rate": 1.1285785536159601e-05, + "loss": 0.3047, + "step": 34960 + }, + { + "epoch": 8.72069825436409, + "grad_norm": 6.966952323913574, + "learning_rate": 1.1283291770573568e-05, + "loss": 0.371, + "step": 34970 + }, + { + "epoch": 8.723192019950124, + "grad_norm": 6.376713752746582, + "learning_rate": 1.1280798004987532e-05, + "loss": 0.3878, + "step": 34980 + }, + { + "epoch": 8.72568578553616, + "grad_norm": 7.877644062042236, + "learning_rate": 1.1278304239401497e-05, + "loss": 0.3533, + "step": 34990 + }, + { + "epoch": 8.728179551122194, + "grad_norm": 4.689276218414307, + "learning_rate": 1.1275810473815462e-05, + "loss": 0.3081, + "step": 35000 + }, + { + "epoch": 8.730673316708229, + "grad_norm": 8.68629264831543, + "learning_rate": 1.1273316708229428e-05, + "loss": 0.3049, + "step": 35010 + }, + { + "epoch": 8.733167082294264, + "grad_norm": 6.382472991943359, + "learning_rate": 1.1270822942643393e-05, + "loss": 0.3957, + "step": 35020 + }, + { + "epoch": 8.735660847880299, + "grad_norm": 5.842182636260986, + "learning_rate": 1.1268329177057358e-05, + "loss": 0.3513, + "step": 35030 + }, + { + "epoch": 8.738154613466333, + "grad_norm": 5.1576948165893555, + "learning_rate": 1.1265835411471322e-05, + "loss": 0.3957, + "step": 35040 + }, + { + "epoch": 8.740648379052368, + "grad_norm": 12.842966079711914, + "learning_rate": 1.1263341645885289e-05, + "loss": 0.3318, + "step": 35050 + }, + { + "epoch": 8.743142144638403, + "grad_norm": 5.631074905395508, + "learning_rate": 1.1260847880299252e-05, + "loss": 0.2967, + "step": 35060 + }, + { + "epoch": 8.745635910224438, + "grad_norm": 6.541484355926514, + "learning_rate": 1.1258354114713218e-05, + "loss": 0.3186, + "step": 35070 + }, + { + "epoch": 8.748129675810475, + "grad_norm": 5.197242259979248, + "learning_rate": 1.1255860349127183e-05, + "loss": 0.3606, + "step": 35080 + }, + { + "epoch": 8.75062344139651, + "grad_norm": 7.91013240814209, + "learning_rate": 1.1253366583541148e-05, + "loss": 0.3609, + "step": 35090 + }, + { + "epoch": 8.753117206982544, + "grad_norm": 6.427094459533691, + "learning_rate": 1.1250872817955114e-05, + "loss": 0.4667, + "step": 35100 + }, + { + "epoch": 8.75561097256858, + "grad_norm": 5.502357482910156, + "learning_rate": 1.1248379052369079e-05, + "loss": 0.3279, + "step": 35110 + }, + { + "epoch": 8.758104738154614, + "grad_norm": 6.215767860412598, + "learning_rate": 1.1245885286783043e-05, + "loss": 0.3371, + "step": 35120 + }, + { + "epoch": 8.760598503740649, + "grad_norm": 6.710012435913086, + "learning_rate": 1.124339152119701e-05, + "loss": 0.3477, + "step": 35130 + }, + { + "epoch": 8.763092269326684, + "grad_norm": 7.127229690551758, + "learning_rate": 1.1240897755610973e-05, + "loss": 0.3687, + "step": 35140 + }, + { + "epoch": 8.765586034912719, + "grad_norm": 7.16987419128418, + "learning_rate": 1.1238403990024937e-05, + "loss": 0.3494, + "step": 35150 + }, + { + "epoch": 8.768079800498754, + "grad_norm": 8.195234298706055, + "learning_rate": 1.1235910224438904e-05, + "loss": 0.331, + "step": 35160 + }, + { + "epoch": 8.770573566084789, + "grad_norm": 5.69235372543335, + "learning_rate": 1.1233416458852869e-05, + "loss": 0.3815, + "step": 35170 + }, + { + "epoch": 8.773067331670823, + "grad_norm": 7.860400199890137, + "learning_rate": 1.1230922693266834e-05, + "loss": 0.3599, + "step": 35180 + }, + { + "epoch": 8.775561097256858, + "grad_norm": 16.83560562133789, + "learning_rate": 1.12284289276808e-05, + "loss": 0.3941, + "step": 35190 + }, + { + "epoch": 8.778054862842893, + "grad_norm": 6.330166816711426, + "learning_rate": 1.1225935162094763e-05, + "loss": 0.3688, + "step": 35200 + }, + { + "epoch": 8.780548628428928, + "grad_norm": 6.7685418128967285, + "learning_rate": 1.122344139650873e-05, + "loss": 0.3861, + "step": 35210 + }, + { + "epoch": 8.783042394014963, + "grad_norm": 8.012506484985352, + "learning_rate": 1.1220947630922694e-05, + "loss": 0.3795, + "step": 35220 + }, + { + "epoch": 8.785536159600998, + "grad_norm": 7.705891132354736, + "learning_rate": 1.1218453865336661e-05, + "loss": 0.439, + "step": 35230 + }, + { + "epoch": 8.788029925187033, + "grad_norm": 6.158017158508301, + "learning_rate": 1.1215960099750625e-05, + "loss": 0.3485, + "step": 35240 + }, + { + "epoch": 8.790523690773068, + "grad_norm": 10.359025001525879, + "learning_rate": 1.1213466334164588e-05, + "loss": 0.397, + "step": 35250 + }, + { + "epoch": 8.793017456359102, + "grad_norm": 6.772762298583984, + "learning_rate": 1.1210972568578555e-05, + "loss": 0.351, + "step": 35260 + }, + { + "epoch": 8.795511221945137, + "grad_norm": 8.312684059143066, + "learning_rate": 1.1208478802992519e-05, + "loss": 0.3252, + "step": 35270 + }, + { + "epoch": 8.798004987531172, + "grad_norm": 7.869873523712158, + "learning_rate": 1.1205985037406484e-05, + "loss": 0.3912, + "step": 35280 + }, + { + "epoch": 8.800498753117207, + "grad_norm": 7.570711612701416, + "learning_rate": 1.1203491271820451e-05, + "loss": 0.3725, + "step": 35290 + }, + { + "epoch": 8.802992518703242, + "grad_norm": 9.398818016052246, + "learning_rate": 1.1200997506234415e-05, + "loss": 0.3509, + "step": 35300 + }, + { + "epoch": 8.805486284289277, + "grad_norm": 6.553288459777832, + "learning_rate": 1.1198503740648382e-05, + "loss": 0.3134, + "step": 35310 + }, + { + "epoch": 8.807980049875312, + "grad_norm": 6.000061511993408, + "learning_rate": 1.1196009975062345e-05, + "loss": 0.3332, + "step": 35320 + }, + { + "epoch": 8.810473815461346, + "grad_norm": 10.576332092285156, + "learning_rate": 1.1193516209476309e-05, + "loss": 0.4149, + "step": 35330 + }, + { + "epoch": 8.812967581047381, + "grad_norm": 6.1524505615234375, + "learning_rate": 1.1191022443890276e-05, + "loss": 0.3426, + "step": 35340 + }, + { + "epoch": 8.815461346633416, + "grad_norm": 10.099903106689453, + "learning_rate": 1.118852867830424e-05, + "loss": 0.3464, + "step": 35350 + }, + { + "epoch": 8.817955112219451, + "grad_norm": 5.1967854499816895, + "learning_rate": 1.1186034912718205e-05, + "loss": 0.3209, + "step": 35360 + }, + { + "epoch": 8.820448877805486, + "grad_norm": 10.381093978881836, + "learning_rate": 1.118354114713217e-05, + "loss": 0.3733, + "step": 35370 + }, + { + "epoch": 8.82294264339152, + "grad_norm": 17.32126808166504, + "learning_rate": 1.1181047381546136e-05, + "loss": 0.3812, + "step": 35380 + }, + { + "epoch": 8.825436408977556, + "grad_norm": 11.370584487915039, + "learning_rate": 1.11785536159601e-05, + "loss": 0.3574, + "step": 35390 + }, + { + "epoch": 8.82793017456359, + "grad_norm": 6.021444797515869, + "learning_rate": 1.1176059850374066e-05, + "loss": 0.3345, + "step": 35400 + }, + { + "epoch": 8.830423940149625, + "grad_norm": 9.910523414611816, + "learning_rate": 1.117356608478803e-05, + "loss": 0.3779, + "step": 35410 + }, + { + "epoch": 8.83291770573566, + "grad_norm": 10.403486251831055, + "learning_rate": 1.1171072319201997e-05, + "loss": 0.2948, + "step": 35420 + }, + { + "epoch": 8.835411471321695, + "grad_norm": 5.244110107421875, + "learning_rate": 1.116857855361596e-05, + "loss": 0.3526, + "step": 35430 + }, + { + "epoch": 8.83790523690773, + "grad_norm": 8.06598949432373, + "learning_rate": 1.1166084788029926e-05, + "loss": 0.3763, + "step": 35440 + }, + { + "epoch": 8.840399002493765, + "grad_norm": 6.2660322189331055, + "learning_rate": 1.1163591022443891e-05, + "loss": 0.36, + "step": 35450 + }, + { + "epoch": 8.8428927680798, + "grad_norm": 7.570070266723633, + "learning_rate": 1.1161097256857856e-05, + "loss": 0.423, + "step": 35460 + }, + { + "epoch": 8.845386533665835, + "grad_norm": 5.027857780456543, + "learning_rate": 1.1158603491271822e-05, + "loss": 0.3392, + "step": 35470 + }, + { + "epoch": 8.84788029925187, + "grad_norm": 7.999138832092285, + "learning_rate": 1.1156109725685787e-05, + "loss": 0.3899, + "step": 35480 + }, + { + "epoch": 8.850374064837904, + "grad_norm": 9.944479942321777, + "learning_rate": 1.115361596009975e-05, + "loss": 0.3727, + "step": 35490 + }, + { + "epoch": 8.85286783042394, + "grad_norm": 7.199899673461914, + "learning_rate": 1.1151122194513718e-05, + "loss": 0.3394, + "step": 35500 + }, + { + "epoch": 8.855361596009976, + "grad_norm": 5.359086036682129, + "learning_rate": 1.1148628428927681e-05, + "loss": 0.4158, + "step": 35510 + }, + { + "epoch": 8.85785536159601, + "grad_norm": 5.6271820068359375, + "learning_rate": 1.1146134663341648e-05, + "loss": 0.391, + "step": 35520 + }, + { + "epoch": 8.860349127182046, + "grad_norm": 7.151126861572266, + "learning_rate": 1.1143640897755612e-05, + "loss": 0.3635, + "step": 35530 + }, + { + "epoch": 8.86284289276808, + "grad_norm": 6.151798725128174, + "learning_rate": 1.1141147132169577e-05, + "loss": 0.3526, + "step": 35540 + }, + { + "epoch": 8.865336658354115, + "grad_norm": 6.384893417358398, + "learning_rate": 1.1138653366583542e-05, + "loss": 0.3955, + "step": 35550 + }, + { + "epoch": 8.86783042394015, + "grad_norm": 11.764251708984375, + "learning_rate": 1.1136159600997508e-05, + "loss": 0.3387, + "step": 35560 + }, + { + "epoch": 8.870324189526185, + "grad_norm": 10.541848182678223, + "learning_rate": 1.1133665835411471e-05, + "loss": 0.3911, + "step": 35570 + }, + { + "epoch": 8.87281795511222, + "grad_norm": 6.850974082946777, + "learning_rate": 1.1131172069825438e-05, + "loss": 0.48, + "step": 35580 + }, + { + "epoch": 8.875311720698255, + "grad_norm": 7.8186211585998535, + "learning_rate": 1.1128678304239402e-05, + "loss": 0.4199, + "step": 35590 + }, + { + "epoch": 8.87780548628429, + "grad_norm": 7.9405012130737305, + "learning_rate": 1.1126184538653369e-05, + "loss": 0.3845, + "step": 35600 + }, + { + "epoch": 8.880299251870325, + "grad_norm": 7.980791091918945, + "learning_rate": 1.1123690773067333e-05, + "loss": 0.431, + "step": 35610 + }, + { + "epoch": 8.88279301745636, + "grad_norm": 8.639333724975586, + "learning_rate": 1.1121197007481296e-05, + "loss": 0.3792, + "step": 35620 + }, + { + "epoch": 8.885286783042394, + "grad_norm": 6.000261306762695, + "learning_rate": 1.1118703241895263e-05, + "loss": 0.3846, + "step": 35630 + }, + { + "epoch": 8.88778054862843, + "grad_norm": 8.1222505569458, + "learning_rate": 1.1116209476309228e-05, + "loss": 0.3591, + "step": 35640 + }, + { + "epoch": 8.890274314214464, + "grad_norm": 9.311159133911133, + "learning_rate": 1.1113715710723192e-05, + "loss": 0.3771, + "step": 35650 + }, + { + "epoch": 8.892768079800499, + "grad_norm": 5.505731105804443, + "learning_rate": 1.1111221945137159e-05, + "loss": 0.3531, + "step": 35660 + }, + { + "epoch": 8.895261845386534, + "grad_norm": 11.431441307067871, + "learning_rate": 1.1108728179551123e-05, + "loss": 0.3913, + "step": 35670 + }, + { + "epoch": 8.897755610972569, + "grad_norm": 6.601688385009766, + "learning_rate": 1.110623441396509e-05, + "loss": 0.374, + "step": 35680 + }, + { + "epoch": 8.900249376558603, + "grad_norm": 4.69558048248291, + "learning_rate": 1.1103740648379053e-05, + "loss": 0.3695, + "step": 35690 + }, + { + "epoch": 8.902743142144638, + "grad_norm": 6.6589131355285645, + "learning_rate": 1.1101246882793017e-05, + "loss": 0.3566, + "step": 35700 + }, + { + "epoch": 8.905236907730673, + "grad_norm": 7.764341831207275, + "learning_rate": 1.1098753117206984e-05, + "loss": 0.3723, + "step": 35710 + }, + { + "epoch": 8.907730673316708, + "grad_norm": 5.244799613952637, + "learning_rate": 1.1096259351620948e-05, + "loss": 0.3354, + "step": 35720 + }, + { + "epoch": 8.910224438902743, + "grad_norm": 7.864104270935059, + "learning_rate": 1.1093765586034915e-05, + "loss": 0.3841, + "step": 35730 + }, + { + "epoch": 8.912718204488778, + "grad_norm": 7.269486427307129, + "learning_rate": 1.1091271820448878e-05, + "loss": 0.4008, + "step": 35740 + }, + { + "epoch": 8.915211970074813, + "grad_norm": 7.298030376434326, + "learning_rate": 1.1088778054862843e-05, + "loss": 0.3713, + "step": 35750 + }, + { + "epoch": 8.917705735660848, + "grad_norm": 3.9201197624206543, + "learning_rate": 1.108628428927681e-05, + "loss": 0.3732, + "step": 35760 + }, + { + "epoch": 8.920199501246882, + "grad_norm": 5.9074249267578125, + "learning_rate": 1.1083790523690774e-05, + "loss": 0.3367, + "step": 35770 + }, + { + "epoch": 8.922693266832917, + "grad_norm": 16.71881866455078, + "learning_rate": 1.1081296758104738e-05, + "loss": 0.4005, + "step": 35780 + }, + { + "epoch": 8.925187032418952, + "grad_norm": 7.94209623336792, + "learning_rate": 1.1078802992518705e-05, + "loss": 0.3229, + "step": 35790 + }, + { + "epoch": 8.927680798004987, + "grad_norm": 6.1742143630981445, + "learning_rate": 1.1076309226932668e-05, + "loss": 0.3859, + "step": 35800 + }, + { + "epoch": 8.930174563591022, + "grad_norm": 14.812112808227539, + "learning_rate": 1.1073815461346635e-05, + "loss": 0.4756, + "step": 35810 + }, + { + "epoch": 8.932668329177057, + "grad_norm": 7.1429338455200195, + "learning_rate": 1.1071321695760599e-05, + "loss": 0.2895, + "step": 35820 + }, + { + "epoch": 8.935162094763092, + "grad_norm": 6.503314018249512, + "learning_rate": 1.1068827930174564e-05, + "loss": 0.3162, + "step": 35830 + }, + { + "epoch": 8.937655860349127, + "grad_norm": 8.817343711853027, + "learning_rate": 1.106633416458853e-05, + "loss": 0.374, + "step": 35840 + }, + { + "epoch": 8.940149625935161, + "grad_norm": 5.8551740646362305, + "learning_rate": 1.1063840399002495e-05, + "loss": 0.4058, + "step": 35850 + }, + { + "epoch": 8.942643391521196, + "grad_norm": 7.214171409606934, + "learning_rate": 1.1061346633416459e-05, + "loss": 0.388, + "step": 35860 + }, + { + "epoch": 8.945137157107231, + "grad_norm": 6.604674816131592, + "learning_rate": 1.1058852867830426e-05, + "loss": 0.3192, + "step": 35870 + }, + { + "epoch": 8.947630922693268, + "grad_norm": 9.637167930603027, + "learning_rate": 1.1056359102244389e-05, + "loss": 0.3364, + "step": 35880 + }, + { + "epoch": 8.950124688279303, + "grad_norm": 6.015578746795654, + "learning_rate": 1.1053865336658356e-05, + "loss": 0.3099, + "step": 35890 + }, + { + "epoch": 8.952618453865338, + "grad_norm": 6.540172100067139, + "learning_rate": 1.105137157107232e-05, + "loss": 0.4536, + "step": 35900 + }, + { + "epoch": 8.955112219451372, + "grad_norm": 7.110110759735107, + "learning_rate": 1.1048877805486285e-05, + "loss": 0.319, + "step": 35910 + }, + { + "epoch": 8.957605985037407, + "grad_norm": 5.485593318939209, + "learning_rate": 1.104638403990025e-05, + "loss": 0.3557, + "step": 35920 + }, + { + "epoch": 8.960099750623442, + "grad_norm": 6.532005786895752, + "learning_rate": 1.1043890274314216e-05, + "loss": 0.3922, + "step": 35930 + }, + { + "epoch": 8.962593516209477, + "grad_norm": 6.834729194641113, + "learning_rate": 1.104139650872818e-05, + "loss": 0.3233, + "step": 35940 + }, + { + "epoch": 8.965087281795512, + "grad_norm": 5.890754699707031, + "learning_rate": 1.1038902743142146e-05, + "loss": 0.3753, + "step": 35950 + }, + { + "epoch": 8.967581047381547, + "grad_norm": 6.0453619956970215, + "learning_rate": 1.103640897755611e-05, + "loss": 0.4137, + "step": 35960 + }, + { + "epoch": 8.970074812967582, + "grad_norm": 7.995392322540283, + "learning_rate": 1.1033915211970077e-05, + "loss": 0.3435, + "step": 35970 + }, + { + "epoch": 8.972568578553616, + "grad_norm": 7.464896202087402, + "learning_rate": 1.103142144638404e-05, + "loss": 0.3894, + "step": 35980 + }, + { + "epoch": 8.975062344139651, + "grad_norm": 9.544695854187012, + "learning_rate": 1.1028927680798006e-05, + "loss": 0.4271, + "step": 35990 + }, + { + "epoch": 8.977556109725686, + "grad_norm": 5.784707069396973, + "learning_rate": 1.1026433915211971e-05, + "loss": 0.3547, + "step": 36000 + }, + { + "epoch": 8.980049875311721, + "grad_norm": 4.10465669631958, + "learning_rate": 1.1023940149625936e-05, + "loss": 0.3881, + "step": 36010 + }, + { + "epoch": 8.982543640897756, + "grad_norm": 3.3419559001922607, + "learning_rate": 1.1021446384039902e-05, + "loss": 0.3459, + "step": 36020 + }, + { + "epoch": 8.98503740648379, + "grad_norm": 7.649290561676025, + "learning_rate": 1.1018952618453867e-05, + "loss": 0.2882, + "step": 36030 + }, + { + "epoch": 8.987531172069826, + "grad_norm": 4.2098798751831055, + "learning_rate": 1.101645885286783e-05, + "loss": 0.3061, + "step": 36040 + }, + { + "epoch": 8.99002493765586, + "grad_norm": 7.673419952392578, + "learning_rate": 1.1013965087281798e-05, + "loss": 0.3267, + "step": 36050 + }, + { + "epoch": 8.992518703241895, + "grad_norm": 8.080073356628418, + "learning_rate": 1.1011471321695761e-05, + "loss": 0.3203, + "step": 36060 + }, + { + "epoch": 8.99501246882793, + "grad_norm": 4.7040205001831055, + "learning_rate": 1.1008977556109725e-05, + "loss": 0.3322, + "step": 36070 + }, + { + "epoch": 8.997506234413965, + "grad_norm": 6.1027069091796875, + "learning_rate": 1.1006483790523692e-05, + "loss": 0.384, + "step": 36080 + }, + { + "epoch": 9.0, + "grad_norm": 9.147186279296875, + "learning_rate": 1.1003990024937656e-05, + "loss": 0.4646, + "step": 36090 + }, + { + "epoch": 9.0, + "eval_loss": 0.41386765241622925, + "eval_runtime": 59.9541, + "eval_samples_per_second": 16.729, + "eval_steps_per_second": 16.729, + "step": 36090 + }, + { + "epoch": 9.002493765586035, + "grad_norm": 7.474572658538818, + "learning_rate": 1.1001496259351623e-05, + "loss": 0.3304, + "step": 36100 + }, + { + "epoch": 9.00498753117207, + "grad_norm": 4.692042827606201, + "learning_rate": 1.0999002493765588e-05, + "loss": 0.3193, + "step": 36110 + }, + { + "epoch": 9.007481296758105, + "grad_norm": 5.829125881195068, + "learning_rate": 1.0996508728179551e-05, + "loss": 0.3628, + "step": 36120 + }, + { + "epoch": 9.00997506234414, + "grad_norm": 7.040647029876709, + "learning_rate": 1.0994014962593518e-05, + "loss": 0.3537, + "step": 36130 + }, + { + "epoch": 9.012468827930174, + "grad_norm": 7.125100612640381, + "learning_rate": 1.0991521197007482e-05, + "loss": 0.3556, + "step": 36140 + }, + { + "epoch": 9.01496259351621, + "grad_norm": 8.841943740844727, + "learning_rate": 1.0989027431421446e-05, + "loss": 0.3286, + "step": 36150 + }, + { + "epoch": 9.017456359102244, + "grad_norm": 6.718105316162109, + "learning_rate": 1.0986533665835413e-05, + "loss": 0.3856, + "step": 36160 + }, + { + "epoch": 9.019950124688279, + "grad_norm": 6.22119665145874, + "learning_rate": 1.0984039900249376e-05, + "loss": 0.3323, + "step": 36170 + }, + { + "epoch": 9.022443890274314, + "grad_norm": 6.809352874755859, + "learning_rate": 1.0981546134663343e-05, + "loss": 0.3431, + "step": 36180 + }, + { + "epoch": 9.024937655860349, + "grad_norm": 8.36000919342041, + "learning_rate": 1.0979052369077307e-05, + "loss": 0.3702, + "step": 36190 + }, + { + "epoch": 9.027431421446384, + "grad_norm": 9.731013298034668, + "learning_rate": 1.0976558603491272e-05, + "loss": 0.3476, + "step": 36200 + }, + { + "epoch": 9.029925187032418, + "grad_norm": 5.753609657287598, + "learning_rate": 1.0974064837905238e-05, + "loss": 0.3427, + "step": 36210 + }, + { + "epoch": 9.032418952618453, + "grad_norm": 3.485929012298584, + "learning_rate": 1.0971571072319203e-05, + "loss": 0.3366, + "step": 36220 + }, + { + "epoch": 9.034912718204488, + "grad_norm": 5.159622669219971, + "learning_rate": 1.096907730673317e-05, + "loss": 0.3141, + "step": 36230 + }, + { + "epoch": 9.037406483790523, + "grad_norm": 6.331995010375977, + "learning_rate": 1.0966583541147134e-05, + "loss": 0.3536, + "step": 36240 + }, + { + "epoch": 9.039900249376558, + "grad_norm": 5.746108055114746, + "learning_rate": 1.0964089775561097e-05, + "loss": 0.3514, + "step": 36250 + }, + { + "epoch": 9.042394014962593, + "grad_norm": 6.630804061889648, + "learning_rate": 1.0961596009975064e-05, + "loss": 0.3299, + "step": 36260 + }, + { + "epoch": 9.044887780548628, + "grad_norm": 7.066774845123291, + "learning_rate": 1.0959102244389028e-05, + "loss": 0.2654, + "step": 36270 + }, + { + "epoch": 9.047381546134662, + "grad_norm": 9.622215270996094, + "learning_rate": 1.0956608478802993e-05, + "loss": 0.3501, + "step": 36280 + }, + { + "epoch": 9.049875311720697, + "grad_norm": 8.551993370056152, + "learning_rate": 1.0954114713216958e-05, + "loss": 0.3529, + "step": 36290 + }, + { + "epoch": 9.052369077306734, + "grad_norm": 6.227419376373291, + "learning_rate": 1.0951620947630924e-05, + "loss": 0.3803, + "step": 36300 + }, + { + "epoch": 9.054862842892769, + "grad_norm": 8.591386795043945, + "learning_rate": 1.0949127182044889e-05, + "loss": 0.3134, + "step": 36310 + }, + { + "epoch": 9.057356608478804, + "grad_norm": 5.2157673835754395, + "learning_rate": 1.0946633416458854e-05, + "loss": 0.3214, + "step": 36320 + }, + { + "epoch": 9.059850374064839, + "grad_norm": 5.971094131469727, + "learning_rate": 1.0944139650872818e-05, + "loss": 0.3343, + "step": 36330 + }, + { + "epoch": 9.062344139650873, + "grad_norm": 7.728214263916016, + "learning_rate": 1.0941645885286785e-05, + "loss": 0.4131, + "step": 36340 + }, + { + "epoch": 9.064837905236908, + "grad_norm": 9.236366271972656, + "learning_rate": 1.0939401496259351e-05, + "loss": 0.3518, + "step": 36350 + }, + { + "epoch": 9.067331670822943, + "grad_norm": 4.353968620300293, + "learning_rate": 1.0936907730673318e-05, + "loss": 0.352, + "step": 36360 + }, + { + "epoch": 9.069825436408978, + "grad_norm": 9.550552368164062, + "learning_rate": 1.0934413965087282e-05, + "loss": 0.4209, + "step": 36370 + }, + { + "epoch": 9.072319201995013, + "grad_norm": 8.467497825622559, + "learning_rate": 1.0931920199501247e-05, + "loss": 0.3342, + "step": 36380 + }, + { + "epoch": 9.074812967581048, + "grad_norm": 11.831032752990723, + "learning_rate": 1.0929675810473817e-05, + "loss": 0.3481, + "step": 36390 + }, + { + "epoch": 9.077306733167083, + "grad_norm": 5.57447624206543, + "learning_rate": 1.0927182044887781e-05, + "loss": 0.2991, + "step": 36400 + }, + { + "epoch": 9.079800498753118, + "grad_norm": 10.63701057434082, + "learning_rate": 1.0924688279301748e-05, + "loss": 0.3668, + "step": 36410 + }, + { + "epoch": 9.082294264339152, + "grad_norm": 9.0299711227417, + "learning_rate": 1.0922194513715712e-05, + "loss": 0.4303, + "step": 36420 + }, + { + "epoch": 9.084788029925187, + "grad_norm": 6.327706813812256, + "learning_rate": 1.0919700748129675e-05, + "loss": 0.3057, + "step": 36430 + }, + { + "epoch": 9.087281795511222, + "grad_norm": 10.897625923156738, + "learning_rate": 1.0917206982543642e-05, + "loss": 0.3726, + "step": 36440 + }, + { + "epoch": 9.089775561097257, + "grad_norm": 8.112853050231934, + "learning_rate": 1.0914713216957606e-05, + "loss": 0.3477, + "step": 36450 + }, + { + "epoch": 9.092269326683292, + "grad_norm": 7.813316345214844, + "learning_rate": 1.0912219451371573e-05, + "loss": 0.2972, + "step": 36460 + }, + { + "epoch": 9.094763092269327, + "grad_norm": 8.015786170959473, + "learning_rate": 1.0909725685785536e-05, + "loss": 0.3787, + "step": 36470 + }, + { + "epoch": 9.097256857855362, + "grad_norm": 6.349668502807617, + "learning_rate": 1.0907231920199502e-05, + "loss": 0.3598, + "step": 36480 + }, + { + "epoch": 9.099750623441397, + "grad_norm": 5.772064208984375, + "learning_rate": 1.0904738154613467e-05, + "loss": 0.32, + "step": 36490 + }, + { + "epoch": 9.102244389027431, + "grad_norm": 6.944436073303223, + "learning_rate": 1.0902244389027432e-05, + "loss": 0.3449, + "step": 36500 + }, + { + "epoch": 9.104738154613466, + "grad_norm": 6.625913619995117, + "learning_rate": 1.0899750623441396e-05, + "loss": 0.3236, + "step": 36510 + }, + { + "epoch": 9.107231920199501, + "grad_norm": 6.98844051361084, + "learning_rate": 1.0897256857855363e-05, + "loss": 0.3226, + "step": 36520 + }, + { + "epoch": 9.109725685785536, + "grad_norm": 10.177889823913574, + "learning_rate": 1.0894763092269327e-05, + "loss": 0.3706, + "step": 36530 + }, + { + "epoch": 9.11221945137157, + "grad_norm": 9.637530326843262, + "learning_rate": 1.0892269326683294e-05, + "loss": 0.3774, + "step": 36540 + }, + { + "epoch": 9.114713216957606, + "grad_norm": 6.966104030609131, + "learning_rate": 1.0889775561097257e-05, + "loss": 0.3834, + "step": 36550 + }, + { + "epoch": 9.11720698254364, + "grad_norm": 4.891960620880127, + "learning_rate": 1.0887281795511223e-05, + "loss": 0.3099, + "step": 36560 + }, + { + "epoch": 9.119700748129675, + "grad_norm": 12.025975227355957, + "learning_rate": 1.0884788029925188e-05, + "loss": 0.3532, + "step": 36570 + }, + { + "epoch": 9.12219451371571, + "grad_norm": 7.283741474151611, + "learning_rate": 1.0882294264339153e-05, + "loss": 0.3582, + "step": 36580 + }, + { + "epoch": 9.124688279301745, + "grad_norm": 16.89406967163086, + "learning_rate": 1.0879800498753118e-05, + "loss": 0.42, + "step": 36590 + }, + { + "epoch": 9.12718204488778, + "grad_norm": 9.610578536987305, + "learning_rate": 1.0877306733167084e-05, + "loss": 0.3352, + "step": 36600 + }, + { + "epoch": 9.129675810473815, + "grad_norm": 6.792508602142334, + "learning_rate": 1.0874812967581047e-05, + "loss": 0.3906, + "step": 36610 + }, + { + "epoch": 9.13216957605985, + "grad_norm": 6.052822113037109, + "learning_rate": 1.0872319201995014e-05, + "loss": 0.3852, + "step": 36620 + }, + { + "epoch": 9.134663341645885, + "grad_norm": 8.799402236938477, + "learning_rate": 1.0869825436408978e-05, + "loss": 0.4047, + "step": 36630 + }, + { + "epoch": 9.13715710723192, + "grad_norm": 8.114533424377441, + "learning_rate": 1.0867331670822943e-05, + "loss": 0.3871, + "step": 36640 + }, + { + "epoch": 9.139650872817954, + "grad_norm": 9.247639656066895, + "learning_rate": 1.0864837905236909e-05, + "loss": 0.3993, + "step": 36650 + }, + { + "epoch": 9.14214463840399, + "grad_norm": 7.904721736907959, + "learning_rate": 1.0862344139650874e-05, + "loss": 0.3441, + "step": 36660 + }, + { + "epoch": 9.144638403990024, + "grad_norm": 12.610846519470215, + "learning_rate": 1.085985037406484e-05, + "loss": 0.3699, + "step": 36670 + }, + { + "epoch": 9.147132169576059, + "grad_norm": 5.233782768249512, + "learning_rate": 1.0857356608478805e-05, + "loss": 0.3623, + "step": 36680 + }, + { + "epoch": 9.149625935162096, + "grad_norm": 6.99005651473999, + "learning_rate": 1.0854862842892768e-05, + "loss": 0.3476, + "step": 36690 + }, + { + "epoch": 9.15211970074813, + "grad_norm": 11.361775398254395, + "learning_rate": 1.0852369077306735e-05, + "loss": 0.412, + "step": 36700 + }, + { + "epoch": 9.154613466334165, + "grad_norm": 6.979437828063965, + "learning_rate": 1.0849875311720699e-05, + "loss": 0.4043, + "step": 36710 + }, + { + "epoch": 9.1571072319202, + "grad_norm": 6.4810943603515625, + "learning_rate": 1.0847381546134662e-05, + "loss": 0.3768, + "step": 36720 + }, + { + "epoch": 9.159600997506235, + "grad_norm": 10.649642944335938, + "learning_rate": 1.084488778054863e-05, + "loss": 0.3687, + "step": 36730 + }, + { + "epoch": 9.16209476309227, + "grad_norm": 10.065546035766602, + "learning_rate": 1.0842394014962595e-05, + "loss": 0.3891, + "step": 36740 + }, + { + "epoch": 9.164588528678305, + "grad_norm": 7.575584411621094, + "learning_rate": 1.083990024937656e-05, + "loss": 0.3816, + "step": 36750 + }, + { + "epoch": 9.16708229426434, + "grad_norm": 10.201583862304688, + "learning_rate": 1.0837406483790525e-05, + "loss": 0.3173, + "step": 36760 + }, + { + "epoch": 9.169576059850375, + "grad_norm": 10.348787307739258, + "learning_rate": 1.0834912718204489e-05, + "loss": 0.4488, + "step": 36770 + }, + { + "epoch": 9.17206982543641, + "grad_norm": 6.726578235626221, + "learning_rate": 1.0832418952618456e-05, + "loss": 0.3276, + "step": 36780 + }, + { + "epoch": 9.174563591022444, + "grad_norm": 8.635772705078125, + "learning_rate": 1.082992518703242e-05, + "loss": 0.3571, + "step": 36790 + }, + { + "epoch": 9.17705735660848, + "grad_norm": 9.382979393005371, + "learning_rate": 1.0827431421446383e-05, + "loss": 0.3251, + "step": 36800 + }, + { + "epoch": 9.179551122194514, + "grad_norm": 5.400996208190918, + "learning_rate": 1.082493765586035e-05, + "loss": 0.3253, + "step": 36810 + }, + { + "epoch": 9.182044887780549, + "grad_norm": 6.488340854644775, + "learning_rate": 1.0822443890274314e-05, + "loss": 0.3224, + "step": 36820 + }, + { + "epoch": 9.184538653366584, + "grad_norm": 6.428625583648682, + "learning_rate": 1.081995012468828e-05, + "loss": 0.3234, + "step": 36830 + }, + { + "epoch": 9.187032418952619, + "grad_norm": 7.193785667419434, + "learning_rate": 1.0817456359102244e-05, + "loss": 0.2956, + "step": 36840 + }, + { + "epoch": 9.189526184538654, + "grad_norm": 9.758343696594238, + "learning_rate": 1.081496259351621e-05, + "loss": 0.3444, + "step": 36850 + }, + { + "epoch": 9.192019950124688, + "grad_norm": 6.714698314666748, + "learning_rate": 1.0812468827930177e-05, + "loss": 0.4052, + "step": 36860 + }, + { + "epoch": 9.194513715710723, + "grad_norm": 8.280781745910645, + "learning_rate": 1.080997506234414e-05, + "loss": 0.3476, + "step": 36870 + }, + { + "epoch": 9.197007481296758, + "grad_norm": 9.618976593017578, + "learning_rate": 1.0807481296758107e-05, + "loss": 0.3929, + "step": 36880 + }, + { + "epoch": 9.199501246882793, + "grad_norm": 8.107794761657715, + "learning_rate": 1.0804987531172071e-05, + "loss": 0.3866, + "step": 36890 + }, + { + "epoch": 9.201995012468828, + "grad_norm": 6.555667400360107, + "learning_rate": 1.0802493765586035e-05, + "loss": 0.3293, + "step": 36900 + }, + { + "epoch": 9.204488778054863, + "grad_norm": 5.286886215209961, + "learning_rate": 1.0800000000000002e-05, + "loss": 0.3245, + "step": 36910 + }, + { + "epoch": 9.206982543640898, + "grad_norm": 16.370393753051758, + "learning_rate": 1.0797506234413965e-05, + "loss": 0.3701, + "step": 36920 + }, + { + "epoch": 9.209476309226932, + "grad_norm": 8.964117050170898, + "learning_rate": 1.079501246882793e-05, + "loss": 0.4376, + "step": 36930 + }, + { + "epoch": 9.211970074812967, + "grad_norm": 11.275566101074219, + "learning_rate": 1.0792518703241896e-05, + "loss": 0.3203, + "step": 36940 + }, + { + "epoch": 9.214463840399002, + "grad_norm": 7.305138111114502, + "learning_rate": 1.0790024937655861e-05, + "loss": 0.3401, + "step": 36950 + }, + { + "epoch": 9.216957605985037, + "grad_norm": 8.845014572143555, + "learning_rate": 1.0787531172069826e-05, + "loss": 0.3508, + "step": 36960 + }, + { + "epoch": 9.219451371571072, + "grad_norm": 7.507462024688721, + "learning_rate": 1.0785037406483792e-05, + "loss": 0.3717, + "step": 36970 + }, + { + "epoch": 9.221945137157107, + "grad_norm": 6.304407119750977, + "learning_rate": 1.0782543640897755e-05, + "loss": 0.4225, + "step": 36980 + }, + { + "epoch": 9.224438902743142, + "grad_norm": 6.4769511222839355, + "learning_rate": 1.0780049875311722e-05, + "loss": 0.3015, + "step": 36990 + }, + { + "epoch": 9.226932668329177, + "grad_norm": 5.64371395111084, + "learning_rate": 1.0777556109725686e-05, + "loss": 0.3666, + "step": 37000 + }, + { + "epoch": 9.229426433915211, + "grad_norm": 7.976513385772705, + "learning_rate": 1.0775062344139651e-05, + "loss": 0.3437, + "step": 37010 + }, + { + "epoch": 9.231920199501246, + "grad_norm": 8.061870574951172, + "learning_rate": 1.0772568578553617e-05, + "loss": 0.4176, + "step": 37020 + }, + { + "epoch": 9.234413965087281, + "grad_norm": 6.633289813995361, + "learning_rate": 1.0770074812967582e-05, + "loss": 0.3978, + "step": 37030 + }, + { + "epoch": 9.236907730673316, + "grad_norm": 6.083536624908447, + "learning_rate": 1.0767581047381547e-05, + "loss": 0.3067, + "step": 37040 + }, + { + "epoch": 9.239401496259351, + "grad_norm": 9.767627716064453, + "learning_rate": 1.0765087281795513e-05, + "loss": 0.449, + "step": 37050 + }, + { + "epoch": 9.241895261845386, + "grad_norm": 6.643617153167725, + "learning_rate": 1.0762593516209476e-05, + "loss": 0.4396, + "step": 37060 + }, + { + "epoch": 9.24438902743142, + "grad_norm": 13.932653427124023, + "learning_rate": 1.0760099750623443e-05, + "loss": 0.2836, + "step": 37070 + }, + { + "epoch": 9.246882793017456, + "grad_norm": 8.449307441711426, + "learning_rate": 1.0757605985037407e-05, + "loss": 0.3397, + "step": 37080 + }, + { + "epoch": 9.24937655860349, + "grad_norm": 4.122343063354492, + "learning_rate": 1.0755112219451374e-05, + "loss": 0.3449, + "step": 37090 + }, + { + "epoch": 9.251870324189527, + "grad_norm": 5.507016658782959, + "learning_rate": 1.0752618453865337e-05, + "loss": 0.3574, + "step": 37100 + }, + { + "epoch": 9.254364089775562, + "grad_norm": 4.903787136077881, + "learning_rate": 1.0750124688279303e-05, + "loss": 0.3682, + "step": 37110 + }, + { + "epoch": 9.256857855361597, + "grad_norm": 10.680002212524414, + "learning_rate": 1.0747630922693268e-05, + "loss": 0.2884, + "step": 37120 + }, + { + "epoch": 9.259351620947632, + "grad_norm": 7.1772284507751465, + "learning_rate": 1.0745137157107233e-05, + "loss": 0.3803, + "step": 37130 + }, + { + "epoch": 9.261845386533667, + "grad_norm": 5.805769920349121, + "learning_rate": 1.0742643391521197e-05, + "loss": 0.3874, + "step": 37140 + }, + { + "epoch": 9.264339152119701, + "grad_norm": 6.096296787261963, + "learning_rate": 1.0740149625935164e-05, + "loss": 0.2803, + "step": 37150 + }, + { + "epoch": 9.266832917705736, + "grad_norm": 9.108000755310059, + "learning_rate": 1.0737655860349128e-05, + "loss": 0.4142, + "step": 37160 + }, + { + "epoch": 9.269326683291771, + "grad_norm": 8.92357063293457, + "learning_rate": 1.0735162094763095e-05, + "loss": 0.3979, + "step": 37170 + }, + { + "epoch": 9.271820448877806, + "grad_norm": 7.647791862487793, + "learning_rate": 1.0732668329177058e-05, + "loss": 0.3712, + "step": 37180 + }, + { + "epoch": 9.27431421446384, + "grad_norm": 8.077733993530273, + "learning_rate": 1.0730174563591022e-05, + "loss": 0.334, + "step": 37190 + }, + { + "epoch": 9.276807980049876, + "grad_norm": 9.143230438232422, + "learning_rate": 1.0727680798004989e-05, + "loss": 0.3473, + "step": 37200 + }, + { + "epoch": 9.27930174563591, + "grad_norm": 9.733719825744629, + "learning_rate": 1.0725187032418954e-05, + "loss": 0.329, + "step": 37210 + }, + { + "epoch": 9.281795511221945, + "grad_norm": 6.865516185760498, + "learning_rate": 1.0722693266832918e-05, + "loss": 0.4986, + "step": 37220 + }, + { + "epoch": 9.28428927680798, + "grad_norm": 6.8000664710998535, + "learning_rate": 1.0720199501246885e-05, + "loss": 0.3473, + "step": 37230 + }, + { + "epoch": 9.286783042394015, + "grad_norm": 5.607472896575928, + "learning_rate": 1.0717705735660848e-05, + "loss": 0.3789, + "step": 37240 + }, + { + "epoch": 9.28927680798005, + "grad_norm": 8.08205795288086, + "learning_rate": 1.0715211970074815e-05, + "loss": 0.4234, + "step": 37250 + }, + { + "epoch": 9.291770573566085, + "grad_norm": 5.731466293334961, + "learning_rate": 1.0712718204488779e-05, + "loss": 0.3009, + "step": 37260 + }, + { + "epoch": 9.29426433915212, + "grad_norm": 8.150042533874512, + "learning_rate": 1.0710224438902743e-05, + "loss": 0.382, + "step": 37270 + }, + { + "epoch": 9.296758104738155, + "grad_norm": 4.996364116668701, + "learning_rate": 1.070773067331671e-05, + "loss": 0.319, + "step": 37280 + }, + { + "epoch": 9.29925187032419, + "grad_norm": 5.883243083953857, + "learning_rate": 1.0705236907730673e-05, + "loss": 0.3691, + "step": 37290 + }, + { + "epoch": 9.301745635910224, + "grad_norm": 8.430337905883789, + "learning_rate": 1.070274314214464e-05, + "loss": 0.3654, + "step": 37300 + }, + { + "epoch": 9.30423940149626, + "grad_norm": 5.940845012664795, + "learning_rate": 1.0700249376558604e-05, + "loss": 0.4136, + "step": 37310 + }, + { + "epoch": 9.306733167082294, + "grad_norm": 8.069353103637695, + "learning_rate": 1.069775561097257e-05, + "loss": 0.3543, + "step": 37320 + }, + { + "epoch": 9.309226932668329, + "grad_norm": 7.512514591217041, + "learning_rate": 1.0695261845386536e-05, + "loss": 0.3448, + "step": 37330 + }, + { + "epoch": 9.311720698254364, + "grad_norm": 8.438852310180664, + "learning_rate": 1.06927680798005e-05, + "loss": 0.3801, + "step": 37340 + }, + { + "epoch": 9.314214463840399, + "grad_norm": 6.524256229400635, + "learning_rate": 1.0690274314214463e-05, + "loss": 0.43, + "step": 37350 + }, + { + "epoch": 9.316708229426434, + "grad_norm": 7.598324298858643, + "learning_rate": 1.068778054862843e-05, + "loss": 0.3323, + "step": 37360 + }, + { + "epoch": 9.319201995012468, + "grad_norm": 6.724202632904053, + "learning_rate": 1.0685286783042394e-05, + "loss": 0.3265, + "step": 37370 + }, + { + "epoch": 9.321695760598503, + "grad_norm": 6.113863945007324, + "learning_rate": 1.0682793017456361e-05, + "loss": 0.3499, + "step": 37380 + }, + { + "epoch": 9.324189526184538, + "grad_norm": 5.505059719085693, + "learning_rate": 1.0680299251870325e-05, + "loss": 0.4184, + "step": 37390 + }, + { + "epoch": 9.326683291770573, + "grad_norm": 7.815418243408203, + "learning_rate": 1.067780548628429e-05, + "loss": 0.4034, + "step": 37400 + }, + { + "epoch": 9.329177057356608, + "grad_norm": 7.3198957443237305, + "learning_rate": 1.0675311720698255e-05, + "loss": 0.3815, + "step": 37410 + }, + { + "epoch": 9.331670822942643, + "grad_norm": 5.2639570236206055, + "learning_rate": 1.067281795511222e-05, + "loss": 0.4119, + "step": 37420 + }, + { + "epoch": 9.334164588528678, + "grad_norm": 7.291960716247559, + "learning_rate": 1.0670324189526184e-05, + "loss": 0.3337, + "step": 37430 + }, + { + "epoch": 9.336658354114713, + "grad_norm": 6.731927871704102, + "learning_rate": 1.0667830423940151e-05, + "loss": 0.3206, + "step": 37440 + }, + { + "epoch": 9.339152119700747, + "grad_norm": 6.022372722625732, + "learning_rate": 1.0665336658354115e-05, + "loss": 0.3708, + "step": 37450 + }, + { + "epoch": 9.341645885286782, + "grad_norm": 8.287287712097168, + "learning_rate": 1.0662842892768082e-05, + "loss": 0.3234, + "step": 37460 + }, + { + "epoch": 9.344139650872817, + "grad_norm": 5.861464023590088, + "learning_rate": 1.0660349127182045e-05, + "loss": 0.3365, + "step": 37470 + }, + { + "epoch": 9.346633416458852, + "grad_norm": 7.660949230194092, + "learning_rate": 1.065785536159601e-05, + "loss": 0.4347, + "step": 37480 + }, + { + "epoch": 9.349127182044889, + "grad_norm": 7.541104793548584, + "learning_rate": 1.0655361596009976e-05, + "loss": 0.3541, + "step": 37490 + }, + { + "epoch": 9.351620947630924, + "grad_norm": 6.571524620056152, + "learning_rate": 1.0652867830423941e-05, + "loss": 0.3529, + "step": 37500 + }, + { + "epoch": 9.354114713216958, + "grad_norm": 7.663899898529053, + "learning_rate": 1.0650374064837905e-05, + "loss": 0.3262, + "step": 37510 + }, + { + "epoch": 9.356608478802993, + "grad_norm": 5.076836109161377, + "learning_rate": 1.0647880299251872e-05, + "loss": 0.3693, + "step": 37520 + }, + { + "epoch": 9.359102244389028, + "grad_norm": 8.110295295715332, + "learning_rate": 1.0645386533665836e-05, + "loss": 0.3447, + "step": 37530 + }, + { + "epoch": 9.361596009975063, + "grad_norm": 9.976151466369629, + "learning_rate": 1.0642892768079803e-05, + "loss": 0.3858, + "step": 37540 + }, + { + "epoch": 9.364089775561098, + "grad_norm": 5.691299915313721, + "learning_rate": 1.0640399002493766e-05, + "loss": 0.3673, + "step": 37550 + }, + { + "epoch": 9.366583541147133, + "grad_norm": 6.755293369293213, + "learning_rate": 1.0637905236907732e-05, + "loss": 0.4159, + "step": 37560 + }, + { + "epoch": 9.369077306733168, + "grad_norm": 6.009609699249268, + "learning_rate": 1.0635411471321697e-05, + "loss": 0.3256, + "step": 37570 + }, + { + "epoch": 9.371571072319203, + "grad_norm": 8.494285583496094, + "learning_rate": 1.0632917705735662e-05, + "loss": 0.3694, + "step": 37580 + }, + { + "epoch": 9.374064837905237, + "grad_norm": 5.737974643707275, + "learning_rate": 1.0630423940149627e-05, + "loss": 0.3142, + "step": 37590 + }, + { + "epoch": 9.376558603491272, + "grad_norm": 6.128436088562012, + "learning_rate": 1.0627930174563593e-05, + "loss": 0.3198, + "step": 37600 + }, + { + "epoch": 9.379052369077307, + "grad_norm": 7.005685806274414, + "learning_rate": 1.0625436408977556e-05, + "loss": 0.2682, + "step": 37610 + }, + { + "epoch": 9.381546134663342, + "grad_norm": 9.262737274169922, + "learning_rate": 1.0622942643391523e-05, + "loss": 0.3833, + "step": 37620 + }, + { + "epoch": 9.384039900249377, + "grad_norm": 7.676625728607178, + "learning_rate": 1.0620448877805487e-05, + "loss": 0.3851, + "step": 37630 + }, + { + "epoch": 9.386533665835412, + "grad_norm": 7.627851486206055, + "learning_rate": 1.061795511221945e-05, + "loss": 0.3433, + "step": 37640 + }, + { + "epoch": 9.389027431421447, + "grad_norm": 7.219352722167969, + "learning_rate": 1.0615461346633418e-05, + "loss": 0.3822, + "step": 37650 + }, + { + "epoch": 9.391521197007481, + "grad_norm": 6.273348808288574, + "learning_rate": 1.0612967581047381e-05, + "loss": 0.3067, + "step": 37660 + }, + { + "epoch": 9.394014962593516, + "grad_norm": 7.28784704208374, + "learning_rate": 1.0610473815461348e-05, + "loss": 0.3483, + "step": 37670 + }, + { + "epoch": 9.396508728179551, + "grad_norm": 7.136936187744141, + "learning_rate": 1.0607980049875314e-05, + "loss": 0.3342, + "step": 37680 + }, + { + "epoch": 9.399002493765586, + "grad_norm": 9.429666519165039, + "learning_rate": 1.0605486284289277e-05, + "loss": 0.387, + "step": 37690 + }, + { + "epoch": 9.401496259351621, + "grad_norm": 10.183398246765137, + "learning_rate": 1.0602992518703244e-05, + "loss": 0.3281, + "step": 37700 + }, + { + "epoch": 9.403990024937656, + "grad_norm": 9.305542945861816, + "learning_rate": 1.0600498753117208e-05, + "loss": 0.422, + "step": 37710 + }, + { + "epoch": 9.40648379052369, + "grad_norm": 13.953811645507812, + "learning_rate": 1.0598004987531171e-05, + "loss": 0.4099, + "step": 37720 + }, + { + "epoch": 9.408977556109726, + "grad_norm": 7.635592460632324, + "learning_rate": 1.0595511221945138e-05, + "loss": 0.3641, + "step": 37730 + }, + { + "epoch": 9.41147132169576, + "grad_norm": 8.282495498657227, + "learning_rate": 1.0593017456359102e-05, + "loss": 0.3229, + "step": 37740 + }, + { + "epoch": 9.413965087281795, + "grad_norm": 6.074685096740723, + "learning_rate": 1.0590523690773069e-05, + "loss": 0.3637, + "step": 37750 + }, + { + "epoch": 9.41645885286783, + "grad_norm": 5.761679172515869, + "learning_rate": 1.0588029925187033e-05, + "loss": 0.3487, + "step": 37760 + }, + { + "epoch": 9.418952618453865, + "grad_norm": 9.500771522521973, + "learning_rate": 1.0585536159600998e-05, + "loss": 0.3125, + "step": 37770 + }, + { + "epoch": 9.4214463840399, + "grad_norm": 8.102692604064941, + "learning_rate": 1.0583042394014963e-05, + "loss": 0.3755, + "step": 37780 + }, + { + "epoch": 9.423940149625935, + "grad_norm": 7.283730506896973, + "learning_rate": 1.0580548628428929e-05, + "loss": 0.3766, + "step": 37790 + }, + { + "epoch": 9.42643391521197, + "grad_norm": 8.951366424560547, + "learning_rate": 1.0578054862842896e-05, + "loss": 0.4029, + "step": 37800 + }, + { + "epoch": 9.428927680798004, + "grad_norm": 5.814979076385498, + "learning_rate": 1.057556109725686e-05, + "loss": 0.3822, + "step": 37810 + }, + { + "epoch": 9.43142144638404, + "grad_norm": 5.397665977478027, + "learning_rate": 1.0573067331670823e-05, + "loss": 0.3395, + "step": 37820 + }, + { + "epoch": 9.433915211970074, + "grad_norm": 8.175272941589355, + "learning_rate": 1.057057356608479e-05, + "loss": 0.3406, + "step": 37830 + }, + { + "epoch": 9.436408977556109, + "grad_norm": 9.08521556854248, + "learning_rate": 1.0568079800498753e-05, + "loss": 0.3476, + "step": 37840 + }, + { + "epoch": 9.438902743142144, + "grad_norm": 5.0024518966674805, + "learning_rate": 1.0565586034912719e-05, + "loss": 0.4372, + "step": 37850 + }, + { + "epoch": 9.441396508728179, + "grad_norm": 9.28567886352539, + "learning_rate": 1.0563092269326684e-05, + "loss": 0.3301, + "step": 37860 + }, + { + "epoch": 9.443890274314214, + "grad_norm": 10.780661582946777, + "learning_rate": 1.056059850374065e-05, + "loss": 0.3487, + "step": 37870 + }, + { + "epoch": 9.446384039900249, + "grad_norm": 6.956487655639648, + "learning_rate": 1.0558104738154615e-05, + "loss": 0.3411, + "step": 37880 + }, + { + "epoch": 9.448877805486283, + "grad_norm": 8.598062515258789, + "learning_rate": 1.055561097256858e-05, + "loss": 0.406, + "step": 37890 + }, + { + "epoch": 9.451371571072318, + "grad_norm": 7.002991676330566, + "learning_rate": 1.0553117206982544e-05, + "loss": 0.3886, + "step": 37900 + }, + { + "epoch": 9.453865336658355, + "grad_norm": 6.1540846824646, + "learning_rate": 1.055062344139651e-05, + "loss": 0.3386, + "step": 37910 + }, + { + "epoch": 9.45635910224439, + "grad_norm": 4.934481620788574, + "learning_rate": 1.0548129675810474e-05, + "loss": 0.3195, + "step": 37920 + }, + { + "epoch": 9.458852867830425, + "grad_norm": 7.763730525970459, + "learning_rate": 1.054563591022444e-05, + "loss": 0.3492, + "step": 37930 + }, + { + "epoch": 9.46134663341646, + "grad_norm": 10.117830276489258, + "learning_rate": 1.0543142144638405e-05, + "loss": 0.3134, + "step": 37940 + }, + { + "epoch": 9.463840399002494, + "grad_norm": 6.80835485458374, + "learning_rate": 1.054064837905237e-05, + "loss": 0.3832, + "step": 37950 + }, + { + "epoch": 9.46633416458853, + "grad_norm": 6.438353538513184, + "learning_rate": 1.0538154613466335e-05, + "loss": 0.3753, + "step": 37960 + }, + { + "epoch": 9.468827930174564, + "grad_norm": 7.7765278816223145, + "learning_rate": 1.05356608478803e-05, + "loss": 0.342, + "step": 37970 + }, + { + "epoch": 9.471321695760599, + "grad_norm": 7.024895191192627, + "learning_rate": 1.0533167082294264e-05, + "loss": 0.3745, + "step": 37980 + }, + { + "epoch": 9.473815461346634, + "grad_norm": 9.783491134643555, + "learning_rate": 1.0530673316708231e-05, + "loss": 0.3311, + "step": 37990 + }, + { + "epoch": 9.476309226932669, + "grad_norm": 7.7284979820251465, + "learning_rate": 1.0528179551122195e-05, + "loss": 0.3344, + "step": 38000 + }, + { + "epoch": 9.478802992518704, + "grad_norm": 5.616584777832031, + "learning_rate": 1.0525685785536159e-05, + "loss": 0.3145, + "step": 38010 + }, + { + "epoch": 9.481296758104738, + "grad_norm": 7.88400936126709, + "learning_rate": 1.0523192019950126e-05, + "loss": 0.332, + "step": 38020 + }, + { + "epoch": 9.483790523690773, + "grad_norm": 4.7109222412109375, + "learning_rate": 1.0520698254364091e-05, + "loss": 0.3345, + "step": 38030 + }, + { + "epoch": 9.486284289276808, + "grad_norm": 7.7549543380737305, + "learning_rate": 1.0518204488778056e-05, + "loss": 0.3887, + "step": 38040 + }, + { + "epoch": 9.488778054862843, + "grad_norm": 13.061963081359863, + "learning_rate": 1.0515710723192022e-05, + "loss": 0.3815, + "step": 38050 + }, + { + "epoch": 9.491271820448878, + "grad_norm": 7.7246479988098145, + "learning_rate": 1.0513216957605985e-05, + "loss": 0.2914, + "step": 38060 + }, + { + "epoch": 9.493765586034913, + "grad_norm": 9.497514724731445, + "learning_rate": 1.0510723192019952e-05, + "loss": 0.3772, + "step": 38070 + }, + { + "epoch": 9.496259351620948, + "grad_norm": 10.470547676086426, + "learning_rate": 1.0508229426433916e-05, + "loss": 0.3788, + "step": 38080 + }, + { + "epoch": 9.498753117206983, + "grad_norm": 5.6805009841918945, + "learning_rate": 1.0505735660847883e-05, + "loss": 0.3683, + "step": 38090 + }, + { + "epoch": 9.501246882793017, + "grad_norm": 8.353499412536621, + "learning_rate": 1.0503241895261846e-05, + "loss": 0.3531, + "step": 38100 + }, + { + "epoch": 9.503740648379052, + "grad_norm": 5.944037914276123, + "learning_rate": 1.050074812967581e-05, + "loss": 0.3669, + "step": 38110 + }, + { + "epoch": 9.506234413965087, + "grad_norm": 10.917936325073242, + "learning_rate": 1.0498254364089777e-05, + "loss": 0.383, + "step": 38120 + }, + { + "epoch": 9.508728179551122, + "grad_norm": 8.01206111907959, + "learning_rate": 1.049576059850374e-05, + "loss": 0.3929, + "step": 38130 + }, + { + "epoch": 9.511221945137157, + "grad_norm": 8.861490249633789, + "learning_rate": 1.0493266832917706e-05, + "loss": 0.3805, + "step": 38140 + }, + { + "epoch": 9.513715710723192, + "grad_norm": 5.089669227600098, + "learning_rate": 1.0490773067331673e-05, + "loss": 0.3602, + "step": 38150 + }, + { + "epoch": 9.516209476309227, + "grad_norm": 7.066434383392334, + "learning_rate": 1.0488279301745637e-05, + "loss": 0.376, + "step": 38160 + }, + { + "epoch": 9.518703241895262, + "grad_norm": 6.961791038513184, + "learning_rate": 1.0485785536159604e-05, + "loss": 0.3763, + "step": 38170 + }, + { + "epoch": 9.521197007481296, + "grad_norm": 9.394706726074219, + "learning_rate": 1.0483291770573567e-05, + "loss": 0.3548, + "step": 38180 + }, + { + "epoch": 9.523690773067331, + "grad_norm": 8.510271072387695, + "learning_rate": 1.048079800498753e-05, + "loss": 0.3591, + "step": 38190 + }, + { + "epoch": 9.526184538653366, + "grad_norm": 6.186204433441162, + "learning_rate": 1.0478304239401498e-05, + "loss": 0.3024, + "step": 38200 + }, + { + "epoch": 9.528678304239401, + "grad_norm": 8.06693172454834, + "learning_rate": 1.0475810473815461e-05, + "loss": 0.3328, + "step": 38210 + }, + { + "epoch": 9.531172069825436, + "grad_norm": 5.462076663970947, + "learning_rate": 1.0473316708229427e-05, + "loss": 0.3593, + "step": 38220 + }, + { + "epoch": 9.53366583541147, + "grad_norm": 9.915395736694336, + "learning_rate": 1.0470822942643392e-05, + "loss": 0.3356, + "step": 38230 + }, + { + "epoch": 9.536159600997506, + "grad_norm": 7.561076641082764, + "learning_rate": 1.0468329177057357e-05, + "loss": 0.4131, + "step": 38240 + }, + { + "epoch": 9.53865336658354, + "grad_norm": 11.387033462524414, + "learning_rate": 1.0465835411471323e-05, + "loss": 0.3629, + "step": 38250 + }, + { + "epoch": 9.541147132169575, + "grad_norm": 6.235090255737305, + "learning_rate": 1.0463341645885288e-05, + "loss": 0.2799, + "step": 38260 + }, + { + "epoch": 9.54364089775561, + "grad_norm": 6.4282989501953125, + "learning_rate": 1.0460847880299252e-05, + "loss": 0.3212, + "step": 38270 + }, + { + "epoch": 9.546134663341645, + "grad_norm": 6.645716667175293, + "learning_rate": 1.0458354114713219e-05, + "loss": 0.3519, + "step": 38280 + }, + { + "epoch": 9.548628428927682, + "grad_norm": 8.45456314086914, + "learning_rate": 1.0455860349127182e-05, + "loss": 0.3729, + "step": 38290 + }, + { + "epoch": 9.551122194513717, + "grad_norm": 5.648975849151611, + "learning_rate": 1.045336658354115e-05, + "loss": 0.3289, + "step": 38300 + }, + { + "epoch": 9.553615960099751, + "grad_norm": 9.984889030456543, + "learning_rate": 1.0450872817955113e-05, + "loss": 0.4206, + "step": 38310 + }, + { + "epoch": 9.556109725685786, + "grad_norm": 4.996423721313477, + "learning_rate": 1.0448379052369078e-05, + "loss": 0.2972, + "step": 38320 + }, + { + "epoch": 9.558603491271821, + "grad_norm": 5.108618259429932, + "learning_rate": 1.0445885286783043e-05, + "loss": 0.3033, + "step": 38330 + }, + { + "epoch": 9.561097256857856, + "grad_norm": 6.292821884155273, + "learning_rate": 1.0443391521197009e-05, + "loss": 0.3105, + "step": 38340 + }, + { + "epoch": 9.563591022443891, + "grad_norm": 7.772547245025635, + "learning_rate": 1.0440897755610972e-05, + "loss": 0.3646, + "step": 38350 + }, + { + "epoch": 9.566084788029926, + "grad_norm": 6.700054168701172, + "learning_rate": 1.043840399002494e-05, + "loss": 0.3842, + "step": 38360 + }, + { + "epoch": 9.56857855361596, + "grad_norm": 5.910054683685303, + "learning_rate": 1.0435910224438903e-05, + "loss": 0.3718, + "step": 38370 + }, + { + "epoch": 9.571072319201996, + "grad_norm": 5.428488731384277, + "learning_rate": 1.043341645885287e-05, + "loss": 0.3454, + "step": 38380 + }, + { + "epoch": 9.57356608478803, + "grad_norm": 7.547824382781982, + "learning_rate": 1.0430922693266834e-05, + "loss": 0.3771, + "step": 38390 + }, + { + "epoch": 9.576059850374065, + "grad_norm": 5.2548909187316895, + "learning_rate": 1.0428428927680799e-05, + "loss": 0.3896, + "step": 38400 + }, + { + "epoch": 9.5785536159601, + "grad_norm": 6.828177452087402, + "learning_rate": 1.0425935162094764e-05, + "loss": 0.3847, + "step": 38410 + }, + { + "epoch": 9.581047381546135, + "grad_norm": 8.20153522491455, + "learning_rate": 1.042344139650873e-05, + "loss": 0.3524, + "step": 38420 + }, + { + "epoch": 9.58354114713217, + "grad_norm": 4.548335075378418, + "learning_rate": 1.0420947630922693e-05, + "loss": 0.3613, + "step": 38430 + }, + { + "epoch": 9.586034912718205, + "grad_norm": 6.978244304656982, + "learning_rate": 1.041845386533666e-05, + "loss": 0.3803, + "step": 38440 + }, + { + "epoch": 9.58852867830424, + "grad_norm": 7.369738578796387, + "learning_rate": 1.0415960099750624e-05, + "loss": 0.3134, + "step": 38450 + }, + { + "epoch": 9.591022443890274, + "grad_norm": 8.234667778015137, + "learning_rate": 1.041346633416459e-05, + "loss": 0.3537, + "step": 38460 + }, + { + "epoch": 9.59351620947631, + "grad_norm": 6.086751461029053, + "learning_rate": 1.0410972568578554e-05, + "loss": 0.3783, + "step": 38470 + }, + { + "epoch": 9.596009975062344, + "grad_norm": 7.9929046630859375, + "learning_rate": 1.0408478802992518e-05, + "loss": 0.3491, + "step": 38480 + }, + { + "epoch": 9.598503740648379, + "grad_norm": 11.648649215698242, + "learning_rate": 1.0405985037406485e-05, + "loss": 0.3418, + "step": 38490 + }, + { + "epoch": 9.600997506234414, + "grad_norm": 8.06251049041748, + "learning_rate": 1.040349127182045e-05, + "loss": 0.3483, + "step": 38500 + }, + { + "epoch": 9.603491271820449, + "grad_norm": 9.651390075683594, + "learning_rate": 1.0400997506234414e-05, + "loss": 0.3574, + "step": 38510 + }, + { + "epoch": 9.605985037406484, + "grad_norm": 4.3901777267456055, + "learning_rate": 1.0398503740648381e-05, + "loss": 0.3304, + "step": 38520 + }, + { + "epoch": 9.608478802992519, + "grad_norm": 5.272586345672607, + "learning_rate": 1.0396009975062345e-05, + "loss": 0.2917, + "step": 38530 + }, + { + "epoch": 9.610972568578553, + "grad_norm": 9.170848846435547, + "learning_rate": 1.0393516209476312e-05, + "loss": 0.2899, + "step": 38540 + }, + { + "epoch": 9.613466334164588, + "grad_norm": 7.961601257324219, + "learning_rate": 1.0391022443890275e-05, + "loss": 0.3714, + "step": 38550 + }, + { + "epoch": 9.615960099750623, + "grad_norm": 8.155771255493164, + "learning_rate": 1.0388528678304239e-05, + "loss": 0.3591, + "step": 38560 + }, + { + "epoch": 9.618453865336658, + "grad_norm": 7.194378852844238, + "learning_rate": 1.0386034912718206e-05, + "loss": 0.355, + "step": 38570 + }, + { + "epoch": 9.620947630922693, + "grad_norm": 8.259054183959961, + "learning_rate": 1.038354114713217e-05, + "loss": 0.2957, + "step": 38580 + }, + { + "epoch": 9.623441396508728, + "grad_norm": 6.669407844543457, + "learning_rate": 1.0381047381546136e-05, + "loss": 0.3607, + "step": 38590 + }, + { + "epoch": 9.625935162094763, + "grad_norm": 7.477216720581055, + "learning_rate": 1.03785536159601e-05, + "loss": 0.3427, + "step": 38600 + }, + { + "epoch": 9.628428927680797, + "grad_norm": 5.8216986656188965, + "learning_rate": 1.0376059850374065e-05, + "loss": 0.3458, + "step": 38610 + }, + { + "epoch": 9.630922693266832, + "grad_norm": 10.364824295043945, + "learning_rate": 1.0373566084788032e-05, + "loss": 0.3817, + "step": 38620 + }, + { + "epoch": 9.633416458852867, + "grad_norm": 5.567168235778809, + "learning_rate": 1.0371072319201996e-05, + "loss": 0.2853, + "step": 38630 + }, + { + "epoch": 9.635910224438902, + "grad_norm": 7.306951999664307, + "learning_rate": 1.036857855361596e-05, + "loss": 0.3538, + "step": 38640 + }, + { + "epoch": 9.638403990024937, + "grad_norm": 8.60983943939209, + "learning_rate": 1.0366084788029927e-05, + "loss": 0.3719, + "step": 38650 + }, + { + "epoch": 9.640897755610972, + "grad_norm": 8.791508674621582, + "learning_rate": 1.036359102244389e-05, + "loss": 0.4041, + "step": 38660 + }, + { + "epoch": 9.643391521197007, + "grad_norm": 6.479540824890137, + "learning_rate": 1.0361097256857857e-05, + "loss": 0.3139, + "step": 38670 + }, + { + "epoch": 9.645885286783042, + "grad_norm": 7.419809818267822, + "learning_rate": 1.035860349127182e-05, + "loss": 0.3653, + "step": 38680 + }, + { + "epoch": 9.648379052369076, + "grad_norm": 11.746686935424805, + "learning_rate": 1.0356109725685786e-05, + "loss": 0.3622, + "step": 38690 + }, + { + "epoch": 9.650872817955111, + "grad_norm": 9.593301773071289, + "learning_rate": 1.0353615960099751e-05, + "loss": 0.3942, + "step": 38700 + }, + { + "epoch": 9.653366583541148, + "grad_norm": 5.149587154388428, + "learning_rate": 1.0351122194513717e-05, + "loss": 0.3576, + "step": 38710 + }, + { + "epoch": 9.655860349127183, + "grad_norm": 5.903531551361084, + "learning_rate": 1.034862842892768e-05, + "loss": 0.3395, + "step": 38720 + }, + { + "epoch": 9.658354114713218, + "grad_norm": 7.770636558532715, + "learning_rate": 1.0346134663341647e-05, + "loss": 0.2986, + "step": 38730 + }, + { + "epoch": 9.660847880299253, + "grad_norm": 8.158554077148438, + "learning_rate": 1.0343640897755611e-05, + "loss": 0.3346, + "step": 38740 + }, + { + "epoch": 9.663341645885287, + "grad_norm": 5.318978786468506, + "learning_rate": 1.0341147132169578e-05, + "loss": 0.312, + "step": 38750 + }, + { + "epoch": 9.665835411471322, + "grad_norm": 7.235579490661621, + "learning_rate": 1.0338653366583542e-05, + "loss": 0.3416, + "step": 38760 + }, + { + "epoch": 9.668329177057357, + "grad_norm": 6.485595226287842, + "learning_rate": 1.0336159600997507e-05, + "loss": 0.335, + "step": 38770 + }, + { + "epoch": 9.670822942643392, + "grad_norm": 8.763544082641602, + "learning_rate": 1.0333665835411472e-05, + "loss": 0.3305, + "step": 38780 + }, + { + "epoch": 9.673316708229427, + "grad_norm": 11.1206693649292, + "learning_rate": 1.0331172069825438e-05, + "loss": 0.3349, + "step": 38790 + }, + { + "epoch": 9.675810473815462, + "grad_norm": 9.942346572875977, + "learning_rate": 1.0328678304239403e-05, + "loss": 0.3211, + "step": 38800 + }, + { + "epoch": 9.678304239401497, + "grad_norm": 7.735063552856445, + "learning_rate": 1.0326184538653368e-05, + "loss": 0.3628, + "step": 38810 + }, + { + "epoch": 9.680798004987532, + "grad_norm": 8.735753059387207, + "learning_rate": 1.0323690773067332e-05, + "loss": 0.3722, + "step": 38820 + }, + { + "epoch": 9.683291770573566, + "grad_norm": 8.924358367919922, + "learning_rate": 1.0321197007481299e-05, + "loss": 0.3788, + "step": 38830 + }, + { + "epoch": 9.685785536159601, + "grad_norm": 10.292500495910645, + "learning_rate": 1.0318703241895262e-05, + "loss": 0.4222, + "step": 38840 + }, + { + "epoch": 9.688279301745636, + "grad_norm": 5.393009662628174, + "learning_rate": 1.0316209476309228e-05, + "loss": 0.4147, + "step": 38850 + }, + { + "epoch": 9.690773067331671, + "grad_norm": 6.392873287200928, + "learning_rate": 1.0313715710723193e-05, + "loss": 0.2981, + "step": 38860 + }, + { + "epoch": 9.693266832917706, + "grad_norm": 5.524891376495361, + "learning_rate": 1.0311221945137158e-05, + "loss": 0.3877, + "step": 38870 + }, + { + "epoch": 9.69576059850374, + "grad_norm": 7.086117744445801, + "learning_rate": 1.0308728179551124e-05, + "loss": 0.3449, + "step": 38880 + }, + { + "epoch": 9.698254364089776, + "grad_norm": 6.8754072189331055, + "learning_rate": 1.0306234413965089e-05, + "loss": 0.365, + "step": 38890 + }, + { + "epoch": 9.70074812967581, + "grad_norm": 8.261271476745605, + "learning_rate": 1.0303740648379053e-05, + "loss": 0.2822, + "step": 38900 + }, + { + "epoch": 9.703241895261845, + "grad_norm": 8.90969467163086, + "learning_rate": 1.030124688279302e-05, + "loss": 0.4126, + "step": 38910 + }, + { + "epoch": 9.70573566084788, + "grad_norm": 6.020090579986572, + "learning_rate": 1.0298753117206983e-05, + "loss": 0.3471, + "step": 38920 + }, + { + "epoch": 9.708229426433915, + "grad_norm": 7.7385993003845215, + "learning_rate": 1.0296259351620947e-05, + "loss": 0.3681, + "step": 38930 + }, + { + "epoch": 9.71072319201995, + "grad_norm": 8.860572814941406, + "learning_rate": 1.0293765586034914e-05, + "loss": 0.435, + "step": 38940 + }, + { + "epoch": 9.713216957605985, + "grad_norm": 8.749921798706055, + "learning_rate": 1.0291271820448877e-05, + "loss": 0.3425, + "step": 38950 + }, + { + "epoch": 9.71571072319202, + "grad_norm": 8.579872131347656, + "learning_rate": 1.0288778054862844e-05, + "loss": 0.3983, + "step": 38960 + }, + { + "epoch": 9.718204488778055, + "grad_norm": 7.9669389724731445, + "learning_rate": 1.028628428927681e-05, + "loss": 0.3475, + "step": 38970 + }, + { + "epoch": 9.72069825436409, + "grad_norm": 7.299527645111084, + "learning_rate": 1.0283790523690773e-05, + "loss": 0.3911, + "step": 38980 + }, + { + "epoch": 9.723192019950124, + "grad_norm": 5.163363933563232, + "learning_rate": 1.028129675810474e-05, + "loss": 0.361, + "step": 38990 + }, + { + "epoch": 9.72568578553616, + "grad_norm": 8.54089641571045, + "learning_rate": 1.0278802992518704e-05, + "loss": 0.33, + "step": 39000 + }, + { + "epoch": 9.728179551122194, + "grad_norm": 7.269198894500732, + "learning_rate": 1.0276309226932668e-05, + "loss": 0.3695, + "step": 39010 + }, + { + "epoch": 9.730673316708229, + "grad_norm": 8.969745635986328, + "learning_rate": 1.0273815461346635e-05, + "loss": 0.365, + "step": 39020 + }, + { + "epoch": 9.733167082294264, + "grad_norm": 5.630902290344238, + "learning_rate": 1.0271321695760598e-05, + "loss": 0.3615, + "step": 39030 + }, + { + "epoch": 9.735660847880299, + "grad_norm": 8.235981941223145, + "learning_rate": 1.0268827930174565e-05, + "loss": 0.4206, + "step": 39040 + }, + { + "epoch": 9.738154613466333, + "grad_norm": 7.914098262786865, + "learning_rate": 1.0266334164588529e-05, + "loss": 0.343, + "step": 39050 + }, + { + "epoch": 9.740648379052368, + "grad_norm": 7.478903293609619, + "learning_rate": 1.0263840399002494e-05, + "loss": 0.3678, + "step": 39060 + }, + { + "epoch": 9.743142144638403, + "grad_norm": 7.806370735168457, + "learning_rate": 1.026134663341646e-05, + "loss": 0.3855, + "step": 39070 + }, + { + "epoch": 9.745635910224438, + "grad_norm": 6.9918341636657715, + "learning_rate": 1.0258852867830425e-05, + "loss": 0.3848, + "step": 39080 + }, + { + "epoch": 9.748129675810475, + "grad_norm": 7.807930946350098, + "learning_rate": 1.0256359102244392e-05, + "loss": 0.3518, + "step": 39090 + }, + { + "epoch": 9.75062344139651, + "grad_norm": 5.30518913269043, + "learning_rate": 1.0253865336658355e-05, + "loss": 0.3512, + "step": 39100 + }, + { + "epoch": 9.753117206982544, + "grad_norm": 7.166024208068848, + "learning_rate": 1.0251371571072319e-05, + "loss": 0.4185, + "step": 39110 + }, + { + "epoch": 9.75561097256858, + "grad_norm": 7.371334075927734, + "learning_rate": 1.0248877805486286e-05, + "loss": 0.385, + "step": 39120 + }, + { + "epoch": 9.758104738154614, + "grad_norm": 5.414678573608398, + "learning_rate": 1.024638403990025e-05, + "loss": 0.3971, + "step": 39130 + }, + { + "epoch": 9.760598503740649, + "grad_norm": 8.119851112365723, + "learning_rate": 1.0243890274314215e-05, + "loss": 0.3534, + "step": 39140 + }, + { + "epoch": 9.763092269326684, + "grad_norm": 10.847015380859375, + "learning_rate": 1.024139650872818e-05, + "loss": 0.4538, + "step": 39150 + }, + { + "epoch": 9.765586034912719, + "grad_norm": 7.289000511169434, + "learning_rate": 1.0238902743142145e-05, + "loss": 0.3739, + "step": 39160 + }, + { + "epoch": 9.768079800498754, + "grad_norm": 9.922390937805176, + "learning_rate": 1.023640897755611e-05, + "loss": 0.3072, + "step": 39170 + }, + { + "epoch": 9.770573566084789, + "grad_norm": 8.990649223327637, + "learning_rate": 1.0233915211970076e-05, + "loss": 0.3926, + "step": 39180 + }, + { + "epoch": 9.773067331670823, + "grad_norm": 6.73281717300415, + "learning_rate": 1.023142144638404e-05, + "loss": 0.3652, + "step": 39190 + }, + { + "epoch": 9.775561097256858, + "grad_norm": 6.121176242828369, + "learning_rate": 1.0228927680798007e-05, + "loss": 0.3424, + "step": 39200 + }, + { + "epoch": 9.778054862842893, + "grad_norm": 10.394766807556152, + "learning_rate": 1.022643391521197e-05, + "loss": 0.3365, + "step": 39210 + }, + { + "epoch": 9.780548628428928, + "grad_norm": 6.58123254776001, + "learning_rate": 1.0223940149625936e-05, + "loss": 0.3641, + "step": 39220 + }, + { + "epoch": 9.783042394014963, + "grad_norm": 6.802944183349609, + "learning_rate": 1.0221446384039901e-05, + "loss": 0.2725, + "step": 39230 + }, + { + "epoch": 9.785536159600998, + "grad_norm": 8.817499160766602, + "learning_rate": 1.0218952618453866e-05, + "loss": 0.3455, + "step": 39240 + }, + { + "epoch": 9.788029925187033, + "grad_norm": 9.854268074035645, + "learning_rate": 1.0216458852867832e-05, + "loss": 0.3698, + "step": 39250 + }, + { + "epoch": 9.790523690773068, + "grad_norm": 7.4926323890686035, + "learning_rate": 1.0213965087281797e-05, + "loss": 0.4717, + "step": 39260 + }, + { + "epoch": 9.793017456359102, + "grad_norm": 5.085393905639648, + "learning_rate": 1.021147132169576e-05, + "loss": 0.2991, + "step": 39270 + }, + { + "epoch": 9.795511221945137, + "grad_norm": 9.098230361938477, + "learning_rate": 1.0208977556109728e-05, + "loss": 0.2923, + "step": 39280 + }, + { + "epoch": 9.798004987531172, + "grad_norm": 10.33260440826416, + "learning_rate": 1.0206483790523691e-05, + "loss": 0.4181, + "step": 39290 + }, + { + "epoch": 9.800498753117207, + "grad_norm": 8.845526695251465, + "learning_rate": 1.0203990024937658e-05, + "loss": 0.4501, + "step": 39300 + }, + { + "epoch": 9.802992518703242, + "grad_norm": 11.282034873962402, + "learning_rate": 1.0201496259351622e-05, + "loss": 0.3389, + "step": 39310 + }, + { + "epoch": 9.805486284289277, + "grad_norm": 7.388114929199219, + "learning_rate": 1.0199002493765587e-05, + "loss": 0.3754, + "step": 39320 + }, + { + "epoch": 9.807980049875312, + "grad_norm": 7.895745754241943, + "learning_rate": 1.0196508728179552e-05, + "loss": 0.3507, + "step": 39330 + }, + { + "epoch": 9.810473815461346, + "grad_norm": 7.767431259155273, + "learning_rate": 1.0194014962593518e-05, + "loss": 0.4053, + "step": 39340 + }, + { + "epoch": 9.812967581047381, + "grad_norm": 6.963127613067627, + "learning_rate": 1.0191521197007481e-05, + "loss": 0.3367, + "step": 39350 + }, + { + "epoch": 9.815461346633416, + "grad_norm": 5.868590354919434, + "learning_rate": 1.0189027431421448e-05, + "loss": 0.3903, + "step": 39360 + }, + { + "epoch": 9.817955112219451, + "grad_norm": 8.267974853515625, + "learning_rate": 1.0186533665835412e-05, + "loss": 0.3503, + "step": 39370 + }, + { + "epoch": 9.820448877805486, + "grad_norm": 6.432572841644287, + "learning_rate": 1.0184039900249379e-05, + "loss": 0.3092, + "step": 39380 + }, + { + "epoch": 9.82294264339152, + "grad_norm": 9.071556091308594, + "learning_rate": 1.0181546134663343e-05, + "loss": 0.4478, + "step": 39390 + }, + { + "epoch": 9.825436408977556, + "grad_norm": 8.835619926452637, + "learning_rate": 1.0179052369077306e-05, + "loss": 0.3756, + "step": 39400 + }, + { + "epoch": 9.82793017456359, + "grad_norm": 10.740196228027344, + "learning_rate": 1.0176558603491273e-05, + "loss": 0.3349, + "step": 39410 + }, + { + "epoch": 9.830423940149625, + "grad_norm": 11.372931480407715, + "learning_rate": 1.0174064837905237e-05, + "loss": 0.403, + "step": 39420 + }, + { + "epoch": 9.83291770573566, + "grad_norm": 9.308483123779297, + "learning_rate": 1.0171571072319202e-05, + "loss": 0.3523, + "step": 39430 + }, + { + "epoch": 9.835411471321695, + "grad_norm": 6.578537464141846, + "learning_rate": 1.0169077306733169e-05, + "loss": 0.324, + "step": 39440 + }, + { + "epoch": 9.83790523690773, + "grad_norm": 6.254716396331787, + "learning_rate": 1.0166583541147133e-05, + "loss": 0.3546, + "step": 39450 + }, + { + "epoch": 9.840399002493765, + "grad_norm": 6.601047992706299, + "learning_rate": 1.01640897755611e-05, + "loss": 0.405, + "step": 39460 + }, + { + "epoch": 9.8428927680798, + "grad_norm": 9.484192848205566, + "learning_rate": 1.0161596009975063e-05, + "loss": 0.3502, + "step": 39470 + }, + { + "epoch": 9.845386533665835, + "grad_norm": 7.499188423156738, + "learning_rate": 1.0159102244389027e-05, + "loss": 0.3508, + "step": 39480 + }, + { + "epoch": 9.84788029925187, + "grad_norm": 5.97933292388916, + "learning_rate": 1.0156608478802994e-05, + "loss": 0.3444, + "step": 39490 + }, + { + "epoch": 9.850374064837904, + "grad_norm": 7.985541343688965, + "learning_rate": 1.0154114713216958e-05, + "loss": 0.3577, + "step": 39500 + }, + { + "epoch": 9.85286783042394, + "grad_norm": 9.508009910583496, + "learning_rate": 1.0151620947630923e-05, + "loss": 0.2855, + "step": 39510 + }, + { + "epoch": 9.855361596009976, + "grad_norm": 7.971591472625732, + "learning_rate": 1.0149127182044888e-05, + "loss": 0.3184, + "step": 39520 + }, + { + "epoch": 9.85785536159601, + "grad_norm": 6.7001848220825195, + "learning_rate": 1.0146633416458853e-05, + "loss": 0.3265, + "step": 39530 + }, + { + "epoch": 9.860349127182046, + "grad_norm": 5.2400689125061035, + "learning_rate": 1.0144139650872819e-05, + "loss": 0.3971, + "step": 39540 + }, + { + "epoch": 9.86284289276808, + "grad_norm": 7.525379657745361, + "learning_rate": 1.0141645885286784e-05, + "loss": 0.384, + "step": 39550 + }, + { + "epoch": 9.865336658354115, + "grad_norm": 6.4652628898620605, + "learning_rate": 1.0139152119700748e-05, + "loss": 0.4057, + "step": 39560 + }, + { + "epoch": 9.86783042394015, + "grad_norm": 7.187132835388184, + "learning_rate": 1.0136658354114715e-05, + "loss": 0.366, + "step": 39570 + }, + { + "epoch": 9.870324189526185, + "grad_norm": 7.20662784576416, + "learning_rate": 1.0134164588528678e-05, + "loss": 0.344, + "step": 39580 + }, + { + "epoch": 9.87281795511222, + "grad_norm": 5.796775817871094, + "learning_rate": 1.0131670822942645e-05, + "loss": 0.3455, + "step": 39590 + }, + { + "epoch": 9.875311720698255, + "grad_norm": 5.326415061950684, + "learning_rate": 1.0129177057356609e-05, + "loss": 0.3139, + "step": 39600 + }, + { + "epoch": 9.87780548628429, + "grad_norm": 9.870059967041016, + "learning_rate": 1.0126683291770574e-05, + "loss": 0.4385, + "step": 39610 + }, + { + "epoch": 9.880299251870325, + "grad_norm": 5.55562686920166, + "learning_rate": 1.012418952618454e-05, + "loss": 0.3718, + "step": 39620 + }, + { + "epoch": 9.88279301745636, + "grad_norm": 6.446163177490234, + "learning_rate": 1.0121695760598505e-05, + "loss": 0.4105, + "step": 39630 + }, + { + "epoch": 9.885286783042394, + "grad_norm": 7.71967887878418, + "learning_rate": 1.0119201995012469e-05, + "loss": 0.3544, + "step": 39640 + }, + { + "epoch": 9.88778054862843, + "grad_norm": 6.094079494476318, + "learning_rate": 1.0116708229426436e-05, + "loss": 0.3683, + "step": 39650 + }, + { + "epoch": 9.890274314214464, + "grad_norm": 6.991535186767578, + "learning_rate": 1.0114214463840399e-05, + "loss": 0.3176, + "step": 39660 + }, + { + "epoch": 9.892768079800499, + "grad_norm": 6.277289867401123, + "learning_rate": 1.0111720698254366e-05, + "loss": 0.3603, + "step": 39670 + }, + { + "epoch": 9.895261845386534, + "grad_norm": 5.89650821685791, + "learning_rate": 1.010922693266833e-05, + "loss": 0.3687, + "step": 39680 + }, + { + "epoch": 9.897755610972569, + "grad_norm": 8.910008430480957, + "learning_rate": 1.0106733167082295e-05, + "loss": 0.3066, + "step": 39690 + }, + { + "epoch": 9.900249376558603, + "grad_norm": 7.53981876373291, + "learning_rate": 1.010423940149626e-05, + "loss": 0.3586, + "step": 39700 + }, + { + "epoch": 9.902743142144638, + "grad_norm": 6.939689636230469, + "learning_rate": 1.0101745635910226e-05, + "loss": 0.3845, + "step": 39710 + }, + { + "epoch": 9.905236907730673, + "grad_norm": 7.977263927459717, + "learning_rate": 1.009925187032419e-05, + "loss": 0.4126, + "step": 39720 + }, + { + "epoch": 9.907730673316708, + "grad_norm": 7.981022357940674, + "learning_rate": 1.0096758104738156e-05, + "loss": 0.2967, + "step": 39730 + }, + { + "epoch": 9.910224438902743, + "grad_norm": 7.351691246032715, + "learning_rate": 1.009426433915212e-05, + "loss": 0.3309, + "step": 39740 + }, + { + "epoch": 9.912718204488778, + "grad_norm": 6.551850318908691, + "learning_rate": 1.0091770573566087e-05, + "loss": 0.3027, + "step": 39750 + }, + { + "epoch": 9.915211970074813, + "grad_norm": 10.421834945678711, + "learning_rate": 1.008927680798005e-05, + "loss": 0.3109, + "step": 39760 + }, + { + "epoch": 9.917705735660848, + "grad_norm": 9.007574081420898, + "learning_rate": 1.0086783042394014e-05, + "loss": 0.3231, + "step": 39770 + }, + { + "epoch": 9.920199501246882, + "grad_norm": 7.615650177001953, + "learning_rate": 1.0084289276807981e-05, + "loss": 0.3444, + "step": 39780 + }, + { + "epoch": 9.922693266832917, + "grad_norm": 5.678508758544922, + "learning_rate": 1.0081795511221946e-05, + "loss": 0.3623, + "step": 39790 + }, + { + "epoch": 9.925187032418952, + "grad_norm": 7.512736797332764, + "learning_rate": 1.0079301745635912e-05, + "loss": 0.2864, + "step": 39800 + }, + { + "epoch": 9.927680798004987, + "grad_norm": 6.589831829071045, + "learning_rate": 1.0076807980049877e-05, + "loss": 0.3472, + "step": 39810 + }, + { + "epoch": 9.930174563591022, + "grad_norm": 7.6342902183532715, + "learning_rate": 1.007431421446384e-05, + "loss": 0.3813, + "step": 39820 + }, + { + "epoch": 9.932668329177057, + "grad_norm": 11.522653579711914, + "learning_rate": 1.0071820448877808e-05, + "loss": 0.3478, + "step": 39830 + }, + { + "epoch": 9.935162094763092, + "grad_norm": 8.16284465789795, + "learning_rate": 1.0069326683291771e-05, + "loss": 0.3339, + "step": 39840 + }, + { + "epoch": 9.937655860349127, + "grad_norm": 7.553341865539551, + "learning_rate": 1.0066832917705735e-05, + "loss": 0.3129, + "step": 39850 + }, + { + "epoch": 9.940149625935161, + "grad_norm": 6.100545883178711, + "learning_rate": 1.0064339152119702e-05, + "loss": 0.3501, + "step": 39860 + }, + { + "epoch": 9.942643391521196, + "grad_norm": 5.523427486419678, + "learning_rate": 1.0061845386533666e-05, + "loss": 0.392, + "step": 39870 + }, + { + "epoch": 9.945137157107231, + "grad_norm": 6.449985980987549, + "learning_rate": 1.0059351620947633e-05, + "loss": 0.3228, + "step": 39880 + }, + { + "epoch": 9.947630922693268, + "grad_norm": 8.599003791809082, + "learning_rate": 1.0056857855361596e-05, + "loss": 0.3922, + "step": 39890 + }, + { + "epoch": 9.950124688279303, + "grad_norm": 7.818081855773926, + "learning_rate": 1.0054364089775561e-05, + "loss": 0.4501, + "step": 39900 + }, + { + "epoch": 9.952618453865338, + "grad_norm": 6.245206356048584, + "learning_rate": 1.0051870324189528e-05, + "loss": 0.3167, + "step": 39910 + }, + { + "epoch": 9.955112219451372, + "grad_norm": 9.523896217346191, + "learning_rate": 1.0049376558603492e-05, + "loss": 0.3944, + "step": 39920 + }, + { + "epoch": 9.957605985037407, + "grad_norm": 12.862037658691406, + "learning_rate": 1.0046882793017456e-05, + "loss": 0.3859, + "step": 39930 + }, + { + "epoch": 9.960099750623442, + "grad_norm": 11.528470993041992, + "learning_rate": 1.0044389027431423e-05, + "loss": 0.3911, + "step": 39940 + }, + { + "epoch": 9.962593516209477, + "grad_norm": 9.270523071289062, + "learning_rate": 1.0041895261845386e-05, + "loss": 0.4362, + "step": 39950 + }, + { + "epoch": 9.965087281795512, + "grad_norm": 9.051032066345215, + "learning_rate": 1.0039401496259353e-05, + "loss": 0.422, + "step": 39960 + }, + { + "epoch": 9.967581047381547, + "grad_norm": 4.276822566986084, + "learning_rate": 1.0036907730673317e-05, + "loss": 0.397, + "step": 39970 + }, + { + "epoch": 9.970074812967582, + "grad_norm": 4.984722137451172, + "learning_rate": 1.0034413965087282e-05, + "loss": 0.3636, + "step": 39980 + }, + { + "epoch": 9.972568578553616, + "grad_norm": 5.631036281585693, + "learning_rate": 1.0031920199501248e-05, + "loss": 0.3556, + "step": 39990 + }, + { + "epoch": 9.975062344139651, + "grad_norm": 8.992472648620605, + "learning_rate": 1.0029426433915213e-05, + "loss": 0.3561, + "step": 40000 + }, + { + "epoch": 9.977556109725686, + "grad_norm": 9.377120971679688, + "learning_rate": 1.0026932668329176e-05, + "loss": 0.444, + "step": 40010 + }, + { + "epoch": 9.980049875311721, + "grad_norm": NaN, + "learning_rate": 1.0024688279301746e-05, + "loss": 0.3719, + "step": 40020 + }, + { + "epoch": 9.982543640897756, + "grad_norm": 6.351677417755127, + "learning_rate": 1.002219451371571e-05, + "loss": 0.2969, + "step": 40030 + }, + { + "epoch": 9.98503740648379, + "grad_norm": 7.045421600341797, + "learning_rate": 1.0019700748129677e-05, + "loss": 0.3032, + "step": 40040 + }, + { + "epoch": 9.987531172069826, + "grad_norm": 7.283246994018555, + "learning_rate": 1.001720698254364e-05, + "loss": 0.4718, + "step": 40050 + }, + { + "epoch": 9.99002493765586, + "grad_norm": 6.027595520019531, + "learning_rate": 1.0014713216957608e-05, + "loss": 0.3524, + "step": 40060 + }, + { + "epoch": 9.992518703241895, + "grad_norm": 5.910287380218506, + "learning_rate": 1.0012219451371571e-05, + "loss": 0.3885, + "step": 40070 + }, + { + "epoch": 9.99501246882793, + "grad_norm": 6.286899089813232, + "learning_rate": 1.0009725685785537e-05, + "loss": 0.3322, + "step": 40080 + }, + { + "epoch": 9.997506234413965, + "grad_norm": 6.434454441070557, + "learning_rate": 1.0007231920199502e-05, + "loss": 0.3139, + "step": 40090 + }, + { + "epoch": 10.0, + "grad_norm": 6.4550700187683105, + "learning_rate": 1.0004738154613467e-05, + "loss": 0.3902, + "step": 40100 + }, + { + "epoch": 10.0, + "eval_loss": 0.4127441346645355, + "eval_runtime": 59.9961, + "eval_samples_per_second": 16.718, + "eval_steps_per_second": 16.718, + "step": 40100 + }, + { + "epoch": 10.002493765586035, + "grad_norm": 7.405590057373047, + "learning_rate": 1.0002244389027431e-05, + "loss": 0.3798, + "step": 40110 + }, + { + "epoch": 10.00498753117207, + "grad_norm": 6.414757251739502, + "learning_rate": 9.999750623441398e-06, + "loss": 0.3407, + "step": 40120 + }, + { + "epoch": 10.007481296758105, + "grad_norm": 12.407922744750977, + "learning_rate": 9.997256857855361e-06, + "loss": 0.3276, + "step": 40130 + }, + { + "epoch": 10.00997506234414, + "grad_norm": 7.273416996002197, + "learning_rate": 9.994763092269327e-06, + "loss": 0.3299, + "step": 40140 + }, + { + "epoch": 10.012468827930174, + "grad_norm": 5.8642449378967285, + "learning_rate": 9.992269326683292e-06, + "loss": 0.3227, + "step": 40150 + }, + { + "epoch": 10.01496259351621, + "grad_norm": 5.257193565368652, + "learning_rate": 9.989775561097257e-06, + "loss": 0.3175, + "step": 40160 + }, + { + "epoch": 10.017456359102244, + "grad_norm": 10.810057640075684, + "learning_rate": 9.987281795511223e-06, + "loss": 0.4082, + "step": 40170 + }, + { + "epoch": 10.019950124688279, + "grad_norm": 6.935123443603516, + "learning_rate": 9.984788029925188e-06, + "loss": 0.323, + "step": 40180 + }, + { + "epoch": 10.022443890274314, + "grad_norm": 5.4332051277160645, + "learning_rate": 9.982294264339153e-06, + "loss": 0.231, + "step": 40190 + }, + { + "epoch": 10.024937655860349, + "grad_norm": 8.775599479675293, + "learning_rate": 9.979800498753119e-06, + "loss": 0.3459, + "step": 40200 + }, + { + "epoch": 10.027431421446384, + "grad_norm": 4.293675422668457, + "learning_rate": 9.977306733167084e-06, + "loss": 0.3608, + "step": 40210 + }, + { + "epoch": 10.029925187032418, + "grad_norm": 7.175844669342041, + "learning_rate": 9.974812967581048e-06, + "loss": 0.3441, + "step": 40220 + }, + { + "epoch": 10.032418952618453, + "grad_norm": 6.6354522705078125, + "learning_rate": 9.972319201995013e-06, + "loss": 0.3302, + "step": 40230 + }, + { + "epoch": 10.034912718204488, + "grad_norm": 8.921165466308594, + "learning_rate": 9.969825436408978e-06, + "loss": 0.3587, + "step": 40240 + }, + { + "epoch": 10.037406483790523, + "grad_norm": 5.317387580871582, + "learning_rate": 9.967331670822943e-06, + "loss": 0.3295, + "step": 40250 + }, + { + "epoch": 10.039900249376558, + "grad_norm": 5.346241474151611, + "learning_rate": 9.964837905236909e-06, + "loss": 0.3452, + "step": 40260 + }, + { + "epoch": 10.042394014962593, + "grad_norm": 7.5005106925964355, + "learning_rate": 9.962344139650874e-06, + "loss": 0.3677, + "step": 40270 + }, + { + "epoch": 10.044887780548628, + "grad_norm": 6.900725841522217, + "learning_rate": 9.95985037406484e-06, + "loss": 0.3433, + "step": 40280 + }, + { + "epoch": 10.047381546134662, + "grad_norm": 6.401949405670166, + "learning_rate": 9.957356608478805e-06, + "loss": 0.3746, + "step": 40290 + }, + { + "epoch": 10.049875311720697, + "grad_norm": 8.203397750854492, + "learning_rate": 9.954862842892768e-06, + "loss": 0.2977, + "step": 40300 + }, + { + "epoch": 10.052369077306734, + "grad_norm": 4.425477027893066, + "learning_rate": 9.952369077306734e-06, + "loss": 0.3271, + "step": 40310 + }, + { + "epoch": 10.054862842892769, + "grad_norm": 9.94776439666748, + "learning_rate": 9.949875311720699e-06, + "loss": 0.3569, + "step": 40320 + }, + { + "epoch": 10.057356608478804, + "grad_norm": 6.8607892990112305, + "learning_rate": 9.947381546134664e-06, + "loss": 0.3441, + "step": 40330 + }, + { + "epoch": 10.059850374064839, + "grad_norm": 8.044713973999023, + "learning_rate": 9.94488778054863e-06, + "loss": 0.3992, + "step": 40340 + }, + { + "epoch": 10.062344139650873, + "grad_norm": 7.818645477294922, + "learning_rate": 9.942394014962595e-06, + "loss": 0.3718, + "step": 40350 + }, + { + "epoch": 10.064837905236908, + "grad_norm": 8.542047500610352, + "learning_rate": 9.93990024937656e-06, + "loss": 0.4638, + "step": 40360 + }, + { + "epoch": 10.067331670822943, + "grad_norm": 9.035058975219727, + "learning_rate": 9.937406483790526e-06, + "loss": 0.3159, + "step": 40370 + }, + { + "epoch": 10.069825436408978, + "grad_norm": 4.869987964630127, + "learning_rate": 9.934912718204489e-06, + "loss": 0.3242, + "step": 40380 + }, + { + "epoch": 10.072319201995013, + "grad_norm": 10.995007514953613, + "learning_rate": 9.932418952618454e-06, + "loss": 0.3318, + "step": 40390 + }, + { + "epoch": 10.074812967581048, + "grad_norm": 10.2716703414917, + "learning_rate": 9.92992518703242e-06, + "loss": 0.4521, + "step": 40400 + }, + { + "epoch": 10.077306733167083, + "grad_norm": 5.772310256958008, + "learning_rate": 9.927431421446385e-06, + "loss": 0.3316, + "step": 40410 + }, + { + "epoch": 10.079800498753118, + "grad_norm": 8.326214790344238, + "learning_rate": 9.924937655860349e-06, + "loss": 0.3413, + "step": 40420 + }, + { + "epoch": 10.082294264339152, + "grad_norm": 6.764336109161377, + "learning_rate": 9.922443890274314e-06, + "loss": 0.3069, + "step": 40430 + }, + { + "epoch": 10.084788029925187, + "grad_norm": 7.891993522644043, + "learning_rate": 9.91995012468828e-06, + "loss": 0.2665, + "step": 40440 + }, + { + "epoch": 10.087281795511222, + "grad_norm": 6.041927814483643, + "learning_rate": 9.917456359102246e-06, + "loss": 0.2921, + "step": 40450 + }, + { + "epoch": 10.089775561097257, + "grad_norm": 9.28152084350586, + "learning_rate": 9.914962593516212e-06, + "loss": 0.3558, + "step": 40460 + }, + { + "epoch": 10.092269326683292, + "grad_norm": 6.925419330596924, + "learning_rate": 9.912468827930175e-06, + "loss": 0.3164, + "step": 40470 + }, + { + "epoch": 10.094763092269327, + "grad_norm": 6.952348709106445, + "learning_rate": 9.90997506234414e-06, + "loss": 0.4215, + "step": 40480 + }, + { + "epoch": 10.097256857855362, + "grad_norm": 8.346542358398438, + "learning_rate": 9.907481296758106e-06, + "loss": 0.4134, + "step": 40490 + }, + { + "epoch": 10.099750623441397, + "grad_norm": 6.4831719398498535, + "learning_rate": 9.904987531172071e-06, + "loss": 0.3628, + "step": 40500 + }, + { + "epoch": 10.102244389027431, + "grad_norm": 6.676276206970215, + "learning_rate": 9.902493765586035e-06, + "loss": 0.3413, + "step": 40510 + }, + { + "epoch": 10.104738154613466, + "grad_norm": 9.890116691589355, + "learning_rate": 9.9e-06, + "loss": 0.3215, + "step": 40520 + }, + { + "epoch": 10.107231920199501, + "grad_norm": 5.857176780700684, + "learning_rate": 9.897506234413965e-06, + "loss": 0.3393, + "step": 40530 + }, + { + "epoch": 10.109725685785536, + "grad_norm": 6.540131092071533, + "learning_rate": 9.89501246882793e-06, + "loss": 0.3765, + "step": 40540 + }, + { + "epoch": 10.11221945137157, + "grad_norm": 9.703693389892578, + "learning_rate": 9.892518703241896e-06, + "loss": 0.3769, + "step": 40550 + }, + { + "epoch": 10.114713216957606, + "grad_norm": 9.611767768859863, + "learning_rate": 9.890024937655861e-06, + "loss": 0.3857, + "step": 40560 + }, + { + "epoch": 10.11720698254364, + "grad_norm": 6.961813926696777, + "learning_rate": 9.887531172069827e-06, + "loss": 0.3274, + "step": 40570 + }, + { + "epoch": 10.119700748129675, + "grad_norm": 7.749450206756592, + "learning_rate": 9.885037406483792e-06, + "loss": 0.3409, + "step": 40580 + }, + { + "epoch": 10.12219451371571, + "grad_norm": 10.376683235168457, + "learning_rate": 9.882543640897756e-06, + "loss": 0.36, + "step": 40590 + }, + { + "epoch": 10.124688279301745, + "grad_norm": 9.793608665466309, + "learning_rate": 9.880049875311721e-06, + "loss": 0.3467, + "step": 40600 + }, + { + "epoch": 10.12718204488778, + "grad_norm": 6.481242656707764, + "learning_rate": 9.877556109725686e-06, + "loss": 0.3184, + "step": 40610 + }, + { + "epoch": 10.129675810473815, + "grad_norm": 5.596631050109863, + "learning_rate": 9.875062344139651e-06, + "loss": 0.3533, + "step": 40620 + }, + { + "epoch": 10.13216957605985, + "grad_norm": 12.358193397521973, + "learning_rate": 9.872568578553617e-06, + "loss": 0.3539, + "step": 40630 + }, + { + "epoch": 10.134663341645885, + "grad_norm": 7.618841171264648, + "learning_rate": 9.870074812967582e-06, + "loss": 0.4053, + "step": 40640 + }, + { + "epoch": 10.13715710723192, + "grad_norm": 6.739760875701904, + "learning_rate": 9.867581047381547e-06, + "loss": 0.3235, + "step": 40650 + }, + { + "epoch": 10.139650872817954, + "grad_norm": 7.255684852600098, + "learning_rate": 9.865087281795513e-06, + "loss": 0.3134, + "step": 40660 + }, + { + "epoch": 10.14214463840399, + "grad_norm": 10.463494300842285, + "learning_rate": 9.862593516209476e-06, + "loss": 0.3657, + "step": 40670 + }, + { + "epoch": 10.144638403990024, + "grad_norm": 6.290800094604492, + "learning_rate": 9.860099750623442e-06, + "loss": 0.3472, + "step": 40680 + }, + { + "epoch": 10.147132169576059, + "grad_norm": 7.880048751831055, + "learning_rate": 9.857605985037407e-06, + "loss": 0.349, + "step": 40690 + }, + { + "epoch": 10.149625935162096, + "grad_norm": 8.012491226196289, + "learning_rate": 9.855112219451372e-06, + "loss": 0.3229, + "step": 40700 + }, + { + "epoch": 10.15211970074813, + "grad_norm": 9.101916313171387, + "learning_rate": 9.85286783042394e-06, + "loss": 0.3899, + "step": 40710 + }, + { + "epoch": 10.154613466334165, + "grad_norm": 10.565949440002441, + "learning_rate": 9.850374064837906e-06, + "loss": 0.342, + "step": 40720 + }, + { + "epoch": 10.1571072319202, + "grad_norm": 6.891229629516602, + "learning_rate": 9.847880299251871e-06, + "loss": 0.3361, + "step": 40730 + }, + { + "epoch": 10.159600997506235, + "grad_norm": 7.994174957275391, + "learning_rate": 9.845386533665836e-06, + "loss": 0.3542, + "step": 40740 + }, + { + "epoch": 10.16209476309227, + "grad_norm": 13.515098571777344, + "learning_rate": 9.842892768079802e-06, + "loss": 0.371, + "step": 40750 + }, + { + "epoch": 10.164588528678305, + "grad_norm": 6.52901554107666, + "learning_rate": 9.840399002493767e-06, + "loss": 0.3398, + "step": 40760 + }, + { + "epoch": 10.16708229426434, + "grad_norm": 6.790650844573975, + "learning_rate": 9.83790523690773e-06, + "loss": 0.3099, + "step": 40770 + }, + { + "epoch": 10.169576059850375, + "grad_norm": 4.857789516448975, + "learning_rate": 9.835411471321696e-06, + "loss": 0.3564, + "step": 40780 + }, + { + "epoch": 10.17206982543641, + "grad_norm": 10.275793075561523, + "learning_rate": 9.832917705735661e-06, + "loss": 0.4056, + "step": 40790 + }, + { + "epoch": 10.174563591022444, + "grad_norm": 5.9475812911987305, + "learning_rate": 9.830423940149627e-06, + "loss": 0.3462, + "step": 40800 + }, + { + "epoch": 10.17705735660848, + "grad_norm": 7.682375431060791, + "learning_rate": 9.827930174563592e-06, + "loss": 0.3232, + "step": 40810 + }, + { + "epoch": 10.179551122194514, + "grad_norm": 7.272604942321777, + "learning_rate": 9.825436408977557e-06, + "loss": 0.3526, + "step": 40820 + }, + { + "epoch": 10.182044887780549, + "grad_norm": 9.69630241394043, + "learning_rate": 9.822942643391523e-06, + "loss": 0.3904, + "step": 40830 + }, + { + "epoch": 10.184538653366584, + "grad_norm": 5.306740760803223, + "learning_rate": 9.820448877805488e-06, + "loss": 0.2868, + "step": 40840 + }, + { + "epoch": 10.187032418952619, + "grad_norm": 6.456069469451904, + "learning_rate": 9.817955112219451e-06, + "loss": 0.3426, + "step": 40850 + }, + { + "epoch": 10.189526184538654, + "grad_norm": 13.075380325317383, + "learning_rate": 9.815461346633417e-06, + "loss": 0.311, + "step": 40860 + }, + { + "epoch": 10.192019950124688, + "grad_norm": 5.357050895690918, + "learning_rate": 9.812967581047382e-06, + "loss": 0.4198, + "step": 40870 + }, + { + "epoch": 10.194513715710723, + "grad_norm": 7.080076694488525, + "learning_rate": 9.810473815461347e-06, + "loss": 0.3124, + "step": 40880 + }, + { + "epoch": 10.197007481296758, + "grad_norm": 4.402791500091553, + "learning_rate": 9.807980049875313e-06, + "loss": 0.3291, + "step": 40890 + }, + { + "epoch": 10.199501246882793, + "grad_norm": 6.491812705993652, + "learning_rate": 9.805486284289278e-06, + "loss": 0.3769, + "step": 40900 + }, + { + "epoch": 10.201995012468828, + "grad_norm": 7.707225322723389, + "learning_rate": 9.802992518703243e-06, + "loss": 0.395, + "step": 40910 + }, + { + "epoch": 10.204488778054863, + "grad_norm": 6.598991394042969, + "learning_rate": 9.800498753117209e-06, + "loss": 0.3356, + "step": 40920 + }, + { + "epoch": 10.206982543640898, + "grad_norm": 6.032766342163086, + "learning_rate": 9.798004987531174e-06, + "loss": 0.3243, + "step": 40930 + }, + { + "epoch": 10.209476309226932, + "grad_norm": 5.5279340744018555, + "learning_rate": 9.795511221945138e-06, + "loss": 0.3259, + "step": 40940 + }, + { + "epoch": 10.211970074812967, + "grad_norm": 5.94141149520874, + "learning_rate": 9.793017456359103e-06, + "loss": 0.3158, + "step": 40950 + }, + { + "epoch": 10.214463840399002, + "grad_norm": 8.342867851257324, + "learning_rate": 9.790523690773068e-06, + "loss": 0.3472, + "step": 40960 + }, + { + "epoch": 10.216957605985037, + "grad_norm": 6.11809778213501, + "learning_rate": 9.788029925187034e-06, + "loss": 0.3245, + "step": 40970 + }, + { + "epoch": 10.219451371571072, + "grad_norm": 3.928636074066162, + "learning_rate": 9.785536159600997e-06, + "loss": 0.2654, + "step": 40980 + }, + { + "epoch": 10.221945137157107, + "grad_norm": 5.511411190032959, + "learning_rate": 9.783042394014962e-06, + "loss": 0.378, + "step": 40990 + }, + { + "epoch": 10.224438902743142, + "grad_norm": 7.285804271697998, + "learning_rate": 9.78054862842893e-06, + "loss": 0.2728, + "step": 41000 + }, + { + "epoch": 10.226932668329177, + "grad_norm": 10.05148696899414, + "learning_rate": 9.778054862842895e-06, + "loss": 0.2886, + "step": 41010 + }, + { + "epoch": 10.229426433915211, + "grad_norm": 6.61144495010376, + "learning_rate": 9.775561097256858e-06, + "loss": 0.3763, + "step": 41020 + }, + { + "epoch": 10.231920199501246, + "grad_norm": 11.291173934936523, + "learning_rate": 9.773067331670824e-06, + "loss": 0.3521, + "step": 41030 + }, + { + "epoch": 10.234413965087281, + "grad_norm": 7.441661834716797, + "learning_rate": 9.770573566084789e-06, + "loss": 0.3958, + "step": 41040 + }, + { + "epoch": 10.236907730673316, + "grad_norm": 4.365013599395752, + "learning_rate": 9.768079800498754e-06, + "loss": 0.3623, + "step": 41050 + }, + { + "epoch": 10.239401496259351, + "grad_norm": 7.878891468048096, + "learning_rate": 9.765586034912718e-06, + "loss": 0.3634, + "step": 41060 + }, + { + "epoch": 10.241895261845386, + "grad_norm": 8.221386909484863, + "learning_rate": 9.763092269326683e-06, + "loss": 0.394, + "step": 41070 + }, + { + "epoch": 10.24438902743142, + "grad_norm": 6.871973991394043, + "learning_rate": 9.760598503740649e-06, + "loss": 0.3359, + "step": 41080 + }, + { + "epoch": 10.246882793017456, + "grad_norm": 7.841533184051514, + "learning_rate": 9.758104738154614e-06, + "loss": 0.316, + "step": 41090 + }, + { + "epoch": 10.24937655860349, + "grad_norm": 10.699889183044434, + "learning_rate": 9.755610972568579e-06, + "loss": 0.3793, + "step": 41100 + }, + { + "epoch": 10.251870324189527, + "grad_norm": 7.462698459625244, + "learning_rate": 9.753117206982544e-06, + "loss": 0.3795, + "step": 41110 + }, + { + "epoch": 10.254364089775562, + "grad_norm": 5.521877765655518, + "learning_rate": 9.75062344139651e-06, + "loss": 0.3306, + "step": 41120 + }, + { + "epoch": 10.256857855361597, + "grad_norm": 6.224538803100586, + "learning_rate": 9.748129675810475e-06, + "loss": 0.4206, + "step": 41130 + }, + { + "epoch": 10.259351620947632, + "grad_norm": 10.534749984741211, + "learning_rate": 9.74563591022444e-06, + "loss": 0.3515, + "step": 41140 + }, + { + "epoch": 10.261845386533667, + "grad_norm": 7.875677585601807, + "learning_rate": 9.743142144638404e-06, + "loss": 0.3535, + "step": 41150 + }, + { + "epoch": 10.264339152119701, + "grad_norm": 6.144274711608887, + "learning_rate": 9.74064837905237e-06, + "loss": 0.3605, + "step": 41160 + }, + { + "epoch": 10.266832917705736, + "grad_norm": 9.638225555419922, + "learning_rate": 9.738154613466335e-06, + "loss": 0.3261, + "step": 41170 + }, + { + "epoch": 10.269326683291771, + "grad_norm": 13.971467018127441, + "learning_rate": 9.7356608478803e-06, + "loss": 0.3333, + "step": 41180 + }, + { + "epoch": 10.271820448877806, + "grad_norm": 7.335572242736816, + "learning_rate": 9.733167082294265e-06, + "loss": 0.3326, + "step": 41190 + }, + { + "epoch": 10.27431421446384, + "grad_norm": 5.852632999420166, + "learning_rate": 9.73067331670823e-06, + "loss": 0.3612, + "step": 41200 + }, + { + "epoch": 10.276807980049876, + "grad_norm": 6.09345006942749, + "learning_rate": 9.728179551122196e-06, + "loss": 0.3995, + "step": 41210 + }, + { + "epoch": 10.27930174563591, + "grad_norm": 5.712091445922852, + "learning_rate": 9.725685785536161e-06, + "loss": 0.3078, + "step": 41220 + }, + { + "epoch": 10.281795511221945, + "grad_norm": 8.864590644836426, + "learning_rate": 9.723192019950125e-06, + "loss": 0.3629, + "step": 41230 + }, + { + "epoch": 10.28428927680798, + "grad_norm": 8.034436225891113, + "learning_rate": 9.72069825436409e-06, + "loss": 0.3913, + "step": 41240 + }, + { + "epoch": 10.286783042394015, + "grad_norm": 7.192776203155518, + "learning_rate": 9.718204488778055e-06, + "loss": 0.3908, + "step": 41250 + }, + { + "epoch": 10.28927680798005, + "grad_norm": 5.29744291305542, + "learning_rate": 9.71571072319202e-06, + "loss": 0.2776, + "step": 41260 + }, + { + "epoch": 10.291770573566085, + "grad_norm": 8.176728248596191, + "learning_rate": 9.713216957605986e-06, + "loss": 0.3514, + "step": 41270 + }, + { + "epoch": 10.29426433915212, + "grad_norm": 6.879700660705566, + "learning_rate": 9.710723192019951e-06, + "loss": 0.3185, + "step": 41280 + }, + { + "epoch": 10.296758104738155, + "grad_norm": 5.136245250701904, + "learning_rate": 9.708229426433917e-06, + "loss": 0.2969, + "step": 41290 + }, + { + "epoch": 10.29925187032419, + "grad_norm": 12.164958953857422, + "learning_rate": 9.705735660847882e-06, + "loss": 0.3606, + "step": 41300 + }, + { + "epoch": 10.301745635910224, + "grad_norm": 5.413372993469238, + "learning_rate": 9.703241895261846e-06, + "loss": 0.3706, + "step": 41310 + }, + { + "epoch": 10.30423940149626, + "grad_norm": 5.834495544433594, + "learning_rate": 9.700748129675811e-06, + "loss": 0.2906, + "step": 41320 + }, + { + "epoch": 10.306733167082294, + "grad_norm": 6.524380207061768, + "learning_rate": 9.698254364089776e-06, + "loss": 0.3777, + "step": 41330 + }, + { + "epoch": 10.309226932668329, + "grad_norm": 9.674921989440918, + "learning_rate": 9.695760598503741e-06, + "loss": 0.3734, + "step": 41340 + }, + { + "epoch": 10.311720698254364, + "grad_norm": 7.306479454040527, + "learning_rate": 9.693266832917707e-06, + "loss": 0.4354, + "step": 41350 + }, + { + "epoch": 10.314214463840399, + "grad_norm": 5.288915157318115, + "learning_rate": 9.690773067331672e-06, + "loss": 0.2979, + "step": 41360 + }, + { + "epoch": 10.316708229426434, + "grad_norm": 7.090877056121826, + "learning_rate": 9.688279301745637e-06, + "loss": 0.3617, + "step": 41370 + }, + { + "epoch": 10.319201995012468, + "grad_norm": 10.45274543762207, + "learning_rate": 9.685785536159603e-06, + "loss": 0.3738, + "step": 41380 + }, + { + "epoch": 10.321695760598503, + "grad_norm": 6.137414455413818, + "learning_rate": 9.683291770573568e-06, + "loss": 0.3855, + "step": 41390 + }, + { + "epoch": 10.324189526184538, + "grad_norm": 6.6708221435546875, + "learning_rate": 9.680798004987532e-06, + "loss": 0.3804, + "step": 41400 + }, + { + "epoch": 10.326683291770573, + "grad_norm": 7.918160438537598, + "learning_rate": 9.678304239401497e-06, + "loss": 0.3436, + "step": 41410 + }, + { + "epoch": 10.329177057356608, + "grad_norm": 5.83142614364624, + "learning_rate": 9.675810473815462e-06, + "loss": 0.3542, + "step": 41420 + }, + { + "epoch": 10.331670822942643, + "grad_norm": 7.670297145843506, + "learning_rate": 9.673316708229428e-06, + "loss": 0.3867, + "step": 41430 + }, + { + "epoch": 10.334164588528678, + "grad_norm": 5.303461074829102, + "learning_rate": 9.670822942643391e-06, + "loss": 0.3995, + "step": 41440 + }, + { + "epoch": 10.336658354114713, + "grad_norm": 6.452824115753174, + "learning_rate": 9.668329177057357e-06, + "loss": 0.3595, + "step": 41450 + }, + { + "epoch": 10.339152119700747, + "grad_norm": 14.923982620239258, + "learning_rate": 9.665835411471322e-06, + "loss": 0.3651, + "step": 41460 + }, + { + "epoch": 10.341645885286782, + "grad_norm": 5.03542947769165, + "learning_rate": 9.663341645885289e-06, + "loss": 0.3361, + "step": 41470 + }, + { + "epoch": 10.344139650872817, + "grad_norm": 7.917986869812012, + "learning_rate": 9.660847880299252e-06, + "loss": 0.3642, + "step": 41480 + }, + { + "epoch": 10.346633416458852, + "grad_norm": 8.379772186279297, + "learning_rate": 9.658354114713218e-06, + "loss": 0.3277, + "step": 41490 + }, + { + "epoch": 10.349127182044889, + "grad_norm": 7.017518997192383, + "learning_rate": 9.655860349127183e-06, + "loss": 0.3944, + "step": 41500 + }, + { + "epoch": 10.351620947630924, + "grad_norm": 5.158971309661865, + "learning_rate": 9.653366583541148e-06, + "loss": 0.3771, + "step": 41510 + }, + { + "epoch": 10.354114713216958, + "grad_norm": 5.925132751464844, + "learning_rate": 9.650872817955112e-06, + "loss": 0.3433, + "step": 41520 + }, + { + "epoch": 10.356608478802993, + "grad_norm": 8.094498634338379, + "learning_rate": 9.648379052369077e-06, + "loss": 0.3903, + "step": 41530 + }, + { + "epoch": 10.359102244389028, + "grad_norm": 5.886862277984619, + "learning_rate": 9.645885286783043e-06, + "loss": 0.3353, + "step": 41540 + }, + { + "epoch": 10.361596009975063, + "grad_norm": 7.800416469573975, + "learning_rate": 9.643391521197008e-06, + "loss": 0.3465, + "step": 41550 + }, + { + "epoch": 10.364089775561098, + "grad_norm": 5.397622108459473, + "learning_rate": 9.640897755610973e-06, + "loss": 0.4301, + "step": 41560 + }, + { + "epoch": 10.366583541147133, + "grad_norm": 8.980853080749512, + "learning_rate": 9.638403990024939e-06, + "loss": 0.3571, + "step": 41570 + }, + { + "epoch": 10.369077306733168, + "grad_norm": 9.865492820739746, + "learning_rate": 9.635910224438904e-06, + "loss": 0.322, + "step": 41580 + }, + { + "epoch": 10.371571072319203, + "grad_norm": 7.520500183105469, + "learning_rate": 9.63341645885287e-06, + "loss": 0.4147, + "step": 41590 + }, + { + "epoch": 10.374064837905237, + "grad_norm": 8.898337364196777, + "learning_rate": 9.630922693266833e-06, + "loss": 0.3443, + "step": 41600 + }, + { + "epoch": 10.376558603491272, + "grad_norm": 7.803848743438721, + "learning_rate": 9.628428927680798e-06, + "loss": 0.3786, + "step": 41610 + }, + { + "epoch": 10.379052369077307, + "grad_norm": 9.381965637207031, + "learning_rate": 9.625935162094763e-06, + "loss": 0.4683, + "step": 41620 + }, + { + "epoch": 10.381546134663342, + "grad_norm": 6.264208793640137, + "learning_rate": 9.623441396508729e-06, + "loss": 0.3297, + "step": 41630 + }, + { + "epoch": 10.384039900249377, + "grad_norm": 6.923156261444092, + "learning_rate": 9.620947630922694e-06, + "loss": 0.3056, + "step": 41640 + }, + { + "epoch": 10.386533665835412, + "grad_norm": 7.695561408996582, + "learning_rate": 9.61845386533666e-06, + "loss": 0.424, + "step": 41650 + }, + { + "epoch": 10.389027431421447, + "grad_norm": 5.747522354125977, + "learning_rate": 9.615960099750625e-06, + "loss": 0.3297, + "step": 41660 + }, + { + "epoch": 10.391521197007481, + "grad_norm": 5.526257038116455, + "learning_rate": 9.61346633416459e-06, + "loss": 0.3702, + "step": 41670 + }, + { + "epoch": 10.394014962593516, + "grad_norm": 7.8705244064331055, + "learning_rate": 9.610972568578555e-06, + "loss": 0.346, + "step": 41680 + }, + { + "epoch": 10.396508728179551, + "grad_norm": 7.783495903015137, + "learning_rate": 9.608478802992519e-06, + "loss": 0.3441, + "step": 41690 + }, + { + "epoch": 10.399002493765586, + "grad_norm": 8.003525733947754, + "learning_rate": 9.605985037406484e-06, + "loss": 0.3707, + "step": 41700 + }, + { + "epoch": 10.401496259351621, + "grad_norm": 7.88194465637207, + "learning_rate": 9.60349127182045e-06, + "loss": 0.3363, + "step": 41710 + }, + { + "epoch": 10.403990024937656, + "grad_norm": 7.859609127044678, + "learning_rate": 9.600997506234415e-06, + "loss": 0.3417, + "step": 41720 + }, + { + "epoch": 10.40648379052369, + "grad_norm": 6.967172145843506, + "learning_rate": 9.59850374064838e-06, + "loss": 0.3449, + "step": 41730 + }, + { + "epoch": 10.408977556109726, + "grad_norm": 6.901447772979736, + "learning_rate": 9.596009975062345e-06, + "loss": 0.4124, + "step": 41740 + }, + { + "epoch": 10.41147132169576, + "grad_norm": 7.131737232208252, + "learning_rate": 9.59351620947631e-06, + "loss": 0.3701, + "step": 41750 + }, + { + "epoch": 10.413965087281795, + "grad_norm": 8.898688316345215, + "learning_rate": 9.591022443890276e-06, + "loss": 0.323, + "step": 41760 + }, + { + "epoch": 10.41645885286783, + "grad_norm": 7.909594535827637, + "learning_rate": 9.58852867830424e-06, + "loss": 0.3552, + "step": 41770 + }, + { + "epoch": 10.418952618453865, + "grad_norm": 5.500792026519775, + "learning_rate": 9.586034912718205e-06, + "loss": 0.3423, + "step": 41780 + }, + { + "epoch": 10.4214463840399, + "grad_norm": 9.347502708435059, + "learning_rate": 9.58354114713217e-06, + "loss": 0.3671, + "step": 41790 + }, + { + "epoch": 10.423940149625935, + "grad_norm": 6.794683933258057, + "learning_rate": 9.581047381546136e-06, + "loss": 0.4234, + "step": 41800 + }, + { + "epoch": 10.42643391521197, + "grad_norm": 13.040794372558594, + "learning_rate": 9.5785536159601e-06, + "loss": 0.3615, + "step": 41810 + }, + { + "epoch": 10.428927680798004, + "grad_norm": 8.55950927734375, + "learning_rate": 9.576059850374066e-06, + "loss": 0.3848, + "step": 41820 + }, + { + "epoch": 10.43142144638404, + "grad_norm": 8.469449043273926, + "learning_rate": 9.573566084788032e-06, + "loss": 0.315, + "step": 41830 + }, + { + "epoch": 10.433915211970074, + "grad_norm": 6.907922744750977, + "learning_rate": 9.571072319201997e-06, + "loss": 0.4197, + "step": 41840 + }, + { + "epoch": 10.436408977556109, + "grad_norm": 5.317709445953369, + "learning_rate": 9.56857855361596e-06, + "loss": 0.3535, + "step": 41850 + }, + { + "epoch": 10.438902743142144, + "grad_norm": 6.60930061340332, + "learning_rate": 9.566084788029926e-06, + "loss": 0.3643, + "step": 41860 + }, + { + "epoch": 10.441396508728179, + "grad_norm": 9.468186378479004, + "learning_rate": 9.563591022443891e-06, + "loss": 0.3384, + "step": 41870 + }, + { + "epoch": 10.443890274314214, + "grad_norm": 5.716958522796631, + "learning_rate": 9.561097256857856e-06, + "loss": 0.3265, + "step": 41880 + }, + { + "epoch": 10.446384039900249, + "grad_norm": 7.9432172775268555, + "learning_rate": 9.558603491271822e-06, + "loss": 0.3969, + "step": 41890 + }, + { + "epoch": 10.448877805486283, + "grad_norm": 11.144307136535645, + "learning_rate": 9.556109725685785e-06, + "loss": 0.3584, + "step": 41900 + }, + { + "epoch": 10.451371571072318, + "grad_norm": 9.144835472106934, + "learning_rate": 9.55361596009975e-06, + "loss": 0.3963, + "step": 41910 + }, + { + "epoch": 10.453865336658355, + "grad_norm": 6.574597358703613, + "learning_rate": 9.551122194513716e-06, + "loss": 0.3258, + "step": 41920 + }, + { + "epoch": 10.45635910224439, + "grad_norm": 5.504093647003174, + "learning_rate": 9.548628428927681e-06, + "loss": 0.377, + "step": 41930 + }, + { + "epoch": 10.458852867830425, + "grad_norm": 5.041776180267334, + "learning_rate": 9.546134663341647e-06, + "loss": 0.3502, + "step": 41940 + }, + { + "epoch": 10.46134663341646, + "grad_norm": 8.293055534362793, + "learning_rate": 9.543640897755612e-06, + "loss": 0.3173, + "step": 41950 + }, + { + "epoch": 10.463840399002494, + "grad_norm": 6.408956527709961, + "learning_rate": 9.541147132169577e-06, + "loss": 0.3826, + "step": 41960 + }, + { + "epoch": 10.46633416458853, + "grad_norm": 9.970688819885254, + "learning_rate": 9.538653366583542e-06, + "loss": 0.3575, + "step": 41970 + }, + { + "epoch": 10.468827930174564, + "grad_norm": 6.766303062438965, + "learning_rate": 9.536159600997506e-06, + "loss": 0.3414, + "step": 41980 + }, + { + "epoch": 10.471321695760599, + "grad_norm": 5.876713275909424, + "learning_rate": 9.533665835411471e-06, + "loss": 0.3462, + "step": 41990 + }, + { + "epoch": 10.473815461346634, + "grad_norm": 8.713937759399414, + "learning_rate": 9.531172069825437e-06, + "loss": 0.3541, + "step": 42000 + }, + { + "epoch": 10.476309226932669, + "grad_norm": 5.201702117919922, + "learning_rate": 9.528678304239402e-06, + "loss": 0.3199, + "step": 42010 + }, + { + "epoch": 10.478802992518704, + "grad_norm": 12.989068031311035, + "learning_rate": 9.526184538653367e-06, + "loss": 0.3021, + "step": 42020 + }, + { + "epoch": 10.481296758104738, + "grad_norm": 7.327224254608154, + "learning_rate": 9.523690773067333e-06, + "loss": 0.3544, + "step": 42030 + }, + { + "epoch": 10.483790523690773, + "grad_norm": 10.310848236083984, + "learning_rate": 9.521197007481298e-06, + "loss": 0.3914, + "step": 42040 + }, + { + "epoch": 10.486284289276808, + "grad_norm": 6.748259544372559, + "learning_rate": 9.518703241895263e-06, + "loss": 0.3456, + "step": 42050 + }, + { + "epoch": 10.488778054862843, + "grad_norm": 12.408527374267578, + "learning_rate": 9.516209476309227e-06, + "loss": 0.3296, + "step": 42060 + }, + { + "epoch": 10.491271820448878, + "grad_norm": 8.86363697052002, + "learning_rate": 9.513715710723192e-06, + "loss": 0.3759, + "step": 42070 + }, + { + "epoch": 10.493765586034913, + "grad_norm": 9.177289962768555, + "learning_rate": 9.511221945137157e-06, + "loss": 0.375, + "step": 42080 + }, + { + "epoch": 10.496259351620948, + "grad_norm": 8.17337703704834, + "learning_rate": 9.508728179551123e-06, + "loss": 0.3493, + "step": 42090 + }, + { + "epoch": 10.498753117206983, + "grad_norm": 7.164012908935547, + "learning_rate": 9.506234413965088e-06, + "loss": 0.3652, + "step": 42100 + }, + { + "epoch": 10.501246882793017, + "grad_norm": 6.665464401245117, + "learning_rate": 9.503740648379053e-06, + "loss": 0.3838, + "step": 42110 + }, + { + "epoch": 10.503740648379052, + "grad_norm": 7.328868389129639, + "learning_rate": 9.501246882793019e-06, + "loss": 0.3464, + "step": 42120 + }, + { + "epoch": 10.506234413965087, + "grad_norm": 6.057427406311035, + "learning_rate": 9.498753117206984e-06, + "loss": 0.321, + "step": 42130 + }, + { + "epoch": 10.508728179551122, + "grad_norm": 5.329481601715088, + "learning_rate": 9.49625935162095e-06, + "loss": 0.2875, + "step": 42140 + }, + { + "epoch": 10.511221945137157, + "grad_norm": 7.551417827606201, + "learning_rate": 9.493765586034913e-06, + "loss": 0.3713, + "step": 42150 + }, + { + "epoch": 10.513715710723192, + "grad_norm": 5.047544956207275, + "learning_rate": 9.491271820448878e-06, + "loss": 0.4153, + "step": 42160 + }, + { + "epoch": 10.516209476309227, + "grad_norm": 5.810826301574707, + "learning_rate": 9.488778054862844e-06, + "loss": 0.3527, + "step": 42170 + }, + { + "epoch": 10.518703241895262, + "grad_norm": 5.0741753578186035, + "learning_rate": 9.486284289276809e-06, + "loss": 0.3569, + "step": 42180 + }, + { + "epoch": 10.521197007481296, + "grad_norm": 5.9377641677856445, + "learning_rate": 9.483790523690774e-06, + "loss": 0.3968, + "step": 42190 + }, + { + "epoch": 10.523690773067331, + "grad_norm": 8.227517127990723, + "learning_rate": 9.48129675810474e-06, + "loss": 0.3678, + "step": 42200 + }, + { + "epoch": 10.526184538653366, + "grad_norm": 7.43702507019043, + "learning_rate": 9.478802992518705e-06, + "loss": 0.3083, + "step": 42210 + }, + { + "epoch": 10.528678304239401, + "grad_norm": 6.5147199630737305, + "learning_rate": 9.47630922693267e-06, + "loss": 0.2658, + "step": 42220 + }, + { + "epoch": 10.531172069825436, + "grad_norm": 6.781185150146484, + "learning_rate": 9.473815461346634e-06, + "loss": 0.3596, + "step": 42230 + }, + { + "epoch": 10.53366583541147, + "grad_norm": 7.411608695983887, + "learning_rate": 9.471321695760599e-06, + "loss": 0.2931, + "step": 42240 + }, + { + "epoch": 10.536159600997506, + "grad_norm": 12.974010467529297, + "learning_rate": 9.468827930174564e-06, + "loss": 0.2816, + "step": 42250 + }, + { + "epoch": 10.53865336658354, + "grad_norm": 11.239740371704102, + "learning_rate": 9.46633416458853e-06, + "loss": 0.3805, + "step": 42260 + }, + { + "epoch": 10.541147132169575, + "grad_norm": 7.265082836151123, + "learning_rate": 9.463840399002493e-06, + "loss": 0.3101, + "step": 42270 + }, + { + "epoch": 10.54364089775561, + "grad_norm": 7.100124359130859, + "learning_rate": 9.461346633416459e-06, + "loss": 0.343, + "step": 42280 + }, + { + "epoch": 10.546134663341645, + "grad_norm": 5.751792907714844, + "learning_rate": 9.458852867830426e-06, + "loss": 0.3176, + "step": 42290 + }, + { + "epoch": 10.548628428927682, + "grad_norm": 6.716248989105225, + "learning_rate": 9.456359102244391e-06, + "loss": 0.3307, + "step": 42300 + }, + { + "epoch": 10.551122194513717, + "grad_norm": 9.329224586486816, + "learning_rate": 9.453865336658355e-06, + "loss": 0.3778, + "step": 42310 + }, + { + "epoch": 10.553615960099751, + "grad_norm": 8.02515697479248, + "learning_rate": 9.45137157107232e-06, + "loss": 0.4504, + "step": 42320 + }, + { + "epoch": 10.556109725685786, + "grad_norm": 8.72829818725586, + "learning_rate": 9.448877805486285e-06, + "loss": 0.4172, + "step": 42330 + }, + { + "epoch": 10.558603491271821, + "grad_norm": 6.4581780433654785, + "learning_rate": 9.44638403990025e-06, + "loss": 0.328, + "step": 42340 + }, + { + "epoch": 10.561097256857856, + "grad_norm": 10.020159721374512, + "learning_rate": 9.443890274314216e-06, + "loss": 0.4516, + "step": 42350 + }, + { + "epoch": 10.563591022443891, + "grad_norm": 8.494389533996582, + "learning_rate": 9.44139650872818e-06, + "loss": 0.3872, + "step": 42360 + }, + { + "epoch": 10.566084788029926, + "grad_norm": 7.352220058441162, + "learning_rate": 9.438902743142145e-06, + "loss": 0.2962, + "step": 42370 + }, + { + "epoch": 10.56857855361596, + "grad_norm": 7.202230930328369, + "learning_rate": 9.43640897755611e-06, + "loss": 0.3706, + "step": 42380 + }, + { + "epoch": 10.571072319201996, + "grad_norm": 6.778585433959961, + "learning_rate": 9.433915211970075e-06, + "loss": 0.3202, + "step": 42390 + }, + { + "epoch": 10.57356608478803, + "grad_norm": 10.143278121948242, + "learning_rate": 9.43142144638404e-06, + "loss": 0.3963, + "step": 42400 + }, + { + "epoch": 10.576059850374065, + "grad_norm": 9.119635581970215, + "learning_rate": 9.428927680798006e-06, + "loss": 0.3016, + "step": 42410 + }, + { + "epoch": 10.5785536159601, + "grad_norm": 6.856378555297852, + "learning_rate": 9.426433915211971e-06, + "loss": 0.3479, + "step": 42420 + }, + { + "epoch": 10.581047381546135, + "grad_norm": 5.599526882171631, + "learning_rate": 9.423940149625937e-06, + "loss": 0.3469, + "step": 42430 + }, + { + "epoch": 10.58354114713217, + "grad_norm": 6.967793941497803, + "learning_rate": 9.4214463840399e-06, + "loss": 0.3133, + "step": 42440 + }, + { + "epoch": 10.586034912718205, + "grad_norm": 5.5582404136657715, + "learning_rate": 9.418952618453865e-06, + "loss": 0.3838, + "step": 42450 + }, + { + "epoch": 10.58852867830424, + "grad_norm": 11.962347030639648, + "learning_rate": 9.41645885286783e-06, + "loss": 0.3827, + "step": 42460 + }, + { + "epoch": 10.591022443890274, + "grad_norm": 5.947192668914795, + "learning_rate": 9.413965087281796e-06, + "loss": 0.3638, + "step": 42470 + }, + { + "epoch": 10.59351620947631, + "grad_norm": 5.291638374328613, + "learning_rate": 9.411471321695761e-06, + "loss": 0.3622, + "step": 42480 + }, + { + "epoch": 10.596009975062344, + "grad_norm": 6.401669979095459, + "learning_rate": 9.408977556109727e-06, + "loss": 0.3045, + "step": 42490 + }, + { + "epoch": 10.598503740648379, + "grad_norm": 8.00327205657959, + "learning_rate": 9.406483790523692e-06, + "loss": 0.3862, + "step": 42500 + }, + { + "epoch": 10.600997506234414, + "grad_norm": 8.175292015075684, + "learning_rate": 9.403990024937657e-06, + "loss": 0.3123, + "step": 42510 + }, + { + "epoch": 10.603491271820449, + "grad_norm": 13.475317001342773, + "learning_rate": 9.401496259351621e-06, + "loss": 0.3399, + "step": 42520 + }, + { + "epoch": 10.605985037406484, + "grad_norm": 8.634927749633789, + "learning_rate": 9.399002493765586e-06, + "loss": 0.369, + "step": 42530 + }, + { + "epoch": 10.608478802992519, + "grad_norm": 8.839632034301758, + "learning_rate": 9.396508728179552e-06, + "loss": 0.3389, + "step": 42540 + }, + { + "epoch": 10.610972568578553, + "grad_norm": 4.931496620178223, + "learning_rate": 9.394014962593517e-06, + "loss": 0.3422, + "step": 42550 + }, + { + "epoch": 10.613466334164588, + "grad_norm": 5.307199001312256, + "learning_rate": 9.391521197007482e-06, + "loss": 0.2964, + "step": 42560 + }, + { + "epoch": 10.615960099750623, + "grad_norm": 11.918758392333984, + "learning_rate": 9.389027431421447e-06, + "loss": 0.3132, + "step": 42570 + }, + { + "epoch": 10.618453865336658, + "grad_norm": 10.152669906616211, + "learning_rate": 9.386533665835413e-06, + "loss": 0.3228, + "step": 42580 + }, + { + "epoch": 10.620947630922693, + "grad_norm": 8.567078590393066, + "learning_rate": 9.384039900249378e-06, + "loss": 0.4226, + "step": 42590 + }, + { + "epoch": 10.623441396508728, + "grad_norm": 9.373230934143066, + "learning_rate": 9.381546134663343e-06, + "loss": 0.3664, + "step": 42600 + }, + { + "epoch": 10.625935162094763, + "grad_norm": 7.585546016693115, + "learning_rate": 9.379052369077307e-06, + "loss": 0.3636, + "step": 42610 + }, + { + "epoch": 10.628428927680797, + "grad_norm": 6.881632328033447, + "learning_rate": 9.376558603491272e-06, + "loss": 0.304, + "step": 42620 + }, + { + "epoch": 10.630922693266832, + "grad_norm": 5.626585483551025, + "learning_rate": 9.374064837905238e-06, + "loss": 0.3563, + "step": 42630 + }, + { + "epoch": 10.633416458852867, + "grad_norm": 14.296299934387207, + "learning_rate": 9.371571072319203e-06, + "loss": 0.3678, + "step": 42640 + }, + { + "epoch": 10.635910224438902, + "grad_norm": 9.416000366210938, + "learning_rate": 9.369077306733168e-06, + "loss": 0.3737, + "step": 42650 + }, + { + "epoch": 10.638403990024937, + "grad_norm": 7.975756645202637, + "learning_rate": 9.366583541147134e-06, + "loss": 0.3162, + "step": 42660 + }, + { + "epoch": 10.640897755610972, + "grad_norm": 10.369644165039062, + "learning_rate": 9.364089775561099e-06, + "loss": 0.3617, + "step": 42670 + }, + { + "epoch": 10.643391521197007, + "grad_norm": 6.275656700134277, + "learning_rate": 9.361596009975064e-06, + "loss": 0.3393, + "step": 42680 + }, + { + "epoch": 10.645885286783042, + "grad_norm": 8.759428977966309, + "learning_rate": 9.359102244389028e-06, + "loss": 0.3428, + "step": 42690 + }, + { + "epoch": 10.648379052369076, + "grad_norm": 5.912599563598633, + "learning_rate": 9.356608478802993e-06, + "loss": 0.3031, + "step": 42700 + }, + { + "epoch": 10.650872817955111, + "grad_norm": 6.889318943023682, + "learning_rate": 9.354114713216958e-06, + "loss": 0.3897, + "step": 42710 + }, + { + "epoch": 10.653366583541148, + "grad_norm": 5.2232346534729, + "learning_rate": 9.351620947630924e-06, + "loss": 0.3764, + "step": 42720 + }, + { + "epoch": 10.655860349127183, + "grad_norm": 9.666282653808594, + "learning_rate": 9.349127182044887e-06, + "loss": 0.355, + "step": 42730 + }, + { + "epoch": 10.658354114713218, + "grad_norm": 6.856553077697754, + "learning_rate": 9.346633416458853e-06, + "loss": 0.3644, + "step": 42740 + }, + { + "epoch": 10.660847880299253, + "grad_norm": 7.544265270233154, + "learning_rate": 9.344139650872818e-06, + "loss": 0.3547, + "step": 42750 + }, + { + "epoch": 10.663341645885287, + "grad_norm": 9.202102661132812, + "learning_rate": 9.341645885286785e-06, + "loss": 0.3561, + "step": 42760 + }, + { + "epoch": 10.665835411471322, + "grad_norm": 5.572080135345459, + "learning_rate": 9.339152119700749e-06, + "loss": 0.379, + "step": 42770 + }, + { + "epoch": 10.668329177057357, + "grad_norm": 7.436700820922852, + "learning_rate": 9.336658354114714e-06, + "loss": 0.341, + "step": 42780 + }, + { + "epoch": 10.670822942643392, + "grad_norm": 7.51735782623291, + "learning_rate": 9.33416458852868e-06, + "loss": 0.3007, + "step": 42790 + }, + { + "epoch": 10.673316708229427, + "grad_norm": 7.93154239654541, + "learning_rate": 9.331670822942645e-06, + "loss": 0.3787, + "step": 42800 + }, + { + "epoch": 10.675810473815462, + "grad_norm": 7.7301106452941895, + "learning_rate": 9.329177057356608e-06, + "loss": 0.3315, + "step": 42810 + }, + { + "epoch": 10.678304239401497, + "grad_norm": 4.633336544036865, + "learning_rate": 9.326683291770573e-06, + "loss": 0.4015, + "step": 42820 + }, + { + "epoch": 10.680798004987532, + "grad_norm": 8.565386772155762, + "learning_rate": 9.324189526184539e-06, + "loss": 0.3873, + "step": 42830 + }, + { + "epoch": 10.683291770573566, + "grad_norm": 5.983166694641113, + "learning_rate": 9.321695760598504e-06, + "loss": 0.3298, + "step": 42840 + }, + { + "epoch": 10.685785536159601, + "grad_norm": 8.084731101989746, + "learning_rate": 9.31920199501247e-06, + "loss": 0.2891, + "step": 42850 + }, + { + "epoch": 10.688279301745636, + "grad_norm": 9.424635887145996, + "learning_rate": 9.316708229426435e-06, + "loss": 0.3609, + "step": 42860 + }, + { + "epoch": 10.690773067331671, + "grad_norm": 4.796233654022217, + "learning_rate": 9.3142144638404e-06, + "loss": 0.3446, + "step": 42870 + }, + { + "epoch": 10.693266832917706, + "grad_norm": 8.594659805297852, + "learning_rate": 9.311720698254365e-06, + "loss": 0.3575, + "step": 42880 + }, + { + "epoch": 10.69576059850374, + "grad_norm": 5.583627223968506, + "learning_rate": 9.30922693266833e-06, + "loss": 0.3207, + "step": 42890 + }, + { + "epoch": 10.698254364089776, + "grad_norm": 7.472324848175049, + "learning_rate": 9.306733167082294e-06, + "loss": 0.3412, + "step": 42900 + }, + { + "epoch": 10.70074812967581, + "grad_norm": 5.4451584815979, + "learning_rate": 9.30423940149626e-06, + "loss": 0.3852, + "step": 42910 + }, + { + "epoch": 10.703241895261845, + "grad_norm": 8.073676109313965, + "learning_rate": 9.301745635910225e-06, + "loss": 0.3938, + "step": 42920 + }, + { + "epoch": 10.70573566084788, + "grad_norm": 5.401284694671631, + "learning_rate": 9.29925187032419e-06, + "loss": 0.2982, + "step": 42930 + }, + { + "epoch": 10.708229426433915, + "grad_norm": 10.709132194519043, + "learning_rate": 9.296758104738155e-06, + "loss": 0.3338, + "step": 42940 + }, + { + "epoch": 10.71072319201995, + "grad_norm": 6.343472957611084, + "learning_rate": 9.29426433915212e-06, + "loss": 0.3251, + "step": 42950 + }, + { + "epoch": 10.713216957605985, + "grad_norm": 7.1593499183654785, + "learning_rate": 9.291770573566086e-06, + "loss": 0.332, + "step": 42960 + }, + { + "epoch": 10.71571072319202, + "grad_norm": 7.830443382263184, + "learning_rate": 9.289276807980051e-06, + "loss": 0.4038, + "step": 42970 + }, + { + "epoch": 10.718204488778055, + "grad_norm": 8.076725959777832, + "learning_rate": 9.286783042394015e-06, + "loss": 0.3343, + "step": 42980 + }, + { + "epoch": 10.72069825436409, + "grad_norm": 7.131729602813721, + "learning_rate": 9.28428927680798e-06, + "loss": 0.316, + "step": 42990 + }, + { + "epoch": 10.723192019950124, + "grad_norm": 6.504873752593994, + "learning_rate": 9.281795511221946e-06, + "loss": 0.3059, + "step": 43000 + }, + { + "epoch": 10.72568578553616, + "grad_norm": 6.572418212890625, + "learning_rate": 9.279301745635911e-06, + "loss": 0.3717, + "step": 43010 + }, + { + "epoch": 10.728179551122194, + "grad_norm": 7.904941558837891, + "learning_rate": 9.276807980049876e-06, + "loss": 0.405, + "step": 43020 + }, + { + "epoch": 10.730673316708229, + "grad_norm": 7.299516201019287, + "learning_rate": 9.274314214463842e-06, + "loss": 0.3487, + "step": 43030 + }, + { + "epoch": 10.733167082294264, + "grad_norm": 10.756522178649902, + "learning_rate": 9.271820448877807e-06, + "loss": 0.3513, + "step": 43040 + }, + { + "epoch": 10.735660847880299, + "grad_norm": 5.3038010597229, + "learning_rate": 9.269326683291772e-06, + "loss": 0.2722, + "step": 43050 + }, + { + "epoch": 10.738154613466333, + "grad_norm": 9.962454795837402, + "learning_rate": 9.266832917705736e-06, + "loss": 0.3812, + "step": 43060 + }, + { + "epoch": 10.740648379052368, + "grad_norm": 6.948879718780518, + "learning_rate": 9.264339152119701e-06, + "loss": 0.3602, + "step": 43070 + }, + { + "epoch": 10.743142144638403, + "grad_norm": 6.786881923675537, + "learning_rate": 9.261845386533666e-06, + "loss": 0.4163, + "step": 43080 + }, + { + "epoch": 10.745635910224438, + "grad_norm": 9.627817153930664, + "learning_rate": 9.259351620947632e-06, + "loss": 0.3901, + "step": 43090 + }, + { + "epoch": 10.748129675810475, + "grad_norm": 6.600256443023682, + "learning_rate": 9.256857855361597e-06, + "loss": 0.324, + "step": 43100 + }, + { + "epoch": 10.75062344139651, + "grad_norm": 6.746035575866699, + "learning_rate": 9.254364089775562e-06, + "loss": 0.3914, + "step": 43110 + }, + { + "epoch": 10.753117206982544, + "grad_norm": 8.608478546142578, + "learning_rate": 9.251870324189528e-06, + "loss": 0.3469, + "step": 43120 + }, + { + "epoch": 10.75561097256858, + "grad_norm": 7.662023544311523, + "learning_rate": 9.249376558603493e-06, + "loss": 0.3648, + "step": 43130 + }, + { + "epoch": 10.758104738154614, + "grad_norm": 7.992726802825928, + "learning_rate": 9.246882793017458e-06, + "loss": 0.3541, + "step": 43140 + }, + { + "epoch": 10.760598503740649, + "grad_norm": 6.671484470367432, + "learning_rate": 9.244389027431422e-06, + "loss": 0.3287, + "step": 43150 + }, + { + "epoch": 10.763092269326684, + "grad_norm": 8.085719108581543, + "learning_rate": 9.241895261845387e-06, + "loss": 0.3401, + "step": 43160 + }, + { + "epoch": 10.765586034912719, + "grad_norm": 5.610658168792725, + "learning_rate": 9.239401496259353e-06, + "loss": 0.3296, + "step": 43170 + }, + { + "epoch": 10.768079800498754, + "grad_norm": 8.575346946716309, + "learning_rate": 9.236907730673318e-06, + "loss": 0.3825, + "step": 43180 + }, + { + "epoch": 10.770573566084789, + "grad_norm": 9.167414665222168, + "learning_rate": 9.234413965087281e-06, + "loss": 0.3747, + "step": 43190 + }, + { + "epoch": 10.773067331670823, + "grad_norm": 9.802268028259277, + "learning_rate": 9.231920199501247e-06, + "loss": 0.323, + "step": 43200 + }, + { + "epoch": 10.775561097256858, + "grad_norm": 5.6160101890563965, + "learning_rate": 9.229426433915212e-06, + "loss": 0.34, + "step": 43210 + }, + { + "epoch": 10.778054862842893, + "grad_norm": 6.598016738891602, + "learning_rate": 9.226932668329179e-06, + "loss": 0.3541, + "step": 43220 + }, + { + "epoch": 10.780548628428928, + "grad_norm": 7.513404369354248, + "learning_rate": 9.224438902743143e-06, + "loss": 0.3587, + "step": 43230 + }, + { + "epoch": 10.783042394014963, + "grad_norm": 8.931615829467773, + "learning_rate": 9.221945137157108e-06, + "loss": 0.3301, + "step": 43240 + }, + { + "epoch": 10.785536159600998, + "grad_norm": 8.84310245513916, + "learning_rate": 9.219451371571073e-06, + "loss": 0.3577, + "step": 43250 + }, + { + "epoch": 10.788029925187033, + "grad_norm": 6.9853668212890625, + "learning_rate": 9.216957605985039e-06, + "loss": 0.3472, + "step": 43260 + }, + { + "epoch": 10.790523690773068, + "grad_norm": 8.114693641662598, + "learning_rate": 9.214463840399002e-06, + "loss": 0.3509, + "step": 43270 + }, + { + "epoch": 10.793017456359102, + "grad_norm": 6.722007751464844, + "learning_rate": 9.211970074812968e-06, + "loss": 0.3194, + "step": 43280 + }, + { + "epoch": 10.795511221945137, + "grad_norm": 11.021949768066406, + "learning_rate": 9.209476309226933e-06, + "loss": 0.3875, + "step": 43290 + }, + { + "epoch": 10.798004987531172, + "grad_norm": 9.12653636932373, + "learning_rate": 9.206982543640898e-06, + "loss": 0.3549, + "step": 43300 + }, + { + "epoch": 10.800498753117207, + "grad_norm": 6.999925136566162, + "learning_rate": 9.204488778054863e-06, + "loss": 0.3565, + "step": 43310 + }, + { + "epoch": 10.802992518703242, + "grad_norm": 8.389660835266113, + "learning_rate": 9.201995012468829e-06, + "loss": 0.2577, + "step": 43320 + }, + { + "epoch": 10.805486284289277, + "grad_norm": 6.072205543518066, + "learning_rate": 9.199501246882794e-06, + "loss": 0.3147, + "step": 43330 + }, + { + "epoch": 10.807980049875312, + "grad_norm": 7.625298500061035, + "learning_rate": 9.19700748129676e-06, + "loss": 0.2968, + "step": 43340 + }, + { + "epoch": 10.810473815461346, + "grad_norm": 9.655031204223633, + "learning_rate": 9.194513715710725e-06, + "loss": 0.3788, + "step": 43350 + }, + { + "epoch": 10.812967581047381, + "grad_norm": 5.24294900894165, + "learning_rate": 9.192019950124688e-06, + "loss": 0.3467, + "step": 43360 + }, + { + "epoch": 10.815461346633416, + "grad_norm": 3.961106777191162, + "learning_rate": 9.189526184538654e-06, + "loss": 0.2739, + "step": 43370 + }, + { + "epoch": 10.817955112219451, + "grad_norm": 9.00661563873291, + "learning_rate": 9.187032418952619e-06, + "loss": 0.3829, + "step": 43380 + }, + { + "epoch": 10.820448877805486, + "grad_norm": 5.253776550292969, + "learning_rate": 9.184538653366584e-06, + "loss": 0.3532, + "step": 43390 + }, + { + "epoch": 10.82294264339152, + "grad_norm": 6.8196587562561035, + "learning_rate": 9.18204488778055e-06, + "loss": 0.3576, + "step": 43400 + }, + { + "epoch": 10.825436408977556, + "grad_norm": 6.497467517852783, + "learning_rate": 9.179551122194515e-06, + "loss": 0.3487, + "step": 43410 + }, + { + "epoch": 10.82793017456359, + "grad_norm": 7.977984428405762, + "learning_rate": 9.17705735660848e-06, + "loss": 0.3008, + "step": 43420 + }, + { + "epoch": 10.830423940149625, + "grad_norm": 5.911102771759033, + "learning_rate": 9.174563591022445e-06, + "loss": 0.4166, + "step": 43430 + }, + { + "epoch": 10.83291770573566, + "grad_norm": 5.653079032897949, + "learning_rate": 9.172069825436409e-06, + "loss": 0.3284, + "step": 43440 + }, + { + "epoch": 10.835411471321695, + "grad_norm": 5.5909199714660645, + "learning_rate": 9.169576059850374e-06, + "loss": 0.3524, + "step": 43450 + }, + { + "epoch": 10.83790523690773, + "grad_norm": 5.932044982910156, + "learning_rate": 9.16708229426434e-06, + "loss": 0.371, + "step": 43460 + }, + { + "epoch": 10.840399002493765, + "grad_norm": 8.470308303833008, + "learning_rate": 9.164588528678305e-06, + "loss": 0.3592, + "step": 43470 + }, + { + "epoch": 10.8428927680798, + "grad_norm": 5.490266799926758, + "learning_rate": 9.16209476309227e-06, + "loss": 0.3547, + "step": 43480 + }, + { + "epoch": 10.845386533665835, + "grad_norm": 7.31662654876709, + "learning_rate": 9.159600997506236e-06, + "loss": 0.3529, + "step": 43490 + }, + { + "epoch": 10.84788029925187, + "grad_norm": 9.396953582763672, + "learning_rate": 9.157107231920201e-06, + "loss": 0.3525, + "step": 43500 + }, + { + "epoch": 10.850374064837904, + "grad_norm": 8.587018013000488, + "learning_rate": 9.154613466334166e-06, + "loss": 0.3494, + "step": 43510 + }, + { + "epoch": 10.85286783042394, + "grad_norm": 5.789772033691406, + "learning_rate": 9.15211970074813e-06, + "loss": 0.3412, + "step": 43520 + }, + { + "epoch": 10.855361596009976, + "grad_norm": 8.550657272338867, + "learning_rate": 9.149625935162095e-06, + "loss": 0.3953, + "step": 43530 + }, + { + "epoch": 10.85785536159601, + "grad_norm": 7.927114963531494, + "learning_rate": 9.14713216957606e-06, + "loss": 0.3944, + "step": 43540 + }, + { + "epoch": 10.860349127182046, + "grad_norm": 5.831830978393555, + "learning_rate": 9.144638403990026e-06, + "loss": 0.3161, + "step": 43550 + }, + { + "epoch": 10.86284289276808, + "grad_norm": 7.033876419067383, + "learning_rate": 9.14214463840399e-06, + "loss": 0.3668, + "step": 43560 + }, + { + "epoch": 10.865336658354115, + "grad_norm": 5.114386081695557, + "learning_rate": 9.139650872817956e-06, + "loss": 0.3579, + "step": 43570 + }, + { + "epoch": 10.86783042394015, + "grad_norm": 5.873979568481445, + "learning_rate": 9.137157107231922e-06, + "loss": 0.3652, + "step": 43580 + }, + { + "epoch": 10.870324189526185, + "grad_norm": 7.800948619842529, + "learning_rate": 9.134663341645887e-06, + "loss": 0.3274, + "step": 43590 + }, + { + "epoch": 10.87281795511222, + "grad_norm": 7.095271587371826, + "learning_rate": 9.132169576059852e-06, + "loss": 0.3438, + "step": 43600 + }, + { + "epoch": 10.875311720698255, + "grad_norm": 19.4151668548584, + "learning_rate": 9.129675810473816e-06, + "loss": 0.3501, + "step": 43610 + }, + { + "epoch": 10.87780548628429, + "grad_norm": 7.045140743255615, + "learning_rate": 9.127182044887781e-06, + "loss": 0.3194, + "step": 43620 + }, + { + "epoch": 10.880299251870325, + "grad_norm": 7.247674465179443, + "learning_rate": 9.124688279301747e-06, + "loss": 0.3335, + "step": 43630 + }, + { + "epoch": 10.88279301745636, + "grad_norm": 7.939310550689697, + "learning_rate": 9.122194513715712e-06, + "loss": 0.3741, + "step": 43640 + }, + { + "epoch": 10.885286783042394, + "grad_norm": 7.598067283630371, + "learning_rate": 9.119700748129676e-06, + "loss": 0.3175, + "step": 43650 + }, + { + "epoch": 10.88778054862843, + "grad_norm": 6.730564594268799, + "learning_rate": 9.11720698254364e-06, + "loss": 0.3614, + "step": 43660 + }, + { + "epoch": 10.890274314214464, + "grad_norm": 8.91694450378418, + "learning_rate": 9.114713216957606e-06, + "loss": 0.4158, + "step": 43670 + }, + { + "epoch": 10.892768079800499, + "grad_norm": 11.187231063842773, + "learning_rate": 9.112219451371571e-06, + "loss": 0.3443, + "step": 43680 + }, + { + "epoch": 10.895261845386534, + "grad_norm": 4.5806450843811035, + "learning_rate": 9.109725685785537e-06, + "loss": 0.3801, + "step": 43690 + }, + { + "epoch": 10.897755610972569, + "grad_norm": 8.682236671447754, + "learning_rate": 9.107231920199502e-06, + "loss": 0.3462, + "step": 43700 + }, + { + "epoch": 10.900249376558603, + "grad_norm": 6.818325042724609, + "learning_rate": 9.104738154613467e-06, + "loss": 0.3968, + "step": 43710 + }, + { + "epoch": 10.902743142144638, + "grad_norm": 6.7168426513671875, + "learning_rate": 9.102244389027433e-06, + "loss": 0.5048, + "step": 43720 + }, + { + "epoch": 10.905236907730673, + "grad_norm": 7.661280155181885, + "learning_rate": 9.099750623441396e-06, + "loss": 0.3603, + "step": 43730 + }, + { + "epoch": 10.907730673316708, + "grad_norm": 11.238551139831543, + "learning_rate": 9.097256857855362e-06, + "loss": 0.3616, + "step": 43740 + }, + { + "epoch": 10.910224438902743, + "grad_norm": 8.43984317779541, + "learning_rate": 9.094763092269327e-06, + "loss": 0.3914, + "step": 43750 + }, + { + "epoch": 10.912718204488778, + "grad_norm": 6.167079925537109, + "learning_rate": 9.092269326683292e-06, + "loss": 0.3538, + "step": 43760 + }, + { + "epoch": 10.915211970074813, + "grad_norm": 5.880906581878662, + "learning_rate": 9.089775561097258e-06, + "loss": 0.3578, + "step": 43770 + }, + { + "epoch": 10.917705735660848, + "grad_norm": 11.39787483215332, + "learning_rate": 9.087281795511223e-06, + "loss": 0.3781, + "step": 43780 + }, + { + "epoch": 10.920199501246882, + "grad_norm": 11.926407814025879, + "learning_rate": 9.084788029925188e-06, + "loss": 0.3413, + "step": 43790 + }, + { + "epoch": 10.922693266832917, + "grad_norm": 9.85400390625, + "learning_rate": 9.082294264339153e-06, + "loss": 0.4972, + "step": 43800 + }, + { + "epoch": 10.925187032418952, + "grad_norm": 8.283854484558105, + "learning_rate": 9.079800498753117e-06, + "loss": 0.3979, + "step": 43810 + }, + { + "epoch": 10.927680798004987, + "grad_norm": 9.599810600280762, + "learning_rate": 9.077306733167082e-06, + "loss": 0.3405, + "step": 43820 + }, + { + "epoch": 10.930174563591022, + "grad_norm": 8.065226554870605, + "learning_rate": 9.074812967581048e-06, + "loss": 0.4098, + "step": 43830 + }, + { + "epoch": 10.932668329177057, + "grad_norm": 6.116038799285889, + "learning_rate": 9.072319201995013e-06, + "loss": 0.3745, + "step": 43840 + }, + { + "epoch": 10.935162094763092, + "grad_norm": 8.363578796386719, + "learning_rate": 9.069825436408978e-06, + "loss": 0.3899, + "step": 43850 + }, + { + "epoch": 10.937655860349127, + "grad_norm": 5.87863302230835, + "learning_rate": 9.067331670822944e-06, + "loss": 0.3468, + "step": 43860 + }, + { + "epoch": 10.940149625935161, + "grad_norm": 7.009591102600098, + "learning_rate": 9.064837905236909e-06, + "loss": 0.3486, + "step": 43870 + }, + { + "epoch": 10.942643391521196, + "grad_norm": 8.262612342834473, + "learning_rate": 9.062344139650874e-06, + "loss": 0.3601, + "step": 43880 + }, + { + "epoch": 10.945137157107231, + "grad_norm": 11.304754257202148, + "learning_rate": 9.05985037406484e-06, + "loss": 0.3933, + "step": 43890 + }, + { + "epoch": 10.947630922693268, + "grad_norm": 7.655803680419922, + "learning_rate": 9.057356608478803e-06, + "loss": 0.3177, + "step": 43900 + }, + { + "epoch": 10.950124688279303, + "grad_norm": 7.3225226402282715, + "learning_rate": 9.054862842892768e-06, + "loss": 0.3793, + "step": 43910 + }, + { + "epoch": 10.952618453865338, + "grad_norm": 11.696869850158691, + "learning_rate": 9.052369077306734e-06, + "loss": 0.3915, + "step": 43920 + }, + { + "epoch": 10.955112219451372, + "grad_norm": 7.450771331787109, + "learning_rate": 9.049875311720699e-06, + "loss": 0.3423, + "step": 43930 + }, + { + "epoch": 10.957605985037407, + "grad_norm": 6.9573655128479, + "learning_rate": 9.047381546134664e-06, + "loss": 0.3169, + "step": 43940 + }, + { + "epoch": 10.960099750623442, + "grad_norm": 6.270671844482422, + "learning_rate": 9.04488778054863e-06, + "loss": 0.3519, + "step": 43950 + }, + { + "epoch": 10.962593516209477, + "grad_norm": 8.106279373168945, + "learning_rate": 9.042394014962595e-06, + "loss": 0.3458, + "step": 43960 + }, + { + "epoch": 10.965087281795512, + "grad_norm": 2.7840754985809326, + "learning_rate": 9.03990024937656e-06, + "loss": 0.3244, + "step": 43970 + }, + { + "epoch": 10.967581047381547, + "grad_norm": 6.383938789367676, + "learning_rate": 9.037406483790524e-06, + "loss": 0.3242, + "step": 43980 + }, + { + "epoch": 10.970074812967582, + "grad_norm": 8.362710952758789, + "learning_rate": 9.03491271820449e-06, + "loss": 0.3966, + "step": 43990 + }, + { + "epoch": 10.972568578553616, + "grad_norm": 8.788472175598145, + "learning_rate": 9.032418952618455e-06, + "loss": 0.3279, + "step": 44000 + }, + { + "epoch": 10.975062344139651, + "grad_norm": 7.121370792388916, + "learning_rate": 9.02992518703242e-06, + "loss": 0.3171, + "step": 44010 + }, + { + "epoch": 10.977556109725686, + "grad_norm": 8.656166076660156, + "learning_rate": 9.027431421446384e-06, + "loss": 0.3256, + "step": 44020 + }, + { + "epoch": 10.980049875311721, + "grad_norm": 7.024209022521973, + "learning_rate": 9.024937655860349e-06, + "loss": 0.394, + "step": 44030 + }, + { + "epoch": 10.982543640897756, + "grad_norm": 8.922717094421387, + "learning_rate": 9.022443890274316e-06, + "loss": 0.32, + "step": 44040 + }, + { + "epoch": 10.98503740648379, + "grad_norm": 7.146017074584961, + "learning_rate": 9.019950124688281e-06, + "loss": 0.3368, + "step": 44050 + }, + { + "epoch": 10.987531172069826, + "grad_norm": 6.3751749992370605, + "learning_rate": 9.017456359102245e-06, + "loss": 0.3405, + "step": 44060 + }, + { + "epoch": 10.99002493765586, + "grad_norm": 4.537290096282959, + "learning_rate": 9.01496259351621e-06, + "loss": 0.2856, + "step": 44070 + }, + { + "epoch": 10.992518703241895, + "grad_norm": 9.014129638671875, + "learning_rate": 9.012468827930175e-06, + "loss": 0.3719, + "step": 44080 + }, + { + "epoch": 10.99501246882793, + "grad_norm": 7.991206645965576, + "learning_rate": 9.00997506234414e-06, + "loss": 0.3403, + "step": 44090 + }, + { + "epoch": 10.997506234413965, + "grad_norm": 6.865902900695801, + "learning_rate": 9.007481296758106e-06, + "loss": 0.3167, + "step": 44100 + }, + { + "epoch": 11.0, + "grad_norm": 8.729063034057617, + "learning_rate": 9.00498753117207e-06, + "loss": 0.3674, + "step": 44110 + }, + { + "epoch": 11.0, + "eval_loss": 0.4132729470729828, + "eval_runtime": 59.9748, + "eval_samples_per_second": 16.724, + "eval_steps_per_second": 16.724, + "step": 44110 + }, + { + "epoch": 11.002493765586035, + "grad_norm": 8.256260871887207, + "learning_rate": 9.002493765586035e-06, + "loss": 0.3417, + "step": 44120 + }, + { + "epoch": 11.00498753117207, + "grad_norm": 8.294591903686523, + "learning_rate": 9e-06, + "loss": 0.3305, + "step": 44130 + }, + { + "epoch": 11.007481296758105, + "grad_norm": 7.374525547027588, + "learning_rate": 8.997506234413966e-06, + "loss": 0.3561, + "step": 44140 + }, + { + "epoch": 11.00997506234414, + "grad_norm": 7.575953006744385, + "learning_rate": 8.995012468827931e-06, + "loss": 0.3326, + "step": 44150 + }, + { + "epoch": 11.012468827930174, + "grad_norm": 8.2825345993042, + "learning_rate": 8.992518703241896e-06, + "loss": 0.3432, + "step": 44160 + }, + { + "epoch": 11.01496259351621, + "grad_norm": 6.32546854019165, + "learning_rate": 8.990024937655861e-06, + "loss": 0.345, + "step": 44170 + }, + { + "epoch": 11.017456359102244, + "grad_norm": 8.19118595123291, + "learning_rate": 8.987531172069827e-06, + "loss": 0.3045, + "step": 44180 + }, + { + "epoch": 11.019950124688279, + "grad_norm": 6.752303600311279, + "learning_rate": 8.98503740648379e-06, + "loss": 0.3918, + "step": 44190 + }, + { + "epoch": 11.022443890274314, + "grad_norm": 5.932526588439941, + "learning_rate": 8.982543640897756e-06, + "loss": 0.2983, + "step": 44200 + }, + { + "epoch": 11.024937655860349, + "grad_norm": 10.784844398498535, + "learning_rate": 8.980049875311721e-06, + "loss": 0.328, + "step": 44210 + }, + { + "epoch": 11.027431421446384, + "grad_norm": 6.154462814331055, + "learning_rate": 8.977556109725686e-06, + "loss": 0.3924, + "step": 44220 + }, + { + "epoch": 11.029925187032418, + "grad_norm": 6.966713905334473, + "learning_rate": 8.975062344139652e-06, + "loss": 0.3309, + "step": 44230 + }, + { + "epoch": 11.032418952618453, + "grad_norm": 4.729964256286621, + "learning_rate": 8.972568578553617e-06, + "loss": 0.3863, + "step": 44240 + }, + { + "epoch": 11.034912718204488, + "grad_norm": 7.567785739898682, + "learning_rate": 8.970074812967582e-06, + "loss": 0.305, + "step": 44250 + }, + { + "epoch": 11.037406483790523, + "grad_norm": 7.73862886428833, + "learning_rate": 8.967581047381548e-06, + "loss": 0.4145, + "step": 44260 + }, + { + "epoch": 11.039900249376558, + "grad_norm": 6.4053168296813965, + "learning_rate": 8.965087281795511e-06, + "loss": 0.3216, + "step": 44270 + }, + { + "epoch": 11.042394014962593, + "grad_norm": 4.572265148162842, + "learning_rate": 8.962593516209476e-06, + "loss": 0.3534, + "step": 44280 + }, + { + "epoch": 11.044887780548628, + "grad_norm": 5.619957447052002, + "learning_rate": 8.960099750623442e-06, + "loss": 0.3327, + "step": 44290 + }, + { + "epoch": 11.047381546134662, + "grad_norm": 13.95361042022705, + "learning_rate": 8.957605985037407e-06, + "loss": 0.3672, + "step": 44300 + }, + { + "epoch": 11.049875311720697, + "grad_norm": 5.5569963455200195, + "learning_rate": 8.955112219451372e-06, + "loss": 0.4164, + "step": 44310 + }, + { + "epoch": 11.052369077306734, + "grad_norm": 5.730642318725586, + "learning_rate": 8.952618453865338e-06, + "loss": 0.2852, + "step": 44320 + }, + { + "epoch": 11.054862842892769, + "grad_norm": 10.969292640686035, + "learning_rate": 8.950124688279303e-06, + "loss": 0.3387, + "step": 44330 + }, + { + "epoch": 11.057356608478804, + "grad_norm": 6.42288064956665, + "learning_rate": 8.947630922693268e-06, + "loss": 0.3338, + "step": 44340 + }, + { + "epoch": 11.059850374064839, + "grad_norm": 7.737945556640625, + "learning_rate": 8.945137157107234e-06, + "loss": 0.3709, + "step": 44350 + }, + { + "epoch": 11.062344139650873, + "grad_norm": 8.051152229309082, + "learning_rate": 8.942643391521197e-06, + "loss": 0.356, + "step": 44360 + }, + { + "epoch": 11.064837905236908, + "grad_norm": 6.058350563049316, + "learning_rate": 8.940149625935163e-06, + "loss": 0.2939, + "step": 44370 + }, + { + "epoch": 11.067331670822943, + "grad_norm": 7.329736232757568, + "learning_rate": 8.937655860349128e-06, + "loss": 0.3819, + "step": 44380 + }, + { + "epoch": 11.069825436408978, + "grad_norm": 7.439964771270752, + "learning_rate": 8.935162094763093e-06, + "loss": 0.325, + "step": 44390 + }, + { + "epoch": 11.072319201995013, + "grad_norm": 7.003705978393555, + "learning_rate": 8.932668329177059e-06, + "loss": 0.3168, + "step": 44400 + }, + { + "epoch": 11.074812967581048, + "grad_norm": 5.3289570808410645, + "learning_rate": 8.930423940149627e-06, + "loss": 0.3069, + "step": 44410 + }, + { + "epoch": 11.077306733167083, + "grad_norm": 6.427891731262207, + "learning_rate": 8.927930174563592e-06, + "loss": 0.3028, + "step": 44420 + }, + { + "epoch": 11.079800498753118, + "grad_norm": 7.827418327331543, + "learning_rate": 8.925436408977557e-06, + "loss": 0.3543, + "step": 44430 + }, + { + "epoch": 11.082294264339152, + "grad_norm": 9.805115699768066, + "learning_rate": 8.922942643391523e-06, + "loss": 0.3216, + "step": 44440 + }, + { + "epoch": 11.084788029925187, + "grad_norm": 6.640481948852539, + "learning_rate": 8.920448877805486e-06, + "loss": 0.372, + "step": 44450 + }, + { + "epoch": 11.087281795511222, + "grad_norm": 7.233997821807861, + "learning_rate": 8.917955112219452e-06, + "loss": 0.3521, + "step": 44460 + }, + { + "epoch": 11.089775561097257, + "grad_norm": 7.673696517944336, + "learning_rate": 8.915461346633417e-06, + "loss": 0.3184, + "step": 44470 + }, + { + "epoch": 11.092269326683292, + "grad_norm": 7.302619934082031, + "learning_rate": 8.912967581047382e-06, + "loss": 0.3365, + "step": 44480 + }, + { + "epoch": 11.094763092269327, + "grad_norm": 9.098041534423828, + "learning_rate": 8.910473815461348e-06, + "loss": 0.3513, + "step": 44490 + }, + { + "epoch": 11.097256857855362, + "grad_norm": 6.597679615020752, + "learning_rate": 8.907980049875313e-06, + "loss": 0.3133, + "step": 44500 + }, + { + "epoch": 11.099750623441397, + "grad_norm": 6.860264301300049, + "learning_rate": 8.905486284289278e-06, + "loss": 0.4096, + "step": 44510 + }, + { + "epoch": 11.102244389027431, + "grad_norm": 4.556776523590088, + "learning_rate": 8.902992518703243e-06, + "loss": 0.3389, + "step": 44520 + }, + { + "epoch": 11.104738154613466, + "grad_norm": 9.30516529083252, + "learning_rate": 8.900498753117209e-06, + "loss": 0.3051, + "step": 44530 + }, + { + "epoch": 11.107231920199501, + "grad_norm": 10.449723243713379, + "learning_rate": 8.898004987531172e-06, + "loss": 0.3412, + "step": 44540 + }, + { + "epoch": 11.109725685785536, + "grad_norm": 7.47760009765625, + "learning_rate": 8.895511221945138e-06, + "loss": 0.3901, + "step": 44550 + }, + { + "epoch": 11.11221945137157, + "grad_norm": 12.093515396118164, + "learning_rate": 8.893017456359103e-06, + "loss": 0.3252, + "step": 44560 + }, + { + "epoch": 11.114713216957606, + "grad_norm": 9.427157402038574, + "learning_rate": 8.890523690773068e-06, + "loss": 0.3101, + "step": 44570 + }, + { + "epoch": 11.11720698254364, + "grad_norm": 10.620285034179688, + "learning_rate": 8.888029925187032e-06, + "loss": 0.3178, + "step": 44580 + }, + { + "epoch": 11.119700748129675, + "grad_norm": 7.5438079833984375, + "learning_rate": 8.885536159600999e-06, + "loss": 0.3176, + "step": 44590 + }, + { + "epoch": 11.12219451371571, + "grad_norm": 6.367938041687012, + "learning_rate": 8.883042394014964e-06, + "loss": 0.3195, + "step": 44600 + }, + { + "epoch": 11.124688279301745, + "grad_norm": 16.17655372619629, + "learning_rate": 8.88054862842893e-06, + "loss": 0.3953, + "step": 44610 + }, + { + "epoch": 11.12718204488778, + "grad_norm": 10.65559196472168, + "learning_rate": 8.878054862842893e-06, + "loss": 0.3251, + "step": 44620 + }, + { + "epoch": 11.129675810473815, + "grad_norm": 6.366180419921875, + "learning_rate": 8.875561097256859e-06, + "loss": 0.3519, + "step": 44630 + }, + { + "epoch": 11.13216957605985, + "grad_norm": 8.204825401306152, + "learning_rate": 8.873067331670824e-06, + "loss": 0.3497, + "step": 44640 + }, + { + "epoch": 11.134663341645885, + "grad_norm": 10.065892219543457, + "learning_rate": 8.870573566084789e-06, + "loss": 0.2901, + "step": 44650 + }, + { + "epoch": 11.13715710723192, + "grad_norm": 7.356030464172363, + "learning_rate": 8.868079800498753e-06, + "loss": 0.3841, + "step": 44660 + }, + { + "epoch": 11.139650872817954, + "grad_norm": 7.364969730377197, + "learning_rate": 8.865586034912718e-06, + "loss": 0.3247, + "step": 44670 + }, + { + "epoch": 11.14214463840399, + "grad_norm": 7.613313674926758, + "learning_rate": 8.863092269326683e-06, + "loss": 0.3508, + "step": 44680 + }, + { + "epoch": 11.144638403990024, + "grad_norm": 8.078596115112305, + "learning_rate": 8.860598503740649e-06, + "loss": 0.3305, + "step": 44690 + }, + { + "epoch": 11.147132169576059, + "grad_norm": 10.208447456359863, + "learning_rate": 8.858104738154614e-06, + "loss": 0.3537, + "step": 44700 + }, + { + "epoch": 11.149625935162096, + "grad_norm": 8.215570449829102, + "learning_rate": 8.85561097256858e-06, + "loss": 0.3275, + "step": 44710 + }, + { + "epoch": 11.15211970074813, + "grad_norm": 5.312885284423828, + "learning_rate": 8.853117206982545e-06, + "loss": 0.3202, + "step": 44720 + }, + { + "epoch": 11.154613466334165, + "grad_norm": 7.666956901550293, + "learning_rate": 8.85062344139651e-06, + "loss": 0.3238, + "step": 44730 + }, + { + "epoch": 11.1571072319202, + "grad_norm": 7.116825103759766, + "learning_rate": 8.848129675810474e-06, + "loss": 0.3231, + "step": 44740 + }, + { + "epoch": 11.159600997506235, + "grad_norm": 6.489451885223389, + "learning_rate": 8.845635910224439e-06, + "loss": 0.3411, + "step": 44750 + }, + { + "epoch": 11.16209476309227, + "grad_norm": 6.022216320037842, + "learning_rate": 8.843142144638404e-06, + "loss": 0.4006, + "step": 44760 + }, + { + "epoch": 11.164588528678305, + "grad_norm": 8.421282768249512, + "learning_rate": 8.84064837905237e-06, + "loss": 0.4011, + "step": 44770 + }, + { + "epoch": 11.16708229426434, + "grad_norm": 7.724409580230713, + "learning_rate": 8.838154613466335e-06, + "loss": 0.325, + "step": 44780 + }, + { + "epoch": 11.169576059850375, + "grad_norm": 8.80372142791748, + "learning_rate": 8.8356608478803e-06, + "loss": 0.3197, + "step": 44790 + }, + { + "epoch": 11.17206982543641, + "grad_norm": 10.359016418457031, + "learning_rate": 8.833167082294265e-06, + "loss": 0.3586, + "step": 44800 + }, + { + "epoch": 11.174563591022444, + "grad_norm": 8.572481155395508, + "learning_rate": 8.83067331670823e-06, + "loss": 0.3292, + "step": 44810 + }, + { + "epoch": 11.17705735660848, + "grad_norm": 13.053606033325195, + "learning_rate": 8.828179551122196e-06, + "loss": 0.3645, + "step": 44820 + }, + { + "epoch": 11.179551122194514, + "grad_norm": 10.115473747253418, + "learning_rate": 8.82568578553616e-06, + "loss": 0.2904, + "step": 44830 + }, + { + "epoch": 11.182044887780549, + "grad_norm": 8.798450469970703, + "learning_rate": 8.823192019950125e-06, + "loss": 0.334, + "step": 44840 + }, + { + "epoch": 11.184538653366584, + "grad_norm": 8.745047569274902, + "learning_rate": 8.82069825436409e-06, + "loss": 0.3467, + "step": 44850 + }, + { + "epoch": 11.187032418952619, + "grad_norm": 6.6508941650390625, + "learning_rate": 8.818204488778056e-06, + "loss": 0.4056, + "step": 44860 + }, + { + "epoch": 11.189526184538654, + "grad_norm": 7.657419204711914, + "learning_rate": 8.815710723192021e-06, + "loss": 0.3708, + "step": 44870 + }, + { + "epoch": 11.192019950124688, + "grad_norm": 7.383201599121094, + "learning_rate": 8.813216957605986e-06, + "loss": 0.3759, + "step": 44880 + }, + { + "epoch": 11.194513715710723, + "grad_norm": 5.854804992675781, + "learning_rate": 8.810723192019951e-06, + "loss": 0.3151, + "step": 44890 + }, + { + "epoch": 11.197007481296758, + "grad_norm": 6.18996000289917, + "learning_rate": 8.808229426433917e-06, + "loss": 0.3223, + "step": 44900 + }, + { + "epoch": 11.199501246882793, + "grad_norm": 7.372582912445068, + "learning_rate": 8.80573566084788e-06, + "loss": 0.3174, + "step": 44910 + }, + { + "epoch": 11.201995012468828, + "grad_norm": 4.5158233642578125, + "learning_rate": 8.803241895261846e-06, + "loss": 0.2967, + "step": 44920 + }, + { + "epoch": 11.204488778054863, + "grad_norm": 8.73290729522705, + "learning_rate": 8.800748129675811e-06, + "loss": 0.3015, + "step": 44930 + }, + { + "epoch": 11.206982543640898, + "grad_norm": 12.908029556274414, + "learning_rate": 8.798254364089776e-06, + "loss": 0.351, + "step": 44940 + }, + { + "epoch": 11.209476309226932, + "grad_norm": 6.993527889251709, + "learning_rate": 8.795760598503742e-06, + "loss": 0.3274, + "step": 44950 + }, + { + "epoch": 11.211970074812967, + "grad_norm": 8.653640747070312, + "learning_rate": 8.793266832917707e-06, + "loss": 0.3817, + "step": 44960 + }, + { + "epoch": 11.214463840399002, + "grad_norm": 4.629430770874023, + "learning_rate": 8.790773067331672e-06, + "loss": 0.3772, + "step": 44970 + }, + { + "epoch": 11.216957605985037, + "grad_norm": 7.76257848739624, + "learning_rate": 8.788279301745638e-06, + "loss": 0.3459, + "step": 44980 + }, + { + "epoch": 11.219451371571072, + "grad_norm": 8.239355087280273, + "learning_rate": 8.785785536159601e-06, + "loss": 0.3039, + "step": 44990 + }, + { + "epoch": 11.221945137157107, + "grad_norm": 8.355988502502441, + "learning_rate": 8.783291770573566e-06, + "loss": 0.363, + "step": 45000 + }, + { + "epoch": 11.224438902743142, + "grad_norm": 7.481101989746094, + "learning_rate": 8.780798004987532e-06, + "loss": 0.3703, + "step": 45010 + }, + { + "epoch": 11.226932668329177, + "grad_norm": 4.93237829208374, + "learning_rate": 8.778304239401497e-06, + "loss": 0.3426, + "step": 45020 + }, + { + "epoch": 11.229426433915211, + "grad_norm": 5.618884563446045, + "learning_rate": 8.775810473815462e-06, + "loss": 0.3202, + "step": 45030 + }, + { + "epoch": 11.231920199501246, + "grad_norm": 12.628522872924805, + "learning_rate": 8.773316708229426e-06, + "loss": 0.36, + "step": 45040 + }, + { + "epoch": 11.234413965087281, + "grad_norm": 7.585283279418945, + "learning_rate": 8.770822942643391e-06, + "loss": 0.3422, + "step": 45050 + }, + { + "epoch": 11.236907730673316, + "grad_norm": 7.396781921386719, + "learning_rate": 8.768329177057358e-06, + "loss": 0.3276, + "step": 45060 + }, + { + "epoch": 11.239401496259351, + "grad_norm": 13.765165328979492, + "learning_rate": 8.765835411471324e-06, + "loss": 0.3655, + "step": 45070 + }, + { + "epoch": 11.241895261845386, + "grad_norm": 7.234029769897461, + "learning_rate": 8.763341645885287e-06, + "loss": 0.3718, + "step": 45080 + }, + { + "epoch": 11.24438902743142, + "grad_norm": 4.461481094360352, + "learning_rate": 8.760847880299253e-06, + "loss": 0.3412, + "step": 45090 + }, + { + "epoch": 11.246882793017456, + "grad_norm": 8.242216110229492, + "learning_rate": 8.758354114713218e-06, + "loss": 0.3455, + "step": 45100 + }, + { + "epoch": 11.24937655860349, + "grad_norm": 10.74194622039795, + "learning_rate": 8.755860349127183e-06, + "loss": 0.3511, + "step": 45110 + }, + { + "epoch": 11.251870324189527, + "grad_norm": 9.051909446716309, + "learning_rate": 8.753366583541147e-06, + "loss": 0.3333, + "step": 45120 + }, + { + "epoch": 11.254364089775562, + "grad_norm": 9.297087669372559, + "learning_rate": 8.750872817955112e-06, + "loss": 0.3345, + "step": 45130 + }, + { + "epoch": 11.256857855361597, + "grad_norm": 7.317951679229736, + "learning_rate": 8.748379052369077e-06, + "loss": 0.3696, + "step": 45140 + }, + { + "epoch": 11.259351620947632, + "grad_norm": 6.456167221069336, + "learning_rate": 8.745885286783043e-06, + "loss": 0.2631, + "step": 45150 + }, + { + "epoch": 11.261845386533667, + "grad_norm": 9.548169136047363, + "learning_rate": 8.743391521197008e-06, + "loss": 0.3358, + "step": 45160 + }, + { + "epoch": 11.264339152119701, + "grad_norm": 4.220322608947754, + "learning_rate": 8.740897755610973e-06, + "loss": 0.309, + "step": 45170 + }, + { + "epoch": 11.266832917705736, + "grad_norm": 7.340099334716797, + "learning_rate": 8.738403990024939e-06, + "loss": 0.2948, + "step": 45180 + }, + { + "epoch": 11.269326683291771, + "grad_norm": 6.146076202392578, + "learning_rate": 8.735910224438904e-06, + "loss": 0.3575, + "step": 45190 + }, + { + "epoch": 11.271820448877806, + "grad_norm": 8.388291358947754, + "learning_rate": 8.733416458852868e-06, + "loss": 0.3306, + "step": 45200 + }, + { + "epoch": 11.27431421446384, + "grad_norm": 12.431069374084473, + "learning_rate": 8.730922693266833e-06, + "loss": 0.3423, + "step": 45210 + }, + { + "epoch": 11.276807980049876, + "grad_norm": 7.34416389465332, + "learning_rate": 8.728428927680798e-06, + "loss": 0.3473, + "step": 45220 + }, + { + "epoch": 11.27930174563591, + "grad_norm": 4.615060806274414, + "learning_rate": 8.725935162094764e-06, + "loss": 0.3393, + "step": 45230 + }, + { + "epoch": 11.281795511221945, + "grad_norm": 7.668536186218262, + "learning_rate": 8.723441396508729e-06, + "loss": 0.3442, + "step": 45240 + }, + { + "epoch": 11.28428927680798, + "grad_norm": 8.232738494873047, + "learning_rate": 8.720947630922694e-06, + "loss": 0.334, + "step": 45250 + }, + { + "epoch": 11.286783042394015, + "grad_norm": 9.045811653137207, + "learning_rate": 8.71845386533666e-06, + "loss": 0.428, + "step": 45260 + }, + { + "epoch": 11.28927680798005, + "grad_norm": 7.1297526359558105, + "learning_rate": 8.715960099750625e-06, + "loss": 0.3156, + "step": 45270 + }, + { + "epoch": 11.291770573566085, + "grad_norm": 8.763090133666992, + "learning_rate": 8.71346633416459e-06, + "loss": 0.3447, + "step": 45280 + }, + { + "epoch": 11.29426433915212, + "grad_norm": 8.615846633911133, + "learning_rate": 8.710972568578554e-06, + "loss": 0.3308, + "step": 45290 + }, + { + "epoch": 11.296758104738155, + "grad_norm": 10.320062637329102, + "learning_rate": 8.708478802992519e-06, + "loss": 0.3569, + "step": 45300 + }, + { + "epoch": 11.29925187032419, + "grad_norm": 7.310534954071045, + "learning_rate": 8.705985037406484e-06, + "loss": 0.362, + "step": 45310 + }, + { + "epoch": 11.301745635910224, + "grad_norm": 6.569928169250488, + "learning_rate": 8.70349127182045e-06, + "loss": 0.2893, + "step": 45320 + }, + { + "epoch": 11.30423940149626, + "grad_norm": 6.204464435577393, + "learning_rate": 8.700997506234415e-06, + "loss": 0.3239, + "step": 45330 + }, + { + "epoch": 11.306733167082294, + "grad_norm": 6.959688663482666, + "learning_rate": 8.69850374064838e-06, + "loss": 0.3394, + "step": 45340 + }, + { + "epoch": 11.309226932668329, + "grad_norm": 8.13005256652832, + "learning_rate": 8.696009975062346e-06, + "loss": 0.3523, + "step": 45350 + }, + { + "epoch": 11.311720698254364, + "grad_norm": 5.307433128356934, + "learning_rate": 8.693516209476311e-06, + "loss": 0.3485, + "step": 45360 + }, + { + "epoch": 11.314214463840399, + "grad_norm": 7.195748329162598, + "learning_rate": 8.691022443890274e-06, + "loss": 0.3992, + "step": 45370 + }, + { + "epoch": 11.316708229426434, + "grad_norm": 8.693116188049316, + "learning_rate": 8.68852867830424e-06, + "loss": 0.3582, + "step": 45380 + }, + { + "epoch": 11.319201995012468, + "grad_norm": 7.39258337020874, + "learning_rate": 8.686034912718205e-06, + "loss": 0.33, + "step": 45390 + }, + { + "epoch": 11.321695760598503, + "grad_norm": 7.933567047119141, + "learning_rate": 8.68354114713217e-06, + "loss": 0.3464, + "step": 45400 + }, + { + "epoch": 11.324189526184538, + "grad_norm": 8.168654441833496, + "learning_rate": 8.681047381546136e-06, + "loss": 0.3156, + "step": 45410 + }, + { + "epoch": 11.326683291770573, + "grad_norm": 10.021574974060059, + "learning_rate": 8.678553615960101e-06, + "loss": 0.3644, + "step": 45420 + }, + { + "epoch": 11.329177057356608, + "grad_norm": 7.775262832641602, + "learning_rate": 8.676059850374066e-06, + "loss": 0.3631, + "step": 45430 + }, + { + "epoch": 11.331670822942643, + "grad_norm": 8.023462295532227, + "learning_rate": 8.673566084788032e-06, + "loss": 0.3567, + "step": 45440 + }, + { + "epoch": 11.334164588528678, + "grad_norm": 10.410110473632812, + "learning_rate": 8.671072319201995e-06, + "loss": 0.3938, + "step": 45450 + }, + { + "epoch": 11.336658354114713, + "grad_norm": 10.069509506225586, + "learning_rate": 8.66857855361596e-06, + "loss": 0.4094, + "step": 45460 + }, + { + "epoch": 11.339152119700747, + "grad_norm": 5.57853364944458, + "learning_rate": 8.666084788029926e-06, + "loss": 0.3017, + "step": 45470 + }, + { + "epoch": 11.341645885286782, + "grad_norm": 7.023243427276611, + "learning_rate": 8.663591022443891e-06, + "loss": 0.3085, + "step": 45480 + }, + { + "epoch": 11.344139650872817, + "grad_norm": 6.86598014831543, + "learning_rate": 8.661097256857855e-06, + "loss": 0.3638, + "step": 45490 + }, + { + "epoch": 11.346633416458852, + "grad_norm": 8.568735122680664, + "learning_rate": 8.65860349127182e-06, + "loss": 0.3982, + "step": 45500 + }, + { + "epoch": 11.349127182044889, + "grad_norm": 8.5831880569458, + "learning_rate": 8.656109725685785e-06, + "loss": 0.3462, + "step": 45510 + }, + { + "epoch": 11.351620947630924, + "grad_norm": 6.069805145263672, + "learning_rate": 8.65361596009975e-06, + "loss": 0.2987, + "step": 45520 + }, + { + "epoch": 11.354114713216958, + "grad_norm": 7.621370315551758, + "learning_rate": 8.651122194513718e-06, + "loss": 0.3374, + "step": 45530 + }, + { + "epoch": 11.356608478802993, + "grad_norm": 6.071798324584961, + "learning_rate": 8.648628428927681e-06, + "loss": 0.3222, + "step": 45540 + }, + { + "epoch": 11.359102244389028, + "grad_norm": 5.18229341506958, + "learning_rate": 8.646134663341647e-06, + "loss": 0.3218, + "step": 45550 + }, + { + "epoch": 11.361596009975063, + "grad_norm": 6.7007293701171875, + "learning_rate": 8.643640897755612e-06, + "loss": 0.3675, + "step": 45560 + }, + { + "epoch": 11.364089775561098, + "grad_norm": 5.716126918792725, + "learning_rate": 8.641147132169577e-06, + "loss": 0.336, + "step": 45570 + }, + { + "epoch": 11.366583541147133, + "grad_norm": 6.14850378036499, + "learning_rate": 8.638653366583541e-06, + "loss": 0.3959, + "step": 45580 + }, + { + "epoch": 11.369077306733168, + "grad_norm": 4.6964898109436035, + "learning_rate": 8.636159600997506e-06, + "loss": 0.343, + "step": 45590 + }, + { + "epoch": 11.371571072319203, + "grad_norm": 8.84803581237793, + "learning_rate": 8.633665835411472e-06, + "loss": 0.332, + "step": 45600 + }, + { + "epoch": 11.374064837905237, + "grad_norm": 5.964016914367676, + "learning_rate": 8.631172069825437e-06, + "loss": 0.3045, + "step": 45610 + }, + { + "epoch": 11.376558603491272, + "grad_norm": 4.455854415893555, + "learning_rate": 8.628678304239402e-06, + "loss": 0.3272, + "step": 45620 + }, + { + "epoch": 11.379052369077307, + "grad_norm": 8.803267478942871, + "learning_rate": 8.626184538653367e-06, + "loss": 0.337, + "step": 45630 + }, + { + "epoch": 11.381546134663342, + "grad_norm": 9.714362144470215, + "learning_rate": 8.623690773067333e-06, + "loss": 0.3589, + "step": 45640 + }, + { + "epoch": 11.384039900249377, + "grad_norm": 4.950045108795166, + "learning_rate": 8.621197007481298e-06, + "loss": 0.3321, + "step": 45650 + }, + { + "epoch": 11.386533665835412, + "grad_norm": 8.148637771606445, + "learning_rate": 8.618703241895262e-06, + "loss": 0.3178, + "step": 45660 + }, + { + "epoch": 11.389027431421447, + "grad_norm": 21.125547409057617, + "learning_rate": 8.616209476309227e-06, + "loss": 0.3481, + "step": 45670 + }, + { + "epoch": 11.391521197007481, + "grad_norm": 7.620448112487793, + "learning_rate": 8.613715710723192e-06, + "loss": 0.3811, + "step": 45680 + }, + { + "epoch": 11.394014962593516, + "grad_norm": 7.3992743492126465, + "learning_rate": 8.611221945137158e-06, + "loss": 0.3509, + "step": 45690 + }, + { + "epoch": 11.396508728179551, + "grad_norm": 7.22592830657959, + "learning_rate": 8.608728179551123e-06, + "loss": 0.3888, + "step": 45700 + }, + { + "epoch": 11.399002493765586, + "grad_norm": 7.094658851623535, + "learning_rate": 8.606234413965088e-06, + "loss": 0.3421, + "step": 45710 + }, + { + "epoch": 11.401496259351621, + "grad_norm": 7.960940837860107, + "learning_rate": 8.603740648379054e-06, + "loss": 0.4286, + "step": 45720 + }, + { + "epoch": 11.403990024937656, + "grad_norm": 10.1074857711792, + "learning_rate": 8.601246882793019e-06, + "loss": 0.3917, + "step": 45730 + }, + { + "epoch": 11.40648379052369, + "grad_norm": 10.943230628967285, + "learning_rate": 8.598753117206984e-06, + "loss": 0.3824, + "step": 45740 + }, + { + "epoch": 11.408977556109726, + "grad_norm": 6.1284685134887695, + "learning_rate": 8.596259351620948e-06, + "loss": 0.2953, + "step": 45750 + }, + { + "epoch": 11.41147132169576, + "grad_norm": 6.4964776039123535, + "learning_rate": 8.593765586034913e-06, + "loss": 0.3705, + "step": 45760 + }, + { + "epoch": 11.413965087281795, + "grad_norm": 7.87790584564209, + "learning_rate": 8.591271820448878e-06, + "loss": 0.3237, + "step": 45770 + }, + { + "epoch": 11.41645885286783, + "grad_norm": 7.569821357727051, + "learning_rate": 8.588778054862844e-06, + "loss": 0.303, + "step": 45780 + }, + { + "epoch": 11.418952618453865, + "grad_norm": 8.668131828308105, + "learning_rate": 8.586284289276809e-06, + "loss": 0.2865, + "step": 45790 + }, + { + "epoch": 11.4214463840399, + "grad_norm": 8.034530639648438, + "learning_rate": 8.583790523690774e-06, + "loss": 0.3503, + "step": 45800 + }, + { + "epoch": 11.423940149625935, + "grad_norm": 8.05600357055664, + "learning_rate": 8.58129675810474e-06, + "loss": 0.3285, + "step": 45810 + }, + { + "epoch": 11.42643391521197, + "grad_norm": 7.00352668762207, + "learning_rate": 8.578802992518705e-06, + "loss": 0.3335, + "step": 45820 + }, + { + "epoch": 11.428927680798004, + "grad_norm": 12.199906349182129, + "learning_rate": 8.576309226932669e-06, + "loss": 0.4104, + "step": 45830 + }, + { + "epoch": 11.43142144638404, + "grad_norm": 8.1293306350708, + "learning_rate": 8.573815461346634e-06, + "loss": 0.3462, + "step": 45840 + }, + { + "epoch": 11.433915211970074, + "grad_norm": 9.158780097961426, + "learning_rate": 8.5713216957606e-06, + "loss": 0.3138, + "step": 45850 + }, + { + "epoch": 11.436408977556109, + "grad_norm": 13.985179901123047, + "learning_rate": 8.568827930174564e-06, + "loss": 0.3182, + "step": 45860 + }, + { + "epoch": 11.438902743142144, + "grad_norm": 7.04974889755249, + "learning_rate": 8.566334164588528e-06, + "loss": 0.3452, + "step": 45870 + }, + { + "epoch": 11.441396508728179, + "grad_norm": 7.684841632843018, + "learning_rate": 8.563840399002495e-06, + "loss": 0.3312, + "step": 45880 + }, + { + "epoch": 11.443890274314214, + "grad_norm": 7.361491680145264, + "learning_rate": 8.56134663341646e-06, + "loss": 0.3133, + "step": 45890 + }, + { + "epoch": 11.446384039900249, + "grad_norm": 8.644780158996582, + "learning_rate": 8.558852867830426e-06, + "loss": 0.3371, + "step": 45900 + }, + { + "epoch": 11.448877805486283, + "grad_norm": 8.315042495727539, + "learning_rate": 8.55635910224439e-06, + "loss": 0.3792, + "step": 45910 + }, + { + "epoch": 11.451371571072318, + "grad_norm": 8.369626998901367, + "learning_rate": 8.553865336658355e-06, + "loss": 0.3525, + "step": 45920 + }, + { + "epoch": 11.453865336658355, + "grad_norm": 12.835140228271484, + "learning_rate": 8.55137157107232e-06, + "loss": 0.3088, + "step": 45930 + }, + { + "epoch": 11.45635910224439, + "grad_norm": 10.077214241027832, + "learning_rate": 8.548877805486285e-06, + "loss": 0.3456, + "step": 45940 + }, + { + "epoch": 11.458852867830425, + "grad_norm": 8.558807373046875, + "learning_rate": 8.546384039900249e-06, + "loss": 0.2983, + "step": 45950 + }, + { + "epoch": 11.46134663341646, + "grad_norm": 10.858403205871582, + "learning_rate": 8.543890274314214e-06, + "loss": 0.3396, + "step": 45960 + }, + { + "epoch": 11.463840399002494, + "grad_norm": 6.8004469871521, + "learning_rate": 8.54139650872818e-06, + "loss": 0.3272, + "step": 45970 + }, + { + "epoch": 11.46633416458853, + "grad_norm": 6.694581985473633, + "learning_rate": 8.538902743142145e-06, + "loss": 0.3775, + "step": 45980 + }, + { + "epoch": 11.468827930174564, + "grad_norm": 8.675992012023926, + "learning_rate": 8.53640897755611e-06, + "loss": 0.3789, + "step": 45990 + }, + { + "epoch": 11.471321695760599, + "grad_norm": 6.141960620880127, + "learning_rate": 8.533915211970075e-06, + "loss": 0.3364, + "step": 46000 + }, + { + "epoch": 11.473815461346634, + "grad_norm": 14.198030471801758, + "learning_rate": 8.53142144638404e-06, + "loss": 0.3398, + "step": 46010 + }, + { + "epoch": 11.476309226932669, + "grad_norm": 6.926119327545166, + "learning_rate": 8.528927680798006e-06, + "loss": 0.2966, + "step": 46020 + }, + { + "epoch": 11.478802992518704, + "grad_norm": 5.021880626678467, + "learning_rate": 8.526433915211971e-06, + "loss": 0.3197, + "step": 46030 + }, + { + "epoch": 11.481296758104738, + "grad_norm": 6.237401008605957, + "learning_rate": 8.523940149625935e-06, + "loss": 0.3225, + "step": 46040 + }, + { + "epoch": 11.483790523690773, + "grad_norm": 11.097943305969238, + "learning_rate": 8.5214463840399e-06, + "loss": 0.4856, + "step": 46050 + }, + { + "epoch": 11.486284289276808, + "grad_norm": 8.584773063659668, + "learning_rate": 8.518952618453866e-06, + "loss": 0.3588, + "step": 46060 + }, + { + "epoch": 11.488778054862843, + "grad_norm": 5.824810981750488, + "learning_rate": 8.516458852867831e-06, + "loss": 0.2905, + "step": 46070 + }, + { + "epoch": 11.491271820448878, + "grad_norm": 8.207144737243652, + "learning_rate": 8.513965087281796e-06, + "loss": 0.3239, + "step": 46080 + }, + { + "epoch": 11.493765586034913, + "grad_norm": 8.661499977111816, + "learning_rate": 8.511471321695762e-06, + "loss": 0.3761, + "step": 46090 + }, + { + "epoch": 11.496259351620948, + "grad_norm": 6.3622941970825195, + "learning_rate": 8.508977556109727e-06, + "loss": 0.346, + "step": 46100 + }, + { + "epoch": 11.498753117206983, + "grad_norm": 5.697100639343262, + "learning_rate": 8.506483790523692e-06, + "loss": 0.3489, + "step": 46110 + }, + { + "epoch": 11.501246882793017, + "grad_norm": 10.397525787353516, + "learning_rate": 8.503990024937656e-06, + "loss": 0.4333, + "step": 46120 + }, + { + "epoch": 11.503740648379052, + "grad_norm": 11.356361389160156, + "learning_rate": 8.501496259351621e-06, + "loss": 0.3788, + "step": 46130 + }, + { + "epoch": 11.506234413965087, + "grad_norm": 10.53273868560791, + "learning_rate": 8.499002493765586e-06, + "loss": 0.4482, + "step": 46140 + }, + { + "epoch": 11.508728179551122, + "grad_norm": 7.545499324798584, + "learning_rate": 8.496508728179552e-06, + "loss": 0.3394, + "step": 46150 + }, + { + "epoch": 11.511221945137157, + "grad_norm": 5.596482276916504, + "learning_rate": 8.494014962593517e-06, + "loss": 0.3306, + "step": 46160 + }, + { + "epoch": 11.513715710723192, + "grad_norm": 7.381369590759277, + "learning_rate": 8.491521197007482e-06, + "loss": 0.3239, + "step": 46170 + }, + { + "epoch": 11.516209476309227, + "grad_norm": 9.224650382995605, + "learning_rate": 8.489027431421448e-06, + "loss": 0.3463, + "step": 46180 + }, + { + "epoch": 11.518703241895262, + "grad_norm": 6.10484504699707, + "learning_rate": 8.486533665835413e-06, + "loss": 0.457, + "step": 46190 + }, + { + "epoch": 11.521197007481296, + "grad_norm": 8.393863677978516, + "learning_rate": 8.484039900249377e-06, + "loss": 0.3499, + "step": 46200 + }, + { + "epoch": 11.523690773067331, + "grad_norm": 10.636434555053711, + "learning_rate": 8.481546134663342e-06, + "loss": 0.429, + "step": 46210 + }, + { + "epoch": 11.526184538653366, + "grad_norm": 8.618541717529297, + "learning_rate": 8.479052369077307e-06, + "loss": 0.355, + "step": 46220 + }, + { + "epoch": 11.528678304239401, + "grad_norm": 5.982786655426025, + "learning_rate": 8.476558603491272e-06, + "loss": 0.3373, + "step": 46230 + }, + { + "epoch": 11.531172069825436, + "grad_norm": 10.3162202835083, + "learning_rate": 8.474064837905238e-06, + "loss": 0.2873, + "step": 46240 + }, + { + "epoch": 11.53366583541147, + "grad_norm": 5.130282878875732, + "learning_rate": 8.471571072319203e-06, + "loss": 0.2964, + "step": 46250 + }, + { + "epoch": 11.536159600997506, + "grad_norm": 5.8889360427856445, + "learning_rate": 8.469077306733168e-06, + "loss": 0.3102, + "step": 46260 + }, + { + "epoch": 11.53865336658354, + "grad_norm": 4.879234313964844, + "learning_rate": 8.466583541147134e-06, + "loss": 0.3211, + "step": 46270 + }, + { + "epoch": 11.541147132169575, + "grad_norm": 7.243957042694092, + "learning_rate": 8.464089775561099e-06, + "loss": 0.3416, + "step": 46280 + }, + { + "epoch": 11.54364089775561, + "grad_norm": 8.841197967529297, + "learning_rate": 8.461596009975063e-06, + "loss": 0.3346, + "step": 46290 + }, + { + "epoch": 11.546134663341645, + "grad_norm": 7.920102119445801, + "learning_rate": 8.459102244389028e-06, + "loss": 0.3587, + "step": 46300 + }, + { + "epoch": 11.548628428927682, + "grad_norm": 8.432982444763184, + "learning_rate": 8.456608478802993e-06, + "loss": 0.3364, + "step": 46310 + }, + { + "epoch": 11.551122194513717, + "grad_norm": 7.410879135131836, + "learning_rate": 8.454114713216959e-06, + "loss": 0.4021, + "step": 46320 + }, + { + "epoch": 11.553615960099751, + "grad_norm": 9.919821739196777, + "learning_rate": 8.451620947630922e-06, + "loss": 0.4665, + "step": 46330 + }, + { + "epoch": 11.556109725685786, + "grad_norm": 6.146061420440674, + "learning_rate": 8.449127182044888e-06, + "loss": 0.3562, + "step": 46340 + }, + { + "epoch": 11.558603491271821, + "grad_norm": 6.502902507781982, + "learning_rate": 8.446633416458855e-06, + "loss": 0.3341, + "step": 46350 + }, + { + "epoch": 11.561097256857856, + "grad_norm": 7.523041725158691, + "learning_rate": 8.44413965087282e-06, + "loss": 0.3158, + "step": 46360 + }, + { + "epoch": 11.563591022443891, + "grad_norm": 5.467228412628174, + "learning_rate": 8.441645885286783e-06, + "loss": 0.3464, + "step": 46370 + }, + { + "epoch": 11.566084788029926, + "grad_norm": 7.694350242614746, + "learning_rate": 8.439152119700749e-06, + "loss": 0.3111, + "step": 46380 + }, + { + "epoch": 11.56857855361596, + "grad_norm": 13.799556732177734, + "learning_rate": 8.436658354114714e-06, + "loss": 0.3468, + "step": 46390 + }, + { + "epoch": 11.571072319201996, + "grad_norm": 8.76827621459961, + "learning_rate": 8.43416458852868e-06, + "loss": 0.3029, + "step": 46400 + }, + { + "epoch": 11.57356608478803, + "grad_norm": 11.704144477844238, + "learning_rate": 8.431670822942643e-06, + "loss": 0.3546, + "step": 46410 + }, + { + "epoch": 11.576059850374065, + "grad_norm": 8.31985855102539, + "learning_rate": 8.429177057356608e-06, + "loss": 0.3563, + "step": 46420 + }, + { + "epoch": 11.5785536159601, + "grad_norm": 6.618094444274902, + "learning_rate": 8.426683291770574e-06, + "loss": 0.2983, + "step": 46430 + }, + { + "epoch": 11.581047381546135, + "grad_norm": 13.60894775390625, + "learning_rate": 8.424189526184539e-06, + "loss": 0.3255, + "step": 46440 + }, + { + "epoch": 11.58354114713217, + "grad_norm": 5.429731369018555, + "learning_rate": 8.421695760598504e-06, + "loss": 0.3448, + "step": 46450 + }, + { + "epoch": 11.586034912718205, + "grad_norm": 5.1549224853515625, + "learning_rate": 8.41920199501247e-06, + "loss": 0.3547, + "step": 46460 + }, + { + "epoch": 11.58852867830424, + "grad_norm": 6.888156890869141, + "learning_rate": 8.416708229426435e-06, + "loss": 0.3126, + "step": 46470 + }, + { + "epoch": 11.591022443890274, + "grad_norm": 8.632163047790527, + "learning_rate": 8.4142144638404e-06, + "loss": 0.3216, + "step": 46480 + }, + { + "epoch": 11.59351620947631, + "grad_norm": 8.811448097229004, + "learning_rate": 8.411720698254365e-06, + "loss": 0.3131, + "step": 46490 + }, + { + "epoch": 11.596009975062344, + "grad_norm": 7.669614791870117, + "learning_rate": 8.409226932668329e-06, + "loss": 0.3222, + "step": 46500 + }, + { + "epoch": 11.598503740648379, + "grad_norm": 12.28158950805664, + "learning_rate": 8.406733167082294e-06, + "loss": 0.3609, + "step": 46510 + }, + { + "epoch": 11.600997506234414, + "grad_norm": 12.088496208190918, + "learning_rate": 8.40423940149626e-06, + "loss": 0.3294, + "step": 46520 + }, + { + "epoch": 11.603491271820449, + "grad_norm": 7.644925594329834, + "learning_rate": 8.401745635910225e-06, + "loss": 0.3499, + "step": 46530 + }, + { + "epoch": 11.605985037406484, + "grad_norm": 8.952886581420898, + "learning_rate": 8.39925187032419e-06, + "loss": 0.4748, + "step": 46540 + }, + { + "epoch": 11.608478802992519, + "grad_norm": 7.003511428833008, + "learning_rate": 8.396758104738156e-06, + "loss": 0.3068, + "step": 46550 + }, + { + "epoch": 11.610972568578553, + "grad_norm": 9.938292503356934, + "learning_rate": 8.394264339152121e-06, + "loss": 0.3804, + "step": 46560 + }, + { + "epoch": 11.613466334164588, + "grad_norm": 6.018657684326172, + "learning_rate": 8.391770573566086e-06, + "loss": 0.3036, + "step": 46570 + }, + { + "epoch": 11.615960099750623, + "grad_norm": 4.126202583312988, + "learning_rate": 8.38927680798005e-06, + "loss": 0.2909, + "step": 46580 + }, + { + "epoch": 11.618453865336658, + "grad_norm": 4.8967132568359375, + "learning_rate": 8.386783042394015e-06, + "loss": 0.2835, + "step": 46590 + }, + { + "epoch": 11.620947630922693, + "grad_norm": 7.265740871429443, + "learning_rate": 8.38428927680798e-06, + "loss": 0.3781, + "step": 46600 + }, + { + "epoch": 11.623441396508728, + "grad_norm": 8.042332649230957, + "learning_rate": 8.381795511221946e-06, + "loss": 0.3315, + "step": 46610 + }, + { + "epoch": 11.625935162094763, + "grad_norm": 9.90203857421875, + "learning_rate": 8.379301745635911e-06, + "loss": 0.3179, + "step": 46620 + }, + { + "epoch": 11.628428927680797, + "grad_norm": 5.170167446136475, + "learning_rate": 8.376807980049876e-06, + "loss": 0.3211, + "step": 46630 + }, + { + "epoch": 11.630922693266832, + "grad_norm": 11.844198226928711, + "learning_rate": 8.374314214463842e-06, + "loss": 0.3923, + "step": 46640 + }, + { + "epoch": 11.633416458852867, + "grad_norm": 6.6527862548828125, + "learning_rate": 8.371820448877807e-06, + "loss": 0.3308, + "step": 46650 + }, + { + "epoch": 11.635910224438902, + "grad_norm": 10.204750061035156, + "learning_rate": 8.36932668329177e-06, + "loss": 0.4115, + "step": 46660 + }, + { + "epoch": 11.638403990024937, + "grad_norm": 5.856804847717285, + "learning_rate": 8.366832917705736e-06, + "loss": 0.3647, + "step": 46670 + }, + { + "epoch": 11.640897755610972, + "grad_norm": 5.034014701843262, + "learning_rate": 8.364339152119701e-06, + "loss": 0.344, + "step": 46680 + }, + { + "epoch": 11.643391521197007, + "grad_norm": 6.951847076416016, + "learning_rate": 8.361845386533667e-06, + "loss": 0.3241, + "step": 46690 + }, + { + "epoch": 11.645885286783042, + "grad_norm": 6.871116638183594, + "learning_rate": 8.359351620947632e-06, + "loss": 0.3133, + "step": 46700 + }, + { + "epoch": 11.648379052369076, + "grad_norm": 9.318560600280762, + "learning_rate": 8.356857855361597e-06, + "loss": 0.3388, + "step": 46710 + }, + { + "epoch": 11.650872817955111, + "grad_norm": 10.321300506591797, + "learning_rate": 8.354364089775563e-06, + "loss": 0.3425, + "step": 46720 + }, + { + "epoch": 11.653366583541148, + "grad_norm": 6.536388874053955, + "learning_rate": 8.351870324189528e-06, + "loss": 0.4034, + "step": 46730 + }, + { + "epoch": 11.655860349127183, + "grad_norm": 7.95648717880249, + "learning_rate": 8.349376558603493e-06, + "loss": 0.387, + "step": 46740 + }, + { + "epoch": 11.658354114713218, + "grad_norm": 4.451998710632324, + "learning_rate": 8.346882793017457e-06, + "loss": 0.3272, + "step": 46750 + }, + { + "epoch": 11.660847880299253, + "grad_norm": 5.903833389282227, + "learning_rate": 8.344389027431422e-06, + "loss": 0.3689, + "step": 46760 + }, + { + "epoch": 11.663341645885287, + "grad_norm": 9.391934394836426, + "learning_rate": 8.341895261845387e-06, + "loss": 0.3681, + "step": 46770 + }, + { + "epoch": 11.665835411471322, + "grad_norm": 7.286145210266113, + "learning_rate": 8.339401496259353e-06, + "loss": 0.3198, + "step": 46780 + }, + { + "epoch": 11.668329177057357, + "grad_norm": 7.64100456237793, + "learning_rate": 8.336907730673316e-06, + "loss": 0.3116, + "step": 46790 + }, + { + "epoch": 11.670822942643392, + "grad_norm": 6.605118751525879, + "learning_rate": 8.334413965087282e-06, + "loss": 0.2764, + "step": 46800 + }, + { + "epoch": 11.673316708229427, + "grad_norm": 7.660946369171143, + "learning_rate": 8.331920199501249e-06, + "loss": 0.4052, + "step": 46810 + }, + { + "epoch": 11.675810473815462, + "grad_norm": 5.863259315490723, + "learning_rate": 8.329426433915214e-06, + "loss": 0.3599, + "step": 46820 + }, + { + "epoch": 11.678304239401497, + "grad_norm": 8.02147388458252, + "learning_rate": 8.326932668329178e-06, + "loss": 0.3401, + "step": 46830 + }, + { + "epoch": 11.680798004987532, + "grad_norm": 7.035722255706787, + "learning_rate": 8.324438902743143e-06, + "loss": 0.3427, + "step": 46840 + }, + { + "epoch": 11.683291770573566, + "grad_norm": 8.856318473815918, + "learning_rate": 8.321945137157108e-06, + "loss": 0.3217, + "step": 46850 + }, + { + "epoch": 11.685785536159601, + "grad_norm": 12.737922668457031, + "learning_rate": 8.319451371571073e-06, + "loss": 0.3705, + "step": 46860 + }, + { + "epoch": 11.688279301745636, + "grad_norm": 7.125262260437012, + "learning_rate": 8.316957605985037e-06, + "loss": 0.2677, + "step": 46870 + }, + { + "epoch": 11.690773067331671, + "grad_norm": 9.105042457580566, + "learning_rate": 8.314463840399002e-06, + "loss": 0.3248, + "step": 46880 + }, + { + "epoch": 11.693266832917706, + "grad_norm": 6.2453436851501465, + "learning_rate": 8.311970074812968e-06, + "loss": 0.3346, + "step": 46890 + }, + { + "epoch": 11.69576059850374, + "grad_norm": 9.779679298400879, + "learning_rate": 8.309476309226933e-06, + "loss": 0.3915, + "step": 46900 + }, + { + "epoch": 11.698254364089776, + "grad_norm": 8.361473083496094, + "learning_rate": 8.306982543640898e-06, + "loss": 0.3421, + "step": 46910 + }, + { + "epoch": 11.70074812967581, + "grad_norm": 8.903398513793945, + "learning_rate": 8.304488778054864e-06, + "loss": 0.3676, + "step": 46920 + }, + { + "epoch": 11.703241895261845, + "grad_norm": 7.4261016845703125, + "learning_rate": 8.301995012468829e-06, + "loss": 0.4291, + "step": 46930 + }, + { + "epoch": 11.70573566084788, + "grad_norm": 10.814020156860352, + "learning_rate": 8.299501246882794e-06, + "loss": 0.3552, + "step": 46940 + }, + { + "epoch": 11.708229426433915, + "grad_norm": 6.702065467834473, + "learning_rate": 8.297007481296758e-06, + "loss": 0.2939, + "step": 46950 + }, + { + "epoch": 11.71072319201995, + "grad_norm": 8.938770294189453, + "learning_rate": 8.294513715710723e-06, + "loss": 0.3835, + "step": 46960 + }, + { + "epoch": 11.713216957605985, + "grad_norm": 8.248211860656738, + "learning_rate": 8.292019950124688e-06, + "loss": 0.3454, + "step": 46970 + }, + { + "epoch": 11.71571072319202, + "grad_norm": 5.559350967407227, + "learning_rate": 8.289526184538654e-06, + "loss": 0.4123, + "step": 46980 + }, + { + "epoch": 11.718204488778055, + "grad_norm": 7.779847145080566, + "learning_rate": 8.287032418952619e-06, + "loss": 0.3581, + "step": 46990 + }, + { + "epoch": 11.72069825436409, + "grad_norm": 6.771767616271973, + "learning_rate": 8.284538653366584e-06, + "loss": 0.3246, + "step": 47000 + }, + { + "epoch": 11.723192019950124, + "grad_norm": 6.579812049865723, + "learning_rate": 8.28204488778055e-06, + "loss": 0.3924, + "step": 47010 + }, + { + "epoch": 11.72568578553616, + "grad_norm": 10.849081039428711, + "learning_rate": 8.279551122194515e-06, + "loss": 0.4271, + "step": 47020 + }, + { + "epoch": 11.728179551122194, + "grad_norm": 7.1727824211120605, + "learning_rate": 8.27705735660848e-06, + "loss": 0.3297, + "step": 47030 + }, + { + "epoch": 11.730673316708229, + "grad_norm": 9.346254348754883, + "learning_rate": 8.274563591022444e-06, + "loss": 0.336, + "step": 47040 + }, + { + "epoch": 11.733167082294264, + "grad_norm": 9.930221557617188, + "learning_rate": 8.27206982543641e-06, + "loss": 0.3422, + "step": 47050 + }, + { + "epoch": 11.735660847880299, + "grad_norm": 8.86354923248291, + "learning_rate": 8.269576059850375e-06, + "loss": 0.3921, + "step": 47060 + }, + { + "epoch": 11.738154613466333, + "grad_norm": 6.808560371398926, + "learning_rate": 8.26708229426434e-06, + "loss": 0.3291, + "step": 47070 + }, + { + "epoch": 11.740648379052368, + "grad_norm": 8.037238121032715, + "learning_rate": 8.264588528678305e-06, + "loss": 0.3975, + "step": 47080 + }, + { + "epoch": 11.743142144638403, + "grad_norm": 6.9091010093688965, + "learning_rate": 8.26209476309227e-06, + "loss": 0.2959, + "step": 47090 + }, + { + "epoch": 11.745635910224438, + "grad_norm": 7.109037399291992, + "learning_rate": 8.259600997506236e-06, + "loss": 0.3498, + "step": 47100 + }, + { + "epoch": 11.748129675810475, + "grad_norm": 13.113969802856445, + "learning_rate": 8.257107231920201e-06, + "loss": 0.3391, + "step": 47110 + }, + { + "epoch": 11.75062344139651, + "grad_norm": 10.038171768188477, + "learning_rate": 8.254613466334165e-06, + "loss": 0.3651, + "step": 47120 + }, + { + "epoch": 11.753117206982544, + "grad_norm": 9.342373847961426, + "learning_rate": 8.25211970074813e-06, + "loss": 0.3071, + "step": 47130 + }, + { + "epoch": 11.75561097256858, + "grad_norm": 6.506314277648926, + "learning_rate": 8.249625935162095e-06, + "loss": 0.4171, + "step": 47140 + }, + { + "epoch": 11.758104738154614, + "grad_norm": 9.478023529052734, + "learning_rate": 8.24713216957606e-06, + "loss": 0.3932, + "step": 47150 + }, + { + "epoch": 11.760598503740649, + "grad_norm": 6.497440814971924, + "learning_rate": 8.244638403990026e-06, + "loss": 0.3667, + "step": 47160 + }, + { + "epoch": 11.763092269326684, + "grad_norm": 7.87533712387085, + "learning_rate": 8.242144638403991e-06, + "loss": 0.3391, + "step": 47170 + }, + { + "epoch": 11.765586034912719, + "grad_norm": 5.2254557609558105, + "learning_rate": 8.239650872817957e-06, + "loss": 0.3359, + "step": 47180 + }, + { + "epoch": 11.768079800498754, + "grad_norm": 6.769067287445068, + "learning_rate": 8.237157107231922e-06, + "loss": 0.3649, + "step": 47190 + }, + { + "epoch": 11.770573566084789, + "grad_norm": 9.426133155822754, + "learning_rate": 8.234663341645886e-06, + "loss": 0.3654, + "step": 47200 + }, + { + "epoch": 11.773067331670823, + "grad_norm": 14.436060905456543, + "learning_rate": 8.23216957605985e-06, + "loss": 0.404, + "step": 47210 + }, + { + "epoch": 11.775561097256858, + "grad_norm": 5.8233137130737305, + "learning_rate": 8.229675810473816e-06, + "loss": 0.3201, + "step": 47220 + }, + { + "epoch": 11.778054862842893, + "grad_norm": 8.890005111694336, + "learning_rate": 8.227182044887781e-06, + "loss": 0.3244, + "step": 47230 + }, + { + "epoch": 11.780548628428928, + "grad_norm": 9.727415084838867, + "learning_rate": 8.224688279301747e-06, + "loss": 0.3729, + "step": 47240 + }, + { + "epoch": 11.783042394014963, + "grad_norm": 7.346099853515625, + "learning_rate": 8.22219451371571e-06, + "loss": 0.3708, + "step": 47250 + }, + { + "epoch": 11.785536159600998, + "grad_norm": 7.652705192565918, + "learning_rate": 8.219700748129676e-06, + "loss": 0.3199, + "step": 47260 + }, + { + "epoch": 11.788029925187033, + "grad_norm": 9.516809463500977, + "learning_rate": 8.217206982543641e-06, + "loss": 0.3202, + "step": 47270 + }, + { + "epoch": 11.790523690773068, + "grad_norm": 6.376964569091797, + "learning_rate": 8.214713216957608e-06, + "loss": 0.3306, + "step": 47280 + }, + { + "epoch": 11.793017456359102, + "grad_norm": 7.07741641998291, + "learning_rate": 8.212219451371572e-06, + "loss": 0.3668, + "step": 47290 + }, + { + "epoch": 11.795511221945137, + "grad_norm": 8.181482315063477, + "learning_rate": 8.209725685785537e-06, + "loss": 0.3843, + "step": 47300 + }, + { + "epoch": 11.798004987531172, + "grad_norm": 9.766548156738281, + "learning_rate": 8.207231920199502e-06, + "loss": 0.3964, + "step": 47310 + }, + { + "epoch": 11.800498753117207, + "grad_norm": 7.150549411773682, + "learning_rate": 8.204738154613468e-06, + "loss": 0.3973, + "step": 47320 + }, + { + "epoch": 11.802992518703242, + "grad_norm": 6.823990821838379, + "learning_rate": 8.202244389027431e-06, + "loss": 0.3975, + "step": 47330 + }, + { + "epoch": 11.805486284289277, + "grad_norm": 7.102343559265137, + "learning_rate": 8.199750623441396e-06, + "loss": 0.3856, + "step": 47340 + }, + { + "epoch": 11.807980049875312, + "grad_norm": 7.7780585289001465, + "learning_rate": 8.197256857855362e-06, + "loss": 0.3287, + "step": 47350 + }, + { + "epoch": 11.810473815461346, + "grad_norm": 16.928571701049805, + "learning_rate": 8.194763092269327e-06, + "loss": 0.315, + "step": 47360 + }, + { + "epoch": 11.812967581047381, + "grad_norm": 9.088364601135254, + "learning_rate": 8.192269326683292e-06, + "loss": 0.3788, + "step": 47370 + }, + { + "epoch": 11.815461346633416, + "grad_norm": 6.3990044593811035, + "learning_rate": 8.189775561097258e-06, + "loss": 0.3317, + "step": 47380 + }, + { + "epoch": 11.817955112219451, + "grad_norm": 8.615131378173828, + "learning_rate": 8.187281795511223e-06, + "loss": 0.449, + "step": 47390 + }, + { + "epoch": 11.820448877805486, + "grad_norm": 5.688783168792725, + "learning_rate": 8.184788029925188e-06, + "loss": 0.3203, + "step": 47400 + }, + { + "epoch": 11.82294264339152, + "grad_norm": 9.329352378845215, + "learning_rate": 8.182294264339152e-06, + "loss": 0.4166, + "step": 47410 + }, + { + "epoch": 11.825436408977556, + "grad_norm": 5.748128414154053, + "learning_rate": 8.179800498753117e-06, + "loss": 0.3375, + "step": 47420 + }, + { + "epoch": 11.82793017456359, + "grad_norm": 8.721724510192871, + "learning_rate": 8.177306733167083e-06, + "loss": 0.361, + "step": 47430 + }, + { + "epoch": 11.830423940149625, + "grad_norm": 8.815082550048828, + "learning_rate": 8.174812967581048e-06, + "loss": 0.3311, + "step": 47440 + }, + { + "epoch": 11.83291770573566, + "grad_norm": 9.213290214538574, + "learning_rate": 8.172319201995013e-06, + "loss": 0.3334, + "step": 47450 + }, + { + "epoch": 11.835411471321695, + "grad_norm": 10.43193531036377, + "learning_rate": 8.169825436408978e-06, + "loss": 0.4101, + "step": 47460 + }, + { + "epoch": 11.83790523690773, + "grad_norm": 7.8552350997924805, + "learning_rate": 8.167331670822944e-06, + "loss": 0.3305, + "step": 47470 + }, + { + "epoch": 11.840399002493765, + "grad_norm": 8.425485610961914, + "learning_rate": 8.164837905236909e-06, + "loss": 0.4038, + "step": 47480 + }, + { + "epoch": 11.8428927680798, + "grad_norm": 7.427910804748535, + "learning_rate": 8.162344139650874e-06, + "loss": 0.3453, + "step": 47490 + }, + { + "epoch": 11.845386533665835, + "grad_norm": 4.184894561767578, + "learning_rate": 8.159850374064838e-06, + "loss": 0.3295, + "step": 47500 + }, + { + "epoch": 11.84788029925187, + "grad_norm": 9.951261520385742, + "learning_rate": 8.157356608478803e-06, + "loss": 0.3587, + "step": 47510 + }, + { + "epoch": 11.850374064837904, + "grad_norm": 9.598414421081543, + "learning_rate": 8.154862842892769e-06, + "loss": 0.4135, + "step": 47520 + }, + { + "epoch": 11.85286783042394, + "grad_norm": 7.297958850860596, + "learning_rate": 8.152369077306734e-06, + "loss": 0.3501, + "step": 47530 + }, + { + "epoch": 11.855361596009976, + "grad_norm": 6.240098476409912, + "learning_rate": 8.1498753117207e-06, + "loss": 0.344, + "step": 47540 + }, + { + "epoch": 11.85785536159601, + "grad_norm": 7.912670135498047, + "learning_rate": 8.147381546134665e-06, + "loss": 0.2993, + "step": 47550 + }, + { + "epoch": 11.860349127182046, + "grad_norm": 7.632608890533447, + "learning_rate": 8.14488778054863e-06, + "loss": 0.3528, + "step": 47560 + }, + { + "epoch": 11.86284289276808, + "grad_norm": 12.331526756286621, + "learning_rate": 8.142394014962595e-06, + "loss": 0.4064, + "step": 47570 + }, + { + "epoch": 11.865336658354115, + "grad_norm": 10.794772148132324, + "learning_rate": 8.139900249376559e-06, + "loss": 0.3422, + "step": 47580 + }, + { + "epoch": 11.86783042394015, + "grad_norm": 9.130438804626465, + "learning_rate": 8.137406483790524e-06, + "loss": 0.3605, + "step": 47590 + }, + { + "epoch": 11.870324189526185, + "grad_norm": 6.942079067230225, + "learning_rate": 8.13491271820449e-06, + "loss": 0.3104, + "step": 47600 + }, + { + "epoch": 11.87281795511222, + "grad_norm": 12.40993595123291, + "learning_rate": 8.132418952618455e-06, + "loss": 0.2948, + "step": 47610 + }, + { + "epoch": 11.875311720698255, + "grad_norm": 6.655416011810303, + "learning_rate": 8.129925187032418e-06, + "loss": 0.3096, + "step": 47620 + }, + { + "epoch": 11.87780548628429, + "grad_norm": 6.1850175857543945, + "learning_rate": 8.127431421446385e-06, + "loss": 0.3626, + "step": 47630 + }, + { + "epoch": 11.880299251870325, + "grad_norm": 9.720369338989258, + "learning_rate": 8.12493765586035e-06, + "loss": 0.3307, + "step": 47640 + }, + { + "epoch": 11.88279301745636, + "grad_norm": 9.563785552978516, + "learning_rate": 8.122443890274316e-06, + "loss": 0.3062, + "step": 47650 + }, + { + "epoch": 11.885286783042394, + "grad_norm": 10.218947410583496, + "learning_rate": 8.11995012468828e-06, + "loss": 0.4228, + "step": 47660 + }, + { + "epoch": 11.88778054862843, + "grad_norm": 5.189842700958252, + "learning_rate": 8.117456359102245e-06, + "loss": 0.354, + "step": 47670 + }, + { + "epoch": 11.890274314214464, + "grad_norm": 6.639830112457275, + "learning_rate": 8.11496259351621e-06, + "loss": 0.3331, + "step": 47680 + }, + { + "epoch": 11.892768079800499, + "grad_norm": 8.736455917358398, + "learning_rate": 8.112468827930176e-06, + "loss": 0.3691, + "step": 47690 + }, + { + "epoch": 11.895261845386534, + "grad_norm": 7.995376110076904, + "learning_rate": 8.109975062344139e-06, + "loss": 0.3516, + "step": 47700 + }, + { + "epoch": 11.897755610972569, + "grad_norm": 7.362260341644287, + "learning_rate": 8.107481296758104e-06, + "loss": 0.4472, + "step": 47710 + }, + { + "epoch": 11.900249376558603, + "grad_norm": 9.83117961883545, + "learning_rate": 8.10498753117207e-06, + "loss": 0.3402, + "step": 47720 + }, + { + "epoch": 11.902743142144638, + "grad_norm": 7.758623123168945, + "learning_rate": 8.102493765586035e-06, + "loss": 0.3799, + "step": 47730 + }, + { + "epoch": 11.905236907730673, + "grad_norm": 9.175792694091797, + "learning_rate": 8.1e-06, + "loss": 0.391, + "step": 47740 + }, + { + "epoch": 11.907730673316708, + "grad_norm": 7.377160549163818, + "learning_rate": 8.097506234413966e-06, + "loss": 0.3799, + "step": 47750 + }, + { + "epoch": 11.910224438902743, + "grad_norm": 5.763824462890625, + "learning_rate": 8.095012468827931e-06, + "loss": 0.348, + "step": 47760 + }, + { + "epoch": 11.912718204488778, + "grad_norm": 8.172435760498047, + "learning_rate": 8.092518703241896e-06, + "loss": 0.4006, + "step": 47770 + }, + { + "epoch": 11.915211970074813, + "grad_norm": 9.261870384216309, + "learning_rate": 8.090024937655862e-06, + "loss": 0.3811, + "step": 47780 + }, + { + "epoch": 11.917705735660848, + "grad_norm": 11.933865547180176, + "learning_rate": 8.087531172069825e-06, + "loss": 0.4131, + "step": 47790 + }, + { + "epoch": 11.920199501246882, + "grad_norm": 8.930726051330566, + "learning_rate": 8.08503740648379e-06, + "loss": 0.3954, + "step": 47800 + }, + { + "epoch": 11.922693266832917, + "grad_norm": 7.120776653289795, + "learning_rate": 8.082543640897756e-06, + "loss": 0.3536, + "step": 47810 + }, + { + "epoch": 11.925187032418952, + "grad_norm": 6.882978439331055, + "learning_rate": 8.080049875311721e-06, + "loss": 0.351, + "step": 47820 + }, + { + "epoch": 11.927680798004987, + "grad_norm": 7.883481502532959, + "learning_rate": 8.077556109725686e-06, + "loss": 0.3385, + "step": 47830 + }, + { + "epoch": 11.930174563591022, + "grad_norm": 8.083521842956543, + "learning_rate": 8.075062344139652e-06, + "loss": 0.31, + "step": 47840 + }, + { + "epoch": 11.932668329177057, + "grad_norm": 7.6285080909729, + "learning_rate": 8.072568578553617e-06, + "loss": 0.3895, + "step": 47850 + }, + { + "epoch": 11.935162094763092, + "grad_norm": 5.246705532073975, + "learning_rate": 8.070074812967582e-06, + "loss": 0.3273, + "step": 47860 + }, + { + "epoch": 11.937655860349127, + "grad_norm": 7.005518913269043, + "learning_rate": 8.067581047381546e-06, + "loss": 0.3466, + "step": 47870 + }, + { + "epoch": 11.940149625935161, + "grad_norm": 10.184428215026855, + "learning_rate": 8.065087281795511e-06, + "loss": 0.3226, + "step": 47880 + }, + { + "epoch": 11.942643391521196, + "grad_norm": 6.404749870300293, + "learning_rate": 8.062593516209477e-06, + "loss": 0.3122, + "step": 47890 + }, + { + "epoch": 11.945137157107231, + "grad_norm": 9.22337818145752, + "learning_rate": 8.060099750623442e-06, + "loss": 0.4094, + "step": 47900 + }, + { + "epoch": 11.947630922693268, + "grad_norm": 6.901223182678223, + "learning_rate": 8.057605985037407e-06, + "loss": 0.3463, + "step": 47910 + }, + { + "epoch": 11.950124688279303, + "grad_norm": 7.0179033279418945, + "learning_rate": 8.055112219451373e-06, + "loss": 0.335, + "step": 47920 + }, + { + "epoch": 11.952618453865338, + "grad_norm": 8.986421585083008, + "learning_rate": 8.052618453865338e-06, + "loss": 0.3582, + "step": 47930 + }, + { + "epoch": 11.955112219451372, + "grad_norm": 7.0583577156066895, + "learning_rate": 8.050124688279303e-06, + "loss": 0.3613, + "step": 47940 + }, + { + "epoch": 11.957605985037407, + "grad_norm": 6.976990222930908, + "learning_rate": 8.047630922693267e-06, + "loss": 0.3075, + "step": 47950 + }, + { + "epoch": 11.960099750623442, + "grad_norm": 7.128075122833252, + "learning_rate": 8.045137157107232e-06, + "loss": 0.4033, + "step": 47960 + }, + { + "epoch": 11.962593516209477, + "grad_norm": 7.31989049911499, + "learning_rate": 8.042643391521197e-06, + "loss": 0.3093, + "step": 47970 + }, + { + "epoch": 11.965087281795512, + "grad_norm": 4.972882270812988, + "learning_rate": 8.040149625935163e-06, + "loss": 0.299, + "step": 47980 + }, + { + "epoch": 11.967581047381547, + "grad_norm": 11.064085006713867, + "learning_rate": 8.037655860349128e-06, + "loss": 0.3248, + "step": 47990 + }, + { + "epoch": 11.970074812967582, + "grad_norm": 13.1663179397583, + "learning_rate": 8.035162094763093e-06, + "loss": 0.3242, + "step": 48000 + }, + { + "epoch": 11.972568578553616, + "grad_norm": 5.935441970825195, + "learning_rate": 8.032668329177059e-06, + "loss": 0.3544, + "step": 48010 + }, + { + "epoch": 11.975062344139651, + "grad_norm": 7.8656840324401855, + "learning_rate": 8.030174563591024e-06, + "loss": 0.2541, + "step": 48020 + }, + { + "epoch": 11.977556109725686, + "grad_norm": 11.03922176361084, + "learning_rate": 8.02768079800499e-06, + "loss": 0.3548, + "step": 48030 + }, + { + "epoch": 11.980049875311721, + "grad_norm": 8.151224136352539, + "learning_rate": 8.025187032418953e-06, + "loss": 0.3088, + "step": 48040 + }, + { + "epoch": 11.982543640897756, + "grad_norm": 9.86962890625, + "learning_rate": 8.022693266832918e-06, + "loss": 0.3831, + "step": 48050 + }, + { + "epoch": 11.98503740648379, + "grad_norm": 6.20054292678833, + "learning_rate": 8.020199501246884e-06, + "loss": 0.3202, + "step": 48060 + }, + { + "epoch": 11.987531172069826, + "grad_norm": 7.702700614929199, + "learning_rate": 8.017705735660849e-06, + "loss": 0.3719, + "step": 48070 + }, + { + "epoch": 11.99002493765586, + "grad_norm": 8.15425968170166, + "learning_rate": 8.015211970074812e-06, + "loss": 0.3887, + "step": 48080 + }, + { + "epoch": 11.992518703241895, + "grad_norm": 4.551555633544922, + "learning_rate": 8.012718204488778e-06, + "loss": 0.3482, + "step": 48090 + }, + { + "epoch": 11.99501246882793, + "grad_norm": 5.3351640701293945, + "learning_rate": 8.010224438902745e-06, + "loss": 0.3449, + "step": 48100 + }, + { + "epoch": 11.997506234413965, + "grad_norm": 9.141931533813477, + "learning_rate": 8.00773067331671e-06, + "loss": 0.3044, + "step": 48110 + }, + { + "epoch": 12.0, + "grad_norm": 4.466921806335449, + "learning_rate": 8.005236907730674e-06, + "loss": 0.3335, + "step": 48120 + }, + { + "epoch": 12.0, + "eval_loss": 0.41446128487586975, + "eval_runtime": 60.0916, + "eval_samples_per_second": 16.691, + "eval_steps_per_second": 16.691, + "step": 48120 + }, + { + "epoch": 12.002493765586035, + "grad_norm": 7.059330463409424, + "learning_rate": 8.002743142144639e-06, + "loss": 0.3357, + "step": 48130 + }, + { + "epoch": 12.00498753117207, + "grad_norm": 7.653445720672607, + "learning_rate": 8.000249376558604e-06, + "loss": 0.4164, + "step": 48140 + }, + { + "epoch": 12.007481296758105, + "grad_norm": 6.720803737640381, + "learning_rate": 7.99775561097257e-06, + "loss": 0.3236, + "step": 48150 + }, + { + "epoch": 12.00997506234414, + "grad_norm": 5.9595746994018555, + "learning_rate": 7.995261845386533e-06, + "loss": 0.3662, + "step": 48160 + }, + { + "epoch": 12.012468827930174, + "grad_norm": 6.095608234405518, + "learning_rate": 7.992768079800499e-06, + "loss": 0.3405, + "step": 48170 + }, + { + "epoch": 12.01496259351621, + "grad_norm": 8.565361022949219, + "learning_rate": 7.990274314214464e-06, + "loss": 0.2904, + "step": 48180 + }, + { + "epoch": 12.017456359102244, + "grad_norm": 6.547665119171143, + "learning_rate": 7.987780548628429e-06, + "loss": 0.3475, + "step": 48190 + }, + { + "epoch": 12.019950124688279, + "grad_norm": 10.47833251953125, + "learning_rate": 7.985286783042394e-06, + "loss": 0.3089, + "step": 48200 + }, + { + "epoch": 12.022443890274314, + "grad_norm": 15.010177612304688, + "learning_rate": 7.98279301745636e-06, + "loss": 0.3161, + "step": 48210 + }, + { + "epoch": 12.024937655860349, + "grad_norm": 9.223240852355957, + "learning_rate": 7.980299251870325e-06, + "loss": 0.3958, + "step": 48220 + }, + { + "epoch": 12.027431421446384, + "grad_norm": 7.6100382804870605, + "learning_rate": 7.97780548628429e-06, + "loss": 0.3337, + "step": 48230 + }, + { + "epoch": 12.029925187032418, + "grad_norm": 8.66632080078125, + "learning_rate": 7.975311720698256e-06, + "loss": 0.3571, + "step": 48240 + }, + { + "epoch": 12.032418952618453, + "grad_norm": 8.049689292907715, + "learning_rate": 7.97281795511222e-06, + "loss": 0.398, + "step": 48250 + }, + { + "epoch": 12.034912718204488, + "grad_norm": 9.214011192321777, + "learning_rate": 7.970324189526185e-06, + "loss": 0.3354, + "step": 48260 + }, + { + "epoch": 12.037406483790523, + "grad_norm": 7.5215229988098145, + "learning_rate": 7.96783042394015e-06, + "loss": 0.3055, + "step": 48270 + }, + { + "epoch": 12.039900249376558, + "grad_norm": 5.8552327156066895, + "learning_rate": 7.965336658354115e-06, + "loss": 0.3736, + "step": 48280 + }, + { + "epoch": 12.042394014962593, + "grad_norm": 8.036945343017578, + "learning_rate": 7.96284289276808e-06, + "loss": 0.3435, + "step": 48290 + }, + { + "epoch": 12.044887780548628, + "grad_norm": 9.113265037536621, + "learning_rate": 7.960349127182046e-06, + "loss": 0.3947, + "step": 48300 + }, + { + "epoch": 12.047381546134662, + "grad_norm": 8.261163711547852, + "learning_rate": 7.957855361596011e-06, + "loss": 0.3772, + "step": 48310 + }, + { + "epoch": 12.049875311720697, + "grad_norm": 8.655675888061523, + "learning_rate": 7.955361596009976e-06, + "loss": 0.4154, + "step": 48320 + }, + { + "epoch": 12.052369077306734, + "grad_norm": 7.443731307983398, + "learning_rate": 7.95286783042394e-06, + "loss": 0.3444, + "step": 48330 + }, + { + "epoch": 12.054862842892769, + "grad_norm": 8.582282066345215, + "learning_rate": 7.950374064837905e-06, + "loss": 0.3394, + "step": 48340 + }, + { + "epoch": 12.057356608478804, + "grad_norm": 4.728538513183594, + "learning_rate": 7.94788029925187e-06, + "loss": 0.334, + "step": 48350 + }, + { + "epoch": 12.059850374064839, + "grad_norm": 7.743102073669434, + "learning_rate": 7.945386533665836e-06, + "loss": 0.2972, + "step": 48360 + }, + { + "epoch": 12.062344139650873, + "grad_norm": 5.460146427154541, + "learning_rate": 7.942892768079801e-06, + "loss": 0.3959, + "step": 48370 + }, + { + "epoch": 12.064837905236908, + "grad_norm": 9.469954490661621, + "learning_rate": 7.940399002493767e-06, + "loss": 0.3682, + "step": 48380 + }, + { + "epoch": 12.067331670822943, + "grad_norm": 8.519420623779297, + "learning_rate": 7.937905236907732e-06, + "loss": 0.3083, + "step": 48390 + }, + { + "epoch": 12.069825436408978, + "grad_norm": 5.299350261688232, + "learning_rate": 7.935411471321697e-06, + "loss": 0.3441, + "step": 48400 + }, + { + "epoch": 12.072319201995013, + "grad_norm": 7.708430290222168, + "learning_rate": 7.932917705735661e-06, + "loss": 0.3613, + "step": 48410 + }, + { + "epoch": 12.074812967581048, + "grad_norm": 8.682879447937012, + "learning_rate": 7.930423940149626e-06, + "loss": 0.3806, + "step": 48420 + }, + { + "epoch": 12.077306733167083, + "grad_norm": 7.677736282348633, + "learning_rate": 7.927930174563591e-06, + "loss": 0.3095, + "step": 48430 + }, + { + "epoch": 12.079800498753118, + "grad_norm": 6.203550338745117, + "learning_rate": 7.925436408977557e-06, + "loss": 0.3583, + "step": 48440 + }, + { + "epoch": 12.082294264339152, + "grad_norm": 9.091981887817383, + "learning_rate": 7.922942643391522e-06, + "loss": 0.3517, + "step": 48450 + }, + { + "epoch": 12.084788029925187, + "grad_norm": 6.73786735534668, + "learning_rate": 7.920448877805487e-06, + "loss": 0.3773, + "step": 48460 + }, + { + "epoch": 12.087281795511222, + "grad_norm": 10.410232543945312, + "learning_rate": 7.917955112219453e-06, + "loss": 0.3852, + "step": 48470 + }, + { + "epoch": 12.089775561097257, + "grad_norm": 6.987138271331787, + "learning_rate": 7.915461346633418e-06, + "loss": 0.3285, + "step": 48480 + }, + { + "epoch": 12.092269326683292, + "grad_norm": 6.348522663116455, + "learning_rate": 7.912967581047383e-06, + "loss": 0.3437, + "step": 48490 + }, + { + "epoch": 12.094763092269327, + "grad_norm": 5.560189247131348, + "learning_rate": 7.910473815461347e-06, + "loss": 0.326, + "step": 48500 + }, + { + "epoch": 12.097256857855362, + "grad_norm": 6.605633735656738, + "learning_rate": 7.907980049875312e-06, + "loss": 0.3387, + "step": 48510 + }, + { + "epoch": 12.099750623441397, + "grad_norm": 11.42223072052002, + "learning_rate": 7.905486284289278e-06, + "loss": 0.4234, + "step": 48520 + }, + { + "epoch": 12.102244389027431, + "grad_norm": 7.262038707733154, + "learning_rate": 7.902992518703243e-06, + "loss": 0.3345, + "step": 48530 + }, + { + "epoch": 12.104738154613466, + "grad_norm": 10.022313117980957, + "learning_rate": 7.900498753117207e-06, + "loss": 0.3145, + "step": 48540 + }, + { + "epoch": 12.107231920199501, + "grad_norm": 8.85555362701416, + "learning_rate": 7.898004987531172e-06, + "loss": 0.2839, + "step": 48550 + }, + { + "epoch": 12.109725685785536, + "grad_norm": 7.570572376251221, + "learning_rate": 7.895511221945137e-06, + "loss": 0.3078, + "step": 48560 + }, + { + "epoch": 12.11221945137157, + "grad_norm": 7.913335800170898, + "learning_rate": 7.893017456359104e-06, + "loss": 0.314, + "step": 48570 + }, + { + "epoch": 12.114713216957606, + "grad_norm": 7.83607292175293, + "learning_rate": 7.890523690773068e-06, + "loss": 0.2858, + "step": 48580 + }, + { + "epoch": 12.11720698254364, + "grad_norm": 9.3693208694458, + "learning_rate": 7.888279301745636e-06, + "loss": 0.2934, + "step": 48590 + }, + { + "epoch": 12.119700748129675, + "grad_norm": 6.200734615325928, + "learning_rate": 7.885785536159601e-06, + "loss": 0.409, + "step": 48600 + }, + { + "epoch": 12.12219451371571, + "grad_norm": 4.377243518829346, + "learning_rate": 7.883291770573567e-06, + "loss": 0.2854, + "step": 48610 + }, + { + "epoch": 12.124688279301745, + "grad_norm": 9.177326202392578, + "learning_rate": 7.880798004987532e-06, + "loss": 0.3049, + "step": 48620 + }, + { + "epoch": 12.12718204488778, + "grad_norm": 7.598722457885742, + "learning_rate": 7.878304239401496e-06, + "loss": 0.3693, + "step": 48630 + }, + { + "epoch": 12.129675810473815, + "grad_norm": 8.089449882507324, + "learning_rate": 7.875810473815461e-06, + "loss": 0.3015, + "step": 48640 + }, + { + "epoch": 12.13216957605985, + "grad_norm": 6.351145267486572, + "learning_rate": 7.873316708229428e-06, + "loss": 0.3215, + "step": 48650 + }, + { + "epoch": 12.134663341645885, + "grad_norm": 4.465890884399414, + "learning_rate": 7.870822942643393e-06, + "loss": 0.3268, + "step": 48660 + }, + { + "epoch": 12.13715710723192, + "grad_norm": 7.8744707107543945, + "learning_rate": 7.868329177057359e-06, + "loss": 0.3502, + "step": 48670 + }, + { + "epoch": 12.139650872817954, + "grad_norm": 6.566856384277344, + "learning_rate": 7.865835411471322e-06, + "loss": 0.2754, + "step": 48680 + }, + { + "epoch": 12.14214463840399, + "grad_norm": 8.633282661437988, + "learning_rate": 7.863341645885287e-06, + "loss": 0.3319, + "step": 48690 + }, + { + "epoch": 12.144638403990024, + "grad_norm": 5.682807922363281, + "learning_rate": 7.860847880299253e-06, + "loss": 0.3287, + "step": 48700 + }, + { + "epoch": 12.147132169576059, + "grad_norm": 6.86955451965332, + "learning_rate": 7.858354114713218e-06, + "loss": 0.3137, + "step": 48710 + }, + { + "epoch": 12.149625935162096, + "grad_norm": 8.783101081848145, + "learning_rate": 7.855860349127182e-06, + "loss": 0.4136, + "step": 48720 + }, + { + "epoch": 12.15211970074813, + "grad_norm": 3.7242095470428467, + "learning_rate": 7.853366583541147e-06, + "loss": 0.2878, + "step": 48730 + }, + { + "epoch": 12.154613466334165, + "grad_norm": 7.640246391296387, + "learning_rate": 7.850872817955112e-06, + "loss": 0.3841, + "step": 48740 + }, + { + "epoch": 12.1571072319202, + "grad_norm": 11.29294490814209, + "learning_rate": 7.848379052369078e-06, + "loss": 0.3812, + "step": 48750 + }, + { + "epoch": 12.159600997506235, + "grad_norm": 7.655768871307373, + "learning_rate": 7.845885286783043e-06, + "loss": 0.3404, + "step": 48760 + }, + { + "epoch": 12.16209476309227, + "grad_norm": 6.344391822814941, + "learning_rate": 7.843391521197008e-06, + "loss": 0.3118, + "step": 48770 + }, + { + "epoch": 12.164588528678305, + "grad_norm": 9.420690536499023, + "learning_rate": 7.840897755610974e-06, + "loss": 0.3424, + "step": 48780 + }, + { + "epoch": 12.16708229426434, + "grad_norm": 7.383553981781006, + "learning_rate": 7.838403990024939e-06, + "loss": 0.3298, + "step": 48790 + }, + { + "epoch": 12.169576059850375, + "grad_norm": 6.917516708374023, + "learning_rate": 7.835910224438902e-06, + "loss": 0.3557, + "step": 48800 + }, + { + "epoch": 12.17206982543641, + "grad_norm": 7.802567005157471, + "learning_rate": 7.833416458852868e-06, + "loss": 0.4077, + "step": 48810 + }, + { + "epoch": 12.174563591022444, + "grad_norm": 6.911114692687988, + "learning_rate": 7.830922693266833e-06, + "loss": 0.2852, + "step": 48820 + }, + { + "epoch": 12.17705735660848, + "grad_norm": 10.203409194946289, + "learning_rate": 7.828428927680798e-06, + "loss": 0.3605, + "step": 48830 + }, + { + "epoch": 12.179551122194514, + "grad_norm": 9.16174030303955, + "learning_rate": 7.825935162094764e-06, + "loss": 0.3409, + "step": 48840 + }, + { + "epoch": 12.182044887780549, + "grad_norm": 6.887585639953613, + "learning_rate": 7.823441396508729e-06, + "loss": 0.3659, + "step": 48850 + }, + { + "epoch": 12.184538653366584, + "grad_norm": 7.380236625671387, + "learning_rate": 7.820947630922694e-06, + "loss": 0.3259, + "step": 48860 + }, + { + "epoch": 12.187032418952619, + "grad_norm": 5.912197589874268, + "learning_rate": 7.81845386533666e-06, + "loss": 0.2935, + "step": 48870 + }, + { + "epoch": 12.189526184538654, + "grad_norm": 10.71356201171875, + "learning_rate": 7.815960099750623e-06, + "loss": 0.3154, + "step": 48880 + }, + { + "epoch": 12.192019950124688, + "grad_norm": 9.142561912536621, + "learning_rate": 7.813466334164589e-06, + "loss": 0.3491, + "step": 48890 + }, + { + "epoch": 12.194513715710723, + "grad_norm": 5.923270225524902, + "learning_rate": 7.810972568578554e-06, + "loss": 0.3462, + "step": 48900 + }, + { + "epoch": 12.197007481296758, + "grad_norm": 8.043718338012695, + "learning_rate": 7.80847880299252e-06, + "loss": 0.3948, + "step": 48910 + }, + { + "epoch": 12.199501246882793, + "grad_norm": 10.630451202392578, + "learning_rate": 7.805985037406484e-06, + "loss": 0.2927, + "step": 48920 + }, + { + "epoch": 12.201995012468828, + "grad_norm": 8.33590316772461, + "learning_rate": 7.80349127182045e-06, + "loss": 0.3302, + "step": 48930 + }, + { + "epoch": 12.204488778054863, + "grad_norm": 4.248763084411621, + "learning_rate": 7.800997506234415e-06, + "loss": 0.2981, + "step": 48940 + }, + { + "epoch": 12.206982543640898, + "grad_norm": 7.462469577789307, + "learning_rate": 7.79850374064838e-06, + "loss": 0.353, + "step": 48950 + }, + { + "epoch": 12.209476309226932, + "grad_norm": 7.565711975097656, + "learning_rate": 7.796009975062346e-06, + "loss": 0.3257, + "step": 48960 + }, + { + "epoch": 12.211970074812967, + "grad_norm": 5.535632133483887, + "learning_rate": 7.79351620947631e-06, + "loss": 0.318, + "step": 48970 + }, + { + "epoch": 12.214463840399002, + "grad_norm": 8.999574661254883, + "learning_rate": 7.791022443890275e-06, + "loss": 0.3884, + "step": 48980 + }, + { + "epoch": 12.216957605985037, + "grad_norm": 7.833513259887695, + "learning_rate": 7.78852867830424e-06, + "loss": 0.3175, + "step": 48990 + }, + { + "epoch": 12.219451371571072, + "grad_norm": 6.90658712387085, + "learning_rate": 7.786034912718205e-06, + "loss": 0.2902, + "step": 49000 + }, + { + "epoch": 12.221945137157107, + "grad_norm": 10.725184440612793, + "learning_rate": 7.78354114713217e-06, + "loss": 0.3352, + "step": 49010 + }, + { + "epoch": 12.224438902743142, + "grad_norm": 5.924967288970947, + "learning_rate": 7.781047381546136e-06, + "loss": 0.3747, + "step": 49020 + }, + { + "epoch": 12.226932668329177, + "grad_norm": 15.080745697021484, + "learning_rate": 7.778553615960101e-06, + "loss": 0.3795, + "step": 49030 + }, + { + "epoch": 12.229426433915211, + "grad_norm": 9.706949234008789, + "learning_rate": 7.776059850374066e-06, + "loss": 0.3756, + "step": 49040 + }, + { + "epoch": 12.231920199501246, + "grad_norm": 6.478403091430664, + "learning_rate": 7.77356608478803e-06, + "loss": 0.3344, + "step": 49050 + }, + { + "epoch": 12.234413965087281, + "grad_norm": 20.941816329956055, + "learning_rate": 7.771072319201995e-06, + "loss": 0.3319, + "step": 49060 + }, + { + "epoch": 12.236907730673316, + "grad_norm": 4.482845306396484, + "learning_rate": 7.76857855361596e-06, + "loss": 0.3203, + "step": 49070 + }, + { + "epoch": 12.239401496259351, + "grad_norm": 5.751954078674316, + "learning_rate": 7.766084788029926e-06, + "loss": 0.3554, + "step": 49080 + }, + { + "epoch": 12.241895261845386, + "grad_norm": 7.152318477630615, + "learning_rate": 7.76359102244389e-06, + "loss": 0.3193, + "step": 49090 + }, + { + "epoch": 12.24438902743142, + "grad_norm": 7.085709571838379, + "learning_rate": 7.761097256857855e-06, + "loss": 0.3458, + "step": 49100 + }, + { + "epoch": 12.246882793017456, + "grad_norm": 7.26195764541626, + "learning_rate": 7.75860349127182e-06, + "loss": 0.3644, + "step": 49110 + }, + { + "epoch": 12.24937655860349, + "grad_norm": 5.610538959503174, + "learning_rate": 7.756109725685787e-06, + "loss": 0.3654, + "step": 49120 + }, + { + "epoch": 12.251870324189527, + "grad_norm": 6.579119682312012, + "learning_rate": 7.753615960099751e-06, + "loss": 0.3116, + "step": 49130 + }, + { + "epoch": 12.254364089775562, + "grad_norm": 9.59024429321289, + "learning_rate": 7.751122194513716e-06, + "loss": 0.3193, + "step": 49140 + }, + { + "epoch": 12.256857855361597, + "grad_norm": 7.97966194152832, + "learning_rate": 7.748628428927682e-06, + "loss": 0.3293, + "step": 49150 + }, + { + "epoch": 12.259351620947632, + "grad_norm": 9.137805938720703, + "learning_rate": 7.746134663341647e-06, + "loss": 0.409, + "step": 49160 + }, + { + "epoch": 12.261845386533667, + "grad_norm": 9.003076553344727, + "learning_rate": 7.743640897755612e-06, + "loss": 0.3663, + "step": 49170 + }, + { + "epoch": 12.264339152119701, + "grad_norm": 9.627756118774414, + "learning_rate": 7.741147132169576e-06, + "loss": 0.3485, + "step": 49180 + }, + { + "epoch": 12.266832917705736, + "grad_norm": 5.549960613250732, + "learning_rate": 7.738653366583541e-06, + "loss": 0.3514, + "step": 49190 + }, + { + "epoch": 12.269326683291771, + "grad_norm": 6.5951080322265625, + "learning_rate": 7.736159600997506e-06, + "loss": 0.3182, + "step": 49200 + }, + { + "epoch": 12.271820448877806, + "grad_norm": 10.914071083068848, + "learning_rate": 7.733665835411472e-06, + "loss": 0.3302, + "step": 49210 + }, + { + "epoch": 12.27431421446384, + "grad_norm": 8.802125930786133, + "learning_rate": 7.731172069825437e-06, + "loss": 0.2761, + "step": 49220 + }, + { + "epoch": 12.276807980049876, + "grad_norm": 7.288644790649414, + "learning_rate": 7.728678304239402e-06, + "loss": 0.365, + "step": 49230 + }, + { + "epoch": 12.27930174563591, + "grad_norm": 6.142082691192627, + "learning_rate": 7.726184538653368e-06, + "loss": 0.2988, + "step": 49240 + }, + { + "epoch": 12.281795511221945, + "grad_norm": 8.087503433227539, + "learning_rate": 7.723690773067333e-06, + "loss": 0.3527, + "step": 49250 + }, + { + "epoch": 12.28428927680798, + "grad_norm": 7.012362480163574, + "learning_rate": 7.721197007481297e-06, + "loss": 0.3552, + "step": 49260 + }, + { + "epoch": 12.286783042394015, + "grad_norm": 5.985617160797119, + "learning_rate": 7.718703241895262e-06, + "loss": 0.2818, + "step": 49270 + }, + { + "epoch": 12.28927680798005, + "grad_norm": 6.581480026245117, + "learning_rate": 7.716209476309227e-06, + "loss": 0.3897, + "step": 49280 + }, + { + "epoch": 12.291770573566085, + "grad_norm": 6.56619930267334, + "learning_rate": 7.713965087281795e-06, + "loss": 0.4206, + "step": 49290 + }, + { + "epoch": 12.29426433915212, + "grad_norm": 12.421707153320312, + "learning_rate": 7.71147132169576e-06, + "loss": 0.3616, + "step": 49300 + }, + { + "epoch": 12.296758104738155, + "grad_norm": 5.2989726066589355, + "learning_rate": 7.708977556109726e-06, + "loss": 0.3153, + "step": 49310 + }, + { + "epoch": 12.29925187032419, + "grad_norm": 8.248827934265137, + "learning_rate": 7.706483790523691e-06, + "loss": 0.4233, + "step": 49320 + }, + { + "epoch": 12.301745635910224, + "grad_norm": 9.170528411865234, + "learning_rate": 7.703990024937657e-06, + "loss": 0.3728, + "step": 49330 + }, + { + "epoch": 12.30423940149626, + "grad_norm": 8.818942070007324, + "learning_rate": 7.701496259351622e-06, + "loss": 0.3719, + "step": 49340 + }, + { + "epoch": 12.306733167082294, + "grad_norm": 4.5717997550964355, + "learning_rate": 7.699002493765587e-06, + "loss": 0.3448, + "step": 49350 + }, + { + "epoch": 12.309226932668329, + "grad_norm": 7.510899543762207, + "learning_rate": 7.696508728179551e-06, + "loss": 0.3549, + "step": 49360 + }, + { + "epoch": 12.311720698254364, + "grad_norm": 9.351420402526855, + "learning_rate": 7.694014962593516e-06, + "loss": 0.2777, + "step": 49370 + }, + { + "epoch": 12.314214463840399, + "grad_norm": 5.524614334106445, + "learning_rate": 7.691521197007482e-06, + "loss": 0.3373, + "step": 49380 + }, + { + "epoch": 12.316708229426434, + "grad_norm": 6.964102268218994, + "learning_rate": 7.689027431421447e-06, + "loss": 0.3343, + "step": 49390 + }, + { + "epoch": 12.319201995012468, + "grad_norm": 5.369339466094971, + "learning_rate": 7.686533665835412e-06, + "loss": 0.2794, + "step": 49400 + }, + { + "epoch": 12.321695760598503, + "grad_norm": 8.168196678161621, + "learning_rate": 7.684039900249377e-06, + "loss": 0.3399, + "step": 49410 + }, + { + "epoch": 12.324189526184538, + "grad_norm": 8.461885452270508, + "learning_rate": 7.681546134663343e-06, + "loss": 0.363, + "step": 49420 + }, + { + "epoch": 12.326683291770573, + "grad_norm": 8.73070240020752, + "learning_rate": 7.679052369077308e-06, + "loss": 0.352, + "step": 49430 + }, + { + "epoch": 12.329177057356608, + "grad_norm": 5.573559284210205, + "learning_rate": 7.676558603491272e-06, + "loss": 0.2903, + "step": 49440 + }, + { + "epoch": 12.331670822942643, + "grad_norm": 8.736505508422852, + "learning_rate": 7.674064837905237e-06, + "loss": 0.3443, + "step": 49450 + }, + { + "epoch": 12.334164588528678, + "grad_norm": 8.074347496032715, + "learning_rate": 7.671571072319202e-06, + "loss": 0.3055, + "step": 49460 + }, + { + "epoch": 12.336658354114713, + "grad_norm": 10.68688678741455, + "learning_rate": 7.669077306733168e-06, + "loss": 0.3402, + "step": 49470 + }, + { + "epoch": 12.339152119700747, + "grad_norm": 6.418087005615234, + "learning_rate": 7.666583541147133e-06, + "loss": 0.3485, + "step": 49480 + }, + { + "epoch": 12.341645885286782, + "grad_norm": 6.3780903816223145, + "learning_rate": 7.664089775561098e-06, + "loss": 0.3058, + "step": 49490 + }, + { + "epoch": 12.344139650872817, + "grad_norm": 6.528293132781982, + "learning_rate": 7.661596009975064e-06, + "loss": 0.3228, + "step": 49500 + }, + { + "epoch": 12.346633416458852, + "grad_norm": 12.134942054748535, + "learning_rate": 7.659102244389029e-06, + "loss": 0.3832, + "step": 49510 + }, + { + "epoch": 12.349127182044889, + "grad_norm": 9.033289909362793, + "learning_rate": 7.656608478802992e-06, + "loss": 0.3629, + "step": 49520 + }, + { + "epoch": 12.351620947630924, + "grad_norm": 5.786545276641846, + "learning_rate": 7.654114713216958e-06, + "loss": 0.386, + "step": 49530 + }, + { + "epoch": 12.354114713216958, + "grad_norm": 5.447564601898193, + "learning_rate": 7.651620947630923e-06, + "loss": 0.2737, + "step": 49540 + }, + { + "epoch": 12.356608478802993, + "grad_norm": 6.9369916915893555, + "learning_rate": 7.649127182044888e-06, + "loss": 0.4029, + "step": 49550 + }, + { + "epoch": 12.359102244389028, + "grad_norm": 5.646476745605469, + "learning_rate": 7.646633416458854e-06, + "loss": 0.3803, + "step": 49560 + }, + { + "epoch": 12.361596009975063, + "grad_norm": 8.804864883422852, + "learning_rate": 7.644139650872819e-06, + "loss": 0.3256, + "step": 49570 + }, + { + "epoch": 12.364089775561098, + "grad_norm": 4.971907615661621, + "learning_rate": 7.641645885286784e-06, + "loss": 0.3518, + "step": 49580 + }, + { + "epoch": 12.366583541147133, + "grad_norm": 7.365206718444824, + "learning_rate": 7.63915211970075e-06, + "loss": 0.3382, + "step": 49590 + }, + { + "epoch": 12.369077306733168, + "grad_norm": 4.726197719573975, + "learning_rate": 7.636658354114715e-06, + "loss": 0.325, + "step": 49600 + }, + { + "epoch": 12.371571072319203, + "grad_norm": 8.982644081115723, + "learning_rate": 7.634164588528679e-06, + "loss": 0.3807, + "step": 49610 + }, + { + "epoch": 12.374064837905237, + "grad_norm": 6.123873233795166, + "learning_rate": 7.631670822942644e-06, + "loss": 0.3543, + "step": 49620 + }, + { + "epoch": 12.376558603491272, + "grad_norm": 7.114482879638672, + "learning_rate": 7.629177057356609e-06, + "loss": 0.3948, + "step": 49630 + }, + { + "epoch": 12.379052369077307, + "grad_norm": 6.665993690490723, + "learning_rate": 7.6266832917705745e-06, + "loss": 0.2877, + "step": 49640 + }, + { + "epoch": 12.381546134663342, + "grad_norm": 8.79172134399414, + "learning_rate": 7.624189526184539e-06, + "loss": 0.3899, + "step": 49650 + }, + { + "epoch": 12.384039900249377, + "grad_norm": 6.153012752532959, + "learning_rate": 7.621695760598504e-06, + "loss": 0.318, + "step": 49660 + }, + { + "epoch": 12.386533665835412, + "grad_norm": 8.169590950012207, + "learning_rate": 7.6192019950124696e-06, + "loss": 0.3068, + "step": 49670 + }, + { + "epoch": 12.389027431421447, + "grad_norm": 8.219849586486816, + "learning_rate": 7.616708229426435e-06, + "loss": 0.3231, + "step": 49680 + }, + { + "epoch": 12.391521197007481, + "grad_norm": 7.9375152587890625, + "learning_rate": 7.614214463840399e-06, + "loss": 0.2944, + "step": 49690 + }, + { + "epoch": 12.394014962593516, + "grad_norm": 6.999446392059326, + "learning_rate": 7.611720698254365e-06, + "loss": 0.3445, + "step": 49700 + }, + { + "epoch": 12.396508728179551, + "grad_norm": 5.5128278732299805, + "learning_rate": 7.60922693266833e-06, + "loss": 0.3404, + "step": 49710 + }, + { + "epoch": 12.399002493765586, + "grad_norm": 17.4468994140625, + "learning_rate": 7.606733167082295e-06, + "loss": 0.3414, + "step": 49720 + }, + { + "epoch": 12.401496259351621, + "grad_norm": 7.349917888641357, + "learning_rate": 7.60423940149626e-06, + "loss": 0.4121, + "step": 49730 + }, + { + "epoch": 12.403990024937656, + "grad_norm": 7.664532661437988, + "learning_rate": 7.601745635910225e-06, + "loss": 0.3575, + "step": 49740 + }, + { + "epoch": 12.40648379052369, + "grad_norm": 7.233582496643066, + "learning_rate": 7.59925187032419e-06, + "loss": 0.3506, + "step": 49750 + }, + { + "epoch": 12.408977556109726, + "grad_norm": 5.654517650604248, + "learning_rate": 7.596758104738156e-06, + "loss": 0.3195, + "step": 49760 + }, + { + "epoch": 12.41147132169576, + "grad_norm": 10.009135246276855, + "learning_rate": 7.59426433915212e-06, + "loss": 0.2798, + "step": 49770 + }, + { + "epoch": 12.413965087281795, + "grad_norm": 6.770684719085693, + "learning_rate": 7.5917705735660854e-06, + "loss": 0.3402, + "step": 49780 + }, + { + "epoch": 12.41645885286783, + "grad_norm": 7.9065093994140625, + "learning_rate": 7.589276807980051e-06, + "loss": 0.2968, + "step": 49790 + }, + { + "epoch": 12.418952618453865, + "grad_norm": 6.457505226135254, + "learning_rate": 7.586783042394016e-06, + "loss": 0.2829, + "step": 49800 + }, + { + "epoch": 12.4214463840399, + "grad_norm": 10.425678253173828, + "learning_rate": 7.584289276807981e-06, + "loss": 0.3081, + "step": 49810 + }, + { + "epoch": 12.423940149625935, + "grad_norm": 9.779804229736328, + "learning_rate": 7.581795511221946e-06, + "loss": 0.3627, + "step": 49820 + }, + { + "epoch": 12.42643391521197, + "grad_norm": 8.749654769897461, + "learning_rate": 7.579301745635911e-06, + "loss": 0.31, + "step": 49830 + }, + { + "epoch": 12.428927680798004, + "grad_norm": 5.994139194488525, + "learning_rate": 7.5768079800498764e-06, + "loss": 0.3538, + "step": 49840 + }, + { + "epoch": 12.43142144638404, + "grad_norm": 9.143582344055176, + "learning_rate": 7.574314214463842e-06, + "loss": 0.3849, + "step": 49850 + }, + { + "epoch": 12.433915211970074, + "grad_norm": 5.614033222198486, + "learning_rate": 7.571820448877805e-06, + "loss": 0.3452, + "step": 49860 + }, + { + "epoch": 12.436408977556109, + "grad_norm": 5.022126197814941, + "learning_rate": 7.5693266832917715e-06, + "loss": 0.3065, + "step": 49870 + }, + { + "epoch": 12.438902743142144, + "grad_norm": 8.072562217712402, + "learning_rate": 7.566832917705737e-06, + "loss": 0.3097, + "step": 49880 + }, + { + "epoch": 12.441396508728179, + "grad_norm": 11.745954513549805, + "learning_rate": 7.564339152119702e-06, + "loss": 0.3353, + "step": 49890 + }, + { + "epoch": 12.443890274314214, + "grad_norm": 7.331192970275879, + "learning_rate": 7.561845386533666e-06, + "loss": 0.3727, + "step": 49900 + }, + { + "epoch": 12.446384039900249, + "grad_norm": 8.64783000946045, + "learning_rate": 7.559351620947631e-06, + "loss": 0.3649, + "step": 49910 + }, + { + "epoch": 12.448877805486283, + "grad_norm": 8.688132286071777, + "learning_rate": 7.556857855361596e-06, + "loss": 0.3288, + "step": 49920 + }, + { + "epoch": 12.451371571072318, + "grad_norm": 9.612068176269531, + "learning_rate": 7.5543640897755625e-06, + "loss": 0.3738, + "step": 49930 + }, + { + "epoch": 12.453865336658355, + "grad_norm": 10.88000202178955, + "learning_rate": 7.551870324189526e-06, + "loss": 0.3455, + "step": 49940 + }, + { + "epoch": 12.45635910224439, + "grad_norm": 6.210239887237549, + "learning_rate": 7.5493765586034915e-06, + "loss": 0.3381, + "step": 49950 + }, + { + "epoch": 12.458852867830425, + "grad_norm": 6.904968738555908, + "learning_rate": 7.546882793017457e-06, + "loss": 0.323, + "step": 49960 + }, + { + "epoch": 12.46134663341646, + "grad_norm": 8.996318817138672, + "learning_rate": 7.544389027431422e-06, + "loss": 0.3161, + "step": 49970 + }, + { + "epoch": 12.463840399002494, + "grad_norm": 12.361374855041504, + "learning_rate": 7.5418952618453865e-06, + "loss": 0.4072, + "step": 49980 + }, + { + "epoch": 12.46633416458853, + "grad_norm": 6.888818740844727, + "learning_rate": 7.539401496259352e-06, + "loss": 0.4031, + "step": 49990 + }, + { + "epoch": 12.468827930174564, + "grad_norm": 7.364432334899902, + "learning_rate": 7.536907730673317e-06, + "loss": 0.3472, + "step": 50000 + }, + { + "epoch": 12.471321695760599, + "grad_norm": 7.185758590698242, + "learning_rate": 7.5344139650872825e-06, + "loss": 0.3492, + "step": 50010 + }, + { + "epoch": 12.473815461346634, + "grad_norm": 4.996051788330078, + "learning_rate": 7.531920199501247e-06, + "loss": 0.3285, + "step": 50020 + }, + { + "epoch": 12.476309226932669, + "grad_norm": 9.784656524658203, + "learning_rate": 7.529426433915212e-06, + "loss": 0.3681, + "step": 50030 + }, + { + "epoch": 12.478802992518704, + "grad_norm": 9.993110656738281, + "learning_rate": 7.5269326683291776e-06, + "loss": 0.4345, + "step": 50040 + }, + { + "epoch": 12.481296758104738, + "grad_norm": 7.204555511474609, + "learning_rate": 7.524438902743143e-06, + "loss": 0.336, + "step": 50050 + }, + { + "epoch": 12.483790523690773, + "grad_norm": 5.9999165534973145, + "learning_rate": 7.521945137157108e-06, + "loss": 0.4092, + "step": 50060 + }, + { + "epoch": 12.486284289276808, + "grad_norm": 4.7937445640563965, + "learning_rate": 7.519451371571073e-06, + "loss": 0.3532, + "step": 50070 + }, + { + "epoch": 12.488778054862843, + "grad_norm": 7.650313854217529, + "learning_rate": 7.516957605985038e-06, + "loss": 0.277, + "step": 50080 + }, + { + "epoch": 12.491271820448878, + "grad_norm": 9.389827728271484, + "learning_rate": 7.514463840399003e-06, + "loss": 0.3082, + "step": 50090 + }, + { + "epoch": 12.493765586034913, + "grad_norm": 7.2986578941345215, + "learning_rate": 7.5119700748129686e-06, + "loss": 0.3464, + "step": 50100 + }, + { + "epoch": 12.496259351620948, + "grad_norm": 9.90869140625, + "learning_rate": 7.509476309226933e-06, + "loss": 0.3514, + "step": 50110 + }, + { + "epoch": 12.498753117206983, + "grad_norm": 10.251230239868164, + "learning_rate": 7.506982543640898e-06, + "loss": 0.3623, + "step": 50120 + }, + { + "epoch": 12.501246882793017, + "grad_norm": 6.3379974365234375, + "learning_rate": 7.504488778054864e-06, + "loss": 0.3323, + "step": 50130 + }, + { + "epoch": 12.503740648379052, + "grad_norm": 7.917457580566406, + "learning_rate": 7.501995012468829e-06, + "loss": 0.3967, + "step": 50140 + }, + { + "epoch": 12.506234413965087, + "grad_norm": 10.125364303588867, + "learning_rate": 7.499501246882793e-06, + "loss": 0.3394, + "step": 50150 + }, + { + "epoch": 12.508728179551122, + "grad_norm": 6.0608978271484375, + "learning_rate": 7.497007481296759e-06, + "loss": 0.3227, + "step": 50160 + }, + { + "epoch": 12.511221945137157, + "grad_norm": 6.0084919929504395, + "learning_rate": 7.494513715710724e-06, + "loss": 0.3547, + "step": 50170 + }, + { + "epoch": 12.513715710723192, + "grad_norm": 7.461294174194336, + "learning_rate": 7.492019950124689e-06, + "loss": 0.3247, + "step": 50180 + }, + { + "epoch": 12.516209476309227, + "grad_norm": 8.043869018554688, + "learning_rate": 7.489526184538654e-06, + "loss": 0.3308, + "step": 50190 + }, + { + "epoch": 12.518703241895262, + "grad_norm": 8.986414909362793, + "learning_rate": 7.487032418952619e-06, + "loss": 0.3246, + "step": 50200 + }, + { + "epoch": 12.521197007481296, + "grad_norm": 9.545452117919922, + "learning_rate": 7.4845386533665844e-06, + "loss": 0.3351, + "step": 50210 + }, + { + "epoch": 12.523690773067331, + "grad_norm": 7.211226940155029, + "learning_rate": 7.48204488778055e-06, + "loss": 0.3332, + "step": 50220 + }, + { + "epoch": 12.526184538653366, + "grad_norm": 15.852411270141602, + "learning_rate": 7.479551122194514e-06, + "loss": 0.3832, + "step": 50230 + }, + { + "epoch": 12.528678304239401, + "grad_norm": 11.03563117980957, + "learning_rate": 7.4770573566084795e-06, + "loss": 0.3479, + "step": 50240 + }, + { + "epoch": 12.531172069825436, + "grad_norm": 7.793381214141846, + "learning_rate": 7.474563591022445e-06, + "loss": 0.3877, + "step": 50250 + }, + { + "epoch": 12.53366583541147, + "grad_norm": 6.964505195617676, + "learning_rate": 7.47206982543641e-06, + "loss": 0.3602, + "step": 50260 + }, + { + "epoch": 12.536159600997506, + "grad_norm": 9.570211410522461, + "learning_rate": 7.469576059850374e-06, + "loss": 0.4639, + "step": 50270 + }, + { + "epoch": 12.53865336658354, + "grad_norm": 6.632506370544434, + "learning_rate": 7.46708229426434e-06, + "loss": 0.3231, + "step": 50280 + }, + { + "epoch": 12.541147132169575, + "grad_norm": 12.00216293334961, + "learning_rate": 7.464588528678305e-06, + "loss": 0.3751, + "step": 50290 + }, + { + "epoch": 12.54364089775561, + "grad_norm": 9.346277236938477, + "learning_rate": 7.4620947630922705e-06, + "loss": 0.384, + "step": 50300 + }, + { + "epoch": 12.546134663341645, + "grad_norm": 11.572287559509277, + "learning_rate": 7.459600997506236e-06, + "loss": 0.3448, + "step": 50310 + }, + { + "epoch": 12.548628428927682, + "grad_norm": 5.089780807495117, + "learning_rate": 7.4571072319201995e-06, + "loss": 0.3452, + "step": 50320 + }, + { + "epoch": 12.551122194513717, + "grad_norm": 5.670629024505615, + "learning_rate": 7.454613466334165e-06, + "loss": 0.2836, + "step": 50330 + }, + { + "epoch": 12.553615960099751, + "grad_norm": 8.412741661071777, + "learning_rate": 7.452119700748131e-06, + "loss": 0.3167, + "step": 50340 + }, + { + "epoch": 12.556109725685786, + "grad_norm": 6.057131767272949, + "learning_rate": 7.449625935162096e-06, + "loss": 0.3257, + "step": 50350 + }, + { + "epoch": 12.558603491271821, + "grad_norm": 13.145732879638672, + "learning_rate": 7.44713216957606e-06, + "loss": 0.3762, + "step": 50360 + }, + { + "epoch": 12.561097256857856, + "grad_norm": 6.84805965423584, + "learning_rate": 7.444638403990025e-06, + "loss": 0.3416, + "step": 50370 + }, + { + "epoch": 12.563591022443891, + "grad_norm": 9.685379028320312, + "learning_rate": 7.4421446384039905e-06, + "loss": 0.4172, + "step": 50380 + }, + { + "epoch": 12.566084788029926, + "grad_norm": 10.725334167480469, + "learning_rate": 7.439650872817956e-06, + "loss": 0.3302, + "step": 50390 + }, + { + "epoch": 12.56857855361596, + "grad_norm": 9.620137214660645, + "learning_rate": 7.43715710723192e-06, + "loss": 0.3222, + "step": 50400 + }, + { + "epoch": 12.571072319201996, + "grad_norm": 6.089214324951172, + "learning_rate": 7.4346633416458855e-06, + "loss": 0.2796, + "step": 50410 + }, + { + "epoch": 12.57356608478803, + "grad_norm": 9.723833084106445, + "learning_rate": 7.432169576059851e-06, + "loss": 0.3645, + "step": 50420 + }, + { + "epoch": 12.576059850374065, + "grad_norm": 7.900272369384766, + "learning_rate": 7.429675810473816e-06, + "loss": 0.3341, + "step": 50430 + }, + { + "epoch": 12.5785536159601, + "grad_norm": 5.3839521408081055, + "learning_rate": 7.427182044887781e-06, + "loss": 0.3581, + "step": 50440 + }, + { + "epoch": 12.581047381546135, + "grad_norm": 7.9673848152160645, + "learning_rate": 7.424688279301746e-06, + "loss": 0.3934, + "step": 50450 + }, + { + "epoch": 12.58354114713217, + "grad_norm": 7.145327091217041, + "learning_rate": 7.422194513715711e-06, + "loss": 0.3564, + "step": 50460 + }, + { + "epoch": 12.586034912718205, + "grad_norm": 8.317575454711914, + "learning_rate": 7.4197007481296766e-06, + "loss": 0.3919, + "step": 50470 + }, + { + "epoch": 12.58852867830424, + "grad_norm": 8.809738159179688, + "learning_rate": 7.417206982543641e-06, + "loss": 0.3832, + "step": 50480 + }, + { + "epoch": 12.591022443890274, + "grad_norm": 7.227417945861816, + "learning_rate": 7.414713216957606e-06, + "loss": 0.3157, + "step": 50490 + }, + { + "epoch": 12.59351620947631, + "grad_norm": 7.549991130828857, + "learning_rate": 7.412219451371572e-06, + "loss": 0.3329, + "step": 50500 + }, + { + "epoch": 12.596009975062344, + "grad_norm": 15.35284423828125, + "learning_rate": 7.409725685785537e-06, + "loss": 0.3495, + "step": 50510 + }, + { + "epoch": 12.598503740648379, + "grad_norm": 4.876797199249268, + "learning_rate": 7.407231920199501e-06, + "loss": 0.3429, + "step": 50520 + }, + { + "epoch": 12.600997506234414, + "grad_norm": 6.604551315307617, + "learning_rate": 7.404738154613467e-06, + "loss": 0.3698, + "step": 50530 + }, + { + "epoch": 12.603491271820449, + "grad_norm": 7.2110443115234375, + "learning_rate": 7.402244389027432e-06, + "loss": 0.331, + "step": 50540 + }, + { + "epoch": 12.605985037406484, + "grad_norm": 7.359330654144287, + "learning_rate": 7.399750623441397e-06, + "loss": 0.3591, + "step": 50550 + }, + { + "epoch": 12.608478802992519, + "grad_norm": 5.08826208114624, + "learning_rate": 7.397256857855363e-06, + "loss": 0.2935, + "step": 50560 + }, + { + "epoch": 12.610972568578553, + "grad_norm": 4.144226551055908, + "learning_rate": 7.394763092269327e-06, + "loss": 0.3926, + "step": 50570 + }, + { + "epoch": 12.613466334164588, + "grad_norm": 5.52124547958374, + "learning_rate": 7.392269326683292e-06, + "loss": 0.4041, + "step": 50580 + }, + { + "epoch": 12.615960099750623, + "grad_norm": 13.029586791992188, + "learning_rate": 7.389775561097258e-06, + "loss": 0.3669, + "step": 50590 + }, + { + "epoch": 12.618453865336658, + "grad_norm": 7.825071811676025, + "learning_rate": 7.387281795511223e-06, + "loss": 0.3541, + "step": 50600 + }, + { + "epoch": 12.620947630922693, + "grad_norm": 7.498974323272705, + "learning_rate": 7.3847880299251875e-06, + "loss": 0.2758, + "step": 50610 + }, + { + "epoch": 12.623441396508728, + "grad_norm": 10.242654800415039, + "learning_rate": 7.382294264339153e-06, + "loss": 0.302, + "step": 50620 + }, + { + "epoch": 12.625935162094763, + "grad_norm": 6.927699089050293, + "learning_rate": 7.379800498753118e-06, + "loss": 0.3352, + "step": 50630 + }, + { + "epoch": 12.628428927680797, + "grad_norm": 4.644331932067871, + "learning_rate": 7.3773067331670834e-06, + "loss": 0.3491, + "step": 50640 + }, + { + "epoch": 12.630922693266832, + "grad_norm": 5.701728343963623, + "learning_rate": 7.374812967581048e-06, + "loss": 0.2982, + "step": 50650 + }, + { + "epoch": 12.633416458852867, + "grad_norm": 10.834186553955078, + "learning_rate": 7.372319201995013e-06, + "loss": 0.3135, + "step": 50660 + }, + { + "epoch": 12.635910224438902, + "grad_norm": 9.567243576049805, + "learning_rate": 7.3698254364089785e-06, + "loss": 0.3627, + "step": 50670 + }, + { + "epoch": 12.638403990024937, + "grad_norm": 5.764795780181885, + "learning_rate": 7.367331670822944e-06, + "loss": 0.3256, + "step": 50680 + }, + { + "epoch": 12.640897755610972, + "grad_norm": 6.912472724914551, + "learning_rate": 7.364837905236908e-06, + "loss": 0.3154, + "step": 50690 + }, + { + "epoch": 12.643391521197007, + "grad_norm": 4.959378719329834, + "learning_rate": 7.362344139650874e-06, + "loss": 0.3111, + "step": 50700 + }, + { + "epoch": 12.645885286783042, + "grad_norm": 8.026573181152344, + "learning_rate": 7.359850374064839e-06, + "loss": 0.3211, + "step": 50710 + }, + { + "epoch": 12.648379052369076, + "grad_norm": 9.039690017700195, + "learning_rate": 7.357356608478804e-06, + "loss": 0.2915, + "step": 50720 + }, + { + "epoch": 12.650872817955111, + "grad_norm": 6.101404666900635, + "learning_rate": 7.354862842892768e-06, + "loss": 0.3123, + "step": 50730 + }, + { + "epoch": 12.653366583541148, + "grad_norm": 8.664443969726562, + "learning_rate": 7.352369077306733e-06, + "loss": 0.3825, + "step": 50740 + }, + { + "epoch": 12.655860349127183, + "grad_norm": 9.475874900817871, + "learning_rate": 7.349875311720699e-06, + "loss": 0.3463, + "step": 50750 + }, + { + "epoch": 12.658354114713218, + "grad_norm": 6.189550399780273, + "learning_rate": 7.347381546134665e-06, + "loss": 0.3556, + "step": 50760 + }, + { + "epoch": 12.660847880299253, + "grad_norm": 6.720332622528076, + "learning_rate": 7.344887780548628e-06, + "loss": 0.2882, + "step": 50770 + }, + { + "epoch": 12.663341645885287, + "grad_norm": 6.461765766143799, + "learning_rate": 7.3423940149625935e-06, + "loss": 0.3059, + "step": 50780 + }, + { + "epoch": 12.665835411471322, + "grad_norm": 5.625673294067383, + "learning_rate": 7.339900249376559e-06, + "loss": 0.3903, + "step": 50790 + }, + { + "epoch": 12.668329177057357, + "grad_norm": 4.355860233306885, + "learning_rate": 7.337406483790524e-06, + "loss": 0.332, + "step": 50800 + }, + { + "epoch": 12.670822942643392, + "grad_norm": 8.476668357849121, + "learning_rate": 7.33491271820449e-06, + "loss": 0.354, + "step": 50810 + }, + { + "epoch": 12.673316708229427, + "grad_norm": 8.867452621459961, + "learning_rate": 7.332418952618454e-06, + "loss": 0.2912, + "step": 50820 + }, + { + "epoch": 12.675810473815462, + "grad_norm": 6.675667762756348, + "learning_rate": 7.329925187032419e-06, + "loss": 0.369, + "step": 50830 + }, + { + "epoch": 12.678304239401497, + "grad_norm": 6.0077972412109375, + "learning_rate": 7.327680798004988e-06, + "loss": 0.3377, + "step": 50840 + }, + { + "epoch": 12.680798004987532, + "grad_norm": 10.627485275268555, + "learning_rate": 7.325187032418954e-06, + "loss": 0.3032, + "step": 50850 + }, + { + "epoch": 12.683291770573566, + "grad_norm": 6.776688098907471, + "learning_rate": 7.322693266832919e-06, + "loss": 0.3235, + "step": 50860 + }, + { + "epoch": 12.685785536159601, + "grad_norm": 8.06655216217041, + "learning_rate": 7.320199501246883e-06, + "loss": 0.3717, + "step": 50870 + }, + { + "epoch": 12.688279301745636, + "grad_norm": 5.898749351501465, + "learning_rate": 7.317705735660848e-06, + "loss": 0.3136, + "step": 50880 + }, + { + "epoch": 12.690773067331671, + "grad_norm": 5.939793109893799, + "learning_rate": 7.315211970074814e-06, + "loss": 0.2735, + "step": 50890 + }, + { + "epoch": 12.693266832917706, + "grad_norm": 9.688426971435547, + "learning_rate": 7.312718204488779e-06, + "loss": 0.3447, + "step": 50900 + }, + { + "epoch": 12.69576059850374, + "grad_norm": 8.256994247436523, + "learning_rate": 7.310224438902743e-06, + "loss": 0.3271, + "step": 50910 + }, + { + "epoch": 12.698254364089776, + "grad_norm": 8.417771339416504, + "learning_rate": 7.307730673316708e-06, + "loss": 0.4127, + "step": 50920 + }, + { + "epoch": 12.70074812967581, + "grad_norm": 7.872901916503906, + "learning_rate": 7.305236907730674e-06, + "loss": 0.3853, + "step": 50930 + }, + { + "epoch": 12.703241895261845, + "grad_norm": 7.196390151977539, + "learning_rate": 7.302743142144639e-06, + "loss": 0.3114, + "step": 50940 + }, + { + "epoch": 12.70573566084788, + "grad_norm": 7.429532051086426, + "learning_rate": 7.300249376558603e-06, + "loss": 0.3283, + "step": 50950 + }, + { + "epoch": 12.708229426433915, + "grad_norm": 11.154653549194336, + "learning_rate": 7.297755610972569e-06, + "loss": 0.3902, + "step": 50960 + }, + { + "epoch": 12.71072319201995, + "grad_norm": 4.860759258270264, + "learning_rate": 7.295261845386534e-06, + "loss": 0.3529, + "step": 50970 + }, + { + "epoch": 12.713216957605985, + "grad_norm": 4.447295665740967, + "learning_rate": 7.292768079800499e-06, + "loss": 0.3266, + "step": 50980 + }, + { + "epoch": 12.71571072319202, + "grad_norm": 8.30553913116455, + "learning_rate": 7.290274314214465e-06, + "loss": 0.3435, + "step": 50990 + }, + { + "epoch": 12.718204488778055, + "grad_norm": 8.856745719909668, + "learning_rate": 7.287780548628429e-06, + "loss": 0.3163, + "step": 51000 + }, + { + "epoch": 12.72069825436409, + "grad_norm": 8.923396110534668, + "learning_rate": 7.285286783042394e-06, + "loss": 0.3417, + "step": 51010 + }, + { + "epoch": 12.723192019950124, + "grad_norm": 9.019140243530273, + "learning_rate": 7.28279301745636e-06, + "loss": 0.3917, + "step": 51020 + }, + { + "epoch": 12.72568578553616, + "grad_norm": 8.857991218566895, + "learning_rate": 7.280299251870325e-06, + "loss": 0.3368, + "step": 51030 + }, + { + "epoch": 12.728179551122194, + "grad_norm": 7.459349632263184, + "learning_rate": 7.2778054862842895e-06, + "loss": 0.2827, + "step": 51040 + }, + { + "epoch": 12.730673316708229, + "grad_norm": 6.536787033081055, + "learning_rate": 7.275311720698255e-06, + "loss": 0.3304, + "step": 51050 + }, + { + "epoch": 12.733167082294264, + "grad_norm": 6.800075531005859, + "learning_rate": 7.27281795511222e-06, + "loss": 0.4169, + "step": 51060 + }, + { + "epoch": 12.735660847880299, + "grad_norm": 9.216288566589355, + "learning_rate": 7.270324189526185e-06, + "loss": 0.3732, + "step": 51070 + }, + { + "epoch": 12.738154613466333, + "grad_norm": 9.165617942810059, + "learning_rate": 7.26783042394015e-06, + "loss": 0.31, + "step": 51080 + }, + { + "epoch": 12.740648379052368, + "grad_norm": 7.988641262054443, + "learning_rate": 7.265336658354115e-06, + "loss": 0.3234, + "step": 51090 + }, + { + "epoch": 12.743142144638403, + "grad_norm": 6.585142135620117, + "learning_rate": 7.2628428927680805e-06, + "loss": 0.3289, + "step": 51100 + }, + { + "epoch": 12.745635910224438, + "grad_norm": 10.255621910095215, + "learning_rate": 7.260349127182046e-06, + "loss": 0.3875, + "step": 51110 + }, + { + "epoch": 12.748129675810475, + "grad_norm": 11.782647132873535, + "learning_rate": 7.25785536159601e-06, + "loss": 0.417, + "step": 51120 + }, + { + "epoch": 12.75062344139651, + "grad_norm": 9.408047676086426, + "learning_rate": 7.2553615960099756e-06, + "loss": 0.2934, + "step": 51130 + }, + { + "epoch": 12.753117206982544, + "grad_norm": 8.07828426361084, + "learning_rate": 7.252867830423941e-06, + "loss": 0.3938, + "step": 51140 + }, + { + "epoch": 12.75561097256858, + "grad_norm": 4.914299011230469, + "learning_rate": 7.250374064837906e-06, + "loss": 0.3387, + "step": 51150 + }, + { + "epoch": 12.758104738154614, + "grad_norm": 6.668241024017334, + "learning_rate": 7.247880299251871e-06, + "loss": 0.3265, + "step": 51160 + }, + { + "epoch": 12.760598503740649, + "grad_norm": 9.32829761505127, + "learning_rate": 7.245386533665836e-06, + "loss": 0.3249, + "step": 51170 + }, + { + "epoch": 12.763092269326684, + "grad_norm": 10.00203800201416, + "learning_rate": 7.242892768079801e-06, + "loss": 0.4209, + "step": 51180 + }, + { + "epoch": 12.765586034912719, + "grad_norm": 8.861324310302734, + "learning_rate": 7.2403990024937666e-06, + "loss": 0.3101, + "step": 51190 + }, + { + "epoch": 12.768079800498754, + "grad_norm": 8.35321044921875, + "learning_rate": 7.237905236907731e-06, + "loss": 0.3043, + "step": 51200 + }, + { + "epoch": 12.770573566084789, + "grad_norm": 5.797079086303711, + "learning_rate": 7.235411471321696e-06, + "loss": 0.3619, + "step": 51210 + }, + { + "epoch": 12.773067331670823, + "grad_norm": 8.274916648864746, + "learning_rate": 7.232917705735662e-06, + "loss": 0.2735, + "step": 51220 + }, + { + "epoch": 12.775561097256858, + "grad_norm": 6.340693473815918, + "learning_rate": 7.230423940149627e-06, + "loss": 0.3211, + "step": 51230 + }, + { + "epoch": 12.778054862842893, + "grad_norm": 6.9829792976379395, + "learning_rate": 7.227930174563592e-06, + "loss": 0.2852, + "step": 51240 + }, + { + "epoch": 12.780548628428928, + "grad_norm": 11.263195037841797, + "learning_rate": 7.225436408977557e-06, + "loss": 0.3557, + "step": 51250 + }, + { + "epoch": 12.783042394014963, + "grad_norm": 8.167370796203613, + "learning_rate": 7.222942643391522e-06, + "loss": 0.3264, + "step": 51260 + }, + { + "epoch": 12.785536159600998, + "grad_norm": 6.718631744384766, + "learning_rate": 7.220448877805487e-06, + "loss": 0.2878, + "step": 51270 + }, + { + "epoch": 12.788029925187033, + "grad_norm": 7.236240386962891, + "learning_rate": 7.217955112219453e-06, + "loss": 0.3249, + "step": 51280 + }, + { + "epoch": 12.790523690773068, + "grad_norm": 10.440741539001465, + "learning_rate": 7.215461346633416e-06, + "loss": 0.3534, + "step": 51290 + }, + { + "epoch": 12.793017456359102, + "grad_norm": 5.670083522796631, + "learning_rate": 7.2129675810473824e-06, + "loss": 0.3517, + "step": 51300 + }, + { + "epoch": 12.795511221945137, + "grad_norm": 15.356858253479004, + "learning_rate": 7.210473815461348e-06, + "loss": 0.4574, + "step": 51310 + }, + { + "epoch": 12.798004987531172, + "grad_norm": 9.100034713745117, + "learning_rate": 7.207980049875313e-06, + "loss": 0.3513, + "step": 51320 + }, + { + "epoch": 12.800498753117207, + "grad_norm": 5.6677937507629395, + "learning_rate": 7.205486284289277e-06, + "loss": 0.3663, + "step": 51330 + }, + { + "epoch": 12.802992518703242, + "grad_norm": 8.217962265014648, + "learning_rate": 7.202992518703242e-06, + "loss": 0.2736, + "step": 51340 + }, + { + "epoch": 12.805486284289277, + "grad_norm": 5.241400241851807, + "learning_rate": 7.200498753117207e-06, + "loss": 0.3087, + "step": 51350 + }, + { + "epoch": 12.807980049875312, + "grad_norm": 10.871718406677246, + "learning_rate": 7.1980049875311734e-06, + "loss": 0.3461, + "step": 51360 + }, + { + "epoch": 12.810473815461346, + "grad_norm": 6.515952110290527, + "learning_rate": 7.195511221945137e-06, + "loss": 0.3521, + "step": 51370 + }, + { + "epoch": 12.812967581047381, + "grad_norm": 9.03691577911377, + "learning_rate": 7.193017456359102e-06, + "loss": 0.3166, + "step": 51380 + }, + { + "epoch": 12.815461346633416, + "grad_norm": 8.544856071472168, + "learning_rate": 7.190523690773068e-06, + "loss": 0.3556, + "step": 51390 + }, + { + "epoch": 12.817955112219451, + "grad_norm": 6.503901481628418, + "learning_rate": 7.188029925187033e-06, + "loss": 0.3814, + "step": 51400 + }, + { + "epoch": 12.820448877805486, + "grad_norm": 6.1890387535095215, + "learning_rate": 7.1855361596009975e-06, + "loss": 0.3247, + "step": 51410 + }, + { + "epoch": 12.82294264339152, + "grad_norm": 5.103339672088623, + "learning_rate": 7.183042394014963e-06, + "loss": 0.4179, + "step": 51420 + }, + { + "epoch": 12.825436408977556, + "grad_norm": 9.049005508422852, + "learning_rate": 7.180548628428928e-06, + "loss": 0.3371, + "step": 51430 + }, + { + "epoch": 12.82793017456359, + "grad_norm": 8.748699188232422, + "learning_rate": 7.178054862842893e-06, + "loss": 0.3176, + "step": 51440 + }, + { + "epoch": 12.830423940149625, + "grad_norm": 8.368945121765137, + "learning_rate": 7.175561097256858e-06, + "loss": 0.3286, + "step": 51450 + }, + { + "epoch": 12.83291770573566, + "grad_norm": 7.695786952972412, + "learning_rate": 7.173067331670823e-06, + "loss": 0.3397, + "step": 51460 + }, + { + "epoch": 12.835411471321695, + "grad_norm": 8.735140800476074, + "learning_rate": 7.1705735660847885e-06, + "loss": 0.3353, + "step": 51470 + }, + { + "epoch": 12.83790523690773, + "grad_norm": 4.386609077453613, + "learning_rate": 7.168079800498754e-06, + "loss": 0.3446, + "step": 51480 + }, + { + "epoch": 12.840399002493765, + "grad_norm": 6.733331680297852, + "learning_rate": 7.165586034912719e-06, + "loss": 0.3685, + "step": 51490 + }, + { + "epoch": 12.8428927680798, + "grad_norm": 7.894585132598877, + "learning_rate": 7.1630922693266835e-06, + "loss": 0.332, + "step": 51500 + }, + { + "epoch": 12.845386533665835, + "grad_norm": 9.471677780151367, + "learning_rate": 7.160598503740649e-06, + "loss": 0.2614, + "step": 51510 + }, + { + "epoch": 12.84788029925187, + "grad_norm": 11.194404602050781, + "learning_rate": 7.158104738154614e-06, + "loss": 0.3671, + "step": 51520 + }, + { + "epoch": 12.850374064837904, + "grad_norm": 9.611897468566895, + "learning_rate": 7.1556109725685795e-06, + "loss": 0.3233, + "step": 51530 + }, + { + "epoch": 12.85286783042394, + "grad_norm": 8.516155242919922, + "learning_rate": 7.153117206982544e-06, + "loss": 0.3259, + "step": 51540 + }, + { + "epoch": 12.855361596009976, + "grad_norm": 6.97627067565918, + "learning_rate": 7.150623441396509e-06, + "loss": 0.3679, + "step": 51550 + }, + { + "epoch": 12.85785536159601, + "grad_norm": 9.25831127166748, + "learning_rate": 7.1481296758104746e-06, + "loss": 0.3705, + "step": 51560 + }, + { + "epoch": 12.860349127182046, + "grad_norm": 4.845349311828613, + "learning_rate": 7.14563591022444e-06, + "loss": 0.2548, + "step": 51570 + }, + { + "epoch": 12.86284289276808, + "grad_norm": 7.061251163482666, + "learning_rate": 7.143142144638404e-06, + "loss": 0.3352, + "step": 51580 + }, + { + "epoch": 12.865336658354115, + "grad_norm": 7.0790557861328125, + "learning_rate": 7.14064837905237e-06, + "loss": 0.2947, + "step": 51590 + }, + { + "epoch": 12.86783042394015, + "grad_norm": 9.186144828796387, + "learning_rate": 7.138154613466335e-06, + "loss": 0.3689, + "step": 51600 + }, + { + "epoch": 12.870324189526185, + "grad_norm": 5.841263771057129, + "learning_rate": 7.1356608478803e-06, + "loss": 0.3377, + "step": 51610 + }, + { + "epoch": 12.87281795511222, + "grad_norm": 15.965258598327637, + "learning_rate": 7.133167082294265e-06, + "loss": 0.3976, + "step": 51620 + }, + { + "epoch": 12.875311720698255, + "grad_norm": 7.985716342926025, + "learning_rate": 7.13067331670823e-06, + "loss": 0.3237, + "step": 51630 + }, + { + "epoch": 12.87780548628429, + "grad_norm": 7.872910499572754, + "learning_rate": 7.128179551122195e-06, + "loss": 0.3373, + "step": 51640 + }, + { + "epoch": 12.880299251870325, + "grad_norm": 8.896018028259277, + "learning_rate": 7.125685785536161e-06, + "loss": 0.3554, + "step": 51650 + }, + { + "epoch": 12.88279301745636, + "grad_norm": 8.754325866699219, + "learning_rate": 7.123192019950125e-06, + "loss": 0.3481, + "step": 51660 + }, + { + "epoch": 12.885286783042394, + "grad_norm": 9.110607147216797, + "learning_rate": 7.1206982543640904e-06, + "loss": 0.3519, + "step": 51670 + }, + { + "epoch": 12.88778054862843, + "grad_norm": 10.086724281311035, + "learning_rate": 7.118204488778056e-06, + "loss": 0.3255, + "step": 51680 + }, + { + "epoch": 12.890274314214464, + "grad_norm": 9.028765678405762, + "learning_rate": 7.115710723192021e-06, + "loss": 0.3527, + "step": 51690 + }, + { + "epoch": 12.892768079800499, + "grad_norm": 12.050107955932617, + "learning_rate": 7.113216957605985e-06, + "loss": 0.3747, + "step": 51700 + }, + { + "epoch": 12.895261845386534, + "grad_norm": 5.377803325653076, + "learning_rate": 7.110723192019951e-06, + "loss": 0.3493, + "step": 51710 + }, + { + "epoch": 12.897755610972569, + "grad_norm": 12.90459156036377, + "learning_rate": 7.108229426433916e-06, + "loss": 0.3973, + "step": 51720 + }, + { + "epoch": 12.900249376558603, + "grad_norm": 8.819035530090332, + "learning_rate": 7.1057356608478814e-06, + "loss": 0.3746, + "step": 51730 + }, + { + "epoch": 12.902743142144638, + "grad_norm": 7.119004726409912, + "learning_rate": 7.103241895261847e-06, + "loss": 0.3413, + "step": 51740 + }, + { + "epoch": 12.905236907730673, + "grad_norm": 9.566146850585938, + "learning_rate": 7.10074812967581e-06, + "loss": 0.3475, + "step": 51750 + }, + { + "epoch": 12.907730673316708, + "grad_norm": 6.812338829040527, + "learning_rate": 7.098254364089776e-06, + "loss": 0.3002, + "step": 51760 + }, + { + "epoch": 12.910224438902743, + "grad_norm": 6.894874095916748, + "learning_rate": 7.095760598503742e-06, + "loss": 0.3632, + "step": 51770 + }, + { + "epoch": 12.912718204488778, + "grad_norm": 10.196784019470215, + "learning_rate": 7.093266832917707e-06, + "loss": 0.3053, + "step": 51780 + }, + { + "epoch": 12.915211970074813, + "grad_norm": 7.024259090423584, + "learning_rate": 7.090773067331671e-06, + "loss": 0.3858, + "step": 51790 + }, + { + "epoch": 12.917705735660848, + "grad_norm": 8.830977439880371, + "learning_rate": 7.088279301745636e-06, + "loss": 0.319, + "step": 51800 + }, + { + "epoch": 12.920199501246882, + "grad_norm": 7.131579875946045, + "learning_rate": 7.085785536159601e-06, + "loss": 0.3245, + "step": 51810 + }, + { + "epoch": 12.922693266832917, + "grad_norm": 10.366734504699707, + "learning_rate": 7.083291770573567e-06, + "loss": 0.3072, + "step": 51820 + }, + { + "epoch": 12.925187032418952, + "grad_norm": 14.305132865905762, + "learning_rate": 7.080798004987531e-06, + "loss": 0.3282, + "step": 51830 + }, + { + "epoch": 12.927680798004987, + "grad_norm": 5.115872859954834, + "learning_rate": 7.0783042394014965e-06, + "loss": 0.3539, + "step": 51840 + }, + { + "epoch": 12.930174563591022, + "grad_norm": 6.633999347686768, + "learning_rate": 7.075810473815462e-06, + "loss": 0.4114, + "step": 51850 + }, + { + "epoch": 12.932668329177057, + "grad_norm": 9.143322944641113, + "learning_rate": 7.073316708229427e-06, + "loss": 0.3024, + "step": 51860 + }, + { + "epoch": 12.935162094763092, + "grad_norm": 8.794203758239746, + "learning_rate": 7.0708229426433915e-06, + "loss": 0.3265, + "step": 51870 + }, + { + "epoch": 12.937655860349127, + "grad_norm": 8.29458999633789, + "learning_rate": 7.068329177057357e-06, + "loss": 0.3302, + "step": 51880 + }, + { + "epoch": 12.940149625935161, + "grad_norm": 8.905904769897461, + "learning_rate": 7.065835411471322e-06, + "loss": 0.4036, + "step": 51890 + }, + { + "epoch": 12.942643391521196, + "grad_norm": 11.505474090576172, + "learning_rate": 7.0633416458852875e-06, + "loss": 0.3459, + "step": 51900 + }, + { + "epoch": 12.945137157107231, + "grad_norm": 7.7994561195373535, + "learning_rate": 7.060847880299252e-06, + "loss": 0.3477, + "step": 51910 + }, + { + "epoch": 12.947630922693268, + "grad_norm": 8.445225715637207, + "learning_rate": 7.058354114713217e-06, + "loss": 0.3153, + "step": 51920 + }, + { + "epoch": 12.950124688279303, + "grad_norm": 7.54745626449585, + "learning_rate": 7.0558603491271825e-06, + "loss": 0.2894, + "step": 51930 + }, + { + "epoch": 12.952618453865338, + "grad_norm": 8.576315879821777, + "learning_rate": 7.053366583541148e-06, + "loss": 0.3849, + "step": 51940 + }, + { + "epoch": 12.955112219451372, + "grad_norm": 11.285699844360352, + "learning_rate": 7.050872817955112e-06, + "loss": 0.3399, + "step": 51950 + }, + { + "epoch": 12.957605985037407, + "grad_norm": 5.247829914093018, + "learning_rate": 7.048379052369078e-06, + "loss": 0.299, + "step": 51960 + }, + { + "epoch": 12.960099750623442, + "grad_norm": 8.704336166381836, + "learning_rate": 7.045885286783043e-06, + "loss": 0.3415, + "step": 51970 + }, + { + "epoch": 12.962593516209477, + "grad_norm": 9.179566383361816, + "learning_rate": 7.043391521197008e-06, + "loss": 0.3734, + "step": 51980 + }, + { + "epoch": 12.965087281795512, + "grad_norm": 6.699599266052246, + "learning_rate": 7.0408977556109736e-06, + "loss": 0.3288, + "step": 51990 + }, + { + "epoch": 12.967581047381547, + "grad_norm": 6.417098045349121, + "learning_rate": 7.038403990024938e-06, + "loss": 0.2907, + "step": 52000 + }, + { + "epoch": 12.970074812967582, + "grad_norm": 7.011273384094238, + "learning_rate": 7.035910224438903e-06, + "loss": 0.3603, + "step": 52010 + }, + { + "epoch": 12.972568578553616, + "grad_norm": 8.827155113220215, + "learning_rate": 7.033416458852869e-06, + "loss": 0.3839, + "step": 52020 + }, + { + "epoch": 12.975062344139651, + "grad_norm": 7.616529941558838, + "learning_rate": 7.030922693266834e-06, + "loss": 0.3665, + "step": 52030 + }, + { + "epoch": 12.977556109725686, + "grad_norm": 8.937703132629395, + "learning_rate": 7.028428927680798e-06, + "loss": 0.3125, + "step": 52040 + }, + { + "epoch": 12.980049875311721, + "grad_norm": 9.48192310333252, + "learning_rate": 7.025935162094764e-06, + "loss": 0.445, + "step": 52050 + }, + { + "epoch": 12.982543640897756, + "grad_norm": 8.205970764160156, + "learning_rate": 7.023441396508729e-06, + "loss": 0.3578, + "step": 52060 + }, + { + "epoch": 12.98503740648379, + "grad_norm": 9.55613899230957, + "learning_rate": 7.020947630922694e-06, + "loss": 0.301, + "step": 52070 + }, + { + "epoch": 12.987531172069826, + "grad_norm": 7.20795202255249, + "learning_rate": 7.018453865336659e-06, + "loss": 0.3893, + "step": 52080 + }, + { + "epoch": 12.99002493765586, + "grad_norm": 12.070457458496094, + "learning_rate": 7.015960099750624e-06, + "loss": 0.3387, + "step": 52090 + }, + { + "epoch": 12.992518703241895, + "grad_norm": 4.415650844573975, + "learning_rate": 7.0134663341645894e-06, + "loss": 0.3183, + "step": 52100 + }, + { + "epoch": 12.99501246882793, + "grad_norm": 6.928340911865234, + "learning_rate": 7.010972568578555e-06, + "loss": 0.3319, + "step": 52110 + }, + { + "epoch": 12.997506234413965, + "grad_norm": 6.329419136047363, + "learning_rate": 7.008478802992519e-06, + "loss": 0.3946, + "step": 52120 + }, + { + "epoch": 13.0, + "grad_norm": 4.517069339752197, + "learning_rate": 7.0059850374064845e-06, + "loss": 0.3482, + "step": 52130 + }, + { + "epoch": 13.0, + "eval_loss": 0.41498884558677673, + "eval_runtime": 60.065, + "eval_samples_per_second": 16.699, + "eval_steps_per_second": 16.699, + "step": 52130 + }, + { + "epoch": 13.002493765586035, + "grad_norm": 6.689090728759766, + "learning_rate": 7.00349127182045e-06, + "loss": 0.3395, + "step": 52140 + }, + { + "epoch": 13.00498753117207, + "grad_norm": 7.670643329620361, + "learning_rate": 7.000997506234415e-06, + "loss": 0.3143, + "step": 52150 + }, + { + "epoch": 13.007481296758105, + "grad_norm": 10.944851875305176, + "learning_rate": 6.998503740648379e-06, + "loss": 0.3196, + "step": 52160 + }, + { + "epoch": 13.00997506234414, + "grad_norm": 5.979905128479004, + "learning_rate": 6.996009975062344e-06, + "loss": 0.3221, + "step": 52170 + }, + { + "epoch": 13.012468827930174, + "grad_norm": 6.5112762451171875, + "learning_rate": 6.99351620947631e-06, + "loss": 0.2684, + "step": 52180 + }, + { + "epoch": 13.01496259351621, + "grad_norm": 5.4960761070251465, + "learning_rate": 6.9910224438902755e-06, + "loss": 0.2942, + "step": 52190 + }, + { + "epoch": 13.017456359102244, + "grad_norm": 6.578577995300293, + "learning_rate": 6.988528678304239e-06, + "loss": 0.3318, + "step": 52200 + }, + { + "epoch": 13.019950124688279, + "grad_norm": 7.574843406677246, + "learning_rate": 6.9860349127182044e-06, + "loss": 0.3365, + "step": 52210 + }, + { + "epoch": 13.022443890274314, + "grad_norm": 6.713075160980225, + "learning_rate": 6.98354114713217e-06, + "loss": 0.3522, + "step": 52220 + }, + { + "epoch": 13.024937655860349, + "grad_norm": 10.647099494934082, + "learning_rate": 6.981047381546135e-06, + "loss": 0.2822, + "step": 52230 + }, + { + "epoch": 13.027431421446384, + "grad_norm": 6.798612594604492, + "learning_rate": 6.978553615960101e-06, + "loss": 0.2812, + "step": 52240 + }, + { + "epoch": 13.029925187032418, + "grad_norm": 6.139517784118652, + "learning_rate": 6.976059850374065e-06, + "loss": 0.3224, + "step": 52250 + }, + { + "epoch": 13.032418952618453, + "grad_norm": 5.845915794372559, + "learning_rate": 6.97356608478803e-06, + "loss": 0.2987, + "step": 52260 + }, + { + "epoch": 13.034912718204488, + "grad_norm": 6.590796947479248, + "learning_rate": 6.9710723192019955e-06, + "loss": 0.3018, + "step": 52270 + }, + { + "epoch": 13.037406483790523, + "grad_norm": 6.91572904586792, + "learning_rate": 6.968578553615961e-06, + "loss": 0.3517, + "step": 52280 + }, + { + "epoch": 13.039900249376558, + "grad_norm": 5.153928756713867, + "learning_rate": 6.966084788029925e-06, + "loss": 0.2843, + "step": 52290 + }, + { + "epoch": 13.042394014962593, + "grad_norm": 8.661881446838379, + "learning_rate": 6.9635910224438905e-06, + "loss": 0.278, + "step": 52300 + }, + { + "epoch": 13.044887780548628, + "grad_norm": 6.513763904571533, + "learning_rate": 6.961097256857856e-06, + "loss": 0.3141, + "step": 52310 + }, + { + "epoch": 13.047381546134662, + "grad_norm": 7.926911354064941, + "learning_rate": 6.958603491271821e-06, + "loss": 0.3725, + "step": 52320 + }, + { + "epoch": 13.049875311720697, + "grad_norm": 10.43213176727295, + "learning_rate": 6.956109725685786e-06, + "loss": 0.3259, + "step": 52330 + }, + { + "epoch": 13.052369077306734, + "grad_norm": 5.205043792724609, + "learning_rate": 6.953615960099751e-06, + "loss": 0.3007, + "step": 52340 + }, + { + "epoch": 13.054862842892769, + "grad_norm": 6.522244930267334, + "learning_rate": 6.951122194513716e-06, + "loss": 0.2957, + "step": 52350 + }, + { + "epoch": 13.057356608478804, + "grad_norm": 6.536242961883545, + "learning_rate": 6.9486284289276815e-06, + "loss": 0.3116, + "step": 52360 + }, + { + "epoch": 13.059850374064839, + "grad_norm": 8.81574535369873, + "learning_rate": 6.946134663341646e-06, + "loss": 0.3632, + "step": 52370 + }, + { + "epoch": 13.062344139650873, + "grad_norm": 6.305959701538086, + "learning_rate": 6.943640897755611e-06, + "loss": 0.3337, + "step": 52380 + }, + { + "epoch": 13.064837905236908, + "grad_norm": 12.51152515411377, + "learning_rate": 6.941147132169577e-06, + "loss": 0.3609, + "step": 52390 + }, + { + "epoch": 13.067331670822943, + "grad_norm": 8.995133399963379, + "learning_rate": 6.938653366583542e-06, + "loss": 0.3537, + "step": 52400 + }, + { + "epoch": 13.069825436408978, + "grad_norm": 10.008258819580078, + "learning_rate": 6.936159600997506e-06, + "loss": 0.3284, + "step": 52410 + }, + { + "epoch": 13.072319201995013, + "grad_norm": 7.722633361816406, + "learning_rate": 6.933665835411472e-06, + "loss": 0.3823, + "step": 52420 + }, + { + "epoch": 13.074812967581048, + "grad_norm": 8.553972244262695, + "learning_rate": 6.931172069825437e-06, + "loss": 0.3226, + "step": 52430 + }, + { + "epoch": 13.077306733167083, + "grad_norm": 6.591207027435303, + "learning_rate": 6.928678304239402e-06, + "loss": 0.3421, + "step": 52440 + }, + { + "epoch": 13.079800498753118, + "grad_norm": 5.500464916229248, + "learning_rate": 6.926184538653367e-06, + "loss": 0.2842, + "step": 52450 + }, + { + "epoch": 13.082294264339152, + "grad_norm": 8.214122772216797, + "learning_rate": 6.923690773067332e-06, + "loss": 0.2846, + "step": 52460 + }, + { + "epoch": 13.084788029925187, + "grad_norm": 7.543227195739746, + "learning_rate": 6.921197007481297e-06, + "loss": 0.3326, + "step": 52470 + }, + { + "epoch": 13.087281795511222, + "grad_norm": 8.072737693786621, + "learning_rate": 6.918703241895263e-06, + "loss": 0.3319, + "step": 52480 + }, + { + "epoch": 13.089775561097257, + "grad_norm": 8.425067901611328, + "learning_rate": 6.916209476309228e-06, + "loss": 0.303, + "step": 52490 + }, + { + "epoch": 13.092269326683292, + "grad_norm": 8.168347358703613, + "learning_rate": 6.9137157107231925e-06, + "loss": 0.3316, + "step": 52500 + }, + { + "epoch": 13.094763092269327, + "grad_norm": 7.062584400177002, + "learning_rate": 6.911221945137158e-06, + "loss": 0.2915, + "step": 52510 + }, + { + "epoch": 13.097256857855362, + "grad_norm": 8.098386764526367, + "learning_rate": 6.908728179551123e-06, + "loss": 0.3826, + "step": 52520 + }, + { + "epoch": 13.099750623441397, + "grad_norm": 6.54594087600708, + "learning_rate": 6.9062344139650884e-06, + "loss": 0.3046, + "step": 52530 + }, + { + "epoch": 13.102244389027431, + "grad_norm": 8.44714069366455, + "learning_rate": 6.903740648379053e-06, + "loss": 0.296, + "step": 52540 + }, + { + "epoch": 13.104738154613466, + "grad_norm": 7.943593502044678, + "learning_rate": 6.901246882793018e-06, + "loss": 0.3459, + "step": 52550 + }, + { + "epoch": 13.107231920199501, + "grad_norm": 5.09817361831665, + "learning_rate": 6.8987531172069835e-06, + "loss": 0.2701, + "step": 52560 + }, + { + "epoch": 13.109725685785536, + "grad_norm": 10.518460273742676, + "learning_rate": 6.896259351620949e-06, + "loss": 0.3679, + "step": 52570 + }, + { + "epoch": 13.11221945137157, + "grad_norm": 7.73917293548584, + "learning_rate": 6.893765586034913e-06, + "loss": 0.2963, + "step": 52580 + }, + { + "epoch": 13.114713216957606, + "grad_norm": 8.970417976379395, + "learning_rate": 6.891271820448879e-06, + "loss": 0.316, + "step": 52590 + }, + { + "epoch": 13.11720698254364, + "grad_norm": 9.928048133850098, + "learning_rate": 6.888778054862844e-06, + "loss": 0.3444, + "step": 52600 + }, + { + "epoch": 13.119700748129675, + "grad_norm": 12.679057121276855, + "learning_rate": 6.886284289276809e-06, + "loss": 0.3914, + "step": 52610 + }, + { + "epoch": 13.12219451371571, + "grad_norm": 8.21277904510498, + "learning_rate": 6.883790523690773e-06, + "loss": 0.293, + "step": 52620 + }, + { + "epoch": 13.124688279301745, + "grad_norm": 8.76871109008789, + "learning_rate": 6.881296758104738e-06, + "loss": 0.3419, + "step": 52630 + }, + { + "epoch": 13.12718204488778, + "grad_norm": 7.2397236824035645, + "learning_rate": 6.878802992518704e-06, + "loss": 0.3769, + "step": 52640 + }, + { + "epoch": 13.129675810473815, + "grad_norm": 7.731866836547852, + "learning_rate": 6.87630922693267e-06, + "loss": 0.3589, + "step": 52650 + }, + { + "epoch": 13.13216957605985, + "grad_norm": 8.515292167663574, + "learning_rate": 6.873815461346633e-06, + "loss": 0.3328, + "step": 52660 + }, + { + "epoch": 13.134663341645885, + "grad_norm": 5.041269779205322, + "learning_rate": 6.8713216957605985e-06, + "loss": 0.3182, + "step": 52670 + }, + { + "epoch": 13.13715710723192, + "grad_norm": 7.612249851226807, + "learning_rate": 6.868827930174564e-06, + "loss": 0.3596, + "step": 52680 + }, + { + "epoch": 13.139650872817954, + "grad_norm": 8.448023796081543, + "learning_rate": 6.866334164588529e-06, + "loss": 0.3286, + "step": 52690 + }, + { + "epoch": 13.14214463840399, + "grad_norm": 8.699821472167969, + "learning_rate": 6.863840399002495e-06, + "loss": 0.3803, + "step": 52700 + }, + { + "epoch": 13.144638403990024, + "grad_norm": 9.197280883789062, + "learning_rate": 6.861346633416459e-06, + "loss": 0.3577, + "step": 52710 + }, + { + "epoch": 13.147132169576059, + "grad_norm": 5.9945597648620605, + "learning_rate": 6.858852867830424e-06, + "loss": 0.3208, + "step": 52720 + }, + { + "epoch": 13.149625935162096, + "grad_norm": 7.514776229858398, + "learning_rate": 6.8563591022443895e-06, + "loss": 0.3625, + "step": 52730 + }, + { + "epoch": 13.15211970074813, + "grad_norm": 17.048492431640625, + "learning_rate": 6.853865336658355e-06, + "loss": 0.3416, + "step": 52740 + }, + { + "epoch": 13.154613466334165, + "grad_norm": 9.108981132507324, + "learning_rate": 6.851371571072319e-06, + "loss": 0.3295, + "step": 52750 + }, + { + "epoch": 13.1571072319202, + "grad_norm": 4.699746131896973, + "learning_rate": 6.848877805486285e-06, + "loss": 0.3258, + "step": 52760 + }, + { + "epoch": 13.159600997506235, + "grad_norm": 9.46227741241455, + "learning_rate": 6.84638403990025e-06, + "loss": 0.3021, + "step": 52770 + }, + { + "epoch": 13.16209476309227, + "grad_norm": 10.8130521774292, + "learning_rate": 6.843890274314215e-06, + "loss": 0.3349, + "step": 52780 + }, + { + "epoch": 13.164588528678305, + "grad_norm": 6.772179126739502, + "learning_rate": 6.84139650872818e-06, + "loss": 0.3241, + "step": 52790 + }, + { + "epoch": 13.16708229426434, + "grad_norm": 8.732086181640625, + "learning_rate": 6.838902743142145e-06, + "loss": 0.2967, + "step": 52800 + }, + { + "epoch": 13.169576059850375, + "grad_norm": 8.218696594238281, + "learning_rate": 6.83640897755611e-06, + "loss": 0.3601, + "step": 52810 + }, + { + "epoch": 13.17206982543641, + "grad_norm": 11.002167701721191, + "learning_rate": 6.833915211970076e-06, + "loss": 0.3686, + "step": 52820 + }, + { + "epoch": 13.174563591022444, + "grad_norm": 7.543792724609375, + "learning_rate": 6.83142144638404e-06, + "loss": 0.3467, + "step": 52830 + }, + { + "epoch": 13.17705735660848, + "grad_norm": 7.169468402862549, + "learning_rate": 6.828927680798005e-06, + "loss": 0.3385, + "step": 52840 + }, + { + "epoch": 13.179551122194514, + "grad_norm": 11.047392845153809, + "learning_rate": 6.826433915211971e-06, + "loss": 0.3213, + "step": 52850 + }, + { + "epoch": 13.182044887780549, + "grad_norm": 7.974063396453857, + "learning_rate": 6.823940149625936e-06, + "loss": 0.3075, + "step": 52860 + }, + { + "epoch": 13.184538653366584, + "grad_norm": 9.911652565002441, + "learning_rate": 6.8214463840399005e-06, + "loss": 0.2636, + "step": 52870 + }, + { + "epoch": 13.187032418952619, + "grad_norm": 5.923473834991455, + "learning_rate": 6.818952618453866e-06, + "loss": 0.3013, + "step": 52880 + }, + { + "epoch": 13.189526184538654, + "grad_norm": 7.126490592956543, + "learning_rate": 6.816458852867831e-06, + "loss": 0.2964, + "step": 52890 + }, + { + "epoch": 13.192019950124688, + "grad_norm": 6.427547931671143, + "learning_rate": 6.813965087281796e-06, + "loss": 0.3058, + "step": 52900 + }, + { + "epoch": 13.194513715710723, + "grad_norm": 6.297787189483643, + "learning_rate": 6.811471321695761e-06, + "loss": 0.2955, + "step": 52910 + }, + { + "epoch": 13.197007481296758, + "grad_norm": 7.9449005126953125, + "learning_rate": 6.808977556109726e-06, + "loss": 0.3024, + "step": 52920 + }, + { + "epoch": 13.199501246882793, + "grad_norm": 10.155074119567871, + "learning_rate": 6.8064837905236915e-06, + "loss": 0.3011, + "step": 52930 + }, + { + "epoch": 13.201995012468828, + "grad_norm": 8.037057876586914, + "learning_rate": 6.803990024937657e-06, + "loss": 0.3507, + "step": 52940 + }, + { + "epoch": 13.204488778054863, + "grad_norm": 8.216534614562988, + "learning_rate": 6.801496259351622e-06, + "loss": 0.3815, + "step": 52950 + }, + { + "epoch": 13.206982543640898, + "grad_norm": 8.674677848815918, + "learning_rate": 6.799002493765587e-06, + "loss": 0.3702, + "step": 52960 + }, + { + "epoch": 13.209476309226932, + "grad_norm": 6.525129795074463, + "learning_rate": 6.796508728179552e-06, + "loss": 0.2832, + "step": 52970 + }, + { + "epoch": 13.211970074812967, + "grad_norm": 20.87378692626953, + "learning_rate": 6.794014962593517e-06, + "loss": 0.4099, + "step": 52980 + }, + { + "epoch": 13.214463840399002, + "grad_norm": 6.119201183319092, + "learning_rate": 6.7915211970074825e-06, + "loss": 0.3321, + "step": 52990 + }, + { + "epoch": 13.216957605985037, + "grad_norm": 9.405641555786133, + "learning_rate": 6.789027431421447e-06, + "loss": 0.3069, + "step": 53000 + }, + { + "epoch": 13.219451371571072, + "grad_norm": 9.708418846130371, + "learning_rate": 6.786533665835412e-06, + "loss": 0.3953, + "step": 53010 + }, + { + "epoch": 13.221945137157107, + "grad_norm": 6.832964897155762, + "learning_rate": 6.784039900249378e-06, + "loss": 0.3541, + "step": 53020 + }, + { + "epoch": 13.224438902743142, + "grad_norm": 15.42807674407959, + "learning_rate": 6.781546134663343e-06, + "loss": 0.4261, + "step": 53030 + }, + { + "epoch": 13.226932668329177, + "grad_norm": 17.721216201782227, + "learning_rate": 6.7790523690773065e-06, + "loss": 0.3629, + "step": 53040 + }, + { + "epoch": 13.229426433915211, + "grad_norm": 10.285332679748535, + "learning_rate": 6.776558603491273e-06, + "loss": 0.329, + "step": 53050 + }, + { + "epoch": 13.231920199501246, + "grad_norm": 4.8609113693237305, + "learning_rate": 6.774064837905238e-06, + "loss": 0.3147, + "step": 53060 + }, + { + "epoch": 13.234413965087281, + "grad_norm": 8.452110290527344, + "learning_rate": 6.771571072319203e-06, + "loss": 0.3458, + "step": 53070 + }, + { + "epoch": 13.236907730673316, + "grad_norm": 6.897256851196289, + "learning_rate": 6.769077306733167e-06, + "loss": 0.3171, + "step": 53080 + }, + { + "epoch": 13.239401496259351, + "grad_norm": 8.701833724975586, + "learning_rate": 6.766583541147132e-06, + "loss": 0.3182, + "step": 53090 + }, + { + "epoch": 13.241895261845386, + "grad_norm": 6.701672554016113, + "learning_rate": 6.7640897755610975e-06, + "loss": 0.4088, + "step": 53100 + }, + { + "epoch": 13.24438902743142, + "grad_norm": 9.619290351867676, + "learning_rate": 6.761596009975064e-06, + "loss": 0.354, + "step": 53110 + }, + { + "epoch": 13.246882793017456, + "grad_norm": 8.733819007873535, + "learning_rate": 6.759102244389027e-06, + "loss": 0.3971, + "step": 53120 + }, + { + "epoch": 13.24937655860349, + "grad_norm": 7.536902904510498, + "learning_rate": 6.756608478802993e-06, + "loss": 0.382, + "step": 53130 + }, + { + "epoch": 13.251870324189527, + "grad_norm": 8.806652069091797, + "learning_rate": 6.754114713216958e-06, + "loss": 0.359, + "step": 53140 + }, + { + "epoch": 13.254364089775562, + "grad_norm": 8.000337600708008, + "learning_rate": 6.751620947630923e-06, + "loss": 0.3648, + "step": 53150 + }, + { + "epoch": 13.256857855361597, + "grad_norm": 10.321428298950195, + "learning_rate": 6.749127182044888e-06, + "loss": 0.3462, + "step": 53160 + }, + { + "epoch": 13.259351620947632, + "grad_norm": 11.810160636901855, + "learning_rate": 6.746633416458853e-06, + "loss": 0.3129, + "step": 53170 + }, + { + "epoch": 13.261845386533667, + "grad_norm": 8.895573616027832, + "learning_rate": 6.744139650872818e-06, + "loss": 0.2583, + "step": 53180 + }, + { + "epoch": 13.264339152119701, + "grad_norm": 8.861416816711426, + "learning_rate": 6.741645885286784e-06, + "loss": 0.3389, + "step": 53190 + }, + { + "epoch": 13.266832917705736, + "grad_norm": 7.294126510620117, + "learning_rate": 6.739152119700749e-06, + "loss": 0.3496, + "step": 53200 + }, + { + "epoch": 13.269326683291771, + "grad_norm": 7.825159072875977, + "learning_rate": 6.736658354114713e-06, + "loss": 0.321, + "step": 53210 + }, + { + "epoch": 13.271820448877806, + "grad_norm": 7.070465564727783, + "learning_rate": 6.734164588528679e-06, + "loss": 0.2964, + "step": 53220 + }, + { + "epoch": 13.27431421446384, + "grad_norm": 11.65014934539795, + "learning_rate": 6.731670822942644e-06, + "loss": 0.2895, + "step": 53230 + }, + { + "epoch": 13.276807980049876, + "grad_norm": 4.547674655914307, + "learning_rate": 6.729177057356609e-06, + "loss": 0.3454, + "step": 53240 + }, + { + "epoch": 13.27930174563591, + "grad_norm": 6.833098411560059, + "learning_rate": 6.726683291770574e-06, + "loss": 0.3121, + "step": 53250 + }, + { + "epoch": 13.281795511221945, + "grad_norm": 11.146747589111328, + "learning_rate": 6.724189526184539e-06, + "loss": 0.36, + "step": 53260 + }, + { + "epoch": 13.28428927680798, + "grad_norm": 9.843538284301758, + "learning_rate": 6.721695760598504e-06, + "loss": 0.3221, + "step": 53270 + }, + { + "epoch": 13.286783042394015, + "grad_norm": 8.288082122802734, + "learning_rate": 6.71920199501247e-06, + "loss": 0.3221, + "step": 53280 + }, + { + "epoch": 13.28927680798005, + "grad_norm": 9.893383979797363, + "learning_rate": 6.716708229426434e-06, + "loss": 0.2934, + "step": 53290 + }, + { + "epoch": 13.291770573566085, + "grad_norm": 5.806131362915039, + "learning_rate": 6.7142144638403995e-06, + "loss": 0.401, + "step": 53300 + }, + { + "epoch": 13.29426433915212, + "grad_norm": 9.158594131469727, + "learning_rate": 6.711720698254365e-06, + "loss": 0.2943, + "step": 53310 + }, + { + "epoch": 13.296758104738155, + "grad_norm": 6.4060139656066895, + "learning_rate": 6.70922693266833e-06, + "loss": 0.3538, + "step": 53320 + }, + { + "epoch": 13.29925187032419, + "grad_norm": 10.887904167175293, + "learning_rate": 6.7067331670822946e-06, + "loss": 0.3591, + "step": 53330 + }, + { + "epoch": 13.301745635910224, + "grad_norm": 6.867051124572754, + "learning_rate": 6.70423940149626e-06, + "loss": 0.2973, + "step": 53340 + }, + { + "epoch": 13.30423940149626, + "grad_norm": 6.542517185211182, + "learning_rate": 6.701745635910225e-06, + "loss": 0.2912, + "step": 53350 + }, + { + "epoch": 13.306733167082294, + "grad_norm": 7.9812517166137695, + "learning_rate": 6.6992518703241905e-06, + "loss": 0.29, + "step": 53360 + }, + { + "epoch": 13.309226932668329, + "grad_norm": 7.906765460968018, + "learning_rate": 6.696758104738155e-06, + "loss": 0.2706, + "step": 53370 + }, + { + "epoch": 13.311720698254364, + "grad_norm": 8.329736709594727, + "learning_rate": 6.69426433915212e-06, + "loss": 0.3764, + "step": 53380 + }, + { + "epoch": 13.314214463840399, + "grad_norm": 9.357162475585938, + "learning_rate": 6.691770573566086e-06, + "loss": 0.3357, + "step": 53390 + }, + { + "epoch": 13.316708229426434, + "grad_norm": 7.998769760131836, + "learning_rate": 6.689276807980051e-06, + "loss": 0.3845, + "step": 53400 + }, + { + "epoch": 13.319201995012468, + "grad_norm": 9.348040580749512, + "learning_rate": 6.686783042394015e-06, + "loss": 0.389, + "step": 53410 + }, + { + "epoch": 13.321695760598503, + "grad_norm": 6.765722274780273, + "learning_rate": 6.684289276807981e-06, + "loss": 0.3126, + "step": 53420 + }, + { + "epoch": 13.324189526184538, + "grad_norm": 7.536268711090088, + "learning_rate": 6.681795511221946e-06, + "loss": 0.3189, + "step": 53430 + }, + { + "epoch": 13.326683291770573, + "grad_norm": 5.146828651428223, + "learning_rate": 6.679301745635911e-06, + "loss": 0.3335, + "step": 53440 + }, + { + "epoch": 13.329177057356608, + "grad_norm": 7.732426166534424, + "learning_rate": 6.676807980049877e-06, + "loss": 0.3094, + "step": 53450 + }, + { + "epoch": 13.331670822942643, + "grad_norm": 7.80210018157959, + "learning_rate": 6.674314214463841e-06, + "loss": 0.4274, + "step": 53460 + }, + { + "epoch": 13.334164588528678, + "grad_norm": 11.900932312011719, + "learning_rate": 6.671820448877806e-06, + "loss": 0.3221, + "step": 53470 + }, + { + "epoch": 13.336658354114713, + "grad_norm": 7.599810600280762, + "learning_rate": 6.669326683291772e-06, + "loss": 0.3771, + "step": 53480 + }, + { + "epoch": 13.339152119700747, + "grad_norm": 6.467700958251953, + "learning_rate": 6.666832917705737e-06, + "loss": 0.32, + "step": 53490 + }, + { + "epoch": 13.341645885286782, + "grad_norm": 8.103590965270996, + "learning_rate": 6.664339152119701e-06, + "loss": 0.3803, + "step": 53500 + }, + { + "epoch": 13.344139650872817, + "grad_norm": 8.192358016967773, + "learning_rate": 6.661845386533666e-06, + "loss": 0.3562, + "step": 53510 + }, + { + "epoch": 13.346633416458852, + "grad_norm": 6.621999263763428, + "learning_rate": 6.659351620947632e-06, + "loss": 0.306, + "step": 53520 + }, + { + "epoch": 13.349127182044889, + "grad_norm": 10.717852592468262, + "learning_rate": 6.656857855361597e-06, + "loss": 0.3724, + "step": 53530 + }, + { + "epoch": 13.351620947630924, + "grad_norm": 11.658101081848145, + "learning_rate": 6.654364089775561e-06, + "loss": 0.3673, + "step": 53540 + }, + { + "epoch": 13.354114713216958, + "grad_norm": 8.711099624633789, + "learning_rate": 6.651870324189526e-06, + "loss": 0.3506, + "step": 53550 + }, + { + "epoch": 13.356608478802993, + "grad_norm": 8.620141983032227, + "learning_rate": 6.649376558603492e-06, + "loss": 0.341, + "step": 53560 + }, + { + "epoch": 13.359102244389028, + "grad_norm": 6.421422958374023, + "learning_rate": 6.646882793017457e-06, + "loss": 0.3318, + "step": 53570 + }, + { + "epoch": 13.361596009975063, + "grad_norm": 7.4736127853393555, + "learning_rate": 6.644389027431421e-06, + "loss": 0.386, + "step": 53580 + }, + { + "epoch": 13.364089775561098, + "grad_norm": 8.752002716064453, + "learning_rate": 6.641895261845387e-06, + "loss": 0.3384, + "step": 53590 + }, + { + "epoch": 13.366583541147133, + "grad_norm": 4.710286617279053, + "learning_rate": 6.639401496259352e-06, + "loss": 0.3548, + "step": 53600 + }, + { + "epoch": 13.369077306733168, + "grad_norm": 6.772241115570068, + "learning_rate": 6.636907730673317e-06, + "loss": 0.3097, + "step": 53610 + }, + { + "epoch": 13.371571072319203, + "grad_norm": 9.710107803344727, + "learning_rate": 6.634413965087282e-06, + "loss": 0.3294, + "step": 53620 + }, + { + "epoch": 13.374064837905237, + "grad_norm": 7.724490165710449, + "learning_rate": 6.631920199501247e-06, + "loss": 0.3282, + "step": 53630 + }, + { + "epoch": 13.376558603491272, + "grad_norm": 11.170283317565918, + "learning_rate": 6.629426433915212e-06, + "loss": 0.4139, + "step": 53640 + }, + { + "epoch": 13.379052369077307, + "grad_norm": 11.032581329345703, + "learning_rate": 6.626932668329178e-06, + "loss": 0.3775, + "step": 53650 + }, + { + "epoch": 13.381546134663342, + "grad_norm": 7.674757480621338, + "learning_rate": 6.624438902743142e-06, + "loss": 0.2728, + "step": 53660 + }, + { + "epoch": 13.384039900249377, + "grad_norm": 8.27985954284668, + "learning_rate": 6.6219451371571075e-06, + "loss": 0.3075, + "step": 53670 + }, + { + "epoch": 13.386533665835412, + "grad_norm": 9.89207935333252, + "learning_rate": 6.619451371571073e-06, + "loss": 0.3486, + "step": 53680 + }, + { + "epoch": 13.389027431421447, + "grad_norm": 7.001489162445068, + "learning_rate": 6.616957605985038e-06, + "loss": 0.3218, + "step": 53690 + }, + { + "epoch": 13.391521197007481, + "grad_norm": 8.43419361114502, + "learning_rate": 6.614463840399003e-06, + "loss": 0.3296, + "step": 53700 + }, + { + "epoch": 13.394014962593516, + "grad_norm": 9.101119995117188, + "learning_rate": 6.611970074812968e-06, + "loss": 0.4188, + "step": 53710 + }, + { + "epoch": 13.396508728179551, + "grad_norm": 7.946018218994141, + "learning_rate": 6.609476309226933e-06, + "loss": 0.3638, + "step": 53720 + }, + { + "epoch": 13.399002493765586, + "grad_norm": 4.495571136474609, + "learning_rate": 6.6069825436408985e-06, + "loss": 0.2811, + "step": 53730 + }, + { + "epoch": 13.401496259351621, + "grad_norm": 9.637377738952637, + "learning_rate": 6.604488778054864e-06, + "loss": 0.3226, + "step": 53740 + }, + { + "epoch": 13.403990024937656, + "grad_norm": 12.018438339233398, + "learning_rate": 6.601995012468828e-06, + "loss": 0.3199, + "step": 53750 + }, + { + "epoch": 13.40648379052369, + "grad_norm": 8.298521995544434, + "learning_rate": 6.5995012468827936e-06, + "loss": 0.3609, + "step": 53760 + }, + { + "epoch": 13.408977556109726, + "grad_norm": 8.563523292541504, + "learning_rate": 6.597007481296759e-06, + "loss": 0.319, + "step": 53770 + }, + { + "epoch": 13.41147132169576, + "grad_norm": 6.873093128204346, + "learning_rate": 6.594513715710724e-06, + "loss": 0.3697, + "step": 53780 + }, + { + "epoch": 13.413965087281795, + "grad_norm": 6.433836936950684, + "learning_rate": 6.592019950124689e-06, + "loss": 0.3516, + "step": 53790 + }, + { + "epoch": 13.41645885286783, + "grad_norm": 7.9647908210754395, + "learning_rate": 6.589526184538654e-06, + "loss": 0.3636, + "step": 53800 + }, + { + "epoch": 13.418952618453865, + "grad_norm": 8.467369079589844, + "learning_rate": 6.587032418952619e-06, + "loss": 0.3561, + "step": 53810 + }, + { + "epoch": 13.4214463840399, + "grad_norm": 5.356767654418945, + "learning_rate": 6.584538653366585e-06, + "loss": 0.3067, + "step": 53820 + }, + { + "epoch": 13.423940149625935, + "grad_norm": 8.168974876403809, + "learning_rate": 6.582044887780549e-06, + "loss": 0.3501, + "step": 53830 + }, + { + "epoch": 13.42643391521197, + "grad_norm": 7.033174514770508, + "learning_rate": 6.579551122194514e-06, + "loss": 0.3376, + "step": 53840 + }, + { + "epoch": 13.428927680798004, + "grad_norm": 9.300447463989258, + "learning_rate": 6.57705735660848e-06, + "loss": 0.3486, + "step": 53850 + }, + { + "epoch": 13.43142144638404, + "grad_norm": 7.828090667724609, + "learning_rate": 6.574563591022445e-06, + "loss": 0.3739, + "step": 53860 + }, + { + "epoch": 13.433915211970074, + "grad_norm": 8.298700332641602, + "learning_rate": 6.5720698254364094e-06, + "loss": 0.3279, + "step": 53870 + }, + { + "epoch": 13.436408977556109, + "grad_norm": 6.854353427886963, + "learning_rate": 6.569576059850375e-06, + "loss": 0.2454, + "step": 53880 + }, + { + "epoch": 13.438902743142144, + "grad_norm": 10.509041786193848, + "learning_rate": 6.56708229426434e-06, + "loss": 0.4803, + "step": 53890 + }, + { + "epoch": 13.441396508728179, + "grad_norm": 9.381756782531738, + "learning_rate": 6.564588528678305e-06, + "loss": 0.3879, + "step": 53900 + }, + { + "epoch": 13.443890274314214, + "grad_norm": 7.465641498565674, + "learning_rate": 6.562094763092269e-06, + "loss": 0.3324, + "step": 53910 + }, + { + "epoch": 13.446384039900249, + "grad_norm": 9.815617561340332, + "learning_rate": 6.559600997506234e-06, + "loss": 0.3524, + "step": 53920 + }, + { + "epoch": 13.448877805486283, + "grad_norm": 7.948529243469238, + "learning_rate": 6.5571072319202004e-06, + "loss": 0.4092, + "step": 53930 + }, + { + "epoch": 13.451371571072318, + "grad_norm": 6.234860897064209, + "learning_rate": 6.554613466334166e-06, + "loss": 0.2804, + "step": 53940 + }, + { + "epoch": 13.453865336658355, + "grad_norm": 8.012157440185547, + "learning_rate": 6.552119700748131e-06, + "loss": 0.2756, + "step": 53950 + }, + { + "epoch": 13.45635910224439, + "grad_norm": 9.687308311462402, + "learning_rate": 6.549625935162095e-06, + "loss": 0.3588, + "step": 53960 + }, + { + "epoch": 13.458852867830425, + "grad_norm": 9.27532958984375, + "learning_rate": 6.54713216957606e-06, + "loss": 0.3675, + "step": 53970 + }, + { + "epoch": 13.46134663341646, + "grad_norm": 6.1434712409973145, + "learning_rate": 6.544638403990025e-06, + "loss": 0.3445, + "step": 53980 + }, + { + "epoch": 13.463840399002494, + "grad_norm": 8.947775840759277, + "learning_rate": 6.5421446384039915e-06, + "loss": 0.334, + "step": 53990 + }, + { + "epoch": 13.46633416458853, + "grad_norm": 9.55975341796875, + "learning_rate": 6.539650872817955e-06, + "loss": 0.4319, + "step": 54000 + }, + { + "epoch": 13.468827930174564, + "grad_norm": 7.890659332275391, + "learning_rate": 6.53715710723192e-06, + "loss": 0.3558, + "step": 54010 + }, + { + "epoch": 13.471321695760599, + "grad_norm": 9.459158897399902, + "learning_rate": 6.534663341645886e-06, + "loss": 0.347, + "step": 54020 + }, + { + "epoch": 13.473815461346634, + "grad_norm": 10.759072303771973, + "learning_rate": 6.532169576059851e-06, + "loss": 0.336, + "step": 54030 + }, + { + "epoch": 13.476309226932669, + "grad_norm": 5.313018798828125, + "learning_rate": 6.5296758104738155e-06, + "loss": 0.2731, + "step": 54040 + }, + { + "epoch": 13.478802992518704, + "grad_norm": 6.71405029296875, + "learning_rate": 6.527182044887781e-06, + "loss": 0.3615, + "step": 54050 + }, + { + "epoch": 13.481296758104738, + "grad_norm": 10.912115097045898, + "learning_rate": 6.524688279301746e-06, + "loss": 0.3431, + "step": 54060 + }, + { + "epoch": 13.483790523690773, + "grad_norm": 7.443094730377197, + "learning_rate": 6.522194513715711e-06, + "loss": 0.3877, + "step": 54070 + }, + { + "epoch": 13.486284289276808, + "grad_norm": 6.929405689239502, + "learning_rate": 6.519700748129676e-06, + "loss": 0.2966, + "step": 54080 + }, + { + "epoch": 13.488778054862843, + "grad_norm": 6.779537200927734, + "learning_rate": 6.517206982543641e-06, + "loss": 0.2756, + "step": 54090 + }, + { + "epoch": 13.491271820448878, + "grad_norm": 13.575571060180664, + "learning_rate": 6.5147132169576065e-06, + "loss": 0.3851, + "step": 54100 + }, + { + "epoch": 13.493765586034913, + "grad_norm": 6.581482410430908, + "learning_rate": 6.512219451371572e-06, + "loss": 0.3736, + "step": 54110 + }, + { + "epoch": 13.496259351620948, + "grad_norm": 9.323427200317383, + "learning_rate": 6.509725685785536e-06, + "loss": 0.4017, + "step": 54120 + }, + { + "epoch": 13.498753117206983, + "grad_norm": 6.501451015472412, + "learning_rate": 6.5072319201995016e-06, + "loss": 0.363, + "step": 54130 + }, + { + "epoch": 13.501246882793017, + "grad_norm": 7.850208282470703, + "learning_rate": 6.504738154613467e-06, + "loss": 0.351, + "step": 54140 + }, + { + "epoch": 13.503740648379052, + "grad_norm": 7.5729780197143555, + "learning_rate": 6.502244389027432e-06, + "loss": 0.2832, + "step": 54150 + }, + { + "epoch": 13.506234413965087, + "grad_norm": 5.65486478805542, + "learning_rate": 6.499750623441397e-06, + "loss": 0.3237, + "step": 54160 + }, + { + "epoch": 13.508728179551122, + "grad_norm": 10.24453067779541, + "learning_rate": 6.497256857855362e-06, + "loss": 0.3901, + "step": 54170 + }, + { + "epoch": 13.511221945137157, + "grad_norm": 7.5691962242126465, + "learning_rate": 6.494763092269327e-06, + "loss": 0.3493, + "step": 54180 + }, + { + "epoch": 13.513715710723192, + "grad_norm": 8.992122650146484, + "learning_rate": 6.4922693266832926e-06, + "loss": 0.3426, + "step": 54190 + }, + { + "epoch": 13.516209476309227, + "grad_norm": 6.614286422729492, + "learning_rate": 6.489775561097258e-06, + "loss": 0.3462, + "step": 54200 + }, + { + "epoch": 13.518703241895262, + "grad_norm": 8.444540977478027, + "learning_rate": 6.487281795511222e-06, + "loss": 0.2959, + "step": 54210 + }, + { + "epoch": 13.521197007481296, + "grad_norm": 4.639540672302246, + "learning_rate": 6.484788029925188e-06, + "loss": 0.3116, + "step": 54220 + }, + { + "epoch": 13.523690773067331, + "grad_norm": 5.426086902618408, + "learning_rate": 6.482294264339153e-06, + "loss": 0.3411, + "step": 54230 + }, + { + "epoch": 13.526184538653366, + "grad_norm": 12.386012077331543, + "learning_rate": 6.479800498753118e-06, + "loss": 0.3102, + "step": 54240 + }, + { + "epoch": 13.528678304239401, + "grad_norm": 6.084399700164795, + "learning_rate": 6.477306733167083e-06, + "loss": 0.3842, + "step": 54250 + }, + { + "epoch": 13.531172069825436, + "grad_norm": 7.526169300079346, + "learning_rate": 6.474812967581048e-06, + "loss": 0.3568, + "step": 54260 + }, + { + "epoch": 13.53366583541147, + "grad_norm": 7.161496639251709, + "learning_rate": 6.472319201995013e-06, + "loss": 0.3225, + "step": 54270 + }, + { + "epoch": 13.536159600997506, + "grad_norm": 5.644833087921143, + "learning_rate": 6.469825436408979e-06, + "loss": 0.293, + "step": 54280 + }, + { + "epoch": 13.53865336658354, + "grad_norm": 11.303448677062988, + "learning_rate": 6.467331670822943e-06, + "loss": 0.3523, + "step": 54290 + }, + { + "epoch": 13.541147132169575, + "grad_norm": 9.144119262695312, + "learning_rate": 6.4648379052369084e-06, + "loss": 0.3661, + "step": 54300 + }, + { + "epoch": 13.54364089775561, + "grad_norm": 8.02839469909668, + "learning_rate": 6.462344139650874e-06, + "loss": 0.3792, + "step": 54310 + }, + { + "epoch": 13.546134663341645, + "grad_norm": 9.855666160583496, + "learning_rate": 6.459850374064839e-06, + "loss": 0.3324, + "step": 54320 + }, + { + "epoch": 13.548628428927682, + "grad_norm": 8.013253211975098, + "learning_rate": 6.457356608478803e-06, + "loss": 0.3991, + "step": 54330 + }, + { + "epoch": 13.551122194513717, + "grad_norm": 8.09378433227539, + "learning_rate": 6.454862842892769e-06, + "loss": 0.3573, + "step": 54340 + }, + { + "epoch": 13.553615960099751, + "grad_norm": 8.810187339782715, + "learning_rate": 6.452369077306734e-06, + "loss": 0.335, + "step": 54350 + }, + { + "epoch": 13.556109725685786, + "grad_norm": 5.002645492553711, + "learning_rate": 6.4498753117206994e-06, + "loss": 0.2775, + "step": 54360 + }, + { + "epoch": 13.558603491271821, + "grad_norm": 5.791070938110352, + "learning_rate": 6.447381546134663e-06, + "loss": 0.2979, + "step": 54370 + }, + { + "epoch": 13.561097256857856, + "grad_norm": 11.853717803955078, + "learning_rate": 6.444887780548628e-06, + "loss": 0.3168, + "step": 54380 + }, + { + "epoch": 13.563591022443891, + "grad_norm": 7.837883949279785, + "learning_rate": 6.442394014962594e-06, + "loss": 0.4237, + "step": 54390 + }, + { + "epoch": 13.566084788029926, + "grad_norm": 8.088776588439941, + "learning_rate": 6.43990024937656e-06, + "loss": 0.29, + "step": 54400 + }, + { + "epoch": 13.56857855361596, + "grad_norm": 8.3961763381958, + "learning_rate": 6.4374064837905235e-06, + "loss": 0.3884, + "step": 54410 + }, + { + "epoch": 13.571072319201996, + "grad_norm": 8.64345645904541, + "learning_rate": 6.434912718204489e-06, + "loss": 0.335, + "step": 54420 + }, + { + "epoch": 13.57356608478803, + "grad_norm": 5.859470844268799, + "learning_rate": 6.432418952618454e-06, + "loss": 0.3534, + "step": 54430 + }, + { + "epoch": 13.576059850374065, + "grad_norm": 3.3717424869537354, + "learning_rate": 6.429925187032419e-06, + "loss": 0.2729, + "step": 54440 + }, + { + "epoch": 13.5785536159601, + "grad_norm": 6.788022518157959, + "learning_rate": 6.427431421446385e-06, + "loss": 0.311, + "step": 54450 + }, + { + "epoch": 13.581047381546135, + "grad_norm": 5.033174514770508, + "learning_rate": 6.424937655860349e-06, + "loss": 0.3251, + "step": 54460 + }, + { + "epoch": 13.58354114713217, + "grad_norm": 8.036091804504395, + "learning_rate": 6.4224438902743145e-06, + "loss": 0.4072, + "step": 54470 + }, + { + "epoch": 13.586034912718205, + "grad_norm": 8.163284301757812, + "learning_rate": 6.41995012468828e-06, + "loss": 0.318, + "step": 54480 + }, + { + "epoch": 13.58852867830424, + "grad_norm": 6.626342296600342, + "learning_rate": 6.417456359102245e-06, + "loss": 0.3064, + "step": 54490 + }, + { + "epoch": 13.591022443890274, + "grad_norm": 7.7200469970703125, + "learning_rate": 6.4149625935162095e-06, + "loss": 0.3844, + "step": 54500 + }, + { + "epoch": 13.59351620947631, + "grad_norm": 8.76436710357666, + "learning_rate": 6.412468827930175e-06, + "loss": 0.3757, + "step": 54510 + }, + { + "epoch": 13.596009975062344, + "grad_norm": 9.245163917541504, + "learning_rate": 6.40997506234414e-06, + "loss": 0.3857, + "step": 54520 + }, + { + "epoch": 13.598503740648379, + "grad_norm": 6.557892322540283, + "learning_rate": 6.4074812967581055e-06, + "loss": 0.3491, + "step": 54530 + }, + { + "epoch": 13.600997506234414, + "grad_norm": 9.482400894165039, + "learning_rate": 6.40498753117207e-06, + "loss": 0.3553, + "step": 54540 + }, + { + "epoch": 13.603491271820449, + "grad_norm": 8.862476348876953, + "learning_rate": 6.402493765586035e-06, + "loss": 0.334, + "step": 54550 + }, + { + "epoch": 13.605985037406484, + "grad_norm": 6.397747039794922, + "learning_rate": 6.4000000000000006e-06, + "loss": 0.3245, + "step": 54560 + }, + { + "epoch": 13.608478802992519, + "grad_norm": 6.339082717895508, + "learning_rate": 6.397506234413966e-06, + "loss": 0.3779, + "step": 54570 + }, + { + "epoch": 13.610972568578553, + "grad_norm": 11.248029708862305, + "learning_rate": 6.39501246882793e-06, + "loss": 0.3455, + "step": 54580 + }, + { + "epoch": 13.613466334164588, + "grad_norm": 6.5518341064453125, + "learning_rate": 6.392518703241896e-06, + "loss": 0.2906, + "step": 54590 + }, + { + "epoch": 13.615960099750623, + "grad_norm": 6.682394981384277, + "learning_rate": 6.390024937655861e-06, + "loss": 0.314, + "step": 54600 + }, + { + "epoch": 13.618453865336658, + "grad_norm": 8.466720581054688, + "learning_rate": 6.387531172069826e-06, + "loss": 0.4433, + "step": 54610 + }, + { + "epoch": 13.620947630922693, + "grad_norm": 13.29272174835205, + "learning_rate": 6.385037406483791e-06, + "loss": 0.3942, + "step": 54620 + }, + { + "epoch": 13.623441396508728, + "grad_norm": 6.229150295257568, + "learning_rate": 6.382543640897756e-06, + "loss": 0.3084, + "step": 54630 + }, + { + "epoch": 13.625935162094763, + "grad_norm": 10.2169828414917, + "learning_rate": 6.380049875311721e-06, + "loss": 0.4638, + "step": 54640 + }, + { + "epoch": 13.628428927680797, + "grad_norm": 8.496891021728516, + "learning_rate": 6.377556109725687e-06, + "loss": 0.3698, + "step": 54650 + }, + { + "epoch": 13.630922693266832, + "grad_norm": 8.761347770690918, + "learning_rate": 6.375062344139651e-06, + "loss": 0.3581, + "step": 54660 + }, + { + "epoch": 13.633416458852867, + "grad_norm": 7.339252948760986, + "learning_rate": 6.372568578553616e-06, + "loss": 0.3113, + "step": 54670 + }, + { + "epoch": 13.635910224438902, + "grad_norm": 8.492396354675293, + "learning_rate": 6.370074812967582e-06, + "loss": 0.3409, + "step": 54680 + }, + { + "epoch": 13.638403990024937, + "grad_norm": 11.395689964294434, + "learning_rate": 6.367581047381547e-06, + "loss": 0.3892, + "step": 54690 + }, + { + "epoch": 13.640897755610972, + "grad_norm": 8.986297607421875, + "learning_rate": 6.365087281795512e-06, + "loss": 0.3154, + "step": 54700 + }, + { + "epoch": 13.643391521197007, + "grad_norm": 9.709697723388672, + "learning_rate": 6.362593516209477e-06, + "loss": 0.3331, + "step": 54710 + }, + { + "epoch": 13.645885286783042, + "grad_norm": 6.466365337371826, + "learning_rate": 6.360099750623442e-06, + "loss": 0.273, + "step": 54720 + }, + { + "epoch": 13.648379052369076, + "grad_norm": 10.639359474182129, + "learning_rate": 6.3576059850374074e-06, + "loss": 0.3514, + "step": 54730 + }, + { + "epoch": 13.650872817955111, + "grad_norm": 7.763825416564941, + "learning_rate": 6.355112219451373e-06, + "loss": 0.3726, + "step": 54740 + }, + { + "epoch": 13.653366583541148, + "grad_norm": 6.4132585525512695, + "learning_rate": 6.352618453865337e-06, + "loss": 0.3819, + "step": 54750 + }, + { + "epoch": 13.655860349127183, + "grad_norm": 9.225993156433105, + "learning_rate": 6.3501246882793025e-06, + "loss": 0.32, + "step": 54760 + }, + { + "epoch": 13.658354114713218, + "grad_norm": 7.557957172393799, + "learning_rate": 6.347630922693268e-06, + "loss": 0.3545, + "step": 54770 + }, + { + "epoch": 13.660847880299253, + "grad_norm": 6.421631813049316, + "learning_rate": 6.345137157107233e-06, + "loss": 0.2719, + "step": 54780 + }, + { + "epoch": 13.663341645885287, + "grad_norm": 7.9836602210998535, + "learning_rate": 6.342643391521197e-06, + "loss": 0.3194, + "step": 54790 + }, + { + "epoch": 13.665835411471322, + "grad_norm": 7.941399097442627, + "learning_rate": 6.340149625935162e-06, + "loss": 0.2969, + "step": 54800 + }, + { + "epoch": 13.668329177057357, + "grad_norm": 6.198439121246338, + "learning_rate": 6.337655860349128e-06, + "loss": 0.3156, + "step": 54810 + }, + { + "epoch": 13.670822942643392, + "grad_norm": 8.44516658782959, + "learning_rate": 6.3351620947630935e-06, + "loss": 0.3344, + "step": 54820 + }, + { + "epoch": 13.673316708229427, + "grad_norm": 8.17326831817627, + "learning_rate": 6.332668329177057e-06, + "loss": 0.2907, + "step": 54830 + }, + { + "epoch": 13.675810473815462, + "grad_norm": 8.938278198242188, + "learning_rate": 6.3301745635910225e-06, + "loss": 0.3873, + "step": 54840 + }, + { + "epoch": 13.678304239401497, + "grad_norm": 7.996182918548584, + "learning_rate": 6.327680798004988e-06, + "loss": 0.3435, + "step": 54850 + }, + { + "epoch": 13.680798004987532, + "grad_norm": 5.846700191497803, + "learning_rate": 6.325187032418953e-06, + "loss": 0.3513, + "step": 54860 + }, + { + "epoch": 13.683291770573566, + "grad_norm": 5.51074743270874, + "learning_rate": 6.3226932668329175e-06, + "loss": 0.3079, + "step": 54870 + }, + { + "epoch": 13.685785536159601, + "grad_norm": 6.8024492263793945, + "learning_rate": 6.320199501246883e-06, + "loss": 0.306, + "step": 54880 + }, + { + "epoch": 13.688279301745636, + "grad_norm": 11.52742862701416, + "learning_rate": 6.317705735660848e-06, + "loss": 0.3317, + "step": 54890 + }, + { + "epoch": 13.690773067331671, + "grad_norm": 14.795852661132812, + "learning_rate": 6.3152119700748135e-06, + "loss": 0.3375, + "step": 54900 + }, + { + "epoch": 13.693266832917706, + "grad_norm": 12.156822204589844, + "learning_rate": 6.312718204488778e-06, + "loss": 0.3397, + "step": 54910 + }, + { + "epoch": 13.69576059850374, + "grad_norm": 9.437907218933105, + "learning_rate": 6.310224438902743e-06, + "loss": 0.365, + "step": 54920 + }, + { + "epoch": 13.698254364089776, + "grad_norm": 9.907096862792969, + "learning_rate": 6.3077306733167085e-06, + "loss": 0.3039, + "step": 54930 + }, + { + "epoch": 13.70074812967581, + "grad_norm": 8.711502075195312, + "learning_rate": 6.305236907730674e-06, + "loss": 0.3617, + "step": 54940 + }, + { + "epoch": 13.703241895261845, + "grad_norm": 7.6768622398376465, + "learning_rate": 6.302743142144639e-06, + "loss": 0.3392, + "step": 54950 + }, + { + "epoch": 13.70573566084788, + "grad_norm": 5.637898921966553, + "learning_rate": 6.300249376558604e-06, + "loss": 0.3591, + "step": 54960 + }, + { + "epoch": 13.708229426433915, + "grad_norm": 8.709186553955078, + "learning_rate": 6.297755610972569e-06, + "loss": 0.3944, + "step": 54970 + }, + { + "epoch": 13.71072319201995, + "grad_norm": 9.957995414733887, + "learning_rate": 6.295261845386534e-06, + "loss": 0.2579, + "step": 54980 + }, + { + "epoch": 13.713216957605985, + "grad_norm": 7.942922115325928, + "learning_rate": 6.2927680798004996e-06, + "loss": 0.3825, + "step": 54990 + }, + { + "epoch": 13.71571072319202, + "grad_norm": 6.098124980926514, + "learning_rate": 6.290274314214464e-06, + "loss": 0.315, + "step": 55000 + }, + { + "epoch": 13.718204488778055, + "grad_norm": 6.805457592010498, + "learning_rate": 6.287780548628429e-06, + "loss": 0.3479, + "step": 55010 + }, + { + "epoch": 13.72069825436409, + "grad_norm": 7.8078765869140625, + "learning_rate": 6.285286783042395e-06, + "loss": 0.3516, + "step": 55020 + }, + { + "epoch": 13.723192019950124, + "grad_norm": 8.491788864135742, + "learning_rate": 6.28279301745636e-06, + "loss": 0.3239, + "step": 55030 + }, + { + "epoch": 13.72568578553616, + "grad_norm": 5.934401512145996, + "learning_rate": 6.280299251870324e-06, + "loss": 0.3531, + "step": 55040 + }, + { + "epoch": 13.728179551122194, + "grad_norm": 6.76987886428833, + "learning_rate": 6.27780548628429e-06, + "loss": 0.37, + "step": 55050 + }, + { + "epoch": 13.730673316708229, + "grad_norm": 9.285640716552734, + "learning_rate": 6.275311720698255e-06, + "loss": 0.3165, + "step": 55060 + }, + { + "epoch": 13.733167082294264, + "grad_norm": 6.509362697601318, + "learning_rate": 6.27281795511222e-06, + "loss": 0.3212, + "step": 55070 + }, + { + "epoch": 13.735660847880299, + "grad_norm": 9.682168006896973, + "learning_rate": 6.270324189526185e-06, + "loss": 0.3663, + "step": 55080 + }, + { + "epoch": 13.738154613466333, + "grad_norm": 12.535362243652344, + "learning_rate": 6.26783042394015e-06, + "loss": 0.3563, + "step": 55090 + }, + { + "epoch": 13.740648379052368, + "grad_norm": 7.953882217407227, + "learning_rate": 6.2653366583541154e-06, + "loss": 0.3366, + "step": 55100 + }, + { + "epoch": 13.743142144638403, + "grad_norm": 9.631477355957031, + "learning_rate": 6.262842892768081e-06, + "loss": 0.322, + "step": 55110 + }, + { + "epoch": 13.745635910224438, + "grad_norm": 7.036581516265869, + "learning_rate": 6.260349127182045e-06, + "loss": 0.3567, + "step": 55120 + }, + { + "epoch": 13.748129675810475, + "grad_norm": 10.336029052734375, + "learning_rate": 6.2578553615960105e-06, + "loss": 0.3196, + "step": 55130 + }, + { + "epoch": 13.75062344139651, + "grad_norm": 5.572134017944336, + "learning_rate": 6.255361596009976e-06, + "loss": 0.2873, + "step": 55140 + }, + { + "epoch": 13.753117206982544, + "grad_norm": 7.690764427185059, + "learning_rate": 6.252867830423941e-06, + "loss": 0.3743, + "step": 55150 + }, + { + "epoch": 13.75561097256858, + "grad_norm": 9.607911109924316, + "learning_rate": 6.250374064837906e-06, + "loss": 0.3554, + "step": 55160 + }, + { + "epoch": 13.758104738154614, + "grad_norm": 7.156585216522217, + "learning_rate": 6.247880299251871e-06, + "loss": 0.3376, + "step": 55170 + }, + { + "epoch": 13.760598503740649, + "grad_norm": 7.8262128829956055, + "learning_rate": 6.245386533665836e-06, + "loss": 0.3767, + "step": 55180 + }, + { + "epoch": 13.763092269326684, + "grad_norm": 7.737610340118408, + "learning_rate": 6.2428927680798015e-06, + "loss": 0.3563, + "step": 55190 + }, + { + "epoch": 13.765586034912719, + "grad_norm": 8.548540115356445, + "learning_rate": 6.240399002493767e-06, + "loss": 0.3358, + "step": 55200 + }, + { + "epoch": 13.768079800498754, + "grad_norm": 5.896496295928955, + "learning_rate": 6.2379052369077304e-06, + "loss": 0.3586, + "step": 55210 + }, + { + "epoch": 13.770573566084789, + "grad_norm": 6.512773036956787, + "learning_rate": 6.235411471321697e-06, + "loss": 0.3925, + "step": 55220 + }, + { + "epoch": 13.773067331670823, + "grad_norm": 8.241216659545898, + "learning_rate": 6.232917705735662e-06, + "loss": 0.3697, + "step": 55230 + }, + { + "epoch": 13.775561097256858, + "grad_norm": 7.041107654571533, + "learning_rate": 6.230423940149627e-06, + "loss": 0.3284, + "step": 55240 + }, + { + "epoch": 13.778054862842893, + "grad_norm": 7.809773921966553, + "learning_rate": 6.227930174563591e-06, + "loss": 0.3691, + "step": 55250 + }, + { + "epoch": 13.780548628428928, + "grad_norm": 6.446750640869141, + "learning_rate": 6.225436408977556e-06, + "loss": 0.3216, + "step": 55260 + }, + { + "epoch": 13.783042394014963, + "grad_norm": 6.692153453826904, + "learning_rate": 6.2229426433915215e-06, + "loss": 0.3639, + "step": 55270 + }, + { + "epoch": 13.785536159600998, + "grad_norm": 8.99974536895752, + "learning_rate": 6.220448877805488e-06, + "loss": 0.3577, + "step": 55280 + }, + { + "epoch": 13.788029925187033, + "grad_norm": 8.162135124206543, + "learning_rate": 6.217955112219451e-06, + "loss": 0.3082, + "step": 55290 + }, + { + "epoch": 13.790523690773068, + "grad_norm": 9.059871673583984, + "learning_rate": 6.2154613466334165e-06, + "loss": 0.3538, + "step": 55300 + }, + { + "epoch": 13.793017456359102, + "grad_norm": 8.866132736206055, + "learning_rate": 6.212967581047382e-06, + "loss": 0.3224, + "step": 55310 + }, + { + "epoch": 13.795511221945137, + "grad_norm": 6.690679550170898, + "learning_rate": 6.210473815461347e-06, + "loss": 0.3352, + "step": 55320 + }, + { + "epoch": 13.798004987531172, + "grad_norm": 7.613649368286133, + "learning_rate": 6.207980049875312e-06, + "loss": 0.3311, + "step": 55330 + }, + { + "epoch": 13.800498753117207, + "grad_norm": 5.005457878112793, + "learning_rate": 6.205486284289277e-06, + "loss": 0.3305, + "step": 55340 + }, + { + "epoch": 13.802992518703242, + "grad_norm": 5.639405250549316, + "learning_rate": 6.202992518703242e-06, + "loss": 0.3504, + "step": 55350 + }, + { + "epoch": 13.805486284289277, + "grad_norm": 4.40685510635376, + "learning_rate": 6.2004987531172075e-06, + "loss": 0.3078, + "step": 55360 + }, + { + "epoch": 13.807980049875312, + "grad_norm": 6.308961868286133, + "learning_rate": 6.198004987531172e-06, + "loss": 0.3049, + "step": 55370 + }, + { + "epoch": 13.810473815461346, + "grad_norm": 6.514129638671875, + "learning_rate": 6.195511221945137e-06, + "loss": 0.3479, + "step": 55380 + }, + { + "epoch": 13.812967581047381, + "grad_norm": 6.240500450134277, + "learning_rate": 6.193017456359103e-06, + "loss": 0.3622, + "step": 55390 + }, + { + "epoch": 13.815461346633416, + "grad_norm": 6.060596466064453, + "learning_rate": 6.190523690773068e-06, + "loss": 0.3123, + "step": 55400 + }, + { + "epoch": 13.817955112219451, + "grad_norm": 8.058761596679688, + "learning_rate": 6.188029925187032e-06, + "loss": 0.3519, + "step": 55410 + }, + { + "epoch": 13.820448877805486, + "grad_norm": 5.577200889587402, + "learning_rate": 6.185536159600998e-06, + "loss": 0.3323, + "step": 55420 + }, + { + "epoch": 13.82294264339152, + "grad_norm": 8.922237396240234, + "learning_rate": 6.183042394014963e-06, + "loss": 0.3886, + "step": 55430 + }, + { + "epoch": 13.825436408977556, + "grad_norm": 10.180780410766602, + "learning_rate": 6.180548628428928e-06, + "loss": 0.3128, + "step": 55440 + }, + { + "epoch": 13.82793017456359, + "grad_norm": 6.981229305267334, + "learning_rate": 6.178054862842894e-06, + "loss": 0.2746, + "step": 55450 + }, + { + "epoch": 13.830423940149625, + "grad_norm": 5.601588726043701, + "learning_rate": 6.175561097256858e-06, + "loss": 0.2593, + "step": 55460 + }, + { + "epoch": 13.83291770573566, + "grad_norm": 9.501214027404785, + "learning_rate": 6.173067331670823e-06, + "loss": 0.3332, + "step": 55470 + }, + { + "epoch": 13.835411471321695, + "grad_norm": 10.897354125976562, + "learning_rate": 6.170573566084789e-06, + "loss": 0.3535, + "step": 55480 + }, + { + "epoch": 13.83790523690773, + "grad_norm": 9.307845115661621, + "learning_rate": 6.168079800498754e-06, + "loss": 0.3206, + "step": 55490 + }, + { + "epoch": 13.840399002493765, + "grad_norm": 5.495110511779785, + "learning_rate": 6.1655860349127185e-06, + "loss": 0.3112, + "step": 55500 + }, + { + "epoch": 13.8428927680798, + "grad_norm": 9.099871635437012, + "learning_rate": 6.163092269326684e-06, + "loss": 0.3681, + "step": 55510 + }, + { + "epoch": 13.845386533665835, + "grad_norm": 7.402480602264404, + "learning_rate": 6.160598503740649e-06, + "loss": 0.3062, + "step": 55520 + }, + { + "epoch": 13.84788029925187, + "grad_norm": 6.1998982429504395, + "learning_rate": 6.1581047381546144e-06, + "loss": 0.3698, + "step": 55530 + }, + { + "epoch": 13.850374064837904, + "grad_norm": 5.2379021644592285, + "learning_rate": 6.155610972568579e-06, + "loss": 0.319, + "step": 55540 + }, + { + "epoch": 13.85286783042394, + "grad_norm": 7.226074695587158, + "learning_rate": 6.153117206982544e-06, + "loss": 0.35, + "step": 55550 + }, + { + "epoch": 13.855361596009976, + "grad_norm": 8.524081230163574, + "learning_rate": 6.1506234413965095e-06, + "loss": 0.3067, + "step": 55560 + }, + { + "epoch": 13.85785536159601, + "grad_norm": 7.00763463973999, + "learning_rate": 6.148129675810475e-06, + "loss": 0.3146, + "step": 55570 + }, + { + "epoch": 13.860349127182046, + "grad_norm": 10.624029159545898, + "learning_rate": 6.145635910224439e-06, + "loss": 0.3798, + "step": 55580 + }, + { + "epoch": 13.86284289276808, + "grad_norm": 8.03925609588623, + "learning_rate": 6.143142144638405e-06, + "loss": 0.3862, + "step": 55590 + }, + { + "epoch": 13.865336658354115, + "grad_norm": 12.083026885986328, + "learning_rate": 6.14064837905237e-06, + "loss": 0.3485, + "step": 55600 + }, + { + "epoch": 13.86783042394015, + "grad_norm": 7.343206405639648, + "learning_rate": 6.138154613466335e-06, + "loss": 0.3835, + "step": 55610 + }, + { + "epoch": 13.870324189526185, + "grad_norm": 11.571039199829102, + "learning_rate": 6.1356608478803e-06, + "loss": 0.3813, + "step": 55620 + }, + { + "epoch": 13.87281795511222, + "grad_norm": 5.217142581939697, + "learning_rate": 6.133167082294265e-06, + "loss": 0.3064, + "step": 55630 + }, + { + "epoch": 13.875311720698255, + "grad_norm": 7.893732070922852, + "learning_rate": 6.13067331670823e-06, + "loss": 0.3811, + "step": 55640 + }, + { + "epoch": 13.87780548628429, + "grad_norm": 7.880045413970947, + "learning_rate": 6.128179551122196e-06, + "loss": 0.3338, + "step": 55650 + }, + { + "epoch": 13.880299251870325, + "grad_norm": 10.907844543457031, + "learning_rate": 6.125685785536159e-06, + "loss": 0.4448, + "step": 55660 + }, + { + "epoch": 13.88279301745636, + "grad_norm": 6.203114032745361, + "learning_rate": 6.1231920199501245e-06, + "loss": 0.3407, + "step": 55670 + }, + { + "epoch": 13.885286783042394, + "grad_norm": 9.57861328125, + "learning_rate": 6.120698254364091e-06, + "loss": 0.3475, + "step": 55680 + }, + { + "epoch": 13.88778054862843, + "grad_norm": 9.820565223693848, + "learning_rate": 6.118204488778056e-06, + "loss": 0.3151, + "step": 55690 + }, + { + "epoch": 13.890274314214464, + "grad_norm": 7.9553046226501465, + "learning_rate": 6.115710723192021e-06, + "loss": 0.3343, + "step": 55700 + }, + { + "epoch": 13.892768079800499, + "grad_norm": 7.718328952789307, + "learning_rate": 6.113216957605985e-06, + "loss": 0.3281, + "step": 55710 + }, + { + "epoch": 13.895261845386534, + "grad_norm": 9.396270751953125, + "learning_rate": 6.11072319201995e-06, + "loss": 0.3393, + "step": 55720 + }, + { + "epoch": 13.897755610972569, + "grad_norm": 6.914912223815918, + "learning_rate": 6.1082294264339155e-06, + "loss": 0.3104, + "step": 55730 + }, + { + "epoch": 13.900249376558603, + "grad_norm": 7.916605472564697, + "learning_rate": 6.105735660847882e-06, + "loss": 0.3162, + "step": 55740 + }, + { + "epoch": 13.902743142144638, + "grad_norm": 7.331538200378418, + "learning_rate": 6.103241895261845e-06, + "loss": 0.3076, + "step": 55750 + }, + { + "epoch": 13.905236907730673, + "grad_norm": 6.395208835601807, + "learning_rate": 6.100748129675811e-06, + "loss": 0.2805, + "step": 55760 + }, + { + "epoch": 13.907730673316708, + "grad_norm": 9.674216270446777, + "learning_rate": 6.098254364089776e-06, + "loss": 0.303, + "step": 55770 + }, + { + "epoch": 13.910224438902743, + "grad_norm": 8.970970153808594, + "learning_rate": 6.095760598503741e-06, + "loss": 0.3443, + "step": 55780 + }, + { + "epoch": 13.912718204488778, + "grad_norm": 7.25998592376709, + "learning_rate": 6.093266832917706e-06, + "loss": 0.2639, + "step": 55790 + }, + { + "epoch": 13.915211970074813, + "grad_norm": 10.04069709777832, + "learning_rate": 6.090773067331671e-06, + "loss": 0.361, + "step": 55800 + }, + { + "epoch": 13.917705735660848, + "grad_norm": 10.420782089233398, + "learning_rate": 6.088279301745636e-06, + "loss": 0.3985, + "step": 55810 + }, + { + "epoch": 13.920199501246882, + "grad_norm": 7.295711040496826, + "learning_rate": 6.085785536159602e-06, + "loss": 0.2836, + "step": 55820 + }, + { + "epoch": 13.922693266832917, + "grad_norm": 10.063495635986328, + "learning_rate": 6.083291770573566e-06, + "loss": 0.3421, + "step": 55830 + }, + { + "epoch": 13.925187032418952, + "grad_norm": 11.42710018157959, + "learning_rate": 6.080798004987531e-06, + "loss": 0.3341, + "step": 55840 + }, + { + "epoch": 13.927680798004987, + "grad_norm": 5.669824123382568, + "learning_rate": 6.078304239401497e-06, + "loss": 0.3636, + "step": 55850 + }, + { + "epoch": 13.930174563591022, + "grad_norm": 9.360200881958008, + "learning_rate": 6.075810473815462e-06, + "loss": 0.3751, + "step": 55860 + }, + { + "epoch": 13.932668329177057, + "grad_norm": 8.900922775268555, + "learning_rate": 6.0733167082294265e-06, + "loss": 0.3607, + "step": 55870 + }, + { + "epoch": 13.935162094763092, + "grad_norm": 5.942974090576172, + "learning_rate": 6.070822942643392e-06, + "loss": 0.2925, + "step": 55880 + }, + { + "epoch": 13.937655860349127, + "grad_norm": 8.712531089782715, + "learning_rate": 6.068329177057357e-06, + "loss": 0.3076, + "step": 55890 + }, + { + "epoch": 13.940149625935161, + "grad_norm": 5.564552307128906, + "learning_rate": 6.065835411471322e-06, + "loss": 0.347, + "step": 55900 + }, + { + "epoch": 13.942643391521196, + "grad_norm": 11.01707935333252, + "learning_rate": 6.063341645885287e-06, + "loss": 0.3483, + "step": 55910 + }, + { + "epoch": 13.945137157107231, + "grad_norm": 6.945023536682129, + "learning_rate": 6.060847880299252e-06, + "loss": 0.3604, + "step": 55920 + }, + { + "epoch": 13.947630922693268, + "grad_norm": 6.796305179595947, + "learning_rate": 6.0583541147132175e-06, + "loss": 0.3503, + "step": 55930 + }, + { + "epoch": 13.950124688279303, + "grad_norm": 5.797430515289307, + "learning_rate": 6.055860349127183e-06, + "loss": 0.4618, + "step": 55940 + }, + { + "epoch": 13.952618453865338, + "grad_norm": 8.793232917785645, + "learning_rate": 6.053615960099751e-06, + "loss": 0.3292, + "step": 55950 + }, + { + "epoch": 13.955112219451372, + "grad_norm": 7.438122272491455, + "learning_rate": 6.051122194513716e-06, + "loss": 0.2957, + "step": 55960 + }, + { + "epoch": 13.957605985037407, + "grad_norm": 9.06357192993164, + "learning_rate": 6.048628428927681e-06, + "loss": 0.392, + "step": 55970 + }, + { + "epoch": 13.960099750623442, + "grad_norm": 11.190245628356934, + "learning_rate": 6.046134663341646e-06, + "loss": 0.3157, + "step": 55980 + }, + { + "epoch": 13.962593516209477, + "grad_norm": 11.194109916687012, + "learning_rate": 6.0436408977556115e-06, + "loss": 0.333, + "step": 55990 + }, + { + "epoch": 13.965087281795512, + "grad_norm": 10.485845565795898, + "learning_rate": 6.041147132169577e-06, + "loss": 0.3388, + "step": 56000 + }, + { + "epoch": 13.967581047381547, + "grad_norm": 9.132210731506348, + "learning_rate": 6.038653366583541e-06, + "loss": 0.3132, + "step": 56010 + }, + { + "epoch": 13.970074812967582, + "grad_norm": 9.017107963562012, + "learning_rate": 6.0361596009975065e-06, + "loss": 0.3127, + "step": 56020 + }, + { + "epoch": 13.972568578553616, + "grad_norm": 8.930654525756836, + "learning_rate": 6.033665835411472e-06, + "loss": 0.3004, + "step": 56030 + }, + { + "epoch": 13.975062344139651, + "grad_norm": 14.12149429321289, + "learning_rate": 6.031172069825437e-06, + "loss": 0.413, + "step": 56040 + }, + { + "epoch": 13.977556109725686, + "grad_norm": 5.871008396148682, + "learning_rate": 6.028678304239402e-06, + "loss": 0.2982, + "step": 56050 + }, + { + "epoch": 13.980049875311721, + "grad_norm": 6.392510890960693, + "learning_rate": 6.026184538653367e-06, + "loss": 0.3514, + "step": 56060 + }, + { + "epoch": 13.982543640897756, + "grad_norm": 10.883293151855469, + "learning_rate": 6.023690773067332e-06, + "loss": 0.3422, + "step": 56070 + }, + { + "epoch": 13.98503740648379, + "grad_norm": 5.680593013763428, + "learning_rate": 6.0211970074812976e-06, + "loss": 0.3912, + "step": 56080 + }, + { + "epoch": 13.987531172069826, + "grad_norm": 11.184502601623535, + "learning_rate": 6.018703241895262e-06, + "loss": 0.3154, + "step": 56090 + }, + { + "epoch": 13.99002493765586, + "grad_norm": 8.648889541625977, + "learning_rate": 6.016209476309227e-06, + "loss": 0.3405, + "step": 56100 + }, + { + "epoch": 13.992518703241895, + "grad_norm": 9.194254875183105, + "learning_rate": 6.013715710723193e-06, + "loss": 0.3542, + "step": 56110 + }, + { + "epoch": 13.99501246882793, + "grad_norm": 7.495959281921387, + "learning_rate": 6.011221945137158e-06, + "loss": 0.3882, + "step": 56120 + }, + { + "epoch": 13.997506234413965, + "grad_norm": 10.184008598327637, + "learning_rate": 6.008728179551123e-06, + "loss": 0.332, + "step": 56130 + }, + { + "epoch": 14.0, + "grad_norm": 7.123004913330078, + "learning_rate": 6.006234413965088e-06, + "loss": 0.3622, + "step": 56140 + }, + { + "epoch": 14.0, + "eval_loss": 0.41418567299842834, + "eval_runtime": 60.1065, + "eval_samples_per_second": 16.687, + "eval_steps_per_second": 16.687, + "step": 56140 + }, + { + "epoch": 14.002493765586035, + "grad_norm": 7.25279426574707, + "learning_rate": 6.003740648379053e-06, + "loss": 0.3301, + "step": 56150 + }, + { + "epoch": 14.00498753117207, + "grad_norm": 5.500768661499023, + "learning_rate": 6.001246882793018e-06, + "loss": 0.3115, + "step": 56160 + }, + { + "epoch": 14.007481296758105, + "grad_norm": 6.643815994262695, + "learning_rate": 5.998753117206984e-06, + "loss": 0.2826, + "step": 56170 + }, + { + "epoch": 14.00997506234414, + "grad_norm": 10.706582069396973, + "learning_rate": 5.996259351620948e-06, + "loss": 0.3527, + "step": 56180 + }, + { + "epoch": 14.012468827930174, + "grad_norm": 15.78691291809082, + "learning_rate": 5.9937655860349134e-06, + "loss": 0.3879, + "step": 56190 + }, + { + "epoch": 14.01496259351621, + "grad_norm": 7.744325637817383, + "learning_rate": 5.991271820448879e-06, + "loss": 0.2975, + "step": 56200 + }, + { + "epoch": 14.017456359102244, + "grad_norm": 6.840442657470703, + "learning_rate": 5.988778054862844e-06, + "loss": 0.3212, + "step": 56210 + }, + { + "epoch": 14.019950124688279, + "grad_norm": 6.66912841796875, + "learning_rate": 5.986284289276808e-06, + "loss": 0.3231, + "step": 56220 + }, + { + "epoch": 14.022443890274314, + "grad_norm": 11.040656089782715, + "learning_rate": 5.983790523690774e-06, + "loss": 0.3638, + "step": 56230 + }, + { + "epoch": 14.024937655860349, + "grad_norm": 12.657490730285645, + "learning_rate": 5.981296758104739e-06, + "loss": 0.3794, + "step": 56240 + }, + { + "epoch": 14.027431421446384, + "grad_norm": 7.612829208374023, + "learning_rate": 5.9788029925187044e-06, + "loss": 0.379, + "step": 56250 + }, + { + "epoch": 14.029925187032418, + "grad_norm": 6.018895149230957, + "learning_rate": 5.976309226932668e-06, + "loss": 0.3329, + "step": 56260 + }, + { + "epoch": 14.032418952618453, + "grad_norm": 9.050536155700684, + "learning_rate": 5.973815461346633e-06, + "loss": 0.3569, + "step": 56270 + }, + { + "epoch": 14.034912718204488, + "grad_norm": 5.334194660186768, + "learning_rate": 5.971321695760599e-06, + "loss": 0.3331, + "step": 56280 + }, + { + "epoch": 14.037406483790523, + "grad_norm": 7.58427095413208, + "learning_rate": 5.968827930174565e-06, + "loss": 0.2786, + "step": 56290 + }, + { + "epoch": 14.039900249376558, + "grad_norm": 10.09249210357666, + "learning_rate": 5.9663341645885284e-06, + "loss": 0.3011, + "step": 56300 + }, + { + "epoch": 14.042394014962593, + "grad_norm": 12.218475341796875, + "learning_rate": 5.963840399002494e-06, + "loss": 0.3299, + "step": 56310 + }, + { + "epoch": 14.044887780548628, + "grad_norm": 6.630500316619873, + "learning_rate": 5.961346633416459e-06, + "loss": 0.3609, + "step": 56320 + }, + { + "epoch": 14.047381546134662, + "grad_norm": 5.265425205230713, + "learning_rate": 5.958852867830424e-06, + "loss": 0.2524, + "step": 56330 + }, + { + "epoch": 14.049875311720697, + "grad_norm": 10.030232429504395, + "learning_rate": 5.95635910224439e-06, + "loss": 0.3, + "step": 56340 + }, + { + "epoch": 14.052369077306734, + "grad_norm": 8.025049209594727, + "learning_rate": 5.953865336658354e-06, + "loss": 0.3647, + "step": 56350 + }, + { + "epoch": 14.054862842892769, + "grad_norm": 8.812301635742188, + "learning_rate": 5.9513715710723195e-06, + "loss": 0.4328, + "step": 56360 + }, + { + "epoch": 14.057356608478804, + "grad_norm": 8.302797317504883, + "learning_rate": 5.948877805486285e-06, + "loss": 0.3217, + "step": 56370 + }, + { + "epoch": 14.059850374064839, + "grad_norm": 7.739553928375244, + "learning_rate": 5.94638403990025e-06, + "loss": 0.352, + "step": 56380 + }, + { + "epoch": 14.062344139650873, + "grad_norm": 6.211906433105469, + "learning_rate": 5.9438902743142145e-06, + "loss": 0.3744, + "step": 56390 + }, + { + "epoch": 14.064837905236908, + "grad_norm": 9.784815788269043, + "learning_rate": 5.94139650872818e-06, + "loss": 0.3688, + "step": 56400 + }, + { + "epoch": 14.067331670822943, + "grad_norm": 7.506394386291504, + "learning_rate": 5.938902743142145e-06, + "loss": 0.3215, + "step": 56410 + }, + { + "epoch": 14.069825436408978, + "grad_norm": 9.98291301727295, + "learning_rate": 5.9364089775561105e-06, + "loss": 0.3251, + "step": 56420 + }, + { + "epoch": 14.072319201995013, + "grad_norm": NaN, + "learning_rate": 5.934164588528679e-06, + "loss": 0.35, + "step": 56430 + }, + { + "epoch": 14.074812967581048, + "grad_norm": 7.1763434410095215, + "learning_rate": 5.931670822942643e-06, + "loss": 0.3382, + "step": 56440 + }, + { + "epoch": 14.077306733167083, + "grad_norm": 11.202238082885742, + "learning_rate": 5.9291770573566085e-06, + "loss": 0.3186, + "step": 56450 + }, + { + "epoch": 14.079800498753118, + "grad_norm": 7.892131805419922, + "learning_rate": 5.926683291770574e-06, + "loss": 0.3167, + "step": 56460 + }, + { + "epoch": 14.082294264339152, + "grad_norm": 7.04252815246582, + "learning_rate": 5.924189526184539e-06, + "loss": 0.3132, + "step": 56470 + }, + { + "epoch": 14.084788029925187, + "grad_norm": 10.996591567993164, + "learning_rate": 5.921695760598504e-06, + "loss": 0.3149, + "step": 56480 + }, + { + "epoch": 14.087281795511222, + "grad_norm": 6.079100608825684, + "learning_rate": 5.919201995012469e-06, + "loss": 0.3049, + "step": 56490 + }, + { + "epoch": 14.089775561097257, + "grad_norm": 7.392295837402344, + "learning_rate": 5.916708229426434e-06, + "loss": 0.3318, + "step": 56500 + }, + { + "epoch": 14.092269326683292, + "grad_norm": 5.498250484466553, + "learning_rate": 5.9142144638403995e-06, + "loss": 0.2813, + "step": 56510 + }, + { + "epoch": 14.094763092269327, + "grad_norm": 8.986740112304688, + "learning_rate": 5.911720698254365e-06, + "loss": 0.285, + "step": 56520 + }, + { + "epoch": 14.097256857855362, + "grad_norm": 8.847763061523438, + "learning_rate": 5.909226932668329e-06, + "loss": 0.3253, + "step": 56530 + }, + { + "epoch": 14.099750623441397, + "grad_norm": 10.011739730834961, + "learning_rate": 5.906733167082295e-06, + "loss": 0.3322, + "step": 56540 + }, + { + "epoch": 14.102244389027431, + "grad_norm": 7.457201957702637, + "learning_rate": 5.90423940149626e-06, + "loss": 0.3628, + "step": 56550 + }, + { + "epoch": 14.104738154613466, + "grad_norm": 5.719525337219238, + "learning_rate": 5.901745635910225e-06, + "loss": 0.3435, + "step": 56560 + }, + { + "epoch": 14.107231920199501, + "grad_norm": 11.263422012329102, + "learning_rate": 5.89925187032419e-06, + "loss": 0.3875, + "step": 56570 + }, + { + "epoch": 14.109725685785536, + "grad_norm": 9.578998565673828, + "learning_rate": 5.896758104738155e-06, + "loss": 0.3096, + "step": 56580 + }, + { + "epoch": 14.11221945137157, + "grad_norm": 7.107484817504883, + "learning_rate": 5.89426433915212e-06, + "loss": 0.3913, + "step": 56590 + }, + { + "epoch": 14.114713216957606, + "grad_norm": 10.798450469970703, + "learning_rate": 5.891770573566086e-06, + "loss": 0.2936, + "step": 56600 + }, + { + "epoch": 14.11720698254364, + "grad_norm": 7.937857627868652, + "learning_rate": 5.88927680798005e-06, + "loss": 0.2655, + "step": 56610 + }, + { + "epoch": 14.119700748129675, + "grad_norm": 5.9058613777160645, + "learning_rate": 5.886783042394015e-06, + "loss": 0.3056, + "step": 56620 + }, + { + "epoch": 14.12219451371571, + "grad_norm": 5.481210231781006, + "learning_rate": 5.884289276807981e-06, + "loss": 0.378, + "step": 56630 + }, + { + "epoch": 14.124688279301745, + "grad_norm": 8.08363151550293, + "learning_rate": 5.881795511221946e-06, + "loss": 0.336, + "step": 56640 + }, + { + "epoch": 14.12718204488778, + "grad_norm": 10.191091537475586, + "learning_rate": 5.8793017456359105e-06, + "loss": 0.3545, + "step": 56650 + }, + { + "epoch": 14.129675810473815, + "grad_norm": 8.802706718444824, + "learning_rate": 5.876807980049876e-06, + "loss": 0.3284, + "step": 56660 + }, + { + "epoch": 14.13216957605985, + "grad_norm": 6.7587890625, + "learning_rate": 5.874314214463841e-06, + "loss": 0.3659, + "step": 56670 + }, + { + "epoch": 14.134663341645885, + "grad_norm": 6.1083526611328125, + "learning_rate": 5.871820448877806e-06, + "loss": 0.3007, + "step": 56680 + }, + { + "epoch": 14.13715710723192, + "grad_norm": 10.109125137329102, + "learning_rate": 5.869326683291771e-06, + "loss": 0.3703, + "step": 56690 + }, + { + "epoch": 14.139650872817954, + "grad_norm": 9.299257278442383, + "learning_rate": 5.866832917705736e-06, + "loss": 0.3109, + "step": 56700 + }, + { + "epoch": 14.14214463840399, + "grad_norm": 11.763932228088379, + "learning_rate": 5.8643391521197015e-06, + "loss": 0.4337, + "step": 56710 + }, + { + "epoch": 14.144638403990024, + "grad_norm": 9.687064170837402, + "learning_rate": 5.861845386533667e-06, + "loss": 0.3345, + "step": 56720 + }, + { + "epoch": 14.147132169576059, + "grad_norm": 7.107930660247803, + "learning_rate": 5.859351620947631e-06, + "loss": 0.3442, + "step": 56730 + }, + { + "epoch": 14.149625935162096, + "grad_norm": 8.584643363952637, + "learning_rate": 5.8568578553615966e-06, + "loss": 0.2821, + "step": 56740 + }, + { + "epoch": 14.15211970074813, + "grad_norm": 10.098069190979004, + "learning_rate": 5.854364089775562e-06, + "loss": 0.3944, + "step": 56750 + }, + { + "epoch": 14.154613466334165, + "grad_norm": 9.100006103515625, + "learning_rate": 5.851870324189527e-06, + "loss": 0.2938, + "step": 56760 + }, + { + "epoch": 14.1571072319202, + "grad_norm": 12.640045166015625, + "learning_rate": 5.8493765586034925e-06, + "loss": 0.3651, + "step": 56770 + }, + { + "epoch": 14.159600997506235, + "grad_norm": 8.430350303649902, + "learning_rate": 5.846882793017457e-06, + "loss": 0.2942, + "step": 56780 + }, + { + "epoch": 14.16209476309227, + "grad_norm": 7.6195855140686035, + "learning_rate": 5.844389027431422e-06, + "loss": 0.3382, + "step": 56790 + }, + { + "epoch": 14.164588528678305, + "grad_norm": 10.6289701461792, + "learning_rate": 5.8418952618453876e-06, + "loss": 0.3124, + "step": 56800 + }, + { + "epoch": 14.16708229426434, + "grad_norm": 10.420262336730957, + "learning_rate": 5.839401496259353e-06, + "loss": 0.3315, + "step": 56810 + }, + { + "epoch": 14.169576059850375, + "grad_norm": 8.555750846862793, + "learning_rate": 5.8369077306733165e-06, + "loss": 0.3292, + "step": 56820 + }, + { + "epoch": 14.17206982543641, + "grad_norm": 10.164884567260742, + "learning_rate": 5.834413965087282e-06, + "loss": 0.3674, + "step": 56830 + }, + { + "epoch": 14.174563591022444, + "grad_norm": 6.1933746337890625, + "learning_rate": 5.831920199501248e-06, + "loss": 0.3447, + "step": 56840 + }, + { + "epoch": 14.17705735660848, + "grad_norm": 6.741939544677734, + "learning_rate": 5.829426433915213e-06, + "loss": 0.3415, + "step": 56850 + }, + { + "epoch": 14.179551122194514, + "grad_norm": 9.930939674377441, + "learning_rate": 5.826932668329177e-06, + "loss": 0.3466, + "step": 56860 + }, + { + "epoch": 14.182044887780549, + "grad_norm": 6.0956854820251465, + "learning_rate": 5.824438902743142e-06, + "loss": 0.3609, + "step": 56870 + }, + { + "epoch": 14.184538653366584, + "grad_norm": 9.120026588439941, + "learning_rate": 5.8219451371571075e-06, + "loss": 0.3623, + "step": 56880 + }, + { + "epoch": 14.187032418952619, + "grad_norm": 5.484027862548828, + "learning_rate": 5.819451371571073e-06, + "loss": 0.334, + "step": 56890 + }, + { + "epoch": 14.189526184538654, + "grad_norm": 10.343612670898438, + "learning_rate": 5.816957605985037e-06, + "loss": 0.3805, + "step": 56900 + }, + { + "epoch": 14.192019950124688, + "grad_norm": 11.961114883422852, + "learning_rate": 5.814463840399003e-06, + "loss": 0.4391, + "step": 56910 + }, + { + "epoch": 14.194513715710723, + "grad_norm": 8.149081230163574, + "learning_rate": 5.811970074812968e-06, + "loss": 0.3533, + "step": 56920 + }, + { + "epoch": 14.197007481296758, + "grad_norm": 11.395288467407227, + "learning_rate": 5.809476309226933e-06, + "loss": 0.3897, + "step": 56930 + }, + { + "epoch": 14.199501246882793, + "grad_norm": 8.990917205810547, + "learning_rate": 5.806982543640898e-06, + "loss": 0.2918, + "step": 56940 + }, + { + "epoch": 14.201995012468828, + "grad_norm": 9.0519380569458, + "learning_rate": 5.804488778054863e-06, + "loss": 0.326, + "step": 56950 + }, + { + "epoch": 14.204488778054863, + "grad_norm": 5.42004919052124, + "learning_rate": 5.801995012468828e-06, + "loss": 0.3336, + "step": 56960 + }, + { + "epoch": 14.206982543640898, + "grad_norm": 24.50922966003418, + "learning_rate": 5.799501246882794e-06, + "loss": 0.3462, + "step": 56970 + }, + { + "epoch": 14.209476309226932, + "grad_norm": 7.2156877517700195, + "learning_rate": 5.797007481296758e-06, + "loss": 0.299, + "step": 56980 + }, + { + "epoch": 14.211970074812967, + "grad_norm": 8.618878364562988, + "learning_rate": 5.794513715710723e-06, + "loss": 0.3368, + "step": 56990 + }, + { + "epoch": 14.214463840399002, + "grad_norm": 8.678366661071777, + "learning_rate": 5.792019950124689e-06, + "loss": 0.3555, + "step": 57000 + }, + { + "epoch": 14.216957605985037, + "grad_norm": 10.28817081451416, + "learning_rate": 5.789526184538654e-06, + "loss": 0.3683, + "step": 57010 + }, + { + "epoch": 14.219451371571072, + "grad_norm": 11.527560234069824, + "learning_rate": 5.787032418952619e-06, + "loss": 0.3426, + "step": 57020 + }, + { + "epoch": 14.221945137157107, + "grad_norm": 5.7110724449157715, + "learning_rate": 5.784538653366584e-06, + "loss": 0.3402, + "step": 57030 + }, + { + "epoch": 14.224438902743142, + "grad_norm": 8.909784317016602, + "learning_rate": 5.782044887780549e-06, + "loss": 0.415, + "step": 57040 + }, + { + "epoch": 14.226932668329177, + "grad_norm": 9.159612655639648, + "learning_rate": 5.779551122194514e-06, + "loss": 0.3765, + "step": 57050 + }, + { + "epoch": 14.229426433915211, + "grad_norm": 7.4799346923828125, + "learning_rate": 5.77705735660848e-06, + "loss": 0.3718, + "step": 57060 + }, + { + "epoch": 14.231920199501246, + "grad_norm": 17.878170013427734, + "learning_rate": 5.774563591022444e-06, + "loss": 0.3199, + "step": 57070 + }, + { + "epoch": 14.234413965087281, + "grad_norm": 7.576568603515625, + "learning_rate": 5.7720698254364095e-06, + "loss": 0.3624, + "step": 57080 + }, + { + "epoch": 14.236907730673316, + "grad_norm": 8.394084930419922, + "learning_rate": 5.769576059850375e-06, + "loss": 0.3559, + "step": 57090 + }, + { + "epoch": 14.239401496259351, + "grad_norm": 9.45663070678711, + "learning_rate": 5.76708229426434e-06, + "loss": 0.3238, + "step": 57100 + }, + { + "epoch": 14.241895261845386, + "grad_norm": 7.425997734069824, + "learning_rate": 5.7645885286783046e-06, + "loss": 0.3416, + "step": 57110 + }, + { + "epoch": 14.24438902743142, + "grad_norm": 7.545619964599609, + "learning_rate": 5.76209476309227e-06, + "loss": 0.4245, + "step": 57120 + }, + { + "epoch": 14.246882793017456, + "grad_norm": 9.640861511230469, + "learning_rate": 5.759600997506235e-06, + "loss": 0.3163, + "step": 57130 + }, + { + "epoch": 14.24937655860349, + "grad_norm": 9.203802108764648, + "learning_rate": 5.7571072319202005e-06, + "loss": 0.4469, + "step": 57140 + }, + { + "epoch": 14.251870324189527, + "grad_norm": 7.76652193069458, + "learning_rate": 5.754613466334165e-06, + "loss": 0.3865, + "step": 57150 + }, + { + "epoch": 14.254364089775562, + "grad_norm": 7.3147358894348145, + "learning_rate": 5.75211970074813e-06, + "loss": 0.3029, + "step": 57160 + }, + { + "epoch": 14.256857855361597, + "grad_norm": 9.476174354553223, + "learning_rate": 5.7496259351620956e-06, + "loss": 0.3637, + "step": 57170 + }, + { + "epoch": 14.259351620947632, + "grad_norm": 6.6377434730529785, + "learning_rate": 5.747132169576061e-06, + "loss": 0.2983, + "step": 57180 + }, + { + "epoch": 14.261845386533667, + "grad_norm": 6.473023891448975, + "learning_rate": 5.744638403990025e-06, + "loss": 0.319, + "step": 57190 + }, + { + "epoch": 14.264339152119701, + "grad_norm": 5.787720680236816, + "learning_rate": 5.742144638403991e-06, + "loss": 0.325, + "step": 57200 + }, + { + "epoch": 14.266832917705736, + "grad_norm": 7.323090076446533, + "learning_rate": 5.739650872817956e-06, + "loss": 0.3233, + "step": 57210 + }, + { + "epoch": 14.269326683291771, + "grad_norm": 7.3568549156188965, + "learning_rate": 5.737157107231921e-06, + "loss": 0.3679, + "step": 57220 + }, + { + "epoch": 14.271820448877806, + "grad_norm": 8.636281967163086, + "learning_rate": 5.734663341645885e-06, + "loss": 0.2744, + "step": 57230 + }, + { + "epoch": 14.27431421446384, + "grad_norm": 6.717350482940674, + "learning_rate": 5.73216957605985e-06, + "loss": 0.293, + "step": 57240 + }, + { + "epoch": 14.276807980049876, + "grad_norm": 9.460376739501953, + "learning_rate": 5.729675810473816e-06, + "loss": 0.3502, + "step": 57250 + }, + { + "epoch": 14.27930174563591, + "grad_norm": 8.800605773925781, + "learning_rate": 5.727182044887782e-06, + "loss": 0.3078, + "step": 57260 + }, + { + "epoch": 14.281795511221945, + "grad_norm": 7.889537811279297, + "learning_rate": 5.724688279301747e-06, + "loss": 0.365, + "step": 57270 + }, + { + "epoch": 14.28428927680798, + "grad_norm": 10.018247604370117, + "learning_rate": 5.722194513715711e-06, + "loss": 0.2993, + "step": 57280 + }, + { + "epoch": 14.286783042394015, + "grad_norm": 5.913888931274414, + "learning_rate": 5.719700748129676e-06, + "loss": 0.2821, + "step": 57290 + }, + { + "epoch": 14.28927680798005, + "grad_norm": 6.76155424118042, + "learning_rate": 5.717206982543641e-06, + "loss": 0.3191, + "step": 57300 + }, + { + "epoch": 14.291770573566085, + "grad_norm": 6.1279215812683105, + "learning_rate": 5.714713216957607e-06, + "loss": 0.3588, + "step": 57310 + }, + { + "epoch": 14.29426433915212, + "grad_norm": 8.195113182067871, + "learning_rate": 5.712219451371571e-06, + "loss": 0.3062, + "step": 57320 + }, + { + "epoch": 14.296758104738155, + "grad_norm": 7.183683395385742, + "learning_rate": 5.709725685785536e-06, + "loss": 0.3615, + "step": 57330 + }, + { + "epoch": 14.29925187032419, + "grad_norm": 10.08060359954834, + "learning_rate": 5.707231920199502e-06, + "loss": 0.3116, + "step": 57340 + }, + { + "epoch": 14.301745635910224, + "grad_norm": 7.461446762084961, + "learning_rate": 5.704738154613467e-06, + "loss": 0.3036, + "step": 57350 + }, + { + "epoch": 14.30423940149626, + "grad_norm": 8.784198760986328, + "learning_rate": 5.702244389027431e-06, + "loss": 0.2936, + "step": 57360 + }, + { + "epoch": 14.306733167082294, + "grad_norm": 8.67398452758789, + "learning_rate": 5.699750623441397e-06, + "loss": 0.2893, + "step": 57370 + }, + { + "epoch": 14.309226932668329, + "grad_norm": 7.162827014923096, + "learning_rate": 5.697256857855362e-06, + "loss": 0.4105, + "step": 57380 + }, + { + "epoch": 14.311720698254364, + "grad_norm": 7.423861026763916, + "learning_rate": 5.694763092269327e-06, + "loss": 0.3044, + "step": 57390 + }, + { + "epoch": 14.314214463840399, + "grad_norm": 11.698079109191895, + "learning_rate": 5.692269326683292e-06, + "loss": 0.3955, + "step": 57400 + }, + { + "epoch": 14.316708229426434, + "grad_norm": 11.152626037597656, + "learning_rate": 5.689775561097257e-06, + "loss": 0.3488, + "step": 57410 + }, + { + "epoch": 14.319201995012468, + "grad_norm": 9.103079795837402, + "learning_rate": 5.687281795511222e-06, + "loss": 0.3548, + "step": 57420 + }, + { + "epoch": 14.321695760598503, + "grad_norm": 10.388227462768555, + "learning_rate": 5.684788029925188e-06, + "loss": 0.323, + "step": 57430 + }, + { + "epoch": 14.324189526184538, + "grad_norm": 9.910160064697266, + "learning_rate": 5.682294264339152e-06, + "loss": 0.3425, + "step": 57440 + }, + { + "epoch": 14.326683291770573, + "grad_norm": 8.148764610290527, + "learning_rate": 5.6798004987531175e-06, + "loss": 0.3473, + "step": 57450 + }, + { + "epoch": 14.329177057356608, + "grad_norm": 9.755147933959961, + "learning_rate": 5.677306733167083e-06, + "loss": 0.316, + "step": 57460 + }, + { + "epoch": 14.331670822942643, + "grad_norm": 5.160188674926758, + "learning_rate": 5.674812967581048e-06, + "loss": 0.3304, + "step": 57470 + }, + { + "epoch": 14.334164588528678, + "grad_norm": 9.043822288513184, + "learning_rate": 5.6723192019950125e-06, + "loss": 0.2585, + "step": 57480 + }, + { + "epoch": 14.336658354114713, + "grad_norm": 9.036372184753418, + "learning_rate": 5.669825436408978e-06, + "loss": 0.2661, + "step": 57490 + }, + { + "epoch": 14.339152119700747, + "grad_norm": 8.77037525177002, + "learning_rate": 5.667331670822943e-06, + "loss": 0.288, + "step": 57500 + }, + { + "epoch": 14.341645885286782, + "grad_norm": 5.4910712242126465, + "learning_rate": 5.6648379052369085e-06, + "loss": 0.3033, + "step": 57510 + }, + { + "epoch": 14.344139650872817, + "grad_norm": 11.051765441894531, + "learning_rate": 5.662344139650874e-06, + "loss": 0.3897, + "step": 57520 + }, + { + "epoch": 14.346633416458852, + "grad_norm": 6.824289798736572, + "learning_rate": 5.659850374064838e-06, + "loss": 0.3452, + "step": 57530 + }, + { + "epoch": 14.349127182044889, + "grad_norm": 7.920559406280518, + "learning_rate": 5.6573566084788036e-06, + "loss": 0.3175, + "step": 57540 + }, + { + "epoch": 14.351620947630924, + "grad_norm": 9.274563789367676, + "learning_rate": 5.654862842892769e-06, + "loss": 0.3346, + "step": 57550 + }, + { + "epoch": 14.354114713216958, + "grad_norm": 9.150640487670898, + "learning_rate": 5.652369077306734e-06, + "loss": 0.2985, + "step": 57560 + }, + { + "epoch": 14.356608478802993, + "grad_norm": 10.429580688476562, + "learning_rate": 5.649875311720699e-06, + "loss": 0.3712, + "step": 57570 + }, + { + "epoch": 14.359102244389028, + "grad_norm": 6.863627910614014, + "learning_rate": 5.647381546134664e-06, + "loss": 0.3518, + "step": 57580 + }, + { + "epoch": 14.361596009975063, + "grad_norm": 10.7223482131958, + "learning_rate": 5.644887780548629e-06, + "loss": 0.2925, + "step": 57590 + }, + { + "epoch": 14.364089775561098, + "grad_norm": 9.246960639953613, + "learning_rate": 5.6423940149625946e-06, + "loss": 0.3043, + "step": 57600 + }, + { + "epoch": 14.366583541147133, + "grad_norm": 8.545487403869629, + "learning_rate": 5.639900249376559e-06, + "loss": 0.3126, + "step": 57610 + }, + { + "epoch": 14.369077306733168, + "grad_norm": 7.441158771514893, + "learning_rate": 5.637406483790524e-06, + "loss": 0.3215, + "step": 57620 + }, + { + "epoch": 14.371571072319203, + "grad_norm": 8.528327941894531, + "learning_rate": 5.63491271820449e-06, + "loss": 0.352, + "step": 57630 + }, + { + "epoch": 14.374064837905237, + "grad_norm": 10.5371732711792, + "learning_rate": 5.632418952618455e-06, + "loss": 0.3339, + "step": 57640 + }, + { + "epoch": 14.376558603491272, + "grad_norm": 7.135348320007324, + "learning_rate": 5.6299251870324186e-06, + "loss": 0.4033, + "step": 57650 + }, + { + "epoch": 14.379052369077307, + "grad_norm": 8.768062591552734, + "learning_rate": 5.627431421446385e-06, + "loss": 0.3769, + "step": 57660 + }, + { + "epoch": 14.381546134663342, + "grad_norm": 9.260017395019531, + "learning_rate": 5.62493765586035e-06, + "loss": 0.3351, + "step": 57670 + }, + { + "epoch": 14.384039900249377, + "grad_norm": 10.839845657348633, + "learning_rate": 5.622443890274315e-06, + "loss": 0.317, + "step": 57680 + }, + { + "epoch": 14.386533665835412, + "grad_norm": 5.972354888916016, + "learning_rate": 5.619950124688279e-06, + "loss": 0.3857, + "step": 57690 + }, + { + "epoch": 14.389027431421447, + "grad_norm": 9.633734703063965, + "learning_rate": 5.617456359102244e-06, + "loss": 0.3434, + "step": 57700 + }, + { + "epoch": 14.391521197007481, + "grad_norm": 9.835312843322754, + "learning_rate": 5.61496259351621e-06, + "loss": 0.3031, + "step": 57710 + }, + { + "epoch": 14.394014962593516, + "grad_norm": 8.749693870544434, + "learning_rate": 5.612468827930176e-06, + "loss": 0.3306, + "step": 57720 + }, + { + "epoch": 14.396508728179551, + "grad_norm": 6.144033908843994, + "learning_rate": 5.609975062344139e-06, + "loss": 0.332, + "step": 57730 + }, + { + "epoch": 14.399002493765586, + "grad_norm": 6.014637470245361, + "learning_rate": 5.607481296758105e-06, + "loss": 0.3146, + "step": 57740 + }, + { + "epoch": 14.401496259351621, + "grad_norm": 7.245567321777344, + "learning_rate": 5.60498753117207e-06, + "loss": 0.3847, + "step": 57750 + }, + { + "epoch": 14.403990024937656, + "grad_norm": 4.784339904785156, + "learning_rate": 5.602493765586035e-06, + "loss": 0.3821, + "step": 57760 + }, + { + "epoch": 14.40648379052369, + "grad_norm": 8.494146347045898, + "learning_rate": 5.600000000000001e-06, + "loss": 0.3402, + "step": 57770 + }, + { + "epoch": 14.408977556109726, + "grad_norm": 7.598515510559082, + "learning_rate": 5.597506234413965e-06, + "loss": 0.325, + "step": 57780 + }, + { + "epoch": 14.41147132169576, + "grad_norm": 7.4690423011779785, + "learning_rate": 5.59501246882793e-06, + "loss": 0.3582, + "step": 57790 + }, + { + "epoch": 14.413965087281795, + "grad_norm": 8.942574501037598, + "learning_rate": 5.592518703241896e-06, + "loss": 0.3168, + "step": 57800 + }, + { + "epoch": 14.41645885286783, + "grad_norm": 8.781198501586914, + "learning_rate": 5.590024937655861e-06, + "loss": 0.2854, + "step": 57810 + }, + { + "epoch": 14.418952618453865, + "grad_norm": 5.599854469299316, + "learning_rate": 5.5875311720698254e-06, + "loss": 0.3373, + "step": 57820 + }, + { + "epoch": 14.4214463840399, + "grad_norm": 6.340407371520996, + "learning_rate": 5.585037406483791e-06, + "loss": 0.2954, + "step": 57830 + }, + { + "epoch": 14.423940149625935, + "grad_norm": 5.647294998168945, + "learning_rate": 5.582543640897756e-06, + "loss": 0.2989, + "step": 57840 + }, + { + "epoch": 14.42643391521197, + "grad_norm": 8.372790336608887, + "learning_rate": 5.580049875311721e-06, + "loss": 0.332, + "step": 57850 + }, + { + "epoch": 14.428927680798004, + "grad_norm": 8.006759643554688, + "learning_rate": 5.577556109725686e-06, + "loss": 0.362, + "step": 57860 + }, + { + "epoch": 14.43142144638404, + "grad_norm": 5.205109596252441, + "learning_rate": 5.575062344139651e-06, + "loss": 0.3863, + "step": 57870 + }, + { + "epoch": 14.433915211970074, + "grad_norm": 6.987797260284424, + "learning_rate": 5.5725685785536165e-06, + "loss": 0.3135, + "step": 57880 + }, + { + "epoch": 14.436408977556109, + "grad_norm": 9.735954284667969, + "learning_rate": 5.570074812967582e-06, + "loss": 0.2886, + "step": 57890 + }, + { + "epoch": 14.438902743142144, + "grad_norm": 10.121098518371582, + "learning_rate": 5.567581047381546e-06, + "loss": 0.3274, + "step": 57900 + }, + { + "epoch": 14.441396508728179, + "grad_norm": 8.132126808166504, + "learning_rate": 5.5650872817955115e-06, + "loss": 0.309, + "step": 57910 + }, + { + "epoch": 14.443890274314214, + "grad_norm": 9.95006275177002, + "learning_rate": 5.562593516209477e-06, + "loss": 0.3218, + "step": 57920 + }, + { + "epoch": 14.446384039900249, + "grad_norm": 9.619258880615234, + "learning_rate": 5.560099750623442e-06, + "loss": 0.3107, + "step": 57930 + }, + { + "epoch": 14.448877805486283, + "grad_norm": 8.964179039001465, + "learning_rate": 5.557605985037407e-06, + "loss": 0.4061, + "step": 57940 + }, + { + "epoch": 14.451371571072318, + "grad_norm": 15.57805347442627, + "learning_rate": 5.555112219451372e-06, + "loss": 0.3487, + "step": 57950 + }, + { + "epoch": 14.453865336658355, + "grad_norm": 10.295059204101562, + "learning_rate": 5.552618453865337e-06, + "loss": 0.2948, + "step": 57960 + }, + { + "epoch": 14.45635910224439, + "grad_norm": 6.323406219482422, + "learning_rate": 5.5501246882793026e-06, + "loss": 0.3311, + "step": 57970 + }, + { + "epoch": 14.458852867830425, + "grad_norm": 6.79067325592041, + "learning_rate": 5.547630922693267e-06, + "loss": 0.368, + "step": 57980 + }, + { + "epoch": 14.46134663341646, + "grad_norm": 7.148687839508057, + "learning_rate": 5.545137157107232e-06, + "loss": 0.312, + "step": 57990 + }, + { + "epoch": 14.463840399002494, + "grad_norm": 11.232911109924316, + "learning_rate": 5.542643391521198e-06, + "loss": 0.3485, + "step": 58000 + }, + { + "epoch": 14.46633416458853, + "grad_norm": 7.9350738525390625, + "learning_rate": 5.540149625935163e-06, + "loss": 0.2499, + "step": 58010 + }, + { + "epoch": 14.468827930174564, + "grad_norm": 7.820169448852539, + "learning_rate": 5.537655860349128e-06, + "loss": 0.3012, + "step": 58020 + }, + { + "epoch": 14.471321695760599, + "grad_norm": 6.683993816375732, + "learning_rate": 5.535162094763093e-06, + "loss": 0.3388, + "step": 58030 + }, + { + "epoch": 14.473815461346634, + "grad_norm": 8.00978946685791, + "learning_rate": 5.532668329177058e-06, + "loss": 0.2825, + "step": 58040 + }, + { + "epoch": 14.476309226932669, + "grad_norm": 7.9973015785217285, + "learning_rate": 5.530174563591023e-06, + "loss": 0.3373, + "step": 58050 + }, + { + "epoch": 14.478802992518704, + "grad_norm": 9.217852592468262, + "learning_rate": 5.527680798004989e-06, + "loss": 0.2765, + "step": 58060 + }, + { + "epoch": 14.481296758104738, + "grad_norm": 9.490992546081543, + "learning_rate": 5.525187032418953e-06, + "loss": 0.3424, + "step": 58070 + }, + { + "epoch": 14.483790523690773, + "grad_norm": 6.221439361572266, + "learning_rate": 5.522693266832918e-06, + "loss": 0.3139, + "step": 58080 + }, + { + "epoch": 14.486284289276808, + "grad_norm": 5.196888446807861, + "learning_rate": 5.520199501246884e-06, + "loss": 0.3762, + "step": 58090 + }, + { + "epoch": 14.488778054862843, + "grad_norm": 9.887178421020508, + "learning_rate": 5.517705735660849e-06, + "loss": 0.3322, + "step": 58100 + }, + { + "epoch": 14.491271820448878, + "grad_norm": 11.542625427246094, + "learning_rate": 5.515211970074813e-06, + "loss": 0.3423, + "step": 58110 + }, + { + "epoch": 14.493765586034913, + "grad_norm": 7.159249782562256, + "learning_rate": 5.512718204488778e-06, + "loss": 0.3121, + "step": 58120 + }, + { + "epoch": 14.496259351620948, + "grad_norm": 11.838788986206055, + "learning_rate": 5.510224438902744e-06, + "loss": 0.3516, + "step": 58130 + }, + { + "epoch": 14.498753117206983, + "grad_norm": 8.8449068069458, + "learning_rate": 5.5077306733167094e-06, + "loss": 0.3357, + "step": 58140 + }, + { + "epoch": 14.501246882793017, + "grad_norm": 10.590514183044434, + "learning_rate": 5.505236907730673e-06, + "loss": 0.2957, + "step": 58150 + }, + { + "epoch": 14.503740648379052, + "grad_norm": 9.069424629211426, + "learning_rate": 5.502743142144638e-06, + "loss": 0.2783, + "step": 58160 + }, + { + "epoch": 14.506234413965087, + "grad_norm": 7.590739727020264, + "learning_rate": 5.500249376558604e-06, + "loss": 0.2904, + "step": 58170 + }, + { + "epoch": 14.508728179551122, + "grad_norm": 6.409052848815918, + "learning_rate": 5.497755610972569e-06, + "loss": 0.3753, + "step": 58180 + }, + { + "epoch": 14.511221945137157, + "grad_norm": 7.4375481605529785, + "learning_rate": 5.4952618453865334e-06, + "loss": 0.2849, + "step": 58190 + }, + { + "epoch": 14.513715710723192, + "grad_norm": 8.864033699035645, + "learning_rate": 5.492768079800499e-06, + "loss": 0.32, + "step": 58200 + }, + { + "epoch": 14.516209476309227, + "grad_norm": 6.455445766448975, + "learning_rate": 5.490274314214464e-06, + "loss": 0.3475, + "step": 58210 + }, + { + "epoch": 14.518703241895262, + "grad_norm": 6.384631633758545, + "learning_rate": 5.487780548628429e-06, + "loss": 0.2812, + "step": 58220 + }, + { + "epoch": 14.521197007481296, + "grad_norm": 9.22549057006836, + "learning_rate": 5.485286783042394e-06, + "loss": 0.3304, + "step": 58230 + }, + { + "epoch": 14.523690773067331, + "grad_norm": 7.639939308166504, + "learning_rate": 5.482793017456359e-06, + "loss": 0.3435, + "step": 58240 + }, + { + "epoch": 14.526184538653366, + "grad_norm": 5.905279636383057, + "learning_rate": 5.4802992518703244e-06, + "loss": 0.2744, + "step": 58250 + }, + { + "epoch": 14.528678304239401, + "grad_norm": 8.444292068481445, + "learning_rate": 5.47780548628429e-06, + "loss": 0.3367, + "step": 58260 + }, + { + "epoch": 14.531172069825436, + "grad_norm": 9.172270774841309, + "learning_rate": 5.475311720698255e-06, + "loss": 0.3669, + "step": 58270 + }, + { + "epoch": 14.53366583541147, + "grad_norm": 15.167724609375, + "learning_rate": 5.4728179551122195e-06, + "loss": 0.4021, + "step": 58280 + }, + { + "epoch": 14.536159600997506, + "grad_norm": 8.261863708496094, + "learning_rate": 5.470324189526185e-06, + "loss": 0.2871, + "step": 58290 + }, + { + "epoch": 14.53865336658354, + "grad_norm": 10.16102123260498, + "learning_rate": 5.46783042394015e-06, + "loss": 0.3401, + "step": 58300 + }, + { + "epoch": 14.541147132169575, + "grad_norm": 8.043190956115723, + "learning_rate": 5.4653366583541155e-06, + "loss": 0.3537, + "step": 58310 + }, + { + "epoch": 14.54364089775561, + "grad_norm": 21.67076873779297, + "learning_rate": 5.46284289276808e-06, + "loss": 0.3269, + "step": 58320 + }, + { + "epoch": 14.546134663341645, + "grad_norm": 6.5939459800720215, + "learning_rate": 5.460349127182045e-06, + "loss": 0.3313, + "step": 58330 + }, + { + "epoch": 14.548628428927682, + "grad_norm": 6.585400581359863, + "learning_rate": 5.4578553615960105e-06, + "loss": 0.3449, + "step": 58340 + }, + { + "epoch": 14.551122194513717, + "grad_norm": 9.368444442749023, + "learning_rate": 5.455361596009976e-06, + "loss": 0.3536, + "step": 58350 + }, + { + "epoch": 14.553615960099751, + "grad_norm": 8.152172088623047, + "learning_rate": 5.45286783042394e-06, + "loss": 0.3483, + "step": 58360 + }, + { + "epoch": 14.556109725685786, + "grad_norm": 9.879807472229004, + "learning_rate": 5.450374064837906e-06, + "loss": 0.3036, + "step": 58370 + }, + { + "epoch": 14.558603491271821, + "grad_norm": 16.792516708374023, + "learning_rate": 5.447880299251871e-06, + "loss": 0.3782, + "step": 58380 + }, + { + "epoch": 14.561097256857856, + "grad_norm": 6.8559112548828125, + "learning_rate": 5.445386533665836e-06, + "loss": 0.3641, + "step": 58390 + }, + { + "epoch": 14.563591022443891, + "grad_norm": 12.230280876159668, + "learning_rate": 5.442892768079801e-06, + "loss": 0.3335, + "step": 58400 + }, + { + "epoch": 14.566084788029926, + "grad_norm": 5.788147926330566, + "learning_rate": 5.440399002493766e-06, + "loss": 0.3376, + "step": 58410 + }, + { + "epoch": 14.56857855361596, + "grad_norm": 9.005399703979492, + "learning_rate": 5.437905236907731e-06, + "loss": 0.3828, + "step": 58420 + }, + { + "epoch": 14.571072319201996, + "grad_norm": 6.153980255126953, + "learning_rate": 5.435411471321697e-06, + "loss": 0.3214, + "step": 58430 + }, + { + "epoch": 14.57356608478803, + "grad_norm": 7.769611835479736, + "learning_rate": 5.432917705735661e-06, + "loss": 0.2673, + "step": 58440 + }, + { + "epoch": 14.576059850374065, + "grad_norm": 7.903584957122803, + "learning_rate": 5.430423940149626e-06, + "loss": 0.3313, + "step": 58450 + }, + { + "epoch": 14.5785536159601, + "grad_norm": 11.687101364135742, + "learning_rate": 5.427930174563592e-06, + "loss": 0.3828, + "step": 58460 + }, + { + "epoch": 14.581047381546135, + "grad_norm": 9.845630645751953, + "learning_rate": 5.425436408977557e-06, + "loss": 0.3014, + "step": 58470 + }, + { + "epoch": 14.58354114713217, + "grad_norm": 7.4955244064331055, + "learning_rate": 5.4229426433915215e-06, + "loss": 0.3243, + "step": 58480 + }, + { + "epoch": 14.586034912718205, + "grad_norm": 9.153183937072754, + "learning_rate": 5.420448877805487e-06, + "loss": 0.3849, + "step": 58490 + }, + { + "epoch": 14.58852867830424, + "grad_norm": 8.650388717651367, + "learning_rate": 5.417955112219452e-06, + "loss": 0.3057, + "step": 58500 + }, + { + "epoch": 14.591022443890274, + "grad_norm": 5.724157333374023, + "learning_rate": 5.415461346633417e-06, + "loss": 0.2931, + "step": 58510 + }, + { + "epoch": 14.59351620947631, + "grad_norm": 8.662379264831543, + "learning_rate": 5.412967581047383e-06, + "loss": 0.324, + "step": 58520 + }, + { + "epoch": 14.596009975062344, + "grad_norm": 6.165246963500977, + "learning_rate": 5.410473815461346e-06, + "loss": 0.305, + "step": 58530 + }, + { + "epoch": 14.598503740648379, + "grad_norm": 10.333334922790527, + "learning_rate": 5.4079800498753125e-06, + "loss": 0.412, + "step": 58540 + }, + { + "epoch": 14.600997506234414, + "grad_norm": 12.94969654083252, + "learning_rate": 5.405486284289278e-06, + "loss": 0.3273, + "step": 58550 + }, + { + "epoch": 14.603491271820449, + "grad_norm": 8.342123985290527, + "learning_rate": 5.402992518703243e-06, + "loss": 0.3934, + "step": 58560 + }, + { + "epoch": 14.605985037406484, + "grad_norm": 6.839729309082031, + "learning_rate": 5.400498753117207e-06, + "loss": 0.303, + "step": 58570 + }, + { + "epoch": 14.608478802992519, + "grad_norm": 9.169561386108398, + "learning_rate": 5.398004987531172e-06, + "loss": 0.2673, + "step": 58580 + }, + { + "epoch": 14.610972568578553, + "grad_norm": 7.035985946655273, + "learning_rate": 5.395511221945137e-06, + "loss": 0.2992, + "step": 58590 + }, + { + "epoch": 14.613466334164588, + "grad_norm": 7.293179988861084, + "learning_rate": 5.3930174563591035e-06, + "loss": 0.3203, + "step": 58600 + }, + { + "epoch": 14.615960099750623, + "grad_norm": 8.904796600341797, + "learning_rate": 5.390523690773067e-06, + "loss": 0.3217, + "step": 58610 + }, + { + "epoch": 14.618453865336658, + "grad_norm": 9.282125473022461, + "learning_rate": 5.3880299251870324e-06, + "loss": 0.3283, + "step": 58620 + }, + { + "epoch": 14.620947630922693, + "grad_norm": 7.752005577087402, + "learning_rate": 5.385536159600998e-06, + "loss": 0.3145, + "step": 58630 + }, + { + "epoch": 14.623441396508728, + "grad_norm": 5.774524688720703, + "learning_rate": 5.383042394014963e-06, + "loss": 0.2747, + "step": 58640 + }, + { + "epoch": 14.625935162094763, + "grad_norm": 8.292061805725098, + "learning_rate": 5.3805486284289275e-06, + "loss": 0.2488, + "step": 58650 + }, + { + "epoch": 14.628428927680797, + "grad_norm": 9.226009368896484, + "learning_rate": 5.378054862842893e-06, + "loss": 0.2889, + "step": 58660 + }, + { + "epoch": 14.630922693266832, + "grad_norm": 8.59598159790039, + "learning_rate": 5.375561097256858e-06, + "loss": 0.3311, + "step": 58670 + }, + { + "epoch": 14.633416458852867, + "grad_norm": 11.77875804901123, + "learning_rate": 5.3730673316708234e-06, + "loss": 0.289, + "step": 58680 + }, + { + "epoch": 14.635910224438902, + "grad_norm": 8.944512367248535, + "learning_rate": 5.370573566084788e-06, + "loss": 0.3456, + "step": 58690 + }, + { + "epoch": 14.638403990024937, + "grad_norm": 8.320801734924316, + "learning_rate": 5.368079800498753e-06, + "loss": 0.3284, + "step": 58700 + }, + { + "epoch": 14.640897755610972, + "grad_norm": 20.068309783935547, + "learning_rate": 5.3655860349127185e-06, + "loss": 0.3567, + "step": 58710 + }, + { + "epoch": 14.643391521197007, + "grad_norm": 13.9030122756958, + "learning_rate": 5.363092269326684e-06, + "loss": 0.3672, + "step": 58720 + }, + { + "epoch": 14.645885286783042, + "grad_norm": 9.718421936035156, + "learning_rate": 5.360598503740648e-06, + "loss": 0.3213, + "step": 58730 + }, + { + "epoch": 14.648379052369076, + "grad_norm": 7.2954301834106445, + "learning_rate": 5.358104738154614e-06, + "loss": 0.3902, + "step": 58740 + }, + { + "epoch": 14.650872817955111, + "grad_norm": 7.279516696929932, + "learning_rate": 5.355610972568579e-06, + "loss": 0.2793, + "step": 58750 + }, + { + "epoch": 14.653366583541148, + "grad_norm": 10.923696517944336, + "learning_rate": 5.353117206982544e-06, + "loss": 0.3365, + "step": 58760 + }, + { + "epoch": 14.655860349127183, + "grad_norm": 9.616853713989258, + "learning_rate": 5.3506234413965095e-06, + "loss": 0.3174, + "step": 58770 + }, + { + "epoch": 14.658354114713218, + "grad_norm": 9.328225135803223, + "learning_rate": 5.348129675810474e-06, + "loss": 0.3395, + "step": 58780 + }, + { + "epoch": 14.660847880299253, + "grad_norm": 8.011582374572754, + "learning_rate": 5.345635910224439e-06, + "loss": 0.3506, + "step": 58790 + }, + { + "epoch": 14.663341645885287, + "grad_norm": 9.812850952148438, + "learning_rate": 5.343142144638405e-06, + "loss": 0.2907, + "step": 58800 + }, + { + "epoch": 14.665835411471322, + "grad_norm": 8.969762802124023, + "learning_rate": 5.34064837905237e-06, + "loss": 0.3821, + "step": 58810 + }, + { + "epoch": 14.668329177057357, + "grad_norm": 8.796773910522461, + "learning_rate": 5.338154613466334e-06, + "loss": 0.3282, + "step": 58820 + }, + { + "epoch": 14.670822942643392, + "grad_norm": 9.331019401550293, + "learning_rate": 5.3356608478803e-06, + "loss": 0.3352, + "step": 58830 + }, + { + "epoch": 14.673316708229427, + "grad_norm": 14.473604202270508, + "learning_rate": 5.333167082294265e-06, + "loss": 0.2999, + "step": 58840 + }, + { + "epoch": 14.675810473815462, + "grad_norm": 7.026119232177734, + "learning_rate": 5.33067331670823e-06, + "loss": 0.3559, + "step": 58850 + }, + { + "epoch": 14.678304239401497, + "grad_norm": 7.760469436645508, + "learning_rate": 5.328179551122195e-06, + "loss": 0.3282, + "step": 58860 + }, + { + "epoch": 14.680798004987532, + "grad_norm": 10.54293155670166, + "learning_rate": 5.32568578553616e-06, + "loss": 0.4874, + "step": 58870 + }, + { + "epoch": 14.683291770573566, + "grad_norm": 10.247196197509766, + "learning_rate": 5.323192019950125e-06, + "loss": 0.3434, + "step": 58880 + }, + { + "epoch": 14.685785536159601, + "grad_norm": 12.179352760314941, + "learning_rate": 5.320698254364091e-06, + "loss": 0.3833, + "step": 58890 + }, + { + "epoch": 14.688279301745636, + "grad_norm": 8.831155776977539, + "learning_rate": 5.318204488778055e-06, + "loss": 0.3422, + "step": 58900 + }, + { + "epoch": 14.690773067331671, + "grad_norm": 5.796462535858154, + "learning_rate": 5.3157107231920205e-06, + "loss": 0.3022, + "step": 58910 + }, + { + "epoch": 14.693266832917706, + "grad_norm": 9.8500337600708, + "learning_rate": 5.313216957605986e-06, + "loss": 0.3686, + "step": 58920 + }, + { + "epoch": 14.69576059850374, + "grad_norm": 9.79806137084961, + "learning_rate": 5.310723192019951e-06, + "loss": 0.3164, + "step": 58930 + }, + { + "epoch": 14.698254364089776, + "grad_norm": 7.471320629119873, + "learning_rate": 5.308229426433915e-06, + "loss": 0.3621, + "step": 58940 + }, + { + "epoch": 14.70074812967581, + "grad_norm": 15.296502113342285, + "learning_rate": 5.305735660847881e-06, + "loss": 0.3376, + "step": 58950 + }, + { + "epoch": 14.703241895261845, + "grad_norm": 8.871153831481934, + "learning_rate": 5.303241895261846e-06, + "loss": 0.3574, + "step": 58960 + }, + { + "epoch": 14.70573566084788, + "grad_norm": 9.778191566467285, + "learning_rate": 5.3007481296758115e-06, + "loss": 0.3363, + "step": 58970 + }, + { + "epoch": 14.708229426433915, + "grad_norm": 8.317192077636719, + "learning_rate": 5.298254364089775e-06, + "loss": 0.2979, + "step": 58980 + }, + { + "epoch": 14.71072319201995, + "grad_norm": 9.88344669342041, + "learning_rate": 5.2957605985037404e-06, + "loss": 0.3778, + "step": 58990 + }, + { + "epoch": 14.713216957605985, + "grad_norm": 8.39840030670166, + "learning_rate": 5.293266832917706e-06, + "loss": 0.3757, + "step": 59000 + }, + { + "epoch": 14.71571072319202, + "grad_norm": 11.54598331451416, + "learning_rate": 5.290773067331672e-06, + "loss": 0.3286, + "step": 59010 + }, + { + "epoch": 14.718204488778055, + "grad_norm": 6.494427680969238, + "learning_rate": 5.28852867830424e-06, + "loss": 0.3799, + "step": 59020 + }, + { + "epoch": 14.72069825436409, + "grad_norm": 9.357276916503906, + "learning_rate": 5.2860349127182055e-06, + "loss": 0.2867, + "step": 59030 + }, + { + "epoch": 14.723192019950124, + "grad_norm": 8.344010353088379, + "learning_rate": 5.28354114713217e-06, + "loss": 0.3489, + "step": 59040 + }, + { + "epoch": 14.72568578553616, + "grad_norm": 6.008798599243164, + "learning_rate": 5.281047381546135e-06, + "loss": 0.3094, + "step": 59050 + }, + { + "epoch": 14.728179551122194, + "grad_norm": 10.23482894897461, + "learning_rate": 5.2785536159601006e-06, + "loss": 0.3203, + "step": 59060 + }, + { + "epoch": 14.730673316708229, + "grad_norm": 6.847126007080078, + "learning_rate": 5.276059850374066e-06, + "loss": 0.25, + "step": 59070 + }, + { + "epoch": 14.733167082294264, + "grad_norm": 9.797203063964844, + "learning_rate": 5.2735660847880295e-06, + "loss": 0.3885, + "step": 59080 + }, + { + "epoch": 14.735660847880299, + "grad_norm": 8.682002067565918, + "learning_rate": 5.271072319201996e-06, + "loss": 0.3408, + "step": 59090 + }, + { + "epoch": 14.738154613466333, + "grad_norm": 7.708632946014404, + "learning_rate": 5.268578553615961e-06, + "loss": 0.3084, + "step": 59100 + }, + { + "epoch": 14.740648379052368, + "grad_norm": 9.106949806213379, + "learning_rate": 5.266084788029926e-06, + "loss": 0.2894, + "step": 59110 + }, + { + "epoch": 14.743142144638403, + "grad_norm": 8.195284843444824, + "learning_rate": 5.26359102244389e-06, + "loss": 0.3228, + "step": 59120 + }, + { + "epoch": 14.745635910224438, + "grad_norm": 10.221166610717773, + "learning_rate": 5.261097256857855e-06, + "loss": 0.3822, + "step": 59130 + }, + { + "epoch": 14.748129675810475, + "grad_norm": 8.522336959838867, + "learning_rate": 5.2586034912718205e-06, + "loss": 0.3341, + "step": 59140 + }, + { + "epoch": 14.75062344139651, + "grad_norm": 13.70825481414795, + "learning_rate": 5.256109725685787e-06, + "loss": 0.3543, + "step": 59150 + }, + { + "epoch": 14.753117206982544, + "grad_norm": 7.4037184715271, + "learning_rate": 5.25361596009975e-06, + "loss": 0.3357, + "step": 59160 + }, + { + "epoch": 14.75561097256858, + "grad_norm": 10.474981307983398, + "learning_rate": 5.2511221945137156e-06, + "loss": 0.3487, + "step": 59170 + }, + { + "epoch": 14.758104738154614, + "grad_norm": 5.967161655426025, + "learning_rate": 5.248628428927681e-06, + "loss": 0.3564, + "step": 59180 + }, + { + "epoch": 14.760598503740649, + "grad_norm": 7.465957164764404, + "learning_rate": 5.246134663341646e-06, + "loss": 0.3241, + "step": 59190 + }, + { + "epoch": 14.763092269326684, + "grad_norm": 7.333925247192383, + "learning_rate": 5.2436408977556115e-06, + "loss": 0.2976, + "step": 59200 + }, + { + "epoch": 14.765586034912719, + "grad_norm": 8.941734313964844, + "learning_rate": 5.241147132169576e-06, + "loss": 0.4209, + "step": 59210 + }, + { + "epoch": 14.768079800498754, + "grad_norm": 7.812376976013184, + "learning_rate": 5.238653366583541e-06, + "loss": 0.2961, + "step": 59220 + }, + { + "epoch": 14.770573566084789, + "grad_norm": 9.06418228149414, + "learning_rate": 5.236159600997507e-06, + "loss": 0.3249, + "step": 59230 + }, + { + "epoch": 14.773067331670823, + "grad_norm": 8.520550727844238, + "learning_rate": 5.233665835411472e-06, + "loss": 0.3469, + "step": 59240 + }, + { + "epoch": 14.775561097256858, + "grad_norm": 5.776762962341309, + "learning_rate": 5.231172069825436e-06, + "loss": 0.3048, + "step": 59250 + }, + { + "epoch": 14.778054862842893, + "grad_norm": 13.731014251708984, + "learning_rate": 5.228678304239402e-06, + "loss": 0.3533, + "step": 59260 + }, + { + "epoch": 14.780548628428928, + "grad_norm": 10.045135498046875, + "learning_rate": 5.226184538653367e-06, + "loss": 0.3062, + "step": 59270 + }, + { + "epoch": 14.783042394014963, + "grad_norm": 9.857339859008789, + "learning_rate": 5.223690773067332e-06, + "loss": 0.2885, + "step": 59280 + }, + { + "epoch": 14.785536159600998, + "grad_norm": 7.196331977844238, + "learning_rate": 5.221197007481297e-06, + "loss": 0.3024, + "step": 59290 + }, + { + "epoch": 14.788029925187033, + "grad_norm": 8.33714485168457, + "learning_rate": 5.218703241895262e-06, + "loss": 0.3089, + "step": 59300 + }, + { + "epoch": 14.790523690773068, + "grad_norm": 7.40169095993042, + "learning_rate": 5.216209476309227e-06, + "loss": 0.3327, + "step": 59310 + }, + { + "epoch": 14.793017456359102, + "grad_norm": 6.180658340454102, + "learning_rate": 5.213715710723193e-06, + "loss": 0.3209, + "step": 59320 + }, + { + "epoch": 14.795511221945137, + "grad_norm": 8.041952133178711, + "learning_rate": 5.211221945137157e-06, + "loss": 0.2623, + "step": 59330 + }, + { + "epoch": 14.798004987531172, + "grad_norm": 7.726354598999023, + "learning_rate": 5.2087281795511225e-06, + "loss": 0.3279, + "step": 59340 + }, + { + "epoch": 14.800498753117207, + "grad_norm": 7.853705406188965, + "learning_rate": 5.206234413965088e-06, + "loss": 0.3187, + "step": 59350 + }, + { + "epoch": 14.802992518703242, + "grad_norm": 6.197792053222656, + "learning_rate": 5.203740648379053e-06, + "loss": 0.4057, + "step": 59360 + }, + { + "epoch": 14.805486284289277, + "grad_norm": 8.7774658203125, + "learning_rate": 5.2012468827930175e-06, + "loss": 0.3922, + "step": 59370 + }, + { + "epoch": 14.807980049875312, + "grad_norm": 7.630876541137695, + "learning_rate": 5.198753117206983e-06, + "loss": 0.3802, + "step": 59380 + }, + { + "epoch": 14.810473815461346, + "grad_norm": 9.605398178100586, + "learning_rate": 5.196259351620948e-06, + "loss": 0.3564, + "step": 59390 + }, + { + "epoch": 14.812967581047381, + "grad_norm": 11.865791320800781, + "learning_rate": 5.1937655860349135e-06, + "loss": 0.4148, + "step": 59400 + }, + { + "epoch": 14.815461346633416, + "grad_norm": 12.097999572753906, + "learning_rate": 5.191271820448879e-06, + "loss": 0.3287, + "step": 59410 + }, + { + "epoch": 14.817955112219451, + "grad_norm": 7.8101277351379395, + "learning_rate": 5.188778054862843e-06, + "loss": 0.3158, + "step": 59420 + }, + { + "epoch": 14.820448877805486, + "grad_norm": 6.201448917388916, + "learning_rate": 5.1862842892768085e-06, + "loss": 0.3469, + "step": 59430 + }, + { + "epoch": 14.82294264339152, + "grad_norm": 11.883674621582031, + "learning_rate": 5.183790523690774e-06, + "loss": 0.3443, + "step": 59440 + }, + { + "epoch": 14.825436408977556, + "grad_norm": 10.622366905212402, + "learning_rate": 5.181296758104739e-06, + "loss": 0.3804, + "step": 59450 + }, + { + "epoch": 14.82793017456359, + "grad_norm": 10.955135345458984, + "learning_rate": 5.178802992518704e-06, + "loss": 0.3149, + "step": 59460 + }, + { + "epoch": 14.830423940149625, + "grad_norm": 6.3978495597839355, + "learning_rate": 5.176309226932669e-06, + "loss": 0.3854, + "step": 59470 + }, + { + "epoch": 14.83291770573566, + "grad_norm": 5.390768527984619, + "learning_rate": 5.173815461346634e-06, + "loss": 0.3576, + "step": 59480 + }, + { + "epoch": 14.835411471321695, + "grad_norm": 6.621118068695068, + "learning_rate": 5.1713216957605996e-06, + "loss": 0.328, + "step": 59490 + }, + { + "epoch": 14.83790523690773, + "grad_norm": 7.241116523742676, + "learning_rate": 5.168827930174564e-06, + "loss": 0.3244, + "step": 59500 + }, + { + "epoch": 14.840399002493765, + "grad_norm": 9.426288604736328, + "learning_rate": 5.166334164588529e-06, + "loss": 0.358, + "step": 59510 + }, + { + "epoch": 14.8428927680798, + "grad_norm": 7.703991413116455, + "learning_rate": 5.163840399002495e-06, + "loss": 0.304, + "step": 59520 + }, + { + "epoch": 14.845386533665835, + "grad_norm": 5.415951251983643, + "learning_rate": 5.16134663341646e-06, + "loss": 0.2659, + "step": 59530 + }, + { + "epoch": 14.84788029925187, + "grad_norm": 10.298295021057129, + "learning_rate": 5.1588528678304236e-06, + "loss": 0.3267, + "step": 59540 + }, + { + "epoch": 14.850374064837904, + "grad_norm": 8.1857328414917, + "learning_rate": 5.156359102244389e-06, + "loss": 0.4304, + "step": 59550 + }, + { + "epoch": 14.85286783042394, + "grad_norm": 8.54920482635498, + "learning_rate": 5.153865336658355e-06, + "loss": 0.4247, + "step": 59560 + }, + { + "epoch": 14.855361596009976, + "grad_norm": 6.647639751434326, + "learning_rate": 5.15137157107232e-06, + "loss": 0.3184, + "step": 59570 + }, + { + "epoch": 14.85785536159601, + "grad_norm": 6.457409858703613, + "learning_rate": 5.148877805486284e-06, + "loss": 0.3595, + "step": 59580 + }, + { + "epoch": 14.860349127182046, + "grad_norm": 9.045681953430176, + "learning_rate": 5.146384039900249e-06, + "loss": 0.3741, + "step": 59590 + }, + { + "epoch": 14.86284289276808, + "grad_norm": 8.095196723937988, + "learning_rate": 5.1438902743142146e-06, + "loss": 0.3488, + "step": 59600 + }, + { + "epoch": 14.865336658354115, + "grad_norm": 12.992335319519043, + "learning_rate": 5.14139650872818e-06, + "loss": 0.3655, + "step": 59610 + }, + { + "epoch": 14.86783042394015, + "grad_norm": 8.386945724487305, + "learning_rate": 5.138902743142144e-06, + "loss": 0.3483, + "step": 59620 + }, + { + "epoch": 14.870324189526185, + "grad_norm": 7.935437202453613, + "learning_rate": 5.13640897755611e-06, + "loss": 0.3158, + "step": 59630 + }, + { + "epoch": 14.87281795511222, + "grad_norm": 6.484121322631836, + "learning_rate": 5.133915211970075e-06, + "loss": 0.3386, + "step": 59640 + }, + { + "epoch": 14.875311720698255, + "grad_norm": 8.329977035522461, + "learning_rate": 5.13142144638404e-06, + "loss": 0.2833, + "step": 59650 + }, + { + "epoch": 14.87780548628429, + "grad_norm": 6.874395370483398, + "learning_rate": 5.128927680798006e-06, + "loss": 0.3502, + "step": 59660 + }, + { + "epoch": 14.880299251870325, + "grad_norm": 7.59706974029541, + "learning_rate": 5.12643391521197e-06, + "loss": 0.3216, + "step": 59670 + }, + { + "epoch": 14.88279301745636, + "grad_norm": 7.426278591156006, + "learning_rate": 5.123940149625935e-06, + "loss": 0.297, + "step": 59680 + }, + { + "epoch": 14.885286783042394, + "grad_norm": 10.72069263458252, + "learning_rate": 5.121446384039901e-06, + "loss": 0.3663, + "step": 59690 + }, + { + "epoch": 14.88778054862843, + "grad_norm": 8.7599458694458, + "learning_rate": 5.118952618453866e-06, + "loss": 0.308, + "step": 59700 + }, + { + "epoch": 14.890274314214464, + "grad_norm": 7.034759521484375, + "learning_rate": 5.1164588528678304e-06, + "loss": 0.2957, + "step": 59710 + }, + { + "epoch": 14.892768079800499, + "grad_norm": 8.246294975280762, + "learning_rate": 5.113965087281796e-06, + "loss": 0.3802, + "step": 59720 + }, + { + "epoch": 14.895261845386534, + "grad_norm": 7.399212837219238, + "learning_rate": 5.111471321695761e-06, + "loss": 0.3906, + "step": 59730 + }, + { + "epoch": 14.897755610972569, + "grad_norm": 6.495842933654785, + "learning_rate": 5.108977556109726e-06, + "loss": 0.3517, + "step": 59740 + }, + { + "epoch": 14.900249376558603, + "grad_norm": 8.81147289276123, + "learning_rate": 5.106483790523691e-06, + "loss": 0.307, + "step": 59750 + }, + { + "epoch": 14.902743142144638, + "grad_norm": 8.447113990783691, + "learning_rate": 5.103990024937656e-06, + "loss": 0.3051, + "step": 59760 + }, + { + "epoch": 14.905236907730673, + "grad_norm": 7.5276288986206055, + "learning_rate": 5.1014962593516215e-06, + "loss": 0.3735, + "step": 59770 + }, + { + "epoch": 14.907730673316708, + "grad_norm": 5.61832857131958, + "learning_rate": 5.099002493765587e-06, + "loss": 0.2694, + "step": 59780 + }, + { + "epoch": 14.910224438902743, + "grad_norm": 7.429715633392334, + "learning_rate": 5.096508728179551e-06, + "loss": 0.3523, + "step": 59790 + }, + { + "epoch": 14.912718204488778, + "grad_norm": 6.293092727661133, + "learning_rate": 5.0940149625935165e-06, + "loss": 0.2826, + "step": 59800 + }, + { + "epoch": 14.915211970074813, + "grad_norm": 8.379650115966797, + "learning_rate": 5.091521197007482e-06, + "loss": 0.2812, + "step": 59810 + }, + { + "epoch": 14.917705735660848, + "grad_norm": 8.945039749145508, + "learning_rate": 5.089027431421447e-06, + "loss": 0.393, + "step": 59820 + }, + { + "epoch": 14.920199501246882, + "grad_norm": 5.737639904022217, + "learning_rate": 5.086533665835412e-06, + "loss": 0.3738, + "step": 59830 + }, + { + "epoch": 14.922693266832917, + "grad_norm": 10.540114402770996, + "learning_rate": 5.084039900249377e-06, + "loss": 0.3052, + "step": 59840 + }, + { + "epoch": 14.925187032418952, + "grad_norm": 7.083829879760742, + "learning_rate": 5.081546134663342e-06, + "loss": 0.3163, + "step": 59850 + }, + { + "epoch": 14.927680798004987, + "grad_norm": 9.188494682312012, + "learning_rate": 5.0790523690773075e-06, + "loss": 0.3344, + "step": 59860 + }, + { + "epoch": 14.930174563591022, + "grad_norm": 5.866366386413574, + "learning_rate": 5.076558603491272e-06, + "loss": 0.3339, + "step": 59870 + }, + { + "epoch": 14.932668329177057, + "grad_norm": 7.339391708374023, + "learning_rate": 5.074064837905237e-06, + "loss": 0.312, + "step": 59880 + }, + { + "epoch": 14.935162094763092, + "grad_norm": 8.920500755310059, + "learning_rate": 5.071571072319203e-06, + "loss": 0.338, + "step": 59890 + }, + { + "epoch": 14.937655860349127, + "grad_norm": 11.074267387390137, + "learning_rate": 5.069077306733168e-06, + "loss": 0.3554, + "step": 59900 + }, + { + "epoch": 14.940149625935161, + "grad_norm": 7.53389310836792, + "learning_rate": 5.066583541147133e-06, + "loss": 0.3171, + "step": 59910 + }, + { + "epoch": 14.942643391521196, + "grad_norm": 5.2787346839904785, + "learning_rate": 5.064089775561098e-06, + "loss": 0.3658, + "step": 59920 + }, + { + "epoch": 14.945137157107231, + "grad_norm": 10.729007720947266, + "learning_rate": 5.061596009975063e-06, + "loss": 0.3204, + "step": 59930 + }, + { + "epoch": 14.947630922693268, + "grad_norm": 4.510223865509033, + "learning_rate": 5.059102244389028e-06, + "loss": 0.3533, + "step": 59940 + }, + { + "epoch": 14.950124688279303, + "grad_norm": 8.356785774230957, + "learning_rate": 5.056608478802994e-06, + "loss": 0.3236, + "step": 59950 + }, + { + "epoch": 14.952618453865338, + "grad_norm": 8.854043006896973, + "learning_rate": 5.054114713216958e-06, + "loss": 0.3763, + "step": 59960 + }, + { + "epoch": 14.955112219451372, + "grad_norm": 7.022368907928467, + "learning_rate": 5.051620947630923e-06, + "loss": 0.3308, + "step": 59970 + }, + { + "epoch": 14.957605985037407, + "grad_norm": 7.40346622467041, + "learning_rate": 5.049127182044889e-06, + "loss": 0.2865, + "step": 59980 + }, + { + "epoch": 14.960099750623442, + "grad_norm": 6.403690338134766, + "learning_rate": 5.046633416458854e-06, + "loss": 0.3002, + "step": 59990 + }, + { + "epoch": 14.962593516209477, + "grad_norm": 4.785618782043457, + "learning_rate": 5.044139650872818e-06, + "loss": 0.3558, + "step": 60000 + }, + { + "epoch": 14.965087281795512, + "grad_norm": 8.796896934509277, + "learning_rate": 5.041645885286783e-06, + "loss": 0.3605, + "step": 60010 + }, + { + "epoch": 14.967581047381547, + "grad_norm": 8.200934410095215, + "learning_rate": 5.039152119700749e-06, + "loss": 0.3184, + "step": 60020 + }, + { + "epoch": 14.970074812967582, + "grad_norm": 9.922554016113281, + "learning_rate": 5.036658354114714e-06, + "loss": 0.3724, + "step": 60030 + }, + { + "epoch": 14.972568578553616, + "grad_norm": 8.850462913513184, + "learning_rate": 5.034164588528678e-06, + "loss": 0.3298, + "step": 60040 + }, + { + "epoch": 14.975062344139651, + "grad_norm": 4.286106586456299, + "learning_rate": 5.031670822942643e-06, + "loss": 0.3485, + "step": 60050 + }, + { + "epoch": 14.977556109725686, + "grad_norm": 8.860762596130371, + "learning_rate": 5.029177057356609e-06, + "loss": 0.3658, + "step": 60060 + }, + { + "epoch": 14.980049875311721, + "grad_norm": 11.523205757141113, + "learning_rate": 5.026683291770574e-06, + "loss": 0.3549, + "step": 60070 + }, + { + "epoch": 14.982543640897756, + "grad_norm": 7.7817277908325195, + "learning_rate": 5.0241895261845384e-06, + "loss": 0.287, + "step": 60080 + }, + { + "epoch": 14.98503740648379, + "grad_norm": 9.745018005371094, + "learning_rate": 5.021695760598504e-06, + "loss": 0.3697, + "step": 60090 + }, + { + "epoch": 14.987531172069826, + "grad_norm": 7.144688606262207, + "learning_rate": 5.019201995012469e-06, + "loss": 0.3164, + "step": 60100 + }, + { + "epoch": 14.99002493765586, + "grad_norm": 7.69403600692749, + "learning_rate": 5.016708229426434e-06, + "loss": 0.3714, + "step": 60110 + }, + { + "epoch": 14.992518703241895, + "grad_norm": 7.915139198303223, + "learning_rate": 5.014214463840399e-06, + "loss": 0.3015, + "step": 60120 + }, + { + "epoch": 14.99501246882793, + "grad_norm": 7.1008381843566895, + "learning_rate": 5.011720698254364e-06, + "loss": 0.314, + "step": 60130 + }, + { + "epoch": 14.997506234413965, + "grad_norm": 9.205920219421387, + "learning_rate": 5.0092269326683294e-06, + "loss": 0.3682, + "step": 60140 + }, + { + "epoch": 15.0, + "grad_norm": 5.531941890716553, + "learning_rate": 5.006733167082295e-06, + "loss": 0.2957, + "step": 60150 + }, + { + "epoch": 15.0, + "eval_loss": 0.4161190390586853, + "eval_runtime": 60.2534, + "eval_samples_per_second": 16.646, + "eval_steps_per_second": 16.646, + "step": 60150 + }, + { + "epoch": 15.002493765586035, + "grad_norm": 5.979081153869629, + "learning_rate": 5.00423940149626e-06, + "loss": 0.3142, + "step": 60160 + }, + { + "epoch": 15.00498753117207, + "grad_norm": 14.604164123535156, + "learning_rate": 5.0017456359102245e-06, + "loss": 0.3561, + "step": 60170 + }, + { + "epoch": 15.007481296758105, + "grad_norm": 8.22153377532959, + "learning_rate": 4.99925187032419e-06, + "loss": 0.379, + "step": 60180 + }, + { + "epoch": 15.00997506234414, + "grad_norm": 8.119734764099121, + "learning_rate": 4.996758104738155e-06, + "loss": 0.2916, + "step": 60190 + }, + { + "epoch": 15.012468827930174, + "grad_norm": 8.341341972351074, + "learning_rate": 4.9942643391521205e-06, + "loss": 0.345, + "step": 60200 + }, + { + "epoch": 15.01496259351621, + "grad_norm": 9.086451530456543, + "learning_rate": 4.991770573566086e-06, + "loss": 0.3281, + "step": 60210 + }, + { + "epoch": 15.017456359102244, + "grad_norm": 6.594851970672607, + "learning_rate": 4.98927680798005e-06, + "loss": 0.3928, + "step": 60220 + }, + { + "epoch": 15.019950124688279, + "grad_norm": 4.85838508605957, + "learning_rate": 4.9867830423940155e-06, + "loss": 0.2773, + "step": 60230 + }, + { + "epoch": 15.022443890274314, + "grad_norm": 6.930510520935059, + "learning_rate": 4.98428927680798e-06, + "loss": 0.3527, + "step": 60240 + }, + { + "epoch": 15.024937655860349, + "grad_norm": 10.528151512145996, + "learning_rate": 4.981795511221945e-06, + "loss": 0.3468, + "step": 60250 + }, + { + "epoch": 15.027431421446384, + "grad_norm": 13.538934707641602, + "learning_rate": 4.979301745635911e-06, + "loss": 0.3329, + "step": 60260 + }, + { + "epoch": 15.029925187032418, + "grad_norm": 9.08583927154541, + "learning_rate": 4.976807980049876e-06, + "loss": 0.3067, + "step": 60270 + }, + { + "epoch": 15.032418952618453, + "grad_norm": 6.1941423416137695, + "learning_rate": 4.97431421446384e-06, + "loss": 0.3442, + "step": 60280 + }, + { + "epoch": 15.034912718204488, + "grad_norm": 8.902884483337402, + "learning_rate": 4.971820448877806e-06, + "loss": 0.2784, + "step": 60290 + }, + { + "epoch": 15.037406483790523, + "grad_norm": 8.3753023147583, + "learning_rate": 4.969326683291771e-06, + "loss": 0.3317, + "step": 60300 + }, + { + "epoch": 15.039900249376558, + "grad_norm": 9.228056907653809, + "learning_rate": 4.966832917705736e-06, + "loss": 0.3551, + "step": 60310 + }, + { + "epoch": 15.042394014962593, + "grad_norm": 8.301530838012695, + "learning_rate": 4.964339152119701e-06, + "loss": 0.4634, + "step": 60320 + }, + { + "epoch": 15.044887780548628, + "grad_norm": 7.827813625335693, + "learning_rate": 4.961845386533666e-06, + "loss": 0.336, + "step": 60330 + }, + { + "epoch": 15.047381546134662, + "grad_norm": 8.87314510345459, + "learning_rate": 4.959351620947631e-06, + "loss": 0.3343, + "step": 60340 + }, + { + "epoch": 15.049875311720697, + "grad_norm": 5.845686912536621, + "learning_rate": 4.956857855361597e-06, + "loss": 0.3175, + "step": 60350 + }, + { + "epoch": 15.052369077306734, + "grad_norm": 7.405905246734619, + "learning_rate": 4.954364089775561e-06, + "loss": 0.354, + "step": 60360 + }, + { + "epoch": 15.054862842892769, + "grad_norm": 6.437865734100342, + "learning_rate": 4.9518703241895265e-06, + "loss": 0.3171, + "step": 60370 + }, + { + "epoch": 15.057356608478804, + "grad_norm": 10.79374885559082, + "learning_rate": 4.949376558603492e-06, + "loss": 0.317, + "step": 60380 + }, + { + "epoch": 15.059850374064839, + "grad_norm": 8.71488094329834, + "learning_rate": 4.946882793017457e-06, + "loss": 0.3546, + "step": 60390 + }, + { + "epoch": 15.062344139650873, + "grad_norm": 7.864600658416748, + "learning_rate": 4.944389027431422e-06, + "loss": 0.4108, + "step": 60400 + }, + { + "epoch": 15.064837905236908, + "grad_norm": 6.649560928344727, + "learning_rate": 4.941895261845387e-06, + "loss": 0.3197, + "step": 60410 + }, + { + "epoch": 15.067331670822943, + "grad_norm": 9.850674629211426, + "learning_rate": 4.939401496259352e-06, + "loss": 0.3384, + "step": 60420 + }, + { + "epoch": 15.069825436408978, + "grad_norm": 16.706798553466797, + "learning_rate": 4.9369077306733175e-06, + "loss": 0.3184, + "step": 60430 + }, + { + "epoch": 15.072319201995013, + "grad_norm": 8.220061302185059, + "learning_rate": 4.934413965087283e-06, + "loss": 0.308, + "step": 60440 + }, + { + "epoch": 15.074812967581048, + "grad_norm": 6.942609786987305, + "learning_rate": 4.931920199501247e-06, + "loss": 0.3105, + "step": 60450 + }, + { + "epoch": 15.077306733167083, + "grad_norm": 8.182219505310059, + "learning_rate": 4.9294264339152126e-06, + "loss": 0.2868, + "step": 60460 + }, + { + "epoch": 15.079800498753118, + "grad_norm": 7.964657783508301, + "learning_rate": 4.926932668329177e-06, + "loss": 0.3713, + "step": 60470 + }, + { + "epoch": 15.082294264339152, + "grad_norm": 6.614937782287598, + "learning_rate": 4.924438902743142e-06, + "loss": 0.3498, + "step": 60480 + }, + { + "epoch": 15.084788029925187, + "grad_norm": 9.722759246826172, + "learning_rate": 4.921945137157108e-06, + "loss": 0.3951, + "step": 60490 + }, + { + "epoch": 15.087281795511222, + "grad_norm": 7.471789836883545, + "learning_rate": 4.919451371571073e-06, + "loss": 0.289, + "step": 60500 + }, + { + "epoch": 15.089775561097257, + "grad_norm": 8.353428840637207, + "learning_rate": 4.9169576059850374e-06, + "loss": 0.3236, + "step": 60510 + }, + { + "epoch": 15.092269326683292, + "grad_norm": 6.462151527404785, + "learning_rate": 4.914463840399003e-06, + "loss": 0.3103, + "step": 60520 + }, + { + "epoch": 15.094763092269327, + "grad_norm": 10.315178871154785, + "learning_rate": 4.911970074812968e-06, + "loss": 0.3587, + "step": 60530 + }, + { + "epoch": 15.097256857855362, + "grad_norm": 7.671100616455078, + "learning_rate": 4.909476309226933e-06, + "loss": 0.3657, + "step": 60540 + }, + { + "epoch": 15.099750623441397, + "grad_norm": 8.20209789276123, + "learning_rate": 4.906982543640898e-06, + "loss": 0.3923, + "step": 60550 + }, + { + "epoch": 15.102244389027431, + "grad_norm": 5.816895008087158, + "learning_rate": 4.904488778054863e-06, + "loss": 0.3362, + "step": 60560 + }, + { + "epoch": 15.104738154613466, + "grad_norm": 6.219974994659424, + "learning_rate": 4.9019950124688284e-06, + "loss": 0.4108, + "step": 60570 + }, + { + "epoch": 15.107231920199501, + "grad_norm": 8.702142715454102, + "learning_rate": 4.899501246882794e-06, + "loss": 0.3341, + "step": 60580 + }, + { + "epoch": 15.109725685785536, + "grad_norm": 7.942946434020996, + "learning_rate": 4.897007481296758e-06, + "loss": 0.298, + "step": 60590 + }, + { + "epoch": 15.11221945137157, + "grad_norm": 14.992578506469727, + "learning_rate": 4.8945137157107235e-06, + "loss": 0.3463, + "step": 60600 + }, + { + "epoch": 15.114713216957606, + "grad_norm": 8.40830135345459, + "learning_rate": 4.892019950124689e-06, + "loss": 0.3821, + "step": 60610 + }, + { + "epoch": 15.11720698254364, + "grad_norm": 9.050780296325684, + "learning_rate": 4.889526184538654e-06, + "loss": 0.3323, + "step": 60620 + }, + { + "epoch": 15.119700748129675, + "grad_norm": 3.6410436630249023, + "learning_rate": 4.887032418952619e-06, + "loss": 0.321, + "step": 60630 + }, + { + "epoch": 15.12219451371571, + "grad_norm": 10.186809539794922, + "learning_rate": 4.884538653366584e-06, + "loss": 0.3119, + "step": 60640 + }, + { + "epoch": 15.124688279301745, + "grad_norm": 9.280686378479004, + "learning_rate": 4.882044887780549e-06, + "loss": 0.2915, + "step": 60650 + }, + { + "epoch": 15.12718204488778, + "grad_norm": 10.575470924377441, + "learning_rate": 4.879551122194514e-06, + "loss": 0.3587, + "step": 60660 + }, + { + "epoch": 15.129675810473815, + "grad_norm": 10.077107429504395, + "learning_rate": 4.87705735660848e-06, + "loss": 0.4142, + "step": 60670 + }, + { + "epoch": 15.13216957605985, + "grad_norm": 8.206668853759766, + "learning_rate": 4.874563591022444e-06, + "loss": 0.3306, + "step": 60680 + }, + { + "epoch": 15.134663341645885, + "grad_norm": 6.796830654144287, + "learning_rate": 4.87206982543641e-06, + "loss": 0.3437, + "step": 60690 + }, + { + "epoch": 15.13715710723192, + "grad_norm": 8.173311233520508, + "learning_rate": 4.869576059850374e-06, + "loss": 0.3, + "step": 60700 + }, + { + "epoch": 15.139650872817954, + "grad_norm": 11.791805267333984, + "learning_rate": 4.867082294264339e-06, + "loss": 0.2989, + "step": 60710 + }, + { + "epoch": 15.14214463840399, + "grad_norm": 8.304753303527832, + "learning_rate": 4.864588528678305e-06, + "loss": 0.3687, + "step": 60720 + }, + { + "epoch": 15.144638403990024, + "grad_norm": 6.197001934051514, + "learning_rate": 4.86209476309227e-06, + "loss": 0.3366, + "step": 60730 + }, + { + "epoch": 15.147132169576059, + "grad_norm": 11.391712188720703, + "learning_rate": 4.8596009975062345e-06, + "loss": 0.3422, + "step": 60740 + }, + { + "epoch": 15.149625935162096, + "grad_norm": 5.738236904144287, + "learning_rate": 4.8571072319202e-06, + "loss": 0.3359, + "step": 60750 + }, + { + "epoch": 15.15211970074813, + "grad_norm": 9.785381317138672, + "learning_rate": 4.854613466334165e-06, + "loss": 0.3786, + "step": 60760 + }, + { + "epoch": 15.154613466334165, + "grad_norm": 6.396745204925537, + "learning_rate": 4.85211970074813e-06, + "loss": 0.2676, + "step": 60770 + }, + { + "epoch": 15.1571072319202, + "grad_norm": 7.886439800262451, + "learning_rate": 4.849625935162095e-06, + "loss": 0.3038, + "step": 60780 + }, + { + "epoch": 15.159600997506235, + "grad_norm": 11.808472633361816, + "learning_rate": 4.84713216957606e-06, + "loss": 0.3218, + "step": 60790 + }, + { + "epoch": 15.16209476309227, + "grad_norm": 13.33260440826416, + "learning_rate": 4.8446384039900255e-06, + "loss": 0.3795, + "step": 60800 + }, + { + "epoch": 15.164588528678305, + "grad_norm": 6.950128078460693, + "learning_rate": 4.842144638403991e-06, + "loss": 0.3344, + "step": 60810 + }, + { + "epoch": 15.16708229426434, + "grad_norm": 10.497044563293457, + "learning_rate": 4.839650872817955e-06, + "loss": 0.3543, + "step": 60820 + }, + { + "epoch": 15.169576059850375, + "grad_norm": 10.250978469848633, + "learning_rate": 4.8371571072319206e-06, + "loss": 0.3404, + "step": 60830 + }, + { + "epoch": 15.17206982543641, + "grad_norm": 9.228592872619629, + "learning_rate": 4.834663341645886e-06, + "loss": 0.3418, + "step": 60840 + }, + { + "epoch": 15.174563591022444, + "grad_norm": 9.230294227600098, + "learning_rate": 4.832169576059851e-06, + "loss": 0.3317, + "step": 60850 + }, + { + "epoch": 15.17705735660848, + "grad_norm": 6.274941921234131, + "learning_rate": 4.829675810473816e-06, + "loss": 0.3361, + "step": 60860 + }, + { + "epoch": 15.179551122194514, + "grad_norm": 7.867391586303711, + "learning_rate": 4.827182044887781e-06, + "loss": 0.3096, + "step": 60870 + }, + { + "epoch": 15.182044887780549, + "grad_norm": 7.890642166137695, + "learning_rate": 4.824688279301745e-06, + "loss": 0.3399, + "step": 60880 + }, + { + "epoch": 15.184538653366584, + "grad_norm": 9.05803394317627, + "learning_rate": 4.822194513715711e-06, + "loss": 0.3148, + "step": 60890 + }, + { + "epoch": 15.187032418952619, + "grad_norm": 5.658694744110107, + "learning_rate": 4.819700748129677e-06, + "loss": 0.3686, + "step": 60900 + }, + { + "epoch": 15.189526184538654, + "grad_norm": 7.717238426208496, + "learning_rate": 4.817206982543641e-06, + "loss": 0.3431, + "step": 60910 + }, + { + "epoch": 15.192019950124688, + "grad_norm": 9.4716215133667, + "learning_rate": 4.814713216957607e-06, + "loss": 0.3377, + "step": 60920 + }, + { + "epoch": 15.194513715710723, + "grad_norm": 6.269632816314697, + "learning_rate": 4.812219451371571e-06, + "loss": 0.3173, + "step": 60930 + }, + { + "epoch": 15.197007481296758, + "grad_norm": 4.857752799987793, + "learning_rate": 4.8097256857855364e-06, + "loss": 0.2644, + "step": 60940 + }, + { + "epoch": 15.199501246882793, + "grad_norm": 4.956846714019775, + "learning_rate": 4.807231920199502e-06, + "loss": 0.4057, + "step": 60950 + }, + { + "epoch": 15.201995012468828, + "grad_norm": 11.810905456542969, + "learning_rate": 4.804738154613467e-06, + "loss": 0.3344, + "step": 60960 + }, + { + "epoch": 15.204488778054863, + "grad_norm": 10.063115119934082, + "learning_rate": 4.8022443890274315e-06, + "loss": 0.356, + "step": 60970 + }, + { + "epoch": 15.206982543640898, + "grad_norm": 8.853325843811035, + "learning_rate": 4.799750623441397e-06, + "loss": 0.2966, + "step": 60980 + }, + { + "epoch": 15.209476309226932, + "grad_norm": 7.260303974151611, + "learning_rate": 4.797256857855362e-06, + "loss": 0.2996, + "step": 60990 + }, + { + "epoch": 15.211970074812967, + "grad_norm": 9.30881118774414, + "learning_rate": 4.7947630922693274e-06, + "loss": 0.3598, + "step": 61000 + }, + { + "epoch": 15.214463840399002, + "grad_norm": 8.756742477416992, + "learning_rate": 4.792269326683292e-06, + "loss": 0.2864, + "step": 61010 + }, + { + "epoch": 15.216957605985037, + "grad_norm": 6.806258201599121, + "learning_rate": 4.789775561097257e-06, + "loss": 0.3255, + "step": 61020 + }, + { + "epoch": 15.219451371571072, + "grad_norm": 7.81227445602417, + "learning_rate": 4.7872817955112225e-06, + "loss": 0.42, + "step": 61030 + }, + { + "epoch": 15.221945137157107, + "grad_norm": 8.680373191833496, + "learning_rate": 4.784788029925188e-06, + "loss": 0.3207, + "step": 61040 + }, + { + "epoch": 15.224438902743142, + "grad_norm": 7.925756454467773, + "learning_rate": 4.782294264339152e-06, + "loss": 0.3674, + "step": 61050 + }, + { + "epoch": 15.226932668329177, + "grad_norm": 9.870626449584961, + "learning_rate": 4.779800498753118e-06, + "loss": 0.3326, + "step": 61060 + }, + { + "epoch": 15.229426433915211, + "grad_norm": 6.636346340179443, + "learning_rate": 4.777306733167082e-06, + "loss": 0.2297, + "step": 61070 + }, + { + "epoch": 15.231920199501246, + "grad_norm": 7.3385009765625, + "learning_rate": 4.774812967581048e-06, + "loss": 0.3006, + "step": 61080 + }, + { + "epoch": 15.234413965087281, + "grad_norm": 7.8916215896606445, + "learning_rate": 4.772319201995013e-06, + "loss": 0.3171, + "step": 61090 + }, + { + "epoch": 15.236907730673316, + "grad_norm": 7.509432792663574, + "learning_rate": 4.769825436408978e-06, + "loss": 0.3321, + "step": 61100 + }, + { + "epoch": 15.239401496259351, + "grad_norm": 5.825787544250488, + "learning_rate": 4.7673316708229425e-06, + "loss": 0.3242, + "step": 61110 + }, + { + "epoch": 15.241895261845386, + "grad_norm": 43.301937103271484, + "learning_rate": 4.764837905236908e-06, + "loss": 0.3342, + "step": 61120 + }, + { + "epoch": 15.24438902743142, + "grad_norm": 5.396152019500732, + "learning_rate": 4.762344139650873e-06, + "loss": 0.3322, + "step": 61130 + }, + { + "epoch": 15.246882793017456, + "grad_norm": 7.503886699676514, + "learning_rate": 4.759850374064838e-06, + "loss": 0.3069, + "step": 61140 + }, + { + "epoch": 15.24937655860349, + "grad_norm": 9.121578216552734, + "learning_rate": 4.757356608478804e-06, + "loss": 0.3596, + "step": 61150 + }, + { + "epoch": 15.251870324189527, + "grad_norm": 8.742789268493652, + "learning_rate": 4.754862842892768e-06, + "loss": 0.2804, + "step": 61160 + }, + { + "epoch": 15.254364089775562, + "grad_norm": 7.584442138671875, + "learning_rate": 4.7523690773067335e-06, + "loss": 0.2849, + "step": 61170 + }, + { + "epoch": 15.256857855361597, + "grad_norm": 6.909498691558838, + "learning_rate": 4.749875311720699e-06, + "loss": 0.2826, + "step": 61180 + }, + { + "epoch": 15.259351620947632, + "grad_norm": 6.300384044647217, + "learning_rate": 4.747381546134664e-06, + "loss": 0.3083, + "step": 61190 + }, + { + "epoch": 15.261845386533667, + "grad_norm": 7.162856101989746, + "learning_rate": 4.7448877805486286e-06, + "loss": 0.2551, + "step": 61200 + }, + { + "epoch": 15.264339152119701, + "grad_norm": 8.662135124206543, + "learning_rate": 4.742394014962594e-06, + "loss": 0.3813, + "step": 61210 + }, + { + "epoch": 15.266832917705736, + "grad_norm": 7.602468013763428, + "learning_rate": 4.739900249376559e-06, + "loss": 0.3187, + "step": 61220 + }, + { + "epoch": 15.269326683291771, + "grad_norm": 8.263647079467773, + "learning_rate": 4.7374064837905245e-06, + "loss": 0.3223, + "step": 61230 + }, + { + "epoch": 15.271820448877806, + "grad_norm": 8.883642196655273, + "learning_rate": 4.734912718204489e-06, + "loss": 0.4101, + "step": 61240 + }, + { + "epoch": 15.27431421446384, + "grad_norm": 13.972535133361816, + "learning_rate": 4.732418952618454e-06, + "loss": 0.3395, + "step": 61250 + }, + { + "epoch": 15.276807980049876, + "grad_norm": 12.901040077209473, + "learning_rate": 4.7299251870324196e-06, + "loss": 0.3364, + "step": 61260 + }, + { + "epoch": 15.27930174563591, + "grad_norm": 9.556396484375, + "learning_rate": 4.727431421446385e-06, + "loss": 0.3784, + "step": 61270 + }, + { + "epoch": 15.281795511221945, + "grad_norm": 9.603511810302734, + "learning_rate": 4.724937655860349e-06, + "loss": 0.321, + "step": 61280 + }, + { + "epoch": 15.28428927680798, + "grad_norm": 12.59118366241455, + "learning_rate": 4.722443890274315e-06, + "loss": 0.3617, + "step": 61290 + }, + { + "epoch": 15.286783042394015, + "grad_norm": 7.6207194328308105, + "learning_rate": 4.719950124688279e-06, + "loss": 0.2976, + "step": 61300 + }, + { + "epoch": 15.28927680798005, + "grad_norm": 10.782099723815918, + "learning_rate": 4.717456359102245e-06, + "loss": 0.2937, + "step": 61310 + }, + { + "epoch": 15.291770573566085, + "grad_norm": 7.862745761871338, + "learning_rate": 4.71496259351621e-06, + "loss": 0.2923, + "step": 61320 + }, + { + "epoch": 15.29426433915212, + "grad_norm": 10.322379112243652, + "learning_rate": 4.712468827930175e-06, + "loss": 0.3436, + "step": 61330 + }, + { + "epoch": 15.296758104738155, + "grad_norm": 6.147068500518799, + "learning_rate": 4.7099750623441395e-06, + "loss": 0.3373, + "step": 61340 + }, + { + "epoch": 15.29925187032419, + "grad_norm": 7.12548303604126, + "learning_rate": 4.707481296758105e-06, + "loss": 0.3147, + "step": 61350 + }, + { + "epoch": 15.301745635910224, + "grad_norm": 10.481398582458496, + "learning_rate": 4.70498753117207e-06, + "loss": 0.3268, + "step": 61360 + }, + { + "epoch": 15.30423940149626, + "grad_norm": 8.344328880310059, + "learning_rate": 4.7024937655860354e-06, + "loss": 0.3231, + "step": 61370 + }, + { + "epoch": 15.306733167082294, + "grad_norm": 5.839080333709717, + "learning_rate": 4.7e-06, + "loss": 0.2777, + "step": 61380 + }, + { + "epoch": 15.309226932668329, + "grad_norm": 14.974725723266602, + "learning_rate": 4.697506234413965e-06, + "loss": 0.3513, + "step": 61390 + }, + { + "epoch": 15.311720698254364, + "grad_norm": 6.981603145599365, + "learning_rate": 4.6950124688279305e-06, + "loss": 0.3106, + "step": 61400 + }, + { + "epoch": 15.314214463840399, + "grad_norm": 10.228157043457031, + "learning_rate": 4.692518703241896e-06, + "loss": 0.3035, + "step": 61410 + }, + { + "epoch": 15.316708229426434, + "grad_norm": 10.667278289794922, + "learning_rate": 4.690024937655861e-06, + "loss": 0.331, + "step": 61420 + }, + { + "epoch": 15.319201995012468, + "grad_norm": 7.606900215148926, + "learning_rate": 4.687531172069826e-06, + "loss": 0.2967, + "step": 61430 + }, + { + "epoch": 15.321695760598503, + "grad_norm": 8.036834716796875, + "learning_rate": 4.685037406483791e-06, + "loss": 0.3349, + "step": 61440 + }, + { + "epoch": 15.324189526184538, + "grad_norm": 7.838779449462891, + "learning_rate": 4.682543640897756e-06, + "loss": 0.3439, + "step": 61450 + }, + { + "epoch": 15.326683291770573, + "grad_norm": 13.692665100097656, + "learning_rate": 4.6800498753117215e-06, + "loss": 0.3764, + "step": 61460 + }, + { + "epoch": 15.329177057356608, + "grad_norm": 9.359086036682129, + "learning_rate": 4.677556109725686e-06, + "loss": 0.3118, + "step": 61470 + }, + { + "epoch": 15.331670822942643, + "grad_norm": 7.252995014190674, + "learning_rate": 4.675062344139651e-06, + "loss": 0.311, + "step": 61480 + }, + { + "epoch": 15.334164588528678, + "grad_norm": 7.640556812286377, + "learning_rate": 4.672568578553617e-06, + "loss": 0.3451, + "step": 61490 + }, + { + "epoch": 15.336658354114713, + "grad_norm": 8.669981956481934, + "learning_rate": 4.670074812967582e-06, + "loss": 0.3612, + "step": 61500 + }, + { + "epoch": 15.339152119700747, + "grad_norm": 12.157310485839844, + "learning_rate": 4.667581047381546e-06, + "loss": 0.3596, + "step": 61510 + }, + { + "epoch": 15.341645885286782, + "grad_norm": 7.998473644256592, + "learning_rate": 4.665087281795512e-06, + "loss": 0.3214, + "step": 61520 + }, + { + "epoch": 15.344139650872817, + "grad_norm": 11.854910850524902, + "learning_rate": 4.662593516209476e-06, + "loss": 0.3224, + "step": 61530 + }, + { + "epoch": 15.346633416458852, + "grad_norm": 8.815274238586426, + "learning_rate": 4.660099750623442e-06, + "loss": 0.317, + "step": 61540 + }, + { + "epoch": 15.349127182044889, + "grad_norm": 6.209719181060791, + "learning_rate": 4.657605985037407e-06, + "loss": 0.3578, + "step": 61550 + }, + { + "epoch": 15.351620947630924, + "grad_norm": 10.795744895935059, + "learning_rate": 4.655112219451372e-06, + "loss": 0.2813, + "step": 61560 + }, + { + "epoch": 15.354114713216958, + "grad_norm": 8.15768814086914, + "learning_rate": 4.6526184538653365e-06, + "loss": 0.3336, + "step": 61570 + }, + { + "epoch": 15.356608478802993, + "grad_norm": 8.229902267456055, + "learning_rate": 4.650124688279302e-06, + "loss": 0.291, + "step": 61580 + }, + { + "epoch": 15.359102244389028, + "grad_norm": 8.55030632019043, + "learning_rate": 4.647630922693267e-06, + "loss": 0.2709, + "step": 61590 + }, + { + "epoch": 15.361596009975063, + "grad_norm": 6.729324817657471, + "learning_rate": 4.6451371571072325e-06, + "loss": 0.3014, + "step": 61600 + }, + { + "epoch": 15.364089775561098, + "grad_norm": 9.249410629272461, + "learning_rate": 4.642643391521197e-06, + "loss": 0.3409, + "step": 61610 + }, + { + "epoch": 15.366583541147133, + "grad_norm": 8.171239852905273, + "learning_rate": 4.640149625935162e-06, + "loss": 0.2956, + "step": 61620 + }, + { + "epoch": 15.369077306733168, + "grad_norm": 8.610503196716309, + "learning_rate": 4.6376558603491276e-06, + "loss": 0.2898, + "step": 61630 + }, + { + "epoch": 15.371571072319203, + "grad_norm": 6.898585796356201, + "learning_rate": 4.635162094763093e-06, + "loss": 0.3064, + "step": 61640 + }, + { + "epoch": 15.374064837905237, + "grad_norm": 10.55229377746582, + "learning_rate": 4.632668329177058e-06, + "loss": 0.3469, + "step": 61650 + }, + { + "epoch": 15.376558603491272, + "grad_norm": 13.5460786819458, + "learning_rate": 4.630174563591023e-06, + "loss": 0.3784, + "step": 61660 + }, + { + "epoch": 15.379052369077307, + "grad_norm": 6.476283073425293, + "learning_rate": 4.627680798004988e-06, + "loss": 0.2643, + "step": 61670 + }, + { + "epoch": 15.381546134663342, + "grad_norm": 13.659258842468262, + "learning_rate": 4.625187032418953e-06, + "loss": 0.4079, + "step": 61680 + }, + { + "epoch": 15.384039900249377, + "grad_norm": 10.668188095092773, + "learning_rate": 4.6226932668329186e-06, + "loss": 0.3074, + "step": 61690 + }, + { + "epoch": 15.386533665835412, + "grad_norm": 8.238740921020508, + "learning_rate": 4.620199501246883e-06, + "loss": 0.304, + "step": 61700 + }, + { + "epoch": 15.389027431421447, + "grad_norm": 7.179644584655762, + "learning_rate": 4.617705735660848e-06, + "loss": 0.3141, + "step": 61710 + }, + { + "epoch": 15.391521197007481, + "grad_norm": 6.697994709014893, + "learning_rate": 4.615211970074814e-06, + "loss": 0.3026, + "step": 61720 + }, + { + "epoch": 15.394014962593516, + "grad_norm": 8.70664119720459, + "learning_rate": 4.612718204488779e-06, + "loss": 0.3417, + "step": 61730 + }, + { + "epoch": 15.396508728179551, + "grad_norm": 6.116442680358887, + "learning_rate": 4.610224438902743e-06, + "loss": 0.3057, + "step": 61740 + }, + { + "epoch": 15.399002493765586, + "grad_norm": 7.209672927856445, + "learning_rate": 4.607730673316709e-06, + "loss": 0.2968, + "step": 61750 + }, + { + "epoch": 15.401496259351621, + "grad_norm": 9.802582740783691, + "learning_rate": 4.605236907730673e-06, + "loss": 0.2957, + "step": 61760 + }, + { + "epoch": 15.403990024937656, + "grad_norm": 15.796855926513672, + "learning_rate": 4.6027431421446385e-06, + "loss": 0.4296, + "step": 61770 + }, + { + "epoch": 15.40648379052369, + "grad_norm": 6.593606948852539, + "learning_rate": 4.600249376558604e-06, + "loss": 0.3552, + "step": 61780 + }, + { + "epoch": 15.408977556109726, + "grad_norm": 6.938211441040039, + "learning_rate": 4.597755610972569e-06, + "loss": 0.3235, + "step": 61790 + }, + { + "epoch": 15.41147132169576, + "grad_norm": 9.084220886230469, + "learning_rate": 4.595261845386534e-06, + "loss": 0.3951, + "step": 61800 + }, + { + "epoch": 15.413965087281795, + "grad_norm": 8.550397872924805, + "learning_rate": 4.592768079800499e-06, + "loss": 0.3256, + "step": 61810 + }, + { + "epoch": 15.41645885286783, + "grad_norm": 8.361339569091797, + "learning_rate": 4.590274314214464e-06, + "loss": 0.3312, + "step": 61820 + }, + { + "epoch": 15.418952618453865, + "grad_norm": 6.701113700866699, + "learning_rate": 4.5877805486284295e-06, + "loss": 0.2941, + "step": 61830 + }, + { + "epoch": 15.4214463840399, + "grad_norm": 14.242850303649902, + "learning_rate": 4.585286783042394e-06, + "loss": 0.3327, + "step": 61840 + }, + { + "epoch": 15.423940149625935, + "grad_norm": 8.893102645874023, + "learning_rate": 4.582793017456359e-06, + "loss": 0.3021, + "step": 61850 + }, + { + "epoch": 15.42643391521197, + "grad_norm": 9.866777420043945, + "learning_rate": 4.580299251870325e-06, + "loss": 0.3488, + "step": 61860 + }, + { + "epoch": 15.428927680798004, + "grad_norm": 9.676253318786621, + "learning_rate": 4.57780548628429e-06, + "loss": 0.3901, + "step": 61870 + }, + { + "epoch": 15.43142144638404, + "grad_norm": 6.8704657554626465, + "learning_rate": 4.575311720698254e-06, + "loss": 0.3065, + "step": 61880 + }, + { + "epoch": 15.433915211970074, + "grad_norm": 8.320914268493652, + "learning_rate": 4.57281795511222e-06, + "loss": 0.3204, + "step": 61890 + }, + { + "epoch": 15.436408977556109, + "grad_norm": 9.73550033569336, + "learning_rate": 4.570324189526185e-06, + "loss": 0.3785, + "step": 61900 + }, + { + "epoch": 15.438902743142144, + "grad_norm": 7.0275139808654785, + "learning_rate": 4.56783042394015e-06, + "loss": 0.3277, + "step": 61910 + }, + { + "epoch": 15.441396508728179, + "grad_norm": 8.163436889648438, + "learning_rate": 4.565336658354116e-06, + "loss": 0.3182, + "step": 61920 + }, + { + "epoch": 15.443890274314214, + "grad_norm": 7.171289920806885, + "learning_rate": 4.56284289276808e-06, + "loss": 0.3612, + "step": 61930 + }, + { + "epoch": 15.446384039900249, + "grad_norm": 11.349479675292969, + "learning_rate": 4.560349127182045e-06, + "loss": 0.3601, + "step": 61940 + }, + { + "epoch": 15.448877805486283, + "grad_norm": 5.093303680419922, + "learning_rate": 4.557855361596011e-06, + "loss": 0.3162, + "step": 61950 + }, + { + "epoch": 15.451371571072318, + "grad_norm": 8.058874130249023, + "learning_rate": 4.555361596009976e-06, + "loss": 0.3508, + "step": 61960 + }, + { + "epoch": 15.453865336658355, + "grad_norm": 7.41152286529541, + "learning_rate": 4.5528678304239405e-06, + "loss": 0.256, + "step": 61970 + }, + { + "epoch": 15.45635910224439, + "grad_norm": 5.370427131652832, + "learning_rate": 4.550374064837906e-06, + "loss": 0.3207, + "step": 61980 + }, + { + "epoch": 15.458852867830425, + "grad_norm": 9.166321754455566, + "learning_rate": 4.54788029925187e-06, + "loss": 0.3825, + "step": 61990 + }, + { + "epoch": 15.46134663341646, + "grad_norm": 11.046178817749023, + "learning_rate": 4.5453865336658355e-06, + "loss": 0.3506, + "step": 62000 + }, + { + "epoch": 15.463840399002494, + "grad_norm": 10.814038276672363, + "learning_rate": 4.542892768079801e-06, + "loss": 0.3217, + "step": 62010 + }, + { + "epoch": 15.46633416458853, + "grad_norm": 6.165196895599365, + "learning_rate": 4.540399002493766e-06, + "loss": 0.321, + "step": 62020 + }, + { + "epoch": 15.468827930174564, + "grad_norm": 6.679711818695068, + "learning_rate": 4.537905236907731e-06, + "loss": 0.3212, + "step": 62030 + }, + { + "epoch": 15.471321695760599, + "grad_norm": 10.853338241577148, + "learning_rate": 4.535411471321696e-06, + "loss": 0.2962, + "step": 62040 + }, + { + "epoch": 15.473815461346634, + "grad_norm": 7.180294990539551, + "learning_rate": 4.532917705735661e-06, + "loss": 0.3367, + "step": 62050 + }, + { + "epoch": 15.476309226932669, + "grad_norm": 6.090845108032227, + "learning_rate": 4.5304239401496266e-06, + "loss": 0.3281, + "step": 62060 + }, + { + "epoch": 15.478802992518704, + "grad_norm": 9.395833015441895, + "learning_rate": 4.527930174563591e-06, + "loss": 0.3029, + "step": 62070 + }, + { + "epoch": 15.481296758104738, + "grad_norm": 8.523276329040527, + "learning_rate": 4.525436408977556e-06, + "loss": 0.3147, + "step": 62080 + }, + { + "epoch": 15.483790523690773, + "grad_norm": 6.579370498657227, + "learning_rate": 4.522942643391522e-06, + "loss": 0.3089, + "step": 62090 + }, + { + "epoch": 15.486284289276808, + "grad_norm": 9.35316276550293, + "learning_rate": 4.520448877805487e-06, + "loss": 0.3479, + "step": 62100 + }, + { + "epoch": 15.488778054862843, + "grad_norm": 8.047480583190918, + "learning_rate": 4.517955112219451e-06, + "loss": 0.3327, + "step": 62110 + }, + { + "epoch": 15.491271820448878, + "grad_norm": 9.682567596435547, + "learning_rate": 4.515461346633417e-06, + "loss": 0.2934, + "step": 62120 + }, + { + "epoch": 15.493765586034913, + "grad_norm": 6.29941463470459, + "learning_rate": 4.512967581047382e-06, + "loss": 0.3575, + "step": 62130 + }, + { + "epoch": 15.496259351620948, + "grad_norm": 6.268800735473633, + "learning_rate": 4.510473815461347e-06, + "loss": 0.301, + "step": 62140 + }, + { + "epoch": 15.498753117206983, + "grad_norm": 9.379264831542969, + "learning_rate": 4.507980049875313e-06, + "loss": 0.3548, + "step": 62150 + }, + { + "epoch": 15.501246882793017, + "grad_norm": 7.664111137390137, + "learning_rate": 4.505486284289277e-06, + "loss": 0.2977, + "step": 62160 + }, + { + "epoch": 15.503740648379052, + "grad_norm": 5.527115821838379, + "learning_rate": 4.502992518703242e-06, + "loss": 0.3337, + "step": 62170 + }, + { + "epoch": 15.506234413965087, + "grad_norm": 9.211461067199707, + "learning_rate": 4.500498753117207e-06, + "loss": 0.3509, + "step": 62180 + }, + { + "epoch": 15.508728179551122, + "grad_norm": 8.124340057373047, + "learning_rate": 4.498004987531173e-06, + "loss": 0.37, + "step": 62190 + }, + { + "epoch": 15.511221945137157, + "grad_norm": 8.361662864685059, + "learning_rate": 4.4955112219451375e-06, + "loss": 0.3075, + "step": 62200 + }, + { + "epoch": 15.513715710723192, + "grad_norm": 9.21060848236084, + "learning_rate": 4.493017456359103e-06, + "loss": 0.277, + "step": 62210 + }, + { + "epoch": 15.516209476309227, + "grad_norm": 9.97355842590332, + "learning_rate": 4.490523690773067e-06, + "loss": 0.3102, + "step": 62220 + }, + { + "epoch": 15.518703241895262, + "grad_norm": 7.273804664611816, + "learning_rate": 4.488029925187033e-06, + "loss": 0.273, + "step": 62230 + }, + { + "epoch": 15.521197007481296, + "grad_norm": 4.933453559875488, + "learning_rate": 4.485536159600998e-06, + "loss": 0.2666, + "step": 62240 + }, + { + "epoch": 15.523690773067331, + "grad_norm": 6.50958251953125, + "learning_rate": 4.483042394014963e-06, + "loss": 0.2831, + "step": 62250 + }, + { + "epoch": 15.526184538653366, + "grad_norm": 5.90592622756958, + "learning_rate": 4.480548628428928e-06, + "loss": 0.4132, + "step": 62260 + }, + { + "epoch": 15.528678304239401, + "grad_norm": 7.461799144744873, + "learning_rate": 4.478054862842893e-06, + "loss": 0.3894, + "step": 62270 + }, + { + "epoch": 15.531172069825436, + "grad_norm": 6.117193222045898, + "learning_rate": 4.475561097256858e-06, + "loss": 0.318, + "step": 62280 + }, + { + "epoch": 15.53366583541147, + "grad_norm": 7.674281120300293, + "learning_rate": 4.473067331670824e-06, + "loss": 0.3456, + "step": 62290 + }, + { + "epoch": 15.536159600997506, + "grad_norm": 7.840825080871582, + "learning_rate": 4.470573566084788e-06, + "loss": 0.3274, + "step": 62300 + }, + { + "epoch": 15.53865336658354, + "grad_norm": 9.947293281555176, + "learning_rate": 4.468079800498753e-06, + "loss": 0.3462, + "step": 62310 + }, + { + "epoch": 15.541147132169575, + "grad_norm": 6.473944664001465, + "learning_rate": 4.465586034912719e-06, + "loss": 0.2612, + "step": 62320 + }, + { + "epoch": 15.54364089775561, + "grad_norm": 10.87900447845459, + "learning_rate": 4.463092269326684e-06, + "loss": 0.3052, + "step": 62330 + }, + { + "epoch": 15.546134663341645, + "grad_norm": 7.721189022064209, + "learning_rate": 4.4605985037406484e-06, + "loss": 0.3771, + "step": 62340 + }, + { + "epoch": 15.548628428927682, + "grad_norm": 7.034101963043213, + "learning_rate": 4.458104738154614e-06, + "loss": 0.3067, + "step": 62350 + }, + { + "epoch": 15.551122194513717, + "grad_norm": 8.35641860961914, + "learning_rate": 4.455610972568579e-06, + "loss": 0.3294, + "step": 62360 + }, + { + "epoch": 15.553615960099751, + "grad_norm": 7.652080535888672, + "learning_rate": 4.453117206982544e-06, + "loss": 0.3738, + "step": 62370 + }, + { + "epoch": 15.556109725685786, + "grad_norm": 7.658141136169434, + "learning_rate": 4.450623441396509e-06, + "loss": 0.3161, + "step": 62380 + }, + { + "epoch": 15.558603491271821, + "grad_norm": 7.808701992034912, + "learning_rate": 4.448129675810474e-06, + "loss": 0.3701, + "step": 62390 + }, + { + "epoch": 15.561097256857856, + "grad_norm": 9.722882270812988, + "learning_rate": 4.4456359102244395e-06, + "loss": 0.3375, + "step": 62400 + }, + { + "epoch": 15.563591022443891, + "grad_norm": 8.67414379119873, + "learning_rate": 4.443142144638404e-06, + "loss": 0.3462, + "step": 62410 + }, + { + "epoch": 15.566084788029926, + "grad_norm": 5.231639385223389, + "learning_rate": 4.44064837905237e-06, + "loss": 0.3686, + "step": 62420 + }, + { + "epoch": 15.56857855361596, + "grad_norm": 11.116351127624512, + "learning_rate": 4.4381546134663345e-06, + "loss": 0.3446, + "step": 62430 + }, + { + "epoch": 15.571072319201996, + "grad_norm": 6.519227504730225, + "learning_rate": 4.4356608478803e-06, + "loss": 0.2998, + "step": 62440 + }, + { + "epoch": 15.57356608478803, + "grad_norm": 12.292588233947754, + "learning_rate": 4.433167082294264e-06, + "loss": 0.3128, + "step": 62450 + }, + { + "epoch": 15.576059850374065, + "grad_norm": 10.114727973937988, + "learning_rate": 4.43067331670823e-06, + "loss": 0.4101, + "step": 62460 + }, + { + "epoch": 15.5785536159601, + "grad_norm": 6.243036270141602, + "learning_rate": 4.428179551122195e-06, + "loss": 0.3902, + "step": 62470 + }, + { + "epoch": 15.581047381546135, + "grad_norm": 7.189294338226318, + "learning_rate": 4.42568578553616e-06, + "loss": 0.2728, + "step": 62480 + }, + { + "epoch": 15.58354114713217, + "grad_norm": 6.678802967071533, + "learning_rate": 4.423192019950125e-06, + "loss": 0.3568, + "step": 62490 + }, + { + "epoch": 15.586034912718205, + "grad_norm": 10.736851692199707, + "learning_rate": 4.42069825436409e-06, + "loss": 0.3335, + "step": 62500 + }, + { + "epoch": 15.58852867830424, + "grad_norm": 7.438121318817139, + "learning_rate": 4.418204488778055e-06, + "loss": 0.2941, + "step": 62510 + }, + { + "epoch": 15.591022443890274, + "grad_norm": 6.257914066314697, + "learning_rate": 4.415710723192021e-06, + "loss": 0.2712, + "step": 62520 + }, + { + "epoch": 15.59351620947631, + "grad_norm": 9.817953109741211, + "learning_rate": 4.413216957605985e-06, + "loss": 0.327, + "step": 62530 + }, + { + "epoch": 15.596009975062344, + "grad_norm": 6.332709312438965, + "learning_rate": 4.41072319201995e-06, + "loss": 0.3838, + "step": 62540 + }, + { + "epoch": 15.598503740648379, + "grad_norm": 7.172346115112305, + "learning_rate": 4.408229426433916e-06, + "loss": 0.3826, + "step": 62550 + }, + { + "epoch": 15.600997506234414, + "grad_norm": 8.06655502319336, + "learning_rate": 4.405735660847881e-06, + "loss": 0.2852, + "step": 62560 + }, + { + "epoch": 15.603491271820449, + "grad_norm": 12.748686790466309, + "learning_rate": 4.4032418952618455e-06, + "loss": 0.3728, + "step": 62570 + }, + { + "epoch": 15.605985037406484, + "grad_norm": 8.942939758300781, + "learning_rate": 4.400748129675811e-06, + "loss": 0.3794, + "step": 62580 + }, + { + "epoch": 15.608478802992519, + "grad_norm": 7.845804691314697, + "learning_rate": 4.398254364089775e-06, + "loss": 0.3231, + "step": 62590 + }, + { + "epoch": 15.610972568578553, + "grad_norm": 10.373620986938477, + "learning_rate": 4.395760598503741e-06, + "loss": 0.2595, + "step": 62600 + }, + { + "epoch": 15.613466334164588, + "grad_norm": 7.292490005493164, + "learning_rate": 4.393266832917706e-06, + "loss": 0.316, + "step": 62610 + }, + { + "epoch": 15.615960099750623, + "grad_norm": 8.094284057617188, + "learning_rate": 4.390773067331671e-06, + "loss": 0.3601, + "step": 62620 + }, + { + "epoch": 15.618453865336658, + "grad_norm": 8.893630981445312, + "learning_rate": 4.3882793017456365e-06, + "loss": 0.3487, + "step": 62630 + }, + { + "epoch": 15.620947630922693, + "grad_norm": 6.261669635772705, + "learning_rate": 4.385785536159601e-06, + "loss": 0.2983, + "step": 62640 + }, + { + "epoch": 15.623441396508728, + "grad_norm": 6.368238925933838, + "learning_rate": 4.383291770573566e-06, + "loss": 0.3524, + "step": 62650 + }, + { + "epoch": 15.625935162094763, + "grad_norm": 15.371926307678223, + "learning_rate": 4.380798004987532e-06, + "loss": 0.378, + "step": 62660 + }, + { + "epoch": 15.628428927680797, + "grad_norm": 5.758829593658447, + "learning_rate": 4.378304239401497e-06, + "loss": 0.3458, + "step": 62670 + }, + { + "epoch": 15.630922693266832, + "grad_norm": 7.6789326667785645, + "learning_rate": 4.375810473815461e-06, + "loss": 0.2688, + "step": 62680 + }, + { + "epoch": 15.633416458852867, + "grad_norm": 7.106253147125244, + "learning_rate": 4.373316708229427e-06, + "loss": 0.2851, + "step": 62690 + }, + { + "epoch": 15.635910224438902, + "grad_norm": 6.524157524108887, + "learning_rate": 4.370822942643392e-06, + "loss": 0.3196, + "step": 62700 + }, + { + "epoch": 15.638403990024937, + "grad_norm": 12.354263305664062, + "learning_rate": 4.368329177057357e-06, + "loss": 0.3764, + "step": 62710 + }, + { + "epoch": 15.640897755610972, + "grad_norm": 8.018882751464844, + "learning_rate": 4.365835411471322e-06, + "loss": 0.3431, + "step": 62720 + }, + { + "epoch": 15.643391521197007, + "grad_norm": 4.7391228675842285, + "learning_rate": 4.363341645885287e-06, + "loss": 0.2748, + "step": 62730 + }, + { + "epoch": 15.645885286783042, + "grad_norm": 7.503388404846191, + "learning_rate": 4.360847880299252e-06, + "loss": 0.3772, + "step": 62740 + }, + { + "epoch": 15.648379052369076, + "grad_norm": 8.94284725189209, + "learning_rate": 4.358354114713218e-06, + "loss": 0.3285, + "step": 62750 + }, + { + "epoch": 15.650872817955111, + "grad_norm": 7.088261127471924, + "learning_rate": 4.355860349127182e-06, + "loss": 0.2969, + "step": 62760 + }, + { + "epoch": 15.653366583541148, + "grad_norm": 5.1243414878845215, + "learning_rate": 4.3533665835411475e-06, + "loss": 0.3318, + "step": 62770 + }, + { + "epoch": 15.655860349127183, + "grad_norm": 9.318085670471191, + "learning_rate": 4.350872817955113e-06, + "loss": 0.3598, + "step": 62780 + }, + { + "epoch": 15.658354114713218, + "grad_norm": 7.903304576873779, + "learning_rate": 4.348379052369078e-06, + "loss": 0.3319, + "step": 62790 + }, + { + "epoch": 15.660847880299253, + "grad_norm": 7.943011283874512, + "learning_rate": 4.3458852867830425e-06, + "loss": 0.3095, + "step": 62800 + }, + { + "epoch": 15.663341645885287, + "grad_norm": 8.990947723388672, + "learning_rate": 4.343391521197008e-06, + "loss": 0.2832, + "step": 62810 + }, + { + "epoch": 15.665835411471322, + "grad_norm": 11.269744873046875, + "learning_rate": 4.340897755610972e-06, + "loss": 0.3504, + "step": 62820 + }, + { + "epoch": 15.668329177057357, + "grad_norm": 8.170928955078125, + "learning_rate": 4.3384039900249385e-06, + "loss": 0.2957, + "step": 62830 + }, + { + "epoch": 15.670822942643392, + "grad_norm": 10.878934860229492, + "learning_rate": 4.335910224438903e-06, + "loss": 0.3487, + "step": 62840 + }, + { + "epoch": 15.673316708229427, + "grad_norm": 7.941391944885254, + "learning_rate": 4.333416458852868e-06, + "loss": 0.3278, + "step": 62850 + }, + { + "epoch": 15.675810473815462, + "grad_norm": 8.739579200744629, + "learning_rate": 4.330922693266833e-06, + "loss": 0.3454, + "step": 62860 + }, + { + "epoch": 15.678304239401497, + "grad_norm": 7.752486705780029, + "learning_rate": 4.328428927680798e-06, + "loss": 0.3343, + "step": 62870 + }, + { + "epoch": 15.680798004987532, + "grad_norm": 10.62015151977539, + "learning_rate": 4.325935162094763e-06, + "loss": 0.3248, + "step": 62880 + }, + { + "epoch": 15.683291770573566, + "grad_norm": 7.1810832023620605, + "learning_rate": 4.323441396508729e-06, + "loss": 0.2936, + "step": 62890 + }, + { + "epoch": 15.685785536159601, + "grad_norm": 7.9837236404418945, + "learning_rate": 4.320947630922694e-06, + "loss": 0.2879, + "step": 62900 + }, + { + "epoch": 15.688279301745636, + "grad_norm": 9.112959861755371, + "learning_rate": 4.318453865336658e-06, + "loss": 0.2799, + "step": 62910 + }, + { + "epoch": 15.690773067331671, + "grad_norm": 8.914131164550781, + "learning_rate": 4.315960099750624e-06, + "loss": 0.3874, + "step": 62920 + }, + { + "epoch": 15.693266832917706, + "grad_norm": 10.754671096801758, + "learning_rate": 4.313466334164589e-06, + "loss": 0.3674, + "step": 62930 + }, + { + "epoch": 15.69576059850374, + "grad_norm": 8.270440101623535, + "learning_rate": 4.310972568578554e-06, + "loss": 0.3293, + "step": 62940 + }, + { + "epoch": 15.698254364089776, + "grad_norm": 9.554506301879883, + "learning_rate": 4.308478802992519e-06, + "loss": 0.3053, + "step": 62950 + }, + { + "epoch": 15.70074812967581, + "grad_norm": 8.808588981628418, + "learning_rate": 4.305985037406484e-06, + "loss": 0.3664, + "step": 62960 + }, + { + "epoch": 15.703241895261845, + "grad_norm": 16.313865661621094, + "learning_rate": 4.303491271820449e-06, + "loss": 0.3232, + "step": 62970 + }, + { + "epoch": 15.70573566084788, + "grad_norm": 18.93476104736328, + "learning_rate": 4.300997506234415e-06, + "loss": 0.3315, + "step": 62980 + }, + { + "epoch": 15.708229426433915, + "grad_norm": 7.082018852233887, + "learning_rate": 4.298503740648379e-06, + "loss": 0.3039, + "step": 62990 + }, + { + "epoch": 15.71072319201995, + "grad_norm": 5.019651889801025, + "learning_rate": 4.2960099750623445e-06, + "loss": 0.3773, + "step": 63000 + }, + { + "epoch": 15.713216957605985, + "grad_norm": 7.572514057159424, + "learning_rate": 4.29351620947631e-06, + "loss": 0.3187, + "step": 63010 + }, + { + "epoch": 15.71571072319202, + "grad_norm": 7.050946235656738, + "learning_rate": 4.291022443890275e-06, + "loss": 0.3341, + "step": 63020 + }, + { + "epoch": 15.718204488778055, + "grad_norm": 11.754009246826172, + "learning_rate": 4.2885286783042396e-06, + "loss": 0.3174, + "step": 63030 + }, + { + "epoch": 15.72069825436409, + "grad_norm": 6.041005611419678, + "learning_rate": 4.286034912718205e-06, + "loss": 0.2952, + "step": 63040 + }, + { + "epoch": 15.723192019950124, + "grad_norm": 9.302490234375, + "learning_rate": 4.283541147132169e-06, + "loss": 0.3406, + "step": 63050 + }, + { + "epoch": 15.72568578553616, + "grad_norm": 7.180544853210449, + "learning_rate": 4.2810473815461355e-06, + "loss": 0.3036, + "step": 63060 + }, + { + "epoch": 15.728179551122194, + "grad_norm": 10.246197700500488, + "learning_rate": 4.2785536159601e-06, + "loss": 0.3262, + "step": 63070 + }, + { + "epoch": 15.730673316708229, + "grad_norm": 10.846905708312988, + "learning_rate": 4.276059850374065e-06, + "loss": 0.3819, + "step": 63080 + }, + { + "epoch": 15.733167082294264, + "grad_norm": 7.604201316833496, + "learning_rate": 4.27356608478803e-06, + "loss": 0.3568, + "step": 63090 + }, + { + "epoch": 15.735660847880299, + "grad_norm": 9.36225414276123, + "learning_rate": 4.271072319201995e-06, + "loss": 0.3201, + "step": 63100 + }, + { + "epoch": 15.738154613466333, + "grad_norm": 15.31506061553955, + "learning_rate": 4.26857855361596e-06, + "loss": 0.3591, + "step": 63110 + }, + { + "epoch": 15.740648379052368, + "grad_norm": 10.249829292297363, + "learning_rate": 4.266084788029926e-06, + "loss": 0.3486, + "step": 63120 + }, + { + "epoch": 15.743142144638403, + "grad_norm": 8.24986457824707, + "learning_rate": 4.263591022443891e-06, + "loss": 0.3455, + "step": 63130 + }, + { + "epoch": 15.745635910224438, + "grad_norm": 6.340268135070801, + "learning_rate": 4.2610972568578554e-06, + "loss": 0.3159, + "step": 63140 + }, + { + "epoch": 15.748129675810475, + "grad_norm": 8.134374618530273, + "learning_rate": 4.258603491271821e-06, + "loss": 0.3524, + "step": 63150 + }, + { + "epoch": 15.75062344139651, + "grad_norm": 12.707883834838867, + "learning_rate": 4.256109725685786e-06, + "loss": 0.3194, + "step": 63160 + }, + { + "epoch": 15.753117206982544, + "grad_norm": 10.832149505615234, + "learning_rate": 4.253615960099751e-06, + "loss": 0.3122, + "step": 63170 + }, + { + "epoch": 15.75561097256858, + "grad_norm": 10.519184112548828, + "learning_rate": 4.251122194513716e-06, + "loss": 0.3642, + "step": 63180 + }, + { + "epoch": 15.758104738154614, + "grad_norm": 14.12321949005127, + "learning_rate": 4.248628428927681e-06, + "loss": 0.3388, + "step": 63190 + }, + { + "epoch": 15.760598503740649, + "grad_norm": 6.439551830291748, + "learning_rate": 4.2461346633416465e-06, + "loss": 0.385, + "step": 63200 + }, + { + "epoch": 15.763092269326684, + "grad_norm": 25.51148223876953, + "learning_rate": 4.243640897755612e-06, + "loss": 0.4235, + "step": 63210 + }, + { + "epoch": 15.765586034912719, + "grad_norm": 9.474204063415527, + "learning_rate": 4.241147132169576e-06, + "loss": 0.3963, + "step": 63220 + }, + { + "epoch": 15.768079800498754, + "grad_norm": 10.349353790283203, + "learning_rate": 4.2386533665835415e-06, + "loss": 0.293, + "step": 63230 + }, + { + "epoch": 15.770573566084789, + "grad_norm": 9.036044120788574, + "learning_rate": 4.236159600997507e-06, + "loss": 0.3493, + "step": 63240 + }, + { + "epoch": 15.773067331670823, + "grad_norm": 6.4520392417907715, + "learning_rate": 4.233665835411472e-06, + "loss": 0.3464, + "step": 63250 + }, + { + "epoch": 15.775561097256858, + "grad_norm": 6.277656078338623, + "learning_rate": 4.231172069825437e-06, + "loss": 0.4086, + "step": 63260 + }, + { + "epoch": 15.778054862842893, + "grad_norm": 7.3123979568481445, + "learning_rate": 4.228678304239402e-06, + "loss": 0.3339, + "step": 63270 + }, + { + "epoch": 15.780548628428928, + "grad_norm": 4.954773426055908, + "learning_rate": 4.226184538653366e-06, + "loss": 0.3325, + "step": 63280 + }, + { + "epoch": 15.783042394014963, + "grad_norm": 8.477773666381836, + "learning_rate": 4.223690773067332e-06, + "loss": 0.3338, + "step": 63290 + }, + { + "epoch": 15.785536159600998, + "grad_norm": 6.290766716003418, + "learning_rate": 4.221197007481297e-06, + "loss": 0.3108, + "step": 63300 + }, + { + "epoch": 15.788029925187033, + "grad_norm": 6.442891597747803, + "learning_rate": 4.218703241895262e-06, + "loss": 0.304, + "step": 63310 + }, + { + "epoch": 15.790523690773068, + "grad_norm": 7.096189022064209, + "learning_rate": 4.216209476309227e-06, + "loss": 0.2894, + "step": 63320 + }, + { + "epoch": 15.793017456359102, + "grad_norm": 7.581128120422363, + "learning_rate": 4.213715710723192e-06, + "loss": 0.3329, + "step": 63330 + }, + { + "epoch": 15.795511221945137, + "grad_norm": 7.5609130859375, + "learning_rate": 4.211221945137157e-06, + "loss": 0.233, + "step": 63340 + }, + { + "epoch": 15.798004987531172, + "grad_norm": 6.8791584968566895, + "learning_rate": 4.208728179551123e-06, + "loss": 0.3167, + "step": 63350 + }, + { + "epoch": 15.800498753117207, + "grad_norm": 8.420093536376953, + "learning_rate": 4.206234413965087e-06, + "loss": 0.3346, + "step": 63360 + }, + { + "epoch": 15.802992518703242, + "grad_norm": 8.641119003295898, + "learning_rate": 4.2037406483790525e-06, + "loss": 0.3995, + "step": 63370 + }, + { + "epoch": 15.805486284289277, + "grad_norm": 13.097655296325684, + "learning_rate": 4.201246882793018e-06, + "loss": 0.3696, + "step": 63380 + }, + { + "epoch": 15.807980049875312, + "grad_norm": 7.517343521118164, + "learning_rate": 4.198753117206983e-06, + "loss": 0.2953, + "step": 63390 + }, + { + "epoch": 15.810473815461346, + "grad_norm": 9.811420440673828, + "learning_rate": 4.196259351620948e-06, + "loss": 0.3546, + "step": 63400 + }, + { + "epoch": 15.812967581047381, + "grad_norm": 8.417027473449707, + "learning_rate": 4.193765586034913e-06, + "loss": 0.3077, + "step": 63410 + }, + { + "epoch": 15.815461346633416, + "grad_norm": 7.933226108551025, + "learning_rate": 4.191271820448878e-06, + "loss": 0.3254, + "step": 63420 + }, + { + "epoch": 15.817955112219451, + "grad_norm": 6.797752380371094, + "learning_rate": 4.1887780548628435e-06, + "loss": 0.2962, + "step": 63430 + }, + { + "epoch": 15.820448877805486, + "grad_norm": 6.070878505706787, + "learning_rate": 4.186284289276809e-06, + "loss": 0.2808, + "step": 63440 + }, + { + "epoch": 15.82294264339152, + "grad_norm": 10.812505722045898, + "learning_rate": 4.183790523690773e-06, + "loss": 0.334, + "step": 63450 + }, + { + "epoch": 15.825436408977556, + "grad_norm": 9.91736125946045, + "learning_rate": 4.1812967581047386e-06, + "loss": 0.2903, + "step": 63460 + }, + { + "epoch": 15.82793017456359, + "grad_norm": 7.868504524230957, + "learning_rate": 4.178802992518704e-06, + "loss": 0.3427, + "step": 63470 + }, + { + "epoch": 15.830423940149625, + "grad_norm": 10.431106567382812, + "learning_rate": 4.176309226932669e-06, + "loss": 0.3997, + "step": 63480 + }, + { + "epoch": 15.83291770573566, + "grad_norm": 9.199599266052246, + "learning_rate": 4.173815461346634e-06, + "loss": 0.3854, + "step": 63490 + }, + { + "epoch": 15.835411471321695, + "grad_norm": 9.323473930358887, + "learning_rate": 4.171321695760599e-06, + "loss": 0.3475, + "step": 63500 + }, + { + "epoch": 15.83790523690773, + "grad_norm": 6.273562431335449, + "learning_rate": 4.1688279301745634e-06, + "loss": 0.3192, + "step": 63510 + }, + { + "epoch": 15.840399002493765, + "grad_norm": 9.469862937927246, + "learning_rate": 4.166334164588529e-06, + "loss": 0.3289, + "step": 63520 + }, + { + "epoch": 15.8428927680798, + "grad_norm": 10.834332466125488, + "learning_rate": 4.163840399002494e-06, + "loss": 0.3935, + "step": 63530 + }, + { + "epoch": 15.845386533665835, + "grad_norm": 11.64710807800293, + "learning_rate": 4.161346633416459e-06, + "loss": 0.3366, + "step": 63540 + }, + { + "epoch": 15.84788029925187, + "grad_norm": 8.563414573669434, + "learning_rate": 4.158852867830424e-06, + "loss": 0.3576, + "step": 63550 + }, + { + "epoch": 15.850374064837904, + "grad_norm": 8.679342269897461, + "learning_rate": 4.156359102244389e-06, + "loss": 0.2798, + "step": 63560 + }, + { + "epoch": 15.85286783042394, + "grad_norm": 4.7781548500061035, + "learning_rate": 4.1538653366583544e-06, + "loss": 0.3337, + "step": 63570 + }, + { + "epoch": 15.855361596009976, + "grad_norm": 4.6593523025512695, + "learning_rate": 4.15137157107232e-06, + "loss": 0.3748, + "step": 63580 + }, + { + "epoch": 15.85785536159601, + "grad_norm": 7.632694244384766, + "learning_rate": 4.148877805486284e-06, + "loss": 0.3599, + "step": 63590 + }, + { + "epoch": 15.860349127182046, + "grad_norm": 6.817638397216797, + "learning_rate": 4.1463840399002495e-06, + "loss": 0.2949, + "step": 63600 + }, + { + "epoch": 15.86284289276808, + "grad_norm": 10.572402954101562, + "learning_rate": 4.143890274314215e-06, + "loss": 0.3316, + "step": 63610 + }, + { + "epoch": 15.865336658354115, + "grad_norm": 7.423572540283203, + "learning_rate": 4.14139650872818e-06, + "loss": 0.2964, + "step": 63620 + }, + { + "epoch": 15.86783042394015, + "grad_norm": 8.025785446166992, + "learning_rate": 4.1389027431421455e-06, + "loss": 0.2967, + "step": 63630 + }, + { + "epoch": 15.870324189526185, + "grad_norm": 5.359241485595703, + "learning_rate": 4.13640897755611e-06, + "loss": 0.3303, + "step": 63640 + }, + { + "epoch": 15.87281795511222, + "grad_norm": 7.528040885925293, + "learning_rate": 4.133915211970075e-06, + "loss": 0.3465, + "step": 63650 + }, + { + "epoch": 15.875311720698255, + "grad_norm": 8.721107482910156, + "learning_rate": 4.1314214463840405e-06, + "loss": 0.3032, + "step": 63660 + }, + { + "epoch": 15.87780548628429, + "grad_norm": 8.390384674072266, + "learning_rate": 4.128927680798006e-06, + "loss": 0.2707, + "step": 63670 + }, + { + "epoch": 15.880299251870325, + "grad_norm": 6.81118631362915, + "learning_rate": 4.12643391521197e-06, + "loss": 0.3659, + "step": 63680 + }, + { + "epoch": 15.88279301745636, + "grad_norm": 11.449249267578125, + "learning_rate": 4.123940149625936e-06, + "loss": 0.3118, + "step": 63690 + }, + { + "epoch": 15.885286783042394, + "grad_norm": 8.672457695007324, + "learning_rate": 4.1214463840399e-06, + "loss": 0.3703, + "step": 63700 + }, + { + "epoch": 15.88778054862843, + "grad_norm": 6.935692310333252, + "learning_rate": 4.118952618453866e-06, + "loss": 0.2715, + "step": 63710 + }, + { + "epoch": 15.890274314214464, + "grad_norm": 7.293999195098877, + "learning_rate": 4.116458852867831e-06, + "loss": 0.3367, + "step": 63720 + }, + { + "epoch": 15.892768079800499, + "grad_norm": 8.051675796508789, + "learning_rate": 4.113965087281796e-06, + "loss": 0.3378, + "step": 63730 + }, + { + "epoch": 15.895261845386534, + "grad_norm": 8.13101577758789, + "learning_rate": 4.1114713216957605e-06, + "loss": 0.3245, + "step": 63740 + }, + { + "epoch": 15.897755610972569, + "grad_norm": 8.935870170593262, + "learning_rate": 4.108977556109726e-06, + "loss": 0.3168, + "step": 63750 + }, + { + "epoch": 15.900249376558603, + "grad_norm": 8.20327091217041, + "learning_rate": 4.106483790523691e-06, + "loss": 0.3086, + "step": 63760 + }, + { + "epoch": 15.902743142144638, + "grad_norm": 6.9828667640686035, + "learning_rate": 4.103990024937656e-06, + "loss": 0.3724, + "step": 63770 + }, + { + "epoch": 15.905236907730673, + "grad_norm": 7.1580915451049805, + "learning_rate": 4.101496259351621e-06, + "loss": 0.3337, + "step": 63780 + }, + { + "epoch": 15.907730673316708, + "grad_norm": 7.152059078216553, + "learning_rate": 4.099002493765586e-06, + "loss": 0.3067, + "step": 63790 + }, + { + "epoch": 15.910224438902743, + "grad_norm": 8.175209999084473, + "learning_rate": 4.0965087281795515e-06, + "loss": 0.3518, + "step": 63800 + }, + { + "epoch": 15.912718204488778, + "grad_norm": 8.411026000976562, + "learning_rate": 4.094014962593517e-06, + "loss": 0.2738, + "step": 63810 + }, + { + "epoch": 15.915211970074813, + "grad_norm": 4.850775241851807, + "learning_rate": 4.091521197007481e-06, + "loss": 0.2802, + "step": 63820 + }, + { + "epoch": 15.917705735660848, + "grad_norm": 11.431254386901855, + "learning_rate": 4.0890274314214466e-06, + "loss": 0.3478, + "step": 63830 + }, + { + "epoch": 15.920199501246882, + "grad_norm": 7.01432991027832, + "learning_rate": 4.086533665835412e-06, + "loss": 0.3393, + "step": 63840 + }, + { + "epoch": 15.922693266832917, + "grad_norm": 6.793428421020508, + "learning_rate": 4.084039900249377e-06, + "loss": 0.3174, + "step": 63850 + }, + { + "epoch": 15.925187032418952, + "grad_norm": 7.512074947357178, + "learning_rate": 4.081546134663342e-06, + "loss": 0.2974, + "step": 63860 + }, + { + "epoch": 15.927680798004987, + "grad_norm": 8.08775806427002, + "learning_rate": 4.079052369077307e-06, + "loss": 0.3465, + "step": 63870 + }, + { + "epoch": 15.930174563591022, + "grad_norm": 8.87288761138916, + "learning_rate": 4.076558603491272e-06, + "loss": 0.329, + "step": 63880 + }, + { + "epoch": 15.932668329177057, + "grad_norm": 8.738507270812988, + "learning_rate": 4.0740648379052376e-06, + "loss": 0.3361, + "step": 63890 + }, + { + "epoch": 15.935162094763092, + "grad_norm": 6.354411602020264, + "learning_rate": 4.071571072319203e-06, + "loss": 0.3113, + "step": 63900 + }, + { + "epoch": 15.937655860349127, + "grad_norm": 10.517175674438477, + "learning_rate": 4.069077306733167e-06, + "loss": 0.3381, + "step": 63910 + }, + { + "epoch": 15.940149625935161, + "grad_norm": 8.709144592285156, + "learning_rate": 4.066583541147133e-06, + "loss": 0.3011, + "step": 63920 + }, + { + "epoch": 15.942643391521196, + "grad_norm": 7.812455654144287, + "learning_rate": 4.064089775561097e-06, + "loss": 0.3099, + "step": 63930 + }, + { + "epoch": 15.945137157107231, + "grad_norm": 7.7127685546875, + "learning_rate": 4.061596009975063e-06, + "loss": 0.3536, + "step": 63940 + }, + { + "epoch": 15.947630922693268, + "grad_norm": 8.810644149780273, + "learning_rate": 4.059102244389028e-06, + "loss": 0.3048, + "step": 63950 + }, + { + "epoch": 15.950124688279303, + "grad_norm": 9.955845832824707, + "learning_rate": 4.056608478802993e-06, + "loss": 0.3582, + "step": 63960 + }, + { + "epoch": 15.952618453865338, + "grad_norm": 8.85975456237793, + "learning_rate": 4.0541147132169575e-06, + "loss": 0.3893, + "step": 63970 + }, + { + "epoch": 15.955112219451372, + "grad_norm": 8.970422744750977, + "learning_rate": 4.051620947630923e-06, + "loss": 0.2967, + "step": 63980 + }, + { + "epoch": 15.957605985037407, + "grad_norm": 8.689909934997559, + "learning_rate": 4.049127182044888e-06, + "loss": 0.3129, + "step": 63990 + }, + { + "epoch": 15.960099750623442, + "grad_norm": 12.43597412109375, + "learning_rate": 4.0466334164588534e-06, + "loss": 0.4056, + "step": 64000 + }, + { + "epoch": 15.962593516209477, + "grad_norm": 5.898103713989258, + "learning_rate": 4.044139650872818e-06, + "loss": 0.3884, + "step": 64010 + }, + { + "epoch": 15.965087281795512, + "grad_norm": 9.561400413513184, + "learning_rate": 4.041645885286783e-06, + "loss": 0.2965, + "step": 64020 + }, + { + "epoch": 15.967581047381547, + "grad_norm": 13.149139404296875, + "learning_rate": 4.0391521197007485e-06, + "loss": 0.2631, + "step": 64030 + }, + { + "epoch": 15.970074812967582, + "grad_norm": 8.486005783081055, + "learning_rate": 4.036658354114714e-06, + "loss": 0.2731, + "step": 64040 + }, + { + "epoch": 15.972568578553616, + "grad_norm": 10.793266296386719, + "learning_rate": 4.034164588528678e-06, + "loss": 0.3073, + "step": 64050 + }, + { + "epoch": 15.975062344139651, + "grad_norm": 19.346826553344727, + "learning_rate": 4.031670822942644e-06, + "loss": 0.3753, + "step": 64060 + }, + { + "epoch": 15.977556109725686, + "grad_norm": 9.948132514953613, + "learning_rate": 4.029177057356609e-06, + "loss": 0.314, + "step": 64070 + }, + { + "epoch": 15.980049875311721, + "grad_norm": 8.224431991577148, + "learning_rate": 4.026683291770574e-06, + "loss": 0.3025, + "step": 64080 + }, + { + "epoch": 15.982543640897756, + "grad_norm": 9.23838996887207, + "learning_rate": 4.024189526184539e-06, + "loss": 0.4478, + "step": 64090 + }, + { + "epoch": 15.98503740648379, + "grad_norm": 11.388845443725586, + "learning_rate": 4.021695760598504e-06, + "loss": 0.3425, + "step": 64100 + }, + { + "epoch": 15.987531172069826, + "grad_norm": 7.132105350494385, + "learning_rate": 4.0192019950124685e-06, + "loss": 0.3889, + "step": 64110 + }, + { + "epoch": 15.99002493765586, + "grad_norm": 7.140593528747559, + "learning_rate": 4.016708229426435e-06, + "loss": 0.319, + "step": 64120 + }, + { + "epoch": 15.992518703241895, + "grad_norm": 7.605659484863281, + "learning_rate": 4.0142144638404e-06, + "loss": 0.3565, + "step": 64130 + }, + { + "epoch": 15.99501246882793, + "grad_norm": 9.850330352783203, + "learning_rate": 4.011720698254364e-06, + "loss": 0.363, + "step": 64140 + }, + { + "epoch": 15.997506234413965, + "grad_norm": 5.737972259521484, + "learning_rate": 4.00922693266833e-06, + "loss": 0.3383, + "step": 64150 + }, + { + "epoch": 16.0, + "grad_norm": 9.490584373474121, + "learning_rate": 4.006733167082294e-06, + "loss": 0.3047, + "step": 64160 + }, + { + "epoch": 16.0, + "eval_loss": 0.41599616408348083, + "eval_runtime": 60.0234, + "eval_samples_per_second": 16.71, + "eval_steps_per_second": 16.71, + "step": 64160 + }, + { + "epoch": 16.002493765586035, + "grad_norm": 13.953182220458984, + "learning_rate": 4.0042394014962595e-06, + "loss": 0.3067, + "step": 64170 + }, + { + "epoch": 16.00498753117207, + "grad_norm": 7.221786022186279, + "learning_rate": 4.001745635910225e-06, + "loss": 0.3639, + "step": 64180 + }, + { + "epoch": 16.007481296758105, + "grad_norm": 8.095586776733398, + "learning_rate": 3.99925187032419e-06, + "loss": 0.3427, + "step": 64190 + }, + { + "epoch": 16.00997506234414, + "grad_norm": 5.289329528808594, + "learning_rate": 3.9967581047381546e-06, + "loss": 0.3658, + "step": 64200 + }, + { + "epoch": 16.012468827930174, + "grad_norm": 10.353299140930176, + "learning_rate": 3.99426433915212e-06, + "loss": 0.3201, + "step": 64210 + }, + { + "epoch": 16.01496259351621, + "grad_norm": 6.504826545715332, + "learning_rate": 3.991770573566085e-06, + "loss": 0.338, + "step": 64220 + }, + { + "epoch": 16.017456359102244, + "grad_norm": 10.51669692993164, + "learning_rate": 3.9892768079800505e-06, + "loss": 0.2674, + "step": 64230 + }, + { + "epoch": 16.01995012468828, + "grad_norm": 10.650725364685059, + "learning_rate": 3.986783042394015e-06, + "loss": 0.2787, + "step": 64240 + }, + { + "epoch": 16.022443890274314, + "grad_norm": 8.505513191223145, + "learning_rate": 3.98428927680798e-06, + "loss": 0.2972, + "step": 64250 + }, + { + "epoch": 16.02493765586035, + "grad_norm": 10.878944396972656, + "learning_rate": 3.9817955112219456e-06, + "loss": 0.3168, + "step": 64260 + }, + { + "epoch": 16.027431421446384, + "grad_norm": 7.452136993408203, + "learning_rate": 3.979301745635911e-06, + "loss": 0.319, + "step": 64270 + }, + { + "epoch": 16.02992518703242, + "grad_norm": 7.434098720550537, + "learning_rate": 3.976807980049875e-06, + "loss": 0.3388, + "step": 64280 + }, + { + "epoch": 16.032418952618453, + "grad_norm": 6.33979606628418, + "learning_rate": 3.974314214463841e-06, + "loss": 0.2748, + "step": 64290 + }, + { + "epoch": 16.034912718204488, + "grad_norm": 13.049262046813965, + "learning_rate": 3.971820448877806e-06, + "loss": 0.3333, + "step": 64300 + }, + { + "epoch": 16.037406483790523, + "grad_norm": 12.862238883972168, + "learning_rate": 3.969326683291771e-06, + "loss": 0.3516, + "step": 64310 + }, + { + "epoch": 16.039900249376558, + "grad_norm": 7.679028034210205, + "learning_rate": 3.966832917705736e-06, + "loss": 0.3192, + "step": 64320 + }, + { + "epoch": 16.042394014962593, + "grad_norm": 7.861089706420898, + "learning_rate": 3.964339152119701e-06, + "loss": 0.2909, + "step": 64330 + }, + { + "epoch": 16.044887780548628, + "grad_norm": 7.005455493927002, + "learning_rate": 3.9618453865336655e-06, + "loss": 0.3583, + "step": 64340 + }, + { + "epoch": 16.047381546134662, + "grad_norm": 11.43169116973877, + "learning_rate": 3.959351620947632e-06, + "loss": 0.3038, + "step": 64350 + }, + { + "epoch": 16.049875311720697, + "grad_norm": 8.389317512512207, + "learning_rate": 3.956857855361596e-06, + "loss": 0.2716, + "step": 64360 + }, + { + "epoch": 16.052369077306732, + "grad_norm": 8.883370399475098, + "learning_rate": 3.9543640897755614e-06, + "loss": 0.3067, + "step": 64370 + }, + { + "epoch": 16.054862842892767, + "grad_norm": 10.897636413574219, + "learning_rate": 3.951870324189527e-06, + "loss": 0.297, + "step": 64380 + }, + { + "epoch": 16.057356608478802, + "grad_norm": 9.456404685974121, + "learning_rate": 3.949376558603491e-06, + "loss": 0.3652, + "step": 64390 + }, + { + "epoch": 16.059850374064837, + "grad_norm": 8.108953475952148, + "learning_rate": 3.9468827930174565e-06, + "loss": 0.323, + "step": 64400 + }, + { + "epoch": 16.06234413965087, + "grad_norm": 5.497220039367676, + "learning_rate": 3.944389027431422e-06, + "loss": 0.2902, + "step": 64410 + }, + { + "epoch": 16.064837905236907, + "grad_norm": 8.865570068359375, + "learning_rate": 3.941895261845387e-06, + "loss": 0.356, + "step": 64420 + }, + { + "epoch": 16.06733167082294, + "grad_norm": 6.835111141204834, + "learning_rate": 3.939401496259352e-06, + "loss": 0.3206, + "step": 64430 + }, + { + "epoch": 16.069825436408976, + "grad_norm": 10.127936363220215, + "learning_rate": 3.936907730673317e-06, + "loss": 0.2834, + "step": 64440 + }, + { + "epoch": 16.07231920199501, + "grad_norm": 10.34428882598877, + "learning_rate": 3.934413965087282e-06, + "loss": 0.2795, + "step": 64450 + }, + { + "epoch": 16.074812967581046, + "grad_norm": 7.148595809936523, + "learning_rate": 3.9319201995012475e-06, + "loss": 0.3463, + "step": 64460 + }, + { + "epoch": 16.07730673316708, + "grad_norm": 7.980280876159668, + "learning_rate": 3.929426433915212e-06, + "loss": 0.3167, + "step": 64470 + }, + { + "epoch": 16.079800498753116, + "grad_norm": 11.756796836853027, + "learning_rate": 3.926932668329177e-06, + "loss": 0.3765, + "step": 64480 + }, + { + "epoch": 16.08229426433915, + "grad_norm": 11.97585391998291, + "learning_rate": 3.924438902743143e-06, + "loss": 0.3805, + "step": 64490 + }, + { + "epoch": 16.084788029925186, + "grad_norm": 11.348858833312988, + "learning_rate": 3.921945137157108e-06, + "loss": 0.2978, + "step": 64500 + }, + { + "epoch": 16.08728179551122, + "grad_norm": 6.33409309387207, + "learning_rate": 3.919451371571072e-06, + "loss": 0.3154, + "step": 64510 + }, + { + "epoch": 16.089775561097255, + "grad_norm": 10.497039794921875, + "learning_rate": 3.916957605985038e-06, + "loss": 0.2952, + "step": 64520 + }, + { + "epoch": 16.09226932668329, + "grad_norm": 6.439425945281982, + "learning_rate": 3.914463840399003e-06, + "loss": 0.3474, + "step": 64530 + }, + { + "epoch": 16.094763092269325, + "grad_norm": 11.366449356079102, + "learning_rate": 3.911970074812968e-06, + "loss": 0.3775, + "step": 64540 + }, + { + "epoch": 16.09725685785536, + "grad_norm": 7.353078365325928, + "learning_rate": 3.909476309226933e-06, + "loss": 0.3125, + "step": 64550 + }, + { + "epoch": 16.099750623441395, + "grad_norm": 7.988730430603027, + "learning_rate": 3.906982543640898e-06, + "loss": 0.3449, + "step": 64560 + }, + { + "epoch": 16.102244389027433, + "grad_norm": 7.281782627105713, + "learning_rate": 3.9044887780548625e-06, + "loss": 0.3189, + "step": 64570 + }, + { + "epoch": 16.104738154613468, + "grad_norm": 7.062814712524414, + "learning_rate": 3.901995012468829e-06, + "loss": 0.3316, + "step": 64580 + }, + { + "epoch": 16.107231920199503, + "grad_norm": 6.708259582519531, + "learning_rate": 3.899501246882793e-06, + "loss": 0.2937, + "step": 64590 + }, + { + "epoch": 16.109725685785538, + "grad_norm": 7.511795520782471, + "learning_rate": 3.8970074812967585e-06, + "loss": 0.3319, + "step": 64600 + }, + { + "epoch": 16.112219451371573, + "grad_norm": 7.629890441894531, + "learning_rate": 3.894513715710724e-06, + "loss": 0.3135, + "step": 64610 + }, + { + "epoch": 16.114713216957608, + "grad_norm": 12.183755874633789, + "learning_rate": 3.892019950124688e-06, + "loss": 0.3092, + "step": 64620 + }, + { + "epoch": 16.117206982543642, + "grad_norm": 12.015825271606445, + "learning_rate": 3.8895261845386536e-06, + "loss": 0.3199, + "step": 64630 + }, + { + "epoch": 16.119700748129677, + "grad_norm": 8.182219505310059, + "learning_rate": 3.887032418952619e-06, + "loss": 0.3227, + "step": 64640 + }, + { + "epoch": 16.122194513715712, + "grad_norm": 8.107385635375977, + "learning_rate": 3.884538653366584e-06, + "loss": 0.275, + "step": 64650 + }, + { + "epoch": 16.124688279301747, + "grad_norm": 12.364629745483398, + "learning_rate": 3.882044887780549e-06, + "loss": 0.3205, + "step": 64660 + }, + { + "epoch": 16.127182044887782, + "grad_norm": 9.689504623413086, + "learning_rate": 3.879551122194514e-06, + "loss": 0.3629, + "step": 64670 + }, + { + "epoch": 16.129675810473817, + "grad_norm": 9.650782585144043, + "learning_rate": 3.877057356608479e-06, + "loss": 0.3607, + "step": 64680 + }, + { + "epoch": 16.13216957605985, + "grad_norm": 8.96108341217041, + "learning_rate": 3.8745635910224446e-06, + "loss": 0.3243, + "step": 64690 + }, + { + "epoch": 16.134663341645886, + "grad_norm": 9.31535816192627, + "learning_rate": 3.872069825436409e-06, + "loss": 0.353, + "step": 64700 + }, + { + "epoch": 16.13715710723192, + "grad_norm": 9.662627220153809, + "learning_rate": 3.869576059850374e-06, + "loss": 0.2988, + "step": 64710 + }, + { + "epoch": 16.139650872817956, + "grad_norm": 6.727764129638672, + "learning_rate": 3.86708229426434e-06, + "loss": 0.2802, + "step": 64720 + }, + { + "epoch": 16.14214463840399, + "grad_norm": 11.656500816345215, + "learning_rate": 3.864588528678305e-06, + "loss": 0.319, + "step": 64730 + }, + { + "epoch": 16.144638403990026, + "grad_norm": 9.718652725219727, + "learning_rate": 3.862094763092269e-06, + "loss": 0.3288, + "step": 64740 + }, + { + "epoch": 16.14713216957606, + "grad_norm": 9.155292510986328, + "learning_rate": 3.859600997506235e-06, + "loss": 0.305, + "step": 64750 + }, + { + "epoch": 16.149625935162096, + "grad_norm": 8.969863891601562, + "learning_rate": 3.8571072319202e-06, + "loss": 0.3435, + "step": 64760 + }, + { + "epoch": 16.15211970074813, + "grad_norm": 9.428884506225586, + "learning_rate": 3.854613466334165e-06, + "loss": 0.3198, + "step": 64770 + }, + { + "epoch": 16.154613466334165, + "grad_norm": 10.018813133239746, + "learning_rate": 3.85211970074813e-06, + "loss": 0.3452, + "step": 64780 + }, + { + "epoch": 16.1571072319202, + "grad_norm": 11.803756713867188, + "learning_rate": 3.849625935162095e-06, + "loss": 0.3619, + "step": 64790 + }, + { + "epoch": 16.159600997506235, + "grad_norm": 14.001852989196777, + "learning_rate": 3.84713216957606e-06, + "loss": 0.3259, + "step": 64800 + }, + { + "epoch": 16.16209476309227, + "grad_norm": 7.694314956665039, + "learning_rate": 3.844638403990025e-06, + "loss": 0.3697, + "step": 64810 + }, + { + "epoch": 16.164588528678305, + "grad_norm": 7.822086811065674, + "learning_rate": 3.84214463840399e-06, + "loss": 0.2878, + "step": 64820 + }, + { + "epoch": 16.16708229426434, + "grad_norm": 8.987359046936035, + "learning_rate": 3.8396508728179555e-06, + "loss": 0.3268, + "step": 64830 + }, + { + "epoch": 16.169576059850375, + "grad_norm": 6.423337459564209, + "learning_rate": 3.83715710723192e-06, + "loss": 0.3347, + "step": 64840 + }, + { + "epoch": 16.17206982543641, + "grad_norm": 9.673188209533691, + "learning_rate": 3.834663341645885e-06, + "loss": 0.3471, + "step": 64850 + }, + { + "epoch": 16.174563591022444, + "grad_norm": 4.7321319580078125, + "learning_rate": 3.832169576059851e-06, + "loss": 0.3425, + "step": 64860 + }, + { + "epoch": 16.17705735660848, + "grad_norm": 10.942599296569824, + "learning_rate": 3.829675810473816e-06, + "loss": 0.2551, + "step": 64870 + }, + { + "epoch": 16.179551122194514, + "grad_norm": 9.56152629852295, + "learning_rate": 3.827182044887781e-06, + "loss": 0.3806, + "step": 64880 + }, + { + "epoch": 16.18204488778055, + "grad_norm": 10.860115051269531, + "learning_rate": 3.824688279301746e-06, + "loss": 0.3677, + "step": 64890 + }, + { + "epoch": 16.184538653366584, + "grad_norm": 6.14926815032959, + "learning_rate": 3.822194513715711e-06, + "loss": 0.3018, + "step": 64900 + }, + { + "epoch": 16.18703241895262, + "grad_norm": 9.40666675567627, + "learning_rate": 3.819700748129676e-06, + "loss": 0.307, + "step": 64910 + }, + { + "epoch": 16.189526184538654, + "grad_norm": 10.993182182312012, + "learning_rate": 3.817206982543642e-06, + "loss": 0.287, + "step": 64920 + }, + { + "epoch": 16.19201995012469, + "grad_norm": 8.050580978393555, + "learning_rate": 3.814713216957606e-06, + "loss": 0.3454, + "step": 64930 + }, + { + "epoch": 16.194513715710723, + "grad_norm": 7.049538612365723, + "learning_rate": 3.8122194513715714e-06, + "loss": 0.3349, + "step": 64940 + }, + { + "epoch": 16.197007481296758, + "grad_norm": 6.026520729064941, + "learning_rate": 3.8097256857855363e-06, + "loss": 0.2819, + "step": 64950 + }, + { + "epoch": 16.199501246882793, + "grad_norm": 9.586450576782227, + "learning_rate": 3.8072319201995016e-06, + "loss": 0.369, + "step": 64960 + }, + { + "epoch": 16.201995012468828, + "grad_norm": 12.329106330871582, + "learning_rate": 3.8047381546134665e-06, + "loss": 0.2958, + "step": 64970 + }, + { + "epoch": 16.204488778054863, + "grad_norm": 9.353446960449219, + "learning_rate": 3.8022443890274318e-06, + "loss": 0.3131, + "step": 64980 + }, + { + "epoch": 16.206982543640898, + "grad_norm": 7.740355014801025, + "learning_rate": 3.7997506234413967e-06, + "loss": 0.265, + "step": 64990 + }, + { + "epoch": 16.209476309226932, + "grad_norm": 7.058204174041748, + "learning_rate": 3.797256857855362e-06, + "loss": 0.3649, + "step": 65000 + }, + { + "epoch": 16.211970074812967, + "grad_norm": 9.42304801940918, + "learning_rate": 3.794763092269327e-06, + "loss": 0.3743, + "step": 65010 + }, + { + "epoch": 16.214463840399002, + "grad_norm": 10.909148216247559, + "learning_rate": 3.792269326683292e-06, + "loss": 0.3381, + "step": 65020 + }, + { + "epoch": 16.216957605985037, + "grad_norm": 6.674330711364746, + "learning_rate": 3.789775561097257e-06, + "loss": 0.3916, + "step": 65030 + }, + { + "epoch": 16.219451371571072, + "grad_norm": 7.515557765960693, + "learning_rate": 3.7872817955112224e-06, + "loss": 0.2461, + "step": 65040 + }, + { + "epoch": 16.221945137157107, + "grad_norm": 8.817490577697754, + "learning_rate": 3.7847880299251872e-06, + "loss": 0.282, + "step": 65050 + }, + { + "epoch": 16.22443890274314, + "grad_norm": 9.24267578125, + "learning_rate": 3.7822942643391526e-06, + "loss": 0.3451, + "step": 65060 + }, + { + "epoch": 16.226932668329177, + "grad_norm": 8.435280799865723, + "learning_rate": 3.7798004987531174e-06, + "loss": 0.3645, + "step": 65070 + }, + { + "epoch": 16.22942643391521, + "grad_norm": 8.729866027832031, + "learning_rate": 3.7773067331670827e-06, + "loss": 0.346, + "step": 65080 + }, + { + "epoch": 16.231920199501246, + "grad_norm": 9.877275466918945, + "learning_rate": 3.7748129675810476e-06, + "loss": 0.3218, + "step": 65090 + }, + { + "epoch": 16.23441396508728, + "grad_norm": 7.41887903213501, + "learning_rate": 3.772319201995013e-06, + "loss": 0.2789, + "step": 65100 + }, + { + "epoch": 16.236907730673316, + "grad_norm": 8.857915878295898, + "learning_rate": 3.7698254364089783e-06, + "loss": 0.347, + "step": 65110 + }, + { + "epoch": 16.23940149625935, + "grad_norm": 9.079187393188477, + "learning_rate": 3.767331670822943e-06, + "loss": 0.2908, + "step": 65120 + }, + { + "epoch": 16.241895261845386, + "grad_norm": 8.195780754089355, + "learning_rate": 3.7648379052369085e-06, + "loss": 0.3252, + "step": 65130 + }, + { + "epoch": 16.24438902743142, + "grad_norm": 11.435444831848145, + "learning_rate": 3.762344139650873e-06, + "loss": 0.377, + "step": 65140 + }, + { + "epoch": 16.246882793017456, + "grad_norm": 8.027785301208496, + "learning_rate": 3.7598503740648386e-06, + "loss": 0.2409, + "step": 65150 + }, + { + "epoch": 16.24937655860349, + "grad_norm": 9.426610946655273, + "learning_rate": 3.757356608478803e-06, + "loss": 0.3731, + "step": 65160 + }, + { + "epoch": 16.251870324189525, + "grad_norm": 11.091520309448242, + "learning_rate": 3.7548628428927684e-06, + "loss": 0.3764, + "step": 65170 + }, + { + "epoch": 16.25436408977556, + "grad_norm": 5.973906517028809, + "learning_rate": 3.7523690773067333e-06, + "loss": 0.2766, + "step": 65180 + }, + { + "epoch": 16.256857855361595, + "grad_norm": 8.147195816040039, + "learning_rate": 3.7498753117206986e-06, + "loss": 0.3152, + "step": 65190 + }, + { + "epoch": 16.25935162094763, + "grad_norm": 8.979331016540527, + "learning_rate": 3.7473815461346635e-06, + "loss": 0.3257, + "step": 65200 + }, + { + "epoch": 16.261845386533665, + "grad_norm": 11.531305313110352, + "learning_rate": 3.744887780548629e-06, + "loss": 0.3556, + "step": 65210 + }, + { + "epoch": 16.2643391521197, + "grad_norm": 11.863848686218262, + "learning_rate": 3.7423940149625937e-06, + "loss": 0.3515, + "step": 65220 + }, + { + "epoch": 16.266832917705734, + "grad_norm": 5.3083038330078125, + "learning_rate": 3.739900249376559e-06, + "loss": 0.2752, + "step": 65230 + }, + { + "epoch": 16.26932668329177, + "grad_norm": 7.9635820388793945, + "learning_rate": 3.737406483790524e-06, + "loss": 0.3552, + "step": 65240 + }, + { + "epoch": 16.271820448877804, + "grad_norm": 10.961655616760254, + "learning_rate": 3.734912718204489e-06, + "loss": 0.3359, + "step": 65250 + }, + { + "epoch": 16.27431421446384, + "grad_norm": 8.826502799987793, + "learning_rate": 3.732418952618454e-06, + "loss": 0.2981, + "step": 65260 + }, + { + "epoch": 16.276807980049874, + "grad_norm": 11.395182609558105, + "learning_rate": 3.7299251870324194e-06, + "loss": 0.3708, + "step": 65270 + }, + { + "epoch": 16.27930174563591, + "grad_norm": 7.6695146560668945, + "learning_rate": 3.7274314214463843e-06, + "loss": 0.3029, + "step": 65280 + }, + { + "epoch": 16.281795511221944, + "grad_norm": 10.41583251953125, + "learning_rate": 3.7249376558603496e-06, + "loss": 0.3226, + "step": 65290 + }, + { + "epoch": 16.28428927680798, + "grad_norm": 7.618106842041016, + "learning_rate": 3.7224438902743145e-06, + "loss": 0.3199, + "step": 65300 + }, + { + "epoch": 16.286783042394013, + "grad_norm": 8.326313018798828, + "learning_rate": 3.71995012468828e-06, + "loss": 0.3513, + "step": 65310 + }, + { + "epoch": 16.28927680798005, + "grad_norm": 5.363463878631592, + "learning_rate": 3.7174563591022443e-06, + "loss": 0.3408, + "step": 65320 + }, + { + "epoch": 16.291770573566083, + "grad_norm": 9.034497261047363, + "learning_rate": 3.71496259351621e-06, + "loss": 0.324, + "step": 65330 + }, + { + "epoch": 16.294264339152118, + "grad_norm": 13.3429536819458, + "learning_rate": 3.7124688279301744e-06, + "loss": 0.3495, + "step": 65340 + }, + { + "epoch": 16.296758104738153, + "grad_norm": 14.830777168273926, + "learning_rate": 3.7099750623441398e-06, + "loss": 0.3499, + "step": 65350 + }, + { + "epoch": 16.29925187032419, + "grad_norm": 6.612533092498779, + "learning_rate": 3.7074812967581055e-06, + "loss": 0.3221, + "step": 65360 + }, + { + "epoch": 16.301745635910226, + "grad_norm": 6.141602516174316, + "learning_rate": 3.70498753117207e-06, + "loss": 0.3303, + "step": 65370 + }, + { + "epoch": 16.30423940149626, + "grad_norm": 8.41512393951416, + "learning_rate": 3.7024937655860353e-06, + "loss": 0.2878, + "step": 65380 + }, + { + "epoch": 16.306733167082296, + "grad_norm": 5.508592128753662, + "learning_rate": 3.7e-06, + "loss": 0.3166, + "step": 65390 + }, + { + "epoch": 16.30922693266833, + "grad_norm": 9.821744918823242, + "learning_rate": 3.6975062344139655e-06, + "loss": 0.3125, + "step": 65400 + }, + { + "epoch": 16.311720698254366, + "grad_norm": 6.723392963409424, + "learning_rate": 3.6950124688279303e-06, + "loss": 0.3018, + "step": 65410 + }, + { + "epoch": 16.3142144638404, + "grad_norm": 8.72385311126709, + "learning_rate": 3.6925187032418957e-06, + "loss": 0.3337, + "step": 65420 + }, + { + "epoch": 16.316708229426435, + "grad_norm": 12.842667579650879, + "learning_rate": 3.6900249376558605e-06, + "loss": 0.3676, + "step": 65430 + }, + { + "epoch": 16.31920199501247, + "grad_norm": 9.1225004196167, + "learning_rate": 3.687531172069826e-06, + "loss": 0.3989, + "step": 65440 + }, + { + "epoch": 16.321695760598505, + "grad_norm": 11.330621719360352, + "learning_rate": 3.6850374064837907e-06, + "loss": 0.3592, + "step": 65450 + }, + { + "epoch": 16.32418952618454, + "grad_norm": 10.708224296569824, + "learning_rate": 3.682543640897756e-06, + "loss": 0.3368, + "step": 65460 + }, + { + "epoch": 16.326683291770575, + "grad_norm": 11.212067604064941, + "learning_rate": 3.680049875311721e-06, + "loss": 0.3081, + "step": 65470 + }, + { + "epoch": 16.32917705735661, + "grad_norm": 6.18818473815918, + "learning_rate": 3.6775561097256862e-06, + "loss": 0.3428, + "step": 65480 + }, + { + "epoch": 16.331670822942645, + "grad_norm": 7.7143635749816895, + "learning_rate": 3.675062344139651e-06, + "loss": 0.2853, + "step": 65490 + }, + { + "epoch": 16.33416458852868, + "grad_norm": 9.196212768554688, + "learning_rate": 3.6725685785536164e-06, + "loss": 0.3176, + "step": 65500 + }, + { + "epoch": 16.336658354114714, + "grad_norm": 9.68375301361084, + "learning_rate": 3.6700748129675813e-06, + "loss": 0.3151, + "step": 65510 + }, + { + "epoch": 16.33915211970075, + "grad_norm": 7.000097274780273, + "learning_rate": 3.6675810473815466e-06, + "loss": 0.3043, + "step": 65520 + }, + { + "epoch": 16.341645885286784, + "grad_norm": 6.249640464782715, + "learning_rate": 3.6650872817955115e-06, + "loss": 0.3183, + "step": 65530 + }, + { + "epoch": 16.34413965087282, + "grad_norm": 9.410320281982422, + "learning_rate": 3.662593516209477e-06, + "loss": 0.355, + "step": 65540 + }, + { + "epoch": 16.346633416458854, + "grad_norm": 8.34964656829834, + "learning_rate": 3.6600997506234413e-06, + "loss": 0.3565, + "step": 65550 + }, + { + "epoch": 16.34912718204489, + "grad_norm": 7.488079071044922, + "learning_rate": 3.657605985037407e-06, + "loss": 0.3327, + "step": 65560 + }, + { + "epoch": 16.351620947630924, + "grad_norm": 12.476749420166016, + "learning_rate": 3.6551122194513715e-06, + "loss": 0.2578, + "step": 65570 + }, + { + "epoch": 16.35411471321696, + "grad_norm": 7.019128322601318, + "learning_rate": 3.652618453865337e-06, + "loss": 0.332, + "step": 65580 + }, + { + "epoch": 16.356608478802993, + "grad_norm": 7.373568058013916, + "learning_rate": 3.6501246882793017e-06, + "loss": 0.3988, + "step": 65590 + }, + { + "epoch": 16.359102244389028, + "grad_norm": 8.25925064086914, + "learning_rate": 3.647630922693267e-06, + "loss": 0.359, + "step": 65600 + }, + { + "epoch": 16.361596009975063, + "grad_norm": 9.343754768371582, + "learning_rate": 3.6451371571072323e-06, + "loss": 0.3369, + "step": 65610 + }, + { + "epoch": 16.364089775561098, + "grad_norm": 7.88173770904541, + "learning_rate": 3.642892768079801e-06, + "loss": 0.3372, + "step": 65620 + }, + { + "epoch": 16.366583541147133, + "grad_norm": 12.04601001739502, + "learning_rate": 3.640399002493766e-06, + "loss": 0.3215, + "step": 65630 + }, + { + "epoch": 16.369077306733168, + "grad_norm": 11.845905303955078, + "learning_rate": 3.637905236907731e-06, + "loss": 0.4036, + "step": 65640 + }, + { + "epoch": 16.371571072319203, + "grad_norm": 11.93884563446045, + "learning_rate": 3.635411471321696e-06, + "loss": 0.3596, + "step": 65650 + }, + { + "epoch": 16.374064837905237, + "grad_norm": 8.246026992797852, + "learning_rate": 3.6329177057356614e-06, + "loss": 0.2996, + "step": 65660 + }, + { + "epoch": 16.376558603491272, + "grad_norm": 10.389656066894531, + "learning_rate": 3.6304239401496263e-06, + "loss": 0.3356, + "step": 65670 + }, + { + "epoch": 16.379052369077307, + "grad_norm": 9.619356155395508, + "learning_rate": 3.6279301745635916e-06, + "loss": 0.3053, + "step": 65680 + }, + { + "epoch": 16.381546134663342, + "grad_norm": 10.337359428405762, + "learning_rate": 3.625436408977556e-06, + "loss": 0.3775, + "step": 65690 + }, + { + "epoch": 16.384039900249377, + "grad_norm": 9.273616790771484, + "learning_rate": 3.6229426433915218e-06, + "loss": 0.3492, + "step": 65700 + }, + { + "epoch": 16.38653366583541, + "grad_norm": 7.120643138885498, + "learning_rate": 3.6204488778054862e-06, + "loss": 0.3044, + "step": 65710 + }, + { + "epoch": 16.389027431421447, + "grad_norm": 7.253455638885498, + "learning_rate": 3.6179551122194516e-06, + "loss": 0.3432, + "step": 65720 + }, + { + "epoch": 16.39152119700748, + "grad_norm": 9.472309112548828, + "learning_rate": 3.6154613466334164e-06, + "loss": 0.3314, + "step": 65730 + }, + { + "epoch": 16.394014962593516, + "grad_norm": 12.149874687194824, + "learning_rate": 3.6129675810473818e-06, + "loss": 0.4217, + "step": 65740 + }, + { + "epoch": 16.39650872817955, + "grad_norm": 9.527915954589844, + "learning_rate": 3.6104738154613466e-06, + "loss": 0.3714, + "step": 65750 + }, + { + "epoch": 16.399002493765586, + "grad_norm": 11.615216255187988, + "learning_rate": 3.607980049875312e-06, + "loss": 0.3939, + "step": 65760 + }, + { + "epoch": 16.40149625935162, + "grad_norm": 7.617849826812744, + "learning_rate": 3.605486284289277e-06, + "loss": 0.3533, + "step": 65770 + }, + { + "epoch": 16.403990024937656, + "grad_norm": 6.392154693603516, + "learning_rate": 3.602992518703242e-06, + "loss": 0.3103, + "step": 65780 + }, + { + "epoch": 16.40648379052369, + "grad_norm": 9.79951000213623, + "learning_rate": 3.6004987531172075e-06, + "loss": 0.3713, + "step": 65790 + }, + { + "epoch": 16.408977556109726, + "grad_norm": 8.584783554077148, + "learning_rate": 3.5980049875311723e-06, + "loss": 0.3699, + "step": 65800 + }, + { + "epoch": 16.41147132169576, + "grad_norm": 9.751562118530273, + "learning_rate": 3.5955112219451376e-06, + "loss": 0.3959, + "step": 65810 + }, + { + "epoch": 16.413965087281795, + "grad_norm": 6.995100975036621, + "learning_rate": 3.5930174563591025e-06, + "loss": 0.2217, + "step": 65820 + }, + { + "epoch": 16.41645885286783, + "grad_norm": 12.037008285522461, + "learning_rate": 3.590523690773068e-06, + "loss": 0.3115, + "step": 65830 + }, + { + "epoch": 16.418952618453865, + "grad_norm": 6.427266597747803, + "learning_rate": 3.5880299251870327e-06, + "loss": 0.3391, + "step": 65840 + }, + { + "epoch": 16.4214463840399, + "grad_norm": 6.5554890632629395, + "learning_rate": 3.585536159600998e-06, + "loss": 0.3549, + "step": 65850 + }, + { + "epoch": 16.423940149625935, + "grad_norm": 7.430528163909912, + "learning_rate": 3.583042394014963e-06, + "loss": 0.322, + "step": 65860 + }, + { + "epoch": 16.42643391521197, + "grad_norm": 6.291595935821533, + "learning_rate": 3.5805486284289282e-06, + "loss": 0.3099, + "step": 65870 + }, + { + "epoch": 16.428927680798004, + "grad_norm": 8.320535659790039, + "learning_rate": 3.578054862842893e-06, + "loss": 0.304, + "step": 65880 + }, + { + "epoch": 16.43142144638404, + "grad_norm": 8.501724243164062, + "learning_rate": 3.5755610972568584e-06, + "loss": 0.3474, + "step": 65890 + }, + { + "epoch": 16.433915211970074, + "grad_norm": 10.219943046569824, + "learning_rate": 3.573067331670823e-06, + "loss": 0.2741, + "step": 65900 + }, + { + "epoch": 16.43640897755611, + "grad_norm": 9.32536792755127, + "learning_rate": 3.5705735660847886e-06, + "loss": 0.3118, + "step": 65910 + }, + { + "epoch": 16.438902743142144, + "grad_norm": 7.996541976928711, + "learning_rate": 3.568079800498753e-06, + "loss": 0.3923, + "step": 65920 + }, + { + "epoch": 16.44139650872818, + "grad_norm": 7.270846366882324, + "learning_rate": 3.5655860349127184e-06, + "loss": 0.4512, + "step": 65930 + }, + { + "epoch": 16.443890274314214, + "grad_norm": 8.105086326599121, + "learning_rate": 3.5630922693266833e-06, + "loss": 0.325, + "step": 65940 + }, + { + "epoch": 16.44638403990025, + "grad_norm": 7.63092565536499, + "learning_rate": 3.5605985037406486e-06, + "loss": 0.3325, + "step": 65950 + }, + { + "epoch": 16.448877805486283, + "grad_norm": 8.287890434265137, + "learning_rate": 3.5581047381546135e-06, + "loss": 0.3664, + "step": 65960 + }, + { + "epoch": 16.45137157107232, + "grad_norm": 9.698472023010254, + "learning_rate": 3.555610972568579e-06, + "loss": 0.297, + "step": 65970 + }, + { + "epoch": 16.453865336658353, + "grad_norm": 8.163863182067871, + "learning_rate": 3.5531172069825437e-06, + "loss": 0.3469, + "step": 65980 + }, + { + "epoch": 16.456359102244388, + "grad_norm": 6.485888957977295, + "learning_rate": 3.550623441396509e-06, + "loss": 0.3444, + "step": 65990 + }, + { + "epoch": 16.458852867830423, + "grad_norm": 9.558843612670898, + "learning_rate": 3.548129675810474e-06, + "loss": 0.3527, + "step": 66000 + }, + { + "epoch": 16.461346633416458, + "grad_norm": 11.721412658691406, + "learning_rate": 3.545635910224439e-06, + "loss": 0.3955, + "step": 66010 + }, + { + "epoch": 16.463840399002493, + "grad_norm": 7.238114356994629, + "learning_rate": 3.5431421446384045e-06, + "loss": 0.3075, + "step": 66020 + }, + { + "epoch": 16.466334164588527, + "grad_norm": 8.826272964477539, + "learning_rate": 3.5406483790523694e-06, + "loss": 0.3622, + "step": 66030 + }, + { + "epoch": 16.468827930174562, + "grad_norm": 8.230854034423828, + "learning_rate": 3.5381546134663347e-06, + "loss": 0.3097, + "step": 66040 + }, + { + "epoch": 16.471321695760597, + "grad_norm": 7.81684684753418, + "learning_rate": 3.5356608478802996e-06, + "loss": 0.3132, + "step": 66050 + }, + { + "epoch": 16.473815461346632, + "grad_norm": 12.46484661102295, + "learning_rate": 3.533167082294265e-06, + "loss": 0.3048, + "step": 66060 + }, + { + "epoch": 16.476309226932667, + "grad_norm": 7.251391887664795, + "learning_rate": 3.5306733167082298e-06, + "loss": 0.313, + "step": 66070 + }, + { + "epoch": 16.478802992518702, + "grad_norm": 4.723079681396484, + "learning_rate": 3.528179551122195e-06, + "loss": 0.2885, + "step": 66080 + }, + { + "epoch": 16.481296758104737, + "grad_norm": 9.58585262298584, + "learning_rate": 3.52568578553616e-06, + "loss": 0.3646, + "step": 66090 + }, + { + "epoch": 16.48379052369077, + "grad_norm": 8.872747421264648, + "learning_rate": 3.5231920199501253e-06, + "loss": 0.3397, + "step": 66100 + }, + { + "epoch": 16.486284289276806, + "grad_norm": 7.260976314544678, + "learning_rate": 3.52069825436409e-06, + "loss": 0.2967, + "step": 66110 + }, + { + "epoch": 16.48877805486284, + "grad_norm": 8.966513633728027, + "learning_rate": 3.5182044887780555e-06, + "loss": 0.3234, + "step": 66120 + }, + { + "epoch": 16.491271820448876, + "grad_norm": 8.668418884277344, + "learning_rate": 3.51571072319202e-06, + "loss": 0.3109, + "step": 66130 + }, + { + "epoch": 16.49376558603491, + "grad_norm": 9.92924690246582, + "learning_rate": 3.5132169576059857e-06, + "loss": 0.3407, + "step": 66140 + }, + { + "epoch": 16.496259351620946, + "grad_norm": 8.693164825439453, + "learning_rate": 3.51072319201995e-06, + "loss": 0.3044, + "step": 66150 + }, + { + "epoch": 16.49875311720698, + "grad_norm": 14.11042594909668, + "learning_rate": 3.5082294264339154e-06, + "loss": 0.3371, + "step": 66160 + }, + { + "epoch": 16.50124688279302, + "grad_norm": 8.329866409301758, + "learning_rate": 3.5057356608478803e-06, + "loss": 0.3241, + "step": 66170 + }, + { + "epoch": 16.503740648379054, + "grad_norm": 7.5446553230285645, + "learning_rate": 3.5032418952618456e-06, + "loss": 0.3096, + "step": 66180 + }, + { + "epoch": 16.50623441396509, + "grad_norm": 5.738336563110352, + "learning_rate": 3.5007481296758105e-06, + "loss": 0.3996, + "step": 66190 + }, + { + "epoch": 16.508728179551124, + "grad_norm": 6.674221515655518, + "learning_rate": 3.498254364089776e-06, + "loss": 0.327, + "step": 66200 + }, + { + "epoch": 16.51122194513716, + "grad_norm": 6.403293132781982, + "learning_rate": 3.4957605985037407e-06, + "loss": 0.3152, + "step": 66210 + }, + { + "epoch": 16.513715710723194, + "grad_norm": 9.35428237915039, + "learning_rate": 3.493266832917706e-06, + "loss": 0.3089, + "step": 66220 + }, + { + "epoch": 16.51620947630923, + "grad_norm": 7.628061771392822, + "learning_rate": 3.490773067331671e-06, + "loss": 0.3204, + "step": 66230 + }, + { + "epoch": 16.518703241895263, + "grad_norm": 12.166457176208496, + "learning_rate": 3.4882793017456362e-06, + "loss": 0.354, + "step": 66240 + }, + { + "epoch": 16.521197007481298, + "grad_norm": 10.643746376037598, + "learning_rate": 3.485785536159601e-06, + "loss": 0.3516, + "step": 66250 + }, + { + "epoch": 16.523690773067333, + "grad_norm": 6.07993745803833, + "learning_rate": 3.4832917705735664e-06, + "loss": 0.3867, + "step": 66260 + }, + { + "epoch": 16.526184538653368, + "grad_norm": 7.434722423553467, + "learning_rate": 3.4807980049875317e-06, + "loss": 0.3612, + "step": 66270 + }, + { + "epoch": 16.528678304239403, + "grad_norm": 7.824409484863281, + "learning_rate": 3.4783042394014966e-06, + "loss": 0.3013, + "step": 66280 + }, + { + "epoch": 16.531172069825438, + "grad_norm": 10.760315895080566, + "learning_rate": 3.475810473815462e-06, + "loss": 0.3326, + "step": 66290 + }, + { + "epoch": 16.533665835411473, + "grad_norm": 9.23198413848877, + "learning_rate": 3.473316708229427e-06, + "loss": 0.4361, + "step": 66300 + }, + { + "epoch": 16.536159600997507, + "grad_norm": 7.750860214233398, + "learning_rate": 3.470822942643392e-06, + "loss": 0.3055, + "step": 66310 + }, + { + "epoch": 16.538653366583542, + "grad_norm": 7.906196594238281, + "learning_rate": 3.468329177057357e-06, + "loss": 0.282, + "step": 66320 + }, + { + "epoch": 16.541147132169577, + "grad_norm": 8.782937049865723, + "learning_rate": 3.4658354114713223e-06, + "loss": 0.3205, + "step": 66330 + }, + { + "epoch": 16.543640897755612, + "grad_norm": 10.13814926147461, + "learning_rate": 3.4633416458852868e-06, + "loss": 0.3365, + "step": 66340 + }, + { + "epoch": 16.546134663341647, + "grad_norm": 9.947074890136719, + "learning_rate": 3.4608478802992525e-06, + "loss": 0.3592, + "step": 66350 + }, + { + "epoch": 16.54862842892768, + "grad_norm": 12.302644729614258, + "learning_rate": 3.458354114713217e-06, + "loss": 0.3738, + "step": 66360 + }, + { + "epoch": 16.551122194513717, + "grad_norm": 8.22531509399414, + "learning_rate": 3.4558603491271823e-06, + "loss": 0.3024, + "step": 66370 + }, + { + "epoch": 16.55361596009975, + "grad_norm": 8.880837440490723, + "learning_rate": 3.453366583541147e-06, + "loss": 0.2816, + "step": 66380 + }, + { + "epoch": 16.556109725685786, + "grad_norm": 7.537718296051025, + "learning_rate": 3.4508728179551125e-06, + "loss": 0.3165, + "step": 66390 + }, + { + "epoch": 16.55860349127182, + "grad_norm": 11.703903198242188, + "learning_rate": 3.4483790523690774e-06, + "loss": 0.4277, + "step": 66400 + }, + { + "epoch": 16.561097256857856, + "grad_norm": 13.591885566711426, + "learning_rate": 3.4458852867830427e-06, + "loss": 0.2994, + "step": 66410 + }, + { + "epoch": 16.56359102244389, + "grad_norm": 8.424422264099121, + "learning_rate": 3.4433915211970076e-06, + "loss": 0.3531, + "step": 66420 + }, + { + "epoch": 16.566084788029926, + "grad_norm": 5.670681953430176, + "learning_rate": 3.440897755610973e-06, + "loss": 0.3566, + "step": 66430 + }, + { + "epoch": 16.56857855361596, + "grad_norm": 13.465998649597168, + "learning_rate": 3.4384039900249378e-06, + "loss": 0.297, + "step": 66440 + }, + { + "epoch": 16.571072319201996, + "grad_norm": 6.211828708648682, + "learning_rate": 3.435910224438903e-06, + "loss": 0.2967, + "step": 66450 + }, + { + "epoch": 16.57356608478803, + "grad_norm": 8.583992958068848, + "learning_rate": 3.433416458852868e-06, + "loss": 0.2802, + "step": 66460 + }, + { + "epoch": 16.576059850374065, + "grad_norm": 6.500486373901367, + "learning_rate": 3.4309226932668333e-06, + "loss": 0.3138, + "step": 66470 + }, + { + "epoch": 16.5785536159601, + "grad_norm": 9.81982707977295, + "learning_rate": 3.428428927680798e-06, + "loss": 0.3349, + "step": 66480 + }, + { + "epoch": 16.581047381546135, + "grad_norm": 9.09636402130127, + "learning_rate": 3.4259351620947635e-06, + "loss": 0.2904, + "step": 66490 + }, + { + "epoch": 16.58354114713217, + "grad_norm": 7.976047039031982, + "learning_rate": 3.4234413965087283e-06, + "loss": 0.3574, + "step": 66500 + }, + { + "epoch": 16.586034912718205, + "grad_norm": 6.352416038513184, + "learning_rate": 3.4209476309226937e-06, + "loss": 0.3534, + "step": 66510 + }, + { + "epoch": 16.58852867830424, + "grad_norm": 7.957899570465088, + "learning_rate": 3.418453865336659e-06, + "loss": 0.417, + "step": 66520 + }, + { + "epoch": 16.591022443890274, + "grad_norm": 7.401804447174072, + "learning_rate": 3.415960099750624e-06, + "loss": 0.3865, + "step": 66530 + }, + { + "epoch": 16.59351620947631, + "grad_norm": 10.400482177734375, + "learning_rate": 3.413466334164589e-06, + "loss": 0.3071, + "step": 66540 + }, + { + "epoch": 16.596009975062344, + "grad_norm": 8.039289474487305, + "learning_rate": 3.410972568578554e-06, + "loss": 0.3396, + "step": 66550 + }, + { + "epoch": 16.59850374064838, + "grad_norm": 6.777886390686035, + "learning_rate": 3.4084788029925194e-06, + "loss": 0.2646, + "step": 66560 + }, + { + "epoch": 16.600997506234414, + "grad_norm": 9.714869499206543, + "learning_rate": 3.405985037406484e-06, + "loss": 0.3047, + "step": 66570 + }, + { + "epoch": 16.60349127182045, + "grad_norm": 8.811583518981934, + "learning_rate": 3.4034912718204496e-06, + "loss": 0.2972, + "step": 66580 + }, + { + "epoch": 16.605985037406484, + "grad_norm": 8.421089172363281, + "learning_rate": 3.400997506234414e-06, + "loss": 0.3288, + "step": 66590 + }, + { + "epoch": 16.60847880299252, + "grad_norm": 7.008131980895996, + "learning_rate": 3.3985037406483793e-06, + "loss": 0.3063, + "step": 66600 + }, + { + "epoch": 16.610972568578553, + "grad_norm": 7.8703789710998535, + "learning_rate": 3.3960099750623442e-06, + "loss": 0.2479, + "step": 66610 + }, + { + "epoch": 16.61346633416459, + "grad_norm": 8.42544937133789, + "learning_rate": 3.3935162094763095e-06, + "loss": 0.3384, + "step": 66620 + }, + { + "epoch": 16.615960099750623, + "grad_norm": 6.888559341430664, + "learning_rate": 3.3910224438902744e-06, + "loss": 0.3679, + "step": 66630 + }, + { + "epoch": 16.618453865336658, + "grad_norm": 8.545351028442383, + "learning_rate": 3.3885286783042397e-06, + "loss": 0.3226, + "step": 66640 + }, + { + "epoch": 16.620947630922693, + "grad_norm": 7.288325309753418, + "learning_rate": 3.3860349127182046e-06, + "loss": 0.3422, + "step": 66650 + }, + { + "epoch": 16.623441396508728, + "grad_norm": 10.019088745117188, + "learning_rate": 3.38354114713217e-06, + "loss": 0.3509, + "step": 66660 + }, + { + "epoch": 16.625935162094763, + "grad_norm": 6.888891220092773, + "learning_rate": 3.381047381546135e-06, + "loss": 0.3493, + "step": 66670 + }, + { + "epoch": 16.628428927680797, + "grad_norm": 11.555747032165527, + "learning_rate": 3.3785536159601e-06, + "loss": 0.3701, + "step": 66680 + }, + { + "epoch": 16.630922693266832, + "grad_norm": 8.428287506103516, + "learning_rate": 3.376059850374065e-06, + "loss": 0.3779, + "step": 66690 + }, + { + "epoch": 16.633416458852867, + "grad_norm": 10.036828994750977, + "learning_rate": 3.3735660847880303e-06, + "loss": 0.3879, + "step": 66700 + }, + { + "epoch": 16.635910224438902, + "grad_norm": 7.905212879180908, + "learning_rate": 3.371072319201995e-06, + "loss": 0.2564, + "step": 66710 + }, + { + "epoch": 16.638403990024937, + "grad_norm": 7.539322853088379, + "learning_rate": 3.3685785536159605e-06, + "loss": 0.3989, + "step": 66720 + }, + { + "epoch": 16.640897755610972, + "grad_norm": 8.233596801757812, + "learning_rate": 3.3660847880299254e-06, + "loss": 0.3014, + "step": 66730 + }, + { + "epoch": 16.643391521197007, + "grad_norm": 7.139582633972168, + "learning_rate": 3.3635910224438907e-06, + "loss": 0.2907, + "step": 66740 + }, + { + "epoch": 16.64588528678304, + "grad_norm": 9.440067291259766, + "learning_rate": 3.361097256857855e-06, + "loss": 0.3126, + "step": 66750 + }, + { + "epoch": 16.648379052369076, + "grad_norm": 10.730769157409668, + "learning_rate": 3.358603491271821e-06, + "loss": 0.3447, + "step": 66760 + }, + { + "epoch": 16.65087281795511, + "grad_norm": 12.145304679870605, + "learning_rate": 3.356109725685786e-06, + "loss": 0.318, + "step": 66770 + }, + { + "epoch": 16.653366583541146, + "grad_norm": 13.246317863464355, + "learning_rate": 3.3536159600997507e-06, + "loss": 0.3085, + "step": 66780 + }, + { + "epoch": 16.65586034912718, + "grad_norm": 6.979218482971191, + "learning_rate": 3.3511221945137164e-06, + "loss": 0.328, + "step": 66790 + }, + { + "epoch": 16.658354114713216, + "grad_norm": 7.944563388824463, + "learning_rate": 3.348628428927681e-06, + "loss": 0.2856, + "step": 66800 + }, + { + "epoch": 16.66084788029925, + "grad_norm": 8.603992462158203, + "learning_rate": 3.346134663341646e-06, + "loss": 0.3337, + "step": 66810 + }, + { + "epoch": 16.663341645885286, + "grad_norm": 9.603078842163086, + "learning_rate": 3.343640897755611e-06, + "loss": 0.3376, + "step": 66820 + }, + { + "epoch": 16.66583541147132, + "grad_norm": 5.392066478729248, + "learning_rate": 3.3411471321695764e-06, + "loss": 0.3134, + "step": 66830 + }, + { + "epoch": 16.668329177057355, + "grad_norm": 6.164698600769043, + "learning_rate": 3.3386533665835413e-06, + "loss": 0.323, + "step": 66840 + }, + { + "epoch": 16.67082294264339, + "grad_norm": 8.777530670166016, + "learning_rate": 3.3361596009975066e-06, + "loss": 0.3715, + "step": 66850 + }, + { + "epoch": 16.673316708229425, + "grad_norm": 9.735917091369629, + "learning_rate": 3.3336658354114715e-06, + "loss": 0.3017, + "step": 66860 + }, + { + "epoch": 16.67581047381546, + "grad_norm": 5.572236061096191, + "learning_rate": 3.3311720698254368e-06, + "loss": 0.324, + "step": 66870 + }, + { + "epoch": 16.678304239401495, + "grad_norm": 8.303897857666016, + "learning_rate": 3.3286783042394016e-06, + "loss": 0.3146, + "step": 66880 + }, + { + "epoch": 16.68079800498753, + "grad_norm": 10.715437889099121, + "learning_rate": 3.326184538653367e-06, + "loss": 0.3026, + "step": 66890 + }, + { + "epoch": 16.683291770573565, + "grad_norm": 8.109511375427246, + "learning_rate": 3.323690773067332e-06, + "loss": 0.3878, + "step": 66900 + }, + { + "epoch": 16.6857855361596, + "grad_norm": 5.563307762145996, + "learning_rate": 3.321197007481297e-06, + "loss": 0.2698, + "step": 66910 + }, + { + "epoch": 16.688279301745634, + "grad_norm": 13.141777038574219, + "learning_rate": 3.318703241895262e-06, + "loss": 0.3028, + "step": 66920 + }, + { + "epoch": 16.69077306733167, + "grad_norm": 9.709403991699219, + "learning_rate": 3.3162094763092273e-06, + "loss": 0.3567, + "step": 66930 + }, + { + "epoch": 16.693266832917704, + "grad_norm": 9.365985870361328, + "learning_rate": 3.3137157107231922e-06, + "loss": 0.2793, + "step": 66940 + }, + { + "epoch": 16.69576059850374, + "grad_norm": 10.170108795166016, + "learning_rate": 3.3112219451371575e-06, + "loss": 0.2982, + "step": 66950 + }, + { + "epoch": 16.698254364089777, + "grad_norm": 5.353804588317871, + "learning_rate": 3.3087281795511224e-06, + "loss": 0.3412, + "step": 66960 + }, + { + "epoch": 16.70074812967581, + "grad_norm": 13.283953666687012, + "learning_rate": 3.3062344139650877e-06, + "loss": 0.2865, + "step": 66970 + }, + { + "epoch": 16.703241895261847, + "grad_norm": 9.242694854736328, + "learning_rate": 3.303740648379052e-06, + "loss": 0.2743, + "step": 66980 + }, + { + "epoch": 16.705735660847882, + "grad_norm": 9.898309707641602, + "learning_rate": 3.301246882793018e-06, + "loss": 0.3607, + "step": 66990 + }, + { + "epoch": 16.708229426433917, + "grad_norm": 7.244779109954834, + "learning_rate": 3.2987531172069824e-06, + "loss": 0.2872, + "step": 67000 + }, + { + "epoch": 16.71072319201995, + "grad_norm": 7.123824119567871, + "learning_rate": 3.2962593516209477e-06, + "loss": 0.2879, + "step": 67010 + }, + { + "epoch": 16.713216957605987, + "grad_norm": 11.419690132141113, + "learning_rate": 3.2937655860349134e-06, + "loss": 0.3193, + "step": 67020 + }, + { + "epoch": 16.71571072319202, + "grad_norm": 12.919515609741211, + "learning_rate": 3.291271820448878e-06, + "loss": 0.3836, + "step": 67030 + }, + { + "epoch": 16.718204488778056, + "grad_norm": 8.977980613708496, + "learning_rate": 3.2887780548628432e-06, + "loss": 0.303, + "step": 67040 + }, + { + "epoch": 16.72069825436409, + "grad_norm": 8.428853988647461, + "learning_rate": 3.286284289276808e-06, + "loss": 0.3334, + "step": 67050 + }, + { + "epoch": 16.723192019950126, + "grad_norm": 9.528043746948242, + "learning_rate": 3.2837905236907734e-06, + "loss": 0.3773, + "step": 67060 + }, + { + "epoch": 16.72568578553616, + "grad_norm": 8.615249633789062, + "learning_rate": 3.2812967581047383e-06, + "loss": 0.2869, + "step": 67070 + }, + { + "epoch": 16.728179551122196, + "grad_norm": 11.474807739257812, + "learning_rate": 3.2788029925187036e-06, + "loss": 0.3592, + "step": 67080 + }, + { + "epoch": 16.73067331670823, + "grad_norm": 7.433091640472412, + "learning_rate": 3.2763092269326685e-06, + "loss": 0.294, + "step": 67090 + }, + { + "epoch": 16.733167082294266, + "grad_norm": 9.934019088745117, + "learning_rate": 3.273815461346634e-06, + "loss": 0.3228, + "step": 67100 + }, + { + "epoch": 16.7356608478803, + "grad_norm": 8.578768730163574, + "learning_rate": 3.2713216957605987e-06, + "loss": 0.279, + "step": 67110 + }, + { + "epoch": 16.738154613466335, + "grad_norm": 7.551397800445557, + "learning_rate": 3.268827930174564e-06, + "loss": 0.3451, + "step": 67120 + }, + { + "epoch": 16.74064837905237, + "grad_norm": 7.463733196258545, + "learning_rate": 3.266334164588529e-06, + "loss": 0.3792, + "step": 67130 + }, + { + "epoch": 16.743142144638405, + "grad_norm": 9.001909255981445, + "learning_rate": 3.263840399002494e-06, + "loss": 0.3244, + "step": 67140 + }, + { + "epoch": 16.74563591022444, + "grad_norm": 10.928662300109863, + "learning_rate": 3.261346633416459e-06, + "loss": 0.3328, + "step": 67150 + }, + { + "epoch": 16.748129675810475, + "grad_norm": 5.639915466308594, + "learning_rate": 3.2588528678304244e-06, + "loss": 0.3326, + "step": 67160 + }, + { + "epoch": 16.75062344139651, + "grad_norm": 10.53235912322998, + "learning_rate": 3.2563591022443893e-06, + "loss": 0.361, + "step": 67170 + }, + { + "epoch": 16.753117206982544, + "grad_norm": 5.4268951416015625, + "learning_rate": 3.2538653366583546e-06, + "loss": 0.3213, + "step": 67180 + }, + { + "epoch": 16.75561097256858, + "grad_norm": 10.269506454467773, + "learning_rate": 3.2513715710723195e-06, + "loss": 0.3346, + "step": 67190 + }, + { + "epoch": 16.758104738154614, + "grad_norm": 21.99469757080078, + "learning_rate": 3.2488778054862848e-06, + "loss": 0.3419, + "step": 67200 + }, + { + "epoch": 16.76059850374065, + "grad_norm": 6.769282341003418, + "learning_rate": 3.2463840399002492e-06, + "loss": 0.321, + "step": 67210 + }, + { + "epoch": 16.763092269326684, + "grad_norm": 5.740752220153809, + "learning_rate": 3.243890274314215e-06, + "loss": 0.3051, + "step": 67220 + }, + { + "epoch": 16.76558603491272, + "grad_norm": 9.068950653076172, + "learning_rate": 3.2413965087281794e-06, + "loss": 0.4055, + "step": 67230 + }, + { + "epoch": 16.768079800498754, + "grad_norm": 11.04249095916748, + "learning_rate": 3.2389027431421448e-06, + "loss": 0.336, + "step": 67240 + }, + { + "epoch": 16.77057356608479, + "grad_norm": 9.966495513916016, + "learning_rate": 3.2364089775561096e-06, + "loss": 0.3491, + "step": 67250 + }, + { + "epoch": 16.773067331670823, + "grad_norm": 11.092144012451172, + "learning_rate": 3.233915211970075e-06, + "loss": 0.2806, + "step": 67260 + }, + { + "epoch": 16.77556109725686, + "grad_norm": 8.785165786743164, + "learning_rate": 3.2314214463840403e-06, + "loss": 0.3081, + "step": 67270 + }, + { + "epoch": 16.778054862842893, + "grad_norm": 9.333272933959961, + "learning_rate": 3.228927680798005e-06, + "loss": 0.364, + "step": 67280 + }, + { + "epoch": 16.780548628428928, + "grad_norm": 8.067471504211426, + "learning_rate": 3.2264339152119705e-06, + "loss": 0.3608, + "step": 67290 + }, + { + "epoch": 16.783042394014963, + "grad_norm": 4.908388614654541, + "learning_rate": 3.2239401496259353e-06, + "loss": 0.3417, + "step": 67300 + }, + { + "epoch": 16.785536159600998, + "grad_norm": 10.774919509887695, + "learning_rate": 3.2214463840399006e-06, + "loss": 0.3317, + "step": 67310 + }, + { + "epoch": 16.788029925187033, + "grad_norm": 19.7656192779541, + "learning_rate": 3.2189526184538655e-06, + "loss": 0.3345, + "step": 67320 + }, + { + "epoch": 16.790523690773068, + "grad_norm": 8.48336124420166, + "learning_rate": 3.216458852867831e-06, + "loss": 0.3249, + "step": 67330 + }, + { + "epoch": 16.793017456359102, + "grad_norm": 7.753775119781494, + "learning_rate": 3.2139650872817957e-06, + "loss": 0.295, + "step": 67340 + }, + { + "epoch": 16.795511221945137, + "grad_norm": 7.943778038024902, + "learning_rate": 3.211471321695761e-06, + "loss": 0.356, + "step": 67350 + }, + { + "epoch": 16.798004987531172, + "grad_norm": 8.604585647583008, + "learning_rate": 3.208977556109726e-06, + "loss": 0.3197, + "step": 67360 + }, + { + "epoch": 16.800498753117207, + "grad_norm": 7.526785373687744, + "learning_rate": 3.2064837905236912e-06, + "loss": 0.3185, + "step": 67370 + }, + { + "epoch": 16.802992518703242, + "grad_norm": 9.398643493652344, + "learning_rate": 3.203990024937656e-06, + "loss": 0.3266, + "step": 67380 + }, + { + "epoch": 16.805486284289277, + "grad_norm": 9.048074722290039, + "learning_rate": 3.2014962593516214e-06, + "loss": 0.3011, + "step": 67390 + }, + { + "epoch": 16.80798004987531, + "grad_norm": 8.05639362335205, + "learning_rate": 3.1990024937655863e-06, + "loss": 0.3413, + "step": 67400 + }, + { + "epoch": 16.810473815461346, + "grad_norm": 8.279003143310547, + "learning_rate": 3.1965087281795516e-06, + "loss": 0.3302, + "step": 67410 + }, + { + "epoch": 16.81296758104738, + "grad_norm": 10.437573432922363, + "learning_rate": 3.194014962593516e-06, + "loss": 0.3326, + "step": 67420 + }, + { + "epoch": 16.815461346633416, + "grad_norm": 10.088639259338379, + "learning_rate": 3.191521197007482e-06, + "loss": 0.2668, + "step": 67430 + }, + { + "epoch": 16.81795511221945, + "grad_norm": 11.739412307739258, + "learning_rate": 3.1890274314214463e-06, + "loss": 0.2719, + "step": 67440 + }, + { + "epoch": 16.820448877805486, + "grad_norm": 7.2401604652404785, + "learning_rate": 3.1865336658354116e-06, + "loss": 0.3168, + "step": 67450 + }, + { + "epoch": 16.82294264339152, + "grad_norm": 9.88704776763916, + "learning_rate": 3.1840399002493765e-06, + "loss": 0.4028, + "step": 67460 + }, + { + "epoch": 16.825436408977556, + "grad_norm": 10.055147171020508, + "learning_rate": 3.181546134663342e-06, + "loss": 0.3385, + "step": 67470 + }, + { + "epoch": 16.82793017456359, + "grad_norm": 11.214886665344238, + "learning_rate": 3.1790523690773067e-06, + "loss": 0.3381, + "step": 67480 + }, + { + "epoch": 16.830423940149625, + "grad_norm": 9.053804397583008, + "learning_rate": 3.176558603491272e-06, + "loss": 0.3107, + "step": 67490 + }, + { + "epoch": 16.83291770573566, + "grad_norm": 9.388381004333496, + "learning_rate": 3.174064837905237e-06, + "loss": 0.3519, + "step": 67500 + }, + { + "epoch": 16.835411471321695, + "grad_norm": 11.288336753845215, + "learning_rate": 3.171571072319202e-06, + "loss": 0.3049, + "step": 67510 + }, + { + "epoch": 16.83790523690773, + "grad_norm": 11.12149429321289, + "learning_rate": 3.1690773067331675e-06, + "loss": 0.3766, + "step": 67520 + }, + { + "epoch": 16.840399002493765, + "grad_norm": 6.969629287719727, + "learning_rate": 3.1665835411471324e-06, + "loss": 0.3209, + "step": 67530 + }, + { + "epoch": 16.8428927680798, + "grad_norm": 9.759892463684082, + "learning_rate": 3.1640897755610977e-06, + "loss": 0.2822, + "step": 67540 + }, + { + "epoch": 16.845386533665835, + "grad_norm": 9.30257797241211, + "learning_rate": 3.1615960099750626e-06, + "loss": 0.3563, + "step": 67550 + }, + { + "epoch": 16.84788029925187, + "grad_norm": 7.607780933380127, + "learning_rate": 3.159102244389028e-06, + "loss": 0.352, + "step": 67560 + }, + { + "epoch": 16.850374064837904, + "grad_norm": 6.663292407989502, + "learning_rate": 3.1566084788029928e-06, + "loss": 0.3008, + "step": 67570 + }, + { + "epoch": 16.85286783042394, + "grad_norm": 9.025524139404297, + "learning_rate": 3.154114713216958e-06, + "loss": 0.3307, + "step": 67580 + }, + { + "epoch": 16.855361596009974, + "grad_norm": 7.929129123687744, + "learning_rate": 3.151620947630923e-06, + "loss": 0.3064, + "step": 67590 + }, + { + "epoch": 16.85785536159601, + "grad_norm": 6.258866786956787, + "learning_rate": 3.1491271820448883e-06, + "loss": 0.2586, + "step": 67600 + }, + { + "epoch": 16.860349127182044, + "grad_norm": 5.975445747375488, + "learning_rate": 3.146633416458853e-06, + "loss": 0.2883, + "step": 67610 + }, + { + "epoch": 16.86284289276808, + "grad_norm": 6.100348472595215, + "learning_rate": 3.1441396508728185e-06, + "loss": 0.3816, + "step": 67620 + }, + { + "epoch": 16.865336658354114, + "grad_norm": 8.461503028869629, + "learning_rate": 3.1416458852867834e-06, + "loss": 0.3295, + "step": 67630 + }, + { + "epoch": 16.86783042394015, + "grad_norm": 11.094927787780762, + "learning_rate": 3.1391521197007487e-06, + "loss": 0.3346, + "step": 67640 + }, + { + "epoch": 16.870324189526183, + "grad_norm": 8.778679847717285, + "learning_rate": 3.136658354114713e-06, + "loss": 0.3169, + "step": 67650 + }, + { + "epoch": 16.872817955112218, + "grad_norm": 8.165081024169922, + "learning_rate": 3.134164588528679e-06, + "loss": 0.3515, + "step": 67660 + }, + { + "epoch": 16.875311720698253, + "grad_norm": 8.707517623901367, + "learning_rate": 3.1316708229426433e-06, + "loss": 0.3255, + "step": 67670 + }, + { + "epoch": 16.877805486284288, + "grad_norm": 12.160292625427246, + "learning_rate": 3.1291770573566086e-06, + "loss": 0.3691, + "step": 67680 + }, + { + "epoch": 16.880299251870323, + "grad_norm": 7.535975933074951, + "learning_rate": 3.1266832917705735e-06, + "loss": 0.3717, + "step": 67690 + }, + { + "epoch": 16.882793017456358, + "grad_norm": 9.464982032775879, + "learning_rate": 3.124189526184539e-06, + "loss": 0.3114, + "step": 67700 + }, + { + "epoch": 16.885286783042392, + "grad_norm": 6.380134582519531, + "learning_rate": 3.1216957605985037e-06, + "loss": 0.2951, + "step": 67710 + }, + { + "epoch": 16.887780548628427, + "grad_norm": 11.643170356750488, + "learning_rate": 3.119201995012469e-06, + "loss": 0.3128, + "step": 67720 + }, + { + "epoch": 16.890274314214462, + "grad_norm": 12.784978866577148, + "learning_rate": 3.116708229426434e-06, + "loss": 0.3273, + "step": 67730 + }, + { + "epoch": 16.892768079800497, + "grad_norm": 9.829355239868164, + "learning_rate": 3.1142144638403992e-06, + "loss": 0.3061, + "step": 67740 + }, + { + "epoch": 16.895261845386532, + "grad_norm": 10.183358192443848, + "learning_rate": 3.111720698254364e-06, + "loss": 0.3194, + "step": 67750 + }, + { + "epoch": 16.897755610972567, + "grad_norm": 9.071417808532715, + "learning_rate": 3.1092269326683294e-06, + "loss": 0.2804, + "step": 67760 + }, + { + "epoch": 16.900249376558605, + "grad_norm": 10.647289276123047, + "learning_rate": 3.1067331670822947e-06, + "loss": 0.3488, + "step": 67770 + }, + { + "epoch": 16.902743142144637, + "grad_norm": 9.39749813079834, + "learning_rate": 3.1042394014962596e-06, + "loss": 0.335, + "step": 67780 + }, + { + "epoch": 16.905236907730675, + "grad_norm": 9.534618377685547, + "learning_rate": 3.101745635910225e-06, + "loss": 0.2999, + "step": 67790 + }, + { + "epoch": 16.90773067331671, + "grad_norm": 13.632329940795898, + "learning_rate": 3.09925187032419e-06, + "loss": 0.3342, + "step": 67800 + }, + { + "epoch": 16.910224438902745, + "grad_norm": 7.592889308929443, + "learning_rate": 3.096758104738155e-06, + "loss": 0.2828, + "step": 67810 + }, + { + "epoch": 16.91271820448878, + "grad_norm": 6.196469783782959, + "learning_rate": 3.09426433915212e-06, + "loss": 0.3377, + "step": 67820 + }, + { + "epoch": 16.915211970074814, + "grad_norm": 9.526688575744629, + "learning_rate": 3.0917705735660853e-06, + "loss": 0.2684, + "step": 67830 + }, + { + "epoch": 16.91770573566085, + "grad_norm": 7.148454189300537, + "learning_rate": 3.08927680798005e-06, + "loss": 0.3152, + "step": 67840 + }, + { + "epoch": 16.920199501246884, + "grad_norm": 7.666889190673828, + "learning_rate": 3.0867830423940155e-06, + "loss": 0.3181, + "step": 67850 + }, + { + "epoch": 16.92269326683292, + "grad_norm": 6.633835792541504, + "learning_rate": 3.08428927680798e-06, + "loss": 0.3662, + "step": 67860 + }, + { + "epoch": 16.925187032418954, + "grad_norm": 9.805349349975586, + "learning_rate": 3.0817955112219457e-06, + "loss": 0.3224, + "step": 67870 + }, + { + "epoch": 16.92768079800499, + "grad_norm": 25.773767471313477, + "learning_rate": 3.07930174563591e-06, + "loss": 0.3182, + "step": 67880 + }, + { + "epoch": 16.930174563591024, + "grad_norm": 5.862217426300049, + "learning_rate": 3.0768079800498755e-06, + "loss": 0.362, + "step": 67890 + }, + { + "epoch": 16.93266832917706, + "grad_norm": 6.981509208679199, + "learning_rate": 3.0743142144638404e-06, + "loss": 0.2904, + "step": 67900 + }, + { + "epoch": 16.935162094763093, + "grad_norm": 7.372140407562256, + "learning_rate": 3.0718204488778057e-06, + "loss": 0.3158, + "step": 67910 + }, + { + "epoch": 16.93765586034913, + "grad_norm": 6.7870612144470215, + "learning_rate": 3.0693266832917706e-06, + "loss": 0.323, + "step": 67920 + }, + { + "epoch": 16.940149625935163, + "grad_norm": 11.441544532775879, + "learning_rate": 3.066832917705736e-06, + "loss": 0.3994, + "step": 67930 + }, + { + "epoch": 16.942643391521198, + "grad_norm": 7.740766525268555, + "learning_rate": 3.0643391521197008e-06, + "loss": 0.4086, + "step": 67940 + }, + { + "epoch": 16.945137157107233, + "grad_norm": 10.935362815856934, + "learning_rate": 3.061845386533666e-06, + "loss": 0.32, + "step": 67950 + }, + { + "epoch": 16.947630922693268, + "grad_norm": 9.322646141052246, + "learning_rate": 3.059351620947631e-06, + "loss": 0.3604, + "step": 67960 + }, + { + "epoch": 16.950124688279303, + "grad_norm": 8.889856338500977, + "learning_rate": 3.0568578553615963e-06, + "loss": 0.3302, + "step": 67970 + }, + { + "epoch": 16.952618453865338, + "grad_norm": 12.106937408447266, + "learning_rate": 3.054364089775561e-06, + "loss": 0.2517, + "step": 67980 + }, + { + "epoch": 16.955112219451372, + "grad_norm": 9.038848876953125, + "learning_rate": 3.0518703241895265e-06, + "loss": 0.3813, + "step": 67990 + }, + { + "epoch": 16.957605985037407, + "grad_norm": 10.314130783081055, + "learning_rate": 3.0493765586034918e-06, + "loss": 0.3414, + "step": 68000 + }, + { + "epoch": 16.960099750623442, + "grad_norm": 6.741665840148926, + "learning_rate": 3.0468827930174567e-06, + "loss": 0.3446, + "step": 68010 + }, + { + "epoch": 16.962593516209477, + "grad_norm": 7.708677768707275, + "learning_rate": 3.044389027431422e-06, + "loss": 0.2833, + "step": 68020 + }, + { + "epoch": 16.965087281795512, + "grad_norm": 7.930616855621338, + "learning_rate": 3.041895261845387e-06, + "loss": 0.2681, + "step": 68030 + }, + { + "epoch": 16.967581047381547, + "grad_norm": 6.562017440795898, + "learning_rate": 3.039401496259352e-06, + "loss": 0.3424, + "step": 68040 + }, + { + "epoch": 16.97007481296758, + "grad_norm": 9.841915130615234, + "learning_rate": 3.036907730673317e-06, + "loss": 0.3586, + "step": 68050 + }, + { + "epoch": 16.972568578553616, + "grad_norm": 13.928132057189941, + "learning_rate": 3.0344139650872824e-06, + "loss": 0.3287, + "step": 68060 + }, + { + "epoch": 16.97506234413965, + "grad_norm": 6.605895042419434, + "learning_rate": 3.0319201995012472e-06, + "loss": 0.3126, + "step": 68070 + }, + { + "epoch": 16.977556109725686, + "grad_norm": 8.77118968963623, + "learning_rate": 3.0294264339152126e-06, + "loss": 0.345, + "step": 68080 + }, + { + "epoch": 16.98004987531172, + "grad_norm": 10.565832138061523, + "learning_rate": 3.026932668329177e-06, + "loss": 0.3402, + "step": 68090 + }, + { + "epoch": 16.982543640897756, + "grad_norm": 7.508286952972412, + "learning_rate": 3.0244389027431428e-06, + "loss": 0.3443, + "step": 68100 + }, + { + "epoch": 16.98503740648379, + "grad_norm": 7.770997524261475, + "learning_rate": 3.0219451371571072e-06, + "loss": 0.3283, + "step": 68110 + }, + { + "epoch": 16.987531172069826, + "grad_norm": 10.09821891784668, + "learning_rate": 3.0194513715710725e-06, + "loss": 0.3236, + "step": 68120 + }, + { + "epoch": 16.99002493765586, + "grad_norm": 6.971126079559326, + "learning_rate": 3.0169576059850374e-06, + "loss": 0.3546, + "step": 68130 + }, + { + "epoch": 16.992518703241895, + "grad_norm": 8.345663070678711, + "learning_rate": 3.0144638403990027e-06, + "loss": 0.3018, + "step": 68140 + }, + { + "epoch": 16.99501246882793, + "grad_norm": 8.50364875793457, + "learning_rate": 3.0119700748129676e-06, + "loss": 0.342, + "step": 68150 + }, + { + "epoch": 16.997506234413965, + "grad_norm": 7.714181423187256, + "learning_rate": 3.009476309226933e-06, + "loss": 0.2599, + "step": 68160 + }, + { + "epoch": 17.0, + "grad_norm": 9.013660430908203, + "learning_rate": 3.006982543640898e-06, + "loss": 0.2809, + "step": 68170 + }, + { + "epoch": 17.0, + "eval_loss": 0.4180484116077423, + "eval_runtime": 59.928, + "eval_samples_per_second": 16.737, + "eval_steps_per_second": 16.737, + "step": 68170 + }, + { + "epoch": 17.002493765586035, + "grad_norm": 9.489847183227539, + "learning_rate": 3.004488778054863e-06, + "loss": 0.3557, + "step": 68180 + }, + { + "epoch": 17.00498753117207, + "grad_norm": 7.423935890197754, + "learning_rate": 3.001995012468828e-06, + "loss": 0.3209, + "step": 68190 + }, + { + "epoch": 17.007481296758105, + "grad_norm": 5.265260219573975, + "learning_rate": 2.9995012468827933e-06, + "loss": 0.2767, + "step": 68200 + }, + { + "epoch": 17.00997506234414, + "grad_norm": 6.882249355316162, + "learning_rate": 2.997007481296758e-06, + "loss": 0.2949, + "step": 68210 + }, + { + "epoch": 17.012468827930174, + "grad_norm": 10.994675636291504, + "learning_rate": 2.9945137157107235e-06, + "loss": 0.2567, + "step": 68220 + }, + { + "epoch": 17.01496259351621, + "grad_norm": 6.603150367736816, + "learning_rate": 2.9920199501246884e-06, + "loss": 0.2465, + "step": 68230 + }, + { + "epoch": 17.017456359102244, + "grad_norm": 7.306191921234131, + "learning_rate": 2.9895261845386537e-06, + "loss": 0.3047, + "step": 68240 + }, + { + "epoch": 17.01995012468828, + "grad_norm": 6.446793556213379, + "learning_rate": 2.987032418952619e-06, + "loss": 0.3648, + "step": 68250 + }, + { + "epoch": 17.022443890274314, + "grad_norm": 9.686697006225586, + "learning_rate": 2.984538653366584e-06, + "loss": 0.4001, + "step": 68260 + }, + { + "epoch": 17.02493765586035, + "grad_norm": 8.045364379882812, + "learning_rate": 2.982044887780549e-06, + "loss": 0.3045, + "step": 68270 + }, + { + "epoch": 17.027431421446384, + "grad_norm": 11.373425483703613, + "learning_rate": 2.979551122194514e-06, + "loss": 0.3371, + "step": 68280 + }, + { + "epoch": 17.02992518703242, + "grad_norm": 9.176929473876953, + "learning_rate": 2.9770573566084794e-06, + "loss": 0.3362, + "step": 68290 + }, + { + "epoch": 17.032418952618453, + "grad_norm": 9.755634307861328, + "learning_rate": 2.974563591022444e-06, + "loss": 0.3246, + "step": 68300 + }, + { + "epoch": 17.034912718204488, + "grad_norm": 10.913490295410156, + "learning_rate": 2.9720698254364096e-06, + "loss": 0.3034, + "step": 68310 + }, + { + "epoch": 17.037406483790523, + "grad_norm": 7.916656494140625, + "learning_rate": 2.969576059850374e-06, + "loss": 0.2902, + "step": 68320 + }, + { + "epoch": 17.039900249376558, + "grad_norm": 12.00803279876709, + "learning_rate": 2.9670822942643394e-06, + "loss": 0.3386, + "step": 68330 + }, + { + "epoch": 17.042394014962593, + "grad_norm": 8.119451522827148, + "learning_rate": 2.9645885286783043e-06, + "loss": 0.3176, + "step": 68340 + }, + { + "epoch": 17.044887780548628, + "grad_norm": 11.683120727539062, + "learning_rate": 2.9620947630922696e-06, + "loss": 0.3665, + "step": 68350 + }, + { + "epoch": 17.047381546134662, + "grad_norm": 8.209905624389648, + "learning_rate": 2.9596009975062345e-06, + "loss": 0.3437, + "step": 68360 + }, + { + "epoch": 17.049875311720697, + "grad_norm": 9.127650260925293, + "learning_rate": 2.9571072319201998e-06, + "loss": 0.4074, + "step": 68370 + }, + { + "epoch": 17.052369077306732, + "grad_norm": 8.173192977905273, + "learning_rate": 2.9546134663341646e-06, + "loss": 0.2907, + "step": 68380 + }, + { + "epoch": 17.054862842892767, + "grad_norm": 8.60781192779541, + "learning_rate": 2.95211970074813e-06, + "loss": 0.3102, + "step": 68390 + }, + { + "epoch": 17.057356608478802, + "grad_norm": 12.1088228225708, + "learning_rate": 2.949625935162095e-06, + "loss": 0.3105, + "step": 68400 + }, + { + "epoch": 17.059850374064837, + "grad_norm": 6.979897499084473, + "learning_rate": 2.94713216957606e-06, + "loss": 0.3078, + "step": 68410 + }, + { + "epoch": 17.06234413965087, + "grad_norm": 9.642083168029785, + "learning_rate": 2.944638403990025e-06, + "loss": 0.3219, + "step": 68420 + }, + { + "epoch": 17.064837905236907, + "grad_norm": 7.635913848876953, + "learning_rate": 2.9421446384039903e-06, + "loss": 0.3145, + "step": 68430 + }, + { + "epoch": 17.06733167082294, + "grad_norm": 7.816982269287109, + "learning_rate": 2.9396508728179552e-06, + "loss": 0.3144, + "step": 68440 + }, + { + "epoch": 17.069825436408976, + "grad_norm": 9.944385528564453, + "learning_rate": 2.9371571072319205e-06, + "loss": 0.3679, + "step": 68450 + }, + { + "epoch": 17.07231920199501, + "grad_norm": 7.051234722137451, + "learning_rate": 2.9346633416458854e-06, + "loss": 0.3836, + "step": 68460 + }, + { + "epoch": 17.074812967581046, + "grad_norm": 6.5841965675354, + "learning_rate": 2.9321695760598507e-06, + "loss": 0.3354, + "step": 68470 + }, + { + "epoch": 17.07730673316708, + "grad_norm": 7.379906177520752, + "learning_rate": 2.9296758104738156e-06, + "loss": 0.3299, + "step": 68480 + }, + { + "epoch": 17.079800498753116, + "grad_norm": 6.527453899383545, + "learning_rate": 2.927182044887781e-06, + "loss": 0.2663, + "step": 68490 + }, + { + "epoch": 17.08229426433915, + "grad_norm": 9.604272842407227, + "learning_rate": 2.9246882793017462e-06, + "loss": 0.3929, + "step": 68500 + }, + { + "epoch": 17.084788029925186, + "grad_norm": 13.529383659362793, + "learning_rate": 2.922194513715711e-06, + "loss": 0.2815, + "step": 68510 + }, + { + "epoch": 17.08728179551122, + "grad_norm": 8.98625373840332, + "learning_rate": 2.9197007481296764e-06, + "loss": 0.303, + "step": 68520 + }, + { + "epoch": 17.089775561097255, + "grad_norm": 8.426309585571289, + "learning_rate": 2.917206982543641e-06, + "loss": 0.3067, + "step": 68530 + }, + { + "epoch": 17.09226932668329, + "grad_norm": 11.854998588562012, + "learning_rate": 2.9147132169576066e-06, + "loss": 0.3384, + "step": 68540 + }, + { + "epoch": 17.094763092269325, + "grad_norm": 7.317206382751465, + "learning_rate": 2.912219451371571e-06, + "loss": 0.3036, + "step": 68550 + }, + { + "epoch": 17.09725685785536, + "grad_norm": 7.078338146209717, + "learning_rate": 2.9097256857855364e-06, + "loss": 0.3772, + "step": 68560 + }, + { + "epoch": 17.099750623441395, + "grad_norm": 3.3603427410125732, + "learning_rate": 2.9072319201995013e-06, + "loss": 0.3848, + "step": 68570 + }, + { + "epoch": 17.102244389027433, + "grad_norm": 8.076667785644531, + "learning_rate": 2.9047381546134666e-06, + "loss": 0.3636, + "step": 68580 + }, + { + "epoch": 17.104738154613468, + "grad_norm": 8.68441104888916, + "learning_rate": 2.9022443890274315e-06, + "loss": 0.3029, + "step": 68590 + }, + { + "epoch": 17.107231920199503, + "grad_norm": 15.482074737548828, + "learning_rate": 2.899750623441397e-06, + "loss": 0.3457, + "step": 68600 + }, + { + "epoch": 17.109725685785538, + "grad_norm": 8.531909942626953, + "learning_rate": 2.8972568578553617e-06, + "loss": 0.3074, + "step": 68610 + }, + { + "epoch": 17.112219451371573, + "grad_norm": 6.706440448760986, + "learning_rate": 2.894763092269327e-06, + "loss": 0.3379, + "step": 68620 + }, + { + "epoch": 17.114713216957608, + "grad_norm": 8.066004753112793, + "learning_rate": 2.892269326683292e-06, + "loss": 0.3175, + "step": 68630 + }, + { + "epoch": 17.117206982543642, + "grad_norm": 7.285860061645508, + "learning_rate": 2.889775561097257e-06, + "loss": 0.2926, + "step": 68640 + }, + { + "epoch": 17.119700748129677, + "grad_norm": 11.93729019165039, + "learning_rate": 2.887531172069826e-06, + "loss": 0.4289, + "step": 68650 + }, + { + "epoch": 17.122194513715712, + "grad_norm": 9.996557235717773, + "learning_rate": 2.8850374064837904e-06, + "loss": 0.2979, + "step": 68660 + }, + { + "epoch": 17.124688279301747, + "grad_norm": 10.025423049926758, + "learning_rate": 2.8825436408977557e-06, + "loss": 0.3345, + "step": 68670 + }, + { + "epoch": 17.127182044887782, + "grad_norm": 7.561511993408203, + "learning_rate": 2.8800498753117214e-06, + "loss": 0.3385, + "step": 68680 + }, + { + "epoch": 17.129675810473817, + "grad_norm": 6.57540225982666, + "learning_rate": 2.877556109725686e-06, + "loss": 0.2567, + "step": 68690 + }, + { + "epoch": 17.13216957605985, + "grad_norm": 11.776047706604004, + "learning_rate": 2.875062344139651e-06, + "loss": 0.3308, + "step": 68700 + }, + { + "epoch": 17.134663341645886, + "grad_norm": 6.203807353973389, + "learning_rate": 2.872568578553616e-06, + "loss": 0.2472, + "step": 68710 + }, + { + "epoch": 17.13715710723192, + "grad_norm": 9.786406517028809, + "learning_rate": 2.8700748129675814e-06, + "loss": 0.3577, + "step": 68720 + }, + { + "epoch": 17.139650872817956, + "grad_norm": 6.93914794921875, + "learning_rate": 2.8675810473815462e-06, + "loss": 0.2713, + "step": 68730 + }, + { + "epoch": 17.14214463840399, + "grad_norm": 7.5632500648498535, + "learning_rate": 2.8650872817955116e-06, + "loss": 0.2773, + "step": 68740 + }, + { + "epoch": 17.144638403990026, + "grad_norm": 6.6398234367370605, + "learning_rate": 2.8625935162094764e-06, + "loss": 0.3466, + "step": 68750 + }, + { + "epoch": 17.14713216957606, + "grad_norm": 5.831292152404785, + "learning_rate": 2.8600997506234418e-06, + "loss": 0.348, + "step": 68760 + }, + { + "epoch": 17.149625935162096, + "grad_norm": 11.87356185913086, + "learning_rate": 2.8576059850374066e-06, + "loss": 0.2968, + "step": 68770 + }, + { + "epoch": 17.15211970074813, + "grad_norm": 10.019725799560547, + "learning_rate": 2.855112219451372e-06, + "loss": 0.3226, + "step": 68780 + }, + { + "epoch": 17.154613466334165, + "grad_norm": 9.507071495056152, + "learning_rate": 2.852618453865337e-06, + "loss": 0.3549, + "step": 68790 + }, + { + "epoch": 17.1571072319202, + "grad_norm": 10.056197166442871, + "learning_rate": 2.850124688279302e-06, + "loss": 0.2982, + "step": 68800 + }, + { + "epoch": 17.159600997506235, + "grad_norm": 15.976298332214355, + "learning_rate": 2.847630922693267e-06, + "loss": 0.3499, + "step": 68810 + }, + { + "epoch": 17.16209476309227, + "grad_norm": 11.574234962463379, + "learning_rate": 2.8451371571072323e-06, + "loss": 0.3486, + "step": 68820 + }, + { + "epoch": 17.164588528678305, + "grad_norm": 7.35728645324707, + "learning_rate": 2.8426433915211972e-06, + "loss": 0.3749, + "step": 68830 + }, + { + "epoch": 17.16708229426434, + "grad_norm": 9.311493873596191, + "learning_rate": 2.8401496259351625e-06, + "loss": 0.3784, + "step": 68840 + }, + { + "epoch": 17.169576059850375, + "grad_norm": 7.325751304626465, + "learning_rate": 2.837655860349127e-06, + "loss": 0.2806, + "step": 68850 + }, + { + "epoch": 17.17206982543641, + "grad_norm": 6.172769069671631, + "learning_rate": 2.8351620947630927e-06, + "loss": 0.2538, + "step": 68860 + }, + { + "epoch": 17.174563591022444, + "grad_norm": 10.258570671081543, + "learning_rate": 2.832668329177057e-06, + "loss": 0.3006, + "step": 68870 + }, + { + "epoch": 17.17705735660848, + "grad_norm": 6.926236152648926, + "learning_rate": 2.8301745635910225e-06, + "loss": 0.2841, + "step": 68880 + }, + { + "epoch": 17.179551122194514, + "grad_norm": 9.146723747253418, + "learning_rate": 2.8276807980049874e-06, + "loss": 0.3016, + "step": 68890 + }, + { + "epoch": 17.18204488778055, + "grad_norm": 6.017948150634766, + "learning_rate": 2.8251870324189527e-06, + "loss": 0.3461, + "step": 68900 + }, + { + "epoch": 17.184538653366584, + "grad_norm": 10.936685562133789, + "learning_rate": 2.8226932668329176e-06, + "loss": 0.3771, + "step": 68910 + }, + { + "epoch": 17.18703241895262, + "grad_norm": 8.554216384887695, + "learning_rate": 2.820199501246883e-06, + "loss": 0.2986, + "step": 68920 + }, + { + "epoch": 17.189526184538654, + "grad_norm": 8.152572631835938, + "learning_rate": 2.817705735660848e-06, + "loss": 0.3383, + "step": 68930 + }, + { + "epoch": 17.19201995012469, + "grad_norm": 8.737922668457031, + "learning_rate": 2.815211970074813e-06, + "loss": 0.316, + "step": 68940 + }, + { + "epoch": 17.194513715710723, + "grad_norm": 8.8933744430542, + "learning_rate": 2.8127182044887784e-06, + "loss": 0.3481, + "step": 68950 + }, + { + "epoch": 17.197007481296758, + "grad_norm": 9.593643188476562, + "learning_rate": 2.8102244389027433e-06, + "loss": 0.3012, + "step": 68960 + }, + { + "epoch": 17.199501246882793, + "grad_norm": 6.942617416381836, + "learning_rate": 2.8077306733167086e-06, + "loss": 0.2645, + "step": 68970 + }, + { + "epoch": 17.201995012468828, + "grad_norm": 9.486404418945312, + "learning_rate": 2.8052369077306735e-06, + "loss": 0.3266, + "step": 68980 + }, + { + "epoch": 17.204488778054863, + "grad_norm": 5.64856481552124, + "learning_rate": 2.802743142144639e-06, + "loss": 0.2926, + "step": 68990 + }, + { + "epoch": 17.206982543640898, + "grad_norm": 8.93766975402832, + "learning_rate": 2.8002493765586037e-06, + "loss": 0.2835, + "step": 69000 + }, + { + "epoch": 17.209476309226932, + "grad_norm": 7.289651393890381, + "learning_rate": 2.797755610972569e-06, + "loss": 0.3044, + "step": 69010 + }, + { + "epoch": 17.211970074812967, + "grad_norm": 9.757658004760742, + "learning_rate": 2.795261845386534e-06, + "loss": 0.304, + "step": 69020 + }, + { + "epoch": 17.214463840399002, + "grad_norm": 9.677502632141113, + "learning_rate": 2.792768079800499e-06, + "loss": 0.354, + "step": 69030 + }, + { + "epoch": 17.216957605985037, + "grad_norm": 7.527002811431885, + "learning_rate": 2.790274314214464e-06, + "loss": 0.3659, + "step": 69040 + }, + { + "epoch": 17.219451371571072, + "grad_norm": 15.7623872756958, + "learning_rate": 2.7877805486284294e-06, + "loss": 0.2914, + "step": 69050 + }, + { + "epoch": 17.221945137157107, + "grad_norm": 7.540277004241943, + "learning_rate": 2.7852867830423943e-06, + "loss": 0.3512, + "step": 69060 + }, + { + "epoch": 17.22443890274314, + "grad_norm": 8.1314697265625, + "learning_rate": 2.7827930174563596e-06, + "loss": 0.317, + "step": 69070 + }, + { + "epoch": 17.226932668329177, + "grad_norm": 8.093489646911621, + "learning_rate": 2.780299251870324e-06, + "loss": 0.3402, + "step": 69080 + }, + { + "epoch": 17.22942643391521, + "grad_norm": 4.8851094245910645, + "learning_rate": 2.7778054862842898e-06, + "loss": 0.3102, + "step": 69090 + }, + { + "epoch": 17.231920199501246, + "grad_norm": 6.918999671936035, + "learning_rate": 2.7753117206982542e-06, + "loss": 0.2954, + "step": 69100 + }, + { + "epoch": 17.23441396508728, + "grad_norm": 11.607989311218262, + "learning_rate": 2.7728179551122195e-06, + "loss": 0.3554, + "step": 69110 + }, + { + "epoch": 17.236907730673316, + "grad_norm": 8.533638000488281, + "learning_rate": 2.7703241895261844e-06, + "loss": 0.2778, + "step": 69120 + }, + { + "epoch": 17.23940149625935, + "grad_norm": 6.778941631317139, + "learning_rate": 2.7678304239401497e-06, + "loss": 0.3103, + "step": 69130 + }, + { + "epoch": 17.241895261845386, + "grad_norm": 8.85729694366455, + "learning_rate": 2.7653366583541146e-06, + "loss": 0.328, + "step": 69140 + }, + { + "epoch": 17.24438902743142, + "grad_norm": 11.336535453796387, + "learning_rate": 2.76284289276808e-06, + "loss": 0.3471, + "step": 69150 + }, + { + "epoch": 17.246882793017456, + "grad_norm": 8.672399520874023, + "learning_rate": 2.760349127182045e-06, + "loss": 0.4223, + "step": 69160 + }, + { + "epoch": 17.24937655860349, + "grad_norm": 6.419161796569824, + "learning_rate": 2.75785536159601e-06, + "loss": 0.3823, + "step": 69170 + }, + { + "epoch": 17.251870324189525, + "grad_norm": 6.270040035247803, + "learning_rate": 2.7553615960099754e-06, + "loss": 0.2993, + "step": 69180 + }, + { + "epoch": 17.25436408977556, + "grad_norm": 6.386322975158691, + "learning_rate": 2.7528678304239403e-06, + "loss": 0.3222, + "step": 69190 + }, + { + "epoch": 17.256857855361595, + "grad_norm": 10.8588285446167, + "learning_rate": 2.7503740648379056e-06, + "loss": 0.372, + "step": 69200 + }, + { + "epoch": 17.25935162094763, + "grad_norm": 9.645064353942871, + "learning_rate": 2.7478802992518705e-06, + "loss": 0.2823, + "step": 69210 + }, + { + "epoch": 17.261845386533665, + "grad_norm": 8.688511848449707, + "learning_rate": 2.745386533665836e-06, + "loss": 0.3246, + "step": 69220 + }, + { + "epoch": 17.2643391521197, + "grad_norm": 7.549156665802002, + "learning_rate": 2.7428927680798007e-06, + "loss": 0.2917, + "step": 69230 + }, + { + "epoch": 17.266832917705734, + "grad_norm": 7.3209357261657715, + "learning_rate": 2.740399002493766e-06, + "loss": 0.2893, + "step": 69240 + }, + { + "epoch": 17.26932668329177, + "grad_norm": 9.329798698425293, + "learning_rate": 2.737905236907731e-06, + "loss": 0.3382, + "step": 69250 + }, + { + "epoch": 17.271820448877804, + "grad_norm": 17.011377334594727, + "learning_rate": 2.7354114713216962e-06, + "loss": 0.2877, + "step": 69260 + }, + { + "epoch": 17.27431421446384, + "grad_norm": 13.201550483703613, + "learning_rate": 2.732917705735661e-06, + "loss": 0.3846, + "step": 69270 + }, + { + "epoch": 17.276807980049874, + "grad_norm": 7.851199150085449, + "learning_rate": 2.7304239401496264e-06, + "loss": 0.2899, + "step": 69280 + }, + { + "epoch": 17.27930174563591, + "grad_norm": 7.01190185546875, + "learning_rate": 2.7279301745635913e-06, + "loss": 0.3709, + "step": 69290 + }, + { + "epoch": 17.281795511221944, + "grad_norm": 8.432855606079102, + "learning_rate": 2.7254364089775566e-06, + "loss": 0.2868, + "step": 69300 + }, + { + "epoch": 17.28428927680798, + "grad_norm": 9.088372230529785, + "learning_rate": 2.722942643391521e-06, + "loss": 0.3208, + "step": 69310 + }, + { + "epoch": 17.286783042394013, + "grad_norm": 7.609044551849365, + "learning_rate": 2.720448877805487e-06, + "loss": 0.3291, + "step": 69320 + }, + { + "epoch": 17.28927680798005, + "grad_norm": 18.67131996154785, + "learning_rate": 2.7179551122194513e-06, + "loss": 0.38, + "step": 69330 + }, + { + "epoch": 17.291770573566083, + "grad_norm": 7.562769889831543, + "learning_rate": 2.7154613466334166e-06, + "loss": 0.3773, + "step": 69340 + }, + { + "epoch": 17.294264339152118, + "grad_norm": 6.86823844909668, + "learning_rate": 2.7129675810473815e-06, + "loss": 0.3447, + "step": 69350 + }, + { + "epoch": 17.296758104738153, + "grad_norm": 13.892622947692871, + "learning_rate": 2.7104738154613468e-06, + "loss": 0.3643, + "step": 69360 + }, + { + "epoch": 17.29925187032419, + "grad_norm": 7.478859901428223, + "learning_rate": 2.7079800498753117e-06, + "loss": 0.3812, + "step": 69370 + }, + { + "epoch": 17.301745635910226, + "grad_norm": 11.587136268615723, + "learning_rate": 2.705486284289277e-06, + "loss": 0.3247, + "step": 69380 + }, + { + "epoch": 17.30423940149626, + "grad_norm": 11.155722618103027, + "learning_rate": 2.702992518703242e-06, + "loss": 0.2928, + "step": 69390 + }, + { + "epoch": 17.306733167082296, + "grad_norm": 5.776709079742432, + "learning_rate": 2.700498753117207e-06, + "loss": 0.3003, + "step": 69400 + }, + { + "epoch": 17.30922693266833, + "grad_norm": 11.081796646118164, + "learning_rate": 2.698004987531172e-06, + "loss": 0.3558, + "step": 69410 + }, + { + "epoch": 17.311720698254366, + "grad_norm": 10.856132507324219, + "learning_rate": 2.6955112219451374e-06, + "loss": 0.3469, + "step": 69420 + }, + { + "epoch": 17.3142144638404, + "grad_norm": 7.837653636932373, + "learning_rate": 2.6930174563591027e-06, + "loss": 0.2866, + "step": 69430 + }, + { + "epoch": 17.316708229426435, + "grad_norm": 8.779632568359375, + "learning_rate": 2.6905236907730676e-06, + "loss": 0.3857, + "step": 69440 + }, + { + "epoch": 17.31920199501247, + "grad_norm": 9.35014820098877, + "learning_rate": 2.688029925187033e-06, + "loss": 0.2853, + "step": 69450 + }, + { + "epoch": 17.321695760598505, + "grad_norm": 4.780736923217773, + "learning_rate": 2.6855361596009978e-06, + "loss": 0.3128, + "step": 69460 + }, + { + "epoch": 17.32418952618454, + "grad_norm": 9.295073509216309, + "learning_rate": 2.683042394014963e-06, + "loss": 0.3148, + "step": 69470 + }, + { + "epoch": 17.326683291770575, + "grad_norm": 8.449419975280762, + "learning_rate": 2.680548628428928e-06, + "loss": 0.3382, + "step": 69480 + }, + { + "epoch": 17.32917705735661, + "grad_norm": 7.874120712280273, + "learning_rate": 2.6780548628428933e-06, + "loss": 0.3143, + "step": 69490 + }, + { + "epoch": 17.331670822942645, + "grad_norm": 6.140561103820801, + "learning_rate": 2.675561097256858e-06, + "loss": 0.3332, + "step": 69500 + }, + { + "epoch": 17.33416458852868, + "grad_norm": 9.675243377685547, + "learning_rate": 2.6730673316708235e-06, + "loss": 0.2659, + "step": 69510 + }, + { + "epoch": 17.336658354114714, + "grad_norm": 8.507969856262207, + "learning_rate": 2.670573566084788e-06, + "loss": 0.3464, + "step": 69520 + }, + { + "epoch": 17.33915211970075, + "grad_norm": 9.791665077209473, + "learning_rate": 2.6680798004987537e-06, + "loss": 0.3107, + "step": 69530 + }, + { + "epoch": 17.341645885286784, + "grad_norm": 6.693220138549805, + "learning_rate": 2.665586034912718e-06, + "loss": 0.3108, + "step": 69540 + }, + { + "epoch": 17.34413965087282, + "grad_norm": 9.014039993286133, + "learning_rate": 2.6630922693266834e-06, + "loss": 0.3689, + "step": 69550 + }, + { + "epoch": 17.346633416458854, + "grad_norm": 9.79101848602295, + "learning_rate": 2.6605985037406483e-06, + "loss": 0.3281, + "step": 69560 + }, + { + "epoch": 17.34912718204489, + "grad_norm": 6.341735363006592, + "learning_rate": 2.6581047381546136e-06, + "loss": 0.2585, + "step": 69570 + }, + { + "epoch": 17.351620947630924, + "grad_norm": 7.738804340362549, + "learning_rate": 2.6556109725685785e-06, + "loss": 0.3152, + "step": 69580 + }, + { + "epoch": 17.35411471321696, + "grad_norm": 8.257248878479004, + "learning_rate": 2.653117206982544e-06, + "loss": 0.3581, + "step": 69590 + }, + { + "epoch": 17.356608478802993, + "grad_norm": 9.338336944580078, + "learning_rate": 2.6506234413965087e-06, + "loss": 0.302, + "step": 69600 + }, + { + "epoch": 17.359102244389028, + "grad_norm": 9.739322662353516, + "learning_rate": 2.648129675810474e-06, + "loss": 0.2901, + "step": 69610 + }, + { + "epoch": 17.361596009975063, + "grad_norm": 9.487130165100098, + "learning_rate": 2.645635910224439e-06, + "loss": 0.3585, + "step": 69620 + }, + { + "epoch": 17.364089775561098, + "grad_norm": 10.598410606384277, + "learning_rate": 2.6431421446384042e-06, + "loss": 0.3382, + "step": 69630 + }, + { + "epoch": 17.366583541147133, + "grad_norm": 8.358076095581055, + "learning_rate": 2.640648379052369e-06, + "loss": 0.3341, + "step": 69640 + }, + { + "epoch": 17.369077306733168, + "grad_norm": 11.573534965515137, + "learning_rate": 2.6381546134663344e-06, + "loss": 0.3532, + "step": 69650 + }, + { + "epoch": 17.371571072319203, + "grad_norm": 10.417460441589355, + "learning_rate": 2.6356608478802997e-06, + "loss": 0.3544, + "step": 69660 + }, + { + "epoch": 17.374064837905237, + "grad_norm": 12.179265975952148, + "learning_rate": 2.6331670822942646e-06, + "loss": 0.304, + "step": 69670 + }, + { + "epoch": 17.376558603491272, + "grad_norm": 8.12159252166748, + "learning_rate": 2.63067331670823e-06, + "loss": 0.3341, + "step": 69680 + }, + { + "epoch": 17.379052369077307, + "grad_norm": 9.424428939819336, + "learning_rate": 2.628179551122195e-06, + "loss": 0.3425, + "step": 69690 + }, + { + "epoch": 17.381546134663342, + "grad_norm": 7.936733245849609, + "learning_rate": 2.62568578553616e-06, + "loss": 0.2796, + "step": 69700 + }, + { + "epoch": 17.384039900249377, + "grad_norm": 9.32441234588623, + "learning_rate": 2.623192019950125e-06, + "loss": 0.3235, + "step": 69710 + }, + { + "epoch": 17.38653366583541, + "grad_norm": 10.321310043334961, + "learning_rate": 2.6206982543640903e-06, + "loss": 0.3289, + "step": 69720 + }, + { + "epoch": 17.389027431421447, + "grad_norm": 7.7665181159973145, + "learning_rate": 2.618204488778055e-06, + "loss": 0.3161, + "step": 69730 + }, + { + "epoch": 17.39152119700748, + "grad_norm": 9.732200622558594, + "learning_rate": 2.6157107231920205e-06, + "loss": 0.2868, + "step": 69740 + }, + { + "epoch": 17.394014962593516, + "grad_norm": 6.528557300567627, + "learning_rate": 2.613216957605985e-06, + "loss": 0.3435, + "step": 69750 + }, + { + "epoch": 17.39650872817955, + "grad_norm": 8.758277893066406, + "learning_rate": 2.6107231920199507e-06, + "loss": 0.3674, + "step": 69760 + }, + { + "epoch": 17.399002493765586, + "grad_norm": 7.994776248931885, + "learning_rate": 2.608229426433915e-06, + "loss": 0.372, + "step": 69770 + }, + { + "epoch": 17.40149625935162, + "grad_norm": 9.376730918884277, + "learning_rate": 2.6057356608478805e-06, + "loss": 0.3216, + "step": 69780 + }, + { + "epoch": 17.403990024937656, + "grad_norm": 6.297952175140381, + "learning_rate": 2.6032418952618454e-06, + "loss": 0.3051, + "step": 69790 + }, + { + "epoch": 17.40648379052369, + "grad_norm": 8.456469535827637, + "learning_rate": 2.6007481296758107e-06, + "loss": 0.3467, + "step": 69800 + }, + { + "epoch": 17.408977556109726, + "grad_norm": 7.9176859855651855, + "learning_rate": 2.5982543640897756e-06, + "loss": 0.3259, + "step": 69810 + }, + { + "epoch": 17.41147132169576, + "grad_norm": 9.109759330749512, + "learning_rate": 2.595760598503741e-06, + "loss": 0.3349, + "step": 69820 + }, + { + "epoch": 17.413965087281795, + "grad_norm": 8.499098777770996, + "learning_rate": 2.5932668329177058e-06, + "loss": 0.3412, + "step": 69830 + }, + { + "epoch": 17.41645885286783, + "grad_norm": 8.873187065124512, + "learning_rate": 2.590773067331671e-06, + "loss": 0.2991, + "step": 69840 + }, + { + "epoch": 17.418952618453865, + "grad_norm": 6.6649489402771, + "learning_rate": 2.588279301745636e-06, + "loss": 0.362, + "step": 69850 + }, + { + "epoch": 17.4214463840399, + "grad_norm": 7.3186259269714355, + "learning_rate": 2.5857855361596013e-06, + "loss": 0.3563, + "step": 69860 + }, + { + "epoch": 17.423940149625935, + "grad_norm": 6.746584892272949, + "learning_rate": 2.583291770573566e-06, + "loss": 0.3346, + "step": 69870 + }, + { + "epoch": 17.42643391521197, + "grad_norm": 10.556224822998047, + "learning_rate": 2.5807980049875315e-06, + "loss": 0.367, + "step": 69880 + }, + { + "epoch": 17.428927680798004, + "grad_norm": 8.307790756225586, + "learning_rate": 2.5783042394014963e-06, + "loss": 0.337, + "step": 69890 + }, + { + "epoch": 17.43142144638404, + "grad_norm": 18.567258834838867, + "learning_rate": 2.5758104738154617e-06, + "loss": 0.3254, + "step": 69900 + }, + { + "epoch": 17.433915211970074, + "grad_norm": 8.430441856384277, + "learning_rate": 2.573316708229427e-06, + "loss": 0.2723, + "step": 69910 + }, + { + "epoch": 17.43640897755611, + "grad_norm": 6.945594310760498, + "learning_rate": 2.570822942643392e-06, + "loss": 0.3487, + "step": 69920 + }, + { + "epoch": 17.438902743142144, + "grad_norm": 9.82526969909668, + "learning_rate": 2.568329177057357e-06, + "loss": 0.3007, + "step": 69930 + }, + { + "epoch": 17.44139650872818, + "grad_norm": 9.881370544433594, + "learning_rate": 2.565835411471322e-06, + "loss": 0.3641, + "step": 69940 + }, + { + "epoch": 17.443890274314214, + "grad_norm": 10.655155181884766, + "learning_rate": 2.5633416458852874e-06, + "loss": 0.316, + "step": 69950 + }, + { + "epoch": 17.44638403990025, + "grad_norm": 7.941926002502441, + "learning_rate": 2.560847880299252e-06, + "loss": 0.2926, + "step": 69960 + }, + { + "epoch": 17.448877805486283, + "grad_norm": 10.348051071166992, + "learning_rate": 2.5583541147132175e-06, + "loss": 0.3244, + "step": 69970 + }, + { + "epoch": 17.45137157107232, + "grad_norm": 9.219032287597656, + "learning_rate": 2.555860349127182e-06, + "loss": 0.2996, + "step": 69980 + }, + { + "epoch": 17.453865336658353, + "grad_norm": 12.931209564208984, + "learning_rate": 2.5533665835411473e-06, + "loss": 0.3215, + "step": 69990 + }, + { + "epoch": 17.456359102244388, + "grad_norm": 12.621508598327637, + "learning_rate": 2.550872817955112e-06, + "loss": 0.3766, + "step": 70000 + }, + { + "epoch": 17.458852867830423, + "grad_norm": 13.947381019592285, + "learning_rate": 2.5483790523690775e-06, + "loss": 0.3613, + "step": 70010 + }, + { + "epoch": 17.461346633416458, + "grad_norm": 8.649821281433105, + "learning_rate": 2.5458852867830424e-06, + "loss": 0.3608, + "step": 70020 + }, + { + "epoch": 17.463840399002493, + "grad_norm": 13.746424674987793, + "learning_rate": 2.5433915211970077e-06, + "loss": 0.3476, + "step": 70030 + }, + { + "epoch": 17.466334164588527, + "grad_norm": 8.04629898071289, + "learning_rate": 2.5408977556109726e-06, + "loss": 0.2839, + "step": 70040 + }, + { + "epoch": 17.468827930174562, + "grad_norm": 13.645923614501953, + "learning_rate": 2.538403990024938e-06, + "loss": 0.3092, + "step": 70050 + }, + { + "epoch": 17.471321695760597, + "grad_norm": 6.63210391998291, + "learning_rate": 2.535910224438903e-06, + "loss": 0.3725, + "step": 70060 + }, + { + "epoch": 17.473815461346632, + "grad_norm": 6.891839981079102, + "learning_rate": 2.533416458852868e-06, + "loss": 0.2888, + "step": 70070 + }, + { + "epoch": 17.476309226932667, + "grad_norm": 9.221366882324219, + "learning_rate": 2.530922693266833e-06, + "loss": 0.3489, + "step": 70080 + }, + { + "epoch": 17.478802992518702, + "grad_norm": 10.869357109069824, + "learning_rate": 2.5284289276807983e-06, + "loss": 0.4087, + "step": 70090 + }, + { + "epoch": 17.481296758104737, + "grad_norm": 9.193957328796387, + "learning_rate": 2.525935162094763e-06, + "loss": 0.324, + "step": 70100 + }, + { + "epoch": 17.48379052369077, + "grad_norm": 7.817142009735107, + "learning_rate": 2.5234413965087285e-06, + "loss": 0.3491, + "step": 70110 + }, + { + "epoch": 17.486284289276806, + "grad_norm": 11.097153663635254, + "learning_rate": 2.5209476309226934e-06, + "loss": 0.2901, + "step": 70120 + }, + { + "epoch": 17.48877805486284, + "grad_norm": 8.653058052062988, + "learning_rate": 2.5184538653366587e-06, + "loss": 0.3366, + "step": 70130 + }, + { + "epoch": 17.491271820448876, + "grad_norm": 7.455108642578125, + "learning_rate": 2.5159600997506236e-06, + "loss": 0.342, + "step": 70140 + }, + { + "epoch": 17.49376558603491, + "grad_norm": 6.872528553009033, + "learning_rate": 2.513466334164589e-06, + "loss": 0.3033, + "step": 70150 + }, + { + "epoch": 17.496259351620946, + "grad_norm": 7.771061420440674, + "learning_rate": 2.510972568578554e-06, + "loss": 0.2978, + "step": 70160 + }, + { + "epoch": 17.49875311720698, + "grad_norm": 7.269931316375732, + "learning_rate": 2.508478802992519e-06, + "loss": 0.3072, + "step": 70170 + }, + { + "epoch": 17.50124688279302, + "grad_norm": 10.571673393249512, + "learning_rate": 2.5059850374064844e-06, + "loss": 0.4029, + "step": 70180 + }, + { + "epoch": 17.503740648379054, + "grad_norm": 7.55682373046875, + "learning_rate": 2.503491271820449e-06, + "loss": 0.3645, + "step": 70190 + }, + { + "epoch": 17.50623441396509, + "grad_norm": 8.619101524353027, + "learning_rate": 2.5009975062344146e-06, + "loss": 0.4596, + "step": 70200 + }, + { + "epoch": 17.508728179551124, + "grad_norm": 8.528571128845215, + "learning_rate": 2.498503740648379e-06, + "loss": 0.3011, + "step": 70210 + }, + { + "epoch": 17.51122194513716, + "grad_norm": 11.281834602355957, + "learning_rate": 2.4960099750623444e-06, + "loss": 0.3314, + "step": 70220 + }, + { + "epoch": 17.513715710723194, + "grad_norm": 6.625566482543945, + "learning_rate": 2.4935162094763092e-06, + "loss": 0.2803, + "step": 70230 + }, + { + "epoch": 17.51620947630923, + "grad_norm": 11.519903182983398, + "learning_rate": 2.4910224438902746e-06, + "loss": 0.3145, + "step": 70240 + }, + { + "epoch": 17.518703241895263, + "grad_norm": 7.888875484466553, + "learning_rate": 2.4885286783042394e-06, + "loss": 0.3021, + "step": 70250 + }, + { + "epoch": 17.521197007481298, + "grad_norm": 7.573606967926025, + "learning_rate": 2.4860349127182048e-06, + "loss": 0.2962, + "step": 70260 + }, + { + "epoch": 17.523690773067333, + "grad_norm": 8.137581825256348, + "learning_rate": 2.4835411471321696e-06, + "loss": 0.3396, + "step": 70270 + }, + { + "epoch": 17.526184538653368, + "grad_norm": 10.423111915588379, + "learning_rate": 2.481047381546135e-06, + "loss": 0.3194, + "step": 70280 + }, + { + "epoch": 17.528678304239403, + "grad_norm": 11.856714248657227, + "learning_rate": 2.4785536159601003e-06, + "loss": 0.3732, + "step": 70290 + }, + { + "epoch": 17.531172069825438, + "grad_norm": 8.993725776672363, + "learning_rate": 2.476059850374065e-06, + "loss": 0.288, + "step": 70300 + }, + { + "epoch": 17.533665835411473, + "grad_norm": 9.244604110717773, + "learning_rate": 2.47356608478803e-06, + "loss": 0.3439, + "step": 70310 + }, + { + "epoch": 17.536159600997507, + "grad_norm": 9.765801429748535, + "learning_rate": 2.4710723192019953e-06, + "loss": 0.3375, + "step": 70320 + }, + { + "epoch": 17.538653366583542, + "grad_norm": 5.98468542098999, + "learning_rate": 2.4685785536159602e-06, + "loss": 0.302, + "step": 70330 + }, + { + "epoch": 17.541147132169577, + "grad_norm": 8.877677917480469, + "learning_rate": 2.4660847880299255e-06, + "loss": 0.387, + "step": 70340 + }, + { + "epoch": 17.543640897755612, + "grad_norm": 10.188618659973145, + "learning_rate": 2.4635910224438904e-06, + "loss": 0.3714, + "step": 70350 + }, + { + "epoch": 17.546134663341647, + "grad_norm": 9.320550918579102, + "learning_rate": 2.4610972568578557e-06, + "loss": 0.3256, + "step": 70360 + }, + { + "epoch": 17.54862842892768, + "grad_norm": 7.385508060455322, + "learning_rate": 2.4586034912718206e-06, + "loss": 0.2825, + "step": 70370 + }, + { + "epoch": 17.551122194513717, + "grad_norm": 8.037924766540527, + "learning_rate": 2.456109725685786e-06, + "loss": 0.3273, + "step": 70380 + }, + { + "epoch": 17.55361596009975, + "grad_norm": 10.301573753356934, + "learning_rate": 2.453615960099751e-06, + "loss": 0.2664, + "step": 70390 + }, + { + "epoch": 17.556109725685786, + "grad_norm": 7.737729072570801, + "learning_rate": 2.451122194513716e-06, + "loss": 0.3422, + "step": 70400 + }, + { + "epoch": 17.55860349127182, + "grad_norm": 7.797543525695801, + "learning_rate": 2.448628428927681e-06, + "loss": 0.3644, + "step": 70410 + }, + { + "epoch": 17.561097256857856, + "grad_norm": 9.298747062683105, + "learning_rate": 2.446134663341646e-06, + "loss": 0.3983, + "step": 70420 + }, + { + "epoch": 17.56359102244389, + "grad_norm": 8.475441932678223, + "learning_rate": 2.443640897755611e-06, + "loss": 0.3913, + "step": 70430 + }, + { + "epoch": 17.566084788029926, + "grad_norm": 7.733828544616699, + "learning_rate": 2.441147132169576e-06, + "loss": 0.3371, + "step": 70440 + }, + { + "epoch": 17.56857855361596, + "grad_norm": 7.250517845153809, + "learning_rate": 2.4386533665835414e-06, + "loss": 0.3646, + "step": 70450 + }, + { + "epoch": 17.571072319201996, + "grad_norm": 13.300467491149902, + "learning_rate": 2.4361596009975063e-06, + "loss": 0.3736, + "step": 70460 + }, + { + "epoch": 17.57356608478803, + "grad_norm": 7.141026973724365, + "learning_rate": 2.4336658354114716e-06, + "loss": 0.2415, + "step": 70470 + }, + { + "epoch": 17.576059850374065, + "grad_norm": 6.321515083312988, + "learning_rate": 2.4311720698254365e-06, + "loss": 0.3182, + "step": 70480 + }, + { + "epoch": 17.5785536159601, + "grad_norm": 10.123273849487305, + "learning_rate": 2.428678304239402e-06, + "loss": 0.3007, + "step": 70490 + }, + { + "epoch": 17.581047381546135, + "grad_norm": 6.999142169952393, + "learning_rate": 2.4261845386533667e-06, + "loss": 0.3756, + "step": 70500 + }, + { + "epoch": 17.58354114713217, + "grad_norm": 11.76720905303955, + "learning_rate": 2.4236907730673316e-06, + "loss": 0.3423, + "step": 70510 + }, + { + "epoch": 17.586034912718205, + "grad_norm": 7.583934783935547, + "learning_rate": 2.421197007481297e-06, + "loss": 0.3457, + "step": 70520 + }, + { + "epoch": 17.58852867830424, + "grad_norm": 7.330530166625977, + "learning_rate": 2.418703241895262e-06, + "loss": 0.2704, + "step": 70530 + }, + { + "epoch": 17.591022443890274, + "grad_norm": 9.098861694335938, + "learning_rate": 2.416209476309227e-06, + "loss": 0.3421, + "step": 70540 + }, + { + "epoch": 17.59351620947631, + "grad_norm": 7.645537853240967, + "learning_rate": 2.4137157107231924e-06, + "loss": 0.3421, + "step": 70550 + }, + { + "epoch": 17.596009975062344, + "grad_norm": 7.081808090209961, + "learning_rate": 2.4112219451371573e-06, + "loss": 0.3081, + "step": 70560 + }, + { + "epoch": 17.59850374064838, + "grad_norm": 9.392647743225098, + "learning_rate": 2.4087281795511226e-06, + "loss": 0.322, + "step": 70570 + }, + { + "epoch": 17.600997506234414, + "grad_norm": 7.144293308258057, + "learning_rate": 2.4062344139650875e-06, + "loss": 0.3469, + "step": 70580 + }, + { + "epoch": 17.60349127182045, + "grad_norm": 4.9244489669799805, + "learning_rate": 2.4037406483790528e-06, + "loss": 0.2498, + "step": 70590 + }, + { + "epoch": 17.605985037406484, + "grad_norm": 7.220204830169678, + "learning_rate": 2.4012468827930177e-06, + "loss": 0.2706, + "step": 70600 + }, + { + "epoch": 17.60847880299252, + "grad_norm": 13.143561363220215, + "learning_rate": 2.398753117206983e-06, + "loss": 0.2832, + "step": 70610 + }, + { + "epoch": 17.610972568578553, + "grad_norm": 6.485062599182129, + "learning_rate": 2.396259351620948e-06, + "loss": 0.2975, + "step": 70620 + }, + { + "epoch": 17.61346633416459, + "grad_norm": 9.139724731445312, + "learning_rate": 2.3937655860349127e-06, + "loss": 0.3112, + "step": 70630 + }, + { + "epoch": 17.615960099750623, + "grad_norm": 9.77446460723877, + "learning_rate": 2.391271820448878e-06, + "loss": 0.3726, + "step": 70640 + }, + { + "epoch": 17.618453865336658, + "grad_norm": 8.16642951965332, + "learning_rate": 2.388778054862843e-06, + "loss": 0.3558, + "step": 70650 + }, + { + "epoch": 17.620947630922693, + "grad_norm": 8.044200897216797, + "learning_rate": 2.3862842892768082e-06, + "loss": 0.3816, + "step": 70660 + }, + { + "epoch": 17.623441396508728, + "grad_norm": 6.677599906921387, + "learning_rate": 2.383790523690773e-06, + "loss": 0.3632, + "step": 70670 + }, + { + "epoch": 17.625935162094763, + "grad_norm": 7.929919719696045, + "learning_rate": 2.3812967581047384e-06, + "loss": 0.3247, + "step": 70680 + }, + { + "epoch": 17.628428927680797, + "grad_norm": 5.376601219177246, + "learning_rate": 2.3788029925187033e-06, + "loss": 0.3405, + "step": 70690 + }, + { + "epoch": 17.630922693266832, + "grad_norm": 3.786297082901001, + "learning_rate": 2.3763092269326686e-06, + "loss": 0.2899, + "step": 70700 + }, + { + "epoch": 17.633416458852867, + "grad_norm": 6.230623722076416, + "learning_rate": 2.3738154613466335e-06, + "loss": 0.3461, + "step": 70710 + }, + { + "epoch": 17.635910224438902, + "grad_norm": 7.953461170196533, + "learning_rate": 2.3713216957605984e-06, + "loss": 0.2477, + "step": 70720 + }, + { + "epoch": 17.638403990024937, + "grad_norm": 6.597314357757568, + "learning_rate": 2.3688279301745637e-06, + "loss": 0.3126, + "step": 70730 + }, + { + "epoch": 17.640897755610972, + "grad_norm": 10.526558876037598, + "learning_rate": 2.3663341645885286e-06, + "loss": 0.3603, + "step": 70740 + }, + { + "epoch": 17.643391521197007, + "grad_norm": 6.605552673339844, + "learning_rate": 2.363840399002494e-06, + "loss": 0.3097, + "step": 70750 + }, + { + "epoch": 17.64588528678304, + "grad_norm": 7.1740031242370605, + "learning_rate": 2.361346633416459e-06, + "loss": 0.3014, + "step": 70760 + }, + { + "epoch": 17.648379052369076, + "grad_norm": 9.406676292419434, + "learning_rate": 2.358852867830424e-06, + "loss": 0.3107, + "step": 70770 + }, + { + "epoch": 17.65087281795511, + "grad_norm": 8.029032707214355, + "learning_rate": 2.3563591022443894e-06, + "loss": 0.3425, + "step": 70780 + }, + { + "epoch": 17.653366583541146, + "grad_norm": 13.621152877807617, + "learning_rate": 2.3538653366583543e-06, + "loss": 0.4274, + "step": 70790 + }, + { + "epoch": 17.65586034912718, + "grad_norm": 7.198687553405762, + "learning_rate": 2.3513715710723196e-06, + "loss": 0.3595, + "step": 70800 + }, + { + "epoch": 17.658354114713216, + "grad_norm": 7.494908332824707, + "learning_rate": 2.3488778054862845e-06, + "loss": 0.2798, + "step": 70810 + }, + { + "epoch": 17.66084788029925, + "grad_norm": 10.102776527404785, + "learning_rate": 2.34638403990025e-06, + "loss": 0.3385, + "step": 70820 + }, + { + "epoch": 17.663341645885286, + "grad_norm": 7.839235782623291, + "learning_rate": 2.3438902743142147e-06, + "loss": 0.3029, + "step": 70830 + }, + { + "epoch": 17.66583541147132, + "grad_norm": 7.807239532470703, + "learning_rate": 2.34139650872818e-06, + "loss": 0.2806, + "step": 70840 + }, + { + "epoch": 17.668329177057355, + "grad_norm": 7.2098069190979, + "learning_rate": 2.338902743142145e-06, + "loss": 0.3324, + "step": 70850 + }, + { + "epoch": 17.67082294264339, + "grad_norm": 12.253145217895508, + "learning_rate": 2.3364089775561098e-06, + "loss": 0.3326, + "step": 70860 + }, + { + "epoch": 17.673316708229425, + "grad_norm": 6.79581880569458, + "learning_rate": 2.333915211970075e-06, + "loss": 0.3108, + "step": 70870 + }, + { + "epoch": 17.67581047381546, + "grad_norm": 6.1217145919799805, + "learning_rate": 2.33142144638404e-06, + "loss": 0.2697, + "step": 70880 + }, + { + "epoch": 17.678304239401495, + "grad_norm": 5.935173034667969, + "learning_rate": 2.3289276807980053e-06, + "loss": 0.3026, + "step": 70890 + }, + { + "epoch": 17.68079800498753, + "grad_norm": 10.652816772460938, + "learning_rate": 2.32643391521197e-06, + "loss": 0.2878, + "step": 70900 + }, + { + "epoch": 17.683291770573565, + "grad_norm": 9.911921501159668, + "learning_rate": 2.3239401496259355e-06, + "loss": 0.305, + "step": 70910 + }, + { + "epoch": 17.6857855361596, + "grad_norm": 9.336223602294922, + "learning_rate": 2.3214463840399004e-06, + "loss": 0.3035, + "step": 70920 + }, + { + "epoch": 17.688279301745634, + "grad_norm": 10.193656921386719, + "learning_rate": 2.3189526184538657e-06, + "loss": 0.2473, + "step": 70930 + }, + { + "epoch": 17.69077306733167, + "grad_norm": 11.795038223266602, + "learning_rate": 2.3164588528678306e-06, + "loss": 0.3295, + "step": 70940 + }, + { + "epoch": 17.693266832917704, + "grad_norm": 10.375980377197266, + "learning_rate": 2.3139650872817955e-06, + "loss": 0.3085, + "step": 70950 + }, + { + "epoch": 17.69576059850374, + "grad_norm": 6.3128533363342285, + "learning_rate": 2.3114713216957608e-06, + "loss": 0.282, + "step": 70960 + }, + { + "epoch": 17.698254364089777, + "grad_norm": 5.7575907707214355, + "learning_rate": 2.3089775561097256e-06, + "loss": 0.3209, + "step": 70970 + }, + { + "epoch": 17.70074812967581, + "grad_norm": 8.107242584228516, + "learning_rate": 2.306483790523691e-06, + "loss": 0.3082, + "step": 70980 + }, + { + "epoch": 17.703241895261847, + "grad_norm": 9.448348999023438, + "learning_rate": 2.303990024937656e-06, + "loss": 0.3068, + "step": 70990 + }, + { + "epoch": 17.705735660847882, + "grad_norm": 15.375732421875, + "learning_rate": 2.301496259351621e-06, + "loss": 0.3056, + "step": 71000 + }, + { + "epoch": 17.708229426433917, + "grad_norm": 10.136244773864746, + "learning_rate": 2.299002493765586e-06, + "loss": 0.297, + "step": 71010 + }, + { + "epoch": 17.71072319201995, + "grad_norm": 7.518948078155518, + "learning_rate": 2.2965087281795514e-06, + "loss": 0.3659, + "step": 71020 + }, + { + "epoch": 17.713216957605987, + "grad_norm": 8.733294486999512, + "learning_rate": 2.2940149625935167e-06, + "loss": 0.3045, + "step": 71030 + }, + { + "epoch": 17.71571072319202, + "grad_norm": 11.162075996398926, + "learning_rate": 2.2915211970074815e-06, + "loss": 0.3342, + "step": 71040 + }, + { + "epoch": 17.718204488778056, + "grad_norm": 7.541927814483643, + "learning_rate": 2.289027431421447e-06, + "loss": 0.2797, + "step": 71050 + }, + { + "epoch": 17.72069825436409, + "grad_norm": 9.148726463317871, + "learning_rate": 2.2865336658354117e-06, + "loss": 0.2668, + "step": 71060 + }, + { + "epoch": 17.723192019950126, + "grad_norm": 10.433960914611816, + "learning_rate": 2.2840399002493766e-06, + "loss": 0.3384, + "step": 71070 + }, + { + "epoch": 17.72568578553616, + "grad_norm": 10.94124984741211, + "learning_rate": 2.281546134663342e-06, + "loss": 0.2648, + "step": 71080 + }, + { + "epoch": 17.728179551122196, + "grad_norm": 10.347514152526855, + "learning_rate": 2.279052369077307e-06, + "loss": 0.3198, + "step": 71090 + }, + { + "epoch": 17.73067331670823, + "grad_norm": 9.887333869934082, + "learning_rate": 2.276558603491272e-06, + "loss": 0.3412, + "step": 71100 + }, + { + "epoch": 17.733167082294266, + "grad_norm": 11.739676475524902, + "learning_rate": 2.274064837905237e-06, + "loss": 0.3266, + "step": 71110 + }, + { + "epoch": 17.7356608478803, + "grad_norm": 8.010821342468262, + "learning_rate": 2.2715710723192023e-06, + "loss": 0.2687, + "step": 71120 + }, + { + "epoch": 17.738154613466335, + "grad_norm": 7.8744730949401855, + "learning_rate": 2.2690773067331672e-06, + "loss": 0.3463, + "step": 71130 + }, + { + "epoch": 17.74064837905237, + "grad_norm": 6.5923237800598145, + "learning_rate": 2.2665835411471325e-06, + "loss": 0.3428, + "step": 71140 + }, + { + "epoch": 17.743142144638405, + "grad_norm": 12.174713134765625, + "learning_rate": 2.2640897755610974e-06, + "loss": 0.3093, + "step": 71150 + }, + { + "epoch": 17.74563591022444, + "grad_norm": 7.997066020965576, + "learning_rate": 2.2615960099750627e-06, + "loss": 0.3085, + "step": 71160 + }, + { + "epoch": 17.748129675810475, + "grad_norm": 10.162776947021484, + "learning_rate": 2.2591022443890276e-06, + "loss": 0.3193, + "step": 71170 + }, + { + "epoch": 17.75062344139651, + "grad_norm": 8.954973220825195, + "learning_rate": 2.2566084788029925e-06, + "loss": 0.3455, + "step": 71180 + }, + { + "epoch": 17.753117206982544, + "grad_norm": 5.909659385681152, + "learning_rate": 2.254114713216958e-06, + "loss": 0.3149, + "step": 71190 + }, + { + "epoch": 17.75561097256858, + "grad_norm": 7.491812229156494, + "learning_rate": 2.2516209476309227e-06, + "loss": 0.2545, + "step": 71200 + }, + { + "epoch": 17.758104738154614, + "grad_norm": 7.293992519378662, + "learning_rate": 2.249127182044888e-06, + "loss": 0.2647, + "step": 71210 + }, + { + "epoch": 17.76059850374065, + "grad_norm": 8.303019523620605, + "learning_rate": 2.246633416458853e-06, + "loss": 0.3833, + "step": 71220 + }, + { + "epoch": 17.763092269326684, + "grad_norm": 7.720460414886475, + "learning_rate": 2.244139650872818e-06, + "loss": 0.2887, + "step": 71230 + }, + { + "epoch": 17.76558603491272, + "grad_norm": 10.40861988067627, + "learning_rate": 2.241645885286783e-06, + "loss": 0.3591, + "step": 71240 + }, + { + "epoch": 17.768079800498754, + "grad_norm": 12.28356647491455, + "learning_rate": 2.2391521197007484e-06, + "loss": 0.3628, + "step": 71250 + }, + { + "epoch": 17.77057356608479, + "grad_norm": 10.298823356628418, + "learning_rate": 2.2366583541147133e-06, + "loss": 0.3126, + "step": 71260 + }, + { + "epoch": 17.773067331670823, + "grad_norm": 12.331887245178223, + "learning_rate": 2.2341645885286786e-06, + "loss": 0.3214, + "step": 71270 + }, + { + "epoch": 17.77556109725686, + "grad_norm": 5.967679977416992, + "learning_rate": 2.231670822942644e-06, + "loss": 0.3022, + "step": 71280 + }, + { + "epoch": 17.778054862842893, + "grad_norm": 8.2744722366333, + "learning_rate": 2.2291770573566088e-06, + "loss": 0.3448, + "step": 71290 + }, + { + "epoch": 17.780548628428928, + "grad_norm": 7.625812530517578, + "learning_rate": 2.2266832917705737e-06, + "loss": 0.2958, + "step": 71300 + }, + { + "epoch": 17.783042394014963, + "grad_norm": 9.751514434814453, + "learning_rate": 2.224189526184539e-06, + "loss": 0.3786, + "step": 71310 + }, + { + "epoch": 17.785536159600998, + "grad_norm": 8.716211318969727, + "learning_rate": 2.221695760598504e-06, + "loss": 0.2933, + "step": 71320 + }, + { + "epoch": 17.788029925187033, + "grad_norm": 11.199061393737793, + "learning_rate": 2.219201995012469e-06, + "loss": 0.3668, + "step": 71330 + }, + { + "epoch": 17.790523690773068, + "grad_norm": 14.230875015258789, + "learning_rate": 2.216708229426434e-06, + "loss": 0.3918, + "step": 71340 + }, + { + "epoch": 17.793017456359102, + "grad_norm": 7.556332111358643, + "learning_rate": 2.2142144638403994e-06, + "loss": 0.2992, + "step": 71350 + }, + { + "epoch": 17.795511221945137, + "grad_norm": 10.51162052154541, + "learning_rate": 2.2117206982543643e-06, + "loss": 0.2895, + "step": 71360 + }, + { + "epoch": 17.798004987531172, + "grad_norm": 8.932967185974121, + "learning_rate": 2.2092269326683296e-06, + "loss": 0.3283, + "step": 71370 + }, + { + "epoch": 17.800498753117207, + "grad_norm": 6.184909343719482, + "learning_rate": 2.2067331670822945e-06, + "loss": 0.3352, + "step": 71380 + }, + { + "epoch": 17.802992518703242, + "grad_norm": 16.87145233154297, + "learning_rate": 2.2042394014962593e-06, + "loss": 0.3586, + "step": 71390 + }, + { + "epoch": 17.805486284289277, + "grad_norm": 9.270705223083496, + "learning_rate": 2.2017456359102246e-06, + "loss": 0.2885, + "step": 71400 + }, + { + "epoch": 17.80798004987531, + "grad_norm": 8.402549743652344, + "learning_rate": 2.1992518703241895e-06, + "loss": 0.2626, + "step": 71410 + }, + { + "epoch": 17.810473815461346, + "grad_norm": 5.3837738037109375, + "learning_rate": 2.196758104738155e-06, + "loss": 0.3185, + "step": 71420 + }, + { + "epoch": 17.81296758104738, + "grad_norm": 9.095513343811035, + "learning_rate": 2.1942643391521197e-06, + "loss": 0.3384, + "step": 71430 + }, + { + "epoch": 17.815461346633416, + "grad_norm": 15.026154518127441, + "learning_rate": 2.191770573566085e-06, + "loss": 0.3384, + "step": 71440 + }, + { + "epoch": 17.81795511221945, + "grad_norm": 9.241385459899902, + "learning_rate": 2.18927680798005e-06, + "loss": 0.3365, + "step": 71450 + }, + { + "epoch": 17.820448877805486, + "grad_norm": 8.417593002319336, + "learning_rate": 2.1867830423940152e-06, + "loss": 0.3111, + "step": 71460 + }, + { + "epoch": 17.82294264339152, + "grad_norm": 6.55775260925293, + "learning_rate": 2.18428927680798e-06, + "loss": 0.3183, + "step": 71470 + }, + { + "epoch": 17.825436408977556, + "grad_norm": 5.751813888549805, + "learning_rate": 2.181795511221945e-06, + "loss": 0.248, + "step": 71480 + }, + { + "epoch": 17.82793017456359, + "grad_norm": 8.865701675415039, + "learning_rate": 2.1793017456359103e-06, + "loss": 0.3105, + "step": 71490 + }, + { + "epoch": 17.830423940149625, + "grad_norm": 8.64411449432373, + "learning_rate": 2.176807980049875e-06, + "loss": 0.3189, + "step": 71500 + }, + { + "epoch": 17.83291770573566, + "grad_norm": 10.088348388671875, + "learning_rate": 2.1743142144638405e-06, + "loss": 0.3171, + "step": 71510 + }, + { + "epoch": 17.835411471321695, + "grad_norm": 8.263297080993652, + "learning_rate": 2.171820448877806e-06, + "loss": 0.2926, + "step": 71520 + }, + { + "epoch": 17.83790523690773, + "grad_norm": 14.566165924072266, + "learning_rate": 2.1693266832917707e-06, + "loss": 0.3174, + "step": 71530 + }, + { + "epoch": 17.840399002493765, + "grad_norm": 7.461530685424805, + "learning_rate": 2.166832917705736e-06, + "loss": 0.3255, + "step": 71540 + }, + { + "epoch": 17.8428927680798, + "grad_norm": 5.400354385375977, + "learning_rate": 2.164339152119701e-06, + "loss": 0.263, + "step": 71550 + }, + { + "epoch": 17.845386533665835, + "grad_norm": 9.746865272521973, + "learning_rate": 2.1618453865336662e-06, + "loss": 0.3657, + "step": 71560 + }, + { + "epoch": 17.84788029925187, + "grad_norm": 7.5796380043029785, + "learning_rate": 2.159351620947631e-06, + "loss": 0.3111, + "step": 71570 + }, + { + "epoch": 17.850374064837904, + "grad_norm": 11.697222709655762, + "learning_rate": 2.1568578553615964e-06, + "loss": 0.2908, + "step": 71580 + }, + { + "epoch": 17.85286783042394, + "grad_norm": 6.474806308746338, + "learning_rate": 2.1543640897755613e-06, + "loss": 0.285, + "step": 71590 + }, + { + "epoch": 17.855361596009974, + "grad_norm": 9.604869842529297, + "learning_rate": 2.1518703241895266e-06, + "loss": 0.3314, + "step": 71600 + }, + { + "epoch": 17.85785536159601, + "grad_norm": 7.376259803771973, + "learning_rate": 2.1493765586034915e-06, + "loss": 0.257, + "step": 71610 + }, + { + "epoch": 17.860349127182044, + "grad_norm": 8.758426666259766, + "learning_rate": 2.1468827930174564e-06, + "loss": 0.3672, + "step": 71620 + }, + { + "epoch": 17.86284289276808, + "grad_norm": 8.611882209777832, + "learning_rate": 2.1443890274314217e-06, + "loss": 0.3306, + "step": 71630 + }, + { + "epoch": 17.865336658354114, + "grad_norm": 10.081382751464844, + "learning_rate": 2.1418952618453866e-06, + "loss": 0.3223, + "step": 71640 + }, + { + "epoch": 17.86783042394015, + "grad_norm": 11.713033676147461, + "learning_rate": 2.139401496259352e-06, + "loss": 0.3424, + "step": 71650 + }, + { + "epoch": 17.870324189526183, + "grad_norm": 8.003156661987305, + "learning_rate": 2.1369077306733168e-06, + "loss": 0.327, + "step": 71660 + }, + { + "epoch": 17.872817955112218, + "grad_norm": 9.68770694732666, + "learning_rate": 2.134413965087282e-06, + "loss": 0.3158, + "step": 71670 + }, + { + "epoch": 17.875311720698253, + "grad_norm": 4.524538516998291, + "learning_rate": 2.131920199501247e-06, + "loss": 0.3178, + "step": 71680 + }, + { + "epoch": 17.877805486284288, + "grad_norm": 8.126871109008789, + "learning_rate": 2.1294264339152123e-06, + "loss": 0.3448, + "step": 71690 + }, + { + "epoch": 17.880299251870323, + "grad_norm": 8.234221458435059, + "learning_rate": 2.126932668329177e-06, + "loss": 0.3894, + "step": 71700 + }, + { + "epoch": 17.882793017456358, + "grad_norm": 8.27507209777832, + "learning_rate": 2.124438902743142e-06, + "loss": 0.3331, + "step": 71710 + }, + { + "epoch": 17.885286783042392, + "grad_norm": 9.675071716308594, + "learning_rate": 2.1219451371571074e-06, + "loss": 0.3486, + "step": 71720 + }, + { + "epoch": 17.887780548628427, + "grad_norm": 7.789792537689209, + "learning_rate": 2.1194513715710722e-06, + "loss": 0.3582, + "step": 71730 + }, + { + "epoch": 17.890274314214462, + "grad_norm": 17.104684829711914, + "learning_rate": 2.1169576059850376e-06, + "loss": 0.3253, + "step": 71740 + }, + { + "epoch": 17.892768079800497, + "grad_norm": 10.092325210571289, + "learning_rate": 2.1144638403990024e-06, + "loss": 0.3024, + "step": 71750 + }, + { + "epoch": 17.895261845386532, + "grad_norm": 7.519723415374756, + "learning_rate": 2.1119700748129678e-06, + "loss": 0.3506, + "step": 71760 + }, + { + "epoch": 17.897755610972567, + "grad_norm": 9.717748641967773, + "learning_rate": 2.109476309226933e-06, + "loss": 0.2639, + "step": 71770 + }, + { + "epoch": 17.900249376558605, + "grad_norm": 7.854363441467285, + "learning_rate": 2.106982543640898e-06, + "loss": 0.3382, + "step": 71780 + }, + { + "epoch": 17.902743142144637, + "grad_norm": 6.1837158203125, + "learning_rate": 2.1044887780548633e-06, + "loss": 0.328, + "step": 71790 + }, + { + "epoch": 17.905236907730675, + "grad_norm": 9.821382522583008, + "learning_rate": 2.101995012468828e-06, + "loss": 0.3765, + "step": 71800 + }, + { + "epoch": 17.90773067331671, + "grad_norm": 8.692037582397461, + "learning_rate": 2.0995012468827935e-06, + "loss": 0.3051, + "step": 71810 + }, + { + "epoch": 17.910224438902745, + "grad_norm": 8.943056106567383, + "learning_rate": 2.0970074812967583e-06, + "loss": 0.339, + "step": 71820 + }, + { + "epoch": 17.91271820448878, + "grad_norm": 8.475810050964355, + "learning_rate": 2.0945137157107232e-06, + "loss": 0.3219, + "step": 71830 + }, + { + "epoch": 17.915211970074814, + "grad_norm": 13.694457054138184, + "learning_rate": 2.0920199501246885e-06, + "loss": 0.2674, + "step": 71840 + }, + { + "epoch": 17.91770573566085, + "grad_norm": 15.57154655456543, + "learning_rate": 2.0895261845386534e-06, + "loss": 0.3422, + "step": 71850 + }, + { + "epoch": 17.920199501246884, + "grad_norm": 7.596667289733887, + "learning_rate": 2.0870324189526187e-06, + "loss": 0.352, + "step": 71860 + }, + { + "epoch": 17.92269326683292, + "grad_norm": 7.299078941345215, + "learning_rate": 2.0845386533665836e-06, + "loss": 0.3473, + "step": 71870 + }, + { + "epoch": 17.925187032418954, + "grad_norm": 8.171175003051758, + "learning_rate": 2.082044887780549e-06, + "loss": 0.3868, + "step": 71880 + }, + { + "epoch": 17.92768079800499, + "grad_norm": 27.5074405670166, + "learning_rate": 2.079551122194514e-06, + "loss": 0.3894, + "step": 71890 + }, + { + "epoch": 17.930174563591024, + "grad_norm": 10.98017692565918, + "learning_rate": 2.077057356608479e-06, + "loss": 0.2976, + "step": 71900 + }, + { + "epoch": 17.93266832917706, + "grad_norm": 10.127460479736328, + "learning_rate": 2.074563591022444e-06, + "loss": 0.3238, + "step": 71910 + }, + { + "epoch": 17.935162094763093, + "grad_norm": 6.212870121002197, + "learning_rate": 2.0720698254364093e-06, + "loss": 0.3211, + "step": 71920 + }, + { + "epoch": 17.93765586034913, + "grad_norm": 8.205334663391113, + "learning_rate": 2.069576059850374e-06, + "loss": 0.3003, + "step": 71930 + }, + { + "epoch": 17.940149625935163, + "grad_norm": 8.863457679748535, + "learning_rate": 2.067082294264339e-06, + "loss": 0.3392, + "step": 71940 + }, + { + "epoch": 17.942643391521198, + "grad_norm": 8.939859390258789, + "learning_rate": 2.0645885286783044e-06, + "loss": 0.2978, + "step": 71950 + }, + { + "epoch": 17.945137157107233, + "grad_norm": 12.467900276184082, + "learning_rate": 2.0620947630922693e-06, + "loss": 0.3426, + "step": 71960 + }, + { + "epoch": 17.947630922693268, + "grad_norm": 9.684351921081543, + "learning_rate": 2.0596009975062346e-06, + "loss": 0.3648, + "step": 71970 + }, + { + "epoch": 17.950124688279303, + "grad_norm": 7.206076622009277, + "learning_rate": 2.0571072319201995e-06, + "loss": 0.3311, + "step": 71980 + }, + { + "epoch": 17.952618453865338, + "grad_norm": 8.474637985229492, + "learning_rate": 2.054613466334165e-06, + "loss": 0.2923, + "step": 71990 + }, + { + "epoch": 17.955112219451372, + "grad_norm": 5.08350944519043, + "learning_rate": 2.0521197007481297e-06, + "loss": 0.2908, + "step": 72000 + }, + { + "epoch": 17.957605985037407, + "grad_norm": 8.969040870666504, + "learning_rate": 2.049625935162095e-06, + "loss": 0.3462, + "step": 72010 + }, + { + "epoch": 17.960099750623442, + "grad_norm": 6.145038604736328, + "learning_rate": 2.0471321695760603e-06, + "loss": 0.3551, + "step": 72020 + }, + { + "epoch": 17.962593516209477, + "grad_norm": 6.762228012084961, + "learning_rate": 2.044638403990025e-06, + "loss": 0.3054, + "step": 72030 + }, + { + "epoch": 17.965087281795512, + "grad_norm": 8.712302207946777, + "learning_rate": 2.0421446384039905e-06, + "loss": 0.3249, + "step": 72040 + }, + { + "epoch": 17.967581047381547, + "grad_norm": 8.052441596984863, + "learning_rate": 2.0396508728179554e-06, + "loss": 0.2844, + "step": 72050 + }, + { + "epoch": 17.97007481296758, + "grad_norm": 11.43187427520752, + "learning_rate": 2.0371571072319203e-06, + "loss": 0.3358, + "step": 72060 + }, + { + "epoch": 17.972568578553616, + "grad_norm": 11.102864265441895, + "learning_rate": 2.0346633416458856e-06, + "loss": 0.3596, + "step": 72070 + }, + { + "epoch": 17.97506234413965, + "grad_norm": 9.357625961303711, + "learning_rate": 2.0321695760598505e-06, + "loss": 0.4051, + "step": 72080 + }, + { + "epoch": 17.977556109725686, + "grad_norm": 10.954850196838379, + "learning_rate": 2.0296758104738158e-06, + "loss": 0.3903, + "step": 72090 + }, + { + "epoch": 17.98004987531172, + "grad_norm": 7.4843645095825195, + "learning_rate": 2.0271820448877807e-06, + "loss": 0.3329, + "step": 72100 + }, + { + "epoch": 17.982543640897756, + "grad_norm": 6.2788286209106445, + "learning_rate": 2.024688279301746e-06, + "loss": 0.3574, + "step": 72110 + }, + { + "epoch": 17.98503740648379, + "grad_norm": 8.560807228088379, + "learning_rate": 2.022194513715711e-06, + "loss": 0.384, + "step": 72120 + }, + { + "epoch": 17.987531172069826, + "grad_norm": 6.629812717437744, + "learning_rate": 2.019700748129676e-06, + "loss": 0.3336, + "step": 72130 + }, + { + "epoch": 17.99002493765586, + "grad_norm": 7.1960859298706055, + "learning_rate": 2.017206982543641e-06, + "loss": 0.3042, + "step": 72140 + }, + { + "epoch": 17.992518703241895, + "grad_norm": 12.617613792419434, + "learning_rate": 2.014713216957606e-06, + "loss": 0.303, + "step": 72150 + }, + { + "epoch": 17.99501246882793, + "grad_norm": 8.579712867736816, + "learning_rate": 2.0122194513715712e-06, + "loss": 0.3387, + "step": 72160 + }, + { + "epoch": 17.997506234413965, + "grad_norm": 7.062646865844727, + "learning_rate": 2.009725685785536e-06, + "loss": 0.3563, + "step": 72170 + }, + { + "epoch": 18.0, + "grad_norm": 9.439546585083008, + "learning_rate": 2.0072319201995014e-06, + "loss": 0.3308, + "step": 72180 + }, + { + "epoch": 18.0, + "eval_loss": 0.4180074632167816, + "eval_runtime": 60.175, + "eval_samples_per_second": 16.668, + "eval_steps_per_second": 16.668, + "step": 72180 + }, + { + "epoch": 18.002493765586035, + "grad_norm": 12.682416915893555, + "learning_rate": 2.0047381546134663e-06, + "loss": 0.315, + "step": 72190 + }, + { + "epoch": 18.00498753117207, + "grad_norm": 6.629142761230469, + "learning_rate": 2.0022443890274316e-06, + "loss": 0.2881, + "step": 72200 + }, + { + "epoch": 18.007481296758105, + "grad_norm": 11.43194580078125, + "learning_rate": 1.9997506234413965e-06, + "loss": 0.3674, + "step": 72210 + }, + { + "epoch": 18.00997506234414, + "grad_norm": 10.104514122009277, + "learning_rate": 1.997256857855362e-06, + "loss": 0.3005, + "step": 72220 + }, + { + "epoch": 18.012468827930174, + "grad_norm": 10.51015853881836, + "learning_rate": 1.9947630922693267e-06, + "loss": 0.3445, + "step": 72230 + }, + { + "epoch": 18.01496259351621, + "grad_norm": 7.913299083709717, + "learning_rate": 1.9922693266832916e-06, + "loss": 0.3416, + "step": 72240 + }, + { + "epoch": 18.017456359102244, + "grad_norm": 10.231574058532715, + "learning_rate": 1.989775561097257e-06, + "loss": 0.2745, + "step": 72250 + }, + { + "epoch": 18.01995012468828, + "grad_norm": 7.487124919891357, + "learning_rate": 1.9872817955112222e-06, + "loss": 0.3448, + "step": 72260 + }, + { + "epoch": 18.022443890274314, + "grad_norm": 8.64004898071289, + "learning_rate": 1.984788029925187e-06, + "loss": 0.3579, + "step": 72270 + }, + { + "epoch": 18.02493765586035, + "grad_norm": 10.440835952758789, + "learning_rate": 1.9822942643391524e-06, + "loss": 0.2995, + "step": 72280 + }, + { + "epoch": 18.027431421446384, + "grad_norm": 7.366189956665039, + "learning_rate": 1.9798004987531173e-06, + "loss": 0.276, + "step": 72290 + }, + { + "epoch": 18.02992518703242, + "grad_norm": 3.2743232250213623, + "learning_rate": 1.9773067331670826e-06, + "loss": 0.2778, + "step": 72300 + }, + { + "epoch": 18.032418952618453, + "grad_norm": 5.090819358825684, + "learning_rate": 1.9748129675810475e-06, + "loss": 0.296, + "step": 72310 + }, + { + "epoch": 18.034912718204488, + "grad_norm": 9.35080337524414, + "learning_rate": 1.972319201995013e-06, + "loss": 0.2632, + "step": 72320 + }, + { + "epoch": 18.037406483790523, + "grad_norm": 7.471287250518799, + "learning_rate": 1.9698254364089777e-06, + "loss": 0.3306, + "step": 72330 + }, + { + "epoch": 18.039900249376558, + "grad_norm": 8.039196014404297, + "learning_rate": 1.967331670822943e-06, + "loss": 0.3021, + "step": 72340 + }, + { + "epoch": 18.042394014962593, + "grad_norm": 8.003158569335938, + "learning_rate": 1.964837905236908e-06, + "loss": 0.3735, + "step": 72350 + }, + { + "epoch": 18.044887780548628, + "grad_norm": 7.758488178253174, + "learning_rate": 1.962344139650873e-06, + "loss": 0.3305, + "step": 72360 + }, + { + "epoch": 18.047381546134662, + "grad_norm": 9.10808277130127, + "learning_rate": 1.959850374064838e-06, + "loss": 0.3393, + "step": 72370 + }, + { + "epoch": 18.049875311720697, + "grad_norm": 6.801268100738525, + "learning_rate": 1.957356608478803e-06, + "loss": 0.2978, + "step": 72380 + }, + { + "epoch": 18.052369077306732, + "grad_norm": 7.056491851806641, + "learning_rate": 1.9548628428927683e-06, + "loss": 0.3622, + "step": 72390 + }, + { + "epoch": 18.054862842892767, + "grad_norm": 8.197697639465332, + "learning_rate": 1.952369077306733e-06, + "loss": 0.3272, + "step": 72400 + }, + { + "epoch": 18.057356608478802, + "grad_norm": 9.100240707397461, + "learning_rate": 1.9498753117206985e-06, + "loss": 0.2909, + "step": 72410 + }, + { + "epoch": 18.059850374064837, + "grad_norm": 9.016037940979004, + "learning_rate": 1.9473815461346634e-06, + "loss": 0.3508, + "step": 72420 + }, + { + "epoch": 18.06234413965087, + "grad_norm": 11.260120391845703, + "learning_rate": 1.9448877805486287e-06, + "loss": 0.3345, + "step": 72430 + }, + { + "epoch": 18.064837905236907, + "grad_norm": 8.461393356323242, + "learning_rate": 1.9423940149625936e-06, + "loss": 0.2626, + "step": 72440 + }, + { + "epoch": 18.06733167082294, + "grad_norm": 7.152003765106201, + "learning_rate": 1.939900249376559e-06, + "loss": 0.3907, + "step": 72450 + }, + { + "epoch": 18.069825436408976, + "grad_norm": 7.978116512298584, + "learning_rate": 1.9374064837905238e-06, + "loss": 0.3898, + "step": 72460 + }, + { + "epoch": 18.07231920199501, + "grad_norm": 6.364957332611084, + "learning_rate": 1.9349127182044886e-06, + "loss": 0.2275, + "step": 72470 + }, + { + "epoch": 18.074812967581046, + "grad_norm": 8.800970077514648, + "learning_rate": 1.932418952618454e-06, + "loss": 0.253, + "step": 72480 + }, + { + "epoch": 18.07730673316708, + "grad_norm": 5.478081226348877, + "learning_rate": 1.929925187032419e-06, + "loss": 0.3061, + "step": 72490 + }, + { + "epoch": 18.079800498753116, + "grad_norm": 6.752138614654541, + "learning_rate": 1.927431421446384e-06, + "loss": 0.3113, + "step": 72500 + }, + { + "epoch": 18.08229426433915, + "grad_norm": 10.513050079345703, + "learning_rate": 1.9249376558603495e-06, + "loss": 0.2699, + "step": 72510 + }, + { + "epoch": 18.084788029925186, + "grad_norm": 8.9693021774292, + "learning_rate": 1.9224438902743144e-06, + "loss": 0.3346, + "step": 72520 + }, + { + "epoch": 18.08728179551122, + "grad_norm": 7.213470458984375, + "learning_rate": 1.9199501246882797e-06, + "loss": 0.306, + "step": 72530 + }, + { + "epoch": 18.089775561097255, + "grad_norm": 9.323833465576172, + "learning_rate": 1.9174563591022445e-06, + "loss": 0.3102, + "step": 72540 + }, + { + "epoch": 18.09226932668329, + "grad_norm": 12.904609680175781, + "learning_rate": 1.91496259351621e-06, + "loss": 0.3915, + "step": 72550 + }, + { + "epoch": 18.094763092269325, + "grad_norm": 7.509303092956543, + "learning_rate": 1.9124688279301747e-06, + "loss": 0.2904, + "step": 72560 + }, + { + "epoch": 18.09725685785536, + "grad_norm": 7.649249076843262, + "learning_rate": 1.90997506234414e-06, + "loss": 0.2865, + "step": 72570 + }, + { + "epoch": 18.099750623441395, + "grad_norm": 11.983275413513184, + "learning_rate": 1.907481296758105e-06, + "loss": 0.3197, + "step": 72580 + }, + { + "epoch": 18.102244389027433, + "grad_norm": 10.603513717651367, + "learning_rate": 1.90498753117207e-06, + "loss": 0.3186, + "step": 72590 + }, + { + "epoch": 18.104738154613468, + "grad_norm": 8.808806419372559, + "learning_rate": 1.9024937655860351e-06, + "loss": 0.3585, + "step": 72600 + }, + { + "epoch": 18.107231920199503, + "grad_norm": 8.247468948364258, + "learning_rate": 1.9000000000000002e-06, + "loss": 0.3955, + "step": 72610 + }, + { + "epoch": 18.109725685785538, + "grad_norm": 10.441139221191406, + "learning_rate": 1.8975062344139653e-06, + "loss": 0.355, + "step": 72620 + }, + { + "epoch": 18.112219451371573, + "grad_norm": 6.936132431030273, + "learning_rate": 1.8950124688279304e-06, + "loss": 0.3121, + "step": 72630 + }, + { + "epoch": 18.114713216957608, + "grad_norm": 7.062928676605225, + "learning_rate": 1.8925187032418953e-06, + "loss": 0.2723, + "step": 72640 + }, + { + "epoch": 18.117206982543642, + "grad_norm": 6.312658786773682, + "learning_rate": 1.8900249376558604e-06, + "loss": 0.2906, + "step": 72650 + }, + { + "epoch": 18.119700748129677, + "grad_norm": 8.172001838684082, + "learning_rate": 1.8877805486284289e-06, + "loss": 0.3375, + "step": 72660 + }, + { + "epoch": 18.122194513715712, + "grad_norm": 9.601921081542969, + "learning_rate": 1.885286783042394e-06, + "loss": 0.365, + "step": 72670 + }, + { + "epoch": 18.124688279301747, + "grad_norm": 7.265673637390137, + "learning_rate": 1.882793017456359e-06, + "loss": 0.2651, + "step": 72680 + }, + { + "epoch": 18.127182044887782, + "grad_norm": 9.785295486450195, + "learning_rate": 1.8802992518703244e-06, + "loss": 0.3108, + "step": 72690 + }, + { + "epoch": 18.129675810473817, + "grad_norm": 10.865586280822754, + "learning_rate": 1.8778054862842895e-06, + "loss": 0.2968, + "step": 72700 + }, + { + "epoch": 18.13216957605985, + "grad_norm": 5.767469882965088, + "learning_rate": 1.8753117206982546e-06, + "loss": 0.2573, + "step": 72710 + }, + { + "epoch": 18.134663341645886, + "grad_norm": 9.119926452636719, + "learning_rate": 1.8728179551122197e-06, + "loss": 0.336, + "step": 72720 + }, + { + "epoch": 18.13715710723192, + "grad_norm": 11.801755905151367, + "learning_rate": 1.8703241895261848e-06, + "loss": 0.3547, + "step": 72730 + }, + { + "epoch": 18.139650872817956, + "grad_norm": 8.99133014678955, + "learning_rate": 1.8678304239401499e-06, + "loss": 0.3888, + "step": 72740 + }, + { + "epoch": 18.14214463840399, + "grad_norm": 9.552180290222168, + "learning_rate": 1.865336658354115e-06, + "loss": 0.3176, + "step": 72750 + }, + { + "epoch": 18.144638403990026, + "grad_norm": 9.201723098754883, + "learning_rate": 1.86284289276808e-06, + "loss": 0.2666, + "step": 72760 + }, + { + "epoch": 18.14713216957606, + "grad_norm": 9.10981273651123, + "learning_rate": 1.8603491271820452e-06, + "loss": 0.2701, + "step": 72770 + }, + { + "epoch": 18.149625935162096, + "grad_norm": 5.472398281097412, + "learning_rate": 1.85785536159601e-06, + "loss": 0.3164, + "step": 72780 + }, + { + "epoch": 18.15211970074813, + "grad_norm": 10.77421760559082, + "learning_rate": 1.8553615960099752e-06, + "loss": 0.3434, + "step": 72790 + }, + { + "epoch": 18.154613466334165, + "grad_norm": 7.737188339233398, + "learning_rate": 1.8528678304239403e-06, + "loss": 0.2856, + "step": 72800 + }, + { + "epoch": 18.1571072319202, + "grad_norm": 10.025762557983398, + "learning_rate": 1.8503740648379054e-06, + "loss": 0.3094, + "step": 72810 + }, + { + "epoch": 18.159600997506235, + "grad_norm": 12.608803749084473, + "learning_rate": 1.8478802992518705e-06, + "loss": 0.3368, + "step": 72820 + }, + { + "epoch": 18.16209476309227, + "grad_norm": 17.044055938720703, + "learning_rate": 1.8453865336658356e-06, + "loss": 0.3715, + "step": 72830 + }, + { + "epoch": 18.164588528678305, + "grad_norm": 8.355241775512695, + "learning_rate": 1.8428927680798007e-06, + "loss": 0.329, + "step": 72840 + }, + { + "epoch": 18.16708229426434, + "grad_norm": 8.667624473571777, + "learning_rate": 1.8403990024937658e-06, + "loss": 0.3585, + "step": 72850 + }, + { + "epoch": 18.169576059850375, + "grad_norm": 9.941221237182617, + "learning_rate": 1.8379052369077309e-06, + "loss": 0.376, + "step": 72860 + }, + { + "epoch": 18.17206982543641, + "grad_norm": 8.516250610351562, + "learning_rate": 1.8354114713216957e-06, + "loss": 0.2555, + "step": 72870 + }, + { + "epoch": 18.174563591022444, + "grad_norm": 9.351677894592285, + "learning_rate": 1.8329177057356608e-06, + "loss": 0.312, + "step": 72880 + }, + { + "epoch": 18.17705735660848, + "grad_norm": 10.373080253601074, + "learning_rate": 1.830423940149626e-06, + "loss": 0.3313, + "step": 72890 + }, + { + "epoch": 18.179551122194514, + "grad_norm": 6.510061264038086, + "learning_rate": 1.827930174563591e-06, + "loss": 0.2775, + "step": 72900 + }, + { + "epoch": 18.18204488778055, + "grad_norm": 7.710317611694336, + "learning_rate": 1.8254364089775561e-06, + "loss": 0.3265, + "step": 72910 + }, + { + "epoch": 18.184538653366584, + "grad_norm": 9.246587753295898, + "learning_rate": 1.8229426433915212e-06, + "loss": 0.3061, + "step": 72920 + }, + { + "epoch": 18.18703241895262, + "grad_norm": 8.690896987915039, + "learning_rate": 1.8204488778054865e-06, + "loss": 0.3098, + "step": 72930 + }, + { + "epoch": 18.189526184538654, + "grad_norm": 4.682961463928223, + "learning_rate": 1.8179551122194516e-06, + "loss": 0.2886, + "step": 72940 + }, + { + "epoch": 18.19201995012469, + "grad_norm": 5.759263038635254, + "learning_rate": 1.8154613466334167e-06, + "loss": 0.2854, + "step": 72950 + }, + { + "epoch": 18.194513715710723, + "grad_norm": 7.35156774520874, + "learning_rate": 1.8129675810473818e-06, + "loss": 0.3615, + "step": 72960 + }, + { + "epoch": 18.197007481296758, + "grad_norm": 7.8746819496154785, + "learning_rate": 1.810473815461347e-06, + "loss": 0.3827, + "step": 72970 + }, + { + "epoch": 18.199501246882793, + "grad_norm": 10.189519882202148, + "learning_rate": 1.807980049875312e-06, + "loss": 0.3083, + "step": 72980 + }, + { + "epoch": 18.201995012468828, + "grad_norm": 8.979336738586426, + "learning_rate": 1.8054862842892771e-06, + "loss": 0.366, + "step": 72990 + }, + { + "epoch": 18.204488778054863, + "grad_norm": 6.781293869018555, + "learning_rate": 1.802992518703242e-06, + "loss": 0.348, + "step": 73000 + }, + { + "epoch": 18.206982543640898, + "grad_norm": 9.620187759399414, + "learning_rate": 1.8004987531172071e-06, + "loss": 0.3721, + "step": 73010 + }, + { + "epoch": 18.209476309226932, + "grad_norm": 8.865036964416504, + "learning_rate": 1.7980049875311722e-06, + "loss": 0.3411, + "step": 73020 + }, + { + "epoch": 18.211970074812967, + "grad_norm": 9.20656681060791, + "learning_rate": 1.7955112219451373e-06, + "loss": 0.3595, + "step": 73030 + }, + { + "epoch": 18.214463840399002, + "grad_norm": 11.974329948425293, + "learning_rate": 1.7930174563591024e-06, + "loss": 0.3608, + "step": 73040 + }, + { + "epoch": 18.216957605985037, + "grad_norm": 13.708403587341309, + "learning_rate": 1.7905236907730675e-06, + "loss": 0.3302, + "step": 73050 + }, + { + "epoch": 18.219451371571072, + "grad_norm": 8.671035766601562, + "learning_rate": 1.7880299251870326e-06, + "loss": 0.2768, + "step": 73060 + }, + { + "epoch": 18.221945137157107, + "grad_norm": 9.130983352661133, + "learning_rate": 1.7855361596009977e-06, + "loss": 0.3167, + "step": 73070 + }, + { + "epoch": 18.22443890274314, + "grad_norm": 9.449166297912598, + "learning_rate": 1.7830423940149628e-06, + "loss": 0.3409, + "step": 73080 + }, + { + "epoch": 18.226932668329177, + "grad_norm": 8.308408737182617, + "learning_rate": 1.7805486284289277e-06, + "loss": 0.2954, + "step": 73090 + }, + { + "epoch": 18.22942643391521, + "grad_norm": 11.468031883239746, + "learning_rate": 1.7780548628428928e-06, + "loss": 0.3183, + "step": 73100 + }, + { + "epoch": 18.231920199501246, + "grad_norm": 10.764180183410645, + "learning_rate": 1.7755610972568579e-06, + "loss": 0.3812, + "step": 73110 + }, + { + "epoch": 18.23441396508728, + "grad_norm": 8.669768333435059, + "learning_rate": 1.773067331670823e-06, + "loss": 0.3437, + "step": 73120 + }, + { + "epoch": 18.236907730673316, + "grad_norm": 8.164244651794434, + "learning_rate": 1.770573566084788e-06, + "loss": 0.2734, + "step": 73130 + }, + { + "epoch": 18.23940149625935, + "grad_norm": 6.5684943199157715, + "learning_rate": 1.7680798004987532e-06, + "loss": 0.3716, + "step": 73140 + }, + { + "epoch": 18.241895261845386, + "grad_norm": 8.023477554321289, + "learning_rate": 1.7655860349127183e-06, + "loss": 0.3461, + "step": 73150 + }, + { + "epoch": 18.24438902743142, + "grad_norm": 6.442568302154541, + "learning_rate": 1.7630922693266834e-06, + "loss": 0.3287, + "step": 73160 + }, + { + "epoch": 18.246882793017456, + "grad_norm": 10.371484756469727, + "learning_rate": 1.7605985037406485e-06, + "loss": 0.2998, + "step": 73170 + }, + { + "epoch": 18.24937655860349, + "grad_norm": 10.30907917022705, + "learning_rate": 1.7581047381546138e-06, + "loss": 0.3161, + "step": 73180 + }, + { + "epoch": 18.251870324189525, + "grad_norm": 11.629962921142578, + "learning_rate": 1.7556109725685789e-06, + "loss": 0.2857, + "step": 73190 + }, + { + "epoch": 18.25436408977556, + "grad_norm": 10.187129020690918, + "learning_rate": 1.753117206982544e-06, + "loss": 0.3485, + "step": 73200 + }, + { + "epoch": 18.256857855361595, + "grad_norm": 13.606742858886719, + "learning_rate": 1.750623441396509e-06, + "loss": 0.3258, + "step": 73210 + }, + { + "epoch": 18.25935162094763, + "grad_norm": 7.9029860496521, + "learning_rate": 1.748129675810474e-06, + "loss": 0.264, + "step": 73220 + }, + { + "epoch": 18.261845386533665, + "grad_norm": 6.414330959320068, + "learning_rate": 1.745635910224439e-06, + "loss": 0.3278, + "step": 73230 + }, + { + "epoch": 18.2643391521197, + "grad_norm": 4.603034019470215, + "learning_rate": 1.7431421446384042e-06, + "loss": 0.304, + "step": 73240 + }, + { + "epoch": 18.266832917705734, + "grad_norm": 9.417990684509277, + "learning_rate": 1.7406483790523692e-06, + "loss": 0.3797, + "step": 73250 + }, + { + "epoch": 18.26932668329177, + "grad_norm": 10.289613723754883, + "learning_rate": 1.7381546134663343e-06, + "loss": 0.4033, + "step": 73260 + }, + { + "epoch": 18.271820448877804, + "grad_norm": 18.14396858215332, + "learning_rate": 1.7356608478802994e-06, + "loss": 0.2768, + "step": 73270 + }, + { + "epoch": 18.27431421446384, + "grad_norm": 11.941176414489746, + "learning_rate": 1.7331670822942645e-06, + "loss": 0.431, + "step": 73280 + }, + { + "epoch": 18.276807980049874, + "grad_norm": 9.569537162780762, + "learning_rate": 1.7306733167082296e-06, + "loss": 0.2749, + "step": 73290 + }, + { + "epoch": 18.27930174563591, + "grad_norm": 7.719649791717529, + "learning_rate": 1.7281795511221947e-06, + "loss": 0.3432, + "step": 73300 + }, + { + "epoch": 18.281795511221944, + "grad_norm": 6.728184223175049, + "learning_rate": 1.7256857855361596e-06, + "loss": 0.2864, + "step": 73310 + }, + { + "epoch": 18.28428927680798, + "grad_norm": 7.6080708503723145, + "learning_rate": 1.7231920199501247e-06, + "loss": 0.3257, + "step": 73320 + }, + { + "epoch": 18.286783042394013, + "grad_norm": 8.406044006347656, + "learning_rate": 1.7206982543640898e-06, + "loss": 0.3528, + "step": 73330 + }, + { + "epoch": 18.28927680798005, + "grad_norm": 11.21180534362793, + "learning_rate": 1.718204488778055e-06, + "loss": 0.3166, + "step": 73340 + }, + { + "epoch": 18.291770573566083, + "grad_norm": 12.678295135498047, + "learning_rate": 1.71571072319202e-06, + "loss": 0.3484, + "step": 73350 + }, + { + "epoch": 18.294264339152118, + "grad_norm": 13.616421699523926, + "learning_rate": 1.7132169576059851e-06, + "loss": 0.3677, + "step": 73360 + }, + { + "epoch": 18.296758104738153, + "grad_norm": 8.696226119995117, + "learning_rate": 1.7107231920199502e-06, + "loss": 0.3082, + "step": 73370 + }, + { + "epoch": 18.29925187032419, + "grad_norm": 10.46649169921875, + "learning_rate": 1.7082294264339153e-06, + "loss": 0.3416, + "step": 73380 + }, + { + "epoch": 18.301745635910226, + "grad_norm": 5.726923942565918, + "learning_rate": 1.7057356608478804e-06, + "loss": 0.3065, + "step": 73390 + }, + { + "epoch": 18.30423940149626, + "grad_norm": 12.030797004699707, + "learning_rate": 1.7032418952618455e-06, + "loss": 0.328, + "step": 73400 + }, + { + "epoch": 18.306733167082296, + "grad_norm": 8.201338768005371, + "learning_rate": 1.7007481296758104e-06, + "loss": 0.3321, + "step": 73410 + }, + { + "epoch": 18.30922693266833, + "grad_norm": 9.528154373168945, + "learning_rate": 1.6982543640897755e-06, + "loss": 0.2487, + "step": 73420 + }, + { + "epoch": 18.311720698254366, + "grad_norm": 8.872268676757812, + "learning_rate": 1.695760598503741e-06, + "loss": 0.3468, + "step": 73430 + }, + { + "epoch": 18.3142144638404, + "grad_norm": 6.192471027374268, + "learning_rate": 1.693266832917706e-06, + "loss": 0.2942, + "step": 73440 + }, + { + "epoch": 18.316708229426435, + "grad_norm": 8.524904251098633, + "learning_rate": 1.690773067331671e-06, + "loss": 0.3305, + "step": 73450 + }, + { + "epoch": 18.31920199501247, + "grad_norm": 6.720682621002197, + "learning_rate": 1.688279301745636e-06, + "loss": 0.3114, + "step": 73460 + }, + { + "epoch": 18.321695760598505, + "grad_norm": 8.790159225463867, + "learning_rate": 1.6857855361596012e-06, + "loss": 0.3181, + "step": 73470 + }, + { + "epoch": 18.32418952618454, + "grad_norm": 6.875416278839111, + "learning_rate": 1.6832917705735663e-06, + "loss": 0.3115, + "step": 73480 + }, + { + "epoch": 18.326683291770575, + "grad_norm": 8.09900188446045, + "learning_rate": 1.6807980049875314e-06, + "loss": 0.3607, + "step": 73490 + }, + { + "epoch": 18.32917705735661, + "grad_norm": 8.227255821228027, + "learning_rate": 1.6783042394014965e-06, + "loss": 0.3341, + "step": 73500 + }, + { + "epoch": 18.331670822942645, + "grad_norm": 11.627192497253418, + "learning_rate": 1.6758104738154616e-06, + "loss": 0.3915, + "step": 73510 + }, + { + "epoch": 18.33416458852868, + "grad_norm": 11.05119800567627, + "learning_rate": 1.6733167082294267e-06, + "loss": 0.3357, + "step": 73520 + }, + { + "epoch": 18.336658354114714, + "grad_norm": 9.547077178955078, + "learning_rate": 1.6708229426433918e-06, + "loss": 0.3351, + "step": 73530 + }, + { + "epoch": 18.33915211970075, + "grad_norm": 10.876405715942383, + "learning_rate": 1.6683291770573567e-06, + "loss": 0.3085, + "step": 73540 + }, + { + "epoch": 18.341645885286784, + "grad_norm": 5.926941394805908, + "learning_rate": 1.6658354114713218e-06, + "loss": 0.2994, + "step": 73550 + }, + { + "epoch": 18.34413965087282, + "grad_norm": 9.938848495483398, + "learning_rate": 1.6633416458852869e-06, + "loss": 0.3295, + "step": 73560 + }, + { + "epoch": 18.346633416458854, + "grad_norm": 9.622965812683105, + "learning_rate": 1.660847880299252e-06, + "loss": 0.2535, + "step": 73570 + }, + { + "epoch": 18.34912718204489, + "grad_norm": 7.86170768737793, + "learning_rate": 1.658354114713217e-06, + "loss": 0.3655, + "step": 73580 + }, + { + "epoch": 18.351620947630924, + "grad_norm": 11.039167404174805, + "learning_rate": 1.6558603491271822e-06, + "loss": 0.3227, + "step": 73590 + }, + { + "epoch": 18.35411471321696, + "grad_norm": 9.131935119628906, + "learning_rate": 1.6533665835411473e-06, + "loss": 0.2782, + "step": 73600 + }, + { + "epoch": 18.356608478802993, + "grad_norm": 7.852529048919678, + "learning_rate": 1.6508728179551124e-06, + "loss": 0.2881, + "step": 73610 + }, + { + "epoch": 18.359102244389028, + "grad_norm": 6.67081356048584, + "learning_rate": 1.6483790523690775e-06, + "loss": 0.3281, + "step": 73620 + }, + { + "epoch": 18.361596009975063, + "grad_norm": 9.186594009399414, + "learning_rate": 1.6458852867830423e-06, + "loss": 0.3104, + "step": 73630 + }, + { + "epoch": 18.364089775561098, + "grad_norm": 10.543354988098145, + "learning_rate": 1.6433915211970074e-06, + "loss": 0.2973, + "step": 73640 + }, + { + "epoch": 18.366583541147133, + "grad_norm": 8.135270118713379, + "learning_rate": 1.6408977556109725e-06, + "loss": 0.3834, + "step": 73650 + }, + { + "epoch": 18.369077306733168, + "grad_norm": 8.696205139160156, + "learning_rate": 1.6384039900249376e-06, + "loss": 0.317, + "step": 73660 + }, + { + "epoch": 18.371571072319203, + "grad_norm": 8.284186363220215, + "learning_rate": 1.6359102244389027e-06, + "loss": 0.3482, + "step": 73670 + }, + { + "epoch": 18.374064837905237, + "grad_norm": 8.957206726074219, + "learning_rate": 1.633416458852868e-06, + "loss": 0.3858, + "step": 73680 + }, + { + "epoch": 18.376558603491272, + "grad_norm": 11.914627075195312, + "learning_rate": 1.6309226932668331e-06, + "loss": 0.335, + "step": 73690 + }, + { + "epoch": 18.379052369077307, + "grad_norm": 5.708254814147949, + "learning_rate": 1.6284289276807982e-06, + "loss": 0.365, + "step": 73700 + }, + { + "epoch": 18.381546134663342, + "grad_norm": 9.29157829284668, + "learning_rate": 1.6259351620947633e-06, + "loss": 0.29, + "step": 73710 + }, + { + "epoch": 18.384039900249377, + "grad_norm": 6.766468048095703, + "learning_rate": 1.6234413965087284e-06, + "loss": 0.2897, + "step": 73720 + }, + { + "epoch": 18.38653366583541, + "grad_norm": 7.3969621658325195, + "learning_rate": 1.6209476309226935e-06, + "loss": 0.3482, + "step": 73730 + }, + { + "epoch": 18.389027431421447, + "grad_norm": 6.613345623016357, + "learning_rate": 1.6184538653366586e-06, + "loss": 0.3101, + "step": 73740 + }, + { + "epoch": 18.39152119700748, + "grad_norm": 7.1980881690979, + "learning_rate": 1.6159600997506237e-06, + "loss": 0.2709, + "step": 73750 + }, + { + "epoch": 18.394014962593516, + "grad_norm": 12.448799133300781, + "learning_rate": 1.6134663341645886e-06, + "loss": 0.2926, + "step": 73760 + }, + { + "epoch": 18.39650872817955, + "grad_norm": 9.95907974243164, + "learning_rate": 1.6109725685785537e-06, + "loss": 0.303, + "step": 73770 + }, + { + "epoch": 18.399002493765586, + "grad_norm": 8.818389892578125, + "learning_rate": 1.6084788029925188e-06, + "loss": 0.2965, + "step": 73780 + }, + { + "epoch": 18.40149625935162, + "grad_norm": 6.657719612121582, + "learning_rate": 1.605985037406484e-06, + "loss": 0.3098, + "step": 73790 + }, + { + "epoch": 18.403990024937656, + "grad_norm": 8.187033653259277, + "learning_rate": 1.603491271820449e-06, + "loss": 0.2981, + "step": 73800 + }, + { + "epoch": 18.40648379052369, + "grad_norm": 8.11359977722168, + "learning_rate": 1.600997506234414e-06, + "loss": 0.3991, + "step": 73810 + }, + { + "epoch": 18.408977556109726, + "grad_norm": 5.73296594619751, + "learning_rate": 1.5985037406483792e-06, + "loss": 0.3214, + "step": 73820 + }, + { + "epoch": 18.41147132169576, + "grad_norm": 7.935540199279785, + "learning_rate": 1.5960099750623443e-06, + "loss": 0.3434, + "step": 73830 + }, + { + "epoch": 18.413965087281795, + "grad_norm": 6.838698863983154, + "learning_rate": 1.5935162094763094e-06, + "loss": 0.2924, + "step": 73840 + }, + { + "epoch": 18.41645885286783, + "grad_norm": 7.937646389007568, + "learning_rate": 1.5910224438902743e-06, + "loss": 0.2674, + "step": 73850 + }, + { + "epoch": 18.418952618453865, + "grad_norm": 8.589198112487793, + "learning_rate": 1.5885286783042394e-06, + "loss": 0.3232, + "step": 73860 + }, + { + "epoch": 18.4214463840399, + "grad_norm": 6.257779598236084, + "learning_rate": 1.5860349127182045e-06, + "loss": 0.2764, + "step": 73870 + }, + { + "epoch": 18.423940149625935, + "grad_norm": 9.784756660461426, + "learning_rate": 1.5835411471321696e-06, + "loss": 0.3288, + "step": 73880 + }, + { + "epoch": 18.42643391521197, + "grad_norm": 7.663664817810059, + "learning_rate": 1.5810473815461347e-06, + "loss": 0.2703, + "step": 73890 + }, + { + "epoch": 18.428927680798004, + "grad_norm": 7.855859756469727, + "learning_rate": 1.5785536159600998e-06, + "loss": 0.2777, + "step": 73900 + }, + { + "epoch": 18.43142144638404, + "grad_norm": 8.598638534545898, + "learning_rate": 1.5760598503740649e-06, + "loss": 0.3726, + "step": 73910 + }, + { + "epoch": 18.433915211970074, + "grad_norm": 12.342257499694824, + "learning_rate": 1.5735660847880302e-06, + "loss": 0.3803, + "step": 73920 + }, + { + "epoch": 18.43640897755611, + "grad_norm": 10.327555656433105, + "learning_rate": 1.5710723192019953e-06, + "loss": 0.324, + "step": 73930 + }, + { + "epoch": 18.438902743142144, + "grad_norm": 10.554641723632812, + "learning_rate": 1.5685785536159604e-06, + "loss": 0.2346, + "step": 73940 + }, + { + "epoch": 18.44139650872818, + "grad_norm": 10.217893600463867, + "learning_rate": 1.5660847880299255e-06, + "loss": 0.3869, + "step": 73950 + }, + { + "epoch": 18.443890274314214, + "grad_norm": 9.727376937866211, + "learning_rate": 1.5635910224438906e-06, + "loss": 0.3377, + "step": 73960 + }, + { + "epoch": 18.44638403990025, + "grad_norm": 8.947395324707031, + "learning_rate": 1.5610972568578557e-06, + "loss": 0.3863, + "step": 73970 + }, + { + "epoch": 18.448877805486283, + "grad_norm": 8.704399108886719, + "learning_rate": 1.5586034912718206e-06, + "loss": 0.3594, + "step": 73980 + }, + { + "epoch": 18.45137157107232, + "grad_norm": 7.907713413238525, + "learning_rate": 1.5561097256857857e-06, + "loss": 0.3043, + "step": 73990 + }, + { + "epoch": 18.453865336658353, + "grad_norm": 11.05506420135498, + "learning_rate": 1.5536159600997507e-06, + "loss": 0.307, + "step": 74000 + }, + { + "epoch": 18.456359102244388, + "grad_norm": 10.79157829284668, + "learning_rate": 1.5511221945137158e-06, + "loss": 0.3657, + "step": 74010 + }, + { + "epoch": 18.458852867830423, + "grad_norm": 9.126988410949707, + "learning_rate": 1.548628428927681e-06, + "loss": 0.2559, + "step": 74020 + }, + { + "epoch": 18.461346633416458, + "grad_norm": 15.90583324432373, + "learning_rate": 1.546134663341646e-06, + "loss": 0.387, + "step": 74030 + }, + { + "epoch": 18.463840399002493, + "grad_norm": 7.69087028503418, + "learning_rate": 1.5436408977556111e-06, + "loss": 0.3183, + "step": 74040 + }, + { + "epoch": 18.466334164588527, + "grad_norm": 10.746506690979004, + "learning_rate": 1.5411471321695762e-06, + "loss": 0.3444, + "step": 74050 + }, + { + "epoch": 18.468827930174562, + "grad_norm": 10.501643180847168, + "learning_rate": 1.5386533665835413e-06, + "loss": 0.3714, + "step": 74060 + }, + { + "epoch": 18.471321695760597, + "grad_norm": 5.846063137054443, + "learning_rate": 1.5361596009975062e-06, + "loss": 0.3788, + "step": 74070 + }, + { + "epoch": 18.473815461346632, + "grad_norm": 5.375149726867676, + "learning_rate": 1.5336658354114713e-06, + "loss": 0.2884, + "step": 74080 + }, + { + "epoch": 18.476309226932667, + "grad_norm": 8.232563972473145, + "learning_rate": 1.5311720698254364e-06, + "loss": 0.3549, + "step": 74090 + }, + { + "epoch": 18.478802992518702, + "grad_norm": 7.693140983581543, + "learning_rate": 1.5286783042394015e-06, + "loss": 0.3765, + "step": 74100 + }, + { + "epoch": 18.481296758104737, + "grad_norm": 8.750802040100098, + "learning_rate": 1.5261845386533666e-06, + "loss": 0.2949, + "step": 74110 + }, + { + "epoch": 18.48379052369077, + "grad_norm": 7.751341819763184, + "learning_rate": 1.5236907730673317e-06, + "loss": 0.3237, + "step": 74120 + }, + { + "epoch": 18.486284289276806, + "grad_norm": 8.8473482131958, + "learning_rate": 1.5211970074812968e-06, + "loss": 0.2869, + "step": 74130 + }, + { + "epoch": 18.48877805486284, + "grad_norm": 8.20888900756836, + "learning_rate": 1.518703241895262e-06, + "loss": 0.3399, + "step": 74140 + }, + { + "epoch": 18.491271820448876, + "grad_norm": 8.531835556030273, + "learning_rate": 1.516209476309227e-06, + "loss": 0.2784, + "step": 74150 + }, + { + "epoch": 18.49376558603491, + "grad_norm": 13.030848503112793, + "learning_rate": 1.513715710723192e-06, + "loss": 0.3464, + "step": 74160 + }, + { + "epoch": 18.496259351620946, + "grad_norm": 9.676426887512207, + "learning_rate": 1.5112219451371574e-06, + "loss": 0.3156, + "step": 74170 + }, + { + "epoch": 18.49875311720698, + "grad_norm": 5.307429313659668, + "learning_rate": 1.5087281795511225e-06, + "loss": 0.3121, + "step": 74180 + }, + { + "epoch": 18.50124688279302, + "grad_norm": 8.61061954498291, + "learning_rate": 1.5062344139650876e-06, + "loss": 0.4055, + "step": 74190 + }, + { + "epoch": 18.503740648379054, + "grad_norm": 7.381380558013916, + "learning_rate": 1.5037406483790525e-06, + "loss": 0.3907, + "step": 74200 + }, + { + "epoch": 18.50623441396509, + "grad_norm": 10.964561462402344, + "learning_rate": 1.5012468827930176e-06, + "loss": 0.3313, + "step": 74210 + }, + { + "epoch": 18.508728179551124, + "grad_norm": 8.3394136428833, + "learning_rate": 1.4987531172069827e-06, + "loss": 0.2691, + "step": 74220 + }, + { + "epoch": 18.51122194513716, + "grad_norm": 7.458098411560059, + "learning_rate": 1.4962593516209478e-06, + "loss": 0.307, + "step": 74230 + }, + { + "epoch": 18.513715710723194, + "grad_norm": 12.269042015075684, + "learning_rate": 1.4937655860349129e-06, + "loss": 0.299, + "step": 74240 + }, + { + "epoch": 18.51620947630923, + "grad_norm": 5.956721305847168, + "learning_rate": 1.491271820448878e-06, + "loss": 0.3303, + "step": 74250 + }, + { + "epoch": 18.518703241895263, + "grad_norm": 8.427618980407715, + "learning_rate": 1.488778054862843e-06, + "loss": 0.3256, + "step": 74260 + }, + { + "epoch": 18.521197007481298, + "grad_norm": 7.890618801116943, + "learning_rate": 1.4862842892768082e-06, + "loss": 0.3507, + "step": 74270 + }, + { + "epoch": 18.523690773067333, + "grad_norm": 10.247663497924805, + "learning_rate": 1.4837905236907733e-06, + "loss": 0.345, + "step": 74280 + }, + { + "epoch": 18.526184538653368, + "grad_norm": 11.79736614227295, + "learning_rate": 1.4812967581047384e-06, + "loss": 0.3271, + "step": 74290 + }, + { + "epoch": 18.528678304239403, + "grad_norm": 7.097879409790039, + "learning_rate": 1.4788029925187033e-06, + "loss": 0.2992, + "step": 74300 + }, + { + "epoch": 18.531172069825438, + "grad_norm": 9.422330856323242, + "learning_rate": 1.4763092269326684e-06, + "loss": 0.3164, + "step": 74310 + }, + { + "epoch": 18.533665835411473, + "grad_norm": 8.391748428344727, + "learning_rate": 1.4738154613466335e-06, + "loss": 0.3513, + "step": 74320 + }, + { + "epoch": 18.536159600997507, + "grad_norm": 12.254179954528809, + "learning_rate": 1.4713216957605986e-06, + "loss": 0.3117, + "step": 74330 + }, + { + "epoch": 18.538653366583542, + "grad_norm": 7.639408588409424, + "learning_rate": 1.4688279301745637e-06, + "loss": 0.2665, + "step": 74340 + }, + { + "epoch": 18.541147132169577, + "grad_norm": 13.742928504943848, + "learning_rate": 1.4663341645885288e-06, + "loss": 0.2916, + "step": 74350 + }, + { + "epoch": 18.543640897755612, + "grad_norm": 8.131094932556152, + "learning_rate": 1.4638403990024939e-06, + "loss": 0.3436, + "step": 74360 + }, + { + "epoch": 18.546134663341647, + "grad_norm": 8.458292961120605, + "learning_rate": 1.461346633416459e-06, + "loss": 0.3361, + "step": 74370 + }, + { + "epoch": 18.54862842892768, + "grad_norm": 7.697943210601807, + "learning_rate": 1.458852867830424e-06, + "loss": 0.2833, + "step": 74380 + }, + { + "epoch": 18.551122194513717, + "grad_norm": 11.632762908935547, + "learning_rate": 1.456359102244389e-06, + "loss": 0.2625, + "step": 74390 + }, + { + "epoch": 18.55361596009975, + "grad_norm": 8.714035034179688, + "learning_rate": 1.453865336658354e-06, + "loss": 0.3138, + "step": 74400 + }, + { + "epoch": 18.556109725685786, + "grad_norm": 10.529500961303711, + "learning_rate": 1.4513715710723191e-06, + "loss": 0.4054, + "step": 74410 + }, + { + "epoch": 18.55860349127182, + "grad_norm": 7.038276672363281, + "learning_rate": 1.4488778054862844e-06, + "loss": 0.318, + "step": 74420 + }, + { + "epoch": 18.561097256857856, + "grad_norm": 8.944663047790527, + "learning_rate": 1.4463840399002495e-06, + "loss": 0.3261, + "step": 74430 + }, + { + "epoch": 18.56359102244389, + "grad_norm": 6.774261474609375, + "learning_rate": 1.4438902743142146e-06, + "loss": 0.3315, + "step": 74440 + }, + { + "epoch": 18.566084788029926, + "grad_norm": 8.720541954040527, + "learning_rate": 1.4413965087281797e-06, + "loss": 0.3715, + "step": 74450 + }, + { + "epoch": 18.56857855361596, + "grad_norm": 9.304561614990234, + "learning_rate": 1.4389027431421448e-06, + "loss": 0.2913, + "step": 74460 + }, + { + "epoch": 18.571072319201996, + "grad_norm": 6.90985631942749, + "learning_rate": 1.43640897755611e-06, + "loss": 0.3142, + "step": 74470 + }, + { + "epoch": 18.57356608478803, + "grad_norm": 6.733709335327148, + "learning_rate": 1.433915211970075e-06, + "loss": 0.3388, + "step": 74480 + }, + { + "epoch": 18.576059850374065, + "grad_norm": 10.039854049682617, + "learning_rate": 1.4314214463840401e-06, + "loss": 0.3232, + "step": 74490 + }, + { + "epoch": 18.5785536159601, + "grad_norm": 9.462594032287598, + "learning_rate": 1.4289276807980052e-06, + "loss": 0.3591, + "step": 74500 + }, + { + "epoch": 18.581047381546135, + "grad_norm": 9.804282188415527, + "learning_rate": 1.4264339152119703e-06, + "loss": 0.3105, + "step": 74510 + }, + { + "epoch": 18.58354114713217, + "grad_norm": 11.309243202209473, + "learning_rate": 1.4239401496259352e-06, + "loss": 0.3292, + "step": 74520 + }, + { + "epoch": 18.586034912718205, + "grad_norm": 13.264601707458496, + "learning_rate": 1.4214463840399003e-06, + "loss": 0.3184, + "step": 74530 + }, + { + "epoch": 18.58852867830424, + "grad_norm": 8.009174346923828, + "learning_rate": 1.4189526184538654e-06, + "loss": 0.3198, + "step": 74540 + }, + { + "epoch": 18.591022443890274, + "grad_norm": 9.68816089630127, + "learning_rate": 1.4164588528678305e-06, + "loss": 0.2911, + "step": 74550 + }, + { + "epoch": 18.59351620947631, + "grad_norm": 15.687336921691895, + "learning_rate": 1.4139650872817956e-06, + "loss": 0.2892, + "step": 74560 + }, + { + "epoch": 18.596009975062344, + "grad_norm": 11.459478378295898, + "learning_rate": 1.4114713216957607e-06, + "loss": 0.361, + "step": 74570 + }, + { + "epoch": 18.59850374064838, + "grad_norm": 12.541219711303711, + "learning_rate": 1.4089775561097258e-06, + "loss": 0.3387, + "step": 74580 + }, + { + "epoch": 18.600997506234414, + "grad_norm": 8.407682418823242, + "learning_rate": 1.4064837905236909e-06, + "loss": 0.366, + "step": 74590 + }, + { + "epoch": 18.60349127182045, + "grad_norm": 9.860024452209473, + "learning_rate": 1.403990024937656e-06, + "loss": 0.3266, + "step": 74600 + }, + { + "epoch": 18.605985037406484, + "grad_norm": 13.05792236328125, + "learning_rate": 1.4014962593516209e-06, + "loss": 0.3454, + "step": 74610 + }, + { + "epoch": 18.60847880299252, + "grad_norm": 9.172198295593262, + "learning_rate": 1.399002493765586e-06, + "loss": 0.3367, + "step": 74620 + }, + { + "epoch": 18.610972568578553, + "grad_norm": 9.569443702697754, + "learning_rate": 1.396508728179551e-06, + "loss": 0.2912, + "step": 74630 + }, + { + "epoch": 18.61346633416459, + "grad_norm": 6.067528247833252, + "learning_rate": 1.3940149625935162e-06, + "loss": 0.2704, + "step": 74640 + }, + { + "epoch": 18.615960099750623, + "grad_norm": 12.0790433883667, + "learning_rate": 1.3915211970074813e-06, + "loss": 0.2749, + "step": 74650 + }, + { + "epoch": 18.618453865336658, + "grad_norm": 8.685276985168457, + "learning_rate": 1.3890274314214464e-06, + "loss": 0.2838, + "step": 74660 + }, + { + "epoch": 18.620947630922693, + "grad_norm": 8.051512718200684, + "learning_rate": 1.3865336658354117e-06, + "loss": 0.2934, + "step": 74670 + }, + { + "epoch": 18.623441396508728, + "grad_norm": 6.928664684295654, + "learning_rate": 1.3840399002493768e-06, + "loss": 0.2874, + "step": 74680 + }, + { + "epoch": 18.625935162094763, + "grad_norm": 5.946801662445068, + "learning_rate": 1.3815461346633419e-06, + "loss": 0.2963, + "step": 74690 + }, + { + "epoch": 18.628428927680797, + "grad_norm": 9.436132431030273, + "learning_rate": 1.379052369077307e-06, + "loss": 0.3327, + "step": 74700 + }, + { + "epoch": 18.630922693266832, + "grad_norm": 6.325564384460449, + "learning_rate": 1.376558603491272e-06, + "loss": 0.313, + "step": 74710 + }, + { + "epoch": 18.633416458852867, + "grad_norm": 5.90425443649292, + "learning_rate": 1.3740648379052372e-06, + "loss": 0.2912, + "step": 74720 + }, + { + "epoch": 18.635910224438902, + "grad_norm": 6.267174243927002, + "learning_rate": 1.3715710723192023e-06, + "loss": 0.3487, + "step": 74730 + }, + { + "epoch": 18.638403990024937, + "grad_norm": 6.616288185119629, + "learning_rate": 1.3690773067331672e-06, + "loss": 0.3146, + "step": 74740 + }, + { + "epoch": 18.640897755610972, + "grad_norm": 8.680601119995117, + "learning_rate": 1.3665835411471322e-06, + "loss": 0.3009, + "step": 74750 + }, + { + "epoch": 18.643391521197007, + "grad_norm": 12.029298782348633, + "learning_rate": 1.3640897755610973e-06, + "loss": 0.328, + "step": 74760 + }, + { + "epoch": 18.64588528678304, + "grad_norm": 6.389432907104492, + "learning_rate": 1.3615960099750624e-06, + "loss": 0.3464, + "step": 74770 + }, + { + "epoch": 18.648379052369076, + "grad_norm": 6.558785915374756, + "learning_rate": 1.3591022443890275e-06, + "loss": 0.2935, + "step": 74780 + }, + { + "epoch": 18.65087281795511, + "grad_norm": 8.406317710876465, + "learning_rate": 1.3566084788029926e-06, + "loss": 0.3135, + "step": 74790 + }, + { + "epoch": 18.653366583541146, + "grad_norm": 11.625472068786621, + "learning_rate": 1.3541147132169577e-06, + "loss": 0.3286, + "step": 74800 + }, + { + "epoch": 18.65586034912718, + "grad_norm": 7.691009044647217, + "learning_rate": 1.3516209476309228e-06, + "loss": 0.2337, + "step": 74810 + }, + { + "epoch": 18.658354114713216, + "grad_norm": 8.831992149353027, + "learning_rate": 1.349127182044888e-06, + "loss": 0.393, + "step": 74820 + }, + { + "epoch": 18.66084788029925, + "grad_norm": 10.607884407043457, + "learning_rate": 1.3466334164588528e-06, + "loss": 0.2846, + "step": 74830 + }, + { + "epoch": 18.663341645885286, + "grad_norm": 12.452134132385254, + "learning_rate": 1.344139650872818e-06, + "loss": 0.391, + "step": 74840 + }, + { + "epoch": 18.66583541147132, + "grad_norm": 7.653696537017822, + "learning_rate": 1.341645885286783e-06, + "loss": 0.2668, + "step": 74850 + }, + { + "epoch": 18.668329177057355, + "grad_norm": 4.616998195648193, + "learning_rate": 1.3391521197007481e-06, + "loss": 0.2979, + "step": 74860 + }, + { + "epoch": 18.67082294264339, + "grad_norm": 8.645465850830078, + "learning_rate": 1.3366583541147132e-06, + "loss": 0.3351, + "step": 74870 + }, + { + "epoch": 18.673316708229425, + "grad_norm": 11.444828987121582, + "learning_rate": 1.3341645885286783e-06, + "loss": 0.2851, + "step": 74880 + }, + { + "epoch": 18.67581047381546, + "grad_norm": 8.822684288024902, + "learning_rate": 1.3316708229426434e-06, + "loss": 0.3381, + "step": 74890 + }, + { + "epoch": 18.678304239401495, + "grad_norm": 6.625451564788818, + "learning_rate": 1.3291770573566085e-06, + "loss": 0.3038, + "step": 74900 + }, + { + "epoch": 18.68079800498753, + "grad_norm": 9.063295364379883, + "learning_rate": 1.3269326683291772e-06, + "loss": 0.2841, + "step": 74910 + }, + { + "epoch": 18.683291770573565, + "grad_norm": 10.658787727355957, + "learning_rate": 1.3244389027431423e-06, + "loss": 0.2965, + "step": 74920 + }, + { + "epoch": 18.6857855361596, + "grad_norm": 10.322769165039062, + "learning_rate": 1.3219451371571074e-06, + "loss": 0.3753, + "step": 74930 + }, + { + "epoch": 18.688279301745634, + "grad_norm": 5.894896030426025, + "learning_rate": 1.3194513715710725e-06, + "loss": 0.351, + "step": 74940 + }, + { + "epoch": 18.69077306733167, + "grad_norm": 11.83723258972168, + "learning_rate": 1.3169576059850376e-06, + "loss": 0.3247, + "step": 74950 + }, + { + "epoch": 18.693266832917704, + "grad_norm": 8.492645263671875, + "learning_rate": 1.314713216957606e-06, + "loss": 0.3656, + "step": 74960 + }, + { + "epoch": 18.69576059850374, + "grad_norm": 11.175936698913574, + "learning_rate": 1.3122194513715712e-06, + "loss": 0.348, + "step": 74970 + }, + { + "epoch": 18.698254364089777, + "grad_norm": 7.240017890930176, + "learning_rate": 1.309725685785536e-06, + "loss": 0.2931, + "step": 74980 + }, + { + "epoch": 18.70074812967581, + "grad_norm": 8.305476188659668, + "learning_rate": 1.3072319201995012e-06, + "loss": 0.3615, + "step": 74990 + }, + { + "epoch": 18.703241895261847, + "grad_norm": 12.025943756103516, + "learning_rate": 1.3047381546134663e-06, + "loss": 0.3484, + "step": 75000 + }, + { + "epoch": 18.705735660847882, + "grad_norm": 9.331719398498535, + "learning_rate": 1.3022443890274316e-06, + "loss": 0.3532, + "step": 75010 + }, + { + "epoch": 18.708229426433917, + "grad_norm": 11.926877975463867, + "learning_rate": 1.2997506234413967e-06, + "loss": 0.3444, + "step": 75020 + }, + { + "epoch": 18.71072319201995, + "grad_norm": 7.448859214782715, + "learning_rate": 1.2972568578553618e-06, + "loss": 0.2879, + "step": 75030 + }, + { + "epoch": 18.713216957605987, + "grad_norm": 11.024309158325195, + "learning_rate": 1.2947630922693269e-06, + "loss": 0.34, + "step": 75040 + }, + { + "epoch": 18.71571072319202, + "grad_norm": 6.629947185516357, + "learning_rate": 1.292269326683292e-06, + "loss": 0.3714, + "step": 75050 + }, + { + "epoch": 18.718204488778056, + "grad_norm": 7.899852275848389, + "learning_rate": 1.289775561097257e-06, + "loss": 0.3046, + "step": 75060 + }, + { + "epoch": 18.72069825436409, + "grad_norm": 8.794214248657227, + "learning_rate": 1.2872817955112222e-06, + "loss": 0.3517, + "step": 75070 + }, + { + "epoch": 18.723192019950126, + "grad_norm": 6.43766450881958, + "learning_rate": 1.2847880299251873e-06, + "loss": 0.3165, + "step": 75080 + }, + { + "epoch": 18.72568578553616, + "grad_norm": 6.629555702209473, + "learning_rate": 1.2822942643391523e-06, + "loss": 0.3499, + "step": 75090 + }, + { + "epoch": 18.728179551122196, + "grad_norm": 11.044578552246094, + "learning_rate": 1.2798004987531174e-06, + "loss": 0.3129, + "step": 75100 + }, + { + "epoch": 18.73067331670823, + "grad_norm": 9.229756355285645, + "learning_rate": 1.2773067331670823e-06, + "loss": 0.3121, + "step": 75110 + }, + { + "epoch": 18.733167082294266, + "grad_norm": 7.941111087799072, + "learning_rate": 1.2748129675810474e-06, + "loss": 0.4249, + "step": 75120 + }, + { + "epoch": 18.7356608478803, + "grad_norm": 9.484469413757324, + "learning_rate": 1.2723192019950125e-06, + "loss": 0.2897, + "step": 75130 + }, + { + "epoch": 18.738154613466335, + "grad_norm": 7.1095075607299805, + "learning_rate": 1.2698254364089776e-06, + "loss": 0.3664, + "step": 75140 + }, + { + "epoch": 18.74064837905237, + "grad_norm": 6.883621692657471, + "learning_rate": 1.2673316708229427e-06, + "loss": 0.2501, + "step": 75150 + }, + { + "epoch": 18.743142144638405, + "grad_norm": 8.864466667175293, + "learning_rate": 1.2648379052369078e-06, + "loss": 0.3146, + "step": 75160 + }, + { + "epoch": 18.74563591022444, + "grad_norm": 7.686639308929443, + "learning_rate": 1.262344139650873e-06, + "loss": 0.2874, + "step": 75170 + }, + { + "epoch": 18.748129675810475, + "grad_norm": 7.733112812042236, + "learning_rate": 1.259850374064838e-06, + "loss": 0.3358, + "step": 75180 + }, + { + "epoch": 18.75062344139651, + "grad_norm": 8.906079292297363, + "learning_rate": 1.2573566084788031e-06, + "loss": 0.3112, + "step": 75190 + }, + { + "epoch": 18.753117206982544, + "grad_norm": 11.127494812011719, + "learning_rate": 1.254862842892768e-06, + "loss": 0.3105, + "step": 75200 + }, + { + "epoch": 18.75561097256858, + "grad_norm": 8.906811714172363, + "learning_rate": 1.252369077306733e-06, + "loss": 0.3047, + "step": 75210 + }, + { + "epoch": 18.758104738154614, + "grad_norm": 6.8946027755737305, + "learning_rate": 1.2498753117206984e-06, + "loss": 0.3289, + "step": 75220 + }, + { + "epoch": 18.76059850374065, + "grad_norm": 8.487861633300781, + "learning_rate": 1.2473815461346635e-06, + "loss": 0.4042, + "step": 75230 + }, + { + "epoch": 18.763092269326684, + "grad_norm": 8.7031831741333, + "learning_rate": 1.2448877805486286e-06, + "loss": 0.3045, + "step": 75240 + }, + { + "epoch": 18.76558603491272, + "grad_norm": 8.133673667907715, + "learning_rate": 1.2423940149625937e-06, + "loss": 0.324, + "step": 75250 + }, + { + "epoch": 18.768079800498754, + "grad_norm": 8.087395668029785, + "learning_rate": 1.2399002493765588e-06, + "loss": 0.3004, + "step": 75260 + }, + { + "epoch": 18.77057356608479, + "grad_norm": 8.729161262512207, + "learning_rate": 1.2374064837905237e-06, + "loss": 0.3408, + "step": 75270 + }, + { + "epoch": 18.773067331670823, + "grad_norm": 9.859537124633789, + "learning_rate": 1.2349127182044888e-06, + "loss": 0.3635, + "step": 75280 + }, + { + "epoch": 18.77556109725686, + "grad_norm": 8.983450889587402, + "learning_rate": 1.2324189526184539e-06, + "loss": 0.2727, + "step": 75290 + }, + { + "epoch": 18.778054862842893, + "grad_norm": 7.337526798248291, + "learning_rate": 1.229925187032419e-06, + "loss": 0.2853, + "step": 75300 + }, + { + "epoch": 18.780548628428928, + "grad_norm": 9.413741111755371, + "learning_rate": 1.227431421446384e-06, + "loss": 0.3259, + "step": 75310 + }, + { + "epoch": 18.783042394014963, + "grad_norm": 8.917171478271484, + "learning_rate": 1.2249376558603494e-06, + "loss": 0.3087, + "step": 75320 + }, + { + "epoch": 18.785536159600998, + "grad_norm": 9.118664741516113, + "learning_rate": 1.2224438902743143e-06, + "loss": 0.3468, + "step": 75330 + }, + { + "epoch": 18.788029925187033, + "grad_norm": 11.285893440246582, + "learning_rate": 1.2199501246882794e-06, + "loss": 0.3248, + "step": 75340 + }, + { + "epoch": 18.790523690773068, + "grad_norm": 8.557662963867188, + "learning_rate": 1.2174563591022445e-06, + "loss": 0.3836, + "step": 75350 + }, + { + "epoch": 18.793017456359102, + "grad_norm": 9.565607070922852, + "learning_rate": 1.2149625935162096e-06, + "loss": 0.3616, + "step": 75360 + }, + { + "epoch": 18.795511221945137, + "grad_norm": 7.839760780334473, + "learning_rate": 1.2124688279301747e-06, + "loss": 0.4556, + "step": 75370 + }, + { + "epoch": 18.798004987531172, + "grad_norm": 8.867552757263184, + "learning_rate": 1.2099750623441398e-06, + "loss": 0.3541, + "step": 75380 + }, + { + "epoch": 18.800498753117207, + "grad_norm": 8.26724910736084, + "learning_rate": 1.2074812967581049e-06, + "loss": 0.3567, + "step": 75390 + }, + { + "epoch": 18.802992518703242, + "grad_norm": 11.374317169189453, + "learning_rate": 1.20498753117207e-06, + "loss": 0.2463, + "step": 75400 + }, + { + "epoch": 18.805486284289277, + "grad_norm": 6.54574728012085, + "learning_rate": 1.202493765586035e-06, + "loss": 0.3166, + "step": 75410 + }, + { + "epoch": 18.80798004987531, + "grad_norm": 6.992059230804443, + "learning_rate": 1.2000000000000002e-06, + "loss": 0.3017, + "step": 75420 + }, + { + "epoch": 18.810473815461346, + "grad_norm": 6.233072757720947, + "learning_rate": 1.197506234413965e-06, + "loss": 0.3673, + "step": 75430 + }, + { + "epoch": 18.81296758104738, + "grad_norm": 7.485711097717285, + "learning_rate": 1.1950124688279301e-06, + "loss": 0.2989, + "step": 75440 + }, + { + "epoch": 18.815461346633416, + "grad_norm": 9.747830390930176, + "learning_rate": 1.1925187032418955e-06, + "loss": 0.325, + "step": 75450 + }, + { + "epoch": 18.81795511221945, + "grad_norm": 6.727051734924316, + "learning_rate": 1.1900249376558605e-06, + "loss": 0.3339, + "step": 75460 + }, + { + "epoch": 18.820448877805486, + "grad_norm": 7.6110968589782715, + "learning_rate": 1.1875311720698256e-06, + "loss": 0.2869, + "step": 75470 + }, + { + "epoch": 18.82294264339152, + "grad_norm": 9.239736557006836, + "learning_rate": 1.1850374064837907e-06, + "loss": 0.2949, + "step": 75480 + }, + { + "epoch": 18.825436408977556, + "grad_norm": 11.853121757507324, + "learning_rate": 1.1825436408977556e-06, + "loss": 0.3425, + "step": 75490 + }, + { + "epoch": 18.82793017456359, + "grad_norm": 9.181052207946777, + "learning_rate": 1.1800498753117207e-06, + "loss": 0.3708, + "step": 75500 + }, + { + "epoch": 18.830423940149625, + "grad_norm": 13.224994659423828, + "learning_rate": 1.1775561097256858e-06, + "loss": 0.3853, + "step": 75510 + }, + { + "epoch": 18.83291770573566, + "grad_norm": 8.818222045898438, + "learning_rate": 1.175062344139651e-06, + "loss": 0.3206, + "step": 75520 + }, + { + "epoch": 18.835411471321695, + "grad_norm": 10.314600944519043, + "learning_rate": 1.172568578553616e-06, + "loss": 0.3437, + "step": 75530 + }, + { + "epoch": 18.83790523690773, + "grad_norm": 9.383597373962402, + "learning_rate": 1.1700748129675811e-06, + "loss": 0.4307, + "step": 75540 + }, + { + "epoch": 18.840399002493765, + "grad_norm": 7.339836120605469, + "learning_rate": 1.1675810473815462e-06, + "loss": 0.3421, + "step": 75550 + }, + { + "epoch": 18.8428927680798, + "grad_norm": 8.67222785949707, + "learning_rate": 1.1650872817955113e-06, + "loss": 0.2738, + "step": 75560 + }, + { + "epoch": 18.845386533665835, + "grad_norm": 12.440013885498047, + "learning_rate": 1.1625935162094764e-06, + "loss": 0.3523, + "step": 75570 + }, + { + "epoch": 18.84788029925187, + "grad_norm": 8.511222839355469, + "learning_rate": 1.1600997506234415e-06, + "loss": 0.3205, + "step": 75580 + }, + { + "epoch": 18.850374064837904, + "grad_norm": 10.684443473815918, + "learning_rate": 1.1576059850374066e-06, + "loss": 0.3092, + "step": 75590 + }, + { + "epoch": 18.85286783042394, + "grad_norm": 8.839110374450684, + "learning_rate": 1.1551122194513717e-06, + "loss": 0.3351, + "step": 75600 + }, + { + "epoch": 18.855361596009974, + "grad_norm": 9.381013870239258, + "learning_rate": 1.1526184538653368e-06, + "loss": 0.2935, + "step": 75610 + }, + { + "epoch": 18.85785536159601, + "grad_norm": 10.822402954101562, + "learning_rate": 1.150124688279302e-06, + "loss": 0.3281, + "step": 75620 + }, + { + "epoch": 18.860349127182044, + "grad_norm": 8.065682411193848, + "learning_rate": 1.147630922693267e-06, + "loss": 0.336, + "step": 75630 + }, + { + "epoch": 18.86284289276808, + "grad_norm": 8.760025978088379, + "learning_rate": 1.145137157107232e-06, + "loss": 0.285, + "step": 75640 + }, + { + "epoch": 18.865336658354114, + "grad_norm": 10.8186616897583, + "learning_rate": 1.142643391521197e-06, + "loss": 0.4083, + "step": 75650 + }, + { + "epoch": 18.86783042394015, + "grad_norm": 10.44265079498291, + "learning_rate": 1.140149625935162e-06, + "loss": 0.2865, + "step": 75660 + }, + { + "epoch": 18.870324189526183, + "grad_norm": 12.26836109161377, + "learning_rate": 1.1376558603491272e-06, + "loss": 0.3483, + "step": 75670 + }, + { + "epoch": 18.872817955112218, + "grad_norm": 7.17544412612915, + "learning_rate": 1.1351620947630923e-06, + "loss": 0.3349, + "step": 75680 + }, + { + "epoch": 18.875311720698253, + "grad_norm": 10.033705711364746, + "learning_rate": 1.1326683291770576e-06, + "loss": 0.3098, + "step": 75690 + }, + { + "epoch": 18.877805486284288, + "grad_norm": 11.135791778564453, + "learning_rate": 1.1301745635910227e-06, + "loss": 0.2847, + "step": 75700 + }, + { + "epoch": 18.880299251870323, + "grad_norm": 6.80994176864624, + "learning_rate": 1.1276807980049876e-06, + "loss": 0.2888, + "step": 75710 + }, + { + "epoch": 18.882793017456358, + "grad_norm": 7.766765117645264, + "learning_rate": 1.1251870324189527e-06, + "loss": 0.3815, + "step": 75720 + }, + { + "epoch": 18.885286783042392, + "grad_norm": 6.553428649902344, + "learning_rate": 1.1226932668329178e-06, + "loss": 0.3442, + "step": 75730 + }, + { + "epoch": 18.887780548628427, + "grad_norm": 5.193823337554932, + "learning_rate": 1.1201995012468829e-06, + "loss": 0.2764, + "step": 75740 + }, + { + "epoch": 18.890274314214462, + "grad_norm": 12.545166015625, + "learning_rate": 1.117705735660848e-06, + "loss": 0.338, + "step": 75750 + }, + { + "epoch": 18.892768079800497, + "grad_norm": 9.6239595413208, + "learning_rate": 1.115211970074813e-06, + "loss": 0.3336, + "step": 75760 + }, + { + "epoch": 18.895261845386532, + "grad_norm": 8.860621452331543, + "learning_rate": 1.1127182044887782e-06, + "loss": 0.2912, + "step": 75770 + }, + { + "epoch": 18.897755610972567, + "grad_norm": 10.455620765686035, + "learning_rate": 1.1102244389027433e-06, + "loss": 0.3473, + "step": 75780 + }, + { + "epoch": 18.900249376558605, + "grad_norm": 7.781944751739502, + "learning_rate": 1.1077306733167084e-06, + "loss": 0.3008, + "step": 75790 + }, + { + "epoch": 18.902743142144637, + "grad_norm": 6.780787944793701, + "learning_rate": 1.1052369077306735e-06, + "loss": 0.2852, + "step": 75800 + }, + { + "epoch": 18.905236907730675, + "grad_norm": 11.432109832763672, + "learning_rate": 1.1027431421446383e-06, + "loss": 0.3317, + "step": 75810 + }, + { + "epoch": 18.90773067331671, + "grad_norm": 9.996076583862305, + "learning_rate": 1.1002493765586037e-06, + "loss": 0.2939, + "step": 75820 + }, + { + "epoch": 18.910224438902745, + "grad_norm": 9.775397300720215, + "learning_rate": 1.0977556109725688e-06, + "loss": 0.3415, + "step": 75830 + }, + { + "epoch": 18.91271820448878, + "grad_norm": 10.835949897766113, + "learning_rate": 1.0952618453865338e-06, + "loss": 0.2963, + "step": 75840 + }, + { + "epoch": 18.915211970074814, + "grad_norm": 8.529813766479492, + "learning_rate": 1.092768079800499e-06, + "loss": 0.2763, + "step": 75850 + }, + { + "epoch": 18.91770573566085, + "grad_norm": 7.813754558563232, + "learning_rate": 1.090274314214464e-06, + "loss": 0.2713, + "step": 75860 + }, + { + "epoch": 18.920199501246884, + "grad_norm": 10.183265686035156, + "learning_rate": 1.087780548628429e-06, + "loss": 0.3235, + "step": 75870 + }, + { + "epoch": 18.92269326683292, + "grad_norm": 8.408893585205078, + "learning_rate": 1.085286783042394e-06, + "loss": 0.3334, + "step": 75880 + }, + { + "epoch": 18.925187032418954, + "grad_norm": 11.72058391571045, + "learning_rate": 1.0827930174563591e-06, + "loss": 0.3118, + "step": 75890 + }, + { + "epoch": 18.92768079800499, + "grad_norm": 6.197012424468994, + "learning_rate": 1.0802992518703242e-06, + "loss": 0.2938, + "step": 75900 + }, + { + "epoch": 18.930174563591024, + "grad_norm": 7.970619201660156, + "learning_rate": 1.0778054862842893e-06, + "loss": 0.3396, + "step": 75910 + }, + { + "epoch": 18.93266832917706, + "grad_norm": 10.110791206359863, + "learning_rate": 1.0753117206982544e-06, + "loss": 0.3286, + "step": 75920 + }, + { + "epoch": 18.935162094763093, + "grad_norm": 10.215046882629395, + "learning_rate": 1.0728179551122195e-06, + "loss": 0.3226, + "step": 75930 + }, + { + "epoch": 18.93765586034913, + "grad_norm": 10.97467041015625, + "learning_rate": 1.0703241895261846e-06, + "loss": 0.2734, + "step": 75940 + }, + { + "epoch": 18.940149625935163, + "grad_norm": 8.248761177062988, + "learning_rate": 1.0678304239401497e-06, + "loss": 0.379, + "step": 75950 + }, + { + "epoch": 18.942643391521198, + "grad_norm": 8.312829971313477, + "learning_rate": 1.0653366583541148e-06, + "loss": 0.3253, + "step": 75960 + }, + { + "epoch": 18.945137157107233, + "grad_norm": 4.772835731506348, + "learning_rate": 1.06284289276808e-06, + "loss": 0.3596, + "step": 75970 + }, + { + "epoch": 18.947630922693268, + "grad_norm": 10.931731224060059, + "learning_rate": 1.060349127182045e-06, + "loss": 0.3222, + "step": 75980 + }, + { + "epoch": 18.950124688279303, + "grad_norm": 10.215840339660645, + "learning_rate": 1.05785536159601e-06, + "loss": 0.3021, + "step": 75990 + }, + { + "epoch": 18.952618453865338, + "grad_norm": 9.405722618103027, + "learning_rate": 1.0553615960099752e-06, + "loss": 0.2664, + "step": 76000 + }, + { + "epoch": 18.955112219451372, + "grad_norm": 7.368799209594727, + "learning_rate": 1.0528678304239403e-06, + "loss": 0.3743, + "step": 76010 + }, + { + "epoch": 18.957605985037407, + "grad_norm": 6.817213535308838, + "learning_rate": 1.0503740648379054e-06, + "loss": 0.3206, + "step": 76020 + }, + { + "epoch": 18.960099750623442, + "grad_norm": 12.117660522460938, + "learning_rate": 1.0478802992518703e-06, + "loss": 0.3007, + "step": 76030 + }, + { + "epoch": 18.962593516209477, + "grad_norm": 12.789581298828125, + "learning_rate": 1.0453865336658354e-06, + "loss": 0.2822, + "step": 76040 + }, + { + "epoch": 18.965087281795512, + "grad_norm": 11.194233894348145, + "learning_rate": 1.0428927680798005e-06, + "loss": 0.3346, + "step": 76050 + }, + { + "epoch": 18.967581047381547, + "grad_norm": 7.340683937072754, + "learning_rate": 1.0403990024937656e-06, + "loss": 0.2921, + "step": 76060 + }, + { + "epoch": 18.97007481296758, + "grad_norm": 8.430867195129395, + "learning_rate": 1.0379052369077309e-06, + "loss": 0.2891, + "step": 76070 + }, + { + "epoch": 18.972568578553616, + "grad_norm": 8.543846130371094, + "learning_rate": 1.035411471321696e-06, + "loss": 0.2746, + "step": 76080 + }, + { + "epoch": 18.97506234413965, + "grad_norm": 7.078313827514648, + "learning_rate": 1.0329177057356609e-06, + "loss": 0.2897, + "step": 76090 + }, + { + "epoch": 18.977556109725686, + "grad_norm": 5.388221263885498, + "learning_rate": 1.030423940149626e-06, + "loss": 0.2994, + "step": 76100 + }, + { + "epoch": 18.98004987531172, + "grad_norm": 7.963678359985352, + "learning_rate": 1.027930174563591e-06, + "loss": 0.3703, + "step": 76110 + }, + { + "epoch": 18.982543640897756, + "grad_norm": 8.849908828735352, + "learning_rate": 1.0254364089775562e-06, + "loss": 0.3534, + "step": 76120 + }, + { + "epoch": 18.98503740648379, + "grad_norm": 8.504463195800781, + "learning_rate": 1.0229426433915213e-06, + "loss": 0.3156, + "step": 76130 + }, + { + "epoch": 18.987531172069826, + "grad_norm": 7.680803298950195, + "learning_rate": 1.0204488778054864e-06, + "loss": 0.3605, + "step": 76140 + }, + { + "epoch": 18.99002493765586, + "grad_norm": 8.039253234863281, + "learning_rate": 1.0179551122194515e-06, + "loss": 0.3321, + "step": 76150 + }, + { + "epoch": 18.992518703241895, + "grad_norm": 6.414565563201904, + "learning_rate": 1.0154613466334166e-06, + "loss": 0.3486, + "step": 76160 + }, + { + "epoch": 18.99501246882793, + "grad_norm": 10.342339515686035, + "learning_rate": 1.0129675810473817e-06, + "loss": 0.3262, + "step": 76170 + }, + { + "epoch": 18.997506234413965, + "grad_norm": 10.92263412475586, + "learning_rate": 1.0104738154613468e-06, + "loss": 0.3949, + "step": 76180 + }, + { + "epoch": 19.0, + "grad_norm": 9.860681533813477, + "learning_rate": 1.0079800498753119e-06, + "loss": 0.3274, + "step": 76190 + }, + { + "epoch": 19.0, + "eval_loss": 0.4184245765209198, + "eval_runtime": 60.1429, + "eval_samples_per_second": 16.677, + "eval_steps_per_second": 16.677, + "step": 76190 + }, + { + "epoch": 19.002493765586035, + "grad_norm": 6.654420852661133, + "learning_rate": 1.005486284289277e-06, + "loss": 0.3282, + "step": 76200 + }, + { + "epoch": 19.00498753117207, + "grad_norm": 11.368739128112793, + "learning_rate": 1.002992518703242e-06, + "loss": 0.3369, + "step": 76210 + }, + { + "epoch": 19.007481296758105, + "grad_norm": 6.13724422454834, + "learning_rate": 1.0004987531172071e-06, + "loss": 0.3149, + "step": 76220 + }, + { + "epoch": 19.00997506234414, + "grad_norm": 9.071732521057129, + "learning_rate": 9.980049875311722e-07, + "loss": 0.3517, + "step": 76230 + }, + { + "epoch": 19.012468827930174, + "grad_norm": 9.942281723022461, + "learning_rate": 9.955112219451373e-07, + "loss": 0.3144, + "step": 76240 + }, + { + "epoch": 19.01496259351621, + "grad_norm": 7.623038291931152, + "learning_rate": 9.930174563591022e-07, + "loss": 0.3624, + "step": 76250 + }, + { + "epoch": 19.017456359102244, + "grad_norm": 8.054025650024414, + "learning_rate": 9.905236907730673e-07, + "loss": 0.3465, + "step": 76260 + }, + { + "epoch": 19.01995012468828, + "grad_norm": 8.097540855407715, + "learning_rate": 9.880299251870324e-07, + "loss": 0.2926, + "step": 76270 + }, + { + "epoch": 19.022443890274314, + "grad_norm": 8.881487846374512, + "learning_rate": 9.855361596009975e-07, + "loss": 0.3272, + "step": 76280 + }, + { + "epoch": 19.02493765586035, + "grad_norm": 9.261902809143066, + "learning_rate": 9.830423940149626e-07, + "loss": 0.3085, + "step": 76290 + }, + { + "epoch": 19.027431421446384, + "grad_norm": 8.03958511352539, + "learning_rate": 9.805486284289277e-07, + "loss": 0.2904, + "step": 76300 + }, + { + "epoch": 19.02992518703242, + "grad_norm": 7.877828121185303, + "learning_rate": 9.780548628428928e-07, + "loss": 0.3262, + "step": 76310 + }, + { + "epoch": 19.032418952618453, + "grad_norm": 8.608955383300781, + "learning_rate": 9.75561097256858e-07, + "loss": 0.3105, + "step": 76320 + }, + { + "epoch": 19.034912718204488, + "grad_norm": 5.959877014160156, + "learning_rate": 9.73067331670823e-07, + "loss": 0.2964, + "step": 76330 + }, + { + "epoch": 19.037406483790523, + "grad_norm": 7.390963554382324, + "learning_rate": 9.705735660847881e-07, + "loss": 0.3047, + "step": 76340 + }, + { + "epoch": 19.039900249376558, + "grad_norm": 7.07940149307251, + "learning_rate": 9.680798004987532e-07, + "loss": 0.2936, + "step": 76350 + }, + { + "epoch": 19.042394014962593, + "grad_norm": 10.116901397705078, + "learning_rate": 9.655860349127183e-07, + "loss": 0.3208, + "step": 76360 + }, + { + "epoch": 19.044887780548628, + "grad_norm": 6.5279717445373535, + "learning_rate": 9.630922693266834e-07, + "loss": 0.3736, + "step": 76370 + }, + { + "epoch": 19.047381546134662, + "grad_norm": 7.244894981384277, + "learning_rate": 9.605985037406485e-07, + "loss": 0.3719, + "step": 76380 + }, + { + "epoch": 19.049875311720697, + "grad_norm": 9.443936347961426, + "learning_rate": 9.581047381546136e-07, + "loss": 0.2698, + "step": 76390 + }, + { + "epoch": 19.052369077306732, + "grad_norm": 7.093422889709473, + "learning_rate": 9.556109725685787e-07, + "loss": 0.3562, + "step": 76400 + }, + { + "epoch": 19.054862842892767, + "grad_norm": 8.13650131225586, + "learning_rate": 9.531172069825437e-07, + "loss": 0.3588, + "step": 76410 + }, + { + "epoch": 19.057356608478802, + "grad_norm": 8.89100170135498, + "learning_rate": 9.506234413965088e-07, + "loss": 0.2875, + "step": 76420 + }, + { + "epoch": 19.059850374064837, + "grad_norm": 12.49089527130127, + "learning_rate": 9.481296758104738e-07, + "loss": 0.2841, + "step": 76430 + }, + { + "epoch": 19.06234413965087, + "grad_norm": 8.83289623260498, + "learning_rate": 9.456359102244391e-07, + "loss": 0.3851, + "step": 76440 + }, + { + "epoch": 19.064837905236907, + "grad_norm": 8.544670104980469, + "learning_rate": 9.431421446384041e-07, + "loss": 0.309, + "step": 76450 + }, + { + "epoch": 19.06733167082294, + "grad_norm": 13.055142402648926, + "learning_rate": 9.406483790523692e-07, + "loss": 0.3448, + "step": 76460 + }, + { + "epoch": 19.069825436408976, + "grad_norm": 5.936143398284912, + "learning_rate": 9.381546134663343e-07, + "loss": 0.2878, + "step": 76470 + }, + { + "epoch": 19.07231920199501, + "grad_norm": 9.928505897521973, + "learning_rate": 9.356608478802994e-07, + "loss": 0.2742, + "step": 76480 + }, + { + "epoch": 19.074812967581046, + "grad_norm": 10.485602378845215, + "learning_rate": 9.331670822942644e-07, + "loss": 0.2967, + "step": 76490 + }, + { + "epoch": 19.07730673316708, + "grad_norm": 9.832148551940918, + "learning_rate": 9.306733167082295e-07, + "loss": 0.3564, + "step": 76500 + }, + { + "epoch": 19.079800498753116, + "grad_norm": 5.720829963684082, + "learning_rate": 9.281795511221946e-07, + "loss": 0.3261, + "step": 76510 + }, + { + "epoch": 19.08229426433915, + "grad_norm": 9.377291679382324, + "learning_rate": 9.256857855361597e-07, + "loss": 0.3256, + "step": 76520 + }, + { + "epoch": 19.084788029925186, + "grad_norm": 8.832295417785645, + "learning_rate": 9.231920199501248e-07, + "loss": 0.3498, + "step": 76530 + }, + { + "epoch": 19.08728179551122, + "grad_norm": 7.698011875152588, + "learning_rate": 9.206982543640898e-07, + "loss": 0.2729, + "step": 76540 + }, + { + "epoch": 19.089775561097255, + "grad_norm": 7.579693794250488, + "learning_rate": 9.182044887780549e-07, + "loss": 0.322, + "step": 76550 + }, + { + "epoch": 19.09226932668329, + "grad_norm": 8.33119010925293, + "learning_rate": 9.157107231920201e-07, + "loss": 0.2915, + "step": 76560 + }, + { + "epoch": 19.094763092269325, + "grad_norm": 11.253091812133789, + "learning_rate": 9.132169576059852e-07, + "loss": 0.3446, + "step": 76570 + }, + { + "epoch": 19.09725685785536, + "grad_norm": 9.968659400939941, + "learning_rate": 9.107231920199502e-07, + "loss": 0.2651, + "step": 76580 + }, + { + "epoch": 19.099750623441395, + "grad_norm": 10.114205360412598, + "learning_rate": 9.082294264339153e-07, + "loss": 0.3819, + "step": 76590 + }, + { + "epoch": 19.102244389027433, + "grad_norm": 10.0556058883667, + "learning_rate": 9.057356608478804e-07, + "loss": 0.3939, + "step": 76600 + }, + { + "epoch": 19.104738154613468, + "grad_norm": 6.652792930603027, + "learning_rate": 9.032418952618454e-07, + "loss": 0.4161, + "step": 76610 + }, + { + "epoch": 19.107231920199503, + "grad_norm": 8.197524070739746, + "learning_rate": 9.007481296758105e-07, + "loss": 0.3817, + "step": 76620 + }, + { + "epoch": 19.109725685785538, + "grad_norm": 9.194473266601562, + "learning_rate": 8.982543640897756e-07, + "loss": 0.3205, + "step": 76630 + }, + { + "epoch": 19.112219451371573, + "grad_norm": 9.719461441040039, + "learning_rate": 8.957605985037407e-07, + "loss": 0.3286, + "step": 76640 + }, + { + "epoch": 19.114713216957608, + "grad_norm": 5.662110328674316, + "learning_rate": 8.932668329177057e-07, + "loss": 0.3241, + "step": 76650 + }, + { + "epoch": 19.117206982543642, + "grad_norm": 7.443528652191162, + "learning_rate": 8.907730673316708e-07, + "loss": 0.3294, + "step": 76660 + }, + { + "epoch": 19.119700748129677, + "grad_norm": 8.850680351257324, + "learning_rate": 8.882793017456359e-07, + "loss": 0.4117, + "step": 76670 + }, + { + "epoch": 19.122194513715712, + "grad_norm": 9.90893840789795, + "learning_rate": 8.857855361596011e-07, + "loss": 0.3235, + "step": 76680 + }, + { + "epoch": 19.124688279301747, + "grad_norm": 6.257623672485352, + "learning_rate": 8.832917705735662e-07, + "loss": 0.2876, + "step": 76690 + }, + { + "epoch": 19.127182044887782, + "grad_norm": 10.78929615020752, + "learning_rate": 8.807980049875313e-07, + "loss": 0.2937, + "step": 76700 + }, + { + "epoch": 19.129675810473817, + "grad_norm": 12.467625617980957, + "learning_rate": 8.783042394014964e-07, + "loss": 0.2836, + "step": 76710 + }, + { + "epoch": 19.13216957605985, + "grad_norm": 9.467742919921875, + "learning_rate": 8.758104738154614e-07, + "loss": 0.2906, + "step": 76720 + }, + { + "epoch": 19.134663341645886, + "grad_norm": 7.464754581451416, + "learning_rate": 8.733167082294265e-07, + "loss": 0.3037, + "step": 76730 + }, + { + "epoch": 19.13715710723192, + "grad_norm": 9.207623481750488, + "learning_rate": 8.708229426433916e-07, + "loss": 0.3483, + "step": 76740 + }, + { + "epoch": 19.139650872817956, + "grad_norm": 5.893961429595947, + "learning_rate": 8.683291770573567e-07, + "loss": 0.327, + "step": 76750 + }, + { + "epoch": 19.14214463840399, + "grad_norm": 7.405910015106201, + "learning_rate": 8.658354114713217e-07, + "loss": 0.2723, + "step": 76760 + }, + { + "epoch": 19.144638403990026, + "grad_norm": 6.716821670532227, + "learning_rate": 8.633416458852868e-07, + "loss": 0.3184, + "step": 76770 + }, + { + "epoch": 19.14713216957606, + "grad_norm": 10.331329345703125, + "learning_rate": 8.608478802992519e-07, + "loss": 0.3315, + "step": 76780 + }, + { + "epoch": 19.149625935162096, + "grad_norm": 11.650618553161621, + "learning_rate": 8.58354114713217e-07, + "loss": 0.3256, + "step": 76790 + }, + { + "epoch": 19.15211970074813, + "grad_norm": 10.218818664550781, + "learning_rate": 8.558603491271821e-07, + "loss": 0.3823, + "step": 76800 + }, + { + "epoch": 19.154613466334165, + "grad_norm": 11.632461547851562, + "learning_rate": 8.533665835411473e-07, + "loss": 0.2996, + "step": 76810 + }, + { + "epoch": 19.1571072319202, + "grad_norm": 7.20107364654541, + "learning_rate": 8.508728179551124e-07, + "loss": 0.2705, + "step": 76820 + }, + { + "epoch": 19.159600997506235, + "grad_norm": 7.080248832702637, + "learning_rate": 8.483790523690774e-07, + "loss": 0.3582, + "step": 76830 + }, + { + "epoch": 19.16209476309227, + "grad_norm": 7.180111885070801, + "learning_rate": 8.458852867830425e-07, + "loss": 0.3788, + "step": 76840 + }, + { + "epoch": 19.164588528678305, + "grad_norm": 7.654664516448975, + "learning_rate": 8.433915211970076e-07, + "loss": 0.3894, + "step": 76850 + }, + { + "epoch": 19.16708229426434, + "grad_norm": 7.925827980041504, + "learning_rate": 8.408977556109727e-07, + "loss": 0.3759, + "step": 76860 + }, + { + "epoch": 19.169576059850375, + "grad_norm": 9.838397979736328, + "learning_rate": 8.384039900249377e-07, + "loss": 0.2958, + "step": 76870 + }, + { + "epoch": 19.17206982543641, + "grad_norm": 10.40937328338623, + "learning_rate": 8.359102244389028e-07, + "loss": 0.3657, + "step": 76880 + }, + { + "epoch": 19.174563591022444, + "grad_norm": 9.061084747314453, + "learning_rate": 8.334164588528679e-07, + "loss": 0.3918, + "step": 76890 + }, + { + "epoch": 19.17705735660848, + "grad_norm": 6.745436668395996, + "learning_rate": 8.30922693266833e-07, + "loss": 0.314, + "step": 76900 + }, + { + "epoch": 19.179551122194514, + "grad_norm": 9.674360275268555, + "learning_rate": 8.284289276807981e-07, + "loss": 0.3089, + "step": 76910 + }, + { + "epoch": 19.18204488778055, + "grad_norm": 7.983299732208252, + "learning_rate": 8.25935162094763e-07, + "loss": 0.3077, + "step": 76920 + }, + { + "epoch": 19.184538653366584, + "grad_norm": 8.201288223266602, + "learning_rate": 8.234413965087284e-07, + "loss": 0.3265, + "step": 76930 + }, + { + "epoch": 19.18703241895262, + "grad_norm": 6.199214935302734, + "learning_rate": 8.209476309226934e-07, + "loss": 0.2781, + "step": 76940 + }, + { + "epoch": 19.189526184538654, + "grad_norm": 7.515051364898682, + "learning_rate": 8.184538653366585e-07, + "loss": 0.2741, + "step": 76950 + }, + { + "epoch": 19.19201995012469, + "grad_norm": 7.1991658210754395, + "learning_rate": 8.159600997506235e-07, + "loss": 0.3119, + "step": 76960 + }, + { + "epoch": 19.194513715710723, + "grad_norm": 13.45710277557373, + "learning_rate": 8.134663341645886e-07, + "loss": 0.3671, + "step": 76970 + }, + { + "epoch": 19.197007481296758, + "grad_norm": 6.285548210144043, + "learning_rate": 8.109725685785537e-07, + "loss": 0.2633, + "step": 76980 + }, + { + "epoch": 19.199501246882793, + "grad_norm": 13.47431755065918, + "learning_rate": 8.087281795511223e-07, + "loss": 0.3932, + "step": 76990 + }, + { + "epoch": 19.201995012468828, + "grad_norm": 11.600540161132812, + "learning_rate": 8.062344139650874e-07, + "loss": 0.3936, + "step": 77000 + }, + { + "epoch": 19.204488778054863, + "grad_norm": 8.96855640411377, + "learning_rate": 8.037406483790524e-07, + "loss": 0.361, + "step": 77010 + }, + { + "epoch": 19.206982543640898, + "grad_norm": 10.361422538757324, + "learning_rate": 8.012468827930175e-07, + "loss": 0.3356, + "step": 77020 + }, + { + "epoch": 19.209476309226932, + "grad_norm": 7.201541423797607, + "learning_rate": 7.987531172069826e-07, + "loss": 0.3742, + "step": 77030 + }, + { + "epoch": 19.211970074812967, + "grad_norm": 8.441081047058105, + "learning_rate": 7.962593516209477e-07, + "loss": 0.3109, + "step": 77040 + }, + { + "epoch": 19.214463840399002, + "grad_norm": 7.7734174728393555, + "learning_rate": 7.937655860349128e-07, + "loss": 0.2973, + "step": 77050 + }, + { + "epoch": 19.216957605985037, + "grad_norm": 10.10339641571045, + "learning_rate": 7.912718204488778e-07, + "loss": 0.2633, + "step": 77060 + }, + { + "epoch": 19.219451371571072, + "grad_norm": 11.610758781433105, + "learning_rate": 7.887780548628429e-07, + "loss": 0.3264, + "step": 77070 + }, + { + "epoch": 19.221945137157107, + "grad_norm": 11.709192276000977, + "learning_rate": 7.86284289276808e-07, + "loss": 0.3189, + "step": 77080 + }, + { + "epoch": 19.22443890274314, + "grad_norm": 6.54279899597168, + "learning_rate": 7.837905236907731e-07, + "loss": 0.2883, + "step": 77090 + }, + { + "epoch": 19.226932668329177, + "grad_norm": 11.589969635009766, + "learning_rate": 7.812967581047382e-07, + "loss": 0.2831, + "step": 77100 + }, + { + "epoch": 19.22942643391521, + "grad_norm": 6.521005153656006, + "learning_rate": 7.788029925187034e-07, + "loss": 0.2952, + "step": 77110 + }, + { + "epoch": 19.231920199501246, + "grad_norm": 6.133444309234619, + "learning_rate": 7.763092269326684e-07, + "loss": 0.2859, + "step": 77120 + }, + { + "epoch": 19.23441396508728, + "grad_norm": 7.011719703674316, + "learning_rate": 7.738154613466335e-07, + "loss": 0.3177, + "step": 77130 + }, + { + "epoch": 19.236907730673316, + "grad_norm": 6.836624622344971, + "learning_rate": 7.713216957605986e-07, + "loss": 0.3955, + "step": 77140 + }, + { + "epoch": 19.23940149625935, + "grad_norm": 11.907731056213379, + "learning_rate": 7.688279301745637e-07, + "loss": 0.2778, + "step": 77150 + }, + { + "epoch": 19.241895261845386, + "grad_norm": 11.505409240722656, + "learning_rate": 7.663341645885288e-07, + "loss": 0.366, + "step": 77160 + }, + { + "epoch": 19.24438902743142, + "grad_norm": 10.92941951751709, + "learning_rate": 7.638403990024938e-07, + "loss": 0.3527, + "step": 77170 + }, + { + "epoch": 19.246882793017456, + "grad_norm": 8.509658813476562, + "learning_rate": 7.613466334164589e-07, + "loss": 0.269, + "step": 77180 + }, + { + "epoch": 19.24937655860349, + "grad_norm": 6.147039413452148, + "learning_rate": 7.58852867830424e-07, + "loss": 0.3095, + "step": 77190 + }, + { + "epoch": 19.251870324189525, + "grad_norm": 9.925485610961914, + "learning_rate": 7.563591022443891e-07, + "loss": 0.3432, + "step": 77200 + }, + { + "epoch": 19.25436408977556, + "grad_norm": 7.7526116371154785, + "learning_rate": 7.538653366583542e-07, + "loss": 0.3582, + "step": 77210 + }, + { + "epoch": 19.256857855361595, + "grad_norm": 9.207817077636719, + "learning_rate": 7.513715710723192e-07, + "loss": 0.2891, + "step": 77220 + }, + { + "epoch": 19.25935162094763, + "grad_norm": 8.330227851867676, + "learning_rate": 7.488778054862844e-07, + "loss": 0.2408, + "step": 77230 + }, + { + "epoch": 19.261845386533665, + "grad_norm": 7.16584587097168, + "learning_rate": 7.463840399002495e-07, + "loss": 0.2844, + "step": 77240 + }, + { + "epoch": 19.2643391521197, + "grad_norm": 7.896230220794678, + "learning_rate": 7.438902743142146e-07, + "loss": 0.3163, + "step": 77250 + }, + { + "epoch": 19.266832917705734, + "grad_norm": 6.485842704772949, + "learning_rate": 7.413965087281797e-07, + "loss": 0.3416, + "step": 77260 + }, + { + "epoch": 19.26932668329177, + "grad_norm": 10.570438385009766, + "learning_rate": 7.389027431421448e-07, + "loss": 0.3151, + "step": 77270 + }, + { + "epoch": 19.271820448877804, + "grad_norm": 10.489306449890137, + "learning_rate": 7.364089775561097e-07, + "loss": 0.3103, + "step": 77280 + }, + { + "epoch": 19.27431421446384, + "grad_norm": 17.813852310180664, + "learning_rate": 7.339152119700748e-07, + "loss": 0.3384, + "step": 77290 + }, + { + "epoch": 19.276807980049874, + "grad_norm": 6.28749418258667, + "learning_rate": 7.314214463840399e-07, + "loss": 0.3067, + "step": 77300 + }, + { + "epoch": 19.27930174563591, + "grad_norm": 10.992226600646973, + "learning_rate": 7.28927680798005e-07, + "loss": 0.3487, + "step": 77310 + }, + { + "epoch": 19.281795511221944, + "grad_norm": 8.586907386779785, + "learning_rate": 7.264339152119701e-07, + "loss": 0.2888, + "step": 77320 + }, + { + "epoch": 19.28428927680798, + "grad_norm": 8.104104995727539, + "learning_rate": 7.239401496259351e-07, + "loss": 0.2809, + "step": 77330 + }, + { + "epoch": 19.286783042394013, + "grad_norm": 9.069332122802734, + "learning_rate": 7.214463840399002e-07, + "loss": 0.2921, + "step": 77340 + }, + { + "epoch": 19.28927680798005, + "grad_norm": 8.148301124572754, + "learning_rate": 7.189526184538654e-07, + "loss": 0.4029, + "step": 77350 + }, + { + "epoch": 19.291770573566083, + "grad_norm": 7.358614921569824, + "learning_rate": 7.164588528678305e-07, + "loss": 0.3364, + "step": 77360 + }, + { + "epoch": 19.294264339152118, + "grad_norm": 11.08927059173584, + "learning_rate": 7.139650872817956e-07, + "loss": 0.3398, + "step": 77370 + }, + { + "epoch": 19.296758104738153, + "grad_norm": 10.33334732055664, + "learning_rate": 7.114713216957607e-07, + "loss": 0.326, + "step": 77380 + }, + { + "epoch": 19.29925187032419, + "grad_norm": 10.560935974121094, + "learning_rate": 7.089775561097257e-07, + "loss": 0.3557, + "step": 77390 + }, + { + "epoch": 19.301745635910226, + "grad_norm": 8.311334609985352, + "learning_rate": 7.064837905236908e-07, + "loss": 0.3314, + "step": 77400 + }, + { + "epoch": 19.30423940149626, + "grad_norm": 11.71918773651123, + "learning_rate": 7.039900249376559e-07, + "loss": 0.3246, + "step": 77410 + }, + { + "epoch": 19.306733167082296, + "grad_norm": 8.50728988647461, + "learning_rate": 7.01496259351621e-07, + "loss": 0.3343, + "step": 77420 + }, + { + "epoch": 19.30922693266833, + "grad_norm": 11.546276092529297, + "learning_rate": 6.990024937655861e-07, + "loss": 0.2922, + "step": 77430 + }, + { + "epoch": 19.311720698254366, + "grad_norm": 10.145132064819336, + "learning_rate": 6.965087281795511e-07, + "loss": 0.3306, + "step": 77440 + }, + { + "epoch": 19.3142144638404, + "grad_norm": 12.020730018615723, + "learning_rate": 6.940149625935162e-07, + "loss": 0.2949, + "step": 77450 + }, + { + "epoch": 19.316708229426435, + "grad_norm": 13.925124168395996, + "learning_rate": 6.915211970074813e-07, + "loss": 0.4027, + "step": 77460 + }, + { + "epoch": 19.31920199501247, + "grad_norm": 9.396332740783691, + "learning_rate": 6.890274314214464e-07, + "loss": 0.283, + "step": 77470 + }, + { + "epoch": 19.321695760598505, + "grad_norm": 5.274721622467041, + "learning_rate": 6.865336658354116e-07, + "loss": 0.3043, + "step": 77480 + }, + { + "epoch": 19.32418952618454, + "grad_norm": 7.85326623916626, + "learning_rate": 6.840399002493767e-07, + "loss": 0.3407, + "step": 77490 + }, + { + "epoch": 19.326683291770575, + "grad_norm": 12.338591575622559, + "learning_rate": 6.815461346633417e-07, + "loss": 0.2622, + "step": 77500 + }, + { + "epoch": 19.32917705735661, + "grad_norm": 7.619169235229492, + "learning_rate": 6.790523690773068e-07, + "loss": 0.2779, + "step": 77510 + }, + { + "epoch": 19.331670822942645, + "grad_norm": 8.566308975219727, + "learning_rate": 6.768079800498753e-07, + "loss": 0.3057, + "step": 77520 + }, + { + "epoch": 19.33416458852868, + "grad_norm": 11.934048652648926, + "learning_rate": 6.743142144638405e-07, + "loss": 0.3626, + "step": 77530 + }, + { + "epoch": 19.336658354114714, + "grad_norm": 12.807787895202637, + "learning_rate": 6.718204488778056e-07, + "loss": 0.297, + "step": 77540 + }, + { + "epoch": 19.33915211970075, + "grad_norm": 8.593514442443848, + "learning_rate": 6.693266832917707e-07, + "loss": 0.3174, + "step": 77550 + }, + { + "epoch": 19.341645885286784, + "grad_norm": 10.601576805114746, + "learning_rate": 6.668329177057358e-07, + "loss": 0.3191, + "step": 77560 + }, + { + "epoch": 19.34413965087282, + "grad_norm": 10.128413200378418, + "learning_rate": 6.643391521197009e-07, + "loss": 0.302, + "step": 77570 + }, + { + "epoch": 19.346633416458854, + "grad_norm": 8.060428619384766, + "learning_rate": 6.618453865336659e-07, + "loss": 0.2979, + "step": 77580 + }, + { + "epoch": 19.34912718204489, + "grad_norm": 5.3289079666137695, + "learning_rate": 6.59351620947631e-07, + "loss": 0.2762, + "step": 77590 + }, + { + "epoch": 19.351620947630924, + "grad_norm": 6.777313709259033, + "learning_rate": 6.568578553615961e-07, + "loss": 0.2989, + "step": 77600 + }, + { + "epoch": 19.35411471321696, + "grad_norm": 6.17332124710083, + "learning_rate": 6.543640897755612e-07, + "loss": 0.2851, + "step": 77610 + }, + { + "epoch": 19.356608478802993, + "grad_norm": 9.076419830322266, + "learning_rate": 6.518703241895261e-07, + "loss": 0.325, + "step": 77620 + }, + { + "epoch": 19.359102244389028, + "grad_norm": 9.705153465270996, + "learning_rate": 6.493765586034912e-07, + "loss": 0.2434, + "step": 77630 + }, + { + "epoch": 19.361596009975063, + "grad_norm": 11.839200973510742, + "learning_rate": 6.468827930174563e-07, + "loss": 0.3439, + "step": 77640 + }, + { + "epoch": 19.364089775561098, + "grad_norm": 10.060364723205566, + "learning_rate": 6.443890274314215e-07, + "loss": 0.3398, + "step": 77650 + }, + { + "epoch": 19.366583541147133, + "grad_norm": 4.9707560539245605, + "learning_rate": 6.418952618453866e-07, + "loss": 0.293, + "step": 77660 + }, + { + "epoch": 19.369077306733168, + "grad_norm": 7.344512462615967, + "learning_rate": 6.394014962593517e-07, + "loss": 0.3303, + "step": 77670 + }, + { + "epoch": 19.371571072319203, + "grad_norm": 8.75753402709961, + "learning_rate": 6.369077306733168e-07, + "loss": 0.3225, + "step": 77680 + }, + { + "epoch": 19.374064837905237, + "grad_norm": 8.907544136047363, + "learning_rate": 6.344139650872818e-07, + "loss": 0.4169, + "step": 77690 + }, + { + "epoch": 19.376558603491272, + "grad_norm": 8.724864959716797, + "learning_rate": 6.319201995012469e-07, + "loss": 0.4323, + "step": 77700 + }, + { + "epoch": 19.379052369077307, + "grad_norm": 10.523849487304688, + "learning_rate": 6.29426433915212e-07, + "loss": 0.3098, + "step": 77710 + }, + { + "epoch": 19.381546134663342, + "grad_norm": 6.705490589141846, + "learning_rate": 6.269326683291771e-07, + "loss": 0.2966, + "step": 77720 + }, + { + "epoch": 19.384039900249377, + "grad_norm": 9.971707344055176, + "learning_rate": 6.244389027431422e-07, + "loss": 0.3189, + "step": 77730 + }, + { + "epoch": 19.38653366583541, + "grad_norm": 10.387377738952637, + "learning_rate": 6.219451371571072e-07, + "loss": 0.3321, + "step": 77740 + }, + { + "epoch": 19.389027431421447, + "grad_norm": 8.39964485168457, + "learning_rate": 6.194513715710724e-07, + "loss": 0.2786, + "step": 77750 + }, + { + "epoch": 19.39152119700748, + "grad_norm": 7.76651668548584, + "learning_rate": 6.169576059850375e-07, + "loss": 0.3166, + "step": 77760 + }, + { + "epoch": 19.394014962593516, + "grad_norm": 8.83347225189209, + "learning_rate": 6.144638403990025e-07, + "loss": 0.3197, + "step": 77770 + }, + { + "epoch": 19.39650872817955, + "grad_norm": 11.635566711425781, + "learning_rate": 6.119700748129676e-07, + "loss": 0.3204, + "step": 77780 + }, + { + "epoch": 19.399002493765586, + "grad_norm": 8.75010871887207, + "learning_rate": 6.094763092269327e-07, + "loss": 0.3247, + "step": 77790 + }, + { + "epoch": 19.40149625935162, + "grad_norm": 12.465825080871582, + "learning_rate": 6.069825436408978e-07, + "loss": 0.3004, + "step": 77800 + }, + { + "epoch": 19.403990024937656, + "grad_norm": 10.757222175598145, + "learning_rate": 6.044887780548629e-07, + "loss": 0.3444, + "step": 77810 + }, + { + "epoch": 19.40648379052369, + "grad_norm": 4.095909118652344, + "learning_rate": 6.01995012468828e-07, + "loss": 0.3217, + "step": 77820 + }, + { + "epoch": 19.408977556109726, + "grad_norm": 9.890026092529297, + "learning_rate": 5.995012468827931e-07, + "loss": 0.3058, + "step": 77830 + }, + { + "epoch": 19.41147132169576, + "grad_norm": 9.338248252868652, + "learning_rate": 5.970074812967582e-07, + "loss": 0.2974, + "step": 77840 + }, + { + "epoch": 19.413965087281795, + "grad_norm": 7.386726379394531, + "learning_rate": 5.945137157107232e-07, + "loss": 0.4195, + "step": 77850 + }, + { + "epoch": 19.41645885286783, + "grad_norm": 10.347369194030762, + "learning_rate": 5.920199501246883e-07, + "loss": 0.3167, + "step": 77860 + }, + { + "epoch": 19.418952618453865, + "grad_norm": 5.733013153076172, + "learning_rate": 5.895261845386535e-07, + "loss": 0.2585, + "step": 77870 + }, + { + "epoch": 19.4214463840399, + "grad_norm": 8.385955810546875, + "learning_rate": 5.870324189526185e-07, + "loss": 0.2847, + "step": 77880 + }, + { + "epoch": 19.423940149625935, + "grad_norm": 7.1023736000061035, + "learning_rate": 5.845386533665836e-07, + "loss": 0.336, + "step": 77890 + }, + { + "epoch": 19.42643391521197, + "grad_norm": 9.88353157043457, + "learning_rate": 5.820448877805487e-07, + "loss": 0.3549, + "step": 77900 + }, + { + "epoch": 19.428927680798004, + "grad_norm": 8.863442420959473, + "learning_rate": 5.795511221945138e-07, + "loss": 0.3858, + "step": 77910 + }, + { + "epoch": 19.43142144638404, + "grad_norm": 8.624554634094238, + "learning_rate": 5.770573566084789e-07, + "loss": 0.3137, + "step": 77920 + }, + { + "epoch": 19.433915211970074, + "grad_norm": 9.726763725280762, + "learning_rate": 5.74563591022444e-07, + "loss": 0.345, + "step": 77930 + }, + { + "epoch": 19.43640897755611, + "grad_norm": 7.916418075561523, + "learning_rate": 5.720698254364091e-07, + "loss": 0.2833, + "step": 77940 + }, + { + "epoch": 19.438902743142144, + "grad_norm": 10.546299934387207, + "learning_rate": 5.695760598503742e-07, + "loss": 0.2614, + "step": 77950 + }, + { + "epoch": 19.44139650872818, + "grad_norm": 9.657865524291992, + "learning_rate": 5.670822942643392e-07, + "loss": 0.3221, + "step": 77960 + }, + { + "epoch": 19.443890274314214, + "grad_norm": 7.228270530700684, + "learning_rate": 5.645885286783043e-07, + "loss": 0.285, + "step": 77970 + }, + { + "epoch": 19.44638403990025, + "grad_norm": 12.523133277893066, + "learning_rate": 5.620947630922694e-07, + "loss": 0.2654, + "step": 77980 + }, + { + "epoch": 19.448877805486283, + "grad_norm": 9.408562660217285, + "learning_rate": 5.596009975062345e-07, + "loss": 0.2996, + "step": 77990 + }, + { + "epoch": 19.45137157107232, + "grad_norm": 8.884123802185059, + "learning_rate": 5.571072319201996e-07, + "loss": 0.3365, + "step": 78000 + }, + { + "epoch": 19.453865336658353, + "grad_norm": 7.461065769195557, + "learning_rate": 5.546134663341646e-07, + "loss": 0.3295, + "step": 78010 + }, + { + "epoch": 19.456359102244388, + "grad_norm": 7.785035610198975, + "learning_rate": 5.521197007481297e-07, + "loss": 0.235, + "step": 78020 + }, + { + "epoch": 19.458852867830423, + "grad_norm": 8.66875171661377, + "learning_rate": 5.496259351620948e-07, + "loss": 0.2687, + "step": 78030 + }, + { + "epoch": 19.461346633416458, + "grad_norm": 6.394353866577148, + "learning_rate": 5.471321695760598e-07, + "loss": 0.2604, + "step": 78040 + }, + { + "epoch": 19.463840399002493, + "grad_norm": 9.329657554626465, + "learning_rate": 5.44638403990025e-07, + "loss": 0.3127, + "step": 78050 + }, + { + "epoch": 19.466334164588527, + "grad_norm": 9.868875503540039, + "learning_rate": 5.421446384039901e-07, + "loss": 0.2944, + "step": 78060 + }, + { + "epoch": 19.468827930174562, + "grad_norm": 8.829019546508789, + "learning_rate": 5.396508728179551e-07, + "loss": 0.4614, + "step": 78070 + }, + { + "epoch": 19.471321695760597, + "grad_norm": 5.467095851898193, + "learning_rate": 5.371571072319202e-07, + "loss": 0.3467, + "step": 78080 + }, + { + "epoch": 19.473815461346632, + "grad_norm": 11.852470397949219, + "learning_rate": 5.346633416458853e-07, + "loss": 0.3606, + "step": 78090 + }, + { + "epoch": 19.476309226932667, + "grad_norm": 10.978060722351074, + "learning_rate": 5.321695760598504e-07, + "loss": 0.4001, + "step": 78100 + }, + { + "epoch": 19.478802992518702, + "grad_norm": 4.6936163902282715, + "learning_rate": 5.296758104738155e-07, + "loss": 0.2956, + "step": 78110 + }, + { + "epoch": 19.481296758104737, + "grad_norm": 8.054365158081055, + "learning_rate": 5.271820448877806e-07, + "loss": 0.3031, + "step": 78120 + }, + { + "epoch": 19.48379052369077, + "grad_norm": 7.315567493438721, + "learning_rate": 5.246882793017457e-07, + "loss": 0.292, + "step": 78130 + }, + { + "epoch": 19.486284289276806, + "grad_norm": 6.582238674163818, + "learning_rate": 5.221945137157108e-07, + "loss": 0.3452, + "step": 78140 + }, + { + "epoch": 19.48877805486284, + "grad_norm": 9.56821346282959, + "learning_rate": 5.197007481296758e-07, + "loss": 0.371, + "step": 78150 + }, + { + "epoch": 19.491271820448876, + "grad_norm": 10.5318603515625, + "learning_rate": 5.172069825436409e-07, + "loss": 0.3602, + "step": 78160 + }, + { + "epoch": 19.49376558603491, + "grad_norm": 7.174575328826904, + "learning_rate": 5.14713216957606e-07, + "loss": 0.2854, + "step": 78170 + }, + { + "epoch": 19.496259351620946, + "grad_norm": 9.72573184967041, + "learning_rate": 5.122194513715711e-07, + "loss": 0.2975, + "step": 78180 + }, + { + "epoch": 19.49875311720698, + "grad_norm": 9.575770378112793, + "learning_rate": 5.097256857855362e-07, + "loss": 0.303, + "step": 78190 + }, + { + "epoch": 19.50124688279302, + "grad_norm": 9.664167404174805, + "learning_rate": 5.072319201995013e-07, + "loss": 0.3959, + "step": 78200 + }, + { + "epoch": 19.503740648379054, + "grad_norm": 9.702856063842773, + "learning_rate": 5.047381546134664e-07, + "loss": 0.3454, + "step": 78210 + }, + { + "epoch": 19.50623441396509, + "grad_norm": 13.055885314941406, + "learning_rate": 5.022443890274315e-07, + "loss": 0.2945, + "step": 78220 + }, + { + "epoch": 19.508728179551124, + "grad_norm": 7.412993907928467, + "learning_rate": 4.997506234413965e-07, + "loss": 0.3167, + "step": 78230 + }, + { + "epoch": 19.51122194513716, + "grad_norm": 5.914247512817383, + "learning_rate": 4.972568578553617e-07, + "loss": 0.3067, + "step": 78240 + }, + { + "epoch": 19.513715710723194, + "grad_norm": 6.335524559020996, + "learning_rate": 4.947630922693268e-07, + "loss": 0.3002, + "step": 78250 + }, + { + "epoch": 19.51620947630923, + "grad_norm": 9.340645790100098, + "learning_rate": 4.922693266832918e-07, + "loss": 0.3535, + "step": 78260 + }, + { + "epoch": 19.518703241895263, + "grad_norm": 8.127272605895996, + "learning_rate": 4.897755610972569e-07, + "loss": 0.3276, + "step": 78270 + }, + { + "epoch": 19.521197007481298, + "grad_norm": 8.352450370788574, + "learning_rate": 4.87281795511222e-07, + "loss": 0.3708, + "step": 78280 + }, + { + "epoch": 19.523690773067333, + "grad_norm": 5.065845489501953, + "learning_rate": 4.847880299251871e-07, + "loss": 0.266, + "step": 78290 + }, + { + "epoch": 19.526184538653368, + "grad_norm": 9.564516067504883, + "learning_rate": 4.822942643391522e-07, + "loss": 0.2801, + "step": 78300 + }, + { + "epoch": 19.528678304239403, + "grad_norm": 8.529273986816406, + "learning_rate": 4.798004987531173e-07, + "loss": 0.3554, + "step": 78310 + }, + { + "epoch": 19.531172069825438, + "grad_norm": 9.940313339233398, + "learning_rate": 4.773067331670824e-07, + "loss": 0.2956, + "step": 78320 + }, + { + "epoch": 19.533665835411473, + "grad_norm": 7.539468288421631, + "learning_rate": 4.748129675810474e-07, + "loss": 0.3047, + "step": 78330 + }, + { + "epoch": 19.536159600997507, + "grad_norm": 9.497523307800293, + "learning_rate": 4.723192019950125e-07, + "loss": 0.2872, + "step": 78340 + }, + { + "epoch": 19.538653366583542, + "grad_norm": 8.197120666503906, + "learning_rate": 4.6982543640897756e-07, + "loss": 0.3116, + "step": 78350 + }, + { + "epoch": 19.541147132169577, + "grad_norm": 8.740259170532227, + "learning_rate": 4.673316708229427e-07, + "loss": 0.3306, + "step": 78360 + }, + { + "epoch": 19.543640897755612, + "grad_norm": 6.9125494956970215, + "learning_rate": 4.648379052369078e-07, + "loss": 0.3007, + "step": 78370 + }, + { + "epoch": 19.546134663341647, + "grad_norm": 7.752559185028076, + "learning_rate": 4.6234413965087285e-07, + "loss": 0.3, + "step": 78380 + }, + { + "epoch": 19.54862842892768, + "grad_norm": 10.674647331237793, + "learning_rate": 4.5985037406483795e-07, + "loss": 0.3157, + "step": 78390 + }, + { + "epoch": 19.551122194513717, + "grad_norm": 9.210230827331543, + "learning_rate": 4.57356608478803e-07, + "loss": 0.2928, + "step": 78400 + }, + { + "epoch": 19.55361596009975, + "grad_norm": 8.934961318969727, + "learning_rate": 4.548628428927681e-07, + "loss": 0.3272, + "step": 78410 + }, + { + "epoch": 19.556109725685786, + "grad_norm": 9.600099563598633, + "learning_rate": 4.523690773067332e-07, + "loss": 0.3249, + "step": 78420 + }, + { + "epoch": 19.55860349127182, + "grad_norm": 9.030034065246582, + "learning_rate": 4.4987531172069834e-07, + "loss": 0.283, + "step": 78430 + }, + { + "epoch": 19.561097256857856, + "grad_norm": 8.656168937683105, + "learning_rate": 4.473815461346634e-07, + "loss": 0.2666, + "step": 78440 + }, + { + "epoch": 19.56359102244389, + "grad_norm": 8.763611793518066, + "learning_rate": 4.448877805486285e-07, + "loss": 0.3127, + "step": 78450 + }, + { + "epoch": 19.566084788029926, + "grad_norm": 6.0083136558532715, + "learning_rate": 4.4239401496259353e-07, + "loss": 0.2952, + "step": 78460 + }, + { + "epoch": 19.56857855361596, + "grad_norm": 8.106850624084473, + "learning_rate": 4.399002493765586e-07, + "loss": 0.2619, + "step": 78470 + }, + { + "epoch": 19.571072319201996, + "grad_norm": 5.120926380157471, + "learning_rate": 4.3740648379052367e-07, + "loss": 0.3112, + "step": 78480 + }, + { + "epoch": 19.57356608478803, + "grad_norm": 8.02582836151123, + "learning_rate": 4.349127182044888e-07, + "loss": 0.3119, + "step": 78490 + }, + { + "epoch": 19.576059850374065, + "grad_norm": 14.72515869140625, + "learning_rate": 4.324189526184539e-07, + "loss": 0.3285, + "step": 78500 + }, + { + "epoch": 19.5785536159601, + "grad_norm": 10.0016508102417, + "learning_rate": 4.29925187032419e-07, + "loss": 0.3348, + "step": 78510 + }, + { + "epoch": 19.581047381546135, + "grad_norm": 8.692368507385254, + "learning_rate": 4.2743142144638406e-07, + "loss": 0.3112, + "step": 78520 + }, + { + "epoch": 19.58354114713217, + "grad_norm": 10.636957168579102, + "learning_rate": 4.2493765586034916e-07, + "loss": 0.3008, + "step": 78530 + }, + { + "epoch": 19.586034912718205, + "grad_norm": 14.530603408813477, + "learning_rate": 4.224438902743142e-07, + "loss": 0.3169, + "step": 78540 + }, + { + "epoch": 19.58852867830424, + "grad_norm": 10.313661575317383, + "learning_rate": 4.1995012468827936e-07, + "loss": 0.4105, + "step": 78550 + }, + { + "epoch": 19.591022443890274, + "grad_norm": 8.854100227355957, + "learning_rate": 4.1745635910224446e-07, + "loss": 0.3473, + "step": 78560 + }, + { + "epoch": 19.59351620947631, + "grad_norm": 5.632763385772705, + "learning_rate": 4.149625935162095e-07, + "loss": 0.3534, + "step": 78570 + }, + { + "epoch": 19.596009975062344, + "grad_norm": 10.139565467834473, + "learning_rate": 4.124688279301746e-07, + "loss": 0.2864, + "step": 78580 + }, + { + "epoch": 19.59850374064838, + "grad_norm": 10.44018268585205, + "learning_rate": 4.0997506234413964e-07, + "loss": 0.3235, + "step": 78590 + }, + { + "epoch": 19.600997506234414, + "grad_norm": 8.089249610900879, + "learning_rate": 4.0748129675810474e-07, + "loss": 0.3483, + "step": 78600 + }, + { + "epoch": 19.60349127182045, + "grad_norm": 8.919448852539062, + "learning_rate": 4.049875311720699e-07, + "loss": 0.2935, + "step": 78610 + }, + { + "epoch": 19.605985037406484, + "grad_norm": 12.397672653198242, + "learning_rate": 4.02493765586035e-07, + "loss": 0.2637, + "step": 78620 + }, + { + "epoch": 19.60847880299252, + "grad_norm": 7.087367057800293, + "learning_rate": 4.0000000000000003e-07, + "loss": 0.3281, + "step": 78630 + }, + { + "epoch": 19.610972568578553, + "grad_norm": 10.792654037475586, + "learning_rate": 3.9750623441396513e-07, + "loss": 0.2798, + "step": 78640 + }, + { + "epoch": 19.61346633416459, + "grad_norm": 11.265054702758789, + "learning_rate": 3.950124688279302e-07, + "loss": 0.3599, + "step": 78650 + }, + { + "epoch": 19.615960099750623, + "grad_norm": 7.813275337219238, + "learning_rate": 3.925187032418953e-07, + "loss": 0.3727, + "step": 78660 + }, + { + "epoch": 19.618453865336658, + "grad_norm": 7.351382255554199, + "learning_rate": 3.9002493765586043e-07, + "loss": 0.337, + "step": 78670 + }, + { + "epoch": 19.620947630922693, + "grad_norm": 8.567827224731445, + "learning_rate": 3.8753117206982547e-07, + "loss": 0.3548, + "step": 78680 + }, + { + "epoch": 19.623441396508728, + "grad_norm": 8.134788513183594, + "learning_rate": 3.8503740648379057e-07, + "loss": 0.3234, + "step": 78690 + }, + { + "epoch": 19.625935162094763, + "grad_norm": 8.092358589172363, + "learning_rate": 3.8254364089775567e-07, + "loss": 0.4081, + "step": 78700 + }, + { + "epoch": 19.628428927680797, + "grad_norm": 10.603907585144043, + "learning_rate": 3.800498753117207e-07, + "loss": 0.3384, + "step": 78710 + }, + { + "epoch": 19.630922693266832, + "grad_norm": 7.839836120605469, + "learning_rate": 3.775561097256858e-07, + "loss": 0.3905, + "step": 78720 + }, + { + "epoch": 19.633416458852867, + "grad_norm": 10.831620216369629, + "learning_rate": 3.7506234413965086e-07, + "loss": 0.2838, + "step": 78730 + }, + { + "epoch": 19.635910224438902, + "grad_norm": 7.171167850494385, + "learning_rate": 3.72568578553616e-07, + "loss": 0.361, + "step": 78740 + }, + { + "epoch": 19.638403990024937, + "grad_norm": 8.736506462097168, + "learning_rate": 3.700748129675811e-07, + "loss": 0.3024, + "step": 78750 + }, + { + "epoch": 19.640897755610972, + "grad_norm": 8.326260566711426, + "learning_rate": 3.6758104738154615e-07, + "loss": 0.3674, + "step": 78760 + }, + { + "epoch": 19.643391521197007, + "grad_norm": 7.458507061004639, + "learning_rate": 3.6508728179551125e-07, + "loss": 0.3018, + "step": 78770 + }, + { + "epoch": 19.64588528678304, + "grad_norm": 10.653674125671387, + "learning_rate": 3.625935162094763e-07, + "loss": 0.3053, + "step": 78780 + }, + { + "epoch": 19.648379052369076, + "grad_norm": 7.049093723297119, + "learning_rate": 3.600997506234414e-07, + "loss": 0.3749, + "step": 78790 + }, + { + "epoch": 19.65087281795511, + "grad_norm": 8.122016906738281, + "learning_rate": 3.5760598503740654e-07, + "loss": 0.3234, + "step": 78800 + }, + { + "epoch": 19.653366583541146, + "grad_norm": 11.616135597229004, + "learning_rate": 3.5511221945137164e-07, + "loss": 0.2943, + "step": 78810 + }, + { + "epoch": 19.65586034912718, + "grad_norm": 12.034366607666016, + "learning_rate": 3.526184538653367e-07, + "loss": 0.322, + "step": 78820 + }, + { + "epoch": 19.658354114713216, + "grad_norm": 11.825723648071289, + "learning_rate": 3.501246882793018e-07, + "loss": 0.3468, + "step": 78830 + }, + { + "epoch": 19.66084788029925, + "grad_norm": 8.839386940002441, + "learning_rate": 3.4763092269326683e-07, + "loss": 0.3245, + "step": 78840 + }, + { + "epoch": 19.663341645885286, + "grad_norm": 5.619390487670898, + "learning_rate": 3.451371571072319e-07, + "loss": 0.2613, + "step": 78850 + }, + { + "epoch": 19.66583541147132, + "grad_norm": 7.329450607299805, + "learning_rate": 3.426433915211971e-07, + "loss": 0.3291, + "step": 78860 + }, + { + "epoch": 19.668329177057355, + "grad_norm": 8.157301902770996, + "learning_rate": 3.401496259351621e-07, + "loss": 0.3298, + "step": 78870 + }, + { + "epoch": 19.67082294264339, + "grad_norm": 18.877344131469727, + "learning_rate": 3.376558603491272e-07, + "loss": 0.279, + "step": 78880 + }, + { + "epoch": 19.673316708229425, + "grad_norm": 9.491035461425781, + "learning_rate": 3.351620947630923e-07, + "loss": 0.2876, + "step": 78890 + }, + { + "epoch": 19.67581047381546, + "grad_norm": 9.576622009277344, + "learning_rate": 3.3266832917705736e-07, + "loss": 0.2932, + "step": 78900 + }, + { + "epoch": 19.678304239401495, + "grad_norm": 8.893289566040039, + "learning_rate": 3.3017456359102246e-07, + "loss": 0.2582, + "step": 78910 + }, + { + "epoch": 19.68079800498753, + "grad_norm": 12.161534309387207, + "learning_rate": 3.276807980049876e-07, + "loss": 0.3097, + "step": 78920 + }, + { + "epoch": 19.683291770573565, + "grad_norm": 9.909988403320312, + "learning_rate": 3.2518703241895266e-07, + "loss": 0.326, + "step": 78930 + }, + { + "epoch": 19.6857855361596, + "grad_norm": 7.692135810852051, + "learning_rate": 3.2269326683291775e-07, + "loss": 0.3324, + "step": 78940 + }, + { + "epoch": 19.688279301745634, + "grad_norm": 9.485343933105469, + "learning_rate": 3.201995012468828e-07, + "loss": 0.2779, + "step": 78950 + }, + { + "epoch": 19.69077306733167, + "grad_norm": 9.124640464782715, + "learning_rate": 3.177057356608479e-07, + "loss": 0.3118, + "step": 78960 + }, + { + "epoch": 19.693266832917704, + "grad_norm": 8.47995376586914, + "learning_rate": 3.1521197007481294e-07, + "loss": 0.2873, + "step": 78970 + }, + { + "epoch": 19.69576059850374, + "grad_norm": 9.394947052001953, + "learning_rate": 3.127182044887781e-07, + "loss": 0.317, + "step": 78980 + }, + { + "epoch": 19.698254364089777, + "grad_norm": 9.908628463745117, + "learning_rate": 3.1022443890274314e-07, + "loss": 0.3296, + "step": 78990 + }, + { + "epoch": 19.70074812967581, + "grad_norm": 8.319961547851562, + "learning_rate": 3.077306733167083e-07, + "loss": 0.3183, + "step": 79000 + }, + { + "epoch": 19.703241895261847, + "grad_norm": 9.598522186279297, + "learning_rate": 3.0523690773067333e-07, + "loss": 0.3059, + "step": 79010 + }, + { + "epoch": 19.705735660847882, + "grad_norm": 7.940982341766357, + "learning_rate": 3.0274314214463843e-07, + "loss": 0.385, + "step": 79020 + }, + { + "epoch": 19.708229426433917, + "grad_norm": 9.714994430541992, + "learning_rate": 3.0024937655860353e-07, + "loss": 0.3216, + "step": 79030 + }, + { + "epoch": 19.71072319201995, + "grad_norm": 11.18153190612793, + "learning_rate": 2.9775561097256863e-07, + "loss": 0.3392, + "step": 79040 + }, + { + "epoch": 19.713216957605987, + "grad_norm": 8.196310043334961, + "learning_rate": 2.9526184538653367e-07, + "loss": 0.277, + "step": 79050 + }, + { + "epoch": 19.71571072319202, + "grad_norm": 12.826026916503906, + "learning_rate": 2.9276807980049877e-07, + "loss": 0.3442, + "step": 79060 + }, + { + "epoch": 19.718204488778056, + "grad_norm": 14.502406120300293, + "learning_rate": 2.9027431421446387e-07, + "loss": 0.3195, + "step": 79070 + }, + { + "epoch": 19.72069825436409, + "grad_norm": 10.214339256286621, + "learning_rate": 2.8778054862842897e-07, + "loss": 0.2983, + "step": 79080 + }, + { + "epoch": 19.723192019950126, + "grad_norm": 9.869833946228027, + "learning_rate": 2.8528678304239406e-07, + "loss": 0.3399, + "step": 79090 + }, + { + "epoch": 19.72568578553616, + "grad_norm": 10.579812049865723, + "learning_rate": 2.827930174563591e-07, + "loss": 0.2615, + "step": 79100 + }, + { + "epoch": 19.728179551122196, + "grad_norm": 9.650101661682129, + "learning_rate": 2.802992518703242e-07, + "loss": 0.2366, + "step": 79110 + }, + { + "epoch": 19.73067331670823, + "grad_norm": 7.501360893249512, + "learning_rate": 2.778054862842893e-07, + "loss": 0.2624, + "step": 79120 + }, + { + "epoch": 19.733167082294266, + "grad_norm": 7.877026081085205, + "learning_rate": 2.753117206982544e-07, + "loss": 0.302, + "step": 79130 + }, + { + "epoch": 19.7356608478803, + "grad_norm": 9.600006103515625, + "learning_rate": 2.7281795511221945e-07, + "loss": 0.3465, + "step": 79140 + }, + { + "epoch": 19.738154613466335, + "grad_norm": 8.802675247192383, + "learning_rate": 2.703241895261846e-07, + "loss": 0.2923, + "step": 79150 + }, + { + "epoch": 19.74064837905237, + "grad_norm": 16.73734474182129, + "learning_rate": 2.6783042394014964e-07, + "loss": 0.3215, + "step": 79160 + }, + { + "epoch": 19.743142144638405, + "grad_norm": 13.215644836425781, + "learning_rate": 2.6533665835411474e-07, + "loss": 0.3195, + "step": 79170 + }, + { + "epoch": 19.74563591022444, + "grad_norm": 6.847998142242432, + "learning_rate": 2.6284289276807984e-07, + "loss": 0.3243, + "step": 79180 + }, + { + "epoch": 19.748129675810475, + "grad_norm": 8.965968132019043, + "learning_rate": 2.6034912718204494e-07, + "loss": 0.3299, + "step": 79190 + }, + { + "epoch": 19.75062344139651, + "grad_norm": 8.921945571899414, + "learning_rate": 2.5785536159601e-07, + "loss": 0.3414, + "step": 79200 + }, + { + "epoch": 19.753117206982544, + "grad_norm": 8.51665210723877, + "learning_rate": 2.553615960099751e-07, + "loss": 0.3682, + "step": 79210 + }, + { + "epoch": 19.75561097256858, + "grad_norm": 9.166069984436035, + "learning_rate": 2.528678304239402e-07, + "loss": 0.3229, + "step": 79220 + }, + { + "epoch": 19.758104738154614, + "grad_norm": 16.574934005737305, + "learning_rate": 2.503740648379053e-07, + "loss": 0.3596, + "step": 79230 + }, + { + "epoch": 19.76059850374065, + "grad_norm": 8.440783500671387, + "learning_rate": 2.478802992518703e-07, + "loss": 0.259, + "step": 79240 + }, + { + "epoch": 19.763092269326684, + "grad_norm": 10.631123542785645, + "learning_rate": 2.453865336658354e-07, + "loss": 0.3265, + "step": 79250 + }, + { + "epoch": 19.76558603491272, + "grad_norm": 10.455692291259766, + "learning_rate": 2.428927680798005e-07, + "loss": 0.3613, + "step": 79260 + }, + { + "epoch": 19.768079800498754, + "grad_norm": 9.9364013671875, + "learning_rate": 2.403990024937656e-07, + "loss": 0.3156, + "step": 79270 + }, + { + "epoch": 19.77057356608479, + "grad_norm": 8.415620803833008, + "learning_rate": 2.3790523690773071e-07, + "loss": 0.3035, + "step": 79280 + }, + { + "epoch": 19.773067331670823, + "grad_norm": 9.595970153808594, + "learning_rate": 2.3541147132169579e-07, + "loss": 0.2897, + "step": 79290 + }, + { + "epoch": 19.77556109725686, + "grad_norm": 8.665029525756836, + "learning_rate": 2.3291770573566086e-07, + "loss": 0.4021, + "step": 79300 + }, + { + "epoch": 19.778054862842893, + "grad_norm": 8.567267417907715, + "learning_rate": 2.3042394014962595e-07, + "loss": 0.3056, + "step": 79310 + }, + { + "epoch": 19.780548628428928, + "grad_norm": 12.760359764099121, + "learning_rate": 2.2793017456359103e-07, + "loss": 0.3164, + "step": 79320 + }, + { + "epoch": 19.783042394014963, + "grad_norm": 11.60568904876709, + "learning_rate": 2.2543640897755612e-07, + "loss": 0.3371, + "step": 79330 + }, + { + "epoch": 19.785536159600998, + "grad_norm": 8.413047790527344, + "learning_rate": 2.2294264339152122e-07, + "loss": 0.312, + "step": 79340 + }, + { + "epoch": 19.788029925187033, + "grad_norm": 9.830360412597656, + "learning_rate": 2.204488778054863e-07, + "loss": 0.3139, + "step": 79350 + }, + { + "epoch": 19.790523690773068, + "grad_norm": 9.22851848602295, + "learning_rate": 2.1795511221945137e-07, + "loss": 0.3348, + "step": 79360 + }, + { + "epoch": 19.793017456359102, + "grad_norm": 9.71314811706543, + "learning_rate": 2.154613466334165e-07, + "loss": 0.4009, + "step": 79370 + }, + { + "epoch": 19.795511221945137, + "grad_norm": 12.098420143127441, + "learning_rate": 2.1296758104738156e-07, + "loss": 0.2935, + "step": 79380 + }, + { + "epoch": 19.798004987531172, + "grad_norm": 11.904885292053223, + "learning_rate": 2.1047381546134663e-07, + "loss": 0.3176, + "step": 79390 + }, + { + "epoch": 19.800498753117207, + "grad_norm": 10.853218078613281, + "learning_rate": 2.0798004987531176e-07, + "loss": 0.3419, + "step": 79400 + }, + { + "epoch": 19.802992518703242, + "grad_norm": 10.429359436035156, + "learning_rate": 2.0548628428927683e-07, + "loss": 0.28, + "step": 79410 + }, + { + "epoch": 19.805486284289277, + "grad_norm": 6.893282413482666, + "learning_rate": 2.029925187032419e-07, + "loss": 0.2803, + "step": 79420 + }, + { + "epoch": 19.80798004987531, + "grad_norm": 17.683412551879883, + "learning_rate": 2.0049875311720702e-07, + "loss": 0.3656, + "step": 79430 + }, + { + "epoch": 19.810473815461346, + "grad_norm": 6.478290557861328, + "learning_rate": 1.980049875311721e-07, + "loss": 0.3119, + "step": 79440 + }, + { + "epoch": 19.81296758104738, + "grad_norm": 11.704423904418945, + "learning_rate": 1.9551122194513717e-07, + "loss": 0.4239, + "step": 79450 + }, + { + "epoch": 19.815461346633416, + "grad_norm": 10.740715980529785, + "learning_rate": 1.9301745635910227e-07, + "loss": 0.3632, + "step": 79460 + }, + { + "epoch": 19.81795511221945, + "grad_norm": 7.998754501342773, + "learning_rate": 1.9052369077306736e-07, + "loss": 0.3205, + "step": 79470 + }, + { + "epoch": 19.820448877805486, + "grad_norm": 12.84117317199707, + "learning_rate": 1.8802992518703243e-07, + "loss": 0.322, + "step": 79480 + }, + { + "epoch": 19.82294264339152, + "grad_norm": 11.327634811401367, + "learning_rate": 1.8553615960099753e-07, + "loss": 0.3062, + "step": 79490 + }, + { + "epoch": 19.825436408977556, + "grad_norm": 16.317493438720703, + "learning_rate": 1.830423940149626e-07, + "loss": 0.3968, + "step": 79500 + }, + { + "epoch": 19.82793017456359, + "grad_norm": 9.288350105285645, + "learning_rate": 1.8054862842892768e-07, + "loss": 0.2814, + "step": 79510 + }, + { + "epoch": 19.830423940149625, + "grad_norm": 6.440745830535889, + "learning_rate": 1.780548628428928e-07, + "loss": 0.2889, + "step": 79520 + }, + { + "epoch": 19.83291770573566, + "grad_norm": 7.100279331207275, + "learning_rate": 1.7556109725685787e-07, + "loss": 0.2817, + "step": 79530 + }, + { + "epoch": 19.835411471321695, + "grad_norm": 8.271157264709473, + "learning_rate": 1.7306733167082294e-07, + "loss": 0.2249, + "step": 79540 + }, + { + "epoch": 19.83790523690773, + "grad_norm": 7.640126705169678, + "learning_rate": 1.7057356608478801e-07, + "loss": 0.3156, + "step": 79550 + }, + { + "epoch": 19.840399002493765, + "grad_norm": 11.971923828125, + "learning_rate": 1.6807980049875314e-07, + "loss": 0.3263, + "step": 79560 + }, + { + "epoch": 19.8428927680798, + "grad_norm": 6.482527256011963, + "learning_rate": 1.655860349127182e-07, + "loss": 0.2904, + "step": 79570 + }, + { + "epoch": 19.845386533665835, + "grad_norm": 7.9649200439453125, + "learning_rate": 1.6309226932668328e-07, + "loss": 0.309, + "step": 79580 + }, + { + "epoch": 19.84788029925187, + "grad_norm": 9.517223358154297, + "learning_rate": 1.605985037406484e-07, + "loss": 0.3661, + "step": 79590 + }, + { + "epoch": 19.850374064837904, + "grad_norm": 16.75897216796875, + "learning_rate": 1.5810473815461348e-07, + "loss": 0.3179, + "step": 79600 + }, + { + "epoch": 19.85286783042394, + "grad_norm": 7.259194850921631, + "learning_rate": 1.5561097256857858e-07, + "loss": 0.313, + "step": 79610 + }, + { + "epoch": 19.855361596009974, + "grad_norm": 6.734643936157227, + "learning_rate": 1.5311720698254367e-07, + "loss": 0.3129, + "step": 79620 + }, + { + "epoch": 19.85785536159601, + "grad_norm": 8.320770263671875, + "learning_rate": 1.5062344139650875e-07, + "loss": 0.2911, + "step": 79630 + }, + { + "epoch": 19.860349127182044, + "grad_norm": 9.448941230773926, + "learning_rate": 1.4812967581047384e-07, + "loss": 0.3353, + "step": 79640 + }, + { + "epoch": 19.86284289276808, + "grad_norm": 7.158919811248779, + "learning_rate": 1.4563591022443891e-07, + "loss": 0.3327, + "step": 79650 + }, + { + "epoch": 19.865336658354114, + "grad_norm": 6.863051891326904, + "learning_rate": 1.43142144638404e-07, + "loss": 0.2756, + "step": 79660 + }, + { + "epoch": 19.86783042394015, + "grad_norm": 9.206826210021973, + "learning_rate": 1.4064837905236908e-07, + "loss": 0.3184, + "step": 79670 + }, + { + "epoch": 19.870324189526183, + "grad_norm": 9.977852821350098, + "learning_rate": 1.3815461346633418e-07, + "loss": 0.3783, + "step": 79680 + }, + { + "epoch": 19.872817955112218, + "grad_norm": 8.145519256591797, + "learning_rate": 1.3566084788029925e-07, + "loss": 0.3138, + "step": 79690 + }, + { + "epoch": 19.875311720698253, + "grad_norm": 10.240920066833496, + "learning_rate": 1.3316708229426435e-07, + "loss": 0.3352, + "step": 79700 + }, + { + "epoch": 19.877805486284288, + "grad_norm": 4.372189998626709, + "learning_rate": 1.3067331670822942e-07, + "loss": 0.3623, + "step": 79710 + }, + { + "epoch": 19.880299251870323, + "grad_norm": 8.158557891845703, + "learning_rate": 1.2817955112219452e-07, + "loss": 0.3992, + "step": 79720 + }, + { + "epoch": 19.882793017456358, + "grad_norm": 9.305794715881348, + "learning_rate": 1.2568578553615962e-07, + "loss": 0.3528, + "step": 79730 + }, + { + "epoch": 19.885286783042392, + "grad_norm": 8.110395431518555, + "learning_rate": 1.231920199501247e-07, + "loss": 0.3128, + "step": 79740 + }, + { + "epoch": 19.887780548628427, + "grad_norm": 7.968203067779541, + "learning_rate": 1.206982543640898e-07, + "loss": 0.3007, + "step": 79750 + }, + { + "epoch": 19.890274314214462, + "grad_norm": 7.759973526000977, + "learning_rate": 1.1820448877805489e-07, + "loss": 0.2825, + "step": 79760 + }, + { + "epoch": 19.892768079800497, + "grad_norm": 10.173065185546875, + "learning_rate": 1.1571072319201996e-07, + "loss": 0.411, + "step": 79770 + }, + { + "epoch": 19.895261845386532, + "grad_norm": 7.4826555252075195, + "learning_rate": 1.1321695760598506e-07, + "loss": 0.2562, + "step": 79780 + }, + { + "epoch": 19.897755610972567, + "grad_norm": 9.044584274291992, + "learning_rate": 1.1072319201995014e-07, + "loss": 0.3396, + "step": 79790 + }, + { + "epoch": 19.900249376558605, + "grad_norm": 9.79257583618164, + "learning_rate": 1.0822942643391521e-07, + "loss": 0.323, + "step": 79800 + }, + { + "epoch": 19.902743142144637, + "grad_norm": 10.325493812561035, + "learning_rate": 1.0573566084788031e-07, + "loss": 0.2948, + "step": 79810 + }, + { + "epoch": 19.905236907730675, + "grad_norm": 8.43407917022705, + "learning_rate": 1.0324189526184538e-07, + "loss": 0.2562, + "step": 79820 + }, + { + "epoch": 19.90773067331671, + "grad_norm": 9.039451599121094, + "learning_rate": 1.0074812967581048e-07, + "loss": 0.319, + "step": 79830 + }, + { + "epoch": 19.910224438902745, + "grad_norm": 5.012162208557129, + "learning_rate": 9.825436408977558e-08, + "loss": 0.2856, + "step": 79840 + }, + { + "epoch": 19.91271820448878, + "grad_norm": 6.816104412078857, + "learning_rate": 9.576059850374065e-08, + "loss": 0.2679, + "step": 79850 + }, + { + "epoch": 19.915211970074814, + "grad_norm": 12.247861862182617, + "learning_rate": 9.326683291770575e-08, + "loss": 0.3458, + "step": 79860 + }, + { + "epoch": 19.91770573566085, + "grad_norm": 6.4964118003845215, + "learning_rate": 9.077306733167083e-08, + "loss": 0.3334, + "step": 79870 + }, + { + "epoch": 19.920199501246884, + "grad_norm": 9.710594177246094, + "learning_rate": 8.827930174563592e-08, + "loss": 0.3469, + "step": 79880 + }, + { + "epoch": 19.92269326683292, + "grad_norm": 12.090693473815918, + "learning_rate": 8.5785536159601e-08, + "loss": 0.3702, + "step": 79890 + }, + { + "epoch": 19.925187032418954, + "grad_norm": 8.259521484375, + "learning_rate": 8.32917705735661e-08, + "loss": 0.3142, + "step": 79900 + }, + { + "epoch": 19.92768079800499, + "grad_norm": 10.591052055358887, + "learning_rate": 8.079800498753117e-08, + "loss": 0.3415, + "step": 79910 + }, + { + "epoch": 19.930174563591024, + "grad_norm": 8.798907279968262, + "learning_rate": 7.830423940149627e-08, + "loss": 0.2823, + "step": 79920 + }, + { + "epoch": 19.93266832917706, + "grad_norm": 6.978452682495117, + "learning_rate": 7.581047381546135e-08, + "loss": 0.3098, + "step": 79930 + }, + { + "epoch": 19.935162094763093, + "grad_norm": 8.613764762878418, + "learning_rate": 7.331670822942644e-08, + "loss": 0.3456, + "step": 79940 + }, + { + "epoch": 19.93765586034913, + "grad_norm": 9.246316909790039, + "learning_rate": 7.082294264339154e-08, + "loss": 0.2926, + "step": 79950 + }, + { + "epoch": 19.940149625935163, + "grad_norm": 6.40701961517334, + "learning_rate": 6.832917705735662e-08, + "loss": 0.4304, + "step": 79960 + }, + { + "epoch": 19.942643391521198, + "grad_norm": 9.808752059936523, + "learning_rate": 6.58354114713217e-08, + "loss": 0.3531, + "step": 79970 + }, + { + "epoch": 19.945137157107233, + "grad_norm": 10.570954322814941, + "learning_rate": 6.334164588528679e-08, + "loss": 0.304, + "step": 79980 + }, + { + "epoch": 19.947630922693268, + "grad_norm": 6.992794036865234, + "learning_rate": 6.084788029925187e-08, + "loss": 0.333, + "step": 79990 + }, + { + "epoch": 19.950124688279303, + "grad_norm": 6.577144145965576, + "learning_rate": 5.835411471321696e-08, + "loss": 0.2929, + "step": 80000 + }, + { + "epoch": 19.952618453865338, + "grad_norm": 8.778485298156738, + "learning_rate": 5.5860349127182044e-08, + "loss": 0.3606, + "step": 80010 + }, + { + "epoch": 19.955112219451372, + "grad_norm": 6.685991287231445, + "learning_rate": 5.336658354114714e-08, + "loss": 0.3066, + "step": 80020 + }, + { + "epoch": 19.957605985037407, + "grad_norm": 11.452533721923828, + "learning_rate": 5.087281795511222e-08, + "loss": 0.284, + "step": 80030 + }, + { + "epoch": 19.960099750623442, + "grad_norm": 9.021291732788086, + "learning_rate": 4.8379052369077305e-08, + "loss": 0.3149, + "step": 80040 + }, + { + "epoch": 19.962593516209477, + "grad_norm": 8.216629981994629, + "learning_rate": 4.58852867830424e-08, + "loss": 0.3321, + "step": 80050 + }, + { + "epoch": 19.965087281795512, + "grad_norm": 9.343437194824219, + "learning_rate": 4.339152119700749e-08, + "loss": 0.325, + "step": 80060 + }, + { + "epoch": 19.967581047381547, + "grad_norm": 7.705197811126709, + "learning_rate": 4.089775561097257e-08, + "loss": 0.3533, + "step": 80070 + }, + { + "epoch": 19.97007481296758, + "grad_norm": 8.739502906799316, + "learning_rate": 3.840399002493766e-08, + "loss": 0.3282, + "step": 80080 + }, + { + "epoch": 19.972568578553616, + "grad_norm": 10.272692680358887, + "learning_rate": 3.591022443890275e-08, + "loss": 0.3355, + "step": 80090 + }, + { + "epoch": 19.97506234413965, + "grad_norm": 7.91961145401001, + "learning_rate": 3.341645885286783e-08, + "loss": 0.342, + "step": 80100 + }, + { + "epoch": 19.977556109725686, + "grad_norm": 7.7013936042785645, + "learning_rate": 3.0922693266832924e-08, + "loss": 0.3139, + "step": 80110 + }, + { + "epoch": 19.98004987531172, + "grad_norm": 8.244073867797852, + "learning_rate": 2.8428927680798006e-08, + "loss": 0.3602, + "step": 80120 + }, + { + "epoch": 19.982543640897756, + "grad_norm": 9.814148902893066, + "learning_rate": 2.5935162094763094e-08, + "loss": 0.3641, + "step": 80130 + }, + { + "epoch": 19.98503740648379, + "grad_norm": 7.1216020584106445, + "learning_rate": 2.344139650872818e-08, + "loss": 0.326, + "step": 80140 + }, + { + "epoch": 19.987531172069826, + "grad_norm": 9.763239860534668, + "learning_rate": 2.094763092269327e-08, + "loss": 0.3333, + "step": 80150 + }, + { + "epoch": 19.99002493765586, + "grad_norm": 7.955997943878174, + "learning_rate": 1.8453865336658355e-08, + "loss": 0.3693, + "step": 80160 + }, + { + "epoch": 19.992518703241895, + "grad_norm": 8.751893997192383, + "learning_rate": 1.5960099750623443e-08, + "loss": 0.3065, + "step": 80170 + }, + { + "epoch": 19.99501246882793, + "grad_norm": 11.452372550964355, + "learning_rate": 1.346633416458853e-08, + "loss": 0.3088, + "step": 80180 + }, + { + "epoch": 19.997506234413965, + "grad_norm": 7.998083591461182, + "learning_rate": 1.0972568578553617e-08, + "loss": 0.2875, + "step": 80190 + }, + { + "epoch": 20.0, + "grad_norm": 9.69516372680664, + "learning_rate": 8.478802992518703e-09, + "loss": 0.3132, + "step": 80200 + } + ], + "logging_steps": 10, + "max_steps": 80200, + "num_input_tokens_seen": 0, + "num_train_epochs": 20, + "save_steps": 500, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 5.9992048140288e+16, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +}