diff --git "a/trainer_state.json" "b/trainer_state.json" --- "a/trainer_state.json" +++ "b/trainer_state.json" @@ -1,9 +1,9 @@ { "best_metric": null, "best_model_checkpoint": null, - "epoch": 0.7699711260827719, + "epoch": 1.1549566891241578, "eval_steps": 500, - "global_step": 20000, + "global_step": 30000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, @@ -28007,6 +28007,14006 @@ "learning_rate": 0.00013542003106948943, "loss": 1.1504, "step": 20000 + }, + { + "epoch": 0.7701636188642926, + "grad_norm": 1.804743766784668, + "learning_rate": 0.00013539175029099405, + "loss": 1.1263, + "step": 20005 + }, + { + "epoch": 0.7703561116458133, + "grad_norm": 1.006145715713501, + "learning_rate": 0.00013536346627615755, + "loss": 1.2072, + "step": 20010 + }, + { + "epoch": 0.770548604427334, + "grad_norm": 1.6577104330062866, + "learning_rate": 0.0001353351790275663, + "loss": 0.9571, + "step": 20015 + }, + { + "epoch": 0.7707410972088546, + "grad_norm": 1.1842409372329712, + "learning_rate": 0.00013530688854780705, + "loss": 1.0826, + "step": 20020 + }, + { + "epoch": 0.7709335899903753, + "grad_norm": 2.0414535999298096, + "learning_rate": 0.00013527859483946668, + "loss": 0.9676, + "step": 20025 + }, + { + "epoch": 0.7711260827718961, + "grad_norm": 1.6052886247634888, + "learning_rate": 0.00013525029790513254, + "loss": 1.211, + "step": 20030 + }, + { + "epoch": 0.7713185755534168, + "grad_norm": 1.241951823234558, + "learning_rate": 0.00013522199774739218, + "loss": 1.0928, + "step": 20035 + }, + { + "epoch": 0.7715110683349374, + "grad_norm": 1.220537543296814, + "learning_rate": 0.0001351936943688334, + "loss": 1.0877, + "step": 20040 + }, + { + "epoch": 0.7717035611164581, + "grad_norm": 1.6344817876815796, + "learning_rate": 0.00013516538777204445, + "loss": 0.9949, + "step": 20045 + }, + { + "epoch": 0.7718960538979788, + "grad_norm": 1.798012137413025, + "learning_rate": 0.00013513707795961374, + "loss": 1.1155, + "step": 20050 + }, + { + "epoch": 0.7720885466794996, + "grad_norm": 1.1736531257629395, + "learning_rate": 0.00013510876493413, + "loss": 1.1333, + "step": 20055 + }, + { + "epoch": 0.7722810394610202, + "grad_norm": 1.3480515480041504, + "learning_rate": 0.00013508044869818225, + "loss": 1.316, + "step": 20060 + }, + { + "epoch": 0.7724735322425409, + "grad_norm": 1.5204951763153076, + "learning_rate": 0.00013505212925435994, + "loss": 1.157, + "step": 20065 + }, + { + "epoch": 0.7726660250240616, + "grad_norm": 1.116921067237854, + "learning_rate": 0.00013502380660525255, + "loss": 0.9601, + "step": 20070 + }, + { + "epoch": 0.7728585178055823, + "grad_norm": 2.0013625621795654, + "learning_rate": 0.00013499548075345002, + "loss": 0.9889, + "step": 20075 + }, + { + "epoch": 0.773051010587103, + "grad_norm": 2.779212474822998, + "learning_rate": 0.00013496715170154266, + "loss": 1.1228, + "step": 20080 + }, + { + "epoch": 0.7732435033686237, + "grad_norm": 1.983289361000061, + "learning_rate": 0.00013493881945212088, + "loss": 1.1228, + "step": 20085 + }, + { + "epoch": 0.7734359961501444, + "grad_norm": 1.5862370729446411, + "learning_rate": 0.0001349104840077755, + "loss": 1.2704, + "step": 20090 + }, + { + "epoch": 0.773628488931665, + "grad_norm": 1.318428874015808, + "learning_rate": 0.00013488214537109766, + "loss": 1.0183, + "step": 20095 + }, + { + "epoch": 0.7738209817131858, + "grad_norm": 1.2089256048202515, + "learning_rate": 0.00013485380354467868, + "loss": 1.2204, + "step": 20100 + }, + { + "epoch": 0.7740134744947065, + "grad_norm": 1.587889552116394, + "learning_rate": 0.00013482545853111023, + "loss": 1.1076, + "step": 20105 + }, + { + "epoch": 0.7742059672762271, + "grad_norm": 1.168034553527832, + "learning_rate": 0.00013479711033298432, + "loss": 1.1339, + "step": 20110 + }, + { + "epoch": 0.7743984600577478, + "grad_norm": 1.2924379110336304, + "learning_rate": 0.0001347687589528932, + "loss": 1.1975, + "step": 20115 + }, + { + "epoch": 0.7745909528392685, + "grad_norm": 1.0745187997817993, + "learning_rate": 0.00013474040439342933, + "loss": 1.185, + "step": 20120 + }, + { + "epoch": 0.7747834456207893, + "grad_norm": 1.319265365600586, + "learning_rate": 0.00013471204665718568, + "loss": 0.9424, + "step": 20125 + }, + { + "epoch": 0.7749759384023099, + "grad_norm": 1.1227604150772095, + "learning_rate": 0.00013468368574675528, + "loss": 1.1125, + "step": 20130 + }, + { + "epoch": 0.7751684311838306, + "grad_norm": 0.9614366888999939, + "learning_rate": 0.00013465532166473157, + "loss": 1.2149, + "step": 20135 + }, + { + "epoch": 0.7753609239653513, + "grad_norm": 1.4985253810882568, + "learning_rate": 0.00013462695441370827, + "loss": 1.0558, + "step": 20140 + }, + { + "epoch": 0.775553416746872, + "grad_norm": 2.495373487472534, + "learning_rate": 0.00013459858399627938, + "loss": 1.1317, + "step": 20145 + }, + { + "epoch": 0.7757459095283927, + "grad_norm": 1.7242252826690674, + "learning_rate": 0.00013457021041503916, + "loss": 1.1204, + "step": 20150 + }, + { + "epoch": 0.7759384023099134, + "grad_norm": 1.0122432708740234, + "learning_rate": 0.00013454183367258223, + "loss": 1.0719, + "step": 20155 + }, + { + "epoch": 0.7761308950914341, + "grad_norm": 1.057358741760254, + "learning_rate": 0.00013451345377150342, + "loss": 1.0651, + "step": 20160 + }, + { + "epoch": 0.7763233878729547, + "grad_norm": 0.9141941666603088, + "learning_rate": 0.00013448507071439788, + "loss": 1.0958, + "step": 20165 + }, + { + "epoch": 0.7765158806544754, + "grad_norm": 1.0692884922027588, + "learning_rate": 0.00013445668450386106, + "loss": 0.9905, + "step": 20170 + }, + { + "epoch": 0.7767083734359962, + "grad_norm": 1.3968685865402222, + "learning_rate": 0.00013442829514248873, + "loss": 1.0801, + "step": 20175 + }, + { + "epoch": 0.7769008662175169, + "grad_norm": 1.427119255065918, + "learning_rate": 0.00013439990263287686, + "loss": 1.2859, + "step": 20180 + }, + { + "epoch": 0.7770933589990375, + "grad_norm": 1.9501906633377075, + "learning_rate": 0.00013437150697762177, + "loss": 1.1413, + "step": 20185 + }, + { + "epoch": 0.7772858517805582, + "grad_norm": 0.9638822674751282, + "learning_rate": 0.00013434310817932006, + "loss": 1.0041, + "step": 20190 + }, + { + "epoch": 0.7774783445620789, + "grad_norm": 1.9460463523864746, + "learning_rate": 0.00013431470624056862, + "loss": 1.1464, + "step": 20195 + }, + { + "epoch": 0.7776708373435997, + "grad_norm": 1.4077537059783936, + "learning_rate": 0.00013428630116396457, + "loss": 1.086, + "step": 20200 + }, + { + "epoch": 0.7778633301251203, + "grad_norm": 1.545357346534729, + "learning_rate": 0.00013425789295210545, + "loss": 1.0626, + "step": 20205 + }, + { + "epoch": 0.778055822906641, + "grad_norm": 1.826687216758728, + "learning_rate": 0.00013422948160758894, + "loss": 1.1287, + "step": 20210 + }, + { + "epoch": 0.7782483156881617, + "grad_norm": 1.4467684030532837, + "learning_rate": 0.00013420106713301307, + "loss": 1.0774, + "step": 20215 + }, + { + "epoch": 0.7784408084696823, + "grad_norm": 1.0557719469070435, + "learning_rate": 0.00013417264953097618, + "loss": 0.968, + "step": 20220 + }, + { + "epoch": 0.7786333012512031, + "grad_norm": 1.5922484397888184, + "learning_rate": 0.00013414422880407686, + "loss": 1.1911, + "step": 20225 + }, + { + "epoch": 0.7788257940327238, + "grad_norm": 1.2360528707504272, + "learning_rate": 0.000134115804954914, + "loss": 1.123, + "step": 20230 + }, + { + "epoch": 0.7790182868142445, + "grad_norm": 1.3299078941345215, + "learning_rate": 0.0001340873779860868, + "loss": 1.1586, + "step": 20235 + }, + { + "epoch": 0.7792107795957651, + "grad_norm": 1.2143315076828003, + "learning_rate": 0.00013405894790019467, + "loss": 1.0069, + "step": 20240 + }, + { + "epoch": 0.7794032723772859, + "grad_norm": 0.9542683959007263, + "learning_rate": 0.00013403051469983737, + "loss": 1.2013, + "step": 20245 + }, + { + "epoch": 0.7795957651588066, + "grad_norm": 1.3608813285827637, + "learning_rate": 0.00013400207838761496, + "loss": 1.1684, + "step": 20250 + }, + { + "epoch": 0.7797882579403272, + "grad_norm": 0.4999949038028717, + "learning_rate": 0.0001339736389661277, + "loss": 1.1161, + "step": 20255 + }, + { + "epoch": 0.7799807507218479, + "grad_norm": 1.8794244527816772, + "learning_rate": 0.00013394519643797622, + "loss": 0.8838, + "step": 20260 + }, + { + "epoch": 0.7801732435033686, + "grad_norm": 0.8917430639266968, + "learning_rate": 0.00013391675080576138, + "loss": 0.9322, + "step": 20265 + }, + { + "epoch": 0.7803657362848894, + "grad_norm": 0.8630626797676086, + "learning_rate": 0.0001338883020720844, + "loss": 1.0805, + "step": 20270 + }, + { + "epoch": 0.78055822906641, + "grad_norm": 1.7655322551727295, + "learning_rate": 0.00013385985023954664, + "loss": 1.0011, + "step": 20275 + }, + { + "epoch": 0.7807507218479307, + "grad_norm": 2.459364652633667, + "learning_rate": 0.00013383139531074987, + "loss": 1.2634, + "step": 20280 + }, + { + "epoch": 0.7809432146294514, + "grad_norm": 0.9355961084365845, + "learning_rate": 0.00013380293728829613, + "loss": 1.1082, + "step": 20285 + }, + { + "epoch": 0.781135707410972, + "grad_norm": 1.9479714632034302, + "learning_rate": 0.00013377447617478772, + "loss": 1.2329, + "step": 20290 + }, + { + "epoch": 0.7813282001924928, + "grad_norm": 1.408583164215088, + "learning_rate": 0.00013374601197282715, + "loss": 1.0494, + "step": 20295 + }, + { + "epoch": 0.7815206929740135, + "grad_norm": 1.0006494522094727, + "learning_rate": 0.00013371754468501735, + "loss": 1.0935, + "step": 20300 + }, + { + "epoch": 0.7817131857555342, + "grad_norm": 1.8096888065338135, + "learning_rate": 0.00013368907431396147, + "loss": 1.14, + "step": 20305 + }, + { + "epoch": 0.7819056785370548, + "grad_norm": 1.373366117477417, + "learning_rate": 0.00013366060086226288, + "loss": 1.0523, + "step": 20310 + }, + { + "epoch": 0.7820981713185755, + "grad_norm": 1.7965810298919678, + "learning_rate": 0.00013363212433252538, + "loss": 1.1531, + "step": 20315 + }, + { + "epoch": 0.7822906641000963, + "grad_norm": 1.3974156379699707, + "learning_rate": 0.00013360364472735285, + "loss": 1.0295, + "step": 20320 + }, + { + "epoch": 0.782483156881617, + "grad_norm": 2.2229206562042236, + "learning_rate": 0.00013357516204934961, + "loss": 1.2168, + "step": 20325 + }, + { + "epoch": 0.7826756496631376, + "grad_norm": 2.140164852142334, + "learning_rate": 0.00013354667630112026, + "loss": 1.1943, + "step": 20330 + }, + { + "epoch": 0.7828681424446583, + "grad_norm": 1.12675940990448, + "learning_rate": 0.00013351818748526953, + "loss": 1.1229, + "step": 20335 + }, + { + "epoch": 0.783060635226179, + "grad_norm": 0.9767451286315918, + "learning_rate": 0.00013348969560440262, + "loss": 1.0961, + "step": 20340 + }, + { + "epoch": 0.7832531280076998, + "grad_norm": 2.367725372314453, + "learning_rate": 0.00013346120066112492, + "loss": 0.9969, + "step": 20345 + }, + { + "epoch": 0.7834456207892204, + "grad_norm": 1.4590439796447754, + "learning_rate": 0.00013343270265804205, + "loss": 1.2416, + "step": 20350 + }, + { + "epoch": 0.7836381135707411, + "grad_norm": 1.0241880416870117, + "learning_rate": 0.00013340420159776, + "loss": 1.1929, + "step": 20355 + }, + { + "epoch": 0.7838306063522618, + "grad_norm": 1.1249408721923828, + "learning_rate": 0.000133375697482885, + "loss": 1.1346, + "step": 20360 + }, + { + "epoch": 0.7840230991337824, + "grad_norm": 0.3698437809944153, + "learning_rate": 0.00013334719031602357, + "loss": 0.9347, + "step": 20365 + }, + { + "epoch": 0.7842155919153032, + "grad_norm": 0.9476606845855713, + "learning_rate": 0.00013331868009978248, + "loss": 1.0438, + "step": 20370 + }, + { + "epoch": 0.7844080846968239, + "grad_norm": 1.432447910308838, + "learning_rate": 0.0001332901668367688, + "loss": 1.0981, + "step": 20375 + }, + { + "epoch": 0.7846005774783446, + "grad_norm": 0.9563504457473755, + "learning_rate": 0.0001332616505295899, + "loss": 1.0899, + "step": 20380 + }, + { + "epoch": 0.7847930702598652, + "grad_norm": 1.3649405241012573, + "learning_rate": 0.00013323313118085341, + "loss": 1.2495, + "step": 20385 + }, + { + "epoch": 0.784985563041386, + "grad_norm": 1.109165906906128, + "learning_rate": 0.00013320460879316719, + "loss": 1.122, + "step": 20390 + }, + { + "epoch": 0.7851780558229067, + "grad_norm": 1.2185308933258057, + "learning_rate": 0.0001331760833691395, + "loss": 1.0217, + "step": 20395 + }, + { + "epoch": 0.7853705486044273, + "grad_norm": 1.0969536304473877, + "learning_rate": 0.00013314755491137872, + "loss": 1.0269, + "step": 20400 + }, + { + "epoch": 0.785563041385948, + "grad_norm": 1.2872892618179321, + "learning_rate": 0.00013311902342249364, + "loss": 1.2322, + "step": 20405 + }, + { + "epoch": 0.7857555341674687, + "grad_norm": 1.7610993385314941, + "learning_rate": 0.0001330904889050933, + "loss": 0.9873, + "step": 20410 + }, + { + "epoch": 0.7859480269489895, + "grad_norm": 2.3899195194244385, + "learning_rate": 0.00013306195136178687, + "loss": 1.4362, + "step": 20415 + }, + { + "epoch": 0.7861405197305101, + "grad_norm": 0.9143069982528687, + "learning_rate": 0.00013303341079518404, + "loss": 1.0994, + "step": 20420 + }, + { + "epoch": 0.7863330125120308, + "grad_norm": 0.9000052809715271, + "learning_rate": 0.00013300486720789465, + "loss": 1.0955, + "step": 20425 + }, + { + "epoch": 0.7865255052935515, + "grad_norm": 1.6691547632217407, + "learning_rate": 0.00013297632060252875, + "loss": 1.1152, + "step": 20430 + }, + { + "epoch": 0.7867179980750721, + "grad_norm": 1.8236645460128784, + "learning_rate": 0.0001329477709816968, + "loss": 0.9309, + "step": 20435 + }, + { + "epoch": 0.7869104908565929, + "grad_norm": 1.151060700416565, + "learning_rate": 0.00013291921834800947, + "loss": 1.0847, + "step": 20440 + }, + { + "epoch": 0.7871029836381136, + "grad_norm": 0.9802745580673218, + "learning_rate": 0.00013289066270407766, + "loss": 1.18, + "step": 20445 + }, + { + "epoch": 0.7872954764196343, + "grad_norm": 1.4023500680923462, + "learning_rate": 0.00013286210405251262, + "loss": 1.1138, + "step": 20450 + }, + { + "epoch": 0.7874879692011549, + "grad_norm": 2.108933925628662, + "learning_rate": 0.0001328335423959259, + "loss": 1.245, + "step": 20455 + }, + { + "epoch": 0.7876804619826756, + "grad_norm": 1.586288332939148, + "learning_rate": 0.0001328049777369292, + "loss": 1.1811, + "step": 20460 + }, + { + "epoch": 0.7878729547641964, + "grad_norm": 1.2598310708999634, + "learning_rate": 0.00013277641007813457, + "loss": 0.9954, + "step": 20465 + }, + { + "epoch": 0.7880654475457171, + "grad_norm": 0.8444984555244446, + "learning_rate": 0.00013274783942215443, + "loss": 0.9792, + "step": 20470 + }, + { + "epoch": 0.7882579403272377, + "grad_norm": 1.5442594289779663, + "learning_rate": 0.00013271926577160126, + "loss": 1.1249, + "step": 20475 + }, + { + "epoch": 0.7884504331087584, + "grad_norm": 1.1979268789291382, + "learning_rate": 0.000132690689129088, + "loss": 1.1101, + "step": 20480 + }, + { + "epoch": 0.7886429258902791, + "grad_norm": 1.0047119855880737, + "learning_rate": 0.00013266210949722777, + "loss": 1.2296, + "step": 20485 + }, + { + "epoch": 0.7888354186717998, + "grad_norm": 0.9286414980888367, + "learning_rate": 0.000132633526878634, + "loss": 1.0575, + "step": 20490 + }, + { + "epoch": 0.7890279114533205, + "grad_norm": 1.0984097719192505, + "learning_rate": 0.00013260494127592036, + "loss": 1.2199, + "step": 20495 + }, + { + "epoch": 0.7892204042348412, + "grad_norm": 2.0137503147125244, + "learning_rate": 0.00013257635269170082, + "loss": 1.1294, + "step": 20500 + }, + { + "epoch": 0.7894128970163619, + "grad_norm": 1.5907988548278809, + "learning_rate": 0.00013254776112858966, + "loss": 1.0603, + "step": 20505 + }, + { + "epoch": 0.7896053897978825, + "grad_norm": 1.6451369524002075, + "learning_rate": 0.00013251916658920133, + "loss": 1.2061, + "step": 20510 + }, + { + "epoch": 0.7897978825794033, + "grad_norm": 1.7516169548034668, + "learning_rate": 0.00013249056907615065, + "loss": 1.0927, + "step": 20515 + }, + { + "epoch": 0.789990375360924, + "grad_norm": 2.2118046283721924, + "learning_rate": 0.00013246196859205265, + "loss": 1.2632, + "step": 20520 + }, + { + "epoch": 0.7901828681424446, + "grad_norm": 1.0801787376403809, + "learning_rate": 0.00013243336513952265, + "loss": 1.104, + "step": 20525 + }, + { + "epoch": 0.7903753609239653, + "grad_norm": 1.737065076828003, + "learning_rate": 0.00013240475872117625, + "loss": 1.1356, + "step": 20530 + }, + { + "epoch": 0.790567853705486, + "grad_norm": 2.103365659713745, + "learning_rate": 0.0001323761493396294, + "loss": 1.1386, + "step": 20535 + }, + { + "epoch": 0.7907603464870068, + "grad_norm": 1.2037795782089233, + "learning_rate": 0.0001323475369974981, + "loss": 1.1696, + "step": 20540 + }, + { + "epoch": 0.7909528392685274, + "grad_norm": 2.0184690952301025, + "learning_rate": 0.0001323189216973988, + "loss": 1.2378, + "step": 20545 + }, + { + "epoch": 0.7911453320500481, + "grad_norm": 1.1056318283081055, + "learning_rate": 0.0001322903034419483, + "loss": 1.0001, + "step": 20550 + }, + { + "epoch": 0.7913378248315688, + "grad_norm": 1.8512266874313354, + "learning_rate": 0.00013226168223376335, + "loss": 1.2393, + "step": 20555 + }, + { + "epoch": 0.7915303176130896, + "grad_norm": 1.0075215101242065, + "learning_rate": 0.00013223305807546134, + "loss": 1.0399, + "step": 20560 + }, + { + "epoch": 0.7917228103946102, + "grad_norm": 1.206949234008789, + "learning_rate": 0.00013220443096965967, + "loss": 1.131, + "step": 20565 + }, + { + "epoch": 0.7919153031761309, + "grad_norm": 1.0401040315628052, + "learning_rate": 0.00013217580091897613, + "loss": 1.0425, + "step": 20570 + }, + { + "epoch": 0.7921077959576516, + "grad_norm": 1.1868935823440552, + "learning_rate": 0.00013214716792602873, + "loss": 1.2813, + "step": 20575 + }, + { + "epoch": 0.7923002887391722, + "grad_norm": 1.0238397121429443, + "learning_rate": 0.00013211853199343577, + "loss": 1.1308, + "step": 20580 + }, + { + "epoch": 0.792492781520693, + "grad_norm": 1.0374101400375366, + "learning_rate": 0.00013208989312381585, + "loss": 0.9725, + "step": 20585 + }, + { + "epoch": 0.7926852743022137, + "grad_norm": 1.1706335544586182, + "learning_rate": 0.00013206125131978778, + "loss": 1.2237, + "step": 20590 + }, + { + "epoch": 0.7928777670837344, + "grad_norm": 1.3591363430023193, + "learning_rate": 0.00013203260658397066, + "loss": 1.217, + "step": 20595 + }, + { + "epoch": 0.793070259865255, + "grad_norm": 1.0355279445648193, + "learning_rate": 0.00013200395891898388, + "loss": 1.1943, + "step": 20600 + }, + { + "epoch": 0.7932627526467757, + "grad_norm": 1.4872701168060303, + "learning_rate": 0.000131975308327447, + "loss": 1.0404, + "step": 20605 + }, + { + "epoch": 0.7934552454282965, + "grad_norm": 1.1777255535125732, + "learning_rate": 0.00013194665481198006, + "loss": 1.1498, + "step": 20610 + }, + { + "epoch": 0.7936477382098172, + "grad_norm": 1.446821689605713, + "learning_rate": 0.00013191799837520312, + "loss": 1.1557, + "step": 20615 + }, + { + "epoch": 0.7938402309913378, + "grad_norm": 1.1746444702148438, + "learning_rate": 0.00013188933901973668, + "loss": 1.0335, + "step": 20620 + }, + { + "epoch": 0.7940327237728585, + "grad_norm": 1.7785364389419556, + "learning_rate": 0.00013186067674820145, + "loss": 1.2201, + "step": 20625 + }, + { + "epoch": 0.7942252165543792, + "grad_norm": 2.1019060611724854, + "learning_rate": 0.0001318320115632184, + "loss": 1.3937, + "step": 20630 + }, + { + "epoch": 0.7944177093359, + "grad_norm": 0.9918488264083862, + "learning_rate": 0.0001318033434674087, + "loss": 0.899, + "step": 20635 + }, + { + "epoch": 0.7946102021174206, + "grad_norm": 1.4679549932479858, + "learning_rate": 0.00013177467246339395, + "loss": 1.1425, + "step": 20640 + }, + { + "epoch": 0.7948026948989413, + "grad_norm": 1.2556777000427246, + "learning_rate": 0.0001317459985537959, + "loss": 1.0685, + "step": 20645 + }, + { + "epoch": 0.794995187680462, + "grad_norm": 1.1215819120407104, + "learning_rate": 0.00013171732174123657, + "loss": 1.1001, + "step": 20650 + }, + { + "epoch": 0.7951876804619826, + "grad_norm": 1.128730297088623, + "learning_rate": 0.0001316886420283383, + "loss": 1.1104, + "step": 20655 + }, + { + "epoch": 0.7953801732435034, + "grad_norm": 0.9813867807388306, + "learning_rate": 0.0001316599594177236, + "loss": 1.1989, + "step": 20660 + }, + { + "epoch": 0.7955726660250241, + "grad_norm": 1.1886361837387085, + "learning_rate": 0.00013163127391201536, + "loss": 1.1553, + "step": 20665 + }, + { + "epoch": 0.7957651588065447, + "grad_norm": 1.1876790523529053, + "learning_rate": 0.00013160258551383664, + "loss": 0.9751, + "step": 20670 + }, + { + "epoch": 0.7959576515880654, + "grad_norm": 0.8354418873786926, + "learning_rate": 0.00013157389422581087, + "loss": 1.1553, + "step": 20675 + }, + { + "epoch": 0.7961501443695861, + "grad_norm": 1.0195808410644531, + "learning_rate": 0.00013154520005056157, + "loss": 1.173, + "step": 20680 + }, + { + "epoch": 0.7963426371511069, + "grad_norm": 1.6569643020629883, + "learning_rate": 0.0001315165029907127, + "loss": 1.0836, + "step": 20685 + }, + { + "epoch": 0.7965351299326275, + "grad_norm": 1.1490013599395752, + "learning_rate": 0.00013148780304888846, + "loss": 1.1071, + "step": 20690 + }, + { + "epoch": 0.7967276227141482, + "grad_norm": 1.1535710096359253, + "learning_rate": 0.0001314591002277132, + "loss": 0.9335, + "step": 20695 + }, + { + "epoch": 0.7969201154956689, + "grad_norm": 1.2786650657653809, + "learning_rate": 0.00013143039452981158, + "loss": 1.0348, + "step": 20700 + }, + { + "epoch": 0.7971126082771897, + "grad_norm": 1.0382118225097656, + "learning_rate": 0.00013140168595780863, + "loss": 1.0676, + "step": 20705 + }, + { + "epoch": 0.7973051010587103, + "grad_norm": 1.2393159866333008, + "learning_rate": 0.00013137297451432952, + "loss": 1.0786, + "step": 20710 + }, + { + "epoch": 0.797497593840231, + "grad_norm": 1.04021155834198, + "learning_rate": 0.00013134426020199968, + "loss": 1.1149, + "step": 20715 + }, + { + "epoch": 0.7976900866217517, + "grad_norm": 0.871838390827179, + "learning_rate": 0.0001313155430234449, + "loss": 1.1709, + "step": 20720 + }, + { + "epoch": 0.7978825794032723, + "grad_norm": 1.0589998960494995, + "learning_rate": 0.00013128682298129118, + "loss": 1.0185, + "step": 20725 + }, + { + "epoch": 0.7980750721847931, + "grad_norm": 1.4314393997192383, + "learning_rate": 0.0001312581000781647, + "loss": 1.0613, + "step": 20730 + }, + { + "epoch": 0.7982675649663138, + "grad_norm": 1.0706419944763184, + "learning_rate": 0.00013122937431669208, + "loss": 1.3543, + "step": 20735 + }, + { + "epoch": 0.7984600577478345, + "grad_norm": 1.705357313156128, + "learning_rate": 0.00013120064569950004, + "loss": 1.0586, + "step": 20740 + }, + { + "epoch": 0.7986525505293551, + "grad_norm": 1.2182518243789673, + "learning_rate": 0.00013117191422921566, + "loss": 1.1202, + "step": 20745 + }, + { + "epoch": 0.7988450433108758, + "grad_norm": 1.1450048685073853, + "learning_rate": 0.00013114317990846618, + "loss": 1.2812, + "step": 20750 + }, + { + "epoch": 0.7990375360923966, + "grad_norm": 1.8270835876464844, + "learning_rate": 0.00013111444273987923, + "loss": 1.1169, + "step": 20755 + }, + { + "epoch": 0.7992300288739173, + "grad_norm": 2.412691593170166, + "learning_rate": 0.0001310857027260826, + "loss": 1.2125, + "step": 20760 + }, + { + "epoch": 0.7994225216554379, + "grad_norm": 1.1688083410263062, + "learning_rate": 0.00013105695986970434, + "loss": 1.1396, + "step": 20765 + }, + { + "epoch": 0.7996150144369586, + "grad_norm": 1.3766731023788452, + "learning_rate": 0.00013102821417337288, + "loss": 1.0521, + "step": 20770 + }, + { + "epoch": 0.7998075072184793, + "grad_norm": 1.4227287769317627, + "learning_rate": 0.00013099946563971678, + "loss": 1.2142, + "step": 20775 + }, + { + "epoch": 0.8, + "grad_norm": 1.2115936279296875, + "learning_rate": 0.00013097071427136485, + "loss": 1.1114, + "step": 20780 + }, + { + "epoch": 0.8001924927815207, + "grad_norm": 1.5830806493759155, + "learning_rate": 0.00013094196007094633, + "loss": 1.015, + "step": 20785 + }, + { + "epoch": 0.8003849855630414, + "grad_norm": 1.4669450521469116, + "learning_rate": 0.0001309132030410905, + "loss": 1.1081, + "step": 20790 + }, + { + "epoch": 0.800577478344562, + "grad_norm": 1.534693956375122, + "learning_rate": 0.00013088444318442704, + "loss": 1.1994, + "step": 20795 + }, + { + "epoch": 0.8007699711260827, + "grad_norm": 1.6019155979156494, + "learning_rate": 0.0001308556805035858, + "loss": 1.1598, + "step": 20800 + }, + { + "epoch": 0.8009624639076035, + "grad_norm": 2.0105199813842773, + "learning_rate": 0.00013082691500119703, + "loss": 1.0093, + "step": 20805 + }, + { + "epoch": 0.8011549566891242, + "grad_norm": 0.9521466493606567, + "learning_rate": 0.00013079814667989108, + "loss": 0.9551, + "step": 20810 + }, + { + "epoch": 0.8013474494706448, + "grad_norm": 1.155561089515686, + "learning_rate": 0.00013076937554229866, + "loss": 1.012, + "step": 20815 + }, + { + "epoch": 0.8015399422521655, + "grad_norm": 2.6006276607513428, + "learning_rate": 0.00013074060159105061, + "loss": 1.2012, + "step": 20820 + }, + { + "epoch": 0.8017324350336862, + "grad_norm": 1.318688154220581, + "learning_rate": 0.0001307118248287782, + "loss": 1.0862, + "step": 20825 + }, + { + "epoch": 0.801924927815207, + "grad_norm": 1.0366714000701904, + "learning_rate": 0.0001306830452581129, + "loss": 0.8681, + "step": 20830 + }, + { + "epoch": 0.8021174205967276, + "grad_norm": 1.463513731956482, + "learning_rate": 0.00013065426288168635, + "loss": 1.0677, + "step": 20835 + }, + { + "epoch": 0.8023099133782483, + "grad_norm": 1.2675501108169556, + "learning_rate": 0.00013062547770213053, + "loss": 1.0329, + "step": 20840 + }, + { + "epoch": 0.802502406159769, + "grad_norm": 1.9978399276733398, + "learning_rate": 0.00013059668972207762, + "loss": 1.106, + "step": 20845 + }, + { + "epoch": 0.8026948989412896, + "grad_norm": 0.831674337387085, + "learning_rate": 0.00013056789894416013, + "loss": 1.1393, + "step": 20850 + }, + { + "epoch": 0.8028873917228104, + "grad_norm": 1.492956280708313, + "learning_rate": 0.00013053910537101076, + "loss": 1.1106, + "step": 20855 + }, + { + "epoch": 0.8030798845043311, + "grad_norm": 1.5051852464675903, + "learning_rate": 0.00013051030900526256, + "loss": 1.1677, + "step": 20860 + }, + { + "epoch": 0.8032723772858518, + "grad_norm": 1.2556471824645996, + "learning_rate": 0.00013048150984954867, + "loss": 1.006, + "step": 20865 + }, + { + "epoch": 0.8034648700673724, + "grad_norm": 1.4677566289901733, + "learning_rate": 0.00013045270790650263, + "loss": 1.2083, + "step": 20870 + }, + { + "epoch": 0.8036573628488932, + "grad_norm": 1.2184150218963623, + "learning_rate": 0.00013042390317875822, + "loss": 1.0107, + "step": 20875 + }, + { + "epoch": 0.8038498556304139, + "grad_norm": 1.0294650793075562, + "learning_rate": 0.00013039509566894935, + "loss": 1.1263, + "step": 20880 + }, + { + "epoch": 0.8040423484119346, + "grad_norm": 1.0486441850662231, + "learning_rate": 0.00013036628537971037, + "loss": 1.2297, + "step": 20885 + }, + { + "epoch": 0.8042348411934552, + "grad_norm": 1.209472894668579, + "learning_rate": 0.00013033747231367573, + "loss": 1.0295, + "step": 20890 + }, + { + "epoch": 0.8044273339749759, + "grad_norm": 1.3195816278457642, + "learning_rate": 0.00013030865647348022, + "loss": 1.1223, + "step": 20895 + }, + { + "epoch": 0.8046198267564967, + "grad_norm": 1.127369999885559, + "learning_rate": 0.00013027983786175883, + "loss": 1.1293, + "step": 20900 + }, + { + "epoch": 0.8048123195380174, + "grad_norm": 1.7267670631408691, + "learning_rate": 0.00013025101648114692, + "loss": 1.1115, + "step": 20905 + }, + { + "epoch": 0.805004812319538, + "grad_norm": 1.1385564804077148, + "learning_rate": 0.0001302221923342799, + "loss": 1.2283, + "step": 20910 + }, + { + "epoch": 0.8051973051010587, + "grad_norm": 2.3153762817382812, + "learning_rate": 0.0001301933654237936, + "loss": 1.0369, + "step": 20915 + }, + { + "epoch": 0.8053897978825794, + "grad_norm": 2.7795004844665527, + "learning_rate": 0.000130164535752324, + "loss": 1.1383, + "step": 20920 + }, + { + "epoch": 0.8055822906641001, + "grad_norm": 1.4328944683074951, + "learning_rate": 0.00013013570332250752, + "loss": 1.0496, + "step": 20925 + }, + { + "epoch": 0.8057747834456208, + "grad_norm": 1.35623037815094, + "learning_rate": 0.00013010686813698055, + "loss": 1.034, + "step": 20930 + }, + { + "epoch": 0.8059672762271415, + "grad_norm": 2.241270065307617, + "learning_rate": 0.0001300780301983799, + "loss": 1.1334, + "step": 20935 + }, + { + "epoch": 0.8061597690086622, + "grad_norm": 1.0069741010665894, + "learning_rate": 0.00013004918950934268, + "loss": 1.0983, + "step": 20940 + }, + { + "epoch": 0.8063522617901828, + "grad_norm": 2.6190080642700195, + "learning_rate": 0.0001300203460725061, + "loss": 1.0864, + "step": 20945 + }, + { + "epoch": 0.8065447545717036, + "grad_norm": 1.0321654081344604, + "learning_rate": 0.00012999149989050777, + "loss": 1.1463, + "step": 20950 + }, + { + "epoch": 0.8067372473532243, + "grad_norm": 1.1893829107284546, + "learning_rate": 0.00012996265096598545, + "loss": 1.1653, + "step": 20955 + }, + { + "epoch": 0.8069297401347449, + "grad_norm": 1.0437567234039307, + "learning_rate": 0.00012993379930157717, + "loss": 1.0993, + "step": 20960 + }, + { + "epoch": 0.8071222329162656, + "grad_norm": 0.9310737252235413, + "learning_rate": 0.00012990494489992124, + "loss": 1.0351, + "step": 20965 + }, + { + "epoch": 0.8073147256977863, + "grad_norm": 1.0145392417907715, + "learning_rate": 0.00012987608776365622, + "loss": 1.0316, + "step": 20970 + }, + { + "epoch": 0.8075072184793071, + "grad_norm": 0.8872208595275879, + "learning_rate": 0.00012984722789542084, + "loss": 0.9912, + "step": 20975 + }, + { + "epoch": 0.8076997112608277, + "grad_norm": 1.4148224592208862, + "learning_rate": 0.00012981836529785422, + "loss": 1.1786, + "step": 20980 + }, + { + "epoch": 0.8078922040423484, + "grad_norm": 0.9235820770263672, + "learning_rate": 0.00012978949997359565, + "loss": 1.1649, + "step": 20985 + }, + { + "epoch": 0.8080846968238691, + "grad_norm": 1.932556390762329, + "learning_rate": 0.0001297606319252846, + "loss": 1.1976, + "step": 20990 + }, + { + "epoch": 0.8082771896053897, + "grad_norm": 1.1439324617385864, + "learning_rate": 0.00012973176115556092, + "loss": 0.9586, + "step": 20995 + }, + { + "epoch": 0.8084696823869105, + "grad_norm": 1.4145176410675049, + "learning_rate": 0.00012970288766706465, + "loss": 1.1235, + "step": 21000 + }, + { + "epoch": 0.8086621751684312, + "grad_norm": 1.2468702793121338, + "learning_rate": 0.00012967401146243604, + "loss": 1.2867, + "step": 21005 + }, + { + "epoch": 0.8088546679499519, + "grad_norm": 1.230528473854065, + "learning_rate": 0.00012964513254431567, + "loss": 1.1464, + "step": 21010 + }, + { + "epoch": 0.8090471607314725, + "grad_norm": 1.3284807205200195, + "learning_rate": 0.00012961625091534432, + "loss": 1.2665, + "step": 21015 + }, + { + "epoch": 0.8092396535129933, + "grad_norm": 1.5851751565933228, + "learning_rate": 0.000129587366578163, + "loss": 1.1016, + "step": 21020 + }, + { + "epoch": 0.809432146294514, + "grad_norm": 1.7818470001220703, + "learning_rate": 0.00012955847953541303, + "loss": 1.1393, + "step": 21025 + }, + { + "epoch": 0.8096246390760347, + "grad_norm": 2.6962087154388428, + "learning_rate": 0.00012952958978973585, + "loss": 1.1494, + "step": 21030 + }, + { + "epoch": 0.8098171318575553, + "grad_norm": 1.0526777505874634, + "learning_rate": 0.00012950069734377336, + "loss": 1.1411, + "step": 21035 + }, + { + "epoch": 0.810009624639076, + "grad_norm": 0.8282346129417419, + "learning_rate": 0.0001294718022001675, + "loss": 1.2306, + "step": 21040 + }, + { + "epoch": 0.8102021174205968, + "grad_norm": 0.9357605576515198, + "learning_rate": 0.00012944290436156056, + "loss": 1.0414, + "step": 21045 + }, + { + "epoch": 0.8103946102021174, + "grad_norm": 1.5050805807113647, + "learning_rate": 0.00012941400383059508, + "loss": 1.0919, + "step": 21050 + }, + { + "epoch": 0.8105871029836381, + "grad_norm": 1.8481179475784302, + "learning_rate": 0.00012938510060991377, + "loss": 1.2084, + "step": 21055 + }, + { + "epoch": 0.8107795957651588, + "grad_norm": 1.1937155723571777, + "learning_rate": 0.00012935619470215968, + "loss": 1.2939, + "step": 21060 + }, + { + "epoch": 0.8109720885466795, + "grad_norm": 1.077461838722229, + "learning_rate": 0.00012932728610997606, + "loss": 1.1172, + "step": 21065 + }, + { + "epoch": 0.8111645813282002, + "grad_norm": 1.2674756050109863, + "learning_rate": 0.00012929837483600638, + "loss": 1.0764, + "step": 21070 + }, + { + "epoch": 0.8113570741097209, + "grad_norm": 1.3384912014007568, + "learning_rate": 0.00012926946088289443, + "loss": 1.0626, + "step": 21075 + }, + { + "epoch": 0.8115495668912416, + "grad_norm": 1.9473553895950317, + "learning_rate": 0.00012924054425328415, + "loss": 0.9904, + "step": 21080 + }, + { + "epoch": 0.8117420596727623, + "grad_norm": 0.9565866589546204, + "learning_rate": 0.00012921162494981982, + "loss": 1.015, + "step": 21085 + }, + { + "epoch": 0.8119345524542829, + "grad_norm": 2.1136703491210938, + "learning_rate": 0.00012918270297514588, + "loss": 1.1105, + "step": 21090 + }, + { + "epoch": 0.8121270452358037, + "grad_norm": 1.2925735712051392, + "learning_rate": 0.0001291537783319071, + "loss": 1.1225, + "step": 21095 + }, + { + "epoch": 0.8123195380173244, + "grad_norm": 1.0134514570236206, + "learning_rate": 0.00012912485102274836, + "loss": 1.1333, + "step": 21100 + }, + { + "epoch": 0.812512030798845, + "grad_norm": 1.2157081365585327, + "learning_rate": 0.00012909592105031495, + "loss": 1.2096, + "step": 21105 + }, + { + "epoch": 0.8127045235803657, + "grad_norm": 1.0096231698989868, + "learning_rate": 0.00012906698841725234, + "loss": 1.1434, + "step": 21110 + }, + { + "epoch": 0.8128970163618864, + "grad_norm": 1.3600200414657593, + "learning_rate": 0.00012903805312620617, + "loss": 1.0407, + "step": 21115 + }, + { + "epoch": 0.8130895091434072, + "grad_norm": 1.7842247486114502, + "learning_rate": 0.00012900911517982238, + "loss": 1.0577, + "step": 21120 + }, + { + "epoch": 0.8132820019249278, + "grad_norm": 1.8753024339675903, + "learning_rate": 0.0001289801745807472, + "loss": 1.1792, + "step": 21125 + }, + { + "epoch": 0.8134744947064485, + "grad_norm": 1.000182867050171, + "learning_rate": 0.00012895123133162704, + "loss": 1.0124, + "step": 21130 + }, + { + "epoch": 0.8136669874879692, + "grad_norm": 0.9506773948669434, + "learning_rate": 0.00012892228543510856, + "loss": 1.2867, + "step": 21135 + }, + { + "epoch": 0.8138594802694898, + "grad_norm": 1.084410548210144, + "learning_rate": 0.00012889333689383863, + "loss": 1.2061, + "step": 21140 + }, + { + "epoch": 0.8140519730510106, + "grad_norm": 2.0268046855926514, + "learning_rate": 0.00012886438571046448, + "loss": 0.9941, + "step": 21145 + }, + { + "epoch": 0.8142444658325313, + "grad_norm": 1.1148359775543213, + "learning_rate": 0.00012883543188763345, + "loss": 1.3397, + "step": 21150 + }, + { + "epoch": 0.814436958614052, + "grad_norm": 1.990718126296997, + "learning_rate": 0.00012880647542799317, + "loss": 1.1802, + "step": 21155 + }, + { + "epoch": 0.8146294513955726, + "grad_norm": 0.9613543152809143, + "learning_rate": 0.00012877751633419158, + "loss": 1.0403, + "step": 21160 + }, + { + "epoch": 0.8148219441770933, + "grad_norm": 1.0529913902282715, + "learning_rate": 0.00012874855460887674, + "loss": 1.0797, + "step": 21165 + }, + { + "epoch": 0.8150144369586141, + "grad_norm": 1.083970069885254, + "learning_rate": 0.000128719590254697, + "loss": 1.0772, + "step": 21170 + }, + { + "epoch": 0.8152069297401348, + "grad_norm": 1.39369535446167, + "learning_rate": 0.000128690623274301, + "loss": 1.1004, + "step": 21175 + }, + { + "epoch": 0.8153994225216554, + "grad_norm": 1.5318938493728638, + "learning_rate": 0.00012866165367033755, + "loss": 1.0723, + "step": 21180 + }, + { + "epoch": 0.8155919153031761, + "grad_norm": 2.069422483444214, + "learning_rate": 0.00012863268144545575, + "loss": 1.1655, + "step": 21185 + }, + { + "epoch": 0.8157844080846969, + "grad_norm": 2.672980785369873, + "learning_rate": 0.0001286037066023049, + "loss": 1.1651, + "step": 21190 + }, + { + "epoch": 0.8159769008662175, + "grad_norm": 1.552042007446289, + "learning_rate": 0.00012857472914353456, + "loss": 1.1443, + "step": 21195 + }, + { + "epoch": 0.8161693936477382, + "grad_norm": 1.6215529441833496, + "learning_rate": 0.00012854574907179454, + "loss": 1.0127, + "step": 21200 + }, + { + "epoch": 0.8163618864292589, + "grad_norm": 1.9641019105911255, + "learning_rate": 0.00012851676638973486, + "loss": 1.1913, + "step": 21205 + }, + { + "epoch": 0.8165543792107796, + "grad_norm": 1.0362906455993652, + "learning_rate": 0.00012848778110000582, + "loss": 1.0131, + "step": 21210 + }, + { + "epoch": 0.8167468719923003, + "grad_norm": 1.3487643003463745, + "learning_rate": 0.00012845879320525787, + "loss": 1.3209, + "step": 21215 + }, + { + "epoch": 0.816939364773821, + "grad_norm": 1.2834949493408203, + "learning_rate": 0.00012842980270814182, + "loss": 0.9396, + "step": 21220 + }, + { + "epoch": 0.8171318575553417, + "grad_norm": 1.530375599861145, + "learning_rate": 0.00012840080961130864, + "loss": 0.9925, + "step": 21225 + }, + { + "epoch": 0.8173243503368623, + "grad_norm": 1.6313562393188477, + "learning_rate": 0.00012837181391740958, + "loss": 1.1684, + "step": 21230 + }, + { + "epoch": 0.817516843118383, + "grad_norm": 2.681976556777954, + "learning_rate": 0.0001283428156290961, + "loss": 1.0749, + "step": 21235 + }, + { + "epoch": 0.8177093358999038, + "grad_norm": 1.3075588941574097, + "learning_rate": 0.0001283138147490198, + "loss": 1.2704, + "step": 21240 + }, + { + "epoch": 0.8179018286814245, + "grad_norm": 1.4993854761123657, + "learning_rate": 0.0001282848112798328, + "loss": 1.1457, + "step": 21245 + }, + { + "epoch": 0.8180943214629451, + "grad_norm": 0.991303563117981, + "learning_rate": 0.00012825580522418716, + "loss": 1.0834, + "step": 21250 + }, + { + "epoch": 0.8182868142444658, + "grad_norm": 1.5046969652175903, + "learning_rate": 0.0001282267965847353, + "loss": 1.1156, + "step": 21255 + }, + { + "epoch": 0.8184793070259865, + "grad_norm": 2.711761713027954, + "learning_rate": 0.00012819778536412983, + "loss": 1.0736, + "step": 21260 + }, + { + "epoch": 0.8186717998075073, + "grad_norm": 2.3247745037078857, + "learning_rate": 0.00012816877156502374, + "loss": 1.0703, + "step": 21265 + }, + { + "epoch": 0.8188642925890279, + "grad_norm": 1.8839764595031738, + "learning_rate": 0.00012813975519007005, + "loss": 1.2301, + "step": 21270 + }, + { + "epoch": 0.8190567853705486, + "grad_norm": 1.3224703073501587, + "learning_rate": 0.00012811073624192218, + "loss": 0.942, + "step": 21275 + }, + { + "epoch": 0.8192492781520693, + "grad_norm": 1.2975091934204102, + "learning_rate": 0.00012808171472323366, + "loss": 1.0815, + "step": 21280 + }, + { + "epoch": 0.8194417709335899, + "grad_norm": 1.1744860410690308, + "learning_rate": 0.0001280526906366584, + "loss": 1.2635, + "step": 21285 + }, + { + "epoch": 0.8196342637151107, + "grad_norm": 1.3168641328811646, + "learning_rate": 0.0001280236639848504, + "loss": 1.1034, + "step": 21290 + }, + { + "epoch": 0.8198267564966314, + "grad_norm": 1.7554851770401, + "learning_rate": 0.00012799463477046396, + "loss": 1.215, + "step": 21295 + }, + { + "epoch": 0.8200192492781521, + "grad_norm": 1.458289384841919, + "learning_rate": 0.00012796560299615365, + "loss": 1.0511, + "step": 21300 + }, + { + "epoch": 0.8202117420596727, + "grad_norm": 1.4137495756149292, + "learning_rate": 0.00012793656866457416, + "loss": 0.9379, + "step": 21305 + }, + { + "epoch": 0.8204042348411934, + "grad_norm": 1.4759835004806519, + "learning_rate": 0.00012790753177838052, + "loss": 1.1523, + "step": 21310 + }, + { + "epoch": 0.8205967276227142, + "grad_norm": 1.7477853298187256, + "learning_rate": 0.00012787849234022802, + "loss": 1.1982, + "step": 21315 + }, + { + "epoch": 0.8207892204042349, + "grad_norm": 1.6621637344360352, + "learning_rate": 0.00012784945035277206, + "loss": 1.1225, + "step": 21320 + }, + { + "epoch": 0.8209817131857555, + "grad_norm": 1.2455633878707886, + "learning_rate": 0.0001278204058186683, + "loss": 1.1079, + "step": 21325 + }, + { + "epoch": 0.8211742059672762, + "grad_norm": 2.1116111278533936, + "learning_rate": 0.0001277913587405728, + "loss": 1.16, + "step": 21330 + }, + { + "epoch": 0.821366698748797, + "grad_norm": 1.7187306880950928, + "learning_rate": 0.00012776230912114157, + "loss": 1.0653, + "step": 21335 + }, + { + "epoch": 0.8215591915303176, + "grad_norm": 1.4385313987731934, + "learning_rate": 0.00012773325696303114, + "loss": 1.1132, + "step": 21340 + }, + { + "epoch": 0.8217516843118383, + "grad_norm": 1.1039965152740479, + "learning_rate": 0.00012770420226889802, + "loss": 1.072, + "step": 21345 + }, + { + "epoch": 0.821944177093359, + "grad_norm": 1.4439443349838257, + "learning_rate": 0.0001276751450413992, + "loss": 1.2279, + "step": 21350 + }, + { + "epoch": 0.8221366698748797, + "grad_norm": 1.0179024934768677, + "learning_rate": 0.00012764608528319165, + "loss": 1.0026, + "step": 21355 + }, + { + "epoch": 0.8223291626564004, + "grad_norm": 1.2904207706451416, + "learning_rate": 0.00012761702299693274, + "loss": 1.0848, + "step": 21360 + }, + { + "epoch": 0.8225216554379211, + "grad_norm": 0.8580986261367798, + "learning_rate": 0.00012758795818528003, + "loss": 1.0943, + "step": 21365 + }, + { + "epoch": 0.8227141482194418, + "grad_norm": 1.2557419538497925, + "learning_rate": 0.00012755889085089127, + "loss": 1.1489, + "step": 21370 + }, + { + "epoch": 0.8229066410009624, + "grad_norm": 1.0299915075302124, + "learning_rate": 0.00012752982099642455, + "loss": 1.0721, + "step": 21375 + }, + { + "epoch": 0.8230991337824831, + "grad_norm": 0.9946519136428833, + "learning_rate": 0.00012750074862453802, + "loss": 1.096, + "step": 21380 + }, + { + "epoch": 0.8232916265640039, + "grad_norm": 1.6168420314788818, + "learning_rate": 0.00012747167373789023, + "loss": 1.275, + "step": 21385 + }, + { + "epoch": 0.8234841193455246, + "grad_norm": 1.7367397546768188, + "learning_rate": 0.00012744259633913987, + "loss": 1.1666, + "step": 21390 + }, + { + "epoch": 0.8236766121270452, + "grad_norm": 1.6572107076644897, + "learning_rate": 0.00012741351643094583, + "loss": 1.0053, + "step": 21395 + }, + { + "epoch": 0.8238691049085659, + "grad_norm": 0.7522270083427429, + "learning_rate": 0.00012738443401596734, + "loss": 1.0007, + "step": 21400 + }, + { + "epoch": 0.8240615976900866, + "grad_norm": 1.8706845045089722, + "learning_rate": 0.00012735534909686375, + "loss": 1.2394, + "step": 21405 + }, + { + "epoch": 0.8242540904716074, + "grad_norm": 2.571775197982788, + "learning_rate": 0.0001273262616762947, + "loss": 1.4034, + "step": 21410 + }, + { + "epoch": 0.824446583253128, + "grad_norm": 1.1532241106033325, + "learning_rate": 0.00012729717175692006, + "loss": 1.095, + "step": 21415 + }, + { + "epoch": 0.8246390760346487, + "grad_norm": 1.6400253772735596, + "learning_rate": 0.00012726807934139987, + "loss": 1.1114, + "step": 21420 + }, + { + "epoch": 0.8248315688161694, + "grad_norm": 1.572492003440857, + "learning_rate": 0.00012723898443239447, + "loss": 1.0996, + "step": 21425 + }, + { + "epoch": 0.82502406159769, + "grad_norm": 1.266436219215393, + "learning_rate": 0.00012720988703256438, + "loss": 1.1825, + "step": 21430 + }, + { + "epoch": 0.8252165543792108, + "grad_norm": 1.0934292078018188, + "learning_rate": 0.00012718078714457035, + "loss": 1.1494, + "step": 21435 + }, + { + "epoch": 0.8254090471607315, + "grad_norm": 1.5081112384796143, + "learning_rate": 0.0001271516847710734, + "loss": 1.2056, + "step": 21440 + }, + { + "epoch": 0.8256015399422522, + "grad_norm": 0.9314339756965637, + "learning_rate": 0.00012712257991473477, + "loss": 1.105, + "step": 21445 + }, + { + "epoch": 0.8257940327237728, + "grad_norm": 1.483852744102478, + "learning_rate": 0.00012709347257821587, + "loss": 1.2253, + "step": 21450 + }, + { + "epoch": 0.8259865255052935, + "grad_norm": 0.9682250022888184, + "learning_rate": 0.00012706436276417837, + "loss": 1.0754, + "step": 21455 + }, + { + "epoch": 0.8261790182868143, + "grad_norm": 1.4954203367233276, + "learning_rate": 0.00012703525047528418, + "loss": 1.1777, + "step": 21460 + }, + { + "epoch": 0.826371511068335, + "grad_norm": 1.303179383277893, + "learning_rate": 0.00012700613571419543, + "loss": 1.1226, + "step": 21465 + }, + { + "epoch": 0.8265640038498556, + "grad_norm": 1.0216439962387085, + "learning_rate": 0.0001269770184835745, + "loss": 1.0702, + "step": 21470 + }, + { + "epoch": 0.8267564966313763, + "grad_norm": 1.5021692514419556, + "learning_rate": 0.00012694789878608386, + "loss": 1.1696, + "step": 21475 + }, + { + "epoch": 0.826948989412897, + "grad_norm": 1.1756644248962402, + "learning_rate": 0.00012691877662438642, + "loss": 1.1441, + "step": 21480 + }, + { + "epoch": 0.8271414821944177, + "grad_norm": 1.7375133037567139, + "learning_rate": 0.0001268896520011452, + "loss": 1.0854, + "step": 21485 + }, + { + "epoch": 0.8273339749759384, + "grad_norm": 0.894169270992279, + "learning_rate": 0.00012686052491902344, + "loss": 1.0767, + "step": 21490 + }, + { + "epoch": 0.8275264677574591, + "grad_norm": 2.3879435062408447, + "learning_rate": 0.00012683139538068457, + "loss": 1.0683, + "step": 21495 + }, + { + "epoch": 0.8277189605389798, + "grad_norm": 0.7615637183189392, + "learning_rate": 0.00012680226338879238, + "loss": 1.0482, + "step": 21500 + }, + { + "epoch": 0.8279114533205005, + "grad_norm": 1.056795597076416, + "learning_rate": 0.0001267731289460107, + "loss": 1.0303, + "step": 21505 + }, + { + "epoch": 0.8281039461020212, + "grad_norm": 1.099278211593628, + "learning_rate": 0.00012674399205500375, + "loss": 1.1923, + "step": 21510 + }, + { + "epoch": 0.8282964388835419, + "grad_norm": 1.1515510082244873, + "learning_rate": 0.0001267148527184359, + "loss": 1.1697, + "step": 21515 + }, + { + "epoch": 0.8284889316650625, + "grad_norm": 1.8237887620925903, + "learning_rate": 0.00012668571093897175, + "loss": 1.0131, + "step": 21520 + }, + { + "epoch": 0.8286814244465832, + "grad_norm": 1.1378989219665527, + "learning_rate": 0.0001266565667192761, + "loss": 1.1541, + "step": 21525 + }, + { + "epoch": 0.828873917228104, + "grad_norm": 1.1044882535934448, + "learning_rate": 0.000126627420062014, + "loss": 0.9193, + "step": 21530 + }, + { + "epoch": 0.8290664100096247, + "grad_norm": 1.0803862810134888, + "learning_rate": 0.00012659827096985073, + "loss": 1.0778, + "step": 21535 + }, + { + "epoch": 0.8292589027911453, + "grad_norm": 1.004387617111206, + "learning_rate": 0.00012656911944545178, + "loss": 1.0134, + "step": 21540 + }, + { + "epoch": 0.829451395572666, + "grad_norm": 1.0931042432785034, + "learning_rate": 0.00012653996549148285, + "loss": 1.0738, + "step": 21545 + }, + { + "epoch": 0.8296438883541867, + "grad_norm": 1.053134560585022, + "learning_rate": 0.00012651080911060993, + "loss": 0.9885, + "step": 21550 + }, + { + "epoch": 0.8298363811357075, + "grad_norm": 1.2585800886154175, + "learning_rate": 0.00012648165030549908, + "loss": 1.0844, + "step": 21555 + }, + { + "epoch": 0.8300288739172281, + "grad_norm": 1.301685094833374, + "learning_rate": 0.00012645248907881677, + "loss": 1.1168, + "step": 21560 + }, + { + "epoch": 0.8302213666987488, + "grad_norm": 2.356616973876953, + "learning_rate": 0.00012642332543322954, + "loss": 1.3397, + "step": 21565 + }, + { + "epoch": 0.8304138594802695, + "grad_norm": 1.0727852582931519, + "learning_rate": 0.00012639415937140428, + "loss": 1.1311, + "step": 21570 + }, + { + "epoch": 0.8306063522617901, + "grad_norm": 1.3422895669937134, + "learning_rate": 0.00012636499089600794, + "loss": 1.2474, + "step": 21575 + }, + { + "epoch": 0.8307988450433109, + "grad_norm": 1.1913752555847168, + "learning_rate": 0.00012633582000970788, + "loss": 1.1439, + "step": 21580 + }, + { + "epoch": 0.8309913378248316, + "grad_norm": 1.068538784980774, + "learning_rate": 0.0001263066467151715, + "loss": 1.0762, + "step": 21585 + }, + { + "epoch": 0.8311838306063523, + "grad_norm": 1.4798871278762817, + "learning_rate": 0.00012627747101506655, + "loss": 1.1836, + "step": 21590 + }, + { + "epoch": 0.8313763233878729, + "grad_norm": 0.9586134552955627, + "learning_rate": 0.00012624829291206098, + "loss": 1.154, + "step": 21595 + }, + { + "epoch": 0.8315688161693936, + "grad_norm": 1.3402488231658936, + "learning_rate": 0.00012621911240882287, + "loss": 1.0223, + "step": 21600 + }, + { + "epoch": 0.8317613089509144, + "grad_norm": 0.896938145160675, + "learning_rate": 0.00012618992950802062, + "loss": 1.0489, + "step": 21605 + }, + { + "epoch": 0.831953801732435, + "grad_norm": 1.466774821281433, + "learning_rate": 0.00012616074421232281, + "loss": 1.0963, + "step": 21610 + }, + { + "epoch": 0.8321462945139557, + "grad_norm": 1.714902639389038, + "learning_rate": 0.00012613155652439826, + "loss": 1.1946, + "step": 21615 + }, + { + "epoch": 0.8323387872954764, + "grad_norm": 1.3679184913635254, + "learning_rate": 0.00012610236644691592, + "loss": 1.1356, + "step": 21620 + }, + { + "epoch": 0.8325312800769971, + "grad_norm": 2.8363022804260254, + "learning_rate": 0.00012607317398254515, + "loss": 1.2324, + "step": 21625 + }, + { + "epoch": 0.8327237728585178, + "grad_norm": 1.7993220090866089, + "learning_rate": 0.00012604397913395528, + "loss": 1.0687, + "step": 21630 + }, + { + "epoch": 0.8329162656400385, + "grad_norm": 1.42396879196167, + "learning_rate": 0.00012601478190381608, + "loss": 1.1426, + "step": 21635 + }, + { + "epoch": 0.8331087584215592, + "grad_norm": 0.9980899691581726, + "learning_rate": 0.0001259855822947974, + "loss": 1.0239, + "step": 21640 + }, + { + "epoch": 0.8333012512030799, + "grad_norm": 1.641063928604126, + "learning_rate": 0.00012595638030956936, + "loss": 1.1006, + "step": 21645 + }, + { + "epoch": 0.8334937439846006, + "grad_norm": 0.7932726740837097, + "learning_rate": 0.00012592717595080226, + "loss": 0.9438, + "step": 21650 + }, + { + "epoch": 0.8336862367661213, + "grad_norm": 0.988121747970581, + "learning_rate": 0.0001258979692211667, + "loss": 1.0918, + "step": 21655 + }, + { + "epoch": 0.833878729547642, + "grad_norm": 2.024733066558838, + "learning_rate": 0.0001258687601233334, + "loss": 1.1982, + "step": 21660 + }, + { + "epoch": 0.8340712223291626, + "grad_norm": 1.4302330017089844, + "learning_rate": 0.00012583954865997337, + "loss": 1.0746, + "step": 21665 + }, + { + "epoch": 0.8342637151106833, + "grad_norm": 1.084583044052124, + "learning_rate": 0.00012581033483375777, + "loss": 1.173, + "step": 21670 + }, + { + "epoch": 0.8344562078922041, + "grad_norm": 1.3885962963104248, + "learning_rate": 0.00012578111864735802, + "loss": 1.0817, + "step": 21675 + }, + { + "epoch": 0.8346487006737248, + "grad_norm": 1.2596712112426758, + "learning_rate": 0.00012575190010344578, + "loss": 1.0657, + "step": 21680 + }, + { + "epoch": 0.8348411934552454, + "grad_norm": 1.376128911972046, + "learning_rate": 0.00012572267920469283, + "loss": 1.0961, + "step": 21685 + }, + { + "epoch": 0.8350336862367661, + "grad_norm": 1.203218936920166, + "learning_rate": 0.00012569345595377128, + "loss": 1.2123, + "step": 21690 + }, + { + "epoch": 0.8352261790182868, + "grad_norm": 2.2045395374298096, + "learning_rate": 0.00012566423035335338, + "loss": 1.2024, + "step": 21695 + }, + { + "epoch": 0.8354186717998076, + "grad_norm": 2.305617094039917, + "learning_rate": 0.00012563500240611166, + "loss": 1.027, + "step": 21700 + }, + { + "epoch": 0.8356111645813282, + "grad_norm": 0.8593419194221497, + "learning_rate": 0.00012560577211471875, + "loss": 1.0436, + "step": 21705 + }, + { + "epoch": 0.8358036573628489, + "grad_norm": 1.3252314329147339, + "learning_rate": 0.00012557653948184761, + "loss": 0.9099, + "step": 21710 + }, + { + "epoch": 0.8359961501443696, + "grad_norm": 1.569014072418213, + "learning_rate": 0.0001255473045101714, + "loss": 1.1202, + "step": 21715 + }, + { + "epoch": 0.8361886429258902, + "grad_norm": 0.9022922515869141, + "learning_rate": 0.00012551806720236338, + "loss": 1.1666, + "step": 21720 + }, + { + "epoch": 0.836381135707411, + "grad_norm": 1.5258845090866089, + "learning_rate": 0.00012548882756109717, + "loss": 1.1072, + "step": 21725 + }, + { + "epoch": 0.8365736284889317, + "grad_norm": 1.4463955163955688, + "learning_rate": 0.0001254595855890465, + "loss": 1.0444, + "step": 21730 + }, + { + "epoch": 0.8367661212704524, + "grad_norm": 1.6336601972579956, + "learning_rate": 0.00012543034128888544, + "loss": 1.2973, + "step": 21735 + }, + { + "epoch": 0.836958614051973, + "grad_norm": 1.2576451301574707, + "learning_rate": 0.0001254010946632881, + "loss": 1.0968, + "step": 21740 + }, + { + "epoch": 0.8371511068334937, + "grad_norm": 1.2530792951583862, + "learning_rate": 0.0001253718457149289, + "loss": 0.9215, + "step": 21745 + }, + { + "epoch": 0.8373435996150145, + "grad_norm": 1.3370553255081177, + "learning_rate": 0.0001253425944464825, + "loss": 1.1147, + "step": 21750 + }, + { + "epoch": 0.8375360923965351, + "grad_norm": 1.5159647464752197, + "learning_rate": 0.00012531334086062374, + "loss": 1.1326, + "step": 21755 + }, + { + "epoch": 0.8377285851780558, + "grad_norm": 1.4643933773040771, + "learning_rate": 0.0001252840849600276, + "loss": 0.9886, + "step": 21760 + }, + { + "epoch": 0.8379210779595765, + "grad_norm": 1.5805963277816772, + "learning_rate": 0.00012525482674736942, + "loss": 1.1807, + "step": 21765 + }, + { + "epoch": 0.8381135707410972, + "grad_norm": 1.3167610168457031, + "learning_rate": 0.00012522556622532462, + "loss": 0.93, + "step": 21770 + }, + { + "epoch": 0.8383060635226179, + "grad_norm": 1.4402116537094116, + "learning_rate": 0.00012519630339656888, + "loss": 1.1223, + "step": 21775 + }, + { + "epoch": 0.8384985563041386, + "grad_norm": 1.5734583139419556, + "learning_rate": 0.00012516703826377814, + "loss": 1.1476, + "step": 21780 + }, + { + "epoch": 0.8386910490856593, + "grad_norm": 1.581139326095581, + "learning_rate": 0.00012513777082962842, + "loss": 1.3, + "step": 21785 + }, + { + "epoch": 0.83888354186718, + "grad_norm": 1.0609793663024902, + "learning_rate": 0.0001251085010967961, + "loss": 1.1791, + "step": 21790 + }, + { + "epoch": 0.8390760346487006, + "grad_norm": 1.130085825920105, + "learning_rate": 0.00012507922906795772, + "loss": 1.0816, + "step": 21795 + }, + { + "epoch": 0.8392685274302214, + "grad_norm": 2.4827651977539062, + "learning_rate": 0.00012504995474578993, + "loss": 1.1677, + "step": 21800 + }, + { + "epoch": 0.8394610202117421, + "grad_norm": 1.187456727027893, + "learning_rate": 0.00012502067813296972, + "loss": 0.9263, + "step": 21805 + }, + { + "epoch": 0.8396535129932627, + "grad_norm": 1.375260353088379, + "learning_rate": 0.00012499139923217425, + "loss": 1.1236, + "step": 21810 + }, + { + "epoch": 0.8398460057747834, + "grad_norm": 1.0707255601882935, + "learning_rate": 0.00012496211804608089, + "loss": 1.0672, + "step": 21815 + }, + { + "epoch": 0.8400384985563042, + "grad_norm": 1.7111786603927612, + "learning_rate": 0.00012493283457736716, + "loss": 1.0458, + "step": 21820 + }, + { + "epoch": 0.8402309913378249, + "grad_norm": 0.9973586201667786, + "learning_rate": 0.0001249035488287109, + "loss": 0.9639, + "step": 21825 + }, + { + "epoch": 0.8404234841193455, + "grad_norm": 1.5254673957824707, + "learning_rate": 0.00012487426080279006, + "loss": 1.2324, + "step": 21830 + }, + { + "epoch": 0.8406159769008662, + "grad_norm": 1.4964022636413574, + "learning_rate": 0.00012484497050228285, + "loss": 1.1236, + "step": 21835 + }, + { + "epoch": 0.8408084696823869, + "grad_norm": 1.157464623451233, + "learning_rate": 0.00012481567792986767, + "loss": 1.0155, + "step": 21840 + }, + { + "epoch": 0.8410009624639077, + "grad_norm": 1.8712735176086426, + "learning_rate": 0.0001247863830882231, + "loss": 1.0772, + "step": 21845 + }, + { + "epoch": 0.8411934552454283, + "grad_norm": 1.2394201755523682, + "learning_rate": 0.00012475708598002805, + "loss": 1.2253, + "step": 21850 + }, + { + "epoch": 0.841385948026949, + "grad_norm": 1.3483343124389648, + "learning_rate": 0.00012472778660796145, + "loss": 1.0598, + "step": 21855 + }, + { + "epoch": 0.8415784408084697, + "grad_norm": 1.1623843908309937, + "learning_rate": 0.0001246984849747026, + "loss": 1.0261, + "step": 21860 + }, + { + "epoch": 0.8417709335899903, + "grad_norm": 5.310647964477539, + "learning_rate": 0.0001246691810829309, + "loss": 1.094, + "step": 21865 + }, + { + "epoch": 0.8419634263715111, + "grad_norm": 1.5004624128341675, + "learning_rate": 0.000124639874935326, + "loss": 1.1192, + "step": 21870 + }, + { + "epoch": 0.8421559191530318, + "grad_norm": 1.1160995960235596, + "learning_rate": 0.00012461056653456775, + "loss": 1.1409, + "step": 21875 + }, + { + "epoch": 0.8423484119345525, + "grad_norm": 1.3328487873077393, + "learning_rate": 0.0001245812558833362, + "loss": 0.7442, + "step": 21880 + }, + { + "epoch": 0.8425409047160731, + "grad_norm": 1.2014681100845337, + "learning_rate": 0.0001245519429843117, + "loss": 1.0333, + "step": 21885 + }, + { + "epoch": 0.8427333974975938, + "grad_norm": 0.916928231716156, + "learning_rate": 0.00012452262784017464, + "loss": 1.0076, + "step": 21890 + }, + { + "epoch": 0.8429258902791146, + "grad_norm": 1.1074447631835938, + "learning_rate": 0.0001244933104536057, + "loss": 1.1731, + "step": 21895 + }, + { + "epoch": 0.8431183830606352, + "grad_norm": 1.059792160987854, + "learning_rate": 0.00012446399082728578, + "loss": 1.0673, + "step": 21900 + }, + { + "epoch": 0.8433108758421559, + "grad_norm": 0.8817097544670105, + "learning_rate": 0.00012443466896389595, + "loss": 0.9764, + "step": 21905 + }, + { + "epoch": 0.8435033686236766, + "grad_norm": 1.0615062713623047, + "learning_rate": 0.0001244053448661175, + "loss": 1.162, + "step": 21910 + }, + { + "epoch": 0.8436958614051973, + "grad_norm": 1.173344612121582, + "learning_rate": 0.0001243760185366319, + "loss": 0.997, + "step": 21915 + }, + { + "epoch": 0.843888354186718, + "grad_norm": 1.5543683767318726, + "learning_rate": 0.00012434668997812092, + "loss": 1.1386, + "step": 21920 + }, + { + "epoch": 0.8440808469682387, + "grad_norm": 1.753692388534546, + "learning_rate": 0.00012431735919326645, + "loss": 1.24, + "step": 21925 + }, + { + "epoch": 0.8442733397497594, + "grad_norm": 1.3150075674057007, + "learning_rate": 0.00012428802618475053, + "loss": 1.0783, + "step": 21930 + }, + { + "epoch": 0.84446583253128, + "grad_norm": 1.4009346961975098, + "learning_rate": 0.00012425869095525552, + "loss": 0.9953, + "step": 21935 + }, + { + "epoch": 0.8446583253128007, + "grad_norm": 1.7203071117401123, + "learning_rate": 0.00012422935350746395, + "loss": 0.9115, + "step": 21940 + }, + { + "epoch": 0.8448508180943215, + "grad_norm": 1.1231714487075806, + "learning_rate": 0.00012420001384405845, + "loss": 1.1767, + "step": 21945 + }, + { + "epoch": 0.8450433108758422, + "grad_norm": 1.0938594341278076, + "learning_rate": 0.00012417067196772202, + "loss": 1.0433, + "step": 21950 + }, + { + "epoch": 0.8452358036573628, + "grad_norm": 1.9310616254806519, + "learning_rate": 0.00012414132788113777, + "loss": 1.0809, + "step": 21955 + }, + { + "epoch": 0.8454282964388835, + "grad_norm": 1.4712507724761963, + "learning_rate": 0.00012411198158698898, + "loss": 1.0663, + "step": 21960 + }, + { + "epoch": 0.8456207892204043, + "grad_norm": 0.9529402256011963, + "learning_rate": 0.00012408263308795922, + "loss": 1.0486, + "step": 21965 + }, + { + "epoch": 0.845813282001925, + "grad_norm": 0.8903969526290894, + "learning_rate": 0.0001240532823867322, + "loss": 1.0795, + "step": 21970 + }, + { + "epoch": 0.8460057747834456, + "grad_norm": 1.0439949035644531, + "learning_rate": 0.00012402392948599185, + "loss": 1.0456, + "step": 21975 + }, + { + "epoch": 0.8461982675649663, + "grad_norm": 1.5263192653656006, + "learning_rate": 0.00012399457438842226, + "loss": 1.004, + "step": 21980 + }, + { + "epoch": 0.846390760346487, + "grad_norm": 1.3312830924987793, + "learning_rate": 0.00012396521709670782, + "loss": 1.1062, + "step": 21985 + }, + { + "epoch": 0.8465832531280078, + "grad_norm": 1.6657592058181763, + "learning_rate": 0.00012393585761353302, + "loss": 1.019, + "step": 21990 + }, + { + "epoch": 0.8467757459095284, + "grad_norm": 0.9013869166374207, + "learning_rate": 0.0001239064959415826, + "loss": 1.3499, + "step": 21995 + }, + { + "epoch": 0.8469682386910491, + "grad_norm": 0.9990372061729431, + "learning_rate": 0.0001238771320835415, + "loss": 1.064, + "step": 22000 + }, + { + "epoch": 0.8471607314725698, + "grad_norm": 1.070052146911621, + "learning_rate": 0.00012384776604209484, + "loss": 1.0608, + "step": 22005 + }, + { + "epoch": 0.8473532242540904, + "grad_norm": 2.112750291824341, + "learning_rate": 0.00012381839781992797, + "loss": 1.1134, + "step": 22010 + }, + { + "epoch": 0.8475457170356112, + "grad_norm": 1.3599921464920044, + "learning_rate": 0.00012378902741972636, + "loss": 0.964, + "step": 22015 + }, + { + "epoch": 0.8477382098171319, + "grad_norm": 1.8712466955184937, + "learning_rate": 0.00012375965484417582, + "loss": 1.129, + "step": 22020 + }, + { + "epoch": 0.8479307025986526, + "grad_norm": 2.496819019317627, + "learning_rate": 0.00012373028009596222, + "loss": 1.1981, + "step": 22025 + }, + { + "epoch": 0.8481231953801732, + "grad_norm": 1.8566465377807617, + "learning_rate": 0.00012370090317777172, + "loss": 1.1229, + "step": 22030 + }, + { + "epoch": 0.8483156881616939, + "grad_norm": 1.0162265300750732, + "learning_rate": 0.0001236715240922906, + "loss": 1.2759, + "step": 22035 + }, + { + "epoch": 0.8485081809432147, + "grad_norm": 1.4011484384536743, + "learning_rate": 0.0001236421428422054, + "loss": 1.0725, + "step": 22040 + }, + { + "epoch": 0.8487006737247353, + "grad_norm": 0.9119108319282532, + "learning_rate": 0.0001236127594302029, + "loss": 1.0087, + "step": 22045 + }, + { + "epoch": 0.848893166506256, + "grad_norm": 1.4366172552108765, + "learning_rate": 0.00012358337385896993, + "loss": 0.9379, + "step": 22050 + }, + { + "epoch": 0.8490856592877767, + "grad_norm": 1.162611961364746, + "learning_rate": 0.00012355398613119367, + "loss": 1.0068, + "step": 22055 + }, + { + "epoch": 0.8492781520692974, + "grad_norm": 1.5164228677749634, + "learning_rate": 0.0001235245962495614, + "loss": 1.2239, + "step": 22060 + }, + { + "epoch": 0.8494706448508181, + "grad_norm": 1.0570234060287476, + "learning_rate": 0.00012349520421676066, + "loss": 1.0332, + "step": 22065 + }, + { + "epoch": 0.8496631376323388, + "grad_norm": 1.4047578573226929, + "learning_rate": 0.0001234658100354791, + "loss": 1.051, + "step": 22070 + }, + { + "epoch": 0.8498556304138595, + "grad_norm": 1.317559003829956, + "learning_rate": 0.00012343641370840465, + "loss": 1.0702, + "step": 22075 + }, + { + "epoch": 0.8500481231953801, + "grad_norm": 1.0683611631393433, + "learning_rate": 0.0001234070152382255, + "loss": 0.9898, + "step": 22080 + }, + { + "epoch": 0.8502406159769008, + "grad_norm": 0.960689902305603, + "learning_rate": 0.00012337761462762978, + "loss": 1.377, + "step": 22085 + }, + { + "epoch": 0.8504331087584216, + "grad_norm": 1.2345051765441895, + "learning_rate": 0.0001233482118793061, + "loss": 1.0417, + "step": 22090 + }, + { + "epoch": 0.8506256015399423, + "grad_norm": 1.9396436214447021, + "learning_rate": 0.0001233188069959431, + "loss": 1.204, + "step": 22095 + }, + { + "epoch": 0.8508180943214629, + "grad_norm": 1.8735712766647339, + "learning_rate": 0.0001232893999802297, + "loss": 0.9903, + "step": 22100 + }, + { + "epoch": 0.8510105871029836, + "grad_norm": 2.0837035179138184, + "learning_rate": 0.00012325999083485494, + "loss": 1.01, + "step": 22105 + }, + { + "epoch": 0.8512030798845043, + "grad_norm": 1.9637258052825928, + "learning_rate": 0.00012323057956250806, + "loss": 1.1652, + "step": 22110 + }, + { + "epoch": 0.8513955726660251, + "grad_norm": 1.3333475589752197, + "learning_rate": 0.0001232011661658786, + "loss": 0.9568, + "step": 22115 + }, + { + "epoch": 0.8515880654475457, + "grad_norm": 1.8034814596176147, + "learning_rate": 0.0001231717506476562, + "loss": 1.1863, + "step": 22120 + }, + { + "epoch": 0.8517805582290664, + "grad_norm": 0.7985187768936157, + "learning_rate": 0.00012314233301053068, + "loss": 0.9462, + "step": 22125 + }, + { + "epoch": 0.8519730510105871, + "grad_norm": 1.7575490474700928, + "learning_rate": 0.00012311291325719213, + "loss": 1.1151, + "step": 22130 + }, + { + "epoch": 0.8521655437921078, + "grad_norm": 1.058147668838501, + "learning_rate": 0.00012308349139033076, + "loss": 1.071, + "step": 22135 + }, + { + "epoch": 0.8523580365736285, + "grad_norm": 1.0178147554397583, + "learning_rate": 0.00012305406741263701, + "loss": 1.2246, + "step": 22140 + }, + { + "epoch": 0.8525505293551492, + "grad_norm": 1.0183746814727783, + "learning_rate": 0.0001230246413268015, + "loss": 1.076, + "step": 22145 + }, + { + "epoch": 0.8527430221366699, + "grad_norm": 1.4708818197250366, + "learning_rate": 0.0001229952131355151, + "loss": 1.2221, + "step": 22150 + }, + { + "epoch": 0.8529355149181905, + "grad_norm": 1.1673319339752197, + "learning_rate": 0.00012296578284146879, + "loss": 1.0985, + "step": 22155 + }, + { + "epoch": 0.8531280076997113, + "grad_norm": 1.2929120063781738, + "learning_rate": 0.00012293635044735373, + "loss": 1.1093, + "step": 22160 + }, + { + "epoch": 0.853320500481232, + "grad_norm": 1.097374677658081, + "learning_rate": 0.0001229069159558614, + "loss": 1.1361, + "step": 22165 + }, + { + "epoch": 0.8535129932627527, + "grad_norm": 1.5961623191833496, + "learning_rate": 0.00012287747936968335, + "loss": 1.0493, + "step": 22170 + }, + { + "epoch": 0.8537054860442733, + "grad_norm": 1.378281593322754, + "learning_rate": 0.00012284804069151133, + "loss": 1.0436, + "step": 22175 + }, + { + "epoch": 0.853897978825794, + "grad_norm": 0.4067041277885437, + "learning_rate": 0.00012281859992403736, + "loss": 1.0238, + "step": 22180 + }, + { + "epoch": 0.8540904716073148, + "grad_norm": 1.24373197555542, + "learning_rate": 0.00012278915706995358, + "loss": 1.0049, + "step": 22185 + }, + { + "epoch": 0.8542829643888354, + "grad_norm": 2.2100603580474854, + "learning_rate": 0.00012275971213195237, + "loss": 1.1251, + "step": 22190 + }, + { + "epoch": 0.8544754571703561, + "grad_norm": 1.0543677806854248, + "learning_rate": 0.00012273026511272622, + "loss": 1.0744, + "step": 22195 + }, + { + "epoch": 0.8546679499518768, + "grad_norm": 1.5290037393569946, + "learning_rate": 0.00012270081601496792, + "loss": 1.1105, + "step": 22200 + }, + { + "epoch": 0.8548604427333975, + "grad_norm": 1.0767719745635986, + "learning_rate": 0.00012267136484137042, + "loss": 1.0246, + "step": 22205 + }, + { + "epoch": 0.8550529355149182, + "grad_norm": 1.285439133644104, + "learning_rate": 0.00012264191159462674, + "loss": 0.9625, + "step": 22210 + }, + { + "epoch": 0.8552454282964389, + "grad_norm": 1.0118284225463867, + "learning_rate": 0.00012261245627743022, + "loss": 0.8992, + "step": 22215 + }, + { + "epoch": 0.8554379210779596, + "grad_norm": 1.4398612976074219, + "learning_rate": 0.0001225829988924744, + "loss": 1.2539, + "step": 22220 + }, + { + "epoch": 0.8556304138594802, + "grad_norm": 0.7380401492118835, + "learning_rate": 0.00012255353944245294, + "loss": 0.9303, + "step": 22225 + }, + { + "epoch": 0.8558229066410009, + "grad_norm": 1.2834627628326416, + "learning_rate": 0.00012252407793005964, + "loss": 1.0884, + "step": 22230 + }, + { + "epoch": 0.8560153994225217, + "grad_norm": 1.7945185899734497, + "learning_rate": 0.00012249461435798867, + "loss": 1.1546, + "step": 22235 + }, + { + "epoch": 0.8562078922040424, + "grad_norm": 1.4909627437591553, + "learning_rate": 0.00012246514872893424, + "loss": 1.097, + "step": 22240 + }, + { + "epoch": 0.856400384985563, + "grad_norm": 1.3793891668319702, + "learning_rate": 0.00012243568104559075, + "loss": 1.0008, + "step": 22245 + }, + { + "epoch": 0.8565928777670837, + "grad_norm": 1.5637750625610352, + "learning_rate": 0.00012240621131065287, + "loss": 1.0054, + "step": 22250 + }, + { + "epoch": 0.8567853705486044, + "grad_norm": 1.0803687572479248, + "learning_rate": 0.00012237673952681538, + "loss": 1.0349, + "step": 22255 + }, + { + "epoch": 0.8569778633301252, + "grad_norm": 1.220694661140442, + "learning_rate": 0.00012234726569677328, + "loss": 1.0886, + "step": 22260 + }, + { + "epoch": 0.8571703561116458, + "grad_norm": 1.5394152402877808, + "learning_rate": 0.0001223177898232218, + "loss": 1.1316, + "step": 22265 + }, + { + "epoch": 0.8573628488931665, + "grad_norm": 1.927856206893921, + "learning_rate": 0.00012228831190885627, + "loss": 1.1063, + "step": 22270 + }, + { + "epoch": 0.8575553416746872, + "grad_norm": 2.0876247882843018, + "learning_rate": 0.00012225883195637224, + "loss": 1.0356, + "step": 22275 + }, + { + "epoch": 0.857747834456208, + "grad_norm": 1.3493059873580933, + "learning_rate": 0.00012222934996846551, + "loss": 1.191, + "step": 22280 + }, + { + "epoch": 0.8579403272377286, + "grad_norm": 2.021245002746582, + "learning_rate": 0.00012219986594783194, + "loss": 1.2042, + "step": 22285 + }, + { + "epoch": 0.8581328200192493, + "grad_norm": 0.9374911189079285, + "learning_rate": 0.00012217037989716774, + "loss": 1.1407, + "step": 22290 + }, + { + "epoch": 0.85832531280077, + "grad_norm": 1.2450497150421143, + "learning_rate": 0.00012214089181916915, + "loss": 1.0325, + "step": 22295 + }, + { + "epoch": 0.8585178055822906, + "grad_norm": 1.8689814805984497, + "learning_rate": 0.00012211140171653265, + "loss": 1.086, + "step": 22300 + }, + { + "epoch": 0.8587102983638114, + "grad_norm": 1.4810429811477661, + "learning_rate": 0.00012208190959195496, + "loss": 1.1968, + "step": 22305 + }, + { + "epoch": 0.8589027911453321, + "grad_norm": 0.8686331510543823, + "learning_rate": 0.00012205241544813289, + "loss": 1.1492, + "step": 22310 + }, + { + "epoch": 0.8590952839268527, + "grad_norm": 1.288923740386963, + "learning_rate": 0.00012202291928776355, + "loss": 1.1514, + "step": 22315 + }, + { + "epoch": 0.8592877767083734, + "grad_norm": 1.5871915817260742, + "learning_rate": 0.00012199342111354411, + "loss": 0.9645, + "step": 22320 + }, + { + "epoch": 0.8594802694898941, + "grad_norm": 1.8444538116455078, + "learning_rate": 0.00012196392092817202, + "loss": 1.173, + "step": 22325 + }, + { + "epoch": 0.8596727622714149, + "grad_norm": 1.4887306690216064, + "learning_rate": 0.00012193441873434481, + "loss": 1.0281, + "step": 22330 + }, + { + "epoch": 0.8598652550529355, + "grad_norm": 1.3933312892913818, + "learning_rate": 0.00012190491453476036, + "loss": 0.9568, + "step": 22335 + }, + { + "epoch": 0.8600577478344562, + "grad_norm": 1.9519412517547607, + "learning_rate": 0.00012187540833211657, + "loss": 1.0294, + "step": 22340 + }, + { + "epoch": 0.8602502406159769, + "grad_norm": 1.5969189405441284, + "learning_rate": 0.00012184590012911158, + "loss": 1.111, + "step": 22345 + }, + { + "epoch": 0.8604427333974976, + "grad_norm": 1.068150281906128, + "learning_rate": 0.00012181638992844377, + "loss": 1.119, + "step": 22350 + }, + { + "epoch": 0.8606352261790183, + "grad_norm": 1.8505868911743164, + "learning_rate": 0.00012178687773281159, + "loss": 1.1762, + "step": 22355 + }, + { + "epoch": 0.860827718960539, + "grad_norm": 0.8779274225234985, + "learning_rate": 0.00012175736354491381, + "loss": 1.0762, + "step": 22360 + }, + { + "epoch": 0.8610202117420597, + "grad_norm": 0.7734150886535645, + "learning_rate": 0.0001217278473674492, + "loss": 1.139, + "step": 22365 + }, + { + "epoch": 0.8612127045235803, + "grad_norm": 1.2648577690124512, + "learning_rate": 0.00012169832920311693, + "loss": 0.9826, + "step": 22370 + }, + { + "epoch": 0.861405197305101, + "grad_norm": 1.2812930345535278, + "learning_rate": 0.00012166880905461618, + "loss": 1.0287, + "step": 22375 + }, + { + "epoch": 0.8615976900866218, + "grad_norm": 1.3110228776931763, + "learning_rate": 0.00012163928692464637, + "loss": 1.1135, + "step": 22380 + }, + { + "epoch": 0.8617901828681425, + "grad_norm": 1.1708343029022217, + "learning_rate": 0.00012160976281590713, + "loss": 1.2012, + "step": 22385 + }, + { + "epoch": 0.8619826756496631, + "grad_norm": 1.004078984260559, + "learning_rate": 0.00012158023673109824, + "loss": 0.978, + "step": 22390 + }, + { + "epoch": 0.8621751684311838, + "grad_norm": 1.4461795091629028, + "learning_rate": 0.00012155070867291965, + "loss": 1.2483, + "step": 22395 + }, + { + "epoch": 0.8623676612127045, + "grad_norm": 1.3903266191482544, + "learning_rate": 0.00012152117864407149, + "loss": 1.2962, + "step": 22400 + }, + { + "epoch": 0.8625601539942253, + "grad_norm": 1.88125479221344, + "learning_rate": 0.00012149164664725414, + "loss": 1.1184, + "step": 22405 + }, + { + "epoch": 0.8627526467757459, + "grad_norm": 1.5932334661483765, + "learning_rate": 0.00012146211268516805, + "loss": 1.117, + "step": 22410 + }, + { + "epoch": 0.8629451395572666, + "grad_norm": 1.5401246547698975, + "learning_rate": 0.00012143257676051394, + "loss": 1.1546, + "step": 22415 + }, + { + "epoch": 0.8631376323387873, + "grad_norm": 1.215257167816162, + "learning_rate": 0.00012140303887599269, + "loss": 1.139, + "step": 22420 + }, + { + "epoch": 0.8633301251203079, + "grad_norm": 1.1622364521026611, + "learning_rate": 0.00012137349903430528, + "loss": 1.1076, + "step": 22425 + }, + { + "epoch": 0.8635226179018287, + "grad_norm": 1.5525301694869995, + "learning_rate": 0.00012134395723815297, + "loss": 0.8998, + "step": 22430 + }, + { + "epoch": 0.8637151106833494, + "grad_norm": 1.1511106491088867, + "learning_rate": 0.00012131441349023718, + "loss": 1.0576, + "step": 22435 + }, + { + "epoch": 0.86390760346487, + "grad_norm": 1.3346484899520874, + "learning_rate": 0.00012128486779325947, + "loss": 1.1717, + "step": 22440 + }, + { + "epoch": 0.8641000962463907, + "grad_norm": 1.0038443803787231, + "learning_rate": 0.0001212553201499216, + "loss": 1.0569, + "step": 22445 + }, + { + "epoch": 0.8642925890279115, + "grad_norm": 1.3039393424987793, + "learning_rate": 0.00012122577056292548, + "loss": 1.117, + "step": 22450 + }, + { + "epoch": 0.8644850818094322, + "grad_norm": 1.9241334199905396, + "learning_rate": 0.00012119621903497329, + "loss": 1.3177, + "step": 22455 + }, + { + "epoch": 0.8646775745909528, + "grad_norm": 1.7553646564483643, + "learning_rate": 0.00012116666556876725, + "loss": 1.1536, + "step": 22460 + }, + { + "epoch": 0.8648700673724735, + "grad_norm": 1.4490256309509277, + "learning_rate": 0.00012113711016700986, + "loss": 1.0066, + "step": 22465 + }, + { + "epoch": 0.8650625601539942, + "grad_norm": 0.8614715337753296, + "learning_rate": 0.0001211075528324038, + "loss": 1.0534, + "step": 22470 + }, + { + "epoch": 0.865255052935515, + "grad_norm": 1.0804800987243652, + "learning_rate": 0.00012107799356765181, + "loss": 1.0743, + "step": 22475 + }, + { + "epoch": 0.8654475457170356, + "grad_norm": 1.186589241027832, + "learning_rate": 0.00012104843237545695, + "loss": 1.1916, + "step": 22480 + }, + { + "epoch": 0.8656400384985563, + "grad_norm": 1.1526156663894653, + "learning_rate": 0.00012101886925852242, + "loss": 1.069, + "step": 22485 + }, + { + "epoch": 0.865832531280077, + "grad_norm": 1.382684350013733, + "learning_rate": 0.0001209893042195515, + "loss": 1.1266, + "step": 22490 + }, + { + "epoch": 0.8660250240615976, + "grad_norm": 1.9565086364746094, + "learning_rate": 0.00012095973726124774, + "loss": 1.3461, + "step": 22495 + }, + { + "epoch": 0.8662175168431184, + "grad_norm": 0.820969820022583, + "learning_rate": 0.0001209301683863149, + "loss": 0.9658, + "step": 22500 + }, + { + "epoch": 0.8664100096246391, + "grad_norm": 1.1080729961395264, + "learning_rate": 0.00012090059759745677, + "loss": 1.0718, + "step": 22505 + }, + { + "epoch": 0.8666025024061598, + "grad_norm": 1.6945521831512451, + "learning_rate": 0.00012087102489737746, + "loss": 1.1481, + "step": 22510 + }, + { + "epoch": 0.8667949951876804, + "grad_norm": 1.9991236925125122, + "learning_rate": 0.00012084145028878117, + "loss": 1.06, + "step": 22515 + }, + { + "epoch": 0.8669874879692011, + "grad_norm": 1.1101261377334595, + "learning_rate": 0.00012081187377437233, + "loss": 1.1425, + "step": 22520 + }, + { + "epoch": 0.8671799807507219, + "grad_norm": 0.9426703453063965, + "learning_rate": 0.00012078229535685548, + "loss": 1.1868, + "step": 22525 + }, + { + "epoch": 0.8673724735322426, + "grad_norm": 1.0097079277038574, + "learning_rate": 0.0001207527150389354, + "loss": 1.1158, + "step": 22530 + }, + { + "epoch": 0.8675649663137632, + "grad_norm": 1.3798326253890991, + "learning_rate": 0.000120723132823317, + "loss": 1.217, + "step": 22535 + }, + { + "epoch": 0.8677574590952839, + "grad_norm": 1.17965567111969, + "learning_rate": 0.00012069354871270538, + "loss": 1.1504, + "step": 22540 + }, + { + "epoch": 0.8679499518768046, + "grad_norm": 0.9790163040161133, + "learning_rate": 0.00012066396270980583, + "loss": 1.1189, + "step": 22545 + }, + { + "epoch": 0.8681424446583254, + "grad_norm": 1.134660005569458, + "learning_rate": 0.00012063437481732375, + "loss": 1.1093, + "step": 22550 + }, + { + "epoch": 0.868334937439846, + "grad_norm": 2.053043842315674, + "learning_rate": 0.0001206047850379648, + "loss": 1.2373, + "step": 22555 + }, + { + "epoch": 0.8685274302213667, + "grad_norm": 1.0980230569839478, + "learning_rate": 0.00012057519337443477, + "loss": 1.0597, + "step": 22560 + }, + { + "epoch": 0.8687199230028874, + "grad_norm": 0.8968497514724731, + "learning_rate": 0.00012054559982943958, + "loss": 1.0963, + "step": 22565 + }, + { + "epoch": 0.868912415784408, + "grad_norm": 1.183481216430664, + "learning_rate": 0.00012051600440568538, + "loss": 1.0797, + "step": 22570 + }, + { + "epoch": 0.8691049085659288, + "grad_norm": 1.0228111743927002, + "learning_rate": 0.00012048640710587848, + "loss": 1.1906, + "step": 22575 + }, + { + "epoch": 0.8692974013474495, + "grad_norm": 1.070470929145813, + "learning_rate": 0.00012045680793272537, + "loss": 1.0835, + "step": 22580 + }, + { + "epoch": 0.8694898941289702, + "grad_norm": 1.7659672498703003, + "learning_rate": 0.0001204272068889327, + "loss": 1.0718, + "step": 22585 + }, + { + "epoch": 0.8696823869104908, + "grad_norm": 1.503348469734192, + "learning_rate": 0.00012039760397720727, + "loss": 1.052, + "step": 22590 + }, + { + "epoch": 0.8698748796920116, + "grad_norm": 1.4245517253875732, + "learning_rate": 0.00012036799920025607, + "loss": 1.1893, + "step": 22595 + }, + { + "epoch": 0.8700673724735323, + "grad_norm": 1.6989365816116333, + "learning_rate": 0.0001203383925607863, + "loss": 1.1963, + "step": 22600 + }, + { + "epoch": 0.870259865255053, + "grad_norm": 0.9346055388450623, + "learning_rate": 0.0001203087840615052, + "loss": 1.2617, + "step": 22605 + }, + { + "epoch": 0.8704523580365736, + "grad_norm": 1.197245717048645, + "learning_rate": 0.00012027917370512039, + "loss": 1.0629, + "step": 22610 + }, + { + "epoch": 0.8706448508180943, + "grad_norm": 1.3511501550674438, + "learning_rate": 0.00012024956149433946, + "loss": 1.2573, + "step": 22615 + }, + { + "epoch": 0.8708373435996151, + "grad_norm": 1.259965181350708, + "learning_rate": 0.00012021994743187026, + "loss": 1.03, + "step": 22620 + }, + { + "epoch": 0.8710298363811357, + "grad_norm": 0.90989089012146, + "learning_rate": 0.00012019033152042083, + "loss": 1.1222, + "step": 22625 + }, + { + "epoch": 0.8712223291626564, + "grad_norm": 1.5417457818984985, + "learning_rate": 0.00012016071376269932, + "loss": 1.2125, + "step": 22630 + }, + { + "epoch": 0.8714148219441771, + "grad_norm": 1.0979586839675903, + "learning_rate": 0.00012013109416141408, + "loss": 1.1149, + "step": 22635 + }, + { + "epoch": 0.8716073147256977, + "grad_norm": 1.3153108358383179, + "learning_rate": 0.00012010147271927367, + "loss": 0.9733, + "step": 22640 + }, + { + "epoch": 0.8717998075072185, + "grad_norm": 1.1540855169296265, + "learning_rate": 0.0001200718494389867, + "loss": 1.1337, + "step": 22645 + }, + { + "epoch": 0.8719923002887392, + "grad_norm": 2.1098320484161377, + "learning_rate": 0.00012004222432326211, + "loss": 1.2008, + "step": 22650 + }, + { + "epoch": 0.8721847930702599, + "grad_norm": 2.331286668777466, + "learning_rate": 0.00012001259737480883, + "loss": 1.3357, + "step": 22655 + }, + { + "epoch": 0.8723772858517805, + "grad_norm": 0.9849653244018555, + "learning_rate": 0.00011998296859633613, + "loss": 0.9834, + "step": 22660 + }, + { + "epoch": 0.8725697786333012, + "grad_norm": 1.9478217363357544, + "learning_rate": 0.0001199533379905533, + "loss": 1.1192, + "step": 22665 + }, + { + "epoch": 0.872762271414822, + "grad_norm": 1.009040355682373, + "learning_rate": 0.0001199237055601699, + "loss": 1.2274, + "step": 22670 + }, + { + "epoch": 0.8729547641963427, + "grad_norm": 1.4086748361587524, + "learning_rate": 0.00011989407130789558, + "loss": 1.028, + "step": 22675 + }, + { + "epoch": 0.8731472569778633, + "grad_norm": 1.224057674407959, + "learning_rate": 0.00011986443523644025, + "loss": 1.046, + "step": 22680 + }, + { + "epoch": 0.873339749759384, + "grad_norm": 1.7258949279785156, + "learning_rate": 0.00011983479734851393, + "loss": 1.1056, + "step": 22685 + }, + { + "epoch": 0.8735322425409047, + "grad_norm": 1.5146769285202026, + "learning_rate": 0.00011980515764682674, + "loss": 1.3491, + "step": 22690 + }, + { + "epoch": 0.8737247353224254, + "grad_norm": 1.7950037717819214, + "learning_rate": 0.0001197755161340891, + "loss": 1.0302, + "step": 22695 + }, + { + "epoch": 0.8739172281039461, + "grad_norm": 1.0561470985412598, + "learning_rate": 0.00011974587281301151, + "loss": 1.026, + "step": 22700 + }, + { + "epoch": 0.8741097208854668, + "grad_norm": 2.0204522609710693, + "learning_rate": 0.00011971622768630466, + "loss": 0.999, + "step": 22705 + }, + { + "epoch": 0.8743022136669875, + "grad_norm": 1.23675537109375, + "learning_rate": 0.00011968658075667938, + "loss": 0.9875, + "step": 22710 + }, + { + "epoch": 0.8744947064485081, + "grad_norm": 1.2208534479141235, + "learning_rate": 0.00011965693202684671, + "loss": 1.1022, + "step": 22715 + }, + { + "epoch": 0.8746871992300289, + "grad_norm": 1.2948905229568481, + "learning_rate": 0.00011962728149951785, + "loss": 1.1026, + "step": 22720 + }, + { + "epoch": 0.8748796920115496, + "grad_norm": 1.0135208368301392, + "learning_rate": 0.00011959762917740407, + "loss": 1.1093, + "step": 22725 + }, + { + "epoch": 0.8750721847930703, + "grad_norm": 1.3146969079971313, + "learning_rate": 0.00011956797506321695, + "loss": 1.0676, + "step": 22730 + }, + { + "epoch": 0.8752646775745909, + "grad_norm": 1.5172390937805176, + "learning_rate": 0.00011953831915966813, + "loss": 1.0981, + "step": 22735 + }, + { + "epoch": 0.8754571703561116, + "grad_norm": 1.0460716485977173, + "learning_rate": 0.00011950866146946946, + "loss": 1.1096, + "step": 22740 + }, + { + "epoch": 0.8756496631376324, + "grad_norm": 1.5505667924880981, + "learning_rate": 0.00011947900199533291, + "loss": 1.1211, + "step": 22745 + }, + { + "epoch": 0.875842155919153, + "grad_norm": 1.3323757648468018, + "learning_rate": 0.00011944934073997069, + "loss": 1.0103, + "step": 22750 + }, + { + "epoch": 0.8760346487006737, + "grad_norm": 1.1330286264419556, + "learning_rate": 0.0001194196777060951, + "loss": 1.0935, + "step": 22755 + }, + { + "epoch": 0.8762271414821944, + "grad_norm": 1.3528751134872437, + "learning_rate": 0.00011939001289641863, + "loss": 1.0812, + "step": 22760 + }, + { + "epoch": 0.8764196342637152, + "grad_norm": 1.3828253746032715, + "learning_rate": 0.0001193603463136539, + "loss": 1.0722, + "step": 22765 + }, + { + "epoch": 0.8766121270452358, + "grad_norm": 1.1661975383758545, + "learning_rate": 0.00011933067796051378, + "loss": 1.0631, + "step": 22770 + }, + { + "epoch": 0.8768046198267565, + "grad_norm": 1.3716634511947632, + "learning_rate": 0.00011930100783971118, + "loss": 1.0528, + "step": 22775 + }, + { + "epoch": 0.8769971126082772, + "grad_norm": 1.4899778366088867, + "learning_rate": 0.00011927133595395932, + "loss": 1.0373, + "step": 22780 + }, + { + "epoch": 0.8771896053897978, + "grad_norm": 1.0731201171875, + "learning_rate": 0.0001192416623059714, + "loss": 1.1288, + "step": 22785 + }, + { + "epoch": 0.8773820981713186, + "grad_norm": 1.1406874656677246, + "learning_rate": 0.00011921198689846094, + "loss": 1.0467, + "step": 22790 + }, + { + "epoch": 0.8775745909528393, + "grad_norm": 1.203275442123413, + "learning_rate": 0.00011918230973414157, + "loss": 1.1394, + "step": 22795 + }, + { + "epoch": 0.87776708373436, + "grad_norm": 1.0929608345031738, + "learning_rate": 0.000119152630815727, + "loss": 1.0472, + "step": 22800 + }, + { + "epoch": 0.8779595765158806, + "grad_norm": 0.9629132747650146, + "learning_rate": 0.00011912295014593124, + "loss": 1.0325, + "step": 22805 + }, + { + "epoch": 0.8781520692974013, + "grad_norm": 1.3864428997039795, + "learning_rate": 0.00011909326772746839, + "loss": 0.9702, + "step": 22810 + }, + { + "epoch": 0.8783445620789221, + "grad_norm": 2.243774175643921, + "learning_rate": 0.00011906358356305265, + "loss": 1.1504, + "step": 22815 + }, + { + "epoch": 0.8785370548604428, + "grad_norm": 1.1354432106018066, + "learning_rate": 0.0001190338976553985, + "loss": 1.0098, + "step": 22820 + }, + { + "epoch": 0.8787295476419634, + "grad_norm": 1.9807575941085815, + "learning_rate": 0.00011900421000722049, + "loss": 1.1305, + "step": 22825 + }, + { + "epoch": 0.8789220404234841, + "grad_norm": 1.0899982452392578, + "learning_rate": 0.00011897452062123338, + "loss": 1.0502, + "step": 22830 + }, + { + "epoch": 0.8791145332050048, + "grad_norm": 1.194968819618225, + "learning_rate": 0.00011894482950015202, + "loss": 1.0947, + "step": 22835 + }, + { + "epoch": 0.8793070259865255, + "grad_norm": 1.032558560371399, + "learning_rate": 0.00011891513664669152, + "loss": 1.3069, + "step": 22840 + }, + { + "epoch": 0.8794995187680462, + "grad_norm": 1.0530674457550049, + "learning_rate": 0.00011888544206356709, + "loss": 1.0373, + "step": 22845 + }, + { + "epoch": 0.8796920115495669, + "grad_norm": 2.1762609481811523, + "learning_rate": 0.00011885574575349407, + "loss": 1.1478, + "step": 22850 + }, + { + "epoch": 0.8798845043310876, + "grad_norm": 1.5496394634246826, + "learning_rate": 0.00011882604771918802, + "loss": 1.078, + "step": 22855 + }, + { + "epoch": 0.8800769971126082, + "grad_norm": 1.5198618173599243, + "learning_rate": 0.0001187963479633646, + "loss": 0.9666, + "step": 22860 + }, + { + "epoch": 0.880269489894129, + "grad_norm": 2.006168842315674, + "learning_rate": 0.00011876664648873969, + "loss": 1.0162, + "step": 22865 + }, + { + "epoch": 0.8804619826756497, + "grad_norm": 1.6335991621017456, + "learning_rate": 0.0001187369432980293, + "loss": 1.2807, + "step": 22870 + }, + { + "epoch": 0.8806544754571703, + "grad_norm": 1.4370218515396118, + "learning_rate": 0.00011870723839394953, + "loss": 1.0569, + "step": 22875 + }, + { + "epoch": 0.880846968238691, + "grad_norm": 1.4292523860931396, + "learning_rate": 0.00011867753177921675, + "loss": 1.0509, + "step": 22880 + }, + { + "epoch": 0.8810394610202117, + "grad_norm": 0.8672935366630554, + "learning_rate": 0.00011864782345654739, + "loss": 1.0674, + "step": 22885 + }, + { + "epoch": 0.8812319538017325, + "grad_norm": 2.000182628631592, + "learning_rate": 0.00011861811342865814, + "loss": 1.142, + "step": 22890 + }, + { + "epoch": 0.8814244465832531, + "grad_norm": 1.2838269472122192, + "learning_rate": 0.00011858840169826573, + "loss": 1.2496, + "step": 22895 + }, + { + "epoch": 0.8816169393647738, + "grad_norm": 0.8586519956588745, + "learning_rate": 0.00011855868826808712, + "loss": 1.0047, + "step": 22900 + }, + { + "epoch": 0.8818094321462945, + "grad_norm": 0.9435043334960938, + "learning_rate": 0.00011852897314083942, + "loss": 0.9603, + "step": 22905 + }, + { + "epoch": 0.8820019249278153, + "grad_norm": 1.5908135175704956, + "learning_rate": 0.00011849925631923986, + "loss": 1.1064, + "step": 22910 + }, + { + "epoch": 0.8821944177093359, + "grad_norm": 1.6797462701797485, + "learning_rate": 0.00011846953780600585, + "loss": 1.1746, + "step": 22915 + }, + { + "epoch": 0.8823869104908566, + "grad_norm": 1.5768578052520752, + "learning_rate": 0.00011843981760385496, + "loss": 1.153, + "step": 22920 + }, + { + "epoch": 0.8825794032723773, + "grad_norm": 1.136809229850769, + "learning_rate": 0.0001184100957155049, + "loss": 1.1523, + "step": 22925 + }, + { + "epoch": 0.8827718960538979, + "grad_norm": 1.0501734018325806, + "learning_rate": 0.00011838037214367354, + "loss": 1.0341, + "step": 22930 + }, + { + "epoch": 0.8829643888354187, + "grad_norm": 1.3567155599594116, + "learning_rate": 0.00011835064689107893, + "loss": 1.1462, + "step": 22935 + }, + { + "epoch": 0.8831568816169394, + "grad_norm": 2.095165729522705, + "learning_rate": 0.00011832091996043921, + "loss": 1.1365, + "step": 22940 + }, + { + "epoch": 0.8833493743984601, + "grad_norm": 1.119279384613037, + "learning_rate": 0.00011829119135447274, + "loss": 1.1334, + "step": 22945 + }, + { + "epoch": 0.8835418671799807, + "grad_norm": 0.8329470157623291, + "learning_rate": 0.00011826146107589795, + "loss": 0.9907, + "step": 22950 + }, + { + "epoch": 0.8837343599615014, + "grad_norm": 1.6559773683547974, + "learning_rate": 0.00011823172912743355, + "loss": 1.0937, + "step": 22955 + }, + { + "epoch": 0.8839268527430222, + "grad_norm": 1.1509395837783813, + "learning_rate": 0.00011820199551179827, + "loss": 1.213, + "step": 22960 + }, + { + "epoch": 0.8841193455245429, + "grad_norm": 1.926554799079895, + "learning_rate": 0.00011817226023171109, + "loss": 1.1967, + "step": 22965 + }, + { + "epoch": 0.8843118383060635, + "grad_norm": 1.3691015243530273, + "learning_rate": 0.00011814252328989111, + "loss": 1.2489, + "step": 22970 + }, + { + "epoch": 0.8845043310875842, + "grad_norm": 1.807802438735962, + "learning_rate": 0.00011811278468905753, + "loss": 1.2371, + "step": 22975 + }, + { + "epoch": 0.8846968238691049, + "grad_norm": 1.0577672719955444, + "learning_rate": 0.00011808304443192982, + "loss": 0.9763, + "step": 22980 + }, + { + "epoch": 0.8848893166506256, + "grad_norm": 1.69723379611969, + "learning_rate": 0.00011805330252122743, + "loss": 1.3159, + "step": 22985 + }, + { + "epoch": 0.8850818094321463, + "grad_norm": 2.4447522163391113, + "learning_rate": 0.00011802355895967017, + "loss": 1.1288, + "step": 22990 + }, + { + "epoch": 0.885274302213667, + "grad_norm": 1.1180788278579712, + "learning_rate": 0.0001179938137499778, + "loss": 1.2414, + "step": 22995 + }, + { + "epoch": 0.8854667949951877, + "grad_norm": 1.8794695138931274, + "learning_rate": 0.00011796406689487038, + "loss": 1.1329, + "step": 23000 + }, + { + "epoch": 0.8856592877767083, + "grad_norm": 1.1928154230117798, + "learning_rate": 0.00011793431839706803, + "loss": 1.0499, + "step": 23005 + }, + { + "epoch": 0.8858517805582291, + "grad_norm": 0.8071653842926025, + "learning_rate": 0.00011790456825929106, + "loss": 0.9116, + "step": 23010 + }, + { + "epoch": 0.8860442733397498, + "grad_norm": 1.4556511640548706, + "learning_rate": 0.00011787481648425995, + "loss": 0.9842, + "step": 23015 + }, + { + "epoch": 0.8862367661212704, + "grad_norm": 1.4011625051498413, + "learning_rate": 0.00011784506307469527, + "loss": 1.1377, + "step": 23020 + }, + { + "epoch": 0.8864292589027911, + "grad_norm": 1.596177339553833, + "learning_rate": 0.00011781530803331778, + "loss": 1.2584, + "step": 23025 + }, + { + "epoch": 0.8866217516843118, + "grad_norm": 1.4121986627578735, + "learning_rate": 0.00011778555136284839, + "loss": 1.0889, + "step": 23030 + }, + { + "epoch": 0.8868142444658326, + "grad_norm": 1.7569849491119385, + "learning_rate": 0.00011775579306600814, + "loss": 1.0882, + "step": 23035 + }, + { + "epoch": 0.8870067372473532, + "grad_norm": 2.0657474994659424, + "learning_rate": 0.00011772603314551825, + "loss": 1.0512, + "step": 23040 + }, + { + "epoch": 0.8871992300288739, + "grad_norm": 1.419628620147705, + "learning_rate": 0.00011769627160410006, + "loss": 1.2821, + "step": 23045 + }, + { + "epoch": 0.8873917228103946, + "grad_norm": 2.094897747039795, + "learning_rate": 0.00011766650844447505, + "loss": 1.0668, + "step": 23050 + }, + { + "epoch": 0.8875842155919152, + "grad_norm": 1.6981192827224731, + "learning_rate": 0.00011763674366936485, + "loss": 1.1257, + "step": 23055 + }, + { + "epoch": 0.887776708373436, + "grad_norm": 2.0729424953460693, + "learning_rate": 0.0001176069772814913, + "loss": 1.1388, + "step": 23060 + }, + { + "epoch": 0.8879692011549567, + "grad_norm": 1.7522398233413696, + "learning_rate": 0.0001175772092835763, + "loss": 1.3847, + "step": 23065 + }, + { + "epoch": 0.8881616939364774, + "grad_norm": 1.2705082893371582, + "learning_rate": 0.00011754743967834197, + "loss": 1.042, + "step": 23070 + }, + { + "epoch": 0.888354186717998, + "grad_norm": 1.9361348152160645, + "learning_rate": 0.0001175176684685105, + "loss": 1.1387, + "step": 23075 + }, + { + "epoch": 0.8885466794995188, + "grad_norm": 2.4146616458892822, + "learning_rate": 0.00011748789565680429, + "loss": 1.1017, + "step": 23080 + }, + { + "epoch": 0.8887391722810395, + "grad_norm": 1.01850163936615, + "learning_rate": 0.00011745812124594589, + "loss": 0.9676, + "step": 23085 + }, + { + "epoch": 0.8889316650625602, + "grad_norm": 1.8511348962783813, + "learning_rate": 0.00011742834523865796, + "loss": 1.0274, + "step": 23090 + }, + { + "epoch": 0.8891241578440808, + "grad_norm": 1.0579948425292969, + "learning_rate": 0.0001173985676376633, + "loss": 0.9528, + "step": 23095 + }, + { + "epoch": 0.8893166506256015, + "grad_norm": 1.9102215766906738, + "learning_rate": 0.00011736878844568486, + "loss": 1.0718, + "step": 23100 + }, + { + "epoch": 0.8895091434071223, + "grad_norm": 1.7665621042251587, + "learning_rate": 0.00011733900766544579, + "loss": 0.9721, + "step": 23105 + }, + { + "epoch": 0.889701636188643, + "grad_norm": 1.6327630281448364, + "learning_rate": 0.00011730922529966934, + "loss": 1.1314, + "step": 23110 + }, + { + "epoch": 0.8898941289701636, + "grad_norm": 1.567986011505127, + "learning_rate": 0.00011727944135107889, + "loss": 1.1881, + "step": 23115 + }, + { + "epoch": 0.8900866217516843, + "grad_norm": 1.3827725648880005, + "learning_rate": 0.00011724965582239798, + "loss": 1.0747, + "step": 23120 + }, + { + "epoch": 0.890279114533205, + "grad_norm": 1.1218315362930298, + "learning_rate": 0.00011721986871635034, + "loss": 1.0224, + "step": 23125 + }, + { + "epoch": 0.8904716073147257, + "grad_norm": 0.9669145941734314, + "learning_rate": 0.00011719008003565975, + "loss": 1.1214, + "step": 23130 + }, + { + "epoch": 0.8906641000962464, + "grad_norm": 2.2118616104125977, + "learning_rate": 0.00011716028978305023, + "loss": 1.0058, + "step": 23135 + }, + { + "epoch": 0.8908565928777671, + "grad_norm": 1.0324665307998657, + "learning_rate": 0.00011713049796124589, + "loss": 1.0645, + "step": 23140 + }, + { + "epoch": 0.8910490856592878, + "grad_norm": 0.9733279347419739, + "learning_rate": 0.00011710070457297097, + "loss": 1.1258, + "step": 23145 + }, + { + "epoch": 0.8912415784408084, + "grad_norm": 0.8568596243858337, + "learning_rate": 0.00011707090962094991, + "loss": 1.046, + "step": 23150 + }, + { + "epoch": 0.8914340712223292, + "grad_norm": 1.0557111501693726, + "learning_rate": 0.00011704111310790728, + "loss": 1.0856, + "step": 23155 + }, + { + "epoch": 0.8916265640038499, + "grad_norm": 1.436998724937439, + "learning_rate": 0.00011701131503656771, + "loss": 1.0475, + "step": 23160 + }, + { + "epoch": 0.8918190567853705, + "grad_norm": 1.3652065992355347, + "learning_rate": 0.00011698151540965607, + "loss": 1.2369, + "step": 23165 + }, + { + "epoch": 0.8920115495668912, + "grad_norm": 1.6808583736419678, + "learning_rate": 0.00011695171422989735, + "loss": 1.2898, + "step": 23170 + }, + { + "epoch": 0.8922040423484119, + "grad_norm": 1.468636155128479, + "learning_rate": 0.00011692191150001666, + "loss": 1.1976, + "step": 23175 + }, + { + "epoch": 0.8923965351299327, + "grad_norm": 1.6479716300964355, + "learning_rate": 0.00011689210722273925, + "loss": 0.9855, + "step": 23180 + }, + { + "epoch": 0.8925890279114533, + "grad_norm": 1.1517993211746216, + "learning_rate": 0.00011686230140079054, + "loss": 1.0893, + "step": 23185 + }, + { + "epoch": 0.892781520692974, + "grad_norm": 1.0213438272476196, + "learning_rate": 0.00011683249403689606, + "loss": 0.9994, + "step": 23190 + }, + { + "epoch": 0.8929740134744947, + "grad_norm": 0.9689280390739441, + "learning_rate": 0.00011680268513378152, + "loss": 0.9765, + "step": 23195 + }, + { + "epoch": 0.8931665062560153, + "grad_norm": 1.0091511011123657, + "learning_rate": 0.00011677287469417272, + "loss": 1.143, + "step": 23200 + }, + { + "epoch": 0.8933589990375361, + "grad_norm": 1.0775808095932007, + "learning_rate": 0.00011674306272079567, + "loss": 0.9407, + "step": 23205 + }, + { + "epoch": 0.8935514918190568, + "grad_norm": 2.0991709232330322, + "learning_rate": 0.00011671324921637641, + "loss": 1.1681, + "step": 23210 + }, + { + "epoch": 0.8937439846005775, + "grad_norm": 1.5666394233703613, + "learning_rate": 0.00011668343418364122, + "loss": 1.2306, + "step": 23215 + }, + { + "epoch": 0.8939364773820981, + "grad_norm": 1.256421685218811, + "learning_rate": 0.00011665361762531653, + "loss": 1.092, + "step": 23220 + }, + { + "epoch": 0.8941289701636189, + "grad_norm": 1.5252572298049927, + "learning_rate": 0.00011662379954412879, + "loss": 1.0102, + "step": 23225 + }, + { + "epoch": 0.8943214629451396, + "grad_norm": 1.1907771825790405, + "learning_rate": 0.00011659397994280472, + "loss": 1.0496, + "step": 23230 + }, + { + "epoch": 0.8945139557266603, + "grad_norm": 1.2852728366851807, + "learning_rate": 0.00011656415882407113, + "loss": 1.093, + "step": 23235 + }, + { + "epoch": 0.8947064485081809, + "grad_norm": 1.3600996732711792, + "learning_rate": 0.0001165343361906549, + "loss": 1.0328, + "step": 23240 + }, + { + "epoch": 0.8948989412897016, + "grad_norm": 1.609398603439331, + "learning_rate": 0.00011650451204528316, + "loss": 1.2157, + "step": 23245 + }, + { + "epoch": 0.8950914340712224, + "grad_norm": 0.9227400422096252, + "learning_rate": 0.00011647468639068318, + "loss": 1.0264, + "step": 23250 + }, + { + "epoch": 0.895283926852743, + "grad_norm": 1.7690601348876953, + "learning_rate": 0.00011644485922958222, + "loss": 1.3007, + "step": 23255 + }, + { + "epoch": 0.8954764196342637, + "grad_norm": 0.9615120887756348, + "learning_rate": 0.00011641503056470782, + "loss": 0.9696, + "step": 23260 + }, + { + "epoch": 0.8956689124157844, + "grad_norm": 1.3075053691864014, + "learning_rate": 0.00011638520039878762, + "loss": 1.0539, + "step": 23265 + }, + { + "epoch": 0.8958614051973051, + "grad_norm": 1.8990029096603394, + "learning_rate": 0.00011635536873454937, + "loss": 1.2406, + "step": 23270 + }, + { + "epoch": 0.8960538979788258, + "grad_norm": 1.562389850616455, + "learning_rate": 0.00011632553557472101, + "loss": 1.1273, + "step": 23275 + }, + { + "epoch": 0.8962463907603465, + "grad_norm": 0.8242204785346985, + "learning_rate": 0.00011629570092203059, + "loss": 0.991, + "step": 23280 + }, + { + "epoch": 0.8964388835418672, + "grad_norm": 1.8995356559753418, + "learning_rate": 0.00011626586477920625, + "loss": 1.3523, + "step": 23285 + }, + { + "epoch": 0.8966313763233879, + "grad_norm": 1.2772774696350098, + "learning_rate": 0.00011623602714897632, + "loss": 1.1427, + "step": 23290 + }, + { + "epoch": 0.8968238691049085, + "grad_norm": 1.5422502756118774, + "learning_rate": 0.00011620618803406929, + "loss": 1.0688, + "step": 23295 + }, + { + "epoch": 0.8970163618864293, + "grad_norm": 1.4164756536483765, + "learning_rate": 0.00011617634743721369, + "loss": 1.0575, + "step": 23300 + }, + { + "epoch": 0.89720885466795, + "grad_norm": 0.9892400503158569, + "learning_rate": 0.0001161465053611383, + "loss": 1.0522, + "step": 23305 + }, + { + "epoch": 0.8974013474494706, + "grad_norm": 1.217403531074524, + "learning_rate": 0.00011611666180857193, + "loss": 1.2071, + "step": 23310 + }, + { + "epoch": 0.8975938402309913, + "grad_norm": 1.9891669750213623, + "learning_rate": 0.00011608681678224362, + "loss": 1.0608, + "step": 23315 + }, + { + "epoch": 0.897786333012512, + "grad_norm": 1.7623512744903564, + "learning_rate": 0.00011605697028488244, + "loss": 1.0479, + "step": 23320 + }, + { + "epoch": 0.8979788257940328, + "grad_norm": 1.8815721273422241, + "learning_rate": 0.00011602712231921775, + "loss": 1.2632, + "step": 23325 + }, + { + "epoch": 0.8981713185755534, + "grad_norm": 1.2247025966644287, + "learning_rate": 0.00011599727288797888, + "loss": 1.0253, + "step": 23330 + }, + { + "epoch": 0.8983638113570741, + "grad_norm": 1.4108113050460815, + "learning_rate": 0.00011596742199389534, + "loss": 1.1154, + "step": 23335 + }, + { + "epoch": 0.8985563041385948, + "grad_norm": 1.6885722875595093, + "learning_rate": 0.00011593756963969686, + "loss": 1.0641, + "step": 23340 + }, + { + "epoch": 0.8987487969201154, + "grad_norm": 1.6542564630508423, + "learning_rate": 0.00011590771582811324, + "loss": 0.914, + "step": 23345 + }, + { + "epoch": 0.8989412897016362, + "grad_norm": 1.0524113178253174, + "learning_rate": 0.00011587786056187435, + "loss": 1.0983, + "step": 23350 + }, + { + "epoch": 0.8991337824831569, + "grad_norm": 1.223191499710083, + "learning_rate": 0.00011584800384371028, + "loss": 0.996, + "step": 23355 + }, + { + "epoch": 0.8993262752646776, + "grad_norm": 1.6196818351745605, + "learning_rate": 0.00011581814567635128, + "loss": 0.973, + "step": 23360 + }, + { + "epoch": 0.8995187680461982, + "grad_norm": 1.0416576862335205, + "learning_rate": 0.00011578828606252763, + "loss": 1.0055, + "step": 23365 + }, + { + "epoch": 0.8997112608277189, + "grad_norm": 0.8480489253997803, + "learning_rate": 0.00011575842500496979, + "loss": 1.039, + "step": 23370 + }, + { + "epoch": 0.8999037536092397, + "grad_norm": 1.4879505634307861, + "learning_rate": 0.0001157285625064084, + "loss": 1.1408, + "step": 23375 + }, + { + "epoch": 0.9000962463907604, + "grad_norm": 1.2672336101531982, + "learning_rate": 0.00011569869856957416, + "loss": 0.9923, + "step": 23380 + }, + { + "epoch": 0.900288739172281, + "grad_norm": 1.147540807723999, + "learning_rate": 0.0001156688331971979, + "loss": 1.04, + "step": 23385 + }, + { + "epoch": 0.9004812319538017, + "grad_norm": 1.0331413745880127, + "learning_rate": 0.00011563896639201068, + "loss": 1.0502, + "step": 23390 + }, + { + "epoch": 0.9006737247353225, + "grad_norm": 1.2713568210601807, + "learning_rate": 0.00011560909815674354, + "loss": 1.109, + "step": 23395 + }, + { + "epoch": 0.9008662175168431, + "grad_norm": 1.5601911544799805, + "learning_rate": 0.0001155792284941278, + "loss": 0.9567, + "step": 23400 + }, + { + "epoch": 0.9010587102983638, + "grad_norm": 2.2069153785705566, + "learning_rate": 0.00011554935740689483, + "loss": 1.1025, + "step": 23405 + }, + { + "epoch": 0.9012512030798845, + "grad_norm": 1.4782518148422241, + "learning_rate": 0.00011551948489777611, + "loss": 1.1219, + "step": 23410 + }, + { + "epoch": 0.9014436958614052, + "grad_norm": 1.1145074367523193, + "learning_rate": 0.00011548961096950332, + "loss": 0.9797, + "step": 23415 + }, + { + "epoch": 0.9016361886429259, + "grad_norm": 1.0069323778152466, + "learning_rate": 0.00011545973562480821, + "loss": 1.1028, + "step": 23420 + }, + { + "epoch": 0.9018286814244466, + "grad_norm": 1.3039555549621582, + "learning_rate": 0.00011542985886642268, + "loss": 0.9536, + "step": 23425 + }, + { + "epoch": 0.9020211742059673, + "grad_norm": 1.2636560201644897, + "learning_rate": 0.00011539998069707876, + "loss": 1.1687, + "step": 23430 + }, + { + "epoch": 0.902213666987488, + "grad_norm": 1.4051100015640259, + "learning_rate": 0.00011537010111950866, + "loss": 1.1032, + "step": 23435 + }, + { + "epoch": 0.9024061597690086, + "grad_norm": 1.3477351665496826, + "learning_rate": 0.00011534022013644459, + "loss": 1.0785, + "step": 23440 + }, + { + "epoch": 0.9025986525505294, + "grad_norm": 1.7776321172714233, + "learning_rate": 0.00011531033775061901, + "loss": 0.9295, + "step": 23445 + }, + { + "epoch": 0.9027911453320501, + "grad_norm": 1.9729288816452026, + "learning_rate": 0.0001152804539647645, + "loss": 1.1657, + "step": 23450 + }, + { + "epoch": 0.9029836381135707, + "grad_norm": 0.9185901284217834, + "learning_rate": 0.00011525056878161367, + "loss": 1.0089, + "step": 23455 + }, + { + "epoch": 0.9031761308950914, + "grad_norm": 1.6784918308258057, + "learning_rate": 0.00011522068220389934, + "loss": 1.0847, + "step": 23460 + }, + { + "epoch": 0.9033686236766121, + "grad_norm": 1.261899709701538, + "learning_rate": 0.00011519079423435449, + "loss": 1.0281, + "step": 23465 + }, + { + "epoch": 0.9035611164581329, + "grad_norm": 1.1247740983963013, + "learning_rate": 0.0001151609048757121, + "loss": 1.143, + "step": 23470 + }, + { + "epoch": 0.9037536092396535, + "grad_norm": 1.3260306119918823, + "learning_rate": 0.0001151310141307054, + "loss": 1.022, + "step": 23475 + }, + { + "epoch": 0.9039461020211742, + "grad_norm": 1.0106364488601685, + "learning_rate": 0.00011510112200206769, + "loss": 1.0949, + "step": 23480 + }, + { + "epoch": 0.9041385948026949, + "grad_norm": 1.2663025856018066, + "learning_rate": 0.00011507122849253243, + "loss": 1.1526, + "step": 23485 + }, + { + "epoch": 0.9043310875842155, + "grad_norm": 1.2372430562973022, + "learning_rate": 0.00011504731269249489, + "loss": 1.0767, + "step": 23490 + }, + { + "epoch": 0.9045235803657363, + "grad_norm": 1.1163527965545654, + "learning_rate": 0.00011501741670423267, + "loss": 1.0479, + "step": 23495 + }, + { + "epoch": 0.904716073147257, + "grad_norm": 1.766180157661438, + "learning_rate": 0.00011498751934272714, + "loss": 1.0545, + "step": 23500 + }, + { + "epoch": 0.9049085659287777, + "grad_norm": 1.2410964965820312, + "learning_rate": 0.00011495762061071227, + "loss": 1.0771, + "step": 23505 + }, + { + "epoch": 0.9051010587102983, + "grad_norm": 1.8052672147750854, + "learning_rate": 0.00011492772051092206, + "loss": 1.2158, + "step": 23510 + }, + { + "epoch": 0.905293551491819, + "grad_norm": 1.9236223697662354, + "learning_rate": 0.00011489781904609071, + "loss": 1.147, + "step": 23515 + }, + { + "epoch": 0.9054860442733398, + "grad_norm": 1.454115867614746, + "learning_rate": 0.0001148679162189525, + "loss": 1.2127, + "step": 23520 + }, + { + "epoch": 0.9056785370548605, + "grad_norm": 1.20502769947052, + "learning_rate": 0.00011483801203224185, + "loss": 1.2464, + "step": 23525 + }, + { + "epoch": 0.9058710298363811, + "grad_norm": 1.5403794050216675, + "learning_rate": 0.00011480810648869327, + "loss": 0.9593, + "step": 23530 + }, + { + "epoch": 0.9060635226179018, + "grad_norm": 0.9871656894683838, + "learning_rate": 0.00011477819959104145, + "loss": 1.0469, + "step": 23535 + }, + { + "epoch": 0.9062560153994226, + "grad_norm": 1.037180781364441, + "learning_rate": 0.00011474829134202121, + "loss": 0.8791, + "step": 23540 + }, + { + "epoch": 0.9064485081809432, + "grad_norm": 0.9241686463356018, + "learning_rate": 0.00011471838174436738, + "loss": 1.1382, + "step": 23545 + }, + { + "epoch": 0.9066410009624639, + "grad_norm": 1.891224980354309, + "learning_rate": 0.00011468847080081506, + "loss": 1.0493, + "step": 23550 + }, + { + "epoch": 0.9068334937439846, + "grad_norm": 1.9200764894485474, + "learning_rate": 0.00011465855851409939, + "loss": 1.0265, + "step": 23555 + }, + { + "epoch": 0.9070259865255053, + "grad_norm": 0.8929172158241272, + "learning_rate": 0.00011462864488695563, + "loss": 1.1777, + "step": 23560 + }, + { + "epoch": 0.907218479307026, + "grad_norm": 2.08207368850708, + "learning_rate": 0.00011459872992211922, + "loss": 0.9146, + "step": 23565 + }, + { + "epoch": 0.9074109720885467, + "grad_norm": 1.6728849411010742, + "learning_rate": 0.00011456881362232567, + "loss": 1.1298, + "step": 23570 + }, + { + "epoch": 0.9076034648700674, + "grad_norm": 2.403190851211548, + "learning_rate": 0.0001145388959903106, + "loss": 1.1824, + "step": 23575 + }, + { + "epoch": 0.907795957651588, + "grad_norm": 1.7039401531219482, + "learning_rate": 0.00011450897702880981, + "loss": 1.1583, + "step": 23580 + }, + { + "epoch": 0.9079884504331087, + "grad_norm": 1.2045179605484009, + "learning_rate": 0.0001144790567405592, + "loss": 1.2031, + "step": 23585 + }, + { + "epoch": 0.9081809432146295, + "grad_norm": 1.4418444633483887, + "learning_rate": 0.00011444913512829476, + "loss": 1.1499, + "step": 23590 + }, + { + "epoch": 0.9083734359961502, + "grad_norm": 2.100736379623413, + "learning_rate": 0.00011441921219475264, + "loss": 1.1585, + "step": 23595 + }, + { + "epoch": 0.9085659287776708, + "grad_norm": 1.10848867893219, + "learning_rate": 0.00011438928794266905, + "loss": 1.2494, + "step": 23600 + }, + { + "epoch": 0.9087584215591915, + "grad_norm": 1.8016185760498047, + "learning_rate": 0.00011435936237478045, + "loss": 1.061, + "step": 23605 + }, + { + "epoch": 0.9089509143407122, + "grad_norm": 1.1875159740447998, + "learning_rate": 0.00011432943549382325, + "loss": 1.1222, + "step": 23610 + }, + { + "epoch": 0.909143407122233, + "grad_norm": 1.8396415710449219, + "learning_rate": 0.00011429950730253407, + "loss": 1.1891, + "step": 23615 + }, + { + "epoch": 0.9093358999037536, + "grad_norm": 1.5621596574783325, + "learning_rate": 0.00011426957780364973, + "loss": 1.0585, + "step": 23620 + }, + { + "epoch": 0.9095283926852743, + "grad_norm": 1.253409504890442, + "learning_rate": 0.000114239646999907, + "loss": 1.1791, + "step": 23625 + }, + { + "epoch": 0.909720885466795, + "grad_norm": 2.3588509559631348, + "learning_rate": 0.00011420971489404287, + "loss": 1.1373, + "step": 23630 + }, + { + "epoch": 0.9099133782483156, + "grad_norm": 1.597448706626892, + "learning_rate": 0.00011417978148879445, + "loss": 1.2299, + "step": 23635 + }, + { + "epoch": 0.9101058710298364, + "grad_norm": 0.90355384349823, + "learning_rate": 0.00011414984678689895, + "loss": 0.9769, + "step": 23640 + }, + { + "epoch": 0.9102983638113571, + "grad_norm": 1.7792707681655884, + "learning_rate": 0.00011411991079109368, + "loss": 1.0331, + "step": 23645 + }, + { + "epoch": 0.9104908565928778, + "grad_norm": 1.5718694925308228, + "learning_rate": 0.00011408997350411614, + "loss": 1.1637, + "step": 23650 + }, + { + "epoch": 0.9106833493743984, + "grad_norm": 1.0493242740631104, + "learning_rate": 0.00011406003492870383, + "loss": 0.9587, + "step": 23655 + }, + { + "epoch": 0.9108758421559191, + "grad_norm": 1.3035274744033813, + "learning_rate": 0.00011403009506759445, + "loss": 1.0251, + "step": 23660 + }, + { + "epoch": 0.9110683349374399, + "grad_norm": 1.4814205169677734, + "learning_rate": 0.00011400015392352585, + "loss": 1.0466, + "step": 23665 + }, + { + "epoch": 0.9112608277189606, + "grad_norm": 1.178098440170288, + "learning_rate": 0.0001139702114992359, + "loss": 0.9776, + "step": 23670 + }, + { + "epoch": 0.9114533205004812, + "grad_norm": 1.1865342855453491, + "learning_rate": 0.00011394026779746267, + "loss": 1.0895, + "step": 23675 + }, + { + "epoch": 0.9116458132820019, + "grad_norm": 0.8956665992736816, + "learning_rate": 0.00011391032282094429, + "loss": 1.0819, + "step": 23680 + }, + { + "epoch": 0.9118383060635226, + "grad_norm": 1.0805063247680664, + "learning_rate": 0.00011388037657241904, + "loss": 0.9655, + "step": 23685 + }, + { + "epoch": 0.9120307988450433, + "grad_norm": 1.116804838180542, + "learning_rate": 0.00011385042905462526, + "loss": 0.945, + "step": 23690 + }, + { + "epoch": 0.912223291626564, + "grad_norm": 1.1748266220092773, + "learning_rate": 0.00011382048027030155, + "loss": 1.0803, + "step": 23695 + }, + { + "epoch": 0.9124157844080847, + "grad_norm": 1.5685302019119263, + "learning_rate": 0.00011379053022218645, + "loss": 1.1678, + "step": 23700 + }, + { + "epoch": 0.9126082771896054, + "grad_norm": 1.1496498584747314, + "learning_rate": 0.00011376057891301873, + "loss": 1.007, + "step": 23705 + }, + { + "epoch": 0.9128007699711261, + "grad_norm": 1.3414727449417114, + "learning_rate": 0.0001137306263455372, + "loss": 1.0309, + "step": 23710 + }, + { + "epoch": 0.9129932627526468, + "grad_norm": 0.9307123422622681, + "learning_rate": 0.00011370067252248085, + "loss": 1.2324, + "step": 23715 + }, + { + "epoch": 0.9131857555341675, + "grad_norm": 2.051027536392212, + "learning_rate": 0.00011367071744658875, + "loss": 1.2293, + "step": 23720 + }, + { + "epoch": 0.9133782483156881, + "grad_norm": 1.3794739246368408, + "learning_rate": 0.00011364076112060014, + "loss": 1.1828, + "step": 23725 + }, + { + "epoch": 0.9135707410972088, + "grad_norm": 1.082683801651001, + "learning_rate": 0.00011361080354725424, + "loss": 0.9434, + "step": 23730 + }, + { + "epoch": 0.9137632338787296, + "grad_norm": 2.3093974590301514, + "learning_rate": 0.00011358084472929054, + "loss": 1.1994, + "step": 23735 + }, + { + "epoch": 0.9139557266602503, + "grad_norm": 1.007186770439148, + "learning_rate": 0.00011355088466944855, + "loss": 1.0195, + "step": 23740 + }, + { + "epoch": 0.9141482194417709, + "grad_norm": 1.239039659500122, + "learning_rate": 0.00011352092337046791, + "loss": 1.1732, + "step": 23745 + }, + { + "epoch": 0.9143407122232916, + "grad_norm": 1.795091986656189, + "learning_rate": 0.00011349096083508841, + "loss": 1.1553, + "step": 23750 + }, + { + "epoch": 0.9145332050048123, + "grad_norm": 1.2720720767974854, + "learning_rate": 0.00011346099706604991, + "loss": 1.0455, + "step": 23755 + }, + { + "epoch": 0.9147256977863331, + "grad_norm": 1.3282806873321533, + "learning_rate": 0.00011343103206609241, + "loss": 0.9086, + "step": 23760 + }, + { + "epoch": 0.9149181905678537, + "grad_norm": 1.055714726448059, + "learning_rate": 0.00011340106583795597, + "loss": 1.2688, + "step": 23765 + }, + { + "epoch": 0.9151106833493744, + "grad_norm": 1.3557606935501099, + "learning_rate": 0.00011337109838438085, + "loss": 1.1387, + "step": 23770 + }, + { + "epoch": 0.9153031761308951, + "grad_norm": 1.2776196002960205, + "learning_rate": 0.00011334112970810737, + "loss": 0.9351, + "step": 23775 + }, + { + "epoch": 0.9154956689124157, + "grad_norm": 1.4037102460861206, + "learning_rate": 0.00011331115981187595, + "loss": 1.0117, + "step": 23780 + }, + { + "epoch": 0.9156881616939365, + "grad_norm": 0.8925278782844543, + "learning_rate": 0.00011328118869842712, + "loss": 1.108, + "step": 23785 + }, + { + "epoch": 0.9158806544754572, + "grad_norm": 2.050219774246216, + "learning_rate": 0.0001132512163705016, + "loss": 1.1364, + "step": 23790 + }, + { + "epoch": 0.9160731472569779, + "grad_norm": 1.0239925384521484, + "learning_rate": 0.0001132212428308401, + "loss": 1.0919, + "step": 23795 + }, + { + "epoch": 0.9162656400384985, + "grad_norm": 1.554990530014038, + "learning_rate": 0.00011319126808218355, + "loss": 1.1526, + "step": 23800 + }, + { + "epoch": 0.9164581328200192, + "grad_norm": 1.433929443359375, + "learning_rate": 0.00011316129212727292, + "loss": 1.1302, + "step": 23805 + }, + { + "epoch": 0.91665062560154, + "grad_norm": 1.6558189392089844, + "learning_rate": 0.00011313131496884927, + "loss": 1.1572, + "step": 23810 + }, + { + "epoch": 0.9168431183830607, + "grad_norm": 1.2385213375091553, + "learning_rate": 0.00011310133660965387, + "loss": 1.0693, + "step": 23815 + }, + { + "epoch": 0.9170356111645813, + "grad_norm": 1.636640191078186, + "learning_rate": 0.00011307135705242806, + "loss": 1.0358, + "step": 23820 + }, + { + "epoch": 0.917228103946102, + "grad_norm": 1.5593379735946655, + "learning_rate": 0.0001130413762999132, + "loss": 1.1899, + "step": 23825 + }, + { + "epoch": 0.9174205967276227, + "grad_norm": 1.4657301902770996, + "learning_rate": 0.00011301139435485086, + "loss": 1.0543, + "step": 23830 + }, + { + "epoch": 0.9176130895091434, + "grad_norm": 1.0801408290863037, + "learning_rate": 0.00011298141121998273, + "loss": 1.181, + "step": 23835 + }, + { + "epoch": 0.9178055822906641, + "grad_norm": 1.6339335441589355, + "learning_rate": 0.00011295142689805052, + "loss": 1.197, + "step": 23840 + }, + { + "epoch": 0.9179980750721848, + "grad_norm": 1.4383010864257812, + "learning_rate": 0.00011292144139179612, + "loss": 0.98, + "step": 23845 + }, + { + "epoch": 0.9181905678537055, + "grad_norm": 1.440516471862793, + "learning_rate": 0.00011289145470396152, + "loss": 1.1183, + "step": 23850 + }, + { + "epoch": 0.9183830606352262, + "grad_norm": 0.940071165561676, + "learning_rate": 0.00011286146683728876, + "loss": 1.0116, + "step": 23855 + }, + { + "epoch": 0.9185755534167469, + "grad_norm": 1.1691455841064453, + "learning_rate": 0.00011283147779452003, + "loss": 1.0608, + "step": 23860 + }, + { + "epoch": 0.9187680461982676, + "grad_norm": 1.218925952911377, + "learning_rate": 0.00011280148757839771, + "loss": 1.0606, + "step": 23865 + }, + { + "epoch": 0.9189605389797882, + "grad_norm": 0.890487790107727, + "learning_rate": 0.00011277149619166414, + "loss": 0.8924, + "step": 23870 + }, + { + "epoch": 0.9191530317613089, + "grad_norm": 1.4061552286148071, + "learning_rate": 0.00011274150363706182, + "loss": 1.1201, + "step": 23875 + }, + { + "epoch": 0.9193455245428297, + "grad_norm": 2.254596471786499, + "learning_rate": 0.00011271150991733341, + "loss": 1.1457, + "step": 23880 + }, + { + "epoch": 0.9195380173243504, + "grad_norm": 1.2079321146011353, + "learning_rate": 0.00011268151503522164, + "loss": 1.0976, + "step": 23885 + }, + { + "epoch": 0.919730510105871, + "grad_norm": 1.8789191246032715, + "learning_rate": 0.00011265151899346931, + "loss": 1.0205, + "step": 23890 + }, + { + "epoch": 0.9199230028873917, + "grad_norm": 1.5862555503845215, + "learning_rate": 0.00011262152179481938, + "loss": 1.1655, + "step": 23895 + }, + { + "epoch": 0.9201154956689124, + "grad_norm": 1.342571496963501, + "learning_rate": 0.00011259152344201492, + "loss": 1.1423, + "step": 23900 + }, + { + "epoch": 0.9203079884504332, + "grad_norm": 1.4979714155197144, + "learning_rate": 0.00011256152393779901, + "loss": 1.0377, + "step": 23905 + }, + { + "epoch": 0.9205004812319538, + "grad_norm": 2.0960614681243896, + "learning_rate": 0.00011253152328491496, + "loss": 1.0474, + "step": 23910 + }, + { + "epoch": 0.9206929740134745, + "grad_norm": 1.4610568284988403, + "learning_rate": 0.00011250152148610613, + "loss": 1.1822, + "step": 23915 + }, + { + "epoch": 0.9208854667949952, + "grad_norm": 1.4984396696090698, + "learning_rate": 0.000112471518544116, + "loss": 0.9195, + "step": 23920 + }, + { + "epoch": 0.9210779595765158, + "grad_norm": 1.0438648462295532, + "learning_rate": 0.00011244151446168807, + "loss": 0.9973, + "step": 23925 + }, + { + "epoch": 0.9212704523580366, + "grad_norm": 1.51102876663208, + "learning_rate": 0.00011241150924156609, + "loss": 1.1294, + "step": 23930 + }, + { + "epoch": 0.9214629451395573, + "grad_norm": 1.743852138519287, + "learning_rate": 0.00011238150288649381, + "loss": 1.0525, + "step": 23935 + }, + { + "epoch": 0.921655437921078, + "grad_norm": 1.6502209901809692, + "learning_rate": 0.00011235149539921508, + "loss": 1.0904, + "step": 23940 + }, + { + "epoch": 0.9218479307025986, + "grad_norm": 1.1279150247573853, + "learning_rate": 0.00011232148678247393, + "loss": 1.0176, + "step": 23945 + }, + { + "epoch": 0.9220404234841193, + "grad_norm": 1.2196617126464844, + "learning_rate": 0.00011229147703901444, + "loss": 0.8492, + "step": 23950 + }, + { + "epoch": 0.9222329162656401, + "grad_norm": 1.4295985698699951, + "learning_rate": 0.0001122614661715808, + "loss": 1.1331, + "step": 23955 + }, + { + "epoch": 0.9224254090471607, + "grad_norm": 1.8270727396011353, + "learning_rate": 0.00011223145418291731, + "loss": 1.0813, + "step": 23960 + }, + { + "epoch": 0.9226179018286814, + "grad_norm": 1.327848196029663, + "learning_rate": 0.00011220144107576834, + "loss": 0.9626, + "step": 23965 + }, + { + "epoch": 0.9228103946102021, + "grad_norm": 1.5928614139556885, + "learning_rate": 0.00011217142685287842, + "loss": 1.1328, + "step": 23970 + }, + { + "epoch": 0.9230028873917228, + "grad_norm": 1.3863136768341064, + "learning_rate": 0.00011214141151699215, + "loss": 1.1175, + "step": 23975 + }, + { + "epoch": 0.9231953801732435, + "grad_norm": 1.4480311870574951, + "learning_rate": 0.00011211139507085422, + "loss": 0.9841, + "step": 23980 + }, + { + "epoch": 0.9233878729547642, + "grad_norm": 1.1457321643829346, + "learning_rate": 0.00011208137751720944, + "loss": 1.0934, + "step": 23985 + }, + { + "epoch": 0.9235803657362849, + "grad_norm": 1.614317774772644, + "learning_rate": 0.00011205135885880272, + "loss": 1.0216, + "step": 23990 + }, + { + "epoch": 0.9237728585178056, + "grad_norm": 1.0232197046279907, + "learning_rate": 0.00011202133909837906, + "loss": 1.1024, + "step": 23995 + }, + { + "epoch": 0.9239653512993262, + "grad_norm": 1.317587971687317, + "learning_rate": 0.00011199131823868358, + "loss": 1.2039, + "step": 24000 + }, + { + "epoch": 0.924157844080847, + "grad_norm": 1.0058887004852295, + "learning_rate": 0.00011196129628246148, + "loss": 1.068, + "step": 24005 + }, + { + "epoch": 0.9243503368623677, + "grad_norm": 1.1201086044311523, + "learning_rate": 0.00011193127323245809, + "loss": 1.0745, + "step": 24010 + }, + { + "epoch": 0.9245428296438883, + "grad_norm": 1.1013514995574951, + "learning_rate": 0.00011190124909141877, + "loss": 1.1211, + "step": 24015 + }, + { + "epoch": 0.924735322425409, + "grad_norm": 1.3184977769851685, + "learning_rate": 0.00011187122386208908, + "loss": 1.2223, + "step": 24020 + }, + { + "epoch": 0.9249278152069298, + "grad_norm": 1.053260087966919, + "learning_rate": 0.0001118411975472146, + "loss": 0.9556, + "step": 24025 + }, + { + "epoch": 0.9251203079884505, + "grad_norm": 1.6878530979156494, + "learning_rate": 0.00011181117014954105, + "loss": 0.9915, + "step": 24030 + }, + { + "epoch": 0.9253128007699711, + "grad_norm": 0.6977512836456299, + "learning_rate": 0.00011178114167181423, + "loss": 1.0629, + "step": 24035 + }, + { + "epoch": 0.9255052935514918, + "grad_norm": 1.4478118419647217, + "learning_rate": 0.00011175111211678006, + "loss": 1.1585, + "step": 24040 + }, + { + "epoch": 0.9256977863330125, + "grad_norm": 1.1715469360351562, + "learning_rate": 0.0001117210814871845, + "loss": 0.9811, + "step": 24045 + }, + { + "epoch": 0.9258902791145333, + "grad_norm": 1.2581676244735718, + "learning_rate": 0.00011169104978577369, + "loss": 1.0722, + "step": 24050 + }, + { + "epoch": 0.9260827718960539, + "grad_norm": 1.064126968383789, + "learning_rate": 0.00011166101701529385, + "loss": 0.983, + "step": 24055 + }, + { + "epoch": 0.9262752646775746, + "grad_norm": 1.0658310651779175, + "learning_rate": 0.00011163098317849123, + "loss": 1.0436, + "step": 24060 + }, + { + "epoch": 0.9264677574590953, + "grad_norm": 1.1407309770584106, + "learning_rate": 0.00011160094827811223, + "loss": 0.957, + "step": 24065 + }, + { + "epoch": 0.9266602502406159, + "grad_norm": 1.3511934280395508, + "learning_rate": 0.0001115709123169034, + "loss": 1.0751, + "step": 24070 + }, + { + "epoch": 0.9268527430221367, + "grad_norm": 1.416675329208374, + "learning_rate": 0.00011154087529761125, + "loss": 1.2558, + "step": 24075 + }, + { + "epoch": 0.9270452358036574, + "grad_norm": 1.5210598707199097, + "learning_rate": 0.00011151083722298252, + "loss": 0.945, + "step": 24080 + }, + { + "epoch": 0.9272377285851781, + "grad_norm": 1.1831562519073486, + "learning_rate": 0.00011148079809576399, + "loss": 1.0632, + "step": 24085 + }, + { + "epoch": 0.9274302213666987, + "grad_norm": 1.1255745887756348, + "learning_rate": 0.00011145075791870252, + "loss": 1.0556, + "step": 24090 + }, + { + "epoch": 0.9276227141482194, + "grad_norm": 1.213739275932312, + "learning_rate": 0.00011142071669454507, + "loss": 1.1672, + "step": 24095 + }, + { + "epoch": 0.9278152069297402, + "grad_norm": 1.0633070468902588, + "learning_rate": 0.00011139067442603877, + "loss": 1.1084, + "step": 24100 + }, + { + "epoch": 0.9280076997112608, + "grad_norm": 0.8085178136825562, + "learning_rate": 0.00011136063111593073, + "loss": 0.9406, + "step": 24105 + }, + { + "epoch": 0.9282001924927815, + "grad_norm": 1.0284137725830078, + "learning_rate": 0.00011133058676696823, + "loss": 1.1271, + "step": 24110 + }, + { + "epoch": 0.9283926852743022, + "grad_norm": 1.3331536054611206, + "learning_rate": 0.00011130054138189863, + "loss": 1.1798, + "step": 24115 + }, + { + "epoch": 0.9285851780558229, + "grad_norm": 1.1535316705703735, + "learning_rate": 0.00011127049496346939, + "loss": 0.9878, + "step": 24120 + }, + { + "epoch": 0.9287776708373436, + "grad_norm": 1.4867428541183472, + "learning_rate": 0.00011124044751442803, + "loss": 1.0329, + "step": 24125 + }, + { + "epoch": 0.9289701636188643, + "grad_norm": 0.9730262756347656, + "learning_rate": 0.00011121039903752224, + "loss": 1.0598, + "step": 24130 + }, + { + "epoch": 0.929162656400385, + "grad_norm": 2.8743958473205566, + "learning_rate": 0.0001111803495354997, + "loss": 0.9686, + "step": 24135 + }, + { + "epoch": 0.9293551491819056, + "grad_norm": 2.40010404586792, + "learning_rate": 0.00011115029901110825, + "loss": 1.1852, + "step": 24140 + }, + { + "epoch": 0.9295476419634263, + "grad_norm": 1.7937008142471313, + "learning_rate": 0.00011112024746709581, + "loss": 1.0724, + "step": 24145 + }, + { + "epoch": 0.9297401347449471, + "grad_norm": 1.1320335865020752, + "learning_rate": 0.00011109019490621044, + "loss": 1.0574, + "step": 24150 + }, + { + "epoch": 0.9299326275264678, + "grad_norm": 1.4164587259292603, + "learning_rate": 0.0001110601413312002, + "loss": 1.1617, + "step": 24155 + }, + { + "epoch": 0.9301251203079884, + "grad_norm": 1.665640950202942, + "learning_rate": 0.00011103008674481328, + "loss": 1.1355, + "step": 24160 + }, + { + "epoch": 0.9303176130895091, + "grad_norm": 1.509874701499939, + "learning_rate": 0.00011100003114979802, + "loss": 1.0042, + "step": 24165 + }, + { + "epoch": 0.9305101058710299, + "grad_norm": 1.702763319015503, + "learning_rate": 0.00011096997454890275, + "loss": 1.1039, + "step": 24170 + }, + { + "epoch": 0.9307025986525506, + "grad_norm": 0.9513006210327148, + "learning_rate": 0.00011093991694487597, + "loss": 1.1699, + "step": 24175 + }, + { + "epoch": 0.9308950914340712, + "grad_norm": 0.8490985035896301, + "learning_rate": 0.0001109098583404663, + "loss": 1.0261, + "step": 24180 + }, + { + "epoch": 0.9310875842155919, + "grad_norm": 1.2935947179794312, + "learning_rate": 0.00011087979873842233, + "loss": 1.1481, + "step": 24185 + }, + { + "epoch": 0.9312800769971126, + "grad_norm": 0.9786511063575745, + "learning_rate": 0.00011084973814149284, + "loss": 0.9385, + "step": 24190 + }, + { + "epoch": 0.9314725697786334, + "grad_norm": 1.1575534343719482, + "learning_rate": 0.00011081967655242668, + "loss": 0.966, + "step": 24195 + }, + { + "epoch": 0.931665062560154, + "grad_norm": 1.2462104558944702, + "learning_rate": 0.00011078961397397276, + "loss": 1.0768, + "step": 24200 + }, + { + "epoch": 0.9318575553416747, + "grad_norm": 1.0118372440338135, + "learning_rate": 0.00011075955040888011, + "loss": 1.033, + "step": 24205 + }, + { + "epoch": 0.9320500481231954, + "grad_norm": 1.2791298627853394, + "learning_rate": 0.00011072948585989789, + "loss": 0.9917, + "step": 24210 + }, + { + "epoch": 0.932242540904716, + "grad_norm": 1.0876747369766235, + "learning_rate": 0.00011069942032977522, + "loss": 1.1617, + "step": 24215 + }, + { + "epoch": 0.9324350336862368, + "grad_norm": 2.205045223236084, + "learning_rate": 0.00011066935382126144, + "loss": 0.996, + "step": 24220 + }, + { + "epoch": 0.9326275264677575, + "grad_norm": 1.4552040100097656, + "learning_rate": 0.00011063928633710596, + "loss": 1.0561, + "step": 24225 + }, + { + "epoch": 0.9328200192492782, + "grad_norm": 1.6502306461334229, + "learning_rate": 0.0001106092178800582, + "loss": 1.0455, + "step": 24230 + }, + { + "epoch": 0.9330125120307988, + "grad_norm": 1.861480474472046, + "learning_rate": 0.00011057914845286777, + "loss": 0.9918, + "step": 24235 + }, + { + "epoch": 0.9332050048123195, + "grad_norm": 1.1051427125930786, + "learning_rate": 0.00011054907805828427, + "loss": 1.0833, + "step": 24240 + }, + { + "epoch": 0.9333974975938403, + "grad_norm": 1.3723036050796509, + "learning_rate": 0.00011051900669905748, + "loss": 1.1338, + "step": 24245 + }, + { + "epoch": 0.933589990375361, + "grad_norm": 2.1225595474243164, + "learning_rate": 0.00011048893437793721, + "loss": 1.0879, + "step": 24250 + }, + { + "epoch": 0.9337824831568816, + "grad_norm": 1.4194490909576416, + "learning_rate": 0.00011045886109767336, + "loss": 1.1349, + "step": 24255 + }, + { + "epoch": 0.9339749759384023, + "grad_norm": 1.5220705270767212, + "learning_rate": 0.00011042878686101597, + "loss": 1.0283, + "step": 24260 + }, + { + "epoch": 0.934167468719923, + "grad_norm": 1.7352094650268555, + "learning_rate": 0.00011039871167071507, + "loss": 1.2378, + "step": 24265 + }, + { + "epoch": 0.9343599615014437, + "grad_norm": 2.2150251865386963, + "learning_rate": 0.00011036863552952088, + "loss": 1.1452, + "step": 24270 + }, + { + "epoch": 0.9345524542829644, + "grad_norm": 0.9917442202568054, + "learning_rate": 0.00011033855844018368, + "loss": 0.8594, + "step": 24275 + }, + { + "epoch": 0.9347449470644851, + "grad_norm": 1.0165492296218872, + "learning_rate": 0.00011030848040545378, + "loss": 1.2048, + "step": 24280 + }, + { + "epoch": 0.9349374398460057, + "grad_norm": 1.1605945825576782, + "learning_rate": 0.00011027840142808163, + "loss": 1.0497, + "step": 24285 + }, + { + "epoch": 0.9351299326275264, + "grad_norm": 1.9988510608673096, + "learning_rate": 0.00011024832151081778, + "loss": 1.1581, + "step": 24290 + }, + { + "epoch": 0.9353224254090472, + "grad_norm": 1.6264874935150146, + "learning_rate": 0.0001102182406564128, + "loss": 1.089, + "step": 24295 + }, + { + "epoch": 0.9355149181905679, + "grad_norm": 1.006515622138977, + "learning_rate": 0.0001101881588676174, + "loss": 0.9477, + "step": 24300 + }, + { + "epoch": 0.9357074109720885, + "grad_norm": 1.7896440029144287, + "learning_rate": 0.00011015807614718236, + "loss": 0.9834, + "step": 24305 + }, + { + "epoch": 0.9358999037536092, + "grad_norm": 1.7942471504211426, + "learning_rate": 0.00011012799249785854, + "loss": 1.1321, + "step": 24310 + }, + { + "epoch": 0.9360923965351299, + "grad_norm": 1.4186915159225464, + "learning_rate": 0.00011009790792239692, + "loss": 1.1373, + "step": 24315 + }, + { + "epoch": 0.9362848893166507, + "grad_norm": 0.9895558953285217, + "learning_rate": 0.00011006782242354852, + "loss": 1.0627, + "step": 24320 + }, + { + "epoch": 0.9364773820981713, + "grad_norm": 1.438604474067688, + "learning_rate": 0.00011003773600406442, + "loss": 1.0535, + "step": 24325 + }, + { + "epoch": 0.936669874879692, + "grad_norm": 1.1675355434417725, + "learning_rate": 0.00011000764866669586, + "loss": 1.1195, + "step": 24330 + }, + { + "epoch": 0.9368623676612127, + "grad_norm": 1.4219048023223877, + "learning_rate": 0.00010997756041419416, + "loss": 1.0578, + "step": 24335 + }, + { + "epoch": 0.9370548604427335, + "grad_norm": 2.0969905853271484, + "learning_rate": 0.00010994747124931062, + "loss": 1.1122, + "step": 24340 + }, + { + "epoch": 0.9372473532242541, + "grad_norm": 1.433214545249939, + "learning_rate": 0.00010991738117479673, + "loss": 1.0633, + "step": 24345 + }, + { + "epoch": 0.9374398460057748, + "grad_norm": 1.5070579051971436, + "learning_rate": 0.00010988729019340407, + "loss": 0.9797, + "step": 24350 + }, + { + "epoch": 0.9376323387872955, + "grad_norm": 0.9930442571640015, + "learning_rate": 0.00010985719830788417, + "loss": 0.9648, + "step": 24355 + }, + { + "epoch": 0.9378248315688161, + "grad_norm": 1.7221930027008057, + "learning_rate": 0.00010982710552098883, + "loss": 0.9712, + "step": 24360 + }, + { + "epoch": 0.9380173243503369, + "grad_norm": 1.0316636562347412, + "learning_rate": 0.00010979701183546976, + "loss": 1.0854, + "step": 24365 + }, + { + "epoch": 0.9382098171318576, + "grad_norm": 1.5038282871246338, + "learning_rate": 0.00010976691725407886, + "loss": 1.1366, + "step": 24370 + }, + { + "epoch": 0.9384023099133783, + "grad_norm": 1.6560473442077637, + "learning_rate": 0.00010973682177956808, + "loss": 1.1497, + "step": 24375 + }, + { + "epoch": 0.9385948026948989, + "grad_norm": 1.3170095682144165, + "learning_rate": 0.00010970672541468943, + "loss": 1.1246, + "step": 24380 + }, + { + "epoch": 0.9387872954764196, + "grad_norm": 1.8806647062301636, + "learning_rate": 0.00010967662816219506, + "loss": 1.1005, + "step": 24385 + }, + { + "epoch": 0.9389797882579404, + "grad_norm": 1.4266135692596436, + "learning_rate": 0.00010964653002483713, + "loss": 1.0151, + "step": 24390 + }, + { + "epoch": 0.939172281039461, + "grad_norm": 2.0234365463256836, + "learning_rate": 0.00010961643100536794, + "loss": 1.1368, + "step": 24395 + }, + { + "epoch": 0.9393647738209817, + "grad_norm": 1.403439998626709, + "learning_rate": 0.00010958633110653987, + "loss": 1.2229, + "step": 24400 + }, + { + "epoch": 0.9395572666025024, + "grad_norm": 1.2399364709854126, + "learning_rate": 0.0001095562303311053, + "loss": 1.0342, + "step": 24405 + }, + { + "epoch": 0.939749759384023, + "grad_norm": 1.1271092891693115, + "learning_rate": 0.00010952612868181673, + "loss": 1.1703, + "step": 24410 + }, + { + "epoch": 0.9399422521655438, + "grad_norm": 1.1748933792114258, + "learning_rate": 0.00010949602616142685, + "loss": 1.1191, + "step": 24415 + }, + { + "epoch": 0.9401347449470645, + "grad_norm": 1.4523221254348755, + "learning_rate": 0.00010946592277268825, + "loss": 1.1226, + "step": 24420 + }, + { + "epoch": 0.9403272377285852, + "grad_norm": 2.0243780612945557, + "learning_rate": 0.00010943581851835373, + "loss": 1.1353, + "step": 24425 + }, + { + "epoch": 0.9405197305101058, + "grad_norm": 1.318712830543518, + "learning_rate": 0.00010940571340117613, + "loss": 1.1688, + "step": 24430 + }, + { + "epoch": 0.9407122232916265, + "grad_norm": 1.6720144748687744, + "learning_rate": 0.00010937560742390833, + "loss": 1.1818, + "step": 24435 + }, + { + "epoch": 0.9409047160731473, + "grad_norm": 0.8121243119239807, + "learning_rate": 0.00010934550058930336, + "loss": 0.9643, + "step": 24440 + }, + { + "epoch": 0.941097208854668, + "grad_norm": 1.427445650100708, + "learning_rate": 0.00010931539290011425, + "loss": 1.0619, + "step": 24445 + }, + { + "epoch": 0.9412897016361886, + "grad_norm": 1.4542162418365479, + "learning_rate": 0.00010928528435909415, + "loss": 1.278, + "step": 24450 + }, + { + "epoch": 0.9414821944177093, + "grad_norm": 0.9350261688232422, + "learning_rate": 0.00010925517496899633, + "loss": 1.1934, + "step": 24455 + }, + { + "epoch": 0.94167468719923, + "grad_norm": 0.9128903150558472, + "learning_rate": 0.00010922506473257408, + "loss": 0.991, + "step": 24460 + }, + { + "epoch": 0.9418671799807508, + "grad_norm": 2.040079355239868, + "learning_rate": 0.00010919495365258077, + "loss": 1.2565, + "step": 24465 + }, + { + "epoch": 0.9420596727622714, + "grad_norm": 1.4088531732559204, + "learning_rate": 0.00010916484173176984, + "loss": 1.0567, + "step": 24470 + }, + { + "epoch": 0.9422521655437921, + "grad_norm": 1.0955448150634766, + "learning_rate": 0.00010913472897289485, + "loss": 0.9855, + "step": 24475 + }, + { + "epoch": 0.9424446583253128, + "grad_norm": 1.6121997833251953, + "learning_rate": 0.00010910461537870942, + "loss": 1.0122, + "step": 24480 + }, + { + "epoch": 0.9426371511068335, + "grad_norm": 1.548582911491394, + "learning_rate": 0.00010907450095196718, + "loss": 0.9924, + "step": 24485 + }, + { + "epoch": 0.9428296438883542, + "grad_norm": 1.2104709148406982, + "learning_rate": 0.00010904438569542202, + "loss": 1.0338, + "step": 24490 + }, + { + "epoch": 0.9430221366698749, + "grad_norm": 1.3403939008712769, + "learning_rate": 0.00010901426961182764, + "loss": 1.0474, + "step": 24495 + }, + { + "epoch": 0.9432146294513956, + "grad_norm": 1.8973404169082642, + "learning_rate": 0.00010898415270393802, + "loss": 0.9729, + "step": 24500 + }, + { + "epoch": 0.9434071222329162, + "grad_norm": 0.9071372747421265, + "learning_rate": 0.00010895403497450716, + "loss": 1.2973, + "step": 24505 + }, + { + "epoch": 0.943599615014437, + "grad_norm": 2.5191125869750977, + "learning_rate": 0.00010892391642628912, + "loss": 1.1838, + "step": 24510 + }, + { + "epoch": 0.9437921077959577, + "grad_norm": 1.3101541996002197, + "learning_rate": 0.00010889379706203804, + "loss": 1.0826, + "step": 24515 + }, + { + "epoch": 0.9439846005774783, + "grad_norm": 1.1167513132095337, + "learning_rate": 0.00010886367688450811, + "loss": 0.9787, + "step": 24520 + }, + { + "epoch": 0.944177093358999, + "grad_norm": 1.8214826583862305, + "learning_rate": 0.0001088335558964537, + "loss": 1.1172, + "step": 24525 + }, + { + "epoch": 0.9443695861405197, + "grad_norm": 0.8352447748184204, + "learning_rate": 0.00010880343410062908, + "loss": 0.9896, + "step": 24530 + }, + { + "epoch": 0.9445620789220405, + "grad_norm": 1.6729201078414917, + "learning_rate": 0.00010877331149978873, + "loss": 1.1613, + "step": 24535 + }, + { + "epoch": 0.9447545717035611, + "grad_norm": 1.173158049583435, + "learning_rate": 0.00010874318809668718, + "loss": 1.206, + "step": 24540 + }, + { + "epoch": 0.9449470644850818, + "grad_norm": 2.4930291175842285, + "learning_rate": 0.00010871306389407898, + "loss": 1.2144, + "step": 24545 + }, + { + "epoch": 0.9451395572666025, + "grad_norm": 0.8936457633972168, + "learning_rate": 0.00010868293889471881, + "loss": 1.0495, + "step": 24550 + }, + { + "epoch": 0.9453320500481232, + "grad_norm": 1.9174963235855103, + "learning_rate": 0.00010865281310136142, + "loss": 1.2971, + "step": 24555 + }, + { + "epoch": 0.9455245428296439, + "grad_norm": 0.8766136169433594, + "learning_rate": 0.00010862268651676155, + "loss": 1.3837, + "step": 24560 + }, + { + "epoch": 0.9457170356111646, + "grad_norm": 1.577832818031311, + "learning_rate": 0.00010859255914367414, + "loss": 0.9897, + "step": 24565 + }, + { + "epoch": 0.9459095283926853, + "grad_norm": 1.1107361316680908, + "learning_rate": 0.00010856243098485412, + "loss": 1.0474, + "step": 24570 + }, + { + "epoch": 0.9461020211742059, + "grad_norm": 1.0004538297653198, + "learning_rate": 0.00010853230204305651, + "loss": 1.0261, + "step": 24575 + }, + { + "epoch": 0.9462945139557266, + "grad_norm": 0.9005370736122131, + "learning_rate": 0.00010850217232103639, + "loss": 0.9002, + "step": 24580 + }, + { + "epoch": 0.9464870067372474, + "grad_norm": 0.3695490062236786, + "learning_rate": 0.00010847204182154895, + "loss": 0.9869, + "step": 24585 + }, + { + "epoch": 0.9466794995187681, + "grad_norm": 0.9990583658218384, + "learning_rate": 0.00010844191054734938, + "loss": 1.0765, + "step": 24590 + }, + { + "epoch": 0.9468719923002887, + "grad_norm": 1.0551097393035889, + "learning_rate": 0.00010841177850119301, + "loss": 1.0853, + "step": 24595 + }, + { + "epoch": 0.9470644850818094, + "grad_norm": 1.0011540651321411, + "learning_rate": 0.00010838164568583526, + "loss": 1.2651, + "step": 24600 + }, + { + "epoch": 0.9472569778633301, + "grad_norm": 1.1266576051712036, + "learning_rate": 0.0001083515121040315, + "loss": 0.9916, + "step": 24605 + }, + { + "epoch": 0.9474494706448509, + "grad_norm": 1.543762445449829, + "learning_rate": 0.00010832137775853728, + "loss": 1.2188, + "step": 24610 + }, + { + "epoch": 0.9476419634263715, + "grad_norm": 1.1634615659713745, + "learning_rate": 0.00010829124265210822, + "loss": 0.953, + "step": 24615 + }, + { + "epoch": 0.9478344562078922, + "grad_norm": 2.0008814334869385, + "learning_rate": 0.00010826110678749992, + "loss": 1.1433, + "step": 24620 + }, + { + "epoch": 0.9480269489894129, + "grad_norm": 1.6230239868164062, + "learning_rate": 0.00010823097016746813, + "loss": 1.0228, + "step": 24625 + }, + { + "epoch": 0.9482194417709335, + "grad_norm": 0.9347220659255981, + "learning_rate": 0.00010820083279476865, + "loss": 1.0337, + "step": 24630 + }, + { + "epoch": 0.9484119345524543, + "grad_norm": 1.8142660856246948, + "learning_rate": 0.00010817069467215732, + "loss": 1.0862, + "step": 24635 + }, + { + "epoch": 0.948604427333975, + "grad_norm": 1.6012142896652222, + "learning_rate": 0.0001081405558023901, + "loss": 1.0592, + "step": 24640 + }, + { + "epoch": 0.9487969201154957, + "grad_norm": 2.480301856994629, + "learning_rate": 0.00010811041618822297, + "loss": 1.1688, + "step": 24645 + }, + { + "epoch": 0.9489894128970163, + "grad_norm": 1.339879035949707, + "learning_rate": 0.00010808027583241203, + "loss": 1.1535, + "step": 24650 + }, + { + "epoch": 0.9491819056785371, + "grad_norm": 2.2561163902282715, + "learning_rate": 0.00010805013473771337, + "loss": 1.1379, + "step": 24655 + }, + { + "epoch": 0.9493743984600578, + "grad_norm": 0.9310626983642578, + "learning_rate": 0.00010801999290688323, + "loss": 1.117, + "step": 24660 + }, + { + "epoch": 0.9495668912415784, + "grad_norm": 1.099223256111145, + "learning_rate": 0.00010798985034267786, + "loss": 0.9384, + "step": 24665 + }, + { + "epoch": 0.9497593840230991, + "grad_norm": 1.5492980480194092, + "learning_rate": 0.0001079597070478536, + "loss": 1.0908, + "step": 24670 + }, + { + "epoch": 0.9499518768046198, + "grad_norm": 0.9033131003379822, + "learning_rate": 0.00010792956302516688, + "loss": 1.0303, + "step": 24675 + }, + { + "epoch": 0.9501443695861406, + "grad_norm": 1.0277695655822754, + "learning_rate": 0.00010789941827737411, + "loss": 1.1463, + "step": 24680 + }, + { + "epoch": 0.9503368623676612, + "grad_norm": 1.6218613386154175, + "learning_rate": 0.00010786927280723192, + "loss": 1.0647, + "step": 24685 + }, + { + "epoch": 0.9505293551491819, + "grad_norm": 0.9777045249938965, + "learning_rate": 0.00010783912661749682, + "loss": 0.9672, + "step": 24690 + }, + { + "epoch": 0.9507218479307026, + "grad_norm": 1.7008765935897827, + "learning_rate": 0.00010780897971092554, + "loss": 1.0837, + "step": 24695 + }, + { + "epoch": 0.9509143407122232, + "grad_norm": 1.286529779434204, + "learning_rate": 0.00010777883209027477, + "loss": 1.0237, + "step": 24700 + }, + { + "epoch": 0.951106833493744, + "grad_norm": 1.0446587800979614, + "learning_rate": 0.00010774868375830133, + "loss": 1.1015, + "step": 24705 + }, + { + "epoch": 0.9512993262752647, + "grad_norm": 1.1524755954742432, + "learning_rate": 0.00010771853471776215, + "loss": 1.0399, + "step": 24710 + }, + { + "epoch": 0.9514918190567854, + "grad_norm": 1.3211500644683838, + "learning_rate": 0.00010768838497141404, + "loss": 1.1109, + "step": 24715 + }, + { + "epoch": 0.951684311838306, + "grad_norm": 1.840817928314209, + "learning_rate": 0.00010765823452201406, + "loss": 1.1012, + "step": 24720 + }, + { + "epoch": 0.9518768046198267, + "grad_norm": 1.2010953426361084, + "learning_rate": 0.00010762808337231931, + "loss": 0.9257, + "step": 24725 + }, + { + "epoch": 0.9520692974013475, + "grad_norm": 1.1567620038986206, + "learning_rate": 0.00010759793152508684, + "loss": 1.1649, + "step": 24730 + }, + { + "epoch": 0.9522617901828682, + "grad_norm": 1.1958009004592896, + "learning_rate": 0.00010756777898307384, + "loss": 1.0035, + "step": 24735 + }, + { + "epoch": 0.9524542829643888, + "grad_norm": 1.7795593738555908, + "learning_rate": 0.00010753762574903763, + "loss": 1.1245, + "step": 24740 + }, + { + "epoch": 0.9526467757459095, + "grad_norm": 1.2421553134918213, + "learning_rate": 0.00010750747182573544, + "loss": 1.1169, + "step": 24745 + }, + { + "epoch": 0.9528392685274302, + "grad_norm": 2.212799549102783, + "learning_rate": 0.00010747731721592469, + "loss": 1.1195, + "step": 24750 + }, + { + "epoch": 0.953031761308951, + "grad_norm": 1.3483097553253174, + "learning_rate": 0.00010744716192236284, + "loss": 1.0309, + "step": 24755 + }, + { + "epoch": 0.9532242540904716, + "grad_norm": 0.8752015233039856, + "learning_rate": 0.00010741700594780734, + "loss": 0.9972, + "step": 24760 + }, + { + "epoch": 0.9534167468719923, + "grad_norm": 0.778307318687439, + "learning_rate": 0.00010738684929501576, + "loss": 0.9989, + "step": 24765 + }, + { + "epoch": 0.953609239653513, + "grad_norm": 1.3148306608200073, + "learning_rate": 0.00010735669196674578, + "loss": 1.0519, + "step": 24770 + }, + { + "epoch": 0.9538017324350336, + "grad_norm": 1.7633293867111206, + "learning_rate": 0.00010732653396575504, + "loss": 1.0377, + "step": 24775 + }, + { + "epoch": 0.9539942252165544, + "grad_norm": 1.5333178043365479, + "learning_rate": 0.00010729637529480132, + "loss": 0.9923, + "step": 24780 + }, + { + "epoch": 0.9541867179980751, + "grad_norm": 1.2737149000167847, + "learning_rate": 0.0001072662159566424, + "loss": 1.2048, + "step": 24785 + }, + { + "epoch": 0.9543792107795958, + "grad_norm": 1.3000686168670654, + "learning_rate": 0.00010723605595403616, + "loss": 0.8916, + "step": 24790 + }, + { + "epoch": 0.9545717035611164, + "grad_norm": 1.2083548307418823, + "learning_rate": 0.00010720589528974056, + "loss": 1.0176, + "step": 24795 + }, + { + "epoch": 0.9547641963426372, + "grad_norm": 1.8047492504119873, + "learning_rate": 0.00010717573396651355, + "loss": 1.094, + "step": 24800 + }, + { + "epoch": 0.9549566891241579, + "grad_norm": 1.2499585151672363, + "learning_rate": 0.00010714557198711321, + "loss": 1.2239, + "step": 24805 + }, + { + "epoch": 0.9551491819056785, + "grad_norm": 1.58943510055542, + "learning_rate": 0.00010711540935429764, + "loss": 1.1283, + "step": 24810 + }, + { + "epoch": 0.9553416746871992, + "grad_norm": 1.0315110683441162, + "learning_rate": 0.00010708524607082502, + "loss": 0.9449, + "step": 24815 + }, + { + "epoch": 0.9555341674687199, + "grad_norm": 1.182955265045166, + "learning_rate": 0.00010705508213945362, + "loss": 0.9075, + "step": 24820 + }, + { + "epoch": 0.9557266602502407, + "grad_norm": 1.0133094787597656, + "learning_rate": 0.00010702491756294164, + "loss": 1.0881, + "step": 24825 + }, + { + "epoch": 0.9559191530317613, + "grad_norm": 1.5667768716812134, + "learning_rate": 0.00010699475234404749, + "loss": 1.0341, + "step": 24830 + }, + { + "epoch": 0.956111645813282, + "grad_norm": 1.2236697673797607, + "learning_rate": 0.0001069645864855296, + "loss": 1.1377, + "step": 24835 + }, + { + "epoch": 0.9563041385948027, + "grad_norm": 1.6470654010772705, + "learning_rate": 0.0001069344199901464, + "loss": 1.1478, + "step": 24840 + }, + { + "epoch": 0.9564966313763233, + "grad_norm": 0.8241544365882874, + "learning_rate": 0.00010690425286065642, + "loss": 1.018, + "step": 24845 + }, + { + "epoch": 0.9566891241578441, + "grad_norm": 1.580249547958374, + "learning_rate": 0.00010687408509981827, + "loss": 1.2341, + "step": 24850 + }, + { + "epoch": 0.9568816169393648, + "grad_norm": 2.250305414199829, + "learning_rate": 0.00010684391671039056, + "loss": 1.0795, + "step": 24855 + }, + { + "epoch": 0.9570741097208855, + "grad_norm": 1.0578948259353638, + "learning_rate": 0.000106813747695132, + "loss": 1.0914, + "step": 24860 + }, + { + "epoch": 0.9572666025024061, + "grad_norm": 1.2390888929367065, + "learning_rate": 0.00010678357805680137, + "loss": 1.2108, + "step": 24865 + }, + { + "epoch": 0.9574590952839268, + "grad_norm": 2.08738112449646, + "learning_rate": 0.00010675340779815745, + "loss": 1.064, + "step": 24870 + }, + { + "epoch": 0.9576515880654476, + "grad_norm": 1.882812738418579, + "learning_rate": 0.00010672323692195912, + "loss": 0.9825, + "step": 24875 + }, + { + "epoch": 0.9578440808469683, + "grad_norm": 1.5878349542617798, + "learning_rate": 0.00010669306543096534, + "loss": 1.1894, + "step": 24880 + }, + { + "epoch": 0.9580365736284889, + "grad_norm": 1.1730328798294067, + "learning_rate": 0.00010666289332793503, + "loss": 1.0882, + "step": 24885 + }, + { + "epoch": 0.9582290664100096, + "grad_norm": 1.14130699634552, + "learning_rate": 0.00010663272061562726, + "loss": 0.9501, + "step": 24890 + }, + { + "epoch": 0.9584215591915303, + "grad_norm": 1.1763651371002197, + "learning_rate": 0.00010660254729680117, + "loss": 1.1355, + "step": 24895 + }, + { + "epoch": 0.958614051973051, + "grad_norm": 1.5443150997161865, + "learning_rate": 0.00010657237337421582, + "loss": 0.8956, + "step": 24900 + }, + { + "epoch": 0.9588065447545717, + "grad_norm": 1.0410224199295044, + "learning_rate": 0.0001065421988506305, + "loss": 1.0776, + "step": 24905 + }, + { + "epoch": 0.9589990375360924, + "grad_norm": 0.9984595775604248, + "learning_rate": 0.0001065120237288044, + "loss": 1.1329, + "step": 24910 + }, + { + "epoch": 0.9591915303176131, + "grad_norm": 1.8508329391479492, + "learning_rate": 0.00010648184801149689, + "loss": 1.0724, + "step": 24915 + }, + { + "epoch": 0.9593840230991337, + "grad_norm": 2.0840296745300293, + "learning_rate": 0.00010645167170146733, + "loss": 1.0316, + "step": 24920 + }, + { + "epoch": 0.9595765158806545, + "grad_norm": 1.6280990839004517, + "learning_rate": 0.00010642149480147509, + "loss": 0.9811, + "step": 24925 + }, + { + "epoch": 0.9597690086621752, + "grad_norm": 1.7951642274856567, + "learning_rate": 0.00010639131731427974, + "loss": 1.0428, + "step": 24930 + }, + { + "epoch": 0.9599615014436959, + "grad_norm": 1.0857096910476685, + "learning_rate": 0.00010636113924264073, + "loss": 0.9805, + "step": 24935 + }, + { + "epoch": 0.9601539942252165, + "grad_norm": 1.919179916381836, + "learning_rate": 0.00010633096058931766, + "loss": 1.0721, + "step": 24940 + }, + { + "epoch": 0.9603464870067372, + "grad_norm": 0.9968999028205872, + "learning_rate": 0.0001063007813570702, + "loss": 1.0928, + "step": 24945 + }, + { + "epoch": 0.960538979788258, + "grad_norm": 1.4042326211929321, + "learning_rate": 0.00010627060154865802, + "loss": 1.1576, + "step": 24950 + }, + { + "epoch": 0.9607314725697786, + "grad_norm": 1.0694444179534912, + "learning_rate": 0.00010624042116684088, + "loss": 1.0468, + "step": 24955 + }, + { + "epoch": 0.9609239653512993, + "grad_norm": 1.127734899520874, + "learning_rate": 0.00010621024021437855, + "loss": 1.1967, + "step": 24960 + }, + { + "epoch": 0.96111645813282, + "grad_norm": 0.9498047232627869, + "learning_rate": 0.0001061800586940309, + "loss": 1.0848, + "step": 24965 + }, + { + "epoch": 0.9613089509143408, + "grad_norm": 1.6244068145751953, + "learning_rate": 0.0001061498766085578, + "loss": 1.1116, + "step": 24970 + }, + { + "epoch": 0.9615014436958614, + "grad_norm": 1.0797539949417114, + "learning_rate": 0.00010611969396071926, + "loss": 1.025, + "step": 24975 + }, + { + "epoch": 0.9616939364773821, + "grad_norm": 1.1946651935577393, + "learning_rate": 0.00010608951075327522, + "loss": 1.3113, + "step": 24980 + }, + { + "epoch": 0.9618864292589028, + "grad_norm": 0.915337324142456, + "learning_rate": 0.00010605932698898576, + "loss": 1.1864, + "step": 24985 + }, + { + "epoch": 0.9620789220404234, + "grad_norm": 0.821142315864563, + "learning_rate": 0.00010602914267061101, + "loss": 0.9772, + "step": 24990 + }, + { + "epoch": 0.9622714148219442, + "grad_norm": 1.0821641683578491, + "learning_rate": 0.00010599895780091106, + "loss": 0.9488, + "step": 24995 + }, + { + "epoch": 0.9624639076034649, + "grad_norm": 1.600436806678772, + "learning_rate": 0.0001059687723826462, + "loss": 1.1065, + "step": 25000 + }, + { + "epoch": 0.9626564003849856, + "grad_norm": 1.296973466873169, + "learning_rate": 0.00010593858641857664, + "loss": 1.0329, + "step": 25005 + }, + { + "epoch": 0.9628488931665062, + "grad_norm": 1.2150559425354004, + "learning_rate": 0.00010590839991146269, + "loss": 1.176, + "step": 25010 + }, + { + "epoch": 0.9630413859480269, + "grad_norm": 1.5039284229278564, + "learning_rate": 0.00010587821286406469, + "loss": 1.055, + "step": 25015 + }, + { + "epoch": 0.9632338787295477, + "grad_norm": 1.5378166437149048, + "learning_rate": 0.00010584802527914308, + "loss": 1.1067, + "step": 25020 + }, + { + "epoch": 0.9634263715110684, + "grad_norm": 1.1952167749404907, + "learning_rate": 0.0001058178371594583, + "loss": 1.1889, + "step": 25025 + }, + { + "epoch": 0.963618864292589, + "grad_norm": 1.8712360858917236, + "learning_rate": 0.00010578764850777084, + "loss": 1.2401, + "step": 25030 + }, + { + "epoch": 0.9638113570741097, + "grad_norm": 1.0512574911117554, + "learning_rate": 0.00010575745932684125, + "loss": 1.1021, + "step": 25035 + }, + { + "epoch": 0.9640038498556304, + "grad_norm": 1.1200644969940186, + "learning_rate": 0.00010572726961943017, + "loss": 1.1023, + "step": 25040 + }, + { + "epoch": 0.9641963426371511, + "grad_norm": 1.448231816291809, + "learning_rate": 0.00010569707938829821, + "loss": 0.979, + "step": 25045 + }, + { + "epoch": 0.9643888354186718, + "grad_norm": 2.4415132999420166, + "learning_rate": 0.00010566688863620608, + "loss": 1.037, + "step": 25050 + }, + { + "epoch": 0.9645813282001925, + "grad_norm": 1.2661056518554688, + "learning_rate": 0.00010563669736591453, + "loss": 1.2452, + "step": 25055 + }, + { + "epoch": 0.9647738209817132, + "grad_norm": 1.1040503978729248, + "learning_rate": 0.00010560650558018434, + "loss": 1.0118, + "step": 25060 + }, + { + "epoch": 0.9649663137632338, + "grad_norm": 1.2488709688186646, + "learning_rate": 0.00010557631328177636, + "loss": 1.0467, + "step": 25065 + }, + { + "epoch": 0.9651588065447546, + "grad_norm": 1.267828106880188, + "learning_rate": 0.00010554612047345147, + "loss": 0.9722, + "step": 25070 + }, + { + "epoch": 0.9653512993262753, + "grad_norm": 1.5606194734573364, + "learning_rate": 0.00010551592715797058, + "loss": 1.0212, + "step": 25075 + }, + { + "epoch": 0.965543792107796, + "grad_norm": 1.2547963857650757, + "learning_rate": 0.0001054857333380947, + "loss": 0.9649, + "step": 25080 + }, + { + "epoch": 0.9657362848893166, + "grad_norm": 1.1586534976959229, + "learning_rate": 0.00010545553901658486, + "loss": 1.1205, + "step": 25085 + }, + { + "epoch": 0.9659287776708373, + "grad_norm": 1.03237783908844, + "learning_rate": 0.00010542534419620214, + "loss": 0.9806, + "step": 25090 + }, + { + "epoch": 0.9661212704523581, + "grad_norm": 1.0828287601470947, + "learning_rate": 0.00010539514887970758, + "loss": 1.1297, + "step": 25095 + }, + { + "epoch": 0.9663137632338787, + "grad_norm": 1.0693068504333496, + "learning_rate": 0.00010536495306986243, + "loss": 0.9167, + "step": 25100 + }, + { + "epoch": 0.9665062560153994, + "grad_norm": 0.9770272374153137, + "learning_rate": 0.00010533475676942785, + "loss": 1.1324, + "step": 25105 + }, + { + "epoch": 0.9666987487969201, + "grad_norm": 1.3809349536895752, + "learning_rate": 0.00010530455998116511, + "loss": 1.2411, + "step": 25110 + }, + { + "epoch": 0.9668912415784409, + "grad_norm": 1.8168786764144897, + "learning_rate": 0.00010527436270783551, + "loss": 1.0129, + "step": 25115 + }, + { + "epoch": 0.9670837343599615, + "grad_norm": 1.3123118877410889, + "learning_rate": 0.00010524416495220035, + "loss": 1.0677, + "step": 25120 + }, + { + "epoch": 0.9672762271414822, + "grad_norm": 1.329199194908142, + "learning_rate": 0.00010521396671702106, + "loss": 1.0804, + "step": 25125 + }, + { + "epoch": 0.9674687199230029, + "grad_norm": 1.5670257806777954, + "learning_rate": 0.00010518376800505907, + "loss": 1.1608, + "step": 25130 + }, + { + "epoch": 0.9676612127045235, + "grad_norm": 1.2058695554733276, + "learning_rate": 0.00010515356881907581, + "loss": 1.2903, + "step": 25135 + }, + { + "epoch": 0.9678537054860443, + "grad_norm": 1.4438813924789429, + "learning_rate": 0.0001051233691618328, + "loss": 1.2552, + "step": 25140 + }, + { + "epoch": 0.968046198267565, + "grad_norm": 1.1188582181930542, + "learning_rate": 0.00010509316903609167, + "loss": 1.1011, + "step": 25145 + }, + { + "epoch": 0.9682386910490857, + "grad_norm": 1.0020146369934082, + "learning_rate": 0.00010506296844461394, + "loss": 1.3431, + "step": 25150 + }, + { + "epoch": 0.9684311838306063, + "grad_norm": 1.0120351314544678, + "learning_rate": 0.00010503276739016128, + "loss": 1.0744, + "step": 25155 + }, + { + "epoch": 0.968623676612127, + "grad_norm": 1.0021151304244995, + "learning_rate": 0.0001050025658754954, + "loss": 1.0356, + "step": 25160 + }, + { + "epoch": 0.9688161693936478, + "grad_norm": 1.2012884616851807, + "learning_rate": 0.00010497236390337801, + "loss": 1.0556, + "step": 25165 + }, + { + "epoch": 0.9690086621751685, + "grad_norm": 2.0097708702087402, + "learning_rate": 0.00010494216147657086, + "loss": 1.1924, + "step": 25170 + }, + { + "epoch": 0.9692011549566891, + "grad_norm": 1.8217684030532837, + "learning_rate": 0.0001049119585978358, + "loss": 1.1365, + "step": 25175 + }, + { + "epoch": 0.9693936477382098, + "grad_norm": 2.422804832458496, + "learning_rate": 0.00010488175526993466, + "loss": 1.1675, + "step": 25180 + }, + { + "epoch": 0.9695861405197305, + "grad_norm": 1.59537672996521, + "learning_rate": 0.00010485155149562933, + "loss": 1.018, + "step": 25185 + }, + { + "epoch": 0.9697786333012512, + "grad_norm": 1.0549684762954712, + "learning_rate": 0.00010482134727768175, + "loss": 1.076, + "step": 25190 + }, + { + "epoch": 0.9699711260827719, + "grad_norm": 1.1994887590408325, + "learning_rate": 0.00010479114261885395, + "loss": 1.0012, + "step": 25195 + }, + { + "epoch": 0.9701636188642926, + "grad_norm": 1.6719814538955688, + "learning_rate": 0.00010476093752190784, + "loss": 1.1797, + "step": 25200 + }, + { + "epoch": 0.9703561116458133, + "grad_norm": 1.5435569286346436, + "learning_rate": 0.00010473073198960555, + "loss": 1.0019, + "step": 25205 + }, + { + "epoch": 0.9705486044273339, + "grad_norm": 1.0406394004821777, + "learning_rate": 0.00010470052602470917, + "loss": 1.0301, + "step": 25210 + }, + { + "epoch": 0.9707410972088547, + "grad_norm": 0.8994914889335632, + "learning_rate": 0.0001046703196299808, + "loss": 0.9683, + "step": 25215 + }, + { + "epoch": 0.9709335899903754, + "grad_norm": 1.6628937721252441, + "learning_rate": 0.00010464011280818266, + "loss": 1.2508, + "step": 25220 + }, + { + "epoch": 0.971126082771896, + "grad_norm": 1.7515249252319336, + "learning_rate": 0.00010460990556207693, + "loss": 1.3141, + "step": 25225 + }, + { + "epoch": 0.9713185755534167, + "grad_norm": 1.4734959602355957, + "learning_rate": 0.00010457969789442587, + "loss": 1.0589, + "step": 25230 + }, + { + "epoch": 0.9715110683349374, + "grad_norm": 1.102239966392517, + "learning_rate": 0.00010454948980799179, + "loss": 1.0115, + "step": 25235 + }, + { + "epoch": 0.9717035611164582, + "grad_norm": 1.9879227876663208, + "learning_rate": 0.000104519281305537, + "loss": 0.9781, + "step": 25240 + }, + { + "epoch": 0.9718960538979788, + "grad_norm": 1.2674641609191895, + "learning_rate": 0.00010448907238982387, + "loss": 1.1459, + "step": 25245 + }, + { + "epoch": 0.9720885466794995, + "grad_norm": 1.922810673713684, + "learning_rate": 0.00010445886306361479, + "loss": 1.0625, + "step": 25250 + }, + { + "epoch": 0.9722810394610202, + "grad_norm": 1.0224851369857788, + "learning_rate": 0.00010442865332967225, + "loss": 0.9894, + "step": 25255 + }, + { + "epoch": 0.9724735322425409, + "grad_norm": 1.3521889448165894, + "learning_rate": 0.00010439844319075868, + "loss": 1.0474, + "step": 25260 + }, + { + "epoch": 0.9726660250240616, + "grad_norm": 1.4089123010635376, + "learning_rate": 0.00010436823264963662, + "loss": 1.0874, + "step": 25265 + }, + { + "epoch": 0.9728585178055823, + "grad_norm": 1.0368316173553467, + "learning_rate": 0.00010433802170906863, + "loss": 1.0466, + "step": 25270 + }, + { + "epoch": 0.973051010587103, + "grad_norm": 1.541629672050476, + "learning_rate": 0.00010430781037181727, + "loss": 1.0565, + "step": 25275 + }, + { + "epoch": 0.9732435033686236, + "grad_norm": 0.7525898814201355, + "learning_rate": 0.00010427759864064521, + "loss": 0.9438, + "step": 25280 + }, + { + "epoch": 0.9734359961501444, + "grad_norm": 1.5569766759872437, + "learning_rate": 0.00010424738651831507, + "loss": 1.1061, + "step": 25285 + }, + { + "epoch": 0.9736284889316651, + "grad_norm": 1.4111804962158203, + "learning_rate": 0.0001042171740075896, + "loss": 1.2118, + "step": 25290 + }, + { + "epoch": 0.9738209817131858, + "grad_norm": 1.2276591062545776, + "learning_rate": 0.00010418696111123148, + "loss": 1.1494, + "step": 25295 + }, + { + "epoch": 0.9740134744947064, + "grad_norm": 1.320253849029541, + "learning_rate": 0.00010415674783200349, + "loss": 1.1241, + "step": 25300 + }, + { + "epoch": 0.9742059672762271, + "grad_norm": 1.6126363277435303, + "learning_rate": 0.00010412653417266849, + "loss": 1.068, + "step": 25305 + }, + { + "epoch": 0.9743984600577479, + "grad_norm": 1.2316113710403442, + "learning_rate": 0.00010409632013598924, + "loss": 1.124, + "step": 25310 + }, + { + "epoch": 0.9745909528392686, + "grad_norm": 1.3497854471206665, + "learning_rate": 0.00010406610572472866, + "loss": 1.0544, + "step": 25315 + }, + { + "epoch": 0.9747834456207892, + "grad_norm": 1.1057459115982056, + "learning_rate": 0.00010403589094164966, + "loss": 0.982, + "step": 25320 + }, + { + "epoch": 0.9749759384023099, + "grad_norm": 1.654336929321289, + "learning_rate": 0.00010400567578951515, + "loss": 1.1425, + "step": 25325 + }, + { + "epoch": 0.9751684311838306, + "grad_norm": 0.8302839398384094, + "learning_rate": 0.00010397546027108814, + "loss": 1.0543, + "step": 25330 + }, + { + "epoch": 0.9753609239653513, + "grad_norm": 3.0035483837127686, + "learning_rate": 0.00010394524438913161, + "loss": 1.0277, + "step": 25335 + }, + { + "epoch": 0.975553416746872, + "grad_norm": 1.4531986713409424, + "learning_rate": 0.00010391502814640864, + "loss": 1.1209, + "step": 25340 + }, + { + "epoch": 0.9757459095283927, + "grad_norm": 1.7825361490249634, + "learning_rate": 0.00010388481154568224, + "loss": 0.9739, + "step": 25345 + }, + { + "epoch": 0.9759384023099134, + "grad_norm": 0.9865066409111023, + "learning_rate": 0.00010385459458971558, + "loss": 1.0587, + "step": 25350 + }, + { + "epoch": 0.976130895091434, + "grad_norm": 1.5732641220092773, + "learning_rate": 0.00010382437728127176, + "loss": 1.1509, + "step": 25355 + }, + { + "epoch": 0.9763233878729548, + "grad_norm": 1.3935140371322632, + "learning_rate": 0.000103794159623114, + "loss": 1.0582, + "step": 25360 + }, + { + "epoch": 0.9765158806544755, + "grad_norm": 1.156906247138977, + "learning_rate": 0.0001037639416180055, + "loss": 0.9355, + "step": 25365 + }, + { + "epoch": 0.9767083734359961, + "grad_norm": 1.3732504844665527, + "learning_rate": 0.0001037337232687094, + "loss": 1.085, + "step": 25370 + }, + { + "epoch": 0.9769008662175168, + "grad_norm": 1.41510009765625, + "learning_rate": 0.00010370350457798907, + "loss": 1.0622, + "step": 25375 + }, + { + "epoch": 0.9770933589990375, + "grad_norm": 2.79821515083313, + "learning_rate": 0.00010367328554860783, + "loss": 1.1103, + "step": 25380 + }, + { + "epoch": 0.9772858517805583, + "grad_norm": 0.6538259387016296, + "learning_rate": 0.00010364306618332889, + "loss": 0.8355, + "step": 25385 + }, + { + "epoch": 0.9774783445620789, + "grad_norm": 1.126719355583191, + "learning_rate": 0.00010361284648491571, + "loss": 1.0243, + "step": 25390 + }, + { + "epoch": 0.9776708373435996, + "grad_norm": 1.7321900129318237, + "learning_rate": 0.00010358262645613166, + "loss": 1.1714, + "step": 25395 + }, + { + "epoch": 0.9778633301251203, + "grad_norm": 1.1369127035140991, + "learning_rate": 0.00010355240609974015, + "loss": 0.9301, + "step": 25400 + }, + { + "epoch": 0.978055822906641, + "grad_norm": 1.0569565296173096, + "learning_rate": 0.00010352218541850461, + "loss": 1.1441, + "step": 25405 + }, + { + "epoch": 0.9782483156881617, + "grad_norm": 1.2385330200195312, + "learning_rate": 0.00010349196441518855, + "loss": 1.0675, + "step": 25410 + }, + { + "epoch": 0.9784408084696824, + "grad_norm": 1.161966323852539, + "learning_rate": 0.00010346174309255552, + "loss": 1.0511, + "step": 25415 + }, + { + "epoch": 0.9786333012512031, + "grad_norm": 1.3913847208023071, + "learning_rate": 0.00010343152145336899, + "loss": 1.2011, + "step": 25420 + }, + { + "epoch": 0.9788257940327237, + "grad_norm": 2.150585412979126, + "learning_rate": 0.00010340129950039253, + "loss": 0.9328, + "step": 25425 + }, + { + "epoch": 0.9790182868142445, + "grad_norm": 0.9360420107841492, + "learning_rate": 0.00010337107723638979, + "loss": 1.0759, + "step": 25430 + }, + { + "epoch": 0.9792107795957652, + "grad_norm": 1.2746261358261108, + "learning_rate": 0.00010334085466412435, + "loss": 1.0884, + "step": 25435 + }, + { + "epoch": 0.9794032723772859, + "grad_norm": 1.4159737825393677, + "learning_rate": 0.00010331063178635991, + "loss": 0.8767, + "step": 25440 + }, + { + "epoch": 0.9795957651588065, + "grad_norm": 1.0822529792785645, + "learning_rate": 0.00010328040860586013, + "loss": 1.0087, + "step": 25445 + }, + { + "epoch": 0.9797882579403272, + "grad_norm": 1.3765298128128052, + "learning_rate": 0.00010325018512538868, + "loss": 1.1506, + "step": 25450 + }, + { + "epoch": 0.979980750721848, + "grad_norm": 1.1696947813034058, + "learning_rate": 0.00010321996134770935, + "loss": 1.1162, + "step": 25455 + }, + { + "epoch": 0.9801732435033687, + "grad_norm": 1.0629351139068604, + "learning_rate": 0.0001031897372755859, + "loss": 1.0682, + "step": 25460 + }, + { + "epoch": 0.9803657362848893, + "grad_norm": 1.7273883819580078, + "learning_rate": 0.00010315951291178208, + "loss": 1.0015, + "step": 25465 + }, + { + "epoch": 0.98055822906641, + "grad_norm": 1.154524803161621, + "learning_rate": 0.00010312928825906172, + "loss": 1.0113, + "step": 25470 + }, + { + "epoch": 0.9807507218479307, + "grad_norm": 1.5868383646011353, + "learning_rate": 0.0001030990633201887, + "loss": 1.1021, + "step": 25475 + }, + { + "epoch": 0.9809432146294514, + "grad_norm": 1.425208568572998, + "learning_rate": 0.00010306883809792687, + "loss": 1.0993, + "step": 25480 + }, + { + "epoch": 0.9811357074109721, + "grad_norm": 1.5670615434646606, + "learning_rate": 0.00010303861259504011, + "loss": 1.1032, + "step": 25485 + }, + { + "epoch": 0.9813282001924928, + "grad_norm": 0.9901431202888489, + "learning_rate": 0.00010300838681429239, + "loss": 1.0594, + "step": 25490 + }, + { + "epoch": 0.9815206929740135, + "grad_norm": 1.2887781858444214, + "learning_rate": 0.0001029781607584476, + "loss": 1.0748, + "step": 25495 + }, + { + "epoch": 0.9817131857555341, + "grad_norm": 1.1845282316207886, + "learning_rate": 0.00010294793443026974, + "loss": 1.0229, + "step": 25500 + }, + { + "epoch": 0.9819056785370549, + "grad_norm": 1.3194280862808228, + "learning_rate": 0.0001029177078325228, + "loss": 1.0796, + "step": 25505 + }, + { + "epoch": 0.9820981713185756, + "grad_norm": 1.5246963500976562, + "learning_rate": 0.0001028874809679708, + "loss": 1.2291, + "step": 25510 + }, + { + "epoch": 0.9822906641000962, + "grad_norm": 1.210331678390503, + "learning_rate": 0.00010285725383937782, + "loss": 0.9774, + "step": 25515 + }, + { + "epoch": 0.9824831568816169, + "grad_norm": 1.1501210927963257, + "learning_rate": 0.00010282702644950788, + "loss": 0.8948, + "step": 25520 + }, + { + "epoch": 0.9826756496631376, + "grad_norm": 1.2343779802322388, + "learning_rate": 0.0001027967988011251, + "loss": 1.1353, + "step": 25525 + }, + { + "epoch": 0.9828681424446584, + "grad_norm": 0.9154959321022034, + "learning_rate": 0.00010276657089699359, + "loss": 1.1039, + "step": 25530 + }, + { + "epoch": 0.983060635226179, + "grad_norm": 2.102421283721924, + "learning_rate": 0.00010273634273987754, + "loss": 1.1101, + "step": 25535 + }, + { + "epoch": 0.9832531280076997, + "grad_norm": 1.5680655241012573, + "learning_rate": 0.00010270611433254102, + "loss": 0.9902, + "step": 25540 + }, + { + "epoch": 0.9834456207892204, + "grad_norm": 0.9244110584259033, + "learning_rate": 0.0001026758856777483, + "loss": 0.9179, + "step": 25545 + }, + { + "epoch": 0.983638113570741, + "grad_norm": 0.8917207717895508, + "learning_rate": 0.00010264565677826356, + "loss": 0.987, + "step": 25550 + }, + { + "epoch": 0.9838306063522618, + "grad_norm": 1.52036452293396, + "learning_rate": 0.00010261542763685104, + "loss": 0.9879, + "step": 25555 + }, + { + "epoch": 0.9840230991337825, + "grad_norm": 1.535866618156433, + "learning_rate": 0.000102585198256275, + "loss": 1.121, + "step": 25560 + }, + { + "epoch": 0.9842155919153032, + "grad_norm": 1.3526102304458618, + "learning_rate": 0.00010255496863929965, + "loss": 1.0613, + "step": 25565 + }, + { + "epoch": 0.9844080846968238, + "grad_norm": 1.4781625270843506, + "learning_rate": 0.0001025247387886894, + "loss": 1.0461, + "step": 25570 + }, + { + "epoch": 0.9846005774783445, + "grad_norm": 1.0873816013336182, + "learning_rate": 0.00010249450870720849, + "loss": 0.9743, + "step": 25575 + }, + { + "epoch": 0.9847930702598653, + "grad_norm": 1.3945918083190918, + "learning_rate": 0.00010246427839762127, + "loss": 1.1183, + "step": 25580 + }, + { + "epoch": 0.984985563041386, + "grad_norm": 2.6019561290740967, + "learning_rate": 0.00010243404786269215, + "loss": 1.0604, + "step": 25585 + }, + { + "epoch": 0.9851780558229066, + "grad_norm": 1.1407113075256348, + "learning_rate": 0.00010240381710518542, + "loss": 0.9804, + "step": 25590 + }, + { + "epoch": 0.9853705486044273, + "grad_norm": 1.5416691303253174, + "learning_rate": 0.00010237358612786558, + "loss": 1.097, + "step": 25595 + }, + { + "epoch": 0.9855630413859481, + "grad_norm": 1.0081915855407715, + "learning_rate": 0.00010234335493349703, + "loss": 1.0342, + "step": 25600 + }, + { + "epoch": 0.9857555341674687, + "grad_norm": 1.0994783639907837, + "learning_rate": 0.00010231312352484417, + "loss": 1.2357, + "step": 25605 + }, + { + "epoch": 0.9859480269489894, + "grad_norm": 1.106605887413025, + "learning_rate": 0.00010228289190467146, + "loss": 1.0279, + "step": 25610 + }, + { + "epoch": 0.9861405197305101, + "grad_norm": 0.9887366890907288, + "learning_rate": 0.00010225266007574345, + "loss": 1.1127, + "step": 25615 + }, + { + "epoch": 0.9863330125120308, + "grad_norm": 1.079148530960083, + "learning_rate": 0.00010222242804082458, + "loss": 1.1353, + "step": 25620 + }, + { + "epoch": 0.9865255052935515, + "grad_norm": 1.2472405433654785, + "learning_rate": 0.00010219219580267938, + "loss": 1.1579, + "step": 25625 + }, + { + "epoch": 0.9867179980750722, + "grad_norm": 1.0981230735778809, + "learning_rate": 0.00010216196336407242, + "loss": 1.1264, + "step": 25630 + }, + { + "epoch": 0.9869104908565929, + "grad_norm": 1.4295231103897095, + "learning_rate": 0.00010213173072776823, + "loss": 1.0099, + "step": 25635 + }, + { + "epoch": 0.9871029836381136, + "grad_norm": 1.3902168273925781, + "learning_rate": 0.00010210149789653137, + "loss": 1.1308, + "step": 25640 + }, + { + "epoch": 0.9872954764196342, + "grad_norm": 1.2584228515625, + "learning_rate": 0.00010207126487312646, + "loss": 1.0638, + "step": 25645 + }, + { + "epoch": 0.987487969201155, + "grad_norm": 1.27899968624115, + "learning_rate": 0.00010204103166031809, + "loss": 1.0931, + "step": 25650 + }, + { + "epoch": 0.9876804619826757, + "grad_norm": 2.330070972442627, + "learning_rate": 0.00010201079826087088, + "loss": 1.1429, + "step": 25655 + }, + { + "epoch": 0.9878729547641963, + "grad_norm": 1.0945899486541748, + "learning_rate": 0.00010198056467754953, + "loss": 1.0337, + "step": 25660 + }, + { + "epoch": 0.988065447545717, + "grad_norm": 1.3564465045928955, + "learning_rate": 0.00010195033091311866, + "loss": 1.1632, + "step": 25665 + }, + { + "epoch": 0.9882579403272377, + "grad_norm": 1.0468459129333496, + "learning_rate": 0.0001019200969703429, + "loss": 1.1247, + "step": 25670 + }, + { + "epoch": 0.9884504331087585, + "grad_norm": 1.7445849180221558, + "learning_rate": 0.00010188986285198703, + "loss": 0.9937, + "step": 25675 + }, + { + "epoch": 0.9886429258902791, + "grad_norm": 0.9142084121704102, + "learning_rate": 0.0001018596285608157, + "loss": 1.1336, + "step": 25680 + }, + { + "epoch": 0.9888354186717998, + "grad_norm": 1.7198858261108398, + "learning_rate": 0.00010182939409959366, + "loss": 1.1145, + "step": 25685 + }, + { + "epoch": 0.9890279114533205, + "grad_norm": 1.204819679260254, + "learning_rate": 0.00010179915947108565, + "loss": 1.0323, + "step": 25690 + }, + { + "epoch": 0.9892204042348411, + "grad_norm": 0.8539313673973083, + "learning_rate": 0.00010176892467805646, + "loss": 0.8969, + "step": 25695 + }, + { + "epoch": 0.9894128970163619, + "grad_norm": 1.1470470428466797, + "learning_rate": 0.00010173868972327079, + "loss": 1.1075, + "step": 25700 + }, + { + "epoch": 0.9896053897978826, + "grad_norm": 1.3425363302230835, + "learning_rate": 0.00010170845460949345, + "loss": 1.178, + "step": 25705 + }, + { + "epoch": 0.9897978825794033, + "grad_norm": 1.2167271375656128, + "learning_rate": 0.00010167821933948929, + "loss": 0.9844, + "step": 25710 + }, + { + "epoch": 0.9899903753609239, + "grad_norm": 1.0317187309265137, + "learning_rate": 0.00010164798391602306, + "loss": 1.1865, + "step": 25715 + }, + { + "epoch": 0.9901828681424446, + "grad_norm": 1.983253836631775, + "learning_rate": 0.00010161774834185962, + "loss": 1.1188, + "step": 25720 + }, + { + "epoch": 0.9903753609239654, + "grad_norm": 0.8479856848716736, + "learning_rate": 0.00010158751261976382, + "loss": 1.1431, + "step": 25725 + }, + { + "epoch": 0.9905678537054861, + "grad_norm": 1.3559181690216064, + "learning_rate": 0.0001015572767525005, + "loss": 1.1233, + "step": 25730 + }, + { + "epoch": 0.9907603464870067, + "grad_norm": 0.8876566290855408, + "learning_rate": 0.00010152704074283454, + "loss": 1.0564, + "step": 25735 + }, + { + "epoch": 0.9909528392685274, + "grad_norm": 1.189932942390442, + "learning_rate": 0.00010149680459353083, + "loss": 1.0271, + "step": 25740 + }, + { + "epoch": 0.9911453320500482, + "grad_norm": 2.000798225402832, + "learning_rate": 0.00010146656830735424, + "loss": 1.3183, + "step": 25745 + }, + { + "epoch": 0.9913378248315688, + "grad_norm": 1.558982253074646, + "learning_rate": 0.00010143633188706969, + "loss": 0.9362, + "step": 25750 + }, + { + "epoch": 0.9915303176130895, + "grad_norm": 1.8294328451156616, + "learning_rate": 0.00010140609533544215, + "loss": 0.94, + "step": 25755 + }, + { + "epoch": 0.9917228103946102, + "grad_norm": 1.833295226097107, + "learning_rate": 0.00010137585865523644, + "loss": 0.8496, + "step": 25760 + }, + { + "epoch": 0.9919153031761309, + "grad_norm": 0.8973735570907593, + "learning_rate": 0.00010134562184921761, + "loss": 1.2634, + "step": 25765 + }, + { + "epoch": 0.9921077959576516, + "grad_norm": 1.6813448667526245, + "learning_rate": 0.00010131538492015056, + "loss": 1.1065, + "step": 25770 + }, + { + "epoch": 0.9923002887391723, + "grad_norm": 1.0313218832015991, + "learning_rate": 0.0001012851478708003, + "loss": 1.026, + "step": 25775 + }, + { + "epoch": 0.992492781520693, + "grad_norm": 1.324073314666748, + "learning_rate": 0.00010125491070393176, + "loss": 1.168, + "step": 25780 + }, + { + "epoch": 0.9926852743022136, + "grad_norm": 1.5872528553009033, + "learning_rate": 0.00010122467342230997, + "loss": 0.9647, + "step": 25785 + }, + { + "epoch": 0.9928777670837343, + "grad_norm": 1.108878254890442, + "learning_rate": 0.00010119443602869987, + "loss": 0.9983, + "step": 25790 + }, + { + "epoch": 0.9930702598652551, + "grad_norm": 0.7914887070655823, + "learning_rate": 0.00010116419852586652, + "loss": 0.974, + "step": 25795 + }, + { + "epoch": 0.9932627526467758, + "grad_norm": 1.9735599756240845, + "learning_rate": 0.00010113396091657492, + "loss": 1.1217, + "step": 25800 + }, + { + "epoch": 0.9934552454282964, + "grad_norm": 1.5698872804641724, + "learning_rate": 0.00010110372320359014, + "loss": 1.1984, + "step": 25805 + }, + { + "epoch": 0.9936477382098171, + "grad_norm": 1.5581351518630981, + "learning_rate": 0.00010107348538967714, + "loss": 1.0301, + "step": 25810 + }, + { + "epoch": 0.9938402309913378, + "grad_norm": 0.868083655834198, + "learning_rate": 0.00010104324747760104, + "loss": 1.1439, + "step": 25815 + }, + { + "epoch": 0.9940327237728586, + "grad_norm": 1.065421462059021, + "learning_rate": 0.00010101300947012686, + "loss": 1.0275, + "step": 25820 + }, + { + "epoch": 0.9942252165543792, + "grad_norm": 1.3174570798873901, + "learning_rate": 0.00010098277137001967, + "loss": 1.0925, + "step": 25825 + }, + { + "epoch": 0.9944177093358999, + "grad_norm": 1.1705584526062012, + "learning_rate": 0.00010095253318004457, + "loss": 1.1513, + "step": 25830 + }, + { + "epoch": 0.9946102021174206, + "grad_norm": 1.9153684377670288, + "learning_rate": 0.00010092229490296661, + "loss": 0.9746, + "step": 25835 + }, + { + "epoch": 0.9948026948989412, + "grad_norm": 1.1694215536117554, + "learning_rate": 0.00010089205654155087, + "loss": 1.1149, + "step": 25840 + }, + { + "epoch": 0.994995187680462, + "grad_norm": 1.2398862838745117, + "learning_rate": 0.00010086181809856248, + "loss": 1.0225, + "step": 25845 + }, + { + "epoch": 0.9951876804619827, + "grad_norm": 0.803766131401062, + "learning_rate": 0.00010083157957676657, + "loss": 0.9342, + "step": 25850 + }, + { + "epoch": 0.9953801732435034, + "grad_norm": 1.1521053314208984, + "learning_rate": 0.00010080134097892817, + "loss": 1.1817, + "step": 25855 + }, + { + "epoch": 0.995572666025024, + "grad_norm": 1.6817054748535156, + "learning_rate": 0.00010077110230781246, + "loss": 0.9607, + "step": 25860 + }, + { + "epoch": 0.9957651588065447, + "grad_norm": 1.8072839975357056, + "learning_rate": 0.00010074086356618457, + "loss": 1.2474, + "step": 25865 + }, + { + "epoch": 0.9959576515880655, + "grad_norm": 1.1039899587631226, + "learning_rate": 0.00010071062475680959, + "loss": 1.0931, + "step": 25870 + }, + { + "epoch": 0.9961501443695862, + "grad_norm": 1.3213703632354736, + "learning_rate": 0.0001006803858824527, + "loss": 0.9709, + "step": 25875 + }, + { + "epoch": 0.9963426371511068, + "grad_norm": 2.1169626712799072, + "learning_rate": 0.00010065014694587902, + "loss": 1.0185, + "step": 25880 + }, + { + "epoch": 0.9965351299326275, + "grad_norm": 1.997347116470337, + "learning_rate": 0.00010061990794985372, + "loss": 0.9641, + "step": 25885 + }, + { + "epoch": 0.9967276227141482, + "grad_norm": 2.4005537033081055, + "learning_rate": 0.00010058966889714192, + "loss": 0.956, + "step": 25890 + }, + { + "epoch": 0.996920115495669, + "grad_norm": 1.1593470573425293, + "learning_rate": 0.00010055942979050886, + "loss": 0.9971, + "step": 25895 + }, + { + "epoch": 0.9971126082771896, + "grad_norm": 1.5145149230957031, + "learning_rate": 0.0001005291906327196, + "loss": 1.029, + "step": 25900 + }, + { + "epoch": 0.9973051010587103, + "grad_norm": 1.6127921342849731, + "learning_rate": 0.00010049895142653936, + "loss": 1.0643, + "step": 25905 + }, + { + "epoch": 0.997497593840231, + "grad_norm": 1.326854944229126, + "learning_rate": 0.00010046871217473334, + "loss": 1.0119, + "step": 25910 + }, + { + "epoch": 0.9976900866217517, + "grad_norm": 1.0918645858764648, + "learning_rate": 0.00010043847288006666, + "loss": 1.2797, + "step": 25915 + }, + { + "epoch": 0.9978825794032724, + "grad_norm": 1.0558148622512817, + "learning_rate": 0.00010040823354530457, + "loss": 1.0526, + "step": 25920 + }, + { + "epoch": 0.9980750721847931, + "grad_norm": 1.1978880167007446, + "learning_rate": 0.00010037799417321222, + "loss": 1.0564, + "step": 25925 + }, + { + "epoch": 0.9982675649663137, + "grad_norm": 1.9178636074066162, + "learning_rate": 0.00010034775476655482, + "loss": 1.068, + "step": 25930 + }, + { + "epoch": 0.9984600577478344, + "grad_norm": 1.7356677055358887, + "learning_rate": 0.00010031751532809755, + "loss": 1.2817, + "step": 25935 + }, + { + "epoch": 0.9986525505293552, + "grad_norm": 1.0780870914459229, + "learning_rate": 0.00010028727586060558, + "loss": 1.0834, + "step": 25940 + }, + { + "epoch": 0.9988450433108759, + "grad_norm": 1.4201254844665527, + "learning_rate": 0.00010025703636684416, + "loss": 0.9698, + "step": 25945 + }, + { + "epoch": 0.9990375360923965, + "grad_norm": 1.0446133613586426, + "learning_rate": 0.00010022679684957845, + "loss": 1.1268, + "step": 25950 + }, + { + "epoch": 0.9992300288739172, + "grad_norm": 1.2714285850524902, + "learning_rate": 0.0001001965573115737, + "loss": 1.2019, + "step": 25955 + }, + { + "epoch": 0.9994225216554379, + "grad_norm": 0.9263586401939392, + "learning_rate": 0.00010016631775559506, + "loss": 1.1232, + "step": 25960 + }, + { + "epoch": 0.9996150144369587, + "grad_norm": 2.6559371948242188, + "learning_rate": 0.00010013607818440775, + "loss": 1.1367, + "step": 25965 + }, + { + "epoch": 0.9998075072184793, + "grad_norm": 1.6373353004455566, + "learning_rate": 0.00010010583860077703, + "loss": 1.002, + "step": 25970 + }, + { + "epoch": 1.0, + "grad_norm": 2.38175630569458, + "learning_rate": 0.00010007559900746805, + "loss": 1.198, + "step": 25975 + }, + { + "epoch": 1.0001924927815207, + "grad_norm": 0.9677699208259583, + "learning_rate": 0.00010004535940724604, + "loss": 0.9076, + "step": 25980 + }, + { + "epoch": 1.0003849855630413, + "grad_norm": 1.5017547607421875, + "learning_rate": 0.00010001511980287623, + "loss": 0.9611, + "step": 25985 + }, + { + "epoch": 1.000577478344562, + "grad_norm": 0.8973893523216248, + "learning_rate": 9.998488019712379e-05, + "loss": 0.8939, + "step": 25990 + }, + { + "epoch": 1.0007699711260827, + "grad_norm": 1.614953875541687, + "learning_rate": 9.995464059275396e-05, + "loss": 0.6977, + "step": 25995 + }, + { + "epoch": 1.0009624639076036, + "grad_norm": 2.5739526748657227, + "learning_rate": 9.992440099253196e-05, + "loss": 0.967, + "step": 26000 + }, + { + "epoch": 1.0011549566891242, + "grad_norm": 1.2019327878952026, + "learning_rate": 9.989416139922301e-05, + "loss": 0.8387, + "step": 26005 + }, + { + "epoch": 1.001347449470645, + "grad_norm": 1.8093628883361816, + "learning_rate": 9.986392181559223e-05, + "loss": 0.9027, + "step": 26010 + }, + { + "epoch": 1.0015399422521656, + "grad_norm": 1.4864177703857422, + "learning_rate": 9.983368224440496e-05, + "loss": 0.7639, + "step": 26015 + }, + { + "epoch": 1.0017324350336863, + "grad_norm": 1.123429775238037, + "learning_rate": 9.980344268842634e-05, + "loss": 0.7783, + "step": 26020 + }, + { + "epoch": 1.001924927815207, + "grad_norm": 1.5506818294525146, + "learning_rate": 9.977320315042154e-05, + "loss": 0.7143, + "step": 26025 + }, + { + "epoch": 1.0021174205967276, + "grad_norm": 2.1665854454040527, + "learning_rate": 9.974296363315585e-05, + "loss": 0.9101, + "step": 26030 + }, + { + "epoch": 1.0023099133782483, + "grad_norm": 1.8006185293197632, + "learning_rate": 9.971272413939444e-05, + "loss": 0.9212, + "step": 26035 + }, + { + "epoch": 1.002502406159769, + "grad_norm": 1.2385761737823486, + "learning_rate": 9.968248467190245e-05, + "loss": 0.8715, + "step": 26040 + }, + { + "epoch": 1.0026948989412896, + "grad_norm": 1.4337763786315918, + "learning_rate": 9.965224523344519e-05, + "loss": 0.6949, + "step": 26045 + }, + { + "epoch": 1.0028873917228105, + "grad_norm": 1.586151123046875, + "learning_rate": 9.96220058267878e-05, + "loss": 0.913, + "step": 26050 + }, + { + "epoch": 1.0030798845043312, + "grad_norm": 1.3005743026733398, + "learning_rate": 9.959176645469542e-05, + "loss": 0.8218, + "step": 26055 + }, + { + "epoch": 1.0032723772858518, + "grad_norm": 0.735365629196167, + "learning_rate": 9.956152711993335e-05, + "loss": 0.7206, + "step": 26060 + }, + { + "epoch": 1.0034648700673725, + "grad_norm": 1.7162381410598755, + "learning_rate": 9.95312878252667e-05, + "loss": 0.7916, + "step": 26065 + }, + { + "epoch": 1.0036573628488932, + "grad_norm": 0.894173800945282, + "learning_rate": 9.950104857346064e-05, + "loss": 0.9683, + "step": 26070 + }, + { + "epoch": 1.0038498556304138, + "grad_norm": 1.6856434345245361, + "learning_rate": 9.947080936728044e-05, + "loss": 0.7997, + "step": 26075 + }, + { + "epoch": 1.0040423484119345, + "grad_norm": 1.2891976833343506, + "learning_rate": 9.94405702094912e-05, + "loss": 0.802, + "step": 26080 + }, + { + "epoch": 1.0042348411934552, + "grad_norm": 1.5862544775009155, + "learning_rate": 9.941033110285809e-05, + "loss": 0.857, + "step": 26085 + }, + { + "epoch": 1.0044273339749759, + "grad_norm": 2.165689706802368, + "learning_rate": 9.93800920501463e-05, + "loss": 1.01, + "step": 26090 + }, + { + "epoch": 1.0046198267564965, + "grad_norm": 1.3247888088226318, + "learning_rate": 9.9349853054121e-05, + "loss": 0.872, + "step": 26095 + }, + { + "epoch": 1.0048123195380174, + "grad_norm": 1.7285022735595703, + "learning_rate": 9.931961411754732e-05, + "loss": 0.913, + "step": 26100 + }, + { + "epoch": 1.005004812319538, + "grad_norm": 1.504143238067627, + "learning_rate": 9.928937524319043e-05, + "loss": 0.7396, + "step": 26105 + }, + { + "epoch": 1.0051973051010588, + "grad_norm": 1.633792519569397, + "learning_rate": 9.925913643381546e-05, + "loss": 0.8946, + "step": 26110 + }, + { + "epoch": 1.0053897978825794, + "grad_norm": 1.8839080333709717, + "learning_rate": 9.922889769218755e-05, + "loss": 0.8844, + "step": 26115 + }, + { + "epoch": 1.0055822906641, + "grad_norm": 0.9367867708206177, + "learning_rate": 9.919865902107185e-05, + "loss": 0.8064, + "step": 26120 + }, + { + "epoch": 1.0057747834456208, + "grad_norm": 1.0992298126220703, + "learning_rate": 9.916842042323348e-05, + "loss": 0.8674, + "step": 26125 + }, + { + "epoch": 1.0059672762271414, + "grad_norm": 1.031310796737671, + "learning_rate": 9.913818190143751e-05, + "loss": 0.8082, + "step": 26130 + }, + { + "epoch": 1.006159769008662, + "grad_norm": 1.5799235105514526, + "learning_rate": 9.910794345844914e-05, + "loss": 0.7128, + "step": 26135 + }, + { + "epoch": 1.0063522617901828, + "grad_norm": 1.5742192268371582, + "learning_rate": 9.907770509703344e-05, + "loss": 0.8554, + "step": 26140 + }, + { + "epoch": 1.0065447545717037, + "grad_norm": 1.3698316812515259, + "learning_rate": 9.904746681995544e-05, + "loss": 0.6932, + "step": 26145 + }, + { + "epoch": 1.0067372473532243, + "grad_norm": 1.2731399536132812, + "learning_rate": 9.901722862998034e-05, + "loss": 0.9624, + "step": 26150 + }, + { + "epoch": 1.006929740134745, + "grad_norm": 1.0184792280197144, + "learning_rate": 9.898699052987318e-05, + "loss": 0.8248, + "step": 26155 + }, + { + "epoch": 1.0071222329162657, + "grad_norm": 1.1166132688522339, + "learning_rate": 9.895675252239896e-05, + "loss": 0.7907, + "step": 26160 + }, + { + "epoch": 1.0073147256977864, + "grad_norm": 2.4963834285736084, + "learning_rate": 9.892651461032287e-05, + "loss": 0.8758, + "step": 26165 + }, + { + "epoch": 1.007507218479307, + "grad_norm": 2.192413091659546, + "learning_rate": 9.889627679640991e-05, + "loss": 0.9262, + "step": 26170 + }, + { + "epoch": 1.0076997112608277, + "grad_norm": 1.760149598121643, + "learning_rate": 9.886603908342508e-05, + "loss": 0.8604, + "step": 26175 + }, + { + "epoch": 1.0078922040423484, + "grad_norm": 1.2598868608474731, + "learning_rate": 9.88358014741335e-05, + "loss": 0.9674, + "step": 26180 + }, + { + "epoch": 1.008084696823869, + "grad_norm": 1.29741632938385, + "learning_rate": 9.880556397130018e-05, + "loss": 0.8107, + "step": 26185 + }, + { + "epoch": 1.0082771896053897, + "grad_norm": 1.4447206258773804, + "learning_rate": 9.877532657769006e-05, + "loss": 0.9182, + "step": 26190 + }, + { + "epoch": 1.0084696823869106, + "grad_norm": 1.8422036170959473, + "learning_rate": 9.874508929606827e-05, + "loss": 1.0373, + "step": 26195 + }, + { + "epoch": 1.0086621751684313, + "grad_norm": 1.393976092338562, + "learning_rate": 9.871485212919974e-05, + "loss": 0.8484, + "step": 26200 + }, + { + "epoch": 1.008854667949952, + "grad_norm": 1.280700922012329, + "learning_rate": 9.868461507984945e-05, + "loss": 0.7288, + "step": 26205 + }, + { + "epoch": 1.0090471607314726, + "grad_norm": 1.8476749658584595, + "learning_rate": 9.86543781507824e-05, + "loss": 0.9356, + "step": 26210 + }, + { + "epoch": 1.0092396535129933, + "grad_norm": 1.1410245895385742, + "learning_rate": 9.862414134476358e-05, + "loss": 0.7967, + "step": 26215 + }, + { + "epoch": 1.009432146294514, + "grad_norm": 1.401632308959961, + "learning_rate": 9.859390466455789e-05, + "loss": 0.825, + "step": 26220 + }, + { + "epoch": 1.0096246390760346, + "grad_norm": 1.705074667930603, + "learning_rate": 9.856366811293033e-05, + "loss": 1.0509, + "step": 26225 + }, + { + "epoch": 1.0098171318575553, + "grad_norm": 1.788540244102478, + "learning_rate": 9.853343169264581e-05, + "loss": 0.9138, + "step": 26230 + }, + { + "epoch": 1.010009624639076, + "grad_norm": 1.9128284454345703, + "learning_rate": 9.850319540646919e-05, + "loss": 0.931, + "step": 26235 + }, + { + "epoch": 1.0102021174205966, + "grad_norm": 1.1917674541473389, + "learning_rate": 9.847295925716548e-05, + "loss": 0.7869, + "step": 26240 + }, + { + "epoch": 1.0103946102021175, + "grad_norm": 1.351189136505127, + "learning_rate": 9.844272324749955e-05, + "loss": 0.868, + "step": 26245 + }, + { + "epoch": 1.0105871029836382, + "grad_norm": 1.3934807777404785, + "learning_rate": 9.841248738023619e-05, + "loss": 0.9278, + "step": 26250 + }, + { + "epoch": 1.0107795957651589, + "grad_norm": 1.337436556816101, + "learning_rate": 9.83822516581404e-05, + "loss": 0.9855, + "step": 26255 + }, + { + "epoch": 1.0109720885466795, + "grad_norm": 1.5759284496307373, + "learning_rate": 9.835201608397695e-05, + "loss": 0.6804, + "step": 26260 + }, + { + "epoch": 1.0111645813282002, + "grad_norm": 1.530190110206604, + "learning_rate": 9.832178066051075e-05, + "loss": 1.1206, + "step": 26265 + }, + { + "epoch": 1.0113570741097209, + "grad_norm": 1.9554476737976074, + "learning_rate": 9.829154539050657e-05, + "loss": 1.0103, + "step": 26270 + }, + { + "epoch": 1.0115495668912415, + "grad_norm": 1.9727224111557007, + "learning_rate": 9.826131027672922e-05, + "loss": 0.9634, + "step": 26275 + }, + { + "epoch": 1.0117420596727622, + "grad_norm": 1.2419236898422241, + "learning_rate": 9.823107532194358e-05, + "loss": 0.6752, + "step": 26280 + }, + { + "epoch": 1.0119345524542829, + "grad_norm": 1.707638144493103, + "learning_rate": 9.820084052891436e-05, + "loss": 0.8883, + "step": 26285 + }, + { + "epoch": 1.0121270452358035, + "grad_norm": 1.57963228225708, + "learning_rate": 9.817060590040633e-05, + "loss": 0.8674, + "step": 26290 + }, + { + "epoch": 1.0123195380173244, + "grad_norm": 0.9437096118927002, + "learning_rate": 9.81403714391843e-05, + "loss": 0.7648, + "step": 26295 + }, + { + "epoch": 1.012512030798845, + "grad_norm": 1.6244466304779053, + "learning_rate": 9.8110137148013e-05, + "loss": 0.8088, + "step": 26300 + }, + { + "epoch": 1.0127045235803658, + "grad_norm": 1.200982689857483, + "learning_rate": 9.807990302965712e-05, + "loss": 0.9869, + "step": 26305 + }, + { + "epoch": 1.0128970163618864, + "grad_norm": 1.7576247453689575, + "learning_rate": 9.804966908688137e-05, + "loss": 0.7445, + "step": 26310 + }, + { + "epoch": 1.0130895091434071, + "grad_norm": 1.2843434810638428, + "learning_rate": 9.801943532245049e-05, + "loss": 0.7024, + "step": 26315 + }, + { + "epoch": 1.0132820019249278, + "grad_norm": 1.0596975088119507, + "learning_rate": 9.79892017391291e-05, + "loss": 0.8457, + "step": 26320 + }, + { + "epoch": 1.0134744947064485, + "grad_norm": 1.8278613090515137, + "learning_rate": 9.795896833968193e-05, + "loss": 0.9703, + "step": 26325 + }, + { + "epoch": 1.0136669874879691, + "grad_norm": 1.732739806175232, + "learning_rate": 9.792873512687359e-05, + "loss": 0.8173, + "step": 26330 + }, + { + "epoch": 1.0138594802694898, + "grad_norm": 0.9648169279098511, + "learning_rate": 9.789850210346864e-05, + "loss": 0.8673, + "step": 26335 + }, + { + "epoch": 1.0140519730510107, + "grad_norm": 1.4606342315673828, + "learning_rate": 9.78682692722318e-05, + "loss": 0.9835, + "step": 26340 + }, + { + "epoch": 1.0142444658325314, + "grad_norm": 1.4847335815429688, + "learning_rate": 9.783803663592762e-05, + "loss": 0.9562, + "step": 26345 + }, + { + "epoch": 1.014436958614052, + "grad_norm": 1.0714442729949951, + "learning_rate": 9.78078041973206e-05, + "loss": 0.8459, + "step": 26350 + }, + { + "epoch": 1.0146294513955727, + "grad_norm": 1.2830302715301514, + "learning_rate": 9.777757195917544e-05, + "loss": 1.0389, + "step": 26355 + }, + { + "epoch": 1.0148219441770934, + "grad_norm": 1.007949948310852, + "learning_rate": 9.774733992425659e-05, + "loss": 0.8296, + "step": 26360 + }, + { + "epoch": 1.015014436958614, + "grad_norm": 1.4365791082382202, + "learning_rate": 9.771710809532853e-05, + "loss": 0.8113, + "step": 26365 + }, + { + "epoch": 1.0152069297401347, + "grad_norm": 1.213368535041809, + "learning_rate": 9.768687647515587e-05, + "loss": 0.829, + "step": 26370 + }, + { + "epoch": 1.0153994225216554, + "grad_norm": 1.6301283836364746, + "learning_rate": 9.765664506650302e-05, + "loss": 0.8657, + "step": 26375 + }, + { + "epoch": 1.015591915303176, + "grad_norm": 1.4944705963134766, + "learning_rate": 9.762641387213442e-05, + "loss": 0.8043, + "step": 26380 + }, + { + "epoch": 1.0157844080846967, + "grad_norm": 1.2845187187194824, + "learning_rate": 9.759618289481459e-05, + "loss": 0.6866, + "step": 26385 + }, + { + "epoch": 1.0159769008662176, + "grad_norm": 1.635873556137085, + "learning_rate": 9.75659521373079e-05, + "loss": 0.9432, + "step": 26390 + }, + { + "epoch": 1.0161693936477383, + "grad_norm": 0.9494144320487976, + "learning_rate": 9.753572160237873e-05, + "loss": 0.7428, + "step": 26395 + }, + { + "epoch": 1.016361886429259, + "grad_norm": 1.2726250886917114, + "learning_rate": 9.750549129279153e-05, + "loss": 0.6935, + "step": 26400 + }, + { + "epoch": 1.0165543792107796, + "grad_norm": 1.4167382717132568, + "learning_rate": 9.747526121131064e-05, + "loss": 1.0576, + "step": 26405 + }, + { + "epoch": 1.0167468719923003, + "grad_norm": 1.8464453220367432, + "learning_rate": 9.744503136070036e-05, + "loss": 0.8568, + "step": 26410 + }, + { + "epoch": 1.016939364773821, + "grad_norm": 1.164482593536377, + "learning_rate": 9.741480174372504e-05, + "loss": 0.8255, + "step": 26415 + }, + { + "epoch": 1.0171318575553416, + "grad_norm": 0.8602749109268188, + "learning_rate": 9.738457236314898e-05, + "loss": 0.7958, + "step": 26420 + }, + { + "epoch": 1.0173243503368623, + "grad_norm": 1.5463776588439941, + "learning_rate": 9.735434322173645e-05, + "loss": 0.79, + "step": 26425 + }, + { + "epoch": 1.017516843118383, + "grad_norm": 1.2181867361068726, + "learning_rate": 9.732411432225173e-05, + "loss": 0.8152, + "step": 26430 + }, + { + "epoch": 1.0177093358999039, + "grad_norm": 1.4031577110290527, + "learning_rate": 9.729388566745899e-05, + "loss": 0.8809, + "step": 26435 + }, + { + "epoch": 1.0179018286814245, + "grad_norm": 0.7895151376724243, + "learning_rate": 9.726365726012249e-05, + "loss": 0.8356, + "step": 26440 + }, + { + "epoch": 1.0180943214629452, + "grad_norm": 1.0852110385894775, + "learning_rate": 9.723342910300642e-05, + "loss": 0.71, + "step": 26445 + }, + { + "epoch": 1.0182868142444659, + "grad_norm": 1.8769862651824951, + "learning_rate": 9.720320119887494e-05, + "loss": 0.8581, + "step": 26450 + }, + { + "epoch": 1.0184793070259865, + "grad_norm": 1.0951234102249146, + "learning_rate": 9.717297355049215e-05, + "loss": 0.9184, + "step": 26455 + }, + { + "epoch": 1.0186717998075072, + "grad_norm": 1.4983422756195068, + "learning_rate": 9.714274616062222e-05, + "loss": 0.9141, + "step": 26460 + }, + { + "epoch": 1.0188642925890279, + "grad_norm": 1.0044662952423096, + "learning_rate": 9.711251903202923e-05, + "loss": 0.77, + "step": 26465 + }, + { + "epoch": 1.0190567853705486, + "grad_norm": 1.339004635810852, + "learning_rate": 9.708229216747721e-05, + "loss": 1.0225, + "step": 26470 + }, + { + "epoch": 1.0192492781520692, + "grad_norm": 1.5618289709091187, + "learning_rate": 9.70520655697303e-05, + "loss": 0.9058, + "step": 26475 + }, + { + "epoch": 1.01944177093359, + "grad_norm": 0.7431265115737915, + "learning_rate": 9.702183924155246e-05, + "loss": 0.8251, + "step": 26480 + }, + { + "epoch": 1.0196342637151108, + "grad_norm": 1.1725512742996216, + "learning_rate": 9.699161318570765e-05, + "loss": 0.8755, + "step": 26485 + }, + { + "epoch": 1.0198267564966315, + "grad_norm": 2.5678439140319824, + "learning_rate": 9.696138740495991e-05, + "loss": 0.9383, + "step": 26490 + }, + { + "epoch": 1.0200192492781521, + "grad_norm": 1.203439474105835, + "learning_rate": 9.693116190207318e-05, + "loss": 0.8122, + "step": 26495 + }, + { + "epoch": 1.0202117420596728, + "grad_norm": 1.7654101848602295, + "learning_rate": 9.690093667981132e-05, + "loss": 0.8668, + "step": 26500 + }, + { + "epoch": 1.0204042348411935, + "grad_norm": 1.5297460556030273, + "learning_rate": 9.687071174093831e-05, + "loss": 0.9342, + "step": 26505 + }, + { + "epoch": 1.0205967276227141, + "grad_norm": 1.2737789154052734, + "learning_rate": 9.684048708821796e-05, + "loss": 0.8387, + "step": 26510 + }, + { + "epoch": 1.0207892204042348, + "grad_norm": 1.0593212842941284, + "learning_rate": 9.681026272441414e-05, + "loss": 0.9148, + "step": 26515 + }, + { + "epoch": 1.0209817131857555, + "grad_norm": 1.3698254823684692, + "learning_rate": 9.678003865229067e-05, + "loss": 0.9545, + "step": 26520 + }, + { + "epoch": 1.0211742059672761, + "grad_norm": 1.5303738117218018, + "learning_rate": 9.674981487461133e-05, + "loss": 0.98, + "step": 26525 + }, + { + "epoch": 1.0213666987487968, + "grad_norm": 0.8624149560928345, + "learning_rate": 9.671959139413991e-05, + "loss": 0.8921, + "step": 26530 + }, + { + "epoch": 1.0215591915303177, + "grad_norm": 0.9601109027862549, + "learning_rate": 9.66893682136401e-05, + "loss": 0.8469, + "step": 26535 + }, + { + "epoch": 1.0217516843118384, + "grad_norm": 1.3074990510940552, + "learning_rate": 9.665914533587563e-05, + "loss": 0.8174, + "step": 26540 + }, + { + "epoch": 1.021944177093359, + "grad_norm": 1.4614131450653076, + "learning_rate": 9.662892276361022e-05, + "loss": 1.0461, + "step": 26545 + }, + { + "epoch": 1.0221366698748797, + "grad_norm": 2.2266509532928467, + "learning_rate": 9.65987004996075e-05, + "loss": 0.9664, + "step": 26550 + }, + { + "epoch": 1.0223291626564004, + "grad_norm": 1.8726714849472046, + "learning_rate": 9.656847854663102e-05, + "loss": 0.9988, + "step": 26555 + }, + { + "epoch": 1.022521655437921, + "grad_norm": 1.4316071271896362, + "learning_rate": 9.65382569074445e-05, + "loss": 0.9426, + "step": 26560 + }, + { + "epoch": 1.0227141482194417, + "grad_norm": 0.9609633088111877, + "learning_rate": 9.650803558481146e-05, + "loss": 0.8867, + "step": 26565 + }, + { + "epoch": 1.0229066410009624, + "grad_norm": 2.933589458465576, + "learning_rate": 9.647781458149539e-05, + "loss": 0.9714, + "step": 26570 + }, + { + "epoch": 1.023099133782483, + "grad_norm": 1.019455075263977, + "learning_rate": 9.644759390025988e-05, + "loss": 0.7742, + "step": 26575 + }, + { + "epoch": 1.0232916265640037, + "grad_norm": 1.2443771362304688, + "learning_rate": 9.641737354386839e-05, + "loss": 0.7977, + "step": 26580 + }, + { + "epoch": 1.0234841193455246, + "grad_norm": 1.4607973098754883, + "learning_rate": 9.63871535150843e-05, + "loss": 0.9563, + "step": 26585 + }, + { + "epoch": 1.0236766121270453, + "grad_norm": 1.0155866146087646, + "learning_rate": 9.635693381667112e-05, + "loss": 0.8839, + "step": 26590 + }, + { + "epoch": 1.023869104908566, + "grad_norm": 1.453011155128479, + "learning_rate": 9.632671445139223e-05, + "loss": 0.8905, + "step": 26595 + }, + { + "epoch": 1.0240615976900866, + "grad_norm": 1.598872184753418, + "learning_rate": 9.629649542201091e-05, + "loss": 0.8839, + "step": 26600 + }, + { + "epoch": 1.0242540904716073, + "grad_norm": 1.3655000925064087, + "learning_rate": 9.626627673129062e-05, + "loss": 0.8606, + "step": 26605 + }, + { + "epoch": 1.024446583253128, + "grad_norm": 0.9747033715248108, + "learning_rate": 9.623605838199457e-05, + "loss": 0.9387, + "step": 26610 + }, + { + "epoch": 1.0246390760346487, + "grad_norm": 1.2741761207580566, + "learning_rate": 9.6205840376886e-05, + "loss": 0.8915, + "step": 26615 + }, + { + "epoch": 1.0248315688161693, + "grad_norm": 1.0303744077682495, + "learning_rate": 9.617562271872825e-05, + "loss": 0.9422, + "step": 26620 + }, + { + "epoch": 1.02502406159769, + "grad_norm": 1.2387011051177979, + "learning_rate": 9.614540541028445e-05, + "loss": 0.9212, + "step": 26625 + }, + { + "epoch": 1.0252165543792109, + "grad_norm": 1.31088387966156, + "learning_rate": 9.611518845431778e-05, + "loss": 0.8258, + "step": 26630 + }, + { + "epoch": 1.0254090471607316, + "grad_norm": 1.3650875091552734, + "learning_rate": 9.60849718535914e-05, + "loss": 0.8836, + "step": 26635 + }, + { + "epoch": 1.0256015399422522, + "grad_norm": 1.539516568183899, + "learning_rate": 9.605475561086842e-05, + "loss": 0.8616, + "step": 26640 + }, + { + "epoch": 1.025794032723773, + "grad_norm": 1.0243744850158691, + "learning_rate": 9.602453972891189e-05, + "loss": 0.7763, + "step": 26645 + }, + { + "epoch": 1.0259865255052936, + "grad_norm": 1.4782140254974365, + "learning_rate": 9.599432421048488e-05, + "loss": 0.8951, + "step": 26650 + }, + { + "epoch": 1.0261790182868142, + "grad_norm": 1.0604002475738525, + "learning_rate": 9.596410905835037e-05, + "loss": 0.8381, + "step": 26655 + }, + { + "epoch": 1.026371511068335, + "grad_norm": 1.6597970724105835, + "learning_rate": 9.593389427527135e-05, + "loss": 0.9625, + "step": 26660 + }, + { + "epoch": 1.0265640038498556, + "grad_norm": 0.9541493654251099, + "learning_rate": 9.590367986401078e-05, + "loss": 0.8285, + "step": 26665 + }, + { + "epoch": 1.0267564966313762, + "grad_norm": 1.096489667892456, + "learning_rate": 9.587346582733155e-05, + "loss": 0.8462, + "step": 26670 + }, + { + "epoch": 1.026948989412897, + "grad_norm": 1.7347831726074219, + "learning_rate": 9.58432521679965e-05, + "loss": 0.9057, + "step": 26675 + }, + { + "epoch": 1.0271414821944178, + "grad_norm": 1.02500319480896, + "learning_rate": 9.581303888876855e-05, + "loss": 0.8081, + "step": 26680 + }, + { + "epoch": 1.0273339749759385, + "grad_norm": 1.090671181678772, + "learning_rate": 9.578282599241044e-05, + "loss": 0.829, + "step": 26685 + }, + { + "epoch": 1.0275264677574591, + "grad_norm": 1.345054030418396, + "learning_rate": 9.575261348168493e-05, + "loss": 0.9518, + "step": 26690 + }, + { + "epoch": 1.0277189605389798, + "grad_norm": 1.129591703414917, + "learning_rate": 9.572240135935483e-05, + "loss": 0.8178, + "step": 26695 + }, + { + "epoch": 1.0279114533205005, + "grad_norm": 1.1437878608703613, + "learning_rate": 9.569218962818276e-05, + "loss": 0.8266, + "step": 26700 + }, + { + "epoch": 1.0281039461020212, + "grad_norm": 1.2581342458724976, + "learning_rate": 9.566197829093138e-05, + "loss": 0.8247, + "step": 26705 + }, + { + "epoch": 1.0282964388835418, + "grad_norm": 1.0760700702667236, + "learning_rate": 9.56317673503634e-05, + "loss": 1.003, + "step": 26710 + }, + { + "epoch": 1.0284889316650625, + "grad_norm": 1.10942804813385, + "learning_rate": 9.560155680924137e-05, + "loss": 0.7992, + "step": 26715 + }, + { + "epoch": 1.0286814244465832, + "grad_norm": 1.9202826023101807, + "learning_rate": 9.557134667032776e-05, + "loss": 0.989, + "step": 26720 + }, + { + "epoch": 1.0288739172281038, + "grad_norm": 2.0237245559692383, + "learning_rate": 9.554113693638522e-05, + "loss": 0.9404, + "step": 26725 + }, + { + "epoch": 1.0290664100096247, + "grad_norm": 2.128678798675537, + "learning_rate": 9.551092761017618e-05, + "loss": 1.0402, + "step": 26730 + }, + { + "epoch": 1.0292589027911454, + "grad_norm": 0.9286008477210999, + "learning_rate": 9.5480718694463e-05, + "loss": 0.9927, + "step": 26735 + }, + { + "epoch": 1.029451395572666, + "grad_norm": 1.2808765172958374, + "learning_rate": 9.545051019200823e-05, + "loss": 0.9026, + "step": 26740 + }, + { + "epoch": 1.0296438883541867, + "grad_norm": 1.4116573333740234, + "learning_rate": 9.542030210557415e-05, + "loss": 0.8285, + "step": 26745 + }, + { + "epoch": 1.0298363811357074, + "grad_norm": 2.100184440612793, + "learning_rate": 9.539009443792309e-05, + "loss": 0.925, + "step": 26750 + }, + { + "epoch": 1.030028873917228, + "grad_norm": 1.3617011308670044, + "learning_rate": 9.535988719181736e-05, + "loss": 0.8356, + "step": 26755 + }, + { + "epoch": 1.0302213666987488, + "grad_norm": 0.9685754776000977, + "learning_rate": 9.532968037001923e-05, + "loss": 0.8056, + "step": 26760 + }, + { + "epoch": 1.0304138594802694, + "grad_norm": 1.2491365671157837, + "learning_rate": 9.529947397529086e-05, + "loss": 0.9138, + "step": 26765 + }, + { + "epoch": 1.03060635226179, + "grad_norm": 0.8603072166442871, + "learning_rate": 9.526926801039449e-05, + "loss": 0.9497, + "step": 26770 + }, + { + "epoch": 1.030798845043311, + "grad_norm": 0.8559325337409973, + "learning_rate": 9.523906247809216e-05, + "loss": 0.6938, + "step": 26775 + }, + { + "epoch": 1.0309913378248317, + "grad_norm": 1.330196738243103, + "learning_rate": 9.520885738114609e-05, + "loss": 0.8141, + "step": 26780 + }, + { + "epoch": 1.0311838306063523, + "grad_norm": 1.7451072931289673, + "learning_rate": 9.517865272231827e-05, + "loss": 0.8798, + "step": 26785 + }, + { + "epoch": 1.031376323387873, + "grad_norm": 2.3189198970794678, + "learning_rate": 9.514844850437066e-05, + "loss": 0.9944, + "step": 26790 + }, + { + "epoch": 1.0315688161693937, + "grad_norm": 1.4511535167694092, + "learning_rate": 9.511824473006535e-05, + "loss": 0.8759, + "step": 26795 + }, + { + "epoch": 1.0317613089509143, + "grad_norm": 1.5764187574386597, + "learning_rate": 9.508804140216423e-05, + "loss": 0.8792, + "step": 26800 + }, + { + "epoch": 1.031953801732435, + "grad_norm": 1.368013620376587, + "learning_rate": 9.505783852342913e-05, + "loss": 0.8946, + "step": 26805 + }, + { + "epoch": 1.0321462945139557, + "grad_norm": 1.747062087059021, + "learning_rate": 9.502763609662201e-05, + "loss": 0.9188, + "step": 26810 + }, + { + "epoch": 1.0323387872954763, + "grad_norm": 1.7886524200439453, + "learning_rate": 9.499743412450463e-05, + "loss": 0.8947, + "step": 26815 + }, + { + "epoch": 1.032531280076997, + "grad_norm": 1.3724089860916138, + "learning_rate": 9.496723260983871e-05, + "loss": 0.909, + "step": 26820 + }, + { + "epoch": 1.032723772858518, + "grad_norm": 1.7624560594558716, + "learning_rate": 9.493703155538607e-05, + "loss": 0.6966, + "step": 26825 + }, + { + "epoch": 1.0329162656400386, + "grad_norm": 1.3632899522781372, + "learning_rate": 9.490683096390836e-05, + "loss": 0.7867, + "step": 26830 + }, + { + "epoch": 1.0331087584215592, + "grad_norm": 0.9639708399772644, + "learning_rate": 9.487663083816718e-05, + "loss": 1.0168, + "step": 26835 + }, + { + "epoch": 1.03330125120308, + "grad_norm": 1.6078749895095825, + "learning_rate": 9.484643118092422e-05, + "loss": 0.8775, + "step": 26840 + }, + { + "epoch": 1.0334937439846006, + "grad_norm": 0.9812796711921692, + "learning_rate": 9.481623199494098e-05, + "loss": 0.8573, + "step": 26845 + }, + { + "epoch": 1.0336862367661213, + "grad_norm": 1.926906943321228, + "learning_rate": 9.478603328297896e-05, + "loss": 0.8621, + "step": 26850 + }, + { + "epoch": 1.033878729547642, + "grad_norm": 1.4582041501998901, + "learning_rate": 9.475583504779966e-05, + "loss": 0.8978, + "step": 26855 + }, + { + "epoch": 1.0340712223291626, + "grad_norm": 1.22900390625, + "learning_rate": 9.472563729216453e-05, + "loss": 0.8079, + "step": 26860 + }, + { + "epoch": 1.0342637151106833, + "grad_norm": 1.8301913738250732, + "learning_rate": 9.46954400188349e-05, + "loss": 0.8844, + "step": 26865 + }, + { + "epoch": 1.034456207892204, + "grad_norm": 0.9074123501777649, + "learning_rate": 9.466524323057217e-05, + "loss": 0.8437, + "step": 26870 + }, + { + "epoch": 1.0346487006737248, + "grad_norm": 1.1448841094970703, + "learning_rate": 9.463504693013761e-05, + "loss": 0.9197, + "step": 26875 + }, + { + "epoch": 1.0348411934552455, + "grad_norm": 1.2825102806091309, + "learning_rate": 9.460485112029241e-05, + "loss": 0.8754, + "step": 26880 + }, + { + "epoch": 1.0350336862367662, + "grad_norm": 1.4492766857147217, + "learning_rate": 9.45746558037979e-05, + "loss": 1.068, + "step": 26885 + }, + { + "epoch": 1.0352261790182868, + "grad_norm": 1.4964579343795776, + "learning_rate": 9.454446098341516e-05, + "loss": 0.7369, + "step": 26890 + }, + { + "epoch": 1.0354186717998075, + "grad_norm": 1.2019277811050415, + "learning_rate": 9.45142666619053e-05, + "loss": 0.8437, + "step": 26895 + }, + { + "epoch": 1.0356111645813282, + "grad_norm": 1.22611665725708, + "learning_rate": 9.448407284202943e-05, + "loss": 0.7403, + "step": 26900 + }, + { + "epoch": 1.0358036573628489, + "grad_norm": 1.3472671508789062, + "learning_rate": 9.445387952654857e-05, + "loss": 0.8897, + "step": 26905 + }, + { + "epoch": 1.0359961501443695, + "grad_norm": 0.9485424160957336, + "learning_rate": 9.442368671822364e-05, + "loss": 1.0716, + "step": 26910 + }, + { + "epoch": 1.0361886429258902, + "grad_norm": 1.5943937301635742, + "learning_rate": 9.439349441981569e-05, + "loss": 0.797, + "step": 26915 + }, + { + "epoch": 1.036381135707411, + "grad_norm": 1.312227487564087, + "learning_rate": 9.436330263408551e-05, + "loss": 0.7671, + "step": 26920 + }, + { + "epoch": 1.0365736284889318, + "grad_norm": 1.4809763431549072, + "learning_rate": 9.433311136379393e-05, + "loss": 0.958, + "step": 26925 + }, + { + "epoch": 1.0367661212704524, + "grad_norm": 1.0630615949630737, + "learning_rate": 9.430292061170181e-05, + "loss": 0.8594, + "step": 26930 + }, + { + "epoch": 1.036958614051973, + "grad_norm": 2.018148422241211, + "learning_rate": 9.427273038056988e-05, + "loss": 0.8566, + "step": 26935 + }, + { + "epoch": 1.0371511068334938, + "grad_norm": 1.4871729612350464, + "learning_rate": 9.424254067315875e-05, + "loss": 1.0174, + "step": 26940 + }, + { + "epoch": 1.0373435996150144, + "grad_norm": 1.2022002935409546, + "learning_rate": 9.421235149222919e-05, + "loss": 0.7991, + "step": 26945 + }, + { + "epoch": 1.037536092396535, + "grad_norm": 1.7707716226577759, + "learning_rate": 9.418216284054174e-05, + "loss": 0.881, + "step": 26950 + }, + { + "epoch": 1.0377285851780558, + "grad_norm": 1.1901010274887085, + "learning_rate": 9.415197472085693e-05, + "loss": 0.7872, + "step": 26955 + }, + { + "epoch": 1.0379210779595764, + "grad_norm": 1.460284948348999, + "learning_rate": 9.412178713593532e-05, + "loss": 0.9017, + "step": 26960 + }, + { + "epoch": 1.0381135707410971, + "grad_norm": 1.8410388231277466, + "learning_rate": 9.409160008853735e-05, + "loss": 0.8422, + "step": 26965 + }, + { + "epoch": 1.038306063522618, + "grad_norm": 1.3502047061920166, + "learning_rate": 9.406141358142339e-05, + "loss": 0.6949, + "step": 26970 + }, + { + "epoch": 1.0384985563041387, + "grad_norm": 1.433706521987915, + "learning_rate": 9.403122761735382e-05, + "loss": 0.8941, + "step": 26975 + }, + { + "epoch": 1.0386910490856593, + "grad_norm": 1.3440196514129639, + "learning_rate": 9.400104219908895e-05, + "loss": 1.1485, + "step": 26980 + }, + { + "epoch": 1.03888354186718, + "grad_norm": 1.2331663370132446, + "learning_rate": 9.397085732938902e-05, + "loss": 0.7441, + "step": 26985 + }, + { + "epoch": 1.0390760346487007, + "grad_norm": 0.953507125377655, + "learning_rate": 9.394067301101425e-05, + "loss": 0.9023, + "step": 26990 + }, + { + "epoch": 1.0392685274302214, + "grad_norm": 1.993162751197815, + "learning_rate": 9.391048924672483e-05, + "loss": 0.9112, + "step": 26995 + }, + { + "epoch": 1.039461020211742, + "grad_norm": 1.193826675415039, + "learning_rate": 9.388030603928077e-05, + "loss": 0.9214, + "step": 27000 + }, + { + "epoch": 1.0396535129932627, + "grad_norm": 1.5678527355194092, + "learning_rate": 9.385012339144221e-05, + "loss": 0.8798, + "step": 27005 + }, + { + "epoch": 1.0398460057747834, + "grad_norm": 1.09235680103302, + "learning_rate": 9.381994130596916e-05, + "loss": 0.9111, + "step": 27010 + }, + { + "epoch": 1.040038498556304, + "grad_norm": 1.3763002157211304, + "learning_rate": 9.378975978562147e-05, + "loss": 0.8024, + "step": 27015 + }, + { + "epoch": 1.040230991337825, + "grad_norm": 2.1225831508636475, + "learning_rate": 9.375957883315916e-05, + "loss": 0.8644, + "step": 27020 + }, + { + "epoch": 1.0404234841193456, + "grad_norm": 1.7183401584625244, + "learning_rate": 9.372939845134197e-05, + "loss": 0.6908, + "step": 27025 + }, + { + "epoch": 1.0406159769008663, + "grad_norm": 1.1370265483856201, + "learning_rate": 9.369921864292981e-05, + "loss": 0.7209, + "step": 27030 + }, + { + "epoch": 1.040808469682387, + "grad_norm": 1.4425493478775024, + "learning_rate": 9.366903941068237e-05, + "loss": 0.9938, + "step": 27035 + }, + { + "epoch": 1.0410009624639076, + "grad_norm": 1.009415864944458, + "learning_rate": 9.363886075735928e-05, + "loss": 0.822, + "step": 27040 + }, + { + "epoch": 1.0411934552454283, + "grad_norm": 1.7961349487304688, + "learning_rate": 9.36086826857203e-05, + "loss": 0.8856, + "step": 27045 + }, + { + "epoch": 1.041385948026949, + "grad_norm": 1.6287988424301147, + "learning_rate": 9.357850519852492e-05, + "loss": 0.6491, + "step": 27050 + }, + { + "epoch": 1.0415784408084696, + "grad_norm": 0.8782758116722107, + "learning_rate": 9.354832829853268e-05, + "loss": 0.7966, + "step": 27055 + }, + { + "epoch": 1.0417709335899903, + "grad_norm": 2.51961612701416, + "learning_rate": 9.351815198850312e-05, + "loss": 0.7717, + "step": 27060 + }, + { + "epoch": 1.041963426371511, + "grad_norm": 1.5812909603118896, + "learning_rate": 9.348797627119561e-05, + "loss": 0.8912, + "step": 27065 + }, + { + "epoch": 1.0421559191530319, + "grad_norm": 2.106905460357666, + "learning_rate": 9.345780114936951e-05, + "loss": 0.8447, + "step": 27070 + }, + { + "epoch": 1.0423484119345525, + "grad_norm": 1.7063692808151245, + "learning_rate": 9.342762662578419e-05, + "loss": 0.9389, + "step": 27075 + }, + { + "epoch": 1.0425409047160732, + "grad_norm": 1.5984817743301392, + "learning_rate": 9.339745270319886e-05, + "loss": 0.9812, + "step": 27080 + }, + { + "epoch": 1.0427333974975939, + "grad_norm": 1.0404373407363892, + "learning_rate": 9.336727938437274e-05, + "loss": 0.8982, + "step": 27085 + }, + { + "epoch": 1.0429258902791145, + "grad_norm": 1.7776751518249512, + "learning_rate": 9.333710667206498e-05, + "loss": 0.7817, + "step": 27090 + }, + { + "epoch": 1.0431183830606352, + "grad_norm": 1.092366099357605, + "learning_rate": 9.330693456903471e-05, + "loss": 0.8735, + "step": 27095 + }, + { + "epoch": 1.0433108758421559, + "grad_norm": 1.3183677196502686, + "learning_rate": 9.327676307804088e-05, + "loss": 0.855, + "step": 27100 + }, + { + "epoch": 1.0435033686236765, + "grad_norm": 1.6006138324737549, + "learning_rate": 9.324659220184256e-05, + "loss": 0.9884, + "step": 27105 + }, + { + "epoch": 1.0436958614051972, + "grad_norm": 2.0356695652008057, + "learning_rate": 9.321642194319867e-05, + "loss": 0.9388, + "step": 27110 + }, + { + "epoch": 1.043888354186718, + "grad_norm": 1.6827491521835327, + "learning_rate": 9.3186252304868e-05, + "loss": 0.8646, + "step": 27115 + }, + { + "epoch": 1.0440808469682388, + "grad_norm": 1.6442832946777344, + "learning_rate": 9.315608328960946e-05, + "loss": 0.9892, + "step": 27120 + }, + { + "epoch": 1.0442733397497594, + "grad_norm": 2.0704457759857178, + "learning_rate": 9.312591490018176e-05, + "loss": 0.7849, + "step": 27125 + }, + { + "epoch": 1.0444658325312801, + "grad_norm": 1.5209687948226929, + "learning_rate": 9.309574713934359e-05, + "loss": 0.7079, + "step": 27130 + }, + { + "epoch": 1.0446583253128008, + "grad_norm": 1.6896909475326538, + "learning_rate": 9.306558000985363e-05, + "loss": 0.8909, + "step": 27135 + }, + { + "epoch": 1.0448508180943215, + "grad_norm": 1.2594019174575806, + "learning_rate": 9.303541351447043e-05, + "loss": 0.9134, + "step": 27140 + }, + { + "epoch": 1.0450433108758421, + "grad_norm": 1.4927109479904175, + "learning_rate": 9.300524765595252e-05, + "loss": 0.9467, + "step": 27145 + }, + { + "epoch": 1.0452358036573628, + "grad_norm": 2.307347059249878, + "learning_rate": 9.297508243705838e-05, + "loss": 0.9415, + "step": 27150 + }, + { + "epoch": 1.0454282964388835, + "grad_norm": 1.2363544702529907, + "learning_rate": 9.294491786054645e-05, + "loss": 0.8342, + "step": 27155 + }, + { + "epoch": 1.0456207892204041, + "grad_norm": 1.7037842273712158, + "learning_rate": 9.291475392917498e-05, + "loss": 0.8995, + "step": 27160 + }, + { + "epoch": 1.045813282001925, + "grad_norm": 1.9591143131256104, + "learning_rate": 9.288459064570238e-05, + "loss": 0.9448, + "step": 27165 + }, + { + "epoch": 1.0460057747834457, + "grad_norm": 1.5714846849441528, + "learning_rate": 9.285442801288681e-05, + "loss": 0.7211, + "step": 27170 + }, + { + "epoch": 1.0461982675649664, + "grad_norm": 1.2620651721954346, + "learning_rate": 9.282426603348648e-05, + "loss": 0.8678, + "step": 27175 + }, + { + "epoch": 1.046390760346487, + "grad_norm": 1.3091076612472534, + "learning_rate": 9.279410471025946e-05, + "loss": 0.947, + "step": 27180 + }, + { + "epoch": 1.0465832531280077, + "grad_norm": 1.055066704750061, + "learning_rate": 9.276394404596384e-05, + "loss": 0.8502, + "step": 27185 + }, + { + "epoch": 1.0467757459095284, + "grad_norm": 1.4299228191375732, + "learning_rate": 9.27337840433576e-05, + "loss": 0.8472, + "step": 27190 + }, + { + "epoch": 1.046968238691049, + "grad_norm": 1.315097451210022, + "learning_rate": 9.27036247051987e-05, + "loss": 0.9291, + "step": 27195 + }, + { + "epoch": 1.0471607314725697, + "grad_norm": 1.491268515586853, + "learning_rate": 9.267346603424497e-05, + "loss": 0.8054, + "step": 27200 + }, + { + "epoch": 1.0473532242540904, + "grad_norm": 1.2065061330795288, + "learning_rate": 9.264330803325422e-05, + "loss": 0.7778, + "step": 27205 + }, + { + "epoch": 1.0475457170356113, + "grad_norm": 1.1134014129638672, + "learning_rate": 9.261315070498425e-05, + "loss": 0.9711, + "step": 27210 + }, + { + "epoch": 1.047738209817132, + "grad_norm": 1.3288688659667969, + "learning_rate": 9.258299405219271e-05, + "loss": 0.833, + "step": 27215 + }, + { + "epoch": 1.0479307025986526, + "grad_norm": 1.440891146659851, + "learning_rate": 9.255283807763719e-05, + "loss": 0.9198, + "step": 27220 + }, + { + "epoch": 1.0481231953801733, + "grad_norm": 1.3492053747177124, + "learning_rate": 9.252268278407532e-05, + "loss": 0.7503, + "step": 27225 + }, + { + "epoch": 1.048315688161694, + "grad_norm": 0.9512596726417542, + "learning_rate": 9.249252817426459e-05, + "loss": 0.8406, + "step": 27230 + }, + { + "epoch": 1.0485081809432146, + "grad_norm": 1.8228788375854492, + "learning_rate": 9.24623742509624e-05, + "loss": 0.8661, + "step": 27235 + }, + { + "epoch": 1.0487006737247353, + "grad_norm": 1.9597275257110596, + "learning_rate": 9.243222101692617e-05, + "loss": 0.8972, + "step": 27240 + }, + { + "epoch": 1.048893166506256, + "grad_norm": 0.8910306692123413, + "learning_rate": 9.24020684749132e-05, + "loss": 0.7583, + "step": 27245 + }, + { + "epoch": 1.0490856592877766, + "grad_norm": 1.1714658737182617, + "learning_rate": 9.237191662768071e-05, + "loss": 0.7962, + "step": 27250 + }, + { + "epoch": 1.0492781520692973, + "grad_norm": 1.0180914402008057, + "learning_rate": 9.234176547798595e-05, + "loss": 0.7615, + "step": 27255 + }, + { + "epoch": 1.0494706448508182, + "grad_norm": 1.5986478328704834, + "learning_rate": 9.231161502858599e-05, + "loss": 0.6797, + "step": 27260 + }, + { + "epoch": 1.0496631376323389, + "grad_norm": 1.1098225116729736, + "learning_rate": 9.228146528223787e-05, + "loss": 0.867, + "step": 27265 + }, + { + "epoch": 1.0498556304138595, + "grad_norm": 1.7311850786209106, + "learning_rate": 9.225131624169868e-05, + "loss": 0.8742, + "step": 27270 + }, + { + "epoch": 1.0500481231953802, + "grad_norm": 1.0296027660369873, + "learning_rate": 9.222116790972526e-05, + "loss": 0.9372, + "step": 27275 + }, + { + "epoch": 1.0502406159769009, + "grad_norm": 1.6226547956466675, + "learning_rate": 9.219102028907448e-05, + "loss": 0.7823, + "step": 27280 + }, + { + "epoch": 1.0504331087584216, + "grad_norm": 1.745197057723999, + "learning_rate": 9.216087338250321e-05, + "loss": 0.7434, + "step": 27285 + }, + { + "epoch": 1.0506256015399422, + "grad_norm": 1.3191584348678589, + "learning_rate": 9.21307271927681e-05, + "loss": 1.0177, + "step": 27290 + }, + { + "epoch": 1.050818094321463, + "grad_norm": 1.4447453022003174, + "learning_rate": 9.21005817226259e-05, + "loss": 0.7972, + "step": 27295 + }, + { + "epoch": 1.0510105871029836, + "grad_norm": 1.4396947622299194, + "learning_rate": 9.207043697483315e-05, + "loss": 0.901, + "step": 27300 + }, + { + "epoch": 1.0512030798845042, + "grad_norm": 2.1123616695404053, + "learning_rate": 9.204029295214641e-05, + "loss": 1.0325, + "step": 27305 + }, + { + "epoch": 1.0513955726660251, + "grad_norm": 1.1420029401779175, + "learning_rate": 9.201014965732216e-05, + "loss": 0.8349, + "step": 27310 + }, + { + "epoch": 1.0515880654475458, + "grad_norm": 1.5591840744018555, + "learning_rate": 9.198000709311681e-05, + "loss": 0.8653, + "step": 27315 + }, + { + "epoch": 1.0517805582290665, + "grad_norm": 1.5964603424072266, + "learning_rate": 9.194986526228662e-05, + "loss": 0.9203, + "step": 27320 + }, + { + "epoch": 1.0519730510105871, + "grad_norm": 1.5072814226150513, + "learning_rate": 9.191972416758798e-05, + "loss": 0.8917, + "step": 27325 + }, + { + "epoch": 1.0521655437921078, + "grad_norm": 1.7254325151443481, + "learning_rate": 9.188958381177705e-05, + "loss": 0.7569, + "step": 27330 + }, + { + "epoch": 1.0523580365736285, + "grad_norm": 1.856126070022583, + "learning_rate": 9.18594441976099e-05, + "loss": 0.8379, + "step": 27335 + }, + { + "epoch": 1.0525505293551491, + "grad_norm": 1.1711968183517456, + "learning_rate": 9.18293053278427e-05, + "loss": 0.9595, + "step": 27340 + }, + { + "epoch": 1.0527430221366698, + "grad_norm": 0.8440997004508972, + "learning_rate": 9.179916720523138e-05, + "loss": 0.8221, + "step": 27345 + }, + { + "epoch": 1.0529355149181905, + "grad_norm": 1.532020092010498, + "learning_rate": 9.176902983253187e-05, + "loss": 0.8155, + "step": 27350 + }, + { + "epoch": 1.0531280076997112, + "grad_norm": 1.3105454444885254, + "learning_rate": 9.173889321250009e-05, + "loss": 1.0119, + "step": 27355 + }, + { + "epoch": 1.053320500481232, + "grad_norm": 1.6066848039627075, + "learning_rate": 9.170875734789182e-05, + "loss": 0.8037, + "step": 27360 + }, + { + "epoch": 1.0535129932627527, + "grad_norm": 0.9645727872848511, + "learning_rate": 9.167862224146271e-05, + "loss": 0.8052, + "step": 27365 + }, + { + "epoch": 1.0537054860442734, + "grad_norm": 1.2492599487304688, + "learning_rate": 9.164848789596851e-05, + "loss": 0.8299, + "step": 27370 + }, + { + "epoch": 1.053897978825794, + "grad_norm": 1.861069679260254, + "learning_rate": 9.161835431416478e-05, + "loss": 0.8103, + "step": 27375 + }, + { + "epoch": 1.0540904716073147, + "grad_norm": 1.3065990209579468, + "learning_rate": 9.158822149880699e-05, + "loss": 0.7432, + "step": 27380 + }, + { + "epoch": 1.0542829643888354, + "grad_norm": 1.0527342557907104, + "learning_rate": 9.155808945265063e-05, + "loss": 0.9543, + "step": 27385 + }, + { + "epoch": 1.054475457170356, + "grad_norm": 1.0329846143722534, + "learning_rate": 9.152795817845109e-05, + "loss": 0.7798, + "step": 27390 + }, + { + "epoch": 1.0546679499518767, + "grad_norm": 1.2137219905853271, + "learning_rate": 9.149782767896364e-05, + "loss": 0.8731, + "step": 27395 + }, + { + "epoch": 1.0548604427333974, + "grad_norm": 1.7884348630905151, + "learning_rate": 9.146769795694351e-05, + "loss": 0.7893, + "step": 27400 + }, + { + "epoch": 1.0550529355149183, + "grad_norm": 1.842133641242981, + "learning_rate": 9.143756901514591e-05, + "loss": 1.0068, + "step": 27405 + }, + { + "epoch": 1.055245428296439, + "grad_norm": 1.8429181575775146, + "learning_rate": 9.140744085632587e-05, + "loss": 0.8311, + "step": 27410 + }, + { + "epoch": 1.0554379210779596, + "grad_norm": 1.6385420560836792, + "learning_rate": 9.137731348323848e-05, + "loss": 0.9761, + "step": 27415 + }, + { + "epoch": 1.0556304138594803, + "grad_norm": 1.289536714553833, + "learning_rate": 9.134718689863863e-05, + "loss": 0.9112, + "step": 27420 + }, + { + "epoch": 1.055822906641001, + "grad_norm": 1.3178956508636475, + "learning_rate": 9.13170611052812e-05, + "loss": 0.9767, + "step": 27425 + }, + { + "epoch": 1.0560153994225217, + "grad_norm": 1.588117003440857, + "learning_rate": 9.128693610592104e-05, + "loss": 0.9646, + "step": 27430 + }, + { + "epoch": 1.0562078922040423, + "grad_norm": 2.409613609313965, + "learning_rate": 9.125681190331287e-05, + "loss": 0.9672, + "step": 27435 + }, + { + "epoch": 1.056400384985563, + "grad_norm": 2.1366968154907227, + "learning_rate": 9.123271311673885e-05, + "loss": 1.0157, + "step": 27440 + }, + { + "epoch": 1.0565928777670837, + "grad_norm": 1.3184314966201782, + "learning_rate": 9.12025903552259e-05, + "loss": 0.7433, + "step": 27445 + }, + { + "epoch": 1.0567853705486043, + "grad_norm": 1.710648775100708, + "learning_rate": 9.117246839817776e-05, + "loss": 1.145, + "step": 27450 + }, + { + "epoch": 1.0569778633301252, + "grad_norm": 1.0037586688995361, + "learning_rate": 9.114234724834895e-05, + "loss": 0.8956, + "step": 27455 + }, + { + "epoch": 1.057170356111646, + "grad_norm": 1.9379615783691406, + "learning_rate": 9.11122269084938e-05, + "loss": 0.8227, + "step": 27460 + }, + { + "epoch": 1.0573628488931666, + "grad_norm": 1.1451090574264526, + "learning_rate": 9.108210738136661e-05, + "loss": 1.0623, + "step": 27465 + }, + { + "epoch": 1.0575553416746872, + "grad_norm": 1.3829764127731323, + "learning_rate": 9.105198866972162e-05, + "loss": 0.6903, + "step": 27470 + }, + { + "epoch": 1.057747834456208, + "grad_norm": 0.7273270487785339, + "learning_rate": 9.102187077631302e-05, + "loss": 0.7338, + "step": 27475 + }, + { + "epoch": 1.0579403272377286, + "grad_norm": 1.3922228813171387, + "learning_rate": 9.099175370389482e-05, + "loss": 0.9767, + "step": 27480 + }, + { + "epoch": 1.0581328200192492, + "grad_norm": 1.5572834014892578, + "learning_rate": 9.096163745522112e-05, + "loss": 0.7942, + "step": 27485 + }, + { + "epoch": 1.05832531280077, + "grad_norm": 1.2158164978027344, + "learning_rate": 9.093152203304578e-05, + "loss": 0.8304, + "step": 27490 + }, + { + "epoch": 1.0585178055822906, + "grad_norm": 1.2715733051300049, + "learning_rate": 9.090140744012265e-05, + "loss": 0.7802, + "step": 27495 + }, + { + "epoch": 1.0587102983638113, + "grad_norm": 1.2508288621902466, + "learning_rate": 9.087129367920557e-05, + "loss": 0.8579, + "step": 27500 + }, + { + "epoch": 1.0589027911453321, + "grad_norm": 1.9734135866165161, + "learning_rate": 9.084118075304821e-05, + "loss": 0.8703, + "step": 27505 + }, + { + "epoch": 1.0590952839268528, + "grad_norm": 1.4569380283355713, + "learning_rate": 9.081106866440415e-05, + "loss": 0.9245, + "step": 27510 + }, + { + "epoch": 1.0592877767083735, + "grad_norm": 1.2966618537902832, + "learning_rate": 9.078095741602704e-05, + "loss": 0.8588, + "step": 27515 + }, + { + "epoch": 1.0594802694898942, + "grad_norm": 1.5596932172775269, + "learning_rate": 9.07508470106703e-05, + "loss": 0.8499, + "step": 27520 + }, + { + "epoch": 1.0596727622714148, + "grad_norm": 1.3186569213867188, + "learning_rate": 9.07207374510873e-05, + "loss": 0.825, + "step": 27525 + }, + { + "epoch": 1.0598652550529355, + "grad_norm": 1.393056869506836, + "learning_rate": 9.069062874003141e-05, + "loss": 0.7853, + "step": 27530 + }, + { + "epoch": 1.0600577478344562, + "grad_norm": 1.091132402420044, + "learning_rate": 9.066052088025587e-05, + "loss": 0.9176, + "step": 27535 + }, + { + "epoch": 1.0602502406159768, + "grad_norm": 2.1310064792633057, + "learning_rate": 9.063041387451378e-05, + "loss": 0.6352, + "step": 27540 + }, + { + "epoch": 1.0604427333974975, + "grad_norm": 1.153936505317688, + "learning_rate": 9.060030772555833e-05, + "loss": 0.9243, + "step": 27545 + }, + { + "epoch": 1.0606352261790182, + "grad_norm": 1.3241852521896362, + "learning_rate": 9.05702024361424e-05, + "loss": 0.7682, + "step": 27550 + }, + { + "epoch": 1.060827718960539, + "grad_norm": 1.4337151050567627, + "learning_rate": 9.054009800901906e-05, + "loss": 0.8711, + "step": 27555 + }, + { + "epoch": 1.0610202117420597, + "grad_norm": 1.3560181856155396, + "learning_rate": 9.050999444694108e-05, + "loss": 0.8178, + "step": 27560 + }, + { + "epoch": 1.0612127045235804, + "grad_norm": 1.4069291353225708, + "learning_rate": 9.047989175266123e-05, + "loss": 0.7403, + "step": 27565 + }, + { + "epoch": 1.061405197305101, + "grad_norm": 2.0509390830993652, + "learning_rate": 9.044978992893219e-05, + "loss": 1.0252, + "step": 27570 + }, + { + "epoch": 1.0615976900866217, + "grad_norm": 0.6099883913993835, + "learning_rate": 9.041968897850664e-05, + "loss": 0.8478, + "step": 27575 + }, + { + "epoch": 1.0617901828681424, + "grad_norm": 1.1845272779464722, + "learning_rate": 9.038958890413705e-05, + "loss": 0.9869, + "step": 27580 + }, + { + "epoch": 1.061982675649663, + "grad_norm": 1.2324453592300415, + "learning_rate": 9.035948970857589e-05, + "loss": 0.8353, + "step": 27585 + }, + { + "epoch": 1.0621751684311838, + "grad_norm": 2.6329379081726074, + "learning_rate": 9.032939139457556e-05, + "loss": 0.8846, + "step": 27590 + }, + { + "epoch": 1.0623676612127044, + "grad_norm": 1.4631036520004272, + "learning_rate": 9.029929396488826e-05, + "loss": 0.8486, + "step": 27595 + }, + { + "epoch": 1.0625601539942253, + "grad_norm": 2.0869596004486084, + "learning_rate": 9.026919742226633e-05, + "loss": 0.9341, + "step": 27600 + }, + { + "epoch": 1.062752646775746, + "grad_norm": 1.2778350114822388, + "learning_rate": 9.023910176946182e-05, + "loss": 0.7416, + "step": 27605 + }, + { + "epoch": 1.0629451395572667, + "grad_norm": 1.2002792358398438, + "learning_rate": 9.020900700922675e-05, + "loss": 0.8239, + "step": 27610 + }, + { + "epoch": 1.0631376323387873, + "grad_norm": 1.811063528060913, + "learning_rate": 9.017891314431318e-05, + "loss": 0.961, + "step": 27615 + }, + { + "epoch": 1.063330125120308, + "grad_norm": 1.2313002347946167, + "learning_rate": 9.014882017747294e-05, + "loss": 0.987, + "step": 27620 + }, + { + "epoch": 1.0635226179018287, + "grad_norm": 1.319166898727417, + "learning_rate": 9.01187281114578e-05, + "loss": 0.8669, + "step": 27625 + }, + { + "epoch": 1.0637151106833493, + "grad_norm": 1.418129801750183, + "learning_rate": 9.008863694901955e-05, + "loss": 0.8651, + "step": 27630 + }, + { + "epoch": 1.06390760346487, + "grad_norm": 1.349688172340393, + "learning_rate": 9.00585466929098e-05, + "loss": 0.7572, + "step": 27635 + }, + { + "epoch": 1.0641000962463907, + "grad_norm": 1.4476646184921265, + "learning_rate": 9.002845734588005e-05, + "loss": 0.8944, + "step": 27640 + }, + { + "epoch": 1.0642925890279114, + "grad_norm": 2.580551862716675, + "learning_rate": 8.999836891068187e-05, + "loss": 0.6778, + "step": 27645 + }, + { + "epoch": 1.0644850818094322, + "grad_norm": 1.5499801635742188, + "learning_rate": 8.99682813900666e-05, + "loss": 0.7813, + "step": 27650 + }, + { + "epoch": 1.064677574590953, + "grad_norm": 3.0829336643218994, + "learning_rate": 8.993819478678549e-05, + "loss": 0.9905, + "step": 27655 + }, + { + "epoch": 1.0648700673724736, + "grad_norm": 1.489709734916687, + "learning_rate": 8.990810910358987e-05, + "loss": 0.9231, + "step": 27660 + }, + { + "epoch": 1.0650625601539943, + "grad_norm": 1.199367642402649, + "learning_rate": 8.987802434323081e-05, + "loss": 0.971, + "step": 27665 + }, + { + "epoch": 1.065255052935515, + "grad_norm": 1.2478957176208496, + "learning_rate": 8.984794050845933e-05, + "loss": 0.9586, + "step": 27670 + }, + { + "epoch": 1.0654475457170356, + "grad_norm": 1.5783430337905884, + "learning_rate": 8.981785760202647e-05, + "loss": 0.7706, + "step": 27675 + }, + { + "epoch": 1.0656400384985563, + "grad_norm": 1.6337023973464966, + "learning_rate": 8.97877756266831e-05, + "loss": 0.7538, + "step": 27680 + }, + { + "epoch": 1.065832531280077, + "grad_norm": 1.195513367652893, + "learning_rate": 8.975769458517997e-05, + "loss": 0.9013, + "step": 27685 + }, + { + "epoch": 1.0660250240615976, + "grad_norm": 1.2982577085494995, + "learning_rate": 8.972761448026785e-05, + "loss": 0.9172, + "step": 27690 + }, + { + "epoch": 1.0662175168431185, + "grad_norm": 1.3517059087753296, + "learning_rate": 8.96975353146973e-05, + "loss": 0.8085, + "step": 27695 + }, + { + "epoch": 1.0664100096246392, + "grad_norm": 0.9628300666809082, + "learning_rate": 8.966745709121893e-05, + "loss": 0.9518, + "step": 27700 + }, + { + "epoch": 1.0666025024061598, + "grad_norm": 1.872689127922058, + "learning_rate": 8.963737981258315e-05, + "loss": 0.8118, + "step": 27705 + }, + { + "epoch": 1.0667949951876805, + "grad_norm": 1.307530403137207, + "learning_rate": 8.960730348154037e-05, + "loss": 0.7289, + "step": 27710 + }, + { + "epoch": 1.0669874879692012, + "grad_norm": 1.127726435661316, + "learning_rate": 8.957722810084077e-05, + "loss": 0.8415, + "step": 27715 + }, + { + "epoch": 1.0671799807507218, + "grad_norm": 1.2474530935287476, + "learning_rate": 8.954715367323468e-05, + "loss": 0.9341, + "step": 27720 + }, + { + "epoch": 1.0673724735322425, + "grad_norm": 1.6957672834396362, + "learning_rate": 8.951708020147212e-05, + "loss": 0.9228, + "step": 27725 + }, + { + "epoch": 1.0675649663137632, + "grad_norm": 2.220019817352295, + "learning_rate": 8.94870076883031e-05, + "loss": 0.9219, + "step": 27730 + }, + { + "epoch": 1.0677574590952839, + "grad_norm": 0.9045952558517456, + "learning_rate": 8.945693613647763e-05, + "loss": 0.6171, + "step": 27735 + }, + { + "epoch": 1.0679499518768045, + "grad_norm": 1.5750988721847534, + "learning_rate": 8.94268655487455e-05, + "loss": 0.9442, + "step": 27740 + }, + { + "epoch": 1.0681424446583252, + "grad_norm": 1.328931450843811, + "learning_rate": 8.939679592785646e-05, + "loss": 0.8479, + "step": 27745 + }, + { + "epoch": 1.068334937439846, + "grad_norm": 1.809605598449707, + "learning_rate": 8.936672727656021e-05, + "loss": 0.8478, + "step": 27750 + }, + { + "epoch": 1.0685274302213668, + "grad_norm": 1.630718469619751, + "learning_rate": 8.933665959760632e-05, + "loss": 0.7898, + "step": 27755 + }, + { + "epoch": 1.0687199230028874, + "grad_norm": 1.7636911869049072, + "learning_rate": 8.930659289374422e-05, + "loss": 0.8724, + "step": 27760 + }, + { + "epoch": 1.068912415784408, + "grad_norm": 1.659645915031433, + "learning_rate": 8.927652716772342e-05, + "loss": 1.0174, + "step": 27765 + }, + { + "epoch": 1.0691049085659288, + "grad_norm": 2.0495598316192627, + "learning_rate": 8.924646242229317e-05, + "loss": 0.8524, + "step": 27770 + }, + { + "epoch": 1.0692974013474494, + "grad_norm": 0.9576983451843262, + "learning_rate": 8.921639866020264e-05, + "loss": 0.8567, + "step": 27775 + }, + { + "epoch": 1.06948989412897, + "grad_norm": 1.3908922672271729, + "learning_rate": 8.918633588420106e-05, + "loss": 0.9384, + "step": 27780 + }, + { + "epoch": 1.0696823869104908, + "grad_norm": 1.0620615482330322, + "learning_rate": 8.915627409703745e-05, + "loss": 0.9836, + "step": 27785 + }, + { + "epoch": 1.0698748796920114, + "grad_norm": 1.0709110498428345, + "learning_rate": 8.912621330146071e-05, + "loss": 0.9535, + "step": 27790 + }, + { + "epoch": 1.0700673724735323, + "grad_norm": 1.297035813331604, + "learning_rate": 8.909615350021973e-05, + "loss": 0.8007, + "step": 27795 + }, + { + "epoch": 1.070259865255053, + "grad_norm": 0.9443962574005127, + "learning_rate": 8.906609469606329e-05, + "loss": 0.9376, + "step": 27800 + }, + { + "epoch": 1.0704523580365737, + "grad_norm": 2.0467529296875, + "learning_rate": 8.903603689174006e-05, + "loss": 0.9165, + "step": 27805 + }, + { + "epoch": 1.0706448508180944, + "grad_norm": 1.3403770923614502, + "learning_rate": 8.900598008999864e-05, + "loss": 0.8009, + "step": 27810 + }, + { + "epoch": 1.070837343599615, + "grad_norm": 1.8648827075958252, + "learning_rate": 8.897592429358746e-05, + "loss": 0.8834, + "step": 27815 + }, + { + "epoch": 1.0710298363811357, + "grad_norm": 1.1583093404769897, + "learning_rate": 8.894586950525502e-05, + "loss": 0.8895, + "step": 27820 + }, + { + "epoch": 1.0712223291626564, + "grad_norm": 1.1083035469055176, + "learning_rate": 8.89158157277496e-05, + "loss": 0.9152, + "step": 27825 + }, + { + "epoch": 1.071414821944177, + "grad_norm": 0.8488582372665405, + "learning_rate": 8.888576296381936e-05, + "loss": 0.6838, + "step": 27830 + }, + { + "epoch": 1.0716073147256977, + "grad_norm": 2.6488595008850098, + "learning_rate": 8.885571121621251e-05, + "loss": 0.9074, + "step": 27835 + }, + { + "epoch": 1.0717998075072184, + "grad_norm": 1.0296202898025513, + "learning_rate": 8.882566048767705e-05, + "loss": 0.8427, + "step": 27840 + }, + { + "epoch": 1.0719923002887393, + "grad_norm": 1.2265279293060303, + "learning_rate": 8.879561078096088e-05, + "loss": 0.9545, + "step": 27845 + }, + { + "epoch": 1.07218479307026, + "grad_norm": 1.3146073818206787, + "learning_rate": 8.87655620988119e-05, + "loss": 0.8112, + "step": 27850 + }, + { + "epoch": 1.0723772858517806, + "grad_norm": 1.1017197370529175, + "learning_rate": 8.873551444397785e-05, + "loss": 0.739, + "step": 27855 + }, + { + "epoch": 1.0725697786333013, + "grad_norm": 1.3772103786468506, + "learning_rate": 8.870546781920636e-05, + "loss": 0.86, + "step": 27860 + }, + { + "epoch": 1.072762271414822, + "grad_norm": 1.4853140115737915, + "learning_rate": 8.867542222724504e-05, + "loss": 0.9195, + "step": 27865 + }, + { + "epoch": 1.0729547641963426, + "grad_norm": 1.3116743564605713, + "learning_rate": 8.864537767084135e-05, + "loss": 0.8492, + "step": 27870 + }, + { + "epoch": 1.0731472569778633, + "grad_norm": 1.78131103515625, + "learning_rate": 8.861533415274262e-05, + "loss": 0.8794, + "step": 27875 + }, + { + "epoch": 1.073339749759384, + "grad_norm": 1.1533085107803345, + "learning_rate": 8.858529167569619e-05, + "loss": 0.952, + "step": 27880 + }, + { + "epoch": 1.0735322425409046, + "grad_norm": 2.657715320587158, + "learning_rate": 8.855525024244923e-05, + "loss": 0.885, + "step": 27885 + }, + { + "epoch": 1.0737247353224255, + "grad_norm": 1.6451362371444702, + "learning_rate": 8.85252098557488e-05, + "loss": 0.823, + "step": 27890 + }, + { + "epoch": 1.0739172281039462, + "grad_norm": 1.7528831958770752, + "learning_rate": 8.84951705183419e-05, + "loss": 0.7666, + "step": 27895 + }, + { + "epoch": 1.0741097208854669, + "grad_norm": 1.086206078529358, + "learning_rate": 8.846513223297549e-05, + "loss": 0.8837, + "step": 27900 + }, + { + "epoch": 1.0743022136669875, + "grad_norm": 1.413942575454712, + "learning_rate": 8.843509500239628e-05, + "loss": 0.737, + "step": 27905 + }, + { + "epoch": 1.0744947064485082, + "grad_norm": 1.2918885946273804, + "learning_rate": 8.840505882935106e-05, + "loss": 0.7301, + "step": 27910 + }, + { + "epoch": 1.0746871992300289, + "grad_norm": 1.7935266494750977, + "learning_rate": 8.837502371658639e-05, + "loss": 0.9754, + "step": 27915 + }, + { + "epoch": 1.0748796920115495, + "grad_norm": 1.8554896116256714, + "learning_rate": 8.834498966684876e-05, + "loss": 1.0135, + "step": 27920 + }, + { + "epoch": 1.0750721847930702, + "grad_norm": 1.3628082275390625, + "learning_rate": 8.831495668288467e-05, + "loss": 0.836, + "step": 27925 + }, + { + "epoch": 1.0752646775745909, + "grad_norm": 1.181231141090393, + "learning_rate": 8.828492476744037e-05, + "loss": 1.0394, + "step": 27930 + }, + { + "epoch": 1.0754571703561115, + "grad_norm": 1.8281370401382446, + "learning_rate": 8.825489392326205e-05, + "loss": 0.8396, + "step": 27935 + }, + { + "epoch": 1.0756496631376324, + "grad_norm": 1.069940209388733, + "learning_rate": 8.822486415309592e-05, + "loss": 0.9147, + "step": 27940 + }, + { + "epoch": 1.075842155919153, + "grad_norm": 1.711498737335205, + "learning_rate": 8.819483545968797e-05, + "loss": 0.7523, + "step": 27945 + }, + { + "epoch": 1.0760346487006738, + "grad_norm": 1.5465821027755737, + "learning_rate": 8.816480784578406e-05, + "loss": 0.8782, + "step": 27950 + }, + { + "epoch": 1.0762271414821944, + "grad_norm": 1.0662082433700562, + "learning_rate": 8.813478131413013e-05, + "loss": 0.6925, + "step": 27955 + }, + { + "epoch": 1.0764196342637151, + "grad_norm": 0.8745738863945007, + "learning_rate": 8.810475586747183e-05, + "loss": 0.7974, + "step": 27960 + }, + { + "epoch": 1.0766121270452358, + "grad_norm": 1.5703550577163696, + "learning_rate": 8.807473150855476e-05, + "loss": 0.7965, + "step": 27965 + }, + { + "epoch": 1.0768046198267565, + "grad_norm": 2.1191911697387695, + "learning_rate": 8.804470824012456e-05, + "loss": 0.8466, + "step": 27970 + }, + { + "epoch": 1.0769971126082771, + "grad_norm": 2.149782419204712, + "learning_rate": 8.801468606492659e-05, + "loss": 1.009, + "step": 27975 + }, + { + "epoch": 1.0771896053897978, + "grad_norm": 1.0900071859359741, + "learning_rate": 8.798466498570614e-05, + "loss": 0.8092, + "step": 27980 + }, + { + "epoch": 1.0773820981713187, + "grad_norm": 1.022477149963379, + "learning_rate": 8.795464500520851e-05, + "loss": 0.9038, + "step": 27985 + }, + { + "epoch": 1.0775745909528394, + "grad_norm": 1.4904948472976685, + "learning_rate": 8.792462612617882e-05, + "loss": 0.8276, + "step": 27990 + }, + { + "epoch": 1.07776708373436, + "grad_norm": 1.8655019998550415, + "learning_rate": 8.789460835136203e-05, + "loss": 0.9897, + "step": 27995 + }, + { + "epoch": 1.0779595765158807, + "grad_norm": 1.3288123607635498, + "learning_rate": 8.786459168350316e-05, + "loss": 0.7634, + "step": 28000 + }, + { + "epoch": 1.0781520692974014, + "grad_norm": 1.4979158639907837, + "learning_rate": 8.783457612534699e-05, + "loss": 1.0058, + "step": 28005 + }, + { + "epoch": 1.078344562078922, + "grad_norm": 0.987000048160553, + "learning_rate": 8.780456167963821e-05, + "loss": 0.7555, + "step": 28010 + }, + { + "epoch": 1.0785370548604427, + "grad_norm": 1.124505639076233, + "learning_rate": 8.77745483491215e-05, + "loss": 0.8965, + "step": 28015 + }, + { + "epoch": 1.0787295476419634, + "grad_norm": 1.8975436687469482, + "learning_rate": 8.774453613654136e-05, + "loss": 0.7971, + "step": 28020 + }, + { + "epoch": 1.078922040423484, + "grad_norm": 2.273435115814209, + "learning_rate": 8.77145250446422e-05, + "loss": 0.9065, + "step": 28025 + }, + { + "epoch": 1.0791145332050047, + "grad_norm": 2.141667366027832, + "learning_rate": 8.768451507616834e-05, + "loss": 0.937, + "step": 28030 + }, + { + "epoch": 1.0793070259865254, + "grad_norm": 1.4163758754730225, + "learning_rate": 8.7654506233864e-05, + "loss": 0.8959, + "step": 28035 + }, + { + "epoch": 1.0794995187680463, + "grad_norm": 2.3507914543151855, + "learning_rate": 8.762449852047326e-05, + "loss": 0.9258, + "step": 28040 + }, + { + "epoch": 1.079692011549567, + "grad_norm": 1.0547107458114624, + "learning_rate": 8.759449193874018e-05, + "loss": 0.9966, + "step": 28045 + }, + { + "epoch": 1.0798845043310876, + "grad_norm": 1.1531494855880737, + "learning_rate": 8.756448649140864e-05, + "loss": 0.8557, + "step": 28050 + }, + { + "epoch": 1.0800769971126083, + "grad_norm": 0.93568354845047, + "learning_rate": 8.753448218122239e-05, + "loss": 0.9067, + "step": 28055 + }, + { + "epoch": 1.080269489894129, + "grad_norm": 1.3804124593734741, + "learning_rate": 8.75044790109252e-05, + "loss": 0.9757, + "step": 28060 + }, + { + "epoch": 1.0804619826756496, + "grad_norm": 1.065398097038269, + "learning_rate": 8.747447698326058e-05, + "loss": 0.8644, + "step": 28065 + }, + { + "epoch": 1.0806544754571703, + "grad_norm": 2.9915854930877686, + "learning_rate": 8.744447610097213e-05, + "loss": 0.8151, + "step": 28070 + }, + { + "epoch": 1.080846968238691, + "grad_norm": 1.506426453590393, + "learning_rate": 8.741447636680317e-05, + "loss": 1.007, + "step": 28075 + }, + { + "epoch": 1.0810394610202116, + "grad_norm": 0.8236234784126282, + "learning_rate": 8.738447778349692e-05, + "loss": 0.8457, + "step": 28080 + }, + { + "epoch": 1.0812319538017325, + "grad_norm": 1.224062442779541, + "learning_rate": 8.735448035379665e-05, + "loss": 0.8969, + "step": 28085 + }, + { + "epoch": 1.0814244465832532, + "grad_norm": 1.5686476230621338, + "learning_rate": 8.732448408044539e-05, + "loss": 0.8932, + "step": 28090 + }, + { + "epoch": 1.0816169393647739, + "grad_norm": 1.104203462600708, + "learning_rate": 8.729448896618603e-05, + "loss": 0.8526, + "step": 28095 + }, + { + "epoch": 1.0818094321462945, + "grad_norm": 0.723469614982605, + "learning_rate": 8.726449501376153e-05, + "loss": 0.761, + "step": 28100 + }, + { + "epoch": 1.0820019249278152, + "grad_norm": 1.4393385648727417, + "learning_rate": 8.723450222591462e-05, + "loss": 0.876, + "step": 28105 + }, + { + "epoch": 1.0821944177093359, + "grad_norm": 1.0453604459762573, + "learning_rate": 8.720451060538789e-05, + "loss": 0.7178, + "step": 28110 + }, + { + "epoch": 1.0823869104908566, + "grad_norm": 1.7050689458847046, + "learning_rate": 8.717452015492388e-05, + "loss": 0.8199, + "step": 28115 + }, + { + "epoch": 1.0825794032723772, + "grad_norm": 1.400423526763916, + "learning_rate": 8.71445308772651e-05, + "loss": 0.8114, + "step": 28120 + }, + { + "epoch": 1.082771896053898, + "grad_norm": 1.93320631980896, + "learning_rate": 8.711454277515377e-05, + "loss": 0.7295, + "step": 28125 + }, + { + "epoch": 1.0829643888354186, + "grad_norm": 1.5179084539413452, + "learning_rate": 8.708455585133216e-05, + "loss": 1.083, + "step": 28130 + }, + { + "epoch": 1.0831568816169395, + "grad_norm": 2.2808666229248047, + "learning_rate": 8.705457010854235e-05, + "loss": 0.9567, + "step": 28135 + }, + { + "epoch": 1.0833493743984601, + "grad_norm": 1.0833265781402588, + "learning_rate": 8.702458554952635e-05, + "loss": 0.8316, + "step": 28140 + }, + { + "epoch": 1.0835418671799808, + "grad_norm": 1.4321517944335938, + "learning_rate": 8.699460217702606e-05, + "loss": 0.7606, + "step": 28145 + }, + { + "epoch": 1.0837343599615015, + "grad_norm": 0.9563835263252258, + "learning_rate": 8.696461999378324e-05, + "loss": 0.8625, + "step": 28150 + }, + { + "epoch": 1.0839268527430221, + "grad_norm": 1.0265910625457764, + "learning_rate": 8.693463900253954e-05, + "loss": 0.842, + "step": 28155 + }, + { + "epoch": 1.0841193455245428, + "grad_norm": 1.2975974082946777, + "learning_rate": 8.690465920603659e-05, + "loss": 0.9416, + "step": 28160 + }, + { + "epoch": 1.0843118383060635, + "grad_norm": 0.9792843461036682, + "learning_rate": 8.687468060701582e-05, + "loss": 0.786, + "step": 28165 + }, + { + "epoch": 1.0845043310875842, + "grad_norm": 1.5455747842788696, + "learning_rate": 8.684470320821851e-05, + "loss": 0.9023, + "step": 28170 + }, + { + "epoch": 1.0846968238691048, + "grad_norm": 1.3106191158294678, + "learning_rate": 8.681472701238599e-05, + "loss": 0.793, + "step": 28175 + }, + { + "epoch": 1.0848893166506257, + "grad_norm": 1.0658930540084839, + "learning_rate": 8.678475202225933e-05, + "loss": 0.8117, + "step": 28180 + }, + { + "epoch": 1.0850818094321464, + "grad_norm": 1.1267075538635254, + "learning_rate": 8.675477824057953e-05, + "loss": 0.9166, + "step": 28185 + }, + { + "epoch": 1.085274302213667, + "grad_norm": 2.638794183731079, + "learning_rate": 8.672480567008753e-05, + "loss": 0.9889, + "step": 28190 + }, + { + "epoch": 1.0854667949951877, + "grad_norm": 1.0422507524490356, + "learning_rate": 8.669483431352414e-05, + "loss": 0.8901, + "step": 28195 + }, + { + "epoch": 1.0856592877767084, + "grad_norm": 1.2292778491973877, + "learning_rate": 8.666486417362996e-05, + "loss": 0.917, + "step": 28200 + }, + { + "epoch": 1.085851780558229, + "grad_norm": 1.1497111320495605, + "learning_rate": 8.663489525314568e-05, + "loss": 0.7443, + "step": 28205 + }, + { + "epoch": 1.0860442733397497, + "grad_norm": 1.712801456451416, + "learning_rate": 8.660492755481167e-05, + "loss": 1.0358, + "step": 28210 + }, + { + "epoch": 1.0862367661212704, + "grad_norm": 1.057096004486084, + "learning_rate": 8.657496108136826e-05, + "loss": 0.829, + "step": 28215 + }, + { + "epoch": 1.086429258902791, + "grad_norm": 0.9071182012557983, + "learning_rate": 8.654499583555576e-05, + "loss": 0.8827, + "step": 28220 + }, + { + "epoch": 1.0866217516843117, + "grad_norm": 1.586111068725586, + "learning_rate": 8.651503182011427e-05, + "loss": 0.8244, + "step": 28225 + }, + { + "epoch": 1.0868142444658326, + "grad_norm": 0.845116913318634, + "learning_rate": 8.648506903778377e-05, + "loss": 0.9803, + "step": 28230 + }, + { + "epoch": 1.0870067372473533, + "grad_norm": 1.4213730096817017, + "learning_rate": 8.64551074913042e-05, + "loss": 0.7278, + "step": 28235 + }, + { + "epoch": 1.087199230028874, + "grad_norm": 1.0875608921051025, + "learning_rate": 8.64251471834153e-05, + "loss": 0.8259, + "step": 28240 + }, + { + "epoch": 1.0873917228103946, + "grad_norm": 0.9991816878318787, + "learning_rate": 8.639518811685676e-05, + "loss": 0.847, + "step": 28245 + }, + { + "epoch": 1.0875842155919153, + "grad_norm": 1.6519125699996948, + "learning_rate": 8.636523029436819e-05, + "loss": 0.8359, + "step": 28250 + }, + { + "epoch": 1.087776708373436, + "grad_norm": 1.6119956970214844, + "learning_rate": 8.633527371868896e-05, + "loss": 0.9016, + "step": 28255 + }, + { + "epoch": 1.0879692011549567, + "grad_norm": 0.7314580678939819, + "learning_rate": 8.630531839255839e-05, + "loss": 0.6791, + "step": 28260 + }, + { + "epoch": 1.0881616939364773, + "grad_norm": 0.9393051266670227, + "learning_rate": 8.627536431871578e-05, + "loss": 0.8801, + "step": 28265 + }, + { + "epoch": 1.088354186717998, + "grad_norm": 2.3234903812408447, + "learning_rate": 8.624541149990018e-05, + "loss": 0.8656, + "step": 28270 + }, + { + "epoch": 1.0885466794995187, + "grad_norm": 0.9072073101997375, + "learning_rate": 8.621545993885054e-05, + "loss": 0.8702, + "step": 28275 + }, + { + "epoch": 1.0887391722810396, + "grad_norm": 1.1024999618530273, + "learning_rate": 8.618550963830583e-05, + "loss": 0.9067, + "step": 28280 + }, + { + "epoch": 1.0889316650625602, + "grad_norm": 1.7317997217178345, + "learning_rate": 8.615556060100474e-05, + "loss": 0.975, + "step": 28285 + }, + { + "epoch": 1.089124157844081, + "grad_norm": 1.9279747009277344, + "learning_rate": 8.612561282968588e-05, + "loss": 0.9101, + "step": 28290 + }, + { + "epoch": 1.0893166506256016, + "grad_norm": 1.848134994506836, + "learning_rate": 8.609566632708786e-05, + "loss": 0.7642, + "step": 28295 + }, + { + "epoch": 1.0895091434071222, + "grad_norm": 1.3254327774047852, + "learning_rate": 8.606572109594906e-05, + "loss": 1.0743, + "step": 28300 + }, + { + "epoch": 1.089701636188643, + "grad_norm": 1.483266830444336, + "learning_rate": 8.60357771390077e-05, + "loss": 0.7679, + "step": 28305 + }, + { + "epoch": 1.0898941289701636, + "grad_norm": 1.0175541639328003, + "learning_rate": 8.600583445900206e-05, + "loss": 0.8647, + "step": 28310 + }, + { + "epoch": 1.0900866217516842, + "grad_norm": 1.3284434080123901, + "learning_rate": 8.597589305867012e-05, + "loss": 0.8266, + "step": 28315 + }, + { + "epoch": 1.090279114533205, + "grad_norm": 1.2873848676681519, + "learning_rate": 8.59459529407499e-05, + "loss": 0.9412, + "step": 28320 + }, + { + "epoch": 1.0904716073147256, + "grad_norm": 1.6508309841156006, + "learning_rate": 8.591601410797918e-05, + "loss": 0.8731, + "step": 28325 + }, + { + "epoch": 1.0906641000962465, + "grad_norm": 1.7631590366363525, + "learning_rate": 8.588607656309565e-05, + "loss": 0.8443, + "step": 28330 + }, + { + "epoch": 1.0908565928777672, + "grad_norm": 1.6212012767791748, + "learning_rate": 8.585614030883695e-05, + "loss": 0.9399, + "step": 28335 + }, + { + "epoch": 1.0910490856592878, + "grad_norm": 1.2359917163848877, + "learning_rate": 8.58262053479405e-05, + "loss": 0.8504, + "step": 28340 + }, + { + "epoch": 1.0912415784408085, + "grad_norm": 1.0911660194396973, + "learning_rate": 8.579627168314368e-05, + "loss": 0.7694, + "step": 28345 + }, + { + "epoch": 1.0914340712223292, + "grad_norm": 1.2516300678253174, + "learning_rate": 8.576633931718374e-05, + "loss": 0.7347, + "step": 28350 + }, + { + "epoch": 1.0916265640038498, + "grad_norm": 1.3555577993392944, + "learning_rate": 8.573640825279779e-05, + "loss": 0.9815, + "step": 28355 + }, + { + "epoch": 1.0918190567853705, + "grad_norm": 1.2513625621795654, + "learning_rate": 8.570647849272276e-05, + "loss": 0.856, + "step": 28360 + }, + { + "epoch": 1.0920115495668912, + "grad_norm": 1.3024554252624512, + "learning_rate": 8.567655003969563e-05, + "loss": 0.8517, + "step": 28365 + }, + { + "epoch": 1.0922040423484118, + "grad_norm": 0.9453557729721069, + "learning_rate": 8.564662289645313e-05, + "loss": 0.9392, + "step": 28370 + }, + { + "epoch": 1.0923965351299327, + "grad_norm": 1.9402847290039062, + "learning_rate": 8.561669706573184e-05, + "loss": 0.9265, + "step": 28375 + }, + { + "epoch": 1.0925890279114534, + "grad_norm": 0.8357999324798584, + "learning_rate": 8.558677255026835e-05, + "loss": 0.8744, + "step": 28380 + }, + { + "epoch": 1.092781520692974, + "grad_norm": 1.287455439567566, + "learning_rate": 8.555684935279904e-05, + "loss": 0.7733, + "step": 28385 + }, + { + "epoch": 1.0929740134744947, + "grad_norm": 1.2665926218032837, + "learning_rate": 8.552692747606014e-05, + "loss": 0.8229, + "step": 28390 + }, + { + "epoch": 1.0931665062560154, + "grad_norm": 1.5380029678344727, + "learning_rate": 8.549700692278788e-05, + "loss": 0.9208, + "step": 28395 + }, + { + "epoch": 1.093358999037536, + "grad_norm": 1.7858625650405884, + "learning_rate": 8.546708769571827e-05, + "loss": 0.8756, + "step": 28400 + }, + { + "epoch": 1.0935514918190568, + "grad_norm": 2.022096633911133, + "learning_rate": 8.543716979758717e-05, + "loss": 0.828, + "step": 28405 + }, + { + "epoch": 1.0937439846005774, + "grad_norm": 1.1538034677505493, + "learning_rate": 8.540725323113047e-05, + "loss": 0.8699, + "step": 28410 + }, + { + "epoch": 1.093936477382098, + "grad_norm": 2.0683727264404297, + "learning_rate": 8.537733799908379e-05, + "loss": 0.9554, + "step": 28415 + }, + { + "epoch": 1.0941289701636188, + "grad_norm": 1.3001796007156372, + "learning_rate": 8.534742410418265e-05, + "loss": 0.9327, + "step": 28420 + }, + { + "epoch": 1.0943214629451397, + "grad_norm": 0.7399016618728638, + "learning_rate": 8.531751154916254e-05, + "loss": 0.904, + "step": 28425 + }, + { + "epoch": 1.0945139557266603, + "grad_norm": 1.1065526008605957, + "learning_rate": 8.528760033675875e-05, + "loss": 0.7723, + "step": 28430 + }, + { + "epoch": 1.094706448508181, + "grad_norm": 1.9489480257034302, + "learning_rate": 8.525769046970642e-05, + "loss": 0.8233, + "step": 28435 + }, + { + "epoch": 1.0948989412897017, + "grad_norm": 2.0606977939605713, + "learning_rate": 8.522778195074064e-05, + "loss": 0.9149, + "step": 28440 + }, + { + "epoch": 1.0950914340712223, + "grad_norm": 1.784341812133789, + "learning_rate": 8.519787478259637e-05, + "loss": 0.8731, + "step": 28445 + }, + { + "epoch": 1.095283926852743, + "grad_norm": 0.9500823020935059, + "learning_rate": 8.516796896800836e-05, + "loss": 0.8144, + "step": 28450 + }, + { + "epoch": 1.0954764196342637, + "grad_norm": 1.5665167570114136, + "learning_rate": 8.513806450971137e-05, + "loss": 0.7198, + "step": 28455 + }, + { + "epoch": 1.0956689124157843, + "grad_norm": 0.8766529560089111, + "learning_rate": 8.51081614104399e-05, + "loss": 0.8412, + "step": 28460 + }, + { + "epoch": 1.095861405197305, + "grad_norm": 1.1104719638824463, + "learning_rate": 8.507825967292842e-05, + "loss": 0.7384, + "step": 28465 + }, + { + "epoch": 1.096053897978826, + "grad_norm": 1.2092657089233398, + "learning_rate": 8.504835929991128e-05, + "loss": 0.9372, + "step": 28470 + }, + { + "epoch": 1.0962463907603466, + "grad_norm": 1.244201898574829, + "learning_rate": 8.501846029412262e-05, + "loss": 1.0071, + "step": 28475 + }, + { + "epoch": 1.0964388835418672, + "grad_norm": 1.6996936798095703, + "learning_rate": 8.49885626582965e-05, + "loss": 0.8313, + "step": 28480 + }, + { + "epoch": 1.096631376323388, + "grad_norm": 1.2669320106506348, + "learning_rate": 8.49586663951669e-05, + "loss": 0.8763, + "step": 28485 + }, + { + "epoch": 1.0968238691049086, + "grad_norm": 1.1832565069198608, + "learning_rate": 8.492877150746764e-05, + "loss": 0.9931, + "step": 28490 + }, + { + "epoch": 1.0970163618864293, + "grad_norm": 0.8875114321708679, + "learning_rate": 8.489887799793232e-05, + "loss": 0.9127, + "step": 28495 + }, + { + "epoch": 1.09720885466795, + "grad_norm": 0.9826710224151611, + "learning_rate": 8.486898586929464e-05, + "loss": 0.8378, + "step": 28500 + }, + { + "epoch": 1.0974013474494706, + "grad_norm": 1.0573769807815552, + "learning_rate": 8.483909512428796e-05, + "loss": 0.8276, + "step": 28505 + }, + { + "epoch": 1.0975938402309913, + "grad_norm": 0.9481573104858398, + "learning_rate": 8.480920576564555e-05, + "loss": 0.8358, + "step": 28510 + }, + { + "epoch": 1.097786333012512, + "grad_norm": 1.1130627393722534, + "learning_rate": 8.477931779610068e-05, + "loss": 0.9346, + "step": 28515 + }, + { + "epoch": 1.0979788257940326, + "grad_norm": 2.038252115249634, + "learning_rate": 8.474943121838638e-05, + "loss": 0.787, + "step": 28520 + }, + { + "epoch": 1.0981713185755535, + "grad_norm": 1.3713279962539673, + "learning_rate": 8.471954603523553e-05, + "loss": 0.805, + "step": 28525 + }, + { + "epoch": 1.0983638113570742, + "grad_norm": 1.3389123678207397, + "learning_rate": 8.468966224938101e-05, + "loss": 0.8322, + "step": 28530 + }, + { + "epoch": 1.0985563041385948, + "grad_norm": 2.199444532394409, + "learning_rate": 8.465977986355545e-05, + "loss": 1.0354, + "step": 28535 + }, + { + "epoch": 1.0987487969201155, + "grad_norm": 1.5610674619674683, + "learning_rate": 8.462989888049137e-05, + "loss": 0.9338, + "step": 28540 + }, + { + "epoch": 1.0989412897016362, + "grad_norm": 1.1883718967437744, + "learning_rate": 8.460001930292125e-05, + "loss": 0.7661, + "step": 28545 + }, + { + "epoch": 1.0991337824831569, + "grad_norm": 1.4481984376907349, + "learning_rate": 8.457014113357736e-05, + "loss": 0.8787, + "step": 28550 + }, + { + "epoch": 1.0993262752646775, + "grad_norm": 1.3184305429458618, + "learning_rate": 8.454026437519182e-05, + "loss": 0.789, + "step": 28555 + }, + { + "epoch": 1.0995187680461982, + "grad_norm": 1.6092246770858765, + "learning_rate": 8.45103890304967e-05, + "loss": 0.9798, + "step": 28560 + }, + { + "epoch": 1.0997112608277189, + "grad_norm": 1.512444019317627, + "learning_rate": 8.448051510222391e-05, + "loss": 0.7606, + "step": 28565 + }, + { + "epoch": 1.0999037536092398, + "grad_norm": 0.880029022693634, + "learning_rate": 8.445064259310518e-05, + "loss": 0.7627, + "step": 28570 + }, + { + "epoch": 1.1000962463907604, + "grad_norm": 1.796959638595581, + "learning_rate": 8.442077150587222e-05, + "loss": 0.8827, + "step": 28575 + }, + { + "epoch": 1.100288739172281, + "grad_norm": 1.1779733896255493, + "learning_rate": 8.439090184325644e-05, + "loss": 0.9346, + "step": 28580 + }, + { + "epoch": 1.1004812319538018, + "grad_norm": 0.7998082041740417, + "learning_rate": 8.436103360798934e-05, + "loss": 0.799, + "step": 28585 + }, + { + "epoch": 1.1006737247353224, + "grad_norm": 1.9549007415771484, + "learning_rate": 8.433116680280212e-05, + "loss": 1.0649, + "step": 28590 + }, + { + "epoch": 1.100866217516843, + "grad_norm": 1.2117373943328857, + "learning_rate": 8.430130143042584e-05, + "loss": 0.8325, + "step": 28595 + }, + { + "epoch": 1.1010587102983638, + "grad_norm": 1.4036140441894531, + "learning_rate": 8.427143749359161e-05, + "loss": 0.7584, + "step": 28600 + }, + { + "epoch": 1.1012512030798844, + "grad_norm": 0.979111909866333, + "learning_rate": 8.424157499503022e-05, + "loss": 0.8124, + "step": 28605 + }, + { + "epoch": 1.1014436958614051, + "grad_norm": 1.6154838800430298, + "learning_rate": 8.421171393747237e-05, + "loss": 1.0092, + "step": 28610 + }, + { + "epoch": 1.1016361886429258, + "grad_norm": 1.6151224374771118, + "learning_rate": 8.418185432364873e-05, + "loss": 0.8584, + "step": 28615 + }, + { + "epoch": 1.1018286814244467, + "grad_norm": 0.9842272996902466, + "learning_rate": 8.415199615628974e-05, + "loss": 0.819, + "step": 28620 + }, + { + "epoch": 1.1020211742059673, + "grad_norm": 1.6019426584243774, + "learning_rate": 8.412213943812566e-05, + "loss": 0.7806, + "step": 28625 + }, + { + "epoch": 1.102213666987488, + "grad_norm": 1.4778478145599365, + "learning_rate": 8.40922841718868e-05, + "loss": 0.878, + "step": 28630 + }, + { + "epoch": 1.1024061597690087, + "grad_norm": 1.3348811864852905, + "learning_rate": 8.406243036030316e-05, + "loss": 0.8364, + "step": 28635 + }, + { + "epoch": 1.1025986525505294, + "grad_norm": 2.31394100189209, + "learning_rate": 8.403257800610465e-05, + "loss": 0.8119, + "step": 28640 + }, + { + "epoch": 1.10279114533205, + "grad_norm": 2.239318370819092, + "learning_rate": 8.400272711202115e-05, + "loss": 0.865, + "step": 28645 + }, + { + "epoch": 1.1029836381135707, + "grad_norm": 2.2287495136260986, + "learning_rate": 8.397287768078227e-05, + "loss": 0.9477, + "step": 28650 + }, + { + "epoch": 1.1031761308950914, + "grad_norm": 1.1342699527740479, + "learning_rate": 8.394302971511757e-05, + "loss": 0.8375, + "step": 28655 + }, + { + "epoch": 1.103368623676612, + "grad_norm": 1.326915979385376, + "learning_rate": 8.39131832177564e-05, + "loss": 0.8521, + "step": 28660 + }, + { + "epoch": 1.103561116458133, + "grad_norm": 1.2437760829925537, + "learning_rate": 8.38833381914281e-05, + "loss": 0.8906, + "step": 28665 + }, + { + "epoch": 1.1037536092396536, + "grad_norm": 1.5128203630447388, + "learning_rate": 8.385349463886174e-05, + "loss": 0.9937, + "step": 28670 + }, + { + "epoch": 1.1039461020211743, + "grad_norm": 1.7990190982818604, + "learning_rate": 8.382365256278635e-05, + "loss": 0.8797, + "step": 28675 + }, + { + "epoch": 1.104138594802695, + "grad_norm": 0.9128474593162537, + "learning_rate": 8.379381196593075e-05, + "loss": 0.8568, + "step": 28680 + }, + { + "epoch": 1.1043310875842156, + "grad_norm": 0.8986829519271851, + "learning_rate": 8.376397285102369e-05, + "loss": 0.6961, + "step": 28685 + }, + { + "epoch": 1.1045235803657363, + "grad_norm": 1.1345797777175903, + "learning_rate": 8.373413522079377e-05, + "loss": 0.8966, + "step": 28690 + }, + { + "epoch": 1.104716073147257, + "grad_norm": 0.927941083908081, + "learning_rate": 8.370429907796945e-05, + "loss": 0.955, + "step": 28695 + }, + { + "epoch": 1.1049085659287776, + "grad_norm": 1.2426468133926392, + "learning_rate": 8.367446442527898e-05, + "loss": 0.6989, + "step": 28700 + }, + { + "epoch": 1.1051010587102983, + "grad_norm": 1.2944527864456177, + "learning_rate": 8.364463126545064e-05, + "loss": 0.8288, + "step": 28705 + }, + { + "epoch": 1.105293551491819, + "grad_norm": 0.8285412788391113, + "learning_rate": 8.361479960121242e-05, + "loss": 0.7919, + "step": 28710 + }, + { + "epoch": 1.1054860442733399, + "grad_norm": 1.3904072046279907, + "learning_rate": 8.358496943529219e-05, + "loss": 0.8751, + "step": 28715 + }, + { + "epoch": 1.1056785370548605, + "grad_norm": 1.4166022539138794, + "learning_rate": 8.355514077041782e-05, + "loss": 0.8239, + "step": 28720 + }, + { + "epoch": 1.1058710298363812, + "grad_norm": 1.4481518268585205, + "learning_rate": 8.352531360931688e-05, + "loss": 0.7997, + "step": 28725 + }, + { + "epoch": 1.1060635226179019, + "grad_norm": 1.293131709098816, + "learning_rate": 8.349548795471683e-05, + "loss": 0.8535, + "step": 28730 + }, + { + "epoch": 1.1062560153994225, + "grad_norm": 1.2604767084121704, + "learning_rate": 8.346566380934511e-05, + "loss": 0.892, + "step": 28735 + }, + { + "epoch": 1.1064485081809432, + "grad_norm": 1.0578190088272095, + "learning_rate": 8.343584117592894e-05, + "loss": 0.8556, + "step": 28740 + }, + { + "epoch": 1.1066410009624639, + "grad_norm": 0.7822481989860535, + "learning_rate": 8.340602005719529e-05, + "loss": 0.8676, + "step": 28745 + }, + { + "epoch": 1.1068334937439845, + "grad_norm": 1.1127487421035767, + "learning_rate": 8.337620045587123e-05, + "loss": 0.8864, + "step": 28750 + }, + { + "epoch": 1.1070259865255052, + "grad_norm": 1.4301708936691284, + "learning_rate": 8.334638237468352e-05, + "loss": 0.7653, + "step": 28755 + }, + { + "epoch": 1.107218479307026, + "grad_norm": 1.5563634634017944, + "learning_rate": 8.331656581635879e-05, + "loss": 0.8272, + "step": 28760 + }, + { + "epoch": 1.1074109720885468, + "grad_norm": 1.9292504787445068, + "learning_rate": 8.328675078362363e-05, + "loss": 0.9858, + "step": 28765 + }, + { + "epoch": 1.1076034648700674, + "grad_norm": 0.9912068247795105, + "learning_rate": 8.325693727920438e-05, + "loss": 0.805, + "step": 28770 + }, + { + "epoch": 1.1077959576515881, + "grad_norm": 1.0305575132369995, + "learning_rate": 8.32271253058273e-05, + "loss": 0.9662, + "step": 28775 + }, + { + "epoch": 1.1079884504331088, + "grad_norm": 1.2367445230484009, + "learning_rate": 8.31973148662185e-05, + "loss": 0.9497, + "step": 28780 + }, + { + "epoch": 1.1081809432146295, + "grad_norm": 1.2530933618545532, + "learning_rate": 8.316750596310396e-05, + "loss": 0.8991, + "step": 28785 + }, + { + "epoch": 1.1083734359961501, + "grad_norm": 0.8720499873161316, + "learning_rate": 8.313769859920947e-05, + "loss": 0.7811, + "step": 28790 + }, + { + "epoch": 1.1085659287776708, + "grad_norm": 1.3998817205429077, + "learning_rate": 8.310789277726079e-05, + "loss": 0.8827, + "step": 28795 + }, + { + "epoch": 1.1087584215591915, + "grad_norm": 2.0500388145446777, + "learning_rate": 8.30780884999834e-05, + "loss": 0.8259, + "step": 28800 + }, + { + "epoch": 1.1089509143407121, + "grad_norm": 1.5865284204483032, + "learning_rate": 8.304828577010266e-05, + "loss": 0.7836, + "step": 28805 + }, + { + "epoch": 1.1091434071222328, + "grad_norm": 1.2251750230789185, + "learning_rate": 8.301848459034396e-05, + "loss": 0.773, + "step": 28810 + }, + { + "epoch": 1.1093358999037537, + "grad_norm": 1.2303904294967651, + "learning_rate": 8.298868496343234e-05, + "loss": 0.9684, + "step": 28815 + }, + { + "epoch": 1.1095283926852744, + "grad_norm": 1.153835415840149, + "learning_rate": 8.295888689209274e-05, + "loss": 0.834, + "step": 28820 + }, + { + "epoch": 1.109720885466795, + "grad_norm": 1.5492242574691772, + "learning_rate": 8.29290903790501e-05, + "loss": 0.7547, + "step": 28825 + }, + { + "epoch": 1.1099133782483157, + "grad_norm": 1.5805435180664062, + "learning_rate": 8.289929542702905e-05, + "loss": 0.8278, + "step": 28830 + }, + { + "epoch": 1.1101058710298364, + "grad_norm": 1.0030450820922852, + "learning_rate": 8.286950203875412e-05, + "loss": 0.8725, + "step": 28835 + }, + { + "epoch": 1.110298363811357, + "grad_norm": 1.6949737071990967, + "learning_rate": 8.28397102169498e-05, + "loss": 0.8892, + "step": 28840 + }, + { + "epoch": 1.1104908565928777, + "grad_norm": 1.040964126586914, + "learning_rate": 8.280991996434025e-05, + "loss": 0.9165, + "step": 28845 + }, + { + "epoch": 1.1106833493743984, + "grad_norm": 1.1363362073898315, + "learning_rate": 8.278013128364967e-05, + "loss": 0.8674, + "step": 28850 + }, + { + "epoch": 1.110875842155919, + "grad_norm": 0.8567915558815002, + "learning_rate": 8.275034417760205e-05, + "loss": 0.8689, + "step": 28855 + }, + { + "epoch": 1.11106833493744, + "grad_norm": 1.597185492515564, + "learning_rate": 8.272055864892112e-05, + "loss": 0.7718, + "step": 28860 + }, + { + "epoch": 1.1112608277189606, + "grad_norm": 0.7887008786201477, + "learning_rate": 8.269077470033068e-05, + "loss": 0.8011, + "step": 28865 + }, + { + "epoch": 1.1114533205004813, + "grad_norm": 1.3229820728302002, + "learning_rate": 8.266099233455423e-05, + "loss": 0.8735, + "step": 28870 + }, + { + "epoch": 1.111645813282002, + "grad_norm": 1.3275078535079956, + "learning_rate": 8.263121155431515e-05, + "loss": 0.9547, + "step": 28875 + }, + { + "epoch": 1.1118383060635226, + "grad_norm": 1.2389519214630127, + "learning_rate": 8.260143236233673e-05, + "loss": 1.079, + "step": 28880 + }, + { + "epoch": 1.1120307988450433, + "grad_norm": 1.1708885431289673, + "learning_rate": 8.257165476134207e-05, + "loss": 0.9086, + "step": 28885 + }, + { + "epoch": 1.112223291626564, + "grad_norm": 1.1817975044250488, + "learning_rate": 8.254187875405411e-05, + "loss": 0.914, + "step": 28890 + }, + { + "epoch": 1.1124157844080846, + "grad_norm": 1.4715931415557861, + "learning_rate": 8.251210434319572e-05, + "loss": 0.9382, + "step": 28895 + }, + { + "epoch": 1.1126082771896053, + "grad_norm": 1.0971523523330688, + "learning_rate": 8.248233153148953e-05, + "loss": 0.6749, + "step": 28900 + }, + { + "epoch": 1.112800769971126, + "grad_norm": 0.9213184714317322, + "learning_rate": 8.245256032165804e-05, + "loss": 0.8894, + "step": 28905 + }, + { + "epoch": 1.1129932627526469, + "grad_norm": 1.3410868644714355, + "learning_rate": 8.242279071642371e-05, + "loss": 0.8031, + "step": 28910 + }, + { + "epoch": 1.1131857555341675, + "grad_norm": 0.9468138813972473, + "learning_rate": 8.239302271850874e-05, + "loss": 0.9081, + "step": 28915 + }, + { + "epoch": 1.1133782483156882, + "grad_norm": 0.9489649534225464, + "learning_rate": 8.236325633063515e-05, + "loss": 0.9047, + "step": 28920 + }, + { + "epoch": 1.1135707410972089, + "grad_norm": 1.7119163274765015, + "learning_rate": 8.233349155552499e-05, + "loss": 0.7558, + "step": 28925 + }, + { + "epoch": 1.1137632338787296, + "grad_norm": 1.610944151878357, + "learning_rate": 8.230372839589998e-05, + "loss": 0.868, + "step": 28930 + }, + { + "epoch": 1.1139557266602502, + "grad_norm": 1.2964144945144653, + "learning_rate": 8.227396685448176e-05, + "loss": 0.7636, + "step": 28935 + }, + { + "epoch": 1.114148219441771, + "grad_norm": 1.2506333589553833, + "learning_rate": 8.224420693399187e-05, + "loss": 0.8933, + "step": 28940 + }, + { + "epoch": 1.1143407122232916, + "grad_norm": 1.260901689529419, + "learning_rate": 8.221444863715165e-05, + "loss": 0.8687, + "step": 28945 + }, + { + "epoch": 1.1145332050048122, + "grad_norm": 0.6778833866119385, + "learning_rate": 8.218469196668222e-05, + "loss": 0.7826, + "step": 28950 + }, + { + "epoch": 1.1147256977863331, + "grad_norm": 2.43918514251709, + "learning_rate": 8.215493692530475e-05, + "loss": 1.0159, + "step": 28955 + }, + { + "epoch": 1.1149181905678538, + "grad_norm": 1.1964274644851685, + "learning_rate": 8.21251835157401e-05, + "loss": 0.8936, + "step": 28960 + }, + { + "epoch": 1.1151106833493745, + "grad_norm": 1.0071650743484497, + "learning_rate": 8.209543174070894e-05, + "loss": 0.894, + "step": 28965 + }, + { + "epoch": 1.1153031761308951, + "grad_norm": 1.0765222311019897, + "learning_rate": 8.2065681602932e-05, + "loss": 0.943, + "step": 28970 + }, + { + "epoch": 1.1154956689124158, + "grad_norm": 1.2114105224609375, + "learning_rate": 8.203593310512967e-05, + "loss": 0.9682, + "step": 28975 + }, + { + "epoch": 1.1156881616939365, + "grad_norm": 0.9356218576431274, + "learning_rate": 8.200618625002223e-05, + "loss": 0.7903, + "step": 28980 + }, + { + "epoch": 1.1158806544754571, + "grad_norm": 1.1432327032089233, + "learning_rate": 8.197644104032987e-05, + "loss": 0.79, + "step": 28985 + }, + { + "epoch": 1.1160731472569778, + "grad_norm": 1.7446037530899048, + "learning_rate": 8.194669747877259e-05, + "loss": 0.8792, + "step": 28990 + }, + { + "epoch": 1.1162656400384985, + "grad_norm": 2.04817533493042, + "learning_rate": 8.191695556807022e-05, + "loss": 0.9906, + "step": 28995 + }, + { + "epoch": 1.1164581328200192, + "grad_norm": 1.7263697385787964, + "learning_rate": 8.18872153109425e-05, + "loss": 0.7607, + "step": 29000 + }, + { + "epoch": 1.1166506256015398, + "grad_norm": 1.177629828453064, + "learning_rate": 8.185747671010891e-05, + "loss": 0.7068, + "step": 29005 + }, + { + "epoch": 1.1168431183830607, + "grad_norm": 0.9758390188217163, + "learning_rate": 8.18277397682889e-05, + "loss": 0.9517, + "step": 29010 + }, + { + "epoch": 1.1170356111645814, + "grad_norm": 1.2482577562332153, + "learning_rate": 8.179800448820174e-05, + "loss": 0.839, + "step": 29015 + }, + { + "epoch": 1.117228103946102, + "grad_norm": 1.2977570295333862, + "learning_rate": 8.176827087256649e-05, + "loss": 0.8067, + "step": 29020 + }, + { + "epoch": 1.1174205967276227, + "grad_norm": 1.2474614381790161, + "learning_rate": 8.173853892410206e-05, + "loss": 0.947, + "step": 29025 + }, + { + "epoch": 1.1176130895091434, + "grad_norm": 1.5093929767608643, + "learning_rate": 8.17088086455273e-05, + "loss": 0.67, + "step": 29030 + }, + { + "epoch": 1.117805582290664, + "grad_norm": 1.944604754447937, + "learning_rate": 8.167908003956082e-05, + "loss": 0.8075, + "step": 29035 + }, + { + "epoch": 1.1179980750721847, + "grad_norm": 1.2519726753234863, + "learning_rate": 8.164935310892108e-05, + "loss": 0.9158, + "step": 29040 + }, + { + "epoch": 1.1181905678537054, + "grad_norm": 1.3411145210266113, + "learning_rate": 8.161962785632646e-05, + "loss": 0.9831, + "step": 29045 + }, + { + "epoch": 1.118383060635226, + "grad_norm": 1.7142786979675293, + "learning_rate": 8.158990428449514e-05, + "loss": 0.8387, + "step": 29050 + }, + { + "epoch": 1.118575553416747, + "grad_norm": 1.1233665943145752, + "learning_rate": 8.156018239614505e-05, + "loss": 0.8295, + "step": 29055 + }, + { + "epoch": 1.1187680461982676, + "grad_norm": 1.210729718208313, + "learning_rate": 8.153046219399418e-05, + "loss": 0.8221, + "step": 29060 + }, + { + "epoch": 1.1189605389797883, + "grad_norm": 1.134706735610962, + "learning_rate": 8.150074368076019e-05, + "loss": 0.8517, + "step": 29065 + }, + { + "epoch": 1.119153031761309, + "grad_norm": 1.5711884498596191, + "learning_rate": 8.14710268591606e-05, + "loss": 0.8291, + "step": 29070 + }, + { + "epoch": 1.1193455245428297, + "grad_norm": 1.2242764234542847, + "learning_rate": 8.144131173191292e-05, + "loss": 0.8554, + "step": 29075 + }, + { + "epoch": 1.1195380173243503, + "grad_norm": 0.9937723875045776, + "learning_rate": 8.141159830173432e-05, + "loss": 0.9085, + "step": 29080 + }, + { + "epoch": 1.119730510105871, + "grad_norm": 1.4553319215774536, + "learning_rate": 8.138188657134188e-05, + "loss": 0.7981, + "step": 29085 + }, + { + "epoch": 1.1199230028873917, + "grad_norm": 1.1583729982376099, + "learning_rate": 8.135217654345263e-05, + "loss": 0.8145, + "step": 29090 + }, + { + "epoch": 1.1201154956689123, + "grad_norm": 1.2025479078292847, + "learning_rate": 8.132246822078328e-05, + "loss": 0.925, + "step": 29095 + }, + { + "epoch": 1.120307988450433, + "grad_norm": 1.4865069389343262, + "learning_rate": 8.12927616060505e-05, + "loss": 0.7075, + "step": 29100 + }, + { + "epoch": 1.120500481231954, + "grad_norm": 2.3236899375915527, + "learning_rate": 8.126305670197073e-05, + "loss": 0.9911, + "step": 29105 + }, + { + "epoch": 1.1206929740134746, + "grad_norm": 0.9563505053520203, + "learning_rate": 8.12333535112603e-05, + "loss": 0.7803, + "step": 29110 + }, + { + "epoch": 1.1208854667949952, + "grad_norm": 1.071954607963562, + "learning_rate": 8.12036520366354e-05, + "loss": 0.808, + "step": 29115 + }, + { + "epoch": 1.121077959576516, + "grad_norm": 0.5228383541107178, + "learning_rate": 8.117395228081202e-05, + "loss": 0.8721, + "step": 29120 + }, + { + "epoch": 1.1212704523580366, + "grad_norm": 1.238578200340271, + "learning_rate": 8.114425424650592e-05, + "loss": 0.8783, + "step": 29125 + }, + { + "epoch": 1.1214629451395572, + "grad_norm": 0.9781158566474915, + "learning_rate": 8.111455793643292e-05, + "loss": 0.8673, + "step": 29130 + }, + { + "epoch": 1.121655437921078, + "grad_norm": 1.529806137084961, + "learning_rate": 8.10848633533085e-05, + "loss": 0.9377, + "step": 29135 + }, + { + "epoch": 1.1218479307025986, + "grad_norm": 1.0390987396240234, + "learning_rate": 8.105517049984797e-05, + "loss": 0.8467, + "step": 29140 + }, + { + "epoch": 1.1220404234841193, + "grad_norm": 1.066085696220398, + "learning_rate": 8.102547937876664e-05, + "loss": 0.7405, + "step": 29145 + }, + { + "epoch": 1.1222329162656401, + "grad_norm": 0.9266438484191895, + "learning_rate": 8.099578999277953e-05, + "loss": 0.8576, + "step": 29150 + }, + { + "epoch": 1.1224254090471608, + "grad_norm": 1.52492356300354, + "learning_rate": 8.09661023446015e-05, + "loss": 0.86, + "step": 29155 + }, + { + "epoch": 1.1226179018286815, + "grad_norm": 1.8862872123718262, + "learning_rate": 8.093641643694736e-05, + "loss": 0.9879, + "step": 29160 + }, + { + "epoch": 1.1228103946102022, + "grad_norm": 1.223181128501892, + "learning_rate": 8.090673227253165e-05, + "loss": 0.8792, + "step": 29165 + }, + { + "epoch": 1.1230028873917228, + "grad_norm": 1.4018183946609497, + "learning_rate": 8.087704985406875e-05, + "loss": 0.9087, + "step": 29170 + }, + { + "epoch": 1.1231953801732435, + "grad_norm": 0.8999775052070618, + "learning_rate": 8.084736918427302e-05, + "loss": 0.9368, + "step": 29175 + }, + { + "epoch": 1.1233878729547642, + "grad_norm": 1.3538589477539062, + "learning_rate": 8.081769026585848e-05, + "loss": 0.7531, + "step": 29180 + }, + { + "epoch": 1.1235803657362848, + "grad_norm": 1.039942741394043, + "learning_rate": 8.078801310153906e-05, + "loss": 0.89, + "step": 29185 + }, + { + "epoch": 1.1237728585178055, + "grad_norm": 1.8016685247421265, + "learning_rate": 8.075833769402861e-05, + "loss": 0.8751, + "step": 29190 + }, + { + "epoch": 1.1239653512993262, + "grad_norm": 1.5245671272277832, + "learning_rate": 8.072866404604073e-05, + "loss": 0.7489, + "step": 29195 + }, + { + "epoch": 1.124157844080847, + "grad_norm": 1.9634850025177002, + "learning_rate": 8.069899216028883e-05, + "loss": 0.7756, + "step": 29200 + }, + { + "epoch": 1.1243503368623677, + "grad_norm": 1.0487682819366455, + "learning_rate": 8.066932203948625e-05, + "loss": 0.697, + "step": 29205 + }, + { + "epoch": 1.1245428296438884, + "grad_norm": 1.292322039604187, + "learning_rate": 8.063965368634611e-05, + "loss": 0.8285, + "step": 29210 + }, + { + "epoch": 1.124735322425409, + "grad_norm": 1.1624624729156494, + "learning_rate": 8.06099871035814e-05, + "loss": 0.7489, + "step": 29215 + }, + { + "epoch": 1.1249278152069297, + "grad_norm": 0.9536521434783936, + "learning_rate": 8.058032229390493e-05, + "loss": 0.8356, + "step": 29220 + }, + { + "epoch": 1.1251203079884504, + "grad_norm": 1.499011516571045, + "learning_rate": 8.055065926002933e-05, + "loss": 0.964, + "step": 29225 + }, + { + "epoch": 1.125312800769971, + "grad_norm": 2.0605900287628174, + "learning_rate": 8.052099800466708e-05, + "loss": 0.8371, + "step": 29230 + }, + { + "epoch": 1.1255052935514918, + "grad_norm": 1.7930225133895874, + "learning_rate": 8.049133853053057e-05, + "loss": 0.7997, + "step": 29235 + }, + { + "epoch": 1.1256977863330124, + "grad_norm": 2.043825626373291, + "learning_rate": 8.04616808403319e-05, + "loss": 0.8135, + "step": 29240 + }, + { + "epoch": 1.1258902791145333, + "grad_norm": 1.0916283130645752, + "learning_rate": 8.043202493678306e-05, + "loss": 0.8819, + "step": 29245 + }, + { + "epoch": 1.126082771896054, + "grad_norm": 1.748055100440979, + "learning_rate": 8.040237082259594e-05, + "loss": 0.8833, + "step": 29250 + }, + { + "epoch": 1.1262752646775747, + "grad_norm": 1.4773916006088257, + "learning_rate": 8.03727185004822e-05, + "loss": 0.7417, + "step": 29255 + }, + { + "epoch": 1.1264677574590953, + "grad_norm": 0.9700600504875183, + "learning_rate": 8.034306797315328e-05, + "loss": 0.7682, + "step": 29260 + }, + { + "epoch": 1.126660250240616, + "grad_norm": 1.6670438051223755, + "learning_rate": 8.031341924332063e-05, + "loss": 0.8024, + "step": 29265 + }, + { + "epoch": 1.1268527430221367, + "grad_norm": 1.4361203908920288, + "learning_rate": 8.028377231369538e-05, + "loss": 0.9223, + "step": 29270 + }, + { + "epoch": 1.1270452358036573, + "grad_norm": 2.1577651500701904, + "learning_rate": 8.02541271869885e-05, + "loss": 0.7592, + "step": 29275 + }, + { + "epoch": 1.127237728585178, + "grad_norm": 1.7753071784973145, + "learning_rate": 8.022448386591093e-05, + "loss": 0.7609, + "step": 29280 + }, + { + "epoch": 1.1274302213666987, + "grad_norm": 1.2456077337265015, + "learning_rate": 8.01948423531733e-05, + "loss": 0.9491, + "step": 29285 + }, + { + "epoch": 1.1276227141482194, + "grad_norm": 1.634460687637329, + "learning_rate": 8.01652026514861e-05, + "loss": 0.9136, + "step": 29290 + }, + { + "epoch": 1.12781520692974, + "grad_norm": 1.8183555603027344, + "learning_rate": 8.013556476355976e-05, + "loss": 0.8587, + "step": 29295 + }, + { + "epoch": 1.128007699711261, + "grad_norm": 1.5357935428619385, + "learning_rate": 8.010592869210445e-05, + "loss": 0.8325, + "step": 29300 + }, + { + "epoch": 1.1282001924927816, + "grad_norm": 1.0693864822387695, + "learning_rate": 8.007629443983013e-05, + "loss": 0.7546, + "step": 29305 + }, + { + "epoch": 1.1283926852743023, + "grad_norm": 1.100164771080017, + "learning_rate": 8.004666200944673e-05, + "loss": 0.9358, + "step": 29310 + }, + { + "epoch": 1.128585178055823, + "grad_norm": 1.2228190898895264, + "learning_rate": 8.00170314036639e-05, + "loss": 0.9255, + "step": 29315 + }, + { + "epoch": 1.1287776708373436, + "grad_norm": 1.408209204673767, + "learning_rate": 7.998740262519118e-05, + "loss": 0.7362, + "step": 29320 + }, + { + "epoch": 1.1289701636188643, + "grad_norm": 2.5063939094543457, + "learning_rate": 7.995777567673791e-05, + "loss": 0.7319, + "step": 29325 + }, + { + "epoch": 1.129162656400385, + "grad_norm": 1.9858168363571167, + "learning_rate": 7.992815056101331e-05, + "loss": 0.9686, + "step": 29330 + }, + { + "epoch": 1.1293551491819056, + "grad_norm": 1.276520013809204, + "learning_rate": 7.989852728072635e-05, + "loss": 0.8679, + "step": 29335 + }, + { + "epoch": 1.1295476419634265, + "grad_norm": 1.9740073680877686, + "learning_rate": 7.986890583858593e-05, + "loss": 0.9984, + "step": 29340 + }, + { + "epoch": 1.1297401347449472, + "grad_norm": 0.7693697214126587, + "learning_rate": 7.983928623730073e-05, + "loss": 0.7382, + "step": 29345 + }, + { + "epoch": 1.1299326275264678, + "grad_norm": 1.349514365196228, + "learning_rate": 7.980966847957918e-05, + "loss": 0.9728, + "step": 29350 + }, + { + "epoch": 1.1301251203079885, + "grad_norm": 1.5095326900482178, + "learning_rate": 7.978005256812977e-05, + "loss": 0.9931, + "step": 29355 + }, + { + "epoch": 1.1303176130895092, + "grad_norm": 1.1477704048156738, + "learning_rate": 7.975043850566053e-05, + "loss": 0.7388, + "step": 29360 + }, + { + "epoch": 1.1305101058710298, + "grad_norm": 1.9314098358154297, + "learning_rate": 7.972082629487962e-05, + "loss": 0.8511, + "step": 29365 + }, + { + "epoch": 1.1307025986525505, + "grad_norm": 2.503371477127075, + "learning_rate": 7.969121593849481e-05, + "loss": 0.8943, + "step": 29370 + }, + { + "epoch": 1.1308950914340712, + "grad_norm": 1.379692792892456, + "learning_rate": 7.966160743921371e-05, + "loss": 0.8356, + "step": 29375 + }, + { + "epoch": 1.1310875842155919, + "grad_norm": 1.3109577894210815, + "learning_rate": 7.963200079974394e-05, + "loss": 0.9496, + "step": 29380 + }, + { + "epoch": 1.1312800769971125, + "grad_norm": 1.2980083227157593, + "learning_rate": 7.960239602279275e-05, + "loss": 0.9289, + "step": 29385 + }, + { + "epoch": 1.1314725697786332, + "grad_norm": 1.131455659866333, + "learning_rate": 7.95727931110673e-05, + "loss": 0.8283, + "step": 29390 + }, + { + "epoch": 1.131665062560154, + "grad_norm": 2.513030529022217, + "learning_rate": 7.954319206727464e-05, + "loss": 0.8418, + "step": 29395 + }, + { + "epoch": 1.1318575553416748, + "grad_norm": 1.2375028133392334, + "learning_rate": 7.951359289412154e-05, + "loss": 0.9583, + "step": 29400 + }, + { + "epoch": 1.1320500481231954, + "grad_norm": 1.336858868598938, + "learning_rate": 7.948399559431463e-05, + "loss": 0.8757, + "step": 29405 + }, + { + "epoch": 1.132242540904716, + "grad_norm": 1.1518652439117432, + "learning_rate": 7.945440017056045e-05, + "loss": 0.8316, + "step": 29410 + }, + { + "epoch": 1.1324350336862368, + "grad_norm": 1.9742542505264282, + "learning_rate": 7.942480662556527e-05, + "loss": 0.7784, + "step": 29415 + }, + { + "epoch": 1.1326275264677574, + "grad_norm": 1.131545066833496, + "learning_rate": 7.939521496203521e-05, + "loss": 0.974, + "step": 29420 + }, + { + "epoch": 1.132820019249278, + "grad_norm": 1.558993935585022, + "learning_rate": 7.936562518267625e-05, + "loss": 1.0205, + "step": 29425 + }, + { + "epoch": 1.1330125120307988, + "grad_norm": 1.082209825515747, + "learning_rate": 7.93360372901942e-05, + "loss": 1.0551, + "step": 29430 + }, + { + "epoch": 1.1332050048123194, + "grad_norm": 1.8491493463516235, + "learning_rate": 7.930645128729462e-05, + "loss": 0.8227, + "step": 29435 + }, + { + "epoch": 1.1333974975938403, + "grad_norm": 1.647204875946045, + "learning_rate": 7.927686717668302e-05, + "loss": 0.9909, + "step": 29440 + }, + { + "epoch": 1.133589990375361, + "grad_norm": 1.1486252546310425, + "learning_rate": 7.924728496106464e-05, + "loss": 0.8855, + "step": 29445 + }, + { + "epoch": 1.1337824831568817, + "grad_norm": 1.3618193864822388, + "learning_rate": 7.921770464314453e-05, + "loss": 0.8427, + "step": 29450 + }, + { + "epoch": 1.1339749759384024, + "grad_norm": 1.4775711297988892, + "learning_rate": 7.91881262256277e-05, + "loss": 0.8925, + "step": 29455 + }, + { + "epoch": 1.134167468719923, + "grad_norm": 1.0450371503829956, + "learning_rate": 7.915854971121886e-05, + "loss": 0.6872, + "step": 29460 + }, + { + "epoch": 1.1343599615014437, + "grad_norm": 1.4951140880584717, + "learning_rate": 7.912897510262255e-05, + "loss": 0.7814, + "step": 29465 + }, + { + "epoch": 1.1345524542829644, + "grad_norm": 0.9859576225280762, + "learning_rate": 7.909940240254325e-05, + "loss": 0.7146, + "step": 29470 + }, + { + "epoch": 1.134744947064485, + "grad_norm": 2.4736711978912354, + "learning_rate": 7.906983161368515e-05, + "loss": 1.0436, + "step": 29475 + }, + { + "epoch": 1.1349374398460057, + "grad_norm": 0.912487268447876, + "learning_rate": 7.904026273875225e-05, + "loss": 0.8881, + "step": 29480 + }, + { + "epoch": 1.1351299326275264, + "grad_norm": 1.1811585426330566, + "learning_rate": 7.901069578044852e-05, + "loss": 0.8878, + "step": 29485 + }, + { + "epoch": 1.135322425409047, + "grad_norm": 1.4744383096694946, + "learning_rate": 7.898113074147762e-05, + "loss": 0.8272, + "step": 29490 + }, + { + "epoch": 1.135514918190568, + "grad_norm": 1.061719536781311, + "learning_rate": 7.895748009403729e-05, + "loss": 1.3432, + "step": 29495 + }, + { + "epoch": 1.1357074109720886, + "grad_norm": 1.2878315448760986, + "learning_rate": 7.892791851667824e-05, + "loss": 1.045, + "step": 29500 + }, + { + "epoch": 1.1358999037536093, + "grad_norm": 1.3372122049331665, + "learning_rate": 7.889835886622145e-05, + "loss": 0.8942, + "step": 29505 + }, + { + "epoch": 1.13609239653513, + "grad_norm": 1.7610641717910767, + "learning_rate": 7.886880114536997e-05, + "loss": 0.9568, + "step": 29510 + }, + { + "epoch": 1.1362848893166506, + "grad_norm": 1.9298161268234253, + "learning_rate": 7.883924535682665e-05, + "loss": 0.8845, + "step": 29515 + }, + { + "epoch": 1.1364773820981713, + "grad_norm": 1.6231752634048462, + "learning_rate": 7.880969150329412e-05, + "loss": 0.7979, + "step": 29520 + }, + { + "epoch": 1.136669874879692, + "grad_norm": 2.3804054260253906, + "learning_rate": 7.878013958747496e-05, + "loss": 1.0056, + "step": 29525 + }, + { + "epoch": 1.1368623676612126, + "grad_norm": 1.7176682949066162, + "learning_rate": 7.875058961207148e-05, + "loss": 0.8073, + "step": 29530 + }, + { + "epoch": 1.1370548604427335, + "grad_norm": 1.6357464790344238, + "learning_rate": 7.872104157978576e-05, + "loss": 1.0359, + "step": 29535 + }, + { + "epoch": 1.1372473532242542, + "grad_norm": 0.9791454076766968, + "learning_rate": 7.869149549331989e-05, + "loss": 0.5608, + "step": 29540 + }, + { + "epoch": 1.1374398460057749, + "grad_norm": 1.282942533493042, + "learning_rate": 7.866195135537558e-05, + "loss": 0.8483, + "step": 29545 + }, + { + "epoch": 1.1376323387872955, + "grad_norm": 1.7176309823989868, + "learning_rate": 7.863240916865445e-05, + "loss": 0.8454, + "step": 29550 + }, + { + "epoch": 1.1378248315688162, + "grad_norm": 2.1249489784240723, + "learning_rate": 7.860286893585798e-05, + "loss": 1.0242, + "step": 29555 + }, + { + "epoch": 1.1380173243503369, + "grad_norm": 1.37616765499115, + "learning_rate": 7.85733306596874e-05, + "loss": 0.7752, + "step": 29560 + }, + { + "epoch": 1.1382098171318575, + "grad_norm": 1.312436580657959, + "learning_rate": 7.854379434284375e-05, + "loss": 0.7942, + "step": 29565 + }, + { + "epoch": 1.1384023099133782, + "grad_norm": 1.2645299434661865, + "learning_rate": 7.851425998802801e-05, + "loss": 0.8355, + "step": 29570 + }, + { + "epoch": 1.1385948026948989, + "grad_norm": 1.6107802391052246, + "learning_rate": 7.848472759794087e-05, + "loss": 0.7958, + "step": 29575 + }, + { + "epoch": 1.1387872954764195, + "grad_norm": 1.874605655670166, + "learning_rate": 7.845519717528284e-05, + "loss": 1.0369, + "step": 29580 + }, + { + "epoch": 1.1389797882579402, + "grad_norm": 0.8068329691886902, + "learning_rate": 7.842566872275433e-05, + "loss": 0.7825, + "step": 29585 + }, + { + "epoch": 1.139172281039461, + "grad_norm": 0.9913658499717712, + "learning_rate": 7.839614224305551e-05, + "loss": 0.8762, + "step": 29590 + }, + { + "epoch": 1.1393647738209818, + "grad_norm": 1.0012116432189941, + "learning_rate": 7.836661773888632e-05, + "loss": 0.7111, + "step": 29595 + }, + { + "epoch": 1.1395572666025024, + "grad_norm": 1.5434329509735107, + "learning_rate": 7.833709521294668e-05, + "loss": 0.7117, + "step": 29600 + }, + { + "epoch": 1.1397497593840231, + "grad_norm": 1.6689229011535645, + "learning_rate": 7.830757466793617e-05, + "loss": 0.878, + "step": 29605 + }, + { + "epoch": 1.1399422521655438, + "grad_norm": 1.267446756362915, + "learning_rate": 7.827805610655423e-05, + "loss": 0.8429, + "step": 29610 + }, + { + "epoch": 1.1401347449470645, + "grad_norm": 1.4829270839691162, + "learning_rate": 7.824853953150019e-05, + "loss": 0.9683, + "step": 29615 + }, + { + "epoch": 1.1403272377285851, + "grad_norm": 1.028507113456726, + "learning_rate": 7.821902494547309e-05, + "loss": 0.7345, + "step": 29620 + }, + { + "epoch": 1.1405197305101058, + "grad_norm": 1.30324125289917, + "learning_rate": 7.818951235117191e-05, + "loss": 0.8416, + "step": 29625 + }, + { + "epoch": 1.1407122232916265, + "grad_norm": 1.0241384506225586, + "learning_rate": 7.816000175129534e-05, + "loss": 0.9229, + "step": 29630 + }, + { + "epoch": 1.1409047160731474, + "grad_norm": 1.6417129039764404, + "learning_rate": 7.813049314854189e-05, + "loss": 1.075, + "step": 29635 + }, + { + "epoch": 1.141097208854668, + "grad_norm": 0.99880450963974, + "learning_rate": 7.810098654561002e-05, + "loss": 0.6836, + "step": 29640 + }, + { + "epoch": 1.1412897016361887, + "grad_norm": 1.3350822925567627, + "learning_rate": 7.807148194519784e-05, + "loss": 0.8302, + "step": 29645 + }, + { + "epoch": 1.1414821944177094, + "grad_norm": 1.4701656103134155, + "learning_rate": 7.804197935000336e-05, + "loss": 0.8758, + "step": 29650 + }, + { + "epoch": 1.14167468719923, + "grad_norm": 1.867395281791687, + "learning_rate": 7.801247876272444e-05, + "loss": 0.8234, + "step": 29655 + }, + { + "epoch": 1.1418671799807507, + "grad_norm": 1.0658506155014038, + "learning_rate": 7.798298018605868e-05, + "loss": 0.854, + "step": 29660 + }, + { + "epoch": 1.1420596727622714, + "grad_norm": 1.4265475273132324, + "learning_rate": 7.795348362270349e-05, + "loss": 0.8507, + "step": 29665 + }, + { + "epoch": 1.142252165543792, + "grad_norm": 1.2551696300506592, + "learning_rate": 7.792398907535622e-05, + "loss": 0.9168, + "step": 29670 + }, + { + "epoch": 1.1424446583253127, + "grad_norm": 1.5808075666427612, + "learning_rate": 7.789449654671391e-05, + "loss": 0.9197, + "step": 29675 + }, + { + "epoch": 1.1426371511068334, + "grad_norm": 1.2171685695648193, + "learning_rate": 7.786500603947342e-05, + "loss": 0.8965, + "step": 29680 + }, + { + "epoch": 1.142829643888354, + "grad_norm": 1.4671955108642578, + "learning_rate": 7.783551755633152e-05, + "loss": 0.84, + "step": 29685 + }, + { + "epoch": 1.143022136669875, + "grad_norm": 0.9525561332702637, + "learning_rate": 7.780603109998475e-05, + "loss": 0.9211, + "step": 29690 + }, + { + "epoch": 1.1432146294513956, + "grad_norm": 2.0476224422454834, + "learning_rate": 7.777654667312934e-05, + "loss": 0.8695, + "step": 29695 + }, + { + "epoch": 1.1434071222329163, + "grad_norm": 0.7733902335166931, + "learning_rate": 7.774706427846157e-05, + "loss": 0.8636, + "step": 29700 + }, + { + "epoch": 1.143599615014437, + "grad_norm": 2.125204086303711, + "learning_rate": 7.771758391867736e-05, + "loss": 1.0092, + "step": 29705 + }, + { + "epoch": 1.1437921077959576, + "grad_norm": 1.576104998588562, + "learning_rate": 7.768810559647248e-05, + "loss": 0.7684, + "step": 29710 + }, + { + "epoch": 1.1439846005774783, + "grad_norm": 1.8463459014892578, + "learning_rate": 7.765862931454256e-05, + "loss": 0.9924, + "step": 29715 + }, + { + "epoch": 1.144177093358999, + "grad_norm": 1.1519972085952759, + "learning_rate": 7.762915507558295e-05, + "loss": 0.8446, + "step": 29720 + }, + { + "epoch": 1.1443695861405196, + "grad_norm": 1.6794402599334717, + "learning_rate": 7.759968288228892e-05, + "loss": 0.8016, + "step": 29725 + }, + { + "epoch": 1.1445620789220405, + "grad_norm": 0.8587988018989563, + "learning_rate": 7.757021273735554e-05, + "loss": 0.7753, + "step": 29730 + }, + { + "epoch": 1.1447545717035612, + "grad_norm": 1.1401095390319824, + "learning_rate": 7.754074464347761e-05, + "loss": 0.7876, + "step": 29735 + }, + { + "epoch": 1.1449470644850819, + "grad_norm": 0.9994805455207825, + "learning_rate": 7.751127860334977e-05, + "loss": 0.9192, + "step": 29740 + }, + { + "epoch": 1.1451395572666025, + "grad_norm": 1.1678563356399536, + "learning_rate": 7.748181461966658e-05, + "loss": 0.8725, + "step": 29745 + }, + { + "epoch": 1.1453320500481232, + "grad_norm": 0.8279195427894592, + "learning_rate": 7.745235269512224e-05, + "loss": 0.7797, + "step": 29750 + }, + { + "epoch": 1.1455245428296439, + "grad_norm": 1.5446690320968628, + "learning_rate": 7.742289283241086e-05, + "loss": 0.7591, + "step": 29755 + }, + { + "epoch": 1.1457170356111646, + "grad_norm": 1.552101969718933, + "learning_rate": 7.739343503422641e-05, + "loss": 0.958, + "step": 29760 + }, + { + "epoch": 1.1459095283926852, + "grad_norm": 1.6127655506134033, + "learning_rate": 7.736397930326259e-05, + "loss": 0.8331, + "step": 29765 + }, + { + "epoch": 1.146102021174206, + "grad_norm": 1.63381028175354, + "learning_rate": 7.733452564221284e-05, + "loss": 1.0676, + "step": 29770 + }, + { + "epoch": 1.1462945139557266, + "grad_norm": 2.0071661472320557, + "learning_rate": 7.730507405377062e-05, + "loss": 0.8955, + "step": 29775 + }, + { + "epoch": 1.1464870067372472, + "grad_norm": 1.577379822731018, + "learning_rate": 7.727562454062906e-05, + "loss": 0.9404, + "step": 29780 + }, + { + "epoch": 1.1466794995187681, + "grad_norm": 1.0836290121078491, + "learning_rate": 7.724617710548104e-05, + "loss": 0.9939, + "step": 29785 + }, + { + "epoch": 1.1468719923002888, + "grad_norm": 1.5348353385925293, + "learning_rate": 7.721673175101944e-05, + "loss": 0.8655, + "step": 29790 + }, + { + "epoch": 1.1470644850818095, + "grad_norm": 1.7705544233322144, + "learning_rate": 7.718728847993679e-05, + "loss": 0.9213, + "step": 29795 + }, + { + "epoch": 1.1472569778633301, + "grad_norm": 1.228938341140747, + "learning_rate": 7.715784729492546e-05, + "loss": 0.8888, + "step": 29800 + }, + { + "epoch": 1.1474494706448508, + "grad_norm": 1.3519344329833984, + "learning_rate": 7.712840819867771e-05, + "loss": 0.7919, + "step": 29805 + }, + { + "epoch": 1.1476419634263715, + "grad_norm": 2.427093267440796, + "learning_rate": 7.709897119388553e-05, + "loss": 0.9861, + "step": 29810 + }, + { + "epoch": 1.1478344562078922, + "grad_norm": 1.0887788534164429, + "learning_rate": 7.706953628324069e-05, + "loss": 0.9012, + "step": 29815 + }, + { + "epoch": 1.1480269489894128, + "grad_norm": 1.7134822607040405, + "learning_rate": 7.704010346943488e-05, + "loss": 0.849, + "step": 29820 + }, + { + "epoch": 1.1482194417709337, + "grad_norm": 0.9797286987304688, + "learning_rate": 7.701067275515952e-05, + "loss": 0.8742, + "step": 29825 + }, + { + "epoch": 1.1484119345524544, + "grad_norm": 1.1243619918823242, + "learning_rate": 7.698124414310584e-05, + "loss": 0.7453, + "step": 29830 + }, + { + "epoch": 1.148604427333975, + "grad_norm": 0.9771775603294373, + "learning_rate": 7.695181763596489e-05, + "loss": 0.8668, + "step": 29835 + }, + { + "epoch": 1.1487969201154957, + "grad_norm": 1.8043612241744995, + "learning_rate": 7.692239323642753e-05, + "loss": 0.9907, + "step": 29840 + }, + { + "epoch": 1.1489894128970164, + "grad_norm": 2.071789503097534, + "learning_rate": 7.689297094718444e-05, + "loss": 0.8891, + "step": 29845 + }, + { + "epoch": 1.149181905678537, + "grad_norm": 1.8060542345046997, + "learning_rate": 7.686355077092611e-05, + "loss": 1.0377, + "step": 29850 + }, + { + "epoch": 1.1493743984600577, + "grad_norm": 1.0479457378387451, + "learning_rate": 7.683413271034279e-05, + "loss": 0.7771, + "step": 29855 + }, + { + "epoch": 1.1495668912415784, + "grad_norm": 1.4982253313064575, + "learning_rate": 7.680471676812453e-05, + "loss": 0.8573, + "step": 29860 + }, + { + "epoch": 1.149759384023099, + "grad_norm": 1.1916199922561646, + "learning_rate": 7.677530294696131e-05, + "loss": 0.726, + "step": 29865 + }, + { + "epoch": 1.1499518768046197, + "grad_norm": 1.0968197584152222, + "learning_rate": 7.674589124954274e-05, + "loss": 0.7354, + "step": 29870 + }, + { + "epoch": 1.1501443695861404, + "grad_norm": 1.3688973188400269, + "learning_rate": 7.671648167855842e-05, + "loss": 0.7691, + "step": 29875 + }, + { + "epoch": 1.1503368623676613, + "grad_norm": 2.174705743789673, + "learning_rate": 7.66870742366976e-05, + "loss": 0.8993, + "step": 29880 + }, + { + "epoch": 1.150529355149182, + "grad_norm": 1.718237280845642, + "learning_rate": 7.665766892664938e-05, + "loss": 0.8576, + "step": 29885 + }, + { + "epoch": 1.1507218479307026, + "grad_norm": 1.2384010553359985, + "learning_rate": 7.662826575110276e-05, + "loss": 0.8853, + "step": 29890 + }, + { + "epoch": 1.1509143407122233, + "grad_norm": 1.3564085960388184, + "learning_rate": 7.65988647127464e-05, + "loss": 0.7828, + "step": 29895 + }, + { + "epoch": 1.151106833493744, + "grad_norm": 1.820794701576233, + "learning_rate": 7.65694658142688e-05, + "loss": 0.893, + "step": 29900 + }, + { + "epoch": 1.1512993262752647, + "grad_norm": 2.1679587364196777, + "learning_rate": 7.654006905835838e-05, + "loss": 0.8594, + "step": 29905 + }, + { + "epoch": 1.1514918190567853, + "grad_norm": 2.045226573944092, + "learning_rate": 7.651067444770324e-05, + "loss": 1.0744, + "step": 29910 + }, + { + "epoch": 1.151684311838306, + "grad_norm": 1.4716484546661377, + "learning_rate": 7.648128198499128e-05, + "loss": 0.6354, + "step": 29915 + }, + { + "epoch": 1.1518768046198267, + "grad_norm": 2.3998184204101562, + "learning_rate": 7.64518916729103e-05, + "loss": 0.7672, + "step": 29920 + }, + { + "epoch": 1.1520692974013476, + "grad_norm": 1.5186283588409424, + "learning_rate": 7.642250351414786e-05, + "loss": 0.8835, + "step": 29925 + }, + { + "epoch": 1.1522617901828682, + "grad_norm": 1.9617729187011719, + "learning_rate": 7.639311751139127e-05, + "loss": 0.828, + "step": 29930 + }, + { + "epoch": 1.152454282964389, + "grad_norm": 0.981577455997467, + "learning_rate": 7.636373366732771e-05, + "loss": 0.8872, + "step": 29935 + }, + { + "epoch": 1.1526467757459096, + "grad_norm": 1.1515179872512817, + "learning_rate": 7.633435198464411e-05, + "loss": 0.9887, + "step": 29940 + }, + { + "epoch": 1.1528392685274302, + "grad_norm": 1.667133092880249, + "learning_rate": 7.630497246602725e-05, + "loss": 0.8211, + "step": 29945 + }, + { + "epoch": 1.153031761308951, + "grad_norm": 0.8391001224517822, + "learning_rate": 7.627559511416373e-05, + "loss": 0.8981, + "step": 29950 + }, + { + "epoch": 1.1532242540904716, + "grad_norm": 1.4484329223632812, + "learning_rate": 7.624621993173985e-05, + "loss": 0.865, + "step": 29955 + }, + { + "epoch": 1.1534167468719922, + "grad_norm": 1.7156023979187012, + "learning_rate": 7.621684692144178e-05, + "loss": 0.8699, + "step": 29960 + }, + { + "epoch": 1.153609239653513, + "grad_norm": 1.1260522603988647, + "learning_rate": 7.618747608595555e-05, + "loss": 0.9687, + "step": 29965 + }, + { + "epoch": 1.1538017324350336, + "grad_norm": 0.9241002202033997, + "learning_rate": 7.615810742796688e-05, + "loss": 0.7649, + "step": 29970 + }, + { + "epoch": 1.1539942252165543, + "grad_norm": 1.7365343570709229, + "learning_rate": 7.612874095016132e-05, + "loss": 0.9088, + "step": 29975 + }, + { + "epoch": 1.1541867179980752, + "grad_norm": 1.0506408214569092, + "learning_rate": 7.60993766552243e-05, + "loss": 0.9671, + "step": 29980 + }, + { + "epoch": 1.1543792107795958, + "grad_norm": 1.6173326969146729, + "learning_rate": 7.607001454584097e-05, + "loss": 0.9204, + "step": 29985 + }, + { + "epoch": 1.1545717035611165, + "grad_norm": 0.9955803155899048, + "learning_rate": 7.604065462469623e-05, + "loss": 0.923, + "step": 29990 + }, + { + "epoch": 1.1547641963426372, + "grad_norm": 1.820084571838379, + "learning_rate": 7.601129689447495e-05, + "loss": 0.9913, + "step": 29995 + }, + { + "epoch": 1.1549566891241578, + "grad_norm": 1.0505925416946411, + "learning_rate": 7.598194135786166e-05, + "loss": 0.8086, + "step": 30000 } ], "logging_steps": 5, @@ -28014,7 +42014,7 @@ "num_input_tokens_seen": 0, "num_train_epochs": 2, "save_steps": 10000, - "total_flos": 6.241026539658117e+17, + "total_flos": 9.370503974649938e+17, "train_batch_size": 2, "trial_name": null, "trial_params": null