diff --git "a/trainer_state.json" "b/trainer_state.json" --- "a/trainer_state.json" +++ "b/trainer_state.json" @@ -1,9 +1,9 @@ { "best_metric": null, "best_model_checkpoint": null, - "epoch": 0.3849855630413859, + "epoch": 0.7699711260827719, "eval_steps": 500, - "global_step": 10000, + "global_step": 20000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, @@ -14007,6 +14007,14006 @@ "learning_rate": 0.00018229640445610988, "loss": 1.1324, "step": 10000 + }, + { + "epoch": 0.38517805582290665, + "grad_norm": 1.413021445274353, + "learning_rate": 0.00018227922177125984, + "loss": 1.0402, + "step": 10005 + }, + { + "epoch": 0.3853705486044273, + "grad_norm": 1.125299334526062, + "learning_rate": 0.00018226203156252005, + "loss": 1.271, + "step": 10010 + }, + { + "epoch": 0.38556304138594805, + "grad_norm": 1.2611075639724731, + "learning_rate": 0.00018224483383146237, + "loss": 1.2228, + "step": 10015 + }, + { + "epoch": 0.3857555341674687, + "grad_norm": 1.0332306623458862, + "learning_rate": 0.00018222762857965944, + "loss": 1.2059, + "step": 10020 + }, + { + "epoch": 0.3859480269489894, + "grad_norm": 1.965288758277893, + "learning_rate": 0.00018221041580868464, + "loss": 1.217, + "step": 10025 + }, + { + "epoch": 0.3861405197305101, + "grad_norm": 0.8059799075126648, + "learning_rate": 0.00018219319552011186, + "loss": 1.2039, + "step": 10030 + }, + { + "epoch": 0.3863330125120308, + "grad_norm": 1.4955195188522339, + "learning_rate": 0.00018217596771551584, + "loss": 1.2206, + "step": 10035 + }, + { + "epoch": 0.3865255052935515, + "grad_norm": 0.987479567527771, + "learning_rate": 0.00018215873239647197, + "loss": 1.3134, + "step": 10040 + }, + { + "epoch": 0.3867179980750722, + "grad_norm": 1.7247464656829834, + "learning_rate": 0.00018214148956455627, + "loss": 1.1786, + "step": 10045 + }, + { + "epoch": 0.3869104908565929, + "grad_norm": 0.9822973608970642, + "learning_rate": 0.00018212423922134546, + "loss": 1.0866, + "step": 10050 + }, + { + "epoch": 0.3871029836381136, + "grad_norm": 1.1217613220214844, + "learning_rate": 0.000182106981368417, + "loss": 1.3292, + "step": 10055 + }, + { + "epoch": 0.38729547641963424, + "grad_norm": 1.2722941637039185, + "learning_rate": 0.000182089716007349, + "loss": 1.0294, + "step": 10060 + }, + { + "epoch": 0.38748796920115497, + "grad_norm": 1.6616365909576416, + "learning_rate": 0.00018207244313972026, + "loss": 1.3691, + "step": 10065 + }, + { + "epoch": 0.38768046198267564, + "grad_norm": 4.093936443328857, + "learning_rate": 0.0001820551627671103, + "loss": 1.1916, + "step": 10070 + }, + { + "epoch": 0.38787295476419636, + "grad_norm": 1.9061866998672485, + "learning_rate": 0.00018203787489109926, + "loss": 1.3733, + "step": 10075 + }, + { + "epoch": 0.38806544754571703, + "grad_norm": 1.6439005136489868, + "learning_rate": 0.00018202057951326804, + "loss": 1.3533, + "step": 10080 + }, + { + "epoch": 0.3882579403272377, + "grad_norm": 1.535980224609375, + "learning_rate": 0.0001820032766351981, + "loss": 1.3916, + "step": 10085 + }, + { + "epoch": 0.38845043310875843, + "grad_norm": 1.6342761516571045, + "learning_rate": 0.00018198596625847177, + "loss": 1.335, + "step": 10090 + }, + { + "epoch": 0.3886429258902791, + "grad_norm": 2.2760815620422363, + "learning_rate": 0.00018196864838467192, + "loss": 1.0399, + "step": 10095 + }, + { + "epoch": 0.3888354186717998, + "grad_norm": 1.173302412033081, + "learning_rate": 0.0001819513230153822, + "loss": 1.3414, + "step": 10100 + }, + { + "epoch": 0.3890279114533205, + "grad_norm": 1.7409497499465942, + "learning_rate": 0.00018193399015218684, + "loss": 1.3377, + "step": 10105 + }, + { + "epoch": 0.38922040423484117, + "grad_norm": 1.3547555208206177, + "learning_rate": 0.00018191664979667085, + "loss": 1.2576, + "step": 10110 + }, + { + "epoch": 0.3894128970163619, + "grad_norm": 2.2421867847442627, + "learning_rate": 0.0001818993019504199, + "loss": 1.1624, + "step": 10115 + }, + { + "epoch": 0.38960538979788256, + "grad_norm": 1.5812993049621582, + "learning_rate": 0.00018188194661502029, + "loss": 1.2319, + "step": 10120 + }, + { + "epoch": 0.3897978825794033, + "grad_norm": 1.8024287223815918, + "learning_rate": 0.00018186458379205908, + "loss": 1.4016, + "step": 10125 + }, + { + "epoch": 0.38999037536092396, + "grad_norm": 0.9069392681121826, + "learning_rate": 0.000181847213483124, + "loss": 1.1683, + "step": 10130 + }, + { + "epoch": 0.3901828681424447, + "grad_norm": 1.6808935403823853, + "learning_rate": 0.00018182983568980346, + "loss": 1.3519, + "step": 10135 + }, + { + "epoch": 0.39037536092396535, + "grad_norm": 2.584958553314209, + "learning_rate": 0.0001818124504136865, + "loss": 1.3804, + "step": 10140 + }, + { + "epoch": 0.390567853705486, + "grad_norm": 1.4569361209869385, + "learning_rate": 0.00018179505765636287, + "loss": 1.2862, + "step": 10145 + }, + { + "epoch": 0.39076034648700675, + "grad_norm": 2.0809457302093506, + "learning_rate": 0.0001817776574194231, + "loss": 1.1108, + "step": 10150 + }, + { + "epoch": 0.3909528392685274, + "grad_norm": 1.7902493476867676, + "learning_rate": 0.00018176024970445828, + "loss": 1.0611, + "step": 10155 + }, + { + "epoch": 0.39114533205004814, + "grad_norm": 0.9953207969665527, + "learning_rate": 0.00018174283451306025, + "loss": 1.1883, + "step": 10160 + }, + { + "epoch": 0.3913378248315688, + "grad_norm": 1.0629642009735107, + "learning_rate": 0.00018172541184682147, + "loss": 1.3, + "step": 10165 + }, + { + "epoch": 0.3915303176130895, + "grad_norm": 1.546132206916809, + "learning_rate": 0.0001817079817073352, + "loss": 1.2446, + "step": 10170 + }, + { + "epoch": 0.3917228103946102, + "grad_norm": 1.379883050918579, + "learning_rate": 0.0001816905440961952, + "loss": 1.2964, + "step": 10175 + }, + { + "epoch": 0.3919153031761309, + "grad_norm": 1.132592797279358, + "learning_rate": 0.00018167309901499613, + "loss": 1.3951, + "step": 10180 + }, + { + "epoch": 0.3921077959576516, + "grad_norm": 1.4765934944152832, + "learning_rate": 0.00018165564646533322, + "loss": 1.2278, + "step": 10185 + }, + { + "epoch": 0.3923002887391723, + "grad_norm": 1.5826079845428467, + "learning_rate": 0.00018163818644880233, + "loss": 1.2615, + "step": 10190 + }, + { + "epoch": 0.392492781520693, + "grad_norm": 1.5647984743118286, + "learning_rate": 0.00018162071896700007, + "loss": 1.4696, + "step": 10195 + }, + { + "epoch": 0.39268527430221367, + "grad_norm": 1.0377607345581055, + "learning_rate": 0.0001816032440215238, + "loss": 1.1309, + "step": 10200 + }, + { + "epoch": 0.39287776708373434, + "grad_norm": 1.1878221035003662, + "learning_rate": 0.0001815857616139714, + "loss": 1.1442, + "step": 10205 + }, + { + "epoch": 0.39307025986525507, + "grad_norm": 1.5119047164916992, + "learning_rate": 0.00018156827174594157, + "loss": 1.2436, + "step": 10210 + }, + { + "epoch": 0.39326275264677574, + "grad_norm": 1.6624690294265747, + "learning_rate": 0.00018155077441903364, + "loss": 1.1726, + "step": 10215 + }, + { + "epoch": 0.39345524542829646, + "grad_norm": 1.2995012998580933, + "learning_rate": 0.0001815332696348476, + "loss": 1.3053, + "step": 10220 + }, + { + "epoch": 0.39364773820981713, + "grad_norm": 1.3727355003356934, + "learning_rate": 0.00018151575739498417, + "loss": 1.4224, + "step": 10225 + }, + { + "epoch": 0.3938402309913378, + "grad_norm": 1.1980619430541992, + "learning_rate": 0.0001814982377010447, + "loss": 1.0973, + "step": 10230 + }, + { + "epoch": 0.39403272377285853, + "grad_norm": 1.4235668182373047, + "learning_rate": 0.00018148071055463128, + "loss": 1.1659, + "step": 10235 + }, + { + "epoch": 0.3942252165543792, + "grad_norm": 1.1501004695892334, + "learning_rate": 0.00018146317595734663, + "loss": 1.2738, + "step": 10240 + }, + { + "epoch": 0.3944177093358999, + "grad_norm": 1.1686300039291382, + "learning_rate": 0.00018144563391079419, + "loss": 1.1691, + "step": 10245 + }, + { + "epoch": 0.3946102021174206, + "grad_norm": 1.3350188732147217, + "learning_rate": 0.00018142808441657806, + "loss": 1.2344, + "step": 10250 + }, + { + "epoch": 0.39480269489894126, + "grad_norm": 1.0583946704864502, + "learning_rate": 0.00018141052747630302, + "loss": 1.1358, + "step": 10255 + }, + { + "epoch": 0.394995187680462, + "grad_norm": 1.0637165307998657, + "learning_rate": 0.00018139296309157454, + "loss": 1.2589, + "step": 10260 + }, + { + "epoch": 0.39518768046198266, + "grad_norm": 1.971304178237915, + "learning_rate": 0.00018137539126399874, + "loss": 1.1413, + "step": 10265 + }, + { + "epoch": 0.3953801732435034, + "grad_norm": 1.1685267686843872, + "learning_rate": 0.0001813578119951825, + "loss": 1.1702, + "step": 10270 + }, + { + "epoch": 0.39557266602502406, + "grad_norm": 1.620936393737793, + "learning_rate": 0.0001813402252867333, + "loss": 1.2636, + "step": 10275 + }, + { + "epoch": 0.3957651588065448, + "grad_norm": 1.553240180015564, + "learning_rate": 0.00018132263114025934, + "loss": 1.4167, + "step": 10280 + }, + { + "epoch": 0.39595765158806545, + "grad_norm": 1.260498285293579, + "learning_rate": 0.00018130502955736942, + "loss": 1.2984, + "step": 10285 + }, + { + "epoch": 0.3961501443695861, + "grad_norm": 1.7073127031326294, + "learning_rate": 0.0001812874205396732, + "loss": 1.29, + "step": 10290 + }, + { + "epoch": 0.39634263715110685, + "grad_norm": 0.900610625743866, + "learning_rate": 0.00018126980408878082, + "loss": 1.0423, + "step": 10295 + }, + { + "epoch": 0.3965351299326275, + "grad_norm": 1.359563946723938, + "learning_rate": 0.00018125218020630324, + "loss": 1.1576, + "step": 10300 + }, + { + "epoch": 0.39672762271414824, + "grad_norm": 0.9399506449699402, + "learning_rate": 0.000181234548893852, + "loss": 1.1481, + "step": 10305 + }, + { + "epoch": 0.3969201154956689, + "grad_norm": 1.4632538557052612, + "learning_rate": 0.00018121691015303944, + "loss": 1.1404, + "step": 10310 + }, + { + "epoch": 0.3971126082771896, + "grad_norm": 1.644718050956726, + "learning_rate": 0.00018119926398547839, + "loss": 1.1783, + "step": 10315 + }, + { + "epoch": 0.3973051010587103, + "grad_norm": 1.299018144607544, + "learning_rate": 0.00018118161039278258, + "loss": 1.2076, + "step": 10320 + }, + { + "epoch": 0.397497593840231, + "grad_norm": 1.5833697319030762, + "learning_rate": 0.00018116394937656632, + "loss": 1.0825, + "step": 10325 + }, + { + "epoch": 0.3976900866217517, + "grad_norm": 1.4813597202301025, + "learning_rate": 0.0001811462809384445, + "loss": 1.263, + "step": 10330 + }, + { + "epoch": 0.3978825794032724, + "grad_norm": 1.8714033365249634, + "learning_rate": 0.00018112860508003284, + "loss": 1.2425, + "step": 10335 + }, + { + "epoch": 0.39807507218479304, + "grad_norm": 1.5847947597503662, + "learning_rate": 0.0001811109218029477, + "loss": 1.0863, + "step": 10340 + }, + { + "epoch": 0.39826756496631377, + "grad_norm": 1.339046597480774, + "learning_rate": 0.00018109323110880604, + "loss": 1.3871, + "step": 10345 + }, + { + "epoch": 0.39846005774783444, + "grad_norm": 2.370396375656128, + "learning_rate": 0.0001810755329992256, + "loss": 1.2629, + "step": 10350 + }, + { + "epoch": 0.39865255052935517, + "grad_norm": 1.2930303812026978, + "learning_rate": 0.00018105782747582474, + "loss": 1.1281, + "step": 10355 + }, + { + "epoch": 0.39884504331087584, + "grad_norm": 1.2590947151184082, + "learning_rate": 0.0001810401145402225, + "loss": 1.2229, + "step": 10360 + }, + { + "epoch": 0.39903753609239656, + "grad_norm": 0.8280492424964905, + "learning_rate": 0.00018102239419403866, + "loss": 1.2601, + "step": 10365 + }, + { + "epoch": 0.39923002887391723, + "grad_norm": 1.6567853689193726, + "learning_rate": 0.0001810046664388936, + "loss": 1.1296, + "step": 10370 + }, + { + "epoch": 0.3994225216554379, + "grad_norm": 1.2103195190429688, + "learning_rate": 0.00018098693127640834, + "loss": 1.1524, + "step": 10375 + }, + { + "epoch": 0.3996150144369586, + "grad_norm": 1.4716650247573853, + "learning_rate": 0.00018096918870820475, + "loss": 1.1805, + "step": 10380 + }, + { + "epoch": 0.3998075072184793, + "grad_norm": 1.291873574256897, + "learning_rate": 0.00018095143873590524, + "loss": 1.2877, + "step": 10385 + }, + { + "epoch": 0.4, + "grad_norm": 0.8508723974227905, + "learning_rate": 0.0001809336813611329, + "loss": 0.9215, + "step": 10390 + }, + { + "epoch": 0.4001924927815207, + "grad_norm": 1.1256935596466064, + "learning_rate": 0.00018091591658551154, + "loss": 1.3286, + "step": 10395 + }, + { + "epoch": 0.40038498556304136, + "grad_norm": 1.1910960674285889, + "learning_rate": 0.0001808981444106656, + "loss": 1.1078, + "step": 10400 + }, + { + "epoch": 0.4005774783445621, + "grad_norm": 2.188884735107422, + "learning_rate": 0.00018088036483822028, + "loss": 1.2762, + "step": 10405 + }, + { + "epoch": 0.40076997112608276, + "grad_norm": 0.9240724444389343, + "learning_rate": 0.00018086257786980136, + "loss": 1.1288, + "step": 10410 + }, + { + "epoch": 0.4009624639076035, + "grad_norm": 1.961204171180725, + "learning_rate": 0.00018084478350703537, + "loss": 1.1863, + "step": 10415 + }, + { + "epoch": 0.40115495668912415, + "grad_norm": 1.5713763236999512, + "learning_rate": 0.00018082698175154947, + "loss": 1.2157, + "step": 10420 + }, + { + "epoch": 0.4013474494706448, + "grad_norm": 2.006776809692383, + "learning_rate": 0.00018080917260497153, + "loss": 1.1671, + "step": 10425 + }, + { + "epoch": 0.40153994225216555, + "grad_norm": 1.511513352394104, + "learning_rate": 0.00018079135606893006, + "loss": 1.2428, + "step": 10430 + }, + { + "epoch": 0.4017324350336862, + "grad_norm": 1.5270637273788452, + "learning_rate": 0.00018077353214505427, + "loss": 1.2887, + "step": 10435 + }, + { + "epoch": 0.40192492781520694, + "grad_norm": 1.470389723777771, + "learning_rate": 0.00018075570083497407, + "loss": 1.2739, + "step": 10440 + }, + { + "epoch": 0.4021174205967276, + "grad_norm": 1.224330186843872, + "learning_rate": 0.00018073786214031992, + "loss": 1.0882, + "step": 10445 + }, + { + "epoch": 0.40230991337824834, + "grad_norm": 2.0693979263305664, + "learning_rate": 0.00018072001606272316, + "loss": 1.5091, + "step": 10450 + }, + { + "epoch": 0.402502406159769, + "grad_norm": 1.418346643447876, + "learning_rate": 0.00018070216260381567, + "loss": 1.2886, + "step": 10455 + }, + { + "epoch": 0.4026948989412897, + "grad_norm": 1.8632601499557495, + "learning_rate": 0.00018068430176522998, + "loss": 1.1809, + "step": 10460 + }, + { + "epoch": 0.4028873917228104, + "grad_norm": 1.6064730882644653, + "learning_rate": 0.00018066643354859937, + "loss": 1.2394, + "step": 10465 + }, + { + "epoch": 0.4030798845043311, + "grad_norm": 1.2319833040237427, + "learning_rate": 0.0001806485579555578, + "loss": 1.1979, + "step": 10470 + }, + { + "epoch": 0.4032723772858518, + "grad_norm": 1.5506865978240967, + "learning_rate": 0.00018063067498773987, + "loss": 1.1899, + "step": 10475 + }, + { + "epoch": 0.40346487006737247, + "grad_norm": 1.360120415687561, + "learning_rate": 0.00018061278464678082, + "loss": 1.0995, + "step": 10480 + }, + { + "epoch": 0.40365736284889314, + "grad_norm": 1.133346438407898, + "learning_rate": 0.00018059488693431664, + "loss": 1.1972, + "step": 10485 + }, + { + "epoch": 0.40384985563041387, + "grad_norm": 1.6961482763290405, + "learning_rate": 0.00018057698185198394, + "loss": 1.0823, + "step": 10490 + }, + { + "epoch": 0.40404234841193454, + "grad_norm": 1.0126832723617554, + "learning_rate": 0.00018055906940142, + "loss": 1.3294, + "step": 10495 + }, + { + "epoch": 0.40423484119345526, + "grad_norm": 1.378825068473816, + "learning_rate": 0.00018054114958426283, + "loss": 1.3188, + "step": 10500 + }, + { + "epoch": 0.40442733397497593, + "grad_norm": 1.1392402648925781, + "learning_rate": 0.00018052322240215104, + "loss": 1.2428, + "step": 10505 + }, + { + "epoch": 0.40461982675649666, + "grad_norm": 1.0441240072250366, + "learning_rate": 0.00018050528785672402, + "loss": 1.2997, + "step": 10510 + }, + { + "epoch": 0.40481231953801733, + "grad_norm": 1.3564190864562988, + "learning_rate": 0.00018048734594962171, + "loss": 1.3018, + "step": 10515 + }, + { + "epoch": 0.405004812319538, + "grad_norm": 1.3429349660873413, + "learning_rate": 0.0001804693966824848, + "loss": 1.0567, + "step": 10520 + }, + { + "epoch": 0.4051973051010587, + "grad_norm": 0.920313835144043, + "learning_rate": 0.00018045144005695462, + "loss": 1.1386, + "step": 10525 + }, + { + "epoch": 0.4053897978825794, + "grad_norm": 2.402700662612915, + "learning_rate": 0.00018043347607467317, + "loss": 1.2837, + "step": 10530 + }, + { + "epoch": 0.4055822906641001, + "grad_norm": 1.7154083251953125, + "learning_rate": 0.00018041550473728318, + "loss": 1.3188, + "step": 10535 + }, + { + "epoch": 0.4057747834456208, + "grad_norm": 0.8770251274108887, + "learning_rate": 0.000180397526046428, + "loss": 1.1641, + "step": 10540 + }, + { + "epoch": 0.40596727622714146, + "grad_norm": 0.9887571334838867, + "learning_rate": 0.0001803795400037516, + "loss": 1.0042, + "step": 10545 + }, + { + "epoch": 0.4061597690086622, + "grad_norm": 2.665354013442993, + "learning_rate": 0.00018036154661089877, + "loss": 1.2579, + "step": 10550 + }, + { + "epoch": 0.40635226179018286, + "grad_norm": 2.6088809967041016, + "learning_rate": 0.00018034354586951486, + "loss": 1.1098, + "step": 10555 + }, + { + "epoch": 0.4065447545717036, + "grad_norm": 1.4641830921173096, + "learning_rate": 0.00018032553778124586, + "loss": 1.1108, + "step": 10560 + }, + { + "epoch": 0.40673724735322425, + "grad_norm": 1.0744770765304565, + "learning_rate": 0.00018030752234773854, + "loss": 1.1234, + "step": 10565 + }, + { + "epoch": 0.4069297401347449, + "grad_norm": 1.2617886066436768, + "learning_rate": 0.00018028949957064034, + "loss": 1.1753, + "step": 10570 + }, + { + "epoch": 0.40712223291626565, + "grad_norm": 1.4641857147216797, + "learning_rate": 0.00018027146945159923, + "loss": 1.2671, + "step": 10575 + }, + { + "epoch": 0.4073147256977863, + "grad_norm": 1.4347914457321167, + "learning_rate": 0.00018025343199226402, + "loss": 1.1348, + "step": 10580 + }, + { + "epoch": 0.40750721847930704, + "grad_norm": 1.434019923210144, + "learning_rate": 0.00018023538719428407, + "loss": 1.2439, + "step": 10585 + }, + { + "epoch": 0.4076997112608277, + "grad_norm": 1.1034338474273682, + "learning_rate": 0.00018021733505930944, + "loss": 1.0502, + "step": 10590 + }, + { + "epoch": 0.40789220404234844, + "grad_norm": 1.591850996017456, + "learning_rate": 0.00018019927558899097, + "loss": 1.178, + "step": 10595 + }, + { + "epoch": 0.4080846968238691, + "grad_norm": 1.672735333442688, + "learning_rate": 0.00018018120878498, + "loss": 1.2363, + "step": 10600 + }, + { + "epoch": 0.4082771896053898, + "grad_norm": 1.8779442310333252, + "learning_rate": 0.00018016313464892862, + "loss": 1.2537, + "step": 10605 + }, + { + "epoch": 0.4084696823869105, + "grad_norm": 1.075453281402588, + "learning_rate": 0.00018014505318248963, + "loss": 1.081, + "step": 10610 + }, + { + "epoch": 0.4086621751684312, + "grad_norm": 1.350914478302002, + "learning_rate": 0.0001801269643873164, + "loss": 1.3958, + "step": 10615 + }, + { + "epoch": 0.4088546679499519, + "grad_norm": 1.6566729545593262, + "learning_rate": 0.0001801088682650631, + "loss": 1.3208, + "step": 10620 + }, + { + "epoch": 0.40904716073147257, + "grad_norm": 1.243171215057373, + "learning_rate": 0.00018009076481738446, + "loss": 1.17, + "step": 10625 + }, + { + "epoch": 0.40923965351299324, + "grad_norm": 1.110456109046936, + "learning_rate": 0.00018007265404593593, + "loss": 1.1311, + "step": 10630 + }, + { + "epoch": 0.40943214629451397, + "grad_norm": 2.485719919204712, + "learning_rate": 0.00018005453595237362, + "loss": 1.3703, + "step": 10635 + }, + { + "epoch": 0.40962463907603464, + "grad_norm": 1.3115043640136719, + "learning_rate": 0.00018003641053835435, + "loss": 1.2551, + "step": 10640 + }, + { + "epoch": 0.40981713185755536, + "grad_norm": 1.530535340309143, + "learning_rate": 0.0001800182778055355, + "loss": 1.3577, + "step": 10645 + }, + { + "epoch": 0.41000962463907603, + "grad_norm": 1.2339287996292114, + "learning_rate": 0.00018000013775557521, + "loss": 1.2539, + "step": 10650 + }, + { + "epoch": 0.4102021174205967, + "grad_norm": 1.579942226409912, + "learning_rate": 0.00017998199039013225, + "loss": 1.3568, + "step": 10655 + }, + { + "epoch": 0.4103946102021174, + "grad_norm": 1.831764817237854, + "learning_rate": 0.00017996383571086612, + "loss": 1.3662, + "step": 10660 + }, + { + "epoch": 0.4105871029836381, + "grad_norm": 2.1747963428497314, + "learning_rate": 0.00017994567371943697, + "loss": 1.1333, + "step": 10665 + }, + { + "epoch": 0.4107795957651588, + "grad_norm": 1.5603039264678955, + "learning_rate": 0.00017992750441750549, + "loss": 1.2327, + "step": 10670 + }, + { + "epoch": 0.4109720885466795, + "grad_norm": 1.7836112976074219, + "learning_rate": 0.00017990932780673324, + "loss": 1.0281, + "step": 10675 + }, + { + "epoch": 0.4111645813282002, + "grad_norm": 1.5049426555633545, + "learning_rate": 0.0001798911438887823, + "loss": 1.3338, + "step": 10680 + }, + { + "epoch": 0.4113570741097209, + "grad_norm": 1.6236990690231323, + "learning_rate": 0.00017987295266531548, + "loss": 1.3937, + "step": 10685 + }, + { + "epoch": 0.41154956689124156, + "grad_norm": 1.2450697422027588, + "learning_rate": 0.00017985475413799623, + "loss": 1.3456, + "step": 10690 + }, + { + "epoch": 0.4117420596727623, + "grad_norm": 1.031137228012085, + "learning_rate": 0.00017983654830848873, + "loss": 1.0254, + "step": 10695 + }, + { + "epoch": 0.41193455245428295, + "grad_norm": 1.588884949684143, + "learning_rate": 0.00017981833517845773, + "loss": 1.0554, + "step": 10700 + }, + { + "epoch": 0.4121270452358037, + "grad_norm": 1.2405824661254883, + "learning_rate": 0.00017980011474956874, + "loss": 1.4561, + "step": 10705 + }, + { + "epoch": 0.41231953801732435, + "grad_norm": 2.03009295463562, + "learning_rate": 0.00017978188702348792, + "loss": 1.2479, + "step": 10710 + }, + { + "epoch": 0.412512030798845, + "grad_norm": 0.9755954146385193, + "learning_rate": 0.00017976365200188198, + "loss": 1.1632, + "step": 10715 + }, + { + "epoch": 0.41270452358036575, + "grad_norm": 1.3121798038482666, + "learning_rate": 0.00017974540968641848, + "loss": 1.2069, + "step": 10720 + }, + { + "epoch": 0.4128970163618864, + "grad_norm": 1.880199909210205, + "learning_rate": 0.00017972716007876556, + "loss": 1.32, + "step": 10725 + }, + { + "epoch": 0.41308950914340714, + "grad_norm": 2.1090636253356934, + "learning_rate": 0.00017970890318059194, + "loss": 1.2943, + "step": 10730 + }, + { + "epoch": 0.4132820019249278, + "grad_norm": 1.2155611515045166, + "learning_rate": 0.00017969063899356716, + "loss": 1.3022, + "step": 10735 + }, + { + "epoch": 0.4134744947064485, + "grad_norm": 1.191871166229248, + "learning_rate": 0.00017967236751936135, + "loss": 1.2699, + "step": 10740 + }, + { + "epoch": 0.4136669874879692, + "grad_norm": 1.4702094793319702, + "learning_rate": 0.00017965408875964534, + "loss": 1.3936, + "step": 10745 + }, + { + "epoch": 0.4138594802694899, + "grad_norm": 1.7658724784851074, + "learning_rate": 0.00017963580271609052, + "loss": 1.1633, + "step": 10750 + }, + { + "epoch": 0.4140519730510106, + "grad_norm": 1.5030126571655273, + "learning_rate": 0.00017961750939036913, + "loss": 1.4213, + "step": 10755 + }, + { + "epoch": 0.4142444658325313, + "grad_norm": 1.5616711378097534, + "learning_rate": 0.0001795992087841539, + "loss": 1.3342, + "step": 10760 + }, + { + "epoch": 0.414436958614052, + "grad_norm": 1.2506111860275269, + "learning_rate": 0.0001795809008991183, + "loss": 1.0034, + "step": 10765 + }, + { + "epoch": 0.41462945139557267, + "grad_norm": 1.1011154651641846, + "learning_rate": 0.00017956258573693657, + "loss": 1.0936, + "step": 10770 + }, + { + "epoch": 0.41482194417709334, + "grad_norm": 1.2040156126022339, + "learning_rate": 0.00017954426329928335, + "loss": 1.1974, + "step": 10775 + }, + { + "epoch": 0.41501443695861406, + "grad_norm": 1.5271620750427246, + "learning_rate": 0.0001795259335878342, + "loss": 1.2563, + "step": 10780 + }, + { + "epoch": 0.41520692974013473, + "grad_norm": 1.342129111289978, + "learning_rate": 0.00017950759660426523, + "loss": 1.2319, + "step": 10785 + }, + { + "epoch": 0.41539942252165546, + "grad_norm": 0.9986871480941772, + "learning_rate": 0.00017948925235025326, + "loss": 1.0781, + "step": 10790 + }, + { + "epoch": 0.41559191530317613, + "grad_norm": 1.107088327407837, + "learning_rate": 0.00017947090082747573, + "loss": 1.1499, + "step": 10795 + }, + { + "epoch": 0.4157844080846968, + "grad_norm": 1.5566056966781616, + "learning_rate": 0.00017945254203761076, + "loss": 1.0997, + "step": 10800 + }, + { + "epoch": 0.4159769008662175, + "grad_norm": 1.4681777954101562, + "learning_rate": 0.00017943417598233715, + "loss": 1.3307, + "step": 10805 + }, + { + "epoch": 0.4161693936477382, + "grad_norm": 1.4198453426361084, + "learning_rate": 0.00017941580266333433, + "loss": 0.9664, + "step": 10810 + }, + { + "epoch": 0.4163618864292589, + "grad_norm": 1.1474230289459229, + "learning_rate": 0.00017939742208228246, + "loss": 1.2454, + "step": 10815 + }, + { + "epoch": 0.4165543792107796, + "grad_norm": 1.186672568321228, + "learning_rate": 0.00017937903424086228, + "loss": 1.3311, + "step": 10820 + }, + { + "epoch": 0.4167468719923003, + "grad_norm": 1.4548507928848267, + "learning_rate": 0.00017936063914075526, + "loss": 1.2508, + "step": 10825 + }, + { + "epoch": 0.416939364773821, + "grad_norm": 1.0224876403808594, + "learning_rate": 0.00017934223678364353, + "loss": 0.9364, + "step": 10830 + }, + { + "epoch": 0.41713185755534166, + "grad_norm": 1.5561485290527344, + "learning_rate": 0.00017932382717120984, + "loss": 1.1686, + "step": 10835 + }, + { + "epoch": 0.4173243503368624, + "grad_norm": 1.9549082517623901, + "learning_rate": 0.00017930541030513762, + "loss": 1.2678, + "step": 10840 + }, + { + "epoch": 0.41751684311838305, + "grad_norm": 1.2266019582748413, + "learning_rate": 0.00017928698618711094, + "loss": 1.2963, + "step": 10845 + }, + { + "epoch": 0.4177093358999038, + "grad_norm": 0.6992445588111877, + "learning_rate": 0.00017926855481881465, + "loss": 1.1042, + "step": 10850 + }, + { + "epoch": 0.41790182868142445, + "grad_norm": 1.515512466430664, + "learning_rate": 0.00017925011620193408, + "loss": 1.0718, + "step": 10855 + }, + { + "epoch": 0.4180943214629451, + "grad_norm": 1.5123271942138672, + "learning_rate": 0.0001792316703381554, + "loss": 1.1307, + "step": 10860 + }, + { + "epoch": 0.41828681424446584, + "grad_norm": 1.3709865808486938, + "learning_rate": 0.00017921321722916535, + "loss": 1.3652, + "step": 10865 + }, + { + "epoch": 0.4184793070259865, + "grad_norm": 1.3327142000198364, + "learning_rate": 0.0001791947568766513, + "loss": 1.2644, + "step": 10870 + }, + { + "epoch": 0.41867179980750724, + "grad_norm": 1.460595726966858, + "learning_rate": 0.00017917628928230134, + "loss": 1.2783, + "step": 10875 + }, + { + "epoch": 0.4188642925890279, + "grad_norm": 1.1008737087249756, + "learning_rate": 0.00017915781444780425, + "loss": 1.2889, + "step": 10880 + }, + { + "epoch": 0.4190567853705486, + "grad_norm": 1.8467929363250732, + "learning_rate": 0.00017913933237484936, + "loss": 1.1897, + "step": 10885 + }, + { + "epoch": 0.4192492781520693, + "grad_norm": 1.286544680595398, + "learning_rate": 0.00017912084306512683, + "loss": 1.1239, + "step": 10890 + }, + { + "epoch": 0.41944177093359, + "grad_norm": 1.8240995407104492, + "learning_rate": 0.00017910234652032726, + "loss": 1.3085, + "step": 10895 + }, + { + "epoch": 0.4196342637151107, + "grad_norm": 1.1262156963348389, + "learning_rate": 0.00017908384274214215, + "loss": 1.3779, + "step": 10900 + }, + { + "epoch": 0.41982675649663137, + "grad_norm": 1.2274012565612793, + "learning_rate": 0.0001790653317322635, + "loss": 1.3361, + "step": 10905 + }, + { + "epoch": 0.4200192492781521, + "grad_norm": 2.0522284507751465, + "learning_rate": 0.000179046813492384, + "loss": 1.2329, + "step": 10910 + }, + { + "epoch": 0.42021174205967277, + "grad_norm": 1.927666187286377, + "learning_rate": 0.0001790282880241971, + "loss": 1.2217, + "step": 10915 + }, + { + "epoch": 0.42040423484119344, + "grad_norm": 2.254720687866211, + "learning_rate": 0.0001790097553293967, + "loss": 1.2867, + "step": 10920 + }, + { + "epoch": 0.42059672762271416, + "grad_norm": 1.9560370445251465, + "learning_rate": 0.0001789912154096776, + "loss": 1.2959, + "step": 10925 + }, + { + "epoch": 0.42078922040423483, + "grad_norm": 1.109393835067749, + "learning_rate": 0.00017897266826673517, + "loss": 1.2397, + "step": 10930 + }, + { + "epoch": 0.42098171318575556, + "grad_norm": 1.1880956888198853, + "learning_rate": 0.00017895411390226527, + "loss": 1.192, + "step": 10935 + }, + { + "epoch": 0.4211742059672762, + "grad_norm": 1.851517677307129, + "learning_rate": 0.00017893555231796477, + "loss": 1.1866, + "step": 10940 + }, + { + "epoch": 0.4213666987487969, + "grad_norm": 1.1871724128723145, + "learning_rate": 0.0001789169835155309, + "loss": 1.1627, + "step": 10945 + }, + { + "epoch": 0.4215591915303176, + "grad_norm": 0.9478880167007446, + "learning_rate": 0.0001788984074966616, + "loss": 1.198, + "step": 10950 + }, + { + "epoch": 0.4217516843118383, + "grad_norm": 1.753989577293396, + "learning_rate": 0.00017887982426305566, + "loss": 1.2923, + "step": 10955 + }, + { + "epoch": 0.421944177093359, + "grad_norm": 2.161820650100708, + "learning_rate": 0.00017886123381641227, + "loss": 1.2651, + "step": 10960 + }, + { + "epoch": 0.4221366698748797, + "grad_norm": 1.203307867050171, + "learning_rate": 0.00017884263615843145, + "loss": 1.1854, + "step": 10965 + }, + { + "epoch": 0.42232916265640036, + "grad_norm": 1.6671913862228394, + "learning_rate": 0.0001788240312908139, + "loss": 1.2466, + "step": 10970 + }, + { + "epoch": 0.4225216554379211, + "grad_norm": 1.643796443939209, + "learning_rate": 0.0001788054192152608, + "loss": 1.202, + "step": 10975 + }, + { + "epoch": 0.42271414821944175, + "grad_norm": 1.024296522140503, + "learning_rate": 0.00017878679993347415, + "loss": 1.2392, + "step": 10980 + }, + { + "epoch": 0.4229066410009625, + "grad_norm": 1.363425612449646, + "learning_rate": 0.0001787681734471566, + "loss": 1.3577, + "step": 10985 + }, + { + "epoch": 0.42309913378248315, + "grad_norm": 1.7815190553665161, + "learning_rate": 0.00017874953975801134, + "loss": 0.9826, + "step": 10990 + }, + { + "epoch": 0.4232916265640039, + "grad_norm": 1.6736468076705933, + "learning_rate": 0.00017873089886774236, + "loss": 1.168, + "step": 10995 + }, + { + "epoch": 0.42348411934552455, + "grad_norm": 1.3047553300857544, + "learning_rate": 0.0001787122507780542, + "loss": 1.1839, + "step": 11000 + }, + { + "epoch": 0.4236766121270452, + "grad_norm": 1.5737935304641724, + "learning_rate": 0.00017869359549065216, + "loss": 1.0693, + "step": 11005 + }, + { + "epoch": 0.42386910490856594, + "grad_norm": 0.9130328893661499, + "learning_rate": 0.00017867493300724208, + "loss": 1.1609, + "step": 11010 + }, + { + "epoch": 0.4240615976900866, + "grad_norm": 2.444490432739258, + "learning_rate": 0.00017865626332953056, + "loss": 1.2422, + "step": 11015 + }, + { + "epoch": 0.42425409047160734, + "grad_norm": 1.4214091300964355, + "learning_rate": 0.00017863758645922481, + "loss": 1.2028, + "step": 11020 + }, + { + "epoch": 0.424446583253128, + "grad_norm": 1.3986276388168335, + "learning_rate": 0.0001786189023980327, + "loss": 0.9271, + "step": 11025 + }, + { + "epoch": 0.4246390760346487, + "grad_norm": 1.6309832334518433, + "learning_rate": 0.00017860021114766275, + "loss": 1.1242, + "step": 11030 + }, + { + "epoch": 0.4248315688161694, + "grad_norm": 1.0703374147415161, + "learning_rate": 0.00017858151270982423, + "loss": 1.1688, + "step": 11035 + }, + { + "epoch": 0.4250240615976901, + "grad_norm": 0.9345492720603943, + "learning_rate": 0.00017856280708622687, + "loss": 1.0759, + "step": 11040 + }, + { + "epoch": 0.4252165543792108, + "grad_norm": 1.1012792587280273, + "learning_rate": 0.00017854409427858124, + "loss": 1.3299, + "step": 11045 + }, + { + "epoch": 0.42540904716073147, + "grad_norm": 1.087344765663147, + "learning_rate": 0.00017852537428859853, + "loss": 1.1188, + "step": 11050 + }, + { + "epoch": 0.42560153994225214, + "grad_norm": 1.0374698638916016, + "learning_rate": 0.0001785066471179905, + "loss": 1.2403, + "step": 11055 + }, + { + "epoch": 0.42579403272377286, + "grad_norm": 1.2250018119812012, + "learning_rate": 0.00017848791276846963, + "loss": 1.1217, + "step": 11060 + }, + { + "epoch": 0.42598652550529353, + "grad_norm": 1.9863545894622803, + "learning_rate": 0.0001784691712417491, + "loss": 1.0159, + "step": 11065 + }, + { + "epoch": 0.42617901828681426, + "grad_norm": 1.3587582111358643, + "learning_rate": 0.0001784504225395427, + "loss": 1.1266, + "step": 11070 + }, + { + "epoch": 0.42637151106833493, + "grad_norm": 1.3274664878845215, + "learning_rate": 0.0001784316666635648, + "loss": 1.2295, + "step": 11075 + }, + { + "epoch": 0.42656400384985566, + "grad_norm": 1.594498872756958, + "learning_rate": 0.00017841290361553057, + "loss": 1.2942, + "step": 11080 + }, + { + "epoch": 0.4267564966313763, + "grad_norm": 2.5940325260162354, + "learning_rate": 0.00017839413339715572, + "loss": 1.3333, + "step": 11085 + }, + { + "epoch": 0.426948989412897, + "grad_norm": 1.5368024110794067, + "learning_rate": 0.0001783753560101567, + "loss": 1.2738, + "step": 11090 + }, + { + "epoch": 0.4271414821944177, + "grad_norm": 1.8095320463180542, + "learning_rate": 0.00017835657145625055, + "loss": 1.3245, + "step": 11095 + }, + { + "epoch": 0.4273339749759384, + "grad_norm": 1.4597771167755127, + "learning_rate": 0.000178337779737155, + "loss": 1.3837, + "step": 11100 + }, + { + "epoch": 0.4275264677574591, + "grad_norm": 1.052746057510376, + "learning_rate": 0.00017831898085458842, + "loss": 1.1603, + "step": 11105 + }, + { + "epoch": 0.4277189605389798, + "grad_norm": 1.547523856163025, + "learning_rate": 0.0001783001748102699, + "loss": 1.2277, + "step": 11110 + }, + { + "epoch": 0.42791145332050046, + "grad_norm": 2.109560012817383, + "learning_rate": 0.00017828136160591906, + "loss": 1.1299, + "step": 11115 + }, + { + "epoch": 0.4281039461020212, + "grad_norm": 0.9221099019050598, + "learning_rate": 0.00017826254124325626, + "loss": 1.1447, + "step": 11120 + }, + { + "epoch": 0.42829643888354185, + "grad_norm": 1.1257829666137695, + "learning_rate": 0.00017824371372400255, + "loss": 1.0844, + "step": 11125 + }, + { + "epoch": 0.4284889316650626, + "grad_norm": 1.9643393754959106, + "learning_rate": 0.00017822487904987948, + "loss": 1.1511, + "step": 11130 + }, + { + "epoch": 0.42868142444658325, + "grad_norm": 1.2279611825942993, + "learning_rate": 0.00017820603722260944, + "loss": 1.3039, + "step": 11135 + }, + { + "epoch": 0.428873917228104, + "grad_norm": 1.8037766218185425, + "learning_rate": 0.00017818718824391536, + "loss": 1.2338, + "step": 11140 + }, + { + "epoch": 0.42906641000962464, + "grad_norm": 2.1256327629089355, + "learning_rate": 0.00017816833211552085, + "loss": 1.2502, + "step": 11145 + }, + { + "epoch": 0.4292589027911453, + "grad_norm": 1.1520932912826538, + "learning_rate": 0.0001781494688391502, + "loss": 1.121, + "step": 11150 + }, + { + "epoch": 0.42945139557266604, + "grad_norm": 1.1287842988967896, + "learning_rate": 0.00017813059841652833, + "loss": 1.2012, + "step": 11155 + }, + { + "epoch": 0.4296438883541867, + "grad_norm": 1.2584294080734253, + "learning_rate": 0.00017811172084938076, + "loss": 1.3221, + "step": 11160 + }, + { + "epoch": 0.42983638113570743, + "grad_norm": 1.901994228363037, + "learning_rate": 0.0001780928361394338, + "loss": 1.1184, + "step": 11165 + }, + { + "epoch": 0.4300288739172281, + "grad_norm": 1.564501166343689, + "learning_rate": 0.00017807394428841428, + "loss": 1.12, + "step": 11170 + }, + { + "epoch": 0.4302213666987488, + "grad_norm": 2.138155221939087, + "learning_rate": 0.00017805504529804975, + "loss": 1.1928, + "step": 11175 + }, + { + "epoch": 0.4304138594802695, + "grad_norm": 1.3132466077804565, + "learning_rate": 0.00017803613917006841, + "loss": 1.2674, + "step": 11180 + }, + { + "epoch": 0.43060635226179017, + "grad_norm": 1.1847275495529175, + "learning_rate": 0.00017801722590619903, + "loss": 1.1457, + "step": 11185 + }, + { + "epoch": 0.4307988450433109, + "grad_norm": 1.6100077629089355, + "learning_rate": 0.00017799830550817124, + "loss": 1.3779, + "step": 11190 + }, + { + "epoch": 0.43099133782483157, + "grad_norm": 2.1193013191223145, + "learning_rate": 0.00017797937797771503, + "loss": 1.0515, + "step": 11195 + }, + { + "epoch": 0.43118383060635224, + "grad_norm": 1.6185005903244019, + "learning_rate": 0.0001779604433165613, + "loss": 1.078, + "step": 11200 + }, + { + "epoch": 0.43137632338787296, + "grad_norm": 1.275046467781067, + "learning_rate": 0.00017794150152644148, + "loss": 2.2652, + "step": 11205 + }, + { + "epoch": 0.43156881616939363, + "grad_norm": 1.4507300853729248, + "learning_rate": 0.00017792255260908765, + "loss": 1.3556, + "step": 11210 + }, + { + "epoch": 0.43176130895091436, + "grad_norm": 1.5722453594207764, + "learning_rate": 0.00017790359656623256, + "loss": 1.1115, + "step": 11215 + }, + { + "epoch": 0.431953801732435, + "grad_norm": 1.802585244178772, + "learning_rate": 0.00017788463339960962, + "loss": 1.1885, + "step": 11220 + }, + { + "epoch": 0.43214629451395575, + "grad_norm": 1.0945521593093872, + "learning_rate": 0.00017786566311095295, + "loss": 1.2419, + "step": 11225 + }, + { + "epoch": 0.4323387872954764, + "grad_norm": 1.6798467636108398, + "learning_rate": 0.00017784668570199714, + "loss": 1.0404, + "step": 11230 + }, + { + "epoch": 0.4325312800769971, + "grad_norm": 1.9263988733291626, + "learning_rate": 0.00017782770117447764, + "loss": 1.2925, + "step": 11235 + }, + { + "epoch": 0.4327237728585178, + "grad_norm": 1.3327709436416626, + "learning_rate": 0.0001778087095301304, + "loss": 1.2621, + "step": 11240 + }, + { + "epoch": 0.4329162656400385, + "grad_norm": 1.540216088294983, + "learning_rate": 0.00017778971077069214, + "loss": 1.2733, + "step": 11245 + }, + { + "epoch": 0.4331087584215592, + "grad_norm": 0.8980332612991333, + "learning_rate": 0.00017777070489790014, + "loss": 1.1849, + "step": 11250 + }, + { + "epoch": 0.4333012512030799, + "grad_norm": 1.1286743879318237, + "learning_rate": 0.00017775169191349238, + "loss": 1.0491, + "step": 11255 + }, + { + "epoch": 0.43349374398460055, + "grad_norm": 1.5880367755889893, + "learning_rate": 0.0001777326718192074, + "loss": 1.1371, + "step": 11260 + }, + { + "epoch": 0.4336862367661213, + "grad_norm": 1.8634532690048218, + "learning_rate": 0.00017771364461678454, + "loss": 1.3491, + "step": 11265 + }, + { + "epoch": 0.43387872954764195, + "grad_norm": 1.13876473903656, + "learning_rate": 0.0001776946103079637, + "loss": 1.1284, + "step": 11270 + }, + { + "epoch": 0.4340712223291627, + "grad_norm": 1.1511520147323608, + "learning_rate": 0.0001776755688944854, + "loss": 0.9705, + "step": 11275 + }, + { + "epoch": 0.43426371511068335, + "grad_norm": 2.0832314491271973, + "learning_rate": 0.00017765652037809087, + "loss": 1.1134, + "step": 11280 + }, + { + "epoch": 0.434456207892204, + "grad_norm": 1.3219777345657349, + "learning_rate": 0.000177637464760522, + "loss": 1.1519, + "step": 11285 + }, + { + "epoch": 0.43464870067372474, + "grad_norm": 1.2205532789230347, + "learning_rate": 0.0001776184020435213, + "loss": 1.1526, + "step": 11290 + }, + { + "epoch": 0.4348411934552454, + "grad_norm": 1.1612414121627808, + "learning_rate": 0.00017759933222883187, + "loss": 1.2236, + "step": 11295 + }, + { + "epoch": 0.43503368623676614, + "grad_norm": 2.214245319366455, + "learning_rate": 0.00017758025531819756, + "loss": 1.1962, + "step": 11300 + }, + { + "epoch": 0.4352261790182868, + "grad_norm": 1.1582585573196411, + "learning_rate": 0.00017756117131336284, + "loss": 1.1488, + "step": 11305 + }, + { + "epoch": 0.43541867179980753, + "grad_norm": 1.6610682010650635, + "learning_rate": 0.0001775420802160728, + "loss": 1.2349, + "step": 11310 + }, + { + "epoch": 0.4356111645813282, + "grad_norm": 1.2163527011871338, + "learning_rate": 0.00017752298202807317, + "loss": 1.0914, + "step": 11315 + }, + { + "epoch": 0.4358036573628489, + "grad_norm": 1.3684804439544678, + "learning_rate": 0.00017750387675111043, + "loss": 1.1035, + "step": 11320 + }, + { + "epoch": 0.4359961501443696, + "grad_norm": 2.0042598247528076, + "learning_rate": 0.00017748476438693151, + "loss": 1.1783, + "step": 11325 + }, + { + "epoch": 0.43618864292589027, + "grad_norm": 1.4552195072174072, + "learning_rate": 0.00017746564493728424, + "loss": 1.1373, + "step": 11330 + }, + { + "epoch": 0.436381135707411, + "grad_norm": 1.1513317823410034, + "learning_rate": 0.00017744651840391685, + "loss": 1.122, + "step": 11335 + }, + { + "epoch": 0.43657362848893166, + "grad_norm": 1.1842467784881592, + "learning_rate": 0.0001774273847885784, + "loss": 1.085, + "step": 11340 + }, + { + "epoch": 0.43676612127045233, + "grad_norm": 1.5492455959320068, + "learning_rate": 0.00017740824409301852, + "loss": 1.1355, + "step": 11345 + }, + { + "epoch": 0.43695861405197306, + "grad_norm": 1.6276592016220093, + "learning_rate": 0.00017738909631898753, + "loss": 1.3922, + "step": 11350 + }, + { + "epoch": 0.43715110683349373, + "grad_norm": 1.5947320461273193, + "learning_rate": 0.0001773699414682363, + "loss": 1.1952, + "step": 11355 + }, + { + "epoch": 0.43734359961501446, + "grad_norm": 1.0628368854522705, + "learning_rate": 0.00017735077954251648, + "loss": 1.3908, + "step": 11360 + }, + { + "epoch": 0.4375360923965351, + "grad_norm": 1.6347852945327759, + "learning_rate": 0.00017733161054358027, + "loss": 1.3614, + "step": 11365 + }, + { + "epoch": 0.4377285851780558, + "grad_norm": 0.98406583070755, + "learning_rate": 0.00017731243447318055, + "loss": 1.0818, + "step": 11370 + }, + { + "epoch": 0.4379210779595765, + "grad_norm": 2.522155284881592, + "learning_rate": 0.0001772932513330708, + "loss": 1.1043, + "step": 11375 + }, + { + "epoch": 0.4381135707410972, + "grad_norm": 1.3053642511367798, + "learning_rate": 0.0001772740611250053, + "loss": 1.1731, + "step": 11380 + }, + { + "epoch": 0.4383060635226179, + "grad_norm": 1.540334701538086, + "learning_rate": 0.0001772548638507388, + "loss": 1.0897, + "step": 11385 + }, + { + "epoch": 0.4384985563041386, + "grad_norm": 1.756795048713684, + "learning_rate": 0.00017723565951202673, + "loss": 1.26, + "step": 11390 + }, + { + "epoch": 0.4386910490856593, + "grad_norm": 2.263253688812256, + "learning_rate": 0.00017721644811062524, + "loss": 1.2498, + "step": 11395 + }, + { + "epoch": 0.43888354186718, + "grad_norm": 1.2686541080474854, + "learning_rate": 0.0001771972296482911, + "loss": 1.2002, + "step": 11400 + }, + { + "epoch": 0.43907603464870065, + "grad_norm": 1.7692358493804932, + "learning_rate": 0.00017717800412678168, + "loss": 1.2989, + "step": 11405 + }, + { + "epoch": 0.4392685274302214, + "grad_norm": 0.9414786100387573, + "learning_rate": 0.00017715877154785505, + "loss": 1.0743, + "step": 11410 + }, + { + "epoch": 0.43946102021174205, + "grad_norm": 1.6488560438156128, + "learning_rate": 0.0001771395319132699, + "loss": 1.3217, + "step": 11415 + }, + { + "epoch": 0.4396535129932628, + "grad_norm": 0.9546147584915161, + "learning_rate": 0.00017712028522478556, + "loss": 1.1849, + "step": 11420 + }, + { + "epoch": 0.43984600577478344, + "grad_norm": 1.9460307359695435, + "learning_rate": 0.000177101031484162, + "loss": 1.3702, + "step": 11425 + }, + { + "epoch": 0.4400384985563041, + "grad_norm": 0.8990427255630493, + "learning_rate": 0.00017708177069315987, + "loss": 1.2009, + "step": 11430 + }, + { + "epoch": 0.44023099133782484, + "grad_norm": 1.3581219911575317, + "learning_rate": 0.0001770625028535404, + "loss": 1.1846, + "step": 11435 + }, + { + "epoch": 0.4404234841193455, + "grad_norm": 1.259728193283081, + "learning_rate": 0.00017704322796706557, + "loss": 1.2683, + "step": 11440 + }, + { + "epoch": 0.44061597690086624, + "grad_norm": 1.1262446641921997, + "learning_rate": 0.00017702394603549788, + "loss": 1.0015, + "step": 11445 + }, + { + "epoch": 0.4408084696823869, + "grad_norm": 2.5833356380462646, + "learning_rate": 0.0001770046570606006, + "loss": 1.1348, + "step": 11450 + }, + { + "epoch": 0.44100096246390763, + "grad_norm": 0.9725410342216492, + "learning_rate": 0.00017698536104413749, + "loss": 1.2214, + "step": 11455 + }, + { + "epoch": 0.4411934552454283, + "grad_norm": 1.0890756845474243, + "learning_rate": 0.00017696605798787313, + "loss": 1.1178, + "step": 11460 + }, + { + "epoch": 0.44138594802694897, + "grad_norm": 1.3130367994308472, + "learning_rate": 0.0001769467478935726, + "loss": 1.1582, + "step": 11465 + }, + { + "epoch": 0.4415784408084697, + "grad_norm": 2.401630163192749, + "learning_rate": 0.00017692743076300172, + "loss": 1.3043, + "step": 11470 + }, + { + "epoch": 0.44177093358999037, + "grad_norm": 1.2928016185760498, + "learning_rate": 0.00017690810659792686, + "loss": 1.1354, + "step": 11475 + }, + { + "epoch": 0.4419634263715111, + "grad_norm": 1.6433988809585571, + "learning_rate": 0.00017688877540011517, + "loss": 1.2385, + "step": 11480 + }, + { + "epoch": 0.44215591915303176, + "grad_norm": 1.5450482368469238, + "learning_rate": 0.00017686943717133428, + "loss": 1.3096, + "step": 11485 + }, + { + "epoch": 0.44234841193455243, + "grad_norm": 2.0231974124908447, + "learning_rate": 0.00017685009191335257, + "loss": 1.2129, + "step": 11490 + }, + { + "epoch": 0.44254090471607316, + "grad_norm": 1.2831270694732666, + "learning_rate": 0.00017683073962793908, + "loss": 1.2213, + "step": 11495 + }, + { + "epoch": 0.44273339749759383, + "grad_norm": 1.035520076751709, + "learning_rate": 0.00017681138031686337, + "loss": 1.248, + "step": 11500 + }, + { + "epoch": 0.44292589027911455, + "grad_norm": 1.113934874534607, + "learning_rate": 0.00017679201398189577, + "loss": 1.0017, + "step": 11505 + }, + { + "epoch": 0.4431183830606352, + "grad_norm": 1.202412724494934, + "learning_rate": 0.0001767726406248072, + "loss": 1.0873, + "step": 11510 + }, + { + "epoch": 0.4433108758421559, + "grad_norm": 1.2946287393569946, + "learning_rate": 0.0001767532602473692, + "loss": 1.3873, + "step": 11515 + }, + { + "epoch": 0.4435033686236766, + "grad_norm": 1.2840358018875122, + "learning_rate": 0.00017673387285135398, + "loss": 1.2559, + "step": 11520 + }, + { + "epoch": 0.4436958614051973, + "grad_norm": 0.9422056078910828, + "learning_rate": 0.00017671447843853444, + "loss": 1.1179, + "step": 11525 + }, + { + "epoch": 0.443888354186718, + "grad_norm": 1.9112647771835327, + "learning_rate": 0.000176695077010684, + "loss": 1.1519, + "step": 11530 + }, + { + "epoch": 0.4440808469682387, + "grad_norm": 0.9463594555854797, + "learning_rate": 0.00017667566856957687, + "loss": 1.1175, + "step": 11535 + }, + { + "epoch": 0.4442733397497594, + "grad_norm": 2.1585206985473633, + "learning_rate": 0.00017665625311698776, + "loss": 1.1535, + "step": 11540 + }, + { + "epoch": 0.4444658325312801, + "grad_norm": 1.038095474243164, + "learning_rate": 0.0001766368306546921, + "loss": 1.1633, + "step": 11545 + }, + { + "epoch": 0.44465832531280075, + "grad_norm": 1.4679070711135864, + "learning_rate": 0.00017661740118446594, + "loss": 1.3792, + "step": 11550 + }, + { + "epoch": 0.4448508180943215, + "grad_norm": 1.3058511018753052, + "learning_rate": 0.00017659796470808597, + "loss": 1.2802, + "step": 11555 + }, + { + "epoch": 0.44504331087584215, + "grad_norm": 1.0330942869186401, + "learning_rate": 0.0001765785212273296, + "loss": 1.1621, + "step": 11560 + }, + { + "epoch": 0.44523580365736287, + "grad_norm": 1.6481776237487793, + "learning_rate": 0.0001765590707439747, + "loss": 1.1098, + "step": 11565 + }, + { + "epoch": 0.44542829643888354, + "grad_norm": 1.3850781917572021, + "learning_rate": 0.00017653961325979998, + "loss": 1.3687, + "step": 11570 + }, + { + "epoch": 0.4456207892204042, + "grad_norm": 1.6551322937011719, + "learning_rate": 0.0001765201487765846, + "loss": 1.3436, + "step": 11575 + }, + { + "epoch": 0.44581328200192494, + "grad_norm": 1.0752167701721191, + "learning_rate": 0.00017650067729610856, + "loss": 1.1667, + "step": 11580 + }, + { + "epoch": 0.4460057747834456, + "grad_norm": 1.4762775897979736, + "learning_rate": 0.00017648119882015232, + "loss": 1.0119, + "step": 11585 + }, + { + "epoch": 0.44619826756496633, + "grad_norm": 0.7833762764930725, + "learning_rate": 0.0001764617133504971, + "loss": 1.1047, + "step": 11590 + }, + { + "epoch": 0.446390760346487, + "grad_norm": 1.1666022539138794, + "learning_rate": 0.00017644222088892473, + "loss": 1.2339, + "step": 11595 + }, + { + "epoch": 0.4465832531280077, + "grad_norm": 1.7897813320159912, + "learning_rate": 0.0001764227214372176, + "loss": 1.205, + "step": 11600 + }, + { + "epoch": 0.4467757459095284, + "grad_norm": 1.2021222114562988, + "learning_rate": 0.00017640321499715888, + "loss": 1.2518, + "step": 11605 + }, + { + "epoch": 0.44696823869104907, + "grad_norm": 2.9843320846557617, + "learning_rate": 0.00017638370157053228, + "loss": 0.9705, + "step": 11610 + }, + { + "epoch": 0.4471607314725698, + "grad_norm": 1.2910903692245483, + "learning_rate": 0.00017636418115912213, + "loss": 1.4018, + "step": 11615 + }, + { + "epoch": 0.44735322425409046, + "grad_norm": 1.0188699960708618, + "learning_rate": 0.00017634855980214943, + "loss": 1.6206, + "step": 11620 + }, + { + "epoch": 0.4475457170356112, + "grad_norm": 1.0419138669967651, + "learning_rate": 0.00017632902682262764, + "loss": 1.2483, + "step": 11625 + }, + { + "epoch": 0.44773820981713186, + "grad_norm": 1.665586233139038, + "learning_rate": 0.000176309486863322, + "loss": 1.1838, + "step": 11630 + }, + { + "epoch": 0.44793070259865253, + "grad_norm": 2.3444008827209473, + "learning_rate": 0.00017628993992601925, + "loss": 1.291, + "step": 11635 + }, + { + "epoch": 0.44812319538017326, + "grad_norm": 1.960339069366455, + "learning_rate": 0.00017627038601250686, + "loss": 1.2312, + "step": 11640 + }, + { + "epoch": 0.4483156881616939, + "grad_norm": 1.5672719478607178, + "learning_rate": 0.00017625082512457297, + "loss": 1.2281, + "step": 11645 + }, + { + "epoch": 0.44850818094321465, + "grad_norm": 1.5053352117538452, + "learning_rate": 0.00017623125726400621, + "loss": 1.1688, + "step": 11650 + }, + { + "epoch": 0.4487006737247353, + "grad_norm": 1.841610312461853, + "learning_rate": 0.00017621168243259596, + "loss": 1.1607, + "step": 11655 + }, + { + "epoch": 0.448893166506256, + "grad_norm": 1.1526665687561035, + "learning_rate": 0.0001761921006321322, + "loss": 1.1788, + "step": 11660 + }, + { + "epoch": 0.4490856592877767, + "grad_norm": 1.4064139127731323, + "learning_rate": 0.00017617251186440556, + "loss": 1.0825, + "step": 11665 + }, + { + "epoch": 0.4492781520692974, + "grad_norm": 1.1119096279144287, + "learning_rate": 0.00017615291613120736, + "loss": 1.2768, + "step": 11670 + }, + { + "epoch": 0.4494706448508181, + "grad_norm": 1.2367806434631348, + "learning_rate": 0.00017613331343432938, + "loss": 1.2612, + "step": 11675 + }, + { + "epoch": 0.4496631376323388, + "grad_norm": 1.093410611152649, + "learning_rate": 0.00017611370377556423, + "loss": 1.3075, + "step": 11680 + }, + { + "epoch": 0.44985563041385945, + "grad_norm": 1.0085220336914062, + "learning_rate": 0.00017609408715670512, + "loss": 1.2391, + "step": 11685 + }, + { + "epoch": 0.4500481231953802, + "grad_norm": 1.4346550703048706, + "learning_rate": 0.0001760744635795458, + "loss": 1.2241, + "step": 11690 + }, + { + "epoch": 0.45024061597690085, + "grad_norm": 1.483905553817749, + "learning_rate": 0.0001760548330458807, + "loss": 1.2696, + "step": 11695 + }, + { + "epoch": 0.4504331087584216, + "grad_norm": 1.6455215215682983, + "learning_rate": 0.00017603519555750498, + "loss": 1.2113, + "step": 11700 + }, + { + "epoch": 0.45062560153994224, + "grad_norm": 1.7613027095794678, + "learning_rate": 0.00017601555111621428, + "loss": 1.1581, + "step": 11705 + }, + { + "epoch": 0.45081809432146297, + "grad_norm": 1.5872759819030762, + "learning_rate": 0.000175995899723805, + "loss": 1.0977, + "step": 11710 + }, + { + "epoch": 0.45101058710298364, + "grad_norm": 1.5521520376205444, + "learning_rate": 0.00017597624138207413, + "loss": 1.3003, + "step": 11715 + }, + { + "epoch": 0.4512030798845043, + "grad_norm": 2.1746668815612793, + "learning_rate": 0.0001759565760928193, + "loss": 1.1861, + "step": 11720 + }, + { + "epoch": 0.45139557266602504, + "grad_norm": 1.73439359664917, + "learning_rate": 0.00017593690385783866, + "loss": 1.242, + "step": 11725 + }, + { + "epoch": 0.4515880654475457, + "grad_norm": 1.6027134656906128, + "learning_rate": 0.0001759172246789313, + "loss": 1.2936, + "step": 11730 + }, + { + "epoch": 0.45178055822906643, + "grad_norm": 1.62489652633667, + "learning_rate": 0.0001758975385578966, + "loss": 1.3521, + "step": 11735 + }, + { + "epoch": 0.4519730510105871, + "grad_norm": 1.3407773971557617, + "learning_rate": 0.00017587784549653477, + "loss": 1.1653, + "step": 11740 + }, + { + "epoch": 0.45216554379210777, + "grad_norm": 2.064875364303589, + "learning_rate": 0.00017585814549664664, + "loss": 1.2321, + "step": 11745 + }, + { + "epoch": 0.4523580365736285, + "grad_norm": 1.115850806236267, + "learning_rate": 0.0001758384385600336, + "loss": 1.0289, + "step": 11750 + }, + { + "epoch": 0.45255052935514917, + "grad_norm": 1.3943949937820435, + "learning_rate": 0.00017581872468849777, + "loss": 1.2846, + "step": 11755 + }, + { + "epoch": 0.4527430221366699, + "grad_norm": 1.0405654907226562, + "learning_rate": 0.0001757990038838418, + "loss": 1.1209, + "step": 11760 + }, + { + "epoch": 0.45293551491819056, + "grad_norm": 1.0115854740142822, + "learning_rate": 0.00017577927614786902, + "loss": 1.0178, + "step": 11765 + }, + { + "epoch": 0.4531280076997113, + "grad_norm": 2.48100209236145, + "learning_rate": 0.00017575954148238345, + "loss": 1.2485, + "step": 11770 + }, + { + "epoch": 0.45332050048123196, + "grad_norm": 1.5187568664550781, + "learning_rate": 0.00017573979988918967, + "loss": 1.3345, + "step": 11775 + }, + { + "epoch": 0.45351299326275263, + "grad_norm": 1.2286217212677002, + "learning_rate": 0.00017572005137009292, + "loss": 1.1079, + "step": 11780 + }, + { + "epoch": 0.45370548604427335, + "grad_norm": 1.5858092308044434, + "learning_rate": 0.00017570029592689908, + "loss": 1.4054, + "step": 11785 + }, + { + "epoch": 0.453897978825794, + "grad_norm": 2.0436697006225586, + "learning_rate": 0.00017568053356141464, + "loss": 1.3221, + "step": 11790 + }, + { + "epoch": 0.45409047160731475, + "grad_norm": 1.6980565786361694, + "learning_rate": 0.00017566076427544673, + "loss": 1.2384, + "step": 11795 + }, + { + "epoch": 0.4542829643888354, + "grad_norm": 1.3811545372009277, + "learning_rate": 0.00017564098807080315, + "loss": 1.171, + "step": 11800 + }, + { + "epoch": 0.4544754571703561, + "grad_norm": 1.2215286493301392, + "learning_rate": 0.00017562120494929228, + "loss": 1.1781, + "step": 11805 + }, + { + "epoch": 0.4546679499518768, + "grad_norm": 1.1313782930374146, + "learning_rate": 0.00017560141491272319, + "loss": 1.2166, + "step": 11810 + }, + { + "epoch": 0.4548604427333975, + "grad_norm": 1.2630988359451294, + "learning_rate": 0.0001755816179629055, + "loss": 1.2652, + "step": 11815 + }, + { + "epoch": 0.4550529355149182, + "grad_norm": 1.0977842807769775, + "learning_rate": 0.0001755618141016495, + "loss": 1.3057, + "step": 11820 + }, + { + "epoch": 0.4552454282964389, + "grad_norm": 0.8517459034919739, + "learning_rate": 0.0001755420033307662, + "loss": 1.1769, + "step": 11825 + }, + { + "epoch": 0.45543792107795955, + "grad_norm": 0.7195164561271667, + "learning_rate": 0.00017552218565206707, + "loss": 0.9777, + "step": 11830 + }, + { + "epoch": 0.4556304138594803, + "grad_norm": 1.125056266784668, + "learning_rate": 0.00017550236106736436, + "loss": 1.1008, + "step": 11835 + }, + { + "epoch": 0.45582290664100095, + "grad_norm": 0.8211593627929688, + "learning_rate": 0.00017548252957847092, + "loss": 1.1539, + "step": 11840 + }, + { + "epoch": 0.4560153994225217, + "grad_norm": 1.8936784267425537, + "learning_rate": 0.00017546269118720015, + "loss": 1.143, + "step": 11845 + }, + { + "epoch": 0.45620789220404234, + "grad_norm": 1.5479308366775513, + "learning_rate": 0.00017544284589536617, + "loss": 1.1481, + "step": 11850 + }, + { + "epoch": 0.45640038498556307, + "grad_norm": 1.4597593545913696, + "learning_rate": 0.00017542299370478372, + "loss": 1.2907, + "step": 11855 + }, + { + "epoch": 0.45659287776708374, + "grad_norm": 1.4036239385604858, + "learning_rate": 0.0001754031346172681, + "loss": 1.2927, + "step": 11860 + }, + { + "epoch": 0.4567853705486044, + "grad_norm": 0.9842814207077026, + "learning_rate": 0.00017538326863463533, + "loss": 1.0571, + "step": 11865 + }, + { + "epoch": 0.45697786333012513, + "grad_norm": 2.478254556655884, + "learning_rate": 0.000175363395758702, + "loss": 1.3115, + "step": 11870 + }, + { + "epoch": 0.4571703561116458, + "grad_norm": 1.000182032585144, + "learning_rate": 0.00017534351599128538, + "loss": 1.3071, + "step": 11875 + }, + { + "epoch": 0.45736284889316653, + "grad_norm": 1.8669004440307617, + "learning_rate": 0.0001753236293342033, + "loss": 1.2386, + "step": 11880 + }, + { + "epoch": 0.4575553416746872, + "grad_norm": 1.6287200450897217, + "learning_rate": 0.00017530373578927432, + "loss": 1.2196, + "step": 11885 + }, + { + "epoch": 0.45774783445620787, + "grad_norm": 2.1733322143554688, + "learning_rate": 0.00017528383535831755, + "loss": 1.6165, + "step": 11890 + }, + { + "epoch": 0.4579403272377286, + "grad_norm": 1.0370094776153564, + "learning_rate": 0.00017526392804315273, + "loss": 1.1799, + "step": 11895 + }, + { + "epoch": 0.45813282001924927, + "grad_norm": 1.3969937562942505, + "learning_rate": 0.00017524401384560025, + "loss": 1.2224, + "step": 11900 + }, + { + "epoch": 0.45832531280077, + "grad_norm": 1.3850924968719482, + "learning_rate": 0.00017522409276748117, + "loss": 1.4161, + "step": 11905 + }, + { + "epoch": 0.45851780558229066, + "grad_norm": 1.4318947792053223, + "learning_rate": 0.00017520416481061712, + "loss": 1.4166, + "step": 11910 + }, + { + "epoch": 0.45871029836381133, + "grad_norm": 1.525709629058838, + "learning_rate": 0.00017518422997683038, + "loss": 1.255, + "step": 11915 + }, + { + "epoch": 0.45890279114533206, + "grad_norm": 0.9193233847618103, + "learning_rate": 0.00017516428826794384, + "loss": 1.2299, + "step": 11920 + }, + { + "epoch": 0.4590952839268527, + "grad_norm": 1.8636525869369507, + "learning_rate": 0.00017514433968578107, + "loss": 1.1992, + "step": 11925 + }, + { + "epoch": 0.45928777670837345, + "grad_norm": 1.3876943588256836, + "learning_rate": 0.00017512438423216624, + "loss": 1.2022, + "step": 11930 + }, + { + "epoch": 0.4594802694898941, + "grad_norm": 1.5370129346847534, + "learning_rate": 0.00017510442190892412, + "loss": 1.319, + "step": 11935 + }, + { + "epoch": 0.45967276227141485, + "grad_norm": 1.8562203645706177, + "learning_rate": 0.00017508445271788013, + "loss": 1.0784, + "step": 11940 + }, + { + "epoch": 0.4598652550529355, + "grad_norm": 1.1265978813171387, + "learning_rate": 0.0001750644766608603, + "loss": 1.1591, + "step": 11945 + }, + { + "epoch": 0.4600577478344562, + "grad_norm": 1.3049321174621582, + "learning_rate": 0.00017504449373969137, + "loss": 1.2567, + "step": 11950 + }, + { + "epoch": 0.4602502406159769, + "grad_norm": 1.4252487421035767, + "learning_rate": 0.0001750245039562006, + "loss": 1.0848, + "step": 11955 + }, + { + "epoch": 0.4604427333974976, + "grad_norm": 1.888185977935791, + "learning_rate": 0.00017500450731221592, + "loss": 1.1976, + "step": 11960 + }, + { + "epoch": 0.4606352261790183, + "grad_norm": 1.4043982028961182, + "learning_rate": 0.00017498450380956594, + "loss": 1.2038, + "step": 11965 + }, + { + "epoch": 0.460827718960539, + "grad_norm": 1.182576060295105, + "learning_rate": 0.00017496449345007982, + "loss": 1.3408, + "step": 11970 + }, + { + "epoch": 0.46102021174205965, + "grad_norm": 2.084197521209717, + "learning_rate": 0.00017494447623558733, + "loss": 1.269, + "step": 11975 + }, + { + "epoch": 0.4612127045235804, + "grad_norm": 1.709518551826477, + "learning_rate": 0.00017492445216791896, + "loss": 1.3736, + "step": 11980 + }, + { + "epoch": 0.46140519730510104, + "grad_norm": 1.1446977853775024, + "learning_rate": 0.00017490442124890577, + "loss": 1.2449, + "step": 11985 + }, + { + "epoch": 0.46159769008662177, + "grad_norm": 1.9139240980148315, + "learning_rate": 0.00017488438348037946, + "loss": 1.0845, + "step": 11990 + }, + { + "epoch": 0.46179018286814244, + "grad_norm": 1.6536133289337158, + "learning_rate": 0.00017486433886417234, + "loss": 1.4398, + "step": 11995 + }, + { + "epoch": 0.4619826756496631, + "grad_norm": 1.0629438161849976, + "learning_rate": 0.00017484428740211736, + "loss": 1.2631, + "step": 12000 + }, + { + "epoch": 0.46217516843118384, + "grad_norm": 1.1966623067855835, + "learning_rate": 0.00017482422909604809, + "loss": 1.088, + "step": 12005 + }, + { + "epoch": 0.4623676612127045, + "grad_norm": 1.1087130308151245, + "learning_rate": 0.00017480416394779878, + "loss": 1.2133, + "step": 12010 + }, + { + "epoch": 0.46256015399422523, + "grad_norm": 1.9408375024795532, + "learning_rate": 0.00017478409195920413, + "loss": 1.1677, + "step": 12015 + }, + { + "epoch": 0.4627526467757459, + "grad_norm": 1.2703943252563477, + "learning_rate": 0.00017476401313209973, + "loss": 1.232, + "step": 12020 + }, + { + "epoch": 0.4629451395572666, + "grad_norm": 1.7841099500656128, + "learning_rate": 0.0001747439274683216, + "loss": 1.1688, + "step": 12025 + }, + { + "epoch": 0.4631376323387873, + "grad_norm": 1.9395395517349243, + "learning_rate": 0.0001747238349697064, + "loss": 1.2336, + "step": 12030 + }, + { + "epoch": 0.46333012512030797, + "grad_norm": 1.5011239051818848, + "learning_rate": 0.0001747037356380915, + "loss": 1.1849, + "step": 12035 + }, + { + "epoch": 0.4635226179018287, + "grad_norm": 1.6130584478378296, + "learning_rate": 0.00017468362947531486, + "loss": 1.3113, + "step": 12040 + }, + { + "epoch": 0.46371511068334936, + "grad_norm": 1.5666422843933105, + "learning_rate": 0.000174663516483215, + "loss": 1.2178, + "step": 12045 + }, + { + "epoch": 0.4639076034648701, + "grad_norm": 1.490662932395935, + "learning_rate": 0.0001746433966636312, + "loss": 1.3034, + "step": 12050 + }, + { + "epoch": 0.46410009624639076, + "grad_norm": 1.1972042322158813, + "learning_rate": 0.00017462327001840322, + "loss": 1.1732, + "step": 12055 + }, + { + "epoch": 0.46429258902791143, + "grad_norm": 1.5201470851898193, + "learning_rate": 0.00017460313654937154, + "loss": 1.1545, + "step": 12060 + }, + { + "epoch": 0.46448508180943215, + "grad_norm": 0.8927121758460999, + "learning_rate": 0.00017458299625837723, + "loss": 1.1516, + "step": 12065 + }, + { + "epoch": 0.4646775745909528, + "grad_norm": 1.394187092781067, + "learning_rate": 0.00017456284914726196, + "loss": 1.2791, + "step": 12070 + }, + { + "epoch": 0.46487006737247355, + "grad_norm": 1.8900322914123535, + "learning_rate": 0.00017454269521786808, + "loss": 1.244, + "step": 12075 + }, + { + "epoch": 0.4650625601539942, + "grad_norm": 2.20624041557312, + "learning_rate": 0.00017452253447203852, + "loss": 1.2526, + "step": 12080 + }, + { + "epoch": 0.46525505293551495, + "grad_norm": 1.404261827468872, + "learning_rate": 0.00017450236691161686, + "loss": 1.1711, + "step": 12085 + }, + { + "epoch": 0.4654475457170356, + "grad_norm": 1.6828880310058594, + "learning_rate": 0.00017448219253844726, + "loss": 1.3007, + "step": 12090 + }, + { + "epoch": 0.4656400384985563, + "grad_norm": 1.0239325761795044, + "learning_rate": 0.00017446201135437456, + "loss": 1.1359, + "step": 12095 + }, + { + "epoch": 0.465832531280077, + "grad_norm": 0.9242125749588013, + "learning_rate": 0.0001744418233612442, + "loss": 1.1848, + "step": 12100 + }, + { + "epoch": 0.4660250240615977, + "grad_norm": 2.9907031059265137, + "learning_rate": 0.0001744216285609022, + "loss": 1.155, + "step": 12105 + }, + { + "epoch": 0.4662175168431184, + "grad_norm": 0.9708018898963928, + "learning_rate": 0.0001744014269551953, + "loss": 1.4752, + "step": 12110 + }, + { + "epoch": 0.4664100096246391, + "grad_norm": 1.1917387247085571, + "learning_rate": 0.00017438121854597075, + "loss": 1.1197, + "step": 12115 + }, + { + "epoch": 0.46660250240615975, + "grad_norm": 1.5464357137680054, + "learning_rate": 0.00017436100333507648, + "loss": 1.1908, + "step": 12120 + }, + { + "epoch": 0.4667949951876805, + "grad_norm": 1.9502155780792236, + "learning_rate": 0.00017434078132436107, + "loss": 1.1888, + "step": 12125 + }, + { + "epoch": 0.46698748796920114, + "grad_norm": 2.054029941558838, + "learning_rate": 0.00017432055251567365, + "loss": 1.2771, + "step": 12130 + }, + { + "epoch": 0.46717998075072187, + "grad_norm": 1.276356816291809, + "learning_rate": 0.00017430031691086407, + "loss": 1.2392, + "step": 12135 + }, + { + "epoch": 0.46737247353224254, + "grad_norm": 1.4474079608917236, + "learning_rate": 0.00017428007451178267, + "loss": 1.2111, + "step": 12140 + }, + { + "epoch": 0.4675649663137632, + "grad_norm": 1.389797568321228, + "learning_rate": 0.00017425982532028053, + "loss": 1.2094, + "step": 12145 + }, + { + "epoch": 0.46775745909528393, + "grad_norm": 1.2491530179977417, + "learning_rate": 0.00017423956933820928, + "loss": 1.2374, + "step": 12150 + }, + { + "epoch": 0.4679499518768046, + "grad_norm": 1.0517950057983398, + "learning_rate": 0.00017421930656742122, + "loss": 1.1003, + "step": 12155 + }, + { + "epoch": 0.46814244465832533, + "grad_norm": 1.410630226135254, + "learning_rate": 0.00017419903700976924, + "loss": 1.2722, + "step": 12160 + }, + { + "epoch": 0.468334937439846, + "grad_norm": 1.5544359683990479, + "learning_rate": 0.00017417876066710682, + "loss": 1.1961, + "step": 12165 + }, + { + "epoch": 0.4685274302213667, + "grad_norm": 1.3200881481170654, + "learning_rate": 0.00017415847754128817, + "loss": 1.0058, + "step": 12170 + }, + { + "epoch": 0.4687199230028874, + "grad_norm": 1.998949646949768, + "learning_rate": 0.00017413818763416795, + "loss": 1.1513, + "step": 12175 + }, + { + "epoch": 0.46891241578440807, + "grad_norm": 1.4105117321014404, + "learning_rate": 0.0001741178909476016, + "loss": 1.2993, + "step": 12180 + }, + { + "epoch": 0.4691049085659288, + "grad_norm": 1.4521151781082153, + "learning_rate": 0.00017409758748344515, + "loss": 1.1659, + "step": 12185 + }, + { + "epoch": 0.46929740134744946, + "grad_norm": 1.3822886943817139, + "learning_rate": 0.00017407727724355515, + "loss": 1.3419, + "step": 12190 + }, + { + "epoch": 0.4694898941289702, + "grad_norm": 1.5602283477783203, + "learning_rate": 0.00017405696022978885, + "loss": 1.1506, + "step": 12195 + }, + { + "epoch": 0.46968238691049086, + "grad_norm": 1.2674669027328491, + "learning_rate": 0.00017403663644400413, + "loss": 1.2992, + "step": 12200 + }, + { + "epoch": 0.4698748796920115, + "grad_norm": 1.6091759204864502, + "learning_rate": 0.00017401630588805947, + "loss": 1.1105, + "step": 12205 + }, + { + "epoch": 0.47006737247353225, + "grad_norm": 1.591635823249817, + "learning_rate": 0.00017399596856381395, + "loss": 1.2884, + "step": 12210 + }, + { + "epoch": 0.4702598652550529, + "grad_norm": 1.5781102180480957, + "learning_rate": 0.00017397562447312725, + "loss": 1.1476, + "step": 12215 + }, + { + "epoch": 0.47045235803657365, + "grad_norm": 1.4029310941696167, + "learning_rate": 0.00017395527361785976, + "loss": 1.4271, + "step": 12220 + }, + { + "epoch": 0.4706448508180943, + "grad_norm": 1.8287990093231201, + "learning_rate": 0.0001739349159998724, + "loss": 1.2079, + "step": 12225 + }, + { + "epoch": 0.470837343599615, + "grad_norm": 0.9693268537521362, + "learning_rate": 0.00017391455162102677, + "loss": 1.2341, + "step": 12230 + }, + { + "epoch": 0.4710298363811357, + "grad_norm": 1.4181095361709595, + "learning_rate": 0.00017389418048318502, + "loss": 1.2796, + "step": 12235 + }, + { + "epoch": 0.4712223291626564, + "grad_norm": 1.9247058629989624, + "learning_rate": 0.00017387380258820993, + "loss": 1.1858, + "step": 12240 + }, + { + "epoch": 0.4714148219441771, + "grad_norm": 1.0236104726791382, + "learning_rate": 0.00017385341793796502, + "loss": 1.1713, + "step": 12245 + }, + { + "epoch": 0.4716073147256978, + "grad_norm": 1.0250846147537231, + "learning_rate": 0.00017383302653431427, + "loss": 1.3036, + "step": 12250 + }, + { + "epoch": 0.4717998075072185, + "grad_norm": 1.1760774850845337, + "learning_rate": 0.00017381262837912228, + "loss": 1.1779, + "step": 12255 + }, + { + "epoch": 0.4719923002887392, + "grad_norm": 1.6482713222503662, + "learning_rate": 0.00017379222347425446, + "loss": 1.151, + "step": 12260 + }, + { + "epoch": 0.47218479307025985, + "grad_norm": 1.3430352210998535, + "learning_rate": 0.00017377181182157657, + "loss": 1.3512, + "step": 12265 + }, + { + "epoch": 0.47237728585178057, + "grad_norm": 2.4042775630950928, + "learning_rate": 0.00017375139342295522, + "loss": 1.3002, + "step": 12270 + }, + { + "epoch": 0.47256977863330124, + "grad_norm": 0.967472493648529, + "learning_rate": 0.00017373096828025752, + "loss": 1.0813, + "step": 12275 + }, + { + "epoch": 0.47276227141482197, + "grad_norm": 1.9774664640426636, + "learning_rate": 0.00017371053639535117, + "loss": 1.2232, + "step": 12280 + }, + { + "epoch": 0.47295476419634264, + "grad_norm": 1.2525962591171265, + "learning_rate": 0.00017369009777010454, + "loss": 1.3974, + "step": 12285 + }, + { + "epoch": 0.4731472569778633, + "grad_norm": 2.268892765045166, + "learning_rate": 0.00017366965240638664, + "loss": 1.2812, + "step": 12290 + }, + { + "epoch": 0.47333974975938403, + "grad_norm": 1.143028974533081, + "learning_rate": 0.000173649200306067, + "loss": 1.2017, + "step": 12295 + }, + { + "epoch": 0.4735322425409047, + "grad_norm": 0.9833802580833435, + "learning_rate": 0.00017362874147101596, + "loss": 1.1669, + "step": 12300 + }, + { + "epoch": 0.4737247353224254, + "grad_norm": 1.5986253023147583, + "learning_rate": 0.0001736082759031042, + "loss": 1.2275, + "step": 12305 + }, + { + "epoch": 0.4739172281039461, + "grad_norm": 1.8394620418548584, + "learning_rate": 0.0001735878036042032, + "loss": 1.1077, + "step": 12310 + }, + { + "epoch": 0.47410972088546677, + "grad_norm": 2.2321078777313232, + "learning_rate": 0.00017356732457618506, + "loss": 1.1385, + "step": 12315 + }, + { + "epoch": 0.4743022136669875, + "grad_norm": 1.2479119300842285, + "learning_rate": 0.00017354683882092245, + "loss": 1.2189, + "step": 12320 + }, + { + "epoch": 0.47449470644850816, + "grad_norm": 1.6812646389007568, + "learning_rate": 0.0001735263463402886, + "loss": 1.3836, + "step": 12325 + }, + { + "epoch": 0.4746871992300289, + "grad_norm": 1.4916552305221558, + "learning_rate": 0.00017350584713615746, + "loss": 1.1306, + "step": 12330 + }, + { + "epoch": 0.47487969201154956, + "grad_norm": 1.7067712545394897, + "learning_rate": 0.00017348534121040354, + "loss": 1.3352, + "step": 12335 + }, + { + "epoch": 0.4750721847930703, + "grad_norm": 1.1849184036254883, + "learning_rate": 0.00017346482856490196, + "loss": 0.9746, + "step": 12340 + }, + { + "epoch": 0.47526467757459095, + "grad_norm": 1.700038194656372, + "learning_rate": 0.00017344430920152845, + "loss": 1.3462, + "step": 12345 + }, + { + "epoch": 0.4754571703561116, + "grad_norm": 1.4579262733459473, + "learning_rate": 0.0001734237831221594, + "loss": 1.2296, + "step": 12350 + }, + { + "epoch": 0.47564966313763235, + "grad_norm": 1.230469822883606, + "learning_rate": 0.00017340325032867178, + "loss": 1.1615, + "step": 12355 + }, + { + "epoch": 0.475842155919153, + "grad_norm": 1.4839364290237427, + "learning_rate": 0.00017338271082294315, + "loss": 1.2143, + "step": 12360 + }, + { + "epoch": 0.47603464870067375, + "grad_norm": 0.8386423587799072, + "learning_rate": 0.00017336216460685173, + "loss": 1.1173, + "step": 12365 + }, + { + "epoch": 0.4762271414821944, + "grad_norm": 1.9203957319259644, + "learning_rate": 0.00017334161168227634, + "loss": 1.2371, + "step": 12370 + }, + { + "epoch": 0.4764196342637151, + "grad_norm": 1.752314567565918, + "learning_rate": 0.00017332105205109641, + "loss": 1.1022, + "step": 12375 + }, + { + "epoch": 0.4766121270452358, + "grad_norm": 1.2998472452163696, + "learning_rate": 0.00017330048571519198, + "loss": 1.3008, + "step": 12380 + }, + { + "epoch": 0.4768046198267565, + "grad_norm": 1.8506637811660767, + "learning_rate": 0.0001732799126764437, + "loss": 1.0814, + "step": 12385 + }, + { + "epoch": 0.4769971126082772, + "grad_norm": 1.4652866125106812, + "learning_rate": 0.00017325933293673283, + "loss": 1.3528, + "step": 12390 + }, + { + "epoch": 0.4771896053897979, + "grad_norm": 1.0838465690612793, + "learning_rate": 0.00017323874649794127, + "loss": 1.1435, + "step": 12395 + }, + { + "epoch": 0.4773820981713186, + "grad_norm": 1.1437288522720337, + "learning_rate": 0.0001732181533619515, + "loss": 1.2403, + "step": 12400 + }, + { + "epoch": 0.4775745909528393, + "grad_norm": 1.5026469230651855, + "learning_rate": 0.00017319755353064665, + "loss": 1.3211, + "step": 12405 + }, + { + "epoch": 0.47776708373435994, + "grad_norm": 1.477759838104248, + "learning_rate": 0.00017317694700591041, + "loss": 1.062, + "step": 12410 + }, + { + "epoch": 0.47795957651588067, + "grad_norm": 1.6099724769592285, + "learning_rate": 0.00017315633378962712, + "loss": 1.322, + "step": 12415 + }, + { + "epoch": 0.47815206929740134, + "grad_norm": 1.2413129806518555, + "learning_rate": 0.00017313571388368173, + "loss": 1.3106, + "step": 12420 + }, + { + "epoch": 0.47834456207892206, + "grad_norm": 1.2218198776245117, + "learning_rate": 0.00017311508728995976, + "loss": 1.3899, + "step": 12425 + }, + { + "epoch": 0.47853705486044273, + "grad_norm": 1.337332844734192, + "learning_rate": 0.0001730944540103474, + "loss": 1.2143, + "step": 12430 + }, + { + "epoch": 0.4787295476419634, + "grad_norm": 1.132523775100708, + "learning_rate": 0.00017307381404673143, + "loss": 1.2243, + "step": 12435 + }, + { + "epoch": 0.47892204042348413, + "grad_norm": 1.481467366218567, + "learning_rate": 0.00017305316740099928, + "loss": 1.1563, + "step": 12440 + }, + { + "epoch": 0.4791145332050048, + "grad_norm": 1.3262776136398315, + "learning_rate": 0.00017303251407503885, + "loss": 1.1995, + "step": 12445 + }, + { + "epoch": 0.4793070259865255, + "grad_norm": 1.8554911613464355, + "learning_rate": 0.00017301185407073884, + "loss": 1.2679, + "step": 12450 + }, + { + "epoch": 0.4794995187680462, + "grad_norm": 1.5468156337738037, + "learning_rate": 0.0001729911873899884, + "loss": 1.1839, + "step": 12455 + }, + { + "epoch": 0.47969201154956687, + "grad_norm": 1.1333458423614502, + "learning_rate": 0.00017297051403467745, + "loss": 1.2927, + "step": 12460 + }, + { + "epoch": 0.4798845043310876, + "grad_norm": 1.5019558668136597, + "learning_rate": 0.00017294983400669632, + "loss": 1.1634, + "step": 12465 + }, + { + "epoch": 0.48007699711260826, + "grad_norm": 0.9122928977012634, + "learning_rate": 0.00017292914730793614, + "loss": 1.0842, + "step": 12470 + }, + { + "epoch": 0.480269489894129, + "grad_norm": 3.623866558074951, + "learning_rate": 0.00017290845394028853, + "loss": 1.2303, + "step": 12475 + }, + { + "epoch": 0.48046198267564966, + "grad_norm": 1.1163458824157715, + "learning_rate": 0.0001728877539056458, + "loss": 1.1877, + "step": 12480 + }, + { + "epoch": 0.4806544754571704, + "grad_norm": 1.1737778186798096, + "learning_rate": 0.00017286704720590083, + "loss": 1.2183, + "step": 12485 + }, + { + "epoch": 0.48084696823869105, + "grad_norm": 1.0381931066513062, + "learning_rate": 0.0001728463338429471, + "loss": 1.103, + "step": 12490 + }, + { + "epoch": 0.4810394610202117, + "grad_norm": 1.4400640726089478, + "learning_rate": 0.00017282561381867865, + "loss": 1.0941, + "step": 12495 + }, + { + "epoch": 0.48123195380173245, + "grad_norm": 1.774886131286621, + "learning_rate": 0.00017280488713499029, + "loss": 1.2161, + "step": 12500 + }, + { + "epoch": 0.4814244465832531, + "grad_norm": 1.680755376815796, + "learning_rate": 0.00017278415379377724, + "loss": 1.2248, + "step": 12505 + }, + { + "epoch": 0.48161693936477384, + "grad_norm": 1.0955753326416016, + "learning_rate": 0.00017276341379693553, + "loss": 1.1558, + "step": 12510 + }, + { + "epoch": 0.4818094321462945, + "grad_norm": 1.9817503690719604, + "learning_rate": 0.00017274266714636163, + "loss": 0.9682, + "step": 12515 + }, + { + "epoch": 0.4820019249278152, + "grad_norm": 1.2484976053237915, + "learning_rate": 0.00017272191384395266, + "loss": 1.1304, + "step": 12520 + }, + { + "epoch": 0.4821944177093359, + "grad_norm": 1.263295292854309, + "learning_rate": 0.00017270115389160645, + "loss": 1.1851, + "step": 12525 + }, + { + "epoch": 0.4823869104908566, + "grad_norm": 1.749971628189087, + "learning_rate": 0.00017268038729122126, + "loss": 1.2665, + "step": 12530 + }, + { + "epoch": 0.4825794032723773, + "grad_norm": 2.1695139408111572, + "learning_rate": 0.0001726596140446962, + "loss": 1.2351, + "step": 12535 + }, + { + "epoch": 0.482771896053898, + "grad_norm": 1.8199032545089722, + "learning_rate": 0.00017263883415393069, + "loss": 1.0922, + "step": 12540 + }, + { + "epoch": 0.48296438883541865, + "grad_norm": 1.5746350288391113, + "learning_rate": 0.00017261804762082501, + "loss": 1.2759, + "step": 12545 + }, + { + "epoch": 0.48315688161693937, + "grad_norm": 1.1286424398422241, + "learning_rate": 0.00017259725444727995, + "loss": 1.2286, + "step": 12550 + }, + { + "epoch": 0.48334937439846004, + "grad_norm": 1.1256860494613647, + "learning_rate": 0.00017257645463519686, + "loss": 1.1408, + "step": 12555 + }, + { + "epoch": 0.48354186717998077, + "grad_norm": 0.907913863658905, + "learning_rate": 0.00017255564818647776, + "loss": 1.1889, + "step": 12560 + }, + { + "epoch": 0.48373435996150144, + "grad_norm": 1.0480241775512695, + "learning_rate": 0.0001725348351030253, + "loss": 1.0954, + "step": 12565 + }, + { + "epoch": 0.48392685274302216, + "grad_norm": 1.4278559684753418, + "learning_rate": 0.0001725140153867426, + "loss": 1.1367, + "step": 12570 + }, + { + "epoch": 0.48411934552454283, + "grad_norm": 0.9501696825027466, + "learning_rate": 0.00017249318903953364, + "loss": 1.2135, + "step": 12575 + }, + { + "epoch": 0.4843118383060635, + "grad_norm": 1.8626338243484497, + "learning_rate": 0.00017247235606330271, + "loss": 1.2106, + "step": 12580 + }, + { + "epoch": 0.48450433108758423, + "grad_norm": 1.3876720666885376, + "learning_rate": 0.00017245151645995494, + "loss": 1.3711, + "step": 12585 + }, + { + "epoch": 0.4846968238691049, + "grad_norm": 1.6943193674087524, + "learning_rate": 0.0001724306702313959, + "loss": 1.117, + "step": 12590 + }, + { + "epoch": 0.4848893166506256, + "grad_norm": 0.9052426218986511, + "learning_rate": 0.00017240981737953192, + "loss": 1.2578, + "step": 12595 + }, + { + "epoch": 0.4850818094321463, + "grad_norm": 0.8325613141059875, + "learning_rate": 0.00017238895790626976, + "loss": 1.1599, + "step": 12600 + }, + { + "epoch": 0.48527430221366696, + "grad_norm": 1.2736178636550903, + "learning_rate": 0.00017236809181351697, + "loss": 1.266, + "step": 12605 + }, + { + "epoch": 0.4854667949951877, + "grad_norm": 1.8093243837356567, + "learning_rate": 0.00017234721910318158, + "loss": 1.2076, + "step": 12610 + }, + { + "epoch": 0.48565928777670836, + "grad_norm": 1.7740625143051147, + "learning_rate": 0.00017232633977717226, + "loss": 1.2431, + "step": 12615 + }, + { + "epoch": 0.4858517805582291, + "grad_norm": 0.83774334192276, + "learning_rate": 0.00017230545383739826, + "loss": 1.1987, + "step": 12620 + }, + { + "epoch": 0.48604427333974976, + "grad_norm": 0.987789511680603, + "learning_rate": 0.0001722845612857695, + "loss": 1.0, + "step": 12625 + }, + { + "epoch": 0.4862367661212704, + "grad_norm": 1.1741127967834473, + "learning_rate": 0.00017226366212419645, + "loss": 1.1961, + "step": 12630 + }, + { + "epoch": 0.48642925890279115, + "grad_norm": 1.9914991855621338, + "learning_rate": 0.00017224275635459023, + "loss": 1.3657, + "step": 12635 + }, + { + "epoch": 0.4866217516843118, + "grad_norm": 1.187045693397522, + "learning_rate": 0.00017222184397886245, + "loss": 1.1048, + "step": 12640 + }, + { + "epoch": 0.48681424446583255, + "grad_norm": 1.1656368970870972, + "learning_rate": 0.0001722009249989255, + "loss": 1.1226, + "step": 12645 + }, + { + "epoch": 0.4870067372473532, + "grad_norm": 1.050398349761963, + "learning_rate": 0.00017217999941669226, + "loss": 1.2018, + "step": 12650 + }, + { + "epoch": 0.48719923002887394, + "grad_norm": 1.4217538833618164, + "learning_rate": 0.00017215906723407618, + "loss": 1.0039, + "step": 12655 + }, + { + "epoch": 0.4873917228103946, + "grad_norm": 1.1657346487045288, + "learning_rate": 0.00017213812845299145, + "loss": 1.3153, + "step": 12660 + }, + { + "epoch": 0.4875842155919153, + "grad_norm": 1.0622743368148804, + "learning_rate": 0.0001721171830753527, + "loss": 1.1889, + "step": 12665 + }, + { + "epoch": 0.487776708373436, + "grad_norm": 1.3200461864471436, + "learning_rate": 0.00017209623110307534, + "loss": 1.2643, + "step": 12670 + }, + { + "epoch": 0.4879692011549567, + "grad_norm": 1.3201463222503662, + "learning_rate": 0.0001720752725380752, + "loss": 1.1021, + "step": 12675 + }, + { + "epoch": 0.4881616939364774, + "grad_norm": 1.3110108375549316, + "learning_rate": 0.00017205430738226885, + "loss": 1.297, + "step": 12680 + }, + { + "epoch": 0.4883541867179981, + "grad_norm": 1.9574589729309082, + "learning_rate": 0.00017203333563757344, + "loss": 1.0589, + "step": 12685 + }, + { + "epoch": 0.48854667949951874, + "grad_norm": 2.1387152671813965, + "learning_rate": 0.00017201235730590663, + "loss": 1.1929, + "step": 12690 + }, + { + "epoch": 0.48873917228103947, + "grad_norm": 1.007409691810608, + "learning_rate": 0.0001719913723891868, + "loss": 1.1213, + "step": 12695 + }, + { + "epoch": 0.48893166506256014, + "grad_norm": 0.9808946847915649, + "learning_rate": 0.00017197038088933285, + "loss": 1.2553, + "step": 12700 + }, + { + "epoch": 0.48912415784408086, + "grad_norm": 1.276231050491333, + "learning_rate": 0.00017194938280826433, + "loss": 1.2783, + "step": 12705 + }, + { + "epoch": 0.48931665062560153, + "grad_norm": 2.288778305053711, + "learning_rate": 0.0001719283781479014, + "loss": 1.2736, + "step": 12710 + }, + { + "epoch": 0.48950914340712226, + "grad_norm": 1.3838545083999634, + "learning_rate": 0.00017190736691016475, + "loss": 1.1101, + "step": 12715 + }, + { + "epoch": 0.48970163618864293, + "grad_norm": 1.5680103302001953, + "learning_rate": 0.00017188634909697572, + "loss": 1.3685, + "step": 12720 + }, + { + "epoch": 0.4898941289701636, + "grad_norm": 0.9690240621566772, + "learning_rate": 0.00017186532471025626, + "loss": 1.1677, + "step": 12725 + }, + { + "epoch": 0.4900866217516843, + "grad_norm": 1.617100477218628, + "learning_rate": 0.00017184429375192894, + "loss": 1.3773, + "step": 12730 + }, + { + "epoch": 0.490279114533205, + "grad_norm": 1.751895785331726, + "learning_rate": 0.0001718232562239169, + "loss": 1.1795, + "step": 12735 + }, + { + "epoch": 0.4904716073147257, + "grad_norm": 1.1766438484191895, + "learning_rate": 0.00017180221212814386, + "loss": 1.0915, + "step": 12740 + }, + { + "epoch": 0.4906641000962464, + "grad_norm": 1.384392499923706, + "learning_rate": 0.00017178116146653415, + "loss": 1.2823, + "step": 12745 + }, + { + "epoch": 0.49085659287776706, + "grad_norm": 1.2379616498947144, + "learning_rate": 0.00017176010424101274, + "loss": 1.1182, + "step": 12750 + }, + { + "epoch": 0.4910490856592878, + "grad_norm": 1.0620834827423096, + "learning_rate": 0.00017173904045350515, + "loss": 1.0414, + "step": 12755 + }, + { + "epoch": 0.49124157844080846, + "grad_norm": 1.2448549270629883, + "learning_rate": 0.00017171797010593755, + "loss": 1.2487, + "step": 12760 + }, + { + "epoch": 0.4914340712223292, + "grad_norm": 1.1862956285476685, + "learning_rate": 0.00017169689320023666, + "loss": 1.2117, + "step": 12765 + }, + { + "epoch": 0.49162656400384985, + "grad_norm": 0.8380292057991028, + "learning_rate": 0.00017167580973832984, + "loss": 1.1396, + "step": 12770 + }, + { + "epoch": 0.4918190567853705, + "grad_norm": 1.807305097579956, + "learning_rate": 0.00017165471972214506, + "loss": 1.2316, + "step": 12775 + }, + { + "epoch": 0.49201154956689125, + "grad_norm": 1.5265247821807861, + "learning_rate": 0.00017163362315361077, + "loss": 1.1191, + "step": 12780 + }, + { + "epoch": 0.4922040423484119, + "grad_norm": 1.4569288492202759, + "learning_rate": 0.00017161252003465626, + "loss": 1.3273, + "step": 12785 + }, + { + "epoch": 0.49239653512993264, + "grad_norm": 1.0343568325042725, + "learning_rate": 0.00017159141036721112, + "loss": 1.3457, + "step": 12790 + }, + { + "epoch": 0.4925890279114533, + "grad_norm": 1.4867749214172363, + "learning_rate": 0.00017157029415320577, + "loss": 1.1783, + "step": 12795 + }, + { + "epoch": 0.49278152069297404, + "grad_norm": 1.0775165557861328, + "learning_rate": 0.0001715491713945711, + "loss": 1.2355, + "step": 12800 + }, + { + "epoch": 0.4929740134744947, + "grad_norm": 1.1424553394317627, + "learning_rate": 0.0001715280420932387, + "loss": 1.1305, + "step": 12805 + }, + { + "epoch": 0.4931665062560154, + "grad_norm": 2.403656482696533, + "learning_rate": 0.00017150690625114065, + "loss": 1.2118, + "step": 12810 + }, + { + "epoch": 0.4933589990375361, + "grad_norm": 1.2673543691635132, + "learning_rate": 0.00017148576387020976, + "loss": 1.163, + "step": 12815 + }, + { + "epoch": 0.4935514918190568, + "grad_norm": 1.4545459747314453, + "learning_rate": 0.0001714646149523793, + "loss": 1.1661, + "step": 12820 + }, + { + "epoch": 0.4937439846005775, + "grad_norm": 1.8716140985488892, + "learning_rate": 0.0001714434594995832, + "loss": 1.0518, + "step": 12825 + }, + { + "epoch": 0.49393647738209817, + "grad_norm": 1.4617652893066406, + "learning_rate": 0.000171422297513756, + "loss": 0.9978, + "step": 12830 + }, + { + "epoch": 0.49412897016361884, + "grad_norm": 1.8650813102722168, + "learning_rate": 0.00017140112899683284, + "loss": 1.0077, + "step": 12835 + }, + { + "epoch": 0.49432146294513957, + "grad_norm": 1.4080079793930054, + "learning_rate": 0.00017137995395074938, + "loss": 1.2906, + "step": 12840 + }, + { + "epoch": 0.49451395572666024, + "grad_norm": 1.2144896984100342, + "learning_rate": 0.000171358772377442, + "loss": 1.1323, + "step": 12845 + }, + { + "epoch": 0.49470644850818096, + "grad_norm": 1.3294404745101929, + "learning_rate": 0.0001713375842788476, + "loss": 1.1987, + "step": 12850 + }, + { + "epoch": 0.49489894128970163, + "grad_norm": 1.3663264513015747, + "learning_rate": 0.00017131638965690372, + "loss": 1.224, + "step": 12855 + }, + { + "epoch": 0.4950914340712223, + "grad_norm": 2.325491428375244, + "learning_rate": 0.0001712951885135484, + "loss": 1.2449, + "step": 12860 + }, + { + "epoch": 0.49528392685274303, + "grad_norm": 1.3226628303527832, + "learning_rate": 0.00017127398085072039, + "loss": 1.2321, + "step": 12865 + }, + { + "epoch": 0.4954764196342637, + "grad_norm": 1.309049367904663, + "learning_rate": 0.00017125276667035895, + "loss": 1.1242, + "step": 12870 + }, + { + "epoch": 0.4956689124157844, + "grad_norm": 2.189549446105957, + "learning_rate": 0.00017123154597440402, + "loss": 1.1479, + "step": 12875 + }, + { + "epoch": 0.4958614051973051, + "grad_norm": 1.258787989616394, + "learning_rate": 0.00017121031876479606, + "loss": 1.3375, + "step": 12880 + }, + { + "epoch": 0.4960538979788258, + "grad_norm": 1.4555957317352295, + "learning_rate": 0.00017118908504347623, + "loss": 1.29, + "step": 12885 + }, + { + "epoch": 0.4962463907603465, + "grad_norm": 1.4220309257507324, + "learning_rate": 0.00017116784481238613, + "loss": 1.2438, + "step": 12890 + }, + { + "epoch": 0.49643888354186716, + "grad_norm": 1.1141269207000732, + "learning_rate": 0.00017114659807346803, + "loss": 1.1286, + "step": 12895 + }, + { + "epoch": 0.4966313763233879, + "grad_norm": 2.7541897296905518, + "learning_rate": 0.00017112534482866486, + "loss": 1.2506, + "step": 12900 + }, + { + "epoch": 0.49682386910490856, + "grad_norm": 1.8478270769119263, + "learning_rate": 0.00017110408507992007, + "loss": 1.2337, + "step": 12905 + }, + { + "epoch": 0.4970163618864293, + "grad_norm": 1.3013496398925781, + "learning_rate": 0.00017108281882917775, + "loss": 1.109, + "step": 12910 + }, + { + "epoch": 0.49720885466794995, + "grad_norm": 1.6363266706466675, + "learning_rate": 0.00017106154607838249, + "loss": 1.2546, + "step": 12915 + }, + { + "epoch": 0.4974013474494706, + "grad_norm": 1.4996516704559326, + "learning_rate": 0.0001710402668294796, + "loss": 1.3066, + "step": 12920 + }, + { + "epoch": 0.49759384023099135, + "grad_norm": 1.2411632537841797, + "learning_rate": 0.0001710189810844149, + "loss": 1.2678, + "step": 12925 + }, + { + "epoch": 0.497786333012512, + "grad_norm": 1.197771430015564, + "learning_rate": 0.00017099768884513484, + "loss": 1.1069, + "step": 12930 + }, + { + "epoch": 0.49797882579403274, + "grad_norm": 1.1568015813827515, + "learning_rate": 0.00017097639011358644, + "loss": 1.1863, + "step": 12935 + }, + { + "epoch": 0.4981713185755534, + "grad_norm": 1.8848886489868164, + "learning_rate": 0.00017095508489171736, + "loss": 1.3294, + "step": 12940 + }, + { + "epoch": 0.4983638113570741, + "grad_norm": 1.4993650913238525, + "learning_rate": 0.00017093377318147578, + "loss": 1.1768, + "step": 12945 + }, + { + "epoch": 0.4985563041385948, + "grad_norm": 1.1212975978851318, + "learning_rate": 0.00017091245498481055, + "loss": 1.2018, + "step": 12950 + }, + { + "epoch": 0.4987487969201155, + "grad_norm": 1.086147665977478, + "learning_rate": 0.00017089113030367107, + "loss": 1.3022, + "step": 12955 + }, + { + "epoch": 0.4989412897016362, + "grad_norm": 2.185974359512329, + "learning_rate": 0.00017086979914000732, + "loss": 1.239, + "step": 12960 + }, + { + "epoch": 0.4991337824831569, + "grad_norm": 1.0493237972259521, + "learning_rate": 0.00017084846149576993, + "loss": 1.1614, + "step": 12965 + }, + { + "epoch": 0.4993262752646776, + "grad_norm": 2.034449815750122, + "learning_rate": 0.00017082711737291005, + "loss": 1.2636, + "step": 12970 + }, + { + "epoch": 0.49951876804619827, + "grad_norm": 1.6736334562301636, + "learning_rate": 0.00017080576677337944, + "loss": 1.1721, + "step": 12975 + }, + { + "epoch": 0.49971126082771894, + "grad_norm": 1.8189449310302734, + "learning_rate": 0.00017078440969913055, + "loss": 1.3739, + "step": 12980 + }, + { + "epoch": 0.49990375360923966, + "grad_norm": 1.500243067741394, + "learning_rate": 0.00017076304615211627, + "loss": 1.0446, + "step": 12985 + }, + { + "epoch": 0.5000962463907603, + "grad_norm": 1.2671639919281006, + "learning_rate": 0.0001707416761342902, + "loss": 1.1841, + "step": 12990 + }, + { + "epoch": 0.500288739172281, + "grad_norm": 1.3602843284606934, + "learning_rate": 0.00017072029964760644, + "loss": 1.3311, + "step": 12995 + }, + { + "epoch": 0.5004812319538018, + "grad_norm": 1.106224775314331, + "learning_rate": 0.00017069891669401982, + "loss": 1.1083, + "step": 13000 + }, + { + "epoch": 0.5006737247353225, + "grad_norm": 1.3517072200775146, + "learning_rate": 0.00017067752727548555, + "loss": 1.2598, + "step": 13005 + }, + { + "epoch": 0.5008662175168431, + "grad_norm": 1.1175580024719238, + "learning_rate": 0.00017065613139395958, + "loss": 1.0876, + "step": 13010 + }, + { + "epoch": 0.5010587102983638, + "grad_norm": 1.8314218521118164, + "learning_rate": 0.00017063472905139854, + "loss": 1.3194, + "step": 13015 + }, + { + "epoch": 0.5012512030798845, + "grad_norm": 0.7892528176307678, + "learning_rate": 0.00017061332024975937, + "loss": 1.0172, + "step": 13020 + }, + { + "epoch": 0.5014436958614052, + "grad_norm": 0.8713880777359009, + "learning_rate": 0.00017059190499099986, + "loss": 1.0775, + "step": 13025 + }, + { + "epoch": 0.5016361886429259, + "grad_norm": 2.4726779460906982, + "learning_rate": 0.00017057048327707826, + "loss": 1.3351, + "step": 13030 + }, + { + "epoch": 0.5018286814244466, + "grad_norm": 1.1800824403762817, + "learning_rate": 0.00017054905510995342, + "loss": 1.3102, + "step": 13035 + }, + { + "epoch": 0.5020211742059673, + "grad_norm": 1.583617091178894, + "learning_rate": 0.0001705276204915849, + "loss": 1.3633, + "step": 13040 + }, + { + "epoch": 0.5022136669874879, + "grad_norm": 2.0497727394104004, + "learning_rate": 0.00017050617942393264, + "loss": 1.2055, + "step": 13045 + }, + { + "epoch": 0.5024061597690087, + "grad_norm": 1.775793433189392, + "learning_rate": 0.00017048473190895734, + "loss": 1.1747, + "step": 13050 + }, + { + "epoch": 0.5025986525505294, + "grad_norm": 1.8494744300842285, + "learning_rate": 0.00017046327794862024, + "loss": 1.2159, + "step": 13055 + }, + { + "epoch": 0.50279114533205, + "grad_norm": 1.6188912391662598, + "learning_rate": 0.00017044181754488315, + "loss": 1.156, + "step": 13060 + }, + { + "epoch": 0.5029836381135707, + "grad_norm": 1.575838565826416, + "learning_rate": 0.00017042035069970846, + "loss": 1.2103, + "step": 13065 + }, + { + "epoch": 0.5031761308950914, + "grad_norm": 1.7594157457351685, + "learning_rate": 0.0001703988774150592, + "loss": 1.4271, + "step": 13070 + }, + { + "epoch": 0.5033686236766122, + "grad_norm": 2.066418170928955, + "learning_rate": 0.00017037739769289894, + "loss": 1.215, + "step": 13075 + }, + { + "epoch": 0.5035611164581328, + "grad_norm": 1.8343390226364136, + "learning_rate": 0.0001703559115351919, + "loss": 1.3318, + "step": 13080 + }, + { + "epoch": 0.5037536092396535, + "grad_norm": 1.177186369895935, + "learning_rate": 0.00017033441894390278, + "loss": 1.2845, + "step": 13085 + }, + { + "epoch": 0.5039461020211742, + "grad_norm": 1.7650407552719116, + "learning_rate": 0.000170312919920997, + "loss": 1.2133, + "step": 13090 + }, + { + "epoch": 0.5041385948026949, + "grad_norm": 1.0483806133270264, + "learning_rate": 0.00017029141446844043, + "loss": 1.2309, + "step": 13095 + }, + { + "epoch": 0.5043310875842156, + "grad_norm": 1.1729573011398315, + "learning_rate": 0.00017026990258819968, + "loss": 1.2975, + "step": 13100 + }, + { + "epoch": 0.5045235803657363, + "grad_norm": 1.8557977676391602, + "learning_rate": 0.00017024838428224184, + "loss": 1.1332, + "step": 13105 + }, + { + "epoch": 0.504716073147257, + "grad_norm": 2.2607064247131348, + "learning_rate": 0.00017022685955253458, + "loss": 1.186, + "step": 13110 + }, + { + "epoch": 0.5049085659287776, + "grad_norm": 1.0992141962051392, + "learning_rate": 0.00017020532840104625, + "loss": 1.0708, + "step": 13115 + }, + { + "epoch": 0.5051010587102983, + "grad_norm": 0.9550696611404419, + "learning_rate": 0.0001701837908297457, + "loss": 1.2069, + "step": 13120 + }, + { + "epoch": 0.5052935514918191, + "grad_norm": 1.9301509857177734, + "learning_rate": 0.00017016224684060242, + "loss": 1.1152, + "step": 13125 + }, + { + "epoch": 0.5054860442733398, + "grad_norm": 1.2657769918441772, + "learning_rate": 0.0001701406964355864, + "loss": 1.1028, + "step": 13130 + }, + { + "epoch": 0.5056785370548604, + "grad_norm": 1.549902081489563, + "learning_rate": 0.00017011913961666837, + "loss": 1.1677, + "step": 13135 + }, + { + "epoch": 0.5058710298363811, + "grad_norm": 1.5015727281570435, + "learning_rate": 0.00017009757638581952, + "loss": 1.1799, + "step": 13140 + }, + { + "epoch": 0.5060635226179019, + "grad_norm": 1.206145167350769, + "learning_rate": 0.00017007600674501166, + "loss": 1.131, + "step": 13145 + }, + { + "epoch": 0.5062560153994226, + "grad_norm": 1.1034317016601562, + "learning_rate": 0.00017005443069621716, + "loss": 1.0504, + "step": 13150 + }, + { + "epoch": 0.5064485081809432, + "grad_norm": 1.0830001831054688, + "learning_rate": 0.00017003284824140908, + "loss": 1.1653, + "step": 13155 + }, + { + "epoch": 0.5066410009624639, + "grad_norm": 1.8119686841964722, + "learning_rate": 0.00017001125938256094, + "loss": 1.1646, + "step": 13160 + }, + { + "epoch": 0.5068334937439846, + "grad_norm": 1.1078890562057495, + "learning_rate": 0.00016998966412164692, + "loss": 1.3281, + "step": 13165 + }, + { + "epoch": 0.5070259865255053, + "grad_norm": 2.1965198516845703, + "learning_rate": 0.00016996806246064174, + "loss": 1.1042, + "step": 13170 + }, + { + "epoch": 0.507218479307026, + "grad_norm": 1.3997282981872559, + "learning_rate": 0.00016994645440152075, + "loss": 1.1662, + "step": 13175 + }, + { + "epoch": 0.5074109720885467, + "grad_norm": 1.2493458986282349, + "learning_rate": 0.00016992483994625985, + "loss": 1.1594, + "step": 13180 + }, + { + "epoch": 0.5076034648700674, + "grad_norm": 0.8307852745056152, + "learning_rate": 0.00016990321909683557, + "loss": 1.1701, + "step": 13185 + }, + { + "epoch": 0.507795957651588, + "grad_norm": 1.1798492670059204, + "learning_rate": 0.00016988159185522497, + "loss": 1.2217, + "step": 13190 + }, + { + "epoch": 0.5079884504331088, + "grad_norm": 2.131786823272705, + "learning_rate": 0.00016985995822340567, + "loss": 1.277, + "step": 13195 + }, + { + "epoch": 0.5081809432146295, + "grad_norm": 1.0258443355560303, + "learning_rate": 0.00016983831820335603, + "loss": 1.0954, + "step": 13200 + }, + { + "epoch": 0.5083734359961501, + "grad_norm": 2.4005777835845947, + "learning_rate": 0.0001698166717970548, + "loss": 1.3484, + "step": 13205 + }, + { + "epoch": 0.5085659287776708, + "grad_norm": 1.3329745531082153, + "learning_rate": 0.00016979501900648143, + "loss": 1.2548, + "step": 13210 + }, + { + "epoch": 0.5087584215591915, + "grad_norm": 1.2792582511901855, + "learning_rate": 0.00016977335983361594, + "loss": 1.1056, + "step": 13215 + }, + { + "epoch": 0.5089509143407123, + "grad_norm": 1.1146180629730225, + "learning_rate": 0.00016975169428043888, + "loss": 1.1728, + "step": 13220 + }, + { + "epoch": 0.5091434071222329, + "grad_norm": 1.155003309249878, + "learning_rate": 0.0001697300223489314, + "loss": 1.2469, + "step": 13225 + }, + { + "epoch": 0.5093358999037536, + "grad_norm": 1.8456053733825684, + "learning_rate": 0.00016970834404107535, + "loss": 1.1515, + "step": 13230 + }, + { + "epoch": 0.5095283926852743, + "grad_norm": 1.3863856792449951, + "learning_rate": 0.000169686659358853, + "loss": 1.2561, + "step": 13235 + }, + { + "epoch": 0.509720885466795, + "grad_norm": 2.099985361099243, + "learning_rate": 0.00016966496830424728, + "loss": 1.2639, + "step": 13240 + }, + { + "epoch": 0.5099133782483157, + "grad_norm": 1.4132083654403687, + "learning_rate": 0.0001696432708792417, + "loss": 0.9859, + "step": 13245 + }, + { + "epoch": 0.5101058710298364, + "grad_norm": 1.0421473979949951, + "learning_rate": 0.00016962156708582037, + "loss": 1.1239, + "step": 13250 + }, + { + "epoch": 0.5102983638113571, + "grad_norm": 1.4971591234207153, + "learning_rate": 0.0001695998569259679, + "loss": 1.183, + "step": 13255 + }, + { + "epoch": 0.5104908565928777, + "grad_norm": 1.7850632667541504, + "learning_rate": 0.00016957814040166955, + "loss": 1.2342, + "step": 13260 + }, + { + "epoch": 0.5106833493743984, + "grad_norm": 2.817624092102051, + "learning_rate": 0.0001695564175149112, + "loss": 1.1795, + "step": 13265 + }, + { + "epoch": 0.5108758421559192, + "grad_norm": 1.4107112884521484, + "learning_rate": 0.00016953468826767925, + "loss": 1.2791, + "step": 13270 + }, + { + "epoch": 0.5110683349374399, + "grad_norm": 1.4817914962768555, + "learning_rate": 0.00016951295266196063, + "loss": 1.0816, + "step": 13275 + }, + { + "epoch": 0.5112608277189605, + "grad_norm": 1.054870367050171, + "learning_rate": 0.00016949121069974302, + "loss": 1.2114, + "step": 13280 + }, + { + "epoch": 0.5114533205004812, + "grad_norm": 1.2629690170288086, + "learning_rate": 0.00016946946238301453, + "loss": 1.1014, + "step": 13285 + }, + { + "epoch": 0.5116458132820019, + "grad_norm": 0.9189853668212891, + "learning_rate": 0.00016944770771376387, + "loss": 1.319, + "step": 13290 + }, + { + "epoch": 0.5118383060635227, + "grad_norm": 1.4326847791671753, + "learning_rate": 0.0001694259466939804, + "loss": 1.299, + "step": 13295 + }, + { + "epoch": 0.5120307988450433, + "grad_norm": 1.1833186149597168, + "learning_rate": 0.00016940417932565402, + "loss": 1.2863, + "step": 13300 + }, + { + "epoch": 0.512223291626564, + "grad_norm": 1.1329289674758911, + "learning_rate": 0.0001693824056107752, + "loss": 1.2228, + "step": 13305 + }, + { + "epoch": 0.5124157844080847, + "grad_norm": 1.2103817462921143, + "learning_rate": 0.000169360625551335, + "loss": 1.042, + "step": 13310 + }, + { + "epoch": 0.5126082771896054, + "grad_norm": 1.2664172649383545, + "learning_rate": 0.00016933883914932506, + "loss": 1.299, + "step": 13315 + }, + { + "epoch": 0.5128007699711261, + "grad_norm": 1.8509985208511353, + "learning_rate": 0.0001693170464067376, + "loss": 1.135, + "step": 13320 + }, + { + "epoch": 0.5129932627526468, + "grad_norm": 2.1004250049591064, + "learning_rate": 0.00016929524732556546, + "loss": 1.1321, + "step": 13325 + }, + { + "epoch": 0.5131857555341675, + "grad_norm": 1.1648815870285034, + "learning_rate": 0.00016927344190780197, + "loss": 1.094, + "step": 13330 + }, + { + "epoch": 0.5133782483156881, + "grad_norm": 0.9492617249488831, + "learning_rate": 0.0001692516301554411, + "loss": 1.2055, + "step": 13335 + }, + { + "epoch": 0.5135707410972089, + "grad_norm": 1.7911789417266846, + "learning_rate": 0.00016922981207047742, + "loss": 1.1726, + "step": 13340 + }, + { + "epoch": 0.5137632338787296, + "grad_norm": 1.2055487632751465, + "learning_rate": 0.00016920798765490601, + "loss": 1.158, + "step": 13345 + }, + { + "epoch": 0.5139557266602502, + "grad_norm": 1.1120411157608032, + "learning_rate": 0.0001691861569107226, + "loss": 1.2508, + "step": 13350 + }, + { + "epoch": 0.5141482194417709, + "grad_norm": 1.1816275119781494, + "learning_rate": 0.0001691643198399235, + "loss": 1.0294, + "step": 13355 + }, + { + "epoch": 0.5143407122232916, + "grad_norm": 1.1714962720870972, + "learning_rate": 0.00016914247644450546, + "loss": 1.2843, + "step": 13360 + }, + { + "epoch": 0.5145332050048124, + "grad_norm": 2.0812292098999023, + "learning_rate": 0.000169120626726466, + "loss": 1.284, + "step": 13365 + }, + { + "epoch": 0.514725697786333, + "grad_norm": 1.7628620862960815, + "learning_rate": 0.00016909877068780314, + "loss": 1.1104, + "step": 13370 + }, + { + "epoch": 0.5149181905678537, + "grad_norm": 1.7429643869400024, + "learning_rate": 0.0001690769083305154, + "loss": 1.1387, + "step": 13375 + }, + { + "epoch": 0.5151106833493744, + "grad_norm": 2.087916612625122, + "learning_rate": 0.00016905503965660196, + "loss": 1.2737, + "step": 13380 + }, + { + "epoch": 0.515303176130895, + "grad_norm": 1.5689221620559692, + "learning_rate": 0.00016903316466806265, + "loss": 1.145, + "step": 13385 + }, + { + "epoch": 0.5154956689124158, + "grad_norm": 1.0740375518798828, + "learning_rate": 0.0001690112833668977, + "loss": 0.9748, + "step": 13390 + }, + { + "epoch": 0.5156881616939365, + "grad_norm": 1.4595876932144165, + "learning_rate": 0.00016898939575510805, + "loss": 1.1378, + "step": 13395 + }, + { + "epoch": 0.5158806544754572, + "grad_norm": 1.9210182428359985, + "learning_rate": 0.00016896750183469517, + "loss": 1.2581, + "step": 13400 + }, + { + "epoch": 0.5160731472569778, + "grad_norm": 1.0922927856445312, + "learning_rate": 0.00016894560160766117, + "loss": 1.1601, + "step": 13405 + }, + { + "epoch": 0.5162656400384985, + "grad_norm": 2.037611246109009, + "learning_rate": 0.00016892369507600855, + "loss": 1.2394, + "step": 13410 + }, + { + "epoch": 0.5164581328200193, + "grad_norm": 1.1577821969985962, + "learning_rate": 0.0001689017822417406, + "loss": 1.2428, + "step": 13415 + }, + { + "epoch": 0.51665062560154, + "grad_norm": 1.1762430667877197, + "learning_rate": 0.00016887986310686114, + "loss": 1.1648, + "step": 13420 + }, + { + "epoch": 0.5168431183830606, + "grad_norm": 1.8631316423416138, + "learning_rate": 0.00016885793767337445, + "loss": 1.2288, + "step": 13425 + }, + { + "epoch": 0.5170356111645813, + "grad_norm": 1.191747784614563, + "learning_rate": 0.0001688360059432855, + "loss": 1.1287, + "step": 13430 + }, + { + "epoch": 0.517228103946102, + "grad_norm": 1.092367172241211, + "learning_rate": 0.00016881406791859985, + "loss": 1.2073, + "step": 13435 + }, + { + "epoch": 0.5174205967276228, + "grad_norm": 0.9805938601493835, + "learning_rate": 0.00016879212360132345, + "loss": 1.3199, + "step": 13440 + }, + { + "epoch": 0.5176130895091434, + "grad_norm": 1.0042074918746948, + "learning_rate": 0.00016877017299346314, + "loss": 1.1389, + "step": 13445 + }, + { + "epoch": 0.5178055822906641, + "grad_norm": 1.3087821006774902, + "learning_rate": 0.00016874821609702605, + "loss": 1.2112, + "step": 13450 + }, + { + "epoch": 0.5179980750721848, + "grad_norm": 1.4208637475967407, + "learning_rate": 0.00016872625291401998, + "loss": 1.119, + "step": 13455 + }, + { + "epoch": 0.5181905678537055, + "grad_norm": 0.9211226105690002, + "learning_rate": 0.0001687042834464534, + "loss": 1.1458, + "step": 13460 + }, + { + "epoch": 0.5183830606352262, + "grad_norm": 1.1774996519088745, + "learning_rate": 0.00016868230769633518, + "loss": 1.195, + "step": 13465 + }, + { + "epoch": 0.5185755534167469, + "grad_norm": 1.395883321762085, + "learning_rate": 0.0001686603256656749, + "loss": 1.1633, + "step": 13470 + }, + { + "epoch": 0.5187680461982676, + "grad_norm": 2.2554938793182373, + "learning_rate": 0.00016863833735648268, + "loss": 1.1345, + "step": 13475 + }, + { + "epoch": 0.5189605389797882, + "grad_norm": 1.2396293878555298, + "learning_rate": 0.00016861634277076922, + "loss": 1.1109, + "step": 13480 + }, + { + "epoch": 0.519153031761309, + "grad_norm": 1.2292909622192383, + "learning_rate": 0.00016859434191054574, + "loss": 1.1029, + "step": 13485 + }, + { + "epoch": 0.5193455245428297, + "grad_norm": 1.145571231842041, + "learning_rate": 0.00016857233477782409, + "loss": 1.1734, + "step": 13490 + }, + { + "epoch": 0.5195380173243503, + "grad_norm": 1.7307795286178589, + "learning_rate": 0.00016855032137461667, + "loss": 1.1476, + "step": 13495 + }, + { + "epoch": 0.519730510105871, + "grad_norm": 1.611140489578247, + "learning_rate": 0.0001685283017029365, + "loss": 1.1304, + "step": 13500 + }, + { + "epoch": 0.5199230028873917, + "grad_norm": 1.3966014385223389, + "learning_rate": 0.00016850627576479705, + "loss": 1.1231, + "step": 13505 + }, + { + "epoch": 0.5201154956689125, + "grad_norm": 1.505765676498413, + "learning_rate": 0.0001684842435622125, + "loss": 1.0741, + "step": 13510 + }, + { + "epoch": 0.5203079884504331, + "grad_norm": 1.791595220565796, + "learning_rate": 0.00016846220509719755, + "loss": 1.1928, + "step": 13515 + }, + { + "epoch": 0.5205004812319538, + "grad_norm": 1.2992479801177979, + "learning_rate": 0.00016844016037176744, + "loss": 1.1523, + "step": 13520 + }, + { + "epoch": 0.5206929740134745, + "grad_norm": 1.8747221231460571, + "learning_rate": 0.00016841810938793807, + "loss": 1.0704, + "step": 13525 + }, + { + "epoch": 0.5208854667949951, + "grad_norm": 1.3441274166107178, + "learning_rate": 0.00016839605214772583, + "loss": 1.1979, + "step": 13530 + }, + { + "epoch": 0.5210779595765159, + "grad_norm": 0.8640159964561462, + "learning_rate": 0.0001683739886531477, + "loss": 1.1577, + "step": 13535 + }, + { + "epoch": 0.5212704523580366, + "grad_norm": 1.7198442220687866, + "learning_rate": 0.00016835191890622123, + "loss": 1.2623, + "step": 13540 + }, + { + "epoch": 0.5214629451395573, + "grad_norm": 1.2651041746139526, + "learning_rate": 0.0001683298429089646, + "loss": 1.2428, + "step": 13545 + }, + { + "epoch": 0.5216554379210779, + "grad_norm": 1.9191710948944092, + "learning_rate": 0.00016830776066339642, + "loss": 1.2872, + "step": 13550 + }, + { + "epoch": 0.5218479307025986, + "grad_norm": 1.8098481893539429, + "learning_rate": 0.00016828567217153605, + "loss": 1.2838, + "step": 13555 + }, + { + "epoch": 0.5220404234841194, + "grad_norm": 1.732160210609436, + "learning_rate": 0.00016826357743540332, + "loss": 1.0766, + "step": 13560 + }, + { + "epoch": 0.5222329162656401, + "grad_norm": 1.4580518007278442, + "learning_rate": 0.00016824147645701863, + "loss": 1.2825, + "step": 13565 + }, + { + "epoch": 0.5224254090471607, + "grad_norm": 1.5836480855941772, + "learning_rate": 0.000168219369238403, + "loss": 1.1772, + "step": 13570 + }, + { + "epoch": 0.5226179018286814, + "grad_norm": 1.5529143810272217, + "learning_rate": 0.00016819725578157794, + "loss": 1.2795, + "step": 13575 + }, + { + "epoch": 0.5228103946102021, + "grad_norm": 1.1405484676361084, + "learning_rate": 0.0001681751360885656, + "loss": 1.2133, + "step": 13580 + }, + { + "epoch": 0.5230028873917228, + "grad_norm": 1.0912057161331177, + "learning_rate": 0.00016815301016138873, + "loss": 1.0493, + "step": 13585 + }, + { + "epoch": 0.5231953801732435, + "grad_norm": 0.9384201169013977, + "learning_rate": 0.0001681308780020705, + "loss": 1.1638, + "step": 13590 + }, + { + "epoch": 0.5233878729547642, + "grad_norm": 1.3467286825180054, + "learning_rate": 0.0001681087396126348, + "loss": 1.1927, + "step": 13595 + }, + { + "epoch": 0.5235803657362849, + "grad_norm": 0.9008259773254395, + "learning_rate": 0.00016808659499510607, + "loss": 1.2158, + "step": 13600 + }, + { + "epoch": 0.5237728585178055, + "grad_norm": 1.1013727188110352, + "learning_rate": 0.00016806444415150927, + "loss": 1.1575, + "step": 13605 + }, + { + "epoch": 0.5239653512993263, + "grad_norm": 1.160654902458191, + "learning_rate": 0.00016804228708386992, + "loss": 1.1662, + "step": 13610 + }, + { + "epoch": 0.524157844080847, + "grad_norm": 1.5752032995224, + "learning_rate": 0.00016802012379421414, + "loss": 1.1596, + "step": 13615 + }, + { + "epoch": 0.5243503368623676, + "grad_norm": 1.1819881200790405, + "learning_rate": 0.00016799795428456865, + "loss": 1.1686, + "step": 13620 + }, + { + "epoch": 0.5245428296438883, + "grad_norm": 0.9841921329498291, + "learning_rate": 0.00016797577855696069, + "loss": 1.0872, + "step": 13625 + }, + { + "epoch": 0.5247353224254091, + "grad_norm": 1.2292228937149048, + "learning_rate": 0.00016795359661341808, + "loss": 1.3943, + "step": 13630 + }, + { + "epoch": 0.5249278152069298, + "grad_norm": 1.2674068212509155, + "learning_rate": 0.0001679314084559692, + "loss": 1.232, + "step": 13635 + }, + { + "epoch": 0.5251203079884504, + "grad_norm": 1.1942312717437744, + "learning_rate": 0.00016790921408664302, + "loss": 1.2223, + "step": 13640 + }, + { + "epoch": 0.5253128007699711, + "grad_norm": 1.5753337144851685, + "learning_rate": 0.00016788701350746907, + "loss": 1.2936, + "step": 13645 + }, + { + "epoch": 0.5255052935514918, + "grad_norm": 1.1031461954116821, + "learning_rate": 0.00016786480672047744, + "loss": 1.2651, + "step": 13650 + }, + { + "epoch": 0.5256977863330126, + "grad_norm": 3.8325674533843994, + "learning_rate": 0.00016784259372769884, + "loss": 1.1693, + "step": 13655 + }, + { + "epoch": 0.5258902791145332, + "grad_norm": 1.6535909175872803, + "learning_rate": 0.0001678203745311644, + "loss": 1.1606, + "step": 13660 + }, + { + "epoch": 0.5260827718960539, + "grad_norm": 1.6406097412109375, + "learning_rate": 0.000167798149132906, + "loss": 1.3232, + "step": 13665 + }, + { + "epoch": 0.5262752646775746, + "grad_norm": 1.6994904279708862, + "learning_rate": 0.000167775917534956, + "loss": 1.0998, + "step": 13670 + }, + { + "epoch": 0.5264677574590952, + "grad_norm": 1.6446374654769897, + "learning_rate": 0.0001677536797393473, + "loss": 1.2997, + "step": 13675 + }, + { + "epoch": 0.526660250240616, + "grad_norm": 1.6050851345062256, + "learning_rate": 0.0001677314357481134, + "loss": 1.19, + "step": 13680 + }, + { + "epoch": 0.5268527430221367, + "grad_norm": 1.473940134048462, + "learning_rate": 0.00016770918556328844, + "loss": 1.2007, + "step": 13685 + }, + { + "epoch": 0.5270452358036574, + "grad_norm": 1.1209567785263062, + "learning_rate": 0.00016768692918690695, + "loss": 1.2956, + "step": 13690 + }, + { + "epoch": 0.527237728585178, + "grad_norm": 1.4143558740615845, + "learning_rate": 0.00016766466662100415, + "loss": 1.1734, + "step": 13695 + }, + { + "epoch": 0.5274302213666987, + "grad_norm": 1.138107180595398, + "learning_rate": 0.00016764239786761585, + "loss": 1.1318, + "step": 13700 + }, + { + "epoch": 0.5276227141482195, + "grad_norm": 1.5194774866104126, + "learning_rate": 0.00016762012292877835, + "loss": 1.1525, + "step": 13705 + }, + { + "epoch": 0.5278152069297402, + "grad_norm": 1.136946439743042, + "learning_rate": 0.00016759784180652858, + "loss": 1.1289, + "step": 13710 + }, + { + "epoch": 0.5280076997112608, + "grad_norm": 1.5263949632644653, + "learning_rate": 0.00016757555450290396, + "loss": 1.2811, + "step": 13715 + }, + { + "epoch": 0.5282001924927815, + "grad_norm": 2.306833505630493, + "learning_rate": 0.00016755326101994248, + "loss": 1.0326, + "step": 13720 + }, + { + "epoch": 0.5283926852743022, + "grad_norm": 1.4330452680587769, + "learning_rate": 0.0001675309613596828, + "loss": 1.0126, + "step": 13725 + }, + { + "epoch": 0.528585178055823, + "grad_norm": 0.8746087551116943, + "learning_rate": 0.00016750865552416408, + "loss": 1.224, + "step": 13730 + }, + { + "epoch": 0.5287776708373436, + "grad_norm": 2.576612949371338, + "learning_rate": 0.000167486343515426, + "loss": 1.178, + "step": 13735 + }, + { + "epoch": 0.5289701636188643, + "grad_norm": 1.3074976205825806, + "learning_rate": 0.00016746402533550887, + "loss": 1.2453, + "step": 13740 + }, + { + "epoch": 0.529162656400385, + "grad_norm": 1.0941317081451416, + "learning_rate": 0.00016744170098645353, + "loss": 0.9341, + "step": 13745 + }, + { + "epoch": 0.5293551491819056, + "grad_norm": 1.6738418340682983, + "learning_rate": 0.00016741937047030139, + "loss": 1.1423, + "step": 13750 + }, + { + "epoch": 0.5295476419634264, + "grad_norm": 1.9735844135284424, + "learning_rate": 0.00016739703378909444, + "loss": 1.0691, + "step": 13755 + }, + { + "epoch": 0.5297401347449471, + "grad_norm": 1.0063233375549316, + "learning_rate": 0.00016737469094487518, + "loss": 1.0096, + "step": 13760 + }, + { + "epoch": 0.5299326275264677, + "grad_norm": 1.2500115633010864, + "learning_rate": 0.00016735234193968678, + "loss": 1.1627, + "step": 13765 + }, + { + "epoch": 0.5301251203079884, + "grad_norm": 1.0908536911010742, + "learning_rate": 0.00016732998677557287, + "loss": 1.0477, + "step": 13770 + }, + { + "epoch": 0.5303176130895092, + "grad_norm": 1.609208106994629, + "learning_rate": 0.0001673076254545777, + "loss": 0.964, + "step": 13775 + }, + { + "epoch": 0.5305101058710299, + "grad_norm": 1.0210634469985962, + "learning_rate": 0.00016728525797874607, + "loss": 1.2982, + "step": 13780 + }, + { + "epoch": 0.5307025986525505, + "grad_norm": 2.0595545768737793, + "learning_rate": 0.0001672628843501233, + "loss": 1.1969, + "step": 13785 + }, + { + "epoch": 0.5308950914340712, + "grad_norm": 1.7514983415603638, + "learning_rate": 0.00016724050457075533, + "loss": 1.2918, + "step": 13790 + }, + { + "epoch": 0.5310875842155919, + "grad_norm": 1.483798861503601, + "learning_rate": 0.00016721811864268865, + "loss": 1.0163, + "step": 13795 + }, + { + "epoch": 0.5312800769971127, + "grad_norm": 1.4174484014511108, + "learning_rate": 0.0001671957265679703, + "loss": 1.1936, + "step": 13800 + }, + { + "epoch": 0.5314725697786333, + "grad_norm": 1.4664232730865479, + "learning_rate": 0.00016717332834864787, + "loss": 1.2553, + "step": 13805 + }, + { + "epoch": 0.531665062560154, + "grad_norm": 0.6863868832588196, + "learning_rate": 0.00016715092398676958, + "loss": 0.8998, + "step": 13810 + }, + { + "epoch": 0.5318575553416747, + "grad_norm": 2.3511574268341064, + "learning_rate": 0.00016712851348438408, + "loss": 1.4484, + "step": 13815 + }, + { + "epoch": 0.5320500481231953, + "grad_norm": 1.418361783027649, + "learning_rate": 0.00016710609684354074, + "loss": 1.1139, + "step": 13820 + }, + { + "epoch": 0.5322425409047161, + "grad_norm": 1.5918070077896118, + "learning_rate": 0.00016708367406628938, + "loss": 1.1045, + "step": 13825 + }, + { + "epoch": 0.5324350336862368, + "grad_norm": 1.1937044858932495, + "learning_rate": 0.00016706124515468042, + "loss": 1.2665, + "step": 13830 + }, + { + "epoch": 0.5326275264677575, + "grad_norm": 1.60366952419281, + "learning_rate": 0.00016703881011076482, + "loss": 1.3277, + "step": 13835 + }, + { + "epoch": 0.5328200192492781, + "grad_norm": 1.2769535779953003, + "learning_rate": 0.00016701636893659414, + "loss": 1.2517, + "step": 13840 + }, + { + "epoch": 0.5330125120307988, + "grad_norm": 1.3906430006027222, + "learning_rate": 0.00016699392163422043, + "loss": 1.3485, + "step": 13845 + }, + { + "epoch": 0.5332050048123196, + "grad_norm": 1.461391568183899, + "learning_rate": 0.0001669714682056964, + "loss": 1.1297, + "step": 13850 + }, + { + "epoch": 0.5333974975938403, + "grad_norm": 1.3566093444824219, + "learning_rate": 0.00016694900865307525, + "loss": 1.2833, + "step": 13855 + }, + { + "epoch": 0.5335899903753609, + "grad_norm": 1.4480105638504028, + "learning_rate": 0.00016692654297841076, + "loss": 1.0877, + "step": 13860 + }, + { + "epoch": 0.5337824831568816, + "grad_norm": 1.0896391868591309, + "learning_rate": 0.00016690407118375724, + "loss": 1.1286, + "step": 13865 + }, + { + "epoch": 0.5339749759384023, + "grad_norm": 1.101636290550232, + "learning_rate": 0.00016688159327116962, + "loss": 1.0802, + "step": 13870 + }, + { + "epoch": 0.534167468719923, + "grad_norm": 1.1488208770751953, + "learning_rate": 0.00016685910924270337, + "loss": 1.144, + "step": 13875 + }, + { + "epoch": 0.5343599615014437, + "grad_norm": 1.4691115617752075, + "learning_rate": 0.00016683661910041445, + "loss": 1.2133, + "step": 13880 + }, + { + "epoch": 0.5345524542829644, + "grad_norm": 0.9920752048492432, + "learning_rate": 0.0001668141228463595, + "loss": 1.1326, + "step": 13885 + }, + { + "epoch": 0.534744947064485, + "grad_norm": 1.2828654050827026, + "learning_rate": 0.00016679162048259557, + "loss": 1.2162, + "step": 13890 + }, + { + "epoch": 0.5349374398460057, + "grad_norm": 1.3294516801834106, + "learning_rate": 0.00016676911201118043, + "loss": 1.1797, + "step": 13895 + }, + { + "epoch": 0.5351299326275265, + "grad_norm": 1.5326685905456543, + "learning_rate": 0.00016674659743417232, + "loss": 1.1147, + "step": 13900 + }, + { + "epoch": 0.5353224254090472, + "grad_norm": 1.9222960472106934, + "learning_rate": 0.00016672407675363, + "loss": 1.1615, + "step": 13905 + }, + { + "epoch": 0.5355149181905678, + "grad_norm": 1.412458062171936, + "learning_rate": 0.00016670154997161288, + "loss": 1.1556, + "step": 13910 + }, + { + "epoch": 0.5357074109720885, + "grad_norm": 1.230669617652893, + "learning_rate": 0.00016667901709018087, + "loss": 1.062, + "step": 13915 + }, + { + "epoch": 0.5358999037536092, + "grad_norm": 1.431746006011963, + "learning_rate": 0.00016665647811139444, + "loss": 1.0561, + "step": 13920 + }, + { + "epoch": 0.53609239653513, + "grad_norm": 1.6623647212982178, + "learning_rate": 0.00016663393303731466, + "loss": 1.1495, + "step": 13925 + }, + { + "epoch": 0.5362848893166506, + "grad_norm": 1.5261880159378052, + "learning_rate": 0.00016661138187000312, + "loss": 1.3093, + "step": 13930 + }, + { + "epoch": 0.5364773820981713, + "grad_norm": 1.5623407363891602, + "learning_rate": 0.00016658882461152195, + "loss": 1.0859, + "step": 13935 + }, + { + "epoch": 0.536669874879692, + "grad_norm": 1.2155213356018066, + "learning_rate": 0.0001665662612639339, + "loss": 1.2502, + "step": 13940 + }, + { + "epoch": 0.5368623676612128, + "grad_norm": 0.7948794364929199, + "learning_rate": 0.0001665436918293022, + "loss": 1.1741, + "step": 13945 + }, + { + "epoch": 0.5370548604427334, + "grad_norm": 1.370322585105896, + "learning_rate": 0.0001665211163096907, + "loss": 1.2727, + "step": 13950 + }, + { + "epoch": 0.5372473532242541, + "grad_norm": 1.146519660949707, + "learning_rate": 0.00016649853470716378, + "loss": 1.2603, + "step": 13955 + }, + { + "epoch": 0.5374398460057748, + "grad_norm": 1.1492048501968384, + "learning_rate": 0.00016647594702378637, + "loss": 1.1772, + "step": 13960 + }, + { + "epoch": 0.5376323387872954, + "grad_norm": 2.4730112552642822, + "learning_rate": 0.00016645335326162397, + "loss": 1.4024, + "step": 13965 + }, + { + "epoch": 0.5378248315688162, + "grad_norm": 1.411889910697937, + "learning_rate": 0.00016643075342274264, + "loss": 1.1121, + "step": 13970 + }, + { + "epoch": 0.5380173243503369, + "grad_norm": 1.0818617343902588, + "learning_rate": 0.00016640814750920895, + "loss": 1.2139, + "step": 13975 + }, + { + "epoch": 0.5382098171318576, + "grad_norm": 1.1196002960205078, + "learning_rate": 0.0001663855355230901, + "loss": 1.0877, + "step": 13980 + }, + { + "epoch": 0.5384023099133782, + "grad_norm": 1.5476993322372437, + "learning_rate": 0.00016636291746645378, + "loss": 1.1055, + "step": 13985 + }, + { + "epoch": 0.5385948026948989, + "grad_norm": 0.924186646938324, + "learning_rate": 0.00016634029334136827, + "loss": 1.0307, + "step": 13990 + }, + { + "epoch": 0.5387872954764197, + "grad_norm": 1.157355546951294, + "learning_rate": 0.0001663176631499024, + "loss": 1.0783, + "step": 13995 + }, + { + "epoch": 0.5389797882579404, + "grad_norm": 1.1704423427581787, + "learning_rate": 0.00016629502689412555, + "loss": 1.3452, + "step": 14000 + }, + { + "epoch": 0.539172281039461, + "grad_norm": 2.0251457691192627, + "learning_rate": 0.00016627238457610766, + "loss": 1.3611, + "step": 14005 + }, + { + "epoch": 0.5393647738209817, + "grad_norm": 1.018612265586853, + "learning_rate": 0.0001662497361979192, + "loss": 1.115, + "step": 14010 + }, + { + "epoch": 0.5395572666025024, + "grad_norm": 1.2389349937438965, + "learning_rate": 0.00016622708176163126, + "loss": 1.2055, + "step": 14015 + }, + { + "epoch": 0.5397497593840231, + "grad_norm": 2.2555086612701416, + "learning_rate": 0.0001662044212693154, + "loss": 1.0512, + "step": 14020 + }, + { + "epoch": 0.5399422521655438, + "grad_norm": 1.059856653213501, + "learning_rate": 0.00016618175472304375, + "loss": 1.2114, + "step": 14025 + }, + { + "epoch": 0.5401347449470645, + "grad_norm": 1.484417200088501, + "learning_rate": 0.00016615908212488906, + "loss": 1.1872, + "step": 14030 + }, + { + "epoch": 0.5403272377285852, + "grad_norm": 1.4816780090332031, + "learning_rate": 0.00016613640347692458, + "loss": 1.1261, + "step": 14035 + }, + { + "epoch": 0.5405197305101058, + "grad_norm": 1.6735597848892212, + "learning_rate": 0.00016611371878122412, + "loss": 1.2311, + "step": 14040 + }, + { + "epoch": 0.5407122232916266, + "grad_norm": 1.8882919549942017, + "learning_rate": 0.00016609102803986204, + "loss": 1.3099, + "step": 14045 + }, + { + "epoch": 0.5409047160731473, + "grad_norm": 1.4272384643554688, + "learning_rate": 0.00016606833125491327, + "loss": 1.2343, + "step": 14050 + }, + { + "epoch": 0.5410972088546679, + "grad_norm": 1.2361105680465698, + "learning_rate": 0.0001660456284284532, + "loss": 1.155, + "step": 14055 + }, + { + "epoch": 0.5412897016361886, + "grad_norm": 1.294826626777649, + "learning_rate": 0.000166022919562558, + "loss": 1.0691, + "step": 14060 + }, + { + "epoch": 0.5414821944177093, + "grad_norm": 2.163748264312744, + "learning_rate": 0.00016600020465930415, + "loss": 1.4603, + "step": 14065 + }, + { + "epoch": 0.5416746871992301, + "grad_norm": 2.8181777000427246, + "learning_rate": 0.00016597748372076878, + "loss": 1.1513, + "step": 14070 + }, + { + "epoch": 0.5418671799807507, + "grad_norm": 1.558497667312622, + "learning_rate": 0.00016595475674902957, + "loss": 1.1758, + "step": 14075 + }, + { + "epoch": 0.5420596727622714, + "grad_norm": 1.5868738889694214, + "learning_rate": 0.0001659320237461648, + "loss": 1.1867, + "step": 14080 + }, + { + "epoch": 0.5422521655437921, + "grad_norm": 0.850387692451477, + "learning_rate": 0.0001659092847142532, + "loss": 0.8849, + "step": 14085 + }, + { + "epoch": 0.5424446583253129, + "grad_norm": 1.334726095199585, + "learning_rate": 0.00016588653965537412, + "loss": 1.252, + "step": 14090 + }, + { + "epoch": 0.5426371511068335, + "grad_norm": 1.1548973321914673, + "learning_rate": 0.00016586378857160743, + "loss": 1.2255, + "step": 14095 + }, + { + "epoch": 0.5428296438883542, + "grad_norm": 1.3282769918441772, + "learning_rate": 0.00016584103146503364, + "loss": 1.0991, + "step": 14100 + }, + { + "epoch": 0.5430221366698749, + "grad_norm": 1.635657548904419, + "learning_rate": 0.00016581826833773363, + "loss": 1.1963, + "step": 14105 + }, + { + "epoch": 0.5432146294513955, + "grad_norm": 1.7892380952835083, + "learning_rate": 0.00016579549919178903, + "loss": 1.0593, + "step": 14110 + }, + { + "epoch": 0.5434071222329163, + "grad_norm": 2.381394147872925, + "learning_rate": 0.00016577272402928183, + "loss": 1.2743, + "step": 14115 + }, + { + "epoch": 0.543599615014437, + "grad_norm": 1.1770328283309937, + "learning_rate": 0.00016574994285229478, + "loss": 1.1433, + "step": 14120 + }, + { + "epoch": 0.5437921077959577, + "grad_norm": 1.9077178239822388, + "learning_rate": 0.00016572715566291098, + "loss": 1.2422, + "step": 14125 + }, + { + "epoch": 0.5439846005774783, + "grad_norm": 1.2600334882736206, + "learning_rate": 0.00016570436246321417, + "loss": 1.1479, + "step": 14130 + }, + { + "epoch": 0.544177093358999, + "grad_norm": 1.0997780561447144, + "learning_rate": 0.0001656815632552887, + "loss": 1.2516, + "step": 14135 + }, + { + "epoch": 0.5443695861405198, + "grad_norm": 1.1767383813858032, + "learning_rate": 0.00016565875804121935, + "loss": 1.1713, + "step": 14140 + }, + { + "epoch": 0.5445620789220404, + "grad_norm": 1.62860906124115, + "learning_rate": 0.00016563594682309152, + "loss": 1.2017, + "step": 14145 + }, + { + "epoch": 0.5447545717035611, + "grad_norm": 1.6149252653121948, + "learning_rate": 0.0001656131296029912, + "loss": 1.2104, + "step": 14150 + }, + { + "epoch": 0.5449470644850818, + "grad_norm": 1.0693351030349731, + "learning_rate": 0.0001655903063830048, + "loss": 1.2656, + "step": 14155 + }, + { + "epoch": 0.5451395572666025, + "grad_norm": 1.4624438285827637, + "learning_rate": 0.00016556747716521937, + "loss": 1.2323, + "step": 14160 + }, + { + "epoch": 0.5453320500481232, + "grad_norm": 1.8848096132278442, + "learning_rate": 0.0001655446419517225, + "loss": 1.2104, + "step": 14165 + }, + { + "epoch": 0.5455245428296439, + "grad_norm": 1.076907753944397, + "learning_rate": 0.00016552180074460231, + "loss": 1.2503, + "step": 14170 + }, + { + "epoch": 0.5457170356111646, + "grad_norm": 2.496718645095825, + "learning_rate": 0.00016549895354594748, + "loss": 1.146, + "step": 14175 + }, + { + "epoch": 0.5459095283926853, + "grad_norm": 1.8133556842803955, + "learning_rate": 0.00016547610035784724, + "loss": 1.274, + "step": 14180 + }, + { + "epoch": 0.5461020211742059, + "grad_norm": 1.1353720426559448, + "learning_rate": 0.0001654532411823914, + "loss": 1.3842, + "step": 14185 + }, + { + "epoch": 0.5462945139557267, + "grad_norm": 2.368894577026367, + "learning_rate": 0.00016543037602167017, + "loss": 1.3566, + "step": 14190 + }, + { + "epoch": 0.5464870067372474, + "grad_norm": 1.884104609489441, + "learning_rate": 0.00016540750487777455, + "loss": 1.15, + "step": 14195 + }, + { + "epoch": 0.546679499518768, + "grad_norm": 1.1348326206207275, + "learning_rate": 0.00016538462775279587, + "loss": 1.1782, + "step": 14200 + }, + { + "epoch": 0.5468719923002887, + "grad_norm": 1.2342017889022827, + "learning_rate": 0.00016536174464882613, + "loss": 1.1361, + "step": 14205 + }, + { + "epoch": 0.5470644850818094, + "grad_norm": 1.0037345886230469, + "learning_rate": 0.0001653388555679578, + "loss": 1.1282, + "step": 14210 + }, + { + "epoch": 0.5472569778633302, + "grad_norm": 2.8669965267181396, + "learning_rate": 0.000165315960512284, + "loss": 1.184, + "step": 14215 + }, + { + "epoch": 0.5474494706448508, + "grad_norm": 1.0212280750274658, + "learning_rate": 0.00016529305948389825, + "loss": 1.1422, + "step": 14220 + }, + { + "epoch": 0.5476419634263715, + "grad_norm": 1.1197772026062012, + "learning_rate": 0.00016527015248489474, + "loss": 1.077, + "step": 14225 + }, + { + "epoch": 0.5478344562078922, + "grad_norm": 1.4821882247924805, + "learning_rate": 0.0001652472395173682, + "loss": 1.3187, + "step": 14230 + }, + { + "epoch": 0.5480269489894128, + "grad_norm": 1.1993844509124756, + "learning_rate": 0.00016522432058341377, + "loss": 1.1834, + "step": 14235 + }, + { + "epoch": 0.5482194417709336, + "grad_norm": 1.9386481046676636, + "learning_rate": 0.00016520139568512734, + "loss": 1.1461, + "step": 14240 + }, + { + "epoch": 0.5484119345524543, + "grad_norm": 0.8914703130722046, + "learning_rate": 0.00016517846482460517, + "loss": 1.4175, + "step": 14245 + }, + { + "epoch": 0.548604427333975, + "grad_norm": 1.8703666925430298, + "learning_rate": 0.00016515552800394417, + "loss": 1.2483, + "step": 14250 + }, + { + "epoch": 0.5487969201154956, + "grad_norm": 1.1656851768493652, + "learning_rate": 0.00016513258522524177, + "loss": 1.2293, + "step": 14255 + }, + { + "epoch": 0.5489894128970164, + "grad_norm": 1.402370810508728, + "learning_rate": 0.0001651096364905959, + "loss": 1.3326, + "step": 14260 + }, + { + "epoch": 0.5491819056785371, + "grad_norm": 1.8804208040237427, + "learning_rate": 0.00016508668180210506, + "loss": 1.1033, + "step": 14265 + }, + { + "epoch": 0.5493743984600578, + "grad_norm": 1.0970590114593506, + "learning_rate": 0.00016506372116186836, + "loss": 1.1887, + "step": 14270 + }, + { + "epoch": 0.5495668912415784, + "grad_norm": 1.3364982604980469, + "learning_rate": 0.00016504075457198533, + "loss": 1.1183, + "step": 14275 + }, + { + "epoch": 0.5497593840230991, + "grad_norm": 1.4718800783157349, + "learning_rate": 0.0001650177820345562, + "loss": 1.1305, + "step": 14280 + }, + { + "epoch": 0.5499518768046199, + "grad_norm": 1.3023836612701416, + "learning_rate": 0.00016499480355168156, + "loss": 1.2267, + "step": 14285 + }, + { + "epoch": 0.5501443695861405, + "grad_norm": 1.809346079826355, + "learning_rate": 0.0001649718191254627, + "loss": 1.081, + "step": 14290 + }, + { + "epoch": 0.5503368623676612, + "grad_norm": 1.3828262090682983, + "learning_rate": 0.0001649488287580014, + "loss": 1.209, + "step": 14295 + }, + { + "epoch": 0.5505293551491819, + "grad_norm": 1.4741365909576416, + "learning_rate": 0.00016492583245139995, + "loss": 1.0607, + "step": 14300 + }, + { + "epoch": 0.5507218479307026, + "grad_norm": 1.057210922241211, + "learning_rate": 0.0001649028302077612, + "loss": 1.281, + "step": 14305 + }, + { + "epoch": 0.5509143407122233, + "grad_norm": 2.588911294937134, + "learning_rate": 0.00016487982202918858, + "loss": 1.382, + "step": 14310 + }, + { + "epoch": 0.551106833493744, + "grad_norm": 2.2811248302459717, + "learning_rate": 0.00016485680791778604, + "loss": 1.3173, + "step": 14315 + }, + { + "epoch": 0.5512993262752647, + "grad_norm": 1.675776481628418, + "learning_rate": 0.00016483378787565802, + "loss": 1.1948, + "step": 14320 + }, + { + "epoch": 0.5514918190567853, + "grad_norm": 1.1149309873580933, + "learning_rate": 0.0001648107619049096, + "loss": 1.1406, + "step": 14325 + }, + { + "epoch": 0.551684311838306, + "grad_norm": 1.0165066719055176, + "learning_rate": 0.00016478773000764635, + "loss": 1.1491, + "step": 14330 + }, + { + "epoch": 0.5518768046198268, + "grad_norm": 1.8692020177841187, + "learning_rate": 0.00016476469218597433, + "loss": 1.0848, + "step": 14335 + }, + { + "epoch": 0.5520692974013475, + "grad_norm": 0.9627811908721924, + "learning_rate": 0.0001647416484420003, + "loss": 1.2159, + "step": 14340 + }, + { + "epoch": 0.5522617901828681, + "grad_norm": 2.1085097789764404, + "learning_rate": 0.00016471859877783133, + "loss": 1.1551, + "step": 14345 + }, + { + "epoch": 0.5524542829643888, + "grad_norm": 2.2478790283203125, + "learning_rate": 0.00016469554319557527, + "loss": 1.3081, + "step": 14350 + }, + { + "epoch": 0.5526467757459095, + "grad_norm": 1.0580302476882935, + "learning_rate": 0.00016467248169734037, + "loss": 1.3293, + "step": 14355 + }, + { + "epoch": 0.5528392685274303, + "grad_norm": 1.3953101634979248, + "learning_rate": 0.00016464941428523538, + "loss": 1.1256, + "step": 14360 + }, + { + "epoch": 0.5530317613089509, + "grad_norm": 0.9302542209625244, + "learning_rate": 0.0001646263409613697, + "loss": 0.9645, + "step": 14365 + }, + { + "epoch": 0.5532242540904716, + "grad_norm": 1.9415937662124634, + "learning_rate": 0.00016460326172785332, + "loss": 1.3428, + "step": 14370 + }, + { + "epoch": 0.5534167468719923, + "grad_norm": 0.9449756145477295, + "learning_rate": 0.00016458017658679656, + "loss": 1.3183, + "step": 14375 + }, + { + "epoch": 0.5536092396535129, + "grad_norm": 1.2944326400756836, + "learning_rate": 0.00016455708554031047, + "loss": 1.1277, + "step": 14380 + }, + { + "epoch": 0.5538017324350337, + "grad_norm": 1.3632171154022217, + "learning_rate": 0.00016453398859050657, + "loss": 1.3262, + "step": 14385 + }, + { + "epoch": 0.5539942252165544, + "grad_norm": 1.119086503982544, + "learning_rate": 0.00016451088573949692, + "loss": 1.2639, + "step": 14390 + }, + { + "epoch": 0.5541867179980751, + "grad_norm": 1.3261640071868896, + "learning_rate": 0.00016448777698939407, + "loss": 1.0911, + "step": 14395 + }, + { + "epoch": 0.5543792107795957, + "grad_norm": 1.6098653078079224, + "learning_rate": 0.00016446466234231125, + "loss": 1.2942, + "step": 14400 + }, + { + "epoch": 0.5545717035611165, + "grad_norm": 2.1425249576568604, + "learning_rate": 0.0001644415418003621, + "loss": 1.1566, + "step": 14405 + }, + { + "epoch": 0.5547641963426372, + "grad_norm": 1.0087484121322632, + "learning_rate": 0.0001644184153656608, + "loss": 1.1515, + "step": 14410 + }, + { + "epoch": 0.5549566891241579, + "grad_norm": 1.3792825937271118, + "learning_rate": 0.00016439528304032218, + "loss": 1.3815, + "step": 14415 + }, + { + "epoch": 0.5551491819056785, + "grad_norm": 1.0076264142990112, + "learning_rate": 0.0001643721448264615, + "loss": 1.1996, + "step": 14420 + }, + { + "epoch": 0.5553416746871992, + "grad_norm": 1.6108455657958984, + "learning_rate": 0.0001643490007261946, + "loss": 1.2801, + "step": 14425 + }, + { + "epoch": 0.55553416746872, + "grad_norm": 1.4850428104400635, + "learning_rate": 0.00016432585074163783, + "loss": 1.1272, + "step": 14430 + }, + { + "epoch": 0.5557266602502406, + "grad_norm": 1.482926607131958, + "learning_rate": 0.0001643026948749082, + "loss": 0.9271, + "step": 14435 + }, + { + "epoch": 0.5559191530317613, + "grad_norm": 1.404266119003296, + "learning_rate": 0.000164279533128123, + "loss": 1.2476, + "step": 14440 + }, + { + "epoch": 0.556111645813282, + "grad_norm": 1.5951578617095947, + "learning_rate": 0.00016425636550340035, + "loss": 1.036, + "step": 14445 + }, + { + "epoch": 0.5563041385948027, + "grad_norm": 1.11802339553833, + "learning_rate": 0.00016423319200285877, + "loss": 1.2595, + "step": 14450 + }, + { + "epoch": 0.5564966313763234, + "grad_norm": 0.9702684283256531, + "learning_rate": 0.00016421001262861723, + "loss": 1.1478, + "step": 14455 + }, + { + "epoch": 0.5566891241578441, + "grad_norm": 1.4077606201171875, + "learning_rate": 0.00016418682738279542, + "loss": 1.2807, + "step": 14460 + }, + { + "epoch": 0.5568816169393648, + "grad_norm": 1.5000783205032349, + "learning_rate": 0.00016416363626751344, + "loss": 1.3231, + "step": 14465 + }, + { + "epoch": 0.5570741097208854, + "grad_norm": 1.0804152488708496, + "learning_rate": 0.00016414043928489195, + "loss": 1.2609, + "step": 14470 + }, + { + "epoch": 0.5572666025024061, + "grad_norm": 2.0902814865112305, + "learning_rate": 0.0001641172364370522, + "loss": 1.1007, + "step": 14475 + }, + { + "epoch": 0.5574590952839269, + "grad_norm": 0.9129114151000977, + "learning_rate": 0.0001640940277261159, + "loss": 1.2385, + "step": 14480 + }, + { + "epoch": 0.5576515880654476, + "grad_norm": 1.5251227617263794, + "learning_rate": 0.0001640708131542054, + "loss": 1.3068, + "step": 14485 + }, + { + "epoch": 0.5578440808469682, + "grad_norm": 1.7822771072387695, + "learning_rate": 0.00016404759272344342, + "loss": 1.2942, + "step": 14490 + }, + { + "epoch": 0.5580365736284889, + "grad_norm": 1.7675615549087524, + "learning_rate": 0.00016402436643595336, + "loss": 0.9753, + "step": 14495 + }, + { + "epoch": 0.5582290664100096, + "grad_norm": 1.4113742113113403, + "learning_rate": 0.0001640011342938591, + "loss": 1.2727, + "step": 14500 + }, + { + "epoch": 0.5584215591915304, + "grad_norm": 2.213724136352539, + "learning_rate": 0.0001639778962992851, + "loss": 1.1275, + "step": 14505 + }, + { + "epoch": 0.558614051973051, + "grad_norm": 0.777229368686676, + "learning_rate": 0.0001639546524543563, + "loss": 1.0434, + "step": 14510 + }, + { + "epoch": 0.5588065447545717, + "grad_norm": 1.0420740842819214, + "learning_rate": 0.00016393140276119817, + "loss": 1.1202, + "step": 14515 + }, + { + "epoch": 0.5589990375360924, + "grad_norm": 1.4241138696670532, + "learning_rate": 0.00016390814722193678, + "loss": 1.0245, + "step": 14520 + }, + { + "epoch": 0.559191530317613, + "grad_norm": 1.1826037168502808, + "learning_rate": 0.00016388488583869872, + "loss": 1.1894, + "step": 14525 + }, + { + "epoch": 0.5593840230991338, + "grad_norm": 1.136072039604187, + "learning_rate": 0.000163861618613611, + "loss": 1.2093, + "step": 14530 + }, + { + "epoch": 0.5595765158806545, + "grad_norm": 1.0932581424713135, + "learning_rate": 0.0001638383455488013, + "loss": 1.26, + "step": 14535 + }, + { + "epoch": 0.5597690086621752, + "grad_norm": 1.4892606735229492, + "learning_rate": 0.00016381506664639784, + "loss": 1.0244, + "step": 14540 + }, + { + "epoch": 0.5599615014436958, + "grad_norm": 1.4259272813796997, + "learning_rate": 0.0001637917819085292, + "loss": 0.9896, + "step": 14545 + }, + { + "epoch": 0.5601539942252165, + "grad_norm": 1.0615971088409424, + "learning_rate": 0.00016376849133732473, + "loss": 1.1619, + "step": 14550 + }, + { + "epoch": 0.5603464870067373, + "grad_norm": 0.8815811276435852, + "learning_rate": 0.00016374519493491413, + "loss": 1.1123, + "step": 14555 + }, + { + "epoch": 0.560538979788258, + "grad_norm": 1.2956461906433105, + "learning_rate": 0.00016372189270342778, + "loss": 1.1978, + "step": 14560 + }, + { + "epoch": 0.5607314725697786, + "grad_norm": 1.8797427415847778, + "learning_rate": 0.00016369858464499641, + "loss": 1.4186, + "step": 14565 + }, + { + "epoch": 0.5609239653512993, + "grad_norm": 1.6631108522415161, + "learning_rate": 0.00016367527076175143, + "loss": 0.9839, + "step": 14570 + }, + { + "epoch": 0.5611164581328201, + "grad_norm": 1.8200160264968872, + "learning_rate": 0.0001636519510558248, + "loss": 1.1272, + "step": 14575 + }, + { + "epoch": 0.5613089509143407, + "grad_norm": 1.884712815284729, + "learning_rate": 0.00016362862552934886, + "loss": 1.15, + "step": 14580 + }, + { + "epoch": 0.5615014436958614, + "grad_norm": 0.7094476222991943, + "learning_rate": 0.00016360529418445662, + "loss": 0.9581, + "step": 14585 + }, + { + "epoch": 0.5616939364773821, + "grad_norm": 0.9652591347694397, + "learning_rate": 0.00016358195702328158, + "loss": 1.0858, + "step": 14590 + }, + { + "epoch": 0.5618864292589028, + "grad_norm": 1.3010308742523193, + "learning_rate": 0.00016355861404795778, + "loss": 1.2491, + "step": 14595 + }, + { + "epoch": 0.5620789220404235, + "grad_norm": 1.459953784942627, + "learning_rate": 0.00016353526526061973, + "loss": 1.1194, + "step": 14600 + }, + { + "epoch": 0.5622714148219442, + "grad_norm": 1.0818215608596802, + "learning_rate": 0.0001635119106634026, + "loss": 1.202, + "step": 14605 + }, + { + "epoch": 0.5624639076034649, + "grad_norm": 1.0625619888305664, + "learning_rate": 0.0001634885502584419, + "loss": 1.3284, + "step": 14610 + }, + { + "epoch": 0.5626564003849855, + "grad_norm": 1.5708478689193726, + "learning_rate": 0.0001634651840478739, + "loss": 1.036, + "step": 14615 + }, + { + "epoch": 0.5628488931665062, + "grad_norm": 1.2847293615341187, + "learning_rate": 0.00016344181203383523, + "loss": 1.0858, + "step": 14620 + }, + { + "epoch": 0.563041385948027, + "grad_norm": 1.082689881324768, + "learning_rate": 0.00016341843421846313, + "loss": 1.3457, + "step": 14625 + }, + { + "epoch": 0.5632338787295477, + "grad_norm": 1.9000965356826782, + "learning_rate": 0.0001633950506038953, + "loss": 1.3901, + "step": 14630 + }, + { + "epoch": 0.5634263715110683, + "grad_norm": 1.4664018154144287, + "learning_rate": 0.0001633716611922701, + "loss": 1.0836, + "step": 14635 + }, + { + "epoch": 0.563618864292589, + "grad_norm": 1.6126337051391602, + "learning_rate": 0.0001633482659857262, + "loss": 1.0794, + "step": 14640 + }, + { + "epoch": 0.5638113570741097, + "grad_norm": 1.865504503250122, + "learning_rate": 0.00016332486498640307, + "loss": 0.9427, + "step": 14645 + }, + { + "epoch": 0.5640038498556305, + "grad_norm": 1.4346791505813599, + "learning_rate": 0.0001633014581964405, + "loss": 1.1952, + "step": 14650 + }, + { + "epoch": 0.5641963426371511, + "grad_norm": 1.3558484315872192, + "learning_rate": 0.00016327804561797895, + "loss": 1.1679, + "step": 14655 + }, + { + "epoch": 0.5643888354186718, + "grad_norm": 1.3297834396362305, + "learning_rate": 0.00016325462725315926, + "loss": 1.2225, + "step": 14660 + }, + { + "epoch": 0.5645813282001925, + "grad_norm": 2.106694221496582, + "learning_rate": 0.00016323120310412297, + "loss": 1.072, + "step": 14665 + }, + { + "epoch": 0.5647738209817131, + "grad_norm": 1.284629225730896, + "learning_rate": 0.00016320777317301198, + "loss": 1.0004, + "step": 14670 + }, + { + "epoch": 0.5649663137632339, + "grad_norm": 1.4289201498031616, + "learning_rate": 0.0001631843374619689, + "loss": 1.1239, + "step": 14675 + }, + { + "epoch": 0.5651588065447546, + "grad_norm": 1.9027820825576782, + "learning_rate": 0.0001631608959731367, + "loss": 1.2137, + "step": 14680 + }, + { + "epoch": 0.5653512993262753, + "grad_norm": 1.878009557723999, + "learning_rate": 0.00016313744870865895, + "loss": 1.247, + "step": 14685 + }, + { + "epoch": 0.5655437921077959, + "grad_norm": 1.4919451475143433, + "learning_rate": 0.00016311399567067974, + "loss": 1.3506, + "step": 14690 + }, + { + "epoch": 0.5657362848893166, + "grad_norm": 2.0583205223083496, + "learning_rate": 0.00016309053686134378, + "loss": 1.1191, + "step": 14695 + }, + { + "epoch": 0.5659287776708374, + "grad_norm": 1.1545616388320923, + "learning_rate": 0.00016306707228279615, + "loss": 1.2105, + "step": 14700 + }, + { + "epoch": 0.566121270452358, + "grad_norm": 0.8714199662208557, + "learning_rate": 0.0001630436019371825, + "loss": 1.0834, + "step": 14705 + }, + { + "epoch": 0.5663137632338787, + "grad_norm": 2.1866228580474854, + "learning_rate": 0.0001630201258266491, + "loss": 1.3334, + "step": 14710 + }, + { + "epoch": 0.5665062560153994, + "grad_norm": 1.4117622375488281, + "learning_rate": 0.00016299664395334266, + "loss": 1.1353, + "step": 14715 + }, + { + "epoch": 0.5666987487969202, + "grad_norm": 1.5454515218734741, + "learning_rate": 0.00016297315631941045, + "loss": 1.096, + "step": 14720 + }, + { + "epoch": 0.5668912415784408, + "grad_norm": 1.1799986362457275, + "learning_rate": 0.00016294966292700026, + "loss": 1.214, + "step": 14725 + }, + { + "epoch": 0.5670837343599615, + "grad_norm": 1.2906007766723633, + "learning_rate": 0.00016292616377826038, + "loss": 1.2613, + "step": 14730 + }, + { + "epoch": 0.5672762271414822, + "grad_norm": 2.8731329441070557, + "learning_rate": 0.00016290265887533968, + "loss": 1.3257, + "step": 14735 + }, + { + "epoch": 0.5674687199230029, + "grad_norm": 1.0078117847442627, + "learning_rate": 0.0001628791482203875, + "loss": 1.2053, + "step": 14740 + }, + { + "epoch": 0.5676612127045236, + "grad_norm": 1.05767023563385, + "learning_rate": 0.0001628556318155538, + "loss": 0.9775, + "step": 14745 + }, + { + "epoch": 0.5678537054860443, + "grad_norm": 2.118110418319702, + "learning_rate": 0.0001628321096629889, + "loss": 1.1801, + "step": 14750 + }, + { + "epoch": 0.568046198267565, + "grad_norm": 1.1577699184417725, + "learning_rate": 0.00016280858176484384, + "loss": 1.2156, + "step": 14755 + }, + { + "epoch": 0.5682386910490856, + "grad_norm": 1.5565030574798584, + "learning_rate": 0.00016278504812327002, + "loss": 1.0586, + "step": 14760 + }, + { + "epoch": 0.5684311838306063, + "grad_norm": 1.5205986499786377, + "learning_rate": 0.00016276150874041946, + "loss": 1.3679, + "step": 14765 + }, + { + "epoch": 0.5686236766121271, + "grad_norm": 0.9402291774749756, + "learning_rate": 0.00016273796361844468, + "loss": 1.0996, + "step": 14770 + }, + { + "epoch": 0.5688161693936478, + "grad_norm": 1.3806294202804565, + "learning_rate": 0.00016271441275949875, + "loss": 1.1815, + "step": 14775 + }, + { + "epoch": 0.5690086621751684, + "grad_norm": 2.0714609622955322, + "learning_rate": 0.0001626908561657352, + "loss": 1.1945, + "step": 14780 + }, + { + "epoch": 0.5692011549566891, + "grad_norm": 0.9732249975204468, + "learning_rate": 0.00016266729383930816, + "loss": 1.0233, + "step": 14785 + }, + { + "epoch": 0.5693936477382098, + "grad_norm": 1.3748955726623535, + "learning_rate": 0.0001626437257823722, + "loss": 1.25, + "step": 14790 + }, + { + "epoch": 0.5695861405197306, + "grad_norm": 1.9781707525253296, + "learning_rate": 0.00016262015199708252, + "loss": 1.1745, + "step": 14795 + }, + { + "epoch": 0.5697786333012512, + "grad_norm": 1.5062282085418701, + "learning_rate": 0.00016259657248559475, + "loss": 1.098, + "step": 14800 + }, + { + "epoch": 0.5699711260827719, + "grad_norm": 1.7073885202407837, + "learning_rate": 0.0001625729872500651, + "loss": 1.1191, + "step": 14805 + }, + { + "epoch": 0.5701636188642926, + "grad_norm": 2.0891575813293457, + "learning_rate": 0.00016254939629265026, + "loss": 1.2533, + "step": 14810 + }, + { + "epoch": 0.5703561116458132, + "grad_norm": 1.6380434036254883, + "learning_rate": 0.0001625257996155075, + "loss": 1.2756, + "step": 14815 + }, + { + "epoch": 0.570548604427334, + "grad_norm": 1.1182715892791748, + "learning_rate": 0.00016250219722079452, + "loss": 1.3084, + "step": 14820 + }, + { + "epoch": 0.5707410972088547, + "grad_norm": 1.2113651037216187, + "learning_rate": 0.0001624785891106697, + "loss": 1.287, + "step": 14825 + }, + { + "epoch": 0.5709335899903754, + "grad_norm": 1.1726208925247192, + "learning_rate": 0.00016245497528729174, + "loss": 1.1491, + "step": 14830 + }, + { + "epoch": 0.571126082771896, + "grad_norm": 1.0203557014465332, + "learning_rate": 0.00016243135575282004, + "loss": 1.0809, + "step": 14835 + }, + { + "epoch": 0.5713185755534167, + "grad_norm": 1.2878923416137695, + "learning_rate": 0.00016240773050941443, + "loss": 1.1848, + "step": 14840 + }, + { + "epoch": 0.5715110683349375, + "grad_norm": 1.5805665254592896, + "learning_rate": 0.00016238409955923527, + "loss": 1.1191, + "step": 14845 + }, + { + "epoch": 0.5717035611164581, + "grad_norm": 1.089296579360962, + "learning_rate": 0.00016236046290444347, + "loss": 1.066, + "step": 14850 + }, + { + "epoch": 0.5718960538979788, + "grad_norm": 1.1492708921432495, + "learning_rate": 0.0001623368205472004, + "loss": 1.174, + "step": 14855 + }, + { + "epoch": 0.5720885466794995, + "grad_norm": 1.9744573831558228, + "learning_rate": 0.00016231317248966809, + "loss": 1.248, + "step": 14860 + }, + { + "epoch": 0.5722810394610202, + "grad_norm": 2.2061898708343506, + "learning_rate": 0.0001622895187340089, + "loss": 1.2028, + "step": 14865 + }, + { + "epoch": 0.5724735322425409, + "grad_norm": 1.0993640422821045, + "learning_rate": 0.0001622658592823859, + "loss": 1.059, + "step": 14870 + }, + { + "epoch": 0.5726660250240616, + "grad_norm": 1.5680936574935913, + "learning_rate": 0.00016224219413696252, + "loss": 1.4181, + "step": 14875 + }, + { + "epoch": 0.5728585178055823, + "grad_norm": 1.3295773267745972, + "learning_rate": 0.00016221852329990276, + "loss": 1.214, + "step": 14880 + }, + { + "epoch": 0.573051010587103, + "grad_norm": 1.2004729509353638, + "learning_rate": 0.00016219484677337126, + "loss": 1.1474, + "step": 14885 + }, + { + "epoch": 0.5732435033686237, + "grad_norm": 1.3868520259857178, + "learning_rate": 0.000162171164559533, + "loss": 1.2373, + "step": 14890 + }, + { + "epoch": 0.5734359961501444, + "grad_norm": 1.2218377590179443, + "learning_rate": 0.00016214747666055358, + "loss": 1.1009, + "step": 14895 + }, + { + "epoch": 0.5736284889316651, + "grad_norm": 1.1113415956497192, + "learning_rate": 0.00016212378307859914, + "loss": 1.2191, + "step": 14900 + }, + { + "epoch": 0.5738209817131857, + "grad_norm": 1.099223256111145, + "learning_rate": 0.00016210008381583623, + "loss": 1.2024, + "step": 14905 + }, + { + "epoch": 0.5740134744947064, + "grad_norm": 1.3597705364227295, + "learning_rate": 0.00016207637887443208, + "loss": 1.1785, + "step": 14910 + }, + { + "epoch": 0.5742059672762272, + "grad_norm": 1.675276279449463, + "learning_rate": 0.00016205266825655427, + "loss": 1.1492, + "step": 14915 + }, + { + "epoch": 0.5743984600577479, + "grad_norm": 1.5977553129196167, + "learning_rate": 0.000162028951964371, + "loss": 1.1355, + "step": 14920 + }, + { + "epoch": 0.5745909528392685, + "grad_norm": 2.0862395763397217, + "learning_rate": 0.000162005230000051, + "loss": 1.1747, + "step": 14925 + }, + { + "epoch": 0.5747834456207892, + "grad_norm": 0.8812354803085327, + "learning_rate": 0.00016198150236576347, + "loss": 1.2876, + "step": 14930 + }, + { + "epoch": 0.5749759384023099, + "grad_norm": 1.3878661394119263, + "learning_rate": 0.0001619577690636781, + "loss": 1.2269, + "step": 14935 + }, + { + "epoch": 0.5751684311838307, + "grad_norm": 1.0739976167678833, + "learning_rate": 0.0001619340300959652, + "loss": 1.2467, + "step": 14940 + }, + { + "epoch": 0.5753609239653513, + "grad_norm": 0.766392707824707, + "learning_rate": 0.0001619102854647955, + "loss": 1.0829, + "step": 14945 + }, + { + "epoch": 0.575553416746872, + "grad_norm": 1.2837680578231812, + "learning_rate": 0.00016188653517234036, + "loss": 1.2027, + "step": 14950 + }, + { + "epoch": 0.5757459095283927, + "grad_norm": 2.2257256507873535, + "learning_rate": 0.00016186277922077152, + "loss": 1.1181, + "step": 14955 + }, + { + "epoch": 0.5759384023099133, + "grad_norm": 1.257380723953247, + "learning_rate": 0.00016183901761226133, + "loss": 1.1899, + "step": 14960 + }, + { + "epoch": 0.5761308950914341, + "grad_norm": 1.1324365139007568, + "learning_rate": 0.00016181525034898261, + "loss": 1.0823, + "step": 14965 + }, + { + "epoch": 0.5763233878729548, + "grad_norm": 0.9696788787841797, + "learning_rate": 0.00016179147743310872, + "loss": 1.176, + "step": 14970 + }, + { + "epoch": 0.5765158806544755, + "grad_norm": 1.8557454347610474, + "learning_rate": 0.00016176769886681357, + "loss": 1.1396, + "step": 14975 + }, + { + "epoch": 0.5767083734359961, + "grad_norm": 1.2395600080490112, + "learning_rate": 0.00016174391465227154, + "loss": 1.0799, + "step": 14980 + }, + { + "epoch": 0.5769008662175168, + "grad_norm": 1.8957431316375732, + "learning_rate": 0.00016172012479165752, + "loss": 1.146, + "step": 14985 + }, + { + "epoch": 0.5770933589990376, + "grad_norm": 1.191486120223999, + "learning_rate": 0.00016169632928714697, + "loss": 1.0166, + "step": 14990 + }, + { + "epoch": 0.5772858517805582, + "grad_norm": 1.7964496612548828, + "learning_rate": 0.0001616725281409158, + "loss": 1.2131, + "step": 14995 + }, + { + "epoch": 0.5774783445620789, + "grad_norm": 1.4722768068313599, + "learning_rate": 0.00016164872135514044, + "loss": 1.0148, + "step": 15000 + }, + { + "epoch": 0.5776708373435996, + "grad_norm": 1.265663981437683, + "learning_rate": 0.00016162490893199791, + "loss": 1.1166, + "step": 15005 + }, + { + "epoch": 0.5778633301251203, + "grad_norm": 1.2796491384506226, + "learning_rate": 0.0001616010908736657, + "loss": 1.2911, + "step": 15010 + }, + { + "epoch": 0.578055822906641, + "grad_norm": 1.025158166885376, + "learning_rate": 0.00016157726718232177, + "loss": 1.0723, + "step": 15015 + }, + { + "epoch": 0.5782483156881617, + "grad_norm": 1.5206444263458252, + "learning_rate": 0.00016155343786014472, + "loss": 1.0406, + "step": 15020 + }, + { + "epoch": 0.5784408084696824, + "grad_norm": 1.5212637186050415, + "learning_rate": 0.0001615296029093135, + "loss": 1.0445, + "step": 15025 + }, + { + "epoch": 0.578633301251203, + "grad_norm": 1.3746932744979858, + "learning_rate": 0.0001615057623320077, + "loss": 1.1385, + "step": 15030 + }, + { + "epoch": 0.5788257940327238, + "grad_norm": 1.8660439252853394, + "learning_rate": 0.00016148191613040734, + "loss": 1.0786, + "step": 15035 + }, + { + "epoch": 0.5790182868142445, + "grad_norm": 2.343719720840454, + "learning_rate": 0.0001614580643066931, + "loss": 1.276, + "step": 15040 + }, + { + "epoch": 0.5792107795957652, + "grad_norm": 1.1358321905136108, + "learning_rate": 0.00016143420686304594, + "loss": 1.3055, + "step": 15045 + }, + { + "epoch": 0.5794032723772858, + "grad_norm": 1.6678638458251953, + "learning_rate": 0.00016141034380164754, + "loss": 1.0694, + "step": 15050 + }, + { + "epoch": 0.5795957651588065, + "grad_norm": 1.6096512079238892, + "learning_rate": 0.00016138647512468004, + "loss": 1.4079, + "step": 15055 + }, + { + "epoch": 0.5797882579403273, + "grad_norm": 2.3922042846679688, + "learning_rate": 0.000161362600834326, + "loss": 1.2739, + "step": 15060 + }, + { + "epoch": 0.579980750721848, + "grad_norm": 1.167476773262024, + "learning_rate": 0.0001613387209327686, + "loss": 1.2336, + "step": 15065 + }, + { + "epoch": 0.5801732435033686, + "grad_norm": 0.9550272226333618, + "learning_rate": 0.00016131483542219152, + "loss": 1.1557, + "step": 15070 + }, + { + "epoch": 0.5803657362848893, + "grad_norm": 1.1105631589889526, + "learning_rate": 0.00016129094430477893, + "loss": 1.0289, + "step": 15075 + }, + { + "epoch": 0.58055822906641, + "grad_norm": 1.3411318063735962, + "learning_rate": 0.00016126704758271548, + "loss": 1.2454, + "step": 15080 + }, + { + "epoch": 0.5807507218479308, + "grad_norm": 1.2867335081100464, + "learning_rate": 0.00016124314525818635, + "loss": 1.2983, + "step": 15085 + }, + { + "epoch": 0.5809432146294514, + "grad_norm": 1.6035441160202026, + "learning_rate": 0.00016121923733337736, + "loss": 1.2227, + "step": 15090 + }, + { + "epoch": 0.5811357074109721, + "grad_norm": 1.6657713651657104, + "learning_rate": 0.0001611953238104746, + "loss": 1.1144, + "step": 15095 + }, + { + "epoch": 0.5813282001924928, + "grad_norm": 1.8781518936157227, + "learning_rate": 0.00016117140469166486, + "loss": 1.1393, + "step": 15100 + }, + { + "epoch": 0.5815206929740134, + "grad_norm": 1.542438268661499, + "learning_rate": 0.00016114747997913542, + "loss": 1.188, + "step": 15105 + }, + { + "epoch": 0.5817131857555342, + "grad_norm": 2.148175001144409, + "learning_rate": 0.00016112354967507398, + "loss": 1.1323, + "step": 15110 + }, + { + "epoch": 0.5819056785370549, + "grad_norm": 1.3092713356018066, + "learning_rate": 0.0001610996137816688, + "loss": 1.1799, + "step": 15115 + }, + { + "epoch": 0.5820981713185756, + "grad_norm": 1.4203580617904663, + "learning_rate": 0.00016107567230110874, + "loss": 1.0916, + "step": 15120 + }, + { + "epoch": 0.5822906641000962, + "grad_norm": 1.2932054996490479, + "learning_rate": 0.00016105172523558301, + "loss": 1.13, + "step": 15125 + }, + { + "epoch": 0.5824831568816169, + "grad_norm": 2.218705654144287, + "learning_rate": 0.00016102777258728142, + "loss": 1.153, + "step": 15130 + }, + { + "epoch": 0.5826756496631377, + "grad_norm": 1.180166244506836, + "learning_rate": 0.00016100381435839433, + "loss": 1.2611, + "step": 15135 + }, + { + "epoch": 0.5828681424446583, + "grad_norm": 2.007887125015259, + "learning_rate": 0.00016097985055111256, + "loss": 1.2046, + "step": 15140 + }, + { + "epoch": 0.583060635226179, + "grad_norm": 1.20327889919281, + "learning_rate": 0.00016095588116762734, + "loss": 1.3217, + "step": 15145 + }, + { + "epoch": 0.5832531280076997, + "grad_norm": 1.2758903503417969, + "learning_rate": 0.00016093190621013063, + "loss": 1.1277, + "step": 15150 + }, + { + "epoch": 0.5834456207892204, + "grad_norm": 1.851881980895996, + "learning_rate": 0.00016090792568081473, + "loss": 1.1701, + "step": 15155 + }, + { + "epoch": 0.5836381135707411, + "grad_norm": 1.6895406246185303, + "learning_rate": 0.00016088393958187247, + "loss": 1.3331, + "step": 15160 + }, + { + "epoch": 0.5838306063522618, + "grad_norm": 1.4138762950897217, + "learning_rate": 0.0001608599479154973, + "loss": 1.3016, + "step": 15165 + }, + { + "epoch": 0.5840230991337825, + "grad_norm": 1.3571628332138062, + "learning_rate": 0.00016083595068388303, + "loss": 1.1407, + "step": 15170 + }, + { + "epoch": 0.5842155919153031, + "grad_norm": 1.3217098712921143, + "learning_rate": 0.00016081194788922405, + "loss": 1.0032, + "step": 15175 + }, + { + "epoch": 0.5844080846968238, + "grad_norm": 0.9765079617500305, + "learning_rate": 0.00016078793953371533, + "loss": 1.1543, + "step": 15180 + }, + { + "epoch": 0.5846005774783446, + "grad_norm": 1.0757596492767334, + "learning_rate": 0.0001607639256195522, + "loss": 1.0828, + "step": 15185 + }, + { + "epoch": 0.5847930702598653, + "grad_norm": 1.2296372652053833, + "learning_rate": 0.00016073990614893057, + "loss": 1.2089, + "step": 15190 + }, + { + "epoch": 0.5849855630413859, + "grad_norm": 1.8743308782577515, + "learning_rate": 0.00016071588112404693, + "loss": 1.2195, + "step": 15195 + }, + { + "epoch": 0.5851780558229066, + "grad_norm": 1.347332239151001, + "learning_rate": 0.00016069185054709814, + "loss": 1.1664, + "step": 15200 + }, + { + "epoch": 0.5853705486044274, + "grad_norm": 1.629981279373169, + "learning_rate": 0.00016066781442028165, + "loss": 1.1888, + "step": 15205 + }, + { + "epoch": 0.5855630413859481, + "grad_norm": 1.353702425956726, + "learning_rate": 0.00016064377274579544, + "loss": 1.265, + "step": 15210 + }, + { + "epoch": 0.5857555341674687, + "grad_norm": 0.9861169457435608, + "learning_rate": 0.00016061972552583795, + "loss": 1.0908, + "step": 15215 + }, + { + "epoch": 0.5859480269489894, + "grad_norm": 1.1305365562438965, + "learning_rate": 0.00016059567276260813, + "loss": 1.0076, + "step": 15220 + }, + { + "epoch": 0.5861405197305101, + "grad_norm": 1.4098013639450073, + "learning_rate": 0.00016057161445830542, + "loss": 1.1882, + "step": 15225 + }, + { + "epoch": 0.5863330125120308, + "grad_norm": 1.1900111436843872, + "learning_rate": 0.00016054755061512986, + "loss": 1.1961, + "step": 15230 + }, + { + "epoch": 0.5865255052935515, + "grad_norm": 1.0856738090515137, + "learning_rate": 0.00016052348123528183, + "loss": 1.2169, + "step": 15235 + }, + { + "epoch": 0.5867179980750722, + "grad_norm": 1.109937071800232, + "learning_rate": 0.0001604994063209624, + "loss": 1.0818, + "step": 15240 + }, + { + "epoch": 0.5869104908565929, + "grad_norm": 1.9059746265411377, + "learning_rate": 0.00016047532587437304, + "loss": 1.3035, + "step": 15245 + }, + { + "epoch": 0.5871029836381135, + "grad_norm": 1.089796781539917, + "learning_rate": 0.00016045123989771575, + "loss": 1.0872, + "step": 15250 + }, + { + "epoch": 0.5872954764196343, + "grad_norm": 1.3014196157455444, + "learning_rate": 0.00016042714839319298, + "loss": 1.1809, + "step": 15255 + }, + { + "epoch": 0.587487969201155, + "grad_norm": 1.5097154378890991, + "learning_rate": 0.00016040305136300783, + "loss": 1.1026, + "step": 15260 + }, + { + "epoch": 0.5876804619826757, + "grad_norm": 1.9508148431777954, + "learning_rate": 0.00016037894880936376, + "loss": 1.0489, + "step": 15265 + }, + { + "epoch": 0.5878729547641963, + "grad_norm": 1.2007025480270386, + "learning_rate": 0.0001603548407344648, + "loss": 1.2376, + "step": 15270 + }, + { + "epoch": 0.588065447545717, + "grad_norm": 4.035842418670654, + "learning_rate": 0.00016033072714051545, + "loss": 1.2894, + "step": 15275 + }, + { + "epoch": 0.5882579403272378, + "grad_norm": 1.2279680967330933, + "learning_rate": 0.00016030660802972074, + "loss": 1.1945, + "step": 15280 + }, + { + "epoch": 0.5884504331087584, + "grad_norm": 1.0882714986801147, + "learning_rate": 0.00016028248340428625, + "loss": 1.0842, + "step": 15285 + }, + { + "epoch": 0.5886429258902791, + "grad_norm": 1.3169769048690796, + "learning_rate": 0.00016025835326641797, + "loss": 1.0085, + "step": 15290 + }, + { + "epoch": 0.5888354186717998, + "grad_norm": 1.3032643795013428, + "learning_rate": 0.00016023421761832246, + "loss": 1.1994, + "step": 15295 + }, + { + "epoch": 0.5890279114533205, + "grad_norm": 1.053415060043335, + "learning_rate": 0.00016021007646220678, + "loss": 1.0983, + "step": 15300 + }, + { + "epoch": 0.5892204042348412, + "grad_norm": 1.483736515045166, + "learning_rate": 0.00016018592980027846, + "loss": 1.1709, + "step": 15305 + }, + { + "epoch": 0.5894128970163619, + "grad_norm": 1.4688469171524048, + "learning_rate": 0.00016016177763474555, + "loss": 1.0505, + "step": 15310 + }, + { + "epoch": 0.5896053897978826, + "grad_norm": 1.7809165716171265, + "learning_rate": 0.00016013761996781661, + "loss": 1.1585, + "step": 15315 + }, + { + "epoch": 0.5897978825794032, + "grad_norm": 1.5344901084899902, + "learning_rate": 0.00016011345680170072, + "loss": 1.1269, + "step": 15320 + }, + { + "epoch": 0.5899903753609239, + "grad_norm": 1.298094630241394, + "learning_rate": 0.0001600892881386074, + "loss": 1.1804, + "step": 15325 + }, + { + "epoch": 0.5901828681424447, + "grad_norm": 1.8283668756484985, + "learning_rate": 0.0001600651139807467, + "loss": 1.2059, + "step": 15330 + }, + { + "epoch": 0.5903753609239654, + "grad_norm": 1.3290801048278809, + "learning_rate": 0.00016004093433032924, + "loss": 1.2334, + "step": 15335 + }, + { + "epoch": 0.590567853705486, + "grad_norm": 1.461422324180603, + "learning_rate": 0.00016001674918956612, + "loss": 1.2987, + "step": 15340 + }, + { + "epoch": 0.5907603464870067, + "grad_norm": 1.6681803464889526, + "learning_rate": 0.00015999255856066885, + "loss": 1.0221, + "step": 15345 + }, + { + "epoch": 0.5909528392685275, + "grad_norm": 1.1714918613433838, + "learning_rate": 0.00015996836244584948, + "loss": 1.0144, + "step": 15350 + }, + { + "epoch": 0.5911453320500482, + "grad_norm": 0.9316911697387695, + "learning_rate": 0.00015994416084732062, + "loss": 1.3241, + "step": 15355 + }, + { + "epoch": 0.5913378248315688, + "grad_norm": 2.429568290710449, + "learning_rate": 0.00015991995376729535, + "loss": 1.3155, + "step": 15360 + }, + { + "epoch": 0.5915303176130895, + "grad_norm": 1.3793234825134277, + "learning_rate": 0.00015989574120798725, + "loss": 1.2822, + "step": 15365 + }, + { + "epoch": 0.5917228103946102, + "grad_norm": 1.1756724119186401, + "learning_rate": 0.0001598715231716104, + "loss": 1.0682, + "step": 15370 + }, + { + "epoch": 0.591915303176131, + "grad_norm": 1.9872701168060303, + "learning_rate": 0.00015984729966037934, + "loss": 1.2034, + "step": 15375 + }, + { + "epoch": 0.5921077959576516, + "grad_norm": 1.5333032608032227, + "learning_rate": 0.00015982307067650918, + "loss": 1.3922, + "step": 15380 + }, + { + "epoch": 0.5923002887391723, + "grad_norm": 1.1813582181930542, + "learning_rate": 0.00015979883622221555, + "loss": 1.1811, + "step": 15385 + }, + { + "epoch": 0.592492781520693, + "grad_norm": 1.632565975189209, + "learning_rate": 0.00015977459629971442, + "loss": 1.0877, + "step": 15390 + }, + { + "epoch": 0.5926852743022136, + "grad_norm": 1.0945332050323486, + "learning_rate": 0.00015975035091122245, + "loss": 1.0836, + "step": 15395 + }, + { + "epoch": 0.5928777670837344, + "grad_norm": 0.8069517016410828, + "learning_rate": 0.0001597261000589567, + "loss": 1.1574, + "step": 15400 + }, + { + "epoch": 0.5930702598652551, + "grad_norm": 1.8364413976669312, + "learning_rate": 0.00015970184374513476, + "loss": 1.1935, + "step": 15405 + }, + { + "epoch": 0.5932627526467757, + "grad_norm": 1.5146484375, + "learning_rate": 0.00015967758197197468, + "loss": 1.06, + "step": 15410 + }, + { + "epoch": 0.5934552454282964, + "grad_norm": 1.5792328119277954, + "learning_rate": 0.00015965331474169508, + "loss": 1.1464, + "step": 15415 + }, + { + "epoch": 0.5936477382098171, + "grad_norm": 1.887292742729187, + "learning_rate": 0.00015962904205651495, + "loss": 1.2039, + "step": 15420 + }, + { + "epoch": 0.5938402309913379, + "grad_norm": 1.8241037130355835, + "learning_rate": 0.000159604763918654, + "loss": 1.2011, + "step": 15425 + }, + { + "epoch": 0.5940327237728585, + "grad_norm": 1.2130569219589233, + "learning_rate": 0.0001595804803303322, + "loss": 1.2401, + "step": 15430 + }, + { + "epoch": 0.5942252165543792, + "grad_norm": 1.1083897352218628, + "learning_rate": 0.00015955619129377017, + "loss": 1.2919, + "step": 15435 + }, + { + "epoch": 0.5944177093358999, + "grad_norm": 1.8266736268997192, + "learning_rate": 0.00015953189681118895, + "loss": 1.1609, + "step": 15440 + }, + { + "epoch": 0.5946102021174205, + "grad_norm": 1.5710999965667725, + "learning_rate": 0.0001595075968848102, + "loss": 1.2178, + "step": 15445 + }, + { + "epoch": 0.5948026948989413, + "grad_norm": 2.023061752319336, + "learning_rate": 0.00015948329151685583, + "loss": 1.2577, + "step": 15450 + }, + { + "epoch": 0.594995187680462, + "grad_norm": 1.3245149850845337, + "learning_rate": 0.00015945898070954853, + "loss": 0.9832, + "step": 15455 + }, + { + "epoch": 0.5951876804619827, + "grad_norm": 1.7696577310562134, + "learning_rate": 0.00015943466446511132, + "loss": 1.1991, + "step": 15460 + }, + { + "epoch": 0.5953801732435033, + "grad_norm": 1.0893733501434326, + "learning_rate": 0.00015941034278576775, + "loss": 1.3321, + "step": 15465 + }, + { + "epoch": 0.595572666025024, + "grad_norm": 1.294731616973877, + "learning_rate": 0.0001593860156737419, + "loss": 1.0485, + "step": 15470 + }, + { + "epoch": 0.5957651588065448, + "grad_norm": 1.1282588243484497, + "learning_rate": 0.00015936168313125833, + "loss": 1.0585, + "step": 15475 + }, + { + "epoch": 0.5959576515880655, + "grad_norm": 0.9207860231399536, + "learning_rate": 0.00015933734516054203, + "loss": 1.1343, + "step": 15480 + }, + { + "epoch": 0.5961501443695861, + "grad_norm": 2.2860140800476074, + "learning_rate": 0.00015931300176381865, + "loss": 1.3317, + "step": 15485 + }, + { + "epoch": 0.5963426371511068, + "grad_norm": 1.2698768377304077, + "learning_rate": 0.00015928865294331413, + "loss": 1.175, + "step": 15490 + }, + { + "epoch": 0.5965351299326275, + "grad_norm": 1.0986465215682983, + "learning_rate": 0.00015926429870125505, + "loss": 1.1309, + "step": 15495 + }, + { + "epoch": 0.5967276227141483, + "grad_norm": 1.5664902925491333, + "learning_rate": 0.00015923993903986844, + "loss": 1.0117, + "step": 15500 + }, + { + "epoch": 0.5969201154956689, + "grad_norm": 1.3162322044372559, + "learning_rate": 0.00015921557396138188, + "loss": 1.1964, + "step": 15505 + }, + { + "epoch": 0.5971126082771896, + "grad_norm": 0.8635309934616089, + "learning_rate": 0.0001591912034680233, + "loss": 1.1119, + "step": 15510 + }, + { + "epoch": 0.5973051010587103, + "grad_norm": 1.3118690252304077, + "learning_rate": 0.00015916682756202127, + "loss": 1.0618, + "step": 15515 + }, + { + "epoch": 0.597497593840231, + "grad_norm": 1.0313913822174072, + "learning_rate": 0.00015914244624560481, + "loss": 1.0686, + "step": 15520 + }, + { + "epoch": 0.5976900866217517, + "grad_norm": 1.3414394855499268, + "learning_rate": 0.00015911805952100347, + "loss": 1.2013, + "step": 15525 + }, + { + "epoch": 0.5978825794032724, + "grad_norm": 1.2710504531860352, + "learning_rate": 0.00015909366739044715, + "loss": 1.3748, + "step": 15530 + }, + { + "epoch": 0.598075072184793, + "grad_norm": 1.6694974899291992, + "learning_rate": 0.0001590692698561664, + "loss": 0.9833, + "step": 15535 + }, + { + "epoch": 0.5982675649663137, + "grad_norm": 1.5924476385116577, + "learning_rate": 0.00015904486692039227, + "loss": 1.2046, + "step": 15540 + }, + { + "epoch": 0.5984600577478345, + "grad_norm": 2.3105616569519043, + "learning_rate": 0.00015902045858535616, + "loss": 1.233, + "step": 15545 + }, + { + "epoch": 0.5986525505293552, + "grad_norm": 1.3003478050231934, + "learning_rate": 0.00015899604485329012, + "loss": 1.1891, + "step": 15550 + }, + { + "epoch": 0.5988450433108758, + "grad_norm": 1.2988343238830566, + "learning_rate": 0.00015897162572642656, + "loss": 1.0767, + "step": 15555 + }, + { + "epoch": 0.5990375360923965, + "grad_norm": 1.0845260620117188, + "learning_rate": 0.00015894720120699849, + "loss": 1.2702, + "step": 15560 + }, + { + "epoch": 0.5992300288739172, + "grad_norm": 1.0050013065338135, + "learning_rate": 0.00015892277129723935, + "loss": 1.2267, + "step": 15565 + }, + { + "epoch": 0.599422521655438, + "grad_norm": 1.3145102262496948, + "learning_rate": 0.0001588983359993831, + "loss": 1.1086, + "step": 15570 + }, + { + "epoch": 0.5996150144369586, + "grad_norm": 1.817396640777588, + "learning_rate": 0.00015887389531566424, + "loss": 1.0999, + "step": 15575 + }, + { + "epoch": 0.5998075072184793, + "grad_norm": 1.4001067876815796, + "learning_rate": 0.0001588494492483176, + "loss": 1.2802, + "step": 15580 + }, + { + "epoch": 0.6, + "grad_norm": 2.1305971145629883, + "learning_rate": 0.00015882499779957868, + "loss": 1.2481, + "step": 15585 + }, + { + "epoch": 0.6001924927815206, + "grad_norm": 1.5675426721572876, + "learning_rate": 0.00015880054097168337, + "loss": 1.2555, + "step": 15590 + }, + { + "epoch": 0.6003849855630414, + "grad_norm": 1.3107160329818726, + "learning_rate": 0.00015877607876686815, + "loss": 1.273, + "step": 15595 + }, + { + "epoch": 0.6005774783445621, + "grad_norm": 0.5348256230354309, + "learning_rate": 0.00015875161118736986, + "loss": 0.9708, + "step": 15600 + }, + { + "epoch": 0.6007699711260828, + "grad_norm": 1.0877107381820679, + "learning_rate": 0.00015872713823542593, + "loss": 1.1419, + "step": 15605 + }, + { + "epoch": 0.6009624639076034, + "grad_norm": 1.0563950538635254, + "learning_rate": 0.00015870265991327424, + "loss": 1.0216, + "step": 15610 + }, + { + "epoch": 0.6011549566891241, + "grad_norm": 1.0346797704696655, + "learning_rate": 0.00015867817622315316, + "loss": 1.205, + "step": 15615 + }, + { + "epoch": 0.6013474494706449, + "grad_norm": 1.67006254196167, + "learning_rate": 0.00015865368716730158, + "loss": 1.2875, + "step": 15620 + }, + { + "epoch": 0.6015399422521656, + "grad_norm": 1.8183788061141968, + "learning_rate": 0.00015862919274795884, + "loss": 1.1703, + "step": 15625 + }, + { + "epoch": 0.6017324350336862, + "grad_norm": 1.1460903882980347, + "learning_rate": 0.00015860469296736482, + "loss": 1.1998, + "step": 15630 + }, + { + "epoch": 0.6019249278152069, + "grad_norm": 1.5365129709243774, + "learning_rate": 0.00015858018782775985, + "loss": 1.054, + "step": 15635 + }, + { + "epoch": 0.6021174205967276, + "grad_norm": 1.4886486530303955, + "learning_rate": 0.00015855567733138478, + "loss": 1.2914, + "step": 15640 + }, + { + "epoch": 0.6023099133782484, + "grad_norm": 1.519114375114441, + "learning_rate": 0.00015853116148048087, + "loss": 1.0586, + "step": 15645 + }, + { + "epoch": 0.602502406159769, + "grad_norm": 1.2735627889633179, + "learning_rate": 0.00015850664027729, + "loss": 1.1287, + "step": 15650 + }, + { + "epoch": 0.6026948989412897, + "grad_norm": 2.464672327041626, + "learning_rate": 0.00015848211372405444, + "loss": 1.0616, + "step": 15655 + }, + { + "epoch": 0.6028873917228104, + "grad_norm": 0.9507278800010681, + "learning_rate": 0.000158457581823017, + "loss": 1.0118, + "step": 15660 + }, + { + "epoch": 0.6030798845043311, + "grad_norm": 1.155150294303894, + "learning_rate": 0.00015843304457642093, + "loss": 1.0563, + "step": 15665 + }, + { + "epoch": 0.6032723772858518, + "grad_norm": 2.669029474258423, + "learning_rate": 0.00015840850198651002, + "loss": 1.1918, + "step": 15670 + }, + { + "epoch": 0.6034648700673725, + "grad_norm": 1.4008570909500122, + "learning_rate": 0.00015838395405552854, + "loss": 1.2122, + "step": 15675 + }, + { + "epoch": 0.6036573628488932, + "grad_norm": 1.4199731349945068, + "learning_rate": 0.0001583594007857212, + "loss": 1.2546, + "step": 15680 + }, + { + "epoch": 0.6038498556304138, + "grad_norm": 2.2346031665802, + "learning_rate": 0.0001583348421793333, + "loss": 1.1184, + "step": 15685 + }, + { + "epoch": 0.6040423484119346, + "grad_norm": 1.1559759378433228, + "learning_rate": 0.00015831027823861048, + "loss": 1.157, + "step": 15690 + }, + { + "epoch": 0.6042348411934553, + "grad_norm": 1.9930438995361328, + "learning_rate": 0.00015828570896579897, + "loss": 1.1095, + "step": 15695 + }, + { + "epoch": 0.6044273339749759, + "grad_norm": 1.040358304977417, + "learning_rate": 0.00015826113436314548, + "loss": 1.062, + "step": 15700 + }, + { + "epoch": 0.6046198267564966, + "grad_norm": 0.8409137725830078, + "learning_rate": 0.00015823655443289724, + "loss": 1.0204, + "step": 15705 + }, + { + "epoch": 0.6048123195380173, + "grad_norm": 1.477950930595398, + "learning_rate": 0.00015821196917730184, + "loss": 1.2479, + "step": 15710 + }, + { + "epoch": 0.6050048123195381, + "grad_norm": 1.5752694606781006, + "learning_rate": 0.00015818737859860752, + "loss": 1.343, + "step": 15715 + }, + { + "epoch": 0.6051973051010587, + "grad_norm": 1.505356788635254, + "learning_rate": 0.00015816278269906284, + "loss": 1.0742, + "step": 15720 + }, + { + "epoch": 0.6053897978825794, + "grad_norm": 1.165273904800415, + "learning_rate": 0.000158138181480917, + "loss": 1.1023, + "step": 15725 + }, + { + "epoch": 0.6055822906641001, + "grad_norm": 1.7088487148284912, + "learning_rate": 0.00015811357494641958, + "loss": 1.2899, + "step": 15730 + }, + { + "epoch": 0.6057747834456207, + "grad_norm": 1.6200921535491943, + "learning_rate": 0.0001580889630978207, + "loss": 1.0353, + "step": 15735 + }, + { + "epoch": 0.6059672762271415, + "grad_norm": 1.1059575080871582, + "learning_rate": 0.00015806434593737095, + "loss": 0.8117, + "step": 15740 + }, + { + "epoch": 0.6061597690086622, + "grad_norm": 1.3026262521743774, + "learning_rate": 0.00015803972346732143, + "loss": 1.1648, + "step": 15745 + }, + { + "epoch": 0.6063522617901829, + "grad_norm": 1.316931128501892, + "learning_rate": 0.00015801509568992366, + "loss": 1.0999, + "step": 15750 + }, + { + "epoch": 0.6065447545717035, + "grad_norm": 0.9396672248840332, + "learning_rate": 0.00015799046260742968, + "loss": 1.0374, + "step": 15755 + }, + { + "epoch": 0.6067372473532242, + "grad_norm": 1.1851413249969482, + "learning_rate": 0.00015796582422209206, + "loss": 1.1861, + "step": 15760 + }, + { + "epoch": 0.606929740134745, + "grad_norm": 2.0202128887176514, + "learning_rate": 0.00015794118053616383, + "loss": 1.139, + "step": 15765 + }, + { + "epoch": 0.6071222329162657, + "grad_norm": 1.642561912536621, + "learning_rate": 0.00015791653155189841, + "loss": 1.1811, + "step": 15770 + }, + { + "epoch": 0.6073147256977863, + "grad_norm": 1.4148608446121216, + "learning_rate": 0.0001578918772715499, + "loss": 1.3261, + "step": 15775 + }, + { + "epoch": 0.607507218479307, + "grad_norm": 1.160662293434143, + "learning_rate": 0.0001578672176973727, + "loss": 1.2117, + "step": 15780 + }, + { + "epoch": 0.6076997112608277, + "grad_norm": 1.4699779748916626, + "learning_rate": 0.00015784255283162176, + "loss": 1.1937, + "step": 15785 + }, + { + "epoch": 0.6078922040423484, + "grad_norm": 1.579142451286316, + "learning_rate": 0.00015781788267655252, + "loss": 1.2722, + "step": 15790 + }, + { + "epoch": 0.6080846968238691, + "grad_norm": 1.3598978519439697, + "learning_rate": 0.00015779320723442096, + "loss": 1.0829, + "step": 15795 + }, + { + "epoch": 0.6082771896053898, + "grad_norm": 1.1840283870697021, + "learning_rate": 0.0001577734630755471, + "loss": 1.3599, + "step": 15800 + }, + { + "epoch": 0.6084696823869105, + "grad_norm": 1.1004847288131714, + "learning_rate": 0.00015774877812238972, + "loss": 1.3756, + "step": 15805 + }, + { + "epoch": 0.6086621751684311, + "grad_norm": 1.6455458402633667, + "learning_rate": 0.00015772408788848914, + "loss": 1.1067, + "step": 15810 + }, + { + "epoch": 0.6088546679499519, + "grad_norm": 1.6274205446243286, + "learning_rate": 0.00015769939237610312, + "loss": 1.3339, + "step": 15815 + }, + { + "epoch": 0.6090471607314726, + "grad_norm": 1.2150076627731323, + "learning_rate": 0.00015767469158748987, + "loss": 0.9821, + "step": 15820 + }, + { + "epoch": 0.6092396535129933, + "grad_norm": 1.2452518939971924, + "learning_rate": 0.00015764998552490815, + "loss": 0.9994, + "step": 15825 + }, + { + "epoch": 0.6094321462945139, + "grad_norm": 1.4766079187393188, + "learning_rate": 0.00015762527419061715, + "loss": 1.0281, + "step": 15830 + }, + { + "epoch": 0.6096246390760347, + "grad_norm": 1.1288725137710571, + "learning_rate": 0.00015760055758687655, + "loss": 1.082, + "step": 15835 + }, + { + "epoch": 0.6098171318575554, + "grad_norm": 1.181159496307373, + "learning_rate": 0.00015757583571594653, + "loss": 1.1515, + "step": 15840 + }, + { + "epoch": 0.610009624639076, + "grad_norm": 1.2939519882202148, + "learning_rate": 0.00015755110858008773, + "loss": 0.9892, + "step": 15845 + }, + { + "epoch": 0.6102021174205967, + "grad_norm": 2.3088269233703613, + "learning_rate": 0.0001575263761815613, + "loss": 0.9798, + "step": 15850 + }, + { + "epoch": 0.6103946102021174, + "grad_norm": 1.4175939559936523, + "learning_rate": 0.00015750163852262886, + "loss": 1.1408, + "step": 15855 + }, + { + "epoch": 0.6105871029836382, + "grad_norm": 1.0206336975097656, + "learning_rate": 0.00015747689560555248, + "loss": 1.2078, + "step": 15860 + }, + { + "epoch": 0.6107795957651588, + "grad_norm": 0.9995696544647217, + "learning_rate": 0.0001574521474325948, + "loss": 1.1922, + "step": 15865 + }, + { + "epoch": 0.6109720885466795, + "grad_norm": 1.6652652025222778, + "learning_rate": 0.00015742739400601872, + "loss": 1.1039, + "step": 15870 + }, + { + "epoch": 0.6111645813282002, + "grad_norm": 1.3411548137664795, + "learning_rate": 0.00015740263532808792, + "loss": 1.1592, + "step": 15875 + }, + { + "epoch": 0.6113570741097208, + "grad_norm": 0.9215561151504517, + "learning_rate": 0.0001573778714010664, + "loss": 1.1379, + "step": 15880 + }, + { + "epoch": 0.6115495668912416, + "grad_norm": 1.269482970237732, + "learning_rate": 0.00015735310222721863, + "loss": 1.2042, + "step": 15885 + }, + { + "epoch": 0.6117420596727623, + "grad_norm": 1.316909909248352, + "learning_rate": 0.00015732832780880957, + "loss": 1.2702, + "step": 15890 + }, + { + "epoch": 0.611934552454283, + "grad_norm": 1.2689425945281982, + "learning_rate": 0.0001573035481481047, + "loss": 1.0755, + "step": 15895 + }, + { + "epoch": 0.6121270452358036, + "grad_norm": 1.0369685888290405, + "learning_rate": 0.00015727876324736996, + "loss": 1.0574, + "step": 15900 + }, + { + "epoch": 0.6123195380173243, + "grad_norm": 1.0056127309799194, + "learning_rate": 0.00015725397310887174, + "loss": 1.2219, + "step": 15905 + }, + { + "epoch": 0.6125120307988451, + "grad_norm": 1.3123587369918823, + "learning_rate": 0.00015722917773487702, + "loss": 1.2203, + "step": 15910 + }, + { + "epoch": 0.6127045235803658, + "grad_norm": 1.0959875583648682, + "learning_rate": 0.00015720437712765306, + "loss": 1.2516, + "step": 15915 + }, + { + "epoch": 0.6128970163618864, + "grad_norm": 2.0152196884155273, + "learning_rate": 0.00015717957128946774, + "loss": 1.2099, + "step": 15920 + }, + { + "epoch": 0.6130895091434071, + "grad_norm": 2.816568374633789, + "learning_rate": 0.00015715476022258942, + "loss": 1.1093, + "step": 15925 + }, + { + "epoch": 0.6132820019249278, + "grad_norm": 1.8223321437835693, + "learning_rate": 0.00015712994392928689, + "loss": 1.0474, + "step": 15930 + }, + { + "epoch": 0.6134744947064485, + "grad_norm": 1.2718263864517212, + "learning_rate": 0.00015710512241182945, + "loss": 1.1405, + "step": 15935 + }, + { + "epoch": 0.6136669874879692, + "grad_norm": 1.2518097162246704, + "learning_rate": 0.00015708029567248683, + "loss": 1.13, + "step": 15940 + }, + { + "epoch": 0.6138594802694899, + "grad_norm": 0.8542113900184631, + "learning_rate": 0.0001570554637135293, + "loss": 1.0871, + "step": 15945 + }, + { + "epoch": 0.6140519730510106, + "grad_norm": 1.0798470973968506, + "learning_rate": 0.00015703062653722757, + "loss": 1.1563, + "step": 15950 + }, + { + "epoch": 0.6142444658325312, + "grad_norm": 1.123974084854126, + "learning_rate": 0.00015700578414585284, + "loss": 1.1253, + "step": 15955 + }, + { + "epoch": 0.614436958614052, + "grad_norm": 1.2129628658294678, + "learning_rate": 0.0001569809365416768, + "loss": 1.1123, + "step": 15960 + }, + { + "epoch": 0.6146294513955727, + "grad_norm": 1.4137890338897705, + "learning_rate": 0.00015695608372697154, + "loss": 1.115, + "step": 15965 + }, + { + "epoch": 0.6148219441770933, + "grad_norm": 1.2815289497375488, + "learning_rate": 0.00015693122570400975, + "loss": 1.0876, + "step": 15970 + }, + { + "epoch": 0.615014436958614, + "grad_norm": 0.9300668835639954, + "learning_rate": 0.00015690636247506448, + "loss": 1.1442, + "step": 15975 + }, + { + "epoch": 0.6152069297401348, + "grad_norm": 0.9866906404495239, + "learning_rate": 0.00015688149404240938, + "loss": 1.0664, + "step": 15980 + }, + { + "epoch": 0.6153994225216555, + "grad_norm": 1.1951825618743896, + "learning_rate": 0.0001568566204083184, + "loss": 1.0933, + "step": 15985 + }, + { + "epoch": 0.6155919153031761, + "grad_norm": 1.4439541101455688, + "learning_rate": 0.00015683174157506616, + "loss": 1.1618, + "step": 15990 + }, + { + "epoch": 0.6157844080846968, + "grad_norm": 1.242619276046753, + "learning_rate": 0.00015680685754492762, + "loss": 1.0794, + "step": 15995 + }, + { + "epoch": 0.6159769008662175, + "grad_norm": 1.9631248712539673, + "learning_rate": 0.00015678196832017823, + "loss": 1.1082, + "step": 16000 + }, + { + "epoch": 0.6161693936477383, + "grad_norm": 1.056715488433838, + "learning_rate": 0.00015675707390309403, + "loss": 1.0893, + "step": 16005 + }, + { + "epoch": 0.6163618864292589, + "grad_norm": 2.3864753246307373, + "learning_rate": 0.00015673217429595143, + "loss": 1.3378, + "step": 16010 + }, + { + "epoch": 0.6165543792107796, + "grad_norm": 1.3226178884506226, + "learning_rate": 0.00015670726950102725, + "loss": 1.1959, + "step": 16015 + }, + { + "epoch": 0.6167468719923003, + "grad_norm": 2.254422426223755, + "learning_rate": 0.00015668235952059892, + "loss": 1.1495, + "step": 16020 + }, + { + "epoch": 0.6169393647738209, + "grad_norm": 1.6376910209655762, + "learning_rate": 0.00015665744435694435, + "loss": 1.0027, + "step": 16025 + }, + { + "epoch": 0.6171318575553417, + "grad_norm": 1.190169334411621, + "learning_rate": 0.00015663252401234177, + "loss": 1.0419, + "step": 16030 + }, + { + "epoch": 0.6173243503368624, + "grad_norm": 1.6388911008834839, + "learning_rate": 0.00015660759848907008, + "loss": 1.3868, + "step": 16035 + }, + { + "epoch": 0.6175168431183831, + "grad_norm": 0.9445647597312927, + "learning_rate": 0.00015658266778940843, + "loss": 1.1382, + "step": 16040 + }, + { + "epoch": 0.6177093358999037, + "grad_norm": 0.9717797636985779, + "learning_rate": 0.00015655773191563664, + "loss": 1.3385, + "step": 16045 + }, + { + "epoch": 0.6179018286814244, + "grad_norm": 1.7297828197479248, + "learning_rate": 0.000156532790870035, + "loss": 1.2607, + "step": 16050 + }, + { + "epoch": 0.6180943214629452, + "grad_norm": 1.3885836601257324, + "learning_rate": 0.00015650784465488405, + "loss": 1.2271, + "step": 16055 + }, + { + "epoch": 0.6182868142444659, + "grad_norm": 1.3968501091003418, + "learning_rate": 0.00015648289327246508, + "loss": 1.255, + "step": 16060 + }, + { + "epoch": 0.6184793070259865, + "grad_norm": 1.7532678842544556, + "learning_rate": 0.00015645793672505967, + "loss": 1.2088, + "step": 16065 + }, + { + "epoch": 0.6186717998075072, + "grad_norm": 1.4146851301193237, + "learning_rate": 0.00015643297501494999, + "loss": 0.9797, + "step": 16070 + }, + { + "epoch": 0.6188642925890279, + "grad_norm": 1.4249024391174316, + "learning_rate": 0.00015640800814441851, + "loss": 1.1446, + "step": 16075 + }, + { + "epoch": 0.6190567853705486, + "grad_norm": 1.3387399911880493, + "learning_rate": 0.0001563830361157484, + "loss": 1.2204, + "step": 16080 + }, + { + "epoch": 0.6192492781520693, + "grad_norm": 1.137149691581726, + "learning_rate": 0.00015635805893122312, + "loss": 1.1626, + "step": 16085 + }, + { + "epoch": 0.61944177093359, + "grad_norm": 1.8353437185287476, + "learning_rate": 0.0001563330765931267, + "loss": 1.2476, + "step": 16090 + }, + { + "epoch": 0.6196342637151107, + "grad_norm": 0.969289243221283, + "learning_rate": 0.00015630808910374358, + "loss": 1.026, + "step": 16095 + }, + { + "epoch": 0.6198267564966313, + "grad_norm": 1.0529965162277222, + "learning_rate": 0.0001562830964653587, + "loss": 1.15, + "step": 16100 + }, + { + "epoch": 0.6200192492781521, + "grad_norm": 1.2508490085601807, + "learning_rate": 0.00015625809868025756, + "loss": 1.08, + "step": 16105 + }, + { + "epoch": 0.6202117420596728, + "grad_norm": 1.1188933849334717, + "learning_rate": 0.0001562330957507259, + "loss": 1.2273, + "step": 16110 + }, + { + "epoch": 0.6204042348411934, + "grad_norm": 1.9137325286865234, + "learning_rate": 0.00015620808767905018, + "loss": 1.1073, + "step": 16115 + }, + { + "epoch": 0.6205967276227141, + "grad_norm": 1.146921157836914, + "learning_rate": 0.0001561830744675172, + "loss": 1.0292, + "step": 16120 + }, + { + "epoch": 0.6207892204042348, + "grad_norm": 1.6574608087539673, + "learning_rate": 0.00015615805611841424, + "loss": 1.2067, + "step": 16125 + }, + { + "epoch": 0.6209817131857556, + "grad_norm": 1.599156379699707, + "learning_rate": 0.00015613303263402903, + "loss": 1.4416, + "step": 16130 + }, + { + "epoch": 0.6211742059672762, + "grad_norm": 1.9472912549972534, + "learning_rate": 0.00015610800401664988, + "loss": 0.9591, + "step": 16135 + }, + { + "epoch": 0.6213666987487969, + "grad_norm": 1.2037914991378784, + "learning_rate": 0.00015608297026856538, + "loss": 1.0899, + "step": 16140 + }, + { + "epoch": 0.6215591915303176, + "grad_norm": 1.0116618871688843, + "learning_rate": 0.0001560579313920648, + "loss": 1.2294, + "step": 16145 + }, + { + "epoch": 0.6217516843118384, + "grad_norm": 1.6344687938690186, + "learning_rate": 0.00015603288738943774, + "loss": 1.1918, + "step": 16150 + }, + { + "epoch": 0.621944177093359, + "grad_norm": 1.3862853050231934, + "learning_rate": 0.0001560078382629743, + "loss": 1.157, + "step": 16155 + }, + { + "epoch": 0.6221366698748797, + "grad_norm": 0.9576367139816284, + "learning_rate": 0.00015598278401496508, + "loss": 1.0759, + "step": 16160 + }, + { + "epoch": 0.6223291626564004, + "grad_norm": 1.2092609405517578, + "learning_rate": 0.0001559577246477011, + "loss": 1.1928, + "step": 16165 + }, + { + "epoch": 0.622521655437921, + "grad_norm": 1.594510793685913, + "learning_rate": 0.0001559326601634739, + "loss": 1.2336, + "step": 16170 + }, + { + "epoch": 0.6227141482194418, + "grad_norm": 0.851620078086853, + "learning_rate": 0.00015590759056457546, + "loss": 1.1646, + "step": 16175 + }, + { + "epoch": 0.6229066410009625, + "grad_norm": 1.1468600034713745, + "learning_rate": 0.0001558825158532982, + "loss": 1.1879, + "step": 16180 + }, + { + "epoch": 0.6230991337824832, + "grad_norm": 1.934251308441162, + "learning_rate": 0.00015585743603193505, + "loss": 1.1207, + "step": 16185 + }, + { + "epoch": 0.6232916265640038, + "grad_norm": 0.9963223934173584, + "learning_rate": 0.00015583235110277943, + "loss": 1.068, + "step": 16190 + }, + { + "epoch": 0.6234841193455245, + "grad_norm": 0.8857359290122986, + "learning_rate": 0.00015580726106812512, + "loss": 1.1148, + "step": 16195 + }, + { + "epoch": 0.6236766121270453, + "grad_norm": 1.2589722871780396, + "learning_rate": 0.00015578216593026647, + "loss": 1.0485, + "step": 16200 + }, + { + "epoch": 0.623869104908566, + "grad_norm": 1.0346484184265137, + "learning_rate": 0.0001557570656914983, + "loss": 1.1276, + "step": 16205 + }, + { + "epoch": 0.6240615976900866, + "grad_norm": 0.8794786334037781, + "learning_rate": 0.0001557319603541158, + "loss": 1.2591, + "step": 16210 + }, + { + "epoch": 0.6242540904716073, + "grad_norm": 1.0909137725830078, + "learning_rate": 0.00015570684992041473, + "loss": 1.1197, + "step": 16215 + }, + { + "epoch": 0.624446583253128, + "grad_norm": 1.3499592542648315, + "learning_rate": 0.0001556817343926913, + "loss": 1.0165, + "step": 16220 + }, + { + "epoch": 0.6246390760346487, + "grad_norm": 1.5356526374816895, + "learning_rate": 0.00015565661377324203, + "loss": 1.0144, + "step": 16225 + }, + { + "epoch": 0.6248315688161694, + "grad_norm": 1.849442958831787, + "learning_rate": 0.0001556314880643642, + "loss": 1.2191, + "step": 16230 + }, + { + "epoch": 0.6250240615976901, + "grad_norm": 1.1928755044937134, + "learning_rate": 0.00015560635726835525, + "loss": 1.2685, + "step": 16235 + }, + { + "epoch": 0.6252165543792108, + "grad_norm": 1.1445300579071045, + "learning_rate": 0.00015558122138751332, + "loss": 1.445, + "step": 16240 + }, + { + "epoch": 0.6254090471607314, + "grad_norm": 1.7465559244155884, + "learning_rate": 0.00015555608042413689, + "loss": 1.2479, + "step": 16245 + }, + { + "epoch": 0.6256015399422522, + "grad_norm": 1.1695505380630493, + "learning_rate": 0.0001555309343805249, + "loss": 1.2347, + "step": 16250 + }, + { + "epoch": 0.6257940327237729, + "grad_norm": 1.2655342817306519, + "learning_rate": 0.00015550578325897687, + "loss": 1.2343, + "step": 16255 + }, + { + "epoch": 0.6259865255052935, + "grad_norm": 1.569800853729248, + "learning_rate": 0.0001554806270617926, + "loss": 1.0798, + "step": 16260 + }, + { + "epoch": 0.6261790182868142, + "grad_norm": 2.0027542114257812, + "learning_rate": 0.00015545546579127256, + "loss": 1.0084, + "step": 16265 + }, + { + "epoch": 0.6263715110683349, + "grad_norm": 2.259096145629883, + "learning_rate": 0.0001554302994497175, + "loss": 1.1921, + "step": 16270 + }, + { + "epoch": 0.6265640038498557, + "grad_norm": 1.092046856880188, + "learning_rate": 0.00015540512803942878, + "loss": 1.112, + "step": 16275 + }, + { + "epoch": 0.6267564966313763, + "grad_norm": 1.67642343044281, + "learning_rate": 0.00015537995156270808, + "loss": 1.3709, + "step": 16280 + }, + { + "epoch": 0.626948989412897, + "grad_norm": 1.7039928436279297, + "learning_rate": 0.0001553547700218577, + "loss": 1.2211, + "step": 16285 + }, + { + "epoch": 0.6271414821944177, + "grad_norm": 2.0744543075561523, + "learning_rate": 0.00015532958341918027, + "loss": 1.2324, + "step": 16290 + }, + { + "epoch": 0.6273339749759385, + "grad_norm": 1.2610362768173218, + "learning_rate": 0.00015530439175697898, + "loss": 1.1924, + "step": 16295 + }, + { + "epoch": 0.6275264677574591, + "grad_norm": 1.8385295867919922, + "learning_rate": 0.00015527919503755742, + "loss": 1.2602, + "step": 16300 + }, + { + "epoch": 0.6277189605389798, + "grad_norm": 1.62607741355896, + "learning_rate": 0.00015525399326321966, + "loss": 1.2135, + "step": 16305 + }, + { + "epoch": 0.6279114533205005, + "grad_norm": 1.164507508277893, + "learning_rate": 0.00015522878643627023, + "loss": 1.07, + "step": 16310 + }, + { + "epoch": 0.6281039461020211, + "grad_norm": 0.9871059060096741, + "learning_rate": 0.0001552035745590142, + "loss": 1.1749, + "step": 16315 + }, + { + "epoch": 0.6282964388835419, + "grad_norm": 1.1414002180099487, + "learning_rate": 0.00015517835763375688, + "loss": 1.233, + "step": 16320 + }, + { + "epoch": 0.6284889316650626, + "grad_norm": 1.1266084909439087, + "learning_rate": 0.00015515313566280428, + "loss": 1.1642, + "step": 16325 + }, + { + "epoch": 0.6286814244465833, + "grad_norm": 1.8156638145446777, + "learning_rate": 0.00015512790864846286, + "loss": 1.0328, + "step": 16330 + }, + { + "epoch": 0.6288739172281039, + "grad_norm": 1.9357597827911377, + "learning_rate": 0.00015510267659303933, + "loss": 1.3325, + "step": 16335 + }, + { + "epoch": 0.6290664100096246, + "grad_norm": 1.767910122871399, + "learning_rate": 0.00015507743949884104, + "loss": 1.2381, + "step": 16340 + }, + { + "epoch": 0.6292589027911454, + "grad_norm": 2.1196887493133545, + "learning_rate": 0.0001550521973681758, + "loss": 1.2286, + "step": 16345 + }, + { + "epoch": 0.629451395572666, + "grad_norm": 1.7220022678375244, + "learning_rate": 0.00015502695020335177, + "loss": 1.1699, + "step": 16350 + }, + { + "epoch": 0.6296438883541867, + "grad_norm": 1.9612696170806885, + "learning_rate": 0.00015500169800667765, + "loss": 1.0786, + "step": 16355 + }, + { + "epoch": 0.6298363811357074, + "grad_norm": 1.666223406791687, + "learning_rate": 0.00015497644078046261, + "loss": 1.2211, + "step": 16360 + }, + { + "epoch": 0.6300288739172281, + "grad_norm": 1.7156059741973877, + "learning_rate": 0.00015495117852701626, + "loss": 1.0621, + "step": 16365 + }, + { + "epoch": 0.6302213666987488, + "grad_norm": 1.5840719938278198, + "learning_rate": 0.00015492591124864865, + "loss": 1.2364, + "step": 16370 + }, + { + "epoch": 0.6304138594802695, + "grad_norm": 1.1821776628494263, + "learning_rate": 0.0001549006389476703, + "loss": 1.1479, + "step": 16375 + }, + { + "epoch": 0.6306063522617902, + "grad_norm": 1.2549364566802979, + "learning_rate": 0.00015487536162639223, + "loss": 1.0564, + "step": 16380 + }, + { + "epoch": 0.6307988450433109, + "grad_norm": 1.5308479070663452, + "learning_rate": 0.0001548500792871258, + "loss": 1.1825, + "step": 16385 + }, + { + "epoch": 0.6309913378248315, + "grad_norm": 1.6546053886413574, + "learning_rate": 0.000154824791932183, + "loss": 1.1673, + "step": 16390 + }, + { + "epoch": 0.6311838306063523, + "grad_norm": 1.1561111211776733, + "learning_rate": 0.00015479949956387617, + "loss": 1.1014, + "step": 16395 + }, + { + "epoch": 0.631376323387873, + "grad_norm": 1.6901589632034302, + "learning_rate": 0.0001547742021845181, + "loss": 1.2377, + "step": 16400 + }, + { + "epoch": 0.6315688161693936, + "grad_norm": 1.2808809280395508, + "learning_rate": 0.0001547488997964221, + "loss": 1.2976, + "step": 16405 + }, + { + "epoch": 0.6317613089509143, + "grad_norm": 0.9793625473976135, + "learning_rate": 0.0001547235924019019, + "loss": 1.0328, + "step": 16410 + }, + { + "epoch": 0.631953801732435, + "grad_norm": 1.6001505851745605, + "learning_rate": 0.00015469828000327164, + "loss": 1.0232, + "step": 16415 + }, + { + "epoch": 0.6321462945139558, + "grad_norm": 1.3900479078292847, + "learning_rate": 0.00015467296260284605, + "loss": 1.2412, + "step": 16420 + }, + { + "epoch": 0.6323387872954764, + "grad_norm": 1.6030535697937012, + "learning_rate": 0.0001546476402029402, + "loss": 1.178, + "step": 16425 + }, + { + "epoch": 0.6325312800769971, + "grad_norm": 1.5602627992630005, + "learning_rate": 0.00015462231280586965, + "loss": 1.2834, + "step": 16430 + }, + { + "epoch": 0.6327237728585178, + "grad_norm": 1.3648455142974854, + "learning_rate": 0.00015459698041395045, + "loss": 1.1425, + "step": 16435 + }, + { + "epoch": 0.6329162656400384, + "grad_norm": 1.4346479177474976, + "learning_rate": 0.00015457164302949908, + "loss": 1.0076, + "step": 16440 + }, + { + "epoch": 0.6331087584215592, + "grad_norm": 0.9692068696022034, + "learning_rate": 0.00015454630065483242, + "loss": 1.0133, + "step": 16445 + }, + { + "epoch": 0.6333012512030799, + "grad_norm": 1.479915976524353, + "learning_rate": 0.0001545209532922679, + "loss": 1.131, + "step": 16450 + }, + { + "epoch": 0.6334937439846006, + "grad_norm": 1.0446960926055908, + "learning_rate": 0.00015449560094412342, + "loss": 1.2545, + "step": 16455 + }, + { + "epoch": 0.6336862367661212, + "grad_norm": 1.458414077758789, + "learning_rate": 0.00015447024361271721, + "loss": 1.325, + "step": 16460 + }, + { + "epoch": 0.633878729547642, + "grad_norm": 1.2071151733398438, + "learning_rate": 0.00015444488130036802, + "loss": 1.2303, + "step": 16465 + }, + { + "epoch": 0.6340712223291627, + "grad_norm": 1.9108256101608276, + "learning_rate": 0.00015441951400939515, + "loss": 1.2031, + "step": 16470 + }, + { + "epoch": 0.6342637151106834, + "grad_norm": 1.1393382549285889, + "learning_rate": 0.0001543941417421182, + "loss": 1.016, + "step": 16475 + }, + { + "epoch": 0.634456207892204, + "grad_norm": 2.0735628604888916, + "learning_rate": 0.00015436876450085728, + "loss": 1.1619, + "step": 16480 + }, + { + "epoch": 0.6346487006737247, + "grad_norm": 1.6895620822906494, + "learning_rate": 0.00015434338228793306, + "loss": 1.1621, + "step": 16485 + }, + { + "epoch": 0.6348411934552455, + "grad_norm": 1.9663159847259521, + "learning_rate": 0.0001543179951056665, + "loss": 1.2465, + "step": 16490 + }, + { + "epoch": 0.6350336862367661, + "grad_norm": 1.1372085809707642, + "learning_rate": 0.0001542926029563791, + "loss": 1.1643, + "step": 16495 + }, + { + "epoch": 0.6352261790182868, + "grad_norm": 0.5948193669319153, + "learning_rate": 0.00015426720584239283, + "loss": 0.9659, + "step": 16500 + }, + { + "epoch": 0.6354186717998075, + "grad_norm": 1.829047441482544, + "learning_rate": 0.00015424180376603008, + "loss": 1.1334, + "step": 16505 + }, + { + "epoch": 0.6356111645813282, + "grad_norm": 1.4863371849060059, + "learning_rate": 0.00015421639672961367, + "loss": 1.1206, + "step": 16510 + }, + { + "epoch": 0.6358036573628489, + "grad_norm": 1.2481038570404053, + "learning_rate": 0.00015419098473546696, + "loss": 1.1101, + "step": 16515 + }, + { + "epoch": 0.6359961501443696, + "grad_norm": 1.8721559047698975, + "learning_rate": 0.00015416556778591363, + "loss": 1.1293, + "step": 16520 + }, + { + "epoch": 0.6361886429258903, + "grad_norm": 1.5730985403060913, + "learning_rate": 0.000154140145883278, + "loss": 1.277, + "step": 16525 + }, + { + "epoch": 0.636381135707411, + "grad_norm": 1.4351321458816528, + "learning_rate": 0.00015411471902988463, + "loss": 1.2475, + "step": 16530 + }, + { + "epoch": 0.6365736284889316, + "grad_norm": 0.8733989596366882, + "learning_rate": 0.00015408928722805874, + "loss": 1.0728, + "step": 16535 + }, + { + "epoch": 0.6367661212704524, + "grad_norm": 1.454068899154663, + "learning_rate": 0.00015406385048012577, + "loss": 1.0163, + "step": 16540 + }, + { + "epoch": 0.6369586140519731, + "grad_norm": 0.9600105285644531, + "learning_rate": 0.00015403840878841182, + "loss": 1.097, + "step": 16545 + }, + { + "epoch": 0.6371511068334937, + "grad_norm": 2.419609546661377, + "learning_rate": 0.00015401296215524345, + "loss": 1.2003, + "step": 16550 + }, + { + "epoch": 0.6373435996150144, + "grad_norm": 1.313755989074707, + "learning_rate": 0.0001539875105829474, + "loss": 1.1276, + "step": 16555 + }, + { + "epoch": 0.6375360923965351, + "grad_norm": 1.6932001113891602, + "learning_rate": 0.00015396205407385116, + "loss": 1.1689, + "step": 16560 + }, + { + "epoch": 0.6377285851780559, + "grad_norm": 1.076905608177185, + "learning_rate": 0.00015393659263028257, + "loss": 1.189, + "step": 16565 + }, + { + "epoch": 0.6379210779595765, + "grad_norm": 1.2433785200119019, + "learning_rate": 0.00015391112625456983, + "loss": 0.9797, + "step": 16570 + }, + { + "epoch": 0.6381135707410972, + "grad_norm": 1.1299281120300293, + "learning_rate": 0.00015388565494904176, + "loss": 1.1399, + "step": 16575 + }, + { + "epoch": 0.6383060635226179, + "grad_norm": 1.0440160036087036, + "learning_rate": 0.0001538601787160275, + "loss": 1.2491, + "step": 16580 + }, + { + "epoch": 0.6384985563041385, + "grad_norm": 1.1874500513076782, + "learning_rate": 0.00015383469755785668, + "loss": 1.2762, + "step": 16585 + }, + { + "epoch": 0.6386910490856593, + "grad_norm": 1.2737995386123657, + "learning_rate": 0.0001538092114768594, + "loss": 1.2102, + "step": 16590 + }, + { + "epoch": 0.63888354186718, + "grad_norm": 1.8649038076400757, + "learning_rate": 0.0001537837204753662, + "loss": 1.0711, + "step": 16595 + }, + { + "epoch": 0.6390760346487007, + "grad_norm": 1.0375845432281494, + "learning_rate": 0.000153758224555708, + "loss": 1.0349, + "step": 16600 + }, + { + "epoch": 0.6392685274302213, + "grad_norm": 1.4500465393066406, + "learning_rate": 0.0001537327237202163, + "loss": 1.1501, + "step": 16605 + }, + { + "epoch": 0.6394610202117421, + "grad_norm": 1.5905102491378784, + "learning_rate": 0.000153707217971223, + "loss": 1.1946, + "step": 16610 + }, + { + "epoch": 0.6396535129932628, + "grad_norm": 1.224752426147461, + "learning_rate": 0.00015368170731106036, + "loss": 1.1101, + "step": 16615 + }, + { + "epoch": 0.6398460057747835, + "grad_norm": 2.605717182159424, + "learning_rate": 0.00015365619174206117, + "loss": 1.0483, + "step": 16620 + }, + { + "epoch": 0.6400384985563041, + "grad_norm": 1.2829294204711914, + "learning_rate": 0.00015363067126655873, + "loss": 1.2265, + "step": 16625 + }, + { + "epoch": 0.6402309913378248, + "grad_norm": 1.1748125553131104, + "learning_rate": 0.00015360514588688665, + "loss": 1.0909, + "step": 16630 + }, + { + "epoch": 0.6404234841193456, + "grad_norm": 1.0052121877670288, + "learning_rate": 0.00015357961560537908, + "loss": 1.3145, + "step": 16635 + }, + { + "epoch": 0.6406159769008662, + "grad_norm": 1.1692798137664795, + "learning_rate": 0.00015355408042437061, + "loss": 1.3134, + "step": 16640 + }, + { + "epoch": 0.6408084696823869, + "grad_norm": 1.1379728317260742, + "learning_rate": 0.00015352854034619622, + "loss": 1.0519, + "step": 16645 + }, + { + "epoch": 0.6410009624639076, + "grad_norm": 1.067920207977295, + "learning_rate": 0.00015350299537319147, + "loss": 1.17, + "step": 16650 + }, + { + "epoch": 0.6411934552454283, + "grad_norm": 1.9951469898223877, + "learning_rate": 0.00015347744550769216, + "loss": 1.0478, + "step": 16655 + }, + { + "epoch": 0.641385948026949, + "grad_norm": 1.036605715751648, + "learning_rate": 0.00015345189075203477, + "loss": 1.1288, + "step": 16660 + }, + { + "epoch": 0.6415784408084697, + "grad_norm": 0.5938658714294434, + "learning_rate": 0.000153426331108556, + "loss": 1.0589, + "step": 16665 + }, + { + "epoch": 0.6417709335899904, + "grad_norm": 1.212049961090088, + "learning_rate": 0.00015340076657959317, + "loss": 1.1104, + "step": 16670 + }, + { + "epoch": 0.641963426371511, + "grad_norm": 1.3548222780227661, + "learning_rate": 0.00015337519716748403, + "loss": 1.2639, + "step": 16675 + }, + { + "epoch": 0.6421559191530317, + "grad_norm": 1.210879921913147, + "learning_rate": 0.00015334962287456665, + "loss": 1.0576, + "step": 16680 + }, + { + "epoch": 0.6423484119345525, + "grad_norm": 2.2316668033599854, + "learning_rate": 0.00015332404370317965, + "loss": 1.2075, + "step": 16685 + }, + { + "epoch": 0.6425409047160732, + "grad_norm": 1.0065557956695557, + "learning_rate": 0.00015329845965566215, + "loss": 1.0872, + "step": 16690 + }, + { + "epoch": 0.6427333974975938, + "grad_norm": 1.36894953250885, + "learning_rate": 0.00015327287073435355, + "loss": 0.9866, + "step": 16695 + }, + { + "epoch": 0.6429258902791145, + "grad_norm": 0.9726212620735168, + "learning_rate": 0.0001532472769415938, + "loss": 1.0069, + "step": 16700 + }, + { + "epoch": 0.6431183830606352, + "grad_norm": 0.9447348117828369, + "learning_rate": 0.00015322167827972334, + "loss": 1.3184, + "step": 16705 + }, + { + "epoch": 0.643310875842156, + "grad_norm": 1.7236000299453735, + "learning_rate": 0.00015319607475108296, + "loss": 1.2547, + "step": 16710 + }, + { + "epoch": 0.6435033686236766, + "grad_norm": 2.3541550636291504, + "learning_rate": 0.00015317046635801392, + "loss": 1.2886, + "step": 16715 + }, + { + "epoch": 0.6436958614051973, + "grad_norm": 1.8849072456359863, + "learning_rate": 0.00015314485310285796, + "loss": 1.1295, + "step": 16720 + }, + { + "epoch": 0.643888354186718, + "grad_norm": 4.183611869812012, + "learning_rate": 0.00015311923498795724, + "loss": 1.1109, + "step": 16725 + }, + { + "epoch": 0.6440808469682386, + "grad_norm": 1.4037699699401855, + "learning_rate": 0.00015309361201565436, + "loss": 1.1097, + "step": 16730 + }, + { + "epoch": 0.6442733397497594, + "grad_norm": 1.626489520072937, + "learning_rate": 0.00015306798418829236, + "loss": 1.2515, + "step": 16735 + }, + { + "epoch": 0.6444658325312801, + "grad_norm": 2.0744874477386475, + "learning_rate": 0.00015304235150821475, + "loss": 1.2196, + "step": 16740 + }, + { + "epoch": 0.6446583253128008, + "grad_norm": 1.2196972370147705, + "learning_rate": 0.0001530167139777655, + "loss": 1.0935, + "step": 16745 + }, + { + "epoch": 0.6448508180943214, + "grad_norm": 2.1287968158721924, + "learning_rate": 0.00015299107159928897, + "loss": 0.9476, + "step": 16750 + }, + { + "epoch": 0.6450433108758421, + "grad_norm": 1.6050670146942139, + "learning_rate": 0.00015296542437512995, + "loss": 1.2276, + "step": 16755 + }, + { + "epoch": 0.6452358036573629, + "grad_norm": 1.316373348236084, + "learning_rate": 0.0001529397723076337, + "loss": 1.121, + "step": 16760 + }, + { + "epoch": 0.6454282964388836, + "grad_norm": 1.4219224452972412, + "learning_rate": 0.00015291411539914603, + "loss": 1.3219, + "step": 16765 + }, + { + "epoch": 0.6456207892204042, + "grad_norm": 1.3470525741577148, + "learning_rate": 0.00015288845365201299, + "loss": 1.0538, + "step": 16770 + }, + { + "epoch": 0.6458132820019249, + "grad_norm": 1.6893870830535889, + "learning_rate": 0.0001528627870685812, + "loss": 1.1907, + "step": 16775 + }, + { + "epoch": 0.6460057747834457, + "grad_norm": 1.7264561653137207, + "learning_rate": 0.00015283711565119775, + "loss": 1.167, + "step": 16780 + }, + { + "epoch": 0.6461982675649663, + "grad_norm": 1.1093302965164185, + "learning_rate": 0.0001528114394022101, + "loss": 1.1477, + "step": 16785 + }, + { + "epoch": 0.646390760346487, + "grad_norm": 1.1114470958709717, + "learning_rate": 0.00015278575832396613, + "loss": 1.1224, + "step": 16790 + }, + { + "epoch": 0.6465832531280077, + "grad_norm": 2.0239744186401367, + "learning_rate": 0.00015276007241881424, + "loss": 1.1655, + "step": 16795 + }, + { + "epoch": 0.6467757459095284, + "grad_norm": 1.0726968050003052, + "learning_rate": 0.00015273438168910322, + "loss": 0.9021, + "step": 16800 + }, + { + "epoch": 0.6469682386910491, + "grad_norm": 1.2715688943862915, + "learning_rate": 0.00015270868613718238, + "loss": 1.1776, + "step": 16805 + }, + { + "epoch": 0.6471607314725698, + "grad_norm": 1.4808478355407715, + "learning_rate": 0.00015268298576540129, + "loss": 1.1023, + "step": 16810 + }, + { + "epoch": 0.6473532242540905, + "grad_norm": 1.63973069190979, + "learning_rate": 0.0001526572805761102, + "loss": 1.0025, + "step": 16815 + }, + { + "epoch": 0.6475457170356111, + "grad_norm": 1.0935505628585815, + "learning_rate": 0.0001526315705716596, + "loss": 1.1039, + "step": 16820 + }, + { + "epoch": 0.6477382098171318, + "grad_norm": 1.0586233139038086, + "learning_rate": 0.00015260585575440052, + "loss": 1.0884, + "step": 16825 + }, + { + "epoch": 0.6479307025986526, + "grad_norm": 1.0608752965927124, + "learning_rate": 0.0001525801361266844, + "loss": 1.2997, + "step": 16830 + }, + { + "epoch": 0.6481231953801733, + "grad_norm": 1.0017322301864624, + "learning_rate": 0.00015255441169086318, + "loss": 1.3023, + "step": 16835 + }, + { + "epoch": 0.6483156881616939, + "grad_norm": 0.9409940242767334, + "learning_rate": 0.00015252868244928914, + "loss": 1.2462, + "step": 16840 + }, + { + "epoch": 0.6485081809432146, + "grad_norm": 1.646735429763794, + "learning_rate": 0.00015250294840431504, + "loss": 1.1759, + "step": 16845 + }, + { + "epoch": 0.6487006737247353, + "grad_norm": 2.878627300262451, + "learning_rate": 0.00015247720955829412, + "loss": 1.2458, + "step": 16850 + }, + { + "epoch": 0.6488931665062561, + "grad_norm": 1.6578867435455322, + "learning_rate": 0.00015245146591358002, + "loss": 1.297, + "step": 16855 + }, + { + "epoch": 0.6490856592877767, + "grad_norm": 1.9454634189605713, + "learning_rate": 0.00015242571747252682, + "loss": 1.2366, + "step": 16860 + }, + { + "epoch": 0.6492781520692974, + "grad_norm": 1.8211311101913452, + "learning_rate": 0.00015239996423748906, + "loss": 1.1163, + "step": 16865 + }, + { + "epoch": 0.6494706448508181, + "grad_norm": 1.5382091999053955, + "learning_rate": 0.00015237420621082163, + "loss": 1.0103, + "step": 16870 + }, + { + "epoch": 0.6496631376323387, + "grad_norm": 1.7348453998565674, + "learning_rate": 0.00015234844339488004, + "loss": 1.1667, + "step": 16875 + }, + { + "epoch": 0.6498556304138595, + "grad_norm": 1.0255297422409058, + "learning_rate": 0.0001523226757920201, + "loss": 1.1472, + "step": 16880 + }, + { + "epoch": 0.6500481231953802, + "grad_norm": 1.730460524559021, + "learning_rate": 0.00015229690340459802, + "loss": 1.2442, + "step": 16885 + }, + { + "epoch": 0.6502406159769009, + "grad_norm": 1.6826850175857544, + "learning_rate": 0.00015227112623497058, + "loss": 1.2426, + "step": 16890 + }, + { + "epoch": 0.6504331087584215, + "grad_norm": 1.6523195505142212, + "learning_rate": 0.00015224534428549488, + "loss": 1.1543, + "step": 16895 + }, + { + "epoch": 0.6506256015399422, + "grad_norm": 2.3335843086242676, + "learning_rate": 0.00015221955755852858, + "loss": 1.115, + "step": 16900 + }, + { + "epoch": 0.650818094321463, + "grad_norm": 1.0122956037521362, + "learning_rate": 0.00015219376605642962, + "loss": 1.2913, + "step": 16905 + }, + { + "epoch": 0.6510105871029837, + "grad_norm": 1.5100213289260864, + "learning_rate": 0.00015216796978155655, + "loss": 1.0309, + "step": 16910 + }, + { + "epoch": 0.6512030798845043, + "grad_norm": 1.1331759691238403, + "learning_rate": 0.0001521421687362682, + "loss": 1.0732, + "step": 16915 + }, + { + "epoch": 0.651395572666025, + "grad_norm": 0.9450187087059021, + "learning_rate": 0.00015211636292292394, + "loss": 1.2011, + "step": 16920 + }, + { + "epoch": 0.6515880654475458, + "grad_norm": 1.1546697616577148, + "learning_rate": 0.00015209055234388354, + "loss": 1.1368, + "step": 16925 + }, + { + "epoch": 0.6517805582290664, + "grad_norm": 1.5972734689712524, + "learning_rate": 0.00015206473700150717, + "loss": 1.0546, + "step": 16930 + }, + { + "epoch": 0.6519730510105871, + "grad_norm": 1.1828382015228271, + "learning_rate": 0.0001520389168981555, + "loss": 1.0311, + "step": 16935 + }, + { + "epoch": 0.6521655437921078, + "grad_norm": 1.0515602827072144, + "learning_rate": 0.00015201309203618962, + "loss": 1.3763, + "step": 16940 + }, + { + "epoch": 0.6523580365736285, + "grad_norm": 1.0648945569992065, + "learning_rate": 0.00015198726241797103, + "loss": 1.136, + "step": 16945 + }, + { + "epoch": 0.6525505293551492, + "grad_norm": 1.3983291387557983, + "learning_rate": 0.00015196142804586166, + "loss": 1.121, + "step": 16950 + }, + { + "epoch": 0.6527430221366699, + "grad_norm": 1.1980384588241577, + "learning_rate": 0.00015193558892222394, + "loss": 1.1442, + "step": 16955 + }, + { + "epoch": 0.6529355149181906, + "grad_norm": 0.92877596616745, + "learning_rate": 0.00015190974504942064, + "loss": 1.1025, + "step": 16960 + }, + { + "epoch": 0.6531280076997112, + "grad_norm": 1.3868606090545654, + "learning_rate": 0.00015188389642981502, + "loss": 1.0714, + "step": 16965 + }, + { + "epoch": 0.6533205004812319, + "grad_norm": 2.058389663696289, + "learning_rate": 0.00015185804306577075, + "loss": 1.3543, + "step": 16970 + }, + { + "epoch": 0.6535129932627527, + "grad_norm": 0.5963343381881714, + "learning_rate": 0.00015183218495965202, + "loss": 0.9247, + "step": 16975 + }, + { + "epoch": 0.6537054860442734, + "grad_norm": 1.6353943347930908, + "learning_rate": 0.0001518063221138233, + "loss": 1.1284, + "step": 16980 + }, + { + "epoch": 0.653897978825794, + "grad_norm": 2.303635597229004, + "learning_rate": 0.00015178045453064962, + "loss": 1.3496, + "step": 16985 + }, + { + "epoch": 0.6540904716073147, + "grad_norm": 0.9238683581352234, + "learning_rate": 0.00015175458221249638, + "loss": 1.1348, + "step": 16990 + }, + { + "epoch": 0.6542829643888354, + "grad_norm": 1.4203814268112183, + "learning_rate": 0.00015172870516172942, + "loss": 1.1032, + "step": 16995 + }, + { + "epoch": 0.6544754571703562, + "grad_norm": 1.018648386001587, + "learning_rate": 0.0001517028233807151, + "loss": 1.237, + "step": 17000 + }, + { + "epoch": 0.6546679499518768, + "grad_norm": 1.4779586791992188, + "learning_rate": 0.00015167693687182, + "loss": 1.173, + "step": 17005 + }, + { + "epoch": 0.6548604427333975, + "grad_norm": 1.7097437381744385, + "learning_rate": 0.0001516510456374114, + "loss": 1.1935, + "step": 17010 + }, + { + "epoch": 0.6550529355149182, + "grad_norm": 1.4055527448654175, + "learning_rate": 0.00015162514967985682, + "loss": 1.0832, + "step": 17015 + }, + { + "epoch": 0.6552454282964388, + "grad_norm": 1.5012494325637817, + "learning_rate": 0.00015159924900152432, + "loss": 1.3221, + "step": 17020 + }, + { + "epoch": 0.6554379210779596, + "grad_norm": 1.13307785987854, + "learning_rate": 0.00015157334360478228, + "loss": 1.2599, + "step": 17025 + }, + { + "epoch": 0.6556304138594803, + "grad_norm": 2.10911226272583, + "learning_rate": 0.0001515474334919996, + "loss": 1.1446, + "step": 17030 + }, + { + "epoch": 0.655822906641001, + "grad_norm": 1.4689563512802124, + "learning_rate": 0.00015152151866554563, + "loss": 1.3851, + "step": 17035 + }, + { + "epoch": 0.6560153994225216, + "grad_norm": 1.3363420963287354, + "learning_rate": 0.00015149559912779005, + "loss": 1.1939, + "step": 17040 + }, + { + "epoch": 0.6562078922040423, + "grad_norm": 1.665319561958313, + "learning_rate": 0.00015146967488110307, + "loss": 1.3353, + "step": 17045 + }, + { + "epoch": 0.6564003849855631, + "grad_norm": 1.03946852684021, + "learning_rate": 0.00015144374592785528, + "loss": 1.0736, + "step": 17050 + }, + { + "epoch": 0.6565928777670837, + "grad_norm": 1.941311240196228, + "learning_rate": 0.0001514178122704177, + "loss": 1.1745, + "step": 17055 + }, + { + "epoch": 0.6567853705486044, + "grad_norm": 2.091871738433838, + "learning_rate": 0.00015139187391116182, + "loss": 0.9826, + "step": 17060 + }, + { + "epoch": 0.6569778633301251, + "grad_norm": 1.3722056150436401, + "learning_rate": 0.0001513659308524595, + "loss": 1.0969, + "step": 17065 + }, + { + "epoch": 0.6571703561116458, + "grad_norm": 1.9604045152664185, + "learning_rate": 0.00015133998309668306, + "loss": 1.0726, + "step": 17070 + }, + { + "epoch": 0.6573628488931665, + "grad_norm": 1.1731983423233032, + "learning_rate": 0.00015131403064620527, + "loss": 1.0909, + "step": 17075 + }, + { + "epoch": 0.6575553416746872, + "grad_norm": 1.3418563604354858, + "learning_rate": 0.0001512880735033993, + "loss": 1.2574, + "step": 17080 + }, + { + "epoch": 0.6577478344562079, + "grad_norm": 2.054722785949707, + "learning_rate": 0.00015126211167063876, + "loss": 1.1705, + "step": 17085 + }, + { + "epoch": 0.6579403272377286, + "grad_norm": 1.1431398391723633, + "learning_rate": 0.00015123614515029772, + "loss": 1.2606, + "step": 17090 + }, + { + "epoch": 0.6581328200192493, + "grad_norm": 1.4750339984893799, + "learning_rate": 0.0001512101739447506, + "loss": 1.1471, + "step": 17095 + }, + { + "epoch": 0.65832531280077, + "grad_norm": 1.6877497434616089, + "learning_rate": 0.00015118419805637228, + "loss": 0.986, + "step": 17100 + }, + { + "epoch": 0.6585178055822907, + "grad_norm": 0.6538336873054504, + "learning_rate": 0.0001511582174875381, + "loss": 0.9426, + "step": 17105 + }, + { + "epoch": 0.6587102983638113, + "grad_norm": 1.1754498481750488, + "learning_rate": 0.00015113223224062384, + "loss": 1.0994, + "step": 17110 + }, + { + "epoch": 0.658902791145332, + "grad_norm": 2.219837188720703, + "learning_rate": 0.00015110624231800567, + "loss": 1.1205, + "step": 17115 + }, + { + "epoch": 0.6590952839268528, + "grad_norm": 1.826324701309204, + "learning_rate": 0.0001510802477220602, + "loss": 1.2335, + "step": 17120 + }, + { + "epoch": 0.6592877767083735, + "grad_norm": 1.8668159246444702, + "learning_rate": 0.00015105424845516445, + "loss": 1.2609, + "step": 17125 + }, + { + "epoch": 0.6594802694898941, + "grad_norm": 0.9887051582336426, + "learning_rate": 0.00015102824451969585, + "loss": 1.0539, + "step": 17130 + }, + { + "epoch": 0.6596727622714148, + "grad_norm": 1.2473443746566772, + "learning_rate": 0.00015100223591803236, + "loss": 1.2355, + "step": 17135 + }, + { + "epoch": 0.6598652550529355, + "grad_norm": 1.2736021280288696, + "learning_rate": 0.00015097622265255222, + "loss": 1.3073, + "step": 17140 + }, + { + "epoch": 0.6600577478344563, + "grad_norm": 1.0870583057403564, + "learning_rate": 0.00015095020472563424, + "loss": 0.8381, + "step": 17145 + }, + { + "epoch": 0.6602502406159769, + "grad_norm": 1.6099382638931274, + "learning_rate": 0.0001509241821396575, + "loss": 1.2738, + "step": 17150 + }, + { + "epoch": 0.6604427333974976, + "grad_norm": 1.3321658372879028, + "learning_rate": 0.0001508981548970017, + "loss": 1.2924, + "step": 17155 + }, + { + "epoch": 0.6606352261790183, + "grad_norm": 1.0399209260940552, + "learning_rate": 0.00015087212300004678, + "loss": 0.9254, + "step": 17160 + }, + { + "epoch": 0.6608277189605389, + "grad_norm": 0.9332255721092224, + "learning_rate": 0.0001508460864511732, + "loss": 1.2693, + "step": 17165 + }, + { + "epoch": 0.6610202117420597, + "grad_norm": 1.408109188079834, + "learning_rate": 0.00015082004525276185, + "loss": 1.0394, + "step": 17170 + }, + { + "epoch": 0.6612127045235804, + "grad_norm": 1.3958436250686646, + "learning_rate": 0.00015079399940719402, + "loss": 1.1119, + "step": 17175 + }, + { + "epoch": 0.6614051973051011, + "grad_norm": 1.3326903581619263, + "learning_rate": 0.00015076794891685143, + "loss": 1.0996, + "step": 17180 + }, + { + "epoch": 0.6615976900866217, + "grad_norm": 1.1485531330108643, + "learning_rate": 0.00015074189378411622, + "loss": 1.1617, + "step": 17185 + }, + { + "epoch": 0.6617901828681424, + "grad_norm": 1.9735444784164429, + "learning_rate": 0.00015071583401137092, + "loss": 1.1168, + "step": 17190 + }, + { + "epoch": 0.6619826756496632, + "grad_norm": 1.6123241186141968, + "learning_rate": 0.00015068976960099862, + "loss": 1.1232, + "step": 17195 + }, + { + "epoch": 0.6621751684311838, + "grad_norm": 1.3553659915924072, + "learning_rate": 0.0001506637005553826, + "loss": 0.9969, + "step": 17200 + }, + { + "epoch": 0.6623676612127045, + "grad_norm": 1.3059508800506592, + "learning_rate": 0.00015063762687690684, + "loss": 1.0852, + "step": 17205 + }, + { + "epoch": 0.6625601539942252, + "grad_norm": 0.9797844290733337, + "learning_rate": 0.00015061154856795553, + "loss": 0.8927, + "step": 17210 + }, + { + "epoch": 0.6627526467757459, + "grad_norm": 1.2405691146850586, + "learning_rate": 0.00015058546563091337, + "loss": 1.1381, + "step": 17215 + }, + { + "epoch": 0.6629451395572666, + "grad_norm": 0.7226620316505432, + "learning_rate": 0.00015055937806816548, + "loss": 0.9773, + "step": 17220 + }, + { + "epoch": 0.6631376323387873, + "grad_norm": 1.302935004234314, + "learning_rate": 0.0001505332858820974, + "loss": 1.2386, + "step": 17225 + }, + { + "epoch": 0.663330125120308, + "grad_norm": 0.8981648683547974, + "learning_rate": 0.00015050718907509505, + "loss": 1.1499, + "step": 17230 + }, + { + "epoch": 0.6635226179018286, + "grad_norm": 1.6177557706832886, + "learning_rate": 0.00015048108764954487, + "loss": 1.0118, + "step": 17235 + }, + { + "epoch": 0.6637151106833494, + "grad_norm": 1.4030743837356567, + "learning_rate": 0.00015045498160783362, + "loss": 1.2892, + "step": 17240 + }, + { + "epoch": 0.6639076034648701, + "grad_norm": 1.3468968868255615, + "learning_rate": 0.00015042887095234852, + "loss": 1.2397, + "step": 17245 + }, + { + "epoch": 0.6641000962463908, + "grad_norm": 0.9706347584724426, + "learning_rate": 0.00015040275568547728, + "loss": 1.0251, + "step": 17250 + }, + { + "epoch": 0.6642925890279114, + "grad_norm": 1.623147964477539, + "learning_rate": 0.00015037663580960787, + "loss": 1.1651, + "step": 17255 + }, + { + "epoch": 0.6644850818094321, + "grad_norm": 0.9518052935600281, + "learning_rate": 0.00015035051132712883, + "loss": 1.1605, + "step": 17260 + }, + { + "epoch": 0.6646775745909529, + "grad_norm": 1.36576509475708, + "learning_rate": 0.00015032438224042908, + "loss": 1.1485, + "step": 17265 + }, + { + "epoch": 0.6648700673724736, + "grad_norm": 1.4218300580978394, + "learning_rate": 0.00015029824855189797, + "loss": 1.0527, + "step": 17270 + }, + { + "epoch": 0.6650625601539942, + "grad_norm": 1.573996663093567, + "learning_rate": 0.0001502721102639252, + "loss": 1.1692, + "step": 17275 + }, + { + "epoch": 0.6652550529355149, + "grad_norm": 1.1809152364730835, + "learning_rate": 0.00015024596737890097, + "loss": 1.0801, + "step": 17280 + }, + { + "epoch": 0.6654475457170356, + "grad_norm": 1.043346881866455, + "learning_rate": 0.00015021981989921587, + "loss": 1.181, + "step": 17285 + }, + { + "epoch": 0.6656400384985564, + "grad_norm": 0.9252155423164368, + "learning_rate": 0.00015019366782726093, + "loss": 1.0204, + "step": 17290 + }, + { + "epoch": 0.665832531280077, + "grad_norm": 1.4319888353347778, + "learning_rate": 0.00015016751116542757, + "loss": 1.2009, + "step": 17295 + }, + { + "epoch": 0.6660250240615977, + "grad_norm": 0.6749492287635803, + "learning_rate": 0.00015014134991610766, + "loss": 1.1157, + "step": 17300 + }, + { + "epoch": 0.6662175168431184, + "grad_norm": 0.9866890907287598, + "learning_rate": 0.0001501151840816934, + "loss": 1.1943, + "step": 17305 + }, + { + "epoch": 0.666410009624639, + "grad_norm": 1.4207334518432617, + "learning_rate": 0.00015008901366457756, + "loss": 1.1103, + "step": 17310 + }, + { + "epoch": 0.6666025024061598, + "grad_norm": 1.0321522951126099, + "learning_rate": 0.00015006283866715326, + "loss": 1.079, + "step": 17315 + }, + { + "epoch": 0.6667949951876805, + "grad_norm": 1.6033141613006592, + "learning_rate": 0.000150036659091814, + "loss": 0.982, + "step": 17320 + }, + { + "epoch": 0.6669874879692012, + "grad_norm": 1.503190279006958, + "learning_rate": 0.00015001047494095368, + "loss": 1.1371, + "step": 17325 + }, + { + "epoch": 0.6671799807507218, + "grad_norm": 1.2487331628799438, + "learning_rate": 0.00014998428621696677, + "loss": 1.1328, + "step": 17330 + }, + { + "epoch": 0.6673724735322425, + "grad_norm": 1.2876261472702026, + "learning_rate": 0.00014995809292224797, + "loss": 1.2034, + "step": 17335 + }, + { + "epoch": 0.6675649663137633, + "grad_norm": 1.0377410650253296, + "learning_rate": 0.0001499318950591925, + "loss": 1.2794, + "step": 17340 + }, + { + "epoch": 0.667757459095284, + "grad_norm": 2.4566397666931152, + "learning_rate": 0.00014990569263019602, + "loss": 1.1211, + "step": 17345 + }, + { + "epoch": 0.6679499518768046, + "grad_norm": 1.3069671392440796, + "learning_rate": 0.00014987948563765455, + "loss": 1.1101, + "step": 17350 + }, + { + "epoch": 0.6681424446583253, + "grad_norm": 1.0914125442504883, + "learning_rate": 0.0001498532740839645, + "loss": 1.0383, + "step": 17355 + }, + { + "epoch": 0.668334937439846, + "grad_norm": 1.1379315853118896, + "learning_rate": 0.00014982705797152285, + "loss": 1.0903, + "step": 17360 + }, + { + "epoch": 0.6685274302213667, + "grad_norm": 0.9188007712364197, + "learning_rate": 0.00014980083730272675, + "loss": 1.0696, + "step": 17365 + }, + { + "epoch": 0.6687199230028874, + "grad_norm": 1.2434134483337402, + "learning_rate": 0.00014977461207997403, + "loss": 1.2438, + "step": 17370 + }, + { + "epoch": 0.6689124157844081, + "grad_norm": 1.1543229818344116, + "learning_rate": 0.00014974838230566274, + "loss": 1.12, + "step": 17375 + }, + { + "epoch": 0.6691049085659287, + "grad_norm": 1.4789245128631592, + "learning_rate": 0.00014972214798219144, + "loss": 1.0437, + "step": 17380 + }, + { + "epoch": 0.6692974013474494, + "grad_norm": 1.4191787242889404, + "learning_rate": 0.0001496959091119591, + "loss": 1.1827, + "step": 17385 + }, + { + "epoch": 0.6694898941289702, + "grad_norm": 1.749631404876709, + "learning_rate": 0.00014966966569736508, + "loss": 1.0353, + "step": 17390 + }, + { + "epoch": 0.6696823869104909, + "grad_norm": 1.4120956659317017, + "learning_rate": 0.00014964341774080912, + "loss": 1.2257, + "step": 17395 + }, + { + "epoch": 0.6698748796920115, + "grad_norm": 1.6030794382095337, + "learning_rate": 0.00014961716524469152, + "loss": 1.0767, + "step": 17400 + }, + { + "epoch": 0.6700673724735322, + "grad_norm": 1.4263496398925781, + "learning_rate": 0.00014959090821141282, + "loss": 1.1188, + "step": 17405 + }, + { + "epoch": 0.670259865255053, + "grad_norm": 1.1514267921447754, + "learning_rate": 0.00014956464664337408, + "loss": 1.0731, + "step": 17410 + }, + { + "epoch": 0.6704523580365737, + "grad_norm": 1.5985325574874878, + "learning_rate": 0.00014953838054297672, + "loss": 1.1342, + "step": 17415 + }, + { + "epoch": 0.6706448508180943, + "grad_norm": 2.1868584156036377, + "learning_rate": 0.00014951210991262262, + "loss": 1.1169, + "step": 17420 + }, + { + "epoch": 0.670837343599615, + "grad_norm": 1.1203131675720215, + "learning_rate": 0.0001494858347547141, + "loss": 1.051, + "step": 17425 + }, + { + "epoch": 0.6710298363811357, + "grad_norm": 1.3077278137207031, + "learning_rate": 0.00014945955507165377, + "loss": 1.19, + "step": 17430 + }, + { + "epoch": 0.6712223291626565, + "grad_norm": 1.1149485111236572, + "learning_rate": 0.00014943327086584476, + "loss": 1.3471, + "step": 17435 + }, + { + "epoch": 0.6714148219441771, + "grad_norm": 1.7210713624954224, + "learning_rate": 0.00014940698213969063, + "loss": 1.0918, + "step": 17440 + }, + { + "epoch": 0.6716073147256978, + "grad_norm": 1.265023946762085, + "learning_rate": 0.00014938068889559526, + "loss": 1.0716, + "step": 17445 + }, + { + "epoch": 0.6717998075072185, + "grad_norm": 1.37469482421875, + "learning_rate": 0.00014935439113596298, + "loss": 1.1524, + "step": 17450 + }, + { + "epoch": 0.6719923002887391, + "grad_norm": 1.189141035079956, + "learning_rate": 0.0001493280888631986, + "loss": 1.1097, + "step": 17455 + }, + { + "epoch": 0.6721847930702599, + "grad_norm": 1.5825908184051514, + "learning_rate": 0.00014930178207970727, + "loss": 1.2842, + "step": 17460 + }, + { + "epoch": 0.6723772858517806, + "grad_norm": 1.1093425750732422, + "learning_rate": 0.00014927547078789452, + "loss": 1.0679, + "step": 17465 + }, + { + "epoch": 0.6725697786333013, + "grad_norm": 1.3306807279586792, + "learning_rate": 0.00014924915499016646, + "loss": 1.2877, + "step": 17470 + }, + { + "epoch": 0.6727622714148219, + "grad_norm": 1.9391852617263794, + "learning_rate": 0.00014922283468892935, + "loss": 1.1743, + "step": 17475 + }, + { + "epoch": 0.6729547641963426, + "grad_norm": 1.5213755369186401, + "learning_rate": 0.0001491965098865901, + "loss": 1.1793, + "step": 17480 + }, + { + "epoch": 0.6731472569778634, + "grad_norm": 1.6637414693832397, + "learning_rate": 0.00014917018058555593, + "loss": 1.1441, + "step": 17485 + }, + { + "epoch": 0.673339749759384, + "grad_norm": 1.7859970331192017, + "learning_rate": 0.00014914384678823447, + "loss": 1.1376, + "step": 17490 + }, + { + "epoch": 0.6735322425409047, + "grad_norm": 0.9251899719238281, + "learning_rate": 0.00014911750849703378, + "loss": 1.0523, + "step": 17495 + }, + { + "epoch": 0.6737247353224254, + "grad_norm": 2.6382827758789062, + "learning_rate": 0.00014909116571436228, + "loss": 1.311, + "step": 17500 + }, + { + "epoch": 0.673917228103946, + "grad_norm": 2.1472413539886475, + "learning_rate": 0.00014906481844262888, + "loss": 1.3515, + "step": 17505 + }, + { + "epoch": 0.6741097208854668, + "grad_norm": 1.6070085763931274, + "learning_rate": 0.0001490384666842429, + "loss": 1.121, + "step": 17510 + }, + { + "epoch": 0.6743022136669875, + "grad_norm": 1.637009620666504, + "learning_rate": 0.00014901211044161393, + "loss": 1.1249, + "step": 17515 + }, + { + "epoch": 0.6744947064485082, + "grad_norm": 1.4050389528274536, + "learning_rate": 0.00014898574971715218, + "loss": 1.1719, + "step": 17520 + }, + { + "epoch": 0.6746871992300288, + "grad_norm": 1.7863889932632446, + "learning_rate": 0.0001489593845132681, + "loss": 1.2576, + "step": 17525 + }, + { + "epoch": 0.6748796920115495, + "grad_norm": 1.149431586265564, + "learning_rate": 0.00014893301483237263, + "loss": 1.0863, + "step": 17530 + }, + { + "epoch": 0.6750721847930703, + "grad_norm": 1.4066704511642456, + "learning_rate": 0.0001489066406768771, + "loss": 1.1338, + "step": 17535 + }, + { + "epoch": 0.675264677574591, + "grad_norm": 1.2270228862762451, + "learning_rate": 0.00014888026204919327, + "loss": 1.1118, + "step": 17540 + }, + { + "epoch": 0.6754571703561116, + "grad_norm": 1.6182643175125122, + "learning_rate": 0.0001488538789517333, + "loss": 1.3269, + "step": 17545 + }, + { + "epoch": 0.6756496631376323, + "grad_norm": 2.3642048835754395, + "learning_rate": 0.0001488274913869097, + "loss": 1.439, + "step": 17550 + }, + { + "epoch": 0.6758421559191531, + "grad_norm": 1.8097171783447266, + "learning_rate": 0.00014880109935713548, + "loss": 1.093, + "step": 17555 + }, + { + "epoch": 0.6760346487006738, + "grad_norm": 0.8650147914886475, + "learning_rate": 0.00014877470286482397, + "loss": 1.0413, + "step": 17560 + }, + { + "epoch": 0.6762271414821944, + "grad_norm": 1.2217522859573364, + "learning_rate": 0.00014874830191238903, + "loss": 1.1818, + "step": 17565 + }, + { + "epoch": 0.6764196342637151, + "grad_norm": 1.1500258445739746, + "learning_rate": 0.00014872189650224477, + "loss": 1.0607, + "step": 17570 + }, + { + "epoch": 0.6766121270452358, + "grad_norm": 1.1867146492004395, + "learning_rate": 0.00014869548663680584, + "loss": 0.9716, + "step": 17575 + }, + { + "epoch": 0.6768046198267565, + "grad_norm": 1.0046483278274536, + "learning_rate": 0.00014866907231848723, + "loss": 1.1875, + "step": 17580 + }, + { + "epoch": 0.6769971126082772, + "grad_norm": 2.1072323322296143, + "learning_rate": 0.00014864265354970436, + "loss": 1.194, + "step": 17585 + }, + { + "epoch": 0.6771896053897979, + "grad_norm": 1.4290494918823242, + "learning_rate": 0.00014861623033287307, + "loss": 1.2389, + "step": 17590 + }, + { + "epoch": 0.6773820981713186, + "grad_norm": 0.8890597820281982, + "learning_rate": 0.00014858980267040957, + "loss": 0.9362, + "step": 17595 + }, + { + "epoch": 0.6775745909528392, + "grad_norm": 0.9515128135681152, + "learning_rate": 0.00014856337056473045, + "loss": 1.039, + "step": 17600 + }, + { + "epoch": 0.67776708373436, + "grad_norm": 1.540008544921875, + "learning_rate": 0.00014853693401825283, + "loss": 1.1778, + "step": 17605 + }, + { + "epoch": 0.6779595765158807, + "grad_norm": 1.0766023397445679, + "learning_rate": 0.00014851049303339414, + "loss": 0.9362, + "step": 17610 + }, + { + "epoch": 0.6781520692974013, + "grad_norm": 1.854201078414917, + "learning_rate": 0.00014848404761257217, + "loss": 0.9427, + "step": 17615 + }, + { + "epoch": 0.678344562078922, + "grad_norm": 2.292722463607788, + "learning_rate": 0.00014845759775820527, + "loss": 1.0835, + "step": 17620 + }, + { + "epoch": 0.6785370548604427, + "grad_norm": 1.768997311592102, + "learning_rate": 0.00014843114347271204, + "loss": 1.0976, + "step": 17625 + }, + { + "epoch": 0.6787295476419635, + "grad_norm": 2.223881721496582, + "learning_rate": 0.00014840468475851154, + "loss": 1.1417, + "step": 17630 + }, + { + "epoch": 0.6789220404234841, + "grad_norm": 1.1589646339416504, + "learning_rate": 0.0001483782216180233, + "loss": 1.2536, + "step": 17635 + }, + { + "epoch": 0.6791145332050048, + "grad_norm": 1.6478285789489746, + "learning_rate": 0.00014835175405366718, + "loss": 1.1534, + "step": 17640 + }, + { + "epoch": 0.6793070259865255, + "grad_norm": 1.6837091445922852, + "learning_rate": 0.00014832528206786344, + "loss": 1.3415, + "step": 17645 + }, + { + "epoch": 0.6794995187680462, + "grad_norm": 1.6697105169296265, + "learning_rate": 0.00014829880566303273, + "loss": 1.0241, + "step": 17650 + }, + { + "epoch": 0.6796920115495669, + "grad_norm": 1.08551025390625, + "learning_rate": 0.00014827232484159624, + "loss": 1.2322, + "step": 17655 + }, + { + "epoch": 0.6798845043310876, + "grad_norm": 1.9399616718292236, + "learning_rate": 0.00014824583960597543, + "loss": 1.393, + "step": 17660 + }, + { + "epoch": 0.6800769971126083, + "grad_norm": 1.0628485679626465, + "learning_rate": 0.00014821934995859216, + "loss": 1.2078, + "step": 17665 + }, + { + "epoch": 0.6802694898941289, + "grad_norm": 0.9613144397735596, + "learning_rate": 0.00014819285590186875, + "loss": 1.1234, + "step": 17670 + }, + { + "epoch": 0.6804619826756496, + "grad_norm": 0.9686816930770874, + "learning_rate": 0.00014816635743822795, + "loss": 1.1959, + "step": 17675 + }, + { + "epoch": 0.6806544754571704, + "grad_norm": 1.4415709972381592, + "learning_rate": 0.00014813985457009282, + "loss": 1.0775, + "step": 17680 + }, + { + "epoch": 0.6808469682386911, + "grad_norm": 1.5800002813339233, + "learning_rate": 0.00014811334729988688, + "loss": 1.0802, + "step": 17685 + }, + { + "epoch": 0.6810394610202117, + "grad_norm": 1.1061028242111206, + "learning_rate": 0.0001480868356300341, + "loss": 1.0415, + "step": 17690 + }, + { + "epoch": 0.6812319538017324, + "grad_norm": 2.3262946605682373, + "learning_rate": 0.00014806031956295868, + "loss": 1.2431, + "step": 17695 + }, + { + "epoch": 0.6814244465832531, + "grad_norm": 1.6517562866210938, + "learning_rate": 0.00014803379910108543, + "loss": 1.1792, + "step": 17700 + }, + { + "epoch": 0.6816169393647739, + "grad_norm": 1.3823506832122803, + "learning_rate": 0.00014800727424683948, + "loss": 1.1293, + "step": 17705 + }, + { + "epoch": 0.6818094321462945, + "grad_norm": 1.5448585748672485, + "learning_rate": 0.00014798074500264627, + "loss": 1.2126, + "step": 17710 + }, + { + "epoch": 0.6820019249278152, + "grad_norm": 1.2395973205566406, + "learning_rate": 0.0001479542113709318, + "loss": 1.3002, + "step": 17715 + }, + { + "epoch": 0.6821944177093359, + "grad_norm": 1.8366637229919434, + "learning_rate": 0.00014792767335412233, + "loss": 1.1798, + "step": 17720 + }, + { + "epoch": 0.6823869104908566, + "grad_norm": 1.3830804824829102, + "learning_rate": 0.00014790113095464465, + "loss": 1.3001, + "step": 17725 + }, + { + "epoch": 0.6825794032723773, + "grad_norm": 10.001764297485352, + "learning_rate": 0.0001478745841749259, + "loss": 1.1643, + "step": 17730 + }, + { + "epoch": 0.682771896053898, + "grad_norm": 1.0113561153411865, + "learning_rate": 0.00014784803301739352, + "loss": 1.1725, + "step": 17735 + }, + { + "epoch": 0.6829643888354187, + "grad_norm": 2.7240827083587646, + "learning_rate": 0.00014782147748447554, + "loss": 1.2348, + "step": 17740 + }, + { + "epoch": 0.6831568816169393, + "grad_norm": 1.0802150964736938, + "learning_rate": 0.00014779491757860015, + "loss": 1.3556, + "step": 17745 + }, + { + "epoch": 0.6833493743984601, + "grad_norm": 1.6339032649993896, + "learning_rate": 0.00014776835330219623, + "loss": 0.9967, + "step": 17750 + }, + { + "epoch": 0.6835418671799808, + "grad_norm": 1.6983892917633057, + "learning_rate": 0.0001477417846576928, + "loss": 1.041, + "step": 17755 + }, + { + "epoch": 0.6837343599615014, + "grad_norm": 1.6230486631393433, + "learning_rate": 0.00014771521164751942, + "loss": 1.2298, + "step": 17760 + }, + { + "epoch": 0.6839268527430221, + "grad_norm": 1.1079175472259521, + "learning_rate": 0.00014768863427410604, + "loss": 1.214, + "step": 17765 + }, + { + "epoch": 0.6841193455245428, + "grad_norm": 1.1601203680038452, + "learning_rate": 0.00014766205253988294, + "loss": 1.2399, + "step": 17770 + }, + { + "epoch": 0.6843118383060636, + "grad_norm": 2.2776849269866943, + "learning_rate": 0.00014763546644728088, + "loss": 1.0071, + "step": 17775 + }, + { + "epoch": 0.6845043310875842, + "grad_norm": 1.362021565437317, + "learning_rate": 0.00014760887599873094, + "loss": 1.1233, + "step": 17780 + }, + { + "epoch": 0.6846968238691049, + "grad_norm": 1.933518409729004, + "learning_rate": 0.00014758228119666472, + "loss": 1.0854, + "step": 17785 + }, + { + "epoch": 0.6848893166506256, + "grad_norm": 1.148533582687378, + "learning_rate": 0.00014755568204351407, + "loss": 1.0694, + "step": 17790 + }, + { + "epoch": 0.6850818094321462, + "grad_norm": 1.2880831956863403, + "learning_rate": 0.0001475290785417113, + "loss": 1.0814, + "step": 17795 + }, + { + "epoch": 0.685274302213667, + "grad_norm": 1.5790437459945679, + "learning_rate": 0.0001475024706936892, + "loss": 1.0467, + "step": 17800 + }, + { + "epoch": 0.6854667949951877, + "grad_norm": 1.636828899383545, + "learning_rate": 0.0001474758585018808, + "loss": 1.3419, + "step": 17805 + }, + { + "epoch": 0.6856592877767084, + "grad_norm": 1.0403766632080078, + "learning_rate": 0.00014744924196871963, + "loss": 1.1468, + "step": 17810 + }, + { + "epoch": 0.685851780558229, + "grad_norm": 1.1266472339630127, + "learning_rate": 0.0001474226210966396, + "loss": 1.2723, + "step": 17815 + }, + { + "epoch": 0.6860442733397497, + "grad_norm": 1.352543830871582, + "learning_rate": 0.00014739599588807506, + "loss": 1.1345, + "step": 17820 + }, + { + "epoch": 0.6862367661212705, + "grad_norm": 1.674023151397705, + "learning_rate": 0.00014736936634546062, + "loss": 1.2522, + "step": 17825 + }, + { + "epoch": 0.6864292589027912, + "grad_norm": 1.3684656620025635, + "learning_rate": 0.00014734273247123144, + "loss": 1.1169, + "step": 17830 + }, + { + "epoch": 0.6866217516843118, + "grad_norm": 1.917075514793396, + "learning_rate": 0.00014731609426782297, + "loss": 1.2523, + "step": 17835 + }, + { + "epoch": 0.6868142444658325, + "grad_norm": 1.5463966131210327, + "learning_rate": 0.00014728945173767116, + "loss": 0.9929, + "step": 17840 + }, + { + "epoch": 0.6870067372473532, + "grad_norm": 1.7427698373794556, + "learning_rate": 0.00014726280488321222, + "loss": 1.22, + "step": 17845 + }, + { + "epoch": 0.687199230028874, + "grad_norm": 1.8021422624588013, + "learning_rate": 0.0001472361537068829, + "loss": 1.0429, + "step": 17850 + }, + { + "epoch": 0.6873917228103946, + "grad_norm": 1.571053147315979, + "learning_rate": 0.0001472094982111202, + "loss": 1.17, + "step": 17855 + }, + { + "epoch": 0.6875842155919153, + "grad_norm": 1.3607596158981323, + "learning_rate": 0.00014718283839836166, + "loss": 1.0644, + "step": 17860 + }, + { + "epoch": 0.687776708373436, + "grad_norm": 0.9396845102310181, + "learning_rate": 0.00014715617427104504, + "loss": 1.0807, + "step": 17865 + }, + { + "epoch": 0.6879692011549567, + "grad_norm": 1.605432152748108, + "learning_rate": 0.00014712950583160872, + "loss": 1.0641, + "step": 17870 + }, + { + "epoch": 0.6881616939364774, + "grad_norm": 1.4847965240478516, + "learning_rate": 0.0001471081679769722, + "loss": 1.1625, + "step": 17875 + }, + { + "epoch": 0.6883541867179981, + "grad_norm": 1.930336594581604, + "learning_rate": 0.00014708149178186593, + "loss": 1.3346, + "step": 17880 + }, + { + "epoch": 0.6885466794995188, + "grad_norm": 1.7398570775985718, + "learning_rate": 0.00014705481128146917, + "loss": 1.2316, + "step": 17885 + }, + { + "epoch": 0.6887391722810394, + "grad_norm": 1.5817015171051025, + "learning_rate": 0.00014702812647822162, + "loss": 1.0292, + "step": 17890 + }, + { + "epoch": 0.6889316650625602, + "grad_norm": 3.2520430088043213, + "learning_rate": 0.00014700143737456342, + "loss": 1.088, + "step": 17895 + }, + { + "epoch": 0.6891241578440809, + "grad_norm": 2.165456533432007, + "learning_rate": 0.00014697474397293517, + "loss": 0.9452, + "step": 17900 + }, + { + "epoch": 0.6893166506256015, + "grad_norm": 0.9637191295623779, + "learning_rate": 0.00014694804627577771, + "loss": 1.266, + "step": 17905 + }, + { + "epoch": 0.6895091434071222, + "grad_norm": 1.9606934785842896, + "learning_rate": 0.00014692134428553248, + "loss": 1.0773, + "step": 17910 + }, + { + "epoch": 0.6897016361886429, + "grad_norm": 1.1911338567733765, + "learning_rate": 0.0001468946380046411, + "loss": 1.1359, + "step": 17915 + }, + { + "epoch": 0.6898941289701637, + "grad_norm": 1.3913235664367676, + "learning_rate": 0.00014686792743554575, + "loss": 1.3053, + "step": 17920 + }, + { + "epoch": 0.6900866217516843, + "grad_norm": 1.2314075231552124, + "learning_rate": 0.00014684121258068888, + "loss": 1.0624, + "step": 17925 + }, + { + "epoch": 0.690279114533205, + "grad_norm": 2.1499176025390625, + "learning_rate": 0.00014681449344251338, + "loss": 1.2147, + "step": 17930 + }, + { + "epoch": 0.6904716073147257, + "grad_norm": 1.6417664289474487, + "learning_rate": 0.00014678777002346264, + "loss": 1.1139, + "step": 17935 + }, + { + "epoch": 0.6906641000962463, + "grad_norm": 1.181154727935791, + "learning_rate": 0.00014676104232598026, + "loss": 1.0503, + "step": 17940 + }, + { + "epoch": 0.6908565928777671, + "grad_norm": 1.7786331176757812, + "learning_rate": 0.00014673431035251027, + "loss": 1.05, + "step": 17945 + }, + { + "epoch": 0.6910490856592878, + "grad_norm": 0.948625922203064, + "learning_rate": 0.00014670757410549724, + "loss": 1.0888, + "step": 17950 + }, + { + "epoch": 0.6912415784408085, + "grad_norm": 1.9812164306640625, + "learning_rate": 0.00014668083358738597, + "loss": 1.1467, + "step": 17955 + }, + { + "epoch": 0.6914340712223291, + "grad_norm": 0.9091313481330872, + "learning_rate": 0.0001466540888006217, + "loss": 1.1226, + "step": 17960 + }, + { + "epoch": 0.6916265640038498, + "grad_norm": 2.100114583969116, + "learning_rate": 0.00014662733974765005, + "loss": 1.1233, + "step": 17965 + }, + { + "epoch": 0.6918190567853706, + "grad_norm": 2.0999033451080322, + "learning_rate": 0.00014660058643091702, + "loss": 1.086, + "step": 17970 + }, + { + "epoch": 0.6920115495668913, + "grad_norm": 1.543411374092102, + "learning_rate": 0.0001465738288528691, + "loss": 1.218, + "step": 17975 + }, + { + "epoch": 0.6922040423484119, + "grad_norm": 2.6429097652435303, + "learning_rate": 0.00014654706701595305, + "loss": 1.1425, + "step": 17980 + }, + { + "epoch": 0.6923965351299326, + "grad_norm": 1.258535385131836, + "learning_rate": 0.00014652030092261606, + "loss": 1.124, + "step": 17985 + }, + { + "epoch": 0.6925890279114533, + "grad_norm": 0.9203128814697266, + "learning_rate": 0.00014649353057530573, + "loss": 1.0035, + "step": 17990 + }, + { + "epoch": 0.692781520692974, + "grad_norm": 1.7482789754867554, + "learning_rate": 0.00014646675597647003, + "loss": 1.2393, + "step": 17995 + }, + { + "epoch": 0.6929740134744947, + "grad_norm": 1.3026279211044312, + "learning_rate": 0.0001464399771285573, + "loss": 1.294, + "step": 18000 + }, + { + "epoch": 0.6931665062560154, + "grad_norm": 1.5518649816513062, + "learning_rate": 0.00014641319403401628, + "loss": 1.2397, + "step": 18005 + }, + { + "epoch": 0.6933589990375361, + "grad_norm": 1.3904852867126465, + "learning_rate": 0.00014638640669529615, + "loss": 1.079, + "step": 18010 + }, + { + "epoch": 0.6935514918190567, + "grad_norm": 0.7677931189537048, + "learning_rate": 0.0001463596151148464, + "loss": 1.1485, + "step": 18015 + }, + { + "epoch": 0.6937439846005775, + "grad_norm": 1.1935845613479614, + "learning_rate": 0.00014633281929511696, + "loss": 1.167, + "step": 18020 + }, + { + "epoch": 0.6939364773820982, + "grad_norm": 1.8612521886825562, + "learning_rate": 0.00014630601923855814, + "loss": 1.2335, + "step": 18025 + }, + { + "epoch": 0.6941289701636189, + "grad_norm": 1.9979881048202515, + "learning_rate": 0.00014627921494762055, + "loss": 1.0421, + "step": 18030 + }, + { + "epoch": 0.6943214629451395, + "grad_norm": 1.9426400661468506, + "learning_rate": 0.00014625240642475538, + "loss": 1.0918, + "step": 18035 + }, + { + "epoch": 0.6945139557266603, + "grad_norm": 0.9990954399108887, + "learning_rate": 0.000146225593672414, + "loss": 1.3456, + "step": 18040 + }, + { + "epoch": 0.694706448508181, + "grad_norm": 2.187206745147705, + "learning_rate": 0.00014619877669304834, + "loss": 1.0926, + "step": 18045 + }, + { + "epoch": 0.6948989412897016, + "grad_norm": 1.5417639017105103, + "learning_rate": 0.00014617195548911053, + "loss": 1.2796, + "step": 18050 + }, + { + "epoch": 0.6950914340712223, + "grad_norm": 1.476150631904602, + "learning_rate": 0.0001461451300630533, + "loss": 1.1623, + "step": 18055 + }, + { + "epoch": 0.695283926852743, + "grad_norm": 1.6524615287780762, + "learning_rate": 0.0001461183004173296, + "loss": 1.0976, + "step": 18060 + }, + { + "epoch": 0.6954764196342638, + "grad_norm": 1.4800169467926025, + "learning_rate": 0.0001460914665543928, + "loss": 1.2641, + "step": 18065 + }, + { + "epoch": 0.6956689124157844, + "grad_norm": 1.2046303749084473, + "learning_rate": 0.00014606462847669674, + "loss": 1.0037, + "step": 18070 + }, + { + "epoch": 0.6958614051973051, + "grad_norm": 1.3457711935043335, + "learning_rate": 0.00014603778618669556, + "loss": 1.1599, + "step": 18075 + }, + { + "epoch": 0.6960538979788258, + "grad_norm": 1.8690896034240723, + "learning_rate": 0.0001460109396868438, + "loss": 1.3016, + "step": 18080 + }, + { + "epoch": 0.6962463907603464, + "grad_norm": 0.8788353204727173, + "learning_rate": 0.00014598408897959639, + "loss": 1.0261, + "step": 18085 + }, + { + "epoch": 0.6964388835418672, + "grad_norm": 1.064239501953125, + "learning_rate": 0.00014595723406740868, + "loss": 1.1159, + "step": 18090 + }, + { + "epoch": 0.6966313763233879, + "grad_norm": 0.9102209210395813, + "learning_rate": 0.00014593037495273635, + "loss": 1.1263, + "step": 18095 + }, + { + "epoch": 0.6968238691049086, + "grad_norm": 1.4841855764389038, + "learning_rate": 0.00014590351163803545, + "loss": 1.0526, + "step": 18100 + }, + { + "epoch": 0.6970163618864292, + "grad_norm": 2.282543182373047, + "learning_rate": 0.00014587664412576254, + "loss": 1.0876, + "step": 18105 + }, + { + "epoch": 0.6972088546679499, + "grad_norm": 1.149782657623291, + "learning_rate": 0.0001458497724183744, + "loss": 1.2092, + "step": 18110 + }, + { + "epoch": 0.6974013474494707, + "grad_norm": 1.6531153917312622, + "learning_rate": 0.0001458228965183283, + "loss": 1.2421, + "step": 18115 + }, + { + "epoch": 0.6975938402309914, + "grad_norm": 2.376281976699829, + "learning_rate": 0.00014579601642808192, + "loss": 1.2179, + "step": 18120 + }, + { + "epoch": 0.697786333012512, + "grad_norm": 1.9077723026275635, + "learning_rate": 0.0001457691321500932, + "loss": 1.1962, + "step": 18125 + }, + { + "epoch": 0.6979788257940327, + "grad_norm": 1.3130842447280884, + "learning_rate": 0.00014574224368682048, + "loss": 1.3169, + "step": 18130 + }, + { + "epoch": 0.6981713185755534, + "grad_norm": 1.0211979150772095, + "learning_rate": 0.00014571535104072262, + "loss": 1.0256, + "step": 18135 + }, + { + "epoch": 0.6983638113570741, + "grad_norm": 1.7479397058486938, + "learning_rate": 0.00014568845421425875, + "loss": 1.0906, + "step": 18140 + }, + { + "epoch": 0.6985563041385948, + "grad_norm": 1.3305407762527466, + "learning_rate": 0.00014566155320988838, + "loss": 1.206, + "step": 18145 + }, + { + "epoch": 0.6987487969201155, + "grad_norm": 1.2185992002487183, + "learning_rate": 0.00014563464803007145, + "loss": 1.2765, + "step": 18150 + }, + { + "epoch": 0.6989412897016362, + "grad_norm": 1.3256112337112427, + "learning_rate": 0.00014560773867726827, + "loss": 1.0899, + "step": 18155 + }, + { + "epoch": 0.6991337824831568, + "grad_norm": 1.9090956449508667, + "learning_rate": 0.0001455808251539395, + "loss": 1.1944, + "step": 18160 + }, + { + "epoch": 0.6993262752646776, + "grad_norm": 1.078116774559021, + "learning_rate": 0.00014555390746254622, + "loss": 1.1393, + "step": 18165 + }, + { + "epoch": 0.6995187680461983, + "grad_norm": 1.21144437789917, + "learning_rate": 0.00014552698560554988, + "loss": 1.0835, + "step": 18170 + }, + { + "epoch": 0.699711260827719, + "grad_norm": 1.4013081789016724, + "learning_rate": 0.00014550005958541227, + "loss": 1.0785, + "step": 18175 + }, + { + "epoch": 0.6999037536092396, + "grad_norm": 1.102122187614441, + "learning_rate": 0.00014547312940459562, + "loss": 1.0839, + "step": 18180 + }, + { + "epoch": 0.7000962463907604, + "grad_norm": 1.602994680404663, + "learning_rate": 0.00014544619506556256, + "loss": 1.2608, + "step": 18185 + }, + { + "epoch": 0.7002887391722811, + "grad_norm": 2.8694801330566406, + "learning_rate": 0.000145419256570776, + "loss": 1.3161, + "step": 18190 + }, + { + "epoch": 0.7004812319538017, + "grad_norm": 1.5687551498413086, + "learning_rate": 0.00014539231392269927, + "loss": 1.0668, + "step": 18195 + }, + { + "epoch": 0.7006737247353224, + "grad_norm": 1.1013094186782837, + "learning_rate": 0.00014536536712379618, + "loss": 1.0829, + "step": 18200 + }, + { + "epoch": 0.7008662175168431, + "grad_norm": 1.4294344186782837, + "learning_rate": 0.00014533841617653075, + "loss": 1.0003, + "step": 18205 + }, + { + "epoch": 0.7010587102983639, + "grad_norm": 1.168997049331665, + "learning_rate": 0.0001453114610833675, + "loss": 1.2252, + "step": 18210 + }, + { + "epoch": 0.7012512030798845, + "grad_norm": 1.21929132938385, + "learning_rate": 0.0001452845018467713, + "loss": 1.102, + "step": 18215 + }, + { + "epoch": 0.7014436958614052, + "grad_norm": 1.0682016611099243, + "learning_rate": 0.00014525753846920738, + "loss": 1.0219, + "step": 18220 + }, + { + "epoch": 0.7016361886429259, + "grad_norm": 1.210161566734314, + "learning_rate": 0.00014523057095314142, + "loss": 1.0666, + "step": 18225 + }, + { + "epoch": 0.7018286814244465, + "grad_norm": 0.9966996312141418, + "learning_rate": 0.0001452035993010393, + "loss": 1.1343, + "step": 18230 + }, + { + "epoch": 0.7020211742059673, + "grad_norm": 1.2477959394454956, + "learning_rate": 0.00014517662351536752, + "loss": 1.2147, + "step": 18235 + }, + { + "epoch": 0.702213666987488, + "grad_norm": 1.8020172119140625, + "learning_rate": 0.00014514964359859276, + "loss": 1.1945, + "step": 18240 + }, + { + "epoch": 0.7024061597690087, + "grad_norm": 1.0535303354263306, + "learning_rate": 0.0001451226595531822, + "loss": 1.0792, + "step": 18245 + }, + { + "epoch": 0.7025986525505293, + "grad_norm": 1.913590431213379, + "learning_rate": 0.0001450956713816033, + "loss": 1.1344, + "step": 18250 + }, + { + "epoch": 0.70279114533205, + "grad_norm": 0.998621940612793, + "learning_rate": 0.00014506867908632403, + "loss": 1.1139, + "step": 18255 + }, + { + "epoch": 0.7029836381135708, + "grad_norm": 1.8913546800613403, + "learning_rate": 0.0001450416826698126, + "loss": 1.0621, + "step": 18260 + }, + { + "epoch": 0.7031761308950915, + "grad_norm": 1.0329716205596924, + "learning_rate": 0.00014501468213453763, + "loss": 1.2732, + "step": 18265 + }, + { + "epoch": 0.7033686236766121, + "grad_norm": 0.9243387579917908, + "learning_rate": 0.0001449876774829682, + "loss": 1.2272, + "step": 18270 + }, + { + "epoch": 0.7035611164581328, + "grad_norm": 1.6289262771606445, + "learning_rate": 0.0001449606687175737, + "loss": 1.0912, + "step": 18275 + }, + { + "epoch": 0.7037536092396535, + "grad_norm": 2.005293607711792, + "learning_rate": 0.00014493365584082384, + "loss": 1.018, + "step": 18280 + }, + { + "epoch": 0.7039461020211742, + "grad_norm": 1.2743504047393799, + "learning_rate": 0.00014490663885518881, + "loss": 1.0026, + "step": 18285 + }, + { + "epoch": 0.7041385948026949, + "grad_norm": 1.4915635585784912, + "learning_rate": 0.00014487961776313922, + "loss": 1.0489, + "step": 18290 + }, + { + "epoch": 0.7043310875842156, + "grad_norm": 0.9605044722557068, + "learning_rate": 0.00014485259256714577, + "loss": 1.1053, + "step": 18295 + }, + { + "epoch": 0.7045235803657363, + "grad_norm": 1.8121784925460815, + "learning_rate": 0.0001448255632696799, + "loss": 1.1862, + "step": 18300 + }, + { + "epoch": 0.7047160731472569, + "grad_norm": 1.2540571689605713, + "learning_rate": 0.00014479852987321322, + "loss": 1.1361, + "step": 18305 + }, + { + "epoch": 0.7049085659287777, + "grad_norm": 1.4160270690917969, + "learning_rate": 0.00014477149238021776, + "loss": 1.0917, + "step": 18310 + }, + { + "epoch": 0.7051010587102984, + "grad_norm": 1.4298075437545776, + "learning_rate": 0.0001447444507931659, + "loss": 1.1407, + "step": 18315 + }, + { + "epoch": 0.705293551491819, + "grad_norm": 1.0214334726333618, + "learning_rate": 0.00014471740511453037, + "loss": 1.0714, + "step": 18320 + }, + { + "epoch": 0.7054860442733397, + "grad_norm": 1.6246428489685059, + "learning_rate": 0.00014469035534678444, + "loss": 1.258, + "step": 18325 + }, + { + "epoch": 0.7056785370548604, + "grad_norm": 1.5467473268508911, + "learning_rate": 0.0001446633014924015, + "loss": 1.1811, + "step": 18330 + }, + { + "epoch": 0.7058710298363812, + "grad_norm": 2.038041114807129, + "learning_rate": 0.00014463624355385557, + "loss": 1.1339, + "step": 18335 + }, + { + "epoch": 0.7060635226179018, + "grad_norm": 1.5328725576400757, + "learning_rate": 0.0001446091815336208, + "loss": 1.2261, + "step": 18340 + }, + { + "epoch": 0.7062560153994225, + "grad_norm": 0.9550712704658508, + "learning_rate": 0.0001445821154341719, + "loss": 1.0973, + "step": 18345 + }, + { + "epoch": 0.7064485081809432, + "grad_norm": 1.4610974788665771, + "learning_rate": 0.0001445550452579839, + "loss": 1.2341, + "step": 18350 + }, + { + "epoch": 0.706641000962464, + "grad_norm": 1.9539941549301147, + "learning_rate": 0.00014452797100753212, + "loss": 1.1115, + "step": 18355 + }, + { + "epoch": 0.7068334937439846, + "grad_norm": 1.136670708656311, + "learning_rate": 0.0001445008926852924, + "loss": 1.1883, + "step": 18360 + }, + { + "epoch": 0.7070259865255053, + "grad_norm": 1.2136088609695435, + "learning_rate": 0.00014447381029374082, + "loss": 1.1384, + "step": 18365 + }, + { + "epoch": 0.707218479307026, + "grad_norm": 1.3836339712142944, + "learning_rate": 0.00014444672383535388, + "loss": 1.2371, + "step": 18370 + }, + { + "epoch": 0.7074109720885466, + "grad_norm": 1.6226662397384644, + "learning_rate": 0.00014441963331260848, + "loss": 1.3057, + "step": 18375 + }, + { + "epoch": 0.7076034648700674, + "grad_norm": 1.249576449394226, + "learning_rate": 0.0001443925387279819, + "loss": 1.0849, + "step": 18380 + }, + { + "epoch": 0.7077959576515881, + "grad_norm": 1.9330114126205444, + "learning_rate": 0.0001443654400839517, + "loss": 0.9933, + "step": 18385 + }, + { + "epoch": 0.7079884504331088, + "grad_norm": 1.4878582954406738, + "learning_rate": 0.0001443383373829959, + "loss": 0.8842, + "step": 18390 + }, + { + "epoch": 0.7081809432146294, + "grad_norm": 2.3553292751312256, + "learning_rate": 0.00014431123062759286, + "loss": 1.1733, + "step": 18395 + }, + { + "epoch": 0.7083734359961501, + "grad_norm": 0.8834003210067749, + "learning_rate": 0.00014428411982022135, + "loss": 1.1275, + "step": 18400 + }, + { + "epoch": 0.7085659287776709, + "grad_norm": 1.331040620803833, + "learning_rate": 0.00014425700496336038, + "loss": 1.0753, + "step": 18405 + }, + { + "epoch": 0.7087584215591916, + "grad_norm": 1.0972214937210083, + "learning_rate": 0.0001442298860594895, + "loss": 1.2045, + "step": 18410 + }, + { + "epoch": 0.7089509143407122, + "grad_norm": 1.5350794792175293, + "learning_rate": 0.00014420276311108857, + "loss": 1.0097, + "step": 18415 + }, + { + "epoch": 0.7091434071222329, + "grad_norm": 1.8360435962677002, + "learning_rate": 0.00014417563612063777, + "loss": 1.177, + "step": 18420 + }, + { + "epoch": 0.7093358999037536, + "grad_norm": 1.0898863077163696, + "learning_rate": 0.00014414850509061764, + "loss": 1.0374, + "step": 18425 + }, + { + "epoch": 0.7095283926852743, + "grad_norm": 1.2654744386672974, + "learning_rate": 0.00014412137002350919, + "loss": 1.1494, + "step": 18430 + }, + { + "epoch": 0.709720885466795, + "grad_norm": 1.8603087663650513, + "learning_rate": 0.00014409423092179375, + "loss": 1.2723, + "step": 18435 + }, + { + "epoch": 0.7099133782483157, + "grad_norm": 0.9974476099014282, + "learning_rate": 0.00014406708778795296, + "loss": 1.1139, + "step": 18440 + }, + { + "epoch": 0.7101058710298364, + "grad_norm": 0.998330295085907, + "learning_rate": 0.00014403994062446893, + "loss": 1.2881, + "step": 18445 + }, + { + "epoch": 0.710298363811357, + "grad_norm": 2.04758882522583, + "learning_rate": 0.00014401278943382406, + "loss": 1.0089, + "step": 18450 + }, + { + "epoch": 0.7104908565928778, + "grad_norm": 1.301059603691101, + "learning_rate": 0.0001439856342185012, + "loss": 1.1405, + "step": 18455 + }, + { + "epoch": 0.7106833493743985, + "grad_norm": 1.684041142463684, + "learning_rate": 0.00014395847498098338, + "loss": 1.1387, + "step": 18460 + }, + { + "epoch": 0.7108758421559191, + "grad_norm": 1.95292067527771, + "learning_rate": 0.0001439313117237543, + "loss": 1.1659, + "step": 18465 + }, + { + "epoch": 0.7110683349374398, + "grad_norm": 1.1917790174484253, + "learning_rate": 0.00014390414444929775, + "loss": 1.0497, + "step": 18470 + }, + { + "epoch": 0.7112608277189605, + "grad_norm": 1.1583658456802368, + "learning_rate": 0.000143876973160098, + "loss": 1.2276, + "step": 18475 + }, + { + "epoch": 0.7114533205004813, + "grad_norm": 1.116721749305725, + "learning_rate": 0.00014384979785863976, + "loss": 1.3688, + "step": 18480 + }, + { + "epoch": 0.7116458132820019, + "grad_norm": 1.1651076078414917, + "learning_rate": 0.00014382261854740795, + "loss": 1.3093, + "step": 18485 + }, + { + "epoch": 0.7118383060635226, + "grad_norm": 1.2162317037582397, + "learning_rate": 0.00014379543522888798, + "loss": 1.1324, + "step": 18490 + }, + { + "epoch": 0.7120307988450433, + "grad_norm": 1.5792020559310913, + "learning_rate": 0.0001437682479055656, + "loss": 1.123, + "step": 18495 + }, + { + "epoch": 0.7122232916265641, + "grad_norm": 0.9636641144752502, + "learning_rate": 0.00014374105657992688, + "loss": 1.0547, + "step": 18500 + }, + { + "epoch": 0.7124157844080847, + "grad_norm": 1.1409319639205933, + "learning_rate": 0.00014371386125445828, + "loss": 1.1277, + "step": 18505 + }, + { + "epoch": 0.7126082771896054, + "grad_norm": 1.074267029762268, + "learning_rate": 0.00014368666193164664, + "loss": 1.1041, + "step": 18510 + }, + { + "epoch": 0.7128007699711261, + "grad_norm": 1.2324203252792358, + "learning_rate": 0.00014365945861397918, + "loss": 1.1274, + "step": 18515 + }, + { + "epoch": 0.7129932627526467, + "grad_norm": 1.2441449165344238, + "learning_rate": 0.00014363225130394343, + "loss": 1.0739, + "step": 18520 + }, + { + "epoch": 0.7131857555341675, + "grad_norm": 1.0249239206314087, + "learning_rate": 0.00014360504000402737, + "loss": 1.1945, + "step": 18525 + }, + { + "epoch": 0.7133782483156882, + "grad_norm": 1.0297977924346924, + "learning_rate": 0.00014357782471671922, + "loss": 1.1694, + "step": 18530 + }, + { + "epoch": 0.7135707410972089, + "grad_norm": 1.6610252857208252, + "learning_rate": 0.00014355060544450767, + "loss": 1.2034, + "step": 18535 + }, + { + "epoch": 0.7137632338787295, + "grad_norm": 1.290869951248169, + "learning_rate": 0.0001435233821898818, + "loss": 1.1195, + "step": 18540 + }, + { + "epoch": 0.7139557266602502, + "grad_norm": 1.4730745553970337, + "learning_rate": 0.0001434961549553309, + "loss": 1.1237, + "step": 18545 + }, + { + "epoch": 0.714148219441771, + "grad_norm": 1.0857551097869873, + "learning_rate": 0.00014346892374334479, + "loss": 1.013, + "step": 18550 + }, + { + "epoch": 0.7143407122232917, + "grad_norm": 1.0761737823486328, + "learning_rate": 0.00014344168855641356, + "loss": 0.9948, + "step": 18555 + }, + { + "epoch": 0.7145332050048123, + "grad_norm": 2.012099027633667, + "learning_rate": 0.00014341444939702767, + "loss": 1.1598, + "step": 18560 + }, + { + "epoch": 0.714725697786333, + "grad_norm": 1.837538242340088, + "learning_rate": 0.000143387206267678, + "loss": 1.1389, + "step": 18565 + }, + { + "epoch": 0.7149181905678537, + "grad_norm": 1.1099295616149902, + "learning_rate": 0.0001433599591708557, + "loss": 1.0835, + "step": 18570 + }, + { + "epoch": 0.7151106833493744, + "grad_norm": 0.9746969938278198, + "learning_rate": 0.00014333270810905238, + "loss": 0.973, + "step": 18575 + }, + { + "epoch": 0.7153031761308951, + "grad_norm": 1.9786537885665894, + "learning_rate": 0.00014330545308475996, + "loss": 1.1564, + "step": 18580 + }, + { + "epoch": 0.7154956689124158, + "grad_norm": 1.020973801612854, + "learning_rate": 0.0001432781941004707, + "loss": 1.0202, + "step": 18585 + }, + { + "epoch": 0.7156881616939365, + "grad_norm": 1.2314329147338867, + "learning_rate": 0.0001432509311586773, + "loss": 1.2654, + "step": 18590 + }, + { + "epoch": 0.7158806544754571, + "grad_norm": 1.1897294521331787, + "learning_rate": 0.00014322366426187277, + "loss": 1.3241, + "step": 18595 + }, + { + "epoch": 0.7160731472569779, + "grad_norm": 1.2122468948364258, + "learning_rate": 0.00014319639341255048, + "loss": 1.0044, + "step": 18600 + }, + { + "epoch": 0.7162656400384986, + "grad_norm": 1.5471996068954468, + "learning_rate": 0.00014316911861320415, + "loss": 1.2251, + "step": 18605 + }, + { + "epoch": 0.7164581328200192, + "grad_norm": 1.4441865682601929, + "learning_rate": 0.00014314183986632788, + "loss": 1.1717, + "step": 18610 + }, + { + "epoch": 0.7166506256015399, + "grad_norm": 1.092637300491333, + "learning_rate": 0.00014311455717441616, + "loss": 1.0724, + "step": 18615 + }, + { + "epoch": 0.7168431183830606, + "grad_norm": 1.0974675416946411, + "learning_rate": 0.00014308727053996377, + "loss": 1.0623, + "step": 18620 + }, + { + "epoch": 0.7170356111645814, + "grad_norm": 1.513769507408142, + "learning_rate": 0.00014305997996546594, + "loss": 1.1027, + "step": 18625 + }, + { + "epoch": 0.717228103946102, + "grad_norm": 1.0637279748916626, + "learning_rate": 0.00014303268545341817, + "loss": 1.0313, + "step": 18630 + }, + { + "epoch": 0.7174205967276227, + "grad_norm": 1.3569130897521973, + "learning_rate": 0.00014300538700631643, + "loss": 1.0324, + "step": 18635 + }, + { + "epoch": 0.7176130895091434, + "grad_norm": 1.0008260011672974, + "learning_rate": 0.00014297808462665688, + "loss": 1.0383, + "step": 18640 + }, + { + "epoch": 0.717805582290664, + "grad_norm": 1.291493535041809, + "learning_rate": 0.0001429507783169362, + "loss": 1.2128, + "step": 18645 + }, + { + "epoch": 0.7179980750721848, + "grad_norm": 2.5597760677337646, + "learning_rate": 0.0001429234680796514, + "loss": 1.146, + "step": 18650 + }, + { + "epoch": 0.7181905678537055, + "grad_norm": 2.4308478832244873, + "learning_rate": 0.00014289615391729974, + "loss": 1.3797, + "step": 18655 + }, + { + "epoch": 0.7183830606352262, + "grad_norm": 1.1110010147094727, + "learning_rate": 0.00014286883583237896, + "loss": 1.2471, + "step": 18660 + }, + { + "epoch": 0.7185755534167468, + "grad_norm": 1.075013279914856, + "learning_rate": 0.00014284151382738718, + "loss": 1.0836, + "step": 18665 + }, + { + "epoch": 0.7187680461982676, + "grad_norm": 1.9422922134399414, + "learning_rate": 0.00014281418790482273, + "loss": 1.3271, + "step": 18670 + }, + { + "epoch": 0.7189605389797883, + "grad_norm": 1.58540678024292, + "learning_rate": 0.00014278685806718442, + "loss": 1.0762, + "step": 18675 + }, + { + "epoch": 0.719153031761309, + "grad_norm": 1.1696521043777466, + "learning_rate": 0.00014275952431697138, + "loss": 1.1783, + "step": 18680 + }, + { + "epoch": 0.7193455245428296, + "grad_norm": 1.6518898010253906, + "learning_rate": 0.0001427321866566831, + "loss": 0.9509, + "step": 18685 + }, + { + "epoch": 0.7195380173243503, + "grad_norm": 1.2448405027389526, + "learning_rate": 0.0001427048450888194, + "loss": 1.1316, + "step": 18690 + }, + { + "epoch": 0.7197305101058711, + "grad_norm": 0.9715486168861389, + "learning_rate": 0.00014267749961588053, + "loss": 1.1547, + "step": 18695 + }, + { + "epoch": 0.7199230028873917, + "grad_norm": 1.176511287689209, + "learning_rate": 0.00014265015024036702, + "loss": 1.0325, + "step": 18700 + }, + { + "epoch": 0.7201154956689124, + "grad_norm": 1.096604824066162, + "learning_rate": 0.0001426227969647798, + "loss": 0.9389, + "step": 18705 + }, + { + "epoch": 0.7203079884504331, + "grad_norm": 1.2895269393920898, + "learning_rate": 0.00014259543979162017, + "loss": 1.1157, + "step": 18710 + }, + { + "epoch": 0.7205004812319538, + "grad_norm": 1.1590831279754639, + "learning_rate": 0.00014256807872338974, + "loss": 1.0154, + "step": 18715 + }, + { + "epoch": 0.7206929740134745, + "grad_norm": 1.1659713983535767, + "learning_rate": 0.00014254071376259046, + "loss": 1.0744, + "step": 18720 + }, + { + "epoch": 0.7208854667949952, + "grad_norm": 1.3548671007156372, + "learning_rate": 0.00014251334491172473, + "loss": 0.9823, + "step": 18725 + }, + { + "epoch": 0.7210779595765159, + "grad_norm": 1.5639405250549316, + "learning_rate": 0.00014248597217329526, + "loss": 1.1793, + "step": 18730 + }, + { + "epoch": 0.7212704523580366, + "grad_norm": 1.9836759567260742, + "learning_rate": 0.00014245859554980504, + "loss": 1.1827, + "step": 18735 + }, + { + "epoch": 0.7214629451395572, + "grad_norm": 1.2241086959838867, + "learning_rate": 0.00014243121504375753, + "loss": 1.2403, + "step": 18740 + }, + { + "epoch": 0.721655437921078, + "grad_norm": 1.1298317909240723, + "learning_rate": 0.0001424038306576565, + "loss": 1.1577, + "step": 18745 + }, + { + "epoch": 0.7218479307025987, + "grad_norm": 1.325210452079773, + "learning_rate": 0.00014237644239400605, + "loss": 1.1232, + "step": 18750 + }, + { + "epoch": 0.7220404234841193, + "grad_norm": 1.613929033279419, + "learning_rate": 0.00014234905025531066, + "loss": 0.9627, + "step": 18755 + }, + { + "epoch": 0.72223291626564, + "grad_norm": 1.6307876110076904, + "learning_rate": 0.00014232165424407517, + "loss": 1.0229, + "step": 18760 + }, + { + "epoch": 0.7224254090471607, + "grad_norm": 0.8971173167228699, + "learning_rate": 0.00014229425436280475, + "loss": 1.1371, + "step": 18765 + }, + { + "epoch": 0.7226179018286815, + "grad_norm": 1.3740814924240112, + "learning_rate": 0.00014226685061400496, + "loss": 1.1431, + "step": 18770 + }, + { + "epoch": 0.7228103946102021, + "grad_norm": 1.7433820962905884, + "learning_rate": 0.00014223944300018163, + "loss": 1.1876, + "step": 18775 + }, + { + "epoch": 0.7230028873917228, + "grad_norm": 1.1470065116882324, + "learning_rate": 0.0001422120315238411, + "loss": 1.1971, + "step": 18780 + }, + { + "epoch": 0.7231953801732435, + "grad_norm": 2.0566489696502686, + "learning_rate": 0.00014218461618748987, + "loss": 1.0274, + "step": 18785 + }, + { + "epoch": 0.7233878729547641, + "grad_norm": 1.589087724685669, + "learning_rate": 0.00014215719699363496, + "loss": 1.067, + "step": 18790 + }, + { + "epoch": 0.7235803657362849, + "grad_norm": 2.473461866378784, + "learning_rate": 0.00014212977394478365, + "loss": 1.2185, + "step": 18795 + }, + { + "epoch": 0.7237728585178056, + "grad_norm": 1.3214609622955322, + "learning_rate": 0.00014210234704344359, + "loss": 1.2501, + "step": 18800 + }, + { + "epoch": 0.7239653512993263, + "grad_norm": 2.281226873397827, + "learning_rate": 0.0001420749162921228, + "loss": 1.2262, + "step": 18805 + }, + { + "epoch": 0.7241578440808469, + "grad_norm": 1.188148021697998, + "learning_rate": 0.0001420474816933296, + "loss": 1.3338, + "step": 18810 + }, + { + "epoch": 0.7243503368623677, + "grad_norm": 2.0242867469787598, + "learning_rate": 0.00014202004324957279, + "loss": 1.1157, + "step": 18815 + }, + { + "epoch": 0.7245428296438884, + "grad_norm": 1.2399152517318726, + "learning_rate": 0.00014199260096336134, + "loss": 1.119, + "step": 18820 + }, + { + "epoch": 0.7247353224254091, + "grad_norm": 1.7323557138442993, + "learning_rate": 0.00014196515483720477, + "loss": 1.1672, + "step": 18825 + }, + { + "epoch": 0.7249278152069297, + "grad_norm": 1.4888850450515747, + "learning_rate": 0.00014193770487361273, + "loss": 0.9814, + "step": 18830 + }, + { + "epoch": 0.7251203079884504, + "grad_norm": 1.5241479873657227, + "learning_rate": 0.0001419102510750954, + "loss": 1.0614, + "step": 18835 + }, + { + "epoch": 0.7253128007699712, + "grad_norm": 1.2932441234588623, + "learning_rate": 0.00014188279344416323, + "loss": 1.1905, + "step": 18840 + }, + { + "epoch": 0.7255052935514918, + "grad_norm": 1.4357131719589233, + "learning_rate": 0.0001418553319833271, + "loss": 1.1303, + "step": 18845 + }, + { + "epoch": 0.7256977863330125, + "grad_norm": 2.1818439960479736, + "learning_rate": 0.00014182786669509806, + "loss": 1.2141, + "step": 18850 + }, + { + "epoch": 0.7258902791145332, + "grad_norm": 2.111520528793335, + "learning_rate": 0.00014180039758198774, + "loss": 1.165, + "step": 18855 + }, + { + "epoch": 0.7260827718960539, + "grad_norm": 1.3923039436340332, + "learning_rate": 0.00014177292464650796, + "loss": 1.3364, + "step": 18860 + }, + { + "epoch": 0.7262752646775746, + "grad_norm": 1.905661702156067, + "learning_rate": 0.0001417454478911709, + "loss": 1.1535, + "step": 18865 + }, + { + "epoch": 0.7264677574590953, + "grad_norm": 1.1814746856689453, + "learning_rate": 0.0001417179673184892, + "loss": 1.2141, + "step": 18870 + }, + { + "epoch": 0.726660250240616, + "grad_norm": 1.4515434503555298, + "learning_rate": 0.00014169048293097576, + "loss": 1.0955, + "step": 18875 + }, + { + "epoch": 0.7268527430221366, + "grad_norm": 1.2174112796783447, + "learning_rate": 0.0001416629947311438, + "loss": 1.0399, + "step": 18880 + }, + { + "epoch": 0.7270452358036573, + "grad_norm": 1.3769662380218506, + "learning_rate": 0.00014163550272150698, + "loss": 1.2164, + "step": 18885 + }, + { + "epoch": 0.7272377285851781, + "grad_norm": 1.3401464223861694, + "learning_rate": 0.00014160800690457927, + "loss": 1.1039, + "step": 18890 + }, + { + "epoch": 0.7274302213666988, + "grad_norm": 1.1210380792617798, + "learning_rate": 0.0001415805072828749, + "loss": 1.0771, + "step": 18895 + }, + { + "epoch": 0.7276227141482194, + "grad_norm": 1.3425636291503906, + "learning_rate": 0.00014155300385890863, + "loss": 1.1506, + "step": 18900 + }, + { + "epoch": 0.7278152069297401, + "grad_norm": 1.155220866203308, + "learning_rate": 0.0001415254966351954, + "loss": 1.1321, + "step": 18905 + }, + { + "epoch": 0.7280076997112608, + "grad_norm": 1.440024733543396, + "learning_rate": 0.0001414979856142506, + "loss": 1.2324, + "step": 18910 + }, + { + "epoch": 0.7282001924927816, + "grad_norm": 1.6521823406219482, + "learning_rate": 0.0001414704707985899, + "loss": 1.196, + "step": 18915 + }, + { + "epoch": 0.7283926852743022, + "grad_norm": 3.4958372116088867, + "learning_rate": 0.00014144295219072937, + "loss": 0.9906, + "step": 18920 + }, + { + "epoch": 0.7285851780558229, + "grad_norm": 0.9254593849182129, + "learning_rate": 0.00014141542979318538, + "loss": 1.2552, + "step": 18925 + }, + { + "epoch": 0.7287776708373436, + "grad_norm": 1.519364833831787, + "learning_rate": 0.00014138790360847473, + "loss": 1.0491, + "step": 18930 + }, + { + "epoch": 0.7289701636188642, + "grad_norm": 1.199167013168335, + "learning_rate": 0.0001413603736391144, + "loss": 0.9939, + "step": 18935 + }, + { + "epoch": 0.729162656400385, + "grad_norm": 1.0213391780853271, + "learning_rate": 0.00014133283988762192, + "loss": 1.222, + "step": 18940 + }, + { + "epoch": 0.7293551491819057, + "grad_norm": 1.27894127368927, + "learning_rate": 0.00014130530235651506, + "loss": 1.2881, + "step": 18945 + }, + { + "epoch": 0.7295476419634264, + "grad_norm": 1.1660070419311523, + "learning_rate": 0.0001412777610483119, + "loss": 1.1839, + "step": 18950 + }, + { + "epoch": 0.729740134744947, + "grad_norm": 0.9614414572715759, + "learning_rate": 0.00014125021596553093, + "loss": 1.0397, + "step": 18955 + }, + { + "epoch": 0.7299326275264677, + "grad_norm": 1.5278538465499878, + "learning_rate": 0.00014122266711069095, + "loss": 1.2835, + "step": 18960 + }, + { + "epoch": 0.7301251203079885, + "grad_norm": 1.2992238998413086, + "learning_rate": 0.00014119511448631118, + "loss": 1.2873, + "step": 18965 + }, + { + "epoch": 0.7303176130895092, + "grad_norm": 1.0794028043746948, + "learning_rate": 0.00014116755809491104, + "loss": 1.1677, + "step": 18970 + }, + { + "epoch": 0.7305101058710298, + "grad_norm": 1.672555685043335, + "learning_rate": 0.00014113999793901046, + "loss": 0.9295, + "step": 18975 + }, + { + "epoch": 0.7307025986525505, + "grad_norm": 1.630053997039795, + "learning_rate": 0.00014111243402112957, + "loss": 1.1635, + "step": 18980 + }, + { + "epoch": 0.7308950914340713, + "grad_norm": 1.3171367645263672, + "learning_rate": 0.00014108486634378895, + "loss": 1.065, + "step": 18985 + }, + { + "epoch": 0.731087584215592, + "grad_norm": 1.1997402906417847, + "learning_rate": 0.00014105729490950948, + "loss": 1.0747, + "step": 18990 + }, + { + "epoch": 0.7312800769971126, + "grad_norm": 1.6320029497146606, + "learning_rate": 0.00014102971972081233, + "loss": 1.2414, + "step": 18995 + }, + { + "epoch": 0.7314725697786333, + "grad_norm": 1.3852897882461548, + "learning_rate": 0.00014100214078021915, + "loss": 1.0307, + "step": 19000 + }, + { + "epoch": 0.731665062560154, + "grad_norm": 1.29547119140625, + "learning_rate": 0.00014097455809025178, + "loss": 1.1411, + "step": 19005 + }, + { + "epoch": 0.7318575553416747, + "grad_norm": 1.0764034986495972, + "learning_rate": 0.00014094697165343252, + "loss": 1.1789, + "step": 19010 + }, + { + "epoch": 0.7320500481231954, + "grad_norm": 1.7445317506790161, + "learning_rate": 0.00014091938147228395, + "loss": 1.2379, + "step": 19015 + }, + { + "epoch": 0.7322425409047161, + "grad_norm": 1.844789743423462, + "learning_rate": 0.00014089178754932898, + "loss": 1.0126, + "step": 19020 + }, + { + "epoch": 0.7324350336862367, + "grad_norm": 1.370970368385315, + "learning_rate": 0.00014086418988709095, + "loss": 1.3182, + "step": 19025 + }, + { + "epoch": 0.7326275264677574, + "grad_norm": 1.2565025091171265, + "learning_rate": 0.00014083658848809347, + "loss": 1.1753, + "step": 19030 + }, + { + "epoch": 0.7328200192492782, + "grad_norm": 1.7159111499786377, + "learning_rate": 0.00014080898335486046, + "loss": 1.0572, + "step": 19035 + }, + { + "epoch": 0.7330125120307989, + "grad_norm": 1.8323345184326172, + "learning_rate": 0.0001407813744899163, + "loss": 1.0822, + "step": 19040 + }, + { + "epoch": 0.7332050048123195, + "grad_norm": 1.6878646612167358, + "learning_rate": 0.00014075376189578553, + "loss": 1.1133, + "step": 19045 + }, + { + "epoch": 0.7333974975938402, + "grad_norm": 1.7448841333389282, + "learning_rate": 0.00014072614557499323, + "loss": 1.0922, + "step": 19050 + }, + { + "epoch": 0.7335899903753609, + "grad_norm": 0.9125509262084961, + "learning_rate": 0.00014069852553006472, + "loss": 1.1788, + "step": 19055 + }, + { + "epoch": 0.7337824831568817, + "grad_norm": 1.8741627931594849, + "learning_rate": 0.00014067090176352563, + "loss": 1.0538, + "step": 19060 + }, + { + "epoch": 0.7339749759384023, + "grad_norm": 3.1138720512390137, + "learning_rate": 0.00014064327427790201, + "loss": 1.256, + "step": 19065 + }, + { + "epoch": 0.734167468719923, + "grad_norm": 1.3083161115646362, + "learning_rate": 0.00014061564307572022, + "loss": 0.976, + "step": 19070 + }, + { + "epoch": 0.7343599615014437, + "grad_norm": 1.176721215248108, + "learning_rate": 0.00014058800815950687, + "loss": 1.1733, + "step": 19075 + }, + { + "epoch": 0.7345524542829643, + "grad_norm": 0.9016759395599365, + "learning_rate": 0.00014056036953178906, + "loss": 1.0671, + "step": 19080 + }, + { + "epoch": 0.7347449470644851, + "grad_norm": 1.4011337757110596, + "learning_rate": 0.00014053272719509417, + "loss": 1.1453, + "step": 19085 + }, + { + "epoch": 0.7349374398460058, + "grad_norm": 1.2671010494232178, + "learning_rate": 0.00014050508115194988, + "loss": 1.1453, + "step": 19090 + }, + { + "epoch": 0.7351299326275265, + "grad_norm": 1.3316471576690674, + "learning_rate": 0.00014047743140488422, + "loss": 0.9451, + "step": 19095 + }, + { + "epoch": 0.7353224254090471, + "grad_norm": 1.963815689086914, + "learning_rate": 0.0001404497779564256, + "loss": 0.993, + "step": 19100 + }, + { + "epoch": 0.7355149181905678, + "grad_norm": 1.4354350566864014, + "learning_rate": 0.00014042212080910276, + "loss": 1.1263, + "step": 19105 + }, + { + "epoch": 0.7357074109720886, + "grad_norm": 1.6670982837677002, + "learning_rate": 0.00014039445996544473, + "loss": 1.0964, + "step": 19110 + }, + { + "epoch": 0.7358999037536093, + "grad_norm": 0.9805311560630798, + "learning_rate": 0.00014036679542798092, + "loss": 1.056, + "step": 19115 + }, + { + "epoch": 0.7360923965351299, + "grad_norm": 1.4659690856933594, + "learning_rate": 0.0001403391271992411, + "loss": 1.0984, + "step": 19120 + }, + { + "epoch": 0.7362848893166506, + "grad_norm": 0.5292593240737915, + "learning_rate": 0.00014031145528175525, + "loss": 1.0774, + "step": 19125 + }, + { + "epoch": 0.7364773820981714, + "grad_norm": 1.9471726417541504, + "learning_rate": 0.00014028377967805392, + "loss": 1.1648, + "step": 19130 + }, + { + "epoch": 0.736669874879692, + "grad_norm": 1.2082020044326782, + "learning_rate": 0.0001402561003906678, + "loss": 1.0764, + "step": 19135 + }, + { + "epoch": 0.7368623676612127, + "grad_norm": 1.558237075805664, + "learning_rate": 0.00014022841742212792, + "loss": 1.3944, + "step": 19140 + }, + { + "epoch": 0.7370548604427334, + "grad_norm": 1.7463306188583374, + "learning_rate": 0.0001402007307749658, + "loss": 1.0599, + "step": 19145 + }, + { + "epoch": 0.737247353224254, + "grad_norm": 1.2820191383361816, + "learning_rate": 0.00014017304045171316, + "loss": 1.042, + "step": 19150 + }, + { + "epoch": 0.7374398460057748, + "grad_norm": 1.617754340171814, + "learning_rate": 0.00014014534645490206, + "loss": 1.1031, + "step": 19155 + }, + { + "epoch": 0.7376323387872955, + "grad_norm": 1.0561091899871826, + "learning_rate": 0.00014011764878706497, + "loss": 1.1711, + "step": 19160 + }, + { + "epoch": 0.7378248315688162, + "grad_norm": 1.0614964962005615, + "learning_rate": 0.00014008994745073468, + "loss": 1.0783, + "step": 19165 + }, + { + "epoch": 0.7380173243503368, + "grad_norm": 1.5456453561782837, + "learning_rate": 0.0001400622424484442, + "loss": 1.0303, + "step": 19170 + }, + { + "epoch": 0.7382098171318575, + "grad_norm": 1.4854921102523804, + "learning_rate": 0.00014003453378272712, + "loss": 1.0719, + "step": 19175 + }, + { + "epoch": 0.7384023099133783, + "grad_norm": 1.4764469861984253, + "learning_rate": 0.00014000682145611708, + "loss": 1.2755, + "step": 19180 + }, + { + "epoch": 0.738594802694899, + "grad_norm": 1.6524717807769775, + "learning_rate": 0.00013997910547114826, + "loss": 1.1086, + "step": 19185 + }, + { + "epoch": 0.7387872954764196, + "grad_norm": 1.264930248260498, + "learning_rate": 0.00013995138583035508, + "loss": 1.1087, + "step": 19190 + }, + { + "epoch": 0.7389797882579403, + "grad_norm": 1.8001179695129395, + "learning_rate": 0.0001399236625362723, + "loss": 1.1736, + "step": 19195 + }, + { + "epoch": 0.739172281039461, + "grad_norm": 1.0975139141082764, + "learning_rate": 0.00013989593559143507, + "loss": 1.1669, + "step": 19200 + }, + { + "epoch": 0.7393647738209818, + "grad_norm": 1.078940987586975, + "learning_rate": 0.0001398682049983788, + "loss": 1.1259, + "step": 19205 + }, + { + "epoch": 0.7395572666025024, + "grad_norm": 1.0370323657989502, + "learning_rate": 0.0001398404707596393, + "loss": 1.2454, + "step": 19210 + }, + { + "epoch": 0.7397497593840231, + "grad_norm": 1.8001567125320435, + "learning_rate": 0.00013981273287775266, + "loss": 1.2803, + "step": 19215 + }, + { + "epoch": 0.7399422521655438, + "grad_norm": 1.00836181640625, + "learning_rate": 0.00013978499135525535, + "loss": 1.2406, + "step": 19220 + }, + { + "epoch": 0.7401347449470644, + "grad_norm": 1.169600009918213, + "learning_rate": 0.00013975724619468414, + "loss": 1.2738, + "step": 19225 + }, + { + "epoch": 0.7403272377285852, + "grad_norm": 1.6229758262634277, + "learning_rate": 0.00013972949739857613, + "loss": 1.1428, + "step": 19230 + }, + { + "epoch": 0.7405197305101059, + "grad_norm": 1.573930263519287, + "learning_rate": 0.00013970174496946873, + "loss": 1.1467, + "step": 19235 + }, + { + "epoch": 0.7407122232916266, + "grad_norm": 1.4224984645843506, + "learning_rate": 0.00013967398890989979, + "loss": 1.1335, + "step": 19240 + }, + { + "epoch": 0.7409047160731472, + "grad_norm": 1.5381492376327515, + "learning_rate": 0.00013964622922240736, + "loss": 1.1332, + "step": 19245 + }, + { + "epoch": 0.7410972088546679, + "grad_norm": 1.7980502843856812, + "learning_rate": 0.0001396184659095299, + "loss": 1.2107, + "step": 19250 + }, + { + "epoch": 0.7412897016361887, + "grad_norm": 0.8735668063163757, + "learning_rate": 0.00013959069897380617, + "loss": 1.0948, + "step": 19255 + }, + { + "epoch": 0.7414821944177094, + "grad_norm": 1.1920636892318726, + "learning_rate": 0.0001395629284177753, + "loss": 1.1663, + "step": 19260 + }, + { + "epoch": 0.74167468719923, + "grad_norm": 1.3055362701416016, + "learning_rate": 0.0001395351542439767, + "loss": 1.1108, + "step": 19265 + }, + { + "epoch": 0.7418671799807507, + "grad_norm": 1.8382583856582642, + "learning_rate": 0.00013950737645495014, + "loss": 1.0279, + "step": 19270 + }, + { + "epoch": 0.7420596727622714, + "grad_norm": 0.865042507648468, + "learning_rate": 0.00013947959505323577, + "loss": 1.1005, + "step": 19275 + }, + { + "epoch": 0.7422521655437921, + "grad_norm": 1.182671070098877, + "learning_rate": 0.0001394518100413739, + "loss": 1.199, + "step": 19280 + }, + { + "epoch": 0.7424446583253128, + "grad_norm": 1.5641695261001587, + "learning_rate": 0.00013942402142190532, + "loss": 1.098, + "step": 19285 + }, + { + "epoch": 0.7426371511068335, + "grad_norm": 0.924503743648529, + "learning_rate": 0.0001393962291973712, + "loss": 1.2765, + "step": 19290 + }, + { + "epoch": 0.7428296438883542, + "grad_norm": 1.1100239753723145, + "learning_rate": 0.00013936843337031287, + "loss": 1.2573, + "step": 19295 + }, + { + "epoch": 0.7430221366698749, + "grad_norm": 1.2185837030410767, + "learning_rate": 0.0001393406339432721, + "loss": 1.2064, + "step": 19300 + }, + { + "epoch": 0.7432146294513956, + "grad_norm": 1.5463718175888062, + "learning_rate": 0.000139312830918791, + "loss": 1.1559, + "step": 19305 + }, + { + "epoch": 0.7434071222329163, + "grad_norm": 1.8315119743347168, + "learning_rate": 0.00013928502429941188, + "loss": 1.2292, + "step": 19310 + }, + { + "epoch": 0.7435996150144369, + "grad_norm": 1.303144097328186, + "learning_rate": 0.00013925721408767757, + "loss": 1.1463, + "step": 19315 + }, + { + "epoch": 0.7437921077959576, + "grad_norm": 1.2040412425994873, + "learning_rate": 0.00013922940028613106, + "loss": 0.9717, + "step": 19320 + }, + { + "epoch": 0.7439846005774784, + "grad_norm": 1.0547009706497192, + "learning_rate": 0.0001392015828973158, + "loss": 1.0389, + "step": 19325 + }, + { + "epoch": 0.7441770933589991, + "grad_norm": 1.597541093826294, + "learning_rate": 0.00013917376192377543, + "loss": 1.0937, + "step": 19330 + }, + { + "epoch": 0.7443695861405197, + "grad_norm": 0.9714812636375427, + "learning_rate": 0.00013914593736805402, + "loss": 1.1641, + "step": 19335 + }, + { + "epoch": 0.7445620789220404, + "grad_norm": 1.2114696502685547, + "learning_rate": 0.00013911810923269603, + "loss": 1.2167, + "step": 19340 + }, + { + "epoch": 0.7447545717035611, + "grad_norm": 1.330718755722046, + "learning_rate": 0.000139090277520246, + "loss": 1.0583, + "step": 19345 + }, + { + "epoch": 0.7449470644850819, + "grad_norm": 2.2889277935028076, + "learning_rate": 0.0001390624422332491, + "loss": 0.9747, + "step": 19350 + }, + { + "epoch": 0.7451395572666025, + "grad_norm": 1.2337167263031006, + "learning_rate": 0.0001390346033742506, + "loss": 1.0294, + "step": 19355 + }, + { + "epoch": 0.7453320500481232, + "grad_norm": 1.0727423429489136, + "learning_rate": 0.0001390067609457962, + "loss": 1.0961, + "step": 19360 + }, + { + "epoch": 0.7455245428296439, + "grad_norm": 1.7654608488082886, + "learning_rate": 0.0001389789149504319, + "loss": 1.0775, + "step": 19365 + }, + { + "epoch": 0.7457170356111645, + "grad_norm": 1.2609182596206665, + "learning_rate": 0.0001389510653907041, + "loss": 1.0706, + "step": 19370 + }, + { + "epoch": 0.7459095283926853, + "grad_norm": 1.906533122062683, + "learning_rate": 0.00013892321226915933, + "loss": 1.1035, + "step": 19375 + }, + { + "epoch": 0.746102021174206, + "grad_norm": 1.1768391132354736, + "learning_rate": 0.00013889535558834462, + "loss": 1.097, + "step": 19380 + }, + { + "epoch": 0.7462945139557267, + "grad_norm": 1.7764432430267334, + "learning_rate": 0.00013886749535080737, + "loss": 1.2136, + "step": 19385 + }, + { + "epoch": 0.7464870067372473, + "grad_norm": 2.2302021980285645, + "learning_rate": 0.0001388396315590951, + "loss": 1.1236, + "step": 19390 + }, + { + "epoch": 0.746679499518768, + "grad_norm": 1.0161263942718506, + "learning_rate": 0.00013881176421575583, + "loss": 1.3818, + "step": 19395 + }, + { + "epoch": 0.7468719923002888, + "grad_norm": 1.002767562866211, + "learning_rate": 0.00013878389332333784, + "loss": 1.1468, + "step": 19400 + }, + { + "epoch": 0.7470644850818094, + "grad_norm": 1.637412667274475, + "learning_rate": 0.00013875601888438968, + "loss": 1.2074, + "step": 19405 + }, + { + "epoch": 0.7472569778633301, + "grad_norm": 1.440507173538208, + "learning_rate": 0.00013872814090146036, + "loss": 0.9903, + "step": 19410 + }, + { + "epoch": 0.7474494706448508, + "grad_norm": 1.8063361644744873, + "learning_rate": 0.00013870025937709913, + "loss": 1.1508, + "step": 19415 + }, + { + "epoch": 0.7476419634263715, + "grad_norm": 1.0809664726257324, + "learning_rate": 0.0001386723743138555, + "loss": 1.1509, + "step": 19420 + }, + { + "epoch": 0.7478344562078922, + "grad_norm": 0.7736053466796875, + "learning_rate": 0.00013864448571427945, + "loss": 1.048, + "step": 19425 + }, + { + "epoch": 0.7480269489894129, + "grad_norm": 1.588610291481018, + "learning_rate": 0.00013861659358092117, + "loss": 1.1118, + "step": 19430 + }, + { + "epoch": 0.7482194417709336, + "grad_norm": 1.5141923427581787, + "learning_rate": 0.00013858869791633124, + "loss": 1.1847, + "step": 19435 + }, + { + "epoch": 0.7484119345524542, + "grad_norm": 1.6033471822738647, + "learning_rate": 0.00013856079872306046, + "loss": 1.1109, + "step": 19440 + }, + { + "epoch": 0.748604427333975, + "grad_norm": 1.1898064613342285, + "learning_rate": 0.0001385328960036601, + "loss": 1.1758, + "step": 19445 + }, + { + "epoch": 0.7487969201154957, + "grad_norm": 1.8286123275756836, + "learning_rate": 0.00013850498976068166, + "loss": 1.1827, + "step": 19450 + }, + { + "epoch": 0.7489894128970164, + "grad_norm": 1.6806395053863525, + "learning_rate": 0.00013847707999667698, + "loss": 0.9877, + "step": 19455 + }, + { + "epoch": 0.749181905678537, + "grad_norm": 2.1087961196899414, + "learning_rate": 0.00013844916671419823, + "loss": 1.2052, + "step": 19460 + }, + { + "epoch": 0.7493743984600577, + "grad_norm": 1.4643951654434204, + "learning_rate": 0.0001384212499157979, + "loss": 0.9652, + "step": 19465 + }, + { + "epoch": 0.7495668912415785, + "grad_norm": 1.4248661994934082, + "learning_rate": 0.00013839332960402872, + "loss": 1.1903, + "step": 19470 + }, + { + "epoch": 0.7497593840230992, + "grad_norm": 1.8101911544799805, + "learning_rate": 0.0001383709908267036, + "loss": 1.0335, + "step": 19475 + }, + { + "epoch": 0.7499518768046198, + "grad_norm": 1.9175690412521362, + "learning_rate": 0.00013834306419730473, + "loss": 1.2055, + "step": 19480 + }, + { + "epoch": 0.7501443695861405, + "grad_norm": 0.930147647857666, + "learning_rate": 0.00013831513406168663, + "loss": 1.2383, + "step": 19485 + }, + { + "epoch": 0.7503368623676612, + "grad_norm": 2.496994733810425, + "learning_rate": 0.00013828720042240338, + "loss": 1.2861, + "step": 19490 + }, + { + "epoch": 0.750529355149182, + "grad_norm": 1.9224547147750854, + "learning_rate": 0.00013825926328200926, + "loss": 1.247, + "step": 19495 + }, + { + "epoch": 0.7507218479307026, + "grad_norm": 1.7266567945480347, + "learning_rate": 0.00013823132264305894, + "loss": 1.1127, + "step": 19500 + }, + { + "epoch": 0.7509143407122233, + "grad_norm": 1.2304484844207764, + "learning_rate": 0.00013820337850810744, + "loss": 1.1432, + "step": 19505 + }, + { + "epoch": 0.751106833493744, + "grad_norm": 2.311600685119629, + "learning_rate": 0.00013817543087971004, + "loss": 1.2405, + "step": 19510 + }, + { + "epoch": 0.7512993262752646, + "grad_norm": 2.358445644378662, + "learning_rate": 0.0001381474797604224, + "loss": 1.2407, + "step": 19515 + }, + { + "epoch": 0.7514918190567854, + "grad_norm": 1.2297358512878418, + "learning_rate": 0.00013811952515280042, + "loss": 0.9701, + "step": 19520 + }, + { + "epoch": 0.7516843118383061, + "grad_norm": 1.260389804840088, + "learning_rate": 0.00013809156705940037, + "loss": 1.1711, + "step": 19525 + }, + { + "epoch": 0.7518768046198268, + "grad_norm": 0.9936877489089966, + "learning_rate": 0.00013806360548277886, + "loss": 0.9045, + "step": 19530 + }, + { + "epoch": 0.7520692974013474, + "grad_norm": 1.951540470123291, + "learning_rate": 0.0001380356404254928, + "loss": 1.0988, + "step": 19535 + }, + { + "epoch": 0.7522617901828681, + "grad_norm": 1.0897135734558105, + "learning_rate": 0.00013800767189009935, + "loss": 1.0216, + "step": 19540 + }, + { + "epoch": 0.7524542829643889, + "grad_norm": 1.3618708848953247, + "learning_rate": 0.00013797969987915608, + "loss": 1.0604, + "step": 19545 + }, + { + "epoch": 0.7526467757459095, + "grad_norm": 1.413282871246338, + "learning_rate": 0.00013795172439522087, + "loss": 1.2045, + "step": 19550 + }, + { + "epoch": 0.7528392685274302, + "grad_norm": 1.4086360931396484, + "learning_rate": 0.00013792374544085187, + "loss": 1.0724, + "step": 19555 + }, + { + "epoch": 0.7530317613089509, + "grad_norm": 1.2165982723236084, + "learning_rate": 0.00013789576301860757, + "loss": 1.1886, + "step": 19560 + }, + { + "epoch": 0.7532242540904716, + "grad_norm": 1.4711132049560547, + "learning_rate": 0.00013786777713104678, + "loss": 1.1847, + "step": 19565 + }, + { + "epoch": 0.7534167468719923, + "grad_norm": 1.0978587865829468, + "learning_rate": 0.00013783978778072862, + "loss": 1.1521, + "step": 19570 + }, + { + "epoch": 0.753609239653513, + "grad_norm": 1.1508560180664062, + "learning_rate": 0.00013781179497021251, + "loss": 1.089, + "step": 19575 + }, + { + "epoch": 0.7538017324350337, + "grad_norm": 1.3086730241775513, + "learning_rate": 0.00013778379870205829, + "loss": 1.2293, + "step": 19580 + }, + { + "epoch": 0.7539942252165543, + "grad_norm": 1.63782799243927, + "learning_rate": 0.0001377557989788259, + "loss": 1.0373, + "step": 19585 + }, + { + "epoch": 0.754186717998075, + "grad_norm": 1.4707633256912231, + "learning_rate": 0.00013772779580307584, + "loss": 1.0481, + "step": 19590 + }, + { + "epoch": 0.7543792107795958, + "grad_norm": 1.6030997037887573, + "learning_rate": 0.0001376997891773688, + "loss": 1.0953, + "step": 19595 + }, + { + "epoch": 0.7545717035611165, + "grad_norm": 1.0709367990493774, + "learning_rate": 0.00013767177910426574, + "loss": 1.1094, + "step": 19600 + }, + { + "epoch": 0.7547641963426371, + "grad_norm": 1.2302757501602173, + "learning_rate": 0.00013764376558632807, + "loss": 0.9815, + "step": 19605 + }, + { + "epoch": 0.7549566891241578, + "grad_norm": 2.4043121337890625, + "learning_rate": 0.00013761574862611737, + "loss": 1.1146, + "step": 19610 + }, + { + "epoch": 0.7551491819056786, + "grad_norm": 1.2333440780639648, + "learning_rate": 0.00013758772822619565, + "loss": 1.367, + "step": 19615 + }, + { + "epoch": 0.7553416746871993, + "grad_norm": 2.032453775405884, + "learning_rate": 0.0001375597043891252, + "loss": 1.1401, + "step": 19620 + }, + { + "epoch": 0.7555341674687199, + "grad_norm": 1.1483811140060425, + "learning_rate": 0.00013753167711746858, + "loss": 1.0757, + "step": 19625 + }, + { + "epoch": 0.7557266602502406, + "grad_norm": 2.314659833908081, + "learning_rate": 0.0001375036464137887, + "loss": 1.162, + "step": 19630 + }, + { + "epoch": 0.7559191530317613, + "grad_norm": 1.460924744606018, + "learning_rate": 0.0001374756122806488, + "loss": 1.1596, + "step": 19635 + }, + { + "epoch": 0.756111645813282, + "grad_norm": 1.628796935081482, + "learning_rate": 0.0001374475747206124, + "loss": 1.2437, + "step": 19640 + }, + { + "epoch": 0.7563041385948027, + "grad_norm": 0.9428819417953491, + "learning_rate": 0.0001374195337362434, + "loss": 1.1804, + "step": 19645 + }, + { + "epoch": 0.7564966313763234, + "grad_norm": 1.1497089862823486, + "learning_rate": 0.00013739148933010587, + "loss": 1.0776, + "step": 19650 + }, + { + "epoch": 0.7566891241578441, + "grad_norm": 1.2695974111557007, + "learning_rate": 0.00013736344150476435, + "loss": 1.2446, + "step": 19655 + }, + { + "epoch": 0.7568816169393647, + "grad_norm": 1.4802236557006836, + "learning_rate": 0.00013733539026278364, + "loss": 1.066, + "step": 19660 + }, + { + "epoch": 0.7570741097208855, + "grad_norm": 1.7089695930480957, + "learning_rate": 0.0001373073356067288, + "loss": 1.0265, + "step": 19665 + }, + { + "epoch": 0.7572666025024062, + "grad_norm": 1.4578391313552856, + "learning_rate": 0.00013727927753916523, + "loss": 1.0214, + "step": 19670 + }, + { + "epoch": 0.7574590952839269, + "grad_norm": 0.7848085761070251, + "learning_rate": 0.00013725121606265872, + "loss": 1.0254, + "step": 19675 + }, + { + "epoch": 0.7576515880654475, + "grad_norm": 1.5217918157577515, + "learning_rate": 0.00013722315117977525, + "loss": 1.2226, + "step": 19680 + }, + { + "epoch": 0.7578440808469682, + "grad_norm": 1.0195049047470093, + "learning_rate": 0.00013719508289308118, + "loss": 1.0981, + "step": 19685 + }, + { + "epoch": 0.758036573628489, + "grad_norm": 0.8907167315483093, + "learning_rate": 0.00013716701120514323, + "loss": 1.0012, + "step": 19690 + }, + { + "epoch": 0.7582290664100096, + "grad_norm": 1.6701477766036987, + "learning_rate": 0.00013713893611852824, + "loss": 1.1048, + "step": 19695 + }, + { + "epoch": 0.7584215591915303, + "grad_norm": 1.4811270236968994, + "learning_rate": 0.0001371108576358036, + "loss": 1.2534, + "step": 19700 + }, + { + "epoch": 0.758614051973051, + "grad_norm": 2.0855724811553955, + "learning_rate": 0.00013708277575953686, + "loss": 0.968, + "step": 19705 + }, + { + "epoch": 0.7588065447545717, + "grad_norm": 1.4841949939727783, + "learning_rate": 0.00013705469049229594, + "loss": 1.1039, + "step": 19710 + }, + { + "epoch": 0.7589990375360924, + "grad_norm": 1.2720580101013184, + "learning_rate": 0.000137026601836649, + "loss": 0.9966, + "step": 19715 + }, + { + "epoch": 0.7591915303176131, + "grad_norm": 2.275491714477539, + "learning_rate": 0.00013699850979516465, + "loss": 1.1684, + "step": 19720 + }, + { + "epoch": 0.7593840230991338, + "grad_norm": 1.2187795639038086, + "learning_rate": 0.00013697041437041167, + "loss": 0.9793, + "step": 19725 + }, + { + "epoch": 0.7595765158806544, + "grad_norm": 1.1858078241348267, + "learning_rate": 0.00013694231556495915, + "loss": 1.1833, + "step": 19730 + }, + { + "epoch": 0.7597690086621751, + "grad_norm": 1.2739187479019165, + "learning_rate": 0.00013691421338137664, + "loss": 1.1139, + "step": 19735 + }, + { + "epoch": 0.7599615014436959, + "grad_norm": 1.7635918855667114, + "learning_rate": 0.00013688610782223382, + "loss": 1.0935, + "step": 19740 + }, + { + "epoch": 0.7601539942252166, + "grad_norm": 1.7312551736831665, + "learning_rate": 0.00013685799889010084, + "loss": 1.036, + "step": 19745 + }, + { + "epoch": 0.7603464870067372, + "grad_norm": 1.222069501876831, + "learning_rate": 0.00013682988658754797, + "loss": 1.1653, + "step": 19750 + }, + { + "epoch": 0.7605389797882579, + "grad_norm": 2.664635181427002, + "learning_rate": 0.00013680177091714596, + "loss": 1.281, + "step": 19755 + }, + { + "epoch": 0.7607314725697787, + "grad_norm": 1.2842050790786743, + "learning_rate": 0.00013677365188146577, + "loss": 1.1799, + "step": 19760 + }, + { + "epoch": 0.7609239653512994, + "grad_norm": 1.45145583152771, + "learning_rate": 0.00013674552948307874, + "loss": 1.1625, + "step": 19765 + }, + { + "epoch": 0.76111645813282, + "grad_norm": 1.8897767066955566, + "learning_rate": 0.00013671740372455648, + "loss": 1.1714, + "step": 19770 + }, + { + "epoch": 0.7613089509143407, + "grad_norm": 1.171235203742981, + "learning_rate": 0.00013668927460847084, + "loss": 1.2752, + "step": 19775 + }, + { + "epoch": 0.7615014436958614, + "grad_norm": 0.9240397810935974, + "learning_rate": 0.00013666114213739408, + "loss": 1.0669, + "step": 19780 + }, + { + "epoch": 0.7616939364773821, + "grad_norm": 1.654099941253662, + "learning_rate": 0.0001366330063138988, + "loss": 1.1941, + "step": 19785 + }, + { + "epoch": 0.7618864292589028, + "grad_norm": 1.2961543798446655, + "learning_rate": 0.00013660486714055768, + "loss": 1.2552, + "step": 19790 + }, + { + "epoch": 0.7620789220404235, + "grad_norm": 1.2810674905776978, + "learning_rate": 0.00013657672461994398, + "loss": 1.1035, + "step": 19795 + }, + { + "epoch": 0.7622714148219442, + "grad_norm": 3.574240207672119, + "learning_rate": 0.00013654857875463111, + "loss": 1.1724, + "step": 19800 + }, + { + "epoch": 0.7624639076034648, + "grad_norm": 1.0426640510559082, + "learning_rate": 0.00013652042954719282, + "loss": 1.351, + "step": 19805 + }, + { + "epoch": 0.7626564003849856, + "grad_norm": 0.9059193730354309, + "learning_rate": 0.00013649227700020318, + "loss": 1.1989, + "step": 19810 + }, + { + "epoch": 0.7628488931665063, + "grad_norm": 2.0250661373138428, + "learning_rate": 0.00013646412111623657, + "loss": 1.5794, + "step": 19815 + }, + { + "epoch": 0.763041385948027, + "grad_norm": 1.030274748802185, + "learning_rate": 0.00013643596189786758, + "loss": 0.965, + "step": 19820 + }, + { + "epoch": 0.7632338787295476, + "grad_norm": 1.976044774055481, + "learning_rate": 0.0001364077993476713, + "loss": 1.1147, + "step": 19825 + }, + { + "epoch": 0.7634263715110683, + "grad_norm": 1.6923823356628418, + "learning_rate": 0.00013637963346822292, + "loss": 1.1618, + "step": 19830 + }, + { + "epoch": 0.7636188642925891, + "grad_norm": 1.3266521692276, + "learning_rate": 0.00013635146426209805, + "loss": 1.1782, + "step": 19835 + }, + { + "epoch": 0.7638113570741097, + "grad_norm": 1.6700036525726318, + "learning_rate": 0.00013632329173187256, + "loss": 0.9154, + "step": 19840 + }, + { + "epoch": 0.7640038498556304, + "grad_norm": 1.5041186809539795, + "learning_rate": 0.00013629511588012273, + "loss": 1.1082, + "step": 19845 + }, + { + "epoch": 0.7641963426371511, + "grad_norm": 1.4730234146118164, + "learning_rate": 0.0001362669367094249, + "loss": 1.128, + "step": 19850 + }, + { + "epoch": 0.7643888354186718, + "grad_norm": 1.415727972984314, + "learning_rate": 0.00013623875422235602, + "loss": 1.1844, + "step": 19855 + }, + { + "epoch": 0.7645813282001925, + "grad_norm": 1.9785696268081665, + "learning_rate": 0.00013621056842149306, + "loss": 1.1183, + "step": 19860 + }, + { + "epoch": 0.7647738209817132, + "grad_norm": 1.0615553855895996, + "learning_rate": 0.00013618237930941357, + "loss": 1.1257, + "step": 19865 + }, + { + "epoch": 0.7649663137632339, + "grad_norm": 1.749930500984192, + "learning_rate": 0.00013615418688869512, + "loss": 0.9933, + "step": 19870 + }, + { + "epoch": 0.7651588065447545, + "grad_norm": 1.5585590600967407, + "learning_rate": 0.0001361259911619158, + "loss": 1.1877, + "step": 19875 + }, + { + "epoch": 0.7653512993262752, + "grad_norm": 1.4237456321716309, + "learning_rate": 0.00013609779213165393, + "loss": 1.0918, + "step": 19880 + }, + { + "epoch": 0.765543792107796, + "grad_norm": 1.2364110946655273, + "learning_rate": 0.00013606958980048805, + "loss": 1.0557, + "step": 19885 + }, + { + "epoch": 0.7657362848893167, + "grad_norm": 1.0982424020767212, + "learning_rate": 0.00013604138417099712, + "loss": 1.1845, + "step": 19890 + }, + { + "epoch": 0.7659287776708373, + "grad_norm": 0.8089034557342529, + "learning_rate": 0.00013601317524576038, + "loss": 1.139, + "step": 19895 + }, + { + "epoch": 0.766121270452358, + "grad_norm": 1.0913941860198975, + "learning_rate": 0.0001359849630273573, + "loss": 1.2198, + "step": 19900 + }, + { + "epoch": 0.7663137632338787, + "grad_norm": 1.3398661613464355, + "learning_rate": 0.00013595674751836777, + "loss": 0.9453, + "step": 19905 + }, + { + "epoch": 0.7665062560153995, + "grad_norm": 2.1962482929229736, + "learning_rate": 0.00013592852872137186, + "loss": 1.3174, + "step": 19910 + }, + { + "epoch": 0.7666987487969201, + "grad_norm": 1.4308804273605347, + "learning_rate": 0.00013590030663895001, + "loss": 1.1015, + "step": 19915 + }, + { + "epoch": 0.7668912415784408, + "grad_norm": 0.915403425693512, + "learning_rate": 0.00013587208127368292, + "loss": 1.0555, + "step": 19920 + }, + { + "epoch": 0.7670837343599615, + "grad_norm": 1.0108091831207275, + "learning_rate": 0.00013584385262815164, + "loss": 1.1591, + "step": 19925 + }, + { + "epoch": 0.7672762271414822, + "grad_norm": 1.7001339197158813, + "learning_rate": 0.00013581562070493747, + "loss": 1.1671, + "step": 19930 + }, + { + "epoch": 0.7674687199230029, + "grad_norm": 0.7533661723136902, + "learning_rate": 0.00013578738550662207, + "loss": 0.9644, + "step": 19935 + }, + { + "epoch": 0.7676612127045236, + "grad_norm": 1.1101553440093994, + "learning_rate": 0.0001357591470357873, + "loss": 1.0671, + "step": 19940 + }, + { + "epoch": 0.7678537054860443, + "grad_norm": 2.112529993057251, + "learning_rate": 0.00013573090529501544, + "loss": 1.1979, + "step": 19945 + }, + { + "epoch": 0.7680461982675649, + "grad_norm": 1.2636981010437012, + "learning_rate": 0.000135702660286889, + "loss": 1.1494, + "step": 19950 + }, + { + "epoch": 0.7682386910490857, + "grad_norm": 1.5712652206420898, + "learning_rate": 0.00013567441201399073, + "loss": 1.1171, + "step": 19955 + }, + { + "epoch": 0.7684311838306064, + "grad_norm": 1.0548748970031738, + "learning_rate": 0.00013564616047890383, + "loss": 0.9806, + "step": 19960 + }, + { + "epoch": 0.768623676612127, + "grad_norm": 1.828020453453064, + "learning_rate": 0.00013561790568421172, + "loss": 1.2924, + "step": 19965 + }, + { + "epoch": 0.7688161693936477, + "grad_norm": 1.037383794784546, + "learning_rate": 0.00013558964763249804, + "loss": 1.0602, + "step": 19970 + }, + { + "epoch": 0.7690086621751684, + "grad_norm": 1.3003454208374023, + "learning_rate": 0.00013556138632634686, + "loss": 1.0049, + "step": 19975 + }, + { + "epoch": 0.7692011549566892, + "grad_norm": 1.0770816802978516, + "learning_rate": 0.00013553312176834247, + "loss": 1.2497, + "step": 19980 + }, + { + "epoch": 0.7693936477382098, + "grad_norm": 1.5512239933013916, + "learning_rate": 0.00013550485396106947, + "loss": 1.053, + "step": 19985 + }, + { + "epoch": 0.7695861405197305, + "grad_norm": 1.0486184358596802, + "learning_rate": 0.0001354765829071128, + "loss": 1.2014, + "step": 19990 + }, + { + "epoch": 0.7697786333012512, + "grad_norm": 1.2066504955291748, + "learning_rate": 0.00013544830860905762, + "loss": 1.1933, + "step": 19995 + }, + { + "epoch": 0.7699711260827719, + "grad_norm": 1.090703010559082, + "learning_rate": 0.00013542003106948943, + "loss": 1.1504, + "step": 20000 } ], "logging_steps": 5, @@ -14014,7 +28014,7 @@ "num_input_tokens_seen": 0, "num_train_epochs": 2, "save_steps": 10000, - "total_flos": 3.133033729973453e+17, + "total_flos": 6.241026539658117e+17, "train_batch_size": 2, "trial_name": null, "trial_params": null