{ "best_metric": 3.161168336868286, "best_model_checkpoint": "./cifar100_outputs/checkpoint-26565", "epoch": 5.0, "eval_steps": 500, "global_step": 26565, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0018821757952192735, "grad_norm": 1.8490731716156006, "learning_rate": 0.0004998117824204781, "loss": 4.6141, "step": 10 }, { "epoch": 0.003764351590438547, "grad_norm": 1.622644066810608, "learning_rate": 0.0004996235648409562, "loss": 4.6699, "step": 20 }, { "epoch": 0.00564652738565782, "grad_norm": 1.3981355428695679, "learning_rate": 0.0004994353472614342, "loss": 4.6266, "step": 30 }, { "epoch": 0.007528703180877094, "grad_norm": 1.4281316995620728, "learning_rate": 0.0004992471296819123, "loss": 4.5814, "step": 40 }, { "epoch": 0.009410878976096368, "grad_norm": 1.5957081317901611, "learning_rate": 0.0004990589121023904, "loss": 4.6204, "step": 50 }, { "epoch": 0.01129305477131564, "grad_norm": 1.291913390159607, "learning_rate": 0.0004988706945228685, "loss": 4.643, "step": 60 }, { "epoch": 0.013175230566534914, "grad_norm": 0.8935217261314392, "learning_rate": 0.0004986824769433465, "loss": 4.6356, "step": 70 }, { "epoch": 0.015057406361754188, "grad_norm": 0.9641174674034119, "learning_rate": 0.0004984942593638246, "loss": 4.6652, "step": 80 }, { "epoch": 0.01693958215697346, "grad_norm": 0.9098806381225586, "learning_rate": 0.0004983060417843027, "loss": 4.61, "step": 90 }, { "epoch": 0.018821757952192736, "grad_norm": 0.8219736814498901, "learning_rate": 0.0004981178242047808, "loss": 4.5734, "step": 100 }, { "epoch": 0.020703933747412008, "grad_norm": 1.2911275625228882, "learning_rate": 0.0004979296066252589, "loss": 4.6272, "step": 110 }, { "epoch": 0.02258610954263128, "grad_norm": 0.796344518661499, "learning_rate": 0.0004977413890457369, "loss": 4.637, "step": 120 }, { "epoch": 0.024468285337850556, "grad_norm": 1.0010029077529907, "learning_rate": 0.000497553171466215, "loss": 4.6, "step": 130 }, { "epoch": 0.026350461133069828, "grad_norm": 1.2629014253616333, "learning_rate": 0.000497364953886693, "loss": 4.5761, "step": 140 }, { "epoch": 0.028232636928289104, "grad_norm": 1.2411388158798218, "learning_rate": 0.000497176736307171, "loss": 4.6005, "step": 150 }, { "epoch": 0.030114812723508376, "grad_norm": 0.893264651298523, "learning_rate": 0.0004969885187276491, "loss": 4.6009, "step": 160 }, { "epoch": 0.03199698851872765, "grad_norm": 1.3933730125427246, "learning_rate": 0.0004968003011481272, "loss": 4.5853, "step": 170 }, { "epoch": 0.03387916431394692, "grad_norm": 1.3685733079910278, "learning_rate": 0.0004966120835686053, "loss": 4.6171, "step": 180 }, { "epoch": 0.0357613401091662, "grad_norm": 0.8030378818511963, "learning_rate": 0.0004964238659890834, "loss": 4.5877, "step": 190 }, { "epoch": 0.03764351590438547, "grad_norm": 1.0896660089492798, "learning_rate": 0.0004962356484095614, "loss": 4.54, "step": 200 }, { "epoch": 0.039525691699604744, "grad_norm": 1.415959119796753, "learning_rate": 0.0004960474308300395, "loss": 4.5448, "step": 210 }, { "epoch": 0.041407867494824016, "grad_norm": 1.3994077444076538, "learning_rate": 0.0004958592132505176, "loss": 4.5989, "step": 220 }, { "epoch": 0.04329004329004329, "grad_norm": 1.2095682621002197, "learning_rate": 0.0004956709956709957, "loss": 4.5381, "step": 230 }, { "epoch": 0.04517221908526256, "grad_norm": 1.5952842235565186, "learning_rate": 0.0004954827780914737, "loss": 4.5794, "step": 240 }, { "epoch": 0.04705439488048184, "grad_norm": 1.3413859605789185, "learning_rate": 0.0004952945605119518, "loss": 4.5359, "step": 250 }, { "epoch": 0.04893657067570111, "grad_norm": 1.0710666179656982, "learning_rate": 0.0004951063429324299, "loss": 4.6534, "step": 260 }, { "epoch": 0.050818746470920384, "grad_norm": 1.269322395324707, "learning_rate": 0.000494918125352908, "loss": 4.6267, "step": 270 }, { "epoch": 0.052700922266139656, "grad_norm": 1.1679551601409912, "learning_rate": 0.000494729907773386, "loss": 4.4577, "step": 280 }, { "epoch": 0.05458309806135893, "grad_norm": 1.586521029472351, "learning_rate": 0.0004945416901938641, "loss": 4.5726, "step": 290 }, { "epoch": 0.05646527385657821, "grad_norm": 1.1866834163665771, "learning_rate": 0.0004943534726143422, "loss": 4.5967, "step": 300 }, { "epoch": 0.05834744965179748, "grad_norm": 1.2843154668807983, "learning_rate": 0.0004941652550348203, "loss": 4.557, "step": 310 }, { "epoch": 0.06022962544701675, "grad_norm": 0.9741870760917664, "learning_rate": 0.0004939770374552983, "loss": 4.5775, "step": 320 }, { "epoch": 0.062111801242236024, "grad_norm": 1.1748979091644287, "learning_rate": 0.0004937888198757764, "loss": 4.4401, "step": 330 }, { "epoch": 0.0639939770374553, "grad_norm": 1.1653861999511719, "learning_rate": 0.0004936006022962545, "loss": 4.5607, "step": 340 }, { "epoch": 0.06587615283267458, "grad_norm": 1.1719738245010376, "learning_rate": 0.0004934123847167326, "loss": 4.5217, "step": 350 }, { "epoch": 0.06775832862789384, "grad_norm": 0.9163254499435425, "learning_rate": 0.0004932241671372107, "loss": 4.6028, "step": 360 }, { "epoch": 0.06964050442311312, "grad_norm": 0.9993066191673279, "learning_rate": 0.0004930359495576887, "loss": 4.4492, "step": 370 }, { "epoch": 0.0715226802183324, "grad_norm": 1.3269693851470947, "learning_rate": 0.0004928477319781668, "loss": 4.4583, "step": 380 }, { "epoch": 0.07340485601355166, "grad_norm": 1.4873899221420288, "learning_rate": 0.0004926595143986449, "loss": 4.5468, "step": 390 }, { "epoch": 0.07528703180877094, "grad_norm": 1.2793279886245728, "learning_rate": 0.0004924712968191229, "loss": 4.5068, "step": 400 }, { "epoch": 0.07716920760399021, "grad_norm": 1.2136483192443848, "learning_rate": 0.0004922830792396009, "loss": 4.4443, "step": 410 }, { "epoch": 0.07905138339920949, "grad_norm": 1.7519340515136719, "learning_rate": 0.000492094861660079, "loss": 4.422, "step": 420 }, { "epoch": 0.08093355919442875, "grad_norm": 1.1981459856033325, "learning_rate": 0.0004919066440805571, "loss": 4.4402, "step": 430 }, { "epoch": 0.08281573498964803, "grad_norm": 1.1304644346237183, "learning_rate": 0.0004917184265010352, "loss": 4.4206, "step": 440 }, { "epoch": 0.08469791078486731, "grad_norm": 1.0927811861038208, "learning_rate": 0.0004915302089215133, "loss": 4.4864, "step": 450 }, { "epoch": 0.08658008658008658, "grad_norm": 1.6989763975143433, "learning_rate": 0.0004913419913419914, "loss": 4.4381, "step": 460 }, { "epoch": 0.08846226237530586, "grad_norm": 0.9959608316421509, "learning_rate": 0.0004911537737624695, "loss": 4.5199, "step": 470 }, { "epoch": 0.09034443817052512, "grad_norm": 1.1445202827453613, "learning_rate": 0.0004909655561829475, "loss": 4.4059, "step": 480 }, { "epoch": 0.0922266139657444, "grad_norm": 2.0817947387695312, "learning_rate": 0.0004907773386034255, "loss": 4.4654, "step": 490 }, { "epoch": 0.09410878976096368, "grad_norm": 1.2125662565231323, "learning_rate": 0.0004905891210239036, "loss": 4.4801, "step": 500 }, { "epoch": 0.09599096555618294, "grad_norm": 1.3871793746948242, "learning_rate": 0.0004904009034443817, "loss": 4.5371, "step": 510 }, { "epoch": 0.09787314135140222, "grad_norm": 1.5888006687164307, "learning_rate": 0.0004902126858648598, "loss": 4.493, "step": 520 }, { "epoch": 0.09975531714662149, "grad_norm": 1.6036688089370728, "learning_rate": 0.0004900244682853378, "loss": 4.5377, "step": 530 }, { "epoch": 0.10163749294184077, "grad_norm": 1.4229373931884766, "learning_rate": 0.0004898362507058159, "loss": 4.5266, "step": 540 }, { "epoch": 0.10351966873706005, "grad_norm": 4.853313446044922, "learning_rate": 0.000489648033126294, "loss": 4.4592, "step": 550 }, { "epoch": 0.10540184453227931, "grad_norm": 0.9515388607978821, "learning_rate": 0.0004894598155467721, "loss": 4.4654, "step": 560 }, { "epoch": 0.10728402032749859, "grad_norm": 1.1957790851593018, "learning_rate": 0.0004892715979672501, "loss": 4.4855, "step": 570 }, { "epoch": 0.10916619612271786, "grad_norm": 1.4205329418182373, "learning_rate": 0.0004890833803877282, "loss": 4.457, "step": 580 }, { "epoch": 0.11104837191793714, "grad_norm": 1.360944151878357, "learning_rate": 0.0004888951628082063, "loss": 4.508, "step": 590 }, { "epoch": 0.11293054771315642, "grad_norm": 1.8090227842330933, "learning_rate": 0.0004887069452286844, "loss": 4.4387, "step": 600 }, { "epoch": 0.11481272350837568, "grad_norm": 1.6240856647491455, "learning_rate": 0.0004885187276491625, "loss": 4.4155, "step": 610 }, { "epoch": 0.11669489930359496, "grad_norm": 1.5647474527359009, "learning_rate": 0.0004883305100696405, "loss": 4.3604, "step": 620 }, { "epoch": 0.11857707509881422, "grad_norm": 1.6423243284225464, "learning_rate": 0.0004881422924901186, "loss": 4.472, "step": 630 }, { "epoch": 0.1204592508940335, "grad_norm": 1.5964930057525635, "learning_rate": 0.0004879540749105967, "loss": 4.409, "step": 640 }, { "epoch": 0.12234142668925278, "grad_norm": 1.5382463932037354, "learning_rate": 0.00048776585733107476, "loss": 4.3099, "step": 650 }, { "epoch": 0.12422360248447205, "grad_norm": 1.5318776369094849, "learning_rate": 0.0004875776397515528, "loss": 4.4296, "step": 660 }, { "epoch": 0.12610577827969133, "grad_norm": 1.3270454406738281, "learning_rate": 0.00048738942217203086, "loss": 4.2642, "step": 670 }, { "epoch": 0.1279879540749106, "grad_norm": 1.3209527730941772, "learning_rate": 0.00048720120459250894, "loss": 4.4877, "step": 680 }, { "epoch": 0.12987012987012986, "grad_norm": 1.1077908277511597, "learning_rate": 0.000487012987012987, "loss": 4.4918, "step": 690 }, { "epoch": 0.13175230566534915, "grad_norm": 1.3791859149932861, "learning_rate": 0.0004868247694334651, "loss": 4.3768, "step": 700 }, { "epoch": 0.13363448146056842, "grad_norm": 1.269770622253418, "learning_rate": 0.00048663655185394317, "loss": 4.4217, "step": 710 }, { "epoch": 0.13551665725578768, "grad_norm": 1.2476558685302734, "learning_rate": 0.00048644833427442124, "loss": 4.4766, "step": 720 }, { "epoch": 0.13739883305100697, "grad_norm": 1.3043088912963867, "learning_rate": 0.0004862601166948993, "loss": 4.4827, "step": 730 }, { "epoch": 0.13928100884622624, "grad_norm": 1.1240277290344238, "learning_rate": 0.0004860718991153774, "loss": 4.4419, "step": 740 }, { "epoch": 0.1411631846414455, "grad_norm": 1.52292001247406, "learning_rate": 0.0004858836815358554, "loss": 4.4896, "step": 750 }, { "epoch": 0.1430453604366648, "grad_norm": 1.5645487308502197, "learning_rate": 0.0004856954639563335, "loss": 4.4388, "step": 760 }, { "epoch": 0.14492753623188406, "grad_norm": 1.3656686544418335, "learning_rate": 0.00048550724637681157, "loss": 4.4646, "step": 770 }, { "epoch": 0.14680971202710333, "grad_norm": 1.2550185918807983, "learning_rate": 0.0004853190287972897, "loss": 4.3995, "step": 780 }, { "epoch": 0.1486918878223226, "grad_norm": 0.9675887227058411, "learning_rate": 0.0004851308112177678, "loss": 4.4296, "step": 790 }, { "epoch": 0.1505740636175419, "grad_norm": 1.1478201150894165, "learning_rate": 0.00048494259363824585, "loss": 4.3977, "step": 800 }, { "epoch": 0.15245623941276115, "grad_norm": 1.4751043319702148, "learning_rate": 0.00048475437605872393, "loss": 4.3892, "step": 810 }, { "epoch": 0.15433841520798042, "grad_norm": 1.5579464435577393, "learning_rate": 0.000484566158479202, "loss": 4.2969, "step": 820 }, { "epoch": 0.1562205910031997, "grad_norm": 1.3829562664031982, "learning_rate": 0.00048437794089968003, "loss": 4.3862, "step": 830 }, { "epoch": 0.15810276679841898, "grad_norm": 1.6938810348510742, "learning_rate": 0.0004841897233201581, "loss": 4.3947, "step": 840 }, { "epoch": 0.15998494259363824, "grad_norm": 1.089751124382019, "learning_rate": 0.0004840015057406362, "loss": 4.4267, "step": 850 }, { "epoch": 0.1618671183888575, "grad_norm": 0.999055802822113, "learning_rate": 0.00048381328816111426, "loss": 4.504, "step": 860 }, { "epoch": 0.1637492941840768, "grad_norm": 1.2850788831710815, "learning_rate": 0.00048362507058159233, "loss": 4.4024, "step": 870 }, { "epoch": 0.16563146997929606, "grad_norm": 2.449049472808838, "learning_rate": 0.0004834368530020704, "loss": 4.3419, "step": 880 }, { "epoch": 0.16751364577451533, "grad_norm": 1.521384358406067, "learning_rate": 0.0004832486354225485, "loss": 4.4893, "step": 890 }, { "epoch": 0.16939582156973462, "grad_norm": 1.0191763639450073, "learning_rate": 0.00048306041784302656, "loss": 4.4793, "step": 900 }, { "epoch": 0.1712779973649539, "grad_norm": 1.553985834121704, "learning_rate": 0.00048287220026350464, "loss": 4.4463, "step": 910 }, { "epoch": 0.17316017316017315, "grad_norm": 1.253300428390503, "learning_rate": 0.00048268398268398266, "loss": 4.4088, "step": 920 }, { "epoch": 0.17504234895539245, "grad_norm": 1.1468899250030518, "learning_rate": 0.00048249576510446074, "loss": 4.4099, "step": 930 }, { "epoch": 0.1769245247506117, "grad_norm": 1.6279855966567993, "learning_rate": 0.0004823075475249388, "loss": 4.2465, "step": 940 }, { "epoch": 0.17880670054583098, "grad_norm": 1.0638151168823242, "learning_rate": 0.0004821193299454169, "loss": 4.2858, "step": 950 }, { "epoch": 0.18068887634105024, "grad_norm": 1.3204110860824585, "learning_rate": 0.00048193111236589497, "loss": 4.4057, "step": 960 }, { "epoch": 0.18257105213626953, "grad_norm": 1.4396264553070068, "learning_rate": 0.00048174289478637304, "loss": 4.2264, "step": 970 }, { "epoch": 0.1844532279314888, "grad_norm": 1.4043022394180298, "learning_rate": 0.0004815546772068511, "loss": 4.3651, "step": 980 }, { "epoch": 0.18633540372670807, "grad_norm": 2.881331443786621, "learning_rate": 0.0004813664596273292, "loss": 4.3227, "step": 990 }, { "epoch": 0.18821757952192736, "grad_norm": 1.3331611156463623, "learning_rate": 0.00048117824204780733, "loss": 4.4728, "step": 1000 }, { "epoch": 0.19009975531714662, "grad_norm": 1.2341046333312988, "learning_rate": 0.00048099002446828535, "loss": 4.3222, "step": 1010 }, { "epoch": 0.1919819311123659, "grad_norm": 1.335827350616455, "learning_rate": 0.0004808018068887634, "loss": 4.4133, "step": 1020 }, { "epoch": 0.19386410690758518, "grad_norm": 1.5440466403961182, "learning_rate": 0.0004806135893092415, "loss": 4.2855, "step": 1030 }, { "epoch": 0.19574628270280445, "grad_norm": 1.4489920139312744, "learning_rate": 0.0004804253717297196, "loss": 4.3718, "step": 1040 }, { "epoch": 0.1976284584980237, "grad_norm": 1.4121023416519165, "learning_rate": 0.00048023715415019766, "loss": 4.2291, "step": 1050 }, { "epoch": 0.19951063429324298, "grad_norm": 1.3732514381408691, "learning_rate": 0.00048004893657067573, "loss": 4.3611, "step": 1060 }, { "epoch": 0.20139281008846227, "grad_norm": 1.475473165512085, "learning_rate": 0.0004798607189911538, "loss": 4.6097, "step": 1070 }, { "epoch": 0.20327498588368154, "grad_norm": 1.2706341743469238, "learning_rate": 0.0004796725014116319, "loss": 4.2615, "step": 1080 }, { "epoch": 0.2051571616789008, "grad_norm": 1.344974398612976, "learning_rate": 0.0004794842838321099, "loss": 4.3608, "step": 1090 }, { "epoch": 0.2070393374741201, "grad_norm": 1.092437505722046, "learning_rate": 0.000479296066252588, "loss": 4.5277, "step": 1100 }, { "epoch": 0.20892151326933936, "grad_norm": 1.5371427536010742, "learning_rate": 0.00047910784867306606, "loss": 4.3284, "step": 1110 }, { "epoch": 0.21080368906455862, "grad_norm": 2.021251916885376, "learning_rate": 0.00047891963109354414, "loss": 4.3508, "step": 1120 }, { "epoch": 0.2126858648597779, "grad_norm": 1.4358062744140625, "learning_rate": 0.0004787314135140222, "loss": 4.3418, "step": 1130 }, { "epoch": 0.21456804065499718, "grad_norm": 1.276908040046692, "learning_rate": 0.0004785431959345003, "loss": 4.3299, "step": 1140 }, { "epoch": 0.21645021645021645, "grad_norm": 1.2175958156585693, "learning_rate": 0.00047835497835497837, "loss": 4.2586, "step": 1150 }, { "epoch": 0.2183323922454357, "grad_norm": 1.2082417011260986, "learning_rate": 0.00047816676077545644, "loss": 4.4002, "step": 1160 }, { "epoch": 0.220214568040655, "grad_norm": 1.368486762046814, "learning_rate": 0.0004779785431959345, "loss": 4.2378, "step": 1170 }, { "epoch": 0.22209674383587427, "grad_norm": 1.3902267217636108, "learning_rate": 0.00047779032561641254, "loss": 4.3557, "step": 1180 }, { "epoch": 0.22397891963109354, "grad_norm": 1.1859028339385986, "learning_rate": 0.0004776021080368906, "loss": 4.35, "step": 1190 }, { "epoch": 0.22586109542631283, "grad_norm": 1.0749539136886597, "learning_rate": 0.0004774138904573687, "loss": 4.2978, "step": 1200 }, { "epoch": 0.2277432712215321, "grad_norm": 1.339378833770752, "learning_rate": 0.00047722567287784677, "loss": 4.4142, "step": 1210 }, { "epoch": 0.22962544701675136, "grad_norm": 1.3744890689849854, "learning_rate": 0.0004770374552983249, "loss": 4.2964, "step": 1220 }, { "epoch": 0.23150762281197063, "grad_norm": 1.2274056673049927, "learning_rate": 0.000476849237718803, "loss": 4.2599, "step": 1230 }, { "epoch": 0.23338979860718992, "grad_norm": 1.4189059734344482, "learning_rate": 0.00047666102013928105, "loss": 4.1228, "step": 1240 }, { "epoch": 0.23527197440240918, "grad_norm": 1.1967359781265259, "learning_rate": 0.00047647280255975913, "loss": 4.4719, "step": 1250 }, { "epoch": 0.23715415019762845, "grad_norm": 1.1312259435653687, "learning_rate": 0.0004762845849802372, "loss": 4.29, "step": 1260 }, { "epoch": 0.23903632599284774, "grad_norm": 1.4020061492919922, "learning_rate": 0.00047609636740071523, "loss": 4.4895, "step": 1270 }, { "epoch": 0.240918501788067, "grad_norm": 1.219526767730713, "learning_rate": 0.0004759081498211933, "loss": 4.3912, "step": 1280 }, { "epoch": 0.24280067758328627, "grad_norm": 1.411958932876587, "learning_rate": 0.0004757199322416714, "loss": 4.3785, "step": 1290 }, { "epoch": 0.24468285337850557, "grad_norm": 1.2712162733078003, "learning_rate": 0.00047553171466214946, "loss": 4.4343, "step": 1300 }, { "epoch": 0.24656502917372483, "grad_norm": 1.3762128353118896, "learning_rate": 0.00047534349708262753, "loss": 4.2164, "step": 1310 }, { "epoch": 0.2484472049689441, "grad_norm": 1.3928210735321045, "learning_rate": 0.0004751552795031056, "loss": 4.4329, "step": 1320 }, { "epoch": 0.2503293807641634, "grad_norm": 6.659452438354492, "learning_rate": 0.0004749670619235837, "loss": 4.2896, "step": 1330 }, { "epoch": 0.25221155655938265, "grad_norm": 1.5140278339385986, "learning_rate": 0.00047477884434406176, "loss": 4.2381, "step": 1340 }, { "epoch": 0.2540937323546019, "grad_norm": 1.4585140943527222, "learning_rate": 0.00047459062676453984, "loss": 4.2149, "step": 1350 }, { "epoch": 0.2559759081498212, "grad_norm": 1.842198371887207, "learning_rate": 0.00047440240918501786, "loss": 4.2652, "step": 1360 }, { "epoch": 0.25785808394504045, "grad_norm": 1.442714810371399, "learning_rate": 0.00047421419160549594, "loss": 4.2783, "step": 1370 }, { "epoch": 0.2597402597402597, "grad_norm": 1.6346895694732666, "learning_rate": 0.000474025974025974, "loss": 4.1949, "step": 1380 }, { "epoch": 0.26162243553547904, "grad_norm": 1.3816555738449097, "learning_rate": 0.0004738377564464521, "loss": 4.3527, "step": 1390 }, { "epoch": 0.2635046113306983, "grad_norm": 1.330315113067627, "learning_rate": 0.00047364953886693017, "loss": 4.2987, "step": 1400 }, { "epoch": 0.26538678712591757, "grad_norm": 1.4008479118347168, "learning_rate": 0.00047346132128740824, "loss": 4.2562, "step": 1410 }, { "epoch": 0.26726896292113683, "grad_norm": 1.5573803186416626, "learning_rate": 0.0004732731037078863, "loss": 4.2236, "step": 1420 }, { "epoch": 0.2691511387163561, "grad_norm": 1.4606269598007202, "learning_rate": 0.0004730848861283644, "loss": 4.1763, "step": 1430 }, { "epoch": 0.27103331451157536, "grad_norm": 1.312117338180542, "learning_rate": 0.0004728966685488424, "loss": 4.3037, "step": 1440 }, { "epoch": 0.27291549030679463, "grad_norm": 1.4533745050430298, "learning_rate": 0.00047270845096932055, "loss": 4.2849, "step": 1450 }, { "epoch": 0.27479766610201395, "grad_norm": 1.4557713270187378, "learning_rate": 0.0004725202333897986, "loss": 4.2465, "step": 1460 }, { "epoch": 0.2766798418972332, "grad_norm": 1.4600456953048706, "learning_rate": 0.0004723320158102767, "loss": 4.3342, "step": 1470 }, { "epoch": 0.2785620176924525, "grad_norm": 1.6165292263031006, "learning_rate": 0.0004721437982307548, "loss": 4.1626, "step": 1480 }, { "epoch": 0.28044419348767174, "grad_norm": 1.9924274682998657, "learning_rate": 0.00047195558065123286, "loss": 4.1766, "step": 1490 }, { "epoch": 0.282326369282891, "grad_norm": 1.4779736995697021, "learning_rate": 0.00047176736307171093, "loss": 4.1712, "step": 1500 }, { "epoch": 0.2842085450781103, "grad_norm": 1.7137341499328613, "learning_rate": 0.000471579145492189, "loss": 4.3836, "step": 1510 }, { "epoch": 0.2860907208733296, "grad_norm": 1.364959955215454, "learning_rate": 0.0004713909279126671, "loss": 4.3685, "step": 1520 }, { "epoch": 0.28797289666854886, "grad_norm": 1.3166732788085938, "learning_rate": 0.0004712027103331451, "loss": 4.2714, "step": 1530 }, { "epoch": 0.2898550724637681, "grad_norm": 1.5516678094863892, "learning_rate": 0.0004710144927536232, "loss": 4.2025, "step": 1540 }, { "epoch": 0.2917372482589874, "grad_norm": 1.399585247039795, "learning_rate": 0.00047082627517410126, "loss": 4.2243, "step": 1550 }, { "epoch": 0.29361942405420666, "grad_norm": 1.48573899269104, "learning_rate": 0.00047063805759457934, "loss": 4.1842, "step": 1560 }, { "epoch": 0.2955015998494259, "grad_norm": 1.4109703302383423, "learning_rate": 0.0004704498400150574, "loss": 4.2215, "step": 1570 }, { "epoch": 0.2973837756446452, "grad_norm": 1.3801723718643188, "learning_rate": 0.0004702616224355355, "loss": 4.2273, "step": 1580 }, { "epoch": 0.2992659514398645, "grad_norm": 1.7305430173873901, "learning_rate": 0.00047007340485601357, "loss": 4.0852, "step": 1590 }, { "epoch": 0.3011481272350838, "grad_norm": 1.4525911808013916, "learning_rate": 0.00046988518727649164, "loss": 4.3404, "step": 1600 }, { "epoch": 0.30303030303030304, "grad_norm": 1.7163445949554443, "learning_rate": 0.0004696969696969697, "loss": 4.4006, "step": 1610 }, { "epoch": 0.3049124788255223, "grad_norm": 1.2165595293045044, "learning_rate": 0.00046950875211744774, "loss": 4.2984, "step": 1620 }, { "epoch": 0.30679465462074157, "grad_norm": 1.5305345058441162, "learning_rate": 0.0004693205345379258, "loss": 4.2391, "step": 1630 }, { "epoch": 0.30867683041596083, "grad_norm": 1.9343332052230835, "learning_rate": 0.0004691323169584039, "loss": 4.1895, "step": 1640 }, { "epoch": 0.3105590062111801, "grad_norm": 1.4054638147354126, "learning_rate": 0.00046894409937888197, "loss": 4.3532, "step": 1650 }, { "epoch": 0.3124411820063994, "grad_norm": 1.3328653573989868, "learning_rate": 0.0004687558817993601, "loss": 4.0972, "step": 1660 }, { "epoch": 0.3143233578016187, "grad_norm": 1.452494502067566, "learning_rate": 0.0004685676642198382, "loss": 4.1327, "step": 1670 }, { "epoch": 0.31620553359683795, "grad_norm": 1.5234417915344238, "learning_rate": 0.00046837944664031625, "loss": 4.2734, "step": 1680 }, { "epoch": 0.3180877093920572, "grad_norm": 1.4561686515808105, "learning_rate": 0.00046819122906079433, "loss": 4.059, "step": 1690 }, { "epoch": 0.3199698851872765, "grad_norm": 1.7445582151412964, "learning_rate": 0.00046800301148127235, "loss": 4.1765, "step": 1700 }, { "epoch": 0.32185206098249575, "grad_norm": 2.538134813308716, "learning_rate": 0.00046781479390175043, "loss": 4.2543, "step": 1710 }, { "epoch": 0.323734236777715, "grad_norm": 1.6532399654388428, "learning_rate": 0.0004676265763222285, "loss": 4.1489, "step": 1720 }, { "epoch": 0.32561641257293433, "grad_norm": 1.4668339490890503, "learning_rate": 0.0004674383587427066, "loss": 4.2505, "step": 1730 }, { "epoch": 0.3274985883681536, "grad_norm": 1.553717017173767, "learning_rate": 0.00046725014116318466, "loss": 4.2593, "step": 1740 }, { "epoch": 0.32938076416337286, "grad_norm": 1.5993154048919678, "learning_rate": 0.00046706192358366273, "loss": 4.1331, "step": 1750 }, { "epoch": 0.33126293995859213, "grad_norm": 1.5123755931854248, "learning_rate": 0.0004668737060041408, "loss": 4.3395, "step": 1760 }, { "epoch": 0.3331451157538114, "grad_norm": 1.5383719205856323, "learning_rate": 0.0004666854884246189, "loss": 4.1803, "step": 1770 }, { "epoch": 0.33502729154903066, "grad_norm": 1.5453109741210938, "learning_rate": 0.00046649727084509696, "loss": 4.3612, "step": 1780 }, { "epoch": 0.33690946734425, "grad_norm": 1.447769284248352, "learning_rate": 0.000466309053265575, "loss": 4.2859, "step": 1790 }, { "epoch": 0.33879164313946925, "grad_norm": 1.6781235933303833, "learning_rate": 0.00046612083568605306, "loss": 4.0687, "step": 1800 }, { "epoch": 0.3406738189346885, "grad_norm": 1.5212960243225098, "learning_rate": 0.00046593261810653114, "loss": 4.2642, "step": 1810 }, { "epoch": 0.3425559947299078, "grad_norm": 1.2794291973114014, "learning_rate": 0.0004657444005270092, "loss": 4.3414, "step": 1820 }, { "epoch": 0.34443817052512704, "grad_norm": 1.3889094591140747, "learning_rate": 0.0004655561829474873, "loss": 4.1447, "step": 1830 }, { "epoch": 0.3463203463203463, "grad_norm": 1.7165048122406006, "learning_rate": 0.00046536796536796537, "loss": 4.1519, "step": 1840 }, { "epoch": 0.34820252211556557, "grad_norm": 1.3793126344680786, "learning_rate": 0.00046517974778844344, "loss": 4.1838, "step": 1850 }, { "epoch": 0.3500846979107849, "grad_norm": 1.2954260110855103, "learning_rate": 0.0004649915302089215, "loss": 4.2141, "step": 1860 }, { "epoch": 0.35196687370600416, "grad_norm": 1.372908592224121, "learning_rate": 0.0004648033126293996, "loss": 4.2655, "step": 1870 }, { "epoch": 0.3538490495012234, "grad_norm": 1.400633692741394, "learning_rate": 0.0004646150950498776, "loss": 4.2115, "step": 1880 }, { "epoch": 0.3557312252964427, "grad_norm": 1.5919150114059448, "learning_rate": 0.00046442687747035575, "loss": 4.4, "step": 1890 }, { "epoch": 0.35761340109166195, "grad_norm": 1.6317856311798096, "learning_rate": 0.0004642386598908338, "loss": 4.2424, "step": 1900 }, { "epoch": 0.3594955768868812, "grad_norm": 1.5302605628967285, "learning_rate": 0.0004640504423113119, "loss": 4.3054, "step": 1910 }, { "epoch": 0.3613777526821005, "grad_norm": 1.3222270011901855, "learning_rate": 0.00046386222473179, "loss": 4.0888, "step": 1920 }, { "epoch": 0.3632599284773198, "grad_norm": 1.514804482460022, "learning_rate": 0.00046367400715226806, "loss": 4.2651, "step": 1930 }, { "epoch": 0.36514210427253907, "grad_norm": 1.5988330841064453, "learning_rate": 0.00046348578957274613, "loss": 4.2283, "step": 1940 }, { "epoch": 0.36702428006775834, "grad_norm": 1.3682210445404053, "learning_rate": 0.0004632975719932242, "loss": 4.0756, "step": 1950 }, { "epoch": 0.3689064558629776, "grad_norm": 1.4985891580581665, "learning_rate": 0.00046310935441370223, "loss": 4.1255, "step": 1960 }, { "epoch": 0.37078863165819687, "grad_norm": 1.6254980564117432, "learning_rate": 0.0004629211368341803, "loss": 4.1481, "step": 1970 }, { "epoch": 0.37267080745341613, "grad_norm": 1.5056555271148682, "learning_rate": 0.0004627329192546584, "loss": 4.1395, "step": 1980 }, { "epoch": 0.3745529832486354, "grad_norm": 1.5908918380737305, "learning_rate": 0.00046254470167513646, "loss": 4.1166, "step": 1990 }, { "epoch": 0.3764351590438547, "grad_norm": 1.4263468980789185, "learning_rate": 0.00046235648409561454, "loss": 4.1601, "step": 2000 }, { "epoch": 0.378317334839074, "grad_norm": 1.8191322088241577, "learning_rate": 0.0004621682665160926, "loss": 4.3289, "step": 2010 }, { "epoch": 0.38019951063429325, "grad_norm": 1.7897543907165527, "learning_rate": 0.0004619800489365707, "loss": 4.283, "step": 2020 }, { "epoch": 0.3820816864295125, "grad_norm": 1.5530258417129517, "learning_rate": 0.00046179183135704877, "loss": 4.2163, "step": 2030 }, { "epoch": 0.3839638622247318, "grad_norm": 1.560866355895996, "learning_rate": 0.00046160361377752684, "loss": 4.0749, "step": 2040 }, { "epoch": 0.38584603801995104, "grad_norm": 1.2918670177459717, "learning_rate": 0.00046141539619800486, "loss": 4.2007, "step": 2050 }, { "epoch": 0.38772821381517036, "grad_norm": 1.4356296062469482, "learning_rate": 0.00046122717861848294, "loss": 4.2335, "step": 2060 }, { "epoch": 0.38961038961038963, "grad_norm": 1.8680976629257202, "learning_rate": 0.000461038961038961, "loss": 4.0791, "step": 2070 }, { "epoch": 0.3914925654056089, "grad_norm": 1.8302345275878906, "learning_rate": 0.0004608507434594391, "loss": 4.1799, "step": 2080 }, { "epoch": 0.39337474120082816, "grad_norm": 1.5143201351165771, "learning_rate": 0.00046066252587991717, "loss": 4.2113, "step": 2090 }, { "epoch": 0.3952569169960474, "grad_norm": 1.3850644826889038, "learning_rate": 0.00046047430830039525, "loss": 4.2189, "step": 2100 }, { "epoch": 0.3971390927912667, "grad_norm": 1.7276452779769897, "learning_rate": 0.0004602860907208734, "loss": 4.3723, "step": 2110 }, { "epoch": 0.39902126858648596, "grad_norm": 1.5579780340194702, "learning_rate": 0.00046009787314135145, "loss": 4.2855, "step": 2120 }, { "epoch": 0.4009034443817053, "grad_norm": 1.269708275794983, "learning_rate": 0.00045990965556182953, "loss": 4.2179, "step": 2130 }, { "epoch": 0.40278562017692454, "grad_norm": 1.565470814704895, "learning_rate": 0.00045972143798230755, "loss": 4.095, "step": 2140 }, { "epoch": 0.4046677959721438, "grad_norm": 1.5279659032821655, "learning_rate": 0.00045953322040278563, "loss": 4.4537, "step": 2150 }, { "epoch": 0.40654997176736307, "grad_norm": 1.3654329776763916, "learning_rate": 0.0004593450028232637, "loss": 4.306, "step": 2160 }, { "epoch": 0.40843214756258234, "grad_norm": 1.5698778629302979, "learning_rate": 0.0004591567852437418, "loss": 4.1646, "step": 2170 }, { "epoch": 0.4103143233578016, "grad_norm": 1.4834023714065552, "learning_rate": 0.00045896856766421986, "loss": 4.1753, "step": 2180 }, { "epoch": 0.41219649915302087, "grad_norm": 1.5586764812469482, "learning_rate": 0.00045878035008469793, "loss": 4.0278, "step": 2190 }, { "epoch": 0.4140786749482402, "grad_norm": 1.7778464555740356, "learning_rate": 0.000458592132505176, "loss": 4.3505, "step": 2200 }, { "epoch": 0.41596085074345945, "grad_norm": 1.6066936254501343, "learning_rate": 0.0004584039149256541, "loss": 4.0907, "step": 2210 }, { "epoch": 0.4178430265386787, "grad_norm": 1.6600680351257324, "learning_rate": 0.00045821569734613216, "loss": 4.1453, "step": 2220 }, { "epoch": 0.419725202333898, "grad_norm": 1.248590111732483, "learning_rate": 0.0004580274797666102, "loss": 4.1283, "step": 2230 }, { "epoch": 0.42160737812911725, "grad_norm": 1.4683887958526611, "learning_rate": 0.00045783926218708826, "loss": 4.0942, "step": 2240 }, { "epoch": 0.4234895539243365, "grad_norm": 1.6800812482833862, "learning_rate": 0.00045765104460756634, "loss": 4.2249, "step": 2250 }, { "epoch": 0.4253717297195558, "grad_norm": 1.4758554697036743, "learning_rate": 0.0004574628270280444, "loss": 3.9934, "step": 2260 }, { "epoch": 0.4272539055147751, "grad_norm": 1.590091586112976, "learning_rate": 0.0004572746094485225, "loss": 4.0494, "step": 2270 }, { "epoch": 0.42913608130999437, "grad_norm": 1.5667383670806885, "learning_rate": 0.00045708639186900057, "loss": 4.1898, "step": 2280 }, { "epoch": 0.43101825710521363, "grad_norm": 1.5386614799499512, "learning_rate": 0.00045689817428947864, "loss": 4.2019, "step": 2290 }, { "epoch": 0.4329004329004329, "grad_norm": 1.4483133554458618, "learning_rate": 0.0004567099567099567, "loss": 4.2014, "step": 2300 }, { "epoch": 0.43478260869565216, "grad_norm": 1.5541331768035889, "learning_rate": 0.00045652173913043474, "loss": 4.0834, "step": 2310 }, { "epoch": 0.4366647844908714, "grad_norm": 1.7615141868591309, "learning_rate": 0.0004563335215509128, "loss": 4.0695, "step": 2320 }, { "epoch": 0.43854696028609075, "grad_norm": 1.6555922031402588, "learning_rate": 0.00045614530397139095, "loss": 3.9955, "step": 2330 }, { "epoch": 0.44042913608131, "grad_norm": 1.6086030006408691, "learning_rate": 0.000455957086391869, "loss": 4.2755, "step": 2340 }, { "epoch": 0.4423113118765293, "grad_norm": 1.4765291213989258, "learning_rate": 0.0004557688688123471, "loss": 4.1072, "step": 2350 }, { "epoch": 0.44419348767174854, "grad_norm": 1.5320438146591187, "learning_rate": 0.0004555806512328252, "loss": 4.2496, "step": 2360 }, { "epoch": 0.4460756634669678, "grad_norm": 1.398559808731079, "learning_rate": 0.00045539243365330326, "loss": 4.097, "step": 2370 }, { "epoch": 0.4479578392621871, "grad_norm": 1.7332468032836914, "learning_rate": 0.00045520421607378133, "loss": 4.1197, "step": 2380 }, { "epoch": 0.44984001505740634, "grad_norm": 1.7244771718978882, "learning_rate": 0.0004550159984942594, "loss": 4.2804, "step": 2390 }, { "epoch": 0.45172219085262566, "grad_norm": 1.318703532218933, "learning_rate": 0.00045482778091473743, "loss": 4.2258, "step": 2400 }, { "epoch": 0.4536043666478449, "grad_norm": 1.726775050163269, "learning_rate": 0.0004546395633352155, "loss": 4.2059, "step": 2410 }, { "epoch": 0.4554865424430642, "grad_norm": 1.4962884187698364, "learning_rate": 0.0004544513457556936, "loss": 4.0591, "step": 2420 }, { "epoch": 0.45736871823828346, "grad_norm": 1.5442137718200684, "learning_rate": 0.00045426312817617166, "loss": 4.1446, "step": 2430 }, { "epoch": 0.4592508940335027, "grad_norm": 1.510866641998291, "learning_rate": 0.00045407491059664974, "loss": 4.2509, "step": 2440 }, { "epoch": 0.461133069828722, "grad_norm": 1.864684820175171, "learning_rate": 0.0004538866930171278, "loss": 3.9895, "step": 2450 }, { "epoch": 0.46301524562394125, "grad_norm": 1.387701392173767, "learning_rate": 0.0004536984754376059, "loss": 4.1968, "step": 2460 }, { "epoch": 0.4648974214191606, "grad_norm": 2.0901427268981934, "learning_rate": 0.00045351025785808397, "loss": 4.2134, "step": 2470 }, { "epoch": 0.46677959721437984, "grad_norm": 1.6149876117706299, "learning_rate": 0.00045332204027856204, "loss": 4.0816, "step": 2480 }, { "epoch": 0.4686617730095991, "grad_norm": 1.5137728452682495, "learning_rate": 0.00045313382269904006, "loss": 4.1154, "step": 2490 }, { "epoch": 0.47054394880481837, "grad_norm": 3.7697200775146484, "learning_rate": 0.00045294560511951814, "loss": 4.3487, "step": 2500 }, { "epoch": 0.47242612460003763, "grad_norm": 1.4249904155731201, "learning_rate": 0.0004527573875399962, "loss": 4.1435, "step": 2510 }, { "epoch": 0.4743083003952569, "grad_norm": 1.586154818534851, "learning_rate": 0.0004525691699604743, "loss": 4.2354, "step": 2520 }, { "epoch": 0.47619047619047616, "grad_norm": 1.3356468677520752, "learning_rate": 0.00045238095238095237, "loss": 4.2267, "step": 2530 }, { "epoch": 0.4780726519856955, "grad_norm": 1.392693042755127, "learning_rate": 0.00045219273480143045, "loss": 4.2072, "step": 2540 }, { "epoch": 0.47995482778091475, "grad_norm": 1.5842061042785645, "learning_rate": 0.0004520045172219086, "loss": 4.0889, "step": 2550 }, { "epoch": 0.481837003576134, "grad_norm": 1.4757062196731567, "learning_rate": 0.00045181629964238665, "loss": 4.0046, "step": 2560 }, { "epoch": 0.4837191793713533, "grad_norm": 1.8355207443237305, "learning_rate": 0.0004516280820628647, "loss": 4.1834, "step": 2570 }, { "epoch": 0.48560135516657255, "grad_norm": 1.4048017263412476, "learning_rate": 0.00045143986448334275, "loss": 4.1623, "step": 2580 }, { "epoch": 0.4874835309617918, "grad_norm": 1.6089762449264526, "learning_rate": 0.00045125164690382083, "loss": 4.1906, "step": 2590 }, { "epoch": 0.48936570675701113, "grad_norm": 1.5948734283447266, "learning_rate": 0.0004510634293242989, "loss": 4.2207, "step": 2600 }, { "epoch": 0.4912478825522304, "grad_norm": 1.6670500040054321, "learning_rate": 0.000450875211744777, "loss": 4.289, "step": 2610 }, { "epoch": 0.49313005834744966, "grad_norm": 1.7025580406188965, "learning_rate": 0.00045068699416525506, "loss": 4.0739, "step": 2620 }, { "epoch": 0.4950122341426689, "grad_norm": 1.7070814371109009, "learning_rate": 0.00045049877658573313, "loss": 3.8954, "step": 2630 }, { "epoch": 0.4968944099378882, "grad_norm": 1.5894651412963867, "learning_rate": 0.0004503105590062112, "loss": 4.096, "step": 2640 }, { "epoch": 0.49877658573310746, "grad_norm": 1.6690545082092285, "learning_rate": 0.0004501223414266893, "loss": 4.1386, "step": 2650 }, { "epoch": 0.5006587615283268, "grad_norm": 2.1716558933258057, "learning_rate": 0.0004499341238471673, "loss": 4.2536, "step": 2660 }, { "epoch": 0.502540937323546, "grad_norm": 1.5564872026443481, "learning_rate": 0.0004497459062676454, "loss": 4.1387, "step": 2670 }, { "epoch": 0.5044231131187653, "grad_norm": 1.5847035646438599, "learning_rate": 0.00044955768868812346, "loss": 4.2487, "step": 2680 }, { "epoch": 0.5063052889139845, "grad_norm": 1.5552998781204224, "learning_rate": 0.00044936947110860154, "loss": 4.2158, "step": 2690 }, { "epoch": 0.5081874647092038, "grad_norm": 1.409266471862793, "learning_rate": 0.0004491812535290796, "loss": 4.0273, "step": 2700 }, { "epoch": 0.5100696405044232, "grad_norm": 1.5766602754592896, "learning_rate": 0.0004489930359495577, "loss": 4.1938, "step": 2710 }, { "epoch": 0.5119518162996424, "grad_norm": 1.191178321838379, "learning_rate": 0.00044880481837003577, "loss": 4.0563, "step": 2720 }, { "epoch": 0.5138339920948617, "grad_norm": 1.6059616804122925, "learning_rate": 0.00044861660079051384, "loss": 4.2029, "step": 2730 }, { "epoch": 0.5157161678900809, "grad_norm": 1.6359925270080566, "learning_rate": 0.0004484283832109919, "loss": 4.237, "step": 2740 }, { "epoch": 0.5175983436853002, "grad_norm": 1.5290448665618896, "learning_rate": 0.00044824016563146994, "loss": 4.0689, "step": 2750 }, { "epoch": 0.5194805194805194, "grad_norm": 1.4904544353485107, "learning_rate": 0.000448051948051948, "loss": 4.0665, "step": 2760 }, { "epoch": 0.5213626952757388, "grad_norm": 1.4702355861663818, "learning_rate": 0.00044786373047242615, "loss": 4.0486, "step": 2770 }, { "epoch": 0.5232448710709581, "grad_norm": 1.407709002494812, "learning_rate": 0.0004476755128929042, "loss": 4.0626, "step": 2780 }, { "epoch": 0.5251270468661773, "grad_norm": 1.6709678173065186, "learning_rate": 0.0004474872953133823, "loss": 4.1701, "step": 2790 }, { "epoch": 0.5270092226613966, "grad_norm": 1.7822628021240234, "learning_rate": 0.0004472990777338604, "loss": 4.0396, "step": 2800 }, { "epoch": 0.5288913984566158, "grad_norm": 1.638765811920166, "learning_rate": 0.00044711086015433846, "loss": 4.1889, "step": 2810 }, { "epoch": 0.5307735742518351, "grad_norm": 1.868028163909912, "learning_rate": 0.00044692264257481653, "loss": 4.0863, "step": 2820 }, { "epoch": 0.5326557500470543, "grad_norm": 1.691738247871399, "learning_rate": 0.0004467344249952946, "loss": 4.3616, "step": 2830 }, { "epoch": 0.5345379258422737, "grad_norm": 1.8202202320098877, "learning_rate": 0.00044654620741577263, "loss": 4.0839, "step": 2840 }, { "epoch": 0.536420101637493, "grad_norm": 1.5820993185043335, "learning_rate": 0.0004463579898362507, "loss": 4.0845, "step": 2850 }, { "epoch": 0.5383022774327122, "grad_norm": 1.599906325340271, "learning_rate": 0.0004461697722567288, "loss": 3.807, "step": 2860 }, { "epoch": 0.5401844532279315, "grad_norm": 1.9688726663589478, "learning_rate": 0.00044598155467720686, "loss": 3.9582, "step": 2870 }, { "epoch": 0.5420666290231507, "grad_norm": 1.6034358739852905, "learning_rate": 0.00044579333709768494, "loss": 4.1365, "step": 2880 }, { "epoch": 0.54394880481837, "grad_norm": 1.613648533821106, "learning_rate": 0.000445605119518163, "loss": 3.9885, "step": 2890 }, { "epoch": 0.5458309806135893, "grad_norm": 1.4738061428070068, "learning_rate": 0.0004454169019386411, "loss": 4.0579, "step": 2900 }, { "epoch": 0.5477131564088086, "grad_norm": 1.5942338705062866, "learning_rate": 0.00044522868435911917, "loss": 3.9049, "step": 2910 }, { "epoch": 0.5495953322040279, "grad_norm": 1.7312630414962769, "learning_rate": 0.0004450404667795972, "loss": 4.161, "step": 2920 }, { "epoch": 0.5514775079992471, "grad_norm": 1.5873860120773315, "learning_rate": 0.00044485224920007526, "loss": 3.9691, "step": 2930 }, { "epoch": 0.5533596837944664, "grad_norm": 1.842726707458496, "learning_rate": 0.00044466403162055334, "loss": 4.0464, "step": 2940 }, { "epoch": 0.5552418595896856, "grad_norm": 1.5641363859176636, "learning_rate": 0.0004444758140410314, "loss": 3.9814, "step": 2950 }, { "epoch": 0.557124035384905, "grad_norm": 1.7177900075912476, "learning_rate": 0.0004442875964615095, "loss": 4.118, "step": 2960 }, { "epoch": 0.5590062111801242, "grad_norm": 1.7312358617782593, "learning_rate": 0.00044409937888198757, "loss": 4.0702, "step": 2970 }, { "epoch": 0.5608883869753435, "grad_norm": 1.684001088142395, "learning_rate": 0.00044391116130246565, "loss": 4.0587, "step": 2980 }, { "epoch": 0.5627705627705628, "grad_norm": 1.5080180168151855, "learning_rate": 0.0004437229437229438, "loss": 4.2036, "step": 2990 }, { "epoch": 0.564652738565782, "grad_norm": 1.512826681137085, "learning_rate": 0.00044353472614342185, "loss": 4.1267, "step": 3000 }, { "epoch": 0.5665349143610013, "grad_norm": 1.5618807077407837, "learning_rate": 0.0004433465085638999, "loss": 4.0083, "step": 3010 }, { "epoch": 0.5684170901562206, "grad_norm": 1.6022028923034668, "learning_rate": 0.00044315829098437795, "loss": 3.9643, "step": 3020 }, { "epoch": 0.5702992659514399, "grad_norm": 1.5317556858062744, "learning_rate": 0.00044297007340485603, "loss": 4.2311, "step": 3030 }, { "epoch": 0.5721814417466592, "grad_norm": 1.5174047946929932, "learning_rate": 0.0004427818558253341, "loss": 4.0171, "step": 3040 }, { "epoch": 0.5740636175418784, "grad_norm": 2.2394042015075684, "learning_rate": 0.0004425936382458122, "loss": 3.998, "step": 3050 }, { "epoch": 0.5759457933370977, "grad_norm": 2.815605878829956, "learning_rate": 0.00044240542066629026, "loss": 4.0416, "step": 3060 }, { "epoch": 0.5778279691323169, "grad_norm": 1.5818989276885986, "learning_rate": 0.00044221720308676833, "loss": 4.2079, "step": 3070 }, { "epoch": 0.5797101449275363, "grad_norm": 1.8354148864746094, "learning_rate": 0.0004420289855072464, "loss": 4.0311, "step": 3080 }, { "epoch": 0.5815923207227555, "grad_norm": 1.6760532855987549, "learning_rate": 0.0004418407679277245, "loss": 4.1009, "step": 3090 }, { "epoch": 0.5834744965179748, "grad_norm": 1.7229177951812744, "learning_rate": 0.0004416525503482025, "loss": 4.198, "step": 3100 }, { "epoch": 0.5853566723131941, "grad_norm": 1.525316596031189, "learning_rate": 0.0004414643327686806, "loss": 3.8761, "step": 3110 }, { "epoch": 0.5872388481084133, "grad_norm": 1.4251718521118164, "learning_rate": 0.00044127611518915866, "loss": 3.9151, "step": 3120 }, { "epoch": 0.5891210239036326, "grad_norm": 2.061256170272827, "learning_rate": 0.00044108789760963674, "loss": 4.1635, "step": 3130 }, { "epoch": 0.5910031996988518, "grad_norm": 1.8884425163269043, "learning_rate": 0.0004408996800301148, "loss": 3.9408, "step": 3140 }, { "epoch": 0.5928853754940712, "grad_norm": 1.5586994886398315, "learning_rate": 0.0004407114624505929, "loss": 4.1696, "step": 3150 }, { "epoch": 0.5947675512892904, "grad_norm": 1.6019229888916016, "learning_rate": 0.00044052324487107097, "loss": 4.0717, "step": 3160 }, { "epoch": 0.5966497270845097, "grad_norm": 1.593672752380371, "learning_rate": 0.00044033502729154904, "loss": 4.1436, "step": 3170 }, { "epoch": 0.598531902879729, "grad_norm": 1.5448564291000366, "learning_rate": 0.00044014680971202707, "loss": 4.1807, "step": 3180 }, { "epoch": 0.6004140786749482, "grad_norm": 1.498767375946045, "learning_rate": 0.00043995859213250514, "loss": 4.0741, "step": 3190 }, { "epoch": 0.6022962544701675, "grad_norm": 1.4406611919403076, "learning_rate": 0.0004397703745529832, "loss": 3.8672, "step": 3200 }, { "epoch": 0.6041784302653868, "grad_norm": 2.3306994438171387, "learning_rate": 0.0004395821569734613, "loss": 4.0911, "step": 3210 }, { "epoch": 0.6060606060606061, "grad_norm": 1.641932487487793, "learning_rate": 0.0004393939393939394, "loss": 4.1758, "step": 3220 }, { "epoch": 0.6079427818558253, "grad_norm": 1.6793614625930786, "learning_rate": 0.0004392057218144175, "loss": 3.854, "step": 3230 }, { "epoch": 0.6098249576510446, "grad_norm": 1.7702709436416626, "learning_rate": 0.0004390175042348956, "loss": 4.4409, "step": 3240 }, { "epoch": 0.6117071334462639, "grad_norm": 1.4433175325393677, "learning_rate": 0.00043882928665537366, "loss": 3.8387, "step": 3250 }, { "epoch": 0.6135893092414831, "grad_norm": 1.8953951597213745, "learning_rate": 0.00043864106907585173, "loss": 4.238, "step": 3260 }, { "epoch": 0.6154714850367025, "grad_norm": 1.5574579238891602, "learning_rate": 0.00043845285149632975, "loss": 4.0235, "step": 3270 }, { "epoch": 0.6173536608319217, "grad_norm": 1.6335999965667725, "learning_rate": 0.00043826463391680783, "loss": 4.2361, "step": 3280 }, { "epoch": 0.619235836627141, "grad_norm": 1.5225999355316162, "learning_rate": 0.0004380764163372859, "loss": 3.9487, "step": 3290 }, { "epoch": 0.6211180124223602, "grad_norm": 1.7795838117599487, "learning_rate": 0.000437888198757764, "loss": 4.2099, "step": 3300 }, { "epoch": 0.6230001882175795, "grad_norm": 1.8371700048446655, "learning_rate": 0.00043769998117824206, "loss": 4.1858, "step": 3310 }, { "epoch": 0.6248823640127988, "grad_norm": 1.5186216831207275, "learning_rate": 0.00043751176359872014, "loss": 3.9859, "step": 3320 }, { "epoch": 0.626764539808018, "grad_norm": 1.853470802307129, "learning_rate": 0.0004373235460191982, "loss": 4.0308, "step": 3330 }, { "epoch": 0.6286467156032374, "grad_norm": 1.5872433185577393, "learning_rate": 0.0004371353284396763, "loss": 3.9893, "step": 3340 }, { "epoch": 0.6305288913984566, "grad_norm": 1.3795815706253052, "learning_rate": 0.00043694711086015437, "loss": 4.193, "step": 3350 }, { "epoch": 0.6324110671936759, "grad_norm": 1.5030205249786377, "learning_rate": 0.0004367588932806324, "loss": 4.0368, "step": 3360 }, { "epoch": 0.6342932429888951, "grad_norm": 1.807570219039917, "learning_rate": 0.00043657067570111046, "loss": 3.9148, "step": 3370 }, { "epoch": 0.6361754187841144, "grad_norm": 1.7171461582183838, "learning_rate": 0.00043638245812158854, "loss": 4.1182, "step": 3380 }, { "epoch": 0.6380575945793338, "grad_norm": 1.5727144479751587, "learning_rate": 0.0004361942405420666, "loss": 4.0768, "step": 3390 }, { "epoch": 0.639939770374553, "grad_norm": 1.7362148761749268, "learning_rate": 0.0004360060229625447, "loss": 3.9668, "step": 3400 }, { "epoch": 0.6418219461697723, "grad_norm": 1.6728789806365967, "learning_rate": 0.00043581780538302277, "loss": 4.0369, "step": 3410 }, { "epoch": 0.6437041219649915, "grad_norm": 1.7012757062911987, "learning_rate": 0.00043562958780350085, "loss": 4.12, "step": 3420 }, { "epoch": 0.6455862977602108, "grad_norm": 1.728248119354248, "learning_rate": 0.000435441370223979, "loss": 4.0729, "step": 3430 }, { "epoch": 0.64746847355543, "grad_norm": 1.8527989387512207, "learning_rate": 0.00043525315264445705, "loss": 4.2596, "step": 3440 }, { "epoch": 0.6493506493506493, "grad_norm": 1.7408626079559326, "learning_rate": 0.0004350649350649351, "loss": 4.1345, "step": 3450 }, { "epoch": 0.6512328251458687, "grad_norm": 1.4284261465072632, "learning_rate": 0.00043487671748541315, "loss": 4.1048, "step": 3460 }, { "epoch": 0.6531150009410879, "grad_norm": 1.3651869297027588, "learning_rate": 0.00043468849990589123, "loss": 4.0933, "step": 3470 }, { "epoch": 0.6549971767363072, "grad_norm": 1.5003199577331543, "learning_rate": 0.0004345002823263693, "loss": 4.185, "step": 3480 }, { "epoch": 0.6568793525315264, "grad_norm": 1.5484837293624878, "learning_rate": 0.0004343120647468474, "loss": 4.1184, "step": 3490 }, { "epoch": 0.6587615283267457, "grad_norm": 1.3103121519088745, "learning_rate": 0.00043412384716732546, "loss": 4.0191, "step": 3500 }, { "epoch": 0.6606437041219649, "grad_norm": 1.587458848953247, "learning_rate": 0.00043393562958780353, "loss": 4.0854, "step": 3510 }, { "epoch": 0.6625258799171843, "grad_norm": 2.2741780281066895, "learning_rate": 0.0004337474120082816, "loss": 4.0218, "step": 3520 }, { "epoch": 0.6644080557124036, "grad_norm": 1.7086808681488037, "learning_rate": 0.00043355919442875963, "loss": 3.8825, "step": 3530 }, { "epoch": 0.6662902315076228, "grad_norm": 1.551834225654602, "learning_rate": 0.0004333709768492377, "loss": 4.2044, "step": 3540 }, { "epoch": 0.6681724073028421, "grad_norm": 1.484415888786316, "learning_rate": 0.0004331827592697158, "loss": 3.9214, "step": 3550 }, { "epoch": 0.6700545830980613, "grad_norm": 1.6641745567321777, "learning_rate": 0.00043299454169019386, "loss": 4.1115, "step": 3560 }, { "epoch": 0.6719367588932806, "grad_norm": 1.9629614353179932, "learning_rate": 0.00043280632411067194, "loss": 4.0877, "step": 3570 }, { "epoch": 0.6738189346885, "grad_norm": 1.6242876052856445, "learning_rate": 0.00043261810653115, "loss": 3.9146, "step": 3580 }, { "epoch": 0.6757011104837192, "grad_norm": 1.8593428134918213, "learning_rate": 0.0004324298889516281, "loss": 4.0876, "step": 3590 }, { "epoch": 0.6775832862789385, "grad_norm": 1.5309560298919678, "learning_rate": 0.00043224167137210617, "loss": 3.9557, "step": 3600 }, { "epoch": 0.6794654620741577, "grad_norm": 1.5672924518585205, "learning_rate": 0.00043205345379258424, "loss": 4.0553, "step": 3610 }, { "epoch": 0.681347637869377, "grad_norm": 2.1588938236236572, "learning_rate": 0.00043186523621306227, "loss": 3.9586, "step": 3620 }, { "epoch": 0.6832298136645962, "grad_norm": 2.019364595413208, "learning_rate": 0.00043167701863354034, "loss": 3.9824, "step": 3630 }, { "epoch": 0.6851119894598156, "grad_norm": 2.110015869140625, "learning_rate": 0.0004314888010540184, "loss": 3.8083, "step": 3640 }, { "epoch": 0.6869941652550349, "grad_norm": 2.0237793922424316, "learning_rate": 0.0004313005834744965, "loss": 4.0203, "step": 3650 }, { "epoch": 0.6888763410502541, "grad_norm": 1.7034662961959839, "learning_rate": 0.0004311123658949746, "loss": 3.8691, "step": 3660 }, { "epoch": 0.6907585168454734, "grad_norm": 1.8328206539154053, "learning_rate": 0.0004309241483154527, "loss": 4.2907, "step": 3670 }, { "epoch": 0.6926406926406926, "grad_norm": 1.8003268241882324, "learning_rate": 0.0004307359307359308, "loss": 3.8922, "step": 3680 }, { "epoch": 0.6945228684359119, "grad_norm": 2.132638692855835, "learning_rate": 0.00043054771315640886, "loss": 4.1068, "step": 3690 }, { "epoch": 0.6964050442311311, "grad_norm": 1.8007606267929077, "learning_rate": 0.00043035949557688693, "loss": 4.0505, "step": 3700 }, { "epoch": 0.6982872200263505, "grad_norm": 1.9239156246185303, "learning_rate": 0.00043017127799736495, "loss": 3.9898, "step": 3710 }, { "epoch": 0.7001693958215698, "grad_norm": 1.4964417219161987, "learning_rate": 0.00042998306041784303, "loss": 4.0079, "step": 3720 }, { "epoch": 0.702051571616789, "grad_norm": 1.6524834632873535, "learning_rate": 0.0004297948428383211, "loss": 4.118, "step": 3730 }, { "epoch": 0.7039337474120083, "grad_norm": 1.7997394800186157, "learning_rate": 0.0004296066252587992, "loss": 4.1419, "step": 3740 }, { "epoch": 0.7058159232072275, "grad_norm": 1.7603429555892944, "learning_rate": 0.00042941840767927726, "loss": 4.0473, "step": 3750 }, { "epoch": 0.7076980990024468, "grad_norm": 1.5642491579055786, "learning_rate": 0.00042923019009975534, "loss": 4.0603, "step": 3760 }, { "epoch": 0.709580274797666, "grad_norm": 2.023898124694824, "learning_rate": 0.0004290419725202334, "loss": 3.899, "step": 3770 }, { "epoch": 0.7114624505928854, "grad_norm": 1.593841791152954, "learning_rate": 0.0004288537549407115, "loss": 3.9971, "step": 3780 }, { "epoch": 0.7133446263881047, "grad_norm": 1.7649803161621094, "learning_rate": 0.0004286655373611895, "loss": 3.8812, "step": 3790 }, { "epoch": 0.7152268021833239, "grad_norm": 1.7265058755874634, "learning_rate": 0.0004284773197816676, "loss": 4.1328, "step": 3800 }, { "epoch": 0.7171089779785432, "grad_norm": 1.5733685493469238, "learning_rate": 0.00042828910220214566, "loss": 4.0023, "step": 3810 }, { "epoch": 0.7189911537737624, "grad_norm": 1.5658947229385376, "learning_rate": 0.00042810088462262374, "loss": 4.1734, "step": 3820 }, { "epoch": 0.7208733295689818, "grad_norm": 1.6648072004318237, "learning_rate": 0.0004279126670431018, "loss": 4.1272, "step": 3830 }, { "epoch": 0.722755505364201, "grad_norm": 1.7949475049972534, "learning_rate": 0.0004277244494635799, "loss": 3.9424, "step": 3840 }, { "epoch": 0.7246376811594203, "grad_norm": 1.9354840517044067, "learning_rate": 0.00042753623188405797, "loss": 4.1731, "step": 3850 }, { "epoch": 0.7265198569546396, "grad_norm": 1.7900172472000122, "learning_rate": 0.00042734801430453605, "loss": 3.9624, "step": 3860 }, { "epoch": 0.7284020327498588, "grad_norm": 2.012629985809326, "learning_rate": 0.0004271597967250141, "loss": 4.169, "step": 3870 }, { "epoch": 0.7302842085450781, "grad_norm": 1.7707184553146362, "learning_rate": 0.0004269715791454922, "loss": 3.872, "step": 3880 }, { "epoch": 0.7321663843402973, "grad_norm": 1.6689708232879639, "learning_rate": 0.0004267833615659703, "loss": 3.9675, "step": 3890 }, { "epoch": 0.7340485601355167, "grad_norm": 1.8784040212631226, "learning_rate": 0.00042659514398644835, "loss": 4.0006, "step": 3900 }, { "epoch": 0.7359307359307359, "grad_norm": 2.014258623123169, "learning_rate": 0.00042640692640692643, "loss": 4.168, "step": 3910 }, { "epoch": 0.7378129117259552, "grad_norm": 2.2676868438720703, "learning_rate": 0.0004262187088274045, "loss": 3.9855, "step": 3920 }, { "epoch": 0.7396950875211745, "grad_norm": 1.5652729272842407, "learning_rate": 0.0004260304912478826, "loss": 4.0502, "step": 3930 }, { "epoch": 0.7415772633163937, "grad_norm": 2.0387094020843506, "learning_rate": 0.00042584227366836066, "loss": 3.8425, "step": 3940 }, { "epoch": 0.743459439111613, "grad_norm": 1.6358362436294556, "learning_rate": 0.00042565405608883873, "loss": 4.0084, "step": 3950 }, { "epoch": 0.7453416149068323, "grad_norm": 1.7088682651519775, "learning_rate": 0.0004254658385093168, "loss": 4.1777, "step": 3960 }, { "epoch": 0.7472237907020516, "grad_norm": 1.7116954326629639, "learning_rate": 0.00042527762092979483, "loss": 3.8762, "step": 3970 }, { "epoch": 0.7491059664972708, "grad_norm": 2.027803897857666, "learning_rate": 0.0004250894033502729, "loss": 3.9327, "step": 3980 }, { "epoch": 0.7509881422924901, "grad_norm": 2.3380789756774902, "learning_rate": 0.000424901185770751, "loss": 4.0325, "step": 3990 }, { "epoch": 0.7528703180877094, "grad_norm": 2.0593698024749756, "learning_rate": 0.00042471296819122906, "loss": 4.0288, "step": 4000 }, { "epoch": 0.7547524938829286, "grad_norm": 1.9557839632034302, "learning_rate": 0.00042452475061170714, "loss": 4.1358, "step": 4010 }, { "epoch": 0.756634669678148, "grad_norm": 1.571593165397644, "learning_rate": 0.0004243365330321852, "loss": 3.9639, "step": 4020 }, { "epoch": 0.7585168454733672, "grad_norm": 1.6397957801818848, "learning_rate": 0.0004241483154526633, "loss": 4.0101, "step": 4030 }, { "epoch": 0.7603990212685865, "grad_norm": 1.9671742916107178, "learning_rate": 0.00042396009787314137, "loss": 4.0802, "step": 4040 }, { "epoch": 0.7622811970638057, "grad_norm": 1.6972992420196533, "learning_rate": 0.00042377188029361944, "loss": 3.8887, "step": 4050 }, { "epoch": 0.764163372859025, "grad_norm": 1.699816107749939, "learning_rate": 0.00042358366271409747, "loss": 3.8207, "step": 4060 }, { "epoch": 0.7660455486542443, "grad_norm": 1.866045355796814, "learning_rate": 0.00042339544513457554, "loss": 3.9556, "step": 4070 }, { "epoch": 0.7679277244494636, "grad_norm": 1.6141557693481445, "learning_rate": 0.0004232072275550536, "loss": 3.8906, "step": 4080 }, { "epoch": 0.7698099002446829, "grad_norm": 2.044933319091797, "learning_rate": 0.0004230190099755317, "loss": 4.1045, "step": 4090 }, { "epoch": 0.7716920760399021, "grad_norm": 1.7318826913833618, "learning_rate": 0.0004228307923960098, "loss": 4.0916, "step": 4100 }, { "epoch": 0.7735742518351214, "grad_norm": 1.7338465452194214, "learning_rate": 0.0004226425748164879, "loss": 4.1873, "step": 4110 }, { "epoch": 0.7754564276303407, "grad_norm": 1.597584843635559, "learning_rate": 0.000422454357236966, "loss": 3.8938, "step": 4120 }, { "epoch": 0.7773386034255599, "grad_norm": 1.9002665281295776, "learning_rate": 0.00042226613965744406, "loss": 4.2353, "step": 4130 }, { "epoch": 0.7792207792207793, "grad_norm": 1.9633022546768188, "learning_rate": 0.0004220779220779221, "loss": 4.0478, "step": 4140 }, { "epoch": 0.7811029550159985, "grad_norm": 1.6098625659942627, "learning_rate": 0.00042188970449840015, "loss": 3.8488, "step": 4150 }, { "epoch": 0.7829851308112178, "grad_norm": 1.7783234119415283, "learning_rate": 0.00042170148691887823, "loss": 4.2271, "step": 4160 }, { "epoch": 0.784867306606437, "grad_norm": 1.3400135040283203, "learning_rate": 0.0004215132693393563, "loss": 3.9819, "step": 4170 }, { "epoch": 0.7867494824016563, "grad_norm": 1.805132508277893, "learning_rate": 0.0004213250517598344, "loss": 3.9726, "step": 4180 }, { "epoch": 0.7886316581968756, "grad_norm": 1.5507073402404785, "learning_rate": 0.00042113683418031246, "loss": 4.1114, "step": 4190 }, { "epoch": 0.7905138339920948, "grad_norm": 1.6533308029174805, "learning_rate": 0.00042094861660079054, "loss": 4.0381, "step": 4200 }, { "epoch": 0.7923960097873142, "grad_norm": 1.5455400943756104, "learning_rate": 0.0004207603990212686, "loss": 3.9636, "step": 4210 }, { "epoch": 0.7942781855825334, "grad_norm": 1.6970956325531006, "learning_rate": 0.0004205721814417467, "loss": 3.8815, "step": 4220 }, { "epoch": 0.7961603613777527, "grad_norm": 1.7900029420852661, "learning_rate": 0.0004203839638622247, "loss": 3.8217, "step": 4230 }, { "epoch": 0.7980425371729719, "grad_norm": 1.7710243463516235, "learning_rate": 0.0004201957462827028, "loss": 3.9771, "step": 4240 }, { "epoch": 0.7999247129681912, "grad_norm": 1.6762468814849854, "learning_rate": 0.00042000752870318086, "loss": 3.9662, "step": 4250 }, { "epoch": 0.8018068887634106, "grad_norm": 1.785457968711853, "learning_rate": 0.00041981931112365894, "loss": 4.0751, "step": 4260 }, { "epoch": 0.8036890645586298, "grad_norm": 1.6882836818695068, "learning_rate": 0.000419631093544137, "loss": 3.8162, "step": 4270 }, { "epoch": 0.8055712403538491, "grad_norm": 1.752007246017456, "learning_rate": 0.0004194428759646151, "loss": 3.9087, "step": 4280 }, { "epoch": 0.8074534161490683, "grad_norm": 1.6861228942871094, "learning_rate": 0.00041925465838509317, "loss": 4.0087, "step": 4290 }, { "epoch": 0.8093355919442876, "grad_norm": 2.41631817817688, "learning_rate": 0.00041906644080557125, "loss": 3.8155, "step": 4300 }, { "epoch": 0.8112177677395068, "grad_norm": 1.6490652561187744, "learning_rate": 0.0004188782232260493, "loss": 3.8881, "step": 4310 }, { "epoch": 0.8130999435347261, "grad_norm": 1.7375346422195435, "learning_rate": 0.0004186900056465274, "loss": 4.0091, "step": 4320 }, { "epoch": 0.8149821193299455, "grad_norm": 1.9426676034927368, "learning_rate": 0.0004185017880670055, "loss": 3.882, "step": 4330 }, { "epoch": 0.8168642951251647, "grad_norm": 1.895981788635254, "learning_rate": 0.00041831357048748355, "loss": 4.3555, "step": 4340 }, { "epoch": 0.818746470920384, "grad_norm": 1.5857805013656616, "learning_rate": 0.00041812535290796163, "loss": 3.9986, "step": 4350 }, { "epoch": 0.8206286467156032, "grad_norm": 2.0020341873168945, "learning_rate": 0.0004179371353284397, "loss": 3.8985, "step": 4360 }, { "epoch": 0.8225108225108225, "grad_norm": 1.570229172706604, "learning_rate": 0.0004177489177489178, "loss": 3.9683, "step": 4370 }, { "epoch": 0.8243929983060417, "grad_norm": 1.7034296989440918, "learning_rate": 0.00041756070016939586, "loss": 3.7937, "step": 4380 }, { "epoch": 0.8262751741012611, "grad_norm": 1.8847661018371582, "learning_rate": 0.00041737248258987393, "loss": 4.1219, "step": 4390 }, { "epoch": 0.8281573498964804, "grad_norm": 2.25219988822937, "learning_rate": 0.00041718426501035196, "loss": 4.2165, "step": 4400 }, { "epoch": 0.8300395256916996, "grad_norm": 1.679608702659607, "learning_rate": 0.00041699604743083003, "loss": 4.1739, "step": 4410 }, { "epoch": 0.8319217014869189, "grad_norm": 1.7975382804870605, "learning_rate": 0.0004168078298513081, "loss": 3.9637, "step": 4420 }, { "epoch": 0.8338038772821381, "grad_norm": 1.6868278980255127, "learning_rate": 0.0004166196122717862, "loss": 3.7738, "step": 4430 }, { "epoch": 0.8356860530773574, "grad_norm": 1.7453500032424927, "learning_rate": 0.00041643139469226426, "loss": 3.9673, "step": 4440 }, { "epoch": 0.8375682288725766, "grad_norm": 1.622382640838623, "learning_rate": 0.00041624317711274234, "loss": 4.0327, "step": 4450 }, { "epoch": 0.839450404667796, "grad_norm": 2.1576337814331055, "learning_rate": 0.0004160549595332204, "loss": 3.91, "step": 4460 }, { "epoch": 0.8413325804630153, "grad_norm": 1.755730390548706, "learning_rate": 0.0004158667419536985, "loss": 3.8582, "step": 4470 }, { "epoch": 0.8432147562582345, "grad_norm": 1.8487035036087036, "learning_rate": 0.00041567852437417657, "loss": 4.0083, "step": 4480 }, { "epoch": 0.8450969320534538, "grad_norm": 1.5082343816757202, "learning_rate": 0.0004154903067946546, "loss": 3.865, "step": 4490 }, { "epoch": 0.846979107848673, "grad_norm": 2.0205516815185547, "learning_rate": 0.00041530208921513267, "loss": 3.9817, "step": 4500 }, { "epoch": 0.8488612836438924, "grad_norm": 1.7600107192993164, "learning_rate": 0.00041511387163561074, "loss": 4.1025, "step": 4510 }, { "epoch": 0.8507434594391116, "grad_norm": 1.8236727714538574, "learning_rate": 0.0004149256540560888, "loss": 3.9428, "step": 4520 }, { "epoch": 0.8526256352343309, "grad_norm": 1.743336796760559, "learning_rate": 0.0004147374364765669, "loss": 4.0617, "step": 4530 }, { "epoch": 0.8545078110295502, "grad_norm": 1.7079042196273804, "learning_rate": 0.000414549218897045, "loss": 3.9175, "step": 4540 }, { "epoch": 0.8563899868247694, "grad_norm": 1.808450698852539, "learning_rate": 0.0004143610013175231, "loss": 3.911, "step": 4550 }, { "epoch": 0.8582721626199887, "grad_norm": 1.5372434854507446, "learning_rate": 0.0004141727837380012, "loss": 3.847, "step": 4560 }, { "epoch": 0.8601543384152079, "grad_norm": 1.904667854309082, "learning_rate": 0.00041398456615847926, "loss": 4.0016, "step": 4570 }, { "epoch": 0.8620365142104273, "grad_norm": 1.7665495872497559, "learning_rate": 0.0004137963485789573, "loss": 3.7517, "step": 4580 }, { "epoch": 0.8639186900056465, "grad_norm": 1.6588249206542969, "learning_rate": 0.00041360813099943535, "loss": 3.8672, "step": 4590 }, { "epoch": 0.8658008658008658, "grad_norm": 1.5419063568115234, "learning_rate": 0.00041341991341991343, "loss": 4.1185, "step": 4600 }, { "epoch": 0.8676830415960851, "grad_norm": 1.8266115188598633, "learning_rate": 0.0004132316958403915, "loss": 3.8609, "step": 4610 }, { "epoch": 0.8695652173913043, "grad_norm": 1.8634611368179321, "learning_rate": 0.0004130434782608696, "loss": 4.0257, "step": 4620 }, { "epoch": 0.8714473931865236, "grad_norm": 1.7247673273086548, "learning_rate": 0.00041285526068134766, "loss": 4.1326, "step": 4630 }, { "epoch": 0.8733295689817429, "grad_norm": 1.533218264579773, "learning_rate": 0.00041266704310182574, "loss": 3.8748, "step": 4640 }, { "epoch": 0.8752117447769622, "grad_norm": 1.7721878290176392, "learning_rate": 0.0004124788255223038, "loss": 3.991, "step": 4650 }, { "epoch": 0.8770939205721815, "grad_norm": 1.8932552337646484, "learning_rate": 0.00041229060794278183, "loss": 4.0442, "step": 4660 }, { "epoch": 0.8789760963674007, "grad_norm": 1.8900960683822632, "learning_rate": 0.0004121023903632599, "loss": 3.8255, "step": 4670 }, { "epoch": 0.88085827216262, "grad_norm": 1.672855019569397, "learning_rate": 0.000411914172783738, "loss": 3.8736, "step": 4680 }, { "epoch": 0.8827404479578392, "grad_norm": 1.7743964195251465, "learning_rate": 0.00041172595520421606, "loss": 3.9798, "step": 4690 }, { "epoch": 0.8846226237530586, "grad_norm": 1.7346552610397339, "learning_rate": 0.00041153773762469414, "loss": 3.9649, "step": 4700 }, { "epoch": 0.8865047995482778, "grad_norm": 1.8058704137802124, "learning_rate": 0.0004113495200451722, "loss": 3.8931, "step": 4710 }, { "epoch": 0.8883869753434971, "grad_norm": 1.6042428016662598, "learning_rate": 0.0004111613024656503, "loss": 3.9251, "step": 4720 }, { "epoch": 0.8902691511387164, "grad_norm": 1.6752597093582153, "learning_rate": 0.00041097308488612837, "loss": 4.0351, "step": 4730 }, { "epoch": 0.8921513269339356, "grad_norm": 1.673338532447815, "learning_rate": 0.00041078486730660645, "loss": 3.9119, "step": 4740 }, { "epoch": 0.8940335027291549, "grad_norm": 1.7413945198059082, "learning_rate": 0.00041059664972708447, "loss": 3.9578, "step": 4750 }, { "epoch": 0.8959156785243741, "grad_norm": 1.8221259117126465, "learning_rate": 0.00041040843214756254, "loss": 3.8471, "step": 4760 }, { "epoch": 0.8977978543195935, "grad_norm": 1.9183502197265625, "learning_rate": 0.0004102202145680407, "loss": 4.0963, "step": 4770 }, { "epoch": 0.8996800301148127, "grad_norm": 1.765283465385437, "learning_rate": 0.00041003199698851875, "loss": 4.006, "step": 4780 }, { "epoch": 0.901562205910032, "grad_norm": 1.787667155265808, "learning_rate": 0.00040984377940899683, "loss": 3.9434, "step": 4790 }, { "epoch": 0.9034443817052513, "grad_norm": 1.744106650352478, "learning_rate": 0.0004096555618294749, "loss": 4.0278, "step": 4800 }, { "epoch": 0.9053265575004705, "grad_norm": 1.6062647104263306, "learning_rate": 0.000409467344249953, "loss": 4.099, "step": 4810 }, { "epoch": 0.9072087332956899, "grad_norm": 1.778028964996338, "learning_rate": 0.00040927912667043106, "loss": 3.9152, "step": 4820 }, { "epoch": 0.9090909090909091, "grad_norm": 1.7794740200042725, "learning_rate": 0.00040909090909090913, "loss": 3.9034, "step": 4830 }, { "epoch": 0.9109730848861284, "grad_norm": 1.8644254207611084, "learning_rate": 0.00040890269151138716, "loss": 4.0704, "step": 4840 }, { "epoch": 0.9128552606813476, "grad_norm": 1.7791409492492676, "learning_rate": 0.00040871447393186523, "loss": 4.0014, "step": 4850 }, { "epoch": 0.9147374364765669, "grad_norm": 2.3520307540893555, "learning_rate": 0.0004085262563523433, "loss": 3.8349, "step": 4860 }, { "epoch": 0.9166196122717862, "grad_norm": 1.7046860456466675, "learning_rate": 0.0004083380387728214, "loss": 3.95, "step": 4870 }, { "epoch": 0.9185017880670054, "grad_norm": 2.0061821937561035, "learning_rate": 0.00040814982119329946, "loss": 3.8543, "step": 4880 }, { "epoch": 0.9203839638622248, "grad_norm": 1.903624176979065, "learning_rate": 0.00040796160361377754, "loss": 3.9311, "step": 4890 }, { "epoch": 0.922266139657444, "grad_norm": 1.834676742553711, "learning_rate": 0.0004077733860342556, "loss": 4.1316, "step": 4900 }, { "epoch": 0.9241483154526633, "grad_norm": 1.7539821863174438, "learning_rate": 0.0004075851684547337, "loss": 4.0676, "step": 4910 }, { "epoch": 0.9260304912478825, "grad_norm": 2.8327972888946533, "learning_rate": 0.00040739695087521177, "loss": 4.1533, "step": 4920 }, { "epoch": 0.9279126670431018, "grad_norm": 2.6836674213409424, "learning_rate": 0.0004072087332956898, "loss": 3.8235, "step": 4930 }, { "epoch": 0.9297948428383211, "grad_norm": 2.0225014686584473, "learning_rate": 0.00040702051571616787, "loss": 4.2813, "step": 4940 }, { "epoch": 0.9316770186335404, "grad_norm": 1.8623604774475098, "learning_rate": 0.00040683229813664594, "loss": 4.0378, "step": 4950 }, { "epoch": 0.9335591944287597, "grad_norm": 2.075603723526001, "learning_rate": 0.000406644080557124, "loss": 3.8749, "step": 4960 }, { "epoch": 0.9354413702239789, "grad_norm": 1.5741757154464722, "learning_rate": 0.0004064558629776021, "loss": 3.8205, "step": 4970 }, { "epoch": 0.9373235460191982, "grad_norm": 1.8074135780334473, "learning_rate": 0.0004062676453980802, "loss": 3.7819, "step": 4980 }, { "epoch": 0.9392057218144174, "grad_norm": 2.2221150398254395, "learning_rate": 0.0004060794278185583, "loss": 4.1042, "step": 4990 }, { "epoch": 0.9410878976096367, "grad_norm": 1.7813559770584106, "learning_rate": 0.0004058912102390364, "loss": 3.7184, "step": 5000 }, { "epoch": 0.9429700734048561, "grad_norm": 2.023097515106201, "learning_rate": 0.0004057029926595144, "loss": 3.9837, "step": 5010 }, { "epoch": 0.9448522492000753, "grad_norm": 1.661009430885315, "learning_rate": 0.0004055147750799925, "loss": 4.0824, "step": 5020 }, { "epoch": 0.9467344249952946, "grad_norm": 1.5688236951828003, "learning_rate": 0.00040532655750047055, "loss": 3.9856, "step": 5030 }, { "epoch": 0.9486166007905138, "grad_norm": 2.1541364192962646, "learning_rate": 0.00040513833992094863, "loss": 3.7916, "step": 5040 }, { "epoch": 0.9504987765857331, "grad_norm": 1.6808667182922363, "learning_rate": 0.0004049501223414267, "loss": 3.9749, "step": 5050 }, { "epoch": 0.9523809523809523, "grad_norm": 1.978331446647644, "learning_rate": 0.0004047619047619048, "loss": 4.1186, "step": 5060 }, { "epoch": 0.9542631281761716, "grad_norm": 1.737565279006958, "learning_rate": 0.00040457368718238286, "loss": 3.8198, "step": 5070 }, { "epoch": 0.956145303971391, "grad_norm": 1.7728204727172852, "learning_rate": 0.00040438546960286094, "loss": 3.8926, "step": 5080 }, { "epoch": 0.9580274797666102, "grad_norm": 1.7337509393692017, "learning_rate": 0.000404197252023339, "loss": 3.8826, "step": 5090 }, { "epoch": 0.9599096555618295, "grad_norm": 1.9217840433120728, "learning_rate": 0.00040400903444381703, "loss": 3.7385, "step": 5100 }, { "epoch": 0.9617918313570487, "grad_norm": 1.8276395797729492, "learning_rate": 0.0004038208168642951, "loss": 3.9753, "step": 5110 }, { "epoch": 0.963674007152268, "grad_norm": 1.7400699853897095, "learning_rate": 0.0004036325992847732, "loss": 4.0928, "step": 5120 }, { "epoch": 0.9655561829474872, "grad_norm": 1.511858582496643, "learning_rate": 0.00040344438170525126, "loss": 3.9909, "step": 5130 }, { "epoch": 0.9674383587427066, "grad_norm": 1.6028978824615479, "learning_rate": 0.00040325616412572934, "loss": 3.732, "step": 5140 }, { "epoch": 0.9693205345379259, "grad_norm": 2.316171169281006, "learning_rate": 0.0004030679465462074, "loss": 3.9026, "step": 5150 }, { "epoch": 0.9712027103331451, "grad_norm": 1.5085681676864624, "learning_rate": 0.0004028797289666855, "loss": 4.1728, "step": 5160 }, { "epoch": 0.9730848861283644, "grad_norm": 1.9052393436431885, "learning_rate": 0.00040269151138716357, "loss": 4.0342, "step": 5170 }, { "epoch": 0.9749670619235836, "grad_norm": 1.9131525754928589, "learning_rate": 0.00040250329380764165, "loss": 4.0431, "step": 5180 }, { "epoch": 0.9768492377188029, "grad_norm": 1.4925379753112793, "learning_rate": 0.00040231507622811967, "loss": 3.9273, "step": 5190 }, { "epoch": 0.9787314135140223, "grad_norm": 1.9738259315490723, "learning_rate": 0.00040212685864859774, "loss": 3.8051, "step": 5200 }, { "epoch": 0.9806135893092415, "grad_norm": 1.8526817560195923, "learning_rate": 0.0004019386410690759, "loss": 3.9126, "step": 5210 }, { "epoch": 0.9824957651044608, "grad_norm": 1.584219217300415, "learning_rate": 0.00040175042348955395, "loss": 3.9394, "step": 5220 }, { "epoch": 0.98437794089968, "grad_norm": 1.7924845218658447, "learning_rate": 0.00040156220591003203, "loss": 3.95, "step": 5230 }, { "epoch": 0.9862601166948993, "grad_norm": 1.6530725955963135, "learning_rate": 0.0004013739883305101, "loss": 3.8088, "step": 5240 }, { "epoch": 0.9881422924901185, "grad_norm": 2.2157840728759766, "learning_rate": 0.0004011857707509882, "loss": 3.9543, "step": 5250 }, { "epoch": 0.9900244682853379, "grad_norm": 1.9279638528823853, "learning_rate": 0.00040099755317146626, "loss": 3.6932, "step": 5260 }, { "epoch": 0.9919066440805572, "grad_norm": 1.7662793397903442, "learning_rate": 0.0004008093355919443, "loss": 3.8718, "step": 5270 }, { "epoch": 0.9937888198757764, "grad_norm": 1.8058240413665771, "learning_rate": 0.00040062111801242236, "loss": 4.0627, "step": 5280 }, { "epoch": 0.9956709956709957, "grad_norm": 1.6732690334320068, "learning_rate": 0.00040043290043290043, "loss": 3.9005, "step": 5290 }, { "epoch": 0.9975531714662149, "grad_norm": 1.7442728281021118, "learning_rate": 0.0004002446828533785, "loss": 3.8012, "step": 5300 }, { "epoch": 0.9994353472614342, "grad_norm": 1.855129599571228, "learning_rate": 0.0004000564652738566, "loss": 3.7207, "step": 5310 }, { "epoch": 1.0, "eval_accuracy": 0.09853333333333333, "eval_loss": 3.863232374191284, "eval_runtime": 117.7167, "eval_samples_per_second": 63.712, "eval_steps_per_second": 7.968, "step": 5313 }, { "epoch": 1.0013175230566536, "grad_norm": 1.724755883216858, "learning_rate": 0.00039986824769433466, "loss": 3.9155, "step": 5320 }, { "epoch": 1.0031996988518728, "grad_norm": 1.7752708196640015, "learning_rate": 0.00039968003011481274, "loss": 3.9399, "step": 5330 }, { "epoch": 1.005081874647092, "grad_norm": 1.7419073581695557, "learning_rate": 0.0003994918125352908, "loss": 3.866, "step": 5340 }, { "epoch": 1.0069640504423114, "grad_norm": 1.829726219177246, "learning_rate": 0.0003993035949557689, "loss": 3.7939, "step": 5350 }, { "epoch": 1.0088462262375306, "grad_norm": 2.0104973316192627, "learning_rate": 0.0003991153773762469, "loss": 4.0264, "step": 5360 }, { "epoch": 1.0107284020327498, "grad_norm": 1.9856433868408203, "learning_rate": 0.000398927159796725, "loss": 3.9405, "step": 5370 }, { "epoch": 1.012610577827969, "grad_norm": 1.782480001449585, "learning_rate": 0.00039873894221720307, "loss": 3.779, "step": 5380 }, { "epoch": 1.0144927536231885, "grad_norm": 1.956235408782959, "learning_rate": 0.00039855072463768114, "loss": 3.7016, "step": 5390 }, { "epoch": 1.0163749294184077, "grad_norm": 1.7101922035217285, "learning_rate": 0.0003983625070581592, "loss": 3.974, "step": 5400 }, { "epoch": 1.018257105213627, "grad_norm": 1.6829845905303955, "learning_rate": 0.0003981742894786373, "loss": 3.9825, "step": 5410 }, { "epoch": 1.0201392810088463, "grad_norm": 1.9459203481674194, "learning_rate": 0.00039798607189911537, "loss": 3.8148, "step": 5420 }, { "epoch": 1.0220214568040655, "grad_norm": 1.8005633354187012, "learning_rate": 0.0003977978543195935, "loss": 4.0029, "step": 5430 }, { "epoch": 1.0239036325992847, "grad_norm": 1.9317513704299927, "learning_rate": 0.0003976096367400716, "loss": 3.8425, "step": 5440 }, { "epoch": 1.025785808394504, "grad_norm": 1.8140696287155151, "learning_rate": 0.0003974214191605496, "loss": 4.0525, "step": 5450 }, { "epoch": 1.0276679841897234, "grad_norm": 1.8898414373397827, "learning_rate": 0.0003972332015810277, "loss": 3.8046, "step": 5460 }, { "epoch": 1.0295501599849426, "grad_norm": 1.8269977569580078, "learning_rate": 0.00039704498400150575, "loss": 3.9079, "step": 5470 }, { "epoch": 1.0314323357801618, "grad_norm": 1.6486467123031616, "learning_rate": 0.00039685676642198383, "loss": 3.7988, "step": 5480 }, { "epoch": 1.0333145115753812, "grad_norm": 1.916007399559021, "learning_rate": 0.0003966685488424619, "loss": 3.9587, "step": 5490 }, { "epoch": 1.0351966873706004, "grad_norm": 1.9689396619796753, "learning_rate": 0.00039648033126294, "loss": 3.7973, "step": 5500 }, { "epoch": 1.0370788631658197, "grad_norm": 2.0306029319763184, "learning_rate": 0.00039629211368341806, "loss": 3.6851, "step": 5510 }, { "epoch": 1.0389610389610389, "grad_norm": 2.138746976852417, "learning_rate": 0.00039610389610389614, "loss": 3.7968, "step": 5520 }, { "epoch": 1.0408432147562583, "grad_norm": 2.021477699279785, "learning_rate": 0.0003959156785243742, "loss": 3.9782, "step": 5530 }, { "epoch": 1.0427253905514775, "grad_norm": 1.9355220794677734, "learning_rate": 0.00039572746094485223, "loss": 3.5401, "step": 5540 }, { "epoch": 1.0446075663466967, "grad_norm": 2.985809087753296, "learning_rate": 0.0003955392433653303, "loss": 3.8579, "step": 5550 }, { "epoch": 1.0464897421419161, "grad_norm": 2.070136070251465, "learning_rate": 0.0003953510257858084, "loss": 3.81, "step": 5560 }, { "epoch": 1.0483719179371354, "grad_norm": 2.088350772857666, "learning_rate": 0.00039516280820628646, "loss": 3.9072, "step": 5570 }, { "epoch": 1.0502540937323546, "grad_norm": 1.9731718301773071, "learning_rate": 0.00039497459062676454, "loss": 3.7835, "step": 5580 }, { "epoch": 1.0521362695275738, "grad_norm": 1.894665002822876, "learning_rate": 0.0003947863730472426, "loss": 3.7677, "step": 5590 }, { "epoch": 1.0540184453227932, "grad_norm": 2.145423650741577, "learning_rate": 0.0003945981554677207, "loss": 3.7991, "step": 5600 }, { "epoch": 1.0559006211180124, "grad_norm": 1.9230924844741821, "learning_rate": 0.00039440993788819877, "loss": 4.0038, "step": 5610 }, { "epoch": 1.0577827969132316, "grad_norm": 1.7206995487213135, "learning_rate": 0.0003942217203086768, "loss": 3.8487, "step": 5620 }, { "epoch": 1.059664972708451, "grad_norm": 1.8569245338439941, "learning_rate": 0.00039403350272915487, "loss": 3.8765, "step": 5630 }, { "epoch": 1.0615471485036703, "grad_norm": 2.204007148742676, "learning_rate": 0.00039384528514963294, "loss": 4.0123, "step": 5640 }, { "epoch": 1.0634293242988895, "grad_norm": 1.744128942489624, "learning_rate": 0.0003936570675701111, "loss": 4.1011, "step": 5650 }, { "epoch": 1.0653115000941087, "grad_norm": 2.6771187782287598, "learning_rate": 0.00039346884999058915, "loss": 3.8366, "step": 5660 }, { "epoch": 1.0671936758893281, "grad_norm": 2.1822588443756104, "learning_rate": 0.00039328063241106723, "loss": 3.9191, "step": 5670 }, { "epoch": 1.0690758516845473, "grad_norm": 1.7872580289840698, "learning_rate": 0.0003930924148315453, "loss": 3.7534, "step": 5680 }, { "epoch": 1.0709580274797665, "grad_norm": 1.9511586427688599, "learning_rate": 0.0003929041972520234, "loss": 3.6996, "step": 5690 }, { "epoch": 1.072840203274986, "grad_norm": 1.9602952003479004, "learning_rate": 0.00039271597967250146, "loss": 4.0202, "step": 5700 }, { "epoch": 1.0747223790702052, "grad_norm": 2.0772409439086914, "learning_rate": 0.0003925277620929795, "loss": 3.9165, "step": 5710 }, { "epoch": 1.0766045548654244, "grad_norm": 1.903748869895935, "learning_rate": 0.00039233954451345756, "loss": 3.6467, "step": 5720 }, { "epoch": 1.0784867306606438, "grad_norm": 1.9211351871490479, "learning_rate": 0.00039215132693393563, "loss": 3.8662, "step": 5730 }, { "epoch": 1.080368906455863, "grad_norm": 2.4941396713256836, "learning_rate": 0.0003919631093544137, "loss": 3.7316, "step": 5740 }, { "epoch": 1.0822510822510822, "grad_norm": 1.5563691854476929, "learning_rate": 0.0003917748917748918, "loss": 3.9315, "step": 5750 }, { "epoch": 1.0841332580463015, "grad_norm": 1.6314982175827026, "learning_rate": 0.00039158667419536986, "loss": 4.1015, "step": 5760 }, { "epoch": 1.0860154338415209, "grad_norm": 1.7421598434448242, "learning_rate": 0.00039139845661584794, "loss": 3.5859, "step": 5770 }, { "epoch": 1.08789760963674, "grad_norm": 1.7598062753677368, "learning_rate": 0.000391210239036326, "loss": 4.0092, "step": 5780 }, { "epoch": 1.0897797854319593, "grad_norm": 1.8014366626739502, "learning_rate": 0.0003910220214568041, "loss": 3.9926, "step": 5790 }, { "epoch": 1.0916619612271785, "grad_norm": 1.5961711406707764, "learning_rate": 0.0003908338038772821, "loss": 3.8729, "step": 5800 }, { "epoch": 1.093544137022398, "grad_norm": 1.7829089164733887, "learning_rate": 0.0003906455862977602, "loss": 3.7447, "step": 5810 }, { "epoch": 1.0954263128176172, "grad_norm": 2.3133766651153564, "learning_rate": 0.00039045736871823827, "loss": 3.8943, "step": 5820 }, { "epoch": 1.0973084886128364, "grad_norm": 1.9624972343444824, "learning_rate": 0.00039026915113871634, "loss": 4.1605, "step": 5830 }, { "epoch": 1.0991906644080558, "grad_norm": 1.9229626655578613, "learning_rate": 0.0003900809335591944, "loss": 3.8495, "step": 5840 }, { "epoch": 1.101072840203275, "grad_norm": 2.002578020095825, "learning_rate": 0.0003898927159796725, "loss": 3.8138, "step": 5850 }, { "epoch": 1.1029550159984942, "grad_norm": 1.9529259204864502, "learning_rate": 0.00038970449840015057, "loss": 3.9551, "step": 5860 }, { "epoch": 1.1048371917937136, "grad_norm": 1.998673677444458, "learning_rate": 0.0003895162808206287, "loss": 3.8178, "step": 5870 }, { "epoch": 1.1067193675889329, "grad_norm": 2.191293478012085, "learning_rate": 0.0003893280632411067, "loss": 3.5839, "step": 5880 }, { "epoch": 1.108601543384152, "grad_norm": 1.7941858768463135, "learning_rate": 0.0003891398456615848, "loss": 3.9038, "step": 5890 }, { "epoch": 1.1104837191793713, "grad_norm": 1.7840150594711304, "learning_rate": 0.0003889516280820629, "loss": 3.9601, "step": 5900 }, { "epoch": 1.1123658949745907, "grad_norm": 1.9555600881576538, "learning_rate": 0.00038876341050254095, "loss": 3.6967, "step": 5910 }, { "epoch": 1.11424807076981, "grad_norm": 1.902140736579895, "learning_rate": 0.00038857519292301903, "loss": 3.8388, "step": 5920 }, { "epoch": 1.1161302465650291, "grad_norm": 1.6516783237457275, "learning_rate": 0.0003883869753434971, "loss": 3.937, "step": 5930 }, { "epoch": 1.1180124223602483, "grad_norm": 1.862953543663025, "learning_rate": 0.0003881987577639752, "loss": 3.9593, "step": 5940 }, { "epoch": 1.1198945981554678, "grad_norm": 2.068586587905884, "learning_rate": 0.00038801054018445326, "loss": 3.8958, "step": 5950 }, { "epoch": 1.121776773950687, "grad_norm": 1.7183383703231812, "learning_rate": 0.00038782232260493134, "loss": 4.0018, "step": 5960 }, { "epoch": 1.1236589497459062, "grad_norm": 1.8313102722167969, "learning_rate": 0.00038763410502540936, "loss": 3.9836, "step": 5970 }, { "epoch": 1.1255411255411256, "grad_norm": 1.7724077701568604, "learning_rate": 0.00038744588744588743, "loss": 3.7711, "step": 5980 }, { "epoch": 1.1274233013363448, "grad_norm": 2.0144224166870117, "learning_rate": 0.0003872576698663655, "loss": 3.7638, "step": 5990 }, { "epoch": 1.129305477131564, "grad_norm": 2.1548473834991455, "learning_rate": 0.0003870694522868436, "loss": 3.8382, "step": 6000 }, { "epoch": 1.1311876529267835, "grad_norm": 1.721322774887085, "learning_rate": 0.00038688123470732166, "loss": 4.0194, "step": 6010 }, { "epoch": 1.1330698287220027, "grad_norm": 2.17598295211792, "learning_rate": 0.00038669301712779974, "loss": 3.7419, "step": 6020 }, { "epoch": 1.134952004517222, "grad_norm": 2.023616075515747, "learning_rate": 0.0003865047995482778, "loss": 3.6595, "step": 6030 }, { "epoch": 1.136834180312441, "grad_norm": 2.1189849376678467, "learning_rate": 0.0003863165819687559, "loss": 3.7727, "step": 6040 }, { "epoch": 1.1387163561076605, "grad_norm": 1.9913543462753296, "learning_rate": 0.00038612836438923397, "loss": 3.7929, "step": 6050 }, { "epoch": 1.1405985319028797, "grad_norm": 1.8483470678329468, "learning_rate": 0.000385940146809712, "loss": 3.7482, "step": 6060 }, { "epoch": 1.142480707698099, "grad_norm": 2.0956037044525146, "learning_rate": 0.00038575192923019007, "loss": 3.8818, "step": 6070 }, { "epoch": 1.1443628834933182, "grad_norm": 2.2748496532440186, "learning_rate": 0.00038556371165066814, "loss": 3.8901, "step": 6080 }, { "epoch": 1.1462450592885376, "grad_norm": 1.7614413499832153, "learning_rate": 0.0003853754940711463, "loss": 3.7777, "step": 6090 }, { "epoch": 1.1481272350837568, "grad_norm": 1.8067692518234253, "learning_rate": 0.00038518727649162435, "loss": 3.8457, "step": 6100 }, { "epoch": 1.150009410878976, "grad_norm": 2.009246587753296, "learning_rate": 0.00038499905891210243, "loss": 3.8384, "step": 6110 }, { "epoch": 1.1518915866741954, "grad_norm": 1.9846868515014648, "learning_rate": 0.0003848108413325805, "loss": 3.9047, "step": 6120 }, { "epoch": 1.1537737624694147, "grad_norm": 1.5468846559524536, "learning_rate": 0.0003846226237530586, "loss": 3.7324, "step": 6130 }, { "epoch": 1.1556559382646339, "grad_norm": 1.7952297925949097, "learning_rate": 0.00038443440617353666, "loss": 3.6679, "step": 6140 }, { "epoch": 1.1575381140598533, "grad_norm": 1.754523754119873, "learning_rate": 0.0003842461885940147, "loss": 3.8266, "step": 6150 }, { "epoch": 1.1594202898550725, "grad_norm": 1.77576744556427, "learning_rate": 0.00038405797101449276, "loss": 3.7629, "step": 6160 }, { "epoch": 1.1613024656502917, "grad_norm": 1.9010967016220093, "learning_rate": 0.00038386975343497083, "loss": 3.8496, "step": 6170 }, { "epoch": 1.163184641445511, "grad_norm": 1.6943968534469604, "learning_rate": 0.0003836815358554489, "loss": 3.7759, "step": 6180 }, { "epoch": 1.1650668172407304, "grad_norm": 2.6153507232666016, "learning_rate": 0.000383493318275927, "loss": 3.6857, "step": 6190 }, { "epoch": 1.1669489930359496, "grad_norm": 1.597812533378601, "learning_rate": 0.00038330510069640506, "loss": 3.8673, "step": 6200 }, { "epoch": 1.1688311688311688, "grad_norm": 1.7507632970809937, "learning_rate": 0.00038311688311688314, "loss": 3.9767, "step": 6210 }, { "epoch": 1.170713344626388, "grad_norm": 1.8110359907150269, "learning_rate": 0.0003829286655373612, "loss": 3.5311, "step": 6220 }, { "epoch": 1.1725955204216074, "grad_norm": 1.7214828729629517, "learning_rate": 0.00038274044795783924, "loss": 3.7362, "step": 6230 }, { "epoch": 1.1744776962168266, "grad_norm": 1.9230563640594482, "learning_rate": 0.0003825522303783173, "loss": 3.9467, "step": 6240 }, { "epoch": 1.1763598720120458, "grad_norm": 1.797293782234192, "learning_rate": 0.0003823640127987954, "loss": 3.9876, "step": 6250 }, { "epoch": 1.1782420478072653, "grad_norm": 1.9491268396377563, "learning_rate": 0.00038217579521927347, "loss": 3.7447, "step": 6260 }, { "epoch": 1.1801242236024845, "grad_norm": 1.9252543449401855, "learning_rate": 0.00038198757763975154, "loss": 3.9862, "step": 6270 }, { "epoch": 1.1820063993977037, "grad_norm": 2.3227150440216064, "learning_rate": 0.0003817993600602296, "loss": 3.9708, "step": 6280 }, { "epoch": 1.1838885751929231, "grad_norm": 1.932223916053772, "learning_rate": 0.0003816111424807077, "loss": 3.6916, "step": 6290 }, { "epoch": 1.1857707509881423, "grad_norm": 2.2664384841918945, "learning_rate": 0.00038142292490118577, "loss": 3.6087, "step": 6300 }, { "epoch": 1.1876529267833615, "grad_norm": 1.9968208074569702, "learning_rate": 0.0003812347073216639, "loss": 3.8954, "step": 6310 }, { "epoch": 1.1895351025785807, "grad_norm": 2.0324888229370117, "learning_rate": 0.0003810464897421419, "loss": 3.9083, "step": 6320 }, { "epoch": 1.1914172783738002, "grad_norm": 1.6557317972183228, "learning_rate": 0.00038085827216262, "loss": 3.7426, "step": 6330 }, { "epoch": 1.1932994541690194, "grad_norm": 1.7409571409225464, "learning_rate": 0.0003806700545830981, "loss": 3.7642, "step": 6340 }, { "epoch": 1.1951816299642386, "grad_norm": 1.8588122129440308, "learning_rate": 0.00038048183700357615, "loss": 3.893, "step": 6350 }, { "epoch": 1.1970638057594578, "grad_norm": 1.8750513792037964, "learning_rate": 0.00038029361942405423, "loss": 3.8427, "step": 6360 }, { "epoch": 1.1989459815546772, "grad_norm": 2.1529922485351562, "learning_rate": 0.0003801054018445323, "loss": 3.8215, "step": 6370 }, { "epoch": 1.2008281573498965, "grad_norm": 4.190012454986572, "learning_rate": 0.0003799171842650104, "loss": 3.9574, "step": 6380 }, { "epoch": 1.2027103331451157, "grad_norm": 1.9367660284042358, "learning_rate": 0.00037972896668548846, "loss": 3.8696, "step": 6390 }, { "epoch": 1.204592508940335, "grad_norm": 1.7562202215194702, "learning_rate": 0.00037954074910596654, "loss": 3.6758, "step": 6400 }, { "epoch": 1.2064746847355543, "grad_norm": 1.964674472808838, "learning_rate": 0.00037935253152644456, "loss": 3.7275, "step": 6410 }, { "epoch": 1.2083568605307735, "grad_norm": 5.957378387451172, "learning_rate": 0.00037916431394692263, "loss": 3.8423, "step": 6420 }, { "epoch": 1.210239036325993, "grad_norm": 2.2306909561157227, "learning_rate": 0.0003789760963674007, "loss": 3.8344, "step": 6430 }, { "epoch": 1.2121212121212122, "grad_norm": 1.6602094173431396, "learning_rate": 0.0003787878787878788, "loss": 3.7842, "step": 6440 }, { "epoch": 1.2140033879164314, "grad_norm": 1.708493947982788, "learning_rate": 0.00037859966120835686, "loss": 3.9448, "step": 6450 }, { "epoch": 1.2158855637116506, "grad_norm": 1.8723329305648804, "learning_rate": 0.00037841144362883494, "loss": 3.4864, "step": 6460 }, { "epoch": 1.21776773950687, "grad_norm": 1.9271537065505981, "learning_rate": 0.000378223226049313, "loss": 3.9397, "step": 6470 }, { "epoch": 1.2196499153020892, "grad_norm": 2.0064141750335693, "learning_rate": 0.0003780350084697911, "loss": 3.8913, "step": 6480 }, { "epoch": 1.2215320910973084, "grad_norm": 2.434368848800659, "learning_rate": 0.0003778467908902691, "loss": 3.6471, "step": 6490 }, { "epoch": 1.2234142668925279, "grad_norm": 1.9352556467056274, "learning_rate": 0.0003776585733107472, "loss": 3.8066, "step": 6500 }, { "epoch": 1.225296442687747, "grad_norm": 1.7813034057617188, "learning_rate": 0.00037747035573122527, "loss": 3.8825, "step": 6510 }, { "epoch": 1.2271786184829663, "grad_norm": 3.184835433959961, "learning_rate": 0.00037728213815170334, "loss": 3.9238, "step": 6520 }, { "epoch": 1.2290607942781855, "grad_norm": 1.8562132120132446, "learning_rate": 0.0003770939205721814, "loss": 3.7629, "step": 6530 }, { "epoch": 1.230942970073405, "grad_norm": 2.0398480892181396, "learning_rate": 0.00037690570299265955, "loss": 3.821, "step": 6540 }, { "epoch": 1.2328251458686241, "grad_norm": 2.021771192550659, "learning_rate": 0.00037671748541313763, "loss": 3.6642, "step": 6550 }, { "epoch": 1.2347073216638433, "grad_norm": 2.077044725418091, "learning_rate": 0.0003765292678336157, "loss": 3.6619, "step": 6560 }, { "epoch": 1.2365894974590628, "grad_norm": 2.127864360809326, "learning_rate": 0.0003763410502540938, "loss": 3.8543, "step": 6570 }, { "epoch": 1.238471673254282, "grad_norm": 2.043787717819214, "learning_rate": 0.0003761528326745718, "loss": 3.9452, "step": 6580 }, { "epoch": 1.2403538490495012, "grad_norm": 1.8492457866668701, "learning_rate": 0.0003759646150950499, "loss": 3.8527, "step": 6590 }, { "epoch": 1.2422360248447206, "grad_norm": 1.6727014780044556, "learning_rate": 0.00037577639751552796, "loss": 3.5721, "step": 6600 }, { "epoch": 1.2441182006399398, "grad_norm": 1.6961781978607178, "learning_rate": 0.00037558817993600603, "loss": 3.6185, "step": 6610 }, { "epoch": 1.246000376435159, "grad_norm": 2.209446430206299, "learning_rate": 0.0003753999623564841, "loss": 3.9786, "step": 6620 }, { "epoch": 1.2478825522303783, "grad_norm": 1.7225549221038818, "learning_rate": 0.0003752117447769622, "loss": 3.6741, "step": 6630 }, { "epoch": 1.2497647280255977, "grad_norm": 1.9335850477218628, "learning_rate": 0.00037502352719744026, "loss": 3.6865, "step": 6640 }, { "epoch": 1.251646903820817, "grad_norm": 2.207566738128662, "learning_rate": 0.00037483530961791834, "loss": 3.7731, "step": 6650 }, { "epoch": 1.253529079616036, "grad_norm": 2.0488553047180176, "learning_rate": 0.0003746470920383964, "loss": 3.7351, "step": 6660 }, { "epoch": 1.2554112554112553, "grad_norm": 2.094182014465332, "learning_rate": 0.00037445887445887444, "loss": 3.7246, "step": 6670 }, { "epoch": 1.2572934312064747, "grad_norm": 1.9110198020935059, "learning_rate": 0.0003742706568793525, "loss": 3.7926, "step": 6680 }, { "epoch": 1.259175607001694, "grad_norm": 1.949934959411621, "learning_rate": 0.0003740824392998306, "loss": 3.5716, "step": 6690 }, { "epoch": 1.2610577827969132, "grad_norm": 1.7895861864089966, "learning_rate": 0.00037389422172030867, "loss": 3.8369, "step": 6700 }, { "epoch": 1.2629399585921326, "grad_norm": 1.8408478498458862, "learning_rate": 0.00037370600414078674, "loss": 3.8636, "step": 6710 }, { "epoch": 1.2648221343873518, "grad_norm": 2.2939276695251465, "learning_rate": 0.0003735177865612648, "loss": 4.2916, "step": 6720 }, { "epoch": 1.266704310182571, "grad_norm": 2.4173057079315186, "learning_rate": 0.0003733295689817429, "loss": 3.7414, "step": 6730 }, { "epoch": 1.2685864859777904, "grad_norm": 3.014364719390869, "learning_rate": 0.00037314135140222097, "loss": 3.8336, "step": 6740 }, { "epoch": 1.2704686617730097, "grad_norm": 1.7591136693954468, "learning_rate": 0.0003729531338226991, "loss": 3.8213, "step": 6750 }, { "epoch": 1.2723508375682289, "grad_norm": 1.7625524997711182, "learning_rate": 0.0003727649162431771, "loss": 3.8489, "step": 6760 }, { "epoch": 1.274233013363448, "grad_norm": 1.863606333732605, "learning_rate": 0.0003725766986636552, "loss": 3.7942, "step": 6770 }, { "epoch": 1.2761151891586673, "grad_norm": 2.5267436504364014, "learning_rate": 0.0003723884810841333, "loss": 3.7174, "step": 6780 }, { "epoch": 1.2779973649538867, "grad_norm": 2.354560136795044, "learning_rate": 0.00037220026350461135, "loss": 3.7846, "step": 6790 }, { "epoch": 1.279879540749106, "grad_norm": 2.1386847496032715, "learning_rate": 0.00037201204592508943, "loss": 3.9158, "step": 6800 }, { "epoch": 1.2817617165443251, "grad_norm": 2.3380281925201416, "learning_rate": 0.0003718238283455675, "loss": 3.5337, "step": 6810 }, { "epoch": 1.2836438923395446, "grad_norm": 2.3061459064483643, "learning_rate": 0.0003716356107660456, "loss": 4.0088, "step": 6820 }, { "epoch": 1.2855260681347638, "grad_norm": 1.9884824752807617, "learning_rate": 0.00037144739318652366, "loss": 3.9162, "step": 6830 }, { "epoch": 1.287408243929983, "grad_norm": 1.793647050857544, "learning_rate": 0.0003712591756070017, "loss": 3.7617, "step": 6840 }, { "epoch": 1.2892904197252024, "grad_norm": 2.4594411849975586, "learning_rate": 0.00037107095802747976, "loss": 3.8177, "step": 6850 }, { "epoch": 1.2911725955204216, "grad_norm": 2.076636791229248, "learning_rate": 0.00037088274044795783, "loss": 4.0619, "step": 6860 }, { "epoch": 1.2930547713156408, "grad_norm": 1.7677685022354126, "learning_rate": 0.0003706945228684359, "loss": 3.8239, "step": 6870 }, { "epoch": 1.2949369471108603, "grad_norm": 1.7307604551315308, "learning_rate": 0.000370506305288914, "loss": 3.6623, "step": 6880 }, { "epoch": 1.2968191229060795, "grad_norm": 1.8652023077011108, "learning_rate": 0.00037031808770939206, "loss": 3.9653, "step": 6890 }, { "epoch": 1.2987012987012987, "grad_norm": 1.8615070581436157, "learning_rate": 0.00037012987012987014, "loss": 3.7819, "step": 6900 }, { "epoch": 1.300583474496518, "grad_norm": 2.0547096729278564, "learning_rate": 0.0003699416525503482, "loss": 3.9213, "step": 6910 }, { "epoch": 1.3024656502917373, "grad_norm": 1.9442291259765625, "learning_rate": 0.0003697534349708263, "loss": 3.8774, "step": 6920 }, { "epoch": 1.3043478260869565, "grad_norm": 1.550815463066101, "learning_rate": 0.0003695652173913043, "loss": 3.7824, "step": 6930 }, { "epoch": 1.3062300018821758, "grad_norm": 1.9164446592330933, "learning_rate": 0.0003693769998117824, "loss": 3.8274, "step": 6940 }, { "epoch": 1.308112177677395, "grad_norm": 2.2354953289031982, "learning_rate": 0.00036918878223226047, "loss": 3.7788, "step": 6950 }, { "epoch": 1.3099943534726144, "grad_norm": 1.9069128036499023, "learning_rate": 0.00036900056465273854, "loss": 3.5732, "step": 6960 }, { "epoch": 1.3118765292678336, "grad_norm": 2.2385337352752686, "learning_rate": 0.0003688123470732166, "loss": 3.8875, "step": 6970 }, { "epoch": 1.3137587050630528, "grad_norm": 1.7741254568099976, "learning_rate": 0.00036862412949369475, "loss": 3.7516, "step": 6980 }, { "epoch": 1.3156408808582722, "grad_norm": 1.8411750793457031, "learning_rate": 0.00036843591191417283, "loss": 3.738, "step": 6990 }, { "epoch": 1.3175230566534915, "grad_norm": 2.279778003692627, "learning_rate": 0.0003682476943346509, "loss": 3.5345, "step": 7000 }, { "epoch": 1.3194052324487107, "grad_norm": 1.9499634504318237, "learning_rate": 0.000368059476755129, "loss": 3.6431, "step": 7010 }, { "epoch": 1.32128740824393, "grad_norm": 2.2675411701202393, "learning_rate": 0.000367871259175607, "loss": 3.6623, "step": 7020 }, { "epoch": 1.3231695840391493, "grad_norm": 2.3390955924987793, "learning_rate": 0.0003676830415960851, "loss": 3.7228, "step": 7030 }, { "epoch": 1.3250517598343685, "grad_norm": 1.957190752029419, "learning_rate": 0.00036749482401656316, "loss": 3.6893, "step": 7040 }, { "epoch": 1.3269339356295877, "grad_norm": 1.7984035015106201, "learning_rate": 0.00036730660643704123, "loss": 3.6364, "step": 7050 }, { "epoch": 1.3288161114248072, "grad_norm": 3.774339199066162, "learning_rate": 0.0003671183888575193, "loss": 3.8089, "step": 7060 }, { "epoch": 1.3306982872200264, "grad_norm": 2.1288113594055176, "learning_rate": 0.0003669301712779974, "loss": 3.5106, "step": 7070 }, { "epoch": 1.3325804630152456, "grad_norm": 2.831446886062622, "learning_rate": 0.00036674195369847546, "loss": 3.7959, "step": 7080 }, { "epoch": 1.3344626388104648, "grad_norm": 2.0102405548095703, "learning_rate": 0.00036655373611895354, "loss": 3.8562, "step": 7090 }, { "epoch": 1.3363448146056842, "grad_norm": 2.1046881675720215, "learning_rate": 0.00036636551853943156, "loss": 3.8892, "step": 7100 }, { "epoch": 1.3382269904009034, "grad_norm": 1.8360822200775146, "learning_rate": 0.00036617730095990964, "loss": 3.6919, "step": 7110 }, { "epoch": 1.3401091661961226, "grad_norm": 2.020962715148926, "learning_rate": 0.0003659890833803877, "loss": 3.7452, "step": 7120 }, { "epoch": 1.341991341991342, "grad_norm": 1.8157103061676025, "learning_rate": 0.0003658008658008658, "loss": 3.4925, "step": 7130 }, { "epoch": 1.3438735177865613, "grad_norm": 2.1023714542388916, "learning_rate": 0.00036561264822134387, "loss": 3.9745, "step": 7140 }, { "epoch": 1.3457556935817805, "grad_norm": 1.8831114768981934, "learning_rate": 0.00036542443064182194, "loss": 3.8476, "step": 7150 }, { "epoch": 1.347637869377, "grad_norm": 2.2279751300811768, "learning_rate": 0.0003652362130623, "loss": 4.0432, "step": 7160 }, { "epoch": 1.3495200451722191, "grad_norm": 2.070411443710327, "learning_rate": 0.0003650479954827781, "loss": 3.4937, "step": 7170 }, { "epoch": 1.3514022209674383, "grad_norm": 2.1437418460845947, "learning_rate": 0.00036485977790325617, "loss": 3.9656, "step": 7180 }, { "epoch": 1.3532843967626578, "grad_norm": 1.7823975086212158, "learning_rate": 0.0003646715603237342, "loss": 3.8643, "step": 7190 }, { "epoch": 1.355166572557877, "grad_norm": 2.0218539237976074, "learning_rate": 0.0003644833427442123, "loss": 3.7161, "step": 7200 }, { "epoch": 1.3570487483530962, "grad_norm": 2.5077080726623535, "learning_rate": 0.0003642951251646904, "loss": 3.7599, "step": 7210 }, { "epoch": 1.3589309241483154, "grad_norm": 1.9613864421844482, "learning_rate": 0.0003641069075851685, "loss": 3.6094, "step": 7220 }, { "epoch": 1.3608130999435346, "grad_norm": 1.76811945438385, "learning_rate": 0.00036391869000564655, "loss": 3.7828, "step": 7230 }, { "epoch": 1.362695275738754, "grad_norm": 2.1831283569335938, "learning_rate": 0.00036373047242612463, "loss": 3.5293, "step": 7240 }, { "epoch": 1.3645774515339733, "grad_norm": 1.9235538244247437, "learning_rate": 0.0003635422548466027, "loss": 3.4915, "step": 7250 }, { "epoch": 1.3664596273291925, "grad_norm": 2.1489882469177246, "learning_rate": 0.0003633540372670808, "loss": 3.8052, "step": 7260 }, { "epoch": 1.368341803124412, "grad_norm": 1.9953148365020752, "learning_rate": 0.00036316581968755886, "loss": 3.8223, "step": 7270 }, { "epoch": 1.370223978919631, "grad_norm": 2.3743884563446045, "learning_rate": 0.0003629776021080369, "loss": 3.6882, "step": 7280 }, { "epoch": 1.3721061547148503, "grad_norm": 1.9586063623428345, "learning_rate": 0.00036278938452851496, "loss": 3.7565, "step": 7290 }, { "epoch": 1.3739883305100697, "grad_norm": 1.9832416772842407, "learning_rate": 0.00036260116694899303, "loss": 3.6247, "step": 7300 }, { "epoch": 1.375870506305289, "grad_norm": 1.792038917541504, "learning_rate": 0.0003624129493694711, "loss": 3.643, "step": 7310 }, { "epoch": 1.3777526821005082, "grad_norm": 2.0198590755462646, "learning_rate": 0.0003622247317899492, "loss": 3.762, "step": 7320 }, { "epoch": 1.3796348578957276, "grad_norm": 2.0416181087493896, "learning_rate": 0.00036203651421042726, "loss": 3.5613, "step": 7330 }, { "epoch": 1.3815170336909468, "grad_norm": 2.0119645595550537, "learning_rate": 0.00036184829663090534, "loss": 3.7551, "step": 7340 }, { "epoch": 1.383399209486166, "grad_norm": 1.6131541728973389, "learning_rate": 0.0003616600790513834, "loss": 3.788, "step": 7350 }, { "epoch": 1.3852813852813852, "grad_norm": 1.9785635471343994, "learning_rate": 0.00036147186147186144, "loss": 3.6012, "step": 7360 }, { "epoch": 1.3871635610766044, "grad_norm": 1.961735725402832, "learning_rate": 0.0003612836438923395, "loss": 3.52, "step": 7370 }, { "epoch": 1.3890457368718239, "grad_norm": 2.128279209136963, "learning_rate": 0.0003610954263128176, "loss": 3.8339, "step": 7380 }, { "epoch": 1.390927912667043, "grad_norm": 1.8960368633270264, "learning_rate": 0.00036090720873329567, "loss": 3.6089, "step": 7390 }, { "epoch": 1.3928100884622623, "grad_norm": 2.069545269012451, "learning_rate": 0.00036071899115377374, "loss": 3.9857, "step": 7400 }, { "epoch": 1.3946922642574817, "grad_norm": 2.1086106300354004, "learning_rate": 0.0003605307735742518, "loss": 3.9457, "step": 7410 }, { "epoch": 1.396574440052701, "grad_norm": 1.82731294631958, "learning_rate": 0.00036034255599472995, "loss": 3.8394, "step": 7420 }, { "epoch": 1.3984566158479201, "grad_norm": 1.7885284423828125, "learning_rate": 0.00036015433841520803, "loss": 3.5917, "step": 7430 }, { "epoch": 1.4003387916431396, "grad_norm": 1.8847382068634033, "learning_rate": 0.0003599661208356861, "loss": 3.8681, "step": 7440 }, { "epoch": 1.4022209674383588, "grad_norm": 1.9530550241470337, "learning_rate": 0.0003597779032561641, "loss": 3.7272, "step": 7450 }, { "epoch": 1.404103143233578, "grad_norm": 2.2070717811584473, "learning_rate": 0.0003595896856766422, "loss": 4.1098, "step": 7460 }, { "epoch": 1.4059853190287974, "grad_norm": 1.8252630233764648, "learning_rate": 0.0003594014680971203, "loss": 3.6822, "step": 7470 }, { "epoch": 1.4078674948240166, "grad_norm": 1.8794349431991577, "learning_rate": 0.00035921325051759836, "loss": 3.8877, "step": 7480 }, { "epoch": 1.4097496706192358, "grad_norm": 1.8576912879943848, "learning_rate": 0.00035902503293807643, "loss": 3.8438, "step": 7490 }, { "epoch": 1.411631846414455, "grad_norm": 2.027756690979004, "learning_rate": 0.0003588368153585545, "loss": 3.7399, "step": 7500 }, { "epoch": 1.4135140222096743, "grad_norm": 2.149160861968994, "learning_rate": 0.0003586485977790326, "loss": 3.8171, "step": 7510 }, { "epoch": 1.4153961980048937, "grad_norm": 2.319580554962158, "learning_rate": 0.00035846038019951066, "loss": 3.7717, "step": 7520 }, { "epoch": 1.417278373800113, "grad_norm": 3.2247745990753174, "learning_rate": 0.00035827216261998874, "loss": 3.7937, "step": 7530 }, { "epoch": 1.419160549595332, "grad_norm": 1.9991421699523926, "learning_rate": 0.00035808394504046676, "loss": 3.437, "step": 7540 }, { "epoch": 1.4210427253905515, "grad_norm": 2.586153984069824, "learning_rate": 0.00035789572746094484, "loss": 3.963, "step": 7550 }, { "epoch": 1.4229249011857708, "grad_norm": 2.0890746116638184, "learning_rate": 0.0003577075098814229, "loss": 3.6881, "step": 7560 }, { "epoch": 1.42480707698099, "grad_norm": 1.775581955909729, "learning_rate": 0.000357519292301901, "loss": 3.5551, "step": 7570 }, { "epoch": 1.4266892527762094, "grad_norm": 2.140167474746704, "learning_rate": 0.00035733107472237907, "loss": 3.7377, "step": 7580 }, { "epoch": 1.4285714285714286, "grad_norm": 1.9478838443756104, "learning_rate": 0.00035714285714285714, "loss": 3.786, "step": 7590 }, { "epoch": 1.4304536043666478, "grad_norm": 2.1335339546203613, "learning_rate": 0.0003569546395633352, "loss": 3.6391, "step": 7600 }, { "epoch": 1.4323357801618672, "grad_norm": 2.1355767250061035, "learning_rate": 0.0003567664219838133, "loss": 3.5135, "step": 7610 }, { "epoch": 1.4342179559570865, "grad_norm": 1.8865129947662354, "learning_rate": 0.00035657820440429137, "loss": 3.7835, "step": 7620 }, { "epoch": 1.4361001317523057, "grad_norm": 2.1274356842041016, "learning_rate": 0.0003563899868247694, "loss": 3.671, "step": 7630 }, { "epoch": 1.4379823075475249, "grad_norm": 2.126617670059204, "learning_rate": 0.0003562017692452475, "loss": 3.953, "step": 7640 }, { "epoch": 1.439864483342744, "grad_norm": 1.896268367767334, "learning_rate": 0.0003560135516657256, "loss": 3.7844, "step": 7650 }, { "epoch": 1.4417466591379635, "grad_norm": 2.0018656253814697, "learning_rate": 0.0003558253340862037, "loss": 3.9747, "step": 7660 }, { "epoch": 1.4436288349331827, "grad_norm": 2.149848222732544, "learning_rate": 0.00035563711650668175, "loss": 3.6292, "step": 7670 }, { "epoch": 1.445511010728402, "grad_norm": 1.9063892364501953, "learning_rate": 0.00035544889892715983, "loss": 3.9736, "step": 7680 }, { "epoch": 1.4473931865236214, "grad_norm": 2.0680341720581055, "learning_rate": 0.0003552606813476379, "loss": 3.7596, "step": 7690 }, { "epoch": 1.4492753623188406, "grad_norm": 1.9787496328353882, "learning_rate": 0.000355072463768116, "loss": 3.9205, "step": 7700 }, { "epoch": 1.4511575381140598, "grad_norm": 2.1122937202453613, "learning_rate": 0.000354884246188594, "loss": 3.8455, "step": 7710 }, { "epoch": 1.4530397139092792, "grad_norm": 2.253380298614502, "learning_rate": 0.0003546960286090721, "loss": 3.797, "step": 7720 }, { "epoch": 1.4549218897044984, "grad_norm": 1.7869551181793213, "learning_rate": 0.00035450781102955016, "loss": 3.7453, "step": 7730 }, { "epoch": 1.4568040654997176, "grad_norm": 2.2952880859375, "learning_rate": 0.00035431959345002823, "loss": 3.8178, "step": 7740 }, { "epoch": 1.458686241294937, "grad_norm": 2.147580623626709, "learning_rate": 0.0003541313758705063, "loss": 3.8506, "step": 7750 }, { "epoch": 1.4605684170901563, "grad_norm": 1.6160011291503906, "learning_rate": 0.0003539431582909844, "loss": 3.8723, "step": 7760 }, { "epoch": 1.4624505928853755, "grad_norm": 1.8009629249572754, "learning_rate": 0.00035375494071146246, "loss": 4.0286, "step": 7770 }, { "epoch": 1.4643327686805947, "grad_norm": 2.0394809246063232, "learning_rate": 0.00035356672313194054, "loss": 3.7727, "step": 7780 }, { "epoch": 1.466214944475814, "grad_norm": 1.983485221862793, "learning_rate": 0.0003533785055524186, "loss": 3.674, "step": 7790 }, { "epoch": 1.4680971202710333, "grad_norm": 1.917427659034729, "learning_rate": 0.00035319028797289664, "loss": 3.8096, "step": 7800 }, { "epoch": 1.4699792960662525, "grad_norm": 1.7408658266067505, "learning_rate": 0.0003530020703933747, "loss": 3.9625, "step": 7810 }, { "epoch": 1.4718614718614718, "grad_norm": 2.1909754276275635, "learning_rate": 0.0003528138528138528, "loss": 3.4834, "step": 7820 }, { "epoch": 1.4737436476566912, "grad_norm": 2.2905852794647217, "learning_rate": 0.00035262563523433087, "loss": 3.7214, "step": 7830 }, { "epoch": 1.4756258234519104, "grad_norm": 2.4130899906158447, "learning_rate": 0.00035243741765480894, "loss": 3.7839, "step": 7840 }, { "epoch": 1.4775079992471296, "grad_norm": 2.264302968978882, "learning_rate": 0.000352249200075287, "loss": 3.7727, "step": 7850 }, { "epoch": 1.479390175042349, "grad_norm": 2.3765761852264404, "learning_rate": 0.00035206098249576515, "loss": 3.7051, "step": 7860 }, { "epoch": 1.4812723508375683, "grad_norm": 1.75021231174469, "learning_rate": 0.00035187276491624323, "loss": 3.9133, "step": 7870 }, { "epoch": 1.4831545266327875, "grad_norm": 1.860624074935913, "learning_rate": 0.0003516845473367213, "loss": 3.8135, "step": 7880 }, { "epoch": 1.485036702428007, "grad_norm": 1.8294997215270996, "learning_rate": 0.0003514963297571993, "loss": 3.6046, "step": 7890 }, { "epoch": 1.486918878223226, "grad_norm": 1.9020800590515137, "learning_rate": 0.0003513081121776774, "loss": 3.7568, "step": 7900 }, { "epoch": 1.4888010540184453, "grad_norm": 1.8927626609802246, "learning_rate": 0.0003511198945981555, "loss": 4.121, "step": 7910 }, { "epoch": 1.4906832298136645, "grad_norm": 1.9175612926483154, "learning_rate": 0.00035093167701863356, "loss": 3.657, "step": 7920 }, { "epoch": 1.4925654056088837, "grad_norm": 2.6176860332489014, "learning_rate": 0.00035074345943911163, "loss": 3.7087, "step": 7930 }, { "epoch": 1.4944475814041032, "grad_norm": 1.8022160530090332, "learning_rate": 0.0003505552418595897, "loss": 3.9254, "step": 7940 }, { "epoch": 1.4963297571993224, "grad_norm": 1.8247177600860596, "learning_rate": 0.0003503670242800678, "loss": 3.5697, "step": 7950 }, { "epoch": 1.4982119329945416, "grad_norm": 1.8403944969177246, "learning_rate": 0.00035017880670054586, "loss": 3.591, "step": 7960 }, { "epoch": 1.500094108789761, "grad_norm": 1.9349924325942993, "learning_rate": 0.0003499905891210239, "loss": 3.6947, "step": 7970 }, { "epoch": 1.5019762845849802, "grad_norm": 2.04209303855896, "learning_rate": 0.00034980237154150196, "loss": 3.8419, "step": 7980 }, { "epoch": 1.5038584603801994, "grad_norm": 1.9403793811798096, "learning_rate": 0.00034961415396198004, "loss": 3.8686, "step": 7990 }, { "epoch": 1.5057406361754189, "grad_norm": 2.132564067840576, "learning_rate": 0.0003494259363824581, "loss": 3.7871, "step": 8000 }, { "epoch": 1.507622811970638, "grad_norm": 1.7702946662902832, "learning_rate": 0.0003492377188029362, "loss": 3.6537, "step": 8010 }, { "epoch": 1.5095049877658573, "grad_norm": 1.8843390941619873, "learning_rate": 0.00034904950122341427, "loss": 3.9158, "step": 8020 }, { "epoch": 1.5113871635610767, "grad_norm": 2.0728650093078613, "learning_rate": 0.00034886128364389234, "loss": 3.711, "step": 8030 }, { "epoch": 1.513269339356296, "grad_norm": 2.3643085956573486, "learning_rate": 0.0003486730660643704, "loss": 3.7737, "step": 8040 }, { "epoch": 1.5151515151515151, "grad_norm": 2.087571859359741, "learning_rate": 0.0003484848484848485, "loss": 3.8039, "step": 8050 }, { "epoch": 1.5170336909467346, "grad_norm": 1.801803469657898, "learning_rate": 0.0003482966309053265, "loss": 3.6293, "step": 8060 }, { "epoch": 1.5189158667419536, "grad_norm": 2.0934271812438965, "learning_rate": 0.0003481084133258046, "loss": 3.7736, "step": 8070 }, { "epoch": 1.520798042537173, "grad_norm": 2.0371158123016357, "learning_rate": 0.00034792019574628267, "loss": 3.6974, "step": 8080 }, { "epoch": 1.5226802183323922, "grad_norm": 2.2584123611450195, "learning_rate": 0.0003477319781667608, "loss": 3.8715, "step": 8090 }, { "epoch": 1.5245623941276114, "grad_norm": 1.8013052940368652, "learning_rate": 0.0003475437605872389, "loss": 3.6496, "step": 8100 }, { "epoch": 1.5264445699228308, "grad_norm": 1.840074896812439, "learning_rate": 0.00034735554300771695, "loss": 3.6919, "step": 8110 }, { "epoch": 1.52832674571805, "grad_norm": 1.870167851448059, "learning_rate": 0.00034716732542819503, "loss": 3.6268, "step": 8120 }, { "epoch": 1.5302089215132693, "grad_norm": 2.217456340789795, "learning_rate": 0.0003469791078486731, "loss": 3.7207, "step": 8130 }, { "epoch": 1.5320910973084887, "grad_norm": 2.008578300476074, "learning_rate": 0.0003467908902691512, "loss": 3.6737, "step": 8140 }, { "epoch": 1.533973273103708, "grad_norm": 1.9132500886917114, "learning_rate": 0.0003466026726896292, "loss": 3.8286, "step": 8150 }, { "epoch": 1.5358554488989271, "grad_norm": 1.8686176538467407, "learning_rate": 0.0003464144551101073, "loss": 3.6473, "step": 8160 }, { "epoch": 1.5377376246941465, "grad_norm": 2.028219699859619, "learning_rate": 0.00034622623753058536, "loss": 3.9318, "step": 8170 }, { "epoch": 1.5396198004893658, "grad_norm": 2.1390788555145264, "learning_rate": 0.00034603801995106343, "loss": 3.6349, "step": 8180 }, { "epoch": 1.541501976284585, "grad_norm": 1.924244999885559, "learning_rate": 0.0003458498023715415, "loss": 3.9472, "step": 8190 }, { "epoch": 1.5433841520798044, "grad_norm": 1.8411937952041626, "learning_rate": 0.0003456615847920196, "loss": 3.8442, "step": 8200 }, { "epoch": 1.5452663278750234, "grad_norm": 1.9796812534332275, "learning_rate": 0.00034547336721249766, "loss": 3.5435, "step": 8210 }, { "epoch": 1.5471485036702428, "grad_norm": 2.0202064514160156, "learning_rate": 0.00034528514963297574, "loss": 3.6203, "step": 8220 }, { "epoch": 1.549030679465462, "grad_norm": 2.0582363605499268, "learning_rate": 0.0003450969320534538, "loss": 3.7731, "step": 8230 }, { "epoch": 1.5509128552606812, "grad_norm": 1.8645615577697754, "learning_rate": 0.00034490871447393184, "loss": 3.9406, "step": 8240 }, { "epoch": 1.5527950310559007, "grad_norm": 2.5893502235412598, "learning_rate": 0.0003447204968944099, "loss": 3.6305, "step": 8250 }, { "epoch": 1.5546772068511199, "grad_norm": 2.3030471801757812, "learning_rate": 0.000344532279314888, "loss": 3.9195, "step": 8260 }, { "epoch": 1.556559382646339, "grad_norm": 2.0002682209014893, "learning_rate": 0.00034434406173536607, "loss": 3.8249, "step": 8270 }, { "epoch": 1.5584415584415585, "grad_norm": 1.8045579195022583, "learning_rate": 0.00034415584415584414, "loss": 3.8484, "step": 8280 }, { "epoch": 1.5603237342367777, "grad_norm": 1.8935033082962036, "learning_rate": 0.0003439676265763222, "loss": 3.6696, "step": 8290 }, { "epoch": 1.562205910031997, "grad_norm": 1.8719751834869385, "learning_rate": 0.00034377940899680035, "loss": 3.7094, "step": 8300 }, { "epoch": 1.5640880858272164, "grad_norm": 2.444491147994995, "learning_rate": 0.00034359119141727843, "loss": 4.0846, "step": 8310 }, { "epoch": 1.5659702616224356, "grad_norm": 2.4288089275360107, "learning_rate": 0.00034340297383775645, "loss": 3.7514, "step": 8320 }, { "epoch": 1.5678524374176548, "grad_norm": 2.324831008911133, "learning_rate": 0.0003432147562582345, "loss": 3.6748, "step": 8330 }, { "epoch": 1.5697346132128742, "grad_norm": 2.605628728866577, "learning_rate": 0.0003430265386787126, "loss": 3.9288, "step": 8340 }, { "epoch": 1.5716167890080932, "grad_norm": 1.9950097799301147, "learning_rate": 0.0003428383210991907, "loss": 3.5434, "step": 8350 }, { "epoch": 1.5734989648033126, "grad_norm": 2.8846535682678223, "learning_rate": 0.00034265010351966876, "loss": 3.7692, "step": 8360 }, { "epoch": 1.5753811405985318, "grad_norm": 1.8857200145721436, "learning_rate": 0.00034246188594014683, "loss": 3.5541, "step": 8370 }, { "epoch": 1.577263316393751, "grad_norm": 1.9304348230361938, "learning_rate": 0.0003422736683606249, "loss": 3.7289, "step": 8380 }, { "epoch": 1.5791454921889705, "grad_norm": 2.28515887260437, "learning_rate": 0.000342085450781103, "loss": 3.7085, "step": 8390 }, { "epoch": 1.5810276679841897, "grad_norm": 2.3015100955963135, "learning_rate": 0.00034189723320158106, "loss": 3.5947, "step": 8400 }, { "epoch": 1.582909843779409, "grad_norm": 2.521632432937622, "learning_rate": 0.0003417090156220591, "loss": 3.8023, "step": 8410 }, { "epoch": 1.5847920195746283, "grad_norm": 2.1546032428741455, "learning_rate": 0.00034152079804253716, "loss": 3.9682, "step": 8420 }, { "epoch": 1.5866741953698476, "grad_norm": 2.0820095539093018, "learning_rate": 0.00034133258046301524, "loss": 3.9084, "step": 8430 }, { "epoch": 1.5885563711650668, "grad_norm": 1.989452838897705, "learning_rate": 0.0003411443628834933, "loss": 3.793, "step": 8440 }, { "epoch": 1.5904385469602862, "grad_norm": 1.795833706855774, "learning_rate": 0.0003409561453039714, "loss": 3.7056, "step": 8450 }, { "epoch": 1.5923207227555054, "grad_norm": 2.0363857746124268, "learning_rate": 0.00034076792772444947, "loss": 3.795, "step": 8460 }, { "epoch": 1.5942028985507246, "grad_norm": 1.7800500392913818, "learning_rate": 0.00034057971014492754, "loss": 3.7061, "step": 8470 }, { "epoch": 1.596085074345944, "grad_norm": 2.08921480178833, "learning_rate": 0.0003403914925654056, "loss": 3.5798, "step": 8480 }, { "epoch": 1.597967250141163, "grad_norm": 1.9512754678726196, "learning_rate": 0.0003402032749858837, "loss": 3.8301, "step": 8490 }, { "epoch": 1.5998494259363825, "grad_norm": 2.3853600025177, "learning_rate": 0.0003400150574063617, "loss": 3.9632, "step": 8500 }, { "epoch": 1.601731601731602, "grad_norm": 2.057528018951416, "learning_rate": 0.0003398268398268398, "loss": 3.6631, "step": 8510 }, { "epoch": 1.6036137775268209, "grad_norm": 1.8551961183547974, "learning_rate": 0.00033963862224731787, "loss": 3.6559, "step": 8520 }, { "epoch": 1.6054959533220403, "grad_norm": 1.7113333940505981, "learning_rate": 0.000339450404667796, "loss": 3.5879, "step": 8530 }, { "epoch": 1.6073781291172595, "grad_norm": 1.894375205039978, "learning_rate": 0.0003392621870882741, "loss": 3.9233, "step": 8540 }, { "epoch": 1.6092603049124787, "grad_norm": 1.7292157411575317, "learning_rate": 0.00033907396950875215, "loss": 3.6998, "step": 8550 }, { "epoch": 1.6111424807076982, "grad_norm": 2.0833964347839355, "learning_rate": 0.00033888575192923023, "loss": 3.7333, "step": 8560 }, { "epoch": 1.6130246565029174, "grad_norm": 2.16780686378479, "learning_rate": 0.0003386975343497083, "loss": 3.858, "step": 8570 }, { "epoch": 1.6149068322981366, "grad_norm": 2.318399429321289, "learning_rate": 0.00033850931677018633, "loss": 3.6575, "step": 8580 }, { "epoch": 1.616789008093356, "grad_norm": 2.1125340461730957, "learning_rate": 0.0003383210991906644, "loss": 3.4829, "step": 8590 }, { "epoch": 1.6186711838885752, "grad_norm": 2.2121551036834717, "learning_rate": 0.0003381328816111425, "loss": 3.7985, "step": 8600 }, { "epoch": 1.6205533596837944, "grad_norm": 2.090364933013916, "learning_rate": 0.00033794466403162056, "loss": 3.3342, "step": 8610 }, { "epoch": 1.6224355354790139, "grad_norm": 1.9245707988739014, "learning_rate": 0.00033775644645209863, "loss": 3.7779, "step": 8620 }, { "epoch": 1.6243177112742329, "grad_norm": 1.9286541938781738, "learning_rate": 0.0003375682288725767, "loss": 3.7115, "step": 8630 }, { "epoch": 1.6261998870694523, "grad_norm": 2.4994375705718994, "learning_rate": 0.0003373800112930548, "loss": 3.9738, "step": 8640 }, { "epoch": 1.6280820628646717, "grad_norm": 2.1817786693573, "learning_rate": 0.00033719179371353286, "loss": 3.7679, "step": 8650 }, { "epoch": 1.6299642386598907, "grad_norm": 2.399630308151245, "learning_rate": 0.00033700357613401094, "loss": 3.7907, "step": 8660 }, { "epoch": 1.6318464144551101, "grad_norm": 1.7689156532287598, "learning_rate": 0.00033681535855448896, "loss": 3.7116, "step": 8670 }, { "epoch": 1.6337285902503293, "grad_norm": 2.208608627319336, "learning_rate": 0.00033662714097496704, "loss": 3.9298, "step": 8680 }, { "epoch": 1.6356107660455486, "grad_norm": 2.1811182498931885, "learning_rate": 0.0003364389233954451, "loss": 3.6754, "step": 8690 }, { "epoch": 1.637492941840768, "grad_norm": 2.085291862487793, "learning_rate": 0.0003362507058159232, "loss": 3.7016, "step": 8700 }, { "epoch": 1.6393751176359872, "grad_norm": 1.8821417093276978, "learning_rate": 0.00033606248823640127, "loss": 3.7248, "step": 8710 }, { "epoch": 1.6412572934312064, "grad_norm": 1.8898390531539917, "learning_rate": 0.00033587427065687934, "loss": 3.6938, "step": 8720 }, { "epoch": 1.6431394692264258, "grad_norm": 1.7819640636444092, "learning_rate": 0.0003356860530773574, "loss": 3.5247, "step": 8730 }, { "epoch": 1.645021645021645, "grad_norm": 1.8509273529052734, "learning_rate": 0.0003354978354978355, "loss": 3.6139, "step": 8740 }, { "epoch": 1.6469038208168643, "grad_norm": 2.061574935913086, "learning_rate": 0.00033530961791831363, "loss": 3.803, "step": 8750 }, { "epoch": 1.6487859966120837, "grad_norm": 1.8840912580490112, "learning_rate": 0.00033512140033879165, "loss": 3.5845, "step": 8760 }, { "epoch": 1.6506681724073027, "grad_norm": 2.0159287452697754, "learning_rate": 0.0003349331827592697, "loss": 3.9504, "step": 8770 }, { "epoch": 1.6525503482025221, "grad_norm": 1.6903138160705566, "learning_rate": 0.0003347449651797478, "loss": 3.6987, "step": 8780 }, { "epoch": 1.6544325239977415, "grad_norm": 2.134758234024048, "learning_rate": 0.0003345567476002259, "loss": 3.7158, "step": 8790 }, { "epoch": 1.6563146997929605, "grad_norm": 2.000469923019409, "learning_rate": 0.00033436853002070396, "loss": 3.7533, "step": 8800 }, { "epoch": 1.65819687558818, "grad_norm": 1.822241187095642, "learning_rate": 0.00033418031244118203, "loss": 3.7504, "step": 8810 }, { "epoch": 1.6600790513833992, "grad_norm": 1.8779278993606567, "learning_rate": 0.0003339920948616601, "loss": 3.6283, "step": 8820 }, { "epoch": 1.6619612271786184, "grad_norm": 1.9598212242126465, "learning_rate": 0.0003338038772821382, "loss": 3.7713, "step": 8830 }, { "epoch": 1.6638434029738378, "grad_norm": 1.7098469734191895, "learning_rate": 0.00033361565970261626, "loss": 3.5398, "step": 8840 }, { "epoch": 1.665725578769057, "grad_norm": 2.3827857971191406, "learning_rate": 0.0003334274421230943, "loss": 3.861, "step": 8850 }, { "epoch": 1.6676077545642762, "grad_norm": 1.9005036354064941, "learning_rate": 0.00033323922454357236, "loss": 3.9028, "step": 8860 }, { "epoch": 1.6694899303594957, "grad_norm": 2.504202127456665, "learning_rate": 0.00033305100696405044, "loss": 3.8158, "step": 8870 }, { "epoch": 1.6713721061547149, "grad_norm": 2.1613893508911133, "learning_rate": 0.0003328627893845285, "loss": 3.8541, "step": 8880 }, { "epoch": 1.673254281949934, "grad_norm": 2.010159492492676, "learning_rate": 0.0003326745718050066, "loss": 3.8192, "step": 8890 }, { "epoch": 1.6751364577451535, "grad_norm": 2.078258752822876, "learning_rate": 0.00033248635422548467, "loss": 3.6468, "step": 8900 }, { "epoch": 1.6770186335403725, "grad_norm": 1.9027917385101318, "learning_rate": 0.00033229813664596274, "loss": 3.918, "step": 8910 }, { "epoch": 1.678900809335592, "grad_norm": 2.3127923011779785, "learning_rate": 0.0003321099190664408, "loss": 3.5837, "step": 8920 }, { "epoch": 1.6807829851308114, "grad_norm": 1.974990963935852, "learning_rate": 0.00033192170148691884, "loss": 3.5565, "step": 8930 }, { "epoch": 1.6826651609260304, "grad_norm": 2.2739603519439697, "learning_rate": 0.0003317334839073969, "loss": 3.7445, "step": 8940 }, { "epoch": 1.6845473367212498, "grad_norm": 2.020967483520508, "learning_rate": 0.000331545266327875, "loss": 3.5698, "step": 8950 }, { "epoch": 1.686429512516469, "grad_norm": 1.9386134147644043, "learning_rate": 0.00033135704874835307, "loss": 3.5284, "step": 8960 }, { "epoch": 1.6883116883116882, "grad_norm": 1.9373177289962769, "learning_rate": 0.0003311688311688312, "loss": 3.4506, "step": 8970 }, { "epoch": 1.6901938641069076, "grad_norm": 1.9539148807525635, "learning_rate": 0.0003309806135893093, "loss": 3.8893, "step": 8980 }, { "epoch": 1.6920760399021268, "grad_norm": 2.09279203414917, "learning_rate": 0.00033079239600978735, "loss": 3.6947, "step": 8990 }, { "epoch": 1.693958215697346, "grad_norm": 1.8553531169891357, "learning_rate": 0.00033060417843026543, "loss": 3.6849, "step": 9000 }, { "epoch": 1.6958403914925655, "grad_norm": 1.9943382740020752, "learning_rate": 0.0003304159608507435, "loss": 3.862, "step": 9010 }, { "epoch": 1.6977225672877847, "grad_norm": 1.917205810546875, "learning_rate": 0.00033022774327122153, "loss": 3.6617, "step": 9020 }, { "epoch": 1.699604743083004, "grad_norm": 1.9813207387924194, "learning_rate": 0.0003300395256916996, "loss": 3.9757, "step": 9030 }, { "epoch": 1.7014869188782233, "grad_norm": 1.8657891750335693, "learning_rate": 0.0003298513081121777, "loss": 3.6165, "step": 9040 }, { "epoch": 1.7033690946734426, "grad_norm": 1.8018442392349243, "learning_rate": 0.00032966309053265576, "loss": 3.9178, "step": 9050 }, { "epoch": 1.7052512704686618, "grad_norm": 2.157297372817993, "learning_rate": 0.00032947487295313383, "loss": 3.5394, "step": 9060 }, { "epoch": 1.7071334462638812, "grad_norm": 2.088235855102539, "learning_rate": 0.0003292866553736119, "loss": 3.869, "step": 9070 }, { "epoch": 1.7090156220591002, "grad_norm": 2.250558614730835, "learning_rate": 0.00032909843779409, "loss": 3.6271, "step": 9080 }, { "epoch": 1.7108977978543196, "grad_norm": 1.9630236625671387, "learning_rate": 0.00032891022021456806, "loss": 3.6673, "step": 9090 }, { "epoch": 1.7127799736495388, "grad_norm": 2.3485467433929443, "learning_rate": 0.00032872200263504614, "loss": 3.6769, "step": 9100 }, { "epoch": 1.714662149444758, "grad_norm": 2.215182304382324, "learning_rate": 0.00032853378505552416, "loss": 3.5654, "step": 9110 }, { "epoch": 1.7165443252399775, "grad_norm": 2.0479445457458496, "learning_rate": 0.00032834556747600224, "loss": 3.763, "step": 9120 }, { "epoch": 1.7184265010351967, "grad_norm": 11.284534454345703, "learning_rate": 0.0003281573498964803, "loss": 3.5577, "step": 9130 }, { "epoch": 1.7203086768304159, "grad_norm": 2.0643694400787354, "learning_rate": 0.0003279691323169584, "loss": 3.6663, "step": 9140 }, { "epoch": 1.7221908526256353, "grad_norm": 1.885436773300171, "learning_rate": 0.00032778091473743647, "loss": 3.7123, "step": 9150 }, { "epoch": 1.7240730284208545, "grad_norm": 1.779462456703186, "learning_rate": 0.00032759269715791454, "loss": 3.7451, "step": 9160 }, { "epoch": 1.7259552042160737, "grad_norm": 1.9323447942733765, "learning_rate": 0.0003274044795783926, "loss": 3.3766, "step": 9170 }, { "epoch": 1.7278373800112932, "grad_norm": 1.9485670328140259, "learning_rate": 0.0003272162619988707, "loss": 3.5259, "step": 9180 }, { "epoch": 1.7297195558065124, "grad_norm": 2.178581476211548, "learning_rate": 0.0003270280444193488, "loss": 3.4961, "step": 9190 }, { "epoch": 1.7316017316017316, "grad_norm": 2.1433050632476807, "learning_rate": 0.00032683982683982685, "loss": 3.5747, "step": 9200 }, { "epoch": 1.733483907396951, "grad_norm": 2.328223943710327, "learning_rate": 0.0003266516092603049, "loss": 4.0973, "step": 9210 }, { "epoch": 1.73536608319217, "grad_norm": 1.82240629196167, "learning_rate": 0.000326463391680783, "loss": 3.636, "step": 9220 }, { "epoch": 1.7372482589873894, "grad_norm": 1.9444760084152222, "learning_rate": 0.0003262751741012611, "loss": 3.8834, "step": 9230 }, { "epoch": 1.7391304347826086, "grad_norm": 2.2533152103424072, "learning_rate": 0.00032608695652173916, "loss": 3.5912, "step": 9240 }, { "epoch": 1.7410126105778279, "grad_norm": 1.868534803390503, "learning_rate": 0.00032589873894221723, "loss": 3.7303, "step": 9250 }, { "epoch": 1.7428947863730473, "grad_norm": 2.066074848175049, "learning_rate": 0.0003257105213626953, "loss": 3.6382, "step": 9260 }, { "epoch": 1.7447769621682665, "grad_norm": 2.045961856842041, "learning_rate": 0.0003255223037831734, "loss": 3.7104, "step": 9270 }, { "epoch": 1.7466591379634857, "grad_norm": 1.985349178314209, "learning_rate": 0.0003253340862036514, "loss": 3.9342, "step": 9280 }, { "epoch": 1.7485413137587051, "grad_norm": 2.5832815170288086, "learning_rate": 0.0003251458686241295, "loss": 3.5175, "step": 9290 }, { "epoch": 1.7504234895539243, "grad_norm": 2.066349744796753, "learning_rate": 0.00032495765104460756, "loss": 3.8287, "step": 9300 }, { "epoch": 1.7523056653491436, "grad_norm": 2.0592126846313477, "learning_rate": 0.00032476943346508564, "loss": 3.5318, "step": 9310 }, { "epoch": 1.754187841144363, "grad_norm": 2.074842929840088, "learning_rate": 0.0003245812158855637, "loss": 3.6565, "step": 9320 }, { "epoch": 1.7560700169395822, "grad_norm": 2.7729220390319824, "learning_rate": 0.0003243929983060418, "loss": 3.6626, "step": 9330 }, { "epoch": 1.7579521927348014, "grad_norm": 2.1782331466674805, "learning_rate": 0.00032420478072651987, "loss": 3.6741, "step": 9340 }, { "epoch": 1.7598343685300208, "grad_norm": 2.2513933181762695, "learning_rate": 0.00032401656314699794, "loss": 3.732, "step": 9350 }, { "epoch": 1.7617165443252398, "grad_norm": 1.8551970720291138, "learning_rate": 0.000323828345567476, "loss": 3.7461, "step": 9360 }, { "epoch": 1.7635987201204593, "grad_norm": 2.0710809230804443, "learning_rate": 0.00032364012798795404, "loss": 3.5542, "step": 9370 }, { "epoch": 1.7654808959156785, "grad_norm": 2.2162725925445557, "learning_rate": 0.0003234519104084321, "loss": 3.6126, "step": 9380 }, { "epoch": 1.7673630717108977, "grad_norm": 2.5064737796783447, "learning_rate": 0.0003232636928289102, "loss": 3.5244, "step": 9390 }, { "epoch": 1.7692452475061171, "grad_norm": 2.1575369834899902, "learning_rate": 0.00032307547524938827, "loss": 3.9985, "step": 9400 }, { "epoch": 1.7711274233013363, "grad_norm": 2.06486439704895, "learning_rate": 0.0003228872576698664, "loss": 3.7231, "step": 9410 }, { "epoch": 1.7730095990965555, "grad_norm": 2.050130605697632, "learning_rate": 0.0003226990400903445, "loss": 3.5453, "step": 9420 }, { "epoch": 1.774891774891775, "grad_norm": 2.550203323364258, "learning_rate": 0.00032251082251082255, "loss": 3.7155, "step": 9430 }, { "epoch": 1.7767739506869942, "grad_norm": 2.0353000164031982, "learning_rate": 0.00032232260493130063, "loss": 3.4982, "step": 9440 }, { "epoch": 1.7786561264822134, "grad_norm": 2.8174867630004883, "learning_rate": 0.0003221343873517787, "loss": 3.7628, "step": 9450 }, { "epoch": 1.7805383022774328, "grad_norm": 1.9355627298355103, "learning_rate": 0.00032194616977225673, "loss": 3.4049, "step": 9460 }, { "epoch": 1.782420478072652, "grad_norm": 2.0293326377868652, "learning_rate": 0.0003217579521927348, "loss": 3.6546, "step": 9470 }, { "epoch": 1.7843026538678712, "grad_norm": 2.5834407806396484, "learning_rate": 0.0003215697346132129, "loss": 3.6633, "step": 9480 }, { "epoch": 1.7861848296630907, "grad_norm": 2.149320602416992, "learning_rate": 0.00032138151703369096, "loss": 3.9751, "step": 9490 }, { "epoch": 1.7880670054583097, "grad_norm": 2.027061700820923, "learning_rate": 0.00032119329945416903, "loss": 3.6954, "step": 9500 }, { "epoch": 1.789949181253529, "grad_norm": 2.0868654251098633, "learning_rate": 0.0003210050818746471, "loss": 3.8539, "step": 9510 }, { "epoch": 1.7918313570487485, "grad_norm": 1.9372637271881104, "learning_rate": 0.0003208168642951252, "loss": 3.6822, "step": 9520 }, { "epoch": 1.7937135328439675, "grad_norm": 2.4747002124786377, "learning_rate": 0.00032062864671560326, "loss": 3.6153, "step": 9530 }, { "epoch": 1.795595708639187, "grad_norm": 2.240008592605591, "learning_rate": 0.0003204404291360813, "loss": 3.5098, "step": 9540 }, { "epoch": 1.7974778844344061, "grad_norm": 1.9073539972305298, "learning_rate": 0.00032025221155655936, "loss": 3.6324, "step": 9550 }, { "epoch": 1.7993600602296254, "grad_norm": 1.914000391960144, "learning_rate": 0.00032006399397703744, "loss": 3.8111, "step": 9560 }, { "epoch": 1.8012422360248448, "grad_norm": 2.694849729537964, "learning_rate": 0.0003198757763975155, "loss": 4.0209, "step": 9570 }, { "epoch": 1.803124411820064, "grad_norm": 1.8979138135910034, "learning_rate": 0.0003196875588179936, "loss": 3.7862, "step": 9580 }, { "epoch": 1.8050065876152832, "grad_norm": 2.3798153400421143, "learning_rate": 0.00031949934123847167, "loss": 3.7394, "step": 9590 }, { "epoch": 1.8068887634105026, "grad_norm": 2.3953449726104736, "learning_rate": 0.00031931112365894974, "loss": 3.6356, "step": 9600 }, { "epoch": 1.8087709392057219, "grad_norm": 1.94780695438385, "learning_rate": 0.0003191229060794278, "loss": 3.5705, "step": 9610 }, { "epoch": 1.810653115000941, "grad_norm": 2.1115198135375977, "learning_rate": 0.0003189346884999059, "loss": 3.6439, "step": 9620 }, { "epoch": 1.8125352907961605, "grad_norm": 2.078925848007202, "learning_rate": 0.0003187464709203839, "loss": 3.6912, "step": 9630 }, { "epoch": 1.8144174665913795, "grad_norm": 1.6534162759780884, "learning_rate": 0.00031855825334086205, "loss": 3.682, "step": 9640 }, { "epoch": 1.816299642386599, "grad_norm": 2.398630142211914, "learning_rate": 0.0003183700357613401, "loss": 3.6939, "step": 9650 }, { "epoch": 1.8181818181818183, "grad_norm": 2.451552152633667, "learning_rate": 0.0003181818181818182, "loss": 3.5832, "step": 9660 }, { "epoch": 1.8200639939770373, "grad_norm": 1.9941877126693726, "learning_rate": 0.0003179936006022963, "loss": 3.6506, "step": 9670 }, { "epoch": 1.8219461697722568, "grad_norm": 1.82035493850708, "learning_rate": 0.00031780538302277436, "loss": 3.5554, "step": 9680 }, { "epoch": 1.823828345567476, "grad_norm": 1.8042329549789429, "learning_rate": 0.00031761716544325243, "loss": 3.5753, "step": 9690 }, { "epoch": 1.8257105213626952, "grad_norm": 2.1336681842803955, "learning_rate": 0.0003174289478637305, "loss": 3.911, "step": 9700 }, { "epoch": 1.8275926971579146, "grad_norm": 2.4392285346984863, "learning_rate": 0.0003172407302842086, "loss": 3.8885, "step": 9710 }, { "epoch": 1.8294748729531338, "grad_norm": 2.26499080657959, "learning_rate": 0.0003170525127046866, "loss": 3.4365, "step": 9720 }, { "epoch": 1.831357048748353, "grad_norm": 2.173511266708374, "learning_rate": 0.0003168642951251647, "loss": 3.5159, "step": 9730 }, { "epoch": 1.8332392245435725, "grad_norm": 1.923460841178894, "learning_rate": 0.00031667607754564276, "loss": 4.0331, "step": 9740 }, { "epoch": 1.8351214003387917, "grad_norm": 1.9043619632720947, "learning_rate": 0.00031648785996612084, "loss": 3.5768, "step": 9750 }, { "epoch": 1.8370035761340109, "grad_norm": 2.6198337078094482, "learning_rate": 0.0003162996423865989, "loss": 3.5863, "step": 9760 }, { "epoch": 1.8388857519292303, "grad_norm": 2.625149965286255, "learning_rate": 0.000316111424807077, "loss": 3.5364, "step": 9770 }, { "epoch": 1.8407679277244493, "grad_norm": 2.2209835052490234, "learning_rate": 0.00031592320722755507, "loss": 3.6741, "step": 9780 }, { "epoch": 1.8426501035196687, "grad_norm": 2.7067711353302, "learning_rate": 0.00031573498964803314, "loss": 3.5254, "step": 9790 }, { "epoch": 1.8445322793148882, "grad_norm": 2.327434539794922, "learning_rate": 0.00031554677206851116, "loss": 3.665, "step": 9800 }, { "epoch": 1.8464144551101072, "grad_norm": 2.767713785171509, "learning_rate": 0.00031535855448898924, "loss": 3.5376, "step": 9810 }, { "epoch": 1.8482966309053266, "grad_norm": 2.0066473484039307, "learning_rate": 0.0003151703369094673, "loss": 3.4858, "step": 9820 }, { "epoch": 1.8501788067005458, "grad_norm": 1.8936408758163452, "learning_rate": 0.0003149821193299454, "loss": 3.4119, "step": 9830 }, { "epoch": 1.852060982495765, "grad_norm": 2.1529366970062256, "learning_rate": 0.00031479390175042347, "loss": 3.5605, "step": 9840 }, { "epoch": 1.8539431582909844, "grad_norm": 2.163797378540039, "learning_rate": 0.00031460568417090155, "loss": 3.755, "step": 9850 }, { "epoch": 1.8558253340862036, "grad_norm": 2.1908974647521973, "learning_rate": 0.0003144174665913797, "loss": 3.4836, "step": 9860 }, { "epoch": 1.8577075098814229, "grad_norm": 2.2800731658935547, "learning_rate": 0.00031422924901185775, "loss": 3.5026, "step": 9870 }, { "epoch": 1.8595896856766423, "grad_norm": 2.2017064094543457, "learning_rate": 0.00031404103143233583, "loss": 3.3399, "step": 9880 }, { "epoch": 1.8614718614718615, "grad_norm": 2.403064489364624, "learning_rate": 0.00031385281385281385, "loss": 3.7772, "step": 9890 }, { "epoch": 1.8633540372670807, "grad_norm": 1.948598861694336, "learning_rate": 0.00031366459627329193, "loss": 3.6041, "step": 9900 }, { "epoch": 1.8652362130623001, "grad_norm": 2.0806872844696045, "learning_rate": 0.00031347637869377, "loss": 3.8162, "step": 9910 }, { "epoch": 1.8671183888575191, "grad_norm": 2.627696990966797, "learning_rate": 0.0003132881611142481, "loss": 3.6738, "step": 9920 }, { "epoch": 1.8690005646527386, "grad_norm": 2.0168612003326416, "learning_rate": 0.00031309994353472616, "loss": 3.6254, "step": 9930 }, { "epoch": 1.870882740447958, "grad_norm": 2.210606575012207, "learning_rate": 0.00031291172595520423, "loss": 3.8663, "step": 9940 }, { "epoch": 1.872764916243177, "grad_norm": 1.9852879047393799, "learning_rate": 0.0003127235083756823, "loss": 3.6297, "step": 9950 }, { "epoch": 1.8746470920383964, "grad_norm": 2.120920419692993, "learning_rate": 0.0003125352907961604, "loss": 3.9663, "step": 9960 }, { "epoch": 1.8765292678336156, "grad_norm": 2.316328763961792, "learning_rate": 0.00031234707321663846, "loss": 3.9639, "step": 9970 }, { "epoch": 1.8784114436288348, "grad_norm": 1.8062055110931396, "learning_rate": 0.0003121588556371165, "loss": 3.8023, "step": 9980 }, { "epoch": 1.8802936194240543, "grad_norm": 1.8895719051361084, "learning_rate": 0.00031197063805759456, "loss": 3.6085, "step": 9990 }, { "epoch": 1.8821757952192735, "grad_norm": 1.95108962059021, "learning_rate": 0.00031178242047807264, "loss": 3.8635, "step": 10000 }, { "epoch": 1.8840579710144927, "grad_norm": 1.824989676475525, "learning_rate": 0.0003115942028985507, "loss": 3.5738, "step": 10010 }, { "epoch": 1.8859401468097121, "grad_norm": 1.949899673461914, "learning_rate": 0.0003114059853190288, "loss": 3.4552, "step": 10020 }, { "epoch": 1.8878223226049313, "grad_norm": 2.350801944732666, "learning_rate": 0.00031121776773950687, "loss": 3.6764, "step": 10030 }, { "epoch": 1.8897044984001505, "grad_norm": 1.800048589706421, "learning_rate": 0.00031102955015998494, "loss": 3.6032, "step": 10040 }, { "epoch": 1.89158667419537, "grad_norm": 2.1716933250427246, "learning_rate": 0.000310841332580463, "loss": 3.5876, "step": 10050 }, { "epoch": 1.8934688499905892, "grad_norm": 2.26385760307312, "learning_rate": 0.00031065311500094104, "loss": 3.6223, "step": 10060 }, { "epoch": 1.8953510257858084, "grad_norm": 1.9308747053146362, "learning_rate": 0.0003104648974214191, "loss": 3.5269, "step": 10070 }, { "epoch": 1.8972332015810278, "grad_norm": 2.064584970474243, "learning_rate": 0.00031027667984189725, "loss": 3.7674, "step": 10080 }, { "epoch": 1.8991153773762468, "grad_norm": 2.29477858543396, "learning_rate": 0.0003100884622623753, "loss": 3.4748, "step": 10090 }, { "epoch": 1.9009975531714662, "grad_norm": 2.1337435245513916, "learning_rate": 0.0003099002446828534, "loss": 3.8851, "step": 10100 }, { "epoch": 1.9028797289666854, "grad_norm": 2.0595479011535645, "learning_rate": 0.0003097120271033315, "loss": 3.5353, "step": 10110 }, { "epoch": 1.9047619047619047, "grad_norm": 2.213545560836792, "learning_rate": 0.00030952380952380956, "loss": 3.5947, "step": 10120 }, { "epoch": 1.906644080557124, "grad_norm": 2.010716438293457, "learning_rate": 0.00030933559194428763, "loss": 3.3873, "step": 10130 }, { "epoch": 1.9085262563523433, "grad_norm": 2.145021915435791, "learning_rate": 0.0003091473743647657, "loss": 3.6004, "step": 10140 }, { "epoch": 1.9104084321475625, "grad_norm": 2.155932903289795, "learning_rate": 0.00030895915678524373, "loss": 3.8732, "step": 10150 }, { "epoch": 1.912290607942782, "grad_norm": 2.0870883464813232, "learning_rate": 0.0003087709392057218, "loss": 3.7121, "step": 10160 }, { "epoch": 1.9141727837380011, "grad_norm": 1.889845371246338, "learning_rate": 0.0003085827216261999, "loss": 3.7184, "step": 10170 }, { "epoch": 1.9160549595332204, "grad_norm": 2.0182039737701416, "learning_rate": 0.00030839450404667796, "loss": 3.7952, "step": 10180 }, { "epoch": 1.9179371353284398, "grad_norm": 1.9103964567184448, "learning_rate": 0.00030820628646715604, "loss": 3.546, "step": 10190 }, { "epoch": 1.919819311123659, "grad_norm": 1.934048056602478, "learning_rate": 0.0003080180688876341, "loss": 3.4859, "step": 10200 }, { "epoch": 1.9217014869188782, "grad_norm": 2.16024112701416, "learning_rate": 0.0003078298513081122, "loss": 3.3423, "step": 10210 }, { "epoch": 1.9235836627140976, "grad_norm": 1.946714997291565, "learning_rate": 0.00030764163372859027, "loss": 3.6281, "step": 10220 }, { "epoch": 1.9254658385093166, "grad_norm": 2.031848907470703, "learning_rate": 0.00030745341614906834, "loss": 3.6591, "step": 10230 }, { "epoch": 1.927348014304536, "grad_norm": 2.2257654666900635, "learning_rate": 0.00030726519856954636, "loss": 3.6939, "step": 10240 }, { "epoch": 1.9292301900997553, "grad_norm": 2.163264274597168, "learning_rate": 0.00030707698099002444, "loss": 3.6587, "step": 10250 }, { "epoch": 1.9311123658949745, "grad_norm": 2.368788719177246, "learning_rate": 0.0003068887634105025, "loss": 3.774, "step": 10260 }, { "epoch": 1.932994541690194, "grad_norm": 1.9467861652374268, "learning_rate": 0.0003067005458309806, "loss": 3.6306, "step": 10270 }, { "epoch": 1.9348767174854131, "grad_norm": 1.9181021451950073, "learning_rate": 0.00030651232825145867, "loss": 3.6298, "step": 10280 }, { "epoch": 1.9367588932806323, "grad_norm": 1.849297285079956, "learning_rate": 0.00030632411067193675, "loss": 3.641, "step": 10290 }, { "epoch": 1.9386410690758518, "grad_norm": 1.93975830078125, "learning_rate": 0.0003061358930924149, "loss": 3.6084, "step": 10300 }, { "epoch": 1.940523244871071, "grad_norm": 1.8382394313812256, "learning_rate": 0.00030594767551289295, "loss": 3.4179, "step": 10310 }, { "epoch": 1.9424054206662902, "grad_norm": 2.686518907546997, "learning_rate": 0.00030575945793337103, "loss": 3.7513, "step": 10320 }, { "epoch": 1.9442875964615096, "grad_norm": 2.0112316608428955, "learning_rate": 0.00030557124035384905, "loss": 3.7188, "step": 10330 }, { "epoch": 1.9461697722567288, "grad_norm": 1.9542816877365112, "learning_rate": 0.00030538302277432713, "loss": 3.5804, "step": 10340 }, { "epoch": 1.948051948051948, "grad_norm": 1.8192522525787354, "learning_rate": 0.0003051948051948052, "loss": 3.4211, "step": 10350 }, { "epoch": 1.9499341238471675, "grad_norm": 2.1841719150543213, "learning_rate": 0.0003050065876152833, "loss": 3.5976, "step": 10360 }, { "epoch": 1.9518162996423865, "grad_norm": 2.133845090866089, "learning_rate": 0.00030481837003576136, "loss": 3.5101, "step": 10370 }, { "epoch": 1.9536984754376059, "grad_norm": 2.269397497177124, "learning_rate": 0.00030463015245623943, "loss": 3.7982, "step": 10380 }, { "epoch": 1.955580651232825, "grad_norm": 2.094409704208374, "learning_rate": 0.0003044419348767175, "loss": 3.4589, "step": 10390 }, { "epoch": 1.9574628270280443, "grad_norm": 2.1520373821258545, "learning_rate": 0.0003042537172971956, "loss": 3.6693, "step": 10400 }, { "epoch": 1.9593450028232637, "grad_norm": 1.810929298400879, "learning_rate": 0.0003040654997176736, "loss": 3.8383, "step": 10410 }, { "epoch": 1.961227178618483, "grad_norm": 2.058192491531372, "learning_rate": 0.0003038772821381517, "loss": 3.5365, "step": 10420 }, { "epoch": 1.9631093544137022, "grad_norm": 2.311635732650757, "learning_rate": 0.00030368906455862976, "loss": 3.7174, "step": 10430 }, { "epoch": 1.9649915302089216, "grad_norm": 2.0313923358917236, "learning_rate": 0.00030350084697910784, "loss": 3.4804, "step": 10440 }, { "epoch": 1.9668737060041408, "grad_norm": 2.259661912918091, "learning_rate": 0.0003033126293995859, "loss": 3.8336, "step": 10450 }, { "epoch": 1.96875588179936, "grad_norm": 2.002938985824585, "learning_rate": 0.000303124411820064, "loss": 3.6662, "step": 10460 }, { "epoch": 1.9706380575945794, "grad_norm": 2.394352912902832, "learning_rate": 0.00030293619424054207, "loss": 3.6835, "step": 10470 }, { "epoch": 1.9725202333897986, "grad_norm": 2.634260892868042, "learning_rate": 0.00030274797666102014, "loss": 3.4938, "step": 10480 }, { "epoch": 1.9744024091850179, "grad_norm": 1.8893793821334839, "learning_rate": 0.0003025597590814982, "loss": 3.6379, "step": 10490 }, { "epoch": 1.9762845849802373, "grad_norm": 2.149104356765747, "learning_rate": 0.00030237154150197624, "loss": 4.076, "step": 10500 }, { "epoch": 1.9781667607754563, "grad_norm": 1.9862107038497925, "learning_rate": 0.0003021833239224543, "loss": 3.7925, "step": 10510 }, { "epoch": 1.9800489365706757, "grad_norm": 2.4172115325927734, "learning_rate": 0.00030199510634293245, "loss": 3.4149, "step": 10520 }, { "epoch": 1.981931112365895, "grad_norm": 2.012948751449585, "learning_rate": 0.0003018068887634105, "loss": 3.7194, "step": 10530 }, { "epoch": 1.9838132881611141, "grad_norm": 2.0994248390197754, "learning_rate": 0.0003016186711838886, "loss": 3.7299, "step": 10540 }, { "epoch": 1.9856954639563336, "grad_norm": 1.8276288509368896, "learning_rate": 0.0003014304536043667, "loss": 3.5345, "step": 10550 }, { "epoch": 1.9875776397515528, "grad_norm": 2.070876359939575, "learning_rate": 0.00030124223602484476, "loss": 3.8717, "step": 10560 }, { "epoch": 1.989459815546772, "grad_norm": 2.233039140701294, "learning_rate": 0.00030105401844532283, "loss": 3.4603, "step": 10570 }, { "epoch": 1.9913419913419914, "grad_norm": 2.3521862030029297, "learning_rate": 0.0003008658008658009, "loss": 3.6113, "step": 10580 }, { "epoch": 1.9932241671372106, "grad_norm": 3.8352150917053223, "learning_rate": 0.00030067758328627893, "loss": 3.8694, "step": 10590 }, { "epoch": 1.9951063429324298, "grad_norm": 2.0548229217529297, "learning_rate": 0.000300489365706757, "loss": 3.4568, "step": 10600 }, { "epoch": 1.9969885187276493, "grad_norm": 1.9041173458099365, "learning_rate": 0.0003003011481272351, "loss": 3.569, "step": 10610 }, { "epoch": 1.9988706945228685, "grad_norm": 2.2218592166900635, "learning_rate": 0.00030011293054771316, "loss": 3.5093, "step": 10620 }, { "epoch": 2.0, "eval_accuracy": 0.1472, "eval_loss": 3.5664186477661133, "eval_runtime": 82.3517, "eval_samples_per_second": 91.073, "eval_steps_per_second": 11.39, "step": 10626 }, { "epoch": 2.0007528703180877, "grad_norm": 2.0325100421905518, "learning_rate": 0.00029992471296819124, "loss": 3.6207, "step": 10630 }, { "epoch": 2.002635046113307, "grad_norm": 1.9634112119674683, "learning_rate": 0.0002997364953886693, "loss": 3.8266, "step": 10640 }, { "epoch": 2.004517221908526, "grad_norm": 2.0167596340179443, "learning_rate": 0.0002995482778091474, "loss": 3.668, "step": 10650 }, { "epoch": 2.0063993977037455, "grad_norm": 1.9793752431869507, "learning_rate": 0.00029936006022962547, "loss": 3.7249, "step": 10660 }, { "epoch": 2.008281573498965, "grad_norm": 1.982495665550232, "learning_rate": 0.0002991718426501035, "loss": 3.7413, "step": 10670 }, { "epoch": 2.010163749294184, "grad_norm": 2.1777305603027344, "learning_rate": 0.00029898362507058156, "loss": 3.7726, "step": 10680 }, { "epoch": 2.0120459250894034, "grad_norm": 2.2762796878814697, "learning_rate": 0.00029879540749105964, "loss": 3.6714, "step": 10690 }, { "epoch": 2.013928100884623, "grad_norm": 1.967294692993164, "learning_rate": 0.0002986071899115377, "loss": 3.6496, "step": 10700 }, { "epoch": 2.015810276679842, "grad_norm": 2.13759708404541, "learning_rate": 0.0002984189723320158, "loss": 3.5799, "step": 10710 }, { "epoch": 2.0176924524750612, "grad_norm": 2.114488124847412, "learning_rate": 0.00029823075475249387, "loss": 3.5195, "step": 10720 }, { "epoch": 2.0195746282702802, "grad_norm": 1.7854301929473877, "learning_rate": 0.00029804253717297195, "loss": 3.5987, "step": 10730 }, { "epoch": 2.0214568040654997, "grad_norm": 2.2474656105041504, "learning_rate": 0.0002978543195934501, "loss": 3.6328, "step": 10740 }, { "epoch": 2.023338979860719, "grad_norm": 1.9125057458877563, "learning_rate": 0.00029766610201392815, "loss": 3.5598, "step": 10750 }, { "epoch": 2.025221155655938, "grad_norm": 2.0102808475494385, "learning_rate": 0.0002974778844344062, "loss": 3.5107, "step": 10760 }, { "epoch": 2.0271033314511575, "grad_norm": 2.132345676422119, "learning_rate": 0.00029728966685488425, "loss": 3.5667, "step": 10770 }, { "epoch": 2.028985507246377, "grad_norm": 3.9572272300720215, "learning_rate": 0.00029710144927536233, "loss": 3.3139, "step": 10780 }, { "epoch": 2.030867683041596, "grad_norm": 2.37929105758667, "learning_rate": 0.0002969132316958404, "loss": 3.8562, "step": 10790 }, { "epoch": 2.0327498588368154, "grad_norm": 2.9635579586029053, "learning_rate": 0.0002967250141163185, "loss": 3.5757, "step": 10800 }, { "epoch": 2.034632034632035, "grad_norm": 1.985288381576538, "learning_rate": 0.00029653679653679656, "loss": 3.5637, "step": 10810 }, { "epoch": 2.036514210427254, "grad_norm": 2.2944083213806152, "learning_rate": 0.00029634857895727463, "loss": 3.6574, "step": 10820 }, { "epoch": 2.038396386222473, "grad_norm": 2.4931163787841797, "learning_rate": 0.0002961603613777527, "loss": 3.3834, "step": 10830 }, { "epoch": 2.0402785620176926, "grad_norm": 2.0097668170928955, "learning_rate": 0.0002959721437982308, "loss": 3.4972, "step": 10840 }, { "epoch": 2.0421607378129116, "grad_norm": 2.063840627670288, "learning_rate": 0.0002957839262187088, "loss": 3.7874, "step": 10850 }, { "epoch": 2.044042913608131, "grad_norm": 2.0522704124450684, "learning_rate": 0.0002955957086391869, "loss": 3.5479, "step": 10860 }, { "epoch": 2.04592508940335, "grad_norm": 2.0130808353424072, "learning_rate": 0.00029540749105966496, "loss": 3.8158, "step": 10870 }, { "epoch": 2.0478072651985695, "grad_norm": 2.5037670135498047, "learning_rate": 0.00029521927348014304, "loss": 3.8272, "step": 10880 }, { "epoch": 2.049689440993789, "grad_norm": 2.120812177658081, "learning_rate": 0.0002950310559006211, "loss": 3.6263, "step": 10890 }, { "epoch": 2.051571616789008, "grad_norm": 2.170079469680786, "learning_rate": 0.0002948428383210992, "loss": 3.7242, "step": 10900 }, { "epoch": 2.0534537925842273, "grad_norm": 1.8562462329864502, "learning_rate": 0.00029465462074157727, "loss": 3.6286, "step": 10910 }, { "epoch": 2.0553359683794468, "grad_norm": 2.1779134273529053, "learning_rate": 0.00029446640316205534, "loss": 3.5787, "step": 10920 }, { "epoch": 2.0572181441746658, "grad_norm": 2.044370651245117, "learning_rate": 0.0002942781855825334, "loss": 3.6205, "step": 10930 }, { "epoch": 2.059100319969885, "grad_norm": 2.4833273887634277, "learning_rate": 0.00029408996800301144, "loss": 3.4728, "step": 10940 }, { "epoch": 2.0609824957651046, "grad_norm": 2.2788689136505127, "learning_rate": 0.0002939017504234895, "loss": 3.5166, "step": 10950 }, { "epoch": 2.0628646715603236, "grad_norm": 2.383826971054077, "learning_rate": 0.00029371353284396765, "loss": 3.7058, "step": 10960 }, { "epoch": 2.064746847355543, "grad_norm": 2.5065038204193115, "learning_rate": 0.0002935253152644457, "loss": 3.4283, "step": 10970 }, { "epoch": 2.0666290231507625, "grad_norm": 2.153620958328247, "learning_rate": 0.0002933370976849238, "loss": 3.3929, "step": 10980 }, { "epoch": 2.0685111989459815, "grad_norm": 2.111159563064575, "learning_rate": 0.0002931488801054019, "loss": 3.3886, "step": 10990 }, { "epoch": 2.070393374741201, "grad_norm": 2.286848545074463, "learning_rate": 0.00029296066252587996, "loss": 3.6433, "step": 11000 }, { "epoch": 2.0722755505364203, "grad_norm": 2.1899757385253906, "learning_rate": 0.00029277244494635803, "loss": 3.8215, "step": 11010 }, { "epoch": 2.0741577263316393, "grad_norm": 2.477233409881592, "learning_rate": 0.00029258422736683605, "loss": 3.7537, "step": 11020 }, { "epoch": 2.0760399021268587, "grad_norm": 2.1020686626434326, "learning_rate": 0.00029239600978731413, "loss": 3.6285, "step": 11030 }, { "epoch": 2.0779220779220777, "grad_norm": 1.9838166236877441, "learning_rate": 0.0002922077922077922, "loss": 3.8239, "step": 11040 }, { "epoch": 2.079804253717297, "grad_norm": 2.234279155731201, "learning_rate": 0.0002920195746282703, "loss": 3.5579, "step": 11050 }, { "epoch": 2.0816864295125166, "grad_norm": 2.4791712760925293, "learning_rate": 0.00029183135704874836, "loss": 3.7128, "step": 11060 }, { "epoch": 2.0835686053077356, "grad_norm": 2.0139009952545166, "learning_rate": 0.00029164313946922644, "loss": 3.497, "step": 11070 }, { "epoch": 2.085450781102955, "grad_norm": 1.918221354484558, "learning_rate": 0.0002914549218897045, "loss": 3.5534, "step": 11080 }, { "epoch": 2.0873329568981744, "grad_norm": 2.0779190063476562, "learning_rate": 0.0002912667043101826, "loss": 3.6238, "step": 11090 }, { "epoch": 2.0892151326933934, "grad_norm": 2.1959855556488037, "learning_rate": 0.00029107848673066067, "loss": 3.5225, "step": 11100 }, { "epoch": 2.091097308488613, "grad_norm": 1.7541618347167969, "learning_rate": 0.0002908902691511387, "loss": 3.7214, "step": 11110 }, { "epoch": 2.0929794842838323, "grad_norm": 1.9482086896896362, "learning_rate": 0.00029070205157161676, "loss": 3.4, "step": 11120 }, { "epoch": 2.0948616600790513, "grad_norm": 1.9692625999450684, "learning_rate": 0.00029051383399209484, "loss": 3.643, "step": 11130 }, { "epoch": 2.0967438358742707, "grad_norm": 2.198240280151367, "learning_rate": 0.0002903256164125729, "loss": 3.4483, "step": 11140 }, { "epoch": 2.09862601166949, "grad_norm": 1.9893345832824707, "learning_rate": 0.000290137398833051, "loss": 3.7261, "step": 11150 }, { "epoch": 2.100508187464709, "grad_norm": 2.090209484100342, "learning_rate": 0.00028994918125352907, "loss": 3.3484, "step": 11160 }, { "epoch": 2.1023903632599286, "grad_norm": 2.24429988861084, "learning_rate": 0.00028976096367400715, "loss": 3.604, "step": 11170 }, { "epoch": 2.1042725390551476, "grad_norm": 2.234273910522461, "learning_rate": 0.0002895727460944853, "loss": 3.4037, "step": 11180 }, { "epoch": 2.106154714850367, "grad_norm": 2.5589098930358887, "learning_rate": 0.00028938452851496335, "loss": 3.443, "step": 11190 }, { "epoch": 2.1080368906455864, "grad_norm": 2.2039730548858643, "learning_rate": 0.0002891963109354414, "loss": 3.8742, "step": 11200 }, { "epoch": 2.1099190664408054, "grad_norm": 2.143895149230957, "learning_rate": 0.00028900809335591945, "loss": 3.5561, "step": 11210 }, { "epoch": 2.111801242236025, "grad_norm": 2.077723741531372, "learning_rate": 0.00028881987577639753, "loss": 3.3852, "step": 11220 }, { "epoch": 2.1136834180312443, "grad_norm": 2.2736127376556396, "learning_rate": 0.0002886316581968756, "loss": 3.4958, "step": 11230 }, { "epoch": 2.1155655938264633, "grad_norm": 2.400620937347412, "learning_rate": 0.0002884434406173537, "loss": 3.785, "step": 11240 }, { "epoch": 2.1174477696216827, "grad_norm": 2.3708560466766357, "learning_rate": 0.00028825522303783176, "loss": 3.6247, "step": 11250 }, { "epoch": 2.119329945416902, "grad_norm": 2.456202983856201, "learning_rate": 0.00028806700545830983, "loss": 3.5149, "step": 11260 }, { "epoch": 2.121212121212121, "grad_norm": 2.273723840713501, "learning_rate": 0.0002878787878787879, "loss": 3.6566, "step": 11270 }, { "epoch": 2.1230942970073405, "grad_norm": 2.1427016258239746, "learning_rate": 0.00028769057029926593, "loss": 3.8057, "step": 11280 }, { "epoch": 2.12497647280256, "grad_norm": 2.0322492122650146, "learning_rate": 0.000287502352719744, "loss": 3.7683, "step": 11290 }, { "epoch": 2.126858648597779, "grad_norm": 2.18509578704834, "learning_rate": 0.0002873141351402221, "loss": 3.7631, "step": 11300 }, { "epoch": 2.1287408243929984, "grad_norm": 2.1193268299102783, "learning_rate": 0.00028712591756070016, "loss": 3.5023, "step": 11310 }, { "epoch": 2.1306230001882174, "grad_norm": 1.880686640739441, "learning_rate": 0.00028693769998117824, "loss": 3.6056, "step": 11320 }, { "epoch": 2.132505175983437, "grad_norm": 2.0465972423553467, "learning_rate": 0.0002867494824016563, "loss": 3.9191, "step": 11330 }, { "epoch": 2.1343873517786562, "grad_norm": 2.2074384689331055, "learning_rate": 0.0002865612648221344, "loss": 3.4342, "step": 11340 }, { "epoch": 2.1362695275738752, "grad_norm": 2.1300830841064453, "learning_rate": 0.00028637304724261247, "loss": 3.5027, "step": 11350 }, { "epoch": 2.1381517033690947, "grad_norm": 2.1655023097991943, "learning_rate": 0.00028618482966309054, "loss": 3.8552, "step": 11360 }, { "epoch": 2.140033879164314, "grad_norm": 2.5276312828063965, "learning_rate": 0.00028599661208356857, "loss": 3.5268, "step": 11370 }, { "epoch": 2.141916054959533, "grad_norm": 2.214873790740967, "learning_rate": 0.00028580839450404664, "loss": 3.3984, "step": 11380 }, { "epoch": 2.1437982307547525, "grad_norm": 2.229903221130371, "learning_rate": 0.0002856201769245247, "loss": 3.8769, "step": 11390 }, { "epoch": 2.145680406549972, "grad_norm": 2.1930768489837646, "learning_rate": 0.0002854319593450028, "loss": 3.8591, "step": 11400 }, { "epoch": 2.147562582345191, "grad_norm": 2.090045690536499, "learning_rate": 0.0002852437417654809, "loss": 3.4975, "step": 11410 }, { "epoch": 2.1494447581404104, "grad_norm": 2.0409741401672363, "learning_rate": 0.000285055524185959, "loss": 3.3442, "step": 11420 }, { "epoch": 2.15132693393563, "grad_norm": 2.0023446083068848, "learning_rate": 0.0002848673066064371, "loss": 3.7851, "step": 11430 }, { "epoch": 2.153209109730849, "grad_norm": 2.3585290908813477, "learning_rate": 0.00028467908902691516, "loss": 3.6652, "step": 11440 }, { "epoch": 2.155091285526068, "grad_norm": 1.9051262140274048, "learning_rate": 0.00028449087144739323, "loss": 3.6001, "step": 11450 }, { "epoch": 2.1569734613212876, "grad_norm": 2.1293647289276123, "learning_rate": 0.00028430265386787125, "loss": 3.8165, "step": 11460 }, { "epoch": 2.1588556371165066, "grad_norm": 1.993503451347351, "learning_rate": 0.00028411443628834933, "loss": 3.692, "step": 11470 }, { "epoch": 2.160737812911726, "grad_norm": 2.0621178150177, "learning_rate": 0.0002839262187088274, "loss": 3.7678, "step": 11480 }, { "epoch": 2.162619988706945, "grad_norm": 2.0874807834625244, "learning_rate": 0.0002837380011293055, "loss": 3.6538, "step": 11490 }, { "epoch": 2.1645021645021645, "grad_norm": 2.309434413909912, "learning_rate": 0.00028354978354978356, "loss": 3.5588, "step": 11500 }, { "epoch": 2.166384340297384, "grad_norm": 2.163496494293213, "learning_rate": 0.00028336156597026164, "loss": 3.5295, "step": 11510 }, { "epoch": 2.168266516092603, "grad_norm": 2.23266863822937, "learning_rate": 0.0002831733483907397, "loss": 3.3534, "step": 11520 }, { "epoch": 2.1701486918878223, "grad_norm": 2.2005791664123535, "learning_rate": 0.0002829851308112178, "loss": 3.4239, "step": 11530 }, { "epoch": 2.1720308676830418, "grad_norm": 2.8036704063415527, "learning_rate": 0.00028279691323169587, "loss": 3.6307, "step": 11540 }, { "epoch": 2.1739130434782608, "grad_norm": 2.1723809242248535, "learning_rate": 0.0002826086956521739, "loss": 3.5663, "step": 11550 }, { "epoch": 2.17579521927348, "grad_norm": 2.3068792819976807, "learning_rate": 0.00028242047807265196, "loss": 3.6506, "step": 11560 }, { "epoch": 2.1776773950686996, "grad_norm": 2.3664402961730957, "learning_rate": 0.00028223226049313004, "loss": 3.9696, "step": 11570 }, { "epoch": 2.1795595708639186, "grad_norm": 2.1242728233337402, "learning_rate": 0.0002820440429136081, "loss": 3.6441, "step": 11580 }, { "epoch": 2.181441746659138, "grad_norm": 1.8504153490066528, "learning_rate": 0.0002818558253340862, "loss": 3.2865, "step": 11590 }, { "epoch": 2.183323922454357, "grad_norm": 2.618985176086426, "learning_rate": 0.00028166760775456427, "loss": 3.7115, "step": 11600 }, { "epoch": 2.1852060982495765, "grad_norm": 2.038625478744507, "learning_rate": 0.00028147939017504235, "loss": 3.4573, "step": 11610 }, { "epoch": 2.187088274044796, "grad_norm": 2.480666160583496, "learning_rate": 0.0002812911725955205, "loss": 3.8504, "step": 11620 }, { "epoch": 2.188970449840015, "grad_norm": 2.2521214485168457, "learning_rate": 0.0002811029550159985, "loss": 3.614, "step": 11630 }, { "epoch": 2.1908526256352343, "grad_norm": 3.09513258934021, "learning_rate": 0.0002809147374364766, "loss": 3.8761, "step": 11640 }, { "epoch": 2.1927348014304537, "grad_norm": 2.276827335357666, "learning_rate": 0.00028072651985695465, "loss": 3.3449, "step": 11650 }, { "epoch": 2.1946169772256727, "grad_norm": 2.4314024448394775, "learning_rate": 0.00028053830227743273, "loss": 3.6035, "step": 11660 }, { "epoch": 2.196499153020892, "grad_norm": 2.1677989959716797, "learning_rate": 0.0002803500846979108, "loss": 3.7463, "step": 11670 }, { "epoch": 2.1983813288161116, "grad_norm": 2.4452247619628906, "learning_rate": 0.0002801618671183889, "loss": 3.976, "step": 11680 }, { "epoch": 2.2002635046113306, "grad_norm": 2.3938846588134766, "learning_rate": 0.00027997364953886696, "loss": 3.696, "step": 11690 }, { "epoch": 2.20214568040655, "grad_norm": 2.150054454803467, "learning_rate": 0.00027978543195934503, "loss": 3.4701, "step": 11700 }, { "epoch": 2.2040278562017694, "grad_norm": 2.1537985801696777, "learning_rate": 0.0002795972143798231, "loss": 3.63, "step": 11710 }, { "epoch": 2.2059100319969884, "grad_norm": 2.190314292907715, "learning_rate": 0.00027940899680030113, "loss": 3.4569, "step": 11720 }, { "epoch": 2.207792207792208, "grad_norm": 2.0873048305511475, "learning_rate": 0.0002792207792207792, "loss": 3.5355, "step": 11730 }, { "epoch": 2.2096743835874273, "grad_norm": 2.0455803871154785, "learning_rate": 0.0002790325616412573, "loss": 3.1505, "step": 11740 }, { "epoch": 2.2115565593826463, "grad_norm": 2.204108953475952, "learning_rate": 0.00027884434406173536, "loss": 3.523, "step": 11750 }, { "epoch": 2.2134387351778657, "grad_norm": 2.099332332611084, "learning_rate": 0.00027865612648221344, "loss": 3.6056, "step": 11760 }, { "epoch": 2.2153209109730847, "grad_norm": 2.1077349185943604, "learning_rate": 0.0002784679089026915, "loss": 3.6528, "step": 11770 }, { "epoch": 2.217203086768304, "grad_norm": 2.1245169639587402, "learning_rate": 0.0002782796913231696, "loss": 3.5215, "step": 11780 }, { "epoch": 2.2190852625635236, "grad_norm": 2.1498658657073975, "learning_rate": 0.00027809147374364767, "loss": 3.814, "step": 11790 }, { "epoch": 2.2209674383587426, "grad_norm": 2.0961034297943115, "learning_rate": 0.00027790325616412574, "loss": 3.3516, "step": 11800 }, { "epoch": 2.222849614153962, "grad_norm": 2.0248188972473145, "learning_rate": 0.00027771503858460377, "loss": 3.7715, "step": 11810 }, { "epoch": 2.2247317899491814, "grad_norm": 2.018819570541382, "learning_rate": 0.00027752682100508184, "loss": 3.645, "step": 11820 }, { "epoch": 2.2266139657444004, "grad_norm": 1.9713603258132935, "learning_rate": 0.0002773386034255599, "loss": 3.4662, "step": 11830 }, { "epoch": 2.22849614153962, "grad_norm": 2.605026960372925, "learning_rate": 0.000277150385846038, "loss": 3.5945, "step": 11840 }, { "epoch": 2.2303783173348393, "grad_norm": 1.9455453157424927, "learning_rate": 0.0002769621682665161, "loss": 3.7957, "step": 11850 }, { "epoch": 2.2322604931300583, "grad_norm": 2.095759630203247, "learning_rate": 0.0002767739506869942, "loss": 3.3663, "step": 11860 }, { "epoch": 2.2341426689252777, "grad_norm": 2.201913595199585, "learning_rate": 0.0002765857331074723, "loss": 3.3748, "step": 11870 }, { "epoch": 2.2360248447204967, "grad_norm": 2.163905143737793, "learning_rate": 0.00027639751552795036, "loss": 3.7517, "step": 11880 }, { "epoch": 2.237907020515716, "grad_norm": 2.3969902992248535, "learning_rate": 0.0002762092979484284, "loss": 3.7489, "step": 11890 }, { "epoch": 2.2397891963109355, "grad_norm": 1.9157522916793823, "learning_rate": 0.00027602108036890645, "loss": 3.4607, "step": 11900 }, { "epoch": 2.2416713721061545, "grad_norm": 2.334146499633789, "learning_rate": 0.00027583286278938453, "loss": 3.7616, "step": 11910 }, { "epoch": 2.243553547901374, "grad_norm": 2.0595529079437256, "learning_rate": 0.0002756446452098626, "loss": 3.5452, "step": 11920 }, { "epoch": 2.2454357236965934, "grad_norm": 2.5498175621032715, "learning_rate": 0.0002754564276303407, "loss": 3.3551, "step": 11930 }, { "epoch": 2.2473178994918124, "grad_norm": 2.1727492809295654, "learning_rate": 0.00027526821005081876, "loss": 3.2736, "step": 11940 }, { "epoch": 2.249200075287032, "grad_norm": 2.6587021350860596, "learning_rate": 0.00027507999247129684, "loss": 3.5636, "step": 11950 }, { "epoch": 2.2510822510822512, "grad_norm": 2.283400297164917, "learning_rate": 0.0002748917748917749, "loss": 3.5174, "step": 11960 }, { "epoch": 2.2529644268774702, "grad_norm": 2.0880675315856934, "learning_rate": 0.000274703557312253, "loss": 3.474, "step": 11970 }, { "epoch": 2.2548466026726897, "grad_norm": 2.504103660583496, "learning_rate": 0.000274515339732731, "loss": 3.5673, "step": 11980 }, { "epoch": 2.256728778467909, "grad_norm": 2.2468066215515137, "learning_rate": 0.0002743271221532091, "loss": 3.3428, "step": 11990 }, { "epoch": 2.258610954263128, "grad_norm": 2.028759717941284, "learning_rate": 0.00027413890457368716, "loss": 3.6325, "step": 12000 }, { "epoch": 2.2604931300583475, "grad_norm": 2.020503044128418, "learning_rate": 0.00027395068699416524, "loss": 3.5208, "step": 12010 }, { "epoch": 2.262375305853567, "grad_norm": 2.351266384124756, "learning_rate": 0.0002737624694146433, "loss": 3.5115, "step": 12020 }, { "epoch": 2.264257481648786, "grad_norm": 2.273869752883911, "learning_rate": 0.0002735742518351214, "loss": 3.685, "step": 12030 }, { "epoch": 2.2661396574440054, "grad_norm": 2.696059465408325, "learning_rate": 0.00027338603425559947, "loss": 3.6462, "step": 12040 }, { "epoch": 2.2680218332392243, "grad_norm": 1.843395709991455, "learning_rate": 0.00027319781667607755, "loss": 3.3474, "step": 12050 }, { "epoch": 2.269904009034444, "grad_norm": 2.270089864730835, "learning_rate": 0.0002730095990965556, "loss": 3.4742, "step": 12060 }, { "epoch": 2.271786184829663, "grad_norm": 2.3248908519744873, "learning_rate": 0.0002728213815170337, "loss": 3.4222, "step": 12070 }, { "epoch": 2.273668360624882, "grad_norm": 2.705111026763916, "learning_rate": 0.0002726331639375118, "loss": 3.5915, "step": 12080 }, { "epoch": 2.2755505364201016, "grad_norm": 2.456393003463745, "learning_rate": 0.00027244494635798985, "loss": 3.695, "step": 12090 }, { "epoch": 2.277432712215321, "grad_norm": 1.988847017288208, "learning_rate": 0.00027225672877846793, "loss": 3.54, "step": 12100 }, { "epoch": 2.27931488801054, "grad_norm": 2.099121570587158, "learning_rate": 0.000272068511198946, "loss": 3.4574, "step": 12110 }, { "epoch": 2.2811970638057595, "grad_norm": 2.102046489715576, "learning_rate": 0.0002718802936194241, "loss": 3.8274, "step": 12120 }, { "epoch": 2.283079239600979, "grad_norm": 3.1600725650787354, "learning_rate": 0.00027169207603990216, "loss": 3.6157, "step": 12130 }, { "epoch": 2.284961415396198, "grad_norm": 2.4826478958129883, "learning_rate": 0.00027150385846038023, "loss": 3.5571, "step": 12140 }, { "epoch": 2.2868435911914173, "grad_norm": 2.2563583850860596, "learning_rate": 0.0002713156408808583, "loss": 3.6172, "step": 12150 }, { "epoch": 2.2887257669866363, "grad_norm": 2.2361643314361572, "learning_rate": 0.00027112742330133633, "loss": 3.4263, "step": 12160 }, { "epoch": 2.2906079427818558, "grad_norm": 2.2822108268737793, "learning_rate": 0.0002709392057218144, "loss": 3.4694, "step": 12170 }, { "epoch": 2.292490118577075, "grad_norm": 2.0215232372283936, "learning_rate": 0.0002707509881422925, "loss": 3.6782, "step": 12180 }, { "epoch": 2.2943722943722946, "grad_norm": 2.178675651550293, "learning_rate": 0.00027056277056277056, "loss": 3.626, "step": 12190 }, { "epoch": 2.2962544701675136, "grad_norm": 2.0457639694213867, "learning_rate": 0.00027037455298324864, "loss": 3.5606, "step": 12200 }, { "epoch": 2.298136645962733, "grad_norm": 2.4033384323120117, "learning_rate": 0.0002701863354037267, "loss": 3.3515, "step": 12210 }, { "epoch": 2.300018821757952, "grad_norm": 2.2202134132385254, "learning_rate": 0.0002699981178242048, "loss": 3.5911, "step": 12220 }, { "epoch": 2.3019009975531715, "grad_norm": 2.3605339527130127, "learning_rate": 0.00026980990024468287, "loss": 3.6939, "step": 12230 }, { "epoch": 2.303783173348391, "grad_norm": 2.1413116455078125, "learning_rate": 0.0002696216826651609, "loss": 3.7362, "step": 12240 }, { "epoch": 2.30566534914361, "grad_norm": 2.47611403465271, "learning_rate": 0.00026943346508563897, "loss": 3.3767, "step": 12250 }, { "epoch": 2.3075475249388293, "grad_norm": 2.8459672927856445, "learning_rate": 0.00026924524750611704, "loss": 3.5238, "step": 12260 }, { "epoch": 2.3094297007340487, "grad_norm": 2.266383171081543, "learning_rate": 0.0002690570299265951, "loss": 3.7339, "step": 12270 }, { "epoch": 2.3113118765292677, "grad_norm": 2.063275098800659, "learning_rate": 0.0002688688123470732, "loss": 3.4752, "step": 12280 }, { "epoch": 2.313194052324487, "grad_norm": 2.5126569271087646, "learning_rate": 0.0002686805947675513, "loss": 3.596, "step": 12290 }, { "epoch": 2.3150762281197066, "grad_norm": 2.4327335357666016, "learning_rate": 0.0002684923771880294, "loss": 3.6448, "step": 12300 }, { "epoch": 2.3169584039149256, "grad_norm": 2.125715494155884, "learning_rate": 0.0002683041596085075, "loss": 3.3007, "step": 12310 }, { "epoch": 2.318840579710145, "grad_norm": 2.6347568035125732, "learning_rate": 0.00026811594202898556, "loss": 3.6187, "step": 12320 }, { "epoch": 2.320722755505364, "grad_norm": 2.244523048400879, "learning_rate": 0.0002679277244494636, "loss": 3.6203, "step": 12330 }, { "epoch": 2.3226049313005834, "grad_norm": 2.629132032394409, "learning_rate": 0.00026773950686994165, "loss": 3.5333, "step": 12340 }, { "epoch": 2.324487107095803, "grad_norm": 2.6131486892700195, "learning_rate": 0.00026755128929041973, "loss": 3.6913, "step": 12350 }, { "epoch": 2.326369282891022, "grad_norm": 2.2687394618988037, "learning_rate": 0.0002673630717108978, "loss": 3.5431, "step": 12360 }, { "epoch": 2.3282514586862413, "grad_norm": 1.9078861474990845, "learning_rate": 0.0002671748541313759, "loss": 3.721, "step": 12370 }, { "epoch": 2.3301336344814607, "grad_norm": 2.5289957523345947, "learning_rate": 0.00026698663655185396, "loss": 3.6161, "step": 12380 }, { "epoch": 2.3320158102766797, "grad_norm": 2.075119733810425, "learning_rate": 0.00026679841897233204, "loss": 3.5798, "step": 12390 }, { "epoch": 2.333897986071899, "grad_norm": 2.6997997760772705, "learning_rate": 0.0002666102013928101, "loss": 3.8208, "step": 12400 }, { "epoch": 2.3357801618671186, "grad_norm": 2.278843641281128, "learning_rate": 0.0002664219838132882, "loss": 3.7117, "step": 12410 }, { "epoch": 2.3376623376623376, "grad_norm": 2.0488789081573486, "learning_rate": 0.0002662337662337662, "loss": 3.514, "step": 12420 }, { "epoch": 2.339544513457557, "grad_norm": 2.1381616592407227, "learning_rate": 0.0002660455486542443, "loss": 3.8678, "step": 12430 }, { "epoch": 2.341426689252776, "grad_norm": 2.3508248329162598, "learning_rate": 0.00026585733107472236, "loss": 3.6683, "step": 12440 }, { "epoch": 2.3433088650479954, "grad_norm": 1.9220882654190063, "learning_rate": 0.00026566911349520044, "loss": 3.4588, "step": 12450 }, { "epoch": 2.345191040843215, "grad_norm": 2.929403066635132, "learning_rate": 0.0002654808959156785, "loss": 3.3913, "step": 12460 }, { "epoch": 2.3470732166384343, "grad_norm": 2.3121893405914307, "learning_rate": 0.0002652926783361566, "loss": 3.3751, "step": 12470 }, { "epoch": 2.3489553924336533, "grad_norm": 2.4507408142089844, "learning_rate": 0.00026510446075663467, "loss": 3.7149, "step": 12480 }, { "epoch": 2.3508375682288727, "grad_norm": 2.333505630493164, "learning_rate": 0.00026491624317711275, "loss": 3.4675, "step": 12490 }, { "epoch": 2.3527197440240917, "grad_norm": 2.398038148880005, "learning_rate": 0.00026472802559759077, "loss": 3.7468, "step": 12500 }, { "epoch": 2.354601919819311, "grad_norm": 2.136672019958496, "learning_rate": 0.0002645398080180689, "loss": 3.7055, "step": 12510 }, { "epoch": 2.3564840956145305, "grad_norm": 2.207770586013794, "learning_rate": 0.000264351590438547, "loss": 3.6393, "step": 12520 }, { "epoch": 2.3583662714097495, "grad_norm": 2.111809015274048, "learning_rate": 0.00026416337285902505, "loss": 3.4927, "step": 12530 }, { "epoch": 2.360248447204969, "grad_norm": 1.9219146966934204, "learning_rate": 0.00026397515527950313, "loss": 3.3832, "step": 12540 }, { "epoch": 2.3621306230001884, "grad_norm": 2.373422384262085, "learning_rate": 0.0002637869376999812, "loss": 3.3155, "step": 12550 }, { "epoch": 2.3640127987954074, "grad_norm": 2.4442925453186035, "learning_rate": 0.0002635987201204593, "loss": 3.6199, "step": 12560 }, { "epoch": 2.365894974590627, "grad_norm": 2.070446252822876, "learning_rate": 0.00026341050254093736, "loss": 3.5309, "step": 12570 }, { "epoch": 2.3677771503858462, "grad_norm": 2.5628628730773926, "learning_rate": 0.00026322228496141543, "loss": 3.5052, "step": 12580 }, { "epoch": 2.3696593261810652, "grad_norm": 2.644749879837036, "learning_rate": 0.00026303406738189346, "loss": 3.403, "step": 12590 }, { "epoch": 2.3715415019762847, "grad_norm": 2.175874948501587, "learning_rate": 0.00026284584980237153, "loss": 3.3338, "step": 12600 }, { "epoch": 2.3734236777715036, "grad_norm": 2.5138566493988037, "learning_rate": 0.0002626576322228496, "loss": 3.3175, "step": 12610 }, { "epoch": 2.375305853566723, "grad_norm": 2.261333465576172, "learning_rate": 0.0002624694146433277, "loss": 3.3355, "step": 12620 }, { "epoch": 2.3771880293619425, "grad_norm": 2.922394275665283, "learning_rate": 0.00026228119706380576, "loss": 3.5954, "step": 12630 }, { "epoch": 2.3790702051571615, "grad_norm": 2.592365026473999, "learning_rate": 0.00026209297948428384, "loss": 3.6931, "step": 12640 }, { "epoch": 2.380952380952381, "grad_norm": 2.305166721343994, "learning_rate": 0.0002619047619047619, "loss": 3.4529, "step": 12650 }, { "epoch": 2.3828345567476004, "grad_norm": 2.466231346130371, "learning_rate": 0.00026171654432524, "loss": 3.1573, "step": 12660 }, { "epoch": 2.3847167325428194, "grad_norm": 2.3147668838500977, "learning_rate": 0.00026152832674571807, "loss": 3.6978, "step": 12670 }, { "epoch": 2.386598908338039, "grad_norm": 2.0412979125976562, "learning_rate": 0.0002613401091661961, "loss": 3.6198, "step": 12680 }, { "epoch": 2.388481084133258, "grad_norm": 2.3344006538391113, "learning_rate": 0.00026115189158667417, "loss": 3.5463, "step": 12690 }, { "epoch": 2.390363259928477, "grad_norm": 2.2637939453125, "learning_rate": 0.00026096367400715224, "loss": 3.7714, "step": 12700 }, { "epoch": 2.3922454357236966, "grad_norm": 2.0158164501190186, "learning_rate": 0.0002607754564276303, "loss": 3.6364, "step": 12710 }, { "epoch": 2.3941276115189156, "grad_norm": 2.035423755645752, "learning_rate": 0.0002605872388481084, "loss": 3.6938, "step": 12720 }, { "epoch": 2.396009787314135, "grad_norm": 1.8480379581451416, "learning_rate": 0.0002603990212685865, "loss": 3.8096, "step": 12730 }, { "epoch": 2.3978919631093545, "grad_norm": 2.079611301422119, "learning_rate": 0.0002602108036890646, "loss": 3.4641, "step": 12740 }, { "epoch": 2.399774138904574, "grad_norm": 1.9311271905899048, "learning_rate": 0.0002600225861095427, "loss": 3.5725, "step": 12750 }, { "epoch": 2.401656314699793, "grad_norm": 2.1596946716308594, "learning_rate": 0.0002598343685300207, "loss": 3.7113, "step": 12760 }, { "epoch": 2.4035384904950123, "grad_norm": 2.4744985103607178, "learning_rate": 0.0002596461509504988, "loss": 3.5168, "step": 12770 }, { "epoch": 2.4054206662902313, "grad_norm": 2.025146245956421, "learning_rate": 0.00025945793337097685, "loss": 3.9381, "step": 12780 }, { "epoch": 2.4073028420854508, "grad_norm": 2.1392645835876465, "learning_rate": 0.00025926971579145493, "loss": 3.3267, "step": 12790 }, { "epoch": 2.40918501788067, "grad_norm": 2.1594839096069336, "learning_rate": 0.000259081498211933, "loss": 3.7087, "step": 12800 }, { "epoch": 2.411067193675889, "grad_norm": 1.840181827545166, "learning_rate": 0.0002588932806324111, "loss": 3.5356, "step": 12810 }, { "epoch": 2.4129493694711086, "grad_norm": 2.2144663333892822, "learning_rate": 0.00025870506305288916, "loss": 3.4101, "step": 12820 }, { "epoch": 2.414831545266328, "grad_norm": 2.0941808223724365, "learning_rate": 0.00025851684547336724, "loss": 3.7866, "step": 12830 }, { "epoch": 2.416713721061547, "grad_norm": 2.089059829711914, "learning_rate": 0.0002583286278938453, "loss": 3.6845, "step": 12840 }, { "epoch": 2.4185958968567665, "grad_norm": 2.912400960922241, "learning_rate": 0.00025814041031432334, "loss": 3.7385, "step": 12850 }, { "epoch": 2.420478072651986, "grad_norm": 2.2902355194091797, "learning_rate": 0.0002579521927348014, "loss": 3.7033, "step": 12860 }, { "epoch": 2.422360248447205, "grad_norm": 1.9202638864517212, "learning_rate": 0.0002577639751552795, "loss": 3.6759, "step": 12870 }, { "epoch": 2.4242424242424243, "grad_norm": 2.116422414779663, "learning_rate": 0.00025757575757575756, "loss": 3.7004, "step": 12880 }, { "epoch": 2.4261246000376433, "grad_norm": 2.2019739151000977, "learning_rate": 0.00025738753999623564, "loss": 3.4297, "step": 12890 }, { "epoch": 2.4280067758328627, "grad_norm": 2.3102312088012695, "learning_rate": 0.0002571993224167137, "loss": 3.6366, "step": 12900 }, { "epoch": 2.429888951628082, "grad_norm": 2.1590123176574707, "learning_rate": 0.0002570111048371918, "loss": 3.4755, "step": 12910 }, { "epoch": 2.431771127423301, "grad_norm": 2.459792137145996, "learning_rate": 0.00025682288725766987, "loss": 3.4308, "step": 12920 }, { "epoch": 2.4336533032185206, "grad_norm": 2.6131508350372314, "learning_rate": 0.00025663466967814795, "loss": 3.3086, "step": 12930 }, { "epoch": 2.43553547901374, "grad_norm": 2.1226155757904053, "learning_rate": 0.00025644645209862597, "loss": 3.5569, "step": 12940 }, { "epoch": 2.437417654808959, "grad_norm": 2.094432830810547, "learning_rate": 0.00025625823451910405, "loss": 3.7615, "step": 12950 }, { "epoch": 2.4392998306041784, "grad_norm": 2.2213399410247803, "learning_rate": 0.0002560700169395822, "loss": 3.42, "step": 12960 }, { "epoch": 2.441182006399398, "grad_norm": 2.1061394214630127, "learning_rate": 0.00025588179936006025, "loss": 3.6418, "step": 12970 }, { "epoch": 2.443064182194617, "grad_norm": 2.438858985900879, "learning_rate": 0.00025569358178053833, "loss": 3.3957, "step": 12980 }, { "epoch": 2.4449463579898363, "grad_norm": 2.3009002208709717, "learning_rate": 0.0002555053642010164, "loss": 3.4535, "step": 12990 }, { "epoch": 2.4468285337850557, "grad_norm": 2.1550045013427734, "learning_rate": 0.0002553171466214945, "loss": 3.5498, "step": 13000 }, { "epoch": 2.4487107095802747, "grad_norm": 2.06872820854187, "learning_rate": 0.00025512892904197256, "loss": 3.4783, "step": 13010 }, { "epoch": 2.450592885375494, "grad_norm": 1.8921680450439453, "learning_rate": 0.00025494071146245063, "loss": 3.7661, "step": 13020 }, { "epoch": 2.4524750611707136, "grad_norm": 2.2813234329223633, "learning_rate": 0.00025475249388292866, "loss": 3.5401, "step": 13030 }, { "epoch": 2.4543572369659326, "grad_norm": 2.5784573554992676, "learning_rate": 0.00025456427630340673, "loss": 3.2728, "step": 13040 }, { "epoch": 2.456239412761152, "grad_norm": 1.9490470886230469, "learning_rate": 0.0002543760587238848, "loss": 3.4593, "step": 13050 }, { "epoch": 2.458121588556371, "grad_norm": 2.069911479949951, "learning_rate": 0.0002541878411443629, "loss": 3.418, "step": 13060 }, { "epoch": 2.4600037643515904, "grad_norm": 2.3085179328918457, "learning_rate": 0.00025399962356484096, "loss": 3.3341, "step": 13070 }, { "epoch": 2.46188594014681, "grad_norm": 2.4705824851989746, "learning_rate": 0.00025381140598531904, "loss": 3.733, "step": 13080 }, { "epoch": 2.463768115942029, "grad_norm": 2.3225464820861816, "learning_rate": 0.0002536231884057971, "loss": 3.499, "step": 13090 }, { "epoch": 2.4656502917372483, "grad_norm": 2.043210506439209, "learning_rate": 0.0002534349708262752, "loss": 3.6164, "step": 13100 }, { "epoch": 2.4675324675324677, "grad_norm": 1.996595025062561, "learning_rate": 0.0002532467532467532, "loss": 3.2083, "step": 13110 }, { "epoch": 2.4694146433276867, "grad_norm": 2.048947811126709, "learning_rate": 0.0002530585356672313, "loss": 3.6867, "step": 13120 }, { "epoch": 2.471296819122906, "grad_norm": 2.1778929233551025, "learning_rate": 0.00025287031808770937, "loss": 3.3652, "step": 13130 }, { "epoch": 2.4731789949181255, "grad_norm": 2.0750463008880615, "learning_rate": 0.00025268210050818744, "loss": 3.3186, "step": 13140 }, { "epoch": 2.4750611707133445, "grad_norm": 2.405975818634033, "learning_rate": 0.0002524938829286655, "loss": 3.5292, "step": 13150 }, { "epoch": 2.476943346508564, "grad_norm": 2.224957227706909, "learning_rate": 0.0002523056653491436, "loss": 3.4451, "step": 13160 }, { "epoch": 2.478825522303783, "grad_norm": 2.276805877685547, "learning_rate": 0.00025211744776962167, "loss": 3.8172, "step": 13170 }, { "epoch": 2.4807076980990024, "grad_norm": 2.076664924621582, "learning_rate": 0.0002519292301900998, "loss": 3.4015, "step": 13180 }, { "epoch": 2.482589873894222, "grad_norm": 2.32965087890625, "learning_rate": 0.0002517410126105779, "loss": 3.4551, "step": 13190 }, { "epoch": 2.4844720496894412, "grad_norm": 2.174405336380005, "learning_rate": 0.0002515527950310559, "loss": 3.3485, "step": 13200 }, { "epoch": 2.4863542254846602, "grad_norm": 2.069518804550171, "learning_rate": 0.000251364577451534, "loss": 3.5047, "step": 13210 }, { "epoch": 2.4882364012798797, "grad_norm": 2.1788432598114014, "learning_rate": 0.00025117635987201205, "loss": 3.442, "step": 13220 }, { "epoch": 2.4901185770750986, "grad_norm": 2.014214515686035, "learning_rate": 0.00025098814229249013, "loss": 3.5642, "step": 13230 }, { "epoch": 2.492000752870318, "grad_norm": 2.407564640045166, "learning_rate": 0.0002507999247129682, "loss": 3.6167, "step": 13240 }, { "epoch": 2.4938829286655375, "grad_norm": 2.017845869064331, "learning_rate": 0.0002506117071334463, "loss": 3.7589, "step": 13250 }, { "epoch": 2.4957651044607565, "grad_norm": 2.3813204765319824, "learning_rate": 0.00025042348955392436, "loss": 3.3354, "step": 13260 }, { "epoch": 2.497647280255976, "grad_norm": 2.23264741897583, "learning_rate": 0.00025023527197440244, "loss": 3.5316, "step": 13270 }, { "epoch": 2.4995294560511954, "grad_norm": 2.2221202850341797, "learning_rate": 0.0002500470543948805, "loss": 3.5831, "step": 13280 }, { "epoch": 2.5014116318464144, "grad_norm": 2.5750515460968018, "learning_rate": 0.0002498588368153586, "loss": 3.427, "step": 13290 }, { "epoch": 2.503293807641634, "grad_norm": 2.982229471206665, "learning_rate": 0.00024967061923583667, "loss": 3.4038, "step": 13300 }, { "epoch": 2.505175983436853, "grad_norm": 3.0934581756591797, "learning_rate": 0.0002494824016563147, "loss": 3.5624, "step": 13310 }, { "epoch": 2.507058159232072, "grad_norm": 2.419921398162842, "learning_rate": 0.00024929418407679276, "loss": 3.4934, "step": 13320 }, { "epoch": 2.5089403350272916, "grad_norm": 2.056140899658203, "learning_rate": 0.00024910596649727084, "loss": 3.3226, "step": 13330 }, { "epoch": 2.5108225108225106, "grad_norm": 2.220564603805542, "learning_rate": 0.0002489177489177489, "loss": 3.5698, "step": 13340 }, { "epoch": 2.51270468661773, "grad_norm": 2.1118175983428955, "learning_rate": 0.000248729531338227, "loss": 3.4314, "step": 13350 }, { "epoch": 2.5145868624129495, "grad_norm": 2.184598684310913, "learning_rate": 0.00024854131375870507, "loss": 3.5573, "step": 13360 }, { "epoch": 2.516469038208169, "grad_norm": 2.0085346698760986, "learning_rate": 0.00024835309617918315, "loss": 3.4515, "step": 13370 }, { "epoch": 2.518351214003388, "grad_norm": 2.25004243850708, "learning_rate": 0.0002481648785996612, "loss": 3.5318, "step": 13380 }, { "epoch": 2.5202333897986073, "grad_norm": 2.047452688217163, "learning_rate": 0.0002479766610201393, "loss": 3.4558, "step": 13390 }, { "epoch": 2.5221155655938263, "grad_norm": 2.483903169631958, "learning_rate": 0.0002477884434406174, "loss": 3.6721, "step": 13400 }, { "epoch": 2.5239977413890458, "grad_norm": 2.2482049465179443, "learning_rate": 0.00024760022586109545, "loss": 3.4841, "step": 13410 }, { "epoch": 2.525879917184265, "grad_norm": 2.063401937484741, "learning_rate": 0.00024741200828157353, "loss": 3.4875, "step": 13420 }, { "epoch": 2.527762092979484, "grad_norm": 1.967253565788269, "learning_rate": 0.0002472237907020516, "loss": 3.5811, "step": 13430 }, { "epoch": 2.5296442687747036, "grad_norm": 1.9973634481430054, "learning_rate": 0.00024703557312252963, "loss": 3.4078, "step": 13440 }, { "epoch": 2.5315264445699226, "grad_norm": 2.570246458053589, "learning_rate": 0.0002468473555430077, "loss": 3.5667, "step": 13450 }, { "epoch": 2.533408620365142, "grad_norm": 2.4065957069396973, "learning_rate": 0.0002466591379634858, "loss": 3.458, "step": 13460 }, { "epoch": 2.5352907961603615, "grad_norm": 2.274855852127075, "learning_rate": 0.00024647092038396386, "loss": 3.5801, "step": 13470 }, { "epoch": 2.537172971955581, "grad_norm": 2.714076519012451, "learning_rate": 0.00024628270280444193, "loss": 3.4157, "step": 13480 }, { "epoch": 2.5390551477508, "grad_norm": 3.3540868759155273, "learning_rate": 0.00024609448522492, "loss": 3.8492, "step": 13490 }, { "epoch": 2.5409373235460193, "grad_norm": 2.6228458881378174, "learning_rate": 0.0002459062676453981, "loss": 3.4858, "step": 13500 }, { "epoch": 2.5428194993412383, "grad_norm": 2.2022087574005127, "learning_rate": 0.00024571805006587616, "loss": 3.4548, "step": 13510 }, { "epoch": 2.5447016751364577, "grad_norm": 2.2522380352020264, "learning_rate": 0.00024552983248635424, "loss": 3.5098, "step": 13520 }, { "epoch": 2.546583850931677, "grad_norm": 2.1798369884490967, "learning_rate": 0.0002453416149068323, "loss": 3.658, "step": 13530 }, { "epoch": 2.548466026726896, "grad_norm": 2.4629127979278564, "learning_rate": 0.0002451533973273104, "loss": 3.3775, "step": 13540 }, { "epoch": 2.5503482025221156, "grad_norm": 2.09586763381958, "learning_rate": 0.00024496517974778847, "loss": 3.3251, "step": 13550 }, { "epoch": 2.5522303783173346, "grad_norm": 2.1991000175476074, "learning_rate": 0.00024477696216826654, "loss": 3.4861, "step": 13560 }, { "epoch": 2.554112554112554, "grad_norm": 3.098156690597534, "learning_rate": 0.00024458874458874457, "loss": 3.3637, "step": 13570 }, { "epoch": 2.5559947299077734, "grad_norm": 2.7341153621673584, "learning_rate": 0.00024440052700922264, "loss": 3.5117, "step": 13580 }, { "epoch": 2.557876905702993, "grad_norm": 2.235184907913208, "learning_rate": 0.0002442123094297007, "loss": 3.4566, "step": 13590 }, { "epoch": 2.559759081498212, "grad_norm": 2.4505128860473633, "learning_rate": 0.00024402409185017882, "loss": 3.4255, "step": 13600 }, { "epoch": 2.5616412572934313, "grad_norm": 2.367823600769043, "learning_rate": 0.00024383587427065687, "loss": 3.7588, "step": 13610 }, { "epoch": 2.5635234330886503, "grad_norm": 2.214277982711792, "learning_rate": 0.00024364765669113495, "loss": 3.5693, "step": 13620 }, { "epoch": 2.5654056088838697, "grad_norm": 2.072028636932373, "learning_rate": 0.00024345943911161303, "loss": 3.8349, "step": 13630 }, { "epoch": 2.567287784679089, "grad_norm": 2.3604848384857178, "learning_rate": 0.0002432712215320911, "loss": 3.7526, "step": 13640 }, { "epoch": 2.5691699604743086, "grad_norm": 2.4565625190734863, "learning_rate": 0.00024308300395256918, "loss": 3.3296, "step": 13650 }, { "epoch": 2.5710521362695276, "grad_norm": 2.181539297103882, "learning_rate": 0.00024289478637304723, "loss": 3.4487, "step": 13660 }, { "epoch": 2.572934312064747, "grad_norm": 2.3154828548431396, "learning_rate": 0.0002427065687935253, "loss": 3.4625, "step": 13670 }, { "epoch": 2.574816487859966, "grad_norm": 2.2067558765411377, "learning_rate": 0.0002425183512140034, "loss": 3.6838, "step": 13680 }, { "epoch": 2.5766986636551854, "grad_norm": 2.6612298488616943, "learning_rate": 0.00024233013363448148, "loss": 3.4488, "step": 13690 }, { "epoch": 2.578580839450405, "grad_norm": 2.5054516792297363, "learning_rate": 0.00024214191605495953, "loss": 3.7849, "step": 13700 }, { "epoch": 2.580463015245624, "grad_norm": 2.342247247695923, "learning_rate": 0.0002419536984754376, "loss": 3.5564, "step": 13710 }, { "epoch": 2.5823451910408433, "grad_norm": 2.155902147293091, "learning_rate": 0.00024176548089591569, "loss": 3.5476, "step": 13720 }, { "epoch": 2.5842273668360622, "grad_norm": 2.3009862899780273, "learning_rate": 0.00024157726331639376, "loss": 3.4861, "step": 13730 }, { "epoch": 2.5861095426312817, "grad_norm": 1.888217806816101, "learning_rate": 0.0002413890457368718, "loss": 3.3693, "step": 13740 }, { "epoch": 2.587991718426501, "grad_norm": 2.2217419147491455, "learning_rate": 0.0002412008281573499, "loss": 3.828, "step": 13750 }, { "epoch": 2.5898738942217205, "grad_norm": 2.217092275619507, "learning_rate": 0.00024101261057782796, "loss": 3.3636, "step": 13760 }, { "epoch": 2.5917560700169395, "grad_norm": 2.1933679580688477, "learning_rate": 0.00024082439299830604, "loss": 3.4174, "step": 13770 }, { "epoch": 2.593638245812159, "grad_norm": 2.34127140045166, "learning_rate": 0.00024063617541878414, "loss": 3.4482, "step": 13780 }, { "epoch": 2.595520421607378, "grad_norm": 2.314049005508423, "learning_rate": 0.0002404479578392622, "loss": 3.3114, "step": 13790 }, { "epoch": 2.5974025974025974, "grad_norm": 2.2674081325531006, "learning_rate": 0.00024025974025974027, "loss": 3.637, "step": 13800 }, { "epoch": 2.599284773197817, "grad_norm": 2.1805338859558105, "learning_rate": 0.00024007152268021835, "loss": 3.6175, "step": 13810 }, { "epoch": 2.601166948993036, "grad_norm": 2.096508741378784, "learning_rate": 0.00023988330510069642, "loss": 3.8226, "step": 13820 }, { "epoch": 2.6030491247882552, "grad_norm": 2.3623275756835938, "learning_rate": 0.00023969508752117447, "loss": 3.6106, "step": 13830 }, { "epoch": 2.6049313005834747, "grad_norm": 2.232391119003296, "learning_rate": 0.00023950686994165255, "loss": 3.7322, "step": 13840 }, { "epoch": 2.6068134763786937, "grad_norm": 2.4925694465637207, "learning_rate": 0.00023931865236213063, "loss": 3.557, "step": 13850 }, { "epoch": 2.608695652173913, "grad_norm": 2.8005504608154297, "learning_rate": 0.0002391304347826087, "loss": 3.5394, "step": 13860 }, { "epoch": 2.6105778279691325, "grad_norm": 2.3423023223876953, "learning_rate": 0.00023894221720308678, "loss": 3.5762, "step": 13870 }, { "epoch": 2.6124600037643515, "grad_norm": 2.3541934490203857, "learning_rate": 0.00023875399962356483, "loss": 3.6592, "step": 13880 }, { "epoch": 2.614342179559571, "grad_norm": 2.138944387435913, "learning_rate": 0.0002385657820440429, "loss": 3.3244, "step": 13890 }, { "epoch": 2.61622435535479, "grad_norm": 2.374363899230957, "learning_rate": 0.000238377564464521, "loss": 3.4745, "step": 13900 }, { "epoch": 2.6181065311500094, "grad_norm": 2.2343404293060303, "learning_rate": 0.00023818934688499908, "loss": 3.4339, "step": 13910 }, { "epoch": 2.619988706945229, "grad_norm": 5.174120903015137, "learning_rate": 0.00023800112930547713, "loss": 3.4124, "step": 13920 }, { "epoch": 2.621870882740448, "grad_norm": 2.39028000831604, "learning_rate": 0.0002378129117259552, "loss": 3.404, "step": 13930 }, { "epoch": 2.623753058535667, "grad_norm": 2.3732285499572754, "learning_rate": 0.00023762469414643329, "loss": 3.6385, "step": 13940 }, { "epoch": 2.6256352343308866, "grad_norm": 2.5302624702453613, "learning_rate": 0.00023743647656691136, "loss": 3.4885, "step": 13950 }, { "epoch": 2.6275174101261056, "grad_norm": 2.292924642562866, "learning_rate": 0.0002372482589873894, "loss": 3.4066, "step": 13960 }, { "epoch": 2.629399585921325, "grad_norm": 2.267543315887451, "learning_rate": 0.0002370600414078675, "loss": 3.1002, "step": 13970 }, { "epoch": 2.6312817617165445, "grad_norm": 2.347487688064575, "learning_rate": 0.00023687182382834556, "loss": 3.5494, "step": 13980 }, { "epoch": 2.6331639375117635, "grad_norm": 2.2723233699798584, "learning_rate": 0.00023668360624882364, "loss": 3.5419, "step": 13990 }, { "epoch": 2.635046113306983, "grad_norm": 2.240119457244873, "learning_rate": 0.00023649538866930172, "loss": 3.5517, "step": 14000 }, { "epoch": 2.636928289102202, "grad_norm": 2.802652359008789, "learning_rate": 0.0002363071710897798, "loss": 3.4292, "step": 14010 }, { "epoch": 2.6388104648974213, "grad_norm": 2.53572678565979, "learning_rate": 0.00023611895351025787, "loss": 3.1934, "step": 14020 }, { "epoch": 2.6406926406926408, "grad_norm": 2.339966297149658, "learning_rate": 0.00023593073593073595, "loss": 3.5413, "step": 14030 }, { "epoch": 2.64257481648786, "grad_norm": 2.430595636367798, "learning_rate": 0.00023574251835121402, "loss": 3.3905, "step": 14040 }, { "epoch": 2.644456992283079, "grad_norm": 2.0948545932769775, "learning_rate": 0.00023555430077169207, "loss": 3.34, "step": 14050 }, { "epoch": 2.6463391680782986, "grad_norm": 2.448134422302246, "learning_rate": 0.00023536608319217015, "loss": 3.5613, "step": 14060 }, { "epoch": 2.6482213438735176, "grad_norm": 2.4141626358032227, "learning_rate": 0.00023517786561264823, "loss": 3.6671, "step": 14070 }, { "epoch": 2.650103519668737, "grad_norm": 2.3373255729675293, "learning_rate": 0.0002349896480331263, "loss": 3.8727, "step": 14080 }, { "epoch": 2.6519856954639565, "grad_norm": 2.106760025024414, "learning_rate": 0.00023480143045360435, "loss": 3.5622, "step": 14090 }, { "epoch": 2.6538678712591754, "grad_norm": 2.769371271133423, "learning_rate": 0.00023461321287408243, "loss": 3.4578, "step": 14100 }, { "epoch": 2.655750047054395, "grad_norm": 2.594217300415039, "learning_rate": 0.0002344249952945605, "loss": 3.3999, "step": 14110 }, { "epoch": 2.6576322228496143, "grad_norm": 2.42844820022583, "learning_rate": 0.0002342367777150386, "loss": 3.326, "step": 14120 }, { "epoch": 2.6595143986448333, "grad_norm": 2.072188138961792, "learning_rate": 0.00023404856013551668, "loss": 3.5705, "step": 14130 }, { "epoch": 2.6613965744400527, "grad_norm": 2.64915132522583, "learning_rate": 0.00023386034255599473, "loss": 3.5551, "step": 14140 }, { "epoch": 2.663278750235272, "grad_norm": 1.9408377408981323, "learning_rate": 0.0002336721249764728, "loss": 3.402, "step": 14150 }, { "epoch": 2.665160926030491, "grad_norm": 2.603893756866455, "learning_rate": 0.00023348390739695089, "loss": 3.5726, "step": 14160 }, { "epoch": 2.6670431018257106, "grad_norm": 2.6545052528381348, "learning_rate": 0.00023329568981742896, "loss": 3.5343, "step": 14170 }, { "epoch": 2.6689252776209296, "grad_norm": 2.0449416637420654, "learning_rate": 0.000233107472237907, "loss": 3.6172, "step": 14180 }, { "epoch": 2.670807453416149, "grad_norm": 2.6366517543792725, "learning_rate": 0.0002329192546583851, "loss": 3.4519, "step": 14190 }, { "epoch": 2.6726896292113684, "grad_norm": 2.118492841720581, "learning_rate": 0.00023273103707886316, "loss": 3.4333, "step": 14200 }, { "epoch": 2.674571805006588, "grad_norm": 2.4209890365600586, "learning_rate": 0.00023254281949934124, "loss": 3.5463, "step": 14210 }, { "epoch": 2.676453980801807, "grad_norm": 2.1032493114471436, "learning_rate": 0.0002323546019198193, "loss": 3.6745, "step": 14220 }, { "epoch": 2.6783361565970263, "grad_norm": 2.586094379425049, "learning_rate": 0.0002321663843402974, "loss": 3.475, "step": 14230 }, { "epoch": 2.6802183323922453, "grad_norm": 1.9233129024505615, "learning_rate": 0.00023197816676077547, "loss": 3.4387, "step": 14240 }, { "epoch": 2.6821005081874647, "grad_norm": 2.343977689743042, "learning_rate": 0.00023178994918125355, "loss": 3.7495, "step": 14250 }, { "epoch": 2.683982683982684, "grad_norm": 1.9606075286865234, "learning_rate": 0.00023160173160173162, "loss": 3.8449, "step": 14260 }, { "epoch": 2.685864859777903, "grad_norm": 2.037147045135498, "learning_rate": 0.00023141351402220967, "loss": 3.6515, "step": 14270 }, { "epoch": 2.6877470355731226, "grad_norm": 2.135225296020508, "learning_rate": 0.00023122529644268775, "loss": 3.4237, "step": 14280 }, { "epoch": 2.6896292113683415, "grad_norm": 2.5087924003601074, "learning_rate": 0.00023103707886316583, "loss": 3.654, "step": 14290 }, { "epoch": 2.691511387163561, "grad_norm": 2.4269285202026367, "learning_rate": 0.0002308488612836439, "loss": 3.333, "step": 14300 }, { "epoch": 2.6933935629587804, "grad_norm": 2.0803372859954834, "learning_rate": 0.00023066064370412195, "loss": 3.4516, "step": 14310 }, { "epoch": 2.695275738754, "grad_norm": 2.165856122970581, "learning_rate": 0.00023047242612460003, "loss": 3.6354, "step": 14320 }, { "epoch": 2.697157914549219, "grad_norm": 2.180093288421631, "learning_rate": 0.0002302842085450781, "loss": 3.4879, "step": 14330 }, { "epoch": 2.6990400903444383, "grad_norm": 2.306670904159546, "learning_rate": 0.0002300959909655562, "loss": 3.5027, "step": 14340 }, { "epoch": 2.7009222661396572, "grad_norm": 1.9162133932113647, "learning_rate": 0.00022990777338603426, "loss": 3.6179, "step": 14350 }, { "epoch": 2.7028044419348767, "grad_norm": 2.0616095066070557, "learning_rate": 0.00022971955580651233, "loss": 3.5577, "step": 14360 }, { "epoch": 2.704686617730096, "grad_norm": 2.197929859161377, "learning_rate": 0.0002295313382269904, "loss": 3.2005, "step": 14370 }, { "epoch": 2.7065687935253155, "grad_norm": 2.8560714721679688, "learning_rate": 0.00022934312064746849, "loss": 3.3947, "step": 14380 }, { "epoch": 2.7084509693205345, "grad_norm": 2.397538185119629, "learning_rate": 0.00022915490306794656, "loss": 3.4252, "step": 14390 }, { "epoch": 2.710333145115754, "grad_norm": 2.24406099319458, "learning_rate": 0.0002289666854884246, "loss": 3.3707, "step": 14400 }, { "epoch": 2.712215320910973, "grad_norm": 2.7690091133117676, "learning_rate": 0.0002287784679089027, "loss": 3.5741, "step": 14410 }, { "epoch": 2.7140974967061924, "grad_norm": 2.4531538486480713, "learning_rate": 0.00022859025032938076, "loss": 3.2768, "step": 14420 }, { "epoch": 2.715979672501412, "grad_norm": 2.0439727306365967, "learning_rate": 0.00022840203274985884, "loss": 3.2689, "step": 14430 }, { "epoch": 2.717861848296631, "grad_norm": 2.421132802963257, "learning_rate": 0.0002282138151703369, "loss": 3.2724, "step": 14440 }, { "epoch": 2.7197440240918502, "grad_norm": 2.507495164871216, "learning_rate": 0.000228025597590815, "loss": 3.2866, "step": 14450 }, { "epoch": 2.721626199887069, "grad_norm": 2.518876075744629, "learning_rate": 0.00022783738001129307, "loss": 3.4307, "step": 14460 }, { "epoch": 2.7235083756822887, "grad_norm": 2.889340400695801, "learning_rate": 0.00022764916243177115, "loss": 3.3949, "step": 14470 }, { "epoch": 2.725390551477508, "grad_norm": 3.0600709915161133, "learning_rate": 0.00022746094485224922, "loss": 3.5561, "step": 14480 }, { "epoch": 2.7272727272727275, "grad_norm": 2.439391851425171, "learning_rate": 0.00022727272727272727, "loss": 3.5498, "step": 14490 }, { "epoch": 2.7291549030679465, "grad_norm": 2.550637722015381, "learning_rate": 0.00022708450969320535, "loss": 3.344, "step": 14500 }, { "epoch": 2.731037078863166, "grad_norm": 2.25809645652771, "learning_rate": 0.00022689629211368343, "loss": 3.5153, "step": 14510 }, { "epoch": 2.732919254658385, "grad_norm": 2.413231611251831, "learning_rate": 0.0002267080745341615, "loss": 3.5793, "step": 14520 }, { "epoch": 2.7348014304536044, "grad_norm": 2.406266450881958, "learning_rate": 0.00022651985695463955, "loss": 3.3934, "step": 14530 }, { "epoch": 2.736683606248824, "grad_norm": 2.2091832160949707, "learning_rate": 0.00022633163937511763, "loss": 3.3904, "step": 14540 }, { "epoch": 2.7385657820440428, "grad_norm": 2.170200824737549, "learning_rate": 0.0002261434217955957, "loss": 3.6256, "step": 14550 }, { "epoch": 2.740447957839262, "grad_norm": 2.615487575531006, "learning_rate": 0.0002259552042160738, "loss": 3.6725, "step": 14560 }, { "epoch": 2.742330133634481, "grad_norm": 2.3170371055603027, "learning_rate": 0.00022576698663655186, "loss": 3.4314, "step": 14570 }, { "epoch": 2.7442123094297006, "grad_norm": 2.150153398513794, "learning_rate": 0.00022557876905702993, "loss": 3.4444, "step": 14580 }, { "epoch": 2.74609448522492, "grad_norm": 2.0498740673065186, "learning_rate": 0.000225390551477508, "loss": 3.578, "step": 14590 }, { "epoch": 2.7479766610201395, "grad_norm": 2.061810255050659, "learning_rate": 0.00022520233389798609, "loss": 3.4493, "step": 14600 }, { "epoch": 2.7498588368153585, "grad_norm": 2.162282705307007, "learning_rate": 0.00022501411631846416, "loss": 3.4586, "step": 14610 }, { "epoch": 2.751741012610578, "grad_norm": 2.4143502712249756, "learning_rate": 0.0002248258987389422, "loss": 3.603, "step": 14620 }, { "epoch": 2.753623188405797, "grad_norm": 2.4602837562561035, "learning_rate": 0.0002246376811594203, "loss": 3.4643, "step": 14630 }, { "epoch": 2.7555053642010163, "grad_norm": 2.0327515602111816, "learning_rate": 0.00022444946357989836, "loss": 3.3329, "step": 14640 }, { "epoch": 2.7573875399962358, "grad_norm": 3.8538384437561035, "learning_rate": 0.00022426124600037644, "loss": 3.4552, "step": 14650 }, { "epoch": 2.759269715791455, "grad_norm": 2.4830667972564697, "learning_rate": 0.0002240730284208545, "loss": 3.5332, "step": 14660 }, { "epoch": 2.761151891586674, "grad_norm": 2.0901987552642822, "learning_rate": 0.0002238848108413326, "loss": 3.5226, "step": 14670 }, { "epoch": 2.7630340673818936, "grad_norm": 3.789461374282837, "learning_rate": 0.00022369659326181067, "loss": 3.5308, "step": 14680 }, { "epoch": 2.7649162431771126, "grad_norm": 2.136140823364258, "learning_rate": 0.00022350837568228875, "loss": 3.6822, "step": 14690 }, { "epoch": 2.766798418972332, "grad_norm": 2.2833197116851807, "learning_rate": 0.0002233201581027668, "loss": 3.6, "step": 14700 }, { "epoch": 2.7686805947675515, "grad_norm": 2.147174596786499, "learning_rate": 0.00022313194052324487, "loss": 3.4034, "step": 14710 }, { "epoch": 2.7705627705627704, "grad_norm": 2.3008553981781006, "learning_rate": 0.00022294372294372295, "loss": 3.462, "step": 14720 }, { "epoch": 2.77244494635799, "grad_norm": 2.269583225250244, "learning_rate": 0.00022275550536420103, "loss": 3.3978, "step": 14730 }, { "epoch": 2.774327122153209, "grad_norm": 2.5759148597717285, "learning_rate": 0.0002225672877846791, "loss": 3.4445, "step": 14740 }, { "epoch": 2.7762092979484283, "grad_norm": 2.253596067428589, "learning_rate": 0.00022237907020515715, "loss": 3.4524, "step": 14750 }, { "epoch": 2.7780914737436477, "grad_norm": 2.3863766193389893, "learning_rate": 0.00022219085262563523, "loss": 3.5521, "step": 14760 }, { "epoch": 2.779973649538867, "grad_norm": 2.2985453605651855, "learning_rate": 0.0002220026350461133, "loss": 3.5765, "step": 14770 }, { "epoch": 2.781855825334086, "grad_norm": 3.2895071506500244, "learning_rate": 0.0002218144174665914, "loss": 3.4979, "step": 14780 }, { "epoch": 2.7837380011293056, "grad_norm": 2.392838478088379, "learning_rate": 0.00022162619988706946, "loss": 3.4905, "step": 14790 }, { "epoch": 2.7856201769245246, "grad_norm": 2.6074109077453613, "learning_rate": 0.00022143798230754753, "loss": 3.6035, "step": 14800 }, { "epoch": 2.787502352719744, "grad_norm": 2.218205690383911, "learning_rate": 0.0002212497647280256, "loss": 3.5234, "step": 14810 }, { "epoch": 2.7893845285149634, "grad_norm": 2.9633259773254395, "learning_rate": 0.00022106154714850369, "loss": 3.6222, "step": 14820 }, { "epoch": 2.7912667043101824, "grad_norm": 2.1877028942108154, "learning_rate": 0.00022087332956898174, "loss": 3.722, "step": 14830 }, { "epoch": 2.793148880105402, "grad_norm": 2.6830685138702393, "learning_rate": 0.0002206851119894598, "loss": 3.4448, "step": 14840 }, { "epoch": 2.795031055900621, "grad_norm": 2.181081771850586, "learning_rate": 0.0002204968944099379, "loss": 3.5549, "step": 14850 }, { "epoch": 2.7969132316958403, "grad_norm": 2.0416409969329834, "learning_rate": 0.00022030867683041596, "loss": 3.6628, "step": 14860 }, { "epoch": 2.7987954074910597, "grad_norm": 2.270547866821289, "learning_rate": 0.00022012045925089404, "loss": 3.378, "step": 14870 }, { "epoch": 2.800677583286279, "grad_norm": 3.774535655975342, "learning_rate": 0.0002199322416713721, "loss": 3.4061, "step": 14880 }, { "epoch": 2.802559759081498, "grad_norm": 2.869417428970337, "learning_rate": 0.0002197440240918502, "loss": 3.5096, "step": 14890 }, { "epoch": 2.8044419348767176, "grad_norm": 2.835237503051758, "learning_rate": 0.00021955580651232827, "loss": 3.3399, "step": 14900 }, { "epoch": 2.8063241106719365, "grad_norm": 2.811671495437622, "learning_rate": 0.00021936758893280635, "loss": 3.4577, "step": 14910 }, { "epoch": 2.808206286467156, "grad_norm": 2.3493731021881104, "learning_rate": 0.0002191793713532844, "loss": 3.5547, "step": 14920 }, { "epoch": 2.8100884622623754, "grad_norm": 2.611755132675171, "learning_rate": 0.00021899115377376247, "loss": 3.4613, "step": 14930 }, { "epoch": 2.811970638057595, "grad_norm": 2.213094711303711, "learning_rate": 0.00021880293619424055, "loss": 3.2579, "step": 14940 }, { "epoch": 2.813852813852814, "grad_norm": 3.3088784217834473, "learning_rate": 0.00021861471861471863, "loss": 3.4697, "step": 14950 }, { "epoch": 2.8157349896480333, "grad_norm": 2.3907647132873535, "learning_rate": 0.00021842650103519667, "loss": 3.5001, "step": 14960 }, { "epoch": 2.8176171654432522, "grad_norm": 2.6922473907470703, "learning_rate": 0.00021823828345567475, "loss": 3.2857, "step": 14970 }, { "epoch": 2.8194993412384717, "grad_norm": 2.418233633041382, "learning_rate": 0.00021805006587615283, "loss": 3.4984, "step": 14980 }, { "epoch": 2.821381517033691, "grad_norm": 2.222984790802002, "learning_rate": 0.0002178618482966309, "loss": 3.4617, "step": 14990 }, { "epoch": 2.82326369282891, "grad_norm": 2.219573974609375, "learning_rate": 0.000217673630717109, "loss": 3.5526, "step": 15000 }, { "epoch": 2.8251458686241295, "grad_norm": 2.521733522415161, "learning_rate": 0.00021748541313758706, "loss": 3.5159, "step": 15010 }, { "epoch": 2.8270280444193485, "grad_norm": 1.963568925857544, "learning_rate": 0.00021729719555806513, "loss": 3.5813, "step": 15020 }, { "epoch": 2.828910220214568, "grad_norm": 2.4848878383636475, "learning_rate": 0.0002171089779785432, "loss": 3.4577, "step": 15030 }, { "epoch": 2.8307923960097874, "grad_norm": 3.544420003890991, "learning_rate": 0.00021692076039902129, "loss": 3.4027, "step": 15040 }, { "epoch": 2.832674571805007, "grad_norm": 2.314347743988037, "learning_rate": 0.00021673254281949934, "loss": 3.5866, "step": 15050 }, { "epoch": 2.834556747600226, "grad_norm": 2.8646903038024902, "learning_rate": 0.0002165443252399774, "loss": 3.3809, "step": 15060 }, { "epoch": 2.8364389233954452, "grad_norm": 2.6082592010498047, "learning_rate": 0.0002163561076604555, "loss": 3.5331, "step": 15070 }, { "epoch": 2.838321099190664, "grad_norm": 2.5774693489074707, "learning_rate": 0.00021616789008093356, "loss": 3.484, "step": 15080 }, { "epoch": 2.8402032749858837, "grad_norm": 2.395798921585083, "learning_rate": 0.00021597967250141161, "loss": 3.4579, "step": 15090 }, { "epoch": 2.842085450781103, "grad_norm": 2.2384536266326904, "learning_rate": 0.0002157914549218897, "loss": 3.6846, "step": 15100 }, { "epoch": 2.843967626576322, "grad_norm": 1.9982672929763794, "learning_rate": 0.0002156032373423678, "loss": 3.3189, "step": 15110 }, { "epoch": 2.8458498023715415, "grad_norm": 2.47128963470459, "learning_rate": 0.00021541501976284587, "loss": 3.5565, "step": 15120 }, { "epoch": 2.847731978166761, "grad_norm": 2.4589641094207764, "learning_rate": 0.00021522680218332395, "loss": 3.4213, "step": 15130 }, { "epoch": 2.84961415396198, "grad_norm": 2.188739538192749, "learning_rate": 0.000215038584603802, "loss": 3.624, "step": 15140 }, { "epoch": 2.8514963297571994, "grad_norm": 2.4727349281311035, "learning_rate": 0.00021485036702428007, "loss": 3.5586, "step": 15150 }, { "epoch": 2.853378505552419, "grad_norm": 2.101638078689575, "learning_rate": 0.00021466214944475815, "loss": 3.2808, "step": 15160 }, { "epoch": 2.8552606813476378, "grad_norm": 2.9845190048217773, "learning_rate": 0.00021447393186523623, "loss": 3.5844, "step": 15170 }, { "epoch": 2.857142857142857, "grad_norm": 2.1420416831970215, "learning_rate": 0.00021428571428571427, "loss": 3.1567, "step": 15180 }, { "epoch": 2.859025032938076, "grad_norm": 2.4418587684631348, "learning_rate": 0.00021409749670619235, "loss": 3.3212, "step": 15190 }, { "epoch": 2.8609072087332956, "grad_norm": 3.390158176422119, "learning_rate": 0.00021390927912667043, "loss": 3.3359, "step": 15200 }, { "epoch": 2.862789384528515, "grad_norm": 2.4932174682617188, "learning_rate": 0.0002137210615471485, "loss": 3.5236, "step": 15210 }, { "epoch": 2.8646715603237345, "grad_norm": 2.0627095699310303, "learning_rate": 0.0002135328439676266, "loss": 3.3669, "step": 15220 }, { "epoch": 2.8665537361189535, "grad_norm": 3.4032983779907227, "learning_rate": 0.00021334462638810466, "loss": 3.4306, "step": 15230 }, { "epoch": 2.868435911914173, "grad_norm": 2.5139946937561035, "learning_rate": 0.00021315640880858273, "loss": 3.4435, "step": 15240 }, { "epoch": 2.870318087709392, "grad_norm": 2.587301254272461, "learning_rate": 0.0002129681912290608, "loss": 3.4574, "step": 15250 }, { "epoch": 2.8722002635046113, "grad_norm": 2.299530267715454, "learning_rate": 0.00021277997364953889, "loss": 3.5394, "step": 15260 }, { "epoch": 2.8740824392998308, "grad_norm": 1.9752168655395508, "learning_rate": 0.00021259175607001694, "loss": 3.2919, "step": 15270 }, { "epoch": 2.8759646150950497, "grad_norm": 2.2354319095611572, "learning_rate": 0.000212403538490495, "loss": 3.6547, "step": 15280 }, { "epoch": 2.877846790890269, "grad_norm": 2.2502505779266357, "learning_rate": 0.0002122153209109731, "loss": 3.2796, "step": 15290 }, { "epoch": 2.879728966685488, "grad_norm": 2.381803274154663, "learning_rate": 0.00021202710333145116, "loss": 3.5734, "step": 15300 }, { "epoch": 2.8816111424807076, "grad_norm": 2.6856446266174316, "learning_rate": 0.00021183888575192921, "loss": 3.5075, "step": 15310 }, { "epoch": 2.883493318275927, "grad_norm": 2.4393763542175293, "learning_rate": 0.0002116506681724073, "loss": 3.5841, "step": 15320 }, { "epoch": 2.8853754940711465, "grad_norm": 2.8853402137756348, "learning_rate": 0.00021146245059288537, "loss": 3.5743, "step": 15330 }, { "epoch": 2.8872576698663655, "grad_norm": 2.005941152572632, "learning_rate": 0.00021127423301336347, "loss": 3.4861, "step": 15340 }, { "epoch": 2.889139845661585, "grad_norm": 2.736389398574829, "learning_rate": 0.00021108601543384155, "loss": 3.3339, "step": 15350 }, { "epoch": 2.891022021456804, "grad_norm": 2.5647521018981934, "learning_rate": 0.0002108977978543196, "loss": 3.4417, "step": 15360 }, { "epoch": 2.8929041972520233, "grad_norm": 2.37971568107605, "learning_rate": 0.00021070958027479767, "loss": 3.3668, "step": 15370 }, { "epoch": 2.8947863730472427, "grad_norm": 2.441014289855957, "learning_rate": 0.00021052136269527575, "loss": 3.5414, "step": 15380 }, { "epoch": 2.8966685488424617, "grad_norm": 3.1848180294036865, "learning_rate": 0.00021033314511575383, "loss": 3.6509, "step": 15390 }, { "epoch": 2.898550724637681, "grad_norm": 2.5912351608276367, "learning_rate": 0.00021014492753623187, "loss": 3.2655, "step": 15400 }, { "epoch": 2.9004329004329006, "grad_norm": 3.1243958473205566, "learning_rate": 0.00020995670995670995, "loss": 3.6256, "step": 15410 }, { "epoch": 2.9023150762281196, "grad_norm": 2.3207552433013916, "learning_rate": 0.00020976849237718803, "loss": 3.1971, "step": 15420 }, { "epoch": 2.904197252023339, "grad_norm": 2.6017887592315674, "learning_rate": 0.0002095802747976661, "loss": 3.61, "step": 15430 }, { "epoch": 2.9060794278185584, "grad_norm": 2.248102903366089, "learning_rate": 0.00020939205721814415, "loss": 3.5052, "step": 15440 }, { "epoch": 2.9079616036137774, "grad_norm": 2.675691604614258, "learning_rate": 0.00020920383963862226, "loss": 3.5116, "step": 15450 }, { "epoch": 2.909843779408997, "grad_norm": 2.4814398288726807, "learning_rate": 0.00020901562205910033, "loss": 3.2238, "step": 15460 }, { "epoch": 2.911725955204216, "grad_norm": 2.3238048553466797, "learning_rate": 0.0002088274044795784, "loss": 3.3525, "step": 15470 }, { "epoch": 2.9136081309994353, "grad_norm": 2.466846466064453, "learning_rate": 0.00020863918690005649, "loss": 3.6302, "step": 15480 }, { "epoch": 2.9154903067946547, "grad_norm": 2.3691577911376953, "learning_rate": 0.00020845096932053454, "loss": 3.1262, "step": 15490 }, { "epoch": 2.917372482589874, "grad_norm": 2.283759593963623, "learning_rate": 0.0002082627517410126, "loss": 3.2809, "step": 15500 }, { "epoch": 2.919254658385093, "grad_norm": 2.8975417613983154, "learning_rate": 0.0002080745341614907, "loss": 3.6239, "step": 15510 }, { "epoch": 2.9211368341803126, "grad_norm": 2.6288318634033203, "learning_rate": 0.00020788631658196876, "loss": 3.6008, "step": 15520 }, { "epoch": 2.9230190099755315, "grad_norm": 2.5137789249420166, "learning_rate": 0.00020769809900244681, "loss": 3.4699, "step": 15530 }, { "epoch": 2.924901185770751, "grad_norm": 2.2313966751098633, "learning_rate": 0.0002075098814229249, "loss": 3.4451, "step": 15540 }, { "epoch": 2.9267833615659704, "grad_norm": 2.7938575744628906, "learning_rate": 0.00020732166384340297, "loss": 3.6125, "step": 15550 }, { "epoch": 2.9286655373611894, "grad_norm": 2.416111946105957, "learning_rate": 0.00020713344626388107, "loss": 3.5303, "step": 15560 }, { "epoch": 2.930547713156409, "grad_norm": 3.317490816116333, "learning_rate": 0.00020694522868435912, "loss": 3.3833, "step": 15570 }, { "epoch": 2.932429888951628, "grad_norm": 2.4095804691314697, "learning_rate": 0.0002067570111048372, "loss": 3.5556, "step": 15580 }, { "epoch": 2.9343120647468472, "grad_norm": 2.984218120574951, "learning_rate": 0.00020656879352531527, "loss": 3.2939, "step": 15590 }, { "epoch": 2.9361942405420667, "grad_norm": 3.039593458175659, "learning_rate": 0.00020638057594579335, "loss": 3.792, "step": 15600 }, { "epoch": 2.938076416337286, "grad_norm": 2.3210456371307373, "learning_rate": 0.00020619235836627143, "loss": 3.4311, "step": 15610 }, { "epoch": 2.939958592132505, "grad_norm": 3.5044655799865723, "learning_rate": 0.00020600414078674947, "loss": 3.6649, "step": 15620 }, { "epoch": 2.9418407679277245, "grad_norm": 2.563467264175415, "learning_rate": 0.00020581592320722755, "loss": 3.7411, "step": 15630 }, { "epoch": 2.9437229437229435, "grad_norm": 2.5478553771972656, "learning_rate": 0.00020562770562770563, "loss": 3.6339, "step": 15640 }, { "epoch": 2.945605119518163, "grad_norm": 2.3911826610565186, "learning_rate": 0.0002054394880481837, "loss": 3.588, "step": 15650 }, { "epoch": 2.9474872953133824, "grad_norm": 2.204178810119629, "learning_rate": 0.00020525127046866175, "loss": 3.2706, "step": 15660 }, { "epoch": 2.949369471108602, "grad_norm": 2.1773793697357178, "learning_rate": 0.00020506305288913986, "loss": 3.5213, "step": 15670 }, { "epoch": 2.951251646903821, "grad_norm": 2.37015962600708, "learning_rate": 0.00020487483530961793, "loss": 3.5603, "step": 15680 }, { "epoch": 2.9531338226990402, "grad_norm": 2.6659293174743652, "learning_rate": 0.000204686617730096, "loss": 3.2855, "step": 15690 }, { "epoch": 2.955015998494259, "grad_norm": 2.7114241123199463, "learning_rate": 0.00020449840015057406, "loss": 3.2266, "step": 15700 }, { "epoch": 2.9568981742894787, "grad_norm": 2.4781360626220703, "learning_rate": 0.00020431018257105214, "loss": 3.3296, "step": 15710 }, { "epoch": 2.958780350084698, "grad_norm": 2.8543200492858887, "learning_rate": 0.0002041219649915302, "loss": 3.384, "step": 15720 }, { "epoch": 2.960662525879917, "grad_norm": 2.0750515460968018, "learning_rate": 0.0002039337474120083, "loss": 3.3709, "step": 15730 }, { "epoch": 2.9625447016751365, "grad_norm": 2.2768306732177734, "learning_rate": 0.00020374552983248636, "loss": 3.5925, "step": 15740 }, { "epoch": 2.9644268774703555, "grad_norm": 3.9188880920410156, "learning_rate": 0.00020355731225296441, "loss": 3.3453, "step": 15750 }, { "epoch": 2.966309053265575, "grad_norm": 2.0652997493743896, "learning_rate": 0.0002033690946734425, "loss": 3.4702, "step": 15760 }, { "epoch": 2.9681912290607944, "grad_norm": 2.6457831859588623, "learning_rate": 0.00020318087709392057, "loss": 3.6202, "step": 15770 }, { "epoch": 2.970073404856014, "grad_norm": 2.8359549045562744, "learning_rate": 0.00020299265951439867, "loss": 3.4095, "step": 15780 }, { "epoch": 2.9719555806512328, "grad_norm": 2.460312843322754, "learning_rate": 0.00020280444193487672, "loss": 3.2642, "step": 15790 }, { "epoch": 2.973837756446452, "grad_norm": 2.7726497650146484, "learning_rate": 0.0002026162243553548, "loss": 3.2432, "step": 15800 }, { "epoch": 2.975719932241671, "grad_norm": 2.8087449073791504, "learning_rate": 0.00020242800677583287, "loss": 3.9875, "step": 15810 }, { "epoch": 2.9776021080368906, "grad_norm": 2.3682594299316406, "learning_rate": 0.00020223978919631095, "loss": 3.2547, "step": 15820 }, { "epoch": 2.97948428383211, "grad_norm": 2.968564748764038, "learning_rate": 0.00020205157161678903, "loss": 3.3891, "step": 15830 }, { "epoch": 2.981366459627329, "grad_norm": 2.396745204925537, "learning_rate": 0.00020186335403726707, "loss": 3.4696, "step": 15840 }, { "epoch": 2.9832486354225485, "grad_norm": 3.5899250507354736, "learning_rate": 0.00020167513645774515, "loss": 3.6297, "step": 15850 }, { "epoch": 2.9851308112177675, "grad_norm": 2.914106607437134, "learning_rate": 0.00020148691887822323, "loss": 3.2882, "step": 15860 }, { "epoch": 2.987012987012987, "grad_norm": 2.3831381797790527, "learning_rate": 0.0002012987012987013, "loss": 3.6653, "step": 15870 }, { "epoch": 2.9888951628082063, "grad_norm": 2.077132225036621, "learning_rate": 0.00020111048371917935, "loss": 3.3957, "step": 15880 }, { "epoch": 2.9907773386034258, "grad_norm": 2.5786349773406982, "learning_rate": 0.00020092226613965746, "loss": 3.5659, "step": 15890 }, { "epoch": 2.9926595143986447, "grad_norm": 2.3395864963531494, "learning_rate": 0.00020073404856013553, "loss": 3.2228, "step": 15900 }, { "epoch": 2.994541690193864, "grad_norm": 2.300260066986084, "learning_rate": 0.0002005458309806136, "loss": 3.336, "step": 15910 }, { "epoch": 2.996423865989083, "grad_norm": 3.540156364440918, "learning_rate": 0.00020035761340109166, "loss": 3.3551, "step": 15920 }, { "epoch": 2.9983060417843026, "grad_norm": 2.3956615924835205, "learning_rate": 0.00020016939582156974, "loss": 3.3675, "step": 15930 }, { "epoch": 3.0, "eval_accuracy": 0.166, "eval_loss": 3.4389488697052, "eval_runtime": 87.0069, "eval_samples_per_second": 86.2, "eval_steps_per_second": 10.781, "step": 15939 }, { "epoch": 3.000188217579522, "grad_norm": 2.3464903831481934, "learning_rate": 0.0001999811782420478, "loss": 3.5502, "step": 15940 }, { "epoch": 3.002070393374741, "grad_norm": 2.4275286197662354, "learning_rate": 0.0001997929606625259, "loss": 3.3322, "step": 15950 }, { "epoch": 3.0039525691699605, "grad_norm": 2.139082908630371, "learning_rate": 0.00019960474308300396, "loss": 3.2053, "step": 15960 }, { "epoch": 3.00583474496518, "grad_norm": 2.784451961517334, "learning_rate": 0.00019941652550348201, "loss": 3.5752, "step": 15970 }, { "epoch": 3.007716920760399, "grad_norm": 2.6467702388763428, "learning_rate": 0.0001992283079239601, "loss": 3.346, "step": 15980 }, { "epoch": 3.0095990965556183, "grad_norm": 2.888674736022949, "learning_rate": 0.00019904009034443817, "loss": 3.4121, "step": 15990 }, { "epoch": 3.0114812723508377, "grad_norm": 2.2215163707733154, "learning_rate": 0.00019885187276491627, "loss": 3.5202, "step": 16000 }, { "epoch": 3.0133634481460567, "grad_norm": 3.2592546939849854, "learning_rate": 0.00019866365518539432, "loss": 3.1009, "step": 16010 }, { "epoch": 3.015245623941276, "grad_norm": 2.3010451793670654, "learning_rate": 0.0001984754376058724, "loss": 3.3903, "step": 16020 }, { "epoch": 3.0171277997364956, "grad_norm": 2.2252073287963867, "learning_rate": 0.00019828722002635047, "loss": 3.3718, "step": 16030 }, { "epoch": 3.0190099755317146, "grad_norm": 2.8217551708221436, "learning_rate": 0.00019809900244682855, "loss": 3.2882, "step": 16040 }, { "epoch": 3.020892151326934, "grad_norm": 2.717163324356079, "learning_rate": 0.0001979107848673066, "loss": 3.1833, "step": 16050 }, { "epoch": 3.022774327122153, "grad_norm": 2.6875417232513428, "learning_rate": 0.00019772256728778467, "loss": 3.6372, "step": 16060 }, { "epoch": 3.0246565029173724, "grad_norm": 2.6113834381103516, "learning_rate": 0.00019753434970826275, "loss": 3.2069, "step": 16070 }, { "epoch": 3.026538678712592, "grad_norm": 3.7651712894439697, "learning_rate": 0.00019734613212874083, "loss": 3.5204, "step": 16080 }, { "epoch": 3.028420854507811, "grad_norm": 3.171018600463867, "learning_rate": 0.0001971579145492189, "loss": 3.4848, "step": 16090 }, { "epoch": 3.0303030303030303, "grad_norm": 2.6116936206817627, "learning_rate": 0.00019696969696969695, "loss": 3.525, "step": 16100 }, { "epoch": 3.0321852060982497, "grad_norm": 2.1383917331695557, "learning_rate": 0.00019678147939017506, "loss": 3.1989, "step": 16110 }, { "epoch": 3.0340673818934687, "grad_norm": 2.373936891555786, "learning_rate": 0.00019659326181065313, "loss": 3.3162, "step": 16120 }, { "epoch": 3.035949557688688, "grad_norm": 1.9612059593200684, "learning_rate": 0.0001964050442311312, "loss": 3.58, "step": 16130 }, { "epoch": 3.0378317334839076, "grad_norm": 2.311664342880249, "learning_rate": 0.00019621682665160926, "loss": 3.5013, "step": 16140 }, { "epoch": 3.0397139092791265, "grad_norm": 2.8051741123199463, "learning_rate": 0.00019602860907208734, "loss": 3.3944, "step": 16150 }, { "epoch": 3.041596085074346, "grad_norm": 2.781449794769287, "learning_rate": 0.0001958403914925654, "loss": 3.2989, "step": 16160 }, { "epoch": 3.0434782608695654, "grad_norm": 2.8339180946350098, "learning_rate": 0.0001956521739130435, "loss": 3.513, "step": 16170 }, { "epoch": 3.0453604366647844, "grad_norm": 2.3824877738952637, "learning_rate": 0.00019546395633352154, "loss": 3.4308, "step": 16180 }, { "epoch": 3.047242612460004, "grad_norm": 2.3761489391326904, "learning_rate": 0.00019527573875399961, "loss": 3.2124, "step": 16190 }, { "epoch": 3.049124788255223, "grad_norm": 2.7145802974700928, "learning_rate": 0.0001950875211744777, "loss": 3.2744, "step": 16200 }, { "epoch": 3.0510069640504422, "grad_norm": 3.7028284072875977, "learning_rate": 0.00019489930359495577, "loss": 3.3786, "step": 16210 }, { "epoch": 3.0528891398456617, "grad_norm": 2.8431806564331055, "learning_rate": 0.00019471108601543387, "loss": 3.5511, "step": 16220 }, { "epoch": 3.0547713156408807, "grad_norm": 2.4497873783111572, "learning_rate": 0.00019452286843591192, "loss": 3.4525, "step": 16230 }, { "epoch": 3.0566534914361, "grad_norm": 3.198949098587036, "learning_rate": 0.00019433465085639, "loss": 3.2419, "step": 16240 }, { "epoch": 3.0585356672313195, "grad_norm": 2.400282859802246, "learning_rate": 0.00019414643327686807, "loss": 3.2808, "step": 16250 }, { "epoch": 3.0604178430265385, "grad_norm": 3.089738607406616, "learning_rate": 0.00019395821569734615, "loss": 3.5481, "step": 16260 }, { "epoch": 3.062300018821758, "grad_norm": 2.3858842849731445, "learning_rate": 0.0001937699981178242, "loss": 3.4233, "step": 16270 }, { "epoch": 3.0641821946169774, "grad_norm": 2.711099863052368, "learning_rate": 0.00019358178053830227, "loss": 3.7983, "step": 16280 }, { "epoch": 3.0660643704121964, "grad_norm": 2.758885622024536, "learning_rate": 0.00019339356295878035, "loss": 3.5862, "step": 16290 }, { "epoch": 3.067946546207416, "grad_norm": 2.564115285873413, "learning_rate": 0.00019320534537925843, "loss": 3.4485, "step": 16300 }, { "epoch": 3.0698287220026352, "grad_norm": 2.551232099533081, "learning_rate": 0.00019301712779973648, "loss": 3.2021, "step": 16310 }, { "epoch": 3.0717108977978542, "grad_norm": 2.466330051422119, "learning_rate": 0.00019282891022021455, "loss": 3.5043, "step": 16320 }, { "epoch": 3.0735930735930737, "grad_norm": 2.2110447883605957, "learning_rate": 0.00019264069264069266, "loss": 3.5946, "step": 16330 }, { "epoch": 3.075475249388293, "grad_norm": 2.7100703716278076, "learning_rate": 0.00019245247506117073, "loss": 3.4292, "step": 16340 }, { "epoch": 3.077357425183512, "grad_norm": 2.8344309329986572, "learning_rate": 0.0001922642574816488, "loss": 3.4147, "step": 16350 }, { "epoch": 3.0792396009787315, "grad_norm": 2.4452314376831055, "learning_rate": 0.00019207603990212686, "loss": 3.2555, "step": 16360 }, { "epoch": 3.0811217767739505, "grad_norm": 2.6006791591644287, "learning_rate": 0.00019188782232260494, "loss": 3.4488, "step": 16370 }, { "epoch": 3.08300395256917, "grad_norm": 2.4451346397399902, "learning_rate": 0.000191699604743083, "loss": 3.3258, "step": 16380 }, { "epoch": 3.0848861283643894, "grad_norm": 2.379737615585327, "learning_rate": 0.0001915113871635611, "loss": 3.6737, "step": 16390 }, { "epoch": 3.0867683041596083, "grad_norm": 2.5672545433044434, "learning_rate": 0.00019132316958403914, "loss": 3.4565, "step": 16400 }, { "epoch": 3.0886504799548278, "grad_norm": 2.4402852058410645, "learning_rate": 0.00019113495200451721, "loss": 3.6682, "step": 16410 }, { "epoch": 3.090532655750047, "grad_norm": 3.9088499546051025, "learning_rate": 0.0001909467344249953, "loss": 3.2161, "step": 16420 }, { "epoch": 3.092414831545266, "grad_norm": 2.7074153423309326, "learning_rate": 0.00019075851684547337, "loss": 3.1454, "step": 16430 }, { "epoch": 3.0942970073404856, "grad_norm": 2.2738146781921387, "learning_rate": 0.00019057029926595144, "loss": 3.196, "step": 16440 }, { "epoch": 3.096179183135705, "grad_norm": 2.12372088432312, "learning_rate": 0.00019038208168642952, "loss": 3.172, "step": 16450 }, { "epoch": 3.098061358930924, "grad_norm": 2.799208641052246, "learning_rate": 0.0001901938641069076, "loss": 3.3201, "step": 16460 }, { "epoch": 3.0999435347261435, "grad_norm": 2.7520501613616943, "learning_rate": 0.00019000564652738567, "loss": 3.4104, "step": 16470 }, { "epoch": 3.1018257105213625, "grad_norm": 2.407956838607788, "learning_rate": 0.00018981742894786375, "loss": 3.2854, "step": 16480 }, { "epoch": 3.103707886316582, "grad_norm": 2.214393377304077, "learning_rate": 0.0001896292113683418, "loss": 3.1286, "step": 16490 }, { "epoch": 3.1055900621118013, "grad_norm": 2.534752130508423, "learning_rate": 0.00018944099378881987, "loss": 3.4013, "step": 16500 }, { "epoch": 3.1074722379070203, "grad_norm": 2.607753038406372, "learning_rate": 0.00018925277620929795, "loss": 3.5454, "step": 16510 }, { "epoch": 3.1093544137022397, "grad_norm": 2.6956064701080322, "learning_rate": 0.00018906455862977603, "loss": 3.354, "step": 16520 }, { "epoch": 3.111236589497459, "grad_norm": 2.5441951751708984, "learning_rate": 0.00018887634105025408, "loss": 3.5804, "step": 16530 }, { "epoch": 3.113118765292678, "grad_norm": 2.281205177307129, "learning_rate": 0.00018868812347073215, "loss": 3.3036, "step": 16540 }, { "epoch": 3.1150009410878976, "grad_norm": 2.4613826274871826, "learning_rate": 0.00018849990589121026, "loss": 3.4355, "step": 16550 }, { "epoch": 3.116883116883117, "grad_norm": 2.2320241928100586, "learning_rate": 0.00018831168831168833, "loss": 3.5351, "step": 16560 }, { "epoch": 3.118765292678336, "grad_norm": 2.5939552783966064, "learning_rate": 0.0001881234707321664, "loss": 3.3774, "step": 16570 }, { "epoch": 3.1206474684735555, "grad_norm": 2.816632032394409, "learning_rate": 0.00018793525315264446, "loss": 3.5098, "step": 16580 }, { "epoch": 3.122529644268775, "grad_norm": 2.7928173542022705, "learning_rate": 0.00018774703557312254, "loss": 3.0776, "step": 16590 }, { "epoch": 3.124411820063994, "grad_norm": 1.9923349618911743, "learning_rate": 0.0001875588179936006, "loss": 3.6454, "step": 16600 }, { "epoch": 3.1262939958592133, "grad_norm": 2.296696901321411, "learning_rate": 0.0001873706004140787, "loss": 3.527, "step": 16610 }, { "epoch": 3.1281761716544327, "grad_norm": 2.5350286960601807, "learning_rate": 0.00018718238283455674, "loss": 3.5192, "step": 16620 }, { "epoch": 3.1300583474496517, "grad_norm": 2.438202381134033, "learning_rate": 0.00018699416525503481, "loss": 3.3552, "step": 16630 }, { "epoch": 3.131940523244871, "grad_norm": 2.678471565246582, "learning_rate": 0.0001868059476755129, "loss": 3.5464, "step": 16640 }, { "epoch": 3.13382269904009, "grad_norm": 2.5259385108947754, "learning_rate": 0.00018661773009599097, "loss": 3.308, "step": 16650 }, { "epoch": 3.1357048748353096, "grad_norm": 2.369147539138794, "learning_rate": 0.00018642951251646904, "loss": 3.3693, "step": 16660 }, { "epoch": 3.137587050630529, "grad_norm": 2.7190983295440674, "learning_rate": 0.00018624129493694712, "loss": 3.3369, "step": 16670 }, { "epoch": 3.139469226425748, "grad_norm": 3.0043201446533203, "learning_rate": 0.0001860530773574252, "loss": 3.4178, "step": 16680 }, { "epoch": 3.1413514022209674, "grad_norm": 2.472367525100708, "learning_rate": 0.00018586485977790327, "loss": 3.9501, "step": 16690 }, { "epoch": 3.143233578016187, "grad_norm": 2.0951249599456787, "learning_rate": 0.00018567664219838135, "loss": 3.4487, "step": 16700 }, { "epoch": 3.145115753811406, "grad_norm": 2.3645644187927246, "learning_rate": 0.0001854884246188594, "loss": 3.5567, "step": 16710 }, { "epoch": 3.1469979296066253, "grad_norm": 2.6005163192749023, "learning_rate": 0.00018530020703933747, "loss": 3.1231, "step": 16720 }, { "epoch": 3.1488801054018447, "grad_norm": 2.6122794151306152, "learning_rate": 0.00018511198945981555, "loss": 2.9662, "step": 16730 }, { "epoch": 3.1507622811970637, "grad_norm": 2.8639791011810303, "learning_rate": 0.00018492377188029363, "loss": 3.5101, "step": 16740 }, { "epoch": 3.152644456992283, "grad_norm": 2.615168571472168, "learning_rate": 0.00018473555430077168, "loss": 3.3929, "step": 16750 }, { "epoch": 3.1545266327875026, "grad_norm": 2.6724283695220947, "learning_rate": 0.00018454733672124975, "loss": 3.3804, "step": 16760 }, { "epoch": 3.1564088085827215, "grad_norm": 2.6021275520324707, "learning_rate": 0.00018435911914172786, "loss": 3.5446, "step": 16770 }, { "epoch": 3.158290984377941, "grad_norm": 2.5110085010528564, "learning_rate": 0.00018417090156220593, "loss": 3.372, "step": 16780 }, { "epoch": 3.16017316017316, "grad_norm": 2.4576356410980225, "learning_rate": 0.00018398268398268398, "loss": 3.3233, "step": 16790 }, { "epoch": 3.1620553359683794, "grad_norm": 2.512538433074951, "learning_rate": 0.00018379446640316206, "loss": 3.5851, "step": 16800 }, { "epoch": 3.163937511763599, "grad_norm": 2.883690595626831, "learning_rate": 0.00018360624882364014, "loss": 3.7707, "step": 16810 }, { "epoch": 3.165819687558818, "grad_norm": 2.1599953174591064, "learning_rate": 0.0001834180312441182, "loss": 3.273, "step": 16820 }, { "epoch": 3.1677018633540373, "grad_norm": 2.552882432937622, "learning_rate": 0.0001832298136645963, "loss": 3.1311, "step": 16830 }, { "epoch": 3.1695840391492567, "grad_norm": 3.2530431747436523, "learning_rate": 0.00018304159608507434, "loss": 3.5942, "step": 16840 }, { "epoch": 3.1714662149444757, "grad_norm": 1.7661073207855225, "learning_rate": 0.00018285337850555241, "loss": 3.3458, "step": 16850 }, { "epoch": 3.173348390739695, "grad_norm": 2.1382436752319336, "learning_rate": 0.0001826651609260305, "loss": 2.9106, "step": 16860 }, { "epoch": 3.1752305665349145, "grad_norm": 2.80940580368042, "learning_rate": 0.00018247694334650857, "loss": 3.3798, "step": 16870 }, { "epoch": 3.1771127423301335, "grad_norm": 2.5640830993652344, "learning_rate": 0.00018228872576698662, "loss": 3.1892, "step": 16880 }, { "epoch": 3.178994918125353, "grad_norm": 2.5059783458709717, "learning_rate": 0.00018210050818746472, "loss": 3.5314, "step": 16890 }, { "epoch": 3.1808770939205724, "grad_norm": 2.6395926475524902, "learning_rate": 0.0001819122906079428, "loss": 3.5268, "step": 16900 }, { "epoch": 3.1827592697157914, "grad_norm": 2.771196126937866, "learning_rate": 0.00018172407302842087, "loss": 3.3903, "step": 16910 }, { "epoch": 3.184641445511011, "grad_norm": 2.2118568420410156, "learning_rate": 0.00018153585544889892, "loss": 3.5801, "step": 16920 }, { "epoch": 3.18652362130623, "grad_norm": 3.436131238937378, "learning_rate": 0.000181347637869377, "loss": 3.2285, "step": 16930 }, { "epoch": 3.1884057971014492, "grad_norm": 2.3162379264831543, "learning_rate": 0.00018115942028985507, "loss": 3.5493, "step": 16940 }, { "epoch": 3.1902879728966687, "grad_norm": 2.5042316913604736, "learning_rate": 0.00018097120271033315, "loss": 3.8026, "step": 16950 }, { "epoch": 3.1921701486918876, "grad_norm": 2.483301877975464, "learning_rate": 0.00018078298513081123, "loss": 3.2815, "step": 16960 }, { "epoch": 3.194052324487107, "grad_norm": 2.8055741786956787, "learning_rate": 0.00018059476755128928, "loss": 3.1948, "step": 16970 }, { "epoch": 3.1959345002823265, "grad_norm": 3.0562124252319336, "learning_rate": 0.00018040654997176735, "loss": 3.2669, "step": 16980 }, { "epoch": 3.1978166760775455, "grad_norm": 2.5893020629882812, "learning_rate": 0.00018021833239224543, "loss": 3.4459, "step": 16990 }, { "epoch": 3.199698851872765, "grad_norm": 3.113239049911499, "learning_rate": 0.00018003011481272353, "loss": 3.3262, "step": 17000 }, { "epoch": 3.2015810276679844, "grad_norm": 2.3117594718933105, "learning_rate": 0.00017984189723320158, "loss": 3.1799, "step": 17010 }, { "epoch": 3.2034632034632033, "grad_norm": 2.710742712020874, "learning_rate": 0.00017965367965367966, "loss": 3.4581, "step": 17020 }, { "epoch": 3.2053453792584228, "grad_norm": 2.3555855751037598, "learning_rate": 0.00017946546207415774, "loss": 3.1941, "step": 17030 }, { "epoch": 3.207227555053642, "grad_norm": 2.6905875205993652, "learning_rate": 0.0001792772444946358, "loss": 3.4441, "step": 17040 }, { "epoch": 3.209109730848861, "grad_norm": 2.401326894760132, "learning_rate": 0.00017908902691511386, "loss": 3.2281, "step": 17050 }, { "epoch": 3.2109919066440806, "grad_norm": 2.5542731285095215, "learning_rate": 0.00017890080933559194, "loss": 3.5194, "step": 17060 }, { "epoch": 3.2128740824393, "grad_norm": 2.9633452892303467, "learning_rate": 0.00017871259175607001, "loss": 3.4064, "step": 17070 }, { "epoch": 3.214756258234519, "grad_norm": 2.62150502204895, "learning_rate": 0.0001785243741765481, "loss": 3.4583, "step": 17080 }, { "epoch": 3.2166384340297385, "grad_norm": 2.9389212131500244, "learning_rate": 0.00017833615659702617, "loss": 3.4426, "step": 17090 }, { "epoch": 3.2185206098249575, "grad_norm": 2.7606170177459717, "learning_rate": 0.00017814793901750422, "loss": 3.3341, "step": 17100 }, { "epoch": 3.220402785620177, "grad_norm": 3.0567126274108887, "learning_rate": 0.00017795972143798232, "loss": 3.1219, "step": 17110 }, { "epoch": 3.2222849614153963, "grad_norm": 2.618769645690918, "learning_rate": 0.0001777715038584604, "loss": 3.3479, "step": 17120 }, { "epoch": 3.2241671372106153, "grad_norm": 3.627694606781006, "learning_rate": 0.00017758328627893847, "loss": 3.4016, "step": 17130 }, { "epoch": 3.2260493130058348, "grad_norm": 2.895933151245117, "learning_rate": 0.00017739506869941652, "loss": 3.4261, "step": 17140 }, { "epoch": 3.227931488801054, "grad_norm": 2.9938414096832275, "learning_rate": 0.0001772068511198946, "loss": 3.2197, "step": 17150 }, { "epoch": 3.229813664596273, "grad_norm": 2.576021194458008, "learning_rate": 0.00017701863354037267, "loss": 3.6523, "step": 17160 }, { "epoch": 3.2316958403914926, "grad_norm": 2.8756418228149414, "learning_rate": 0.00017683041596085075, "loss": 3.3199, "step": 17170 }, { "epoch": 3.233578016186712, "grad_norm": 4.0849289894104, "learning_rate": 0.00017664219838132883, "loss": 3.7093, "step": 17180 }, { "epoch": 3.235460191981931, "grad_norm": 2.724503993988037, "learning_rate": 0.00017645398080180688, "loss": 3.3134, "step": 17190 }, { "epoch": 3.2373423677771505, "grad_norm": 3.1503915786743164, "learning_rate": 0.00017626576322228495, "loss": 3.5615, "step": 17200 }, { "epoch": 3.2392245435723694, "grad_norm": 2.5145812034606934, "learning_rate": 0.00017607754564276303, "loss": 3.3714, "step": 17210 }, { "epoch": 3.241106719367589, "grad_norm": 2.7142868041992188, "learning_rate": 0.00017588932806324113, "loss": 3.6091, "step": 17220 }, { "epoch": 3.2429888951628083, "grad_norm": 3.1583940982818604, "learning_rate": 0.00017570111048371918, "loss": 3.3126, "step": 17230 }, { "epoch": 3.2448710709580273, "grad_norm": 2.589491128921509, "learning_rate": 0.00017551289290419726, "loss": 3.4296, "step": 17240 }, { "epoch": 3.2467532467532467, "grad_norm": 2.6531646251678467, "learning_rate": 0.00017532467532467534, "loss": 3.2807, "step": 17250 }, { "epoch": 3.248635422548466, "grad_norm": 2.35854434967041, "learning_rate": 0.0001751364577451534, "loss": 3.1465, "step": 17260 }, { "epoch": 3.250517598343685, "grad_norm": 2.510300636291504, "learning_rate": 0.00017494824016563146, "loss": 3.6213, "step": 17270 }, { "epoch": 3.2523997741389046, "grad_norm": 2.660430669784546, "learning_rate": 0.00017476002258610954, "loss": 3.7127, "step": 17280 }, { "epoch": 3.254281949934124, "grad_norm": 2.52522349357605, "learning_rate": 0.00017457180500658761, "loss": 3.3203, "step": 17290 }, { "epoch": 3.256164125729343, "grad_norm": 2.3675825595855713, "learning_rate": 0.0001743835874270657, "loss": 3.6297, "step": 17300 }, { "epoch": 3.2580463015245624, "grad_norm": 2.6578762531280518, "learning_rate": 0.00017419536984754377, "loss": 3.3509, "step": 17310 }, { "epoch": 3.259928477319782, "grad_norm": 2.3674263954162598, "learning_rate": 0.00017400715226802182, "loss": 3.1971, "step": 17320 }, { "epoch": 3.261810653115001, "grad_norm": 2.461933135986328, "learning_rate": 0.00017381893468849992, "loss": 3.2012, "step": 17330 }, { "epoch": 3.2636928289102203, "grad_norm": 3.2475147247314453, "learning_rate": 0.000173630717108978, "loss": 3.4897, "step": 17340 }, { "epoch": 3.2655750047054397, "grad_norm": 2.4707467555999756, "learning_rate": 0.00017344249952945607, "loss": 3.4132, "step": 17350 }, { "epoch": 3.2674571805006587, "grad_norm": 2.574896812438965, "learning_rate": 0.00017325428194993412, "loss": 3.4592, "step": 17360 }, { "epoch": 3.269339356295878, "grad_norm": 2.799384117126465, "learning_rate": 0.0001730660643704122, "loss": 3.4034, "step": 17370 }, { "epoch": 3.271221532091097, "grad_norm": 3.077925205230713, "learning_rate": 0.00017287784679089027, "loss": 3.4267, "step": 17380 }, { "epoch": 3.2731037078863165, "grad_norm": 2.2687697410583496, "learning_rate": 0.00017268962921136835, "loss": 3.4686, "step": 17390 }, { "epoch": 3.274985883681536, "grad_norm": 2.218032121658325, "learning_rate": 0.0001725014116318464, "loss": 3.1861, "step": 17400 }, { "epoch": 3.276868059476755, "grad_norm": 2.9106338024139404, "learning_rate": 0.00017231319405232448, "loss": 3.5033, "step": 17410 }, { "epoch": 3.2787502352719744, "grad_norm": 3.4965925216674805, "learning_rate": 0.00017212497647280255, "loss": 3.5055, "step": 17420 }, { "epoch": 3.280632411067194, "grad_norm": 2.3185698986053467, "learning_rate": 0.00017193675889328063, "loss": 3.4843, "step": 17430 }, { "epoch": 3.282514586862413, "grad_norm": 2.887524366378784, "learning_rate": 0.00017174854131375873, "loss": 3.5924, "step": 17440 }, { "epoch": 3.2843967626576323, "grad_norm": 3.108182668685913, "learning_rate": 0.00017156032373423678, "loss": 3.3346, "step": 17450 }, { "epoch": 3.2862789384528517, "grad_norm": 3.007628917694092, "learning_rate": 0.00017137210615471486, "loss": 3.2649, "step": 17460 }, { "epoch": 3.2881611142480707, "grad_norm": 2.5218400955200195, "learning_rate": 0.00017118388857519294, "loss": 3.2575, "step": 17470 }, { "epoch": 3.29004329004329, "grad_norm": 4.477243900299072, "learning_rate": 0.000170995670995671, "loss": 3.343, "step": 17480 }, { "epoch": 3.291925465838509, "grad_norm": 4.656927108764648, "learning_rate": 0.00017080745341614906, "loss": 3.32, "step": 17490 }, { "epoch": 3.2938076416337285, "grad_norm": 3.0149948596954346, "learning_rate": 0.00017061923583662714, "loss": 3.6426, "step": 17500 }, { "epoch": 3.295689817428948, "grad_norm": 2.364891529083252, "learning_rate": 0.00017043101825710521, "loss": 3.4555, "step": 17510 }, { "epoch": 3.2975719932241674, "grad_norm": 2.841169595718384, "learning_rate": 0.0001702428006775833, "loss": 2.9926, "step": 17520 }, { "epoch": 3.2994541690193864, "grad_norm": 2.4284143447875977, "learning_rate": 0.00017005458309806134, "loss": 3.4719, "step": 17530 }, { "epoch": 3.301336344814606, "grad_norm": 2.8778367042541504, "learning_rate": 0.00016986636551853942, "loss": 3.2167, "step": 17540 }, { "epoch": 3.303218520609825, "grad_norm": 3.5048258304595947, "learning_rate": 0.00016967814793901752, "loss": 3.2399, "step": 17550 }, { "epoch": 3.3051006964050442, "grad_norm": 4.433194160461426, "learning_rate": 0.0001694899303594956, "loss": 3.3544, "step": 17560 }, { "epoch": 3.3069828722002637, "grad_norm": 3.8717234134674072, "learning_rate": 0.00016930171277997367, "loss": 3.3017, "step": 17570 }, { "epoch": 3.3088650479954826, "grad_norm": 2.5483531951904297, "learning_rate": 0.00016911349520045172, "loss": 3.4301, "step": 17580 }, { "epoch": 3.310747223790702, "grad_norm": 2.6057353019714355, "learning_rate": 0.0001689252776209298, "loss": 3.9394, "step": 17590 }, { "epoch": 3.3126293995859215, "grad_norm": 3.1346988677978516, "learning_rate": 0.00016873706004140787, "loss": 3.4337, "step": 17600 }, { "epoch": 3.3145115753811405, "grad_norm": 2.919283151626587, "learning_rate": 0.00016854884246188595, "loss": 3.105, "step": 17610 }, { "epoch": 3.31639375117636, "grad_norm": 2.9236936569213867, "learning_rate": 0.000168360624882364, "loss": 3.6559, "step": 17620 }, { "epoch": 3.3182759269715794, "grad_norm": 2.968472719192505, "learning_rate": 0.00016817240730284208, "loss": 3.5618, "step": 17630 }, { "epoch": 3.3201581027667983, "grad_norm": 3.1104965209960938, "learning_rate": 0.00016798418972332015, "loss": 3.2912, "step": 17640 }, { "epoch": 3.322040278562018, "grad_norm": 3.617663860321045, "learning_rate": 0.00016779597214379823, "loss": 3.116, "step": 17650 }, { "epoch": 3.3239224543572368, "grad_norm": 2.541982650756836, "learning_rate": 0.0001676077545642763, "loss": 3.6969, "step": 17660 }, { "epoch": 3.325804630152456, "grad_norm": 2.6385083198547363, "learning_rate": 0.00016741953698475438, "loss": 3.2644, "step": 17670 }, { "epoch": 3.3276868059476756, "grad_norm": 2.365657329559326, "learning_rate": 0.00016723131940523246, "loss": 3.3977, "step": 17680 }, { "epoch": 3.3295689817428946, "grad_norm": 3.0871853828430176, "learning_rate": 0.00016704310182571054, "loss": 3.4599, "step": 17690 }, { "epoch": 3.331451157538114, "grad_norm": 2.563349962234497, "learning_rate": 0.0001668548842461886, "loss": 3.3422, "step": 17700 }, { "epoch": 3.3333333333333335, "grad_norm": 3.314103126525879, "learning_rate": 0.00016666666666666666, "loss": 3.5439, "step": 17710 }, { "epoch": 3.3352155091285525, "grad_norm": 2.4972023963928223, "learning_rate": 0.00016647844908714474, "loss": 3.2783, "step": 17720 }, { "epoch": 3.337097684923772, "grad_norm": 3.040225028991699, "learning_rate": 0.00016629023150762281, "loss": 3.3601, "step": 17730 }, { "epoch": 3.3389798607189913, "grad_norm": 2.6701977252960205, "learning_rate": 0.0001661020139281009, "loss": 3.6541, "step": 17740 }, { "epoch": 3.3408620365142103, "grad_norm": 2.771026372909546, "learning_rate": 0.00016591379634857894, "loss": 3.6395, "step": 17750 }, { "epoch": 3.3427442123094298, "grad_norm": 2.5545403957366943, "learning_rate": 0.00016572557876905702, "loss": 3.4282, "step": 17760 }, { "epoch": 3.3446263881046487, "grad_norm": 3.2837064266204834, "learning_rate": 0.00016553736118953512, "loss": 3.3282, "step": 17770 }, { "epoch": 3.346508563899868, "grad_norm": 2.680157423019409, "learning_rate": 0.0001653491436100132, "loss": 3.5378, "step": 17780 }, { "epoch": 3.3483907396950876, "grad_norm": 2.6319072246551514, "learning_rate": 0.00016516092603049125, "loss": 3.3281, "step": 17790 }, { "epoch": 3.350272915490307, "grad_norm": 2.3553097248077393, "learning_rate": 0.00016497270845096932, "loss": 3.3224, "step": 17800 }, { "epoch": 3.352155091285526, "grad_norm": 2.488459348678589, "learning_rate": 0.0001647844908714474, "loss": 3.2201, "step": 17810 }, { "epoch": 3.3540372670807455, "grad_norm": 2.488111972808838, "learning_rate": 0.00016459627329192547, "loss": 3.3382, "step": 17820 }, { "epoch": 3.3559194428759644, "grad_norm": 3.3139944076538086, "learning_rate": 0.00016440805571240355, "loss": 3.5223, "step": 17830 }, { "epoch": 3.357801618671184, "grad_norm": 2.8468685150146484, "learning_rate": 0.0001642198381328816, "loss": 3.5038, "step": 17840 }, { "epoch": 3.3596837944664033, "grad_norm": 2.1689863204956055, "learning_rate": 0.00016403162055335968, "loss": 3.2975, "step": 17850 }, { "epoch": 3.3615659702616223, "grad_norm": 2.7081663608551025, "learning_rate": 0.00016384340297383775, "loss": 3.3812, "step": 17860 }, { "epoch": 3.3634481460568417, "grad_norm": 3.3801076412200928, "learning_rate": 0.00016365518539431583, "loss": 3.347, "step": 17870 }, { "epoch": 3.365330321852061, "grad_norm": 2.969923973083496, "learning_rate": 0.0001634669678147939, "loss": 3.4156, "step": 17880 }, { "epoch": 3.36721249764728, "grad_norm": 2.6103897094726562, "learning_rate": 0.00016327875023527198, "loss": 3.1598, "step": 17890 }, { "epoch": 3.3690946734424996, "grad_norm": 2.7593328952789307, "learning_rate": 0.00016309053265575006, "loss": 3.1087, "step": 17900 }, { "epoch": 3.370976849237719, "grad_norm": 2.5040123462677, "learning_rate": 0.00016290231507622814, "loss": 3.3374, "step": 17910 }, { "epoch": 3.372859025032938, "grad_norm": 2.686152935028076, "learning_rate": 0.0001627140974967062, "loss": 3.6755, "step": 17920 }, { "epoch": 3.3747412008281574, "grad_norm": 2.570594549179077, "learning_rate": 0.00016252587991718426, "loss": 3.0671, "step": 17930 }, { "epoch": 3.3766233766233764, "grad_norm": 3.6530938148498535, "learning_rate": 0.00016233766233766234, "loss": 3.4958, "step": 17940 }, { "epoch": 3.378505552418596, "grad_norm": 2.7650959491729736, "learning_rate": 0.00016214944475814041, "loss": 3.2266, "step": 17950 }, { "epoch": 3.3803877282138153, "grad_norm": 2.8386785984039307, "learning_rate": 0.0001619612271786185, "loss": 3.513, "step": 17960 }, { "epoch": 3.3822699040090343, "grad_norm": 2.6067397594451904, "learning_rate": 0.00016177300959909654, "loss": 3.2123, "step": 17970 }, { "epoch": 3.3841520798042537, "grad_norm": 2.2532970905303955, "learning_rate": 0.00016158479201957462, "loss": 3.475, "step": 17980 }, { "epoch": 3.386034255599473, "grad_norm": 2.9290244579315186, "learning_rate": 0.00016139657444005272, "loss": 3.4137, "step": 17990 }, { "epoch": 3.387916431394692, "grad_norm": 2.3853273391723633, "learning_rate": 0.0001612083568605308, "loss": 3.3931, "step": 18000 }, { "epoch": 3.3897986071899116, "grad_norm": 2.483642339706421, "learning_rate": 0.00016102013928100885, "loss": 3.3691, "step": 18010 }, { "epoch": 3.391680782985131, "grad_norm": 3.018707752227783, "learning_rate": 0.00016083192170148692, "loss": 3.5776, "step": 18020 }, { "epoch": 3.39356295878035, "grad_norm": 3.4228579998016357, "learning_rate": 0.000160643704121965, "loss": 3.3006, "step": 18030 }, { "epoch": 3.3954451345755694, "grad_norm": 2.5835790634155273, "learning_rate": 0.00016045548654244307, "loss": 3.4533, "step": 18040 }, { "epoch": 3.3973273103707884, "grad_norm": 3.668458938598633, "learning_rate": 0.00016026726896292115, "loss": 3.4412, "step": 18050 }, { "epoch": 3.399209486166008, "grad_norm": 2.2307446002960205, "learning_rate": 0.0001600790513833992, "loss": 3.1922, "step": 18060 }, { "epoch": 3.4010916619612273, "grad_norm": 2.579155206680298, "learning_rate": 0.00015989083380387728, "loss": 3.6517, "step": 18070 }, { "epoch": 3.4029738377564467, "grad_norm": 2.7033772468566895, "learning_rate": 0.00015970261622435535, "loss": 3.4283, "step": 18080 }, { "epoch": 3.4048560135516657, "grad_norm": 2.3598473072052, "learning_rate": 0.00015951439864483343, "loss": 3.4329, "step": 18090 }, { "epoch": 3.406738189346885, "grad_norm": 2.421225070953369, "learning_rate": 0.0001593261810653115, "loss": 2.9126, "step": 18100 }, { "epoch": 3.408620365142104, "grad_norm": 2.460188388824463, "learning_rate": 0.00015913796348578958, "loss": 3.1849, "step": 18110 }, { "epoch": 3.4105025409373235, "grad_norm": 2.6091690063476562, "learning_rate": 0.00015894974590626766, "loss": 3.4228, "step": 18120 }, { "epoch": 3.412384716732543, "grad_norm": 2.700566053390503, "learning_rate": 0.00015876152832674574, "loss": 3.4774, "step": 18130 }, { "epoch": 3.414266892527762, "grad_norm": 2.6323583126068115, "learning_rate": 0.00015857331074722378, "loss": 3.3954, "step": 18140 }, { "epoch": 3.4161490683229814, "grad_norm": 2.687390089035034, "learning_rate": 0.00015838509316770186, "loss": 3.2594, "step": 18150 }, { "epoch": 3.418031244118201, "grad_norm": 2.705817222595215, "learning_rate": 0.00015819687558817994, "loss": 3.3878, "step": 18160 }, { "epoch": 3.41991341991342, "grad_norm": 2.4590365886688232, "learning_rate": 0.00015800865800865801, "loss": 3.0333, "step": 18170 }, { "epoch": 3.4217955957086392, "grad_norm": 2.336115598678589, "learning_rate": 0.0001578204404291361, "loss": 3.1843, "step": 18180 }, { "epoch": 3.4236777715038587, "grad_norm": 2.8003556728363037, "learning_rate": 0.00015763222284961414, "loss": 3.1254, "step": 18190 }, { "epoch": 3.4255599472990776, "grad_norm": 3.6783621311187744, "learning_rate": 0.00015744400527009222, "loss": 3.4193, "step": 18200 }, { "epoch": 3.427442123094297, "grad_norm": 2.280116558074951, "learning_rate": 0.00015725578769057032, "loss": 3.2221, "step": 18210 }, { "epoch": 3.429324298889516, "grad_norm": 2.6370630264282227, "learning_rate": 0.0001570675701110484, "loss": 3.4528, "step": 18220 }, { "epoch": 3.4312064746847355, "grad_norm": 2.820404291152954, "learning_rate": 0.00015687935253152645, "loss": 3.4164, "step": 18230 }, { "epoch": 3.433088650479955, "grad_norm": 2.970364809036255, "learning_rate": 0.00015669113495200452, "loss": 3.4036, "step": 18240 }, { "epoch": 3.434970826275174, "grad_norm": 2.6260478496551514, "learning_rate": 0.0001565029173724826, "loss": 3.572, "step": 18250 }, { "epoch": 3.4368530020703933, "grad_norm": 2.628375768661499, "learning_rate": 0.00015631469979296067, "loss": 3.1255, "step": 18260 }, { "epoch": 3.438735177865613, "grad_norm": 3.0799124240875244, "learning_rate": 0.00015612648221343872, "loss": 3.2365, "step": 18270 }, { "epoch": 3.4406173536608318, "grad_norm": 2.8040168285369873, "learning_rate": 0.0001559382646339168, "loss": 3.3586, "step": 18280 }, { "epoch": 3.442499529456051, "grad_norm": 2.400254249572754, "learning_rate": 0.00015575004705439488, "loss": 3.1926, "step": 18290 }, { "epoch": 3.4443817052512706, "grad_norm": 2.8902792930603027, "learning_rate": 0.00015556182947487295, "loss": 3.0558, "step": 18300 }, { "epoch": 3.4462638810464896, "grad_norm": 2.6344738006591797, "learning_rate": 0.00015537361189535103, "loss": 3.3546, "step": 18310 }, { "epoch": 3.448146056841709, "grad_norm": 2.634481906890869, "learning_rate": 0.0001551853943158291, "loss": 3.3618, "step": 18320 }, { "epoch": 3.4500282326369285, "grad_norm": 2.5868468284606934, "learning_rate": 0.00015499717673630718, "loss": 3.3979, "step": 18330 }, { "epoch": 3.4519104084321475, "grad_norm": 2.580573320388794, "learning_rate": 0.00015480895915678526, "loss": 3.344, "step": 18340 }, { "epoch": 3.453792584227367, "grad_norm": 2.240208148956299, "learning_rate": 0.00015462074157726334, "loss": 3.406, "step": 18350 }, { "epoch": 3.4556747600225863, "grad_norm": 3.5148346424102783, "learning_rate": 0.00015443252399774138, "loss": 3.0173, "step": 18360 }, { "epoch": 3.4575569358178053, "grad_norm": 2.5353798866271973, "learning_rate": 0.00015424430641821946, "loss": 3.4003, "step": 18370 }, { "epoch": 3.4594391116130248, "grad_norm": 2.6224095821380615, "learning_rate": 0.00015405608883869754, "loss": 3.3535, "step": 18380 }, { "epoch": 3.4613212874082437, "grad_norm": 2.489593029022217, "learning_rate": 0.00015386787125917561, "loss": 3.4504, "step": 18390 }, { "epoch": 3.463203463203463, "grad_norm": 2.6428322792053223, "learning_rate": 0.00015367965367965366, "loss": 3.4557, "step": 18400 }, { "epoch": 3.4650856389986826, "grad_norm": 2.464837074279785, "learning_rate": 0.00015349143610013174, "loss": 3.0501, "step": 18410 }, { "epoch": 3.4669678147939016, "grad_norm": 3.0857815742492676, "learning_rate": 0.00015330321852060982, "loss": 3.3766, "step": 18420 }, { "epoch": 3.468849990589121, "grad_norm": 2.774864912033081, "learning_rate": 0.00015311500094108792, "loss": 3.2938, "step": 18430 }, { "epoch": 3.4707321663843405, "grad_norm": 2.6012887954711914, "learning_rate": 0.000152926783361566, "loss": 3.3509, "step": 18440 }, { "epoch": 3.4726143421795594, "grad_norm": 3.4910905361175537, "learning_rate": 0.00015273856578204405, "loss": 3.1945, "step": 18450 }, { "epoch": 3.474496517974779, "grad_norm": 3.091975450515747, "learning_rate": 0.00015255034820252212, "loss": 3.2423, "step": 18460 }, { "epoch": 3.4763786937699983, "grad_norm": 2.299147129058838, "learning_rate": 0.0001523621306230002, "loss": 3.1601, "step": 18470 }, { "epoch": 3.4782608695652173, "grad_norm": 2.402646064758301, "learning_rate": 0.00015217391304347827, "loss": 3.1178, "step": 18480 }, { "epoch": 3.4801430453604367, "grad_norm": 3.045456647872925, "learning_rate": 0.00015198569546395632, "loss": 3.527, "step": 18490 }, { "epoch": 3.4820252211556557, "grad_norm": 2.992879867553711, "learning_rate": 0.0001517974778844344, "loss": 3.4033, "step": 18500 }, { "epoch": 3.483907396950875, "grad_norm": 2.254138469696045, "learning_rate": 0.00015160926030491248, "loss": 2.9717, "step": 18510 }, { "epoch": 3.4857895727460946, "grad_norm": 2.8303282260894775, "learning_rate": 0.00015142104272539055, "loss": 3.3559, "step": 18520 }, { "epoch": 3.4876717485413136, "grad_norm": 2.7462894916534424, "learning_rate": 0.00015123282514586863, "loss": 3.3438, "step": 18530 }, { "epoch": 3.489553924336533, "grad_norm": 3.280153274536133, "learning_rate": 0.00015104460756634668, "loss": 3.4967, "step": 18540 }, { "epoch": 3.4914361001317524, "grad_norm": 3.010605573654175, "learning_rate": 0.00015085638998682478, "loss": 3.3245, "step": 18550 }, { "epoch": 3.4933182759269714, "grad_norm": 2.8119089603424072, "learning_rate": 0.00015066817240730286, "loss": 3.0419, "step": 18560 }, { "epoch": 3.495200451722191, "grad_norm": 3.2307541370391846, "learning_rate": 0.00015047995482778094, "loss": 3.4478, "step": 18570 }, { "epoch": 3.4970826275174103, "grad_norm": 2.8820271492004395, "learning_rate": 0.00015029173724825898, "loss": 3.5247, "step": 18580 }, { "epoch": 3.4989648033126293, "grad_norm": 2.232989549636841, "learning_rate": 0.00015010351966873706, "loss": 3.0745, "step": 18590 }, { "epoch": 3.5008469791078487, "grad_norm": 3.2824294567108154, "learning_rate": 0.00014991530208921514, "loss": 3.4003, "step": 18600 }, { "epoch": 3.5027291549030677, "grad_norm": 2.7704029083251953, "learning_rate": 0.00014972708450969321, "loss": 3.2815, "step": 18610 }, { "epoch": 3.504611330698287, "grad_norm": 3.6349306106567383, "learning_rate": 0.00014953886693017126, "loss": 3.6239, "step": 18620 }, { "epoch": 3.5064935064935066, "grad_norm": 3.6622002124786377, "learning_rate": 0.00014935064935064934, "loss": 3.4647, "step": 18630 }, { "epoch": 3.508375682288726, "grad_norm": 2.65909481048584, "learning_rate": 0.00014916243177112742, "loss": 3.0913, "step": 18640 }, { "epoch": 3.510257858083945, "grad_norm": 2.877439022064209, "learning_rate": 0.0001489742141916055, "loss": 2.859, "step": 18650 }, { "epoch": 3.5121400338791644, "grad_norm": 2.6584038734436035, "learning_rate": 0.0001487859966120836, "loss": 3.2673, "step": 18660 }, { "epoch": 3.5140222096743834, "grad_norm": 3.749934196472168, "learning_rate": 0.00014859777903256165, "loss": 3.7862, "step": 18670 }, { "epoch": 3.515904385469603, "grad_norm": 2.909475088119507, "learning_rate": 0.00014840956145303972, "loss": 3.4305, "step": 18680 }, { "epoch": 3.5177865612648223, "grad_norm": 3.0480315685272217, "learning_rate": 0.0001482213438735178, "loss": 3.2649, "step": 18690 }, { "epoch": 3.5196687370600412, "grad_norm": 3.11883282661438, "learning_rate": 0.00014803312629399587, "loss": 3.3649, "step": 18700 }, { "epoch": 3.5215509128552607, "grad_norm": 2.5214035511016846, "learning_rate": 0.00014784490871447392, "loss": 3.6397, "step": 18710 }, { "epoch": 3.52343308865048, "grad_norm": 3.1389122009277344, "learning_rate": 0.000147656691134952, "loss": 3.5458, "step": 18720 }, { "epoch": 3.525315264445699, "grad_norm": 2.5999817848205566, "learning_rate": 0.00014746847355543008, "loss": 3.372, "step": 18730 }, { "epoch": 3.5271974402409185, "grad_norm": 2.3595402240753174, "learning_rate": 0.00014728025597590815, "loss": 3.1727, "step": 18740 }, { "epoch": 3.529079616036138, "grad_norm": 2.473238468170166, "learning_rate": 0.0001470920383963862, "loss": 3.4563, "step": 18750 }, { "epoch": 3.530961791831357, "grad_norm": 5.704708576202393, "learning_rate": 0.00014690382081686428, "loss": 3.2771, "step": 18760 }, { "epoch": 3.5328439676265764, "grad_norm": 2.5150506496429443, "learning_rate": 0.00014671560323734238, "loss": 3.0659, "step": 18770 }, { "epoch": 3.5347261434217954, "grad_norm": 2.901177406311035, "learning_rate": 0.00014652738565782046, "loss": 3.1914, "step": 18780 }, { "epoch": 3.536608319217015, "grad_norm": 2.633591413497925, "learning_rate": 0.00014633916807829854, "loss": 3.5465, "step": 18790 }, { "epoch": 3.5384904950122342, "grad_norm": 2.6768107414245605, "learning_rate": 0.00014615095049877658, "loss": 3.2693, "step": 18800 }, { "epoch": 3.5403726708074537, "grad_norm": 2.265665054321289, "learning_rate": 0.00014596273291925466, "loss": 3.3685, "step": 18810 }, { "epoch": 3.5422548466026726, "grad_norm": 2.9285104274749756, "learning_rate": 0.00014577451533973274, "loss": 3.2335, "step": 18820 }, { "epoch": 3.544137022397892, "grad_norm": 3.480781078338623, "learning_rate": 0.00014558629776021081, "loss": 3.2812, "step": 18830 }, { "epoch": 3.546019198193111, "grad_norm": 2.353689432144165, "learning_rate": 0.00014539808018068886, "loss": 3.3355, "step": 18840 }, { "epoch": 3.5479013739883305, "grad_norm": 3.253490447998047, "learning_rate": 0.00014520986260116694, "loss": 2.9411, "step": 18850 }, { "epoch": 3.54978354978355, "grad_norm": 2.585083484649658, "learning_rate": 0.00014502164502164502, "loss": 3.3769, "step": 18860 }, { "epoch": 3.551665725578769, "grad_norm": 2.4804880619049072, "learning_rate": 0.0001448334274421231, "loss": 3.3273, "step": 18870 }, { "epoch": 3.5535479013739883, "grad_norm": 3.0086333751678467, "learning_rate": 0.00014464520986260117, "loss": 3.364, "step": 18880 }, { "epoch": 3.5554300771692073, "grad_norm": 2.616567373275757, "learning_rate": 0.00014445699228307925, "loss": 3.5107, "step": 18890 }, { "epoch": 3.5573122529644268, "grad_norm": 2.6643967628479004, "learning_rate": 0.00014426877470355732, "loss": 3.2795, "step": 18900 }, { "epoch": 3.559194428759646, "grad_norm": 3.104766607284546, "learning_rate": 0.0001440805571240354, "loss": 3.1807, "step": 18910 }, { "epoch": 3.5610766045548656, "grad_norm": 2.417785406112671, "learning_rate": 0.00014389233954451347, "loss": 3.0671, "step": 18920 }, { "epoch": 3.5629587803500846, "grad_norm": 3.15303897857666, "learning_rate": 0.00014370412196499152, "loss": 3.1559, "step": 18930 }, { "epoch": 3.564840956145304, "grad_norm": 2.950138807296753, "learning_rate": 0.0001435159043854696, "loss": 3.2226, "step": 18940 }, { "epoch": 3.566723131940523, "grad_norm": 3.2707948684692383, "learning_rate": 0.00014332768680594768, "loss": 3.4891, "step": 18950 }, { "epoch": 3.5686053077357425, "grad_norm": 2.545849084854126, "learning_rate": 0.00014313946922642575, "loss": 3.3594, "step": 18960 }, { "epoch": 3.570487483530962, "grad_norm": 3.009657621383667, "learning_rate": 0.0001429512516469038, "loss": 3.6016, "step": 18970 }, { "epoch": 3.5723696593261813, "grad_norm": 2.9132399559020996, "learning_rate": 0.00014276303406738188, "loss": 3.4437, "step": 18980 }, { "epoch": 3.5742518351214003, "grad_norm": 2.985321044921875, "learning_rate": 0.00014257481648785998, "loss": 2.7064, "step": 18990 }, { "epoch": 3.5761340109166198, "grad_norm": 2.5817177295684814, "learning_rate": 0.00014238659890833806, "loss": 3.5903, "step": 19000 }, { "epoch": 3.5780161867118387, "grad_norm": 2.643543004989624, "learning_rate": 0.0001421983813288161, "loss": 3.3309, "step": 19010 }, { "epoch": 3.579898362507058, "grad_norm": 2.5157105922698975, "learning_rate": 0.00014201016374929418, "loss": 3.7219, "step": 19020 }, { "epoch": 3.5817805383022776, "grad_norm": 2.6616153717041016, "learning_rate": 0.00014182194616977226, "loss": 3.2806, "step": 19030 }, { "epoch": 3.5836627140974966, "grad_norm": 2.7307190895080566, "learning_rate": 0.00014163372859025034, "loss": 3.3157, "step": 19040 }, { "epoch": 3.585544889892716, "grad_norm": 2.92560076713562, "learning_rate": 0.00014144551101072841, "loss": 3.0866, "step": 19050 }, { "epoch": 3.587427065687935, "grad_norm": 3.155165433883667, "learning_rate": 0.00014125729343120646, "loss": 3.4034, "step": 19060 }, { "epoch": 3.5893092414831544, "grad_norm": 2.795790195465088, "learning_rate": 0.00014106907585168454, "loss": 3.2257, "step": 19070 }, { "epoch": 3.591191417278374, "grad_norm": 2.648715019226074, "learning_rate": 0.00014088085827216262, "loss": 3.3564, "step": 19080 }, { "epoch": 3.5930735930735933, "grad_norm": 2.7506496906280518, "learning_rate": 0.0001406926406926407, "loss": 3.197, "step": 19090 }, { "epoch": 3.5949557688688123, "grad_norm": 2.9696731567382812, "learning_rate": 0.00014050442311311877, "loss": 3.3379, "step": 19100 }, { "epoch": 3.5968379446640317, "grad_norm": 2.6705081462860107, "learning_rate": 0.00014031620553359685, "loss": 3.588, "step": 19110 }, { "epoch": 3.5987201204592507, "grad_norm": 3.0333800315856934, "learning_rate": 0.00014012798795407492, "loss": 3.3548, "step": 19120 }, { "epoch": 3.60060229625447, "grad_norm": 2.8488924503326416, "learning_rate": 0.000139939770374553, "loss": 2.8974, "step": 19130 }, { "epoch": 3.6024844720496896, "grad_norm": 2.7044355869293213, "learning_rate": 0.00013975155279503105, "loss": 3.3713, "step": 19140 }, { "epoch": 3.6043666478449086, "grad_norm": 2.565526247024536, "learning_rate": 0.00013956333521550912, "loss": 3.2275, "step": 19150 }, { "epoch": 3.606248823640128, "grad_norm": 3.3287127017974854, "learning_rate": 0.0001393751176359872, "loss": 3.2934, "step": 19160 }, { "epoch": 3.608130999435347, "grad_norm": 3.1183736324310303, "learning_rate": 0.00013918690005646528, "loss": 3.375, "step": 19170 }, { "epoch": 3.6100131752305664, "grad_norm": 2.3812272548675537, "learning_rate": 0.00013899868247694335, "loss": 2.9646, "step": 19180 }, { "epoch": 3.611895351025786, "grad_norm": 2.601849317550659, "learning_rate": 0.0001388104648974214, "loss": 3.4268, "step": 19190 }, { "epoch": 3.6137775268210053, "grad_norm": 2.694033622741699, "learning_rate": 0.00013862224731789948, "loss": 3.7199, "step": 19200 }, { "epoch": 3.6156597026162243, "grad_norm": 2.751224994659424, "learning_rate": 0.00013843402973837758, "loss": 3.5829, "step": 19210 }, { "epoch": 3.6175418784114437, "grad_norm": 2.70204496383667, "learning_rate": 0.00013824581215885566, "loss": 3.3441, "step": 19220 }, { "epoch": 3.6194240542066627, "grad_norm": 3.35942006111145, "learning_rate": 0.0001380575945793337, "loss": 3.0396, "step": 19230 }, { "epoch": 3.621306230001882, "grad_norm": 2.7449188232421875, "learning_rate": 0.00013786937699981178, "loss": 3.4542, "step": 19240 }, { "epoch": 3.6231884057971016, "grad_norm": 2.499721050262451, "learning_rate": 0.00013768115942028986, "loss": 3.2617, "step": 19250 }, { "epoch": 3.625070581592321, "grad_norm": 3.022339344024658, "learning_rate": 0.00013749294184076794, "loss": 3.5071, "step": 19260 }, { "epoch": 3.62695275738754, "grad_norm": 3.4480507373809814, "learning_rate": 0.00013730472426124601, "loss": 3.3696, "step": 19270 }, { "epoch": 3.6288349331827594, "grad_norm": 2.719674587249756, "learning_rate": 0.00013711650668172406, "loss": 3.2398, "step": 19280 }, { "epoch": 3.6307171089779784, "grad_norm": 2.420034646987915, "learning_rate": 0.00013692828910220214, "loss": 3.399, "step": 19290 }, { "epoch": 3.632599284773198, "grad_norm": 2.120476007461548, "learning_rate": 0.00013674007152268022, "loss": 3.0942, "step": 19300 }, { "epoch": 3.6344814605684173, "grad_norm": 2.2568111419677734, "learning_rate": 0.0001365518539431583, "loss": 3.5591, "step": 19310 }, { "epoch": 3.6363636363636362, "grad_norm": 2.630584478378296, "learning_rate": 0.00013636363636363637, "loss": 3.3086, "step": 19320 }, { "epoch": 3.6382458121588557, "grad_norm": 2.691880941390991, "learning_rate": 0.00013617541878411445, "loss": 3.3616, "step": 19330 }, { "epoch": 3.6401279879540747, "grad_norm": 3.3408854007720947, "learning_rate": 0.00013598720120459252, "loss": 3.2374, "step": 19340 }, { "epoch": 3.642010163749294, "grad_norm": 2.621610641479492, "learning_rate": 0.0001357989836250706, "loss": 3.3335, "step": 19350 }, { "epoch": 3.6438923395445135, "grad_norm": 2.847215414047241, "learning_rate": 0.00013561076604554865, "loss": 3.3562, "step": 19360 }, { "epoch": 3.645774515339733, "grad_norm": 2.9248063564300537, "learning_rate": 0.00013542254846602672, "loss": 3.4876, "step": 19370 }, { "epoch": 3.647656691134952, "grad_norm": 2.526078701019287, "learning_rate": 0.0001352343308865048, "loss": 3.4567, "step": 19380 }, { "epoch": 3.6495388669301714, "grad_norm": 2.450305938720703, "learning_rate": 0.00013504611330698288, "loss": 3.4384, "step": 19390 }, { "epoch": 3.6514210427253904, "grad_norm": 2.7489380836486816, "learning_rate": 0.00013485789572746095, "loss": 3.598, "step": 19400 }, { "epoch": 3.65330321852061, "grad_norm": 2.759094476699829, "learning_rate": 0.000134669678147939, "loss": 3.567, "step": 19410 }, { "epoch": 3.6551853943158292, "grad_norm": 3.249265670776367, "learning_rate": 0.00013448146056841708, "loss": 3.227, "step": 19420 }, { "epoch": 3.657067570111048, "grad_norm": 3.172544240951538, "learning_rate": 0.00013429324298889518, "loss": 3.1833, "step": 19430 }, { "epoch": 3.6589497459062676, "grad_norm": 2.491276502609253, "learning_rate": 0.00013410502540937326, "loss": 3.4064, "step": 19440 }, { "epoch": 3.660831921701487, "grad_norm": 3.467625379562378, "learning_rate": 0.0001339168078298513, "loss": 3.5775, "step": 19450 }, { "epoch": 3.662714097496706, "grad_norm": 2.929133653640747, "learning_rate": 0.00013372859025032938, "loss": 3.3226, "step": 19460 }, { "epoch": 3.6645962732919255, "grad_norm": 2.4177849292755127, "learning_rate": 0.00013354037267080746, "loss": 3.4322, "step": 19470 }, { "epoch": 3.666478449087145, "grad_norm": 2.494344711303711, "learning_rate": 0.00013335215509128554, "loss": 3.2928, "step": 19480 }, { "epoch": 3.668360624882364, "grad_norm": 3.252093553543091, "learning_rate": 0.0001331639375117636, "loss": 3.2858, "step": 19490 }, { "epoch": 3.6702428006775834, "grad_norm": 2.993792772293091, "learning_rate": 0.00013297571993224166, "loss": 3.1031, "step": 19500 }, { "epoch": 3.6721249764728023, "grad_norm": 3.514948606491089, "learning_rate": 0.00013278750235271974, "loss": 3.2385, "step": 19510 }, { "epoch": 3.6740071522680218, "grad_norm": 3.2754671573638916, "learning_rate": 0.00013259928477319782, "loss": 3.3253, "step": 19520 }, { "epoch": 3.675889328063241, "grad_norm": 3.0171761512756348, "learning_rate": 0.0001324110671936759, "loss": 3.1884, "step": 19530 }, { "epoch": 3.6777715038584606, "grad_norm": 3.297513246536255, "learning_rate": 0.00013222284961415397, "loss": 3.2366, "step": 19540 }, { "epoch": 3.6796536796536796, "grad_norm": 2.680819272994995, "learning_rate": 0.00013203463203463205, "loss": 3.5181, "step": 19550 }, { "epoch": 3.681535855448899, "grad_norm": 2.5602333545684814, "learning_rate": 0.00013184641445511012, "loss": 3.2382, "step": 19560 }, { "epoch": 3.683418031244118, "grad_norm": 2.667515993118286, "learning_rate": 0.0001316581968755882, "loss": 3.2604, "step": 19570 }, { "epoch": 3.6853002070393375, "grad_norm": 3.419037103652954, "learning_rate": 0.00013146997929606625, "loss": 3.1958, "step": 19580 }, { "epoch": 3.687182382834557, "grad_norm": 2.5114715099334717, "learning_rate": 0.00013128176171654432, "loss": 3.2694, "step": 19590 }, { "epoch": 3.689064558629776, "grad_norm": 2.8820993900299072, "learning_rate": 0.0001310935441370224, "loss": 3.2628, "step": 19600 }, { "epoch": 3.6909467344249953, "grad_norm": 2.631727695465088, "learning_rate": 0.00013090532655750048, "loss": 3.3984, "step": 19610 }, { "epoch": 3.6928289102202143, "grad_norm": 3.3341405391693115, "learning_rate": 0.00013071710897797853, "loss": 3.3687, "step": 19620 }, { "epoch": 3.6947110860154337, "grad_norm": 3.213141918182373, "learning_rate": 0.0001305288913984566, "loss": 3.6688, "step": 19630 }, { "epoch": 3.696593261810653, "grad_norm": 2.9482338428497314, "learning_rate": 0.00013034067381893468, "loss": 3.2922, "step": 19640 }, { "epoch": 3.6984754376058726, "grad_norm": 2.2478370666503906, "learning_rate": 0.00013015245623941278, "loss": 3.1485, "step": 19650 }, { "epoch": 3.7003576134010916, "grad_norm": 2.829216480255127, "learning_rate": 0.00012996423865989086, "loss": 3.1318, "step": 19660 }, { "epoch": 3.702239789196311, "grad_norm": 2.8538355827331543, "learning_rate": 0.0001297760210803689, "loss": 3.3705, "step": 19670 }, { "epoch": 3.70412196499153, "grad_norm": 2.4689252376556396, "learning_rate": 0.00012958780350084698, "loss": 3.2183, "step": 19680 }, { "epoch": 3.7060041407867494, "grad_norm": 2.2744150161743164, "learning_rate": 0.00012939958592132506, "loss": 3.3251, "step": 19690 }, { "epoch": 3.707886316581969, "grad_norm": 4.31705904006958, "learning_rate": 0.00012921136834180314, "loss": 3.4272, "step": 19700 }, { "epoch": 3.709768492377188, "grad_norm": 2.2697396278381348, "learning_rate": 0.0001290231507622812, "loss": 3.3771, "step": 19710 }, { "epoch": 3.7116506681724073, "grad_norm": 2.675645351409912, "learning_rate": 0.00012883493318275926, "loss": 3.5572, "step": 19720 }, { "epoch": 3.7135328439676267, "grad_norm": 3.0459091663360596, "learning_rate": 0.00012864671560323734, "loss": 3.2407, "step": 19730 }, { "epoch": 3.7154150197628457, "grad_norm": 2.8712270259857178, "learning_rate": 0.00012845849802371542, "loss": 3.3708, "step": 19740 }, { "epoch": 3.717297195558065, "grad_norm": 2.0616369247436523, "learning_rate": 0.00012827028044419347, "loss": 3.0517, "step": 19750 }, { "epoch": 3.7191793713532846, "grad_norm": 2.9897196292877197, "learning_rate": 0.00012808206286467157, "loss": 3.506, "step": 19760 }, { "epoch": 3.7210615471485036, "grad_norm": 2.4094443321228027, "learning_rate": 0.00012789384528514965, "loss": 3.2432, "step": 19770 }, { "epoch": 3.722943722943723, "grad_norm": 2.9298617839813232, "learning_rate": 0.00012770562770562772, "loss": 3.4935, "step": 19780 }, { "epoch": 3.724825898738942, "grad_norm": 2.423041820526123, "learning_rate": 0.0001275174101261058, "loss": 3.2528, "step": 19790 }, { "epoch": 3.7267080745341614, "grad_norm": 2.4703598022460938, "learning_rate": 0.00012732919254658385, "loss": 3.7537, "step": 19800 }, { "epoch": 3.728590250329381, "grad_norm": 2.2207388877868652, "learning_rate": 0.00012714097496706192, "loss": 3.1028, "step": 19810 }, { "epoch": 3.7304724261246003, "grad_norm": 2.2429516315460205, "learning_rate": 0.00012695275738754, "loss": 3.133, "step": 19820 }, { "epoch": 3.7323546019198193, "grad_norm": 2.499889850616455, "learning_rate": 0.00012676453980801808, "loss": 3.444, "step": 19830 }, { "epoch": 3.7342367777150387, "grad_norm": 3.0017619132995605, "learning_rate": 0.00012657632222849613, "loss": 3.4312, "step": 19840 }, { "epoch": 3.7361189535102577, "grad_norm": 2.957941770553589, "learning_rate": 0.0001263881046489742, "loss": 3.1642, "step": 19850 }, { "epoch": 3.738001129305477, "grad_norm": 3.130078077316284, "learning_rate": 0.00012619988706945228, "loss": 3.1626, "step": 19860 }, { "epoch": 3.7398833051006966, "grad_norm": 2.6402502059936523, "learning_rate": 0.00012601166948993038, "loss": 3.3958, "step": 19870 }, { "epoch": 3.7417654808959155, "grad_norm": 2.338499069213867, "learning_rate": 0.00012582345191040846, "loss": 3.4364, "step": 19880 }, { "epoch": 3.743647656691135, "grad_norm": 2.4969937801361084, "learning_rate": 0.0001256352343308865, "loss": 3.5577, "step": 19890 }, { "epoch": 3.745529832486354, "grad_norm": 2.380075454711914, "learning_rate": 0.00012544701675136458, "loss": 3.244, "step": 19900 }, { "epoch": 3.7474120082815734, "grad_norm": 2.291037082672119, "learning_rate": 0.00012525879917184266, "loss": 3.0756, "step": 19910 }, { "epoch": 3.749294184076793, "grad_norm": 2.3379275798797607, "learning_rate": 0.00012507058159232074, "loss": 3.1479, "step": 19920 }, { "epoch": 3.7511763598720123, "grad_norm": 2.849090337753296, "learning_rate": 0.00012488236401279881, "loss": 3.3691, "step": 19930 }, { "epoch": 3.7530585356672312, "grad_norm": 2.6632776260375977, "learning_rate": 0.00012469414643327686, "loss": 3.309, "step": 19940 }, { "epoch": 3.7549407114624507, "grad_norm": 1.9748684167861938, "learning_rate": 0.00012450592885375494, "loss": 3.2076, "step": 19950 }, { "epoch": 3.7568228872576697, "grad_norm": 2.6560304164886475, "learning_rate": 0.00012431771127423302, "loss": 3.3641, "step": 19960 }, { "epoch": 3.758705063052889, "grad_norm": 3.091458797454834, "learning_rate": 0.0001241294936947111, "loss": 3.7965, "step": 19970 }, { "epoch": 3.7605872388481085, "grad_norm": 2.292750835418701, "learning_rate": 0.00012394127611518917, "loss": 3.0958, "step": 19980 }, { "epoch": 3.762469414643328, "grad_norm": 2.81014347076416, "learning_rate": 0.00012375305853566725, "loss": 3.4827, "step": 19990 }, { "epoch": 3.764351590438547, "grad_norm": 2.6002442836761475, "learning_rate": 0.0001235648409561453, "loss": 2.8435, "step": 20000 }, { "epoch": 3.7662337662337664, "grad_norm": 2.743107318878174, "learning_rate": 0.00012337662337662337, "loss": 3.2989, "step": 20010 }, { "epoch": 3.7681159420289854, "grad_norm": 3.365095615386963, "learning_rate": 0.00012318840579710145, "loss": 3.516, "step": 20020 }, { "epoch": 3.769998117824205, "grad_norm": 2.544158458709717, "learning_rate": 0.00012300018821757952, "loss": 3.2268, "step": 20030 }, { "epoch": 3.7718802936194242, "grad_norm": 3.235170364379883, "learning_rate": 0.0001228119706380576, "loss": 3.2683, "step": 20040 }, { "epoch": 3.773762469414643, "grad_norm": 3.5878753662109375, "learning_rate": 0.00012262375305853568, "loss": 3.4592, "step": 20050 }, { "epoch": 3.7756446452098626, "grad_norm": 2.7629101276397705, "learning_rate": 0.00012243553547901375, "loss": 3.1159, "step": 20060 }, { "epoch": 3.7775268210050816, "grad_norm": 2.5514304637908936, "learning_rate": 0.0001222473178994918, "loss": 3.1758, "step": 20070 }, { "epoch": 3.779408996800301, "grad_norm": 2.6327264308929443, "learning_rate": 0.00012205910031996989, "loss": 3.5325, "step": 20080 }, { "epoch": 3.7812911725955205, "grad_norm": 2.35088849067688, "learning_rate": 0.00012187088274044796, "loss": 3.2252, "step": 20090 }, { "epoch": 3.78317334839074, "grad_norm": 3.5405220985412598, "learning_rate": 0.00012168266516092603, "loss": 3.3319, "step": 20100 }, { "epoch": 3.785055524185959, "grad_norm": 3.110680103302002, "learning_rate": 0.0001214944475814041, "loss": 3.3915, "step": 20110 }, { "epoch": 3.7869376999811784, "grad_norm": 2.190539598464966, "learning_rate": 0.00012130623000188218, "loss": 3.2086, "step": 20120 }, { "epoch": 3.7888198757763973, "grad_norm": 2.484727621078491, "learning_rate": 0.00012111801242236026, "loss": 3.5845, "step": 20130 }, { "epoch": 3.7907020515716168, "grad_norm": 2.2589473724365234, "learning_rate": 0.00012092979484283832, "loss": 3.2769, "step": 20140 }, { "epoch": 3.792584227366836, "grad_norm": 2.6829514503479004, "learning_rate": 0.0001207415772633164, "loss": 3.2124, "step": 20150 }, { "epoch": 3.794466403162055, "grad_norm": 2.3834097385406494, "learning_rate": 0.00012055335968379446, "loss": 3.3095, "step": 20160 }, { "epoch": 3.7963485789572746, "grad_norm": 2.6875038146972656, "learning_rate": 0.00012036514210427254, "loss": 3.329, "step": 20170 }, { "epoch": 3.7982307547524936, "grad_norm": 3.3290762901306152, "learning_rate": 0.00012017692452475062, "loss": 2.977, "step": 20180 }, { "epoch": 3.800112930547713, "grad_norm": 2.4440839290618896, "learning_rate": 0.00011998870694522869, "loss": 3.6461, "step": 20190 }, { "epoch": 3.8019951063429325, "grad_norm": 2.430534601211548, "learning_rate": 0.00011980048936570676, "loss": 3.1679, "step": 20200 }, { "epoch": 3.803877282138152, "grad_norm": 2.636639356613159, "learning_rate": 0.00011961227178618483, "loss": 3.361, "step": 20210 }, { "epoch": 3.805759457933371, "grad_norm": 2.4769604206085205, "learning_rate": 0.0001194240542066629, "loss": 3.3063, "step": 20220 }, { "epoch": 3.8076416337285903, "grad_norm": 2.9437851905822754, "learning_rate": 0.00011923583662714098, "loss": 3.138, "step": 20230 }, { "epoch": 3.8095238095238093, "grad_norm": 3.349299192428589, "learning_rate": 0.00011904761904761905, "loss": 3.107, "step": 20240 }, { "epoch": 3.8114059853190287, "grad_norm": 2.5073299407958984, "learning_rate": 0.00011885940146809712, "loss": 3.5264, "step": 20250 }, { "epoch": 3.813288161114248, "grad_norm": 3.159978151321411, "learning_rate": 0.0001186711838885752, "loss": 3.3368, "step": 20260 }, { "epoch": 3.8151703369094676, "grad_norm": 2.660733461380005, "learning_rate": 0.00011848296630905326, "loss": 3.0852, "step": 20270 }, { "epoch": 3.8170525127046866, "grad_norm": 2.7388837337493896, "learning_rate": 0.00011829474872953134, "loss": 3.2356, "step": 20280 }, { "epoch": 3.818934688499906, "grad_norm": 3.001278877258301, "learning_rate": 0.00011810653115000942, "loss": 3.3562, "step": 20290 }, { "epoch": 3.820816864295125, "grad_norm": 3.217205047607422, "learning_rate": 0.00011791831357048749, "loss": 3.3602, "step": 20300 }, { "epoch": 3.8226990400903444, "grad_norm": 2.8541181087493896, "learning_rate": 0.00011773009599096556, "loss": 3.3973, "step": 20310 }, { "epoch": 3.824581215885564, "grad_norm": 2.6750447750091553, "learning_rate": 0.00011754187841144363, "loss": 2.9959, "step": 20320 }, { "epoch": 3.826463391680783, "grad_norm": 8.156944274902344, "learning_rate": 0.0001173536608319217, "loss": 3.3854, "step": 20330 }, { "epoch": 3.8283455674760023, "grad_norm": 2.9745311737060547, "learning_rate": 0.00011716544325239978, "loss": 3.0903, "step": 20340 }, { "epoch": 3.8302277432712213, "grad_norm": 2.6606485843658447, "learning_rate": 0.00011697722567287785, "loss": 3.3105, "step": 20350 }, { "epoch": 3.8321099190664407, "grad_norm": 2.661266803741455, "learning_rate": 0.00011678900809335592, "loss": 3.2123, "step": 20360 }, { "epoch": 3.83399209486166, "grad_norm": 2.7658510208129883, "learning_rate": 0.000116600790513834, "loss": 3.4492, "step": 20370 }, { "epoch": 3.8358742706568796, "grad_norm": 4.285975456237793, "learning_rate": 0.00011641257293431206, "loss": 3.338, "step": 20380 }, { "epoch": 3.8377564464520986, "grad_norm": 3.6666557788848877, "learning_rate": 0.00011622435535479014, "loss": 3.388, "step": 20390 }, { "epoch": 3.839638622247318, "grad_norm": 2.724447250366211, "learning_rate": 0.00011603613777526822, "loss": 3.0808, "step": 20400 }, { "epoch": 3.841520798042537, "grad_norm": 2.9280121326446533, "learning_rate": 0.00011584792019574629, "loss": 3.1565, "step": 20410 }, { "epoch": 3.8434029738377564, "grad_norm": 3.070014238357544, "learning_rate": 0.00011565970261622436, "loss": 3.2554, "step": 20420 }, { "epoch": 3.845285149632976, "grad_norm": 2.623056650161743, "learning_rate": 0.00011547148503670243, "loss": 3.8541, "step": 20430 }, { "epoch": 3.847167325428195, "grad_norm": 2.4293594360351562, "learning_rate": 0.0001152832674571805, "loss": 3.303, "step": 20440 }, { "epoch": 3.8490495012234143, "grad_norm": 2.4312355518341064, "learning_rate": 0.00011509504987765858, "loss": 3.2644, "step": 20450 }, { "epoch": 3.8509316770186337, "grad_norm": 3.47027325630188, "learning_rate": 0.00011490683229813665, "loss": 3.3996, "step": 20460 }, { "epoch": 3.8528138528138527, "grad_norm": 2.698883533477783, "learning_rate": 0.00011471861471861472, "loss": 3.2559, "step": 20470 }, { "epoch": 3.854696028609072, "grad_norm": 2.323240280151367, "learning_rate": 0.00011453039713909279, "loss": 3.4054, "step": 20480 }, { "epoch": 3.8565782044042916, "grad_norm": 2.6935620307922363, "learning_rate": 0.00011434217955957086, "loss": 3.2728, "step": 20490 }, { "epoch": 3.8584603801995105, "grad_norm": 3.0033633708953857, "learning_rate": 0.00011415396198004894, "loss": 3.1861, "step": 20500 }, { "epoch": 3.86034255599473, "grad_norm": 2.8364367485046387, "learning_rate": 0.00011396574440052702, "loss": 3.3365, "step": 20510 }, { "epoch": 3.862224731789949, "grad_norm": 3.5239908695220947, "learning_rate": 0.00011377752682100509, "loss": 3.4504, "step": 20520 }, { "epoch": 3.8641069075851684, "grad_norm": 2.6138646602630615, "learning_rate": 0.00011358930924148316, "loss": 3.3374, "step": 20530 }, { "epoch": 3.865989083380388, "grad_norm": 2.864724636077881, "learning_rate": 0.00011340109166196123, "loss": 3.4036, "step": 20540 }, { "epoch": 3.8678712591756073, "grad_norm": 3.0685441493988037, "learning_rate": 0.0001132128740824393, "loss": 3.3029, "step": 20550 }, { "epoch": 3.8697534349708262, "grad_norm": 2.7818939685821533, "learning_rate": 0.00011302465650291738, "loss": 3.388, "step": 20560 }, { "epoch": 3.8716356107660457, "grad_norm": 3.0230255126953125, "learning_rate": 0.00011283643892339545, "loss": 3.1544, "step": 20570 }, { "epoch": 3.8735177865612647, "grad_norm": 2.7873122692108154, "learning_rate": 0.00011264822134387352, "loss": 2.8861, "step": 20580 }, { "epoch": 3.875399962356484, "grad_norm": 2.592653274536133, "learning_rate": 0.00011246000376435159, "loss": 3.415, "step": 20590 }, { "epoch": 3.8772821381517035, "grad_norm": 2.8197875022888184, "learning_rate": 0.00011227178618482966, "loss": 3.1919, "step": 20600 }, { "epoch": 3.8791643139469225, "grad_norm": 2.8188552856445312, "learning_rate": 0.00011208356860530773, "loss": 3.2859, "step": 20610 }, { "epoch": 3.881046489742142, "grad_norm": 3.598754644393921, "learning_rate": 0.00011189535102578582, "loss": 3.322, "step": 20620 }, { "epoch": 3.882928665537361, "grad_norm": 2.8028104305267334, "learning_rate": 0.00011170713344626389, "loss": 3.3449, "step": 20630 }, { "epoch": 3.8848108413325804, "grad_norm": 3.098341226577759, "learning_rate": 0.00011151891586674196, "loss": 3.3943, "step": 20640 }, { "epoch": 3.8866930171278, "grad_norm": 2.6026523113250732, "learning_rate": 0.00011133069828722003, "loss": 3.168, "step": 20650 }, { "epoch": 3.8885751929230192, "grad_norm": 2.6130552291870117, "learning_rate": 0.0001111424807076981, "loss": 3.3081, "step": 20660 }, { "epoch": 3.890457368718238, "grad_norm": 2.8330435752868652, "learning_rate": 0.00011095426312817617, "loss": 3.2736, "step": 20670 }, { "epoch": 3.8923395445134576, "grad_norm": 2.8563363552093506, "learning_rate": 0.00011076604554865425, "loss": 3.2376, "step": 20680 }, { "epoch": 3.8942217203086766, "grad_norm": 2.8903121948242188, "learning_rate": 0.00011057782796913232, "loss": 3.0781, "step": 20690 }, { "epoch": 3.896103896103896, "grad_norm": 3.307441234588623, "learning_rate": 0.00011038961038961039, "loss": 3.0561, "step": 20700 }, { "epoch": 3.8979860718991155, "grad_norm": 2.68328595161438, "learning_rate": 0.00011020139281008846, "loss": 3.358, "step": 20710 }, { "epoch": 3.8998682476943345, "grad_norm": 2.7960171699523926, "learning_rate": 0.00011001317523056653, "loss": 3.4266, "step": 20720 }, { "epoch": 3.901750423489554, "grad_norm": 2.629624128341675, "learning_rate": 0.00010982495765104462, "loss": 3.1278, "step": 20730 }, { "epoch": 3.9036325992847734, "grad_norm": 3.141634225845337, "learning_rate": 0.00010963674007152269, "loss": 3.5527, "step": 20740 }, { "epoch": 3.9055147750799923, "grad_norm": 2.913501739501953, "learning_rate": 0.00010944852249200076, "loss": 3.4974, "step": 20750 }, { "epoch": 3.9073969508752118, "grad_norm": 2.5849661827087402, "learning_rate": 0.00010926030491247883, "loss": 3.1552, "step": 20760 }, { "epoch": 3.909279126670431, "grad_norm": 2.997143030166626, "learning_rate": 0.0001090720873329569, "loss": 3.302, "step": 20770 }, { "epoch": 3.91116130246565, "grad_norm": 2.5189380645751953, "learning_rate": 0.00010888386975343497, "loss": 3.3858, "step": 20780 }, { "epoch": 3.9130434782608696, "grad_norm": 3.202120780944824, "learning_rate": 0.00010869565217391305, "loss": 3.8227, "step": 20790 }, { "epoch": 3.9149256540560886, "grad_norm": 2.4213297367095947, "learning_rate": 0.00010850743459439112, "loss": 3.4291, "step": 20800 }, { "epoch": 3.916807829851308, "grad_norm": 2.5922842025756836, "learning_rate": 0.00010831921701486919, "loss": 3.1879, "step": 20810 }, { "epoch": 3.9186900056465275, "grad_norm": 2.7009775638580322, "learning_rate": 0.00010813099943534726, "loss": 3.1444, "step": 20820 }, { "epoch": 3.920572181441747, "grad_norm": 2.252577066421509, "learning_rate": 0.00010794278185582533, "loss": 3.2912, "step": 20830 }, { "epoch": 3.922454357236966, "grad_norm": 3.549348831176758, "learning_rate": 0.00010775456427630342, "loss": 3.2258, "step": 20840 }, { "epoch": 3.9243365330321853, "grad_norm": 3.6950178146362305, "learning_rate": 0.00010756634669678148, "loss": 3.1597, "step": 20850 }, { "epoch": 3.9262187088274043, "grad_norm": 2.4231810569763184, "learning_rate": 0.00010737812911725956, "loss": 2.8874, "step": 20860 }, { "epoch": 3.9281008846226237, "grad_norm": 2.3907105922698975, "learning_rate": 0.00010718991153773763, "loss": 3.15, "step": 20870 }, { "epoch": 3.929983060417843, "grad_norm": 2.8133437633514404, "learning_rate": 0.0001070016939582157, "loss": 3.5011, "step": 20880 }, { "epoch": 3.931865236213062, "grad_norm": 2.9195053577423096, "learning_rate": 0.00010681347637869377, "loss": 3.3655, "step": 20890 }, { "epoch": 3.9337474120082816, "grad_norm": 3.61535906791687, "learning_rate": 0.00010662525879917185, "loss": 3.2766, "step": 20900 }, { "epoch": 3.9356295878035006, "grad_norm": 3.8378422260284424, "learning_rate": 0.00010643704121964992, "loss": 3.515, "step": 20910 }, { "epoch": 3.93751176359872, "grad_norm": 2.60741925239563, "learning_rate": 0.00010624882364012799, "loss": 3.3687, "step": 20920 }, { "epoch": 3.9393939393939394, "grad_norm": 2.7538509368896484, "learning_rate": 0.00010606060606060606, "loss": 3.2729, "step": 20930 }, { "epoch": 3.941276115189159, "grad_norm": 2.943190813064575, "learning_rate": 0.00010587238848108413, "loss": 3.3263, "step": 20940 }, { "epoch": 3.943158290984378, "grad_norm": 2.7204387187957764, "learning_rate": 0.00010568417090156222, "loss": 2.9551, "step": 20950 }, { "epoch": 3.9450404667795973, "grad_norm": 2.7319447994232178, "learning_rate": 0.00010549595332204028, "loss": 3.5181, "step": 20960 }, { "epoch": 3.9469226425748163, "grad_norm": 3.1706159114837646, "learning_rate": 0.00010530773574251836, "loss": 3.3221, "step": 20970 }, { "epoch": 3.9488048183700357, "grad_norm": 2.647839307785034, "learning_rate": 0.00010511951816299642, "loss": 3.5333, "step": 20980 }, { "epoch": 3.950686994165255, "grad_norm": 3.027841329574585, "learning_rate": 0.0001049313005834745, "loss": 3.2927, "step": 20990 }, { "epoch": 3.9525691699604746, "grad_norm": 2.752403974533081, "learning_rate": 0.00010474308300395257, "loss": 3.2126, "step": 21000 }, { "epoch": 3.9544513457556936, "grad_norm": 2.5150058269500732, "learning_rate": 0.00010455486542443065, "loss": 3.5071, "step": 21010 }, { "epoch": 3.956333521550913, "grad_norm": 3.4964075088500977, "learning_rate": 0.00010436664784490872, "loss": 3.2479, "step": 21020 }, { "epoch": 3.958215697346132, "grad_norm": 2.55086612701416, "learning_rate": 0.00010417843026538679, "loss": 3.3188, "step": 21030 }, { "epoch": 3.9600978731413514, "grad_norm": 3.518401622772217, "learning_rate": 0.00010399021268586486, "loss": 3.1695, "step": 21040 }, { "epoch": 3.961980048936571, "grad_norm": 2.718132972717285, "learning_rate": 0.00010380199510634293, "loss": 3.3029, "step": 21050 }, { "epoch": 3.96386222473179, "grad_norm": 3.0271036624908447, "learning_rate": 0.00010361377752682102, "loss": 3.0488, "step": 21060 }, { "epoch": 3.9657444005270093, "grad_norm": 2.508157968521118, "learning_rate": 0.00010342555994729908, "loss": 3.2239, "step": 21070 }, { "epoch": 3.9676265763222283, "grad_norm": 3.0764923095703125, "learning_rate": 0.00010323734236777716, "loss": 3.2407, "step": 21080 }, { "epoch": 3.9695087521174477, "grad_norm": 2.797504425048828, "learning_rate": 0.00010304912478825522, "loss": 3.5661, "step": 21090 }, { "epoch": 3.971390927912667, "grad_norm": 3.5259289741516113, "learning_rate": 0.0001028609072087333, "loss": 3.7717, "step": 21100 }, { "epoch": 3.9732731037078866, "grad_norm": 2.6272242069244385, "learning_rate": 0.00010267268962921137, "loss": 3.4713, "step": 21110 }, { "epoch": 3.9751552795031055, "grad_norm": 2.6657400131225586, "learning_rate": 0.00010248447204968945, "loss": 3.1154, "step": 21120 }, { "epoch": 3.977037455298325, "grad_norm": 2.7080202102661133, "learning_rate": 0.00010229625447016752, "loss": 3.552, "step": 21130 }, { "epoch": 3.978919631093544, "grad_norm": 2.928029775619507, "learning_rate": 0.00010210803689064559, "loss": 3.0428, "step": 21140 }, { "epoch": 3.9808018068887634, "grad_norm": 3.023933172225952, "learning_rate": 0.00010191981931112366, "loss": 3.2027, "step": 21150 }, { "epoch": 3.982683982683983, "grad_norm": 2.8178048133850098, "learning_rate": 0.00010173160173160173, "loss": 3.3582, "step": 21160 }, { "epoch": 3.984566158479202, "grad_norm": 2.425347089767456, "learning_rate": 0.00010154338415207982, "loss": 3.1945, "step": 21170 }, { "epoch": 3.9864483342744212, "grad_norm": 4.833703994750977, "learning_rate": 0.00010135516657255788, "loss": 3.3424, "step": 21180 }, { "epoch": 3.9883305100696402, "grad_norm": 3.0012435913085938, "learning_rate": 0.00010116694899303596, "loss": 3.3914, "step": 21190 }, { "epoch": 3.9902126858648597, "grad_norm": 2.9188785552978516, "learning_rate": 0.00010097873141351402, "loss": 3.1256, "step": 21200 }, { "epoch": 3.992094861660079, "grad_norm": 2.435695171356201, "learning_rate": 0.0001007905138339921, "loss": 3.1203, "step": 21210 }, { "epoch": 3.9939770374552985, "grad_norm": 3.414642810821533, "learning_rate": 0.00010060229625447016, "loss": 3.371, "step": 21220 }, { "epoch": 3.9958592132505175, "grad_norm": 2.751316785812378, "learning_rate": 0.00010041407867494825, "loss": 3.0688, "step": 21230 }, { "epoch": 3.997741389045737, "grad_norm": 2.6124749183654785, "learning_rate": 0.00010022586109542632, "loss": 3.2969, "step": 21240 }, { "epoch": 3.999623564840956, "grad_norm": 2.908557653427124, "learning_rate": 0.00010003764351590439, "loss": 2.9505, "step": 21250 }, { "epoch": 4.0, "eval_accuracy": 0.20933333333333334, "eval_loss": 3.2325947284698486, "eval_runtime": 94.8421, "eval_samples_per_second": 79.079, "eval_steps_per_second": 9.89, "step": 21252 }, { "epoch": 4.001505740636175, "grad_norm": 2.891474962234497, "learning_rate": 9.984942593638246e-05, "loss": 3.1685, "step": 21260 }, { "epoch": 4.003387916431395, "grad_norm": 2.610499858856201, "learning_rate": 9.966120835686053e-05, "loss": 2.977, "step": 21270 }, { "epoch": 4.005270092226614, "grad_norm": 2.853829860687256, "learning_rate": 9.947299077733862e-05, "loss": 3.1707, "step": 21280 }, { "epoch": 4.007152268021834, "grad_norm": 3.7910237312316895, "learning_rate": 9.928477319781668e-05, "loss": 3.3487, "step": 21290 }, { "epoch": 4.009034443817052, "grad_norm": 3.1136045455932617, "learning_rate": 9.909655561829476e-05, "loss": 3.354, "step": 21300 }, { "epoch": 4.010916619612272, "grad_norm": 2.9077839851379395, "learning_rate": 9.890833803877282e-05, "loss": 3.208, "step": 21310 }, { "epoch": 4.012798795407491, "grad_norm": 2.4560554027557373, "learning_rate": 9.87201204592509e-05, "loss": 3.4798, "step": 21320 }, { "epoch": 4.0146809712027105, "grad_norm": 3.8288331031799316, "learning_rate": 9.853190287972896e-05, "loss": 3.3021, "step": 21330 }, { "epoch": 4.01656314699793, "grad_norm": 2.5452582836151123, "learning_rate": 9.834368530020705e-05, "loss": 3.2399, "step": 21340 }, { "epoch": 4.0184453227931485, "grad_norm": 3.3501482009887695, "learning_rate": 9.815546772068511e-05, "loss": 3.1027, "step": 21350 }, { "epoch": 4.020327498588368, "grad_norm": 2.5182175636291504, "learning_rate": 9.796725014116319e-05, "loss": 3.075, "step": 21360 }, { "epoch": 4.022209674383587, "grad_norm": 2.5452823638916016, "learning_rate": 9.777903256164126e-05, "loss": 3.3111, "step": 21370 }, { "epoch": 4.024091850178807, "grad_norm": 2.716010570526123, "learning_rate": 9.759081498211933e-05, "loss": 3.1905, "step": 21380 }, { "epoch": 4.025974025974026, "grad_norm": 2.6728432178497314, "learning_rate": 9.740259740259742e-05, "loss": 3.4892, "step": 21390 }, { "epoch": 4.027856201769246, "grad_norm": 3.552994728088379, "learning_rate": 9.721437982307548e-05, "loss": 3.2987, "step": 21400 }, { "epoch": 4.029738377564464, "grad_norm": 2.918489933013916, "learning_rate": 9.702616224355356e-05, "loss": 3.4667, "step": 21410 }, { "epoch": 4.031620553359684, "grad_norm": 3.082736015319824, "learning_rate": 9.683794466403162e-05, "loss": 3.2245, "step": 21420 }, { "epoch": 4.033502729154903, "grad_norm": 2.7363715171813965, "learning_rate": 9.66497270845097e-05, "loss": 3.3247, "step": 21430 }, { "epoch": 4.0353849049501225, "grad_norm": 2.707773447036743, "learning_rate": 9.646150950498776e-05, "loss": 3.3738, "step": 21440 }, { "epoch": 4.037267080745342, "grad_norm": 2.463040828704834, "learning_rate": 9.627329192546585e-05, "loss": 3.1729, "step": 21450 }, { "epoch": 4.0391492565405605, "grad_norm": 3.271688938140869, "learning_rate": 9.608507434594391e-05, "loss": 3.1545, "step": 21460 }, { "epoch": 4.04103143233578, "grad_norm": 2.990217685699463, "learning_rate": 9.589685676642199e-05, "loss": 2.9567, "step": 21470 }, { "epoch": 4.042913608130999, "grad_norm": 2.7719483375549316, "learning_rate": 9.570863918690006e-05, "loss": 3.0559, "step": 21480 }, { "epoch": 4.044795783926219, "grad_norm": 2.990938425064087, "learning_rate": 9.552042160737813e-05, "loss": 3.3036, "step": 21490 }, { "epoch": 4.046677959721438, "grad_norm": 2.75420880317688, "learning_rate": 9.53322040278562e-05, "loss": 3.2986, "step": 21500 }, { "epoch": 4.048560135516658, "grad_norm": 3.4968814849853516, "learning_rate": 9.514398644833428e-05, "loss": 3.2688, "step": 21510 }, { "epoch": 4.050442311311876, "grad_norm": 3.2569799423217773, "learning_rate": 9.495576886881236e-05, "loss": 3.388, "step": 21520 }, { "epoch": 4.052324487107096, "grad_norm": 3.2045726776123047, "learning_rate": 9.476755128929042e-05, "loss": 3.0191, "step": 21530 }, { "epoch": 4.054206662902315, "grad_norm": 2.2563700675964355, "learning_rate": 9.45793337097685e-05, "loss": 3.3321, "step": 21540 }, { "epoch": 4.0560888386975344, "grad_norm": 2.6916544437408447, "learning_rate": 9.439111613024656e-05, "loss": 3.5008, "step": 21550 }, { "epoch": 4.057971014492754, "grad_norm": 3.3092386722564697, "learning_rate": 9.420289855072465e-05, "loss": 3.1675, "step": 21560 }, { "epoch": 4.059853190287973, "grad_norm": 2.2328038215637207, "learning_rate": 9.401468097120271e-05, "loss": 3.2129, "step": 21570 }, { "epoch": 4.061735366083192, "grad_norm": 2.782254695892334, "learning_rate": 9.382646339168079e-05, "loss": 3.1073, "step": 21580 }, { "epoch": 4.063617541878411, "grad_norm": 2.7193851470947266, "learning_rate": 9.363824581215885e-05, "loss": 3.1614, "step": 21590 }, { "epoch": 4.065499717673631, "grad_norm": 2.849473714828491, "learning_rate": 9.345002823263693e-05, "loss": 2.7911, "step": 21600 }, { "epoch": 4.06738189346885, "grad_norm": 3.0582633018493652, "learning_rate": 9.3261810653115e-05, "loss": 3.5445, "step": 21610 }, { "epoch": 4.06926406926407, "grad_norm": 2.8459339141845703, "learning_rate": 9.307359307359308e-05, "loss": 3.1633, "step": 21620 }, { "epoch": 4.071146245059288, "grad_norm": 2.8063242435455322, "learning_rate": 9.288537549407116e-05, "loss": 3.3386, "step": 21630 }, { "epoch": 4.073028420854508, "grad_norm": 3.0272722244262695, "learning_rate": 9.269715791454922e-05, "loss": 3.1299, "step": 21640 }, { "epoch": 4.074910596649727, "grad_norm": 2.7872672080993652, "learning_rate": 9.25089403350273e-05, "loss": 3.2995, "step": 21650 }, { "epoch": 4.076792772444946, "grad_norm": 3.4354023933410645, "learning_rate": 9.232072275550536e-05, "loss": 3.205, "step": 21660 }, { "epoch": 4.078674948240166, "grad_norm": 2.8815746307373047, "learning_rate": 9.213250517598345e-05, "loss": 3.1771, "step": 21670 }, { "epoch": 4.080557124035385, "grad_norm": 3.5228142738342285, "learning_rate": 9.194428759646151e-05, "loss": 3.0383, "step": 21680 }, { "epoch": 4.082439299830604, "grad_norm": 3.062067985534668, "learning_rate": 9.175607001693959e-05, "loss": 3.3322, "step": 21690 }, { "epoch": 4.084321475625823, "grad_norm": 4.2620015144348145, "learning_rate": 9.156785243741765e-05, "loss": 3.1793, "step": 21700 }, { "epoch": 4.086203651421043, "grad_norm": 4.1625285148620605, "learning_rate": 9.137963485789573e-05, "loss": 3.3753, "step": 21710 }, { "epoch": 4.088085827216262, "grad_norm": 3.0724854469299316, "learning_rate": 9.11914172783738e-05, "loss": 3.183, "step": 21720 }, { "epoch": 4.089968003011482, "grad_norm": 4.273207664489746, "learning_rate": 9.100319969885188e-05, "loss": 3.0482, "step": 21730 }, { "epoch": 4.0918501788067, "grad_norm": 2.5469064712524414, "learning_rate": 9.081498211932996e-05, "loss": 3.234, "step": 21740 }, { "epoch": 4.0937323546019195, "grad_norm": 2.6905412673950195, "learning_rate": 9.062676453980802e-05, "loss": 3.2977, "step": 21750 }, { "epoch": 4.095614530397139, "grad_norm": 3.4132399559020996, "learning_rate": 9.04385469602861e-05, "loss": 3.3364, "step": 21760 }, { "epoch": 4.097496706192358, "grad_norm": 3.024416208267212, "learning_rate": 9.025032938076416e-05, "loss": 3.4897, "step": 21770 }, { "epoch": 4.099378881987578, "grad_norm": 2.8688106536865234, "learning_rate": 9.006211180124225e-05, "loss": 3.2479, "step": 21780 }, { "epoch": 4.101261057782797, "grad_norm": 4.9075398445129395, "learning_rate": 8.987389422172031e-05, "loss": 2.818, "step": 21790 }, { "epoch": 4.103143233578016, "grad_norm": 2.8078277111053467, "learning_rate": 8.968567664219839e-05, "loss": 3.1297, "step": 21800 }, { "epoch": 4.105025409373235, "grad_norm": 3.4160094261169434, "learning_rate": 8.949745906267645e-05, "loss": 3.3984, "step": 21810 }, { "epoch": 4.106907585168455, "grad_norm": 3.5791547298431396, "learning_rate": 8.930924148315453e-05, "loss": 2.9892, "step": 21820 }, { "epoch": 4.108789760963674, "grad_norm": 2.7223494052886963, "learning_rate": 8.912102390363259e-05, "loss": 3.1671, "step": 21830 }, { "epoch": 4.1106719367588935, "grad_norm": 3.5885062217712402, "learning_rate": 8.893280632411068e-05, "loss": 3.1588, "step": 21840 }, { "epoch": 4.112554112554113, "grad_norm": 3.027226686477661, "learning_rate": 8.874458874458876e-05, "loss": 3.1116, "step": 21850 }, { "epoch": 4.1144362883493315, "grad_norm": 2.6472368240356445, "learning_rate": 8.855637116506682e-05, "loss": 3.3602, "step": 21860 }, { "epoch": 4.116318464144551, "grad_norm": 3.1893270015716553, "learning_rate": 8.83681535855449e-05, "loss": 3.0538, "step": 21870 }, { "epoch": 4.11820063993977, "grad_norm": 3.10514760017395, "learning_rate": 8.817993600602296e-05, "loss": 3.4478, "step": 21880 }, { "epoch": 4.12008281573499, "grad_norm": 2.870718002319336, "learning_rate": 8.799171842650105e-05, "loss": 3.3483, "step": 21890 }, { "epoch": 4.121964991530209, "grad_norm": 2.7556862831115723, "learning_rate": 8.780350084697911e-05, "loss": 3.0957, "step": 21900 }, { "epoch": 4.123847167325428, "grad_norm": 3.0777435302734375, "learning_rate": 8.761528326745719e-05, "loss": 3.0024, "step": 21910 }, { "epoch": 4.125729343120647, "grad_norm": 4.2900567054748535, "learning_rate": 8.742706568793525e-05, "loss": 3.3981, "step": 21920 }, { "epoch": 4.127611518915867, "grad_norm": 3.218428373336792, "learning_rate": 8.723884810841333e-05, "loss": 3.1353, "step": 21930 }, { "epoch": 4.129493694711086, "grad_norm": 3.2349698543548584, "learning_rate": 8.705063052889139e-05, "loss": 3.1707, "step": 21940 }, { "epoch": 4.1313758705063055, "grad_norm": 3.2318453788757324, "learning_rate": 8.686241294936948e-05, "loss": 3.1009, "step": 21950 }, { "epoch": 4.133258046301525, "grad_norm": 3.21386456489563, "learning_rate": 8.667419536984754e-05, "loss": 3.1957, "step": 21960 }, { "epoch": 4.1351402220967435, "grad_norm": 2.730018138885498, "learning_rate": 8.648597779032562e-05, "loss": 3.1705, "step": 21970 }, { "epoch": 4.137022397891963, "grad_norm": 3.171736001968384, "learning_rate": 8.62977602108037e-05, "loss": 3.7192, "step": 21980 }, { "epoch": 4.138904573687182, "grad_norm": 3.063369035720825, "learning_rate": 8.610954263128176e-05, "loss": 3.3647, "step": 21990 }, { "epoch": 4.140786749482402, "grad_norm": 3.849029779434204, "learning_rate": 8.592132505175985e-05, "loss": 3.2821, "step": 22000 }, { "epoch": 4.142668925277621, "grad_norm": 4.603402614593506, "learning_rate": 8.573310747223791e-05, "loss": 3.2007, "step": 22010 }, { "epoch": 4.144551101072841, "grad_norm": 2.3862357139587402, "learning_rate": 8.554488989271599e-05, "loss": 3.3233, "step": 22020 }, { "epoch": 4.146433276868059, "grad_norm": 3.104445457458496, "learning_rate": 8.535667231319405e-05, "loss": 2.9834, "step": 22030 }, { "epoch": 4.148315452663279, "grad_norm": 3.098273515701294, "learning_rate": 8.516845473367213e-05, "loss": 3.2513, "step": 22040 }, { "epoch": 4.150197628458498, "grad_norm": 3.133579730987549, "learning_rate": 8.498023715415019e-05, "loss": 3.2901, "step": 22050 }, { "epoch": 4.1520798042537175, "grad_norm": 2.9663634300231934, "learning_rate": 8.479201957462828e-05, "loss": 3.4831, "step": 22060 }, { "epoch": 4.153961980048937, "grad_norm": 2.842804193496704, "learning_rate": 8.460380199510634e-05, "loss": 3.1227, "step": 22070 }, { "epoch": 4.1558441558441555, "grad_norm": 3.0301401615142822, "learning_rate": 8.441558441558442e-05, "loss": 3.1276, "step": 22080 }, { "epoch": 4.157726331639375, "grad_norm": 3.2375950813293457, "learning_rate": 8.42273668360625e-05, "loss": 3.3902, "step": 22090 }, { "epoch": 4.159608507434594, "grad_norm": 2.83935809135437, "learning_rate": 8.403914925654056e-05, "loss": 3.5444, "step": 22100 }, { "epoch": 4.161490683229814, "grad_norm": 2.878446578979492, "learning_rate": 8.385093167701865e-05, "loss": 3.2114, "step": 22110 }, { "epoch": 4.163372859025033, "grad_norm": 2.6201987266540527, "learning_rate": 8.366271409749671e-05, "loss": 3.2031, "step": 22120 }, { "epoch": 4.165255034820253, "grad_norm": 3.6080262660980225, "learning_rate": 8.347449651797479e-05, "loss": 3.3242, "step": 22130 }, { "epoch": 4.167137210615471, "grad_norm": 4.1051411628723145, "learning_rate": 8.328627893845285e-05, "loss": 3.4341, "step": 22140 }, { "epoch": 4.169019386410691, "grad_norm": 4.344814300537109, "learning_rate": 8.309806135893093e-05, "loss": 3.2216, "step": 22150 }, { "epoch": 4.17090156220591, "grad_norm": 3.445652723312378, "learning_rate": 8.290984377940899e-05, "loss": 2.9959, "step": 22160 }, { "epoch": 4.1727837380011294, "grad_norm": 3.2097978591918945, "learning_rate": 8.272162619988708e-05, "loss": 3.2192, "step": 22170 }, { "epoch": 4.174665913796349, "grad_norm": 3.208712339401245, "learning_rate": 8.253340862036514e-05, "loss": 3.1598, "step": 22180 }, { "epoch": 4.176548089591567, "grad_norm": 3.730483055114746, "learning_rate": 8.234519104084322e-05, "loss": 3.4319, "step": 22190 }, { "epoch": 4.178430265386787, "grad_norm": 3.2604448795318604, "learning_rate": 8.215697346132128e-05, "loss": 3.0354, "step": 22200 }, { "epoch": 4.180312441182006, "grad_norm": 2.9703927040100098, "learning_rate": 8.196875588179936e-05, "loss": 3.4206, "step": 22210 }, { "epoch": 4.182194616977226, "grad_norm": 3.236666440963745, "learning_rate": 8.178053830227745e-05, "loss": 3.3407, "step": 22220 }, { "epoch": 4.184076792772445, "grad_norm": 3.3799564838409424, "learning_rate": 8.159232072275551e-05, "loss": 2.9908, "step": 22230 }, { "epoch": 4.185958968567665, "grad_norm": 3.3381948471069336, "learning_rate": 8.140410314323359e-05, "loss": 3.1457, "step": 22240 }, { "epoch": 4.187841144362883, "grad_norm": 2.3282175064086914, "learning_rate": 8.121588556371165e-05, "loss": 3.0359, "step": 22250 }, { "epoch": 4.189723320158103, "grad_norm": 2.6722915172576904, "learning_rate": 8.102766798418973e-05, "loss": 3.6484, "step": 22260 }, { "epoch": 4.191605495953322, "grad_norm": 3.148141622543335, "learning_rate": 8.083945040466779e-05, "loss": 3.3126, "step": 22270 }, { "epoch": 4.193487671748541, "grad_norm": 2.9252452850341797, "learning_rate": 8.065123282514588e-05, "loss": 3.1325, "step": 22280 }, { "epoch": 4.195369847543761, "grad_norm": 3.091193437576294, "learning_rate": 8.046301524562394e-05, "loss": 3.4042, "step": 22290 }, { "epoch": 4.19725202333898, "grad_norm": 3.0783419609069824, "learning_rate": 8.027479766610202e-05, "loss": 2.6655, "step": 22300 }, { "epoch": 4.199134199134199, "grad_norm": 3.132167100906372, "learning_rate": 8.008658008658008e-05, "loss": 3.463, "step": 22310 }, { "epoch": 4.201016374929418, "grad_norm": 3.468315601348877, "learning_rate": 7.989836250705816e-05, "loss": 3.6242, "step": 22320 }, { "epoch": 4.202898550724638, "grad_norm": 3.142845392227173, "learning_rate": 7.971014492753622e-05, "loss": 3.2898, "step": 22330 }, { "epoch": 4.204780726519857, "grad_norm": 2.87841534614563, "learning_rate": 7.952192734801431e-05, "loss": 3.0523, "step": 22340 }, { "epoch": 4.206662902315077, "grad_norm": 2.8895421028137207, "learning_rate": 7.933370976849239e-05, "loss": 3.2179, "step": 22350 }, { "epoch": 4.208545078110295, "grad_norm": 3.0601487159729004, "learning_rate": 7.914549218897045e-05, "loss": 3.3932, "step": 22360 }, { "epoch": 4.2104272539055145, "grad_norm": 2.776848793029785, "learning_rate": 7.895727460944853e-05, "loss": 3.21, "step": 22370 }, { "epoch": 4.212309429700734, "grad_norm": 3.218050718307495, "learning_rate": 7.876905702992659e-05, "loss": 3.3461, "step": 22380 }, { "epoch": 4.214191605495953, "grad_norm": 2.931539535522461, "learning_rate": 7.858083945040468e-05, "loss": 3.0586, "step": 22390 }, { "epoch": 4.216073781291173, "grad_norm": 3.7356948852539062, "learning_rate": 7.839262187088274e-05, "loss": 3.2584, "step": 22400 }, { "epoch": 4.217955957086392, "grad_norm": 2.919062852859497, "learning_rate": 7.820440429136082e-05, "loss": 3.0511, "step": 22410 }, { "epoch": 4.219838132881611, "grad_norm": 3.2050087451934814, "learning_rate": 7.801618671183888e-05, "loss": 3.26, "step": 22420 }, { "epoch": 4.22172030867683, "grad_norm": 3.362515926361084, "learning_rate": 7.782796913231696e-05, "loss": 3.176, "step": 22430 }, { "epoch": 4.22360248447205, "grad_norm": 2.7202816009521484, "learning_rate": 7.763975155279502e-05, "loss": 3.3341, "step": 22440 }, { "epoch": 4.225484660267269, "grad_norm": 2.699904203414917, "learning_rate": 7.745153397327311e-05, "loss": 3.4156, "step": 22450 }, { "epoch": 4.2273668360624885, "grad_norm": 3.7537598609924316, "learning_rate": 7.726331639375119e-05, "loss": 3.239, "step": 22460 }, { "epoch": 4.229249011857707, "grad_norm": 2.781452178955078, "learning_rate": 7.707509881422925e-05, "loss": 3.1356, "step": 22470 }, { "epoch": 4.2311311876529265, "grad_norm": 4.503787994384766, "learning_rate": 7.688688123470733e-05, "loss": 3.3394, "step": 22480 }, { "epoch": 4.233013363448146, "grad_norm": 2.8764593601226807, "learning_rate": 7.669866365518539e-05, "loss": 3.3024, "step": 22490 }, { "epoch": 4.234895539243365, "grad_norm": 2.42519211769104, "learning_rate": 7.651044607566348e-05, "loss": 3.2218, "step": 22500 }, { "epoch": 4.236777715038585, "grad_norm": 3.315824508666992, "learning_rate": 7.632222849614154e-05, "loss": 2.7431, "step": 22510 }, { "epoch": 4.238659890833804, "grad_norm": 2.6028430461883545, "learning_rate": 7.613401091661962e-05, "loss": 3.4724, "step": 22520 }, { "epoch": 4.240542066629023, "grad_norm": 3.285188674926758, "learning_rate": 7.594579333709768e-05, "loss": 3.3218, "step": 22530 }, { "epoch": 4.242424242424242, "grad_norm": 3.028482675552368, "learning_rate": 7.575757575757576e-05, "loss": 3.4616, "step": 22540 }, { "epoch": 4.244306418219462, "grad_norm": 3.097353219985962, "learning_rate": 7.556935817805382e-05, "loss": 3.359, "step": 22550 }, { "epoch": 4.246188594014681, "grad_norm": 3.4922540187835693, "learning_rate": 7.538114059853191e-05, "loss": 3.0898, "step": 22560 }, { "epoch": 4.2480707698099005, "grad_norm": 2.766566753387451, "learning_rate": 7.519292301900997e-05, "loss": 3.1267, "step": 22570 }, { "epoch": 4.24995294560512, "grad_norm": 4.96455717086792, "learning_rate": 7.500470543948805e-05, "loss": 3.3342, "step": 22580 }, { "epoch": 4.2518351214003385, "grad_norm": 4.132570743560791, "learning_rate": 7.481648785996613e-05, "loss": 3.0486, "step": 22590 }, { "epoch": 4.253717297195558, "grad_norm": 2.6853671073913574, "learning_rate": 7.462827028044419e-05, "loss": 3.0899, "step": 22600 }, { "epoch": 4.255599472990777, "grad_norm": 2.830772638320923, "learning_rate": 7.444005270092228e-05, "loss": 3.3514, "step": 22610 }, { "epoch": 4.257481648785997, "grad_norm": 3.1268768310546875, "learning_rate": 7.425183512140034e-05, "loss": 3.2823, "step": 22620 }, { "epoch": 4.259363824581216, "grad_norm": 3.380945920944214, "learning_rate": 7.406361754187842e-05, "loss": 3.5613, "step": 22630 }, { "epoch": 4.261246000376435, "grad_norm": 2.9997637271881104, "learning_rate": 7.387539996235648e-05, "loss": 3.1986, "step": 22640 }, { "epoch": 4.263128176171654, "grad_norm": 3.073002338409424, "learning_rate": 7.368718238283456e-05, "loss": 3.122, "step": 22650 }, { "epoch": 4.265010351966874, "grad_norm": 2.674787759780884, "learning_rate": 7.349896480331262e-05, "loss": 3.1517, "step": 22660 }, { "epoch": 4.266892527762093, "grad_norm": 2.988823652267456, "learning_rate": 7.331074722379071e-05, "loss": 3.3729, "step": 22670 }, { "epoch": 4.2687747035573125, "grad_norm": 2.2312140464782715, "learning_rate": 7.312252964426877e-05, "loss": 3.2023, "step": 22680 }, { "epoch": 4.270656879352532, "grad_norm": 2.4428207874298096, "learning_rate": 7.293431206474685e-05, "loss": 3.0415, "step": 22690 }, { "epoch": 4.2725390551477505, "grad_norm": 2.431781530380249, "learning_rate": 7.274609448522493e-05, "loss": 3.0563, "step": 22700 }, { "epoch": 4.27442123094297, "grad_norm": 2.9765193462371826, "learning_rate": 7.255787690570299e-05, "loss": 3.0556, "step": 22710 }, { "epoch": 4.276303406738189, "grad_norm": 2.8088014125823975, "learning_rate": 7.236965932618108e-05, "loss": 3.443, "step": 22720 }, { "epoch": 4.278185582533409, "grad_norm": 2.609876871109009, "learning_rate": 7.218144174665914e-05, "loss": 3.1875, "step": 22730 }, { "epoch": 4.280067758328628, "grad_norm": 3.226731300354004, "learning_rate": 7.199322416713722e-05, "loss": 3.415, "step": 22740 }, { "epoch": 4.281949934123848, "grad_norm": 2.7580599784851074, "learning_rate": 7.180500658761528e-05, "loss": 3.058, "step": 22750 }, { "epoch": 4.283832109919066, "grad_norm": 2.7163636684417725, "learning_rate": 7.161678900809336e-05, "loss": 3.0862, "step": 22760 }, { "epoch": 4.285714285714286, "grad_norm": 2.734267473220825, "learning_rate": 7.142857142857142e-05, "loss": 3.2912, "step": 22770 }, { "epoch": 4.287596461509505, "grad_norm": 3.4720916748046875, "learning_rate": 7.124035384904951e-05, "loss": 3.1295, "step": 22780 }, { "epoch": 4.2894786373047245, "grad_norm": 2.542668104171753, "learning_rate": 7.105213626952757e-05, "loss": 3.3263, "step": 22790 }, { "epoch": 4.291360813099944, "grad_norm": 2.874943494796753, "learning_rate": 7.086391869000565e-05, "loss": 3.111, "step": 22800 }, { "epoch": 4.293242988895162, "grad_norm": 3.1843760013580322, "learning_rate": 7.067570111048371e-05, "loss": 3.1787, "step": 22810 }, { "epoch": 4.295125164690382, "grad_norm": 2.4640989303588867, "learning_rate": 7.048748353096179e-05, "loss": 3.0852, "step": 22820 }, { "epoch": 4.297007340485601, "grad_norm": 2.776172161102295, "learning_rate": 7.029926595143988e-05, "loss": 2.6763, "step": 22830 }, { "epoch": 4.298889516280821, "grad_norm": 3.5256826877593994, "learning_rate": 7.011104837191794e-05, "loss": 2.8748, "step": 22840 }, { "epoch": 4.30077169207604, "grad_norm": 3.026275396347046, "learning_rate": 6.992283079239602e-05, "loss": 3.1198, "step": 22850 }, { "epoch": 4.30265386787126, "grad_norm": 3.187401533126831, "learning_rate": 6.973461321287408e-05, "loss": 3.2494, "step": 22860 }, { "epoch": 4.304536043666478, "grad_norm": 2.384568452835083, "learning_rate": 6.954639563335216e-05, "loss": 3.1702, "step": 22870 }, { "epoch": 4.306418219461698, "grad_norm": 3.4012176990509033, "learning_rate": 6.935817805383022e-05, "loss": 3.0068, "step": 22880 }, { "epoch": 4.308300395256917, "grad_norm": 3.3398947715759277, "learning_rate": 6.916996047430831e-05, "loss": 3.1857, "step": 22890 }, { "epoch": 4.310182571052136, "grad_norm": 2.42055606842041, "learning_rate": 6.898174289478637e-05, "loss": 3.0783, "step": 22900 }, { "epoch": 4.312064746847356, "grad_norm": 3.519979476928711, "learning_rate": 6.879352531526445e-05, "loss": 3.2145, "step": 22910 }, { "epoch": 4.313946922642575, "grad_norm": 3.9459290504455566, "learning_rate": 6.860530773574251e-05, "loss": 3.5089, "step": 22920 }, { "epoch": 4.315829098437794, "grad_norm": 2.7116644382476807, "learning_rate": 6.841709015622059e-05, "loss": 3.2327, "step": 22930 }, { "epoch": 4.317711274233013, "grad_norm": 3.385389566421509, "learning_rate": 6.822887257669865e-05, "loss": 3.3809, "step": 22940 }, { "epoch": 4.319593450028233, "grad_norm": 3.4308905601501465, "learning_rate": 6.804065499717674e-05, "loss": 3.067, "step": 22950 }, { "epoch": 4.321475625823452, "grad_norm": 2.6280534267425537, "learning_rate": 6.785243741765482e-05, "loss": 3.3243, "step": 22960 }, { "epoch": 4.323357801618672, "grad_norm": 2.3873131275177, "learning_rate": 6.766421983813288e-05, "loss": 2.8105, "step": 22970 }, { "epoch": 4.32523997741389, "grad_norm": 2.873793601989746, "learning_rate": 6.747600225861096e-05, "loss": 3.563, "step": 22980 }, { "epoch": 4.3271221532091095, "grad_norm": 3.03075909614563, "learning_rate": 6.728778467908902e-05, "loss": 3.2434, "step": 22990 }, { "epoch": 4.329004329004329, "grad_norm": 3.1492440700531006, "learning_rate": 6.709956709956711e-05, "loss": 3.1969, "step": 23000 }, { "epoch": 4.330886504799548, "grad_norm": 3.263035774230957, "learning_rate": 6.691134952004517e-05, "loss": 3.1421, "step": 23010 }, { "epoch": 4.332768680594768, "grad_norm": 3.79087495803833, "learning_rate": 6.672313194052325e-05, "loss": 3.2209, "step": 23020 }, { "epoch": 4.334650856389986, "grad_norm": 3.310776710510254, "learning_rate": 6.653491436100131e-05, "loss": 3.321, "step": 23030 }, { "epoch": 4.336533032185206, "grad_norm": 3.0996947288513184, "learning_rate": 6.634669678147939e-05, "loss": 3.0465, "step": 23040 }, { "epoch": 4.338415207980425, "grad_norm": 3.124666452407837, "learning_rate": 6.615847920195745e-05, "loss": 3.2509, "step": 23050 }, { "epoch": 4.340297383775645, "grad_norm": 2.6973137855529785, "learning_rate": 6.597026162243554e-05, "loss": 3.1465, "step": 23060 }, { "epoch": 4.342179559570864, "grad_norm": 2.4782652854919434, "learning_rate": 6.578204404291362e-05, "loss": 3.3048, "step": 23070 }, { "epoch": 4.3440617353660835, "grad_norm": 3.4553232192993164, "learning_rate": 6.559382646339168e-05, "loss": 3.2484, "step": 23080 }, { "epoch": 4.345943911161302, "grad_norm": 2.5292747020721436, "learning_rate": 6.540560888386976e-05, "loss": 3.2178, "step": 23090 }, { "epoch": 4.3478260869565215, "grad_norm": 2.555924892425537, "learning_rate": 6.521739130434782e-05, "loss": 3.1644, "step": 23100 }, { "epoch": 4.349708262751741, "grad_norm": 2.9614477157592773, "learning_rate": 6.502917372482591e-05, "loss": 3.2833, "step": 23110 }, { "epoch": 4.35159043854696, "grad_norm": 2.6555614471435547, "learning_rate": 6.484095614530397e-05, "loss": 3.3832, "step": 23120 }, { "epoch": 4.35347261434218, "grad_norm": 2.8622424602508545, "learning_rate": 6.465273856578205e-05, "loss": 3.1168, "step": 23130 }, { "epoch": 4.355354790137399, "grad_norm": 4.439013481140137, "learning_rate": 6.446452098626011e-05, "loss": 3.0878, "step": 23140 }, { "epoch": 4.357236965932618, "grad_norm": 4.426267147064209, "learning_rate": 6.427630340673819e-05, "loss": 3.3674, "step": 23150 }, { "epoch": 4.359119141727837, "grad_norm": 3.5063087940216064, "learning_rate": 6.408808582721625e-05, "loss": 3.321, "step": 23160 }, { "epoch": 4.361001317523057, "grad_norm": 3.535888433456421, "learning_rate": 6.389986824769434e-05, "loss": 3.2064, "step": 23170 }, { "epoch": 4.362883493318276, "grad_norm": 2.365108013153076, "learning_rate": 6.37116506681724e-05, "loss": 3.0767, "step": 23180 }, { "epoch": 4.3647656691134955, "grad_norm": 2.9245288372039795, "learning_rate": 6.352343308865048e-05, "loss": 3.1926, "step": 23190 }, { "epoch": 4.366647844908714, "grad_norm": 3.0852577686309814, "learning_rate": 6.333521550912856e-05, "loss": 3.4855, "step": 23200 }, { "epoch": 4.3685300207039335, "grad_norm": 3.742438316345215, "learning_rate": 6.314699792960662e-05, "loss": 3.4457, "step": 23210 }, { "epoch": 4.370412196499153, "grad_norm": 2.9332962036132812, "learning_rate": 6.295878035008471e-05, "loss": 3.053, "step": 23220 }, { "epoch": 4.372294372294372, "grad_norm": 2.669461250305176, "learning_rate": 6.277056277056277e-05, "loss": 3.2083, "step": 23230 }, { "epoch": 4.374176548089592, "grad_norm": 2.6461141109466553, "learning_rate": 6.258234519104085e-05, "loss": 3.1934, "step": 23240 }, { "epoch": 4.376058723884811, "grad_norm": 4.071197986602783, "learning_rate": 6.239412761151891e-05, "loss": 3.2333, "step": 23250 }, { "epoch": 4.37794089968003, "grad_norm": 3.3596227169036865, "learning_rate": 6.220591003199699e-05, "loss": 3.3347, "step": 23260 }, { "epoch": 4.379823075475249, "grad_norm": 2.816563129425049, "learning_rate": 6.201769245247507e-05, "loss": 3.2306, "step": 23270 }, { "epoch": 4.381705251270469, "grad_norm": 2.9401190280914307, "learning_rate": 6.182947487295314e-05, "loss": 3.4218, "step": 23280 }, { "epoch": 4.383587427065688, "grad_norm": 2.8984081745147705, "learning_rate": 6.16412572934312e-05, "loss": 3.3952, "step": 23290 }, { "epoch": 4.3854696028609075, "grad_norm": 2.7690205574035645, "learning_rate": 6.145303971390928e-05, "loss": 3.0585, "step": 23300 }, { "epoch": 4.387351778656127, "grad_norm": 2.664947271347046, "learning_rate": 6.126482213438736e-05, "loss": 3.1813, "step": 23310 }, { "epoch": 4.3892339544513455, "grad_norm": 3.077943801879883, "learning_rate": 6.107660455486542e-05, "loss": 3.1159, "step": 23320 }, { "epoch": 4.391116130246565, "grad_norm": 3.109022378921509, "learning_rate": 6.08883869753435e-05, "loss": 3.1827, "step": 23330 }, { "epoch": 4.392998306041784, "grad_norm": 2.6905243396759033, "learning_rate": 6.0700169395821566e-05, "loss": 2.7874, "step": 23340 }, { "epoch": 4.394880481837004, "grad_norm": 2.7580599784851074, "learning_rate": 6.051195181629964e-05, "loss": 3.0988, "step": 23350 }, { "epoch": 4.396762657632223, "grad_norm": 2.6568217277526855, "learning_rate": 6.032373423677771e-05, "loss": 3.0959, "step": 23360 }, { "epoch": 4.398644833427442, "grad_norm": 3.559309720993042, "learning_rate": 6.013551665725579e-05, "loss": 3.3385, "step": 23370 }, { "epoch": 4.400527009222661, "grad_norm": 2.7499680519104004, "learning_rate": 5.9947299077733865e-05, "loss": 3.2577, "step": 23380 }, { "epoch": 4.402409185017881, "grad_norm": 3.9470832347869873, "learning_rate": 5.9759081498211935e-05, "loss": 3.2105, "step": 23390 }, { "epoch": 4.4042913608131, "grad_norm": 2.9097394943237305, "learning_rate": 5.957086391869001e-05, "loss": 3.0114, "step": 23400 }, { "epoch": 4.4061735366083195, "grad_norm": 3.433927536010742, "learning_rate": 5.938264633916808e-05, "loss": 2.8237, "step": 23410 }, { "epoch": 4.408055712403539, "grad_norm": 3.0882952213287354, "learning_rate": 5.919442875964615e-05, "loss": 3.2526, "step": 23420 }, { "epoch": 4.409937888198757, "grad_norm": 4.191737651824951, "learning_rate": 5.900621118012423e-05, "loss": 3.1569, "step": 23430 }, { "epoch": 4.411820063993977, "grad_norm": 3.4990599155426025, "learning_rate": 5.88179936006023e-05, "loss": 3.2241, "step": 23440 }, { "epoch": 4.413702239789196, "grad_norm": 3.3133044242858887, "learning_rate": 5.8629776021080366e-05, "loss": 3.0933, "step": 23450 }, { "epoch": 4.415584415584416, "grad_norm": 3.126660108566284, "learning_rate": 5.844155844155844e-05, "loss": 3.3786, "step": 23460 }, { "epoch": 4.417466591379635, "grad_norm": 2.7241156101226807, "learning_rate": 5.825334086203651e-05, "loss": 3.2118, "step": 23470 }, { "epoch": 4.419348767174855, "grad_norm": 2.985119581222534, "learning_rate": 5.806512328251459e-05, "loss": 3.2246, "step": 23480 }, { "epoch": 4.421230942970073, "grad_norm": 2.953031301498413, "learning_rate": 5.787690570299266e-05, "loss": 3.2377, "step": 23490 }, { "epoch": 4.423113118765293, "grad_norm": 2.686232089996338, "learning_rate": 5.7688688123470735e-05, "loss": 3.2919, "step": 23500 }, { "epoch": 4.424995294560512, "grad_norm": 2.8897335529327393, "learning_rate": 5.750047054394881e-05, "loss": 3.4231, "step": 23510 }, { "epoch": 4.426877470355731, "grad_norm": 2.7605607509613037, "learning_rate": 5.731225296442688e-05, "loss": 3.3684, "step": 23520 }, { "epoch": 4.428759646150951, "grad_norm": 3.12101674079895, "learning_rate": 5.712403538490495e-05, "loss": 3.1358, "step": 23530 }, { "epoch": 4.430641821946169, "grad_norm": 3.256334066390991, "learning_rate": 5.693581780538303e-05, "loss": 3.0332, "step": 23540 }, { "epoch": 4.432523997741389, "grad_norm": 4.081264019012451, "learning_rate": 5.67476002258611e-05, "loss": 3.0865, "step": 23550 }, { "epoch": 4.434406173536608, "grad_norm": 3.149183511734009, "learning_rate": 5.6559382646339166e-05, "loss": 3.2729, "step": 23560 }, { "epoch": 4.436288349331828, "grad_norm": 7.6489105224609375, "learning_rate": 5.637116506681724e-05, "loss": 3.2411, "step": 23570 }, { "epoch": 4.438170525127047, "grad_norm": 2.61975359916687, "learning_rate": 5.618294748729531e-05, "loss": 3.3824, "step": 23580 }, { "epoch": 4.440052700922266, "grad_norm": 2.765589475631714, "learning_rate": 5.599472990777338e-05, "loss": 3.3724, "step": 23590 }, { "epoch": 4.441934876717485, "grad_norm": 3.076176881790161, "learning_rate": 5.580651232825146e-05, "loss": 3.197, "step": 23600 }, { "epoch": 4.4438170525127045, "grad_norm": 3.716524600982666, "learning_rate": 5.561829474872953e-05, "loss": 3.4739, "step": 23610 }, { "epoch": 4.445699228307924, "grad_norm": 3.2613918781280518, "learning_rate": 5.5430077169207605e-05, "loss": 3.2186, "step": 23620 }, { "epoch": 4.447581404103143, "grad_norm": 3.2381677627563477, "learning_rate": 5.524185958968568e-05, "loss": 3.1227, "step": 23630 }, { "epoch": 4.449463579898363, "grad_norm": 3.682084083557129, "learning_rate": 5.505364201016375e-05, "loss": 2.9952, "step": 23640 }, { "epoch": 4.451345755693581, "grad_norm": 3.6691043376922607, "learning_rate": 5.486542443064183e-05, "loss": 3.2097, "step": 23650 }, { "epoch": 4.453227931488801, "grad_norm": 2.951138973236084, "learning_rate": 5.46772068511199e-05, "loss": 3.339, "step": 23660 }, { "epoch": 4.45511010728402, "grad_norm": 2.816178321838379, "learning_rate": 5.4488989271597966e-05, "loss": 3.0707, "step": 23670 }, { "epoch": 4.45699228307924, "grad_norm": 2.796182870864868, "learning_rate": 5.430077169207604e-05, "loss": 3.2437, "step": 23680 }, { "epoch": 4.458874458874459, "grad_norm": 2.747709274291992, "learning_rate": 5.411255411255411e-05, "loss": 2.9044, "step": 23690 }, { "epoch": 4.4607566346696785, "grad_norm": 7.65395975112915, "learning_rate": 5.392433653303218e-05, "loss": 2.8958, "step": 23700 }, { "epoch": 4.462638810464897, "grad_norm": 2.6658778190612793, "learning_rate": 5.373611895351026e-05, "loss": 3.4358, "step": 23710 }, { "epoch": 4.4645209862601165, "grad_norm": 3.1606075763702393, "learning_rate": 5.354790137398833e-05, "loss": 3.2838, "step": 23720 }, { "epoch": 4.466403162055336, "grad_norm": 3.0979714393615723, "learning_rate": 5.3359683794466405e-05, "loss": 3.4613, "step": 23730 }, { "epoch": 4.468285337850555, "grad_norm": 3.3036980628967285, "learning_rate": 5.3171466214944474e-05, "loss": 3.0426, "step": 23740 }, { "epoch": 4.470167513645775, "grad_norm": 2.8867993354797363, "learning_rate": 5.298324863542255e-05, "loss": 2.9376, "step": 23750 }, { "epoch": 4.472049689440993, "grad_norm": 3.0002734661102295, "learning_rate": 5.279503105590063e-05, "loss": 3.0216, "step": 23760 }, { "epoch": 4.473931865236213, "grad_norm": 3.7616660594940186, "learning_rate": 5.26068134763787e-05, "loss": 3.338, "step": 23770 }, { "epoch": 4.475814041031432, "grad_norm": 2.7143394947052, "learning_rate": 5.2418595896856766e-05, "loss": 3.2587, "step": 23780 }, { "epoch": 4.477696216826652, "grad_norm": 3.7299182415008545, "learning_rate": 5.223037831733484e-05, "loss": 2.9254, "step": 23790 }, { "epoch": 4.479578392621871, "grad_norm": 2.715489149093628, "learning_rate": 5.204216073781291e-05, "loss": 3.1811, "step": 23800 }, { "epoch": 4.4814605684170905, "grad_norm": 3.3261804580688477, "learning_rate": 5.185394315829098e-05, "loss": 3.3265, "step": 23810 }, { "epoch": 4.483342744212309, "grad_norm": 2.696645736694336, "learning_rate": 5.166572557876906e-05, "loss": 3.1978, "step": 23820 }, { "epoch": 4.4852249200075285, "grad_norm": 2.7663300037384033, "learning_rate": 5.147750799924713e-05, "loss": 2.9375, "step": 23830 }, { "epoch": 4.487107095802748, "grad_norm": 3.4459638595581055, "learning_rate": 5.1289290419725205e-05, "loss": 3.4786, "step": 23840 }, { "epoch": 4.488989271597967, "grad_norm": 2.642287492752075, "learning_rate": 5.1101072840203274e-05, "loss": 3.0888, "step": 23850 }, { "epoch": 4.490871447393187, "grad_norm": 2.4454216957092285, "learning_rate": 5.0912855260681344e-05, "loss": 3.0148, "step": 23860 }, { "epoch": 4.492753623188406, "grad_norm": 2.8533213138580322, "learning_rate": 5.072463768115943e-05, "loss": 3.3603, "step": 23870 }, { "epoch": 4.494635798983625, "grad_norm": 2.8486480712890625, "learning_rate": 5.05364201016375e-05, "loss": 3.0267, "step": 23880 }, { "epoch": 4.496517974778844, "grad_norm": 3.5169730186462402, "learning_rate": 5.0348202522115566e-05, "loss": 2.9836, "step": 23890 }, { "epoch": 4.498400150574064, "grad_norm": 4.43604040145874, "learning_rate": 5.015998494259364e-05, "loss": 3.2031, "step": 23900 }, { "epoch": 4.500282326369283, "grad_norm": 2.625504493713379, "learning_rate": 4.997176736307171e-05, "loss": 3.1256, "step": 23910 }, { "epoch": 4.5021645021645025, "grad_norm": 2.9266598224639893, "learning_rate": 4.978354978354978e-05, "loss": 3.1863, "step": 23920 }, { "epoch": 4.504046677959721, "grad_norm": 4.946619987487793, "learning_rate": 4.959533220402786e-05, "loss": 3.4948, "step": 23930 }, { "epoch": 4.5059288537549405, "grad_norm": 3.0961945056915283, "learning_rate": 4.940711462450593e-05, "loss": 3.4399, "step": 23940 }, { "epoch": 4.50781102955016, "grad_norm": 2.682321548461914, "learning_rate": 4.9218897044984e-05, "loss": 3.3231, "step": 23950 }, { "epoch": 4.509693205345379, "grad_norm": 3.179645299911499, "learning_rate": 4.9030679465462074e-05, "loss": 3.2811, "step": 23960 }, { "epoch": 4.511575381140599, "grad_norm": 2.6911354064941406, "learning_rate": 4.8842461885940144e-05, "loss": 3.2388, "step": 23970 }, { "epoch": 4.513457556935818, "grad_norm": 3.0853166580200195, "learning_rate": 4.865424430641822e-05, "loss": 3.6197, "step": 23980 }, { "epoch": 4.515339732731037, "grad_norm": 2.753788471221924, "learning_rate": 4.84660267268963e-05, "loss": 3.111, "step": 23990 }, { "epoch": 4.517221908526256, "grad_norm": 2.9684507846832275, "learning_rate": 4.8277809147374366e-05, "loss": 3.0002, "step": 24000 }, { "epoch": 4.519104084321476, "grad_norm": 3.3698248863220215, "learning_rate": 4.808959156785244e-05, "loss": 3.3243, "step": 24010 }, { "epoch": 4.520986260116695, "grad_norm": 2.9527013301849365, "learning_rate": 4.790137398833051e-05, "loss": 3.1075, "step": 24020 }, { "epoch": 4.5228684359119145, "grad_norm": 3.412144899368286, "learning_rate": 4.771315640880858e-05, "loss": 3.189, "step": 24030 }, { "epoch": 4.524750611707134, "grad_norm": 2.9642202854156494, "learning_rate": 4.752493882928666e-05, "loss": 3.205, "step": 24040 }, { "epoch": 4.526632787502352, "grad_norm": 3.5863893032073975, "learning_rate": 4.733672124976473e-05, "loss": 3.2214, "step": 24050 }, { "epoch": 4.528514963297572, "grad_norm": 2.6433331966400146, "learning_rate": 4.71485036702428e-05, "loss": 3.347, "step": 24060 }, { "epoch": 4.530397139092791, "grad_norm": 2.9724504947662354, "learning_rate": 4.6960286090720874e-05, "loss": 3.021, "step": 24070 }, { "epoch": 4.532279314888011, "grad_norm": 3.0654139518737793, "learning_rate": 4.6772068511198944e-05, "loss": 3.4734, "step": 24080 }, { "epoch": 4.53416149068323, "grad_norm": 2.883718729019165, "learning_rate": 4.658385093167702e-05, "loss": 3.0313, "step": 24090 }, { "epoch": 4.536043666478449, "grad_norm": 3.460390090942383, "learning_rate": 4.639563335215509e-05, "loss": 3.4964, "step": 24100 }, { "epoch": 4.537925842273668, "grad_norm": 2.498680830001831, "learning_rate": 4.620741577263316e-05, "loss": 2.9641, "step": 24110 }, { "epoch": 4.539808018068888, "grad_norm": 4.274486064910889, "learning_rate": 4.601919819311124e-05, "loss": 3.1713, "step": 24120 }, { "epoch": 4.541690193864107, "grad_norm": 3.244508743286133, "learning_rate": 4.583098061358931e-05, "loss": 3.2632, "step": 24130 }, { "epoch": 4.543572369659326, "grad_norm": 2.921966791152954, "learning_rate": 4.564276303406738e-05, "loss": 3.0531, "step": 24140 }, { "epoch": 4.545454545454545, "grad_norm": 3.98917555809021, "learning_rate": 4.545454545454546e-05, "loss": 3.3034, "step": 24150 }, { "epoch": 4.547336721249764, "grad_norm": 3.7164669036865234, "learning_rate": 4.526632787502353e-05, "loss": 3.162, "step": 24160 }, { "epoch": 4.549218897044984, "grad_norm": 3.392164945602417, "learning_rate": 4.50781102955016e-05, "loss": 3.0602, "step": 24170 }, { "epoch": 4.551101072840203, "grad_norm": 3.1076338291168213, "learning_rate": 4.4889892715979674e-05, "loss": 3.3768, "step": 24180 }, { "epoch": 4.552983248635423, "grad_norm": 2.799534797668457, "learning_rate": 4.4701675136457744e-05, "loss": 2.9775, "step": 24190 }, { "epoch": 4.554865424430642, "grad_norm": 2.9202513694763184, "learning_rate": 4.451345755693582e-05, "loss": 3.2604, "step": 24200 }, { "epoch": 4.556747600225862, "grad_norm": 4.149763107299805, "learning_rate": 4.432523997741389e-05, "loss": 3.0419, "step": 24210 }, { "epoch": 4.55862977602108, "grad_norm": 2.8485655784606934, "learning_rate": 4.413702239789196e-05, "loss": 3.0743, "step": 24220 }, { "epoch": 4.5605119518162995, "grad_norm": 2.8361072540283203, "learning_rate": 4.3948804818370036e-05, "loss": 3.3415, "step": 24230 }, { "epoch": 4.562394127611519, "grad_norm": 3.362912178039551, "learning_rate": 4.376058723884811e-05, "loss": 3.1703, "step": 24240 }, { "epoch": 4.564276303406738, "grad_norm": 2.6403450965881348, "learning_rate": 4.357236965932618e-05, "loss": 2.8701, "step": 24250 }, { "epoch": 4.566158479201958, "grad_norm": 2.5263137817382812, "learning_rate": 4.338415207980426e-05, "loss": 2.8825, "step": 24260 }, { "epoch": 4.568040654997176, "grad_norm": 2.7215659618377686, "learning_rate": 4.319593450028233e-05, "loss": 2.9142, "step": 24270 }, { "epoch": 4.569922830792396, "grad_norm": 3.132044553756714, "learning_rate": 4.30077169207604e-05, "loss": 3.4087, "step": 24280 }, { "epoch": 4.571805006587615, "grad_norm": 3.3651986122131348, "learning_rate": 4.2819499341238474e-05, "loss": 3.3137, "step": 24290 }, { "epoch": 4.573687182382835, "grad_norm": 3.7178475856781006, "learning_rate": 4.2631281761716544e-05, "loss": 2.882, "step": 24300 }, { "epoch": 4.575569358178054, "grad_norm": 5.011250972747803, "learning_rate": 4.244306418219462e-05, "loss": 3.4289, "step": 24310 }, { "epoch": 4.577451533973273, "grad_norm": 3.3479485511779785, "learning_rate": 4.225484660267269e-05, "loss": 3.28, "step": 24320 }, { "epoch": 4.579333709768492, "grad_norm": 2.715578079223633, "learning_rate": 4.206662902315076e-05, "loss": 2.9993, "step": 24330 }, { "epoch": 4.5812158855637115, "grad_norm": 3.1181604862213135, "learning_rate": 4.1878411443628836e-05, "loss": 2.9622, "step": 24340 }, { "epoch": 4.583098061358931, "grad_norm": 4.305251121520996, "learning_rate": 4.1690193864106906e-05, "loss": 3.034, "step": 24350 }, { "epoch": 4.58498023715415, "grad_norm": 3.093553304672241, "learning_rate": 4.150197628458498e-05, "loss": 3.6412, "step": 24360 }, { "epoch": 4.58686241294937, "grad_norm": 4.383254051208496, "learning_rate": 4.131375870506306e-05, "loss": 3.6157, "step": 24370 }, { "epoch": 4.588744588744589, "grad_norm": 3.1755826473236084, "learning_rate": 4.112554112554113e-05, "loss": 3.4622, "step": 24380 }, { "epoch": 4.590626764539808, "grad_norm": 2.9450700283050537, "learning_rate": 4.09373235460192e-05, "loss": 3.3494, "step": 24390 }, { "epoch": 4.592508940335027, "grad_norm": 3.380802869796753, "learning_rate": 4.0749105966497274e-05, "loss": 3.2481, "step": 24400 }, { "epoch": 4.594391116130247, "grad_norm": 2.74828839302063, "learning_rate": 4.0560888386975344e-05, "loss": 3.2937, "step": 24410 }, { "epoch": 4.596273291925466, "grad_norm": 2.6550493240356445, "learning_rate": 4.0372670807453414e-05, "loss": 2.9292, "step": 24420 }, { "epoch": 4.5981554677206855, "grad_norm": 2.6913087368011475, "learning_rate": 4.018445322793149e-05, "loss": 3.3403, "step": 24430 }, { "epoch": 4.600037643515904, "grad_norm": 3.7004897594451904, "learning_rate": 3.999623564840956e-05, "loss": 3.1304, "step": 24440 }, { "epoch": 4.6019198193111235, "grad_norm": 3.044382095336914, "learning_rate": 3.9808018068887636e-05, "loss": 2.8982, "step": 24450 }, { "epoch": 4.603801995106343, "grad_norm": 2.8291525840759277, "learning_rate": 3.9619800489365706e-05, "loss": 3.1667, "step": 24460 }, { "epoch": 4.605684170901562, "grad_norm": 2.786212205886841, "learning_rate": 3.9431582909843775e-05, "loss": 3.0917, "step": 24470 }, { "epoch": 4.607566346696782, "grad_norm": 2.716911554336548, "learning_rate": 3.924336533032186e-05, "loss": 3.5009, "step": 24480 }, { "epoch": 4.609448522492, "grad_norm": 2.9290614128112793, "learning_rate": 3.905514775079993e-05, "loss": 3.2019, "step": 24490 }, { "epoch": 4.61133069828722, "grad_norm": 2.7946932315826416, "learning_rate": 3.8866930171278e-05, "loss": 3.3843, "step": 24500 }, { "epoch": 4.613212874082439, "grad_norm": 3.1367475986480713, "learning_rate": 3.8678712591756074e-05, "loss": 3.1811, "step": 24510 }, { "epoch": 4.615095049877659, "grad_norm": 2.5652503967285156, "learning_rate": 3.8490495012234144e-05, "loss": 3.0921, "step": 24520 }, { "epoch": 4.616977225672878, "grad_norm": 2.903272867202759, "learning_rate": 3.8302277432712214e-05, "loss": 3.1698, "step": 24530 }, { "epoch": 4.6188594014680975, "grad_norm": 3.0952274799346924, "learning_rate": 3.811405985319029e-05, "loss": 2.991, "step": 24540 }, { "epoch": 4.620741577263316, "grad_norm": 2.6256113052368164, "learning_rate": 3.792584227366836e-05, "loss": 3.0366, "step": 24550 }, { "epoch": 4.6226237530585355, "grad_norm": 2.872602939605713, "learning_rate": 3.7737624694146436e-05, "loss": 3.065, "step": 24560 }, { "epoch": 4.624505928853755, "grad_norm": 3.981386184692383, "learning_rate": 3.7549407114624506e-05, "loss": 3.0991, "step": 24570 }, { "epoch": 4.626388104648974, "grad_norm": 6.0364251136779785, "learning_rate": 3.7361189535102575e-05, "loss": 3.1146, "step": 24580 }, { "epoch": 4.628270280444194, "grad_norm": 3.1531736850738525, "learning_rate": 3.717297195558065e-05, "loss": 3.3461, "step": 24590 }, { "epoch": 4.630152456239413, "grad_norm": 3.7676281929016113, "learning_rate": 3.698475437605872e-05, "loss": 3.0683, "step": 24600 }, { "epoch": 4.632034632034632, "grad_norm": 2.6802709102630615, "learning_rate": 3.67965367965368e-05, "loss": 3.2123, "step": 24610 }, { "epoch": 4.633916807829851, "grad_norm": 3.7825965881347656, "learning_rate": 3.6608319217014874e-05, "loss": 3.3615, "step": 24620 }, { "epoch": 4.635798983625071, "grad_norm": 2.9915659427642822, "learning_rate": 3.6420101637492944e-05, "loss": 3.3229, "step": 24630 }, { "epoch": 4.63768115942029, "grad_norm": 3.3726351261138916, "learning_rate": 3.6231884057971014e-05, "loss": 3.3137, "step": 24640 }, { "epoch": 4.6395633352155095, "grad_norm": 2.611403465270996, "learning_rate": 3.604366647844909e-05, "loss": 3.2665, "step": 24650 }, { "epoch": 4.641445511010728, "grad_norm": 4.3528523445129395, "learning_rate": 3.585544889892716e-05, "loss": 3.1482, "step": 24660 }, { "epoch": 4.643327686805947, "grad_norm": 2.827160596847534, "learning_rate": 3.5667231319405236e-05, "loss": 3.5753, "step": 24670 }, { "epoch": 4.645209862601167, "grad_norm": 2.940286159515381, "learning_rate": 3.5479013739883306e-05, "loss": 3.2837, "step": 24680 }, { "epoch": 4.647092038396386, "grad_norm": 2.595956325531006, "learning_rate": 3.5290796160361375e-05, "loss": 3.1817, "step": 24690 }, { "epoch": 4.648974214191606, "grad_norm": 3.200382947921753, "learning_rate": 3.510257858083945e-05, "loss": 3.4257, "step": 24700 }, { "epoch": 4.650856389986825, "grad_norm": 2.6763854026794434, "learning_rate": 3.491436100131752e-05, "loss": 2.9655, "step": 24710 }, { "epoch": 4.652738565782044, "grad_norm": 3.2725934982299805, "learning_rate": 3.472614342179559e-05, "loss": 3.2608, "step": 24720 }, { "epoch": 4.654620741577263, "grad_norm": 2.985708236694336, "learning_rate": 3.4537925842273674e-05, "loss": 3.1168, "step": 24730 }, { "epoch": 4.656502917372483, "grad_norm": 3.0978457927703857, "learning_rate": 3.4349708262751744e-05, "loss": 3.0778, "step": 24740 }, { "epoch": 4.658385093167702, "grad_norm": 8.759063720703125, "learning_rate": 3.4161490683229814e-05, "loss": 3.246, "step": 24750 }, { "epoch": 4.660267268962921, "grad_norm": 3.0920560359954834, "learning_rate": 3.397327310370789e-05, "loss": 3.0907, "step": 24760 }, { "epoch": 4.662149444758141, "grad_norm": 2.807457685470581, "learning_rate": 3.378505552418596e-05, "loss": 3.2512, "step": 24770 }, { "epoch": 4.664031620553359, "grad_norm": 3.2192564010620117, "learning_rate": 3.359683794466403e-05, "loss": 3.4434, "step": 24780 }, { "epoch": 4.665913796348579, "grad_norm": 3.1085925102233887, "learning_rate": 3.3408620365142106e-05, "loss": 3.375, "step": 24790 }, { "epoch": 4.667795972143798, "grad_norm": 3.6151530742645264, "learning_rate": 3.3220402785620175e-05, "loss": 3.2606, "step": 24800 }, { "epoch": 4.669678147939018, "grad_norm": 2.7845237255096436, "learning_rate": 3.303218520609825e-05, "loss": 3.1624, "step": 24810 }, { "epoch": 4.671560323734237, "grad_norm": 2.3765504360198975, "learning_rate": 3.284396762657632e-05, "loss": 3.4659, "step": 24820 }, { "epoch": 4.673442499529456, "grad_norm": 3.301711082458496, "learning_rate": 3.265575004705439e-05, "loss": 3.5859, "step": 24830 }, { "epoch": 4.675324675324675, "grad_norm": 2.6286189556121826, "learning_rate": 3.246753246753247e-05, "loss": 3.0791, "step": 24840 }, { "epoch": 4.6772068511198945, "grad_norm": 2.879110813140869, "learning_rate": 3.2279314888010544e-05, "loss": 3.1536, "step": 24850 }, { "epoch": 4.679089026915114, "grad_norm": 3.2365987300872803, "learning_rate": 3.2091097308488614e-05, "loss": 3.1614, "step": 24860 }, { "epoch": 4.680971202710333, "grad_norm": 3.932792901992798, "learning_rate": 3.190287972896669e-05, "loss": 3.0712, "step": 24870 }, { "epoch": 4.682853378505552, "grad_norm": 3.3613767623901367, "learning_rate": 3.171466214944476e-05, "loss": 3.4802, "step": 24880 }, { "epoch": 4.684735554300771, "grad_norm": 2.611473798751831, "learning_rate": 3.152644456992283e-05, "loss": 3.2437, "step": 24890 }, { "epoch": 4.686617730095991, "grad_norm": 3.1896893978118896, "learning_rate": 3.1338226990400906e-05, "loss": 3.2623, "step": 24900 }, { "epoch": 4.68849990589121, "grad_norm": 2.306182861328125, "learning_rate": 3.1150009410878975e-05, "loss": 2.9952, "step": 24910 }, { "epoch": 4.69038208168643, "grad_norm": 2.9280362129211426, "learning_rate": 3.096179183135705e-05, "loss": 3.3888, "step": 24920 }, { "epoch": 4.692264257481649, "grad_norm": 3.746704578399658, "learning_rate": 3.077357425183512e-05, "loss": 2.9443, "step": 24930 }, { "epoch": 4.6941464332768685, "grad_norm": 3.5688564777374268, "learning_rate": 3.05853566723132e-05, "loss": 3.258, "step": 24940 }, { "epoch": 4.696028609072087, "grad_norm": 4.903964042663574, "learning_rate": 3.0397139092791268e-05, "loss": 3.4195, "step": 24950 }, { "epoch": 4.6979107848673065, "grad_norm": 3.729482412338257, "learning_rate": 3.020892151326934e-05, "loss": 3.3093, "step": 24960 }, { "epoch": 4.699792960662526, "grad_norm": 3.215060234069824, "learning_rate": 3.0020703933747414e-05, "loss": 3.475, "step": 24970 }, { "epoch": 4.701675136457745, "grad_norm": 2.280484914779663, "learning_rate": 2.9832486354225483e-05, "loss": 3.1306, "step": 24980 }, { "epoch": 4.703557312252965, "grad_norm": 3.377382278442383, "learning_rate": 2.9644268774703556e-05, "loss": 3.2126, "step": 24990 }, { "epoch": 4.705439488048183, "grad_norm": 3.27429461479187, "learning_rate": 2.945605119518163e-05, "loss": 3.281, "step": 25000 }, { "epoch": 4.707321663843403, "grad_norm": 2.8424630165100098, "learning_rate": 2.9267833615659706e-05, "loss": 3.2675, "step": 25010 }, { "epoch": 4.709203839638622, "grad_norm": 3.1172735691070557, "learning_rate": 2.9079616036137775e-05, "loss": 3.4589, "step": 25020 }, { "epoch": 4.711086015433842, "grad_norm": 2.8301455974578857, "learning_rate": 2.889139845661585e-05, "loss": 3.3261, "step": 25030 }, { "epoch": 4.712968191229061, "grad_norm": 2.538022041320801, "learning_rate": 2.870318087709392e-05, "loss": 3.0696, "step": 25040 }, { "epoch": 4.71485036702428, "grad_norm": 2.5753090381622314, "learning_rate": 2.851496329757199e-05, "loss": 2.9831, "step": 25050 }, { "epoch": 4.716732542819499, "grad_norm": 3.2386507987976074, "learning_rate": 2.8326745718050064e-05, "loss": 3.04, "step": 25060 }, { "epoch": 4.7186147186147185, "grad_norm": 2.7614169120788574, "learning_rate": 2.813852813852814e-05, "loss": 3.292, "step": 25070 }, { "epoch": 4.720496894409938, "grad_norm": 3.875685214996338, "learning_rate": 2.7950310559006214e-05, "loss": 3.3236, "step": 25080 }, { "epoch": 4.722379070205157, "grad_norm": 3.0858395099639893, "learning_rate": 2.7762092979484283e-05, "loss": 3.2861, "step": 25090 }, { "epoch": 4.724261246000377, "grad_norm": 4.012641906738281, "learning_rate": 2.7573875399962356e-05, "loss": 3.3092, "step": 25100 }, { "epoch": 4.726143421795595, "grad_norm": 2.8968312740325928, "learning_rate": 2.738565782044043e-05, "loss": 3.0026, "step": 25110 }, { "epoch": 4.728025597590815, "grad_norm": 4.2459001541137695, "learning_rate": 2.7197440240918502e-05, "loss": 2.9701, "step": 25120 }, { "epoch": 4.729907773386034, "grad_norm": 3.4018383026123047, "learning_rate": 2.7009222661396575e-05, "loss": 3.1652, "step": 25130 }, { "epoch": 4.731789949181254, "grad_norm": 3.798372268676758, "learning_rate": 2.682100508187465e-05, "loss": 3.2211, "step": 25140 }, { "epoch": 4.733672124976473, "grad_norm": 3.093381643295288, "learning_rate": 2.663278750235272e-05, "loss": 3.2467, "step": 25150 }, { "epoch": 4.7355543007716925, "grad_norm": 2.5847713947296143, "learning_rate": 2.644456992283079e-05, "loss": 2.995, "step": 25160 }, { "epoch": 4.737436476566911, "grad_norm": 3.9761364459991455, "learning_rate": 2.6256352343308864e-05, "loss": 2.8856, "step": 25170 }, { "epoch": 4.7393186523621305, "grad_norm": 2.505748987197876, "learning_rate": 2.6068134763786937e-05, "loss": 3.0557, "step": 25180 }, { "epoch": 4.74120082815735, "grad_norm": 3.296593189239502, "learning_rate": 2.5879917184265014e-05, "loss": 3.0296, "step": 25190 }, { "epoch": 4.743083003952569, "grad_norm": 3.624117136001587, "learning_rate": 2.5691699604743083e-05, "loss": 2.8997, "step": 25200 }, { "epoch": 4.744965179747789, "grad_norm": 3.9669125080108643, "learning_rate": 2.5503482025221156e-05, "loss": 2.946, "step": 25210 }, { "epoch": 4.746847355543007, "grad_norm": 2.9287898540496826, "learning_rate": 2.531526444569923e-05, "loss": 3.2531, "step": 25220 }, { "epoch": 4.748729531338227, "grad_norm": 3.1249217987060547, "learning_rate": 2.5127046866177302e-05, "loss": 3.2925, "step": 25230 }, { "epoch": 4.750611707133446, "grad_norm": 3.173673152923584, "learning_rate": 2.4938829286655372e-05, "loss": 3.2142, "step": 25240 }, { "epoch": 4.752493882928666, "grad_norm": 2.8763136863708496, "learning_rate": 2.475061170713345e-05, "loss": 3.1028, "step": 25250 }, { "epoch": 4.754376058723885, "grad_norm": 4.05729866027832, "learning_rate": 2.456239412761152e-05, "loss": 2.7233, "step": 25260 }, { "epoch": 4.7562582345191045, "grad_norm": 3.805598497390747, "learning_rate": 2.437417654808959e-05, "loss": 3.1988, "step": 25270 }, { "epoch": 4.758140410314323, "grad_norm": 3.6921346187591553, "learning_rate": 2.4185958968567664e-05, "loss": 2.8693, "step": 25280 }, { "epoch": 4.760022586109542, "grad_norm": 3.240833044052124, "learning_rate": 2.3997741389045737e-05, "loss": 3.0211, "step": 25290 }, { "epoch": 4.761904761904762, "grad_norm": 3.152642250061035, "learning_rate": 2.380952380952381e-05, "loss": 3.0866, "step": 25300 }, { "epoch": 4.763786937699981, "grad_norm": 3.251298427581787, "learning_rate": 2.3621306230001883e-05, "loss": 3.1911, "step": 25310 }, { "epoch": 4.765669113495201, "grad_norm": 2.7727744579315186, "learning_rate": 2.3433088650479956e-05, "loss": 3.2782, "step": 25320 }, { "epoch": 4.76755128929042, "grad_norm": 2.8768370151519775, "learning_rate": 2.324487107095803e-05, "loss": 2.9626, "step": 25330 }, { "epoch": 4.769433465085639, "grad_norm": 3.1602566242218018, "learning_rate": 2.30566534914361e-05, "loss": 3.0035, "step": 25340 }, { "epoch": 4.771315640880858, "grad_norm": 4.934927940368652, "learning_rate": 2.2868435911914172e-05, "loss": 2.9412, "step": 25350 }, { "epoch": 4.773197816676078, "grad_norm": 3.632086992263794, "learning_rate": 2.2680218332392245e-05, "loss": 3.4639, "step": 25360 }, { "epoch": 4.775079992471297, "grad_norm": 3.2546494007110596, "learning_rate": 2.249200075287032e-05, "loss": 2.9484, "step": 25370 }, { "epoch": 4.776962168266516, "grad_norm": 4.427943229675293, "learning_rate": 2.230378317334839e-05, "loss": 3.2541, "step": 25380 }, { "epoch": 4.778844344061735, "grad_norm": 2.785226583480835, "learning_rate": 2.2115565593826464e-05, "loss": 2.9992, "step": 25390 }, { "epoch": 4.780726519856954, "grad_norm": 2.93110990524292, "learning_rate": 2.1927348014304537e-05, "loss": 3.2238, "step": 25400 }, { "epoch": 4.782608695652174, "grad_norm": 2.6001505851745605, "learning_rate": 2.173913043478261e-05, "loss": 3.1899, "step": 25410 }, { "epoch": 4.784490871447393, "grad_norm": 2.8628385066986084, "learning_rate": 2.155091285526068e-05, "loss": 3.0132, "step": 25420 }, { "epoch": 4.786373047242613, "grad_norm": 2.6850054264068604, "learning_rate": 2.1362695275738753e-05, "loss": 3.4278, "step": 25430 }, { "epoch": 4.788255223037831, "grad_norm": 2.7586615085601807, "learning_rate": 2.117447769621683e-05, "loss": 3.428, "step": 25440 }, { "epoch": 4.790137398833051, "grad_norm": 3.3150601387023926, "learning_rate": 2.09862601166949e-05, "loss": 3.1502, "step": 25450 }, { "epoch": 4.79201957462827, "grad_norm": 3.676438093185425, "learning_rate": 2.0798042537172972e-05, "loss": 3.3348, "step": 25460 }, { "epoch": 4.7939017504234895, "grad_norm": 3.2945897579193115, "learning_rate": 2.0609824957651045e-05, "loss": 3.1048, "step": 25470 }, { "epoch": 4.795783926218709, "grad_norm": 3.1217949390411377, "learning_rate": 2.0421607378129118e-05, "loss": 3.0479, "step": 25480 }, { "epoch": 4.797666102013928, "grad_norm": 2.6365010738372803, "learning_rate": 2.0233389798607188e-05, "loss": 2.9137, "step": 25490 }, { "epoch": 4.799548277809148, "grad_norm": 3.463871955871582, "learning_rate": 2.0045172219085264e-05, "loss": 3.129, "step": 25500 }, { "epoch": 4.801430453604366, "grad_norm": 3.1459686756134033, "learning_rate": 1.9856954639563337e-05, "loss": 3.3096, "step": 25510 }, { "epoch": 4.803312629399586, "grad_norm": 2.775165557861328, "learning_rate": 1.9668737060041407e-05, "loss": 3.2217, "step": 25520 }, { "epoch": 4.805194805194805, "grad_norm": 3.2702062129974365, "learning_rate": 1.948051948051948e-05, "loss": 2.9853, "step": 25530 }, { "epoch": 4.807076980990025, "grad_norm": 2.980114698410034, "learning_rate": 1.9292301900997553e-05, "loss": 2.9909, "step": 25540 }, { "epoch": 4.808959156785244, "grad_norm": 3.2365455627441406, "learning_rate": 1.9104084321475626e-05, "loss": 2.9872, "step": 25550 }, { "epoch": 4.810841332580463, "grad_norm": 2.9393603801727295, "learning_rate": 1.89158667419537e-05, "loss": 3.4253, "step": 25560 }, { "epoch": 4.812723508375682, "grad_norm": 2.5274322032928467, "learning_rate": 1.8727649162431772e-05, "loss": 2.8669, "step": 25570 }, { "epoch": 4.8146056841709015, "grad_norm": 3.172722816467285, "learning_rate": 1.8539431582909845e-05, "loss": 2.9687, "step": 25580 }, { "epoch": 4.816487859966121, "grad_norm": 3.130019426345825, "learning_rate": 1.8351214003387918e-05, "loss": 3.0413, "step": 25590 }, { "epoch": 4.81837003576134, "grad_norm": 3.0461792945861816, "learning_rate": 1.8162996423865988e-05, "loss": 3.2432, "step": 25600 }, { "epoch": 4.820252211556559, "grad_norm": 2.9114632606506348, "learning_rate": 1.797477884434406e-05, "loss": 3.1182, "step": 25610 }, { "epoch": 4.822134387351778, "grad_norm": 3.019514799118042, "learning_rate": 1.7786561264822137e-05, "loss": 2.9704, "step": 25620 }, { "epoch": 4.824016563146998, "grad_norm": 3.043949604034424, "learning_rate": 1.7598343685300207e-05, "loss": 3.5735, "step": 25630 }, { "epoch": 4.825898738942217, "grad_norm": 2.620448589324951, "learning_rate": 1.741012610577828e-05, "loss": 2.9745, "step": 25640 }, { "epoch": 4.827780914737437, "grad_norm": 2.6471874713897705, "learning_rate": 1.7221908526256353e-05, "loss": 2.6752, "step": 25650 }, { "epoch": 4.829663090532656, "grad_norm": 2.99849271774292, "learning_rate": 1.7033690946734426e-05, "loss": 3.3014, "step": 25660 }, { "epoch": 4.8315452663278755, "grad_norm": 3.3653836250305176, "learning_rate": 1.6845473367212496e-05, "loss": 2.8202, "step": 25670 }, { "epoch": 4.833427442123094, "grad_norm": 3.4769065380096436, "learning_rate": 1.6657255787690572e-05, "loss": 3.2475, "step": 25680 }, { "epoch": 4.8353096179183135, "grad_norm": 3.774672508239746, "learning_rate": 1.6469038208168645e-05, "loss": 3.2216, "step": 25690 }, { "epoch": 4.837191793713533, "grad_norm": 4.837364196777344, "learning_rate": 1.6280820628646715e-05, "loss": 3.3699, "step": 25700 }, { "epoch": 4.839073969508752, "grad_norm": 2.9457051753997803, "learning_rate": 1.6092603049124788e-05, "loss": 2.9287, "step": 25710 }, { "epoch": 4.840956145303972, "grad_norm": 2.990029811859131, "learning_rate": 1.590438546960286e-05, "loss": 3.0793, "step": 25720 }, { "epoch": 4.84283832109919, "grad_norm": 3.9842264652252197, "learning_rate": 1.5716167890080934e-05, "loss": 3.043, "step": 25730 }, { "epoch": 4.84472049689441, "grad_norm": 4.764008045196533, "learning_rate": 1.5527950310559007e-05, "loss": 2.9053, "step": 25740 }, { "epoch": 4.846602672689629, "grad_norm": 3.5892012119293213, "learning_rate": 1.533973273103708e-05, "loss": 3.1736, "step": 25750 }, { "epoch": 4.848484848484849, "grad_norm": 3.524770498275757, "learning_rate": 1.5151515151515153e-05, "loss": 3.1188, "step": 25760 }, { "epoch": 4.850367024280068, "grad_norm": 3.6661603450775146, "learning_rate": 1.4963297571993224e-05, "loss": 3.3539, "step": 25770 }, { "epoch": 4.852249200075287, "grad_norm": 2.802971124649048, "learning_rate": 1.4775079992471297e-05, "loss": 3.0607, "step": 25780 }, { "epoch": 4.854131375870506, "grad_norm": 2.968571662902832, "learning_rate": 1.458686241294937e-05, "loss": 3.0637, "step": 25790 }, { "epoch": 4.8560135516657255, "grad_norm": 4.406318187713623, "learning_rate": 1.4398644833427442e-05, "loss": 3.3069, "step": 25800 }, { "epoch": 4.857895727460945, "grad_norm": 2.874358654022217, "learning_rate": 1.4210427253905515e-05, "loss": 3.045, "step": 25810 }, { "epoch": 4.859777903256164, "grad_norm": 3.047579526901245, "learning_rate": 1.4022209674383588e-05, "loss": 2.9849, "step": 25820 }, { "epoch": 4.861660079051384, "grad_norm": 2.9793665409088135, "learning_rate": 1.383399209486166e-05, "loss": 3.2543, "step": 25830 }, { "epoch": 4.863542254846602, "grad_norm": 3.1359031200408936, "learning_rate": 1.3645774515339732e-05, "loss": 3.0561, "step": 25840 }, { "epoch": 4.865424430641822, "grad_norm": 3.4481136798858643, "learning_rate": 1.3457556935817807e-05, "loss": 3.231, "step": 25850 }, { "epoch": 4.867306606437041, "grad_norm": 2.9049251079559326, "learning_rate": 1.3269339356295878e-05, "loss": 3.3585, "step": 25860 }, { "epoch": 4.869188782232261, "grad_norm": 3.000098943710327, "learning_rate": 1.3081121776773951e-05, "loss": 3.2807, "step": 25870 }, { "epoch": 4.87107095802748, "grad_norm": 3.4274730682373047, "learning_rate": 1.2892904197252024e-05, "loss": 3.1417, "step": 25880 }, { "epoch": 4.8729531338226995, "grad_norm": 3.498537063598633, "learning_rate": 1.2704686617730096e-05, "loss": 3.2978, "step": 25890 }, { "epoch": 4.874835309617918, "grad_norm": 2.6304216384887695, "learning_rate": 1.2516469038208169e-05, "loss": 3.3949, "step": 25900 }, { "epoch": 4.876717485413137, "grad_norm": 3.301097869873047, "learning_rate": 1.2328251458686242e-05, "loss": 2.9152, "step": 25910 }, { "epoch": 4.878599661208357, "grad_norm": 4.412207126617432, "learning_rate": 1.2140033879164315e-05, "loss": 3.3158, "step": 25920 }, { "epoch": 4.880481837003576, "grad_norm": 3.0507736206054688, "learning_rate": 1.1951816299642386e-05, "loss": 3.2019, "step": 25930 }, { "epoch": 4.882364012798796, "grad_norm": 3.0898468494415283, "learning_rate": 1.176359872012046e-05, "loss": 3.1667, "step": 25940 }, { "epoch": 4.884246188594014, "grad_norm": 2.6355340480804443, "learning_rate": 1.1575381140598532e-05, "loss": 3.1116, "step": 25950 }, { "epoch": 4.886128364389234, "grad_norm": 3.2803468704223633, "learning_rate": 1.1387163561076605e-05, "loss": 3.0631, "step": 25960 }, { "epoch": 4.888010540184453, "grad_norm": 3.071220636367798, "learning_rate": 1.1198945981554678e-05, "loss": 3.0295, "step": 25970 }, { "epoch": 4.889892715979673, "grad_norm": 3.0252721309661865, "learning_rate": 1.101072840203275e-05, "loss": 3.0742, "step": 25980 }, { "epoch": 4.891774891774892, "grad_norm": 3.041609764099121, "learning_rate": 1.0822510822510823e-05, "loss": 3.1043, "step": 25990 }, { "epoch": 4.893657067570111, "grad_norm": 3.819404125213623, "learning_rate": 1.0634293242988896e-05, "loss": 3.275, "step": 26000 }, { "epoch": 4.89553924336533, "grad_norm": 2.8616020679473877, "learning_rate": 1.0446075663466969e-05, "loss": 2.8112, "step": 26010 }, { "epoch": 4.897421419160549, "grad_norm": 3.4349212646484375, "learning_rate": 1.025785808394504e-05, "loss": 3.3595, "step": 26020 }, { "epoch": 4.899303594955769, "grad_norm": 3.0160281658172607, "learning_rate": 1.0069640504423115e-05, "loss": 3.1097, "step": 26030 }, { "epoch": 4.901185770750988, "grad_norm": 3.218116521835327, "learning_rate": 9.881422924901186e-06, "loss": 2.9423, "step": 26040 }, { "epoch": 4.903067946546208, "grad_norm": 3.245553970336914, "learning_rate": 9.693205345379259e-06, "loss": 3.151, "step": 26050 }, { "epoch": 4.904950122341427, "grad_norm": 3.4128801822662354, "learning_rate": 9.504987765857332e-06, "loss": 3.1474, "step": 26060 }, { "epoch": 4.906832298136646, "grad_norm": 3.061704397201538, "learning_rate": 9.316770186335403e-06, "loss": 3.1783, "step": 26070 }, { "epoch": 4.908714473931865, "grad_norm": 3.863232374191284, "learning_rate": 9.128552606813476e-06, "loss": 3.4587, "step": 26080 }, { "epoch": 4.9105966497270845, "grad_norm": 3.27815580368042, "learning_rate": 8.94033502729155e-06, "loss": 3.2556, "step": 26090 }, { "epoch": 4.912478825522304, "grad_norm": 3.329728364944458, "learning_rate": 8.752117447769623e-06, "loss": 3.0733, "step": 26100 }, { "epoch": 4.914361001317523, "grad_norm": 4.6205058097839355, "learning_rate": 8.563899868247694e-06, "loss": 3.2327, "step": 26110 }, { "epoch": 4.916243177112742, "grad_norm": 3.4472804069519043, "learning_rate": 8.375682288725767e-06, "loss": 3.2555, "step": 26120 }, { "epoch": 4.918125352907961, "grad_norm": 3.038895845413208, "learning_rate": 8.18746470920384e-06, "loss": 3.4315, "step": 26130 }, { "epoch": 4.920007528703181, "grad_norm": 2.9684219360351562, "learning_rate": 7.999247129681913e-06, "loss": 3.3593, "step": 26140 }, { "epoch": 4.9218897044984, "grad_norm": 5.256326675415039, "learning_rate": 7.811029550159986e-06, "loss": 3.2857, "step": 26150 }, { "epoch": 4.92377188029362, "grad_norm": 2.682718515396118, "learning_rate": 7.622811970638058e-06, "loss": 3.2377, "step": 26160 }, { "epoch": 4.925654056088838, "grad_norm": 3.5212459564208984, "learning_rate": 7.43459439111613e-06, "loss": 3.3286, "step": 26170 }, { "epoch": 4.927536231884058, "grad_norm": 3.4938385486602783, "learning_rate": 7.246376811594203e-06, "loss": 3.1179, "step": 26180 }, { "epoch": 4.929418407679277, "grad_norm": 3.7432050704956055, "learning_rate": 7.0581592320722764e-06, "loss": 3.3646, "step": 26190 }, { "epoch": 4.9313005834744965, "grad_norm": 4.2980475425720215, "learning_rate": 6.869941652550348e-06, "loss": 3.1579, "step": 26200 }, { "epoch": 4.933182759269716, "grad_norm": 3.2238101959228516, "learning_rate": 6.681724073028421e-06, "loss": 3.4587, "step": 26210 }, { "epoch": 4.935064935064935, "grad_norm": 3.5493414402008057, "learning_rate": 6.493506493506494e-06, "loss": 3.1526, "step": 26220 }, { "epoch": 4.936947110860155, "grad_norm": 2.7393136024475098, "learning_rate": 6.305288913984566e-06, "loss": 3.2802, "step": 26230 }, { "epoch": 4.938829286655373, "grad_norm": 4.225271701812744, "learning_rate": 6.117071334462639e-06, "loss": 3.3651, "step": 26240 }, { "epoch": 4.940711462450593, "grad_norm": 3.390509605407715, "learning_rate": 5.928853754940711e-06, "loss": 3.1389, "step": 26250 }, { "epoch": 4.942593638245812, "grad_norm": 3.0110936164855957, "learning_rate": 5.740636175418784e-06, "loss": 3.01, "step": 26260 }, { "epoch": 4.944475814041032, "grad_norm": 3.559267997741699, "learning_rate": 5.552418595896857e-06, "loss": 3.1301, "step": 26270 }, { "epoch": 4.946357989836251, "grad_norm": 2.8467063903808594, "learning_rate": 5.3642010163749295e-06, "loss": 3.0362, "step": 26280 }, { "epoch": 4.94824016563147, "grad_norm": 2.625588893890381, "learning_rate": 5.175983436853002e-06, "loss": 3.0604, "step": 26290 }, { "epoch": 4.950122341426689, "grad_norm": 2.751812696456909, "learning_rate": 4.987765857331075e-06, "loss": 2.7247, "step": 26300 }, { "epoch": 4.9520045172219085, "grad_norm": 3.6566710472106934, "learning_rate": 4.799548277809147e-06, "loss": 3.0319, "step": 26310 }, { "epoch": 4.953886693017128, "grad_norm": 4.1501288414001465, "learning_rate": 4.61133069828722e-06, "loss": 3.2094, "step": 26320 }, { "epoch": 4.955768868812347, "grad_norm": 2.843015670776367, "learning_rate": 4.423113118765293e-06, "loss": 3.1845, "step": 26330 }, { "epoch": 4.957651044607566, "grad_norm": 3.4206199645996094, "learning_rate": 4.234895539243365e-06, "loss": 3.0502, "step": 26340 }, { "epoch": 4.959533220402785, "grad_norm": 3.2094857692718506, "learning_rate": 4.046677959721438e-06, "loss": 3.3604, "step": 26350 }, { "epoch": 4.961415396198005, "grad_norm": 3.3807573318481445, "learning_rate": 3.85846038019951e-06, "loss": 3.3002, "step": 26360 }, { "epoch": 4.963297571993224, "grad_norm": 2.6988401412963867, "learning_rate": 3.6702428006775834e-06, "loss": 3.0346, "step": 26370 }, { "epoch": 4.965179747788444, "grad_norm": 2.562386989593506, "learning_rate": 3.482025221155656e-06, "loss": 3.1302, "step": 26380 }, { "epoch": 4.967061923583663, "grad_norm": 2.9049251079559326, "learning_rate": 3.2938076416337287e-06, "loss": 3.1375, "step": 26390 }, { "epoch": 4.9689440993788825, "grad_norm": 5.675838947296143, "learning_rate": 3.1055900621118013e-06, "loss": 3.312, "step": 26400 }, { "epoch": 4.970826275174101, "grad_norm": 2.4691736698150635, "learning_rate": 2.917372482589874e-06, "loss": 3.1683, "step": 26410 }, { "epoch": 4.9727084509693205, "grad_norm": 3.464921236038208, "learning_rate": 2.7291549030679465e-06, "loss": 3.2794, "step": 26420 }, { "epoch": 4.97459062676454, "grad_norm": 3.111931562423706, "learning_rate": 2.540937323546019e-06, "loss": 3.2953, "step": 26430 }, { "epoch": 4.976472802559759, "grad_norm": 3.5703999996185303, "learning_rate": 2.352719744024092e-06, "loss": 2.9351, "step": 26440 }, { "epoch": 4.978354978354979, "grad_norm": 2.725428342819214, "learning_rate": 2.1645021645021648e-06, "loss": 2.988, "step": 26450 }, { "epoch": 4.980237154150197, "grad_norm": 2.8327372074127197, "learning_rate": 1.976284584980237e-06, "loss": 2.7986, "step": 26460 }, { "epoch": 4.982119329945417, "grad_norm": 3.3417296409606934, "learning_rate": 1.7880670054583098e-06, "loss": 3.096, "step": 26470 }, { "epoch": 4.984001505740636, "grad_norm": 2.8281989097595215, "learning_rate": 1.5998494259363826e-06, "loss": 3.0627, "step": 26480 }, { "epoch": 4.985883681535856, "grad_norm": 2.6077282428741455, "learning_rate": 1.411631846414455e-06, "loss": 3.1875, "step": 26490 }, { "epoch": 4.987765857331075, "grad_norm": 3.3847365379333496, "learning_rate": 1.2234142668925278e-06, "loss": 3.311, "step": 26500 }, { "epoch": 4.989648033126294, "grad_norm": 4.011746883392334, "learning_rate": 1.0351966873706004e-06, "loss": 2.8797, "step": 26510 }, { "epoch": 4.991530208921513, "grad_norm": 3.3813626766204834, "learning_rate": 8.469791078486731e-07, "loss": 3.1457, "step": 26520 }, { "epoch": 4.993412384716732, "grad_norm": 2.967564344406128, "learning_rate": 6.587615283267458e-07, "loss": 3.2162, "step": 26530 }, { "epoch": 4.995294560511952, "grad_norm": 2.9905147552490234, "learning_rate": 4.7054394880481837e-07, "loss": 3.188, "step": 26540 }, { "epoch": 4.997176736307171, "grad_norm": 3.274362802505493, "learning_rate": 2.8232636928289103e-07, "loss": 2.7407, "step": 26550 }, { "epoch": 4.999058912102391, "grad_norm": 3.104670524597168, "learning_rate": 9.410878976096367e-08, "loss": 3.1158, "step": 26560 }, { "epoch": 5.0, "eval_accuracy": 0.22226666666666667, "eval_loss": 3.161168336868286, "eval_runtime": 116.485, "eval_samples_per_second": 64.386, "eval_steps_per_second": 8.053, "step": 26565 }, { "epoch": 5.0, "step": 26565, "total_flos": 1.64815115092992e+19, "train_loss": 3.597675289453456, "train_runtime": 6995.4319, "train_samples_per_second": 30.377, "train_steps_per_second": 3.797 } ], "logging_steps": 10, "max_steps": 26565, "num_input_tokens_seen": 0, "num_train_epochs": 5, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 1.64815115092992e+19, "train_batch_size": 8, "trial_name": null, "trial_params": null }