{ "best_metric": null, "best_model_checkpoint": null, "epoch": 3.0, "eval_steps": 500, "global_step": 1743243, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0008604652363439865, "grad_norm": 3.2356650829315186, "learning_rate": 4.99856589127276e-05, "loss": 1.6234, "step": 500 }, { "epoch": 0.001720930472687973, "grad_norm": 3.1607751846313477, "learning_rate": 4.99713178254552e-05, "loss": 1.4802, "step": 1000 }, { "epoch": 0.0025813957090319592, "grad_norm": 2.918936252593994, "learning_rate": 4.99569767381828e-05, "loss": 1.439, "step": 1500 }, { "epoch": 0.003441860945375946, "grad_norm": 2.8961517810821533, "learning_rate": 4.9942635650910404e-05, "loss": 1.4051, "step": 2000 }, { "epoch": 0.004302326181719932, "grad_norm": 2.8767917156219482, "learning_rate": 4.992829456363801e-05, "loss": 1.37, "step": 2500 }, { "epoch": 0.0051627914180639185, "grad_norm": 2.6426689624786377, "learning_rate": 4.9913953476365605e-05, "loss": 1.3463, "step": 3000 }, { "epoch": 0.006023256654407905, "grad_norm": 2.5832014083862305, "learning_rate": 4.98996123890932e-05, "loss": 1.3317, "step": 3500 }, { "epoch": 0.006883721890751892, "grad_norm": 2.686662435531616, "learning_rate": 4.98852713018208e-05, "loss": 1.3207, "step": 4000 }, { "epoch": 0.0077441871270958786, "grad_norm": 2.7090184688568115, "learning_rate": 4.98709302145484e-05, "loss": 1.3116, "step": 4500 }, { "epoch": 0.008604652363439864, "grad_norm": 2.712428569793701, "learning_rate": 4.985658912727601e-05, "loss": 1.2907, "step": 5000 }, { "epoch": 0.009465117599783851, "grad_norm": 2.6741058826446533, "learning_rate": 4.9842248040003604e-05, "loss": 1.2853, "step": 5500 }, { "epoch": 0.010325582836127837, "grad_norm": 2.6448793411254883, "learning_rate": 4.982790695273121e-05, "loss": 1.2688, "step": 6000 }, { "epoch": 0.011186048072471824, "grad_norm": 2.575014352798462, "learning_rate": 4.9813565865458805e-05, "loss": 1.2704, "step": 6500 }, { "epoch": 0.01204651330881581, "grad_norm": 2.5448801517486572, "learning_rate": 4.97992247781864e-05, "loss": 1.2578, "step": 7000 }, { "epoch": 0.012906978545159797, "grad_norm": 2.6225686073303223, "learning_rate": 4.9784883690914006e-05, "loss": 1.2471, "step": 7500 }, { "epoch": 0.013767443781503784, "grad_norm": 2.385652542114258, "learning_rate": 4.977054260364161e-05, "loss": 1.2391, "step": 8000 }, { "epoch": 0.01462790901784777, "grad_norm": 2.431713819503784, "learning_rate": 4.975620151636921e-05, "loss": 1.2347, "step": 8500 }, { "epoch": 0.015488374254191757, "grad_norm": 2.5336477756500244, "learning_rate": 4.9741860429096804e-05, "loss": 1.2276, "step": 9000 }, { "epoch": 0.016348839490535744, "grad_norm": 2.459806203842163, "learning_rate": 4.972751934182441e-05, "loss": 1.2269, "step": 9500 }, { "epoch": 0.017209304726879728, "grad_norm": 2.6384100914001465, "learning_rate": 4.9713178254552005e-05, "loss": 1.2196, "step": 10000 }, { "epoch": 0.018069769963223715, "grad_norm": 2.530402898788452, "learning_rate": 4.96988371672796e-05, "loss": 1.2087, "step": 10500 }, { "epoch": 0.018930235199567703, "grad_norm": 3.5155889987945557, "learning_rate": 4.9684496080007206e-05, "loss": 1.2019, "step": 11000 }, { "epoch": 0.01979070043591169, "grad_norm": 2.5882487297058105, "learning_rate": 4.967015499273481e-05, "loss": 1.2044, "step": 11500 }, { "epoch": 0.020651165672255674, "grad_norm": 2.349508047103882, "learning_rate": 4.965581390546241e-05, "loss": 1.1935, "step": 12000 }, { "epoch": 0.02151163090859966, "grad_norm": 2.4116039276123047, "learning_rate": 4.964147281819001e-05, "loss": 1.1893, "step": 12500 }, { "epoch": 0.02237209614494365, "grad_norm": 2.585287570953369, "learning_rate": 4.962713173091761e-05, "loss": 1.1874, "step": 13000 }, { "epoch": 0.023232561381287636, "grad_norm": 2.419912338256836, "learning_rate": 4.9612790643645205e-05, "loss": 1.1769, "step": 13500 }, { "epoch": 0.02409302661763162, "grad_norm": 2.4008758068084717, "learning_rate": 4.959844955637281e-05, "loss": 1.1772, "step": 14000 }, { "epoch": 0.024953491853975607, "grad_norm": 2.352156400680542, "learning_rate": 4.958410846910041e-05, "loss": 1.1751, "step": 14500 }, { "epoch": 0.025813957090319594, "grad_norm": 2.361285448074341, "learning_rate": 4.956976738182801e-05, "loss": 1.1698, "step": 15000 }, { "epoch": 0.02667442232666358, "grad_norm": 2.274899959564209, "learning_rate": 4.955542629455561e-05, "loss": 1.1721, "step": 15500 }, { "epoch": 0.02753488756300757, "grad_norm": 2.572073221206665, "learning_rate": 4.954108520728321e-05, "loss": 1.1679, "step": 16000 }, { "epoch": 0.028395352799351552, "grad_norm": 2.4291021823883057, "learning_rate": 4.952674412001081e-05, "loss": 1.1646, "step": 16500 }, { "epoch": 0.02925581803569554, "grad_norm": 2.2385716438293457, "learning_rate": 4.9512403032738405e-05, "loss": 1.1583, "step": 17000 }, { "epoch": 0.030116283272039527, "grad_norm": 2.2407643795013428, "learning_rate": 4.9498061945466015e-05, "loss": 1.1575, "step": 17500 }, { "epoch": 0.030976748508383514, "grad_norm": 2.469423532485962, "learning_rate": 4.948372085819361e-05, "loss": 1.1584, "step": 18000 }, { "epoch": 0.0318372137447275, "grad_norm": 2.4228737354278564, "learning_rate": 4.946937977092121e-05, "loss": 1.1456, "step": 18500 }, { "epoch": 0.03269767898107149, "grad_norm": 2.16351056098938, "learning_rate": 4.9455038683648813e-05, "loss": 1.1475, "step": 19000 }, { "epoch": 0.03355814421741547, "grad_norm": 2.4227852821350098, "learning_rate": 4.944069759637641e-05, "loss": 1.1538, "step": 19500 }, { "epoch": 0.034418609453759456, "grad_norm": 2.178745746612549, "learning_rate": 4.942635650910401e-05, "loss": 1.1488, "step": 20000 }, { "epoch": 0.03527907469010345, "grad_norm": 2.2594761848449707, "learning_rate": 4.941201542183161e-05, "loss": 1.1475, "step": 20500 }, { "epoch": 0.03613953992644743, "grad_norm": 2.50846791267395, "learning_rate": 4.9397674334559215e-05, "loss": 1.1377, "step": 21000 }, { "epoch": 0.037000005162791415, "grad_norm": 2.3424227237701416, "learning_rate": 4.938333324728681e-05, "loss": 1.1357, "step": 21500 }, { "epoch": 0.037860470399135406, "grad_norm": 2.2766966819763184, "learning_rate": 4.936899216001441e-05, "loss": 1.1285, "step": 22000 }, { "epoch": 0.03872093563547939, "grad_norm": 2.2529232501983643, "learning_rate": 4.9354651072742013e-05, "loss": 1.1261, "step": 22500 }, { "epoch": 0.03958140087182338, "grad_norm": 2.248143434524536, "learning_rate": 4.934030998546961e-05, "loss": 1.1252, "step": 23000 }, { "epoch": 0.040441866108167364, "grad_norm": 2.3664751052856445, "learning_rate": 4.9325968898197214e-05, "loss": 1.1214, "step": 23500 }, { "epoch": 0.04130233134451135, "grad_norm": 2.2858259677886963, "learning_rate": 4.931162781092482e-05, "loss": 1.1162, "step": 24000 }, { "epoch": 0.04216279658085534, "grad_norm": 2.3754398822784424, "learning_rate": 4.9297286723652415e-05, "loss": 1.1183, "step": 24500 }, { "epoch": 0.04302326181719932, "grad_norm": 2.450465202331543, "learning_rate": 4.928294563638001e-05, "loss": 1.1198, "step": 25000 }, { "epoch": 0.04388372705354331, "grad_norm": 2.2866413593292236, "learning_rate": 4.9268604549107616e-05, "loss": 1.1151, "step": 25500 }, { "epoch": 0.0447441922898873, "grad_norm": 2.2568089962005615, "learning_rate": 4.925426346183521e-05, "loss": 1.1113, "step": 26000 }, { "epoch": 0.04560465752623128, "grad_norm": 2.668402910232544, "learning_rate": 4.923992237456281e-05, "loss": 1.1109, "step": 26500 }, { "epoch": 0.04646512276257527, "grad_norm": 2.635807514190674, "learning_rate": 4.9225581287290414e-05, "loss": 1.1176, "step": 27000 }, { "epoch": 0.047325587998919255, "grad_norm": 2.1348018646240234, "learning_rate": 4.921124020001802e-05, "loss": 1.1063, "step": 27500 }, { "epoch": 0.04818605323526324, "grad_norm": 2.548943519592285, "learning_rate": 4.9196899112745615e-05, "loss": 1.1102, "step": 28000 }, { "epoch": 0.04904651847160723, "grad_norm": 2.266880989074707, "learning_rate": 4.918255802547321e-05, "loss": 1.106, "step": 28500 }, { "epoch": 0.049906983707951214, "grad_norm": 2.3181064128875732, "learning_rate": 4.9168216938200816e-05, "loss": 1.1094, "step": 29000 }, { "epoch": 0.050767448944295204, "grad_norm": 2.3166842460632324, "learning_rate": 4.915387585092841e-05, "loss": 1.0996, "step": 29500 }, { "epoch": 0.05162791418063919, "grad_norm": 2.133981704711914, "learning_rate": 4.913953476365602e-05, "loss": 1.1021, "step": 30000 }, { "epoch": 0.05248837941698317, "grad_norm": 2.3361051082611084, "learning_rate": 4.912519367638362e-05, "loss": 1.0997, "step": 30500 }, { "epoch": 0.05334884465332716, "grad_norm": 2.381316661834717, "learning_rate": 4.911085258911122e-05, "loss": 1.0956, "step": 31000 }, { "epoch": 0.054209309889671146, "grad_norm": 2.2757225036621094, "learning_rate": 4.9096511501838815e-05, "loss": 1.0952, "step": 31500 }, { "epoch": 0.05506977512601514, "grad_norm": 2.4369475841522217, "learning_rate": 4.908217041456641e-05, "loss": 1.0925, "step": 32000 }, { "epoch": 0.05593024036235912, "grad_norm": 2.6689693927764893, "learning_rate": 4.9067829327294016e-05, "loss": 1.0907, "step": 32500 }, { "epoch": 0.056790705598703105, "grad_norm": 2.2719929218292236, "learning_rate": 4.905348824002162e-05, "loss": 1.0888, "step": 33000 }, { "epoch": 0.057651170835047096, "grad_norm": 2.323052406311035, "learning_rate": 4.903914715274922e-05, "loss": 1.0898, "step": 33500 }, { "epoch": 0.05851163607139108, "grad_norm": 2.3382821083068848, "learning_rate": 4.902480606547682e-05, "loss": 1.0804, "step": 34000 }, { "epoch": 0.05937210130773506, "grad_norm": 2.1152403354644775, "learning_rate": 4.901046497820442e-05, "loss": 1.0767, "step": 34500 }, { "epoch": 0.060232566544079054, "grad_norm": 2.0752382278442383, "learning_rate": 4.8996123890932015e-05, "loss": 1.0838, "step": 35000 }, { "epoch": 0.06109303178042304, "grad_norm": 2.2990305423736572, "learning_rate": 4.898178280365962e-05, "loss": 1.084, "step": 35500 }, { "epoch": 0.06195349701676703, "grad_norm": 2.2469754219055176, "learning_rate": 4.8967441716387216e-05, "loss": 1.0819, "step": 36000 }, { "epoch": 0.062813962253111, "grad_norm": 2.147425889968872, "learning_rate": 4.895310062911482e-05, "loss": 1.0785, "step": 36500 }, { "epoch": 0.063674427489455, "grad_norm": 2.2332513332366943, "learning_rate": 4.8938759541842424e-05, "loss": 1.0752, "step": 37000 }, { "epoch": 0.06453489272579899, "grad_norm": 2.3052799701690674, "learning_rate": 4.892441845457002e-05, "loss": 1.0749, "step": 37500 }, { "epoch": 0.06539535796214298, "grad_norm": 2.1933348178863525, "learning_rate": 4.891007736729762e-05, "loss": 1.0737, "step": 38000 }, { "epoch": 0.06625582319848695, "grad_norm": 2.2870287895202637, "learning_rate": 4.8895736280025215e-05, "loss": 1.0778, "step": 38500 }, { "epoch": 0.06711628843483095, "grad_norm": 2.422950506210327, "learning_rate": 4.888139519275282e-05, "loss": 1.0725, "step": 39000 }, { "epoch": 0.06797675367117494, "grad_norm": 2.274256706237793, "learning_rate": 4.886705410548042e-05, "loss": 1.0652, "step": 39500 }, { "epoch": 0.06883721890751891, "grad_norm": 2.131338596343994, "learning_rate": 4.885271301820802e-05, "loss": 1.0693, "step": 40000 }, { "epoch": 0.0696976841438629, "grad_norm": 2.249906539916992, "learning_rate": 4.8838371930935624e-05, "loss": 1.0642, "step": 40500 }, { "epoch": 0.0705581493802069, "grad_norm": 2.419240951538086, "learning_rate": 4.882403084366322e-05, "loss": 1.0743, "step": 41000 }, { "epoch": 0.07141861461655087, "grad_norm": 2.2712082862854004, "learning_rate": 4.880968975639082e-05, "loss": 1.0711, "step": 41500 }, { "epoch": 0.07227907985289486, "grad_norm": 2.220015048980713, "learning_rate": 4.879534866911842e-05, "loss": 1.0625, "step": 42000 }, { "epoch": 0.07313954508923885, "grad_norm": 2.2555344104766846, "learning_rate": 4.8781007581846025e-05, "loss": 1.0638, "step": 42500 }, { "epoch": 0.07400001032558283, "grad_norm": 2.2342844009399414, "learning_rate": 4.876666649457362e-05, "loss": 1.0577, "step": 43000 }, { "epoch": 0.07486047556192682, "grad_norm": 2.1202335357666016, "learning_rate": 4.875232540730122e-05, "loss": 1.06, "step": 43500 }, { "epoch": 0.07572094079827081, "grad_norm": 2.153123140335083, "learning_rate": 4.8737984320028823e-05, "loss": 1.0555, "step": 44000 }, { "epoch": 0.0765814060346148, "grad_norm": 2.4861624240875244, "learning_rate": 4.872364323275642e-05, "loss": 1.059, "step": 44500 }, { "epoch": 0.07744187127095878, "grad_norm": 2.0985679626464844, "learning_rate": 4.870930214548402e-05, "loss": 1.0608, "step": 45000 }, { "epoch": 0.07830233650730277, "grad_norm": 2.197465658187866, "learning_rate": 4.869496105821162e-05, "loss": 1.0531, "step": 45500 }, { "epoch": 0.07916280174364676, "grad_norm": 2.1348724365234375, "learning_rate": 4.8680619970939225e-05, "loss": 1.0622, "step": 46000 }, { "epoch": 0.08002326697999074, "grad_norm": 2.1105077266693115, "learning_rate": 4.866627888366682e-05, "loss": 1.0523, "step": 46500 }, { "epoch": 0.08088373221633473, "grad_norm": 2.1300766468048096, "learning_rate": 4.8651937796394426e-05, "loss": 1.055, "step": 47000 }, { "epoch": 0.08174419745267872, "grad_norm": 2.1636085510253906, "learning_rate": 4.863759670912202e-05, "loss": 1.0469, "step": 47500 }, { "epoch": 0.0826046626890227, "grad_norm": 2.230454206466675, "learning_rate": 4.862325562184962e-05, "loss": 1.0504, "step": 48000 }, { "epoch": 0.08346512792536669, "grad_norm": 2.13574481010437, "learning_rate": 4.8608914534577224e-05, "loss": 1.0484, "step": 48500 }, { "epoch": 0.08432559316171068, "grad_norm": 2.431797981262207, "learning_rate": 4.859457344730483e-05, "loss": 1.0496, "step": 49000 }, { "epoch": 0.08518605839805465, "grad_norm": 2.244641065597534, "learning_rate": 4.8580232360032425e-05, "loss": 1.0496, "step": 49500 }, { "epoch": 0.08604652363439864, "grad_norm": 2.164771556854248, "learning_rate": 4.856589127276002e-05, "loss": 1.0421, "step": 50000 }, { "epoch": 0.08690698887074264, "grad_norm": 2.0793097019195557, "learning_rate": 4.8551550185487626e-05, "loss": 1.0379, "step": 50500 }, { "epoch": 0.08776745410708663, "grad_norm": 2.270684242248535, "learning_rate": 4.853720909821522e-05, "loss": 1.0533, "step": 51000 }, { "epoch": 0.0886279193434306, "grad_norm": 2.2440924644470215, "learning_rate": 4.852286801094282e-05, "loss": 1.0355, "step": 51500 }, { "epoch": 0.0894883845797746, "grad_norm": 2.041602611541748, "learning_rate": 4.850852692367043e-05, "loss": 1.0345, "step": 52000 }, { "epoch": 0.09034884981611858, "grad_norm": 2.487091302871704, "learning_rate": 4.849418583639803e-05, "loss": 1.0497, "step": 52500 }, { "epoch": 0.09120931505246256, "grad_norm": 2.3336119651794434, "learning_rate": 4.8479844749125625e-05, "loss": 1.0399, "step": 53000 }, { "epoch": 0.09206978028880655, "grad_norm": 2.019739866256714, "learning_rate": 4.846550366185323e-05, "loss": 1.0378, "step": 53500 }, { "epoch": 0.09293024552515054, "grad_norm": 2.2122652530670166, "learning_rate": 4.8451162574580826e-05, "loss": 1.0397, "step": 54000 }, { "epoch": 0.09379071076149452, "grad_norm": 2.15346360206604, "learning_rate": 4.843682148730842e-05, "loss": 1.0378, "step": 54500 }, { "epoch": 0.09465117599783851, "grad_norm": 2.288830518722534, "learning_rate": 4.842248040003603e-05, "loss": 1.0352, "step": 55000 }, { "epoch": 0.0955116412341825, "grad_norm": 2.551696538925171, "learning_rate": 4.840813931276363e-05, "loss": 1.0329, "step": 55500 }, { "epoch": 0.09637210647052648, "grad_norm": 2.044560432434082, "learning_rate": 4.839379822549123e-05, "loss": 1.0296, "step": 56000 }, { "epoch": 0.09723257170687047, "grad_norm": 2.240878105163574, "learning_rate": 4.8379457138218825e-05, "loss": 1.0382, "step": 56500 }, { "epoch": 0.09809303694321446, "grad_norm": 2.011512517929077, "learning_rate": 4.836511605094643e-05, "loss": 1.0315, "step": 57000 }, { "epoch": 0.09895350217955845, "grad_norm": 2.134789228439331, "learning_rate": 4.8350774963674026e-05, "loss": 1.0358, "step": 57500 }, { "epoch": 0.09981396741590243, "grad_norm": 2.1063222885131836, "learning_rate": 4.833643387640163e-05, "loss": 1.0284, "step": 58000 }, { "epoch": 0.10067443265224642, "grad_norm": 2.1646718978881836, "learning_rate": 4.8322092789129234e-05, "loss": 1.0239, "step": 58500 }, { "epoch": 0.10153489788859041, "grad_norm": 2.18721866607666, "learning_rate": 4.830775170185683e-05, "loss": 1.0311, "step": 59000 }, { "epoch": 0.10239536312493439, "grad_norm": 2.0538229942321777, "learning_rate": 4.829341061458443e-05, "loss": 1.0265, "step": 59500 }, { "epoch": 0.10325582836127838, "grad_norm": 2.197871208190918, "learning_rate": 4.827906952731203e-05, "loss": 1.0317, "step": 60000 }, { "epoch": 0.10411629359762237, "grad_norm": 2.194244623184204, "learning_rate": 4.826472844003963e-05, "loss": 1.0327, "step": 60500 }, { "epoch": 0.10497675883396634, "grad_norm": 2.290822744369507, "learning_rate": 4.8250387352767226e-05, "loss": 1.0228, "step": 61000 }, { "epoch": 0.10583722407031033, "grad_norm": 2.1944305896759033, "learning_rate": 4.823604626549483e-05, "loss": 1.0208, "step": 61500 }, { "epoch": 0.10669768930665433, "grad_norm": 2.5611112117767334, "learning_rate": 4.8221705178222434e-05, "loss": 1.0299, "step": 62000 }, { "epoch": 0.1075581545429983, "grad_norm": 2.017531394958496, "learning_rate": 4.820736409095003e-05, "loss": 1.0193, "step": 62500 }, { "epoch": 0.10841861977934229, "grad_norm": 2.0505359172821045, "learning_rate": 4.819302300367763e-05, "loss": 1.018, "step": 63000 }, { "epoch": 0.10927908501568628, "grad_norm": 2.133496046066284, "learning_rate": 4.817868191640523e-05, "loss": 1.0268, "step": 63500 }, { "epoch": 0.11013955025203027, "grad_norm": 2.1716065406799316, "learning_rate": 4.816434082913283e-05, "loss": 1.0231, "step": 64000 }, { "epoch": 0.11100001548837425, "grad_norm": 2.344346523284912, "learning_rate": 4.814999974186043e-05, "loss": 1.0107, "step": 64500 }, { "epoch": 0.11186048072471824, "grad_norm": 2.2010936737060547, "learning_rate": 4.8135658654588036e-05, "loss": 1.0268, "step": 65000 }, { "epoch": 0.11272094596106223, "grad_norm": 2.142312526702881, "learning_rate": 4.8121317567315634e-05, "loss": 1.0243, "step": 65500 }, { "epoch": 0.11358141119740621, "grad_norm": 2.0880520343780518, "learning_rate": 4.810697648004323e-05, "loss": 1.0151, "step": 66000 }, { "epoch": 0.1144418764337502, "grad_norm": 1.9939734935760498, "learning_rate": 4.8092635392770834e-05, "loss": 1.0112, "step": 66500 }, { "epoch": 0.11530234167009419, "grad_norm": 2.421113967895508, "learning_rate": 4.807829430549843e-05, "loss": 1.0165, "step": 67000 }, { "epoch": 0.11616280690643817, "grad_norm": 2.137781858444214, "learning_rate": 4.8063953218226035e-05, "loss": 1.0141, "step": 67500 }, { "epoch": 0.11702327214278216, "grad_norm": 2.0419673919677734, "learning_rate": 4.804961213095363e-05, "loss": 1.0135, "step": 68000 }, { "epoch": 0.11788373737912615, "grad_norm": 2.1220955848693848, "learning_rate": 4.8035271043681236e-05, "loss": 1.0154, "step": 68500 }, { "epoch": 0.11874420261547013, "grad_norm": 2.096822500228882, "learning_rate": 4.8020929956408833e-05, "loss": 1.013, "step": 69000 }, { "epoch": 0.11960466785181412, "grad_norm": 2.2863707542419434, "learning_rate": 4.800658886913643e-05, "loss": 1.0105, "step": 69500 }, { "epoch": 0.12046513308815811, "grad_norm": 2.1425676345825195, "learning_rate": 4.7992247781864034e-05, "loss": 1.0183, "step": 70000 }, { "epoch": 0.1213255983245021, "grad_norm": 2.125267744064331, "learning_rate": 4.797790669459163e-05, "loss": 1.0169, "step": 70500 }, { "epoch": 0.12218606356084608, "grad_norm": 2.2705278396606445, "learning_rate": 4.7963565607319235e-05, "loss": 1.0125, "step": 71000 }, { "epoch": 0.12304652879719007, "grad_norm": 2.3827016353607178, "learning_rate": 4.794922452004684e-05, "loss": 1.0072, "step": 71500 }, { "epoch": 0.12390699403353406, "grad_norm": 2.2930362224578857, "learning_rate": 4.7934883432774436e-05, "loss": 1.0013, "step": 72000 }, { "epoch": 0.12476745926987803, "grad_norm": 2.1181716918945312, "learning_rate": 4.792054234550203e-05, "loss": 1.0106, "step": 72500 }, { "epoch": 0.125627924506222, "grad_norm": 2.098687171936035, "learning_rate": 4.790620125822963e-05, "loss": 1.0028, "step": 73000 }, { "epoch": 0.12648838974256602, "grad_norm": 2.1100914478302, "learning_rate": 4.7891860170957234e-05, "loss": 1.0112, "step": 73500 }, { "epoch": 0.12734885497891, "grad_norm": 2.1170222759246826, "learning_rate": 4.787751908368484e-05, "loss": 1.0159, "step": 74000 }, { "epoch": 0.128209320215254, "grad_norm": 2.1959116458892822, "learning_rate": 4.7863177996412435e-05, "loss": 1.0046, "step": 74500 }, { "epoch": 0.12906978545159797, "grad_norm": 2.074676990509033, "learning_rate": 4.784883690914004e-05, "loss": 1.0086, "step": 75000 }, { "epoch": 0.12993025068794195, "grad_norm": 2.09879207611084, "learning_rate": 4.7834495821867636e-05, "loss": 1.0059, "step": 75500 }, { "epoch": 0.13079071592428596, "grad_norm": 2.1059296131134033, "learning_rate": 4.782015473459523e-05, "loss": 1.0072, "step": 76000 }, { "epoch": 0.13165118116062993, "grad_norm": 2.043699026107788, "learning_rate": 4.780581364732284e-05, "loss": 1.0002, "step": 76500 }, { "epoch": 0.1325116463969739, "grad_norm": 2.139575242996216, "learning_rate": 4.779147256005044e-05, "loss": 1.003, "step": 77000 }, { "epoch": 0.1333721116333179, "grad_norm": 2.240222930908203, "learning_rate": 4.777713147277804e-05, "loss": 1.0075, "step": 77500 }, { "epoch": 0.1342325768696619, "grad_norm": 1.817133903503418, "learning_rate": 4.776279038550564e-05, "loss": 0.9962, "step": 78000 }, { "epoch": 0.13509304210600587, "grad_norm": 1.9295564889907837, "learning_rate": 4.774844929823324e-05, "loss": 0.9977, "step": 78500 }, { "epoch": 0.13595350734234987, "grad_norm": 2.252047538757324, "learning_rate": 4.7734108210960836e-05, "loss": 1.0009, "step": 79000 }, { "epoch": 0.13681397257869385, "grad_norm": 2.113853931427002, "learning_rate": 4.771976712368843e-05, "loss": 1.0032, "step": 79500 }, { "epoch": 0.13767443781503783, "grad_norm": 2.0588555335998535, "learning_rate": 4.770542603641604e-05, "loss": 0.999, "step": 80000 }, { "epoch": 0.13853490305138183, "grad_norm": 2.266942024230957, "learning_rate": 4.769108494914364e-05, "loss": 1.0058, "step": 80500 }, { "epoch": 0.1393953682877258, "grad_norm": 2.1544504165649414, "learning_rate": 4.767674386187124e-05, "loss": 0.9998, "step": 81000 }, { "epoch": 0.14025583352406978, "grad_norm": 2.166877508163452, "learning_rate": 4.766240277459884e-05, "loss": 1.0047, "step": 81500 }, { "epoch": 0.1411162987604138, "grad_norm": 1.9637346267700195, "learning_rate": 4.764806168732644e-05, "loss": 0.9995, "step": 82000 }, { "epoch": 0.14197676399675777, "grad_norm": 1.9987211227416992, "learning_rate": 4.7633720600054036e-05, "loss": 0.9922, "step": 82500 }, { "epoch": 0.14283722923310174, "grad_norm": 2.114132881164551, "learning_rate": 4.761937951278164e-05, "loss": 0.9936, "step": 83000 }, { "epoch": 0.14369769446944575, "grad_norm": 2.1807868480682373, "learning_rate": 4.7605038425509244e-05, "loss": 0.9946, "step": 83500 }, { "epoch": 0.14455815970578972, "grad_norm": 2.150728940963745, "learning_rate": 4.759069733823684e-05, "loss": 0.9982, "step": 84000 }, { "epoch": 0.1454186249421337, "grad_norm": 2.000176429748535, "learning_rate": 4.757635625096444e-05, "loss": 0.9922, "step": 84500 }, { "epoch": 0.1462790901784777, "grad_norm": 2.2587642669677734, "learning_rate": 4.756201516369204e-05, "loss": 0.9921, "step": 85000 }, { "epoch": 0.14713955541482168, "grad_norm": 2.2462809085845947, "learning_rate": 4.754767407641964e-05, "loss": 0.9948, "step": 85500 }, { "epoch": 0.14800002065116566, "grad_norm": 2.114885091781616, "learning_rate": 4.7533332989147236e-05, "loss": 0.9977, "step": 86000 }, { "epoch": 0.14886048588750966, "grad_norm": 2.3015403747558594, "learning_rate": 4.751899190187484e-05, "loss": 0.9924, "step": 86500 }, { "epoch": 0.14972095112385364, "grad_norm": 2.2971363067626953, "learning_rate": 4.7504650814602444e-05, "loss": 0.988, "step": 87000 }, { "epoch": 0.15058141636019765, "grad_norm": 2.223186731338501, "learning_rate": 4.749030972733004e-05, "loss": 0.9891, "step": 87500 }, { "epoch": 0.15144188159654162, "grad_norm": 2.1935551166534424, "learning_rate": 4.7475968640057645e-05, "loss": 0.9885, "step": 88000 }, { "epoch": 0.1523023468328856, "grad_norm": 2.0309863090515137, "learning_rate": 4.746162755278524e-05, "loss": 0.9989, "step": 88500 }, { "epoch": 0.1531628120692296, "grad_norm": 2.113100051879883, "learning_rate": 4.744728646551284e-05, "loss": 0.9924, "step": 89000 }, { "epoch": 0.15402327730557358, "grad_norm": 1.9869849681854248, "learning_rate": 4.743294537824044e-05, "loss": 0.991, "step": 89500 }, { "epoch": 0.15488374254191756, "grad_norm": 2.1156396865844727, "learning_rate": 4.7418604290968046e-05, "loss": 0.9907, "step": 90000 }, { "epoch": 0.15574420777826156, "grad_norm": 1.9725556373596191, "learning_rate": 4.7404263203695644e-05, "loss": 0.9938, "step": 90500 }, { "epoch": 0.15660467301460554, "grad_norm": 2.0329513549804688, "learning_rate": 4.738992211642324e-05, "loss": 0.9935, "step": 91000 }, { "epoch": 0.15746513825094952, "grad_norm": 2.1494927406311035, "learning_rate": 4.7375581029150844e-05, "loss": 0.9783, "step": 91500 }, { "epoch": 0.15832560348729352, "grad_norm": 2.001607656478882, "learning_rate": 4.736123994187844e-05, "loss": 0.9936, "step": 92000 }, { "epoch": 0.1591860687236375, "grad_norm": 1.998681902885437, "learning_rate": 4.7346898854606045e-05, "loss": 0.9894, "step": 92500 }, { "epoch": 0.16004653395998147, "grad_norm": 2.114757537841797, "learning_rate": 4.733255776733365e-05, "loss": 0.9901, "step": 93000 }, { "epoch": 0.16090699919632548, "grad_norm": 2.082709789276123, "learning_rate": 4.7318216680061246e-05, "loss": 0.9859, "step": 93500 }, { "epoch": 0.16176746443266946, "grad_norm": 2.1626060009002686, "learning_rate": 4.7303875592788843e-05, "loss": 0.9848, "step": 94000 }, { "epoch": 0.16262792966901343, "grad_norm": 2.2629404067993164, "learning_rate": 4.728953450551645e-05, "loss": 0.9872, "step": 94500 }, { "epoch": 0.16348839490535744, "grad_norm": 2.1497435569763184, "learning_rate": 4.7275193418244044e-05, "loss": 0.984, "step": 95000 }, { "epoch": 0.16434886014170141, "grad_norm": 2.212721586227417, "learning_rate": 4.726085233097164e-05, "loss": 0.9874, "step": 95500 }, { "epoch": 0.1652093253780454, "grad_norm": 2.1093273162841797, "learning_rate": 4.7246511243699245e-05, "loss": 0.9752, "step": 96000 }, { "epoch": 0.1660697906143894, "grad_norm": 2.026782274246216, "learning_rate": 4.723217015642685e-05, "loss": 0.9847, "step": 96500 }, { "epoch": 0.16693025585073337, "grad_norm": 2.3008055686950684, "learning_rate": 4.7217829069154446e-05, "loss": 0.982, "step": 97000 }, { "epoch": 0.16779072108707735, "grad_norm": 2.1564080715179443, "learning_rate": 4.720348798188204e-05, "loss": 0.9794, "step": 97500 }, { "epoch": 0.16865118632342135, "grad_norm": 2.0874173641204834, "learning_rate": 4.718914689460965e-05, "loss": 0.9804, "step": 98000 }, { "epoch": 0.16951165155976533, "grad_norm": 1.9421424865722656, "learning_rate": 4.7174805807337244e-05, "loss": 0.9862, "step": 98500 }, { "epoch": 0.1703721167961093, "grad_norm": 2.108234167098999, "learning_rate": 4.716046472006485e-05, "loss": 0.9826, "step": 99000 }, { "epoch": 0.1712325820324533, "grad_norm": 2.1940762996673584, "learning_rate": 4.714612363279245e-05, "loss": 0.9807, "step": 99500 }, { "epoch": 0.1720930472687973, "grad_norm": 2.1281466484069824, "learning_rate": 4.713178254552005e-05, "loss": 0.9801, "step": 100000 }, { "epoch": 0.1729535125051413, "grad_norm": 2.0339930057525635, "learning_rate": 4.7117441458247646e-05, "loss": 0.9703, "step": 100500 }, { "epoch": 0.17381397774148527, "grad_norm": 2.073927402496338, "learning_rate": 4.710310037097525e-05, "loss": 0.9773, "step": 101000 }, { "epoch": 0.17467444297782925, "grad_norm": 2.0153615474700928, "learning_rate": 4.708875928370285e-05, "loss": 0.9817, "step": 101500 }, { "epoch": 0.17553490821417325, "grad_norm": 2.182114839553833, "learning_rate": 4.707441819643045e-05, "loss": 0.9702, "step": 102000 }, { "epoch": 0.17639537345051723, "grad_norm": 2.2643656730651855, "learning_rate": 4.706007710915805e-05, "loss": 0.9803, "step": 102500 }, { "epoch": 0.1772558386868612, "grad_norm": 2.1115529537200928, "learning_rate": 4.704573602188565e-05, "loss": 0.9782, "step": 103000 }, { "epoch": 0.1781163039232052, "grad_norm": 2.089778184890747, "learning_rate": 4.703139493461325e-05, "loss": 0.974, "step": 103500 }, { "epoch": 0.1789767691595492, "grad_norm": 2.1720387935638428, "learning_rate": 4.7017053847340846e-05, "loss": 0.9726, "step": 104000 }, { "epoch": 0.17983723439589316, "grad_norm": 2.0718801021575928, "learning_rate": 4.700271276006845e-05, "loss": 0.9693, "step": 104500 }, { "epoch": 0.18069769963223717, "grad_norm": 1.9460430145263672, "learning_rate": 4.698837167279605e-05, "loss": 0.9661, "step": 105000 }, { "epoch": 0.18155816486858115, "grad_norm": 2.153810977935791, "learning_rate": 4.697403058552365e-05, "loss": 0.9734, "step": 105500 }, { "epoch": 0.18241863010492512, "grad_norm": 2.104356050491333, "learning_rate": 4.6959689498251255e-05, "loss": 0.9718, "step": 106000 }, { "epoch": 0.18327909534126913, "grad_norm": 2.3347885608673096, "learning_rate": 4.694534841097885e-05, "loss": 0.9745, "step": 106500 }, { "epoch": 0.1841395605776131, "grad_norm": 2.0650954246520996, "learning_rate": 4.693100732370645e-05, "loss": 0.9731, "step": 107000 }, { "epoch": 0.18500002581395708, "grad_norm": 2.060877561569214, "learning_rate": 4.6916666236434046e-05, "loss": 0.9762, "step": 107500 }, { "epoch": 0.18586049105030109, "grad_norm": 2.2722277641296387, "learning_rate": 4.690232514916165e-05, "loss": 0.9679, "step": 108000 }, { "epoch": 0.18672095628664506, "grad_norm": 2.2440106868743896, "learning_rate": 4.6887984061889254e-05, "loss": 0.978, "step": 108500 }, { "epoch": 0.18758142152298904, "grad_norm": 1.9629226922988892, "learning_rate": 4.687364297461685e-05, "loss": 0.972, "step": 109000 }, { "epoch": 0.18844188675933304, "grad_norm": 2.1158809661865234, "learning_rate": 4.6859301887344455e-05, "loss": 0.9697, "step": 109500 }, { "epoch": 0.18930235199567702, "grad_norm": 2.001575231552124, "learning_rate": 4.684496080007205e-05, "loss": 0.9699, "step": 110000 }, { "epoch": 0.190162817232021, "grad_norm": 2.1017117500305176, "learning_rate": 4.683061971279965e-05, "loss": 0.965, "step": 110500 }, { "epoch": 0.191023282468365, "grad_norm": 2.198977470397949, "learning_rate": 4.681627862552725e-05, "loss": 0.9682, "step": 111000 }, { "epoch": 0.19188374770470898, "grad_norm": 2.0062718391418457, "learning_rate": 4.680193753825485e-05, "loss": 0.9709, "step": 111500 }, { "epoch": 0.19274421294105296, "grad_norm": 2.038653612136841, "learning_rate": 4.6787596450982454e-05, "loss": 0.9668, "step": 112000 }, { "epoch": 0.19360467817739696, "grad_norm": 2.0191009044647217, "learning_rate": 4.677325536371006e-05, "loss": 0.9669, "step": 112500 }, { "epoch": 0.19446514341374094, "grad_norm": 2.0882835388183594, "learning_rate": 4.6758914276437655e-05, "loss": 0.9727, "step": 113000 }, { "epoch": 0.19532560865008491, "grad_norm": 2.0213959217071533, "learning_rate": 4.674457318916525e-05, "loss": 0.9689, "step": 113500 }, { "epoch": 0.19618607388642892, "grad_norm": 2.1264288425445557, "learning_rate": 4.673023210189285e-05, "loss": 0.9778, "step": 114000 }, { "epoch": 0.1970465391227729, "grad_norm": 2.0388376712799072, "learning_rate": 4.671589101462045e-05, "loss": 0.9625, "step": 114500 }, { "epoch": 0.1979070043591169, "grad_norm": 2.0679802894592285, "learning_rate": 4.6701549927348056e-05, "loss": 0.9651, "step": 115000 }, { "epoch": 0.19876746959546088, "grad_norm": 2.2121481895446777, "learning_rate": 4.6687208840075653e-05, "loss": 0.9602, "step": 115500 }, { "epoch": 0.19962793483180485, "grad_norm": 1.9560799598693848, "learning_rate": 4.667286775280326e-05, "loss": 0.9625, "step": 116000 }, { "epoch": 0.20048840006814886, "grad_norm": 2.0703752040863037, "learning_rate": 4.6658526665530854e-05, "loss": 0.965, "step": 116500 }, { "epoch": 0.20134886530449284, "grad_norm": 2.160184621810913, "learning_rate": 4.664418557825845e-05, "loss": 0.9673, "step": 117000 }, { "epoch": 0.2022093305408368, "grad_norm": 1.996235728263855, "learning_rate": 4.6629844490986055e-05, "loss": 0.9601, "step": 117500 }, { "epoch": 0.20306979577718082, "grad_norm": 2.1551196575164795, "learning_rate": 4.661550340371366e-05, "loss": 0.9662, "step": 118000 }, { "epoch": 0.2039302610135248, "grad_norm": 1.9875051975250244, "learning_rate": 4.6601162316441256e-05, "loss": 0.9648, "step": 118500 }, { "epoch": 0.20479072624986877, "grad_norm": 2.0819616317749023, "learning_rate": 4.658682122916885e-05, "loss": 0.9612, "step": 119000 }, { "epoch": 0.20565119148621278, "grad_norm": 2.1169705390930176, "learning_rate": 4.657248014189646e-05, "loss": 0.9575, "step": 119500 }, { "epoch": 0.20651165672255675, "grad_norm": 2.142817258834839, "learning_rate": 4.6558139054624054e-05, "loss": 0.965, "step": 120000 }, { "epoch": 0.20737212195890073, "grad_norm": 2.0226452350616455, "learning_rate": 4.654379796735165e-05, "loss": 0.9574, "step": 120500 }, { "epoch": 0.20823258719524473, "grad_norm": 2.0718817710876465, "learning_rate": 4.6529456880079255e-05, "loss": 0.9624, "step": 121000 }, { "epoch": 0.2090930524315887, "grad_norm": 2.1692280769348145, "learning_rate": 4.651511579280686e-05, "loss": 0.9608, "step": 121500 }, { "epoch": 0.2099535176679327, "grad_norm": 2.2331693172454834, "learning_rate": 4.6500774705534456e-05, "loss": 0.9628, "step": 122000 }, { "epoch": 0.2108139829042767, "grad_norm": 2.127037763595581, "learning_rate": 4.648643361826206e-05, "loss": 0.9657, "step": 122500 }, { "epoch": 0.21167444814062067, "grad_norm": 2.1657791137695312, "learning_rate": 4.647209253098966e-05, "loss": 0.9615, "step": 123000 }, { "epoch": 0.21253491337696465, "grad_norm": 2.10907244682312, "learning_rate": 4.6457751443717254e-05, "loss": 0.9575, "step": 123500 }, { "epoch": 0.21339537861330865, "grad_norm": 2.026007652282715, "learning_rate": 4.644341035644486e-05, "loss": 0.9584, "step": 124000 }, { "epoch": 0.21425584384965263, "grad_norm": 2.1519062519073486, "learning_rate": 4.642906926917246e-05, "loss": 0.9653, "step": 124500 }, { "epoch": 0.2151163090859966, "grad_norm": 2.004723310470581, "learning_rate": 4.641472818190006e-05, "loss": 0.9603, "step": 125000 }, { "epoch": 0.2159767743223406, "grad_norm": 2.032517433166504, "learning_rate": 4.6400387094627656e-05, "loss": 0.9531, "step": 125500 }, { "epoch": 0.21683723955868459, "grad_norm": 2.0830347537994385, "learning_rate": 4.638604600735526e-05, "loss": 0.9581, "step": 126000 }, { "epoch": 0.21769770479502856, "grad_norm": 2.0828824043273926, "learning_rate": 4.637170492008286e-05, "loss": 0.95, "step": 126500 }, { "epoch": 0.21855817003137257, "grad_norm": 1.990696907043457, "learning_rate": 4.635736383281046e-05, "loss": 0.961, "step": 127000 }, { "epoch": 0.21941863526771654, "grad_norm": 2.3424088954925537, "learning_rate": 4.6343022745538065e-05, "loss": 0.9562, "step": 127500 }, { "epoch": 0.22027910050406055, "grad_norm": 2.015568494796753, "learning_rate": 4.632868165826566e-05, "loss": 0.9582, "step": 128000 }, { "epoch": 0.22113956574040453, "grad_norm": 2.360718011856079, "learning_rate": 4.631434057099326e-05, "loss": 0.9551, "step": 128500 }, { "epoch": 0.2220000309767485, "grad_norm": 2.2896780967712402, "learning_rate": 4.629999948372086e-05, "loss": 0.9588, "step": 129000 }, { "epoch": 0.2228604962130925, "grad_norm": 2.264202356338501, "learning_rate": 4.628565839644846e-05, "loss": 0.956, "step": 129500 }, { "epoch": 0.22372096144943648, "grad_norm": 2.0719611644744873, "learning_rate": 4.627131730917606e-05, "loss": 0.9545, "step": 130000 }, { "epoch": 0.22458142668578046, "grad_norm": 2.0525126457214355, "learning_rate": 4.625697622190366e-05, "loss": 0.9538, "step": 130500 }, { "epoch": 0.22544189192212447, "grad_norm": 2.126758098602295, "learning_rate": 4.6242635134631265e-05, "loss": 0.9572, "step": 131000 }, { "epoch": 0.22630235715846844, "grad_norm": 2.1774702072143555, "learning_rate": 4.622829404735886e-05, "loss": 0.9506, "step": 131500 }, { "epoch": 0.22716282239481242, "grad_norm": 1.9970122575759888, "learning_rate": 4.621395296008646e-05, "loss": 0.9566, "step": 132000 }, { "epoch": 0.22802328763115642, "grad_norm": 2.1346707344055176, "learning_rate": 4.619961187281406e-05, "loss": 0.9504, "step": 132500 }, { "epoch": 0.2288837528675004, "grad_norm": 2.0689857006073, "learning_rate": 4.618527078554166e-05, "loss": 0.9561, "step": 133000 }, { "epoch": 0.22974421810384438, "grad_norm": 1.9728847742080688, "learning_rate": 4.6170929698269264e-05, "loss": 0.9616, "step": 133500 }, { "epoch": 0.23060468334018838, "grad_norm": 2.1437137126922607, "learning_rate": 4.615658861099687e-05, "loss": 0.9522, "step": 134000 }, { "epoch": 0.23146514857653236, "grad_norm": 1.8884685039520264, "learning_rate": 4.6142247523724465e-05, "loss": 0.9501, "step": 134500 }, { "epoch": 0.23232561381287634, "grad_norm": 1.8019119501113892, "learning_rate": 4.612790643645206e-05, "loss": 0.9532, "step": 135000 }, { "epoch": 0.23318607904922034, "grad_norm": 2.117466688156128, "learning_rate": 4.6113565349179666e-05, "loss": 0.9482, "step": 135500 }, { "epoch": 0.23404654428556432, "grad_norm": 1.976753830909729, "learning_rate": 4.609922426190726e-05, "loss": 0.9515, "step": 136000 }, { "epoch": 0.2349070095219083, "grad_norm": 2.071876049041748, "learning_rate": 4.608488317463486e-05, "loss": 0.9518, "step": 136500 }, { "epoch": 0.2357674747582523, "grad_norm": 2.0765326023101807, "learning_rate": 4.6070542087362464e-05, "loss": 0.947, "step": 137000 }, { "epoch": 0.23662793999459628, "grad_norm": 1.9267277717590332, "learning_rate": 4.605620100009007e-05, "loss": 0.9525, "step": 137500 }, { "epoch": 0.23748840523094025, "grad_norm": 2.0131049156188965, "learning_rate": 4.6041859912817664e-05, "loss": 0.9485, "step": 138000 }, { "epoch": 0.23834887046728426, "grad_norm": 1.991150140762329, "learning_rate": 4.602751882554526e-05, "loss": 0.956, "step": 138500 }, { "epoch": 0.23920933570362823, "grad_norm": 2.0469107627868652, "learning_rate": 4.6013177738272865e-05, "loss": 0.949, "step": 139000 }, { "epoch": 0.2400698009399722, "grad_norm": 2.15240216255188, "learning_rate": 4.599883665100046e-05, "loss": 0.9451, "step": 139500 }, { "epoch": 0.24093026617631622, "grad_norm": 3.1448512077331543, "learning_rate": 4.5984495563728066e-05, "loss": 0.9438, "step": 140000 }, { "epoch": 0.2417907314126602, "grad_norm": 1.9543808698654175, "learning_rate": 4.597015447645567e-05, "loss": 0.9417, "step": 140500 }, { "epoch": 0.2426511966490042, "grad_norm": 2.0985629558563232, "learning_rate": 4.595581338918327e-05, "loss": 0.944, "step": 141000 }, { "epoch": 0.24351166188534817, "grad_norm": 2.0854549407958984, "learning_rate": 4.5941472301910864e-05, "loss": 0.9538, "step": 141500 }, { "epoch": 0.24437212712169215, "grad_norm": 1.9694116115570068, "learning_rate": 4.592713121463846e-05, "loss": 0.9491, "step": 142000 }, { "epoch": 0.24523259235803616, "grad_norm": 1.9953951835632324, "learning_rate": 4.5912790127366065e-05, "loss": 0.939, "step": 142500 }, { "epoch": 0.24609305759438013, "grad_norm": 2.2640116214752197, "learning_rate": 4.589844904009367e-05, "loss": 0.9506, "step": 143000 }, { "epoch": 0.2469535228307241, "grad_norm": 2.2083499431610107, "learning_rate": 4.5884107952821266e-05, "loss": 0.951, "step": 143500 }, { "epoch": 0.24781398806706811, "grad_norm": 2.226285457611084, "learning_rate": 4.586976686554887e-05, "loss": 0.9461, "step": 144000 }, { "epoch": 0.2486744533034121, "grad_norm": 2.1445624828338623, "learning_rate": 4.585542577827647e-05, "loss": 0.9462, "step": 144500 }, { "epoch": 0.24953491853975607, "grad_norm": 2.054795265197754, "learning_rate": 4.5841084691004064e-05, "loss": 0.9508, "step": 145000 }, { "epoch": 0.25039538377610004, "grad_norm": 2.2365095615386963, "learning_rate": 4.582674360373167e-05, "loss": 0.944, "step": 145500 }, { "epoch": 0.251255849012444, "grad_norm": 1.9677467346191406, "learning_rate": 4.5812402516459265e-05, "loss": 0.9462, "step": 146000 }, { "epoch": 0.25211631424878805, "grad_norm": 2.1352016925811768, "learning_rate": 4.579806142918687e-05, "loss": 0.938, "step": 146500 }, { "epoch": 0.25297677948513203, "grad_norm": 2.162677526473999, "learning_rate": 4.578372034191447e-05, "loss": 0.9475, "step": 147000 }, { "epoch": 0.253837244721476, "grad_norm": 2.0151124000549316, "learning_rate": 4.576937925464207e-05, "loss": 0.9448, "step": 147500 }, { "epoch": 0.25469770995782, "grad_norm": 2.222620964050293, "learning_rate": 4.575503816736967e-05, "loss": 0.9423, "step": 148000 }, { "epoch": 0.25555817519416396, "grad_norm": 2.1002838611602783, "learning_rate": 4.5740697080097264e-05, "loss": 0.9398, "step": 148500 }, { "epoch": 0.256418640430508, "grad_norm": 2.1127843856811523, "learning_rate": 4.572635599282487e-05, "loss": 0.9391, "step": 149000 }, { "epoch": 0.25727910566685197, "grad_norm": 2.1093132495880127, "learning_rate": 4.571201490555247e-05, "loss": 0.9428, "step": 149500 }, { "epoch": 0.25813957090319595, "grad_norm": 2.0932860374450684, "learning_rate": 4.569767381828007e-05, "loss": 0.9475, "step": 150000 }, { "epoch": 0.2590000361395399, "grad_norm": 2.1527037620544434, "learning_rate": 4.568333273100767e-05, "loss": 0.9453, "step": 150500 }, { "epoch": 0.2598605013758839, "grad_norm": 2.1468565464019775, "learning_rate": 4.566899164373527e-05, "loss": 0.9398, "step": 151000 }, { "epoch": 0.2607209666122279, "grad_norm": 2.050854444503784, "learning_rate": 4.565465055646287e-05, "loss": 0.9412, "step": 151500 }, { "epoch": 0.2615814318485719, "grad_norm": 2.9939045906066895, "learning_rate": 4.564030946919047e-05, "loss": 0.9459, "step": 152000 }, { "epoch": 0.2624418970849159, "grad_norm": 2.3177473545074463, "learning_rate": 4.5625968381918075e-05, "loss": 0.9381, "step": 152500 }, { "epoch": 0.26330236232125986, "grad_norm": 2.2248950004577637, "learning_rate": 4.561162729464567e-05, "loss": 0.9454, "step": 153000 }, { "epoch": 0.26416282755760384, "grad_norm": 2.0898079872131348, "learning_rate": 4.559728620737327e-05, "loss": 0.9422, "step": 153500 }, { "epoch": 0.2650232927939478, "grad_norm": 2.272148609161377, "learning_rate": 4.558294512010087e-05, "loss": 0.9464, "step": 154000 }, { "epoch": 0.2658837580302918, "grad_norm": 1.8870999813079834, "learning_rate": 4.556860403282847e-05, "loss": 0.9379, "step": 154500 }, { "epoch": 0.2667442232666358, "grad_norm": 2.9211108684539795, "learning_rate": 4.555426294555607e-05, "loss": 0.9381, "step": 155000 }, { "epoch": 0.2676046885029798, "grad_norm": 1.9935648441314697, "learning_rate": 4.553992185828367e-05, "loss": 0.9415, "step": 155500 }, { "epoch": 0.2684651537393238, "grad_norm": 2.087963819503784, "learning_rate": 4.5525580771011275e-05, "loss": 0.9431, "step": 156000 }, { "epoch": 0.26932561897566776, "grad_norm": 2.077944278717041, "learning_rate": 4.551123968373887e-05, "loss": 0.9312, "step": 156500 }, { "epoch": 0.27018608421201173, "grad_norm": 2.0577354431152344, "learning_rate": 4.5496898596466476e-05, "loss": 0.9431, "step": 157000 }, { "epoch": 0.2710465494483557, "grad_norm": 1.9151005744934082, "learning_rate": 4.548255750919407e-05, "loss": 0.9417, "step": 157500 }, { "epoch": 0.27190701468469974, "grad_norm": 2.0716753005981445, "learning_rate": 4.546821642192167e-05, "loss": 0.9411, "step": 158000 }, { "epoch": 0.2727674799210437, "grad_norm": 2.233751058578491, "learning_rate": 4.5453875334649274e-05, "loss": 0.943, "step": 158500 }, { "epoch": 0.2736279451573877, "grad_norm": 2.095139265060425, "learning_rate": 4.543953424737688e-05, "loss": 0.9401, "step": 159000 }, { "epoch": 0.2744884103937317, "grad_norm": 2.020110607147217, "learning_rate": 4.5425193160104475e-05, "loss": 0.9443, "step": 159500 }, { "epoch": 0.27534887563007565, "grad_norm": 2.152085542678833, "learning_rate": 4.541085207283207e-05, "loss": 0.9357, "step": 160000 }, { "epoch": 0.27620934086641963, "grad_norm": 2.083021879196167, "learning_rate": 4.5396510985559675e-05, "loss": 0.9367, "step": 160500 }, { "epoch": 0.27706980610276366, "grad_norm": 2.171924591064453, "learning_rate": 4.538216989828727e-05, "loss": 0.9325, "step": 161000 }, { "epoch": 0.27793027133910764, "grad_norm": 1.9896093606948853, "learning_rate": 4.5367828811014876e-05, "loss": 0.9379, "step": 161500 }, { "epoch": 0.2787907365754516, "grad_norm": 2.2307465076446533, "learning_rate": 4.535348772374248e-05, "loss": 0.9376, "step": 162000 }, { "epoch": 0.2796512018117956, "grad_norm": 2.0789954662323, "learning_rate": 4.533914663647008e-05, "loss": 0.9394, "step": 162500 }, { "epoch": 0.28051166704813957, "grad_norm": 2.0075278282165527, "learning_rate": 4.5324805549197674e-05, "loss": 0.9353, "step": 163000 }, { "epoch": 0.2813721322844836, "grad_norm": 2.100450277328491, "learning_rate": 4.531046446192528e-05, "loss": 0.9393, "step": 163500 }, { "epoch": 0.2822325975208276, "grad_norm": 2.096583604812622, "learning_rate": 4.5296123374652875e-05, "loss": 0.9267, "step": 164000 }, { "epoch": 0.28309306275717155, "grad_norm": 2.025454044342041, "learning_rate": 4.528178228738047e-05, "loss": 0.934, "step": 164500 }, { "epoch": 0.28395352799351553, "grad_norm": 2.026967763900757, "learning_rate": 4.5267441200108076e-05, "loss": 0.9324, "step": 165000 }, { "epoch": 0.2848139932298595, "grad_norm": 1.9689258337020874, "learning_rate": 4.525310011283568e-05, "loss": 0.9298, "step": 165500 }, { "epoch": 0.2856744584662035, "grad_norm": 2.0960588455200195, "learning_rate": 4.523875902556328e-05, "loss": 0.9262, "step": 166000 }, { "epoch": 0.2865349237025475, "grad_norm": 2.2745180130004883, "learning_rate": 4.5224417938290874e-05, "loss": 0.93, "step": 166500 }, { "epoch": 0.2873953889388915, "grad_norm": 2.1345441341400146, "learning_rate": 4.521007685101848e-05, "loss": 0.9297, "step": 167000 }, { "epoch": 0.28825585417523547, "grad_norm": 1.9947483539581299, "learning_rate": 4.5195735763746075e-05, "loss": 0.9325, "step": 167500 }, { "epoch": 0.28911631941157945, "grad_norm": 1.983390212059021, "learning_rate": 4.518139467647368e-05, "loss": 0.9317, "step": 168000 }, { "epoch": 0.2899767846479234, "grad_norm": 2.014618396759033, "learning_rate": 4.516705358920128e-05, "loss": 0.9414, "step": 168500 }, { "epoch": 0.2908372498842674, "grad_norm": 2.0016281604766846, "learning_rate": 4.515271250192888e-05, "loss": 0.9368, "step": 169000 }, { "epoch": 0.29169771512061143, "grad_norm": 1.968947410583496, "learning_rate": 4.513837141465648e-05, "loss": 0.9318, "step": 169500 }, { "epoch": 0.2925581803569554, "grad_norm": 2.0170702934265137, "learning_rate": 4.512403032738408e-05, "loss": 0.9312, "step": 170000 }, { "epoch": 0.2934186455932994, "grad_norm": 2.0024240016937256, "learning_rate": 4.510968924011168e-05, "loss": 0.9267, "step": 170500 }, { "epoch": 0.29427911082964336, "grad_norm": 2.0785553455352783, "learning_rate": 4.5095348152839275e-05, "loss": 0.9322, "step": 171000 }, { "epoch": 0.29513957606598734, "grad_norm": 2.0966298580169678, "learning_rate": 4.508100706556688e-05, "loss": 0.9303, "step": 171500 }, { "epoch": 0.2960000413023313, "grad_norm": 2.1931986808776855, "learning_rate": 4.506666597829448e-05, "loss": 0.9331, "step": 172000 }, { "epoch": 0.29686050653867535, "grad_norm": 2.2337117195129395, "learning_rate": 4.505232489102208e-05, "loss": 0.9332, "step": 172500 }, { "epoch": 0.2977209717750193, "grad_norm": 1.900471568107605, "learning_rate": 4.503798380374968e-05, "loss": 0.926, "step": 173000 }, { "epoch": 0.2985814370113633, "grad_norm": 2.045421838760376, "learning_rate": 4.502364271647728e-05, "loss": 0.9237, "step": 173500 }, { "epoch": 0.2994419022477073, "grad_norm": 1.988563060760498, "learning_rate": 4.500930162920488e-05, "loss": 0.9318, "step": 174000 }, { "epoch": 0.30030236748405126, "grad_norm": 2.1729953289031982, "learning_rate": 4.499496054193248e-05, "loss": 0.9283, "step": 174500 }, { "epoch": 0.3011628327203953, "grad_norm": 2.043877601623535, "learning_rate": 4.4980619454660086e-05, "loss": 0.9258, "step": 175000 }, { "epoch": 0.30202329795673927, "grad_norm": 1.941645622253418, "learning_rate": 4.496627836738768e-05, "loss": 0.9342, "step": 175500 }, { "epoch": 0.30288376319308324, "grad_norm": 2.305682897567749, "learning_rate": 4.495193728011528e-05, "loss": 0.9228, "step": 176000 }, { "epoch": 0.3037442284294272, "grad_norm": 1.9681456089019775, "learning_rate": 4.493759619284288e-05, "loss": 0.9228, "step": 176500 }, { "epoch": 0.3046046936657712, "grad_norm": 2.2018189430236816, "learning_rate": 4.492325510557048e-05, "loss": 0.9249, "step": 177000 }, { "epoch": 0.3054651589021152, "grad_norm": 2.322690963745117, "learning_rate": 4.4908914018298085e-05, "loss": 0.9298, "step": 177500 }, { "epoch": 0.3063256241384592, "grad_norm": 2.019329786300659, "learning_rate": 4.489457293102568e-05, "loss": 0.9281, "step": 178000 }, { "epoch": 0.3071860893748032, "grad_norm": 2.0158770084381104, "learning_rate": 4.4880231843753286e-05, "loss": 0.9282, "step": 178500 }, { "epoch": 0.30804655461114716, "grad_norm": 1.9363014698028564, "learning_rate": 4.486589075648088e-05, "loss": 0.9281, "step": 179000 }, { "epoch": 0.30890701984749114, "grad_norm": 2.32789945602417, "learning_rate": 4.485154966920848e-05, "loss": 0.9268, "step": 179500 }, { "epoch": 0.3097674850838351, "grad_norm": 2.1968231201171875, "learning_rate": 4.4837208581936084e-05, "loss": 0.9172, "step": 180000 }, { "epoch": 0.3106279503201791, "grad_norm": 1.9847798347473145, "learning_rate": 4.482286749466368e-05, "loss": 0.9294, "step": 180500 }, { "epoch": 0.3114884155565231, "grad_norm": 1.9857689142227173, "learning_rate": 4.4808526407391285e-05, "loss": 0.9336, "step": 181000 }, { "epoch": 0.3123488807928671, "grad_norm": 2.266566514968872, "learning_rate": 4.479418532011889e-05, "loss": 0.9257, "step": 181500 }, { "epoch": 0.3132093460292111, "grad_norm": 2.045213222503662, "learning_rate": 4.4779844232846486e-05, "loss": 0.93, "step": 182000 }, { "epoch": 0.31406981126555505, "grad_norm": 2.0169789791107178, "learning_rate": 4.476550314557408e-05, "loss": 0.9235, "step": 182500 }, { "epoch": 0.31493027650189903, "grad_norm": 2.1827285289764404, "learning_rate": 4.475116205830168e-05, "loss": 0.9251, "step": 183000 }, { "epoch": 0.315790741738243, "grad_norm": 2.1066367626190186, "learning_rate": 4.4736820971029284e-05, "loss": 0.9243, "step": 183500 }, { "epoch": 0.31665120697458704, "grad_norm": 2.0616202354431152, "learning_rate": 4.472247988375689e-05, "loss": 0.9263, "step": 184000 }, { "epoch": 0.317511672210931, "grad_norm": 2.0447213649749756, "learning_rate": 4.4708138796484485e-05, "loss": 0.9276, "step": 184500 }, { "epoch": 0.318372137447275, "grad_norm": 2.1739165782928467, "learning_rate": 4.469379770921209e-05, "loss": 0.9239, "step": 185000 }, { "epoch": 0.31923260268361897, "grad_norm": 3.1863811016082764, "learning_rate": 4.4679456621939685e-05, "loss": 0.9234, "step": 185500 }, { "epoch": 0.32009306791996295, "grad_norm": 2.1314053535461426, "learning_rate": 4.466511553466728e-05, "loss": 0.925, "step": 186000 }, { "epoch": 0.3209535331563069, "grad_norm": 2.092536449432373, "learning_rate": 4.4650774447394886e-05, "loss": 0.9256, "step": 186500 }, { "epoch": 0.32181399839265096, "grad_norm": 2.1120293140411377, "learning_rate": 4.463643336012249e-05, "loss": 0.9252, "step": 187000 }, { "epoch": 0.32267446362899493, "grad_norm": 1.976959466934204, "learning_rate": 4.462209227285009e-05, "loss": 0.9208, "step": 187500 }, { "epoch": 0.3235349288653389, "grad_norm": 2.041245937347412, "learning_rate": 4.4607751185577684e-05, "loss": 0.9258, "step": 188000 }, { "epoch": 0.3243953941016829, "grad_norm": 2.3442962169647217, "learning_rate": 4.459341009830529e-05, "loss": 0.9212, "step": 188500 }, { "epoch": 0.32525585933802686, "grad_norm": 2.0697786808013916, "learning_rate": 4.4579069011032885e-05, "loss": 0.9139, "step": 189000 }, { "epoch": 0.3261163245743709, "grad_norm": 2.3078439235687256, "learning_rate": 4.456472792376048e-05, "loss": 0.9223, "step": 189500 }, { "epoch": 0.3269767898107149, "grad_norm": 2.058361768722534, "learning_rate": 4.4550386836488086e-05, "loss": 0.9224, "step": 190000 }, { "epoch": 0.32783725504705885, "grad_norm": 2.203118324279785, "learning_rate": 4.453604574921569e-05, "loss": 0.9237, "step": 190500 }, { "epoch": 0.32869772028340283, "grad_norm": 1.89760422706604, "learning_rate": 4.452170466194329e-05, "loss": 0.9212, "step": 191000 }, { "epoch": 0.3295581855197468, "grad_norm": 1.964350938796997, "learning_rate": 4.450736357467089e-05, "loss": 0.9237, "step": 191500 }, { "epoch": 0.3304186507560908, "grad_norm": 2.0023722648620605, "learning_rate": 4.449302248739849e-05, "loss": 0.9227, "step": 192000 }, { "epoch": 0.3312791159924348, "grad_norm": 2.03126859664917, "learning_rate": 4.4478681400126085e-05, "loss": 0.9167, "step": 192500 }, { "epoch": 0.3321395812287788, "grad_norm": 2.0440711975097656, "learning_rate": 4.446434031285369e-05, "loss": 0.9125, "step": 193000 }, { "epoch": 0.33300004646512277, "grad_norm": 2.1581292152404785, "learning_rate": 4.444999922558129e-05, "loss": 0.9168, "step": 193500 }, { "epoch": 0.33386051170146674, "grad_norm": 2.038708448410034, "learning_rate": 4.443565813830889e-05, "loss": 0.9225, "step": 194000 }, { "epoch": 0.3347209769378107, "grad_norm": 2.237109422683716, "learning_rate": 4.442131705103649e-05, "loss": 0.9146, "step": 194500 }, { "epoch": 0.3355814421741547, "grad_norm": 2.081531286239624, "learning_rate": 4.440697596376409e-05, "loss": 0.9151, "step": 195000 }, { "epoch": 0.33644190741049873, "grad_norm": 1.8253999948501587, "learning_rate": 4.439263487649169e-05, "loss": 0.9221, "step": 195500 }, { "epoch": 0.3373023726468427, "grad_norm": 2.0977344512939453, "learning_rate": 4.4378293789219285e-05, "loss": 0.9182, "step": 196000 }, { "epoch": 0.3381628378831867, "grad_norm": 2.108025550842285, "learning_rate": 4.4363952701946896e-05, "loss": 0.9145, "step": 196500 }, { "epoch": 0.33902330311953066, "grad_norm": 2.023393154144287, "learning_rate": 4.434961161467449e-05, "loss": 0.9179, "step": 197000 }, { "epoch": 0.33988376835587464, "grad_norm": 2.1165921688079834, "learning_rate": 4.433527052740209e-05, "loss": 0.9206, "step": 197500 }, { "epoch": 0.3407442335922186, "grad_norm": 1.7804551124572754, "learning_rate": 4.4320929440129694e-05, "loss": 0.9188, "step": 198000 }, { "epoch": 0.34160469882856265, "grad_norm": 1.9795186519622803, "learning_rate": 4.430658835285729e-05, "loss": 0.9176, "step": 198500 }, { "epoch": 0.3424651640649066, "grad_norm": 2.0974135398864746, "learning_rate": 4.429224726558489e-05, "loss": 0.921, "step": 199000 }, { "epoch": 0.3433256293012506, "grad_norm": 2.106494903564453, "learning_rate": 4.427790617831249e-05, "loss": 0.9218, "step": 199500 }, { "epoch": 0.3441860945375946, "grad_norm": 2.2233493328094482, "learning_rate": 4.4263565091040096e-05, "loss": 0.9184, "step": 200000 }, { "epoch": 0.34504655977393855, "grad_norm": 1.9548823833465576, "learning_rate": 4.424922400376769e-05, "loss": 0.9166, "step": 200500 }, { "epoch": 0.3459070250102826, "grad_norm": 1.9309402704238892, "learning_rate": 4.423488291649529e-05, "loss": 0.9133, "step": 201000 }, { "epoch": 0.34676749024662656, "grad_norm": 2.1494619846343994, "learning_rate": 4.4220541829222894e-05, "loss": 0.9163, "step": 201500 }, { "epoch": 0.34762795548297054, "grad_norm": 2.0527141094207764, "learning_rate": 4.420620074195049e-05, "loss": 0.9221, "step": 202000 }, { "epoch": 0.3484884207193145, "grad_norm": 2.224276542663574, "learning_rate": 4.4191859654678095e-05, "loss": 0.9114, "step": 202500 }, { "epoch": 0.3493488859556585, "grad_norm": 2.2672996520996094, "learning_rate": 4.41775185674057e-05, "loss": 0.9164, "step": 203000 }, { "epoch": 0.35020935119200247, "grad_norm": 1.9365850687026978, "learning_rate": 4.4163177480133296e-05, "loss": 0.909, "step": 203500 }, { "epoch": 0.3510698164283465, "grad_norm": 2.1146013736724854, "learning_rate": 4.414883639286089e-05, "loss": 0.9205, "step": 204000 }, { "epoch": 0.3519302816646905, "grad_norm": 2.0115785598754883, "learning_rate": 4.4134495305588497e-05, "loss": 0.9118, "step": 204500 }, { "epoch": 0.35279074690103446, "grad_norm": 1.9710958003997803, "learning_rate": 4.4120154218316094e-05, "loss": 0.9147, "step": 205000 }, { "epoch": 0.35365121213737843, "grad_norm": 2.1901769638061523, "learning_rate": 4.410581313104369e-05, "loss": 0.9157, "step": 205500 }, { "epoch": 0.3545116773737224, "grad_norm": 1.9956777095794678, "learning_rate": 4.4091472043771295e-05, "loss": 0.9112, "step": 206000 }, { "epoch": 0.3553721426100664, "grad_norm": 2.0977365970611572, "learning_rate": 4.40771309564989e-05, "loss": 0.9199, "step": 206500 }, { "epoch": 0.3562326078464104, "grad_norm": 2.092785358428955, "learning_rate": 4.4062789869226496e-05, "loss": 0.9111, "step": 207000 }, { "epoch": 0.3570930730827544, "grad_norm": 2.122471570968628, "learning_rate": 4.404844878195409e-05, "loss": 0.9114, "step": 207500 }, { "epoch": 0.3579535383190984, "grad_norm": 1.9477378129959106, "learning_rate": 4.4034107694681696e-05, "loss": 0.9091, "step": 208000 }, { "epoch": 0.35881400355544235, "grad_norm": 1.773989200592041, "learning_rate": 4.4019766607409294e-05, "loss": 0.9117, "step": 208500 }, { "epoch": 0.35967446879178633, "grad_norm": 2.234323024749756, "learning_rate": 4.40054255201369e-05, "loss": 0.9144, "step": 209000 }, { "epoch": 0.3605349340281303, "grad_norm": 2.09895920753479, "learning_rate": 4.39910844328645e-05, "loss": 0.9127, "step": 209500 }, { "epoch": 0.36139539926447434, "grad_norm": 2.123444080352783, "learning_rate": 4.39767433455921e-05, "loss": 0.9126, "step": 210000 }, { "epoch": 0.3622558645008183, "grad_norm": 2.052734851837158, "learning_rate": 4.3962402258319695e-05, "loss": 0.9128, "step": 210500 }, { "epoch": 0.3631163297371623, "grad_norm": 2.132145404815674, "learning_rate": 4.39480611710473e-05, "loss": 0.9117, "step": 211000 }, { "epoch": 0.36397679497350627, "grad_norm": 2.2205147743225098, "learning_rate": 4.3933720083774896e-05, "loss": 0.9174, "step": 211500 }, { "epoch": 0.36483726020985024, "grad_norm": 2.234994411468506, "learning_rate": 4.39193789965025e-05, "loss": 0.9121, "step": 212000 }, { "epoch": 0.3656977254461942, "grad_norm": 1.8479299545288086, "learning_rate": 4.39050379092301e-05, "loss": 0.9083, "step": 212500 }, { "epoch": 0.36655819068253825, "grad_norm": 2.0153796672821045, "learning_rate": 4.38906968219577e-05, "loss": 0.9124, "step": 213000 }, { "epoch": 0.36741865591888223, "grad_norm": 2.0594232082366943, "learning_rate": 4.38763557346853e-05, "loss": 0.9075, "step": 213500 }, { "epoch": 0.3682791211552262, "grad_norm": 2.0284030437469482, "learning_rate": 4.3862014647412895e-05, "loss": 0.9139, "step": 214000 }, { "epoch": 0.3691395863915702, "grad_norm": 2.0767407417297363, "learning_rate": 4.38476735601405e-05, "loss": 0.9152, "step": 214500 }, { "epoch": 0.37000005162791416, "grad_norm": 2.1043102741241455, "learning_rate": 4.3833332472868096e-05, "loss": 0.9116, "step": 215000 }, { "epoch": 0.3708605168642582, "grad_norm": 1.960959792137146, "learning_rate": 4.38189913855957e-05, "loss": 0.9092, "step": 215500 }, { "epoch": 0.37172098210060217, "grad_norm": 1.9991062879562378, "learning_rate": 4.3804650298323304e-05, "loss": 0.9193, "step": 216000 }, { "epoch": 0.37258144733694615, "grad_norm": 2.0325255393981934, "learning_rate": 4.37903092110509e-05, "loss": 0.9115, "step": 216500 }, { "epoch": 0.3734419125732901, "grad_norm": 2.1963534355163574, "learning_rate": 4.37759681237785e-05, "loss": 0.9094, "step": 217000 }, { "epoch": 0.3743023778096341, "grad_norm": 2.172563314437866, "learning_rate": 4.3761627036506095e-05, "loss": 0.909, "step": 217500 }, { "epoch": 0.3751628430459781, "grad_norm": 2.0587527751922607, "learning_rate": 4.37472859492337e-05, "loss": 0.9089, "step": 218000 }, { "epoch": 0.3760233082823221, "grad_norm": 2.1344945430755615, "learning_rate": 4.37329448619613e-05, "loss": 0.9146, "step": 218500 }, { "epoch": 0.3768837735186661, "grad_norm": 4.116335868835449, "learning_rate": 4.37186037746889e-05, "loss": 0.9093, "step": 219000 }, { "epoch": 0.37774423875501006, "grad_norm": 1.8812421560287476, "learning_rate": 4.3704262687416504e-05, "loss": 0.9085, "step": 219500 }, { "epoch": 0.37860470399135404, "grad_norm": 1.841057538986206, "learning_rate": 4.36899216001441e-05, "loss": 0.9108, "step": 220000 }, { "epoch": 0.379465169227698, "grad_norm": 2.1964564323425293, "learning_rate": 4.36755805128717e-05, "loss": 0.9086, "step": 220500 }, { "epoch": 0.380325634464042, "grad_norm": 1.981557011604309, "learning_rate": 4.36612394255993e-05, "loss": 0.9109, "step": 221000 }, { "epoch": 0.381186099700386, "grad_norm": 2.1632473468780518, "learning_rate": 4.3646898338326906e-05, "loss": 0.9012, "step": 221500 }, { "epoch": 0.38204656493673, "grad_norm": 2.0308001041412354, "learning_rate": 4.36325572510545e-05, "loss": 0.9077, "step": 222000 }, { "epoch": 0.382907030173074, "grad_norm": 1.95619535446167, "learning_rate": 4.361821616378211e-05, "loss": 0.9083, "step": 222500 }, { "epoch": 0.38376749540941796, "grad_norm": 2.0333635807037354, "learning_rate": 4.3603875076509704e-05, "loss": 0.9116, "step": 223000 }, { "epoch": 0.38462796064576193, "grad_norm": 2.3641154766082764, "learning_rate": 4.35895339892373e-05, "loss": 0.9077, "step": 223500 }, { "epoch": 0.3854884258821059, "grad_norm": 2.0519320964813232, "learning_rate": 4.35751929019649e-05, "loss": 0.9131, "step": 224000 }, { "epoch": 0.38634889111844994, "grad_norm": 2.1741113662719727, "learning_rate": 4.35608518146925e-05, "loss": 0.9051, "step": 224500 }, { "epoch": 0.3872093563547939, "grad_norm": 2.2641000747680664, "learning_rate": 4.3546510727420106e-05, "loss": 0.9148, "step": 225000 }, { "epoch": 0.3880698215911379, "grad_norm": 2.112330675125122, "learning_rate": 4.35321696401477e-05, "loss": 0.9028, "step": 225500 }, { "epoch": 0.3889302868274819, "grad_norm": 2.238736152648926, "learning_rate": 4.351782855287531e-05, "loss": 0.9075, "step": 226000 }, { "epoch": 0.38979075206382585, "grad_norm": 2.0583317279815674, "learning_rate": 4.3503487465602904e-05, "loss": 0.9059, "step": 226500 }, { "epoch": 0.39065121730016983, "grad_norm": 2.021120309829712, "learning_rate": 4.34891463783305e-05, "loss": 0.907, "step": 227000 }, { "epoch": 0.39151168253651386, "grad_norm": 2.0049400329589844, "learning_rate": 4.3474805291058105e-05, "loss": 0.9028, "step": 227500 }, { "epoch": 0.39237214777285784, "grad_norm": 2.042653799057007, "learning_rate": 4.346046420378571e-05, "loss": 0.9047, "step": 228000 }, { "epoch": 0.3932326130092018, "grad_norm": 2.180102586746216, "learning_rate": 4.3446123116513306e-05, "loss": 0.9072, "step": 228500 }, { "epoch": 0.3940930782455458, "grad_norm": 2.1076409816741943, "learning_rate": 4.34317820292409e-05, "loss": 0.9014, "step": 229000 }, { "epoch": 0.39495354348188977, "grad_norm": 2.024866819381714, "learning_rate": 4.3417440941968507e-05, "loss": 0.9078, "step": 229500 }, { "epoch": 0.3958140087182338, "grad_norm": 2.0629196166992188, "learning_rate": 4.3403099854696104e-05, "loss": 0.9021, "step": 230000 }, { "epoch": 0.3966744739545778, "grad_norm": 2.060455799102783, "learning_rate": 4.33887587674237e-05, "loss": 0.8988, "step": 230500 }, { "epoch": 0.39753493919092175, "grad_norm": 2.0068538188934326, "learning_rate": 4.337441768015131e-05, "loss": 0.9071, "step": 231000 }, { "epoch": 0.39839540442726573, "grad_norm": 2.2743167877197266, "learning_rate": 4.336007659287891e-05, "loss": 0.9067, "step": 231500 }, { "epoch": 0.3992558696636097, "grad_norm": 1.9746699333190918, "learning_rate": 4.3345735505606505e-05, "loss": 0.9028, "step": 232000 }, { "epoch": 0.4001163348999537, "grad_norm": 2.108346700668335, "learning_rate": 4.333139441833411e-05, "loss": 0.9046, "step": 232500 }, { "epoch": 0.4009768001362977, "grad_norm": 2.0433006286621094, "learning_rate": 4.3317053331061706e-05, "loss": 0.9079, "step": 233000 }, { "epoch": 0.4018372653726417, "grad_norm": 2.050401210784912, "learning_rate": 4.3302712243789304e-05, "loss": 0.9032, "step": 233500 }, { "epoch": 0.40269773060898567, "grad_norm": 2.0607173442840576, "learning_rate": 4.328837115651691e-05, "loss": 0.8957, "step": 234000 }, { "epoch": 0.40355819584532965, "grad_norm": 2.1468214988708496, "learning_rate": 4.327403006924451e-05, "loss": 0.8996, "step": 234500 }, { "epoch": 0.4044186610816736, "grad_norm": 2.10554838180542, "learning_rate": 4.325968898197211e-05, "loss": 0.9052, "step": 235000 }, { "epoch": 0.4052791263180176, "grad_norm": 1.9938172101974487, "learning_rate": 4.3245347894699705e-05, "loss": 0.8994, "step": 235500 }, { "epoch": 0.40613959155436163, "grad_norm": 2.218541383743286, "learning_rate": 4.323100680742731e-05, "loss": 0.9005, "step": 236000 }, { "epoch": 0.4070000567907056, "grad_norm": 1.946315050125122, "learning_rate": 4.3216665720154906e-05, "loss": 0.9019, "step": 236500 }, { "epoch": 0.4078605220270496, "grad_norm": 2.236579179763794, "learning_rate": 4.320232463288251e-05, "loss": 0.9049, "step": 237000 }, { "epoch": 0.40872098726339356, "grad_norm": 2.0288383960723877, "learning_rate": 4.3187983545610114e-05, "loss": 0.9001, "step": 237500 }, { "epoch": 0.40958145249973754, "grad_norm": 1.885482907295227, "learning_rate": 4.317364245833771e-05, "loss": 0.9043, "step": 238000 }, { "epoch": 0.4104419177360815, "grad_norm": 1.9730161428451538, "learning_rate": 4.315930137106531e-05, "loss": 0.9043, "step": 238500 }, { "epoch": 0.41130238297242555, "grad_norm": 3.811640501022339, "learning_rate": 4.314496028379291e-05, "loss": 0.9007, "step": 239000 }, { "epoch": 0.41216284820876953, "grad_norm": 1.9219677448272705, "learning_rate": 4.313061919652051e-05, "loss": 0.9059, "step": 239500 }, { "epoch": 0.4130233134451135, "grad_norm": 2.092369318008423, "learning_rate": 4.3116278109248106e-05, "loss": 0.9017, "step": 240000 }, { "epoch": 0.4138837786814575, "grad_norm": 4.2498297691345215, "learning_rate": 4.310193702197571e-05, "loss": 0.8922, "step": 240500 }, { "epoch": 0.41474424391780146, "grad_norm": 2.057892322540283, "learning_rate": 4.3087595934703314e-05, "loss": 0.9016, "step": 241000 }, { "epoch": 0.4156047091541455, "grad_norm": 1.8440794944763184, "learning_rate": 4.307325484743091e-05, "loss": 0.9037, "step": 241500 }, { "epoch": 0.41646517439048947, "grad_norm": 2.4426589012145996, "learning_rate": 4.305891376015851e-05, "loss": 0.8994, "step": 242000 }, { "epoch": 0.41732563962683344, "grad_norm": 2.063756227493286, "learning_rate": 4.304457267288611e-05, "loss": 0.8958, "step": 242500 }, { "epoch": 0.4181861048631774, "grad_norm": 2.2012550830841064, "learning_rate": 4.303023158561371e-05, "loss": 0.9004, "step": 243000 }, { "epoch": 0.4190465700995214, "grad_norm": 2.1932969093322754, "learning_rate": 4.301589049834131e-05, "loss": 0.902, "step": 243500 }, { "epoch": 0.4199070353358654, "grad_norm": 1.9205131530761719, "learning_rate": 4.300154941106892e-05, "loss": 0.9038, "step": 244000 }, { "epoch": 0.4207675005722094, "grad_norm": 2.281463861465454, "learning_rate": 4.2987208323796514e-05, "loss": 0.901, "step": 244500 }, { "epoch": 0.4216279658085534, "grad_norm": 1.8984506130218506, "learning_rate": 4.297286723652411e-05, "loss": 0.8934, "step": 245000 }, { "epoch": 0.42248843104489736, "grad_norm": 2.0649311542510986, "learning_rate": 4.2958526149251715e-05, "loss": 0.8973, "step": 245500 }, { "epoch": 0.42334889628124134, "grad_norm": 2.0315778255462646, "learning_rate": 4.294418506197931e-05, "loss": 0.9, "step": 246000 }, { "epoch": 0.4242093615175853, "grad_norm": 2.0501866340637207, "learning_rate": 4.2929843974706916e-05, "loss": 0.9006, "step": 246500 }, { "epoch": 0.4250698267539293, "grad_norm": 1.8434195518493652, "learning_rate": 4.291550288743451e-05, "loss": 0.8996, "step": 247000 }, { "epoch": 0.4259302919902733, "grad_norm": 1.9517645835876465, "learning_rate": 4.290116180016212e-05, "loss": 0.8961, "step": 247500 }, { "epoch": 0.4267907572266173, "grad_norm": 2.049384355545044, "learning_rate": 4.2886820712889714e-05, "loss": 0.8958, "step": 248000 }, { "epoch": 0.4276512224629613, "grad_norm": 2.0335021018981934, "learning_rate": 4.287247962561731e-05, "loss": 0.9003, "step": 248500 }, { "epoch": 0.42851168769930525, "grad_norm": 1.9440404176712036, "learning_rate": 4.2858138538344915e-05, "loss": 0.9009, "step": 249000 }, { "epoch": 0.42937215293564923, "grad_norm": 1.907614827156067, "learning_rate": 4.284379745107251e-05, "loss": 0.8947, "step": 249500 }, { "epoch": 0.4302326181719932, "grad_norm": 1.9623234272003174, "learning_rate": 4.2829456363800116e-05, "loss": 0.9006, "step": 250000 }, { "epoch": 0.43109308340833724, "grad_norm": 1.9491050243377686, "learning_rate": 4.281511527652772e-05, "loss": 0.901, "step": 250500 }, { "epoch": 0.4319535486446812, "grad_norm": 2.1582417488098145, "learning_rate": 4.2800774189255317e-05, "loss": 0.9015, "step": 251000 }, { "epoch": 0.4328140138810252, "grad_norm": 1.8875306844711304, "learning_rate": 4.2786433101982914e-05, "loss": 0.893, "step": 251500 }, { "epoch": 0.43367447911736917, "grad_norm": 1.9841439723968506, "learning_rate": 4.277209201471051e-05, "loss": 0.895, "step": 252000 }, { "epoch": 0.43453494435371315, "grad_norm": 1.9874733686447144, "learning_rate": 4.2757750927438115e-05, "loss": 0.8975, "step": 252500 }, { "epoch": 0.4353954095900571, "grad_norm": 2.1332311630249023, "learning_rate": 4.274340984016572e-05, "loss": 0.8924, "step": 253000 }, { "epoch": 0.43625587482640116, "grad_norm": 2.069314956665039, "learning_rate": 4.2729068752893316e-05, "loss": 0.8932, "step": 253500 }, { "epoch": 0.43711634006274513, "grad_norm": 2.087435245513916, "learning_rate": 4.271472766562092e-05, "loss": 0.8966, "step": 254000 }, { "epoch": 0.4379768052990891, "grad_norm": 1.9603450298309326, "learning_rate": 4.2700386578348516e-05, "loss": 0.902, "step": 254500 }, { "epoch": 0.4388372705354331, "grad_norm": 2.302950859069824, "learning_rate": 4.2686045491076114e-05, "loss": 0.8944, "step": 255000 }, { "epoch": 0.43969773577177707, "grad_norm": 2.033417224884033, "learning_rate": 4.267170440380372e-05, "loss": 0.8938, "step": 255500 }, { "epoch": 0.4405582010081211, "grad_norm": 2.086822271347046, "learning_rate": 4.265736331653132e-05, "loss": 0.8947, "step": 256000 }, { "epoch": 0.4414186662444651, "grad_norm": 1.9544868469238281, "learning_rate": 4.264302222925892e-05, "loss": 0.8908, "step": 256500 }, { "epoch": 0.44227913148080905, "grad_norm": 2.105898380279541, "learning_rate": 4.262868114198652e-05, "loss": 0.8936, "step": 257000 }, { "epoch": 0.44313959671715303, "grad_norm": 2.320366144180298, "learning_rate": 4.261434005471412e-05, "loss": 0.8963, "step": 257500 }, { "epoch": 0.444000061953497, "grad_norm": 2.063095808029175, "learning_rate": 4.2599998967441716e-05, "loss": 0.896, "step": 258000 }, { "epoch": 0.444860527189841, "grad_norm": 1.9787700176239014, "learning_rate": 4.2585657880169313e-05, "loss": 0.893, "step": 258500 }, { "epoch": 0.445720992426185, "grad_norm": 1.8536509275436401, "learning_rate": 4.257131679289692e-05, "loss": 0.8991, "step": 259000 }, { "epoch": 0.446581457662529, "grad_norm": 2.1534652709960938, "learning_rate": 4.255697570562452e-05, "loss": 0.901, "step": 259500 }, { "epoch": 0.44744192289887297, "grad_norm": 2.1015608310699463, "learning_rate": 4.254263461835212e-05, "loss": 0.8984, "step": 260000 }, { "epoch": 0.44830238813521694, "grad_norm": 2.066599130630493, "learning_rate": 4.252829353107972e-05, "loss": 0.898, "step": 260500 }, { "epoch": 0.4491628533715609, "grad_norm": 2.0920300483703613, "learning_rate": 4.251395244380732e-05, "loss": 0.8964, "step": 261000 }, { "epoch": 0.4500233186079049, "grad_norm": 1.9317692518234253, "learning_rate": 4.2499611356534916e-05, "loss": 0.8948, "step": 261500 }, { "epoch": 0.45088378384424893, "grad_norm": 2.1189675331115723, "learning_rate": 4.248527026926252e-05, "loss": 0.8939, "step": 262000 }, { "epoch": 0.4517442490805929, "grad_norm": 2.0204405784606934, "learning_rate": 4.2470929181990124e-05, "loss": 0.8919, "step": 262500 }, { "epoch": 0.4526047143169369, "grad_norm": 2.0056357383728027, "learning_rate": 4.245658809471772e-05, "loss": 0.8933, "step": 263000 }, { "epoch": 0.45346517955328086, "grad_norm": 2.0812103748321533, "learning_rate": 4.244224700744532e-05, "loss": 0.8939, "step": 263500 }, { "epoch": 0.45432564478962484, "grad_norm": 2.1630799770355225, "learning_rate": 4.242790592017292e-05, "loss": 0.8919, "step": 264000 }, { "epoch": 0.4551861100259688, "grad_norm": 2.2746987342834473, "learning_rate": 4.241356483290052e-05, "loss": 0.898, "step": 264500 }, { "epoch": 0.45604657526231285, "grad_norm": 2.476505756378174, "learning_rate": 4.2399223745628116e-05, "loss": 0.8919, "step": 265000 }, { "epoch": 0.4569070404986568, "grad_norm": 2.2061052322387695, "learning_rate": 4.238488265835573e-05, "loss": 0.8959, "step": 265500 }, { "epoch": 0.4577675057350008, "grad_norm": 2.035796642303467, "learning_rate": 4.2370541571083324e-05, "loss": 0.8907, "step": 266000 }, { "epoch": 0.4586279709713448, "grad_norm": 2.0652384757995605, "learning_rate": 4.235620048381092e-05, "loss": 0.8902, "step": 266500 }, { "epoch": 0.45948843620768876, "grad_norm": 2.0927200317382812, "learning_rate": 4.2341859396538525e-05, "loss": 0.8942, "step": 267000 }, { "epoch": 0.46034890144403273, "grad_norm": 1.8390779495239258, "learning_rate": 4.232751830926612e-05, "loss": 0.8916, "step": 267500 }, { "epoch": 0.46120936668037676, "grad_norm": 2.0620462894439697, "learning_rate": 4.231317722199372e-05, "loss": 0.8914, "step": 268000 }, { "epoch": 0.46206983191672074, "grad_norm": 1.9553086757659912, "learning_rate": 4.229883613472132e-05, "loss": 0.8897, "step": 268500 }, { "epoch": 0.4629302971530647, "grad_norm": 2.0123965740203857, "learning_rate": 4.228449504744893e-05, "loss": 0.8935, "step": 269000 }, { "epoch": 0.4637907623894087, "grad_norm": 1.8187013864517212, "learning_rate": 4.2270153960176524e-05, "loss": 0.8921, "step": 269500 }, { "epoch": 0.46465122762575267, "grad_norm": 2.2623324394226074, "learning_rate": 4.225581287290412e-05, "loss": 0.8918, "step": 270000 }, { "epoch": 0.4655116928620967, "grad_norm": 1.965198278427124, "learning_rate": 4.2241471785631725e-05, "loss": 0.8944, "step": 270500 }, { "epoch": 0.4663721580984407, "grad_norm": 1.9791353940963745, "learning_rate": 4.222713069835932e-05, "loss": 0.8899, "step": 271000 }, { "epoch": 0.46723262333478466, "grad_norm": 2.0294482707977295, "learning_rate": 4.2212789611086926e-05, "loss": 0.8929, "step": 271500 }, { "epoch": 0.46809308857112863, "grad_norm": 1.869797706604004, "learning_rate": 4.219844852381453e-05, "loss": 0.8914, "step": 272000 }, { "epoch": 0.4689535538074726, "grad_norm": 2.0433285236358643, "learning_rate": 4.218410743654213e-05, "loss": 0.8929, "step": 272500 }, { "epoch": 0.4698140190438166, "grad_norm": 2.2087464332580566, "learning_rate": 4.2169766349269724e-05, "loss": 0.8923, "step": 273000 }, { "epoch": 0.4706744842801606, "grad_norm": 2.0753061771392822, "learning_rate": 4.215542526199733e-05, "loss": 0.8918, "step": 273500 }, { "epoch": 0.4715349495165046, "grad_norm": 2.107851266860962, "learning_rate": 4.2141084174724925e-05, "loss": 0.8943, "step": 274000 }, { "epoch": 0.4723954147528486, "grad_norm": 2.038882255554199, "learning_rate": 4.212674308745252e-05, "loss": 0.8932, "step": 274500 }, { "epoch": 0.47325587998919255, "grad_norm": 2.12312388420105, "learning_rate": 4.2112402000180126e-05, "loss": 0.8867, "step": 275000 }, { "epoch": 0.47411634522553653, "grad_norm": 2.0398507118225098, "learning_rate": 4.209806091290773e-05, "loss": 0.8921, "step": 275500 }, { "epoch": 0.4749768104618805, "grad_norm": 2.09047794342041, "learning_rate": 4.2083719825635327e-05, "loss": 0.8817, "step": 276000 }, { "epoch": 0.47583727569822454, "grad_norm": 1.9717376232147217, "learning_rate": 4.2069378738362924e-05, "loss": 0.8922, "step": 276500 }, { "epoch": 0.4766977409345685, "grad_norm": 1.9686635732650757, "learning_rate": 4.205503765109053e-05, "loss": 0.8972, "step": 277000 }, { "epoch": 0.4775582061709125, "grad_norm": 1.8727903366088867, "learning_rate": 4.2040696563818125e-05, "loss": 0.8892, "step": 277500 }, { "epoch": 0.47841867140725647, "grad_norm": 2.0806689262390137, "learning_rate": 4.202635547654573e-05, "loss": 0.8869, "step": 278000 }, { "epoch": 0.47927913664360045, "grad_norm": 1.9705919027328491, "learning_rate": 4.201201438927333e-05, "loss": 0.8844, "step": 278500 }, { "epoch": 0.4801396018799444, "grad_norm": 2.01405668258667, "learning_rate": 4.199767330200093e-05, "loss": 0.893, "step": 279000 }, { "epoch": 0.48100006711628845, "grad_norm": 2.08960223197937, "learning_rate": 4.1983332214728526e-05, "loss": 0.8842, "step": 279500 }, { "epoch": 0.48186053235263243, "grad_norm": 2.0530412197113037, "learning_rate": 4.196899112745613e-05, "loss": 0.8826, "step": 280000 }, { "epoch": 0.4827209975889764, "grad_norm": 1.8881099224090576, "learning_rate": 4.195465004018373e-05, "loss": 0.8829, "step": 280500 }, { "epoch": 0.4835814628253204, "grad_norm": 2.0287468433380127, "learning_rate": 4.194030895291133e-05, "loss": 0.8814, "step": 281000 }, { "epoch": 0.48444192806166436, "grad_norm": 2.0433034896850586, "learning_rate": 4.192596786563893e-05, "loss": 0.8889, "step": 281500 }, { "epoch": 0.4853023932980084, "grad_norm": 2.0601441860198975, "learning_rate": 4.191162677836653e-05, "loss": 0.8868, "step": 282000 }, { "epoch": 0.48616285853435237, "grad_norm": 2.3189380168914795, "learning_rate": 4.189728569109413e-05, "loss": 0.8894, "step": 282500 }, { "epoch": 0.48702332377069635, "grad_norm": 2.302962064743042, "learning_rate": 4.1882944603821726e-05, "loss": 0.8834, "step": 283000 }, { "epoch": 0.4878837890070403, "grad_norm": 1.9201202392578125, "learning_rate": 4.186860351654933e-05, "loss": 0.8846, "step": 283500 }, { "epoch": 0.4887442542433843, "grad_norm": 2.085836887359619, "learning_rate": 4.185426242927693e-05, "loss": 0.8919, "step": 284000 }, { "epoch": 0.4896047194797283, "grad_norm": 2.1023192405700684, "learning_rate": 4.183992134200453e-05, "loss": 0.8881, "step": 284500 }, { "epoch": 0.4904651847160723, "grad_norm": 2.0571980476379395, "learning_rate": 4.1825580254732135e-05, "loss": 0.8833, "step": 285000 }, { "epoch": 0.4913256499524163, "grad_norm": 2.038604259490967, "learning_rate": 4.181123916745973e-05, "loss": 0.8816, "step": 285500 }, { "epoch": 0.49218611518876026, "grad_norm": 2.018510341644287, "learning_rate": 4.179689808018733e-05, "loss": 0.8829, "step": 286000 }, { "epoch": 0.49304658042510424, "grad_norm": 2.025078535079956, "learning_rate": 4.1782556992914926e-05, "loss": 0.8874, "step": 286500 }, { "epoch": 0.4939070456614482, "grad_norm": 2.0259318351745605, "learning_rate": 4.176821590564253e-05, "loss": 0.8783, "step": 287000 }, { "epoch": 0.4947675108977922, "grad_norm": 2.0440592765808105, "learning_rate": 4.1753874818370134e-05, "loss": 0.8852, "step": 287500 }, { "epoch": 0.49562797613413623, "grad_norm": 2.1548845767974854, "learning_rate": 4.173953373109773e-05, "loss": 0.8843, "step": 288000 }, { "epoch": 0.4964884413704802, "grad_norm": 2.020847797393799, "learning_rate": 4.1725192643825335e-05, "loss": 0.8881, "step": 288500 }, { "epoch": 0.4973489066068242, "grad_norm": 2.0484495162963867, "learning_rate": 4.171085155655293e-05, "loss": 0.8902, "step": 289000 }, { "epoch": 0.49820937184316816, "grad_norm": 1.9658195972442627, "learning_rate": 4.169651046928053e-05, "loss": 0.8805, "step": 289500 }, { "epoch": 0.49906983707951214, "grad_norm": 2.1585335731506348, "learning_rate": 4.168216938200813e-05, "loss": 0.8885, "step": 290000 }, { "epoch": 0.4999303023158561, "grad_norm": 2.0190553665161133, "learning_rate": 4.166782829473574e-05, "loss": 0.8833, "step": 290500 }, { "epoch": 0.5007907675522001, "grad_norm": 2.143428087234497, "learning_rate": 4.1653487207463334e-05, "loss": 0.8837, "step": 291000 }, { "epoch": 0.5016512327885441, "grad_norm": 2.24603533744812, "learning_rate": 4.163914612019094e-05, "loss": 0.8779, "step": 291500 }, { "epoch": 0.502511698024888, "grad_norm": 2.109105348587036, "learning_rate": 4.1624805032918535e-05, "loss": 0.8875, "step": 292000 }, { "epoch": 0.5033721632612321, "grad_norm": 1.9702835083007812, "learning_rate": 4.161046394564613e-05, "loss": 0.8804, "step": 292500 }, { "epoch": 0.5042326284975761, "grad_norm": 2.784588575363159, "learning_rate": 4.159612285837373e-05, "loss": 0.8879, "step": 293000 }, { "epoch": 0.5050930937339201, "grad_norm": 1.999629020690918, "learning_rate": 4.158178177110133e-05, "loss": 0.8817, "step": 293500 }, { "epoch": 0.5059535589702641, "grad_norm": 2.034653663635254, "learning_rate": 4.156744068382894e-05, "loss": 0.8719, "step": 294000 }, { "epoch": 0.506814024206608, "grad_norm": 2.095053195953369, "learning_rate": 4.1553099596556534e-05, "loss": 0.8823, "step": 294500 }, { "epoch": 0.507674489442952, "grad_norm": 1.9798400402069092, "learning_rate": 4.153875850928414e-05, "loss": 0.8818, "step": 295000 }, { "epoch": 0.508534954679296, "grad_norm": 2.06270170211792, "learning_rate": 4.1524417422011735e-05, "loss": 0.8829, "step": 295500 }, { "epoch": 0.50939541991564, "grad_norm": 2.1127400398254395, "learning_rate": 4.151007633473933e-05, "loss": 0.8815, "step": 296000 }, { "epoch": 0.510255885151984, "grad_norm": 2.151163101196289, "learning_rate": 4.1495735247466936e-05, "loss": 0.8791, "step": 296500 }, { "epoch": 0.5111163503883279, "grad_norm": 2.0091350078582764, "learning_rate": 4.148139416019454e-05, "loss": 0.8948, "step": 297000 }, { "epoch": 0.5119768156246719, "grad_norm": 2.0361227989196777, "learning_rate": 4.146705307292214e-05, "loss": 0.8772, "step": 297500 }, { "epoch": 0.512837280861016, "grad_norm": 1.9893290996551514, "learning_rate": 4.1452711985649734e-05, "loss": 0.8823, "step": 298000 }, { "epoch": 0.51369774609736, "grad_norm": 1.988218069076538, "learning_rate": 4.143837089837734e-05, "loss": 0.891, "step": 298500 }, { "epoch": 0.5145582113337039, "grad_norm": 1.9783134460449219, "learning_rate": 4.1424029811104935e-05, "loss": 0.8815, "step": 299000 }, { "epoch": 0.5154186765700479, "grad_norm": 2.102525234222412, "learning_rate": 4.140968872383253e-05, "loss": 0.8734, "step": 299500 }, { "epoch": 0.5162791418063919, "grad_norm": 2.2450239658355713, "learning_rate": 4.1395347636560136e-05, "loss": 0.8838, "step": 300000 }, { "epoch": 0.5171396070427359, "grad_norm": 2.089251756668091, "learning_rate": 4.138100654928774e-05, "loss": 0.8758, "step": 300500 }, { "epoch": 0.5180000722790798, "grad_norm": 2.2183520793914795, "learning_rate": 4.1366665462015337e-05, "loss": 0.8814, "step": 301000 }, { "epoch": 0.5188605375154238, "grad_norm": 1.9809156656265259, "learning_rate": 4.135232437474294e-05, "loss": 0.8751, "step": 301500 }, { "epoch": 0.5197210027517678, "grad_norm": 2.149174451828003, "learning_rate": 4.133798328747054e-05, "loss": 0.8779, "step": 302000 }, { "epoch": 0.5205814679881118, "grad_norm": 2.312516927719116, "learning_rate": 4.1323642200198135e-05, "loss": 0.8797, "step": 302500 }, { "epoch": 0.5214419332244558, "grad_norm": 2.053981065750122, "learning_rate": 4.130930111292574e-05, "loss": 0.8824, "step": 303000 }, { "epoch": 0.5223023984607997, "grad_norm": 1.9515398740768433, "learning_rate": 4.129496002565334e-05, "loss": 0.8838, "step": 303500 }, { "epoch": 0.5231628636971438, "grad_norm": 2.1182141304016113, "learning_rate": 4.128061893838094e-05, "loss": 0.8817, "step": 304000 }, { "epoch": 0.5240233289334878, "grad_norm": 2.140738010406494, "learning_rate": 4.1266277851108536e-05, "loss": 0.8806, "step": 304500 }, { "epoch": 0.5248837941698318, "grad_norm": 2.0797948837280273, "learning_rate": 4.125193676383614e-05, "loss": 0.88, "step": 305000 }, { "epoch": 0.5257442594061758, "grad_norm": 2.092442274093628, "learning_rate": 4.123759567656374e-05, "loss": 0.8825, "step": 305500 }, { "epoch": 0.5266047246425197, "grad_norm": 2.1355602741241455, "learning_rate": 4.122325458929134e-05, "loss": 0.8776, "step": 306000 }, { "epoch": 0.5274651898788637, "grad_norm": 2.0246939659118652, "learning_rate": 4.1208913502018945e-05, "loss": 0.8827, "step": 306500 }, { "epoch": 0.5283256551152077, "grad_norm": 1.9206238985061646, "learning_rate": 4.119457241474654e-05, "loss": 0.8833, "step": 307000 }, { "epoch": 0.5291861203515517, "grad_norm": 2.153930187225342, "learning_rate": 4.118023132747414e-05, "loss": 0.8825, "step": 307500 }, { "epoch": 0.5300465855878956, "grad_norm": 1.9237273931503296, "learning_rate": 4.116589024020174e-05, "loss": 0.8895, "step": 308000 }, { "epoch": 0.5309070508242396, "grad_norm": 2.0631232261657715, "learning_rate": 4.115154915292934e-05, "loss": 0.8824, "step": 308500 }, { "epoch": 0.5317675160605836, "grad_norm": 1.9997292757034302, "learning_rate": 4.113720806565694e-05, "loss": 0.8784, "step": 309000 }, { "epoch": 0.5326279812969277, "grad_norm": 2.1037609577178955, "learning_rate": 4.112286697838454e-05, "loss": 0.8733, "step": 309500 }, { "epoch": 0.5334884465332717, "grad_norm": 2.1329736709594727, "learning_rate": 4.1108525891112145e-05, "loss": 0.8767, "step": 310000 }, { "epoch": 0.5343489117696156, "grad_norm": 2.07114577293396, "learning_rate": 4.109418480383974e-05, "loss": 0.8743, "step": 310500 }, { "epoch": 0.5352093770059596, "grad_norm": 2.110933780670166, "learning_rate": 4.107984371656734e-05, "loss": 0.8737, "step": 311000 }, { "epoch": 0.5360698422423036, "grad_norm": 1.9786311388015747, "learning_rate": 4.106550262929494e-05, "loss": 0.877, "step": 311500 }, { "epoch": 0.5369303074786476, "grad_norm": 1.9082682132720947, "learning_rate": 4.105116154202254e-05, "loss": 0.8783, "step": 312000 }, { "epoch": 0.5377907727149915, "grad_norm": 2.0248301029205322, "learning_rate": 4.1036820454750144e-05, "loss": 0.8786, "step": 312500 }, { "epoch": 0.5386512379513355, "grad_norm": 2.0271735191345215, "learning_rate": 4.102247936747775e-05, "loss": 0.8784, "step": 313000 }, { "epoch": 0.5395117031876795, "grad_norm": 1.8867460489273071, "learning_rate": 4.1008138280205345e-05, "loss": 0.8698, "step": 313500 }, { "epoch": 0.5403721684240235, "grad_norm": 2.0477957725524902, "learning_rate": 4.099379719293294e-05, "loss": 0.8793, "step": 314000 }, { "epoch": 0.5412326336603674, "grad_norm": 2.1383612155914307, "learning_rate": 4.0979456105660546e-05, "loss": 0.8828, "step": 314500 }, { "epoch": 0.5420930988967114, "grad_norm": 1.9967807531356812, "learning_rate": 4.096511501838814e-05, "loss": 0.8816, "step": 315000 }, { "epoch": 0.5429535641330555, "grad_norm": 2.489108085632324, "learning_rate": 4.095077393111575e-05, "loss": 0.8741, "step": 315500 }, { "epoch": 0.5438140293693995, "grad_norm": 2.342057466506958, "learning_rate": 4.0936432843843344e-05, "loss": 0.8875, "step": 316000 }, { "epoch": 0.5446744946057435, "grad_norm": 1.9754986763000488, "learning_rate": 4.092209175657095e-05, "loss": 0.8767, "step": 316500 }, { "epoch": 0.5455349598420874, "grad_norm": 2.211503267288208, "learning_rate": 4.0907750669298545e-05, "loss": 0.8791, "step": 317000 }, { "epoch": 0.5463954250784314, "grad_norm": 1.9339312314987183, "learning_rate": 4.089340958202614e-05, "loss": 0.8754, "step": 317500 }, { "epoch": 0.5472558903147754, "grad_norm": 2.0220510959625244, "learning_rate": 4.0879068494753746e-05, "loss": 0.8705, "step": 318000 }, { "epoch": 0.5481163555511194, "grad_norm": 1.974448323249817, "learning_rate": 4.086472740748134e-05, "loss": 0.8719, "step": 318500 }, { "epoch": 0.5489768207874633, "grad_norm": 2.084419012069702, "learning_rate": 4.085038632020895e-05, "loss": 0.8704, "step": 319000 }, { "epoch": 0.5498372860238073, "grad_norm": 1.9440003633499146, "learning_rate": 4.083604523293655e-05, "loss": 0.8788, "step": 319500 }, { "epoch": 0.5506977512601513, "grad_norm": 2.0453526973724365, "learning_rate": 4.082170414566415e-05, "loss": 0.8752, "step": 320000 }, { "epoch": 0.5515582164964953, "grad_norm": 2.0000038146972656, "learning_rate": 4.0807363058391745e-05, "loss": 0.8773, "step": 320500 }, { "epoch": 0.5524186817328393, "grad_norm": 2.253737688064575, "learning_rate": 4.079302197111935e-05, "loss": 0.8721, "step": 321000 }, { "epoch": 0.5532791469691833, "grad_norm": 2.0602428913116455, "learning_rate": 4.0778680883846946e-05, "loss": 0.8794, "step": 321500 }, { "epoch": 0.5541396122055273, "grad_norm": 2.0261833667755127, "learning_rate": 4.076433979657455e-05, "loss": 0.8724, "step": 322000 }, { "epoch": 0.5550000774418713, "grad_norm": 1.9648793935775757, "learning_rate": 4.0749998709302147e-05, "loss": 0.8724, "step": 322500 }, { "epoch": 0.5558605426782153, "grad_norm": 1.9868428707122803, "learning_rate": 4.073565762202975e-05, "loss": 0.8721, "step": 323000 }, { "epoch": 0.5567210079145593, "grad_norm": 1.9594879150390625, "learning_rate": 4.072131653475735e-05, "loss": 0.8717, "step": 323500 }, { "epoch": 0.5575814731509032, "grad_norm": 2.072599411010742, "learning_rate": 4.0706975447484945e-05, "loss": 0.8758, "step": 324000 }, { "epoch": 0.5584419383872472, "grad_norm": 2.037163257598877, "learning_rate": 4.069263436021255e-05, "loss": 0.8779, "step": 324500 }, { "epoch": 0.5593024036235912, "grad_norm": 2.0014655590057373, "learning_rate": 4.0678293272940146e-05, "loss": 0.8753, "step": 325000 }, { "epoch": 0.5601628688599352, "grad_norm": 1.9225795269012451, "learning_rate": 4.066395218566775e-05, "loss": 0.8713, "step": 325500 }, { "epoch": 0.5610233340962791, "grad_norm": 1.923448085784912, "learning_rate": 4.064961109839535e-05, "loss": 0.8739, "step": 326000 }, { "epoch": 0.5618837993326231, "grad_norm": 2.097306251525879, "learning_rate": 4.063527001112295e-05, "loss": 0.8808, "step": 326500 }, { "epoch": 0.5627442645689672, "grad_norm": 2.081660270690918, "learning_rate": 4.062092892385055e-05, "loss": 0.8812, "step": 327000 }, { "epoch": 0.5636047298053112, "grad_norm": 2.085153818130493, "learning_rate": 4.0606587836578145e-05, "loss": 0.871, "step": 327500 }, { "epoch": 0.5644651950416552, "grad_norm": 2.0828776359558105, "learning_rate": 4.059224674930575e-05, "loss": 0.8691, "step": 328000 }, { "epoch": 0.5653256602779991, "grad_norm": 2.226726531982422, "learning_rate": 4.057790566203335e-05, "loss": 0.879, "step": 328500 }, { "epoch": 0.5661861255143431, "grad_norm": 1.9523491859436035, "learning_rate": 4.056356457476095e-05, "loss": 0.8706, "step": 329000 }, { "epoch": 0.5670465907506871, "grad_norm": 2.002776861190796, "learning_rate": 4.054922348748855e-05, "loss": 0.8752, "step": 329500 }, { "epoch": 0.5679070559870311, "grad_norm": 2.166841745376587, "learning_rate": 4.053488240021615e-05, "loss": 0.8791, "step": 330000 }, { "epoch": 0.568767521223375, "grad_norm": 2.093510627746582, "learning_rate": 4.052054131294375e-05, "loss": 0.8687, "step": 330500 }, { "epoch": 0.569627986459719, "grad_norm": 2.178293466567993, "learning_rate": 4.050620022567135e-05, "loss": 0.8676, "step": 331000 }, { "epoch": 0.570488451696063, "grad_norm": 1.8907274007797241, "learning_rate": 4.0491859138398955e-05, "loss": 0.8666, "step": 331500 }, { "epoch": 0.571348916932407, "grad_norm": 1.8355967998504639, "learning_rate": 4.047751805112655e-05, "loss": 0.8754, "step": 332000 }, { "epoch": 0.572209382168751, "grad_norm": 2.001462936401367, "learning_rate": 4.0463176963854156e-05, "loss": 0.8737, "step": 332500 }, { "epoch": 0.573069847405095, "grad_norm": 2.0783512592315674, "learning_rate": 4.044883587658175e-05, "loss": 0.8642, "step": 333000 }, { "epoch": 0.573930312641439, "grad_norm": 2.1608786582946777, "learning_rate": 4.043449478930935e-05, "loss": 0.878, "step": 333500 }, { "epoch": 0.574790777877783, "grad_norm": 2.18611216545105, "learning_rate": 4.042015370203695e-05, "loss": 0.8803, "step": 334000 }, { "epoch": 0.575651243114127, "grad_norm": 2.0941479206085205, "learning_rate": 4.040581261476455e-05, "loss": 0.874, "step": 334500 }, { "epoch": 0.5765117083504709, "grad_norm": 2.0728914737701416, "learning_rate": 4.0391471527492155e-05, "loss": 0.8729, "step": 335000 }, { "epoch": 0.5773721735868149, "grad_norm": 5.831918716430664, "learning_rate": 4.037713044021975e-05, "loss": 0.8658, "step": 335500 }, { "epoch": 0.5782326388231589, "grad_norm": 2.3052661418914795, "learning_rate": 4.0362789352947356e-05, "loss": 0.8712, "step": 336000 }, { "epoch": 0.5790931040595029, "grad_norm": 1.9301319122314453, "learning_rate": 4.034844826567495e-05, "loss": 0.8811, "step": 336500 }, { "epoch": 0.5799535692958468, "grad_norm": 2.0595436096191406, "learning_rate": 4.033410717840255e-05, "loss": 0.8686, "step": 337000 }, { "epoch": 0.5808140345321908, "grad_norm": 2.0486223697662354, "learning_rate": 4.0319766091130154e-05, "loss": 0.8667, "step": 337500 }, { "epoch": 0.5816744997685348, "grad_norm": 2.193305492401123, "learning_rate": 4.030542500385776e-05, "loss": 0.8702, "step": 338000 }, { "epoch": 0.5825349650048789, "grad_norm": 1.9140911102294922, "learning_rate": 4.0291083916585355e-05, "loss": 0.8711, "step": 338500 }, { "epoch": 0.5833954302412229, "grad_norm": 2.1117045879364014, "learning_rate": 4.027674282931295e-05, "loss": 0.8765, "step": 339000 }, { "epoch": 0.5842558954775668, "grad_norm": 2.079667806625366, "learning_rate": 4.0262401742040556e-05, "loss": 0.8615, "step": 339500 }, { "epoch": 0.5851163607139108, "grad_norm": 2.0438807010650635, "learning_rate": 4.024806065476815e-05, "loss": 0.8701, "step": 340000 }, { "epoch": 0.5859768259502548, "grad_norm": 2.0004425048828125, "learning_rate": 4.023371956749576e-05, "loss": 0.8714, "step": 340500 }, { "epoch": 0.5868372911865988, "grad_norm": 1.8124200105667114, "learning_rate": 4.021937848022336e-05, "loss": 0.869, "step": 341000 }, { "epoch": 0.5876977564229428, "grad_norm": 2.239046812057495, "learning_rate": 4.020503739295096e-05, "loss": 0.8687, "step": 341500 }, { "epoch": 0.5885582216592867, "grad_norm": 1.9475022554397583, "learning_rate": 4.0190696305678555e-05, "loss": 0.8727, "step": 342000 }, { "epoch": 0.5894186868956307, "grad_norm": 2.0992038249969482, "learning_rate": 4.017635521840616e-05, "loss": 0.8747, "step": 342500 }, { "epoch": 0.5902791521319747, "grad_norm": 1.9996228218078613, "learning_rate": 4.0162014131133756e-05, "loss": 0.8728, "step": 343000 }, { "epoch": 0.5911396173683187, "grad_norm": 1.9434428215026855, "learning_rate": 4.014767304386135e-05, "loss": 0.8663, "step": 343500 }, { "epoch": 0.5920000826046626, "grad_norm": 2.194171667098999, "learning_rate": 4.013333195658896e-05, "loss": 0.8769, "step": 344000 }, { "epoch": 0.5928605478410067, "grad_norm": 2.1467361450195312, "learning_rate": 4.011899086931656e-05, "loss": 0.8749, "step": 344500 }, { "epoch": 0.5937210130773507, "grad_norm": 2.1336467266082764, "learning_rate": 4.010464978204416e-05, "loss": 0.8694, "step": 345000 }, { "epoch": 0.5945814783136947, "grad_norm": 2.085965394973755, "learning_rate": 4.0090308694771755e-05, "loss": 0.8696, "step": 345500 }, { "epoch": 0.5954419435500387, "grad_norm": 2.0303428173065186, "learning_rate": 4.007596760749936e-05, "loss": 0.8708, "step": 346000 }, { "epoch": 0.5963024087863826, "grad_norm": 2.0182602405548096, "learning_rate": 4.0061626520226956e-05, "loss": 0.8692, "step": 346500 }, { "epoch": 0.5971628740227266, "grad_norm": 2.3182501792907715, "learning_rate": 4.004728543295456e-05, "loss": 0.8725, "step": 347000 }, { "epoch": 0.5980233392590706, "grad_norm": 1.9483355283737183, "learning_rate": 4.003294434568216e-05, "loss": 0.8704, "step": 347500 }, { "epoch": 0.5988838044954146, "grad_norm": 2.027533531188965, "learning_rate": 4.001860325840976e-05, "loss": 0.8646, "step": 348000 }, { "epoch": 0.5997442697317585, "grad_norm": 2.079859733581543, "learning_rate": 4.000426217113736e-05, "loss": 0.8679, "step": 348500 }, { "epoch": 0.6006047349681025, "grad_norm": 2.240821599960327, "learning_rate": 3.998992108386496e-05, "loss": 0.8701, "step": 349000 }, { "epoch": 0.6014652002044465, "grad_norm": 1.9912835359573364, "learning_rate": 3.997557999659256e-05, "loss": 0.8675, "step": 349500 }, { "epoch": 0.6023256654407906, "grad_norm": 2.0753517150878906, "learning_rate": 3.9961238909320156e-05, "loss": 0.8649, "step": 350000 }, { "epoch": 0.6031861306771346, "grad_norm": 1.8598897457122803, "learning_rate": 3.994689782204776e-05, "loss": 0.8681, "step": 350500 }, { "epoch": 0.6040465959134785, "grad_norm": 1.8939234018325806, "learning_rate": 3.993255673477536e-05, "loss": 0.8702, "step": 351000 }, { "epoch": 0.6049070611498225, "grad_norm": 1.9680449962615967, "learning_rate": 3.991821564750296e-05, "loss": 0.8656, "step": 351500 }, { "epoch": 0.6057675263861665, "grad_norm": 1.9677945375442505, "learning_rate": 3.990387456023056e-05, "loss": 0.8692, "step": 352000 }, { "epoch": 0.6066279916225105, "grad_norm": 2.178060531616211, "learning_rate": 3.988953347295816e-05, "loss": 0.8652, "step": 352500 }, { "epoch": 0.6074884568588544, "grad_norm": 2.035489082336426, "learning_rate": 3.987519238568576e-05, "loss": 0.8633, "step": 353000 }, { "epoch": 0.6083489220951984, "grad_norm": 2.296475410461426, "learning_rate": 3.986085129841336e-05, "loss": 0.8695, "step": 353500 }, { "epoch": 0.6092093873315424, "grad_norm": 2.06257963180542, "learning_rate": 3.9846510211140966e-05, "loss": 0.8701, "step": 354000 }, { "epoch": 0.6100698525678864, "grad_norm": 1.7742199897766113, "learning_rate": 3.983216912386856e-05, "loss": 0.8653, "step": 354500 }, { "epoch": 0.6109303178042303, "grad_norm": 2.1947309970855713, "learning_rate": 3.981782803659616e-05, "loss": 0.8722, "step": 355000 }, { "epoch": 0.6117907830405743, "grad_norm": 1.9710649251937866, "learning_rate": 3.9803486949323764e-05, "loss": 0.8631, "step": 355500 }, { "epoch": 0.6126512482769184, "grad_norm": 2.3806471824645996, "learning_rate": 3.978914586205136e-05, "loss": 0.8651, "step": 356000 }, { "epoch": 0.6135117135132624, "grad_norm": 2.1118831634521484, "learning_rate": 3.9774804774778965e-05, "loss": 0.8688, "step": 356500 }, { "epoch": 0.6143721787496064, "grad_norm": 2.0460760593414307, "learning_rate": 3.976046368750656e-05, "loss": 0.865, "step": 357000 }, { "epoch": 0.6152326439859503, "grad_norm": 1.9501177072525024, "learning_rate": 3.9746122600234166e-05, "loss": 0.863, "step": 357500 }, { "epoch": 0.6160931092222943, "grad_norm": 2.000192165374756, "learning_rate": 3.973178151296176e-05, "loss": 0.8625, "step": 358000 }, { "epoch": 0.6169535744586383, "grad_norm": 2.1251659393310547, "learning_rate": 3.971744042568936e-05, "loss": 0.8684, "step": 358500 }, { "epoch": 0.6178140396949823, "grad_norm": 2.025874376296997, "learning_rate": 3.9703099338416964e-05, "loss": 0.8685, "step": 359000 }, { "epoch": 0.6186745049313263, "grad_norm": 1.909056544303894, "learning_rate": 3.968875825114456e-05, "loss": 0.8639, "step": 359500 }, { "epoch": 0.6195349701676702, "grad_norm": 1.8679569959640503, "learning_rate": 3.9674417163872165e-05, "loss": 0.865, "step": 360000 }, { "epoch": 0.6203954354040142, "grad_norm": 1.9933589696884155, "learning_rate": 3.966007607659977e-05, "loss": 0.8676, "step": 360500 }, { "epoch": 0.6212559006403582, "grad_norm": 2.170215129852295, "learning_rate": 3.9645734989327366e-05, "loss": 0.8652, "step": 361000 }, { "epoch": 0.6221163658767023, "grad_norm": 2.0594518184661865, "learning_rate": 3.963139390205496e-05, "loss": 0.8664, "step": 361500 }, { "epoch": 0.6229768311130462, "grad_norm": 2.1189212799072266, "learning_rate": 3.961705281478256e-05, "loss": 0.861, "step": 362000 }, { "epoch": 0.6238372963493902, "grad_norm": 2.0330159664154053, "learning_rate": 3.9602711727510164e-05, "loss": 0.8609, "step": 362500 }, { "epoch": 0.6246977615857342, "grad_norm": 2.09208345413208, "learning_rate": 3.958837064023777e-05, "loss": 0.8688, "step": 363000 }, { "epoch": 0.6255582268220782, "grad_norm": 2.081916570663452, "learning_rate": 3.9574029552965365e-05, "loss": 0.8615, "step": 363500 }, { "epoch": 0.6264186920584222, "grad_norm": 2.043161630630493, "learning_rate": 3.955968846569297e-05, "loss": 0.8625, "step": 364000 }, { "epoch": 0.6272791572947661, "grad_norm": 2.077566146850586, "learning_rate": 3.9545347378420566e-05, "loss": 0.865, "step": 364500 }, { "epoch": 0.6281396225311101, "grad_norm": 1.8295567035675049, "learning_rate": 3.953100629114816e-05, "loss": 0.8613, "step": 365000 }, { "epoch": 0.6290000877674541, "grad_norm": 2.1657068729400635, "learning_rate": 3.951666520387577e-05, "loss": 0.8603, "step": 365500 }, { "epoch": 0.6298605530037981, "grad_norm": 2.0411875247955322, "learning_rate": 3.950232411660337e-05, "loss": 0.8657, "step": 366000 }, { "epoch": 0.630721018240142, "grad_norm": 2.146219491958618, "learning_rate": 3.948798302933097e-05, "loss": 0.8648, "step": 366500 }, { "epoch": 0.631581483476486, "grad_norm": 2.2003700733184814, "learning_rate": 3.947364194205857e-05, "loss": 0.8626, "step": 367000 }, { "epoch": 0.6324419487128301, "grad_norm": 2.0370073318481445, "learning_rate": 3.945930085478617e-05, "loss": 0.8623, "step": 367500 }, { "epoch": 0.6333024139491741, "grad_norm": 2.0403857231140137, "learning_rate": 3.9444959767513766e-05, "loss": 0.8634, "step": 368000 }, { "epoch": 0.6341628791855181, "grad_norm": 2.0806796550750732, "learning_rate": 3.943061868024136e-05, "loss": 0.8657, "step": 368500 }, { "epoch": 0.635023344421862, "grad_norm": 2.0758426189422607, "learning_rate": 3.941627759296897e-05, "loss": 0.8672, "step": 369000 }, { "epoch": 0.635883809658206, "grad_norm": 1.9031485319137573, "learning_rate": 3.940193650569657e-05, "loss": 0.862, "step": 369500 }, { "epoch": 0.63674427489455, "grad_norm": 1.9357956647872925, "learning_rate": 3.938759541842417e-05, "loss": 0.8659, "step": 370000 }, { "epoch": 0.637604740130894, "grad_norm": 1.8705717325210571, "learning_rate": 3.937325433115177e-05, "loss": 0.8637, "step": 370500 }, { "epoch": 0.6384652053672379, "grad_norm": 1.963387370109558, "learning_rate": 3.935891324387937e-05, "loss": 0.8609, "step": 371000 }, { "epoch": 0.6393256706035819, "grad_norm": 2.1126718521118164, "learning_rate": 3.9344572156606966e-05, "loss": 0.867, "step": 371500 }, { "epoch": 0.6401861358399259, "grad_norm": 1.9715133905410767, "learning_rate": 3.933023106933457e-05, "loss": 0.8633, "step": 372000 }, { "epoch": 0.6410466010762699, "grad_norm": 2.0479183197021484, "learning_rate": 3.931588998206217e-05, "loss": 0.8678, "step": 372500 }, { "epoch": 0.6419070663126138, "grad_norm": 2.021559953689575, "learning_rate": 3.930154889478977e-05, "loss": 0.8661, "step": 373000 }, { "epoch": 0.6427675315489579, "grad_norm": 2.037088394165039, "learning_rate": 3.928720780751737e-05, "loss": 0.8595, "step": 373500 }, { "epoch": 0.6436279967853019, "grad_norm": 1.8662008047103882, "learning_rate": 3.927286672024497e-05, "loss": 0.859, "step": 374000 }, { "epoch": 0.6444884620216459, "grad_norm": 2.037175178527832, "learning_rate": 3.925852563297257e-05, "loss": 0.865, "step": 374500 }, { "epoch": 0.6453489272579899, "grad_norm": 1.90609872341156, "learning_rate": 3.924418454570017e-05, "loss": 0.858, "step": 375000 }, { "epoch": 0.6462093924943338, "grad_norm": 1.9687128067016602, "learning_rate": 3.9229843458427776e-05, "loss": 0.8596, "step": 375500 }, { "epoch": 0.6470698577306778, "grad_norm": 1.905707597732544, "learning_rate": 3.921550237115537e-05, "loss": 0.8594, "step": 376000 }, { "epoch": 0.6479303229670218, "grad_norm": 1.876888394355774, "learning_rate": 3.920116128388297e-05, "loss": 0.8612, "step": 376500 }, { "epoch": 0.6487907882033658, "grad_norm": 2.0150208473205566, "learning_rate": 3.9186820196610574e-05, "loss": 0.8543, "step": 377000 }, { "epoch": 0.6496512534397098, "grad_norm": 1.8568073511123657, "learning_rate": 3.917247910933817e-05, "loss": 0.8634, "step": 377500 }, { "epoch": 0.6505117186760537, "grad_norm": 1.9990817308425903, "learning_rate": 3.915813802206577e-05, "loss": 0.8584, "step": 378000 }, { "epoch": 0.6513721839123977, "grad_norm": 2.0287017822265625, "learning_rate": 3.914379693479337e-05, "loss": 0.8644, "step": 378500 }, { "epoch": 0.6522326491487418, "grad_norm": 1.886164665222168, "learning_rate": 3.9129455847520976e-05, "loss": 0.8662, "step": 379000 }, { "epoch": 0.6530931143850858, "grad_norm": 2.0619380474090576, "learning_rate": 3.911511476024857e-05, "loss": 0.8609, "step": 379500 }, { "epoch": 0.6539535796214297, "grad_norm": 2.1117541790008545, "learning_rate": 3.910077367297617e-05, "loss": 0.8679, "step": 380000 }, { "epoch": 0.6548140448577737, "grad_norm": 2.0683281421661377, "learning_rate": 3.9086432585703774e-05, "loss": 0.8558, "step": 380500 }, { "epoch": 0.6556745100941177, "grad_norm": 2.209404706954956, "learning_rate": 3.907209149843137e-05, "loss": 0.8634, "step": 381000 }, { "epoch": 0.6565349753304617, "grad_norm": 2.2918388843536377, "learning_rate": 3.9057750411158975e-05, "loss": 0.856, "step": 381500 }, { "epoch": 0.6573954405668057, "grad_norm": 1.9002126455307007, "learning_rate": 3.904340932388658e-05, "loss": 0.8663, "step": 382000 }, { "epoch": 0.6582559058031496, "grad_norm": 2.235064744949341, "learning_rate": 3.9029068236614176e-05, "loss": 0.8601, "step": 382500 }, { "epoch": 0.6591163710394936, "grad_norm": 2.0584187507629395, "learning_rate": 3.901472714934177e-05, "loss": 0.8635, "step": 383000 }, { "epoch": 0.6599768362758376, "grad_norm": 2.0273942947387695, "learning_rate": 3.900038606206938e-05, "loss": 0.8618, "step": 383500 }, { "epoch": 0.6608373015121816, "grad_norm": 1.9882392883300781, "learning_rate": 3.8986044974796974e-05, "loss": 0.8552, "step": 384000 }, { "epoch": 0.6616977667485255, "grad_norm": 1.9432196617126465, "learning_rate": 3.897170388752457e-05, "loss": 0.8597, "step": 384500 }, { "epoch": 0.6625582319848696, "grad_norm": 2.048971652984619, "learning_rate": 3.8957362800252175e-05, "loss": 0.8623, "step": 385000 }, { "epoch": 0.6634186972212136, "grad_norm": 2.0021302700042725, "learning_rate": 3.894302171297978e-05, "loss": 0.8611, "step": 385500 }, { "epoch": 0.6642791624575576, "grad_norm": 2.2216758728027344, "learning_rate": 3.8928680625707376e-05, "loss": 0.8574, "step": 386000 }, { "epoch": 0.6651396276939016, "grad_norm": 1.8725037574768066, "learning_rate": 3.891433953843497e-05, "loss": 0.8555, "step": 386500 }, { "epoch": 0.6660000929302455, "grad_norm": 1.8659377098083496, "learning_rate": 3.889999845116258e-05, "loss": 0.8647, "step": 387000 }, { "epoch": 0.6668605581665895, "grad_norm": 2.034635066986084, "learning_rate": 3.8885657363890174e-05, "loss": 0.8603, "step": 387500 }, { "epoch": 0.6677210234029335, "grad_norm": 1.9781776666641235, "learning_rate": 3.887131627661778e-05, "loss": 0.8512, "step": 388000 }, { "epoch": 0.6685814886392775, "grad_norm": 2.0884103775024414, "learning_rate": 3.885697518934538e-05, "loss": 0.8653, "step": 388500 }, { "epoch": 0.6694419538756214, "grad_norm": 2.025995969772339, "learning_rate": 3.884263410207298e-05, "loss": 0.8617, "step": 389000 }, { "epoch": 0.6703024191119654, "grad_norm": 2.181551456451416, "learning_rate": 3.8828293014800576e-05, "loss": 0.8572, "step": 389500 }, { "epoch": 0.6711628843483094, "grad_norm": 1.9846434593200684, "learning_rate": 3.881395192752818e-05, "loss": 0.8558, "step": 390000 }, { "epoch": 0.6720233495846535, "grad_norm": 2.3479115962982178, "learning_rate": 3.879961084025578e-05, "loss": 0.8532, "step": 390500 }, { "epoch": 0.6728838148209975, "grad_norm": 1.9880118370056152, "learning_rate": 3.878526975298338e-05, "loss": 0.862, "step": 391000 }, { "epoch": 0.6737442800573414, "grad_norm": 2.1184558868408203, "learning_rate": 3.877092866571098e-05, "loss": 0.8632, "step": 391500 }, { "epoch": 0.6746047452936854, "grad_norm": 2.0495119094848633, "learning_rate": 3.875658757843858e-05, "loss": 0.8592, "step": 392000 }, { "epoch": 0.6754652105300294, "grad_norm": 1.9544061422348022, "learning_rate": 3.874224649116618e-05, "loss": 0.8583, "step": 392500 }, { "epoch": 0.6763256757663734, "grad_norm": 1.8652530908584595, "learning_rate": 3.8727905403893776e-05, "loss": 0.8523, "step": 393000 }, { "epoch": 0.6771861410027173, "grad_norm": 1.9426854848861694, "learning_rate": 3.871356431662138e-05, "loss": 0.8517, "step": 393500 }, { "epoch": 0.6780466062390613, "grad_norm": 1.9407514333724976, "learning_rate": 3.8699223229348977e-05, "loss": 0.8629, "step": 394000 }, { "epoch": 0.6789070714754053, "grad_norm": 2.0087180137634277, "learning_rate": 3.868488214207658e-05, "loss": 0.8581, "step": 394500 }, { "epoch": 0.6797675367117493, "grad_norm": 2.013326644897461, "learning_rate": 3.8670541054804184e-05, "loss": 0.859, "step": 395000 }, { "epoch": 0.6806280019480933, "grad_norm": 2.0862598419189453, "learning_rate": 3.865619996753178e-05, "loss": 0.8652, "step": 395500 }, { "epoch": 0.6814884671844372, "grad_norm": 2.002089023590088, "learning_rate": 3.864185888025938e-05, "loss": 0.851, "step": 396000 }, { "epoch": 0.6823489324207813, "grad_norm": 2.0043892860412598, "learning_rate": 3.8627517792986976e-05, "loss": 0.8596, "step": 396500 }, { "epoch": 0.6832093976571253, "grad_norm": 2.03305721282959, "learning_rate": 3.861317670571458e-05, "loss": 0.8565, "step": 397000 }, { "epoch": 0.6840698628934693, "grad_norm": 2.194655418395996, "learning_rate": 3.859883561844218e-05, "loss": 0.8539, "step": 397500 }, { "epoch": 0.6849303281298132, "grad_norm": 2.1509432792663574, "learning_rate": 3.858449453116978e-05, "loss": 0.8588, "step": 398000 }, { "epoch": 0.6857907933661572, "grad_norm": 2.0242624282836914, "learning_rate": 3.8570153443897384e-05, "loss": 0.8528, "step": 398500 }, { "epoch": 0.6866512586025012, "grad_norm": 2.357621908187866, "learning_rate": 3.855581235662498e-05, "loss": 0.8556, "step": 399000 }, { "epoch": 0.6875117238388452, "grad_norm": 4.633720874786377, "learning_rate": 3.854147126935258e-05, "loss": 0.8514, "step": 399500 }, { "epoch": 0.6883721890751892, "grad_norm": 2.0474085807800293, "learning_rate": 3.852713018208018e-05, "loss": 0.8552, "step": 400000 }, { "epoch": 0.6892326543115331, "grad_norm": 2.1123034954071045, "learning_rate": 3.8512789094807786e-05, "loss": 0.8535, "step": 400500 }, { "epoch": 0.6900931195478771, "grad_norm": 2.002837657928467, "learning_rate": 3.849844800753538e-05, "loss": 0.8545, "step": 401000 }, { "epoch": 0.6909535847842211, "grad_norm": 1.9797070026397705, "learning_rate": 3.848410692026299e-05, "loss": 0.8508, "step": 401500 }, { "epoch": 0.6918140500205652, "grad_norm": 2.1309895515441895, "learning_rate": 3.8469765832990584e-05, "loss": 0.8537, "step": 402000 }, { "epoch": 0.6926745152569092, "grad_norm": 1.9080448150634766, "learning_rate": 3.845542474571818e-05, "loss": 0.855, "step": 402500 }, { "epoch": 0.6935349804932531, "grad_norm": 2.081582546234131, "learning_rate": 3.844108365844578e-05, "loss": 0.8513, "step": 403000 }, { "epoch": 0.6943954457295971, "grad_norm": 1.9564839601516724, "learning_rate": 3.842674257117338e-05, "loss": 0.851, "step": 403500 }, { "epoch": 0.6952559109659411, "grad_norm": 1.9080193042755127, "learning_rate": 3.8412401483900986e-05, "loss": 0.8528, "step": 404000 }, { "epoch": 0.6961163762022851, "grad_norm": 1.936774730682373, "learning_rate": 3.839806039662858e-05, "loss": 0.8536, "step": 404500 }, { "epoch": 0.696976841438629, "grad_norm": 2.0372822284698486, "learning_rate": 3.838371930935619e-05, "loss": 0.8485, "step": 405000 }, { "epoch": 0.697837306674973, "grad_norm": 1.938328504562378, "learning_rate": 3.8369378222083784e-05, "loss": 0.8537, "step": 405500 }, { "epoch": 0.698697771911317, "grad_norm": 2.093485116958618, "learning_rate": 3.835503713481138e-05, "loss": 0.8591, "step": 406000 }, { "epoch": 0.699558237147661, "grad_norm": 1.9015600681304932, "learning_rate": 3.8340696047538985e-05, "loss": 0.8559, "step": 406500 }, { "epoch": 0.7004187023840049, "grad_norm": 2.13163685798645, "learning_rate": 3.832635496026659e-05, "loss": 0.853, "step": 407000 }, { "epoch": 0.7012791676203489, "grad_norm": 2.031601667404175, "learning_rate": 3.8312013872994186e-05, "loss": 0.8601, "step": 407500 }, { "epoch": 0.702139632856693, "grad_norm": 2.0390450954437256, "learning_rate": 3.829767278572178e-05, "loss": 0.8499, "step": 408000 }, { "epoch": 0.703000098093037, "grad_norm": 2.225257396697998, "learning_rate": 3.828333169844939e-05, "loss": 0.8594, "step": 408500 }, { "epoch": 0.703860563329381, "grad_norm": 1.9891399145126343, "learning_rate": 3.8268990611176984e-05, "loss": 0.861, "step": 409000 }, { "epoch": 0.7047210285657249, "grad_norm": 2.017092227935791, "learning_rate": 3.825464952390458e-05, "loss": 0.8531, "step": 409500 }, { "epoch": 0.7055814938020689, "grad_norm": 2.091958999633789, "learning_rate": 3.824030843663219e-05, "loss": 0.8517, "step": 410000 }, { "epoch": 0.7064419590384129, "grad_norm": 1.9318962097167969, "learning_rate": 3.822596734935979e-05, "loss": 0.8515, "step": 410500 }, { "epoch": 0.7073024242747569, "grad_norm": 2.068328619003296, "learning_rate": 3.8211626262087386e-05, "loss": 0.8513, "step": 411000 }, { "epoch": 0.7081628895111008, "grad_norm": 2.139561414718628, "learning_rate": 3.819728517481499e-05, "loss": 0.859, "step": 411500 }, { "epoch": 0.7090233547474448, "grad_norm": 1.9060418605804443, "learning_rate": 3.818294408754259e-05, "loss": 0.8536, "step": 412000 }, { "epoch": 0.7098838199837888, "grad_norm": 2.385798692703247, "learning_rate": 3.8168603000270184e-05, "loss": 0.8498, "step": 412500 }, { "epoch": 0.7107442852201328, "grad_norm": 2.1265676021575928, "learning_rate": 3.815426191299779e-05, "loss": 0.8501, "step": 413000 }, { "epoch": 0.7116047504564768, "grad_norm": 2.1203105449676514, "learning_rate": 3.813992082572539e-05, "loss": 0.8481, "step": 413500 }, { "epoch": 0.7124652156928208, "grad_norm": 2.1768009662628174, "learning_rate": 3.812557973845299e-05, "loss": 0.8533, "step": 414000 }, { "epoch": 0.7133256809291648, "grad_norm": 2.061535120010376, "learning_rate": 3.8111238651180586e-05, "loss": 0.8532, "step": 414500 }, { "epoch": 0.7141861461655088, "grad_norm": 1.9680122137069702, "learning_rate": 3.809689756390819e-05, "loss": 0.8541, "step": 415000 }, { "epoch": 0.7150466114018528, "grad_norm": 1.9438929557800293, "learning_rate": 3.808255647663579e-05, "loss": 0.8474, "step": 415500 }, { "epoch": 0.7159070766381967, "grad_norm": 2.017174482345581, "learning_rate": 3.806821538936339e-05, "loss": 0.8526, "step": 416000 }, { "epoch": 0.7167675418745407, "grad_norm": 2.2368977069854736, "learning_rate": 3.8053874302090994e-05, "loss": 0.8616, "step": 416500 }, { "epoch": 0.7176280071108847, "grad_norm": 1.999829649925232, "learning_rate": 3.803953321481859e-05, "loss": 0.8525, "step": 417000 }, { "epoch": 0.7184884723472287, "grad_norm": 2.337951183319092, "learning_rate": 3.802519212754619e-05, "loss": 0.8609, "step": 417500 }, { "epoch": 0.7193489375835727, "grad_norm": 2.295762300491333, "learning_rate": 3.801085104027379e-05, "loss": 0.852, "step": 418000 }, { "epoch": 0.7202094028199166, "grad_norm": 2.2162396907806396, "learning_rate": 3.799650995300139e-05, "loss": 0.8486, "step": 418500 }, { "epoch": 0.7210698680562606, "grad_norm": 2.156071424484253, "learning_rate": 3.7982168865728987e-05, "loss": 0.8534, "step": 419000 }, { "epoch": 0.7219303332926047, "grad_norm": 2.052499532699585, "learning_rate": 3.796782777845659e-05, "loss": 0.8518, "step": 419500 }, { "epoch": 0.7227907985289487, "grad_norm": 2.0268807411193848, "learning_rate": 3.7953486691184194e-05, "loss": 0.8492, "step": 420000 }, { "epoch": 0.7236512637652927, "grad_norm": 2.169954776763916, "learning_rate": 3.793914560391179e-05, "loss": 0.8579, "step": 420500 }, { "epoch": 0.7245117290016366, "grad_norm": 2.2587361335754395, "learning_rate": 3.792480451663939e-05, "loss": 0.8553, "step": 421000 }, { "epoch": 0.7253721942379806, "grad_norm": 1.8236783742904663, "learning_rate": 3.791046342936699e-05, "loss": 0.8448, "step": 421500 }, { "epoch": 0.7262326594743246, "grad_norm": 2.1673099994659424, "learning_rate": 3.789612234209459e-05, "loss": 0.8438, "step": 422000 }, { "epoch": 0.7270931247106686, "grad_norm": 2.214162588119507, "learning_rate": 3.788178125482219e-05, "loss": 0.8468, "step": 422500 }, { "epoch": 0.7279535899470125, "grad_norm": 2.025665760040283, "learning_rate": 3.78674401675498e-05, "loss": 0.8503, "step": 423000 }, { "epoch": 0.7288140551833565, "grad_norm": 1.8908655643463135, "learning_rate": 3.7853099080277394e-05, "loss": 0.8546, "step": 423500 }, { "epoch": 0.7296745204197005, "grad_norm": 2.1749110221862793, "learning_rate": 3.783875799300499e-05, "loss": 0.8586, "step": 424000 }, { "epoch": 0.7305349856560445, "grad_norm": 2.0959668159484863, "learning_rate": 3.7824416905732595e-05, "loss": 0.8471, "step": 424500 }, { "epoch": 0.7313954508923884, "grad_norm": 2.0514888763427734, "learning_rate": 3.781007581846019e-05, "loss": 0.8529, "step": 425000 }, { "epoch": 0.7322559161287325, "grad_norm": 2.1552107334136963, "learning_rate": 3.7795734731187796e-05, "loss": 0.8497, "step": 425500 }, { "epoch": 0.7331163813650765, "grad_norm": 2.833772659301758, "learning_rate": 3.778139364391539e-05, "loss": 0.8492, "step": 426000 }, { "epoch": 0.7339768466014205, "grad_norm": 2.154860019683838, "learning_rate": 3.7767052556643e-05, "loss": 0.8512, "step": 426500 }, { "epoch": 0.7348373118377645, "grad_norm": 2.1189260482788086, "learning_rate": 3.7752711469370594e-05, "loss": 0.8455, "step": 427000 }, { "epoch": 0.7356977770741084, "grad_norm": 1.8865413665771484, "learning_rate": 3.773837038209819e-05, "loss": 0.8556, "step": 427500 }, { "epoch": 0.7365582423104524, "grad_norm": 2.020766258239746, "learning_rate": 3.7724029294825795e-05, "loss": 0.8427, "step": 428000 }, { "epoch": 0.7374187075467964, "grad_norm": 2.0161490440368652, "learning_rate": 3.770968820755339e-05, "loss": 0.8517, "step": 428500 }, { "epoch": 0.7382791727831404, "grad_norm": 1.9801963567733765, "learning_rate": 3.7695347120280996e-05, "loss": 0.8501, "step": 429000 }, { "epoch": 0.7391396380194843, "grad_norm": 2.0916380882263184, "learning_rate": 3.76810060330086e-05, "loss": 0.8462, "step": 429500 }, { "epoch": 0.7400001032558283, "grad_norm": 2.019502639770508, "learning_rate": 3.76666649457362e-05, "loss": 0.8532, "step": 430000 }, { "epoch": 0.7408605684921723, "grad_norm": 1.979933500289917, "learning_rate": 3.7652323858463794e-05, "loss": 0.8473, "step": 430500 }, { "epoch": 0.7417210337285164, "grad_norm": 2.0010876655578613, "learning_rate": 3.763798277119139e-05, "loss": 0.8468, "step": 431000 }, { "epoch": 0.7425814989648604, "grad_norm": 2.2110819816589355, "learning_rate": 3.7623641683918995e-05, "loss": 0.8453, "step": 431500 }, { "epoch": 0.7434419642012043, "grad_norm": 2.0687079429626465, "learning_rate": 3.76093005966466e-05, "loss": 0.8522, "step": 432000 }, { "epoch": 0.7443024294375483, "grad_norm": 1.9159475564956665, "learning_rate": 3.7594959509374196e-05, "loss": 0.8488, "step": 432500 }, { "epoch": 0.7451628946738923, "grad_norm": 2.194312572479248, "learning_rate": 3.75806184221018e-05, "loss": 0.8513, "step": 433000 }, { "epoch": 0.7460233599102363, "grad_norm": 1.8975337743759155, "learning_rate": 3.75662773348294e-05, "loss": 0.8454, "step": 433500 }, { "epoch": 0.7468838251465802, "grad_norm": 2.1459665298461914, "learning_rate": 3.7551936247556994e-05, "loss": 0.8496, "step": 434000 }, { "epoch": 0.7477442903829242, "grad_norm": 1.8642619848251343, "learning_rate": 3.75375951602846e-05, "loss": 0.8446, "step": 434500 }, { "epoch": 0.7486047556192682, "grad_norm": 2.064448356628418, "learning_rate": 3.75232540730122e-05, "loss": 0.8488, "step": 435000 }, { "epoch": 0.7494652208556122, "grad_norm": 2.065342903137207, "learning_rate": 3.75089129857398e-05, "loss": 0.8493, "step": 435500 }, { "epoch": 0.7503256860919562, "grad_norm": 2.1863017082214355, "learning_rate": 3.74945718984674e-05, "loss": 0.8436, "step": 436000 }, { "epoch": 0.7511861513283001, "grad_norm": 1.9626061916351318, "learning_rate": 3.7480230811195e-05, "loss": 0.8505, "step": 436500 }, { "epoch": 0.7520466165646442, "grad_norm": 1.9098788499832153, "learning_rate": 3.74658897239226e-05, "loss": 0.8514, "step": 437000 }, { "epoch": 0.7529070818009882, "grad_norm": 1.9859448671340942, "learning_rate": 3.7451548636650194e-05, "loss": 0.8524, "step": 437500 }, { "epoch": 0.7537675470373322, "grad_norm": 1.9206920862197876, "learning_rate": 3.74372075493778e-05, "loss": 0.8515, "step": 438000 }, { "epoch": 0.7546280122736762, "grad_norm": 1.8871228694915771, "learning_rate": 3.74228664621054e-05, "loss": 0.8504, "step": 438500 }, { "epoch": 0.7554884775100201, "grad_norm": 1.9660274982452393, "learning_rate": 3.7408525374833e-05, "loss": 0.8548, "step": 439000 }, { "epoch": 0.7563489427463641, "grad_norm": 2.231797456741333, "learning_rate": 3.73941842875606e-05, "loss": 0.8417, "step": 439500 }, { "epoch": 0.7572094079827081, "grad_norm": 7.269671440124512, "learning_rate": 3.73798432002882e-05, "loss": 0.8504, "step": 440000 }, { "epoch": 0.7580698732190521, "grad_norm": 1.7683589458465576, "learning_rate": 3.73655021130158e-05, "loss": 0.8476, "step": 440500 }, { "epoch": 0.758930338455396, "grad_norm": 2.11978816986084, "learning_rate": 3.73511610257434e-05, "loss": 0.8534, "step": 441000 }, { "epoch": 0.75979080369174, "grad_norm": 1.9887040853500366, "learning_rate": 3.7336819938471004e-05, "loss": 0.8495, "step": 441500 }, { "epoch": 0.760651268928084, "grad_norm": 2.1202428340911865, "learning_rate": 3.73224788511986e-05, "loss": 0.8403, "step": 442000 }, { "epoch": 0.7615117341644281, "grad_norm": 2.0636634826660156, "learning_rate": 3.73081377639262e-05, "loss": 0.8457, "step": 442500 }, { "epoch": 0.762372199400772, "grad_norm": 1.9846032857894897, "learning_rate": 3.72937966766538e-05, "loss": 0.8473, "step": 443000 }, { "epoch": 0.763232664637116, "grad_norm": 2.072988748550415, "learning_rate": 3.72794555893814e-05, "loss": 0.847, "step": 443500 }, { "epoch": 0.76409312987346, "grad_norm": 1.830478310585022, "learning_rate": 3.7265114502108997e-05, "loss": 0.8467, "step": 444000 }, { "epoch": 0.764953595109804, "grad_norm": 2.1850337982177734, "learning_rate": 3.725077341483661e-05, "loss": 0.8449, "step": 444500 }, { "epoch": 0.765814060346148, "grad_norm": 1.9432579278945923, "learning_rate": 3.7236432327564204e-05, "loss": 0.8415, "step": 445000 }, { "epoch": 0.7666745255824919, "grad_norm": 2.1667871475219727, "learning_rate": 3.72220912402918e-05, "loss": 0.8514, "step": 445500 }, { "epoch": 0.7675349908188359, "grad_norm": 2.2337772846221924, "learning_rate": 3.7207750153019405e-05, "loss": 0.8516, "step": 446000 }, { "epoch": 0.7683954560551799, "grad_norm": 2.006190776824951, "learning_rate": 3.7193409065747e-05, "loss": 0.844, "step": 446500 }, { "epoch": 0.7692559212915239, "grad_norm": 1.884323239326477, "learning_rate": 3.71790679784746e-05, "loss": 0.8375, "step": 447000 }, { "epoch": 0.7701163865278678, "grad_norm": 2.1498095989227295, "learning_rate": 3.71647268912022e-05, "loss": 0.8446, "step": 447500 }, { "epoch": 0.7709768517642118, "grad_norm": 2.063354253768921, "learning_rate": 3.715038580392981e-05, "loss": 0.8484, "step": 448000 }, { "epoch": 0.7718373170005559, "grad_norm": 2.038367986679077, "learning_rate": 3.7136044716657404e-05, "loss": 0.8418, "step": 448500 }, { "epoch": 0.7726977822368999, "grad_norm": 1.953763484954834, "learning_rate": 3.7121703629385e-05, "loss": 0.8511, "step": 449000 }, { "epoch": 0.7735582474732439, "grad_norm": 1.9874523878097534, "learning_rate": 3.7107362542112605e-05, "loss": 0.8453, "step": 449500 }, { "epoch": 0.7744187127095878, "grad_norm": 1.9465683698654175, "learning_rate": 3.70930214548402e-05, "loss": 0.8453, "step": 450000 }, { "epoch": 0.7752791779459318, "grad_norm": 1.8829959630966187, "learning_rate": 3.7078680367567806e-05, "loss": 0.8382, "step": 450500 }, { "epoch": 0.7761396431822758, "grad_norm": 2.064000368118286, "learning_rate": 3.706433928029541e-05, "loss": 0.8441, "step": 451000 }, { "epoch": 0.7770001084186198, "grad_norm": 2.108696699142456, "learning_rate": 3.704999819302301e-05, "loss": 0.8486, "step": 451500 }, { "epoch": 0.7778605736549637, "grad_norm": 2.208709239959717, "learning_rate": 3.7035657105750604e-05, "loss": 0.8443, "step": 452000 }, { "epoch": 0.7787210388913077, "grad_norm": 1.9275108575820923, "learning_rate": 3.702131601847821e-05, "loss": 0.8413, "step": 452500 }, { "epoch": 0.7795815041276517, "grad_norm": 2.0423073768615723, "learning_rate": 3.7006974931205805e-05, "loss": 0.8457, "step": 453000 }, { "epoch": 0.7804419693639957, "grad_norm": 1.9696944952011108, "learning_rate": 3.69926338439334e-05, "loss": 0.8462, "step": 453500 }, { "epoch": 0.7813024346003397, "grad_norm": 1.974177598953247, "learning_rate": 3.697829275666101e-05, "loss": 0.845, "step": 454000 }, { "epoch": 0.7821628998366837, "grad_norm": 2.136514663696289, "learning_rate": 3.696395166938861e-05, "loss": 0.8423, "step": 454500 }, { "epoch": 0.7830233650730277, "grad_norm": 2.2075812816619873, "learning_rate": 3.694961058211621e-05, "loss": 0.8422, "step": 455000 }, { "epoch": 0.7838838303093717, "grad_norm": 1.9981763362884521, "learning_rate": 3.6935269494843804e-05, "loss": 0.841, "step": 455500 }, { "epoch": 0.7847442955457157, "grad_norm": 1.9658746719360352, "learning_rate": 3.692092840757141e-05, "loss": 0.843, "step": 456000 }, { "epoch": 0.7856047607820597, "grad_norm": 1.9719587564468384, "learning_rate": 3.6906587320299005e-05, "loss": 0.8469, "step": 456500 }, { "epoch": 0.7864652260184036, "grad_norm": 1.9210423231124878, "learning_rate": 3.689224623302661e-05, "loss": 0.8457, "step": 457000 }, { "epoch": 0.7873256912547476, "grad_norm": 1.938105583190918, "learning_rate": 3.687790514575421e-05, "loss": 0.8455, "step": 457500 }, { "epoch": 0.7881861564910916, "grad_norm": 1.9795875549316406, "learning_rate": 3.686356405848181e-05, "loss": 0.842, "step": 458000 }, { "epoch": 0.7890466217274356, "grad_norm": 2.140584945678711, "learning_rate": 3.684922297120941e-05, "loss": 0.8482, "step": 458500 }, { "epoch": 0.7899070869637795, "grad_norm": 1.879016637802124, "learning_rate": 3.683488188393701e-05, "loss": 0.8426, "step": 459000 }, { "epoch": 0.7907675522001235, "grad_norm": 1.9474905729293823, "learning_rate": 3.682054079666461e-05, "loss": 0.8496, "step": 459500 }, { "epoch": 0.7916280174364676, "grad_norm": 2.1440017223358154, "learning_rate": 3.680619970939221e-05, "loss": 0.8457, "step": 460000 }, { "epoch": 0.7924884826728116, "grad_norm": 2.030857801437378, "learning_rate": 3.679185862211981e-05, "loss": 0.8455, "step": 460500 }, { "epoch": 0.7933489479091556, "grad_norm": 2.1576220989227295, "learning_rate": 3.677751753484741e-05, "loss": 0.8428, "step": 461000 }, { "epoch": 0.7942094131454995, "grad_norm": 2.0410971641540527, "learning_rate": 3.676317644757501e-05, "loss": 0.8458, "step": 461500 }, { "epoch": 0.7950698783818435, "grad_norm": 1.9619431495666504, "learning_rate": 3.674883536030261e-05, "loss": 0.843, "step": 462000 }, { "epoch": 0.7959303436181875, "grad_norm": 1.9847012758255005, "learning_rate": 3.673449427303021e-05, "loss": 0.8408, "step": 462500 }, { "epoch": 0.7967908088545315, "grad_norm": 2.00701642036438, "learning_rate": 3.672015318575781e-05, "loss": 0.8382, "step": 463000 }, { "epoch": 0.7976512740908754, "grad_norm": 1.9219588041305542, "learning_rate": 3.670581209848541e-05, "loss": 0.842, "step": 463500 }, { "epoch": 0.7985117393272194, "grad_norm": 2.0915541648864746, "learning_rate": 3.6691471011213015e-05, "loss": 0.8383, "step": 464000 }, { "epoch": 0.7993722045635634, "grad_norm": 2.0045886039733887, "learning_rate": 3.667712992394061e-05, "loss": 0.8454, "step": 464500 }, { "epoch": 0.8002326697999074, "grad_norm": 2.008676767349243, "learning_rate": 3.666278883666821e-05, "loss": 0.8388, "step": 465000 }, { "epoch": 0.8010931350362513, "grad_norm": 2.072183609008789, "learning_rate": 3.664844774939581e-05, "loss": 0.8444, "step": 465500 }, { "epoch": 0.8019536002725954, "grad_norm": 2.069936513900757, "learning_rate": 3.663410666212341e-05, "loss": 0.8409, "step": 466000 }, { "epoch": 0.8028140655089394, "grad_norm": 2.0726213455200195, "learning_rate": 3.6619765574851014e-05, "loss": 0.8374, "step": 466500 }, { "epoch": 0.8036745307452834, "grad_norm": 2.0134403705596924, "learning_rate": 3.660542448757861e-05, "loss": 0.8467, "step": 467000 }, { "epoch": 0.8045349959816274, "grad_norm": 1.969626545906067, "learning_rate": 3.6591083400306215e-05, "loss": 0.8377, "step": 467500 }, { "epoch": 0.8053954612179713, "grad_norm": 1.9737474918365479, "learning_rate": 3.657674231303381e-05, "loss": 0.8365, "step": 468000 }, { "epoch": 0.8062559264543153, "grad_norm": 2.0155138969421387, "learning_rate": 3.656240122576141e-05, "loss": 0.8461, "step": 468500 }, { "epoch": 0.8071163916906593, "grad_norm": 1.9591182470321655, "learning_rate": 3.654806013848901e-05, "loss": 0.847, "step": 469000 }, { "epoch": 0.8079768569270033, "grad_norm": 2.1773741245269775, "learning_rate": 3.653371905121662e-05, "loss": 0.838, "step": 469500 }, { "epoch": 0.8088373221633472, "grad_norm": 2.013202428817749, "learning_rate": 3.6519377963944214e-05, "loss": 0.844, "step": 470000 }, { "epoch": 0.8096977873996912, "grad_norm": 2.2754344940185547, "learning_rate": 3.650503687667182e-05, "loss": 0.8456, "step": 470500 }, { "epoch": 0.8105582526360352, "grad_norm": 1.956955909729004, "learning_rate": 3.6490695789399415e-05, "loss": 0.8398, "step": 471000 }, { "epoch": 0.8114187178723793, "grad_norm": 2.2035508155822754, "learning_rate": 3.647635470212701e-05, "loss": 0.8413, "step": 471500 }, { "epoch": 0.8122791831087233, "grad_norm": 1.993390679359436, "learning_rate": 3.646201361485461e-05, "loss": 0.8417, "step": 472000 }, { "epoch": 0.8131396483450672, "grad_norm": 2.0903286933898926, "learning_rate": 3.644767252758221e-05, "loss": 0.8392, "step": 472500 }, { "epoch": 0.8140001135814112, "grad_norm": 1.9314452409744263, "learning_rate": 3.643333144030982e-05, "loss": 0.8336, "step": 473000 }, { "epoch": 0.8148605788177552, "grad_norm": 2.128455877304077, "learning_rate": 3.6418990353037414e-05, "loss": 0.8382, "step": 473500 }, { "epoch": 0.8157210440540992, "grad_norm": 2.0214078426361084, "learning_rate": 3.640464926576502e-05, "loss": 0.8408, "step": 474000 }, { "epoch": 0.8165815092904432, "grad_norm": 1.9782541990280151, "learning_rate": 3.6390308178492615e-05, "loss": 0.8415, "step": 474500 }, { "epoch": 0.8174419745267871, "grad_norm": 2.0621581077575684, "learning_rate": 3.637596709122021e-05, "loss": 0.8352, "step": 475000 }, { "epoch": 0.8183024397631311, "grad_norm": 1.955025553703308, "learning_rate": 3.6361626003947816e-05, "loss": 0.844, "step": 475500 }, { "epoch": 0.8191629049994751, "grad_norm": 1.9758415222167969, "learning_rate": 3.634728491667542e-05, "loss": 0.8393, "step": 476000 }, { "epoch": 0.8200233702358191, "grad_norm": 1.9560452699661255, "learning_rate": 3.633294382940302e-05, "loss": 0.8439, "step": 476500 }, { "epoch": 0.820883835472163, "grad_norm": 2.189896583557129, "learning_rate": 3.631860274213062e-05, "loss": 0.8383, "step": 477000 }, { "epoch": 0.8217443007085071, "grad_norm": 2.1071627140045166, "learning_rate": 3.630426165485822e-05, "loss": 0.8346, "step": 477500 }, { "epoch": 0.8226047659448511, "grad_norm": 2.060502767562866, "learning_rate": 3.6289920567585815e-05, "loss": 0.8445, "step": 478000 }, { "epoch": 0.8234652311811951, "grad_norm": 2.4608230590820312, "learning_rate": 3.627557948031341e-05, "loss": 0.8415, "step": 478500 }, { "epoch": 0.8243256964175391, "grad_norm": 2.0735158920288086, "learning_rate": 3.626123839304102e-05, "loss": 0.8342, "step": 479000 }, { "epoch": 0.825186161653883, "grad_norm": 2.0641353130340576, "learning_rate": 3.624689730576862e-05, "loss": 0.8389, "step": 479500 }, { "epoch": 0.826046626890227, "grad_norm": 1.985877513885498, "learning_rate": 3.623255621849622e-05, "loss": 0.8406, "step": 480000 }, { "epoch": 0.826907092126571, "grad_norm": 2.053553819656372, "learning_rate": 3.621821513122382e-05, "loss": 0.8423, "step": 480500 }, { "epoch": 0.827767557362915, "grad_norm": 2.022402048110962, "learning_rate": 3.620387404395142e-05, "loss": 0.841, "step": 481000 }, { "epoch": 0.8286280225992589, "grad_norm": 2.0855982303619385, "learning_rate": 3.6189532956679015e-05, "loss": 0.8371, "step": 481500 }, { "epoch": 0.8294884878356029, "grad_norm": 2.0884454250335693, "learning_rate": 3.617519186940662e-05, "loss": 0.8377, "step": 482000 }, { "epoch": 0.8303489530719469, "grad_norm": 2.163231134414673, "learning_rate": 3.616085078213422e-05, "loss": 0.8407, "step": 482500 }, { "epoch": 0.831209418308291, "grad_norm": 2.1384119987487793, "learning_rate": 3.614650969486182e-05, "loss": 0.8344, "step": 483000 }, { "epoch": 0.832069883544635, "grad_norm": 2.0148468017578125, "learning_rate": 3.613216860758942e-05, "loss": 0.841, "step": 483500 }, { "epoch": 0.8329303487809789, "grad_norm": 1.9842921495437622, "learning_rate": 3.611782752031702e-05, "loss": 0.837, "step": 484000 }, { "epoch": 0.8337908140173229, "grad_norm": 1.9228190183639526, "learning_rate": 3.610348643304462e-05, "loss": 0.8348, "step": 484500 }, { "epoch": 0.8346512792536669, "grad_norm": 2.2085182666778564, "learning_rate": 3.608914534577222e-05, "loss": 0.8416, "step": 485000 }, { "epoch": 0.8355117444900109, "grad_norm": 2.118323802947998, "learning_rate": 3.6074804258499825e-05, "loss": 0.8345, "step": 485500 }, { "epoch": 0.8363722097263548, "grad_norm": 2.1001532077789307, "learning_rate": 3.606046317122742e-05, "loss": 0.8397, "step": 486000 }, { "epoch": 0.8372326749626988, "grad_norm": 2.194197177886963, "learning_rate": 3.604612208395502e-05, "loss": 0.8437, "step": 486500 }, { "epoch": 0.8380931401990428, "grad_norm": 1.9958704710006714, "learning_rate": 3.6031780996682623e-05, "loss": 0.8342, "step": 487000 }, { "epoch": 0.8389536054353868, "grad_norm": 2.2926270961761475, "learning_rate": 3.601743990941022e-05, "loss": 0.8383, "step": 487500 }, { "epoch": 0.8398140706717308, "grad_norm": 2.2010133266448975, "learning_rate": 3.600309882213782e-05, "loss": 0.8364, "step": 488000 }, { "epoch": 0.8406745359080747, "grad_norm": 2.061776876449585, "learning_rate": 3.598875773486542e-05, "loss": 0.8385, "step": 488500 }, { "epoch": 0.8415350011444188, "grad_norm": 2.888331413269043, "learning_rate": 3.5974416647593025e-05, "loss": 0.8407, "step": 489000 }, { "epoch": 0.8423954663807628, "grad_norm": 2.224823236465454, "learning_rate": 3.596007556032062e-05, "loss": 0.8372, "step": 489500 }, { "epoch": 0.8432559316171068, "grad_norm": 2.077299118041992, "learning_rate": 3.594573447304822e-05, "loss": 0.8402, "step": 490000 }, { "epoch": 0.8441163968534507, "grad_norm": 1.9325640201568604, "learning_rate": 3.593139338577582e-05, "loss": 0.8364, "step": 490500 }, { "epoch": 0.8449768620897947, "grad_norm": 2.32381534576416, "learning_rate": 3.591705229850342e-05, "loss": 0.8398, "step": 491000 }, { "epoch": 0.8458373273261387, "grad_norm": 1.8531659841537476, "learning_rate": 3.5902711211231024e-05, "loss": 0.8276, "step": 491500 }, { "epoch": 0.8466977925624827, "grad_norm": 1.9492835998535156, "learning_rate": 3.588837012395863e-05, "loss": 0.8356, "step": 492000 }, { "epoch": 0.8475582577988267, "grad_norm": 1.9821016788482666, "learning_rate": 3.5874029036686225e-05, "loss": 0.8375, "step": 492500 }, { "epoch": 0.8484187230351706, "grad_norm": 4.051340103149414, "learning_rate": 3.585968794941382e-05, "loss": 0.8365, "step": 493000 }, { "epoch": 0.8492791882715146, "grad_norm": 2.0185279846191406, "learning_rate": 3.5845346862141426e-05, "loss": 0.8397, "step": 493500 }, { "epoch": 0.8501396535078586, "grad_norm": 1.8271918296813965, "learning_rate": 3.583100577486902e-05, "loss": 0.8354, "step": 494000 }, { "epoch": 0.8510001187442026, "grad_norm": 2.056795597076416, "learning_rate": 3.581666468759663e-05, "loss": 0.8299, "step": 494500 }, { "epoch": 0.8518605839805466, "grad_norm": 1.9958736896514893, "learning_rate": 3.5802323600324224e-05, "loss": 0.8359, "step": 495000 }, { "epoch": 0.8527210492168906, "grad_norm": 2.0710606575012207, "learning_rate": 3.578798251305183e-05, "loss": 0.8371, "step": 495500 }, { "epoch": 0.8535815144532346, "grad_norm": 1.956955909729004, "learning_rate": 3.5773641425779425e-05, "loss": 0.8379, "step": 496000 }, { "epoch": 0.8544419796895786, "grad_norm": 1.928909420967102, "learning_rate": 3.575930033850702e-05, "loss": 0.837, "step": 496500 }, { "epoch": 0.8553024449259226, "grad_norm": 2.067176580429077, "learning_rate": 3.5744959251234626e-05, "loss": 0.8416, "step": 497000 }, { "epoch": 0.8561629101622665, "grad_norm": 2.0131676197052, "learning_rate": 3.573061816396222e-05, "loss": 0.8402, "step": 497500 }, { "epoch": 0.8570233753986105, "grad_norm": 1.9614657163619995, "learning_rate": 3.571627707668983e-05, "loss": 0.8374, "step": 498000 }, { "epoch": 0.8578838406349545, "grad_norm": 1.9758917093276978, "learning_rate": 3.570193598941743e-05, "loss": 0.831, "step": 498500 }, { "epoch": 0.8587443058712985, "grad_norm": 2.080070734024048, "learning_rate": 3.568759490214503e-05, "loss": 0.8359, "step": 499000 }, { "epoch": 0.8596047711076424, "grad_norm": 2.107738733291626, "learning_rate": 3.5673253814872625e-05, "loss": 0.8351, "step": 499500 }, { "epoch": 0.8604652363439864, "grad_norm": 2.010946750640869, "learning_rate": 3.565891272760023e-05, "loss": 0.8405, "step": 500000 }, { "epoch": 0.8613257015803305, "grad_norm": 1.9980274438858032, "learning_rate": 3.5644571640327826e-05, "loss": 0.8326, "step": 500500 }, { "epoch": 0.8621861668166745, "grad_norm": 1.8315647840499878, "learning_rate": 3.563023055305543e-05, "loss": 0.8287, "step": 501000 }, { "epoch": 0.8630466320530185, "grad_norm": 1.926206350326538, "learning_rate": 3.561588946578303e-05, "loss": 0.8362, "step": 501500 }, { "epoch": 0.8639070972893624, "grad_norm": 2.189225196838379, "learning_rate": 3.560154837851063e-05, "loss": 0.8343, "step": 502000 }, { "epoch": 0.8647675625257064, "grad_norm": 2.096952199935913, "learning_rate": 3.558720729123823e-05, "loss": 0.8339, "step": 502500 }, { "epoch": 0.8656280277620504, "grad_norm": 2.0683114528656006, "learning_rate": 3.5572866203965825e-05, "loss": 0.8394, "step": 503000 }, { "epoch": 0.8664884929983944, "grad_norm": 2.128429651260376, "learning_rate": 3.555852511669343e-05, "loss": 0.8332, "step": 503500 }, { "epoch": 0.8673489582347383, "grad_norm": 2.0516273975372314, "learning_rate": 3.554418402942103e-05, "loss": 0.8383, "step": 504000 }, { "epoch": 0.8682094234710823, "grad_norm": 1.9619426727294922, "learning_rate": 3.552984294214863e-05, "loss": 0.8247, "step": 504500 }, { "epoch": 0.8690698887074263, "grad_norm": 2.0234973430633545, "learning_rate": 3.5515501854876234e-05, "loss": 0.8323, "step": 505000 }, { "epoch": 0.8699303539437703, "grad_norm": 2.0937347412109375, "learning_rate": 3.550116076760383e-05, "loss": 0.8316, "step": 505500 }, { "epoch": 0.8707908191801143, "grad_norm": 2.096928119659424, "learning_rate": 3.548681968033143e-05, "loss": 0.8333, "step": 506000 }, { "epoch": 0.8716512844164583, "grad_norm": 2.187704086303711, "learning_rate": 3.5472478593059025e-05, "loss": 0.8403, "step": 506500 }, { "epoch": 0.8725117496528023, "grad_norm": 1.8772039413452148, "learning_rate": 3.545813750578663e-05, "loss": 0.8332, "step": 507000 }, { "epoch": 0.8733722148891463, "grad_norm": 2.0619008541107178, "learning_rate": 3.544379641851423e-05, "loss": 0.8351, "step": 507500 }, { "epoch": 0.8742326801254903, "grad_norm": 1.9941927194595337, "learning_rate": 3.542945533124183e-05, "loss": 0.8345, "step": 508000 }, { "epoch": 0.8750931453618342, "grad_norm": 1.9549013376235962, "learning_rate": 3.5415114243969434e-05, "loss": 0.8361, "step": 508500 }, { "epoch": 0.8759536105981782, "grad_norm": 5.23269510269165, "learning_rate": 3.540077315669703e-05, "loss": 0.8344, "step": 509000 }, { "epoch": 0.8768140758345222, "grad_norm": 2.060516595840454, "learning_rate": 3.538643206942463e-05, "loss": 0.8322, "step": 509500 }, { "epoch": 0.8776745410708662, "grad_norm": 2.7960243225097656, "learning_rate": 3.537209098215223e-05, "loss": 0.8354, "step": 510000 }, { "epoch": 0.8785350063072102, "grad_norm": 2.1856071949005127, "learning_rate": 3.5357749894879835e-05, "loss": 0.8338, "step": 510500 }, { "epoch": 0.8793954715435541, "grad_norm": 2.1425588130950928, "learning_rate": 3.534340880760743e-05, "loss": 0.8303, "step": 511000 }, { "epoch": 0.8802559367798981, "grad_norm": 2.1811203956604004, "learning_rate": 3.5329067720335036e-05, "loss": 0.8364, "step": 511500 }, { "epoch": 0.8811164020162422, "grad_norm": 2.2466700077056885, "learning_rate": 3.5314726633062633e-05, "loss": 0.8391, "step": 512000 }, { "epoch": 0.8819768672525862, "grad_norm": 2.0680465698242188, "learning_rate": 3.530038554579023e-05, "loss": 0.8359, "step": 512500 }, { "epoch": 0.8828373324889301, "grad_norm": 2.07084321975708, "learning_rate": 3.528604445851783e-05, "loss": 0.8359, "step": 513000 }, { "epoch": 0.8836977977252741, "grad_norm": 2.203376054763794, "learning_rate": 3.527170337124543e-05, "loss": 0.8351, "step": 513500 }, { "epoch": 0.8845582629616181, "grad_norm": 2.1163270473480225, "learning_rate": 3.5257362283973035e-05, "loss": 0.8345, "step": 514000 }, { "epoch": 0.8854187281979621, "grad_norm": 2.2538020610809326, "learning_rate": 3.524302119670063e-05, "loss": 0.8333, "step": 514500 }, { "epoch": 0.8862791934343061, "grad_norm": 2.0660109519958496, "learning_rate": 3.5228680109428236e-05, "loss": 0.8356, "step": 515000 }, { "epoch": 0.88713965867065, "grad_norm": 2.139326333999634, "learning_rate": 3.521433902215583e-05, "loss": 0.8319, "step": 515500 }, { "epoch": 0.888000123906994, "grad_norm": 1.9566582441329956, "learning_rate": 3.519999793488343e-05, "loss": 0.8339, "step": 516000 }, { "epoch": 0.888860589143338, "grad_norm": 2.0667715072631836, "learning_rate": 3.5185656847611034e-05, "loss": 0.8342, "step": 516500 }, { "epoch": 0.889721054379682, "grad_norm": 2.1288838386535645, "learning_rate": 3.517131576033864e-05, "loss": 0.8303, "step": 517000 }, { "epoch": 0.8905815196160259, "grad_norm": 2.0571327209472656, "learning_rate": 3.5156974673066235e-05, "loss": 0.8384, "step": 517500 }, { "epoch": 0.89144198485237, "grad_norm": 2.157618761062622, "learning_rate": 3.514263358579383e-05, "loss": 0.8314, "step": 518000 }, { "epoch": 0.892302450088714, "grad_norm": 2.11012864112854, "learning_rate": 3.5128292498521436e-05, "loss": 0.8343, "step": 518500 }, { "epoch": 0.893162915325058, "grad_norm": 2.0556013584136963, "learning_rate": 3.511395141124903e-05, "loss": 0.8378, "step": 519000 }, { "epoch": 0.894023380561402, "grad_norm": 1.9106919765472412, "learning_rate": 3.509961032397664e-05, "loss": 0.8331, "step": 519500 }, { "epoch": 0.8948838457977459, "grad_norm": 2.0491740703582764, "learning_rate": 3.508526923670424e-05, "loss": 0.8359, "step": 520000 }, { "epoch": 0.8957443110340899, "grad_norm": 1.9946589469909668, "learning_rate": 3.507092814943184e-05, "loss": 0.8289, "step": 520500 }, { "epoch": 0.8966047762704339, "grad_norm": 2.1237125396728516, "learning_rate": 3.5056587062159435e-05, "loss": 0.8388, "step": 521000 }, { "epoch": 0.8974652415067779, "grad_norm": 2.3399975299835205, "learning_rate": 3.504224597488704e-05, "loss": 0.828, "step": 521500 }, { "epoch": 0.8983257067431218, "grad_norm": 1.986943244934082, "learning_rate": 3.5027904887614636e-05, "loss": 0.8326, "step": 522000 }, { "epoch": 0.8991861719794658, "grad_norm": 2.0902836322784424, "learning_rate": 3.501356380034223e-05, "loss": 0.8319, "step": 522500 }, { "epoch": 0.9000466372158098, "grad_norm": 2.1451847553253174, "learning_rate": 3.499922271306984e-05, "loss": 0.8367, "step": 523000 }, { "epoch": 0.9009071024521539, "grad_norm": 1.9373173713684082, "learning_rate": 3.498488162579744e-05, "loss": 0.8332, "step": 523500 }, { "epoch": 0.9017675676884979, "grad_norm": 2.0312416553497314, "learning_rate": 3.497054053852504e-05, "loss": 0.8292, "step": 524000 }, { "epoch": 0.9026280329248418, "grad_norm": 2.035531520843506, "learning_rate": 3.4956199451252635e-05, "loss": 0.8249, "step": 524500 }, { "epoch": 0.9034884981611858, "grad_norm": 2.07161808013916, "learning_rate": 3.494185836398024e-05, "loss": 0.8347, "step": 525000 }, { "epoch": 0.9043489633975298, "grad_norm": 2.058345317840576, "learning_rate": 3.4927517276707836e-05, "loss": 0.8309, "step": 525500 }, { "epoch": 0.9052094286338738, "grad_norm": 2.0514707565307617, "learning_rate": 3.491317618943544e-05, "loss": 0.8352, "step": 526000 }, { "epoch": 0.9060698938702177, "grad_norm": 2.442887544631958, "learning_rate": 3.4898835102163044e-05, "loss": 0.8272, "step": 526500 }, { "epoch": 0.9069303591065617, "grad_norm": 2.133779764175415, "learning_rate": 3.488449401489064e-05, "loss": 0.8231, "step": 527000 }, { "epoch": 0.9077908243429057, "grad_norm": 2.0897367000579834, "learning_rate": 3.487015292761824e-05, "loss": 0.8309, "step": 527500 }, { "epoch": 0.9086512895792497, "grad_norm": 8.865015983581543, "learning_rate": 3.485581184034584e-05, "loss": 0.8311, "step": 528000 }, { "epoch": 0.9095117548155937, "grad_norm": 1.8066165447235107, "learning_rate": 3.484147075307344e-05, "loss": 0.8274, "step": 528500 }, { "epoch": 0.9103722200519376, "grad_norm": 1.9701728820800781, "learning_rate": 3.482712966580104e-05, "loss": 0.832, "step": 529000 }, { "epoch": 0.9112326852882817, "grad_norm": 2.0116164684295654, "learning_rate": 3.481278857852864e-05, "loss": 0.8261, "step": 529500 }, { "epoch": 0.9120931505246257, "grad_norm": 1.9901094436645508, "learning_rate": 3.4798447491256244e-05, "loss": 0.8324, "step": 530000 }, { "epoch": 0.9129536157609697, "grad_norm": 1.9889498949050903, "learning_rate": 3.478410640398384e-05, "loss": 0.8182, "step": 530500 }, { "epoch": 0.9138140809973136, "grad_norm": 2.01224946975708, "learning_rate": 3.476976531671144e-05, "loss": 0.8269, "step": 531000 }, { "epoch": 0.9146745462336576, "grad_norm": 1.9385786056518555, "learning_rate": 3.475542422943904e-05, "loss": 0.8306, "step": 531500 }, { "epoch": 0.9155350114700016, "grad_norm": 2.022874593734741, "learning_rate": 3.474108314216664e-05, "loss": 0.8352, "step": 532000 }, { "epoch": 0.9163954767063456, "grad_norm": 2.1472716331481934, "learning_rate": 3.472674205489424e-05, "loss": 0.8313, "step": 532500 }, { "epoch": 0.9172559419426896, "grad_norm": 2.008678436279297, "learning_rate": 3.4712400967621846e-05, "loss": 0.83, "step": 533000 }, { "epoch": 0.9181164071790335, "grad_norm": 1.9580414295196533, "learning_rate": 3.4698059880349443e-05, "loss": 0.8265, "step": 533500 }, { "epoch": 0.9189768724153775, "grad_norm": 1.7564566135406494, "learning_rate": 3.468371879307704e-05, "loss": 0.8254, "step": 534000 }, { "epoch": 0.9198373376517215, "grad_norm": 2.0323212146759033, "learning_rate": 3.4669377705804644e-05, "loss": 0.8363, "step": 534500 }, { "epoch": 0.9206978028880655, "grad_norm": 2.200009822845459, "learning_rate": 3.465503661853224e-05, "loss": 0.8216, "step": 535000 }, { "epoch": 0.9215582681244096, "grad_norm": 2.065061092376709, "learning_rate": 3.4640695531259845e-05, "loss": 0.83, "step": 535500 }, { "epoch": 0.9224187333607535, "grad_norm": 2.178332805633545, "learning_rate": 3.462635444398744e-05, "loss": 0.8317, "step": 536000 }, { "epoch": 0.9232791985970975, "grad_norm": 1.9318639039993286, "learning_rate": 3.4612013356715046e-05, "loss": 0.8322, "step": 536500 }, { "epoch": 0.9241396638334415, "grad_norm": 2.050119400024414, "learning_rate": 3.459767226944264e-05, "loss": 0.8304, "step": 537000 }, { "epoch": 0.9250001290697855, "grad_norm": 2.291924238204956, "learning_rate": 3.458333118217024e-05, "loss": 0.8258, "step": 537500 }, { "epoch": 0.9258605943061294, "grad_norm": 1.9168674945831299, "learning_rate": 3.4568990094897844e-05, "loss": 0.8228, "step": 538000 }, { "epoch": 0.9267210595424734, "grad_norm": 2.229269027709961, "learning_rate": 3.455464900762544e-05, "loss": 0.8265, "step": 538500 }, { "epoch": 0.9275815247788174, "grad_norm": 1.979021668434143, "learning_rate": 3.4540307920353045e-05, "loss": 0.8275, "step": 539000 }, { "epoch": 0.9284419900151614, "grad_norm": 2.030496835708618, "learning_rate": 3.452596683308065e-05, "loss": 0.8302, "step": 539500 }, { "epoch": 0.9293024552515053, "grad_norm": 2.054203748703003, "learning_rate": 3.4511625745808246e-05, "loss": 0.8313, "step": 540000 }, { "epoch": 0.9301629204878493, "grad_norm": 2.2082176208496094, "learning_rate": 3.449728465853584e-05, "loss": 0.8259, "step": 540500 }, { "epoch": 0.9310233857241934, "grad_norm": 1.9945560693740845, "learning_rate": 3.448294357126344e-05, "loss": 0.8277, "step": 541000 }, { "epoch": 0.9318838509605374, "grad_norm": 2.0513644218444824, "learning_rate": 3.4468602483991044e-05, "loss": 0.8343, "step": 541500 }, { "epoch": 0.9327443161968814, "grad_norm": 1.8823306560516357, "learning_rate": 3.445426139671865e-05, "loss": 0.8274, "step": 542000 }, { "epoch": 0.9336047814332253, "grad_norm": 1.9960150718688965, "learning_rate": 3.4439920309446245e-05, "loss": 0.8263, "step": 542500 }, { "epoch": 0.9344652466695693, "grad_norm": 2.011401653289795, "learning_rate": 3.442557922217385e-05, "loss": 0.8279, "step": 543000 }, { "epoch": 0.9353257119059133, "grad_norm": 1.9199022054672241, "learning_rate": 3.4411238134901446e-05, "loss": 0.8305, "step": 543500 }, { "epoch": 0.9361861771422573, "grad_norm": 2.1455698013305664, "learning_rate": 3.439689704762904e-05, "loss": 0.8288, "step": 544000 }, { "epoch": 0.9370466423786012, "grad_norm": 2.0815937519073486, "learning_rate": 3.438255596035665e-05, "loss": 0.8276, "step": 544500 }, { "epoch": 0.9379071076149452, "grad_norm": 2.0950100421905518, "learning_rate": 3.436821487308425e-05, "loss": 0.8346, "step": 545000 }, { "epoch": 0.9387675728512892, "grad_norm": 2.0615501403808594, "learning_rate": 3.435387378581185e-05, "loss": 0.8185, "step": 545500 }, { "epoch": 0.9396280380876332, "grad_norm": 1.8258684873580933, "learning_rate": 3.433953269853945e-05, "loss": 0.83, "step": 546000 }, { "epoch": 0.9404885033239772, "grad_norm": 2.053507089614868, "learning_rate": 3.432519161126705e-05, "loss": 0.8301, "step": 546500 }, { "epoch": 0.9413489685603212, "grad_norm": 2.033759593963623, "learning_rate": 3.4310850523994646e-05, "loss": 0.8303, "step": 547000 }, { "epoch": 0.9422094337966652, "grad_norm": 1.9754514694213867, "learning_rate": 3.429650943672224e-05, "loss": 0.8307, "step": 547500 }, { "epoch": 0.9430698990330092, "grad_norm": 2.1358065605163574, "learning_rate": 3.428216834944985e-05, "loss": 0.8237, "step": 548000 }, { "epoch": 0.9439303642693532, "grad_norm": 2.076956033706665, "learning_rate": 3.426782726217745e-05, "loss": 0.8289, "step": 548500 }, { "epoch": 0.9447908295056971, "grad_norm": 2.1519205570220947, "learning_rate": 3.425348617490505e-05, "loss": 0.8264, "step": 549000 }, { "epoch": 0.9456512947420411, "grad_norm": 2.108743906021118, "learning_rate": 3.423914508763265e-05, "loss": 0.8238, "step": 549500 }, { "epoch": 0.9465117599783851, "grad_norm": 2.0745456218719482, "learning_rate": 3.422480400036025e-05, "loss": 0.827, "step": 550000 }, { "epoch": 0.9473722252147291, "grad_norm": 2.0193209648132324, "learning_rate": 3.4210462913087846e-05, "loss": 0.8232, "step": 550500 }, { "epoch": 0.9482326904510731, "grad_norm": 2.215498208999634, "learning_rate": 3.419612182581545e-05, "loss": 0.8314, "step": 551000 }, { "epoch": 0.949093155687417, "grad_norm": 2.105421304702759, "learning_rate": 3.4181780738543054e-05, "loss": 0.8246, "step": 551500 }, { "epoch": 0.949953620923761, "grad_norm": 2.102644920349121, "learning_rate": 3.416743965127065e-05, "loss": 0.8283, "step": 552000 }, { "epoch": 0.9508140861601051, "grad_norm": 1.9746520519256592, "learning_rate": 3.415309856399825e-05, "loss": 0.8286, "step": 552500 }, { "epoch": 0.9516745513964491, "grad_norm": 2.012468099594116, "learning_rate": 3.413875747672585e-05, "loss": 0.8259, "step": 553000 }, { "epoch": 0.952535016632793, "grad_norm": 1.770936131477356, "learning_rate": 3.412441638945345e-05, "loss": 0.8236, "step": 553500 }, { "epoch": 0.953395481869137, "grad_norm": 2.082106351852417, "learning_rate": 3.411007530218105e-05, "loss": 0.8252, "step": 554000 }, { "epoch": 0.954255947105481, "grad_norm": 2.122305393218994, "learning_rate": 3.4095734214908656e-05, "loss": 0.8295, "step": 554500 }, { "epoch": 0.955116412341825, "grad_norm": 2.12324595451355, "learning_rate": 3.4081393127636254e-05, "loss": 0.8282, "step": 555000 }, { "epoch": 0.955976877578169, "grad_norm": 1.997396469116211, "learning_rate": 3.406705204036385e-05, "loss": 0.8297, "step": 555500 }, { "epoch": 0.9568373428145129, "grad_norm": 2.3350627422332764, "learning_rate": 3.4052710953091454e-05, "loss": 0.8308, "step": 556000 }, { "epoch": 0.9576978080508569, "grad_norm": 2.1347360610961914, "learning_rate": 3.403836986581905e-05, "loss": 0.8287, "step": 556500 }, { "epoch": 0.9585582732872009, "grad_norm": 2.196913719177246, "learning_rate": 3.402402877854665e-05, "loss": 0.8236, "step": 557000 }, { "epoch": 0.9594187385235449, "grad_norm": 2.0057709217071533, "learning_rate": 3.400968769127425e-05, "loss": 0.8312, "step": 557500 }, { "epoch": 0.9602792037598888, "grad_norm": 2.223766803741455, "learning_rate": 3.3995346604001856e-05, "loss": 0.834, "step": 558000 }, { "epoch": 0.9611396689962329, "grad_norm": 1.8879119157791138, "learning_rate": 3.3981005516729453e-05, "loss": 0.8227, "step": 558500 }, { "epoch": 0.9620001342325769, "grad_norm": 2.166381359100342, "learning_rate": 3.396666442945705e-05, "loss": 0.826, "step": 559000 }, { "epoch": 0.9628605994689209, "grad_norm": 2.1011924743652344, "learning_rate": 3.3952323342184654e-05, "loss": 0.8283, "step": 559500 }, { "epoch": 0.9637210647052649, "grad_norm": 2.1456775665283203, "learning_rate": 3.393798225491225e-05, "loss": 0.8162, "step": 560000 }, { "epoch": 0.9645815299416088, "grad_norm": 1.9335885047912598, "learning_rate": 3.3923641167639855e-05, "loss": 0.8227, "step": 560500 }, { "epoch": 0.9654419951779528, "grad_norm": 3.3997044563293457, "learning_rate": 3.390930008036746e-05, "loss": 0.8252, "step": 561000 }, { "epoch": 0.9663024604142968, "grad_norm": 2.0262041091918945, "learning_rate": 3.3894958993095056e-05, "loss": 0.8304, "step": 561500 }, { "epoch": 0.9671629256506408, "grad_norm": 2.062438726425171, "learning_rate": 3.388061790582265e-05, "loss": 0.8305, "step": 562000 }, { "epoch": 0.9680233908869847, "grad_norm": 1.937840461730957, "learning_rate": 3.386627681855026e-05, "loss": 0.829, "step": 562500 }, { "epoch": 0.9688838561233287, "grad_norm": 2.0924041271209717, "learning_rate": 3.3851935731277854e-05, "loss": 0.829, "step": 563000 }, { "epoch": 0.9697443213596727, "grad_norm": 2.0761170387268066, "learning_rate": 3.383759464400546e-05, "loss": 0.8273, "step": 563500 }, { "epoch": 0.9706047865960168, "grad_norm": 2.1548244953155518, "learning_rate": 3.3823253556733055e-05, "loss": 0.8281, "step": 564000 }, { "epoch": 0.9714652518323608, "grad_norm": 3.1409645080566406, "learning_rate": 3.380891246946066e-05, "loss": 0.8261, "step": 564500 }, { "epoch": 0.9723257170687047, "grad_norm": 2.254758358001709, "learning_rate": 3.3794571382188256e-05, "loss": 0.8254, "step": 565000 }, { "epoch": 0.9731861823050487, "grad_norm": 1.963372826576233, "learning_rate": 3.378023029491585e-05, "loss": 0.8219, "step": 565500 }, { "epoch": 0.9740466475413927, "grad_norm": 2.0996382236480713, "learning_rate": 3.376588920764346e-05, "loss": 0.8254, "step": 566000 }, { "epoch": 0.9749071127777367, "grad_norm": 1.9645578861236572, "learning_rate": 3.3751548120371054e-05, "loss": 0.8238, "step": 566500 }, { "epoch": 0.9757675780140806, "grad_norm": 2.255702495574951, "learning_rate": 3.373720703309866e-05, "loss": 0.8205, "step": 567000 }, { "epoch": 0.9766280432504246, "grad_norm": 2.045149087905884, "learning_rate": 3.372286594582626e-05, "loss": 0.8257, "step": 567500 }, { "epoch": 0.9774885084867686, "grad_norm": 1.8754820823669434, "learning_rate": 3.370852485855386e-05, "loss": 0.8262, "step": 568000 }, { "epoch": 0.9783489737231126, "grad_norm": 2.100236654281616, "learning_rate": 3.3694183771281456e-05, "loss": 0.8206, "step": 568500 }, { "epoch": 0.9792094389594566, "grad_norm": 2.0266075134277344, "learning_rate": 3.367984268400906e-05, "loss": 0.8205, "step": 569000 }, { "epoch": 0.9800699041958005, "grad_norm": 2.2990362644195557, "learning_rate": 3.366550159673666e-05, "loss": 0.8178, "step": 569500 }, { "epoch": 0.9809303694321446, "grad_norm": 2.3041062355041504, "learning_rate": 3.365116050946426e-05, "loss": 0.827, "step": 570000 }, { "epoch": 0.9817908346684886, "grad_norm": 1.9612354040145874, "learning_rate": 3.363681942219186e-05, "loss": 0.8252, "step": 570500 }, { "epoch": 0.9826512999048326, "grad_norm": 2.0809218883514404, "learning_rate": 3.362247833491946e-05, "loss": 0.8303, "step": 571000 }, { "epoch": 0.9835117651411766, "grad_norm": 2.0215840339660645, "learning_rate": 3.360813724764706e-05, "loss": 0.8272, "step": 571500 }, { "epoch": 0.9843722303775205, "grad_norm": 2.0683231353759766, "learning_rate": 3.3593796160374656e-05, "loss": 0.8242, "step": 572000 }, { "epoch": 0.9852326956138645, "grad_norm": 2.0237414836883545, "learning_rate": 3.357945507310226e-05, "loss": 0.8251, "step": 572500 }, { "epoch": 0.9860931608502085, "grad_norm": 2.115190267562866, "learning_rate": 3.356511398582986e-05, "loss": 0.8298, "step": 573000 }, { "epoch": 0.9869536260865525, "grad_norm": 1.906149983406067, "learning_rate": 3.355077289855746e-05, "loss": 0.8176, "step": 573500 }, { "epoch": 0.9878140913228964, "grad_norm": 1.9369081258773804, "learning_rate": 3.3536431811285065e-05, "loss": 0.8216, "step": 574000 }, { "epoch": 0.9886745565592404, "grad_norm": 2.188481092453003, "learning_rate": 3.352209072401266e-05, "loss": 0.8215, "step": 574500 }, { "epoch": 0.9895350217955844, "grad_norm": 1.9754977226257324, "learning_rate": 3.350774963674026e-05, "loss": 0.8233, "step": 575000 }, { "epoch": 0.9903954870319284, "grad_norm": 1.8967516422271729, "learning_rate": 3.349340854946786e-05, "loss": 0.8265, "step": 575500 }, { "epoch": 0.9912559522682725, "grad_norm": 1.9887170791625977, "learning_rate": 3.347906746219546e-05, "loss": 0.8212, "step": 576000 }, { "epoch": 0.9921164175046164, "grad_norm": 2.0904929637908936, "learning_rate": 3.3464726374923064e-05, "loss": 0.8225, "step": 576500 }, { "epoch": 0.9929768827409604, "grad_norm": 2.0081520080566406, "learning_rate": 3.345038528765066e-05, "loss": 0.8215, "step": 577000 }, { "epoch": 0.9938373479773044, "grad_norm": 2.2610244750976562, "learning_rate": 3.3436044200378265e-05, "loss": 0.8178, "step": 577500 }, { "epoch": 0.9946978132136484, "grad_norm": 1.992607831954956, "learning_rate": 3.342170311310586e-05, "loss": 0.8191, "step": 578000 }, { "epoch": 0.9955582784499923, "grad_norm": 2.1425204277038574, "learning_rate": 3.340736202583346e-05, "loss": 0.8234, "step": 578500 }, { "epoch": 0.9964187436863363, "grad_norm": 2.132256269454956, "learning_rate": 3.339302093856106e-05, "loss": 0.8236, "step": 579000 }, { "epoch": 0.9972792089226803, "grad_norm": 2.027850389480591, "learning_rate": 3.3378679851288666e-05, "loss": 0.8211, "step": 579500 }, { "epoch": 0.9981396741590243, "grad_norm": 2.0103042125701904, "learning_rate": 3.3364338764016264e-05, "loss": 0.8232, "step": 580000 }, { "epoch": 0.9990001393953682, "grad_norm": 2.1737022399902344, "learning_rate": 3.334999767674387e-05, "loss": 0.8241, "step": 580500 }, { "epoch": 0.9998606046317122, "grad_norm": 2.199488401412964, "learning_rate": 3.3335656589471464e-05, "loss": 0.8159, "step": 581000 }, { "epoch": 1.0007210698680562, "grad_norm": 2.1512362957000732, "learning_rate": 3.332131550219906e-05, "loss": 0.8182, "step": 581500 }, { "epoch": 1.0015815351044002, "grad_norm": 2.1353979110717773, "learning_rate": 3.330697441492666e-05, "loss": 0.8258, "step": 582000 }, { "epoch": 1.0024420003407442, "grad_norm": 1.8595000505447388, "learning_rate": 3.329263332765426e-05, "loss": 0.8191, "step": 582500 }, { "epoch": 1.0033024655770881, "grad_norm": 2.160736322402954, "learning_rate": 3.3278292240381866e-05, "loss": 0.8202, "step": 583000 }, { "epoch": 1.004162930813432, "grad_norm": 2.159769296646118, "learning_rate": 3.3263951153109463e-05, "loss": 0.8232, "step": 583500 }, { "epoch": 1.005023396049776, "grad_norm": 1.9886316061019897, "learning_rate": 3.324961006583707e-05, "loss": 0.814, "step": 584000 }, { "epoch": 1.0058838612861203, "grad_norm": 2.038942337036133, "learning_rate": 3.3235268978564664e-05, "loss": 0.8239, "step": 584500 }, { "epoch": 1.0067443265224643, "grad_norm": 1.850509524345398, "learning_rate": 3.322092789129226e-05, "loss": 0.8229, "step": 585000 }, { "epoch": 1.0076047917588082, "grad_norm": 2.1953375339508057, "learning_rate": 3.3206586804019865e-05, "loss": 0.822, "step": 585500 }, { "epoch": 1.0084652569951522, "grad_norm": 2.1173160076141357, "learning_rate": 3.319224571674747e-05, "loss": 0.8185, "step": 586000 }, { "epoch": 1.0093257222314962, "grad_norm": 1.989945888519287, "learning_rate": 3.3177904629475066e-05, "loss": 0.8225, "step": 586500 }, { "epoch": 1.0101861874678402, "grad_norm": 1.9819010496139526, "learning_rate": 3.316356354220267e-05, "loss": 0.8178, "step": 587000 }, { "epoch": 1.0110466527041841, "grad_norm": 2.016489267349243, "learning_rate": 3.314922245493027e-05, "loss": 0.8168, "step": 587500 }, { "epoch": 1.0119071179405281, "grad_norm": 2.2139735221862793, "learning_rate": 3.3134881367657864e-05, "loss": 0.8225, "step": 588000 }, { "epoch": 1.012767583176872, "grad_norm": 2.0623409748077393, "learning_rate": 3.312054028038547e-05, "loss": 0.817, "step": 588500 }, { "epoch": 1.013628048413216, "grad_norm": 1.8793323040008545, "learning_rate": 3.310619919311307e-05, "loss": 0.8201, "step": 589000 }, { "epoch": 1.01448851364956, "grad_norm": 1.849631428718567, "learning_rate": 3.309185810584067e-05, "loss": 0.8257, "step": 589500 }, { "epoch": 1.015348978885904, "grad_norm": 2.2486824989318848, "learning_rate": 3.3077517018568266e-05, "loss": 0.814, "step": 590000 }, { "epoch": 1.016209444122248, "grad_norm": 1.9355723857879639, "learning_rate": 3.306317593129587e-05, "loss": 0.8148, "step": 590500 }, { "epoch": 1.017069909358592, "grad_norm": 2.243500232696533, "learning_rate": 3.304883484402347e-05, "loss": 0.8242, "step": 591000 }, { "epoch": 1.017930374594936, "grad_norm": 2.0630578994750977, "learning_rate": 3.3034493756751064e-05, "loss": 0.8258, "step": 591500 }, { "epoch": 1.01879083983128, "grad_norm": 2.0572683811187744, "learning_rate": 3.302015266947867e-05, "loss": 0.8181, "step": 592000 }, { "epoch": 1.019651305067624, "grad_norm": 1.9563729763031006, "learning_rate": 3.300581158220627e-05, "loss": 0.8188, "step": 592500 }, { "epoch": 1.020511770303968, "grad_norm": 2.0135724544525146, "learning_rate": 3.299147049493387e-05, "loss": 0.828, "step": 593000 }, { "epoch": 1.0213722355403119, "grad_norm": 2.0690765380859375, "learning_rate": 3.2977129407661466e-05, "loss": 0.8237, "step": 593500 }, { "epoch": 1.0222327007766558, "grad_norm": 1.8143502473831177, "learning_rate": 3.296278832038907e-05, "loss": 0.8142, "step": 594000 }, { "epoch": 1.0230931660129998, "grad_norm": 2.014436721801758, "learning_rate": 3.294844723311667e-05, "loss": 0.8287, "step": 594500 }, { "epoch": 1.0239536312493438, "grad_norm": 2.035250425338745, "learning_rate": 3.293410614584427e-05, "loss": 0.8168, "step": 595000 }, { "epoch": 1.0248140964856878, "grad_norm": 2.0608718395233154, "learning_rate": 3.2919765058571875e-05, "loss": 0.8156, "step": 595500 }, { "epoch": 1.025674561722032, "grad_norm": 2.038362741470337, "learning_rate": 3.290542397129947e-05, "loss": 0.8213, "step": 596000 }, { "epoch": 1.026535026958376, "grad_norm": 1.972121238708496, "learning_rate": 3.289108288402707e-05, "loss": 0.8172, "step": 596500 }, { "epoch": 1.02739549219472, "grad_norm": 2.067946195602417, "learning_rate": 3.287674179675467e-05, "loss": 0.8188, "step": 597000 }, { "epoch": 1.028255957431064, "grad_norm": 2.0822267532348633, "learning_rate": 3.286240070948227e-05, "loss": 0.8189, "step": 597500 }, { "epoch": 1.0291164226674079, "grad_norm": 1.9324904680252075, "learning_rate": 3.284805962220987e-05, "loss": 0.8211, "step": 598000 }, { "epoch": 1.0299768879037519, "grad_norm": 2.024289608001709, "learning_rate": 3.283371853493748e-05, "loss": 0.8163, "step": 598500 }, { "epoch": 1.0308373531400958, "grad_norm": 1.9507628679275513, "learning_rate": 3.2819377447665075e-05, "loss": 0.8135, "step": 599000 }, { "epoch": 1.0316978183764398, "grad_norm": 2.0539042949676514, "learning_rate": 3.280503636039267e-05, "loss": 0.8146, "step": 599500 }, { "epoch": 1.0325582836127838, "grad_norm": 1.9990108013153076, "learning_rate": 3.279069527312027e-05, "loss": 0.8163, "step": 600000 }, { "epoch": 1.0334187488491278, "grad_norm": 2.1447951793670654, "learning_rate": 3.277635418584787e-05, "loss": 0.815, "step": 600500 }, { "epoch": 1.0342792140854717, "grad_norm": 2.0616796016693115, "learning_rate": 3.276201309857547e-05, "loss": 0.8205, "step": 601000 }, { "epoch": 1.0351396793218157, "grad_norm": 1.999974012374878, "learning_rate": 3.2747672011303074e-05, "loss": 0.8204, "step": 601500 }, { "epoch": 1.0360001445581597, "grad_norm": 1.998840570449829, "learning_rate": 3.273333092403068e-05, "loss": 0.8181, "step": 602000 }, { "epoch": 1.0368606097945037, "grad_norm": 2.0231664180755615, "learning_rate": 3.2718989836758275e-05, "loss": 0.8095, "step": 602500 }, { "epoch": 1.0377210750308477, "grad_norm": 2.1706063747406006, "learning_rate": 3.270464874948587e-05, "loss": 0.812, "step": 603000 }, { "epoch": 1.0385815402671916, "grad_norm": 2.0731775760650635, "learning_rate": 3.2690307662213475e-05, "loss": 0.824, "step": 603500 }, { "epoch": 1.0394420055035356, "grad_norm": 2.0796427726745605, "learning_rate": 3.267596657494107e-05, "loss": 0.8137, "step": 604000 }, { "epoch": 1.0403024707398796, "grad_norm": 2.0877110958099365, "learning_rate": 3.2661625487668676e-05, "loss": 0.8186, "step": 604500 }, { "epoch": 1.0411629359762236, "grad_norm": 1.938895344734192, "learning_rate": 3.2647284400396273e-05, "loss": 0.8214, "step": 605000 }, { "epoch": 1.0420234012125675, "grad_norm": 2.034285068511963, "learning_rate": 3.263294331312388e-05, "loss": 0.8204, "step": 605500 }, { "epoch": 1.0428838664489115, "grad_norm": 1.960330605506897, "learning_rate": 3.2618602225851474e-05, "loss": 0.8137, "step": 606000 }, { "epoch": 1.0437443316852555, "grad_norm": 2.037123680114746, "learning_rate": 3.260426113857907e-05, "loss": 0.8172, "step": 606500 }, { "epoch": 1.0446047969215995, "grad_norm": 1.9923410415649414, "learning_rate": 3.2589920051306675e-05, "loss": 0.8164, "step": 607000 }, { "epoch": 1.0454652621579437, "grad_norm": 2.060210943222046, "learning_rate": 3.257557896403427e-05, "loss": 0.8191, "step": 607500 }, { "epoch": 1.0463257273942876, "grad_norm": 1.9298114776611328, "learning_rate": 3.2561237876761876e-05, "loss": 0.8167, "step": 608000 }, { "epoch": 1.0471861926306316, "grad_norm": 2.114866256713867, "learning_rate": 3.254689678948948e-05, "loss": 0.8225, "step": 608500 }, { "epoch": 1.0480466578669756, "grad_norm": 2.275494337081909, "learning_rate": 3.253255570221708e-05, "loss": 0.8135, "step": 609000 }, { "epoch": 1.0489071231033196, "grad_norm": 1.9952894449234009, "learning_rate": 3.2518214614944674e-05, "loss": 0.8133, "step": 609500 }, { "epoch": 1.0497675883396635, "grad_norm": 2.223127841949463, "learning_rate": 3.250387352767228e-05, "loss": 0.8114, "step": 610000 }, { "epoch": 1.0506280535760075, "grad_norm": 2.092866897583008, "learning_rate": 3.2489532440399875e-05, "loss": 0.817, "step": 610500 }, { "epoch": 1.0514885188123515, "grad_norm": 1.9106268882751465, "learning_rate": 3.247519135312748e-05, "loss": 0.8218, "step": 611000 }, { "epoch": 1.0523489840486955, "grad_norm": 2.031660318374634, "learning_rate": 3.2460850265855076e-05, "loss": 0.8157, "step": 611500 }, { "epoch": 1.0532094492850395, "grad_norm": 2.047598123550415, "learning_rate": 3.244650917858268e-05, "loss": 0.8192, "step": 612000 }, { "epoch": 1.0540699145213834, "grad_norm": 2.0678765773773193, "learning_rate": 3.243216809131028e-05, "loss": 0.813, "step": 612500 }, { "epoch": 1.0549303797577274, "grad_norm": 2.12496280670166, "learning_rate": 3.2417827004037874e-05, "loss": 0.8164, "step": 613000 }, { "epoch": 1.0557908449940714, "grad_norm": 2.0553905963897705, "learning_rate": 3.240348591676548e-05, "loss": 0.8211, "step": 613500 }, { "epoch": 1.0566513102304154, "grad_norm": 2.044658660888672, "learning_rate": 3.238914482949308e-05, "loss": 0.8096, "step": 614000 }, { "epoch": 1.0575117754667593, "grad_norm": 2.074327230453491, "learning_rate": 3.237480374222068e-05, "loss": 0.8195, "step": 614500 }, { "epoch": 1.0583722407031033, "grad_norm": 2.0854201316833496, "learning_rate": 3.236046265494828e-05, "loss": 0.8112, "step": 615000 }, { "epoch": 1.0592327059394473, "grad_norm": 2.189030408859253, "learning_rate": 3.234612156767588e-05, "loss": 0.8158, "step": 615500 }, { "epoch": 1.0600931711757913, "grad_norm": 2.0248332023620605, "learning_rate": 3.233178048040348e-05, "loss": 0.8105, "step": 616000 }, { "epoch": 1.0609536364121352, "grad_norm": 1.9920114278793335, "learning_rate": 3.2317439393131074e-05, "loss": 0.8217, "step": 616500 }, { "epoch": 1.0618141016484792, "grad_norm": 2.0297579765319824, "learning_rate": 3.230309830585868e-05, "loss": 0.8163, "step": 617000 }, { "epoch": 1.0626745668848232, "grad_norm": 2.2208950519561768, "learning_rate": 3.228875721858628e-05, "loss": 0.8147, "step": 617500 }, { "epoch": 1.0635350321211672, "grad_norm": 2.023688316345215, "learning_rate": 3.227441613131388e-05, "loss": 0.8121, "step": 618000 }, { "epoch": 1.0643954973575112, "grad_norm": 2.1245839595794678, "learning_rate": 3.226007504404148e-05, "loss": 0.8113, "step": 618500 }, { "epoch": 1.0652559625938554, "grad_norm": 2.0178346633911133, "learning_rate": 3.224573395676908e-05, "loss": 0.8145, "step": 619000 }, { "epoch": 1.0661164278301993, "grad_norm": 2.1993958950042725, "learning_rate": 3.223139286949668e-05, "loss": 0.8236, "step": 619500 }, { "epoch": 1.0669768930665433, "grad_norm": 1.9215905666351318, "learning_rate": 3.221705178222428e-05, "loss": 0.8134, "step": 620000 }, { "epoch": 1.0678373583028873, "grad_norm": 2.226318836212158, "learning_rate": 3.2202710694951885e-05, "loss": 0.8177, "step": 620500 }, { "epoch": 1.0686978235392313, "grad_norm": 1.9630376100540161, "learning_rate": 3.218836960767948e-05, "loss": 0.8103, "step": 621000 }, { "epoch": 1.0695582887755752, "grad_norm": 1.9889850616455078, "learning_rate": 3.2174028520407086e-05, "loss": 0.8198, "step": 621500 }, { "epoch": 1.0704187540119192, "grad_norm": 2.355071544647217, "learning_rate": 3.215968743313468e-05, "loss": 0.8156, "step": 622000 }, { "epoch": 1.0712792192482632, "grad_norm": 2.0080490112304688, "learning_rate": 3.214534634586228e-05, "loss": 0.8094, "step": 622500 }, { "epoch": 1.0721396844846072, "grad_norm": 2.099216938018799, "learning_rate": 3.213100525858988e-05, "loss": 0.8163, "step": 623000 }, { "epoch": 1.0730001497209511, "grad_norm": 2.0304114818573, "learning_rate": 3.211666417131749e-05, "loss": 0.811, "step": 623500 }, { "epoch": 1.0738606149572951, "grad_norm": 2.1474432945251465, "learning_rate": 3.2102323084045085e-05, "loss": 0.8196, "step": 624000 }, { "epoch": 1.074721080193639, "grad_norm": 2.216209650039673, "learning_rate": 3.208798199677268e-05, "loss": 0.8149, "step": 624500 }, { "epoch": 1.075581545429983, "grad_norm": 1.8711814880371094, "learning_rate": 3.2073640909500286e-05, "loss": 0.8258, "step": 625000 }, { "epoch": 1.076442010666327, "grad_norm": 2.1735618114471436, "learning_rate": 3.205929982222788e-05, "loss": 0.813, "step": 625500 }, { "epoch": 1.077302475902671, "grad_norm": 2.166551113128662, "learning_rate": 3.204495873495548e-05, "loss": 0.8189, "step": 626000 }, { "epoch": 1.078162941139015, "grad_norm": 2.1133315563201904, "learning_rate": 3.2030617647683084e-05, "loss": 0.816, "step": 626500 }, { "epoch": 1.079023406375359, "grad_norm": 2.0182137489318848, "learning_rate": 3.201627656041069e-05, "loss": 0.816, "step": 627000 }, { "epoch": 1.079883871611703, "grad_norm": 2.062633991241455, "learning_rate": 3.2001935473138284e-05, "loss": 0.8186, "step": 627500 }, { "epoch": 1.080744336848047, "grad_norm": 2.1116020679473877, "learning_rate": 3.198759438586588e-05, "loss": 0.8134, "step": 628000 }, { "epoch": 1.081604802084391, "grad_norm": 2.0337700843811035, "learning_rate": 3.1973253298593485e-05, "loss": 0.8133, "step": 628500 }, { "epoch": 1.082465267320735, "grad_norm": 1.9344936609268188, "learning_rate": 3.195891221132108e-05, "loss": 0.8107, "step": 629000 }, { "epoch": 1.0833257325570789, "grad_norm": 2.3094825744628906, "learning_rate": 3.1944571124048686e-05, "loss": 0.8086, "step": 629500 }, { "epoch": 1.0841861977934228, "grad_norm": 2.158808946609497, "learning_rate": 3.193023003677629e-05, "loss": 0.811, "step": 630000 }, { "epoch": 1.0850466630297668, "grad_norm": 2.1025898456573486, "learning_rate": 3.191588894950389e-05, "loss": 0.8131, "step": 630500 }, { "epoch": 1.085907128266111, "grad_norm": 2.234091281890869, "learning_rate": 3.1901547862231484e-05, "loss": 0.8186, "step": 631000 }, { "epoch": 1.086767593502455, "grad_norm": 2.186819553375244, "learning_rate": 3.188720677495909e-05, "loss": 0.8114, "step": 631500 }, { "epoch": 1.087628058738799, "grad_norm": 2.036576271057129, "learning_rate": 3.1872865687686685e-05, "loss": 0.8144, "step": 632000 }, { "epoch": 1.088488523975143, "grad_norm": 2.1673030853271484, "learning_rate": 3.185852460041428e-05, "loss": 0.815, "step": 632500 }, { "epoch": 1.089348989211487, "grad_norm": 2.0698742866516113, "learning_rate": 3.184418351314189e-05, "loss": 0.818, "step": 633000 }, { "epoch": 1.090209454447831, "grad_norm": 2.974994659423828, "learning_rate": 3.182984242586949e-05, "loss": 0.8106, "step": 633500 }, { "epoch": 1.0910699196841749, "grad_norm": 2.020418643951416, "learning_rate": 3.181550133859709e-05, "loss": 0.809, "step": 634000 }, { "epoch": 1.0919303849205189, "grad_norm": 1.9625979661941528, "learning_rate": 3.1801160251324684e-05, "loss": 0.8067, "step": 634500 }, { "epoch": 1.0927908501568628, "grad_norm": 2.004453420639038, "learning_rate": 3.178681916405229e-05, "loss": 0.81, "step": 635000 }, { "epoch": 1.0936513153932068, "grad_norm": 2.179042339324951, "learning_rate": 3.1772478076779885e-05, "loss": 0.8178, "step": 635500 }, { "epoch": 1.0945117806295508, "grad_norm": 2.153991937637329, "learning_rate": 3.175813698950749e-05, "loss": 0.8144, "step": 636000 }, { "epoch": 1.0953722458658948, "grad_norm": 2.238384962081909, "learning_rate": 3.174379590223509e-05, "loss": 0.81, "step": 636500 }, { "epoch": 1.0962327111022387, "grad_norm": 1.7696861028671265, "learning_rate": 3.172945481496269e-05, "loss": 0.8139, "step": 637000 }, { "epoch": 1.0970931763385827, "grad_norm": 1.9494590759277344, "learning_rate": 3.171511372769029e-05, "loss": 0.8122, "step": 637500 }, { "epoch": 1.0979536415749267, "grad_norm": 1.9551820755004883, "learning_rate": 3.170077264041789e-05, "loss": 0.8104, "step": 638000 }, { "epoch": 1.0988141068112707, "grad_norm": 2.033318042755127, "learning_rate": 3.168643155314549e-05, "loss": 0.8071, "step": 638500 }, { "epoch": 1.0996745720476147, "grad_norm": 2.080103874206543, "learning_rate": 3.167209046587309e-05, "loss": 0.8099, "step": 639000 }, { "epoch": 1.1005350372839586, "grad_norm": 1.9861481189727783, "learning_rate": 3.165774937860069e-05, "loss": 0.8104, "step": 639500 }, { "epoch": 1.1013955025203026, "grad_norm": 2.102166175842285, "learning_rate": 3.164340829132829e-05, "loss": 0.8092, "step": 640000 }, { "epoch": 1.1022559677566466, "grad_norm": 2.0474326610565186, "learning_rate": 3.162906720405589e-05, "loss": 0.8123, "step": 640500 }, { "epoch": 1.1031164329929906, "grad_norm": 1.8780696392059326, "learning_rate": 3.161472611678349e-05, "loss": 0.8145, "step": 641000 }, { "epoch": 1.1039768982293345, "grad_norm": 2.027871608734131, "learning_rate": 3.160038502951109e-05, "loss": 0.8119, "step": 641500 }, { "epoch": 1.1048373634656787, "grad_norm": 2.076012372970581, "learning_rate": 3.158604394223869e-05, "loss": 0.8084, "step": 642000 }, { "epoch": 1.1056978287020227, "grad_norm": 2.0570199489593506, "learning_rate": 3.157170285496629e-05, "loss": 0.809, "step": 642500 }, { "epoch": 1.1065582939383667, "grad_norm": 2.052778959274292, "learning_rate": 3.1557361767693896e-05, "loss": 0.8123, "step": 643000 }, { "epoch": 1.1074187591747107, "grad_norm": 2.0170114040374756, "learning_rate": 3.154302068042149e-05, "loss": 0.8137, "step": 643500 }, { "epoch": 1.1082792244110546, "grad_norm": 2.030034065246582, "learning_rate": 3.152867959314909e-05, "loss": 0.8127, "step": 644000 }, { "epoch": 1.1091396896473986, "grad_norm": 2.0542829036712646, "learning_rate": 3.1514338505876694e-05, "loss": 0.8167, "step": 644500 }, { "epoch": 1.1100001548837426, "grad_norm": 2.1526877880096436, "learning_rate": 3.149999741860429e-05, "loss": 0.8128, "step": 645000 }, { "epoch": 1.1108606201200866, "grad_norm": 2.040294647216797, "learning_rate": 3.1485656331331895e-05, "loss": 0.815, "step": 645500 }, { "epoch": 1.1117210853564305, "grad_norm": 2.10483717918396, "learning_rate": 3.147131524405949e-05, "loss": 0.8155, "step": 646000 }, { "epoch": 1.1125815505927745, "grad_norm": 2.1790919303894043, "learning_rate": 3.1456974156787096e-05, "loss": 0.8048, "step": 646500 }, { "epoch": 1.1134420158291185, "grad_norm": 2.011568784713745, "learning_rate": 3.144263306951469e-05, "loss": 0.8075, "step": 647000 }, { "epoch": 1.1143024810654625, "grad_norm": 2.0351784229278564, "learning_rate": 3.142829198224229e-05, "loss": 0.8207, "step": 647500 }, { "epoch": 1.1151629463018065, "grad_norm": 1.9416171312332153, "learning_rate": 3.1413950894969894e-05, "loss": 0.8096, "step": 648000 }, { "epoch": 1.1160234115381504, "grad_norm": 1.9289367198944092, "learning_rate": 3.13996098076975e-05, "loss": 0.8144, "step": 648500 }, { "epoch": 1.1168838767744944, "grad_norm": 2.048586845397949, "learning_rate": 3.1385268720425095e-05, "loss": 0.8122, "step": 649000 }, { "epoch": 1.1177443420108384, "grad_norm": 2.136021852493286, "learning_rate": 3.13709276331527e-05, "loss": 0.8064, "step": 649500 }, { "epoch": 1.1186048072471824, "grad_norm": 2.0434165000915527, "learning_rate": 3.1356586545880295e-05, "loss": 0.807, "step": 650000 }, { "epoch": 1.1194652724835263, "grad_norm": 2.0496816635131836, "learning_rate": 3.134224545860789e-05, "loss": 0.8124, "step": 650500 }, { "epoch": 1.1203257377198703, "grad_norm": 2.304429292678833, "learning_rate": 3.132790437133549e-05, "loss": 0.809, "step": 651000 }, { "epoch": 1.1211862029562143, "grad_norm": 2.278761148452759, "learning_rate": 3.1313563284063094e-05, "loss": 0.8135, "step": 651500 }, { "epoch": 1.1220466681925583, "grad_norm": 2.150486707687378, "learning_rate": 3.12992221967907e-05, "loss": 0.813, "step": 652000 }, { "epoch": 1.1229071334289022, "grad_norm": 2.233739137649536, "learning_rate": 3.1284881109518294e-05, "loss": 0.811, "step": 652500 }, { "epoch": 1.1237675986652462, "grad_norm": 2.1455652713775635, "learning_rate": 3.12705400222459e-05, "loss": 0.8113, "step": 653000 }, { "epoch": 1.1246280639015902, "grad_norm": 2.0067386627197266, "learning_rate": 3.1256198934973495e-05, "loss": 0.8094, "step": 653500 }, { "epoch": 1.1254885291379342, "grad_norm": 1.9603629112243652, "learning_rate": 3.124185784770109e-05, "loss": 0.8063, "step": 654000 }, { "epoch": 1.1263489943742784, "grad_norm": 2.033787727355957, "learning_rate": 3.1227516760428696e-05, "loss": 0.812, "step": 654500 }, { "epoch": 1.1272094596106224, "grad_norm": 2.0316808223724365, "learning_rate": 3.12131756731563e-05, "loss": 0.8169, "step": 655000 }, { "epoch": 1.1280699248469663, "grad_norm": 1.9483590126037598, "learning_rate": 3.11988345858839e-05, "loss": 0.8063, "step": 655500 }, { "epoch": 1.1289303900833103, "grad_norm": 1.9957387447357178, "learning_rate": 3.11844934986115e-05, "loss": 0.8052, "step": 656000 }, { "epoch": 1.1297908553196543, "grad_norm": 2.2134573459625244, "learning_rate": 3.11701524113391e-05, "loss": 0.8047, "step": 656500 }, { "epoch": 1.1306513205559983, "grad_norm": 2.1203839778900146, "learning_rate": 3.1155811324066695e-05, "loss": 0.8128, "step": 657000 }, { "epoch": 1.1315117857923422, "grad_norm": 1.9745606184005737, "learning_rate": 3.114147023679429e-05, "loss": 0.8106, "step": 657500 }, { "epoch": 1.1323722510286862, "grad_norm": 1.8720873594284058, "learning_rate": 3.11271291495219e-05, "loss": 0.803, "step": 658000 }, { "epoch": 1.1332327162650302, "grad_norm": 2.2003602981567383, "learning_rate": 3.11127880622495e-05, "loss": 0.8057, "step": 658500 }, { "epoch": 1.1340931815013742, "grad_norm": 1.8851550817489624, "learning_rate": 3.10984469749771e-05, "loss": 0.8054, "step": 659000 }, { "epoch": 1.1349536467377181, "grad_norm": 2.158602714538574, "learning_rate": 3.10841058877047e-05, "loss": 0.8099, "step": 659500 }, { "epoch": 1.1358141119740621, "grad_norm": 2.1378893852233887, "learning_rate": 3.10697648004323e-05, "loss": 0.8043, "step": 660000 }, { "epoch": 1.136674577210406, "grad_norm": 2.137439489364624, "learning_rate": 3.1055423713159895e-05, "loss": 0.8173, "step": 660500 }, { "epoch": 1.13753504244675, "grad_norm": 2.1251001358032227, "learning_rate": 3.10410826258875e-05, "loss": 0.8084, "step": 661000 }, { "epoch": 1.138395507683094, "grad_norm": 1.9800820350646973, "learning_rate": 3.10267415386151e-05, "loss": 0.8085, "step": 661500 }, { "epoch": 1.139255972919438, "grad_norm": 2.1576268672943115, "learning_rate": 3.10124004513427e-05, "loss": 0.8082, "step": 662000 }, { "epoch": 1.140116438155782, "grad_norm": 2.003831624984741, "learning_rate": 3.09980593640703e-05, "loss": 0.816, "step": 662500 }, { "epoch": 1.140976903392126, "grad_norm": 1.969831943511963, "learning_rate": 3.09837182767979e-05, "loss": 0.8124, "step": 663000 }, { "epoch": 1.14183736862847, "grad_norm": 2.072331190109253, "learning_rate": 3.09693771895255e-05, "loss": 0.8093, "step": 663500 }, { "epoch": 1.142697833864814, "grad_norm": 1.948601484298706, "learning_rate": 3.09550361022531e-05, "loss": 0.809, "step": 664000 }, { "epoch": 1.1435582991011581, "grad_norm": 2.0233547687530518, "learning_rate": 3.0940695014980706e-05, "loss": 0.8039, "step": 664500 }, { "epoch": 1.1444187643375021, "grad_norm": 2.019453287124634, "learning_rate": 3.09263539277083e-05, "loss": 0.8007, "step": 665000 }, { "epoch": 1.145279229573846, "grad_norm": 1.9295461177825928, "learning_rate": 3.09120128404359e-05, "loss": 0.8081, "step": 665500 }, { "epoch": 1.14613969481019, "grad_norm": 1.9981800317764282, "learning_rate": 3.0897671753163504e-05, "loss": 0.808, "step": 666000 }, { "epoch": 1.147000160046534, "grad_norm": 2.1135873794555664, "learning_rate": 3.08833306658911e-05, "loss": 0.8186, "step": 666500 }, { "epoch": 1.147860625282878, "grad_norm": 2.176304817199707, "learning_rate": 3.08689895786187e-05, "loss": 0.8106, "step": 667000 }, { "epoch": 1.148721090519222, "grad_norm": 2.132354974746704, "learning_rate": 3.085464849134631e-05, "loss": 0.8062, "step": 667500 }, { "epoch": 1.149581555755566, "grad_norm": 2.330860137939453, "learning_rate": 3.0840307404073906e-05, "loss": 0.8081, "step": 668000 }, { "epoch": 1.15044202099191, "grad_norm": 2.1148693561553955, "learning_rate": 3.08259663168015e-05, "loss": 0.8119, "step": 668500 }, { "epoch": 1.151302486228254, "grad_norm": 2.028310537338257, "learning_rate": 3.08116252295291e-05, "loss": 0.8051, "step": 669000 }, { "epoch": 1.152162951464598, "grad_norm": 2.107956647872925, "learning_rate": 3.0797284142256704e-05, "loss": 0.811, "step": 669500 }, { "epoch": 1.1530234167009419, "grad_norm": 2.11625599861145, "learning_rate": 3.07829430549843e-05, "loss": 0.8095, "step": 670000 }, { "epoch": 1.1538838819372859, "grad_norm": 2.1002602577209473, "learning_rate": 3.0768601967711905e-05, "loss": 0.8072, "step": 670500 }, { "epoch": 1.1547443471736298, "grad_norm": 2.5475449562072754, "learning_rate": 3.075426088043951e-05, "loss": 0.8082, "step": 671000 }, { "epoch": 1.1556048124099738, "grad_norm": 1.9919356107711792, "learning_rate": 3.0739919793167106e-05, "loss": 0.813, "step": 671500 }, { "epoch": 1.1564652776463178, "grad_norm": 1.9276983737945557, "learning_rate": 3.07255787058947e-05, "loss": 0.8069, "step": 672000 }, { "epoch": 1.1573257428826618, "grad_norm": 2.14060115814209, "learning_rate": 3.0711237618622306e-05, "loss": 0.8045, "step": 672500 }, { "epoch": 1.1581862081190057, "grad_norm": 2.1278932094573975, "learning_rate": 3.0696896531349904e-05, "loss": 0.7996, "step": 673000 }, { "epoch": 1.1590466733553497, "grad_norm": 2.2851386070251465, "learning_rate": 3.068255544407751e-05, "loss": 0.8064, "step": 673500 }, { "epoch": 1.1599071385916937, "grad_norm": 1.997027039527893, "learning_rate": 3.0668214356805105e-05, "loss": 0.8101, "step": 674000 }, { "epoch": 1.1607676038280377, "grad_norm": 1.8951047658920288, "learning_rate": 3.065387326953271e-05, "loss": 0.8115, "step": 674500 }, { "epoch": 1.1616280690643817, "grad_norm": 2.016145706176758, "learning_rate": 3.0639532182260305e-05, "loss": 0.8105, "step": 675000 }, { "epoch": 1.1624885343007256, "grad_norm": 2.0284934043884277, "learning_rate": 3.06251910949879e-05, "loss": 0.8099, "step": 675500 }, { "epoch": 1.1633489995370696, "grad_norm": 2.1791024208068848, "learning_rate": 3.0610850007715506e-05, "loss": 0.8073, "step": 676000 }, { "epoch": 1.1642094647734136, "grad_norm": 2.1079273223876953, "learning_rate": 3.0596508920443103e-05, "loss": 0.8077, "step": 676500 }, { "epoch": 1.1650699300097576, "grad_norm": 1.8703237771987915, "learning_rate": 3.058216783317071e-05, "loss": 0.8091, "step": 677000 }, { "epoch": 1.1659303952461018, "grad_norm": 2.1863420009613037, "learning_rate": 3.056782674589831e-05, "loss": 0.804, "step": 677500 }, { "epoch": 1.1667908604824457, "grad_norm": 2.140691041946411, "learning_rate": 3.055348565862591e-05, "loss": 0.806, "step": 678000 }, { "epoch": 1.1676513257187897, "grad_norm": 2.0398032665252686, "learning_rate": 3.0539144571353505e-05, "loss": 0.8098, "step": 678500 }, { "epoch": 1.1685117909551337, "grad_norm": 2.1374642848968506, "learning_rate": 3.052480348408111e-05, "loss": 0.8047, "step": 679000 }, { "epoch": 1.1693722561914777, "grad_norm": 2.0204110145568848, "learning_rate": 3.051046239680871e-05, "loss": 0.8092, "step": 679500 }, { "epoch": 1.1702327214278216, "grad_norm": 2.0928399562835693, "learning_rate": 3.0496121309536307e-05, "loss": 0.802, "step": 680000 }, { "epoch": 1.1710931866641656, "grad_norm": 2.136254072189331, "learning_rate": 3.0481780222263907e-05, "loss": 0.8055, "step": 680500 }, { "epoch": 1.1719536519005096, "grad_norm": 1.9934322834014893, "learning_rate": 3.046743913499151e-05, "loss": 0.8011, "step": 681000 }, { "epoch": 1.1728141171368536, "grad_norm": 2.214151382446289, "learning_rate": 3.0453098047719108e-05, "loss": 0.8094, "step": 681500 }, { "epoch": 1.1736745823731976, "grad_norm": 2.0825462341308594, "learning_rate": 3.043875696044671e-05, "loss": 0.8137, "step": 682000 }, { "epoch": 1.1745350476095415, "grad_norm": 1.945638656616211, "learning_rate": 3.0424415873174313e-05, "loss": 0.8055, "step": 682500 }, { "epoch": 1.1753955128458855, "grad_norm": 1.9353575706481934, "learning_rate": 3.041007478590191e-05, "loss": 0.8028, "step": 683000 }, { "epoch": 1.1762559780822295, "grad_norm": 2.180316686630249, "learning_rate": 3.0395733698629507e-05, "loss": 0.8095, "step": 683500 }, { "epoch": 1.1771164433185735, "grad_norm": 2.010213851928711, "learning_rate": 3.0381392611357114e-05, "loss": 0.806, "step": 684000 }, { "epoch": 1.1779769085549174, "grad_norm": 2.019655227661133, "learning_rate": 3.036705152408471e-05, "loss": 0.8023, "step": 684500 }, { "epoch": 1.1788373737912614, "grad_norm": 2.287731170654297, "learning_rate": 3.0352710436812308e-05, "loss": 0.8078, "step": 685000 }, { "epoch": 1.1796978390276054, "grad_norm": 1.9522863626480103, "learning_rate": 3.033836934953991e-05, "loss": 0.8051, "step": 685500 }, { "epoch": 1.1805583042639494, "grad_norm": 1.9352096319198608, "learning_rate": 3.0324028262267512e-05, "loss": 0.8016, "step": 686000 }, { "epoch": 1.1814187695002933, "grad_norm": 1.9841794967651367, "learning_rate": 3.030968717499511e-05, "loss": 0.8011, "step": 686500 }, { "epoch": 1.1822792347366373, "grad_norm": 1.9001405239105225, "learning_rate": 3.029534608772271e-05, "loss": 0.8056, "step": 687000 }, { "epoch": 1.1831396999729813, "grad_norm": 1.9476912021636963, "learning_rate": 3.0281005000450314e-05, "loss": 0.8004, "step": 687500 }, { "epoch": 1.1840001652093255, "grad_norm": 2.282691717147827, "learning_rate": 3.026666391317791e-05, "loss": 0.806, "step": 688000 }, { "epoch": 1.1848606304456695, "grad_norm": 2.1164588928222656, "learning_rate": 3.025232282590551e-05, "loss": 0.8096, "step": 688500 }, { "epoch": 1.1857210956820134, "grad_norm": 2.249427318572998, "learning_rate": 3.0237981738633115e-05, "loss": 0.8067, "step": 689000 }, { "epoch": 1.1865815609183574, "grad_norm": 2.1805477142333984, "learning_rate": 3.0223640651360712e-05, "loss": 0.8049, "step": 689500 }, { "epoch": 1.1874420261547014, "grad_norm": 2.3295178413391113, "learning_rate": 3.0209299564088313e-05, "loss": 0.8091, "step": 690000 }, { "epoch": 1.1883024913910454, "grad_norm": 2.17014741897583, "learning_rate": 3.0194958476815917e-05, "loss": 0.8094, "step": 690500 }, { "epoch": 1.1891629566273894, "grad_norm": 1.8217567205429077, "learning_rate": 3.0180617389543514e-05, "loss": 0.8088, "step": 691000 }, { "epoch": 1.1900234218637333, "grad_norm": 2.156625747680664, "learning_rate": 3.0166276302271114e-05, "loss": 0.8039, "step": 691500 }, { "epoch": 1.1908838871000773, "grad_norm": 1.924224853515625, "learning_rate": 3.015193521499871e-05, "loss": 0.7991, "step": 692000 }, { "epoch": 1.1917443523364213, "grad_norm": 2.1445975303649902, "learning_rate": 3.0137594127726315e-05, "loss": 0.803, "step": 692500 }, { "epoch": 1.1926048175727653, "grad_norm": 2.235790729522705, "learning_rate": 3.0123253040453912e-05, "loss": 0.8096, "step": 693000 }, { "epoch": 1.1934652828091092, "grad_norm": 1.9758915901184082, "learning_rate": 3.0108911953181513e-05, "loss": 0.8016, "step": 693500 }, { "epoch": 1.1943257480454532, "grad_norm": 2.021066427230835, "learning_rate": 3.0094570865909117e-05, "loss": 0.8035, "step": 694000 }, { "epoch": 1.1951862132817972, "grad_norm": 2.056852340698242, "learning_rate": 3.0080229778636714e-05, "loss": 0.806, "step": 694500 }, { "epoch": 1.1960466785181412, "grad_norm": 2.177889347076416, "learning_rate": 3.0065888691364314e-05, "loss": 0.8104, "step": 695000 }, { "epoch": 1.1969071437544851, "grad_norm": 2.194106340408325, "learning_rate": 3.0051547604091918e-05, "loss": 0.8062, "step": 695500 }, { "epoch": 1.1977676089908291, "grad_norm": 2.0299899578094482, "learning_rate": 3.0037206516819515e-05, "loss": 0.801, "step": 696000 }, { "epoch": 1.198628074227173, "grad_norm": 2.2731423377990723, "learning_rate": 3.0022865429547116e-05, "loss": 0.8099, "step": 696500 }, { "epoch": 1.199488539463517, "grad_norm": 2.029911756515503, "learning_rate": 3.0008524342274713e-05, "loss": 0.807, "step": 697000 }, { "epoch": 1.200349004699861, "grad_norm": 2.0720105171203613, "learning_rate": 2.9994183255002316e-05, "loss": 0.8114, "step": 697500 }, { "epoch": 1.201209469936205, "grad_norm": 1.9225728511810303, "learning_rate": 2.9979842167729917e-05, "loss": 0.8052, "step": 698000 }, { "epoch": 1.202069935172549, "grad_norm": 2.078575849533081, "learning_rate": 2.9965501080457514e-05, "loss": 0.8064, "step": 698500 }, { "epoch": 1.202930400408893, "grad_norm": 3.18145751953125, "learning_rate": 2.9951159993185118e-05, "loss": 0.8038, "step": 699000 }, { "epoch": 1.203790865645237, "grad_norm": 2.0273611545562744, "learning_rate": 2.993681890591272e-05, "loss": 0.807, "step": 699500 }, { "epoch": 1.204651330881581, "grad_norm": 2.17539119720459, "learning_rate": 2.9922477818640315e-05, "loss": 0.8059, "step": 700000 }, { "epoch": 1.205511796117925, "grad_norm": 2.16237211227417, "learning_rate": 2.990813673136792e-05, "loss": 0.8083, "step": 700500 }, { "epoch": 1.2063722613542691, "grad_norm": 2.004716634750366, "learning_rate": 2.9893795644095516e-05, "loss": 0.8029, "step": 701000 }, { "epoch": 1.207232726590613, "grad_norm": 2.149090528488159, "learning_rate": 2.9879454556823117e-05, "loss": 0.8038, "step": 701500 }, { "epoch": 1.208093191826957, "grad_norm": 2.193436622619629, "learning_rate": 2.986511346955072e-05, "loss": 0.8105, "step": 702000 }, { "epoch": 1.208953657063301, "grad_norm": 2.0396177768707275, "learning_rate": 2.9850772382278318e-05, "loss": 0.8075, "step": 702500 }, { "epoch": 1.209814122299645, "grad_norm": 2.060792922973633, "learning_rate": 2.9836431295005918e-05, "loss": 0.8064, "step": 703000 }, { "epoch": 1.210674587535989, "grad_norm": 2.02762508392334, "learning_rate": 2.9822090207733515e-05, "loss": 0.8028, "step": 703500 }, { "epoch": 1.211535052772333, "grad_norm": 2.1213901042938232, "learning_rate": 2.980774912046112e-05, "loss": 0.8069, "step": 704000 }, { "epoch": 1.212395518008677, "grad_norm": 2.2344391345977783, "learning_rate": 2.979340803318872e-05, "loss": 0.7963, "step": 704500 }, { "epoch": 1.213255983245021, "grad_norm": 1.9241515398025513, "learning_rate": 2.9779066945916317e-05, "loss": 0.803, "step": 705000 }, { "epoch": 1.214116448481365, "grad_norm": 2.2126944065093994, "learning_rate": 2.976472585864392e-05, "loss": 0.8061, "step": 705500 }, { "epoch": 1.2149769137177089, "grad_norm": 2.169583320617676, "learning_rate": 2.975038477137152e-05, "loss": 0.8037, "step": 706000 }, { "epoch": 1.2158373789540529, "grad_norm": 2.132267475128174, "learning_rate": 2.9736043684099118e-05, "loss": 0.807, "step": 706500 }, { "epoch": 1.2166978441903968, "grad_norm": 2.0939061641693115, "learning_rate": 2.9721702596826722e-05, "loss": 0.8023, "step": 707000 }, { "epoch": 1.2175583094267408, "grad_norm": 2.0683717727661133, "learning_rate": 2.9707361509554322e-05, "loss": 0.8027, "step": 707500 }, { "epoch": 1.2184187746630848, "grad_norm": 2.210061550140381, "learning_rate": 2.969302042228192e-05, "loss": 0.8008, "step": 708000 }, { "epoch": 1.2192792398994288, "grad_norm": 2.072613000869751, "learning_rate": 2.9678679335009523e-05, "loss": 0.8027, "step": 708500 }, { "epoch": 1.2201397051357727, "grad_norm": 2.157541275024414, "learning_rate": 2.9664338247737124e-05, "loss": 0.8038, "step": 709000 }, { "epoch": 1.2210001703721167, "grad_norm": 2.2376465797424316, "learning_rate": 2.964999716046472e-05, "loss": 0.799, "step": 709500 }, { "epoch": 1.2218606356084607, "grad_norm": 2.337463855743408, "learning_rate": 2.9635656073192318e-05, "loss": 0.8016, "step": 710000 }, { "epoch": 1.2227211008448047, "grad_norm": 2.2762887477874756, "learning_rate": 2.9621314985919922e-05, "loss": 0.8053, "step": 710500 }, { "epoch": 1.2235815660811489, "grad_norm": 2.1711015701293945, "learning_rate": 2.9606973898647522e-05, "loss": 0.8064, "step": 711000 }, { "epoch": 1.2244420313174929, "grad_norm": 2.0335445404052734, "learning_rate": 2.959263281137512e-05, "loss": 0.8029, "step": 711500 }, { "epoch": 1.2253024965538368, "grad_norm": 2.1329965591430664, "learning_rate": 2.9578291724102723e-05, "loss": 0.8068, "step": 712000 }, { "epoch": 1.2261629617901808, "grad_norm": 2.1596531867980957, "learning_rate": 2.9563950636830324e-05, "loss": 0.8075, "step": 712500 }, { "epoch": 1.2270234270265248, "grad_norm": 1.909252643585205, "learning_rate": 2.954960954955792e-05, "loss": 0.8048, "step": 713000 }, { "epoch": 1.2278838922628688, "grad_norm": 1.9895068407058716, "learning_rate": 2.9535268462285525e-05, "loss": 0.8019, "step": 713500 }, { "epoch": 1.2287443574992127, "grad_norm": 2.0683231353759766, "learning_rate": 2.9520927375013125e-05, "loss": 0.8019, "step": 714000 }, { "epoch": 1.2296048227355567, "grad_norm": 2.2134108543395996, "learning_rate": 2.9506586287740722e-05, "loss": 0.8018, "step": 714500 }, { "epoch": 1.2304652879719007, "grad_norm": 1.9018226861953735, "learning_rate": 2.9492245200468323e-05, "loss": 0.8085, "step": 715000 }, { "epoch": 1.2313257532082447, "grad_norm": 2.1485302448272705, "learning_rate": 2.9477904113195927e-05, "loss": 0.7978, "step": 715500 }, { "epoch": 1.2321862184445886, "grad_norm": 2.0110764503479004, "learning_rate": 2.9463563025923524e-05, "loss": 0.7995, "step": 716000 }, { "epoch": 1.2330466836809326, "grad_norm": 1.984613299369812, "learning_rate": 2.9449221938651124e-05, "loss": 0.8049, "step": 716500 }, { "epoch": 1.2339071489172766, "grad_norm": 2.059112787246704, "learning_rate": 2.9434880851378728e-05, "loss": 0.807, "step": 717000 }, { "epoch": 1.2347676141536206, "grad_norm": 2.2094051837921143, "learning_rate": 2.9420539764106325e-05, "loss": 0.8006, "step": 717500 }, { "epoch": 1.2356280793899646, "grad_norm": 2.09016752243042, "learning_rate": 2.9406198676833922e-05, "loss": 0.8031, "step": 718000 }, { "epoch": 1.2364885446263085, "grad_norm": 1.952208399772644, "learning_rate": 2.939185758956153e-05, "loss": 0.8045, "step": 718500 }, { "epoch": 1.2373490098626525, "grad_norm": 2.049285411834717, "learning_rate": 2.9377516502289127e-05, "loss": 0.8032, "step": 719000 }, { "epoch": 1.2382094750989965, "grad_norm": 2.2089593410491943, "learning_rate": 2.9363175415016724e-05, "loss": 0.8082, "step": 719500 }, { "epoch": 1.2390699403353405, "grad_norm": 2.038893699645996, "learning_rate": 2.9348834327744327e-05, "loss": 0.8035, "step": 720000 }, { "epoch": 1.2399304055716844, "grad_norm": 1.9960216283798218, "learning_rate": 2.9334493240471928e-05, "loss": 0.8006, "step": 720500 }, { "epoch": 1.2407908708080284, "grad_norm": 1.9117454290390015, "learning_rate": 2.9320152153199525e-05, "loss": 0.8069, "step": 721000 }, { "epoch": 1.2416513360443724, "grad_norm": 1.9376147985458374, "learning_rate": 2.9305811065927125e-05, "loss": 0.7984, "step": 721500 }, { "epoch": 1.2425118012807164, "grad_norm": 2.009612798690796, "learning_rate": 2.929146997865473e-05, "loss": 0.7956, "step": 722000 }, { "epoch": 1.2433722665170603, "grad_norm": 2.106132745742798, "learning_rate": 2.9277128891382326e-05, "loss": 0.8042, "step": 722500 }, { "epoch": 1.2442327317534043, "grad_norm": 2.345006227493286, "learning_rate": 2.9262787804109927e-05, "loss": 0.8082, "step": 723000 }, { "epoch": 1.2450931969897483, "grad_norm": 2.0162105560302734, "learning_rate": 2.924844671683753e-05, "loss": 0.7961, "step": 723500 }, { "epoch": 1.2459536622260925, "grad_norm": 2.337620735168457, "learning_rate": 2.9234105629565128e-05, "loss": 0.8081, "step": 724000 }, { "epoch": 1.2468141274624365, "grad_norm": 1.895340085029602, "learning_rate": 2.921976454229273e-05, "loss": 0.7943, "step": 724500 }, { "epoch": 1.2476745926987804, "grad_norm": 2.0250372886657715, "learning_rate": 2.9205423455020332e-05, "loss": 0.8062, "step": 725000 }, { "epoch": 1.2485350579351244, "grad_norm": 2.109849691390991, "learning_rate": 2.919108236774793e-05, "loss": 0.8067, "step": 725500 }, { "epoch": 1.2493955231714684, "grad_norm": 1.9787213802337646, "learning_rate": 2.9176741280475526e-05, "loss": 0.7954, "step": 726000 }, { "epoch": 1.2502559884078124, "grad_norm": 1.98014235496521, "learning_rate": 2.9162400193203127e-05, "loss": 0.8023, "step": 726500 }, { "epoch": 1.2511164536441564, "grad_norm": 2.0103554725646973, "learning_rate": 2.914805910593073e-05, "loss": 0.799, "step": 727000 }, { "epoch": 1.2519769188805003, "grad_norm": 2.017174005508423, "learning_rate": 2.9133718018658328e-05, "loss": 0.7974, "step": 727500 }, { "epoch": 1.2528373841168443, "grad_norm": 2.1232900619506836, "learning_rate": 2.9119376931385928e-05, "loss": 0.8008, "step": 728000 }, { "epoch": 1.2536978493531883, "grad_norm": 1.8166762590408325, "learning_rate": 2.9105035844113532e-05, "loss": 0.7992, "step": 728500 }, { "epoch": 1.2545583145895323, "grad_norm": 2.131291627883911, "learning_rate": 2.909069475684113e-05, "loss": 0.7965, "step": 729000 }, { "epoch": 1.2554187798258762, "grad_norm": 1.8457263708114624, "learning_rate": 2.907635366956873e-05, "loss": 0.7992, "step": 729500 }, { "epoch": 1.2562792450622202, "grad_norm": 2.3724331855773926, "learning_rate": 2.9062012582296333e-05, "loss": 0.7944, "step": 730000 }, { "epoch": 1.2571397102985642, "grad_norm": 1.9103881120681763, "learning_rate": 2.904767149502393e-05, "loss": 0.8008, "step": 730500 }, { "epoch": 1.2580001755349082, "grad_norm": 2.1090755462646484, "learning_rate": 2.903333040775153e-05, "loss": 0.7935, "step": 731000 }, { "epoch": 1.2588606407712521, "grad_norm": 2.2821147441864014, "learning_rate": 2.9018989320479135e-05, "loss": 0.8027, "step": 731500 }, { "epoch": 1.2597211060075961, "grad_norm": 2.159932851791382, "learning_rate": 2.9004648233206732e-05, "loss": 0.8053, "step": 732000 }, { "epoch": 1.26058157124394, "grad_norm": 2.012387752532959, "learning_rate": 2.8990307145934332e-05, "loss": 0.8017, "step": 732500 }, { "epoch": 1.261442036480284, "grad_norm": 2.0438127517700195, "learning_rate": 2.897596605866193e-05, "loss": 0.7992, "step": 733000 }, { "epoch": 1.2623025017166283, "grad_norm": 2.1479904651641846, "learning_rate": 2.8961624971389533e-05, "loss": 0.8013, "step": 733500 }, { "epoch": 1.2631629669529723, "grad_norm": 2.248114585876465, "learning_rate": 2.8947283884117134e-05, "loss": 0.7976, "step": 734000 }, { "epoch": 1.2640234321893162, "grad_norm": 1.8806530237197876, "learning_rate": 2.893294279684473e-05, "loss": 0.8029, "step": 734500 }, { "epoch": 1.2648838974256602, "grad_norm": 1.8940871953964233, "learning_rate": 2.8918601709572335e-05, "loss": 0.797, "step": 735000 }, { "epoch": 1.2657443626620042, "grad_norm": 2.128809928894043, "learning_rate": 2.8904260622299932e-05, "loss": 0.7958, "step": 735500 }, { "epoch": 1.2666048278983482, "grad_norm": 2.1618053913116455, "learning_rate": 2.8889919535027532e-05, "loss": 0.7974, "step": 736000 }, { "epoch": 1.2674652931346921, "grad_norm": 2.073007345199585, "learning_rate": 2.8875578447755136e-05, "loss": 0.8066, "step": 736500 }, { "epoch": 1.2683257583710361, "grad_norm": 1.8977770805358887, "learning_rate": 2.8861237360482733e-05, "loss": 0.7984, "step": 737000 }, { "epoch": 1.26918622360738, "grad_norm": 2.1680843830108643, "learning_rate": 2.8846896273210334e-05, "loss": 0.799, "step": 737500 }, { "epoch": 1.270046688843724, "grad_norm": 1.998214602470398, "learning_rate": 2.883255518593793e-05, "loss": 0.7951, "step": 738000 }, { "epoch": 1.270907154080068, "grad_norm": 2.0669476985931396, "learning_rate": 2.8818214098665535e-05, "loss": 0.7962, "step": 738500 }, { "epoch": 1.271767619316412, "grad_norm": 2.1200294494628906, "learning_rate": 2.8803873011393135e-05, "loss": 0.795, "step": 739000 }, { "epoch": 1.272628084552756, "grad_norm": 2.27681040763855, "learning_rate": 2.8789531924120732e-05, "loss": 0.7988, "step": 739500 }, { "epoch": 1.2734885497891, "grad_norm": 2.213317394256592, "learning_rate": 2.8775190836848336e-05, "loss": 0.7966, "step": 740000 }, { "epoch": 1.274349015025444, "grad_norm": 2.038018226623535, "learning_rate": 2.8760849749575937e-05, "loss": 0.7996, "step": 740500 }, { "epoch": 1.275209480261788, "grad_norm": 2.1453094482421875, "learning_rate": 2.8746508662303534e-05, "loss": 0.7944, "step": 741000 }, { "epoch": 1.276069945498132, "grad_norm": 2.005464553833008, "learning_rate": 2.8732167575031138e-05, "loss": 0.7977, "step": 741500 }, { "epoch": 1.2769304107344759, "grad_norm": 2.018033027648926, "learning_rate": 2.8717826487758738e-05, "loss": 0.8066, "step": 742000 }, { "epoch": 1.2777908759708199, "grad_norm": 2.1077606678009033, "learning_rate": 2.8703485400486335e-05, "loss": 0.8033, "step": 742500 }, { "epoch": 1.2786513412071638, "grad_norm": 2.3417065143585205, "learning_rate": 2.868914431321394e-05, "loss": 0.8019, "step": 743000 }, { "epoch": 1.2795118064435078, "grad_norm": 2.445237636566162, "learning_rate": 2.867480322594154e-05, "loss": 0.7984, "step": 743500 }, { "epoch": 1.2803722716798518, "grad_norm": 2.2032246589660645, "learning_rate": 2.8660462138669136e-05, "loss": 0.7971, "step": 744000 }, { "epoch": 1.2812327369161958, "grad_norm": 2.074751615524292, "learning_rate": 2.8646121051396734e-05, "loss": 0.799, "step": 744500 }, { "epoch": 1.2820932021525397, "grad_norm": 2.1933205127716064, "learning_rate": 2.8631779964124337e-05, "loss": 0.7944, "step": 745000 }, { "epoch": 1.2829536673888837, "grad_norm": 2.374495029449463, "learning_rate": 2.8617438876851938e-05, "loss": 0.7947, "step": 745500 }, { "epoch": 1.2838141326252277, "grad_norm": 2.0120351314544678, "learning_rate": 2.8603097789579535e-05, "loss": 0.7951, "step": 746000 }, { "epoch": 1.2846745978615717, "grad_norm": 2.224721908569336, "learning_rate": 2.858875670230714e-05, "loss": 0.7985, "step": 746500 }, { "epoch": 1.2855350630979157, "grad_norm": 1.875423789024353, "learning_rate": 2.857441561503474e-05, "loss": 0.7913, "step": 747000 }, { "epoch": 1.2863955283342596, "grad_norm": 2.116672992706299, "learning_rate": 2.8560074527762336e-05, "loss": 0.8006, "step": 747500 }, { "epoch": 1.2872559935706038, "grad_norm": 2.1012139320373535, "learning_rate": 2.854573344048994e-05, "loss": 0.8027, "step": 748000 }, { "epoch": 1.2881164588069478, "grad_norm": 2.454265594482422, "learning_rate": 2.853139235321754e-05, "loss": 0.7992, "step": 748500 }, { "epoch": 1.2889769240432918, "grad_norm": 2.262613534927368, "learning_rate": 2.8517051265945138e-05, "loss": 0.7956, "step": 749000 }, { "epoch": 1.2898373892796358, "grad_norm": 2.1560659408569336, "learning_rate": 2.8502710178672738e-05, "loss": 0.7982, "step": 749500 }, { "epoch": 1.2906978545159797, "grad_norm": 2.0437734127044678, "learning_rate": 2.8488369091400342e-05, "loss": 0.791, "step": 750000 }, { "epoch": 1.2915583197523237, "grad_norm": 2.1690988540649414, "learning_rate": 2.847402800412794e-05, "loss": 0.796, "step": 750500 }, { "epoch": 1.2924187849886677, "grad_norm": 2.0291025638580322, "learning_rate": 2.845968691685554e-05, "loss": 0.7942, "step": 751000 }, { "epoch": 1.2932792502250117, "grad_norm": 2.1441104412078857, "learning_rate": 2.8445345829583144e-05, "loss": 0.795, "step": 751500 }, { "epoch": 1.2941397154613556, "grad_norm": 2.0123281478881836, "learning_rate": 2.843100474231074e-05, "loss": 0.8002, "step": 752000 }, { "epoch": 1.2950001806976996, "grad_norm": 2.0292112827301025, "learning_rate": 2.8416663655038338e-05, "loss": 0.8004, "step": 752500 }, { "epoch": 1.2958606459340436, "grad_norm": 2.077418565750122, "learning_rate": 2.840232256776594e-05, "loss": 0.79, "step": 753000 }, { "epoch": 1.2967211111703876, "grad_norm": 1.8476754426956177, "learning_rate": 2.8387981480493542e-05, "loss": 0.7929, "step": 753500 }, { "epoch": 1.2975815764067316, "grad_norm": 2.1577837467193604, "learning_rate": 2.837364039322114e-05, "loss": 0.7975, "step": 754000 }, { "epoch": 1.2984420416430755, "grad_norm": 2.1313822269439697, "learning_rate": 2.8359299305948743e-05, "loss": 0.8, "step": 754500 }, { "epoch": 1.2993025068794195, "grad_norm": 1.9185123443603516, "learning_rate": 2.8344958218676343e-05, "loss": 0.797, "step": 755000 }, { "epoch": 1.3001629721157635, "grad_norm": 2.139822006225586, "learning_rate": 2.833061713140394e-05, "loss": 0.7926, "step": 755500 }, { "epoch": 1.3010234373521075, "grad_norm": 2.4005961418151855, "learning_rate": 2.831627604413154e-05, "loss": 0.7925, "step": 756000 }, { "epoch": 1.3018839025884517, "grad_norm": 2.130309820175171, "learning_rate": 2.8301934956859145e-05, "loss": 0.7971, "step": 756500 }, { "epoch": 1.3027443678247956, "grad_norm": 1.9937305450439453, "learning_rate": 2.8287593869586742e-05, "loss": 0.7948, "step": 757000 }, { "epoch": 1.3036048330611396, "grad_norm": 2.077946186065674, "learning_rate": 2.8273252782314342e-05, "loss": 0.7924, "step": 757500 }, { "epoch": 1.3044652982974836, "grad_norm": 2.286851644515991, "learning_rate": 2.8258911695041946e-05, "loss": 0.7989, "step": 758000 }, { "epoch": 1.3053257635338276, "grad_norm": 2.0541067123413086, "learning_rate": 2.8244570607769543e-05, "loss": 0.7943, "step": 758500 }, { "epoch": 1.3061862287701715, "grad_norm": 2.131272315979004, "learning_rate": 2.8230229520497144e-05, "loss": 0.7963, "step": 759000 }, { "epoch": 1.3070466940065155, "grad_norm": 2.097992181777954, "learning_rate": 2.8215888433224748e-05, "loss": 0.7926, "step": 759500 }, { "epoch": 1.3079071592428595, "grad_norm": 2.153024196624756, "learning_rate": 2.8201547345952345e-05, "loss": 0.7941, "step": 760000 }, { "epoch": 1.3087676244792035, "grad_norm": 2.235593557357788, "learning_rate": 2.8187206258679942e-05, "loss": 0.7963, "step": 760500 }, { "epoch": 1.3096280897155474, "grad_norm": 2.201599359512329, "learning_rate": 2.8172865171407542e-05, "loss": 0.7913, "step": 761000 }, { "epoch": 1.3104885549518914, "grad_norm": 1.854984164237976, "learning_rate": 2.8158524084135146e-05, "loss": 0.7949, "step": 761500 }, { "epoch": 1.3113490201882354, "grad_norm": 2.277507781982422, "learning_rate": 2.8144182996862743e-05, "loss": 0.7938, "step": 762000 }, { "epoch": 1.3122094854245794, "grad_norm": 2.180150032043457, "learning_rate": 2.8129841909590344e-05, "loss": 0.7941, "step": 762500 }, { "epoch": 1.3130699506609234, "grad_norm": 2.0832903385162354, "learning_rate": 2.8115500822317948e-05, "loss": 0.7906, "step": 763000 }, { "epoch": 1.3139304158972673, "grad_norm": 2.1612677574157715, "learning_rate": 2.8101159735045545e-05, "loss": 0.7922, "step": 763500 }, { "epoch": 1.3147908811336113, "grad_norm": 1.9641860723495483, "learning_rate": 2.8086818647773145e-05, "loss": 0.7943, "step": 764000 }, { "epoch": 1.3156513463699553, "grad_norm": 2.2745723724365234, "learning_rate": 2.807247756050075e-05, "loss": 0.796, "step": 764500 }, { "epoch": 1.3165118116062993, "grad_norm": 1.9285554885864258, "learning_rate": 2.8058136473228346e-05, "loss": 0.7999, "step": 765000 }, { "epoch": 1.3173722768426432, "grad_norm": 2.124898672103882, "learning_rate": 2.8043795385955947e-05, "loss": 0.7919, "step": 765500 }, { "epoch": 1.3182327420789872, "grad_norm": 2.0991389751434326, "learning_rate": 2.802945429868355e-05, "loss": 0.7942, "step": 766000 }, { "epoch": 1.3190932073153312, "grad_norm": 2.0024592876434326, "learning_rate": 2.8015113211411148e-05, "loss": 0.7937, "step": 766500 }, { "epoch": 1.3199536725516752, "grad_norm": 2.1842150688171387, "learning_rate": 2.8000772124138748e-05, "loss": 0.7988, "step": 767000 }, { "epoch": 1.3208141377880191, "grad_norm": 2.1621017456054688, "learning_rate": 2.7986431036866345e-05, "loss": 0.7991, "step": 767500 }, { "epoch": 1.3216746030243631, "grad_norm": 2.237100839614868, "learning_rate": 2.797208994959395e-05, "loss": 0.7895, "step": 768000 }, { "epoch": 1.322535068260707, "grad_norm": 2.050424098968506, "learning_rate": 2.795774886232155e-05, "loss": 0.7929, "step": 768500 }, { "epoch": 1.323395533497051, "grad_norm": 2.0490074157714844, "learning_rate": 2.7943407775049146e-05, "loss": 0.7906, "step": 769000 }, { "epoch": 1.324255998733395, "grad_norm": 2.1215157508850098, "learning_rate": 2.792906668777675e-05, "loss": 0.8012, "step": 769500 }, { "epoch": 1.325116463969739, "grad_norm": 2.28594708442688, "learning_rate": 2.7914725600504347e-05, "loss": 0.7936, "step": 770000 }, { "epoch": 1.325976929206083, "grad_norm": 1.9430696964263916, "learning_rate": 2.7900384513231948e-05, "loss": 0.7891, "step": 770500 }, { "epoch": 1.3268373944424272, "grad_norm": 2.1309525966644287, "learning_rate": 2.7886043425959552e-05, "loss": 0.7881, "step": 771000 }, { "epoch": 1.3276978596787712, "grad_norm": 1.9575910568237305, "learning_rate": 2.787170233868715e-05, "loss": 0.7892, "step": 771500 }, { "epoch": 1.3285583249151152, "grad_norm": 2.077388048171997, "learning_rate": 2.785736125141475e-05, "loss": 0.7905, "step": 772000 }, { "epoch": 1.3294187901514591, "grad_norm": 2.061293840408325, "learning_rate": 2.7843020164142346e-05, "loss": 0.796, "step": 772500 }, { "epoch": 1.3302792553878031, "grad_norm": 2.2610721588134766, "learning_rate": 2.782867907686995e-05, "loss": 0.7873, "step": 773000 }, { "epoch": 1.331139720624147, "grad_norm": 2.2062535285949707, "learning_rate": 2.781433798959755e-05, "loss": 0.7969, "step": 773500 }, { "epoch": 1.332000185860491, "grad_norm": 2.090650796890259, "learning_rate": 2.7799996902325148e-05, "loss": 0.7914, "step": 774000 }, { "epoch": 1.332860651096835, "grad_norm": 2.083618402481079, "learning_rate": 2.778565581505275e-05, "loss": 0.7907, "step": 774500 }, { "epoch": 1.333721116333179, "grad_norm": 1.9934265613555908, "learning_rate": 2.7771314727780352e-05, "loss": 0.7986, "step": 775000 }, { "epoch": 1.334581581569523, "grad_norm": 1.9388866424560547, "learning_rate": 2.775697364050795e-05, "loss": 0.7874, "step": 775500 }, { "epoch": 1.335442046805867, "grad_norm": 2.0379679203033447, "learning_rate": 2.7742632553235553e-05, "loss": 0.7916, "step": 776000 }, { "epoch": 1.336302512042211, "grad_norm": 2.174241542816162, "learning_rate": 2.7728291465963154e-05, "loss": 0.7892, "step": 776500 }, { "epoch": 1.337162977278555, "grad_norm": 2.243471384048462, "learning_rate": 2.771395037869075e-05, "loss": 0.7976, "step": 777000 }, { "epoch": 1.338023442514899, "grad_norm": 2.231506824493408, "learning_rate": 2.7699609291418354e-05, "loss": 0.7988, "step": 777500 }, { "epoch": 1.3388839077512429, "grad_norm": 1.9651925563812256, "learning_rate": 2.768526820414595e-05, "loss": 0.7943, "step": 778000 }, { "epoch": 1.3397443729875869, "grad_norm": 2.184065818786621, "learning_rate": 2.7670927116873552e-05, "loss": 0.7884, "step": 778500 }, { "epoch": 1.3406048382239308, "grad_norm": 2.0603458881378174, "learning_rate": 2.765658602960115e-05, "loss": 0.7952, "step": 779000 }, { "epoch": 1.341465303460275, "grad_norm": 2.1272175312042236, "learning_rate": 2.7642244942328753e-05, "loss": 0.7968, "step": 779500 }, { "epoch": 1.342325768696619, "grad_norm": 2.0302109718322754, "learning_rate": 2.7627903855056353e-05, "loss": 0.7892, "step": 780000 }, { "epoch": 1.343186233932963, "grad_norm": 2.089341163635254, "learning_rate": 2.761356276778395e-05, "loss": 0.7902, "step": 780500 }, { "epoch": 1.344046699169307, "grad_norm": 2.258023500442505, "learning_rate": 2.7599221680511554e-05, "loss": 0.7874, "step": 781000 }, { "epoch": 1.344907164405651, "grad_norm": 2.0876317024230957, "learning_rate": 2.7584880593239155e-05, "loss": 0.7919, "step": 781500 }, { "epoch": 1.345767629641995, "grad_norm": 1.8872166872024536, "learning_rate": 2.7570539505966752e-05, "loss": 0.8001, "step": 782000 }, { "epoch": 1.346628094878339, "grad_norm": 2.0026206970214844, "learning_rate": 2.7556198418694356e-05, "loss": 0.7963, "step": 782500 }, { "epoch": 1.3474885601146829, "grad_norm": 1.90133535861969, "learning_rate": 2.7541857331421956e-05, "loss": 0.7928, "step": 783000 }, { "epoch": 1.3483490253510269, "grad_norm": 2.0178732872009277, "learning_rate": 2.7527516244149553e-05, "loss": 0.7958, "step": 783500 }, { "epoch": 1.3492094905873708, "grad_norm": 2.0105807781219482, "learning_rate": 2.7513175156877154e-05, "loss": 0.7899, "step": 784000 }, { "epoch": 1.3500699558237148, "grad_norm": 2.131472587585449, "learning_rate": 2.7498834069604758e-05, "loss": 0.7949, "step": 784500 }, { "epoch": 1.3509304210600588, "grad_norm": 2.2190399169921875, "learning_rate": 2.7484492982332355e-05, "loss": 0.8001, "step": 785000 }, { "epoch": 1.3517908862964028, "grad_norm": 2.1613001823425293, "learning_rate": 2.7470151895059952e-05, "loss": 0.7914, "step": 785500 }, { "epoch": 1.3526513515327467, "grad_norm": 2.416355609893799, "learning_rate": 2.745581080778756e-05, "loss": 0.7977, "step": 786000 }, { "epoch": 1.3535118167690907, "grad_norm": 2.159780740737915, "learning_rate": 2.7441469720515156e-05, "loss": 0.7909, "step": 786500 }, { "epoch": 1.3543722820054347, "grad_norm": 2.2719836235046387, "learning_rate": 2.7427128633242753e-05, "loss": 0.7887, "step": 787000 }, { "epoch": 1.3552327472417787, "grad_norm": 2.027290105819702, "learning_rate": 2.7412787545970357e-05, "loss": 0.7936, "step": 787500 }, { "epoch": 1.3560932124781226, "grad_norm": 2.148388624191284, "learning_rate": 2.7398446458697958e-05, "loss": 0.7945, "step": 788000 }, { "epoch": 1.3569536777144666, "grad_norm": 2.142289161682129, "learning_rate": 2.7384105371425555e-05, "loss": 0.7937, "step": 788500 }, { "epoch": 1.3578141429508106, "grad_norm": 2.2423949241638184, "learning_rate": 2.736976428415316e-05, "loss": 0.7912, "step": 789000 }, { "epoch": 1.3586746081871546, "grad_norm": 2.101126194000244, "learning_rate": 2.735542319688076e-05, "loss": 0.7911, "step": 789500 }, { "epoch": 1.3595350734234986, "grad_norm": 2.2099528312683105, "learning_rate": 2.7341082109608356e-05, "loss": 0.7948, "step": 790000 }, { "epoch": 1.3603955386598425, "grad_norm": 1.9013993740081787, "learning_rate": 2.7326741022335957e-05, "loss": 0.7987, "step": 790500 }, { "epoch": 1.3612560038961865, "grad_norm": 2.013784646987915, "learning_rate": 2.731239993506356e-05, "loss": 0.7913, "step": 791000 }, { "epoch": 1.3621164691325305, "grad_norm": 2.1542694568634033, "learning_rate": 2.7298058847791157e-05, "loss": 0.7947, "step": 791500 }, { "epoch": 1.3629769343688745, "grad_norm": 2.124849796295166, "learning_rate": 2.7283717760518758e-05, "loss": 0.79, "step": 792000 }, { "epoch": 1.3638373996052184, "grad_norm": 2.034287691116333, "learning_rate": 2.7269376673246362e-05, "loss": 0.7918, "step": 792500 }, { "epoch": 1.3646978648415624, "grad_norm": 2.1357600688934326, "learning_rate": 2.725503558597396e-05, "loss": 0.7942, "step": 793000 }, { "epoch": 1.3655583300779064, "grad_norm": 2.0473575592041016, "learning_rate": 2.724069449870156e-05, "loss": 0.7944, "step": 793500 }, { "epoch": 1.3664187953142506, "grad_norm": 2.184004545211792, "learning_rate": 2.7226353411429163e-05, "loss": 0.7896, "step": 794000 }, { "epoch": 1.3672792605505946, "grad_norm": 2.0148677825927734, "learning_rate": 2.721201232415676e-05, "loss": 0.7834, "step": 794500 }, { "epoch": 1.3681397257869385, "grad_norm": 2.2517378330230713, "learning_rate": 2.7197671236884357e-05, "loss": 0.7933, "step": 795000 }, { "epoch": 1.3690001910232825, "grad_norm": 2.022488594055176, "learning_rate": 2.7183330149611958e-05, "loss": 0.7912, "step": 795500 }, { "epoch": 1.3698606562596265, "grad_norm": 2.242121458053589, "learning_rate": 2.7168989062339562e-05, "loss": 0.7959, "step": 796000 }, { "epoch": 1.3707211214959705, "grad_norm": 1.8660246133804321, "learning_rate": 2.715464797506716e-05, "loss": 0.7889, "step": 796500 }, { "epoch": 1.3715815867323145, "grad_norm": 1.893210768699646, "learning_rate": 2.714030688779476e-05, "loss": 0.7919, "step": 797000 }, { "epoch": 1.3724420519686584, "grad_norm": 1.7890762090682983, "learning_rate": 2.7125965800522363e-05, "loss": 0.7938, "step": 797500 }, { "epoch": 1.3733025172050024, "grad_norm": 2.115593910217285, "learning_rate": 2.711162471324996e-05, "loss": 0.7844, "step": 798000 }, { "epoch": 1.3741629824413464, "grad_norm": 2.0752272605895996, "learning_rate": 2.709728362597756e-05, "loss": 0.7849, "step": 798500 }, { "epoch": 1.3750234476776904, "grad_norm": 2.2504701614379883, "learning_rate": 2.7082942538705165e-05, "loss": 0.7883, "step": 799000 }, { "epoch": 1.3758839129140343, "grad_norm": 1.9834873676300049, "learning_rate": 2.706860145143276e-05, "loss": 0.7902, "step": 799500 }, { "epoch": 1.3767443781503783, "grad_norm": 2.212036609649658, "learning_rate": 2.7054260364160362e-05, "loss": 0.788, "step": 800000 }, { "epoch": 1.3776048433867223, "grad_norm": 2.212900161743164, "learning_rate": 2.7039919276887966e-05, "loss": 0.7913, "step": 800500 }, { "epoch": 1.3784653086230663, "grad_norm": 2.1146013736724854, "learning_rate": 2.7025578189615563e-05, "loss": 0.786, "step": 801000 }, { "epoch": 1.3793257738594102, "grad_norm": 1.9993088245391846, "learning_rate": 2.7011237102343163e-05, "loss": 0.7874, "step": 801500 }, { "epoch": 1.3801862390957542, "grad_norm": 2.087582588195801, "learning_rate": 2.699689601507076e-05, "loss": 0.7959, "step": 802000 }, { "epoch": 1.3810467043320982, "grad_norm": 2.0022664070129395, "learning_rate": 2.6982554927798364e-05, "loss": 0.788, "step": 802500 }, { "epoch": 1.3819071695684424, "grad_norm": 2.188950777053833, "learning_rate": 2.6968213840525965e-05, "loss": 0.7871, "step": 803000 }, { "epoch": 1.3827676348047864, "grad_norm": 2.002671003341675, "learning_rate": 2.6953872753253562e-05, "loss": 0.7899, "step": 803500 }, { "epoch": 1.3836281000411303, "grad_norm": 2.184645175933838, "learning_rate": 2.6939531665981166e-05, "loss": 0.791, "step": 804000 }, { "epoch": 1.3844885652774743, "grad_norm": 2.0297811031341553, "learning_rate": 2.6925190578708763e-05, "loss": 0.7846, "step": 804500 }, { "epoch": 1.3853490305138183, "grad_norm": 2.2588882446289062, "learning_rate": 2.6910849491436363e-05, "loss": 0.7898, "step": 805000 }, { "epoch": 1.3862094957501623, "grad_norm": 1.9149460792541504, "learning_rate": 2.6896508404163967e-05, "loss": 0.7891, "step": 805500 }, { "epoch": 1.3870699609865063, "grad_norm": 2.145394802093506, "learning_rate": 2.6882167316891564e-05, "loss": 0.7866, "step": 806000 }, { "epoch": 1.3879304262228502, "grad_norm": 2.0832836627960205, "learning_rate": 2.6867826229619165e-05, "loss": 0.7943, "step": 806500 }, { "epoch": 1.3887908914591942, "grad_norm": 2.0837013721466064, "learning_rate": 2.6853485142346762e-05, "loss": 0.7868, "step": 807000 }, { "epoch": 1.3896513566955382, "grad_norm": 2.2829363346099854, "learning_rate": 2.6839144055074366e-05, "loss": 0.7926, "step": 807500 }, { "epoch": 1.3905118219318822, "grad_norm": 1.8782167434692383, "learning_rate": 2.6824802967801966e-05, "loss": 0.7989, "step": 808000 }, { "epoch": 1.3913722871682261, "grad_norm": 2.088763952255249, "learning_rate": 2.6810461880529563e-05, "loss": 0.7846, "step": 808500 }, { "epoch": 1.3922327524045701, "grad_norm": 2.1010475158691406, "learning_rate": 2.6796120793257167e-05, "loss": 0.7855, "step": 809000 }, { "epoch": 1.393093217640914, "grad_norm": 2.3863370418548584, "learning_rate": 2.6781779705984768e-05, "loss": 0.7866, "step": 809500 }, { "epoch": 1.393953682877258, "grad_norm": 2.236123561859131, "learning_rate": 2.6767438618712365e-05, "loss": 0.7878, "step": 810000 }, { "epoch": 1.394814148113602, "grad_norm": 2.2748334407806396, "learning_rate": 2.675309753143997e-05, "loss": 0.7925, "step": 810500 }, { "epoch": 1.395674613349946, "grad_norm": 2.1135990619659424, "learning_rate": 2.673875644416757e-05, "loss": 0.7934, "step": 811000 }, { "epoch": 1.39653507858629, "grad_norm": 2.133629083633423, "learning_rate": 2.6724415356895166e-05, "loss": 0.7862, "step": 811500 }, { "epoch": 1.397395543822634, "grad_norm": 1.9208927154541016, "learning_rate": 2.671007426962277e-05, "loss": 0.7879, "step": 812000 }, { "epoch": 1.398256009058978, "grad_norm": 1.9107301235198975, "learning_rate": 2.6695733182350367e-05, "loss": 0.7919, "step": 812500 }, { "epoch": 1.399116474295322, "grad_norm": 2.0901243686676025, "learning_rate": 2.6681392095077968e-05, "loss": 0.7935, "step": 813000 }, { "epoch": 1.399976939531666, "grad_norm": 2.2657084465026855, "learning_rate": 2.6667051007805565e-05, "loss": 0.7869, "step": 813500 }, { "epoch": 1.4008374047680099, "grad_norm": 1.8893272876739502, "learning_rate": 2.665270992053317e-05, "loss": 0.7882, "step": 814000 }, { "epoch": 1.4016978700043539, "grad_norm": 2.058760404586792, "learning_rate": 2.663836883326077e-05, "loss": 0.7915, "step": 814500 }, { "epoch": 1.4025583352406978, "grad_norm": 1.98762845993042, "learning_rate": 2.6624027745988366e-05, "loss": 0.7847, "step": 815000 }, { "epoch": 1.4034188004770418, "grad_norm": 1.9851107597351074, "learning_rate": 2.660968665871597e-05, "loss": 0.7921, "step": 815500 }, { "epoch": 1.4042792657133858, "grad_norm": 2.0246217250823975, "learning_rate": 2.659534557144357e-05, "loss": 0.7882, "step": 816000 }, { "epoch": 1.4051397309497298, "grad_norm": 2.0061659812927246, "learning_rate": 2.6581004484171167e-05, "loss": 0.7917, "step": 816500 }, { "epoch": 1.406000196186074, "grad_norm": 2.1723735332489014, "learning_rate": 2.656666339689877e-05, "loss": 0.7853, "step": 817000 }, { "epoch": 1.406860661422418, "grad_norm": 2.245793342590332, "learning_rate": 2.6552322309626372e-05, "loss": 0.7894, "step": 817500 }, { "epoch": 1.407721126658762, "grad_norm": 1.9718598127365112, "learning_rate": 2.653798122235397e-05, "loss": 0.7898, "step": 818000 }, { "epoch": 1.408581591895106, "grad_norm": 2.3851137161254883, "learning_rate": 2.652364013508157e-05, "loss": 0.7886, "step": 818500 }, { "epoch": 1.4094420571314499, "grad_norm": 2.107008218765259, "learning_rate": 2.6509299047809173e-05, "loss": 0.7865, "step": 819000 }, { "epoch": 1.4103025223677939, "grad_norm": 2.15132999420166, "learning_rate": 2.649495796053677e-05, "loss": 0.7899, "step": 819500 }, { "epoch": 1.4111629876041378, "grad_norm": 2.262600898742676, "learning_rate": 2.6480616873264367e-05, "loss": 0.7867, "step": 820000 }, { "epoch": 1.4120234528404818, "grad_norm": 2.184628486633301, "learning_rate": 2.6466275785991975e-05, "loss": 0.7841, "step": 820500 }, { "epoch": 1.4128839180768258, "grad_norm": 2.1769585609436035, "learning_rate": 2.645193469871957e-05, "loss": 0.7902, "step": 821000 }, { "epoch": 1.4137443833131698, "grad_norm": 1.972236156463623, "learning_rate": 2.643759361144717e-05, "loss": 0.7807, "step": 821500 }, { "epoch": 1.4146048485495137, "grad_norm": 2.0056240558624268, "learning_rate": 2.6423252524174773e-05, "loss": 0.7876, "step": 822000 }, { "epoch": 1.4154653137858577, "grad_norm": 1.9562522172927856, "learning_rate": 2.6408911436902373e-05, "loss": 0.7888, "step": 822500 }, { "epoch": 1.4163257790222017, "grad_norm": 1.9554847478866577, "learning_rate": 2.639457034962997e-05, "loss": 0.7817, "step": 823000 }, { "epoch": 1.4171862442585457, "grad_norm": 2.1087417602539062, "learning_rate": 2.6380229262357574e-05, "loss": 0.7848, "step": 823500 }, { "epoch": 1.4180467094948896, "grad_norm": 2.0702404975891113, "learning_rate": 2.6365888175085174e-05, "loss": 0.7847, "step": 824000 }, { "epoch": 1.4189071747312336, "grad_norm": 2.1107707023620605, "learning_rate": 2.635154708781277e-05, "loss": 0.7875, "step": 824500 }, { "epoch": 1.4197676399675776, "grad_norm": 2.111306667327881, "learning_rate": 2.6337206000540372e-05, "loss": 0.7847, "step": 825000 }, { "epoch": 1.4206281052039216, "grad_norm": 2.0543792247772217, "learning_rate": 2.6322864913267976e-05, "loss": 0.7801, "step": 825500 }, { "epoch": 1.4214885704402658, "grad_norm": 2.030547857284546, "learning_rate": 2.6308523825995573e-05, "loss": 0.788, "step": 826000 }, { "epoch": 1.4223490356766098, "grad_norm": 2.2248077392578125, "learning_rate": 2.6294182738723173e-05, "loss": 0.7876, "step": 826500 }, { "epoch": 1.4232095009129537, "grad_norm": 1.9383457899093628, "learning_rate": 2.6279841651450777e-05, "loss": 0.7853, "step": 827000 }, { "epoch": 1.4240699661492977, "grad_norm": 2.11863112449646, "learning_rate": 2.6265500564178374e-05, "loss": 0.7822, "step": 827500 }, { "epoch": 1.4249304313856417, "grad_norm": 2.0522687435150146, "learning_rate": 2.6251159476905975e-05, "loss": 0.7876, "step": 828000 }, { "epoch": 1.4257908966219857, "grad_norm": 2.1168301105499268, "learning_rate": 2.623681838963358e-05, "loss": 0.7864, "step": 828500 }, { "epoch": 1.4266513618583296, "grad_norm": 2.140026569366455, "learning_rate": 2.6222477302361176e-05, "loss": 0.7795, "step": 829000 }, { "epoch": 1.4275118270946736, "grad_norm": 1.9109783172607422, "learning_rate": 2.6208136215088773e-05, "loss": 0.7868, "step": 829500 }, { "epoch": 1.4283722923310176, "grad_norm": 2.115460157394409, "learning_rate": 2.619379512781638e-05, "loss": 0.7908, "step": 830000 }, { "epoch": 1.4292327575673616, "grad_norm": 2.587132692337036, "learning_rate": 2.6179454040543977e-05, "loss": 0.7954, "step": 830500 }, { "epoch": 1.4300932228037055, "grad_norm": 1.930621862411499, "learning_rate": 2.6165112953271574e-05, "loss": 0.7875, "step": 831000 }, { "epoch": 1.4309536880400495, "grad_norm": 2.050018072128296, "learning_rate": 2.6150771865999175e-05, "loss": 0.7929, "step": 831500 }, { "epoch": 1.4318141532763935, "grad_norm": 2.272819995880127, "learning_rate": 2.613643077872678e-05, "loss": 0.7858, "step": 832000 }, { "epoch": 1.4326746185127375, "grad_norm": 1.9963629245758057, "learning_rate": 2.6122089691454376e-05, "loss": 0.7816, "step": 832500 }, { "epoch": 1.4335350837490815, "grad_norm": 2.0288803577423096, "learning_rate": 2.6107748604181976e-05, "loss": 0.7887, "step": 833000 }, { "epoch": 1.4343955489854254, "grad_norm": 2.02726411819458, "learning_rate": 2.609340751690958e-05, "loss": 0.7814, "step": 833500 }, { "epoch": 1.4352560142217694, "grad_norm": 2.012253999710083, "learning_rate": 2.6079066429637177e-05, "loss": 0.7818, "step": 834000 }, { "epoch": 1.4361164794581134, "grad_norm": 2.117372512817383, "learning_rate": 2.6064725342364778e-05, "loss": 0.792, "step": 834500 }, { "epoch": 1.4369769446944574, "grad_norm": 2.0443718433380127, "learning_rate": 2.605038425509238e-05, "loss": 0.7854, "step": 835000 }, { "epoch": 1.4378374099308013, "grad_norm": 2.198227882385254, "learning_rate": 2.603604316781998e-05, "loss": 0.7855, "step": 835500 }, { "epoch": 1.4386978751671453, "grad_norm": 2.020047187805176, "learning_rate": 2.602170208054758e-05, "loss": 0.7972, "step": 836000 }, { "epoch": 1.4395583404034893, "grad_norm": 2.201357841491699, "learning_rate": 2.6007360993275176e-05, "loss": 0.7895, "step": 836500 }, { "epoch": 1.4404188056398333, "grad_norm": 1.8877038955688477, "learning_rate": 2.599301990600278e-05, "loss": 0.7868, "step": 837000 }, { "epoch": 1.4412792708761772, "grad_norm": 2.0734703540802, "learning_rate": 2.5978678818730377e-05, "loss": 0.7822, "step": 837500 }, { "epoch": 1.4421397361125212, "grad_norm": 2.0934813022613525, "learning_rate": 2.5964337731457978e-05, "loss": 0.7779, "step": 838000 }, { "epoch": 1.4430002013488652, "grad_norm": 2.2466344833374023, "learning_rate": 2.594999664418558e-05, "loss": 0.7773, "step": 838500 }, { "epoch": 1.4438606665852092, "grad_norm": 2.1962478160858154, "learning_rate": 2.593565555691318e-05, "loss": 0.7824, "step": 839000 }, { "epoch": 1.4447211318215532, "grad_norm": 2.0171070098876953, "learning_rate": 2.592131446964078e-05, "loss": 0.7868, "step": 839500 }, { "epoch": 1.4455815970578971, "grad_norm": 2.070829153060913, "learning_rate": 2.5906973382368383e-05, "loss": 0.7862, "step": 840000 }, { "epoch": 1.4464420622942413, "grad_norm": 2.067033052444458, "learning_rate": 2.589263229509598e-05, "loss": 0.7832, "step": 840500 }, { "epoch": 1.4473025275305853, "grad_norm": 2.241600751876831, "learning_rate": 2.587829120782358e-05, "loss": 0.7822, "step": 841000 }, { "epoch": 1.4481629927669293, "grad_norm": 2.129298210144043, "learning_rate": 2.5863950120551184e-05, "loss": 0.7838, "step": 841500 }, { "epoch": 1.4490234580032733, "grad_norm": 1.9001485109329224, "learning_rate": 2.584960903327878e-05, "loss": 0.7836, "step": 842000 }, { "epoch": 1.4498839232396172, "grad_norm": 2.0611612796783447, "learning_rate": 2.5835267946006382e-05, "loss": 0.7885, "step": 842500 }, { "epoch": 1.4507443884759612, "grad_norm": 2.0608203411102295, "learning_rate": 2.582092685873398e-05, "loss": 0.7813, "step": 843000 }, { "epoch": 1.4516048537123052, "grad_norm": 2.246284008026123, "learning_rate": 2.5806585771461583e-05, "loss": 0.7902, "step": 843500 }, { "epoch": 1.4524653189486492, "grad_norm": 2.0218515396118164, "learning_rate": 2.5792244684189183e-05, "loss": 0.7825, "step": 844000 }, { "epoch": 1.4533257841849931, "grad_norm": 1.9001346826553345, "learning_rate": 2.577790359691678e-05, "loss": 0.7847, "step": 844500 }, { "epoch": 1.4541862494213371, "grad_norm": 2.2586870193481445, "learning_rate": 2.5763562509644384e-05, "loss": 0.786, "step": 845000 }, { "epoch": 1.455046714657681, "grad_norm": 2.003862142562866, "learning_rate": 2.5749221422371985e-05, "loss": 0.7761, "step": 845500 }, { "epoch": 1.455907179894025, "grad_norm": 2.1195812225341797, "learning_rate": 2.573488033509958e-05, "loss": 0.7877, "step": 846000 }, { "epoch": 1.456767645130369, "grad_norm": 2.1536705493927, "learning_rate": 2.5720539247827186e-05, "loss": 0.7871, "step": 846500 }, { "epoch": 1.457628110366713, "grad_norm": 2.003711700439453, "learning_rate": 2.5706198160554783e-05, "loss": 0.7824, "step": 847000 }, { "epoch": 1.458488575603057, "grad_norm": 2.0670664310455322, "learning_rate": 2.5691857073282383e-05, "loss": 0.7894, "step": 847500 }, { "epoch": 1.459349040839401, "grad_norm": 2.1034762859344482, "learning_rate": 2.567751598600998e-05, "loss": 0.7916, "step": 848000 }, { "epoch": 1.460209506075745, "grad_norm": 2.1943678855895996, "learning_rate": 2.5663174898737584e-05, "loss": 0.7867, "step": 848500 }, { "epoch": 1.4610699713120892, "grad_norm": 2.0474393367767334, "learning_rate": 2.5648833811465184e-05, "loss": 0.7886, "step": 849000 }, { "epoch": 1.4619304365484331, "grad_norm": 2.041001558303833, "learning_rate": 2.563449272419278e-05, "loss": 0.782, "step": 849500 }, { "epoch": 1.462790901784777, "grad_norm": 2.227505683898926, "learning_rate": 2.5620151636920385e-05, "loss": 0.7847, "step": 850000 }, { "epoch": 1.463651367021121, "grad_norm": 2.0753791332244873, "learning_rate": 2.5605810549647986e-05, "loss": 0.7835, "step": 850500 }, { "epoch": 1.464511832257465, "grad_norm": 2.0337870121002197, "learning_rate": 2.5591469462375583e-05, "loss": 0.7849, "step": 851000 }, { "epoch": 1.465372297493809, "grad_norm": 2.089561700820923, "learning_rate": 2.5577128375103187e-05, "loss": 0.7793, "step": 851500 }, { "epoch": 1.466232762730153, "grad_norm": 2.158207654953003, "learning_rate": 2.5562787287830787e-05, "loss": 0.784, "step": 852000 }, { "epoch": 1.467093227966497, "grad_norm": 2.1370127201080322, "learning_rate": 2.5548446200558384e-05, "loss": 0.7834, "step": 852500 }, { "epoch": 1.467953693202841, "grad_norm": 2.061452627182007, "learning_rate": 2.5534105113285988e-05, "loss": 0.7817, "step": 853000 }, { "epoch": 1.468814158439185, "grad_norm": 2.0483486652374268, "learning_rate": 2.551976402601359e-05, "loss": 0.7869, "step": 853500 }, { "epoch": 1.469674623675529, "grad_norm": 2.03947377204895, "learning_rate": 2.5505422938741186e-05, "loss": 0.7811, "step": 854000 }, { "epoch": 1.470535088911873, "grad_norm": 2.0983335971832275, "learning_rate": 2.5491081851468783e-05, "loss": 0.7821, "step": 854500 }, { "epoch": 1.4713955541482169, "grad_norm": 2.134584903717041, "learning_rate": 2.547674076419639e-05, "loss": 0.7827, "step": 855000 }, { "epoch": 1.4722560193845609, "grad_norm": 2.0175790786743164, "learning_rate": 2.5462399676923987e-05, "loss": 0.783, "step": 855500 }, { "epoch": 1.4731164846209048, "grad_norm": 1.9841803312301636, "learning_rate": 2.5448058589651584e-05, "loss": 0.7782, "step": 856000 }, { "epoch": 1.4739769498572488, "grad_norm": 1.9247914552688599, "learning_rate": 2.5433717502379188e-05, "loss": 0.792, "step": 856500 }, { "epoch": 1.4748374150935928, "grad_norm": 1.9805024862289429, "learning_rate": 2.541937641510679e-05, "loss": 0.7829, "step": 857000 }, { "epoch": 1.4756978803299368, "grad_norm": 1.9660342931747437, "learning_rate": 2.5405035327834386e-05, "loss": 0.7824, "step": 857500 }, { "epoch": 1.4765583455662807, "grad_norm": 2.1732521057128906, "learning_rate": 2.539069424056199e-05, "loss": 0.782, "step": 858000 }, { "epoch": 1.4774188108026247, "grad_norm": 2.0135018825531006, "learning_rate": 2.537635315328959e-05, "loss": 0.78, "step": 858500 }, { "epoch": 1.4782792760389687, "grad_norm": 2.021561861038208, "learning_rate": 2.5362012066017187e-05, "loss": 0.7863, "step": 859000 }, { "epoch": 1.4791397412753127, "grad_norm": 2.2785863876342773, "learning_rate": 2.5347670978744788e-05, "loss": 0.7862, "step": 859500 }, { "epoch": 1.4800002065116566, "grad_norm": 1.9567214250564575, "learning_rate": 2.533332989147239e-05, "loss": 0.7847, "step": 860000 }, { "epoch": 1.4808606717480006, "grad_norm": 2.1036536693573, "learning_rate": 2.531898880419999e-05, "loss": 0.7813, "step": 860500 }, { "epoch": 1.4817211369843446, "grad_norm": 2.144239664077759, "learning_rate": 2.530464771692759e-05, "loss": 0.7854, "step": 861000 }, { "epoch": 1.4825816022206886, "grad_norm": 2.243926525115967, "learning_rate": 2.5290306629655193e-05, "loss": 0.7927, "step": 861500 }, { "epoch": 1.4834420674570326, "grad_norm": 2.141397714614868, "learning_rate": 2.527596554238279e-05, "loss": 0.7895, "step": 862000 }, { "epoch": 1.4843025326933765, "grad_norm": 2.1214475631713867, "learning_rate": 2.5261624455110387e-05, "loss": 0.7831, "step": 862500 }, { "epoch": 1.4851629979297205, "grad_norm": 2.0760138034820557, "learning_rate": 2.5247283367837994e-05, "loss": 0.7839, "step": 863000 }, { "epoch": 1.4860234631660647, "grad_norm": 2.261528491973877, "learning_rate": 2.523294228056559e-05, "loss": 0.7794, "step": 863500 }, { "epoch": 1.4868839284024087, "grad_norm": 2.129002809524536, "learning_rate": 2.521860119329319e-05, "loss": 0.7752, "step": 864000 }, { "epoch": 1.4877443936387527, "grad_norm": 2.004643440246582, "learning_rate": 2.5204260106020792e-05, "loss": 0.7842, "step": 864500 }, { "epoch": 1.4886048588750966, "grad_norm": 1.9456123113632202, "learning_rate": 2.5189919018748393e-05, "loss": 0.7798, "step": 865000 }, { "epoch": 1.4894653241114406, "grad_norm": 2.0475871562957764, "learning_rate": 2.517557793147599e-05, "loss": 0.7867, "step": 865500 }, { "epoch": 1.4903257893477846, "grad_norm": 2.360131025314331, "learning_rate": 2.516123684420359e-05, "loss": 0.7775, "step": 866000 }, { "epoch": 1.4911862545841286, "grad_norm": 2.2324533462524414, "learning_rate": 2.5146895756931194e-05, "loss": 0.7877, "step": 866500 }, { "epoch": 1.4920467198204725, "grad_norm": 2.1139626502990723, "learning_rate": 2.513255466965879e-05, "loss": 0.7892, "step": 867000 }, { "epoch": 1.4929071850568165, "grad_norm": 2.0497446060180664, "learning_rate": 2.5118213582386392e-05, "loss": 0.7828, "step": 867500 }, { "epoch": 1.4937676502931605, "grad_norm": 2.2306737899780273, "learning_rate": 2.5103872495113996e-05, "loss": 0.7873, "step": 868000 }, { "epoch": 1.4946281155295045, "grad_norm": 2.103174924850464, "learning_rate": 2.5089531407841593e-05, "loss": 0.7827, "step": 868500 }, { "epoch": 1.4954885807658485, "grad_norm": 2.195070743560791, "learning_rate": 2.5075190320569193e-05, "loss": 0.7758, "step": 869000 }, { "epoch": 1.4963490460021924, "grad_norm": 2.206838607788086, "learning_rate": 2.5060849233296797e-05, "loss": 0.7786, "step": 869500 }, { "epoch": 1.4972095112385364, "grad_norm": 2.0621566772460938, "learning_rate": 2.5046508146024394e-05, "loss": 0.7844, "step": 870000 }, { "epoch": 1.4980699764748804, "grad_norm": 2.2440409660339355, "learning_rate": 2.5032167058751995e-05, "loss": 0.7817, "step": 870500 }, { "epoch": 1.4989304417112244, "grad_norm": 2.126237392425537, "learning_rate": 2.501782597147959e-05, "loss": 0.784, "step": 871000 }, { "epoch": 1.4997909069475683, "grad_norm": 2.475597381591797, "learning_rate": 2.5003484884207195e-05, "loss": 0.784, "step": 871500 }, { "epoch": 1.5006513721839125, "grad_norm": 1.868719458580017, "learning_rate": 2.4989143796934793e-05, "loss": 0.7921, "step": 872000 }, { "epoch": 1.5015118374202565, "grad_norm": 2.4086437225341797, "learning_rate": 2.4974802709662396e-05, "loss": 0.786, "step": 872500 }, { "epoch": 1.5023723026566005, "grad_norm": 2.0455057621002197, "learning_rate": 2.4960461622389993e-05, "loss": 0.7737, "step": 873000 }, { "epoch": 1.5032327678929445, "grad_norm": 2.060215711593628, "learning_rate": 2.4946120535117594e-05, "loss": 0.7831, "step": 873500 }, { "epoch": 1.5040932331292884, "grad_norm": 2.206716537475586, "learning_rate": 2.4931779447845198e-05, "loss": 0.7786, "step": 874000 }, { "epoch": 1.5049536983656324, "grad_norm": 2.23823618888855, "learning_rate": 2.4917438360572795e-05, "loss": 0.7794, "step": 874500 }, { "epoch": 1.5058141636019764, "grad_norm": 1.9800615310668945, "learning_rate": 2.4903097273300395e-05, "loss": 0.7717, "step": 875000 }, { "epoch": 1.5066746288383204, "grad_norm": 2.373481035232544, "learning_rate": 2.4888756186027996e-05, "loss": 0.782, "step": 875500 }, { "epoch": 1.5075350940746644, "grad_norm": 2.071988344192505, "learning_rate": 2.4874415098755596e-05, "loss": 0.7754, "step": 876000 }, { "epoch": 1.5083955593110083, "grad_norm": 2.165459394454956, "learning_rate": 2.4860074011483197e-05, "loss": 0.7822, "step": 876500 }, { "epoch": 1.5092560245473523, "grad_norm": 2.2290797233581543, "learning_rate": 2.4845732924210797e-05, "loss": 0.7783, "step": 877000 }, { "epoch": 1.5101164897836963, "grad_norm": 1.7810156345367432, "learning_rate": 2.4831391836938398e-05, "loss": 0.7803, "step": 877500 }, { "epoch": 1.5109769550200403, "grad_norm": 2.023090124130249, "learning_rate": 2.4817050749665995e-05, "loss": 0.7701, "step": 878000 }, { "epoch": 1.5118374202563842, "grad_norm": 2.0860238075256348, "learning_rate": 2.48027096623936e-05, "loss": 0.7855, "step": 878500 }, { "epoch": 1.5126978854927282, "grad_norm": 2.0972609519958496, "learning_rate": 2.47883685751212e-05, "loss": 0.78, "step": 879000 }, { "epoch": 1.5135583507290722, "grad_norm": 2.0910556316375732, "learning_rate": 2.4774027487848796e-05, "loss": 0.7828, "step": 879500 }, { "epoch": 1.5144188159654162, "grad_norm": 2.109569549560547, "learning_rate": 2.47596864005764e-05, "loss": 0.7832, "step": 880000 }, { "epoch": 1.5152792812017601, "grad_norm": 2.0984444618225098, "learning_rate": 2.4745345313304e-05, "loss": 0.7821, "step": 880500 }, { "epoch": 1.5161397464381041, "grad_norm": 2.1185643672943115, "learning_rate": 2.4731004226031598e-05, "loss": 0.7831, "step": 881000 }, { "epoch": 1.517000211674448, "grad_norm": 2.2747139930725098, "learning_rate": 2.4716663138759198e-05, "loss": 0.7775, "step": 881500 }, { "epoch": 1.517860676910792, "grad_norm": 2.023024320602417, "learning_rate": 2.47023220514868e-05, "loss": 0.7772, "step": 882000 }, { "epoch": 1.518721142147136, "grad_norm": 2.236903429031372, "learning_rate": 2.46879809642144e-05, "loss": 0.7797, "step": 882500 }, { "epoch": 1.51958160738348, "grad_norm": 2.104433298110962, "learning_rate": 2.4673639876942e-05, "loss": 0.7823, "step": 883000 }, { "epoch": 1.520442072619824, "grad_norm": 2.371293544769287, "learning_rate": 2.46592987896696e-05, "loss": 0.7781, "step": 883500 }, { "epoch": 1.521302537856168, "grad_norm": 2.0767617225646973, "learning_rate": 2.46449577023972e-05, "loss": 0.778, "step": 884000 }, { "epoch": 1.522163003092512, "grad_norm": 1.856438159942627, "learning_rate": 2.4630616615124798e-05, "loss": 0.7771, "step": 884500 }, { "epoch": 1.523023468328856, "grad_norm": 2.0989482402801514, "learning_rate": 2.46162755278524e-05, "loss": 0.7774, "step": 885000 }, { "epoch": 1.5238839335652, "grad_norm": 1.8530632257461548, "learning_rate": 2.4601934440580002e-05, "loss": 0.7752, "step": 885500 }, { "epoch": 1.5247443988015439, "grad_norm": 2.0262563228607178, "learning_rate": 2.45875933533076e-05, "loss": 0.7796, "step": 886000 }, { "epoch": 1.5256048640378879, "grad_norm": 2.2254626750946045, "learning_rate": 2.4573252266035203e-05, "loss": 0.7793, "step": 886500 }, { "epoch": 1.5264653292742318, "grad_norm": 2.0390655994415283, "learning_rate": 2.45589111787628e-05, "loss": 0.7805, "step": 887000 }, { "epoch": 1.5273257945105758, "grad_norm": 2.127013683319092, "learning_rate": 2.45445700914904e-05, "loss": 0.7819, "step": 887500 }, { "epoch": 1.52818625974692, "grad_norm": 2.007023572921753, "learning_rate": 2.4530229004218004e-05, "loss": 0.7837, "step": 888000 }, { "epoch": 1.529046724983264, "grad_norm": 1.9689440727233887, "learning_rate": 2.45158879169456e-05, "loss": 0.7796, "step": 888500 }, { "epoch": 1.529907190219608, "grad_norm": 1.9465103149414062, "learning_rate": 2.4501546829673202e-05, "loss": 0.7783, "step": 889000 }, { "epoch": 1.530767655455952, "grad_norm": 2.0596694946289062, "learning_rate": 2.4487205742400802e-05, "loss": 0.7874, "step": 889500 }, { "epoch": 1.531628120692296, "grad_norm": 2.059352397918701, "learning_rate": 2.4472864655128403e-05, "loss": 0.7789, "step": 890000 }, { "epoch": 1.53248858592864, "grad_norm": 2.119255542755127, "learning_rate": 2.4458523567856003e-05, "loss": 0.782, "step": 890500 }, { "epoch": 1.5333490511649839, "grad_norm": 2.0945546627044678, "learning_rate": 2.4444182480583604e-05, "loss": 0.7888, "step": 891000 }, { "epoch": 1.5342095164013279, "grad_norm": 2.090996265411377, "learning_rate": 2.4429841393311204e-05, "loss": 0.7818, "step": 891500 }, { "epoch": 1.5350699816376718, "grad_norm": 2.1802480220794678, "learning_rate": 2.4415500306038805e-05, "loss": 0.7745, "step": 892000 }, { "epoch": 1.5359304468740158, "grad_norm": 1.8560463190078735, "learning_rate": 2.4401159218766405e-05, "loss": 0.7792, "step": 892500 }, { "epoch": 1.5367909121103598, "grad_norm": 2.067178249359131, "learning_rate": 2.4386818131494006e-05, "loss": 0.7813, "step": 893000 }, { "epoch": 1.5376513773467038, "grad_norm": 2.077035665512085, "learning_rate": 2.4372477044221603e-05, "loss": 0.7756, "step": 893500 }, { "epoch": 1.538511842583048, "grad_norm": 1.9644643068313599, "learning_rate": 2.4358135956949203e-05, "loss": 0.775, "step": 894000 }, { "epoch": 1.539372307819392, "grad_norm": 2.1593332290649414, "learning_rate": 2.4343794869676807e-05, "loss": 0.7829, "step": 894500 }, { "epoch": 1.540232773055736, "grad_norm": 2.035702705383301, "learning_rate": 2.4329453782404404e-05, "loss": 0.7789, "step": 895000 }, { "epoch": 1.54109323829208, "grad_norm": 1.9208450317382812, "learning_rate": 2.4315112695132005e-05, "loss": 0.7737, "step": 895500 }, { "epoch": 1.5419537035284239, "grad_norm": 2.035158395767212, "learning_rate": 2.4300771607859605e-05, "loss": 0.7754, "step": 896000 }, { "epoch": 1.5428141687647678, "grad_norm": 2.130017042160034, "learning_rate": 2.4286430520587205e-05, "loss": 0.7805, "step": 896500 }, { "epoch": 1.5436746340011118, "grad_norm": 2.0422582626342773, "learning_rate": 2.4272089433314806e-05, "loss": 0.7746, "step": 897000 }, { "epoch": 1.5445350992374558, "grad_norm": 1.9432518482208252, "learning_rate": 2.4257748346042406e-05, "loss": 0.7772, "step": 897500 }, { "epoch": 1.5453955644737998, "grad_norm": 2.2162744998931885, "learning_rate": 2.4243407258770007e-05, "loss": 0.7771, "step": 898000 }, { "epoch": 1.5462560297101438, "grad_norm": 1.968711018562317, "learning_rate": 2.4229066171497604e-05, "loss": 0.7763, "step": 898500 }, { "epoch": 1.5471164949464877, "grad_norm": 2.030256748199463, "learning_rate": 2.4214725084225208e-05, "loss": 0.7808, "step": 899000 }, { "epoch": 1.5479769601828317, "grad_norm": 2.1321399211883545, "learning_rate": 2.4200383996952808e-05, "loss": 0.7828, "step": 899500 }, { "epoch": 1.5488374254191757, "grad_norm": 2.0849130153656006, "learning_rate": 2.4186042909680405e-05, "loss": 0.7749, "step": 900000 }, { "epoch": 1.5496978906555197, "grad_norm": 2.061917543411255, "learning_rate": 2.417170182240801e-05, "loss": 0.7801, "step": 900500 }, { "epoch": 1.5505583558918636, "grad_norm": 1.9755289554595947, "learning_rate": 2.4157360735135606e-05, "loss": 0.7789, "step": 901000 }, { "epoch": 1.5514188211282076, "grad_norm": 2.4730238914489746, "learning_rate": 2.4143019647863207e-05, "loss": 0.7739, "step": 901500 }, { "epoch": 1.5522792863645516, "grad_norm": 2.032313108444214, "learning_rate": 2.4128678560590807e-05, "loss": 0.7821, "step": 902000 }, { "epoch": 1.5531397516008956, "grad_norm": 2.0177109241485596, "learning_rate": 2.4114337473318408e-05, "loss": 0.7789, "step": 902500 }, { "epoch": 1.5540002168372395, "grad_norm": 2.158867120742798, "learning_rate": 2.4099996386046008e-05, "loss": 0.7752, "step": 903000 }, { "epoch": 1.5548606820735835, "grad_norm": 2.1361749172210693, "learning_rate": 2.408565529877361e-05, "loss": 0.7755, "step": 903500 }, { "epoch": 1.5557211473099275, "grad_norm": 2.079887628555298, "learning_rate": 2.407131421150121e-05, "loss": 0.7744, "step": 904000 }, { "epoch": 1.5565816125462715, "grad_norm": 2.108428478240967, "learning_rate": 2.405697312422881e-05, "loss": 0.7738, "step": 904500 }, { "epoch": 1.5574420777826155, "grad_norm": 2.295281171798706, "learning_rate": 2.404263203695641e-05, "loss": 0.7747, "step": 905000 }, { "epoch": 1.5583025430189594, "grad_norm": 2.012521743774414, "learning_rate": 2.402829094968401e-05, "loss": 0.7816, "step": 905500 }, { "epoch": 1.5591630082553034, "grad_norm": 2.0468950271606445, "learning_rate": 2.401394986241161e-05, "loss": 0.7806, "step": 906000 }, { "epoch": 1.5600234734916474, "grad_norm": 2.070603609085083, "learning_rate": 2.3999608775139208e-05, "loss": 0.7759, "step": 906500 }, { "epoch": 1.5608839387279914, "grad_norm": 2.0377748012542725, "learning_rate": 2.3985267687866812e-05, "loss": 0.7781, "step": 907000 }, { "epoch": 1.5617444039643353, "grad_norm": 2.356083631515503, "learning_rate": 2.397092660059441e-05, "loss": 0.7772, "step": 907500 }, { "epoch": 1.5626048692006793, "grad_norm": 2.108891010284424, "learning_rate": 2.395658551332201e-05, "loss": 0.7729, "step": 908000 }, { "epoch": 1.5634653344370233, "grad_norm": 2.133230686187744, "learning_rate": 2.3942244426049613e-05, "loss": 0.772, "step": 908500 }, { "epoch": 1.5643257996733673, "grad_norm": 2.0232138633728027, "learning_rate": 2.392790333877721e-05, "loss": 0.7725, "step": 909000 }, { "epoch": 1.5651862649097112, "grad_norm": 2.1651556491851807, "learning_rate": 2.391356225150481e-05, "loss": 0.7794, "step": 909500 }, { "epoch": 1.5660467301460552, "grad_norm": 2.0832695960998535, "learning_rate": 2.389922116423241e-05, "loss": 0.7836, "step": 910000 }, { "epoch": 1.5669071953823992, "grad_norm": 2.2311220169067383, "learning_rate": 2.3884880076960012e-05, "loss": 0.7729, "step": 910500 }, { "epoch": 1.5677676606187434, "grad_norm": 1.9575518369674683, "learning_rate": 2.3870538989687612e-05, "loss": 0.7852, "step": 911000 }, { "epoch": 1.5686281258550874, "grad_norm": 2.2378692626953125, "learning_rate": 2.3856197902415213e-05, "loss": 0.775, "step": 911500 }, { "epoch": 1.5694885910914314, "grad_norm": 2.328058958053589, "learning_rate": 2.3841856815142813e-05, "loss": 0.7738, "step": 912000 }, { "epoch": 1.5703490563277753, "grad_norm": 2.132398843765259, "learning_rate": 2.382751572787041e-05, "loss": 0.7826, "step": 912500 }, { "epoch": 1.5712095215641193, "grad_norm": 2.2009437084198, "learning_rate": 2.3813174640598014e-05, "loss": 0.7766, "step": 913000 }, { "epoch": 1.5720699868004633, "grad_norm": 2.1141409873962402, "learning_rate": 2.3798833553325615e-05, "loss": 0.7776, "step": 913500 }, { "epoch": 1.5729304520368073, "grad_norm": 2.1866352558135986, "learning_rate": 2.3784492466053212e-05, "loss": 0.7801, "step": 914000 }, { "epoch": 1.5737909172731512, "grad_norm": 2.5267555713653564, "learning_rate": 2.3770151378780812e-05, "loss": 0.7754, "step": 914500 }, { "epoch": 1.5746513825094952, "grad_norm": 2.0585038661956787, "learning_rate": 2.3755810291508416e-05, "loss": 0.7799, "step": 915000 }, { "epoch": 1.5755118477458392, "grad_norm": 2.211273670196533, "learning_rate": 2.3741469204236013e-05, "loss": 0.7758, "step": 915500 }, { "epoch": 1.5763723129821832, "grad_norm": 2.3003201484680176, "learning_rate": 2.3727128116963614e-05, "loss": 0.7742, "step": 916000 }, { "epoch": 1.5772327782185271, "grad_norm": 2.176165819168091, "learning_rate": 2.3712787029691214e-05, "loss": 0.7792, "step": 916500 }, { "epoch": 1.5780932434548713, "grad_norm": 2.0095202922821045, "learning_rate": 2.3698445942418815e-05, "loss": 0.7768, "step": 917000 }, { "epoch": 1.5789537086912153, "grad_norm": 1.853319764137268, "learning_rate": 2.3684104855146415e-05, "loss": 0.7828, "step": 917500 }, { "epoch": 1.5798141739275593, "grad_norm": 1.9998551607131958, "learning_rate": 2.3669763767874016e-05, "loss": 0.775, "step": 918000 }, { "epoch": 1.5806746391639033, "grad_norm": 2.039543867111206, "learning_rate": 2.3655422680601616e-05, "loss": 0.7798, "step": 918500 }, { "epoch": 1.5815351044002472, "grad_norm": 1.8610059022903442, "learning_rate": 2.3641081593329213e-05, "loss": 0.7767, "step": 919000 }, { "epoch": 1.5823955696365912, "grad_norm": 2.313720703125, "learning_rate": 2.3626740506056817e-05, "loss": 0.7792, "step": 919500 }, { "epoch": 1.5832560348729352, "grad_norm": 1.9850460290908813, "learning_rate": 2.3612399418784417e-05, "loss": 0.7795, "step": 920000 }, { "epoch": 1.5841165001092792, "grad_norm": 2.0019094944000244, "learning_rate": 2.3598058331512014e-05, "loss": 0.775, "step": 920500 }, { "epoch": 1.5849769653456232, "grad_norm": 2.1106672286987305, "learning_rate": 2.358371724423962e-05, "loss": 0.7683, "step": 921000 }, { "epoch": 1.5858374305819671, "grad_norm": 2.102208137512207, "learning_rate": 2.3569376156967215e-05, "loss": 0.775, "step": 921500 }, { "epoch": 1.586697895818311, "grad_norm": 2.1192538738250732, "learning_rate": 2.3555035069694816e-05, "loss": 0.7751, "step": 922000 }, { "epoch": 1.587558361054655, "grad_norm": 2.1068108081817627, "learning_rate": 2.354069398242242e-05, "loss": 0.775, "step": 922500 }, { "epoch": 1.588418826290999, "grad_norm": 1.947999358177185, "learning_rate": 2.3526352895150017e-05, "loss": 0.7753, "step": 923000 }, { "epoch": 1.589279291527343, "grad_norm": 2.0711350440979004, "learning_rate": 2.3512011807877617e-05, "loss": 0.7682, "step": 923500 }, { "epoch": 1.590139756763687, "grad_norm": 2.0397300720214844, "learning_rate": 2.3497670720605218e-05, "loss": 0.774, "step": 924000 }, { "epoch": 1.591000222000031, "grad_norm": 2.084848165512085, "learning_rate": 2.3483329633332818e-05, "loss": 0.7811, "step": 924500 }, { "epoch": 1.591860687236375, "grad_norm": 2.303589105606079, "learning_rate": 2.346898854606042e-05, "loss": 0.7776, "step": 925000 }, { "epoch": 1.592721152472719, "grad_norm": 2.401545524597168, "learning_rate": 2.345464745878802e-05, "loss": 0.7766, "step": 925500 }, { "epoch": 1.593581617709063, "grad_norm": 1.9984424114227295, "learning_rate": 2.344030637151562e-05, "loss": 0.7738, "step": 926000 }, { "epoch": 1.594442082945407, "grad_norm": 2.219634532928467, "learning_rate": 2.342596528424322e-05, "loss": 0.7714, "step": 926500 }, { "epoch": 1.5953025481817509, "grad_norm": 1.869746208190918, "learning_rate": 2.3411624196970817e-05, "loss": 0.7761, "step": 927000 }, { "epoch": 1.5961630134180949, "grad_norm": 2.093459129333496, "learning_rate": 2.339728310969842e-05, "loss": 0.7746, "step": 927500 }, { "epoch": 1.5970234786544388, "grad_norm": 2.0869009494781494, "learning_rate": 2.3382942022426018e-05, "loss": 0.7769, "step": 928000 }, { "epoch": 1.5978839438907828, "grad_norm": 1.8973535299301147, "learning_rate": 2.336860093515362e-05, "loss": 0.7741, "step": 928500 }, { "epoch": 1.5987444091271268, "grad_norm": 2.18300461769104, "learning_rate": 2.3354259847881222e-05, "loss": 0.7759, "step": 929000 }, { "epoch": 1.5996048743634708, "grad_norm": 2.473493814468384, "learning_rate": 2.333991876060882e-05, "loss": 0.7796, "step": 929500 }, { "epoch": 1.6004653395998147, "grad_norm": 1.9093661308288574, "learning_rate": 2.332557767333642e-05, "loss": 0.7755, "step": 930000 }, { "epoch": 1.6013258048361587, "grad_norm": 2.159060478210449, "learning_rate": 2.331123658606402e-05, "loss": 0.7775, "step": 930500 }, { "epoch": 1.6021862700725027, "grad_norm": 2.0607120990753174, "learning_rate": 2.329689549879162e-05, "loss": 0.769, "step": 931000 }, { "epoch": 1.6030467353088467, "grad_norm": 2.243135929107666, "learning_rate": 2.328255441151922e-05, "loss": 0.7767, "step": 931500 }, { "epoch": 1.6039072005451906, "grad_norm": 2.1867527961730957, "learning_rate": 2.3268213324246822e-05, "loss": 0.7775, "step": 932000 }, { "epoch": 1.6047676657815346, "grad_norm": 1.9539246559143066, "learning_rate": 2.3253872236974422e-05, "loss": 0.7721, "step": 932500 }, { "epoch": 1.6056281310178786, "grad_norm": 2.351499557495117, "learning_rate": 2.323953114970202e-05, "loss": 0.7676, "step": 933000 }, { "epoch": 1.6064885962542226, "grad_norm": 2.0803720951080322, "learning_rate": 2.3225190062429623e-05, "loss": 0.7755, "step": 933500 }, { "epoch": 1.6073490614905666, "grad_norm": 2.089561700820923, "learning_rate": 2.3210848975157224e-05, "loss": 0.7735, "step": 934000 }, { "epoch": 1.6082095267269108, "grad_norm": 2.2439472675323486, "learning_rate": 2.319650788788482e-05, "loss": 0.7818, "step": 934500 }, { "epoch": 1.6090699919632547, "grad_norm": 2.1251161098480225, "learning_rate": 2.3182166800612425e-05, "loss": 0.7733, "step": 935000 }, { "epoch": 1.6099304571995987, "grad_norm": 2.3418426513671875, "learning_rate": 2.3167825713340025e-05, "loss": 0.7765, "step": 935500 }, { "epoch": 1.6107909224359427, "grad_norm": 2.1627066135406494, "learning_rate": 2.3153484626067622e-05, "loss": 0.7717, "step": 936000 }, { "epoch": 1.6116513876722867, "grad_norm": 2.13486385345459, "learning_rate": 2.3139143538795223e-05, "loss": 0.7706, "step": 936500 }, { "epoch": 1.6125118529086306, "grad_norm": 1.9072346687316895, "learning_rate": 2.3124802451522823e-05, "loss": 0.775, "step": 937000 }, { "epoch": 1.6133723181449746, "grad_norm": 2.1810319423675537, "learning_rate": 2.3110461364250424e-05, "loss": 0.7739, "step": 937500 }, { "epoch": 1.6142327833813186, "grad_norm": 2.179624080657959, "learning_rate": 2.3096120276978024e-05, "loss": 0.7716, "step": 938000 }, { "epoch": 1.6150932486176626, "grad_norm": 2.184668779373169, "learning_rate": 2.3081779189705625e-05, "loss": 0.7727, "step": 938500 }, { "epoch": 1.6159537138540065, "grad_norm": 1.9632959365844727, "learning_rate": 2.3067438102433225e-05, "loss": 0.7714, "step": 939000 }, { "epoch": 1.6168141790903505, "grad_norm": 2.0400161743164062, "learning_rate": 2.3053097015160822e-05, "loss": 0.7764, "step": 939500 }, { "epoch": 1.6176746443266947, "grad_norm": 2.226841926574707, "learning_rate": 2.3038755927888426e-05, "loss": 0.7689, "step": 940000 }, { "epoch": 1.6185351095630387, "grad_norm": 2.1980738639831543, "learning_rate": 2.3024414840616027e-05, "loss": 0.7724, "step": 940500 }, { "epoch": 1.6193955747993827, "grad_norm": 2.0220143795013428, "learning_rate": 2.3010073753343624e-05, "loss": 0.7769, "step": 941000 }, { "epoch": 1.6202560400357267, "grad_norm": 2.0732641220092773, "learning_rate": 2.2995732666071227e-05, "loss": 0.7795, "step": 941500 }, { "epoch": 1.6211165052720706, "grad_norm": 1.9810514450073242, "learning_rate": 2.2981391578798825e-05, "loss": 0.775, "step": 942000 }, { "epoch": 1.6219769705084146, "grad_norm": 2.1949236392974854, "learning_rate": 2.2967050491526425e-05, "loss": 0.7668, "step": 942500 }, { "epoch": 1.6228374357447586, "grad_norm": 2.2184219360351562, "learning_rate": 2.295270940425403e-05, "loss": 0.774, "step": 943000 }, { "epoch": 1.6236979009811026, "grad_norm": 2.1051313877105713, "learning_rate": 2.2938368316981626e-05, "loss": 0.7711, "step": 943500 }, { "epoch": 1.6245583662174465, "grad_norm": 2.0821473598480225, "learning_rate": 2.2924027229709226e-05, "loss": 0.7824, "step": 944000 }, { "epoch": 1.6254188314537905, "grad_norm": 2.077939987182617, "learning_rate": 2.2909686142436827e-05, "loss": 0.7745, "step": 944500 }, { "epoch": 1.6262792966901345, "grad_norm": 2.041835069656372, "learning_rate": 2.2895345055164427e-05, "loss": 0.777, "step": 945000 }, { "epoch": 1.6271397619264785, "grad_norm": 2.2172017097473145, "learning_rate": 2.2881003967892028e-05, "loss": 0.7778, "step": 945500 }, { "epoch": 1.6280002271628224, "grad_norm": 2.0836751461029053, "learning_rate": 2.2866662880619628e-05, "loss": 0.7749, "step": 946000 }, { "epoch": 1.6288606923991664, "grad_norm": 1.9881094694137573, "learning_rate": 2.285232179334723e-05, "loss": 0.7735, "step": 946500 }, { "epoch": 1.6297211576355104, "grad_norm": 2.445798873901367, "learning_rate": 2.283798070607483e-05, "loss": 0.7689, "step": 947000 }, { "epoch": 1.6305816228718544, "grad_norm": 2.1970605850219727, "learning_rate": 2.282363961880243e-05, "loss": 0.7704, "step": 947500 }, { "epoch": 1.6314420881081984, "grad_norm": 2.2833495140075684, "learning_rate": 2.280929853153003e-05, "loss": 0.7726, "step": 948000 }, { "epoch": 1.6323025533445423, "grad_norm": 2.2963755130767822, "learning_rate": 2.2794957444257627e-05, "loss": 0.7721, "step": 948500 }, { "epoch": 1.6331630185808863, "grad_norm": 2.0407752990722656, "learning_rate": 2.2780616356985228e-05, "loss": 0.7708, "step": 949000 }, { "epoch": 1.6340234838172303, "grad_norm": 2.055379867553711, "learning_rate": 2.276627526971283e-05, "loss": 0.7735, "step": 949500 }, { "epoch": 1.6348839490535743, "grad_norm": 1.9972501993179321, "learning_rate": 2.275193418244043e-05, "loss": 0.766, "step": 950000 }, { "epoch": 1.6357444142899182, "grad_norm": 1.9890713691711426, "learning_rate": 2.273759309516803e-05, "loss": 0.7742, "step": 950500 }, { "epoch": 1.6366048795262622, "grad_norm": 2.0152220726013184, "learning_rate": 2.272325200789563e-05, "loss": 0.7722, "step": 951000 }, { "epoch": 1.6374653447626062, "grad_norm": 1.9014008045196533, "learning_rate": 2.270891092062323e-05, "loss": 0.7738, "step": 951500 }, { "epoch": 1.6383258099989502, "grad_norm": 2.1452271938323975, "learning_rate": 2.269456983335083e-05, "loss": 0.7766, "step": 952000 }, { "epoch": 1.6391862752352941, "grad_norm": 3.4031081199645996, "learning_rate": 2.268022874607843e-05, "loss": 0.7694, "step": 952500 }, { "epoch": 1.6400467404716381, "grad_norm": 2.1318936347961426, "learning_rate": 2.266588765880603e-05, "loss": 0.7699, "step": 953000 }, { "epoch": 1.640907205707982, "grad_norm": 1.984821081161499, "learning_rate": 2.265154657153363e-05, "loss": 0.7717, "step": 953500 }, { "epoch": 1.641767670944326, "grad_norm": 2.1505789756774902, "learning_rate": 2.2637205484261232e-05, "loss": 0.768, "step": 954000 }, { "epoch": 1.64262813618067, "grad_norm": 2.2152884006500244, "learning_rate": 2.2622864396988833e-05, "loss": 0.7697, "step": 954500 }, { "epoch": 1.643488601417014, "grad_norm": 2.0147619247436523, "learning_rate": 2.260852330971643e-05, "loss": 0.7761, "step": 955000 }, { "epoch": 1.644349066653358, "grad_norm": 1.9571027755737305, "learning_rate": 2.2594182222444034e-05, "loss": 0.775, "step": 955500 }, { "epoch": 1.645209531889702, "grad_norm": 2.1475048065185547, "learning_rate": 2.257984113517163e-05, "loss": 0.7704, "step": 956000 }, { "epoch": 1.646069997126046, "grad_norm": 2.149017333984375, "learning_rate": 2.256550004789923e-05, "loss": 0.7772, "step": 956500 }, { "epoch": 1.64693046236239, "grad_norm": 2.056325912475586, "learning_rate": 2.2551158960626835e-05, "loss": 0.7673, "step": 957000 }, { "epoch": 1.6477909275987341, "grad_norm": 2.0990519523620605, "learning_rate": 2.2536817873354432e-05, "loss": 0.777, "step": 957500 }, { "epoch": 1.6486513928350781, "grad_norm": 2.128030776977539, "learning_rate": 2.2522476786082033e-05, "loss": 0.7731, "step": 958000 }, { "epoch": 1.649511858071422, "grad_norm": 2.33375883102417, "learning_rate": 2.2508135698809633e-05, "loss": 0.767, "step": 958500 }, { "epoch": 1.650372323307766, "grad_norm": 2.3622138500213623, "learning_rate": 2.2493794611537234e-05, "loss": 0.7735, "step": 959000 }, { "epoch": 1.65123278854411, "grad_norm": 1.9758704900741577, "learning_rate": 2.2479453524264834e-05, "loss": 0.767, "step": 959500 }, { "epoch": 1.652093253780454, "grad_norm": 1.9486836194992065, "learning_rate": 2.2465112436992435e-05, "loss": 0.7764, "step": 960000 }, { "epoch": 1.652953719016798, "grad_norm": 2.24963641166687, "learning_rate": 2.2450771349720035e-05, "loss": 0.77, "step": 960500 }, { "epoch": 1.653814184253142, "grad_norm": 2.146021842956543, "learning_rate": 2.2436430262447636e-05, "loss": 0.7672, "step": 961000 }, { "epoch": 1.654674649489486, "grad_norm": 2.173103094100952, "learning_rate": 2.2422089175175233e-05, "loss": 0.7654, "step": 961500 }, { "epoch": 1.65553511472583, "grad_norm": 1.993402123451233, "learning_rate": 2.2407748087902837e-05, "loss": 0.7746, "step": 962000 }, { "epoch": 1.656395579962174, "grad_norm": 2.026460647583008, "learning_rate": 2.2393407000630434e-05, "loss": 0.7684, "step": 962500 }, { "epoch": 1.6572560451985179, "grad_norm": 2.0561816692352295, "learning_rate": 2.2379065913358034e-05, "loss": 0.7761, "step": 963000 }, { "epoch": 1.658116510434862, "grad_norm": 2.2467610836029053, "learning_rate": 2.2364724826085638e-05, "loss": 0.7726, "step": 963500 }, { "epoch": 1.658976975671206, "grad_norm": 2.3235909938812256, "learning_rate": 2.2350383738813235e-05, "loss": 0.7747, "step": 964000 }, { "epoch": 1.65983744090755, "grad_norm": 2.0124661922454834, "learning_rate": 2.2336042651540836e-05, "loss": 0.77, "step": 964500 }, { "epoch": 1.660697906143894, "grad_norm": 2.090353012084961, "learning_rate": 2.2321701564268436e-05, "loss": 0.7714, "step": 965000 }, { "epoch": 1.661558371380238, "grad_norm": 2.065913200378418, "learning_rate": 2.2307360476996036e-05, "loss": 0.7634, "step": 965500 }, { "epoch": 1.662418836616582, "grad_norm": 1.965511441230774, "learning_rate": 2.2293019389723637e-05, "loss": 0.7725, "step": 966000 }, { "epoch": 1.663279301852926, "grad_norm": 2.0912036895751953, "learning_rate": 2.2278678302451237e-05, "loss": 0.7749, "step": 966500 }, { "epoch": 1.66413976708927, "grad_norm": 2.1004321575164795, "learning_rate": 2.2264337215178838e-05, "loss": 0.7674, "step": 967000 }, { "epoch": 1.665000232325614, "grad_norm": 2.041999101638794, "learning_rate": 2.2249996127906435e-05, "loss": 0.7733, "step": 967500 }, { "epoch": 1.6658606975619579, "grad_norm": 2.0694518089294434, "learning_rate": 2.223565504063404e-05, "loss": 0.7739, "step": 968000 }, { "epoch": 1.6667211627983018, "grad_norm": 2.137014389038086, "learning_rate": 2.222131395336164e-05, "loss": 0.7684, "step": 968500 }, { "epoch": 1.6675816280346458, "grad_norm": 2.0307774543762207, "learning_rate": 2.2206972866089236e-05, "loss": 0.7685, "step": 969000 }, { "epoch": 1.6684420932709898, "grad_norm": 2.3049731254577637, "learning_rate": 2.219263177881684e-05, "loss": 0.771, "step": 969500 }, { "epoch": 1.6693025585073338, "grad_norm": 2.1394450664520264, "learning_rate": 2.217829069154444e-05, "loss": 0.7748, "step": 970000 }, { "epoch": 1.6701630237436778, "grad_norm": 2.100409507751465, "learning_rate": 2.2163949604272038e-05, "loss": 0.7703, "step": 970500 }, { "epoch": 1.6710234889800217, "grad_norm": 2.216879367828369, "learning_rate": 2.2149608516999638e-05, "loss": 0.7736, "step": 971000 }, { "epoch": 1.6718839542163657, "grad_norm": 2.220621109008789, "learning_rate": 2.213526742972724e-05, "loss": 0.7746, "step": 971500 }, { "epoch": 1.6727444194527097, "grad_norm": 2.032130241394043, "learning_rate": 2.212092634245484e-05, "loss": 0.7708, "step": 972000 }, { "epoch": 1.6736048846890537, "grad_norm": 2.065264940261841, "learning_rate": 2.210658525518244e-05, "loss": 0.772, "step": 972500 }, { "epoch": 1.6744653499253976, "grad_norm": 2.0372793674468994, "learning_rate": 2.209224416791004e-05, "loss": 0.7702, "step": 973000 }, { "epoch": 1.6753258151617416, "grad_norm": 2.292205333709717, "learning_rate": 2.207790308063764e-05, "loss": 0.7802, "step": 973500 }, { "epoch": 1.6761862803980856, "grad_norm": 1.9135103225708008, "learning_rate": 2.2063561993365238e-05, "loss": 0.7692, "step": 974000 }, { "epoch": 1.6770467456344296, "grad_norm": 1.9825247526168823, "learning_rate": 2.204922090609284e-05, "loss": 0.7747, "step": 974500 }, { "epoch": 1.6779072108707735, "grad_norm": 1.8973615169525146, "learning_rate": 2.2034879818820442e-05, "loss": 0.7717, "step": 975000 }, { "epoch": 1.6787676761071175, "grad_norm": 2.1794941425323486, "learning_rate": 2.202053873154804e-05, "loss": 0.7787, "step": 975500 }, { "epoch": 1.6796281413434615, "grad_norm": 2.13614559173584, "learning_rate": 2.2006197644275643e-05, "loss": 0.7687, "step": 976000 }, { "epoch": 1.6804886065798055, "grad_norm": 2.015418767929077, "learning_rate": 2.199185655700324e-05, "loss": 0.7716, "step": 976500 }, { "epoch": 1.6813490718161495, "grad_norm": 2.1459038257598877, "learning_rate": 2.197751546973084e-05, "loss": 0.7695, "step": 977000 }, { "epoch": 1.6822095370524934, "grad_norm": 2.061182975769043, "learning_rate": 2.1963174382458444e-05, "loss": 0.7659, "step": 977500 }, { "epoch": 1.6830700022888374, "grad_norm": 2.1469383239746094, "learning_rate": 2.194883329518604e-05, "loss": 0.7649, "step": 978000 }, { "epoch": 1.6839304675251814, "grad_norm": 2.144745111465454, "learning_rate": 2.1934492207913642e-05, "loss": 0.7725, "step": 978500 }, { "epoch": 1.6847909327615254, "grad_norm": 2.1422934532165527, "learning_rate": 2.1920151120641242e-05, "loss": 0.7703, "step": 979000 }, { "epoch": 1.6856513979978693, "grad_norm": 2.156254529953003, "learning_rate": 2.1905810033368843e-05, "loss": 0.7723, "step": 979500 }, { "epoch": 1.6865118632342133, "grad_norm": 1.8768316507339478, "learning_rate": 2.1891468946096443e-05, "loss": 0.7693, "step": 980000 }, { "epoch": 1.6873723284705575, "grad_norm": 2.1997392177581787, "learning_rate": 2.1877127858824044e-05, "loss": 0.7717, "step": 980500 }, { "epoch": 1.6882327937069015, "grad_norm": 2.163787603378296, "learning_rate": 2.1862786771551644e-05, "loss": 0.7724, "step": 981000 }, { "epoch": 1.6890932589432455, "grad_norm": 2.279360771179199, "learning_rate": 2.1848445684279245e-05, "loss": 0.7652, "step": 981500 }, { "epoch": 1.6899537241795894, "grad_norm": 1.8542513847351074, "learning_rate": 2.1834104597006845e-05, "loss": 0.7695, "step": 982000 }, { "epoch": 1.6908141894159334, "grad_norm": 2.083712100982666, "learning_rate": 2.1819763509734446e-05, "loss": 0.7726, "step": 982500 }, { "epoch": 1.6916746546522774, "grad_norm": 2.247375965118408, "learning_rate": 2.1805422422462043e-05, "loss": 0.7673, "step": 983000 }, { "epoch": 1.6925351198886214, "grad_norm": 2.1620235443115234, "learning_rate": 2.1791081335189643e-05, "loss": 0.7692, "step": 983500 }, { "epoch": 1.6933955851249654, "grad_norm": 2.0795235633850098, "learning_rate": 2.1776740247917247e-05, "loss": 0.7718, "step": 984000 }, { "epoch": 1.6942560503613093, "grad_norm": 2.1203017234802246, "learning_rate": 2.1762399160644844e-05, "loss": 0.7677, "step": 984500 }, { "epoch": 1.6951165155976533, "grad_norm": 4.682893753051758, "learning_rate": 2.1748058073372445e-05, "loss": 0.766, "step": 985000 }, { "epoch": 1.6959769808339973, "grad_norm": 2.2313523292541504, "learning_rate": 2.1733716986100045e-05, "loss": 0.7667, "step": 985500 }, { "epoch": 1.6968374460703413, "grad_norm": 2.307849407196045, "learning_rate": 2.1719375898827646e-05, "loss": 0.7674, "step": 986000 }, { "epoch": 1.6976979113066855, "grad_norm": 2.143303632736206, "learning_rate": 2.1705034811555246e-05, "loss": 0.7698, "step": 986500 }, { "epoch": 1.6985583765430294, "grad_norm": 2.1578030586242676, "learning_rate": 2.1690693724282847e-05, "loss": 0.7729, "step": 987000 }, { "epoch": 1.6994188417793734, "grad_norm": 2.201508045196533, "learning_rate": 2.1676352637010447e-05, "loss": 0.7677, "step": 987500 }, { "epoch": 1.7002793070157174, "grad_norm": 2.0309183597564697, "learning_rate": 2.1662011549738044e-05, "loss": 0.7733, "step": 988000 }, { "epoch": 1.7011397722520614, "grad_norm": 2.2656939029693604, "learning_rate": 2.1647670462465648e-05, "loss": 0.7658, "step": 988500 }, { "epoch": 1.7020002374884053, "grad_norm": 2.0198137760162354, "learning_rate": 2.163332937519325e-05, "loss": 0.7708, "step": 989000 }, { "epoch": 1.7028607027247493, "grad_norm": 2.3302905559539795, "learning_rate": 2.1618988287920846e-05, "loss": 0.7731, "step": 989500 }, { "epoch": 1.7037211679610933, "grad_norm": 1.9794892072677612, "learning_rate": 2.160464720064845e-05, "loss": 0.7635, "step": 990000 }, { "epoch": 1.7045816331974373, "grad_norm": 2.205071210861206, "learning_rate": 2.159030611337605e-05, "loss": 0.7698, "step": 990500 }, { "epoch": 1.7054420984337813, "grad_norm": 2.046074867248535, "learning_rate": 2.1575965026103647e-05, "loss": 0.7729, "step": 991000 }, { "epoch": 1.7063025636701252, "grad_norm": 2.0251247882843018, "learning_rate": 2.156162393883125e-05, "loss": 0.7605, "step": 991500 }, { "epoch": 1.7071630289064692, "grad_norm": 1.9598090648651123, "learning_rate": 2.1547282851558848e-05, "loss": 0.7742, "step": 992000 }, { "epoch": 1.7080234941428132, "grad_norm": 2.079505205154419, "learning_rate": 2.153294176428645e-05, "loss": 0.7694, "step": 992500 }, { "epoch": 1.7088839593791572, "grad_norm": 2.1227147579193115, "learning_rate": 2.151860067701405e-05, "loss": 0.7683, "step": 993000 }, { "epoch": 1.7097444246155011, "grad_norm": 1.9971901178359985, "learning_rate": 2.150425958974165e-05, "loss": 0.7697, "step": 993500 }, { "epoch": 1.7106048898518451, "grad_norm": 2.0586302280426025, "learning_rate": 2.148991850246925e-05, "loss": 0.766, "step": 994000 }, { "epoch": 1.711465355088189, "grad_norm": 1.9828028678894043, "learning_rate": 2.147557741519685e-05, "loss": 0.7666, "step": 994500 }, { "epoch": 1.712325820324533, "grad_norm": 1.8900585174560547, "learning_rate": 2.146123632792445e-05, "loss": 0.7647, "step": 995000 }, { "epoch": 1.713186285560877, "grad_norm": 2.2461373805999756, "learning_rate": 2.144689524065205e-05, "loss": 0.7719, "step": 995500 }, { "epoch": 1.714046750797221, "grad_norm": 2.4255807399749756, "learning_rate": 2.1432554153379648e-05, "loss": 0.7637, "step": 996000 }, { "epoch": 1.714907216033565, "grad_norm": 2.102754831314087, "learning_rate": 2.1418213066107252e-05, "loss": 0.7621, "step": 996500 }, { "epoch": 1.715767681269909, "grad_norm": 1.9721736907958984, "learning_rate": 2.140387197883485e-05, "loss": 0.7622, "step": 997000 }, { "epoch": 1.716628146506253, "grad_norm": 2.0540759563446045, "learning_rate": 2.138953089156245e-05, "loss": 0.7702, "step": 997500 }, { "epoch": 1.717488611742597, "grad_norm": 2.1698076725006104, "learning_rate": 2.1375189804290054e-05, "loss": 0.769, "step": 998000 }, { "epoch": 1.718349076978941, "grad_norm": 2.2580907344818115, "learning_rate": 2.136084871701765e-05, "loss": 0.7667, "step": 998500 }, { "epoch": 1.7192095422152849, "grad_norm": 2.198620080947876, "learning_rate": 2.134650762974525e-05, "loss": 0.7724, "step": 999000 }, { "epoch": 1.7200700074516289, "grad_norm": 2.2969367504119873, "learning_rate": 2.133216654247285e-05, "loss": 0.7674, "step": 999500 }, { "epoch": 1.7209304726879728, "grad_norm": 2.1570844650268555, "learning_rate": 2.1317825455200452e-05, "loss": 0.7639, "step": 1000000 }, { "epoch": 1.7217909379243168, "grad_norm": 1.9657459259033203, "learning_rate": 2.1303484367928052e-05, "loss": 0.7672, "step": 1000500 }, { "epoch": 1.7226514031606608, "grad_norm": 1.9201055765151978, "learning_rate": 2.1289143280655653e-05, "loss": 0.7685, "step": 1001000 }, { "epoch": 1.7235118683970048, "grad_norm": 2.067403793334961, "learning_rate": 2.1274802193383253e-05, "loss": 0.7713, "step": 1001500 }, { "epoch": 1.7243723336333487, "grad_norm": 2.0172691345214844, "learning_rate": 2.1260461106110854e-05, "loss": 0.7662, "step": 1002000 }, { "epoch": 1.7252327988696927, "grad_norm": 2.24025821685791, "learning_rate": 2.1246120018838454e-05, "loss": 0.7658, "step": 1002500 }, { "epoch": 1.7260932641060367, "grad_norm": 2.196293830871582, "learning_rate": 2.1231778931566055e-05, "loss": 0.7639, "step": 1003000 }, { "epoch": 1.7269537293423807, "grad_norm": 2.160950183868408, "learning_rate": 2.1217437844293652e-05, "loss": 0.7676, "step": 1003500 }, { "epoch": 1.7278141945787249, "grad_norm": 2.226396083831787, "learning_rate": 2.1203096757021256e-05, "loss": 0.7659, "step": 1004000 }, { "epoch": 1.7286746598150688, "grad_norm": 2.082913637161255, "learning_rate": 2.1188755669748856e-05, "loss": 0.7687, "step": 1004500 }, { "epoch": 1.7295351250514128, "grad_norm": 2.2899117469787598, "learning_rate": 2.1174414582476453e-05, "loss": 0.7672, "step": 1005000 }, { "epoch": 1.7303955902877568, "grad_norm": 2.0957179069519043, "learning_rate": 2.1160073495204054e-05, "loss": 0.7608, "step": 1005500 }, { "epoch": 1.7312560555241008, "grad_norm": 2.2185802459716797, "learning_rate": 2.1145732407931654e-05, "loss": 0.7669, "step": 1006000 }, { "epoch": 1.7321165207604448, "grad_norm": 2.14453125, "learning_rate": 2.1131391320659255e-05, "loss": 0.7616, "step": 1006500 }, { "epoch": 1.7329769859967887, "grad_norm": 2.0426855087280273, "learning_rate": 2.1117050233386855e-05, "loss": 0.7609, "step": 1007000 }, { "epoch": 1.7338374512331327, "grad_norm": 2.1771767139434814, "learning_rate": 2.1102709146114456e-05, "loss": 0.7683, "step": 1007500 }, { "epoch": 1.7346979164694767, "grad_norm": 2.215662956237793, "learning_rate": 2.1088368058842056e-05, "loss": 0.7644, "step": 1008000 }, { "epoch": 1.7355583817058207, "grad_norm": 2.010657548904419, "learning_rate": 2.1074026971569653e-05, "loss": 0.762, "step": 1008500 }, { "epoch": 1.7364188469421646, "grad_norm": 2.2185544967651367, "learning_rate": 2.1059685884297257e-05, "loss": 0.7663, "step": 1009000 }, { "epoch": 1.7372793121785088, "grad_norm": 1.9308372735977173, "learning_rate": 2.1045344797024858e-05, "loss": 0.7642, "step": 1009500 }, { "epoch": 1.7381397774148528, "grad_norm": 2.116773843765259, "learning_rate": 2.1031003709752455e-05, "loss": 0.7688, "step": 1010000 }, { "epoch": 1.7390002426511968, "grad_norm": 2.2500100135803223, "learning_rate": 2.101666262248006e-05, "loss": 0.7627, "step": 1010500 }, { "epoch": 1.7398607078875408, "grad_norm": 2.222611665725708, "learning_rate": 2.1002321535207656e-05, "loss": 0.7697, "step": 1011000 }, { "epoch": 1.7407211731238847, "grad_norm": 2.250838279724121, "learning_rate": 2.0987980447935256e-05, "loss": 0.7672, "step": 1011500 }, { "epoch": 1.7415816383602287, "grad_norm": 2.3026421070098877, "learning_rate": 2.097363936066286e-05, "loss": 0.7621, "step": 1012000 }, { "epoch": 1.7424421035965727, "grad_norm": 2.1122705936431885, "learning_rate": 2.0959298273390457e-05, "loss": 0.7667, "step": 1012500 }, { "epoch": 1.7433025688329167, "grad_norm": 2.2275261878967285, "learning_rate": 2.0944957186118057e-05, "loss": 0.767, "step": 1013000 }, { "epoch": 1.7441630340692607, "grad_norm": 2.136019468307495, "learning_rate": 2.0930616098845658e-05, "loss": 0.7684, "step": 1013500 }, { "epoch": 1.7450234993056046, "grad_norm": 2.2779674530029297, "learning_rate": 2.091627501157326e-05, "loss": 0.7647, "step": 1014000 }, { "epoch": 1.7458839645419486, "grad_norm": 2.158823013305664, "learning_rate": 2.090193392430086e-05, "loss": 0.7689, "step": 1014500 }, { "epoch": 1.7467444297782926, "grad_norm": 2.0115678310394287, "learning_rate": 2.088759283702846e-05, "loss": 0.7649, "step": 1015000 }, { "epoch": 1.7476048950146366, "grad_norm": 2.3010997772216797, "learning_rate": 2.087325174975606e-05, "loss": 0.7648, "step": 1015500 }, { "epoch": 1.7484653602509805, "grad_norm": 2.1609082221984863, "learning_rate": 2.085891066248366e-05, "loss": 0.7704, "step": 1016000 }, { "epoch": 1.7493258254873245, "grad_norm": 2.122830390930176, "learning_rate": 2.084456957521126e-05, "loss": 0.7663, "step": 1016500 }, { "epoch": 1.7501862907236685, "grad_norm": 2.0455617904663086, "learning_rate": 2.083022848793886e-05, "loss": 0.7591, "step": 1017000 }, { "epoch": 1.7510467559600125, "grad_norm": 2.0559451580047607, "learning_rate": 2.0815887400666458e-05, "loss": 0.7646, "step": 1017500 }, { "epoch": 1.7519072211963564, "grad_norm": 2.1012825965881348, "learning_rate": 2.080154631339406e-05, "loss": 0.7634, "step": 1018000 }, { "epoch": 1.7527676864327004, "grad_norm": 2.210088014602661, "learning_rate": 2.0787205226121663e-05, "loss": 0.7707, "step": 1018500 }, { "epoch": 1.7536281516690444, "grad_norm": 2.0194787979125977, "learning_rate": 2.077286413884926e-05, "loss": 0.7653, "step": 1019000 }, { "epoch": 1.7544886169053884, "grad_norm": 2.322936773300171, "learning_rate": 2.075852305157686e-05, "loss": 0.7741, "step": 1019500 }, { "epoch": 1.7553490821417324, "grad_norm": 2.146965980529785, "learning_rate": 2.074418196430446e-05, "loss": 0.7677, "step": 1020000 }, { "epoch": 1.7562095473780763, "grad_norm": 1.9335236549377441, "learning_rate": 2.072984087703206e-05, "loss": 0.7694, "step": 1020500 }, { "epoch": 1.7570700126144203, "grad_norm": 1.9779301881790161, "learning_rate": 2.071549978975966e-05, "loss": 0.7611, "step": 1021000 }, { "epoch": 1.7579304778507643, "grad_norm": 2.1787054538726807, "learning_rate": 2.0701158702487262e-05, "loss": 0.7725, "step": 1021500 }, { "epoch": 1.7587909430871083, "grad_norm": 2.254455089569092, "learning_rate": 2.0686817615214863e-05, "loss": 0.7672, "step": 1022000 }, { "epoch": 1.7596514083234522, "grad_norm": 2.1716644763946533, "learning_rate": 2.067247652794246e-05, "loss": 0.7624, "step": 1022500 }, { "epoch": 1.7605118735597962, "grad_norm": 2.0068957805633545, "learning_rate": 2.0658135440670063e-05, "loss": 0.7635, "step": 1023000 }, { "epoch": 1.7613723387961402, "grad_norm": 2.0849902629852295, "learning_rate": 2.0643794353397664e-05, "loss": 0.766, "step": 1023500 }, { "epoch": 1.7622328040324842, "grad_norm": 2.2821147441864014, "learning_rate": 2.062945326612526e-05, "loss": 0.7705, "step": 1024000 }, { "epoch": 1.7630932692688281, "grad_norm": 2.0670814514160156, "learning_rate": 2.0615112178852865e-05, "loss": 0.7722, "step": 1024500 }, { "epoch": 1.7639537345051721, "grad_norm": 2.3591463565826416, "learning_rate": 2.0600771091580465e-05, "loss": 0.7671, "step": 1025000 }, { "epoch": 1.764814199741516, "grad_norm": 1.9846746921539307, "learning_rate": 2.0586430004308062e-05, "loss": 0.7705, "step": 1025500 }, { "epoch": 1.76567466497786, "grad_norm": 1.9745829105377197, "learning_rate": 2.0572088917035663e-05, "loss": 0.7669, "step": 1026000 }, { "epoch": 1.766535130214204, "grad_norm": 1.9543824195861816, "learning_rate": 2.0557747829763263e-05, "loss": 0.7687, "step": 1026500 }, { "epoch": 1.7673955954505483, "grad_norm": 2.1610777378082275, "learning_rate": 2.0543406742490864e-05, "loss": 0.7659, "step": 1027000 }, { "epoch": 1.7682560606868922, "grad_norm": 2.0030741691589355, "learning_rate": 2.0529065655218464e-05, "loss": 0.7676, "step": 1027500 }, { "epoch": 1.7691165259232362, "grad_norm": 2.3656604290008545, "learning_rate": 2.0514724567946065e-05, "loss": 0.7666, "step": 1028000 }, { "epoch": 1.7699769911595802, "grad_norm": 2.1445231437683105, "learning_rate": 2.0500383480673665e-05, "loss": 0.7697, "step": 1028500 }, { "epoch": 1.7708374563959242, "grad_norm": 1.9734889268875122, "learning_rate": 2.0486042393401266e-05, "loss": 0.7618, "step": 1029000 }, { "epoch": 1.7716979216322681, "grad_norm": 2.2043867111206055, "learning_rate": 2.0471701306128866e-05, "loss": 0.765, "step": 1029500 }, { "epoch": 1.7725583868686121, "grad_norm": 2.0947585105895996, "learning_rate": 2.0457360218856467e-05, "loss": 0.7618, "step": 1030000 }, { "epoch": 1.773418852104956, "grad_norm": 2.01607346534729, "learning_rate": 2.0443019131584064e-05, "loss": 0.7632, "step": 1030500 }, { "epoch": 1.7742793173413, "grad_norm": 1.974541425704956, "learning_rate": 2.0428678044311668e-05, "loss": 0.7632, "step": 1031000 }, { "epoch": 1.775139782577644, "grad_norm": 1.9963077306747437, "learning_rate": 2.0414336957039265e-05, "loss": 0.7654, "step": 1031500 }, { "epoch": 1.776000247813988, "grad_norm": 2.0639266967773438, "learning_rate": 2.0399995869766865e-05, "loss": 0.7696, "step": 1032000 }, { "epoch": 1.7768607130503322, "grad_norm": 2.184873580932617, "learning_rate": 2.038565478249447e-05, "loss": 0.7646, "step": 1032500 }, { "epoch": 1.7777211782866762, "grad_norm": 2.1645946502685547, "learning_rate": 2.0371313695222066e-05, "loss": 0.7612, "step": 1033000 }, { "epoch": 1.7785816435230202, "grad_norm": 2.45359206199646, "learning_rate": 2.0356972607949667e-05, "loss": 0.763, "step": 1033500 }, { "epoch": 1.7794421087593641, "grad_norm": 2.0774848461151123, "learning_rate": 2.0342631520677267e-05, "loss": 0.7629, "step": 1034000 }, { "epoch": 1.7803025739957081, "grad_norm": 2.261568546295166, "learning_rate": 2.0328290433404868e-05, "loss": 0.759, "step": 1034500 }, { "epoch": 1.781163039232052, "grad_norm": 2.4625566005706787, "learning_rate": 2.0313949346132468e-05, "loss": 0.7669, "step": 1035000 }, { "epoch": 1.782023504468396, "grad_norm": 2.1339073181152344, "learning_rate": 2.029960825886007e-05, "loss": 0.7582, "step": 1035500 }, { "epoch": 1.78288396970474, "grad_norm": 2.3074862957000732, "learning_rate": 2.028526717158767e-05, "loss": 0.7651, "step": 1036000 }, { "epoch": 1.783744434941084, "grad_norm": 2.2727060317993164, "learning_rate": 2.027092608431527e-05, "loss": 0.7684, "step": 1036500 }, { "epoch": 1.784604900177428, "grad_norm": 2.086061716079712, "learning_rate": 2.025658499704287e-05, "loss": 0.7695, "step": 1037000 }, { "epoch": 1.785465365413772, "grad_norm": 2.036452293395996, "learning_rate": 2.024224390977047e-05, "loss": 0.7679, "step": 1037500 }, { "epoch": 1.786325830650116, "grad_norm": 2.2285068035125732, "learning_rate": 2.0227902822498067e-05, "loss": 0.7566, "step": 1038000 }, { "epoch": 1.78718629588646, "grad_norm": 2.1766035556793213, "learning_rate": 2.0213561735225668e-05, "loss": 0.7636, "step": 1038500 }, { "epoch": 1.788046761122804, "grad_norm": 2.3141143321990967, "learning_rate": 2.0199220647953272e-05, "loss": 0.762, "step": 1039000 }, { "epoch": 1.788907226359148, "grad_norm": 2.2814242839813232, "learning_rate": 2.018487956068087e-05, "loss": 0.7609, "step": 1039500 }, { "epoch": 1.7897676915954919, "grad_norm": 2.2914586067199707, "learning_rate": 2.017053847340847e-05, "loss": 0.7658, "step": 1040000 }, { "epoch": 1.7906281568318358, "grad_norm": 2.054856300354004, "learning_rate": 2.015619738613607e-05, "loss": 0.765, "step": 1040500 }, { "epoch": 1.7914886220681798, "grad_norm": 2.02543568611145, "learning_rate": 2.014185629886367e-05, "loss": 0.7657, "step": 1041000 }, { "epoch": 1.7923490873045238, "grad_norm": 2.087737560272217, "learning_rate": 2.012751521159127e-05, "loss": 0.763, "step": 1041500 }, { "epoch": 1.7932095525408678, "grad_norm": 2.2256555557250977, "learning_rate": 2.011317412431887e-05, "loss": 0.761, "step": 1042000 }, { "epoch": 1.7940700177772118, "grad_norm": 2.0341453552246094, "learning_rate": 2.009883303704647e-05, "loss": 0.7623, "step": 1042500 }, { "epoch": 1.7949304830135557, "grad_norm": 2.1734702587127686, "learning_rate": 2.008449194977407e-05, "loss": 0.7615, "step": 1043000 }, { "epoch": 1.7957909482498997, "grad_norm": 1.8783676624298096, "learning_rate": 2.0070150862501673e-05, "loss": 0.7649, "step": 1043500 }, { "epoch": 1.7966514134862437, "grad_norm": 2.2101192474365234, "learning_rate": 2.0055809775229273e-05, "loss": 0.7649, "step": 1044000 }, { "epoch": 1.7975118787225877, "grad_norm": 2.016972780227661, "learning_rate": 2.004146868795687e-05, "loss": 0.7634, "step": 1044500 }, { "epoch": 1.7983723439589316, "grad_norm": 33.173458099365234, "learning_rate": 2.0027127600684474e-05, "loss": 0.7701, "step": 1045000 }, { "epoch": 1.7992328091952756, "grad_norm": 2.1872565746307373, "learning_rate": 2.001278651341207e-05, "loss": 0.7631, "step": 1045500 }, { "epoch": 1.8000932744316196, "grad_norm": 2.1605005264282227, "learning_rate": 1.999844542613967e-05, "loss": 0.7659, "step": 1046000 }, { "epoch": 1.8009537396679636, "grad_norm": 2.3672409057617188, "learning_rate": 1.9984104338867275e-05, "loss": 0.7599, "step": 1046500 }, { "epoch": 1.8018142049043075, "grad_norm": 2.111316680908203, "learning_rate": 1.9969763251594873e-05, "loss": 0.7609, "step": 1047000 }, { "epoch": 1.8026746701406515, "grad_norm": 2.227252721786499, "learning_rate": 1.9955422164322473e-05, "loss": 0.7632, "step": 1047500 }, { "epoch": 1.8035351353769955, "grad_norm": 2.072519302368164, "learning_rate": 1.9941081077050073e-05, "loss": 0.7644, "step": 1048000 }, { "epoch": 1.8043956006133395, "grad_norm": 2.0403823852539062, "learning_rate": 1.9926739989777674e-05, "loss": 0.761, "step": 1048500 }, { "epoch": 1.8052560658496835, "grad_norm": 6.927855968475342, "learning_rate": 1.9912398902505274e-05, "loss": 0.7598, "step": 1049000 }, { "epoch": 1.8061165310860274, "grad_norm": 2.0393149852752686, "learning_rate": 1.9898057815232875e-05, "loss": 0.7702, "step": 1049500 }, { "epoch": 1.8069769963223716, "grad_norm": 2.066805124282837, "learning_rate": 1.9883716727960475e-05, "loss": 0.7655, "step": 1050000 }, { "epoch": 1.8078374615587156, "grad_norm": 2.2270755767822266, "learning_rate": 1.9869375640688076e-05, "loss": 0.7641, "step": 1050500 }, { "epoch": 1.8086979267950596, "grad_norm": 2.265721082687378, "learning_rate": 1.9855034553415673e-05, "loss": 0.7644, "step": 1051000 }, { "epoch": 1.8095583920314036, "grad_norm": 2.069361448287964, "learning_rate": 1.9840693466143277e-05, "loss": 0.759, "step": 1051500 }, { "epoch": 1.8104188572677475, "grad_norm": 2.284576177597046, "learning_rate": 1.9826352378870874e-05, "loss": 0.7663, "step": 1052000 }, { "epoch": 1.8112793225040915, "grad_norm": 2.2018840312957764, "learning_rate": 1.9812011291598474e-05, "loss": 0.7661, "step": 1052500 }, { "epoch": 1.8121397877404355, "grad_norm": 2.477423667907715, "learning_rate": 1.9797670204326078e-05, "loss": 0.7638, "step": 1053000 }, { "epoch": 1.8130002529767795, "grad_norm": 7.185637474060059, "learning_rate": 1.9783329117053675e-05, "loss": 0.7734, "step": 1053500 }, { "epoch": 1.8138607182131234, "grad_norm": 2.8006057739257812, "learning_rate": 1.9768988029781276e-05, "loss": 0.7602, "step": 1054000 }, { "epoch": 1.8147211834494674, "grad_norm": 2.2117958068847656, "learning_rate": 1.9754646942508876e-05, "loss": 0.7638, "step": 1054500 }, { "epoch": 1.8155816486858114, "grad_norm": 2.083202362060547, "learning_rate": 1.9740305855236477e-05, "loss": 0.7673, "step": 1055000 }, { "epoch": 1.8164421139221554, "grad_norm": 2.132225751876831, "learning_rate": 1.9725964767964077e-05, "loss": 0.7708, "step": 1055500 }, { "epoch": 1.8173025791584996, "grad_norm": 2.12669038772583, "learning_rate": 1.9711623680691678e-05, "loss": 0.7627, "step": 1056000 }, { "epoch": 1.8181630443948436, "grad_norm": 1.9951871633529663, "learning_rate": 1.9697282593419278e-05, "loss": 0.7612, "step": 1056500 }, { "epoch": 1.8190235096311875, "grad_norm": 2.106513738632202, "learning_rate": 1.968294150614688e-05, "loss": 0.7627, "step": 1057000 }, { "epoch": 1.8198839748675315, "grad_norm": 2.035362482070923, "learning_rate": 1.966860041887448e-05, "loss": 0.765, "step": 1057500 }, { "epoch": 1.8207444401038755, "grad_norm": 2.076943874359131, "learning_rate": 1.965425933160208e-05, "loss": 0.7668, "step": 1058000 }, { "epoch": 1.8216049053402195, "grad_norm": 2.2097012996673584, "learning_rate": 1.9639918244329677e-05, "loss": 0.7589, "step": 1058500 }, { "epoch": 1.8224653705765634, "grad_norm": 2.264671802520752, "learning_rate": 1.962557715705728e-05, "loss": 0.7628, "step": 1059000 }, { "epoch": 1.8233258358129074, "grad_norm": 2.2609753608703613, "learning_rate": 1.961123606978488e-05, "loss": 0.7682, "step": 1059500 }, { "epoch": 1.8241863010492514, "grad_norm": 2.274721384048462, "learning_rate": 1.9596894982512478e-05, "loss": 0.758, "step": 1060000 }, { "epoch": 1.8250467662855954, "grad_norm": 2.301239252090454, "learning_rate": 1.958255389524008e-05, "loss": 0.7665, "step": 1060500 }, { "epoch": 1.8259072315219393, "grad_norm": 2.0700104236602783, "learning_rate": 1.956821280796768e-05, "loss": 0.757, "step": 1061000 }, { "epoch": 1.8267676967582833, "grad_norm": 2.252894639968872, "learning_rate": 1.955387172069528e-05, "loss": 0.7589, "step": 1061500 }, { "epoch": 1.8276281619946273, "grad_norm": 2.1328372955322266, "learning_rate": 1.953953063342288e-05, "loss": 0.7634, "step": 1062000 }, { "epoch": 1.8284886272309713, "grad_norm": 2.389291763305664, "learning_rate": 1.952518954615048e-05, "loss": 0.7658, "step": 1062500 }, { "epoch": 1.8293490924673153, "grad_norm": 2.197857141494751, "learning_rate": 1.951084845887808e-05, "loss": 0.764, "step": 1063000 }, { "epoch": 1.8302095577036592, "grad_norm": 1.887812614440918, "learning_rate": 1.9496507371605678e-05, "loss": 0.7606, "step": 1063500 }, { "epoch": 1.8310700229400032, "grad_norm": 2.0983939170837402, "learning_rate": 1.9482166284333282e-05, "loss": 0.7591, "step": 1064000 }, { "epoch": 1.8319304881763472, "grad_norm": 2.231890916824341, "learning_rate": 1.9467825197060882e-05, "loss": 0.7644, "step": 1064500 }, { "epoch": 1.8327909534126912, "grad_norm": 2.0913729667663574, "learning_rate": 1.945348410978848e-05, "loss": 0.7675, "step": 1065000 }, { "epoch": 1.8336514186490351, "grad_norm": 2.2117438316345215, "learning_rate": 1.9439143022516083e-05, "loss": 0.7549, "step": 1065500 }, { "epoch": 1.8345118838853791, "grad_norm": 2.160916805267334, "learning_rate": 1.942480193524368e-05, "loss": 0.7601, "step": 1066000 }, { "epoch": 1.835372349121723, "grad_norm": 2.1662871837615967, "learning_rate": 1.941046084797128e-05, "loss": 0.7615, "step": 1066500 }, { "epoch": 1.836232814358067, "grad_norm": 2.103234052658081, "learning_rate": 1.9396119760698885e-05, "loss": 0.7571, "step": 1067000 }, { "epoch": 1.837093279594411, "grad_norm": 2.0720338821411133, "learning_rate": 1.938177867342648e-05, "loss": 0.7576, "step": 1067500 }, { "epoch": 1.837953744830755, "grad_norm": 2.079674005508423, "learning_rate": 1.9367437586154082e-05, "loss": 0.7621, "step": 1068000 }, { "epoch": 1.838814210067099, "grad_norm": 2.1196000576019287, "learning_rate": 1.9353096498881686e-05, "loss": 0.7627, "step": 1068500 }, { "epoch": 1.839674675303443, "grad_norm": 3.3423213958740234, "learning_rate": 1.9338755411609283e-05, "loss": 0.7632, "step": 1069000 }, { "epoch": 1.840535140539787, "grad_norm": 2.1786093711853027, "learning_rate": 1.9324414324336884e-05, "loss": 0.7593, "step": 1069500 }, { "epoch": 1.841395605776131, "grad_norm": 1.9805326461791992, "learning_rate": 1.9310073237064484e-05, "loss": 0.7664, "step": 1070000 }, { "epoch": 1.842256071012475, "grad_norm": 2.258626699447632, "learning_rate": 1.9295732149792084e-05, "loss": 0.7596, "step": 1070500 }, { "epoch": 1.8431165362488189, "grad_norm": 1.964232087135315, "learning_rate": 1.9281391062519685e-05, "loss": 0.763, "step": 1071000 }, { "epoch": 1.8439770014851629, "grad_norm": 2.0664947032928467, "learning_rate": 1.9267049975247285e-05, "loss": 0.7645, "step": 1071500 }, { "epoch": 1.8448374667215068, "grad_norm": 2.0968148708343506, "learning_rate": 1.9252708887974886e-05, "loss": 0.7673, "step": 1072000 }, { "epoch": 1.8456979319578508, "grad_norm": 2.034278154373169, "learning_rate": 1.9238367800702483e-05, "loss": 0.7614, "step": 1072500 }, { "epoch": 1.846558397194195, "grad_norm": 2.2998197078704834, "learning_rate": 1.9224026713430083e-05, "loss": 0.7549, "step": 1073000 }, { "epoch": 1.847418862430539, "grad_norm": 2.016671657562256, "learning_rate": 1.9209685626157687e-05, "loss": 0.7577, "step": 1073500 }, { "epoch": 1.848279327666883, "grad_norm": 1.9989349842071533, "learning_rate": 1.9195344538885284e-05, "loss": 0.7543, "step": 1074000 }, { "epoch": 1.849139792903227, "grad_norm": 2.155221462249756, "learning_rate": 1.9181003451612885e-05, "loss": 0.7628, "step": 1074500 }, { "epoch": 1.850000258139571, "grad_norm": 2.4140572547912598, "learning_rate": 1.9166662364340485e-05, "loss": 0.7603, "step": 1075000 }, { "epoch": 1.850860723375915, "grad_norm": 1.9228155612945557, "learning_rate": 1.9152321277068086e-05, "loss": 0.7645, "step": 1075500 }, { "epoch": 1.8517211886122589, "grad_norm": 2.0937721729278564, "learning_rate": 1.9137980189795686e-05, "loss": 0.7633, "step": 1076000 }, { "epoch": 1.8525816538486028, "grad_norm": 2.3551013469696045, "learning_rate": 1.9123639102523287e-05, "loss": 0.7578, "step": 1076500 }, { "epoch": 1.8534421190849468, "grad_norm": 2.13285231590271, "learning_rate": 1.9109298015250887e-05, "loss": 0.7586, "step": 1077000 }, { "epoch": 1.8543025843212908, "grad_norm": 2.2685706615448, "learning_rate": 1.9094956927978484e-05, "loss": 0.7595, "step": 1077500 }, { "epoch": 1.8551630495576348, "grad_norm": 2.0785560607910156, "learning_rate": 1.9080615840706088e-05, "loss": 0.7588, "step": 1078000 }, { "epoch": 1.8560235147939788, "grad_norm": 2.3855340480804443, "learning_rate": 1.906627475343369e-05, "loss": 0.7569, "step": 1078500 }, { "epoch": 1.856883980030323, "grad_norm": 2.313601016998291, "learning_rate": 1.9051933666161286e-05, "loss": 0.7609, "step": 1079000 }, { "epoch": 1.857744445266667, "grad_norm": 2.230159044265747, "learning_rate": 1.903759257888889e-05, "loss": 0.7609, "step": 1079500 }, { "epoch": 1.858604910503011, "grad_norm": 2.331984519958496, "learning_rate": 1.902325149161649e-05, "loss": 0.7627, "step": 1080000 }, { "epoch": 1.8594653757393549, "grad_norm": 1.8024656772613525, "learning_rate": 1.9008910404344087e-05, "loss": 0.7533, "step": 1080500 }, { "epoch": 1.8603258409756989, "grad_norm": 2.5354342460632324, "learning_rate": 1.899456931707169e-05, "loss": 0.7596, "step": 1081000 }, { "epoch": 1.8611863062120428, "grad_norm": 2.176863193511963, "learning_rate": 1.8980228229799288e-05, "loss": 0.757, "step": 1081500 }, { "epoch": 1.8620467714483868, "grad_norm": 2.0789923667907715, "learning_rate": 1.896588714252689e-05, "loss": 0.762, "step": 1082000 }, { "epoch": 1.8629072366847308, "grad_norm": 2.1080963611602783, "learning_rate": 1.895154605525449e-05, "loss": 0.7577, "step": 1082500 }, { "epoch": 1.8637677019210748, "grad_norm": 1.9690420627593994, "learning_rate": 1.893720496798209e-05, "loss": 0.7602, "step": 1083000 }, { "epoch": 1.8646281671574187, "grad_norm": 2.039321184158325, "learning_rate": 1.892286388070969e-05, "loss": 0.7543, "step": 1083500 }, { "epoch": 1.8654886323937627, "grad_norm": 2.1178793907165527, "learning_rate": 1.890852279343729e-05, "loss": 0.7565, "step": 1084000 }, { "epoch": 1.8663490976301067, "grad_norm": 2.1791107654571533, "learning_rate": 1.889418170616489e-05, "loss": 0.7563, "step": 1084500 }, { "epoch": 1.8672095628664507, "grad_norm": 2.110280990600586, "learning_rate": 1.887984061889249e-05, "loss": 0.7606, "step": 1085000 }, { "epoch": 1.8680700281027947, "grad_norm": 2.0059497356414795, "learning_rate": 1.886549953162009e-05, "loss": 0.7556, "step": 1085500 }, { "epoch": 1.8689304933391386, "grad_norm": 2.052835702896118, "learning_rate": 1.8851158444347692e-05, "loss": 0.763, "step": 1086000 }, { "epoch": 1.8697909585754826, "grad_norm": 2.0899126529693604, "learning_rate": 1.883681735707529e-05, "loss": 0.7584, "step": 1086500 }, { "epoch": 1.8706514238118266, "grad_norm": 2.019315481185913, "learning_rate": 1.882247626980289e-05, "loss": 0.7586, "step": 1087000 }, { "epoch": 1.8715118890481706, "grad_norm": 2.061671733856201, "learning_rate": 1.8808135182530494e-05, "loss": 0.7551, "step": 1087500 }, { "epoch": 1.8723723542845145, "grad_norm": 2.065790891647339, "learning_rate": 1.879379409525809e-05, "loss": 0.7631, "step": 1088000 }, { "epoch": 1.8732328195208585, "grad_norm": 2.2259931564331055, "learning_rate": 1.877945300798569e-05, "loss": 0.7591, "step": 1088500 }, { "epoch": 1.8740932847572025, "grad_norm": 1.9983274936676025, "learning_rate": 1.8765111920713292e-05, "loss": 0.7585, "step": 1089000 }, { "epoch": 1.8749537499935465, "grad_norm": 2.08388090133667, "learning_rate": 1.8750770833440892e-05, "loss": 0.7607, "step": 1089500 }, { "epoch": 1.8758142152298904, "grad_norm": 2.1535439491271973, "learning_rate": 1.8736429746168493e-05, "loss": 0.7578, "step": 1090000 }, { "epoch": 1.8766746804662344, "grad_norm": 2.4029664993286133, "learning_rate": 1.8722088658896093e-05, "loss": 0.7602, "step": 1090500 }, { "epoch": 1.8775351457025784, "grad_norm": 2.107679605484009, "learning_rate": 1.8707747571623694e-05, "loss": 0.7621, "step": 1091000 }, { "epoch": 1.8783956109389224, "grad_norm": 2.1485376358032227, "learning_rate": 1.8693406484351294e-05, "loss": 0.7541, "step": 1091500 }, { "epoch": 1.8792560761752664, "grad_norm": 2.088392972946167, "learning_rate": 1.8679065397078895e-05, "loss": 0.756, "step": 1092000 }, { "epoch": 1.8801165414116103, "grad_norm": 2.1533939838409424, "learning_rate": 1.8664724309806495e-05, "loss": 0.7581, "step": 1092500 }, { "epoch": 1.8809770066479543, "grad_norm": 2.3350436687469482, "learning_rate": 1.8650383222534092e-05, "loss": 0.7497, "step": 1093000 }, { "epoch": 1.8818374718842983, "grad_norm": 2.236022472381592, "learning_rate": 1.8636042135261696e-05, "loss": 0.7617, "step": 1093500 }, { "epoch": 1.8826979371206423, "grad_norm": 2.3586575984954834, "learning_rate": 1.8621701047989296e-05, "loss": 0.759, "step": 1094000 }, { "epoch": 1.8835584023569862, "grad_norm": 2.1718757152557373, "learning_rate": 1.8607359960716893e-05, "loss": 0.7598, "step": 1094500 }, { "epoch": 1.8844188675933302, "grad_norm": 2.206676959991455, "learning_rate": 1.8593018873444494e-05, "loss": 0.7595, "step": 1095000 }, { "epoch": 1.8852793328296742, "grad_norm": 2.1057584285736084, "learning_rate": 1.8578677786172094e-05, "loss": 0.7613, "step": 1095500 }, { "epoch": 1.8861397980660182, "grad_norm": 2.199988842010498, "learning_rate": 1.8564336698899695e-05, "loss": 0.7597, "step": 1096000 }, { "epoch": 1.8870002633023624, "grad_norm": 2.3150246143341064, "learning_rate": 1.8549995611627295e-05, "loss": 0.7588, "step": 1096500 }, { "epoch": 1.8878607285387063, "grad_norm": 2.125781536102295, "learning_rate": 1.8535654524354896e-05, "loss": 0.7548, "step": 1097000 }, { "epoch": 1.8887211937750503, "grad_norm": 2.0690038204193115, "learning_rate": 1.8521313437082496e-05, "loss": 0.7563, "step": 1097500 }, { "epoch": 1.8895816590113943, "grad_norm": 2.1967461109161377, "learning_rate": 1.8506972349810093e-05, "loss": 0.7564, "step": 1098000 }, { "epoch": 1.8904421242477383, "grad_norm": 2.0948901176452637, "learning_rate": 1.8492631262537697e-05, "loss": 0.7619, "step": 1098500 }, { "epoch": 1.8913025894840823, "grad_norm": 2.2009682655334473, "learning_rate": 1.8478290175265298e-05, "loss": 0.7519, "step": 1099000 }, { "epoch": 1.8921630547204262, "grad_norm": 2.0295839309692383, "learning_rate": 1.8463949087992895e-05, "loss": 0.7551, "step": 1099500 }, { "epoch": 1.8930235199567702, "grad_norm": 2.0854525566101074, "learning_rate": 1.84496080007205e-05, "loss": 0.7591, "step": 1100000 }, { "epoch": 1.8938839851931142, "grad_norm": 2.133030414581299, "learning_rate": 1.8435266913448096e-05, "loss": 0.7625, "step": 1100500 }, { "epoch": 1.8947444504294582, "grad_norm": 1.9579893350601196, "learning_rate": 1.8420925826175696e-05, "loss": 0.7635, "step": 1101000 }, { "epoch": 1.8956049156658021, "grad_norm": 2.1973876953125, "learning_rate": 1.84065847389033e-05, "loss": 0.7488, "step": 1101500 }, { "epoch": 1.8964653809021463, "grad_norm": 2.209946393966675, "learning_rate": 1.8392243651630897e-05, "loss": 0.755, "step": 1102000 }, { "epoch": 1.8973258461384903, "grad_norm": 2.0465190410614014, "learning_rate": 1.8377902564358498e-05, "loss": 0.7578, "step": 1102500 }, { "epoch": 1.8981863113748343, "grad_norm": 2.160445213317871, "learning_rate": 1.8363561477086098e-05, "loss": 0.7566, "step": 1103000 }, { "epoch": 1.8990467766111783, "grad_norm": 2.2007155418395996, "learning_rate": 1.83492203898137e-05, "loss": 0.7502, "step": 1103500 }, { "epoch": 1.8999072418475222, "grad_norm": 2.144000768661499, "learning_rate": 1.83348793025413e-05, "loss": 0.7576, "step": 1104000 }, { "epoch": 1.9007677070838662, "grad_norm": 2.284233808517456, "learning_rate": 1.83205382152689e-05, "loss": 0.7547, "step": 1104500 }, { "epoch": 1.9016281723202102, "grad_norm": 2.2595906257629395, "learning_rate": 1.83061971279965e-05, "loss": 0.7582, "step": 1105000 }, { "epoch": 1.9024886375565542, "grad_norm": 2.0282812118530273, "learning_rate": 1.82918560407241e-05, "loss": 0.7558, "step": 1105500 }, { "epoch": 1.9033491027928982, "grad_norm": 2.142599105834961, "learning_rate": 1.82775149534517e-05, "loss": 0.7527, "step": 1106000 }, { "epoch": 1.9042095680292421, "grad_norm": 2.237638235092163, "learning_rate": 1.82631738661793e-05, "loss": 0.749, "step": 1106500 }, { "epoch": 1.905070033265586, "grad_norm": 2.2329697608947754, "learning_rate": 1.82488327789069e-05, "loss": 0.7585, "step": 1107000 }, { "epoch": 1.90593049850193, "grad_norm": 2.2512173652648926, "learning_rate": 1.82344916916345e-05, "loss": 0.7553, "step": 1107500 }, { "epoch": 1.906790963738274, "grad_norm": 2.0716335773468018, "learning_rate": 1.8220150604362103e-05, "loss": 0.7541, "step": 1108000 }, { "epoch": 1.907651428974618, "grad_norm": 2.0387918949127197, "learning_rate": 1.82058095170897e-05, "loss": 0.7582, "step": 1108500 }, { "epoch": 1.908511894210962, "grad_norm": 1.9657924175262451, "learning_rate": 1.81914684298173e-05, "loss": 0.7667, "step": 1109000 }, { "epoch": 1.909372359447306, "grad_norm": 2.0124640464782715, "learning_rate": 1.81771273425449e-05, "loss": 0.7581, "step": 1109500 }, { "epoch": 1.91023282468365, "grad_norm": 2.183788537979126, "learning_rate": 1.81627862552725e-05, "loss": 0.7488, "step": 1110000 }, { "epoch": 1.911093289919994, "grad_norm": 2.0841352939605713, "learning_rate": 1.8148445168000102e-05, "loss": 0.7567, "step": 1110500 }, { "epoch": 1.911953755156338, "grad_norm": 2.0061280727386475, "learning_rate": 1.8134104080727702e-05, "loss": 0.7633, "step": 1111000 }, { "epoch": 1.912814220392682, "grad_norm": 1.9698486328125, "learning_rate": 1.8119762993455303e-05, "loss": 0.7533, "step": 1111500 }, { "epoch": 1.9136746856290259, "grad_norm": 2.1442909240722656, "learning_rate": 1.81054219061829e-05, "loss": 0.7594, "step": 1112000 }, { "epoch": 1.9145351508653699, "grad_norm": 2.279815196990967, "learning_rate": 1.8091080818910504e-05, "loss": 0.7584, "step": 1112500 }, { "epoch": 1.9153956161017138, "grad_norm": 2.1331374645233154, "learning_rate": 1.8076739731638104e-05, "loss": 0.7553, "step": 1113000 }, { "epoch": 1.9162560813380578, "grad_norm": 2.1729178428649902, "learning_rate": 1.80623986443657e-05, "loss": 0.7575, "step": 1113500 }, { "epoch": 1.9171165465744018, "grad_norm": 2.0933024883270264, "learning_rate": 1.8048057557093305e-05, "loss": 0.7575, "step": 1114000 }, { "epoch": 1.9179770118107458, "grad_norm": 2.0787551403045654, "learning_rate": 1.8033716469820906e-05, "loss": 0.7532, "step": 1114500 }, { "epoch": 1.9188374770470897, "grad_norm": 2.139012336730957, "learning_rate": 1.8019375382548503e-05, "loss": 0.7597, "step": 1115000 }, { "epoch": 1.9196979422834337, "grad_norm": 2.0884130001068115, "learning_rate": 1.8005034295276103e-05, "loss": 0.754, "step": 1115500 }, { "epoch": 1.9205584075197777, "grad_norm": 2.2890164852142334, "learning_rate": 1.7990693208003704e-05, "loss": 0.754, "step": 1116000 }, { "epoch": 1.9214188727561217, "grad_norm": 2.274405002593994, "learning_rate": 1.7976352120731304e-05, "loss": 0.7465, "step": 1116500 }, { "epoch": 1.9222793379924656, "grad_norm": 2.3451716899871826, "learning_rate": 1.7962011033458904e-05, "loss": 0.7614, "step": 1117000 }, { "epoch": 1.9231398032288096, "grad_norm": 2.0729122161865234, "learning_rate": 1.7947669946186505e-05, "loss": 0.7546, "step": 1117500 }, { "epoch": 1.9240002684651536, "grad_norm": 2.164996862411499, "learning_rate": 1.7933328858914105e-05, "loss": 0.7596, "step": 1118000 }, { "epoch": 1.9248607337014976, "grad_norm": 2.23533296585083, "learning_rate": 1.7918987771641706e-05, "loss": 0.7573, "step": 1118500 }, { "epoch": 1.9257211989378415, "grad_norm": 2.1161954402923584, "learning_rate": 1.7904646684369306e-05, "loss": 0.7551, "step": 1119000 }, { "epoch": 1.9265816641741857, "grad_norm": 2.200545072555542, "learning_rate": 1.7890305597096907e-05, "loss": 0.7533, "step": 1119500 }, { "epoch": 1.9274421294105297, "grad_norm": 1.9847606420516968, "learning_rate": 1.7875964509824504e-05, "loss": 0.7537, "step": 1120000 }, { "epoch": 1.9283025946468737, "grad_norm": 2.063674211502075, "learning_rate": 1.7861623422552108e-05, "loss": 0.7587, "step": 1120500 }, { "epoch": 1.9291630598832177, "grad_norm": 2.1384332180023193, "learning_rate": 1.7847282335279705e-05, "loss": 0.7527, "step": 1121000 }, { "epoch": 1.9300235251195617, "grad_norm": 2.213263750076294, "learning_rate": 1.7832941248007305e-05, "loss": 0.7555, "step": 1121500 }, { "epoch": 1.9308839903559056, "grad_norm": 2.0558905601501465, "learning_rate": 1.781860016073491e-05, "loss": 0.7546, "step": 1122000 }, { "epoch": 1.9317444555922496, "grad_norm": 2.223896026611328, "learning_rate": 1.7804259073462506e-05, "loss": 0.7577, "step": 1122500 }, { "epoch": 1.9326049208285936, "grad_norm": 2.3315932750701904, "learning_rate": 1.7789917986190107e-05, "loss": 0.7549, "step": 1123000 }, { "epoch": 1.9334653860649376, "grad_norm": 2.0839650630950928, "learning_rate": 1.777557689891771e-05, "loss": 0.7558, "step": 1123500 }, { "epoch": 1.9343258513012815, "grad_norm": 2.223665237426758, "learning_rate": 1.7761235811645308e-05, "loss": 0.7516, "step": 1124000 }, { "epoch": 1.9351863165376255, "grad_norm": 2.5846803188323975, "learning_rate": 1.7746894724372908e-05, "loss": 0.7591, "step": 1124500 }, { "epoch": 1.9360467817739697, "grad_norm": 2.2104952335357666, "learning_rate": 1.773255363710051e-05, "loss": 0.754, "step": 1125000 }, { "epoch": 1.9369072470103137, "grad_norm": 1.9384921789169312, "learning_rate": 1.771821254982811e-05, "loss": 0.7559, "step": 1125500 }, { "epoch": 1.9377677122466577, "grad_norm": 1.9238662719726562, "learning_rate": 1.770387146255571e-05, "loss": 0.7514, "step": 1126000 }, { "epoch": 1.9386281774830016, "grad_norm": 2.1696484088897705, "learning_rate": 1.768953037528331e-05, "loss": 0.7535, "step": 1126500 }, { "epoch": 1.9394886427193456, "grad_norm": 1.9859446287155151, "learning_rate": 1.767518928801091e-05, "loss": 0.7547, "step": 1127000 }, { "epoch": 1.9403491079556896, "grad_norm": 2.1499342918395996, "learning_rate": 1.7660848200738508e-05, "loss": 0.753, "step": 1127500 }, { "epoch": 1.9412095731920336, "grad_norm": 2.1544241905212402, "learning_rate": 1.7646507113466108e-05, "loss": 0.7535, "step": 1128000 }, { "epoch": 1.9420700384283776, "grad_norm": 2.1659960746765137, "learning_rate": 1.7632166026193712e-05, "loss": 0.7583, "step": 1128500 }, { "epoch": 1.9429305036647215, "grad_norm": 2.0657598972320557, "learning_rate": 1.761782493892131e-05, "loss": 0.7553, "step": 1129000 }, { "epoch": 1.9437909689010655, "grad_norm": 2.312696695327759, "learning_rate": 1.760348385164891e-05, "loss": 0.7538, "step": 1129500 }, { "epoch": 1.9446514341374095, "grad_norm": 2.268152952194214, "learning_rate": 1.758914276437651e-05, "loss": 0.7533, "step": 1130000 }, { "epoch": 1.9455118993737535, "grad_norm": 2.2214696407318115, "learning_rate": 1.757480167710411e-05, "loss": 0.7567, "step": 1130500 }, { "epoch": 1.9463723646100974, "grad_norm": 2.3125360012054443, "learning_rate": 1.756046058983171e-05, "loss": 0.7504, "step": 1131000 }, { "epoch": 1.9472328298464414, "grad_norm": 2.032647132873535, "learning_rate": 1.754611950255931e-05, "loss": 0.7549, "step": 1131500 }, { "epoch": 1.9480932950827854, "grad_norm": 1.9928709268569946, "learning_rate": 1.7531778415286912e-05, "loss": 0.7537, "step": 1132000 }, { "epoch": 1.9489537603191294, "grad_norm": 1.8794376850128174, "learning_rate": 1.751743732801451e-05, "loss": 0.7503, "step": 1132500 }, { "epoch": 1.9498142255554733, "grad_norm": 2.239978790283203, "learning_rate": 1.7503096240742113e-05, "loss": 0.751, "step": 1133000 }, { "epoch": 1.9506746907918173, "grad_norm": 2.246455430984497, "learning_rate": 1.7488755153469713e-05, "loss": 0.7537, "step": 1133500 }, { "epoch": 1.9515351560281613, "grad_norm": 2.074331045150757, "learning_rate": 1.747441406619731e-05, "loss": 0.757, "step": 1134000 }, { "epoch": 1.9523956212645053, "grad_norm": 2.2001051902770996, "learning_rate": 1.7460072978924914e-05, "loss": 0.7537, "step": 1134500 }, { "epoch": 1.9532560865008493, "grad_norm": 2.2409298419952393, "learning_rate": 1.7445731891652515e-05, "loss": 0.7558, "step": 1135000 }, { "epoch": 1.9541165517371932, "grad_norm": 1.9500313997268677, "learning_rate": 1.7431390804380112e-05, "loss": 0.7534, "step": 1135500 }, { "epoch": 1.9549770169735372, "grad_norm": 2.0206470489501953, "learning_rate": 1.7417049717107716e-05, "loss": 0.7507, "step": 1136000 }, { "epoch": 1.9558374822098812, "grad_norm": 1.9329551458358765, "learning_rate": 1.7402708629835313e-05, "loss": 0.7514, "step": 1136500 }, { "epoch": 1.9566979474462252, "grad_norm": 2.4962384700775146, "learning_rate": 1.7388367542562913e-05, "loss": 0.7494, "step": 1137000 }, { "epoch": 1.9575584126825691, "grad_norm": 2.5123205184936523, "learning_rate": 1.7374026455290514e-05, "loss": 0.7532, "step": 1137500 }, { "epoch": 1.9584188779189131, "grad_norm": 1.9868464469909668, "learning_rate": 1.7359685368018114e-05, "loss": 0.7511, "step": 1138000 }, { "epoch": 1.959279343155257, "grad_norm": 2.332765579223633, "learning_rate": 1.7345344280745715e-05, "loss": 0.7476, "step": 1138500 }, { "epoch": 1.960139808391601, "grad_norm": 2.1686367988586426, "learning_rate": 1.7331003193473315e-05, "loss": 0.7435, "step": 1139000 }, { "epoch": 1.961000273627945, "grad_norm": 2.017496347427368, "learning_rate": 1.7316662106200915e-05, "loss": 0.7547, "step": 1139500 }, { "epoch": 1.961860738864289, "grad_norm": 2.2544643878936768, "learning_rate": 1.7302321018928516e-05, "loss": 0.7589, "step": 1140000 }, { "epoch": 1.962721204100633, "grad_norm": 2.2415223121643066, "learning_rate": 1.7287979931656113e-05, "loss": 0.7549, "step": 1140500 }, { "epoch": 1.963581669336977, "grad_norm": 2.0396015644073486, "learning_rate": 1.7273638844383717e-05, "loss": 0.757, "step": 1141000 }, { "epoch": 1.964442134573321, "grad_norm": 2.1235740184783936, "learning_rate": 1.7259297757111314e-05, "loss": 0.7537, "step": 1141500 }, { "epoch": 1.965302599809665, "grad_norm": 2.126119613647461, "learning_rate": 1.7244956669838914e-05, "loss": 0.7643, "step": 1142000 }, { "epoch": 1.9661630650460091, "grad_norm": 2.5373361110687256, "learning_rate": 1.723061558256652e-05, "loss": 0.7533, "step": 1142500 }, { "epoch": 1.967023530282353, "grad_norm": 2.3410146236419678, "learning_rate": 1.7216274495294115e-05, "loss": 0.761, "step": 1143000 }, { "epoch": 1.967883995518697, "grad_norm": 2.128046989440918, "learning_rate": 1.7201933408021716e-05, "loss": 0.7572, "step": 1143500 }, { "epoch": 1.968744460755041, "grad_norm": 2.1182539463043213, "learning_rate": 1.7187592320749316e-05, "loss": 0.7556, "step": 1144000 }, { "epoch": 1.969604925991385, "grad_norm": 2.128629446029663, "learning_rate": 1.7173251233476917e-05, "loss": 0.7558, "step": 1144500 }, { "epoch": 1.970465391227729, "grad_norm": 2.249554395675659, "learning_rate": 1.7158910146204517e-05, "loss": 0.7547, "step": 1145000 }, { "epoch": 1.971325856464073, "grad_norm": 2.3424324989318848, "learning_rate": 1.7144569058932118e-05, "loss": 0.7472, "step": 1145500 }, { "epoch": 1.972186321700417, "grad_norm": 2.139158010482788, "learning_rate": 1.7130227971659718e-05, "loss": 0.7554, "step": 1146000 }, { "epoch": 1.973046786936761, "grad_norm": 2.144340991973877, "learning_rate": 1.711588688438732e-05, "loss": 0.7564, "step": 1146500 }, { "epoch": 1.973907252173105, "grad_norm": 2.1247239112854004, "learning_rate": 1.710154579711492e-05, "loss": 0.7548, "step": 1147000 }, { "epoch": 1.974767717409449, "grad_norm": 2.092890501022339, "learning_rate": 1.708720470984252e-05, "loss": 0.7536, "step": 1147500 }, { "epoch": 1.9756281826457929, "grad_norm": 2.0415735244750977, "learning_rate": 1.7072863622570117e-05, "loss": 0.7538, "step": 1148000 }, { "epoch": 1.976488647882137, "grad_norm": 2.2634592056274414, "learning_rate": 1.705852253529772e-05, "loss": 0.7566, "step": 1148500 }, { "epoch": 1.977349113118481, "grad_norm": 2.0326716899871826, "learning_rate": 1.704418144802532e-05, "loss": 0.7566, "step": 1149000 }, { "epoch": 1.978209578354825, "grad_norm": 2.2424416542053223, "learning_rate": 1.7029840360752918e-05, "loss": 0.7538, "step": 1149500 }, { "epoch": 1.979070043591169, "grad_norm": 2.1675209999084473, "learning_rate": 1.701549927348052e-05, "loss": 0.7512, "step": 1150000 }, { "epoch": 1.979930508827513, "grad_norm": 2.072476625442505, "learning_rate": 1.700115818620812e-05, "loss": 0.7528, "step": 1150500 }, { "epoch": 1.980790974063857, "grad_norm": 2.203397274017334, "learning_rate": 1.698681709893572e-05, "loss": 0.7522, "step": 1151000 }, { "epoch": 1.981651439300201, "grad_norm": 2.2937443256378174, "learning_rate": 1.697247601166332e-05, "loss": 0.7587, "step": 1151500 }, { "epoch": 1.982511904536545, "grad_norm": 2.0122551918029785, "learning_rate": 1.695813492439092e-05, "loss": 0.749, "step": 1152000 }, { "epoch": 1.9833723697728889, "grad_norm": 2.243720531463623, "learning_rate": 1.694379383711852e-05, "loss": 0.7536, "step": 1152500 }, { "epoch": 1.9842328350092329, "grad_norm": 2.1484217643737793, "learning_rate": 1.692945274984612e-05, "loss": 0.7524, "step": 1153000 }, { "epoch": 1.9850933002455768, "grad_norm": 2.1573312282562256, "learning_rate": 1.6915111662573722e-05, "loss": 0.753, "step": 1153500 }, { "epoch": 1.9859537654819208, "grad_norm": 2.0914833545684814, "learning_rate": 1.6900770575301322e-05, "loss": 0.7529, "step": 1154000 }, { "epoch": 1.9868142307182648, "grad_norm": 2.172123432159424, "learning_rate": 1.688642948802892e-05, "loss": 0.7524, "step": 1154500 }, { "epoch": 1.9876746959546088, "grad_norm": 2.469355344772339, "learning_rate": 1.6872088400756523e-05, "loss": 0.753, "step": 1155000 }, { "epoch": 1.9885351611909527, "grad_norm": 2.174379587173462, "learning_rate": 1.685774731348412e-05, "loss": 0.7538, "step": 1155500 }, { "epoch": 1.9893956264272967, "grad_norm": 2.1006627082824707, "learning_rate": 1.684340622621172e-05, "loss": 0.7549, "step": 1156000 }, { "epoch": 1.9902560916636407, "grad_norm": 2.183161973953247, "learning_rate": 1.6829065138939325e-05, "loss": 0.7535, "step": 1156500 }, { "epoch": 1.9911165568999847, "grad_norm": 2.2185134887695312, "learning_rate": 1.6814724051666922e-05, "loss": 0.7555, "step": 1157000 }, { "epoch": 1.9919770221363287, "grad_norm": 2.2173125743865967, "learning_rate": 1.6800382964394522e-05, "loss": 0.7455, "step": 1157500 }, { "epoch": 1.9928374873726726, "grad_norm": 1.9993641376495361, "learning_rate": 1.6786041877122126e-05, "loss": 0.7502, "step": 1158000 }, { "epoch": 1.9936979526090166, "grad_norm": 2.248117685317993, "learning_rate": 1.6771700789849723e-05, "loss": 0.7535, "step": 1158500 }, { "epoch": 1.9945584178453606, "grad_norm": 2.1918869018554688, "learning_rate": 1.6757359702577324e-05, "loss": 0.7534, "step": 1159000 }, { "epoch": 1.9954188830817046, "grad_norm": 2.221872091293335, "learning_rate": 1.6743018615304924e-05, "loss": 0.747, "step": 1159500 }, { "epoch": 1.9962793483180485, "grad_norm": 2.3851265907287598, "learning_rate": 1.6728677528032525e-05, "loss": 0.7533, "step": 1160000 }, { "epoch": 1.9971398135543925, "grad_norm": 2.5031321048736572, "learning_rate": 1.6714336440760125e-05, "loss": 0.7546, "step": 1160500 }, { "epoch": 1.9980002787907365, "grad_norm": 2.7600536346435547, "learning_rate": 1.6699995353487726e-05, "loss": 0.7541, "step": 1161000 }, { "epoch": 1.9988607440270805, "grad_norm": 2.3278236389160156, "learning_rate": 1.6685654266215326e-05, "loss": 0.7485, "step": 1161500 }, { "epoch": 1.9997212092634244, "grad_norm": 2.1311707496643066, "learning_rate": 1.6671313178942923e-05, "loss": 0.7475, "step": 1162000 }, { "epoch": 2.0005816744997684, "grad_norm": 2.3819072246551514, "learning_rate": 1.6656972091670524e-05, "loss": 0.7473, "step": 1162500 }, { "epoch": 2.0014421397361124, "grad_norm": 2.184633255004883, "learning_rate": 1.6642631004398127e-05, "loss": 0.7514, "step": 1163000 }, { "epoch": 2.0023026049724564, "grad_norm": 2.1459732055664062, "learning_rate": 1.6628289917125725e-05, "loss": 0.7483, "step": 1163500 }, { "epoch": 2.0031630702088004, "grad_norm": 2.32151198387146, "learning_rate": 1.6613948829853325e-05, "loss": 0.7509, "step": 1164000 }, { "epoch": 2.0040235354451443, "grad_norm": 2.1330671310424805, "learning_rate": 1.6599607742580925e-05, "loss": 0.7514, "step": 1164500 }, { "epoch": 2.0048840006814883, "grad_norm": 2.2304253578186035, "learning_rate": 1.6585266655308526e-05, "loss": 0.7582, "step": 1165000 }, { "epoch": 2.0057444659178323, "grad_norm": 2.269551992416382, "learning_rate": 1.6570925568036126e-05, "loss": 0.7556, "step": 1165500 }, { "epoch": 2.0066049311541763, "grad_norm": 2.071958065032959, "learning_rate": 1.6556584480763727e-05, "loss": 0.7435, "step": 1166000 }, { "epoch": 2.0074653963905202, "grad_norm": 2.009504556655884, "learning_rate": 1.6542243393491327e-05, "loss": 0.7503, "step": 1166500 }, { "epoch": 2.008325861626864, "grad_norm": 2.0557193756103516, "learning_rate": 1.6527902306218924e-05, "loss": 0.7491, "step": 1167000 }, { "epoch": 2.009186326863208, "grad_norm": 2.093259572982788, "learning_rate": 1.6513561218946528e-05, "loss": 0.7515, "step": 1167500 }, { "epoch": 2.010046792099552, "grad_norm": 2.12383770942688, "learning_rate": 1.649922013167413e-05, "loss": 0.7466, "step": 1168000 }, { "epoch": 2.0109072573358966, "grad_norm": 2.0827810764312744, "learning_rate": 1.6484879044401726e-05, "loss": 0.7474, "step": 1168500 }, { "epoch": 2.0117677225722406, "grad_norm": 2.178234338760376, "learning_rate": 1.647053795712933e-05, "loss": 0.7499, "step": 1169000 }, { "epoch": 2.0126281878085845, "grad_norm": 2.224212646484375, "learning_rate": 1.645619686985693e-05, "loss": 0.7491, "step": 1169500 }, { "epoch": 2.0134886530449285, "grad_norm": 2.0262248516082764, "learning_rate": 1.6441855782584527e-05, "loss": 0.7486, "step": 1170000 }, { "epoch": 2.0143491182812725, "grad_norm": 2.3332037925720215, "learning_rate": 1.642751469531213e-05, "loss": 0.7416, "step": 1170500 }, { "epoch": 2.0152095835176165, "grad_norm": 2.1311421394348145, "learning_rate": 1.6413173608039728e-05, "loss": 0.7491, "step": 1171000 }, { "epoch": 2.0160700487539605, "grad_norm": 1.9237945079803467, "learning_rate": 1.639883252076733e-05, "loss": 0.7487, "step": 1171500 }, { "epoch": 2.0169305139903044, "grad_norm": 2.152676820755005, "learning_rate": 1.638449143349493e-05, "loss": 0.7505, "step": 1172000 }, { "epoch": 2.0177909792266484, "grad_norm": 2.240447759628296, "learning_rate": 1.637015034622253e-05, "loss": 0.7487, "step": 1172500 }, { "epoch": 2.0186514444629924, "grad_norm": 2.2949647903442383, "learning_rate": 1.635580925895013e-05, "loss": 0.7508, "step": 1173000 }, { "epoch": 2.0195119096993364, "grad_norm": 2.259401321411133, "learning_rate": 1.634146817167773e-05, "loss": 0.745, "step": 1173500 }, { "epoch": 2.0203723749356803, "grad_norm": 2.1563282012939453, "learning_rate": 1.632712708440533e-05, "loss": 0.7499, "step": 1174000 }, { "epoch": 2.0212328401720243, "grad_norm": 2.198007822036743, "learning_rate": 1.631278599713293e-05, "loss": 0.7484, "step": 1174500 }, { "epoch": 2.0220933054083683, "grad_norm": 2.1369738578796387, "learning_rate": 1.629844490986053e-05, "loss": 0.7545, "step": 1175000 }, { "epoch": 2.0229537706447123, "grad_norm": 2.226137638092041, "learning_rate": 1.6284103822588132e-05, "loss": 0.7532, "step": 1175500 }, { "epoch": 2.0238142358810562, "grad_norm": 2.0248351097106934, "learning_rate": 1.626976273531573e-05, "loss": 0.7513, "step": 1176000 }, { "epoch": 2.0246747011174, "grad_norm": 2.1181812286376953, "learning_rate": 1.625542164804333e-05, "loss": 0.7445, "step": 1176500 }, { "epoch": 2.025535166353744, "grad_norm": 2.3558504581451416, "learning_rate": 1.6241080560770934e-05, "loss": 0.7465, "step": 1177000 }, { "epoch": 2.026395631590088, "grad_norm": 2.0135488510131836, "learning_rate": 1.622673947349853e-05, "loss": 0.7492, "step": 1177500 }, { "epoch": 2.027256096826432, "grad_norm": 2.0925824642181396, "learning_rate": 1.621239838622613e-05, "loss": 0.7468, "step": 1178000 }, { "epoch": 2.028116562062776, "grad_norm": 2.0782012939453125, "learning_rate": 1.6198057298953732e-05, "loss": 0.7505, "step": 1178500 }, { "epoch": 2.02897702729912, "grad_norm": 2.0250723361968994, "learning_rate": 1.6183716211681332e-05, "loss": 0.7494, "step": 1179000 }, { "epoch": 2.029837492535464, "grad_norm": 2.0154037475585938, "learning_rate": 1.6169375124408933e-05, "loss": 0.7478, "step": 1179500 }, { "epoch": 2.030697957771808, "grad_norm": 2.1285109519958496, "learning_rate": 1.6155034037136533e-05, "loss": 0.7462, "step": 1180000 }, { "epoch": 2.031558423008152, "grad_norm": 2.2587168216705322, "learning_rate": 1.6140692949864134e-05, "loss": 0.7456, "step": 1180500 }, { "epoch": 2.032418888244496, "grad_norm": 2.261652946472168, "learning_rate": 1.6126351862591734e-05, "loss": 0.7484, "step": 1181000 }, { "epoch": 2.03327935348084, "grad_norm": 2.2460522651672363, "learning_rate": 1.6112010775319335e-05, "loss": 0.7486, "step": 1181500 }, { "epoch": 2.034139818717184, "grad_norm": 2.511417865753174, "learning_rate": 1.6097669688046935e-05, "loss": 0.7472, "step": 1182000 }, { "epoch": 2.035000283953528, "grad_norm": 2.14890193939209, "learning_rate": 1.6083328600774532e-05, "loss": 0.7476, "step": 1182500 }, { "epoch": 2.035860749189872, "grad_norm": 2.20908260345459, "learning_rate": 1.6068987513502136e-05, "loss": 0.7534, "step": 1183000 }, { "epoch": 2.036721214426216, "grad_norm": 2.3535568714141846, "learning_rate": 1.6054646426229737e-05, "loss": 0.7505, "step": 1183500 }, { "epoch": 2.03758167966256, "grad_norm": 2.2410316467285156, "learning_rate": 1.6040305338957334e-05, "loss": 0.7467, "step": 1184000 }, { "epoch": 2.038442144898904, "grad_norm": 2.1737914085388184, "learning_rate": 1.6025964251684934e-05, "loss": 0.747, "step": 1184500 }, { "epoch": 2.039302610135248, "grad_norm": 2.2757697105407715, "learning_rate": 1.6011623164412535e-05, "loss": 0.7423, "step": 1185000 }, { "epoch": 2.040163075371592, "grad_norm": 2.2338526248931885, "learning_rate": 1.5997282077140135e-05, "loss": 0.7474, "step": 1185500 }, { "epoch": 2.041023540607936, "grad_norm": 2.1928253173828125, "learning_rate": 1.5982940989867736e-05, "loss": 0.7538, "step": 1186000 }, { "epoch": 2.0418840058442798, "grad_norm": 2.32772159576416, "learning_rate": 1.5968599902595336e-05, "loss": 0.7449, "step": 1186500 }, { "epoch": 2.0427444710806237, "grad_norm": 2.1408066749572754, "learning_rate": 1.5954258815322936e-05, "loss": 0.7463, "step": 1187000 }, { "epoch": 2.0436049363169677, "grad_norm": 2.2051782608032227, "learning_rate": 1.5939917728050534e-05, "loss": 0.7463, "step": 1187500 }, { "epoch": 2.0444654015533117, "grad_norm": 2.259019136428833, "learning_rate": 1.5925576640778137e-05, "loss": 0.7451, "step": 1188000 }, { "epoch": 2.0453258667896557, "grad_norm": 2.147264242172241, "learning_rate": 1.5911235553505738e-05, "loss": 0.7554, "step": 1188500 }, { "epoch": 2.0461863320259996, "grad_norm": 2.019881248474121, "learning_rate": 1.5896894466233335e-05, "loss": 0.743, "step": 1189000 }, { "epoch": 2.0470467972623436, "grad_norm": 2.1331872940063477, "learning_rate": 1.588255337896094e-05, "loss": 0.7454, "step": 1189500 }, { "epoch": 2.0479072624986876, "grad_norm": 2.1059558391571045, "learning_rate": 1.586821229168854e-05, "loss": 0.7452, "step": 1190000 }, { "epoch": 2.0487677277350316, "grad_norm": 2.168329954147339, "learning_rate": 1.5853871204416136e-05, "loss": 0.7548, "step": 1190500 }, { "epoch": 2.0496281929713756, "grad_norm": 2.21160888671875, "learning_rate": 1.583953011714374e-05, "loss": 0.7478, "step": 1191000 }, { "epoch": 2.05048865820772, "grad_norm": 2.0780110359191895, "learning_rate": 1.5825189029871337e-05, "loss": 0.7444, "step": 1191500 }, { "epoch": 2.051349123444064, "grad_norm": 2.0575709342956543, "learning_rate": 1.5810847942598938e-05, "loss": 0.7393, "step": 1192000 }, { "epoch": 2.052209588680408, "grad_norm": 2.3351075649261475, "learning_rate": 1.579650685532654e-05, "loss": 0.7448, "step": 1192500 }, { "epoch": 2.053070053916752, "grad_norm": 2.023301362991333, "learning_rate": 1.578216576805414e-05, "loss": 0.7454, "step": 1193000 }, { "epoch": 2.053930519153096, "grad_norm": 2.1645848751068115, "learning_rate": 1.576782468078174e-05, "loss": 0.7443, "step": 1193500 }, { "epoch": 2.05479098438944, "grad_norm": 2.1541600227355957, "learning_rate": 1.575348359350934e-05, "loss": 0.7458, "step": 1194000 }, { "epoch": 2.055651449625784, "grad_norm": 2.260538339614868, "learning_rate": 1.573914250623694e-05, "loss": 0.7435, "step": 1194500 }, { "epoch": 2.056511914862128, "grad_norm": 2.1604511737823486, "learning_rate": 1.572480141896454e-05, "loss": 0.7527, "step": 1195000 }, { "epoch": 2.057372380098472, "grad_norm": 2.21130633354187, "learning_rate": 1.571046033169214e-05, "loss": 0.7499, "step": 1195500 }, { "epoch": 2.0582328453348158, "grad_norm": 2.1533782482147217, "learning_rate": 1.569611924441974e-05, "loss": 0.7475, "step": 1196000 }, { "epoch": 2.0590933105711597, "grad_norm": 2.1712965965270996, "learning_rate": 1.568177815714734e-05, "loss": 0.742, "step": 1196500 }, { "epoch": 2.0599537758075037, "grad_norm": 2.198495388031006, "learning_rate": 1.566743706987494e-05, "loss": 0.7473, "step": 1197000 }, { "epoch": 2.0608142410438477, "grad_norm": 2.3291733264923096, "learning_rate": 1.5653095982602543e-05, "loss": 0.7453, "step": 1197500 }, { "epoch": 2.0616747062801917, "grad_norm": 2.1571593284606934, "learning_rate": 1.563875489533014e-05, "loss": 0.7467, "step": 1198000 }, { "epoch": 2.0625351715165356, "grad_norm": 2.022718667984009, "learning_rate": 1.562441380805774e-05, "loss": 0.7401, "step": 1198500 }, { "epoch": 2.0633956367528796, "grad_norm": 2.2326862812042236, "learning_rate": 1.561007272078534e-05, "loss": 0.7498, "step": 1199000 }, { "epoch": 2.0642561019892236, "grad_norm": 2.4144537448883057, "learning_rate": 1.559573163351294e-05, "loss": 0.7467, "step": 1199500 }, { "epoch": 2.0651165672255676, "grad_norm": 2.1190388202667236, "learning_rate": 1.5581390546240542e-05, "loss": 0.7462, "step": 1200000 }, { "epoch": 2.0659770324619116, "grad_norm": 2.1224961280822754, "learning_rate": 1.5567049458968142e-05, "loss": 0.7461, "step": 1200500 }, { "epoch": 2.0668374976982555, "grad_norm": 2.1967227458953857, "learning_rate": 1.5552708371695743e-05, "loss": 0.7479, "step": 1201000 }, { "epoch": 2.0676979629345995, "grad_norm": 2.357408046722412, "learning_rate": 1.5538367284423343e-05, "loss": 0.7408, "step": 1201500 }, { "epoch": 2.0685584281709435, "grad_norm": 2.196808099746704, "learning_rate": 1.5524026197150944e-05, "loss": 0.7493, "step": 1202000 }, { "epoch": 2.0694188934072875, "grad_norm": 2.0822932720184326, "learning_rate": 1.5509685109878544e-05, "loss": 0.7426, "step": 1202500 }, { "epoch": 2.0702793586436314, "grad_norm": 2.3068885803222656, "learning_rate": 1.549534402260614e-05, "loss": 0.747, "step": 1203000 }, { "epoch": 2.0711398238799754, "grad_norm": 2.0930685997009277, "learning_rate": 1.5481002935333745e-05, "loss": 0.7487, "step": 1203500 }, { "epoch": 2.0720002891163194, "grad_norm": 2.1573619842529297, "learning_rate": 1.5466661848061346e-05, "loss": 0.75, "step": 1204000 }, { "epoch": 2.0728607543526634, "grad_norm": 2.1406896114349365, "learning_rate": 1.5452320760788943e-05, "loss": 0.7539, "step": 1204500 }, { "epoch": 2.0737212195890073, "grad_norm": 2.2356812953948975, "learning_rate": 1.5437979673516547e-05, "loss": 0.7456, "step": 1205000 }, { "epoch": 2.0745816848253513, "grad_norm": 2.211362361907959, "learning_rate": 1.5423638586244144e-05, "loss": 0.7448, "step": 1205500 }, { "epoch": 2.0754421500616953, "grad_norm": 1.9883774518966675, "learning_rate": 1.5409297498971744e-05, "loss": 0.7448, "step": 1206000 }, { "epoch": 2.0763026152980393, "grad_norm": 2.2814600467681885, "learning_rate": 1.5394956411699345e-05, "loss": 0.7479, "step": 1206500 }, { "epoch": 2.0771630805343833, "grad_norm": 1.9190709590911865, "learning_rate": 1.5380615324426945e-05, "loss": 0.7465, "step": 1207000 }, { "epoch": 2.0780235457707272, "grad_norm": 2.15561580657959, "learning_rate": 1.5366274237154546e-05, "loss": 0.7414, "step": 1207500 }, { "epoch": 2.078884011007071, "grad_norm": 2.1482765674591064, "learning_rate": 1.5351933149882146e-05, "loss": 0.7477, "step": 1208000 }, { "epoch": 2.079744476243415, "grad_norm": 2.2311723232269287, "learning_rate": 1.5337592062609747e-05, "loss": 0.7425, "step": 1208500 }, { "epoch": 2.080604941479759, "grad_norm": 2.053049325942993, "learning_rate": 1.5323250975337347e-05, "loss": 0.7523, "step": 1209000 }, { "epoch": 2.081465406716103, "grad_norm": 1.9471055269241333, "learning_rate": 1.5308909888064944e-05, "loss": 0.7486, "step": 1209500 }, { "epoch": 2.082325871952447, "grad_norm": 2.088299512863159, "learning_rate": 1.5294568800792548e-05, "loss": 0.7417, "step": 1210000 }, { "epoch": 2.083186337188791, "grad_norm": 2.083714008331299, "learning_rate": 1.5280227713520145e-05, "loss": 0.7433, "step": 1210500 }, { "epoch": 2.084046802425135, "grad_norm": 2.0353481769561768, "learning_rate": 1.5265886626247745e-05, "loss": 0.7512, "step": 1211000 }, { "epoch": 2.084907267661479, "grad_norm": 2.2791249752044678, "learning_rate": 1.5251545538975348e-05, "loss": 0.7446, "step": 1211500 }, { "epoch": 2.085767732897823, "grad_norm": 2.114086627960205, "learning_rate": 1.5237204451702946e-05, "loss": 0.7454, "step": 1212000 }, { "epoch": 2.086628198134167, "grad_norm": 2.1683554649353027, "learning_rate": 1.5222863364430549e-05, "loss": 0.7456, "step": 1212500 }, { "epoch": 2.087488663370511, "grad_norm": 2.1287851333618164, "learning_rate": 1.5208522277158149e-05, "loss": 0.7474, "step": 1213000 }, { "epoch": 2.088349128606855, "grad_norm": 2.2126824855804443, "learning_rate": 1.5194181189885748e-05, "loss": 0.7477, "step": 1213500 }, { "epoch": 2.089209593843199, "grad_norm": 2.0747759342193604, "learning_rate": 1.5179840102613348e-05, "loss": 0.7482, "step": 1214000 }, { "epoch": 2.090070059079543, "grad_norm": 2.1446080207824707, "learning_rate": 1.5165499015340947e-05, "loss": 0.7426, "step": 1214500 }, { "epoch": 2.0909305243158873, "grad_norm": 2.1761319637298584, "learning_rate": 1.515115792806855e-05, "loss": 0.7485, "step": 1215000 }, { "epoch": 2.0917909895522313, "grad_norm": 2.060673236846924, "learning_rate": 1.513681684079615e-05, "loss": 0.7415, "step": 1215500 }, { "epoch": 2.0926514547885753, "grad_norm": 2.208153009414673, "learning_rate": 1.5122475753523749e-05, "loss": 0.7501, "step": 1216000 }, { "epoch": 2.0935119200249193, "grad_norm": 2.4316937923431396, "learning_rate": 1.510813466625135e-05, "loss": 0.7481, "step": 1216500 }, { "epoch": 2.0943723852612632, "grad_norm": 2.358034133911133, "learning_rate": 1.509379357897895e-05, "loss": 0.7501, "step": 1217000 }, { "epoch": 2.095232850497607, "grad_norm": 2.3785462379455566, "learning_rate": 1.507945249170655e-05, "loss": 0.7485, "step": 1217500 }, { "epoch": 2.096093315733951, "grad_norm": 2.0434811115264893, "learning_rate": 1.5065111404434152e-05, "loss": 0.7431, "step": 1218000 }, { "epoch": 2.096953780970295, "grad_norm": 2.2284770011901855, "learning_rate": 1.505077031716175e-05, "loss": 0.7461, "step": 1218500 }, { "epoch": 2.097814246206639, "grad_norm": 2.4026939868927, "learning_rate": 1.5036429229889351e-05, "loss": 0.7492, "step": 1219000 }, { "epoch": 2.098674711442983, "grad_norm": 1.9877442121505737, "learning_rate": 1.502208814261695e-05, "loss": 0.7471, "step": 1219500 }, { "epoch": 2.099535176679327, "grad_norm": 2.2232022285461426, "learning_rate": 1.500774705534455e-05, "loss": 0.7472, "step": 1220000 }, { "epoch": 2.100395641915671, "grad_norm": 2.2752528190612793, "learning_rate": 1.4993405968072153e-05, "loss": 0.748, "step": 1220500 }, { "epoch": 2.101256107152015, "grad_norm": 2.3126862049102783, "learning_rate": 1.4979064880799752e-05, "loss": 0.7516, "step": 1221000 }, { "epoch": 2.102116572388359, "grad_norm": 2.171881914138794, "learning_rate": 1.4964723793527352e-05, "loss": 0.7539, "step": 1221500 }, { "epoch": 2.102977037624703, "grad_norm": 2.145996570587158, "learning_rate": 1.495038270625495e-05, "loss": 0.7455, "step": 1222000 }, { "epoch": 2.103837502861047, "grad_norm": 2.396362781524658, "learning_rate": 1.4936041618982551e-05, "loss": 0.7477, "step": 1222500 }, { "epoch": 2.104697968097391, "grad_norm": 2.1479058265686035, "learning_rate": 1.4921700531710153e-05, "loss": 0.7431, "step": 1223000 }, { "epoch": 2.105558433333735, "grad_norm": 2.0262203216552734, "learning_rate": 1.4907359444437752e-05, "loss": 0.7443, "step": 1223500 }, { "epoch": 2.106418898570079, "grad_norm": 2.2727341651916504, "learning_rate": 1.4893018357165353e-05, "loss": 0.7525, "step": 1224000 }, { "epoch": 2.107279363806423, "grad_norm": 2.1716670989990234, "learning_rate": 1.4878677269892955e-05, "loss": 0.7483, "step": 1224500 }, { "epoch": 2.108139829042767, "grad_norm": 2.2675817012786865, "learning_rate": 1.4864336182620554e-05, "loss": 0.7395, "step": 1225000 }, { "epoch": 2.109000294279111, "grad_norm": 2.3223788738250732, "learning_rate": 1.4849995095348154e-05, "loss": 0.7452, "step": 1225500 }, { "epoch": 2.109860759515455, "grad_norm": 2.005021572113037, "learning_rate": 1.4835654008075753e-05, "loss": 0.7466, "step": 1226000 }, { "epoch": 2.110721224751799, "grad_norm": 2.136507749557495, "learning_rate": 1.4821312920803353e-05, "loss": 0.7549, "step": 1226500 }, { "epoch": 2.1115816899881428, "grad_norm": 2.095691442489624, "learning_rate": 1.4806971833530955e-05, "loss": 0.7451, "step": 1227000 }, { "epoch": 2.1124421552244868, "grad_norm": 2.3151662349700928, "learning_rate": 1.4792630746258554e-05, "loss": 0.7487, "step": 1227500 }, { "epoch": 2.1133026204608307, "grad_norm": 1.9382226467132568, "learning_rate": 1.4778289658986155e-05, "loss": 0.743, "step": 1228000 }, { "epoch": 2.1141630856971747, "grad_norm": 2.3391473293304443, "learning_rate": 1.4763948571713753e-05, "loss": 0.7511, "step": 1228500 }, { "epoch": 2.1150235509335187, "grad_norm": 2.2001867294311523, "learning_rate": 1.4749607484441356e-05, "loss": 0.7451, "step": 1229000 }, { "epoch": 2.1158840161698627, "grad_norm": 2.054943084716797, "learning_rate": 1.4735266397168956e-05, "loss": 0.744, "step": 1229500 }, { "epoch": 2.1167444814062066, "grad_norm": 2.11088490486145, "learning_rate": 1.4720925309896555e-05, "loss": 0.7486, "step": 1230000 }, { "epoch": 2.1176049466425506, "grad_norm": 2.307340383529663, "learning_rate": 1.4706584222624157e-05, "loss": 0.7465, "step": 1230500 }, { "epoch": 2.1184654118788946, "grad_norm": 2.4765310287475586, "learning_rate": 1.4692243135351754e-05, "loss": 0.7438, "step": 1231000 }, { "epoch": 2.1193258771152386, "grad_norm": 2.0979506969451904, "learning_rate": 1.4677902048079356e-05, "loss": 0.7455, "step": 1231500 }, { "epoch": 2.1201863423515825, "grad_norm": 2.199089765548706, "learning_rate": 1.4663560960806957e-05, "loss": 0.7432, "step": 1232000 }, { "epoch": 2.1210468075879265, "grad_norm": 2.230161428451538, "learning_rate": 1.4649219873534556e-05, "loss": 0.7446, "step": 1232500 }, { "epoch": 2.1219072728242705, "grad_norm": 2.351138114929199, "learning_rate": 1.4634878786262158e-05, "loss": 0.7419, "step": 1233000 }, { "epoch": 2.1227677380606145, "grad_norm": 2.1402597427368164, "learning_rate": 1.4620537698989757e-05, "loss": 0.7454, "step": 1233500 }, { "epoch": 2.1236282032969584, "grad_norm": 2.3817059993743896, "learning_rate": 1.4606196611717357e-05, "loss": 0.7449, "step": 1234000 }, { "epoch": 2.1244886685333024, "grad_norm": 2.16794753074646, "learning_rate": 1.4591855524444959e-05, "loss": 0.7396, "step": 1234500 }, { "epoch": 2.1253491337696464, "grad_norm": 2.18027925491333, "learning_rate": 1.4577514437172556e-05, "loss": 0.7466, "step": 1235000 }, { "epoch": 2.1262095990059904, "grad_norm": 2.0430901050567627, "learning_rate": 1.4563173349900158e-05, "loss": 0.7452, "step": 1235500 }, { "epoch": 2.1270700642423344, "grad_norm": 2.072225332260132, "learning_rate": 1.4548832262627759e-05, "loss": 0.7422, "step": 1236000 }, { "epoch": 2.1279305294786783, "grad_norm": 2.125211715698242, "learning_rate": 1.4534491175355358e-05, "loss": 0.7458, "step": 1236500 }, { "epoch": 2.1287909947150223, "grad_norm": 2.3344390392303467, "learning_rate": 1.452015008808296e-05, "loss": 0.7456, "step": 1237000 }, { "epoch": 2.1296514599513667, "grad_norm": 2.0346381664276123, "learning_rate": 1.4505809000810559e-05, "loss": 0.7432, "step": 1237500 }, { "epoch": 2.1305119251877107, "grad_norm": 2.1993465423583984, "learning_rate": 1.4491467913538159e-05, "loss": 0.7481, "step": 1238000 }, { "epoch": 2.1313723904240547, "grad_norm": 2.192777633666992, "learning_rate": 1.4477126826265761e-05, "loss": 0.747, "step": 1238500 }, { "epoch": 2.1322328556603987, "grad_norm": 2.124511480331421, "learning_rate": 1.4462785738993358e-05, "loss": 0.7459, "step": 1239000 }, { "epoch": 2.1330933208967426, "grad_norm": 2.1302876472473145, "learning_rate": 1.444844465172096e-05, "loss": 0.7431, "step": 1239500 }, { "epoch": 2.1339537861330866, "grad_norm": 2.170459747314453, "learning_rate": 1.443410356444856e-05, "loss": 0.739, "step": 1240000 }, { "epoch": 2.1348142513694306, "grad_norm": 2.136836528778076, "learning_rate": 1.441976247717616e-05, "loss": 0.7445, "step": 1240500 }, { "epoch": 2.1356747166057746, "grad_norm": 2.169245719909668, "learning_rate": 1.4405421389903762e-05, "loss": 0.7391, "step": 1241000 }, { "epoch": 2.1365351818421185, "grad_norm": 2.053624391555786, "learning_rate": 1.439108030263136e-05, "loss": 0.7415, "step": 1241500 }, { "epoch": 2.1373956470784625, "grad_norm": 2.225248336791992, "learning_rate": 1.4376739215358961e-05, "loss": 0.7459, "step": 1242000 }, { "epoch": 2.1382561123148065, "grad_norm": 2.267460584640503, "learning_rate": 1.436239812808656e-05, "loss": 0.7454, "step": 1242500 }, { "epoch": 2.1391165775511505, "grad_norm": 2.054811716079712, "learning_rate": 1.4348057040814162e-05, "loss": 0.7435, "step": 1243000 }, { "epoch": 2.1399770427874945, "grad_norm": 2.2870633602142334, "learning_rate": 1.4333715953541763e-05, "loss": 0.7454, "step": 1243500 }, { "epoch": 2.1408375080238384, "grad_norm": 2.0539770126342773, "learning_rate": 1.4319374866269361e-05, "loss": 0.7471, "step": 1244000 }, { "epoch": 2.1416979732601824, "grad_norm": 2.073671817779541, "learning_rate": 1.4305033778996962e-05, "loss": 0.7416, "step": 1244500 }, { "epoch": 2.1425584384965264, "grad_norm": 2.143728017807007, "learning_rate": 1.4290692691724564e-05, "loss": 0.7441, "step": 1245000 }, { "epoch": 2.1434189037328704, "grad_norm": 2.2988219261169434, "learning_rate": 1.4276351604452163e-05, "loss": 0.7457, "step": 1245500 }, { "epoch": 2.1442793689692143, "grad_norm": 2.397913694381714, "learning_rate": 1.4262010517179763e-05, "loss": 0.7459, "step": 1246000 }, { "epoch": 2.1451398342055583, "grad_norm": 2.177351474761963, "learning_rate": 1.4247669429907362e-05, "loss": 0.7409, "step": 1246500 }, { "epoch": 2.1460002994419023, "grad_norm": 2.097346305847168, "learning_rate": 1.4233328342634964e-05, "loss": 0.737, "step": 1247000 }, { "epoch": 2.1468607646782463, "grad_norm": 2.0776567459106445, "learning_rate": 1.4218987255362565e-05, "loss": 0.738, "step": 1247500 }, { "epoch": 2.1477212299145902, "grad_norm": 2.189054250717163, "learning_rate": 1.4204646168090163e-05, "loss": 0.7446, "step": 1248000 }, { "epoch": 2.1485816951509342, "grad_norm": 2.0094523429870605, "learning_rate": 1.4190305080817764e-05, "loss": 0.7351, "step": 1248500 }, { "epoch": 2.149442160387278, "grad_norm": 2.0026373863220215, "learning_rate": 1.4175963993545363e-05, "loss": 0.7406, "step": 1249000 }, { "epoch": 2.150302625623622, "grad_norm": 2.2804622650146484, "learning_rate": 1.4161622906272965e-05, "loss": 0.7421, "step": 1249500 }, { "epoch": 2.151163090859966, "grad_norm": 2.289987325668335, "learning_rate": 1.4147281819000565e-05, "loss": 0.745, "step": 1250000 }, { "epoch": 2.15202355609631, "grad_norm": 2.009991407394409, "learning_rate": 1.4132940731728164e-05, "loss": 0.7475, "step": 1250500 }, { "epoch": 2.152884021332654, "grad_norm": 2.136585235595703, "learning_rate": 1.4118599644455766e-05, "loss": 0.7396, "step": 1251000 }, { "epoch": 2.153744486568998, "grad_norm": 2.2174477577209473, "learning_rate": 1.4104258557183363e-05, "loss": 0.7403, "step": 1251500 }, { "epoch": 2.154604951805342, "grad_norm": 2.1195108890533447, "learning_rate": 1.4089917469910965e-05, "loss": 0.7475, "step": 1252000 }, { "epoch": 2.155465417041686, "grad_norm": 2.339773178100586, "learning_rate": 1.4075576382638566e-05, "loss": 0.7345, "step": 1252500 }, { "epoch": 2.15632588227803, "grad_norm": 2.186708450317383, "learning_rate": 1.4061235295366165e-05, "loss": 0.7432, "step": 1253000 }, { "epoch": 2.157186347514374, "grad_norm": 2.0798115730285645, "learning_rate": 1.4046894208093767e-05, "loss": 0.7439, "step": 1253500 }, { "epoch": 2.158046812750718, "grad_norm": 2.256768226623535, "learning_rate": 1.4032553120821366e-05, "loss": 0.7389, "step": 1254000 }, { "epoch": 2.158907277987062, "grad_norm": 2.122495174407959, "learning_rate": 1.4018212033548966e-05, "loss": 0.7401, "step": 1254500 }, { "epoch": 2.159767743223406, "grad_norm": 2.2577829360961914, "learning_rate": 1.4003870946276568e-05, "loss": 0.7393, "step": 1255000 }, { "epoch": 2.16062820845975, "grad_norm": 2.040308952331543, "learning_rate": 1.3989529859004167e-05, "loss": 0.7419, "step": 1255500 }, { "epoch": 2.161488673696094, "grad_norm": 2.253164291381836, "learning_rate": 1.3975188771731768e-05, "loss": 0.7388, "step": 1256000 }, { "epoch": 2.162349138932438, "grad_norm": 2.166598320007324, "learning_rate": 1.396084768445937e-05, "loss": 0.743, "step": 1256500 }, { "epoch": 2.163209604168782, "grad_norm": 2.025723934173584, "learning_rate": 1.3946506597186967e-05, "loss": 0.7392, "step": 1257000 }, { "epoch": 2.164070069405126, "grad_norm": 2.3602654933929443, "learning_rate": 1.3932165509914569e-05, "loss": 0.7418, "step": 1257500 }, { "epoch": 2.16493053464147, "grad_norm": 2.2957653999328613, "learning_rate": 1.3917824422642168e-05, "loss": 0.741, "step": 1258000 }, { "epoch": 2.1657909998778138, "grad_norm": 2.3943967819213867, "learning_rate": 1.3903483335369768e-05, "loss": 0.7481, "step": 1258500 }, { "epoch": 2.1666514651141577, "grad_norm": 2.1326005458831787, "learning_rate": 1.388914224809737e-05, "loss": 0.7392, "step": 1259000 }, { "epoch": 2.1675119303505017, "grad_norm": 2.3093254566192627, "learning_rate": 1.3874801160824969e-05, "loss": 0.7385, "step": 1259500 }, { "epoch": 2.1683723955868457, "grad_norm": 2.1691677570343018, "learning_rate": 1.386046007355257e-05, "loss": 0.7347, "step": 1260000 }, { "epoch": 2.1692328608231897, "grad_norm": 2.205759048461914, "learning_rate": 1.3846118986280168e-05, "loss": 0.737, "step": 1260500 }, { "epoch": 2.1700933260595336, "grad_norm": 2.23650860786438, "learning_rate": 1.3831777899007769e-05, "loss": 0.7378, "step": 1261000 }, { "epoch": 2.1709537912958776, "grad_norm": 2.0677170753479004, "learning_rate": 1.3817436811735371e-05, "loss": 0.7417, "step": 1261500 }, { "epoch": 2.171814256532222, "grad_norm": 1.9268617630004883, "learning_rate": 1.380309572446297e-05, "loss": 0.7399, "step": 1262000 }, { "epoch": 2.172674721768566, "grad_norm": 2.105142831802368, "learning_rate": 1.378875463719057e-05, "loss": 0.7337, "step": 1262500 }, { "epoch": 2.17353518700491, "grad_norm": 2.2743546962738037, "learning_rate": 1.3774413549918169e-05, "loss": 0.7447, "step": 1263000 }, { "epoch": 2.174395652241254, "grad_norm": 2.34230375289917, "learning_rate": 1.3760072462645771e-05, "loss": 0.7413, "step": 1263500 }, { "epoch": 2.175256117477598, "grad_norm": 2.048245668411255, "learning_rate": 1.3745731375373372e-05, "loss": 0.7435, "step": 1264000 }, { "epoch": 2.176116582713942, "grad_norm": 2.1016547679901123, "learning_rate": 1.373139028810097e-05, "loss": 0.7411, "step": 1264500 }, { "epoch": 2.176977047950286, "grad_norm": 1.9493845701217651, "learning_rate": 1.3717049200828571e-05, "loss": 0.7403, "step": 1265000 }, { "epoch": 2.17783751318663, "grad_norm": 2.23221755027771, "learning_rate": 1.370270811355617e-05, "loss": 0.7385, "step": 1265500 }, { "epoch": 2.178697978422974, "grad_norm": 2.298320770263672, "learning_rate": 1.3688367026283772e-05, "loss": 0.7458, "step": 1266000 }, { "epoch": 2.179558443659318, "grad_norm": 2.093395709991455, "learning_rate": 1.3674025939011372e-05, "loss": 0.7366, "step": 1266500 }, { "epoch": 2.180418908895662, "grad_norm": 2.1902594566345215, "learning_rate": 1.3659684851738971e-05, "loss": 0.7397, "step": 1267000 }, { "epoch": 2.181279374132006, "grad_norm": 2.1553637981414795, "learning_rate": 1.3645343764466573e-05, "loss": 0.7438, "step": 1267500 }, { "epoch": 2.1821398393683498, "grad_norm": 2.153245449066162, "learning_rate": 1.3631002677194174e-05, "loss": 0.7415, "step": 1268000 }, { "epoch": 2.1830003046046937, "grad_norm": 1.9978289604187012, "learning_rate": 1.3616661589921772e-05, "loss": 0.7356, "step": 1268500 }, { "epoch": 2.1838607698410377, "grad_norm": 2.105203628540039, "learning_rate": 1.3602320502649375e-05, "loss": 0.7415, "step": 1269000 }, { "epoch": 2.1847212350773817, "grad_norm": 2.2339985370635986, "learning_rate": 1.3587979415376972e-05, "loss": 0.7338, "step": 1269500 }, { "epoch": 2.1855817003137257, "grad_norm": 2.240745782852173, "learning_rate": 1.3573638328104574e-05, "loss": 0.7433, "step": 1270000 }, { "epoch": 2.1864421655500696, "grad_norm": 2.2821898460388184, "learning_rate": 1.3559297240832174e-05, "loss": 0.7429, "step": 1270500 }, { "epoch": 2.1873026307864136, "grad_norm": 2.1928908824920654, "learning_rate": 1.3544956153559773e-05, "loss": 0.7424, "step": 1271000 }, { "epoch": 2.1881630960227576, "grad_norm": 2.1943445205688477, "learning_rate": 1.3530615066287375e-05, "loss": 0.743, "step": 1271500 }, { "epoch": 2.1890235612591016, "grad_norm": 2.0637905597686768, "learning_rate": 1.3516273979014974e-05, "loss": 0.7384, "step": 1272000 }, { "epoch": 2.1898840264954456, "grad_norm": 2.1010494232177734, "learning_rate": 1.3501932891742575e-05, "loss": 0.7392, "step": 1272500 }, { "epoch": 2.1907444917317895, "grad_norm": 1.9924651384353638, "learning_rate": 1.3487591804470177e-05, "loss": 0.7347, "step": 1273000 }, { "epoch": 2.1916049569681335, "grad_norm": 2.502352476119995, "learning_rate": 1.3473250717197774e-05, "loss": 0.7418, "step": 1273500 }, { "epoch": 2.1924654222044775, "grad_norm": 2.1891558170318604, "learning_rate": 1.3458909629925376e-05, "loss": 0.738, "step": 1274000 }, { "epoch": 2.1933258874408215, "grad_norm": 2.170668601989746, "learning_rate": 1.3444568542652975e-05, "loss": 0.7384, "step": 1274500 }, { "epoch": 2.1941863526771654, "grad_norm": 2.1300723552703857, "learning_rate": 1.3430227455380575e-05, "loss": 0.7431, "step": 1275000 }, { "epoch": 2.1950468179135094, "grad_norm": 2.207932472229004, "learning_rate": 1.3415886368108177e-05, "loss": 0.7381, "step": 1275500 }, { "epoch": 2.1959072831498534, "grad_norm": 2.3557584285736084, "learning_rate": 1.3401545280835776e-05, "loss": 0.7448, "step": 1276000 }, { "epoch": 2.1967677483861974, "grad_norm": 2.1110777854919434, "learning_rate": 1.3387204193563377e-05, "loss": 0.7396, "step": 1276500 }, { "epoch": 2.1976282136225413, "grad_norm": 2.0678913593292236, "learning_rate": 1.3372863106290975e-05, "loss": 0.7408, "step": 1277000 }, { "epoch": 2.1984886788588853, "grad_norm": 2.3854904174804688, "learning_rate": 1.3358522019018576e-05, "loss": 0.7485, "step": 1277500 }, { "epoch": 2.1993491440952293, "grad_norm": 2.1214091777801514, "learning_rate": 1.3344180931746178e-05, "loss": 0.7409, "step": 1278000 }, { "epoch": 2.2002096093315733, "grad_norm": 2.177196741104126, "learning_rate": 1.3329839844473777e-05, "loss": 0.748, "step": 1278500 }, { "epoch": 2.2010700745679173, "grad_norm": 2.102724552154541, "learning_rate": 1.3315498757201377e-05, "loss": 0.7416, "step": 1279000 }, { "epoch": 2.2019305398042612, "grad_norm": 2.392582893371582, "learning_rate": 1.330115766992898e-05, "loss": 0.7376, "step": 1279500 }, { "epoch": 2.202791005040605, "grad_norm": 2.1870553493499756, "learning_rate": 1.3286816582656578e-05, "loss": 0.7355, "step": 1280000 }, { "epoch": 2.203651470276949, "grad_norm": 2.246654987335205, "learning_rate": 1.3272475495384179e-05, "loss": 0.7366, "step": 1280500 }, { "epoch": 2.204511935513293, "grad_norm": 2.104764461517334, "learning_rate": 1.3258134408111777e-05, "loss": 0.7377, "step": 1281000 }, { "epoch": 2.205372400749637, "grad_norm": 2.2333292961120605, "learning_rate": 1.324379332083938e-05, "loss": 0.7355, "step": 1281500 }, { "epoch": 2.206232865985981, "grad_norm": 2.2619197368621826, "learning_rate": 1.322945223356698e-05, "loss": 0.7392, "step": 1282000 }, { "epoch": 2.207093331222325, "grad_norm": 2.268139600753784, "learning_rate": 1.3215111146294579e-05, "loss": 0.7459, "step": 1282500 }, { "epoch": 2.207953796458669, "grad_norm": 2.0993845462799072, "learning_rate": 1.320077005902218e-05, "loss": 0.7407, "step": 1283000 }, { "epoch": 2.2088142616950135, "grad_norm": 2.396580219268799, "learning_rate": 1.3186428971749778e-05, "loss": 0.7344, "step": 1283500 }, { "epoch": 2.2096747269313575, "grad_norm": 2.1309633255004883, "learning_rate": 1.317208788447738e-05, "loss": 0.736, "step": 1284000 }, { "epoch": 2.2105351921677014, "grad_norm": 2.0994322299957275, "learning_rate": 1.315774679720498e-05, "loss": 0.7407, "step": 1284500 }, { "epoch": 2.2113956574040454, "grad_norm": 2.230180025100708, "learning_rate": 1.314340570993258e-05, "loss": 0.7395, "step": 1285000 }, { "epoch": 2.2122561226403894, "grad_norm": 2.205920457839966, "learning_rate": 1.3129064622660182e-05, "loss": 0.7408, "step": 1285500 }, { "epoch": 2.2131165878767334, "grad_norm": 2.1964640617370605, "learning_rate": 1.3114723535387779e-05, "loss": 0.7378, "step": 1286000 }, { "epoch": 2.2139770531130774, "grad_norm": 2.0684380531311035, "learning_rate": 1.3100382448115381e-05, "loss": 0.7337, "step": 1286500 }, { "epoch": 2.2148375183494213, "grad_norm": 2.1438052654266357, "learning_rate": 1.3086041360842981e-05, "loss": 0.735, "step": 1287000 }, { "epoch": 2.2156979835857653, "grad_norm": 2.076854944229126, "learning_rate": 1.307170027357058e-05, "loss": 0.743, "step": 1287500 }, { "epoch": 2.2165584488221093, "grad_norm": 2.2474782466888428, "learning_rate": 1.3057359186298182e-05, "loss": 0.7349, "step": 1288000 }, { "epoch": 2.2174189140584533, "grad_norm": 2.067883253097534, "learning_rate": 1.3043018099025781e-05, "loss": 0.7308, "step": 1288500 }, { "epoch": 2.2182793792947972, "grad_norm": 2.1735494136810303, "learning_rate": 1.3028677011753382e-05, "loss": 0.7308, "step": 1289000 }, { "epoch": 2.219139844531141, "grad_norm": 1.9676673412322998, "learning_rate": 1.3014335924480984e-05, "loss": 0.7366, "step": 1289500 }, { "epoch": 2.220000309767485, "grad_norm": 2.2277417182922363, "learning_rate": 1.2999994837208581e-05, "loss": 0.7395, "step": 1290000 }, { "epoch": 2.220860775003829, "grad_norm": 2.261632204055786, "learning_rate": 1.2985653749936183e-05, "loss": 0.738, "step": 1290500 }, { "epoch": 2.221721240240173, "grad_norm": 2.178764820098877, "learning_rate": 1.2971312662663783e-05, "loss": 0.7406, "step": 1291000 }, { "epoch": 2.222581705476517, "grad_norm": 2.2707815170288086, "learning_rate": 1.2956971575391382e-05, "loss": 0.735, "step": 1291500 }, { "epoch": 2.223442170712861, "grad_norm": 2.0521087646484375, "learning_rate": 1.2942630488118984e-05, "loss": 0.7375, "step": 1292000 }, { "epoch": 2.224302635949205, "grad_norm": 2.2074644565582275, "learning_rate": 1.2928289400846583e-05, "loss": 0.7376, "step": 1292500 }, { "epoch": 2.225163101185549, "grad_norm": 2.157623291015625, "learning_rate": 1.2913948313574184e-05, "loss": 0.7285, "step": 1293000 }, { "epoch": 2.226023566421893, "grad_norm": 2.132406711578369, "learning_rate": 1.2899607226301786e-05, "loss": 0.7356, "step": 1293500 }, { "epoch": 2.226884031658237, "grad_norm": 2.0070013999938965, "learning_rate": 1.2885266139029385e-05, "loss": 0.7315, "step": 1294000 }, { "epoch": 2.227744496894581, "grad_norm": 2.2132139205932617, "learning_rate": 1.2870925051756985e-05, "loss": 0.7411, "step": 1294500 }, { "epoch": 2.228604962130925, "grad_norm": 2.1949164867401123, "learning_rate": 1.2856583964484584e-05, "loss": 0.7316, "step": 1295000 }, { "epoch": 2.229465427367269, "grad_norm": 2.2194552421569824, "learning_rate": 1.2842242877212184e-05, "loss": 0.7381, "step": 1295500 }, { "epoch": 2.230325892603613, "grad_norm": 2.1448280811309814, "learning_rate": 1.2827901789939787e-05, "loss": 0.7388, "step": 1296000 }, { "epoch": 2.231186357839957, "grad_norm": 2.047916889190674, "learning_rate": 1.2813560702667385e-05, "loss": 0.7358, "step": 1296500 }, { "epoch": 2.232046823076301, "grad_norm": 2.1904945373535156, "learning_rate": 1.2799219615394986e-05, "loss": 0.7396, "step": 1297000 }, { "epoch": 2.232907288312645, "grad_norm": 2.256561040878296, "learning_rate": 1.2784878528122585e-05, "loss": 0.7396, "step": 1297500 }, { "epoch": 2.233767753548989, "grad_norm": 2.319228172302246, "learning_rate": 1.2770537440850187e-05, "loss": 0.7398, "step": 1298000 }, { "epoch": 2.234628218785333, "grad_norm": 2.2488300800323486, "learning_rate": 1.2756196353577787e-05, "loss": 0.7365, "step": 1298500 }, { "epoch": 2.2354886840216768, "grad_norm": 2.3413503170013428, "learning_rate": 1.2741855266305386e-05, "loss": 0.7369, "step": 1299000 }, { "epoch": 2.2363491492580208, "grad_norm": 2.1444482803344727, "learning_rate": 1.2727514179032986e-05, "loss": 0.7332, "step": 1299500 }, { "epoch": 2.2372096144943647, "grad_norm": 1.9140009880065918, "learning_rate": 1.2713173091760585e-05, "loss": 0.738, "step": 1300000 }, { "epoch": 2.2380700797307087, "grad_norm": 1.9779835939407349, "learning_rate": 1.2698832004488187e-05, "loss": 0.7373, "step": 1300500 }, { "epoch": 2.2389305449670527, "grad_norm": 2.290299415588379, "learning_rate": 1.2684490917215788e-05, "loss": 0.7372, "step": 1301000 }, { "epoch": 2.2397910102033967, "grad_norm": 2.2834126949310303, "learning_rate": 1.2670149829943387e-05, "loss": 0.7294, "step": 1301500 }, { "epoch": 2.2406514754397406, "grad_norm": 2.0117619037628174, "learning_rate": 1.2655808742670989e-05, "loss": 0.7354, "step": 1302000 }, { "epoch": 2.2415119406760846, "grad_norm": 2.1566827297210693, "learning_rate": 1.264146765539859e-05, "loss": 0.734, "step": 1302500 }, { "epoch": 2.2423724059124286, "grad_norm": 1.8935664892196655, "learning_rate": 1.2627126568126188e-05, "loss": 0.7403, "step": 1303000 }, { "epoch": 2.2432328711487726, "grad_norm": 2.1429378986358643, "learning_rate": 1.2612785480853788e-05, "loss": 0.7323, "step": 1303500 }, { "epoch": 2.2440933363851165, "grad_norm": 2.3545150756835938, "learning_rate": 1.2598444393581387e-05, "loss": 0.7439, "step": 1304000 }, { "epoch": 2.2449538016214605, "grad_norm": 2.245781660079956, "learning_rate": 1.258410330630899e-05, "loss": 0.7394, "step": 1304500 }, { "epoch": 2.2458142668578045, "grad_norm": 2.2469215393066406, "learning_rate": 1.256976221903659e-05, "loss": 0.7403, "step": 1305000 }, { "epoch": 2.2466747320941485, "grad_norm": 2.2666895389556885, "learning_rate": 1.2555421131764189e-05, "loss": 0.7347, "step": 1305500 }, { "epoch": 2.2475351973304925, "grad_norm": 2.2507004737854004, "learning_rate": 1.254108004449179e-05, "loss": 0.7326, "step": 1306000 }, { "epoch": 2.2483956625668364, "grad_norm": 2.1795663833618164, "learning_rate": 1.252673895721939e-05, "loss": 0.7323, "step": 1306500 }, { "epoch": 2.2492561278031804, "grad_norm": 2.247745990753174, "learning_rate": 1.251239786994699e-05, "loss": 0.7365, "step": 1307000 }, { "epoch": 2.2501165930395244, "grad_norm": 2.1461105346679688, "learning_rate": 1.249805678267459e-05, "loss": 0.7374, "step": 1307500 }, { "epoch": 2.2509770582758684, "grad_norm": 1.8984283208847046, "learning_rate": 1.248371569540219e-05, "loss": 0.7335, "step": 1308000 }, { "epoch": 2.2518375235122123, "grad_norm": 2.113375186920166, "learning_rate": 1.2469374608129791e-05, "loss": 0.7356, "step": 1308500 }, { "epoch": 2.2526979887485568, "grad_norm": 2.187218427658081, "learning_rate": 1.2455033520857392e-05, "loss": 0.7368, "step": 1309000 }, { "epoch": 2.2535584539849007, "grad_norm": 2.347452402114868, "learning_rate": 1.244069243358499e-05, "loss": 0.7382, "step": 1309500 }, { "epoch": 2.2544189192212447, "grad_norm": 2.2602875232696533, "learning_rate": 1.2426351346312591e-05, "loss": 0.7421, "step": 1310000 }, { "epoch": 2.2552793844575887, "grad_norm": 2.1275899410247803, "learning_rate": 1.2412010259040192e-05, "loss": 0.7339, "step": 1310500 }, { "epoch": 2.2561398496939327, "grad_norm": 2.11209774017334, "learning_rate": 1.2397669171767792e-05, "loss": 0.7365, "step": 1311000 }, { "epoch": 2.2570003149302766, "grad_norm": 2.1969621181488037, "learning_rate": 1.2383328084495393e-05, "loss": 0.739, "step": 1311500 }, { "epoch": 2.2578607801666206, "grad_norm": 2.293888807296753, "learning_rate": 1.2368986997222991e-05, "loss": 0.7302, "step": 1312000 }, { "epoch": 2.2587212454029646, "grad_norm": 2.102910280227661, "learning_rate": 1.2354645909950592e-05, "loss": 0.7342, "step": 1312500 }, { "epoch": 2.2595817106393086, "grad_norm": 2.209803342819214, "learning_rate": 1.2340304822678194e-05, "loss": 0.7357, "step": 1313000 }, { "epoch": 2.2604421758756525, "grad_norm": 2.1391923427581787, "learning_rate": 1.2325963735405793e-05, "loss": 0.7366, "step": 1313500 }, { "epoch": 2.2613026411119965, "grad_norm": 2.047585964202881, "learning_rate": 1.2311622648133393e-05, "loss": 0.7455, "step": 1314000 }, { "epoch": 2.2621631063483405, "grad_norm": 2.154750347137451, "learning_rate": 1.2297281560860994e-05, "loss": 0.7318, "step": 1314500 }, { "epoch": 2.2630235715846845, "grad_norm": 2.435215473175049, "learning_rate": 1.2282940473588594e-05, "loss": 0.7387, "step": 1315000 }, { "epoch": 2.2638840368210285, "grad_norm": 2.0441153049468994, "learning_rate": 1.2268599386316195e-05, "loss": 0.7321, "step": 1315500 }, { "epoch": 2.2647445020573724, "grad_norm": 2.128629446029663, "learning_rate": 1.2254258299043793e-05, "loss": 0.7315, "step": 1316000 }, { "epoch": 2.2656049672937164, "grad_norm": 2.077057123184204, "learning_rate": 1.2239917211771394e-05, "loss": 0.7375, "step": 1316500 }, { "epoch": 2.2664654325300604, "grad_norm": 2.0758187770843506, "learning_rate": 1.2225576124498994e-05, "loss": 0.735, "step": 1317000 }, { "epoch": 2.2673258977664044, "grad_norm": 1.947325587272644, "learning_rate": 1.2211235037226595e-05, "loss": 0.7337, "step": 1317500 }, { "epoch": 2.2681863630027483, "grad_norm": 2.228104591369629, "learning_rate": 1.2196893949954195e-05, "loss": 0.7334, "step": 1318000 }, { "epoch": 2.2690468282390923, "grad_norm": 2.0669751167297363, "learning_rate": 1.2182552862681796e-05, "loss": 0.7388, "step": 1318500 }, { "epoch": 2.2699072934754363, "grad_norm": 2.1029419898986816, "learning_rate": 1.2168211775409395e-05, "loss": 0.7338, "step": 1319000 }, { "epoch": 2.2707677587117803, "grad_norm": 2.1721904277801514, "learning_rate": 1.2153870688136995e-05, "loss": 0.7384, "step": 1319500 }, { "epoch": 2.2716282239481242, "grad_norm": 2.2310101985931396, "learning_rate": 1.2139529600864597e-05, "loss": 0.7364, "step": 1320000 }, { "epoch": 2.2724886891844682, "grad_norm": 2.108856201171875, "learning_rate": 1.2125188513592196e-05, "loss": 0.7382, "step": 1320500 }, { "epoch": 2.273349154420812, "grad_norm": 2.3644378185272217, "learning_rate": 1.2110847426319796e-05, "loss": 0.7382, "step": 1321000 }, { "epoch": 2.274209619657156, "grad_norm": 2.2770323753356934, "learning_rate": 1.2096506339047397e-05, "loss": 0.7339, "step": 1321500 }, { "epoch": 2.2750700848935, "grad_norm": 2.357496738433838, "learning_rate": 1.2082165251774997e-05, "loss": 0.7354, "step": 1322000 }, { "epoch": 2.275930550129844, "grad_norm": 1.9793803691864014, "learning_rate": 1.2067824164502598e-05, "loss": 0.7329, "step": 1322500 }, { "epoch": 2.276791015366188, "grad_norm": 2.253159284591675, "learning_rate": 1.2053483077230197e-05, "loss": 0.7369, "step": 1323000 }, { "epoch": 2.277651480602532, "grad_norm": 2.310882091522217, "learning_rate": 1.2039141989957797e-05, "loss": 0.7387, "step": 1323500 }, { "epoch": 2.278511945838876, "grad_norm": 2.1455655097961426, "learning_rate": 1.2024800902685398e-05, "loss": 0.7387, "step": 1324000 }, { "epoch": 2.27937241107522, "grad_norm": 2.1674752235412598, "learning_rate": 1.2010459815412998e-05, "loss": 0.7344, "step": 1324500 }, { "epoch": 2.280232876311564, "grad_norm": 2.193643808364868, "learning_rate": 1.1996118728140599e-05, "loss": 0.7328, "step": 1325000 }, { "epoch": 2.281093341547908, "grad_norm": 2.1150364875793457, "learning_rate": 1.1981777640868199e-05, "loss": 0.7379, "step": 1325500 }, { "epoch": 2.281953806784252, "grad_norm": 2.2378151416778564, "learning_rate": 1.1967436553595798e-05, "loss": 0.7305, "step": 1326000 }, { "epoch": 2.282814272020596, "grad_norm": 2.3178298473358154, "learning_rate": 1.19530954663234e-05, "loss": 0.7361, "step": 1326500 }, { "epoch": 2.28367473725694, "grad_norm": 2.16298770904541, "learning_rate": 1.1938754379050999e-05, "loss": 0.7339, "step": 1327000 }, { "epoch": 2.284535202493284, "grad_norm": 2.0893654823303223, "learning_rate": 1.19244132917786e-05, "loss": 0.7347, "step": 1327500 }, { "epoch": 2.285395667729628, "grad_norm": 2.0143134593963623, "learning_rate": 1.19100722045062e-05, "loss": 0.7375, "step": 1328000 }, { "epoch": 2.286256132965972, "grad_norm": 2.1765668392181396, "learning_rate": 1.1895731117233798e-05, "loss": 0.7374, "step": 1328500 }, { "epoch": 2.2871165982023163, "grad_norm": 2.2966361045837402, "learning_rate": 1.18813900299614e-05, "loss": 0.74, "step": 1329000 }, { "epoch": 2.2879770634386603, "grad_norm": 2.144141674041748, "learning_rate": 1.1867048942689001e-05, "loss": 0.7347, "step": 1329500 }, { "epoch": 2.2888375286750042, "grad_norm": 2.3457894325256348, "learning_rate": 1.18527078554166e-05, "loss": 0.7373, "step": 1330000 }, { "epoch": 2.289697993911348, "grad_norm": 2.230165958404541, "learning_rate": 1.18383667681442e-05, "loss": 0.7328, "step": 1330500 }, { "epoch": 2.290558459147692, "grad_norm": 2.159299373626709, "learning_rate": 1.1824025680871802e-05, "loss": 0.7313, "step": 1331000 }, { "epoch": 2.291418924384036, "grad_norm": 2.08553409576416, "learning_rate": 1.1809684593599401e-05, "loss": 0.7438, "step": 1331500 }, { "epoch": 2.29227938962038, "grad_norm": 2.318000078201294, "learning_rate": 1.1795343506327002e-05, "loss": 0.7357, "step": 1332000 }, { "epoch": 2.293139854856724, "grad_norm": 2.38122296333313, "learning_rate": 1.1781002419054602e-05, "loss": 0.7385, "step": 1332500 }, { "epoch": 2.294000320093068, "grad_norm": 2.157275915145874, "learning_rate": 1.1766661331782201e-05, "loss": 0.7319, "step": 1333000 }, { "epoch": 2.294860785329412, "grad_norm": 2.2765510082244873, "learning_rate": 1.1752320244509803e-05, "loss": 0.7366, "step": 1333500 }, { "epoch": 2.295721250565756, "grad_norm": 2.6490728855133057, "learning_rate": 1.1737979157237402e-05, "loss": 0.7373, "step": 1334000 }, { "epoch": 2.2965817158021, "grad_norm": 2.020374298095703, "learning_rate": 1.1723638069965002e-05, "loss": 0.7359, "step": 1334500 }, { "epoch": 2.297442181038444, "grad_norm": 1.8266252279281616, "learning_rate": 1.1709296982692603e-05, "loss": 0.7316, "step": 1335000 }, { "epoch": 2.298302646274788, "grad_norm": 2.322174072265625, "learning_rate": 1.1694955895420202e-05, "loss": 0.7333, "step": 1335500 }, { "epoch": 2.299163111511132, "grad_norm": 2.295354127883911, "learning_rate": 1.1680614808147804e-05, "loss": 0.7354, "step": 1336000 }, { "epoch": 2.300023576747476, "grad_norm": 2.2270147800445557, "learning_rate": 1.1666273720875404e-05, "loss": 0.7409, "step": 1336500 }, { "epoch": 2.30088404198382, "grad_norm": 2.2788033485412598, "learning_rate": 1.1651932633603003e-05, "loss": 0.7389, "step": 1337000 }, { "epoch": 2.301744507220164, "grad_norm": 2.205354690551758, "learning_rate": 1.1637591546330604e-05, "loss": 0.7338, "step": 1337500 }, { "epoch": 2.302604972456508, "grad_norm": 2.1826252937316895, "learning_rate": 1.1623250459058204e-05, "loss": 0.7362, "step": 1338000 }, { "epoch": 2.303465437692852, "grad_norm": 2.2182676792144775, "learning_rate": 1.1608909371785804e-05, "loss": 0.7265, "step": 1338500 }, { "epoch": 2.304325902929196, "grad_norm": 2.089738130569458, "learning_rate": 1.1594568284513405e-05, "loss": 0.7387, "step": 1339000 }, { "epoch": 2.30518636816554, "grad_norm": 2.235210418701172, "learning_rate": 1.1580227197241004e-05, "loss": 0.7343, "step": 1339500 }, { "epoch": 2.3060468334018838, "grad_norm": 2.5968661308288574, "learning_rate": 1.1565886109968604e-05, "loss": 0.7356, "step": 1340000 }, { "epoch": 2.3069072986382277, "grad_norm": 2.1616084575653076, "learning_rate": 1.1551545022696206e-05, "loss": 0.7287, "step": 1340500 }, { "epoch": 2.3077677638745717, "grad_norm": 2.295888662338257, "learning_rate": 1.1537203935423805e-05, "loss": 0.7356, "step": 1341000 }, { "epoch": 2.3086282291109157, "grad_norm": 2.2833211421966553, "learning_rate": 1.1522862848151406e-05, "loss": 0.7337, "step": 1341500 }, { "epoch": 2.3094886943472597, "grad_norm": 2.069758415222168, "learning_rate": 1.1508521760879006e-05, "loss": 0.7403, "step": 1342000 }, { "epoch": 2.3103491595836037, "grad_norm": 2.1476922035217285, "learning_rate": 1.1494180673606607e-05, "loss": 0.7368, "step": 1342500 }, { "epoch": 2.3112096248199476, "grad_norm": 2.3149638175964355, "learning_rate": 1.1479839586334207e-05, "loss": 0.7375, "step": 1343000 }, { "epoch": 2.3120700900562916, "grad_norm": 2.2947609424591064, "learning_rate": 1.1465498499061807e-05, "loss": 0.7343, "step": 1343500 }, { "epoch": 2.3129305552926356, "grad_norm": 2.098161458969116, "learning_rate": 1.1451157411789406e-05, "loss": 0.7311, "step": 1344000 }, { "epoch": 2.3137910205289796, "grad_norm": 2.2303125858306885, "learning_rate": 1.1436816324517007e-05, "loss": 0.7318, "step": 1344500 }, { "epoch": 2.3146514857653235, "grad_norm": 2.2480945587158203, "learning_rate": 1.1422475237244607e-05, "loss": 0.7318, "step": 1345000 }, { "epoch": 2.3155119510016675, "grad_norm": 2.3394484519958496, "learning_rate": 1.1408134149972208e-05, "loss": 0.7283, "step": 1345500 }, { "epoch": 2.3163724162380115, "grad_norm": 2.292473316192627, "learning_rate": 1.1393793062699808e-05, "loss": 0.735, "step": 1346000 }, { "epoch": 2.3172328814743555, "grad_norm": 2.0187926292419434, "learning_rate": 1.1379451975427407e-05, "loss": 0.7329, "step": 1346500 }, { "epoch": 2.3180933467106994, "grad_norm": 2.197573661804199, "learning_rate": 1.1365110888155007e-05, "loss": 0.7372, "step": 1347000 }, { "epoch": 2.3189538119470434, "grad_norm": 2.140190601348877, "learning_rate": 1.135076980088261e-05, "loss": 0.7338, "step": 1347500 }, { "epoch": 2.3198142771833874, "grad_norm": 2.119563102722168, "learning_rate": 1.1336428713610208e-05, "loss": 0.7285, "step": 1348000 }, { "epoch": 2.3206747424197314, "grad_norm": 2.1876964569091797, "learning_rate": 1.1322087626337809e-05, "loss": 0.7345, "step": 1348500 }, { "epoch": 2.3215352076560754, "grad_norm": 2.1912875175476074, "learning_rate": 1.130774653906541e-05, "loss": 0.7311, "step": 1349000 }, { "epoch": 2.3223956728924193, "grad_norm": 2.2343170642852783, "learning_rate": 1.129340545179301e-05, "loss": 0.7377, "step": 1349500 }, { "epoch": 2.3232561381287633, "grad_norm": 1.9957581758499146, "learning_rate": 1.127906436452061e-05, "loss": 0.7314, "step": 1350000 }, { "epoch": 2.3241166033651073, "grad_norm": 2.154148578643799, "learning_rate": 1.1264723277248209e-05, "loss": 0.7268, "step": 1350500 }, { "epoch": 2.3249770686014513, "grad_norm": 2.0333240032196045, "learning_rate": 1.125038218997581e-05, "loss": 0.7324, "step": 1351000 }, { "epoch": 2.3258375338377952, "grad_norm": 1.997631311416626, "learning_rate": 1.123604110270341e-05, "loss": 0.7314, "step": 1351500 }, { "epoch": 2.326697999074139, "grad_norm": 2.216383934020996, "learning_rate": 1.122170001543101e-05, "loss": 0.7327, "step": 1352000 }, { "epoch": 2.327558464310483, "grad_norm": 2.4634883403778076, "learning_rate": 1.1207358928158611e-05, "loss": 0.7299, "step": 1352500 }, { "epoch": 2.328418929546827, "grad_norm": 1.891774296760559, "learning_rate": 1.1193017840886211e-05, "loss": 0.7339, "step": 1353000 }, { "epoch": 2.329279394783171, "grad_norm": 2.076943874359131, "learning_rate": 1.117867675361381e-05, "loss": 0.7351, "step": 1353500 }, { "epoch": 2.330139860019515, "grad_norm": 2.08978533744812, "learning_rate": 1.1164335666341412e-05, "loss": 0.7357, "step": 1354000 }, { "epoch": 2.331000325255859, "grad_norm": 2.38840913772583, "learning_rate": 1.1149994579069011e-05, "loss": 0.7343, "step": 1354500 }, { "epoch": 2.3318607904922035, "grad_norm": 1.997729778289795, "learning_rate": 1.1135653491796612e-05, "loss": 0.7279, "step": 1355000 }, { "epoch": 2.3327212557285475, "grad_norm": 2.2807769775390625, "learning_rate": 1.1121312404524212e-05, "loss": 0.7297, "step": 1355500 }, { "epoch": 2.3335817209648915, "grad_norm": 2.291337251663208, "learning_rate": 1.1106971317251812e-05, "loss": 0.7373, "step": 1356000 }, { "epoch": 2.3344421862012354, "grad_norm": 2.2745909690856934, "learning_rate": 1.1092630229979413e-05, "loss": 0.7381, "step": 1356500 }, { "epoch": 2.3353026514375794, "grad_norm": 2.1283040046691895, "learning_rate": 1.1078289142707013e-05, "loss": 0.7277, "step": 1357000 }, { "epoch": 2.3361631166739234, "grad_norm": 2.2192940711975098, "learning_rate": 1.1063948055434612e-05, "loss": 0.731, "step": 1357500 }, { "epoch": 2.3370235819102674, "grad_norm": 2.1145095825195312, "learning_rate": 1.1049606968162213e-05, "loss": 0.7297, "step": 1358000 }, { "epoch": 2.3378840471466114, "grad_norm": 2.089151382446289, "learning_rate": 1.1035265880889815e-05, "loss": 0.7357, "step": 1358500 }, { "epoch": 2.3387445123829553, "grad_norm": 2.203192710876465, "learning_rate": 1.1020924793617414e-05, "loss": 0.7269, "step": 1359000 }, { "epoch": 2.3396049776192993, "grad_norm": 2.030491352081299, "learning_rate": 1.1006583706345014e-05, "loss": 0.7308, "step": 1359500 }, { "epoch": 2.3404654428556433, "grad_norm": 2.259591817855835, "learning_rate": 1.0992242619072615e-05, "loss": 0.7325, "step": 1360000 }, { "epoch": 2.3413259080919873, "grad_norm": 2.142476797103882, "learning_rate": 1.0977901531800213e-05, "loss": 0.7313, "step": 1360500 }, { "epoch": 2.3421863733283312, "grad_norm": 2.402512311935425, "learning_rate": 1.0963560444527815e-05, "loss": 0.7338, "step": 1361000 }, { "epoch": 2.343046838564675, "grad_norm": 2.097332239151001, "learning_rate": 1.0949219357255414e-05, "loss": 0.7295, "step": 1361500 }, { "epoch": 2.343907303801019, "grad_norm": 2.12955904006958, "learning_rate": 1.0934878269983015e-05, "loss": 0.7269, "step": 1362000 }, { "epoch": 2.344767769037363, "grad_norm": 2.1505727767944336, "learning_rate": 1.0920537182710615e-05, "loss": 0.7332, "step": 1362500 }, { "epoch": 2.345628234273707, "grad_norm": 2.206063985824585, "learning_rate": 1.0906196095438214e-05, "loss": 0.7299, "step": 1363000 }, { "epoch": 2.346488699510051, "grad_norm": 2.3586273193359375, "learning_rate": 1.0891855008165816e-05, "loss": 0.7345, "step": 1363500 }, { "epoch": 2.347349164746395, "grad_norm": 2.1021995544433594, "learning_rate": 1.0877513920893417e-05, "loss": 0.7316, "step": 1364000 }, { "epoch": 2.348209629982739, "grad_norm": 2.3361055850982666, "learning_rate": 1.0863172833621015e-05, "loss": 0.7345, "step": 1364500 }, { "epoch": 2.349070095219083, "grad_norm": 2.2210545539855957, "learning_rate": 1.0848831746348616e-05, "loss": 0.7385, "step": 1365000 }, { "epoch": 2.349930560455427, "grad_norm": 2.323305368423462, "learning_rate": 1.0834490659076216e-05, "loss": 0.7333, "step": 1365500 }, { "epoch": 2.350791025691771, "grad_norm": 2.095165967941284, "learning_rate": 1.0820149571803817e-05, "loss": 0.7328, "step": 1366000 }, { "epoch": 2.351651490928115, "grad_norm": 2.113354206085205, "learning_rate": 1.0805808484531417e-05, "loss": 0.7363, "step": 1366500 }, { "epoch": 2.352511956164459, "grad_norm": 2.207193613052368, "learning_rate": 1.0791467397259018e-05, "loss": 0.7302, "step": 1367000 }, { "epoch": 2.353372421400803, "grad_norm": 2.3709208965301514, "learning_rate": 1.0777126309986617e-05, "loss": 0.7305, "step": 1367500 }, { "epoch": 2.354232886637147, "grad_norm": 2.1980271339416504, "learning_rate": 1.0762785222714219e-05, "loss": 0.7331, "step": 1368000 }, { "epoch": 2.355093351873491, "grad_norm": 2.143127202987671, "learning_rate": 1.0748444135441817e-05, "loss": 0.7289, "step": 1368500 }, { "epoch": 2.355953817109835, "grad_norm": 2.1765642166137695, "learning_rate": 1.0734103048169418e-05, "loss": 0.7347, "step": 1369000 }, { "epoch": 2.356814282346179, "grad_norm": 2.2334864139556885, "learning_rate": 1.0719761960897018e-05, "loss": 0.7311, "step": 1369500 }, { "epoch": 2.357674747582523, "grad_norm": 2.210925340652466, "learning_rate": 1.0705420873624619e-05, "loss": 0.7312, "step": 1370000 }, { "epoch": 2.358535212818867, "grad_norm": 2.15215802192688, "learning_rate": 1.069107978635222e-05, "loss": 0.73, "step": 1370500 }, { "epoch": 2.3593956780552108, "grad_norm": 2.062427520751953, "learning_rate": 1.067673869907982e-05, "loss": 0.7304, "step": 1371000 }, { "epoch": 2.3602561432915548, "grad_norm": 2.322567939758301, "learning_rate": 1.0662397611807419e-05, "loss": 0.7387, "step": 1371500 }, { "epoch": 2.3611166085278987, "grad_norm": 2.153791666030884, "learning_rate": 1.0648056524535019e-05, "loss": 0.7327, "step": 1372000 }, { "epoch": 2.3619770737642427, "grad_norm": 2.4358983039855957, "learning_rate": 1.063371543726262e-05, "loss": 0.7331, "step": 1372500 }, { "epoch": 2.3628375390005867, "grad_norm": 2.2714719772338867, "learning_rate": 1.061937434999022e-05, "loss": 0.731, "step": 1373000 }, { "epoch": 2.3636980042369307, "grad_norm": 2.271050214767456, "learning_rate": 1.060503326271782e-05, "loss": 0.734, "step": 1373500 }, { "epoch": 2.3645584694732746, "grad_norm": 2.266615390777588, "learning_rate": 1.059069217544542e-05, "loss": 0.727, "step": 1374000 }, { "epoch": 2.3654189347096186, "grad_norm": 2.197510004043579, "learning_rate": 1.057635108817302e-05, "loss": 0.7285, "step": 1374500 }, { "epoch": 2.3662793999459626, "grad_norm": 2.2251787185668945, "learning_rate": 1.0562010000900622e-05, "loss": 0.7266, "step": 1375000 }, { "epoch": 2.367139865182307, "grad_norm": 2.2310283184051514, "learning_rate": 1.054766891362822e-05, "loss": 0.7339, "step": 1375500 }, { "epoch": 2.368000330418651, "grad_norm": 2.100308418273926, "learning_rate": 1.0533327826355821e-05, "loss": 0.7352, "step": 1376000 }, { "epoch": 2.368860795654995, "grad_norm": 2.230976104736328, "learning_rate": 1.0518986739083422e-05, "loss": 0.7339, "step": 1376500 }, { "epoch": 2.369721260891339, "grad_norm": 2.412017345428467, "learning_rate": 1.0504645651811022e-05, "loss": 0.7325, "step": 1377000 }, { "epoch": 2.370581726127683, "grad_norm": 2.1025924682617188, "learning_rate": 1.0490304564538623e-05, "loss": 0.7349, "step": 1377500 }, { "epoch": 2.371442191364027, "grad_norm": 2.063314914703369, "learning_rate": 1.0475963477266221e-05, "loss": 0.7356, "step": 1378000 }, { "epoch": 2.372302656600371, "grad_norm": 2.093384027481079, "learning_rate": 1.0461622389993822e-05, "loss": 0.7277, "step": 1378500 }, { "epoch": 2.373163121836715, "grad_norm": 2.142827033996582, "learning_rate": 1.0447281302721422e-05, "loss": 0.7301, "step": 1379000 }, { "epoch": 2.374023587073059, "grad_norm": 2.0535013675689697, "learning_rate": 1.0432940215449023e-05, "loss": 0.7332, "step": 1379500 }, { "epoch": 2.374884052309403, "grad_norm": 2.119863510131836, "learning_rate": 1.0418599128176623e-05, "loss": 0.7342, "step": 1380000 }, { "epoch": 2.375744517545747, "grad_norm": 2.02392840385437, "learning_rate": 1.0404258040904224e-05, "loss": 0.7309, "step": 1380500 }, { "epoch": 2.3766049827820908, "grad_norm": 2.1911582946777344, "learning_rate": 1.0389916953631822e-05, "loss": 0.7309, "step": 1381000 }, { "epoch": 2.3774654480184347, "grad_norm": 2.183885335922241, "learning_rate": 1.0375575866359425e-05, "loss": 0.7327, "step": 1381500 }, { "epoch": 2.3783259132547787, "grad_norm": 2.272047996520996, "learning_rate": 1.0361234779087025e-05, "loss": 0.7288, "step": 1382000 }, { "epoch": 2.3791863784911227, "grad_norm": 2.1483206748962402, "learning_rate": 1.0346893691814624e-05, "loss": 0.7294, "step": 1382500 }, { "epoch": 2.3800468437274667, "grad_norm": 2.1930770874023438, "learning_rate": 1.0332552604542224e-05, "loss": 0.722, "step": 1383000 }, { "epoch": 2.3809073089638106, "grad_norm": 2.2109477519989014, "learning_rate": 1.0318211517269825e-05, "loss": 0.7269, "step": 1383500 }, { "epoch": 2.3817677742001546, "grad_norm": 2.27551007270813, "learning_rate": 1.0303870429997425e-05, "loss": 0.7272, "step": 1384000 }, { "epoch": 2.3826282394364986, "grad_norm": 2.266842842102051, "learning_rate": 1.0289529342725026e-05, "loss": 0.7267, "step": 1384500 }, { "epoch": 2.3834887046728426, "grad_norm": 1.9027714729309082, "learning_rate": 1.0275188255452625e-05, "loss": 0.7346, "step": 1385000 }, { "epoch": 2.3843491699091865, "grad_norm": 2.027723789215088, "learning_rate": 1.0260847168180225e-05, "loss": 0.7278, "step": 1385500 }, { "epoch": 2.3852096351455305, "grad_norm": 2.2213375568389893, "learning_rate": 1.0246506080907825e-05, "loss": 0.7326, "step": 1386000 }, { "epoch": 2.3860701003818745, "grad_norm": 2.044480085372925, "learning_rate": 1.0232164993635426e-05, "loss": 0.7371, "step": 1386500 }, { "epoch": 2.3869305656182185, "grad_norm": 2.011831045150757, "learning_rate": 1.0217823906363026e-05, "loss": 0.7316, "step": 1387000 }, { "epoch": 2.3877910308545625, "grad_norm": 2.323796272277832, "learning_rate": 1.0203482819090627e-05, "loss": 0.735, "step": 1387500 }, { "epoch": 2.3886514960909064, "grad_norm": 2.00823974609375, "learning_rate": 1.0189141731818226e-05, "loss": 0.721, "step": 1388000 }, { "epoch": 2.3895119613272504, "grad_norm": 2.2485666275024414, "learning_rate": 1.0174800644545828e-05, "loss": 0.7282, "step": 1388500 }, { "epoch": 2.3903724265635944, "grad_norm": 2.062854290008545, "learning_rate": 1.0160459557273427e-05, "loss": 0.7298, "step": 1389000 }, { "epoch": 2.3912328917999384, "grad_norm": 2.141507625579834, "learning_rate": 1.0146118470001027e-05, "loss": 0.7297, "step": 1389500 }, { "epoch": 2.3920933570362823, "grad_norm": 2.1152637004852295, "learning_rate": 1.0131777382728628e-05, "loss": 0.7346, "step": 1390000 }, { "epoch": 2.3929538222726263, "grad_norm": 2.3339223861694336, "learning_rate": 1.0117436295456226e-05, "loss": 0.7269, "step": 1390500 }, { "epoch": 2.3938142875089703, "grad_norm": 2.2937827110290527, "learning_rate": 1.0103095208183828e-05, "loss": 0.7313, "step": 1391000 }, { "epoch": 2.3946747527453143, "grad_norm": 2.311241626739502, "learning_rate": 1.0088754120911429e-05, "loss": 0.7332, "step": 1391500 }, { "epoch": 2.3955352179816582, "grad_norm": 2.4438748359680176, "learning_rate": 1.0074413033639028e-05, "loss": 0.7273, "step": 1392000 }, { "epoch": 2.3963956832180022, "grad_norm": 2.4046483039855957, "learning_rate": 1.0060071946366628e-05, "loss": 0.7322, "step": 1392500 }, { "epoch": 2.397256148454346, "grad_norm": 2.233520984649658, "learning_rate": 1.004573085909423e-05, "loss": 0.7263, "step": 1393000 }, { "epoch": 2.39811661369069, "grad_norm": 2.1059939861297607, "learning_rate": 1.0031389771821829e-05, "loss": 0.7343, "step": 1393500 }, { "epoch": 2.398977078927034, "grad_norm": 2.5026044845581055, "learning_rate": 1.001704868454943e-05, "loss": 0.7294, "step": 1394000 }, { "epoch": 2.399837544163378, "grad_norm": 2.279144287109375, "learning_rate": 1.000270759727703e-05, "loss": 0.7287, "step": 1394500 }, { "epoch": 2.400698009399722, "grad_norm": 2.19842529296875, "learning_rate": 9.988366510004629e-06, "loss": 0.7281, "step": 1395000 }, { "epoch": 2.401558474636066, "grad_norm": 2.2801008224487305, "learning_rate": 9.974025422732231e-06, "loss": 0.7302, "step": 1395500 }, { "epoch": 2.40241893987241, "grad_norm": 1.9947917461395264, "learning_rate": 9.95968433545983e-06, "loss": 0.727, "step": 1396000 }, { "epoch": 2.403279405108754, "grad_norm": 2.186469793319702, "learning_rate": 9.94534324818743e-06, "loss": 0.729, "step": 1396500 }, { "epoch": 2.404139870345098, "grad_norm": 1.9610108137130737, "learning_rate": 9.93100216091503e-06, "loss": 0.7298, "step": 1397000 }, { "epoch": 2.405000335581442, "grad_norm": 2.3492963314056396, "learning_rate": 9.916661073642631e-06, "loss": 0.732, "step": 1397500 }, { "epoch": 2.405860800817786, "grad_norm": 2.2294130325317383, "learning_rate": 9.902319986370232e-06, "loss": 0.729, "step": 1398000 }, { "epoch": 2.40672126605413, "grad_norm": 2.0163352489471436, "learning_rate": 9.887978899097832e-06, "loss": 0.7272, "step": 1398500 }, { "epoch": 2.407581731290474, "grad_norm": 2.016042709350586, "learning_rate": 9.873637811825431e-06, "loss": 0.7286, "step": 1399000 }, { "epoch": 2.408442196526818, "grad_norm": 2.637369155883789, "learning_rate": 9.859296724553031e-06, "loss": 0.7308, "step": 1399500 }, { "epoch": 2.409302661763162, "grad_norm": 2.234714984893799, "learning_rate": 9.844955637280632e-06, "loss": 0.7261, "step": 1400000 }, { "epoch": 2.410163126999506, "grad_norm": 2.139127254486084, "learning_rate": 9.830614550008232e-06, "loss": 0.7224, "step": 1400500 }, { "epoch": 2.41102359223585, "grad_norm": 2.3268463611602783, "learning_rate": 9.816273462735833e-06, "loss": 0.7261, "step": 1401000 }, { "epoch": 2.4118840574721943, "grad_norm": 2.230707883834839, "learning_rate": 9.801932375463432e-06, "loss": 0.7251, "step": 1401500 }, { "epoch": 2.4127445227085382, "grad_norm": 2.266864061355591, "learning_rate": 9.787591288191032e-06, "loss": 0.7306, "step": 1402000 }, { "epoch": 2.413604987944882, "grad_norm": 2.000060558319092, "learning_rate": 9.773250200918634e-06, "loss": 0.7273, "step": 1402500 }, { "epoch": 2.414465453181226, "grad_norm": 2.3916234970092773, "learning_rate": 9.758909113646233e-06, "loss": 0.7296, "step": 1403000 }, { "epoch": 2.41532591841757, "grad_norm": 2.1868340969085693, "learning_rate": 9.744568026373833e-06, "loss": 0.73, "step": 1403500 }, { "epoch": 2.416186383653914, "grad_norm": 2.2738113403320312, "learning_rate": 9.730226939101434e-06, "loss": 0.7329, "step": 1404000 }, { "epoch": 2.417046848890258, "grad_norm": 2.2496702671051025, "learning_rate": 9.715885851829034e-06, "loss": 0.7296, "step": 1404500 }, { "epoch": 2.417907314126602, "grad_norm": 2.0994648933410645, "learning_rate": 9.701544764556635e-06, "loss": 0.727, "step": 1405000 }, { "epoch": 2.418767779362946, "grad_norm": 2.1675145626068115, "learning_rate": 9.687203677284235e-06, "loss": 0.7325, "step": 1405500 }, { "epoch": 2.41962824459929, "grad_norm": 2.0863234996795654, "learning_rate": 9.672862590011834e-06, "loss": 0.7269, "step": 1406000 }, { "epoch": 2.420488709835634, "grad_norm": 2.3856394290924072, "learning_rate": 9.658521502739435e-06, "loss": 0.7242, "step": 1406500 }, { "epoch": 2.421349175071978, "grad_norm": 2.326630115509033, "learning_rate": 9.644180415467035e-06, "loss": 0.7236, "step": 1407000 }, { "epoch": 2.422209640308322, "grad_norm": 2.0575623512268066, "learning_rate": 9.629839328194636e-06, "loss": 0.7328, "step": 1407500 }, { "epoch": 2.423070105544666, "grad_norm": 2.304266929626465, "learning_rate": 9.615498240922236e-06, "loss": 0.7255, "step": 1408000 }, { "epoch": 2.42393057078101, "grad_norm": 2.350442409515381, "learning_rate": 9.601157153649835e-06, "loss": 0.7265, "step": 1408500 }, { "epoch": 2.424791036017354, "grad_norm": 2.382349729537964, "learning_rate": 9.586816066377437e-06, "loss": 0.7294, "step": 1409000 }, { "epoch": 2.425651501253698, "grad_norm": 2.092679500579834, "learning_rate": 9.572474979105037e-06, "loss": 0.7242, "step": 1409500 }, { "epoch": 2.426511966490042, "grad_norm": 2.3096089363098145, "learning_rate": 9.558133891832636e-06, "loss": 0.7253, "step": 1410000 }, { "epoch": 2.427372431726386, "grad_norm": 2.160024881362915, "learning_rate": 9.543792804560237e-06, "loss": 0.7275, "step": 1410500 }, { "epoch": 2.42823289696273, "grad_norm": 2.258770704269409, "learning_rate": 9.529451717287837e-06, "loss": 0.7189, "step": 1411000 }, { "epoch": 2.429093362199074, "grad_norm": 2.0837507247924805, "learning_rate": 9.515110630015438e-06, "loss": 0.7297, "step": 1411500 }, { "epoch": 2.4299538274354178, "grad_norm": 2.198729991912842, "learning_rate": 9.500769542743038e-06, "loss": 0.7338, "step": 1412000 }, { "epoch": 2.4308142926717617, "grad_norm": 2.3990306854248047, "learning_rate": 9.486428455470637e-06, "loss": 0.7321, "step": 1412500 }, { "epoch": 2.4316747579081057, "grad_norm": 2.0542526245117188, "learning_rate": 9.472087368198237e-06, "loss": 0.7278, "step": 1413000 }, { "epoch": 2.4325352231444497, "grad_norm": 2.1463656425476074, "learning_rate": 9.457746280925838e-06, "loss": 0.7346, "step": 1413500 }, { "epoch": 2.4333956883807937, "grad_norm": 2.061422348022461, "learning_rate": 9.443405193653438e-06, "loss": 0.7197, "step": 1414000 }, { "epoch": 2.4342561536171377, "grad_norm": 2.2131097316741943, "learning_rate": 9.429064106381039e-06, "loss": 0.7243, "step": 1414500 }, { "epoch": 2.4351166188534816, "grad_norm": 2.2422876358032227, "learning_rate": 9.41472301910864e-06, "loss": 0.7338, "step": 1415000 }, { "epoch": 2.4359770840898256, "grad_norm": 2.2505648136138916, "learning_rate": 9.400381931836238e-06, "loss": 0.7242, "step": 1415500 }, { "epoch": 2.4368375493261696, "grad_norm": 2.3576719760894775, "learning_rate": 9.38604084456384e-06, "loss": 0.7257, "step": 1416000 }, { "epoch": 2.4376980145625136, "grad_norm": 2.1250061988830566, "learning_rate": 9.371699757291439e-06, "loss": 0.7227, "step": 1416500 }, { "epoch": 2.4385584797988575, "grad_norm": 2.289149045944214, "learning_rate": 9.35735867001904e-06, "loss": 0.7265, "step": 1417000 }, { "epoch": 2.4394189450352015, "grad_norm": 2.392317533493042, "learning_rate": 9.34301758274664e-06, "loss": 0.7291, "step": 1417500 }, { "epoch": 2.4402794102715455, "grad_norm": 2.255974769592285, "learning_rate": 9.32867649547424e-06, "loss": 0.7304, "step": 1418000 }, { "epoch": 2.4411398755078895, "grad_norm": 2.070075035095215, "learning_rate": 9.31433540820184e-06, "loss": 0.7312, "step": 1418500 }, { "epoch": 2.4420003407442334, "grad_norm": 2.2537167072296143, "learning_rate": 9.299994320929441e-06, "loss": 0.7321, "step": 1419000 }, { "epoch": 2.4428608059805774, "grad_norm": 2.128941535949707, "learning_rate": 9.28565323365704e-06, "loss": 0.7267, "step": 1419500 }, { "epoch": 2.4437212712169214, "grad_norm": 2.2561581134796143, "learning_rate": 9.27131214638464e-06, "loss": 0.7287, "step": 1420000 }, { "epoch": 2.4445817364532654, "grad_norm": 2.095482110977173, "learning_rate": 9.256971059112243e-06, "loss": 0.7261, "step": 1420500 }, { "epoch": 2.4454422016896094, "grad_norm": 2.4081618785858154, "learning_rate": 9.242629971839841e-06, "loss": 0.7286, "step": 1421000 }, { "epoch": 2.4463026669259538, "grad_norm": 2.0608112812042236, "learning_rate": 9.228288884567442e-06, "loss": 0.7212, "step": 1421500 }, { "epoch": 2.4471631321622977, "grad_norm": 2.0750913619995117, "learning_rate": 9.213947797295042e-06, "loss": 0.7248, "step": 1422000 }, { "epoch": 2.4480235973986417, "grad_norm": 2.242100715637207, "learning_rate": 9.199606710022641e-06, "loss": 0.7287, "step": 1422500 }, { "epoch": 2.4488840626349857, "grad_norm": 2.1421103477478027, "learning_rate": 9.185265622750243e-06, "loss": 0.7284, "step": 1423000 }, { "epoch": 2.4497445278713297, "grad_norm": 2.3732893466949463, "learning_rate": 9.170924535477842e-06, "loss": 0.7303, "step": 1423500 }, { "epoch": 2.4506049931076737, "grad_norm": 2.1787123680114746, "learning_rate": 9.156583448205443e-06, "loss": 0.7262, "step": 1424000 }, { "epoch": 2.4514654583440176, "grad_norm": 2.1791365146636963, "learning_rate": 9.142242360933043e-06, "loss": 0.7209, "step": 1424500 }, { "epoch": 2.4523259235803616, "grad_norm": 2.1248362064361572, "learning_rate": 9.127901273660644e-06, "loss": 0.7279, "step": 1425000 }, { "epoch": 2.4531863888167056, "grad_norm": 2.3395819664001465, "learning_rate": 9.113560186388244e-06, "loss": 0.7291, "step": 1425500 }, { "epoch": 2.4540468540530496, "grad_norm": 2.144453763961792, "learning_rate": 9.099219099115844e-06, "loss": 0.7272, "step": 1426000 }, { "epoch": 2.4549073192893935, "grad_norm": 2.2412824630737305, "learning_rate": 9.084878011843443e-06, "loss": 0.7249, "step": 1426500 }, { "epoch": 2.4557677845257375, "grad_norm": 2.4376273155212402, "learning_rate": 9.070536924571044e-06, "loss": 0.7204, "step": 1427000 }, { "epoch": 2.4566282497620815, "grad_norm": 2.246067523956299, "learning_rate": 9.056195837298644e-06, "loss": 0.7272, "step": 1427500 }, { "epoch": 2.4574887149984255, "grad_norm": 2.154071092605591, "learning_rate": 9.041854750026245e-06, "loss": 0.7192, "step": 1428000 }, { "epoch": 2.4583491802347694, "grad_norm": 2.44376802444458, "learning_rate": 9.027513662753845e-06, "loss": 0.7301, "step": 1428500 }, { "epoch": 2.4592096454711134, "grad_norm": 2.3021342754364014, "learning_rate": 9.013172575481444e-06, "loss": 0.7202, "step": 1429000 }, { "epoch": 2.4600701107074574, "grad_norm": 2.472088575363159, "learning_rate": 8.998831488209044e-06, "loss": 0.7256, "step": 1429500 }, { "epoch": 2.4609305759438014, "grad_norm": 2.3227767944335938, "learning_rate": 8.984490400936647e-06, "loss": 0.7257, "step": 1430000 }, { "epoch": 2.4617910411801454, "grad_norm": 2.3386566638946533, "learning_rate": 8.970149313664245e-06, "loss": 0.727, "step": 1430500 }, { "epoch": 2.4626515064164893, "grad_norm": 2.3084495067596436, "learning_rate": 8.955808226391846e-06, "loss": 0.7305, "step": 1431000 }, { "epoch": 2.4635119716528333, "grad_norm": 2.0986111164093018, "learning_rate": 8.941467139119446e-06, "loss": 0.7208, "step": 1431500 }, { "epoch": 2.4643724368891773, "grad_norm": 2.0349369049072266, "learning_rate": 8.927126051847047e-06, "loss": 0.7239, "step": 1432000 }, { "epoch": 2.4652329021255213, "grad_norm": 2.507856607437134, "learning_rate": 8.912784964574647e-06, "loss": 0.7288, "step": 1432500 }, { "epoch": 2.4660933673618652, "grad_norm": 2.2290287017822266, "learning_rate": 8.898443877302248e-06, "loss": 0.7324, "step": 1433000 }, { "epoch": 2.466953832598209, "grad_norm": 2.147047281265259, "learning_rate": 8.884102790029846e-06, "loss": 0.7232, "step": 1433500 }, { "epoch": 2.467814297834553, "grad_norm": 2.185572624206543, "learning_rate": 8.869761702757447e-06, "loss": 0.7264, "step": 1434000 }, { "epoch": 2.468674763070897, "grad_norm": 2.173811435699463, "learning_rate": 8.855420615485047e-06, "loss": 0.7244, "step": 1434500 }, { "epoch": 2.469535228307241, "grad_norm": 2.1698076725006104, "learning_rate": 8.841079528212648e-06, "loss": 0.722, "step": 1435000 }, { "epoch": 2.470395693543585, "grad_norm": 2.0112743377685547, "learning_rate": 8.826738440940248e-06, "loss": 0.7231, "step": 1435500 }, { "epoch": 2.471256158779929, "grad_norm": 2.455514669418335, "learning_rate": 8.812397353667847e-06, "loss": 0.7222, "step": 1436000 }, { "epoch": 2.472116624016273, "grad_norm": 2.2176318168640137, "learning_rate": 8.79805626639545e-06, "loss": 0.7255, "step": 1436500 }, { "epoch": 2.472977089252617, "grad_norm": 2.0305678844451904, "learning_rate": 8.78371517912305e-06, "loss": 0.7264, "step": 1437000 }, { "epoch": 2.473837554488961, "grad_norm": 1.9838893413543701, "learning_rate": 8.769374091850648e-06, "loss": 0.7248, "step": 1437500 }, { "epoch": 2.474698019725305, "grad_norm": 2.39805006980896, "learning_rate": 8.755033004578249e-06, "loss": 0.724, "step": 1438000 }, { "epoch": 2.475558484961649, "grad_norm": 2.250845432281494, "learning_rate": 8.74069191730585e-06, "loss": 0.7266, "step": 1438500 }, { "epoch": 2.476418950197993, "grad_norm": 2.292314291000366, "learning_rate": 8.72635083003345e-06, "loss": 0.719, "step": 1439000 }, { "epoch": 2.477279415434337, "grad_norm": 2.3114101886749268, "learning_rate": 8.71200974276105e-06, "loss": 0.7289, "step": 1439500 }, { "epoch": 2.478139880670681, "grad_norm": 2.1311428546905518, "learning_rate": 8.697668655488649e-06, "loss": 0.7294, "step": 1440000 }, { "epoch": 2.479000345907025, "grad_norm": 2.3691234588623047, "learning_rate": 8.68332756821625e-06, "loss": 0.7186, "step": 1440500 }, { "epoch": 2.479860811143369, "grad_norm": 2.3010482788085938, "learning_rate": 8.66898648094385e-06, "loss": 0.7329, "step": 1441000 }, { "epoch": 2.480721276379713, "grad_norm": 2.0064890384674072, "learning_rate": 8.65464539367145e-06, "loss": 0.7234, "step": 1441500 }, { "epoch": 2.481581741616057, "grad_norm": 2.1313636302948, "learning_rate": 8.640304306399051e-06, "loss": 0.7229, "step": 1442000 }, { "epoch": 2.482442206852401, "grad_norm": 2.41935133934021, "learning_rate": 8.625963219126652e-06, "loss": 0.7206, "step": 1442500 }, { "epoch": 2.483302672088745, "grad_norm": 2.29103684425354, "learning_rate": 8.61162213185425e-06, "loss": 0.7272, "step": 1443000 }, { "epoch": 2.4841631373250888, "grad_norm": 2.1962101459503174, "learning_rate": 8.597281044581852e-06, "loss": 0.7324, "step": 1443500 }, { "epoch": 2.4850236025614327, "grad_norm": 2.147658348083496, "learning_rate": 8.582939957309453e-06, "loss": 0.7233, "step": 1444000 }, { "epoch": 2.4858840677977767, "grad_norm": 2.0254199504852295, "learning_rate": 8.568598870037052e-06, "loss": 0.7243, "step": 1444500 }, { "epoch": 2.4867445330341207, "grad_norm": 2.3642418384552, "learning_rate": 8.554257782764652e-06, "loss": 0.7252, "step": 1445000 }, { "epoch": 2.4876049982704647, "grad_norm": 2.0392184257507324, "learning_rate": 8.539916695492253e-06, "loss": 0.7261, "step": 1445500 }, { "epoch": 2.4884654635068086, "grad_norm": 2.1036550998687744, "learning_rate": 8.525575608219853e-06, "loss": 0.7181, "step": 1446000 }, { "epoch": 2.4893259287431526, "grad_norm": 2.117926836013794, "learning_rate": 8.511234520947454e-06, "loss": 0.7248, "step": 1446500 }, { "epoch": 2.4901863939794966, "grad_norm": 2.2823784351348877, "learning_rate": 8.496893433675052e-06, "loss": 0.7218, "step": 1447000 }, { "epoch": 2.491046859215841, "grad_norm": 2.1260955333709717, "learning_rate": 8.482552346402653e-06, "loss": 0.7291, "step": 1447500 }, { "epoch": 2.491907324452185, "grad_norm": 2.096526861190796, "learning_rate": 8.468211259130255e-06, "loss": 0.7325, "step": 1448000 }, { "epoch": 2.492767789688529, "grad_norm": 2.419377565383911, "learning_rate": 8.453870171857854e-06, "loss": 0.7236, "step": 1448500 }, { "epoch": 2.493628254924873, "grad_norm": 2.5147461891174316, "learning_rate": 8.439529084585454e-06, "loss": 0.7247, "step": 1449000 }, { "epoch": 2.494488720161217, "grad_norm": 2.212557792663574, "learning_rate": 8.425187997313055e-06, "loss": 0.7292, "step": 1449500 }, { "epoch": 2.495349185397561, "grad_norm": 2.241730213165283, "learning_rate": 8.410846910040653e-06, "loss": 0.7325, "step": 1450000 }, { "epoch": 2.496209650633905, "grad_norm": 2.205704927444458, "learning_rate": 8.396505822768256e-06, "loss": 0.7247, "step": 1450500 }, { "epoch": 2.497070115870249, "grad_norm": 1.9690256118774414, "learning_rate": 8.382164735495854e-06, "loss": 0.7257, "step": 1451000 }, { "epoch": 2.497930581106593, "grad_norm": 2.6139450073242188, "learning_rate": 8.367823648223455e-06, "loss": 0.724, "step": 1451500 }, { "epoch": 2.498791046342937, "grad_norm": 2.0735630989074707, "learning_rate": 8.353482560951055e-06, "loss": 0.7255, "step": 1452000 }, { "epoch": 2.499651511579281, "grad_norm": 2.2452523708343506, "learning_rate": 8.339141473678656e-06, "loss": 0.7266, "step": 1452500 }, { "epoch": 2.5005119768156248, "grad_norm": 2.239596128463745, "learning_rate": 8.324800386406256e-06, "loss": 0.7227, "step": 1453000 }, { "epoch": 2.5013724420519687, "grad_norm": 2.209899663925171, "learning_rate": 8.310459299133857e-06, "loss": 0.7297, "step": 1453500 }, { "epoch": 2.5022329072883127, "grad_norm": 2.1387386322021484, "learning_rate": 8.296118211861456e-06, "loss": 0.7269, "step": 1454000 }, { "epoch": 2.5030933725246567, "grad_norm": 2.203953981399536, "learning_rate": 8.281777124589056e-06, "loss": 0.7275, "step": 1454500 }, { "epoch": 2.5039538377610007, "grad_norm": 2.340108871459961, "learning_rate": 8.267436037316656e-06, "loss": 0.7213, "step": 1455000 }, { "epoch": 2.5048143029973446, "grad_norm": 2.1395070552825928, "learning_rate": 8.253094950044257e-06, "loss": 0.7235, "step": 1455500 }, { "epoch": 2.5056747682336886, "grad_norm": 2.0611159801483154, "learning_rate": 8.238753862771857e-06, "loss": 0.7281, "step": 1456000 }, { "epoch": 2.5065352334700326, "grad_norm": 2.2360119819641113, "learning_rate": 8.224412775499458e-06, "loss": 0.72, "step": 1456500 }, { "epoch": 2.5073956987063766, "grad_norm": 2.186635971069336, "learning_rate": 8.210071688227057e-06, "loss": 0.7147, "step": 1457000 }, { "epoch": 2.5082561639427206, "grad_norm": 2.166893482208252, "learning_rate": 8.195730600954659e-06, "loss": 0.7231, "step": 1457500 }, { "epoch": 2.5091166291790645, "grad_norm": 2.1585497856140137, "learning_rate": 8.181389513682258e-06, "loss": 0.7254, "step": 1458000 }, { "epoch": 2.5099770944154085, "grad_norm": 2.2699153423309326, "learning_rate": 8.167048426409858e-06, "loss": 0.7252, "step": 1458500 }, { "epoch": 2.5108375596517525, "grad_norm": 2.149649143218994, "learning_rate": 8.152707339137459e-06, "loss": 0.7263, "step": 1459000 }, { "epoch": 2.5116980248880965, "grad_norm": 2.4442532062530518, "learning_rate": 8.138366251865059e-06, "loss": 0.7213, "step": 1459500 }, { "epoch": 2.5125584901244404, "grad_norm": 2.098191738128662, "learning_rate": 8.12402516459266e-06, "loss": 0.7187, "step": 1460000 }, { "epoch": 2.5134189553607844, "grad_norm": 2.221888780593872, "learning_rate": 8.10968407732026e-06, "loss": 0.722, "step": 1460500 }, { "epoch": 2.5142794205971284, "grad_norm": 2.258897542953491, "learning_rate": 8.095342990047859e-06, "loss": 0.7256, "step": 1461000 }, { "epoch": 2.5151398858334724, "grad_norm": 2.0834147930145264, "learning_rate": 8.08100190277546e-06, "loss": 0.7232, "step": 1461500 }, { "epoch": 2.5160003510698163, "grad_norm": 2.1934962272644043, "learning_rate": 8.06666081550306e-06, "loss": 0.7234, "step": 1462000 }, { "epoch": 2.5168608163061603, "grad_norm": 2.236333131790161, "learning_rate": 8.05231972823066e-06, "loss": 0.7232, "step": 1462500 }, { "epoch": 2.5177212815425043, "grad_norm": 2.2630205154418945, "learning_rate": 8.03797864095826e-06, "loss": 0.7188, "step": 1463000 }, { "epoch": 2.5185817467788483, "grad_norm": 2.093230724334717, "learning_rate": 8.02363755368586e-06, "loss": 0.7207, "step": 1463500 }, { "epoch": 2.5194422120151923, "grad_norm": 2.096151351928711, "learning_rate": 8.009296466413462e-06, "loss": 0.7179, "step": 1464000 }, { "epoch": 2.5203026772515362, "grad_norm": 2.2687249183654785, "learning_rate": 7.994955379141062e-06, "loss": 0.7251, "step": 1464500 }, { "epoch": 2.52116314248788, "grad_norm": 2.351755380630493, "learning_rate": 7.98061429186866e-06, "loss": 0.7242, "step": 1465000 }, { "epoch": 2.522023607724224, "grad_norm": 2.3624746799468994, "learning_rate": 7.966273204596261e-06, "loss": 0.7254, "step": 1465500 }, { "epoch": 2.522884072960568, "grad_norm": 2.23112416267395, "learning_rate": 7.951932117323862e-06, "loss": 0.7214, "step": 1466000 }, { "epoch": 2.5237445381969126, "grad_norm": 2.169090747833252, "learning_rate": 7.937591030051462e-06, "loss": 0.7189, "step": 1466500 }, { "epoch": 2.5246050034332566, "grad_norm": 2.4218993186950684, "learning_rate": 7.923249942779063e-06, "loss": 0.7271, "step": 1467000 }, { "epoch": 2.5254654686696005, "grad_norm": 2.2031071186065674, "learning_rate": 7.908908855506663e-06, "loss": 0.7254, "step": 1467500 }, { "epoch": 2.5263259339059445, "grad_norm": 2.2774407863616943, "learning_rate": 7.894567768234262e-06, "loss": 0.7231, "step": 1468000 }, { "epoch": 2.5271863991422885, "grad_norm": 2.4886295795440674, "learning_rate": 7.880226680961862e-06, "loss": 0.7206, "step": 1468500 }, { "epoch": 2.5280468643786325, "grad_norm": 2.159146547317505, "learning_rate": 7.865885593689463e-06, "loss": 0.7265, "step": 1469000 }, { "epoch": 2.5289073296149764, "grad_norm": 2.215956926345825, "learning_rate": 7.851544506417063e-06, "loss": 0.7169, "step": 1469500 }, { "epoch": 2.5297677948513204, "grad_norm": 2.1954901218414307, "learning_rate": 7.837203419144664e-06, "loss": 0.7195, "step": 1470000 }, { "epoch": 2.5306282600876644, "grad_norm": 2.310394048690796, "learning_rate": 7.822862331872263e-06, "loss": 0.7241, "step": 1470500 }, { "epoch": 2.5314887253240084, "grad_norm": 2.211066961288452, "learning_rate": 7.808521244599865e-06, "loss": 0.7218, "step": 1471000 }, { "epoch": 2.5323491905603523, "grad_norm": 2.2090892791748047, "learning_rate": 7.794180157327465e-06, "loss": 0.7251, "step": 1471500 }, { "epoch": 2.5332096557966963, "grad_norm": 2.221142053604126, "learning_rate": 7.779839070055064e-06, "loss": 0.7253, "step": 1472000 }, { "epoch": 2.5340701210330403, "grad_norm": 2.2364909648895264, "learning_rate": 7.765497982782664e-06, "loss": 0.7286, "step": 1472500 }, { "epoch": 2.5349305862693843, "grad_norm": 2.1668782234191895, "learning_rate": 7.751156895510265e-06, "loss": 0.7219, "step": 1473000 }, { "epoch": 2.5357910515057283, "grad_norm": 2.125824451446533, "learning_rate": 7.736815808237865e-06, "loss": 0.7223, "step": 1473500 }, { "epoch": 2.5366515167420722, "grad_norm": 1.9486610889434814, "learning_rate": 7.722474720965466e-06, "loss": 0.7259, "step": 1474000 }, { "epoch": 2.537511981978416, "grad_norm": 2.0973780155181885, "learning_rate": 7.708133633693065e-06, "loss": 0.7234, "step": 1474500 }, { "epoch": 2.53837244721476, "grad_norm": 2.1345973014831543, "learning_rate": 7.693792546420665e-06, "loss": 0.7221, "step": 1475000 }, { "epoch": 2.539232912451104, "grad_norm": 2.146336078643799, "learning_rate": 7.679451459148267e-06, "loss": 0.7261, "step": 1475500 }, { "epoch": 2.540093377687448, "grad_norm": 2.5509378910064697, "learning_rate": 7.665110371875866e-06, "loss": 0.726, "step": 1476000 }, { "epoch": 2.540953842923792, "grad_norm": 2.422461748123169, "learning_rate": 7.650769284603467e-06, "loss": 0.73, "step": 1476500 }, { "epoch": 2.541814308160136, "grad_norm": 2.1949241161346436, "learning_rate": 7.636428197331067e-06, "loss": 0.7229, "step": 1477000 }, { "epoch": 2.54267477339648, "grad_norm": 2.390719413757324, "learning_rate": 7.622087110058667e-06, "loss": 0.7234, "step": 1477500 }, { "epoch": 2.543535238632824, "grad_norm": 2.194368600845337, "learning_rate": 7.607746022786268e-06, "loss": 0.723, "step": 1478000 }, { "epoch": 2.544395703869168, "grad_norm": 2.5566372871398926, "learning_rate": 7.593404935513868e-06, "loss": 0.7312, "step": 1478500 }, { "epoch": 2.545256169105512, "grad_norm": 2.225067615509033, "learning_rate": 7.579063848241467e-06, "loss": 0.7157, "step": 1479000 }, { "epoch": 2.546116634341856, "grad_norm": 2.281120777130127, "learning_rate": 7.564722760969068e-06, "loss": 0.7259, "step": 1479500 }, { "epoch": 2.5469770995782, "grad_norm": 2.297755479812622, "learning_rate": 7.550381673696667e-06, "loss": 0.7217, "step": 1480000 }, { "epoch": 2.547837564814544, "grad_norm": 2.464024066925049, "learning_rate": 7.536040586424269e-06, "loss": 0.7235, "step": 1480500 }, { "epoch": 2.548698030050888, "grad_norm": 2.3035905361175537, "learning_rate": 7.521699499151868e-06, "loss": 0.722, "step": 1481000 }, { "epoch": 2.549558495287232, "grad_norm": 2.119723320007324, "learning_rate": 7.507358411879469e-06, "loss": 0.7263, "step": 1481500 }, { "epoch": 2.550418960523576, "grad_norm": 2.320289373397827, "learning_rate": 7.493017324607068e-06, "loss": 0.7267, "step": 1482000 }, { "epoch": 2.55127942575992, "grad_norm": 2.233952522277832, "learning_rate": 7.47867623733467e-06, "loss": 0.7249, "step": 1482500 }, { "epoch": 2.552139890996264, "grad_norm": 8.134510040283203, "learning_rate": 7.464335150062269e-06, "loss": 0.7205, "step": 1483000 }, { "epoch": 2.553000356232608, "grad_norm": 2.3019843101501465, "learning_rate": 7.44999406278987e-06, "loss": 0.7187, "step": 1483500 }, { "epoch": 2.5538608214689518, "grad_norm": 2.252256393432617, "learning_rate": 7.435652975517469e-06, "loss": 0.7246, "step": 1484000 }, { "epoch": 2.5547212867052957, "grad_norm": 2.285295009613037, "learning_rate": 7.421311888245069e-06, "loss": 0.7266, "step": 1484500 }, { "epoch": 2.5555817519416397, "grad_norm": 2.0101242065429688, "learning_rate": 7.40697080097267e-06, "loss": 0.7238, "step": 1485000 }, { "epoch": 2.5564422171779837, "grad_norm": 2.1717300415039062, "learning_rate": 7.392629713700271e-06, "loss": 0.7274, "step": 1485500 }, { "epoch": 2.5573026824143277, "grad_norm": 2.275235652923584, "learning_rate": 7.37828862642787e-06, "loss": 0.7238, "step": 1486000 }, { "epoch": 2.5581631476506717, "grad_norm": 2.19974684715271, "learning_rate": 7.363947539155471e-06, "loss": 0.7286, "step": 1486500 }, { "epoch": 2.5590236128870156, "grad_norm": 2.2823832035064697, "learning_rate": 7.349606451883072e-06, "loss": 0.7196, "step": 1487000 }, { "epoch": 2.5598840781233596, "grad_norm": 2.2638237476348877, "learning_rate": 7.335265364610672e-06, "loss": 0.7199, "step": 1487500 }, { "epoch": 2.5607445433597036, "grad_norm": 2.039130926132202, "learning_rate": 7.3209242773382714e-06, "loss": 0.7157, "step": 1488000 }, { "epoch": 2.5616050085960476, "grad_norm": 2.1070847511291504, "learning_rate": 7.306583190065872e-06, "loss": 0.7229, "step": 1488500 }, { "epoch": 2.5624654738323915, "grad_norm": 2.121931552886963, "learning_rate": 7.2922421027934715e-06, "loss": 0.7133, "step": 1489000 }, { "epoch": 2.5633259390687355, "grad_norm": 2.1481940746307373, "learning_rate": 7.277901015521073e-06, "loss": 0.7218, "step": 1489500 }, { "epoch": 2.5641864043050795, "grad_norm": 2.1555721759796143, "learning_rate": 7.2635599282486725e-06, "loss": 0.7166, "step": 1490000 }, { "epoch": 2.5650468695414235, "grad_norm": 2.187934398651123, "learning_rate": 7.249218840976273e-06, "loss": 0.721, "step": 1490500 }, { "epoch": 2.5659073347777674, "grad_norm": 2.292440176010132, "learning_rate": 7.2348777537038726e-06, "loss": 0.7282, "step": 1491000 }, { "epoch": 2.5667678000141114, "grad_norm": 2.3000996112823486, "learning_rate": 7.220536666431474e-06, "loss": 0.7247, "step": 1491500 }, { "epoch": 2.5676282652504554, "grad_norm": 2.1090991497039795, "learning_rate": 7.2061955791590735e-06, "loss": 0.7236, "step": 1492000 }, { "epoch": 2.5684887304867994, "grad_norm": 2.2887206077575684, "learning_rate": 7.191854491886674e-06, "loss": 0.7212, "step": 1492500 }, { "epoch": 2.5693491957231434, "grad_norm": 2.1364707946777344, "learning_rate": 7.177513404614274e-06, "loss": 0.7239, "step": 1493000 }, { "epoch": 2.5702096609594873, "grad_norm": 2.2923684120178223, "learning_rate": 7.163172317341873e-06, "loss": 0.7195, "step": 1493500 }, { "epoch": 2.5710701261958313, "grad_norm": 2.278289794921875, "learning_rate": 7.1488312300694745e-06, "loss": 0.7159, "step": 1494000 }, { "epoch": 2.5719305914321753, "grad_norm": 2.3480536937713623, "learning_rate": 7.134490142797075e-06, "loss": 0.72, "step": 1494500 }, { "epoch": 2.5727910566685193, "grad_norm": 2.2005200386047363, "learning_rate": 7.120149055524675e-06, "loss": 0.7221, "step": 1495000 }, { "epoch": 2.5736515219048637, "grad_norm": 2.2753360271453857, "learning_rate": 7.105807968252274e-06, "loss": 0.7264, "step": 1495500 }, { "epoch": 2.5745119871412077, "grad_norm": 2.1410017013549805, "learning_rate": 7.091466880979875e-06, "loss": 0.7216, "step": 1496000 }, { "epoch": 2.5753724523775516, "grad_norm": 2.2372915744781494, "learning_rate": 7.077125793707476e-06, "loss": 0.7202, "step": 1496500 }, { "epoch": 2.5762329176138956, "grad_norm": 2.1890013217926025, "learning_rate": 7.062784706435076e-06, "loss": 0.7207, "step": 1497000 }, { "epoch": 2.5770933828502396, "grad_norm": 2.078773021697998, "learning_rate": 7.048443619162675e-06, "loss": 0.7203, "step": 1497500 }, { "epoch": 2.5779538480865836, "grad_norm": 2.2205190658569336, "learning_rate": 7.034102531890276e-06, "loss": 0.7191, "step": 1498000 }, { "epoch": 2.5788143133229275, "grad_norm": 2.2238242626190186, "learning_rate": 7.019761444617877e-06, "loss": 0.7233, "step": 1498500 }, { "epoch": 2.5796747785592715, "grad_norm": 2.231977701187134, "learning_rate": 7.005420357345477e-06, "loss": 0.7199, "step": 1499000 }, { "epoch": 2.5805352437956155, "grad_norm": 2.04296612739563, "learning_rate": 6.991079270073077e-06, "loss": 0.7187, "step": 1499500 }, { "epoch": 2.5813957090319595, "grad_norm": 2.2166175842285156, "learning_rate": 6.976738182800677e-06, "loss": 0.7213, "step": 1500000 }, { "epoch": 2.5822561742683035, "grad_norm": 2.2233524322509766, "learning_rate": 6.9623970955282764e-06, "loss": 0.7186, "step": 1500500 }, { "epoch": 2.5831166395046474, "grad_norm": 2.268268585205078, "learning_rate": 6.948056008255878e-06, "loss": 0.7263, "step": 1501000 }, { "epoch": 2.5839771047409914, "grad_norm": 2.200861930847168, "learning_rate": 6.933714920983478e-06, "loss": 0.7242, "step": 1501500 }, { "epoch": 2.5848375699773354, "grad_norm": 2.327687978744507, "learning_rate": 6.919373833711078e-06, "loss": 0.7217, "step": 1502000 }, { "epoch": 2.5856980352136794, "grad_norm": 2.0236458778381348, "learning_rate": 6.9050327464386775e-06, "loss": 0.7195, "step": 1502500 }, { "epoch": 2.5865585004500233, "grad_norm": 2.2688241004943848, "learning_rate": 6.890691659166279e-06, "loss": 0.7213, "step": 1503000 }, { "epoch": 2.5874189656863673, "grad_norm": 2.138439893722534, "learning_rate": 6.876350571893879e-06, "loss": 0.7229, "step": 1503500 }, { "epoch": 2.5882794309227113, "grad_norm": 2.194458246231079, "learning_rate": 6.862009484621479e-06, "loss": 0.7224, "step": 1504000 }, { "epoch": 2.5891398961590553, "grad_norm": 2.2974252700805664, "learning_rate": 6.8476683973490785e-06, "loss": 0.7232, "step": 1504500 }, { "epoch": 2.5900003613953992, "grad_norm": 2.4050469398498535, "learning_rate": 6.833327310076679e-06, "loss": 0.7218, "step": 1505000 }, { "epoch": 2.590860826631743, "grad_norm": 2.0813820362091064, "learning_rate": 6.81898622280428e-06, "loss": 0.7183, "step": 1505500 }, { "epoch": 2.591721291868087, "grad_norm": 2.3573648929595947, "learning_rate": 6.80464513553188e-06, "loss": 0.7178, "step": 1506000 }, { "epoch": 2.592581757104431, "grad_norm": 2.2958922386169434, "learning_rate": 6.7903040482594795e-06, "loss": 0.7174, "step": 1506500 }, { "epoch": 2.593442222340775, "grad_norm": 2.2089388370513916, "learning_rate": 6.77596296098708e-06, "loss": 0.7242, "step": 1507000 }, { "epoch": 2.594302687577119, "grad_norm": 2.1768064498901367, "learning_rate": 6.76162187371468e-06, "loss": 0.7232, "step": 1507500 }, { "epoch": 2.595163152813463, "grad_norm": 2.0804152488708496, "learning_rate": 6.747280786442281e-06, "loss": 0.7218, "step": 1508000 }, { "epoch": 2.596023618049807, "grad_norm": 2.265146255493164, "learning_rate": 6.7329396991698806e-06, "loss": 0.7233, "step": 1508500 }, { "epoch": 2.596884083286151, "grad_norm": 2.0737407207489014, "learning_rate": 6.718598611897481e-06, "loss": 0.7142, "step": 1509000 }, { "epoch": 2.597744548522495, "grad_norm": 2.1167831420898438, "learning_rate": 6.704257524625081e-06, "loss": 0.7221, "step": 1509500 }, { "epoch": 2.598605013758839, "grad_norm": 2.4157352447509766, "learning_rate": 6.689916437352682e-06, "loss": 0.7193, "step": 1510000 }, { "epoch": 2.599465478995183, "grad_norm": 2.2811946868896484, "learning_rate": 6.675575350080282e-06, "loss": 0.7244, "step": 1510500 }, { "epoch": 2.600325944231527, "grad_norm": 2.2633402347564697, "learning_rate": 6.661234262807882e-06, "loss": 0.7217, "step": 1511000 }, { "epoch": 2.601186409467871, "grad_norm": 2.3065993785858154, "learning_rate": 6.646893175535482e-06, "loss": 0.7236, "step": 1511500 }, { "epoch": 2.602046874704215, "grad_norm": 2.197575569152832, "learning_rate": 6.632552088263082e-06, "loss": 0.7213, "step": 1512000 }, { "epoch": 2.602907339940559, "grad_norm": 2.446925640106201, "learning_rate": 6.6182110009906835e-06, "loss": 0.7154, "step": 1512500 }, { "epoch": 2.6037678051769033, "grad_norm": 2.5222103595733643, "learning_rate": 6.603869913718283e-06, "loss": 0.7174, "step": 1513000 }, { "epoch": 2.6046282704132473, "grad_norm": 2.3834874629974365, "learning_rate": 6.589528826445883e-06, "loss": 0.7262, "step": 1513500 }, { "epoch": 2.6054887356495913, "grad_norm": 2.1265454292297363, "learning_rate": 6.575187739173483e-06, "loss": 0.7213, "step": 1514000 }, { "epoch": 2.6063492008859352, "grad_norm": 2.1744658946990967, "learning_rate": 6.5608466519010845e-06, "loss": 0.7266, "step": 1514500 }, { "epoch": 2.6072096661222792, "grad_norm": 2.554331064224243, "learning_rate": 6.546505564628684e-06, "loss": 0.7233, "step": 1515000 }, { "epoch": 2.608070131358623, "grad_norm": 2.343867778778076, "learning_rate": 6.532164477356284e-06, "loss": 0.7138, "step": 1515500 }, { "epoch": 2.608930596594967, "grad_norm": 2.1199309825897217, "learning_rate": 6.517823390083884e-06, "loss": 0.7177, "step": 1516000 }, { "epoch": 2.609791061831311, "grad_norm": 2.4904963970184326, "learning_rate": 6.503482302811484e-06, "loss": 0.7158, "step": 1516500 }, { "epoch": 2.610651527067655, "grad_norm": 2.239297389984131, "learning_rate": 6.489141215539085e-06, "loss": 0.7218, "step": 1517000 }, { "epoch": 2.611511992303999, "grad_norm": 2.024224042892456, "learning_rate": 6.474800128266685e-06, "loss": 0.7141, "step": 1517500 }, { "epoch": 2.612372457540343, "grad_norm": 3.024536609649658, "learning_rate": 6.460459040994285e-06, "loss": 0.7165, "step": 1518000 }, { "epoch": 2.613232922776687, "grad_norm": 2.288699150085449, "learning_rate": 6.446117953721885e-06, "loss": 0.7231, "step": 1518500 }, { "epoch": 2.614093388013031, "grad_norm": 2.127619743347168, "learning_rate": 6.431776866449486e-06, "loss": 0.7144, "step": 1519000 }, { "epoch": 2.614953853249375, "grad_norm": 2.112839460372925, "learning_rate": 6.417435779177086e-06, "loss": 0.7263, "step": 1519500 }, { "epoch": 2.615814318485719, "grad_norm": 2.175316333770752, "learning_rate": 6.403094691904686e-06, "loss": 0.7233, "step": 1520000 }, { "epoch": 2.616674783722063, "grad_norm": 2.203071355819702, "learning_rate": 6.388753604632286e-06, "loss": 0.7239, "step": 1520500 }, { "epoch": 2.617535248958407, "grad_norm": 2.312666416168213, "learning_rate": 6.3744125173598856e-06, "loss": 0.7222, "step": 1521000 }, { "epoch": 2.618395714194751, "grad_norm": 2.164508104324341, "learning_rate": 6.360071430087487e-06, "loss": 0.7194, "step": 1521500 }, { "epoch": 2.619256179431095, "grad_norm": 2.3181352615356445, "learning_rate": 6.345730342815087e-06, "loss": 0.7183, "step": 1522000 }, { "epoch": 2.620116644667439, "grad_norm": 2.6277377605438232, "learning_rate": 6.331389255542687e-06, "loss": 0.7212, "step": 1522500 }, { "epoch": 2.620977109903783, "grad_norm": 2.370537519454956, "learning_rate": 6.317048168270287e-06, "loss": 0.7193, "step": 1523000 }, { "epoch": 2.621837575140127, "grad_norm": 2.3331568241119385, "learning_rate": 6.302707080997887e-06, "loss": 0.7237, "step": 1523500 }, { "epoch": 2.622698040376471, "grad_norm": 2.125333547592163, "learning_rate": 6.288365993725488e-06, "loss": 0.7228, "step": 1524000 }, { "epoch": 2.623558505612815, "grad_norm": 2.0734002590179443, "learning_rate": 6.274024906453088e-06, "loss": 0.7272, "step": 1524500 }, { "epoch": 2.6244189708491588, "grad_norm": 2.216912031173706, "learning_rate": 6.2596838191806885e-06, "loss": 0.7157, "step": 1525000 }, { "epoch": 2.6252794360855027, "grad_norm": 2.442035436630249, "learning_rate": 6.245342731908289e-06, "loss": 0.7155, "step": 1525500 }, { "epoch": 2.6261399013218467, "grad_norm": 2.1774885654449463, "learning_rate": 6.2310016446358886e-06, "loss": 0.7178, "step": 1526000 }, { "epoch": 2.6270003665581907, "grad_norm": 2.2715914249420166, "learning_rate": 6.216660557363489e-06, "loss": 0.7141, "step": 1526500 }, { "epoch": 2.6278608317945347, "grad_norm": 2.1633667945861816, "learning_rate": 6.2023194700910895e-06, "loss": 0.7173, "step": 1527000 }, { "epoch": 2.6287212970308786, "grad_norm": 2.4417827129364014, "learning_rate": 6.187978382818689e-06, "loss": 0.7247, "step": 1527500 }, { "epoch": 2.6295817622672226, "grad_norm": 2.2331953048706055, "learning_rate": 6.17363729554629e-06, "loss": 0.7194, "step": 1528000 }, { "epoch": 2.6304422275035666, "grad_norm": 2.3902461528778076, "learning_rate": 6.159296208273889e-06, "loss": 0.7189, "step": 1528500 }, { "epoch": 2.6313026927399106, "grad_norm": 2.0932114124298096, "learning_rate": 6.1449551210014905e-06, "loss": 0.719, "step": 1529000 }, { "epoch": 2.6321631579762546, "grad_norm": 2.4338607788085938, "learning_rate": 6.13061403372909e-06, "loss": 0.7136, "step": 1529500 }, { "epoch": 2.6330236232125985, "grad_norm": 2.4229514598846436, "learning_rate": 6.116272946456691e-06, "loss": 0.719, "step": 1530000 }, { "epoch": 2.6338840884489425, "grad_norm": 2.219041109085083, "learning_rate": 6.101931859184291e-06, "loss": 0.715, "step": 1530500 }, { "epoch": 2.6347445536852865, "grad_norm": 2.183692455291748, "learning_rate": 6.087590771911891e-06, "loss": 0.7195, "step": 1531000 }, { "epoch": 2.6356050189216305, "grad_norm": 2.0069658756256104, "learning_rate": 6.073249684639491e-06, "loss": 0.7184, "step": 1531500 }, { "epoch": 2.6364654841579744, "grad_norm": 2.0577995777130127, "learning_rate": 6.058908597367091e-06, "loss": 0.7165, "step": 1532000 }, { "epoch": 2.6373259493943184, "grad_norm": 2.0661566257476807, "learning_rate": 6.044567510094692e-06, "loss": 0.7172, "step": 1532500 }, { "epoch": 2.6381864146306624, "grad_norm": 2.362360954284668, "learning_rate": 6.030226422822292e-06, "loss": 0.7185, "step": 1533000 }, { "epoch": 2.6390468798670064, "grad_norm": 2.183109998703003, "learning_rate": 6.015885335549892e-06, "loss": 0.7221, "step": 1533500 }, { "epoch": 2.6399073451033503, "grad_norm": 3.1113739013671875, "learning_rate": 6.001544248277492e-06, "loss": 0.7196, "step": 1534000 }, { "epoch": 2.6407678103396943, "grad_norm": 2.251563787460327, "learning_rate": 5.987203161005093e-06, "loss": 0.7117, "step": 1534500 }, { "epoch": 2.6416282755760383, "grad_norm": 2.264225721359253, "learning_rate": 5.972862073732693e-06, "loss": 0.7211, "step": 1535000 }, { "epoch": 2.6424887408123823, "grad_norm": 2.278864860534668, "learning_rate": 5.958520986460292e-06, "loss": 0.7175, "step": 1535500 }, { "epoch": 2.6433492060487263, "grad_norm": 2.30007004737854, "learning_rate": 5.944179899187893e-06, "loss": 0.7235, "step": 1536000 }, { "epoch": 2.6442096712850702, "grad_norm": 2.183664560317993, "learning_rate": 5.929838811915493e-06, "loss": 0.7182, "step": 1536500 }, { "epoch": 2.645070136521414, "grad_norm": 2.0430800914764404, "learning_rate": 5.915497724643094e-06, "loss": 0.7184, "step": 1537000 }, { "epoch": 2.645930601757758, "grad_norm": 2.242837429046631, "learning_rate": 5.9011566373706935e-06, "loss": 0.7154, "step": 1537500 }, { "epoch": 2.646791066994102, "grad_norm": 2.0278875827789307, "learning_rate": 5.886815550098295e-06, "loss": 0.723, "step": 1538000 }, { "epoch": 2.647651532230446, "grad_norm": 2.1112453937530518, "learning_rate": 5.872474462825894e-06, "loss": 0.7193, "step": 1538500 }, { "epoch": 2.64851199746679, "grad_norm": 2.058110475540161, "learning_rate": 5.858133375553494e-06, "loss": 0.7187, "step": 1539000 }, { "epoch": 2.649372462703134, "grad_norm": 2.3659400939941406, "learning_rate": 5.8437922882810945e-06, "loss": 0.714, "step": 1539500 }, { "epoch": 2.650232927939478, "grad_norm": 1.9934048652648926, "learning_rate": 5.829451201008695e-06, "loss": 0.7185, "step": 1540000 }, { "epoch": 2.651093393175822, "grad_norm": 2.47733473777771, "learning_rate": 5.8151101137362954e-06, "loss": 0.7158, "step": 1540500 }, { "epoch": 2.651953858412166, "grad_norm": 2.123170852661133, "learning_rate": 5.800769026463895e-06, "loss": 0.7115, "step": 1541000 }, { "epoch": 2.6528143236485104, "grad_norm": 2.4889888763427734, "learning_rate": 5.7864279391914955e-06, "loss": 0.7171, "step": 1541500 }, { "epoch": 2.6536747888848544, "grad_norm": 2.187666177749634, "learning_rate": 5.772086851919096e-06, "loss": 0.7199, "step": 1542000 }, { "epoch": 2.6545352541211984, "grad_norm": 2.203145742416382, "learning_rate": 5.7577457646466965e-06, "loss": 0.7189, "step": 1542500 }, { "epoch": 2.6553957193575424, "grad_norm": 2.287001371383667, "learning_rate": 5.743404677374296e-06, "loss": 0.7182, "step": 1543000 }, { "epoch": 2.6562561845938863, "grad_norm": 2.1928348541259766, "learning_rate": 5.7290635901018966e-06, "loss": 0.7168, "step": 1543500 }, { "epoch": 2.6571166498302303, "grad_norm": 2.1668388843536377, "learning_rate": 5.714722502829497e-06, "loss": 0.7168, "step": 1544000 }, { "epoch": 2.6579771150665743, "grad_norm": 2.414255380630493, "learning_rate": 5.700381415557097e-06, "loss": 0.7173, "step": 1544500 }, { "epoch": 2.6588375803029183, "grad_norm": 2.3311398029327393, "learning_rate": 5.686040328284697e-06, "loss": 0.7201, "step": 1545000 }, { "epoch": 2.6596980455392623, "grad_norm": 2.3980069160461426, "learning_rate": 5.671699241012298e-06, "loss": 0.7238, "step": 1545500 }, { "epoch": 2.6605585107756062, "grad_norm": 2.217282772064209, "learning_rate": 5.657358153739898e-06, "loss": 0.7228, "step": 1546000 }, { "epoch": 2.66141897601195, "grad_norm": 2.153686285018921, "learning_rate": 5.643017066467498e-06, "loss": 0.7108, "step": 1546500 }, { "epoch": 2.662279441248294, "grad_norm": 2.188215494155884, "learning_rate": 5.628675979195098e-06, "loss": 0.7144, "step": 1547000 }, { "epoch": 2.663139906484638, "grad_norm": 2.301560640335083, "learning_rate": 5.614334891922699e-06, "loss": 0.7198, "step": 1547500 }, { "epoch": 2.664000371720982, "grad_norm": 2.305785894393921, "learning_rate": 5.599993804650298e-06, "loss": 0.7208, "step": 1548000 }, { "epoch": 2.664860836957326, "grad_norm": 2.377692937850952, "learning_rate": 5.585652717377899e-06, "loss": 0.7178, "step": 1548500 }, { "epoch": 2.66572130219367, "grad_norm": 2.112016439437866, "learning_rate": 5.571311630105499e-06, "loss": 0.7147, "step": 1549000 }, { "epoch": 2.666581767430014, "grad_norm": 2.1709604263305664, "learning_rate": 5.5569705428331e-06, "loss": 0.7178, "step": 1549500 }, { "epoch": 2.667442232666358, "grad_norm": 2.2401998043060303, "learning_rate": 5.542629455560699e-06, "loss": 0.7196, "step": 1550000 }, { "epoch": 2.668302697902702, "grad_norm": 2.144211769104004, "learning_rate": 5.5282883682883e-06, "loss": 0.7187, "step": 1550500 }, { "epoch": 2.669163163139046, "grad_norm": 2.3315768241882324, "learning_rate": 5.5139472810159e-06, "loss": 0.7158, "step": 1551000 }, { "epoch": 2.67002362837539, "grad_norm": 2.303516149520874, "learning_rate": 5.4996061937435e-06, "loss": 0.711, "step": 1551500 }, { "epoch": 2.670884093611734, "grad_norm": 2.337053060531616, "learning_rate": 5.4852651064711e-06, "loss": 0.722, "step": 1552000 }, { "epoch": 2.671744558848078, "grad_norm": 2.1762800216674805, "learning_rate": 5.470924019198701e-06, "loss": 0.716, "step": 1552500 }, { "epoch": 2.672605024084422, "grad_norm": 2.34902286529541, "learning_rate": 5.456582931926301e-06, "loss": 0.716, "step": 1553000 }, { "epoch": 2.673465489320766, "grad_norm": 2.3098535537719727, "learning_rate": 5.442241844653901e-06, "loss": 0.7156, "step": 1553500 }, { "epoch": 2.67432595455711, "grad_norm": 2.463733434677124, "learning_rate": 5.427900757381501e-06, "loss": 0.7184, "step": 1554000 }, { "epoch": 2.675186419793454, "grad_norm": 2.3530120849609375, "learning_rate": 5.413559670109102e-06, "loss": 0.7143, "step": 1554500 }, { "epoch": 2.676046885029798, "grad_norm": 2.367659330368042, "learning_rate": 5.3992185828367014e-06, "loss": 0.719, "step": 1555000 }, { "epoch": 2.676907350266142, "grad_norm": 2.357699155807495, "learning_rate": 5.384877495564302e-06, "loss": 0.7192, "step": 1555500 }, { "epoch": 2.6777678155024858, "grad_norm": 2.980602502822876, "learning_rate": 5.370536408291902e-06, "loss": 0.7164, "step": 1556000 }, { "epoch": 2.6786282807388297, "grad_norm": 2.181764602661133, "learning_rate": 5.356195321019503e-06, "loss": 0.7233, "step": 1556500 }, { "epoch": 2.6794887459751737, "grad_norm": 2.156656265258789, "learning_rate": 5.3418542337471025e-06, "loss": 0.7187, "step": 1557000 }, { "epoch": 2.6803492112115177, "grad_norm": 2.2862629890441895, "learning_rate": 5.327513146474703e-06, "loss": 0.723, "step": 1557500 }, { "epoch": 2.6812096764478617, "grad_norm": 2.340113639831543, "learning_rate": 5.3131720592023034e-06, "loss": 0.7213, "step": 1558000 }, { "epoch": 2.6820701416842057, "grad_norm": 2.258385419845581, "learning_rate": 5.298830971929903e-06, "loss": 0.7208, "step": 1558500 }, { "epoch": 2.68293060692055, "grad_norm": 2.2693705558776855, "learning_rate": 5.2844898846575035e-06, "loss": 0.7172, "step": 1559000 }, { "epoch": 2.683791072156894, "grad_norm": 2.1848247051239014, "learning_rate": 5.270148797385103e-06, "loss": 0.7187, "step": 1559500 }, { "epoch": 2.684651537393238, "grad_norm": 2.1628623008728027, "learning_rate": 5.2558077101127045e-06, "loss": 0.717, "step": 1560000 }, { "epoch": 2.685512002629582, "grad_norm": 2.308980703353882, "learning_rate": 5.241466622840304e-06, "loss": 0.7178, "step": 1560500 }, { "epoch": 2.686372467865926, "grad_norm": 2.333484649658203, "learning_rate": 5.2271255355679046e-06, "loss": 0.7124, "step": 1561000 }, { "epoch": 2.68723293310227, "grad_norm": 2.2160236835479736, "learning_rate": 5.212784448295505e-06, "loss": 0.7181, "step": 1561500 }, { "epoch": 2.688093398338614, "grad_norm": 2.2782230377197266, "learning_rate": 5.1984433610231055e-06, "loss": 0.7144, "step": 1562000 }, { "epoch": 2.688953863574958, "grad_norm": 2.1509571075439453, "learning_rate": 5.184102273750705e-06, "loss": 0.7162, "step": 1562500 }, { "epoch": 2.689814328811302, "grad_norm": 2.432591199874878, "learning_rate": 5.169761186478305e-06, "loss": 0.7184, "step": 1563000 }, { "epoch": 2.690674794047646, "grad_norm": 2.318580389022827, "learning_rate": 5.155420099205906e-06, "loss": 0.7211, "step": 1563500 }, { "epoch": 2.69153525928399, "grad_norm": 2.24094820022583, "learning_rate": 5.141079011933506e-06, "loss": 0.7178, "step": 1564000 }, { "epoch": 2.692395724520334, "grad_norm": 2.3038113117218018, "learning_rate": 5.126737924661106e-06, "loss": 0.7092, "step": 1564500 }, { "epoch": 2.693256189756678, "grad_norm": 2.427248954772949, "learning_rate": 5.112396837388706e-06, "loss": 0.7157, "step": 1565000 }, { "epoch": 2.6941166549930218, "grad_norm": 2.2189440727233887, "learning_rate": 5.098055750116307e-06, "loss": 0.7186, "step": 1565500 }, { "epoch": 2.6949771202293658, "grad_norm": 2.4753639698028564, "learning_rate": 5.083714662843907e-06, "loss": 0.7176, "step": 1566000 }, { "epoch": 2.6958375854657097, "grad_norm": 2.243990421295166, "learning_rate": 5.069373575571506e-06, "loss": 0.7159, "step": 1566500 }, { "epoch": 2.6966980507020537, "grad_norm": 2.1928725242614746, "learning_rate": 5.055032488299107e-06, "loss": 0.7201, "step": 1567000 }, { "epoch": 2.6975585159383977, "grad_norm": 16.473033905029297, "learning_rate": 5.040691401026707e-06, "loss": 0.713, "step": 1567500 }, { "epoch": 2.6984189811747417, "grad_norm": 2.0748605728149414, "learning_rate": 5.026350313754308e-06, "loss": 0.719, "step": 1568000 }, { "epoch": 2.6992794464110856, "grad_norm": 2.2077245712280273, "learning_rate": 5.012009226481907e-06, "loss": 0.7133, "step": 1568500 }, { "epoch": 2.7001399116474296, "grad_norm": 2.1680569648742676, "learning_rate": 4.997668139209509e-06, "loss": 0.7192, "step": 1569000 }, { "epoch": 2.7010003768837736, "grad_norm": 2.239652156829834, "learning_rate": 4.983327051937108e-06, "loss": 0.7151, "step": 1569500 }, { "epoch": 2.7018608421201176, "grad_norm": 2.357666015625, "learning_rate": 4.968985964664708e-06, "loss": 0.7252, "step": 1570000 }, { "epoch": 2.7027213073564615, "grad_norm": 2.201154947280884, "learning_rate": 4.954644877392308e-06, "loss": 0.7147, "step": 1570500 }, { "epoch": 2.7035817725928055, "grad_norm": 3.680764675140381, "learning_rate": 4.940303790119909e-06, "loss": 0.7137, "step": 1571000 }, { "epoch": 2.7044422378291495, "grad_norm": 2.185701608657837, "learning_rate": 4.925962702847509e-06, "loss": 0.7142, "step": 1571500 }, { "epoch": 2.7053027030654935, "grad_norm": 2.305809736251831, "learning_rate": 4.911621615575109e-06, "loss": 0.7244, "step": 1572000 }, { "epoch": 2.7061631683018375, "grad_norm": 2.3235483169555664, "learning_rate": 4.8972805283027094e-06, "loss": 0.7162, "step": 1572500 }, { "epoch": 2.7070236335381814, "grad_norm": 9.165050506591797, "learning_rate": 4.88293944103031e-06, "loss": 0.718, "step": 1573000 }, { "epoch": 2.7078840987745254, "grad_norm": 2.1549155712127686, "learning_rate": 4.86859835375791e-06, "loss": 0.7105, "step": 1573500 }, { "epoch": 2.7087445640108694, "grad_norm": 2.574151039123535, "learning_rate": 4.85425726648551e-06, "loss": 0.7169, "step": 1574000 }, { "epoch": 2.7096050292472134, "grad_norm": 2.3129498958587646, "learning_rate": 4.8399161792131105e-06, "loss": 0.7116, "step": 1574500 }, { "epoch": 2.7104654944835573, "grad_norm": 2.2015106678009033, "learning_rate": 4.825575091940711e-06, "loss": 0.7174, "step": 1575000 }, { "epoch": 2.7113259597199013, "grad_norm": 2.3222100734710693, "learning_rate": 4.8112340046683106e-06, "loss": 0.7202, "step": 1575500 }, { "epoch": 2.7121864249562453, "grad_norm": 2.226888418197632, "learning_rate": 4.796892917395911e-06, "loss": 0.718, "step": 1576000 }, { "epoch": 2.7130468901925893, "grad_norm": 2.2705578804016113, "learning_rate": 4.7825518301235115e-06, "loss": 0.7136, "step": 1576500 }, { "epoch": 2.7139073554289332, "grad_norm": 2.3102686405181885, "learning_rate": 4.768210742851112e-06, "loss": 0.7222, "step": 1577000 }, { "epoch": 2.714767820665277, "grad_norm": 2.3631961345672607, "learning_rate": 4.753869655578712e-06, "loss": 0.7129, "step": 1577500 }, { "epoch": 2.715628285901621, "grad_norm": 2.1715190410614014, "learning_rate": 4.739528568306312e-06, "loss": 0.718, "step": 1578000 }, { "epoch": 2.716488751137965, "grad_norm": 2.1148438453674316, "learning_rate": 4.7251874810339125e-06, "loss": 0.7113, "step": 1578500 }, { "epoch": 2.717349216374309, "grad_norm": 2.140204668045044, "learning_rate": 4.710846393761512e-06, "loss": 0.7188, "step": 1579000 }, { "epoch": 2.718209681610653, "grad_norm": 2.269834041595459, "learning_rate": 4.696505306489113e-06, "loss": 0.7139, "step": 1579500 }, { "epoch": 2.719070146846997, "grad_norm": 2.177370548248291, "learning_rate": 4.682164219216713e-06, "loss": 0.7139, "step": 1580000 }, { "epoch": 2.719930612083341, "grad_norm": 2.2315120697021484, "learning_rate": 4.667823131944314e-06, "loss": 0.7171, "step": 1580500 }, { "epoch": 2.720791077319685, "grad_norm": 2.628296375274658, "learning_rate": 4.653482044671913e-06, "loss": 0.7164, "step": 1581000 }, { "epoch": 2.721651542556029, "grad_norm": 2.3418428897857666, "learning_rate": 4.639140957399514e-06, "loss": 0.7134, "step": 1581500 }, { "epoch": 2.722512007792373, "grad_norm": 2.3924858570098877, "learning_rate": 4.624799870127114e-06, "loss": 0.7081, "step": 1582000 }, { "epoch": 2.723372473028717, "grad_norm": 2.1358137130737305, "learning_rate": 4.610458782854714e-06, "loss": 0.7163, "step": 1582500 }, { "epoch": 2.724232938265061, "grad_norm": 2.155128240585327, "learning_rate": 4.596117695582314e-06, "loss": 0.7186, "step": 1583000 }, { "epoch": 2.725093403501405, "grad_norm": 2.1119282245635986, "learning_rate": 4.581776608309915e-06, "loss": 0.7141, "step": 1583500 }, { "epoch": 2.725953868737749, "grad_norm": 2.3449249267578125, "learning_rate": 4.567435521037515e-06, "loss": 0.7169, "step": 1584000 }, { "epoch": 2.726814333974093, "grad_norm": 2.5031585693359375, "learning_rate": 4.553094433765115e-06, "loss": 0.7203, "step": 1584500 }, { "epoch": 2.727674799210437, "grad_norm": 2.304473400115967, "learning_rate": 4.538753346492715e-06, "loss": 0.7182, "step": 1585000 }, { "epoch": 2.728535264446781, "grad_norm": 2.4063103199005127, "learning_rate": 4.524412259220316e-06, "loss": 0.7122, "step": 1585500 }, { "epoch": 2.729395729683125, "grad_norm": 2.4361159801483154, "learning_rate": 4.510071171947915e-06, "loss": 0.7111, "step": 1586000 }, { "epoch": 2.730256194919469, "grad_norm": 2.2181286811828613, "learning_rate": 4.495730084675516e-06, "loss": 0.7176, "step": 1586500 }, { "epoch": 2.731116660155813, "grad_norm": 2.414072036743164, "learning_rate": 4.481388997403116e-06, "loss": 0.7226, "step": 1587000 }, { "epoch": 2.7319771253921568, "grad_norm": 2.095151662826538, "learning_rate": 4.467047910130717e-06, "loss": 0.7134, "step": 1587500 }, { "epoch": 2.732837590628501, "grad_norm": 2.0435218811035156, "learning_rate": 4.452706822858316e-06, "loss": 0.7117, "step": 1588000 }, { "epoch": 2.733698055864845, "grad_norm": 2.6669647693634033, "learning_rate": 4.438365735585917e-06, "loss": 0.7096, "step": 1588500 }, { "epoch": 2.734558521101189, "grad_norm": 1.955651879310608, "learning_rate": 4.424024648313517e-06, "loss": 0.721, "step": 1589000 }, { "epoch": 2.735418986337533, "grad_norm": 2.1903460025787354, "learning_rate": 4.409683561041118e-06, "loss": 0.7173, "step": 1589500 }, { "epoch": 2.736279451573877, "grad_norm": 2.187950372695923, "learning_rate": 4.3953424737687174e-06, "loss": 0.7102, "step": 1590000 }, { "epoch": 2.737139916810221, "grad_norm": 2.1879000663757324, "learning_rate": 4.381001386496317e-06, "loss": 0.7166, "step": 1590500 }, { "epoch": 2.738000382046565, "grad_norm": 2.2645974159240723, "learning_rate": 4.366660299223918e-06, "loss": 0.7154, "step": 1591000 }, { "epoch": 2.738860847282909, "grad_norm": 2.1378839015960693, "learning_rate": 4.352319211951518e-06, "loss": 0.7153, "step": 1591500 }, { "epoch": 2.739721312519253, "grad_norm": 2.3064563274383545, "learning_rate": 4.3379781246791185e-06, "loss": 0.7147, "step": 1592000 }, { "epoch": 2.740581777755597, "grad_norm": 2.2445333003997803, "learning_rate": 4.323637037406719e-06, "loss": 0.707, "step": 1592500 }, { "epoch": 2.741442242991941, "grad_norm": 2.3018760681152344, "learning_rate": 4.309295950134319e-06, "loss": 0.7121, "step": 1593000 }, { "epoch": 2.742302708228285, "grad_norm": 2.269040107727051, "learning_rate": 4.294954862861919e-06, "loss": 0.7126, "step": 1593500 }, { "epoch": 2.743163173464629, "grad_norm": 2.353139638900757, "learning_rate": 4.280613775589519e-06, "loss": 0.7233, "step": 1594000 }, { "epoch": 2.744023638700973, "grad_norm": 2.38712215423584, "learning_rate": 4.26627268831712e-06, "loss": 0.7131, "step": 1594500 }, { "epoch": 2.744884103937317, "grad_norm": 2.411133289337158, "learning_rate": 4.25193160104472e-06, "loss": 0.71, "step": 1595000 }, { "epoch": 2.745744569173661, "grad_norm": 2.1307971477508545, "learning_rate": 4.23759051377232e-06, "loss": 0.718, "step": 1595500 }, { "epoch": 2.746605034410005, "grad_norm": 2.3968119621276855, "learning_rate": 4.22324942649992e-06, "loss": 0.7162, "step": 1596000 }, { "epoch": 2.747465499646349, "grad_norm": 2.289236068725586, "learning_rate": 4.208908339227521e-06, "loss": 0.7117, "step": 1596500 }, { "epoch": 2.7483259648826928, "grad_norm": 2.1929147243499756, "learning_rate": 4.194567251955121e-06, "loss": 0.7153, "step": 1597000 }, { "epoch": 2.7491864301190367, "grad_norm": 2.2217397689819336, "learning_rate": 4.18022616468272e-06, "loss": 0.7133, "step": 1597500 }, { "epoch": 2.7500468953553807, "grad_norm": 2.5000901222229004, "learning_rate": 4.165885077410321e-06, "loss": 0.7143, "step": 1598000 }, { "epoch": 2.7509073605917247, "grad_norm": 2.1965184211730957, "learning_rate": 4.151543990137921e-06, "loss": 0.711, "step": 1598500 }, { "epoch": 2.7517678258280687, "grad_norm": 2.4667575359344482, "learning_rate": 4.137202902865522e-06, "loss": 0.721, "step": 1599000 }, { "epoch": 2.7526282910644126, "grad_norm": 2.006913661956787, "learning_rate": 4.122861815593121e-06, "loss": 0.715, "step": 1599500 }, { "epoch": 2.7534887563007566, "grad_norm": 2.0907866954803467, "learning_rate": 4.108520728320723e-06, "loss": 0.716, "step": 1600000 }, { "epoch": 2.7543492215371006, "grad_norm": 2.113661289215088, "learning_rate": 4.094179641048322e-06, "loss": 0.7127, "step": 1600500 }, { "epoch": 2.7552096867734446, "grad_norm": 2.062978744506836, "learning_rate": 4.079838553775923e-06, "loss": 0.7155, "step": 1601000 }, { "epoch": 2.7560701520097886, "grad_norm": 2.0641098022460938, "learning_rate": 4.065497466503522e-06, "loss": 0.7156, "step": 1601500 }, { "epoch": 2.7569306172461325, "grad_norm": 2.1433663368225098, "learning_rate": 4.051156379231123e-06, "loss": 0.7186, "step": 1602000 }, { "epoch": 2.7577910824824765, "grad_norm": 2.1974167823791504, "learning_rate": 4.036815291958723e-06, "loss": 0.7123, "step": 1602500 }, { "epoch": 2.7586515477188205, "grad_norm": 2.3867440223693848, "learning_rate": 4.022474204686323e-06, "loss": 0.7195, "step": 1603000 }, { "epoch": 2.7595120129551645, "grad_norm": 2.26507306098938, "learning_rate": 4.008133117413923e-06, "loss": 0.71, "step": 1603500 }, { "epoch": 2.7603724781915084, "grad_norm": 2.171724319458008, "learning_rate": 3.993792030141524e-06, "loss": 0.7135, "step": 1604000 }, { "epoch": 2.7612329434278524, "grad_norm": 2.4163520336151123, "learning_rate": 3.979450942869124e-06, "loss": 0.7221, "step": 1604500 }, { "epoch": 2.7620934086641964, "grad_norm": 2.2443559169769287, "learning_rate": 3.965109855596724e-06, "loss": 0.7165, "step": 1605000 }, { "epoch": 2.762953873900541, "grad_norm": 1.999537467956543, "learning_rate": 3.950768768324324e-06, "loss": 0.7166, "step": 1605500 }, { "epoch": 2.763814339136885, "grad_norm": 2.4677159786224365, "learning_rate": 3.936427681051925e-06, "loss": 0.7085, "step": 1606000 }, { "epoch": 2.7646748043732288, "grad_norm": 2.335520029067993, "learning_rate": 3.9220865937795245e-06, "loss": 0.7078, "step": 1606500 }, { "epoch": 2.7655352696095727, "grad_norm": 2.352250576019287, "learning_rate": 3.907745506507125e-06, "loss": 0.7164, "step": 1607000 }, { "epoch": 2.7663957348459167, "grad_norm": 2.2408668994903564, "learning_rate": 3.8934044192347254e-06, "loss": 0.7158, "step": 1607500 }, { "epoch": 2.7672562000822607, "grad_norm": 2.1717495918273926, "learning_rate": 3.879063331962326e-06, "loss": 0.7152, "step": 1608000 }, { "epoch": 2.7681166653186047, "grad_norm": 2.263266086578369, "learning_rate": 3.8647222446899255e-06, "loss": 0.7139, "step": 1608500 }, { "epoch": 2.7689771305549487, "grad_norm": 2.527707099914551, "learning_rate": 3.850381157417526e-06, "loss": 0.7116, "step": 1609000 }, { "epoch": 2.7698375957912926, "grad_norm": 2.510777473449707, "learning_rate": 3.8360400701451265e-06, "loss": 0.7161, "step": 1609500 }, { "epoch": 2.7706980610276366, "grad_norm": 1.8678065538406372, "learning_rate": 3.821698982872726e-06, "loss": 0.7149, "step": 1610000 }, { "epoch": 2.7715585262639806, "grad_norm": 2.1824417114257812, "learning_rate": 3.807357895600327e-06, "loss": 0.7106, "step": 1610500 }, { "epoch": 2.7724189915003246, "grad_norm": 2.100099802017212, "learning_rate": 3.7930168083279266e-06, "loss": 0.712, "step": 1611000 }, { "epoch": 2.7732794567366685, "grad_norm": 2.1548209190368652, "learning_rate": 3.7786757210555275e-06, "loss": 0.7082, "step": 1611500 }, { "epoch": 2.7741399219730125, "grad_norm": 2.0101969242095947, "learning_rate": 3.764334633783127e-06, "loss": 0.7105, "step": 1612000 }, { "epoch": 2.7750003872093565, "grad_norm": 2.102904796600342, "learning_rate": 3.749993546510728e-06, "loss": 0.7055, "step": 1612500 }, { "epoch": 2.7758608524457005, "grad_norm": 2.2598965167999268, "learning_rate": 3.7356524592383276e-06, "loss": 0.7118, "step": 1613000 }, { "epoch": 2.7767213176820444, "grad_norm": 2.071484088897705, "learning_rate": 3.7213113719659277e-06, "loss": 0.709, "step": 1613500 }, { "epoch": 2.7775817829183884, "grad_norm": 2.030569076538086, "learning_rate": 3.706970284693528e-06, "loss": 0.7142, "step": 1614000 }, { "epoch": 2.7784422481547324, "grad_norm": 2.0957064628601074, "learning_rate": 3.692629197421128e-06, "loss": 0.7083, "step": 1614500 }, { "epoch": 2.7793027133910764, "grad_norm": 2.3552098274230957, "learning_rate": 3.678288110148729e-06, "loss": 0.7138, "step": 1615000 }, { "epoch": 2.7801631786274204, "grad_norm": 2.4001100063323975, "learning_rate": 3.6639470228763287e-06, "loss": 0.7178, "step": 1615500 }, { "epoch": 2.7810236438637643, "grad_norm": 2.3316562175750732, "learning_rate": 3.6496059356039296e-06, "loss": 0.7125, "step": 1616000 }, { "epoch": 2.7818841091001083, "grad_norm": 2.0596606731414795, "learning_rate": 3.6352648483315292e-06, "loss": 0.7096, "step": 1616500 }, { "epoch": 2.7827445743364523, "grad_norm": 2.2260353565216064, "learning_rate": 3.6209237610591293e-06, "loss": 0.7193, "step": 1617000 }, { "epoch": 2.7836050395727963, "grad_norm": 2.0922586917877197, "learning_rate": 3.6065826737867298e-06, "loss": 0.7118, "step": 1617500 }, { "epoch": 2.7844655048091402, "grad_norm": 2.368934154510498, "learning_rate": 3.59224158651433e-06, "loss": 0.7126, "step": 1618000 }, { "epoch": 2.785325970045484, "grad_norm": 2.3944685459136963, "learning_rate": 3.5779004992419303e-06, "loss": 0.7109, "step": 1618500 }, { "epoch": 2.786186435281828, "grad_norm": 2.3955578804016113, "learning_rate": 3.5635594119695303e-06, "loss": 0.7081, "step": 1619000 }, { "epoch": 2.787046900518172, "grad_norm": 2.152188301086426, "learning_rate": 3.549218324697131e-06, "loss": 0.7163, "step": 1619500 }, { "epoch": 2.787907365754516, "grad_norm": 1.973374605178833, "learning_rate": 3.534877237424731e-06, "loss": 0.7167, "step": 1620000 }, { "epoch": 2.78876783099086, "grad_norm": 2.142923593521118, "learning_rate": 3.5205361501523313e-06, "loss": 0.7101, "step": 1620500 }, { "epoch": 2.789628296227204, "grad_norm": 2.1474125385284424, "learning_rate": 3.5061950628799314e-06, "loss": 0.7173, "step": 1621000 }, { "epoch": 2.790488761463548, "grad_norm": 2.028701066970825, "learning_rate": 3.4918539756075314e-06, "loss": 0.7148, "step": 1621500 }, { "epoch": 2.791349226699892, "grad_norm": 2.265554189682007, "learning_rate": 3.477512888335132e-06, "loss": 0.7124, "step": 1622000 }, { "epoch": 2.792209691936236, "grad_norm": 2.309709310531616, "learning_rate": 3.463171801062732e-06, "loss": 0.7109, "step": 1622500 }, { "epoch": 2.79307015717258, "grad_norm": 2.34216046333313, "learning_rate": 3.4488307137903324e-06, "loss": 0.7131, "step": 1623000 }, { "epoch": 2.793930622408924, "grad_norm": 2.198592185974121, "learning_rate": 3.4344896265179324e-06, "loss": 0.7077, "step": 1623500 }, { "epoch": 2.794791087645268, "grad_norm": 2.2854268550872803, "learning_rate": 3.420148539245533e-06, "loss": 0.7145, "step": 1624000 }, { "epoch": 2.795651552881612, "grad_norm": 2.2535135746002197, "learning_rate": 3.405807451973133e-06, "loss": 0.715, "step": 1624500 }, { "epoch": 2.796512018117956, "grad_norm": 2.2533137798309326, "learning_rate": 3.391466364700733e-06, "loss": 0.7105, "step": 1625000 }, { "epoch": 2.7973724833543, "grad_norm": 2.0734574794769287, "learning_rate": 3.3771252774283335e-06, "loss": 0.7128, "step": 1625500 }, { "epoch": 2.798232948590644, "grad_norm": 2.3115506172180176, "learning_rate": 3.3627841901559335e-06, "loss": 0.712, "step": 1626000 }, { "epoch": 2.799093413826988, "grad_norm": 2.0518035888671875, "learning_rate": 3.348443102883534e-06, "loss": 0.7065, "step": 1626500 }, { "epoch": 2.799953879063332, "grad_norm": 2.206268072128296, "learning_rate": 3.334102015611134e-06, "loss": 0.7109, "step": 1627000 }, { "epoch": 2.800814344299676, "grad_norm": 2.0784969329833984, "learning_rate": 3.3197609283387345e-06, "loss": 0.7105, "step": 1627500 }, { "epoch": 2.8016748095360198, "grad_norm": 2.217696189880371, "learning_rate": 3.3054198410663346e-06, "loss": 0.7092, "step": 1628000 }, { "epoch": 2.8025352747723637, "grad_norm": 2.317526340484619, "learning_rate": 3.291078753793935e-06, "loss": 0.7107, "step": 1628500 }, { "epoch": 2.8033957400087077, "grad_norm": 2.112042188644409, "learning_rate": 3.276737666521535e-06, "loss": 0.7141, "step": 1629000 }, { "epoch": 2.8042562052450517, "grad_norm": 2.475770950317383, "learning_rate": 3.262396579249135e-06, "loss": 0.714, "step": 1629500 }, { "epoch": 2.8051166704813957, "grad_norm": 2.3125, "learning_rate": 3.2480554919767356e-06, "loss": 0.7104, "step": 1630000 }, { "epoch": 2.8059771357177397, "grad_norm": 2.1881062984466553, "learning_rate": 3.2337144047043356e-06, "loss": 0.7104, "step": 1630500 }, { "epoch": 2.8068376009540836, "grad_norm": 2.3878414630889893, "learning_rate": 3.219373317431936e-06, "loss": 0.7065, "step": 1631000 }, { "epoch": 2.8076980661904276, "grad_norm": 2.6674928665161133, "learning_rate": 3.205032230159536e-06, "loss": 0.7168, "step": 1631500 }, { "epoch": 2.8085585314267716, "grad_norm": 2.050078868865967, "learning_rate": 3.1906911428871366e-06, "loss": 0.7118, "step": 1632000 }, { "epoch": 2.8094189966631156, "grad_norm": 2.17096209526062, "learning_rate": 3.1763500556147367e-06, "loss": 0.7136, "step": 1632500 }, { "epoch": 2.8102794618994595, "grad_norm": 2.199841260910034, "learning_rate": 3.1620089683423363e-06, "loss": 0.7085, "step": 1633000 }, { "epoch": 2.8111399271358035, "grad_norm": 2.2607522010803223, "learning_rate": 3.147667881069937e-06, "loss": 0.7138, "step": 1633500 }, { "epoch": 2.812000392372148, "grad_norm": 2.1586544513702393, "learning_rate": 3.1333267937975372e-06, "loss": 0.7089, "step": 1634000 }, { "epoch": 2.812860857608492, "grad_norm": 2.181818723678589, "learning_rate": 3.1189857065251377e-06, "loss": 0.7105, "step": 1634500 }, { "epoch": 2.813721322844836, "grad_norm": 2.3212060928344727, "learning_rate": 3.1046446192527378e-06, "loss": 0.7112, "step": 1635000 }, { "epoch": 2.81458178808118, "grad_norm": 2.3202192783355713, "learning_rate": 3.090303531980338e-06, "loss": 0.716, "step": 1635500 }, { "epoch": 2.815442253317524, "grad_norm": 2.2512378692626953, "learning_rate": 3.0759624447079383e-06, "loss": 0.711, "step": 1636000 }, { "epoch": 2.816302718553868, "grad_norm": 2.1948952674865723, "learning_rate": 3.0616213574355383e-06, "loss": 0.7098, "step": 1636500 }, { "epoch": 2.817163183790212, "grad_norm": 2.0541694164276123, "learning_rate": 3.047280270163139e-06, "loss": 0.7133, "step": 1637000 }, { "epoch": 2.8180236490265558, "grad_norm": 2.204808473587036, "learning_rate": 3.032939182890739e-06, "loss": 0.7149, "step": 1637500 }, { "epoch": 2.8188841142628998, "grad_norm": 2.3189733028411865, "learning_rate": 3.0185980956183393e-06, "loss": 0.7067, "step": 1638000 }, { "epoch": 2.8197445794992437, "grad_norm": 2.3678770065307617, "learning_rate": 3.004257008345939e-06, "loss": 0.7081, "step": 1638500 }, { "epoch": 2.8206050447355877, "grad_norm": 2.376946210861206, "learning_rate": 2.9899159210735394e-06, "loss": 0.7155, "step": 1639000 }, { "epoch": 2.8214655099719317, "grad_norm": 2.2931296825408936, "learning_rate": 2.9755748338011394e-06, "loss": 0.7107, "step": 1639500 }, { "epoch": 2.8223259752082757, "grad_norm": 2.1000328063964844, "learning_rate": 2.96123374652874e-06, "loss": 0.7128, "step": 1640000 }, { "epoch": 2.8231864404446196, "grad_norm": 2.423055648803711, "learning_rate": 2.9468926592563404e-06, "loss": 0.7117, "step": 1640500 }, { "epoch": 2.8240469056809636, "grad_norm": 2.1777422428131104, "learning_rate": 2.9325515719839404e-06, "loss": 0.7121, "step": 1641000 }, { "epoch": 2.8249073709173076, "grad_norm": 2.3314688205718994, "learning_rate": 2.918210484711541e-06, "loss": 0.7083, "step": 1641500 }, { "epoch": 2.8257678361536516, "grad_norm": 2.2967236042022705, "learning_rate": 2.903869397439141e-06, "loss": 0.7108, "step": 1642000 }, { "epoch": 2.8266283013899955, "grad_norm": 2.349409341812134, "learning_rate": 2.889528310166741e-06, "loss": 0.7097, "step": 1642500 }, { "epoch": 2.8274887666263395, "grad_norm": 2.4915151596069336, "learning_rate": 2.875187222894341e-06, "loss": 0.7145, "step": 1643000 }, { "epoch": 2.8283492318626835, "grad_norm": 2.3456923961639404, "learning_rate": 2.8608461356219415e-06, "loss": 0.7157, "step": 1643500 }, { "epoch": 2.8292096970990275, "grad_norm": 2.219536781311035, "learning_rate": 2.8465050483495416e-06, "loss": 0.7079, "step": 1644000 }, { "epoch": 2.8300701623353715, "grad_norm": 2.14593505859375, "learning_rate": 2.832163961077142e-06, "loss": 0.7116, "step": 1644500 }, { "epoch": 2.8309306275717154, "grad_norm": 2.3239259719848633, "learning_rate": 2.817822873804742e-06, "loss": 0.7058, "step": 1645000 }, { "epoch": 2.8317910928080594, "grad_norm": 2.493803024291992, "learning_rate": 2.8034817865323426e-06, "loss": 0.7092, "step": 1645500 }, { "epoch": 2.8326515580444034, "grad_norm": 2.3556675910949707, "learning_rate": 2.789140699259943e-06, "loss": 0.713, "step": 1646000 }, { "epoch": 2.8335120232807474, "grad_norm": 2.187157392501831, "learning_rate": 2.7747996119875426e-06, "loss": 0.7095, "step": 1646500 }, { "epoch": 2.8343724885170913, "grad_norm": 2.2635531425476074, "learning_rate": 2.760458524715143e-06, "loss": 0.7093, "step": 1647000 }, { "epoch": 2.8352329537534353, "grad_norm": 2.098905086517334, "learning_rate": 2.746117437442743e-06, "loss": 0.7116, "step": 1647500 }, { "epoch": 2.8360934189897793, "grad_norm": 2.43005633354187, "learning_rate": 2.7317763501703436e-06, "loss": 0.7103, "step": 1648000 }, { "epoch": 2.8369538842261233, "grad_norm": 2.3651890754699707, "learning_rate": 2.7174352628979437e-06, "loss": 0.714, "step": 1648500 }, { "epoch": 2.8378143494624672, "grad_norm": 2.2694835662841797, "learning_rate": 2.703094175625544e-06, "loss": 0.7139, "step": 1649000 }, { "epoch": 2.838674814698811, "grad_norm": 2.0746357440948486, "learning_rate": 2.688753088353144e-06, "loss": 0.7135, "step": 1649500 }, { "epoch": 2.839535279935155, "grad_norm": 2.3595311641693115, "learning_rate": 2.6744120010807447e-06, "loss": 0.7105, "step": 1650000 }, { "epoch": 2.840395745171499, "grad_norm": 2.122243642807007, "learning_rate": 2.6600709138083447e-06, "loss": 0.7106, "step": 1650500 }, { "epoch": 2.841256210407843, "grad_norm": 2.4125218391418457, "learning_rate": 2.6457298265359448e-06, "loss": 0.7063, "step": 1651000 }, { "epoch": 2.8421166756441876, "grad_norm": 2.1687426567077637, "learning_rate": 2.6313887392635452e-06, "loss": 0.7134, "step": 1651500 }, { "epoch": 2.8429771408805316, "grad_norm": 2.412323474884033, "learning_rate": 2.6170476519911453e-06, "loss": 0.7061, "step": 1652000 }, { "epoch": 2.8438376061168755, "grad_norm": 2.353044033050537, "learning_rate": 2.6027065647187458e-06, "loss": 0.7095, "step": 1652500 }, { "epoch": 2.8446980713532195, "grad_norm": 2.2941107749938965, "learning_rate": 2.588365477446346e-06, "loss": 0.7082, "step": 1653000 }, { "epoch": 2.8455585365895635, "grad_norm": 2.473214626312256, "learning_rate": 2.5740243901739463e-06, "loss": 0.7069, "step": 1653500 }, { "epoch": 2.8464190018259075, "grad_norm": 2.324897527694702, "learning_rate": 2.5596833029015463e-06, "loss": 0.7156, "step": 1654000 }, { "epoch": 2.8472794670622514, "grad_norm": 2.1640710830688477, "learning_rate": 2.5453422156291464e-06, "loss": 0.7128, "step": 1654500 }, { "epoch": 2.8481399322985954, "grad_norm": 2.296011209487915, "learning_rate": 2.5310011283567464e-06, "loss": 0.7117, "step": 1655000 }, { "epoch": 2.8490003975349394, "grad_norm": 2.151864767074585, "learning_rate": 2.516660041084347e-06, "loss": 0.7072, "step": 1655500 }, { "epoch": 2.8498608627712834, "grad_norm": 2.3428564071655273, "learning_rate": 2.5023189538119474e-06, "loss": 0.7157, "step": 1656000 }, { "epoch": 2.8507213280076273, "grad_norm": 2.3723461627960205, "learning_rate": 2.4879778665395474e-06, "loss": 0.708, "step": 1656500 }, { "epoch": 2.8515817932439713, "grad_norm": 2.235034227371216, "learning_rate": 2.473636779267148e-06, "loss": 0.7027, "step": 1657000 }, { "epoch": 2.8524422584803153, "grad_norm": 2.4615862369537354, "learning_rate": 2.459295691994748e-06, "loss": 0.7095, "step": 1657500 }, { "epoch": 2.8533027237166593, "grad_norm": 2.3678109645843506, "learning_rate": 2.4449546047223484e-06, "loss": 0.7058, "step": 1658000 }, { "epoch": 2.8541631889530032, "grad_norm": 2.1077773571014404, "learning_rate": 2.430613517449948e-06, "loss": 0.7132, "step": 1658500 }, { "epoch": 2.8550236541893472, "grad_norm": 2.354522228240967, "learning_rate": 2.4162724301775485e-06, "loss": 0.7129, "step": 1659000 }, { "epoch": 2.855884119425691, "grad_norm": 2.43855357170105, "learning_rate": 2.4019313429051485e-06, "loss": 0.7082, "step": 1659500 }, { "epoch": 2.856744584662035, "grad_norm": 2.218585252761841, "learning_rate": 2.387590255632749e-06, "loss": 0.7151, "step": 1660000 }, { "epoch": 2.857605049898379, "grad_norm": 2.1669533252716064, "learning_rate": 2.373249168360349e-06, "loss": 0.7123, "step": 1660500 }, { "epoch": 2.858465515134723, "grad_norm": 2.2842636108398438, "learning_rate": 2.3589080810879495e-06, "loss": 0.7156, "step": 1661000 }, { "epoch": 2.859325980371067, "grad_norm": 2.3778250217437744, "learning_rate": 2.3445669938155496e-06, "loss": 0.708, "step": 1661500 }, { "epoch": 2.860186445607411, "grad_norm": 2.1641619205474854, "learning_rate": 2.3302259065431496e-06, "loss": 0.7066, "step": 1662000 }, { "epoch": 2.861046910843755, "grad_norm": 2.343029737472534, "learning_rate": 2.31588481927075e-06, "loss": 0.7049, "step": 1662500 }, { "epoch": 2.861907376080099, "grad_norm": 2.098466157913208, "learning_rate": 2.30154373199835e-06, "loss": 0.7094, "step": 1663000 }, { "epoch": 2.862767841316443, "grad_norm": 2.4116451740264893, "learning_rate": 2.2872026447259506e-06, "loss": 0.7092, "step": 1663500 }, { "epoch": 2.863628306552787, "grad_norm": 2.115022897720337, "learning_rate": 2.2728615574535506e-06, "loss": 0.712, "step": 1664000 }, { "epoch": 2.864488771789131, "grad_norm": 2.248368978500366, "learning_rate": 2.258520470181151e-06, "loss": 0.7119, "step": 1664500 }, { "epoch": 2.865349237025475, "grad_norm": 2.036473274230957, "learning_rate": 2.244179382908751e-06, "loss": 0.7103, "step": 1665000 }, { "epoch": 2.866209702261819, "grad_norm": 2.222977638244629, "learning_rate": 2.2298382956363516e-06, "loss": 0.7106, "step": 1665500 }, { "epoch": 2.867070167498163, "grad_norm": 2.3005199432373047, "learning_rate": 2.2154972083639517e-06, "loss": 0.712, "step": 1666000 }, { "epoch": 2.867930632734507, "grad_norm": 2.1439499855041504, "learning_rate": 2.2011561210915517e-06, "loss": 0.7085, "step": 1666500 }, { "epoch": 2.868791097970851, "grad_norm": 2.1440727710723877, "learning_rate": 2.186815033819152e-06, "loss": 0.7091, "step": 1667000 }, { "epoch": 2.869651563207195, "grad_norm": 2.1596696376800537, "learning_rate": 2.1724739465467522e-06, "loss": 0.7039, "step": 1667500 }, { "epoch": 2.870512028443539, "grad_norm": 2.039153575897217, "learning_rate": 2.1581328592743527e-06, "loss": 0.7029, "step": 1668000 }, { "epoch": 2.871372493679883, "grad_norm": 2.2410709857940674, "learning_rate": 2.1437917720019528e-06, "loss": 0.7078, "step": 1668500 }, { "epoch": 2.8722329589162268, "grad_norm": 2.2461483478546143, "learning_rate": 2.1294506847295532e-06, "loss": 0.7161, "step": 1669000 }, { "epoch": 2.8730934241525707, "grad_norm": 2.4385788440704346, "learning_rate": 2.1151095974571533e-06, "loss": 0.7163, "step": 1669500 }, { "epoch": 2.8739538893889147, "grad_norm": 2.215613842010498, "learning_rate": 2.1007685101847533e-06, "loss": 0.7109, "step": 1670000 }, { "epoch": 2.8748143546252587, "grad_norm": 2.438634157180786, "learning_rate": 2.0864274229123534e-06, "loss": 0.7137, "step": 1670500 }, { "epoch": 2.8756748198616027, "grad_norm": 2.2035157680511475, "learning_rate": 2.072086335639954e-06, "loss": 0.712, "step": 1671000 }, { "epoch": 2.8765352850979466, "grad_norm": 2.183810234069824, "learning_rate": 2.0577452483675543e-06, "loss": 0.7111, "step": 1671500 }, { "epoch": 2.8773957503342906, "grad_norm": 2.2220458984375, "learning_rate": 2.0434041610951544e-06, "loss": 0.7046, "step": 1672000 }, { "epoch": 2.8782562155706346, "grad_norm": 2.3107755184173584, "learning_rate": 2.029063073822755e-06, "loss": 0.7065, "step": 1672500 }, { "epoch": 2.8791166808069786, "grad_norm": 2.3349597454071045, "learning_rate": 2.014721986550355e-06, "loss": 0.7124, "step": 1673000 }, { "epoch": 2.8799771460433226, "grad_norm": 2.2262802124023438, "learning_rate": 2.0003808992779553e-06, "loss": 0.7125, "step": 1673500 }, { "epoch": 2.8808376112796665, "grad_norm": 2.3967578411102295, "learning_rate": 1.986039812005555e-06, "loss": 0.7069, "step": 1674000 }, { "epoch": 2.8816980765160105, "grad_norm": 2.216095209121704, "learning_rate": 1.9716987247331554e-06, "loss": 0.7128, "step": 1674500 }, { "epoch": 2.8825585417523545, "grad_norm": 2.380256175994873, "learning_rate": 1.9573576374607555e-06, "loss": 0.7121, "step": 1675000 }, { "epoch": 2.8834190069886985, "grad_norm": 2.2062182426452637, "learning_rate": 1.943016550188356e-06, "loss": 0.7066, "step": 1675500 }, { "epoch": 2.8842794722250424, "grad_norm": 2.055337905883789, "learning_rate": 1.928675462915956e-06, "loss": 0.7104, "step": 1676000 }, { "epoch": 2.8851399374613864, "grad_norm": 2.2458910942077637, "learning_rate": 1.9143343756435565e-06, "loss": 0.7049, "step": 1676500 }, { "epoch": 2.8860004026977304, "grad_norm": 2.4219136238098145, "learning_rate": 1.8999932883711567e-06, "loss": 0.7036, "step": 1677000 }, { "epoch": 2.8868608679340744, "grad_norm": 2.331808567047119, "learning_rate": 1.885652201098757e-06, "loss": 0.7065, "step": 1677500 }, { "epoch": 2.8877213331704183, "grad_norm": 2.258765459060669, "learning_rate": 1.8713111138263568e-06, "loss": 0.7025, "step": 1678000 }, { "epoch": 2.8885817984067623, "grad_norm": 2.276099681854248, "learning_rate": 1.856970026553957e-06, "loss": 0.7109, "step": 1678500 }, { "epoch": 2.8894422636431063, "grad_norm": 2.2920784950256348, "learning_rate": 1.8426289392815576e-06, "loss": 0.7092, "step": 1679000 }, { "epoch": 2.8903027288794503, "grad_norm": 2.336284637451172, "learning_rate": 1.8282878520091578e-06, "loss": 0.7134, "step": 1679500 }, { "epoch": 2.8911631941157943, "grad_norm": 2.173426389694214, "learning_rate": 1.813946764736758e-06, "loss": 0.6993, "step": 1680000 }, { "epoch": 2.8920236593521387, "grad_norm": 2.4683492183685303, "learning_rate": 1.7996056774643583e-06, "loss": 0.7116, "step": 1680500 }, { "epoch": 2.8928841245884827, "grad_norm": 2.2608296871185303, "learning_rate": 1.7852645901919586e-06, "loss": 0.7081, "step": 1681000 }, { "epoch": 2.8937445898248266, "grad_norm": 2.368868589401245, "learning_rate": 1.7709235029195588e-06, "loss": 0.7101, "step": 1681500 }, { "epoch": 2.8946050550611706, "grad_norm": 2.291165590286255, "learning_rate": 1.7565824156471587e-06, "loss": 0.705, "step": 1682000 }, { "epoch": 2.8954655202975146, "grad_norm": 2.262145519256592, "learning_rate": 1.742241328374759e-06, "loss": 0.7079, "step": 1682500 }, { "epoch": 2.8963259855338586, "grad_norm": 2.2170474529266357, "learning_rate": 1.7279002411023592e-06, "loss": 0.7126, "step": 1683000 }, { "epoch": 2.8971864507702025, "grad_norm": 2.0943198204040527, "learning_rate": 1.7135591538299595e-06, "loss": 0.7084, "step": 1683500 }, { "epoch": 2.8980469160065465, "grad_norm": 2.274672746658325, "learning_rate": 1.6992180665575597e-06, "loss": 0.7046, "step": 1684000 }, { "epoch": 2.8989073812428905, "grad_norm": 2.342705011367798, "learning_rate": 1.68487697928516e-06, "loss": 0.7074, "step": 1684500 }, { "epoch": 2.8997678464792345, "grad_norm": 2.3994994163513184, "learning_rate": 1.6705358920127602e-06, "loss": 0.7096, "step": 1685000 }, { "epoch": 2.9006283117155784, "grad_norm": 2.6676313877105713, "learning_rate": 1.6561948047403603e-06, "loss": 0.7096, "step": 1685500 }, { "epoch": 2.9014887769519224, "grad_norm": 2.506844997406006, "learning_rate": 1.6418537174679605e-06, "loss": 0.709, "step": 1686000 }, { "epoch": 2.9023492421882664, "grad_norm": 2.238424301147461, "learning_rate": 1.6275126301955608e-06, "loss": 0.7046, "step": 1686500 }, { "epoch": 2.9032097074246104, "grad_norm": 2.2722549438476562, "learning_rate": 1.613171542923161e-06, "loss": 0.7024, "step": 1687000 }, { "epoch": 2.9040701726609544, "grad_norm": 2.783315420150757, "learning_rate": 1.5988304556507613e-06, "loss": 0.7166, "step": 1687500 }, { "epoch": 2.9049306378972983, "grad_norm": 2.3448448181152344, "learning_rate": 1.5844893683783616e-06, "loss": 0.7065, "step": 1688000 }, { "epoch": 2.9057911031336423, "grad_norm": 2.162137031555176, "learning_rate": 1.5701482811059618e-06, "loss": 0.7146, "step": 1688500 }, { "epoch": 2.9066515683699863, "grad_norm": 2.2553751468658447, "learning_rate": 1.5558071938335619e-06, "loss": 0.7053, "step": 1689000 }, { "epoch": 2.9075120336063303, "grad_norm": 2.0961170196533203, "learning_rate": 1.5414661065611621e-06, "loss": 0.7124, "step": 1689500 }, { "epoch": 2.9083724988426742, "grad_norm": 2.2436742782592773, "learning_rate": 1.5271250192887626e-06, "loss": 0.7053, "step": 1690000 }, { "epoch": 2.909232964079018, "grad_norm": 2.397540330886841, "learning_rate": 1.5127839320163627e-06, "loss": 0.7113, "step": 1690500 }, { "epoch": 2.910093429315362, "grad_norm": 2.4580838680267334, "learning_rate": 1.498442844743963e-06, "loss": 0.7089, "step": 1691000 }, { "epoch": 2.910953894551706, "grad_norm": 2.1462783813476562, "learning_rate": 1.4841017574715632e-06, "loss": 0.7102, "step": 1691500 }, { "epoch": 2.91181435978805, "grad_norm": 2.124737024307251, "learning_rate": 1.4697606701991634e-06, "loss": 0.7063, "step": 1692000 }, { "epoch": 2.912674825024394, "grad_norm": 2.294898271560669, "learning_rate": 1.4554195829267635e-06, "loss": 0.7074, "step": 1692500 }, { "epoch": 2.913535290260738, "grad_norm": 2.4345502853393555, "learning_rate": 1.4410784956543637e-06, "loss": 0.7116, "step": 1693000 }, { "epoch": 2.914395755497082, "grad_norm": 2.2579216957092285, "learning_rate": 1.426737408381964e-06, "loss": 0.7096, "step": 1693500 }, { "epoch": 2.915256220733426, "grad_norm": 2.086071252822876, "learning_rate": 1.4123963211095643e-06, "loss": 0.7071, "step": 1694000 }, { "epoch": 2.91611668596977, "grad_norm": 2.2941625118255615, "learning_rate": 1.3980552338371645e-06, "loss": 0.7117, "step": 1694500 }, { "epoch": 2.916977151206114, "grad_norm": 2.3904547691345215, "learning_rate": 1.3837141465647648e-06, "loss": 0.7125, "step": 1695000 }, { "epoch": 2.917837616442458, "grad_norm": 2.387474775314331, "learning_rate": 1.369373059292365e-06, "loss": 0.7047, "step": 1695500 }, { "epoch": 2.918698081678802, "grad_norm": 2.228543281555176, "learning_rate": 1.3550319720199653e-06, "loss": 0.7043, "step": 1696000 }, { "epoch": 2.919558546915146, "grad_norm": 2.063142776489258, "learning_rate": 1.3406908847475653e-06, "loss": 0.7069, "step": 1696500 }, { "epoch": 2.92041901215149, "grad_norm": 2.1467058658599854, "learning_rate": 1.3263497974751656e-06, "loss": 0.7109, "step": 1697000 }, { "epoch": 2.921279477387834, "grad_norm": 2.3527698516845703, "learning_rate": 1.3120087102027659e-06, "loss": 0.7098, "step": 1697500 }, { "epoch": 2.9221399426241783, "grad_norm": 2.164032459259033, "learning_rate": 1.2976676229303661e-06, "loss": 0.7096, "step": 1698000 }, { "epoch": 2.9230004078605223, "grad_norm": 2.1595492362976074, "learning_rate": 1.2833265356579662e-06, "loss": 0.7139, "step": 1698500 }, { "epoch": 2.9238608730968663, "grad_norm": 2.340627670288086, "learning_rate": 1.2689854483855664e-06, "loss": 0.7066, "step": 1699000 }, { "epoch": 2.9247213383332102, "grad_norm": 2.104126214981079, "learning_rate": 1.2546443611131667e-06, "loss": 0.7053, "step": 1699500 }, { "epoch": 2.925581803569554, "grad_norm": 2.251032590866089, "learning_rate": 1.240303273840767e-06, "loss": 0.7069, "step": 1700000 }, { "epoch": 2.926442268805898, "grad_norm": 2.411782741546631, "learning_rate": 1.2259621865683672e-06, "loss": 0.7109, "step": 1700500 }, { "epoch": 2.927302734042242, "grad_norm": 2.136021137237549, "learning_rate": 1.2116210992959675e-06, "loss": 0.7092, "step": 1701000 }, { "epoch": 2.928163199278586, "grad_norm": 2.3158106803894043, "learning_rate": 1.1972800120235677e-06, "loss": 0.7044, "step": 1701500 }, { "epoch": 2.92902366451493, "grad_norm": 2.3899245262145996, "learning_rate": 1.182938924751168e-06, "loss": 0.7096, "step": 1702000 }, { "epoch": 2.929884129751274, "grad_norm": 2.265352249145508, "learning_rate": 1.168597837478768e-06, "loss": 0.7057, "step": 1702500 }, { "epoch": 2.930744594987618, "grad_norm": 2.140774726867676, "learning_rate": 1.1542567502063683e-06, "loss": 0.7097, "step": 1703000 }, { "epoch": 2.931605060223962, "grad_norm": 2.231912136077881, "learning_rate": 1.1399156629339685e-06, "loss": 0.7009, "step": 1703500 }, { "epoch": 2.932465525460306, "grad_norm": 2.278916835784912, "learning_rate": 1.1255745756615688e-06, "loss": 0.7149, "step": 1704000 }, { "epoch": 2.93332599069665, "grad_norm": 2.3676211833953857, "learning_rate": 1.1112334883891688e-06, "loss": 0.7097, "step": 1704500 }, { "epoch": 2.934186455932994, "grad_norm": 2.293724536895752, "learning_rate": 1.096892401116769e-06, "loss": 0.7083, "step": 1705000 }, { "epoch": 2.935046921169338, "grad_norm": 2.1331515312194824, "learning_rate": 1.0825513138443696e-06, "loss": 0.6971, "step": 1705500 }, { "epoch": 2.935907386405682, "grad_norm": 2.2781825065612793, "learning_rate": 1.0682102265719698e-06, "loss": 0.7116, "step": 1706000 }, { "epoch": 2.936767851642026, "grad_norm": 2.1554043292999268, "learning_rate": 1.0538691392995699e-06, "loss": 0.7103, "step": 1706500 }, { "epoch": 2.93762831687837, "grad_norm": 2.3552112579345703, "learning_rate": 1.0395280520271701e-06, "loss": 0.7088, "step": 1707000 }, { "epoch": 2.938488782114714, "grad_norm": 2.341012954711914, "learning_rate": 1.0251869647547704e-06, "loss": 0.7169, "step": 1707500 }, { "epoch": 2.939349247351058, "grad_norm": 2.056424856185913, "learning_rate": 1.0108458774823704e-06, "loss": 0.7039, "step": 1708000 }, { "epoch": 2.940209712587402, "grad_norm": 2.2107203006744385, "learning_rate": 9.965047902099707e-07, "loss": 0.7044, "step": 1708500 }, { "epoch": 2.941070177823746, "grad_norm": 2.317542791366577, "learning_rate": 9.82163702937571e-07, "loss": 0.7053, "step": 1709000 }, { "epoch": 2.94193064306009, "grad_norm": 1.9674264192581177, "learning_rate": 9.678226156651712e-07, "loss": 0.7058, "step": 1709500 }, { "epoch": 2.9427911082964338, "grad_norm": 2.1635196208953857, "learning_rate": 9.534815283927714e-07, "loss": 0.7025, "step": 1710000 }, { "epoch": 2.9436515735327777, "grad_norm": 2.205508232116699, "learning_rate": 9.391404411203716e-07, "loss": 0.7068, "step": 1710500 }, { "epoch": 2.9445120387691217, "grad_norm": 2.3096201419830322, "learning_rate": 9.247993538479719e-07, "loss": 0.7066, "step": 1711000 }, { "epoch": 2.9453725040054657, "grad_norm": 2.1905324459075928, "learning_rate": 9.104582665755721e-07, "loss": 0.7035, "step": 1711500 }, { "epoch": 2.9462329692418097, "grad_norm": 2.4388911724090576, "learning_rate": 8.961171793031723e-07, "loss": 0.71, "step": 1712000 }, { "epoch": 2.9470934344781536, "grad_norm": 2.2021710872650146, "learning_rate": 8.817760920307726e-07, "loss": 0.7055, "step": 1712500 }, { "epoch": 2.9479538997144976, "grad_norm": 2.2784557342529297, "learning_rate": 8.674350047583728e-07, "loss": 0.7062, "step": 1713000 }, { "epoch": 2.9488143649508416, "grad_norm": 2.1617753505706787, "learning_rate": 8.530939174859731e-07, "loss": 0.7057, "step": 1713500 }, { "epoch": 2.9496748301871856, "grad_norm": 2.285795211791992, "learning_rate": 8.387528302135732e-07, "loss": 0.7106, "step": 1714000 }, { "epoch": 2.9505352954235295, "grad_norm": 2.445192337036133, "learning_rate": 8.244117429411735e-07, "loss": 0.7048, "step": 1714500 }, { "epoch": 2.9513957606598735, "grad_norm": 2.245013475418091, "learning_rate": 8.100706556687737e-07, "loss": 0.7064, "step": 1715000 }, { "epoch": 2.9522562258962175, "grad_norm": 2.259819269180298, "learning_rate": 7.95729568396374e-07, "loss": 0.7101, "step": 1715500 }, { "epoch": 2.9531166911325615, "grad_norm": 2.1456029415130615, "learning_rate": 7.81388481123974e-07, "loss": 0.7025, "step": 1716000 }, { "epoch": 2.9539771563689055, "grad_norm": 2.2223355770111084, "learning_rate": 7.670473938515743e-07, "loss": 0.7048, "step": 1716500 }, { "epoch": 2.9548376216052494, "grad_norm": 2.179152011871338, "learning_rate": 7.527063065791746e-07, "loss": 0.7075, "step": 1717000 }, { "epoch": 2.9556980868415934, "grad_norm": 2.1588294506073, "learning_rate": 7.383652193067748e-07, "loss": 0.7109, "step": 1717500 }, { "epoch": 2.9565585520779374, "grad_norm": 2.1333980560302734, "learning_rate": 7.240241320343751e-07, "loss": 0.7111, "step": 1718000 }, { "epoch": 2.9574190173142814, "grad_norm": 2.512148380279541, "learning_rate": 7.096830447619752e-07, "loss": 0.7039, "step": 1718500 }, { "epoch": 2.9582794825506253, "grad_norm": 2.4139938354492188, "learning_rate": 6.953419574895755e-07, "loss": 0.7115, "step": 1719000 }, { "epoch": 2.9591399477869693, "grad_norm": 2.1482958793640137, "learning_rate": 6.810008702171758e-07, "loss": 0.7001, "step": 1719500 }, { "epoch": 2.9600004130233133, "grad_norm": 2.4225013256073, "learning_rate": 6.66659782944776e-07, "loss": 0.7079, "step": 1720000 }, { "epoch": 2.9608608782596573, "grad_norm": 2.3143699169158936, "learning_rate": 6.523186956723762e-07, "loss": 0.7059, "step": 1720500 }, { "epoch": 2.9617213434960012, "grad_norm": 2.1610562801361084, "learning_rate": 6.379776083999764e-07, "loss": 0.7112, "step": 1721000 }, { "epoch": 2.9625818087323452, "grad_norm": 2.4261155128479004, "learning_rate": 6.236365211275766e-07, "loss": 0.7089, "step": 1721500 }, { "epoch": 2.963442273968689, "grad_norm": 2.2455458641052246, "learning_rate": 6.092954338551768e-07, "loss": 0.7095, "step": 1722000 }, { "epoch": 2.964302739205033, "grad_norm": 2.034914493560791, "learning_rate": 5.949543465827771e-07, "loss": 0.7051, "step": 1722500 }, { "epoch": 2.965163204441377, "grad_norm": 2.3136467933654785, "learning_rate": 5.806132593103774e-07, "loss": 0.7055, "step": 1723000 }, { "epoch": 2.966023669677721, "grad_norm": 2.1953635215759277, "learning_rate": 5.662721720379775e-07, "loss": 0.7039, "step": 1723500 }, { "epoch": 2.966884134914065, "grad_norm": 2.405912160873413, "learning_rate": 5.519310847655778e-07, "loss": 0.7106, "step": 1724000 }, { "epoch": 2.967744600150409, "grad_norm": 2.216336250305176, "learning_rate": 5.375899974931779e-07, "loss": 0.7082, "step": 1724500 }, { "epoch": 2.968605065386753, "grad_norm": 2.5332186222076416, "learning_rate": 5.232489102207782e-07, "loss": 0.7062, "step": 1725000 }, { "epoch": 2.969465530623097, "grad_norm": 2.1927483081817627, "learning_rate": 5.089078229483784e-07, "loss": 0.7146, "step": 1725500 }, { "epoch": 2.970325995859441, "grad_norm": 2.2343924045562744, "learning_rate": 4.945667356759787e-07, "loss": 0.7102, "step": 1726000 }, { "epoch": 2.971186461095785, "grad_norm": 2.3195652961730957, "learning_rate": 4.802256484035788e-07, "loss": 0.7045, "step": 1726500 }, { "epoch": 2.9720469263321294, "grad_norm": 2.3527255058288574, "learning_rate": 4.6588456113117916e-07, "loss": 0.7085, "step": 1727000 }, { "epoch": 2.9729073915684734, "grad_norm": 2.1666224002838135, "learning_rate": 4.515434738587793e-07, "loss": 0.7037, "step": 1727500 }, { "epoch": 2.9737678568048174, "grad_norm": 2.1871511936187744, "learning_rate": 4.3720238658637957e-07, "loss": 0.7054, "step": 1728000 }, { "epoch": 2.9746283220411613, "grad_norm": 2.1100893020629883, "learning_rate": 4.228612993139798e-07, "loss": 0.7099, "step": 1728500 }, { "epoch": 2.9754887872775053, "grad_norm": 2.129465341567993, "learning_rate": 4.0852021204158004e-07, "loss": 0.7036, "step": 1729000 }, { "epoch": 2.9763492525138493, "grad_norm": 2.361323356628418, "learning_rate": 3.9417912476918024e-07, "loss": 0.7105, "step": 1729500 }, { "epoch": 2.9772097177501933, "grad_norm": 2.1633265018463135, "learning_rate": 3.7983803749678045e-07, "loss": 0.7035, "step": 1730000 }, { "epoch": 2.9780701829865373, "grad_norm": 2.143930673599243, "learning_rate": 3.6549695022438065e-07, "loss": 0.7043, "step": 1730500 }, { "epoch": 2.9789306482228812, "grad_norm": 2.5136213302612305, "learning_rate": 3.511558629519809e-07, "loss": 0.7052, "step": 1731000 }, { "epoch": 2.979791113459225, "grad_norm": 2.3515727519989014, "learning_rate": 3.368147756795811e-07, "loss": 0.6972, "step": 1731500 }, { "epoch": 2.980651578695569, "grad_norm": 2.180917501449585, "learning_rate": 3.224736884071813e-07, "loss": 0.7069, "step": 1732000 }, { "epoch": 2.981512043931913, "grad_norm": 2.28267240524292, "learning_rate": 3.081326011347816e-07, "loss": 0.7077, "step": 1732500 }, { "epoch": 2.982372509168257, "grad_norm": 2.4988763332366943, "learning_rate": 2.937915138623818e-07, "loss": 0.7109, "step": 1733000 }, { "epoch": 2.983232974404601, "grad_norm": 1.968199610710144, "learning_rate": 2.7945042658998205e-07, "loss": 0.7079, "step": 1733500 }, { "epoch": 2.984093439640945, "grad_norm": 2.156123161315918, "learning_rate": 2.6510933931758225e-07, "loss": 0.7057, "step": 1734000 }, { "epoch": 2.984953904877289, "grad_norm": 2.4039177894592285, "learning_rate": 2.5076825204518246e-07, "loss": 0.7094, "step": 1734500 }, { "epoch": 2.985814370113633, "grad_norm": 2.0483903884887695, "learning_rate": 2.3642716477278272e-07, "loss": 0.7071, "step": 1735000 }, { "epoch": 2.986674835349977, "grad_norm": 2.2727344036102295, "learning_rate": 2.2208607750038292e-07, "loss": 0.706, "step": 1735500 }, { "epoch": 2.987535300586321, "grad_norm": 2.286410093307495, "learning_rate": 2.0774499022798315e-07, "loss": 0.7068, "step": 1736000 }, { "epoch": 2.988395765822665, "grad_norm": 2.280778169631958, "learning_rate": 1.9340390295558336e-07, "loss": 0.7066, "step": 1736500 }, { "epoch": 2.989256231059009, "grad_norm": 2.085190534591675, "learning_rate": 1.790628156831836e-07, "loss": 0.7085, "step": 1737000 }, { "epoch": 2.990116696295353, "grad_norm": 2.1531052589416504, "learning_rate": 1.647217284107838e-07, "loss": 0.7059, "step": 1737500 }, { "epoch": 2.990977161531697, "grad_norm": 2.329993486404419, "learning_rate": 1.5038064113838403e-07, "loss": 0.703, "step": 1738000 }, { "epoch": 2.991837626768041, "grad_norm": 2.5018150806427, "learning_rate": 1.3603955386598426e-07, "loss": 0.7058, "step": 1738500 }, { "epoch": 2.992698092004385, "grad_norm": 2.285696029663086, "learning_rate": 1.216984665935845e-07, "loss": 0.7069, "step": 1739000 }, { "epoch": 2.993558557240729, "grad_norm": 2.3342292308807373, "learning_rate": 1.0735737932118471e-07, "loss": 0.7029, "step": 1739500 }, { "epoch": 2.994419022477073, "grad_norm": 2.3220858573913574, "learning_rate": 9.301629204878495e-08, "loss": 0.702, "step": 1740000 }, { "epoch": 2.995279487713417, "grad_norm": 2.693772792816162, "learning_rate": 7.867520477638517e-08, "loss": 0.7053, "step": 1740500 }, { "epoch": 2.9961399529497608, "grad_norm": 2.2459399700164795, "learning_rate": 6.43341175039854e-08, "loss": 0.7045, "step": 1741000 }, { "epoch": 2.9970004181861047, "grad_norm": 2.4572110176086426, "learning_rate": 4.9993030231585616e-08, "loss": 0.707, "step": 1741500 }, { "epoch": 2.9978608834224487, "grad_norm": 2.173382520675659, "learning_rate": 3.565194295918584e-08, "loss": 0.7097, "step": 1742000 }, { "epoch": 2.9987213486587927, "grad_norm": 2.199479579925537, "learning_rate": 2.1310855686786068e-08, "loss": 0.7047, "step": 1742500 }, { "epoch": 2.9995818138951367, "grad_norm": 2.2377700805664062, "learning_rate": 6.969768414386291e-09, "loss": 0.7061, "step": 1743000 }, { "epoch": 3.0, "step": 1743243, "total_flos": 1.4697804058526134e+19, "train_loss": 0.807495192525475, "train_runtime": 715284.2162, "train_samples_per_second": 77.988, "train_steps_per_second": 2.437 } ], "logging_steps": 500, "max_steps": 1743243, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 1.4697804058526134e+19, "train_batch_size": 32, "trial_name": null, "trial_params": null }