{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 500, "global_step": 2500, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0, "grad_norm": 15.929184960782006, "learning_rate": 2.666666666666667e-07, "loss": 12.7783, "step": 1 }, { "epoch": 0.0, "grad_norm": 15.874932389885064, "learning_rate": 5.333333333333335e-07, "loss": 12.7829, "step": 2 }, { "epoch": 0.0, "grad_norm": 16.638474194681965, "learning_rate": 8.000000000000001e-07, "loss": 12.7492, "step": 3 }, { "epoch": 0.0, "grad_norm": 15.691613562458222, "learning_rate": 1.066666666666667e-06, "loss": 12.7644, "step": 4 }, { "epoch": 0.0, "grad_norm": 16.183637154093034, "learning_rate": 1.3333333333333334e-06, "loss": 12.7361, "step": 5 }, { "epoch": 0.0, "grad_norm": 15.394137566345053, "learning_rate": 1.6000000000000001e-06, "loss": 12.7412, "step": 6 }, { "epoch": 0.0, "grad_norm": 16.41810330919611, "learning_rate": 1.8666666666666669e-06, "loss": 12.7497, "step": 7 }, { "epoch": 0.0, "grad_norm": 15.703173654632728, "learning_rate": 2.133333333333334e-06, "loss": 12.6923, "step": 8 }, { "epoch": 0.0, "grad_norm": 15.28258963587572, "learning_rate": 2.4000000000000003e-06, "loss": 12.633, "step": 9 }, { "epoch": 0.0, "grad_norm": 14.979699480878217, "learning_rate": 2.666666666666667e-06, "loss": 12.4999, "step": 10 }, { "epoch": 0.0, "grad_norm": 13.901151346983722, "learning_rate": 2.9333333333333338e-06, "loss": 12.4535, "step": 11 }, { "epoch": 0.0, "grad_norm": 14.29923785648717, "learning_rate": 3.2000000000000003e-06, "loss": 12.3986, "step": 12 }, { "epoch": 0.01, "grad_norm": 11.471178506450912, "learning_rate": 3.4666666666666672e-06, "loss": 11.9871, "step": 13 }, { "epoch": 0.01, "grad_norm": 11.838514649534746, "learning_rate": 3.7333333333333337e-06, "loss": 11.9267, "step": 14 }, { "epoch": 0.01, "grad_norm": 10.978160035469458, "learning_rate": 4.000000000000001e-06, "loss": 11.8394, "step": 15 }, { "epoch": 0.01, "grad_norm": 11.141794873591074, "learning_rate": 4.266666666666668e-06, "loss": 11.7715, "step": 16 }, { "epoch": 0.01, "grad_norm": 11.298617438327794, "learning_rate": 4.533333333333334e-06, "loss": 11.3915, "step": 17 }, { "epoch": 0.01, "grad_norm": 10.25498056051837, "learning_rate": 4.800000000000001e-06, "loss": 11.17, "step": 18 }, { "epoch": 0.01, "grad_norm": 10.283997955198105, "learning_rate": 5.0666666666666676e-06, "loss": 11.0347, "step": 19 }, { "epoch": 0.01, "grad_norm": 9.780727278149014, "learning_rate": 5.333333333333334e-06, "loss": 10.8716, "step": 20 }, { "epoch": 0.01, "grad_norm": 7.854956693311563, "learning_rate": 5.600000000000001e-06, "loss": 10.8734, "step": 21 }, { "epoch": 0.01, "grad_norm": 7.375861574845129, "learning_rate": 5.8666666666666675e-06, "loss": 10.7295, "step": 22 }, { "epoch": 0.01, "grad_norm": 7.478804987639091, "learning_rate": 6.133333333333334e-06, "loss": 10.5077, "step": 23 }, { "epoch": 0.01, "grad_norm": 7.061475152313659, "learning_rate": 6.4000000000000006e-06, "loss": 10.4395, "step": 24 }, { "epoch": 0.01, "grad_norm": 7.26564612763676, "learning_rate": 6.666666666666667e-06, "loss": 10.2625, "step": 25 }, { "epoch": 0.01, "grad_norm": 6.538214638535141, "learning_rate": 6.9333333333333344e-06, "loss": 10.1915, "step": 26 }, { "epoch": 0.01, "grad_norm": 6.337218899132541, "learning_rate": 7.2000000000000005e-06, "loss": 10.1794, "step": 27 }, { "epoch": 0.01, "grad_norm": 5.415165410790723, "learning_rate": 7.4666666666666675e-06, "loss": 10.2069, "step": 28 }, { "epoch": 0.01, "grad_norm": 6.187915970859848, "learning_rate": 7.733333333333334e-06, "loss": 9.895, "step": 29 }, { "epoch": 0.01, "grad_norm": 4.631030212687751, "learning_rate": 8.000000000000001e-06, "loss": 9.9977, "step": 30 }, { "epoch": 0.01, "grad_norm": 4.631218434317244, "learning_rate": 8.266666666666667e-06, "loss": 9.8149, "step": 31 }, { "epoch": 0.01, "grad_norm": 4.8158165880323365, "learning_rate": 8.533333333333335e-06, "loss": 9.7459, "step": 32 }, { "epoch": 0.01, "grad_norm": 4.841399667684455, "learning_rate": 8.8e-06, "loss": 9.7156, "step": 33 }, { "epoch": 0.01, "grad_norm": 4.610263893321706, "learning_rate": 9.066666666666667e-06, "loss": 9.6304, "step": 34 }, { "epoch": 0.01, "grad_norm": 4.235066753616497, "learning_rate": 9.333333333333334e-06, "loss": 9.41, "step": 35 }, { "epoch": 0.01, "grad_norm": 5.452655010828091, "learning_rate": 9.600000000000001e-06, "loss": 9.498, "step": 36 }, { "epoch": 0.01, "grad_norm": 4.938743934202386, "learning_rate": 9.866666666666668e-06, "loss": 9.3042, "step": 37 }, { "epoch": 0.02, "grad_norm": 3.9544713684597945, "learning_rate": 1.0133333333333335e-05, "loss": 9.4645, "step": 38 }, { "epoch": 0.02, "grad_norm": 3.6572227241816835, "learning_rate": 1.04e-05, "loss": 9.2902, "step": 39 }, { "epoch": 0.02, "grad_norm": 6.466505871627913, "learning_rate": 1.0666666666666667e-05, "loss": 9.3163, "step": 40 }, { "epoch": 0.02, "grad_norm": 3.778564965485367, "learning_rate": 1.0933333333333334e-05, "loss": 9.1139, "step": 41 }, { "epoch": 0.02, "grad_norm": 5.410075586798703, "learning_rate": 1.1200000000000001e-05, "loss": 9.0775, "step": 42 }, { "epoch": 0.02, "grad_norm": 4.122133392766589, "learning_rate": 1.1466666666666668e-05, "loss": 9.146, "step": 43 }, { "epoch": 0.02, "grad_norm": 3.6810042264298595, "learning_rate": 1.1733333333333335e-05, "loss": 9.0396, "step": 44 }, { "epoch": 0.02, "grad_norm": 4.227042286632877, "learning_rate": 1.2e-05, "loss": 9.0946, "step": 45 }, { "epoch": 0.02, "grad_norm": 5.629133823957898, "learning_rate": 1.2266666666666667e-05, "loss": 8.9797, "step": 46 }, { "epoch": 0.02, "grad_norm": 3.7271542497913503, "learning_rate": 1.2533333333333336e-05, "loss": 8.9485, "step": 47 }, { "epoch": 0.02, "grad_norm": 5.242646810059511, "learning_rate": 1.2800000000000001e-05, "loss": 8.8412, "step": 48 }, { "epoch": 0.02, "grad_norm": 7.076863398153946, "learning_rate": 1.3066666666666668e-05, "loss": 8.9591, "step": 49 }, { "epoch": 0.02, "grad_norm": 4.744253371158508, "learning_rate": 1.3333333333333333e-05, "loss": 8.849, "step": 50 }, { "epoch": 0.02, "grad_norm": 4.4165606869895475, "learning_rate": 1.3600000000000002e-05, "loss": 8.8491, "step": 51 }, { "epoch": 0.02, "grad_norm": 6.857593202175524, "learning_rate": 1.3866666666666669e-05, "loss": 8.8226, "step": 52 }, { "epoch": 0.02, "grad_norm": 4.475861448921801, "learning_rate": 1.4133333333333334e-05, "loss": 8.808, "step": 53 }, { "epoch": 0.02, "grad_norm": 4.8910608136482905, "learning_rate": 1.4400000000000001e-05, "loss": 8.6174, "step": 54 }, { "epoch": 0.02, "grad_norm": 3.759581203767945, "learning_rate": 1.4666666666666666e-05, "loss": 8.7544, "step": 55 }, { "epoch": 0.02, "grad_norm": 5.430056602810506, "learning_rate": 1.4933333333333335e-05, "loss": 8.7811, "step": 56 }, { "epoch": 0.02, "grad_norm": 3.6362047197084126, "learning_rate": 1.5200000000000002e-05, "loss": 8.6588, "step": 57 }, { "epoch": 0.02, "grad_norm": 6.307397557457014, "learning_rate": 1.546666666666667e-05, "loss": 8.6764, "step": 58 }, { "epoch": 0.02, "grad_norm": 5.163868735280828, "learning_rate": 1.5733333333333334e-05, "loss": 8.5804, "step": 59 }, { "epoch": 0.02, "grad_norm": 5.926174882030974, "learning_rate": 1.6000000000000003e-05, "loss": 8.4866, "step": 60 }, { "epoch": 0.02, "grad_norm": 4.622342586255422, "learning_rate": 1.6266666666666668e-05, "loss": 8.4828, "step": 61 }, { "epoch": 0.02, "grad_norm": 4.589183178299784, "learning_rate": 1.6533333333333333e-05, "loss": 8.5785, "step": 62 }, { "epoch": 0.03, "grad_norm": 4.736115426468919, "learning_rate": 1.6800000000000002e-05, "loss": 8.3527, "step": 63 }, { "epoch": 0.03, "grad_norm": 3.6673697167117676, "learning_rate": 1.706666666666667e-05, "loss": 8.4629, "step": 64 }, { "epoch": 0.03, "grad_norm": 4.672070519476944, "learning_rate": 1.7333333333333336e-05, "loss": 8.2062, "step": 65 }, { "epoch": 0.03, "grad_norm": 5.187720892130302, "learning_rate": 1.76e-05, "loss": 8.3856, "step": 66 }, { "epoch": 0.03, "grad_norm": 4.15197703033149, "learning_rate": 1.7866666666666666e-05, "loss": 8.2769, "step": 67 }, { "epoch": 0.03, "grad_norm": 5.912600166722876, "learning_rate": 1.8133333333333335e-05, "loss": 8.3477, "step": 68 }, { "epoch": 0.03, "grad_norm": 4.244802476034463, "learning_rate": 1.8400000000000003e-05, "loss": 8.1459, "step": 69 }, { "epoch": 0.03, "grad_norm": 5.722980011319047, "learning_rate": 1.866666666666667e-05, "loss": 8.2767, "step": 70 }, { "epoch": 0.03, "grad_norm": 4.174831807901733, "learning_rate": 1.8933333333333334e-05, "loss": 8.1762, "step": 71 }, { "epoch": 0.03, "grad_norm": 8.528943545214611, "learning_rate": 1.9200000000000003e-05, "loss": 8.2168, "step": 72 }, { "epoch": 0.03, "grad_norm": 3.639645536558215, "learning_rate": 1.9466666666666668e-05, "loss": 8.1628, "step": 73 }, { "epoch": 0.03, "grad_norm": 5.52886572716252, "learning_rate": 1.9733333333333336e-05, "loss": 8.1529, "step": 74 }, { "epoch": 0.03, "grad_norm": 6.075612772806507, "learning_rate": 2e-05, "loss": 7.8176, "step": 75 }, { "epoch": 0.03, "grad_norm": 6.783051863921666, "learning_rate": 1.9999991608372392e-05, "loss": 8.028, "step": 76 }, { "epoch": 0.03, "grad_norm": 3.4072414013421133, "learning_rate": 1.999996643350365e-05, "loss": 8.018, "step": 77 }, { "epoch": 0.03, "grad_norm": 4.2518888973752365, "learning_rate": 1.999992447543603e-05, "loss": 8.0281, "step": 78 }, { "epoch": 0.03, "grad_norm": 5.342242469543838, "learning_rate": 1.999986573423995e-05, "loss": 7.7778, "step": 79 }, { "epoch": 0.03, "grad_norm": 4.492869195214892, "learning_rate": 1.999979021001399e-05, "loss": 7.9055, "step": 80 }, { "epoch": 0.03, "grad_norm": 5.969553752122128, "learning_rate": 1.999969790288491e-05, "loss": 7.8983, "step": 81 }, { "epoch": 0.03, "grad_norm": 5.683302910481509, "learning_rate": 1.999958881300763e-05, "loss": 7.7587, "step": 82 }, { "epoch": 0.03, "grad_norm": 4.970945676308067, "learning_rate": 1.9999462940565242e-05, "loss": 7.7445, "step": 83 }, { "epoch": 0.03, "grad_norm": 3.102972572272466, "learning_rate": 1.9999320285769e-05, "loss": 7.73, "step": 84 }, { "epoch": 0.03, "grad_norm": 5.501542226169796, "learning_rate": 1.999916084885832e-05, "loss": 7.7589, "step": 85 }, { "epoch": 0.03, "grad_norm": 3.2240889084532633, "learning_rate": 1.999898463010079e-05, "loss": 7.7714, "step": 86 }, { "epoch": 0.03, "grad_norm": 6.2224615585033245, "learning_rate": 1.9998791629792172e-05, "loss": 7.863, "step": 87 }, { "epoch": 0.04, "grad_norm": 4.918041691814854, "learning_rate": 1.999858184825637e-05, "loss": 7.6648, "step": 88 }, { "epoch": 0.04, "grad_norm": 5.96906876524204, "learning_rate": 1.9998355285845473e-05, "loss": 7.6013, "step": 89 }, { "epoch": 0.04, "grad_norm": 5.462241702890193, "learning_rate": 1.9998111942939727e-05, "loss": 7.6095, "step": 90 }, { "epoch": 0.04, "grad_norm": 4.324181915141825, "learning_rate": 1.9997851819947537e-05, "loss": 7.5812, "step": 91 }, { "epoch": 0.04, "grad_norm": 4.031985651604528, "learning_rate": 1.999757491730548e-05, "loss": 7.6367, "step": 92 }, { "epoch": 0.04, "grad_norm": 4.638146112260251, "learning_rate": 1.999728123547828e-05, "loss": 7.5123, "step": 93 }, { "epoch": 0.04, "grad_norm": 5.290147893102006, "learning_rate": 1.9996970774958836e-05, "loss": 7.3667, "step": 94 }, { "epoch": 0.04, "grad_norm": 3.6217639310893253, "learning_rate": 1.9996643536268202e-05, "loss": 7.3957, "step": 95 }, { "epoch": 0.04, "grad_norm": 4.565797170343041, "learning_rate": 1.999629951995559e-05, "loss": 7.4662, "step": 96 }, { "epoch": 0.04, "grad_norm": 3.3508669824426702, "learning_rate": 1.9995938726598374e-05, "loss": 7.378, "step": 97 }, { "epoch": 0.04, "grad_norm": 5.424192916253537, "learning_rate": 1.999556115680208e-05, "loss": 7.3639, "step": 98 }, { "epoch": 0.04, "grad_norm": 4.232652541715236, "learning_rate": 1.999516681120039e-05, "loss": 7.2653, "step": 99 }, { "epoch": 0.04, "grad_norm": 4.129496340056508, "learning_rate": 1.9994755690455154e-05, "loss": 7.4364, "step": 100 }, { "epoch": 0.04, "grad_norm": 4.233080193988463, "learning_rate": 1.999432779525635e-05, "loss": 7.2178, "step": 101 }, { "epoch": 0.04, "grad_norm": 4.176827501719459, "learning_rate": 1.9993883126322142e-05, "loss": 7.254, "step": 102 }, { "epoch": 0.04, "grad_norm": 3.758141329766479, "learning_rate": 1.9993421684398825e-05, "loss": 7.1993, "step": 103 }, { "epoch": 0.04, "grad_norm": 4.405265927651899, "learning_rate": 1.9992943470260845e-05, "loss": 7.3139, "step": 104 }, { "epoch": 0.04, "grad_norm": 4.152161371033381, "learning_rate": 1.99924484847108e-05, "loss": 7.0201, "step": 105 }, { "epoch": 0.04, "grad_norm": 3.7267363140868968, "learning_rate": 1.9991936728579438e-05, "loss": 7.0296, "step": 106 }, { "epoch": 0.04, "grad_norm": 4.3497962003683925, "learning_rate": 1.999140820272566e-05, "loss": 7.2447, "step": 107 }, { "epoch": 0.04, "grad_norm": 4.298332761550472, "learning_rate": 1.9990862908036492e-05, "loss": 7.0496, "step": 108 }, { "epoch": 0.04, "grad_norm": 2.846041734386933, "learning_rate": 1.9990300845427123e-05, "loss": 7.015, "step": 109 }, { "epoch": 0.04, "grad_norm": 5.155174975783645, "learning_rate": 1.998972201584088e-05, "loss": 6.952, "step": 110 }, { "epoch": 0.04, "grad_norm": 3.420517963314204, "learning_rate": 1.998912642024922e-05, "loss": 7.0024, "step": 111 }, { "epoch": 0.04, "grad_norm": 4.174524593326725, "learning_rate": 1.998851405965175e-05, "loss": 6.9717, "step": 112 }, { "epoch": 0.05, "grad_norm": 4.633064351625967, "learning_rate": 1.9987884935076213e-05, "loss": 7.0441, "step": 113 }, { "epoch": 0.05, "grad_norm": 3.841624516136794, "learning_rate": 1.9987239047578482e-05, "loss": 6.9816, "step": 114 }, { "epoch": 0.05, "grad_norm": 5.012980808319145, "learning_rate": 1.9986576398242566e-05, "loss": 6.8687, "step": 115 }, { "epoch": 0.05, "grad_norm": 4.412369426332821, "learning_rate": 1.9985896988180607e-05, "loss": 6.9009, "step": 116 }, { "epoch": 0.05, "grad_norm": 3.597355208644285, "learning_rate": 1.9985200818532873e-05, "loss": 6.9828, "step": 117 }, { "epoch": 0.05, "grad_norm": 2.7175005237062635, "learning_rate": 1.9984487890467773e-05, "loss": 7.0573, "step": 118 }, { "epoch": 0.05, "grad_norm": 4.03043168020476, "learning_rate": 1.9983758205181824e-05, "loss": 6.8331, "step": 119 }, { "epoch": 0.05, "grad_norm": 4.88783774497073, "learning_rate": 1.9983011763899674e-05, "loss": 6.6711, "step": 120 }, { "epoch": 0.05, "grad_norm": 4.280680262363997, "learning_rate": 1.9982248567874098e-05, "loss": 6.8023, "step": 121 }, { "epoch": 0.05, "grad_norm": 4.577251349440423, "learning_rate": 1.998146861838599e-05, "loss": 6.8457, "step": 122 }, { "epoch": 0.05, "grad_norm": 3.5891117859872472, "learning_rate": 1.9980671916744356e-05, "loss": 6.8008, "step": 123 }, { "epoch": 0.05, "grad_norm": 4.099854105360618, "learning_rate": 1.9979858464286317e-05, "loss": 6.726, "step": 124 }, { "epoch": 0.05, "grad_norm": 3.9821886115445677, "learning_rate": 1.997902826237712e-05, "loss": 6.6006, "step": 125 }, { "epoch": 0.05, "grad_norm": 4.681735516522654, "learning_rate": 1.9978181312410104e-05, "loss": 6.7371, "step": 126 }, { "epoch": 0.05, "grad_norm": 3.4290275825448147, "learning_rate": 1.9977317615806738e-05, "loss": 6.7282, "step": 127 }, { "epoch": 0.05, "grad_norm": 3.803532374626138, "learning_rate": 1.9976437174016575e-05, "loss": 6.6815, "step": 128 }, { "epoch": 0.05, "grad_norm": 4.32888767020378, "learning_rate": 1.997553998851729e-05, "loss": 6.6643, "step": 129 }, { "epoch": 0.05, "grad_norm": 3.066789919091347, "learning_rate": 1.997462606081465e-05, "loss": 6.6853, "step": 130 }, { "epoch": 0.05, "grad_norm": 4.708099567003482, "learning_rate": 1.997369539244252e-05, "loss": 6.7651, "step": 131 }, { "epoch": 0.05, "grad_norm": 4.245589097217936, "learning_rate": 1.997274798496287e-05, "loss": 6.6351, "step": 132 }, { "epoch": 0.05, "grad_norm": 6.240512001177884, "learning_rate": 1.9971783839965756e-05, "loss": 6.509, "step": 133 }, { "epoch": 0.05, "grad_norm": 3.9243143342593974, "learning_rate": 1.997080295906933e-05, "loss": 6.6422, "step": 134 }, { "epoch": 0.05, "grad_norm": 3.9373753907930666, "learning_rate": 1.9969805343919822e-05, "loss": 6.6452, "step": 135 }, { "epoch": 0.05, "grad_norm": 3.29054775888337, "learning_rate": 1.996879099619156e-05, "loss": 6.5095, "step": 136 }, { "epoch": 0.05, "grad_norm": 4.39805253052477, "learning_rate": 1.9967759917586953e-05, "loss": 6.4897, "step": 137 }, { "epoch": 0.06, "grad_norm": 3.7120504701706247, "learning_rate": 1.9966712109836476e-05, "loss": 6.4993, "step": 138 }, { "epoch": 0.06, "grad_norm": 5.288830803394844, "learning_rate": 1.9965647574698705e-05, "loss": 6.5248, "step": 139 }, { "epoch": 0.06, "grad_norm": 4.908954938672954, "learning_rate": 1.9964566313960265e-05, "loss": 6.382, "step": 140 }, { "epoch": 0.06, "grad_norm": 6.400649085165072, "learning_rate": 1.9963468329435872e-05, "loss": 6.6161, "step": 141 }, { "epoch": 0.06, "grad_norm": 3.6352934691721006, "learning_rate": 1.9962353622968296e-05, "loss": 6.2676, "step": 142 }, { "epoch": 0.06, "grad_norm": 4.869261630058483, "learning_rate": 1.996122219642838e-05, "loss": 6.6562, "step": 143 }, { "epoch": 0.06, "grad_norm": 4.386275014923958, "learning_rate": 1.9960074051715022e-05, "loss": 6.4506, "step": 144 }, { "epoch": 0.06, "grad_norm": 4.209204783609756, "learning_rate": 1.995890919075519e-05, "loss": 6.3668, "step": 145 }, { "epoch": 0.06, "grad_norm": 6.247052362466022, "learning_rate": 1.995772761550389e-05, "loss": 6.3787, "step": 146 }, { "epoch": 0.06, "grad_norm": 4.27883778192868, "learning_rate": 1.9956529327944198e-05, "loss": 6.3488, "step": 147 }, { "epoch": 0.06, "grad_norm": 5.635114569082134, "learning_rate": 1.9955314330087225e-05, "loss": 6.343, "step": 148 }, { "epoch": 0.06, "grad_norm": 3.8840203593424962, "learning_rate": 1.9954082623972143e-05, "loss": 6.3235, "step": 149 }, { "epoch": 0.06, "grad_norm": 3.5382476230277056, "learning_rate": 1.995283421166614e-05, "loss": 6.3011, "step": 150 }, { "epoch": 0.06, "grad_norm": 3.234026714115879, "learning_rate": 1.9951569095264473e-05, "loss": 6.2582, "step": 151 }, { "epoch": 0.06, "grad_norm": 3.4587502541675947, "learning_rate": 1.995028727689041e-05, "loss": 6.3841, "step": 152 }, { "epoch": 0.06, "grad_norm": 6.061472086983153, "learning_rate": 1.9948988758695263e-05, "loss": 6.1556, "step": 153 }, { "epoch": 0.06, "grad_norm": 5.147311138116654, "learning_rate": 1.994767354285837e-05, "loss": 6.1378, "step": 154 }, { "epoch": 0.06, "grad_norm": 7.231891466089776, "learning_rate": 1.9946341631587086e-05, "loss": 6.2871, "step": 155 }, { "epoch": 0.06, "grad_norm": 4.434419768405808, "learning_rate": 1.9944993027116798e-05, "loss": 6.332, "step": 156 }, { "epoch": 0.06, "grad_norm": 5.531188775167551, "learning_rate": 1.9943627731710896e-05, "loss": 6.0622, "step": 157 }, { "epoch": 0.06, "grad_norm": 5.843702023801363, "learning_rate": 1.9942245747660797e-05, "loss": 6.2306, "step": 158 }, { "epoch": 0.06, "grad_norm": 4.766495021175679, "learning_rate": 1.9940847077285918e-05, "loss": 6.1657, "step": 159 }, { "epoch": 0.06, "grad_norm": 4.226745630752804, "learning_rate": 1.9939431722933678e-05, "loss": 6.2713, "step": 160 }, { "epoch": 0.06, "grad_norm": 4.213643944441643, "learning_rate": 1.993799968697951e-05, "loss": 6.2101, "step": 161 }, { "epoch": 0.06, "grad_norm": 4.320563316825058, "learning_rate": 1.9936550971826835e-05, "loss": 6.1438, "step": 162 }, { "epoch": 0.07, "grad_norm": 3.816127954551905, "learning_rate": 1.9935085579907064e-05, "loss": 6.2491, "step": 163 }, { "epoch": 0.07, "grad_norm": 5.6329522483416845, "learning_rate": 1.9933603513679604e-05, "loss": 6.1279, "step": 164 }, { "epoch": 0.07, "grad_norm": 3.027487870638996, "learning_rate": 1.9932104775631847e-05, "loss": 6.0602, "step": 165 }, { "epoch": 0.07, "grad_norm": 4.98578715574418, "learning_rate": 1.993058936827916e-05, "loss": 6.0202, "step": 166 }, { "epoch": 0.07, "grad_norm": 5.604110505866004, "learning_rate": 1.9929057294164894e-05, "loss": 6.0824, "step": 167 }, { "epoch": 0.07, "grad_norm": 5.246568749376813, "learning_rate": 1.992750855586036e-05, "loss": 5.9831, "step": 168 }, { "epoch": 0.07, "grad_norm": 2.7019710360614204, "learning_rate": 1.9925943155964857e-05, "loss": 6.0617, "step": 169 }, { "epoch": 0.07, "grad_norm": 5.276255882489964, "learning_rate": 1.9924361097105624e-05, "loss": 5.8458, "step": 170 }, { "epoch": 0.07, "grad_norm": 10.92369718317991, "learning_rate": 1.992276238193788e-05, "loss": 5.9862, "step": 171 }, { "epoch": 0.07, "grad_norm": 7.178133635023751, "learning_rate": 1.9921147013144782e-05, "loss": 5.8947, "step": 172 }, { "epoch": 0.07, "grad_norm": 4.894193449640341, "learning_rate": 1.9919514993437445e-05, "loss": 5.864, "step": 173 }, { "epoch": 0.07, "grad_norm": 6.701018088179099, "learning_rate": 1.9917866325554936e-05, "loss": 6.0865, "step": 174 }, { "epoch": 0.07, "grad_norm": 6.066248382914103, "learning_rate": 1.9916201012264255e-05, "loss": 6.032, "step": 175 }, { "epoch": 0.07, "grad_norm": 6.103277009648441, "learning_rate": 1.991451905636033e-05, "loss": 6.1079, "step": 176 }, { "epoch": 0.07, "grad_norm": 5.025146053384431, "learning_rate": 1.9912820460666046e-05, "loss": 5.9556, "step": 177 }, { "epoch": 0.07, "grad_norm": 4.297603739475917, "learning_rate": 1.9911105228032186e-05, "loss": 5.9033, "step": 178 }, { "epoch": 0.07, "grad_norm": 5.973721423418755, "learning_rate": 1.9909373361337475e-05, "loss": 5.9913, "step": 179 }, { "epoch": 0.07, "grad_norm": 3.7597853707129434, "learning_rate": 1.990762486348855e-05, "loss": 5.91, "step": 180 }, { "epoch": 0.07, "grad_norm": 6.429544778679046, "learning_rate": 1.990585973741996e-05, "loss": 5.9034, "step": 181 }, { "epoch": 0.07, "grad_norm": 5.746428539982332, "learning_rate": 1.9904077986094153e-05, "loss": 5.8531, "step": 182 }, { "epoch": 0.07, "grad_norm": 4.325501317331493, "learning_rate": 1.9902279612501494e-05, "loss": 5.8167, "step": 183 }, { "epoch": 0.07, "grad_norm": 4.410086017324782, "learning_rate": 1.9900464619660243e-05, "loss": 5.7331, "step": 184 }, { "epoch": 0.07, "grad_norm": 3.99166656509949, "learning_rate": 1.989863301061654e-05, "loss": 5.9225, "step": 185 }, { "epoch": 0.07, "grad_norm": 4.535795454860607, "learning_rate": 1.989678478844443e-05, "loss": 5.8929, "step": 186 }, { "epoch": 0.07, "grad_norm": 3.547069866912832, "learning_rate": 1.9894919956245825e-05, "loss": 5.9094, "step": 187 }, { "epoch": 0.08, "grad_norm": 3.4283860612781245, "learning_rate": 1.9893038517150526e-05, "loss": 5.9043, "step": 188 }, { "epoch": 0.08, "grad_norm": 4.162844234054413, "learning_rate": 1.9891140474316197e-05, "loss": 5.7997, "step": 189 }, { "epoch": 0.08, "grad_norm": 3.305168114018011, "learning_rate": 1.9889225830928365e-05, "loss": 5.7349, "step": 190 }, { "epoch": 0.08, "grad_norm": 4.055746785153127, "learning_rate": 1.9887294590200437e-05, "loss": 5.8111, "step": 191 }, { "epoch": 0.08, "grad_norm": 2.7328825091394324, "learning_rate": 1.988534675537366e-05, "loss": 5.8384, "step": 192 }, { "epoch": 0.08, "grad_norm": 3.1019661386474735, "learning_rate": 1.988338232971713e-05, "loss": 5.8723, "step": 193 }, { "epoch": 0.08, "grad_norm": 3.497989489932904, "learning_rate": 1.9881401316527795e-05, "loss": 5.9128, "step": 194 }, { "epoch": 0.08, "grad_norm": 3.8831242401657087, "learning_rate": 1.987940371913044e-05, "loss": 5.8428, "step": 195 }, { "epoch": 0.08, "grad_norm": 3.174688702002974, "learning_rate": 1.9877389540877686e-05, "loss": 5.7467, "step": 196 }, { "epoch": 0.08, "grad_norm": 3.055915757767483, "learning_rate": 1.9875358785149982e-05, "loss": 5.7186, "step": 197 }, { "epoch": 0.08, "grad_norm": 5.782137439007435, "learning_rate": 1.987331145535559e-05, "loss": 5.8389, "step": 198 }, { "epoch": 0.08, "grad_norm": 3.2379181329566777, "learning_rate": 1.98712475549306e-05, "loss": 5.7215, "step": 199 }, { "epoch": 0.08, "grad_norm": 3.6456344440328845, "learning_rate": 1.9869167087338908e-05, "loss": 5.618, "step": 200 }, { "epoch": 0.08, "grad_norm": 3.7619772961608255, "learning_rate": 1.9867070056072215e-05, "loss": 5.8516, "step": 201 }, { "epoch": 0.08, "grad_norm": 3.5740327111798322, "learning_rate": 1.9864956464650027e-05, "loss": 5.7066, "step": 202 }, { "epoch": 0.08, "grad_norm": 4.033968317687985, "learning_rate": 1.986282631661963e-05, "loss": 5.6463, "step": 203 }, { "epoch": 0.08, "grad_norm": 3.644873695688273, "learning_rate": 1.9860679615556112e-05, "loss": 5.5472, "step": 204 }, { "epoch": 0.08, "grad_norm": 3.2695708458307617, "learning_rate": 1.9858516365062334e-05, "loss": 5.7738, "step": 205 }, { "epoch": 0.08, "grad_norm": 4.43428089723817, "learning_rate": 1.9856336568768936e-05, "loss": 5.6885, "step": 206 }, { "epoch": 0.08, "grad_norm": 4.203310963319522, "learning_rate": 1.9854140230334323e-05, "loss": 5.6778, "step": 207 }, { "epoch": 0.08, "grad_norm": 3.4114108774306633, "learning_rate": 1.985192735344467e-05, "loss": 5.9011, "step": 208 }, { "epoch": 0.08, "grad_norm": 4.125731707197647, "learning_rate": 1.98496979418139e-05, "loss": 5.5251, "step": 209 }, { "epoch": 0.08, "grad_norm": 3.558175716829582, "learning_rate": 1.9847451999183692e-05, "loss": 5.5834, "step": 210 }, { "epoch": 0.08, "grad_norm": 3.1705697609316523, "learning_rate": 1.9845189529323473e-05, "loss": 5.6806, "step": 211 }, { "epoch": 0.08, "grad_norm": 5.375356341487538, "learning_rate": 1.98429105360304e-05, "loss": 5.6382, "step": 212 }, { "epoch": 0.09, "grad_norm": 4.886412279702093, "learning_rate": 1.9840615023129372e-05, "loss": 5.5464, "step": 213 }, { "epoch": 0.09, "grad_norm": 4.003221004452615, "learning_rate": 1.9838302994473e-05, "loss": 5.5527, "step": 214 }, { "epoch": 0.09, "grad_norm": 3.2142950313355976, "learning_rate": 1.9835974453941623e-05, "loss": 5.4318, "step": 215 }, { "epoch": 0.09, "grad_norm": 4.124707418757366, "learning_rate": 1.9833629405443283e-05, "loss": 5.4974, "step": 216 }, { "epoch": 0.09, "grad_norm": 5.162365896102337, "learning_rate": 1.983126785291375e-05, "loss": 5.5908, "step": 217 }, { "epoch": 0.09, "grad_norm": 3.5671782583753178, "learning_rate": 1.9828889800316467e-05, "loss": 5.5886, "step": 218 }, { "epoch": 0.09, "grad_norm": 5.552839596348838, "learning_rate": 1.982649525164258e-05, "loss": 5.527, "step": 219 }, { "epoch": 0.09, "grad_norm": 5.338000185805563, "learning_rate": 1.9824084210910924e-05, "loss": 5.6058, "step": 220 }, { "epoch": 0.09, "grad_norm": 4.314810218621173, "learning_rate": 1.9821656682168013e-05, "loss": 5.6443, "step": 221 }, { "epoch": 0.09, "grad_norm": 4.124905233622171, "learning_rate": 1.9819212669488026e-05, "loss": 5.8652, "step": 222 }, { "epoch": 0.09, "grad_norm": 5.601461559913127, "learning_rate": 1.9816752176972815e-05, "loss": 5.6487, "step": 223 }, { "epoch": 0.09, "grad_norm": 4.682447349841513, "learning_rate": 1.9814275208751882e-05, "loss": 5.4415, "step": 224 }, { "epoch": 0.09, "grad_norm": 4.092388938276944, "learning_rate": 1.9811781768982392e-05, "loss": 5.4588, "step": 225 }, { "epoch": 0.09, "grad_norm": 5.5748442841522134, "learning_rate": 1.9809271861849147e-05, "loss": 5.4366, "step": 226 }, { "epoch": 0.09, "grad_norm": 3.6795744789993554, "learning_rate": 1.9806745491564588e-05, "loss": 5.5378, "step": 227 }, { "epoch": 0.09, "grad_norm": 4.236814926224578, "learning_rate": 1.9804202662368782e-05, "loss": 5.5051, "step": 228 }, { "epoch": 0.09, "grad_norm": 3.8825245603921315, "learning_rate": 1.980164337852943e-05, "loss": 5.4982, "step": 229 }, { "epoch": 0.09, "grad_norm": 4.689216752889219, "learning_rate": 1.9799067644341844e-05, "loss": 5.4037, "step": 230 }, { "epoch": 0.09, "grad_norm": 4.498751852842296, "learning_rate": 1.9796475464128943e-05, "loss": 5.5379, "step": 231 }, { "epoch": 0.09, "grad_norm": 3.2926041035928995, "learning_rate": 1.9793866842241245e-05, "loss": 5.6656, "step": 232 }, { "epoch": 0.09, "grad_norm": 4.563969254045201, "learning_rate": 1.9791241783056874e-05, "loss": 5.5352, "step": 233 }, { "epoch": 0.09, "grad_norm": 3.9349353489105567, "learning_rate": 1.9788600290981525e-05, "loss": 5.6132, "step": 234 }, { "epoch": 0.09, "grad_norm": 4.461095251707113, "learning_rate": 1.978594237044849e-05, "loss": 5.5776, "step": 235 }, { "epoch": 0.09, "grad_norm": 4.128877934256824, "learning_rate": 1.9783268025918622e-05, "loss": 5.3886, "step": 236 }, { "epoch": 0.09, "grad_norm": 4.340131228111977, "learning_rate": 1.9780577261880336e-05, "loss": 5.4318, "step": 237 }, { "epoch": 0.1, "grad_norm": 3.5405936438145473, "learning_rate": 1.977787008284962e-05, "loss": 5.612, "step": 238 }, { "epoch": 0.1, "grad_norm": 3.9277621763337125, "learning_rate": 1.9775146493369996e-05, "loss": 5.5956, "step": 239 }, { "epoch": 0.1, "grad_norm": 4.189175924053646, "learning_rate": 1.977240649801253e-05, "loss": 5.4193, "step": 240 }, { "epoch": 0.1, "grad_norm": 3.7261802456797044, "learning_rate": 1.9769650101375835e-05, "loss": 5.4232, "step": 241 }, { "epoch": 0.1, "grad_norm": 5.461796387404502, "learning_rate": 1.9766877308086038e-05, "loss": 5.5134, "step": 242 }, { "epoch": 0.1, "grad_norm": 4.061775651187604, "learning_rate": 1.9764088122796785e-05, "loss": 5.4679, "step": 243 }, { "epoch": 0.1, "grad_norm": 5.965102040882132, "learning_rate": 1.976128255018924e-05, "loss": 5.5467, "step": 244 }, { "epoch": 0.1, "grad_norm": 4.676829205300839, "learning_rate": 1.9758460594972068e-05, "loss": 5.4451, "step": 245 }, { "epoch": 0.1, "grad_norm": 4.911400602977502, "learning_rate": 1.975562226188143e-05, "loss": 5.4803, "step": 246 }, { "epoch": 0.1, "grad_norm": 6.282418377084345, "learning_rate": 1.9752767555680967e-05, "loss": 5.286, "step": 247 }, { "epoch": 0.1, "grad_norm": 4.308118562484986, "learning_rate": 1.9749896481161807e-05, "loss": 5.4824, "step": 248 }, { "epoch": 0.1, "grad_norm": 7.263589886497265, "learning_rate": 1.9747009043142556e-05, "loss": 5.5256, "step": 249 }, { "epoch": 0.1, "grad_norm": 3.827111939511474, "learning_rate": 1.9744105246469264e-05, "loss": 5.5244, "step": 250 }, { "epoch": 0.1, "grad_norm": 5.403518844265166, "learning_rate": 1.974118509601545e-05, "loss": 5.3989, "step": 251 }, { "epoch": 0.1, "grad_norm": 3.720678665989358, "learning_rate": 1.9738248596682078e-05, "loss": 5.4725, "step": 252 }, { "epoch": 0.1, "grad_norm": 5.631390734909404, "learning_rate": 1.973529575339755e-05, "loss": 5.4297, "step": 253 }, { "epoch": 0.1, "grad_norm": 3.629825341493495, "learning_rate": 1.9732326571117703e-05, "loss": 5.3291, "step": 254 }, { "epoch": 0.1, "grad_norm": 4.527878508159066, "learning_rate": 1.9729341054825783e-05, "loss": 5.4482, "step": 255 }, { "epoch": 0.1, "grad_norm": 5.041074384352786, "learning_rate": 1.9726339209532462e-05, "loss": 5.4306, "step": 256 }, { "epoch": 0.1, "grad_norm": 3.7876647301759494, "learning_rate": 1.9723321040275816e-05, "loss": 5.3598, "step": 257 }, { "epoch": 0.1, "grad_norm": 3.9349814631193314, "learning_rate": 1.972028655212131e-05, "loss": 5.2161, "step": 258 }, { "epoch": 0.1, "grad_norm": 4.196007883835419, "learning_rate": 1.9717235750161808e-05, "loss": 5.1868, "step": 259 }, { "epoch": 0.1, "grad_norm": 4.128921591316096, "learning_rate": 1.9714168639517543e-05, "loss": 5.274, "step": 260 }, { "epoch": 0.1, "grad_norm": 3.42696847333355, "learning_rate": 1.971108522533613e-05, "loss": 5.4778, "step": 261 }, { "epoch": 0.1, "grad_norm": 3.0050007534041336, "learning_rate": 1.9707985512792544e-05, "loss": 5.5596, "step": 262 }, { "epoch": 0.11, "grad_norm": 3.669616234270172, "learning_rate": 1.9704869507089105e-05, "loss": 5.3656, "step": 263 }, { "epoch": 0.11, "grad_norm": 3.7645184234700584, "learning_rate": 1.970173721345549e-05, "loss": 5.3022, "step": 264 }, { "epoch": 0.11, "grad_norm": 3.0530948203414745, "learning_rate": 1.9698588637148705e-05, "loss": 5.374, "step": 265 }, { "epoch": 0.11, "grad_norm": 3.8200253090291936, "learning_rate": 1.9695423783453086e-05, "loss": 5.2075, "step": 266 }, { "epoch": 0.11, "grad_norm": 4.315490825813043, "learning_rate": 1.9692242657680286e-05, "loss": 5.2312, "step": 267 }, { "epoch": 0.11, "grad_norm": 4.5436673528211955, "learning_rate": 1.9689045265169272e-05, "loss": 5.3477, "step": 268 }, { "epoch": 0.11, "grad_norm": 3.6433231953876364, "learning_rate": 1.9685831611286312e-05, "loss": 5.3378, "step": 269 }, { "epoch": 0.11, "grad_norm": 4.116580849675184, "learning_rate": 1.9682601701424958e-05, "loss": 5.3044, "step": 270 }, { "epoch": 0.11, "grad_norm": 3.3227233442460795, "learning_rate": 1.9679355541006056e-05, "loss": 5.3657, "step": 271 }, { "epoch": 0.11, "grad_norm": 3.7533500222827096, "learning_rate": 1.9676093135477713e-05, "loss": 5.255, "step": 272 }, { "epoch": 0.11, "grad_norm": 4.564992658943727, "learning_rate": 1.9672814490315312e-05, "loss": 5.3761, "step": 273 }, { "epoch": 0.11, "grad_norm": 3.8094013405705556, "learning_rate": 1.9669519611021485e-05, "loss": 5.2762, "step": 274 }, { "epoch": 0.11, "grad_norm": 3.43100443242049, "learning_rate": 1.9666208503126115e-05, "loss": 5.1598, "step": 275 }, { "epoch": 0.11, "grad_norm": 4.2699612429758895, "learning_rate": 1.9662881172186313e-05, "loss": 5.3879, "step": 276 }, { "epoch": 0.11, "grad_norm": 3.922986884818375, "learning_rate": 1.9659537623786428e-05, "loss": 5.4821, "step": 277 }, { "epoch": 0.11, "grad_norm": 5.4516156643641995, "learning_rate": 1.9656177863538025e-05, "loss": 5.3939, "step": 278 }, { "epoch": 0.11, "grad_norm": 4.939693355525761, "learning_rate": 1.965280189707987e-05, "loss": 5.2649, "step": 279 }, { "epoch": 0.11, "grad_norm": 3.3627846721747887, "learning_rate": 1.9649409730077934e-05, "loss": 5.3007, "step": 280 }, { "epoch": 0.11, "grad_norm": 5.506360077258328, "learning_rate": 1.9646001368225382e-05, "loss": 5.4097, "step": 281 }, { "epoch": 0.11, "grad_norm": 6.214193129635226, "learning_rate": 1.9642576817242553e-05, "loss": 5.316, "step": 282 }, { "epoch": 0.11, "grad_norm": 5.2863221250018135, "learning_rate": 1.9639136082876954e-05, "loss": 5.3198, "step": 283 }, { "epoch": 0.11, "grad_norm": 5.826690727773217, "learning_rate": 1.9635679170903258e-05, "loss": 5.3665, "step": 284 }, { "epoch": 0.11, "grad_norm": 6.521409795613998, "learning_rate": 1.9632206087123296e-05, "loss": 5.1958, "step": 285 }, { "epoch": 0.11, "grad_norm": 6.195903307493117, "learning_rate": 1.962871683736603e-05, "loss": 5.1238, "step": 286 }, { "epoch": 0.11, "grad_norm": 5.701677642348548, "learning_rate": 1.962521142748755e-05, "loss": 5.1754, "step": 287 }, { "epoch": 0.12, "grad_norm": 5.650371964203301, "learning_rate": 1.9621689863371083e-05, "loss": 5.2856, "step": 288 }, { "epoch": 0.12, "grad_norm": 4.643759454410139, "learning_rate": 1.9618152150926953e-05, "loss": 5.4458, "step": 289 }, { "epoch": 0.12, "grad_norm": 5.0810750026652345, "learning_rate": 1.9614598296092603e-05, "loss": 5.3881, "step": 290 }, { "epoch": 0.12, "grad_norm": 4.540921451453481, "learning_rate": 1.9611028304832547e-05, "loss": 5.3133, "step": 291 }, { "epoch": 0.12, "grad_norm": 4.449089021608378, "learning_rate": 1.9607442183138403e-05, "loss": 5.2165, "step": 292 }, { "epoch": 0.12, "grad_norm": 4.303653173381438, "learning_rate": 1.960383993702884e-05, "loss": 5.2999, "step": 293 }, { "epoch": 0.12, "grad_norm": 4.777311506166449, "learning_rate": 1.9600221572549607e-05, "loss": 5.1426, "step": 294 }, { "epoch": 0.12, "grad_norm": 5.756333904685148, "learning_rate": 1.9596587095773496e-05, "loss": 5.3915, "step": 295 }, { "epoch": 0.12, "grad_norm": 6.378184736178621, "learning_rate": 1.959293651280034e-05, "loss": 5.138, "step": 296 }, { "epoch": 0.12, "grad_norm": 4.883143470991173, "learning_rate": 1.958926982975701e-05, "loss": 5.1153, "step": 297 }, { "epoch": 0.12, "grad_norm": 6.929401125683602, "learning_rate": 1.958558705279739e-05, "loss": 5.2267, "step": 298 }, { "epoch": 0.12, "grad_norm": 5.781275244606163, "learning_rate": 1.9581888188102375e-05, "loss": 5.2871, "step": 299 }, { "epoch": 0.12, "grad_norm": 4.234221317741716, "learning_rate": 1.957817324187987e-05, "loss": 5.1617, "step": 300 }, { "epoch": 0.12, "grad_norm": 7.159465777574264, "learning_rate": 1.9574442220364768e-05, "loss": 5.1299, "step": 301 }, { "epoch": 0.12, "grad_norm": 4.256329018916455, "learning_rate": 1.9570695129818928e-05, "loss": 5.2352, "step": 302 }, { "epoch": 0.12, "grad_norm": 5.133018933678203, "learning_rate": 1.956693197653119e-05, "loss": 5.2031, "step": 303 }, { "epoch": 0.12, "grad_norm": 3.87755014809663, "learning_rate": 1.9563152766817356e-05, "loss": 5.2025, "step": 304 }, { "epoch": 0.12, "grad_norm": 3.9463633578914727, "learning_rate": 1.9559357507020163e-05, "loss": 5.0225, "step": 305 }, { "epoch": 0.12, "grad_norm": 4.274298024612477, "learning_rate": 1.9555546203509297e-05, "loss": 5.1675, "step": 306 }, { "epoch": 0.12, "grad_norm": 4.571969693205734, "learning_rate": 1.9551718862681363e-05, "loss": 5.1387, "step": 307 }, { "epoch": 0.12, "grad_norm": 4.441878767657563, "learning_rate": 1.9547875490959884e-05, "loss": 5.3539, "step": 308 }, { "epoch": 0.12, "grad_norm": 3.627572689101822, "learning_rate": 1.9544016094795294e-05, "loss": 5.1321, "step": 309 }, { "epoch": 0.12, "grad_norm": 4.781514761663345, "learning_rate": 1.9540140680664915e-05, "loss": 5.1977, "step": 310 }, { "epoch": 0.12, "grad_norm": 4.140560769424146, "learning_rate": 1.953624925507295e-05, "loss": 5.0422, "step": 311 }, { "epoch": 0.12, "grad_norm": 4.907850959965522, "learning_rate": 1.953234182455048e-05, "loss": 5.3369, "step": 312 }, { "epoch": 0.13, "grad_norm": 4.691324420893443, "learning_rate": 1.9528418395655443e-05, "loss": 5.2007, "step": 313 }, { "epoch": 0.13, "grad_norm": 5.0983085273471955, "learning_rate": 1.952447897497263e-05, "loss": 5.1451, "step": 314 }, { "epoch": 0.13, "grad_norm": 5.354850165008846, "learning_rate": 1.952052356911368e-05, "loss": 5.1415, "step": 315 }, { "epoch": 0.13, "grad_norm": 3.8975948202258897, "learning_rate": 1.9516552184717036e-05, "loss": 5.0287, "step": 316 }, { "epoch": 0.13, "grad_norm": 4.873422769676822, "learning_rate": 1.951256482844799e-05, "loss": 4.9794, "step": 317 }, { "epoch": 0.13, "grad_norm": 4.683604880793774, "learning_rate": 1.9508561506998613e-05, "loss": 5.105, "step": 318 }, { "epoch": 0.13, "grad_norm": 7.194250897520733, "learning_rate": 1.950454222708778e-05, "loss": 5.2785, "step": 319 }, { "epoch": 0.13, "grad_norm": 6.560112427184304, "learning_rate": 1.950050699546116e-05, "loss": 4.9859, "step": 320 }, { "epoch": 0.13, "grad_norm": 5.652738444026526, "learning_rate": 1.949645581889118e-05, "loss": 5.0616, "step": 321 }, { "epoch": 0.13, "grad_norm": 3.8360003224169783, "learning_rate": 1.9492388704177036e-05, "loss": 5.0376, "step": 322 }, { "epoch": 0.13, "grad_norm": 4.744616157206043, "learning_rate": 1.9488305658144666e-05, "loss": 5.0505, "step": 323 }, { "epoch": 0.13, "grad_norm": 5.197730198094315, "learning_rate": 1.9484206687646753e-05, "loss": 5.2067, "step": 324 }, { "epoch": 0.13, "grad_norm": 4.344506328513772, "learning_rate": 1.9480091799562706e-05, "loss": 5.2547, "step": 325 }, { "epoch": 0.13, "grad_norm": 5.033536417957443, "learning_rate": 1.9475961000798645e-05, "loss": 5.1837, "step": 326 }, { "epoch": 0.13, "grad_norm": 4.220408063420163, "learning_rate": 1.947181429828739e-05, "loss": 5.0591, "step": 327 }, { "epoch": 0.13, "grad_norm": 4.20562324255212, "learning_rate": 1.9467651698988464e-05, "loss": 5.2423, "step": 328 }, { "epoch": 0.13, "grad_norm": 4.741171703804873, "learning_rate": 1.9463473209888063e-05, "loss": 5.0776, "step": 329 }, { "epoch": 0.13, "grad_norm": 3.352390084872122, "learning_rate": 1.9459278837999048e-05, "loss": 5.1908, "step": 330 }, { "epoch": 0.13, "grad_norm": 7.336766536879142, "learning_rate": 1.9455068590360943e-05, "loss": 5.0963, "step": 331 }, { "epoch": 0.13, "grad_norm": 5.323080508118235, "learning_rate": 1.9450842474039914e-05, "loss": 5.0653, "step": 332 }, { "epoch": 0.13, "grad_norm": 7.748250010976563, "learning_rate": 1.944660049612876e-05, "loss": 5.1332, "step": 333 }, { "epoch": 0.13, "grad_norm": 4.7527834896436865, "learning_rate": 1.9442342663746903e-05, "loss": 4.9813, "step": 334 }, { "epoch": 0.13, "grad_norm": 5.5313121566876555, "learning_rate": 1.9438068984040366e-05, "loss": 4.8612, "step": 335 }, { "epoch": 0.13, "grad_norm": 4.769650373642717, "learning_rate": 1.943377946418178e-05, "loss": 5.1038, "step": 336 }, { "epoch": 0.13, "grad_norm": 4.254447255879978, "learning_rate": 1.942947411137035e-05, "loss": 4.9938, "step": 337 }, { "epoch": 0.14, "grad_norm": 7.232316770724755, "learning_rate": 1.942515293283187e-05, "loss": 4.9693, "step": 338 }, { "epoch": 0.14, "grad_norm": 3.84144956221707, "learning_rate": 1.9420815935818673e-05, "loss": 5.0987, "step": 339 }, { "epoch": 0.14, "grad_norm": 4.639198534380874, "learning_rate": 1.9416463127609655e-05, "loss": 5.0796, "step": 340 }, { "epoch": 0.14, "grad_norm": 4.293798925443441, "learning_rate": 1.941209451551025e-05, "loss": 4.9292, "step": 341 }, { "epoch": 0.14, "grad_norm": 4.903633350612866, "learning_rate": 1.9407710106852405e-05, "loss": 5.129, "step": 342 }, { "epoch": 0.14, "grad_norm": 5.126542780485656, "learning_rate": 1.940330990899459e-05, "loss": 4.9603, "step": 343 }, { "epoch": 0.14, "grad_norm": 4.288069212982325, "learning_rate": 1.9398893929321763e-05, "loss": 4.9396, "step": 344 }, { "epoch": 0.14, "grad_norm": 3.9523613385551686, "learning_rate": 1.9394462175245382e-05, "loss": 5.1768, "step": 345 }, { "epoch": 0.14, "grad_norm": 3.996899954396938, "learning_rate": 1.939001465420337e-05, "loss": 4.9787, "step": 346 }, { "epoch": 0.14, "grad_norm": 4.569032903501148, "learning_rate": 1.9385551373660113e-05, "loss": 4.9764, "step": 347 }, { "epoch": 0.14, "grad_norm": 4.176747620302349, "learning_rate": 1.9381072341106453e-05, "loss": 5.1336, "step": 348 }, { "epoch": 0.14, "grad_norm": 4.154232238794425, "learning_rate": 1.937657756405966e-05, "loss": 5.058, "step": 349 }, { "epoch": 0.14, "grad_norm": 3.8984462605590084, "learning_rate": 1.937206705006344e-05, "loss": 4.9333, "step": 350 }, { "epoch": 0.14, "grad_norm": 4.690682787780671, "learning_rate": 1.9367540806687894e-05, "loss": 4.974, "step": 351 }, { "epoch": 0.14, "grad_norm": 4.031124140257661, "learning_rate": 1.9362998841529542e-05, "loss": 5.1886, "step": 352 }, { "epoch": 0.14, "grad_norm": 3.992297500197864, "learning_rate": 1.935844116221127e-05, "loss": 5.0888, "step": 353 }, { "epoch": 0.14, "grad_norm": 3.818973294829996, "learning_rate": 1.9353867776382357e-05, "loss": 5.1499, "step": 354 }, { "epoch": 0.14, "grad_norm": 3.619804549551641, "learning_rate": 1.9349278691718426e-05, "loss": 4.9794, "step": 355 }, { "epoch": 0.14, "grad_norm": 3.592935767722761, "learning_rate": 1.934467391592146e-05, "loss": 4.8134, "step": 356 }, { "epoch": 0.14, "grad_norm": 3.796223984937865, "learning_rate": 1.9340053456719768e-05, "loss": 5.0538, "step": 357 }, { "epoch": 0.14, "grad_norm": 3.2741434670658767, "learning_rate": 1.9335417321867988e-05, "loss": 4.8695, "step": 358 }, { "epoch": 0.14, "grad_norm": 3.4290531934102613, "learning_rate": 1.9330765519147058e-05, "loss": 4.8899, "step": 359 }, { "epoch": 0.14, "grad_norm": 3.8359637299600364, "learning_rate": 1.9326098056364224e-05, "loss": 5.0173, "step": 360 }, { "epoch": 0.14, "grad_norm": 5.249562964456709, "learning_rate": 1.9321414941353006e-05, "loss": 4.9638, "step": 361 }, { "epoch": 0.14, "grad_norm": 3.552155369652054, "learning_rate": 1.931671618197319e-05, "loss": 4.8706, "step": 362 }, { "epoch": 0.15, "grad_norm": 4.94673706390727, "learning_rate": 1.931200178611083e-05, "loss": 4.9603, "step": 363 }, { "epoch": 0.15, "grad_norm": 3.587398864725715, "learning_rate": 1.9307271761678214e-05, "loss": 5.1801, "step": 364 }, { "epoch": 0.15, "grad_norm": 3.786932404767004, "learning_rate": 1.9302526116613863e-05, "loss": 4.8651, "step": 365 }, { "epoch": 0.15, "grad_norm": 4.4252520799983905, "learning_rate": 1.9297764858882516e-05, "loss": 5.1099, "step": 366 }, { "epoch": 0.15, "grad_norm": 3.1820317143483927, "learning_rate": 1.9292987996475113e-05, "loss": 4.8881, "step": 367 }, { "epoch": 0.15, "grad_norm": 4.807420527910623, "learning_rate": 1.928819553740878e-05, "loss": 4.9814, "step": 368 }, { "epoch": 0.15, "grad_norm": 4.2681398731258975, "learning_rate": 1.9283387489726827e-05, "loss": 4.8342, "step": 369 }, { "epoch": 0.15, "grad_norm": 4.2039721487863915, "learning_rate": 1.9278563861498726e-05, "loss": 4.9355, "step": 370 }, { "epoch": 0.15, "grad_norm": 4.125766177541317, "learning_rate": 1.9273724660820086e-05, "loss": 5.0716, "step": 371 }, { "epoch": 0.15, "grad_norm": 3.872774946960404, "learning_rate": 1.9268869895812673e-05, "loss": 4.8714, "step": 372 }, { "epoch": 0.15, "grad_norm": 5.4800006941199735, "learning_rate": 1.9263999574624357e-05, "loss": 4.7984, "step": 373 }, { "epoch": 0.15, "grad_norm": 4.493398067066022, "learning_rate": 1.925911370542912e-05, "loss": 5.1133, "step": 374 }, { "epoch": 0.15, "grad_norm": 5.493654763578316, "learning_rate": 1.9254212296427043e-05, "loss": 4.7568, "step": 375 }, { "epoch": 0.15, "grad_norm": 4.658644058543485, "learning_rate": 1.9249295355844286e-05, "loss": 4.8862, "step": 376 }, { "epoch": 0.15, "grad_norm": 4.74943938464457, "learning_rate": 1.9244362891933077e-05, "loss": 4.9593, "step": 377 }, { "epoch": 0.15, "grad_norm": 4.3633638724612815, "learning_rate": 1.9239414912971697e-05, "loss": 4.8014, "step": 378 }, { "epoch": 0.15, "grad_norm": 3.8515410115588504, "learning_rate": 1.923445142726446e-05, "loss": 5.0227, "step": 379 }, { "epoch": 0.15, "grad_norm": 4.555780430950132, "learning_rate": 1.922947244314172e-05, "loss": 4.8137, "step": 380 }, { "epoch": 0.15, "grad_norm": 5.809535670084992, "learning_rate": 1.922447796895982e-05, "loss": 4.9212, "step": 381 }, { "epoch": 0.15, "grad_norm": 5.688606257617834, "learning_rate": 1.9219468013101123e-05, "loss": 5.0151, "step": 382 }, { "epoch": 0.15, "grad_norm": 6.349121714522054, "learning_rate": 1.9214442583973965e-05, "loss": 4.8788, "step": 383 }, { "epoch": 0.15, "grad_norm": 5.863385327006022, "learning_rate": 1.920940169001265e-05, "loss": 4.8467, "step": 384 }, { "epoch": 0.15, "grad_norm": 6.852291860004504, "learning_rate": 1.9204345339677442e-05, "loss": 5.0244, "step": 385 }, { "epoch": 0.15, "grad_norm": 5.064040378679961, "learning_rate": 1.919927354145454e-05, "loss": 4.8919, "step": 386 }, { "epoch": 0.15, "grad_norm": 4.802953333476715, "learning_rate": 1.919418630385607e-05, "loss": 4.9746, "step": 387 }, { "epoch": 0.16, "grad_norm": 4.651834945637443, "learning_rate": 1.9189083635420077e-05, "loss": 4.7957, "step": 388 }, { "epoch": 0.16, "grad_norm": 4.565966174891365, "learning_rate": 1.9183965544710495e-05, "loss": 4.9191, "step": 389 }, { "epoch": 0.16, "grad_norm": 4.027565446582875, "learning_rate": 1.9178832040317153e-05, "loss": 4.9894, "step": 390 }, { "epoch": 0.16, "grad_norm": 5.785579850054768, "learning_rate": 1.9173683130855737e-05, "loss": 4.8868, "step": 391 }, { "epoch": 0.16, "grad_norm": 4.764008160209594, "learning_rate": 1.9168518824967797e-05, "loss": 4.9743, "step": 392 }, { "epoch": 0.16, "grad_norm": 4.849385545583676, "learning_rate": 1.916333913132072e-05, "loss": 4.9855, "step": 393 }, { "epoch": 0.16, "grad_norm": 5.172608067530522, "learning_rate": 1.915814405860771e-05, "loss": 4.8692, "step": 394 }, { "epoch": 0.16, "grad_norm": 6.252142640840125, "learning_rate": 1.91529336155478e-05, "loss": 4.7972, "step": 395 }, { "epoch": 0.16, "grad_norm": 3.7879672350261075, "learning_rate": 1.9147707810885798e-05, "loss": 4.8605, "step": 396 }, { "epoch": 0.16, "grad_norm": 5.846876109888715, "learning_rate": 1.9142466653392317e-05, "loss": 4.7777, "step": 397 }, { "epoch": 0.16, "grad_norm": 4.271047397565751, "learning_rate": 1.913721015186372e-05, "loss": 4.7847, "step": 398 }, { "epoch": 0.16, "grad_norm": 3.792822143066731, "learning_rate": 1.913193831512213e-05, "loss": 4.8928, "step": 399 }, { "epoch": 0.16, "grad_norm": 3.818952980145011, "learning_rate": 1.9126651152015404e-05, "loss": 4.7874, "step": 400 }, { "epoch": 0.16, "grad_norm": 4.5004007269500335, "learning_rate": 1.912134867141712e-05, "loss": 4.9484, "step": 401 }, { "epoch": 0.16, "grad_norm": 3.8619583600345053, "learning_rate": 1.911603088222657e-05, "loss": 4.8863, "step": 402 }, { "epoch": 0.16, "grad_norm": 4.068544846748943, "learning_rate": 1.9110697793368733e-05, "loss": 4.9838, "step": 403 }, { "epoch": 0.16, "grad_norm": 4.678329405936819, "learning_rate": 1.9105349413794272e-05, "loss": 4.8077, "step": 404 }, { "epoch": 0.16, "grad_norm": 4.1845179577894065, "learning_rate": 1.9099985752479505e-05, "loss": 4.9151, "step": 405 }, { "epoch": 0.16, "grad_norm": 4.25200584743188, "learning_rate": 1.9094606818426403e-05, "loss": 4.9883, "step": 406 }, { "epoch": 0.16, "grad_norm": 4.469433610330476, "learning_rate": 1.908921262066257e-05, "loss": 4.9338, "step": 407 }, { "epoch": 0.16, "grad_norm": 3.9569422357282615, "learning_rate": 1.9083803168241225e-05, "loss": 4.7066, "step": 408 }, { "epoch": 0.16, "grad_norm": 4.162760399856056, "learning_rate": 1.9078378470241183e-05, "loss": 4.8367, "step": 409 }, { "epoch": 0.16, "grad_norm": 3.5660794253037156, "learning_rate": 1.9072938535766864e-05, "loss": 4.7543, "step": 410 }, { "epoch": 0.16, "grad_norm": 4.797315689316765, "learning_rate": 1.9067483373948245e-05, "loss": 4.9331, "step": 411 }, { "epoch": 0.16, "grad_norm": 4.111914062880509, "learning_rate": 1.906201299394086e-05, "loss": 4.7696, "step": 412 }, { "epoch": 0.17, "grad_norm": 4.539315392333889, "learning_rate": 1.9056527404925788e-05, "loss": 5.0399, "step": 413 }, { "epoch": 0.17, "grad_norm": 4.059585069189535, "learning_rate": 1.9051026616109637e-05, "loss": 4.757, "step": 414 }, { "epoch": 0.17, "grad_norm": 5.3260168539805735, "learning_rate": 1.904551063672452e-05, "loss": 4.702, "step": 415 }, { "epoch": 0.17, "grad_norm": 4.549834876237694, "learning_rate": 1.9039979476028044e-05, "loss": 4.6862, "step": 416 }, { "epoch": 0.17, "grad_norm": 3.6787257685993997, "learning_rate": 1.90344331433033e-05, "loss": 4.7645, "step": 417 }, { "epoch": 0.17, "grad_norm": 4.4154234577817055, "learning_rate": 1.9028871647858836e-05, "loss": 4.7376, "step": 418 }, { "epoch": 0.17, "grad_norm": 3.9661339259733364, "learning_rate": 1.9023294999028654e-05, "loss": 4.7945, "step": 419 }, { "epoch": 0.17, "grad_norm": 4.932704209466543, "learning_rate": 1.9017703206172187e-05, "loss": 4.7688, "step": 420 }, { "epoch": 0.17, "grad_norm": 3.861972016862522, "learning_rate": 1.9012096278674283e-05, "loss": 4.8772, "step": 421 }, { "epoch": 0.17, "grad_norm": 3.548438857043438, "learning_rate": 1.900647422594519e-05, "loss": 4.8837, "step": 422 }, { "epoch": 0.17, "grad_norm": 4.6790725280248475, "learning_rate": 1.900083705742054e-05, "loss": 4.8103, "step": 423 }, { "epoch": 0.17, "grad_norm": 5.539654010468228, "learning_rate": 1.8995184782561343e-05, "loss": 4.893, "step": 424 }, { "epoch": 0.17, "grad_norm": 3.68762323673155, "learning_rate": 1.8989517410853956e-05, "loss": 4.8705, "step": 425 }, { "epoch": 0.17, "grad_norm": 6.36298700012937, "learning_rate": 1.8983834951810068e-05, "loss": 4.8289, "step": 426 }, { "epoch": 0.17, "grad_norm": 4.5203690816614435, "learning_rate": 1.89781374149667e-05, "loss": 4.7811, "step": 427 }, { "epoch": 0.17, "grad_norm": 4.99630344415343, "learning_rate": 1.897242480988617e-05, "loss": 4.6708, "step": 428 }, { "epoch": 0.17, "grad_norm": 4.297946514995462, "learning_rate": 1.8966697146156092e-05, "loss": 4.7412, "step": 429 }, { "epoch": 0.17, "grad_norm": 4.807089182475005, "learning_rate": 1.896095443338935e-05, "loss": 4.8245, "step": 430 }, { "epoch": 0.17, "grad_norm": 4.442015701587801, "learning_rate": 1.895519668122408e-05, "loss": 4.741, "step": 431 }, { "epoch": 0.17, "grad_norm": 4.627904445348331, "learning_rate": 1.894942389932367e-05, "loss": 4.9728, "step": 432 }, { "epoch": 0.17, "grad_norm": 4.3819001573345915, "learning_rate": 1.8943636097376728e-05, "loss": 4.4982, "step": 433 }, { "epoch": 0.17, "grad_norm": 3.9790732584475466, "learning_rate": 1.8937833285097067e-05, "loss": 4.9344, "step": 434 }, { "epoch": 0.17, "grad_norm": 4.418435773767789, "learning_rate": 1.8932015472223692e-05, "loss": 4.7223, "step": 435 }, { "epoch": 0.17, "grad_norm": 4.379821395443412, "learning_rate": 1.8926182668520794e-05, "loss": 4.6574, "step": 436 }, { "epoch": 0.17, "grad_norm": 4.002954466901974, "learning_rate": 1.892033488377771e-05, "loss": 4.7822, "step": 437 }, { "epoch": 0.18, "grad_norm": 5.36170107372927, "learning_rate": 1.891447212780893e-05, "loss": 4.6219, "step": 438 }, { "epoch": 0.18, "grad_norm": 4.555620399748319, "learning_rate": 1.8908594410454068e-05, "loss": 4.8075, "step": 439 }, { "epoch": 0.18, "grad_norm": 4.765240769452751, "learning_rate": 1.8902701741577844e-05, "loss": 4.8486, "step": 440 }, { "epoch": 0.18, "grad_norm": 4.479000049549065, "learning_rate": 1.8896794131070073e-05, "loss": 4.8357, "step": 441 }, { "epoch": 0.18, "grad_norm": 4.846180547802962, "learning_rate": 1.8890871588845653e-05, "loss": 4.8586, "step": 442 }, { "epoch": 0.18, "grad_norm": 5.4394887924737105, "learning_rate": 1.8884934124844534e-05, "loss": 4.8759, "step": 443 }, { "epoch": 0.18, "grad_norm": 5.771942523822463, "learning_rate": 1.8878981749031718e-05, "loss": 4.6933, "step": 444 }, { "epoch": 0.18, "grad_norm": 5.222815576090352, "learning_rate": 1.8873014471397225e-05, "loss": 4.6762, "step": 445 }, { "epoch": 0.18, "grad_norm": 3.8429763054526256, "learning_rate": 1.886703230195609e-05, "loss": 4.963, "step": 446 }, { "epoch": 0.18, "grad_norm": 4.934157371040181, "learning_rate": 1.8861035250748343e-05, "loss": 4.8002, "step": 447 }, { "epoch": 0.18, "grad_norm": 4.717891500942569, "learning_rate": 1.8855023327838984e-05, "loss": 4.7741, "step": 448 }, { "epoch": 0.18, "grad_norm": 4.000372249774135, "learning_rate": 1.8848996543317982e-05, "loss": 4.6798, "step": 449 }, { "epoch": 0.18, "grad_norm": 3.7911306427773557, "learning_rate": 1.8842954907300236e-05, "loss": 4.7683, "step": 450 }, { "epoch": 0.18, "grad_norm": 5.059160382982438, "learning_rate": 1.8836898429925586e-05, "loss": 4.7103, "step": 451 }, { "epoch": 0.18, "grad_norm": 4.154690931845284, "learning_rate": 1.883082712135877e-05, "loss": 4.6826, "step": 452 }, { "epoch": 0.18, "grad_norm": 7.718997675526316, "learning_rate": 1.8824740991789417e-05, "loss": 4.6789, "step": 453 }, { "epoch": 0.18, "grad_norm": 3.9877134941447205, "learning_rate": 1.8818640051432036e-05, "loss": 4.7378, "step": 454 }, { "epoch": 0.18, "grad_norm": 6.986902956873382, "learning_rate": 1.881252431052599e-05, "loss": 4.8052, "step": 455 }, { "epoch": 0.18, "grad_norm": 3.581422466169945, "learning_rate": 1.8806393779335483e-05, "loss": 4.7852, "step": 456 }, { "epoch": 0.18, "grad_norm": 4.657535097217237, "learning_rate": 1.8800248468149545e-05, "loss": 4.8474, "step": 457 }, { "epoch": 0.18, "grad_norm": 4.965672082104629, "learning_rate": 1.8794088387282e-05, "loss": 4.7586, "step": 458 }, { "epoch": 0.18, "grad_norm": 4.277918670720924, "learning_rate": 1.8787913547071485e-05, "loss": 4.629, "step": 459 }, { "epoch": 0.18, "grad_norm": 7.159016365365489, "learning_rate": 1.8781723957881374e-05, "loss": 4.7081, "step": 460 }, { "epoch": 0.18, "grad_norm": 4.816953315704852, "learning_rate": 1.8775519630099822e-05, "loss": 4.5669, "step": 461 }, { "epoch": 0.18, "grad_norm": 5.130734125757001, "learning_rate": 1.876930057413971e-05, "loss": 4.598, "step": 462 }, { "epoch": 0.19, "grad_norm": 4.014308605597524, "learning_rate": 1.8763066800438638e-05, "loss": 4.8269, "step": 463 }, { "epoch": 0.19, "grad_norm": 4.896624107406017, "learning_rate": 1.875681831945891e-05, "loss": 4.5851, "step": 464 }, { "epoch": 0.19, "grad_norm": 4.525332134586662, "learning_rate": 1.87505551416875e-05, "loss": 4.662, "step": 465 }, { "epoch": 0.19, "grad_norm": 4.06267120156479, "learning_rate": 1.874427727763607e-05, "loss": 4.694, "step": 466 }, { "epoch": 0.19, "grad_norm": 3.9585474393739455, "learning_rate": 1.873798473784092e-05, "loss": 4.7157, "step": 467 }, { "epoch": 0.19, "grad_norm": 3.831951830757437, "learning_rate": 1.8731677532862975e-05, "loss": 4.8873, "step": 468 }, { "epoch": 0.19, "grad_norm": 4.762319748001275, "learning_rate": 1.872535567328778e-05, "loss": 4.5554, "step": 469 }, { "epoch": 0.19, "grad_norm": 3.9742662708177727, "learning_rate": 1.871901916972547e-05, "loss": 4.8751, "step": 470 }, { "epoch": 0.19, "grad_norm": 3.8286378361260334, "learning_rate": 1.8712668032810767e-05, "loss": 4.67, "step": 471 }, { "epoch": 0.19, "grad_norm": 3.917732126677209, "learning_rate": 1.870630227320294e-05, "loss": 4.8138, "step": 472 }, { "epoch": 0.19, "grad_norm": 3.9483842076440188, "learning_rate": 1.8699921901585814e-05, "loss": 4.6845, "step": 473 }, { "epoch": 0.19, "grad_norm": 3.8546046730369565, "learning_rate": 1.8693526928667724e-05, "loss": 4.5114, "step": 474 }, { "epoch": 0.19, "grad_norm": 4.077329695550013, "learning_rate": 1.8687117365181514e-05, "loss": 4.4996, "step": 475 }, { "epoch": 0.19, "grad_norm": 4.131904872521195, "learning_rate": 1.868069322188452e-05, "loss": 4.6157, "step": 476 }, { "epoch": 0.19, "grad_norm": 4.167070219544355, "learning_rate": 1.8674254509558544e-05, "loss": 4.635, "step": 477 }, { "epoch": 0.19, "grad_norm": 5.080283219503045, "learning_rate": 1.8667801239009845e-05, "loss": 4.7456, "step": 478 }, { "epoch": 0.19, "grad_norm": 4.617084882050906, "learning_rate": 1.866133342106911e-05, "loss": 4.6322, "step": 479 }, { "epoch": 0.19, "grad_norm": 3.877045676966755, "learning_rate": 1.865485106659145e-05, "loss": 4.6529, "step": 480 }, { "epoch": 0.19, "grad_norm": 4.8546137617071095, "learning_rate": 1.864835418645635e-05, "loss": 4.5812, "step": 481 }, { "epoch": 0.19, "grad_norm": 5.024800269628755, "learning_rate": 1.86418427915677e-05, "loss": 4.7275, "step": 482 }, { "epoch": 0.19, "grad_norm": 4.104346808360058, "learning_rate": 1.863531689285374e-05, "loss": 4.4275, "step": 483 }, { "epoch": 0.19, "grad_norm": 5.443182810817108, "learning_rate": 1.8628776501267052e-05, "loss": 4.5006, "step": 484 }, { "epoch": 0.19, "grad_norm": 3.7630454146072916, "learning_rate": 1.862222162778454e-05, "loss": 4.6852, "step": 485 }, { "epoch": 0.19, "grad_norm": 5.377530807213343, "learning_rate": 1.861565228340742e-05, "loss": 4.4133, "step": 486 }, { "epoch": 0.19, "grad_norm": 5.074254813435036, "learning_rate": 1.8609068479161182e-05, "loss": 4.7537, "step": 487 }, { "epoch": 0.2, "grad_norm": 7.8222231116074195, "learning_rate": 1.8602470226095602e-05, "loss": 4.5551, "step": 488 }, { "epoch": 0.2, "grad_norm": 5.37608449638734, "learning_rate": 1.8595857535284692e-05, "loss": 4.53, "step": 489 }, { "epoch": 0.2, "grad_norm": 5.128267316594894, "learning_rate": 1.85892304178267e-05, "loss": 4.6295, "step": 490 }, { "epoch": 0.2, "grad_norm": 4.860631729644693, "learning_rate": 1.8582588884844086e-05, "loss": 4.4502, "step": 491 }, { "epoch": 0.2, "grad_norm": 4.9510406392102295, "learning_rate": 1.8575932947483503e-05, "loss": 4.7843, "step": 492 }, { "epoch": 0.2, "grad_norm": 3.948182711081336, "learning_rate": 1.8569262616915784e-05, "loss": 4.7088, "step": 493 }, { "epoch": 0.2, "grad_norm": 4.802694876664713, "learning_rate": 1.8562577904335913e-05, "loss": 4.4809, "step": 494 }, { "epoch": 0.2, "grad_norm": 4.871236430723091, "learning_rate": 1.8555878820963014e-05, "loss": 4.5609, "step": 495 }, { "epoch": 0.2, "grad_norm": 4.297328298890337, "learning_rate": 1.8549165378040328e-05, "loss": 4.5167, "step": 496 }, { "epoch": 0.2, "grad_norm": 4.604119686459511, "learning_rate": 1.8542437586835202e-05, "loss": 4.7448, "step": 497 }, { "epoch": 0.2, "grad_norm": 4.557903280049274, "learning_rate": 1.8535695458639056e-05, "loss": 4.6444, "step": 498 }, { "epoch": 0.2, "grad_norm": 3.8352904144837736, "learning_rate": 1.8528939004767377e-05, "loss": 4.5181, "step": 499 }, { "epoch": 0.2, "grad_norm": 4.4315903519140605, "learning_rate": 1.8522168236559693e-05, "loss": 4.5232, "step": 500 }, { "epoch": 0.2, "grad_norm": 4.5003343526651225, "learning_rate": 1.851538316537956e-05, "loss": 4.5658, "step": 501 }, { "epoch": 0.2, "grad_norm": 4.655561193479631, "learning_rate": 1.8508583802614534e-05, "loss": 4.6049, "step": 502 }, { "epoch": 0.2, "grad_norm": 4.336819493633404, "learning_rate": 1.8501770159676157e-05, "loss": 4.385, "step": 503 }, { "epoch": 0.2, "grad_norm": 4.213922156906962, "learning_rate": 1.849494224799994e-05, "loss": 4.5663, "step": 504 }, { "epoch": 0.2, "grad_norm": 3.8801853326394733, "learning_rate": 1.8488100079045345e-05, "loss": 4.4578, "step": 505 }, { "epoch": 0.2, "grad_norm": 5.638341688055788, "learning_rate": 1.848124366429576e-05, "loss": 4.4374, "step": 506 }, { "epoch": 0.2, "grad_norm": 4.1259886569912405, "learning_rate": 1.8474373015258472e-05, "loss": 4.5498, "step": 507 }, { "epoch": 0.2, "grad_norm": 5.335260758484502, "learning_rate": 1.846748814346468e-05, "loss": 4.4976, "step": 508 }, { "epoch": 0.2, "grad_norm": 3.5227862557003005, "learning_rate": 1.846058906046943e-05, "loss": 4.4723, "step": 509 }, { "epoch": 0.2, "grad_norm": 4.391719508660204, "learning_rate": 1.8453675777851627e-05, "loss": 4.6382, "step": 510 }, { "epoch": 0.2, "grad_norm": 4.119916316504547, "learning_rate": 1.844674830721402e-05, "loss": 4.5931, "step": 511 }, { "epoch": 0.2, "grad_norm": 3.5972094158028476, "learning_rate": 1.843980666018315e-05, "loss": 4.4852, "step": 512 }, { "epoch": 0.21, "grad_norm": 5.437761861563065, "learning_rate": 1.8432850848409367e-05, "loss": 4.5205, "step": 513 }, { "epoch": 0.21, "grad_norm": 3.4619487319523112, "learning_rate": 1.8425880883566784e-05, "loss": 4.5162, "step": 514 }, { "epoch": 0.21, "grad_norm": 4.246560330909532, "learning_rate": 1.8418896777353272e-05, "loss": 4.6419, "step": 515 }, { "epoch": 0.21, "grad_norm": 4.504108212476313, "learning_rate": 1.8411898541490433e-05, "loss": 4.5368, "step": 516 }, { "epoch": 0.21, "grad_norm": 4.7748515692704, "learning_rate": 1.840488618772359e-05, "loss": 4.325, "step": 517 }, { "epoch": 0.21, "grad_norm": 4.400533553399993, "learning_rate": 1.8397859727821747e-05, "loss": 4.7751, "step": 518 }, { "epoch": 0.21, "grad_norm": 3.796804590072757, "learning_rate": 1.83908191735776e-05, "loss": 4.6476, "step": 519 }, { "epoch": 0.21, "grad_norm": 5.318892580222769, "learning_rate": 1.8383764536807486e-05, "loss": 4.7816, "step": 520 }, { "epoch": 0.21, "grad_norm": 3.755398814310214, "learning_rate": 1.8376695829351378e-05, "loss": 4.4902, "step": 521 }, { "epoch": 0.21, "grad_norm": 4.66174991561927, "learning_rate": 1.8369613063072875e-05, "loss": 4.5982, "step": 522 }, { "epoch": 0.21, "grad_norm": 5.16713458203689, "learning_rate": 1.8362516249859164e-05, "loss": 4.4873, "step": 523 }, { "epoch": 0.21, "grad_norm": 4.714662973498544, "learning_rate": 1.8355405401621e-05, "loss": 4.5149, "step": 524 }, { "epoch": 0.21, "grad_norm": 4.496778637749604, "learning_rate": 1.8348280530292712e-05, "loss": 4.5553, "step": 525 }, { "epoch": 0.21, "grad_norm": 3.908462628514215, "learning_rate": 1.834114164783215e-05, "loss": 4.5851, "step": 526 }, { "epoch": 0.21, "grad_norm": 4.0522977320295945, "learning_rate": 1.8333988766220676e-05, "loss": 4.526, "step": 527 }, { "epoch": 0.21, "grad_norm": 5.000357193034575, "learning_rate": 1.832682189746316e-05, "loss": 4.4757, "step": 528 }, { "epoch": 0.21, "grad_norm": 4.2730152597229, "learning_rate": 1.831964105358794e-05, "loss": 4.5307, "step": 529 }, { "epoch": 0.21, "grad_norm": 6.0852057209225, "learning_rate": 1.831244624664681e-05, "loss": 4.5722, "step": 530 }, { "epoch": 0.21, "grad_norm": 5.9744696789409755, "learning_rate": 1.8305237488714995e-05, "loss": 4.4842, "step": 531 }, { "epoch": 0.21, "grad_norm": 7.004849793326939, "learning_rate": 1.8298014791891138e-05, "loss": 4.3618, "step": 532 }, { "epoch": 0.21, "grad_norm": 4.7948194156817605, "learning_rate": 1.829077816829728e-05, "loss": 4.7279, "step": 533 }, { "epoch": 0.21, "grad_norm": 6.8258386178589605, "learning_rate": 1.8283527630078827e-05, "loss": 4.4468, "step": 534 }, { "epoch": 0.21, "grad_norm": 5.047435784425765, "learning_rate": 1.827626318940454e-05, "loss": 4.242, "step": 535 }, { "epoch": 0.21, "grad_norm": 4.245582218556195, "learning_rate": 1.8268984858466524e-05, "loss": 4.4191, "step": 536 }, { "epoch": 0.21, "grad_norm": 5.212873965419334, "learning_rate": 1.8261692649480174e-05, "loss": 4.3846, "step": 537 }, { "epoch": 0.22, "grad_norm": 3.973443022794257, "learning_rate": 1.8254386574684205e-05, "loss": 4.4775, "step": 538 }, { "epoch": 0.22, "grad_norm": 4.655778663416384, "learning_rate": 1.824706664634058e-05, "loss": 4.6469, "step": 539 }, { "epoch": 0.22, "grad_norm": 3.7745155639127477, "learning_rate": 1.8239732876734525e-05, "loss": 4.4211, "step": 540 }, { "epoch": 0.22, "grad_norm": 3.8928105110404876, "learning_rate": 1.823238527817449e-05, "loss": 4.5191, "step": 541 }, { "epoch": 0.22, "grad_norm": 4.012209741102411, "learning_rate": 1.822502386299214e-05, "loss": 4.7298, "step": 542 }, { "epoch": 0.22, "grad_norm": 4.09491355504511, "learning_rate": 1.8217648643542326e-05, "loss": 4.4712, "step": 543 }, { "epoch": 0.22, "grad_norm": 3.9623455755711756, "learning_rate": 1.8210259632203063e-05, "loss": 4.5201, "step": 544 }, { "epoch": 0.22, "grad_norm": 3.278338142567551, "learning_rate": 1.8202856841375517e-05, "loss": 4.5629, "step": 545 }, { "epoch": 0.22, "grad_norm": 3.470833412471676, "learning_rate": 1.819544028348399e-05, "loss": 4.5509, "step": 546 }, { "epoch": 0.22, "grad_norm": 3.557667082733272, "learning_rate": 1.818800997097587e-05, "loss": 4.4987, "step": 547 }, { "epoch": 0.22, "grad_norm": 3.307435410054455, "learning_rate": 1.8180565916321646e-05, "loss": 4.5655, "step": 548 }, { "epoch": 0.22, "grad_norm": 3.8047001558686695, "learning_rate": 1.817310813201486e-05, "loss": 4.647, "step": 549 }, { "epoch": 0.22, "grad_norm": 4.232758498965093, "learning_rate": 1.816563663057211e-05, "loss": 4.5642, "step": 550 }, { "epoch": 0.22, "grad_norm": 3.9694403182709084, "learning_rate": 1.8158151424533002e-05, "loss": 4.3808, "step": 551 }, { "epoch": 0.22, "grad_norm": 4.511990232061259, "learning_rate": 1.8150652526460146e-05, "loss": 4.6089, "step": 552 }, { "epoch": 0.22, "grad_norm": 5.589363784630558, "learning_rate": 1.8143139948939138e-05, "loss": 4.424, "step": 553 }, { "epoch": 0.22, "grad_norm": 5.391501846213585, "learning_rate": 1.8135613704578525e-05, "loss": 4.6266, "step": 554 }, { "epoch": 0.22, "grad_norm": 4.081782646799652, "learning_rate": 1.81280738060098e-05, "loss": 4.5131, "step": 555 }, { "epoch": 0.22, "grad_norm": 4.701357773481821, "learning_rate": 1.8120520265887364e-05, "loss": 4.7093, "step": 556 }, { "epoch": 0.22, "grad_norm": 4.560945380217301, "learning_rate": 1.8112953096888517e-05, "loss": 4.5045, "step": 557 }, { "epoch": 0.22, "grad_norm": 4.7367764771999035, "learning_rate": 1.810537231171343e-05, "loss": 4.4186, "step": 558 }, { "epoch": 0.22, "grad_norm": 4.383419123230545, "learning_rate": 1.809777792308513e-05, "loss": 4.4929, "step": 559 }, { "epoch": 0.22, "grad_norm": 5.130267653307888, "learning_rate": 1.8090169943749477e-05, "loss": 4.4062, "step": 560 }, { "epoch": 0.22, "grad_norm": 4.531928301060983, "learning_rate": 1.808254838647513e-05, "loss": 4.3487, "step": 561 }, { "epoch": 0.22, "grad_norm": 5.1958588839876905, "learning_rate": 1.8074913264053547e-05, "loss": 4.5398, "step": 562 }, { "epoch": 0.23, "grad_norm": 4.145503981737899, "learning_rate": 1.8067264589298945e-05, "loss": 4.6086, "step": 563 }, { "epoch": 0.23, "grad_norm": 5.874804141420842, "learning_rate": 1.8059602375048294e-05, "loss": 4.3948, "step": 564 }, { "epoch": 0.23, "grad_norm": 3.710773936481747, "learning_rate": 1.8051926634161282e-05, "loss": 4.4046, "step": 565 }, { "epoch": 0.23, "grad_norm": 3.7682629786468107, "learning_rate": 1.8044237379520305e-05, "loss": 4.5396, "step": 566 }, { "epoch": 0.23, "grad_norm": 3.9988341208652693, "learning_rate": 1.8036534624030428e-05, "loss": 4.5059, "step": 567 }, { "epoch": 0.23, "grad_norm": 4.393107693200345, "learning_rate": 1.802881838061939e-05, "loss": 4.3796, "step": 568 }, { "epoch": 0.23, "grad_norm": 5.574902532810956, "learning_rate": 1.802108866223755e-05, "loss": 4.3632, "step": 569 }, { "epoch": 0.23, "grad_norm": 3.8128396404739546, "learning_rate": 1.8013345481857903e-05, "loss": 4.3907, "step": 570 }, { "epoch": 0.23, "grad_norm": 4.33414503434105, "learning_rate": 1.8005588852476018e-05, "loss": 4.4354, "step": 571 }, { "epoch": 0.23, "grad_norm": 5.301332379657507, "learning_rate": 1.7997818787110043e-05, "loss": 4.4156, "step": 572 }, { "epoch": 0.23, "grad_norm": 3.718201275219629, "learning_rate": 1.7990035298800682e-05, "loss": 4.6335, "step": 573 }, { "epoch": 0.23, "grad_norm": 4.82328878051179, "learning_rate": 1.798223840061116e-05, "loss": 4.3818, "step": 574 }, { "epoch": 0.23, "grad_norm": 4.009302341530525, "learning_rate": 1.797442810562721e-05, "loss": 4.4935, "step": 575 }, { "epoch": 0.23, "grad_norm": 4.470726269769983, "learning_rate": 1.796660442695705e-05, "loss": 4.4081, "step": 576 }, { "epoch": 0.23, "grad_norm": 4.144773016409501, "learning_rate": 1.795876737773136e-05, "loss": 4.5555, "step": 577 }, { "epoch": 0.23, "grad_norm": 4.540527520249997, "learning_rate": 1.795091697110326e-05, "loss": 4.4273, "step": 578 }, { "epoch": 0.23, "grad_norm": 4.539566055605331, "learning_rate": 1.7943053220248284e-05, "loss": 4.583, "step": 579 }, { "epoch": 0.23, "grad_norm": 4.932252647457211, "learning_rate": 1.793517613836437e-05, "loss": 4.4988, "step": 580 }, { "epoch": 0.23, "grad_norm": 4.217312041012894, "learning_rate": 1.7927285738671825e-05, "loss": 4.4821, "step": 581 }, { "epoch": 0.23, "grad_norm": 4.185334769654126, "learning_rate": 1.7919382034413306e-05, "loss": 4.558, "step": 582 }, { "epoch": 0.23, "grad_norm": 4.583654330854395, "learning_rate": 1.7911465038853805e-05, "loss": 4.4156, "step": 583 }, { "epoch": 0.23, "grad_norm": 5.437316693787005, "learning_rate": 1.7903534765280616e-05, "loss": 4.3659, "step": 584 }, { "epoch": 0.23, "grad_norm": 5.442733348394627, "learning_rate": 1.7895591227003316e-05, "loss": 4.2313, "step": 585 }, { "epoch": 0.23, "grad_norm": 4.898831347856921, "learning_rate": 1.7887634437353754e-05, "loss": 4.3839, "step": 586 }, { "epoch": 0.23, "grad_norm": 4.187477019121502, "learning_rate": 1.7879664409686007e-05, "loss": 4.5723, "step": 587 }, { "epoch": 0.24, "grad_norm": 5.317936443725569, "learning_rate": 1.7871681157376382e-05, "loss": 4.2695, "step": 588 }, { "epoch": 0.24, "grad_norm": 4.427780735865234, "learning_rate": 1.7863684693823375e-05, "loss": 4.5148, "step": 589 }, { "epoch": 0.24, "grad_norm": 4.2148392567243835, "learning_rate": 1.7855675032447648e-05, "loss": 4.3736, "step": 590 }, { "epoch": 0.24, "grad_norm": 4.085066691271566, "learning_rate": 1.7847652186692025e-05, "loss": 4.4033, "step": 591 }, { "epoch": 0.24, "grad_norm": 4.106345509099876, "learning_rate": 1.7839616170021452e-05, "loss": 4.3073, "step": 592 }, { "epoch": 0.24, "grad_norm": 3.795043396096179, "learning_rate": 1.7831566995922983e-05, "loss": 4.4571, "step": 593 }, { "epoch": 0.24, "grad_norm": 5.040391570866654, "learning_rate": 1.782350467790575e-05, "loss": 4.5967, "step": 594 }, { "epoch": 0.24, "grad_norm": 5.172887527977414, "learning_rate": 1.7815429229500946e-05, "loss": 4.4391, "step": 595 }, { "epoch": 0.24, "grad_norm": 3.8558696182852272, "learning_rate": 1.78073406642618e-05, "loss": 4.5532, "step": 596 }, { "epoch": 0.24, "grad_norm": 5.324716483505234, "learning_rate": 1.779923899576357e-05, "loss": 4.3905, "step": 597 }, { "epoch": 0.24, "grad_norm": 3.876541883027683, "learning_rate": 1.7791124237603477e-05, "loss": 4.32, "step": 598 }, { "epoch": 0.24, "grad_norm": 3.7679255531997913, "learning_rate": 1.7782996403400737e-05, "loss": 4.3041, "step": 599 }, { "epoch": 0.24, "grad_norm": 5.222611374687213, "learning_rate": 1.7774855506796497e-05, "loss": 4.4244, "step": 600 }, { "epoch": 0.24, "grad_norm": 3.304244924899713, "learning_rate": 1.776670156145383e-05, "loss": 4.5282, "step": 601 }, { "epoch": 0.24, "grad_norm": 4.981595214348723, "learning_rate": 1.775853458105772e-05, "loss": 4.4572, "step": 602 }, { "epoch": 0.24, "grad_norm": 4.183377523359722, "learning_rate": 1.7750354579315004e-05, "loss": 4.6287, "step": 603 }, { "epoch": 0.24, "grad_norm": 3.5751970777478586, "learning_rate": 1.77421615699544e-05, "loss": 4.2279, "step": 604 }, { "epoch": 0.24, "grad_norm": 3.8162581902884876, "learning_rate": 1.7733955566726438e-05, "loss": 4.2113, "step": 605 }, { "epoch": 0.24, "grad_norm": 4.545507736912509, "learning_rate": 1.772573658340347e-05, "loss": 4.3691, "step": 606 }, { "epoch": 0.24, "grad_norm": 4.224629578143067, "learning_rate": 1.7717504633779618e-05, "loss": 4.3486, "step": 607 }, { "epoch": 0.24, "grad_norm": 4.610588665876807, "learning_rate": 1.7709259731670774e-05, "loss": 4.38, "step": 608 }, { "epoch": 0.24, "grad_norm": 5.70075196848019, "learning_rate": 1.770100189091457e-05, "loss": 4.1483, "step": 609 }, { "epoch": 0.24, "grad_norm": 3.2360932038131613, "learning_rate": 1.7692731125370355e-05, "loss": 4.3603, "step": 610 }, { "epoch": 0.24, "grad_norm": 3.8433889218472244, "learning_rate": 1.7684447448919156e-05, "loss": 4.4584, "step": 611 }, { "epoch": 0.24, "grad_norm": 4.286041363522565, "learning_rate": 1.7676150875463688e-05, "loss": 4.3522, "step": 612 }, { "epoch": 0.25, "grad_norm": 3.324312507834647, "learning_rate": 1.7667841418928292e-05, "loss": 4.34, "step": 613 }, { "epoch": 0.25, "grad_norm": 3.826648128672422, "learning_rate": 1.765951909325895e-05, "loss": 4.3366, "step": 614 }, { "epoch": 0.25, "grad_norm": 4.326699626627627, "learning_rate": 1.7651183912423228e-05, "loss": 4.22, "step": 615 }, { "epoch": 0.25, "grad_norm": 3.502208687825821, "learning_rate": 1.764283589041028e-05, "loss": 4.434, "step": 616 }, { "epoch": 0.25, "grad_norm": 3.9726600059699697, "learning_rate": 1.7634475041230796e-05, "loss": 4.4077, "step": 617 }, { "epoch": 0.25, "grad_norm": 4.794767934344185, "learning_rate": 1.7626101378917004e-05, "loss": 4.3801, "step": 618 }, { "epoch": 0.25, "grad_norm": 3.828415872023352, "learning_rate": 1.761771491752264e-05, "loss": 4.5012, "step": 619 }, { "epoch": 0.25, "grad_norm": 3.864618059858188, "learning_rate": 1.7609315671122912e-05, "loss": 4.1654, "step": 620 }, { "epoch": 0.25, "grad_norm": 3.5444444341424144, "learning_rate": 1.760090365381449e-05, "loss": 4.2813, "step": 621 }, { "epoch": 0.25, "grad_norm": 3.0754856971618674, "learning_rate": 1.759247887971548e-05, "loss": 4.4173, "step": 622 }, { "epoch": 0.25, "grad_norm": 4.051093782169358, "learning_rate": 1.7584041362965397e-05, "loss": 4.2801, "step": 623 }, { "epoch": 0.25, "grad_norm": 3.703146776128272, "learning_rate": 1.7575591117725132e-05, "loss": 4.2089, "step": 624 }, { "epoch": 0.25, "grad_norm": 3.194884286403555, "learning_rate": 1.7567128158176955e-05, "loss": 4.4067, "step": 625 }, { "epoch": 0.25, "grad_norm": 3.7197087523579073, "learning_rate": 1.7558652498524464e-05, "loss": 4.5624, "step": 626 }, { "epoch": 0.25, "grad_norm": 3.9997767290885187, "learning_rate": 1.7550164152992573e-05, "loss": 3.9206, "step": 627 }, { "epoch": 0.25, "grad_norm": 3.723995563236504, "learning_rate": 1.7541663135827493e-05, "loss": 4.2779, "step": 628 }, { "epoch": 0.25, "grad_norm": 3.8566179329855492, "learning_rate": 1.75331494612967e-05, "loss": 4.4607, "step": 629 }, { "epoch": 0.25, "grad_norm": 3.6837802961843953, "learning_rate": 1.7524623143688905e-05, "loss": 4.3756, "step": 630 }, { "epoch": 0.25, "grad_norm": 3.3643578361540993, "learning_rate": 1.7516084197314044e-05, "loss": 4.3226, "step": 631 }, { "epoch": 0.25, "grad_norm": 4.13921138996773, "learning_rate": 1.7507532636503256e-05, "loss": 3.9717, "step": 632 }, { "epoch": 0.25, "grad_norm": 3.7857046012154534, "learning_rate": 1.749896847560884e-05, "loss": 4.2997, "step": 633 }, { "epoch": 0.25, "grad_norm": 3.4312849218009567, "learning_rate": 1.7490391729004242e-05, "loss": 4.3715, "step": 634 }, { "epoch": 0.25, "grad_norm": 5.031659106872237, "learning_rate": 1.748180241108404e-05, "loss": 4.4076, "step": 635 }, { "epoch": 0.25, "grad_norm": 3.29646662187367, "learning_rate": 1.7473200536263905e-05, "loss": 4.3464, "step": 636 }, { "epoch": 0.25, "grad_norm": 4.453734263665285, "learning_rate": 1.746458611898058e-05, "loss": 4.497, "step": 637 }, { "epoch": 0.26, "grad_norm": 4.267571577595243, "learning_rate": 1.7455959173691863e-05, "loss": 4.326, "step": 638 }, { "epoch": 0.26, "grad_norm": 4.037988035302162, "learning_rate": 1.744731971487658e-05, "loss": 4.2036, "step": 639 }, { "epoch": 0.26, "grad_norm": 5.6499546934773, "learning_rate": 1.7438667757034547e-05, "loss": 4.1384, "step": 640 }, { "epoch": 0.26, "grad_norm": 4.193992253945382, "learning_rate": 1.743000331468657e-05, "loss": 4.1671, "step": 641 }, { "epoch": 0.26, "grad_norm": 3.9238693224223846, "learning_rate": 1.7421326402374406e-05, "loss": 4.2942, "step": 642 }, { "epoch": 0.26, "grad_norm": 3.7221881431043866, "learning_rate": 1.7412637034660735e-05, "loss": 4.2449, "step": 643 }, { "epoch": 0.26, "grad_norm": 3.33969588119697, "learning_rate": 1.740393522612915e-05, "loss": 4.397, "step": 644 }, { "epoch": 0.26, "grad_norm": 3.0812195963488773, "learning_rate": 1.739522099138411e-05, "loss": 4.3136, "step": 645 }, { "epoch": 0.26, "grad_norm": 4.248841778221737, "learning_rate": 1.7386494345050944e-05, "loss": 4.2977, "step": 646 }, { "epoch": 0.26, "grad_norm": 3.4512523414792726, "learning_rate": 1.73777553017758e-05, "loss": 4.181, "step": 647 }, { "epoch": 0.26, "grad_norm": 4.056102712659825, "learning_rate": 1.7369003876225644e-05, "loss": 4.262, "step": 648 }, { "epoch": 0.26, "grad_norm": 3.7592475887210575, "learning_rate": 1.7360240083088213e-05, "loss": 4.2013, "step": 649 }, { "epoch": 0.26, "grad_norm": 3.661785505723803, "learning_rate": 1.7351463937072008e-05, "loss": 4.4768, "step": 650 }, { "epoch": 0.26, "grad_norm": 3.6526178793870967, "learning_rate": 1.734267545290625e-05, "loss": 4.1434, "step": 651 }, { "epoch": 0.26, "grad_norm": 3.362778318040067, "learning_rate": 1.7333874645340886e-05, "loss": 4.5816, "step": 652 }, { "epoch": 0.26, "grad_norm": 3.6785134226263883, "learning_rate": 1.7325061529146528e-05, "loss": 4.1977, "step": 653 }, { "epoch": 0.26, "grad_norm": 3.6203914812532583, "learning_rate": 1.7316236119114466e-05, "loss": 4.1502, "step": 654 }, { "epoch": 0.26, "grad_norm": 3.909042527107989, "learning_rate": 1.7307398430056595e-05, "loss": 4.3402, "step": 655 }, { "epoch": 0.26, "grad_norm": 4.343420425268675, "learning_rate": 1.7298548476805446e-05, "loss": 4.1416, "step": 656 }, { "epoch": 0.26, "grad_norm": 3.8231798578591825, "learning_rate": 1.7289686274214116e-05, "loss": 4.4234, "step": 657 }, { "epoch": 0.26, "grad_norm": 4.104953811214553, "learning_rate": 1.7280811837156268e-05, "loss": 4.3791, "step": 658 }, { "epoch": 0.26, "grad_norm": 4.006233314995586, "learning_rate": 1.7271925180526094e-05, "loss": 4.1491, "step": 659 }, { "epoch": 0.26, "grad_norm": 3.493796073974652, "learning_rate": 1.72630263192383e-05, "loss": 4.0866, "step": 660 }, { "epoch": 0.26, "grad_norm": 5.5861790784869365, "learning_rate": 1.7254115268228073e-05, "loss": 4.1643, "step": 661 }, { "epoch": 0.26, "grad_norm": 4.094790170581724, "learning_rate": 1.724519204245105e-05, "loss": 4.3313, "step": 662 }, { "epoch": 0.27, "grad_norm": 4.8169362681688614, "learning_rate": 1.723625665688331e-05, "loss": 4.2777, "step": 663 }, { "epoch": 0.27, "grad_norm": 3.5415226420284696, "learning_rate": 1.7227309126521347e-05, "loss": 4.2133, "step": 664 }, { "epoch": 0.27, "grad_norm": 3.447252681075696, "learning_rate": 1.7218349466382024e-05, "loss": 4.4371, "step": 665 }, { "epoch": 0.27, "grad_norm": 3.857931839153315, "learning_rate": 1.7209377691502565e-05, "loss": 4.3564, "step": 666 }, { "epoch": 0.27, "grad_norm": 3.868644542943814, "learning_rate": 1.720039381694053e-05, "loss": 4.3347, "step": 667 }, { "epoch": 0.27, "grad_norm": 3.9872569129304427, "learning_rate": 1.7191397857773787e-05, "loss": 4.1822, "step": 668 }, { "epoch": 0.27, "grad_norm": 4.377738459178347, "learning_rate": 1.7182389829100484e-05, "loss": 4.2361, "step": 669 }, { "epoch": 0.27, "grad_norm": 4.526630194902284, "learning_rate": 1.7173369746039026e-05, "loss": 4.2334, "step": 670 }, { "epoch": 0.27, "grad_norm": 3.903151798762359, "learning_rate": 1.7164337623728044e-05, "loss": 4.2507, "step": 671 }, { "epoch": 0.27, "grad_norm": 3.416704913811221, "learning_rate": 1.7155293477326385e-05, "loss": 4.3865, "step": 672 }, { "epoch": 0.27, "grad_norm": 4.668815207748128, "learning_rate": 1.714623732201307e-05, "loss": 4.1203, "step": 673 }, { "epoch": 0.27, "grad_norm": 4.835149742451225, "learning_rate": 1.713716917298727e-05, "loss": 4.1392, "step": 674 }, { "epoch": 0.27, "grad_norm": 4.881418868309846, "learning_rate": 1.7128089045468294e-05, "loss": 4.2875, "step": 675 }, { "epoch": 0.27, "grad_norm": 5.034412844093579, "learning_rate": 1.7118996954695553e-05, "loss": 4.0848, "step": 676 }, { "epoch": 0.27, "grad_norm": 4.518013043332243, "learning_rate": 1.7109892915928535e-05, "loss": 4.3151, "step": 677 }, { "epoch": 0.27, "grad_norm": 4.478304353081878, "learning_rate": 1.7100776944446783e-05, "loss": 4.3224, "step": 678 }, { "epoch": 0.27, "grad_norm": 5.048591652502835, "learning_rate": 1.709164905554986e-05, "loss": 4.0666, "step": 679 }, { "epoch": 0.27, "grad_norm": 5.0357109087365535, "learning_rate": 1.7082509264557333e-05, "loss": 4.0443, "step": 680 }, { "epoch": 0.27, "grad_norm": 4.3409258985175585, "learning_rate": 1.7073357586808753e-05, "loss": 4.2371, "step": 681 }, { "epoch": 0.27, "grad_norm": 4.582253486092752, "learning_rate": 1.706419403766361e-05, "loss": 4.077, "step": 682 }, { "epoch": 0.27, "grad_norm": 4.801114373409387, "learning_rate": 1.7055018632501326e-05, "loss": 4.2553, "step": 683 }, { "epoch": 0.27, "grad_norm": 3.8747080745349978, "learning_rate": 1.7045831386721213e-05, "loss": 4.1561, "step": 684 }, { "epoch": 0.27, "grad_norm": 4.696948918314867, "learning_rate": 1.7036632315742464e-05, "loss": 4.2312, "step": 685 }, { "epoch": 0.27, "grad_norm": 5.258212104642618, "learning_rate": 1.7027421435004114e-05, "loss": 4.1866, "step": 686 }, { "epoch": 0.27, "grad_norm": 5.388331452291105, "learning_rate": 1.7018198759965018e-05, "loss": 4.1992, "step": 687 }, { "epoch": 0.28, "grad_norm": 5.647950158243207, "learning_rate": 1.7008964306103823e-05, "loss": 4.3468, "step": 688 }, { "epoch": 0.28, "grad_norm": 5.168886137147408, "learning_rate": 1.6999718088918956e-05, "loss": 4.0308, "step": 689 }, { "epoch": 0.28, "grad_norm": 4.357151028206129, "learning_rate": 1.6990460123928577e-05, "loss": 4.3588, "step": 690 }, { "epoch": 0.28, "grad_norm": 4.530147627269544, "learning_rate": 1.698119042667056e-05, "loss": 4.3144, "step": 691 }, { "epoch": 0.28, "grad_norm": 4.817151116073827, "learning_rate": 1.6971909012702483e-05, "loss": 4.2138, "step": 692 }, { "epoch": 0.28, "grad_norm": 5.092987342888455, "learning_rate": 1.6962615897601573e-05, "loss": 4.2911, "step": 693 }, { "epoch": 0.28, "grad_norm": 5.613134484273773, "learning_rate": 1.6953311096964706e-05, "loss": 4.3282, "step": 694 }, { "epoch": 0.28, "grad_norm": 5.173274414167209, "learning_rate": 1.6943994626408365e-05, "loss": 4.0758, "step": 695 }, { "epoch": 0.28, "grad_norm": 4.190457374578632, "learning_rate": 1.6934666501568618e-05, "loss": 4.3804, "step": 696 }, { "epoch": 0.28, "grad_norm": 5.601597668292631, "learning_rate": 1.69253267381011e-05, "loss": 4.3887, "step": 697 }, { "epoch": 0.28, "grad_norm": 4.210265170570191, "learning_rate": 1.6915975351680968e-05, "loss": 4.1695, "step": 698 }, { "epoch": 0.28, "grad_norm": 4.137670168126125, "learning_rate": 1.69066123580029e-05, "loss": 4.3003, "step": 699 }, { "epoch": 0.28, "grad_norm": 3.747988286996686, "learning_rate": 1.6897237772781046e-05, "loss": 4.4487, "step": 700 }, { "epoch": 0.28, "grad_norm": 4.594341934110022, "learning_rate": 1.6887851611749005e-05, "loss": 4.1977, "step": 701 }, { "epoch": 0.28, "grad_norm": 4.693710767713602, "learning_rate": 1.6878453890659815e-05, "loss": 4.4077, "step": 702 }, { "epoch": 0.28, "grad_norm": 3.9460120139260026, "learning_rate": 1.686904462528591e-05, "loss": 4.3575, "step": 703 }, { "epoch": 0.28, "grad_norm": 3.903534992356641, "learning_rate": 1.68596238314191e-05, "loss": 4.2751, "step": 704 }, { "epoch": 0.28, "grad_norm": 3.6197148558188004, "learning_rate": 1.6850191524870548e-05, "loss": 4.3067, "step": 705 }, { "epoch": 0.28, "grad_norm": 6.013695095794239, "learning_rate": 1.6840747721470733e-05, "loss": 4.3629, "step": 706 }, { "epoch": 0.28, "grad_norm": 5.793940986645831, "learning_rate": 1.6831292437069425e-05, "loss": 4.1589, "step": 707 }, { "epoch": 0.28, "grad_norm": 5.068061320502639, "learning_rate": 1.6821825687535675e-05, "loss": 4.3191, "step": 708 }, { "epoch": 0.28, "grad_norm": 4.835071559565869, "learning_rate": 1.6812347488757774e-05, "loss": 4.3855, "step": 709 }, { "epoch": 0.28, "grad_norm": 5.352222600656303, "learning_rate": 1.6802857856643214e-05, "loss": 4.1151, "step": 710 }, { "epoch": 0.28, "grad_norm": 5.223271549995184, "learning_rate": 1.6793356807118695e-05, "loss": 4.2358, "step": 711 }, { "epoch": 0.28, "grad_norm": 5.697294907663005, "learning_rate": 1.6783844356130073e-05, "loss": 4.0528, "step": 712 }, { "epoch": 0.29, "grad_norm": 5.019657412383279, "learning_rate": 1.677432051964233e-05, "loss": 4.3906, "step": 713 }, { "epoch": 0.29, "grad_norm": 4.330982781472389, "learning_rate": 1.6764785313639568e-05, "loss": 4.2145, "step": 714 }, { "epoch": 0.29, "grad_norm": 5.159400687537892, "learning_rate": 1.6755238754124965e-05, "loss": 4.15, "step": 715 }, { "epoch": 0.29, "grad_norm": 4.743019702246863, "learning_rate": 1.6745680857120757e-05, "loss": 4.3182, "step": 716 }, { "epoch": 0.29, "grad_norm": 5.376804285660271, "learning_rate": 1.6736111638668203e-05, "loss": 4.1689, "step": 717 }, { "epoch": 0.29, "grad_norm": 4.358221275967151, "learning_rate": 1.6726531114827572e-05, "loss": 4.1588, "step": 718 }, { "epoch": 0.29, "grad_norm": 4.0442497727339966, "learning_rate": 1.6716939301678098e-05, "loss": 4.4582, "step": 719 }, { "epoch": 0.29, "grad_norm": 6.046366033000383, "learning_rate": 1.6707336215317968e-05, "loss": 4.2036, "step": 720 }, { "epoch": 0.29, "grad_norm": 4.445621373056556, "learning_rate": 1.6697721871864286e-05, "loss": 4.348, "step": 721 }, { "epoch": 0.29, "grad_norm": 3.8971967069923776, "learning_rate": 1.6688096287453048e-05, "loss": 4.1313, "step": 722 }, { "epoch": 0.29, "grad_norm": 3.9511536886764964, "learning_rate": 1.6678459478239116e-05, "loss": 4.1277, "step": 723 }, { "epoch": 0.29, "grad_norm": 3.7444284748312713, "learning_rate": 1.6668811460396202e-05, "loss": 4.3639, "step": 724 }, { "epoch": 0.29, "grad_norm": 4.535042203472182, "learning_rate": 1.665915225011681e-05, "loss": 4.2301, "step": 725 }, { "epoch": 0.29, "grad_norm": 4.344050025912771, "learning_rate": 1.664948186361225e-05, "loss": 4.084, "step": 726 }, { "epoch": 0.29, "grad_norm": 4.583266578664799, "learning_rate": 1.663980031711257e-05, "loss": 4.1723, "step": 727 }, { "epoch": 0.29, "grad_norm": 4.094665681041583, "learning_rate": 1.6630107626866558e-05, "loss": 4.3757, "step": 728 }, { "epoch": 0.29, "grad_norm": 4.904919081388526, "learning_rate": 1.6620403809141707e-05, "loss": 4.1186, "step": 729 }, { "epoch": 0.29, "grad_norm": 4.399804648357315, "learning_rate": 1.6610688880224178e-05, "loss": 4.0947, "step": 730 }, { "epoch": 0.29, "grad_norm": 3.923219829805552, "learning_rate": 1.6600962856418782e-05, "loss": 4.335, "step": 731 }, { "epoch": 0.29, "grad_norm": 4.427663133020312, "learning_rate": 1.6591225754048963e-05, "loss": 4.1266, "step": 732 }, { "epoch": 0.29, "grad_norm": 5.097746227281494, "learning_rate": 1.6581477589456737e-05, "loss": 4.1285, "step": 733 }, { "epoch": 0.29, "grad_norm": 4.745457635840251, "learning_rate": 1.6571718379002705e-05, "loss": 4.3592, "step": 734 }, { "epoch": 0.29, "grad_norm": 3.758424791186076, "learning_rate": 1.6561948139065997e-05, "loss": 3.9558, "step": 735 }, { "epoch": 0.29, "grad_norm": 4.6602131909735185, "learning_rate": 1.6552166886044253e-05, "loss": 4.2786, "step": 736 }, { "epoch": 0.29, "grad_norm": 4.331991964126386, "learning_rate": 1.6542374636353605e-05, "loss": 4.205, "step": 737 }, { "epoch": 0.3, "grad_norm": 4.742271026876533, "learning_rate": 1.653257140642863e-05, "loss": 4.1134, "step": 738 }, { "epoch": 0.3, "grad_norm": 4.128592064049964, "learning_rate": 1.6522757212722346e-05, "loss": 4.2645, "step": 739 }, { "epoch": 0.3, "grad_norm": 4.128026840094599, "learning_rate": 1.6512932071706153e-05, "loss": 4.1295, "step": 740 }, { "epoch": 0.3, "grad_norm": 3.73216243413116, "learning_rate": 1.650309599986985e-05, "loss": 4.3763, "step": 741 }, { "epoch": 0.3, "grad_norm": 4.406532996935937, "learning_rate": 1.6493249013721558e-05, "loss": 4.2893, "step": 742 }, { "epoch": 0.3, "grad_norm": 4.892317898810144, "learning_rate": 1.6483391129787725e-05, "loss": 4.0252, "step": 743 }, { "epoch": 0.3, "grad_norm": 3.655969865077262, "learning_rate": 1.64735223646131e-05, "loss": 4.0342, "step": 744 }, { "epoch": 0.3, "grad_norm": 4.237746678866634, "learning_rate": 1.646364273476067e-05, "loss": 4.2777, "step": 745 }, { "epoch": 0.3, "grad_norm": 4.7276325571184055, "learning_rate": 1.6453752256811676e-05, "loss": 4.1562, "step": 746 }, { "epoch": 0.3, "grad_norm": 5.040221659089892, "learning_rate": 1.644385094736556e-05, "loss": 4.0892, "step": 747 }, { "epoch": 0.3, "grad_norm": 4.219234733858174, "learning_rate": 1.6433938823039942e-05, "loss": 4.2405, "step": 748 }, { "epoch": 0.3, "grad_norm": 4.014880781194069, "learning_rate": 1.642401590047059e-05, "loss": 4.1244, "step": 749 }, { "epoch": 0.3, "grad_norm": 5.603922662946277, "learning_rate": 1.6414082196311402e-05, "loss": 4.1623, "step": 750 }, { "epoch": 0.3, "grad_norm": 4.169752318365569, "learning_rate": 1.6404137727234366e-05, "loss": 4.1791, "step": 751 }, { "epoch": 0.3, "grad_norm": 4.800322679915261, "learning_rate": 1.639418250992954e-05, "loss": 4.0497, "step": 752 }, { "epoch": 0.3, "grad_norm": 4.819362617310608, "learning_rate": 1.6384216561105014e-05, "loss": 4.2569, "step": 753 }, { "epoch": 0.3, "grad_norm": 3.8943247801354315, "learning_rate": 1.63742398974869e-05, "loss": 4.2013, "step": 754 }, { "epoch": 0.3, "grad_norm": 4.237703478343301, "learning_rate": 1.6364252535819284e-05, "loss": 4.2323, "step": 755 }, { "epoch": 0.3, "grad_norm": 4.289135372736193, "learning_rate": 1.635425449286421e-05, "loss": 4.2226, "step": 756 }, { "epoch": 0.3, "grad_norm": 4.03279421005735, "learning_rate": 1.6344245785401653e-05, "loss": 4.0047, "step": 757 }, { "epoch": 0.3, "grad_norm": 4.12143068811916, "learning_rate": 1.6334226430229475e-05, "loss": 4.1518, "step": 758 }, { "epoch": 0.3, "grad_norm": 4.045018027459975, "learning_rate": 1.632419644416342e-05, "loss": 3.9894, "step": 759 }, { "epoch": 0.3, "grad_norm": 3.3634370875446935, "learning_rate": 1.6314155844037074e-05, "loss": 4.1409, "step": 760 }, { "epoch": 0.3, "grad_norm": 4.424185767990525, "learning_rate": 1.6304104646701818e-05, "loss": 4.0657, "step": 761 }, { "epoch": 0.3, "grad_norm": 3.8362370349072434, "learning_rate": 1.629404286902685e-05, "loss": 4.2315, "step": 762 }, { "epoch": 0.31, "grad_norm": 4.054880336997564, "learning_rate": 1.62839705278991e-05, "loss": 4.1762, "step": 763 }, { "epoch": 0.31, "grad_norm": 3.455945336863922, "learning_rate": 1.627388764022323e-05, "loss": 3.9945, "step": 764 }, { "epoch": 0.31, "grad_norm": 4.089347168700492, "learning_rate": 1.626379422292162e-05, "loss": 4.1632, "step": 765 }, { "epoch": 0.31, "grad_norm": 3.9432572091307634, "learning_rate": 1.6253690292934303e-05, "loss": 4.1629, "step": 766 }, { "epoch": 0.31, "grad_norm": 3.0237380868373296, "learning_rate": 1.624357586721896e-05, "loss": 4.3338, "step": 767 }, { "epoch": 0.31, "grad_norm": 3.6907839068821535, "learning_rate": 1.6233450962750895e-05, "loss": 4.0344, "step": 768 }, { "epoch": 0.31, "grad_norm": 4.241780550240553, "learning_rate": 1.622331559652299e-05, "loss": 4.1167, "step": 769 }, { "epoch": 0.31, "grad_norm": 3.354524584576279, "learning_rate": 1.6213169785545688e-05, "loss": 4.3221, "step": 770 }, { "epoch": 0.31, "grad_norm": 3.787194573086892, "learning_rate": 1.6203013546846967e-05, "loss": 4.0784, "step": 771 }, { "epoch": 0.31, "grad_norm": 4.173642942286069, "learning_rate": 1.61928468974723e-05, "loss": 4.1752, "step": 772 }, { "epoch": 0.31, "grad_norm": 3.9260357041238607, "learning_rate": 1.618266985448463e-05, "loss": 4.2805, "step": 773 }, { "epoch": 0.31, "grad_norm": 5.923440227675206, "learning_rate": 1.6172482434964353e-05, "loss": 4.0797, "step": 774 }, { "epoch": 0.31, "grad_norm": 5.2466075816715, "learning_rate": 1.6162284656009276e-05, "loss": 4.2478, "step": 775 }, { "epoch": 0.31, "grad_norm": 4.969458945548505, "learning_rate": 1.6152076534734585e-05, "loss": 4.2096, "step": 776 }, { "epoch": 0.31, "grad_norm": 6.042108075929687, "learning_rate": 1.6141858088272838e-05, "loss": 3.8777, "step": 777 }, { "epoch": 0.31, "grad_norm": 3.9542589642669697, "learning_rate": 1.6131629333773908e-05, "loss": 4.1506, "step": 778 }, { "epoch": 0.31, "grad_norm": 4.219621112468623, "learning_rate": 1.612139028840498e-05, "loss": 4.0092, "step": 779 }, { "epoch": 0.31, "grad_norm": 3.6474863522574874, "learning_rate": 1.6111140969350504e-05, "loss": 4.1307, "step": 780 }, { "epoch": 0.31, "grad_norm": 3.5346835812296193, "learning_rate": 1.610088139381217e-05, "loss": 4.1175, "step": 781 }, { "epoch": 0.31, "grad_norm": 3.725366186832991, "learning_rate": 1.609061157900889e-05, "loss": 4.0711, "step": 782 }, { "epoch": 0.31, "grad_norm": 4.667673965148235, "learning_rate": 1.6080331542176754e-05, "loss": 3.918, "step": 783 }, { "epoch": 0.31, "grad_norm": 4.085941511402021, "learning_rate": 1.6070041300569014e-05, "loss": 3.9859, "step": 784 }, { "epoch": 0.31, "grad_norm": 4.663966021746003, "learning_rate": 1.6059740871456035e-05, "loss": 3.9296, "step": 785 }, { "epoch": 0.31, "grad_norm": 4.171734077729142, "learning_rate": 1.60494302721253e-05, "loss": 4.25, "step": 786 }, { "epoch": 0.31, "grad_norm": 3.470608583836035, "learning_rate": 1.603910951988135e-05, "loss": 4.1841, "step": 787 }, { "epoch": 0.32, "grad_norm": 4.944785987346097, "learning_rate": 1.602877863204576e-05, "loss": 3.9134, "step": 788 }, { "epoch": 0.32, "grad_norm": 3.679669019856182, "learning_rate": 1.6018437625957135e-05, "loss": 4.1185, "step": 789 }, { "epoch": 0.32, "grad_norm": 4.8456129680158755, "learning_rate": 1.6008086518971037e-05, "loss": 3.9203, "step": 790 }, { "epoch": 0.32, "grad_norm": 4.038185112299229, "learning_rate": 1.599772532846e-05, "loss": 4.1326, "step": 791 }, { "epoch": 0.32, "grad_norm": 4.110740634200455, "learning_rate": 1.598735407181347e-05, "loss": 4.188, "step": 792 }, { "epoch": 0.32, "grad_norm": 4.088496773536847, "learning_rate": 1.5976972766437796e-05, "loss": 3.9539, "step": 793 }, { "epoch": 0.32, "grad_norm": 3.548087741743393, "learning_rate": 1.596658142975618e-05, "loss": 4.0481, "step": 794 }, { "epoch": 0.32, "grad_norm": 3.368884980792822, "learning_rate": 1.5956180079208684e-05, "loss": 4.162, "step": 795 }, { "epoch": 0.32, "grad_norm": 3.320305188255113, "learning_rate": 1.5945768732252144e-05, "loss": 4.143, "step": 796 }, { "epoch": 0.32, "grad_norm": 3.9640549005931587, "learning_rate": 1.5935347406360192e-05, "loss": 3.9122, "step": 797 }, { "epoch": 0.32, "grad_norm": 4.222874057950338, "learning_rate": 1.5924916119023214e-05, "loss": 4.2344, "step": 798 }, { "epoch": 0.32, "grad_norm": 3.5720174004747354, "learning_rate": 1.5914474887748297e-05, "loss": 4.1743, "step": 799 }, { "epoch": 0.32, "grad_norm": 4.231737588287248, "learning_rate": 1.5904023730059227e-05, "loss": 4.0582, "step": 800 }, { "epoch": 0.32, "grad_norm": 3.5855444742625484, "learning_rate": 1.589356266349645e-05, "loss": 4.1918, "step": 801 }, { "epoch": 0.32, "grad_norm": 3.6077052786289086, "learning_rate": 1.5883091705617045e-05, "loss": 3.9826, "step": 802 }, { "epoch": 0.32, "grad_norm": 3.462258780982009, "learning_rate": 1.5872610873994685e-05, "loss": 4.2167, "step": 803 }, { "epoch": 0.32, "grad_norm": 3.915782876006761, "learning_rate": 1.5862120186219614e-05, "loss": 4.0085, "step": 804 }, { "epoch": 0.32, "grad_norm": 4.072359468444531, "learning_rate": 1.5851619659898623e-05, "loss": 3.8627, "step": 805 }, { "epoch": 0.32, "grad_norm": 3.9217692589848068, "learning_rate": 1.5841109312655017e-05, "loss": 3.9516, "step": 806 }, { "epoch": 0.32, "grad_norm": 4.086637237247661, "learning_rate": 1.5830589162128574e-05, "loss": 4.0158, "step": 807 }, { "epoch": 0.32, "grad_norm": 4.674894390048435, "learning_rate": 1.582005922597553e-05, "loss": 3.9575, "step": 808 }, { "epoch": 0.32, "grad_norm": 4.18175058284887, "learning_rate": 1.580951952186856e-05, "loss": 4.0599, "step": 809 }, { "epoch": 0.32, "grad_norm": 3.8499748638554805, "learning_rate": 1.57989700674967e-05, "loss": 4.0656, "step": 810 }, { "epoch": 0.32, "grad_norm": 3.716337060023653, "learning_rate": 1.578841088056538e-05, "loss": 4.157, "step": 811 }, { "epoch": 0.32, "grad_norm": 4.305254563957067, "learning_rate": 1.5777841978796348e-05, "loss": 4.1538, "step": 812 }, { "epoch": 0.33, "grad_norm": 4.559347319241137, "learning_rate": 1.5767263379927663e-05, "loss": 3.9864, "step": 813 }, { "epoch": 0.33, "grad_norm": 3.9159846433046575, "learning_rate": 1.5756675101713657e-05, "loss": 4.1094, "step": 814 }, { "epoch": 0.33, "grad_norm": 4.299425354443105, "learning_rate": 1.5746077161924905e-05, "loss": 4.1367, "step": 815 }, { "epoch": 0.33, "grad_norm": 4.316883826806145, "learning_rate": 1.573546957834821e-05, "loss": 4.0427, "step": 816 }, { "epoch": 0.33, "grad_norm": 3.878824079534932, "learning_rate": 1.572485236878654e-05, "loss": 4.1057, "step": 817 }, { "epoch": 0.33, "grad_norm": 5.495346619133286, "learning_rate": 1.5714225551059027e-05, "loss": 4.0545, "step": 818 }, { "epoch": 0.33, "grad_norm": 4.9758999194804545, "learning_rate": 1.570358914300094e-05, "loss": 4.0423, "step": 819 }, { "epoch": 0.33, "grad_norm": 3.604552161772746, "learning_rate": 1.5692943162463628e-05, "loss": 3.9757, "step": 820 }, { "epoch": 0.33, "grad_norm": 4.851176884663419, "learning_rate": 1.5682287627314513e-05, "loss": 3.9113, "step": 821 }, { "epoch": 0.33, "grad_norm": 4.289288179124551, "learning_rate": 1.5671622555437055e-05, "loss": 4.2302, "step": 822 }, { "epoch": 0.33, "grad_norm": 3.7811955845085796, "learning_rate": 1.566094796473071e-05, "loss": 4.0358, "step": 823 }, { "epoch": 0.33, "grad_norm": 4.674141666896367, "learning_rate": 1.565026387311092e-05, "loss": 4.1475, "step": 824 }, { "epoch": 0.33, "grad_norm": 3.5027858102986764, "learning_rate": 1.5639570298509067e-05, "loss": 4.053, "step": 825 }, { "epoch": 0.33, "grad_norm": 3.6916507627982593, "learning_rate": 1.562886725887245e-05, "loss": 4.2153, "step": 826 }, { "epoch": 0.33, "grad_norm": 3.561565009887269, "learning_rate": 1.5618154772164257e-05, "loss": 4.1343, "step": 827 }, { "epoch": 0.33, "grad_norm": 3.8244028110454558, "learning_rate": 1.5607432856363523e-05, "loss": 3.8806, "step": 828 }, { "epoch": 0.33, "grad_norm": 4.460749430124214, "learning_rate": 1.559670152946512e-05, "loss": 3.9809, "step": 829 }, { "epoch": 0.33, "grad_norm": 3.755014824313866, "learning_rate": 1.5585960809479698e-05, "loss": 4.0223, "step": 830 }, { "epoch": 0.33, "grad_norm": 4.104852010995523, "learning_rate": 1.5575210714433687e-05, "loss": 4.0399, "step": 831 }, { "epoch": 0.33, "grad_norm": 3.7760475308616637, "learning_rate": 1.5564451262369247e-05, "loss": 4.0801, "step": 832 }, { "epoch": 0.33, "grad_norm": 4.2793544157021355, "learning_rate": 1.5553682471344237e-05, "loss": 4.0501, "step": 833 }, { "epoch": 0.33, "grad_norm": 4.629557175983418, "learning_rate": 1.5542904359432198e-05, "loss": 4.0355, "step": 834 }, { "epoch": 0.33, "grad_norm": 3.7883624814021895, "learning_rate": 1.5532116944722308e-05, "loss": 4.1627, "step": 835 }, { "epoch": 0.33, "grad_norm": 3.7953856863825313, "learning_rate": 1.5521320245319364e-05, "loss": 3.9935, "step": 836 }, { "epoch": 0.33, "grad_norm": 4.788743717332434, "learning_rate": 1.5510514279343736e-05, "loss": 4.0584, "step": 837 }, { "epoch": 0.34, "grad_norm": 4.138431050489037, "learning_rate": 1.5499699064931354e-05, "loss": 3.9832, "step": 838 }, { "epoch": 0.34, "grad_norm": 3.2686778021991447, "learning_rate": 1.5488874620233674e-05, "loss": 4.0388, "step": 839 }, { "epoch": 0.34, "grad_norm": 4.340524461273947, "learning_rate": 1.547804096341763e-05, "loss": 4.1873, "step": 840 }, { "epoch": 0.34, "grad_norm": 3.9077568345088065, "learning_rate": 1.5467198112665632e-05, "loss": 4.0613, "step": 841 }, { "epoch": 0.34, "grad_norm": 4.12493499904231, "learning_rate": 1.5456346086175508e-05, "loss": 4.2309, "step": 842 }, { "epoch": 0.34, "grad_norm": 4.341152194190719, "learning_rate": 1.5445484902160494e-05, "loss": 3.8967, "step": 843 }, { "epoch": 0.34, "grad_norm": 3.8927081089261724, "learning_rate": 1.543461457884919e-05, "loss": 3.9914, "step": 844 }, { "epoch": 0.34, "grad_norm": 4.07972545980503, "learning_rate": 1.5423735134485537e-05, "loss": 3.8737, "step": 845 }, { "epoch": 0.34, "grad_norm": 4.153125798186397, "learning_rate": 1.541284658732878e-05, "loss": 4.1366, "step": 846 }, { "epoch": 0.34, "grad_norm": 3.9414313278213275, "learning_rate": 1.540194895565346e-05, "loss": 4.0546, "step": 847 }, { "epoch": 0.34, "grad_norm": 5.007997536551065, "learning_rate": 1.5391042257749338e-05, "loss": 4.0203, "step": 848 }, { "epoch": 0.34, "grad_norm": 3.7302176266174256, "learning_rate": 1.5380126511921404e-05, "loss": 4.0391, "step": 849 }, { "epoch": 0.34, "grad_norm": 3.785687015338851, "learning_rate": 1.536920173648984e-05, "loss": 4.0755, "step": 850 }, { "epoch": 0.34, "grad_norm": 4.949326413164564, "learning_rate": 1.5358267949789968e-05, "loss": 3.9168, "step": 851 }, { "epoch": 0.34, "grad_norm": 4.339362469992984, "learning_rate": 1.5347325170172246e-05, "loss": 4.044, "step": 852 }, { "epoch": 0.34, "grad_norm": 3.894035836096662, "learning_rate": 1.533637341600221e-05, "loss": 3.9281, "step": 853 }, { "epoch": 0.34, "grad_norm": 4.585643645017433, "learning_rate": 1.532541270566049e-05, "loss": 3.958, "step": 854 }, { "epoch": 0.34, "grad_norm": 4.369189203074978, "learning_rate": 1.5314443057542703e-05, "loss": 4.1421, "step": 855 }, { "epoch": 0.34, "grad_norm": 3.6856868676372194, "learning_rate": 1.5303464490059506e-05, "loss": 4.0606, "step": 856 }, { "epoch": 0.34, "grad_norm": 3.7601211848512306, "learning_rate": 1.5292477021636498e-05, "loss": 3.9721, "step": 857 }, { "epoch": 0.34, "grad_norm": 3.8571531489482274, "learning_rate": 1.528148067071423e-05, "loss": 3.8854, "step": 858 }, { "epoch": 0.34, "grad_norm": 5.285596884849708, "learning_rate": 1.5270475455748165e-05, "loss": 3.9812, "step": 859 }, { "epoch": 0.34, "grad_norm": 4.5621322826993085, "learning_rate": 1.5259461395208628e-05, "loss": 4.0867, "step": 860 }, { "epoch": 0.34, "grad_norm": 3.7099938172783467, "learning_rate": 1.5248438507580806e-05, "loss": 4.1262, "step": 861 }, { "epoch": 0.34, "grad_norm": 4.822370952909795, "learning_rate": 1.5237406811364682e-05, "loss": 3.8992, "step": 862 }, { "epoch": 0.35, "grad_norm": 4.615930344249073, "learning_rate": 1.5226366325075042e-05, "loss": 3.9119, "step": 863 }, { "epoch": 0.35, "grad_norm": 4.141109925790369, "learning_rate": 1.5215317067241415e-05, "loss": 3.922, "step": 864 }, { "epoch": 0.35, "grad_norm": 4.91182469534649, "learning_rate": 1.5204259056408046e-05, "loss": 4.1266, "step": 865 }, { "epoch": 0.35, "grad_norm": 3.739014813917722, "learning_rate": 1.5193192311133884e-05, "loss": 4.1846, "step": 866 }, { "epoch": 0.35, "grad_norm": 3.733700050825112, "learning_rate": 1.5182116849992528e-05, "loss": 4.1189, "step": 867 }, { "epoch": 0.35, "grad_norm": 4.577400393069631, "learning_rate": 1.5171032691572207e-05, "loss": 3.8833, "step": 868 }, { "epoch": 0.35, "grad_norm": 3.9216623914448743, "learning_rate": 1.5159939854475743e-05, "loss": 3.8559, "step": 869 }, { "epoch": 0.35, "grad_norm": 4.248938111614794, "learning_rate": 1.5148838357320537e-05, "loss": 3.9905, "step": 870 }, { "epoch": 0.35, "grad_norm": 3.7734583568786384, "learning_rate": 1.5137728218738504e-05, "loss": 4.1165, "step": 871 }, { "epoch": 0.35, "grad_norm": 3.3465062079897963, "learning_rate": 1.512660945737608e-05, "loss": 4.1162, "step": 872 }, { "epoch": 0.35, "grad_norm": 4.716306651251567, "learning_rate": 1.5115482091894164e-05, "loss": 4.1707, "step": 873 }, { "epoch": 0.35, "grad_norm": 3.776023574626814, "learning_rate": 1.5104346140968096e-05, "loss": 3.9443, "step": 874 }, { "epoch": 0.35, "grad_norm": 3.3972508182391574, "learning_rate": 1.5093201623287631e-05, "loss": 4.0875, "step": 875 }, { "epoch": 0.35, "grad_norm": 4.557313751621321, "learning_rate": 1.5082048557556892e-05, "loss": 3.9307, "step": 876 }, { "epoch": 0.35, "grad_norm": 4.009659574014854, "learning_rate": 1.507088696249436e-05, "loss": 3.9834, "step": 877 }, { "epoch": 0.35, "grad_norm": 3.4421878630309575, "learning_rate": 1.505971685683282e-05, "loss": 3.9976, "step": 878 }, { "epoch": 0.35, "grad_norm": 4.090750290145485, "learning_rate": 1.5048538259319347e-05, "loss": 3.7983, "step": 879 }, { "epoch": 0.35, "grad_norm": 4.084453101262391, "learning_rate": 1.5037351188715265e-05, "loss": 3.8341, "step": 880 }, { "epoch": 0.35, "grad_norm": 3.7389574740374556, "learning_rate": 1.5026155663796123e-05, "loss": 3.6981, "step": 881 }, { "epoch": 0.35, "grad_norm": 3.2705691585414263, "learning_rate": 1.5014951703351655e-05, "loss": 3.9908, "step": 882 }, { "epoch": 0.35, "grad_norm": 3.6909533401956316, "learning_rate": 1.500373932618575e-05, "loss": 3.8345, "step": 883 }, { "epoch": 0.35, "grad_norm": 3.8291194048755024, "learning_rate": 1.4992518551116436e-05, "loss": 4.0007, "step": 884 }, { "epoch": 0.35, "grad_norm": 2.9216771533735, "learning_rate": 1.4981289396975818e-05, "loss": 4.0789, "step": 885 }, { "epoch": 0.35, "grad_norm": 3.5513257467131147, "learning_rate": 1.4970051882610073e-05, "loss": 4.0285, "step": 886 }, { "epoch": 0.35, "grad_norm": 3.938085563815545, "learning_rate": 1.4958806026879411e-05, "loss": 3.9651, "step": 887 }, { "epoch": 0.36, "grad_norm": 3.200770892095531, "learning_rate": 1.4947551848658036e-05, "loss": 4.0959, "step": 888 }, { "epoch": 0.36, "grad_norm": 3.9964458964377174, "learning_rate": 1.4936289366834123e-05, "loss": 3.8196, "step": 889 }, { "epoch": 0.36, "grad_norm": 3.3213563792790763, "learning_rate": 1.4925018600309784e-05, "loss": 4.109, "step": 890 }, { "epoch": 0.36, "grad_norm": 4.041337943923999, "learning_rate": 1.4913739568001034e-05, "loss": 4.0301, "step": 891 }, { "epoch": 0.36, "grad_norm": 3.4764085757009324, "learning_rate": 1.4902452288837761e-05, "loss": 3.8225, "step": 892 }, { "epoch": 0.36, "grad_norm": 3.592918007172055, "learning_rate": 1.4891156781763692e-05, "loss": 3.8364, "step": 893 }, { "epoch": 0.36, "grad_norm": 3.592437141265646, "learning_rate": 1.4879853065736366e-05, "loss": 4.0147, "step": 894 }, { "epoch": 0.36, "grad_norm": 3.7811593209134333, "learning_rate": 1.4868541159727097e-05, "loss": 3.8975, "step": 895 }, { "epoch": 0.36, "grad_norm": 3.17581172791539, "learning_rate": 1.485722108272095e-05, "loss": 4.0324, "step": 896 }, { "epoch": 0.36, "grad_norm": 3.631807623212032, "learning_rate": 1.4845892853716692e-05, "loss": 3.9237, "step": 897 }, { "epoch": 0.36, "grad_norm": 3.7247974116244467, "learning_rate": 1.4834556491726781e-05, "loss": 3.9782, "step": 898 }, { "epoch": 0.36, "grad_norm": 4.953397859539653, "learning_rate": 1.482321201577733e-05, "loss": 4.0207, "step": 899 }, { "epoch": 0.36, "grad_norm": 3.286067218324245, "learning_rate": 1.4811859444908053e-05, "loss": 3.9184, "step": 900 }, { "epoch": 0.36, "grad_norm": 3.9689525465502546, "learning_rate": 1.4800498798172263e-05, "loss": 3.9761, "step": 901 }, { "epoch": 0.36, "grad_norm": 3.9432696950576682, "learning_rate": 1.478913009463682e-05, "loss": 3.8415, "step": 902 }, { "epoch": 0.36, "grad_norm": 3.784967591738362, "learning_rate": 1.4777753353382121e-05, "loss": 3.9511, "step": 903 }, { "epoch": 0.36, "grad_norm": 3.649524601044959, "learning_rate": 1.4766368593502028e-05, "loss": 3.9241, "step": 904 }, { "epoch": 0.36, "grad_norm": 4.125027971868248, "learning_rate": 1.4754975834103877e-05, "loss": 3.8264, "step": 905 }, { "epoch": 0.36, "grad_norm": 4.167521116754602, "learning_rate": 1.474357509430843e-05, "loss": 3.9653, "step": 906 }, { "epoch": 0.36, "grad_norm": 4.694796156291042, "learning_rate": 1.473216639324984e-05, "loss": 3.8908, "step": 907 }, { "epoch": 0.36, "grad_norm": 3.880655396365558, "learning_rate": 1.472074975007562e-05, "loss": 3.8734, "step": 908 }, { "epoch": 0.36, "grad_norm": 3.7017895815934265, "learning_rate": 1.4709325183946613e-05, "loss": 3.9442, "step": 909 }, { "epoch": 0.36, "grad_norm": 4.055857844353811, "learning_rate": 1.4697892714036959e-05, "loss": 3.9686, "step": 910 }, { "epoch": 0.36, "grad_norm": 4.938238712811652, "learning_rate": 1.4686452359534067e-05, "loss": 3.9718, "step": 911 }, { "epoch": 0.36, "grad_norm": 3.633536668094118, "learning_rate": 1.467500413963857e-05, "loss": 4.0176, "step": 912 }, { "epoch": 0.37, "grad_norm": 5.039475965540521, "learning_rate": 1.4663548073564316e-05, "loss": 3.8204, "step": 913 }, { "epoch": 0.37, "grad_norm": 4.586728291833797, "learning_rate": 1.4652084180538304e-05, "loss": 3.9394, "step": 914 }, { "epoch": 0.37, "grad_norm": 3.6290231319278643, "learning_rate": 1.4640612479800686e-05, "loss": 3.9651, "step": 915 }, { "epoch": 0.37, "grad_norm": 4.523810100980663, "learning_rate": 1.4629132990604706e-05, "loss": 3.9046, "step": 916 }, { "epoch": 0.37, "grad_norm": 3.3352008780739277, "learning_rate": 1.4617645732216686e-05, "loss": 3.9908, "step": 917 }, { "epoch": 0.37, "grad_norm": 4.216385007164741, "learning_rate": 1.4606150723915984e-05, "loss": 4.1919, "step": 918 }, { "epoch": 0.37, "grad_norm": 3.3379333114880017, "learning_rate": 1.4594647984994966e-05, "loss": 3.8451, "step": 919 }, { "epoch": 0.37, "grad_norm": 4.326212397572884, "learning_rate": 1.4583137534758968e-05, "loss": 4.0293, "step": 920 }, { "epoch": 0.37, "grad_norm": 4.104354270670977, "learning_rate": 1.4571619392526279e-05, "loss": 4.0689, "step": 921 }, { "epoch": 0.37, "grad_norm": 4.855954672044245, "learning_rate": 1.456009357762809e-05, "loss": 3.8103, "step": 922 }, { "epoch": 0.37, "grad_norm": 3.4625438644586413, "learning_rate": 1.4548560109408465e-05, "loss": 3.9354, "step": 923 }, { "epoch": 0.37, "grad_norm": 3.244228225779234, "learning_rate": 1.4537019007224324e-05, "loss": 4.1405, "step": 924 }, { "epoch": 0.37, "grad_norm": 4.736402044407656, "learning_rate": 1.4525470290445392e-05, "loss": 3.9756, "step": 925 }, { "epoch": 0.37, "grad_norm": 3.804301068297979, "learning_rate": 1.4513913978454169e-05, "loss": 4.1198, "step": 926 }, { "epoch": 0.37, "grad_norm": 4.157084681702177, "learning_rate": 1.4502350090645919e-05, "loss": 3.8946, "step": 927 }, { "epoch": 0.37, "grad_norm": 3.4439326017172562, "learning_rate": 1.4490778646428601e-05, "loss": 3.9797, "step": 928 }, { "epoch": 0.37, "grad_norm": 3.855937635878492, "learning_rate": 1.4479199665222869e-05, "loss": 3.8229, "step": 929 }, { "epoch": 0.37, "grad_norm": 3.52379856141619, "learning_rate": 1.4467613166462024e-05, "loss": 4.1031, "step": 930 }, { "epoch": 0.37, "grad_norm": 4.343080438472811, "learning_rate": 1.445601916959198e-05, "loss": 3.8626, "step": 931 }, { "epoch": 0.37, "grad_norm": 3.3607075037651586, "learning_rate": 1.4444417694071242e-05, "loss": 4.0184, "step": 932 }, { "epoch": 0.37, "grad_norm": 3.7952187164245066, "learning_rate": 1.4432808759370853e-05, "loss": 3.7856, "step": 933 }, { "epoch": 0.37, "grad_norm": 4.346280524599407, "learning_rate": 1.4421192384974396e-05, "loss": 3.8906, "step": 934 }, { "epoch": 0.37, "grad_norm": 3.4646318285115116, "learning_rate": 1.4409568590377918e-05, "loss": 4.0531, "step": 935 }, { "epoch": 0.37, "grad_norm": 4.182963241175465, "learning_rate": 1.439793739508994e-05, "loss": 3.8542, "step": 936 }, { "epoch": 0.37, "grad_norm": 3.7121887516870933, "learning_rate": 1.4386298818631388e-05, "loss": 3.9009, "step": 937 }, { "epoch": 0.38, "grad_norm": 3.281667995792813, "learning_rate": 1.437465288053558e-05, "loss": 3.9757, "step": 938 }, { "epoch": 0.38, "grad_norm": 3.356495016960309, "learning_rate": 1.4362999600348198e-05, "loss": 3.8267, "step": 939 }, { "epoch": 0.38, "grad_norm": 4.711219010948904, "learning_rate": 1.4351338997627233e-05, "loss": 3.9867, "step": 940 }, { "epoch": 0.38, "grad_norm": 3.556481836187361, "learning_rate": 1.433967109194298e-05, "loss": 3.9451, "step": 941 }, { "epoch": 0.38, "grad_norm": 4.265019987798413, "learning_rate": 1.4327995902877972e-05, "loss": 3.8217, "step": 942 }, { "epoch": 0.38, "grad_norm": 3.703379886530379, "learning_rate": 1.4316313450026986e-05, "loss": 3.8829, "step": 943 }, { "epoch": 0.38, "grad_norm": 3.5889354627304004, "learning_rate": 1.4304623752996974e-05, "loss": 4.1259, "step": 944 }, { "epoch": 0.38, "grad_norm": 4.097019004750559, "learning_rate": 1.429292683140706e-05, "loss": 3.7848, "step": 945 }, { "epoch": 0.38, "grad_norm": 3.7604395396181123, "learning_rate": 1.428122270488848e-05, "loss": 4.0124, "step": 946 }, { "epoch": 0.38, "grad_norm": 3.1999883377450113, "learning_rate": 1.4269511393084572e-05, "loss": 4.0831, "step": 947 }, { "epoch": 0.38, "grad_norm": 3.382731690898, "learning_rate": 1.4257792915650728e-05, "loss": 3.8673, "step": 948 }, { "epoch": 0.38, "grad_norm": 3.513739284257866, "learning_rate": 1.4246067292254367e-05, "loss": 3.8318, "step": 949 }, { "epoch": 0.38, "grad_norm": 3.723969538475621, "learning_rate": 1.4234334542574906e-05, "loss": 3.9158, "step": 950 }, { "epoch": 0.38, "grad_norm": 3.389218087656833, "learning_rate": 1.4222594686303707e-05, "loss": 3.9276, "step": 951 }, { "epoch": 0.38, "grad_norm": 4.042149116988585, "learning_rate": 1.4210847743144087e-05, "loss": 3.8478, "step": 952 }, { "epoch": 0.38, "grad_norm": 3.823734180174385, "learning_rate": 1.4199093732811227e-05, "loss": 3.8644, "step": 953 }, { "epoch": 0.38, "grad_norm": 3.4666325491254772, "learning_rate": 1.4187332675032189e-05, "loss": 3.9167, "step": 954 }, { "epoch": 0.38, "grad_norm": 4.565609657289887, "learning_rate": 1.4175564589545853e-05, "loss": 3.7594, "step": 955 }, { "epoch": 0.38, "grad_norm": 4.279210813025605, "learning_rate": 1.4163789496102902e-05, "loss": 3.9157, "step": 956 }, { "epoch": 0.38, "grad_norm": 3.6109420515584296, "learning_rate": 1.4152007414465771e-05, "loss": 3.775, "step": 957 }, { "epoch": 0.38, "grad_norm": 3.5183027997433562, "learning_rate": 1.4140218364408634e-05, "loss": 3.8613, "step": 958 }, { "epoch": 0.38, "grad_norm": 4.7680112519128635, "learning_rate": 1.4128422365717346e-05, "loss": 3.8931, "step": 959 }, { "epoch": 0.38, "grad_norm": 4.065893326610565, "learning_rate": 1.411661943818944e-05, "loss": 3.926, "step": 960 }, { "epoch": 0.38, "grad_norm": 4.062397562509862, "learning_rate": 1.4104809601634069e-05, "loss": 3.8562, "step": 961 }, { "epoch": 0.38, "grad_norm": 4.143634258192227, "learning_rate": 1.409299287587198e-05, "loss": 3.7753, "step": 962 }, { "epoch": 0.39, "grad_norm": 4.103196921879452, "learning_rate": 1.4081169280735488e-05, "loss": 4.0006, "step": 963 }, { "epoch": 0.39, "grad_norm": 3.7702584463005606, "learning_rate": 1.4069338836068434e-05, "loss": 3.8562, "step": 964 }, { "epoch": 0.39, "grad_norm": 3.664516815419518, "learning_rate": 1.4057501561726157e-05, "loss": 4.1171, "step": 965 }, { "epoch": 0.39, "grad_norm": 4.196260439411363, "learning_rate": 1.404565747757545e-05, "loss": 3.872, "step": 966 }, { "epoch": 0.39, "grad_norm": 3.837246468651308, "learning_rate": 1.403380660349455e-05, "loss": 4.0388, "step": 967 }, { "epoch": 0.39, "grad_norm": 3.665536340443557, "learning_rate": 1.4021948959373075e-05, "loss": 3.9043, "step": 968 }, { "epoch": 0.39, "grad_norm": 4.561871423373831, "learning_rate": 1.4010084565112018e-05, "loss": 4.0049, "step": 969 }, { "epoch": 0.39, "grad_norm": 4.31227711238914, "learning_rate": 1.3998213440623691e-05, "loss": 3.9867, "step": 970 }, { "epoch": 0.39, "grad_norm": 3.3638177448238977, "learning_rate": 1.3986335605831707e-05, "loss": 4.2255, "step": 971 }, { "epoch": 0.39, "grad_norm": 3.7103949192821384, "learning_rate": 1.3974451080670934e-05, "loss": 3.9219, "step": 972 }, { "epoch": 0.39, "grad_norm": 3.981342474110125, "learning_rate": 1.3962559885087482e-05, "loss": 3.8157, "step": 973 }, { "epoch": 0.39, "grad_norm": 4.679894552303274, "learning_rate": 1.3950662039038643e-05, "loss": 3.9717, "step": 974 }, { "epoch": 0.39, "grad_norm": 4.367030262675531, "learning_rate": 1.3938757562492873e-05, "loss": 3.8756, "step": 975 }, { "epoch": 0.39, "grad_norm": 3.7171472220445456, "learning_rate": 1.3926846475429767e-05, "loss": 3.8459, "step": 976 }, { "epoch": 0.39, "grad_norm": 4.508351826758392, "learning_rate": 1.3914928797839996e-05, "loss": 3.6319, "step": 977 }, { "epoch": 0.39, "grad_norm": 4.474974790718888, "learning_rate": 1.3903004549725313e-05, "loss": 3.8134, "step": 978 }, { "epoch": 0.39, "grad_norm": 3.555949869772792, "learning_rate": 1.3891073751098481e-05, "loss": 3.917, "step": 979 }, { "epoch": 0.39, "grad_norm": 4.225391171204873, "learning_rate": 1.3879136421983265e-05, "loss": 3.9551, "step": 980 }, { "epoch": 0.39, "grad_norm": 3.666887533774541, "learning_rate": 1.3867192582414393e-05, "loss": 3.8632, "step": 981 }, { "epoch": 0.39, "grad_norm": 3.802718705221923, "learning_rate": 1.3855242252437511e-05, "loss": 3.9409, "step": 982 }, { "epoch": 0.39, "grad_norm": 3.5413928934272914, "learning_rate": 1.3843285452109166e-05, "loss": 3.8817, "step": 983 }, { "epoch": 0.39, "grad_norm": 4.159489305465269, "learning_rate": 1.3831322201496757e-05, "loss": 3.857, "step": 984 }, { "epoch": 0.39, "grad_norm": 4.823310889696872, "learning_rate": 1.3819352520678519e-05, "loss": 4.1045, "step": 985 }, { "epoch": 0.39, "grad_norm": 3.3447101805626867, "learning_rate": 1.3807376429743467e-05, "loss": 3.9642, "step": 986 }, { "epoch": 0.39, "grad_norm": 3.869554340354115, "learning_rate": 1.3795393948791382e-05, "loss": 3.8103, "step": 987 }, { "epoch": 0.4, "grad_norm": 4.624295111996602, "learning_rate": 1.3783405097932772e-05, "loss": 3.7571, "step": 988 }, { "epoch": 0.4, "grad_norm": 3.8202389870193754, "learning_rate": 1.3771409897288823e-05, "loss": 3.7456, "step": 989 }, { "epoch": 0.4, "grad_norm": 3.4182725294539593, "learning_rate": 1.3759408366991391e-05, "loss": 3.9806, "step": 990 }, { "epoch": 0.4, "grad_norm": 3.899990351720217, "learning_rate": 1.3747400527182952e-05, "loss": 4.1462, "step": 991 }, { "epoch": 0.4, "grad_norm": 3.2462833506393514, "learning_rate": 1.373538639801657e-05, "loss": 4.1141, "step": 992 }, { "epoch": 0.4, "grad_norm": 3.778151533518385, "learning_rate": 1.3723365999655859e-05, "loss": 3.8378, "step": 993 }, { "epoch": 0.4, "grad_norm": 3.4612512836444447, "learning_rate": 1.3711339352274969e-05, "loss": 3.961, "step": 994 }, { "epoch": 0.4, "grad_norm": 3.4892336558699677, "learning_rate": 1.3699306476058523e-05, "loss": 3.8107, "step": 995 }, { "epoch": 0.4, "grad_norm": 3.667163737052047, "learning_rate": 1.3687267391201604e-05, "loss": 3.8194, "step": 996 }, { "epoch": 0.4, "grad_norm": 3.7753395478216945, "learning_rate": 1.3675222117909716e-05, "loss": 3.8419, "step": 997 }, { "epoch": 0.4, "grad_norm": 4.210105830897785, "learning_rate": 1.366317067639875e-05, "loss": 3.8023, "step": 998 }, { "epoch": 0.4, "grad_norm": 3.723917366613343, "learning_rate": 1.3651113086894951e-05, "loss": 4.0149, "step": 999 }, { "epoch": 0.4, "grad_norm": 4.244202977967538, "learning_rate": 1.3639049369634878e-05, "loss": 4.0083, "step": 1000 }, { "epoch": 0.4, "grad_norm": 4.02953633918017, "learning_rate": 1.3626979544865369e-05, "loss": 3.6242, "step": 1001 }, { "epoch": 0.4, "grad_norm": 3.8798268517813534, "learning_rate": 1.3614903632843523e-05, "loss": 3.8001, "step": 1002 }, { "epoch": 0.4, "grad_norm": 3.9228004237258767, "learning_rate": 1.3602821653836654e-05, "loss": 3.8272, "step": 1003 }, { "epoch": 0.4, "grad_norm": 3.904727851654214, "learning_rate": 1.3590733628122253e-05, "loss": 3.9567, "step": 1004 }, { "epoch": 0.4, "grad_norm": 3.7882268070089697, "learning_rate": 1.357863957598796e-05, "loss": 3.8891, "step": 1005 }, { "epoch": 0.4, "grad_norm": 5.706967784285879, "learning_rate": 1.3566539517731536e-05, "loss": 3.8622, "step": 1006 }, { "epoch": 0.4, "grad_norm": 4.748018650903142, "learning_rate": 1.3554433473660818e-05, "loss": 3.7923, "step": 1007 }, { "epoch": 0.4, "grad_norm": 3.4758174617619013, "learning_rate": 1.354232146409368e-05, "loss": 3.8247, "step": 1008 }, { "epoch": 0.4, "grad_norm": 4.646296934730249, "learning_rate": 1.353020350935803e-05, "loss": 3.828, "step": 1009 }, { "epoch": 0.4, "grad_norm": 4.470041626731096, "learning_rate": 1.3518079629791725e-05, "loss": 3.7678, "step": 1010 }, { "epoch": 0.4, "grad_norm": 4.079568258122262, "learning_rate": 1.3505949845742599e-05, "loss": 3.7364, "step": 1011 }, { "epoch": 0.4, "grad_norm": 3.8456370828497772, "learning_rate": 1.3493814177568365e-05, "loss": 3.9458, "step": 1012 }, { "epoch": 0.41, "grad_norm": 3.6309196422021865, "learning_rate": 1.3481672645636627e-05, "loss": 3.8194, "step": 1013 }, { "epoch": 0.41, "grad_norm": 4.163434543886125, "learning_rate": 1.3469525270324835e-05, "loss": 3.728, "step": 1014 }, { "epoch": 0.41, "grad_norm": 4.21450380310228, "learning_rate": 1.345737207202023e-05, "loss": 3.9219, "step": 1015 }, { "epoch": 0.41, "grad_norm": 3.974894908073055, "learning_rate": 1.3445213071119841e-05, "loss": 3.795, "step": 1016 }, { "epoch": 0.41, "grad_norm": 3.7309967078096324, "learning_rate": 1.3433048288030424e-05, "loss": 3.9291, "step": 1017 }, { "epoch": 0.41, "grad_norm": 4.882915305390519, "learning_rate": 1.342087774316845e-05, "loss": 3.9465, "step": 1018 }, { "epoch": 0.41, "grad_norm": 5.298426712564282, "learning_rate": 1.3408701456960052e-05, "loss": 3.9188, "step": 1019 }, { "epoch": 0.41, "grad_norm": 4.075499444985152, "learning_rate": 1.3396519449841006e-05, "loss": 3.8499, "step": 1020 }, { "epoch": 0.41, "grad_norm": 4.058588213246766, "learning_rate": 1.338433174225668e-05, "loss": 3.9857, "step": 1021 }, { "epoch": 0.41, "grad_norm": 4.441001425059062, "learning_rate": 1.3372138354662018e-05, "loss": 3.6495, "step": 1022 }, { "epoch": 0.41, "grad_norm": 4.737403468138393, "learning_rate": 1.3359939307521494e-05, "loss": 3.5714, "step": 1023 }, { "epoch": 0.41, "grad_norm": 4.489644852015869, "learning_rate": 1.3347734621309076e-05, "loss": 3.7954, "step": 1024 }, { "epoch": 0.41, "grad_norm": 4.698554077016783, "learning_rate": 1.3335524316508208e-05, "loss": 3.7721, "step": 1025 }, { "epoch": 0.41, "grad_norm": 4.537685736192624, "learning_rate": 1.3323308413611748e-05, "loss": 3.6806, "step": 1026 }, { "epoch": 0.41, "grad_norm": 4.6844980885590415, "learning_rate": 1.3311086933121961e-05, "loss": 3.95, "step": 1027 }, { "epoch": 0.41, "grad_norm": 3.7048407750955152, "learning_rate": 1.3298859895550473e-05, "loss": 3.9469, "step": 1028 }, { "epoch": 0.41, "grad_norm": 3.66973118936307, "learning_rate": 1.3286627321418229e-05, "loss": 3.8629, "step": 1029 }, { "epoch": 0.41, "grad_norm": 3.8653046493517436, "learning_rate": 1.3274389231255466e-05, "loss": 3.8781, "step": 1030 }, { "epoch": 0.41, "grad_norm": 4.60833520162824, "learning_rate": 1.3262145645601693e-05, "loss": 3.786, "step": 1031 }, { "epoch": 0.41, "grad_norm": 3.5817623342333627, "learning_rate": 1.3249896585005628e-05, "loss": 3.877, "step": 1032 }, { "epoch": 0.41, "grad_norm": 4.387212721078502, "learning_rate": 1.3237642070025183e-05, "loss": 3.8994, "step": 1033 }, { "epoch": 0.41, "grad_norm": 4.159428683537451, "learning_rate": 1.322538212122742e-05, "loss": 3.7944, "step": 1034 }, { "epoch": 0.41, "grad_norm": 3.6203299759058987, "learning_rate": 1.3213116759188525e-05, "loss": 3.985, "step": 1035 }, { "epoch": 0.41, "grad_norm": 4.1657884558644, "learning_rate": 1.320084600449377e-05, "loss": 3.7315, "step": 1036 }, { "epoch": 0.41, "grad_norm": 4.281052074286255, "learning_rate": 1.3188569877737474e-05, "loss": 3.7459, "step": 1037 }, { "epoch": 0.42, "grad_norm": 3.7677672464400835, "learning_rate": 1.3176288399522975e-05, "loss": 3.5716, "step": 1038 }, { "epoch": 0.42, "grad_norm": 4.428918038754764, "learning_rate": 1.3164001590462592e-05, "loss": 3.8393, "step": 1039 }, { "epoch": 0.42, "grad_norm": 3.298395603507693, "learning_rate": 1.3151709471177589e-05, "loss": 3.8122, "step": 1040 }, { "epoch": 0.42, "grad_norm": 4.280164110278819, "learning_rate": 1.3139412062298141e-05, "loss": 3.5995, "step": 1041 }, { "epoch": 0.42, "grad_norm": 3.500876125985926, "learning_rate": 1.312710938446331e-05, "loss": 3.8619, "step": 1042 }, { "epoch": 0.42, "grad_norm": 3.6210925789968, "learning_rate": 1.3114801458320988e-05, "loss": 3.6235, "step": 1043 }, { "epoch": 0.42, "grad_norm": 3.7574064252584147, "learning_rate": 1.3102488304527883e-05, "loss": 3.8639, "step": 1044 }, { "epoch": 0.42, "grad_norm": 4.1328056061368095, "learning_rate": 1.3090169943749475e-05, "loss": 3.987, "step": 1045 }, { "epoch": 0.42, "grad_norm": 4.01823522119479, "learning_rate": 1.3077846396659986e-05, "loss": 3.7406, "step": 1046 }, { "epoch": 0.42, "grad_norm": 4.1690337753538556, "learning_rate": 1.3065517683942339e-05, "loss": 3.8761, "step": 1047 }, { "epoch": 0.42, "grad_norm": 4.118559827596486, "learning_rate": 1.3053183826288124e-05, "loss": 3.8011, "step": 1048 }, { "epoch": 0.42, "grad_norm": 4.382326156860178, "learning_rate": 1.3040844844397573e-05, "loss": 3.7709, "step": 1049 }, { "epoch": 0.42, "grad_norm": 5.4798186185746225, "learning_rate": 1.3028500758979507e-05, "loss": 3.9834, "step": 1050 }, { "epoch": 0.42, "grad_norm": 4.40810523021627, "learning_rate": 1.3016151590751332e-05, "loss": 3.7962, "step": 1051 }, { "epoch": 0.42, "grad_norm": 3.8039902850152925, "learning_rate": 1.3003797360438961e-05, "loss": 3.9683, "step": 1052 }, { "epoch": 0.42, "grad_norm": 3.894557739361905, "learning_rate": 1.2991438088776818e-05, "loss": 3.6801, "step": 1053 }, { "epoch": 0.42, "grad_norm": 5.239443292560934, "learning_rate": 1.2979073796507786e-05, "loss": 3.8762, "step": 1054 }, { "epoch": 0.42, "grad_norm": 3.5021977746058317, "learning_rate": 1.296670450438317e-05, "loss": 3.774, "step": 1055 }, { "epoch": 0.42, "grad_norm": 4.1109819520202056, "learning_rate": 1.2954330233162669e-05, "loss": 3.7251, "step": 1056 }, { "epoch": 0.42, "grad_norm": 3.8759541874915167, "learning_rate": 1.2941951003614337e-05, "loss": 3.7179, "step": 1057 }, { "epoch": 0.42, "grad_norm": 4.6162051772187995, "learning_rate": 1.2929566836514556e-05, "loss": 3.7607, "step": 1058 }, { "epoch": 0.42, "grad_norm": 3.801727408834567, "learning_rate": 1.291717775264798e-05, "loss": 3.8958, "step": 1059 }, { "epoch": 0.42, "grad_norm": 4.407215697298154, "learning_rate": 1.2904783772807534e-05, "loss": 3.8602, "step": 1060 }, { "epoch": 0.42, "grad_norm": 3.560458114043935, "learning_rate": 1.2892384917794347e-05, "loss": 3.9323, "step": 1061 }, { "epoch": 0.42, "grad_norm": 3.8926615511691383, "learning_rate": 1.2879981208417735e-05, "loss": 3.6922, "step": 1062 }, { "epoch": 0.43, "grad_norm": 4.022272943906542, "learning_rate": 1.2867572665495156e-05, "loss": 3.8297, "step": 1063 }, { "epoch": 0.43, "grad_norm": 4.174392840890098, "learning_rate": 1.285515930985219e-05, "loss": 3.7822, "step": 1064 }, { "epoch": 0.43, "grad_norm": 4.48534689846332, "learning_rate": 1.2842741162322487e-05, "loss": 3.9492, "step": 1065 }, { "epoch": 0.43, "grad_norm": 3.7857352405447555, "learning_rate": 1.2830318243747736e-05, "loss": 3.8072, "step": 1066 }, { "epoch": 0.43, "grad_norm": 3.607574037963277, "learning_rate": 1.2817890574977648e-05, "loss": 3.9189, "step": 1067 }, { "epoch": 0.43, "grad_norm": 3.4592415194671173, "learning_rate": 1.2805458176869885e-05, "loss": 3.8008, "step": 1068 }, { "epoch": 0.43, "grad_norm": 3.7386853887528777, "learning_rate": 1.2793021070290065e-05, "loss": 3.8409, "step": 1069 }, { "epoch": 0.43, "grad_norm": 4.53759324100156, "learning_rate": 1.2780579276111702e-05, "loss": 3.6507, "step": 1070 }, { "epoch": 0.43, "grad_norm": 4.945952450142829, "learning_rate": 1.2768132815216174e-05, "loss": 3.8486, "step": 1071 }, { "epoch": 0.43, "grad_norm": 3.8818735387168606, "learning_rate": 1.2755681708492696e-05, "loss": 3.751, "step": 1072 }, { "epoch": 0.43, "grad_norm": 4.159697782524046, "learning_rate": 1.2743225976838277e-05, "loss": 3.7202, "step": 1073 }, { "epoch": 0.43, "grad_norm": 3.688463314558805, "learning_rate": 1.2730765641157689e-05, "loss": 3.8645, "step": 1074 }, { "epoch": 0.43, "grad_norm": 3.8295978281953693, "learning_rate": 1.2718300722363431e-05, "loss": 3.7975, "step": 1075 }, { "epoch": 0.43, "grad_norm": 3.924531322625543, "learning_rate": 1.2705831241375695e-05, "loss": 3.5147, "step": 1076 }, { "epoch": 0.43, "grad_norm": 4.318083146218096, "learning_rate": 1.2693357219122331e-05, "loss": 3.9557, "step": 1077 }, { "epoch": 0.43, "grad_norm": 3.677130448505102, "learning_rate": 1.2680878676538804e-05, "loss": 3.7197, "step": 1078 }, { "epoch": 0.43, "grad_norm": 4.8463986020512575, "learning_rate": 1.2668395634568175e-05, "loss": 3.777, "step": 1079 }, { "epoch": 0.43, "grad_norm": 5.058312752920507, "learning_rate": 1.2655908114161053e-05, "loss": 4.0007, "step": 1080 }, { "epoch": 0.43, "grad_norm": 3.638632419006762, "learning_rate": 1.2643416136275557e-05, "loss": 3.7884, "step": 1081 }, { "epoch": 0.43, "grad_norm": 3.5887543927846846, "learning_rate": 1.2630919721877299e-05, "loss": 3.8393, "step": 1082 }, { "epoch": 0.43, "grad_norm": 4.149681723639608, "learning_rate": 1.261841889193932e-05, "loss": 3.7256, "step": 1083 }, { "epoch": 0.43, "grad_norm": 4.897842387997689, "learning_rate": 1.2605913667442096e-05, "loss": 3.817, "step": 1084 }, { "epoch": 0.43, "grad_norm": 4.088190503564847, "learning_rate": 1.2593404069373452e-05, "loss": 3.7328, "step": 1085 }, { "epoch": 0.43, "grad_norm": 4.346641744766435, "learning_rate": 1.2580890118728572e-05, "loss": 3.6659, "step": 1086 }, { "epoch": 0.43, "grad_norm": 3.9172265316872497, "learning_rate": 1.2568371836509936e-05, "loss": 3.6883, "step": 1087 }, { "epoch": 0.44, "grad_norm": 4.0186942695070895, "learning_rate": 1.2555849243727298e-05, "loss": 3.756, "step": 1088 }, { "epoch": 0.44, "grad_norm": 4.2788739628746395, "learning_rate": 1.2543322361397648e-05, "loss": 3.8754, "step": 1089 }, { "epoch": 0.44, "grad_norm": 4.187805152986359, "learning_rate": 1.2530791210545163e-05, "loss": 3.7108, "step": 1090 }, { "epoch": 0.44, "grad_norm": 4.32176195886882, "learning_rate": 1.2518255812201203e-05, "loss": 3.648, "step": 1091 }, { "epoch": 0.44, "grad_norm": 3.967619501198415, "learning_rate": 1.2505716187404242e-05, "loss": 3.9049, "step": 1092 }, { "epoch": 0.44, "grad_norm": 4.8773142998279955, "learning_rate": 1.2493172357199856e-05, "loss": 3.6919, "step": 1093 }, { "epoch": 0.44, "grad_norm": 3.6144010815109398, "learning_rate": 1.2480624342640673e-05, "loss": 3.7658, "step": 1094 }, { "epoch": 0.44, "grad_norm": 4.2419265154497285, "learning_rate": 1.2468072164786342e-05, "loss": 3.8289, "step": 1095 }, { "epoch": 0.44, "grad_norm": 3.481230443413657, "learning_rate": 1.2455515844703512e-05, "loss": 3.8465, "step": 1096 }, { "epoch": 0.44, "grad_norm": 4.165489059771342, "learning_rate": 1.2442955403465768e-05, "loss": 3.6346, "step": 1097 }, { "epoch": 0.44, "grad_norm": 3.8231188657999677, "learning_rate": 1.2430390862153625e-05, "loss": 3.8528, "step": 1098 }, { "epoch": 0.44, "grad_norm": 3.896380824909673, "learning_rate": 1.2417822241854466e-05, "loss": 3.785, "step": 1099 }, { "epoch": 0.44, "grad_norm": 3.190572844245639, "learning_rate": 1.2405249563662539e-05, "loss": 3.9942, "step": 1100 }, { "epoch": 0.44, "grad_norm": 4.049608251353112, "learning_rate": 1.2392672848678877e-05, "loss": 3.8083, "step": 1101 }, { "epoch": 0.44, "grad_norm": 3.1717023508880833, "learning_rate": 1.238009211801131e-05, "loss": 3.8426, "step": 1102 }, { "epoch": 0.44, "grad_norm": 3.570540640509857, "learning_rate": 1.2367507392774398e-05, "loss": 3.5437, "step": 1103 }, { "epoch": 0.44, "grad_norm": 3.5967168685654696, "learning_rate": 1.2354918694089406e-05, "loss": 3.7432, "step": 1104 }, { "epoch": 0.44, "grad_norm": 3.6697195244996283, "learning_rate": 1.2342326043084268e-05, "loss": 3.7574, "step": 1105 }, { "epoch": 0.44, "grad_norm": 3.536043300169957, "learning_rate": 1.2329729460893552e-05, "loss": 3.6837, "step": 1106 }, { "epoch": 0.44, "grad_norm": 3.736616862422548, "learning_rate": 1.2317128968658424e-05, "loss": 3.9007, "step": 1107 }, { "epoch": 0.44, "grad_norm": 3.6992203544786433, "learning_rate": 1.2304524587526609e-05, "loss": 3.9669, "step": 1108 }, { "epoch": 0.44, "grad_norm": 3.499394891696174, "learning_rate": 1.2291916338652365e-05, "loss": 3.7316, "step": 1109 }, { "epoch": 0.44, "grad_norm": 3.8376821119143663, "learning_rate": 1.2279304243196438e-05, "loss": 3.952, "step": 1110 }, { "epoch": 0.44, "grad_norm": 4.12009679545591, "learning_rate": 1.2266688322326024e-05, "loss": 3.8621, "step": 1111 }, { "epoch": 0.44, "grad_norm": 3.411885666351011, "learning_rate": 1.225406859721475e-05, "loss": 3.8947, "step": 1112 }, { "epoch": 0.45, "grad_norm": 3.975578622769097, "learning_rate": 1.2241445089042623e-05, "loss": 3.636, "step": 1113 }, { "epoch": 0.45, "grad_norm": 4.414469682007408, "learning_rate": 1.2228817818995998e-05, "loss": 3.806, "step": 1114 }, { "epoch": 0.45, "grad_norm": 3.831971776801647, "learning_rate": 1.2216186808267544e-05, "loss": 3.9733, "step": 1115 }, { "epoch": 0.45, "grad_norm": 3.810781796334439, "learning_rate": 1.2203552078056209e-05, "loss": 3.772, "step": 1116 }, { "epoch": 0.45, "grad_norm": 3.884988279276643, "learning_rate": 1.2190913649567185e-05, "loss": 3.932, "step": 1117 }, { "epoch": 0.45, "grad_norm": 3.5364222816956867, "learning_rate": 1.2178271544011864e-05, "loss": 3.8833, "step": 1118 }, { "epoch": 0.45, "grad_norm": 4.086827186062164, "learning_rate": 1.2165625782607817e-05, "loss": 3.8387, "step": 1119 }, { "epoch": 0.45, "grad_norm": 4.2399315656735865, "learning_rate": 1.215297638657875e-05, "loss": 3.7389, "step": 1120 }, { "epoch": 0.45, "grad_norm": 3.317299756870864, "learning_rate": 1.2140323377154467e-05, "loss": 3.9934, "step": 1121 }, { "epoch": 0.45, "grad_norm": 3.755435455736925, "learning_rate": 1.2127666775570837e-05, "loss": 3.6894, "step": 1122 }, { "epoch": 0.45, "grad_norm": 3.702006572771475, "learning_rate": 1.211500660306975e-05, "loss": 3.7638, "step": 1123 }, { "epoch": 0.45, "grad_norm": 3.6732674412435555, "learning_rate": 1.210234288089911e-05, "loss": 3.5788, "step": 1124 }, { "epoch": 0.45, "grad_norm": 3.733951889792287, "learning_rate": 1.2089675630312755e-05, "loss": 3.7181, "step": 1125 }, { "epoch": 0.45, "grad_norm": 4.5632865766418425, "learning_rate": 1.2077004872570454e-05, "loss": 3.7373, "step": 1126 }, { "epoch": 0.45, "grad_norm": 3.682464186358648, "learning_rate": 1.206433062893787e-05, "loss": 3.7892, "step": 1127 }, { "epoch": 0.45, "grad_norm": 3.8827651672073524, "learning_rate": 1.2051652920686505e-05, "loss": 3.6032, "step": 1128 }, { "epoch": 0.45, "grad_norm": 4.079285241985473, "learning_rate": 1.2038971769093685e-05, "loss": 3.8538, "step": 1129 }, { "epoch": 0.45, "grad_norm": 4.306943261397105, "learning_rate": 1.2026287195442503e-05, "loss": 3.7325, "step": 1130 }, { "epoch": 0.45, "grad_norm": 4.058850469182537, "learning_rate": 1.201359922102181e-05, "loss": 3.7228, "step": 1131 }, { "epoch": 0.45, "grad_norm": 3.8833854615216725, "learning_rate": 1.200090786712615e-05, "loss": 3.9011, "step": 1132 }, { "epoch": 0.45, "grad_norm": 4.401892254030929, "learning_rate": 1.1988213155055754e-05, "loss": 3.7903, "step": 1133 }, { "epoch": 0.45, "grad_norm": 4.239794919660805, "learning_rate": 1.1975515106116472e-05, "loss": 3.7108, "step": 1134 }, { "epoch": 0.45, "grad_norm": 3.857128357765898, "learning_rate": 1.1962813741619777e-05, "loss": 3.8152, "step": 1135 }, { "epoch": 0.45, "grad_norm": 3.7166704461674835, "learning_rate": 1.1950109082882681e-05, "loss": 3.692, "step": 1136 }, { "epoch": 0.45, "grad_norm": 3.529483667781026, "learning_rate": 1.193740115122774e-05, "loss": 4.073, "step": 1137 }, { "epoch": 0.46, "grad_norm": 3.79176223989985, "learning_rate": 1.1924689967983006e-05, "loss": 3.8253, "step": 1138 }, { "epoch": 0.46, "grad_norm": 3.603835067055338, "learning_rate": 1.191197555448197e-05, "loss": 3.8131, "step": 1139 }, { "epoch": 0.46, "grad_norm": 3.6152376513863014, "learning_rate": 1.189925793206357e-05, "loss": 3.7461, "step": 1140 }, { "epoch": 0.46, "grad_norm": 3.410975002301851, "learning_rate": 1.1886537122072106e-05, "loss": 3.901, "step": 1141 }, { "epoch": 0.46, "grad_norm": 3.3186385039853183, "learning_rate": 1.187381314585725e-05, "loss": 3.9929, "step": 1142 }, { "epoch": 0.46, "grad_norm": 3.7670461549530887, "learning_rate": 1.1861086024773963e-05, "loss": 3.7552, "step": 1143 }, { "epoch": 0.46, "grad_norm": 3.564484757844791, "learning_rate": 1.1848355780182502e-05, "loss": 3.8756, "step": 1144 }, { "epoch": 0.46, "grad_norm": 3.719641849142067, "learning_rate": 1.1835622433448361e-05, "loss": 3.6392, "step": 1145 }, { "epoch": 0.46, "grad_norm": 3.699068447698048, "learning_rate": 1.1822886005942244e-05, "loss": 3.6641, "step": 1146 }, { "epoch": 0.46, "grad_norm": 3.7519943682708763, "learning_rate": 1.1810146519040023e-05, "loss": 3.8413, "step": 1147 }, { "epoch": 0.46, "grad_norm": 3.5090371131676132, "learning_rate": 1.1797403994122698e-05, "loss": 3.7514, "step": 1148 }, { "epoch": 0.46, "grad_norm": 3.483919091866205, "learning_rate": 1.178465845257638e-05, "loss": 3.8969, "step": 1149 }, { "epoch": 0.46, "grad_norm": 3.9152410753151496, "learning_rate": 1.177190991579223e-05, "loss": 3.7882, "step": 1150 }, { "epoch": 0.46, "grad_norm": 3.7833928552124285, "learning_rate": 1.1759158405166446e-05, "loss": 3.8664, "step": 1151 }, { "epoch": 0.46, "grad_norm": 3.217725364864304, "learning_rate": 1.1746403942100215e-05, "loss": 3.807, "step": 1152 }, { "epoch": 0.46, "grad_norm": 3.4669536263663465, "learning_rate": 1.1733646547999678e-05, "loss": 3.7389, "step": 1153 }, { "epoch": 0.46, "grad_norm": 3.4928261841706183, "learning_rate": 1.1720886244275893e-05, "loss": 3.7921, "step": 1154 }, { "epoch": 0.46, "grad_norm": 3.989073154700081, "learning_rate": 1.1708123052344803e-05, "loss": 3.6648, "step": 1155 }, { "epoch": 0.46, "grad_norm": 3.693225989563176, "learning_rate": 1.1695356993627203e-05, "loss": 3.9464, "step": 1156 }, { "epoch": 0.46, "grad_norm": 4.5615429339832385, "learning_rate": 1.1682588089548692e-05, "loss": 3.6917, "step": 1157 }, { "epoch": 0.46, "grad_norm": 3.680963349085811, "learning_rate": 1.1669816361539647e-05, "loss": 3.7782, "step": 1158 }, { "epoch": 0.46, "grad_norm": 3.598393270654093, "learning_rate": 1.1657041831035186e-05, "loss": 3.7299, "step": 1159 }, { "epoch": 0.46, "grad_norm": 4.466531188320706, "learning_rate": 1.164426451947513e-05, "loss": 3.6484, "step": 1160 }, { "epoch": 0.46, "grad_norm": 4.035699717288, "learning_rate": 1.1631484448303964e-05, "loss": 3.62, "step": 1161 }, { "epoch": 0.46, "grad_norm": 4.121097309892934, "learning_rate": 1.1618701638970815e-05, "loss": 3.6402, "step": 1162 }, { "epoch": 0.47, "grad_norm": 4.323877433118681, "learning_rate": 1.1605916112929388e-05, "loss": 3.6339, "step": 1163 }, { "epoch": 0.47, "grad_norm": 4.3841070809544185, "learning_rate": 1.1593127891637968e-05, "loss": 3.8707, "step": 1164 }, { "epoch": 0.47, "grad_norm": 3.361551441812719, "learning_rate": 1.1580336996559343e-05, "loss": 3.702, "step": 1165 }, { "epoch": 0.47, "grad_norm": 3.073065467204595, "learning_rate": 1.156754344916081e-05, "loss": 3.8247, "step": 1166 }, { "epoch": 0.47, "grad_norm": 4.520324161669763, "learning_rate": 1.1554747270914098e-05, "loss": 3.5686, "step": 1167 }, { "epoch": 0.47, "grad_norm": 4.654913388365398, "learning_rate": 1.1541948483295358e-05, "loss": 3.8035, "step": 1168 }, { "epoch": 0.47, "grad_norm": 3.2461626059038178, "learning_rate": 1.1529147107785129e-05, "loss": 3.7129, "step": 1169 }, { "epoch": 0.47, "grad_norm": 4.180515399607351, "learning_rate": 1.151634316586828e-05, "loss": 3.8563, "step": 1170 }, { "epoch": 0.47, "grad_norm": 4.5441403758051315, "learning_rate": 1.1503536679034e-05, "loss": 3.6948, "step": 1171 }, { "epoch": 0.47, "grad_norm": 4.078399152054442, "learning_rate": 1.1490727668775735e-05, "loss": 3.7647, "step": 1172 }, { "epoch": 0.47, "grad_norm": 4.17839698744333, "learning_rate": 1.147791615659118e-05, "loss": 3.6863, "step": 1173 }, { "epoch": 0.47, "grad_norm": 3.411722230592826, "learning_rate": 1.1465102163982218e-05, "loss": 3.7029, "step": 1174 }, { "epoch": 0.47, "grad_norm": 4.572334871578432, "learning_rate": 1.1452285712454905e-05, "loss": 3.7514, "step": 1175 }, { "epoch": 0.47, "grad_norm": 4.010392827821349, "learning_rate": 1.1439466823519414e-05, "loss": 3.7983, "step": 1176 }, { "epoch": 0.47, "grad_norm": 5.15222057397017, "learning_rate": 1.1426645518690015e-05, "loss": 3.6849, "step": 1177 }, { "epoch": 0.47, "grad_norm": 4.519535347933383, "learning_rate": 1.1413821819485035e-05, "loss": 3.6528, "step": 1178 }, { "epoch": 0.47, "grad_norm": 4.14465571981964, "learning_rate": 1.140099574742681e-05, "loss": 3.8408, "step": 1179 }, { "epoch": 0.47, "grad_norm": 3.8855608293576704, "learning_rate": 1.138816732404167e-05, "loss": 3.7254, "step": 1180 }, { "epoch": 0.47, "grad_norm": 3.66606300571839, "learning_rate": 1.1375336570859877e-05, "loss": 3.6236, "step": 1181 }, { "epoch": 0.47, "grad_norm": 4.159327154865983, "learning_rate": 1.136250350941562e-05, "loss": 3.6226, "step": 1182 }, { "epoch": 0.47, "grad_norm": 3.8058559454023757, "learning_rate": 1.1349668161246945e-05, "loss": 3.7024, "step": 1183 }, { "epoch": 0.47, "grad_norm": 4.583400400011528, "learning_rate": 1.1336830547895752e-05, "loss": 3.5792, "step": 1184 }, { "epoch": 0.47, "grad_norm": 3.7655923073287294, "learning_rate": 1.1323990690907734e-05, "loss": 3.7676, "step": 1185 }, { "epoch": 0.47, "grad_norm": 4.466605227178359, "learning_rate": 1.1311148611832346e-05, "loss": 3.7292, "step": 1186 }, { "epoch": 0.47, "grad_norm": 3.7647284604385653, "learning_rate": 1.129830433222278e-05, "loss": 3.9269, "step": 1187 }, { "epoch": 0.48, "grad_norm": 4.733028020978156, "learning_rate": 1.128545787363592e-05, "loss": 3.6541, "step": 1188 }, { "epoch": 0.48, "grad_norm": 5.784316104630818, "learning_rate": 1.1272609257632305e-05, "loss": 3.7619, "step": 1189 }, { "epoch": 0.48, "grad_norm": 3.6535503281391337, "learning_rate": 1.1259758505776092e-05, "loss": 4.0232, "step": 1190 }, { "epoch": 0.48, "grad_norm": 3.9417157274952443, "learning_rate": 1.1246905639635029e-05, "loss": 3.8453, "step": 1191 }, { "epoch": 0.48, "grad_norm": 3.8028096902179827, "learning_rate": 1.1234050680780407e-05, "loss": 3.7516, "step": 1192 }, { "epoch": 0.48, "grad_norm": 5.259264025052849, "learning_rate": 1.1221193650787032e-05, "loss": 3.823, "step": 1193 }, { "epoch": 0.48, "grad_norm": 4.238159290220217, "learning_rate": 1.1208334571233186e-05, "loss": 3.9508, "step": 1194 }, { "epoch": 0.48, "grad_norm": 3.763516705043185, "learning_rate": 1.119547346370059e-05, "loss": 3.6778, "step": 1195 }, { "epoch": 0.48, "grad_norm": 3.801301507686709, "learning_rate": 1.118261034977437e-05, "loss": 3.5536, "step": 1196 }, { "epoch": 0.48, "grad_norm": 4.0995292505809795, "learning_rate": 1.116974525104302e-05, "loss": 3.5847, "step": 1197 }, { "epoch": 0.48, "grad_norm": 4.052881858623986, "learning_rate": 1.1156878189098357e-05, "loss": 3.7869, "step": 1198 }, { "epoch": 0.48, "grad_norm": 3.759889351369552, "learning_rate": 1.114400918553551e-05, "loss": 3.7752, "step": 1199 }, { "epoch": 0.48, "grad_norm": 3.641606335216125, "learning_rate": 1.1131138261952845e-05, "loss": 3.7229, "step": 1200 }, { "epoch": 0.48, "grad_norm": 3.2624956103832945, "learning_rate": 1.1118265439951968e-05, "loss": 3.5944, "step": 1201 }, { "epoch": 0.48, "grad_norm": 4.1450940216557415, "learning_rate": 1.110539074113766e-05, "loss": 3.7048, "step": 1202 }, { "epoch": 0.48, "grad_norm": 3.3447326327359326, "learning_rate": 1.1092514187117865e-05, "loss": 3.6985, "step": 1203 }, { "epoch": 0.48, "grad_norm": 3.5461055814418447, "learning_rate": 1.1079635799503625e-05, "loss": 3.7647, "step": 1204 }, { "epoch": 0.48, "grad_norm": 3.3556065313219903, "learning_rate": 1.1066755599909065e-05, "loss": 3.734, "step": 1205 }, { "epoch": 0.48, "grad_norm": 3.8317618472216592, "learning_rate": 1.1053873609951362e-05, "loss": 3.5126, "step": 1206 }, { "epoch": 0.48, "grad_norm": 3.8356744970212726, "learning_rate": 1.1040989851250678e-05, "loss": 3.8545, "step": 1207 }, { "epoch": 0.48, "grad_norm": 3.8078493058296803, "learning_rate": 1.1028104345430161e-05, "loss": 3.6943, "step": 1208 }, { "epoch": 0.48, "grad_norm": 3.806203159509349, "learning_rate": 1.1015217114115884e-05, "loss": 3.5924, "step": 1209 }, { "epoch": 0.48, "grad_norm": 3.4329163591887006, "learning_rate": 1.1002328178936813e-05, "loss": 3.5889, "step": 1210 }, { "epoch": 0.48, "grad_norm": 3.2311989382515263, "learning_rate": 1.0989437561524776e-05, "loss": 3.8365, "step": 1211 }, { "epoch": 0.48, "grad_norm": 3.7876161541916766, "learning_rate": 1.097654528351443e-05, "loss": 3.7401, "step": 1212 }, { "epoch": 0.49, "grad_norm": 4.450794072226459, "learning_rate": 1.0963651366543214e-05, "loss": 3.6452, "step": 1213 }, { "epoch": 0.49, "grad_norm": 3.4025224897119974, "learning_rate": 1.095075583225131e-05, "loss": 3.7985, "step": 1214 }, { "epoch": 0.49, "grad_norm": 4.266609502557084, "learning_rate": 1.0937858702281631e-05, "loss": 3.653, "step": 1215 }, { "epoch": 0.49, "grad_norm": 4.352530022697998, "learning_rate": 1.0924959998279754e-05, "loss": 3.5712, "step": 1216 }, { "epoch": 0.49, "grad_norm": 4.7920904023153685, "learning_rate": 1.0912059741893908e-05, "loss": 3.5353, "step": 1217 }, { "epoch": 0.49, "grad_norm": 4.27009447990584, "learning_rate": 1.089915795477492e-05, "loss": 3.6817, "step": 1218 }, { "epoch": 0.49, "grad_norm": 4.093104784383785, "learning_rate": 1.0886254658576186e-05, "loss": 3.5539, "step": 1219 }, { "epoch": 0.49, "grad_norm": 5.121766404390022, "learning_rate": 1.087334987495364e-05, "loss": 3.6973, "step": 1220 }, { "epoch": 0.49, "grad_norm": 3.6097287264286777, "learning_rate": 1.0860443625565712e-05, "loss": 3.7054, "step": 1221 }, { "epoch": 0.49, "grad_norm": 4.099101696343389, "learning_rate": 1.0847535932073288e-05, "loss": 3.821, "step": 1222 }, { "epoch": 0.49, "grad_norm": 3.7239613892649963, "learning_rate": 1.0834626816139678e-05, "loss": 3.8649, "step": 1223 }, { "epoch": 0.49, "grad_norm": 3.927013781158877, "learning_rate": 1.0821716299430577e-05, "loss": 3.7919, "step": 1224 }, { "epoch": 0.49, "grad_norm": 3.926068050669173, "learning_rate": 1.0808804403614044e-05, "loss": 3.7943, "step": 1225 }, { "epoch": 0.49, "grad_norm": 3.9343822616688335, "learning_rate": 1.0795891150360435e-05, "loss": 3.6685, "step": 1226 }, { "epoch": 0.49, "grad_norm": 3.816742376232799, "learning_rate": 1.0782976561342398e-05, "loss": 3.5803, "step": 1227 }, { "epoch": 0.49, "grad_norm": 3.3950844811577676, "learning_rate": 1.0770060658234815e-05, "loss": 3.6176, "step": 1228 }, { "epoch": 0.49, "grad_norm": 3.9333804501497704, "learning_rate": 1.0757143462714777e-05, "loss": 3.5919, "step": 1229 }, { "epoch": 0.49, "grad_norm": 4.418557233366642, "learning_rate": 1.0744224996461541e-05, "loss": 3.6716, "step": 1230 }, { "epoch": 0.49, "grad_norm": 3.7898331498735236, "learning_rate": 1.0731305281156499e-05, "loss": 3.6412, "step": 1231 }, { "epoch": 0.49, "grad_norm": 3.7969582329269076, "learning_rate": 1.0718384338483141e-05, "loss": 3.7543, "step": 1232 }, { "epoch": 0.49, "grad_norm": 4.148636874878519, "learning_rate": 1.0705462190127011e-05, "loss": 3.7497, "step": 1233 }, { "epoch": 0.49, "grad_norm": 3.3035604956864364, "learning_rate": 1.0692538857775685e-05, "loss": 3.8701, "step": 1234 }, { "epoch": 0.49, "grad_norm": 4.008854288704277, "learning_rate": 1.0679614363118718e-05, "loss": 3.4629, "step": 1235 }, { "epoch": 0.49, "grad_norm": 3.8217933247978877, "learning_rate": 1.066668872784762e-05, "loss": 3.599, "step": 1236 }, { "epoch": 0.49, "grad_norm": 3.505699888668807, "learning_rate": 1.0653761973655819e-05, "loss": 3.61, "step": 1237 }, { "epoch": 0.5, "grad_norm": 3.630452722607848, "learning_rate": 1.0640834122238606e-05, "loss": 3.6772, "step": 1238 }, { "epoch": 0.5, "grad_norm": 3.4696551759013063, "learning_rate": 1.0627905195293135e-05, "loss": 3.8177, "step": 1239 }, { "epoch": 0.5, "grad_norm": 3.600962380822384, "learning_rate": 1.061497521451835e-05, "loss": 3.7467, "step": 1240 }, { "epoch": 0.5, "grad_norm": 3.5615429684192623, "learning_rate": 1.0602044201614965e-05, "loss": 3.7444, "step": 1241 }, { "epoch": 0.5, "grad_norm": 3.6893702737397125, "learning_rate": 1.0589112178285432e-05, "loss": 3.7873, "step": 1242 }, { "epoch": 0.5, "grad_norm": 4.088874847636349, "learning_rate": 1.0576179166233895e-05, "loss": 3.7347, "step": 1243 }, { "epoch": 0.5, "grad_norm": 3.6194124320453978, "learning_rate": 1.056324518716616e-05, "loss": 3.6616, "step": 1244 }, { "epoch": 0.5, "grad_norm": 3.709738290667363, "learning_rate": 1.055031026278965e-05, "loss": 3.5815, "step": 1245 }, { "epoch": 0.5, "grad_norm": 4.3126884197713675, "learning_rate": 1.0537374414813384e-05, "loss": 3.7129, "step": 1246 }, { "epoch": 0.5, "grad_norm": 4.126610206839559, "learning_rate": 1.0524437664947918e-05, "loss": 3.8666, "step": 1247 }, { "epoch": 0.5, "grad_norm": 4.228485912654699, "learning_rate": 1.051150003490534e-05, "loss": 3.8475, "step": 1248 }, { "epoch": 0.5, "grad_norm": 4.414002348692545, "learning_rate": 1.0498561546399194e-05, "loss": 3.7105, "step": 1249 }, { "epoch": 0.5, "grad_norm": 3.544218783563698, "learning_rate": 1.0485622221144485e-05, "loss": 3.6683, "step": 1250 }, { "epoch": 0.5, "grad_norm": 3.6293603164028023, "learning_rate": 1.0472682080857606e-05, "loss": 3.7261, "step": 1251 }, { "epoch": 0.5, "grad_norm": 4.119603916787359, "learning_rate": 1.0459741147256325e-05, "loss": 3.645, "step": 1252 }, { "epoch": 0.5, "grad_norm": 3.309856421704084, "learning_rate": 1.044679944205975e-05, "loss": 3.7916, "step": 1253 }, { "epoch": 0.5, "grad_norm": 3.3126208630112717, "learning_rate": 1.043385698698826e-05, "loss": 3.5428, "step": 1254 }, { "epoch": 0.5, "grad_norm": 3.435020152190706, "learning_rate": 1.0420913803763522e-05, "loss": 3.6746, "step": 1255 }, { "epoch": 0.5, "grad_norm": 3.442250663980389, "learning_rate": 1.04079699141084e-05, "loss": 3.8303, "step": 1256 }, { "epoch": 0.5, "grad_norm": 3.6876733935381405, "learning_rate": 1.0395025339746965e-05, "loss": 3.7032, "step": 1257 }, { "epoch": 0.5, "grad_norm": 3.680657228628987, "learning_rate": 1.0382080102404417e-05, "loss": 3.6895, "step": 1258 }, { "epoch": 0.5, "grad_norm": 3.4319284847026, "learning_rate": 1.0369134223807082e-05, "loss": 3.5902, "step": 1259 }, { "epoch": 0.5, "grad_norm": 3.146045404231594, "learning_rate": 1.0356187725682359e-05, "loss": 3.7994, "step": 1260 }, { "epoch": 0.5, "grad_norm": 3.279354700190127, "learning_rate": 1.0343240629758683e-05, "loss": 3.6323, "step": 1261 }, { "epoch": 0.5, "grad_norm": 3.6423447713494674, "learning_rate": 1.0330292957765502e-05, "loss": 3.7522, "step": 1262 }, { "epoch": 0.51, "grad_norm": 3.9929601866704547, "learning_rate": 1.0317344731433217e-05, "loss": 3.6607, "step": 1263 }, { "epoch": 0.51, "grad_norm": 3.884660944168367, "learning_rate": 1.0304395972493172e-05, "loss": 3.9169, "step": 1264 }, { "epoch": 0.51, "grad_norm": 3.4149693145698077, "learning_rate": 1.0291446702677598e-05, "loss": 3.6962, "step": 1265 }, { "epoch": 0.51, "grad_norm": 3.45271671752356, "learning_rate": 1.0278496943719585e-05, "loss": 3.7441, "step": 1266 }, { "epoch": 0.51, "grad_norm": 3.6133398318052428, "learning_rate": 1.0265546717353041e-05, "loss": 3.683, "step": 1267 }, { "epoch": 0.51, "grad_norm": 3.371173249364838, "learning_rate": 1.0252596045312666e-05, "loss": 3.6712, "step": 1268 }, { "epoch": 0.51, "grad_norm": 3.83144634459696, "learning_rate": 1.02396449493339e-05, "loss": 3.6621, "step": 1269 }, { "epoch": 0.51, "grad_norm": 3.5042655204060793, "learning_rate": 1.02266934511529e-05, "loss": 3.6081, "step": 1270 }, { "epoch": 0.51, "grad_norm": 3.901785995386197, "learning_rate": 1.0213741572506497e-05, "loss": 3.5902, "step": 1271 }, { "epoch": 0.51, "grad_norm": 3.597912776915605, "learning_rate": 1.0200789335132157e-05, "loss": 3.6509, "step": 1272 }, { "epoch": 0.51, "grad_norm": 3.576944142026537, "learning_rate": 1.0187836760767954e-05, "loss": 3.6438, "step": 1273 }, { "epoch": 0.51, "grad_norm": 4.171711826080111, "learning_rate": 1.0174883871152517e-05, "loss": 3.6199, "step": 1274 }, { "epoch": 0.51, "grad_norm": 3.997960682784523, "learning_rate": 1.0161930688025018e-05, "loss": 3.597, "step": 1275 }, { "epoch": 0.51, "grad_norm": 4.118551729202294, "learning_rate": 1.014897723312511e-05, "loss": 3.7327, "step": 1276 }, { "epoch": 0.51, "grad_norm": 4.115281655837042, "learning_rate": 1.013602352819291e-05, "loss": 3.6384, "step": 1277 }, { "epoch": 0.51, "grad_norm": 3.8326653216963527, "learning_rate": 1.0123069594968952e-05, "loss": 3.6624, "step": 1278 }, { "epoch": 0.51, "grad_norm": 6.420306455859883, "learning_rate": 1.0110115455194157e-05, "loss": 3.6849, "step": 1279 }, { "epoch": 0.51, "grad_norm": 3.8078583089194438, "learning_rate": 1.0097161130609774e-05, "loss": 3.6237, "step": 1280 }, { "epoch": 0.51, "grad_norm": 3.4296770703932693, "learning_rate": 1.0084206642957393e-05, "loss": 3.772, "step": 1281 }, { "epoch": 0.51, "grad_norm": 4.127642158329446, "learning_rate": 1.0071252013978852e-05, "loss": 3.571, "step": 1282 }, { "epoch": 0.51, "grad_norm": 3.7661312311309287, "learning_rate": 1.0058297265416234e-05, "loss": 3.6234, "step": 1283 }, { "epoch": 0.51, "grad_norm": 3.543027913063287, "learning_rate": 1.0045342419011832e-05, "loss": 3.5259, "step": 1284 }, { "epoch": 0.51, "grad_norm": 4.157795877165396, "learning_rate": 1.003238749650809e-05, "loss": 3.5826, "step": 1285 }, { "epoch": 0.51, "grad_norm": 3.9799792666825966, "learning_rate": 1.0019432519647585e-05, "loss": 3.6644, "step": 1286 }, { "epoch": 0.51, "grad_norm": 3.922820795521154, "learning_rate": 1.0006477510172984e-05, "loss": 3.7645, "step": 1287 }, { "epoch": 0.52, "grad_norm": 3.2795127075695625, "learning_rate": 9.993522489827016e-06, "loss": 3.7082, "step": 1288 }, { "epoch": 0.52, "grad_norm": 4.160932501828583, "learning_rate": 9.980567480352417e-06, "loss": 3.7132, "step": 1289 }, { "epoch": 0.52, "grad_norm": 3.9551672926611743, "learning_rate": 9.967612503491915e-06, "loss": 3.6536, "step": 1290 }, { "epoch": 0.52, "grad_norm": 3.567688655247914, "learning_rate": 9.954657580988171e-06, "loss": 3.6158, "step": 1291 }, { "epoch": 0.52, "grad_norm": 4.504476394633079, "learning_rate": 9.941702734583771e-06, "loss": 3.6681, "step": 1292 }, { "epoch": 0.52, "grad_norm": 3.8738546664048155, "learning_rate": 9.928747986021153e-06, "loss": 3.9168, "step": 1293 }, { "epoch": 0.52, "grad_norm": 3.681866936469431, "learning_rate": 9.91579335704261e-06, "loss": 3.5911, "step": 1294 }, { "epoch": 0.52, "grad_norm": 3.8543107213799708, "learning_rate": 9.90283886939023e-06, "loss": 3.649, "step": 1295 }, { "epoch": 0.52, "grad_norm": 4.213819491800523, "learning_rate": 9.88988454480585e-06, "loss": 3.6681, "step": 1296 }, { "epoch": 0.52, "grad_norm": 3.672911779908602, "learning_rate": 9.876930405031047e-06, "loss": 3.4539, "step": 1297 }, { "epoch": 0.52, "grad_norm": 4.242243787102731, "learning_rate": 9.86397647180709e-06, "loss": 3.6487, "step": 1298 }, { "epoch": 0.52, "grad_norm": 3.2004720405539255, "learning_rate": 9.851022766874892e-06, "loss": 3.8127, "step": 1299 }, { "epoch": 0.52, "grad_norm": 4.09162308157697, "learning_rate": 9.838069311974986e-06, "loss": 3.6898, "step": 1300 }, { "epoch": 0.52, "grad_norm": 3.548750561025053, "learning_rate": 9.825116128847488e-06, "loss": 3.6147, "step": 1301 }, { "epoch": 0.52, "grad_norm": 3.8328336344463603, "learning_rate": 9.812163239232051e-06, "loss": 3.5539, "step": 1302 }, { "epoch": 0.52, "grad_norm": 3.2443184038824855, "learning_rate": 9.799210664867844e-06, "loss": 3.6883, "step": 1303 }, { "epoch": 0.52, "grad_norm": 3.878823223982976, "learning_rate": 9.786258427493505e-06, "loss": 3.823, "step": 1304 }, { "epoch": 0.52, "grad_norm": 3.4116361274295186, "learning_rate": 9.773306548847102e-06, "loss": 3.578, "step": 1305 }, { "epoch": 0.52, "grad_norm": 4.24034110874559, "learning_rate": 9.760355050666102e-06, "loss": 3.727, "step": 1306 }, { "epoch": 0.52, "grad_norm": 4.634306245889952, "learning_rate": 9.747403954687334e-06, "loss": 3.7736, "step": 1307 }, { "epoch": 0.52, "grad_norm": 3.749776046989676, "learning_rate": 9.734453282646962e-06, "loss": 3.6961, "step": 1308 }, { "epoch": 0.52, "grad_norm": 3.932417095522838, "learning_rate": 9.721503056280418e-06, "loss": 3.6418, "step": 1309 }, { "epoch": 0.52, "grad_norm": 4.542879382334435, "learning_rate": 9.708553297322407e-06, "loss": 3.5859, "step": 1310 }, { "epoch": 0.52, "grad_norm": 3.5258395671961127, "learning_rate": 9.69560402750683e-06, "loss": 3.7562, "step": 1311 }, { "epoch": 0.52, "grad_norm": 4.135406118551599, "learning_rate": 9.682655268566783e-06, "loss": 3.5696, "step": 1312 }, { "epoch": 0.53, "grad_norm": 3.8528419671684597, "learning_rate": 9.669707042234502e-06, "loss": 3.6807, "step": 1313 }, { "epoch": 0.53, "grad_norm": 4.201555676577082, "learning_rate": 9.656759370241318e-06, "loss": 3.5825, "step": 1314 }, { "epoch": 0.53, "grad_norm": 3.64562077172273, "learning_rate": 9.643812274317644e-06, "loss": 3.4712, "step": 1315 }, { "epoch": 0.53, "grad_norm": 3.8396792440839103, "learning_rate": 9.630865776192918e-06, "loss": 3.7105, "step": 1316 }, { "epoch": 0.53, "grad_norm": 4.4577250263526444, "learning_rate": 9.617919897595586e-06, "loss": 3.5812, "step": 1317 }, { "epoch": 0.53, "grad_norm": 3.39166849313604, "learning_rate": 9.604974660253039e-06, "loss": 3.5843, "step": 1318 }, { "epoch": 0.53, "grad_norm": 3.1918890159235995, "learning_rate": 9.592030085891602e-06, "loss": 3.6049, "step": 1319 }, { "epoch": 0.53, "grad_norm": 3.664646495100284, "learning_rate": 9.579086196236483e-06, "loss": 3.6161, "step": 1320 }, { "epoch": 0.53, "grad_norm": 4.318782755233096, "learning_rate": 9.56614301301174e-06, "loss": 3.8907, "step": 1321 }, { "epoch": 0.53, "grad_norm": 3.1588859391321322, "learning_rate": 9.553200557940254e-06, "loss": 3.7873, "step": 1322 }, { "epoch": 0.53, "grad_norm": 3.967215300253095, "learning_rate": 9.540258852743676e-06, "loss": 3.5864, "step": 1323 }, { "epoch": 0.53, "grad_norm": 3.941485118974778, "learning_rate": 9.527317919142398e-06, "loss": 3.9642, "step": 1324 }, { "epoch": 0.53, "grad_norm": 3.592458924548226, "learning_rate": 9.514377778855521e-06, "loss": 3.3537, "step": 1325 }, { "epoch": 0.53, "grad_norm": 3.7732706931466202, "learning_rate": 9.501438453600808e-06, "loss": 3.6944, "step": 1326 }, { "epoch": 0.53, "grad_norm": 3.7412543307988915, "learning_rate": 9.488499965094664e-06, "loss": 3.6244, "step": 1327 }, { "epoch": 0.53, "grad_norm": 3.9239890999288694, "learning_rate": 9.475562335052086e-06, "loss": 3.5095, "step": 1328 }, { "epoch": 0.53, "grad_norm": 3.5112467552110247, "learning_rate": 9.462625585186621e-06, "loss": 3.7438, "step": 1329 }, { "epoch": 0.53, "grad_norm": 3.6543160157821206, "learning_rate": 9.449689737210352e-06, "loss": 3.6419, "step": 1330 }, { "epoch": 0.53, "grad_norm": 3.988862136866014, "learning_rate": 9.436754812833843e-06, "loss": 3.6131, "step": 1331 }, { "epoch": 0.53, "grad_norm": 3.3299291715841157, "learning_rate": 9.423820833766108e-06, "loss": 3.7091, "step": 1332 }, { "epoch": 0.53, "grad_norm": 3.9403033777529255, "learning_rate": 9.410887821714571e-06, "loss": 3.6045, "step": 1333 }, { "epoch": 0.53, "grad_norm": 3.8157108095673604, "learning_rate": 9.39795579838504e-06, "loss": 3.5397, "step": 1334 }, { "epoch": 0.53, "grad_norm": 3.2338840832210667, "learning_rate": 9.385024785481653e-06, "loss": 3.6673, "step": 1335 }, { "epoch": 0.53, "grad_norm": 3.6445781586174437, "learning_rate": 9.372094804706867e-06, "loss": 3.5268, "step": 1336 }, { "epoch": 0.53, "grad_norm": 3.9066716946435434, "learning_rate": 9.359165877761396e-06, "loss": 3.6854, "step": 1337 }, { "epoch": 0.54, "grad_norm": 3.2966925584112547, "learning_rate": 9.346238026344186e-06, "loss": 3.5206, "step": 1338 }, { "epoch": 0.54, "grad_norm": 3.299893845106976, "learning_rate": 9.333311272152385e-06, "loss": 3.6807, "step": 1339 }, { "epoch": 0.54, "grad_norm": 3.899885401635834, "learning_rate": 9.320385636881283e-06, "loss": 3.5119, "step": 1340 }, { "epoch": 0.54, "grad_norm": 4.026905163814134, "learning_rate": 9.307461142224318e-06, "loss": 3.442, "step": 1341 }, { "epoch": 0.54, "grad_norm": 3.880216888977279, "learning_rate": 9.29453780987299e-06, "loss": 3.5532, "step": 1342 }, { "epoch": 0.54, "grad_norm": 4.387280773854988, "learning_rate": 9.281615661516866e-06, "loss": 3.7189, "step": 1343 }, { "epoch": 0.54, "grad_norm": 3.6555244517963943, "learning_rate": 9.268694718843503e-06, "loss": 3.6117, "step": 1344 }, { "epoch": 0.54, "grad_norm": 4.277666315779683, "learning_rate": 9.255775003538462e-06, "loss": 3.4797, "step": 1345 }, { "epoch": 0.54, "grad_norm": 3.6156687964961898, "learning_rate": 9.242856537285227e-06, "loss": 3.6104, "step": 1346 }, { "epoch": 0.54, "grad_norm": 3.5831485134788332, "learning_rate": 9.229939341765188e-06, "loss": 3.7844, "step": 1347 }, { "epoch": 0.54, "grad_norm": 3.6448623997034795, "learning_rate": 9.217023438657606e-06, "loss": 3.483, "step": 1348 }, { "epoch": 0.54, "grad_norm": 4.37271213698544, "learning_rate": 9.204108849639565e-06, "loss": 3.7646, "step": 1349 }, { "epoch": 0.54, "grad_norm": 3.945295172360582, "learning_rate": 9.19119559638596e-06, "loss": 3.5386, "step": 1350 }, { "epoch": 0.54, "grad_norm": 3.2435047963682893, "learning_rate": 9.178283700569424e-06, "loss": 3.5083, "step": 1351 }, { "epoch": 0.54, "grad_norm": 3.6441695000819863, "learning_rate": 9.165373183860329e-06, "loss": 3.6747, "step": 1352 }, { "epoch": 0.54, "grad_norm": 3.758895378834221, "learning_rate": 9.152464067926717e-06, "loss": 3.5133, "step": 1353 }, { "epoch": 0.54, "grad_norm": 3.8045119099664175, "learning_rate": 9.139556374434288e-06, "loss": 3.6009, "step": 1354 }, { "epoch": 0.54, "grad_norm": 3.862260842586843, "learning_rate": 9.126650125046361e-06, "loss": 3.4762, "step": 1355 }, { "epoch": 0.54, "grad_norm": 3.5145809371669103, "learning_rate": 9.113745341423816e-06, "loss": 3.6713, "step": 1356 }, { "epoch": 0.54, "grad_norm": 3.828220691263425, "learning_rate": 9.100842045225084e-06, "loss": 3.7191, "step": 1357 }, { "epoch": 0.54, "grad_norm": 3.774746236211917, "learning_rate": 9.087940258106093e-06, "loss": 3.4438, "step": 1358 }, { "epoch": 0.54, "grad_norm": 3.7795138083033657, "learning_rate": 9.075040001720247e-06, "loss": 3.4842, "step": 1359 }, { "epoch": 0.54, "grad_norm": 3.94093397333636, "learning_rate": 9.062141297718372e-06, "loss": 3.6428, "step": 1360 }, { "epoch": 0.54, "grad_norm": 4.471037661246279, "learning_rate": 9.049244167748694e-06, "loss": 3.5629, "step": 1361 }, { "epoch": 0.54, "grad_norm": 3.392239101547643, "learning_rate": 9.036348633456791e-06, "loss": 3.5099, "step": 1362 }, { "epoch": 0.55, "grad_norm": 3.437481937306805, "learning_rate": 9.023454716485572e-06, "loss": 3.6523, "step": 1363 }, { "epoch": 0.55, "grad_norm": 4.716482112384828, "learning_rate": 9.010562438475225e-06, "loss": 3.6393, "step": 1364 }, { "epoch": 0.55, "grad_norm": 4.20173417003461, "learning_rate": 8.99767182106319e-06, "loss": 3.4431, "step": 1365 }, { "epoch": 0.55, "grad_norm": 3.601247359299966, "learning_rate": 8.984782885884119e-06, "loss": 3.5896, "step": 1366 }, { "epoch": 0.55, "grad_norm": 3.4135121264981185, "learning_rate": 8.971895654569842e-06, "loss": 3.7265, "step": 1367 }, { "epoch": 0.55, "grad_norm": 3.9001461297693223, "learning_rate": 8.959010148749324e-06, "loss": 3.5086, "step": 1368 }, { "epoch": 0.55, "grad_norm": 4.03454895841195, "learning_rate": 8.94612639004864e-06, "loss": 3.677, "step": 1369 }, { "epoch": 0.55, "grad_norm": 3.650697251071297, "learning_rate": 8.933244400090937e-06, "loss": 3.6443, "step": 1370 }, { "epoch": 0.55, "grad_norm": 3.46434005555104, "learning_rate": 8.92036420049638e-06, "loss": 3.5778, "step": 1371 }, { "epoch": 0.55, "grad_norm": 3.93551298155282, "learning_rate": 8.907485812882137e-06, "loss": 3.444, "step": 1372 }, { "epoch": 0.55, "grad_norm": 4.137595514523684, "learning_rate": 8.89460925886234e-06, "loss": 3.5835, "step": 1373 }, { "epoch": 0.55, "grad_norm": 3.1221207733061735, "learning_rate": 8.881734560048037e-06, "loss": 3.4763, "step": 1374 }, { "epoch": 0.55, "grad_norm": 3.6063693650856203, "learning_rate": 8.868861738047158e-06, "loss": 3.512, "step": 1375 }, { "epoch": 0.55, "grad_norm": 3.8006663498045854, "learning_rate": 8.855990814464497e-06, "loss": 3.6161, "step": 1376 }, { "epoch": 0.55, "grad_norm": 3.689346473741649, "learning_rate": 8.843121810901643e-06, "loss": 3.4761, "step": 1377 }, { "epoch": 0.55, "grad_norm": 3.5443205545427614, "learning_rate": 8.830254748956983e-06, "loss": 3.5129, "step": 1378 }, { "epoch": 0.55, "grad_norm": 4.461742310621672, "learning_rate": 8.817389650225631e-06, "loss": 3.5642, "step": 1379 }, { "epoch": 0.55, "grad_norm": 4.0981378966007, "learning_rate": 8.804526536299413e-06, "loss": 3.5141, "step": 1380 }, { "epoch": 0.55, "grad_norm": 3.8550298590996155, "learning_rate": 8.79166542876682e-06, "loss": 3.5965, "step": 1381 }, { "epoch": 0.55, "grad_norm": 4.045038429383355, "learning_rate": 8.778806349212968e-06, "loss": 3.6084, "step": 1382 }, { "epoch": 0.55, "grad_norm": 3.955366018201757, "learning_rate": 8.765949319219595e-06, "loss": 3.5752, "step": 1383 }, { "epoch": 0.55, "grad_norm": 3.457754380490334, "learning_rate": 8.753094360364973e-06, "loss": 3.7728, "step": 1384 }, { "epoch": 0.55, "grad_norm": 3.786851175476542, "learning_rate": 8.740241494223911e-06, "loss": 3.5675, "step": 1385 }, { "epoch": 0.55, "grad_norm": 3.660440730641834, "learning_rate": 8.727390742367698e-06, "loss": 3.6431, "step": 1386 }, { "epoch": 0.55, "grad_norm": 4.084737719287929, "learning_rate": 8.71454212636408e-06, "loss": 3.5924, "step": 1387 }, { "epoch": 0.56, "grad_norm": 3.2622905116568877, "learning_rate": 8.701695667777221e-06, "loss": 3.7183, "step": 1388 }, { "epoch": 0.56, "grad_norm": 3.55476228024875, "learning_rate": 8.688851388167658e-06, "loss": 3.4861, "step": 1389 }, { "epoch": 0.56, "grad_norm": 3.6473137503472546, "learning_rate": 8.676009309092273e-06, "loss": 3.6158, "step": 1390 }, { "epoch": 0.56, "grad_norm": 3.7945829817862524, "learning_rate": 8.663169452104248e-06, "loss": 3.6302, "step": 1391 }, { "epoch": 0.56, "grad_norm": 3.640805329322732, "learning_rate": 8.650331838753057e-06, "loss": 3.7547, "step": 1392 }, { "epoch": 0.56, "grad_norm": 3.8124210910003455, "learning_rate": 8.637496490584385e-06, "loss": 3.5456, "step": 1393 }, { "epoch": 0.56, "grad_norm": 3.9704591066995767, "learning_rate": 8.624663429140128e-06, "loss": 3.4268, "step": 1394 }, { "epoch": 0.56, "grad_norm": 4.049723146662677, "learning_rate": 8.611832675958335e-06, "loss": 3.5364, "step": 1395 }, { "epoch": 0.56, "grad_norm": 3.533547042548108, "learning_rate": 8.599004252573191e-06, "loss": 3.6708, "step": 1396 }, { "epoch": 0.56, "grad_norm": 4.58267549077645, "learning_rate": 8.586178180514968e-06, "loss": 3.5749, "step": 1397 }, { "epoch": 0.56, "grad_norm": 4.9116869644342795, "learning_rate": 8.573354481309986e-06, "loss": 3.5463, "step": 1398 }, { "epoch": 0.56, "grad_norm": 4.055182017276201, "learning_rate": 8.560533176480588e-06, "loss": 3.8375, "step": 1399 }, { "epoch": 0.56, "grad_norm": 3.627516942271804, "learning_rate": 8.5477142875451e-06, "loss": 3.5717, "step": 1400 }, { "epoch": 0.56, "grad_norm": 4.812255836731826, "learning_rate": 8.534897836017784e-06, "loss": 3.5476, "step": 1401 }, { "epoch": 0.56, "grad_norm": 5.078113719579199, "learning_rate": 8.522083843408823e-06, "loss": 3.6681, "step": 1402 }, { "epoch": 0.56, "grad_norm": 3.5598802470842936, "learning_rate": 8.50927233122427e-06, "loss": 3.3416, "step": 1403 }, { "epoch": 0.56, "grad_norm": 3.648309157965994, "learning_rate": 8.496463320966004e-06, "loss": 3.5791, "step": 1404 }, { "epoch": 0.56, "grad_norm": 3.7101771856350076, "learning_rate": 8.48365683413172e-06, "loss": 3.674, "step": 1405 }, { "epoch": 0.56, "grad_norm": 4.9313354547428485, "learning_rate": 8.470852892214875e-06, "loss": 3.6009, "step": 1406 }, { "epoch": 0.56, "grad_norm": 3.395437754771472, "learning_rate": 8.458051516704644e-06, "loss": 3.6776, "step": 1407 }, { "epoch": 0.56, "grad_norm": 3.269641226339698, "learning_rate": 8.445252729085907e-06, "loss": 3.5882, "step": 1408 }, { "epoch": 0.56, "grad_norm": 3.238326516283703, "learning_rate": 8.432456550839196e-06, "loss": 3.8776, "step": 1409 }, { "epoch": 0.56, "grad_norm": 3.850496269302347, "learning_rate": 8.419663003440657e-06, "loss": 3.5365, "step": 1410 }, { "epoch": 0.56, "grad_norm": 3.651812598740248, "learning_rate": 8.406872108362034e-06, "loss": 3.6866, "step": 1411 }, { "epoch": 0.56, "grad_norm": 4.310329473688571, "learning_rate": 8.394083887070614e-06, "loss": 3.5812, "step": 1412 }, { "epoch": 0.57, "grad_norm": 3.1603001545864196, "learning_rate": 8.38129836102919e-06, "loss": 3.6525, "step": 1413 }, { "epoch": 0.57, "grad_norm": 4.259331885962401, "learning_rate": 8.36851555169604e-06, "loss": 3.3805, "step": 1414 }, { "epoch": 0.57, "grad_norm": 3.6620436971814896, "learning_rate": 8.355735480524874e-06, "loss": 3.5445, "step": 1415 }, { "epoch": 0.57, "grad_norm": 4.124009902890339, "learning_rate": 8.342958168964816e-06, "loss": 3.6279, "step": 1416 }, { "epoch": 0.57, "grad_norm": 3.515479094471101, "learning_rate": 8.330183638460356e-06, "loss": 3.3126, "step": 1417 }, { "epoch": 0.57, "grad_norm": 3.510007177496425, "learning_rate": 8.317411910451313e-06, "loss": 3.5948, "step": 1418 }, { "epoch": 0.57, "grad_norm": 3.9870743550694567, "learning_rate": 8.304643006372797e-06, "loss": 3.5617, "step": 1419 }, { "epoch": 0.57, "grad_norm": 3.31871331046679, "learning_rate": 8.291876947655197e-06, "loss": 3.6182, "step": 1420 }, { "epoch": 0.57, "grad_norm": 3.657244073140862, "learning_rate": 8.27911375572411e-06, "loss": 3.4358, "step": 1421 }, { "epoch": 0.57, "grad_norm": 3.6885031118944838, "learning_rate": 8.266353452000326e-06, "loss": 3.4941, "step": 1422 }, { "epoch": 0.57, "grad_norm": 3.4968539016285725, "learning_rate": 8.253596057899788e-06, "loss": 3.4569, "step": 1423 }, { "epoch": 0.57, "grad_norm": 4.241081464957698, "learning_rate": 8.240841594833554e-06, "loss": 3.5976, "step": 1424 }, { "epoch": 0.57, "grad_norm": 3.992209502684881, "learning_rate": 8.228090084207773e-06, "loss": 3.5895, "step": 1425 }, { "epoch": 0.57, "grad_norm": 3.6248505694886615, "learning_rate": 8.215341547423624e-06, "loss": 3.7637, "step": 1426 }, { "epoch": 0.57, "grad_norm": 3.548230202941458, "learning_rate": 8.202596005877307e-06, "loss": 3.6086, "step": 1427 }, { "epoch": 0.57, "grad_norm": 3.8281555139288903, "learning_rate": 8.189853480959982e-06, "loss": 3.6207, "step": 1428 }, { "epoch": 0.57, "grad_norm": 3.5927761035877155, "learning_rate": 8.177113994057756e-06, "loss": 3.5961, "step": 1429 }, { "epoch": 0.57, "grad_norm": 3.296788483429141, "learning_rate": 8.16437756655164e-06, "loss": 3.6734, "step": 1430 }, { "epoch": 0.57, "grad_norm": 3.7427103365741834, "learning_rate": 8.1516442198175e-06, "loss": 3.6577, "step": 1431 }, { "epoch": 0.57, "grad_norm": 3.7415928654401127, "learning_rate": 8.138913975226044e-06, "loss": 3.4335, "step": 1432 }, { "epoch": 0.57, "grad_norm": 3.7793528215077097, "learning_rate": 8.126186854142752e-06, "loss": 3.4863, "step": 1433 }, { "epoch": 0.57, "grad_norm": 3.2918021656616068, "learning_rate": 8.113462877927893e-06, "loss": 3.4836, "step": 1434 }, { "epoch": 0.57, "grad_norm": 3.658566957717168, "learning_rate": 8.100742067936432e-06, "loss": 3.3866, "step": 1435 }, { "epoch": 0.57, "grad_norm": 3.6849277344716795, "learning_rate": 8.088024445518033e-06, "loss": 3.6564, "step": 1436 }, { "epoch": 0.57, "grad_norm": 3.935693476386245, "learning_rate": 8.075310032017e-06, "loss": 3.6148, "step": 1437 }, { "epoch": 0.58, "grad_norm": 3.899449265414525, "learning_rate": 8.062598848772261e-06, "loss": 3.5728, "step": 1438 }, { "epoch": 0.58, "grad_norm": 4.121515481621947, "learning_rate": 8.049890917117322e-06, "loss": 3.6736, "step": 1439 }, { "epoch": 0.58, "grad_norm": 3.806565706148004, "learning_rate": 8.037186258380226e-06, "loss": 3.6357, "step": 1440 }, { "epoch": 0.58, "grad_norm": 5.195115835009729, "learning_rate": 8.02448489388353e-06, "loss": 3.5017, "step": 1441 }, { "epoch": 0.58, "grad_norm": 4.586544394139627, "learning_rate": 8.01178684494425e-06, "loss": 3.685, "step": 1442 }, { "epoch": 0.58, "grad_norm": 3.966560123803234, "learning_rate": 7.999092132873851e-06, "loss": 3.5741, "step": 1443 }, { "epoch": 0.58, "grad_norm": 4.172759572645984, "learning_rate": 7.986400778978192e-06, "loss": 3.6334, "step": 1444 }, { "epoch": 0.58, "grad_norm": 4.244342787839209, "learning_rate": 7.9737128045575e-06, "loss": 3.8758, "step": 1445 }, { "epoch": 0.58, "grad_norm": 5.992017579746834, "learning_rate": 7.96102823090632e-06, "loss": 3.6978, "step": 1446 }, { "epoch": 0.58, "grad_norm": 4.134386091152547, "learning_rate": 7.948347079313494e-06, "loss": 3.6387, "step": 1447 }, { "epoch": 0.58, "grad_norm": 4.282185418813217, "learning_rate": 7.935669371062132e-06, "loss": 3.7487, "step": 1448 }, { "epoch": 0.58, "grad_norm": 4.552626650054134, "learning_rate": 7.922995127429547e-06, "loss": 3.7081, "step": 1449 }, { "epoch": 0.58, "grad_norm": 4.447248945720088, "learning_rate": 7.91032436968725e-06, "loss": 3.3661, "step": 1450 }, { "epoch": 0.58, "grad_norm": 4.113127895541361, "learning_rate": 7.897657119100896e-06, "loss": 3.349, "step": 1451 }, { "epoch": 0.58, "grad_norm": 3.3860791856708996, "learning_rate": 7.88499339693025e-06, "loss": 3.7081, "step": 1452 }, { "epoch": 0.58, "grad_norm": 4.2905573209490475, "learning_rate": 7.872333224429166e-06, "loss": 3.6622, "step": 1453 }, { "epoch": 0.58, "grad_norm": 4.364258205258447, "learning_rate": 7.859676622845535e-06, "loss": 3.6177, "step": 1454 }, { "epoch": 0.58, "grad_norm": 3.9009187557616913, "learning_rate": 7.847023613421251e-06, "loss": 3.6914, "step": 1455 }, { "epoch": 0.58, "grad_norm": 3.5785002847394134, "learning_rate": 7.834374217392188e-06, "loss": 3.544, "step": 1456 }, { "epoch": 0.58, "grad_norm": 3.5018286056581798, "learning_rate": 7.82172845598814e-06, "loss": 3.5752, "step": 1457 }, { "epoch": 0.58, "grad_norm": 4.181096789262609, "learning_rate": 7.80908635043282e-06, "loss": 3.495, "step": 1458 }, { "epoch": 0.58, "grad_norm": 3.9920855934196773, "learning_rate": 7.796447921943793e-06, "loss": 3.529, "step": 1459 }, { "epoch": 0.58, "grad_norm": 4.293480957964132, "learning_rate": 7.78381319173246e-06, "loss": 3.4376, "step": 1460 }, { "epoch": 0.58, "grad_norm": 3.9040102950110485, "learning_rate": 7.771182181004005e-06, "loss": 3.5162, "step": 1461 }, { "epoch": 0.58, "grad_norm": 3.635380000175011, "learning_rate": 7.758554910957378e-06, "loss": 3.8755, "step": 1462 }, { "epoch": 0.59, "grad_norm": 4.32258353400662, "learning_rate": 7.745931402785252e-06, "loss": 3.5296, "step": 1463 }, { "epoch": 0.59, "grad_norm": 3.6660451711288147, "learning_rate": 7.733311677673979e-06, "loss": 3.8214, "step": 1464 }, { "epoch": 0.59, "grad_norm": 3.6681150611869375, "learning_rate": 7.720695756803569e-06, "loss": 3.52, "step": 1465 }, { "epoch": 0.59, "grad_norm": 3.419776479875391, "learning_rate": 7.708083661347637e-06, "loss": 3.3692, "step": 1466 }, { "epoch": 0.59, "grad_norm": 3.8962339569352924, "learning_rate": 7.695475412473393e-06, "loss": 3.4717, "step": 1467 }, { "epoch": 0.59, "grad_norm": 3.8587481606461775, "learning_rate": 7.682871031341579e-06, "loss": 3.7205, "step": 1468 }, { "epoch": 0.59, "grad_norm": 3.0586472108646032, "learning_rate": 7.670270539106452e-06, "loss": 3.4851, "step": 1469 }, { "epoch": 0.59, "grad_norm": 3.455954380269373, "learning_rate": 7.657673956915735e-06, "loss": 3.6466, "step": 1470 }, { "epoch": 0.59, "grad_norm": 4.047129327789186, "learning_rate": 7.645081305910596e-06, "loss": 3.267, "step": 1471 }, { "epoch": 0.59, "grad_norm": 3.8169951162740348, "learning_rate": 7.632492607225604e-06, "loss": 3.6871, "step": 1472 }, { "epoch": 0.59, "grad_norm": 3.8305487121163644, "learning_rate": 7.619907881988692e-06, "loss": 3.5829, "step": 1473 }, { "epoch": 0.59, "grad_norm": 3.2735811073421788, "learning_rate": 7.607327151321127e-06, "loss": 3.6891, "step": 1474 }, { "epoch": 0.59, "grad_norm": 3.9136084760331418, "learning_rate": 7.594750436337467e-06, "loss": 3.4515, "step": 1475 }, { "epoch": 0.59, "grad_norm": 3.828639513589231, "learning_rate": 7.582177758145532e-06, "loss": 3.5019, "step": 1476 }, { "epoch": 0.59, "grad_norm": 3.9437593119820415, "learning_rate": 7.569609137846376e-06, "loss": 3.7731, "step": 1477 }, { "epoch": 0.59, "grad_norm": 4.164958834406506, "learning_rate": 7.557044596534234e-06, "loss": 3.7246, "step": 1478 }, { "epoch": 0.59, "grad_norm": 3.7055840109179368, "learning_rate": 7.544484155296492e-06, "loss": 3.6791, "step": 1479 }, { "epoch": 0.59, "grad_norm": 3.7527652506423284, "learning_rate": 7.531927835213657e-06, "loss": 3.6093, "step": 1480 }, { "epoch": 0.59, "grad_norm": 4.014478944178257, "learning_rate": 7.519375657359331e-06, "loss": 3.5312, "step": 1481 }, { "epoch": 0.59, "grad_norm": 4.246264132846788, "learning_rate": 7.506827642800146e-06, "loss": 3.7168, "step": 1482 }, { "epoch": 0.59, "grad_norm": 4.241663502305347, "learning_rate": 7.49428381259576e-06, "loss": 3.4936, "step": 1483 }, { "epoch": 0.59, "grad_norm": 3.985732421048951, "learning_rate": 7.4817441877988005e-06, "loss": 3.4383, "step": 1484 }, { "epoch": 0.59, "grad_norm": 3.520163919263216, "learning_rate": 7.469208789454838e-06, "loss": 3.5112, "step": 1485 }, { "epoch": 0.59, "grad_norm": 4.100364501529123, "learning_rate": 7.456677638602355e-06, "loss": 3.3978, "step": 1486 }, { "epoch": 0.59, "grad_norm": 3.906400338438948, "learning_rate": 7.444150756272704e-06, "loss": 3.5787, "step": 1487 }, { "epoch": 0.6, "grad_norm": 4.142247742585712, "learning_rate": 7.431628163490067e-06, "loss": 3.5451, "step": 1488 }, { "epoch": 0.6, "grad_norm": 3.8484953594054607, "learning_rate": 7.419109881271434e-06, "loss": 3.5031, "step": 1489 }, { "epoch": 0.6, "grad_norm": 3.5376977157395064, "learning_rate": 7.40659593062655e-06, "loss": 3.5643, "step": 1490 }, { "epoch": 0.6, "grad_norm": 3.684148878346785, "learning_rate": 7.394086332557907e-06, "loss": 3.7448, "step": 1491 }, { "epoch": 0.6, "grad_norm": 3.7070400002530035, "learning_rate": 7.38158110806068e-06, "loss": 3.5748, "step": 1492 }, { "epoch": 0.6, "grad_norm": 3.0700666156104046, "learning_rate": 7.3690802781227056e-06, "loss": 3.6341, "step": 1493 }, { "epoch": 0.6, "grad_norm": 3.8456034836325275, "learning_rate": 7.356583863724442e-06, "loss": 3.5115, "step": 1494 }, { "epoch": 0.6, "grad_norm": 3.789836055161562, "learning_rate": 7.344091885838949e-06, "loss": 3.5078, "step": 1495 }, { "epoch": 0.6, "grad_norm": 3.782648371746039, "learning_rate": 7.331604365431826e-06, "loss": 3.4992, "step": 1496 }, { "epoch": 0.6, "grad_norm": 3.399890932099315, "learning_rate": 7.319121323461198e-06, "loss": 3.5788, "step": 1497 }, { "epoch": 0.6, "grad_norm": 3.519272517962314, "learning_rate": 7.3066427808776754e-06, "loss": 3.5133, "step": 1498 }, { "epoch": 0.6, "grad_norm": 3.607027156933682, "learning_rate": 7.294168758624307e-06, "loss": 3.5752, "step": 1499 }, { "epoch": 0.6, "grad_norm": 3.6015766335135573, "learning_rate": 7.2816992776365714e-06, "loss": 3.8539, "step": 1500 }, { "epoch": 0.6, "grad_norm": 3.7429477576654637, "learning_rate": 7.269234358842314e-06, "loss": 3.6593, "step": 1501 }, { "epoch": 0.6, "grad_norm": 3.417196523965054, "learning_rate": 7.256774023161728e-06, "loss": 3.8102, "step": 1502 }, { "epoch": 0.6, "grad_norm": 3.902532703962836, "learning_rate": 7.244318291507308e-06, "loss": 3.4636, "step": 1503 }, { "epoch": 0.6, "grad_norm": 3.1780987980769613, "learning_rate": 7.231867184783826e-06, "loss": 3.5413, "step": 1504 }, { "epoch": 0.6, "grad_norm": 3.195603868320716, "learning_rate": 7.219420723888301e-06, "loss": 3.6185, "step": 1505 }, { "epoch": 0.6, "grad_norm": 3.5099189748150685, "learning_rate": 7.2069789297099355e-06, "loss": 3.6203, "step": 1506 }, { "epoch": 0.6, "grad_norm": 3.7368146967887084, "learning_rate": 7.194541823130119e-06, "loss": 3.4034, "step": 1507 }, { "epoch": 0.6, "grad_norm": 3.2477208654607175, "learning_rate": 7.182109425022357e-06, "loss": 3.5462, "step": 1508 }, { "epoch": 0.6, "grad_norm": 3.954509777924059, "learning_rate": 7.169681756252265e-06, "loss": 3.3927, "step": 1509 }, { "epoch": 0.6, "grad_norm": 3.44799095991217, "learning_rate": 7.157258837677514e-06, "loss": 3.3569, "step": 1510 }, { "epoch": 0.6, "grad_norm": 3.7064912121782094, "learning_rate": 7.144840690147812e-06, "loss": 3.5461, "step": 1511 }, { "epoch": 0.6, "grad_norm": 3.8854217452893565, "learning_rate": 7.132427334504846e-06, "loss": 3.8537, "step": 1512 }, { "epoch": 0.61, "grad_norm": 2.9978972776658903, "learning_rate": 7.120018791582266e-06, "loss": 3.6769, "step": 1513 }, { "epoch": 0.61, "grad_norm": 4.105775013442488, "learning_rate": 7.107615082205654e-06, "loss": 3.6836, "step": 1514 }, { "epoch": 0.61, "grad_norm": 3.594521170717367, "learning_rate": 7.095216227192467e-06, "loss": 3.5434, "step": 1515 }, { "epoch": 0.61, "grad_norm": 4.083671392728415, "learning_rate": 7.082822247352024e-06, "loss": 3.7165, "step": 1516 }, { "epoch": 0.61, "grad_norm": 3.282797887972504, "learning_rate": 7.070433163485451e-06, "loss": 3.6366, "step": 1517 }, { "epoch": 0.61, "grad_norm": 3.372384984933006, "learning_rate": 7.0580489963856646e-06, "loss": 3.5777, "step": 1518 }, { "epoch": 0.61, "grad_norm": 4.160592298870379, "learning_rate": 7.045669766837333e-06, "loss": 3.3398, "step": 1519 }, { "epoch": 0.61, "grad_norm": 4.020626122432115, "learning_rate": 7.033295495616834e-06, "loss": 3.3516, "step": 1520 }, { "epoch": 0.61, "grad_norm": 4.17132157189251, "learning_rate": 7.020926203492218e-06, "loss": 3.4481, "step": 1521 }, { "epoch": 0.61, "grad_norm": 3.648358936658591, "learning_rate": 7.008561911223186e-06, "loss": 3.6298, "step": 1522 }, { "epoch": 0.61, "grad_norm": 5.03872622210649, "learning_rate": 6.9962026395610416e-06, "loss": 3.6469, "step": 1523 }, { "epoch": 0.61, "grad_norm": 3.796123861492065, "learning_rate": 6.983848409248672e-06, "loss": 3.625, "step": 1524 }, { "epoch": 0.61, "grad_norm": 4.01401433098644, "learning_rate": 6.971499241020495e-06, "loss": 3.4491, "step": 1525 }, { "epoch": 0.61, "grad_norm": 3.600759214686406, "learning_rate": 6.959155155602433e-06, "loss": 3.6289, "step": 1526 }, { "epoch": 0.61, "grad_norm": 4.2385099838211495, "learning_rate": 6.946816173711878e-06, "loss": 3.5884, "step": 1527 }, { "epoch": 0.61, "grad_norm": 4.017534211523324, "learning_rate": 6.934482316057663e-06, "loss": 3.6964, "step": 1528 }, { "epoch": 0.61, "grad_norm": 4.284559410654498, "learning_rate": 6.922153603340016e-06, "loss": 3.509, "step": 1529 }, { "epoch": 0.61, "grad_norm": 3.624699831426992, "learning_rate": 6.909830056250527e-06, "loss": 3.315, "step": 1530 }, { "epoch": 0.61, "grad_norm": 3.9957231893286465, "learning_rate": 6.897511695472124e-06, "loss": 3.3653, "step": 1531 }, { "epoch": 0.61, "grad_norm": 3.7559802871043244, "learning_rate": 6.885198541679016e-06, "loss": 3.5622, "step": 1532 }, { "epoch": 0.61, "grad_norm": 3.515750722822639, "learning_rate": 6.872890615536694e-06, "loss": 3.6934, "step": 1533 }, { "epoch": 0.61, "grad_norm": 3.361687296630023, "learning_rate": 6.860587937701862e-06, "loss": 3.7604, "step": 1534 }, { "epoch": 0.61, "grad_norm": 3.665985775304246, "learning_rate": 6.848290528822417e-06, "loss": 3.6467, "step": 1535 }, { "epoch": 0.61, "grad_norm": 3.5742810689464894, "learning_rate": 6.835998409537412e-06, "loss": 3.6129, "step": 1536 }, { "epoch": 0.61, "grad_norm": 3.1398746962171953, "learning_rate": 6.823711600477025e-06, "loss": 3.5571, "step": 1537 }, { "epoch": 0.62, "grad_norm": 3.8744598615958283, "learning_rate": 6.811430122262529e-06, "loss": 3.4432, "step": 1538 }, { "epoch": 0.62, "grad_norm": 3.680958440614546, "learning_rate": 6.799153995506234e-06, "loss": 3.4893, "step": 1539 }, { "epoch": 0.62, "grad_norm": 4.251229349989716, "learning_rate": 6.786883240811479e-06, "loss": 3.421, "step": 1540 }, { "epoch": 0.62, "grad_norm": 3.363805693156403, "learning_rate": 6.774617878772582e-06, "loss": 3.7277, "step": 1541 }, { "epoch": 0.62, "grad_norm": 3.8060970867616395, "learning_rate": 6.76235792997482e-06, "loss": 3.5927, "step": 1542 }, { "epoch": 0.62, "grad_norm": 4.121071467859133, "learning_rate": 6.750103414994374e-06, "loss": 3.6922, "step": 1543 }, { "epoch": 0.62, "grad_norm": 3.5635624234510095, "learning_rate": 6.737854354398308e-06, "loss": 3.3097, "step": 1544 }, { "epoch": 0.62, "grad_norm": 3.3704037106511224, "learning_rate": 6.725610768744535e-06, "loss": 3.5523, "step": 1545 }, { "epoch": 0.62, "grad_norm": 5.6183604752793395, "learning_rate": 6.713372678581773e-06, "loss": 3.6057, "step": 1546 }, { "epoch": 0.62, "grad_norm": 3.8491054462248853, "learning_rate": 6.7011401044495304e-06, "loss": 3.4293, "step": 1547 }, { "epoch": 0.62, "grad_norm": 3.4848739882554973, "learning_rate": 6.68891306687804e-06, "loss": 3.6692, "step": 1548 }, { "epoch": 0.62, "grad_norm": 3.7199367426632524, "learning_rate": 6.676691586388255e-06, "loss": 3.449, "step": 1549 }, { "epoch": 0.62, "grad_norm": 3.580681658335725, "learning_rate": 6.664475683491797e-06, "loss": 3.5791, "step": 1550 }, { "epoch": 0.62, "grad_norm": 3.5016383532515483, "learning_rate": 6.652265378690923e-06, "loss": 3.6628, "step": 1551 }, { "epoch": 0.62, "grad_norm": 3.2121234727237225, "learning_rate": 6.6400606924785095e-06, "loss": 3.3646, "step": 1552 }, { "epoch": 0.62, "grad_norm": 3.9771811074622305, "learning_rate": 6.627861645337984e-06, "loss": 3.4899, "step": 1553 }, { "epoch": 0.62, "grad_norm": 3.424344005443578, "learning_rate": 6.615668257743322e-06, "loss": 3.6783, "step": 1554 }, { "epoch": 0.62, "grad_norm": 3.9008718425597486, "learning_rate": 6.603480550158995e-06, "loss": 3.4927, "step": 1555 }, { "epoch": 0.62, "grad_norm": 3.5524895975215767, "learning_rate": 6.591298543039949e-06, "loss": 3.6305, "step": 1556 }, { "epoch": 0.62, "grad_norm": 3.199136364840206, "learning_rate": 6.579122256831551e-06, "loss": 3.469, "step": 1557 }, { "epoch": 0.62, "grad_norm": 3.136039927950006, "learning_rate": 6.566951711969581e-06, "loss": 3.8279, "step": 1558 }, { "epoch": 0.62, "grad_norm": 3.5719592951558004, "learning_rate": 6.554786928880165e-06, "loss": 3.6708, "step": 1559 }, { "epoch": 0.62, "grad_norm": 3.265159798599461, "learning_rate": 6.542627927979772e-06, "loss": 3.3514, "step": 1560 }, { "epoch": 0.62, "grad_norm": 3.705950892355711, "learning_rate": 6.530474729675167e-06, "loss": 3.6671, "step": 1561 }, { "epoch": 0.62, "grad_norm": 4.27802567413278, "learning_rate": 6.518327354363374e-06, "loss": 3.4778, "step": 1562 }, { "epoch": 0.63, "grad_norm": 3.4155897157303494, "learning_rate": 6.506185822431639e-06, "loss": 3.6436, "step": 1563 }, { "epoch": 0.63, "grad_norm": 3.4645941258945476, "learning_rate": 6.494050154257408e-06, "loss": 3.5203, "step": 1564 }, { "epoch": 0.63, "grad_norm": 3.5880428732526735, "learning_rate": 6.481920370208274e-06, "loss": 3.292, "step": 1565 }, { "epoch": 0.63, "grad_norm": 3.453501383628343, "learning_rate": 6.469796490641974e-06, "loss": 3.8199, "step": 1566 }, { "epoch": 0.63, "grad_norm": 3.7078788389918738, "learning_rate": 6.4576785359063225e-06, "loss": 3.629, "step": 1567 }, { "epoch": 0.63, "grad_norm": 4.268044566714057, "learning_rate": 6.445566526339187e-06, "loss": 3.4953, "step": 1568 }, { "epoch": 0.63, "grad_norm": 3.1001497410904943, "learning_rate": 6.4334604822684645e-06, "loss": 3.6877, "step": 1569 }, { "epoch": 0.63, "grad_norm": 4.245953209024202, "learning_rate": 6.421360424012039e-06, "loss": 3.4786, "step": 1570 }, { "epoch": 0.63, "grad_norm": 3.2821730226566808, "learning_rate": 6.409266371877751e-06, "loss": 3.4555, "step": 1571 }, { "epoch": 0.63, "grad_norm": 4.182794902814907, "learning_rate": 6.397178346163348e-06, "loss": 3.3958, "step": 1572 }, { "epoch": 0.63, "grad_norm": 3.766092498437075, "learning_rate": 6.38509636715648e-06, "loss": 3.4831, "step": 1573 }, { "epoch": 0.63, "grad_norm": 3.3471215501567446, "learning_rate": 6.373020455134633e-06, "loss": 3.3831, "step": 1574 }, { "epoch": 0.63, "grad_norm": 3.9241836485526003, "learning_rate": 6.360950630365126e-06, "loss": 3.6346, "step": 1575 }, { "epoch": 0.63, "grad_norm": 3.3124279763521547, "learning_rate": 6.3488869131050505e-06, "loss": 3.3606, "step": 1576 }, { "epoch": 0.63, "grad_norm": 3.8207348660033094, "learning_rate": 6.33682932360125e-06, "loss": 3.4873, "step": 1577 }, { "epoch": 0.63, "grad_norm": 4.313411276985893, "learning_rate": 6.324777882090287e-06, "loss": 3.547, "step": 1578 }, { "epoch": 0.63, "grad_norm": 3.8492618965358716, "learning_rate": 6.3127326087983974e-06, "loss": 3.4882, "step": 1579 }, { "epoch": 0.63, "grad_norm": 3.7440406276540643, "learning_rate": 6.300693523941481e-06, "loss": 3.5546, "step": 1580 }, { "epoch": 0.63, "grad_norm": 4.256194385654137, "learning_rate": 6.2886606477250345e-06, "loss": 3.542, "step": 1581 }, { "epoch": 0.63, "grad_norm": 4.100847067766242, "learning_rate": 6.276634000344144e-06, "loss": 3.4619, "step": 1582 }, { "epoch": 0.63, "grad_norm": 3.863838035866122, "learning_rate": 6.264613601983435e-06, "loss": 3.5449, "step": 1583 }, { "epoch": 0.63, "grad_norm": 3.4931068818315585, "learning_rate": 6.2525994728170495e-06, "loss": 3.5324, "step": 1584 }, { "epoch": 0.63, "grad_norm": 3.3210210896064334, "learning_rate": 6.2405916330086106e-06, "loss": 3.4511, "step": 1585 }, { "epoch": 0.63, "grad_norm": 4.435646053815424, "learning_rate": 6.2285901027111806e-06, "loss": 3.2911, "step": 1586 }, { "epoch": 0.63, "grad_norm": 3.954806179069221, "learning_rate": 6.216594902067233e-06, "loss": 3.5649, "step": 1587 }, { "epoch": 0.64, "grad_norm": 3.220561982446002, "learning_rate": 6.204606051208617e-06, "loss": 3.4769, "step": 1588 }, { "epoch": 0.64, "grad_norm": 3.5975537055749225, "learning_rate": 6.192623570256535e-06, "loss": 3.5477, "step": 1589 }, { "epoch": 0.64, "grad_norm": 3.8078229439619187, "learning_rate": 6.180647479321484e-06, "loss": 3.534, "step": 1590 }, { "epoch": 0.64, "grad_norm": 3.5466201216437394, "learning_rate": 6.168677798503246e-06, "loss": 3.5194, "step": 1591 }, { "epoch": 0.64, "grad_norm": 3.6779273264121106, "learning_rate": 6.156714547890838e-06, "loss": 3.4331, "step": 1592 }, { "epoch": 0.64, "grad_norm": 3.7255338440199077, "learning_rate": 6.14475774756249e-06, "loss": 3.5986, "step": 1593 }, { "epoch": 0.64, "grad_norm": 3.626610821101966, "learning_rate": 6.13280741758561e-06, "loss": 3.497, "step": 1594 }, { "epoch": 0.64, "grad_norm": 4.147161579623455, "learning_rate": 6.120863578016736e-06, "loss": 3.7874, "step": 1595 }, { "epoch": 0.64, "grad_norm": 3.6261110954731866, "learning_rate": 6.108926248901521e-06, "loss": 3.4446, "step": 1596 }, { "epoch": 0.64, "grad_norm": 3.4428710403247615, "learning_rate": 6.0969954502746916e-06, "loss": 3.4673, "step": 1597 }, { "epoch": 0.64, "grad_norm": 3.5732663274462007, "learning_rate": 6.0850712021600044e-06, "loss": 3.4875, "step": 1598 }, { "epoch": 0.64, "grad_norm": 3.507148824224458, "learning_rate": 6.073153524570236e-06, "loss": 3.5483, "step": 1599 }, { "epoch": 0.64, "grad_norm": 3.5971890810464346, "learning_rate": 6.061242437507131e-06, "loss": 3.3854, "step": 1600 }, { "epoch": 0.64, "grad_norm": 4.158234085592182, "learning_rate": 6.049337960961362e-06, "loss": 3.3238, "step": 1601 }, { "epoch": 0.64, "grad_norm": 3.8319143470427215, "learning_rate": 6.037440114912521e-06, "loss": 3.5227, "step": 1602 }, { "epoch": 0.64, "grad_norm": 3.3015817963676706, "learning_rate": 6.0255489193290675e-06, "loss": 3.6485, "step": 1603 }, { "epoch": 0.64, "grad_norm": 3.6210568854004803, "learning_rate": 6.013664394168297e-06, "loss": 3.5156, "step": 1604 }, { "epoch": 0.64, "grad_norm": 3.5389233431815312, "learning_rate": 6.00178655937631e-06, "loss": 3.5009, "step": 1605 }, { "epoch": 0.64, "grad_norm": 3.6576189067512854, "learning_rate": 5.989915434887985e-06, "loss": 3.3947, "step": 1606 }, { "epoch": 0.64, "grad_norm": 3.339883862987916, "learning_rate": 5.9780510406269245e-06, "loss": 3.4721, "step": 1607 }, { "epoch": 0.64, "grad_norm": 4.52954677971464, "learning_rate": 5.966193396505452e-06, "loss": 3.3105, "step": 1608 }, { "epoch": 0.64, "grad_norm": 3.797901067597484, "learning_rate": 5.954342522424553e-06, "loss": 3.5331, "step": 1609 }, { "epoch": 0.64, "grad_norm": 4.025259163522358, "learning_rate": 5.942498438273849e-06, "loss": 3.4357, "step": 1610 }, { "epoch": 0.64, "grad_norm": 3.883933535536824, "learning_rate": 5.930661163931572e-06, "loss": 3.5552, "step": 1611 }, { "epoch": 0.64, "grad_norm": 3.9176809770017296, "learning_rate": 5.918830719264514e-06, "loss": 3.6888, "step": 1612 }, { "epoch": 0.65, "grad_norm": 4.158931640923236, "learning_rate": 5.9070071241280235e-06, "loss": 3.745, "step": 1613 }, { "epoch": 0.65, "grad_norm": 4.087354007066473, "learning_rate": 5.895190398365935e-06, "loss": 3.6546, "step": 1614 }, { "epoch": 0.65, "grad_norm": 4.238168845466041, "learning_rate": 5.8833805618105635e-06, "loss": 3.6311, "step": 1615 }, { "epoch": 0.65, "grad_norm": 4.921391574264145, "learning_rate": 5.871577634282655e-06, "loss": 3.3619, "step": 1616 }, { "epoch": 0.65, "grad_norm": 5.222025825827338, "learning_rate": 5.8597816355913685e-06, "loss": 3.5381, "step": 1617 }, { "epoch": 0.65, "grad_norm": 4.526603701604458, "learning_rate": 5.84799258553423e-06, "loss": 3.2635, "step": 1618 }, { "epoch": 0.65, "grad_norm": 4.328920440404449, "learning_rate": 5.836210503897099e-06, "loss": 3.3941, "step": 1619 }, { "epoch": 0.65, "grad_norm": 3.694703294976491, "learning_rate": 5.82443541045415e-06, "loss": 3.6353, "step": 1620 }, { "epoch": 0.65, "grad_norm": 4.046321767564445, "learning_rate": 5.812667324967813e-06, "loss": 3.56, "step": 1621 }, { "epoch": 0.65, "grad_norm": 4.7512923075952695, "learning_rate": 5.800906267188773e-06, "loss": 3.5772, "step": 1622 }, { "epoch": 0.65, "grad_norm": 3.919553279950237, "learning_rate": 5.789152256855917e-06, "loss": 3.4645, "step": 1623 }, { "epoch": 0.65, "grad_norm": 3.9665901288915806, "learning_rate": 5.777405313696294e-06, "loss": 3.4751, "step": 1624 }, { "epoch": 0.65, "grad_norm": 3.9438397686222832, "learning_rate": 5.765665457425102e-06, "loss": 3.5324, "step": 1625 }, { "epoch": 0.65, "grad_norm": 3.720691178895891, "learning_rate": 5.753932707745635e-06, "loss": 3.3672, "step": 1626 }, { "epoch": 0.65, "grad_norm": 3.4053190270389604, "learning_rate": 5.742207084349274e-06, "loss": 3.5297, "step": 1627 }, { "epoch": 0.65, "grad_norm": 3.8384090142645753, "learning_rate": 5.73048860691543e-06, "loss": 3.6019, "step": 1628 }, { "epoch": 0.65, "grad_norm": 4.155590506087914, "learning_rate": 5.718777295111524e-06, "loss": 3.541, "step": 1629 }, { "epoch": 0.65, "grad_norm": 3.935875106073164, "learning_rate": 5.707073168592943e-06, "loss": 3.4477, "step": 1630 }, { "epoch": 0.65, "grad_norm": 4.0601087611821365, "learning_rate": 5.695376247003025e-06, "loss": 3.5035, "step": 1631 }, { "epoch": 0.65, "grad_norm": 4.005956841096344, "learning_rate": 5.683686549973018e-06, "loss": 3.6031, "step": 1632 }, { "epoch": 0.65, "grad_norm": 4.018384680043002, "learning_rate": 5.672004097122033e-06, "loss": 3.4305, "step": 1633 }, { "epoch": 0.65, "grad_norm": 4.089213051550755, "learning_rate": 5.6603289080570274e-06, "loss": 3.4664, "step": 1634 }, { "epoch": 0.65, "grad_norm": 4.0324336465554556, "learning_rate": 5.648661002372769e-06, "loss": 3.5037, "step": 1635 }, { "epoch": 0.65, "grad_norm": 3.8702545614500976, "learning_rate": 5.637000399651804e-06, "loss": 3.3685, "step": 1636 }, { "epoch": 0.65, "grad_norm": 4.044818800764605, "learning_rate": 5.625347119464422e-06, "loss": 3.4022, "step": 1637 }, { "epoch": 0.66, "grad_norm": 3.4606581789411743, "learning_rate": 5.613701181368618e-06, "loss": 3.7145, "step": 1638 }, { "epoch": 0.66, "grad_norm": 4.083316905669718, "learning_rate": 5.602062604910064e-06, "loss": 3.6496, "step": 1639 }, { "epoch": 0.66, "grad_norm": 4.959151798357818, "learning_rate": 5.590431409622081e-06, "loss": 3.8787, "step": 1640 }, { "epoch": 0.66, "grad_norm": 4.588318519706414, "learning_rate": 5.5788076150256075e-06, "loss": 3.6376, "step": 1641 }, { "epoch": 0.66, "grad_norm": 4.939429818845424, "learning_rate": 5.567191240629151e-06, "loss": 3.3604, "step": 1642 }, { "epoch": 0.66, "grad_norm": 4.024057438227261, "learning_rate": 5.555582305928766e-06, "loss": 3.3678, "step": 1643 }, { "epoch": 0.66, "grad_norm": 3.702973125759723, "learning_rate": 5.5439808304080225e-06, "loss": 3.6029, "step": 1644 }, { "epoch": 0.66, "grad_norm": 3.553114471083393, "learning_rate": 5.5323868335379775e-06, "loss": 3.4881, "step": 1645 }, { "epoch": 0.66, "grad_norm": 4.531936177964785, "learning_rate": 5.520800334777132e-06, "loss": 3.7625, "step": 1646 }, { "epoch": 0.66, "grad_norm": 3.7173152360158865, "learning_rate": 5.509221353571404e-06, "loss": 3.2961, "step": 1647 }, { "epoch": 0.66, "grad_norm": 3.9439784637905064, "learning_rate": 5.497649909354084e-06, "loss": 3.3936, "step": 1648 }, { "epoch": 0.66, "grad_norm": 3.78139801390855, "learning_rate": 5.486086021545829e-06, "loss": 3.5477, "step": 1649 }, { "epoch": 0.66, "grad_norm": 3.8954021579269464, "learning_rate": 5.4745297095546125e-06, "loss": 3.3632, "step": 1650 }, { "epoch": 0.66, "grad_norm": 3.9087623151938984, "learning_rate": 5.4629809927756794e-06, "loss": 3.4839, "step": 1651 }, { "epoch": 0.66, "grad_norm": 3.635931087210787, "learning_rate": 5.451439890591539e-06, "loss": 3.4566, "step": 1652 }, { "epoch": 0.66, "grad_norm": 3.58703249038166, "learning_rate": 5.439906422371914e-06, "loss": 3.6018, "step": 1653 }, { "epoch": 0.66, "grad_norm": 3.3918410665471743, "learning_rate": 5.42838060747372e-06, "loss": 3.3492, "step": 1654 }, { "epoch": 0.66, "grad_norm": 4.307113537697726, "learning_rate": 5.416862465241033e-06, "loss": 3.4002, "step": 1655 }, { "epoch": 0.66, "grad_norm": 3.4614695792966033, "learning_rate": 5.405352015005039e-06, "loss": 3.5605, "step": 1656 }, { "epoch": 0.66, "grad_norm": 3.5235364563297984, "learning_rate": 5.3938492760840176e-06, "loss": 3.4043, "step": 1657 }, { "epoch": 0.66, "grad_norm": 3.6270246557742647, "learning_rate": 5.382354267783316e-06, "loss": 3.6153, "step": 1658 }, { "epoch": 0.66, "grad_norm": 3.7458961603016636, "learning_rate": 5.370867009395294e-06, "loss": 3.3846, "step": 1659 }, { "epoch": 0.66, "grad_norm": 3.9253003162704854, "learning_rate": 5.359387520199317e-06, "loss": 3.5766, "step": 1660 }, { "epoch": 0.66, "grad_norm": 3.66451016542722, "learning_rate": 5.3479158194617e-06, "loss": 3.3781, "step": 1661 }, { "epoch": 0.66, "grad_norm": 3.913591319050972, "learning_rate": 5.336451926435688e-06, "loss": 3.5858, "step": 1662 }, { "epoch": 0.67, "grad_norm": 3.6297933301525984, "learning_rate": 5.32499586036143e-06, "loss": 3.3996, "step": 1663 }, { "epoch": 0.67, "grad_norm": 3.409251034623019, "learning_rate": 5.313547640465937e-06, "loss": 3.6274, "step": 1664 }, { "epoch": 0.67, "grad_norm": 3.5973973468416776, "learning_rate": 5.302107285963045e-06, "loss": 3.3889, "step": 1665 }, { "epoch": 0.67, "grad_norm": 3.1962049167523157, "learning_rate": 5.2906748160533895e-06, "loss": 3.4495, "step": 1666 }, { "epoch": 0.67, "grad_norm": 4.0059543728084375, "learning_rate": 5.279250249924384e-06, "loss": 3.5386, "step": 1667 }, { "epoch": 0.67, "grad_norm": 3.7194107001174936, "learning_rate": 5.26783360675016e-06, "loss": 3.5265, "step": 1668 }, { "epoch": 0.67, "grad_norm": 3.7921834437433484, "learning_rate": 5.2564249056915704e-06, "loss": 3.5549, "step": 1669 }, { "epoch": 0.67, "grad_norm": 4.403776989679724, "learning_rate": 5.245024165896126e-06, "loss": 3.3479, "step": 1670 }, { "epoch": 0.67, "grad_norm": 3.8704830411141518, "learning_rate": 5.2336314064979766e-06, "loss": 3.3499, "step": 1671 }, { "epoch": 0.67, "grad_norm": 3.7989300337853664, "learning_rate": 5.222246646617886e-06, "loss": 3.5178, "step": 1672 }, { "epoch": 0.67, "grad_norm": 3.734045664578303, "learning_rate": 5.210869905363178e-06, "loss": 3.464, "step": 1673 }, { "epoch": 0.67, "grad_norm": 3.832535478682915, "learning_rate": 5.199501201827741e-06, "loss": 3.7964, "step": 1674 }, { "epoch": 0.67, "grad_norm": 3.7729187124032797, "learning_rate": 5.18814055509195e-06, "loss": 3.4932, "step": 1675 }, { "epoch": 0.67, "grad_norm": 3.7973719375176183, "learning_rate": 5.1767879842226745e-06, "loss": 3.4163, "step": 1676 }, { "epoch": 0.67, "grad_norm": 4.2259816427355, "learning_rate": 5.165443508273218e-06, "loss": 3.3001, "step": 1677 }, { "epoch": 0.67, "grad_norm": 4.251724589557688, "learning_rate": 5.154107146283311e-06, "loss": 3.1482, "step": 1678 }, { "epoch": 0.67, "grad_norm": 3.6809587313873013, "learning_rate": 5.1427789172790565e-06, "loss": 3.6134, "step": 1679 }, { "epoch": 0.67, "grad_norm": 3.951286411251701, "learning_rate": 5.131458840272905e-06, "loss": 3.4604, "step": 1680 }, { "epoch": 0.67, "grad_norm": 4.002690650992529, "learning_rate": 5.120146934263638e-06, "loss": 3.4269, "step": 1681 }, { "epoch": 0.67, "grad_norm": 3.587007733749125, "learning_rate": 5.10884321823631e-06, "loss": 3.4809, "step": 1682 }, { "epoch": 0.67, "grad_norm": 4.033586550699142, "learning_rate": 5.097547711162243e-06, "loss": 3.495, "step": 1683 }, { "epoch": 0.67, "grad_norm": 4.358904503933148, "learning_rate": 5.086260431998967e-06, "loss": 3.4295, "step": 1684 }, { "epoch": 0.67, "grad_norm": 3.092011940657899, "learning_rate": 5.074981399690219e-06, "loss": 3.4061, "step": 1685 }, { "epoch": 0.67, "grad_norm": 3.509859863620395, "learning_rate": 5.0637106331658815e-06, "loss": 3.3268, "step": 1686 }, { "epoch": 0.67, "grad_norm": 3.4988932754363944, "learning_rate": 5.0524481513419675e-06, "loss": 3.5498, "step": 1687 }, { "epoch": 0.68, "grad_norm": 3.5371819602716257, "learning_rate": 5.041193973120595e-06, "loss": 3.6162, "step": 1688 }, { "epoch": 0.68, "grad_norm": 3.7999904525150257, "learning_rate": 5.02994811738993e-06, "loss": 3.4791, "step": 1689 }, { "epoch": 0.68, "grad_norm": 4.34916869882989, "learning_rate": 5.018710603024187e-06, "loss": 3.4989, "step": 1690 }, { "epoch": 0.68, "grad_norm": 4.3696963886037965, "learning_rate": 5.007481448883567e-06, "loss": 3.458, "step": 1691 }, { "epoch": 0.68, "grad_norm": 4.2633588151069315, "learning_rate": 4.99626067381425e-06, "loss": 3.4841, "step": 1692 }, { "epoch": 0.68, "grad_norm": 3.6953948473727296, "learning_rate": 4.985048296648346e-06, "loss": 3.3859, "step": 1693 }, { "epoch": 0.68, "grad_norm": 3.7639597092063384, "learning_rate": 4.973844336203879e-06, "loss": 3.5064, "step": 1694 }, { "epoch": 0.68, "grad_norm": 3.6696256156495695, "learning_rate": 4.9626488112847384e-06, "loss": 3.4375, "step": 1695 }, { "epoch": 0.68, "grad_norm": 4.165077667236303, "learning_rate": 4.951461740680655e-06, "loss": 3.4031, "step": 1696 }, { "epoch": 0.68, "grad_norm": 3.6209308184846605, "learning_rate": 4.9402831431671834e-06, "loss": 3.4249, "step": 1697 }, { "epoch": 0.68, "grad_norm": 3.4591739980588274, "learning_rate": 4.929113037505642e-06, "loss": 3.3922, "step": 1698 }, { "epoch": 0.68, "grad_norm": 4.2079804852931355, "learning_rate": 4.91795144244311e-06, "loss": 3.4299, "step": 1699 }, { "epoch": 0.68, "grad_norm": 3.873718125760766, "learning_rate": 4.9067983767123736e-06, "loss": 3.5628, "step": 1700 }, { "epoch": 0.68, "grad_norm": 3.1974630418683976, "learning_rate": 4.895653859031906e-06, "loss": 3.6201, "step": 1701 }, { "epoch": 0.68, "grad_norm": 3.2799431494589464, "learning_rate": 4.884517908105837e-06, "loss": 3.43, "step": 1702 }, { "epoch": 0.68, "grad_norm": 3.6073965976368547, "learning_rate": 4.873390542623922e-06, "loss": 3.4691, "step": 1703 }, { "epoch": 0.68, "grad_norm": 3.6439670250442, "learning_rate": 4.8622717812615e-06, "loss": 3.2538, "step": 1704 }, { "epoch": 0.68, "grad_norm": 3.884889324274793, "learning_rate": 4.851161642679466e-06, "loss": 3.3603, "step": 1705 }, { "epoch": 0.68, "grad_norm": 3.371748248373679, "learning_rate": 4.840060145524254e-06, "loss": 3.4982, "step": 1706 }, { "epoch": 0.68, "grad_norm": 4.07931994670271, "learning_rate": 4.828967308427795e-06, "loss": 3.2928, "step": 1707 }, { "epoch": 0.68, "grad_norm": 3.704691505269074, "learning_rate": 4.817883150007474e-06, "loss": 3.3268, "step": 1708 }, { "epoch": 0.68, "grad_norm": 3.609481060520609, "learning_rate": 4.806807688866119e-06, "loss": 3.5597, "step": 1709 }, { "epoch": 0.68, "grad_norm": 4.287120337678626, "learning_rate": 4.795740943591955e-06, "loss": 3.7014, "step": 1710 }, { "epoch": 0.68, "grad_norm": 3.2417233501828395, "learning_rate": 4.784682932758588e-06, "loss": 3.4061, "step": 1711 }, { "epoch": 0.68, "grad_norm": 4.099954943224899, "learning_rate": 4.77363367492496e-06, "loss": 3.3982, "step": 1712 }, { "epoch": 0.69, "grad_norm": 3.4674350398982248, "learning_rate": 4.7625931886353215e-06, "loss": 3.5787, "step": 1713 }, { "epoch": 0.69, "grad_norm": 3.8299806212591014, "learning_rate": 4.7515614924192026e-06, "loss": 3.2252, "step": 1714 }, { "epoch": 0.69, "grad_norm": 3.3829512664027224, "learning_rate": 4.740538604791371e-06, "loss": 3.585, "step": 1715 }, { "epoch": 0.69, "grad_norm": 3.5722636561234493, "learning_rate": 4.729524544251837e-06, "loss": 3.4258, "step": 1716 }, { "epoch": 0.69, "grad_norm": 4.050179515902923, "learning_rate": 4.718519329285771e-06, "loss": 3.3031, "step": 1717 }, { "epoch": 0.69, "grad_norm": 4.128925748003555, "learning_rate": 4.707522978363508e-06, "loss": 3.5009, "step": 1718 }, { "epoch": 0.69, "grad_norm": 4.002009858329216, "learning_rate": 4.696535509940499e-06, "loss": 3.4921, "step": 1719 }, { "epoch": 0.69, "grad_norm": 3.6431409914843242, "learning_rate": 4.685556942457296e-06, "loss": 3.5246, "step": 1720 }, { "epoch": 0.69, "grad_norm": 3.8368553209068117, "learning_rate": 4.674587294339513e-06, "loss": 3.6919, "step": 1721 }, { "epoch": 0.69, "grad_norm": 3.9066283355125186, "learning_rate": 4.663626583997789e-06, "loss": 3.388, "step": 1722 }, { "epoch": 0.69, "grad_norm": 3.337474491676217, "learning_rate": 4.652674829827762e-06, "loss": 3.6723, "step": 1723 }, { "epoch": 0.69, "grad_norm": 3.8716302465293615, "learning_rate": 4.641732050210032e-06, "loss": 3.5052, "step": 1724 }, { "epoch": 0.69, "grad_norm": 3.5067245303171237, "learning_rate": 4.630798263510162e-06, "loss": 3.6115, "step": 1725 }, { "epoch": 0.69, "grad_norm": 3.7223703218552573, "learning_rate": 4.619873488078597e-06, "loss": 3.3937, "step": 1726 }, { "epoch": 0.69, "grad_norm": 4.510075235746018, "learning_rate": 4.608957742250667e-06, "loss": 3.5372, "step": 1727 }, { "epoch": 0.69, "grad_norm": 3.9874952880001366, "learning_rate": 4.598051044346542e-06, "loss": 3.3717, "step": 1728 }, { "epoch": 0.69, "grad_norm": 3.885816752060376, "learning_rate": 4.587153412671217e-06, "loss": 3.3987, "step": 1729 }, { "epoch": 0.69, "grad_norm": 3.7827071572281445, "learning_rate": 4.576264865514467e-06, "loss": 3.1763, "step": 1730 }, { "epoch": 0.69, "grad_norm": 3.890063791368322, "learning_rate": 4.565385421150817e-06, "loss": 3.5614, "step": 1731 }, { "epoch": 0.69, "grad_norm": 3.783650330588731, "learning_rate": 4.554515097839511e-06, "loss": 3.4039, "step": 1732 }, { "epoch": 0.69, "grad_norm": 3.973268905334733, "learning_rate": 4.543653913824496e-06, "loss": 3.3783, "step": 1733 }, { "epoch": 0.69, "grad_norm": 3.422375253114076, "learning_rate": 4.53280188733437e-06, "loss": 3.4258, "step": 1734 }, { "epoch": 0.69, "grad_norm": 3.6091058977682224, "learning_rate": 4.521959036582372e-06, "loss": 3.5038, "step": 1735 }, { "epoch": 0.69, "grad_norm": 3.449899993245845, "learning_rate": 4.511125379766332e-06, "loss": 3.3051, "step": 1736 }, { "epoch": 0.69, "grad_norm": 3.3716372041379747, "learning_rate": 4.500300935068647e-06, "loss": 3.3973, "step": 1737 }, { "epoch": 0.7, "grad_norm": 3.5533894107507735, "learning_rate": 4.489485720656266e-06, "loss": 3.4855, "step": 1738 }, { "epoch": 0.7, "grad_norm": 3.559773371573033, "learning_rate": 4.478679754680639e-06, "loss": 3.4554, "step": 1739 }, { "epoch": 0.7, "grad_norm": 4.138358956281304, "learning_rate": 4.467883055277696e-06, "loss": 3.5556, "step": 1740 }, { "epoch": 0.7, "grad_norm": 3.049507644251867, "learning_rate": 4.457095640567804e-06, "loss": 3.5686, "step": 1741 }, { "epoch": 0.7, "grad_norm": 3.11767434244862, "learning_rate": 4.4463175286557654e-06, "loss": 3.6089, "step": 1742 }, { "epoch": 0.7, "grad_norm": 3.3567108518742557, "learning_rate": 4.435548737630756e-06, "loss": 3.4967, "step": 1743 }, { "epoch": 0.7, "grad_norm": 3.6967941537174314, "learning_rate": 4.4247892855663164e-06, "loss": 3.3797, "step": 1744 }, { "epoch": 0.7, "grad_norm": 3.179141600245913, "learning_rate": 4.414039190520308e-06, "loss": 3.5925, "step": 1745 }, { "epoch": 0.7, "grad_norm": 3.453056516550807, "learning_rate": 4.403298470534885e-06, "loss": 3.6349, "step": 1746 }, { "epoch": 0.7, "grad_norm": 3.5712888486599774, "learning_rate": 4.39256714363648e-06, "loss": 3.6258, "step": 1747 }, { "epoch": 0.7, "grad_norm": 3.31493960973444, "learning_rate": 4.3818452278357445e-06, "loss": 3.4372, "step": 1748 }, { "epoch": 0.7, "grad_norm": 4.171432917668506, "learning_rate": 4.371132741127553e-06, "loss": 3.3394, "step": 1749 }, { "epoch": 0.7, "grad_norm": 3.656724448043528, "learning_rate": 4.360429701490935e-06, "loss": 3.3471, "step": 1750 }, { "epoch": 0.7, "grad_norm": 4.186499874648426, "learning_rate": 4.349736126889084e-06, "loss": 3.3913, "step": 1751 }, { "epoch": 0.7, "grad_norm": 3.8208946244534188, "learning_rate": 4.339052035269291e-06, "loss": 3.6172, "step": 1752 }, { "epoch": 0.7, "grad_norm": 4.116100858707486, "learning_rate": 4.328377444562948e-06, "loss": 3.4454, "step": 1753 }, { "epoch": 0.7, "grad_norm": 4.161503924768385, "learning_rate": 4.31771237268549e-06, "loss": 3.6466, "step": 1754 }, { "epoch": 0.7, "grad_norm": 4.494899046455282, "learning_rate": 4.307056837536373e-06, "loss": 3.4494, "step": 1755 }, { "epoch": 0.7, "grad_norm": 3.767898841687319, "learning_rate": 4.296410856999062e-06, "loss": 3.3613, "step": 1756 }, { "epoch": 0.7, "grad_norm": 3.9982391161187314, "learning_rate": 4.2857744489409725e-06, "loss": 3.6314, "step": 1757 }, { "epoch": 0.7, "grad_norm": 3.5058775605789996, "learning_rate": 4.2751476312134655e-06, "loss": 3.4211, "step": 1758 }, { "epoch": 0.7, "grad_norm": 3.697058493781567, "learning_rate": 4.264530421651792e-06, "loss": 3.5555, "step": 1759 }, { "epoch": 0.7, "grad_norm": 4.011432271491016, "learning_rate": 4.2539228380750955e-06, "loss": 3.241, "step": 1760 }, { "epoch": 0.7, "grad_norm": 3.9185907235318886, "learning_rate": 4.243324898286349e-06, "loss": 3.4969, "step": 1761 }, { "epoch": 0.7, "grad_norm": 3.5614235766322233, "learning_rate": 4.2327366200723404e-06, "loss": 3.3976, "step": 1762 }, { "epoch": 0.71, "grad_norm": 3.8839560924349317, "learning_rate": 4.222158021203657e-06, "loss": 3.4789, "step": 1763 }, { "epoch": 0.71, "grad_norm": 4.671243277730213, "learning_rate": 4.211589119434622e-06, "loss": 3.5638, "step": 1764 }, { "epoch": 0.71, "grad_norm": 3.3993535836751265, "learning_rate": 4.201029932503303e-06, "loss": 3.5107, "step": 1765 }, { "epoch": 0.71, "grad_norm": 3.013661425202177, "learning_rate": 4.190480478131443e-06, "loss": 3.2718, "step": 1766 }, { "epoch": 0.71, "grad_norm": 3.3719456559452223, "learning_rate": 4.179940774024469e-06, "loss": 3.5625, "step": 1767 }, { "epoch": 0.71, "grad_norm": 3.2931148487027095, "learning_rate": 4.169410837871427e-06, "loss": 3.4839, "step": 1768 }, { "epoch": 0.71, "grad_norm": 3.2833663912883853, "learning_rate": 4.158890687344986e-06, "loss": 3.3587, "step": 1769 }, { "epoch": 0.71, "grad_norm": 3.8882103322283528, "learning_rate": 4.14838034010138e-06, "loss": 3.3663, "step": 1770 }, { "epoch": 0.71, "grad_norm": 3.8955310931768823, "learning_rate": 4.137879813780388e-06, "loss": 3.4513, "step": 1771 }, { "epoch": 0.71, "grad_norm": 3.636956183849931, "learning_rate": 4.127389126005319e-06, "loss": 3.5105, "step": 1772 }, { "epoch": 0.71, "grad_norm": 3.6484577281266577, "learning_rate": 4.116908294382956e-06, "loss": 3.6188, "step": 1773 }, { "epoch": 0.71, "grad_norm": 4.0266571534966005, "learning_rate": 4.10643733650355e-06, "loss": 3.304, "step": 1774 }, { "epoch": 0.71, "grad_norm": 3.88428003537644, "learning_rate": 4.095976269940777e-06, "loss": 3.3975, "step": 1775 }, { "epoch": 0.71, "grad_norm": 3.6192466745520893, "learning_rate": 4.085525112251706e-06, "loss": 3.5155, "step": 1776 }, { "epoch": 0.71, "grad_norm": 3.704237885509808, "learning_rate": 4.0750838809767875e-06, "loss": 3.5143, "step": 1777 }, { "epoch": 0.71, "grad_norm": 3.678360702162961, "learning_rate": 4.0646525936398086e-06, "loss": 3.542, "step": 1778 }, { "epoch": 0.71, "grad_norm": 4.389314698208116, "learning_rate": 4.054231267747862e-06, "loss": 3.5115, "step": 1779 }, { "epoch": 0.71, "grad_norm": 3.701937280915647, "learning_rate": 4.043819920791322e-06, "loss": 3.4142, "step": 1780 }, { "epoch": 0.71, "grad_norm": 3.9218694258931435, "learning_rate": 4.033418570243819e-06, "loss": 3.4976, "step": 1781 }, { "epoch": 0.71, "grad_norm": 3.721482825872019, "learning_rate": 4.0230272335622065e-06, "loss": 3.4643, "step": 1782 }, { "epoch": 0.71, "grad_norm": 3.6470247224867243, "learning_rate": 4.012645928186533e-06, "loss": 3.5088, "step": 1783 }, { "epoch": 0.71, "grad_norm": 3.7696088401191385, "learning_rate": 4.002274671540006e-06, "loss": 3.2971, "step": 1784 }, { "epoch": 0.71, "grad_norm": 3.7052611261579202, "learning_rate": 3.991913481028965e-06, "loss": 3.5161, "step": 1785 }, { "epoch": 0.71, "grad_norm": 3.7585081410257684, "learning_rate": 3.981562374042867e-06, "loss": 3.1938, "step": 1786 }, { "epoch": 0.71, "grad_norm": 3.9585861280160684, "learning_rate": 3.971221367954239e-06, "loss": 3.4439, "step": 1787 }, { "epoch": 0.72, "grad_norm": 3.283636045591956, "learning_rate": 3.960890480118653e-06, "loss": 3.6489, "step": 1788 }, { "epoch": 0.72, "grad_norm": 3.654578635777536, "learning_rate": 3.950569727874704e-06, "loss": 3.4375, "step": 1789 }, { "epoch": 0.72, "grad_norm": 4.169022532820428, "learning_rate": 3.940259128543967e-06, "loss": 3.499, "step": 1790 }, { "epoch": 0.72, "grad_norm": 3.3344543139240477, "learning_rate": 3.9299586994309905e-06, "loss": 3.4579, "step": 1791 }, { "epoch": 0.72, "grad_norm": 3.746352581841832, "learning_rate": 3.919668457823248e-06, "loss": 3.4583, "step": 1792 }, { "epoch": 0.72, "grad_norm": 3.6192979524953657, "learning_rate": 3.909388420991113e-06, "loss": 3.4426, "step": 1793 }, { "epoch": 0.72, "grad_norm": 3.736628337328482, "learning_rate": 3.899118606187832e-06, "loss": 3.5413, "step": 1794 }, { "epoch": 0.72, "grad_norm": 3.4436637095613287, "learning_rate": 3.888859030649498e-06, "loss": 3.4048, "step": 1795 }, { "epoch": 0.72, "grad_norm": 3.2072769723867838, "learning_rate": 3.878609711595022e-06, "loss": 3.3558, "step": 1796 }, { "epoch": 0.72, "grad_norm": 3.7640368299387825, "learning_rate": 3.8683706662260945e-06, "loss": 3.4847, "step": 1797 }, { "epoch": 0.72, "grad_norm": 3.436055272932363, "learning_rate": 3.858141911727168e-06, "loss": 3.2878, "step": 1798 }, { "epoch": 0.72, "grad_norm": 3.529156547637179, "learning_rate": 3.847923465265418e-06, "loss": 3.7081, "step": 1799 }, { "epoch": 0.72, "grad_norm": 3.468301049973078, "learning_rate": 3.837715343990727e-06, "loss": 3.3457, "step": 1800 }, { "epoch": 0.72, "grad_norm": 3.5741394728587204, "learning_rate": 3.8275175650356485e-06, "loss": 3.523, "step": 1801 }, { "epoch": 0.72, "grad_norm": 3.6469623554292134, "learning_rate": 3.817330145515374e-06, "loss": 3.3227, "step": 1802 }, { "epoch": 0.72, "grad_norm": 3.332529000908109, "learning_rate": 3.807153102527704e-06, "loss": 3.3699, "step": 1803 }, { "epoch": 0.72, "grad_norm": 4.208393616053263, "learning_rate": 3.7969864531530344e-06, "loss": 3.4386, "step": 1804 }, { "epoch": 0.72, "grad_norm": 3.512302703646855, "learning_rate": 3.7868302144543146e-06, "loss": 3.6729, "step": 1805 }, { "epoch": 0.72, "grad_norm": 3.7091168641858503, "learning_rate": 3.7766844034770155e-06, "loss": 3.7557, "step": 1806 }, { "epoch": 0.72, "grad_norm": 4.132021104400233, "learning_rate": 3.766549037249112e-06, "loss": 3.3382, "step": 1807 }, { "epoch": 0.72, "grad_norm": 4.002942906418598, "learning_rate": 3.7564241327810436e-06, "loss": 3.4619, "step": 1808 }, { "epoch": 0.72, "grad_norm": 3.9271576290560906, "learning_rate": 3.7463097070656995e-06, "loss": 3.5258, "step": 1809 }, { "epoch": 0.72, "grad_norm": 3.98998238474593, "learning_rate": 3.736205777078381e-06, "loss": 3.4328, "step": 1810 }, { "epoch": 0.72, "grad_norm": 4.765986861107831, "learning_rate": 3.72611235977677e-06, "loss": 3.371, "step": 1811 }, { "epoch": 0.72, "grad_norm": 4.13175319308379, "learning_rate": 3.7160294721009026e-06, "loss": 3.2895, "step": 1812 }, { "epoch": 0.73, "grad_norm": 4.5676925128829895, "learning_rate": 3.705957130973149e-06, "loss": 3.3525, "step": 1813 }, { "epoch": 0.73, "grad_norm": 4.065579675390456, "learning_rate": 3.69589535329818e-06, "loss": 3.4717, "step": 1814 }, { "epoch": 0.73, "grad_norm": 3.6964342085032156, "learning_rate": 3.685844155962931e-06, "loss": 3.374, "step": 1815 }, { "epoch": 0.73, "grad_norm": 3.5454709243268656, "learning_rate": 3.675803555836582e-06, "loss": 3.2699, "step": 1816 }, { "epoch": 0.73, "grad_norm": 3.6910173645241513, "learning_rate": 3.6657735697705267e-06, "loss": 3.2236, "step": 1817 }, { "epoch": 0.73, "grad_norm": 3.588311501677217, "learning_rate": 3.6557542145983495e-06, "loss": 3.5066, "step": 1818 }, { "epoch": 0.73, "grad_norm": 4.651925132300316, "learning_rate": 3.6457455071357918e-06, "loss": 3.3634, "step": 1819 }, { "epoch": 0.73, "grad_norm": 4.250683821727388, "learning_rate": 3.63574746418072e-06, "loss": 3.267, "step": 1820 }, { "epoch": 0.73, "grad_norm": 3.429181473137611, "learning_rate": 3.625760102513103e-06, "loss": 3.5255, "step": 1821 }, { "epoch": 0.73, "grad_norm": 3.1222267069199283, "learning_rate": 3.6157834388949907e-06, "loss": 3.5681, "step": 1822 }, { "epoch": 0.73, "grad_norm": 3.43599969407311, "learning_rate": 3.6058174900704646e-06, "loss": 3.5532, "step": 1823 }, { "epoch": 0.73, "grad_norm": 3.9924024367280544, "learning_rate": 3.595862272765638e-06, "loss": 3.3557, "step": 1824 }, { "epoch": 0.73, "grad_norm": 4.335774679391719, "learning_rate": 3.585917803688603e-06, "loss": 3.1922, "step": 1825 }, { "epoch": 0.73, "grad_norm": 3.317027772024, "learning_rate": 3.5759840995294136e-06, "loss": 3.3783, "step": 1826 }, { "epoch": 0.73, "grad_norm": 3.805915177072342, "learning_rate": 3.5660611769600604e-06, "loss": 3.5855, "step": 1827 }, { "epoch": 0.73, "grad_norm": 3.9192673343873063, "learning_rate": 3.556149052634443e-06, "loss": 3.4866, "step": 1828 }, { "epoch": 0.73, "grad_norm": 3.484350084902121, "learning_rate": 3.546247743188328e-06, "loss": 3.5675, "step": 1829 }, { "epoch": 0.73, "grad_norm": 3.103281813841342, "learning_rate": 3.536357265239333e-06, "loss": 3.5552, "step": 1830 }, { "epoch": 0.73, "grad_norm": 3.450382295757395, "learning_rate": 3.5264776353869046e-06, "loss": 3.4087, "step": 1831 }, { "epoch": 0.73, "grad_norm": 4.437727669001523, "learning_rate": 3.5166088702122738e-06, "loss": 3.3265, "step": 1832 }, { "epoch": 0.73, "grad_norm": 4.346381212758512, "learning_rate": 3.5067509862784455e-06, "loss": 3.4703, "step": 1833 }, { "epoch": 0.73, "grad_norm": 3.587343604875265, "learning_rate": 3.4969040001301513e-06, "loss": 3.6085, "step": 1834 }, { "epoch": 0.73, "grad_norm": 3.854682116713474, "learning_rate": 3.487067928293848e-06, "loss": 3.4987, "step": 1835 }, { "epoch": 0.73, "grad_norm": 3.9170447966343542, "learning_rate": 3.4772427872776606e-06, "loss": 3.3871, "step": 1836 }, { "epoch": 0.73, "grad_norm": 3.652408689543764, "learning_rate": 3.4674285935713715e-06, "loss": 3.5386, "step": 1837 }, { "epoch": 0.74, "grad_norm": 3.6076480873164796, "learning_rate": 3.4576253636463996e-06, "loss": 3.4289, "step": 1838 }, { "epoch": 0.74, "grad_norm": 3.633468327300971, "learning_rate": 3.4478331139557475e-06, "loss": 3.4698, "step": 1839 }, { "epoch": 0.74, "grad_norm": 3.584011854682265, "learning_rate": 3.4380518609340076e-06, "loss": 3.3809, "step": 1840 }, { "epoch": 0.74, "grad_norm": 3.4086643945235453, "learning_rate": 3.428281620997296e-06, "loss": 3.4472, "step": 1841 }, { "epoch": 0.74, "grad_norm": 3.7236703417587798, "learning_rate": 3.418522410543266e-06, "loss": 3.3683, "step": 1842 }, { "epoch": 0.74, "grad_norm": 4.516193849667952, "learning_rate": 3.4087742459510396e-06, "loss": 3.3648, "step": 1843 }, { "epoch": 0.74, "grad_norm": 4.0536722168971995, "learning_rate": 3.3990371435812185e-06, "loss": 3.3732, "step": 1844 }, { "epoch": 0.74, "grad_norm": 3.7285338094616565, "learning_rate": 3.3893111197758276e-06, "loss": 3.3925, "step": 1845 }, { "epoch": 0.74, "grad_norm": 3.7303715346811033, "learning_rate": 3.3795961908582965e-06, "loss": 3.3005, "step": 1846 }, { "epoch": 0.74, "grad_norm": 3.9474329011980913, "learning_rate": 3.3698923731334453e-06, "loss": 3.4754, "step": 1847 }, { "epoch": 0.74, "grad_norm": 3.5606404505123774, "learning_rate": 3.360199682887433e-06, "loss": 3.675, "step": 1848 }, { "epoch": 0.74, "grad_norm": 3.7856691217724734, "learning_rate": 3.3505181363877536e-06, "loss": 3.342, "step": 1849 }, { "epoch": 0.74, "grad_norm": 3.300592236425659, "learning_rate": 3.3408477498831917e-06, "loss": 3.2336, "step": 1850 }, { "epoch": 0.74, "grad_norm": 3.511252927635383, "learning_rate": 3.3311885396038002e-06, "loss": 3.47, "step": 1851 }, { "epoch": 0.74, "grad_norm": 3.496345516082245, "learning_rate": 3.321540521760883e-06, "loss": 3.4671, "step": 1852 }, { "epoch": 0.74, "grad_norm": 3.20909351432755, "learning_rate": 3.3119037125469553e-06, "loss": 3.6159, "step": 1853 }, { "epoch": 0.74, "grad_norm": 3.5867603857436006, "learning_rate": 3.3022781281357184e-06, "loss": 3.7095, "step": 1854 }, { "epoch": 0.74, "grad_norm": 3.4734426795727376, "learning_rate": 3.2926637846820366e-06, "loss": 3.5205, "step": 1855 }, { "epoch": 0.74, "grad_norm": 4.055599777957748, "learning_rate": 3.2830606983219038e-06, "loss": 3.4546, "step": 1856 }, { "epoch": 0.74, "grad_norm": 4.196656750316166, "learning_rate": 3.2734688851724273e-06, "loss": 3.3104, "step": 1857 }, { "epoch": 0.74, "grad_norm": 3.761411862813457, "learning_rate": 3.2638883613317974e-06, "loss": 3.4606, "step": 1858 }, { "epoch": 0.74, "grad_norm": 3.2224622867474353, "learning_rate": 3.2543191428792466e-06, "loss": 3.6587, "step": 1859 }, { "epoch": 0.74, "grad_norm": 3.252120608521762, "learning_rate": 3.2447612458750365e-06, "loss": 3.3622, "step": 1860 }, { "epoch": 0.74, "grad_norm": 3.4210000097935085, "learning_rate": 3.2352146863604317e-06, "loss": 3.3387, "step": 1861 }, { "epoch": 0.74, "grad_norm": 3.9081317042211747, "learning_rate": 3.2256794803576707e-06, "loss": 3.5707, "step": 1862 }, { "epoch": 0.75, "grad_norm": 4.2730047394313795, "learning_rate": 3.2161556438699303e-06, "loss": 3.5105, "step": 1863 }, { "epoch": 0.75, "grad_norm": 3.566834764286657, "learning_rate": 3.2066431928813068e-06, "loss": 3.5039, "step": 1864 }, { "epoch": 0.75, "grad_norm": 3.4615537663089855, "learning_rate": 3.197142143356787e-06, "loss": 3.2387, "step": 1865 }, { "epoch": 0.75, "grad_norm": 3.7727034614513606, "learning_rate": 3.1876525112422283e-06, "loss": 3.4536, "step": 1866 }, { "epoch": 0.75, "grad_norm": 3.850992472141217, "learning_rate": 3.178174312464326e-06, "loss": 3.4271, "step": 1867 }, { "epoch": 0.75, "grad_norm": 3.4404782089592967, "learning_rate": 3.1687075629305787e-06, "loss": 3.5452, "step": 1868 }, { "epoch": 0.75, "grad_norm": 3.4309534748380246, "learning_rate": 3.1592522785292714e-06, "loss": 3.5336, "step": 1869 }, { "epoch": 0.75, "grad_norm": 3.9067620491946853, "learning_rate": 3.1498084751294523e-06, "loss": 3.598, "step": 1870 }, { "epoch": 0.75, "grad_norm": 3.448718948350417, "learning_rate": 3.1403761685809007e-06, "loss": 3.3707, "step": 1871 }, { "epoch": 0.75, "grad_norm": 3.3574537673903415, "learning_rate": 3.130955374714094e-06, "loss": 3.4151, "step": 1872 }, { "epoch": 0.75, "grad_norm": 3.542471370777354, "learning_rate": 3.1215461093401913e-06, "loss": 3.2864, "step": 1873 }, { "epoch": 0.75, "grad_norm": 3.96498352917791, "learning_rate": 3.1121483882509996e-06, "loss": 3.4146, "step": 1874 }, { "epoch": 0.75, "grad_norm": 2.9306184568657394, "learning_rate": 3.1027622272189572e-06, "loss": 3.6675, "step": 1875 }, { "epoch": 0.75, "grad_norm": 3.563712587378033, "learning_rate": 3.0933876419971008e-06, "loss": 3.5339, "step": 1876 }, { "epoch": 0.75, "grad_norm": 3.656991955040363, "learning_rate": 3.0840246483190338e-06, "loss": 3.4484, "step": 1877 }, { "epoch": 0.75, "grad_norm": 3.8382126444965428, "learning_rate": 3.074673261898903e-06, "loss": 3.4853, "step": 1878 }, { "epoch": 0.75, "grad_norm": 3.775782565231201, "learning_rate": 3.065333498431381e-06, "loss": 3.193, "step": 1879 }, { "epoch": 0.75, "grad_norm": 4.55715379265718, "learning_rate": 3.0560053735916372e-06, "loss": 3.3536, "step": 1880 }, { "epoch": 0.75, "grad_norm": 3.852296770838666, "learning_rate": 3.0466889030352976e-06, "loss": 3.5737, "step": 1881 }, { "epoch": 0.75, "grad_norm": 3.9813297344776477, "learning_rate": 3.037384102398431e-06, "loss": 3.6859, "step": 1882 }, { "epoch": 0.75, "grad_norm": 3.7225704836822513, "learning_rate": 3.0280909872975194e-06, "loss": 3.4207, "step": 1883 }, { "epoch": 0.75, "grad_norm": 3.424724501437131, "learning_rate": 3.0188095733294388e-06, "loss": 3.4869, "step": 1884 }, { "epoch": 0.75, "grad_norm": 3.5295304976839144, "learning_rate": 3.009539876071427e-06, "loss": 3.3601, "step": 1885 }, { "epoch": 0.75, "grad_norm": 3.5881747955842727, "learning_rate": 3.0002819110810475e-06, "loss": 3.6793, "step": 1886 }, { "epoch": 0.75, "grad_norm": 3.9583141250494607, "learning_rate": 2.9910356938961782e-06, "loss": 3.0941, "step": 1887 }, { "epoch": 0.76, "grad_norm": 3.912358664094153, "learning_rate": 2.981801240034985e-06, "loss": 3.3709, "step": 1888 }, { "epoch": 0.76, "grad_norm": 3.263720825562735, "learning_rate": 2.9725785649958895e-06, "loss": 3.3977, "step": 1889 }, { "epoch": 0.76, "grad_norm": 3.509980044518029, "learning_rate": 2.9633676842575386e-06, "loss": 3.3369, "step": 1890 }, { "epoch": 0.76, "grad_norm": 3.1011270995474804, "learning_rate": 2.9541686132787907e-06, "loss": 3.4079, "step": 1891 }, { "epoch": 0.76, "grad_norm": 3.0679947729345174, "learning_rate": 2.944981367498677e-06, "loss": 3.3624, "step": 1892 }, { "epoch": 0.76, "grad_norm": 3.288148926844505, "learning_rate": 2.93580596233639e-06, "loss": 3.4671, "step": 1893 }, { "epoch": 0.76, "grad_norm": 3.6214058326443292, "learning_rate": 2.9266424131912495e-06, "loss": 3.3457, "step": 1894 }, { "epoch": 0.76, "grad_norm": 3.53592212281632, "learning_rate": 2.9174907354426696e-06, "loss": 3.3712, "step": 1895 }, { "epoch": 0.76, "grad_norm": 3.5987696162272527, "learning_rate": 2.9083509444501433e-06, "loss": 3.3665, "step": 1896 }, { "epoch": 0.76, "grad_norm": 3.085596638238194, "learning_rate": 2.899223055553221e-06, "loss": 3.6694, "step": 1897 }, { "epoch": 0.76, "grad_norm": 3.4340867270724083, "learning_rate": 2.890107084071465e-06, "loss": 3.316, "step": 1898 }, { "epoch": 0.76, "grad_norm": 4.285488215778594, "learning_rate": 2.881003045304448e-06, "loss": 3.4162, "step": 1899 }, { "epoch": 0.76, "grad_norm": 3.6478133045438454, "learning_rate": 2.8719109545317102e-06, "loss": 3.4196, "step": 1900 }, { "epoch": 0.76, "grad_norm": 3.4821976881960612, "learning_rate": 2.8628308270127335e-06, "loss": 3.2749, "step": 1901 }, { "epoch": 0.76, "grad_norm": 3.8625654362844153, "learning_rate": 2.853762677986932e-06, "loss": 3.1237, "step": 1902 }, { "epoch": 0.76, "grad_norm": 3.9299865920722934, "learning_rate": 2.844706522673616e-06, "loss": 3.5478, "step": 1903 }, { "epoch": 0.76, "grad_norm": 3.410719964347605, "learning_rate": 2.835662376271957e-06, "loss": 3.7158, "step": 1904 }, { "epoch": 0.76, "grad_norm": 3.873022927865709, "learning_rate": 2.8266302539609747e-06, "loss": 3.4461, "step": 1905 }, { "epoch": 0.76, "grad_norm": 3.67026539624037, "learning_rate": 2.8176101708995174e-06, "loss": 3.4867, "step": 1906 }, { "epoch": 0.76, "grad_norm": 3.9182061324797237, "learning_rate": 2.808602142226212e-06, "loss": 3.2738, "step": 1907 }, { "epoch": 0.76, "grad_norm": 4.124918425721532, "learning_rate": 2.7996061830594714e-06, "loss": 3.4517, "step": 1908 }, { "epoch": 0.76, "grad_norm": 4.105538309675829, "learning_rate": 2.7906223084974405e-06, "loss": 3.2271, "step": 1909 }, { "epoch": 0.76, "grad_norm": 3.3859102048877254, "learning_rate": 2.78165053361798e-06, "loss": 3.3504, "step": 1910 }, { "epoch": 0.76, "grad_norm": 3.7425966269377486, "learning_rate": 2.772690873478656e-06, "loss": 3.3253, "step": 1911 }, { "epoch": 0.76, "grad_norm": 3.310424851528983, "learning_rate": 2.7637433431166903e-06, "loss": 3.2082, "step": 1912 }, { "epoch": 0.77, "grad_norm": 3.221875373344164, "learning_rate": 2.754807957548955e-06, "loss": 3.6851, "step": 1913 }, { "epoch": 0.77, "grad_norm": 3.5123775223492832, "learning_rate": 2.745884731771931e-06, "loss": 3.5482, "step": 1914 }, { "epoch": 0.77, "grad_norm": 3.044351047721393, "learning_rate": 2.736973680761702e-06, "loss": 3.4826, "step": 1915 }, { "epoch": 0.77, "grad_norm": 4.1297812159014615, "learning_rate": 2.728074819473908e-06, "loss": 3.4117, "step": 1916 }, { "epoch": 0.77, "grad_norm": 3.8714590716432395, "learning_rate": 2.7191881628437335e-06, "loss": 3.479, "step": 1917 }, { "epoch": 0.77, "grad_norm": 3.5456711355596324, "learning_rate": 2.7103137257858867e-06, "loss": 3.4652, "step": 1918 }, { "epoch": 0.77, "grad_norm": 3.8344813934172226, "learning_rate": 2.7014515231945557e-06, "loss": 3.4678, "step": 1919 }, { "epoch": 0.77, "grad_norm": 3.4176638962488086, "learning_rate": 2.692601569943407e-06, "loss": 3.5222, "step": 1920 }, { "epoch": 0.77, "grad_norm": 4.1479220590942765, "learning_rate": 2.683763880885538e-06, "loss": 3.5278, "step": 1921 }, { "epoch": 0.77, "grad_norm": 3.7676113482167803, "learning_rate": 2.674938470853472e-06, "loss": 3.5333, "step": 1922 }, { "epoch": 0.77, "grad_norm": 3.506946240907651, "learning_rate": 2.6661253546591158e-06, "loss": 3.6749, "step": 1923 }, { "epoch": 0.77, "grad_norm": 3.6055160050017987, "learning_rate": 2.6573245470937527e-06, "loss": 3.518, "step": 1924 }, { "epoch": 0.77, "grad_norm": 3.7106478573047106, "learning_rate": 2.648536062927999e-06, "loss": 3.54, "step": 1925 }, { "epoch": 0.77, "grad_norm": 3.5451801376739276, "learning_rate": 2.639759916911788e-06, "loss": 3.5237, "step": 1926 }, { "epoch": 0.77, "grad_norm": 3.8798623404933728, "learning_rate": 2.6309961237743587e-06, "loss": 3.491, "step": 1927 }, { "epoch": 0.77, "grad_norm": 3.5768159862331284, "learning_rate": 2.6222446982242e-06, "loss": 3.2704, "step": 1928 }, { "epoch": 0.77, "grad_norm": 4.269484671851689, "learning_rate": 2.61350565494906e-06, "loss": 3.4937, "step": 1929 }, { "epoch": 0.77, "grad_norm": 3.332641827711261, "learning_rate": 2.604779008615895e-06, "loss": 3.5345, "step": 1930 }, { "epoch": 0.77, "grad_norm": 3.829641692493816, "learning_rate": 2.5960647738708553e-06, "loss": 3.3914, "step": 1931 }, { "epoch": 0.77, "grad_norm": 3.231876182233534, "learning_rate": 2.5873629653392653e-06, "loss": 3.5457, "step": 1932 }, { "epoch": 0.77, "grad_norm": 3.92942692841208, "learning_rate": 2.578673597625597e-06, "loss": 3.4388, "step": 1933 }, { "epoch": 0.77, "grad_norm": 4.207353567728463, "learning_rate": 2.569996685313434e-06, "loss": 3.4373, "step": 1934 }, { "epoch": 0.77, "grad_norm": 3.81926065122999, "learning_rate": 2.5613322429654573e-06, "loss": 3.4003, "step": 1935 }, { "epoch": 0.77, "grad_norm": 3.736117305431284, "learning_rate": 2.5526802851234268e-06, "loss": 3.259, "step": 1936 }, { "epoch": 0.77, "grad_norm": 3.8281798178776683, "learning_rate": 2.5440408263081385e-06, "loss": 3.5264, "step": 1937 }, { "epoch": 0.78, "grad_norm": 4.418895548861695, "learning_rate": 2.535413881019423e-06, "loss": 3.3591, "step": 1938 }, { "epoch": 0.78, "grad_norm": 3.4143628309124843, "learning_rate": 2.526799463736099e-06, "loss": 3.3624, "step": 1939 }, { "epoch": 0.78, "grad_norm": 4.041022539593813, "learning_rate": 2.5181975889159615e-06, "loss": 3.5832, "step": 1940 }, { "epoch": 0.78, "grad_norm": 3.6739434254765775, "learning_rate": 2.509608270995758e-06, "loss": 3.3057, "step": 1941 }, { "epoch": 0.78, "grad_norm": 4.493683119459877, "learning_rate": 2.501031524391163e-06, "loss": 3.1586, "step": 1942 }, { "epoch": 0.78, "grad_norm": 3.8597452477414396, "learning_rate": 2.492467363496747e-06, "loss": 3.3849, "step": 1943 }, { "epoch": 0.78, "grad_norm": 3.817698743749816, "learning_rate": 2.483915802685959e-06, "loss": 3.46, "step": 1944 }, { "epoch": 0.78, "grad_norm": 4.0309323747808055, "learning_rate": 2.475376856311097e-06, "loss": 3.5345, "step": 1945 }, { "epoch": 0.78, "grad_norm": 3.4469163019967795, "learning_rate": 2.4668505387033025e-06, "loss": 3.3929, "step": 1946 }, { "epoch": 0.78, "grad_norm": 3.761660769645593, "learning_rate": 2.458336864172508e-06, "loss": 3.4265, "step": 1947 }, { "epoch": 0.78, "grad_norm": 3.534763224235543, "learning_rate": 2.44983584700743e-06, "loss": 3.696, "step": 1948 }, { "epoch": 0.78, "grad_norm": 3.3341157071801257, "learning_rate": 2.4413475014755396e-06, "loss": 3.385, "step": 1949 }, { "epoch": 0.78, "grad_norm": 4.9798835388607525, "learning_rate": 2.432871841823047e-06, "loss": 3.2967, "step": 1950 }, { "epoch": 0.78, "grad_norm": 3.9600306056181904, "learning_rate": 2.42440888227487e-06, "loss": 3.2849, "step": 1951 }, { "epoch": 0.78, "grad_norm": 4.104498375827391, "learning_rate": 2.415958637034609e-06, "loss": 3.4203, "step": 1952 }, { "epoch": 0.78, "grad_norm": 3.7502083975036826, "learning_rate": 2.407521120284523e-06, "loss": 3.3641, "step": 1953 }, { "epoch": 0.78, "grad_norm": 3.5881591927638516, "learning_rate": 2.3990963461855075e-06, "loss": 3.2882, "step": 1954 }, { "epoch": 0.78, "grad_norm": 3.428645383919292, "learning_rate": 2.390684328877089e-06, "loss": 3.4263, "step": 1955 }, { "epoch": 0.78, "grad_norm": 3.721307523264979, "learning_rate": 2.3822850824773623e-06, "loss": 3.4291, "step": 1956 }, { "epoch": 0.78, "grad_norm": 3.6339505062874577, "learning_rate": 2.3738986210829997e-06, "loss": 3.5408, "step": 1957 }, { "epoch": 0.78, "grad_norm": 3.7015671534693193, "learning_rate": 2.3655249587692073e-06, "loss": 3.367, "step": 1958 }, { "epoch": 0.78, "grad_norm": 3.4071277961892066, "learning_rate": 2.3571641095897223e-06, "loss": 3.1734, "step": 1959 }, { "epoch": 0.78, "grad_norm": 3.704927639099572, "learning_rate": 2.3488160875767717e-06, "loss": 3.4119, "step": 1960 }, { "epoch": 0.78, "grad_norm": 3.1341363150404877, "learning_rate": 2.340480906741053e-06, "loss": 3.2445, "step": 1961 }, { "epoch": 0.78, "grad_norm": 3.58868146610442, "learning_rate": 2.332158581071712e-06, "loss": 3.5284, "step": 1962 }, { "epoch": 0.79, "grad_norm": 3.31203736033518, "learning_rate": 2.323849124536315e-06, "loss": 3.4336, "step": 1963 }, { "epoch": 0.79, "grad_norm": 4.813832593935863, "learning_rate": 2.3155525510808453e-06, "loss": 3.509, "step": 1964 }, { "epoch": 0.79, "grad_norm": 4.0666930929331615, "learning_rate": 2.307268874629649e-06, "loss": 3.3478, "step": 1965 }, { "epoch": 0.79, "grad_norm": 3.479016340534677, "learning_rate": 2.2989981090854306e-06, "loss": 3.5218, "step": 1966 }, { "epoch": 0.79, "grad_norm": 4.087492566542144, "learning_rate": 2.2907402683292268e-06, "loss": 3.4544, "step": 1967 }, { "epoch": 0.79, "grad_norm": 3.227066132714872, "learning_rate": 2.2824953662203832e-06, "loss": 3.4573, "step": 1968 }, { "epoch": 0.79, "grad_norm": 3.859067379027617, "learning_rate": 2.2742634165965317e-06, "loss": 3.4539, "step": 1969 }, { "epoch": 0.79, "grad_norm": 3.413802073964399, "learning_rate": 2.266044433273562e-06, "loss": 3.3942, "step": 1970 }, { "epoch": 0.79, "grad_norm": 3.086380726334998, "learning_rate": 2.2578384300456014e-06, "loss": 3.5544, "step": 1971 }, { "epoch": 0.79, "grad_norm": 3.591331242244212, "learning_rate": 2.249645420684998e-06, "loss": 3.1801, "step": 1972 }, { "epoch": 0.79, "grad_norm": 3.990643884284416, "learning_rate": 2.2414654189422845e-06, "loss": 3.3328, "step": 1973 }, { "epoch": 0.79, "grad_norm": 3.7177062831411893, "learning_rate": 2.233298438546172e-06, "loss": 3.5575, "step": 1974 }, { "epoch": 0.79, "grad_norm": 4.057689293818614, "learning_rate": 2.2251444932035094e-06, "loss": 3.5605, "step": 1975 }, { "epoch": 0.79, "grad_norm": 3.5873009841450383, "learning_rate": 2.2170035965992674e-06, "loss": 3.5157, "step": 1976 }, { "epoch": 0.79, "grad_norm": 3.539103717358677, "learning_rate": 2.2088757623965263e-06, "loss": 3.3443, "step": 1977 }, { "epoch": 0.79, "grad_norm": 4.0314317779393285, "learning_rate": 2.2007610042364337e-06, "loss": 3.4664, "step": 1978 }, { "epoch": 0.79, "grad_norm": 3.319751629440479, "learning_rate": 2.1926593357382e-06, "loss": 3.3711, "step": 1979 }, { "epoch": 0.79, "grad_norm": 3.5932358520391667, "learning_rate": 2.184570770499056e-06, "loss": 3.4833, "step": 1980 }, { "epoch": 0.79, "grad_norm": 3.367779707746548, "learning_rate": 2.176495322094254e-06, "loss": 3.3852, "step": 1981 }, { "epoch": 0.79, "grad_norm": 3.3471031613965927, "learning_rate": 2.1684330040770183e-06, "loss": 3.5292, "step": 1982 }, { "epoch": 0.79, "grad_norm": 3.3645699007477976, "learning_rate": 2.1603838299785486e-06, "loss": 3.3293, "step": 1983 }, { "epoch": 0.79, "grad_norm": 4.175500406999358, "learning_rate": 2.1523478133079776e-06, "loss": 3.4913, "step": 1984 }, { "epoch": 0.79, "grad_norm": 3.455711229312703, "learning_rate": 2.1443249675523536e-06, "loss": 3.4397, "step": 1985 }, { "epoch": 0.79, "grad_norm": 3.3649792008508643, "learning_rate": 2.1363153061766297e-06, "loss": 3.2848, "step": 1986 }, { "epoch": 0.79, "grad_norm": 4.389255501639541, "learning_rate": 2.128318842623618e-06, "loss": 3.3366, "step": 1987 }, { "epoch": 0.8, "grad_norm": 3.9054400348160923, "learning_rate": 2.1203355903139934e-06, "loss": 3.3767, "step": 1988 }, { "epoch": 0.8, "grad_norm": 3.202178172426877, "learning_rate": 2.112365562646248e-06, "loss": 3.3604, "step": 1989 }, { "epoch": 0.8, "grad_norm": 3.6981929021416575, "learning_rate": 2.1044087729966856e-06, "loss": 3.5228, "step": 1990 }, { "epoch": 0.8, "grad_norm": 2.9855355748316836, "learning_rate": 2.0964652347193894e-06, "loss": 3.4482, "step": 1991 }, { "epoch": 0.8, "grad_norm": 3.522924579103291, "learning_rate": 2.088534961146197e-06, "loss": 3.4379, "step": 1992 }, { "epoch": 0.8, "grad_norm": 3.2621294434046266, "learning_rate": 2.0806179655866964e-06, "loss": 3.4508, "step": 1993 }, { "epoch": 0.8, "grad_norm": 3.5103053461158957, "learning_rate": 2.072714261328177e-06, "loss": 3.2115, "step": 1994 }, { "epoch": 0.8, "grad_norm": 3.6445225225407483, "learning_rate": 2.064823861635633e-06, "loss": 3.4337, "step": 1995 }, { "epoch": 0.8, "grad_norm": 3.760005549912783, "learning_rate": 2.0569467797517173e-06, "loss": 3.3243, "step": 1996 }, { "epoch": 0.8, "grad_norm": 3.2248284331492862, "learning_rate": 2.0490830288967443e-06, "loss": 3.1467, "step": 1997 }, { "epoch": 0.8, "grad_norm": 4.170608568490723, "learning_rate": 2.041232622268642e-06, "loss": 3.1809, "step": 1998 }, { "epoch": 0.8, "grad_norm": 3.6936094239380752, "learning_rate": 2.033395573042952e-06, "loss": 3.3979, "step": 1999 }, { "epoch": 0.8, "grad_norm": 3.0024131234462748, "learning_rate": 2.025571894372794e-06, "loss": 3.473, "step": 2000 }, { "epoch": 0.8, "grad_norm": 3.5589140858814954, "learning_rate": 2.017761599388842e-06, "loss": 3.3617, "step": 2001 }, { "epoch": 0.8, "grad_norm": 3.517662937978228, "learning_rate": 2.0099647011993217e-06, "loss": 3.3516, "step": 2002 }, { "epoch": 0.8, "grad_norm": 3.243366018931923, "learning_rate": 2.00218121288996e-06, "loss": 3.3382, "step": 2003 }, { "epoch": 0.8, "grad_norm": 3.5996319864021356, "learning_rate": 1.994411147523987e-06, "loss": 3.4044, "step": 2004 }, { "epoch": 0.8, "grad_norm": 3.931460370263261, "learning_rate": 1.9866545181421016e-06, "loss": 3.394, "step": 2005 }, { "epoch": 0.8, "grad_norm": 3.521015795117093, "learning_rate": 1.97891133776245e-06, "loss": 3.5298, "step": 2006 }, { "epoch": 0.8, "grad_norm": 3.625159681693936, "learning_rate": 1.971181619380611e-06, "loss": 3.192, "step": 2007 }, { "epoch": 0.8, "grad_norm": 3.8264105777364743, "learning_rate": 1.963465375969572e-06, "loss": 3.3629, "step": 2008 }, { "epoch": 0.8, "grad_norm": 3.676670729415169, "learning_rate": 1.955762620479699e-06, "loss": 3.4384, "step": 2009 }, { "epoch": 0.8, "grad_norm": 3.394143019524009, "learning_rate": 1.9480733658387175e-06, "loss": 3.3489, "step": 2010 }, { "epoch": 0.8, "grad_norm": 3.2006120148546935, "learning_rate": 1.940397624951709e-06, "loss": 3.4182, "step": 2011 }, { "epoch": 0.8, "grad_norm": 3.6169611140886224, "learning_rate": 1.9327354107010566e-06, "loss": 3.4577, "step": 2012 }, { "epoch": 0.81, "grad_norm": 3.518943092013396, "learning_rate": 1.9250867359464575e-06, "loss": 3.3019, "step": 2013 }, { "epoch": 0.81, "grad_norm": 3.257401642789152, "learning_rate": 1.9174516135248745e-06, "loss": 3.5141, "step": 2014 }, { "epoch": 0.81, "grad_norm": 3.6641653222885147, "learning_rate": 1.9098300562505266e-06, "loss": 3.3903, "step": 2015 }, { "epoch": 0.81, "grad_norm": 3.7826868132289713, "learning_rate": 1.902222076914869e-06, "loss": 3.4909, "step": 2016 }, { "epoch": 0.81, "grad_norm": 3.5520282434078796, "learning_rate": 1.894627688286571e-06, "loss": 3.312, "step": 2017 }, { "epoch": 0.81, "grad_norm": 3.6540670643755684, "learning_rate": 1.8870469031114868e-06, "loss": 3.3058, "step": 2018 }, { "epoch": 0.81, "grad_norm": 3.735593509692127, "learning_rate": 1.8794797341126403e-06, "loss": 3.423, "step": 2019 }, { "epoch": 0.81, "grad_norm": 3.328850503165228, "learning_rate": 1.8719261939902023e-06, "loss": 3.3835, "step": 2020 }, { "epoch": 0.81, "grad_norm": 3.5312614490842464, "learning_rate": 1.8643862954214754e-06, "loss": 3.4535, "step": 2021 }, { "epoch": 0.81, "grad_norm": 3.530286273299819, "learning_rate": 1.8568600510608659e-06, "loss": 3.1957, "step": 2022 }, { "epoch": 0.81, "grad_norm": 3.3364923534174658, "learning_rate": 1.8493474735398575e-06, "loss": 3.4549, "step": 2023 }, { "epoch": 0.81, "grad_norm": 3.495691119616844, "learning_rate": 1.8418485754670013e-06, "loss": 3.4042, "step": 2024 }, { "epoch": 0.81, "grad_norm": 3.8025310219035995, "learning_rate": 1.8343633694278895e-06, "loss": 3.7479, "step": 2025 }, { "epoch": 0.81, "grad_norm": 3.8728585066365886, "learning_rate": 1.8268918679851388e-06, "loss": 3.4093, "step": 2026 }, { "epoch": 0.81, "grad_norm": 3.7564618280991113, "learning_rate": 1.8194340836783565e-06, "loss": 3.3054, "step": 2027 }, { "epoch": 0.81, "grad_norm": 3.681863260494089, "learning_rate": 1.8119900290241331e-06, "loss": 3.5343, "step": 2028 }, { "epoch": 0.81, "grad_norm": 3.5455921958019068, "learning_rate": 1.8045597165160134e-06, "loss": 3.3716, "step": 2029 }, { "epoch": 0.81, "grad_norm": 3.356563608206486, "learning_rate": 1.7971431586244814e-06, "loss": 3.4031, "step": 2030 }, { "epoch": 0.81, "grad_norm": 3.2452422437785047, "learning_rate": 1.7897403677969405e-06, "loss": 3.3848, "step": 2031 }, { "epoch": 0.81, "grad_norm": 2.914647472140249, "learning_rate": 1.7823513564576788e-06, "loss": 3.3433, "step": 2032 }, { "epoch": 0.81, "grad_norm": 3.856800399645909, "learning_rate": 1.774976137007861e-06, "loss": 3.3345, "step": 2033 }, { "epoch": 0.81, "grad_norm": 3.45709066528513, "learning_rate": 1.7676147218255092e-06, "loss": 3.4962, "step": 2034 }, { "epoch": 0.81, "grad_norm": 3.766310680972499, "learning_rate": 1.7602671232654755e-06, "loss": 3.4752, "step": 2035 }, { "epoch": 0.81, "grad_norm": 3.1432477155905585, "learning_rate": 1.7529333536594217e-06, "loss": 3.4079, "step": 2036 }, { "epoch": 0.81, "grad_norm": 4.178186217696704, "learning_rate": 1.7456134253157976e-06, "loss": 3.4548, "step": 2037 }, { "epoch": 0.82, "grad_norm": 4.075392019420518, "learning_rate": 1.7383073505198255e-06, "loss": 3.387, "step": 2038 }, { "epoch": 0.82, "grad_norm": 3.822195351239234, "learning_rate": 1.7310151415334798e-06, "loss": 3.4997, "step": 2039 }, { "epoch": 0.82, "grad_norm": 3.4265966877599485, "learning_rate": 1.723736810595461e-06, "loss": 3.484, "step": 2040 }, { "epoch": 0.82, "grad_norm": 2.9330877865873854, "learning_rate": 1.7164723699211782e-06, "loss": 3.4072, "step": 2041 }, { "epoch": 0.82, "grad_norm": 3.25944471322417, "learning_rate": 1.709221831702723e-06, "loss": 3.2468, "step": 2042 }, { "epoch": 0.82, "grad_norm": 3.6246985984462836, "learning_rate": 1.7019852081088616e-06, "loss": 3.429, "step": 2043 }, { "epoch": 0.82, "grad_norm": 3.455585489352441, "learning_rate": 1.6947625112850074e-06, "loss": 3.5424, "step": 2044 }, { "epoch": 0.82, "grad_norm": 3.443555610888772, "learning_rate": 1.687553753353195e-06, "loss": 3.5144, "step": 2045 }, { "epoch": 0.82, "grad_norm": 3.818876261129042, "learning_rate": 1.680358946412064e-06, "loss": 3.454, "step": 2046 }, { "epoch": 0.82, "grad_norm": 3.343897423268437, "learning_rate": 1.6731781025368422e-06, "loss": 3.42, "step": 2047 }, { "epoch": 0.82, "grad_norm": 3.4004073563010193, "learning_rate": 1.6660112337793256e-06, "loss": 3.5292, "step": 2048 }, { "epoch": 0.82, "grad_norm": 3.6316960690058284, "learning_rate": 1.6588583521678536e-06, "loss": 3.571, "step": 2049 }, { "epoch": 0.82, "grad_norm": 3.5002122945295335, "learning_rate": 1.6517194697072903e-06, "loss": 3.5268, "step": 2050 }, { "epoch": 0.82, "grad_norm": 3.2795958724183882, "learning_rate": 1.644594598378999e-06, "loss": 3.3957, "step": 2051 }, { "epoch": 0.82, "grad_norm": 3.6550159003093974, "learning_rate": 1.6374837501408403e-06, "loss": 3.7017, "step": 2052 }, { "epoch": 0.82, "grad_norm": 4.488224452276052, "learning_rate": 1.6303869369271264e-06, "loss": 3.3991, "step": 2053 }, { "epoch": 0.82, "grad_norm": 3.357249271462046, "learning_rate": 1.6233041706486253e-06, "loss": 3.5832, "step": 2054 }, { "epoch": 0.82, "grad_norm": 3.266031195079106, "learning_rate": 1.6162354631925203e-06, "loss": 3.3827, "step": 2055 }, { "epoch": 0.82, "grad_norm": 3.6682563446568577, "learning_rate": 1.609180826422404e-06, "loss": 3.412, "step": 2056 }, { "epoch": 0.82, "grad_norm": 4.033994422891126, "learning_rate": 1.602140272178253e-06, "loss": 3.5786, "step": 2057 }, { "epoch": 0.82, "grad_norm": 3.9632696082851995, "learning_rate": 1.5951138122764132e-06, "loss": 3.2678, "step": 2058 }, { "epoch": 0.82, "grad_norm": 3.8403714532035282, "learning_rate": 1.58810145850957e-06, "loss": 3.4687, "step": 2059 }, { "epoch": 0.82, "grad_norm": 4.193247452585058, "learning_rate": 1.5811032226467304e-06, "loss": 3.6549, "step": 2060 }, { "epoch": 0.82, "grad_norm": 3.2398053957952713, "learning_rate": 1.5741191164332192e-06, "loss": 3.2968, "step": 2061 }, { "epoch": 0.82, "grad_norm": 3.6586197558812015, "learning_rate": 1.5671491515906355e-06, "loss": 3.4885, "step": 2062 }, { "epoch": 0.83, "grad_norm": 3.61379309237948, "learning_rate": 1.5601933398168523e-06, "loss": 3.5474, "step": 2063 }, { "epoch": 0.83, "grad_norm": 3.558606413314661, "learning_rate": 1.5532516927859853e-06, "loss": 3.3917, "step": 2064 }, { "epoch": 0.83, "grad_norm": 4.051200426686327, "learning_rate": 1.5463242221483742e-06, "loss": 3.1919, "step": 2065 }, { "epoch": 0.83, "grad_norm": 3.502973571132373, "learning_rate": 1.5394109395305757e-06, "loss": 3.2573, "step": 2066 }, { "epoch": 0.83, "grad_norm": 3.683744743231853, "learning_rate": 1.5325118565353237e-06, "loss": 3.5408, "step": 2067 }, { "epoch": 0.83, "grad_norm": 3.5358865096945524, "learning_rate": 1.5256269847415283e-06, "loss": 3.3673, "step": 2068 }, { "epoch": 0.83, "grad_norm": 3.8122495212068617, "learning_rate": 1.5187563357042423e-06, "loss": 3.436, "step": 2069 }, { "epoch": 0.83, "grad_norm": 3.406497985667074, "learning_rate": 1.511899920954656e-06, "loss": 3.6248, "step": 2070 }, { "epoch": 0.83, "grad_norm": 3.690777983132295, "learning_rate": 1.5050577520000608e-06, "loss": 3.3869, "step": 2071 }, { "epoch": 0.83, "grad_norm": 3.171874530283066, "learning_rate": 1.498229840323847e-06, "loss": 3.4189, "step": 2072 }, { "epoch": 0.83, "grad_norm": 3.492803640863754, "learning_rate": 1.4914161973854714e-06, "loss": 3.2746, "step": 2073 }, { "epoch": 0.83, "grad_norm": 3.4706085858486273, "learning_rate": 1.4846168346204425e-06, "loss": 3.388, "step": 2074 }, { "epoch": 0.83, "grad_norm": 4.0469632561414475, "learning_rate": 1.4778317634403082e-06, "loss": 3.2526, "step": 2075 }, { "epoch": 0.83, "grad_norm": 3.3027608809487523, "learning_rate": 1.4710609952326239e-06, "loss": 3.4284, "step": 2076 }, { "epoch": 0.83, "grad_norm": 3.4405453383957854, "learning_rate": 1.464304541360946e-06, "loss": 3.4081, "step": 2077 }, { "epoch": 0.83, "grad_norm": 3.6129503858999756, "learning_rate": 1.457562413164799e-06, "loss": 3.6202, "step": 2078 }, { "epoch": 0.83, "grad_norm": 3.508733737407462, "learning_rate": 1.4508346219596725e-06, "loss": 3.3593, "step": 2079 }, { "epoch": 0.83, "grad_norm": 3.5745164993431753, "learning_rate": 1.4441211790369892e-06, "loss": 3.4305, "step": 2080 }, { "epoch": 0.83, "grad_norm": 4.000893397603999, "learning_rate": 1.4374220956640895e-06, "loss": 3.3896, "step": 2081 }, { "epoch": 0.83, "grad_norm": 3.482970629880628, "learning_rate": 1.4307373830842174e-06, "loss": 3.2945, "step": 2082 }, { "epoch": 0.83, "grad_norm": 3.9659815944283183, "learning_rate": 1.424067052516499e-06, "loss": 3.3155, "step": 2083 }, { "epoch": 0.83, "grad_norm": 3.3102034971632275, "learning_rate": 1.4174111151559188e-06, "loss": 3.3685, "step": 2084 }, { "epoch": 0.83, "grad_norm": 3.1837029622658832, "learning_rate": 1.4107695821733026e-06, "loss": 3.2757, "step": 2085 }, { "epoch": 0.83, "grad_norm": 3.267635574821952, "learning_rate": 1.4041424647153112e-06, "loss": 3.3863, "step": 2086 }, { "epoch": 0.83, "grad_norm": 3.4560576602866955, "learning_rate": 1.3975297739043992e-06, "loss": 3.435, "step": 2087 }, { "epoch": 0.84, "grad_norm": 3.989060712448598, "learning_rate": 1.3909315208388185e-06, "loss": 3.419, "step": 2088 }, { "epoch": 0.84, "grad_norm": 3.30029757789045, "learning_rate": 1.3843477165925846e-06, "loss": 3.5192, "step": 2089 }, { "epoch": 0.84, "grad_norm": 3.4550946449825046, "learning_rate": 1.3777783722154603e-06, "loss": 3.5344, "step": 2090 }, { "epoch": 0.84, "grad_norm": 4.44034970355981, "learning_rate": 1.3712234987329486e-06, "loss": 3.6119, "step": 2091 }, { "epoch": 0.84, "grad_norm": 3.3075509781316645, "learning_rate": 1.3646831071462606e-06, "loss": 3.4391, "step": 2092 }, { "epoch": 0.84, "grad_norm": 3.4657392511161556, "learning_rate": 1.3581572084323014e-06, "loss": 3.4166, "step": 2093 }, { "epoch": 0.84, "grad_norm": 3.1882512927254556, "learning_rate": 1.3516458135436539e-06, "loss": 3.3661, "step": 2094 }, { "epoch": 0.84, "grad_norm": 4.174656242832297, "learning_rate": 1.3451489334085555e-06, "loss": 3.2017, "step": 2095 }, { "epoch": 0.84, "grad_norm": 3.757745275768527, "learning_rate": 1.3386665789308885e-06, "loss": 3.4389, "step": 2096 }, { "epoch": 0.84, "grad_norm": 3.4372378669298747, "learning_rate": 1.3321987609901553e-06, "loss": 3.4804, "step": 2097 }, { "epoch": 0.84, "grad_norm": 3.961745026644603, "learning_rate": 1.325745490441458e-06, "loss": 3.3249, "step": 2098 }, { "epoch": 0.84, "grad_norm": 3.9095905978056797, "learning_rate": 1.3193067781154835e-06, "loss": 3.3281, "step": 2099 }, { "epoch": 0.84, "grad_norm": 3.546582952524952, "learning_rate": 1.3128826348184886e-06, "loss": 3.1577, "step": 2100 }, { "epoch": 0.84, "grad_norm": 3.161408052005003, "learning_rate": 1.3064730713322793e-06, "loss": 3.3952, "step": 2101 }, { "epoch": 0.84, "grad_norm": 3.4427792034132265, "learning_rate": 1.3000780984141881e-06, "loss": 3.4078, "step": 2102 }, { "epoch": 0.84, "grad_norm": 3.8874612168561593, "learning_rate": 1.2936977267970597e-06, "loss": 3.4509, "step": 2103 }, { "epoch": 0.84, "grad_norm": 4.018729625106304, "learning_rate": 1.2873319671892337e-06, "loss": 3.2307, "step": 2104 }, { "epoch": 0.84, "grad_norm": 3.1584635393788365, "learning_rate": 1.2809808302745298e-06, "loss": 3.4466, "step": 2105 }, { "epoch": 0.84, "grad_norm": 3.5165182729856266, "learning_rate": 1.2746443267122233e-06, "loss": 3.4201, "step": 2106 }, { "epoch": 0.84, "grad_norm": 3.6148373495878054, "learning_rate": 1.2683224671370286e-06, "loss": 3.3471, "step": 2107 }, { "epoch": 0.84, "grad_norm": 3.520082081357915, "learning_rate": 1.262015262159082e-06, "loss": 3.2257, "step": 2108 }, { "epoch": 0.84, "grad_norm": 3.5479686710776432, "learning_rate": 1.255722722363929e-06, "loss": 3.2691, "step": 2109 }, { "epoch": 0.84, "grad_norm": 2.956232806174273, "learning_rate": 1.249444858312502e-06, "loss": 3.4043, "step": 2110 }, { "epoch": 0.84, "grad_norm": 2.855373187723047, "learning_rate": 1.2431816805410968e-06, "loss": 3.4917, "step": 2111 }, { "epoch": 0.84, "grad_norm": 3.1965735034591716, "learning_rate": 1.2369331995613664e-06, "loss": 3.3842, "step": 2112 }, { "epoch": 0.85, "grad_norm": 3.028949144756169, "learning_rate": 1.2306994258602922e-06, "loss": 3.3098, "step": 2113 }, { "epoch": 0.85, "grad_norm": 4.362284334661307, "learning_rate": 1.2244803699001785e-06, "loss": 3.2978, "step": 2114 }, { "epoch": 0.85, "grad_norm": 3.779755730336405, "learning_rate": 1.218276042118629e-06, "loss": 3.3034, "step": 2115 }, { "epoch": 0.85, "grad_norm": 3.5439051586156336, "learning_rate": 1.2120864529285203e-06, "loss": 3.3475, "step": 2116 }, { "epoch": 0.85, "grad_norm": 3.3052105946171926, "learning_rate": 1.2059116127179993e-06, "loss": 3.3798, "step": 2117 }, { "epoch": 0.85, "grad_norm": 3.218353846479608, "learning_rate": 1.199751531850457e-06, "loss": 3.2223, "step": 2118 }, { "epoch": 0.85, "grad_norm": 3.300669411980882, "learning_rate": 1.1936062206645183e-06, "loss": 3.4321, "step": 2119 }, { "epoch": 0.85, "grad_norm": 3.5403763231563574, "learning_rate": 1.1874756894740137e-06, "loss": 3.2812, "step": 2120 }, { "epoch": 0.85, "grad_norm": 3.2628393837539784, "learning_rate": 1.1813599485679684e-06, "loss": 3.5328, "step": 2121 }, { "epoch": 0.85, "grad_norm": 3.507796005006337, "learning_rate": 1.1752590082105863e-06, "loss": 3.2593, "step": 2122 }, { "epoch": 0.85, "grad_norm": 3.77010141147154, "learning_rate": 1.1691728786412315e-06, "loss": 3.4443, "step": 2123 }, { "epoch": 0.85, "grad_norm": 3.2616859923362456, "learning_rate": 1.1631015700744153e-06, "loss": 3.5765, "step": 2124 }, { "epoch": 0.85, "grad_norm": 3.345595498088119, "learning_rate": 1.1570450926997657e-06, "loss": 3.4364, "step": 2125 }, { "epoch": 0.85, "grad_norm": 3.502868345524852, "learning_rate": 1.1510034566820205e-06, "loss": 3.2767, "step": 2126 }, { "epoch": 0.85, "grad_norm": 3.7828869424110594, "learning_rate": 1.144976672161019e-06, "loss": 3.379, "step": 2127 }, { "epoch": 0.85, "grad_norm": 3.4442099961092265, "learning_rate": 1.1389647492516598e-06, "loss": 3.4932, "step": 2128 }, { "epoch": 0.85, "grad_norm": 3.5130471167919657, "learning_rate": 1.132967698043913e-06, "loss": 3.1723, "step": 2129 }, { "epoch": 0.85, "grad_norm": 3.7375709878896384, "learning_rate": 1.1269855286027798e-06, "loss": 3.3686, "step": 2130 }, { "epoch": 0.85, "grad_norm": 3.6802875325445674, "learning_rate": 1.1210182509682854e-06, "loss": 3.2742, "step": 2131 }, { "epoch": 0.85, "grad_norm": 3.3259916423660383, "learning_rate": 1.1150658751554667e-06, "loss": 3.4796, "step": 2132 }, { "epoch": 0.85, "grad_norm": 3.234854692123814, "learning_rate": 1.1091284111543499e-06, "loss": 3.3168, "step": 2133 }, { "epoch": 0.85, "grad_norm": 3.632315148384429, "learning_rate": 1.1032058689299297e-06, "loss": 3.4312, "step": 2134 }, { "epoch": 0.85, "grad_norm": 3.1620238621266283, "learning_rate": 1.0972982584221592e-06, "loss": 3.5919, "step": 2135 }, { "epoch": 0.85, "grad_norm": 3.3617634363233004, "learning_rate": 1.0914055895459353e-06, "loss": 3.383, "step": 2136 }, { "epoch": 0.85, "grad_norm": 3.922553259067895, "learning_rate": 1.08552787219107e-06, "loss": 3.2105, "step": 2137 }, { "epoch": 0.86, "grad_norm": 3.37654662519368, "learning_rate": 1.0796651162222916e-06, "loss": 3.4856, "step": 2138 }, { "epoch": 0.86, "grad_norm": 3.7966863859829796, "learning_rate": 1.07381733147921e-06, "loss": 3.501, "step": 2139 }, { "epoch": 0.86, "grad_norm": 3.840564360081184, "learning_rate": 1.067984527776309e-06, "loss": 3.4146, "step": 2140 }, { "epoch": 0.86, "grad_norm": 3.4357237633421547, "learning_rate": 1.062166714902938e-06, "loss": 3.5836, "step": 2141 }, { "epoch": 0.86, "grad_norm": 3.2728261173976994, "learning_rate": 1.0563639026232742e-06, "loss": 3.214, "step": 2142 }, { "epoch": 0.86, "grad_norm": 3.915665784391846, "learning_rate": 1.0505761006763315e-06, "loss": 3.342, "step": 2143 }, { "epoch": 0.86, "grad_norm": 3.2202663116738095, "learning_rate": 1.044803318775922e-06, "loss": 3.3531, "step": 2144 }, { "epoch": 0.86, "grad_norm": 3.65403499358109, "learning_rate": 1.0390455666106547e-06, "loss": 3.4521, "step": 2145 }, { "epoch": 0.86, "grad_norm": 3.662658535408528, "learning_rate": 1.0333028538439093e-06, "loss": 3.3767, "step": 2146 }, { "epoch": 0.86, "grad_norm": 3.3917540438518996, "learning_rate": 1.027575190113832e-06, "loss": 3.5931, "step": 2147 }, { "epoch": 0.86, "grad_norm": 3.397753021489659, "learning_rate": 1.021862585033304e-06, "loss": 3.3456, "step": 2148 }, { "epoch": 0.86, "grad_norm": 3.2907386279763573, "learning_rate": 1.0161650481899344e-06, "loss": 3.4717, "step": 2149 }, { "epoch": 0.86, "grad_norm": 3.316699462123251, "learning_rate": 1.010482589146048e-06, "loss": 3.4836, "step": 2150 }, { "epoch": 0.86, "grad_norm": 3.5992791259273185, "learning_rate": 1.0048152174386584e-06, "loss": 3.5198, "step": 2151 }, { "epoch": 0.86, "grad_norm": 3.897952562761025, "learning_rate": 9.991629425794624e-07, "loss": 3.5266, "step": 2152 }, { "epoch": 0.86, "grad_norm": 3.645336662391037, "learning_rate": 9.935257740548143e-07, "loss": 3.6137, "step": 2153 }, { "epoch": 0.86, "grad_norm": 4.079152642547377, "learning_rate": 9.879037213257214e-07, "loss": 3.3159, "step": 2154 }, { "epoch": 0.86, "grad_norm": 3.9483279351701177, "learning_rate": 9.822967938278172e-07, "loss": 3.2368, "step": 2155 }, { "epoch": 0.86, "grad_norm": 3.4047908302860526, "learning_rate": 9.767050009713476e-07, "loss": 3.1575, "step": 2156 }, { "epoch": 0.86, "grad_norm": 3.1730170838584604, "learning_rate": 9.711283521411674e-07, "loss": 3.5803, "step": 2157 }, { "epoch": 0.86, "grad_norm": 3.5454281828947, "learning_rate": 9.655668566967026e-07, "loss": 3.4175, "step": 2158 }, { "epoch": 0.86, "grad_norm": 3.6667236181520173, "learning_rate": 9.600205239719584e-07, "loss": 3.4001, "step": 2159 }, { "epoch": 0.86, "grad_norm": 3.3294844387626874, "learning_rate": 9.544893632754816e-07, "loss": 3.3551, "step": 2160 }, { "epoch": 0.86, "grad_norm": 3.654141927545411, "learning_rate": 9.489733838903648e-07, "loss": 3.6266, "step": 2161 }, { "epoch": 0.86, "grad_norm": 3.8045320643118186, "learning_rate": 9.434725950742119e-07, "loss": 3.4512, "step": 2162 }, { "epoch": 0.87, "grad_norm": 3.6024403263194764, "learning_rate": 9.379870060591434e-07, "loss": 3.3326, "step": 2163 }, { "epoch": 0.87, "grad_norm": 3.386820474420187, "learning_rate": 9.325166260517593e-07, "loss": 3.428, "step": 2164 }, { "epoch": 0.87, "grad_norm": 3.651991887916642, "learning_rate": 9.270614642331377e-07, "loss": 3.6325, "step": 2165 }, { "epoch": 0.87, "grad_norm": 3.3168626955308005, "learning_rate": 9.216215297588182e-07, "loss": 3.3719, "step": 2166 }, { "epoch": 0.87, "grad_norm": 3.808956179858258, "learning_rate": 9.161968317587788e-07, "loss": 3.1597, "step": 2167 }, { "epoch": 0.87, "grad_norm": 3.5534169108074485, "learning_rate": 9.107873793374322e-07, "loss": 3.4365, "step": 2168 }, { "epoch": 0.87, "grad_norm": 3.713641504637213, "learning_rate": 9.053931815735995e-07, "loss": 3.3859, "step": 2169 }, { "epoch": 0.87, "grad_norm": 3.291244265002922, "learning_rate": 9.000142475204965e-07, "loss": 3.2342, "step": 2170 }, { "epoch": 0.87, "grad_norm": 3.6304683284911143, "learning_rate": 8.946505862057286e-07, "loss": 3.3497, "step": 2171 }, { "epoch": 0.87, "grad_norm": 3.036041707321972, "learning_rate": 8.893022066312674e-07, "loss": 3.297, "step": 2172 }, { "epoch": 0.87, "grad_norm": 3.5995359541359333, "learning_rate": 8.839691177734322e-07, "loss": 3.3432, "step": 2173 }, { "epoch": 0.87, "grad_norm": 3.5355815132434922, "learning_rate": 8.786513285828835e-07, "loss": 3.4101, "step": 2174 }, { "epoch": 0.87, "grad_norm": 3.2445046978749734, "learning_rate": 8.733488479845997e-07, "loss": 3.3971, "step": 2175 }, { "epoch": 0.87, "grad_norm": 3.3838590014717567, "learning_rate": 8.680616848778711e-07, "loss": 3.3631, "step": 2176 }, { "epoch": 0.87, "grad_norm": 3.808898429237898, "learning_rate": 8.627898481362817e-07, "loss": 3.5121, "step": 2177 }, { "epoch": 0.87, "grad_norm": 4.148682068047069, "learning_rate": 8.575333466076863e-07, "loss": 3.5631, "step": 2178 }, { "epoch": 0.87, "grad_norm": 3.349487356821742, "learning_rate": 8.522921891142034e-07, "loss": 3.3379, "step": 2179 }, { "epoch": 0.87, "grad_norm": 3.335460985598822, "learning_rate": 8.470663844522053e-07, "loss": 3.2859, "step": 2180 }, { "epoch": 0.87, "grad_norm": 3.5146350091888077, "learning_rate": 8.418559413922933e-07, "loss": 3.4165, "step": 2181 }, { "epoch": 0.87, "grad_norm": 3.0467244644452345, "learning_rate": 8.366608686792854e-07, "loss": 3.491, "step": 2182 }, { "epoch": 0.87, "grad_norm": 3.37795744884803, "learning_rate": 8.31481175032206e-07, "loss": 3.3244, "step": 2183 }, { "epoch": 0.87, "grad_norm": 3.8806468735539363, "learning_rate": 8.263168691442624e-07, "loss": 3.2478, "step": 2184 }, { "epoch": 0.87, "grad_norm": 3.399507866382731, "learning_rate": 8.211679596828481e-07, "loss": 3.493, "step": 2185 }, { "epoch": 0.87, "grad_norm": 3.6180760047276843, "learning_rate": 8.160344552895061e-07, "loss": 3.4451, "step": 2186 }, { "epoch": 0.87, "grad_norm": 3.975696190654819, "learning_rate": 8.109163645799267e-07, "loss": 3.5058, "step": 2187 }, { "epoch": 0.88, "grad_norm": 3.290590938027743, "learning_rate": 8.058136961439333e-07, "loss": 3.4879, "step": 2188 }, { "epoch": 0.88, "grad_norm": 2.9777779041564334, "learning_rate": 8.007264585454632e-07, "loss": 3.3098, "step": 2189 }, { "epoch": 0.88, "grad_norm": 3.7754034334706503, "learning_rate": 7.956546603225601e-07, "loss": 3.3984, "step": 2190 }, { "epoch": 0.88, "grad_norm": 4.098136120772462, "learning_rate": 7.905983099873504e-07, "loss": 3.3151, "step": 2191 }, { "epoch": 0.88, "grad_norm": 3.840743601541193, "learning_rate": 7.855574160260371e-07, "loss": 3.492, "step": 2192 }, { "epoch": 0.88, "grad_norm": 3.6265418650700787, "learning_rate": 7.805319868988759e-07, "loss": 3.4926, "step": 2193 }, { "epoch": 0.88, "grad_norm": 3.85235771264861, "learning_rate": 7.755220310401812e-07, "loss": 3.4783, "step": 2194 }, { "epoch": 0.88, "grad_norm": 3.5818307264323272, "learning_rate": 7.705275568582848e-07, "loss": 3.2884, "step": 2195 }, { "epoch": 0.88, "grad_norm": 3.270453873784988, "learning_rate": 7.655485727355416e-07, "loss": 3.2947, "step": 2196 }, { "epoch": 0.88, "grad_norm": 3.6452666439703183, "learning_rate": 7.60585087028305e-07, "loss": 3.1528, "step": 2197 }, { "epoch": 0.88, "grad_norm": 2.9837151835317623, "learning_rate": 7.556371080669222e-07, "loss": 3.3995, "step": 2198 }, { "epoch": 0.88, "grad_norm": 3.505327878248056, "learning_rate": 7.507046441557142e-07, "loss": 3.3998, "step": 2199 }, { "epoch": 0.88, "grad_norm": 3.1591895177001366, "learning_rate": 7.457877035729588e-07, "loss": 3.317, "step": 2200 }, { "epoch": 0.88, "grad_norm": 3.5187197845554508, "learning_rate": 7.408862945708839e-07, "loss": 3.0716, "step": 2201 }, { "epoch": 0.88, "grad_norm": 3.4083125288162908, "learning_rate": 7.360004253756459e-07, "loss": 3.3461, "step": 2202 }, { "epoch": 0.88, "grad_norm": 3.0850483605956924, "learning_rate": 7.311301041873276e-07, "loss": 3.432, "step": 2203 }, { "epoch": 0.88, "grad_norm": 3.8955991470826112, "learning_rate": 7.262753391799127e-07, "loss": 3.3223, "step": 2204 }, { "epoch": 0.88, "grad_norm": 3.171206654714713, "learning_rate": 7.21436138501278e-07, "loss": 3.5402, "step": 2205 }, { "epoch": 0.88, "grad_norm": 3.3432430917547724, "learning_rate": 7.166125102731735e-07, "loss": 3.3176, "step": 2206 }, { "epoch": 0.88, "grad_norm": 3.694322018316045, "learning_rate": 7.118044625912213e-07, "loss": 3.5505, "step": 2207 }, { "epoch": 0.88, "grad_norm": 3.359500371733772, "learning_rate": 7.070120035248906e-07, "loss": 3.4174, "step": 2208 }, { "epoch": 0.88, "grad_norm": 4.010062042226531, "learning_rate": 7.022351411174866e-07, "loss": 3.0835, "step": 2209 }, { "epoch": 0.88, "grad_norm": 3.5776919424054485, "learning_rate": 6.974738833861383e-07, "loss": 3.5002, "step": 2210 }, { "epoch": 0.88, "grad_norm": 3.0916541703552607, "learning_rate": 6.927282383217893e-07, "loss": 3.3724, "step": 2211 }, { "epoch": 0.88, "grad_norm": 3.483857548891637, "learning_rate": 6.879982138891717e-07, "loss": 3.2362, "step": 2212 }, { "epoch": 0.89, "grad_norm": 3.783788104808641, "learning_rate": 6.83283818026812e-07, "loss": 3.4115, "step": 2213 }, { "epoch": 0.89, "grad_norm": 3.924748913054718, "learning_rate": 6.785850586469989e-07, "loss": 3.3712, "step": 2214 }, { "epoch": 0.89, "grad_norm": 3.2175573933479846, "learning_rate": 6.739019436357774e-07, "loss": 3.4548, "step": 2215 }, { "epoch": 0.89, "grad_norm": 3.843047627078977, "learning_rate": 6.692344808529427e-07, "loss": 3.4471, "step": 2216 }, { "epoch": 0.89, "grad_norm": 3.746682602581423, "learning_rate": 6.645826781320141e-07, "loss": 3.2257, "step": 2217 }, { "epoch": 0.89, "grad_norm": 3.632430383751459, "learning_rate": 6.599465432802332e-07, "loss": 3.5574, "step": 2218 }, { "epoch": 0.89, "grad_norm": 3.591676174959998, "learning_rate": 6.553260840785414e-07, "loss": 3.2546, "step": 2219 }, { "epoch": 0.89, "grad_norm": 3.739713355457886, "learning_rate": 6.507213082815745e-07, "loss": 3.3196, "step": 2220 }, { "epoch": 0.89, "grad_norm": 3.5008219313024265, "learning_rate": 6.461322236176438e-07, "loss": 3.465, "step": 2221 }, { "epoch": 0.89, "grad_norm": 3.310600462482619, "learning_rate": 6.415588377887305e-07, "loss": 3.433, "step": 2222 }, { "epoch": 0.89, "grad_norm": 3.4841994321670917, "learning_rate": 6.370011584704617e-07, "loss": 3.3722, "step": 2223 }, { "epoch": 0.89, "grad_norm": 3.6842707477566856, "learning_rate": 6.324591933121072e-07, "loss": 3.3919, "step": 2224 }, { "epoch": 0.89, "grad_norm": 3.8566346491530994, "learning_rate": 6.279329499365649e-07, "loss": 3.6028, "step": 2225 }, { "epoch": 0.89, "grad_norm": 3.3556102440025666, "learning_rate": 6.234224359403407e-07, "loss": 3.2679, "step": 2226 }, { "epoch": 0.89, "grad_norm": 3.248424488867721, "learning_rate": 6.1892765889355e-07, "loss": 3.3181, "step": 2227 }, { "epoch": 0.89, "grad_norm": 3.433953805154286, "learning_rate": 6.144486263398886e-07, "loss": 3.3922, "step": 2228 }, { "epoch": 0.89, "grad_norm": 3.606794192325152, "learning_rate": 6.099853457966342e-07, "loss": 3.4089, "step": 2229 }, { "epoch": 0.89, "grad_norm": 3.2863483621832756, "learning_rate": 6.055378247546217e-07, "loss": 3.3874, "step": 2230 }, { "epoch": 0.89, "grad_norm": 3.5462763362220646, "learning_rate": 6.01106070678239e-07, "loss": 3.2217, "step": 2231 }, { "epoch": 0.89, "grad_norm": 3.18770247047478, "learning_rate": 5.966900910054141e-07, "loss": 3.414, "step": 2232 }, { "epoch": 0.89, "grad_norm": 3.3584454420282794, "learning_rate": 5.922898931475973e-07, "loss": 3.548, "step": 2233 }, { "epoch": 0.89, "grad_norm": 3.2032427721789545, "learning_rate": 5.879054844897536e-07, "loss": 3.4794, "step": 2234 }, { "epoch": 0.89, "grad_norm": 3.172170956432926, "learning_rate": 5.835368723903456e-07, "loss": 3.478, "step": 2235 }, { "epoch": 0.89, "grad_norm": 3.8577441327169724, "learning_rate": 5.791840641813295e-07, "loss": 3.4523, "step": 2236 }, { "epoch": 0.89, "grad_norm": 3.5517894641461263, "learning_rate": 5.748470671681328e-07, "loss": 3.367, "step": 2237 }, { "epoch": 0.9, "grad_norm": 3.3612955774508038, "learning_rate": 5.705258886296494e-07, "loss": 3.2858, "step": 2238 }, { "epoch": 0.9, "grad_norm": 3.620965017753635, "learning_rate": 5.662205358182226e-07, "loss": 3.5161, "step": 2239 }, { "epoch": 0.9, "grad_norm": 3.306799867032343, "learning_rate": 5.619310159596358e-07, "loss": 3.3049, "step": 2240 }, { "epoch": 0.9, "grad_norm": 3.892165227199948, "learning_rate": 5.576573362531001e-07, "loss": 3.2364, "step": 2241 }, { "epoch": 0.9, "grad_norm": 3.418545935942755, "learning_rate": 5.533995038712403e-07, "loss": 3.5193, "step": 2242 }, { "epoch": 0.9, "grad_norm": 3.605324441619377, "learning_rate": 5.491575259600879e-07, "loss": 3.2923, "step": 2243 }, { "epoch": 0.9, "grad_norm": 3.2932202213739408, "learning_rate": 5.449314096390601e-07, "loss": 3.3651, "step": 2244 }, { "epoch": 0.9, "grad_norm": 3.7819035993265215, "learning_rate": 5.407211620009545e-07, "loss": 3.1878, "step": 2245 }, { "epoch": 0.9, "grad_norm": 3.342693554550334, "learning_rate": 5.365267901119398e-07, "loss": 3.497, "step": 2246 }, { "epoch": 0.9, "grad_norm": 3.0434406319244784, "learning_rate": 5.323483010115382e-07, "loss": 3.4321, "step": 2247 }, { "epoch": 0.9, "grad_norm": 3.9546557643627587, "learning_rate": 5.281857017126124e-07, "loss": 3.4684, "step": 2248 }, { "epoch": 0.9, "grad_norm": 3.202307405158531, "learning_rate": 5.240389992013606e-07, "loss": 3.4182, "step": 2249 }, { "epoch": 0.9, "grad_norm": 3.6682108167874126, "learning_rate": 5.199082004372958e-07, "loss": 3.5151, "step": 2250 }, { "epoch": 0.9, "grad_norm": 3.517041176431839, "learning_rate": 5.157933123532466e-07, "loss": 3.5793, "step": 2251 }, { "epoch": 0.9, "grad_norm": 4.060991767391451, "learning_rate": 5.116943418553355e-07, "loss": 3.1333, "step": 2252 }, { "epoch": 0.9, "grad_norm": 3.4617925682458828, "learning_rate": 5.076112958229673e-07, "loss": 3.4533, "step": 2253 }, { "epoch": 0.9, "grad_norm": 3.8325002574788374, "learning_rate": 5.035441811088204e-07, "loss": 3.3571, "step": 2254 }, { "epoch": 0.9, "grad_norm": 3.87591566298437, "learning_rate": 4.994930045388414e-07, "loss": 3.2436, "step": 2255 }, { "epoch": 0.9, "grad_norm": 3.2740730847041695, "learning_rate": 4.954577729122212e-07, "loss": 3.363, "step": 2256 }, { "epoch": 0.9, "grad_norm": 3.9086146571113733, "learning_rate": 4.914384930013927e-07, "loss": 3.3876, "step": 2257 }, { "epoch": 0.9, "grad_norm": 2.9553725583407164, "learning_rate": 4.874351715520154e-07, "loss": 3.4666, "step": 2258 }, { "epoch": 0.9, "grad_norm": 3.409566328810979, "learning_rate": 4.834478152829658e-07, "loss": 3.3281, "step": 2259 }, { "epoch": 0.9, "grad_norm": 3.6652751960125713, "learning_rate": 4.794764308863242e-07, "loss": 3.3374, "step": 2260 }, { "epoch": 0.9, "grad_norm": 3.7376935408690137, "learning_rate": 4.755210250273701e-07, "loss": 3.3913, "step": 2261 }, { "epoch": 0.9, "grad_norm": 3.5403464753418685, "learning_rate": 4.715816043445609e-07, "loss": 3.3002, "step": 2262 }, { "epoch": 0.91, "grad_norm": 3.488337281386899, "learning_rate": 4.676581754495235e-07, "loss": 3.4845, "step": 2263 }, { "epoch": 0.91, "grad_norm": 3.2385604249717814, "learning_rate": 4.6375074492705173e-07, "loss": 3.4416, "step": 2264 }, { "epoch": 0.91, "grad_norm": 3.8998958534577404, "learning_rate": 4.5985931933508757e-07, "loss": 3.2112, "step": 2265 }, { "epoch": 0.91, "grad_norm": 3.4381788905534654, "learning_rate": 4.559839052047066e-07, "loss": 3.24, "step": 2266 }, { "epoch": 0.91, "grad_norm": 2.9753181370969592, "learning_rate": 4.521245090401172e-07, "loss": 3.372, "step": 2267 }, { "epoch": 0.91, "grad_norm": 3.220562802271603, "learning_rate": 4.482811373186402e-07, "loss": 3.4169, "step": 2268 }, { "epoch": 0.91, "grad_norm": 3.4575228301184597, "learning_rate": 4.4445379649070587e-07, "loss": 3.3371, "step": 2269 }, { "epoch": 0.91, "grad_norm": 3.4887598974576735, "learning_rate": 4.406424929798403e-07, "loss": 3.2589, "step": 2270 }, { "epoch": 0.91, "grad_norm": 3.285211766969659, "learning_rate": 4.368472331826479e-07, "loss": 3.2891, "step": 2271 }, { "epoch": 0.91, "grad_norm": 3.2239734105128406, "learning_rate": 4.3306802346881116e-07, "loss": 3.3581, "step": 2272 }, { "epoch": 0.91, "grad_norm": 3.484669516554814, "learning_rate": 4.2930487018107425e-07, "loss": 3.4378, "step": 2273 }, { "epoch": 0.91, "grad_norm": 3.487229626033901, "learning_rate": 4.2555777963523506e-07, "loss": 3.4464, "step": 2274 }, { "epoch": 0.91, "grad_norm": 3.1419345073346125, "learning_rate": 4.218267581201296e-07, "loss": 3.4621, "step": 2275 }, { "epoch": 0.91, "grad_norm": 3.219933497970078, "learning_rate": 4.1811181189762684e-07, "loss": 3.2655, "step": 2276 }, { "epoch": 0.91, "grad_norm": 3.594623370400675, "learning_rate": 4.1441294720261373e-07, "loss": 3.3844, "step": 2277 }, { "epoch": 0.91, "grad_norm": 3.7548340359274093, "learning_rate": 4.107301702429922e-07, "loss": 3.3651, "step": 2278 }, { "epoch": 0.91, "grad_norm": 3.3666014345227353, "learning_rate": 4.070634871996615e-07, "loss": 3.4483, "step": 2279 }, { "epoch": 0.91, "grad_norm": 3.2939439702731037, "learning_rate": 4.034129042265067e-07, "loss": 3.1616, "step": 2280 }, { "epoch": 0.91, "grad_norm": 3.7606232331352465, "learning_rate": 3.9977842745039464e-07, "loss": 3.2778, "step": 2281 }, { "epoch": 0.91, "grad_norm": 3.4635150898245977, "learning_rate": 3.961600629711615e-07, "loss": 3.2342, "step": 2282 }, { "epoch": 0.91, "grad_norm": 3.5295595342864443, "learning_rate": 3.925578168616007e-07, "loss": 3.319, "step": 2283 }, { "epoch": 0.91, "grad_norm": 3.792507271237906, "learning_rate": 3.889716951674549e-07, "loss": 3.3114, "step": 2284 }, { "epoch": 0.91, "grad_norm": 4.228923211595772, "learning_rate": 3.8540170390740097e-07, "loss": 3.5547, "step": 2285 }, { "epoch": 0.91, "grad_norm": 3.442052688731442, "learning_rate": 3.8184784907304704e-07, "loss": 3.457, "step": 2286 }, { "epoch": 0.91, "grad_norm": 3.3026272696031045, "learning_rate": 3.783101366289199e-07, "loss": 3.3291, "step": 2287 }, { "epoch": 0.92, "grad_norm": 3.436857110291015, "learning_rate": 3.747885725124523e-07, "loss": 3.2545, "step": 2288 }, { "epoch": 0.92, "grad_norm": 3.569007423588782, "learning_rate": 3.712831626339752e-07, "loss": 3.5329, "step": 2289 }, { "epoch": 0.92, "grad_norm": 3.413741939457016, "learning_rate": 3.67793912876705e-07, "loss": 3.4219, "step": 2290 }, { "epoch": 0.92, "grad_norm": 3.9608789183114315, "learning_rate": 3.643208290967415e-07, "loss": 3.4362, "step": 2291 }, { "epoch": 0.92, "grad_norm": 4.037967550302035, "learning_rate": 3.608639171230488e-07, "loss": 3.4947, "step": 2292 }, { "epoch": 0.92, "grad_norm": 2.963217206035839, "learning_rate": 3.5742318275745147e-07, "loss": 3.4222, "step": 2293 }, { "epoch": 0.92, "grad_norm": 4.404616702862916, "learning_rate": 3.5399863177462024e-07, "loss": 3.5048, "step": 2294 }, { "epoch": 0.92, "grad_norm": 3.5130147839058874, "learning_rate": 3.5059026992206645e-07, "loss": 3.5803, "step": 2295 }, { "epoch": 0.92, "grad_norm": 3.145610428854735, "learning_rate": 3.4719810292013214e-07, "loss": 3.3987, "step": 2296 }, { "epoch": 0.92, "grad_norm": 3.1675486907915675, "learning_rate": 3.438221364619776e-07, "loss": 3.3274, "step": 2297 }, { "epoch": 0.92, "grad_norm": 3.5141499328252555, "learning_rate": 3.404623762135728e-07, "loss": 3.2946, "step": 2298 }, { "epoch": 0.92, "grad_norm": 3.5931881815304108, "learning_rate": 3.371188278136883e-07, "loss": 3.4986, "step": 2299 }, { "epoch": 0.92, "grad_norm": 3.237549412583975, "learning_rate": 3.3379149687388866e-07, "loss": 3.4054, "step": 2300 }, { "epoch": 0.92, "grad_norm": 3.4046964323023268, "learning_rate": 3.3048038897851576e-07, "loss": 3.3034, "step": 2301 }, { "epoch": 0.92, "grad_norm": 3.288139297934954, "learning_rate": 3.271855096846899e-07, "loss": 3.3689, "step": 2302 }, { "epoch": 0.92, "grad_norm": 3.512789471000361, "learning_rate": 3.2390686452228983e-07, "loss": 3.4543, "step": 2303 }, { "epoch": 0.92, "grad_norm": 3.843382156078518, "learning_rate": 3.2064445899394723e-07, "loss": 3.1137, "step": 2304 }, { "epoch": 0.92, "grad_norm": 4.054753404859892, "learning_rate": 3.1739829857504235e-07, "loss": 3.2675, "step": 2305 }, { "epoch": 0.92, "grad_norm": 3.567866951457264, "learning_rate": 3.1416838871368925e-07, "loss": 3.3668, "step": 2306 }, { "epoch": 0.92, "grad_norm": 3.588050596796032, "learning_rate": 3.1095473483072733e-07, "loss": 3.5633, "step": 2307 }, { "epoch": 0.92, "grad_norm": 3.3237287020633057, "learning_rate": 3.0775734231971443e-07, "loss": 3.498, "step": 2308 }, { "epoch": 0.92, "grad_norm": 3.5006180356875145, "learning_rate": 3.045762165469168e-07, "loss": 3.351, "step": 2309 }, { "epoch": 0.92, "grad_norm": 3.783894516690254, "learning_rate": 3.0141136285129825e-07, "loss": 3.3718, "step": 2310 }, { "epoch": 0.92, "grad_norm": 3.8365546836380733, "learning_rate": 2.982627865445109e-07, "loss": 3.3811, "step": 2311 }, { "epoch": 0.92, "grad_norm": 3.6036011049940466, "learning_rate": 2.951304929108956e-07, "loss": 3.3192, "step": 2312 }, { "epoch": 0.93, "grad_norm": 4.117468304701286, "learning_rate": 2.9201448720745706e-07, "loss": 3.3399, "step": 2313 }, { "epoch": 0.93, "grad_norm": 3.360859819170253, "learning_rate": 2.8891477466386987e-07, "loss": 3.6341, "step": 2314 }, { "epoch": 0.93, "grad_norm": 3.7695749961891507, "learning_rate": 2.8583136048245697e-07, "loss": 3.2734, "step": 2315 }, { "epoch": 0.93, "grad_norm": 3.6133036111449046, "learning_rate": 2.827642498381955e-07, "loss": 3.2883, "step": 2316 }, { "epoch": 0.93, "grad_norm": 3.4475207107442603, "learning_rate": 2.7971344787869114e-07, "loss": 3.231, "step": 2317 }, { "epoch": 0.93, "grad_norm": 3.569809544449258, "learning_rate": 2.76678959724187e-07, "loss": 3.1757, "step": 2318 }, { "epoch": 0.93, "grad_norm": 3.310205615755986, "learning_rate": 2.7366079046753925e-07, "loss": 3.4045, "step": 2319 }, { "epoch": 0.93, "grad_norm": 3.043808757165406, "learning_rate": 2.706589451742181e-07, "loss": 3.341, "step": 2320 }, { "epoch": 0.93, "grad_norm": 3.4378814841858576, "learning_rate": 2.6767342888229907e-07, "loss": 3.4172, "step": 2321 }, { "epoch": 0.93, "grad_norm": 3.3665108545388387, "learning_rate": 2.647042466024485e-07, "loss": 3.4747, "step": 2322 }, { "epoch": 0.93, "grad_norm": 3.0931178224328666, "learning_rate": 2.617514033179236e-07, "loss": 3.3271, "step": 2323 }, { "epoch": 0.93, "grad_norm": 3.4596327927003623, "learning_rate": 2.588149039845533e-07, "loss": 3.3217, "step": 2324 }, { "epoch": 0.93, "grad_norm": 3.611293101153788, "learning_rate": 2.5589475353073987e-07, "loss": 3.5053, "step": 2325 }, { "epoch": 0.93, "grad_norm": 3.4077089320299736, "learning_rate": 2.5299095685744734e-07, "loss": 3.5835, "step": 2326 }, { "epoch": 0.93, "grad_norm": 3.362519772264958, "learning_rate": 2.5010351883819283e-07, "loss": 3.441, "step": 2327 }, { "epoch": 0.93, "grad_norm": 3.6639173507916767, "learning_rate": 2.472324443190355e-07, "loss": 3.5446, "step": 2328 }, { "epoch": 0.93, "grad_norm": 3.5669895297852423, "learning_rate": 2.4437773811857304e-07, "loss": 3.3679, "step": 2329 }, { "epoch": 0.93, "grad_norm": 3.9325205560382717, "learning_rate": 2.4153940502793185e-07, "loss": 3.3625, "step": 2330 }, { "epoch": 0.93, "grad_norm": 3.4262239886835975, "learning_rate": 2.387174498107614e-07, "loss": 3.1708, "step": 2331 }, { "epoch": 0.93, "grad_norm": 3.382901267977727, "learning_rate": 2.359118772032176e-07, "loss": 3.6955, "step": 2332 }, { "epoch": 0.93, "grad_norm": 3.8769220455422917, "learning_rate": 2.3312269191396619e-07, "loss": 3.4398, "step": 2333 }, { "epoch": 0.93, "grad_norm": 3.2645249976733792, "learning_rate": 2.30349898624167e-07, "loss": 3.5903, "step": 2334 }, { "epoch": 0.93, "grad_norm": 3.2437369619688132, "learning_rate": 2.2759350198746978e-07, "loss": 3.2821, "step": 2335 }, { "epoch": 0.93, "grad_norm": 3.585064031562485, "learning_rate": 2.2485350663000727e-07, "loss": 3.3699, "step": 2336 }, { "epoch": 0.93, "grad_norm": 3.529688315884326, "learning_rate": 2.2212991715038324e-07, "loss": 3.3273, "step": 2337 }, { "epoch": 0.94, "grad_norm": 3.5455952725918034, "learning_rate": 2.1942273811966563e-07, "loss": 3.3869, "step": 2338 }, { "epoch": 0.94, "grad_norm": 3.7542843609631973, "learning_rate": 2.1673197408138115e-07, "loss": 3.5519, "step": 2339 }, { "epoch": 0.94, "grad_norm": 3.7024764625310995, "learning_rate": 2.1405762955151178e-07, "loss": 3.3642, "step": 2340 }, { "epoch": 0.94, "grad_norm": 3.2576909980611903, "learning_rate": 2.1139970901847607e-07, "loss": 3.1324, "step": 2341 }, { "epoch": 0.94, "grad_norm": 3.291794267112212, "learning_rate": 2.0875821694313014e-07, "loss": 3.492, "step": 2342 }, { "epoch": 0.94, "grad_norm": 3.851766738786476, "learning_rate": 2.0613315775875665e-07, "loss": 3.4191, "step": 2343 }, { "epoch": 0.94, "grad_norm": 3.3503396542169863, "learning_rate": 2.0352453587105914e-07, "loss": 3.4286, "step": 2344 }, { "epoch": 0.94, "grad_norm": 3.0832706870783757, "learning_rate": 2.009323556581566e-07, "loss": 3.4376, "step": 2345 }, { "epoch": 0.94, "grad_norm": 3.2176834985322738, "learning_rate": 1.9835662147057012e-07, "loss": 3.7042, "step": 2346 }, { "epoch": 0.94, "grad_norm": 3.508572081745489, "learning_rate": 1.9579733763121943e-07, "loss": 3.2487, "step": 2347 }, { "epoch": 0.94, "grad_norm": 3.3815565196920945, "learning_rate": 1.932545084354154e-07, "loss": 3.3544, "step": 2348 }, { "epoch": 0.94, "grad_norm": 3.224405710797126, "learning_rate": 1.9072813815085523e-07, "loss": 3.3338, "step": 2349 }, { "epoch": 0.94, "grad_norm": 3.587434527550172, "learning_rate": 1.8821823101760949e-07, "loss": 3.2779, "step": 2350 }, { "epoch": 0.94, "grad_norm": 4.0105448519410976, "learning_rate": 1.857247912481197e-07, "loss": 3.5875, "step": 2351 }, { "epoch": 0.94, "grad_norm": 3.3545775574783443, "learning_rate": 1.8324782302718835e-07, "loss": 3.6504, "step": 2352 }, { "epoch": 0.94, "grad_norm": 3.772545381786392, "learning_rate": 1.8078733051197561e-07, "loss": 3.2604, "step": 2353 }, { "epoch": 0.94, "grad_norm": 3.514296507565377, "learning_rate": 1.7834331783198933e-07, "loss": 3.4913, "step": 2354 }, { "epoch": 0.94, "grad_norm": 3.2130027457938604, "learning_rate": 1.7591578908907724e-07, "loss": 3.2063, "step": 2355 }, { "epoch": 0.94, "grad_norm": 3.473408546211247, "learning_rate": 1.735047483574215e-07, "loss": 3.4455, "step": 2356 }, { "epoch": 0.94, "grad_norm": 4.611810751851434, "learning_rate": 1.7111019968353625e-07, "loss": 3.4056, "step": 2357 }, { "epoch": 0.94, "grad_norm": 3.448733256723853, "learning_rate": 1.687321470862524e-07, "loss": 3.4959, "step": 2358 }, { "epoch": 0.94, "grad_norm": 3.1270683895468423, "learning_rate": 1.6637059455671623e-07, "loss": 3.4274, "step": 2359 }, { "epoch": 0.94, "grad_norm": 3.7016371313782153, "learning_rate": 1.6402554605838173e-07, "loss": 3.5068, "step": 2360 }, { "epoch": 0.94, "grad_norm": 3.7952189658057955, "learning_rate": 1.6169700552700284e-07, "loss": 3.3875, "step": 2361 }, { "epoch": 0.94, "grad_norm": 3.6167241598160005, "learning_rate": 1.5938497687062905e-07, "loss": 3.4302, "step": 2362 }, { "epoch": 0.95, "grad_norm": 3.899382383808761, "learning_rate": 1.5708946396959856e-07, "loss": 3.112, "step": 2363 }, { "epoch": 0.95, "grad_norm": 3.61072616759305, "learning_rate": 1.5481047067652744e-07, "loss": 3.0877, "step": 2364 }, { "epoch": 0.95, "grad_norm": 3.6528911414970993, "learning_rate": 1.5254800081630828e-07, "loss": 3.1509, "step": 2365 }, { "epoch": 0.95, "grad_norm": 3.7677655630715092, "learning_rate": 1.5030205818610255e-07, "loss": 3.286, "step": 2366 }, { "epoch": 0.95, "grad_norm": 3.0290147877844635, "learning_rate": 1.4807264655533282e-07, "loss": 3.4737, "step": 2367 }, { "epoch": 0.95, "grad_norm": 3.3276357529009775, "learning_rate": 1.4585976966567826e-07, "loss": 3.292, "step": 2368 }, { "epoch": 0.95, "grad_norm": 3.5897484848827896, "learning_rate": 1.4366343123106697e-07, "loss": 3.1764, "step": 2369 }, { "epoch": 0.95, "grad_norm": 3.931259962842155, "learning_rate": 1.4148363493766803e-07, "loss": 3.2836, "step": 2370 }, { "epoch": 0.95, "grad_norm": 2.8715543592371913, "learning_rate": 1.3932038444389063e-07, "loss": 3.456, "step": 2371 }, { "epoch": 0.95, "grad_norm": 3.6658305690245285, "learning_rate": 1.3717368338037163e-07, "loss": 3.5232, "step": 2372 }, { "epoch": 0.95, "grad_norm": 3.5552091059925846, "learning_rate": 1.3504353534997682e-07, "loss": 3.4265, "step": 2373 }, { "epoch": 0.95, "grad_norm": 4.060853993648709, "learning_rate": 1.3292994392778535e-07, "loss": 3.6187, "step": 2374 }, { "epoch": 0.95, "grad_norm": 3.4127774044560337, "learning_rate": 1.30832912661093e-07, "loss": 3.4672, "step": 2375 }, { "epoch": 0.95, "grad_norm": 3.3298217946063042, "learning_rate": 1.287524450694011e-07, "loss": 3.3182, "step": 2376 }, { "epoch": 0.95, "grad_norm": 3.765897310420544, "learning_rate": 1.2668854464441104e-07, "loss": 3.5001, "step": 2377 }, { "epoch": 0.95, "grad_norm": 3.6799767075818983, "learning_rate": 1.246412148500198e-07, "loss": 3.3275, "step": 2378 }, { "epoch": 0.95, "grad_norm": 3.271650758753963, "learning_rate": 1.2261045912231318e-07, "loss": 3.4898, "step": 2379 }, { "epoch": 0.95, "grad_norm": 3.4900191809890475, "learning_rate": 1.2059628086956044e-07, "loss": 3.3488, "step": 2380 }, { "epoch": 0.95, "grad_norm": 3.432667422028782, "learning_rate": 1.1859868347220749e-07, "loss": 3.3059, "step": 2381 }, { "epoch": 0.95, "grad_norm": 3.1095534136728507, "learning_rate": 1.1661767028287363e-07, "loss": 3.3028, "step": 2382 }, { "epoch": 0.95, "grad_norm": 3.2825282377319884, "learning_rate": 1.1465324462634375e-07, "loss": 3.2163, "step": 2383 }, { "epoch": 0.95, "grad_norm": 3.348291541570649, "learning_rate": 1.1270540979956501e-07, "loss": 3.4357, "step": 2384 }, { "epoch": 0.95, "grad_norm": 3.8133420709248758, "learning_rate": 1.1077416907163573e-07, "loss": 3.3492, "step": 2385 }, { "epoch": 0.95, "grad_norm": 4.416116463150174, "learning_rate": 1.0885952568380764e-07, "loss": 3.2941, "step": 2386 }, { "epoch": 0.95, "grad_norm": 3.2336397899674707, "learning_rate": 1.0696148284947694e-07, "loss": 3.2592, "step": 2387 }, { "epoch": 0.96, "grad_norm": 3.290423744841851, "learning_rate": 1.0508004375417546e-07, "loss": 3.4197, "step": 2388 }, { "epoch": 0.96, "grad_norm": 3.4718862466754437, "learning_rate": 1.032152115555718e-07, "loss": 3.1517, "step": 2389 }, { "epoch": 0.96, "grad_norm": 3.4536745523599475, "learning_rate": 1.0136698938346012e-07, "loss": 3.3263, "step": 2390 }, { "epoch": 0.96, "grad_norm": 3.4892773617678046, "learning_rate": 9.953538033975918e-08, "loss": 3.548, "step": 2391 }, { "epoch": 0.96, "grad_norm": 3.274808744119864, "learning_rate": 9.772038749850665e-08, "loss": 3.4609, "step": 2392 }, { "epoch": 0.96, "grad_norm": 3.1066954963040736, "learning_rate": 9.59220139058492e-08, "loss": 3.5136, "step": 2393 }, { "epoch": 0.96, "grad_norm": 3.819428406700386, "learning_rate": 9.414026258004583e-08, "loss": 3.3541, "step": 2394 }, { "epoch": 0.96, "grad_norm": 3.5996884033585483, "learning_rate": 9.237513651145224e-08, "loss": 3.5706, "step": 2395 }, { "epoch": 0.96, "grad_norm": 4.280278075970925, "learning_rate": 9.062663866252541e-08, "loss": 3.422, "step": 2396 }, { "epoch": 0.96, "grad_norm": 3.6535216978039577, "learning_rate": 8.889477196781571e-08, "loss": 3.2123, "step": 2397 }, { "epoch": 0.96, "grad_norm": 3.6186688736381183, "learning_rate": 8.717953933395695e-08, "loss": 3.5313, "step": 2398 }, { "epoch": 0.96, "grad_norm": 3.4552877616197284, "learning_rate": 8.548094363966974e-08, "loss": 3.2186, "step": 2399 }, { "epoch": 0.96, "grad_norm": 3.614807339334542, "learning_rate": 8.379898773574924e-08, "loss": 3.4121, "step": 2400 }, { "epoch": 0.96, "grad_norm": 3.560606455212878, "learning_rate": 8.213367444506515e-08, "loss": 3.4588, "step": 2401 }, { "epoch": 0.96, "grad_norm": 3.894001650217336, "learning_rate": 8.04850065625551e-08, "loss": 3.3264, "step": 2402 }, { "epoch": 0.96, "grad_norm": 3.7294219687904273, "learning_rate": 7.885298685522235e-08, "loss": 3.268, "step": 2403 }, { "epoch": 0.96, "grad_norm": 3.3727386747414427, "learning_rate": 7.723761806212371e-08, "loss": 3.5093, "step": 2404 }, { "epoch": 0.96, "grad_norm": 3.6980302225240456, "learning_rate": 7.563890289437825e-08, "loss": 3.2537, "step": 2405 }, { "epoch": 0.96, "grad_norm": 3.3141221393707134, "learning_rate": 7.405684403514635e-08, "loss": 3.3421, "step": 2406 }, { "epoch": 0.96, "grad_norm": 3.943541524063672, "learning_rate": 7.24914441396396e-08, "loss": 3.3136, "step": 2407 }, { "epoch": 0.96, "grad_norm": 3.621160257306377, "learning_rate": 7.094270583510976e-08, "loss": 3.4558, "step": 2408 }, { "epoch": 0.96, "grad_norm": 3.611032570898534, "learning_rate": 6.941063172084094e-08, "loss": 3.5023, "step": 2409 }, { "epoch": 0.96, "grad_norm": 3.069405550611829, "learning_rate": 6.78952243681541e-08, "loss": 3.5456, "step": 2410 }, { "epoch": 0.96, "grad_norm": 3.2993895937672195, "learning_rate": 6.639648632039697e-08, "loss": 3.5449, "step": 2411 }, { "epoch": 0.96, "grad_norm": 3.673634537765846, "learning_rate": 6.491442009293858e-08, "loss": 3.4421, "step": 2412 }, { "epoch": 0.97, "grad_norm": 3.7532147515910252, "learning_rate": 6.344902817316811e-08, "loss": 3.3273, "step": 2413 }, { "epoch": 0.97, "grad_norm": 3.7110501713964834, "learning_rate": 6.200031302049048e-08, "loss": 3.2886, "step": 2414 }, { "epoch": 0.97, "grad_norm": 3.438293669662373, "learning_rate": 6.056827706632185e-08, "loss": 3.5594, "step": 2415 }, { "epoch": 0.97, "grad_norm": 3.508072088402561, "learning_rate": 5.915292271408524e-08, "loss": 3.3602, "step": 2416 }, { "epoch": 0.97, "grad_norm": 3.5684197432611096, "learning_rate": 5.7754252339204955e-08, "loss": 3.414, "step": 2417 }, { "epoch": 0.97, "grad_norm": 4.319063366225016, "learning_rate": 5.637226828910436e-08, "loss": 3.3698, "step": 2418 }, { "epoch": 0.97, "grad_norm": 3.8336668797631637, "learning_rate": 5.5006972883204776e-08, "loss": 3.3801, "step": 2419 }, { "epoch": 0.97, "grad_norm": 3.6058057493735585, "learning_rate": 5.365836841291439e-08, "loss": 3.4924, "step": 2420 }, { "epoch": 0.97, "grad_norm": 3.4600163464627554, "learning_rate": 5.232645714163265e-08, "loss": 3.4741, "step": 2421 }, { "epoch": 0.97, "grad_norm": 3.480906233508525, "learning_rate": 5.1011241304738115e-08, "loss": 3.3203, "step": 2422 }, { "epoch": 0.97, "grad_norm": 3.3298471961537905, "learning_rate": 4.9712723109590636e-08, "loss": 3.4202, "step": 2423 }, { "epoch": 0.97, "grad_norm": 3.337131802937764, "learning_rate": 4.843090473552914e-08, "loss": 3.4015, "step": 2424 }, { "epoch": 0.97, "grad_norm": 3.2352102980270843, "learning_rate": 4.716578833386054e-08, "loss": 3.2217, "step": 2425 }, { "epoch": 0.97, "grad_norm": 3.436029228983509, "learning_rate": 4.5917376027861945e-08, "loss": 3.2638, "step": 2426 }, { "epoch": 0.97, "grad_norm": 3.8479008982912006, "learning_rate": 4.468566991277512e-08, "loss": 3.2157, "step": 2427 }, { "epoch": 0.97, "grad_norm": 3.500510734627635, "learning_rate": 4.347067205580424e-08, "loss": 3.4505, "step": 2428 }, { "epoch": 0.97, "grad_norm": 4.057314345010156, "learning_rate": 4.2272384496112597e-08, "loss": 3.4177, "step": 2429 }, { "epoch": 0.97, "grad_norm": 3.5742843904617185, "learning_rate": 4.109080924481479e-08, "loss": 3.2047, "step": 2430 }, { "epoch": 0.97, "grad_norm": 3.980513447383297, "learning_rate": 3.9925948284980086e-08, "loss": 3.3224, "step": 2431 }, { "epoch": 0.97, "grad_norm": 3.523660677303886, "learning_rate": 3.877780357162353e-08, "loss": 3.1595, "step": 2432 }, { "epoch": 0.97, "grad_norm": 3.7222719612091675, "learning_rate": 3.764637703170593e-08, "loss": 3.4171, "step": 2433 }, { "epoch": 0.97, "grad_norm": 3.294507134265397, "learning_rate": 3.653167056413054e-08, "loss": 3.4806, "step": 2434 }, { "epoch": 0.97, "grad_norm": 3.4418301658508055, "learning_rate": 3.543368603973529e-08, "loss": 3.2865, "step": 2435 }, { "epoch": 0.97, "grad_norm": 3.3616588149744073, "learning_rate": 3.435242530129723e-08, "loss": 3.4426, "step": 2436 }, { "epoch": 0.97, "grad_norm": 3.9010966248921273, "learning_rate": 3.3287890163523626e-08, "loss": 3.5075, "step": 2437 }, { "epoch": 0.98, "grad_norm": 3.116521733307449, "learning_rate": 3.224008241304977e-08, "loss": 3.2828, "step": 2438 }, { "epoch": 0.98, "grad_norm": 3.285127613209865, "learning_rate": 3.120900380844116e-08, "loss": 3.3661, "step": 2439 }, { "epoch": 0.98, "grad_norm": 3.5071884384835306, "learning_rate": 3.019465608018024e-08, "loss": 3.5639, "step": 2440 }, { "epoch": 0.98, "grad_norm": 3.1870037421829043, "learning_rate": 2.9197040930674102e-08, "loss": 3.1974, "step": 2441 }, { "epoch": 0.98, "grad_norm": 3.509012040156903, "learning_rate": 2.8216160034244544e-08, "loss": 3.3545, "step": 2442 }, { "epoch": 0.98, "grad_norm": 3.70637322606787, "learning_rate": 2.7252015037131373e-08, "loss": 3.3309, "step": 2443 }, { "epoch": 0.98, "grad_norm": 3.8258205503720206, "learning_rate": 2.6304607557481322e-08, "loss": 3.3279, "step": 2444 }, { "epoch": 0.98, "grad_norm": 3.3015147026301306, "learning_rate": 2.537393918535358e-08, "loss": 3.3331, "step": 2445 }, { "epoch": 0.98, "grad_norm": 3.558951915848977, "learning_rate": 2.4460011482713153e-08, "loss": 3.2497, "step": 2446 }, { "epoch": 0.98, "grad_norm": 3.989173866915556, "learning_rate": 2.3562825983427517e-08, "loss": 3.5516, "step": 2447 }, { "epoch": 0.98, "grad_norm": 3.639158558560462, "learning_rate": 2.2682384193266625e-08, "loss": 3.4531, "step": 2448 }, { "epoch": 0.98, "grad_norm": 4.331617858251365, "learning_rate": 2.1818687589896248e-08, "loss": 3.4148, "step": 2449 }, { "epoch": 0.98, "grad_norm": 3.5011814560248915, "learning_rate": 2.0971737622883515e-08, "loss": 3.2685, "step": 2450 }, { "epoch": 0.98, "grad_norm": 3.4780309027220215, "learning_rate": 2.01415357136836e-08, "loss": 3.2917, "step": 2451 }, { "epoch": 0.98, "grad_norm": 4.40132084644993, "learning_rate": 1.93280832556475e-08, "loss": 3.3335, "step": 2452 }, { "epoch": 0.98, "grad_norm": 3.359641935311238, "learning_rate": 1.8531381614013133e-08, "loss": 3.4068, "step": 2453 }, { "epoch": 0.98, "grad_norm": 3.1694399725333087, "learning_rate": 1.7751432125903133e-08, "loss": 3.3822, "step": 2454 }, { "epoch": 0.98, "grad_norm": 3.587399184824128, "learning_rate": 1.698823610032929e-08, "loss": 3.4473, "step": 2455 }, { "epoch": 0.98, "grad_norm": 3.302820237000699, "learning_rate": 1.6241794818180333e-08, "loss": 3.2511, "step": 2456 }, { "epoch": 0.98, "grad_norm": 3.546048636973119, "learning_rate": 1.5512109532229703e-08, "loss": 3.3763, "step": 2457 }, { "epoch": 0.98, "grad_norm": 3.6815940730038124, "learning_rate": 1.4799181467125557e-08, "loss": 3.5656, "step": 2458 }, { "epoch": 0.98, "grad_norm": 3.23628539901506, "learning_rate": 1.4103011819395218e-08, "loss": 3.652, "step": 2459 }, { "epoch": 0.98, "grad_norm": 4.047804798596946, "learning_rate": 1.3423601757436289e-08, "loss": 3.2655, "step": 2460 }, { "epoch": 0.98, "grad_norm": 3.528437058663026, "learning_rate": 1.276095242151998e-08, "loss": 3.1583, "step": 2461 }, { "epoch": 0.98, "grad_norm": 3.4972696524475113, "learning_rate": 1.2115064923787778e-08, "loss": 3.344, "step": 2462 }, { "epoch": 0.99, "grad_norm": 2.9816851292985413, "learning_rate": 1.1485940348249235e-08, "loss": 3.2459, "step": 2463 }, { "epoch": 0.99, "grad_norm": 3.5442232507559304, "learning_rate": 1.087357975078085e-08, "loss": 3.352, "step": 2464 }, { "epoch": 0.99, "grad_norm": 3.4841578124819246, "learning_rate": 1.0277984159122734e-08, "loss": 3.423, "step": 2465 }, { "epoch": 0.99, "grad_norm": 3.5554413670728247, "learning_rate": 9.699154572877511e-09, "loss": 3.3301, "step": 2466 }, { "epoch": 0.99, "grad_norm": 3.6871360033669887, "learning_rate": 9.137091963510314e-09, "loss": 3.2715, "step": 2467 }, { "epoch": 0.99, "grad_norm": 3.165684460893977, "learning_rate": 8.591797274344338e-09, "loss": 3.4842, "step": 2468 }, { "epoch": 0.99, "grad_norm": 2.9791606895583, "learning_rate": 8.063271420563068e-09, "loss": 3.4481, "step": 2469 }, { "epoch": 0.99, "grad_norm": 2.9882821622236264, "learning_rate": 7.551515289203615e-09, "loss": 3.5619, "step": 2470 }, { "epoch": 0.99, "grad_norm": 3.7516938084672558, "learning_rate": 7.056529739158935e-09, "loss": 3.488, "step": 2471 }, { "epoch": 0.99, "grad_norm": 3.80255477519852, "learning_rate": 6.5783156011778315e-09, "loss": 3.3546, "step": 2472 }, { "epoch": 0.99, "grad_norm": 4.145429658546217, "learning_rate": 6.116873677858293e-09, "loss": 3.3526, "step": 2473 }, { "epoch": 0.99, "grad_norm": 3.6512344972109707, "learning_rate": 5.6722047436497115e-09, "loss": 3.3866, "step": 2474 }, { "epoch": 0.99, "grad_norm": 3.9255532464199225, "learning_rate": 5.2443095448506674e-09, "loss": 3.4207, "step": 2475 }, { "epoch": 0.99, "grad_norm": 3.500865341292617, "learning_rate": 4.833188799610033e-09, "loss": 3.5687, "step": 2476 }, { "epoch": 0.99, "grad_norm": 3.4307967357678644, "learning_rate": 4.438843197922538e-09, "loss": 3.3044, "step": 2477 }, { "epoch": 0.99, "grad_norm": 3.3415100972592366, "learning_rate": 4.061273401627653e-09, "loss": 3.3692, "step": 2478 }, { "epoch": 0.99, "grad_norm": 3.3171125128650845, "learning_rate": 3.7004800444095933e-09, "loss": 3.3511, "step": 2479 }, { "epoch": 0.99, "grad_norm": 3.563985957009853, "learning_rate": 3.3564637317984318e-09, "loss": 3.2359, "step": 2480 }, { "epoch": 0.99, "grad_norm": 3.2396809393878967, "learning_rate": 3.0292250411645406e-09, "loss": 3.5031, "step": 2481 }, { "epoch": 0.99, "grad_norm": 3.487495476298278, "learning_rate": 2.7187645217219283e-09, "loss": 3.2537, "step": 2482 }, { "epoch": 0.99, "grad_norm": 3.462183993392574, "learning_rate": 2.4250826945226847e-09, "loss": 3.4758, "step": 2483 }, { "epoch": 0.99, "grad_norm": 3.45474989584317, "learning_rate": 2.148180052462534e-09, "loss": 3.4833, "step": 2484 }, { "epoch": 0.99, "grad_norm": 3.5726004000973304, "learning_rate": 1.888057060274173e-09, "loss": 3.227, "step": 2485 }, { "epoch": 0.99, "grad_norm": 3.2144747119002104, "learning_rate": 1.6447141545272717e-09, "loss": 3.168, "step": 2486 }, { "epoch": 0.99, "grad_norm": 3.608690469923541, "learning_rate": 1.4181517436306913e-09, "loss": 3.2924, "step": 2487 }, { "epoch": 1.0, "grad_norm": 3.523006771698765, "learning_rate": 1.2083702078302673e-09, "loss": 3.4771, "step": 2488 }, { "epoch": 1.0, "grad_norm": 3.393916464258931, "learning_rate": 1.0153698992088069e-09, "loss": 3.4871, "step": 2489 }, { "epoch": 1.0, "grad_norm": 3.5257507681162927, "learning_rate": 8.391511416816489e-10, "loss": 3.3328, "step": 2490 }, { "epoch": 1.0, "grad_norm": 3.5827728588499856, "learning_rate": 6.797142310022154e-10, "loss": 3.0827, "step": 2491 }, { "epoch": 1.0, "grad_norm": 3.412401860436348, "learning_rate": 5.370594347575697e-10, "loss": 3.4849, "step": 2492 }, { "epoch": 1.0, "grad_norm": 3.3902248151074974, "learning_rate": 4.1118699236841753e-10, "loss": 3.478, "step": 2493 }, { "epoch": 1.0, "grad_norm": 3.1063910939480706, "learning_rate": 3.0209711509132657e-10, "loss": 3.4109, "step": 2494 }, { "epoch": 1.0, "grad_norm": 3.5735345416169753, "learning_rate": 2.0978998601206558e-10, "loss": 3.3083, "step": 2495 }, { "epoch": 1.0, "grad_norm": 3.4576775876063928, "learning_rate": 1.342657600544861e-10, "loss": 3.51, "step": 2496 }, { "epoch": 1.0, "grad_norm": 3.835368532804796, "learning_rate": 7.552456397053042e-11, "loss": 3.3847, "step": 2497 }, { "epoch": 1.0, "grad_norm": 2.9813514920634563, "learning_rate": 3.3566496349113355e-11, "loss": 3.5756, "step": 2498 }, { "epoch": 1.0, "grad_norm": 3.2882708884674345, "learning_rate": 8.39162760835066e-12, "loss": 3.4625, "step": 2499 }, { "epoch": 1.0, "grad_norm": 3.528204116206639, "learning_rate": 0.0, "loss": 3.2533, "step": 2500 }, { "epoch": 1.0, "step": 2500, "total_flos": 5.835554469802148e+17, "train_loss": 4.1818853558540345, "train_runtime": 17904.7495, "train_samples_per_second": 4.468, "train_steps_per_second": 0.14 } ], "logging_steps": 1.0, "max_steps": 2500, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 240, "total_flos": 5.835554469802148e+17, "train_batch_size": 4, "trial_name": null, "trial_params": null }