{ "best_metric": 0.08259893208742142, "best_model_checkpoint": "arabic-embedding-model-only-pairs-v2/checkpoint-49115", "epoch": 5.0, "eval_steps": 500, "global_step": 49115, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0025450473378804844, "grad_norm": 17.451679229736328, "learning_rate": 1.0179153094462541e-07, "loss": 0.3458, "step": 25 }, { "epoch": 0.005090094675760969, "grad_norm": 8.519526481628418, "learning_rate": 2.0358306188925083e-07, "loss": 0.3213, "step": 50 }, { "epoch": 0.0076351420136414536, "grad_norm": 14.82270622253418, "learning_rate": 3.053745928338762e-07, "loss": 0.3414, "step": 75 }, { "epoch": 0.010180189351521938, "grad_norm": 20.397518157958984, "learning_rate": 4.0716612377850166e-07, "loss": 0.3622, "step": 100 }, { "epoch": 0.012725236689402423, "grad_norm": 14.741048812866211, "learning_rate": 5.089576547231271e-07, "loss": 0.3607, "step": 125 }, { "epoch": 0.015270284027282907, "grad_norm": 16.42844581604004, "learning_rate": 6.107491856677524e-07, "loss": 0.3514, "step": 150 }, { "epoch": 0.017815331365163393, "grad_norm": 4.865312099456787, "learning_rate": 7.12540716612378e-07, "loss": 0.2282, "step": 175 }, { "epoch": 0.020360378703043875, "grad_norm": 14.216117858886719, "learning_rate": 8.143322475570033e-07, "loss": 0.2781, "step": 200 }, { "epoch": 0.02290542604092436, "grad_norm": 11.410103797912598, "learning_rate": 9.161237785016288e-07, "loss": 0.3206, "step": 225 }, { "epoch": 0.025450473378804846, "grad_norm": 13.930344581604004, "learning_rate": 1.0179153094462542e-06, "loss": 0.3592, "step": 250 }, { "epoch": 0.027995520716685332, "grad_norm": 9.695392608642578, "learning_rate": 1.1197068403908795e-06, "loss": 0.2837, "step": 275 }, { "epoch": 0.030540568054565814, "grad_norm": 19.564462661743164, "learning_rate": 1.2214983713355049e-06, "loss": 0.2227, "step": 300 }, { "epoch": 0.033085615392446296, "grad_norm": 6.403764247894287, "learning_rate": 1.3232899022801304e-06, "loss": 0.3072, "step": 325 }, { "epoch": 0.035630662730326786, "grad_norm": 4.398595333099365, "learning_rate": 1.425081433224756e-06, "loss": 0.3487, "step": 350 }, { "epoch": 0.03817571006820727, "grad_norm": 14.698935508728027, "learning_rate": 1.5268729641693813e-06, "loss": 0.2439, "step": 375 }, { "epoch": 0.04072075740608775, "grad_norm": 4.241194248199463, "learning_rate": 1.6286644951140066e-06, "loss": 0.291, "step": 400 }, { "epoch": 0.04326580474396824, "grad_norm": 12.753772735595703, "learning_rate": 1.730456026058632e-06, "loss": 0.2325, "step": 425 }, { "epoch": 0.04581085208184872, "grad_norm": 8.159408569335938, "learning_rate": 1.8322475570032575e-06, "loss": 0.3068, "step": 450 }, { "epoch": 0.048355899419729204, "grad_norm": 7.215402126312256, "learning_rate": 1.934039087947883e-06, "loss": 0.3268, "step": 475 }, { "epoch": 0.05090094675760969, "grad_norm": 13.441118240356445, "learning_rate": 2.0358306188925084e-06, "loss": 0.2973, "step": 500 }, { "epoch": 0.053445994095490175, "grad_norm": 4.826434135437012, "learning_rate": 2.137622149837134e-06, "loss": 0.3116, "step": 525 }, { "epoch": 0.055991041433370664, "grad_norm": 9.862593650817871, "learning_rate": 2.239413680781759e-06, "loss": 0.2543, "step": 550 }, { "epoch": 0.058536088771251146, "grad_norm": 7.720365524291992, "learning_rate": 2.3412052117263846e-06, "loss": 0.2747, "step": 575 }, { "epoch": 0.06108113610913163, "grad_norm": 10.59146785736084, "learning_rate": 2.4429967426710097e-06, "loss": 0.2809, "step": 600 }, { "epoch": 0.06362618344701211, "grad_norm": 9.026193618774414, "learning_rate": 2.5447882736156353e-06, "loss": 0.3561, "step": 625 }, { "epoch": 0.06617123078489259, "grad_norm": 11.523633003234863, "learning_rate": 2.6425081433224757e-06, "loss": 0.2683, "step": 650 }, { "epoch": 0.06871627812277309, "grad_norm": 13.002677917480469, "learning_rate": 2.7442996742671013e-06, "loss": 0.249, "step": 675 }, { "epoch": 0.07126132546065357, "grad_norm": 12.167911529541016, "learning_rate": 2.846091205211727e-06, "loss": 0.2482, "step": 700 }, { "epoch": 0.07380637279853405, "grad_norm": 7.002504348754883, "learning_rate": 2.9478827361563524e-06, "loss": 0.3015, "step": 725 }, { "epoch": 0.07635142013641454, "grad_norm": 14.07034683227539, "learning_rate": 3.0496742671009775e-06, "loss": 0.2422, "step": 750 }, { "epoch": 0.07889646747429502, "grad_norm": 9.051650047302246, "learning_rate": 3.1514657980456026e-06, "loss": 0.259, "step": 775 }, { "epoch": 0.0814415148121755, "grad_norm": 7.5605692863464355, "learning_rate": 3.253257328990228e-06, "loss": 0.2565, "step": 800 }, { "epoch": 0.083986562150056, "grad_norm": 7.689245223999023, "learning_rate": 3.3550488599348537e-06, "loss": 0.254, "step": 825 }, { "epoch": 0.08653160948793648, "grad_norm": 10.289682388305664, "learning_rate": 3.456840390879479e-06, "loss": 0.2366, "step": 850 }, { "epoch": 0.08907665682581696, "grad_norm": 14.462525367736816, "learning_rate": 3.5586319218241044e-06, "loss": 0.2805, "step": 875 }, { "epoch": 0.09162170416369744, "grad_norm": 12.019689559936523, "learning_rate": 3.66042345276873e-06, "loss": 0.2881, "step": 900 }, { "epoch": 0.09416675150157792, "grad_norm": 6.248305320739746, "learning_rate": 3.7622149837133555e-06, "loss": 0.2424, "step": 925 }, { "epoch": 0.09671179883945841, "grad_norm": 8.205031394958496, "learning_rate": 3.864006514657981e-06, "loss": 0.2342, "step": 950 }, { "epoch": 0.0992568461773389, "grad_norm": 10.721994400024414, "learning_rate": 3.965798045602606e-06, "loss": 0.2036, "step": 975 }, { "epoch": 0.10180189351521939, "grad_norm": 10.379674911499023, "learning_rate": 4.067589576547232e-06, "loss": 0.2432, "step": 1000 }, { "epoch": 0.10434694085309987, "grad_norm": 11.975798606872559, "learning_rate": 4.169381107491857e-06, "loss": 0.2782, "step": 1025 }, { "epoch": 0.10689198819098035, "grad_norm": 9.442329406738281, "learning_rate": 4.271172638436483e-06, "loss": 0.2303, "step": 1050 }, { "epoch": 0.10943703552886083, "grad_norm": 11.444186210632324, "learning_rate": 4.372964169381108e-06, "loss": 0.2418, "step": 1075 }, { "epoch": 0.11198208286674133, "grad_norm": 7.217404365539551, "learning_rate": 4.474755700325733e-06, "loss": 0.1989, "step": 1100 }, { "epoch": 0.11452713020462181, "grad_norm": 13.797436714172363, "learning_rate": 4.5765472312703586e-06, "loss": 0.2189, "step": 1125 }, { "epoch": 0.11707217754250229, "grad_norm": 7.028829097747803, "learning_rate": 4.678338762214984e-06, "loss": 0.2295, "step": 1150 }, { "epoch": 0.11961722488038277, "grad_norm": 8.137258529663086, "learning_rate": 4.78013029315961e-06, "loss": 0.1583, "step": 1175 }, { "epoch": 0.12216227221826326, "grad_norm": 10.967634201049805, "learning_rate": 4.881921824104235e-06, "loss": 0.2247, "step": 1200 }, { "epoch": 0.12470731955614374, "grad_norm": 4.914444446563721, "learning_rate": 4.98371335504886e-06, "loss": 0.2127, "step": 1225 }, { "epoch": 0.12725236689402422, "grad_norm": 6.940595626831055, "learning_rate": 5.0855048859934855e-06, "loss": 0.2244, "step": 1250 }, { "epoch": 0.12979741423190472, "grad_norm": 11.294093132019043, "learning_rate": 5.187296416938111e-06, "loss": 0.2218, "step": 1275 }, { "epoch": 0.13234246156978519, "grad_norm": 9.178437232971191, "learning_rate": 5.2890879478827366e-06, "loss": 0.2489, "step": 1300 }, { "epoch": 0.13488750890766568, "grad_norm": 7.749022006988525, "learning_rate": 5.390879478827362e-06, "loss": 0.1778, "step": 1325 }, { "epoch": 0.13743255624554618, "grad_norm": 32.12667465209961, "learning_rate": 5.492671009771987e-06, "loss": 0.2243, "step": 1350 }, { "epoch": 0.13997760358342665, "grad_norm": 8.55077075958252, "learning_rate": 5.594462540716613e-06, "loss": 0.2063, "step": 1375 }, { "epoch": 0.14252265092130714, "grad_norm": 15.310531616210938, "learning_rate": 5.696254071661238e-06, "loss": 0.2187, "step": 1400 }, { "epoch": 0.1450676982591876, "grad_norm": 11.116787910461426, "learning_rate": 5.798045602605864e-06, "loss": 0.204, "step": 1425 }, { "epoch": 0.1476127455970681, "grad_norm": 6.040707588195801, "learning_rate": 5.899837133550489e-06, "loss": 0.2039, "step": 1450 }, { "epoch": 0.1501577929349486, "grad_norm": 10.210320472717285, "learning_rate": 6.0016286644951145e-06, "loss": 0.2186, "step": 1475 }, { "epoch": 0.15270284027282907, "grad_norm": 11.306329727172852, "learning_rate": 6.10342019543974e-06, "loss": 0.2352, "step": 1500 }, { "epoch": 0.15524788761070957, "grad_norm": 8.576631546020508, "learning_rate": 6.205211726384366e-06, "loss": 0.2578, "step": 1525 }, { "epoch": 0.15779293494859004, "grad_norm": 16.613567352294922, "learning_rate": 6.307003257328991e-06, "loss": 0.2047, "step": 1550 }, { "epoch": 0.16033798228647053, "grad_norm": 6.0128936767578125, "learning_rate": 6.408794788273616e-06, "loss": 0.1833, "step": 1575 }, { "epoch": 0.162883029624351, "grad_norm": 11.588747024536133, "learning_rate": 6.5105863192182414e-06, "loss": 0.2564, "step": 1600 }, { "epoch": 0.1654280769622315, "grad_norm": 14.360913276672363, "learning_rate": 6.612377850162867e-06, "loss": 0.2021, "step": 1625 }, { "epoch": 0.167973124300112, "grad_norm": 7.827113628387451, "learning_rate": 6.7141693811074925e-06, "loss": 0.1901, "step": 1650 }, { "epoch": 0.17051817163799246, "grad_norm": 6.506629467010498, "learning_rate": 6.815960912052117e-06, "loss": 0.2183, "step": 1675 }, { "epoch": 0.17306321897587296, "grad_norm": 9.81983470916748, "learning_rate": 6.917752442996744e-06, "loss": 0.1661, "step": 1700 }, { "epoch": 0.17560826631375342, "grad_norm": 15.31557559967041, "learning_rate": 7.019543973941368e-06, "loss": 0.1797, "step": 1725 }, { "epoch": 0.17815331365163392, "grad_norm": 13.521235466003418, "learning_rate": 7.121335504885995e-06, "loss": 0.2105, "step": 1750 }, { "epoch": 0.18069836098951442, "grad_norm": 6.115505695343018, "learning_rate": 7.223127035830619e-06, "loss": 0.2214, "step": 1775 }, { "epoch": 0.18324340832739489, "grad_norm": 11.252988815307617, "learning_rate": 7.324918566775245e-06, "loss": 0.2475, "step": 1800 }, { "epoch": 0.18578845566527538, "grad_norm": 20.72943878173828, "learning_rate": 7.4267100977198705e-06, "loss": 0.2395, "step": 1825 }, { "epoch": 0.18833350300315585, "grad_norm": 12.99436092376709, "learning_rate": 7.528501628664495e-06, "loss": 0.1911, "step": 1850 }, { "epoch": 0.19087855034103635, "grad_norm": 3.998025417327881, "learning_rate": 7.630293159609122e-06, "loss": 0.1588, "step": 1875 }, { "epoch": 0.19342359767891681, "grad_norm": 6.960880279541016, "learning_rate": 7.732084690553745e-06, "loss": 0.2358, "step": 1900 }, { "epoch": 0.1959686450167973, "grad_norm": 7.048129558563232, "learning_rate": 7.833876221498373e-06, "loss": 0.1773, "step": 1925 }, { "epoch": 0.1985136923546778, "grad_norm": 35.43663787841797, "learning_rate": 7.935667752442997e-06, "loss": 0.2347, "step": 1950 }, { "epoch": 0.20105873969255827, "grad_norm": 4.078209400177002, "learning_rate": 8.037459283387624e-06, "loss": 0.188, "step": 1975 }, { "epoch": 0.20360378703043877, "grad_norm": 7.5787458419799805, "learning_rate": 8.139250814332248e-06, "loss": 0.1925, "step": 2000 }, { "epoch": 0.20614883436831924, "grad_norm": 5.975426197052002, "learning_rate": 8.241042345276873e-06, "loss": 0.191, "step": 2025 }, { "epoch": 0.20869388170619974, "grad_norm": 18.67757797241211, "learning_rate": 8.342833876221499e-06, "loss": 0.2032, "step": 2050 }, { "epoch": 0.21123892904408023, "grad_norm": 7.5380449295043945, "learning_rate": 8.444625407166124e-06, "loss": 0.1787, "step": 2075 }, { "epoch": 0.2137839763819607, "grad_norm": 11.195531845092773, "learning_rate": 8.54641693811075e-06, "loss": 0.2327, "step": 2100 }, { "epoch": 0.2163290237198412, "grad_norm": 3.4572274684906006, "learning_rate": 8.648208469055375e-06, "loss": 0.196, "step": 2125 }, { "epoch": 0.21887407105772166, "grad_norm": 8.280584335327148, "learning_rate": 8.750000000000001e-06, "loss": 0.1945, "step": 2150 }, { "epoch": 0.22141911839560216, "grad_norm": 7.217907905578613, "learning_rate": 8.851791530944625e-06, "loss": 0.1918, "step": 2175 }, { "epoch": 0.22396416573348266, "grad_norm": 2.9696857929229736, "learning_rate": 8.953583061889252e-06, "loss": 0.2011, "step": 2200 }, { "epoch": 0.22650921307136312, "grad_norm": 13.338550567626953, "learning_rate": 9.055374592833876e-06, "loss": 0.1711, "step": 2225 }, { "epoch": 0.22905426040924362, "grad_norm": 11.651854515075684, "learning_rate": 9.157166123778503e-06, "loss": 0.2687, "step": 2250 }, { "epoch": 0.2315993077471241, "grad_norm": 1.1671559810638428, "learning_rate": 9.258957654723127e-06, "loss": 0.1998, "step": 2275 }, { "epoch": 0.23414435508500459, "grad_norm": 8.616079330444336, "learning_rate": 9.360749185667754e-06, "loss": 0.1604, "step": 2300 }, { "epoch": 0.23668940242288505, "grad_norm": 4.227112770080566, "learning_rate": 9.462540716612378e-06, "loss": 0.1871, "step": 2325 }, { "epoch": 0.23923444976076555, "grad_norm": 7.659778594970703, "learning_rate": 9.564332247557004e-06, "loss": 0.273, "step": 2350 }, { "epoch": 0.24177949709864605, "grad_norm": 5.232609272003174, "learning_rate": 9.66612377850163e-06, "loss": 0.2405, "step": 2375 }, { "epoch": 0.24432454443652651, "grad_norm": 8.198054313659668, "learning_rate": 9.767915309446255e-06, "loss": 0.2212, "step": 2400 }, { "epoch": 0.246869591774407, "grad_norm": 10.666227340698242, "learning_rate": 9.86970684039088e-06, "loss": 0.2293, "step": 2425 }, { "epoch": 0.24941463911228748, "grad_norm": 12.714786529541016, "learning_rate": 9.971498371335506e-06, "loss": 0.1887, "step": 2450 }, { "epoch": 0.251959686450168, "grad_norm": 9.544090270996094, "learning_rate": 1.007328990228013e-05, "loss": 0.233, "step": 2475 }, { "epoch": 0.25450473378804844, "grad_norm": 9.07693862915039, "learning_rate": 1.0175081433224757e-05, "loss": 0.1761, "step": 2500 }, { "epoch": 0.25704978112592897, "grad_norm": 6.585052490234375, "learning_rate": 1.0276872964169382e-05, "loss": 0.2103, "step": 2525 }, { "epoch": 0.25959482846380943, "grad_norm": 10.828688621520996, "learning_rate": 1.0378664495114008e-05, "loss": 0.2042, "step": 2550 }, { "epoch": 0.2621398758016899, "grad_norm": 4.312686920166016, "learning_rate": 1.0480456026058632e-05, "loss": 0.1831, "step": 2575 }, { "epoch": 0.26468492313957037, "grad_norm": 7.761200428009033, "learning_rate": 1.0582247557003257e-05, "loss": 0.2006, "step": 2600 }, { "epoch": 0.2672299704774509, "grad_norm": 12.342726707458496, "learning_rate": 1.0684039087947885e-05, "loss": 0.1801, "step": 2625 }, { "epoch": 0.26977501781533136, "grad_norm": 12.449806213378906, "learning_rate": 1.078583061889251e-05, "loss": 0.1625, "step": 2650 }, { "epoch": 0.27232006515321183, "grad_norm": 1.6762300729751587, "learning_rate": 1.0887622149837134e-05, "loss": 0.1741, "step": 2675 }, { "epoch": 0.27486511249109236, "grad_norm": 13.451739311218262, "learning_rate": 1.098941368078176e-05, "loss": 0.2158, "step": 2700 }, { "epoch": 0.2774101598289728, "grad_norm": 5.619725227355957, "learning_rate": 1.1091205211726385e-05, "loss": 0.2246, "step": 2725 }, { "epoch": 0.2799552071668533, "grad_norm": 3.5796515941619873, "learning_rate": 1.1192996742671012e-05, "loss": 0.1823, "step": 2750 }, { "epoch": 0.28250025450473376, "grad_norm": 6.106029033660889, "learning_rate": 1.1294788273615636e-05, "loss": 0.1484, "step": 2775 }, { "epoch": 0.2850453018426143, "grad_norm": 10.085928916931152, "learning_rate": 1.1396579804560262e-05, "loss": 0.1743, "step": 2800 }, { "epoch": 0.28759034918049475, "grad_norm": 5.781569480895996, "learning_rate": 1.1498371335504887e-05, "loss": 0.2208, "step": 2825 }, { "epoch": 0.2901353965183752, "grad_norm": 9.282120704650879, "learning_rate": 1.1600162866449511e-05, "loss": 0.1441, "step": 2850 }, { "epoch": 0.29268044385625575, "grad_norm": 21.168132781982422, "learning_rate": 1.1701954397394137e-05, "loss": 0.1998, "step": 2875 }, { "epoch": 0.2952254911941362, "grad_norm": 7.404329776763916, "learning_rate": 1.1803745928338764e-05, "loss": 0.2111, "step": 2900 }, { "epoch": 0.2977705385320167, "grad_norm": 11.75802230834961, "learning_rate": 1.190553745928339e-05, "loss": 0.1657, "step": 2925 }, { "epoch": 0.3003155858698972, "grad_norm": 7.526225566864014, "learning_rate": 1.2007328990228013e-05, "loss": 0.2189, "step": 2950 }, { "epoch": 0.3028606332077777, "grad_norm": 5.306471347808838, "learning_rate": 1.2109120521172639e-05, "loss": 0.203, "step": 2975 }, { "epoch": 0.30540568054565814, "grad_norm": 2.9105184078216553, "learning_rate": 1.2210912052117264e-05, "loss": 0.1789, "step": 3000 }, { "epoch": 0.3079507278835386, "grad_norm": 0.9518977403640747, "learning_rate": 1.2312703583061892e-05, "loss": 0.1435, "step": 3025 }, { "epoch": 0.31049577522141913, "grad_norm": 1.7667123079299927, "learning_rate": 1.2414495114006516e-05, "loss": 0.1727, "step": 3050 }, { "epoch": 0.3130408225592996, "grad_norm": 9.456025123596191, "learning_rate": 1.2516286644951141e-05, "loss": 0.2045, "step": 3075 }, { "epoch": 0.31558586989718007, "grad_norm": 8.54640007019043, "learning_rate": 1.2618078175895767e-05, "loss": 0.2143, "step": 3100 }, { "epoch": 0.3181309172350606, "grad_norm": 4.005561828613281, "learning_rate": 1.271986970684039e-05, "loss": 0.1971, "step": 3125 }, { "epoch": 0.32067596457294106, "grad_norm": 4.398591995239258, "learning_rate": 1.2821661237785018e-05, "loss": 0.2121, "step": 3150 }, { "epoch": 0.32322101191082153, "grad_norm": 8.242807388305664, "learning_rate": 1.2923452768729643e-05, "loss": 0.183, "step": 3175 }, { "epoch": 0.325766059248702, "grad_norm": 7.716986656188965, "learning_rate": 1.3025244299674269e-05, "loss": 0.2471, "step": 3200 }, { "epoch": 0.3283111065865825, "grad_norm": 2.0308592319488525, "learning_rate": 1.3127035830618893e-05, "loss": 0.1714, "step": 3225 }, { "epoch": 0.330856153924463, "grad_norm": 5.050072193145752, "learning_rate": 1.3228827361563518e-05, "loss": 0.1689, "step": 3250 }, { "epoch": 0.33340120126234346, "grad_norm": 8.50062370300293, "learning_rate": 1.3330618892508145e-05, "loss": 0.1817, "step": 3275 }, { "epoch": 0.335946248600224, "grad_norm": 12.829822540283203, "learning_rate": 1.3432410423452771e-05, "loss": 0.1616, "step": 3300 }, { "epoch": 0.33849129593810445, "grad_norm": 6.215601921081543, "learning_rate": 1.3534201954397395e-05, "loss": 0.152, "step": 3325 }, { "epoch": 0.3410363432759849, "grad_norm": 9.163058280944824, "learning_rate": 1.363599348534202e-05, "loss": 0.1619, "step": 3350 }, { "epoch": 0.34358139061386544, "grad_norm": 6.206748008728027, "learning_rate": 1.3737785016286646e-05, "loss": 0.1881, "step": 3375 }, { "epoch": 0.3461264379517459, "grad_norm": 10.556598663330078, "learning_rate": 1.383957654723127e-05, "loss": 0.1974, "step": 3400 }, { "epoch": 0.3486714852896264, "grad_norm": 5.033000946044922, "learning_rate": 1.3941368078175897e-05, "loss": 0.1609, "step": 3425 }, { "epoch": 0.35121653262750685, "grad_norm": 9.810371398925781, "learning_rate": 1.4043159609120523e-05, "loss": 0.185, "step": 3450 }, { "epoch": 0.3537615799653874, "grad_norm": 2.4697887897491455, "learning_rate": 1.4144951140065148e-05, "loss": 0.1459, "step": 3475 }, { "epoch": 0.35630662730326784, "grad_norm": 7.219648838043213, "learning_rate": 1.4246742671009772e-05, "loss": 0.1896, "step": 3500 }, { "epoch": 0.3588516746411483, "grad_norm": 5.063100337982178, "learning_rate": 1.4348534201954398e-05, "loss": 0.1944, "step": 3525 }, { "epoch": 0.36139672197902883, "grad_norm": 4.737133502960205, "learning_rate": 1.4450325732899025e-05, "loss": 0.2766, "step": 3550 }, { "epoch": 0.3639417693169093, "grad_norm": 12.325284004211426, "learning_rate": 1.455211726384365e-05, "loss": 0.166, "step": 3575 }, { "epoch": 0.36648681665478977, "grad_norm": 5.75470495223999, "learning_rate": 1.4653908794788274e-05, "loss": 0.1889, "step": 3600 }, { "epoch": 0.36903186399267024, "grad_norm": 0.9477280378341675, "learning_rate": 1.47557003257329e-05, "loss": 0.1586, "step": 3625 }, { "epoch": 0.37157691133055076, "grad_norm": 12.12957763671875, "learning_rate": 1.4857491856677525e-05, "loss": 0.1832, "step": 3650 }, { "epoch": 0.37412195866843123, "grad_norm": 4.511873245239258, "learning_rate": 1.4959283387622153e-05, "loss": 0.15, "step": 3675 }, { "epoch": 0.3766670060063117, "grad_norm": 2.4949402809143066, "learning_rate": 1.5061074918566776e-05, "loss": 0.2203, "step": 3700 }, { "epoch": 0.3792120533441922, "grad_norm": 6.411753177642822, "learning_rate": 1.5162866449511402e-05, "loss": 0.1409, "step": 3725 }, { "epoch": 0.3817571006820727, "grad_norm": 7.92652702331543, "learning_rate": 1.5264657980456027e-05, "loss": 0.157, "step": 3750 }, { "epoch": 0.38430214801995316, "grad_norm": 6.347306728363037, "learning_rate": 1.536644951140065e-05, "loss": 0.2132, "step": 3775 }, { "epoch": 0.38684719535783363, "grad_norm": 4.59040641784668, "learning_rate": 1.546824104234528e-05, "loss": 0.1653, "step": 3800 }, { "epoch": 0.38939224269571415, "grad_norm": 6.297945022583008, "learning_rate": 1.5570032573289902e-05, "loss": 0.2345, "step": 3825 }, { "epoch": 0.3919372900335946, "grad_norm": 6.927093982696533, "learning_rate": 1.567182410423453e-05, "loss": 0.171, "step": 3850 }, { "epoch": 0.3944823373714751, "grad_norm": 3.081691265106201, "learning_rate": 1.5773615635179154e-05, "loss": 0.1914, "step": 3875 }, { "epoch": 0.3970273847093556, "grad_norm": 8.08444595336914, "learning_rate": 1.587540716612378e-05, "loss": 0.1674, "step": 3900 }, { "epoch": 0.3995724320472361, "grad_norm": 4.390102386474609, "learning_rate": 1.5977198697068405e-05, "loss": 0.2075, "step": 3925 }, { "epoch": 0.40211747938511655, "grad_norm": 4.687044620513916, "learning_rate": 1.6078990228013032e-05, "loss": 0.1706, "step": 3950 }, { "epoch": 0.4046625267229971, "grad_norm": 6.264878749847412, "learning_rate": 1.6180781758957656e-05, "loss": 0.1797, "step": 3975 }, { "epoch": 0.40720757406087754, "grad_norm": 6.146754264831543, "learning_rate": 1.6282573289902283e-05, "loss": 0.2039, "step": 4000 }, { "epoch": 0.409752621398758, "grad_norm": 4.093338966369629, "learning_rate": 1.6384364820846907e-05, "loss": 0.1719, "step": 4025 }, { "epoch": 0.4122976687366385, "grad_norm": 4.742345809936523, "learning_rate": 1.648615635179153e-05, "loss": 0.1802, "step": 4050 }, { "epoch": 0.414842716074519, "grad_norm": 7.697132110595703, "learning_rate": 1.6587947882736158e-05, "loss": 0.1495, "step": 4075 }, { "epoch": 0.41738776341239947, "grad_norm": 9.093936920166016, "learning_rate": 1.6689739413680782e-05, "loss": 0.1765, "step": 4100 }, { "epoch": 0.41993281075027994, "grad_norm": 2.59102725982666, "learning_rate": 1.679153094462541e-05, "loss": 0.2024, "step": 4125 }, { "epoch": 0.42247785808816046, "grad_norm": 14.66810131072998, "learning_rate": 1.6893322475570033e-05, "loss": 0.2391, "step": 4150 }, { "epoch": 0.42502290542604093, "grad_norm": 9.222769737243652, "learning_rate": 1.699511400651466e-05, "loss": 0.1881, "step": 4175 }, { "epoch": 0.4275679527639214, "grad_norm": 5.003434658050537, "learning_rate": 1.7096905537459284e-05, "loss": 0.1537, "step": 4200 }, { "epoch": 0.43011300010180187, "grad_norm": 5.4720139503479, "learning_rate": 1.719869706840391e-05, "loss": 0.1573, "step": 4225 }, { "epoch": 0.4326580474396824, "grad_norm": 4.233233451843262, "learning_rate": 1.7300488599348535e-05, "loss": 0.1619, "step": 4250 }, { "epoch": 0.43520309477756286, "grad_norm": 5.573329925537109, "learning_rate": 1.7402280130293162e-05, "loss": 0.1827, "step": 4275 }, { "epoch": 0.43774814211544333, "grad_norm": 7.554567813873291, "learning_rate": 1.7504071661237786e-05, "loss": 0.1722, "step": 4300 }, { "epoch": 0.44029318945332385, "grad_norm": 8.24718952178955, "learning_rate": 1.760586319218241e-05, "loss": 0.138, "step": 4325 }, { "epoch": 0.4428382367912043, "grad_norm": 8.050121307373047, "learning_rate": 1.7707654723127037e-05, "loss": 0.1826, "step": 4350 }, { "epoch": 0.4453832841290848, "grad_norm": 2.46492338180542, "learning_rate": 1.7809446254071664e-05, "loss": 0.1447, "step": 4375 }, { "epoch": 0.4479283314669653, "grad_norm": 8.03940486907959, "learning_rate": 1.791123778501629e-05, "loss": 0.1745, "step": 4400 }, { "epoch": 0.4504733788048458, "grad_norm": 4.312441825866699, "learning_rate": 1.8013029315960912e-05, "loss": 0.1308, "step": 4425 }, { "epoch": 0.45301842614272625, "grad_norm": 8.30965805053711, "learning_rate": 1.811482084690554e-05, "loss": 0.2061, "step": 4450 }, { "epoch": 0.4555634734806067, "grad_norm": 4.426910400390625, "learning_rate": 1.8216612377850163e-05, "loss": 0.1735, "step": 4475 }, { "epoch": 0.45810852081848724, "grad_norm": 12.524816513061523, "learning_rate": 1.831840390879479e-05, "loss": 0.1937, "step": 4500 }, { "epoch": 0.4606535681563677, "grad_norm": 7.2023797035217285, "learning_rate": 1.8420195439739414e-05, "loss": 0.1794, "step": 4525 }, { "epoch": 0.4631986154942482, "grad_norm": 5.721170425415039, "learning_rate": 1.852198697068404e-05, "loss": 0.2055, "step": 4550 }, { "epoch": 0.4657436628321287, "grad_norm": 6.1515350341796875, "learning_rate": 1.8623778501628665e-05, "loss": 0.1835, "step": 4575 }, { "epoch": 0.46828871017000917, "grad_norm": 6.489041805267334, "learning_rate": 1.8725570032573293e-05, "loss": 0.1597, "step": 4600 }, { "epoch": 0.47083375750788964, "grad_norm": 6.8791093826293945, "learning_rate": 1.8827361563517917e-05, "loss": 0.1608, "step": 4625 }, { "epoch": 0.4733788048457701, "grad_norm": 3.147822141647339, "learning_rate": 1.8929153094462544e-05, "loss": 0.1653, "step": 4650 }, { "epoch": 0.47592385218365063, "grad_norm": 1.5910171270370483, "learning_rate": 1.9030944625407168e-05, "loss": 0.1526, "step": 4675 }, { "epoch": 0.4784688995215311, "grad_norm": 1.3465055227279663, "learning_rate": 1.9128664495114007e-05, "loss": 0.1507, "step": 4700 }, { "epoch": 0.48101394685941157, "grad_norm": 7.712373733520508, "learning_rate": 1.923045602605863e-05, "loss": 0.1571, "step": 4725 }, { "epoch": 0.4835589941972921, "grad_norm": 9.888788223266602, "learning_rate": 1.933224755700326e-05, "loss": 0.2194, "step": 4750 }, { "epoch": 0.48610404153517256, "grad_norm": 7.199795722961426, "learning_rate": 1.9434039087947886e-05, "loss": 0.1781, "step": 4775 }, { "epoch": 0.48864908887305303, "grad_norm": 12.238067626953125, "learning_rate": 1.953583061889251e-05, "loss": 0.1745, "step": 4800 }, { "epoch": 0.4911941362109335, "grad_norm": 1.1686809062957764, "learning_rate": 1.9637622149837133e-05, "loss": 0.1348, "step": 4825 }, { "epoch": 0.493739183548814, "grad_norm": 6.479466915130615, "learning_rate": 1.973941368078176e-05, "loss": 0.2164, "step": 4850 }, { "epoch": 0.4962842308866945, "grad_norm": 6.764782905578613, "learning_rate": 1.9841205211726388e-05, "loss": 0.1724, "step": 4875 }, { "epoch": 0.49882927822457496, "grad_norm": 2.8050501346588135, "learning_rate": 1.994299674267101e-05, "loss": 0.1563, "step": 4900 }, { "epoch": 0.5013743255624554, "grad_norm": 7.177031517028809, "learning_rate": 1.9995022962242382e-05, "loss": 0.2448, "step": 4925 }, { "epoch": 0.503919372900336, "grad_norm": 12.320595741271973, "learning_rate": 1.998371151279325e-05, "loss": 0.1796, "step": 4950 }, { "epoch": 0.5064644202382165, "grad_norm": 7.938055038452148, "learning_rate": 1.997240006334412e-05, "loss": 0.1541, "step": 4975 }, { "epoch": 0.5090094675760969, "grad_norm": 4.647416591644287, "learning_rate": 1.9961088613894987e-05, "loss": 0.1906, "step": 5000 }, { "epoch": 0.5115545149139774, "grad_norm": 4.05308723449707, "learning_rate": 1.9949777164445853e-05, "loss": 0.1822, "step": 5025 }, { "epoch": 0.5140995622518579, "grad_norm": 3.471247434616089, "learning_rate": 1.9938465714996723e-05, "loss": 0.1778, "step": 5050 }, { "epoch": 0.5166446095897383, "grad_norm": 8.766022682189941, "learning_rate": 1.992715426554759e-05, "loss": 0.1741, "step": 5075 }, { "epoch": 0.5191896569276189, "grad_norm": 7.638113021850586, "learning_rate": 1.9915842816098455e-05, "loss": 0.2008, "step": 5100 }, { "epoch": 0.5217347042654993, "grad_norm": 2.3895423412323, "learning_rate": 1.9904531366649325e-05, "loss": 0.1857, "step": 5125 }, { "epoch": 0.5242797516033798, "grad_norm": 9.395495414733887, "learning_rate": 1.9893219917200194e-05, "loss": 0.1966, "step": 5150 }, { "epoch": 0.5268247989412603, "grad_norm": 7.309564590454102, "learning_rate": 1.988190846775106e-05, "loss": 0.1868, "step": 5175 }, { "epoch": 0.5293698462791407, "grad_norm": 2.8665430545806885, "learning_rate": 1.9870597018301927e-05, "loss": 0.1607, "step": 5200 }, { "epoch": 0.5319148936170213, "grad_norm": 4.512245178222656, "learning_rate": 1.9859285568852796e-05, "loss": 0.1793, "step": 5225 }, { "epoch": 0.5344599409549018, "grad_norm": 5.876766681671143, "learning_rate": 1.9847974119403662e-05, "loss": 0.173, "step": 5250 }, { "epoch": 0.5370049882927822, "grad_norm": 13.551549911499023, "learning_rate": 1.983666266995453e-05, "loss": 0.1564, "step": 5275 }, { "epoch": 0.5395500356306627, "grad_norm": 8.268135070800781, "learning_rate": 1.9825351220505398e-05, "loss": 0.1681, "step": 5300 }, { "epoch": 0.5420950829685433, "grad_norm": 4.490458011627197, "learning_rate": 1.9814039771056264e-05, "loss": 0.2074, "step": 5325 }, { "epoch": 0.5446401303064237, "grad_norm": 7.859119415283203, "learning_rate": 1.9802728321607134e-05, "loss": 0.208, "step": 5350 }, { "epoch": 0.5471851776443042, "grad_norm": 3.8342528343200684, "learning_rate": 1.9791416872158e-05, "loss": 0.1809, "step": 5375 }, { "epoch": 0.5497302249821847, "grad_norm": 6.6303181648254395, "learning_rate": 1.978010542270887e-05, "loss": 0.1548, "step": 5400 }, { "epoch": 0.5522752723200651, "grad_norm": 5.547021865844727, "learning_rate": 1.9768793973259735e-05, "loss": 0.1755, "step": 5425 }, { "epoch": 0.5548203196579456, "grad_norm": 5.783038139343262, "learning_rate": 1.97574825238106e-05, "loss": 0.1891, "step": 5450 }, { "epoch": 0.5573653669958262, "grad_norm": 4.4773359298706055, "learning_rate": 1.974617107436147e-05, "loss": 0.1569, "step": 5475 }, { "epoch": 0.5599104143337066, "grad_norm": 7.39841890335083, "learning_rate": 1.9734859624912337e-05, "loss": 0.1599, "step": 5500 }, { "epoch": 0.5624554616715871, "grad_norm": 7.901022434234619, "learning_rate": 1.9723548175463207e-05, "loss": 0.1919, "step": 5525 }, { "epoch": 0.5650005090094675, "grad_norm": 5.0281596183776855, "learning_rate": 1.9712236726014073e-05, "loss": 0.1901, "step": 5550 }, { "epoch": 0.567545556347348, "grad_norm": 5.846480369567871, "learning_rate": 1.9700925276564943e-05, "loss": 0.1736, "step": 5575 }, { "epoch": 0.5700906036852286, "grad_norm": 5.52054500579834, "learning_rate": 1.968961382711581e-05, "loss": 0.1477, "step": 5600 }, { "epoch": 0.572635651023109, "grad_norm": 8.760751724243164, "learning_rate": 1.9678302377666675e-05, "loss": 0.189, "step": 5625 }, { "epoch": 0.5751806983609895, "grad_norm": 5.988430500030518, "learning_rate": 1.9666990928217544e-05, "loss": 0.2132, "step": 5650 }, { "epoch": 0.57772574569887, "grad_norm": 5.293154716491699, "learning_rate": 1.965567947876841e-05, "loss": 0.1636, "step": 5675 }, { "epoch": 0.5802707930367504, "grad_norm": 6.205268859863281, "learning_rate": 1.964436802931928e-05, "loss": 0.1592, "step": 5700 }, { "epoch": 0.582815840374631, "grad_norm": 3.520357370376587, "learning_rate": 1.9633056579870146e-05, "loss": 0.1607, "step": 5725 }, { "epoch": 0.5853608877125115, "grad_norm": 9.744399070739746, "learning_rate": 1.9621745130421012e-05, "loss": 0.1978, "step": 5750 }, { "epoch": 0.5879059350503919, "grad_norm": 7.006613254547119, "learning_rate": 1.9610433680971882e-05, "loss": 0.1954, "step": 5775 }, { "epoch": 0.5904509823882724, "grad_norm": 2.8386662006378174, "learning_rate": 1.9599122231522748e-05, "loss": 0.1457, "step": 5800 }, { "epoch": 0.592996029726153, "grad_norm": 9.212706565856934, "learning_rate": 1.9587810782073618e-05, "loss": 0.1229, "step": 5825 }, { "epoch": 0.5955410770640334, "grad_norm": 2.414585828781128, "learning_rate": 1.9576499332624484e-05, "loss": 0.1644, "step": 5850 }, { "epoch": 0.5980861244019139, "grad_norm": 9.854455947875977, "learning_rate": 1.9565187883175353e-05, "loss": 0.1804, "step": 5875 }, { "epoch": 0.6006311717397944, "grad_norm": 5.590005874633789, "learning_rate": 1.955387643372622e-05, "loss": 0.1697, "step": 5900 }, { "epoch": 0.6031762190776748, "grad_norm": 3.120067834854126, "learning_rate": 1.9542564984277086e-05, "loss": 0.1828, "step": 5925 }, { "epoch": 0.6057212664155553, "grad_norm": 11.758387565612793, "learning_rate": 1.9531253534827955e-05, "loss": 0.1476, "step": 5950 }, { "epoch": 0.6082663137534358, "grad_norm": 3.0814547538757324, "learning_rate": 1.951994208537882e-05, "loss": 0.1765, "step": 5975 }, { "epoch": 0.6108113610913163, "grad_norm": 5.264235496520996, "learning_rate": 1.950863063592969e-05, "loss": 0.1937, "step": 6000 }, { "epoch": 0.6133564084291968, "grad_norm": 7.090575695037842, "learning_rate": 1.9497319186480557e-05, "loss": 0.1737, "step": 6025 }, { "epoch": 0.6159014557670772, "grad_norm": 1.6197588443756104, "learning_rate": 1.9486007737031426e-05, "loss": 0.1403, "step": 6050 }, { "epoch": 0.6184465031049577, "grad_norm": 6.1466240882873535, "learning_rate": 1.9474696287582293e-05, "loss": 0.1869, "step": 6075 }, { "epoch": 0.6209915504428383, "grad_norm": 3.4034574031829834, "learning_rate": 1.946338483813316e-05, "loss": 0.17, "step": 6100 }, { "epoch": 0.6235365977807187, "grad_norm": 6.991599082946777, "learning_rate": 1.945207338868403e-05, "loss": 0.1534, "step": 6125 }, { "epoch": 0.6260816451185992, "grad_norm": 4.051377773284912, "learning_rate": 1.9440761939234894e-05, "loss": 0.1778, "step": 6150 }, { "epoch": 0.6286266924564797, "grad_norm": 5.443165302276611, "learning_rate": 1.942945048978576e-05, "loss": 0.1741, "step": 6175 }, { "epoch": 0.6311717397943601, "grad_norm": 5.732553482055664, "learning_rate": 1.941813904033663e-05, "loss": 0.1622, "step": 6200 }, { "epoch": 0.6337167871322407, "grad_norm": 6.2127180099487305, "learning_rate": 1.94068275908875e-05, "loss": 0.2172, "step": 6225 }, { "epoch": 0.6362618344701212, "grad_norm": 5.593441963195801, "learning_rate": 1.9395516141438366e-05, "loss": 0.1618, "step": 6250 }, { "epoch": 0.6388068818080016, "grad_norm": 4.706175804138184, "learning_rate": 1.9384204691989232e-05, "loss": 0.1973, "step": 6275 }, { "epoch": 0.6413519291458821, "grad_norm": 5.412227630615234, "learning_rate": 1.93728932425401e-05, "loss": 0.1896, "step": 6300 }, { "epoch": 0.6438969764837627, "grad_norm": 4.554426193237305, "learning_rate": 1.9361581793090968e-05, "loss": 0.1766, "step": 6325 }, { "epoch": 0.6464420238216431, "grad_norm": 5.6457839012146, "learning_rate": 1.9350270343641834e-05, "loss": 0.1692, "step": 6350 }, { "epoch": 0.6489870711595236, "grad_norm": 5.07610559463501, "learning_rate": 1.9338958894192703e-05, "loss": 0.1687, "step": 6375 }, { "epoch": 0.651532118497404, "grad_norm": 8.431342124938965, "learning_rate": 1.9327647444743573e-05, "loss": 0.1635, "step": 6400 }, { "epoch": 0.6540771658352845, "grad_norm": 2.4447405338287354, "learning_rate": 1.931633599529444e-05, "loss": 0.1625, "step": 6425 }, { "epoch": 0.656622213173165, "grad_norm": 5.066686153411865, "learning_rate": 1.9305024545845305e-05, "loss": 0.1532, "step": 6450 }, { "epoch": 0.6591672605110455, "grad_norm": 5.495453357696533, "learning_rate": 1.9293713096396175e-05, "loss": 0.1668, "step": 6475 }, { "epoch": 0.661712307848926, "grad_norm": 12.891332626342773, "learning_rate": 1.928240164694704e-05, "loss": 0.1869, "step": 6500 }, { "epoch": 0.6642573551868065, "grad_norm": 4.865134239196777, "learning_rate": 1.9271090197497907e-05, "loss": 0.1891, "step": 6525 }, { "epoch": 0.6668024025246869, "grad_norm": 7.412440776824951, "learning_rate": 1.9259778748048777e-05, "loss": 0.1914, "step": 6550 }, { "epoch": 0.6693474498625674, "grad_norm": 4.157654285430908, "learning_rate": 1.9248467298599646e-05, "loss": 0.1951, "step": 6575 }, { "epoch": 0.671892497200448, "grad_norm": 3.6485140323638916, "learning_rate": 1.9237155849150512e-05, "loss": 0.1453, "step": 6600 }, { "epoch": 0.6744375445383284, "grad_norm": 6.457579612731934, "learning_rate": 1.922584439970138e-05, "loss": 0.1671, "step": 6625 }, { "epoch": 0.6769825918762089, "grad_norm": 4.9921650886535645, "learning_rate": 1.9214532950252248e-05, "loss": 0.1527, "step": 6650 }, { "epoch": 0.6795276392140894, "grad_norm": 11.607213973999023, "learning_rate": 1.9203221500803114e-05, "loss": 0.1945, "step": 6675 }, { "epoch": 0.6820726865519698, "grad_norm": 1.1529160737991333, "learning_rate": 1.919191005135398e-05, "loss": 0.1575, "step": 6700 }, { "epoch": 0.6846177338898504, "grad_norm": 3.0068256855010986, "learning_rate": 1.918059860190485e-05, "loss": 0.1633, "step": 6725 }, { "epoch": 0.6871627812277309, "grad_norm": 4.913267135620117, "learning_rate": 1.916928715245572e-05, "loss": 0.1672, "step": 6750 }, { "epoch": 0.6897078285656113, "grad_norm": 6.968467712402344, "learning_rate": 1.9157975703006582e-05, "loss": 0.1768, "step": 6775 }, { "epoch": 0.6922528759034918, "grad_norm": 3.2676682472229004, "learning_rate": 1.914711671153542e-05, "loss": 0.1957, "step": 6800 }, { "epoch": 0.6947979232413722, "grad_norm": 2.3184611797332764, "learning_rate": 1.9135805262086286e-05, "loss": 0.1427, "step": 6825 }, { "epoch": 0.6973429705792528, "grad_norm": 7.692404270172119, "learning_rate": 1.9124493812637152e-05, "loss": 0.1333, "step": 6850 }, { "epoch": 0.6998880179171333, "grad_norm": 10.22700309753418, "learning_rate": 1.9113182363188022e-05, "loss": 0.1457, "step": 6875 }, { "epoch": 0.7024330652550137, "grad_norm": 6.087644577026367, "learning_rate": 1.9101870913738888e-05, "loss": 0.1821, "step": 6900 }, { "epoch": 0.7049781125928942, "grad_norm": 4.5725836753845215, "learning_rate": 1.9090559464289754e-05, "loss": 0.1419, "step": 6925 }, { "epoch": 0.7075231599307747, "grad_norm": 5.20031213760376, "learning_rate": 1.9079248014840624e-05, "loss": 0.1687, "step": 6950 }, { "epoch": 0.7100682072686552, "grad_norm": 5.029293060302734, "learning_rate": 1.9067936565391493e-05, "loss": 0.2075, "step": 6975 }, { "epoch": 0.7126132546065357, "grad_norm": 6.754817962646484, "learning_rate": 1.905662511594236e-05, "loss": 0.1931, "step": 7000 }, { "epoch": 0.7151583019444162, "grad_norm": 0.8655216693878174, "learning_rate": 1.9045313666493225e-05, "loss": 0.1519, "step": 7025 }, { "epoch": 0.7177033492822966, "grad_norm": 6.249109268188477, "learning_rate": 1.9034002217044095e-05, "loss": 0.135, "step": 7050 }, { "epoch": 0.7202483966201771, "grad_norm": 2.2959837913513184, "learning_rate": 1.902269076759496e-05, "loss": 0.1467, "step": 7075 }, { "epoch": 0.7227934439580577, "grad_norm": 7.267690658569336, "learning_rate": 1.9011379318145827e-05, "loss": 0.2012, "step": 7100 }, { "epoch": 0.7253384912959381, "grad_norm": 3.546318292617798, "learning_rate": 1.9000067868696697e-05, "loss": 0.1632, "step": 7125 }, { "epoch": 0.7278835386338186, "grad_norm": 7.313410758972168, "learning_rate": 1.8988756419247566e-05, "loss": 0.1581, "step": 7150 }, { "epoch": 0.730428585971699, "grad_norm": 5.693080902099609, "learning_rate": 1.8977444969798432e-05, "loss": 0.1336, "step": 7175 }, { "epoch": 0.7329736333095795, "grad_norm": 5.479151248931885, "learning_rate": 1.89661335203493e-05, "loss": 0.1555, "step": 7200 }, { "epoch": 0.7355186806474601, "grad_norm": 6.8671345710754395, "learning_rate": 1.8954822070900168e-05, "loss": 0.1901, "step": 7225 }, { "epoch": 0.7380637279853405, "grad_norm": 5.530952453613281, "learning_rate": 1.8943510621451034e-05, "loss": 0.1602, "step": 7250 }, { "epoch": 0.740608775323221, "grad_norm": 8.927380561828613, "learning_rate": 1.89321991720019e-05, "loss": 0.1987, "step": 7275 }, { "epoch": 0.7431538226611015, "grad_norm": 7.032819747924805, "learning_rate": 1.892088772255277e-05, "loss": 0.1757, "step": 7300 }, { "epoch": 0.7456988699989819, "grad_norm": 1.044941782951355, "learning_rate": 1.890957627310364e-05, "loss": 0.1622, "step": 7325 }, { "epoch": 0.7482439173368625, "grad_norm": 4.259761810302734, "learning_rate": 1.8898264823654506e-05, "loss": 0.1402, "step": 7350 }, { "epoch": 0.750788964674743, "grad_norm": 6.318974494934082, "learning_rate": 1.8886953374205372e-05, "loss": 0.1463, "step": 7375 }, { "epoch": 0.7533340120126234, "grad_norm": 9.901176452636719, "learning_rate": 1.887564192475624e-05, "loss": 0.1683, "step": 7400 }, { "epoch": 0.7558790593505039, "grad_norm": 5.005266189575195, "learning_rate": 1.8864330475307108e-05, "loss": 0.1813, "step": 7425 }, { "epoch": 0.7584241066883844, "grad_norm": 7.308044910430908, "learning_rate": 1.8853019025857974e-05, "loss": 0.137, "step": 7450 }, { "epoch": 0.7609691540262649, "grad_norm": 7.757829666137695, "learning_rate": 1.8841707576408843e-05, "loss": 0.1778, "step": 7475 }, { "epoch": 0.7635142013641454, "grad_norm": 8.0330228805542, "learning_rate": 1.883039612695971e-05, "loss": 0.1876, "step": 7500 }, { "epoch": 0.7660592487020259, "grad_norm": 11.038161277770996, "learning_rate": 1.881908467751058e-05, "loss": 0.151, "step": 7525 }, { "epoch": 0.7686042960399063, "grad_norm": 7.047212600708008, "learning_rate": 1.8807773228061445e-05, "loss": 0.1672, "step": 7550 }, { "epoch": 0.7711493433777868, "grad_norm": 2.1965415477752686, "learning_rate": 1.8796461778612315e-05, "loss": 0.119, "step": 7575 }, { "epoch": 0.7736943907156673, "grad_norm": 5.676666259765625, "learning_rate": 1.878515032916318e-05, "loss": 0.1732, "step": 7600 }, { "epoch": 0.7762394380535478, "grad_norm": 5.984830856323242, "learning_rate": 1.8773838879714047e-05, "loss": 0.1328, "step": 7625 }, { "epoch": 0.7787844853914283, "grad_norm": 5.381333827972412, "learning_rate": 1.8762527430264916e-05, "loss": 0.1857, "step": 7650 }, { "epoch": 0.7813295327293087, "grad_norm": 6.377498149871826, "learning_rate": 1.8751215980815783e-05, "loss": 0.1954, "step": 7675 }, { "epoch": 0.7838745800671892, "grad_norm": 7.693028926849365, "learning_rate": 1.8739904531366652e-05, "loss": 0.1599, "step": 7700 }, { "epoch": 0.7864196274050698, "grad_norm": 5.4145426750183105, "learning_rate": 1.8728593081917518e-05, "loss": 0.1917, "step": 7725 }, { "epoch": 0.7889646747429502, "grad_norm": 4.35727596282959, "learning_rate": 1.8717281632468388e-05, "loss": 0.1735, "step": 7750 }, { "epoch": 0.7915097220808307, "grad_norm": 8.166193008422852, "learning_rate": 1.8705970183019254e-05, "loss": 0.158, "step": 7775 }, { "epoch": 0.7940547694187112, "grad_norm": 2.884567975997925, "learning_rate": 1.8695111191548085e-05, "loss": 0.1422, "step": 7800 }, { "epoch": 0.7965998167565916, "grad_norm": 7.425231456756592, "learning_rate": 1.8683799742098954e-05, "loss": 0.1841, "step": 7825 }, { "epoch": 0.7991448640944722, "grad_norm": 5.505438804626465, "learning_rate": 1.867248829264982e-05, "loss": 0.1473, "step": 7850 }, { "epoch": 0.8016899114323527, "grad_norm": 8.657942771911621, "learning_rate": 1.866117684320069e-05, "loss": 0.1494, "step": 7875 }, { "epoch": 0.8042349587702331, "grad_norm": 10.716988563537598, "learning_rate": 1.8649865393751556e-05, "loss": 0.1657, "step": 7900 }, { "epoch": 0.8067800061081136, "grad_norm": 5.407134056091309, "learning_rate": 1.8638553944302426e-05, "loss": 0.1637, "step": 7925 }, { "epoch": 0.8093250534459941, "grad_norm": 3.173985719680786, "learning_rate": 1.8627242494853292e-05, "loss": 0.1618, "step": 7950 }, { "epoch": 0.8118701007838746, "grad_norm": 7.4745330810546875, "learning_rate": 1.8615931045404158e-05, "loss": 0.1896, "step": 7975 }, { "epoch": 0.8144151481217551, "grad_norm": 4.304909706115723, "learning_rate": 1.8604619595955028e-05, "loss": 0.1893, "step": 8000 }, { "epoch": 0.8169601954596355, "grad_norm": 7.029309272766113, "learning_rate": 1.8593308146505894e-05, "loss": 0.2058, "step": 8025 }, { "epoch": 0.819505242797516, "grad_norm": 7.313731670379639, "learning_rate": 1.8581996697056763e-05, "loss": 0.1796, "step": 8050 }, { "epoch": 0.8220502901353965, "grad_norm": 5.254171371459961, "learning_rate": 1.857068524760763e-05, "loss": 0.1906, "step": 8075 }, { "epoch": 0.824595337473277, "grad_norm": 4.730832099914551, "learning_rate": 1.85593737981585e-05, "loss": 0.1581, "step": 8100 }, { "epoch": 0.8271403848111575, "grad_norm": 7.461226463317871, "learning_rate": 1.8548062348709365e-05, "loss": 0.1871, "step": 8125 }, { "epoch": 0.829685432149038, "grad_norm": 3.6966590881347656, "learning_rate": 1.853675089926023e-05, "loss": 0.1231, "step": 8150 }, { "epoch": 0.8322304794869184, "grad_norm": 9.96469783782959, "learning_rate": 1.85254394498111e-05, "loss": 0.2116, "step": 8175 }, { "epoch": 0.8347755268247989, "grad_norm": 6.719648361206055, "learning_rate": 1.8514128000361967e-05, "loss": 0.1809, "step": 8200 }, { "epoch": 0.8373205741626795, "grad_norm": 2.1537251472473145, "learning_rate": 1.8502816550912837e-05, "loss": 0.1456, "step": 8225 }, { "epoch": 0.8398656215005599, "grad_norm": 7.317068576812744, "learning_rate": 1.8491505101463703e-05, "loss": 0.1526, "step": 8250 }, { "epoch": 0.8424106688384404, "grad_norm": 11.354976654052734, "learning_rate": 1.8480193652014572e-05, "loss": 0.1698, "step": 8275 }, { "epoch": 0.8449557161763209, "grad_norm": 4.932126045227051, "learning_rate": 1.846888220256544e-05, "loss": 0.1893, "step": 8300 }, { "epoch": 0.8475007635142013, "grad_norm": 7.772342681884766, "learning_rate": 1.8457570753116305e-05, "loss": 0.1755, "step": 8325 }, { "epoch": 0.8500458108520819, "grad_norm": 6.9974541664123535, "learning_rate": 1.8446259303667174e-05, "loss": 0.1748, "step": 8350 }, { "epoch": 0.8525908581899624, "grad_norm": 3.3724286556243896, "learning_rate": 1.843494785421804e-05, "loss": 0.172, "step": 8375 }, { "epoch": 0.8551359055278428, "grad_norm": 3.287820339202881, "learning_rate": 1.8423636404768906e-05, "loss": 0.1222, "step": 8400 }, { "epoch": 0.8576809528657233, "grad_norm": 4.461840629577637, "learning_rate": 1.8412324955319776e-05, "loss": 0.1604, "step": 8425 }, { "epoch": 0.8602260002036037, "grad_norm": 10.443443298339844, "learning_rate": 1.8401013505870646e-05, "loss": 0.1862, "step": 8450 }, { "epoch": 0.8627710475414843, "grad_norm": 6.476747035980225, "learning_rate": 1.838970205642151e-05, "loss": 0.1848, "step": 8475 }, { "epoch": 0.8653160948793648, "grad_norm": 8.956707000732422, "learning_rate": 1.8378390606972378e-05, "loss": 0.1445, "step": 8500 }, { "epoch": 0.8678611422172452, "grad_norm": 2.2105791568756104, "learning_rate": 1.8367079157523247e-05, "loss": 0.1686, "step": 8525 }, { "epoch": 0.8704061895551257, "grad_norm": 10.493624687194824, "learning_rate": 1.8355767708074114e-05, "loss": 0.1835, "step": 8550 }, { "epoch": 0.8729512368930062, "grad_norm": 9.583802223205566, "learning_rate": 1.834445625862498e-05, "loss": 0.1464, "step": 8575 }, { "epoch": 0.8754962842308867, "grad_norm": 10.556769371032715, "learning_rate": 1.833314480917585e-05, "loss": 0.187, "step": 8600 }, { "epoch": 0.8780413315687672, "grad_norm": 5.583988666534424, "learning_rate": 1.832183335972672e-05, "loss": 0.1457, "step": 8625 }, { "epoch": 0.8805863789066477, "grad_norm": 3.9103634357452393, "learning_rate": 1.8310521910277585e-05, "loss": 0.1535, "step": 8650 }, { "epoch": 0.8831314262445281, "grad_norm": 5.067739009857178, "learning_rate": 1.829921046082845e-05, "loss": 0.1457, "step": 8675 }, { "epoch": 0.8856764735824086, "grad_norm": 4.853885650634766, "learning_rate": 1.828789901137932e-05, "loss": 0.1417, "step": 8700 }, { "epoch": 0.8882215209202892, "grad_norm": 9.625697135925293, "learning_rate": 1.8276587561930187e-05, "loss": 0.1428, "step": 8725 }, { "epoch": 0.8907665682581696, "grad_norm": 7.169886112213135, "learning_rate": 1.8265276112481053e-05, "loss": 0.1764, "step": 8750 }, { "epoch": 0.8933116155960501, "grad_norm": 3.6491191387176514, "learning_rate": 1.8253964663031922e-05, "loss": 0.1454, "step": 8775 }, { "epoch": 0.8958566629339306, "grad_norm": 6.77921724319458, "learning_rate": 1.8242653213582792e-05, "loss": 0.2102, "step": 8800 }, { "epoch": 0.898401710271811, "grad_norm": 5.925548076629639, "learning_rate": 1.8231341764133658e-05, "loss": 0.1703, "step": 8825 }, { "epoch": 0.9009467576096916, "grad_norm": 5.68595552444458, "learning_rate": 1.8220030314684524e-05, "loss": 0.1503, "step": 8850 }, { "epoch": 0.903491804947572, "grad_norm": 4.654073238372803, "learning_rate": 1.8208718865235394e-05, "loss": 0.1741, "step": 8875 }, { "epoch": 0.9060368522854525, "grad_norm": 4.704840660095215, "learning_rate": 1.819740741578626e-05, "loss": 0.1608, "step": 8900 }, { "epoch": 0.908581899623333, "grad_norm": 4.562308311462402, "learning_rate": 1.8186095966337126e-05, "loss": 0.1881, "step": 8925 }, { "epoch": 0.9111269469612134, "grad_norm": 7.123320579528809, "learning_rate": 1.8174784516887996e-05, "loss": 0.1739, "step": 8950 }, { "epoch": 0.913671994299094, "grad_norm": 1.0297778844833374, "learning_rate": 1.8163473067438865e-05, "loss": 0.1314, "step": 8975 }, { "epoch": 0.9162170416369745, "grad_norm": 2.9744787216186523, "learning_rate": 1.815216161798973e-05, "loss": 0.1516, "step": 9000 }, { "epoch": 0.9187620889748549, "grad_norm": 7.400439262390137, "learning_rate": 1.8140850168540597e-05, "loss": 0.1486, "step": 9025 }, { "epoch": 0.9213071363127354, "grad_norm": 7.018895149230957, "learning_rate": 1.8129538719091467e-05, "loss": 0.1222, "step": 9050 }, { "epoch": 0.9238521836506159, "grad_norm": 5.089198589324951, "learning_rate": 1.8118227269642333e-05, "loss": 0.2115, "step": 9075 }, { "epoch": 0.9263972309884964, "grad_norm": 6.323294162750244, "learning_rate": 1.81069158201932e-05, "loss": 0.1607, "step": 9100 }, { "epoch": 0.9289422783263769, "grad_norm": 3.201341390609741, "learning_rate": 1.809560437074407e-05, "loss": 0.1259, "step": 9125 }, { "epoch": 0.9314873256642574, "grad_norm": 6.361789703369141, "learning_rate": 1.808429292129494e-05, "loss": 0.1885, "step": 9150 }, { "epoch": 0.9340323730021378, "grad_norm": 4.8685383796691895, "learning_rate": 1.8072981471845805e-05, "loss": 0.1819, "step": 9175 }, { "epoch": 0.9365774203400183, "grad_norm": 6.192000865936279, "learning_rate": 1.806167002239667e-05, "loss": 0.1732, "step": 9200 }, { "epoch": 0.9391224676778988, "grad_norm": 7.19197940826416, "learning_rate": 1.805035857294754e-05, "loss": 0.1635, "step": 9225 }, { "epoch": 0.9416675150157793, "grad_norm": 8.539782524108887, "learning_rate": 1.8039047123498406e-05, "loss": 0.1896, "step": 9250 }, { "epoch": 0.9442125623536598, "grad_norm": 5.8749494552612305, "learning_rate": 1.8027735674049273e-05, "loss": 0.1402, "step": 9275 }, { "epoch": 0.9467576096915402, "grad_norm": 1.8965274095535278, "learning_rate": 1.8016424224600142e-05, "loss": 0.1247, "step": 9300 }, { "epoch": 0.9493026570294207, "grad_norm": 0.6471710801124573, "learning_rate": 1.800511277515101e-05, "loss": 0.1337, "step": 9325 }, { "epoch": 0.9518477043673013, "grad_norm": 9.45545768737793, "learning_rate": 1.7993801325701878e-05, "loss": 0.1447, "step": 9350 }, { "epoch": 0.9543927517051817, "grad_norm": 3.9917619228363037, "learning_rate": 1.7982489876252744e-05, "loss": 0.1978, "step": 9375 }, { "epoch": 0.9569377990430622, "grad_norm": 5.922330379486084, "learning_rate": 1.7971178426803613e-05, "loss": 0.1773, "step": 9400 }, { "epoch": 0.9594828463809427, "grad_norm": 7.029813289642334, "learning_rate": 1.795986697735448e-05, "loss": 0.185, "step": 9425 }, { "epoch": 0.9620278937188231, "grad_norm": 2.9291632175445557, "learning_rate": 1.7948555527905346e-05, "loss": 0.1906, "step": 9450 }, { "epoch": 0.9645729410567037, "grad_norm": 2.9473659992218018, "learning_rate": 1.7937244078456215e-05, "loss": 0.1606, "step": 9475 }, { "epoch": 0.9671179883945842, "grad_norm": 7.865957260131836, "learning_rate": 1.7925932629007085e-05, "loss": 0.1815, "step": 9500 }, { "epoch": 0.9696630357324646, "grad_norm": 4.849057197570801, "learning_rate": 1.791462117955795e-05, "loss": 0.1642, "step": 9525 }, { "epoch": 0.9722080830703451, "grad_norm": 6.303313255310059, "learning_rate": 1.7903309730108817e-05, "loss": 0.1785, "step": 9550 }, { "epoch": 0.9747531304082256, "grad_norm": 2.399278402328491, "learning_rate": 1.7891998280659687e-05, "loss": 0.1578, "step": 9575 }, { "epoch": 0.9772981777461061, "grad_norm": 7.749370574951172, "learning_rate": 1.7880686831210553e-05, "loss": 0.1341, "step": 9600 }, { "epoch": 0.9798432250839866, "grad_norm": 4.824361801147461, "learning_rate": 1.786937538176142e-05, "loss": 0.1381, "step": 9625 }, { "epoch": 0.982388272421867, "grad_norm": 4.130845069885254, "learning_rate": 1.785806393231229e-05, "loss": 0.1397, "step": 9650 }, { "epoch": 0.9849333197597475, "grad_norm": 3.9672348499298096, "learning_rate": 1.7846752482863158e-05, "loss": 0.1547, "step": 9675 }, { "epoch": 0.987478367097628, "grad_norm": 4.505708694458008, "learning_rate": 1.7835441033414024e-05, "loss": 0.1801, "step": 9700 }, { "epoch": 0.9900234144355085, "grad_norm": 5.979536533355713, "learning_rate": 1.782412958396489e-05, "loss": 0.1419, "step": 9725 }, { "epoch": 0.992568461773389, "grad_norm": 3.7445387840270996, "learning_rate": 1.781281813451576e-05, "loss": 0.1665, "step": 9750 }, { "epoch": 0.9951135091112695, "grad_norm": 3.4875290393829346, "learning_rate": 1.7801506685066626e-05, "loss": 0.1535, "step": 9775 }, { "epoch": 0.9976585564491499, "grad_norm": 3.284198760986328, "learning_rate": 1.7790195235617492e-05, "loss": 0.1482, "step": 9800 }, { "epoch": 1.0, "eval_loss": 0.10157867521047592, "eval_runtime": 7.0586, "eval_samples_per_second": 964.495, "eval_steps_per_second": 15.159, "step": 9823 }, { "epoch": 1.0002036037870305, "grad_norm": 2.1478254795074463, "learning_rate": 1.7778883786168362e-05, "loss": 0.0948, "step": 9825 }, { "epoch": 1.0027486511249109, "grad_norm": 5.228854656219482, "learning_rate": 1.7767572336719228e-05, "loss": 0.1329, "step": 9850 }, { "epoch": 1.0052936984627914, "grad_norm": 1.1476223468780518, "learning_rate": 1.7756260887270097e-05, "loss": 0.1203, "step": 9875 }, { "epoch": 1.007838745800672, "grad_norm": 5.739945411682129, "learning_rate": 1.7744949437820964e-05, "loss": 0.1418, "step": 9900 }, { "epoch": 1.0103837931385524, "grad_norm": 2.3966896533966064, "learning_rate": 1.7733637988371833e-05, "loss": 0.1122, "step": 9925 }, { "epoch": 1.012928840476433, "grad_norm": 4.959926128387451, "learning_rate": 1.77223265389227e-05, "loss": 0.1192, "step": 9950 }, { "epoch": 1.0154738878143132, "grad_norm": 5.601362705230713, "learning_rate": 1.7711015089473565e-05, "loss": 0.0988, "step": 9975 }, { "epoch": 1.0180189351521938, "grad_norm": 3.998955249786377, "learning_rate": 1.7699703640024435e-05, "loss": 0.1295, "step": 10000 }, { "epoch": 1.0205639824900743, "grad_norm": 4.754059314727783, "learning_rate": 1.76883921905753e-05, "loss": 0.1348, "step": 10025 }, { "epoch": 1.0231090298279548, "grad_norm": 4.0749735832214355, "learning_rate": 1.767708074112617e-05, "loss": 0.1333, "step": 10050 }, { "epoch": 1.0256540771658353, "grad_norm": 10.547972679138184, "learning_rate": 1.7665769291677037e-05, "loss": 0.1693, "step": 10075 }, { "epoch": 1.0281991245037159, "grad_norm": 6.241606712341309, "learning_rate": 1.7654457842227906e-05, "loss": 0.0902, "step": 10100 }, { "epoch": 1.0307441718415962, "grad_norm": 2.9035451412200928, "learning_rate": 1.7643146392778773e-05, "loss": 0.1196, "step": 10125 }, { "epoch": 1.0332892191794767, "grad_norm": 5.422848224639893, "learning_rate": 1.763183494332964e-05, "loss": 0.1624, "step": 10150 }, { "epoch": 1.0358342665173572, "grad_norm": 4.132188320159912, "learning_rate": 1.7620523493880508e-05, "loss": 0.1308, "step": 10175 }, { "epoch": 1.0383793138552377, "grad_norm": 5.723974704742432, "learning_rate": 1.7609212044431374e-05, "loss": 0.1226, "step": 10200 }, { "epoch": 1.0409243611931183, "grad_norm": 2.8668813705444336, "learning_rate": 1.7597900594982244e-05, "loss": 0.1049, "step": 10225 }, { "epoch": 1.0434694085309988, "grad_norm": 6.671713829040527, "learning_rate": 1.758658914553311e-05, "loss": 0.1377, "step": 10250 }, { "epoch": 1.046014455868879, "grad_norm": 1.478005290031433, "learning_rate": 1.7575277696083976e-05, "loss": 0.1183, "step": 10275 }, { "epoch": 1.0485595032067596, "grad_norm": 7.64422607421875, "learning_rate": 1.7563966246634846e-05, "loss": 0.1262, "step": 10300 }, { "epoch": 1.0511045505446401, "grad_norm": 9.314327239990234, "learning_rate": 1.7552654797185712e-05, "loss": 0.1159, "step": 10325 }, { "epoch": 1.0536495978825207, "grad_norm": 7.101593017578125, "learning_rate": 1.754134334773658e-05, "loss": 0.1545, "step": 10350 }, { "epoch": 1.0561946452204012, "grad_norm": 3.6233043670654297, "learning_rate": 1.7530031898287448e-05, "loss": 0.1602, "step": 10375 }, { "epoch": 1.0587396925582815, "grad_norm": 0.4728439450263977, "learning_rate": 1.7518720448838317e-05, "loss": 0.1373, "step": 10400 }, { "epoch": 1.061284739896162, "grad_norm": 2.2750091552734375, "learning_rate": 1.7507408999389183e-05, "loss": 0.1503, "step": 10425 }, { "epoch": 1.0638297872340425, "grad_norm": 6.410482406616211, "learning_rate": 1.749609754994005e-05, "loss": 0.1787, "step": 10450 }, { "epoch": 1.066374834571923, "grad_norm": 5.287363052368164, "learning_rate": 1.748478610049092e-05, "loss": 0.1642, "step": 10475 }, { "epoch": 1.0689198819098036, "grad_norm": 4.545390605926514, "learning_rate": 1.7473474651041785e-05, "loss": 0.1128, "step": 10500 }, { "epoch": 1.071464929247684, "grad_norm": 4.9489288330078125, "learning_rate": 1.7462163201592655e-05, "loss": 0.1206, "step": 10525 }, { "epoch": 1.0740099765855644, "grad_norm": 3.488626003265381, "learning_rate": 1.745085175214352e-05, "loss": 0.1176, "step": 10550 }, { "epoch": 1.076555023923445, "grad_norm": 5.163557529449463, "learning_rate": 1.743954030269439e-05, "loss": 0.1531, "step": 10575 }, { "epoch": 1.0791000712613255, "grad_norm": 7.763261795043945, "learning_rate": 1.7428228853245256e-05, "loss": 0.1371, "step": 10600 }, { "epoch": 1.081645118599206, "grad_norm": 3.1402170658111572, "learning_rate": 1.7416917403796123e-05, "loss": 0.1588, "step": 10625 }, { "epoch": 1.0841901659370865, "grad_norm": 3.7544312477111816, "learning_rate": 1.7405605954346992e-05, "loss": 0.1445, "step": 10650 }, { "epoch": 1.086735213274967, "grad_norm": 6.183403968811035, "learning_rate": 1.739429450489786e-05, "loss": 0.1113, "step": 10675 }, { "epoch": 1.0892802606128473, "grad_norm": 6.552886962890625, "learning_rate": 1.7382983055448724e-05, "loss": 0.1347, "step": 10700 }, { "epoch": 1.0918253079507279, "grad_norm": 3.3716797828674316, "learning_rate": 1.7371671605999594e-05, "loss": 0.1546, "step": 10725 }, { "epoch": 1.0943703552886084, "grad_norm": 4.622589111328125, "learning_rate": 1.7360360156550464e-05, "loss": 0.1687, "step": 10750 }, { "epoch": 1.096915402626489, "grad_norm": 4.305492401123047, "learning_rate": 1.734904870710133e-05, "loss": 0.1446, "step": 10775 }, { "epoch": 1.0994604499643694, "grad_norm": 6.439985275268555, "learning_rate": 1.7337737257652196e-05, "loss": 0.1295, "step": 10800 }, { "epoch": 1.1020054973022497, "grad_norm": 3.2125208377838135, "learning_rate": 1.7326425808203065e-05, "loss": 0.1218, "step": 10825 }, { "epoch": 1.1045505446401302, "grad_norm": 6.812816143035889, "learning_rate": 1.731511435875393e-05, "loss": 0.1201, "step": 10850 }, { "epoch": 1.1070955919780108, "grad_norm": 9.083202362060547, "learning_rate": 1.7303802909304798e-05, "loss": 0.1475, "step": 10875 }, { "epoch": 1.1096406393158913, "grad_norm": 9.970014572143555, "learning_rate": 1.7292491459855667e-05, "loss": 0.1278, "step": 10900 }, { "epoch": 1.1121856866537718, "grad_norm": 6.782029151916504, "learning_rate": 1.7281180010406537e-05, "loss": 0.1655, "step": 10925 }, { "epoch": 1.1147307339916523, "grad_norm": 2.6370034217834473, "learning_rate": 1.7269868560957403e-05, "loss": 0.134, "step": 10950 }, { "epoch": 1.1172757813295326, "grad_norm": 10.335969924926758, "learning_rate": 1.725855711150827e-05, "loss": 0.1174, "step": 10975 }, { "epoch": 1.1198208286674132, "grad_norm": 7.169666290283203, "learning_rate": 1.724724566205914e-05, "loss": 0.1559, "step": 11000 }, { "epoch": 1.1223658760052937, "grad_norm": 3.894927740097046, "learning_rate": 1.7235934212610005e-05, "loss": 0.1276, "step": 11025 }, { "epoch": 1.1249109233431742, "grad_norm": 8.561812400817871, "learning_rate": 1.722462276316087e-05, "loss": 0.1026, "step": 11050 }, { "epoch": 1.1274559706810547, "grad_norm": 7.845750331878662, "learning_rate": 1.721331131371174e-05, "loss": 0.1386, "step": 11075 }, { "epoch": 1.1300010180189353, "grad_norm": 3.596691370010376, "learning_rate": 1.720199986426261e-05, "loss": 0.1558, "step": 11100 }, { "epoch": 1.1325460653568156, "grad_norm": 2.8355915546417236, "learning_rate": 1.7190688414813476e-05, "loss": 0.1428, "step": 11125 }, { "epoch": 1.135091112694696, "grad_norm": 3.2784042358398438, "learning_rate": 1.7179376965364342e-05, "loss": 0.1838, "step": 11150 }, { "epoch": 1.1376361600325766, "grad_norm": 5.514401912689209, "learning_rate": 1.7168065515915212e-05, "loss": 0.0894, "step": 11175 }, { "epoch": 1.1401812073704571, "grad_norm": 6.449738502502441, "learning_rate": 1.7156754066466078e-05, "loss": 0.125, "step": 11200 }, { "epoch": 1.1427262547083377, "grad_norm": 3.2359158992767334, "learning_rate": 1.7145442617016944e-05, "loss": 0.1509, "step": 11225 }, { "epoch": 1.145271302046218, "grad_norm": 4.851259708404541, "learning_rate": 1.7134131167567814e-05, "loss": 0.1359, "step": 11250 }, { "epoch": 1.1478163493840985, "grad_norm": 3.3549108505249023, "learning_rate": 1.7122819718118683e-05, "loss": 0.1086, "step": 11275 }, { "epoch": 1.150361396721979, "grad_norm": 7.090845584869385, "learning_rate": 1.711150826866955e-05, "loss": 0.1361, "step": 11300 }, { "epoch": 1.1529064440598595, "grad_norm": 0.37024664878845215, "learning_rate": 1.7100196819220416e-05, "loss": 0.0929, "step": 11325 }, { "epoch": 1.15545149139774, "grad_norm": 2.443056583404541, "learning_rate": 1.7088885369771285e-05, "loss": 0.1291, "step": 11350 }, { "epoch": 1.1579965387356204, "grad_norm": 2.6655914783477783, "learning_rate": 1.707757392032215e-05, "loss": 0.0942, "step": 11375 }, { "epoch": 1.1605415860735009, "grad_norm": 4.226424217224121, "learning_rate": 1.7066262470873017e-05, "loss": 0.1658, "step": 11400 }, { "epoch": 1.1630866334113814, "grad_norm": 7.379846096038818, "learning_rate": 1.7054951021423887e-05, "loss": 0.1238, "step": 11425 }, { "epoch": 1.165631680749262, "grad_norm": 4.336475849151611, "learning_rate": 1.7043639571974756e-05, "loss": 0.143, "step": 11450 }, { "epoch": 1.1681767280871425, "grad_norm": 7.550657272338867, "learning_rate": 1.7032328122525623e-05, "loss": 0.1232, "step": 11475 }, { "epoch": 1.170721775425023, "grad_norm": 2.325849771499634, "learning_rate": 1.702101667307649e-05, "loss": 0.1253, "step": 11500 }, { "epoch": 1.1732668227629035, "grad_norm": 0.20483657717704773, "learning_rate": 1.7009705223627358e-05, "loss": 0.1415, "step": 11525 }, { "epoch": 1.1758118701007838, "grad_norm": 6.844511032104492, "learning_rate": 1.6998393774178224e-05, "loss": 0.1451, "step": 11550 }, { "epoch": 1.1783569174386643, "grad_norm": 2.30995512008667, "learning_rate": 1.698708232472909e-05, "loss": 0.1371, "step": 11575 }, { "epoch": 1.1809019647765449, "grad_norm": 5.756036758422852, "learning_rate": 1.697577087527996e-05, "loss": 0.1172, "step": 11600 }, { "epoch": 1.1834470121144254, "grad_norm": 4.841132164001465, "learning_rate": 1.696445942583083e-05, "loss": 0.1409, "step": 11625 }, { "epoch": 1.185992059452306, "grad_norm": 4.341500759124756, "learning_rate": 1.6953147976381696e-05, "loss": 0.1197, "step": 11650 }, { "epoch": 1.1885371067901862, "grad_norm": 3.437917947769165, "learning_rate": 1.6941836526932562e-05, "loss": 0.1384, "step": 11675 }, { "epoch": 1.1910821541280667, "grad_norm": 0.36744824051856995, "learning_rate": 1.693052507748343e-05, "loss": 0.1053, "step": 11700 }, { "epoch": 1.1936272014659473, "grad_norm": 6.063575744628906, "learning_rate": 1.6919213628034298e-05, "loss": 0.1402, "step": 11725 }, { "epoch": 1.1961722488038278, "grad_norm": 3.604062795639038, "learning_rate": 1.6907902178585164e-05, "loss": 0.1094, "step": 11750 }, { "epoch": 1.1987172961417083, "grad_norm": 6.017195701599121, "learning_rate": 1.6896590729136033e-05, "loss": 0.1301, "step": 11775 }, { "epoch": 1.2012623434795886, "grad_norm": 2.256840229034424, "learning_rate": 1.6885279279686903e-05, "loss": 0.1302, "step": 11800 }, { "epoch": 1.2038073908174691, "grad_norm": 2.5422046184539795, "learning_rate": 1.687396783023777e-05, "loss": 0.1111, "step": 11825 }, { "epoch": 1.2063524381553496, "grad_norm": 1.03132963180542, "learning_rate": 1.6862656380788635e-05, "loss": 0.1131, "step": 11850 }, { "epoch": 1.2088974854932302, "grad_norm": 3.2958521842956543, "learning_rate": 1.6851344931339505e-05, "loss": 0.1303, "step": 11875 }, { "epoch": 1.2114425328311107, "grad_norm": 4.987472057342529, "learning_rate": 1.684003348189037e-05, "loss": 0.1298, "step": 11900 }, { "epoch": 1.2139875801689912, "grad_norm": 2.972264289855957, "learning_rate": 1.6828722032441237e-05, "loss": 0.1426, "step": 11925 }, { "epoch": 1.2165326275068717, "grad_norm": 1.8931164741516113, "learning_rate": 1.6817410582992107e-05, "loss": 0.1437, "step": 11950 }, { "epoch": 1.219077674844752, "grad_norm": 4.350847244262695, "learning_rate": 1.6806099133542976e-05, "loss": 0.1301, "step": 11975 }, { "epoch": 1.2216227221826326, "grad_norm": 3.05570387840271, "learning_rate": 1.6794787684093842e-05, "loss": 0.1209, "step": 12000 }, { "epoch": 1.224167769520513, "grad_norm": 6.202553749084473, "learning_rate": 1.678347623464471e-05, "loss": 0.1203, "step": 12025 }, { "epoch": 1.2267128168583936, "grad_norm": 4.269680500030518, "learning_rate": 1.6772164785195578e-05, "loss": 0.1393, "step": 12050 }, { "epoch": 1.2292578641962741, "grad_norm": 13.57045841217041, "learning_rate": 1.6760853335746444e-05, "loss": 0.1442, "step": 12075 }, { "epoch": 1.2318029115341544, "grad_norm": 4.8651018142700195, "learning_rate": 1.674954188629731e-05, "loss": 0.1376, "step": 12100 }, { "epoch": 1.234347958872035, "grad_norm": 7.477112770080566, "learning_rate": 1.673823043684818e-05, "loss": 0.1278, "step": 12125 }, { "epoch": 1.2368930062099155, "grad_norm": 6.843944549560547, "learning_rate": 1.6726918987399046e-05, "loss": 0.1118, "step": 12150 }, { "epoch": 1.239438053547796, "grad_norm": 5.739787578582764, "learning_rate": 1.6715607537949915e-05, "loss": 0.1169, "step": 12175 }, { "epoch": 1.2419831008856765, "grad_norm": 2.3320653438568115, "learning_rate": 1.670429608850078e-05, "loss": 0.1234, "step": 12200 }, { "epoch": 1.2445281482235568, "grad_norm": 2.789673328399658, "learning_rate": 1.669298463905165e-05, "loss": 0.1482, "step": 12225 }, { "epoch": 1.2470731955614374, "grad_norm": 6.369720935821533, "learning_rate": 1.6681673189602517e-05, "loss": 0.1398, "step": 12250 }, { "epoch": 1.2496182428993179, "grad_norm": 5.785031318664551, "learning_rate": 1.6670361740153383e-05, "loss": 0.1433, "step": 12275 }, { "epoch": 1.2521632902371984, "grad_norm": 4.050633430480957, "learning_rate": 1.6659050290704253e-05, "loss": 0.1128, "step": 12300 }, { "epoch": 1.254708337575079, "grad_norm": 8.750215530395508, "learning_rate": 1.664773884125512e-05, "loss": 0.1636, "step": 12325 }, { "epoch": 1.2572533849129595, "grad_norm": 3.612732410430908, "learning_rate": 1.663642739180599e-05, "loss": 0.1722, "step": 12350 }, { "epoch": 1.25979843225084, "grad_norm": 5.242950439453125, "learning_rate": 1.6625115942356855e-05, "loss": 0.1421, "step": 12375 }, { "epoch": 1.2623434795887203, "grad_norm": 3.4691531658172607, "learning_rate": 1.6613804492907724e-05, "loss": 0.1413, "step": 12400 }, { "epoch": 1.2648885269266008, "grad_norm": 9.25007438659668, "learning_rate": 1.660249304345859e-05, "loss": 0.142, "step": 12425 }, { "epoch": 1.2674335742644813, "grad_norm": 4.163516044616699, "learning_rate": 1.6591181594009457e-05, "loss": 0.1528, "step": 12450 }, { "epoch": 1.2699786216023619, "grad_norm": 4.836366176605225, "learning_rate": 1.6579870144560326e-05, "loss": 0.13, "step": 12475 }, { "epoch": 1.2725236689402424, "grad_norm": 5.360639572143555, "learning_rate": 1.6568558695111192e-05, "loss": 0.1355, "step": 12500 }, { "epoch": 1.2750687162781227, "grad_norm": 3.893960952758789, "learning_rate": 1.6557247245662062e-05, "loss": 0.1, "step": 12525 }, { "epoch": 1.2776137636160032, "grad_norm": 5.7345991134643555, "learning_rate": 1.6545935796212928e-05, "loss": 0.1435, "step": 12550 }, { "epoch": 1.2801588109538837, "grad_norm": 8.154664993286133, "learning_rate": 1.6534624346763794e-05, "loss": 0.1381, "step": 12575 }, { "epoch": 1.2827038582917643, "grad_norm": 2.673351287841797, "learning_rate": 1.6523312897314664e-05, "loss": 0.127, "step": 12600 }, { "epoch": 1.2852489056296448, "grad_norm": 6.447815418243408, "learning_rate": 1.651200144786553e-05, "loss": 0.1345, "step": 12625 }, { "epoch": 1.287793952967525, "grad_norm": 6.898881435394287, "learning_rate": 1.65006899984164e-05, "loss": 0.1195, "step": 12650 }, { "epoch": 1.2903390003054056, "grad_norm": 6.427828311920166, "learning_rate": 1.6489378548967266e-05, "loss": 0.0898, "step": 12675 }, { "epoch": 1.2928840476432861, "grad_norm": 4.725465774536133, "learning_rate": 1.6478067099518135e-05, "loss": 0.1683, "step": 12700 }, { "epoch": 1.2954290949811667, "grad_norm": 5.63037633895874, "learning_rate": 1.6466755650069e-05, "loss": 0.1517, "step": 12725 }, { "epoch": 1.2979741423190472, "grad_norm": 4.656158447265625, "learning_rate": 1.6455444200619867e-05, "loss": 0.1627, "step": 12750 }, { "epoch": 1.3005191896569277, "grad_norm": 6.372010707855225, "learning_rate": 1.6444132751170737e-05, "loss": 0.1247, "step": 12775 }, { "epoch": 1.3030642369948082, "grad_norm": 5.822622299194336, "learning_rate": 1.6432821301721603e-05, "loss": 0.1421, "step": 12800 }, { "epoch": 1.3056092843326885, "grad_norm": 6.20810079574585, "learning_rate": 1.6421509852272473e-05, "loss": 0.1499, "step": 12825 }, { "epoch": 1.308154331670569, "grad_norm": 3.9495532512664795, "learning_rate": 1.641019840282334e-05, "loss": 0.1329, "step": 12850 }, { "epoch": 1.3106993790084496, "grad_norm": 4.609329700469971, "learning_rate": 1.639888695337421e-05, "loss": 0.1182, "step": 12875 }, { "epoch": 1.31324442634633, "grad_norm": 6.07358455657959, "learning_rate": 1.6387575503925075e-05, "loss": 0.1256, "step": 12900 }, { "epoch": 1.3157894736842106, "grad_norm": 1.8420575857162476, "learning_rate": 1.637626405447594e-05, "loss": 0.1173, "step": 12925 }, { "epoch": 1.318334521022091, "grad_norm": 7.779773235321045, "learning_rate": 1.636495260502681e-05, "loss": 0.1328, "step": 12950 }, { "epoch": 1.3208795683599714, "grad_norm": 8.295707702636719, "learning_rate": 1.6353641155577676e-05, "loss": 0.1315, "step": 12975 }, { "epoch": 1.323424615697852, "grad_norm": 3.329450845718384, "learning_rate": 1.6342329706128546e-05, "loss": 0.1224, "step": 13000 }, { "epoch": 1.3259696630357325, "grad_norm": 7.493209362030029, "learning_rate": 1.6331018256679412e-05, "loss": 0.1203, "step": 13025 }, { "epoch": 1.328514710373613, "grad_norm": 3.4829013347625732, "learning_rate": 1.631970680723028e-05, "loss": 0.1319, "step": 13050 }, { "epoch": 1.3310597577114933, "grad_norm": 4.998099327087402, "learning_rate": 1.6308395357781148e-05, "loss": 0.1272, "step": 13075 }, { "epoch": 1.3336048050493738, "grad_norm": 6.8269524574279785, "learning_rate": 1.6297083908332014e-05, "loss": 0.1759, "step": 13100 }, { "epoch": 1.3361498523872544, "grad_norm": 2.3060414791107178, "learning_rate": 1.6285772458882883e-05, "loss": 0.1174, "step": 13125 }, { "epoch": 1.338694899725135, "grad_norm": 2.708559274673462, "learning_rate": 1.627446100943375e-05, "loss": 0.1042, "step": 13150 }, { "epoch": 1.3412399470630154, "grad_norm": 5.088305950164795, "learning_rate": 1.6263149559984616e-05, "loss": 0.1399, "step": 13175 }, { "epoch": 1.343784994400896, "grad_norm": 3.7776496410369873, "learning_rate": 1.6251838110535485e-05, "loss": 0.1133, "step": 13200 }, { "epoch": 1.3463300417387765, "grad_norm": 1.5227245092391968, "learning_rate": 1.6240526661086355e-05, "loss": 0.1199, "step": 13225 }, { "epoch": 1.3488750890766568, "grad_norm": 3.7397494316101074, "learning_rate": 1.622921521163722e-05, "loss": 0.1066, "step": 13250 }, { "epoch": 1.3514201364145373, "grad_norm": 6.5920209884643555, "learning_rate": 1.6217903762188087e-05, "loss": 0.153, "step": 13275 }, { "epoch": 1.3539651837524178, "grad_norm": 8.203237533569336, "learning_rate": 1.6206592312738957e-05, "loss": 0.1267, "step": 13300 }, { "epoch": 1.3565102310902983, "grad_norm": 6.9769415855407715, "learning_rate": 1.6195280863289823e-05, "loss": 0.1227, "step": 13325 }, { "epoch": 1.3590552784281789, "grad_norm": 7.504185199737549, "learning_rate": 1.618396941384069e-05, "loss": 0.1384, "step": 13350 }, { "epoch": 1.3616003257660592, "grad_norm": 7.599664688110352, "learning_rate": 1.617265796439156e-05, "loss": 0.1268, "step": 13375 }, { "epoch": 1.3641453731039397, "grad_norm": 4.518153667449951, "learning_rate": 1.6161346514942428e-05, "loss": 0.1095, "step": 13400 }, { "epoch": 1.3666904204418202, "grad_norm": 5.297679901123047, "learning_rate": 1.6150035065493294e-05, "loss": 0.168, "step": 13425 }, { "epoch": 1.3692354677797007, "grad_norm": 5.273420810699463, "learning_rate": 1.613872361604416e-05, "loss": 0.1334, "step": 13450 }, { "epoch": 1.3717805151175813, "grad_norm": 3.286939859390259, "learning_rate": 1.612741216659503e-05, "loss": 0.085, "step": 13475 }, { "epoch": 1.3743255624554616, "grad_norm": 8.941014289855957, "learning_rate": 1.6116100717145896e-05, "loss": 0.135, "step": 13500 }, { "epoch": 1.376870609793342, "grad_norm": 4.475488185882568, "learning_rate": 1.6104789267696762e-05, "loss": 0.1172, "step": 13525 }, { "epoch": 1.3794156571312226, "grad_norm": 3.8119313716888428, "learning_rate": 1.6093477818247632e-05, "loss": 0.1344, "step": 13550 }, { "epoch": 1.3819607044691031, "grad_norm": 1.4924836158752441, "learning_rate": 1.60821663687985e-05, "loss": 0.1608, "step": 13575 }, { "epoch": 1.3845057518069837, "grad_norm": 5.886949062347412, "learning_rate": 1.6070854919349364e-05, "loss": 0.1453, "step": 13600 }, { "epoch": 1.3870507991448642, "grad_norm": 5.144103527069092, "learning_rate": 1.6059543469900234e-05, "loss": 0.1446, "step": 13625 }, { "epoch": 1.3895958464827447, "grad_norm": 7.620888710021973, "learning_rate": 1.6048232020451103e-05, "loss": 0.1304, "step": 13650 }, { "epoch": 1.392140893820625, "grad_norm": 5.124198913574219, "learning_rate": 1.603692057100197e-05, "loss": 0.1467, "step": 13675 }, { "epoch": 1.3946859411585055, "grad_norm": 0.9429985284805298, "learning_rate": 1.6025609121552835e-05, "loss": 0.0788, "step": 13700 }, { "epoch": 1.397230988496386, "grad_norm": 7.610143184661865, "learning_rate": 1.6014297672103705e-05, "loss": 0.1546, "step": 13725 }, { "epoch": 1.3997760358342666, "grad_norm": 3.9099016189575195, "learning_rate": 1.6002986222654574e-05, "loss": 0.0995, "step": 13750 }, { "epoch": 1.402321083172147, "grad_norm": 2.6341960430145264, "learning_rate": 1.5991674773205437e-05, "loss": 0.1381, "step": 13775 }, { "epoch": 1.4048661305100274, "grad_norm": 1.491227149963379, "learning_rate": 1.5980815781734275e-05, "loss": 0.1514, "step": 13800 }, { "epoch": 1.407411177847908, "grad_norm": 3.6851260662078857, "learning_rate": 1.596950433228514e-05, "loss": 0.1678, "step": 13825 }, { "epoch": 1.4099562251857884, "grad_norm": 6.391075134277344, "learning_rate": 1.5958192882836007e-05, "loss": 0.1329, "step": 13850 }, { "epoch": 1.412501272523669, "grad_norm": 4.474203109741211, "learning_rate": 1.5946881433386877e-05, "loss": 0.1137, "step": 13875 }, { "epoch": 1.4150463198615495, "grad_norm": 0.19339196383953094, "learning_rate": 1.5935569983937743e-05, "loss": 0.1353, "step": 13900 }, { "epoch": 1.4175913671994298, "grad_norm": 3.198152780532837, "learning_rate": 1.592425853448861e-05, "loss": 0.1347, "step": 13925 }, { "epoch": 1.4201364145373103, "grad_norm": 2.9210307598114014, "learning_rate": 1.591294708503948e-05, "loss": 0.0968, "step": 13950 }, { "epoch": 1.4226814618751908, "grad_norm": 0.791731059551239, "learning_rate": 1.5901635635590348e-05, "loss": 0.1162, "step": 13975 }, { "epoch": 1.4252265092130714, "grad_norm": 7.955835342407227, "learning_rate": 1.5890324186141214e-05, "loss": 0.1213, "step": 14000 }, { "epoch": 1.427771556550952, "grad_norm": 8.638495445251465, "learning_rate": 1.587901273669208e-05, "loss": 0.1339, "step": 14025 }, { "epoch": 1.4303166038888322, "grad_norm": 4.877995014190674, "learning_rate": 1.586770128724295e-05, "loss": 0.1296, "step": 14050 }, { "epoch": 1.432861651226713, "grad_norm": 10.070806503295898, "learning_rate": 1.5856389837793816e-05, "loss": 0.1447, "step": 14075 }, { "epoch": 1.4354066985645932, "grad_norm": 3.1885571479797363, "learning_rate": 1.5845078388344682e-05, "loss": 0.1256, "step": 14100 }, { "epoch": 1.4379517459024738, "grad_norm": 9.940485954284668, "learning_rate": 1.5833766938895552e-05, "loss": 0.1325, "step": 14125 }, { "epoch": 1.4404967932403543, "grad_norm": 9.942497253417969, "learning_rate": 1.582245548944642e-05, "loss": 0.1258, "step": 14150 }, { "epoch": 1.4430418405782348, "grad_norm": 1.4657670259475708, "learning_rate": 1.5811144039997288e-05, "loss": 0.1295, "step": 14175 }, { "epoch": 1.4455868879161153, "grad_norm": 1.3000506162643433, "learning_rate": 1.5799832590548154e-05, "loss": 0.1533, "step": 14200 }, { "epoch": 1.4481319352539956, "grad_norm": 3.836527109146118, "learning_rate": 1.5788521141099023e-05, "loss": 0.1188, "step": 14225 }, { "epoch": 1.4506769825918762, "grad_norm": 7.592291355133057, "learning_rate": 1.577720969164989e-05, "loss": 0.1273, "step": 14250 }, { "epoch": 1.4532220299297567, "grad_norm": 0.6994871497154236, "learning_rate": 1.5765898242200756e-05, "loss": 0.1291, "step": 14275 }, { "epoch": 1.4557670772676372, "grad_norm": 5.725295066833496, "learning_rate": 1.5754586792751625e-05, "loss": 0.1534, "step": 14300 }, { "epoch": 1.4583121246055177, "grad_norm": 8.091775894165039, "learning_rate": 1.574327534330249e-05, "loss": 0.1256, "step": 14325 }, { "epoch": 1.460857171943398, "grad_norm": 1.0600472688674927, "learning_rate": 1.573196389385336e-05, "loss": 0.1317, "step": 14350 }, { "epoch": 1.4634022192812786, "grad_norm": 4.549612522125244, "learning_rate": 1.5720652444404227e-05, "loss": 0.1729, "step": 14375 }, { "epoch": 1.465947266619159, "grad_norm": 3.8479483127593994, "learning_rate": 1.5709340994955096e-05, "loss": 0.1498, "step": 14400 }, { "epoch": 1.4684923139570396, "grad_norm": 6.539849281311035, "learning_rate": 1.5698029545505963e-05, "loss": 0.113, "step": 14425 }, { "epoch": 1.4710373612949201, "grad_norm": 3.222187042236328, "learning_rate": 1.568671809605683e-05, "loss": 0.1169, "step": 14450 }, { "epoch": 1.4735824086328004, "grad_norm": 6.104904651641846, "learning_rate": 1.56754066466077e-05, "loss": 0.1429, "step": 14475 }, { "epoch": 1.4761274559706812, "grad_norm": 2.730999708175659, "learning_rate": 1.5664095197158564e-05, "loss": 0.1178, "step": 14500 }, { "epoch": 1.4786725033085615, "grad_norm": 6.132184982299805, "learning_rate": 1.5652783747709434e-05, "loss": 0.1333, "step": 14525 }, { "epoch": 1.481217550646442, "grad_norm": 2.02093505859375, "learning_rate": 1.56414722982603e-05, "loss": 0.1247, "step": 14550 }, { "epoch": 1.4837625979843225, "grad_norm": 5.066205978393555, "learning_rate": 1.563016084881117e-05, "loss": 0.08, "step": 14575 }, { "epoch": 1.486307645322203, "grad_norm": 2.487128973007202, "learning_rate": 1.5618849399362036e-05, "loss": 0.107, "step": 14600 }, { "epoch": 1.4888526926600836, "grad_norm": 4.138731479644775, "learning_rate": 1.5607537949912902e-05, "loss": 0.126, "step": 14625 }, { "epoch": 1.4913977399979639, "grad_norm": 3.9390079975128174, "learning_rate": 1.559622650046377e-05, "loss": 0.1206, "step": 14650 }, { "epoch": 1.4939427873358444, "grad_norm": 5.015639305114746, "learning_rate": 1.5584915051014638e-05, "loss": 0.1353, "step": 14675 }, { "epoch": 1.496487834673725, "grad_norm": 6.541854381561279, "learning_rate": 1.5573603601565507e-05, "loss": 0.1401, "step": 14700 }, { "epoch": 1.4990328820116054, "grad_norm": 8.357683181762695, "learning_rate": 1.5562292152116373e-05, "loss": 0.1244, "step": 14725 }, { "epoch": 1.501577929349486, "grad_norm": 0.8040742874145508, "learning_rate": 1.5550980702667243e-05, "loss": 0.127, "step": 14750 }, { "epoch": 1.5041229766873663, "grad_norm": 4.00300931930542, "learning_rate": 1.553966925321811e-05, "loss": 0.1174, "step": 14775 }, { "epoch": 1.506668024025247, "grad_norm": 4.621161460876465, "learning_rate": 1.5528357803768975e-05, "loss": 0.1155, "step": 14800 }, { "epoch": 1.5092130713631273, "grad_norm": 4.954023838043213, "learning_rate": 1.5517046354319845e-05, "loss": 0.1682, "step": 14825 }, { "epoch": 1.5117581187010078, "grad_norm": 1.7072348594665527, "learning_rate": 1.550573490487071e-05, "loss": 0.1423, "step": 14850 }, { "epoch": 1.5143031660388884, "grad_norm": 6.226248264312744, "learning_rate": 1.549442345542158e-05, "loss": 0.1285, "step": 14875 }, { "epoch": 1.5168482133767687, "grad_norm": 6.3383259773254395, "learning_rate": 1.5483112005972447e-05, "loss": 0.108, "step": 14900 }, { "epoch": 1.5193932607146494, "grad_norm": 4.042302131652832, "learning_rate": 1.5471800556523313e-05, "loss": 0.1085, "step": 14925 }, { "epoch": 1.5219383080525297, "grad_norm": 2.5637173652648926, "learning_rate": 1.5460489107074182e-05, "loss": 0.1202, "step": 14950 }, { "epoch": 1.5244833553904102, "grad_norm": 2.83852219581604, "learning_rate": 1.544917765762505e-05, "loss": 0.1306, "step": 14975 }, { "epoch": 1.5270284027282908, "grad_norm": 5.585648059844971, "learning_rate": 1.5437866208175918e-05, "loss": 0.1391, "step": 15000 }, { "epoch": 1.529573450066171, "grad_norm": 4.606128692626953, "learning_rate": 1.5426554758726784e-05, "loss": 0.1483, "step": 15025 }, { "epoch": 1.5321184974040518, "grad_norm": 9.550539016723633, "learning_rate": 1.5415243309277654e-05, "loss": 0.1047, "step": 15050 }, { "epoch": 1.5346635447419321, "grad_norm": 4.530235767364502, "learning_rate": 1.540393185982852e-05, "loss": 0.1181, "step": 15075 }, { "epoch": 1.5372085920798126, "grad_norm": 4.647919654846191, "learning_rate": 1.5392620410379386e-05, "loss": 0.118, "step": 15100 }, { "epoch": 1.5397536394176932, "grad_norm": 2.899007797241211, "learning_rate": 1.5381308960930256e-05, "loss": 0.1248, "step": 15125 }, { "epoch": 1.5422986867555737, "grad_norm": 11.52777099609375, "learning_rate": 1.536999751148112e-05, "loss": 0.1114, "step": 15150 }, { "epoch": 1.5448437340934542, "grad_norm": 1.5432101488113403, "learning_rate": 1.535868606203199e-05, "loss": 0.1162, "step": 15175 }, { "epoch": 1.5473887814313345, "grad_norm": 6.4070658683776855, "learning_rate": 1.5347374612582857e-05, "loss": 0.1618, "step": 15200 }, { "epoch": 1.5499338287692153, "grad_norm": 9.226142883300781, "learning_rate": 1.5336063163133727e-05, "loss": 0.1624, "step": 15225 }, { "epoch": 1.5524788761070956, "grad_norm": 1.0666420459747314, "learning_rate": 1.5324751713684593e-05, "loss": 0.1109, "step": 15250 }, { "epoch": 1.555023923444976, "grad_norm": 4.738593578338623, "learning_rate": 1.531344026423546e-05, "loss": 0.1333, "step": 15275 }, { "epoch": 1.5575689707828566, "grad_norm": 1.8429763317108154, "learning_rate": 1.530212881478633e-05, "loss": 0.1386, "step": 15300 }, { "epoch": 1.560114018120737, "grad_norm": 7.202968120574951, "learning_rate": 1.5290817365337195e-05, "loss": 0.1179, "step": 15325 }, { "epoch": 1.5626590654586177, "grad_norm": 9.482736587524414, "learning_rate": 1.527950591588806e-05, "loss": 0.16, "step": 15350 }, { "epoch": 1.565204112796498, "grad_norm": 3.1656908988952637, "learning_rate": 1.526819446643893e-05, "loss": 0.1053, "step": 15375 }, { "epoch": 1.5677491601343785, "grad_norm": 7.062704563140869, "learning_rate": 1.5256883016989798e-05, "loss": 0.1121, "step": 15400 }, { "epoch": 1.570294207472259, "grad_norm": 3.370267629623413, "learning_rate": 1.5245571567540666e-05, "loss": 0.112, "step": 15425 }, { "epoch": 1.5728392548101393, "grad_norm": 5.777838230133057, "learning_rate": 1.5234260118091532e-05, "loss": 0.1433, "step": 15450 }, { "epoch": 1.57538430214802, "grad_norm": 2.517376184463501, "learning_rate": 1.52229486686424e-05, "loss": 0.1068, "step": 15475 }, { "epoch": 1.5779293494859004, "grad_norm": 4.369190216064453, "learning_rate": 1.521163721919327e-05, "loss": 0.151, "step": 15500 }, { "epoch": 1.5804743968237809, "grad_norm": 1.2526782751083374, "learning_rate": 1.5200325769744136e-05, "loss": 0.1137, "step": 15525 }, { "epoch": 1.5830194441616614, "grad_norm": 3.6932199001312256, "learning_rate": 1.5189014320295004e-05, "loss": 0.1045, "step": 15550 }, { "epoch": 1.585564491499542, "grad_norm": 0.895078182220459, "learning_rate": 1.5177702870845872e-05, "loss": 0.1014, "step": 15575 }, { "epoch": 1.5881095388374225, "grad_norm": 4.923023223876953, "learning_rate": 1.516639142139674e-05, "loss": 0.1102, "step": 15600 }, { "epoch": 1.5906545861753028, "grad_norm": 6.089389801025391, "learning_rate": 1.5155079971947606e-05, "loss": 0.1446, "step": 15625 }, { "epoch": 1.5931996335131835, "grad_norm": 3.304971933364868, "learning_rate": 1.5143768522498474e-05, "loss": 0.098, "step": 15650 }, { "epoch": 1.5957446808510638, "grad_norm": 8.50531005859375, "learning_rate": 1.5132457073049343e-05, "loss": 0.1312, "step": 15675 }, { "epoch": 1.5982897281889443, "grad_norm": 2.5584232807159424, "learning_rate": 1.512114562360021e-05, "loss": 0.108, "step": 15700 }, { "epoch": 1.6008347755268248, "grad_norm": 7.9185590744018555, "learning_rate": 1.5109834174151077e-05, "loss": 0.1321, "step": 15725 }, { "epoch": 1.6033798228647052, "grad_norm": 4.034428119659424, "learning_rate": 1.5098522724701945e-05, "loss": 0.1454, "step": 15750 }, { "epoch": 1.605924870202586, "grad_norm": 0.8627036809921265, "learning_rate": 1.5087211275252811e-05, "loss": 0.1119, "step": 15775 }, { "epoch": 1.6084699175404662, "grad_norm": 5.200289726257324, "learning_rate": 1.5075899825803679e-05, "loss": 0.1152, "step": 15800 }, { "epoch": 1.6110149648783467, "grad_norm": 7.20810079574585, "learning_rate": 1.5064588376354547e-05, "loss": 0.0935, "step": 15825 }, { "epoch": 1.6135600122162272, "grad_norm": 1.9433012008666992, "learning_rate": 1.5053276926905416e-05, "loss": 0.1604, "step": 15850 }, { "epoch": 1.6161050595541075, "grad_norm": 3.855752944946289, "learning_rate": 1.5042417935434249e-05, "loss": 0.1198, "step": 15875 }, { "epoch": 1.6186501068919883, "grad_norm": 4.6323652267456055, "learning_rate": 1.5031106485985117e-05, "loss": 0.1409, "step": 15900 }, { "epoch": 1.6211951542298686, "grad_norm": 2.9197826385498047, "learning_rate": 1.5019795036535983e-05, "loss": 0.0972, "step": 15925 }, { "epoch": 1.6237402015677491, "grad_norm": 4.8417158126831055, "learning_rate": 1.500848358708685e-05, "loss": 0.1041, "step": 15950 }, { "epoch": 1.6262852489056296, "grad_norm": 0.9690869450569153, "learning_rate": 1.4997172137637719e-05, "loss": 0.1215, "step": 15975 }, { "epoch": 1.6288302962435102, "grad_norm": 5.960648536682129, "learning_rate": 1.4985860688188585e-05, "loss": 0.1569, "step": 16000 }, { "epoch": 1.6313753435813907, "grad_norm": 9.493753433227539, "learning_rate": 1.4974549238739453e-05, "loss": 0.1169, "step": 16025 }, { "epoch": 1.633920390919271, "grad_norm": 2.922004461288452, "learning_rate": 1.4963237789290322e-05, "loss": 0.112, "step": 16050 }, { "epoch": 1.6364654382571517, "grad_norm": 7.904358863830566, "learning_rate": 1.495192633984119e-05, "loss": 0.1441, "step": 16075 }, { "epoch": 1.639010485595032, "grad_norm": 0.4537775218486786, "learning_rate": 1.4940614890392056e-05, "loss": 0.0963, "step": 16100 }, { "epoch": 1.6415555329329126, "grad_norm": 8.233470916748047, "learning_rate": 1.4929303440942924e-05, "loss": 0.1392, "step": 16125 }, { "epoch": 1.644100580270793, "grad_norm": 10.062689781188965, "learning_rate": 1.4917991991493792e-05, "loss": 0.1349, "step": 16150 }, { "epoch": 1.6466456276086734, "grad_norm": 7.0017218589782715, "learning_rate": 1.4906680542044658e-05, "loss": 0.1563, "step": 16175 }, { "epoch": 1.6491906749465541, "grad_norm": 1.1416449546813965, "learning_rate": 1.4895369092595526e-05, "loss": 0.1232, "step": 16200 }, { "epoch": 1.6517357222844344, "grad_norm": 1.434586763381958, "learning_rate": 1.4884057643146395e-05, "loss": 0.1227, "step": 16225 }, { "epoch": 1.654280769622315, "grad_norm": 4.737888336181641, "learning_rate": 1.487274619369726e-05, "loss": 0.0974, "step": 16250 }, { "epoch": 1.6568258169601955, "grad_norm": 7.9679718017578125, "learning_rate": 1.486143474424813e-05, "loss": 0.1188, "step": 16275 }, { "epoch": 1.6593708642980758, "grad_norm": 3.9078972339630127, "learning_rate": 1.4850123294798997e-05, "loss": 0.0956, "step": 16300 }, { "epoch": 1.6619159116359565, "grad_norm": 0.9742254614830017, "learning_rate": 1.4838811845349865e-05, "loss": 0.1065, "step": 16325 }, { "epoch": 1.6644609589738368, "grad_norm": 8.059602737426758, "learning_rate": 1.4827500395900731e-05, "loss": 0.1578, "step": 16350 }, { "epoch": 1.6670060063117174, "grad_norm": 2.904923677444458, "learning_rate": 1.4816188946451599e-05, "loss": 0.0992, "step": 16375 }, { "epoch": 1.6695510536495979, "grad_norm": 3.145519733428955, "learning_rate": 1.4804877497002469e-05, "loss": 0.119, "step": 16400 }, { "epoch": 1.6720961009874784, "grad_norm": 3.9910943508148193, "learning_rate": 1.4793566047553333e-05, "loss": 0.1349, "step": 16425 }, { "epoch": 1.674641148325359, "grad_norm": 6.027825355529785, "learning_rate": 1.4782254598104203e-05, "loss": 0.1392, "step": 16450 }, { "epoch": 1.6771861956632392, "grad_norm": 2.852295398712158, "learning_rate": 1.477094314865507e-05, "loss": 0.1405, "step": 16475 }, { "epoch": 1.67973124300112, "grad_norm": 3.4339046478271484, "learning_rate": 1.4759631699205938e-05, "loss": 0.1204, "step": 16500 }, { "epoch": 1.6822762903390003, "grad_norm": 9.087562561035156, "learning_rate": 1.4748320249756804e-05, "loss": 0.1483, "step": 16525 }, { "epoch": 1.6848213376768808, "grad_norm": 8.998224258422852, "learning_rate": 1.4737008800307672e-05, "loss": 0.1192, "step": 16550 }, { "epoch": 1.6873663850147613, "grad_norm": 4.188343048095703, "learning_rate": 1.4725697350858542e-05, "loss": 0.1044, "step": 16575 }, { "epoch": 1.6899114323526416, "grad_norm": 0.9458564519882202, "learning_rate": 1.4714385901409406e-05, "loss": 0.1418, "step": 16600 }, { "epoch": 1.6924564796905224, "grad_norm": 7.491810321807861, "learning_rate": 1.4703074451960276e-05, "loss": 0.1299, "step": 16625 }, { "epoch": 1.6950015270284027, "grad_norm": 8.30079460144043, "learning_rate": 1.4691763002511144e-05, "loss": 0.1225, "step": 16650 }, { "epoch": 1.6975465743662832, "grad_norm": 4.6412272453308105, "learning_rate": 1.468045155306201e-05, "loss": 0.1487, "step": 16675 }, { "epoch": 1.7000916217041637, "grad_norm": 0.662234365940094, "learning_rate": 1.4669140103612878e-05, "loss": 0.0988, "step": 16700 }, { "epoch": 1.702636669042044, "grad_norm": 0.9764494895935059, "learning_rate": 1.4657828654163745e-05, "loss": 0.1137, "step": 16725 }, { "epoch": 1.7051817163799248, "grad_norm": 3.0529532432556152, "learning_rate": 1.4646517204714615e-05, "loss": 0.104, "step": 16750 }, { "epoch": 1.707726763717805, "grad_norm": 2.24320387840271, "learning_rate": 1.463520575526548e-05, "loss": 0.1038, "step": 16775 }, { "epoch": 1.7102718110556856, "grad_norm": 12.847587585449219, "learning_rate": 1.4623894305816349e-05, "loss": 0.1447, "step": 16800 }, { "epoch": 1.7128168583935661, "grad_norm": 2.5556368827819824, "learning_rate": 1.4612582856367217e-05, "loss": 0.1125, "step": 16825 }, { "epoch": 1.7153619057314466, "grad_norm": 1.6241698265075684, "learning_rate": 1.4601271406918083e-05, "loss": 0.0814, "step": 16850 }, { "epoch": 1.7179069530693272, "grad_norm": 7.694450855255127, "learning_rate": 1.4589959957468951e-05, "loss": 0.1414, "step": 16875 }, { "epoch": 1.7204520004072075, "grad_norm": 3.1575894355773926, "learning_rate": 1.4578648508019819e-05, "loss": 0.1473, "step": 16900 }, { "epoch": 1.7229970477450882, "grad_norm": 8.672683715820312, "learning_rate": 1.4567337058570688e-05, "loss": 0.1017, "step": 16925 }, { "epoch": 1.7255420950829685, "grad_norm": 6.179168701171875, "learning_rate": 1.4556025609121553e-05, "loss": 0.1454, "step": 16950 }, { "epoch": 1.728087142420849, "grad_norm": 4.374868392944336, "learning_rate": 1.4544714159672422e-05, "loss": 0.117, "step": 16975 }, { "epoch": 1.7306321897587296, "grad_norm": 3.160280227661133, "learning_rate": 1.453340271022329e-05, "loss": 0.0905, "step": 17000 }, { "epoch": 1.7331772370966099, "grad_norm": 3.6998531818389893, "learning_rate": 1.4522091260774156e-05, "loss": 0.1017, "step": 17025 }, { "epoch": 1.7357222844344906, "grad_norm": 5.593207359313965, "learning_rate": 1.4510779811325024e-05, "loss": 0.1295, "step": 17050 }, { "epoch": 1.738267331772371, "grad_norm": 6.101248264312744, "learning_rate": 1.4499468361875892e-05, "loss": 0.1057, "step": 17075 }, { "epoch": 1.7408123791102514, "grad_norm": 2.3513197898864746, "learning_rate": 1.4488156912426758e-05, "loss": 0.1048, "step": 17100 }, { "epoch": 1.743357426448132, "grad_norm": 5.067135810852051, "learning_rate": 1.4476845462977626e-05, "loss": 0.1083, "step": 17125 }, { "epoch": 1.7459024737860123, "grad_norm": 7.471678256988525, "learning_rate": 1.4465534013528495e-05, "loss": 0.1393, "step": 17150 }, { "epoch": 1.748447521123893, "grad_norm": 6.90990686416626, "learning_rate": 1.4454222564079363e-05, "loss": 0.1097, "step": 17175 }, { "epoch": 1.7509925684617733, "grad_norm": 6.522189617156982, "learning_rate": 1.444291111463023e-05, "loss": 0.1335, "step": 17200 }, { "epoch": 1.7535376157996538, "grad_norm": 3.7323899269104004, "learning_rate": 1.4431599665181097e-05, "loss": 0.1182, "step": 17225 }, { "epoch": 1.7560826631375344, "grad_norm": 5.302699089050293, "learning_rate": 1.4420288215731965e-05, "loss": 0.1103, "step": 17250 }, { "epoch": 1.7586277104754149, "grad_norm": 0.8149896860122681, "learning_rate": 1.4408976766282831e-05, "loss": 0.1399, "step": 17275 }, { "epoch": 1.7611727578132954, "grad_norm": 5.488561153411865, "learning_rate": 1.43976653168337e-05, "loss": 0.1057, "step": 17300 }, { "epoch": 1.7637178051511757, "grad_norm": 6.034390926361084, "learning_rate": 1.4386353867384569e-05, "loss": 0.1112, "step": 17325 }, { "epoch": 1.7662628524890565, "grad_norm": 4.187770366668701, "learning_rate": 1.4375042417935437e-05, "loss": 0.1282, "step": 17350 }, { "epoch": 1.7688078998269368, "grad_norm": 5.549367427825928, "learning_rate": 1.4363730968486303e-05, "loss": 0.1056, "step": 17375 }, { "epoch": 1.7713529471648173, "grad_norm": 1.8145831823349, "learning_rate": 1.435241951903717e-05, "loss": 0.0953, "step": 17400 }, { "epoch": 1.7738979945026978, "grad_norm": 2.8839643001556396, "learning_rate": 1.4341108069588038e-05, "loss": 0.1086, "step": 17425 }, { "epoch": 1.776443041840578, "grad_norm": 0.7977824807167053, "learning_rate": 1.4329796620138905e-05, "loss": 0.1209, "step": 17450 }, { "epoch": 1.7789880891784589, "grad_norm": 8.126140594482422, "learning_rate": 1.4318485170689772e-05, "loss": 0.1062, "step": 17475 }, { "epoch": 1.7815331365163392, "grad_norm": 4.229276657104492, "learning_rate": 1.4307173721240642e-05, "loss": 0.1279, "step": 17500 }, { "epoch": 1.7840781838542197, "grad_norm": 4.933651924133301, "learning_rate": 1.4295862271791508e-05, "loss": 0.1199, "step": 17525 }, { "epoch": 1.7866232311921002, "grad_norm": 4.240781784057617, "learning_rate": 1.4284550822342376e-05, "loss": 0.1176, "step": 17550 }, { "epoch": 1.7891682785299805, "grad_norm": 2.898059368133545, "learning_rate": 1.4273239372893244e-05, "loss": 0.0957, "step": 17575 }, { "epoch": 1.7917133258678613, "grad_norm": 3.8518853187561035, "learning_rate": 1.4261927923444112e-05, "loss": 0.0978, "step": 17600 }, { "epoch": 1.7942583732057416, "grad_norm": 3.217885732650757, "learning_rate": 1.4250616473994978e-05, "loss": 0.1244, "step": 17625 }, { "epoch": 1.796803420543622, "grad_norm": 2.9937503337860107, "learning_rate": 1.4239305024545846e-05, "loss": 0.1163, "step": 17650 }, { "epoch": 1.7993484678815026, "grad_norm": 3.574305534362793, "learning_rate": 1.4227993575096715e-05, "loss": 0.1512, "step": 17675 }, { "epoch": 1.8018935152193831, "grad_norm": 10.027637481689453, "learning_rate": 1.4216682125647581e-05, "loss": 0.1373, "step": 17700 }, { "epoch": 1.8044385625572636, "grad_norm": 1.1961842775344849, "learning_rate": 1.4205370676198449e-05, "loss": 0.1315, "step": 17725 }, { "epoch": 1.806983609895144, "grad_norm": 3.1464712619781494, "learning_rate": 1.4194059226749317e-05, "loss": 0.1205, "step": 17750 }, { "epoch": 1.8095286572330247, "grad_norm": 5.239190578460693, "learning_rate": 1.4182747777300185e-05, "loss": 0.1095, "step": 17775 }, { "epoch": 1.812073704570905, "grad_norm": 6.392675399780273, "learning_rate": 1.4171436327851051e-05, "loss": 0.1442, "step": 17800 }, { "epoch": 1.8146187519087855, "grad_norm": 5.46378755569458, "learning_rate": 1.4160124878401919e-05, "loss": 0.1126, "step": 17825 }, { "epoch": 1.817163799246666, "grad_norm": 3.494110107421875, "learning_rate": 1.4148813428952788e-05, "loss": 0.1166, "step": 17850 }, { "epoch": 1.8197088465845463, "grad_norm": 4.842367649078369, "learning_rate": 1.4137501979503655e-05, "loss": 0.1038, "step": 17875 }, { "epoch": 1.822253893922427, "grad_norm": 3.657884359359741, "learning_rate": 1.4126190530054522e-05, "loss": 0.1669, "step": 17900 }, { "epoch": 1.8247989412603074, "grad_norm": 3.146895408630371, "learning_rate": 1.411487908060539e-05, "loss": 0.1639, "step": 17925 }, { "epoch": 1.827343988598188, "grad_norm": 4.50389289855957, "learning_rate": 1.4103567631156258e-05, "loss": 0.1279, "step": 17950 }, { "epoch": 1.8298890359360684, "grad_norm": 7.898961067199707, "learning_rate": 1.4092256181707124e-05, "loss": 0.1571, "step": 17975 }, { "epoch": 1.8324340832739487, "grad_norm": 0.7812437415122986, "learning_rate": 1.4080944732257992e-05, "loss": 0.1268, "step": 18000 }, { "epoch": 1.8349791306118295, "grad_norm": 5.2326579093933105, "learning_rate": 1.4069633282808862e-05, "loss": 0.1148, "step": 18025 }, { "epoch": 1.8375241779497098, "grad_norm": 3.574772357940674, "learning_rate": 1.4058321833359728e-05, "loss": 0.1484, "step": 18050 }, { "epoch": 1.8400692252875903, "grad_norm": 5.390368461608887, "learning_rate": 1.4047010383910596e-05, "loss": 0.1509, "step": 18075 }, { "epoch": 1.8426142726254708, "grad_norm": 3.923863410949707, "learning_rate": 1.4035698934461463e-05, "loss": 0.1158, "step": 18100 }, { "epoch": 1.8451593199633514, "grad_norm": 1.9413028955459595, "learning_rate": 1.402438748501233e-05, "loss": 0.1092, "step": 18125 }, { "epoch": 1.8477043673012319, "grad_norm": 4.675200462341309, "learning_rate": 1.4013528493541164e-05, "loss": 0.0942, "step": 18150 }, { "epoch": 1.8502494146391122, "grad_norm": 4.470402717590332, "learning_rate": 1.400221704409203e-05, "loss": 0.1146, "step": 18175 }, { "epoch": 1.852794461976993, "grad_norm": 3.648837089538574, "learning_rate": 1.3990905594642898e-05, "loss": 0.1386, "step": 18200 }, { "epoch": 1.8553395093148732, "grad_norm": 3.77079701423645, "learning_rate": 1.3979594145193767e-05, "loss": 0.1316, "step": 18225 }, { "epoch": 1.8578845566527538, "grad_norm": 4.982840538024902, "learning_rate": 1.3968282695744635e-05, "loss": 0.13, "step": 18250 }, { "epoch": 1.8604296039906343, "grad_norm": 8.14816665649414, "learning_rate": 1.3956971246295501e-05, "loss": 0.0977, "step": 18275 }, { "epoch": 1.8629746513285146, "grad_norm": 2.803504467010498, "learning_rate": 1.394565979684637e-05, "loss": 0.1599, "step": 18300 }, { "epoch": 1.8655196986663953, "grad_norm": 3.785940408706665, "learning_rate": 1.3934348347397237e-05, "loss": 0.094, "step": 18325 }, { "epoch": 1.8680647460042756, "grad_norm": 0.8537298440933228, "learning_rate": 1.3923036897948103e-05, "loss": 0.0993, "step": 18350 }, { "epoch": 1.8706097933421562, "grad_norm": 3.6978354454040527, "learning_rate": 1.3911725448498971e-05, "loss": 0.1357, "step": 18375 }, { "epoch": 1.8731548406800367, "grad_norm": 2.4232470989227295, "learning_rate": 1.390041399904984e-05, "loss": 0.109, "step": 18400 }, { "epoch": 1.875699888017917, "grad_norm": 6.006959915161133, "learning_rate": 1.3889102549600705e-05, "loss": 0.1133, "step": 18425 }, { "epoch": 1.8782449353557977, "grad_norm": 3.3002946376800537, "learning_rate": 1.3877791100151575e-05, "loss": 0.1429, "step": 18450 }, { "epoch": 1.880789982693678, "grad_norm": 3.789641857147217, "learning_rate": 1.3866479650702443e-05, "loss": 0.1547, "step": 18475 }, { "epoch": 1.8833350300315586, "grad_norm": 5.670425891876221, "learning_rate": 1.385516820125331e-05, "loss": 0.1347, "step": 18500 }, { "epoch": 1.885880077369439, "grad_norm": 1.6732641458511353, "learning_rate": 1.3843856751804177e-05, "loss": 0.0954, "step": 18525 }, { "epoch": 1.8884251247073196, "grad_norm": 0.9046832919120789, "learning_rate": 1.3832545302355044e-05, "loss": 0.1444, "step": 18550 }, { "epoch": 1.8909701720452001, "grad_norm": 6.483386516571045, "learning_rate": 1.3821233852905914e-05, "loss": 0.1367, "step": 18575 }, { "epoch": 1.8935152193830804, "grad_norm": 4.16316556930542, "learning_rate": 1.3809922403456778e-05, "loss": 0.1292, "step": 18600 }, { "epoch": 1.8960602667209612, "grad_norm": 8.144868850708008, "learning_rate": 1.3798610954007648e-05, "loss": 0.0942, "step": 18625 }, { "epoch": 1.8986053140588415, "grad_norm": 5.008410453796387, "learning_rate": 1.3787299504558516e-05, "loss": 0.1263, "step": 18650 }, { "epoch": 1.901150361396722, "grad_norm": 5.292370796203613, "learning_rate": 1.3775988055109384e-05, "loss": 0.1237, "step": 18675 }, { "epoch": 1.9036954087346025, "grad_norm": 2.5285484790802, "learning_rate": 1.376467660566025e-05, "loss": 0.1126, "step": 18700 }, { "epoch": 1.9062404560724828, "grad_norm": 2.1306982040405273, "learning_rate": 1.3753365156211118e-05, "loss": 0.112, "step": 18725 }, { "epoch": 1.9087855034103636, "grad_norm": 0.5022045969963074, "learning_rate": 1.3742053706761987e-05, "loss": 0.1209, "step": 18750 }, { "epoch": 1.9113305507482439, "grad_norm": 3.427983045578003, "learning_rate": 1.3730742257312852e-05, "loss": 0.103, "step": 18775 }, { "epoch": 1.9138755980861244, "grad_norm": 3.986631393432617, "learning_rate": 1.3719430807863721e-05, "loss": 0.0819, "step": 18800 }, { "epoch": 1.916420645424005, "grad_norm": 3.5819921493530273, "learning_rate": 1.3708119358414589e-05, "loss": 0.1688, "step": 18825 }, { "epoch": 1.9189656927618852, "grad_norm": 9.21711540222168, "learning_rate": 1.3696807908965455e-05, "loss": 0.1095, "step": 18850 }, { "epoch": 1.921510740099766, "grad_norm": 4.607287883758545, "learning_rate": 1.3685496459516323e-05, "loss": 0.1251, "step": 18875 }, { "epoch": 1.9240557874376463, "grad_norm": 3.5619211196899414, "learning_rate": 1.367418501006719e-05, "loss": 0.1163, "step": 18900 }, { "epoch": 1.9266008347755268, "grad_norm": 4.004523754119873, "learning_rate": 1.366287356061806e-05, "loss": 0.1449, "step": 18925 }, { "epoch": 1.9291458821134073, "grad_norm": 6.862831115722656, "learning_rate": 1.3651562111168925e-05, "loss": 0.1331, "step": 18950 }, { "epoch": 1.9316909294512878, "grad_norm": 6.602372646331787, "learning_rate": 1.3640250661719794e-05, "loss": 0.1345, "step": 18975 }, { "epoch": 1.9342359767891684, "grad_norm": 4.988379955291748, "learning_rate": 1.3628939212270662e-05, "loss": 0.1184, "step": 19000 }, { "epoch": 1.9367810241270487, "grad_norm": 7.112786293029785, "learning_rate": 1.3617627762821528e-05, "loss": 0.1287, "step": 19025 }, { "epoch": 1.9393260714649294, "grad_norm": 3.288667917251587, "learning_rate": 1.3606316313372396e-05, "loss": 0.1298, "step": 19050 }, { "epoch": 1.9418711188028097, "grad_norm": 2.2869608402252197, "learning_rate": 1.3595004863923264e-05, "loss": 0.0996, "step": 19075 }, { "epoch": 1.9444161661406902, "grad_norm": 4.9739532470703125, "learning_rate": 1.3583693414474134e-05, "loss": 0.125, "step": 19100 }, { "epoch": 1.9469612134785708, "grad_norm": 7.579037189483643, "learning_rate": 1.3572381965024998e-05, "loss": 0.1243, "step": 19125 }, { "epoch": 1.949506260816451, "grad_norm": 3.219343662261963, "learning_rate": 1.3561070515575868e-05, "loss": 0.1329, "step": 19150 }, { "epoch": 1.9520513081543318, "grad_norm": 3.9998912811279297, "learning_rate": 1.3549759066126735e-05, "loss": 0.1067, "step": 19175 }, { "epoch": 1.9545963554922121, "grad_norm": 1.2954130172729492, "learning_rate": 1.3538447616677602e-05, "loss": 0.1176, "step": 19200 }, { "epoch": 1.9571414028300926, "grad_norm": 3.8018393516540527, "learning_rate": 1.352713616722847e-05, "loss": 0.1318, "step": 19225 }, { "epoch": 1.9596864501679732, "grad_norm": 4.415460586547852, "learning_rate": 1.3515824717779337e-05, "loss": 0.1273, "step": 19250 }, { "epoch": 1.9622314975058535, "grad_norm": 0.9868888854980469, "learning_rate": 1.3504513268330207e-05, "loss": 0.1303, "step": 19275 }, { "epoch": 1.9647765448437342, "grad_norm": 11.183004379272461, "learning_rate": 1.3493201818881071e-05, "loss": 0.1699, "step": 19300 }, { "epoch": 1.9673215921816145, "grad_norm": 8.36978816986084, "learning_rate": 1.348189036943194e-05, "loss": 0.1163, "step": 19325 }, { "epoch": 1.969866639519495, "grad_norm": 2.3566489219665527, "learning_rate": 1.3470578919982809e-05, "loss": 0.1186, "step": 19350 }, { "epoch": 1.9724116868573756, "grad_norm": 4.07636833190918, "learning_rate": 1.3459267470533675e-05, "loss": 0.1196, "step": 19375 }, { "epoch": 1.974956734195256, "grad_norm": 1.6372345685958862, "learning_rate": 1.3447956021084543e-05, "loss": 0.1028, "step": 19400 }, { "epoch": 1.9775017815331366, "grad_norm": 1.4987009763717651, "learning_rate": 1.343664457163541e-05, "loss": 0.089, "step": 19425 }, { "epoch": 1.980046828871017, "grad_norm": 2.3132851123809814, "learning_rate": 1.3425333122186277e-05, "loss": 0.1074, "step": 19450 }, { "epoch": 1.9825918762088974, "grad_norm": 1.2867132425308228, "learning_rate": 1.3414021672737144e-05, "loss": 0.1352, "step": 19475 }, { "epoch": 1.985136923546778, "grad_norm": 2.911659002304077, "learning_rate": 1.3402710223288014e-05, "loss": 0.1128, "step": 19500 }, { "epoch": 1.9876819708846585, "grad_norm": 3.1398777961730957, "learning_rate": 1.3391398773838882e-05, "loss": 0.1311, "step": 19525 }, { "epoch": 1.990227018222539, "grad_norm": 3.4940083026885986, "learning_rate": 1.3380087324389748e-05, "loss": 0.0797, "step": 19550 }, { "epoch": 1.9927720655604193, "grad_norm": 5.973903656005859, "learning_rate": 1.3368775874940616e-05, "loss": 0.0901, "step": 19575 }, { "epoch": 1.9953171128983, "grad_norm": 6.650625705718994, "learning_rate": 1.3357464425491484e-05, "loss": 0.1026, "step": 19600 }, { "epoch": 1.9978621602361804, "grad_norm": 2.1521973609924316, "learning_rate": 1.334615297604235e-05, "loss": 0.1444, "step": 19625 }, { "epoch": 2.0, "eval_loss": 0.09097875654697418, "eval_runtime": 7.4187, "eval_samples_per_second": 917.686, "eval_steps_per_second": 14.423, "step": 19646 }, { "epoch": 2.000407207574061, "grad_norm": 5.6037187576293945, "learning_rate": 1.3334841526593218e-05, "loss": 0.0984, "step": 19650 }, { "epoch": 2.0029522549119414, "grad_norm": 1.090952754020691, "learning_rate": 1.3323530077144087e-05, "loss": 0.0879, "step": 19675 }, { "epoch": 2.0054973022498217, "grad_norm": 7.369726181030273, "learning_rate": 1.3312218627694955e-05, "loss": 0.1064, "step": 19700 }, { "epoch": 2.0080423495877024, "grad_norm": 0.6371269822120667, "learning_rate": 1.3300907178245821e-05, "loss": 0.1009, "step": 19725 }, { "epoch": 2.0105873969255827, "grad_norm": 1.3027335405349731, "learning_rate": 1.3289595728796689e-05, "loss": 0.1037, "step": 19750 }, { "epoch": 2.0131324442634635, "grad_norm": 1.485375165939331, "learning_rate": 1.3278284279347557e-05, "loss": 0.0781, "step": 19775 }, { "epoch": 2.015677491601344, "grad_norm": 3.413872718811035, "learning_rate": 1.3266972829898423e-05, "loss": 0.105, "step": 19800 }, { "epoch": 2.018222538939224, "grad_norm": 3.911144971847534, "learning_rate": 1.3255661380449291e-05, "loss": 0.1214, "step": 19825 }, { "epoch": 2.020767586277105, "grad_norm": 9.321831703186035, "learning_rate": 1.324434993100016e-05, "loss": 0.0892, "step": 19850 }, { "epoch": 2.023312633614985, "grad_norm": 5.0478081703186035, "learning_rate": 1.3233038481551027e-05, "loss": 0.0776, "step": 19875 }, { "epoch": 2.025857680952866, "grad_norm": 4.059922695159912, "learning_rate": 1.3221727032101894e-05, "loss": 0.1019, "step": 19900 }, { "epoch": 2.028402728290746, "grad_norm": 2.502460241317749, "learning_rate": 1.3210415582652762e-05, "loss": 0.1305, "step": 19925 }, { "epoch": 2.0309477756286265, "grad_norm": 8.318452835083008, "learning_rate": 1.319910413320363e-05, "loss": 0.0912, "step": 19950 }, { "epoch": 2.0334928229665072, "grad_norm": 0.444395512342453, "learning_rate": 1.3188245141732463e-05, "loss": 0.0858, "step": 19975 }, { "epoch": 2.0360378703043875, "grad_norm": 5.8228349685668945, "learning_rate": 1.317693369228333e-05, "loss": 0.1094, "step": 20000 }, { "epoch": 2.0385829176422683, "grad_norm": 6.549695014953613, "learning_rate": 1.3165622242834197e-05, "loss": 0.0803, "step": 20025 }, { "epoch": 2.0411279649801486, "grad_norm": 4.000819683074951, "learning_rate": 1.3154310793385066e-05, "loss": 0.0938, "step": 20050 }, { "epoch": 2.0436730123180293, "grad_norm": 1.8325241804122925, "learning_rate": 1.3142999343935934e-05, "loss": 0.0798, "step": 20075 }, { "epoch": 2.0462180596559096, "grad_norm": 7.206612586975098, "learning_rate": 1.31316878944868e-05, "loss": 0.0883, "step": 20100 }, { "epoch": 2.04876310699379, "grad_norm": 3.23553729057312, "learning_rate": 1.3120376445037668e-05, "loss": 0.114, "step": 20125 }, { "epoch": 2.0513081543316707, "grad_norm": 0.41115373373031616, "learning_rate": 1.3109064995588536e-05, "loss": 0.1187, "step": 20150 }, { "epoch": 2.053853201669551, "grad_norm": 1.5195305347442627, "learning_rate": 1.3097753546139402e-05, "loss": 0.1014, "step": 20175 }, { "epoch": 2.0563982490074317, "grad_norm": 4.764736652374268, "learning_rate": 1.308644209669027e-05, "loss": 0.1099, "step": 20200 }, { "epoch": 2.058943296345312, "grad_norm": 0.8784139156341553, "learning_rate": 1.307513064724114e-05, "loss": 0.1045, "step": 20225 }, { "epoch": 2.0614883436831923, "grad_norm": 0.10267910361289978, "learning_rate": 1.3063819197792007e-05, "loss": 0.1031, "step": 20250 }, { "epoch": 2.064033391021073, "grad_norm": 0.8482717871665955, "learning_rate": 1.3052507748342874e-05, "loss": 0.0902, "step": 20275 }, { "epoch": 2.0665784383589534, "grad_norm": 4.93229341506958, "learning_rate": 1.3041196298893741e-05, "loss": 0.0859, "step": 20300 }, { "epoch": 2.069123485696834, "grad_norm": 0.9881433248519897, "learning_rate": 1.302988484944461e-05, "loss": 0.0931, "step": 20325 }, { "epoch": 2.0716685330347144, "grad_norm": 0.9810713529586792, "learning_rate": 1.3018573399995475e-05, "loss": 0.0846, "step": 20350 }, { "epoch": 2.0742135803725947, "grad_norm": 7.200399398803711, "learning_rate": 1.3007261950546343e-05, "loss": 0.0842, "step": 20375 }, { "epoch": 2.0767586277104755, "grad_norm": 6.763378143310547, "learning_rate": 1.2995950501097213e-05, "loss": 0.1141, "step": 20400 }, { "epoch": 2.079303675048356, "grad_norm": 1.3273606300354004, "learning_rate": 1.298463905164808e-05, "loss": 0.0947, "step": 20425 }, { "epoch": 2.0818487223862365, "grad_norm": 1.6942657232284546, "learning_rate": 1.2973327602198947e-05, "loss": 0.1171, "step": 20450 }, { "epoch": 2.084393769724117, "grad_norm": 3.745817184448242, "learning_rate": 1.2962016152749815e-05, "loss": 0.0943, "step": 20475 }, { "epoch": 2.0869388170619976, "grad_norm": 2.1933345794677734, "learning_rate": 1.2950704703300682e-05, "loss": 0.0748, "step": 20500 }, { "epoch": 2.089483864399878, "grad_norm": 0.5361920595169067, "learning_rate": 1.2939393253851549e-05, "loss": 0.1071, "step": 20525 }, { "epoch": 2.092028911737758, "grad_norm": 2.4719314575195312, "learning_rate": 1.2928081804402416e-05, "loss": 0.0777, "step": 20550 }, { "epoch": 2.094573959075639, "grad_norm": 4.1660990715026855, "learning_rate": 1.2916770354953286e-05, "loss": 0.0773, "step": 20575 }, { "epoch": 2.0971190064135192, "grad_norm": 4.774954319000244, "learning_rate": 1.290545890550415e-05, "loss": 0.1215, "step": 20600 }, { "epoch": 2.0996640537514, "grad_norm": 6.706242561340332, "learning_rate": 1.289414745605502e-05, "loss": 0.1075, "step": 20625 }, { "epoch": 2.1022091010892803, "grad_norm": 4.085606575012207, "learning_rate": 1.2882836006605888e-05, "loss": 0.1012, "step": 20650 }, { "epoch": 2.1047541484271606, "grad_norm": 5.016777515411377, "learning_rate": 1.2871524557156756e-05, "loss": 0.1105, "step": 20675 }, { "epoch": 2.1072991957650413, "grad_norm": 6.600816249847412, "learning_rate": 1.2860213107707622e-05, "loss": 0.1018, "step": 20700 }, { "epoch": 2.1098442431029216, "grad_norm": 2.2283174991607666, "learning_rate": 1.284890165825849e-05, "loss": 0.0926, "step": 20725 }, { "epoch": 2.1123892904408024, "grad_norm": 5.262929916381836, "learning_rate": 1.283759020880936e-05, "loss": 0.1175, "step": 20750 }, { "epoch": 2.1149343377786827, "grad_norm": 1.850588083267212, "learning_rate": 1.2826278759360224e-05, "loss": 0.0949, "step": 20775 }, { "epoch": 2.117479385116563, "grad_norm": 3.330935478210449, "learning_rate": 1.2814967309911093e-05, "loss": 0.1297, "step": 20800 }, { "epoch": 2.1200244324544437, "grad_norm": 2.4614529609680176, "learning_rate": 1.2803655860461961e-05, "loss": 0.0803, "step": 20825 }, { "epoch": 2.122569479792324, "grad_norm": 4.3904337882995605, "learning_rate": 1.2792344411012829e-05, "loss": 0.0997, "step": 20850 }, { "epoch": 2.1251145271302048, "grad_norm": 1.6169228553771973, "learning_rate": 1.2781032961563695e-05, "loss": 0.1115, "step": 20875 }, { "epoch": 2.127659574468085, "grad_norm": 3.5835373401641846, "learning_rate": 1.2769721512114563e-05, "loss": 0.084, "step": 20900 }, { "epoch": 2.130204621805966, "grad_norm": 8.013703346252441, "learning_rate": 1.2758410062665432e-05, "loss": 0.0928, "step": 20925 }, { "epoch": 2.132749669143846, "grad_norm": 6.659077167510986, "learning_rate": 1.2747098613216297e-05, "loss": 0.1014, "step": 20950 }, { "epoch": 2.1352947164817264, "grad_norm": 2.2046802043914795, "learning_rate": 1.2735787163767166e-05, "loss": 0.1198, "step": 20975 }, { "epoch": 2.137839763819607, "grad_norm": 3.954470157623291, "learning_rate": 1.2724475714318034e-05, "loss": 0.1347, "step": 21000 }, { "epoch": 2.1403848111574875, "grad_norm": 1.1410102844238281, "learning_rate": 1.2713164264868902e-05, "loss": 0.0831, "step": 21025 }, { "epoch": 2.142929858495368, "grad_norm": 5.025262355804443, "learning_rate": 1.2701852815419768e-05, "loss": 0.0995, "step": 21050 }, { "epoch": 2.1454749058332485, "grad_norm": 4.646005630493164, "learning_rate": 1.2690541365970636e-05, "loss": 0.1139, "step": 21075 }, { "epoch": 2.148019953171129, "grad_norm": 5.704071521759033, "learning_rate": 1.2679229916521506e-05, "loss": 0.0884, "step": 21100 }, { "epoch": 2.1505650005090096, "grad_norm": 2.248732566833496, "learning_rate": 1.266791846707237e-05, "loss": 0.0799, "step": 21125 }, { "epoch": 2.15311004784689, "grad_norm": 2.9786360263824463, "learning_rate": 1.265660701762324e-05, "loss": 0.0761, "step": 21150 }, { "epoch": 2.1556550951847706, "grad_norm": 4.9691948890686035, "learning_rate": 1.2645295568174108e-05, "loss": 0.0813, "step": 21175 }, { "epoch": 2.158200142522651, "grad_norm": 3.8157265186309814, "learning_rate": 1.2633984118724974e-05, "loss": 0.0996, "step": 21200 }, { "epoch": 2.160745189860531, "grad_norm": 3.571915864944458, "learning_rate": 1.2622672669275842e-05, "loss": 0.0908, "step": 21225 }, { "epoch": 2.163290237198412, "grad_norm": 3.316852331161499, "learning_rate": 1.261136121982671e-05, "loss": 0.1066, "step": 21250 }, { "epoch": 2.1658352845362923, "grad_norm": 2.9165329933166504, "learning_rate": 1.2600049770377579e-05, "loss": 0.1119, "step": 21275 }, { "epoch": 2.168380331874173, "grad_norm": 17.929237365722656, "learning_rate": 1.2588738320928443e-05, "loss": 0.1363, "step": 21300 }, { "epoch": 2.1709253792120533, "grad_norm": 10.439239501953125, "learning_rate": 1.2577426871479313e-05, "loss": 0.0573, "step": 21325 }, { "epoch": 2.173470426549934, "grad_norm": 4.958606243133545, "learning_rate": 1.256611542203018e-05, "loss": 0.1083, "step": 21350 }, { "epoch": 2.1760154738878144, "grad_norm": 0.9944233298301697, "learning_rate": 1.2554803972581047e-05, "loss": 0.0806, "step": 21375 }, { "epoch": 2.1785605212256947, "grad_norm": 0.8803888559341431, "learning_rate": 1.2543492523131915e-05, "loss": 0.1165, "step": 21400 }, { "epoch": 2.1811055685635754, "grad_norm": 1.1710690259933472, "learning_rate": 1.2532181073682783e-05, "loss": 0.0919, "step": 21425 }, { "epoch": 2.1836506159014557, "grad_norm": 4.190867900848389, "learning_rate": 1.2520869624233652e-05, "loss": 0.1119, "step": 21450 }, { "epoch": 2.1861956632393365, "grad_norm": 5.893057346343994, "learning_rate": 1.2509558174784517e-05, "loss": 0.097, "step": 21475 }, { "epoch": 2.1887407105772168, "grad_norm": 3.533395767211914, "learning_rate": 1.2498246725335386e-05, "loss": 0.086, "step": 21500 }, { "epoch": 2.191285757915097, "grad_norm": 2.7442116737365723, "learning_rate": 1.2486935275886254e-05, "loss": 0.1254, "step": 21525 }, { "epoch": 2.193830805252978, "grad_norm": 4.103741645812988, "learning_rate": 1.247562382643712e-05, "loss": 0.1215, "step": 21550 }, { "epoch": 2.196375852590858, "grad_norm": 2.0932254791259766, "learning_rate": 1.2464312376987988e-05, "loss": 0.1174, "step": 21575 }, { "epoch": 2.198920899928739, "grad_norm": 5.704500198364258, "learning_rate": 1.2453000927538856e-05, "loss": 0.1348, "step": 21600 }, { "epoch": 2.201465947266619, "grad_norm": 4.199222087860107, "learning_rate": 1.2441689478089722e-05, "loss": 0.11, "step": 21625 }, { "epoch": 2.2040109946044995, "grad_norm": 3.764247417449951, "learning_rate": 1.243037802864059e-05, "loss": 0.1224, "step": 21650 }, { "epoch": 2.20655604194238, "grad_norm": 1.0659103393554688, "learning_rate": 1.241906657919146e-05, "loss": 0.1164, "step": 21675 }, { "epoch": 2.2091010892802605, "grad_norm": 3.8435420989990234, "learning_rate": 1.2407755129742327e-05, "loss": 0.1127, "step": 21700 }, { "epoch": 2.2116461366181412, "grad_norm": 2.6357994079589844, "learning_rate": 1.2396443680293193e-05, "loss": 0.0713, "step": 21725 }, { "epoch": 2.2141911839560215, "grad_norm": 5.551704406738281, "learning_rate": 1.2385132230844061e-05, "loss": 0.0999, "step": 21750 }, { "epoch": 2.2167362312939023, "grad_norm": 3.5522947311401367, "learning_rate": 1.2373820781394929e-05, "loss": 0.0822, "step": 21775 }, { "epoch": 2.2192812786317826, "grad_norm": 6.4888739585876465, "learning_rate": 1.2362509331945795e-05, "loss": 0.1165, "step": 21800 }, { "epoch": 2.221826325969663, "grad_norm": 2.415229082107544, "learning_rate": 1.2351197882496663e-05, "loss": 0.0943, "step": 21825 }, { "epoch": 2.2243713733075436, "grad_norm": 2.813169479370117, "learning_rate": 1.2339886433047533e-05, "loss": 0.0936, "step": 21850 }, { "epoch": 2.226916420645424, "grad_norm": 5.718323707580566, "learning_rate": 1.23285749835984e-05, "loss": 0.1375, "step": 21875 }, { "epoch": 2.2294614679833047, "grad_norm": 6.90181827545166, "learning_rate": 1.2317263534149267e-05, "loss": 0.0972, "step": 21900 }, { "epoch": 2.232006515321185, "grad_norm": 1.21304452419281, "learning_rate": 1.2305952084700134e-05, "loss": 0.1095, "step": 21925 }, { "epoch": 2.2345515626590653, "grad_norm": 9.322980880737305, "learning_rate": 1.2294640635251002e-05, "loss": 0.0997, "step": 21950 }, { "epoch": 2.237096609996946, "grad_norm": 4.291175842285156, "learning_rate": 1.2283329185801868e-05, "loss": 0.0908, "step": 21975 }, { "epoch": 2.2396416573348263, "grad_norm": 3.542860746383667, "learning_rate": 1.2272017736352736e-05, "loss": 0.0871, "step": 22000 }, { "epoch": 2.242186704672707, "grad_norm": 6.1715312004089355, "learning_rate": 1.2260706286903606e-05, "loss": 0.1264, "step": 22025 }, { "epoch": 2.2447317520105874, "grad_norm": 2.7312450408935547, "learning_rate": 1.2249394837454472e-05, "loss": 0.112, "step": 22050 }, { "epoch": 2.2472767993484677, "grad_norm": 4.341641902923584, "learning_rate": 1.223808338800534e-05, "loss": 0.0914, "step": 22075 }, { "epoch": 2.2498218466863484, "grad_norm": 2.093118667602539, "learning_rate": 1.2226771938556208e-05, "loss": 0.0638, "step": 22100 }, { "epoch": 2.2523668940242287, "grad_norm": 2.3410089015960693, "learning_rate": 1.2215460489107075e-05, "loss": 0.0941, "step": 22125 }, { "epoch": 2.2549119413621095, "grad_norm": 3.056443452835083, "learning_rate": 1.2204149039657942e-05, "loss": 0.0983, "step": 22150 }, { "epoch": 2.25745698869999, "grad_norm": 1.6339013576507568, "learning_rate": 1.2192837590208811e-05, "loss": 0.1036, "step": 22175 }, { "epoch": 2.2600020360378705, "grad_norm": 8.467519760131836, "learning_rate": 1.2181526140759679e-05, "loss": 0.0898, "step": 22200 }, { "epoch": 2.262547083375751, "grad_norm": 2.0889439582824707, "learning_rate": 1.2170214691310545e-05, "loss": 0.0984, "step": 22225 }, { "epoch": 2.265092130713631, "grad_norm": 0.9072188138961792, "learning_rate": 1.215935569983938e-05, "loss": 0.1118, "step": 22250 }, { "epoch": 2.267637178051512, "grad_norm": 4.564201354980469, "learning_rate": 1.2148044250390246e-05, "loss": 0.0865, "step": 22275 }, { "epoch": 2.270182225389392, "grad_norm": 3.500279188156128, "learning_rate": 1.2136732800941113e-05, "loss": 0.1191, "step": 22300 }, { "epoch": 2.2727272727272725, "grad_norm": 4.1375579833984375, "learning_rate": 1.2125421351491981e-05, "loss": 0.0984, "step": 22325 }, { "epoch": 2.2752723200651532, "grad_norm": 3.598430871963501, "learning_rate": 1.211410990204285e-05, "loss": 0.0829, "step": 22350 }, { "epoch": 2.2778173674030335, "grad_norm": 6.143867015838623, "learning_rate": 1.2102798452593715e-05, "loss": 0.1363, "step": 22375 }, { "epoch": 2.2803624147409143, "grad_norm": 2.5650177001953125, "learning_rate": 1.2091487003144585e-05, "loss": 0.0876, "step": 22400 }, { "epoch": 2.2829074620787946, "grad_norm": 6.063427448272705, "learning_rate": 1.2080175553695453e-05, "loss": 0.0932, "step": 22425 }, { "epoch": 2.2854525094166753, "grad_norm": 4.583015441894531, "learning_rate": 1.2068864104246319e-05, "loss": 0.095, "step": 22450 }, { "epoch": 2.2879975567545556, "grad_norm": 6.4546332359313965, "learning_rate": 1.2057552654797187e-05, "loss": 0.0957, "step": 22475 }, { "epoch": 2.290542604092436, "grad_norm": 3.2949342727661133, "learning_rate": 1.2046241205348055e-05, "loss": 0.0999, "step": 22500 }, { "epoch": 2.2930876514303167, "grad_norm": 8.154585838317871, "learning_rate": 1.203492975589892e-05, "loss": 0.1071, "step": 22525 }, { "epoch": 2.295632698768197, "grad_norm": 3.5857834815979004, "learning_rate": 1.2023618306449789e-05, "loss": 0.1223, "step": 22550 }, { "epoch": 2.2981777461060777, "grad_norm": 1.3705384731292725, "learning_rate": 1.2012306857000658e-05, "loss": 0.0748, "step": 22575 }, { "epoch": 2.300722793443958, "grad_norm": 2.1247451305389404, "learning_rate": 1.2000995407551526e-05, "loss": 0.0866, "step": 22600 }, { "epoch": 2.3032678407818388, "grad_norm": 3.3343911170959473, "learning_rate": 1.1989683958102392e-05, "loss": 0.1079, "step": 22625 }, { "epoch": 2.305812888119719, "grad_norm": 4.785033702850342, "learning_rate": 1.197837250865326e-05, "loss": 0.0928, "step": 22650 }, { "epoch": 2.3083579354575994, "grad_norm": 0.6399935483932495, "learning_rate": 1.1967061059204128e-05, "loss": 0.1176, "step": 22675 }, { "epoch": 2.31090298279548, "grad_norm": 6.403359413146973, "learning_rate": 1.1955749609754994e-05, "loss": 0.1455, "step": 22700 }, { "epoch": 2.3134480301333604, "grad_norm": 1.8424525260925293, "learning_rate": 1.1944438160305862e-05, "loss": 0.0922, "step": 22725 }, { "epoch": 2.3159930774712407, "grad_norm": 4.637752056121826, "learning_rate": 1.1933126710856731e-05, "loss": 0.1412, "step": 22750 }, { "epoch": 2.3185381248091215, "grad_norm": 2.118675708770752, "learning_rate": 1.19218152614076e-05, "loss": 0.0885, "step": 22775 }, { "epoch": 2.3210831721470018, "grad_norm": 2.0802664756774902, "learning_rate": 1.1910503811958465e-05, "loss": 0.084, "step": 22800 }, { "epoch": 2.3236282194848825, "grad_norm": 3.8109188079833984, "learning_rate": 1.1899192362509333e-05, "loss": 0.1206, "step": 22825 }, { "epoch": 2.326173266822763, "grad_norm": 2.5389983654022217, "learning_rate": 1.1887880913060201e-05, "loss": 0.1152, "step": 22850 }, { "epoch": 2.3287183141606436, "grad_norm": 2.187694787979126, "learning_rate": 1.1876569463611067e-05, "loss": 0.0903, "step": 22875 }, { "epoch": 2.331263361498524, "grad_norm": 7.514569282531738, "learning_rate": 1.1865258014161935e-05, "loss": 0.1087, "step": 22900 }, { "epoch": 2.333808408836404, "grad_norm": 4.942701816558838, "learning_rate": 1.1853946564712805e-05, "loss": 0.0602, "step": 22925 }, { "epoch": 2.336353456174285, "grad_norm": 2.9110524654388428, "learning_rate": 1.1842635115263669e-05, "loss": 0.0941, "step": 22950 }, { "epoch": 2.338898503512165, "grad_norm": 2.718231678009033, "learning_rate": 1.1831323665814539e-05, "loss": 0.0776, "step": 22975 }, { "epoch": 2.341443550850046, "grad_norm": 2.2280826568603516, "learning_rate": 1.1820012216365406e-05, "loss": 0.0844, "step": 23000 }, { "epoch": 2.3439885981879263, "grad_norm": 2.983081340789795, "learning_rate": 1.1808700766916274e-05, "loss": 0.0959, "step": 23025 }, { "epoch": 2.346533645525807, "grad_norm": 1.6823519468307495, "learning_rate": 1.179738931746714e-05, "loss": 0.0882, "step": 23050 }, { "epoch": 2.3490786928636873, "grad_norm": 4.962369918823242, "learning_rate": 1.1786077868018008e-05, "loss": 0.0826, "step": 23075 }, { "epoch": 2.3516237402015676, "grad_norm": 4.208407402038574, "learning_rate": 1.1774766418568878e-05, "loss": 0.0792, "step": 23100 }, { "epoch": 2.3541687875394484, "grad_norm": 2.5069408416748047, "learning_rate": 1.1763454969119742e-05, "loss": 0.0889, "step": 23125 }, { "epoch": 2.3567138348773287, "grad_norm": 2.7248406410217285, "learning_rate": 1.1752143519670612e-05, "loss": 0.0914, "step": 23150 }, { "epoch": 2.359258882215209, "grad_norm": 6.141697883605957, "learning_rate": 1.174083207022148e-05, "loss": 0.0866, "step": 23175 }, { "epoch": 2.3618039295530897, "grad_norm": 7.505110740661621, "learning_rate": 1.1729520620772347e-05, "loss": 0.0787, "step": 23200 }, { "epoch": 2.36434897689097, "grad_norm": 4.82844877243042, "learning_rate": 1.1718209171323214e-05, "loss": 0.1134, "step": 23225 }, { "epoch": 2.3668940242288508, "grad_norm": 3.9105417728424072, "learning_rate": 1.1706897721874081e-05, "loss": 0.1057, "step": 23250 }, { "epoch": 2.369439071566731, "grad_norm": 2.952836513519287, "learning_rate": 1.1695586272424951e-05, "loss": 0.1038, "step": 23275 }, { "epoch": 2.371984118904612, "grad_norm": 2.2733404636383057, "learning_rate": 1.1684274822975815e-05, "loss": 0.107, "step": 23300 }, { "epoch": 2.374529166242492, "grad_norm": 4.268314838409424, "learning_rate": 1.1672963373526685e-05, "loss": 0.1015, "step": 23325 }, { "epoch": 2.3770742135803724, "grad_norm": 1.9675124883651733, "learning_rate": 1.1661651924077553e-05, "loss": 0.0873, "step": 23350 }, { "epoch": 2.379619260918253, "grad_norm": 2.5415396690368652, "learning_rate": 1.1650340474628419e-05, "loss": 0.087, "step": 23375 }, { "epoch": 2.3821643082561335, "grad_norm": 3.148134231567383, "learning_rate": 1.1639029025179287e-05, "loss": 0.1126, "step": 23400 }, { "epoch": 2.384709355594014, "grad_norm": 0.9491432309150696, "learning_rate": 1.1627717575730155e-05, "loss": 0.0685, "step": 23425 }, { "epoch": 2.3872544029318945, "grad_norm": 4.692232131958008, "learning_rate": 1.1616406126281024e-05, "loss": 0.1078, "step": 23450 }, { "epoch": 2.3897994502697752, "grad_norm": 1.1305272579193115, "learning_rate": 1.1605094676831889e-05, "loss": 0.0767, "step": 23475 }, { "epoch": 2.3923444976076556, "grad_norm": 2.8438282012939453, "learning_rate": 1.1593783227382758e-05, "loss": 0.0982, "step": 23500 }, { "epoch": 2.394889544945536, "grad_norm": 4.794705867767334, "learning_rate": 1.1582471777933626e-05, "loss": 0.0794, "step": 23525 }, { "epoch": 2.3974345922834166, "grad_norm": 6.11577033996582, "learning_rate": 1.1571160328484492e-05, "loss": 0.1003, "step": 23550 }, { "epoch": 2.399979639621297, "grad_norm": 7.616250514984131, "learning_rate": 1.155984887903536e-05, "loss": 0.1272, "step": 23575 }, { "epoch": 2.402524686959177, "grad_norm": 1.4506219625473022, "learning_rate": 1.1548537429586228e-05, "loss": 0.1003, "step": 23600 }, { "epoch": 2.405069734297058, "grad_norm": 8.559371948242188, "learning_rate": 1.1537225980137097e-05, "loss": 0.1174, "step": 23625 }, { "epoch": 2.4076147816349383, "grad_norm": 0.7198318839073181, "learning_rate": 1.1525914530687964e-05, "loss": 0.1227, "step": 23650 }, { "epoch": 2.410159828972819, "grad_norm": 1.5881752967834473, "learning_rate": 1.1514603081238831e-05, "loss": 0.1009, "step": 23675 }, { "epoch": 2.4127048763106993, "grad_norm": 6.224447727203369, "learning_rate": 1.15032916317897e-05, "loss": 0.0939, "step": 23700 }, { "epoch": 2.41524992364858, "grad_norm": 2.6777849197387695, "learning_rate": 1.1491980182340565e-05, "loss": 0.0768, "step": 23725 }, { "epoch": 2.4177949709864603, "grad_norm": 4.088348388671875, "learning_rate": 1.1480668732891433e-05, "loss": 0.1155, "step": 23750 }, { "epoch": 2.4203400183243406, "grad_norm": 0.4692411422729492, "learning_rate": 1.1469357283442301e-05, "loss": 0.1223, "step": 23775 }, { "epoch": 2.4228850656622214, "grad_norm": 6.204223155975342, "learning_rate": 1.1458045833993167e-05, "loss": 0.1063, "step": 23800 }, { "epoch": 2.4254301130001017, "grad_norm": 6.6200032234191895, "learning_rate": 1.1446734384544037e-05, "loss": 0.0932, "step": 23825 }, { "epoch": 2.4279751603379824, "grad_norm": 4.673367023468018, "learning_rate": 1.1435422935094905e-05, "loss": 0.0962, "step": 23850 }, { "epoch": 2.4305202076758627, "grad_norm": 0.12596215307712555, "learning_rate": 1.1424111485645772e-05, "loss": 0.0822, "step": 23875 }, { "epoch": 2.4330652550137435, "grad_norm": 4.890219688415527, "learning_rate": 1.1412800036196639e-05, "loss": 0.086, "step": 23900 }, { "epoch": 2.435610302351624, "grad_norm": 7.528822422027588, "learning_rate": 1.1401488586747506e-05, "loss": 0.0967, "step": 23925 }, { "epoch": 2.438155349689504, "grad_norm": 4.572819232940674, "learning_rate": 1.1390177137298374e-05, "loss": 0.0703, "step": 23950 }, { "epoch": 2.440700397027385, "grad_norm": 1.6426405906677246, "learning_rate": 1.137886568784924e-05, "loss": 0.085, "step": 23975 }, { "epoch": 2.443245444365265, "grad_norm": 4.192370414733887, "learning_rate": 1.136755423840011e-05, "loss": 0.1188, "step": 24000 }, { "epoch": 2.4457904917031454, "grad_norm": 4.600924968719482, "learning_rate": 1.1356242788950978e-05, "loss": 0.1026, "step": 24025 }, { "epoch": 2.448335539041026, "grad_norm": 5.712591171264648, "learning_rate": 1.1344931339501846e-05, "loss": 0.079, "step": 24050 }, { "epoch": 2.4508805863789065, "grad_norm": 3.8217313289642334, "learning_rate": 1.1333619890052712e-05, "loss": 0.1202, "step": 24075 }, { "epoch": 2.4534256337167872, "grad_norm": 5.072776794433594, "learning_rate": 1.132230844060358e-05, "loss": 0.1105, "step": 24100 }, { "epoch": 2.4559706810546675, "grad_norm": 0.7305828928947449, "learning_rate": 1.1310996991154448e-05, "loss": 0.0713, "step": 24125 }, { "epoch": 2.4585157283925483, "grad_norm": 1.228698968887329, "learning_rate": 1.1299685541705314e-05, "loss": 0.0883, "step": 24150 }, { "epoch": 2.4610607757304286, "grad_norm": 4.841889381408691, "learning_rate": 1.1288374092256183e-05, "loss": 0.0943, "step": 24175 }, { "epoch": 2.463605823068309, "grad_norm": 5.082825660705566, "learning_rate": 1.1277062642807051e-05, "loss": 0.1042, "step": 24200 }, { "epoch": 2.4661508704061896, "grad_norm": 5.81834077835083, "learning_rate": 1.1265751193357919e-05, "loss": 0.0933, "step": 24225 }, { "epoch": 2.46869591774407, "grad_norm": 1.649214744567871, "learning_rate": 1.1254439743908785e-05, "loss": 0.1272, "step": 24250 }, { "epoch": 2.4712409650819507, "grad_norm": 3.9719619750976562, "learning_rate": 1.1243128294459653e-05, "loss": 0.0857, "step": 24275 }, { "epoch": 2.473786012419831, "grad_norm": 3.341063976287842, "learning_rate": 1.123181684501052e-05, "loss": 0.089, "step": 24300 }, { "epoch": 2.4763310597577117, "grad_norm": 5.203547477722168, "learning_rate": 1.1220505395561387e-05, "loss": 0.1298, "step": 24325 }, { "epoch": 2.478876107095592, "grad_norm": 8.336240768432617, "learning_rate": 1.1209193946112256e-05, "loss": 0.1025, "step": 24350 }, { "epoch": 2.4814211544334723, "grad_norm": 5.071165561676025, "learning_rate": 1.1197882496663124e-05, "loss": 0.1062, "step": 24375 }, { "epoch": 2.483966201771353, "grad_norm": 2.8873372077941895, "learning_rate": 1.118657104721399e-05, "loss": 0.0863, "step": 24400 }, { "epoch": 2.4865112491092334, "grad_norm": 0.5643141269683838, "learning_rate": 1.1175259597764858e-05, "loss": 0.0778, "step": 24425 }, { "epoch": 2.4890562964471137, "grad_norm": 5.715143203735352, "learning_rate": 1.1163948148315726e-05, "loss": 0.1379, "step": 24450 }, { "epoch": 2.4916013437849944, "grad_norm": 5.828744411468506, "learning_rate": 1.1152636698866594e-05, "loss": 0.1094, "step": 24475 }, { "epoch": 2.4941463911228747, "grad_norm": 5.1948933601379395, "learning_rate": 1.114132524941746e-05, "loss": 0.1035, "step": 24500 }, { "epoch": 2.4966914384607555, "grad_norm": 1.6300240755081177, "learning_rate": 1.113001379996833e-05, "loss": 0.0861, "step": 24525 }, { "epoch": 2.4992364857986358, "grad_norm": 3.659991979598999, "learning_rate": 1.1118702350519198e-05, "loss": 0.092, "step": 24550 }, { "epoch": 2.5017815331365165, "grad_norm": 4.651791095733643, "learning_rate": 1.1107390901070064e-05, "loss": 0.0907, "step": 24575 }, { "epoch": 2.504326580474397, "grad_norm": 2.967970609664917, "learning_rate": 1.1096079451620932e-05, "loss": 0.0933, "step": 24600 }, { "epoch": 2.506871627812277, "grad_norm": 1.0345920324325562, "learning_rate": 1.10847680021718e-05, "loss": 0.0955, "step": 24625 }, { "epoch": 2.509416675150158, "grad_norm": 0.5405799150466919, "learning_rate": 1.1073456552722667e-05, "loss": 0.1027, "step": 24650 }, { "epoch": 2.511961722488038, "grad_norm": 4.543354511260986, "learning_rate": 1.1062145103273533e-05, "loss": 0.1106, "step": 24675 }, { "epoch": 2.514506769825919, "grad_norm": 5.559842109680176, "learning_rate": 1.1050833653824403e-05, "loss": 0.1031, "step": 24700 }, { "epoch": 2.517051817163799, "grad_norm": 0.921740710735321, "learning_rate": 1.103952220437527e-05, "loss": 0.0935, "step": 24725 }, { "epoch": 2.51959686450168, "grad_norm": 7.651474475860596, "learning_rate": 1.1028210754926137e-05, "loss": 0.0732, "step": 24750 }, { "epoch": 2.5221419118395603, "grad_norm": 4.730767250061035, "learning_rate": 1.1016899305477005e-05, "loss": 0.0983, "step": 24775 }, { "epoch": 2.5246869591774406, "grad_norm": 6.074618339538574, "learning_rate": 1.1005587856027873e-05, "loss": 0.0886, "step": 24800 }, { "epoch": 2.5272320065153213, "grad_norm": 3.169323682785034, "learning_rate": 1.0994276406578739e-05, "loss": 0.0815, "step": 24825 }, { "epoch": 2.5297770538532016, "grad_norm": 3.6971890926361084, "learning_rate": 1.0982964957129607e-05, "loss": 0.1011, "step": 24850 }, { "epoch": 2.532322101191082, "grad_norm": 2.3330135345458984, "learning_rate": 1.0971653507680476e-05, "loss": 0.0796, "step": 24875 }, { "epoch": 2.5348671485289627, "grad_norm": 1.2037569284439087, "learning_rate": 1.0960342058231344e-05, "loss": 0.0883, "step": 24900 }, { "epoch": 2.537412195866843, "grad_norm": 0.8009597659111023, "learning_rate": 1.094903060878221e-05, "loss": 0.0981, "step": 24925 }, { "epoch": 2.5399572432047237, "grad_norm": 3.767286539077759, "learning_rate": 1.0937719159333078e-05, "loss": 0.1247, "step": 24950 }, { "epoch": 2.542502290542604, "grad_norm": 4.843240737915039, "learning_rate": 1.0926407709883946e-05, "loss": 0.1035, "step": 24975 }, { "epoch": 2.5450473378804848, "grad_norm": 0.4151710271835327, "learning_rate": 1.0915096260434812e-05, "loss": 0.115, "step": 25000 }, { "epoch": 2.547592385218365, "grad_norm": 3.2080724239349365, "learning_rate": 1.090378481098568e-05, "loss": 0.0916, "step": 25025 }, { "epoch": 2.5501374325562454, "grad_norm": 5.026012897491455, "learning_rate": 1.089247336153655e-05, "loss": 0.1199, "step": 25050 }, { "epoch": 2.552682479894126, "grad_norm": 2.0379691123962402, "learning_rate": 1.0881161912087417e-05, "loss": 0.1045, "step": 25075 }, { "epoch": 2.5552275272320064, "grad_norm": 2.8039698600769043, "learning_rate": 1.0869850462638283e-05, "loss": 0.1002, "step": 25100 }, { "epoch": 2.557772574569887, "grad_norm": 0.7228884696960449, "learning_rate": 1.0858539013189151e-05, "loss": 0.1184, "step": 25125 }, { "epoch": 2.5603176219077675, "grad_norm": 3.297924518585205, "learning_rate": 1.0847227563740019e-05, "loss": 0.077, "step": 25150 }, { "epoch": 2.562862669245648, "grad_norm": 0.8766332268714905, "learning_rate": 1.0835916114290885e-05, "loss": 0.0948, "step": 25175 }, { "epoch": 2.5654077165835285, "grad_norm": 6.345101833343506, "learning_rate": 1.0824604664841753e-05, "loss": 0.0991, "step": 25200 }, { "epoch": 2.567952763921409, "grad_norm": 2.1553688049316406, "learning_rate": 1.0813293215392623e-05, "loss": 0.1005, "step": 25225 }, { "epoch": 2.5704978112592896, "grad_norm": 5.529004096984863, "learning_rate": 1.0801981765943487e-05, "loss": 0.1139, "step": 25250 }, { "epoch": 2.57304285859717, "grad_norm": 6.425251007080078, "learning_rate": 1.0790670316494357e-05, "loss": 0.0919, "step": 25275 }, { "epoch": 2.57558790593505, "grad_norm": 4.08704137802124, "learning_rate": 1.0779358867045224e-05, "loss": 0.1483, "step": 25300 }, { "epoch": 2.578132953272931, "grad_norm": 5.872612476348877, "learning_rate": 1.0768047417596092e-05, "loss": 0.0965, "step": 25325 }, { "epoch": 2.580678000610811, "grad_norm": 3.455596923828125, "learning_rate": 1.0756735968146958e-05, "loss": 0.0925, "step": 25350 }, { "epoch": 2.583223047948692, "grad_norm": 3.5879950523376465, "learning_rate": 1.0745424518697826e-05, "loss": 0.0897, "step": 25375 }, { "epoch": 2.5857680952865723, "grad_norm": 1.831623911857605, "learning_rate": 1.0734113069248696e-05, "loss": 0.1098, "step": 25400 }, { "epoch": 2.588313142624453, "grad_norm": 3.8397438526153564, "learning_rate": 1.072280161979956e-05, "loss": 0.1025, "step": 25425 }, { "epoch": 2.5908581899623333, "grad_norm": 3.1397838592529297, "learning_rate": 1.071149017035043e-05, "loss": 0.0988, "step": 25450 }, { "epoch": 2.5934032373002136, "grad_norm": 0.3933914005756378, "learning_rate": 1.0700178720901298e-05, "loss": 0.0964, "step": 25475 }, { "epoch": 2.5959482846380943, "grad_norm": 4.579705238342285, "learning_rate": 1.0688867271452165e-05, "loss": 0.1126, "step": 25500 }, { "epoch": 2.5984933319759747, "grad_norm": 1.6875035762786865, "learning_rate": 1.0677555822003032e-05, "loss": 0.0902, "step": 25525 }, { "epoch": 2.6010383793138554, "grad_norm": 0.3768336772918701, "learning_rate": 1.06662443725539e-05, "loss": 0.1293, "step": 25550 }, { "epoch": 2.6035834266517357, "grad_norm": 3.3800220489501953, "learning_rate": 1.0654932923104769e-05, "loss": 0.1078, "step": 25575 }, { "epoch": 2.6061284739896164, "grad_norm": 3.3132853507995605, "learning_rate": 1.0643621473655633e-05, "loss": 0.1227, "step": 25600 }, { "epoch": 2.6086735213274967, "grad_norm": 3.5779366493225098, "learning_rate": 1.0632310024206503e-05, "loss": 0.0942, "step": 25625 }, { "epoch": 2.611218568665377, "grad_norm": 1.6557024717330933, "learning_rate": 1.0620998574757371e-05, "loss": 0.1184, "step": 25650 }, { "epoch": 2.613763616003258, "grad_norm": 2.5037038326263428, "learning_rate": 1.0609687125308239e-05, "loss": 0.1013, "step": 25675 }, { "epoch": 2.616308663341138, "grad_norm": 1.7757961750030518, "learning_rate": 1.0598375675859105e-05, "loss": 0.1082, "step": 25700 }, { "epoch": 2.6188537106790184, "grad_norm": 4.928213596343994, "learning_rate": 1.0587064226409973e-05, "loss": 0.0963, "step": 25725 }, { "epoch": 2.621398758016899, "grad_norm": 8.937736511230469, "learning_rate": 1.0575752776960842e-05, "loss": 0.1063, "step": 25750 }, { "epoch": 2.6239438053547794, "grad_norm": 2.6956536769866943, "learning_rate": 1.0564441327511707e-05, "loss": 0.0912, "step": 25775 }, { "epoch": 2.62648885269266, "grad_norm": 2.5104563236236572, "learning_rate": 1.0553129878062576e-05, "loss": 0.0929, "step": 25800 }, { "epoch": 2.6290339000305405, "grad_norm": 3.791407823562622, "learning_rate": 1.0541818428613444e-05, "loss": 0.1296, "step": 25825 }, { "epoch": 2.6315789473684212, "grad_norm": 1.7674590349197388, "learning_rate": 1.053050697916431e-05, "loss": 0.0797, "step": 25850 }, { "epoch": 2.6341239947063015, "grad_norm": 5.181350231170654, "learning_rate": 1.0519195529715178e-05, "loss": 0.0891, "step": 25875 }, { "epoch": 2.636669042044182, "grad_norm": 5.08585262298584, "learning_rate": 1.0507884080266046e-05, "loss": 0.101, "step": 25900 }, { "epoch": 2.6392140893820626, "grad_norm": 3.8872954845428467, "learning_rate": 1.0496572630816915e-05, "loss": 0.0651, "step": 25925 }, { "epoch": 2.641759136719943, "grad_norm": 2.1056861877441406, "learning_rate": 1.048526118136778e-05, "loss": 0.0778, "step": 25950 }, { "epoch": 2.6443041840578236, "grad_norm": 4.589896202087402, "learning_rate": 1.047394973191865e-05, "loss": 0.0976, "step": 25975 }, { "epoch": 2.646849231395704, "grad_norm": 1.1529279947280884, "learning_rate": 1.0462638282469517e-05, "loss": 0.0872, "step": 26000 }, { "epoch": 2.6493942787335847, "grad_norm": 8.964022636413574, "learning_rate": 1.0451326833020383e-05, "loss": 0.0974, "step": 26025 }, { "epoch": 2.651939326071465, "grad_norm": 0.959297776222229, "learning_rate": 1.0440015383571251e-05, "loss": 0.1183, "step": 26050 }, { "epoch": 2.6544843734093453, "grad_norm": 3.5030341148376465, "learning_rate": 1.0428703934122119e-05, "loss": 0.1122, "step": 26075 }, { "epoch": 2.657029420747226, "grad_norm": 3.678330659866333, "learning_rate": 1.0417392484672989e-05, "loss": 0.1087, "step": 26100 }, { "epoch": 2.6595744680851063, "grad_norm": 5.365817546844482, "learning_rate": 1.0406081035223853e-05, "loss": 0.1147, "step": 26125 }, { "epoch": 2.6621195154229866, "grad_norm": 5.787365436553955, "learning_rate": 1.0394769585774723e-05, "loss": 0.1113, "step": 26150 }, { "epoch": 2.6646645627608674, "grad_norm": 0.9475263953208923, "learning_rate": 1.038345813632559e-05, "loss": 0.0729, "step": 26175 }, { "epoch": 2.6672096100987477, "grad_norm": 2.679917573928833, "learning_rate": 1.0372146686876457e-05, "loss": 0.0869, "step": 26200 }, { "epoch": 2.6697546574366284, "grad_norm": 5.277957916259766, "learning_rate": 1.0360835237427325e-05, "loss": 0.1159, "step": 26225 }, { "epoch": 2.6722997047745087, "grad_norm": 2.8972973823547363, "learning_rate": 1.0349976245956157e-05, "loss": 0.0984, "step": 26250 }, { "epoch": 2.6748447521123895, "grad_norm": 7.060084819793701, "learning_rate": 1.0338664796507025e-05, "loss": 0.1173, "step": 26275 }, { "epoch": 2.67738979945027, "grad_norm": 3.6646735668182373, "learning_rate": 1.0327353347057893e-05, "loss": 0.0998, "step": 26300 }, { "epoch": 2.67993484678815, "grad_norm": 7.1751017570495605, "learning_rate": 1.0316041897608759e-05, "loss": 0.1257, "step": 26325 }, { "epoch": 2.682479894126031, "grad_norm": 4.124204158782959, "learning_rate": 1.0304730448159629e-05, "loss": 0.1221, "step": 26350 }, { "epoch": 2.685024941463911, "grad_norm": 2.890241861343384, "learning_rate": 1.0293418998710496e-05, "loss": 0.0779, "step": 26375 }, { "epoch": 2.687569988801792, "grad_norm": 3.6809606552124023, "learning_rate": 1.0282107549261364e-05, "loss": 0.1453, "step": 26400 }, { "epoch": 2.690115036139672, "grad_norm": 0.7839491963386536, "learning_rate": 1.027079609981223e-05, "loss": 0.1117, "step": 26425 }, { "epoch": 2.692660083477553, "grad_norm": 4.305838584899902, "learning_rate": 1.0259484650363098e-05, "loss": 0.1391, "step": 26450 }, { "epoch": 2.6952051308154332, "grad_norm": 5.504796504974365, "learning_rate": 1.0248173200913966e-05, "loss": 0.0844, "step": 26475 }, { "epoch": 2.6977501781533135, "grad_norm": 1.3983299732208252, "learning_rate": 1.0236861751464832e-05, "loss": 0.1011, "step": 26500 }, { "epoch": 2.7002952254911943, "grad_norm": 3.5293939113616943, "learning_rate": 1.0225550302015702e-05, "loss": 0.0949, "step": 26525 }, { "epoch": 2.7028402728290746, "grad_norm": 0.8638243675231934, "learning_rate": 1.021423885256657e-05, "loss": 0.0631, "step": 26550 }, { "epoch": 2.705385320166955, "grad_norm": 7.877193450927734, "learning_rate": 1.0202927403117436e-05, "loss": 0.1002, "step": 26575 }, { "epoch": 2.7079303675048356, "grad_norm": 4.632683753967285, "learning_rate": 1.0191615953668304e-05, "loss": 0.0856, "step": 26600 }, { "epoch": 2.710475414842716, "grad_norm": 2.3460853099823, "learning_rate": 1.0180304504219171e-05, "loss": 0.1046, "step": 26625 }, { "epoch": 2.7130204621805967, "grad_norm": 4.212901592254639, "learning_rate": 1.016899305477004e-05, "loss": 0.1039, "step": 26650 }, { "epoch": 2.715565509518477, "grad_norm": 5.075952053070068, "learning_rate": 1.0157681605320905e-05, "loss": 0.1168, "step": 26675 }, { "epoch": 2.7181105568563577, "grad_norm": 3.706630229949951, "learning_rate": 1.0146370155871775e-05, "loss": 0.1187, "step": 26700 }, { "epoch": 2.720655604194238, "grad_norm": 8.349642753601074, "learning_rate": 1.0135058706422643e-05, "loss": 0.1226, "step": 26725 }, { "epoch": 2.7232006515321183, "grad_norm": 7.56433629989624, "learning_rate": 1.0123747256973509e-05, "loss": 0.1012, "step": 26750 }, { "epoch": 2.725745698869999, "grad_norm": 5.832910060882568, "learning_rate": 1.0112435807524377e-05, "loss": 0.106, "step": 26775 }, { "epoch": 2.7282907462078794, "grad_norm": 7.008131504058838, "learning_rate": 1.0101124358075245e-05, "loss": 0.0892, "step": 26800 }, { "epoch": 2.73083579354576, "grad_norm": 4.23357629776001, "learning_rate": 1.0089812908626113e-05, "loss": 0.0899, "step": 26825 }, { "epoch": 2.7333808408836404, "grad_norm": 2.169849395751953, "learning_rate": 1.0078501459176979e-05, "loss": 0.105, "step": 26850 }, { "epoch": 2.735925888221521, "grad_norm": 4.076704025268555, "learning_rate": 1.0067190009727848e-05, "loss": 0.0766, "step": 26875 }, { "epoch": 2.7384709355594015, "grad_norm": 4.680348873138428, "learning_rate": 1.0055878560278716e-05, "loss": 0.0855, "step": 26900 }, { "epoch": 2.7410159828972818, "grad_norm": 2.6306278705596924, "learning_rate": 1.0044567110829582e-05, "loss": 0.127, "step": 26925 }, { "epoch": 2.7435610302351625, "grad_norm": 6.218433380126953, "learning_rate": 1.003325566138045e-05, "loss": 0.1265, "step": 26950 }, { "epoch": 2.746106077573043, "grad_norm": 3.8491039276123047, "learning_rate": 1.0021944211931318e-05, "loss": 0.1109, "step": 26975 }, { "epoch": 2.748651124910923, "grad_norm": 4.13882303237915, "learning_rate": 1.0010632762482184e-05, "loss": 0.1001, "step": 27000 }, { "epoch": 2.751196172248804, "grad_norm": 2.967843770980835, "learning_rate": 9.999321313033052e-06, "loss": 0.1056, "step": 27025 }, { "epoch": 2.753741219586684, "grad_norm": 1.7531013488769531, "learning_rate": 9.988009863583921e-06, "loss": 0.1146, "step": 27050 }, { "epoch": 2.756286266924565, "grad_norm": 3.6976428031921387, "learning_rate": 9.976698414134788e-06, "loss": 0.1213, "step": 27075 }, { "epoch": 2.758831314262445, "grad_norm": 4.366905212402344, "learning_rate": 9.965386964685655e-06, "loss": 0.084, "step": 27100 }, { "epoch": 2.761376361600326, "grad_norm": 0.365721732378006, "learning_rate": 9.954075515236523e-06, "loss": 0.0955, "step": 27125 }, { "epoch": 2.7639214089382063, "grad_norm": 3.8855741024017334, "learning_rate": 9.94276406578739e-06, "loss": 0.0983, "step": 27150 }, { "epoch": 2.7664664562760866, "grad_norm": 2.5487568378448486, "learning_rate": 9.931452616338259e-06, "loss": 0.0997, "step": 27175 }, { "epoch": 2.7690115036139673, "grad_norm": 2.598275661468506, "learning_rate": 9.920141166889125e-06, "loss": 0.1203, "step": 27200 }, { "epoch": 2.7715565509518476, "grad_norm": 7.30168342590332, "learning_rate": 9.908829717439995e-06, "loss": 0.1122, "step": 27225 }, { "epoch": 2.7741015982897284, "grad_norm": 6.6037468910217285, "learning_rate": 9.89751826799086e-06, "loss": 0.114, "step": 27250 }, { "epoch": 2.7766466456276087, "grad_norm": 3.6901676654815674, "learning_rate": 9.886206818541729e-06, "loss": 0.0957, "step": 27275 }, { "epoch": 2.7791916929654894, "grad_norm": 1.6863963603973389, "learning_rate": 9.874895369092597e-06, "loss": 0.1075, "step": 27300 }, { "epoch": 2.7817367403033697, "grad_norm": 2.468958854675293, "learning_rate": 9.863583919643464e-06, "loss": 0.0911, "step": 27325 }, { "epoch": 2.78428178764125, "grad_norm": 2.852034330368042, "learning_rate": 9.852272470194332e-06, "loss": 0.1005, "step": 27350 }, { "epoch": 2.7868268349791308, "grad_norm": 2.5854666233062744, "learning_rate": 9.840961020745198e-06, "loss": 0.0996, "step": 27375 }, { "epoch": 2.789371882317011, "grad_norm": 6.25841760635376, "learning_rate": 9.829649571296068e-06, "loss": 0.1035, "step": 27400 }, { "epoch": 2.7919169296548914, "grad_norm": 6.368521690368652, "learning_rate": 9.818338121846934e-06, "loss": 0.1205, "step": 27425 }, { "epoch": 2.794461976992772, "grad_norm": 1.1734464168548584, "learning_rate": 9.807026672397802e-06, "loss": 0.1068, "step": 27450 }, { "epoch": 2.7970070243306524, "grad_norm": 5.702926158905029, "learning_rate": 9.79571522294867e-06, "loss": 0.0723, "step": 27475 }, { "epoch": 2.799552071668533, "grad_norm": 5.801497459411621, "learning_rate": 9.784403773499538e-06, "loss": 0.0965, "step": 27500 }, { "epoch": 2.8020971190064135, "grad_norm": 3.9957456588745117, "learning_rate": 9.773092324050405e-06, "loss": 0.0719, "step": 27525 }, { "epoch": 2.804642166344294, "grad_norm": 1.4273048639297485, "learning_rate": 9.761780874601272e-06, "loss": 0.0983, "step": 27550 }, { "epoch": 2.8071872136821745, "grad_norm": 5.444447994232178, "learning_rate": 9.750469425152141e-06, "loss": 0.1104, "step": 27575 }, { "epoch": 2.809732261020055, "grad_norm": 2.195810556411743, "learning_rate": 9.739157975703007e-06, "loss": 0.0842, "step": 27600 }, { "epoch": 2.8122773083579355, "grad_norm": 0.5844478011131287, "learning_rate": 9.727846526253875e-06, "loss": 0.0925, "step": 27625 }, { "epoch": 2.814822355695816, "grad_norm": 5.637092113494873, "learning_rate": 9.716535076804743e-06, "loss": 0.096, "step": 27650 }, { "epoch": 2.817367403033696, "grad_norm": 4.654635906219482, "learning_rate": 9.70522362735561e-06, "loss": 0.1455, "step": 27675 }, { "epoch": 2.819912450371577, "grad_norm": 4.294269561767578, "learning_rate": 9.693912177906479e-06, "loss": 0.1088, "step": 27700 }, { "epoch": 2.8224574977094576, "grad_norm": 2.964021921157837, "learning_rate": 9.682600728457345e-06, "loss": 0.0763, "step": 27725 }, { "epoch": 2.825002545047338, "grad_norm": 3.658801317214966, "learning_rate": 9.671289279008213e-06, "loss": 0.0907, "step": 27750 }, { "epoch": 2.8275475923852182, "grad_norm": 0.45649853348731995, "learning_rate": 9.65997782955908e-06, "loss": 0.1006, "step": 27775 }, { "epoch": 2.830092639723099, "grad_norm": 1.596645474433899, "learning_rate": 9.648666380109948e-06, "loss": 0.0753, "step": 27800 }, { "epoch": 2.8326376870609793, "grad_norm": 0.9637457132339478, "learning_rate": 9.637354930660816e-06, "loss": 0.0974, "step": 27825 }, { "epoch": 2.8351827343988596, "grad_norm": 4.628258228302002, "learning_rate": 9.626043481211684e-06, "loss": 0.1121, "step": 27850 }, { "epoch": 2.8377277817367403, "grad_norm": 2.2486555576324463, "learning_rate": 9.61473203176255e-06, "loss": 0.0653, "step": 27875 }, { "epoch": 2.8402728290746206, "grad_norm": 3.0560920238494873, "learning_rate": 9.603420582313418e-06, "loss": 0.1054, "step": 27900 }, { "epoch": 2.8428178764125014, "grad_norm": 5.340658187866211, "learning_rate": 9.592109132864286e-06, "loss": 0.094, "step": 27925 }, { "epoch": 2.8453629237503817, "grad_norm": 5.649479389190674, "learning_rate": 9.580797683415154e-06, "loss": 0.1088, "step": 27950 }, { "epoch": 2.8479079710882624, "grad_norm": 2.4257004261016846, "learning_rate": 9.569486233966022e-06, "loss": 0.0947, "step": 27975 }, { "epoch": 2.8504530184261427, "grad_norm": 6.8244242668151855, "learning_rate": 9.55817478451689e-06, "loss": 0.1219, "step": 28000 }, { "epoch": 2.852998065764023, "grad_norm": 4.071915149688721, "learning_rate": 9.546863335067757e-06, "loss": 0.087, "step": 28025 }, { "epoch": 2.855543113101904, "grad_norm": 2.7061493396759033, "learning_rate": 9.535551885618623e-06, "loss": 0.1076, "step": 28050 }, { "epoch": 2.858088160439784, "grad_norm": 0.8928428888320923, "learning_rate": 9.524240436169491e-06, "loss": 0.1001, "step": 28075 }, { "epoch": 2.8606332077776644, "grad_norm": 3.2439606189727783, "learning_rate": 9.512928986720359e-06, "loss": 0.0919, "step": 28100 }, { "epoch": 2.863178255115545, "grad_norm": 2.2897791862487793, "learning_rate": 9.501617537271227e-06, "loss": 0.1167, "step": 28125 }, { "epoch": 2.865723302453426, "grad_norm": 4.564056873321533, "learning_rate": 9.490306087822095e-06, "loss": 0.0769, "step": 28150 }, { "epoch": 2.868268349791306, "grad_norm": 4.80610466003418, "learning_rate": 9.478994638372961e-06, "loss": 0.1048, "step": 28175 }, { "epoch": 2.8708133971291865, "grad_norm": 3.0985665321350098, "learning_rate": 9.46768318892383e-06, "loss": 0.0918, "step": 28200 }, { "epoch": 2.8733584444670672, "grad_norm": 5.210390567779541, "learning_rate": 9.456371739474697e-06, "loss": 0.1116, "step": 28225 }, { "epoch": 2.8759034918049475, "grad_norm": 3.7207789421081543, "learning_rate": 9.445060290025564e-06, "loss": 0.106, "step": 28250 }, { "epoch": 2.878448539142828, "grad_norm": 3.766141653060913, "learning_rate": 9.433748840576432e-06, "loss": 0.1243, "step": 28275 }, { "epoch": 2.8809935864807086, "grad_norm": 5.841989994049072, "learning_rate": 9.422437391127298e-06, "loss": 0.1119, "step": 28300 }, { "epoch": 2.883538633818589, "grad_norm": 2.617666244506836, "learning_rate": 9.411125941678168e-06, "loss": 0.0793, "step": 28325 }, { "epoch": 2.8860836811564696, "grad_norm": 11.287698745727539, "learning_rate": 9.399814492229034e-06, "loss": 0.0999, "step": 28350 }, { "epoch": 2.88862872849435, "grad_norm": 4.3712358474731445, "learning_rate": 9.388503042779904e-06, "loss": 0.0853, "step": 28375 }, { "epoch": 2.8911737758322307, "grad_norm": 3.98750376701355, "learning_rate": 9.37719159333077e-06, "loss": 0.1512, "step": 28400 }, { "epoch": 2.893718823170111, "grad_norm": 0.24144740402698517, "learning_rate": 9.365880143881638e-06, "loss": 0.1036, "step": 28425 }, { "epoch": 2.8962638705079913, "grad_norm": 7.1824727058410645, "learning_rate": 9.354568694432506e-06, "loss": 0.1061, "step": 28450 }, { "epoch": 2.898808917845872, "grad_norm": 4.1219162940979, "learning_rate": 9.343257244983372e-06, "loss": 0.1099, "step": 28475 }, { "epoch": 2.9013539651837523, "grad_norm": 2.997187614440918, "learning_rate": 9.331945795534241e-06, "loss": 0.1071, "step": 28500 }, { "epoch": 2.9038990125216326, "grad_norm": 3.1705572605133057, "learning_rate": 9.320634346085107e-06, "loss": 0.0869, "step": 28525 }, { "epoch": 2.9064440598595134, "grad_norm": 1.6288036108016968, "learning_rate": 9.309322896635977e-06, "loss": 0.0949, "step": 28550 }, { "epoch": 2.908989107197394, "grad_norm": 8.080781936645508, "learning_rate": 9.298011447186843e-06, "loss": 0.1113, "step": 28575 }, { "epoch": 2.9115341545352744, "grad_norm": 5.966607093811035, "learning_rate": 9.286699997737711e-06, "loss": 0.0832, "step": 28600 }, { "epoch": 2.9140792018731547, "grad_norm": 5.3897247314453125, "learning_rate": 9.275388548288579e-06, "loss": 0.0958, "step": 28625 }, { "epoch": 2.9166242492110355, "grad_norm": 1.9493873119354248, "learning_rate": 9.264077098839445e-06, "loss": 0.0913, "step": 28650 }, { "epoch": 2.9191692965489158, "grad_norm": 1.491591215133667, "learning_rate": 9.252765649390314e-06, "loss": 0.1015, "step": 28675 }, { "epoch": 2.921714343886796, "grad_norm": 4.423450946807861, "learning_rate": 9.24145419994118e-06, "loss": 0.0838, "step": 28700 }, { "epoch": 2.924259391224677, "grad_norm": 1.92838716506958, "learning_rate": 9.23014275049205e-06, "loss": 0.0823, "step": 28725 }, { "epoch": 2.926804438562557, "grad_norm": 0.5521523952484131, "learning_rate": 9.218831301042916e-06, "loss": 0.0754, "step": 28750 }, { "epoch": 2.929349485900438, "grad_norm": 5.309305667877197, "learning_rate": 9.207519851593784e-06, "loss": 0.0894, "step": 28775 }, { "epoch": 2.931894533238318, "grad_norm": 0.7912878394126892, "learning_rate": 9.196208402144652e-06, "loss": 0.0937, "step": 28800 }, { "epoch": 2.934439580576199, "grad_norm": 0.36283740401268005, "learning_rate": 9.184896952695518e-06, "loss": 0.0749, "step": 28825 }, { "epoch": 2.936984627914079, "grad_norm": 4.372325420379639, "learning_rate": 9.173585503246388e-06, "loss": 0.0965, "step": 28850 }, { "epoch": 2.9395296752519595, "grad_norm": 1.6252275705337524, "learning_rate": 9.162274053797254e-06, "loss": 0.1027, "step": 28875 }, { "epoch": 2.9420747225898403, "grad_norm": 3.252793073654175, "learning_rate": 9.150962604348122e-06, "loss": 0.0908, "step": 28900 }, { "epoch": 2.9446197699277206, "grad_norm": 7.335224628448486, "learning_rate": 9.140103612876954e-06, "loss": 0.0792, "step": 28925 }, { "epoch": 2.947164817265601, "grad_norm": 6.060163974761963, "learning_rate": 9.129244621405787e-06, "loss": 0.0602, "step": 28950 }, { "epoch": 2.9497098646034816, "grad_norm": 4.286234378814697, "learning_rate": 9.117933171956655e-06, "loss": 0.1196, "step": 28975 }, { "epoch": 2.9522549119413624, "grad_norm": 2.4993934631347656, "learning_rate": 9.106621722507523e-06, "loss": 0.0894, "step": 29000 }, { "epoch": 2.9547999592792427, "grad_norm": 5.36308479309082, "learning_rate": 9.09531027305839e-06, "loss": 0.0854, "step": 29025 }, { "epoch": 2.957345006617123, "grad_norm": 1.1274926662445068, "learning_rate": 9.083998823609258e-06, "loss": 0.1062, "step": 29050 }, { "epoch": 2.9598900539550037, "grad_norm": 0.503273606300354, "learning_rate": 9.072687374160125e-06, "loss": 0.0947, "step": 29075 }, { "epoch": 2.962435101292884, "grad_norm": 6.292426586151123, "learning_rate": 9.061375924710994e-06, "loss": 0.1098, "step": 29100 }, { "epoch": 2.9649801486307643, "grad_norm": 3.6071207523345947, "learning_rate": 9.05006447526186e-06, "loss": 0.1041, "step": 29125 }, { "epoch": 2.967525195968645, "grad_norm": 3.5902364253997803, "learning_rate": 9.038753025812728e-06, "loss": 0.1027, "step": 29150 }, { "epoch": 2.9700702433065254, "grad_norm": 1.8288074731826782, "learning_rate": 9.027441576363596e-06, "loss": 0.0874, "step": 29175 }, { "epoch": 2.972615290644406, "grad_norm": 5.246890068054199, "learning_rate": 9.016130126914464e-06, "loss": 0.1144, "step": 29200 }, { "epoch": 2.9751603379822864, "grad_norm": 2.4337267875671387, "learning_rate": 9.004818677465332e-06, "loss": 0.0763, "step": 29225 }, { "epoch": 2.977705385320167, "grad_norm": 10.622210502624512, "learning_rate": 8.993507228016198e-06, "loss": 0.0686, "step": 29250 }, { "epoch": 2.9802504326580475, "grad_norm": 6.085145950317383, "learning_rate": 8.982195778567067e-06, "loss": 0.095, "step": 29275 }, { "epoch": 2.9827954799959278, "grad_norm": 5.365617752075195, "learning_rate": 8.970884329117933e-06, "loss": 0.0624, "step": 29300 }, { "epoch": 2.9853405273338085, "grad_norm": 6.674490451812744, "learning_rate": 8.959572879668801e-06, "loss": 0.0654, "step": 29325 }, { "epoch": 2.987885574671689, "grad_norm": 2.957017421722412, "learning_rate": 8.948261430219669e-06, "loss": 0.0963, "step": 29350 }, { "epoch": 2.990430622009569, "grad_norm": 6.887320518493652, "learning_rate": 8.936949980770537e-06, "loss": 0.0882, "step": 29375 }, { "epoch": 2.99297566934745, "grad_norm": 3.883092164993286, "learning_rate": 8.925638531321405e-06, "loss": 0.0863, "step": 29400 }, { "epoch": 2.9955207166853306, "grad_norm": 1.341077208518982, "learning_rate": 8.914327081872271e-06, "loss": 0.0833, "step": 29425 }, { "epoch": 2.998065764023211, "grad_norm": 6.598110198974609, "learning_rate": 8.90301563242314e-06, "loss": 0.1083, "step": 29450 }, { "epoch": 3.0, "eval_loss": 0.08900922536849976, "eval_runtime": 7.4605, "eval_samples_per_second": 912.542, "eval_steps_per_second": 14.342, "step": 29469 }, { "epoch": 3.000610811361091, "grad_norm": 5.234329700469971, "learning_rate": 8.891704182974007e-06, "loss": 0.0903, "step": 29475 }, { "epoch": 3.003155858698972, "grad_norm": 2.3512911796569824, "learning_rate": 8.880392733524874e-06, "loss": 0.0831, "step": 29500 }, { "epoch": 3.0057009060368522, "grad_norm": 5.458704471588135, "learning_rate": 8.869081284075742e-06, "loss": 0.0656, "step": 29525 }, { "epoch": 3.0082459533747326, "grad_norm": 2.52107834815979, "learning_rate": 8.85776983462661e-06, "loss": 0.1035, "step": 29550 }, { "epoch": 3.0107910007126133, "grad_norm": 1.5311448574066162, "learning_rate": 8.846458385177478e-06, "loss": 0.0727, "step": 29575 }, { "epoch": 3.0133360480504936, "grad_norm": 5.179780960083008, "learning_rate": 8.835146935728344e-06, "loss": 0.0761, "step": 29600 }, { "epoch": 3.0158810953883743, "grad_norm": 3.7667253017425537, "learning_rate": 8.823835486279214e-06, "loss": 0.0873, "step": 29625 }, { "epoch": 3.0184261427262546, "grad_norm": 4.558624744415283, "learning_rate": 8.81252403683008e-06, "loss": 0.1085, "step": 29650 }, { "epoch": 3.0209711900641354, "grad_norm": 3.4559326171875, "learning_rate": 8.801212587380948e-06, "loss": 0.0939, "step": 29675 }, { "epoch": 3.0235162374020157, "grad_norm": 6.828591346740723, "learning_rate": 8.789901137931816e-06, "loss": 0.0735, "step": 29700 }, { "epoch": 3.026061284739896, "grad_norm": 2.5354812145233154, "learning_rate": 8.778589688482683e-06, "loss": 0.0719, "step": 29725 }, { "epoch": 3.0286063320777767, "grad_norm": 1.035030484199524, "learning_rate": 8.767278239033551e-06, "loss": 0.0957, "step": 29750 }, { "epoch": 3.031151379415657, "grad_norm": 2.077613592147827, "learning_rate": 8.755966789584417e-06, "loss": 0.0942, "step": 29775 }, { "epoch": 3.033696426753538, "grad_norm": 2.0107665061950684, "learning_rate": 8.744655340135285e-06, "loss": 0.0697, "step": 29800 }, { "epoch": 3.036241474091418, "grad_norm": 3.822760581970215, "learning_rate": 8.733343890686153e-06, "loss": 0.0969, "step": 29825 }, { "epoch": 3.0387865214292984, "grad_norm": 1.4622304439544678, "learning_rate": 8.722032441237021e-06, "loss": 0.0733, "step": 29850 }, { "epoch": 3.041331568767179, "grad_norm": 4.076181411743164, "learning_rate": 8.710720991787889e-06, "loss": 0.0528, "step": 29875 }, { "epoch": 3.0438766161050594, "grad_norm": 1.466534972190857, "learning_rate": 8.699409542338757e-06, "loss": 0.0909, "step": 29900 }, { "epoch": 3.04642166344294, "grad_norm": 1.0620015859603882, "learning_rate": 8.688098092889624e-06, "loss": 0.0576, "step": 29925 }, { "epoch": 3.0489667107808205, "grad_norm": 2.384843587875366, "learning_rate": 8.67678664344049e-06, "loss": 0.1137, "step": 29950 }, { "epoch": 3.051511758118701, "grad_norm": 4.1187334060668945, "learning_rate": 8.665475193991358e-06, "loss": 0.0854, "step": 29975 }, { "epoch": 3.0540568054565815, "grad_norm": 3.1828019618988037, "learning_rate": 8.654163744542226e-06, "loss": 0.0794, "step": 30000 }, { "epoch": 3.056601852794462, "grad_norm": 1.8246716260910034, "learning_rate": 8.642852295093094e-06, "loss": 0.0612, "step": 30025 }, { "epoch": 3.0591469001323426, "grad_norm": 6.157846450805664, "learning_rate": 8.631540845643962e-06, "loss": 0.0834, "step": 30050 }, { "epoch": 3.061691947470223, "grad_norm": 5.81326961517334, "learning_rate": 8.62022939619483e-06, "loss": 0.0833, "step": 30075 }, { "epoch": 3.0642369948081036, "grad_norm": 8.527436256408691, "learning_rate": 8.608917946745696e-06, "loss": 0.1205, "step": 30100 }, { "epoch": 3.066782042145984, "grad_norm": 2.045522928237915, "learning_rate": 8.597606497296564e-06, "loss": 0.079, "step": 30125 }, { "epoch": 3.0693270894838642, "grad_norm": 4.061382293701172, "learning_rate": 8.586295047847432e-06, "loss": 0.0598, "step": 30150 }, { "epoch": 3.071872136821745, "grad_norm": 1.9634780883789062, "learning_rate": 8.5749835983983e-06, "loss": 0.0794, "step": 30175 }, { "epoch": 3.0744171841596253, "grad_norm": 2.8861138820648193, "learning_rate": 8.563672148949167e-06, "loss": 0.079, "step": 30200 }, { "epoch": 3.076962231497506, "grad_norm": 4.507253170013428, "learning_rate": 8.552360699500034e-06, "loss": 0.0752, "step": 30225 }, { "epoch": 3.0795072788353863, "grad_norm": 12.6360445022583, "learning_rate": 8.541049250050903e-06, "loss": 0.1178, "step": 30250 }, { "epoch": 3.0820523261732666, "grad_norm": 5.043088436126709, "learning_rate": 8.52973780060177e-06, "loss": 0.1003, "step": 30275 }, { "epoch": 3.0845973735111474, "grad_norm": 1.5982271432876587, "learning_rate": 8.518426351152637e-06, "loss": 0.0765, "step": 30300 }, { "epoch": 3.0871424208490277, "grad_norm": 4.377776622772217, "learning_rate": 8.507114901703505e-06, "loss": 0.0984, "step": 30325 }, { "epoch": 3.0896874681869084, "grad_norm": 2.552151679992676, "learning_rate": 8.495803452254373e-06, "loss": 0.0629, "step": 30350 }, { "epoch": 3.0922325155247887, "grad_norm": 8.379557609558105, "learning_rate": 8.48449200280524e-06, "loss": 0.059, "step": 30375 }, { "epoch": 3.094777562862669, "grad_norm": 0.7588167786598206, "learning_rate": 8.473180553356107e-06, "loss": 0.0763, "step": 30400 }, { "epoch": 3.0973226102005498, "grad_norm": 5.55032205581665, "learning_rate": 8.461869103906976e-06, "loss": 0.0731, "step": 30425 }, { "epoch": 3.09986765753843, "grad_norm": 1.346492052078247, "learning_rate": 8.450557654457842e-06, "loss": 0.0777, "step": 30450 }, { "epoch": 3.102412704876311, "grad_norm": 6.685083389282227, "learning_rate": 8.43924620500871e-06, "loss": 0.08, "step": 30475 }, { "epoch": 3.104957752214191, "grad_norm": 2.1243855953216553, "learning_rate": 8.427934755559578e-06, "loss": 0.0775, "step": 30500 }, { "epoch": 3.107502799552072, "grad_norm": 5.800358772277832, "learning_rate": 8.416623306110446e-06, "loss": 0.0833, "step": 30525 }, { "epoch": 3.110047846889952, "grad_norm": 3.737163782119751, "learning_rate": 8.405311856661314e-06, "loss": 0.0905, "step": 30550 }, { "epoch": 3.1125928942278325, "grad_norm": 0.30908119678497314, "learning_rate": 8.39400040721218e-06, "loss": 0.0782, "step": 30575 }, { "epoch": 3.115137941565713, "grad_norm": 4.908120155334473, "learning_rate": 8.38268895776305e-06, "loss": 0.0613, "step": 30600 }, { "epoch": 3.1176829889035935, "grad_norm": 2.8786909580230713, "learning_rate": 8.371377508313916e-06, "loss": 0.0618, "step": 30625 }, { "epoch": 3.1202280362414743, "grad_norm": 4.245732307434082, "learning_rate": 8.360066058864783e-06, "loss": 0.0579, "step": 30650 }, { "epoch": 3.1227730835793546, "grad_norm": 1.4124987125396729, "learning_rate": 8.348754609415651e-06, "loss": 0.0649, "step": 30675 }, { "epoch": 3.125318130917235, "grad_norm": 5.46633243560791, "learning_rate": 8.33744315996652e-06, "loss": 0.0777, "step": 30700 }, { "epoch": 3.1278631782551156, "grad_norm": 2.1597933769226074, "learning_rate": 8.326131710517387e-06, "loss": 0.0882, "step": 30725 }, { "epoch": 3.130408225592996, "grad_norm": 1.188536524772644, "learning_rate": 8.314820261068253e-06, "loss": 0.0792, "step": 30750 }, { "epoch": 3.1329532729308767, "grad_norm": 3.5105278491973877, "learning_rate": 8.303508811619123e-06, "loss": 0.0673, "step": 30775 }, { "epoch": 3.135498320268757, "grad_norm": 1.3239123821258545, "learning_rate": 8.292197362169989e-06, "loss": 0.0644, "step": 30800 }, { "epoch": 3.1380433676066373, "grad_norm": 0.6118612885475159, "learning_rate": 8.280885912720857e-06, "loss": 0.0681, "step": 30825 }, { "epoch": 3.140588414944518, "grad_norm": 3.204726457595825, "learning_rate": 8.269574463271725e-06, "loss": 0.077, "step": 30850 }, { "epoch": 3.1431334622823983, "grad_norm": 7.448500156402588, "learning_rate": 8.258263013822592e-06, "loss": 0.0897, "step": 30875 }, { "epoch": 3.145678509620279, "grad_norm": 6.173210144042969, "learning_rate": 8.24695156437346e-06, "loss": 0.0906, "step": 30900 }, { "epoch": 3.1482235569581594, "grad_norm": 1.42446768283844, "learning_rate": 8.235640114924326e-06, "loss": 0.0698, "step": 30925 }, { "epoch": 3.15076860429604, "grad_norm": 2.045053482055664, "learning_rate": 8.224328665475194e-06, "loss": 0.0682, "step": 30950 }, { "epoch": 3.1533136516339204, "grad_norm": 5.3061442375183105, "learning_rate": 8.213017216026062e-06, "loss": 0.1129, "step": 30975 }, { "epoch": 3.1558586989718007, "grad_norm": 0.9427996873855591, "learning_rate": 8.20170576657693e-06, "loss": 0.0822, "step": 31000 }, { "epoch": 3.1584037463096815, "grad_norm": 3.678173542022705, "learning_rate": 8.190394317127798e-06, "loss": 0.077, "step": 31025 }, { "epoch": 3.1609487936475618, "grad_norm": 1.559303641319275, "learning_rate": 8.179082867678666e-06, "loss": 0.0681, "step": 31050 }, { "epoch": 3.1634938409854425, "grad_norm": 9.293242454528809, "learning_rate": 8.167771418229533e-06, "loss": 0.0791, "step": 31075 }, { "epoch": 3.166038888323323, "grad_norm": 2.898684024810791, "learning_rate": 8.1564599687804e-06, "loss": 0.0937, "step": 31100 }, { "epoch": 3.168583935661203, "grad_norm": 0.6863572001457214, "learning_rate": 8.145148519331267e-06, "loss": 0.0911, "step": 31125 }, { "epoch": 3.171128982999084, "grad_norm": 1.8808056116104126, "learning_rate": 8.133837069882135e-06, "loss": 0.0686, "step": 31150 }, { "epoch": 3.173674030336964, "grad_norm": 0.8487688899040222, "learning_rate": 8.122525620433003e-06, "loss": 0.0677, "step": 31175 }, { "epoch": 3.176219077674845, "grad_norm": 2.2487783432006836, "learning_rate": 8.111214170983871e-06, "loss": 0.0722, "step": 31200 }, { "epoch": 3.178764125012725, "grad_norm": 2.0429940223693848, "learning_rate": 8.099902721534739e-06, "loss": 0.1115, "step": 31225 }, { "epoch": 3.1813091723506055, "grad_norm": 3.094607353210449, "learning_rate": 8.088591272085605e-06, "loss": 0.1071, "step": 31250 }, { "epoch": 3.1838542196884863, "grad_norm": 0.322568416595459, "learning_rate": 8.077279822636473e-06, "loss": 0.0717, "step": 31275 }, { "epoch": 3.1863992670263666, "grad_norm": 2.278010845184326, "learning_rate": 8.06596837318734e-06, "loss": 0.0865, "step": 31300 }, { "epoch": 3.1889443143642473, "grad_norm": 1.3545632362365723, "learning_rate": 8.054656923738209e-06, "loss": 0.0756, "step": 31325 }, { "epoch": 3.1914893617021276, "grad_norm": 4.890456199645996, "learning_rate": 8.043345474289076e-06, "loss": 0.0894, "step": 31350 }, { "epoch": 3.1940344090400083, "grad_norm": 3.303328514099121, "learning_rate": 8.032034024839943e-06, "loss": 0.0853, "step": 31375 }, { "epoch": 3.1965794563778887, "grad_norm": 0.5776151418685913, "learning_rate": 8.020722575390812e-06, "loss": 0.081, "step": 31400 }, { "epoch": 3.199124503715769, "grad_norm": 5.187852382659912, "learning_rate": 8.009411125941678e-06, "loss": 0.0774, "step": 31425 }, { "epoch": 3.2016695510536497, "grad_norm": 4.518702507019043, "learning_rate": 7.998099676492546e-06, "loss": 0.0559, "step": 31450 }, { "epoch": 3.20421459839153, "grad_norm": 4.617893218994141, "learning_rate": 7.986788227043414e-06, "loss": 0.0731, "step": 31475 }, { "epoch": 3.2067596457294107, "grad_norm": 0.9509522914886475, "learning_rate": 7.975476777594282e-06, "loss": 0.0882, "step": 31500 }, { "epoch": 3.209304693067291, "grad_norm": 5.752696514129639, "learning_rate": 7.96416532814515e-06, "loss": 0.1022, "step": 31525 }, { "epoch": 3.2118497404051713, "grad_norm": 2.312471389770508, "learning_rate": 7.952853878696016e-06, "loss": 0.1257, "step": 31550 }, { "epoch": 3.214394787743052, "grad_norm": 0.9991899132728577, "learning_rate": 7.941542429246885e-06, "loss": 0.0745, "step": 31575 }, { "epoch": 3.2169398350809324, "grad_norm": 1.8518400192260742, "learning_rate": 7.930230979797751e-06, "loss": 0.0905, "step": 31600 }, { "epoch": 3.219484882418813, "grad_norm": 1.001679539680481, "learning_rate": 7.91891953034862e-06, "loss": 0.0764, "step": 31625 }, { "epoch": 3.2220299297566934, "grad_norm": 1.2771847248077393, "learning_rate": 7.907608080899487e-06, "loss": 0.0649, "step": 31650 }, { "epoch": 3.2245749770945737, "grad_norm": 1.6247678995132446, "learning_rate": 7.896296631450355e-06, "loss": 0.0843, "step": 31675 }, { "epoch": 3.2271200244324545, "grad_norm": 2.9473443031311035, "learning_rate": 7.884985182001223e-06, "loss": 0.1045, "step": 31700 }, { "epoch": 3.229665071770335, "grad_norm": 0.8892804980278015, "learning_rate": 7.873673732552089e-06, "loss": 0.0714, "step": 31725 }, { "epoch": 3.2322101191082155, "grad_norm": 1.752219796180725, "learning_rate": 7.862362283102959e-06, "loss": 0.0999, "step": 31750 }, { "epoch": 3.234755166446096, "grad_norm": 2.873009443283081, "learning_rate": 7.851050833653825e-06, "loss": 0.0779, "step": 31775 }, { "epoch": 3.2373002137839766, "grad_norm": 2.4850993156433105, "learning_rate": 7.839739384204693e-06, "loss": 0.0622, "step": 31800 }, { "epoch": 3.239845261121857, "grad_norm": 0.40809884667396545, "learning_rate": 7.82842793475556e-06, "loss": 0.0775, "step": 31825 }, { "epoch": 3.242390308459737, "grad_norm": 2.4178569316864014, "learning_rate": 7.817116485306428e-06, "loss": 0.0738, "step": 31850 }, { "epoch": 3.244935355797618, "grad_norm": 3.458000421524048, "learning_rate": 7.805805035857296e-06, "loss": 0.092, "step": 31875 }, { "epoch": 3.2474804031354982, "grad_norm": 2.608797788619995, "learning_rate": 7.794493586408162e-06, "loss": 0.0772, "step": 31900 }, { "epoch": 3.250025450473379, "grad_norm": 1.444105863571167, "learning_rate": 7.783182136959032e-06, "loss": 0.063, "step": 31925 }, { "epoch": 3.2525704978112593, "grad_norm": 2.722048759460449, "learning_rate": 7.771870687509898e-06, "loss": 0.0831, "step": 31950 }, { "epoch": 3.2551155451491396, "grad_norm": 2.0308265686035156, "learning_rate": 7.760559238060766e-06, "loss": 0.0805, "step": 31975 }, { "epoch": 3.2576605924870203, "grad_norm": 1.9619109630584717, "learning_rate": 7.749247788611634e-06, "loss": 0.0866, "step": 32000 }, { "epoch": 3.2602056398249006, "grad_norm": 1.6465561389923096, "learning_rate": 7.737936339162501e-06, "loss": 0.0968, "step": 32025 }, { "epoch": 3.2627506871627814, "grad_norm": 1.229393720626831, "learning_rate": 7.72662488971337e-06, "loss": 0.0701, "step": 32050 }, { "epoch": 3.2652957345006617, "grad_norm": 3.8762965202331543, "learning_rate": 7.715313440264235e-06, "loss": 0.0815, "step": 32075 }, { "epoch": 3.267840781838542, "grad_norm": 1.9568135738372803, "learning_rate": 7.704001990815103e-06, "loss": 0.0718, "step": 32100 }, { "epoch": 3.2703858291764227, "grad_norm": 3.7460007667541504, "learning_rate": 7.692690541365971e-06, "loss": 0.09, "step": 32125 }, { "epoch": 3.272930876514303, "grad_norm": 0.3636890649795532, "learning_rate": 7.681379091916839e-06, "loss": 0.0874, "step": 32150 }, { "epoch": 3.275475923852184, "grad_norm": 4.578624725341797, "learning_rate": 7.670067642467707e-06, "loss": 0.0717, "step": 32175 }, { "epoch": 3.278020971190064, "grad_norm": 7.152780532836914, "learning_rate": 7.658756193018575e-06, "loss": 0.065, "step": 32200 }, { "epoch": 3.280566018527945, "grad_norm": 2.064504384994507, "learning_rate": 7.647444743569442e-06, "loss": 0.08, "step": 32225 }, { "epoch": 3.283111065865825, "grad_norm": 0.4257572293281555, "learning_rate": 7.636133294120309e-06, "loss": 0.0773, "step": 32250 }, { "epoch": 3.2856561132037054, "grad_norm": 0.999352216720581, "learning_rate": 7.6248218446711765e-06, "loss": 0.0728, "step": 32275 }, { "epoch": 3.288201160541586, "grad_norm": 2.5123298168182373, "learning_rate": 7.613510395222044e-06, "loss": 0.0756, "step": 32300 }, { "epoch": 3.2907462078794665, "grad_norm": 6.116274833679199, "learning_rate": 7.602198945772911e-06, "loss": 0.0733, "step": 32325 }, { "epoch": 3.2932912552173472, "grad_norm": 7.821257591247559, "learning_rate": 7.59088749632378e-06, "loss": 0.0616, "step": 32350 }, { "epoch": 3.2958363025552275, "grad_norm": 4.588683128356934, "learning_rate": 7.579576046874647e-06, "loss": 0.0935, "step": 32375 }, { "epoch": 3.298381349893108, "grad_norm": 7.06011962890625, "learning_rate": 7.568264597425514e-06, "loss": 0.0604, "step": 32400 }, { "epoch": 3.3009263972309886, "grad_norm": 5.7961201667785645, "learning_rate": 7.556953147976383e-06, "loss": 0.0774, "step": 32425 }, { "epoch": 3.303471444568869, "grad_norm": 1.789345383644104, "learning_rate": 7.54564169852725e-06, "loss": 0.0879, "step": 32450 }, { "epoch": 3.3060164919067496, "grad_norm": 5.761059284210205, "learning_rate": 7.5343302490781176e-06, "loss": 0.1144, "step": 32475 }, { "epoch": 3.30856153924463, "grad_norm": 1.1593629121780396, "learning_rate": 7.5230187996289846e-06, "loss": 0.0879, "step": 32500 }, { "epoch": 3.3111065865825102, "grad_norm": 1.7188637256622314, "learning_rate": 7.511707350179853e-06, "loss": 0.0765, "step": 32525 }, { "epoch": 3.313651633920391, "grad_norm": 0.9226068258285522, "learning_rate": 7.50039590073072e-06, "loss": 0.063, "step": 32550 }, { "epoch": 3.3161966812582713, "grad_norm": 5.403501987457275, "learning_rate": 7.489084451281587e-06, "loss": 0.0773, "step": 32575 }, { "epoch": 3.318741728596152, "grad_norm": 3.45514178276062, "learning_rate": 7.477773001832456e-06, "loss": 0.0614, "step": 32600 }, { "epoch": 3.3212867759340323, "grad_norm": 3.7157094478607178, "learning_rate": 7.466461552383323e-06, "loss": 0.0876, "step": 32625 }, { "epoch": 3.323831823271913, "grad_norm": 5.224157333374023, "learning_rate": 7.455150102934191e-06, "loss": 0.0801, "step": 32650 }, { "epoch": 3.3263768706097934, "grad_norm": 5.631719589233398, "learning_rate": 7.443838653485058e-06, "loss": 0.0864, "step": 32675 }, { "epoch": 3.3289219179476737, "grad_norm": 1.3295774459838867, "learning_rate": 7.432527204035926e-06, "loss": 0.0991, "step": 32700 }, { "epoch": 3.3314669652855544, "grad_norm": 1.4614148139953613, "learning_rate": 7.4212157545867935e-06, "loss": 0.1021, "step": 32725 }, { "epoch": 3.3340120126234347, "grad_norm": 1.3813050985336304, "learning_rate": 7.4099043051376605e-06, "loss": 0.0542, "step": 32750 }, { "epoch": 3.3365570599613155, "grad_norm": 2.403815984725952, "learning_rate": 7.398592855688529e-06, "loss": 0.0896, "step": 32775 }, { "epoch": 3.3391021072991958, "grad_norm": 4.827250957489014, "learning_rate": 7.387281406239396e-06, "loss": 0.0828, "step": 32800 }, { "epoch": 3.341647154637076, "grad_norm": 5.572385311126709, "learning_rate": 7.375969956790263e-06, "loss": 0.0814, "step": 32825 }, { "epoch": 3.344192201974957, "grad_norm": 3.9175562858581543, "learning_rate": 7.364658507341131e-06, "loss": 0.0883, "step": 32850 }, { "epoch": 3.346737249312837, "grad_norm": 0.47550782561302185, "learning_rate": 7.353347057891999e-06, "loss": 0.0829, "step": 32875 }, { "epoch": 3.349282296650718, "grad_norm": 1.9550349712371826, "learning_rate": 7.342035608442867e-06, "loss": 0.0717, "step": 32900 }, { "epoch": 3.351827343988598, "grad_norm": 4.848787307739258, "learning_rate": 7.330724158993734e-06, "loss": 0.0727, "step": 32925 }, { "epoch": 3.3543723913264785, "grad_norm": 1.4837538003921509, "learning_rate": 7.319412709544602e-06, "loss": 0.0573, "step": 32950 }, { "epoch": 3.356917438664359, "grad_norm": 6.307371139526367, "learning_rate": 7.308101260095469e-06, "loss": 0.0789, "step": 32975 }, { "epoch": 3.3594624860022395, "grad_norm": 4.478782653808594, "learning_rate": 7.296789810646336e-06, "loss": 0.0763, "step": 33000 }, { "epoch": 3.3620075333401203, "grad_norm": 3.771304130554199, "learning_rate": 7.285478361197204e-06, "loss": 0.071, "step": 33025 }, { "epoch": 3.3645525806780006, "grad_norm": 4.51546049118042, "learning_rate": 7.274166911748072e-06, "loss": 0.0943, "step": 33050 }, { "epoch": 3.3670976280158813, "grad_norm": 1.119053840637207, "learning_rate": 7.26285546229894e-06, "loss": 0.0653, "step": 33075 }, { "epoch": 3.3696426753537616, "grad_norm": 4.0295305252075195, "learning_rate": 7.251544012849807e-06, "loss": 0.098, "step": 33100 }, { "epoch": 3.372187722691642, "grad_norm": 3.882275104522705, "learning_rate": 7.240232563400674e-06, "loss": 0.0764, "step": 33125 }, { "epoch": 3.3747327700295227, "grad_norm": 4.427306175231934, "learning_rate": 7.228921113951543e-06, "loss": 0.0687, "step": 33150 }, { "epoch": 3.377277817367403, "grad_norm": 2.3375508785247803, "learning_rate": 7.21760966450241e-06, "loss": 0.083, "step": 33175 }, { "epoch": 3.3798228647052837, "grad_norm": 1.8188954591751099, "learning_rate": 7.2062982150532774e-06, "loss": 0.0731, "step": 33200 }, { "epoch": 3.382367912043164, "grad_norm": 4.421236515045166, "learning_rate": 7.194986765604145e-06, "loss": 0.0699, "step": 33225 }, { "epoch": 3.3849129593810443, "grad_norm": 3.788590669631958, "learning_rate": 7.183675316155013e-06, "loss": 0.0962, "step": 33250 }, { "epoch": 3.387458006718925, "grad_norm": 2.9976346492767334, "learning_rate": 7.17236386670588e-06, "loss": 0.0867, "step": 33275 }, { "epoch": 3.3900030540568054, "grad_norm": 4.236015319824219, "learning_rate": 7.161052417256747e-06, "loss": 0.0845, "step": 33300 }, { "epoch": 3.392548101394686, "grad_norm": 4.394545555114746, "learning_rate": 7.149740967807616e-06, "loss": 0.0667, "step": 33325 }, { "epoch": 3.3950931487325664, "grad_norm": 1.5164837837219238, "learning_rate": 7.138429518358483e-06, "loss": 0.0715, "step": 33350 }, { "epoch": 3.3976381960704467, "grad_norm": 3.266295909881592, "learning_rate": 7.1271180689093515e-06, "loss": 0.113, "step": 33375 }, { "epoch": 3.4001832434083274, "grad_norm": 1.4110231399536133, "learning_rate": 7.1158066194602185e-06, "loss": 0.076, "step": 33400 }, { "epoch": 3.4027282907462078, "grad_norm": 5.217437267303467, "learning_rate": 7.1044951700110855e-06, "loss": 0.0642, "step": 33425 }, { "epoch": 3.4052733380840885, "grad_norm": 1.9967962503433228, "learning_rate": 7.093183720561953e-06, "loss": 0.0691, "step": 33450 }, { "epoch": 3.407818385421969, "grad_norm": 1.0126267671585083, "learning_rate": 7.08187227111282e-06, "loss": 0.0813, "step": 33475 }, { "epoch": 3.4103634327598495, "grad_norm": 3.0378081798553467, "learning_rate": 7.070560821663689e-06, "loss": 0.0828, "step": 33500 }, { "epoch": 3.41290848009773, "grad_norm": 3.2735512256622314, "learning_rate": 7.059249372214556e-06, "loss": 0.0724, "step": 33525 }, { "epoch": 3.41545352743561, "grad_norm": 1.570923089981079, "learning_rate": 7.047937922765423e-06, "loss": 0.0923, "step": 33550 }, { "epoch": 3.417998574773491, "grad_norm": 1.2357620000839233, "learning_rate": 7.036626473316292e-06, "loss": 0.0926, "step": 33575 }, { "epoch": 3.420543622111371, "grad_norm": 2.3194172382354736, "learning_rate": 7.025315023867159e-06, "loss": 0.0837, "step": 33600 }, { "epoch": 3.423088669449252, "grad_norm": 4.630713939666748, "learning_rate": 7.014003574418027e-06, "loss": 0.0948, "step": 33625 }, { "epoch": 3.4256337167871322, "grad_norm": 2.8066182136535645, "learning_rate": 7.0026921249688936e-06, "loss": 0.1054, "step": 33650 }, { "epoch": 3.4281787641250125, "grad_norm": 2.1490931510925293, "learning_rate": 6.991380675519762e-06, "loss": 0.0792, "step": 33675 }, { "epoch": 3.4307238114628933, "grad_norm": 2.8342697620391846, "learning_rate": 6.980069226070629e-06, "loss": 0.0857, "step": 33700 }, { "epoch": 3.4332688588007736, "grad_norm": 3.6628048419952393, "learning_rate": 6.968757776621496e-06, "loss": 0.0751, "step": 33725 }, { "epoch": 3.4358139061386543, "grad_norm": 0.3415294587612152, "learning_rate": 6.957446327172365e-06, "loss": 0.0847, "step": 33750 }, { "epoch": 3.4383589534765346, "grad_norm": 1.9966833591461182, "learning_rate": 6.946134877723232e-06, "loss": 0.0598, "step": 33775 }, { "epoch": 3.440904000814415, "grad_norm": 6.195132255554199, "learning_rate": 6.9348234282741e-06, "loss": 0.0697, "step": 33800 }, { "epoch": 3.4434490481522957, "grad_norm": 3.475865364074707, "learning_rate": 6.923511978824967e-06, "loss": 0.0943, "step": 33825 }, { "epoch": 3.445994095490176, "grad_norm": 2.3039791584014893, "learning_rate": 6.912200529375835e-06, "loss": 0.089, "step": 33850 }, { "epoch": 3.4485391428280567, "grad_norm": 5.273120880126953, "learning_rate": 6.901341537904668e-06, "loss": 0.0887, "step": 33875 }, { "epoch": 3.451084190165937, "grad_norm": 5.2622880935668945, "learning_rate": 6.890030088455535e-06, "loss": 0.0665, "step": 33900 }, { "epoch": 3.453629237503818, "grad_norm": 4.5153422355651855, "learning_rate": 6.878718639006403e-06, "loss": 0.0661, "step": 33925 }, { "epoch": 3.456174284841698, "grad_norm": 3.4902474880218506, "learning_rate": 6.86740718955727e-06, "loss": 0.0894, "step": 33950 }, { "epoch": 3.4587193321795784, "grad_norm": 1.9006632566452026, "learning_rate": 6.856095740108139e-06, "loss": 0.0913, "step": 33975 }, { "epoch": 3.461264379517459, "grad_norm": 1.2017245292663574, "learning_rate": 6.844784290659006e-06, "loss": 0.0917, "step": 34000 }, { "epoch": 3.4638094268553394, "grad_norm": 4.50941276550293, "learning_rate": 6.833472841209873e-06, "loss": 0.1008, "step": 34025 }, { "epoch": 3.46635447419322, "grad_norm": 8.09762191772461, "learning_rate": 6.822161391760741e-06, "loss": 0.0715, "step": 34050 }, { "epoch": 3.4688995215311005, "grad_norm": 6.90067720413208, "learning_rate": 6.810849942311608e-06, "loss": 0.0698, "step": 34075 }, { "epoch": 3.471444568868981, "grad_norm": 5.395306587219238, "learning_rate": 6.799538492862476e-06, "loss": 0.0722, "step": 34100 }, { "epoch": 3.4739896162068615, "grad_norm": 2.4700376987457275, "learning_rate": 6.788227043413343e-06, "loss": 0.0854, "step": 34125 }, { "epoch": 3.476534663544742, "grad_norm": 6.859822750091553, "learning_rate": 6.77691559396421e-06, "loss": 0.0733, "step": 34150 }, { "epoch": 3.4790797108826226, "grad_norm": 0.3821323812007904, "learning_rate": 6.765604144515079e-06, "loss": 0.089, "step": 34175 }, { "epoch": 3.481624758220503, "grad_norm": 4.010385990142822, "learning_rate": 6.754292695065946e-06, "loss": 0.0828, "step": 34200 }, { "epoch": 3.484169805558383, "grad_norm": 0.7583944797515869, "learning_rate": 6.742981245616815e-06, "loss": 0.1021, "step": 34225 }, { "epoch": 3.486714852896264, "grad_norm": 1.1786764860153198, "learning_rate": 6.731669796167682e-06, "loss": 0.0603, "step": 34250 }, { "epoch": 3.4892599002341442, "grad_norm": 2.502434492111206, "learning_rate": 6.7203583467185494e-06, "loss": 0.127, "step": 34275 }, { "epoch": 3.491804947572025, "grad_norm": 4.207679271697998, "learning_rate": 6.7090468972694164e-06, "loss": 0.0795, "step": 34300 }, { "epoch": 3.4943499949099053, "grad_norm": 3.7935028076171875, "learning_rate": 6.6977354478202834e-06, "loss": 0.0925, "step": 34325 }, { "epoch": 3.496895042247786, "grad_norm": 3.029387950897217, "learning_rate": 6.686423998371152e-06, "loss": 0.0838, "step": 34350 }, { "epoch": 3.4994400895856663, "grad_norm": 3.835543394088745, "learning_rate": 6.675112548922019e-06, "loss": 0.0792, "step": 34375 }, { "epoch": 3.5019851369235466, "grad_norm": 7.944116115570068, "learning_rate": 6.663801099472888e-06, "loss": 0.0866, "step": 34400 }, { "epoch": 3.5045301842614274, "grad_norm": 3.8808681964874268, "learning_rate": 6.652489650023755e-06, "loss": 0.0775, "step": 34425 }, { "epoch": 3.5070752315993077, "grad_norm": 0.7064144611358643, "learning_rate": 6.641178200574622e-06, "loss": 0.0887, "step": 34450 }, { "epoch": 3.509620278937188, "grad_norm": 6.215606689453125, "learning_rate": 6.62986675112549e-06, "loss": 0.1262, "step": 34475 }, { "epoch": 3.5121653262750687, "grad_norm": 4.817259788513184, "learning_rate": 6.618555301676357e-06, "loss": 0.0753, "step": 34500 }, { "epoch": 3.5147103736129495, "grad_norm": 0.5267298817634583, "learning_rate": 6.607243852227225e-06, "loss": 0.1185, "step": 34525 }, { "epoch": 3.5172554209508298, "grad_norm": 8.5421781539917, "learning_rate": 6.595932402778092e-06, "loss": 0.1017, "step": 34550 }, { "epoch": 3.51980046828871, "grad_norm": 0.9383201003074646, "learning_rate": 6.584620953328959e-06, "loss": 0.0829, "step": 34575 }, { "epoch": 3.522345515626591, "grad_norm": 7.349745273590088, "learning_rate": 6.573309503879828e-06, "loss": 0.0883, "step": 34600 }, { "epoch": 3.524890562964471, "grad_norm": 1.4736689329147339, "learning_rate": 6.561998054430695e-06, "loss": 0.0912, "step": 34625 }, { "epoch": 3.5274356103023514, "grad_norm": 1.4113051891326904, "learning_rate": 6.550686604981563e-06, "loss": 0.0796, "step": 34650 }, { "epoch": 3.529980657640232, "grad_norm": 7.705810546875, "learning_rate": 6.539375155532431e-06, "loss": 0.0784, "step": 34675 }, { "epoch": 3.5325257049781125, "grad_norm": 2.766878843307495, "learning_rate": 6.5280637060832986e-06, "loss": 0.0777, "step": 34700 }, { "epoch": 3.535070752315993, "grad_norm": 1.7747935056686401, "learning_rate": 6.5167522566341656e-06, "loss": 0.0695, "step": 34725 }, { "epoch": 3.5376157996538735, "grad_norm": 2.4587645530700684, "learning_rate": 6.5054408071850326e-06, "loss": 0.0839, "step": 34750 }, { "epoch": 3.5401608469917543, "grad_norm": 4.459035873413086, "learning_rate": 6.494129357735901e-06, "loss": 0.0931, "step": 34775 }, { "epoch": 3.5427058943296346, "grad_norm": 1.7593748569488525, "learning_rate": 6.482817908286768e-06, "loss": 0.07, "step": 34800 }, { "epoch": 3.545250941667515, "grad_norm": 3.184353828430176, "learning_rate": 6.471506458837636e-06, "loss": 0.0715, "step": 34825 }, { "epoch": 3.5477959890053956, "grad_norm": 3.696401357650757, "learning_rate": 6.460195009388504e-06, "loss": 0.0697, "step": 34850 }, { "epoch": 3.550341036343276, "grad_norm": 2.8429620265960693, "learning_rate": 6.448883559939371e-06, "loss": 0.0925, "step": 34875 }, { "epoch": 3.552886083681156, "grad_norm": 1.776930809020996, "learning_rate": 6.437572110490239e-06, "loss": 0.0657, "step": 34900 }, { "epoch": 3.555431131019037, "grad_norm": 1.681552529335022, "learning_rate": 6.426260661041106e-06, "loss": 0.0755, "step": 34925 }, { "epoch": 3.5579761783569177, "grad_norm": 3.1120924949645996, "learning_rate": 6.4149492115919745e-06, "loss": 0.0978, "step": 34950 }, { "epoch": 3.560521225694798, "grad_norm": 2.5734004974365234, "learning_rate": 6.4036377621428415e-06, "loss": 0.0792, "step": 34975 }, { "epoch": 3.5630662730326783, "grad_norm": 1.1274526119232178, "learning_rate": 6.392326312693709e-06, "loss": 0.0853, "step": 35000 }, { "epoch": 3.565611320370559, "grad_norm": 2.7420918941497803, "learning_rate": 6.381014863244577e-06, "loss": 0.0852, "step": 35025 }, { "epoch": 3.5681563677084394, "grad_norm": 0.742755115032196, "learning_rate": 6.369703413795444e-06, "loss": 0.1186, "step": 35050 }, { "epoch": 3.5707014150463197, "grad_norm": 6.001101493835449, "learning_rate": 6.358391964346312e-06, "loss": 0.1153, "step": 35075 }, { "epoch": 3.5732464623842004, "grad_norm": 3.2310545444488525, "learning_rate": 6.347080514897179e-06, "loss": 0.0848, "step": 35100 }, { "epoch": 3.5757915097220807, "grad_norm": 5.802320957183838, "learning_rate": 6.335769065448048e-06, "loss": 0.0753, "step": 35125 }, { "epoch": 3.5783365570599615, "grad_norm": 4.039153099060059, "learning_rate": 6.324457615998915e-06, "loss": 0.076, "step": 35150 }, { "epoch": 3.5808816043978418, "grad_norm": 3.7043445110321045, "learning_rate": 6.313146166549782e-06, "loss": 0.0626, "step": 35175 }, { "epoch": 3.5834266517357225, "grad_norm": 2.622565984725952, "learning_rate": 6.30183471710065e-06, "loss": 0.0944, "step": 35200 }, { "epoch": 3.585971699073603, "grad_norm": 2.2240231037139893, "learning_rate": 6.290523267651517e-06, "loss": 0.0923, "step": 35225 }, { "epoch": 3.588516746411483, "grad_norm": 3.193891763687134, "learning_rate": 6.279211818202385e-06, "loss": 0.0982, "step": 35250 }, { "epoch": 3.591061793749364, "grad_norm": 3.5597784519195557, "learning_rate": 6.267900368753252e-06, "loss": 0.0916, "step": 35275 }, { "epoch": 3.593606841087244, "grad_norm": 4.625251770019531, "learning_rate": 6.256588919304119e-06, "loss": 0.1102, "step": 35300 }, { "epoch": 3.5961518884251245, "grad_norm": 4.520129203796387, "learning_rate": 6.245277469854988e-06, "loss": 0.0769, "step": 35325 }, { "epoch": 3.598696935763005, "grad_norm": 3.313098192214966, "learning_rate": 6.233966020405855e-06, "loss": 0.0769, "step": 35350 }, { "epoch": 3.601241983100886, "grad_norm": 3.836454153060913, "learning_rate": 6.222654570956724e-06, "loss": 0.0771, "step": 35375 }, { "epoch": 3.6037870304387662, "grad_norm": 3.4968671798706055, "learning_rate": 6.211343121507591e-06, "loss": 0.0928, "step": 35400 }, { "epoch": 3.6063320777766465, "grad_norm": 3.556164026260376, "learning_rate": 6.2000316720584585e-06, "loss": 0.074, "step": 35425 }, { "epoch": 3.6088771251145273, "grad_norm": 2.0920588970184326, "learning_rate": 6.1887202226093254e-06, "loss": 0.1027, "step": 35450 }, { "epoch": 3.6114221724524076, "grad_norm": 5.674222469329834, "learning_rate": 6.1774087731601924e-06, "loss": 0.0874, "step": 35475 }, { "epoch": 3.613967219790288, "grad_norm": 2.4876623153686523, "learning_rate": 6.166097323711061e-06, "loss": 0.0627, "step": 35500 }, { "epoch": 3.6165122671281686, "grad_norm": 4.723164081573486, "learning_rate": 6.154785874261928e-06, "loss": 0.108, "step": 35525 }, { "epoch": 3.619057314466049, "grad_norm": 0.5420049428939819, "learning_rate": 6.143474424812797e-06, "loss": 0.101, "step": 35550 }, { "epoch": 3.6216023618039297, "grad_norm": 0.39233437180519104, "learning_rate": 6.132162975363664e-06, "loss": 0.0738, "step": 35575 }, { "epoch": 3.62414740914181, "grad_norm": 5.043944835662842, "learning_rate": 6.120851525914531e-06, "loss": 0.0988, "step": 35600 }, { "epoch": 3.6266924564796907, "grad_norm": 8.465002059936523, "learning_rate": 6.109540076465399e-06, "loss": 0.0928, "step": 35625 }, { "epoch": 3.629237503817571, "grad_norm": 2.7983903884887695, "learning_rate": 6.098228627016266e-06, "loss": 0.0771, "step": 35650 }, { "epoch": 3.6317825511554513, "grad_norm": 5.560369968414307, "learning_rate": 6.086917177567134e-06, "loss": 0.0965, "step": 35675 }, { "epoch": 3.634327598493332, "grad_norm": 4.1370038986206055, "learning_rate": 6.075605728118001e-06, "loss": 0.0954, "step": 35700 }, { "epoch": 3.6368726458312124, "grad_norm": 4.174961090087891, "learning_rate": 6.06429427866887e-06, "loss": 0.0736, "step": 35725 }, { "epoch": 3.6394176931690927, "grad_norm": 11.217022895812988, "learning_rate": 6.052982829219737e-06, "loss": 0.0914, "step": 35750 }, { "epoch": 3.6419627405069734, "grad_norm": 7.772279739379883, "learning_rate": 6.041671379770604e-06, "loss": 0.0849, "step": 35775 }, { "epoch": 3.644507787844854, "grad_norm": 4.524623870849609, "learning_rate": 6.030359930321472e-06, "loss": 0.0997, "step": 35800 }, { "epoch": 3.6470528351827345, "grad_norm": 3.7422924041748047, "learning_rate": 6.019048480872339e-06, "loss": 0.0755, "step": 35825 }, { "epoch": 3.649597882520615, "grad_norm": 5.526354789733887, "learning_rate": 6.007737031423208e-06, "loss": 0.0858, "step": 35850 }, { "epoch": 3.6521429298584955, "grad_norm": 8.158432006835938, "learning_rate": 5.996425581974075e-06, "loss": 0.1136, "step": 35875 }, { "epoch": 3.654687977196376, "grad_norm": 6.096880912780762, "learning_rate": 5.985114132524942e-06, "loss": 0.0758, "step": 35900 }, { "epoch": 3.657233024534256, "grad_norm": 3.285121440887451, "learning_rate": 5.97380268307581e-06, "loss": 0.0922, "step": 35925 }, { "epoch": 3.659778071872137, "grad_norm": 1.9945199489593506, "learning_rate": 5.962491233626677e-06, "loss": 0.0701, "step": 35950 }, { "epoch": 3.662323119210017, "grad_norm": 0.6748458743095398, "learning_rate": 5.951632242155511e-06, "loss": 0.0724, "step": 35975 }, { "epoch": 3.664868166547898, "grad_norm": 1.5930081605911255, "learning_rate": 5.940320792706378e-06, "loss": 0.0842, "step": 36000 }, { "epoch": 3.6674132138857782, "grad_norm": 3.754176139831543, "learning_rate": 5.929009343257246e-06, "loss": 0.0763, "step": 36025 }, { "epoch": 3.669958261223659, "grad_norm": 3.0190577507019043, "learning_rate": 5.9176978938081135e-06, "loss": 0.0806, "step": 36050 }, { "epoch": 3.6725033085615393, "grad_norm": 4.963992118835449, "learning_rate": 5.9063864443589805e-06, "loss": 0.1168, "step": 36075 }, { "epoch": 3.6750483558994196, "grad_norm": 3.961651086807251, "learning_rate": 5.895074994909848e-06, "loss": 0.1051, "step": 36100 }, { "epoch": 3.6775934032373003, "grad_norm": 3.3032639026641846, "learning_rate": 5.883763545460715e-06, "loss": 0.0837, "step": 36125 }, { "epoch": 3.6801384505751806, "grad_norm": 8.002496719360352, "learning_rate": 5.872452096011584e-06, "loss": 0.0634, "step": 36150 }, { "epoch": 3.682683497913061, "grad_norm": 3.493532419204712, "learning_rate": 5.861140646562451e-06, "loss": 0.0702, "step": 36175 }, { "epoch": 3.6852285452509417, "grad_norm": 5.555636405944824, "learning_rate": 5.849829197113318e-06, "loss": 0.0892, "step": 36200 }, { "epoch": 3.6877735925888224, "grad_norm": 0.9933393597602844, "learning_rate": 5.838517747664187e-06, "loss": 0.073, "step": 36225 }, { "epoch": 3.6903186399267027, "grad_norm": 3.9761369228363037, "learning_rate": 5.827206298215054e-06, "loss": 0.0872, "step": 36250 }, { "epoch": 3.692863687264583, "grad_norm": 0.9500647783279419, "learning_rate": 5.8158948487659215e-06, "loss": 0.102, "step": 36275 }, { "epoch": 3.6954087346024638, "grad_norm": 0.9967595934867859, "learning_rate": 5.8045833993167885e-06, "loss": 0.098, "step": 36300 }, { "epoch": 3.697953781940344, "grad_norm": 1.624714732170105, "learning_rate": 5.793271949867657e-06, "loss": 0.0982, "step": 36325 }, { "epoch": 3.7004988292782244, "grad_norm": 0.7662064433097839, "learning_rate": 5.781960500418524e-06, "loss": 0.0527, "step": 36350 }, { "epoch": 3.703043876616105, "grad_norm": 7.585689544677734, "learning_rate": 5.770649050969391e-06, "loss": 0.0883, "step": 36375 }, { "epoch": 3.7055889239539854, "grad_norm": 8.151590347290039, "learning_rate": 5.75933760152026e-06, "loss": 0.0966, "step": 36400 }, { "epoch": 3.708133971291866, "grad_norm": 4.967852592468262, "learning_rate": 5.748026152071127e-06, "loss": 0.0637, "step": 36425 }, { "epoch": 3.7106790186297465, "grad_norm": 4.405301094055176, "learning_rate": 5.736714702621995e-06, "loss": 0.0837, "step": 36450 }, { "epoch": 3.713224065967627, "grad_norm": 2.9060306549072266, "learning_rate": 5.725403253172862e-06, "loss": 0.0747, "step": 36475 }, { "epoch": 3.7157691133055075, "grad_norm": 1.2156260013580322, "learning_rate": 5.71409180372373e-06, "loss": 0.0524, "step": 36500 }, { "epoch": 3.718314160643388, "grad_norm": 5.655495643615723, "learning_rate": 5.7027803542745974e-06, "loss": 0.094, "step": 36525 }, { "epoch": 3.7208592079812686, "grad_norm": 1.6090160608291626, "learning_rate": 5.6914689048254644e-06, "loss": 0.0596, "step": 36550 }, { "epoch": 3.723404255319149, "grad_norm": 4.2439680099487305, "learning_rate": 5.680157455376333e-06, "loss": 0.0859, "step": 36575 }, { "epoch": 3.725949302657029, "grad_norm": 1.56519615650177, "learning_rate": 5.6688460059272e-06, "loss": 0.075, "step": 36600 }, { "epoch": 3.72849434999491, "grad_norm": 4.301578521728516, "learning_rate": 5.657534556478067e-06, "loss": 0.0861, "step": 36625 }, { "epoch": 3.7310393973327907, "grad_norm": 2.4486348628997803, "learning_rate": 5.646223107028935e-06, "loss": 0.0569, "step": 36650 }, { "epoch": 3.733584444670671, "grad_norm": 4.346898078918457, "learning_rate": 5.634911657579803e-06, "loss": 0.0641, "step": 36675 }, { "epoch": 3.7361294920085513, "grad_norm": 1.4785878658294678, "learning_rate": 5.623600208130671e-06, "loss": 0.0787, "step": 36700 }, { "epoch": 3.738674539346432, "grad_norm": 1.4040265083312988, "learning_rate": 5.612288758681538e-06, "loss": 0.0642, "step": 36725 }, { "epoch": 3.7412195866843123, "grad_norm": 0.16664713621139526, "learning_rate": 5.600977309232406e-06, "loss": 0.0821, "step": 36750 }, { "epoch": 3.7437646340221926, "grad_norm": 5.200014591217041, "learning_rate": 5.589665859783273e-06, "loss": 0.0636, "step": 36775 }, { "epoch": 3.7463096813600734, "grad_norm": 0.5989975929260254, "learning_rate": 5.57835441033414e-06, "loss": 0.0769, "step": 36800 }, { "epoch": 3.7488547286979537, "grad_norm": 2.7018861770629883, "learning_rate": 5.567042960885008e-06, "loss": 0.0983, "step": 36825 }, { "epoch": 3.7513997760358344, "grad_norm": 4.699031829833984, "learning_rate": 5.555731511435876e-06, "loss": 0.0821, "step": 36850 }, { "epoch": 3.7539448233737147, "grad_norm": 6.736579895019531, "learning_rate": 5.544420061986744e-06, "loss": 0.0777, "step": 36875 }, { "epoch": 3.7564898707115955, "grad_norm": 4.7765350341796875, "learning_rate": 5.533108612537611e-06, "loss": 0.0798, "step": 36900 }, { "epoch": 3.7590349180494758, "grad_norm": 3.2896828651428223, "learning_rate": 5.521797163088478e-06, "loss": 0.0885, "step": 36925 }, { "epoch": 3.761579965387356, "grad_norm": 4.15386438369751, "learning_rate": 5.5104857136393466e-06, "loss": 0.0832, "step": 36950 }, { "epoch": 3.764125012725237, "grad_norm": 1.831699252128601, "learning_rate": 5.4991742641902136e-06, "loss": 0.0762, "step": 36975 }, { "epoch": 3.766670060063117, "grad_norm": 2.754887580871582, "learning_rate": 5.487862814741081e-06, "loss": 0.1048, "step": 37000 }, { "epoch": 3.7692151074009974, "grad_norm": 6.482685565948486, "learning_rate": 5.476551365291949e-06, "loss": 0.0884, "step": 37025 }, { "epoch": 3.771760154738878, "grad_norm": 5.080991268157959, "learning_rate": 5.465239915842816e-06, "loss": 0.0967, "step": 37050 }, { "epoch": 3.774305202076759, "grad_norm": 5.668878078460693, "learning_rate": 5.453928466393684e-06, "loss": 0.0947, "step": 37075 }, { "epoch": 3.776850249414639, "grad_norm": 2.5355238914489746, "learning_rate": 5.442617016944551e-06, "loss": 0.0829, "step": 37100 }, { "epoch": 3.7793952967525195, "grad_norm": 4.266459941864014, "learning_rate": 5.43130556749542e-06, "loss": 0.1081, "step": 37125 }, { "epoch": 3.7819403440904003, "grad_norm": 5.158119201660156, "learning_rate": 5.419994118046287e-06, "loss": 0.0677, "step": 37150 }, { "epoch": 3.7844853914282806, "grad_norm": 3.6510822772979736, "learning_rate": 5.408682668597155e-06, "loss": 0.0542, "step": 37175 }, { "epoch": 3.787030438766161, "grad_norm": 4.6219892501831055, "learning_rate": 5.3973712191480225e-06, "loss": 0.0905, "step": 37200 }, { "epoch": 3.7895754861040416, "grad_norm": 1.2882821559906006, "learning_rate": 5.3860597696988895e-06, "loss": 0.0844, "step": 37225 }, { "epoch": 3.792120533441922, "grad_norm": 1.5892689228057861, "learning_rate": 5.374748320249757e-06, "loss": 0.081, "step": 37250 }, { "epoch": 3.7946655807798026, "grad_norm": 2.733649730682373, "learning_rate": 5.363436870800624e-06, "loss": 0.085, "step": 37275 }, { "epoch": 3.797210628117683, "grad_norm": 0.7701331377029419, "learning_rate": 5.352125421351493e-06, "loss": 0.104, "step": 37300 }, { "epoch": 3.7997556754555637, "grad_norm": 2.512700080871582, "learning_rate": 5.34081397190236e-06, "loss": 0.0711, "step": 37325 }, { "epoch": 3.802300722793444, "grad_norm": 4.5707879066467285, "learning_rate": 5.329502522453227e-06, "loss": 0.0774, "step": 37350 }, { "epoch": 3.8048457701313243, "grad_norm": 8.487582206726074, "learning_rate": 5.318191073004096e-06, "loss": 0.0951, "step": 37375 }, { "epoch": 3.807390817469205, "grad_norm": 2.3424673080444336, "learning_rate": 5.306879623554963e-06, "loss": 0.0613, "step": 37400 }, { "epoch": 3.8099358648070853, "grad_norm": 1.476176381111145, "learning_rate": 5.2955681741058305e-06, "loss": 0.0795, "step": 37425 }, { "epoch": 3.8124809121449657, "grad_norm": 1.8778008222579956, "learning_rate": 5.2842567246566975e-06, "loss": 0.0954, "step": 37450 }, { "epoch": 3.8150259594828464, "grad_norm": 3.7538087368011475, "learning_rate": 5.272945275207566e-06, "loss": 0.1211, "step": 37475 }, { "epoch": 3.817571006820727, "grad_norm": 2.788107395172119, "learning_rate": 5.261633825758433e-06, "loss": 0.0835, "step": 37500 }, { "epoch": 3.8201160541586074, "grad_norm": 5.375553131103516, "learning_rate": 5.2503223763093e-06, "loss": 0.0829, "step": 37525 }, { "epoch": 3.8226611014964877, "grad_norm": 1.8406885862350464, "learning_rate": 5.239010926860169e-06, "loss": 0.0862, "step": 37550 }, { "epoch": 3.8252061488343685, "grad_norm": 1.745599389076233, "learning_rate": 5.227699477411036e-06, "loss": 0.0696, "step": 37575 }, { "epoch": 3.827751196172249, "grad_norm": 4.49993371963501, "learning_rate": 5.216388027961904e-06, "loss": 0.0963, "step": 37600 }, { "epoch": 3.830296243510129, "grad_norm": 2.113429307937622, "learning_rate": 5.205076578512771e-06, "loss": 0.0757, "step": 37625 }, { "epoch": 3.83284129084801, "grad_norm": 3.142502546310425, "learning_rate": 5.193765129063638e-06, "loss": 0.0781, "step": 37650 }, { "epoch": 3.83538633818589, "grad_norm": 2.529702663421631, "learning_rate": 5.1824536796145065e-06, "loss": 0.0585, "step": 37675 }, { "epoch": 3.837931385523771, "grad_norm": 1.782511591911316, "learning_rate": 5.1711422301653735e-06, "loss": 0.0915, "step": 37700 }, { "epoch": 3.840476432861651, "grad_norm": 3.4654805660247803, "learning_rate": 5.159830780716242e-06, "loss": 0.1021, "step": 37725 }, { "epoch": 3.843021480199532, "grad_norm": 7.175243854522705, "learning_rate": 5.148519331267109e-06, "loss": 0.0943, "step": 37750 }, { "epoch": 3.8455665275374122, "grad_norm": 3.8063061237335205, "learning_rate": 5.137207881817976e-06, "loss": 0.0666, "step": 37775 }, { "epoch": 3.8481115748752925, "grad_norm": 0.8635425567626953, "learning_rate": 5.125896432368844e-06, "loss": 0.0564, "step": 37800 }, { "epoch": 3.8506566222131733, "grad_norm": 3.5335941314697266, "learning_rate": 5.114584982919711e-06, "loss": 0.0717, "step": 37825 }, { "epoch": 3.8532016695510536, "grad_norm": 3.7828986644744873, "learning_rate": 5.10327353347058e-06, "loss": 0.0861, "step": 37850 }, { "epoch": 3.855746716888934, "grad_norm": 1.944362759590149, "learning_rate": 5.091962084021447e-06, "loss": 0.0611, "step": 37875 }, { "epoch": 3.8582917642268146, "grad_norm": 2.3826661109924316, "learning_rate": 5.080650634572315e-06, "loss": 0.0868, "step": 37900 }, { "epoch": 3.860836811564695, "grad_norm": 0.43980684876441956, "learning_rate": 5.069339185123182e-06, "loss": 0.0844, "step": 37925 }, { "epoch": 3.8633818589025757, "grad_norm": 1.001634120941162, "learning_rate": 5.058027735674049e-06, "loss": 0.0842, "step": 37950 }, { "epoch": 3.865926906240456, "grad_norm": 2.313572883605957, "learning_rate": 5.046716286224917e-06, "loss": 0.0689, "step": 37975 }, { "epoch": 3.8684719535783367, "grad_norm": 2.2224433422088623, "learning_rate": 5.035404836775784e-06, "loss": 0.0747, "step": 38000 }, { "epoch": 3.871017000916217, "grad_norm": 3.743778944015503, "learning_rate": 5.024093387326653e-06, "loss": 0.0557, "step": 38025 }, { "epoch": 3.8735620482540973, "grad_norm": 5.278766632080078, "learning_rate": 5.01278193787752e-06, "loss": 0.0761, "step": 38050 }, { "epoch": 3.876107095591978, "grad_norm": 6.019226551055908, "learning_rate": 5.001470488428387e-06, "loss": 0.0843, "step": 38075 }, { "epoch": 3.8786521429298584, "grad_norm": 1.7229478359222412, "learning_rate": 4.990159038979256e-06, "loss": 0.099, "step": 38100 }, { "epoch": 3.881197190267739, "grad_norm": 4.364188194274902, "learning_rate": 4.9788475895301234e-06, "loss": 0.0812, "step": 38125 }, { "epoch": 3.8837422376056194, "grad_norm": 0.27184224128723145, "learning_rate": 4.9675361400809904e-06, "loss": 0.0668, "step": 38150 }, { "epoch": 3.8862872849435, "grad_norm": 1.0122255086898804, "learning_rate": 4.9562246906318574e-06, "loss": 0.074, "step": 38175 }, { "epoch": 3.8888323322813805, "grad_norm": 4.959586143493652, "learning_rate": 4.944913241182725e-06, "loss": 0.0933, "step": 38200 }, { "epoch": 3.891377379619261, "grad_norm": 5.364275932312012, "learning_rate": 4.933601791733593e-06, "loss": 0.1152, "step": 38225 }, { "epoch": 3.8939224269571415, "grad_norm": 3.487792491912842, "learning_rate": 4.922290342284461e-06, "loss": 0.0639, "step": 38250 }, { "epoch": 3.896467474295022, "grad_norm": 6.711452007293701, "learning_rate": 4.911431350813294e-06, "loss": 0.0997, "step": 38275 }, { "epoch": 3.899012521632902, "grad_norm": 3.9726145267486572, "learning_rate": 4.900119901364161e-06, "loss": 0.0741, "step": 38300 }, { "epoch": 3.901557568970783, "grad_norm": 6.3305277824401855, "learning_rate": 4.8888084519150285e-06, "loss": 0.0705, "step": 38325 }, { "epoch": 3.904102616308663, "grad_norm": 1.8397938013076782, "learning_rate": 4.877497002465896e-06, "loss": 0.0663, "step": 38350 }, { "epoch": 3.906647663646544, "grad_norm": 1.2621251344680786, "learning_rate": 4.866185553016764e-06, "loss": 0.0659, "step": 38375 }, { "epoch": 3.9091927109844242, "grad_norm": 1.6359314918518066, "learning_rate": 4.854874103567632e-06, "loss": 0.0894, "step": 38400 }, { "epoch": 3.911737758322305, "grad_norm": 1.9118399620056152, "learning_rate": 4.843562654118499e-06, "loss": 0.0752, "step": 38425 }, { "epoch": 3.9142828056601853, "grad_norm": 0.7467500567436218, "learning_rate": 4.832251204669367e-06, "loss": 0.0588, "step": 38450 }, { "epoch": 3.9168278529980656, "grad_norm": 1.7692584991455078, "learning_rate": 4.820939755220234e-06, "loss": 0.0914, "step": 38475 }, { "epoch": 3.9193729003359463, "grad_norm": 2.5429513454437256, "learning_rate": 4.809628305771102e-06, "loss": 0.1153, "step": 38500 }, { "epoch": 3.9219179476738266, "grad_norm": 1.0103989839553833, "learning_rate": 4.7983168563219695e-06, "loss": 0.0536, "step": 38525 }, { "epoch": 3.9244629950117074, "grad_norm": 4.339071273803711, "learning_rate": 4.787005406872837e-06, "loss": 0.0906, "step": 38550 }, { "epoch": 3.9270080423495877, "grad_norm": 0.8115999698638916, "learning_rate": 4.775693957423705e-06, "loss": 0.0871, "step": 38575 }, { "epoch": 3.9295530896874684, "grad_norm": 1.1291712522506714, "learning_rate": 4.764382507974572e-06, "loss": 0.0968, "step": 38600 }, { "epoch": 3.9320981370253487, "grad_norm": 2.7206923961639404, "learning_rate": 4.75307105852544e-06, "loss": 0.0918, "step": 38625 }, { "epoch": 3.934643184363229, "grad_norm": 0.8271558880805969, "learning_rate": 4.741759609076307e-06, "loss": 0.0939, "step": 38650 }, { "epoch": 3.9371882317011098, "grad_norm": 0.7608330845832825, "learning_rate": 4.730448159627175e-06, "loss": 0.0861, "step": 38675 }, { "epoch": 3.93973327903899, "grad_norm": 0.4039336144924164, "learning_rate": 4.719136710178043e-06, "loss": 0.0662, "step": 38700 }, { "epoch": 3.9422783263768704, "grad_norm": 5.616768836975098, "learning_rate": 4.708277718706875e-06, "loss": 0.0845, "step": 38725 }, { "epoch": 3.944823373714751, "grad_norm": 2.859184503555298, "learning_rate": 4.696966269257742e-06, "loss": 0.0843, "step": 38750 }, { "epoch": 3.9473684210526314, "grad_norm": 7.883376598358154, "learning_rate": 4.68565481980861e-06, "loss": 0.0902, "step": 38775 }, { "epoch": 3.949913468390512, "grad_norm": 5.071816444396973, "learning_rate": 4.674343370359478e-06, "loss": 0.1097, "step": 38800 }, { "epoch": 3.9524585157283925, "grad_norm": 4.740415096282959, "learning_rate": 4.663031920910346e-06, "loss": 0.0721, "step": 38825 }, { "epoch": 3.955003563066273, "grad_norm": 0.4243851602077484, "learning_rate": 4.651720471461214e-06, "loss": 0.1045, "step": 38850 }, { "epoch": 3.9575486104041535, "grad_norm": 3.7491915225982666, "learning_rate": 4.640409022012082e-06, "loss": 0.0846, "step": 38875 }, { "epoch": 3.960093657742034, "grad_norm": 2.330118417739868, "learning_rate": 4.629097572562949e-06, "loss": 0.0772, "step": 38900 }, { "epoch": 3.9626387050799146, "grad_norm": 2.0773494243621826, "learning_rate": 4.617786123113816e-06, "loss": 0.0605, "step": 38925 }, { "epoch": 3.965183752417795, "grad_norm": 1.8872305154800415, "learning_rate": 4.6064746736646835e-06, "loss": 0.076, "step": 38950 }, { "epoch": 3.9677287997556756, "grad_norm": 4.278075218200684, "learning_rate": 4.595163224215551e-06, "loss": 0.0584, "step": 38975 }, { "epoch": 3.970273847093556, "grad_norm": 1.7767386436462402, "learning_rate": 4.583851774766419e-06, "loss": 0.076, "step": 39000 }, { "epoch": 3.9728188944314367, "grad_norm": 0.9367185235023499, "learning_rate": 4.572540325317287e-06, "loss": 0.0854, "step": 39025 }, { "epoch": 3.975363941769317, "grad_norm": 4.025914669036865, "learning_rate": 4.561228875868154e-06, "loss": 0.0836, "step": 39050 }, { "epoch": 3.9779089891071973, "grad_norm": 0.49854645133018494, "learning_rate": 4.549917426419022e-06, "loss": 0.0704, "step": 39075 }, { "epoch": 3.980454036445078, "grad_norm": 2.95552921295166, "learning_rate": 4.538605976969889e-06, "loss": 0.081, "step": 39100 }, { "epoch": 3.9829990837829583, "grad_norm": 3.494931697845459, "learning_rate": 4.527294527520757e-06, "loss": 0.0846, "step": 39125 }, { "epoch": 3.9855441311208386, "grad_norm": 2.842068910598755, "learning_rate": 4.5159830780716245e-06, "loss": 0.056, "step": 39150 }, { "epoch": 3.9880891784587194, "grad_norm": 1.898389458656311, "learning_rate": 4.504671628622492e-06, "loss": 0.063, "step": 39175 }, { "epoch": 3.9906342257965997, "grad_norm": 3.592628240585327, "learning_rate": 4.493360179173359e-06, "loss": 0.0935, "step": 39200 }, { "epoch": 3.9931792731344804, "grad_norm": 1.6398416757583618, "learning_rate": 4.482048729724227e-06, "loss": 0.0895, "step": 39225 }, { "epoch": 3.9957243204723607, "grad_norm": 3.1050050258636475, "learning_rate": 4.470737280275095e-06, "loss": 0.0748, "step": 39250 }, { "epoch": 3.9982693678102414, "grad_norm": 3.505990505218506, "learning_rate": 4.459425830825962e-06, "loss": 0.0704, "step": 39275 }, { "epoch": 4.0, "eval_loss": 0.08321376889944077, "eval_runtime": 7.2624, "eval_samples_per_second": 937.43, "eval_steps_per_second": 14.733, "step": 39292 }, { "epoch": 4.000814415148122, "grad_norm": 4.233890056610107, "learning_rate": 4.44811438137683e-06, "loss": 0.0657, "step": 39300 }, { "epoch": 4.003359462486002, "grad_norm": 6.2566704750061035, "learning_rate": 4.436802931927698e-06, "loss": 0.0788, "step": 39325 }, { "epoch": 4.005904509823883, "grad_norm": 6.7541704177856445, "learning_rate": 4.425491482478565e-06, "loss": 0.0847, "step": 39350 }, { "epoch": 4.0084495571617635, "grad_norm": 2.7961132526397705, "learning_rate": 4.414180033029433e-06, "loss": 0.0667, "step": 39375 }, { "epoch": 4.010994604499643, "grad_norm": 3.7333006858825684, "learning_rate": 4.4028685835803005e-06, "loss": 0.0646, "step": 39400 }, { "epoch": 4.013539651837524, "grad_norm": 5.583388805389404, "learning_rate": 4.391557134131168e-06, "loss": 0.0528, "step": 39425 }, { "epoch": 4.016084699175405, "grad_norm": 0.5492544174194336, "learning_rate": 4.380245684682035e-06, "loss": 0.0563, "step": 39450 }, { "epoch": 4.018629746513285, "grad_norm": 3.7247314453125, "learning_rate": 4.368934235232903e-06, "loss": 0.0723, "step": 39475 }, { "epoch": 4.0211747938511655, "grad_norm": 1.7265299558639526, "learning_rate": 4.35762278578377e-06, "loss": 0.0703, "step": 39500 }, { "epoch": 4.023719841189046, "grad_norm": 4.306021690368652, "learning_rate": 4.346311336334638e-06, "loss": 0.0788, "step": 39525 }, { "epoch": 4.026264888526927, "grad_norm": 1.109270453453064, "learning_rate": 4.334999886885506e-06, "loss": 0.0572, "step": 39550 }, { "epoch": 4.028809935864807, "grad_norm": 1.2954659461975098, "learning_rate": 4.323688437436374e-06, "loss": 0.0757, "step": 39575 }, { "epoch": 4.031354983202688, "grad_norm": 1.4971513748168945, "learning_rate": 4.3123769879872415e-06, "loss": 0.0593, "step": 39600 }, { "epoch": 4.033900030540568, "grad_norm": 1.8888990879058838, "learning_rate": 4.3010655385381085e-06, "loss": 0.0739, "step": 39625 }, { "epoch": 4.036445077878448, "grad_norm": 4.024899482727051, "learning_rate": 4.289754089088976e-06, "loss": 0.06, "step": 39650 }, { "epoch": 4.038990125216329, "grad_norm": 3.701951503753662, "learning_rate": 4.278442639639843e-06, "loss": 0.0736, "step": 39675 }, { "epoch": 4.04153517255421, "grad_norm": 3.4520320892333984, "learning_rate": 4.267131190190711e-06, "loss": 0.0743, "step": 39700 }, { "epoch": 4.04408021989209, "grad_norm": 2.047182083129883, "learning_rate": 4.255819740741579e-06, "loss": 0.0655, "step": 39725 }, { "epoch": 4.04662526722997, "grad_norm": 7.164806365966797, "learning_rate": 4.244508291292447e-06, "loss": 0.0398, "step": 39750 }, { "epoch": 4.049170314567851, "grad_norm": 1.4133317470550537, "learning_rate": 4.233196841843314e-06, "loss": 0.0622, "step": 39775 }, { "epoch": 4.051715361905732, "grad_norm": 2.1031320095062256, "learning_rate": 4.221885392394182e-06, "loss": 0.0774, "step": 39800 }, { "epoch": 4.054260409243612, "grad_norm": 2.162343740463257, "learning_rate": 4.21057394294505e-06, "loss": 0.074, "step": 39825 }, { "epoch": 4.056805456581492, "grad_norm": 7.961151123046875, "learning_rate": 4.199262493495917e-06, "loss": 0.0537, "step": 39850 }, { "epoch": 4.059350503919373, "grad_norm": 3.6725287437438965, "learning_rate": 4.1879510440467844e-06, "loss": 0.0635, "step": 39875 }, { "epoch": 4.061895551257253, "grad_norm": 2.4533803462982178, "learning_rate": 4.176639594597652e-06, "loss": 0.0578, "step": 39900 }, { "epoch": 4.064440598595134, "grad_norm": 7.141175270080566, "learning_rate": 4.165328145148519e-06, "loss": 0.0987, "step": 39925 }, { "epoch": 4.0669856459330145, "grad_norm": 0.6580816507339478, "learning_rate": 4.154016695699387e-06, "loss": 0.0931, "step": 39950 }, { "epoch": 4.069530693270895, "grad_norm": 2.766205310821533, "learning_rate": 4.142705246250255e-06, "loss": 0.0899, "step": 39975 }, { "epoch": 4.072075740608775, "grad_norm": 4.015225410461426, "learning_rate": 4.131393796801123e-06, "loss": 0.0797, "step": 40000 }, { "epoch": 4.074620787946656, "grad_norm": 5.227399826049805, "learning_rate": 4.12008234735199e-06, "loss": 0.0803, "step": 40025 }, { "epoch": 4.077165835284537, "grad_norm": 1.7184827327728271, "learning_rate": 4.108770897902858e-06, "loss": 0.0851, "step": 40050 }, { "epoch": 4.079710882622416, "grad_norm": 3.887936592102051, "learning_rate": 4.097459448453725e-06, "loss": 0.0826, "step": 40075 }, { "epoch": 4.082255929960297, "grad_norm": 3.3515472412109375, "learning_rate": 4.0861479990045925e-06, "loss": 0.0808, "step": 40100 }, { "epoch": 4.084800977298178, "grad_norm": 1.4721667766571045, "learning_rate": 4.07483654955546e-06, "loss": 0.0734, "step": 40125 }, { "epoch": 4.087346024636059, "grad_norm": 3.627595901489258, "learning_rate": 4.063525100106328e-06, "loss": 0.0634, "step": 40150 }, { "epoch": 4.0898910719739385, "grad_norm": 0.0573771595954895, "learning_rate": 4.052213650657196e-06, "loss": 0.0792, "step": 40175 }, { "epoch": 4.092436119311819, "grad_norm": 2.682100772857666, "learning_rate": 4.040902201208063e-06, "loss": 0.0921, "step": 40200 }, { "epoch": 4.0949811666497, "grad_norm": 0.7701199054718018, "learning_rate": 4.029590751758931e-06, "loss": 0.0754, "step": 40225 }, { "epoch": 4.09752621398758, "grad_norm": 8.216882705688477, "learning_rate": 4.018279302309798e-06, "loss": 0.0925, "step": 40250 }, { "epoch": 4.100071261325461, "grad_norm": 5.027645111083984, "learning_rate": 4.006967852860666e-06, "loss": 0.0777, "step": 40275 }, { "epoch": 4.102616308663341, "grad_norm": 0.8864279985427856, "learning_rate": 3.9956564034115336e-06, "loss": 0.0746, "step": 40300 }, { "epoch": 4.105161356001221, "grad_norm": 1.5097590684890747, "learning_rate": 3.984344953962401e-06, "loss": 0.058, "step": 40325 }, { "epoch": 4.107706403339102, "grad_norm": 4.6194963455200195, "learning_rate": 3.973033504513268e-06, "loss": 0.0549, "step": 40350 }, { "epoch": 4.110251450676983, "grad_norm": 1.5456056594848633, "learning_rate": 3.961722055064136e-06, "loss": 0.0958, "step": 40375 }, { "epoch": 4.1127964980148635, "grad_norm": 6.5456647872924805, "learning_rate": 3.950410605615004e-06, "loss": 0.1041, "step": 40400 }, { "epoch": 4.115341545352743, "grad_norm": 3.12103009223938, "learning_rate": 3.939099156165871e-06, "loss": 0.0841, "step": 40425 }, { "epoch": 4.117886592690624, "grad_norm": 1.520664095878601, "learning_rate": 3.927787706716739e-06, "loss": 0.1077, "step": 40450 }, { "epoch": 4.120431640028505, "grad_norm": 2.8787078857421875, "learning_rate": 3.916476257267607e-06, "loss": 0.0653, "step": 40475 }, { "epoch": 4.122976687366385, "grad_norm": 0.7856115102767944, "learning_rate": 3.905164807818474e-06, "loss": 0.065, "step": 40500 }, { "epoch": 4.125521734704265, "grad_norm": 1.089637041091919, "learning_rate": 3.893853358369342e-06, "loss": 0.094, "step": 40525 }, { "epoch": 4.128066782042146, "grad_norm": 4.611286163330078, "learning_rate": 3.8825419089202095e-06, "loss": 0.0612, "step": 40550 }, { "epoch": 4.130611829380026, "grad_norm": 2.995384693145752, "learning_rate": 3.871230459471077e-06, "loss": 0.0677, "step": 40575 }, { "epoch": 4.133156876717907, "grad_norm": 2.4871647357940674, "learning_rate": 3.859919010021944e-06, "loss": 0.092, "step": 40600 }, { "epoch": 4.1357019240557875, "grad_norm": 0.4613381028175354, "learning_rate": 3.848607560572812e-06, "loss": 0.0927, "step": 40625 }, { "epoch": 4.138246971393668, "grad_norm": 4.346590042114258, "learning_rate": 3.837296111123679e-06, "loss": 0.073, "step": 40650 }, { "epoch": 4.140792018731548, "grad_norm": 4.693612575531006, "learning_rate": 3.825984661674547e-06, "loss": 0.0581, "step": 40675 }, { "epoch": 4.143337066069429, "grad_norm": 1.904029130935669, "learning_rate": 3.814673212225415e-06, "loss": 0.0616, "step": 40700 }, { "epoch": 4.14588211340731, "grad_norm": 4.891720294952393, "learning_rate": 3.8033617627762827e-06, "loss": 0.0797, "step": 40725 }, { "epoch": 4.1484271607451895, "grad_norm": 1.2688778638839722, "learning_rate": 3.79205031332715e-06, "loss": 0.0869, "step": 40750 }, { "epoch": 4.15097220808307, "grad_norm": 4.826283931732178, "learning_rate": 3.7807388638780175e-06, "loss": 0.0808, "step": 40775 }, { "epoch": 4.153517255420951, "grad_norm": 8.116747856140137, "learning_rate": 3.769427414428885e-06, "loss": 0.0762, "step": 40800 }, { "epoch": 4.156062302758832, "grad_norm": 5.6922807693481445, "learning_rate": 3.758568422957718e-06, "loss": 0.0831, "step": 40825 }, { "epoch": 4.158607350096712, "grad_norm": 2.4496312141418457, "learning_rate": 3.7472569735085855e-06, "loss": 0.0775, "step": 40850 }, { "epoch": 4.161152397434592, "grad_norm": 3.060380697250366, "learning_rate": 3.7359455240594533e-06, "loss": 0.0685, "step": 40875 }, { "epoch": 4.163697444772473, "grad_norm": 4.143198490142822, "learning_rate": 3.724634074610321e-06, "loss": 0.055, "step": 40900 }, { "epoch": 4.166242492110353, "grad_norm": 5.605116367340088, "learning_rate": 3.7133226251611886e-06, "loss": 0.0893, "step": 40925 }, { "epoch": 4.168787539448234, "grad_norm": 4.88701868057251, "learning_rate": 3.702011175712056e-06, "loss": 0.0536, "step": 40950 }, { "epoch": 4.171332586786114, "grad_norm": 3.8386881351470947, "learning_rate": 3.6906997262629234e-06, "loss": 0.0943, "step": 40975 }, { "epoch": 4.173877634123995, "grad_norm": 0.8875072598457336, "learning_rate": 3.6793882768137913e-06, "loss": 0.0643, "step": 41000 }, { "epoch": 4.176422681461875, "grad_norm": 3.3152079582214355, "learning_rate": 3.6680768273646587e-06, "loss": 0.0601, "step": 41025 }, { "epoch": 4.178967728799756, "grad_norm": 2.363868236541748, "learning_rate": 3.6567653779155265e-06, "loss": 0.0836, "step": 41050 }, { "epoch": 4.1815127761376365, "grad_norm": 1.1816363334655762, "learning_rate": 3.6454539284663944e-06, "loss": 0.0687, "step": 41075 }, { "epoch": 4.184057823475516, "grad_norm": 4.630997180938721, "learning_rate": 3.6341424790172614e-06, "loss": 0.0577, "step": 41100 }, { "epoch": 4.186602870813397, "grad_norm": 3.6776726245880127, "learning_rate": 3.6228310295681292e-06, "loss": 0.0838, "step": 41125 }, { "epoch": 4.189147918151278, "grad_norm": 4.161885738372803, "learning_rate": 3.6115195801189966e-06, "loss": 0.0694, "step": 41150 }, { "epoch": 4.191692965489158, "grad_norm": 3.5846166610717773, "learning_rate": 3.6002081306698645e-06, "loss": 0.0835, "step": 41175 }, { "epoch": 4.1942380128270385, "grad_norm": 3.8520216941833496, "learning_rate": 3.588896681220732e-06, "loss": 0.0767, "step": 41200 }, { "epoch": 4.196783060164919, "grad_norm": 4.335388660430908, "learning_rate": 3.5775852317715997e-06, "loss": 0.0542, "step": 41225 }, { "epoch": 4.1993281075028, "grad_norm": 1.6342220306396484, "learning_rate": 3.5662737823224667e-06, "loss": 0.0838, "step": 41250 }, { "epoch": 4.20187315484068, "grad_norm": 5.2642011642456055, "learning_rate": 3.5549623328733346e-06, "loss": 0.06, "step": 41275 }, { "epoch": 4.2044182021785605, "grad_norm": 3.4589884281158447, "learning_rate": 3.5436508834242024e-06, "loss": 0.067, "step": 41300 }, { "epoch": 4.206963249516441, "grad_norm": 6.109384536743164, "learning_rate": 3.53233943397507e-06, "loss": 0.0531, "step": 41325 }, { "epoch": 4.209508296854321, "grad_norm": 4.634017467498779, "learning_rate": 3.5210279845259377e-06, "loss": 0.0502, "step": 41350 }, { "epoch": 4.212053344192202, "grad_norm": 3.371673107147217, "learning_rate": 3.5097165350768047e-06, "loss": 0.0748, "step": 41375 }, { "epoch": 4.214598391530083, "grad_norm": 1.5770014524459839, "learning_rate": 3.4984050856276725e-06, "loss": 0.0845, "step": 41400 }, { "epoch": 4.2171434388679625, "grad_norm": 5.147093772888184, "learning_rate": 3.48709363617854e-06, "loss": 0.0725, "step": 41425 }, { "epoch": 4.219688486205843, "grad_norm": 0.8180754780769348, "learning_rate": 3.475782186729408e-06, "loss": 0.07, "step": 41450 }, { "epoch": 4.222233533543724, "grad_norm": 0.36477014422416687, "learning_rate": 3.4644707372802757e-06, "loss": 0.0587, "step": 41475 }, { "epoch": 4.224778580881605, "grad_norm": 2.6513445377349854, "learning_rate": 3.453159287831143e-06, "loss": 0.0811, "step": 41500 }, { "epoch": 4.227323628219485, "grad_norm": 8.273859977722168, "learning_rate": 3.4418478383820105e-06, "loss": 0.0902, "step": 41525 }, { "epoch": 4.229868675557365, "grad_norm": 1.1668052673339844, "learning_rate": 3.430536388932878e-06, "loss": 0.0587, "step": 41550 }, { "epoch": 4.232413722895246, "grad_norm": 4.071606636047363, "learning_rate": 3.4192249394837458e-06, "loss": 0.0882, "step": 41575 }, { "epoch": 4.234958770233126, "grad_norm": 2.3633975982666016, "learning_rate": 3.407913490034613e-06, "loss": 0.0491, "step": 41600 }, { "epoch": 4.237503817571007, "grad_norm": 5.4014363288879395, "learning_rate": 3.396602040585481e-06, "loss": 0.0747, "step": 41625 }, { "epoch": 4.240048864908887, "grad_norm": 5.445051193237305, "learning_rate": 3.385290591136349e-06, "loss": 0.0634, "step": 41650 }, { "epoch": 4.242593912246768, "grad_norm": 0.34296977519989014, "learning_rate": 3.373979141687216e-06, "loss": 0.0718, "step": 41675 }, { "epoch": 4.245138959584648, "grad_norm": 5.036444664001465, "learning_rate": 3.3626676922380837e-06, "loss": 0.0572, "step": 41700 }, { "epoch": 4.247684006922529, "grad_norm": 5.49483060836792, "learning_rate": 3.351356242788951e-06, "loss": 0.0779, "step": 41725 }, { "epoch": 4.2502290542604095, "grad_norm": 1.3740614652633667, "learning_rate": 3.340044793339819e-06, "loss": 0.0938, "step": 41750 }, { "epoch": 4.252774101598289, "grad_norm": 2.491666793823242, "learning_rate": 3.3287333438906864e-06, "loss": 0.0509, "step": 41775 }, { "epoch": 4.25531914893617, "grad_norm": 6.901576995849609, "learning_rate": 3.3174218944415543e-06, "loss": 0.0669, "step": 41800 }, { "epoch": 4.257864196274051, "grad_norm": 3.491737127304077, "learning_rate": 3.3061104449924213e-06, "loss": 0.0862, "step": 41825 }, { "epoch": 4.260409243611932, "grad_norm": 1.0229004621505737, "learning_rate": 3.294798995543289e-06, "loss": 0.0594, "step": 41850 }, { "epoch": 4.2629542909498115, "grad_norm": 4.715412139892578, "learning_rate": 3.283487546094157e-06, "loss": 0.0773, "step": 41875 }, { "epoch": 4.265499338287692, "grad_norm": 5.1960530281066895, "learning_rate": 3.2721760966450244e-06, "loss": 0.0618, "step": 41900 }, { "epoch": 4.268044385625573, "grad_norm": 1.639708161354065, "learning_rate": 3.260864647195892e-06, "loss": 0.0524, "step": 41925 }, { "epoch": 4.270589432963453, "grad_norm": 1.5220186710357666, "learning_rate": 3.249553197746759e-06, "loss": 0.0597, "step": 41950 }, { "epoch": 4.273134480301334, "grad_norm": 0.25770363211631775, "learning_rate": 3.238241748297627e-06, "loss": 0.0994, "step": 41975 }, { "epoch": 4.275679527639214, "grad_norm": 2.196244478225708, "learning_rate": 3.2269302988484945e-06, "loss": 0.055, "step": 42000 }, { "epoch": 4.278224574977094, "grad_norm": 0.6241010427474976, "learning_rate": 3.2156188493993623e-06, "loss": 0.0937, "step": 42025 }, { "epoch": 4.280769622314975, "grad_norm": 4.9643096923828125, "learning_rate": 3.20430739995023e-06, "loss": 0.0678, "step": 42050 }, { "epoch": 4.283314669652856, "grad_norm": 1.383632779121399, "learning_rate": 3.1929959505010976e-06, "loss": 0.0581, "step": 42075 }, { "epoch": 4.285859716990736, "grad_norm": 2.5081474781036377, "learning_rate": 3.181684501051965e-06, "loss": 0.0821, "step": 42100 }, { "epoch": 4.288404764328616, "grad_norm": 1.9261081218719482, "learning_rate": 3.1703730516028324e-06, "loss": 0.088, "step": 42125 }, { "epoch": 4.290949811666497, "grad_norm": 2.8013453483581543, "learning_rate": 3.1590616021537003e-06, "loss": 0.0634, "step": 42150 }, { "epoch": 4.293494859004378, "grad_norm": 2.583024501800537, "learning_rate": 3.1477501527045677e-06, "loss": 0.0826, "step": 42175 }, { "epoch": 4.296039906342258, "grad_norm": 2.5302445888519287, "learning_rate": 3.1364387032554355e-06, "loss": 0.1037, "step": 42200 }, { "epoch": 4.298584953680138, "grad_norm": 1.894404649734497, "learning_rate": 3.1251272538063034e-06, "loss": 0.0693, "step": 42225 }, { "epoch": 4.301130001018019, "grad_norm": 4.712514877319336, "learning_rate": 3.1138158043571704e-06, "loss": 0.074, "step": 42250 }, { "epoch": 4.303675048355899, "grad_norm": 6.2421183586120605, "learning_rate": 3.1025043549080382e-06, "loss": 0.0831, "step": 42275 }, { "epoch": 4.30622009569378, "grad_norm": 6.7714924812316895, "learning_rate": 3.0911929054589057e-06, "loss": 0.0691, "step": 42300 }, { "epoch": 4.3087651430316605, "grad_norm": 1.5627204179763794, "learning_rate": 3.0798814560097735e-06, "loss": 0.0689, "step": 42325 }, { "epoch": 4.311310190369541, "grad_norm": 0.7110241651535034, "learning_rate": 3.068570006560641e-06, "loss": 0.0668, "step": 42350 }, { "epoch": 4.313855237707421, "grad_norm": 4.596431732177734, "learning_rate": 3.0572585571115088e-06, "loss": 0.0782, "step": 42375 }, { "epoch": 4.316400285045302, "grad_norm": 4.947603702545166, "learning_rate": 3.0459471076623758e-06, "loss": 0.0566, "step": 42400 }, { "epoch": 4.318945332383183, "grad_norm": 3.2261035442352295, "learning_rate": 3.0346356582132436e-06, "loss": 0.0602, "step": 42425 }, { "epoch": 4.321490379721062, "grad_norm": 1.8598774671554565, "learning_rate": 3.0233242087641115e-06, "loss": 0.0789, "step": 42450 }, { "epoch": 4.324035427058943, "grad_norm": 5.585264682769775, "learning_rate": 3.012012759314979e-06, "loss": 0.088, "step": 42475 }, { "epoch": 4.326580474396824, "grad_norm": 6.0666046142578125, "learning_rate": 3.0007013098658467e-06, "loss": 0.0787, "step": 42500 }, { "epoch": 4.329125521734705, "grad_norm": 3.998551607131958, "learning_rate": 2.989389860416714e-06, "loss": 0.1013, "step": 42525 }, { "epoch": 4.3316705690725845, "grad_norm": 0.7877366542816162, "learning_rate": 2.9780784109675816e-06, "loss": 0.0691, "step": 42550 }, { "epoch": 4.334215616410465, "grad_norm": 4.090976238250732, "learning_rate": 2.966766961518449e-06, "loss": 0.0796, "step": 42575 }, { "epoch": 4.336760663748346, "grad_norm": 2.965317726135254, "learning_rate": 2.955455512069317e-06, "loss": 0.0927, "step": 42600 }, { "epoch": 4.339305711086226, "grad_norm": 0.9894925355911255, "learning_rate": 2.9441440626201847e-06, "loss": 0.0613, "step": 42625 }, { "epoch": 4.341850758424107, "grad_norm": 3.499403953552246, "learning_rate": 2.932832613171052e-06, "loss": 0.0857, "step": 42650 }, { "epoch": 4.344395805761987, "grad_norm": 2.901555061340332, "learning_rate": 2.921521163721919e-06, "loss": 0.0583, "step": 42675 }, { "epoch": 4.346940853099868, "grad_norm": 1.397181749343872, "learning_rate": 2.910209714272787e-06, "loss": 0.0684, "step": 42700 }, { "epoch": 4.349485900437748, "grad_norm": 2.98871111869812, "learning_rate": 2.8988982648236548e-06, "loss": 0.0699, "step": 42725 }, { "epoch": 4.352030947775629, "grad_norm": 2.4545464515686035, "learning_rate": 2.887586815374522e-06, "loss": 0.0771, "step": 42750 }, { "epoch": 4.3545759951135095, "grad_norm": 1.2507919073104858, "learning_rate": 2.87627536592539e-06, "loss": 0.0912, "step": 42775 }, { "epoch": 4.357121042451389, "grad_norm": 3.777596950531006, "learning_rate": 2.864963916476258e-06, "loss": 0.0766, "step": 42800 }, { "epoch": 4.35966608978927, "grad_norm": 3.1019132137298584, "learning_rate": 2.853652467027125e-06, "loss": 0.0637, "step": 42825 }, { "epoch": 4.362211137127151, "grad_norm": 1.4555113315582275, "learning_rate": 2.8423410175779923e-06, "loss": 0.0932, "step": 42850 }, { "epoch": 4.364756184465031, "grad_norm": 1.262918472290039, "learning_rate": 2.83102956812886e-06, "loss": 0.0606, "step": 42875 }, { "epoch": 4.367301231802911, "grad_norm": 1.907347321510315, "learning_rate": 2.819718118679728e-06, "loss": 0.0749, "step": 42900 }, { "epoch": 4.369846279140792, "grad_norm": 0.9305869340896606, "learning_rate": 2.8084066692305954e-06, "loss": 0.0793, "step": 42925 }, { "epoch": 4.372391326478673, "grad_norm": 2.1973392963409424, "learning_rate": 2.7970952197814633e-06, "loss": 0.091, "step": 42950 }, { "epoch": 4.374936373816553, "grad_norm": 4.098190784454346, "learning_rate": 2.7857837703323303e-06, "loss": 0.0951, "step": 42975 }, { "epoch": 4.3774814211544335, "grad_norm": 3.911076307296753, "learning_rate": 2.774472320883198e-06, "loss": 0.081, "step": 43000 }, { "epoch": 4.380026468492314, "grad_norm": 4.323380470275879, "learning_rate": 2.7631608714340655e-06, "loss": 0.0565, "step": 43025 }, { "epoch": 4.382571515830194, "grad_norm": 1.0312169790267944, "learning_rate": 2.7518494219849334e-06, "loss": 0.0604, "step": 43050 }, { "epoch": 4.385116563168075, "grad_norm": 3.453737258911133, "learning_rate": 2.7405379725358012e-06, "loss": 0.0684, "step": 43075 }, { "epoch": 4.387661610505956, "grad_norm": 1.7941606044769287, "learning_rate": 2.7292265230866687e-06, "loss": 0.0412, "step": 43100 }, { "epoch": 4.3902066578438355, "grad_norm": 1.500749945640564, "learning_rate": 2.717915073637536e-06, "loss": 0.0753, "step": 43125 }, { "epoch": 4.392751705181716, "grad_norm": 6.635108470916748, "learning_rate": 2.7066036241884035e-06, "loss": 0.0708, "step": 43150 }, { "epoch": 4.395296752519597, "grad_norm": 7.838114261627197, "learning_rate": 2.6952921747392713e-06, "loss": 0.0713, "step": 43175 }, { "epoch": 4.397841799857478, "grad_norm": 5.980000972747803, "learning_rate": 2.683980725290139e-06, "loss": 0.0757, "step": 43200 }, { "epoch": 4.4003868471953576, "grad_norm": 2.0675323009490967, "learning_rate": 2.6726692758410066e-06, "loss": 0.0795, "step": 43225 }, { "epoch": 4.402931894533238, "grad_norm": 4.80326509475708, "learning_rate": 2.6613578263918745e-06, "loss": 0.055, "step": 43250 }, { "epoch": 4.405476941871119, "grad_norm": 2.1413259506225586, "learning_rate": 2.6500463769427414e-06, "loss": 0.0775, "step": 43275 }, { "epoch": 4.408021989208999, "grad_norm": 1.2402043342590332, "learning_rate": 2.6387349274936093e-06, "loss": 0.0661, "step": 43300 }, { "epoch": 4.41056703654688, "grad_norm": 2.842992067337036, "learning_rate": 2.6274234780444767e-06, "loss": 0.0664, "step": 43325 }, { "epoch": 4.41311208388476, "grad_norm": 4.652822017669678, "learning_rate": 2.6161120285953446e-06, "loss": 0.077, "step": 43350 }, { "epoch": 4.415657131222641, "grad_norm": 2.196883201599121, "learning_rate": 2.6048005791462124e-06, "loss": 0.0595, "step": 43375 }, { "epoch": 4.418202178560521, "grad_norm": 3.021293878555298, "learning_rate": 2.5934891296970794e-06, "loss": 0.0633, "step": 43400 }, { "epoch": 4.420747225898402, "grad_norm": 7.387508392333984, "learning_rate": 2.582177680247947e-06, "loss": 0.0617, "step": 43425 }, { "epoch": 4.4232922732362825, "grad_norm": 1.5187947750091553, "learning_rate": 2.5708662307988147e-06, "loss": 0.0466, "step": 43450 }, { "epoch": 4.425837320574162, "grad_norm": 5.505544662475586, "learning_rate": 2.5595547813496825e-06, "loss": 0.0636, "step": 43475 }, { "epoch": 4.428382367912043, "grad_norm": 0.9313585162162781, "learning_rate": 2.54824333190055e-06, "loss": 0.0765, "step": 43500 }, { "epoch": 4.430927415249924, "grad_norm": 0.5325255393981934, "learning_rate": 2.5369318824514178e-06, "loss": 0.0807, "step": 43525 }, { "epoch": 4.433472462587805, "grad_norm": 7.443079471588135, "learning_rate": 2.5256204330022848e-06, "loss": 0.0659, "step": 43550 }, { "epoch": 4.436017509925684, "grad_norm": 1.6353135108947754, "learning_rate": 2.5143089835531526e-06, "loss": 0.0854, "step": 43575 }, { "epoch": 4.438562557263565, "grad_norm": 5.86698055267334, "learning_rate": 2.50299753410402e-06, "loss": 0.0658, "step": 43600 }, { "epoch": 4.441107604601446, "grad_norm": 5.454168319702148, "learning_rate": 2.491686084654888e-06, "loss": 0.0696, "step": 43625 }, { "epoch": 4.443652651939326, "grad_norm": 4.088606834411621, "learning_rate": 2.4803746352057553e-06, "loss": 0.0687, "step": 43650 }, { "epoch": 4.4461976992772065, "grad_norm": 3.597466468811035, "learning_rate": 2.469063185756623e-06, "loss": 0.0821, "step": 43675 }, { "epoch": 4.448742746615087, "grad_norm": 4.886236667633057, "learning_rate": 2.4577517363074906e-06, "loss": 0.0924, "step": 43700 }, { "epoch": 4.451287793952967, "grad_norm": 1.914562702178955, "learning_rate": 2.446440286858358e-06, "loss": 0.0541, "step": 43725 }, { "epoch": 4.453832841290848, "grad_norm": 5.678765773773193, "learning_rate": 2.435128837409226e-06, "loss": 0.061, "step": 43750 }, { "epoch": 4.456377888628729, "grad_norm": 1.6028286218643188, "learning_rate": 2.4238173879600933e-06, "loss": 0.069, "step": 43775 }, { "epoch": 4.458922935966609, "grad_norm": 4.589537620544434, "learning_rate": 2.4125059385109607e-06, "loss": 0.0944, "step": 43800 }, { "epoch": 4.461467983304489, "grad_norm": 0.4559231400489807, "learning_rate": 2.4011944890618285e-06, "loss": 0.0735, "step": 43825 }, { "epoch": 4.46401303064237, "grad_norm": 3.7404086589813232, "learning_rate": 2.3898830396126964e-06, "loss": 0.0507, "step": 43850 }, { "epoch": 4.466558077980251, "grad_norm": 0.8973321914672852, "learning_rate": 2.378571590163564e-06, "loss": 0.085, "step": 43875 }, { "epoch": 4.469103125318131, "grad_norm": 0.6244153380393982, "learning_rate": 2.3672601407144312e-06, "loss": 0.0778, "step": 43900 }, { "epoch": 4.471648172656011, "grad_norm": 0.38073861598968506, "learning_rate": 2.355948691265299e-06, "loss": 0.0658, "step": 43925 }, { "epoch": 4.474193219993892, "grad_norm": 4.970991611480713, "learning_rate": 2.3446372418161665e-06, "loss": 0.073, "step": 43950 }, { "epoch": 4.476738267331772, "grad_norm": 5.226351261138916, "learning_rate": 2.333325792367034e-06, "loss": 0.0753, "step": 43975 }, { "epoch": 4.479283314669653, "grad_norm": 2.576582431793213, "learning_rate": 2.3220143429179018e-06, "loss": 0.0623, "step": 44000 }, { "epoch": 4.481828362007533, "grad_norm": 1.2187014818191528, "learning_rate": 2.310702893468769e-06, "loss": 0.078, "step": 44025 }, { "epoch": 4.484373409345414, "grad_norm": 4.498028755187988, "learning_rate": 2.299391444019637e-06, "loss": 0.069, "step": 44050 }, { "epoch": 4.486918456683294, "grad_norm": 1.715052604675293, "learning_rate": 2.2880799945705044e-06, "loss": 0.0685, "step": 44075 }, { "epoch": 4.489463504021175, "grad_norm": 5.307081699371338, "learning_rate": 2.276768545121372e-06, "loss": 0.0679, "step": 44100 }, { "epoch": 4.4920085513590555, "grad_norm": 4.219576835632324, "learning_rate": 2.2654570956722397e-06, "loss": 0.0607, "step": 44125 }, { "epoch": 4.494553598696935, "grad_norm": 4.1396355628967285, "learning_rate": 2.254145646223107e-06, "loss": 0.0845, "step": 44150 }, { "epoch": 4.497098646034816, "grad_norm": 2.164240837097168, "learning_rate": 2.2428341967739746e-06, "loss": 0.0707, "step": 44175 }, { "epoch": 4.499643693372697, "grad_norm": 5.686645984649658, "learning_rate": 2.2315227473248424e-06, "loss": 0.0898, "step": 44200 }, { "epoch": 4.502188740710578, "grad_norm": 6.213311672210693, "learning_rate": 2.2202112978757102e-06, "loss": 0.0622, "step": 44225 }, { "epoch": 4.5047337880484575, "grad_norm": 1.4041837453842163, "learning_rate": 2.2088998484265777e-06, "loss": 0.0477, "step": 44250 }, { "epoch": 4.507278835386338, "grad_norm": 6.660885810852051, "learning_rate": 2.197588398977445e-06, "loss": 0.0729, "step": 44275 }, { "epoch": 4.509823882724219, "grad_norm": 4.362778663635254, "learning_rate": 2.186276949528313e-06, "loss": 0.0633, "step": 44300 }, { "epoch": 4.512368930062099, "grad_norm": 3.135101556777954, "learning_rate": 2.1749655000791804e-06, "loss": 0.0804, "step": 44325 }, { "epoch": 4.51491397739998, "grad_norm": 1.2437292337417603, "learning_rate": 2.1636540506300478e-06, "loss": 0.0768, "step": 44350 }, { "epoch": 4.51745902473786, "grad_norm": 4.525523662567139, "learning_rate": 2.152342601180915e-06, "loss": 0.0566, "step": 44375 }, { "epoch": 4.520004072075741, "grad_norm": 4.344095230102539, "learning_rate": 2.141031151731783e-06, "loss": 0.0749, "step": 44400 }, { "epoch": 4.522549119413621, "grad_norm": 3.743915319442749, "learning_rate": 2.129719702282651e-06, "loss": 0.0769, "step": 44425 }, { "epoch": 4.525094166751502, "grad_norm": 3.3865807056427, "learning_rate": 2.1184082528335183e-06, "loss": 0.0692, "step": 44450 }, { "epoch": 4.527639214089382, "grad_norm": 1.5223188400268555, "learning_rate": 2.1070968033843857e-06, "loss": 0.0508, "step": 44475 }, { "epoch": 4.530184261427262, "grad_norm": 1.2191075086593628, "learning_rate": 2.0957853539352536e-06, "loss": 0.0626, "step": 44500 }, { "epoch": 4.532729308765143, "grad_norm": 1.1970452070236206, "learning_rate": 2.084473904486121e-06, "loss": 0.0684, "step": 44525 }, { "epoch": 4.535274356103024, "grad_norm": 2.423342704772949, "learning_rate": 2.0731624550369884e-06, "loss": 0.0585, "step": 44550 }, { "epoch": 4.537819403440904, "grad_norm": 4.101266384124756, "learning_rate": 2.0618510055878563e-06, "loss": 0.0692, "step": 44575 }, { "epoch": 4.540364450778784, "grad_norm": 4.081343173980713, "learning_rate": 2.0505395561387237e-06, "loss": 0.0932, "step": 44600 }, { "epoch": 4.542909498116665, "grad_norm": 2.0687055587768555, "learning_rate": 2.0392281066895915e-06, "loss": 0.0529, "step": 44625 }, { "epoch": 4.545454545454545, "grad_norm": 5.054721832275391, "learning_rate": 2.027916657240459e-06, "loss": 0.0739, "step": 44650 }, { "epoch": 4.547999592792426, "grad_norm": 0.6480259299278259, "learning_rate": 2.0166052077913264e-06, "loss": 0.0529, "step": 44675 }, { "epoch": 4.5505446401303065, "grad_norm": 0.2223999947309494, "learning_rate": 2.0052937583421942e-06, "loss": 0.0812, "step": 44700 }, { "epoch": 4.553089687468187, "grad_norm": 9.688573837280273, "learning_rate": 1.9939823088930616e-06, "loss": 0.0608, "step": 44725 }, { "epoch": 4.555634734806067, "grad_norm": 2.712524652481079, "learning_rate": 1.982670859443929e-06, "loss": 0.0618, "step": 44750 }, { "epoch": 4.558179782143948, "grad_norm": 4.108211040496826, "learning_rate": 1.971359409994797e-06, "loss": 0.0673, "step": 44775 }, { "epoch": 4.560724829481829, "grad_norm": 1.9700902700424194, "learning_rate": 1.9600479605456648e-06, "loss": 0.0721, "step": 44800 }, { "epoch": 4.563269876819708, "grad_norm": 3.264569044113159, "learning_rate": 1.948736511096532e-06, "loss": 0.0691, "step": 44825 }, { "epoch": 4.565814924157589, "grad_norm": 1.5905423164367676, "learning_rate": 1.9374250616473996e-06, "loss": 0.0738, "step": 44850 }, { "epoch": 4.56835997149547, "grad_norm": 1.5995819568634033, "learning_rate": 1.9261136121982674e-06, "loss": 0.0853, "step": 44875 }, { "epoch": 4.570905018833351, "grad_norm": 2.281751871109009, "learning_rate": 1.914802162749135e-06, "loss": 0.0587, "step": 44900 }, { "epoch": 4.5734500661712305, "grad_norm": 1.2428480386734009, "learning_rate": 1.9034907133000025e-06, "loss": 0.0597, "step": 44925 }, { "epoch": 4.575995113509111, "grad_norm": 3.320080518722534, "learning_rate": 1.8921792638508701e-06, "loss": 0.04, "step": 44950 }, { "epoch": 4.578540160846992, "grad_norm": 4.8358330726623535, "learning_rate": 1.8808678144017376e-06, "loss": 0.0716, "step": 44975 }, { "epoch": 4.581085208184872, "grad_norm": 2.170146942138672, "learning_rate": 1.8695563649526052e-06, "loss": 0.0595, "step": 45000 }, { "epoch": 4.583630255522753, "grad_norm": 6.489071846008301, "learning_rate": 1.8582449155034726e-06, "loss": 0.0798, "step": 45025 }, { "epoch": 4.586175302860633, "grad_norm": 2.01780366897583, "learning_rate": 1.8469334660543402e-06, "loss": 0.0537, "step": 45050 }, { "epoch": 4.588720350198514, "grad_norm": 6.3545637130737305, "learning_rate": 1.835622016605208e-06, "loss": 0.08, "step": 45075 }, { "epoch": 4.591265397536394, "grad_norm": 1.5675911903381348, "learning_rate": 1.8243105671560755e-06, "loss": 0.0953, "step": 45100 }, { "epoch": 4.593810444874275, "grad_norm": 1.2118556499481201, "learning_rate": 1.8129991177069431e-06, "loss": 0.0519, "step": 45125 }, { "epoch": 4.5963554922121554, "grad_norm": 1.4789040088653564, "learning_rate": 1.8016876682578108e-06, "loss": 0.1003, "step": 45150 }, { "epoch": 4.598900539550035, "grad_norm": 4.09182071685791, "learning_rate": 1.7903762188086782e-06, "loss": 0.0774, "step": 45175 }, { "epoch": 4.601445586887916, "grad_norm": 5.658119201660156, "learning_rate": 1.7790647693595458e-06, "loss": 0.0947, "step": 45200 }, { "epoch": 4.603990634225797, "grad_norm": 1.70206618309021, "learning_rate": 1.7677533199104135e-06, "loss": 0.0708, "step": 45225 }, { "epoch": 4.6065356815636775, "grad_norm": 1.5394465923309326, "learning_rate": 1.7564418704612809e-06, "loss": 0.1066, "step": 45250 }, { "epoch": 4.609080728901557, "grad_norm": 1.8833317756652832, "learning_rate": 1.7451304210121487e-06, "loss": 0.0555, "step": 45275 }, { "epoch": 4.611625776239438, "grad_norm": 1.0039482116699219, "learning_rate": 1.7342714295409816e-06, "loss": 0.0826, "step": 45300 }, { "epoch": 4.614170823577319, "grad_norm": 2.0755767822265625, "learning_rate": 1.7229599800918492e-06, "loss": 0.0528, "step": 45325 }, { "epoch": 4.616715870915199, "grad_norm": 4.706948280334473, "learning_rate": 1.7116485306427167e-06, "loss": 0.0594, "step": 45350 }, { "epoch": 4.6192609182530795, "grad_norm": 5.853977680206299, "learning_rate": 1.7003370811935843e-06, "loss": 0.0731, "step": 45375 }, { "epoch": 4.62180596559096, "grad_norm": 1.9859325885772705, "learning_rate": 1.689025631744452e-06, "loss": 0.0808, "step": 45400 }, { "epoch": 4.62435101292884, "grad_norm": 2.232928514480591, "learning_rate": 1.6777141822953193e-06, "loss": 0.0684, "step": 45425 }, { "epoch": 4.626896060266721, "grad_norm": 1.054357886314392, "learning_rate": 1.6664027328461872e-06, "loss": 0.079, "step": 45450 }, { "epoch": 4.629441107604602, "grad_norm": 3.0795228481292725, "learning_rate": 1.6550912833970548e-06, "loss": 0.0828, "step": 45475 }, { "epoch": 4.6319861549424814, "grad_norm": 2.889328718185425, "learning_rate": 1.6437798339479222e-06, "loss": 0.0683, "step": 45500 }, { "epoch": 4.634531202280362, "grad_norm": 2.0245718955993652, "learning_rate": 1.6324683844987899e-06, "loss": 0.0757, "step": 45525 }, { "epoch": 4.637076249618243, "grad_norm": 6.983794212341309, "learning_rate": 1.6211569350496573e-06, "loss": 0.0748, "step": 45550 }, { "epoch": 4.639621296956124, "grad_norm": 2.034707546234131, "learning_rate": 1.609845485600525e-06, "loss": 0.0844, "step": 45575 }, { "epoch": 4.6421663442940035, "grad_norm": 2.9529507160186768, "learning_rate": 1.5985340361513926e-06, "loss": 0.078, "step": 45600 }, { "epoch": 4.644711391631884, "grad_norm": 3.2529144287109375, "learning_rate": 1.58722258670226e-06, "loss": 0.0762, "step": 45625 }, { "epoch": 4.647256438969765, "grad_norm": 3.109240770339966, "learning_rate": 1.5759111372531276e-06, "loss": 0.0811, "step": 45650 }, { "epoch": 4.649801486307645, "grad_norm": 3.7589831352233887, "learning_rate": 1.5645996878039955e-06, "loss": 0.0643, "step": 45675 }, { "epoch": 4.652346533645526, "grad_norm": 8.635754585266113, "learning_rate": 1.5532882383548629e-06, "loss": 0.0701, "step": 45700 }, { "epoch": 4.654891580983406, "grad_norm": 1.9032418727874756, "learning_rate": 1.5419767889057305e-06, "loss": 0.0684, "step": 45725 }, { "epoch": 4.657436628321287, "grad_norm": 2.2510008811950684, "learning_rate": 1.5306653394565982e-06, "loss": 0.0579, "step": 45750 }, { "epoch": 4.659981675659167, "grad_norm": 4.032014846801758, "learning_rate": 1.5193538900074656e-06, "loss": 0.0824, "step": 45775 }, { "epoch": 4.662526722997048, "grad_norm": 4.007706642150879, "learning_rate": 1.5080424405583332e-06, "loss": 0.0851, "step": 45800 }, { "epoch": 4.6650717703349285, "grad_norm": 0.07311418652534485, "learning_rate": 1.496730991109201e-06, "loss": 0.1039, "step": 45825 }, { "epoch": 4.667616817672808, "grad_norm": 1.875131607055664, "learning_rate": 1.4854195416600683e-06, "loss": 0.0788, "step": 45850 }, { "epoch": 4.670161865010689, "grad_norm": 2.887582778930664, "learning_rate": 1.4741080922109361e-06, "loss": 0.0558, "step": 45875 }, { "epoch": 4.67270691234857, "grad_norm": 1.3971534967422485, "learning_rate": 1.4627966427618037e-06, "loss": 0.073, "step": 45900 }, { "epoch": 4.675251959686451, "grad_norm": 1.5338753461837769, "learning_rate": 1.4514851933126712e-06, "loss": 0.0696, "step": 45925 }, { "epoch": 4.67779700702433, "grad_norm": 6.803866863250732, "learning_rate": 1.4401737438635388e-06, "loss": 0.0676, "step": 45950 }, { "epoch": 4.680342054362211, "grad_norm": 0.43175041675567627, "learning_rate": 1.4288622944144064e-06, "loss": 0.0737, "step": 45975 }, { "epoch": 4.682887101700092, "grad_norm": 2.761035919189453, "learning_rate": 1.4175508449652739e-06, "loss": 0.0587, "step": 46000 }, { "epoch": 4.685432149037972, "grad_norm": 4.648847579956055, "learning_rate": 1.4062393955161415e-06, "loss": 0.0832, "step": 46025 }, { "epoch": 4.6879771963758525, "grad_norm": 3.9929113388061523, "learning_rate": 1.3949279460670093e-06, "loss": 0.0854, "step": 46050 }, { "epoch": 4.690522243713733, "grad_norm": 2.7865846157073975, "learning_rate": 1.3836164966178768e-06, "loss": 0.0774, "step": 46075 }, { "epoch": 4.693067291051614, "grad_norm": 2.160618543624878, "learning_rate": 1.3723050471687444e-06, "loss": 0.0674, "step": 46100 }, { "epoch": 4.695612338389494, "grad_norm": 1.6433202028274536, "learning_rate": 1.360993597719612e-06, "loss": 0.0912, "step": 46125 }, { "epoch": 4.698157385727375, "grad_norm": 1.3290517330169678, "learning_rate": 1.3496821482704794e-06, "loss": 0.095, "step": 46150 }, { "epoch": 4.700702433065255, "grad_norm": 0.6854248046875, "learning_rate": 1.338370698821347e-06, "loss": 0.0713, "step": 46175 }, { "epoch": 4.703247480403135, "grad_norm": 1.6427950859069824, "learning_rate": 1.3270592493722145e-06, "loss": 0.054, "step": 46200 }, { "epoch": 4.705792527741016, "grad_norm": 2.2108538150787354, "learning_rate": 1.3157477999230821e-06, "loss": 0.0655, "step": 46225 }, { "epoch": 4.708337575078897, "grad_norm": 3.9831509590148926, "learning_rate": 1.30443635047395e-06, "loss": 0.0738, "step": 46250 }, { "epoch": 4.710882622416777, "grad_norm": 4.917258262634277, "learning_rate": 1.2931249010248174e-06, "loss": 0.0916, "step": 46275 }, { "epoch": 4.713427669754657, "grad_norm": 0.8042176365852356, "learning_rate": 1.281813451575685e-06, "loss": 0.068, "step": 46300 }, { "epoch": 4.715972717092538, "grad_norm": 0.9012075662612915, "learning_rate": 1.2705020021265527e-06, "loss": 0.045, "step": 46325 }, { "epoch": 4.718517764430418, "grad_norm": 0.9209958910942078, "learning_rate": 1.25919055267742e-06, "loss": 0.0564, "step": 46350 }, { "epoch": 4.721062811768299, "grad_norm": 1.8551143407821655, "learning_rate": 1.2478791032282877e-06, "loss": 0.0805, "step": 46375 }, { "epoch": 4.723607859106179, "grad_norm": 0.4579830467700958, "learning_rate": 1.2365676537791554e-06, "loss": 0.0493, "step": 46400 }, { "epoch": 4.72615290644406, "grad_norm": 0.38200685381889343, "learning_rate": 1.225256204330023e-06, "loss": 0.0859, "step": 46425 }, { "epoch": 4.72869795378194, "grad_norm": 3.643800735473633, "learning_rate": 1.2139447548808906e-06, "loss": 0.0613, "step": 46450 }, { "epoch": 4.731243001119821, "grad_norm": 3.732656240463257, "learning_rate": 1.202633305431758e-06, "loss": 0.0685, "step": 46475 }, { "epoch": 4.7337880484577015, "grad_norm": 5.9201579093933105, "learning_rate": 1.1913218559826257e-06, "loss": 0.0697, "step": 46500 }, { "epoch": 4.736333095795581, "grad_norm": 6.302225112915039, "learning_rate": 1.1800104065334933e-06, "loss": 0.0808, "step": 46525 }, { "epoch": 4.738878143133462, "grad_norm": 0.6190359592437744, "learning_rate": 1.168698957084361e-06, "loss": 0.081, "step": 46550 }, { "epoch": 4.741423190471343, "grad_norm": 9.110676765441895, "learning_rate": 1.1578399656131938e-06, "loss": 0.0815, "step": 46575 }, { "epoch": 4.743968237809224, "grad_norm": 5.75714111328125, "learning_rate": 1.1465285161640612e-06, "loss": 0.0875, "step": 46600 }, { "epoch": 4.7465132851471035, "grad_norm": 1.9609627723693848, "learning_rate": 1.135217066714929e-06, "loss": 0.0711, "step": 46625 }, { "epoch": 4.749058332484984, "grad_norm": 2.6252379417419434, "learning_rate": 1.1239056172657965e-06, "loss": 0.0544, "step": 46650 }, { "epoch": 4.751603379822865, "grad_norm": 1.7580702304840088, "learning_rate": 1.1125941678166641e-06, "loss": 0.0975, "step": 46675 }, { "epoch": 4.754148427160745, "grad_norm": 1.8180975914001465, "learning_rate": 1.1012827183675318e-06, "loss": 0.0843, "step": 46700 }, { "epoch": 4.756693474498626, "grad_norm": 4.706656455993652, "learning_rate": 1.0899712689183994e-06, "loss": 0.1002, "step": 46725 }, { "epoch": 4.759238521836506, "grad_norm": 2.2304418087005615, "learning_rate": 1.0786598194692668e-06, "loss": 0.0497, "step": 46750 }, { "epoch": 4.761783569174387, "grad_norm": 3.291037082672119, "learning_rate": 1.0673483700201345e-06, "loss": 0.0993, "step": 46775 }, { "epoch": 4.764328616512267, "grad_norm": 1.777855634689331, "learning_rate": 1.056036920571002e-06, "loss": 0.0683, "step": 46800 }, { "epoch": 4.766873663850148, "grad_norm": 2.230701208114624, "learning_rate": 1.0447254711218697e-06, "loss": 0.0892, "step": 46825 }, { "epoch": 4.769418711188028, "grad_norm": 5.757084846496582, "learning_rate": 1.0334140216727371e-06, "loss": 0.0594, "step": 46850 }, { "epoch": 4.771963758525908, "grad_norm": 2.717808961868286, "learning_rate": 1.0221025722236048e-06, "loss": 0.0766, "step": 46875 }, { "epoch": 4.774508805863789, "grad_norm": 4.532754898071289, "learning_rate": 1.0107911227744724e-06, "loss": 0.0603, "step": 46900 }, { "epoch": 4.77705385320167, "grad_norm": 2.240419626235962, "learning_rate": 9.994796733253398e-07, "loss": 0.0797, "step": 46925 }, { "epoch": 4.7795989005395505, "grad_norm": 1.5470459461212158, "learning_rate": 9.881682238762077e-07, "loss": 0.0602, "step": 46950 }, { "epoch": 4.78214394787743, "grad_norm": 3.4554860591888428, "learning_rate": 9.76856774427075e-07, "loss": 0.0658, "step": 46975 }, { "epoch": 4.784688995215311, "grad_norm": 5.003359794616699, "learning_rate": 9.655453249779427e-07, "loss": 0.0669, "step": 47000 }, { "epoch": 4.787234042553192, "grad_norm": 2.6245150566101074, "learning_rate": 9.542338755288104e-07, "loss": 0.0566, "step": 47025 }, { "epoch": 4.789779089891072, "grad_norm": 3.7440950870513916, "learning_rate": 9.429224260796779e-07, "loss": 0.0832, "step": 47050 }, { "epoch": 4.7923241372289525, "grad_norm": 3.2227530479431152, "learning_rate": 9.316109766305455e-07, "loss": 0.0925, "step": 47075 }, { "epoch": 4.794869184566833, "grad_norm": 0.83717280626297, "learning_rate": 9.202995271814132e-07, "loss": 0.0611, "step": 47100 }, { "epoch": 4.797414231904713, "grad_norm": 1.0189921855926514, "learning_rate": 9.089880777322807e-07, "loss": 0.0941, "step": 47125 }, { "epoch": 4.799959279242594, "grad_norm": 7.039854526519775, "learning_rate": 8.976766282831482e-07, "loss": 0.0621, "step": 47150 }, { "epoch": 4.8025043265804745, "grad_norm": 2.496320962905884, "learning_rate": 8.86365178834016e-07, "loss": 0.0617, "step": 47175 }, { "epoch": 4.805049373918354, "grad_norm": 2.810091972351074, "learning_rate": 8.750537293848835e-07, "loss": 0.0708, "step": 47200 }, { "epoch": 4.807594421256235, "grad_norm": 4.1311469078063965, "learning_rate": 8.63742279935751e-07, "loss": 0.0732, "step": 47225 }, { "epoch": 4.810139468594116, "grad_norm": 3.5702366828918457, "learning_rate": 8.524308304866185e-07, "loss": 0.0798, "step": 47250 }, { "epoch": 4.812684515931997, "grad_norm": 2.6123664379119873, "learning_rate": 8.411193810374863e-07, "loss": 0.0673, "step": 47275 }, { "epoch": 4.8152295632698765, "grad_norm": 1.623597264289856, "learning_rate": 8.298079315883538e-07, "loss": 0.0998, "step": 47300 }, { "epoch": 4.817774610607757, "grad_norm": 4.469209671020508, "learning_rate": 8.184964821392213e-07, "loss": 0.081, "step": 47325 }, { "epoch": 4.820319657945638, "grad_norm": 3.1409494876861572, "learning_rate": 8.071850326900891e-07, "loss": 0.0713, "step": 47350 }, { "epoch": 4.822864705283518, "grad_norm": 3.3526580333709717, "learning_rate": 7.958735832409566e-07, "loss": 0.0848, "step": 47375 }, { "epoch": 4.825409752621399, "grad_norm": 1.9072054624557495, "learning_rate": 7.845621337918241e-07, "loss": 0.0883, "step": 47400 }, { "epoch": 4.827954799959279, "grad_norm": 1.9279520511627197, "learning_rate": 7.732506843426918e-07, "loss": 0.0579, "step": 47425 }, { "epoch": 4.83049984729716, "grad_norm": 4.597317695617676, "learning_rate": 7.619392348935594e-07, "loss": 0.0817, "step": 47450 }, { "epoch": 4.83304489463504, "grad_norm": 0.9331306219100952, "learning_rate": 7.506277854444269e-07, "loss": 0.083, "step": 47475 }, { "epoch": 4.835589941972921, "grad_norm": 2.566190719604492, "learning_rate": 7.393163359952945e-07, "loss": 0.0661, "step": 47500 }, { "epoch": 4.838134989310801, "grad_norm": 1.445909023284912, "learning_rate": 7.280048865461621e-07, "loss": 0.0838, "step": 47525 }, { "epoch": 4.840680036648681, "grad_norm": 1.3846561908721924, "learning_rate": 7.166934370970296e-07, "loss": 0.0773, "step": 47550 }, { "epoch": 4.843225083986562, "grad_norm": 4.2333269119262695, "learning_rate": 7.053819876478972e-07, "loss": 0.0868, "step": 47575 }, { "epoch": 4.845770131324443, "grad_norm": 4.511714935302734, "learning_rate": 6.940705381987649e-07, "loss": 0.0762, "step": 47600 }, { "epoch": 4.8483151786623235, "grad_norm": 3.7210798263549805, "learning_rate": 6.827590887496324e-07, "loss": 0.081, "step": 47625 }, { "epoch": 4.850860226000203, "grad_norm": 1.1287949085235596, "learning_rate": 6.714476393004999e-07, "loss": 0.0911, "step": 47650 }, { "epoch": 4.853405273338084, "grad_norm": 1.98590087890625, "learning_rate": 6.601361898513677e-07, "loss": 0.0695, "step": 47675 }, { "epoch": 4.855950320675965, "grad_norm": 1.839471459388733, "learning_rate": 6.488247404022352e-07, "loss": 0.077, "step": 47700 }, { "epoch": 4.858495368013845, "grad_norm": 2.250749111175537, "learning_rate": 6.375132909531027e-07, "loss": 0.0641, "step": 47725 }, { "epoch": 4.8610404153517255, "grad_norm": 0.87042635679245, "learning_rate": 6.262018415039705e-07, "loss": 0.0646, "step": 47750 }, { "epoch": 4.863585462689606, "grad_norm": 1.301599144935608, "learning_rate": 6.14890392054838e-07, "loss": 0.0801, "step": 47775 }, { "epoch": 4.866130510027487, "grad_norm": 2.1233930587768555, "learning_rate": 6.035789426057055e-07, "loss": 0.0714, "step": 47800 }, { "epoch": 4.868675557365367, "grad_norm": 1.3377180099487305, "learning_rate": 5.922674931565731e-07, "loss": 0.0921, "step": 47825 }, { "epoch": 4.871220604703248, "grad_norm": 2.148926258087158, "learning_rate": 5.809560437074408e-07, "loss": 0.0669, "step": 47850 }, { "epoch": 4.873765652041128, "grad_norm": 1.1764349937438965, "learning_rate": 5.696445942583083e-07, "loss": 0.0996, "step": 47875 }, { "epoch": 4.876310699379008, "grad_norm": 5.230283737182617, "learning_rate": 5.583331448091759e-07, "loss": 0.091, "step": 47900 }, { "epoch": 4.878855746716889, "grad_norm": 3.0122835636138916, "learning_rate": 5.470216953600435e-07, "loss": 0.084, "step": 47925 }, { "epoch": 4.88140079405477, "grad_norm": 0.7090607285499573, "learning_rate": 5.357102459109111e-07, "loss": 0.0652, "step": 47950 }, { "epoch": 4.8839458413926495, "grad_norm": 4.014002799987793, "learning_rate": 5.243987964617786e-07, "loss": 0.0904, "step": 47975 }, { "epoch": 4.88649088873053, "grad_norm": 1.4843350648880005, "learning_rate": 5.130873470126463e-07, "loss": 0.06, "step": 48000 }, { "epoch": 4.889035936068411, "grad_norm": 0.7930936813354492, "learning_rate": 5.017758975635138e-07, "loss": 0.0668, "step": 48025 }, { "epoch": 4.891580983406291, "grad_norm": 2.7590086460113525, "learning_rate": 4.904644481143814e-07, "loss": 0.1019, "step": 48050 }, { "epoch": 4.894126030744172, "grad_norm": 2.253613233566284, "learning_rate": 4.79152998665249e-07, "loss": 0.0593, "step": 48075 }, { "epoch": 4.896671078082052, "grad_norm": 3.688272714614868, "learning_rate": 4.678415492161166e-07, "loss": 0.0817, "step": 48100 }, { "epoch": 4.899216125419933, "grad_norm": 3.342242956161499, "learning_rate": 4.5653009976698417e-07, "loss": 0.0633, "step": 48125 }, { "epoch": 4.901761172757813, "grad_norm": 2.113290786743164, "learning_rate": 4.4521865031785175e-07, "loss": 0.0762, "step": 48150 }, { "epoch": 4.904306220095694, "grad_norm": 1.4051839113235474, "learning_rate": 4.339072008687194e-07, "loss": 0.0879, "step": 48175 }, { "epoch": 4.9068512674335745, "grad_norm": 2.42224383354187, "learning_rate": 4.225957514195869e-07, "loss": 0.063, "step": 48200 }, { "epoch": 4.909396314771454, "grad_norm": 0.3117251694202423, "learning_rate": 4.1128430197045454e-07, "loss": 0.0613, "step": 48225 }, { "epoch": 4.911941362109335, "grad_norm": 4.431111812591553, "learning_rate": 3.999728525213221e-07, "loss": 0.0905, "step": 48250 }, { "epoch": 4.914486409447216, "grad_norm": 3.4864866733551025, "learning_rate": 3.886614030721897e-07, "loss": 0.097, "step": 48275 }, { "epoch": 4.917031456785097, "grad_norm": 4.725757598876953, "learning_rate": 3.773499536230573e-07, "loss": 0.0863, "step": 48300 }, { "epoch": 4.919576504122976, "grad_norm": 7.7111496925354, "learning_rate": 3.6603850417392486e-07, "loss": 0.0558, "step": 48325 }, { "epoch": 4.922121551460857, "grad_norm": 3.0076076984405518, "learning_rate": 3.5472705472479244e-07, "loss": 0.0819, "step": 48350 }, { "epoch": 4.924666598798738, "grad_norm": 4.618623733520508, "learning_rate": 3.434156052756601e-07, "loss": 0.0883, "step": 48375 }, { "epoch": 4.927211646136618, "grad_norm": 2.2570934295654297, "learning_rate": 3.321041558265276e-07, "loss": 0.0725, "step": 48400 }, { "epoch": 4.9297566934744985, "grad_norm": 1.5796055793762207, "learning_rate": 3.2079270637739524e-07, "loss": 0.1031, "step": 48425 }, { "epoch": 4.932301740812379, "grad_norm": 7.036586284637451, "learning_rate": 3.094812569282628e-07, "loss": 0.0724, "step": 48450 }, { "epoch": 4.93484678815026, "grad_norm": 4.79809045791626, "learning_rate": 2.981698074791304e-07, "loss": 0.0745, "step": 48475 }, { "epoch": 4.93739183548814, "grad_norm": 0.8551825284957886, "learning_rate": 2.86858358029998e-07, "loss": 0.0655, "step": 48500 }, { "epoch": 4.939936882826021, "grad_norm": 2.2786452770233154, "learning_rate": 2.755469085808656e-07, "loss": 0.0623, "step": 48525 }, { "epoch": 4.942481930163901, "grad_norm": 2.4696829319000244, "learning_rate": 2.642354591317332e-07, "loss": 0.0768, "step": 48550 }, { "epoch": 4.945026977501781, "grad_norm": 2.277616024017334, "learning_rate": 2.529240096826007e-07, "loss": 0.072, "step": 48575 }, { "epoch": 4.947572024839662, "grad_norm": 4.403280735015869, "learning_rate": 2.416125602334683e-07, "loss": 0.0721, "step": 48600 }, { "epoch": 4.950117072177543, "grad_norm": 1.1744745969772339, "learning_rate": 2.3030111078433593e-07, "loss": 0.049, "step": 48625 }, { "epoch": 4.9526621195154235, "grad_norm": 5.233054161071777, "learning_rate": 2.1898966133520351e-07, "loss": 0.0608, "step": 48650 }, { "epoch": 4.955207166853303, "grad_norm": 2.6371543407440186, "learning_rate": 2.076782118860711e-07, "loss": 0.0593, "step": 48675 }, { "epoch": 4.957752214191184, "grad_norm": 3.5058162212371826, "learning_rate": 1.9636676243693867e-07, "loss": 0.0855, "step": 48700 }, { "epoch": 4.960297261529065, "grad_norm": 2.090693235397339, "learning_rate": 1.8505531298780628e-07, "loss": 0.0626, "step": 48725 }, { "epoch": 4.962842308866945, "grad_norm": 0.4760298430919647, "learning_rate": 1.7374386353867386e-07, "loss": 0.0859, "step": 48750 }, { "epoch": 4.965387356204825, "grad_norm": 7.703958988189697, "learning_rate": 1.6243241408954144e-07, "loss": 0.0896, "step": 48775 }, { "epoch": 4.967932403542706, "grad_norm": 0.33076685667037964, "learning_rate": 1.5112096464040902e-07, "loss": 0.0748, "step": 48800 }, { "epoch": 4.970477450880586, "grad_norm": 2.938126564025879, "learning_rate": 1.3980951519127663e-07, "loss": 0.0663, "step": 48825 }, { "epoch": 4.973022498218467, "grad_norm": 2.024932861328125, "learning_rate": 1.284980657421442e-07, "loss": 0.0539, "step": 48850 }, { "epoch": 4.9755675455563475, "grad_norm": 5.843230247497559, "learning_rate": 1.171866162930118e-07, "loss": 0.0824, "step": 48875 }, { "epoch": 4.978112592894227, "grad_norm": 3.76930570602417, "learning_rate": 1.0587516684387938e-07, "loss": 0.0556, "step": 48900 }, { "epoch": 4.980657640232108, "grad_norm": 3.4306960105895996, "learning_rate": 9.456371739474698e-08, "loss": 0.0587, "step": 48925 }, { "epoch": 4.983202687569989, "grad_norm": 3.4686169624328613, "learning_rate": 8.325226794561456e-08, "loss": 0.0681, "step": 48950 }, { "epoch": 4.98574773490787, "grad_norm": 3.3935706615448, "learning_rate": 7.194081849648214e-08, "loss": 0.0651, "step": 48975 }, { "epoch": 4.9882927822457495, "grad_norm": 2.937288999557495, "learning_rate": 6.062936904734973e-08, "loss": 0.0616, "step": 49000 }, { "epoch": 4.99083782958363, "grad_norm": 0.9331411719322205, "learning_rate": 4.931791959821732e-08, "loss": 0.071, "step": 49025 }, { "epoch": 4.993382876921511, "grad_norm": 4.932324409484863, "learning_rate": 3.8006470149084906e-08, "loss": 0.0742, "step": 49050 }, { "epoch": 4.995927924259391, "grad_norm": 1.9142768383026123, "learning_rate": 2.6695020699952493e-08, "loss": 0.0889, "step": 49075 }, { "epoch": 4.9984729715972716, "grad_norm": 3.7317426204681396, "learning_rate": 1.538357125082008e-08, "loss": 0.0777, "step": 49100 }, { "epoch": 5.0, "eval_loss": 0.08259893208742142, "eval_runtime": 7.0041, "eval_samples_per_second": 971.995, "eval_steps_per_second": 15.277, "step": 49115 } ], "logging_steps": 25, "max_steps": 49115, "num_input_tokens_seen": 0, "num_train_epochs": 5, "save_steps": 500, "stateful_callbacks": { "EarlyStoppingCallback": { "args": { "early_stopping_patience": 5, "early_stopping_threshold": 0.01 }, "attributes": { "early_stopping_patience_counter": 0 } }, "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 32, "trial_name": null, "trial_params": null }