{ "best_metric": null, "best_model_checkpoint": null, "epoch": 9.987995198079231, "eval_steps": 500, "global_step": 2080, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.024009603841536616, "grad_norm": 0.48156169056892395, "learning_rate": 4.9999287112445194e-05, "loss": 1.2537, "step": 5 }, { "epoch": 0.04801920768307323, "grad_norm": 0.48818066716194153, "learning_rate": 4.999714849043745e-05, "loss": 1.1695, "step": 10 }, { "epoch": 0.07202881152460984, "grad_norm": 0.42254194617271423, "learning_rate": 4.999358425594454e-05, "loss": 1.1879, "step": 15 }, { "epoch": 0.09603841536614646, "grad_norm": 0.4787527024745941, "learning_rate": 4.9988594612238336e-05, "loss": 1.0137, "step": 20 }, { "epoch": 0.12004801920768307, "grad_norm": 0.5006477236747742, "learning_rate": 4.9982179843883225e-05, "loss": 1.1109, "step": 25 }, { "epoch": 0.14405762304921968, "grad_norm": 0.4857625961303711, "learning_rate": 4.9974340316719906e-05, "loss": 1.04, "step": 30 }, { "epoch": 0.16806722689075632, "grad_norm": 0.5294004678726196, "learning_rate": 4.996507647784446e-05, "loss": 1.0158, "step": 35 }, { "epoch": 0.19207683073229292, "grad_norm": 0.5924685597419739, "learning_rate": 4.995438885558294e-05, "loss": 0.9004, "step": 40 }, { "epoch": 0.21608643457382953, "grad_norm": 0.5567044615745544, "learning_rate": 4.9942278059461175e-05, "loss": 0.9626, "step": 45 }, { "epoch": 0.24009603841536614, "grad_norm": 0.5250746607780457, "learning_rate": 4.992874478017003e-05, "loss": 1.0787, "step": 50 }, { "epoch": 0.26410564225690275, "grad_norm": 0.5340147018432617, "learning_rate": 4.9913789789526023e-05, "loss": 0.8533, "step": 55 }, { "epoch": 0.28811524609843936, "grad_norm": 0.5907275080680847, "learning_rate": 4.989741394042727e-05, "loss": 0.9406, "step": 60 }, { "epoch": 0.31212484993997597, "grad_norm": 0.6365830898284912, "learning_rate": 4.987961816680492e-05, "loss": 1.0017, "step": 65 }, { "epoch": 0.33613445378151263, "grad_norm": 0.6287215948104858, "learning_rate": 4.9860403483569805e-05, "loss": 0.8902, "step": 70 }, { "epoch": 0.36014405762304924, "grad_norm": 0.6473432183265686, "learning_rate": 4.983977098655461e-05, "loss": 0.907, "step": 75 }, { "epoch": 0.38415366146458585, "grad_norm": 0.663551926612854, "learning_rate": 4.981772185245135e-05, "loss": 1.0197, "step": 80 }, { "epoch": 0.40816326530612246, "grad_norm": 0.5929960012435913, "learning_rate": 4.979425733874431e-05, "loss": 0.8131, "step": 85 }, { "epoch": 0.43217286914765907, "grad_norm": 0.6948341131210327, "learning_rate": 4.9769378783638255e-05, "loss": 0.8665, "step": 90 }, { "epoch": 0.4561824729891957, "grad_norm": 0.8430640697479248, "learning_rate": 4.974308760598218e-05, "loss": 0.9219, "step": 95 }, { "epoch": 0.4801920768307323, "grad_norm": 0.7752465009689331, "learning_rate": 4.971538530518836e-05, "loss": 0.9425, "step": 100 }, { "epoch": 0.5042016806722689, "grad_norm": 0.7160285711288452, "learning_rate": 4.968627346114681e-05, "loss": 0.8781, "step": 105 }, { "epoch": 0.5282112845138055, "grad_norm": 0.7385575175285339, "learning_rate": 4.965575373413527e-05, "loss": 0.8446, "step": 110 }, { "epoch": 0.5522208883553421, "grad_norm": 0.8385459184646606, "learning_rate": 4.9623827864724394e-05, "loss": 0.9453, "step": 115 }, { "epoch": 0.5762304921968787, "grad_norm": 0.8416388630867004, "learning_rate": 4.959049767367859e-05, "loss": 0.8086, "step": 120 }, { "epoch": 0.6002400960384153, "grad_norm": 0.7977305054664612, 
"learning_rate": 4.955576506185213e-05, "loss": 0.8657, "step": 125 }, { "epoch": 0.6242496998799519, "grad_norm": 0.8073853850364685, "learning_rate": 4.951963201008076e-05, "loss": 0.9168, "step": 130 }, { "epoch": 0.6482593037214885, "grad_norm": 0.825240433216095, "learning_rate": 4.9482100579068706e-05, "loss": 0.9192, "step": 135 }, { "epoch": 0.6722689075630253, "grad_norm": 0.7907525300979614, "learning_rate": 4.944317290927117e-05, "loss": 0.7927, "step": 140 }, { "epoch": 0.6962785114045619, "grad_norm": 0.7800418138504028, "learning_rate": 4.9402851220772274e-05, "loss": 0.8486, "step": 145 }, { "epoch": 0.7202881152460985, "grad_norm": 0.8340911269187927, "learning_rate": 4.93611378131584e-05, "loss": 0.7937, "step": 150 }, { "epoch": 0.7442977190876351, "grad_norm": 0.9103155732154846, "learning_rate": 4.931803506538707e-05, "loss": 0.834, "step": 155 }, { "epoch": 0.7683073229291717, "grad_norm": 0.8660081028938293, "learning_rate": 4.92735454356513e-05, "loss": 0.7799, "step": 160 }, { "epoch": 0.7923169267707083, "grad_norm": 0.7390795350074768, "learning_rate": 4.9227671461239354e-05, "loss": 0.8608, "step": 165 }, { "epoch": 0.8163265306122449, "grad_norm": 0.8534154891967773, "learning_rate": 4.918041575839007e-05, "loss": 0.8872, "step": 170 }, { "epoch": 0.8403361344537815, "grad_norm": 1.0927950143814087, "learning_rate": 4.913178102214363e-05, "loss": 0.8681, "step": 175 }, { "epoch": 0.8643457382953181, "grad_norm": 0.8660680055618286, "learning_rate": 4.9081770026187914e-05, "loss": 0.8451, "step": 180 }, { "epoch": 0.8883553421368547, "grad_norm": 0.7952775955200195, "learning_rate": 4.9030385622700225e-05, "loss": 0.8662, "step": 185 }, { "epoch": 0.9123649459783914, "grad_norm": 0.8808633685112, "learning_rate": 4.897763074218472e-05, "loss": 0.8183, "step": 190 }, { "epoch": 0.936374549819928, "grad_norm": 0.7455188632011414, "learning_rate": 4.892350839330522e-05, "loss": 0.8887, "step": 195 }, { "epoch": 0.9603841536614646, "grad_norm": 0.7687739729881287, "learning_rate": 4.886802166271364e-05, "loss": 0.8505, "step": 200 }, { "epoch": 0.9843937575030012, "grad_norm": 0.7754830121994019, "learning_rate": 4.881117371487396e-05, "loss": 0.835, "step": 205 }, { "epoch": 1.0084033613445378, "grad_norm": 0.7828541398048401, "learning_rate": 4.875296779188173e-05, "loss": 0.8531, "step": 210 }, { "epoch": 1.0324129651860745, "grad_norm": 1.015205979347229, "learning_rate": 4.8693407213279206e-05, "loss": 0.8679, "step": 215 }, { "epoch": 1.056422569027611, "grad_norm": 0.8347631692886353, "learning_rate": 4.8632495375866004e-05, "loss": 0.8432, "step": 220 }, { "epoch": 1.0804321728691477, "grad_norm": 0.9165553450584412, "learning_rate": 4.8570235753505406e-05, "loss": 0.8362, "step": 225 }, { "epoch": 1.1044417767106842, "grad_norm": 0.954313337802887, "learning_rate": 4.850663189692619e-05, "loss": 0.7704, "step": 230 }, { "epoch": 1.128451380552221, "grad_norm": 1.096909761428833, "learning_rate": 4.844168743352019e-05, "loss": 0.8357, "step": 235 }, { "epoch": 1.1524609843937574, "grad_norm": 0.9319111108779907, "learning_rate": 4.837540606713538e-05, "loss": 0.7461, "step": 240 }, { "epoch": 1.1764705882352942, "grad_norm": 1.0731585025787354, "learning_rate": 4.830779157786465e-05, "loss": 0.8821, "step": 245 }, { "epoch": 1.2004801920768307, "grad_norm": 0.8962486982345581, "learning_rate": 4.823884782183023e-05, "loss": 0.889, "step": 250 }, { "epoch": 1.2244897959183674, "grad_norm": 0.8899649381637573, "learning_rate": 4.8168578730963804e-05, "loss": 
0.7407, "step": 255 }, { "epoch": 1.2484993997599039, "grad_norm": 0.9980818033218384, "learning_rate": 4.8096988312782174e-05, "loss": 0.7822, "step": 260 }, { "epoch": 1.2725090036014406, "grad_norm": 0.9607000350952148, "learning_rate": 4.80240806501588e-05, "loss": 0.9008, "step": 265 }, { "epoch": 1.296518607442977, "grad_norm": 1.0252315998077393, "learning_rate": 4.7949859901090896e-05, "loss": 0.7477, "step": 270 }, { "epoch": 1.3205282112845138, "grad_norm": 0.8092382550239563, "learning_rate": 4.787433029846236e-05, "loss": 0.7846, "step": 275 }, { "epoch": 1.3445378151260505, "grad_norm": 0.9593423008918762, "learning_rate": 4.7797496149802256e-05, "loss": 0.8544, "step": 280 }, { "epoch": 1.368547418967587, "grad_norm": 0.9516417980194092, "learning_rate": 4.771936183703927e-05, "loss": 0.7706, "step": 285 }, { "epoch": 1.3925570228091235, "grad_norm": 1.2291992902755737, "learning_rate": 4.763993181625174e-05, "loss": 0.8256, "step": 290 }, { "epoch": 1.4165666266506602, "grad_norm": 0.9861960411071777, "learning_rate": 4.7559210617413514e-05, "loss": 0.8185, "step": 295 }, { "epoch": 1.440576230492197, "grad_norm": 0.8679934144020081, "learning_rate": 4.7477202844135646e-05, "loss": 0.7356, "step": 300 }, { "epoch": 1.4645858343337335, "grad_norm": 0.754160463809967, "learning_rate": 4.739391317340383e-05, "loss": 0.8015, "step": 305 }, { "epoch": 1.4885954381752702, "grad_norm": 1.0975936651229858, "learning_rate": 4.730934635531161e-05, "loss": 0.8103, "step": 310 }, { "epoch": 1.5126050420168067, "grad_norm": 0.8794222474098206, "learning_rate": 4.722350721278958e-05, "loss": 0.6949, "step": 315 }, { "epoch": 1.5366146458583434, "grad_norm": 1.009118676185608, "learning_rate": 4.713640064133025e-05, "loss": 0.779, "step": 320 }, { "epoch": 1.5606242496998801, "grad_norm": 1.1100735664367676, "learning_rate": 4.7048031608708876e-05, "loss": 0.8016, "step": 325 }, { "epoch": 1.5846338535414166, "grad_norm": 0.8312719464302063, "learning_rate": 4.6958405154700154e-05, "loss": 0.746, "step": 330 }, { "epoch": 1.6086434573829531, "grad_norm": 0.9220231175422668, "learning_rate": 4.686752639079076e-05, "loss": 0.7439, "step": 335 }, { "epoch": 1.6326530612244898, "grad_norm": 1.1262476444244385, "learning_rate": 4.677540049988789e-05, "loss": 0.7969, "step": 340 }, { "epoch": 1.6566626650660266, "grad_norm": 1.1132248640060425, "learning_rate": 4.668203273602363e-05, "loss": 0.858, "step": 345 }, { "epoch": 1.680672268907563, "grad_norm": 1.0028260946273804, "learning_rate": 4.6587428424055326e-05, "loss": 0.8538, "step": 350 }, { "epoch": 1.7046818727490995, "grad_norm": 1.057077407836914, "learning_rate": 4.649159295936191e-05, "loss": 0.8241, "step": 355 }, { "epoch": 1.7286914765906363, "grad_norm": 0.8777754902839661, "learning_rate": 4.639453180753619e-05, "loss": 0.7388, "step": 360 }, { "epoch": 1.752701080432173, "grad_norm": 0.9972108006477356, "learning_rate": 4.6296250504073145e-05, "loss": 0.7924, "step": 365 }, { "epoch": 1.7767106842737095, "grad_norm": 1.0971965789794922, "learning_rate": 4.6196754654054216e-05, "loss": 0.7719, "step": 370 }, { "epoch": 1.800720288115246, "grad_norm": 1.1859172582626343, "learning_rate": 4.609604993182767e-05, "loss": 0.8365, "step": 375 }, { "epoch": 1.8247298919567827, "grad_norm": 0.9028448462486267, "learning_rate": 4.599414208068495e-05, "loss": 0.8118, "step": 380 }, { "epoch": 1.8487394957983194, "grad_norm": 0.8329624533653259, "learning_rate": 4.589103691253317e-05, "loss": 0.7881, "step": 385 }, { "epoch": 
1.872749099639856, "grad_norm": 0.985603928565979, "learning_rate": 4.5786740307563636e-05, "loss": 0.7155, "step": 390 }, { "epoch": 1.8967587034813924, "grad_norm": 0.9097521901130676, "learning_rate": 4.568125821391647e-05, "loss": 0.8555, "step": 395 }, { "epoch": 1.9207683073229291, "grad_norm": 1.0277526378631592, "learning_rate": 4.557459664734141e-05, "loss": 0.8685, "step": 400 }, { "epoch": 1.9447779111644659, "grad_norm": 0.8617818355560303, "learning_rate": 4.5466761690854746e-05, "loss": 0.7708, "step": 405 }, { "epoch": 1.9687875150060024, "grad_norm": 0.8086485266685486, "learning_rate": 4.535775949439235e-05, "loss": 0.8448, "step": 410 }, { "epoch": 1.9927971188475389, "grad_norm": 1.0069591999053955, "learning_rate": 4.5247596274458956e-05, "loss": 0.736, "step": 415 }, { "epoch": 2.0168067226890756, "grad_norm": 0.7122059464454651, "learning_rate": 4.513627831377365e-05, "loss": 0.7223, "step": 420 }, { "epoch": 2.0408163265306123, "grad_norm": 0.9053510427474976, "learning_rate": 4.502381196091154e-05, "loss": 0.7504, "step": 425 }, { "epoch": 2.064825930372149, "grad_norm": 1.0357030630111694, "learning_rate": 4.491020362994168e-05, "loss": 0.8011, "step": 430 }, { "epoch": 2.0888355342136853, "grad_norm": 0.9885731339454651, "learning_rate": 4.47954598000613e-05, "loss": 0.8665, "step": 435 }, { "epoch": 2.112845138055222, "grad_norm": 0.9989228844642639, "learning_rate": 4.4679587015226253e-05, "loss": 0.749, "step": 440 }, { "epoch": 2.1368547418967587, "grad_norm": 0.8704085350036621, "learning_rate": 4.456259188377786e-05, "loss": 0.7962, "step": 445 }, { "epoch": 2.1608643457382954, "grad_norm": 0.9389162063598633, "learning_rate": 4.444448107806596e-05, "loss": 0.7964, "step": 450 }, { "epoch": 2.184873949579832, "grad_norm": 0.9259791970252991, "learning_rate": 4.4325261334068426e-05, "loss": 0.7516, "step": 455 }, { "epoch": 2.2088835534213684, "grad_norm": 1.4041215181350708, "learning_rate": 4.420493945100702e-05, "loss": 0.8173, "step": 460 }, { "epoch": 2.232893157262905, "grad_norm": 0.9259509444236755, "learning_rate": 4.4083522290959564e-05, "loss": 0.8038, "step": 465 }, { "epoch": 2.256902761104442, "grad_norm": 1.0205409526824951, "learning_rate": 4.396101677846866e-05, "loss": 0.8055, "step": 470 }, { "epoch": 2.280912364945978, "grad_norm": 0.9302276372909546, "learning_rate": 4.383742990014671e-05, "loss": 0.7459, "step": 475 }, { "epoch": 2.304921968787515, "grad_norm": 0.9929239749908447, "learning_rate": 4.371276870427753e-05, "loss": 0.7959, "step": 480 }, { "epoch": 2.3289315726290516, "grad_norm": 1.0315582752227783, "learning_rate": 4.358704030041432e-05, "loss": 0.8528, "step": 485 }, { "epoch": 2.3529411764705883, "grad_norm": 0.9557288289070129, "learning_rate": 4.346025185897424e-05, "loss": 0.759, "step": 490 }, { "epoch": 2.376950780312125, "grad_norm": 1.080580472946167, "learning_rate": 4.333241061082944e-05, "loss": 0.7499, "step": 495 }, { "epoch": 2.4009603841536613, "grad_norm": 0.96761155128479, "learning_rate": 4.320352384689471e-05, "loss": 0.7279, "step": 500 }, { "epoch": 2.424969987995198, "grad_norm": 1.0945556163787842, "learning_rate": 4.307359891771165e-05, "loss": 0.7453, "step": 505 }, { "epoch": 2.4489795918367347, "grad_norm": 0.9983144998550415, "learning_rate": 4.294264323302946e-05, "loss": 0.8311, "step": 510 }, { "epoch": 2.4729891956782715, "grad_norm": 0.8600877523422241, "learning_rate": 4.2810664261382375e-05, "loss": 0.7172, "step": 515 }, { "epoch": 2.4969987995198077, "grad_norm": 1.0828715562820435, 
"learning_rate": 4.267766952966369e-05, "loss": 0.8084, "step": 520 }, { "epoch": 2.5210084033613445, "grad_norm": 0.9153307676315308, "learning_rate": 4.254366662269655e-05, "loss": 0.702, "step": 525 }, { "epoch": 2.545018007202881, "grad_norm": 0.8633860349655151, "learning_rate": 4.240866318280132e-05, "loss": 0.7261, "step": 530 }, { "epoch": 2.569027611044418, "grad_norm": 0.9197015762329102, "learning_rate": 4.227266690935978e-05, "loss": 0.754, "step": 535 }, { "epoch": 2.593037214885954, "grad_norm": 0.9729764461517334, "learning_rate": 4.2135685558375994e-05, "loss": 0.7855, "step": 540 }, { "epoch": 2.617046818727491, "grad_norm": 0.981951117515564, "learning_rate": 4.199772694203399e-05, "loss": 0.7584, "step": 545 }, { "epoch": 2.6410564225690276, "grad_norm": 0.8585065603256226, "learning_rate": 4.185879892825222e-05, "loss": 0.8196, "step": 550 }, { "epoch": 2.6650660264105643, "grad_norm": 0.8955056071281433, "learning_rate": 4.1718909440234853e-05, "loss": 0.6836, "step": 555 }, { "epoch": 2.689075630252101, "grad_norm": 0.9751344919204712, "learning_rate": 4.157806645601988e-05, "loss": 0.7656, "step": 560 }, { "epoch": 2.7130852340936373, "grad_norm": 0.9694618582725525, "learning_rate": 4.143627800802417e-05, "loss": 0.7787, "step": 565 }, { "epoch": 2.737094837935174, "grad_norm": 1.0527385473251343, "learning_rate": 4.1293552182585307e-05, "loss": 0.7665, "step": 570 }, { "epoch": 2.7611044417767108, "grad_norm": 0.9396669864654541, "learning_rate": 4.114989711950047e-05, "loss": 0.748, "step": 575 }, { "epoch": 2.785114045618247, "grad_norm": 1.0898234844207764, "learning_rate": 4.1005321011562206e-05, "loss": 0.7761, "step": 580 }, { "epoch": 2.8091236494597838, "grad_norm": 1.0972914695739746, "learning_rate": 4.085983210409114e-05, "loss": 0.7873, "step": 585 }, { "epoch": 2.8331332533013205, "grad_norm": 1.1150754690170288, "learning_rate": 4.0713438694465806e-05, "loss": 0.8288, "step": 590 }, { "epoch": 2.857142857142857, "grad_norm": 0.9612060785293579, "learning_rate": 4.056614913164938e-05, "loss": 0.7587, "step": 595 }, { "epoch": 2.881152460984394, "grad_norm": 0.8540961146354675, "learning_rate": 4.0417971815713584e-05, "loss": 0.7096, "step": 600 }, { "epoch": 2.90516206482593, "grad_norm": 1.1552797555923462, "learning_rate": 4.026891519735955e-05, "loss": 0.7177, "step": 605 }, { "epoch": 2.929171668667467, "grad_norm": 0.8366991281509399, "learning_rate": 4.011898777743594e-05, "loss": 0.7244, "step": 610 }, { "epoch": 2.9531812725090036, "grad_norm": 1.1516789197921753, "learning_rate": 3.99681981064541e-05, "loss": 0.7627, "step": 615 }, { "epoch": 2.9771908763505404, "grad_norm": 1.119287133216858, "learning_rate": 3.981655478410043e-05, "loss": 0.7562, "step": 620 }, { "epoch": 3.0012004801920766, "grad_norm": 0.9897541403770447, "learning_rate": 3.966406645874589e-05, "loss": 0.8271, "step": 625 }, { "epoch": 3.0252100840336134, "grad_norm": 0.979117214679718, "learning_rate": 3.951074182695284e-05, "loss": 0.7639, "step": 630 }, { "epoch": 3.04921968787515, "grad_norm": 1.234833836555481, "learning_rate": 3.935658963297902e-05, "loss": 0.7117, "step": 635 }, { "epoch": 3.073229291716687, "grad_norm": 1.0178768634796143, "learning_rate": 3.920161866827889e-05, "loss": 0.7358, "step": 640 }, { "epoch": 3.097238895558223, "grad_norm": 0.9386712312698364, "learning_rate": 3.904583777100223e-05, "loss": 0.7409, "step": 645 }, { "epoch": 3.12124849939976, "grad_norm": 1.140830397605896, "learning_rate": 3.888925582549006e-05, "loss": 0.7052, "step": 
650 }, { "epoch": 3.1452581032412965, "grad_norm": 0.9676099419593811, "learning_rate": 3.8731881761768e-05, "loss": 0.7602, "step": 655 }, { "epoch": 3.1692677070828332, "grad_norm": 1.1593329906463623, "learning_rate": 3.857372455503697e-05, "loss": 0.8514, "step": 660 }, { "epoch": 3.19327731092437, "grad_norm": 1.1297277212142944, "learning_rate": 3.8414793225161325e-05, "loss": 0.7699, "step": 665 }, { "epoch": 3.2172869147659062, "grad_norm": 0.9413173794746399, "learning_rate": 3.825509683615442e-05, "loss": 0.8067, "step": 670 }, { "epoch": 3.241296518607443, "grad_norm": 0.9704225063323975, "learning_rate": 3.809464449566175e-05, "loss": 0.6783, "step": 675 }, { "epoch": 3.2653061224489797, "grad_norm": 0.9347226023674011, "learning_rate": 3.793344535444142e-05, "loss": 0.7315, "step": 680 }, { "epoch": 3.2893157262905164, "grad_norm": 0.9760938882827759, "learning_rate": 3.777150860584237e-05, "loss": 0.6999, "step": 685 }, { "epoch": 3.3133253301320527, "grad_norm": 1.114785075187683, "learning_rate": 3.760884348528002e-05, "loss": 0.7182, "step": 690 }, { "epoch": 3.3373349339735894, "grad_norm": 1.0659857988357544, "learning_rate": 3.744545926970957e-05, "loss": 0.7577, "step": 695 }, { "epoch": 3.361344537815126, "grad_norm": 1.0988335609436035, "learning_rate": 3.728136527709694e-05, "loss": 0.7825, "step": 700 }, { "epoch": 3.385354141656663, "grad_norm": 0.9283490180969238, "learning_rate": 3.711657086588733e-05, "loss": 0.7414, "step": 705 }, { "epoch": 3.409363745498199, "grad_norm": 1.1617555618286133, "learning_rate": 3.695108543447154e-05, "loss": 0.7144, "step": 710 }, { "epoch": 3.433373349339736, "grad_norm": 1.1011674404144287, "learning_rate": 3.678491842064995e-05, "loss": 0.7516, "step": 715 }, { "epoch": 3.4573829531812725, "grad_norm": 0.9130650758743286, "learning_rate": 3.6618079301094216e-05, "loss": 0.7484, "step": 720 }, { "epoch": 3.4813925570228093, "grad_norm": 1.1056820154190063, "learning_rate": 3.645057759080692e-05, "loss": 0.7775, "step": 725 }, { "epoch": 3.505402160864346, "grad_norm": 1.219381332397461, "learning_rate": 3.6282422842578845e-05, "loss": 0.7086, "step": 730 }, { "epoch": 3.5294117647058822, "grad_norm": 1.2623860836029053, "learning_rate": 3.611362464644415e-05, "loss": 0.7165, "step": 735 }, { "epoch": 3.553421368547419, "grad_norm": 1.0355645418167114, "learning_rate": 3.594419262913351e-05, "loss": 0.7668, "step": 740 }, { "epoch": 3.5774309723889557, "grad_norm": 0.9813769459724426, "learning_rate": 3.577413645352506e-05, "loss": 0.6595, "step": 745 }, { "epoch": 3.601440576230492, "grad_norm": 1.0704172849655151, "learning_rate": 3.560346581809328e-05, "loss": 0.7469, "step": 750 }, { "epoch": 3.6254501800720287, "grad_norm": 1.0033190250396729, "learning_rate": 3.543219045635593e-05, "loss": 0.7063, "step": 755 }, { "epoch": 3.6494597839135654, "grad_norm": 0.9432305097579956, "learning_rate": 3.526032013631893e-05, "loss": 0.6694, "step": 760 }, { "epoch": 3.673469387755102, "grad_norm": 1.0187017917633057, "learning_rate": 3.508786465991923e-05, "loss": 0.7789, "step": 765 }, { "epoch": 3.697478991596639, "grad_norm": 1.076158881187439, "learning_rate": 3.491483386246588e-05, "loss": 0.7934, "step": 770 }, { "epoch": 3.721488595438175, "grad_norm": 0.8803985714912415, "learning_rate": 3.474123761207905e-05, "loss": 0.7295, "step": 775 }, { "epoch": 3.745498199279712, "grad_norm": 0.9059305787086487, "learning_rate": 3.456708580912725e-05, "loss": 0.7474, "step": 780 }, { "epoch": 3.7695078031212486, "grad_norm": 
1.1186120510101318, "learning_rate": 3.4392388385662714e-05, "loss": 0.7711, "step": 785 }, { "epoch": 3.7935174069627853, "grad_norm": 1.0779385566711426, "learning_rate": 3.4217155304854976e-05, "loss": 0.7363, "step": 790 }, { "epoch": 3.817527010804322, "grad_norm": 1.0094141960144043, "learning_rate": 3.4041396560422624e-05, "loss": 0.8531, "step": 795 }, { "epoch": 3.8415366146458583, "grad_norm": 0.9442546963691711, "learning_rate": 3.386512217606339e-05, "loss": 0.8025, "step": 800 }, { "epoch": 3.865546218487395, "grad_norm": 1.0685694217681885, "learning_rate": 3.3688342204882466e-05, "loss": 0.6634, "step": 805 }, { "epoch": 3.8895558223289317, "grad_norm": 0.9987934827804565, "learning_rate": 3.351106672881915e-05, "loss": 0.7664, "step": 810 }, { "epoch": 3.913565426170468, "grad_norm": 1.0503917932510376, "learning_rate": 3.33333058580719e-05, "loss": 0.865, "step": 815 }, { "epoch": 3.9375750300120047, "grad_norm": 0.9366332292556763, "learning_rate": 3.3155069730521735e-05, "loss": 0.7096, "step": 820 }, { "epoch": 3.9615846338535414, "grad_norm": 1.173298954963684, "learning_rate": 3.2976368511153996e-05, "loss": 0.8476, "step": 825 }, { "epoch": 3.985594237695078, "grad_norm": 1.0985863208770752, "learning_rate": 3.2797212391478724e-05, "loss": 0.705, "step": 830 }, { "epoch": 4.009603841536615, "grad_norm": 1.124514102935791, "learning_rate": 3.261761158894937e-05, "loss": 0.7086, "step": 835 }, { "epoch": 4.033613445378151, "grad_norm": 0.9647889733314514, "learning_rate": 3.243757634638008e-05, "loss": 0.6899, "step": 840 }, { "epoch": 4.057623049219688, "grad_norm": 0.9771691560745239, "learning_rate": 3.225711693136156e-05, "loss": 0.8088, "step": 845 }, { "epoch": 4.081632653061225, "grad_norm": 1.1176905632019043, "learning_rate": 3.2076243635675513e-05, "loss": 0.7096, "step": 850 }, { "epoch": 4.105642256902761, "grad_norm": 1.0890402793884277, "learning_rate": 3.189496677470765e-05, "loss": 0.7646, "step": 855 }, { "epoch": 4.129651860744298, "grad_norm": 1.099631905555725, "learning_rate": 3.1713296686859426e-05, "loss": 0.7525, "step": 860 }, { "epoch": 4.153661464585834, "grad_norm": 0.995556652545929, "learning_rate": 3.153124373295841e-05, "loss": 0.7231, "step": 865 }, { "epoch": 4.177671068427371, "grad_norm": 1.1333503723144531, "learning_rate": 3.1348818295667424e-05, "loss": 0.7431, "step": 870 }, { "epoch": 4.201680672268908, "grad_norm": 1.0261584520339966, "learning_rate": 3.116603077889238e-05, "loss": 0.7619, "step": 875 }, { "epoch": 4.225690276110444, "grad_norm": 0.9810305833816528, "learning_rate": 3.098289160718895e-05, "loss": 0.7806, "step": 880 }, { "epoch": 4.249699879951981, "grad_norm": 1.160054087638855, "learning_rate": 3.079941122516803e-05, "loss": 0.701, "step": 885 }, { "epoch": 4.2737094837935174, "grad_norm": 0.9885622262954712, "learning_rate": 3.061560009690011e-05, "loss": 0.7114, "step": 890 }, { "epoch": 4.297719087635054, "grad_norm": 1.092362880706787, "learning_rate": 3.0431468705318424e-05, "loss": 0.7407, "step": 895 }, { "epoch": 4.321728691476591, "grad_norm": 1.3226685523986816, "learning_rate": 3.024702755162119e-05, "loss": 0.756, "step": 900 }, { "epoch": 4.345738295318127, "grad_norm": 1.164993166923523, "learning_rate": 3.0062287154672658e-05, "loss": 0.6897, "step": 905 }, { "epoch": 4.369747899159664, "grad_norm": 1.1299794912338257, "learning_rate": 2.9877258050403212e-05, "loss": 0.7212, "step": 910 }, { "epoch": 4.393757503001201, "grad_norm": 1.1793334484100342, "learning_rate": 2.9691950791208502e-05, 
"loss": 0.8578, "step": 915 }, { "epoch": 4.417767106842737, "grad_norm": 1.2607135772705078, "learning_rate": 2.950637594534765e-05, "loss": 0.7116, "step": 920 }, { "epoch": 4.441776710684274, "grad_norm": 1.309200644493103, "learning_rate": 2.9320544096340493e-05, "loss": 0.7693, "step": 925 }, { "epoch": 4.46578631452581, "grad_norm": 0.9688151478767395, "learning_rate": 2.9134465842364035e-05, "loss": 0.6641, "step": 930 }, { "epoch": 4.489795918367347, "grad_norm": 1.155928373336792, "learning_rate": 2.8948151795647993e-05, "loss": 0.7488, "step": 935 }, { "epoch": 4.513805522208884, "grad_norm": 1.2800368070602417, "learning_rate": 2.876161258186958e-05, "loss": 0.6626, "step": 940 }, { "epoch": 4.53781512605042, "grad_norm": 1.2294648885726929, "learning_rate": 2.8574858839547512e-05, "loss": 0.7064, "step": 945 }, { "epoch": 4.561824729891956, "grad_norm": 1.1497923135757446, "learning_rate": 2.83879012194353e-05, "loss": 0.703, "step": 950 }, { "epoch": 4.5858343337334935, "grad_norm": 1.0689631700515747, "learning_rate": 2.8200750383913776e-05, "loss": 0.8085, "step": 955 }, { "epoch": 4.60984393757503, "grad_norm": 1.038040041923523, "learning_rate": 2.8013417006383076e-05, "loss": 0.7251, "step": 960 }, { "epoch": 4.633853541416567, "grad_norm": 0.9959046244621277, "learning_rate": 2.782591177065388e-05, "loss": 0.7358, "step": 965 }, { "epoch": 4.657863145258103, "grad_norm": 1.1039036512374878, "learning_rate": 2.763824537033809e-05, "loss": 0.7625, "step": 970 }, { "epoch": 4.6818727490996395, "grad_norm": 0.98404860496521, "learning_rate": 2.7450428508239024e-05, "loss": 0.8123, "step": 975 }, { "epoch": 4.705882352941177, "grad_norm": 1.1559010744094849, "learning_rate": 2.726247189574095e-05, "loss": 0.7068, "step": 980 }, { "epoch": 4.729891956782713, "grad_norm": 1.0589702129364014, "learning_rate": 2.707438625219827e-05, "loss": 0.7431, "step": 985 }, { "epoch": 4.75390156062425, "grad_norm": 1.207343339920044, "learning_rate": 2.6886182304324153e-05, "loss": 0.7161, "step": 990 }, { "epoch": 4.777911164465786, "grad_norm": 1.023870825767517, "learning_rate": 2.669787078557876e-05, "loss": 0.7311, "step": 995 }, { "epoch": 4.801920768307323, "grad_norm": 0.9353678226470947, "learning_rate": 2.6509462435557152e-05, "loss": 0.6689, "step": 1000 }, { "epoch": 4.82593037214886, "grad_norm": 0.8823081851005554, "learning_rate": 2.6320967999376767e-05, "loss": 0.6888, "step": 1005 }, { "epoch": 4.849939975990396, "grad_norm": 1.1103065013885498, "learning_rate": 2.6132398227064615e-05, "loss": 0.7879, "step": 1010 }, { "epoch": 4.873949579831933, "grad_norm": 1.061865210533142, "learning_rate": 2.5943763872944206e-05, "loss": 0.6958, "step": 1015 }, { "epoch": 4.8979591836734695, "grad_norm": 1.1248444318771362, "learning_rate": 2.5755075695022224e-05, "loss": 0.6858, "step": 1020 }, { "epoch": 4.921968787515006, "grad_norm": 1.1603715419769287, "learning_rate": 2.5566344454374968e-05, "loss": 0.6799, "step": 1025 }, { "epoch": 4.945978391356543, "grad_norm": 1.1303590536117554, "learning_rate": 2.5377580914534647e-05, "loss": 0.767, "step": 1030 }, { "epoch": 4.969987995198079, "grad_norm": 1.2461163997650146, "learning_rate": 2.5188795840875544e-05, "loss": 0.7453, "step": 1035 }, { "epoch": 4.9939975990396155, "grad_norm": 1.0747658014297485, "learning_rate": 2.5e-05, "loss": 0.7559, "step": 1040 }, { "epoch": 5.018007202881153, "grad_norm": 1.0154122114181519, "learning_rate": 2.481120415912446e-05, "loss": 0.7729, "step": 1045 }, { "epoch": 5.042016806722689, 
"grad_norm": 1.195665717124939, "learning_rate": 2.4622419085465355e-05, "loss": 0.6956, "step": 1050 }, { "epoch": 5.066026410564226, "grad_norm": 1.1738536357879639, "learning_rate": 2.4433655545625038e-05, "loss": 0.7124, "step": 1055 }, { "epoch": 5.090036014405762, "grad_norm": 1.158319354057312, "learning_rate": 2.4244924304977785e-05, "loss": 0.7472, "step": 1060 }, { "epoch": 5.114045618247299, "grad_norm": 1.1242644786834717, "learning_rate": 2.40562361270558e-05, "loss": 0.6805, "step": 1065 }, { "epoch": 5.138055222088836, "grad_norm": 1.0289393663406372, "learning_rate": 2.3867601772935397e-05, "loss": 0.6898, "step": 1070 }, { "epoch": 5.162064825930372, "grad_norm": 1.0357887744903564, "learning_rate": 2.367903200062324e-05, "loss": 0.6664, "step": 1075 }, { "epoch": 5.186074429771908, "grad_norm": 1.1436580419540405, "learning_rate": 2.3490537564442847e-05, "loss": 0.7125, "step": 1080 }, { "epoch": 5.2100840336134455, "grad_norm": 1.1259691715240479, "learning_rate": 2.3302129214421242e-05, "loss": 0.7797, "step": 1085 }, { "epoch": 5.234093637454982, "grad_norm": 1.1516937017440796, "learning_rate": 2.3113817695675853e-05, "loss": 0.7587, "step": 1090 }, { "epoch": 5.258103241296519, "grad_norm": 1.1358476877212524, "learning_rate": 2.292561374780173e-05, "loss": 0.7414, "step": 1095 }, { "epoch": 5.282112845138055, "grad_norm": 1.2753323316574097, "learning_rate": 2.2737528104259056e-05, "loss": 0.6972, "step": 1100 }, { "epoch": 5.3061224489795915, "grad_norm": 1.2500908374786377, "learning_rate": 2.2549571491760986e-05, "loss": 0.7168, "step": 1105 }, { "epoch": 5.330132052821129, "grad_norm": 1.157196283340454, "learning_rate": 2.236175462966192e-05, "loss": 0.6441, "step": 1110 }, { "epoch": 5.354141656662665, "grad_norm": 1.1226704120635986, "learning_rate": 2.217408822934613e-05, "loss": 0.7473, "step": 1115 }, { "epoch": 5.378151260504202, "grad_norm": 1.0768887996673584, "learning_rate": 2.1986582993616926e-05, "loss": 0.6849, "step": 1120 }, { "epoch": 5.402160864345738, "grad_norm": 1.2554682493209839, "learning_rate": 2.179924961608623e-05, "loss": 0.6715, "step": 1125 }, { "epoch": 5.426170468187275, "grad_norm": 0.9506689310073853, "learning_rate": 2.1612098780564714e-05, "loss": 0.7139, "step": 1130 }, { "epoch": 5.450180072028812, "grad_norm": 1.2209597826004028, "learning_rate": 2.1425141160452494e-05, "loss": 0.8102, "step": 1135 }, { "epoch": 5.474189675870348, "grad_norm": 0.9822332262992859, "learning_rate": 2.1238387418130422e-05, "loss": 0.7772, "step": 1140 }, { "epoch": 5.498199279711884, "grad_norm": 1.1269121170043945, "learning_rate": 2.1051848204352013e-05, "loss": 0.7506, "step": 1145 }, { "epoch": 5.5222088835534215, "grad_norm": 1.1985368728637695, "learning_rate": 2.0865534157635967e-05, "loss": 0.7174, "step": 1150 }, { "epoch": 5.546218487394958, "grad_norm": 1.1794227361679077, "learning_rate": 2.0679455903659513e-05, "loss": 0.7339, "step": 1155 }, { "epoch": 5.570228091236495, "grad_norm": 1.1553770303726196, "learning_rate": 2.0493624054652357e-05, "loss": 0.7076, "step": 1160 }, { "epoch": 5.594237695078031, "grad_norm": 1.0076508522033691, "learning_rate": 2.0308049208791507e-05, "loss": 0.7758, "step": 1165 }, { "epoch": 5.6182472989195675, "grad_norm": 1.0329455137252808, "learning_rate": 2.0122741949596797e-05, "loss": 0.7103, "step": 1170 }, { "epoch": 5.642256902761105, "grad_norm": 1.0466042757034302, "learning_rate": 1.9937712845327345e-05, "loss": 0.747, "step": 1175 }, { "epoch": 5.666266506602641, "grad_norm": 
1.1813960075378418, "learning_rate": 1.9752972448378814e-05, "loss": 0.7325, "step": 1180 }, { "epoch": 5.690276110444177, "grad_norm": 1.0979856252670288, "learning_rate": 1.9568531294681586e-05, "loss": 0.7336, "step": 1185 }, { "epoch": 5.714285714285714, "grad_norm": 1.1668668985366821, "learning_rate": 1.938439990309991e-05, "loss": 0.7238, "step": 1190 }, { "epoch": 5.738295318127251, "grad_norm": 1.1428868770599365, "learning_rate": 1.9200588774831975e-05, "loss": 0.6934, "step": 1195 }, { "epoch": 5.762304921968788, "grad_norm": 1.1208099126815796, "learning_rate": 1.9017108392811065e-05, "loss": 0.7573, "step": 1200 }, { "epoch": 5.786314525810324, "grad_norm": 1.1175333261489868, "learning_rate": 1.8833969221107622e-05, "loss": 0.7232, "step": 1205 }, { "epoch": 5.81032412965186, "grad_norm": 0.9412729740142822, "learning_rate": 1.8651181704332578e-05, "loss": 0.7221, "step": 1210 }, { "epoch": 5.834333733493398, "grad_norm": 1.0848782062530518, "learning_rate": 1.8468756267041595e-05, "loss": 0.755, "step": 1215 }, { "epoch": 5.858343337334934, "grad_norm": 1.2409135103225708, "learning_rate": 1.828670331314058e-05, "loss": 0.6962, "step": 1220 }, { "epoch": 5.882352941176471, "grad_norm": 1.05846107006073, "learning_rate": 1.810503322529236e-05, "loss": 0.7443, "step": 1225 }, { "epoch": 5.906362545018007, "grad_norm": 1.0066033601760864, "learning_rate": 1.7923756364324492e-05, "loss": 0.6654, "step": 1230 }, { "epoch": 5.930372148859544, "grad_norm": 0.9819132685661316, "learning_rate": 1.7742883068638447e-05, "loss": 0.7271, "step": 1235 }, { "epoch": 5.954381752701081, "grad_norm": 1.143143892288208, "learning_rate": 1.756242365361993e-05, "loss": 0.7538, "step": 1240 }, { "epoch": 5.978391356542617, "grad_norm": 1.1252750158309937, "learning_rate": 1.7382388411050638e-05, "loss": 0.6795, "step": 1245 }, { "epoch": 6.002400960384153, "grad_norm": 1.3320536613464355, "learning_rate": 1.7202787608521278e-05, "loss": 0.6446, "step": 1250 }, { "epoch": 6.02641056422569, "grad_norm": 1.1098029613494873, "learning_rate": 1.7023631488846006e-05, "loss": 0.6994, "step": 1255 }, { "epoch": 6.050420168067227, "grad_norm": 1.1677411794662476, "learning_rate": 1.6844930269478274e-05, "loss": 0.6886, "step": 1260 }, { "epoch": 6.074429771908764, "grad_norm": 1.1150180101394653, "learning_rate": 1.6666694141928096e-05, "loss": 0.7485, "step": 1265 }, { "epoch": 6.0984393757503, "grad_norm": 1.2847915887832642, "learning_rate": 1.6488933271180845e-05, "loss": 0.6399, "step": 1270 }, { "epoch": 6.122448979591836, "grad_norm": 1.1303415298461914, "learning_rate": 1.631165779511754e-05, "loss": 0.7799, "step": 1275 }, { "epoch": 6.146458583433374, "grad_norm": 1.2571803331375122, "learning_rate": 1.613487782393661e-05, "loss": 0.7589, "step": 1280 }, { "epoch": 6.17046818727491, "grad_norm": 1.0048649311065674, "learning_rate": 1.595860343957738e-05, "loss": 0.6969, "step": 1285 }, { "epoch": 6.194477791116446, "grad_norm": 1.129724383354187, "learning_rate": 1.5782844695145033e-05, "loss": 0.7108, "step": 1290 }, { "epoch": 6.218487394957983, "grad_norm": 1.0512356758117676, "learning_rate": 1.5607611614337292e-05, "loss": 0.7478, "step": 1295 }, { "epoch": 6.24249699879952, "grad_norm": 1.0714658498764038, "learning_rate": 1.5432914190872757e-05, "loss": 0.7225, "step": 1300 }, { "epoch": 6.266506602641057, "grad_norm": 1.2147399187088013, "learning_rate": 1.5258762387920956e-05, "loss": 0.6964, "step": 1305 }, { "epoch": 6.290516206482593, "grad_norm": 1.1545813083648682, "learning_rate": 
1.5085166137534123e-05, "loss": 0.7673, "step": 1310 }, { "epoch": 6.314525810324129, "grad_norm": 1.092599630355835, "learning_rate": 1.4912135340080774e-05, "loss": 0.7194, "step": 1315 }, { "epoch": 6.3385354141656665, "grad_norm": 1.2281053066253662, "learning_rate": 1.4739679863681086e-05, "loss": 0.6869, "step": 1320 }, { "epoch": 6.362545018007203, "grad_norm": 1.1293562650680542, "learning_rate": 1.4567809543644076e-05, "loss": 0.6851, "step": 1325 }, { "epoch": 6.38655462184874, "grad_norm": 1.459465742111206, "learning_rate": 1.4396534181906725e-05, "loss": 0.7242, "step": 1330 }, { "epoch": 6.410564225690276, "grad_norm": 1.203008770942688, "learning_rate": 1.4225863546474943e-05, "loss": 0.7155, "step": 1335 }, { "epoch": 6.4345738295318124, "grad_norm": 1.0592601299285889, "learning_rate": 1.4055807370866485e-05, "loss": 0.6706, "step": 1340 }, { "epoch": 6.45858343337335, "grad_norm": 1.1361570358276367, "learning_rate": 1.388637535355585e-05, "loss": 0.6937, "step": 1345 }, { "epoch": 6.482593037214886, "grad_norm": 1.135237455368042, "learning_rate": 1.3717577157421169e-05, "loss": 0.8316, "step": 1350 }, { "epoch": 6.506602641056423, "grad_norm": 1.0417910814285278, "learning_rate": 1.3549422409193083e-05, "loss": 0.7229, "step": 1355 }, { "epoch": 6.530612244897959, "grad_norm": 1.1558960676193237, "learning_rate": 1.3381920698905787e-05, "loss": 0.7025, "step": 1360 }, { "epoch": 6.554621848739496, "grad_norm": 1.0858848094940186, "learning_rate": 1.3215081579350058e-05, "loss": 0.742, "step": 1365 }, { "epoch": 6.578631452581033, "grad_norm": 1.2004518508911133, "learning_rate": 1.3048914565528454e-05, "loss": 0.6737, "step": 1370 }, { "epoch": 6.602641056422569, "grad_norm": 1.1666450500488281, "learning_rate": 1.2883429134112673e-05, "loss": 0.7577, "step": 1375 }, { "epoch": 6.626650660264105, "grad_norm": 1.1294101476669312, "learning_rate": 1.2718634722903073e-05, "loss": 0.6495, "step": 1380 }, { "epoch": 6.6506602641056425, "grad_norm": 1.2848787307739258, "learning_rate": 1.2554540730290437e-05, "loss": 0.7069, "step": 1385 }, { "epoch": 6.674669867947179, "grad_norm": 1.2171602249145508, "learning_rate": 1.2391156514719984e-05, "loss": 0.705, "step": 1390 }, { "epoch": 6.698679471788715, "grad_norm": 1.0168397426605225, "learning_rate": 1.222849139415764e-05, "loss": 0.671, "step": 1395 }, { "epoch": 6.722689075630252, "grad_norm": 1.2341060638427734, "learning_rate": 1.2066554645558578e-05, "loss": 0.7456, "step": 1400 }, { "epoch": 6.7466986794717885, "grad_norm": 1.174575686454773, "learning_rate": 1.1905355504338248e-05, "loss": 0.6899, "step": 1405 }, { "epoch": 6.770708283313326, "grad_norm": 1.125368595123291, "learning_rate": 1.1744903163845577e-05, "loss": 0.7024, "step": 1410 }, { "epoch": 6.794717887154862, "grad_norm": 1.157349705696106, "learning_rate": 1.1585206774838683e-05, "loss": 0.6571, "step": 1415 }, { "epoch": 6.818727490996398, "grad_norm": 1.1240679025650024, "learning_rate": 1.1426275444963034e-05, "loss": 0.729, "step": 1420 }, { "epoch": 6.842737094837935, "grad_norm": 1.2598992586135864, "learning_rate": 1.1268118238232003e-05, "loss": 0.7971, "step": 1425 }, { "epoch": 6.866746698679472, "grad_norm": 1.2101902961730957, "learning_rate": 1.1110744174509952e-05, "loss": 0.6376, "step": 1430 }, { "epoch": 6.890756302521009, "grad_norm": 1.2665272951126099, "learning_rate": 1.0954162228997777e-05, "loss": 0.6663, "step": 1435 }, { "epoch": 6.914765906362545, "grad_norm": 1.0040076971054077, "learning_rate": 1.0798381331721109e-05, 
"loss": 0.6898, "step": 1440 }, { "epoch": 6.938775510204081, "grad_norm": 1.180012583732605, "learning_rate": 1.0643410367020983e-05, "loss": 0.686, "step": 1445 }, { "epoch": 6.9627851140456185, "grad_norm": 1.1320979595184326, "learning_rate": 1.048925817304717e-05, "loss": 0.7803, "step": 1450 }, { "epoch": 6.986794717887155, "grad_norm": 1.141840934753418, "learning_rate": 1.0335933541254129e-05, "loss": 0.6809, "step": 1455 }, { "epoch": 7.010804321728691, "grad_norm": 1.0778816938400269, "learning_rate": 1.0183445215899584e-05, "loss": 0.6936, "step": 1460 }, { "epoch": 7.034813925570228, "grad_norm": 0.9368105530738831, "learning_rate": 1.0031801893545895e-05, "loss": 0.7005, "step": 1465 }, { "epoch": 7.0588235294117645, "grad_norm": 1.0788689851760864, "learning_rate": 9.881012222564065e-06, "loss": 0.8484, "step": 1470 }, { "epoch": 7.082833133253302, "grad_norm": 1.1011931896209717, "learning_rate": 9.731084802640459e-06, "loss": 0.7897, "step": 1475 }, { "epoch": 7.106842737094838, "grad_norm": 1.22800612449646, "learning_rate": 9.582028184286423e-06, "loss": 0.6696, "step": 1480 }, { "epoch": 7.130852340936374, "grad_norm": 1.224184274673462, "learning_rate": 9.43385086835062e-06, "loss": 0.6867, "step": 1485 }, { "epoch": 7.154861944777911, "grad_norm": 1.2134716510772705, "learning_rate": 9.286561305534203e-06, "loss": 0.7372, "step": 1490 }, { "epoch": 7.178871548619448, "grad_norm": 1.01565682888031, "learning_rate": 9.140167895908867e-06, "loss": 0.6718, "step": 1495 }, { "epoch": 7.202881152460985, "grad_norm": 1.176138162612915, "learning_rate": 8.994678988437802e-06, "loss": 0.6819, "step": 1500 }, { "epoch": 7.226890756302521, "grad_norm": 1.1396265029907227, "learning_rate": 8.850102880499531e-06, "loss": 0.7094, "step": 1505 }, { "epoch": 7.250900360144057, "grad_norm": 1.1229872703552246, "learning_rate": 8.706447817414696e-06, "loss": 0.7423, "step": 1510 }, { "epoch": 7.2749099639855945, "grad_norm": 1.0950995683670044, "learning_rate": 8.563721991975843e-06, "loss": 0.6957, "step": 1515 }, { "epoch": 7.298919567827131, "grad_norm": 1.322180986404419, "learning_rate": 8.421933543980126e-06, "loss": 0.6382, "step": 1520 }, { "epoch": 7.322929171668667, "grad_norm": 1.1946529150009155, "learning_rate": 8.281090559765156e-06, "loss": 0.6939, "step": 1525 }, { "epoch": 7.346938775510204, "grad_norm": 1.3446452617645264, "learning_rate": 8.141201071747784e-06, "loss": 0.7682, "step": 1530 }, { "epoch": 7.3709483793517405, "grad_norm": 1.063050627708435, "learning_rate": 8.002273057966011e-06, "loss": 0.6319, "step": 1535 }, { "epoch": 7.394957983193278, "grad_norm": 1.0960049629211426, "learning_rate": 7.864314441624004e-06, "loss": 0.7331, "step": 1540 }, { "epoch": 7.418967587034814, "grad_norm": 1.153906226158142, "learning_rate": 7.727333090640218e-06, "loss": 0.7249, "step": 1545 }, { "epoch": 7.44297719087635, "grad_norm": 1.1610156297683716, "learning_rate": 7.591336817198682e-06, "loss": 0.6484, "step": 1550 }, { "epoch": 7.466986794717887, "grad_norm": 1.1550188064575195, "learning_rate": 7.456333377303457e-06, "loss": 0.7723, "step": 1555 }, { "epoch": 7.490996398559424, "grad_norm": 1.2671056985855103, "learning_rate": 7.3223304703363135e-06, "loss": 0.7266, "step": 1560 }, { "epoch": 7.515006002400961, "grad_norm": 1.234147548675537, "learning_rate": 7.189335738617633e-06, "loss": 0.6927, "step": 1565 }, { "epoch": 7.539015606242497, "grad_norm": 1.1650274991989136, "learning_rate": 7.057356766970541e-06, "loss": 0.7219, "step": 1570 }, { "epoch": 
7.563025210084033, "grad_norm": 1.2067919969558716, "learning_rate": 6.926401082288359e-06, "loss": 0.6933, "step": 1575 }, { "epoch": 7.5870348139255706, "grad_norm": 1.1489832401275635, "learning_rate": 6.796476153105294e-06, "loss": 0.7651, "step": 1580 }, { "epoch": 7.611044417767107, "grad_norm": 1.24705970287323, "learning_rate": 6.667589389170562e-06, "loss": 0.6558, "step": 1585 }, { "epoch": 7.635054021608643, "grad_norm": 1.0954616069793701, "learning_rate": 6.5397481410257645e-06, "loss": 0.6601, "step": 1590 }, { "epoch": 7.65906362545018, "grad_norm": 1.272865653038025, "learning_rate": 6.41295969958568e-06, "loss": 0.6938, "step": 1595 }, { "epoch": 7.6830732292917165, "grad_norm": 1.2320659160614014, "learning_rate": 6.28723129572247e-06, "loss": 0.7301, "step": 1600 }, { "epoch": 7.707082833133253, "grad_norm": 1.2697712182998657, "learning_rate": 6.16257009985329e-06, "loss": 0.6848, "step": 1605 }, { "epoch": 7.73109243697479, "grad_norm": 1.1409835815429688, "learning_rate": 6.038983221531352e-06, "loss": 0.6372, "step": 1610 }, { "epoch": 7.755102040816326, "grad_norm": 1.1304911375045776, "learning_rate": 5.916477709040444e-06, "loss": 0.6736, "step": 1615 }, { "epoch": 7.779111644657863, "grad_norm": 1.1968624591827393, "learning_rate": 5.79506054899299e-06, "loss": 0.6915, "step": 1620 }, { "epoch": 7.8031212484994, "grad_norm": 1.0843942165374756, "learning_rate": 5.674738665931575e-06, "loss": 0.696, "step": 1625 }, { "epoch": 7.827130852340936, "grad_norm": 1.0682406425476074, "learning_rate": 5.555518921934047e-06, "loss": 0.7064, "step": 1630 }, { "epoch": 7.851140456182473, "grad_norm": 1.13326096534729, "learning_rate": 5.437408116222148e-06, "loss": 0.7497, "step": 1635 }, { "epoch": 7.875150060024009, "grad_norm": 1.1618019342422485, "learning_rate": 5.320412984773748e-06, "loss": 0.7025, "step": 1640 }, { "epoch": 7.899159663865547, "grad_norm": 1.2662303447723389, "learning_rate": 5.204540199938707e-06, "loss": 0.6823, "step": 1645 }, { "epoch": 7.923169267707083, "grad_norm": 1.1508044004440308, "learning_rate": 5.089796370058325e-06, "loss": 0.8013, "step": 1650 }, { "epoch": 7.947178871548619, "grad_norm": 1.1236162185668945, "learning_rate": 4.9761880390884694e-06, "loss": 0.7497, "step": 1655 }, { "epoch": 7.971188475390156, "grad_norm": 1.239221453666687, "learning_rate": 4.86372168622635e-06, "loss": 0.7817, "step": 1660 }, { "epoch": 7.995198079231693, "grad_norm": 1.1400153636932373, "learning_rate": 4.7524037255410434e-06, "loss": 0.615, "step": 1665 }, { "epoch": 8.01920768307323, "grad_norm": 1.1075925827026367, "learning_rate": 4.642240505607659e-06, "loss": 0.7037, "step": 1670 }, { "epoch": 8.043217286914766, "grad_norm": 1.244471788406372, "learning_rate": 4.533238309145258e-06, "loss": 0.6747, "step": 1675 }, { "epoch": 8.067226890756302, "grad_norm": 1.114261269569397, "learning_rate": 4.425403352658591e-06, "loss": 0.7154, "step": 1680 }, { "epoch": 8.091236494597839, "grad_norm": 1.2810643911361694, "learning_rate": 4.318741786083538e-06, "loss": 0.7838, "step": 1685 }, { "epoch": 8.115246098439377, "grad_norm": 1.1281261444091797, "learning_rate": 4.213259692436367e-06, "loss": 0.7525, "step": 1690 }, { "epoch": 8.139255702280913, "grad_norm": 1.111745834350586, "learning_rate": 4.1089630874668325e-06, "loss": 0.6624, "step": 1695 }, { "epoch": 8.16326530612245, "grad_norm": 0.9732471108436584, "learning_rate": 4.0058579193150535e-06, "loss": 0.6821, "step": 1700 }, { "epoch": 8.187274909963985, "grad_norm": 1.1068557500839233, 
"learning_rate": 3.903950068172338e-06, "loss": 0.6932, "step": 1705 }, { "epoch": 8.211284513805522, "grad_norm": 1.1466325521469116, "learning_rate": 3.8032453459457884e-06, "loss": 0.7308, "step": 1710 }, { "epoch": 8.235294117647058, "grad_norm": 1.076535940170288, "learning_rate": 3.7037494959268644e-06, "loss": 0.7136, "step": 1715 }, { "epoch": 8.259303721488596, "grad_norm": 1.234830617904663, "learning_rate": 3.605468192463815e-06, "loss": 0.6595, "step": 1720 }, { "epoch": 8.283313325330132, "grad_norm": 1.2779568433761597, "learning_rate": 3.5084070406380897e-06, "loss": 0.7671, "step": 1725 }, { "epoch": 8.307322929171669, "grad_norm": 1.0826342105865479, "learning_rate": 3.4125715759446785e-06, "loss": 0.6561, "step": 1730 }, { "epoch": 8.331332533013205, "grad_norm": 1.292898416519165, "learning_rate": 3.317967263976374e-06, "loss": 0.6876, "step": 1735 }, { "epoch": 8.355342136854741, "grad_norm": 1.2453337907791138, "learning_rate": 3.2245995001121106e-06, "loss": 0.6819, "step": 1740 }, { "epoch": 8.37935174069628, "grad_norm": 1.109753131866455, "learning_rate": 3.1324736092092412e-06, "loss": 0.7044, "step": 1745 }, { "epoch": 8.403361344537815, "grad_norm": 1.1907482147216797, "learning_rate": 3.0415948452998557e-06, "loss": 0.6317, "step": 1750 }, { "epoch": 8.427370948379352, "grad_norm": 1.036994218826294, "learning_rate": 2.9519683912911266e-06, "loss": 0.6521, "step": 1755 }, { "epoch": 8.451380552220888, "grad_norm": 1.1481200456619263, "learning_rate": 2.8635993586697553e-06, "loss": 0.7572, "step": 1760 }, { "epoch": 8.475390156062424, "grad_norm": 1.2365052700042725, "learning_rate": 2.776492787210425e-06, "loss": 0.7184, "step": 1765 }, { "epoch": 8.499399759903962, "grad_norm": 1.1394840478897095, "learning_rate": 2.690653644688393e-06, "loss": 0.6881, "step": 1770 }, { "epoch": 8.523409363745499, "grad_norm": 1.2600808143615723, "learning_rate": 2.6060868265961822e-06, "loss": 0.6575, "step": 1775 }, { "epoch": 8.547418967587035, "grad_norm": 1.09065842628479, "learning_rate": 2.5227971558643537e-06, "loss": 0.7457, "step": 1780 }, { "epoch": 8.571428571428571, "grad_norm": 1.245969533920288, "learning_rate": 2.4407893825864892e-06, "loss": 0.7127, "step": 1785 }, { "epoch": 8.595438175270107, "grad_norm": 1.3175572156906128, "learning_rate": 2.360068183748268e-06, "loss": 0.7442, "step": 1790 }, { "epoch": 8.619447779111646, "grad_norm": 1.220615267753601, "learning_rate": 2.2806381629607327e-06, "loss": 0.6743, "step": 1795 }, { "epoch": 8.643457382953182, "grad_norm": 1.1195836067199707, "learning_rate": 2.2025038501977486e-06, "loss": 0.7338, "step": 1800 }, { "epoch": 8.667466986794718, "grad_norm": 1.075049638748169, "learning_rate": 2.125669701537647e-06, "loss": 0.7514, "step": 1805 }, { "epoch": 8.691476590636254, "grad_norm": 1.1793599128723145, "learning_rate": 2.0501400989091036e-06, "loss": 0.7774, "step": 1810 }, { "epoch": 8.71548619447779, "grad_norm": 1.165880560874939, "learning_rate": 1.97591934984121e-06, "loss": 0.6895, "step": 1815 }, { "epoch": 8.739495798319329, "grad_norm": 1.101431131362915, "learning_rate": 1.9030116872178316e-06, "loss": 0.7218, "step": 1820 }, { "epoch": 8.763505402160865, "grad_norm": 1.2261079549789429, "learning_rate": 1.8314212690361987e-06, "loss": 0.7553, "step": 1825 }, { "epoch": 8.787515006002401, "grad_norm": 1.1773276329040527, "learning_rate": 1.7611521781697644e-06, "loss": 0.6779, "step": 1830 }, { "epoch": 8.811524609843937, "grad_norm": 1.2577153444290161, "learning_rate": 1.6922084221353607e-06, 
"loss": 0.6946, "step": 1835 }, { "epoch": 8.835534213685474, "grad_norm": 1.1246775388717651, "learning_rate": 1.624593932864632e-06, "loss": 0.7149, "step": 1840 }, { "epoch": 8.85954381752701, "grad_norm": 1.1089775562286377, "learning_rate": 1.5583125664798165e-06, "loss": 0.6681, "step": 1845 }, { "epoch": 8.883553421368548, "grad_norm": 1.138650894165039, "learning_rate": 1.4933681030738138e-06, "loss": 0.6519, "step": 1850 }, { "epoch": 8.907563025210084, "grad_norm": 1.1409391164779663, "learning_rate": 1.429764246494597e-06, "loss": 0.6957, "step": 1855 }, { "epoch": 8.93157262905162, "grad_norm": 1.2397470474243164, "learning_rate": 1.3675046241339918e-06, "loss": 0.6942, "step": 1860 }, { "epoch": 8.955582232893157, "grad_norm": 1.310257077217102, "learning_rate": 1.306592786720795e-06, "loss": 0.6578, "step": 1865 }, { "epoch": 8.979591836734693, "grad_norm": 1.057267189025879, "learning_rate": 1.2470322081182761e-06, "loss": 0.6994, "step": 1870 }, { "epoch": 9.003601440576231, "grad_norm": 1.2477996349334717, "learning_rate": 1.1888262851260462e-06, "loss": 0.643, "step": 1875 }, { "epoch": 9.027611044417768, "grad_norm": 1.2775191068649292, "learning_rate": 1.1319783372863602e-06, "loss": 0.6307, "step": 1880 }, { "epoch": 9.051620648259304, "grad_norm": 1.1136633157730103, "learning_rate": 1.0764916066947794e-06, "loss": 0.7088, "step": 1885 }, { "epoch": 9.07563025210084, "grad_norm": 1.2463817596435547, "learning_rate": 1.0223692578152782e-06, "loss": 0.6603, "step": 1890 }, { "epoch": 9.099639855942376, "grad_norm": 1.140369176864624, "learning_rate": 9.696143772997768e-07, "loss": 0.8255, "step": 1895 }, { "epoch": 9.123649459783914, "grad_norm": 1.0478204488754272, "learning_rate": 9.182299738120931e-07, "loss": 0.6834, "step": 1900 }, { "epoch": 9.14765906362545, "grad_norm": 1.243027925491333, "learning_rate": 8.682189778563693e-07, "loss": 0.6365, "step": 1905 }, { "epoch": 9.171668667466987, "grad_norm": 1.1335963010787964, "learning_rate": 8.195842416099359e-07, "loss": 0.7317, "step": 1910 }, { "epoch": 9.195678271308523, "grad_norm": 1.2012873888015747, "learning_rate": 7.723285387606471e-07, "loss": 0.741, "step": 1915 }, { "epoch": 9.21968787515006, "grad_norm": 1.098880648612976, "learning_rate": 7.264545643486997e-07, "loss": 0.6881, "step": 1920 }, { "epoch": 9.243697478991596, "grad_norm": 1.138432502746582, "learning_rate": 6.819649346129304e-07, "loss": 0.735, "step": 1925 }, { "epoch": 9.267707082833134, "grad_norm": 1.163900375366211, "learning_rate": 6.3886218684161e-07, "loss": 0.7227, "step": 1930 }, { "epoch": 9.29171668667467, "grad_norm": 1.163901448249817, "learning_rate": 5.971487792277297e-07, "loss": 0.681, "step": 1935 }, { "epoch": 9.315726290516206, "grad_norm": 1.3683992624282837, "learning_rate": 5.568270907288287e-07, "loss": 0.7081, "step": 1940 }, { "epoch": 9.339735894357743, "grad_norm": 1.2360917329788208, "learning_rate": 5.178994209312948e-07, "loss": 0.7064, "step": 1945 }, { "epoch": 9.363745498199279, "grad_norm": 1.0632774829864502, "learning_rate": 4.803679899192392e-07, "loss": 0.6829, "step": 1950 }, { "epoch": 9.387755102040817, "grad_norm": 1.200908899307251, "learning_rate": 4.4423493814786667e-07, "loss": 0.6814, "step": 1955 }, { "epoch": 9.411764705882353, "grad_norm": 1.2517386674880981, "learning_rate": 4.095023263214121e-07, "loss": 0.769, "step": 1960 }, { "epoch": 9.43577430972389, "grad_norm": 1.0798803567886353, "learning_rate": 3.761721352756098e-07, "loss": 0.7539, "step": 1965 }, { "epoch": 
9.459783913565426, "grad_norm": 0.9635175466537476, "learning_rate": 3.4424626586473385e-07, "loss": 0.6506, "step": 1970 }, { "epoch": 9.483793517406962, "grad_norm": 1.1478654146194458, "learning_rate": 3.1372653885318736e-07, "loss": 0.6763, "step": 1975 }, { "epoch": 9.5078031212485, "grad_norm": 1.2677192687988281, "learning_rate": 2.846146948116468e-07, "loss": 0.6776, "step": 1980 }, { "epoch": 9.531812725090036, "grad_norm": 1.1099594831466675, "learning_rate": 2.569123940178192e-07, "loss": 0.7292, "step": 1985 }, { "epoch": 9.555822328931573, "grad_norm": 1.1103848218917847, "learning_rate": 2.3062121636174826e-07, "loss": 0.8111, "step": 1990 }, { "epoch": 9.579831932773109, "grad_norm": 1.1750984191894531, "learning_rate": 2.0574266125569509e-07, "loss": 0.6473, "step": 1995 }, { "epoch": 9.603841536614645, "grad_norm": 1.0524730682373047, "learning_rate": 1.8227814754865068e-07, "loss": 0.6774, "step": 2000 }, { "epoch": 9.627851140456183, "grad_norm": 1.1044491529464722, "learning_rate": 1.6022901344539543e-07, "loss": 0.6782, "step": 2005 }, { "epoch": 9.65186074429772, "grad_norm": 1.2015644311904907, "learning_rate": 1.39596516430196e-07, "loss": 0.6965, "step": 2010 }, { "epoch": 9.675870348139256, "grad_norm": 1.066968560218811, "learning_rate": 1.2038183319507955e-07, "loss": 0.6496, "step": 2015 }, { "epoch": 9.699879951980792, "grad_norm": 1.1246190071105957, "learning_rate": 1.0258605957272627e-07, "loss": 0.6511, "step": 2020 }, { "epoch": 9.723889555822328, "grad_norm": 1.017182469367981, "learning_rate": 8.621021047398314e-08, "loss": 0.7524, "step": 2025 }, { "epoch": 9.747899159663866, "grad_norm": 1.2211177349090576, "learning_rate": 7.125521982997152e-08, "loss": 0.6653, "step": 2030 }, { "epoch": 9.771908763505403, "grad_norm": 1.2134476900100708, "learning_rate": 5.772194053882962e-08, "loss": 0.6718, "step": 2035 }, { "epoch": 9.795918367346939, "grad_norm": 1.3641300201416016, "learning_rate": 4.56111444170626e-08, "loss": 0.7114, "step": 2040 }, { "epoch": 9.819927971188475, "grad_norm": 1.1759401559829712, "learning_rate": 3.4923522155544394e-08, "loss": 0.7125, "step": 2045 }, { "epoch": 9.843937575030012, "grad_norm": 1.0511598587036133, "learning_rate": 2.5659683280102044e-08, "loss": 0.6236, "step": 2050 }, { "epoch": 9.867947178871548, "grad_norm": 1.1891964673995972, "learning_rate": 1.782015611677401e-08, "loss": 0.7353, "step": 2055 }, { "epoch": 9.891956782713086, "grad_norm": 1.116445541381836, "learning_rate": 1.1405387761664887e-08, "loss": 0.7489, "step": 2060 }, { "epoch": 9.915966386554622, "grad_norm": 1.1506844758987427, "learning_rate": 6.415744055460193e-09, "loss": 0.7743, "step": 2065 }, { "epoch": 9.939975990396158, "grad_norm": 1.2120825052261353, "learning_rate": 2.8515095625514244e-09, "loss": 0.6435, "step": 2070 }, { "epoch": 9.963985594237695, "grad_norm": 1.1971231698989868, "learning_rate": 7.128875548101377e-10, "loss": 0.7375, "step": 2075 }, { "epoch": 9.987995198079231, "grad_norm": 1.2055232524871826, "learning_rate": 0.0, "loss": 0.7238, "step": 2080 }, { "epoch": 9.987995198079231, "step": 2080, "total_flos": 8.185270082173747e+17, "train_loss": 0.7521515275423344, "train_runtime": 19400.1091, "train_samples_per_second": 1.717, "train_steps_per_second": 0.107 } ], "logging_steps": 5, "max_steps": 2080, "num_input_tokens_seen": 0, "num_train_epochs": 10, "save_steps": 100, "total_flos": 8.185270082173747e+17, "train_batch_size": 2, "trial_name": null, "trial_params": null }
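The `log_history` entries above record loss, learning rate, and gradient norm every 5 steps (`logging_steps: 5`) across 2080 steps, with a run-level summary entry at the end. A minimal sketch of how such a log can be inspected, assuming it is saved under the Trainer's default filename `trainer_state.json` and that matplotlib is available:

```python
# Sketch: load a trainer_state.json and plot the loss and learning-rate curves.
# Assumes the JSON above is stored as "trainer_state.json" next to this script.
import json

import matplotlib.pyplot as plt

with open("trainer_state.json") as f:
    state = json.load(f)

# Keep only per-step logging entries; the final log_history entry carries
# run-level summary fields (train_loss, train_runtime, ...) and has no "loss" key.
entries = [e for e in state["log_history"] if "loss" in e and "learning_rate" in e]

steps = [e["step"] for e in entries]
losses = [e["loss"] for e in entries]
lrs = [e["learning_rate"] for e in entries]

fig, (ax_loss, ax_lr) = plt.subplots(2, 1, sharex=True, figsize=(8, 6))
ax_loss.plot(steps, losses)
ax_loss.set_ylabel("training loss")
ax_lr.plot(steps, lrs)
ax_lr.set_ylabel("learning rate")
ax_lr.set_xlabel("step")
fig.tight_layout()
plt.show()
```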